Fixed python/pandas compatibility issues and misnamed columns

SystemsGenetics · Sep 25, 2023 · fd6bfc8 · fd6bfc8
1 parent fa46d2c
commit fd6bfc8
Show file tree

Hide file tree

Showing 7 changed files with 88,371 additions and 70,695 deletions.
diff --git a/README.md b/README.md
@@ -211,7 +211,7 @@ fe.importFiles({
                        'arabidopsis_thaliana.TAIR10.genes2IPR.txt',
                        'arabidopsis_thaliana.TAIR10.genes2Pfam.txt',
                        'arabidopsis_thaliana.TAIR10.genes2PO.txt'],
-    'terms': ['IPR.terms.tsv', 'GO.terms.tsv', 'KEGG>terms.tsv']
+    'terms': ['IPR.terms.tsv', 'GO.terms.tsv', 'KEGG.terms.tsv']
 })
 ```
 Alternatively, you may have created the terms DataFrame using the `vocabs.getTerms()` function described above.  If so, you can leave out the `terms` argument in the `importFiles()` function call above and set the terms manually:
@@ -245,9 +245,9 @@ Once completed you can access results using the following attributes of the `FUN
 
 Finally, below are example commands to save results to a file:
 ```Python
-fe.enrichment.sort_values('Fishers_pvalue').to_csv('FUNC-E.enriched_terms.tsv', sep="\t", index=None)
+fe.enrichment.sort_values(['Module', 'Fishers p-value']).to_csv('FUNC-E.enriched_terms.tsv', sep="\t", index=None)
 
-fe.clusters.to_csv('FUNC-E.clusters.tsv', sep="\t", index=None)
+fe.clusters.sort_values(['Module','Cluster Index', 'EASE Score']).to_csv('FUNC-E.clusters.tsv', sep="\t", index=None)
 
-fe.cluster_terms.to_csv('FUNC-E.cluster_terms.tsv', sep="\t", index=None)
+fe.cluster_terms.sort_values(['Module','Cluster Index', 'Fishers p-value']).to_csv('FUNC-E.cluster_terms.tsv', sep="\t", index=None)
 ```
diff --git a/func_e/FUNC_E.py b/func_e/FUNC_E.py
@@ -39,7 +39,7 @@ def reset(self):
         # Dataframes containing results
         self.enrichment = pd.DataFrame(columns=["Module", "ID_Space", "Vocabulary", "Term", "Name", "Module Size", "Count In Module", "Count In Background", "Fishers p-value"])
         self.cluster_list = {}
-        self.clusters = pd.DataFrame(columns = ['Module', 'Cluster Index', 'GEometric Mean', 'EASE Score', 'Features', 'Enriched Terms'])
+        self.clusters = pd.DataFrame(columns = ['Module', 'Cluster Index', 'Geometric Mean', 'EASE Score', 'Features', 'Enriched Terms'])
         self.kappa = pd.DataFrame(columns=['Feature1', 'Feature2', 'Module', 'Score'])
         self.cluster_terms = pd.DataFrame(columns = ['Module', 'Cluster Index', "ID_Space", "Vocabulary", 'Term', 'Name', 'Module Size', 'Count In Module', 'Count In Background', 'Fishers p-value', 'Bonferroni', 'Benjamini'])
         self.efeatures = pd.DataFrame(columns=["Feature", "Module", "Term"])
@@ -217,6 +217,7 @@ def _log(self, message, level=1, end="\n"):
 
     def run(self, cluster=True, modules=[], vocabs=[]):
         """
+        Performs functional enrichment after all data is loaded and added to the object.
         """
         if self.isReady() is False:
             self._log("Cannot perform this step as all necessary inputs are not set.")
@@ -278,35 +279,49 @@ def doModuleEnrichment(self, module, vocabs=[]):
                 if pvalue <= self.ecut:
                     term_details = self.terms.loc[self.terms['Term'] == term]
                     name = term_details['Name'].iloc[0]
-                    idspace = term_details['ID_Space'].iloc[0]
-                    modResults = modResults.append({
-                      "Module": module,
-                      "ID_Space": idspace,
-                      "Vocabulary": vocab,
-                      "Term": term,
-                      "Name": name,
-                      "Module Size": modSize,
-                      "Count In Module": n11,
-                      "Count In Background": n21,
-                      "Fishers p-value": pvalue}, ignore_index=True)
+                    idspace = term_details['ID_Space'].iloc[0]                 
+                    new_row = pd.DataFrame({
+                      "Module": [module],
+                      "ID_Space": [idspace],
+                      "Vocabulary": [vocab],
+                      "Term": [term],
+                      "Name": [name],
+                      "Module Size": [modSize],
+                      "Count In Module": [n11],
+                      "Count In Background": [n21],
+                      "Fishers p-value": [pvalue]})
+                    if (modResults.shape[0] == 0):
+                        modResults = new_row                        
+                    else:
+                        modResults = pd.concat([modResults, new_row], ignore_index = True)
+
 
         if self.verbose > 0:
             pbar.update(total_tests)
             pbar.finish()
 
         # Combine the module's enriched terms with the full result set.
         modResults.sort_values(['Module', 'Fishers p-value'], inplace=True)
-        self.enrichment = pd.concat([self.enrichment, modResults], ignore_index=True)
+        if (self.enrichment.shape[0] > 0):
+            if (modResults.shape[0] > 0):
+                self.enrichment = pd.concat([self.enrichment, modResults], ignore_index=True)
+        else:
+            if (modResults.shape[0] > 0):
+                self.enrichment = modResults
 
         # Create the list of genes with enriched features.
-        efeatures = pd.DataFrame(self.terms2features.set_index('Term')
-            .join(modResults.set_index('Term'), how="inner")[['Feature', 'Module']]
-            .reset_index()
-            .groupby(['Feature', 'Module'])['Term']
-            .apply(list)).reset_index()
-
-        efeatures = efeatures.set_index(['Feature','Module']).join(self.query.set_index(['Feature','Module']), how='inner').reset_index()
-        self.efeatures = pd.concat([self.efeatures, efeatures], ignore_index=True)
+        if modResults.shape[0] > 0:
+            efeatures = pd.DataFrame(self.terms2features.set_index('Term')
+                .join(modResults.set_index('Term'), how="inner")[['Feature', 'Module']]
+                .reset_index()
+                .groupby(['Feature', 'Module'])['Term']
+                .apply(list)).reset_index()
+
+            efeatures = efeatures.set_index(['Feature','Module']).join(self.query.set_index(['Feature','Module']), how='inner').reset_index()        
+            if self.efeatures.shape[0] > 0:
+                self.efeatures = pd.concat([self.efeatures, efeatures], ignore_index=True)
+            else:
+                self.efeatures = efeatures
 
 
     def doEnrichment(self, modules = [], vocabs = []):
@@ -411,7 +426,11 @@ def doModuleKappa(self, module):
         if self.verbose > 0:
             pbar.update(total_comps)
             pbar.finish()
-        self.kappa = pd.concat([self.kappa, pd.DataFrame(scores, columns=['Feature1', 'Feature2', 'Module', 'Score', 'Overlap'])], ignore_index=True)
+
+        if (self.kappa.shape[0] == 0):
+            self.kappa = pd.concat([pd.DataFrame(scores, columns=['Feature1', 'Feature2', 'Module', 'Score', 'Overlap'])], ignore_index=True)
+        else:
+            self.kappa = pd.concat([self.kappa, pd.DataFrame(scores, columns=['Feature1', 'Feature2', 'Module', 'Score', 'Overlap'])], ignore_index=True)
 
     def doKappa(self, modules = [] ):
         """
@@ -529,20 +548,24 @@ def _mergeGroups(self, groups):
     def _calculateClusterStats(self, clusters, module):
         """
         """
-        cluster_stats = pd.DataFrame(columns = ['Module', 'Cluster Index', 'GEometric Mean', 'EASE Score', 'Features', 'Enriched Terms'])
+        cluster_stats = pd.DataFrame(columns = ['Module', 'Cluster Index', 'Geometric Mean', 'EASE Score', 'Features', 'Enriched Terms'])
         menrichment = self.enrichment[self.enrichment['Module'] == module].set_index('Term')
         for i in range(0, len(clusters)):
             features = clusters[i]
             eterms = self.terms2features.loc[features].set_index('Term').join(menrichment, how='inner').reset_index()[['Term','Fishers p-value']].drop_duplicates()
             gmean = stats.gmean(eterms['Fishers p-value'])
-            cluster_stats = cluster_stats.append({
-                'Module': module,
-                'Cluster Index': i + 1,
-                'GEometric Mean': gmean,
-                'EASE Score': - np.log10(gmean),
-                'Features': features,
-                'Enriched Terms': list(eterms['Term'].values)
-            }, ignore_index=True)
+            new_row = pd.DataFrame({
+                'Module': [module],
+                'Cluster Index': [i + 1],
+                'Geometric Mean': [gmean],
+                'EASE Score': [- np.log10(gmean)],
+                'Features': [features],
+                'Enriched Terms': [list(eterms['Term'].values)]})
+
+            if (cluster_stats.shape[0] == 0):
+                cluster_stats = new_row
+            else:
+                cluster_stats = pd.concat([cluster_stats, new_row], ignore_index = True)
         return cluster_stats
 
     def doModuleClustering(self, module):
@@ -589,7 +612,13 @@ def doModuleClustering(self, module):
         stats = self._calculateClusterStats(final_clusters, module)
 
         # Add to the clusters data frame.
-        self.clusters = pd.concat([self.clusters, stats], ignore_index=True)
+        if self.clusters.shape[0] > 0:
+            if stats.shape[0] > 0:
+                self.clusters = pd.concat([self.clusters, stats], ignore_index=True)
+        else:
+            if stats.shape[0] > 0:
+                self.clusters = stats
+
 
         # Add to the cluster list.
         self.cluster_list[module] = final_clusters
@@ -599,7 +628,13 @@ def doModuleClustering(self, module):
         for i in range(0, len(cluster_terms)):
             cluster_terms[i][1]['Cluster Index'] = cluster_terms[i][0]['Cluster Index']
             cluster_terms[i][1].sort_values(['Module', 'Cluster Index', 'Fishers p-value'], inplace=True)
-            self.cluster_terms = pd.concat([self.cluster_terms, cluster_terms[i][1]], ignore_index=True)
+            if self.cluster_terms.shape[0] > 0:
+                if cluster_terms[i][1].shape[0] > 0:
+                    self.cluster_terms = pd.concat([self.cluster_terms, cluster_terms[i][1]], ignore_index=True)
+            else:
+                if cluster_terms[i][1].shape[0] > 0:
+                    self.cluster_terms = cluster_terms[i][1]
+
 
     def doClustering(self, modules = []):
         """

diff --git a/func_e/cmd.py b/func_e/cmd.py
@@ -104,8 +104,8 @@ def func_e():
     # Write out the results files.
     outprefix = args.outprefix + '.' if args.outprefix else ''
 
-    fe.enrichment.sort_values(['Module', 'Fishers_pvalue']).to_csv(outprefix + 'FUNC-E.enriched_terms.tsv', sep="\t", index=None)
-    fe.clusters.sort_values(['Module','Cluster_Index']).to_csv(outprefix + 'FUNC-E.clusters.tsv', sep="\t", index=None)
-    fe.cluster_terms.sort_values(['Module','Cluster_Index','Fishers_pvalue']).to_csv(outprefix + 'FUNC-E.cluster_terms.tsv', sep="\t", index=None)
+    fe.enrichment.sort_values(['Module', 'Fishers p-value']).to_csv(outprefix + 'FUNC-E.enriched_terms.tsv', sep="\t", index=None)
+    fe.clusters.sort_values(['Module','Cluster Index', 'EASE Score']).to_csv(outprefix + 'FUNC-E.clusters.tsv', sep="\t", index=None)
+    fe.cluster_terms.sort_values(['Module','Cluster Index','Fishers p-value']).to_csv(outprefix + 'FUNC-E.cluster_terms.tsv', sep="\t", index=None)
     fe.kappa.to_csv(outprefix + 'FUNC-E.kappa.tsv', sep="\t", index=None)
     fe.efeatures.to_csv(outprefix + 'FUNC-E.efeatures.tsv', sep="\t", index=None)
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
   pandas
   scipy
   statsmodels
-  sklearn
+  scikit-learn
   progressbar2
   scipy
   requests