Initial commit
ZzakB committed Dec 15, 2021
1 parent 7cbb105 commit ebfee75
Showing 18 changed files with 3,256 additions and 47,526 deletions.
19 changes: 12 additions & 7 deletions MolNotator.egg-info/PKG-INFO
@@ -1,13 +1,13 @@
Metadata-Version: 2.1
Name: MolNotator
-Version: 0.0.70
+Version: 0.1.0
Summary: Predict the actual molecules in LC-MS/MS data through an interpretation of the ions detected via combinatorial triangulation.
Home-page: https://github.com/ZzakB/MolNotator
-Author: Damien Olivier-Jimenez
-Author-email: [email protected]
+Author: Damien Olivier-Jimenez, Zakaria Bouchouireb
+Author-email: [email protected], [email protected]
License: MIT
Platform: UNKNOWN
-Classifier: Development Status :: 3 - Alpha
+Classifier: Development Status :: 4 - Beta
Classifier: Environment :: Console
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
@@ -107,14 +107,14 @@ Before installing MolNotator, make sure you have the following requirements inst

- pandas
- NumPy
-- matchms >= 0.10.0
+- matchms <= 0.6.2
- tqdm
- PyYaml

These dependencies can be installed using the following command:

```bash
-pip install -U pandas numpy matchms tqdm pyyaml
+pip install -U pandas==1.3.5 numpy matchms==0.6.2 tqdm pyyaml
```
### Via PyPI
We deploy the MolNotator package to [PyPI](https://test.pypi.org/project/MolNotator). You can install MolNotator as a Python module with:
@@ -149,8 +149,13 @@ from MolNotator.Dereplicator import Dereplicator
from MolNotator.Cosiner import Cosiner
from MolNotator.MolNet import MolNet

-wd = './examples/working_directory' # <---- change the path to your working directory
+wd = './working_directory' # <---- change the path to your working directory
os.chdir(wd)

+for files in os.listdir(os.getcwd()):
+    if files not in ['databases','mzmine_out','params']:
+        raise Exception('Potential output files already exist! They need to be removed or moved outside the working directory.')

with open("./params/params.yaml") as info:
    params = yaml.load(info, Loader=yaml.FullLoader)
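All run parameters are read from `./params/params.yaml` with PyYAML. A minimal sketch of that loading step, assuming a hypothetical parameter file (the key names below are illustrative, not the real schema):

```python
import yaml

# Hypothetical params.yaml contents (the real keys are defined by MolNotator):
#   mass_error: 0.002
#   rt_error: 0.05
with open("./params/params.yaml") as info:
    params = yaml.load(info, Loader=yaml.FullLoader)  # FullLoader: no arbitrary object construction
print(params["mass_error"])  # -> 0.002 with the hypothetical file above
```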

913 changes: 457 additions & 456 deletions MolNotator/Adnotator.py

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions MolNotator/Cosiner.py
@@ -10,8 +10,8 @@ def Samplewise_export(neg_csv_file, pos_csv_file, out_path, merged_edge_table, m
pos_csv = pd.read_csv(pos_csv_file, index_col ="row ID")
neg_csv = Column_correction(neg_csv)
pos_csv = Column_correction(pos_csv)
-neg_csv.columns = neg_csv.columns.str.replace(".mzXML Peak area", "").str.replace('NEG_', '')
-pos_csv.columns = pos_csv.columns.str.replace(".mzXML Peak area", "").str.replace('POS_', '')
+neg_csv.columns = neg_csv.columns.str.replace(".mzXML Peak area", "", regex = False).str.replace('NEG_', '', regex = False)
+pos_csv.columns = pos_csv.columns.str.replace(".mzXML Peak area", "", regex = False).str.replace('POS_', '', regex = False)
neg_csv.drop(["row m/z", "row retention time"], axis = 1, inplace = True)
pos_csv.drop(["row m/z", "row retention time"], axis = 1, inplace = True)
samples = list(set(list(neg_csv.columns) + list(pos_csv.columns)))
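The `regex = False` flag added above is not cosmetic: the pattern `.mzXML Peak area` starts with `.`, which a regex treats as a wildcard. A minimal sketch of the literal replacement, with toy column names:

```python
import pandas as pd

# "." is a regex wildcard, so with the default regex=True the pattern
# ".mzXML Peak area" could also match e.g. "_mzXML Peak area";
# regex=False matches the literal substring only.
cols = pd.Index(["NEG_SampleA.mzXML Peak area", "NEG_SampleB.mzXML Peak area"])
cleaned = cols.str.replace(".mzXML Peak area", "", regex=False).str.replace("NEG_", "", regex=False)
print(list(cleaned))  # ['SampleA', 'SampleB']
```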
@@ -121,11 +121,11 @@ def Spectrum_processing(s):


# Make a Series with MGF indexes as data and feature IDs as indexes
-neg_mgf_data = pd.Series()
+neg_mgf_data = pd.Series(dtype = int)
for i in range(len(neg_mgf)):
    neg_mgf_data.loc[int(neg_mgf[i].get("feature_id"))] = i

-pos_mgf_data = pd.Series()
+pos_mgf_data = pd.Series(dtype = int)
for i in range(len(pos_mgf)):
    pos_mgf_data.loc[int(pos_mgf[i].get("feature_id"))] = i
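Declaring the dtype matters because an empty `pd.Series()` with no dtype emits a DeprecationWarning on pandas >= 1.0 while the default dtype is slated to change. A small illustration of the mapping being built (the variable name here is hypothetical):

```python
import pandas as pd

# Explicit dtype silences the empty-Series DeprecationWarning and keeps the
# feature_id -> position-in-MGF mapping explicitly integer-valued.
mgf_index_by_feature = pd.Series(dtype=int)
mgf_index_by_feature.loc[101] = 0  # feature 101 is the first spectrum in the MGF
mgf_index_by_feature.loc[205] = 1
print(mgf_index_by_feature.loc[205])  # -> 1
```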

@@ -153,7 +153,7 @@ def Spectrum_processing(s):
remains_ions_pos += list(node_table.loc[unclustered_ions].index[node_table.loc[unclustered_ions]['status'] == 'pos_precursor'])

neg_node_table = node_table[node_table['ion_mode'] == "NEG"]
-cluster_ion_list = pd.Series(index = cluster_list_neg)
+cluster_ion_list = pd.Series(index = cluster_list_neg, dtype = int)
for i in cluster_ion_list.index:
    tmp_rows = neg_node_table.index[neg_node_table['cluster_id'] == i]
    cluster_ion_list[i] = '|'.join(neg_node_table.loc[tmp_rows, 'mgf_index'].dropna().astype(int).astype(str))
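For illustration, the same loop on a toy node table: per cluster, the MGF spectrum positions of its ions are collected into one `|`-separated string. This sketch uses object dtype since the stored values end up as strings (the code above declares int for the empty series):

```python
import pandas as pd

# Toy data: cluster 7 holds ions with MGF positions 3 and 5; cluster 8's
# only ion has no MGF entry, so its string comes out empty.
neg_node_table = pd.DataFrame(
    {"cluster_id": [7, 7, 8], "mgf_index": [3.0, 5.0, None]},
    index=[10, 11, 12],
)
cluster_ion_list = pd.Series(index=[7, 8], dtype=object)
for i in cluster_ion_list.index:
    tmp_rows = neg_node_table.index[neg_node_table["cluster_id"] == i]
    cluster_ion_list[i] = "|".join(
        neg_node_table.loc[tmp_rows, "mgf_index"].dropna().astype(int).astype(str)
    )
print(cluster_ion_list.tolist())  # ['3|5', '']
```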
@@ -201,7 +201,7 @@ def Spectrum_processing(s):
full_matches.append(tmp_table['matches'])

pos_node_table = node_table[node_table['ion_mode'] == "POS"]
-cluster_ion_list = pd.Series(index = cluster_list_pos)
+cluster_ion_list = pd.Series(index = cluster_list_pos, dtype = int)
for i in cluster_ion_list.index:
    tmp_rows = pos_node_table.index[pos_node_table['cluster_id'] == i]
    cluster_ion_list[i] = '|'.join(pos_node_table.loc[tmp_rows, 'mgf_index'].dropna().astype(int).astype(str))
@@ -462,4 +462,4 @@ def Spectrum_processing(s):
out_path = out_path_samples,
merged_edge_table = edge_table,
merged_node_table = node_table)
-    return
+    return
12 changes: 6 additions & 6 deletions MolNotator/Dereplicator.py
@@ -174,8 +174,8 @@ def Samplewise_export(neg_csv_file, pos_csv_file, out_path, merged_edge_table, m
pos_csv = pd.read_csv(pos_csv_file, index_col ="row ID")
neg_csv = Column_correction(neg_csv)
pos_csv = Column_correction(pos_csv)
-neg_csv.columns = neg_csv.columns.str.replace(".mzXML Peak area", "").str.replace('NEG_', '')
-pos_csv.columns = pos_csv.columns.str.replace(".mzXML Peak area", "").str.replace('POS_', '')
+neg_csv.columns = neg_csv.columns.str.replace(".mzXML Peak area", "", regex = False).str.replace('NEG_', '', regex = False)
+pos_csv.columns = pos_csv.columns.str.replace(".mzXML Peak area", "", regex = False).str.replace('POS_', '', regex = False)
neg_csv.drop(["row m/z", "row retention time"], axis = 1, inplace = True)
pos_csv.drop(["row m/z", "row retention time"], axis = 1, inplace = True)
samples = list(set(list(neg_csv.columns) + list(pos_csv.columns)))
@@ -331,7 +331,7 @@ def Samplewise_export(neg_csv_file, pos_csv_file, out_path, merged_edge_table, m
ion_mz = node_table.loc[i, "mz"]
ion_mgf_idx = int(node_table.loc[i, "mgf_index"])
ion_mode = node_table.loc[i, "ion_mode"]
-hits = database_table[database_table['mz'].between(ion_mz - db_prec_error, ion_mz + db_prec_error, inclusive = True)].copy()
+hits = database_table[database_table['mz'].between(ion_mz - db_prec_error, ion_mz + db_prec_error, inclusive = "both")].copy()

# Ion mode filter
if ion_mode == "NEG":
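The recurring `inclusive` changes in this commit track a pandas API change: newer pandas versions replace the boolean `inclusive` argument of `Series.between` with the strings `"both"`, `"neither"`, `"left"`, `"right"`, where `"both"` reproduces the old `inclusive = True`. A toy m/z window around a 150.0 precursor:

```python
import pandas as pd

# "both" keeps rows equal to either bound, matching the old inclusive=True.
mz = pd.Series([149.85, 149.95, 150.05, 150.15])
ion_mz, db_prec_error = 150.0, 0.1
hits = mz.between(ion_mz - db_prec_error, ion_mz + db_prec_error, inclusive="both")
print(hits.tolist())  # [False, True, True, False]
```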
@@ -348,7 +348,7 @@ def Samplewise_export(neg_csv_file, pos_csv_file, out_path, merged_edge_table, m
hits = hits[hits['adduct'] == adduct]
if db_rt_filter:
    rt = node_table.loc[i, 'rt']
-    hits = hits[hits['rt'].between(rt - db_rt_error, rt + db_rt_error, inclusive = True)]
+    hits = hits[hits['rt'].between(rt - db_rt_error, rt + db_rt_error, inclusive = "both")]

# Calculate cosine similarity if hit table is not empty
similarity_list = list()
@@ -461,10 +461,10 @@ def Samplewise_export(neg_csv_file, pos_csv_file, out_path, merged_edge_table, m
for i in tqdm(node_table.index):
    if node_table.loc[i, "status_universal"] == "neutral":
        mass = node_table.loc[i, "mz"]
-        hits = database_table[database_table['mass'].between(mass - db_prec_error, mass + db_prec_error, inclusive = True)].copy()
+        hits = database_table[database_table['mass'].between(mass - db_prec_error, mass + db_prec_error, inclusive = "both")].copy()
        if db_adduct_filter :
            rt = node_table.loc[i, 'rt']
-            hits = hits[hits['rt'].between(rt - db_rt_error, rt + db_rt_error, inclusive = True)]
+            hits = hits[hits['rt'].between(rt - db_rt_error, rt + db_rt_error, inclusive = "both")]
        if len(hits) == 0 :
            new_row = [i, None, None, None] + [None]*len(db_params['db_export_fields'])
            derep_table.append(new_row)
8 changes: 4 additions & 4 deletions MolNotator/Duplicate_filter.py
@@ -76,8 +76,8 @@ def Duplicate_finder(node_table, mgf_file):
candidates.sort()
candidates = pool_table.loc[candidates]

-candidates = candidates[candidates['rt'].between(seed_rt - rt_error, seed_rt + rt_error, inclusive = True)]
-candidates = candidates[candidates['mz'].between(seed_mz - mass_error, seed_mz + mass_error, inclusive = True)]
+candidates = candidates[candidates['rt'].between(seed_rt - rt_error, seed_rt + rt_error, inclusive = "both")]
+candidates = candidates[candidates['mz'].between(seed_mz - mass_error, seed_mz + mass_error, inclusive = "both")]
candidates = candidates.index.tolist()
ion_pool.remove(ion_seed)
ion_pool = list(set(ion_pool) - set(candidates))
@@ -194,9 +194,9 @@ def Duplicate_finder(node_table, mgf_file):
samples_j = csv_file.loc[id_j, samples][samples_j]
for sample in samples_j.index:
    if sample in samples_i.index:
-        csv_file.loc[idx_i, sample] = max(samples_i[sample], samples_j[sample])
+        csv_file.loc[id_i, sample] = max(samples_i[sample], samples_j[sample])
    else:
-        csv_file.loc[idx_i, sample] = samples_j[sample]
+        csv_file.loc[id_i, sample] = samples_j[sample]

# Filter kept ions to eliminate some ions that were missed
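The `idx_i` -> `id_i` fix above repairs a stale variable name in the duplicate-merge step. A self-contained sketch of what the corrected loop does, with toy peak areas and hypothetical feature ids:

```python
import pandas as pd

# When feature id_j duplicates id_i: keep the larger peak area per shared
# sample, and copy over samples only id_j was detected in.
csv_file = pd.DataFrame({"s1": [10.0, 30.0], "s2": [None, 5.0]}, index=[1, 2])
id_i, id_j = 1, 2
samples_i = csv_file.loc[id_i].dropna()
samples_j = csv_file.loc[id_j].dropna()
for sample in samples_j.index:
    if sample in samples_i.index:
        csv_file.loc[id_i, sample] = max(samples_i[sample], samples_j[sample])
    else:
        csv_file.loc[id_i, sample] = samples_j[sample]
print(csv_file.loc[id_i].tolist())  # [30.0, 5.0]
```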

8 changes: 4 additions & 4 deletions MolNotator/Fragnotator.py
@@ -12,7 +12,7 @@ def Rt_slicer(rt, rt_error, ion_id, input_table) :
rt_low = rt - rt_error
rt_high = rt + rt_error
sliced_table = input_table[input_table['rt'].between(rt_low,
-                                                     rt_high, inclusive = True)].copy()
+                                                     rt_high, inclusive = "both")].copy()
return sliced_table.drop(ion_id)
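`Rt_slicer` keeps the ions co-eluting with a seed ion and drops the seed itself; a toy run with hypothetical retention times:

```python
import pandas as pd

# Window of rt +/- rt_error around the seed (ion 0); ion 2 elutes too late.
node_table = pd.DataFrame({"rt": [5.00, 5.03, 5.80]}, index=[0, 1, 2])
rt, rt_error, ion_id = 5.00, 0.05, 0
sliced = node_table[node_table["rt"].between(rt - rt_error, rt + rt_error, inclusive="both")]
print(sliced.drop(ion_id).index.tolist())  # [1]
```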


@@ -74,15 +74,15 @@ def Fragnotator_edge_table(mass_error, rt_error, min_shared_peaks, score_thresho
ion2_mz = node_table['mz'][j]
ion2_mz_low = ion2_mz - mass_error
ion2_mz_high = ion2_mz + mass_error
-match = ion1_msms.between(ion2_mz_low, ion2_mz_high, inclusive = True)
+match = ion1_msms.between(ion2_mz_low, ion2_mz_high, inclusive = "both")
if match.sum() > 0 : # if the frag candidate m/z is found in MSMS:
    ion2_msms = mgf_file[j].peaks.mz # extract frag's MSMS to run tests
    matched_peaks = 0
    total_peaks = list(ion1_msms)
    for frag in ion2_msms : # find the number of matched peaks
        frag_low = frag - mass_error
        frag_high = frag + mass_error
-        frag_found = ion1_msms.between(frag_low, frag_high, inclusive = True).sum()
+        frag_found = ion1_msms.between(frag_low, frag_high, inclusive = "both").sum()
        if frag_found > 0 :
            matched_peaks += 1
        else :
@@ -166,7 +166,7 @@ def Fragnotation():
for i in frag_table.index:
    low_mass = frag_table.loc[i, 'mass'] - mass_error
    high_mass = frag_table.loc[i, 'mass'] + mass_error
-    temp_edge_table = edge_table[edge_table['mz_gap'].between(low_mass, high_mass, inclusive = True)]
+    temp_edge_table = edge_table[edge_table['mz_gap'].between(low_mass, high_mass, inclusive = "both")]
    for j in temp_edge_table.index :
        edge_table.loc[j, 'Fragnotation'] = frag_table.loc[i, 'loss']
return(edge_table)
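Fragnotation labels an edge when its m/z gap matches a known neutral loss within the mass tolerance; a toy run with a water loss (the edge values are hypothetical):

```python
import pandas as pd

# One known loss (H2O, monoisotopic mass ~18.0106); edge 0's gap matches it
# within +/- mass_error, edge 1's does not.
frag_table = pd.DataFrame({"mass": [18.0106], "loss": ["H2O"]})
edge_table = pd.DataFrame({"mz_gap": [18.0105, 44.9977]})
mass_error = 0.002
for i in frag_table.index:
    low_mass = frag_table.loc[i, "mass"] - mass_error
    high_mass = frag_table.loc[i, "mass"] + mass_error
    matched = edge_table[edge_table["mz_gap"].between(low_mass, high_mass, inclusive="both")]
    for j in matched.index:
        edge_table.loc[j, "Fragnotation"] = frag_table.loc[i, "loss"]
print(edge_table["Fragnotation"].tolist())  # ['H2O', nan]
```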
2 changes: 1 addition & 1 deletion MolNotator/MGF_sample_slicer.py
@@ -39,7 +39,7 @@ def mgf_slicer(params : dict, ion_mode : str):

# Get the sample list
samples = pd.Series(csv_file.columns)
-samples = list(samples.str.replace(mzmine_suffix, '.mgf'))
+samples = list(samples.str.replace(mzmine_suffix, '.mgf', regex = False))
csv_file.columns = samples
samples.remove('row ID')
samples.remove('row m/z')
