Initial commit
ZzakB committed Dec 15, 2021
1 parent 7cbb105 commit ebfee75
Showing 18 changed files with 3,256 additions and 47,526 deletions.
19 changes: 12 additions & 7 deletions MolNotator.egg-info/PKG-INFO
@@ -1,13 +1,13 @@
Metadata-Version: 2.1
Name: MolNotator
-Version: 0.0.70
+Version: 0.1.0
Summary: Predict the actual molecules in LC-MS/MS data through an interpretation of the ions detected via combinatorial triangulation.
Home-page: https://github.com/ZzakB/MolNotator
-Author: Damien Olivier-Jimenez
-Author-email: [email protected]
+Author: Damien Olivier-Jimenez, Zakaria Bouchouireb
+Author-email: [email protected], [email protected]
License: MIT
Platform: UNKNOWN
-Classifier: Development Status :: 3 - Alpha
+Classifier: Development Status :: 4 - Beta
Classifier: Environment :: Console
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
@@ -107,14 +107,14 @@ Before installing MolNotator, make sure you have the following requirements inst

- pandas
- NumPy
-- matchms >= 0.10.0
+- matchms <= 0.6.2
- tqdm
- PyYaml

These dependencies can be installed using the following command:

```bash
-pip install -U pandas numpy matchms tqdm pyyaml
+pip install -U pandas==1.3.5 numpy matchms==0.6.2 tqdm pyyaml
```
### Via PyPI
We deploy the MolNotator package to [PyPI](https://test.pypi.org/project/MolNotator). You can install MolNotator as a Python module with:
@@ -149,8 +149,13 @@ from MolNotator.Dereplicator import Dereplicator
from MolNotator.Cosiner import Cosiner
from MolNotator.MolNet import MolNet

-wd = './examples/working_directory' # <---- change the path to your working directory
+wd = './working_directory' # <---- change the path to your working directory
os.chdir(wd)

+for files in os.listdir(os.getcwd()):
+    if files not in ['databases','mzmine_out','params']:
+        raise Exception('Potential output files already exist! They need to be removed or moved outside the working directory.')

with open("./params/params.yaml") as info:
    params = yaml.load(info, Loader=yaml.FullLoader)
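All run parameters are read from `./params/params.yaml` with PyYAML. A minimal sketch of that loading step, assuming a hypothetical parameter file (the key names below are illustrative, not the real schema):

```python
import yaml

# Hypothetical params.yaml contents (the real keys are defined by MolNotator):
#   mass_error: 0.002
#   rt_error: 0.05
with open("./params/params.yaml") as info:
    params = yaml.load(info, Loader=yaml.FullLoader)  # FullLoader: no arbitrary object construction
print(params["mass_error"])  # -> 0.002 with the hypothetical file above
```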

913 changes: 457 additions & 456 deletions MolNotator/Adnotator.py

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions MolNotator/Cosiner.py
@@ -10,8 +10,8 @@ def Samplewise_export(neg_csv_file, pos_csv_file, out_path, merged_edge_table, m
pos_csv = pd.read_csv(pos_csv_file, index_col ="row ID")
neg_csv = Column_correction(neg_csv)
pos_csv = Column_correction(pos_csv)
-neg_csv.columns = neg_csv.columns.str.replace(".mzXML Peak area", "").str.replace('NEG_', '')
-pos_csv.columns = pos_csv.columns.str.replace(".mzXML Peak area", "").str.replace('POS_', '')
+neg_csv.columns = neg_csv.columns.str.replace(".mzXML Peak area", "", regex = False).str.replace('NEG_', '', regex = False)
+pos_csv.columns = pos_csv.columns.str.replace(".mzXML Peak area", "", regex = False).str.replace('POS_', '', regex = False)
neg_csv.drop(["row m/z", "row retention time"], axis = 1, inplace = True)
pos_csv.drop(["row m/z", "row retention time"], axis = 1, inplace = True)
samples = list(set(list(neg_csv.columns) + list(pos_csv.columns)))
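The `regex = False` flag added above is not cosmetic: the pattern `.mzXML Peak area` starts with `.`, which a regex treats as a wildcard. A minimal sketch of the literal replacement, with toy column names:

```python
import pandas as pd

# "." is a regex wildcard, so with the default regex=True the pattern
# ".mzXML Peak area" could also match e.g. "_mzXML Peak area";
# regex=False matches the literal substring only.
cols = pd.Index(["NEG_SampleA.mzXML Peak area", "NEG_SampleB.mzXML Peak area"])
cleaned = cols.str.replace(".mzXML Peak area", "", regex=False).str.replace("NEG_", "", regex=False)
print(list(cleaned))  # ['SampleA', 'SampleB']
```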
@@ -121,11 +121,11 @@ def Spectrum_processing(s):


# Make a Series with MGF indexes as data and feature IDs as indexes
-neg_mgf_data = pd.Series()
+neg_mgf_data = pd.Series(dtype = int)
for i in range(len(neg_mgf)):
    neg_mgf_data.loc[int(neg_mgf[i].get("feature_id"))] = i

-pos_mgf_data = pd.Series()
+pos_mgf_data = pd.Series(dtype = int)
for i in range(len(pos_mgf)):
    pos_mgf_data.loc[int(pos_mgf[i].get("feature_id"))] = i
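Declaring the dtype matters because an empty `pd.Series()` with no dtype emits a DeprecationWarning on pandas >= 1.0 while the default dtype is slated to change. A small illustration of the mapping being built (the variable name here is hypothetical):

```python
import pandas as pd

# Explicit dtype silences the empty-Series DeprecationWarning and keeps the
# feature_id -> position-in-MGF mapping explicitly integer-valued.
mgf_index_by_feature = pd.Series(dtype=int)
mgf_index_by_feature.loc[101] = 0  # feature 101 is the first spectrum in the MGF
mgf_index_by_feature.loc[205] = 1
print(mgf_index_by_feature.loc[205])  # -> 1
```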

@@ -153,7 +153,7 @@ def Spectrum_processing(s):
remains_ions_pos += list(node_table.loc[unclustered_ions].index[node_table.loc[unclustered_ions]['status'] == 'pos_precursor'])

neg_node_table = node_table[node_table['ion_mode'] == "NEG"]
-cluster_ion_list = pd.Series(index = cluster_list_neg)
+cluster_ion_list = pd.Series(index = cluster_list_neg, dtype = int)
for i in cluster_ion_list.index:
    tmp_rows = neg_node_table.index[neg_node_table['cluster_id'] == i]
    cluster_ion_list[i] = '|'.join(neg_node_table.loc[tmp_rows, 'mgf_index'].dropna().astype(int).astype(str))
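For illustration, the same loop on a toy node table: per cluster, the MGF spectrum positions of its ions are collected into one `|`-separated string. This sketch uses object dtype since the stored values end up as strings (the code above declares int for the empty series):

```python
import pandas as pd

# Toy data: cluster 7 holds ions with MGF positions 3 and 5; cluster 8's
# only ion has no MGF entry, so its string comes out empty.
neg_node_table = pd.DataFrame(
    {"cluster_id": [7, 7, 8], "mgf_index": [3.0, 5.0, None]},
    index=[10, 11, 12],
)
cluster_ion_list = pd.Series(index=[7, 8], dtype=object)
for i in cluster_ion_list.index:
    tmp_rows = neg_node_table.index[neg_node_table["cluster_id"] == i]
    cluster_ion_list[i] = "|".join(
        neg_node_table.loc[tmp_rows, "mgf_index"].dropna().astype(int).astype(str)
    )
print(cluster_ion_list.tolist())  # ['3|5', '']
```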
@@ -201,7 +201,7 @@ def Spectrum_processing(s):
full_matches.append(tmp_table['matches'])

pos_node_table = node_table[node_table['ion_mode'] == "POS"]
-cluster_ion_list = pd.Series(index = cluster_list_pos)
+cluster_ion_list = pd.Series(index = cluster_list_pos, dtype = int)
for i in cluster_ion_list.index:
    tmp_rows = pos_node_table.index[pos_node_table['cluster_id'] == i]
    cluster_ion_list[i] = '|'.join(pos_node_table.loc[tmp_rows, 'mgf_index'].dropna().astype(int).astype(str))
@@ -462,4 +462,4 @@ def Spectrum_processing(s):
out_path = out_path_samples,
merged_edge_table = edge_table,
merged_node_table = node_table)
-    return
+    return
12 changes: 6 additions & 6 deletions MolNotator/Dereplicator.py
@@ -174,8 +174,8 @@ def Samplewise_export(neg_csv_file, pos_csv_file, out_path, merged_edge_table, m
pos_csv = pd.read_csv(pos_csv_file, index_col ="row ID")
neg_csv = Column_correction(neg_csv)
pos_csv = Column_correction(pos_csv)
-neg_csv.columns = neg_csv.columns.str.replace(".mzXML Peak area", "").str.replace('NEG_', '')
-pos_csv.columns = pos_csv.columns.str.replace(".mzXML Peak area", "").str.replace('POS_', '')
+neg_csv.columns = neg_csv.columns.str.replace(".mzXML Peak area", "", regex = False).str.replace('NEG_', '', regex = False)
+pos_csv.columns = pos_csv.columns.str.replace(".mzXML Peak area", "", regex = False).str.replace('POS_', '', regex = False)
neg_csv.drop(["row m/z", "row retention time"], axis = 1, inplace = True)
pos_csv.drop(["row m/z", "row retention time"], axis = 1, inplace = True)
samples = list(set(list(neg_csv.columns) + list(pos_csv.columns)))
@@ -331,7 +331,7 @@ def Samplewise_export(neg_csv_file, pos_csv_file, out_path, merged_edge_table, m
ion_mz = node_table.loc[i, "mz"]
ion_mgf_idx = int(node_table.loc[i, "mgf_index"])
ion_mode = node_table.loc[i, "ion_mode"]
-hits = database_table[database_table['mz'].between(ion_mz - db_prec_error, ion_mz + db_prec_error, inclusive = True)].copy()
+hits = database_table[database_table['mz'].between(ion_mz - db_prec_error, ion_mz + db_prec_error, inclusive = "both")].copy()

# Ion mode filter
if ion_mode == "NEG":
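The recurring `inclusive` changes in this commit track a pandas API change: newer pandas versions replace the boolean `inclusive` argument of `Series.between` with the strings `"both"`, `"neither"`, `"left"`, `"right"`, where `"both"` reproduces the old `inclusive = True`. A toy m/z window around a 150.0 precursor:

```python
import pandas as pd

# "both" keeps rows equal to either bound, matching the old inclusive=True.
mz = pd.Series([149.85, 149.95, 150.05, 150.15])
ion_mz, db_prec_error = 150.0, 0.1
hits = mz.between(ion_mz - db_prec_error, ion_mz + db_prec_error, inclusive="both")
print(hits.tolist())  # [False, True, True, False]
```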
@@ -348,7 +348,7 @@ def Samplewise_export(neg_csv_file, pos_csv_file, out_path, merged_edge_table, m
hits = hits[hits['adduct'] == adduct]
if db_rt_filter:
    rt = node_table.loc[i, 'rt']
-    hits = hits[hits['rt'].between(rt - db_rt_error, rt + db_rt_error, inclusive = True)]
+    hits = hits[hits['rt'].between(rt - db_rt_error, rt + db_rt_error, inclusive = "both")]

# Calculate cosine similarity if hit table is not empty
similarity_list = list()
@@ -461,10 +461,10 @@ def Samplewise_export(neg_csv_file, pos_csv_file, out_path, merged_edge_table, m
for i in tqdm(node_table.index):
    if node_table.loc[i, "status_universal"] == "neutral":
        mass = node_table.loc[i, "mz"]
-        hits = database_table[database_table['mass'].between(mass - db_prec_error, mass + db_prec_error, inclusive = True)].copy()
+        hits = database_table[database_table['mass'].between(mass - db_prec_error, mass + db_prec_error, inclusive = "both")].copy()
        if db_adduct_filter :
            rt = node_table.loc[i, 'rt']
-            hits = hits[hits['rt'].between(rt - db_rt_error, rt + db_rt_error, inclusive = True)]
+            hits = hits[hits['rt'].between(rt - db_rt_error, rt + db_rt_error, inclusive = "both")]
        if len(hits) == 0 :
            new_row = [i, None, None, None] + [None]*len(db_params['db_export_fields'])
            derep_table.append(new_row)
8 changes: 4 additions & 4 deletions MolNotator/Duplicate_filter.py
@@ -76,8 +76,8 @@ def Duplicate_finder(node_table, mgf_file):
candidates.sort()
candidates = pool_table.loc[candidates]

-candidates = candidates[candidates['rt'].between(seed_rt - rt_error, seed_rt + rt_error, inclusive = True)]
-candidates = candidates[candidates['mz'].between(seed_mz - mass_error, seed_mz + mass_error, inclusive = True)]
+candidates = candidates[candidates['rt'].between(seed_rt - rt_error, seed_rt + rt_error, inclusive = "both")]
+candidates = candidates[candidates['mz'].between(seed_mz - mass_error, seed_mz + mass_error, inclusive = "both")]
candidates = candidates.index.tolist()
ion_pool.remove(ion_seed)
ion_pool = list(set(ion_pool) - set(candidates))
@@ -194,9 +194,9 @@ def Duplicate_finder(node_table, mgf_file):
samples_j = csv_file.loc[id_j, samples][samples_j]
for sample in samples_j.index:
    if sample in samples_i.index:
-        csv_file.loc[idx_i, sample] = max(samples_i[sample], samples_j[sample])
+        csv_file.loc[id_i, sample] = max(samples_i[sample], samples_j[sample])
    else:
-        csv_file.loc[idx_i, sample] = samples_j[sample]
+        csv_file.loc[id_i, sample] = samples_j[sample]

# Filter kept ions to eliminate some ions that were missed
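The `idx_i` -> `id_i` fix above repairs a stale variable name in the duplicate-merge step. A self-contained sketch of what the corrected loop does, with toy peak areas and hypothetical feature ids:

```python
import pandas as pd

# When feature id_j duplicates id_i: keep the larger peak area per shared
# sample, and copy over samples only id_j was detected in.
csv_file = pd.DataFrame({"s1": [10.0, 30.0], "s2": [None, 5.0]}, index=[1, 2])
id_i, id_j = 1, 2
samples_i = csv_file.loc[id_i].dropna()
samples_j = csv_file.loc[id_j].dropna()
for sample in samples_j.index:
    if sample in samples_i.index:
        csv_file.loc[id_i, sample] = max(samples_i[sample], samples_j[sample])
    else:
        csv_file.loc[id_i, sample] = samples_j[sample]
print(csv_file.loc[id_i].tolist())  # [30.0, 5.0]
```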

8 changes: 4 additions & 4 deletions MolNotator/Fragnotator.py
@@ -12,7 +12,7 @@ def Rt_slicer(rt, rt_error, ion_id, input_table) :
rt_low = rt - rt_error
rt_high = rt + rt_error
sliced_table = input_table[input_table['rt'].between(rt_low,
-                                                     rt_high, inclusive = True)].copy()
+                                                     rt_high, inclusive = "both")].copy()
return sliced_table.drop(ion_id)
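`Rt_slicer` keeps the ions co-eluting with a seed ion and drops the seed itself; a toy run with hypothetical retention times:

```python
import pandas as pd

# Window of rt +/- rt_error around the seed (ion 0); ion 2 elutes too late.
node_table = pd.DataFrame({"rt": [5.00, 5.03, 5.80]}, index=[0, 1, 2])
rt, rt_error, ion_id = 5.00, 0.05, 0
sliced = node_table[node_table["rt"].between(rt - rt_error, rt + rt_error, inclusive="both")]
print(sliced.drop(ion_id).index.tolist())  # [1]
```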


@@ -74,15 +74,15 @@ def Fragnotator_edge_table(mass_error, rt_error, min_shared_peaks, score_thresho
ion2_mz = node_table['mz'][j]
ion2_mz_low = ion2_mz - mass_error
ion2_mz_high = ion2_mz + mass_error
-match = ion1_msms.between(ion2_mz_low, ion2_mz_high, inclusive = True)
+match = ion1_msms.between(ion2_mz_low, ion2_mz_high, inclusive = "both")
if match.sum() > 0 : # if the frag candidate m/z is found in MSMS:
    ion2_msms = mgf_file[j].peaks.mz # extract frag's MSMS to run tests
    matched_peaks = 0
    total_peaks = list(ion1_msms)
    for frag in ion2_msms : # find the number of matched peaks
        frag_low = frag - mass_error
        frag_high = frag + mass_error
-        frag_found = ion1_msms.between(frag_low, frag_high, inclusive = True).sum()
+        frag_found = ion1_msms.between(frag_low, frag_high, inclusive = "both").sum()
        if frag_found > 0 :
            matched_peaks += 1
        else :
@@ -166,7 +166,7 @@ def Fragnotation():
for i in frag_table.index:
    low_mass = frag_table.loc[i, 'mass'] - mass_error
    high_mass = frag_table.loc[i, 'mass'] + mass_error
-    temp_edge_table = edge_table[edge_table['mz_gap'].between(low_mass, high_mass, inclusive = True)]
+    temp_edge_table = edge_table[edge_table['mz_gap'].between(low_mass, high_mass, inclusive = "both")]
    for j in temp_edge_table.index :
        edge_table.loc[j, 'Fragnotation'] = frag_table.loc[i, 'loss']
return(edge_table)
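Fragnotation labels an edge when its m/z gap matches a known neutral loss within the mass tolerance; a toy run with a water loss (the edge values are hypothetical):

```python
import pandas as pd

# One known loss (H2O, monoisotopic mass ~18.0106); edge 0's gap matches it
# within +/- mass_error, edge 1's does not.
frag_table = pd.DataFrame({"mass": [18.0106], "loss": ["H2O"]})
edge_table = pd.DataFrame({"mz_gap": [18.0105, 44.9977]})
mass_error = 0.002
for i in frag_table.index:
    low_mass = frag_table.loc[i, "mass"] - mass_error
    high_mass = frag_table.loc[i, "mass"] + mass_error
    matched = edge_table[edge_table["mz_gap"].between(low_mass, high_mass, inclusive="both")]
    for j in matched.index:
        edge_table.loc[j, "Fragnotation"] = frag_table.loc[i, "loss"]
print(edge_table["Fragnotation"].tolist())  # ['H2O', nan]
```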
2 changes: 1 addition & 1 deletion MolNotator/MGF_sample_slicer.py
@@ -39,7 +39,7 @@ def mgf_slicer(params : dict, ion_mode : str):

# Get the sample list
samples = pd.Series(csv_file.columns)
-samples = list(samples.str.replace(mzmine_suffix, '.mgf'))
+samples = list(samples.str.replace(mzmine_suffix, '.mgf', regex = False))
csv_file.columns = samples
samples.remove('row ID')
samples.remove('row m/z')
