Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[WIP] adding file loading portion for ms3 #14

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion msql.ebnf
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@ statement: "QUERY" querytype "WHERE" wherefullcondition+ "FILTER" filterfullcond

querytype: datams1data
| datams2data
| function"(" datams1data ")"
| function "(" datams1data ")"
| function "(" datams2data ")"
| function "(" datams3data ")"
| function"(" datams1data "," param "=" floating ")"

datams1data: "MS1DATA"
datams2data: "MS2DATA"
datams3data: "MS3DATA"

wherefullcondition: wherefullcondition booleanconjunction wherefullcondition
| condition ":" qualifier
Expand Down
66 changes: 49 additions & 17 deletions msql_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,36 +40,67 @@ def _load_data(input_filename, cache=False):
if cache:
ms1_filename = input_filename + "_ms1.msql.feather"
ms2_filename = input_filename + "_ms2.msql.feather"
ms3_filename = input_filename + "_ms3.msql.feather"

if os.path.exists(ms1_filename):
ms1_df = pd.read_feather(ms1_filename)
ms2_df = pd.read_feather(ms2_filename)
feather_caches_exist = os.path.exists(ms1_filename) or os.path.exists(ms2_filename) or os.path.exists(ms3_filename)

return ms1_df, ms2_df
if feather_caches_exist:
try:
ms1_df = pd.read_feather(ms1_filename)
except:
ms1_df = pd.DataFrame()

try:
ms2_df = pd.read_feather(ms2_filename)
except:
ms2_df = pd.DataFrame()

try:
ms3_df = pd.read_feather(ms3_filename)
except:
ms3_df = pd.DataFrame()

return ms1_df, ms2_df, ms3_df

# Actually loading
if input_filename[-5:] == ".mzML":
ms1_df, ms2_df = msql_fileloading._load_data_mzML(input_filename)
ms1_df, ms2_df, ms3_df = msql_fileloading._load_data_mzML(input_filename)

if input_filename[-6:] == ".mzXML":
ms1_df, ms2_df = msql_fileloading._load_data_mzXML(input_filename)
ms1_df, ms2_df, ms3_df = msql_fileloading._load_data_mzXML(input_filename)

if input_filename[-5:] == ".json":
ms1_df, ms2_df = msql_fileloading._load_data_gnps_json(input_filename)
ms1_df, ms2_df, ms3_df = msql_fileloading._load_data_gnps_json(input_filename)

if input_filename[-4:] == ".mgf":
ms1_df, ms2_df = msql_fileloading._load_data_mgf(input_filename)
ms1_df, ms2_df, ms3_df = msql_fileloading._load_data_mgf(input_filename)

# Saving Cache
if cache:
ms1_filename = input_filename + "_ms1.msql.feather"
ms2_filename = input_filename + "_ms2.msql.feather"
ms3_filename = input_filename + "_ms3.msql.feather"

if not os.path.exists(ms1_filename):
ms1_df.to_feather(ms1_filename)
ms2_df.to_feather(ms2_filename)
feather_caches_exist = os.path.exists(ms1_filename) or os.path.exists(ms2_filename) or os.path.exists(ms3_filename)

return ms1_df, ms2_df
# Writing out the files
if not feather_caches_exist:
try:
ms1_df.to_feather(ms1_filename)
except:
pass

try:
ms2_df.to_feather(ms2_filename)
except:
pass

try:
ms3_df.to_feather(ms3_filename)
except:
pass

return ms1_df, ms2_df, ms3_df

def _get_ppm_tolerance(qualifiers):
if qualifiers is None:
Expand Down Expand Up @@ -256,7 +287,7 @@ def _evalute_variable_query(parsed_dict, input_filename, cache=True, parallel=Tr
# This is when the target is actually a float
pass

ms1_df, ms2_df = _load_data(input_filename, cache=cache)
ms1_df, ms2_df, ms3_df = _load_data(input_filename, cache=cache)

# Here we are going to translate the variable query into a concrete query based upon the data
all_concrete_queries = []
Expand Down Expand Up @@ -398,21 +429,22 @@ def _evalute_variable_query(parsed_dict, input_filename, cache=True, parallel=Tr
return _executecollate_query(parsed_dict, aggregated_ms1_df, aggregated_ms2_df)

@ray.remote
def _executeconditions_query_ray(parsed_dict, input_filename, ms1_input_df=None, ms2_input_df=None, cache=True):
return _executeconditions_query(parsed_dict, input_filename, ms1_input_df=ms1_input_df, ms2_input_df=ms2_input_df, cache=cache)
def _executeconditions_query_ray(parsed_dict, input_filename, ms1_input_df=None, ms2_input_df=None, ms3_input_df=None, cache=True):
return _executeconditions_query(parsed_dict, input_filename, ms1_input_df=ms1_input_df, ms2_input_df=ms2_input_df, ms3_input_df=ms3_input_df, cache=cache)

def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2_input_df=None, cache=True):
def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2_input_df=None, ms3_input_df=None, cache=True):
# This function attempts to find the data that the query specifies in the conditions

#import json
#print("parsed_dict", json.dumps(parsed_dict, indent=4))

# Let's apply this to real data
if ms1_input_df is None and ms2_input_df is None:
ms1_df, ms2_df = _load_data(input_filename, cache=cache)
ms1_df, ms2_df, ms3_df = _load_data(input_filename, cache=cache)
else:
ms1_df = ms1_input_df
ms2_df = ms2_input_df
ms3_df = ms3_input_df

# In order to handle intensities, we will make sure to sort all conditions with
# with the conditions that are the reference intensity first, then subsequent conditions
Expand Down
12 changes: 8 additions & 4 deletions msql_fileloading.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,9 @@ def _load_data_mgf(input_filename):
# Turning into pandas data frames
ms1_df = pd.DataFrame([peak_dict])
ms2_df = pd.DataFrame(ms2mz_list)
ms3_df = pd.DataFrame()

return ms1_df, ms2_df
return ms1_df, ms2_df, ms3_df

def _load_data_gnps_json(input_filename):
all_spectra = json.loads(open(input_filename).read())
Expand Down Expand Up @@ -64,8 +65,9 @@ def _load_data_gnps_json(input_filename):
# Turning into pandas data frames
ms1_df = pd.DataFrame([peak_dict])
ms2_df = pd.DataFrame(ms2mz_list)
ms3_df = pd.DataFrame()

return ms1_df, ms2_df
return ms1_df, ms2_df, ms3_df

def _load_data_mzXML(input_filename):
ms1mz_list = []
Expand Down Expand Up @@ -112,8 +114,9 @@ def _load_data_mzXML(input_filename):
# Turning into pandas data frames
ms1_df = pd.DataFrame(ms1mz_list)
ms2_df = pd.DataFrame(ms2mz_list)
ms3_df = pd.DataFrame()

return ms1_df, ms2_df
return ms1_df, ms2_df, ms3_df


def _load_data_mzML(input_filename):
Expand Down Expand Up @@ -187,6 +190,7 @@ def _load_data_mzML(input_filename):
# Turning into pandas data frames
ms1_df = pd.DataFrame(ms1mz_list)
ms2_df = pd.DataFrame(ms2mz_list)
ms3_df = pd.DataFrame()

return ms1_df, ms2_df
return ms1_df, ms2_df, ms3_df

10 changes: 9 additions & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,13 @@ def test_ms1_filtered_by_ms2():
results_df = msql_engine.process_query(query, "test/GNPS00002_A3_p.mzML")
print(results_df)

def test_ms3():
query = "QUERY scansum(MS3DATA)"
parse_obj = msql_parser.parse_msql(query)
print(json.dumps(parse_obj, indent=4))
results_df = msql_engine.process_query(query, "test/Toronamide_MS3_DDA_2.mzML")
print(results_df)

def test_intensity_int_parse():
query = "QUERY scaninfo(MS1DATA) WHERE MS1MZ=425.2898:TOLERANCEMZ=0.1:INTENSITYPERCENT>1 AND MS2PROD=353.25:TOLERANCEMZ=0.1:INTENSITYPERCENT>80 AND MS1MZ=478.1991:TOLERANCEMZ=0.1:INTENSITYPERCENT>1"
parse_obj = msql_parser.parse_msql(query)
Expand Down Expand Up @@ -395,12 +402,13 @@ def main():
#test_ms1_cu()
#test_neutral_loss_intensity()
#test_gnps_library()
test_gnps_full_library()
#test_gnps_full_library()
#test_networking_mgf_library()
#test_swath()
#test_albicidin_tag()
#test_double_brominated()
#test_agilent()
test_ms3()

if __name__ == "__main__":
main()
Expand Down
3 changes: 2 additions & 1 deletion test/get_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ wget --output-document=S_N2_neutral_Zn.mzML https://gnps-external.ucsd.edu/massi
wget --output-document=gnps.json https://gnps-external.ucsd.edu/gnpslibrary/ALL_GNPS.json
wget --output-document=gnps-library.json https://gnps-external.ucsd.edu/gnpslibrary/GNPS-LIBRARY.json
wget --output-document=specs_ms.mgf "http://massive.ucsd.edu/ProteoSAFe/DownloadResultFile?task=5ecfcf81cb3c471698995b194d8246a0&block=main&file=spectra/specs_ms.mgf"
wget --output-document=1810E-II.mzML "https://massive.ucsd.edu/ProteoSAFe/DownloadResultFile?file=f.MSV000084691/ccms_peak/1810E-II.mzML&forceDownload=true"
wget --output-document=1810E-II.mzML "https://massive.ucsd.edu/ProteoSAFe/DownloadResultFile?file=f.MSV000084691/ccms_peak/1810E-II.mzML&forceDownload=true"
wget --output-document=Toronamide_MS3_DDA_2.mzML "https://gnps-external.ucsd.edu/massiveftpproxy?ftppath=ftp://massive.ucsd.edu/MSV000084754/ccms_peak/raw/Toronamide_MS3_DDA_2.mzML"