From cf04d46f4414d54b9f22dc98f7c445d3209eb30d Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Fri, 25 Jun 2021 10:51:46 -0700 Subject: [PATCH 1/3] adding file loading portion for ms3 --- msql_fileloading.py | 1 + 1 file changed, 1 insertion(+) diff --git a/msql_fileloading.py b/msql_fileloading.py index 988efd9..fb51f39 100644 --- a/msql_fileloading.py +++ b/msql_fileloading.py @@ -187,6 +187,7 @@ def _load_data_mzML(input_filename): # Turning into pandas data frames ms1_df = pd.DataFrame(ms1mz_list) ms2_df = pd.DataFrame(ms2mz_list) + ms3_df = pd.DataFrame() return ms1_df, ms2_df From 9a49a1a524dc59847d56c21551f1c5c161f8800e Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Wed, 14 Jul 2021 13:17:41 -0700 Subject: [PATCH 2/3] downloading files --- test/get_data.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/get_data.sh b/test/get_data.sh index 068a042..ca155f2 100644 --- a/test/get_data.sh +++ b/test/get_data.sh @@ -7,4 +7,5 @@ wget --output-document=S_N2_neutral_Zn.mzML https://gnps-external.ucsd.edu/massi wget --output-document=gnps.json https://gnps-external.ucsd.edu/gnpslibrary/ALL_GNPS.json wget --output-document=gnps-library.json https://gnps-external.ucsd.edu/gnpslibrary/GNPS-LIBRARY.json wget --output-document=specs_ms.mgf "http://massive.ucsd.edu/ProteoSAFe/DownloadResultFile?task=5ecfcf81cb3c471698995b194d8246a0&block=main&file=spectra/specs_ms.mgf" -wget --output-document=1810E-II.mzML "https://massive.ucsd.edu/ProteoSAFe/DownloadResultFile?file=f.MSV000084691/ccms_peak/1810E-II.mzML&forceDownload=true" \ No newline at end of file +wget --output-document=1810E-II.mzML "https://massive.ucsd.edu/ProteoSAFe/DownloadResultFile?file=f.MSV000084691/ccms_peak/1810E-II.mzML&forceDownload=true" +wget --output-document=Toronamide_MS3_DDA_2.mzML "https://gnps-external.ucsd.edu/massiveftpproxy?ftppath=ftp://massive.ucsd.edu/MSV000084754/ccms_peak/raw/Toronamide_MS3_DDA_2.mzML" \ No newline at end of file From 9dd676899491a3c789df318dbd071c765379246d Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Wed, 14 Jul 2021 15:48:39 -0700 Subject: [PATCH 3/3] partial support for ms3 --- msql.ebnf | 4 ++- msql_engine.py | 66 +++++++++++++++++++++++++++++++++------------ msql_fileloading.py | 11 +++++--- test.py | 10 ++++++- 4 files changed, 68 insertions(+), 23 deletions(-) diff --git a/msql.ebnf b/msql.ebnf index bb67fa9..5bd269e 100644 --- a/msql.ebnf +++ b/msql.ebnf @@ -5,12 +5,14 @@ statement: "QUERY" querytype "WHERE" wherefullcondition+ "FILTER" filterfullcond querytype: datams1data | datams2data - | function"(" datams1data ")" + | function "(" datams1data ")" | function "(" datams2data ")" + | function "(" datams3data ")" | function"(" datams1data "," param "=" floating ")" datams1data: "MS1DATA" datams2data: "MS2DATA" +datams3data: "MS3DATA" wherefullcondition: wherefullcondition booleanconjunction wherefullcondition | condition ":" qualifier diff --git a/msql_engine.py b/msql_engine.py index 31197b5..cd32e94 100644 --- a/msql_engine.py +++ b/msql_engine.py @@ -40,36 +40,67 @@ def _load_data(input_filename, cache=False): if cache: ms1_filename = input_filename + "_ms1.msql.feather" ms2_filename = input_filename + "_ms2.msql.feather" + ms3_filename = input_filename + "_ms3.msql.feather" - if os.path.exists(ms1_filename): - ms1_df = pd.read_feather(ms1_filename) - ms2_df = pd.read_feather(ms2_filename) + feather_caches_exist = os.path.exists(ms1_filename) or os.path.exists(ms2_filename) or os.path.exists(ms3_filename) - return ms1_df, ms2_df + if feather_caches_exist: + try: + ms1_df = pd.read_feather(ms1_filename) + except: + ms1_df = pd.DataFrame() + + try: + ms2_df = pd.read_feather(ms2_filename) + except: + ms2_df = pd.DataFrame() + + try: + ms3_df = pd.read_feather(ms3_filename) + except: + ms3_df = pd.DataFrame() + + return ms1_df, ms2_df, ms3_df # Actually loading if input_filename[-5:] == ".mzML": - ms1_df, ms2_df = msql_fileloading._load_data_mzML(input_filename) + ms1_df, ms2_df, ms3_df = msql_fileloading._load_data_mzML(input_filename) if input_filename[-6:] == ".mzXML": - ms1_df, ms2_df = msql_fileloading._load_data_mzXML(input_filename) + ms1_df, ms2_df, ms3_df = msql_fileloading._load_data_mzXML(input_filename) if input_filename[-5:] == ".json": - ms1_df, ms2_df = msql_fileloading._load_data_gnps_json(input_filename) + ms1_df, ms2_df, ms3_df = msql_fileloading._load_data_gnps_json(input_filename) if input_filename[-4:] == ".mgf": - ms1_df, ms2_df = msql_fileloading._load_data_mgf(input_filename) + ms1_df, ms2_df, ms3_df = msql_fileloading._load_data_mgf(input_filename) # Saving Cache if cache: ms1_filename = input_filename + "_ms1.msql.feather" ms2_filename = input_filename + "_ms2.msql.feather" + ms3_filename = input_filename + "_ms3.msql.feather" - if not os.path.exists(ms1_filename): - ms1_df.to_feather(ms1_filename) - ms2_df.to_feather(ms2_filename) + feather_caches_exist = os.path.exists(ms1_filename) or os.path.exists(ms2_filename) or os.path.exists(ms3_filename) - return ms1_df, ms2_df + # Writing out the files + if not feather_caches_exist: + try: + ms1_df.to_feather(ms1_filename) + except: + pass + + try: + ms2_df.to_feather(ms2_filename) + except: + pass + + try: + ms3_df.to_feather(ms3_filename) + except: + pass + + return ms1_df, ms2_df, ms3_df def _get_ppm_tolerance(qualifiers): if qualifiers is None: @@ -256,7 +287,7 @@ def _evalute_variable_query(parsed_dict, input_filename, cache=True, parallel=Tr # This is when the target is actually a float pass - ms1_df, ms2_df = _load_data(input_filename, cache=cache) + ms1_df, ms2_df, ms3_df = _load_data(input_filename, cache=cache) # Here we are going to translate the variable query into a concrete query based upon the data all_concrete_queries = [] @@ -398,10 +429,10 @@ def _evalute_variable_query(parsed_dict, input_filename, cache=True, parallel=Tr return _executecollate_query(parsed_dict, aggregated_ms1_df, aggregated_ms2_df) @ray.remote -def _executeconditions_query_ray(parsed_dict, input_filename, ms1_input_df=None, ms2_input_df=None, cache=True): - return _executeconditions_query(parsed_dict, input_filename, ms1_input_df=ms1_input_df, ms2_input_df=ms2_input_df, cache=cache) +def _executeconditions_query_ray(parsed_dict, input_filename, ms1_input_df=None, ms2_input_df=None, ms3_input_df=None, cache=True): + return _executeconditions_query(parsed_dict, input_filename, ms1_input_df=ms1_input_df, ms2_input_df=ms2_input_df, ms3_input_df=ms3_input_df, cache=cache) -def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2_input_df=None, cache=True): +def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2_input_df=None, ms3_input_df=None, cache=True): # This function attempts to find the data that the query specifies in the conditions #import json @@ -409,10 +440,11 @@ def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2 # Let's apply this to real data if ms1_input_df is None and ms2_input_df is None: - ms1_df, ms2_df = _load_data(input_filename, cache=cache) + ms1_df, ms2_df, ms3_df = _load_data(input_filename, cache=cache) else: ms1_df = ms1_input_df ms2_df = ms2_input_df + ms3_df = ms3_input_df # In order to handle intensities, we will make sure to sort all conditions with # with the conditions that are the reference intensity first, then subsequent conditions diff --git a/msql_fileloading.py b/msql_fileloading.py index fb51f39..cc37b91 100644 --- a/msql_fileloading.py +++ b/msql_fileloading.py @@ -33,8 +33,9 @@ def _load_data_mgf(input_filename): # Turning into pandas data frames ms1_df = pd.DataFrame([peak_dict]) ms2_df = pd.DataFrame(ms2mz_list) + ms3_df = pd.DataFrame() - return ms1_df, ms2_df + return ms1_df, ms2_df, ms3_df def _load_data_gnps_json(input_filename): all_spectra = json.loads(open(input_filename).read()) @@ -64,8 +65,9 @@ def _load_data_gnps_json(input_filename): # Turning into pandas data frames ms1_df = pd.DataFrame([peak_dict]) ms2_df = pd.DataFrame(ms2mz_list) + ms3_df = pd.DataFrame() - return ms1_df, ms2_df + return ms1_df, ms2_df, ms3_df def _load_data_mzXML(input_filename): ms1mz_list = [] @@ -112,8 +114,9 @@ def _load_data_mzXML(input_filename): # Turning into pandas data frames ms1_df = pd.DataFrame(ms1mz_list) ms2_df = pd.DataFrame(ms2mz_list) + ms3_df = pd.DataFrame() - return ms1_df, ms2_df + return ms1_df, ms2_df, ms3_df def _load_data_mzML(input_filename): @@ -189,5 +192,5 @@ def _load_data_mzML(input_filename): ms2_df = pd.DataFrame(ms2mz_list) ms3_df = pd.DataFrame() - return ms1_df, ms2_df + return ms1_df, ms2_df, ms3_df diff --git a/test.py b/test.py index 225aa11..354cc3e 100644 --- a/test.py +++ b/test.py @@ -234,6 +234,13 @@ def test_ms1_filtered_by_ms2(): results_df = msql_engine.process_query(query, "test/GNPS00002_A3_p.mzML") print(results_df) +def test_ms3(): + query = "QUERY scansum(MS3DATA)" + parse_obj = msql_parser.parse_msql(query) + print(json.dumps(parse_obj, indent=4)) + results_df = msql_engine.process_query(query, "test/Toronamide_MS3_DDA_2.mzML") + print(results_df) + def test_intensity_int_parse(): query = "QUERY scaninfo(MS1DATA) WHERE MS1MZ=425.2898:TOLERANCEMZ=0.1:INTENSITYPERCENT>1 AND MS2PROD=353.25:TOLERANCEMZ=0.1:INTENSITYPERCENT>80 AND MS1MZ=478.1991:TOLERANCEMZ=0.1:INTENSITYPERCENT>1" parse_obj = msql_parser.parse_msql(query) @@ -395,12 +402,13 @@ def main(): #test_ms1_cu() #test_neutral_loss_intensity() #test_gnps_library() - test_gnps_full_library() + #test_gnps_full_library() #test_networking_mgf_library() #test_swath() #test_albicidin_tag() #test_double_brominated() #test_agilent() + test_ms3() if __name__ == "__main__": main()