From 0a4d00e2ba7703bc8a93bce23748db688dadab0e Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Wed, 25 Aug 2021 16:45:32 -0700 Subject: [PATCH 1/3] enabling more flexible file loading --- README.md | 12 ++++++++++++ massql/msql_engine.py | 26 ++++++++++++++++++++++---- massql/msql_fileloading.py | 28 ++++++++++++++++++++++++++++ tests/test_query.py | 12 +++++++++--- 4 files changed, 71 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 72d6d4b..7927621 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,18 @@ from massql import msql_engine results_df = msql_engine.process_query(input_query, input_filename) ``` +If you want to push in a data frame you already have, you can specify it +``` +from massql import msql_engine +from massql import msql_fileloading + +# Loading Data +ms1_df, ms2_df = msql_fileloading.load_data(input_filename) + +# Executing Query +results_df = msql_engine.process_query(input_query, input_filename, ms1_df=ms1_df, ms2_df=ms2_df) +``` + ## Web API ```/api``` diff --git a/massql/msql_engine.py b/massql/msql_engine.py index 9f62d60..4abf246 100644 --- a/massql/msql_engine.py +++ b/massql/msql_engine.py @@ -48,10 +48,26 @@ def _get_da_tolerance(qualifiers): return None -def process_query(input_query, input_filename, path_to_grammar=None, cache=True, parallel=False): +def process_query(input_query, input_filename, path_to_grammar=None, cache=True, parallel=False, ms1_df=None, ms2_df=None): + """ + Process an actual query + + Args: + input_query ([type]): [description] + input_filename ([type]): [description] + path_to_grammar ([type], optional): [description]. Defaults to None. + cache (bool, optional): [description]. Defaults to True. + parallel (bool, optional): [description]. Defaults to False. + ms1_df ([type], optional): [description]. Defaults to None. Pass in if you have these data in memory + ms2_df ([type], optional): [description]. Defaults to None. Pass in if you have these data in memory + + Returns: + query results data frame: [description] + """ + parsed_dict = msql_parser.parse_msql(input_query, path_to_grammar=path_to_grammar) - return _evalute_variable_query(parsed_dict, input_filename, cache=cache, parallel=parallel) + return _evalute_variable_query(parsed_dict, input_filename, cache=cache, parallel=parallel, ms1_df=ms1_df, ms2_df=ms2_df) def _determine_mz_max(mz, ppm_tol, da_tol): da_tol = da_tol if da_tol < 10000 else 0 @@ -63,7 +79,7 @@ def _determine_mz_max(mz, ppm_tol, da_tol): return mz + half_delta -def _evalute_variable_query(parsed_dict, input_filename, cache=True, parallel=False): +def _evalute_variable_query(parsed_dict, input_filename, cache=True, parallel=False, ms1_df=None, ms2_df=None): # Lets check if there is a variable in here, the only one allowed is X for condition in parsed_dict["conditions"]: try: @@ -138,7 +154,9 @@ def _evalute_variable_query(parsed_dict, input_filename, cache=True, parallel=Fa # This is when the target is actually a float pass - ms1_df, ms2_df = msql_fileloading.load_data(input_filename, cache=cache) + # Loading data if not passed in + if ms1_df is None: + ms1_df, ms2_df = msql_fileloading.load_data(input_filename, cache=cache) # Here we are going to translate the variable query into a concrete query based upon the data all_concrete_queries = [] diff --git a/massql/msql_fileloading.py b/massql/msql_fileloading.py index 14b6327..e48f7c4 100644 --- a/massql/msql_fileloading.py +++ b/massql/msql_fileloading.py @@ -47,6 +47,9 @@ def load_data(input_filename, cache=False): elif input_filename[-4:].lower() == ".mgf": ms1_df, ms2_df = _load_data_mgf(input_filename) + + elif input_filename[-4:].lower() == ".txt": + ms1_df, ms2_df = _load_data_txt(input_filename) else: print("Cannot Load File Extension") @@ -475,3 +478,28 @@ def _load_data_mzML(input_filename): return ms1_df, ms2_df +def _load_data_txt(input_filename): + # We are assuming whitespace separated columns, first is mz, second is intensity, and will be marked as MS1 + mz_list = [] + i_list = [] + for line in open(input_filename): + cleaned_line = line.rstrip() + if len(cleaned_line) == 0: + continue + mz, i = cleaned_line.split() + + mz_list.append(float(mz)) + i_list.append(float(i)) + + ms1_df = pd.DataFrame() + ms1_df['mz'] = mz_list + ms1_df['i'] = i_list + ms1_df['i_norm'] = ms1_df['i'] / max(ms1_df['i']) + ms1_df['i_tic_norm'] = ms1_df['i'] / sum(ms1_df['i']) + ms1_df['scan'] = 1 + ms1_df['rt'] = 0 + ms1_df['polarity'] = "Positive" + + print(ms1_df) + + return ms1_df, pd.DataFrame() diff --git a/tests/test_query.py b/tests/test_query.py index a56e754..197e2e7 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -444,8 +444,14 @@ def test_topdown(): MS2PROD=X+202:TOLERANCEMZ=10:INTENSITYMATCH=Y*0.5:INTENSITYMATCHPERCENT=50 AND \ MS2PROD=X-202:TOLERANCEMZ=10:INTENSITYMATCH=Y*0.5:INTENSITYMATCHPERCENT=50" results_df = msql_engine.process_query(query, "tests/test_data/top_down.mgf") + assert(len(results_df) == 14) + + query = query.replace("MS2DATA", "MS1DATA") + query = query.replace("MS2PROD", "MS1MZ") + + results_df = msql_engine.process_query(query, "tests/test_data/top_down.txt") + assert(len(results_df) == 14) - print(results_df) def test_quad_brominated(): @@ -593,10 +599,10 @@ def main(): #test_translator() #test_ms1_iron_X_changes_intensity() #test_nocache() - #test_topdown() + test_topdown() #test_defect() #test_or_against_iron() - test_quad_brominated() + #test_quad_brominated() if __name__ == "__main__": main() From 278638bdd60e869d6791128dbb082d996f229811 Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Wed, 25 Aug 2021 16:52:34 -0700 Subject: [PATCH 2/3] adding top down test --- proteosafe/msql-nf/test-integration-workflow/test_tasks.csv | 1 + proteosafe/test-integration/test_tasks.csv | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 100644 proteosafe/test-integration/test_tasks.csv diff --git a/proteosafe/msql-nf/test-integration-workflow/test_tasks.csv b/proteosafe/msql-nf/test-integration-workflow/test_tasks.csv index fc0f816..2d978ac 100644 --- a/proteosafe/msql-nf/test-integration-workflow/test_tasks.csv +++ b/proteosafe/msql-nf/test-integration-workflow/test_tasks.csv @@ -4,3 +4,4 @@ f875745343574b1680138b6e12103afd,sanity test,query_results db21c5c316034b09a74668b53828b39b,Trimethoprem,query_results 3290b45278624b9b877bdee60dfbc7b3,Brominated,query_results fcb68f55a00c4ea39153266cd2bd42de,Iron OR queries,query_results +0e132d964fef4f1a9f1a80d95b352920,Top Down Test,query_results \ No newline at end of file diff --git a/proteosafe/test-integration/test_tasks.csv b/proteosafe/test-integration/test_tasks.csv deleted file mode 100644 index 26ef259..0000000 --- a/proteosafe/test-integration/test_tasks.csv +++ /dev/null @@ -1,2 +0,0 @@ -task_id,description,regressioncountviews -eba6034869fe419c87ce2b507339b04d,iron query, \ No newline at end of file From 1ca0b82135a39663a06d782b98b3665a43b3a8ee Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Wed, 25 Aug 2021 17:04:14 -0700 Subject: [PATCH 3/3] adding missing file --- tests/test_data/top_down.txt | 58 ++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 tests/test_data/top_down.txt diff --git a/tests/test_data/top_down.txt b/tests/test_data/top_down.txt new file mode 100644 index 0000000..0ed7eaa --- /dev/null +++ b/tests/test_data/top_down.txt @@ -0,0 +1,58 @@ +132465.09 12.546343 +133225.4931 12.19392 +133989.6146 18.382821 +134744.8918 16.622169 +135508.8453 29.15272 +135709.4353 12.315019 +136269.7202 23.290751 +136474.0863 16.300753 +136823.8683 13.401818 +137025.2584 42.788894 +137234.0651 14.446877 +137584.0828 13.276068 +137785.3407 36.485754 +137995.0349 14.735787 +138340.1478 20.745298 +138549.298 62.669336 +138754.2691 23.062884 +139105.4871 23.903191 +139304.9367 54.209214 +139509.8068 24.567254 +139864.7173 23.295769 +140065.2515 62.811609 +140270.0596 30.497242 +140623.8754 29.028855 +140826.022 61.716949 +141035.2999 20.697356 +141383.9161 30.244086 +141585.5889 76.422158 +141794.0949 34.943156 +142144.3674 54.074194 +142345.3573 100 +142553.5582 42.638963 +142904.3269 35.756392 +143105.3558 66.716483 +143310.0193 33.103042 +143665.4048 35.090121 +143865.6924 70.924197 +144073.8344 34.686524 +144424.0522 27.382454 +144625.9266 69.20136 +144833.5925 26.57023 +145185.3049 31.549792 +145385.8676 84.545703 +145589.8953 39.743775 +145944.8025 28.254461 +146149.9511 71.468122 +146355.289 24.295305 +146704.5286 19.559481 +146905.8959 62.213523 +147106.5088 29.916714 +147669.7064 43.913333 +147873.7765 17.187839 +148426.2304 39.17724 +148634.4224 20.539458 +149190.0231 19.643502 +149395.1611 10.928549 +149953.621 15.454649 +150715.2934 10.336184 \ No newline at end of file