Merge pull request #146 from mwang87/new-api

[Engine] Creating new API and docs for file loading and calling query
mwang87 · Aug 26, 2021 · 46a7b05 · 46a7b05
2 parents 74e09ac + 1ca0b82
commit 46a7b05
Show file tree

Hide file tree

Showing 7 changed files with 130 additions and 9 deletions.
diff --git a/README.md b/README.md
@@ -40,6 +40,18 @@ from massql import msql_engine
 results_df = msql_engine.process_query(input_query, input_filename)
 ```
 
+If you have already loaded the MS1/MS2 data frames, you can pass them in directly so the file is not read again:
+```
+from massql import msql_engine
+from massql import msql_fileloading
+
+# Loading Data
+ms1_df, ms2_df = msql_fileloading.load_data(input_filename)
+
+# Executing Query
+results_df = msql_engine.process_query(input_query, input_filename, ms1_df=ms1_df, ms2_df=ms2_df)
+```
+
 ## Web API
 
 ```/api```

diff --git a/massql/msql_engine.py b/massql/msql_engine.py
@@ -48,10 +48,26 @@ def _get_da_tolerance(qualifiers):
     return None
 
 
-def process_query(input_query, input_filename, path_to_grammar=None, cache=True, parallel=False):
+def process_query(input_query, input_filename, path_to_grammar=None, cache=True, parallel=False, ms1_df=None, ms2_df=None):
+    """
+    Parse a query string and evaluate it against a data file or pre-loaded data frames
+
+    Args:
+        input_query: query text, parsed with msql_parser.parse_msql.
+        input_filename: path of the data file; it is loaded only when ms1_df is None.
+        path_to_grammar (optional): forwarded to msql_parser.parse_msql. Defaults to None.
+        cache (bool, optional): forwarded to msql_fileloading.load_data when the file is loaded. Defaults to True.
+        parallel (bool, optional): forwarded to _evalute_variable_query. Defaults to False.
+        ms1_df (optional): data frame as returned by msql_fileloading.load_data. Defaults to None. Pass in if you have these data in memory
+        ms2_df (optional): companion of ms1_df. Defaults to None. Replaced by the loaded data whenever ms1_df is None, so pass both together
+
+    Returns:
+        Whatever _evalute_variable_query returns (the README treats it as a results data frame)
+    """
+
     parsed_dict = msql_parser.parse_msql(input_query, path_to_grammar=path_to_grammar)
 
-    return _evalute_variable_query(parsed_dict, input_filename, cache=cache, parallel=parallel)
+    return _evalute_variable_query(parsed_dict, input_filename, cache=cache, parallel=parallel, ms1_df=ms1_df, ms2_df=ms2_df)
 
 def _determine_mz_max(mz, ppm_tol, da_tol):
     da_tol = da_tol if da_tol < 10000 else 0
@@ -63,7 +79,7 @@ def _determine_mz_max(mz, ppm_tol, da_tol):
 
     return mz + half_delta
 
-def _evalute_variable_query(parsed_dict, input_filename, cache=True, parallel=False):
+def _evalute_variable_query(parsed_dict, input_filename, cache=True, parallel=False, ms1_df=None, ms2_df=None):
     # Lets check if there is a variable in here, the only one allowed is X
     for condition in parsed_dict["conditions"]:
         try:
@@ -138,7 +154,9 @@ def _evalute_variable_query(parsed_dict, input_filename, cache=True, parallel=Fa
                 # This is when the target is actually a float
                 pass
 
-    ms1_df, ms2_df = msql_fileloading.load_data(input_filename, cache=cache)
+    # Loading data if not passed in 
+    if ms1_df is None:
+        ms1_df, ms2_df = msql_fileloading.load_data(input_filename, cache=cache)
 
     # Here we are going to translate the variable query into a concrete query based upon the data
     all_concrete_queries = []

diff --git a/massql/msql_fileloading.py b/massql/msql_fileloading.py
@@ -47,6 +47,9 @@ def load_data(input_filename, cache=False):
 
     elif input_filename[-4:].lower() == ".mgf":
         ms1_df, ms2_df = _load_data_mgf(input_filename)
+
+    elif input_filename[-4:].lower() == ".txt":
+        ms1_df, ms2_df = _load_data_txt(input_filename)
 
     else:
         print("Cannot Load File Extension")
@@ -475,3 +478,28 @@ def _load_data_mzML(input_filename):
 
     return ms1_df, ms2_df
 
+def _load_data_txt(input_filename):
+    """Load a whitespace-separated peak list (mz, then intensity) as a single MS1 scan.
+
+    Blank lines are skipped. Returns (ms1_df, ms2_df); ms2_df is always an empty frame.
+    """
+    mz_list = []
+    i_list = []
+    # Context manager so the file handle is closed even when a line fails to parse
+    with open(input_filename) as input_file:
+        for line in input_file:
+            fields = line.split()
+            if not fields:
+                continue
+            mz, i = fields  # exactly two columns expected; anything else raises ValueError
+            mz_list.append(float(mz))
+            i_list.append(float(i))
+
+    ms1_df = pd.DataFrame({'mz': mz_list, 'i': i_list})
+    ms1_df['i_norm'] = ms1_df['i'] / ms1_df['i'].max()
+    ms1_df['i_tic_norm'] = ms1_df['i'] / ms1_df['i'].sum()
+    ms1_df['scan'] = 1
+    ms1_df['rt'] = 0
+    ms1_df['polarity'] = "Positive"
+
+    return ms1_df, pd.DataFrame()
diff --git a/proteosafe/msql-nf/test-integration-workflow/test_tasks.csv b/proteosafe/msql-nf/test-integration-workflow/test_tasks.csv
@@ -4,3 +4,4 @@ f875745343574b1680138b6e12103afd,sanity test,query_results
 db21c5c316034b09a74668b53828b39b,Trimethoprem,query_results
 3290b45278624b9b877bdee60dfbc7b3,Brominated,query_results
 fcb68f55a00c4ea39153266cd2bd42de,Iron OR queries,query_results
+0e132d964fef4f1a9f1a80d95b352920,Top Down Test,query_results
diff --git a/proteosafe/test-integration/test_tasks.csv b/proteosafe/test-integration/test_tasks.csv
diff --git a/tests/test_data/top_down.txt b/tests/test_data/top_down.txt
@@ -0,0 +1,58 @@
+132465.09	12.546343
+133225.4931	12.19392
+133989.6146	18.382821
+134744.8918	16.622169
+135508.8453	29.15272
+135709.4353	12.315019
+136269.7202	23.290751
+136474.0863	16.300753
+136823.8683	13.401818
+137025.2584	42.788894
+137234.0651	14.446877
+137584.0828	13.276068
+137785.3407	36.485754
+137995.0349	14.735787
+138340.1478	20.745298
+138549.298	62.669336
+138754.2691	23.062884
+139105.4871	23.903191
+139304.9367	54.209214
+139509.8068	24.567254
+139864.7173	23.295769
+140065.2515	62.811609
+140270.0596	30.497242
+140623.8754	29.028855
+140826.022	61.716949
+141035.2999	20.697356
+141383.9161	30.244086
+141585.5889	76.422158
+141794.0949	34.943156
+142144.3674	54.074194
+142345.3573	100
+142553.5582	42.638963
+142904.3269	35.756392
+143105.3558	66.716483
+143310.0193	33.103042
+143665.4048	35.090121
+143865.6924	70.924197
+144073.8344	34.686524
+144424.0522	27.382454
+144625.9266	69.20136
+144833.5925	26.57023
+145185.3049	31.549792
+145385.8676	84.545703
+145589.8953	39.743775
+145944.8025	28.254461
+146149.9511	71.468122
+146355.289	24.295305
+146704.5286	19.559481
+146905.8959	62.213523
+147106.5088	29.916714
+147669.7064	43.913333
+147873.7765	17.187839
+148426.2304	39.17724
+148634.4224	20.539458
+149190.0231	19.643502
+149395.1611	10.928549
+149953.621	15.454649
+150715.2934	10.336184
diff --git a/tests/test_query.py b/tests/test_query.py
@@ -444,8 +444,14 @@ def test_topdown():
 MS2PROD=X+202:TOLERANCEMZ=10:INTENSITYMATCH=Y*0.5:INTENSITYMATCHPERCENT=50 AND \
 MS2PROD=X-202:TOLERANCEMZ=10:INTENSITYMATCH=Y*0.5:INTENSITYMATCHPERCENT=50"
     results_df = msql_engine.process_query(query, "tests/test_data/top_down.mgf")
+    assert(len(results_df) == 14)
+
+    query = query.replace("MS2DATA", "MS1DATA")
+    query = query.replace("MS2PROD", "MS1MZ")
+
+    results_df = msql_engine.process_query(query, "tests/test_data/top_down.txt")
+    assert(len(results_df) == 14)
 
-    print(results_df)
 
 
 def test_quad_brominated():
@@ -593,10 +599,10 @@ def main():
     #test_translator()
     #test_ms1_iron_X_changes_intensity()
     #test_nocache()
-    #test_topdown()
+    test_topdown()
     #test_defect()
     #test_or_against_iron()
-    test_quad_brominated()
+    #test_quad_brominated()
 
 if __name__ == "__main__":
     main()