[Do Not Merge] [Grammar/Engine] Supporting multiple m/z in product search #13

Open · wants to merge 4 commits into master
7 changes: 5 additions & 2 deletions msql.ebnf
@@ -20,10 +20,13 @@ filterfullcondition: filterfullcondition booleanconjunction filterfullcondition
     | condition ":" qualifier
     | condition

-condition: conditionfields "=" floating
-    | conditionfields equal variable
+// Conditions
+condition: conditionfields equal conditionvalue
     | conditionfields equal "(" statement ")"

+conditionvalue: conditionvalue "," conditionvalue
+    | variable
+    | floating

 qualifier: qualifier ":" qualifier
     | qualifierfields equal floating
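Reviewer note: the new conditionvalue rule is what lets the right-hand side of a condition carry a comma-separated list of values. A minimal, runnable sketch of the idea, assuming the Lark toolchain that the transformer-style methods in msql_parser.py suggest (the mini-grammar below is a trimmed-down illustration, not the actual msql.ebnf):

```python
# Minimal sketch of a comma-separated conditionvalue rule (assumes Lark;
# trimmed-down illustration, not the real msql.ebnf grammar).
from lark import Lark, Transformer

MINI_GRAMMAR = r"""
conditionvalue: conditionvalue "," conditionvalue
              | NUMBER

%import common.NUMBER
%import common.WS
%ignore WS
"""

class ValueFlattener(Transformer):
    def conditionvalue(self, items):
        if len(items) == 1:
            return [float(items[0])]   # single NUMBER token
        return items[0] + items[1]     # merge two already-flattened lists

parser = Lark(MINI_GRAMMAR, start="conditionvalue")
flatten = ValueFlattener().transform

print(flatten(parser.parse("271.06")))         # [271.06]
print(flatten(parser.parse("271.06,217.1")))   # [271.06, 217.1]
```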
109 changes: 62 additions & 47 deletions msql_engine.py
@@ -461,23 +461,32 @@ def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2

         # Filtering MS2 Product Ions
         if condition["type"] == "ms2productcondition":
-            mz = condition["value"][0]
-            mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
-            mz_min = mz - mz_tol
-            mz_max = mz + mz_tol
+            filtered_scans = set()
+            for mz in condition["value"]:
+                mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
+                mz_min = mz - mz_tol
+                mz_max = mz + mz_tol

-            min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))
+                min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))

-            ms2_filtered_df = ms2_df[(ms2_df["mz"] > mz_min) & (ms2_df["mz"] < mz_max) & (ms2_df["i"] > min_int) & (ms2_df["i_norm"] > min_intpercent)]
+                ms2_filtered_df = ms2_df[
+                    (ms2_df["mz"] > mz_min) &
+                    (ms2_df["mz"] < mz_max) &
+                    (ms2_df["i"] > min_int) &
+                    (ms2_df["i_norm"] > min_intpercent)
+                ]

-            # Setting the intensity match register
-            _set_intensity_register(ms2_filtered_df, reference_conditions_register, condition)
+                # Setting the intensity match register
+                _set_intensity_register(ms2_filtered_df, reference_conditions_register, condition)

-            # Applying the intensity match
-            ms2_filtered_df = _filter_intensitymatch(ms2_filtered_df, reference_conditions_register, condition)
+                # Applying the intensity match
+                ms2_filtered_df = _filter_intensitymatch(ms2_filtered_df, reference_conditions_register, condition)
+
+                if len(ms2_filtered_df) > 0:
+                    # Getting union of all scans
+                    filtered_scans = filtered_scans.union(set(ms2_filtered_df["scan"]))

             # Filtering the actual data structures
-            filtered_scans = set(ms2_filtered_df["scan"])
             ms2_df = ms2_df[ms2_df["scan"].isin(filtered_scans)]

             # Filtering the MS1 data now
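Reviewer note on the semantics here: each value in the comma-separated list is filtered independently and the matching scans are unioned, so MS2PROD=271.06,217.1 keeps scans containing either product ion (an OR across the listed values), while separate WHERE conditions still intersect as before. A standalone sketch of that union pattern (the column names mz, i, i_norm, and scan mirror the dataframes in msql_engine.py; the helper itself is illustrative and not part of this PR):

```python
# Standalone sketch of the union-of-scans pattern used above; illustrative
# only -- column names mirror msql_engine.py, but this helper is not from the PR.
import pandas as pd

def scans_matching_any_product(ms2_df, mz_values, mz_tol=0.1,
                               min_int=0, min_intpercent=0):
    """Return the scans containing at least one of the listed product m/z values."""
    matched_scans = set()
    for mz in mz_values:
        hits = ms2_df[
            (ms2_df["mz"] > mz - mz_tol) &
            (ms2_df["mz"] < mz + mz_tol) &
            (ms2_df["i"] > min_int) &
            (ms2_df["i_norm"] > min_intpercent)
        ]
        if len(hits) > 0:
            matched_scans |= set(hits["scan"])   # union, i.e. OR across values
    return matched_scans

ms2_df = pd.DataFrame({
    "scan":   [1, 1, 2, 3],
    "mz":     [271.06, 100.00, 217.10, 500.00],
    "i":      [1000, 50, 800, 10],
    "i_norm": [1.0, 0.05, 0.9, 0.01],
})
print(scans_matching_any_product(ms2_df, [271.06, 217.1]))   # {1, 2}
```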
@@ -506,28 +515,32 @@ def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2

         # Filtering MS2 Neutral Loss
         if condition["type"] == "ms2neutrallosscondition":
-            mz = condition["value"][0]
-            mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
-            nl_min = mz - mz_tol
-            nl_max = mz + mz_tol
+            filtered_scans = set()
+            for mz in condition["value"]:
+                mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
+                nl_min = mz - mz_tol
+                nl_max = mz + mz_tol

-            min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))
+                min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))

-            ms2_filtered_df = ms2_df[
-                ((ms2_df["precmz"] - ms2_df["mz"]) > nl_min) &
-                ((ms2_df["precmz"] - ms2_df["mz"]) < nl_max) &
-                (ms2_df["i"] > min_int) &
-                (ms2_df["i_norm"] > min_intpercent)
-            ]
+                ms2_filtered_df = ms2_df[
+                    ((ms2_df["precmz"] - ms2_df["mz"]) > nl_min) &
+                    ((ms2_df["precmz"] - ms2_df["mz"]) < nl_max) &
+                    (ms2_df["i"] > min_int) &
+                    (ms2_df["i_norm"] > min_intpercent)
+                ]

-            # Setting the intensity match register
-            _set_intensity_register(ms2_filtered_df, reference_conditions_register, condition)
+                # Setting the intensity match register
+                _set_intensity_register(ms2_filtered_df, reference_conditions_register, condition)

-            # Applying the intensity match
-            ms2_filtered_df = _filter_intensitymatch(ms2_filtered_df, reference_conditions_register, condition)
+                # Applying the intensity match
+                ms2_filtered_df = _filter_intensitymatch(ms2_filtered_df, reference_conditions_register, condition)
+
+                if len(ms2_filtered_df) > 0:
+                    # Getting union of all scans
+                    filtered_scans = filtered_scans.union(set(ms2_filtered_df["scan"]))

             # Filtering the actual data structures
-            filtered_scans = set(ms2_filtered_df["scan"])
             ms2_df = ms2_df[ms2_df["scan"].isin(filtered_scans)]

             # Filtering the MS1 data now
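The neutral-loss branch gets the same per-value treatment, with the loss computed as precursor m/z minus fragment m/z. A tiny illustrative example of that arithmetic (not code from this PR):

```python
# Illustrative only: the neutral loss tested above is precursor m/z minus
# fragment m/z, and each listed value is compared against that difference.
import pandas as pd

ms2_df = pd.DataFrame({"scan":   [1, 2],
                       "precmz": [448.10, 390.20],
                       "mz":     [272.07, 380.00]})

nl, tol = 176.0321, 0.1
losses = ms2_df["precmz"] - ms2_df["mz"]              # ~176.03 and ~10.20
matches = ms2_df[(losses > nl - tol) & (losses < nl + tol)]
print(set(matches["scan"]))                           # {1}
```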
@@ -538,33 +551,35 @@ def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2

         # finding MS1 peaks
         if condition["type"] == "ms1mzcondition":
-            mz = condition["value"][0]
-            mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
-            mz_min = mz - mz_tol
-            mz_max = mz + mz_tol
+            filtered_scans = set()
+            for mz in condition["value"]:
+                mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
+                mz_min = mz - mz_tol
+                mz_max = mz + mz_tol

-            min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))
-            ms1_filtered_df = ms1_df[
-                (ms1_df["mz"] > mz_min) &
-                (ms1_df["mz"] < mz_max) &
-                (ms1_df["i"] > min_int) &
-                (ms1_df["i_norm"] > min_intpercent)]
-
-            #print("YYY", mz_min, mz_max, min_int, min_intpercent, len(ms1_filtered_df))
+                min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))
+                ms1_filtered_df = ms1_df[
+                    (ms1_df["mz"] > mz_min) &
+                    (ms1_df["mz"] < mz_max) &
+                    (ms1_df["i"] > min_int) &
+                    (ms1_df["i_norm"] > min_intpercent)]
+                #print("YYY", mz_min, mz_max, min_int, min_intpercent, len(ms1_filtered_df))

-            # Setting the intensity match register
-            _set_intensity_register(ms1_filtered_df, reference_conditions_register, condition)
+                # Setting the intensity match register
+                _set_intensity_register(ms1_filtered_df, reference_conditions_register, condition)

-            # Applying the intensity match
-            ms1_filtered_df = _filter_intensitymatch(ms1_filtered_df, reference_conditions_register, condition)
+                # Applying the intensity match
+                ms1_filtered_df = _filter_intensitymatch(ms1_filtered_df, reference_conditions_register, condition)

             #print(ms1_filtered_df)
+            if len(ms1_filtered_df) > 0:
+                # Getting union of all scans
+                filtered_scans = filtered_scans.union(set(ms1_filtered_df["scan"]))

-            if len(ms1_filtered_df) == 0:
+            if filtered_scans == 0:
                 return pd.DataFrame(), pd.DataFrame()

             # Filtering the actual data structures
-            filtered_scans = set(ms1_filtered_df["scan"])
             ms1_df = ms1_df[ms1_df["scan"].isin(filtered_scans)]
             ms2_df = ms2_df[ms2_df["ms1scan"].isin(filtered_scans)]
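Two things worth flagging for review in this MS1 branch: `filtered_scans == 0` compares a set against an integer, which is always False in Python, so the early return for "nothing matched" can never fire; and, as reconstructed here, the scan-union block appears to sit outside the per-m/z loop (unlike the MS2 branches above), which would mean only the last listed value contributes. A hedged sketch of the emptiness guard that was presumably intended:

```python
# Hypothetical emptiness guard for the early return above: a set never
# compares equal to 0, so test its length (or truthiness) instead.
import pandas as pd

def guard(filtered_scans):
    if len(filtered_scans) == 0:          # equivalently: if not filtered_scans:
        return pd.DataFrame(), pd.DataFrame()
    return None

print(guard(set()))      # two empty DataFrames -> early return taken
print(guard({1, 2}))     # None -> fall through and keep filtering
```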

@@ -583,7 +598,7 @@ def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2
             if not condition["conditiontype"] == "filter":
                 continue

-            logging.error("FILTER CONDITION", condition)
+            #logging.error("FILTER CONDITION", condition)

             # filtering MS1 peaks
             if condition["type"] == "ms1mzcondition":
14 changes: 13 additions & 1 deletion msql_parser.py
@@ -87,7 +87,10 @@ def qualifier(self, items):
     def condition(self, items):
         condition_dict = {}
         condition_dict["type"] = items[0].children[0]
-        condition_dict["value"] = [items[-1]]
+        if type(items[-1]) is dict:
+            condition_dict["value"] = [items[-1]]
+        else:
+            condition_dict["value"] = items[-1]
         return condition_dict

     def wherefullcondition(self, items):
@@ -159,7 +162,16 @@ def filterfullcondition(self, items):
             merged_list += items[-1]

             return merged_list

+    def conditionvalue(self, items):
+        if len(items) == 1:
+            return items
+        if len(items) == 2:
+            merged_list = []
+            merged_list += items[0]
+            merged_list += items[-1]
+
+            return merged_list

     def querytype(self, items):
         query_dict = {}
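Reviewer note: conditionvalue() flattens the comma-separated values into a plain list bottom-up, so condition() now only needs to wrap the value when it is a nested sub-query dict. An illustrative mirror of that branching (not the transformer itself):

```python
# Illustrative mirror of the updated condition() branch: a nested sub-query
# (a dict) still gets wrapped in a one-element list, while values produced by
# conditionvalue() already arrive as a list and are stored unchanged.
def normalize_condition_value(last_item):
    if type(last_item) is dict:
        return [last_item]                 # e.g. a nested QUERY (...) result
    return last_item                       # e.g. [271.06] or [271.06, 217.1]

print(normalize_condition_value([271.06]))                  # [271.06]
print(normalize_condition_value([271.06, 217.1]))           # [271.06, 217.1]
print(normalize_condition_value({"querytype": "subquery"})) # [{'querytype': 'subquery'}]
```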
20 changes: 16 additions & 4 deletions test.py
@@ -48,6 +48,8 @@ def test_qc_ms1_ms2peak():
     print(set(results_df["scan"]))
     assert(len(results_df) > 1000)

+
+
 def test_diphen():
     query = "QUERY scannum(MS2DATA) WHERE MS2PROD=167.0857:TOLERANCEPPM=5"
     print(msql_parser.parse_msql(query))
@@ -105,8 +107,8 @@ def test_variable_ms1():
 def test_subquery():
     #query = "QUERY scanrangesum(MS1DATA, TOLERANCE=0.1) WHERE MS1MZ=(QUERY scanmz(MS2DATA) WHERE MS2NL=176.0321 AND MS2PROD=85.02915)"
     query = "QUERY MS1DATA WHERE MS1MZ=(QUERY scanmz(MS2DATA) WHERE MS2NL=176.0321 AND MS2PROD=85.02915)"
-    results_df = msql_engine.process_query(query, "test/GNPS00002_A3_p.mzML")
     print(json.dumps(msql_parser.parse_msql(query), indent=4))
+    results_df = msql_engine.process_query(query, "test/GNPS00002_A3_p.mzML")
     print(results_df)

 def test_filter():
@@ -280,6 +282,14 @@ def test_gnps_full_library():
     results_df = msql_engine.process_query(query, "test/gnps.json")
     print(results_df)

+def test_multiple_mz():
+    query = "QUERY scaninfo(MS2DATA) WHERE \
+        MS2PROD=271.06,217.1"
+    parse_obj = msql_parser.parse_msql(query)
+    print(parse_obj)
+
+    results_df = msql_engine.process_query(query, "test/GNPS00002_A3_p.mzML")
+    print(results_df)

 def test_networking_mgf_library():
     query = "QUERY scaninfo(MS2DATA) WHERE \
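A hedged sketch of an assertion this new test could grow once the output shape settles; the "conditions" key and float-typed values are assumed from the parser code above, not verified output:

```python
# Hypothetical follow-up assertion for test_multiple_mz(); the "conditions"
# key and float-typed values are assumptions, not verified parser output.
import msql_parser

parse_obj = msql_parser.parse_msql(
    "QUERY scaninfo(MS2DATA) WHERE MS2PROD=271.06,217.1")
ms2prod = [c for c in parse_obj["conditions"]
           if c["type"] == "ms2productcondition"]
assert ms2prod[0]["value"] == [271.06, 217.1]
```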
@@ -348,7 +358,8 @@ def test_parse():
     for line in open("test_queries.txt"):
         test_query = line.rstrip()
         print(test_query)
-        msql_parser.parse_msql(test_query)
+        parsed_result = msql_parser.parse_msql(test_query)
+        assert(parsed_result is not None)

 def test_query():
     for line in open("test_queries.txt"):
@@ -372,7 +383,7 @@ def main():
     #test_parse()
     #test_query()
     #test_xic()
-    #test_subquery()
+    test_subquery()
     #test_variable_parse()
     #test_variable()
     #test_variable_ms1()
@@ -395,12 +406,13 @@ def main():
     #test_ms1_cu()
     #test_neutral_loss_intensity()
     #test_gnps_library()
-    test_gnps_full_library()
+    #test_gnps_full_library()
     #test_networking_mgf_library()
     #test_swath()
     #test_albicidin_tag()
     #test_double_brominated()
     #test_agilent()
+    #test_multiple_mz()

 if __name__ == "__main__":
     main()
3 changes: 2 additions & 1 deletion test_queries.txt
@@ -10,4 +10,5 @@ QUERY scannum(MS2DATA) WHERE MS2PROD=88:TOLERANCEMZ=0.1:INTENSITYPERCENT>10 AND
 QUERY scannum(MS2DATA) WHERE MS2NL=163
 QUERY scaninfo(MS1DATA) WHERE MS1MZ=425.2898:TOLERANCEMZ=0.1:INTENSITYPERCENT>0.1
 QUERY scaninfo(MS1DATA) WHERE MS1MZ=425.2898:TOLERANCEMZ=0.1:INTENSITYPERCENT>1
-QUERY scaninfo(MS1DATA) WHERE RTMIN=50
+QUERY scaninfo(MS1DATA) WHERE RTMIN=50
+QUERY scaninfo(MS1DATA) WHERE MS1MZ=425.2898,426.289