-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathannotate_variants.py
69 lines (55 loc) · 2.4 KB
/
annotate_variants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""
Add drug_evidence_count, drug_evidence_url that denote knowledgebase results for each
entry in a table and output new table.
"""
import argparse
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import pandas as pd
from g2p_client import G2PDatabase
host = "dms-dev.compbio.ohsu.edu"


def find_gene_column(columns, candidates=('gene', 'Gene', 'Hugo_Symbol')):
    """Return the first name in *candidates* that is present in *columns*.

    Candidates are tried in order, so earlier names take precedence when a
    table contains several of them. Returns None when none is present.
    """
    available = list(columns)
    for name in candidates:
        if name in available:
            return name
    return None


def add_evidence_columns(variants, gene_col_name, count_evidence, url_prefix):
    """Add 'drug_evidence_count' and 'drug_evidence_url' columns to *variants*.

    variants       -- DataFrame to annotate (modified in place and returned).
    gene_col_name  -- name of the column holding each row's gene.
    count_evidence -- callable mapping a gene value to its number of results.
    url_prefix     -- string the gene is appended to in order to form the URL.

    Rows whose gene has no results get a count of 0 and an empty URL.
    """
    gene_values = variants[gene_col_name].tolist()

    # Look each distinct gene up once; tables usually repeat genes many times.
    counts = {}
    for gene in variants[gene_col_name].unique():
        counts[gene] = count_evidence(gene)
    row_counts = [counts[gene] for gene in gene_values]

    # Assign by column name rather than by position: this is independent of
    # the frame's index labels and of where (or whether) the two columns
    # already exist in the input table.
    variants['drug_evidence_count'] = pd.Series(
        row_counts, index=variants.index, dtype='int64')
    variants['drug_evidence_url'] = [
        url_prefix + gene if count > 0 else ''
        for gene, count in zip(gene_values, row_counts)
    ]
    return variants


def main():
    """Read the input table, annotate it and print it as tab-separated text."""
    parser = argparse.ArgumentParser()
    parser.add_argument("in_file", type=str, help="tabular input file")
    # --struct-vars and --label are accepted for command-line compatibility
    # but are not used by the annotation performed here.
    parser.add_argument("-sv", "--struct-vars", help="annotate structural variants", action="store_true")
    parser.add_argument("-l", "--label", help="label added to output", default=None)
    parser.add_argument("-f", "--filtered", help="Print only variants with results", action="store_true")
    args = parser.parse_args()

    # Read table; missing cells become empty strings.
    variants = pd.read_csv(args.in_file, sep="\t").fillna('')

    # Add some flexibility when searching for gene name.
    gene_col_name = find_gene_column(variants.columns)
    if gene_col_name is None:
        # Fail with a clear message instead of a KeyError on an empty name.
        parser.error("no gene column found in %s; expected one of: "
                     "gene, Gene, Hugo_Symbol" % args.in_file)

    # Annotate variants by adding # of pieces of evidence + URL for each variant.
    database = G2PDatabase(host)
    add_evidence_columns(
        variants,
        gene_col_name,
        lambda gene: database.query_by_gene(gene).count(),
        'https://' + host + '/g2p#gene%3A',
    )

    if args.filtered:
        variants = variants[variants['drug_evidence_count'] > 0]

    # Single-argument call form behaves the same on Python 2 and Python 3.
    print(variants.to_csv(sep='\t', index=False, encoding='utf-8'))


if __name__ == '__main__':
    main()
# Alternative annotation (disabled): join each variant with every piece of
# evidence found for its gene, instead of adding a count and a URL.
#
# results_dfs = []
# for index, row in variants.iterrows():
#     s = database.query_by_gene(row['gene'])
#     results_df = database._hits_to_dataframe(s)
#     if len(results_df) > 0:
#         merged_df = row.to_frame().transpose().merge(results_df, on="gene")
#         if label:
#             merged_df.insert(0, 'label', args.label)
#         results_dfs.append(merged_df)
#
# print pd.concat(results_dfs).to_csv(sep='\t', index=False, encoding='utf-8')