Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ADD groot normalizer #60

Merged
merged 8 commits into from
Aug 13, 2024
3 changes: 3 additions & 0 deletions argnorm/data/manual_curation/groot_curation.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Original ID ARO
blaOXA-14_1_L38523 3001409
blaTEM-199_1_JX050178 3001058
53 changes: 33 additions & 20 deletions db_harmonisation/construct_groot_mappings.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import pandas as pd
from argnorm.lib import get_aro_mapping_table
from Bio import SeqIO
from Bio.Seq import translate
from Bio.Seq import translate, reverse_complement, Seq
import subprocess
import requests
import os

def download_file(url, ofile):
os.makedirs('dbs', exist_ok=True)
os.makedirs('mapping', exist_ok=True)
os.makedirs('manual_curation', exist_ok=True)

with open(ofile, 'wb') as f:
r = requests.get(url, stream=True)
Expand Down Expand Up @@ -46,7 +47,7 @@ def download_file(url, ofile):

if id in default_argannot_mappings:
groot_argannot_genes.append(record.id)
groot_argannot_accessions.append(str(argannot_aro_mapping_table.loc[id, 'ARO']).removeprefix('ARO:'))
groot_argannot_accessions.append(str(argannot_aro_mapping_table.loc[id, 'ARO']).replace('ARO:', ''))
else:
# Run this through RGI
SeqIO.write(record, ofile, 'fasta')
Expand All @@ -64,12 +65,16 @@ def download_file(url, ofile):
for record in SeqIO.parse(ifile, 'fasta'):
if str(record.id) in list(resfinder_aro_mapping_table.index):
groot_resfinder_genes.append(record.id)
groot_resfinder_accessions.append(str(resfinder_aro_mapping_table.loc[record.id, 'ARO']).removeprefix('ARO:'))
else:
record.seq = translate(record.seq).removesuffix('*')
print(record)
SeqIO.write(record, ofile, 'fasta')

groot_resfinder_accessions.append(str(resfinder_aro_mapping_table.loc[record.id, 'ARO']).replace('ARO:', ''))
else:
if record.seq.reverse_complement().translate().endswith('*')\
and record.seq.reverse_complement().translate().count('*') == 1:
record.seq = Seq(str(reverse_complement(translate(record.seq))).replace('*', ''))
SeqIO.write(record, ofile, 'fasta')
else:
record.seq = Seq(str(translate(record.seq)).replace('*', ''))
SeqIO.write(record, ofile, 'fasta')

resfinder_groot_mapping = pd.DataFrame(groot_resfinder_genes, columns=['Original ID'])
resfinder_groot_mapping['ARO'] = groot_resfinder_accessions

Expand All @@ -78,25 +83,33 @@ def download_file(url, ofile):
with open('./card-refs.fna', 'r') as ifile:
for record in SeqIO.parse(ifile, 'fasta'):
card_genes.append(str(record.id).split('|')[-1])
card_accessions.append(str(record.id).split('|')[-2].removeprefix('ARO:'))
card_accessions.append(str(record.id).split('|')[-2].replace('ARO:', ''))

card_groot_mapping = pd.DataFrame(card_genes, columns=['Original ID'])
card_groot_mapping['ARO'] = card_accessions

# subprocess.check_call([
# 'rgi',
# 'main',
# '-i', './groot_missing.fasta',
# '-o', './mapping/groot_missing_mapping',
# '-t', 'protein',
# '-a', 'BLAST',
# '--clean',
# '--include_loose'
# ])
groot_missing_genes = []
with open('./groot_missing.fasta', 'r') as ifile:
for record in SeqIO.parse(ifile, 'fasta'):
groot_missing_genes.append(record.id)

subprocess.check_call([
'rgi',
'main',
'-i', './groot_missing.fasta',
'-o', './mapping/groot_missing_mapping',
'-t', 'protein',
'-a', 'BLAST',
'--clean',
'--include_loose'
])

missing_groot_mappings = pd.read_csv('./mapping/groot_missing_mapping.txt', sep='\t')
missing_groot_mappings['Original ID'] = missing_groot_mappings['ORF_ID']
missing_groot_mappings_trim = missing_groot_mappings[['Original ID', 'ARO']]

comb_groot_mapping = pd.concat([argannot_groot_mapping, resfinder_groot_mapping, card_groot_mapping, missing_groot_mappings_trim])
comb_groot_mapping.to_csv('./mapping/groot_ARO_mapping.tsv', sep='\t', index=False)
comb_groot_mapping.to_csv('./mapping/groot_ARO_mapping.tsv', sep='\t', index=False)

groot_manual_curation = pd.DataFrame(list(set(groot_missing_genes) - set(comb_groot_mapping['Original ID'])), columns=['Original ID'])
groot_manual_curation.to_csv('./manual_curation/groot_curation.tsv', sep='\t', index=False)
Loading