Skip to content

Commit

Permalink
script: produces sample_input.fasta for 10 strains
Browse files Browse the repository at this point in the history
  • Loading branch information
mkayasth authored May 1, 2024
1 parent 0131921 commit cc80cd4
Showing 1 changed file with 45 additions and 0 deletions.
45 changes: 45 additions & 0 deletions Sample Input FASTA/sarsCov2DataExtract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from Bio import Entrez, SeqIO
import requests

# Contact address set on Bio.Entrez before any request is made.
# Replace the placeholder below with YOUR own e-mail address before running.
# NOTE(review): the current value looks like a redacted placeholder rather
# than a usable address -- confirm and substitute a real one.
Entrez.email = "[email protected]"

# Nucleotide accession number -> strain label used as the FASTA header.
# Insertion order is the order in which records are fetched and written.
accession_to_strain = {
    "NC_045512": "Wuhan",
    "OR075545": "XBB.1.16",
    "OQ608429": "XBB.1.5",
    "OR598953": "EG.5",
    "PP292788": "AY.3",
    "OR813619": "BA.2.86",
    "OR829491": "CH.1.1",
    "PP250483": "BF.10",
    "PP316714": "JN.1",
    "PP435534": "HV.1",
    "OQ437945": "B.1.1.7",
    "PP421053": "P.1",
    "PP299611": "B.1.617.2",
    "PP292591": "BE.1",
    "OQ938406": "DN.2",
}

# Destination FASTA file; it is truncated and rewritten on every run.
output_file = "tool_input.fasta"

# Accessions whose download or parsing failed, so the final summary can
# report them instead of unconditionally claiming success.
failed_accessions = []

with open(output_file, 'w') as outfile:
    for accession, strain_name in accession_to_strain.items():
        try:
            handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
            try:
                # Exactly one FASTA record is expected per accession.
                record = SeqIO.read(handle, "fasta")
            finally:
                # Release the handle even when parsing raises.
                handle.close()

            # replacing the header with the strain name
            record.id = strain_name
            record.description = strain_name

            # writing the modified sequence to the output file
            SeqIO.write(record, outfile, "fasta")
            print(f"Successfully fetched and added {strain_name}")
        except Exception as e:
            # Deliberate best-effort: one bad accession must not abort the
            # remaining downloads, but it is recorded for the summary below.
            failed_accessions.append(accession)
            print(f"Failed to fetch {accession} ({strain_name}): {e}")

if failed_accessions:
    print(
        f"Finished with {len(failed_accessions)} failure(s) "
        f"({', '.join(failed_accessions)}); "
        f"the remaining sequences have been written to: {output_file}"
    )
else:
    print("Success! All sequences have been written to:", output_file)

0 comments on commit cc80cd4

Please sign in to comment.