"""Fetch SARS-CoV-2 nucleotide sequences from NCBI Entrez into one FASTA file.

Every accession listed in ``accession_to_strain`` is downloaded in FASTA
format, its header is replaced by the mapped strain name, and the record is
appended to ``output_file``.  A failed download is reported and skipped so
that one bad accession does not abort the whole run.
"""

from Bio import Entrez, SeqIO
import requests  # NOTE(review): unused in this script; kept so no existing import is removed

# replace with YOUR email id.
Entrez.email = "mkayasth@ramapo.edu"

# mapping accession numbers to strain names
accession_to_strain = {
    "NC_045512": "Wuhan",
    "OR075545": "XBB.1.16",
    "OQ608429": "XBB.1.5",
    "OR598953": "EG.5",
    "PP292788": "AY.3",
    "OR813619": "BA.2.86",
    "OR829491": "CH.1.1",
    "PP250483": "BF.10",
    "PP316714": "JN.1",
    "PP435534": "HV.1",
    "OQ437945": "B.1.1.7",
    "PP421053": "P.1",
    "PP299611": "B.1.617.2",
    "PP292591": "BE.1",
    "OQ938406": "DN.2",
}

output_file = "tool_input.fasta"


def _fetch_record(accession):
    """Return the single FASTA record stored under *accession*.

    The Entrez handle is closed in a ``finally`` clause so the connection is
    released even when parsing the response raises.
    """
    handle = Entrez.efetch(
        db="nucleotide", id=accession, rettype="fasta", retmode="text"
    )
    try:
        return SeqIO.read(handle, "fasta")
    finally:
        handle.close()


# accessions that could not be fetched or written, in encounter order
failed_accessions = []

with open(output_file, 'w') as outfile:
    for accession, strain_name in accession_to_strain.items():
        try:
            record = _fetch_record(accession)

            # replacing the header with the strain name
            record.id = strain_name
            record.description = strain_name

            # writing the modified sequence to the output file
            SeqIO.write(record, outfile, "fasta")
            print(f"Successfully fetched and added {strain_name}")
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going, but
            # remember it so the final summary does not claim full success.
            failed_accessions.append(accession)
            print(f"Failed to fetch {accession} ({strain_name}): {e}")

if failed_accessions:
    print(
        f"Finished with {len(failed_accessions)} failure(s) "
        f"({', '.join(failed_accessions)}); the remaining sequences have been "
        f"written to: {output_file}"
    )
else:
    print("Success! All sequences have been written to:", output_file)