-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsamples2genes.py
62 lines (53 loc) · 2.12 KB
/
samples2genes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/python3
# Script for splitting fasta files that each contain all contigs from a given sample into
# fasta files that each contain all samples from a given gene. This requires that all
# sequences in the original files are named "sample_gene".
#
# Requires a text file "filelist.txt" with the names of the input fasta files. I generated
# this using ls *trimmed.fasta > filelist.txt.
#
# Prints some gene-level stats (gene name, median lenght, number of sequences, number of
# sequences kept, threshold applied) to stdout.
#
# Recommended usage: samples2genes.py > outstats.csv
#
# Wolf Eiserhardt 17.4.2020
from Bio import SeqIO
from statistics import median
import sys
# Access the command-line arguments
file_list = sys.argv[1]
output_folder = sys.argv[2]
files2parse = []
with open(file_list) as files:
for line in files:
files2parse.append(line.strip())
# gather sequences from those files into gene dictionary_
# key: gene name
# item: list of seq records
genes = {}
for file2parse in files2parse: # this loops over each "species" in the files2parse vector
for record in SeqIO.parse(file2parse, "fasta"): # This loops over each
sample_gene = record.id.split("_")
record.id = sample_gene[0]
if sample_gene[1] in genes.keys():
genes[sample_gene[1]].append(record)
else:
genes[sample_gene[1]] = [record]
# print header
print("gene;median_length;sequences;sequences_kept;threshold")
# length-filter sequence sets and write to fasta
prc = .2 #minimum percentage of median length
len_thres = 150 #minimum absolute sequence length
for gene, seq_set in genes.items():
# get median length
lens = [len(s.seq) for s in seq_set]
# get final threshold (combination of percentage and min length)
thres = max(prc*median(lens), len_thres)
# select sequences over threshold
seq_set_keep = [seq for length, seq in zip(lens, seq_set) if length>thres]
# write out if more than 3 sequences
if len(seq_set_keep) > 3:
with open(output_folder+gene+'.FNA', "w") as outfile:
SeqIO.write(seq_set_keep, outfile, "fasta")
print(gene+';'+str(round(median(lens)))+';'+str(len(seq_set))+';'+str(len(seq_set_keep))+';'+str(round(thres)))