-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_othogroups_multiple_copy_orthofinder_missing_taxa.py
127 lines (82 loc) · 3.67 KB
/
extract_othogroups_multiple_copy_orthofinder_missing_taxa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python3
import argparse
import re
from collections import defaultdict
import glob
import os
# Extract orthogroups from output of orthofinder (maximum 2 copies per species) and print them in new fasta files so that the headers correspond to species names
# -> Have also the species name and not the sequence id only (fasta files can then be used directly for alignment and concatenation )
# -> Missing copies for some species are allowed (i.e. not all species need to be represented in the orthogroup)
# The script assumes that sequences have unique ids (i.e. no sequence with the same id exists in 2 different proteomes)
def make_out_dir(path):
'''Check if output dir exists, otherwise make output directory: path_to_out_dir'''
if not os.path.isdir(path):
os.mkdir(path)
def parse_fasta(fileneme, path):
dict_fasta =defaultdict(lambda: "")
fh_in_fasta=open(path+fileneme, "r")
for line in fh_in_fasta:
line=line.rstrip("\n")
if line.startswith(">"):
header=line
else:
dict_fasta[header]+=line
return dict_fasta
def write_fasta(dict_for_fasta, path, OG):
fh_out=open(path+OG+".fa", "w")
for k,v in dict_for_fasta.items():
fh_out.write(k+"\n"+v+"\n")
def check_max_no_copies_per_species(dict_seqs, max_copies):
gene_copy_counts=defaultdict(lambda: 0)
for h in dict_seqs.keys():
elements_h=h.split(" ")
gene_copy_counts[elements_h[0]]+=1
counts = gene_copy_counts.values()
print(counts)
for count in counts:
if count > int(max_copies):
return False
return True
def main():
parser=argparse.ArgumentParser("Parse output of orthofinder and extract orthogroups into fasta files in which headers also have the species name (missing taxa in some orthogroups are allowed).")
parser.add_argument("Orthogroups", help="Full path to Orthogroups.tsv")
parser.add_argument("Orthogroups_Sequences", help="Full path to Orthogroups_Sequences directory")
parser.add_argument("Output_dir", help="Full path to directory to place the output fasta files")
parser.add_argument("Max_gene_copies", help="Max no. of gene copies per species in orthogroup")
args=parser.parse_args()
species_names_ordered=[]
OG_to_IDs = defaultdict(lambda:[])
linecounter=0
#Check if output directory exists and make directory
make_out_dir(args.Output_dir)
#select only OGs with max; one copy from each species
with open (args.Orthogroups, "r") as fh_og:
for line in fh_og:
line = line.rstrip("\n")
if linecounter==0:
species_names_ordered = line.split("\t")
else:
elements = line.split("\t")
for i in range(1,len(elements)):
#splitting of appended elements not necessary here as there is one gene copy per OG in single-copy orthologs
OG_to_IDs[elements[0]].append(elements[i])
linecounter+=1
my_input_OG = glob.glob("*.fa")
for OG_ID in OG_to_IDs:
heads_seqs_renamed_headers = defaultdict(lambda: "")
heads_seqs= parse_fasta(OG_ID+".fa", args.Orthogroups_Sequences)
for header in heads_seqs:
capture_index=1
for element in OG_to_IDs[re.sub(".fa","",OG_ID)]:
if header.replace(">","") in element:
#remove characters after "." in header including the dot
heads_seqs_renamed_headers[">"+species_names_ordered[capture_index]+" "+header.replace(">","")] = heads_seqs[header]
capture_index+=1
if check_max_no_copies_per_species(heads_seqs_renamed_headers, args.Max_gene_copies):
print(OG_ID)
if len(heads_seqs_renamed_headers)>=2:
#writes new fasta in output directory path
write_fasta(heads_seqs_renamed_headers, args.Output_dir, OG_ID)
##########################################################################
if __name__ == '__main__':
main()