-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstrim.py
179 lines (145 loc) · 7.65 KB
/
strim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#import libraries
import numpy as np
import pandas as pd
import os, glob, fnmatch, collections, argparse
import matplotlib.pyplot as plt
plt.style.use("ggplot")
#let the user choose between KEGG orthologs or modules and the taxonomic level to use in the stratification step
anno_selection = str(input("Choose between KEGG orthologs (ko) or KEGG modules (mo) : "))
print("\n")
taxa_selection = str(input("Choose the taxonomic level among Domain, Phylum, Class, Order, Family or Species : "))
print("\n")
#Let the user set the directories
parser = argparse.ArgumentParser()
parser.add_argument('inputdir', help = 'input directory')
parser.add_argument("outputdir", help = "output directory")
args = parser.parse_args()
#import MAGs abundances
str1 = 'MAGs_coverage.txt'
path1 = args.inputdir + str1
ab = pd.read_csv(path1, sep='\t', engine='python')
df1 = ab['Bin Id']
df2 = ab.iloc[:,ab.columns.str.contains('populations')]
abundances = pd.concat([df1,df2], axis = 1)
mean_ab = abundances.sort_values(by='Bin Id')
mean_ab.to_csv(os.path.join(args.outputdir, r'MAGs_abundances.csv'), sep='\t')
#import taxonomic assignment
str2 = 'taxonomy.txt'
path2 = args.inputdir + str2
taxonomy = pd.read_csv(path2, sep='\t', engine='python')
new = taxonomy["NCBI classification"].str.split(";", n = 7, expand = True)
taxonomy["Domain"]= new[0]
taxonomy["Phylum"]= new[1]
taxonomy["Class"]= new[2]
taxonomy["Order"]= new[3]
taxonomy["Family"]= new[4]
taxonomy["Genus"]= new[5]
taxonomy["Species"]= new[6]
taxonomy.drop(columns =["NCBI classification"], inplace = True)
taxonomy['Class'].loc[(taxonomy['Class'] == 'c__')] = 'Unclassified'
taxonomy['Order'].loc[(taxonomy['Order'] == 'o__')] = 'Unclassified'
taxonomy['Family'].loc[(taxonomy['Family'] == 'f__')] = 'Unclassified'
taxonomy['Genus'].loc[(taxonomy['Genus'] == 'g__')] = 'Unclassified'
taxonomy['Species'].loc[(taxonomy['Species'] == 's__')] = 'Unclassified'
taxonomy = taxonomy.sort_values(by='user_genome')
taxonomy = taxonomy.filter(['user_genome','Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species'], axis=1)
taxonomy.to_csv(os.path.join(args.outputdir, r'taxonomy_assignment.csv'), sep='\t')
# import eggNOG annotation files and calculate the total weighted occurrences for all the MAGs annotation files
os.chdir(args.inputdir)
cwd = os.getcwd()
csv_files = glob.glob(os.path.join(cwd, "*.annotations")) #group together the MAG annotation files
list_ann = list()
column_list = mean_ab.columns.tolist()
column_list.remove('Bin Id')
for element in column_list:
KEGG_list = list()
weight_list = list()
weight_list = mean_ab[element].tolist()
species_list = list()
species_list = mean_ab['Bin Id'].tolist()
db_list = pd.DataFrame()
for f in csv_files:
list_ann.append(f)
df = pd.read_csv(f, sep='\t', header=3)
df.drop(df.tail(3).index, inplace = True)
df = df.loc[:, ['KEGG_ko', 'KEGG_Module']]
df = df.astype(str)
if anno_selection == 'ko':
df = df['KEGG_ko'].str.split(',', expand=True).stack().reset_index(drop=True, level=1)
elif anno_selection == 'mo':
df = df['KEGG_Module'].str.split(',', expand=True).stack().reset_index(drop=True, level=1)
else:
print("\nERROR: Please type 'ko' if you want to select the kegg orthologs or 'mo' if you want the kegg module")
break
KEGG_list = df.tolist() #list of all the kegg codes
counts = list(collections.Counter(KEGG_list).items()) #create a list of tuples: kegg_code , occurrences
d1 = dict()
for KEGG,n in counts:
d1.setdefault(KEGG, []).append(n) #create a dictionary from the list of tuples 'counts'
kegg = pd.DataFrame.from_dict(d1, orient='index', columns=['Occurrences']) #create the dataframe from the dictionary
kegg = kegg.drop('nan') #remove the kegg module not detected, identified as NaN
index_names = kegg[ kegg['Occurrences'] == 1 ].index #remove all the kegg module with only 1 occurrence
kegg.drop(index_names, inplace = True)
kegg.reset_index(level=0, inplace=True)
index = list_ann.index(f)
kegg['Weight'] = weight_list[index] #add a column with the weight for that species
kegg['Occurrences_weighted'] = kegg['Occurrences'].astype(int)*kegg['Weight'].astype(float) #weight the occurrences
kegg = kegg.sort_values(by = 'Occurrences_weighted', ascending=False) #sort in descending order
kegg = kegg.filter(['index', 'Occurrences_weighted']) #keep only the columns we need
db_list = db_list.append(kegg, sort=False) #create a unique databases
db = db_list.groupby('index')['Occurrences_weighted'].sum().sort_values(ascending=False) #sum and sort occurrences
directory = os.path.join(args.outputdir, r'{}.pdf'.format(element))
plt.figure(figsize=(10, 8))
db.head(10).plot.bar(color="darkseagreen")
plt.title("KEGG codes ordered by total occurrences {} (Top10)".format(element))
plt.xlabel("KEGG codes")
plt.ylabel("Number of occurrences")
plt.savefig(directory, dpi=300, bbox_inches='tight')
print("Weighted abundances of {} finished".format(element))
#stratification
d = {}
d_taxa = {}
print("\nStratification is running ...")
for f in csv_files:
index = list_ann.index(f)
s = species_list[index]
df = pd.read_csv(f, sep='\t', header=3)
df.drop(df.tail(3).index, inplace = True)
df = df.loc[:, ['KEGG_ko', 'KEGG_Module']]
df = df.astype(str)
if anno_selection == 'ko':
df = df['KEGG_ko'].str.split(',', expand=True).stack().reset_index(drop=True, level=1)
elif anno_selection == 'mo':
df = df['KEGG_Module'].str.split(',', expand=True).stack().reset_index(drop=True, level=1)
else:
print("\nERROR: Please type 'ko' if you want to select the kegg orthologs or 'mo' if you want the kegg module")
break
KEGG_list = df.tolist()
for code in KEGG_list:
if (code not in d.keys() and code!="nan"):
l= taxonomy[taxonomy['user_genome'] == s.replace('.emapper.annotations', '')][taxa_selection].values
l = str(l).replace("['","").replace("']","")
d[code] = [l]
elif (code in d.keys() and code!="nan"):
l = taxonomy[taxonomy['user_genome'] == s.replace('.emapper.annotations', '')][taxa_selection].values
l = str(l).replace("['","").replace("']","")
n = d[code].append(l)
else:
continue
taxa_list = taxonomy[taxa_selection].tolist()
x = collections.Counter(taxa_list).items()
for taxa,n in x:
d_taxa.setdefault(taxa, []).append(n)
taxa = pd.DataFrame.from_dict(d_taxa, orient='index', columns=['Occurrences'])
taxa['Percentages'] = (taxa['Occurrences']/taxa['Occurrences'].sum())*100
taxa.reset_index(level=0, inplace=True)
taxa = taxa.sort_values(by='index')
for i in d.keys():
list_names = d[i]
frequency = collections.Counter(list_names).items()
data = pd.DataFrame(frequency, columns =[taxa_selection, 'Frequency'])
data = data.sort_values(by=taxa_selection)
for j in range(len(data)):
data['Real Frequency'] = data['Frequency'].astype(float)*taxa['Percentages'].astype(float)
data.to_csv(os.path.join(args.outputdir, r'{}.csv'.format(i)), sep='\t', index=False, float_format='%.2f')
print("\nSTRIM run succefully ! Check the directory to inspect the results")