forked from kapsakcj/INNUENDO_REST_API
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextract_core_from_wg.py
executable file
·115 lines (92 loc) · 3.74 KB
/
extract_core_from_wg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
import csv
import argparse
'''
This program removes columns (loci) from a tab-delimited allele profile file
according to a list of locus identifiers. It can also leave the columns
untouched and only replace chewBBACA missing-data tags by 0.
'''
# Header names of the sample-identifier column. In --inverse mode these are
# always kept, and their cells are never subjected to tag replacement.
_SAMPLE_COLUMN_NAMES = ('File', 'FILE', 'file')

# chewBBACA tags and the text that replaces them. Order matters because the
# replacement is substring based: a tag must come before any tag that is a
# prefix of it (NIPHEM before NIPH), otherwise 'NIPHEM' would become '0EM'.
_ALLELE_TAG_REPLACEMENTS = (
    ('LNF', '0'),
    ('INF-', ''),
    ('NIPHEM', '0'),
    ('NIPH', '0'),
    ('LOTSC', '0'),
    ('PLOT3', '0'),
    ('PLOT5', '0'),
    ('ALM', '0'),
    ('ASM', '0'),
)


def _read_locus_list(path, inverse):
    """Return the set of locus identifiers found in the first column of *path*.

    When *inverse* is true the sample-column names are added to the set, so
    that the identifier column counts as a column to keep.
    """
    loci = set(_SAMPLE_COLUMN_NAMES) if inverse else set()
    with open(path) as handle:
        for raw_line in handle:
            # Only the first tab-separated field of each line is used.
            loci.add(raw_line.rstrip('\r\n').split('\t')[0])
    return loci


def _columns_to_drop(header, loci, inverse):
    """Return the set of column positions in *header* that must be removed.

    Positions come from ``enumerate`` rather than ``list.index`` so that a
    header name occurring more than once yields every one of its positions.
    """
    if inverse:
        return {pos for pos, name in enumerate(header) if name not in loci}
    return {pos for pos, name in enumerate(header) if name in loci}


def _replace_tags(cell):
    """Return *cell* with every chewBBACA tag substituted by its replacement."""
    for tag, replacement in _ALLELE_TAG_REPLACEMENTS:
        cell = cell.replace(tag, replacement)
    return cell


def main(argv=None):
    """
    Remove loci from a tab delimited allele profile file according to a list
    of locus identifiers, and replace chewBBACA tags in the allele cells
    (most by 0; the 'INF-' text is simply dropped).

    With ``--inverse`` the list names the loci to keep instead of the ones to
    remove. With ``--onlyreplace`` no column is removed and only the tag
    replacement is applied.

    Requires Python 3 (text-mode csv handling).

    Parameters
    ----------
    argv: list of str, optional
        Command line arguments. When None, argparse reads ``sys.argv[1:]``.

    Returns
    -------
    None. Writes two files: ``<output>.tsv`` with the resulting profiles in a
    tab delimited format and ``<output>_headers.txt`` with one remaining
    header identifier per line. If the input file is empty both files are
    left empty.
    """
    parser = argparse.ArgumentParser(
        description="This program removes gens from a tab separated allele "
                    "profile file")
    # No nargs='?' here: these options are required and must carry a value,
    # otherwise a bare '-i' would reach open() as None.
    parser.add_argument('-i', type=str,
                        help='main matrix file from which to remove',
                        required=True)
    parser.add_argument('-g', type=str,
                        help='list of genes to remove', required=True)
    parser.add_argument('-o', type=str,
                        help='output file name', required=True)
    parser.add_argument("--inverse",
                        help="list to remove is actually the one to keep",
                        dest='inverse', action="store_true", default=False)
    parser.add_argument("--onlyreplace", help="Only replaces letters by 0",
                        dest='onlyreplace', action="store_true", default=False)
    args = parser.parse_args(argv)

    loci = _read_locus_list(args.g, args.inverse)

    # newline='' is what the csv module expects for its input file; on the
    # output side it keeps the written "\n" terminators untranslated.
    with open(args.i, newline='') as tsv_in, \
            open(args.o + ".tsv", "w", newline='') as profiles_out, \
            open(args.o + "_headers.txt", "w", newline='') as headers_out:
        reader = csv.reader(tsv_in, delimiter='\t')

        header = next(reader, None)
        if header is None:
            # Empty input: nothing to write.
            return

        if args.onlyreplace:
            drop = set()
        else:
            drop = _columns_to_drop(header, loci, args.inverse)

        header = [name for pos, name in enumerate(header) if pos not in drop]
        profiles_out.write('\t'.join(header) + "\n")
        headers_out.write('\n'.join(header))

        # Positions (after column removal) holding sample identifiers. Their
        # text is left as is: a name such as 'SALMONELLA' contains the tag
        # 'ALM' and would otherwise be corrupted.
        protected = {pos for pos, name in enumerate(header)
                     if name in _SAMPLE_COLUMN_NAMES}

        for row in reader:
            # Filtering by position also tolerates rows shorter than the
            # header, which a positional `del` would reject.
            kept = [cell for pos, cell in enumerate(row) if pos not in drop]
            cells = [cell if pos in protected else _replace_tags(cell)
                     for pos, cell in enumerate(kept)]
            profiles_out.write('\t'.join(cells) + "\n")
if __name__ == "__main__":
    # Run the command line tool only when this file is executed as a script,
    # not when it is imported as a module.
    main()