-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfancyparser.py
193 lines (165 loc) · 7.99 KB
/
fancyparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
'''
An XML parser in Python for reading an LRG file in .xml format and exporting exons coordinates
Authors: Andrew Smith and Jethro Rainford
Development start date: 27th November 2018
Usage: refer to the README file
'''
# Importing required modules
import xml.etree.ElementTree as ET
import time
import requests
# Function to use user input to pull LRG XML from LRG website and create local XML file for parsing
def lrg_input(input_lrg):
# Assert to ensure only positive integers are entered by the user
assert input_lrg.isdigit(), "Please provide a singular positive integer"
url = 'http://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_%s.xml' % input_lrg
r = requests.get(url, allow_redirects=True)
open('files_xml/LRG_%s.xml' % input_lrg, 'wb').write(r.content)
fileName = open('files_xml/LRG_%s.xml' % input_lrg, 'r')
return fileName
# Alternative function to pull LRG XML from LRG website and create local XML if currently "Under Curation"
def pending_lrg_input(input_lrg):
# Assert to ensure only positive integers are entered by the user
assert input_lrg.isdigit(), "Please provide a singular positive integer"
url = 'http://ftp.ebi.ac.uk/pub/databases/lrgex/pending/LRG_%s.xml' % input_lrg
r = requests.get(url, allow_redirects=True)
open('files_xml/LRG_%s.xml' % input_lrg, 'wb').write(r.content)
fileName = open('files_xml/LRG_%s.xml' % input_lrg, 'r')
return fileName
# Generates parsable tree of input XML file.
# Pulls whether the LRG is either currently "Published" or "Under Creation"
def tree_generation(fileName):
try:
tree = ET.parse(fileName)
root = tree.getroot()
curation = 'Curation Status: LRG Published\n'
return tree, root, curation
except SyntaxError:
try:
fileName = pending_lrg_input(input_lrg)
tree = ET.parse(fileName)
root = tree.getroot()
curation = 'Curation Status: Gene Under Curation or Pending Approval\n'
return tree, root, curation
except:
print("Invalid LRG, please check inputted LRG number")
exit()
# Function to return exon numbers of lrg file in a list
def exon_num(root):
exon_num_list = []
for child in root[0]:
if child.tag == "transcript" and child.attrib['name'] == "t1": # Only takes the first transcirpt
for i in child:
if i.tag == "exon":
if (str(i.attrib))[-4] == "'":
exon_num_list.append((str(i.attrib))[-3:-2]) # Returns exons 1 - 9
elif (str(i.attrib))[-5] == "'":
exon_num_list.append((str(i.attrib))[-4:-2]) # Returns exons 10 - 99
elif (str(i.attrib))[-6] == "'":
exon_num_list.append((str(i.attrib))[-5:-2]) # Returns exons 100 - 999
return (exon_num_list)
# Acquires the name of the gene for use in .bed file naming
def gene_name(tree):
for gene_name in tree.findall('.//lrg_locus'):
gene = 'LRG' + '_' + str(input_lrg) + "_" + gene_name.text
exon_num_var = exon_num(root)
return gene, exon_num_var
# Function to return two lists for start and end coordinates of exons respectively
def exon_coord(root):
exon_start_list = []
exon_end_list = []
for child in root[0]:
if child.tag == "transcript" and child.attrib['name'] == "t1": # Only takes the first transcirpt
for exon in child:
if exon.tag == "exon":
exon_start_list.append(exon[0].attrib["start"])
for exon in child:
if exon.tag == "exon":
exon_end_list.append(exon[0].attrib["end"])
return exon_start_list, exon_end_list
# Converts start_list_str and end_list_str to integer values
def list_conversion_str2int(list_a, list_b):
output_list_a = []
output_list_b = []
for i in list_a[0]:
output_list_a.append(int(i))
for i in list_b[0]:
output_list_b.append(int(i))
return output_list_a, output_list_b
# Function to calculate exon lengths
def exon_len_func(lrg_start, lrg_end):
exon_len = []
exon_len_count = 0
for i in lrg_start:
exon_len.append(lrg_end[exon_len_count] - lrg_start[exon_len_count])
exon_len_count += 1
return exon_len
# Pulls various values from the inputted XML file
def tree_values(tree):
for i in tree.findall('.//mapping'):
if i.attrib["coord_system"] == "GRCh37.p13":
chromosome = i.attrib["other_name"] # Pulls chromosome number from XML
gene_chr_start = int(i[0].attrib["other_start"]) # For loop to obtain start coords of gene on GRCh37.p13
gene_chr_end = int(i[0].attrib["other_end"]) # For loop to obtain end coords of gene on GRCh37.p13
strand = int(i[0].attrib["strand"]) # Fdentify between forward strand(1) and reverse strand(-1)
return chromosome, gene_chr_start, gene_chr_end, strand
# Mapping LRG coords to chromosomal coordinates (FORWARD STRAND)
def strand_pos_neg(strand, lrg_start_list, lrg_end_list, gene_chr_start, gene_chr_end):
if strand == 1:
chr_exon_start = []
for coord in lrg_start_list:
chr_exon_start.append(coord + gene_chr_start - 1)
chr_exon_end = []
for coord in lrg_end_list:
chr_exon_end.append(coord + gene_chr_start - 1)
# Mapping of LRG coordinates to chromosomal locations (REVERSE STRAND)
else:
chr_exon_start = []
for coord in lrg_start_list:
chr_exon_start.append(gene_chr_end - coord + 1)
chr_exon_end = []
for coord in lrg_end_list:
chr_exon_end.append(gene_chr_end - coord + 1)
return chr_exon_start, chr_exon_end
# Pulls chromosome number from input LRG_xml
def chrom_num(chr_exon_start, chromosome):
chr_list = []
count = 0
while count < len(chr_exon_start):
chr_list.append("chr" + chromosome)
count += 1
return chr_list
# Creation of output BED file named by LRG # followed by gene name
def output_bed(strand, chr_list, chr_exon_start, chr_exon_end, exon_num_var, exon_len):
date = time.strftime("File created: %d/%m/%Y %H:%M:%S\n") # Creates a date/time stamp for creaton of BED file
header = "\tStart\t\tEnd\t\tExon\tLength\n" # Headers for output text file
if strand == 1:
strand = "Strand: Forward (+)\n\n"
else:
strand = "Strand: Reverse (-)\n\n"
# Writes generated values to the output .bed file
with open('files_bed/%s.bed' % gene, 'w+') as file_temp:
file_temp.write(date)
file_temp.write(curation)
file_temp.write(strand)
file_temp.write(header)
for (chr_list, chr_exon_start, chr_exon_end, exon_num_var, exon_len) in zip(chr_list, chr_exon_start,
chr_exon_end, exon_num_var,
exon_len):
file_temp.write(
"{0}\t{1}\t{2}\t{3}\t{4}\n".format(chr_list, chr_exon_start, chr_exon_end, exon_num_var, exon_len))
print('File Created: %s.bed' % gene)
# If the program is run as the __main__ program an output .bed is generated
# Allows for use of the above functions in testing processes or external programs
if __name__ == "__main__":
input_lrg = input("Please enter LRG number: ")
fileName = lrg_input(input_lrg)
tree, root, curation = tree_generation(fileName)
gene, exon_num_var = gene_name(tree)
start_list_str, end_list_str = map(list, zip(exon_coord(root)))
lrg_start_list, lrg_end_list = list_conversion_str2int(start_list_str, end_list_str)
exon_len = exon_len_func(lrg_start_list, lrg_end_list)
chromosome, gene_chr_start, gene_chr_end, strand = (tree_values(tree))
chr_exon_start, chr_exon_end = strand_pos_neg(strand, lrg_start_list, lrg_end_list, gene_chr_start, gene_chr_end)
chr_list = chrom_num(chr_exon_start, chromosome)
output_bed(strand, chr_list, chr_exon_start, chr_exon_end, exon_num_var, exon_len)