-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathorf_kozak.py
64 lines (54 loc) · 2.06 KB
/
orf_kozak.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import re
import gzip
import click
from Bio import SeqIO
import pandas as pd
def smart_open(path):
"""open plain text or gzipped file"""
if path[-2:] == 'gz':
return gzip.open(path, 'rt')
else:
return click.open_file(path, 'rt')
def strip_version(tx_name):
"""Strip transcript version"""
return re.sub('\.\d+$', '', tx_name)
def read_fasta(path, ignore_version=False):
"""Construct a dict of input fasta"""
fa = dict()
with smart_open(path) as f:
for record in SeqIO.parse(f, 'fasta'):
if ignore_version:
record.id = strip_version(record.id)
fa[record.id] = str(record.seq)
return fa
@click.command(context_settings=dict(help_option_names=['-h', '--help'], show_default=True))
@click.argument('orf_table', type=click.STRING)
@click.option('-f', '--fas_path', type=click.STRING, default=['-'], multiple=True,
help=('transcript sequences in fasta. can be specified multiple times if ' +
'sequences are stored in separate files (e.g., cdna.fa and ncrna.fa)'))
@click.option('-i', '--ignore_txversion', is_flag=True, default=False,
help='ignore transcript version in ".\d+" format')
def get_kozak(orf_table, fas_path, ignore_txversion=True):
"""
Extract Kozak sequence for each ORF
\b
ORF_TABLE: table of ORFs (must have columns orf_id, tx_name, tstart)
output: stdout
"""
fas = dict()
for i in fas_path:
fas = fas | read_fasta(i, ignore_version=ignore_txversion)
orfs = pd.read_table(orf_table)
print('\t'.join(['orf_id', 'tx_name', 'kozak_seq', 'kozak_score']))
for row in orfs.itertuples():
try:
kozak = ('-' * max(0, 7 - row.tstart) +
fas[row.tx_name][max(0, row.tstart - 7):(row.tstart + 3)])
score = (kozak[3] in 'AG') + (kozak[9] == 'G')
except KeyError:
kozak = '##########'
score = 'NA'
print('\t'.join([row.orf_id, row.tx_name, kozak, str(score)]))
return
if __name__ == '__main__':
get_kozak()