-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcount_clip_sites.py
130 lines (96 loc) · 3.52 KB
/
count_clip_sites.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
'''
cgat_script_template.py - template for CGAT scripts
====================================================
:Author:
:Release: $Id$
:Date: |today|
:Tags: Python
Purpose
-------
Count the number of clip sites in each gene, transcript or exon
Usage
-----
python count_clip_sites.py BAMFILE [OPTIONS]
.. Example use case
Example::
python cgat_script_template.py
Type::
python cgat_script_template.py --help
for command line help.
Command line options
--------------------
'''
import sys
import CGAT.Experiment as E
import CGAT.GTF as GTF
import CGAT.Intervals as Intervals
import iCLIP
import pysam
def main(argv=None):
"""script main.
parses command line options in sys.argv, unless *argv* is given.
"""
if argv is None:
argv = sys.argv
# setup command line parser
parser = E.OptionParser(version="%prog version: $Id$",
usage=globals()["__doc__"])
parser.add_option("-f", "--feature", dest="feature", type="choice",
choices=["gene", "transcript", "exon"],
default="transcript",
help="supply help")
# add common options (-h/--help, ...) and parse command line
(options, args) = E.Start(parser, argv=argv)
iterator = GTF.iterator(options.stdin)
if options.feature == "gene":
iterator = GTF.flat_gene_iterator(iterator)
elif options.feature == "transcript":
iterator = GTF.transcript_iterator(iterator)
elif options.feature == "exon":
def _exon_iterator(gff_iterator):
for exon in gff_iterator:
yield [exon]
iterator = _exon_iterator(iterator)
bamfile = pysam.AlignmentFile(args[0])
outlines = []
for feature in iterator:
exons = GTF.asRanges(feature, "exon")
exon_counts = iCLIP.count_intervals(bamfile,
exons,
feature[0].contig,
feature[0].strand,
dtype="uint32")
exon_counts = exon_counts.sum()
introns = Intervals.complement(exons)
intron_counts = iCLIP.count_intervals(bamfile,
introns,
feature[0].contig,
feature[0].strand,
dtype="uint32")
intron_counts = intron_counts.sum()
if options.feature == "exon":
exon_id = feature[0].exon_id
gene_id = feature[0].gene_id
transcript_id = feature[0].transcript_id
intron_counts = "NA"
else:
exon_id = "NA"
gene_id = feature[0].gene_id
transcript_id = feature[0].transcript_id
outlines.append([gene_id,
transcript_id,
exon_id,
str(exon_counts),
str(intron_counts)])
options.stdout.write("\t".join(["gene_id",
"transcript_id",
"exon_id",
"exon_count",
"intron_count"])+"\n")
outlines = ["\t".join(outline) for outline in outlines]
outlines = "\n".join(outlines)
options.stdout.write(outlines + "\n")
# write footer and output benchmark information.
E.Stop()
if __name__ == "__main__":
sys.exit(main(sys.argv))