-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfa2maf.py
83 lines (70 loc) · 3.35 KB
/
fa2maf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""
definition of MAF (https://genome.ucsc.edu/FAQ/FAQformat.html#format5):
The first line of a custom MAF track must be a "track" line that contains
a name=value pair specifying the track name.
The first line of a .maf file begins with ##maf. This word is followed by
white-space-separated variable=value pairs. There should be no white space
surrounding the "=".
version - Required. Currently set to one.
scoring - Optional. A name for the scoring scheme used for the alignments.
The "s" lines together with the "a" lines define a multiple alignment. The
first "s" line must be the reference genome. The "s" lines have the following
fields which are defined by position.
src -- The name of one of the source sequences for the alignment. For
sequences that are resident in a browser assembly, the form
'database.chromosome' allows automatic creation of links to other
assemblies. Non-browser sequences are typically reference by the species
name alone.
start -- The start of the aligning region in the source sequence. This is a
zero-based number. If the strand field is "-" then this is the start
relative to the reverse-complemented source sequence (see Coordinate
Transforms).
size -- The size of the aligning region in the source sequence. This number
is equal to the number of non-dash characters in the alignment text
field below.
strand -- Either "+" or "-". If "-", then the alignment is to the reverse-
complemented source.
srcSize -- The size of the entire source sequence, not just the parts involved
in the alignment.
text -- The nucleotides (or amino acids) in the alignment and any insertions
(dashes) as well.
"""
from Bio import SeqIO
import click
@click.command(context_settings=dict(
help_option_names=['-h', '--help'], show_default=True))
@click.argument('fasta', type=click.File('rt'))
@click.argument('maf', type=click.File('wt'))
def fa2maf(fasta, maf):
"""
convert fasta alignment to a mock MAF(Multiple Alignment Format)
\b
fasta: input fasta alignment (can be stdin "-")
maf : output MAF (can be stdout "-")
note: MAF is for whole genome alignment and contains more information
than fasta and it is impossible to convert fasta to a true MAF block.
This script only generate a pseudo MAF that can be used to calculate
PhyloCSF score with `phylocsf++` software. The `src` (chromosome),
`strand` and `srcSize` columns are fill with arbitrary values as place
holder (0, +, 1000000, respectively). Note that gaps present in all
sequences, sequences consisting of gaps only, and size field in the s
lines do not influence phylocsf++ results.
\b
workflow:
```
python fa2maf.py test.aln.fa test.aln.maf
phylocsf++ score-msa 100vertebrates test.aln.maf
cat test.aln.maf.scores
```
"""
records = SeqIO.to_dict(SeqIO.parse(fasta, 'fasta'))
print('track name=ncorf\n##maf version=1\n\na score=100.0', end='\n', file=maf)
idlen_max = max([len(sp + '.aln') for sp in records])
for sp in records:
name = (sp + '.aln').ljust(idlen_max)
seq = str(records[sp].seq)
sline = ' '.join(['s', name, '0', str(len(seq)), '+', '1000000', seq])
print(sline, end='\n', file = maf)
return
if __name__ == '__main__':
fa2maf()