-
Notifications
You must be signed in to change notification settings - Fork 34
/
Copy pathConvertAln2Mat.pl
80 lines (73 loc) · 2.11 KB
/
ConvertAln2Mat.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/perl -w
# A perlscript written by Joseph Hughes, University of Glasgow
# use this perl script to convert a multiple sequence alignment into a tex-tab delimited matrix with positions as column names
use strict;
use Getopt::Long;
use Bio::SeqIO;
my ($in,$out,$help,$gene,$ssi);
&GetOptions(
'in:s' => \$in,#fastafile
'out:s' => \$out,#output text-tabe matrix
'gene:s' => \$gene,#gene label
'site:s' => \$ssi,#gene label
"help" => \$help, # provides help with usage
);
if (($help)&&!($help)||!($in)||!($out)){
print "Usage : ConvertAln2Mat.pl <list of arguments>\n";
print " -in <txt> - the input fasta file\n";
print " -out <txt> - the name of your output text-tab delimited matrix\n";
print " -gene <txt> - gene label\n";
print " -site <txt> - [optional] list of sites of interest e.g., 13,15,234 to limit the output\n";
print " -help - Get this help\n";
exit();
}
my $inseq = Bio::SeqIO->new(-file => "$in" ,
-format => 'fasta');
my %ssi_list; # special sites of interest
if ($ssi){
my @sites=split(/,/,$ssi);
foreach my $pos(@sites){
$ssi_list{$pos}++;
print "$pos \n";
}
}
open(OUT,">$out")||die "Can't open $out\n";
my $total_sites=0;
my %seqs;
while (my $seq = $inseq->next_seq()){
my $id=$seq->id;
my $seq_str=$seq->seq();
$seqs{$id}=$seq_str;
$total_sites=length($seq_str);
}
print OUT "sequence_name";
for (my $i=0; $i<$total_sites; $i++){
if ($ssi && $ssi_list{($i+1)}){
if ($gene){
print OUT "\t".$gene."_".($i+1);
}else{
print OUT "\t_".($i+1);
}
}elsif(!$ssi){
if ($gene){
print OUT "\t".$gene."_".($i+1);
}else{
print OUT "\t_".($i+1);
}
}
}
print OUT "\n";
for my $id (keys %seqs){
my @bases=split(//,$seqs{$id}); #the input can also be a protein alignment so these could equally be amino acids
if (!$ssi){
print OUT $id."\t".join("\t",@bases)."\n";
}elsif ($ssi){
print OUT $id;
for (my $i=0; $i<scalar(@bases);$i++){
if ($ssi_list{($i+1)}){
print OUT "\t$bases[$i]";
}
}
print OUT "\n";
}
}