-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalculate_hamming_distances.pl
157 lines (112 loc) · 4.74 KB
/
calculate_hamming_distances.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long qw(GetOptions);
########################################################################################################################################
#Calculate pairwise uncorrected hamming distances (proportion of differing sites) between all sequences in an alignment (fasta format).
#Input alignment must be in non-interleaved format (aa or nt)
#Mismatches due to gaps/Xs are ignored. Compares only overlapping sections of pairs without ambiguous characters (X,N,-).
#$0 -h -> prints help message
#Author:Alexandros Vasilikopoulos 2017
########################################################################################################################################
my $help;
my $in_file = 'alignment.fas';
my $distances = 'Hamming_distances.txt';
my $data_type = 'aa';
GetOptions ('h' => \$help,
'i=s' => \$in_file,
'o=s' => \$distances,
'd=s' => \$data_type );
if ($help) {
print<<HELP;
###################################################################################################
$0 is used to calculate pairwise uncorrected distances in an aminoacid or
nucleotidealignment. The output is a txt file with all distances based on
pairwise comparisons
Usage: $0 [-Options...]
###################################################################################################
Input alignment file (non-interleaved)
-i Default: alignment.fas
###################################################################################################
-d Data type -> amino acid or nucleotide
Default: alignment.fas
Options: aa | nuc
-o Output file with pairwise distances
Default: Hamming_distances.txt
###################################################################################################
HELP
exit;}
my $ambiguous;
if ($data_type eq "aa") {$ambiguous="X"}
elsif ($data_type eq "nuc"){$ambiguous="N";}
else {die"Unknown data type -> use option -d with either aa or nuc\n";}
my $length;
open my $fh_out, '>', $distances or die
"Could not open file\"$distances\":$!\n";
#read alignment in a hash
my $ref_hash=&read_FASTA_in_hash($in_file);
my %hash_new=%{$ref_hash};
foreach (keys %hash_new){
print "\n### Calculate hamming distances between $_ and all its pairwise comparisons ###\n\n";
print {$fh_out} "\n$_\tPaiwise distances\n\n";
sleep 1;
#Pick a pair of sequences to compare
foreach my $comparison (keys %hash_new){
$length=scalar @{$hash_new{$_}};
#skip when comparing sequence with itself
next if $_ eq $comparison;
print "\nCalculate $_-$comparison pairwise Hamming distance..\n";
my $value=0;
my $gap_only_positions=0;
my $uninformative=0;
for (my $i=0;$i<$length; ++$i){
#get values for coordinates between compared sequences
my $first=$hash_new{$_}->[$i];
my $second=$hash_new{$comparison}->[$i];
#
if ((($first eq "-") and ($second eq "-")) or
(($first eq "$ambiguous") and ($second eq "$ambiguous")) or
(($first eq "$ambiguous") and ($second eq "-")) or
(($first eq "-") and ($second eq "$ambiguous")))
{++$gap_only_positions;}
#Increase Hamming distance by 1 in case of a mismatch (ignore gaps and Xs)
elsif (($first eq "-") or
($second eq "-") or
($first eq "$ambiguous") or
($second eq "$ambiguous"))
{++$uninformative;}
elsif ($first ne $second) {++$value;}
}
#calculate proportion of differences between compared sequences
if ($value){
$length=$length-($gap_only_positions+$uninformative);
my $dist=$value/$length;
print {$fh_out} "$_-$comparison\t$dist\n";
}
elsif (($gap_only_positions+$uninformative)==$length) {print "\nWARNING: No overlapping sequence sections for Pair $_-$comparison\n";}
}
print "\n\n###### Done working with $_ ######\n\n";
sleep 1
}
###################################################################################################################################################
sub read_FASTA_in_hash {
#read fasta in a hash of arrays
my %hash;
my $header;
open my $fh, '<', $_[0] or die
"Could not open file\"$in_file\":$!\n";
while (my $line=<$fh>){
chomp $line;
if ($line=~m/^>(.+)/){
$header=$1;
}
else{
my @array=split("",$line);
#store sequence as an array in the hash
$hash{$header}= [@array];
$length=scalar @{$hash{$header}};
}
}
close $fh;
return \%hash;
}