-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProfile.pm
73 lines (59 loc) · 1.5 KB
/
Profile.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
package Profile;
use strict;
use warnings;
require Text::Ngrams;
# Constructor for Profile objects.
#
# Positional arguments (after the class name):
#   name - stored under the _name key
#   text - stored under the _text key
#
# The _ngramRanks key is initialised to an empty hash reference.
# Returns the newly blessed object.
sub new {
    my $class = shift;
    my ($name, $text) = @_;

    return bless {
        _name       => $name,
        _text       => $text,
        _ngramRanks => {},
    }, $class;
}
# Builds the n-gram rank table for $text and stores it on the object.
#
# Parameters:
#   $self           - object whose _ngramRanks entry is replaced
#   $text           - text passed to Text::Ngrams' process_text
#   $minNGramLength - smallest n-gram size requested (inclusive)
#   $maxNGramLength - largest n-gram size requested (inclusive)
#
# Returns the hash reference stored in $self->{_ngramRanks}; it maps each
# kept n-gram to its rank, where rank 1 has the highest frequency.  At most
# 300 n-grams are kept.
#
# Nothing is written to STDOUT.  N-grams with equal frequency are ordered
# by string comparison so that the ranks are reproducible between runs
# instead of following Perl's randomised hash order.
sub countNGramsRanks {
    my ($self, $text, $minNGramLength, $maxNGramLength) = @_;

    # Size limit of the rank table.
    my $max_ranks = 300;

    my $ng = Text::Ngrams->new( type => "utf8" );
    $ng->process_text($text);

    # Merge the frequencies of every requested n-gram size into one hash.
    # get_ngrams is assumed to return a flat (ngram => frequency) list,
    # since its result is assigned to a hash -- TODO confirm against the
    # Text::Ngrams documentation.
    my %freq_of;
    for my $n ($minNGramLength .. $maxNGramLength) {
        my %target_ngrams = $ng->get_ngrams( n => $n, normalize => 1 );
        @freq_of{ keys %target_ngrams } = values %target_ngrams;
    }

    # Drop the entry whose key is a single space, if there is one.
    delete $freq_of{' '};

    # Highest frequency first; ties broken by the n-gram itself.
    my @ordered = sort {
        $freq_of{$b} <=> $freq_of{$a}
            or $a cmp $b
    } keys %freq_of;

    my %ngramRanks;
    my $rank = 0;
    for my $gram (@ordered) {
        $ngramRanks{$gram} = ++$rank;
        last if $rank >= $max_ranks;
    }

    $self->{_ngramRanks} = \%ngramRanks;
    return $self->{_ngramRanks};
}
# Accessor: returns the value stored under the object's _ngramRanks key.
sub getNgramRanks {
    my $self  = shift;
    my $ranks = $self->{_ngramRanks};
    return $ranks;
}
# Accessor: returns the value stored under the object's _name key.
sub getName {
    my $self = shift;
    my $name = $self->{_name};
    return $name;
}
# Sums rank differences between this object's rank table and another's.
#
# Parameters:
#   $self  - object providing the n-grams that are iterated
#   $other - object whose ranks are compared against
#
# Both tables are read through getNgramRanks().  For every n-gram in
# $self's table:
#   - if $other's table has it, |own rank - other rank| is added;
#   - otherwise |own rank - 300| is added.
# N-grams present only in $other's table add nothing, so the result can
# differ when the two objects are swapped.
#
# Returns the accumulated total (0 when $self's table is empty).
sub distance {
    my ($self, $other) = @_;

    my %mine   = %{ $self->getNgramRanks() };
    my %theirs = %{ $other->getNgramRanks() };

    my $total = 0;
    for my $gram (keys %mine) {
        # 300 stands in for the rank of an n-gram the other table lacks.
        my $reference = exists $theirs{$gram} ? $theirs{$gram} : 300;
        $total += abs($mine{$gram} - $reference);
    }

    return $total;
}
1;