Skip to content

Commit

Permalink
Further enhancements to scantext()
Browse files Browse the repository at this point in the history
  • Loading branch information
nigelhorne committed Jan 22, 2025
1 parent 39f1529 commit 4d0c3a8
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 14 deletions.
83 changes: 75 additions & 8 deletions lib/Geo/Coder/Free.pm
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,11 @@ sub new {
# Note that this yields many false positives and isn't useable yet
my @matches = $geo_coder->geocode(scantext => 'arbitrary text', region => 'US');
@matches = $geo_coder->geocode(scantext => 'arbitrary text', region => 'US', ignore_words => [ 'foo', 'bar' ]);
=cut

# List of words that scantext should ignore
my %common_words = (
'the' => 1,
'and' => 1,
Expand Down Expand Up @@ -230,6 +233,28 @@ sub geocode {
if(wantarray) {
my @rc = $self->{'openaddr'}->geocode(\%params);
if((my $scantext = $params{'scantext'}) && (my $region = $params{'region'})) {
my %ignore_words;
if($params{'ignore_words'}) {
%ignore_words = map { lc($_) => 1 } @{$params{'ignore_words'}};
}
# ::diag(Data::Dumper->new([\%ignore_words])->Dump());
$region = uc($region);
if($region eq 'US') {
my @candidates = _find_us_addresses($scantext);
if(scalar(@candidates)) {
if(wantarray) {
my @us;
foreach my $candidate(@candidates) {
my @res = $self->{'maxmind'}->geocode("$candidate, USA");
push @us, @res;
}
return @us if(scalar(@us));
}
if(my $rc = $self->{'maxmind'}->geocode($candidates[0] . ', USA')) {
return $rc;
}
}
}
$scantext =~ s/[^\w']+/ /g;
my @a = List::MoreUtils::uniq(split(/\s/, $scantext));
my $iterator = Array::Iterator->new({ __array__ => \@a });
Expand All @@ -238,6 +263,7 @@ sub geocode {
my $w;
if($w) {
next if(exists($common_words{lc($w)}));
next if(exists($ignore_words{lc($w)}));
if($w =~ /^[a-z]{2,}$/i) {
my $peek = $iterator->peek();
last if(!defined($peek));
Expand Down Expand Up @@ -266,29 +292,42 @@ sub geocode {
next if($word !~ /\D/);
# FIXME: There are a *lot* of false positives
next if(exists($common_words{lc($word)}));
next if(exists($ignore_words{lc($word)}));
if($word =~ /^[a-z]{2,}$/i) {
my $key = "$word/$region";
my @matches;
if($self->{'scantext'}->{$key}) {
# ::diag("$key: HIT");
@matches = @{$self->{'scantext'}->{$key}};
if(my $hits = $self->{'scantext'}->{$key}) {
# ::diag("$key: HIT: ", Data::Dumper->new([$hits])->Dump());
@matches = @{$hits} if(scalar(@{$hits}));
} else {
# ::diag("$key: MISS");
@matches = $self->{'maxmind'}->geocode({ location => $word, region => $region });
}
# ::diag(__LINE__, Data::Dumper->new([\@matches])->Dump());
my @m;
foreach my $match(@matches) {
if(ref($match) eq 'HASH') {
$match->{'location'} = "$word, " . uc($region);
$match->{'location'} = "$word, $region";
push @m, $match;
} elsif(ref($match) eq 'ARRAY') {
warn __PACKAGE__, ': TODO: handle array: ', Data::Dumper->new([$match])->Dump();
} else {
warn __PACKAGE__, ': TODO: handle ', ref($match), ': ', Data::Dumper->new([$match])->Dump();
push @m, {
confidence => $match->confidence(),
location => $match->as_string(),
latitude => $match->lat(),
longitude => $match->long(),
lat => $match->lat(),
long => $match->long(),
database => $match->database(),
country => $match->country(),
city => $match->city()
}
}
}
$self->{'scantext'}->{$key} = \@matches;
@rc = (@rc, @matches);
$self->{'scantext'}->{$key} = \@m;
@rc = (@rc, @m);
}

}
}
return @rc if(scalar(@rc) && $rc[0]);
Expand Down Expand Up @@ -336,6 +375,34 @@ sub geocode {
}
}

# Find every substring of the given text that looks like a U.S. street address.
#
# Takes one argument: the text to scan.
# Returns the list of matched substrings, in the order they appear in the text.
# The list is empty when nothing matches or when the text is undefined or empty.
#
# A match is a 1 to 5 digit house number, a street name and one of the known
# street-type words, optionally followed by a city, a two-letter upper-case
# state abbreviation and a ZIP or ZIP+4 code.
sub _find_us_addresses {
	my ($text) = @_;
	my @addresses;

	# Nothing to scan; this also avoids an "uninitialized value" warning on undef
	return @addresses if(!defined($text) || ($text eq ''));

	# Regular expression to match U.S.-style addresses.
	# Every group is non-capturing because only the complete match is wanted.
	my $address_regex = qr/
		\b	# Word boundary
		\d{1,5}	# Street number: 1 to 5 digits
		\s+	# Space
		[A-Za-z0-9\s]+?	# Street name (alphanumeric, allows spaces), as short as possible
		\s+	# Space
		(?:Street|St\.?|Avenue|Ave\.?|Boulevard|Blvd\.?|Road|Rd\.?|Lane|Ln\.?|Drive|Dr\.?)	# Street type
		(?:,\s+[A-Za-z\s]+)?	# Optional city name
		\s*	# Optional spaces
		(?:,\s*[A-Z]{2})?	# Optional state abbreviation
		\s*	# Optional spaces
		(?:\d{5}(?:-\d{4})?)?	# Optional ZIP code, with optional +4 extension
		\b	# Word boundary
	/x;

	# In list context m//g returns the single capture of every match, which
	# here is the complete address. This avoids $&, which slows down every
	# regular expression in the program on Perl versions before 5.20.
	@addresses = ($text =~ /($address_regex)/g);

	return @addresses;
}
=head2 reverse_geocode
$location = $geocoder->reverse_geocode(latlng => '37.778907,-122.39732');
Expand Down
11 changes: 11 additions & 0 deletions lib/Geo/Coder/Free/OpenAddresses.pm
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ sub new {
# @locations = $geocoder->geocode('Portland, USA');
# diag 'There are Portlands in ', join (', ', map { $_->{'state'} } @locations);
@locations = $geo_coder->geocode(scantext => 'arbitrary text', region => 'US', ignore_words => [ 'foo', 'bar' ]);
When looking for a house number in a street, if that address isn't found but that
street is found, a place in the street is given.
So if "106 Wells Street, Fort Wayne, Allen, Indiana, USA" isn't found, a match for
Expand All @@ -147,6 +149,11 @@ sub geocode
$param{location} = shift;
}

my %ignore_words;
if($param{'ignore_words'}) {
%ignore_words = map { lc($_) => 1 } @{$param{'ignore_words'}};
}

if(my $scantext = $param{'scantext'}) {
return if(length($scantext) < 6);
# FIXME: wow this is inefficient
Expand All @@ -160,6 +167,10 @@ sub geocode
$offset++;
next;
}
if(exists($ignore_words{lc($words[$offset])})) {
$offset++;
next;
}
my $l;
if(($l = $self->geocode(location => $words[$offset])) && ref($l)) {
push @rc, $l;
Expand Down
23 changes: 17 additions & 6 deletions t/scantext.t
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
use warnings;
use strict;
use Data::Dumper;
use Test::Most tests => 22;
use Test::Most tests => 24;
use Test::Number::Delta;
use Test::Carp;
use Test::Deep;
Expand Down Expand Up @@ -112,9 +112,20 @@ SCANTEXT: {
# ok($found{'INDIANAPOLIS'});

@locations = $geo_coder->geocode(scantext => 'Nigel Horne was here', region => 'gb');
ok(ref($locations[0]) eq '');
# diag(Data::Dumper->new([\@locations])->Dump()) if($ENV{'TEST_VERBOSE'});
diag(Data::Dumper->new([\@locations])->Dump());
cmp_ok(scalar(@locations), '==', 1, 'Found one match for Horne in GB');
diag(Data::Dumper->new([\@locations])->Dump()) if($ENV{'TEST_VERBOSE'});
cmp_ok(lc($locations[0]->{'city'}), 'eq', 'horne', 'There is a place near Gatwick called Horne');

@locations = $geo_coder->geocode(scantext => 'Nigel Horne was here', region => 'gb', ignore_words => [ 'horne' ]);
# cmp_ok(scalar(@locations), '==', 0, 'ignore_words are ignored');
cmp_ok($locations[0], 'eq', '', 'Empty string'); # FIXME: should be undef
diag(__LINE__, ': ', Data::Dumper->new([\@locations])->Dump()) if($ENV{'TEST_VERBOSE'});

@locations = $geo_coder->geocode({
scantext => 'Send it to 123 Main Street, Springfield, IL 62704 or to 456 Elm St., Denver, CO. Other options: 789 Pine Blvd, Austin, TX.',
region => 'us'
});
diag(Data::Dumper->new([\@locations])->Dump()) if($ENV{'TEST_VERBOSE'});

eval 'use Test::Memory::Cycle';
if($@) {
Expand All @@ -124,10 +135,10 @@ SCANTEXT: {
}
} elsif(!defined($ENV{'AUTHOR_TESTING'})) {
diag('Author tests not required for installation');
skip('Author tests not required for installation', 21);
skip('Author tests not required for installation', 23);
} else {
diag('Set OPENADDR_HOME to enable openaddresses.io testing');
skip('Set OPENADDR_HOME to enable openaddresses.io testing', 21);
skip('Set OPENADDR_HOME to enable openaddresses.io testing', 23);
}
}
}

0 comments on commit 4d0c3a8

Please sign in to comment.