-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.php
153 lines (130 loc) · 5.18 KB
/
scraper.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
<?php
// Database bootstrap.
//
// Opens (or creates) the on-disk result store `data.sqlite`, makes sure the
// `data` table exists, and then mirrors that table into an in-memory SQLite
// database. The in-memory copy is used later as a checklist: entries that are
// still listed on the wiki get removed from it, so whatever remains at the end
// no longer exists in the wiki table.
//
// Both connections throw PDOException on error. A failure here is fatal: the
// message is printed and the script exits with a non-zero status so that a
// broken run is not mistaken for a successful one.
try {
    // open or create data.sqlite database
    $file_db = new PDO('sqlite:data.sqlite');
    $file_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
    $file_db->exec("CREATE TABLE IF NOT EXISTS data (
        name TEXT,
        location TEXT,
        website TEXT,
        lat REAL,
        lon REAL,
        rating TEXT,
        contact TEXT)");

    // copy that database to memory so we can remove entries that do not
    // exist anymore in the table
    $mem_db = new PDO('sqlite::memory:');
    $mem_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
    // The file name is a string literal, so it is single-quoted in SQL.
    $mem_db->exec("ATTACH 'data.sqlite' AS filedb");
    $mem_db->exec('CREATE TABLE data AS SELECT * FROM filedb.data');
    $mem_db->exec('DETACH filedb');
} catch (PDOException $e) {
    // Print the PDOException message and signal failure to the caller.
    echo $e->getMessage(), "\n";
    exit(1);
}
//require 'scraperwiki.php';
//require 'scraperwiki/simple_html_dom.php';
require_once 'vendor/autoload.php';
require_once 'vendor/openaustralia/scraperwiki/scraperwiki.php';
use PGuardiario\PGBrowser;
use Torann\DomParser\HtmlDom;
// Bookkeeping shared with the row loop and the final report:
// $i is the counter printed in the "located" log line, and $notLocated
// collects a description of every lab that could not be geocoded.
$i = 0;
$notLocated = array();

// Download the labs portal page and parse its HTML into a DOM that can be
// queried with CSS-style selectors. (This replaced an earlier
// scraperWiki::scrape() + simple_html_dom implementation.)
$client = new PGBrowser();
$portalPage = $client->get("http://wiki.fablab.is/wiki/Portal:Labs");
$dom = HtmlDom::fromString($portalPage->html);
// Main scrape loop.
//
// Walks every row of the labs wiki table. A lab whose name is already stored
// in data.sqlite is ticked off in the in-memory copy and skipped. A new lab is
// geocoded through the places web service and, when a position is found,
// inserted into data.sqlite. Labs that cannot be geocoded are collected in
// $notLocated for the final report.
//
// The three statements below do not change between rows, so they are prepared
// once instead of on every iteration.
// NOTE(review): the lookups use LIKE, so a lab name containing % or _ is
// treated as a pattern — confirm whether an exact match (=) was intended.
$findStored = $file_db->prepare("select * from data where name LIKE :name");
$markSeen = $mem_db->prepare("delete from data where name LIKE :name");
$insertLab = $file_db->prepare("INSERT INTO data (name, location, website, lat, lon, rating, contact)
VALUES (:name, :location, :website, :lat, :lon, :rating, :contact)");

// The request URL is built as $geocode_url . <url-encoded location> . $app_id.
// SECURITY NOTE(review): the app id is a credential committed to source;
// consider loading it from the environment or a config file instead.
//$geocode_url = 'http://open.mapquestapi.com/nominatim/v1/search?format=json&q=';
$geocode_url = "http://where.yahooapis.com/v1/places.q('";
$app_id = "')?format=JSON&appid=DX4mM4PV34ESO96yg70UGL5nu87SZ.gLXnubndwBjFvVp6_6LlnRfyd7Co_4s_W1q3se1LE-";

foreach ($dom->find("#content .wikitable tr") as $data) {
    $tds = $data->find("td");
    // Rows without <td> cells carry no data, and cells 1-4 are read
    // unconditionally below, so skip any row with fewer than five cells.
    if (count($tds) < 5) {
        continue;
    }
    $i++; // number of data rows seen so far; shown in the "located" log line

    $country = trim($tds[1]->plaintext);
    $city = trim($tds[2]->plaintext);
    $combinedLocation = $country.", ".$city;
    $name = trim(strip_tags($tds[3]->plaintext));
    echo " before ".$combinedLocation;

    // Figure out if this lab exists in the db already, and if so remove it
    // from the in-memory checklist so it is not purged at the end of the run.
    $findStored->execute(array(':name' => $name));
    if (count($findStored->fetchAll()) > 0) {
        echo (" location: ".$combinedLocation." already in database\n");
        $markSeen->execute(array(':name' => $name));
        continue;
    }
    echo (" location: ".$name." not yet in database, lets add\n");

    $website = $tds[4]->plaintext;
    // The rating and contact columns are optional in the wiki table.
    $rating = (count($tds) >= 6) ? $tds[5]->plaintext : "";
    $contact = (count($tds) >= 7) ? $tds[6]->plaintext : "";

    // Geocode "country, city".
    $combinedLocationQuery = urlencode($combinedLocation);
    $requestUrl = $geocode_url.$combinedLocationQuery.$app_id;
    $geoResult = file_get_contents($requestUrl);

    // Throttle: pause 0.5-2.0 s after every geocoder request, whether or not
    // it succeeded. sleep() only accepts whole seconds, so use usleep().
    usleep(rand(500000, 2000000));

    // file_get_contents() returns false on failure and json_decode() returns
    // null on malformed input; treat both like "no match".
    $geoJSON = ($geoResult === false) ? null : json_decode($geoResult);
    $plObj = null;
    if (isset($geoJSON->places->count, $geoJSON->places->place[0])
        && $geoJSON->places->count > 0) {
        $plObj = $geoJSON->places->place[0];
    }

    if ($plObj === null
        || !isset($plObj->centroid->latitude, $plObj->centroid->longitude)) {
        echo "Can't locate: $name ($combinedLocation) ($combinedLocationQuery)\n";
        print("geocode_url: ".$requestUrl."\n");
        $notLocated[] = "$name ($combinedLocation) ($combinedLocationQuery)";
        continue;
    }

    $lat = $plObj->centroid->latitude;
    $lng = $plObj->centroid->longitude;
    print($i." located ".$name." (".$lat." x ".$lng.")\n");

    $insertLab->execute(array(
        'name' => $name,
        'location' => $combinedLocation,
        'website' => $website,
        'lat' => $lat,
        'lon' => $lng,
        'rating' => $rating,
        'contact' => $contact
    ));
}
// Post-processing: purge stale rows and print the run summary.
//
// Every row still present in the in-memory copy was not matched against the
// wiki table during the scrape, so it is deleted from data.sqlite by name.
// NOTE(review): if the scrape yielded no rows at all (for example the page
// failed to load), every stored row is still "unmatched" here and would be
// deleted — consider guarding against an empty scrape.
$staleRows = $mem_db->query("select * from data")->fetchAll();
echo "stored locations no longer in table: ", count($staleRows), "\n";

$purgeStale = $file_db->prepare("delete from data where name LIKE :name");
foreach ($staleRows as $row) {
    // Column 0 is `name`, the first column of the data table.
    $staleName = $row[0];
    echo "\t", $staleName, "\n";
    $purgeStale->bindValue(':name', $staleName, PDO::PARAM_STR);
    $purgeStale->execute();
}

// Summary of the labs the geocoder could not place, one per line.
$missingCount = count($notLocated);
echo "unable to locate: ", $missingCount, " locations\n";
if ($missingCount > 0) {
    echo "\t", implode("\n\t", $notLocated);
}

// Drop both PDO handles to close the database connections.
$file_db = null;
$mem_db = null;
?>