This repository has been archived by the owner on Mar 7, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit f9ff902
Showing
7 changed files
with
182,971 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# OpenStreetMap | ||
This project is about wrangling data coming from OpenStreetMap. | ||
The goal of this project was to learn the steps for obtaining data, saving it to a database with MongoDB and its Python driver, and cleaning that data.
|
||
Even though the process wasn't linear, the scripts are meant to be run in the following order. Once you have exported the XML file from OpenStreetMap, run `model.py` to convert the XML to JSON. Next, run `import.py` to import the JSON documents into MongoDB. Then run `street_spider.py` with Scrapy to scrape street-name information from a local wiki, and run `import.py` again to import the file generated by the spider. Finally, run `clean.py` to correct some of the initially imported street names using the scraped ones.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
import pymongo, pprint, json | ||
import sys, re | ||
|
||
def has_num(s):
    """Return True if the string *s* contains at least one digit character."""
    for ch in s:
        if ch.isdigit():
            return True
    return False
|
||
# Malformed postcode values (keys) mapped to the four-digit code that
# should replace them (values).
postcodes = {
    # leading zeros missing
    "10": "0010",
    "26": "0026",
    "50": "0050",
    # country prefix included
    "N-0286": "0286",
    # place name appended to the code
    "1325 Lysaker": "1325",
    "1283 Oslo": "1283",
    "1900 Fetsund": "1900",
}
|
||
# Street-name abbreviations to fix: each key is a compiled pattern that
# recognises a name containing the abbreviation (here " gt"), each value
# is the full word the abbreviation stands for.
abbreviations = {
    re.compile(r'^.*( gt).*$'): "gate",
}
|
||
|
||
def levenshtein(s1, s2):
    """
    Return the Levenshtein (edit) distance between s1 and s2, i.e. the
    minimum number of single-character insertions, deletions and
    substitutions needed to turn one into the other.
    """
    # Make s1 the longer of the two so the rows below follow the shorter one.
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    if not s2:
        return len(s1)

    # Only two rows of the edit matrix are kept: the previous one and the
    # one currently being filled. Each row has len(s2) + 1 cells.
    prev = list(range(len(s2) + 1))
    for row, ch1 in enumerate(s1, 1):
        cur = [row]
        for col, ch2 in enumerate(s2):
            insert_cost = prev[col + 1] + 1
            delete_cost = cur[col] + 1
            # a bool adds as 0 (characters equal) or 1 (characters differ)
            replace_cost = prev[col] + (ch1 != ch2)
            cur.append(min(insert_cost, delete_cost, replace_cost))
        prev = cur

    return prev[-1]
|
||
def distance(str1, str2):
    """
    Return a similarity score between str1 and str2 as a percentage.

    The score is 100 minus the Levenshtein distance expressed as a
    percentage of len(str2), truncated to an int. 100 means the strings
    are identical; the score can go below 0 when the number of edits
    exceeds len(str2).

    An empty str2 cannot be used as the reference length, so in that case
    the score is 100 if str1 is empty as well and 0 otherwise.
    """
    # Guard against dividing by len(str2) == 0.
    if not str2:
        return 100 if not str1 else 0

    return 100 - int(levenshtein(str1, str2) / float(len(str2)) * 100)
|
||
|
||
def update_field(collection, field, old_value, new_value):
    """
    Set `field` to `new_value` in every document of `collection` where
    `field` currently equals `old_value`, and report the outcome.

    collection: object exposing update_many(query, update)
    field: name of the field to match and rewrite
    old_value: value to look for
    new_value: replacement value

    Prints the change and the matched/modified counts, then returns the
    result object given back by update_many.
    """
    query = {field: old_value}
    update = {"$set": {field: new_value}}
    res = collection.update_many(query, update)

    # print(...) with one pre-formatted string works on both Python 2 and
    # Python 3 and produces the same output as the old print statements.
    print("%s  ->  %s" % (old_value, new_value))
    print("(matched-modified): %s %s" % (res.matched_count, res.modified_count))

    return res
|
||
def check_streets(db_name, data, streets):
    """
    Clean postcodes and street names stored in MongoDB.

    db_name: name of the database
    data: name of the collection holding the documents to clean
    streets: name of the collection holding the reference street names

    Steps:
      1. rewrite every postcode listed in `postcodes`;
      2. for each distinct "addr.street" value, expand the " gt"
         abbreviation, then look the name up in the streets collection
         with a $text search and score the first hit with `distance`;
      3. hits scoring 86-99 replace the stored name (after asking the
         user when both names contain a digit); everything else is
         discarded.

    Prints a summary of the counters at the end.

    NOTE: the $text query relies on a text index on the streets
    collection; this function does not create one.
    """
    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        ask = raw_input
    except NameError:
        ask = input

    client = pymongo.MongoClient("localhost", 27017)
    db = client[db_name]

    streets_col = db[streets]
    data_col = db[data]

    # fix postcodes
    for old, new in postcodes.items():
        res = update_field(data_col, "addr.postcode", old, new)
        print("Postcode (matched,modified):  %s %s" % (res.matched_count, res.modified_count))

    # one result document per distinct street name, exposed under "name"
    pipeline = [
        {"$match": {"addr.street": {"$exists": 1}}},
        {"$group": {"_id": "$addr.street"}},
        {"$project": {"name": "$_id"}},
    ]
    data_by_streets = data_col.aggregate(pipeline)

    not_found = 0
    discarded = 0
    changed = 0
    changed_total = 0

    for street in data_by_streets:

        # fix abbreviations
        for abbr in abbreviations:
            if abbr.match(street["name"]):
                new = street["name"].replace(" gt", " gate").strip(".")
                res = update_field(data_col, "addr.street", street["name"], new)
                changed_total += res.modified_count
                changed += 1
                # keep matching against the expanded name from here on
                street["name"] = new

        match = streets_col.find_one({"$text": {"$search": street["name"]}})

        if match is None:
            not_found += 1
            continue

        evaluation = distance(street["name"], match["name"])
        # 86-99 auto correct
        # 0-85, 100 discard
        if not 85 < evaluation < 100:
            discarded += 1
            continue

        # if both the current street name and the match have a number
        # ask the user cause the number could differ and change completely
        # the street name, even if the street names are really similar
        if has_num(match["name"]) and has_num(street["name"]):
            choice = ask("{} -> {} ({})[y/N]: ".format(street["name"], match["name"], evaluation))
            if choice.lower() != "y":
                discarded += 1
                continue

        res = update_field(data_col, "addr.street", street["name"], match["name"])
        changed_total += res.modified_count
        changed += 1

    print("not found:  %s" % not_found)
    print("discarded:  %s" % discarded)
    print("changed:  %s" % changed)
    print("changed total:  %s" % changed_total)
|
||
if __name__ == "__main__":
    # Check the argument count up front instead of catching IndexError
    # around the whole run: an IndexError raised while cleaning would
    # otherwise be reported as a usage error.
    if len(sys.argv) < 4:
        print("usage of this program:\n\tpython clean.py [database] [data_collection] [streets_collection]")
    else:
        db_name, data_col, streets_col = sys.argv[1:4]
        check_streets(db_name, data_col, streets_col)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
import pymongo | ||
import sys, re | ||
import json | ||
|
||
# (host, port) pair passed to pymongo.MongoClient
mongo = (
    "localhost",
    27017,
)
|
||
|
||
def iter_json(filename):
    """
    Generator to read a JSON file line by line
    useful to read big files without eating up RAM.

    Each line is expected to hold one JSON object, optionally wrapped in
    the "[" / "]" of an enclosing array and followed (or preceded) by the
    "," separating array elements. Lines that are empty once those
    delimiters are removed (blank lines, a lone "[" or "]") are skipped.

    Raises ValueError if a remaining line is not valid JSON.
    """
    with open(filename, "r") as f:
        for jsonline in f:
            # Strip array delimiters, separators and any whitespace
            # (including "\r\n" line endings) from both ends in one pass.
            # An object starts with "{" and ends with "}", so its own
            # content is never touched.
            payload = jsonline.strip("[], \t\r\n")
            if not payload:
                continue
            yield json.loads(payload)
|
||
def get_jsonlines(filename, n=5):
    """
    Return a list with the first n documents
    from a JSON file.

    The list is shorter than n when the file holds fewer documents, and
    empty when n is zero or negative. At most n documents are parsed.
    """
    from itertools import islice

    # islice stops after n items, so the result is returned even when the
    # file runs out first (the old loop fell through and returned None).
    return list(islice(iter_json(filename), max(n, 0)))
|
||
def mongo_import(db_name, col_name, filename, overwrite=True, v=(False, 0)):
    """
    Imports JSON documents from a file to MongoDB.
    db_name: name of the database
    col_name: name of the collection
    filename: name of the source file
    overwrite: boolean, when true the collection is dropped before
               the import starts
    v: verbose, tuple containing a boolean and an int
       representing the interval (a progress line is printed every
       `interval` documents; an interval of 0 or less disables it)

    Always prints the total number of documents added at the end.
    """
    client = pymongo.MongoClient(mongo[0], mongo[1])
    collection = client[db_name][col_name]

    if overwrite:
        collection.drop()

    verbose = v[0]
    interval = v[1]

    count = 0
    for document in iter_json(filename):
        collection.insert_one(document)
        # Count first, then report, so the progress line states how many
        # documents have actually been inserted.
        count += 1
        if verbose and interval > 0 and count % interval == 0:
            print("%s  documents added." % count)
    print("%s  documents added." % count)
|
||
|
||
|
||
if __name__ == "__main__":
    # Check the argument count up front instead of catching IndexError
    # around the whole run: an IndexError raised during the import would
    # otherwise be reported as a usage error.
    if len(sys.argv) < 4:
        print("usage of this program:\n\tpython import.py [database] [collection] [filename]")
    else:
        db_name, col_name, filename = sys.argv[1:4]
        mongo_import(db_name, col_name, filename, overwrite=True, v=(True, 10000))
Oops, something went wrong.