Skip to content
This repository has been archived by the owner on Mar 7, 2021. It is now read-only.

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
nglgzz committed Dec 11, 2016
0 parents commit f9ff902
Show file tree
Hide file tree
Showing 7 changed files with 182,971 additions and 0 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# OpenStreetMap
This project is about wrangling data coming from OpenStreetMap.
The goal of this project was to learn the steps for getting data, using MongoDB and its Python driver to save it in a database, and cleaning that data.

Even though the process wasn't linear, the scripts are meant to be used in the following order. Once you have exported the XML file from OpenStreetMap, `model.py` is
executed to parse the XML into JSON. Next, `import.py` imports the JSON documents into MongoDB. Once that is finished, `street_spider.py` is run with Scrapy to
scrape street name information from a local wiki, and `import.py` is run again to import the file generated by the spider. Finally, `clean.py` corrects some of the street names we initially imported, using the scraped ones.
133 changes: 133 additions & 0 deletions clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import pymongo, pprint, json
import sys, re

def has_num(s):
    """
    Returns True if the string s contains at least one digit character,
    False otherwise (including for the empty string).
    """
    return any(c.isdigit() for c in s)

# Postcodes to fix: each key is a malformed value found in the data,
# each value is the four-character postcode that replaces it.
postcodes = {
    "10": "0010",
    "26": "0026",
    "50": "0050",
    "N-0286": "0286",
    "1325 Lysaker": "1325",
    "1283 Oslo": "1283",
    "1900 Fetsund": "1900",
}

# Abbreviations to fix: each key is a compiled pattern that recognises a
# street name containing the abbreviation, each value is the full word.
# The pattern below matches any single-line name that contains " gt".
abbreviations = {
    re.compile(r'^.*( gt).*$'): "gate",
}


def levenshtein(s1, s2):
    """
    Returns the Levenshtein distance between s1 and s2, i.e. the minimum
    number of single-character insertions, deletions and substitutions
    needed to turn one string into the other.
    """
    # The longer string drives the outer loop, so each row of the
    # dynamic-programming table is only as wide as the shorter string.
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    if len(s2) == 0:
        return len(s1)

    # above[k] is the distance between the prefix of s1 processed so far
    # and the first k characters of s2.
    above = list(range(len(s2) + 1))
    for row, char_long in enumerate(s1, 1):
        current = [row]
        for col, char_short in enumerate(s2, 1):
            insert_cost = above[col] + 1
            delete_cost = current[col - 1] + 1
            # the comparison adds 0 when the characters are equal, 1 otherwise
            substitute_cost = above[col - 1] + (char_long != char_short)
            current.append(min(insert_cost, delete_cost, substitute_cost))
        above = current

    return above[-1]

def distance(str1, str2):
    """
    Returns the similarity of str1 to str2 as a percentage.

    The Levenshtein distance between the two strings is expressed as a
    percentage of len(str2), truncated to an int and subtracted from 100,
    so 100 means the strings are identical. The result is negative when
    the edit distance is larger than len(str2).

    An empty str2 cannot be used as the reference length, so in that case
    100 is returned when str1 is empty as well and 0 otherwise.
    """
    if len(str2) == 0:
        # avoid dividing by zero below
        return 100 if len(str1) == 0 else 0

    return 100 - int(levenshtein(str1, str2) / float(len(str2)) * 100)


def update_field(collection, field, old_value, new_value):
query = {field: old_value}
update = {"$set":{field: new_value}}
res = collection.update_many(query, update)

print old_value, " -> ", new_value
print "(matched-modified):", res.matched_count, res.modified_count

return res

def check_streets(db_name, data, streets):
client = pymongo.MongoClient("localhost", 27017)
db = client[db_name]

streets_col = db[streets]
data_col = db[data]

# fix postcodes
for old, new in postcodes.items():
res = update_field(data_col, "addr.postcode", old, new)
print "Postcode (matched,modified): ", res.matched_count, res.modified_count

pipeline = [{"$match":{"addr.street":{"$exists":1}}}, {"$group":{"_id":"$addr.street"}}, {"$project":{"name":"$_id"}}]
data_by_streets = data_col.aggregate(pipeline)


not_found = 0
discarded = 0
changed = 0
changed_total = 0

for street in data_by_streets:

# fix abbreviations
for abbr in abbreviations:
if abbr.match(street["name"]):
new = street["name"].replace(" gt", " gate").strip(".")
res = update_field(data_col, "addr.street", street["name"], new)
changed_total += res.modified_count
changed += 1
street["name"] = new


match = streets_col.find_one({"$text":{"$search":street["name"]}})

if match != None:
evaluation = distance(street["name"], match["name"])
# 86-99 auto correct
# 0-85, 100 discard
if evaluation > 85 and evaluation < 100:
# if both the current street name and the match have a number
# ask the user cause the number could differ and change completely
# the street name, even if the street names are really similar
if has_num(match["name"]) and has_num(street["name"]):
choice = raw_input("{} -> {} ({})[y/N]: ".format(street["name"], match["name"], evaluation))
if choice.lower() == "y":
res = update_field(data_col, "addr.street", street["name"], match["name"])
changed_total += res.modified_count
changed += 1
else:
discarded += 1
else:
res = update_field(data_col, "addr.street", street["name"], match["name"])
changed_total += res.modified_count
changed += 1
else:
discarded += 1
else:
not_found += 1

print "not found: ", not_found
print "discarded: ", discarded
print "changed: ", changed
print "changed total: ", changed_total

if __name__ == "__main__":
try:
db_name = sys.argv[1]
data_col = sys.argv[2]
streets_col = sys.argv[3]

check_streets(db_name, data_col, streets_col)
except IndexError:
print "usage of this program:\n\tpython import.py [database] [data_collection] [streets_collection]"
64 changes: 64 additions & 0 deletions import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import pymongo
import sys, re
import json

# (host, port) pair passed to pymongo.MongoClient by mongo_import
mongo = ("localhost", 27017)


def iter_json(filename):
    """
    Generator to read a JSON file line by line
    useful to read big files without eating up RAM.

    Every line is expected to hold one JSON document. Surrounding
    whitespace, the "[" and "]" of an enclosing array and the ","
    separating its items are removed before the line is parsed.
    Lines that are empty after that (blank lines, a lone "[" or "]")
    are skipped.
    """
    with open(filename, "r") as f:
        for jsonline in f:
            # a single strip with the whole character set does not depend
            # on the order in which brackets, commas and newlines appear
            document = jsonline.strip("[], \t\r\n")
            if document:
                yield json.loads(document)

def get_jsonlines(filename, n=5):
    """
    Return a list with the first n documents
    from a JSON file.

    When the file holds fewer than n documents all of them are returned,
    so the result is always a list.
    """
    data = []
    for document in iter_json(filename):
        if len(data) >= n:
            break
        data.append(document)
    # returning after the loop also covers files with n documents or less,
    # where the loop ends without ever reaching the break
    return data

def mongo_import(db_name, col_name, filename, overwrite = True, v = (False, 0)):
"""
Imports JSON documents from a file to MongoDB.
db_name: name of the database
col_name: name of the collection
filename: name of the source file
overwrite: boolean, self-explaining
v: verbose, tuple containing a boolean and an int
representing the interval
"""
client = pymongo.MongoClient(mongo[0], mongo[1])
collection = client[db_name][col_name]

c = 0
if overwrite:
collection.drop()

for json in iter_json(filename):
collection.insert_one(json)
if v[0] and c % v[1] == 0:
print c, " documents added."
c += 1
print c, " documents added."



if __name__ == "__main__":
try:
db_name = sys.argv[1]
col_name = sys.argv[2]
filename = sys.argv[3]

mongo_import(db_name, col_name, filename, overwrite = True, v = (True, 10000))
except IndexError:
print "usage of this program:\n\tpython import.py [database] [collection] [filename]"
Loading

0 comments on commit f9ff902

Please sign in to comment.