Wrangling with OpenStreetMap Data
OpenStreetMap is an open project, which means it is free and anyone can use and edit it as they like. OpenStreetMap is a direct competitor of Google Maps. How can OpenStreetMap compete with such a giant, you ask? It depends entirely on crowdsourcing: many people around the world willingly update the map, most of them fixing the map of their own country.
OpenStreetMap is powerful, but it relies heavily on human input, and that strength is also its weakness: wherever there is human input there will be human error, so the data is very error-prone.
Take street names, for example. People like to abbreviate the street type: 'Street' becomes 'St.' or 'st.'. In Indonesia, 'Jalan' (English: 'Street') is likewise abbreviated as 'Jln', 'jln', 'jl', or 'Jln.'. Most people would hardly notice, but a data scientist or web developer expects street names to follow one consistent format.
'Jalan Sudirman' -> Jalan <name> -> name = Sudirman
'Jln Sudirman' -> Jalan <name> -> ERROR!
This project intends to fix that: it expands abbreviated names so that street names use one general form. Not only does this benefit professionals, it also gives everyone more consistently structured names to read.
In this project, I want to show you how to fix one type of error: the street address. I chose the whole of Jakarta, the capital of Indonesia. This dataset is huge, with over 250,000 examples. Jakarta is my hometown, and I want to help the community. I will also show you how to put the audited data into a MongoDB instance, and we will use MongoDB's Aggregation Framework to get an overview and analysis of the data.
The changeset is here: http://osmhv.openstreetmap.de/changeset.jsp?id=26730562
If you want to try this yourself, you can always download this source code, and play around with it :)
# Path of the OSM XML extract that the test() helpers below read.
OSM_FILE = 'jakarta.osm'
%load mapparser.py
To audit the OSM file, we first need an overview of the data. To get one, we count how many times each tag occurs in the file.
# %%writefile mapparser.py
#!/usr/bin/env python
import xml.etree.ElementTree as ET
import pprint
def count_tags(filename):
    """Count how many elements of each tag name an XML file contains.

    Args:
        filename: path of the XML file (anything ``ET.iterparse`` accepts,
            including a binary file object).

    Returns:
        A plain dict mapping tag name -> number of elements with that tag.
    """
    tags = {}
    for _event, elem in ET.iterparse(filename):
        tags[elem.tag] = tags.get(elem.tag, 0) + 1
        # iterparse keeps every parsed element attached to the tree. Only the
        # tag name is needed, so drop the element's children, text and
        # attributes once it has been counted to keep memory use down on
        # large extracts.
        elem.clear()
    return tags
def test():
    """Pretty-print the tag histogram of OSM_FILE."""
    pprint.pprint(count_tags(OSM_FILE))
    # Reference result for the small sample file (kept disabled):
    # assert tags == {'bounds': 1, 'member': 3, 'nd': 4, 'node': 20,
    #                 'osm': 1, 'relation': 1, 'tag': 7, 'way': 1}


if __name__ == "__main__":
    test()
%load tags.py
%%writefile tags.py
#!/usr/bin/env python
import xml.etree.ElementTree as ET
import pprint
import re
# Keys made only of lowercase letters and underscores.
lower = re.compile(r'^([a-z]|_)*$')
# Same alphabet, with exactly one colon separating two parts.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Keys containing at least one character from this problematic set.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the ``k`` attribute of a ``tag`` element and bump its counter.

    The first matching category wins, in this order: 'lower', 'lower_colon',
    'problemchars'; anything else is counted as 'other'. Elements whose tag
    is not "tag" leave *keys* untouched.

    Args:
        element: an ElementTree element.
        keys: dict holding the four counters; it is updated in place.

    Returns:
        The same *keys* dict.
    """
    if element.tag != "tag":
        return keys

    categories = (
        ('lower', lower),
        ('lower_colon', lower_colon),
        ('problemchars', problemchars),
    )
    for bucket, pattern in categories:
        if pattern.search(element.attrib['k']):
            break
    else:
        bucket = 'other'
    keys[bucket] += 1
    return keys
def process_map(filename):
    """Tally the tag keys of an OSM XML file by category.

    Args:
        filename: path of the XML file handed to ``ET.iterparse``.

    Returns:
        dict with the counters 'lower', 'lower_colon', 'problemchars' and
        'other', as maintained by ``key_type``.
    """
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
        # The element has been classified; release its contents so the
        # parsed tree does not pile up in memory on large extracts.
        element.clear()
    return keys
def test():
    """Pretty-print the key-category counters of OSM_FILE."""
    # You can point this at another file such as 'map.osm'; the reference
    # assertion below only holds for the original sample.
    pprint.pprint(process_map(OSM_FILE))
    # assert keys == {'lower': 5, 'lower_colon': 0, 'other': 2, 'problemchars': 0}


if __name__ == "__main__":
    test()
%load users.py
%%writefile users.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
import pprint
import re
def get_user(element):
    """Return the ``uid`` attribute of *element*, or None when it has none."""
    return element.attrib.get('uid')
def process_map(filename):
    """Collect the distinct ``uid`` values found in an OSM XML file.

    Args:
        filename: path of the XML file (anything ``ET.iterparse`` accepts).

    Returns:
        A set with the ``uid`` attribute value of every element that has one.
    """
    users = set()
    for _, element in ET.iterparse(filename):
        uid = element.attrib.get('uid')
        if uid is not None:
            users.add(uid)
        # Nothing else is needed from this element; release its contents so
        # the parsed tree does not pile up in memory on large extracts.
        element.clear()
    return users
def test():
    """Pretty-print the set of contributor ids found in OSM_FILE."""
    pprint.pprint(process_map(OSM_FILE))
    # assert len(users) == 6


if __name__ == "__main__":
    test()
import xml.etree.cElementTree as ET
%load audit.py
# %%writefile audit.py
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint
from optparse import OptionParser
# OSMFILE = "sample.osm"
# OSMFILE = "example_audit.osm"
# In Indonesia the street type comes first and the name second, so the
# pattern is anchored at the start of the string rather than at the end.
#street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_type_re = re.compile(r'^\b\S+\.?', re.IGNORECASE)
# expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
# "Trail", "Parkway", "Commons"]
# Leading words that audit_street_type accepts without flagging the name.
expected = ['Jalan', 'Gang','Street', 'Road']
# UPDATE THIS VARIABLE
# Abbreviation -> full street type. The keys are tried longest first
# (update_name sorts them by length, descending).
# Language: English -> Indonesian, {Street: Jalan},
# e.g. {Sudirman Street: Jalan Sudirman}.
mapping = {
'jl.':'Jalan',
'JL.':'Jalan',
'Jl.':'Jalan',
'GG':'Gang',
'gg': 'Gang',
'jl' :'Jalan',
'JL':'Jalan',
'Jl':'Jalan',
}
# mapping = {
# "Ave":"Avenue",
# "St.": "Street",
# "Rd." : "Road",
# "N.":"North",
# "St" : "Street",
# }
def audit_street_type(street_types, street_name):
    """Record *street_name* when its leading word is not an expected type.

    Args:
        street_types: mapping of leading word -> set of street names; the
            name is added under its leading word when it is flagged.
        street_name: the street name to inspect.

    Returns:
        True when the name was flagged (it needs updating), False otherwise.
    """
    found = street_type_re.search(street_name)
    if found is None or found.group() in expected:
        return False
    street_types[found.group()].add(street_name)
    return True
def is_street_name(elem):
    """Tell whether a tag element holds a street address.

    Both ``addr:street`` and ``addr:full`` keys count, so that full
    addresses get fixed as well.
    """
    return elem.attrib['k'] in ("addr:street", "addr:full")
def is_name_is_street(elem):
    """Tell whether a ``name`` tag actually carries an abbreviated street.

    Some contributors put the street into ``k=name``; such a tag is
    recognised when its value starts with one of the abbreviations in
    ``mapping``.

    Returns:
        False when the key is not "name", None when the value has no
        leading word, otherwise whether that word is a ``mapping`` key.
    """
    leading = street_type_re.search(elem.attrib['v'])
    if elem.attrib['k'] != "name":
        return False
    if not leading:
        return leading
    return leading.group() in mapping
def audit(osmfile):
    """Normalise abbreviated street names in an OSM file and save the result.

    For every ``node`` and ``way``:
      * ``addr:street`` / ``addr:full`` values whose leading word is not in
        ``expected`` are recorded and rewritten with ``update_name``;
      * ``name`` values that start with an abbreviation from ``mapping`` are
        rewritten too, and an ``addr:street`` tag holding the fixed name is
        appended to the element.

    The modified tree is written to ``<stem>_audit.osm``, where ``<stem>`` is
    the part of *osmfile* before its first ``.osm`` (or all of *osmfile*
    when it does not contain ``.osm``).

    Args:
        osmfile: path of the OSM XML file to audit.

    Returns:
        defaultdict mapping each unexpected leading word to the set of
        street names that start with it.
    """
    street_types = defaultdict(set)
    # Pass the path so ElementTree opens and closes the file itself.
    tree = ET.parse(osmfile)
    # Snapshot the elements first: tags are appended below and the tree must
    # not grow underneath a live iterator.
    for elem in list(tree.iter()):
        if elem.tag not in ("node", "way"):
            continue
        n_add = None
        for tag in elem.iter("tag"):
            if is_street_name(tag):
                if audit_street_type(street_types, tag.attrib['v']):
                    # Update the tag attribute in place.
                    tag.attrib['v'] = update_name(tag.attrib['v'], mapping)
            elif is_name_is_street(tag):
                tag.attrib['v'] = update_name(tag.attrib['v'], mapping)
                n_add = tag.attrib['v']
        if n_add:
            elem.append(ET.Element('tag', {'k': 'addr:street', 'v': n_add}))
    # partition() yields the whole name when '.osm' is missing, instead of
    # chopping off the last character the way a find() result of -1 would.
    stem = osmfile.partition('.osm')[0]
    tree.write(stem + '_audit.osm')
    return street_types
def update_name(name, mapping):
    """Expand an abbreviated street type in *name* so names are uniform.

    The keys of *mapping* are tried longest first, so that e.g. 'Jl.' wins
    over 'Jl'. A key only matches as a standalone token: it must not be glued
    to a preceding word character, and a key ending in a letter or digit must
    not be followed by one either. This keeps names such as "Anggrek" intact
    even though they contain 'gg'. All standalone occurrences of the first
    matching key are replaced; when the abbreviation is written directly
    against the next word ("Jl.Sudirman") a space is inserted.

    Args:
        name: the street name to normalise.
        mapping: dict of abbreviation -> full street type.

    Returns:
        The name with the abbreviation expanded. If no abbreviation is
        present, the name is returned unchanged when its first word already
        is one of the full forms in *mapping*; otherwise it is prefixed with
        'Jalan ', since in Indonesia every kind of street is addressed as
        'Jalan'.
    """
    for key in sorted((k for k in mapping if k), key=len, reverse=True):
        # Keys ending in punctuation (e.g. 'Jl.') need no right-hand boundary.
        tail = r'(?!\w)' if re.match(r'\w', key[-1], re.UNICODE) else ''
        token = re.compile(r'(?<!\w)' + re.escape(key) + tail, re.UNICODE)
        if not token.search(name):
            continue

        def expand(match, full=mapping[key]):
            following = name[match.end():match.end() + 1]
            if following and re.match(r'\w', following, re.UNICODE):
                return full + ' '
            return full

        return token.sub(expand, name)

    first_word = name.split(None, 1)[:1]
    if first_word and first_word[0] in mapping.values():
        # Already normalised; do not stack a second prefix on top.
        return name
    return 'Jalan ' + name
def test(osmfile='jakarta.osm'):
    """Audit *osmfile* and print every flagged name next to its fixed form.

    Args:
        osmfile: path of the OSM XML file to audit. The path is a parameter
            because no OSMFILE constant is defined in this script.
    """
    st_types = audit(osmfile)
    pprint.pprint(dict(st_types))
    #assert len(st_types) == 3
    for ways in st_types.values():
        for name in ways:
            better_name = update_name(name, mapping)
            # Single-argument print() gives the same output on Python 2 and 3.
            print("%s => %s" % (name, better_name))
if __name__ == '__main__':
    # test()
    # Command-line entry point: audit the file given with -d/--data.
    parser = OptionParser()
    parser.add_option('-d', '--data', dest='audited_data', help='osm data that want to be audited')
    (opts, args) = parser.parse_args()
    if not opts.audited_data:
        # Without this check audit() would be called with None.
        parser.error('no input file given; pass one with -d/--data')
    audit(opts.audited_data)
This saves the audited Jakarta OSM data into jakarta_audit.osm. Now let's prepare the audited file for import into the MongoDB instance.
%load data.py
# %%writefile data.py
#!/usr/bin/env python
import xml.etree.ElementTree as ET
import pprint
import re
import codecs
import json
# Key-classification patterns: keys of lowercase letters/underscores only,
# the same with exactly one colon, and keys containing a problematic
# character. None of the functions visible in this script reference them.
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# Captures the part after the "addr:" prefix of a tag key.
addresschars = re.compile(r'addr:(\w+)')
# Element attributes that are grouped under the "created" sub-document.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
OSM_FILE = 'jakarta_audit.osm'


def shape_element(element):
    """Turn a ``node`` or ``way`` element into a JSON-ready dict.

    Layout of the result:
      * 'type': the element's tag name;
      * 'created': the attributes listed in CREATED;
      * 'pos': [lat, lon] as floats, when both attributes are present;
      * every other attribute is copied as a top-level key;
      * 'address': values of ``addr:<part>`` tags keyed by ``<part>``
        (keys with a second colon are skipped); omitted when empty;
      * every other tag is copied as a top-level ``k: v`` pair;
      * 'node_refs': the ``ref`` of each ``nd`` child (ways only).

    Returns:
        The dict described above, or None for any other element.
    """
    if element.tag not in ("node", "way"):
        return None

    attrs = element.attrib
    doc = {'created': {}, 'type': element.tag}
    for attr_name, attr_value in attrs.items():
        if attr_name in ('lat', 'lon'):
            # Coordinates are stored together under 'pos' below.
            continue
        if attr_name in CREATED:
            doc['created'][attr_name] = attr_value
        else:
            doc[attr_name] = attr_value

    try:
        doc['pos'] = [float(attrs['lat']), float(attrs['lon'])]
    except KeyError:
        # No coordinates on this element.
        pass

    doc.setdefault('address', {})
    for child in element.iter('tag'):
        key, value = child.attrib['k'], child.attrib['v']
        if not key.startswith('addr:'):
            # NOTE(review): a tag key equal to an existing field name
            # (e.g. 'type') replaces that field here.
            doc[key] = value
            continue
        if key.count(':') != 1:
            # "addr:" followed by a further colon-separated part: skipped.
            continue
        part = addresschars.search(key)
        if part:
            doc['address'][part.group(1)] = value

    if not doc['address']:
        doc.pop('address', None)

    # Ways additionally list the nodes they are made of.
    if element.tag == "way":
        doc['node_refs'] = [nd.attrib['ref'] for nd in element.iter('nd')]
    return doc
def process_map(file_in, pretty = False):
    """Convert an OSM XML file into JSON documents ready for MongoDB.

    Every element that ``shape_element`` turns into a document is written to
    ``<file_in>.json`` (one ``json.dumps`` result followed by a newline) and
    also collected in memory.

    Args:
        file_in: path of the OSM XML file.
        pretty: when true, each document is dumped with ``indent=2``.

    Returns:
        The list of shaped documents, in file order.
    """
    file_out = "{0}.json".format(file_in)
    dump_options = {'indent': 2} if pretty else {}
    shaped = []
    with codecs.open(file_out, "w") as sink:
        for _event, elem in ET.iterparse(file_in):
            doc = shape_element(elem)
            if not doc:
                continue
            shaped.append(doc)
            sink.write(json.dumps(doc, **dump_options) + "\n")
    return shaped
def test():
    """Shape OSM_FILE and pretty-print the document at index 500."""
    pprint.pprint(process_map(OSM_FILE)[500])


if __name__ == "__main__":
    test()
The processed map has been saved to jakarta_audit.osm.json. Now that we have processed the audited map file into an array of JSON documents, let's put it into a MongoDB instance. This will use the map that we have audited. First we load the script used to insert the map.
from data import *
import pprint
# Shape the audited extract into a list of documents; process_map also
# writes them to jakarta_audit.osm.json.
data = process_map('jakarta_audit.osm')
Okay, let's check whether the data is what we expect.
# Eyeball the first six shaped documents.
pprint.pprint(data[:6])
The data seems about right. Now that we have verified the data is ready, let's put it into MongoDB.
from pymongo import MongoClient
# Connect to the local MongoDB server and select the "examples" database.
client = MongoClient('mongodb://localhost:27017')
db = client.examples
# Insert the documents one by one with a plain loop: the calls are made for
# their side effect only, so there is no need to build (and echo) a list
# holding one return value per document.
for e in data:
    db.jktosm.insert(e)
Okay, it seems that we have successfully inserted all of our data into the MongoDB instance. Let's test this.
# Smoke test: fetch six documents from the jktosm collection.
# NOTE(review): indexing the return value with ['result'] assumes that
# aggregate() returns a dict here — confirm against the installed pymongo.
pipeline = [
{'$limit' : 6}
]
pprint.pprint(db.jktosm.aggregate(pipeline)['result'])
Show 5 documents that have a street
# Keep only documents that have an address.street field, then take five.
# NOTE(review): ['result'] assumes aggregate() returns a dict — confirm
# against the installed pymongo version.
pipeline = [
{'$match': {'address.street':{'$exists':1}}},
{'$limit' : 5}
]
result = db.jktosm.aggregate(pipeline)['result']
pprint.pprint(result)
Show the top 5 contributing users
# Count documents per created.user, sort by that count in descending order
# and keep the five largest groups.
# NOTE(review): ['result'] assumes aggregate() returns a dict — confirm
# against the installed pymongo version.
pipeline = [
{'$match': {'created.user':{'$exists':1}}},
{'$group': {'_id':'$created.user',
'count':{'$sum':1}}},
{'$sort': {'count':-1}},
{'$limit' : 5}
]
result = db.jktosm.aggregate(pipeline)['result']
pprint.pprint(result)
Show each restaurant's name, the food it serves, and its contact number
# Select documents tagged amenity=restaurant that have a name, and project
# the name as _id together with the cuisine and phone fields.
# NOTE(review): ['result'] assumes aggregate() returns a dict — confirm
# against the installed pymongo version.
pipeline = [
{'$match': {'amenity':'restaurant',
'name':{'$exists':1}}},
{'$project':{'_id':'$name',
'cuisine':'$cuisine',
'contact':'$phone'}}
]
result = db.jktosm.aggregate(pipeline)['result']
pprint.pprint(result)