Scraping Manai Khoroo for Fun and Social Good
Unlocking geotagged data in Ulaanbaatar
In the Western world, we take for granted that we will be able to do a Google search and find the location of almost anything around us. Want to find the nearest grocery store? How about the schools near your house? All a click away.
Unfortunately, in much of the developing world this is not the case. Unless a business takes it upon itself to
import json
import math
import os

import pandas as pd
import requests
# Downloaded layers, keyed by the short names below; filled in by the download loop.
all_dfs = dict()

# One row per layer: (short name, FeatureServer layer URL). Keeping the two
# side by side makes the name <-> URL pairing explicit; the parallel `names`
# and `urls` lists used by the rest of the script are derived from it.
_LAYERS = [
    ('university',
     'https://dms.ulaanbaatar.mn/arcgis/rest/services/Manaikhoroo/Manaikhoroo/FeatureServer/3'),
    ('training_center',
     'https://dms.ulaanbaatar.mn/arcgis/rest/services/Manaikhoroo/Manaikhoroo/FeatureServer/7'),
    ('kindergarten',
     'https://dms.ulaanbaatar.mn/arcgis/rest/services/Education/School_kinder_2019/FeatureServer/0'),
    ('schools',
     'https://dms.ulaanbaatar.mn/arcgis/rest/services/Education/School_kinder_2019/FeatureServer/1'),
    ('pharmacy',
     'https://dms.ulaanbaatar.mn/arcgis/rest/services/Health/Emiin_san/FeatureServer/0'),
    # NOTE(review): this URL ends with a slash, so appending "/query?..." to it
    # produces "//query" — kept as in the original; confirm the server accepts it.
    ('cameras',
     'https://dms.ulaanbaatar.mn/arcgis/rest/services/Manaikhoroo/Manaikhoroo3/FeatureServer/1/'),
    ('grocery_store_8',
     'https://dms.ulaanbaatar.mn/arcgis/rest/services/Manaikhoroo/Manaikhoroo/FeatureServer/10'),
    ('grocery_store_6',
     'https://dms.ulaanbaatar.mn/arcgis/rest/services/Manaikhoroo/Manaikhoroo/FeatureServer/0'),
    ('tuts',
     'https://dms.ulaanbaatar.mn/arcgis/rest/services/Manaikhoroo/Manaikhoroo/FeatureServer/4'),
]

names = [layer_name for layer_name, _ in _LAYERS]
urls = [layer_url for _, layer_url in _LAYERS]
# Query suffix appended to a FeatureServer layer URL. It asks for every record
# (where=1=1, outFields=*) intersecting an envelope given in spatial reference
# 102100 / 3857, returned as JSON in that same reference.
_ENVELOPE_QUERY = (
    "/query?f=json&geometry=%7B%22spatialReference%22%3A%7B%22latestWkid%22%3A3857"
    "%2C%22wkid%22%3A102100%7D%2C%22xmin%22%3A{xmin}%2C%22ymin%22%3A{ymin}"
    "%2C%22xmax%22%3A{xmax}%2C%22ymax%22%3A{ymax}%7D&maxRecordCountFactor=3"
    "&outFields=*&outSR=102100&resultType=tile&returnExceededLimitFeatures=true"
    "&spatialRel=esriSpatialRelIntersects&where=1%3D1"
    "&geometryType=esriGeometryEnvelope&inSR=102100"
)

# Envelopes queried for every layer, as (xmin, ymin, xmax, ymax).
# NOTE(review): the two envelopes only touch at one corner (the first one's
# max corner is the second one's min corner), so together they do not cover
# the full rectangle spanning both — confirm this is the intended coverage.
_TILE_EXTENTS = (
    (11858134.82, 6066042.565, 11897270.58, 6085610.444),
    (11897270.58, 6085610.444, 11936406.34, 6105178.323),
)


def _fetch_tile(feature, xmin, ymin, xmax, ymax):
    """Download one envelope of the layer at ``feature`` as a DataFrame."""
    url = feature + _ENVELOPE_QUERY.format(xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax)
    # A timeout keeps an unresponsive server from hanging the whole run.
    r = requests.get(url, timeout=60)
    # Surface HTTP errors here instead of as a confusing JSON/KeyError below.
    r.raise_for_status()
    # Decode the body once and reuse it for both attributes and geometry.
    features = json.loads(r.content.decode("UTF-8"))['features']
    attributes = pd.DataFrame([x['attributes'] for x in features])
    geometry = pd.DataFrame([x['geometry'] for x in features])
    return attributes.join(geometry)


def get_features(feature):
    """Fetch all records of one FeatureServer layer inside the fixed envelopes.

    Args:
        feature: Base URL of the layer (for example ``.../FeatureServer/3``);
            the query string is appended to it verbatim.

    Returns:
        A DataFrame with one row per returned feature: the feature's
        ``attributes`` fields joined with its ``geometry`` fields. Rows from
        the envelopes are stacked in order and the index is not reset, so
        index labels can repeat across envelopes.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
        KeyError: If the response has no ``features`` key, or a feature lacks
            ``attributes`` or ``geometry``.
    """
    tiles = [_fetch_tile(feature, *extent) for extent in _TILE_EXTENTS]
    # DataFrame.append was removed in pandas 2.0; concat stacks the frames the
    # same way (original index labels kept).
    return pd.concat(tiles)
Grab the data.
# Download every layer and store its frame under the matching short name.
# The generator is consumed lazily, so layers are fetched and stored one at a
# time, in list order.
all_dfs.update((name, get_features(url)) for name, url in zip(names, urls))
Get the shape of the data.
# Print the (rows, columns) shape of each downloaded layer, in insertion order.
for df in all_dfs.values():
    print(df.shape)
Convert spatial reference to lat, long.
Adapted from the original function at: https://gist.github.com/maptiler/fddb5ce33ba995d5523de9afdf8ef118
def convert_spatial_reference(point):
    """Convert a Web Mercator "x,y" string to a (lat, lon) tuple in degrees.

    Args:
        point: Text of the form ``"x,y"`` with both coordinates in metres.
            Only the first two comma-separated fields are read.

    Returns:
        ``(lat, lon)`` — note the order: latitude first, longitude second.
    """
    fields = point.split(',')
    easting = float(fields[0])
    northing = float(fields[1])

    # Half the circumference of the sphere (radius 6378137 m): the distance
    # from the projection origin to its +/-180 degree edge.
    half_circumference = 2 * math.pi * 6378137 / 2.0

    # Both axes scale linearly to degrees; for latitude this is only the
    # projected value, which still has to be un-stretched below.
    lon = (easting / half_circumference) * 180.0
    projected_lat = (northing / half_circumference) * 180.0

    # Invert the Mercator stretching to recover the geographic latitude.
    lat = 180 / math.pi * (
        2 * math.atan(math.exp(projected_lat * math.pi / 180.0)) - math.pi / 2.0
    )
    return lat, lon
Create x, y column
# Add an 'xy' text column of the form "x,y" to every layer, which is the input
# format convert_spatial_reference expects.
# NOTE(review): assumes every layer's geometry provides 'x' and 'y' columns —
# confirm for each layer.
for df in all_dfs.values():
    x_text = df['x'].astype('str')
    y_text = df['y'].astype('str')
    df['xy'] = x_text + ',' + y_text
Apply function
# Convert each "x,y" string and store the resulting (lat, lon) tuple in a new
# 'lat_lon' column.
for df in all_dfs.values():
    df['lat_lon'] = df['xy'].map(convert_spatial_reference)
# Notebook-style preview of the first rows of one layer (the value is only
# displayed when this runs as the last expression of a notebook cell).
all_dfs['kindergarten'].head()

# Write one CSV per layer. pandas does not create missing parent directories,
# so make sure the export folder exists before the first write.
os.makedirs('datasets/manai_khoroo', exist_ok=True)
for name, df in all_dfs.items():
    df.to_csv(f'datasets/manai_khoroo/{name}.csv')
# Layers of the Admin service to mirror locally as GeoJSON: (output file, query URL).
# NOTE(review): judging by the file names, layer 0 holds khoroo boundaries and
# layer 1 duureg boundaries — confirm against the service.
_BOUNDARY_DOWNLOADS = (
    ('khoroo.json',
     'https://dms.ulaanbaatar.mn/arcgis/rest/services/Admin/Admin/FeatureServer/0/query?where=area_m2%3E0&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=xyFootprint&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson'),
    ('duureg.json',
     'https://dms.ulaanbaatar.mn/arcgis/rest/services/Admin/Admin/FeatureServer/1/query?where=POPULATION%3E0&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=xyFootprint&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson'),
)

for _boundary_path, url in _BOUNDARY_DOWNLOADS:
    # A timeout keeps an unresponsive server from hanging the run.
    r = requests.get(url, timeout=60)
    # Stop on an HTTP error rather than saving an error body as if it were data.
    r.raise_for_status()
    # Parse before opening the file so a bad body never truncates an existing
    # good download.
    _boundary_geojson = json.loads(r.content.decode('utf-8'))
    with open(_boundary_path, 'w') as outfile:
        json.dump(_boundary_geojson, outfile)