Last active
September 6, 2018 03:35
-
-
Save harmy/7ecd6cfdfae939db7e2781e8aa302559 to your computer and use it in GitHub Desktop.
Bulk load geoname.csv into Elasticsearch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import time | |
import csv | |
import json | |
from collections import namedtuple | |
from elasticsearch import Elasticsearch | |
from elasticsearch.helpers import bulk | |
# Address (host:port) of the Elasticsearch cluster the script connects to.
ES_ENDPOINT = 'vpc-xxxxxxxxxxxx.us-east-1.es.amazonaws.com:80'
# Index and document type that every generated bulk action targets.
ES_INDEX = 'geonames'
ES_TYPE = 'doc'
# Mapping for ES_TYPE: "location" is declared as a geo_point field.
ES_MAPPING = {
    ES_TYPE: {
        "properties": {
            "location": {
                "type": "geo_point"
            }
        }
    }
}

# Column layout of one CSV row, in file order.
_GeonameRecord = namedtuple(
    'GeonameRecord',
    'id name ascii_name alternate_names latitude longitude feature_class '
    'feature_code country_code cc2 admin1_code admin2_code admin3_code '
    'admin4_code population elevation dem timezone modified_at',
)


def _to_int(value):
    """Convert a CSV cell to int, mapping an empty/blank cell to None."""
    value = value.strip()
    # An empty cell means "no value"; int('') would raise and abort the load.
    return int(value) if value else None


def get_geoname_data(filename):
    """Yield one Elasticsearch bulk action per valid row of a geoname CSV.

    Args:
        filename: Path to a UTF-8 encoded CSV file whose rows follow the
            column order of ``_GeonameRecord``.

    Yields:
        dict: An action with ``_index``, ``_type``, ``_id`` and ``_source``
        keys. ``location`` is the string ``"<latitude>,<longitude>"``;
        ``dem``, ``elevation`` and ``population`` are ints, or ``None`` when
        the cell is empty.

    Raises:
        ValueError: If a numeric cell is non-empty but not an integer.
    """
    expected_columns = len(_GeonameRecord._fields)
    # newline='' lets the csv module do its own newline handling, so quoted
    # fields containing line breaks are read back unchanged.
    with open(filename, "r", encoding="utf-8", newline="") as csv_file:
        for row in csv.reader(csv_file):
            # Rows with an unexpected number of columns are skipped.
            if len(row) != expected_columns:
                continue
            geoname = _GeonameRecord(*row)
            yield {
                '_index': ES_INDEX,
                '_type': ES_TYPE,
                '_id': geoname.id,
                '_source': {
                    "admin1_code": geoname.admin1_code,
                    "ascii_name": geoname.ascii_name,
                    "country_code": geoname.country_code,
                    "dem": _to_int(geoname.dem),
                    "elevation": _to_int(geoname.elevation),
                    "feature_class": geoname.feature_class,
                    "feature_code": geoname.feature_code,
                    "id": geoname.id,
                    "location": ','.join([geoname.latitude, geoname.longitude]),
                    "modified_at": geoname.modified_at,
                    "name": geoname.name,
                    "population": _to_int(geoname.population),
                    "timezone": geoname.timezone,
                },
            }
def main():
    """Recreate the geonames index and bulk-load ./geoname.csv into it.

    Drops ES_INDEX if it exists, creates it again, applies ES_MAPPING for
    ES_TYPE, then streams the actions from get_geoname_data() through the
    bulk helper and prints how many items were loaded.
    """
    es = Elasticsearch(hosts=[ES_ENDPOINT], timeout=5000)
    # Only delete when the index is present, so the first run against a fresh
    # cluster does not fail on the missing index.
    if es.indices.exists(index=ES_INDEX):
        es.indices.delete(index=ES_INDEX)
    es.indices.create(index=ES_INDEX)
    es.indices.put_mapping(index=ES_INDEX, doc_type=ES_TYPE, body=ES_MAPPING)
    # bulk() returns (number of successful actions, errors); only the count
    # is reported.
    count, _ = bulk(es, get_geoname_data('./geoname.csv'))
    print('Successfully load {0} items into ES.'.format(count))


if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
3039077 | Sispony | Sispony | Sispony | 42.53368 | 1.51613 | P | PPL | AD | 04 | 0 | 0 | 1315 | Europe/Andorra | 2018-09-04 08:24:47.126072+00 | |||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3039154 | El Tarter | El Tarter | Ehl Tarter,Эл Тартер | 42.57952 | 1.65362 | P | PPL | AD | 02 | 1052 | 0 | 1721 | Europe/Andorra | 2018-09-04 08:24:47.129741+00 | |||||
3039163 | Sant Julià de Lòria | Sant Julia de Loria | San Julia,San Julià,Sant Julia de Loria,Sant Julià de Lòria,Sant-Zhulija-de-Lorija,sheng hu li ya-de luo li ya,Сант-Жулия-де-Лория,サン・ジュリア・デ・ロリア教区,圣胡利娅-德洛里亚,圣胡利娅-德洛里亚 | 42.46372 | 1.49129 | P | PPLA | AD | 06 | 8022 | 0 | 921 | Europe/Andorra | 2018-09-04 08:24:47.131937+00 | |||||
3039181 | Santa Coloma | Santa Coloma | Santa Coloma | 42.49454 | 1.49897 | P | PPL | AD | 07 | 0 | 0 | 978 | Europe/Andorra | 2018-09-04 08:24:47.13408+00 | |||||
3039604 | Pas de la Casa | Pas de la Casa | Pas de la Kasa,Пас де ла Каса | 42.54277 | 1.73361 | P | PPL | AD | 03 | 2363 | 2050 | 2106 | Europe/Andorra | 2018-09-04 08:24:47.136234+00 | |||||
3039678 | Ordino | Ordino | Ordino,ao er di nuo,orudino jiao qu,Ордино,オルディノ教区,奥尔迪诺 | 42.55623 | 1.53319 | P | PPLA | AD | 05 | 3066 | 0 | 1296 | Europe/Andorra | 2018-09-04 08:24:47.138404+00 | |||||
3039862 | Meritxell | Meritxell | Sanctuaire de Meritxeli,Sanctuaire de Meritxell,Santuari de Meritxell | 42.55403 | 1.59087 | P | PPL | AD | AD | 02 | 0 | 0 | 1479 | Europe/Andorra | 2018-09-04 08:24:47.140615+00 | ||||
3040051 | les Escaldes | les Escaldes | Ehskal'des-Ehndzhordani,Escaldes,Escaldes-Engordany,Les Escaldes,esukarudesu=engorudani jiao qu,lai sai si ka er de-en ge er da,Эскальдес-Энджордани,エスカルデス=エンゴルダニ教区,萊塞斯卡爾德-恩戈爾達,萊塞斯卡爾德-恩戈爾達 | 42.50729 | 1.53414 | P | PPLA | AD | 08 | 15853 | 0 | 1033 | Europe/Andorra | 2018-09-04 08:24:47.142784+00 | |||||
3040067 | Les Bons | Les Bons | Els Bons | 42.53873 | 1.58649 | P | PPL | AD | AD | 03 | 0 | 0 | 1299 | Europe/Andorra | 2018-09-04 08:24:47.145036+00 | ||||
3040132 | la Massana | la Massana | La Macana,La Massana,La Maçana,La-Massana,la Massana,ma sa na,Ла-Массана,ラ・マサナ教区,马萨纳 | 42.54499 | 1.51483 | P | PPLA | AD | 04 | 7211 | 0 | 1245 | Europe/Andorra | 2018-09-04 08:24:47.147197+00 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment