clemsos · February 27, 2024 10:15 · cheriemilk · Sep 2, 2019 · rgdragomir · Mar 10, 2020
diff --git a/csv_to_elastic_search_bulk_insert.py b/csv_to_elastic_search_bulk_insert.py
 from pyelasticsearch import ElasticSearch
 import pandas as pd
 from time import time

 # Bulk-load a CSV file into a freshly (re)created Elasticsearch index.
 #
 # The CSV is read in chunks of `chunksize` rows; every chunk is sent to the
 # "weiboscope" index as documents of type "tweet" in one bulk request.
 # print() is always called with a single parenthesised argument, so the
 # script behaves the same under Python 2 and also parses under Python 3.
 root_path="/home/clemsos/Dev/mitras/"
 raw_data_path=root_path+"data/"
 csv_filename="week10.csv"

 t0=time()

 # number of CSV rows sent per bulk request
 chunksize=5000

 # open csv file; the context manager closes it even if indexing aborts midway
 with open(raw_data_path+csv_filename) as f:

     # parse csv with pandas; iterating over csvfile yields one chunk at a time
     csvfile=pd.read_csv(f, iterator=True, chunksize=chunksize)

     # init ElasticSearch
     es = ElasticSearch('http://localhost:9200/')

     # init index: drop any previous one, then recreate it empty
     try:
         es.delete_index("weiboscope")
     except Exception as err:
         # Best effort: presumably this fails when the index does not exist
         # yet (first run). Unlike a bare `except:`, this no longer swallows
         # KeyboardInterrupt / SystemExit.
         print("could not delete index weiboscope (%s), continuing" % err)

     es.create_index("weiboscope")

     # start bulk indexing
     print("now indexing %s..." % csv_filename)

     failed_chunks=0
     for i,df in enumerate(csvfile):
         print(i)
         # replace missing cells with None, then build one dict per row
         records=df.where(pd.notnull(df), None).T.to_dict()
         list_records=list(records.values())
         try:
             es.bulk_index("weiboscope","tweet",list_records)
         except Exception as err:
             # Keep going so one bad chunk does not abort the whole import,
             # but say which chunk was lost and why.
             failed_chunks+=1
             print("error!, skiping some tweets sorry")
             print("chunk %d failed: %s" % (i, err))

 if failed_chunks:
     print("%d chunk(s) could not be indexed" % failed_chunks)

 print("done in %.3fs"%(time()-t0))
	from pyelasticsearch import ElasticSearch
	import pandas as pd
	from time import time

	# Bulk-load a CSV file into a freshly (re)created Elasticsearch index.
	#
	# The CSV is read in chunks of `chunksize` rows; every chunk is sent to the
	# "weiboscope" index as documents of type "tweet" in one bulk request.
	# The nesting of the try/except and for bodies, which had been flattened in
	# this copy, is restored here. print() is always called with a single
	# parenthesised argument, so the script behaves the same under Python 2 and
	# also parses under Python 3.
	root_path="/home/clemsos/Dev/mitras/"
	raw_data_path=root_path+"data/"
	csv_filename="week10.csv"

	t0=time()

	# number of CSV rows sent per bulk request
	chunksize=5000

	# open csv file; the context manager closes it even if indexing aborts midway
	with open(raw_data_path+csv_filename) as f:

	    # parse csv with pandas; iterating over csvfile yields one chunk at a time
	    csvfile=pd.read_csv(f, iterator=True, chunksize=chunksize)

	    # init ElasticSearch
	    es = ElasticSearch('http://localhost:9200/')

	    # init index: drop any previous one, then recreate it empty
	    try:
	        es.delete_index("weiboscope")
	    except Exception as err:
	        # Best effort: presumably this fails when the index does not exist
	        # yet (first run). Unlike a bare `except:`, this no longer swallows
	        # KeyboardInterrupt / SystemExit.
	        print("could not delete index weiboscope (%s), continuing" % err)

	    es.create_index("weiboscope")

	    # start bulk indexing
	    print("now indexing %s..." % csv_filename)

	    failed_chunks=0
	    for i,df in enumerate(csvfile):
	        print(i)
	        # replace missing cells with None, then build one dict per row
	        records=df.where(pd.notnull(df), None).T.to_dict()
	        list_records=list(records.values())
	        try:
	            es.bulk_index("weiboscope","tweet",list_records)
	        except Exception as err:
	            # Keep going so one bad chunk does not abort the whole import,
	            # but say which chunk was lost and why.
	            failed_chunks+=1
	            print("error!, skiping some tweets sorry")
	            print("chunk %d failed: %s" % (i, err))

	if failed_chunks:
	    print("%d chunk(s) could not be indexed" % failed_chunks)

	print("done in %.3fs"%(time()-t0))