Created
July 22, 2020 03:26
-
-
Save vietvudanh/4d2881db833887b68304273453beb8b3 to your computer and use it in GitHub Desktop.
Convert csv to hdf5
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import h5py | |
import sys | |
import pandas as pd | |
import datetime | |
# HDFStore fixes each string column's width from the first appended chunk, so
# a longer value in a later chunk can make append() fail. Pin a minimum width
# per column here when that happens.
item_size = {
    # 'col_0': 50
}
hdf_key = 'hdf_key'
# Number of CSV rows converted per append() call.
CHUNK_ROWS = 500000


def read_header_columns(csv_path):
    """Return the column labels of *csv_path* as pandas names them.

    The labels are read with the same parser that later reads the data, so
    they match the DataFrame columns exactly. Splitting the raw first line on
    "," would leave a trailing newline on the last label and break quoted
    labels that contain commas; labels that do not match a DataFrame column
    are ignored when the table is written and indexed.

    Args:
        csv_path: Path of the CSV file to inspect.

    Returns:
        A list with one label per column, in file order.
    """
    return pd.read_csv(csv_path, nrows=0).columns.tolist()


def convert_csv_to_hdf(csv_path, key=hdf_key, min_itemsize=None,
                       chunksize=CHUNK_ROWS):
    """Convert *csv_path* to an HDF5 table stored at ``<csv_path>.hdf5``.

    The CSV is streamed in chunks and appended to a single table whose columns
    are all declared as data columns. Indexes are built once after the last
    chunk instead of being updated on every append.

    Args:
        csv_path: Path of the CSV file to convert.
        key: Name of the table inside the HDF5 file.
        min_itemsize: Optional mapping of column label to minimum string
            width; defaults to the module-level ``item_size``.
        chunksize: Number of CSV rows read and appended per iteration.
    """
    columns = read_header_columns(csv_path)
    print(f"file name:: {csv_path}")
    print(f"headers:: {columns}")

    widths = item_size if min_itemsize is None else min_itemsize

    # The context manager closes the store even when a chunk fails to append.
    with pd.HDFStore(f"{csv_path}.hdf5") as store:
        for chunk in pd.read_csv(csv_path, chunksize=chunksize):
            # don't index data columns in each iteration - we'll do it later
            store.append(key, chunk, data_columns=columns, index=False,
                         min_itemsize=widths)
            print(f"Done chunk:: {datetime.datetime.now()}")

        # index data columns in HDFStore; a CSV without data rows never
        # creates the table, so there is nothing to index in that case
        if key in store:
            store.create_table_index(key, columns=columns, optlevel=9,
                                     kind='full')


def main(argv=None):
    """Run the conversion for the file named by the first CLI argument.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        Process exit status: 1 when no file was given, otherwise 0.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        print("No provided file")
        return 1
    convert_csv_to_hdf(args[1])
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment