Last active
July 24, 2018 08:45
-
-
Save aljiwala/a6973b7582f1870927842e565f1f77cb to your computer and use it in GitHub Desktop.
Divide CSV data into chunks
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Built-in imports. | |
from sys import argv | |
from os.path import join as join_path | |
# Third party imports. | |
from pandas import read_csv | |
def get_row_count(src_filepath):
    """Return the number of lines in the file at ``src_filepath``.

    Every line yielded by iterating the file is counted, so a header line,
    if the file has one, is part of the total. An empty file yields 0.
    """
    total = 0
    with open(src_filepath, 'r') as handle:
        # enumerate() leaves ``total`` at the 1-based position of the last line.
        for total, _line in enumerate(handle, start=1):
            pass
    return total
def divide_in_chunks(chunksize, src_filepath, dst_filepath):
    """Split the CSV at ``src_filepath`` into files of ``chunksize`` rows.

    The source is read one chunk at a time with its first column used as the
    index. Each chunk is written into the ``dst_filepath`` directory as
    ``part1.csv``, ``part2.csv``, ... in reading order. The directory is
    created if it does not exist yet.

    Args:
        chunksize: Number of data rows per chunk, passed to ``read_csv``.
        src_filepath: Path of the CSV file to split.
        dst_filepath: Directory that receives the ``partN.csv`` files. An
            empty string writes relative to the current directory.
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import section.
    from os import makedirs

    # Open the source first so a bad source path fails before any directory
    # is created.
    reader = read_csv(
        src_filepath, iterator=True, chunksize=chunksize, index_col=0
    )
    try:
        # makedirs('') raises, while join_path('', name) is valid, so only
        # create the directory when one was actually named.
        if dst_filepath:
            makedirs(dst_filepath, exist_ok=True)

        for part_number, chunk in enumerate(reader, start=1):
            filename = 'part{}.csv'.format(part_number)
            chunk.to_csv(join_path(dst_filepath, filename))
    finally:
        # Release the source file handle even if writing a chunk fails.
        reader.close()
def main(args=None):
    """Parse command-line options and split the source CSV into chunks.

    Recognised options:
        --chunksize     rows per output file (default 100)
        --src_filepath  CSV file to split
        --dst_filepath  directory that receives the output files

    Both ``--name=value`` and ``--name value`` are accepted, and arguments
    that are not recognised are ignored. If the source or destination path is
    missing, a message is printed and the function returns without doing any
    work. A chunksize that is not a positive integer ends the program through
    argparse's usual error exit.

    Args:
        args: Argument strings to parse. Defaults to ``argv[1:]``.
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import section.
    from argparse import ArgumentParser

    chunksize_arg, src_filepath_arg, dst_filepath_arg =\
        '--chunksize', '--src_filepath', '--dst_filepath'

    parser = ArgumentParser(description='Divide CSV data into chunks.')
    parser.add_argument(
        chunksize_arg, type=int, default=100,
        help='number of rows per output file (default: 100)',
    )
    parser.add_argument(
        src_filepath_arg, default='', help='path of the CSV file to split',
    )
    parser.add_argument(
        dst_filepath_arg, default='', help='directory for the output files',
    )

    # parse_known_args keeps the lenient behaviour of silently skipping
    # arguments this script does not know about.
    options, _ignored = parser.parse_known_args(
        argv[1:] if args is None else args
    )

    if options.chunksize < 1:
        parser.error(chunksize_arg + ' must be a positive integer.')

    if options.src_filepath == '':
        print(src_filepath_arg + ' isn\'t provided.')
        return

    if options.dst_filepath == '':
        print(dst_filepath_arg + ' isn\'t provided.')
        return

    divide_in_chunks(
        options.chunksize, options.src_filepath, options.dst_filepath
    )
# Run the command-line entry point only when executed as a script, so that
# importing this module has no side effects.
if "__main__" == __name__:
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment