Generate sample partitioned parquet data using pyarrow
# Generate a sample dataset with two partitioning fields,
# `submission_date_s3` and `sample_id`, and one or more
# actual parquet fields.
#
# Arrow and Parquet reference material at
# https://arrow.apache.org/docs/python/parquet.html
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os
import random

dataset_name = "main_summary"
version = "v4"
compression = "snappy"
rows_per_partition = 10
columns = ["document_id", "a", "b"]
current_id = 1
for d in range(1, 4):
    day = "submission_date_s3=2017010{}".format(d)
    for p in range(1, 4):
        part = "sample_id={}".format(p)
        path = os.path.join(dataset_name, version, day, part)
        try:
            os.makedirs(path)
        except Exception as e:
            print(e)
        output_file = "data{}.{}.parquet".format(current_id, compression)
        output_path = os.path.join(path, output_file)
        # Generate some data for this partition.
        # Each row has the first `d` entries from `columns`, each a
        # random int between 1 and 10, plus an `id` column containing
        # a serial sequence of ints.
        # Each `submission_date_s3` partition has an increasing number
        # of columns to help test schema evolution.
        data = {k: [random.randint(1, 10) for i in range(rows_per_partition)] for k in columns[0:d]}
        data["id"] = list(range(current_id, current_id + rows_per_partition))
        current_id += rows_per_partition
        df = pd.DataFrame(data)
        # The `preserve_index` bit is to avoid creating the
        # `__index_level_0__` fields.
        table = pa.Table.from_pandas(df, preserve_index=False)
        pq.write_table(table, output_path, compression=compression)
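
To sanity-check the output, the dataset can be read back with pyarrow. The snippet below is a minimal sketch, not part of the original gist; it assumes the script above has already run from the current working directory. It reads a single day's partition, since each day intentionally has a different schema and some pyarrow versions will not unify differing schemas when reading the whole `v4` tree at once.

import pyarrow.parquet as pq

# Read one day's partition back. The `sample_id=N` directory names are
# hive-style partitions, so recent pyarrow versions surface `sample_id`
# as a regular column in the resulting table.
table = pq.read_table("main_summary/v4/submission_date_s3=20170103")
print(table.schema)
print(table.to_pandas().head())

Reading all three days as one dataset (e.g. `pq.ParquetDataset("main_summary/v4")`) exercises the schema-evolution case; whether that works out of the box depends on the pyarrow version, and may require supplying a unified schema.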