Last active
February 11, 2020 21:01
-
-
Save immanuelpotter/d6604d184ad334c939993ce9b52d2402 to your computer and use it in GitHub Desktop.
Scripting dataset generation with the TPC-H toolset on Amazon Linux, kept as a record of how it was done.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Generate TPC-H benchmark data and upload it to S3.
#
# Prerequisites:
#   * An rpm-based Linux machine (uses yum).
#   * An IAM role with S3 full access attached to the instance.
#
# What it does:
#   1. Installs build tools and builds dbgen from the tpch-kit repository.
#   2. Generates a scale-factor-10 orders/lineitem dataset for EMR.
#   3. Creates the S3 bucket and uploads the EMR dataset.
#   4. Generates a scale-factor-40 orders/lineitem dataset for Redshift,
#      splits each table into fixed-size parts and uploads the parts.
#
# Environment:
#   TPCH_BUCKET  Name of the S3 bucket to create and upload to
#                (default: mannybigdatalabs).

# Abort on the first failing command, unset variable, or failing pipeline
# stage, so a later step never runs against a half-finished earlier one
# (in particular, the originals are only removed after a successful split).
set -euo pipefail

readonly BUCKET="${TPCH_BUCKET:-mannybigdatalabs}"
readonly REGION="us-east-1"
# Absolute paths so the script works regardless of the directory it is
# launched from.
readonly KIT_DIR="$HOME/tpch-kit"
readonly DBGEN_DIR="$KIT_DIR/dbgen"
readonly EMR_DIR="$HOME/emrdata"
readonly REDSHIFT_DIR="$HOME/redshiftdata"

#######################################
# Print the number of *.tbl files directly inside a directory.
# Arguments: $1 - directory to inspect
# Outputs:   the file count on stdout
#######################################
count_tbl_files() {
  find "$1" -maxdepth 1 -type f -name '*.tbl' | wc -l
}

# Install tools
sudo yum install -y make git gcc
# Skip the clone when the repository is already present so a re-run works.
if [[ ! -d "$KIT_DIR" ]]; then
  git clone https://github.com/gregrahn/tpch-kit "$KIT_DIR"
fi
cd "$DBGEN_DIR"
make OS=linux

# Generate the scale-factor-10 dataset for use with EMR.
# dbgen writes its output files into $DSS_PATH.
# NOTE(review): dbgen may prompt before overwriting existing output files
# on a re-run -- confirm, and clear the data directory first if needed.
mkdir -p "$EMR_DIR"
export DSS_PATH="$EMR_DIR"
cd "$DBGEN_DIR"
./dbgen -v -T o -s 10
emr_file_count=$(count_tbl_files "$EMR_DIR")
echo "Data files generated: $emr_file_count"

# Create bucket to store emr data
aws s3api create-bucket --bucket "$BUCKET" --region "$REGION"

# Copy data to S3
aws s3 cp "$EMR_DIR" "s3://$BUCKET/emrdata" --recursive

# Generate the scale-factor-40 dataset for use with Redshift.
mkdir -p "$REDSHIFT_DIR"
export DSS_PATH="$REDSHIFT_DIR"
cd "$DBGEN_DIR"
./dbgen -v -T o -s 40
rs_file_count=$(count_tbl_files "$REDSHIFT_DIR")
echo "Data files generated: $rs_file_count"

cd "$REDSHIFT_DIR"
# Row counts: expected to be 60 million (orders) and just over
# 240 million (lineitem) respectively.
wc -l -- *.tbl
# Split orders into 4 parts of 15 million rows each.
split -d -l 15000000 -a 4 orders.tbl orders.tbl.
# Split lineitem into 60-million-row parts; the rows beyond 240 million
# end up in an additional, smaller part.
split -d -l 60000000 -a 4 lineitem.tbl lineitem.tbl.
# Redshift only needs the parts, not the original files.
rm -- orders.tbl lineitem.tbl
aws s3 cp "$REDSHIFT_DIR" "s3://$BUCKET/redshiftdata" --recursive
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment