immanuelpotter · February 11, 2020 21:01
diff --git a/tpc-h-amazonlinux.sh b/tpc-h-amazonlinux.sh
 #!/bin/bash
 #
 # Install the TPC-H benchmarking tools on an rpm-based Linux machine, generate
 # two datasets (scale factor 10 for EMR, scale factor 40 for Redshift), create
 # an S3 bucket and upload the generated data to it.
 #
 # Prerequisite: ensure an S3 full access role is attached.

 # Abort on the first failed command, unset variable or failed pipeline stage,
 # so a broken build or generation step never leads to uploading partial data.
 set -euo pipefail

 readonly BUCKET="mannybigdatalabs"
 readonly REGION="us-east-1"
 # Absolute paths, so the script behaves the same whatever directory it is
 # started from.
 readonly KIT_DIR="$HOME/tpch-kit"
 readonly DBGEN_DIR="$KIT_DIR/dbgen"
 readonly EMR_DIR="$HOME/emrdata"
 readonly REDSHIFT_DIR="$HOME/redshiftdata"

 # Print the number of *.tbl files located directly inside directory $1.
 count_tbl_files() {
   find "$1" -maxdepth 1 -type f -name '*.tbl' | wc -l
 }

 # Install tools
 sudo yum install -y make git gcc
 # Skip the clone when the kit is already present so the script can be rerun.
 if [[ ! -d "$KIT_DIR" ]]; then
   git clone https://github.com/gregrahn/tpch-kit "$KIT_DIR"
 fi
 cd "$DBGEN_DIR"
 make OS=linux

 # Generate the scale-factor-10 dataset for use with EMR.
 # dbgen is run from its own directory; DSS_PATH points it at the output dir.
 mkdir -p "$EMR_DIR"
 export DSS_PATH="$EMR_DIR"
 ./dbgen -v -T o -s 10
 emr_file_count=$(count_tbl_files "$EMR_DIR")
 echo "Data files generated: $emr_file_count"

 # Create bucket to store emr data
 aws s3api create-bucket --bucket "$BUCKET" --region "$REGION"

 # Copy data to S3
 aws s3 cp "$EMR_DIR" "s3://$BUCKET/emrdata" --recursive

 # Generate the scale-factor-40 dataset for use with redshift.
 mkdir -p "$REDSHIFT_DIR"
 export DSS_PATH="$REDSHIFT_DIR"
 cd "$DBGEN_DIR"
 ./dbgen -v -T o -s 40
 rs_file_count=$(count_tbl_files "$REDSHIFT_DIR")
 echo "Data files generated: $rs_file_count"

 cd "$REDSHIFT_DIR"
 # Expected line counts: 60 million (orders) and just over 240 million
 # (lineitem).
 wc -l -- *.tbl
 # Split orders into 4 parts of 15 million rows each (-a 4 is the width of the
 # numeric suffix, not the number of parts).
 split -d -l 15000000 -a 4 orders.tbl orders.tbl.
 # Split lineitem into 60-million-row parts; the rows beyond 240 million end
 # up in an additional, smaller part.
 split -d -l 60000000 -a 4 lineitem.tbl lineitem.tbl.

 # Redshift only needs the parts, so drop the unsplit originals before upload.
 rm -- orders.tbl lineitem.tbl
 aws s3 cp "$REDSHIFT_DIR" "s3://$BUCKET/redshiftdata" --recursive
	#!/bin/bash
	#
	# Install the TPC-H benchmarking tools on an rpm-based Linux machine, generate
	# two datasets (scale factor 10 for EMR, scale factor 40 for Redshift), create
	# an S3 bucket and upload the generated data to it.
	#
	# Prerequisite: ensure an S3 full access role is attached.

	# Abort on the first failed command, unset variable or failed pipeline stage,
	# so a broken build or generation step never leads to uploading partial data.
	set -euo pipefail

	readonly BUCKET="mannybigdatalabs"
	readonly REGION="us-east-1"
	# Absolute paths, so the script behaves the same whatever directory it is
	# started from.
	readonly KIT_DIR="$HOME/tpch-kit"
	readonly DBGEN_DIR="$KIT_DIR/dbgen"
	readonly EMR_DIR="$HOME/emrdata"
	readonly REDSHIFT_DIR="$HOME/redshiftdata"

	# Print the number of *.tbl files located directly inside directory $1.
	# Uses a real pipeline (unescaped |) so the count is actually computed.
	count_tbl_files() {
	  find "$1" -maxdepth 1 -type f -name '*.tbl' | wc -l
	}

	# Install tools
	sudo yum install -y make git gcc
	# Skip the clone when the kit is already present so the script can be rerun.
	if [[ ! -d "$KIT_DIR" ]]; then
	  git clone https://github.com/gregrahn/tpch-kit "$KIT_DIR"
	fi
	cd "$DBGEN_DIR"
	make OS=linux

	# Generate the scale-factor-10 dataset for use with EMR.
	# dbgen is run from its own directory; DSS_PATH points it at the output dir.
	mkdir -p "$EMR_DIR"
	export DSS_PATH="$EMR_DIR"
	./dbgen -v -T o -s 10
	emr_file_count=$(count_tbl_files "$EMR_DIR")
	echo "Data files generated: $emr_file_count"

	# Create bucket to store emr data
	aws s3api create-bucket --bucket "$BUCKET" --region "$REGION"

	# Copy data to S3
	aws s3 cp "$EMR_DIR" "s3://$BUCKET/emrdata" --recursive

	# Generate the scale-factor-40 dataset for use with redshift.
	mkdir -p "$REDSHIFT_DIR"
	export DSS_PATH="$REDSHIFT_DIR"
	cd "$DBGEN_DIR"
	./dbgen -v -T o -s 40
	rs_file_count=$(count_tbl_files "$REDSHIFT_DIR")
	echo "Data files generated: $rs_file_count"

	cd "$REDSHIFT_DIR"
	# Expected line counts: 60 million (orders) and just over 240 million
	# (lineitem).
	wc -l -- *.tbl
	# Split orders into 4 parts of 15 million rows each (-a 4 is the width of the
	# numeric suffix, not the number of parts).
	split -d -l 15000000 -a 4 orders.tbl orders.tbl.
	# Split lineitem into 60-million-row parts; the rows beyond 240 million end
	# up in an additional, smaller part.
	split -d -l 60000000 -a 4 lineitem.tbl lineitem.tbl.

	# Redshift only needs the parts, so drop the unsplit originals before upload.
	rm -- orders.tbl lineitem.tbl
	aws s3 cp "$REDSHIFT_DIR" "s3://$BUCKET/redshiftdata" --recursive