Skip to content

Instantly share code, notes, and snippets.

@chip
Forked from tanema/multiprocess_migration.sh
Created April 4, 2018 18:46
Show Gist options
  • Save chip/fd5dac6632affb24cd2fe7b184340285 to your computer and use it in GitHub Desktop.
Save chip/fd5dac6632affb24cd2fe7b184340285 to your computer and use it in GitHub Desktop.
migrate files from gridfs to aws s3
#! /bin/bash
###################### USAGE ######################################
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the help text on stdout, framed by one blank line above and below
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'EOF'

Usage: mongotos3 [-t n] mongo_host mongo_collection s3_bucket
-t : number of parallel processes to use
mongo_host : the host of the mongodb server
mongo_collection : the collection to collect the gridfs data from
s3_bucket : the name of the bucket you want to cp the files to

EOF
}
###################### END USAGE ##################################
# Number of worker processes the file list is split across (override with -t).
thread_count=8
# PIDs of the parallel worker processes.
_worker_pids=()
# Incremented per handled file so progress can be shown.
_current_file=1

# Option parsing: only -t (worker count) is supported. Any other option prints
# the usage text and exits with a failure status.
while getopts 't:' opt; do
  case "$opt" in
    t) thread_count=$OPTARG ;;
    *)
      usage
      exit 1
      ;;
  esac
done
shift $((OPTIND - 1))

# The worker count is used as a divisor below, so reject 0, negative and
# non-numeric values (a leading zero would also be read as octal).
if ! [[ $thread_count =~ ^[1-9][0-9]*$ ]]; then
  echo "-t expects a positive integer, got '$thread_count'" >&2
  usage
  exit 1
fi

# Exactly three positional parameters are required.
if [ "$#" -ne 3 ]; then
  usage
  exit 1
fi

# mongo host
_host="${1:?Mongo Host Required}"
# mongo collection to pull grid_fs data from
_db="${2:?Mongo Collection required}"
# s3 bucket for everything to be synced to
_bucket="${3:?AWS Bucket Required}"

# Listing of all the files; without it there is nothing to split up, so stop
# here if mongofiles reports a failure.
if ! _files_list=$(mongofiles -h "$_host" -db "$_db" list); then
  echo "could not list files from $_host ($_db)" >&2
  exit 1
fi

# Number of lines in the listing (awk strips the padding some wc builds add).
_total_files=$(printf '%s\n' "$_files_list" | wc -l | awk '{ print $1 }')
# Lines handed to each worker: total divided by worker count, rounded up.
lines_per_file=$(((_total_files + thread_count - 1) / thread_count))
###################### LOGGING ####################################
# Terminal colour sequences used by the log helpers. tput fails when TERM is
# unset or unknown; its error output is deliberately discarded and the
# variable falls back to an empty string, so messages are printed uncoloured.
RED=$(tput setaf 1 2> /dev/null) || RED=''
GREEN=$(tput setaf 2 2> /dev/null) || GREEN=''
NORMAL=$(tput sgr0 2> /dev/null) || NORMAL=''
#######################################
# Print a status message followed by a right-aligned green "[OK]" tag.
# Globals:   GREEN, NORMAL (read); COL (written)
# Arguments: $1 - the message to print
# Outputs:   message and tag on stdout, with no trailing newline
#######################################
log_ok() {
  local tag="${GREEN}[OK]${NORMAL}"
  # Field width for the tag: terminal width minus the message length, plus
  # the lengths of the two colour sequences wrapped around "[OK]".
  (( COL = $(tput cols) - ${#1} + ${#GREEN} + ${#NORMAL} ))
  printf '%s%*s' "$1" "$COL" "$tag"
}
#######################################
# Print a status message followed by a right-aligned red "[FAIL]" tag.
# Globals:   RED, NORMAL (read); COL (written)
# Arguments: $1 - the message to print
# Outputs:   message and tag on stdout, with no trailing newline
#######################################
log_fail() {
  local tag="${RED}[FAIL]${NORMAL}"
  # Field width for the tag: terminal width minus the message length, plus
  # the lengths of the two colour sequences wrapped around "[FAIL]".
  (( COL = $(tput cols) - ${#1} + ${#RED} + ${#NORMAL} ))
  printf '%s%*s' "$1" "$COL" "$tag"
}
###################### END LOGGING ################################
###################### METHOD DEFINITIONS #########################
#######################################
# Copy one GridFS file to the S3 bucket unless it is already listed there.
# Globals:   _host, _db, _bucket, lines_per_file (read)
#            _current_file (read and incremented)
# Arguments: $1 - filepath from mongo
#            $2 - worker identity number (only used in the status text)
# Outputs:   one status entry via log_ok / log_fail
#######################################
syncfile() {
  local key=$1
  local worker_id=$2
  local status="(worker $worker_id) $_current_file/$lines_per_file $_bucket/$key"
  local file_count filename
  ((_current_file++))

  # Check if the file is already on the server: any listing output counts.
  file_count=$(aws s3 ls "$_bucket/$key" | wc -l)
  if ((file_count > 0)); then
    log_ok "$status Already on server"
    return
  fi

  # Temp file in the current directory. The "_migration-" prefix is what
  # kill_all_workers globs for when it cleans up after an abort.
  filename="_migration-$_current_file-$(uuidgen)"

  # Get the file from gridfs into the temp file.
  if ! mongofiles -h "$_host" -db "$_db" get --local "$filename" "$key" \
    > /dev/null 2>&1; then
    log_fail "$status Get from db failed"
    # A failed get may still have left a partial file behind.
    rm -f -- "$filename"
    return
  fi

  # Send it to s3 and report whether this file's migration succeeded.
  # NOTE(review): --dryrun is kept from the original; with it the CLI only
  # reports what it would do and uploads nothing. Remove it for a real run.
  if aws s3 cp "$filename" "s3://$_bucket/$key" --dryrun --quiet; then
    log_ok "$status"
  else
    log_fail "$status"
  fi

  # Remove the temp file gotten from gridfs.
  rm -f -- "$filename"
}
#######################################
# Run syncfile for one worker's chunk of the file listing.
# Globals:   _files_list, lines_per_file (read)
# Arguments: $1 - worker identity number
#            $2 - number of listing lines before this worker's chunk
#######################################
process_lines() {
  local worker_id=$1
  local skip=$2
  local line file
  while IFS= read -r line; do
    # The filename is the first tab-separated field of the line.
    file=${line%%$'\t'*}
    # Nothing to sync for an empty line.
    [[ -z $file ]] && continue
    # if connected message then continue
    [[ $file == 'connected to'* ]] && continue
    # Sync the file with the server. stdin is detached so nothing run by
    # syncfile can swallow the remaining lines of this loop's input.
    syncfile "$file" "$worker_id" < /dev/null
  # Drop the lines before the chunk first, then cap the chunk length: a short
  # or empty final chunk then never reaches back into an earlier worker's lines.
  done < <(printf '%s\n' "$_files_list" \
    | tail -n "+$((skip + 1))" \
    | head -n "$lines_per_file")
}
#######################################
# Used for kill signals: stop every recorded worker and remove leftover
# temp files.
# Globals:   _worker_pids (read)
# Outputs:   two progress messages on stdout
#######################################
kill_all_workers() {
  local pid
  echo 'killing all workers'
  # Walk the array elements directly, so kill is only ever called with a
  # recorded pid (an index loop bounded by <= length runs one slot too far).
  for pid in "${_worker_pids[@]}"; do
    # SIGTERM is the conventional stop request; SIGABRT (-6) can leave core
    # dumps behind. Errors are ignored: the worker may already have exited.
    kill -TERM "$pid" > /dev/null 2>&1
  done
  echo 'migration aborted'
  # Clean up any temp files of transfers that were interrupted.
  rm -f -- _migration-* > /dev/null 2>&1
}
###################### END METHOD DEFINITIONS #####################
# Allows ctrl-c (and HUP/TERM) to stop the run: the workers are killed and the
# script then exits, so an aborted run never carries on to the DONE report.
trap 'kill_all_workers; exit 1' SIGINT SIGHUP SIGTERM

for ((i = 0; i < thread_count; ++i)); do
  echo "starting worker $i"
  # Call process_lines on this worker's chunk of the listing, in the background.
  process_lines "$i" "$((lines_per_file * i))" &
  # Record the pid for cleanup and waiting.
  _worker_pids+=("$!")
done

# Wait for each worker by pid so that every exit status is looked at; an
# argument-less wait would report success whatever the workers returned.
_failed_workers=0
for _pid in "${_worker_pids[@]}"; do
  if ! wait "$_pid" > /dev/null 2>&1; then
    _failed_workers=$((_failed_workers + 1))
  fi
done

# If no worker reported an error, say we are complete.
if ((_failed_workers == 0)); then
  echo DONE
else
  echo "$_failed_workers worker(s) exited with an error" >&2
  exit 1
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment