@mikesparr
Created October 21, 2024 23:59
Example deploying the ClickHouse database operator on a Google Kubernetes Engine (GKE) Autopilot cluster and importing Parquet data from a storage bucket using an HMAC key
#!/usr/bin/env bash
#####################################################################
# REFERENCES
# - https://cloud.google.com/kubernetes-engine/docs/how-to/creating-an-autopilot-cluster
# - https://cloud.google.com/storage/docs/creating-buckets
# - https://cloud.google.com/storage/docs/access-control/using-iam-permissions
# - https://cloud.google.com/storage/docs/authentication/managing-hmackeys#command-line
# - https://github.com/Altinity/clickhouse-operator/blob/master/docs/quick_start.md
# - https://clickhouse.com/docs/en/getting-started/quick-start
# Extras:
# - https://altinity.com/kubernetes-operator/
# - https://github.com/Altinity/clickhouse-operator
# - https://www.propeldata.com/blog/clickhouse-operator
#####################################################################
export PROJECT_ID=$(gcloud config get-value project)
export PROJECT_USER=$(gcloud config get-value core/account) # set current user
export PROJECT_NUMBER=$(gcloud projects describe $PROJECT_ID --format="value(projectNumber)")
export IDNS=${PROJECT_ID}.svc.id.goog # workload identity pool
export GCP_REGION="us-central1" # CHANGEME (OPT)
export GCP_ZONE="us-central1-a" # CHANGEME (OPT)
export NETWORK_NAME="default"
# enable apis
gcloud services enable compute.googleapis.com \
    storage.googleapis.com \
    container.googleapis.com
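# (optional) sanity check, not in the original gist: confirm the APIs are enabled
gcloud services list --enabled | grep -E "compute|storage|container"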
# configure gcloud sdk
gcloud config set compute/region $GCP_REGION
gcloud config set compute/zone $GCP_ZONE
#############################################################
# GKE CLUSTER
#############################################################
# create cluster
export CLUSTER_NAME="central"
gcloud container clusters create-auto $CLUSTER_NAME \
    --location=$GCP_REGION \
    --project=$PROJECT_ID
# authenticate kubectl (automatic for newly-created clusters)
gcloud container clusters get-credentials $CLUSTER_NAME --region $GCP_REGION
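# (optional) sanity check, not in the original gist: confirm kubectl can reach the cluster
# (an Autopilot cluster may show few or no nodes until workloads are scheduled)
kubectl cluster-info
kubectl get nodes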
#############################################################
# IAM
#############################################################
# create service account
export SA_USER="data-sa"
export SA_EMAIL="$SA_USER@$PROJECT_ID.iam.gserviceaccount.com"
gcloud iam service-accounts create $SA_USER --display-name="$SA_USER"
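# (optional) sanity check, not in the original gist: confirm the service account exists
gcloud iam service-accounts describe $SA_EMAIL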
#############################################################
# STORAGE
#############################################################
export BUCKET_NAME="data-analytics-ch-test"
export BUCKET_URL="gs://$BUCKET_NAME"
export BUCKET_LOCATION=$GCP_REGION
export DATA_FILE="comments.parquet"
export DATA_FILE_URL="https://storage.googleapis.com/$BUCKET_NAME/$DATA_FILE"
# create bucket
gcloud storage buckets create $BUCKET_URL \
    --location=$BUCKET_LOCATION \
    --uniform-bucket-level-access
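# (optional) if you don't have a parquet file handy, a small one can be generated locally
# (a sketch assuming clickhouse-local is installed; the two-column schema is illustrative only)
clickhouse-local --query "SELECT number AS id, concat('comment ', toString(number)) AS body FROM numbers(1000) FORMAT Parquet" > comments.parquet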
# upload test parquet data (expects ./comments.parquet to exist; see sketch above)
gcloud storage cp $(pwd)/$DATA_FILE $BUCKET_URL/$DATA_FILE
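# (optional) sanity check, not in the original gist: confirm the object landed in the bucket
gcloud storage ls $BUCKET_URL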
# grant service account permissions to bucket
gcloud storage buckets add-iam-policy-binding $BUCKET_URL \
    --member="serviceAccount:$SA_EMAIL" \
    --role="roles/storage.admin"
# create hmac key for service account (note the "secret" in the output; it is shown only once)
gcloud storage hmac create $SA_EMAIL
export KEY_SECRET="<PASTE-SECRET-FROM-ABOVE-RESULT>" # CHANGEME
export KEY_ACCESS_ID=$(gcloud storage hmac list --filter="serviceAccountEmail:$SA_EMAIL" --format="value(accessId)")
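# (optional) sanity check, not in the original gist: confirm the key is in the ACTIVE state
gcloud storage hmac describe $KEY_ACCESS_ID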
#############################################################
# DATABASE (CLICKHOUSE)
#############################################################
# install clickhouse operator (custom namespace)
export OPERATOR_NAMESPACE="clickhouse-operator"
curl -s https://raw.githubusercontent.com/Altinity/clickhouse-operator/master/deploy/operator-web-installer/clickhouse-operator-install.sh | OPERATOR_NAMESPACE=$OPERATOR_NAMESPACE bash
# verify clickhouse operator install
kubectl get pods -n $OPERATOR_NAMESPACE
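# (optional) sanity check, not in the original gist: confirm the operator CRDs were registered
kubectl get crds | grep clickhouse.altinity.com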
# create simple cluster
cat > pv-cluster.yaml << EOF
apiVersion: "clickhouse.altinity.com/v1"
kind: "ClickHouseInstallation"
metadata:
  name: "pv-simple"
  namespace: $OPERATOR_NAMESPACE
spec:
  defaults:
    templates:
      dataVolumeClaimTemplate: data-volume-template
      logVolumeClaimTemplate: log-volume-template
  configuration:
    users:
      # printf 'test_password' | sha256sum
      test_user/password_sha256_hex: 10a6e6cc8311a3e2bcc09bf6c199adecd5dd59408c343e926b129c4914f3cb01
      # allow access from outside the Kubernetes cluster
      test_user/networks/ip:
        - 0.0.0.0/0
    clusters:
      - name: "simple"
        layout:
          shardsCount: 1
          replicasCount: 1
  templates:
    volumeClaimTemplates:
      - name: data-volume-template
        spec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 1Gi
      - name: log-volume-template
        spec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 100Mi
EOF
kubectl apply -f pv-cluster.yaml
# verify cluster
kubectl get svc -n $OPERATOR_NAMESPACE # view external LB IP
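# (optional) connect from outside the cluster via the external LB IP -- a sketch, not in the original gist
# (assumes a local clickhouse-client install; test_password matches the sha256 hash in the manifest above)
export LB_IP=$(kubectl get svc clickhouse-pv-simple -n $OPERATOR_NAMESPACE -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
clickhouse-client --host=$LB_IP --user=test_user --password=test_password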
# connect to cluster on server
kubectl -n $OPERATOR_NAMESPACE exec -it svc/clickhouse-pv-simple -- clickhouse-client
#############################################################
# TEST DATABASE CLI (clickhouse-client)
#############################################################
# create test table (run inside the clickhouse-client session opened above)
CREATE TABLE my_first_table
(
    user_id UInt32,
    message String,
    timestamp DateTime,
    metric Float32
)
ENGINE = MergeTree
PRIMARY KEY (user_id, timestamp);
# insert test data
INSERT INTO my_first_table (user_id, message, timestamp, metric) VALUES
    (101, 'Hello, ClickHouse!', now(), -1.0),
    (102, 'Insert a lot of rows per batch', yesterday(), 1.41421),
    (102, 'Sort your data based on your commonly-used queries', today(), 2.718),
    (101, 'Granules are the smallest chunks of data read', now() + 5, 3.14159);
# query data
SELECT *
FROM my_first_table
ORDER BY timestamp;
# exit back to shell
exit
#############################################################
# DATA IMPORT FROM STORAGE BUCKET
#############################################################
# print the import SQL with values injected from the env (paste it into the CLI in the next step)
# note: ClickHouse's s3() table function expects an HTTPS object URL, hence DATA_FILE_URL rather than the gs:// bucket URL
cat << EOF
-- RUN ME IN CLI --
SELECT *
FROM s3(
    '$DATA_FILE_URL',
    '$KEY_ACCESS_ID',
    '$KEY_SECRET',
    'Parquet'
)
LIMIT 1000;
-- END RUN IN CLI --
EOF
# exec into server to run CLI
kubectl -n $OPERATOR_NAMESPACE exec -it svc/clickhouse-pv-simple -- clickhouse-client
# run query generated above (paste)
# exit shell
exit
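# (optional) persist the import non-interactively -- a sketch, not in the original gist
# (the table name "comments" and schema inference from Parquet are assumptions)
kubectl -n $OPERATOR_NAMESPACE exec -i svc/clickhouse-pv-simple -- clickhouse-client --query "
CREATE TABLE comments
ENGINE = MergeTree
ORDER BY tuple()
AS SELECT * FROM s3('$DATA_FILE_URL', '$KEY_ACCESS_ID', '$KEY_SECRET', 'Parquet');"
kubectl -n $OPERATOR_NAMESPACE exec -i svc/clickhouse-pv-simple -- clickhouse-client --query "SELECT count() FROM comments;"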
# cheer!