Example deploying the ClickHouse database operator on a Google Kubernetes Engine (GKE) Autopilot cluster and importing Parquet data from a storage bucket using an HMAC key
#!/usr/bin/env bash
#####################################################################
# REFERENCES
# - https://cloud.google.com/kubernetes-engine/docs/how-to/creating-an-autopilot-cluster
# - https://cloud.google.com/storage/docs/creating-buckets
# - https://cloud.google.com/storage/docs/access-control/using-iam-permissions
# - https://cloud.google.com/storage/docs/authentication/managing-hmackeys#command-line
# - https://github.com/Altinity/clickhouse-operator/blob/master/docs/quick_start.md
# - https://clickhouse.com/docs/en/getting-started/quick-start
# Extras:
# - https://altinity.com/kubernetes-operator/
# - https://github.com/Altinity/clickhouse-operator
# - https://www.propeldata.com/blog/clickhouse-operator
#####################################################################
export PROJECT_ID=$(gcloud config get-value project)
export PROJECT_USER=$(gcloud config get-value core/account) # set current user
export PROJECT_NUMBER=$(gcloud projects describe $PROJECT_ID --format="value(projectNumber)")
export IDNS=${PROJECT_ID}.svc.id.goog # workload identity domain
export GCP_REGION="us-central1" # CHANGEME (OPT)
export GCP_ZONE="us-central1-a" # CHANGEME (OPT)
export NETWORK_NAME="default"
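# (optional) sanity-check the resolved values before continuing
echo "project=$PROJECT_ID number=$PROJECT_NUMBER user=$PROJECT_USER region=$GCP_REGION"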
# enable apis
gcloud services enable compute.googleapis.com \
  storage.googleapis.com \
  container.googleapis.com

# configure gcloud sdk
gcloud config set compute/region $GCP_REGION
gcloud config set compute/zone $GCP_ZONE
#############################################################
# GKE CLUSTER
#############################################################
# create cluster
export CLUSTER_NAME="central"
gcloud container clusters create-auto $CLUSTER_NAME \
  --location=$GCP_REGION \
  --project=$PROJECT_ID

# authenticate kubectl (automatic for newly-created clusters)
gcloud container clusters get-credentials $CLUSTER_NAME --region $GCP_REGION
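# (optional) confirm kubectl is pointed at the new cluster; an Autopilot
# cluster may show few or no nodes until workloads are scheduled
kubectl config current-context
kubectl get nodes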
#############################################################
# IAM
#############################################################
# create service account
export SA_USER="data-sa"
export SA_EMAIL="$SA_USER@$PROJECT_ID.iam.gserviceaccount.com"
gcloud iam service-accounts create $SA_USER --display-name="$SA_USER"
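# (optional) verify the service account was created
gcloud iam service-accounts describe $SA_EMAIL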
#############################################################
# STORAGE
#############################################################
export BUCKET_NAME="data-analytics-ch-test"
export BUCKET_URL="gs://$BUCKET_NAME"
export BUCKET_LOCATION=$GCP_REGION
export DATA_FILE="comments.parquet"
export DATA_FILE_URL="https://storage.googleapis.com/$BUCKET_NAME/$DATA_FILE"

# create bucket
gcloud storage buckets create $BUCKET_URL \
  --location=$BUCKET_LOCATION \
  --uniform-bucket-level-access

# upload test parquet data
gcloud storage cp $(pwd)/$DATA_FILE $BUCKET_URL/$DATA_FILE
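# (optional) verify the object landed in the bucket
gcloud storage ls -l $BUCKET_URL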
# grant service account permissions to bucket
gcloud storage buckets add-iam-policy-binding $BUCKET_URL \
  --member="serviceAccount:$SA_EMAIL" \
  --role="roles/storage.admin"
# create hmac key for service account
gcloud storage hmac create $SA_EMAIL # note the "secret"
export KEY_SECRET="<PASTE-SECRET-FROM-ABOVE-RESULT>" # CHANGEME
export KEY_ACCESS_ID=$(gcloud storage hmac list --filter="serviceAccountEmail:$SA_EMAIL" --format="value(accessId)")
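# (optional, untested sketch) capture both values non-interactively instead of
# pasting; assumes the create response exposes metadata.accessId and secret
# fields -- the secret is only ever returned at creation time
# read -r KEY_ACCESS_ID KEY_SECRET < <(gcloud storage hmac create $SA_EMAIL --format="value(metadata.accessId,secret)")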
#############################################################
# DATABASE (CLICKHOUSE)
#############################################################
# install clickhouse operator (custom namespace)
export OPERATOR_NAMESPACE="clickhouse-operator"
curl -s https://raw.githubusercontent.com/Altinity/clickhouse-operator/master/deploy/operator-web-installer/clickhouse-operator-install.sh | OPERATOR_NAMESPACE=$OPERATOR_NAMESPACE bash

# verify clickhouse operator install
kubectl get pods -n $OPERATOR_NAMESPACE
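# (optional) block until the operator deployment reports Available; the
# deployment name below matches the default install, but verify it in your cluster
kubectl wait --for=condition=Available deployment/clickhouse-operator \
  -n $OPERATOR_NAMESPACE --timeout=300s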
# create simple cluster (tee saves the manifest locally AND pipes it to kubectl;
# `cat > file << EOF | kubectl ...` would send nothing down the pipe)
tee pv-cluster.yaml << EOF | kubectl apply -f -
apiVersion: "clickhouse.altinity.com/v1"
kind: "ClickHouseInstallation"
metadata:
  name: "pv-simple"
  namespace: $OPERATOR_NAMESPACE
spec:
  defaults:
    templates:
      dataVolumeClaimTemplate: data-volume-template
      logVolumeClaimTemplate: log-volume-template
  configuration:
    users:
      # printf 'test_password' | sha256sum
      test_user/password_sha256_hex: 10a6e6cc8311a3e2bcc09bf6c199adecd5dd59408c343e926b129c4914f3cb01
      # allow access from outside the kubernetes cluster
      test_user/networks/ip:
        - 0.0.0.0/0
    clusters:
      - name: "simple"
        layout:
          shardsCount: 1
          replicasCount: 1
  templates:
    volumeClaimTemplates:
      - name: data-volume-template
        spec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 1Gi
      - name: log-volume-template
        spec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 100Mi
EOF
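# (optional) wait for the ClickHouse pod(s) to become Ready; the operator labels
# pods with clickhouse.altinity.com/chi=<installation-name> (verify with
# `kubectl get pods --show-labels` if this selector matches nothing)
kubectl wait --for=condition=Ready pod \
  -l clickhouse.altinity.com/chi=pv-simple \
  -n $OPERATOR_NAMESPACE --timeout=600s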
# verify cluster
kubectl get svc -n $OPERATOR_NAMESPACE # view external LB IP

# connect to the server and open the ClickHouse CLI
kubectl -n $OPERATOR_NAMESPACE exec -it svc/clickhouse-pv-simple -- clickhouse-client
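# (optional) connect from outside the cluster instead, using the external LB IP
# from the service listing above and the test_user credentials from the manifest
# clickhouse-client --host <EXTERNAL-IP> --user test_user --password test_password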
#############################################################
# TEST DATABASE CLI (clickhouse-client)
#############################################################
# create test table (run inside the clickhouse-client session opened above)
CREATE TABLE my_first_table
(
    user_id UInt32,
    message String,
    timestamp DateTime,
    metric Float32
)
ENGINE = MergeTree
PRIMARY KEY (user_id, timestamp)
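# (optional) confirm the table exists and inspect its schema
SHOW TABLES
DESCRIBE TABLE my_first_table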
# insert test data
INSERT INTO my_first_table (user_id, message, timestamp, metric) VALUES
    (101, 'Hello, ClickHouse!', now(), -1.0),
    (102, 'Insert a lot of rows per batch', yesterday(), 1.41421),
    (102, 'Sort your data based on your commonly-used queries', today(), 2.718),
    (101, 'Granules are the smallest chunks of data read', now() + 5, 3.14159)

# query data
SELECT *
FROM my_first_table
ORDER BY timestamp

# exit back to shell
exit
#############################################################
# DATA IMPORT FROM STORAGE BUCKET
#############################################################
# print SQL with values injected from env; paste it into the CLI below
# (the s3() table function needs the HTTPS object URL, not the gs:// URL)
cat << EOF
-- RUN ME IN CLI --
SELECT
    *
FROM s3(
    '$DATA_FILE_URL',
    '$KEY_ACCESS_ID',
    '$KEY_SECRET',
    'Parquet'
)
LIMIT 1000
-- END RUN IN CLI --
EOF
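# (optional) to persist rather than just query the data, print an import
# statement too; the table name "comments" and ORDER BY tuple() are
# illustrative choices, not from the original gist
cat << EOF
-- RUN ME IN CLI --
CREATE TABLE comments ENGINE = MergeTree ORDER BY tuple() AS
SELECT * FROM s3('$DATA_FILE_URL', '$KEY_ACCESS_ID', '$KEY_SECRET', 'Parquet')
-- END RUN IN CLI --
EOF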
# exec into server to run CLI
kubectl -n $OPERATOR_NAMESPACE exec -it svc/clickhouse-pv-simple -- clickhouse-client

# paste and run the query generated above
# exit shell
exit

# cheers!