Skip to content

Instantly share code, notes, and snippets.

@chrispruitt
Last active July 19, 2024 15:40
Show Gist options
  • Save chrispruitt/c0333f2fed69225398b849c63a580557 to your computer and use it in GitHub Desktop.
Save chrispruitt/c0333f2fed69225398b849c63a580557 to your computer and use it in GitHub Desktop.
Idempotent script for managing an AWS ALB access log Athena table.
#!/bin/bash
#
# Recreates the Athena database and table used to query AWS ALB access logs.
# Safe to re-run: the database is created only if it is missing, and the table
# is dropped and created again on every run.
#
# Dependencies:
# - athena-cli - https://github.com/justmiles/athena-cli
# - aws cli
# Prerequisites:
# - AWS ALB Access log s3 bucket
# - AWS ALB Access logs created
# - Update the variables below to match your environment
# - Ensure the create table query is up to date with the latest spec: https://docs.aws.amazon.com/athena/latest/ug/application-load-balancer-logs.html

# -e: stop on the first failing command; -u: fail on unset variables;
# pipefail: a pipeline fails if any stage fails.
set -euo pipefail

# Update these to match your environment
ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
# Abort early instead of building S3 paths around an empty account id.
: "${ACCOUNT_ID:?could not determine the AWS account id}"
DATABASE="albaccesslogs"
TABLE_NAME="${DATABASE}.alb_access_logs"
# Region segment used in the S3 key layout of the access logs.
ALB_REGION="us-east-1"
S3_ACCESS_LOG_LOCATION="s3://alb-access-logs-${ACCOUNT_ID}/qa/"
# The storage template appends the load balancer name directly to this value,
# so make sure it ends with exactly one trailing slash.
S3_ACCESS_LOG_LOCATION="${S3_ACCESS_LOG_LOCATION%/}/"

#######################################
# Runs a single SQL statement through athena-cli.
# Arguments: $1 - SQL statement to execute
# Outputs:   whatever athena-cli prints (query statistics are requested)
# Returns:   exit status of athena-cli
#######################################
run_query() {
  athena query --statistics --sql "$1"
}

#######################################
# Builds the CREATE EXTERNAL TABLE statement for the ALB access log table.
# Globals:   TABLE_NAME, S3_ACCESS_LOG_LOCATION, ACCOUNT_ID, ALB_REGION (read)
# Outputs:   the DDL statement on stdout
#######################################
build_create_table_sql() {
  # Notes on the here-document below (kept out of the SQL on purpose):
  # - The delimiter is unquoted so shell variables expand; backticks and the
  #   Athena placeholders \${lb_name} / \${day} are escaped to stay literal.
  # - An unquoted here-document collapses "\\" into "\", so the whitespace
  #   class is written as \\\\s to reach Athena as \\s, the form used in the
  #   AWS reference DDL.
  # - Only the declared partition columns (lb_name, day) get projection
  #   properties; lb_name is "injected", so queries must filter on it.
  cat <<EOF
CREATE EXTERNAL TABLE IF NOT EXISTS ${TABLE_NAME} (
\`type\` string,
\`time\` string,
elb string,
client_ip string,
client_port int,
target_ip string,
target_port int,
request_processing_time double,
target_processing_time double,
response_processing_time double,
elb_status_code int,
target_status_code string,
received_bytes bigint,
sent_bytes bigint,
request_verb string,
request_url string,
request_proto string,
user_agent string,
ssl_cipher string,
ssl_protocol string,
target_group_arn string,
trace_id string,
domain_name string,
chosen_cert_arn string,
matched_rule_priority string,
request_creation_time string,
actions_executed string,
redirect_url string,
lambda_error_reason string,
target_port_list string,
target_status_code_list string,
classification string,
classification_reason string,
conn_trace_id string
)
PARTITIONED BY (
\`lb_name\` string COMMENT '',
\`day\` string COMMENT 'yyyy/MM/dd format'
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
'serialization.format' = '1',
'input.regex' = '([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*):([0-9]*) ([^ ]*)[:-]([0-9]*) ([-.0-9]*) ([-.0-9]*) ([-.0-9]*) (|[-0-9]*) (-|[-0-9]*) ([-0-9]*) ([-0-9]*) \"([^ ]*) (.*) (- |[^ ]*)\" \"([^\"]*)\" ([A-Z0-9-_]+) ([A-Za-z0-9.-]*) ([^ ]*) \"([^\"]*)\" \"([^\"]*)\" \"([^\"]*)\" ([-.0-9]*) ([^ ]*) \"([^\"]*)\" \"([^\"]*)\" \"([^ ]*)\" \"([^\\\\s]+?)\" \"([^\\\\s]+)\" \"([^ ]*)\" \"([^ ]*)\" ?([^ ]*)?( .*)?'
)
STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '${S3_ACCESS_LOG_LOCATION}'
TBLPROPERTIES (
'projection.enabled' = 'true',
'projection.day.type' = 'date',
'projection.day.format' = 'yyyy/MM/dd',
'projection.day.interval' = '1',
'projection.day.interval.unit' = 'DAYS',
'projection.day.range' = 'NOW-90DAYS,NOW',
'projection.lb_name.type' = 'injected',
'storage.location.template' = '${S3_ACCESS_LOG_LOCATION}\${lb_name}/AWSLogs/${ACCOUNT_ID}/elasticloadbalancing/${ALB_REGION}/\${day}/'
);
EOF
}

#######################################
# Creates the database if needed, then drops and recreates the table.
# Globals: DATABASE, TABLE_NAME (read)
#######################################
main() {
  local create_table_sql

  printf '%s\n' "Creating database if not exists."
  run_query "CREATE DATABASE IF NOT EXISTS ${DATABASE};"

  # Dropping first guarantees schema/property changes in this script take
  # effect even when the table already exists.
  printf '%s\n' "Dropping table ${TABLE_NAME}"
  run_query "DROP TABLE IF EXISTS ${TABLE_NAME};"

  printf '%s\n' "Creating ${TABLE_NAME}"
  # Assign separately from `local` so a failure while building the SQL is
  # not masked.
  create_table_sql=$(build_create_table_sql)
  run_query "${create_table_sql}"
}

main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment