-
-
Save jeroos/c4336e8e177ab634a91968431d8af4cd to your computer and use it in GitHub Desktop.
import json | |
import os | |
import duckdb | |
import boto3 | |
import datetime | |
from typing import Any, Dict | |
def construct_prepared_sql_and_params(sql_template, bind_params): | |
single_value_params = {k: v for k, v in bind_params.items() if not isinstance(v, list)} | |
list_params = {k: v for k, v in bind_params.items() if isinstance(v, list)} | |
for key in single_value_params.keys(): | |
sql_template = sql_template.replace(f"{{{key}}}", "?") | |
for key, values in list_params.items(): | |
placeholders = ', '.join(['?'] * len(values)) | |
sql_template = sql_template.replace(f"{{{key}}}", placeholders) | |
flat_params = [] | |
for key in bind_params: | |
if key in single_value_params: | |
flat_params.append(single_value_params[key]) | |
elif key in list_params: | |
flat_params.extend(list_params[key]) | |
return sql_template, flat_params | |
def serialize_dates(row): | |
for key, value in row.items(): | |
if isinstance(value, datetime.date): | |
row[key] = value.strftime('%d-%m-%Y') | |
return row | |
def lambda_handler(event: Dict[str, Any], context): | |
print("Duckdb Lambda received event: " + json.dumps(event, indent=4)) | |
file_name = event.get('file_name') | |
sql = event.get('sql') | |
bind_params = event.get('bind_params', {}) | |
return_type = event.get('return_type', 'List') | |
s3 = boto3.client('s3') | |
bucket_name = os.getenv('BUCKET_NAME') | |
local_file_name = f"/tmp/{file_name}" | |
os.makedirs(os.path.dirname(local_file_name), exist_ok=True) | |
s3.download_file(bucket_name, file_name, local_file_name) | |
conn = duckdb.connect(database=':memory:', read_only=False) | |
prepared_sql = sql % local_file_name | |
try: | |
prepared_sql, flat_params = construct_prepared_sql_and_params(prepared_sql, bind_params) | |
print(prepared_sql) | |
print(f"flat_params -> {flat_params}") | |
cursor = conn.execute(prepared_sql, parameters=flat_params) | |
columns = [description[0] for description in cursor.description] | |
fetched_results = [dict(zip(columns, row)) for row in cursor.fetchall()] | |
fetched_results = [serialize_dates(row) for row in fetched_results] | |
if return_type == 'Single': | |
results = fetched_results[0] if fetched_results else {} | |
else: | |
results = fetched_results | |
finally: | |
conn.close() | |
print("Duckdb Lambda processing completed") | |
return results |
alexander-grun
commented
Mar 8, 2025
via email
Can I also get the zip? [email protected]
Here is a step-by-step guide on how to create your own Lambda Layer via Docker and upload it to AWS
https://alexander-grun.github.io/ag-blog/duck-db-aws-lambda-layer/
Thank you, I got the zip, I created a Layer with it, and attached to my function (it's strange for me, that I can't choose it from Custom Layers, only I can add with the ARN, but at least that works), and I'm still getting the same error:
Function Logs:
[ERROR] Runtime.ImportModuleError: Unable to import module 'lambda_function': No module named 'duckdb.duckdb'
Traceback (most recent call last):
The code:
import json
import duckdb
def lambda_handler(event, context):
# TODO implement
return {
'statusCode': 200,
'body': json.dumps('Hello from Lambda!')
}
If I repack it, without the "python" folder, so the subfolders right directly in the zip, I'm still getting the error
{
"errorMessage": "Unable to import module 'lambda_function': No module named 'duckdb'",
"errorType": "Runtime.ImportModuleError",
"requestId": "",
"stackTrace": []
}
@bbalazs-cubix in the environment variable of Lambda add Home: /tmp as a variable and try again
Thanks for the article, unfortunately that uses Glue, not Lambda, but it was interesting. Tried creating an environment variable called "Home" with the value "/tmp", but I'm getting the same import error. Trying to look for another solution.
this works for me.
ref: https://alexander-grun.github.io/ag-blog/duck-db-aws-lambda-layer/
this works for me.
ref: https://alexander-grun.github.io/ag-blog/duck-db-aws-lambda-layer/
To make it run on Lambda, I also need to set up home_directory. This is my simple Lambda code.
import json
import duckdb
def lambda_handler(event, context):
print('Hi')
duckdb_conn = duckdb.connect()
print(f"Installing duckdb extensions. Installing httpfs.")
duckdb_conn.sql("SET home_directory='/tmp';")
duckdb_conn.sql("install httpfs")
print(f"Loading httpfs")
duckdb_conn.sql("load httpfs")
print('LOADED!')