dbt Python model cliff notes / examples

dbt Python models cliff notes

Sample dummy models for testing. The golden rule is that Python models must always return a DataFrame.

Snowflake

# sf_table.py
import pandas as pd
def model(dbt, session):
    """Minimal Python model: return a one-row pandas DataFrame.

    Neither ``dbt`` nor ``session`` is used; the frame has a single ``id``
    column holding the value ``1``.
    """
    rows = {"id": [1]}
    return pd.DataFrame(rows)

Importing another Python package for use in the model:

# sf_table.py
import pandas as pd
def model(dbt, session):
    """Show how to request an extra package and import it inside the model.

    ``agate`` is listed in ``dbt.config(packages=...)`` and then imported in
    the function body. The imported module is not used any further; the model
    simply returns a one-row pandas DataFrame.
    """
    required_packages = ["agate"]
    dbt.config(packages=required_packages)

    # Imported only to demonstrate that the configured package is importable.
    import agate

    rows = {"id": [1]}
    return pd.DataFrame(rows)

Returning an empty DataFrame:

# sf_incremental.py
import pandas as pd
def model(dbt, session):
    """Incremental model that yields an empty DataFrame on incremental runs.

    When ``dbt.is_incremental`` is falsy, a single row is returned. Otherwise
    an empty DataFrame is returned that still declares its column names.
    """
    dbt.config(materialized="incremental")

    if not dbt.is_incremental:
        return pd.DataFrame({"id": [1], "name": ["alice"]})

    # If we need to return an empty dataframe on a subsequent (i.e.
    # incremental) run of the model, simply returning something like
    #
    #     df = pd.DataFrame()
    #
    # will not work. The empty dataframe below works without Snowpark errors:
    # basically, it has to be a dataframe where the columns (column names)
    # are defined.
    empty_columns = {"id": [], "name": []}
    return pd.DataFrame(empty_columns)

BigQuery

https://docs.getdbt.com/docs/core/connect-data-platform/bigquery-setup#running-python-models-on-dataproc

# bq_table.py
def model(dbt, session):
    """Minimal model configured with ``submission_method="cluster"``.

    Builds a one-row DataFrame by handing a list of dicts to
    ``session.createDataFrame``.
    """
    dbt.config(submission_method="cluster")
    rows = [{"id": 1}]
    frame = session.createDataFrame(rows)
    return frame

# bq_table.py
def model(dbt, session):
    """Minimal model configured with ``submission_method="serverless"``.

    Builds a one-row DataFrame by handing a list of dicts to
    ``session.createDataFrame``.
    """
    dbt.config(submission_method="serverless")
    rows = [{"id": 1}]
    frame = session.createDataFrame(rows)
    return frame

# bigframes.py
# import bigframes.pandas as bpd << already imported in parent script.
def model(dbt, session):
    """Minimal model configured with ``submission_method="bigframes"``.

    ``bpd`` is not imported in this function; it is expected to be in scope
    already. Returns a one-row ``bpd.DataFrame`` with a single ``id`` column.
    """
    dbt.config(submission_method="bigframes")
    rows = {"id": [1]}
    return bpd.DataFrame(rows)

# bigframes_ml.py
def model(dbt, session):
    """Fit, predict with, and score a KMeans model on the penguins table.

    Adapted from
    https://cloud.google.com/bigquery/docs/samples/bigquery-dataframes-clustering-model

    ``bpd`` is not imported in this function; it is expected to be in scope
    already. The value returned is whatever ``KMeans.score`` produces.
    """
    dbt.config(submission_method="bigframes")

    # Read the source data from BigQuery.
    penguins = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

    # The estimator is imported inside the model body, right before use.
    from bigframes.ml.cluster import KMeans

    kmeans = KMeans(n_clusters=10)
    kmeans.fit(penguins["culmen_length_mm"], penguins["sex"])

    # Predictions are computed but not returned; only the score is returned.
    predictions = kmeans.predict(penguins)
    evaluation = kmeans.score(penguins)

    return evaluation

Databricks

# db_python.py
import pandas as pd
def model(dbt, session):
    """Minimal model configured to run on an all-purpose cluster.

    The cluster id passed to ``dbt.config`` is hard-coded in this function.
    Returns a one-row pandas DataFrame with a single ``id`` column.
    """
    cluster_settings = {
        "submission_method": "all_purpose_cluster",
        "cluster_id": "1121-175813-2agrmn6x",
    }
    dbt.config(**cluster_settings)

    rows = {"id": [1]}
    return pd.DataFrame(rows)

jeremyyeo/README.md

Select an option

No results found

Select an option

No results found

dbt Python models cliff notes

Snowflake

BigQuery

Databricks