Skip to content

Instantly share code, notes, and snippets.

@amotl
Created March 31, 2025 18:10
Show Gist options
  • Save amotl/2196433103659335a7c341d7c39485c6 to your computer and use it in GitHub Desktop.
Save amotl/2196433103659335a7c341d7c39485c6 to your computer and use it in GitHub Desktop.
Investigate anomaly with pandas and CrateDB, re. storing `DATE` types that naturally do not use time zones.
"""
Investigate anomaly with pandas and CrateDB, re. storing `DATE` types that naturally do not use time zones.
https://github.com/crate/sqlalchemy-cratedb/issues/216
Setup: Please install dependency packages enumerated below.
Usage: Please toggle database connection URI per `dburi = this or that`.
Invoke: Just type `uv run pandas_cratedb_date_type.py`.
docker run --rm -it --name=cratedb --publish=4200:4200 --publish=5432:5432 --env=CRATE_HEAP_SIZE=2g crate/crate:nightly -Cdiscovery.type=single-node
docker run --rm -it --name=postgresql --publish=5433:5432 --env "POSTGRES_HOST_AUTH_METHOD=trust" postgres:17 postgres -c log_statement=all
"""
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "psycopg2-binary",
# "sqlalchemy-cratedb",
# ]
# ///
import sqlalchemy as sa
import pandas as pd
data = {
"date_1": ["2020-01-01", "2021-01-01", "2022-01-01", "2023-01-01", "2027-12-30"],
"date_2": ["2020-09-24", "2020-10-24", "2020-11-24", "2020-12-24", "2027-09-24"],
}
def main():
print("\ndataframe before")
df_data = pd.DataFrame.from_dict(data, dtype="datetime64[ns]")
print(df_data.dtypes)
print(df_data.sort_values(by="date_1").reset_index(drop=True))
dburi = "crate://"
#dburi = "postgresql://postgres@localhost:5433/"
engine = sa.create_engine(dburi, echo=False)
conn = engine.connect()
df_data.to_sql(
"test_date",
conn,
if_exists="replace",
index=False,
)
if dburi.startswith("crate"):
conn.exec_driver_sql("REFRESH TABLE test_date;")
#df_load = pd.read_sql_table("test_date", conn, parse_dates={"date_1": "date", "date_2": "date"})
df_load = pd.read_sql_table("test_date", conn)
print("\ndataframe after")
#df_load = df_load.sort_values(by="date_1").reset_index(drop=True)
print(df_load.dtypes)
print(df_load)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment