Created
March 31, 2025 18:10
-
-
Save amotl/2196433103659335a7c341d7c39485c6 to your computer and use it in GitHub Desktop.
Investigate anomaly with pandas and CrateDB, re. storing `DATE` types that naturally do not use time zones.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Investigate anomaly with pandas and CrateDB, re. storing `DATE` types that naturally do not use time zones. | |
https://github.com/crate/sqlalchemy-cratedb/issues/216 | |
Setup: Please install dependency packages enumerated below. | |
Usage: Please toggle database connection URI per `dburi = this or that`. | |
Invoke: Just type `uv run pandas_cratedb_date_type.py`. | |
docker run --rm -it --name=cratedb --publish=4200:4200 --publish=5432:5432 --env=CRATE_HEAP_SIZE=2g crate/crate:nightly -Cdiscovery.type=single-node | |
docker run --rm -it --name=postgresql --publish=5433:5432 --env "POSTGRES_HOST_AUTH_METHOD=trust" postgres:17 postgres -c log_statement=all | |
""" | |
# /// script | |
# requires-python = ">=3.12" | |
# dependencies = [ | |
# "psycopg2-binary", | |
# "sqlalchemy-cratedb", | |
# ] | |
# /// | |
import sqlalchemy as sa | |
import pandas as pd | |
data = { | |
"date_1": ["2020-01-01", "2021-01-01", "2022-01-01", "2023-01-01", "2027-12-30"], | |
"date_2": ["2020-09-24", "2020-10-24", "2020-11-24", "2020-12-24", "2027-09-24"], | |
} | |
def main(): | |
print("\ndataframe before") | |
df_data = pd.DataFrame.from_dict(data, dtype="datetime64[ns]") | |
print(df_data.dtypes) | |
print(df_data.sort_values(by="date_1").reset_index(drop=True)) | |
dburi = "crate://" | |
#dburi = "postgresql://postgres@localhost:5433/" | |
engine = sa.create_engine(dburi, echo=False) | |
conn = engine.connect() | |
df_data.to_sql( | |
"test_date", | |
conn, | |
if_exists="replace", | |
index=False, | |
) | |
if dburi.startswith("crate"): | |
conn.exec_driver_sql("REFRESH TABLE test_date;") | |
#df_load = pd.read_sql_table("test_date", conn, parse_dates={"date_1": "date", "date_2": "date"}) | |
df_load = pd.read_sql_table("test_date", conn) | |
print("\ndataframe after") | |
#df_load = df_load.sort_values(by="date_1").reset_index(drop=True) | |
print(df_load.dtypes) | |
print(df_load) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment