Skip to content

Instantly share code, notes, and snippets.

from typing import Iterable, cast
import polars as pl
from polars.testing import assert_frame_equal
import numpy as np
from datetime import timedelta
import datetime
import asyncio
import time
from io import StringIO
@deanm0000
deanm0000 / main.rs
Created January 29, 2025 19:26
polars to postgres via tokio_postgres binary COPY in rust
#[tokio::main]
async fn main() {
let mut client = make_postgres().await; // tokio_postgres::connect with spawn
let transaction = client.transaction().await.unwrap();
// make an example df
let utc=PlSmallStr::from_str("UTC");
let ava = AnyValue::Datetime(Utc
.with_ymd_and_hms(2010, 1, 1, 1, 0, 0)
.unwrap()
@deanm0000
deanm0000 / pyarrow_sig.py
Last active September 26, 2024 04:56
pyarrow compute probe for signatures
import inspect
import json
import os
from pyarrow import compute as pc
from types import FunctionType
import pyarrow as pa
from datetime import datetime
import multiprocessing as mp
import sys
@deanm0000
deanm0000 / with_walrus.py
Last active August 4, 2024 17:15
with_walrus
import sys
import polars as pl
"""
This method is a workaround for two annoyances.
1. If one wants to reuse an earlier column definition, the
walrus operator can be used, but it looks awkward because it
has the Python variable on the left and still needs an
alias at the end, such as
`with_columns(a:=(pl.col('b')+1).alias('a'), (a*2).alias('c'))`
@deanm0000
deanm0000 / gist:485291942b47b4d32113973eff493e72
Created July 23, 2024 15:51
make fake ufunc to avoid map_batches
import polars as pl
import pyarrow.compute as pc
# Example df
df = pl.DataFrame(
[
pl.Series("a", [1, 2, 3], dtype=pl.Int64),
]
)
import httpx
import asyncio
from bs4 import BeautifulSoup
import os
import geopandas as gpd
import pandas as pd
from pathlib import Path
from geoarrow.rust.core import (
GeoTable,
write_parquet,
def parse_dtypes(df, exclude=[]):
str_cols = [x for x, y in df.schema.items() if y == pl.String and x not in exclude]
try_casts = df.select(
pl.struct(pl.all()).alias("original"),
pl.struct(
pl.coalesce(
pl.col(col).str.strptime(pl.Datetime, x, strict=False)
for x in ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"]
)
for col in str_cols
@deanm0000
deanm0000 / add_prints_to_rust.py
Last active June 7, 2024 15:00
add eprint everywhere
from pathlib import Path
import re
rootpath = Path("./polars/crates")
for p in rootpath.rglob("*.rs"):
with p.open() as f:
filestr = f.read()
if filestr.find("fn") == -1:
continue
@deanm0000
deanm0000 / benchmark_filters.py
Last active March 20, 2024 15:16
benchmark filtering list in polars series
import polars as pl
import numpy as np
from itertools import product
import time
from datetime import datetime
import json
def gen_long_string(str_len=10, n_rows=10_000_000):
rng = np.random.default_rng()
@deanm0000
deanm0000 / calpl.py
Last active February 1, 2024 17:14
function to extract a sheet from a CalamineWorkbook into a polars df
def pl_cal_sheet(
wb: CalamineWorkbook,
sheet: str,
header_rows: int | None = None,
header_merge_char: str = "_",
skip_rows: int = 0,
infer_schema_length: int = 1000,
infer_schema_minrow: int = 10,
column_dupe_name_seperator: str = "_",
) -> pl.DataFrame: