AdbcError error = {};
AdbcDatabase database = {};
AdbcDatabaseNew(&database, &error);
// AdbcDatabaseSetOption(&database, "driver", "<driver>", &error);  // it should be possible to omit this...
AdbcDatabaseSetOption(&database, "uri", "<driver>:<scheme>://<address>", &error);  // ...and set only this, with the driver encoded in the URI scheme
To create games.duckdb, download games.txt from https://github.com/ianmcook/coursera-datasets/blob/master/games.txt, open DuckDB, and run:

ATTACH 'games.duckdb' AS main_db (BLOCK_SIZE 16384);
USE main_db;
CREATE TABLE games (
    id INTEGER,
    name VARCHAR,
    inventor VARCHAR,
    year VARCHAR
);
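-- A sketch of loading the downloaded data (assumes games.txt is a delimited
-- text file that DuckDB's CSV reader can auto-detect; adjust options to match the file):
COPY games FROM 'games.txt';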
library(adbcdrivermanager)

drv <- adbc_driver("snowflake")
db <- adbc_database_init(
  drv,
  username = "USER",
  password = "PASS",
  adbc.snowflake.sql.account = "ACCOUNT-IDENT",
  adbc.snowflake.sql.warehouse = "MY_WAREHOUSE"
)
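# From here, a sketch using adbcdrivermanager's connection helpers
# (adbc_connection_init() and read_adbc(); the query is a placeholder):
con <- adbc_connection_init(db)
con |> read_adbc("SELECT CURRENT_TIMESTAMP") |> as.data.frame()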
{
  "statement": "SELECT * FROM MYTABLEONE; SELECT * FROM MYTABLETWO",
  "parameters": {
    "MULTI_STATEMENT_COUNT": "2"
  },
  "resultSetMetaData": {
    "format": "arrowv1"
  },
  "timeout": 60,
  "database": "MYDATABASE"
}
adbc_driver_flightsql
adbc_driver_manager
adbc_driver_postgresql
adbc_driver_sqlite
furo
numpydoc
pandas
polars
sphinx
sphinx-copybutton
First, start an HTTP server to serve Arrow IPC stream data. You can do this using one of the server examples in HTTP GET Arrow Data: Simple Examples, or simply by starting a Python HTTP server in the same directory as an Arrow IPC stream file (named file.arrows in this example):

python -m http.server 8008

Download the attached Python script script.py. You might need to run chmod +x script.py to make it executable.
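The attached script is not reproduced here; purely as an illustration of what such a client might do (hypothetical, not the attached script), the stream can be fetched and read with pyarrow:

import urllib.request
import pyarrow as pa

# Fetch file.arrows from the server started above and read it as an Arrow table
with urllib.request.urlopen("http://localhost:8008/file.arrows") as response:
    reader = pa.ipc.open_stream(response)
    table = reader.read_all()

print(table.num_rows)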
import pandas as pd
import pyarrow as pa

file_path = 'fruit.arrow'
stream_path = 'fruit.arrows'

df = pd.DataFrame(data={'fruit': ['apple', 'apple', 'apple', 'orange', 'orange', 'orange'],
                        'variety': ['gala', 'honeycrisp', 'fuji', 'navel', 'valencia', 'cara cara'],
                        'weight': [134.2, 158.6, None, 142.1, 96.7, None]})
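# The snippet stops before file_path and stream_path are used; presumably it
# continues by writing both IPC formats. A sketch:
table = pa.Table.from_pandas(df)

# Random-access (file) format -> fruit.arrow
with pa.OSFile(file_path, 'wb') as sink:
    with pa.ipc.new_file(sink, table.schema) as writer:
        writer.write_table(table)

# Streaming format -> fruit.arrows
with pa.OSFile(stream_path, 'wb') as sink:
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)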
{
  "statement": "SELECT * FROM MYTABLE",
  "resultSetMetaData": {
    "format": "arrowv1"
  },
  "timeout": 60,
  "database": "MYDATABASE",
  "schema": "MYSCHEMA",
  "warehouse": "MYWAREHOUSE",
  "role": "MYROLE"
}
This is a simple example demonstrating why you might want to use IbisML instead of plain Ibis in an ML preprocessing pipeline.
Suppose you are training an ML model that achieves better accuracy when the floating-point columns in the training data are normalized (by subtracting the mean and dividing by the standard deviation), and your data contains multiple floating-point columns.
To demonstrate this, we can use the iris flower dataset.
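To sketch the contrast (assuming the data is already loaded as an Ibis table named iris; Recipe, ScaleStandard, and the numeric() selector are from the ibis-ml package):

import ibis.selectors as s
from ibis import _
import ibis_ml as ml

# Plain Ibis: spell out the normalization across the numeric columns yourself
# (note that s.numeric() also matches integer columns; select by type if you
# want strictly floating-point columns)
normalized = iris.mutate(s.across(s.numeric(), (_ - _.mean()) / _.std()))

# IbisML: declare the same preprocessing once, as a reusable recipe step that
# can later be fit on training data and applied in an sklearn-style pipeline
rec = ml.Recipe(ml.ScaleStandard(ml.numeric()))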
import ibis
import random

con = ibis.connect("duckdb://penguins.ddb")
con.create_table(
    "penguins", ibis.examples.penguins.fetch().to_pyarrow(), overwrite=True
)
ibis.options.interactive = True