MaxGhenis · January 28, 2026 20:22
diff --git a/README.md b/README.md
diff --git a/policyengine_batch.py b/policyengine_batch.py
 """
 Batch processing script for PolicyEngine calculations.
 Use with Stata or any system that can export/import CSV files.

 Usage:
    python policyengine_batch.py input.csv output.csv

 Input CSV format:
    id,age,employment_income,state_code
    1,35,30000,TX
    2,42,75000,CA

 Output adds calculated columns:
    id,age,employment_income,state_code,income_tax,eitc,ctc,snap,household_net_income
 """

 import sys
 import pandas as pd
 from policyengine_us import Simulation


 def calculate_for_row(row, year=2025):
    """Build a one-person household from ``row`` and compute its results.

    Args:
        row: Mapping-like record providing ``age``, ``employment_income``
            and ``state_code`` entries.
        year: Period used for every input value and every requested output.

    Returns:
        dict mapping ``income_tax``, ``eitc``, ``ctc``, ``snap`` and
        ``household_net_income`` to floats.
    """
    adult = {
        "age": {year: int(row["age"])},
        "employment_income": {year: float(row["employment_income"])},
    }
    home = {
        "members": ["person"],
        "state_code": {year: row["state_code"]},
    }
    # The same single person is the only member of every group entity.
    situation = {
        "people": {"person": adult},
        "tax_units": {"tax_unit": {"members": ["person"]}},
        "spm_units": {"spm_unit": {"members": ["person"]}},
        "families": {"family": {"members": ["person"]}},
        "households": {"household": home},
    }

    sim = Simulation(situation=situation)

    output_names = (
        "income_tax",
        "eitc",
        "ctc",
        "snap",
        "household_net_income",
    )
    # Element 0 of each calculated array is used as the scalar result.
    return {
        name: float(sim.calculate(name, year)[0]) for name in output_names
    }


 def main(input_path, output_path):
    """Read households from a CSV, calculate results, and write a new CSV.

    Args:
        input_path: CSV file to read; it must contain the ``age``,
            ``employment_income`` and ``state_code`` columns.
        output_path: Destination CSV. It receives the input columns followed
            by the calculated columns.

    Raises:
        ValueError: If any required column is absent from the input.
    """
    df = pd.read_csv(input_path)

    # Validate required columns
    required = ["age", "employment_income", "state_code"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Calculate for each row, reporting progress by 1-based position.
    total = len(df)
    results = []
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        print(f"Processing row {position}/{total}...")
        results.append(calculate_for_row(row))

    # Merge results
    if results:
        # Reuse the input index so the side-by-side concat lines rows up.
        results_df = pd.DataFrame(results, index=df.index)
    else:
        # With no data rows there are no result dicts to take column names
        # from, so spell out the documented output columns to keep the
        # header of the written file complete.
        result_columns = [
            "income_tax",
            "eitc",
            "ctc",
            "snap",
            "household_net_income",
        ]
        results_df = pd.DataFrame(columns=result_columns, index=df.index)
    output_df = pd.concat([df, results_df], axis=1)
    output_df.to_csv(output_path, index=False)
    print(f"Results written to {output_path}")


 if __name__ == "__main__":
    # Script entry point: expects exactly an input path and an output path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        # Wrong argument count: show the module usage text, exit non-zero.
        print(__doc__)
        sys.exit(1)
    main(*cli_args)
diff --git a/policyengine_stata_example.do b/policyengine_stata_example.do
 ********************************************************************************
 * Using PolicyEngine from Stata
 *
 * Three approaches shown below:
 * 1. Direct Python integration (Stata 16+)
 * 2. API calls via curl
 * 3. Batch processing via Python script
 ********************************************************************************

 *===============================================================================
 * APPROACH 1: Direct Python Integration (Stata 16+)
 * Requires: Stata 16+, Python 3.9+, policyengine-us package
 *===============================================================================

 * First, ensure Python is configured in Stata:
 * python query

 * Install policyengine-us (run once):
 * shell pip install policyengine-us

 * Example: Calculate federal income tax for a single filer
 * Everything between "python:" and "end" is Python code. The results are
 * handed back to Stata as scalars through sfi.Scalar.
 python:
 from policyengine_us import Simulation

 # Define a household
 # One person who is the only member of the tax unit and of the household;
 # every input value is keyed by the year 2025.
 situation = {
    "people": {
        "person": {
            "age": {2025: 40},
            "employment_income": {2025: 50_000}
        }
    },
    "tax_units": {
        "tax_unit": {
            "members": ["person"]
        }
    },
    "households": {
        "household": {
            "members": ["person"],
            "state_code": {2025: "CA"}
        }
    }
 }

 sim = Simulation(situation=situation)

 # Calculate various outputs
 fed_tax = sim.calculate("income_tax", 2025)
 state_tax = sim.calculate("ca_income_tax", 2025)
 eitc = sim.calculate("eitc", 2025)
 ctc = sim.calculate("ctc", 2025)

 # Store results in Stata scalars
 # Element 0 of each calculated result is converted to a float first.
 from sfi import Scalar
 Scalar.setValue("fed_tax", float(fed_tax[0]))
 Scalar.setValue("state_tax", float(state_tax[0]))
 Scalar.setValue("eitc", float(eitc[0]))
 Scalar.setValue("ctc", float(ctc[0]))
 end

 * Display results
 * Reads the four scalars set in the Python block above.
 di "Federal Income Tax: $" %10.2f scalar(fed_tax)
 di "State Income Tax:   $" %10.2f scalar(state_tax)
 di "EITC:               $" %10.2f scalar(eitc)
 di "CTC:                $" %10.2f scalar(ctc)


 *===============================================================================
 * APPROACH 2: API Calls via curl (works with any Stata version)
 * No Python installation required on Stata machine
 *===============================================================================

 * Using PolicyEngine's hosted API
 * The request is one logical command split over several lines. Stata
 * continues a do-file line with ///, not with a trailing backslash, so ///
 * is used to keep the whole curl invocation in a single shell call.
 * The response body is redirected into the temporary file `response'.
 tempfile response
 shell curl -s -X POST "https://household.api.policyengine.org/us/calculate" ///
    -H "Content-Type: application/json" ///
    -d '{"household": {"people": {"person": {"age": {"2025": 40}, "employment_income": {"2025": 50000}}}, "tax_units": {"tax_unit": {"members": ["person"]}}, "households": {"household": {"members": ["person"], "state_code": {"2025": "CA"}}}}}' ///
    > `response'

 * Parse JSON response (requires Stata 17+ for native JSON, or use insheetjson for earlier)
 * NOTE(review): jsonio looks like a community-contributed package rather
 * than a built-in command - confirm how it is installed and its exact syntax.
 * For Stata 17+:
 * jsonio using `response', elem("result.income_tax.2025")


 *===============================================================================
 * APPROACH 3: Batch Processing via External Python Script
 * Best for processing many observations
 *===============================================================================

 * Step 1: Export your data to CSV
 * sysuse auto, clear
 * export delimited using "input_data.csv", replace

 * Step 2: Run Python script (see policyengine_batch.py in this gist)
 * shell python policyengine_batch.py input_data.csv output_results.csv

 * Step 3: Merge results back
 * import delimited using "output_results.csv", clear


 *===============================================================================
 * EXAMPLE: Loop over dataset observations
 * Calculate PolicyEngine results for each row
 *===============================================================================

 * Create example dataset
 * Four observations: id, age, income and a two-character state code.
 clear
 input id age income str2 state
 1 35 30000 "TX"
 2 42 75000 "CA"
 3 28 45000 "NY"
 4 55 120000 "FL"
 end

 * Generate empty results variables
 * They start as missing and are filled in one observation at a time below.
 gen double fed_tax = .
 gen double eitc = .
 gen double net_income = .

 * Loop and calculate (note: slow for large datasets - use batch approach instead)
 forvalues i = 1/`=_N' {
    * Copy observation i's values into local macros so they can be
    * substituted into the Python code below.
    local age_i = age[`i']
    local income_i = income[`i']
    local state_i = state[`i']

    * NOTE(review): this relies on Stata accepting a multi-line python: ... end
    * block inside a forvalues loop, and on the local macros being expanded
    * inside the Python lines - confirm on the target Stata version.
    python:
    from policyengine_us import Simulation
    # Data is imported here but not used in this block.
    from sfi import Scalar, Data

    situation = {
        "people": {
            "person": {
                "age": {2025: `age_i'},
                "employment_income": {2025: `income_i'}
            }
        },
        "tax_units": {
            "tax_unit": {
                "members": ["person"]
            }
        },
        "households": {
            "household": {
                "members": ["person"],
                "state_code": {2025: "`state_i'"}
            }
        }
    }

    sim = Simulation(situation=situation)
    # Hand the three results back to Stata as underscore-prefixed scalars.
    Scalar.setValue("_fed_tax", float(sim.calculate("income_tax", 2025)[0]))
    Scalar.setValue("_eitc", float(sim.calculate("eitc", 2025)[0]))
    Scalar.setValue("_net_income", float(sim.calculate("household_net_income", 2025)[0]))
    end

    * Copy the scalars set by Python into observation i of the result variables.
    quietly replace fed_tax = scalar(_fed_tax) in `i'
    quietly replace eitc = scalar(_eitc) in `i'
    quietly replace net_income = scalar(_net_income) in `i'
 }

 list

 ********************************************************************************
 * Notes:
 * - PolicyEngine is open source: github.com/policyengine/policyengine-us
 * - API documentation: policyengine.org/us/api
 * - For large-scale analysis, consider the batch Python approach
 * - Questions? Contact [email protected]
 ********************************************************************************
	"""
	Batch processing script for PolicyEngine calculations.
	Use with Stata or any system that can export/import CSV files.

	Usage:
	python policyengine_batch.py input.csv output.csv

	Input CSV format:
	id,age,employment_income,state_code
	1,35,30000,TX
	2,42,75000,CA

	Output adds calculated columns:
	id,age,employment_income,state_code,income_tax,eitc,ctc,snap,household_net_income
	"""

	import sys
	import pandas as pd
	from policyengine_us import Simulation


	def calculate_for_row(row, year=2025):
	    """Calculate PolicyEngine results for a single household.

	    Args:
	        row: Mapping-like record providing ``age``, ``employment_income``
	            and ``state_code`` entries.
	        year: Period used for every input value and every requested output.

	    Returns:
	        dict mapping ``income_tax``, ``eitc``, ``ctc``, ``snap`` and
	        ``household_net_income`` to floats.
	    """
	    # One person who is the only member of every group entity.
	    situation = {
	        "people": {
	            "person": {
	                "age": {year: int(row["age"])},
	                "employment_income": {year: float(row["employment_income"])},
	            }
	        },
	        "tax_units": {"tax_unit": {"members": ["person"]}},
	        "spm_units": {"spm_unit": {"members": ["person"]}},
	        "families": {"family": {"members": ["person"]}},
	        "households": {
	            "household": {
	                "members": ["person"],
	                "state_code": {year: row["state_code"]},
	            }
	        },
	    }

	    sim = Simulation(situation=situation)

	    # Element 0 of each calculated array is used as the scalar result.
	    return {
	        "income_tax": float(sim.calculate("income_tax", year)[0]),
	        "eitc": float(sim.calculate("eitc", year)[0]),
	        "ctc": float(sim.calculate("ctc", year)[0]),
	        "snap": float(sim.calculate("snap", year)[0]),
	        "household_net_income": float(
	            sim.calculate("household_net_income", year)[0]
	        ),
	    }


	def main(input_path, output_path):
	    """Read households from a CSV, calculate results, and write a new CSV.

	    Args:
	        input_path: CSV file to read; it must contain the ``age``,
	            ``employment_income`` and ``state_code`` columns.
	        output_path: Destination CSV. It receives the input columns followed
	            by the calculated columns.

	    Raises:
	        ValueError: If any required column is absent from the input.
	    """
	    df = pd.read_csv(input_path)

	    # Validate required columns
	    required = ["age", "employment_income", "state_code"]
	    missing = [c for c in required if c not in df.columns]
	    if missing:
	        raise ValueError(f"Missing required columns: {missing}")

	    # Calculate for each row, reporting progress by 1-based position.
	    total = len(df)
	    results = []
	    for position, (_, row) in enumerate(df.iterrows(), start=1):
	        print(f"Processing row {position}/{total}...")
	        results.append(calculate_for_row(row))

	    # Merge results
	    if results:
	        # Reuse the input index so the side-by-side concat lines rows up.
	        results_df = pd.DataFrame(results, index=df.index)
	    else:
	        # With no data rows there are no result dicts to take column names
	        # from, so spell out the documented output columns to keep the
	        # header of the written file complete.
	        result_columns = [
	            "income_tax",
	            "eitc",
	            "ctc",
	            "snap",
	            "household_net_income",
	        ]
	        results_df = pd.DataFrame(columns=result_columns, index=df.index)
	    output_df = pd.concat([df, results_df], axis=1)
	    output_df.to_csv(output_path, index=False)
	    print(f"Results written to {output_path}")


	if __name__ == "__main__":
	    # Script entry point: expects exactly an input path and an output path.
	    if len(sys.argv) != 3:
	        # Wrong argument count: show the module usage text, exit non-zero.
	        print(__doc__)
	        sys.exit(1)
	    main(sys.argv[1], sys.argv[2])
	********************************************************************************
	* Using PolicyEngine from Stata
	*
	* Three approaches shown below:
	* 1. Direct Python integration (Stata 16+)
	* 2. API calls via curl
	* 3. Batch processing via Python script
	********************************************************************************

	*===============================================================================
	* APPROACH 1: Direct Python Integration (Stata 16+)
	* Requires: Stata 16+, Python 3.9+, policyengine-us package
	*===============================================================================

	* First, ensure Python is configured in Stata:
	* python query

	* Install policyengine-us (run once):
	* shell pip install policyengine-us

	* Example: Calculate federal income tax for a single filer
	* Everything between "python:" and "end" is Python code. The results are
	* handed back to Stata as scalars through sfi.Scalar.
	python:
	from policyengine_us import Simulation

	# Define a household
	# One person who is the only member of the tax unit and of the household;
	# every input value is keyed by the year 2025.
	situation = {
	"people": {
	"person": {
	"age": {2025: 40},
	"employment_income": {2025: 50_000}
	}
	},
	"tax_units": {
	"tax_unit": {
	"members": ["person"]
	}
	},
	"households": {
	"household": {
	"members": ["person"],
	"state_code": {2025: "CA"}
	}
	}
	}

	sim = Simulation(situation=situation)

	# Calculate various outputs
	fed_tax = sim.calculate("income_tax", 2025)
	state_tax = sim.calculate("ca_income_tax", 2025)
	eitc = sim.calculate("eitc", 2025)
	ctc = sim.calculate("ctc", 2025)

	# Store results in Stata scalars
	# Element 0 of each calculated result is converted to a float first.
	from sfi import Scalar
	Scalar.setValue("fed_tax", float(fed_tax[0]))
	Scalar.setValue("state_tax", float(state_tax[0]))
	Scalar.setValue("eitc", float(eitc[0]))
	Scalar.setValue("ctc", float(ctc[0]))
	end

	* Display results
	* Reads the four scalars set in the Python block above.
	di "Federal Income Tax: $" %10.2f scalar(fed_tax)
	di "State Income Tax: $" %10.2f scalar(state_tax)
	di "EITC: $" %10.2f scalar(eitc)
	di "CTC: $" %10.2f scalar(ctc)


	*===============================================================================
	* APPROACH 2: API Calls via curl (works with any Stata version)
	* No Python installation required on Stata machine
	*===============================================================================

	* Using PolicyEngine's hosted API
	* The request is one logical command split over several lines. Stata
	* continues a do-file line with ///, not with a trailing backslash, so ///
	* is used to keep the whole curl invocation in a single shell call.
	* The response body is redirected into the temporary file `response'.
	tempfile response
	shell curl -s -X POST "https://household.api.policyengine.org/us/calculate" ///
	    -H "Content-Type: application/json" ///
	    -d '{"household": {"people": {"person": {"age": {"2025": 40}, "employment_income": {"2025": 50000}}}, "tax_units": {"tax_unit": {"members": ["person"]}}, "households": {"household": {"members": ["person"], "state_code": {"2025": "CA"}}}}}' ///
	    > `response'

	* Parse JSON response (requires Stata 17+ for native JSON, or use insheetjson for earlier)
	* NOTE(review): jsonio looks like a community-contributed package rather
	* than a built-in command - confirm how it is installed and its exact syntax.
	* For Stata 17+:
	* jsonio using `response', elem("result.income_tax.2025")


	*===============================================================================
	* APPROACH 3: Batch Processing via External Python Script
	* Best for processing many observations
	*===============================================================================

	* Step 1: Export your data to CSV
	* sysuse auto, clear
	* export delimited using "input_data.csv", replace

	* Step 2: Run Python script (see policyengine_batch.py in this gist)
	* shell python policyengine_batch.py input_data.csv output_results.csv

	* Step 3: Merge results back
	* import delimited using "output_results.csv", clear


	*===============================================================================
	* EXAMPLE: Loop over dataset observations
	* Calculate PolicyEngine results for each row
	*===============================================================================

	* Create example dataset
	* Four observations: id, age, income and a two-character state code.
	clear
	input id age income str2 state
	1 35 30000 "TX"
	2 42 75000 "CA"
	3 28 45000 "NY"
	4 55 120000 "FL"
	end

	* Generate empty results variables
	* They start as missing and are filled in one observation at a time below.
	gen double fed_tax = .
	gen double eitc = .
	gen double net_income = .

	* Loop and calculate (note: slow for large datasets - use batch approach instead)
	forvalues i = 1/`=_N' {
	* Copy observation i's values into local macros so they can be
	* substituted into the Python code below.
	local age_i = age[`i']
	local income_i = income[`i']
	local state_i = state[`i']

	* NOTE(review): this relies on Stata accepting a multi-line python: ... end
	* block inside a forvalues loop, and on the local macros being expanded
	* inside the Python lines - confirm on the target Stata version.
	python:
	from policyengine_us import Simulation
	# Data is imported here but not used in this block.
	from sfi import Scalar, Data

	situation = {
	"people": {
	"person": {
	"age": {2025: `age_i'},
	"employment_income": {2025: `income_i'}
	}
	},
	"tax_units": {
	"tax_unit": {
	"members": ["person"]
	}
	},
	"households": {
	"household": {
	"members": ["person"],
	"state_code": {2025: "`state_i'"}
	}
	}
	}

	sim = Simulation(situation=situation)
	# Hand the three results back to Stata as underscore-prefixed scalars.
	Scalar.setValue("_fed_tax", float(sim.calculate("income_tax", 2025)[0]))
	Scalar.setValue("_eitc", float(sim.calculate("eitc", 2025)[0]))
	Scalar.setValue("_net_income", float(sim.calculate("household_net_income", 2025)[0]))
	end

	* Copy the scalars set by Python into observation i of the result variables.
	quietly replace fed_tax = scalar(_fed_tax) in `i'
	quietly replace eitc = scalar(_eitc) in `i'
	quietly replace net_income = scalar(_net_income) in `i'
	}

	list

	********************************************************************************
	* Notes:
	* - PolicyEngine is open source: github.com/policyengine/policyengine-us
	* - API documentation: policyengine.org/us/api
	* - For large-scale analysis, consider the batch Python approach
	* - Questions? Contact [email protected]
	********************************************************************************