Last active
March 7, 2023 00:51
-
-
Save camriddell/d160a62a87713285afd82a53da3ca879 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from pathlib import Path | |
from numpy import sqrt, loadtxt, isclose | |
from pandas import read_table | |
from scipy.stats import t | |
def ttest(sample1, sample2):
    """Return the two-tailed p-value of Welch's unequal-variance t-test.

    Args:
        sample1: One-dimensional numeric sample exposing ``.var(ddof=...)``,
            ``.mean()`` and ``.size`` (NumPy arrays and pandas Series do).
        sample2: Second sample; it may differ in length from ``sample1``.

    Returns:
        The probability, under the null hypothesis of equal means, of
        observing a t statistic at least as extreme as the one computed.

    Note:
        ``ddof=1`` is passed explicitly. ``numpy.ndarray.var`` defaults to
        ``ddof=0`` while ``pandas.Series.var`` defaults to ``ddof=1``, so
        relying on the default makes the result depend on the container
        type; Welch's test is defined on the unbiased sample variance.
    """
    # Squared standard error of each sample mean: s_i**2 / n_i.
    vn1, vn2 = [s.var(ddof=1) / s.size for s in (sample1, sample2)]

    # Assume different sample sizes and unequal variances (Welch).
    std_err = sqrt(vn1 + vn2)
    # Welch-Satterthwaite approximation of the degrees of freedom.
    dof = (vn1 + vn2)**2 / (
        vn1**2 / (sample1.size - 1) + vn2**2 / (sample2.size - 1)
    )

    test_statistic = (sample1.mean() - sample2.mean()) / std_err
    # Two-tailed test: double the lower-tail mass at -|t|.
    p_value = t.cdf(-abs(test_statistic), df=dof) * 2
    return p_value


if __name__ == '__main__':
    data_dir = Path('data')

    # Same file, loaded as a NumPy array ...
    raw_data = loadtxt(data_dir / 'data.txt')
    sample1, sample2 = raw_data[:, 0], raw_data[:, 1]
    numpy_p_value = ttest(sample1, sample2)

    # ... and as a pandas DataFrame.
    raw_data = read_table(data_dir / 'data.txt', header=None, sep=' ')
    sample1, sample2 = raw_data.iloc[:, 0], raw_data.iloc[:, 1]
    pandas_p_value = ttest(sample1, sample2)

    # With ddof fixed inside ttest, the container type no longer matters.
    assert isclose(numpy_p_value, pandas_p_value)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from itertools import combinations | |
from pandas import read_csv, DataFrame, cut | |
from scipy.stats import ttest_ind | |
# Load the data set; each column is treated as one sample.
data = read_csv('data.csv')

# Run an independent two-sample t-test for every unordered pair of columns
# (iterating a DataFrame yields its column labels) and collect one row per
# pair, indexed by the (lhs, rhs) column names.
results = (
    DataFrame.from_dict({
        (col1, col2): ttest_ind(data[col1], data[col2])._asdict()
        for col1, col2 in combinations(data, r=2)
    }, orient='index')
    .rename_axis(['lhs', 'rhs'])
    .assign(
        # Map each p-value to a significance marker. include_lowest=True
        # closes the first bin on the left, so a p-value that underflows to
        # exactly 0.0 is labelled '***' instead of falling outside every
        # bin and becoming NaN.
        significance=lambda d: cut(
            d['pvalue'],
            bins=[0, .0001, .001, .05, .07, float('inf')],
            labels=['***', '**', '*', '~', ''],
            include_lowest=True,
        ),
    )
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from pathlib import Path | |
from pandas import read_csv | |
from statsmodels.formula.api import ols | |
# Fit an ordinary-least-squares model of monthly spend against the
# `standardize(income)` formula term, using the member data set.
data_dir = Path('data')
member_data = read_csv(data_dir.joinpath('data.csv'))

model = ols(formula='monthly_spend ~ standardize(income)', data=member_data)
fit = model.fit()

# The fitted coefficient of the income term is expected to be negative.
income_coefficient = fit.params['standardize(income)']
assert income_coefficient < 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment