rootsconf-dq
import streamlit as st
import yaml, json
import matplotlib.pyplot as plt

dq_score, dashboards = st.tabs(['Data quality score', 'Dashboards'])

with dq_score:
    st.title("Data quality score")

    schema, checks = {}, {}
    # Current and expected number of checks per data quality dimension.
    curr_count_comp, curr_count_cons, curr_count_fresh, curr_count_uniq, curr_count_val = 0, 0, 0, 0, 0
    exp_count_comp, exp_count_cons, exp_count_fresh, exp_count_uniq, exp_count_val = 3, 2, 1, 2, 4

    col_uploader_schema, col_uploader_checks = st.columns(2)
    col_completeness, col_accuracy, col_consistency = st.columns(3)
    col_validity, col_uniqueness, col_stewardship = st.columns(3)

    # Fields the data contract (schema file) is expected to provide.
    contract_args = ['dataset_name', 'columns', 'criticality', 'consumer', 'producer']

    with col_uploader_schema:
        st.header('Schema uploader')
        schema_file = st.file_uploader(label='Schema', type=['yaml', 'yml'], key='schema')
        if schema_file:
            schema = yaml.safe_load(schema_file)

    with col_uploader_checks:
        st.header('Checks uploader')
        checks_file = st.file_uploader(label='Checks', type=['yaml', 'yml'], key='checks')
        if checks_file:
            checks = yaml.safe_load(checks_file)
    with col_completeness:
        with st.expander('Completeness'):
            if checks:
                # Count the checks whose name mentions the completeness dimension.
                for dataset_check in checks:
                    for check in checks[dataset_check]:
                        for key in check:
                            if "COMPLETENESS" in check[key]["name"].upper():
                                curr_count_comp += 1
                if curr_count_comp >= exp_count_comp:
                    st.write(':green[Great, you are covered for the completeness dimension]')
                elif 0 < curr_count_comp < exp_count_comp:
                    st.write(f':orange[Please provide at least {exp_count_comp} completeness checks]')
                else:
                    st.write(':red[Did you even try?...]')
            else:
                st.write(":red[Please upload a check file]")
    # Note: the freshness checks are displayed in the column named col_accuracy.
    with col_accuracy:
        with st.expander('Freshness'):
            if checks:
                for dataset_check in checks:
                    for check in checks[dataset_check]:
                        for key in check:
                            if "FRESHNESS" in check[key]["name"].upper():
                                curr_count_fresh += 1
                if curr_count_fresh >= exp_count_fresh:
                    st.write(':green[Great, you are covered for the freshness dimension]')
                else:
                    st.write(f':red[Did you even try?... We need {exp_count_fresh} more.]')
            else:
                st.write(":red[Please upload a check file]")
    with col_consistency:
        with st.expander('Consistency'):
            if checks:
                for dataset_check in checks:
                    for check in checks[dataset_check]:
                        for key in check:
                            if "CONSISTENCY" in check[key]["name"].upper():
                                curr_count_cons += 1
                if curr_count_cons >= exp_count_cons:
                    st.write(':green[Great, you are covered for the consistency dimension]')
                elif 0 < curr_count_cons < exp_count_cons:
                    st.write(f':orange[Please provide at least {exp_count_cons} consistency checks]')
                else:
                    st.write(':red[Did you even try?...]')
            else:
                st.write(":red[Please upload a check file]")
    with col_uniqueness:
        with st.expander('Uniqueness'):
            if checks:
                for dataset_check in checks:
                    for check in checks[dataset_check]:
                        for key in check:
                            if "UNIQUENESS" in check[key]["name"].upper():
                                curr_count_uniq += 1
                if curr_count_uniq >= exp_count_uniq:
                    st.write(':green[Great, you are covered for the uniqueness dimension]')
                elif 0 < curr_count_uniq < exp_count_uniq:
                    st.write(f':orange[Please provide at least {exp_count_uniq} uniqueness checks]')
                else:
                    st.write(':red[Did you even try?...]')
            else:
                st.write(":red[Please upload a check file]")
    with col_validity:
        with st.expander('Validity'):
            if checks:
                for dataset_check in checks:
                    for check in checks[dataset_check]:
                        for key in check:
                            if "VALIDITY" in check[key]["name"].upper():
                                curr_count_val += 1
                if curr_count_val >= exp_count_val:
                    st.write(':green[Great, you are covered for the validity dimension]')
                elif 0 < curr_count_val < exp_count_val:
                    st.write(f':orange[Please provide at least {exp_count_val} validity checks]')
                else:
                    st.write(':red[Did you even try?...]')
            else:
                st.write(":red[Please upload a check file]")
    # Stewardship: count the contract fields that are missing from the schema.
    steward_score = 0
    with col_stewardship:
        with st.expander('Stewardship'):
            if schema:
                for arg in contract_args:
                    if not schema.get(arg):
                        steward_score += 1
                        st.write(f':red[We need a {arg}]')
                if schema.get('columns'):
                    for col in schema.get('columns'):
                        if not schema.get('columns')[col].get('description') or not schema.get('columns')[col].get('type'):
                            st.write(f':red[Missing type or description for column {col}]')
            else:
                steward_score = len(contract_args)
                st.write(':red[Please provide a schema file.]')

    # Share of contract fields that are present, between 0 and 1.
    steward_score = (len(contract_args) - steward_score) / len(contract_args)
    # Sum of the five dimension coverage ratios; the final score averages them and adds the
    # stewardship ratio, so each half contributes up to 50% of the displayed percentage.
    dq_score = curr_count_comp/exp_count_comp + curr_count_cons/exp_count_cons + curr_count_fresh/exp_count_fresh + curr_count_uniq/exp_count_uniq + curr_count_val/exp_count_val
    st.header(f"DQ score: {(dq_score/5 + steward_score)*50}%")
with dashboards:
    st.header('Dashboards')
    st.subheader('Results uploader')

    checks = []
    results_file = st.file_uploader(label='Results', type=['json'], key='results')

    # Pass/fail buckets: one overall and one per data quality dimension.
    data_health_bucket = {'total': 0, 'fail': 0}
    freshness_bucket = {'total': 0, 'fail': 0}
    cons_bucket = {'total': 0, 'fail': 0}
    comp_bucket = {'total': 0, 'fail': 0}
    val_bucket = {'total': 0, 'fail': 0}
    uniqueness_bucket = {'total': 0, 'fail': 0}

    data_health, freshness_col_graph, cons_col_graph = st.columns(3)
    comp_col_graph, val_col_graph, uniqueness_col_graph = st.columns(3)

    failed = False
    if results_file:
        results = json.load(results_file)
        failed = results.get('hasFailures')
        checks = results.get('checks')
        # Tally totals and failures per dimension, based on the check name.
        for result in checks:
            if 'FRESHNESS' in result.get('name').upper():
                freshness_bucket['total'] += 1
            if 'CONSISTENCY' in result.get('name').upper():
                cons_bucket['total'] += 1
            if 'COMPLETENESS' in result.get('name').upper():
                comp_bucket['total'] += 1
            if 'VALIDITY' in result.get('name').upper():
                val_bucket['total'] += 1
            if 'UNIQUENESS' in result.get('name').upper():
                uniqueness_bucket['total'] += 1
            if result.get('outcome') == 'fail':
                data_health_bucket['fail'] += 1
                if 'FRESHNESS' in result.get('name').upper():
                    freshness_bucket['fail'] += 1
                if 'CONSISTENCY' in result.get('name').upper():
                    cons_bucket['fail'] += 1
                if 'COMPLETENESS' in result.get('name').upper():
                    comp_bucket['fail'] += 1
                if 'VALIDITY' in result.get('name').upper():
                    val_bucket['fail'] += 1
                if 'UNIQUENESS' in result.get('name').upper():
                    uniqueness_bucket['fail'] += 1
            data_health_bucket['total'] += 1
    with data_health:
        st.subheader("Data health")
        labels = 'Passed', 'Failed'
        sizes = [data_health_bucket['total'] - data_health_bucket['fail'], data_health_bucket['fail']]
        if checks and data_health_bucket['total'] > 0:
            fig1, ax1 = plt.subplots()
            ax1.pie(sizes, labels=labels, colors=['green', 'red'])
            ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
            st.pyplot(fig1)
    with freshness_col_graph:
        st.subheader("Freshness")
        labels = 'Passed', 'Failed'
        sizes = [freshness_bucket['total'] - freshness_bucket['fail'], freshness_bucket['fail']]
        if checks and freshness_bucket['total'] > 0:
            fig1, ax1 = plt.subplots()
            ax1.pie(sizes, labels=labels, colors=['green', 'red'])
            ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
            st.pyplot(fig1)
    with cons_col_graph:
        st.subheader("Consistency")
        labels = 'Passed', 'Failed'
        sizes = [cons_bucket['total'] - cons_bucket['fail'], cons_bucket['fail']]
        if checks and cons_bucket['total'] > 0:
            fig1, ax1 = plt.subplots()
            ax1.pie(sizes, labels=labels, colors=['green', 'red'])
            ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
            st.pyplot(fig1)
    with comp_col_graph:
        st.subheader("Completeness")
        labels = 'Passed', 'Failed'
        sizes = [comp_bucket['total'] - comp_bucket['fail'], comp_bucket['fail']]
        if checks and comp_bucket['total'] > 0:
            fig1, ax1 = plt.subplots()
            ax1.pie(sizes, labels=labels, colors=['green', 'red'])
            ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
            st.pyplot(fig1)
    with val_col_graph:
        st.subheader("Validity")
        labels = 'Passed', 'Failed'
        sizes = [val_bucket['total'] - val_bucket['fail'], val_bucket['fail']]
        if checks and val_bucket['total'] > 0:
            fig1, ax1 = plt.subplots()
            ax1.pie(sizes, labels=labels, colors=['green', 'red'])
            ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
            st.pyplot(fig1)
    with uniqueness_col_graph:
        st.subheader("Uniqueness")
        labels = 'Passed', 'Failed'
        sizes = [uniqueness_bucket['total'] - uniqueness_bucket['fail'], uniqueness_bucket['fail']]
        if checks and uniqueness_bucket['total'] > 0:
            fig1, ax1 = plt.subplots()
            ax1.pie(sizes, labels=labels, colors=['green', 'red'])
            ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
            st.pyplot(fig1)
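The structures below are a minimal sketch of the inputs the app assumes, inferred from how the code above indexes the uploaded files; the dataset and check names are made up, and a real checks file or scan-results file (for example, one produced by a Soda-style scan) may carry additional fields.

# Assumed shape of yaml.safe_load(checks_file): a mapping of dataset sections to
# lists of single-key check entries, each with at least a "name" that mentions
# the data quality dimension it covers.
checks = {
    "checks for orders": [
        {"row_count > 0": {"name": "completeness - orders table is not empty"}},
        {"missing_count(order_id) = 0": {"name": "completeness - order_id is filled"}},
        {"duplicate_count(order_id) = 0": {"name": "uniqueness - order_id is unique"}},
    ]
}

# Assumed shape of json.load(results_file): a top-level "hasFailures" flag and a
# "checks" list whose entries carry a "name" and an "outcome" of "pass" or "fail".
results = {
    "hasFailures": True,
    "checks": [
        {"name": "completeness - orders table is not empty", "outcome": "pass"},
        {"name": "freshness - orders loaded within 1 day", "outcome": "fail"},
    ],
}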