Created
February 6, 2019 06:15
-
-
Save bchess/c0d2ee3b432df41f29bf850b9ff5cc5f to your computer and use it in GitHub Desktop.
Generate a query that expands repeated fields in bigquery tables
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string | |
from google.cloud import bigquery | |
client = bigquery.Client() | |
DATASET_NAME = 'TODO' | |
TABLE_NAME = 'TODO' | |
dataset_ref = client.dataset(DATASET_NAME) | |
table_ref = dataset_ref.table(TABLE_NAME) | |
table = client.get_table(table_ref) | |
view_query = [] | |
ALLOWED_CHARS = string.ascii_letters + string.digits + '_' | |
def to_field_name(unmunged): | |
munged = [] | |
for c in unmunged: | |
if c not in ALLOWED_CHARS: | |
c = '_' | |
munged.append(c) | |
return ''.join(munged) | |
new_schema = [] | |
for field in table.schema: | |
if field.mode != 'REPEATED': | |
view_query.append(field.name) | |
new_schema.append(field) | |
continue | |
for subfield in field.fields: | |
if subfield.name not in ('key', 'name'): | |
value_field = subfield | |
break | |
for subfield in field.fields: | |
if subfield.name not in ('key', 'name'): | |
continue | |
# Find the top N of these | |
top_n_query = ('SELECT f.{subfield_name} as k, count(*) as c ' | |
'FROM `{table.dataset_id}.{table.table_id}` ' | |
'CROSS JOIN UNNEST ({field_name}) as f ' | |
'GROUP BY f.{subfield_name} ' | |
'ORDER BY c DESC ' | |
'LIMIT 10').format( | |
field_name=field.name, | |
subfield_name=subfield.name, | |
table=table | |
) | |
print(top_n_query) | |
query_job = client.query(top_n_query) | |
rows = query_job.result() | |
for row in rows: | |
new_field = bigquery.schema.SchemaField( | |
name='{field_name}_{k}'.format(field_name=field.name, k=to_field_name(row['k'])), | |
field_type=value_field.field_type, | |
) | |
new_schema.append(new_field) | |
view_query.append( | |
('(SELECT {value_field.name} ' | |
'FROM UNNEST({field.name}) ' | |
'WHERE {subfield.name} = {k} ' | |
'LIMIT 1) as {new_field.name}' | |
).format( | |
value_field=value_field, | |
field=field, | |
subfield=subfield, | |
k=repr(row['k']), | |
new_field=new_field | |
) | |
) | |
# print(new_schema) | |
print('SELECT ', ','.join(view_query), 'FROM {table.dataset_id}.{table.table_id}'.format(table=table)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment