Last active
September 9, 2019 07:30
-
-
Save abicky/66242adacfb5fff6bf0243c1a90742b0 to your computer and use it in GitHub Desktop.
Extract data from JSON using keys in the first level
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# frozen_string_literal: true
#
# Benchmark three strategies for extracting a scalar value out of a JSON
# string column in BigQuery:
#   1. the built-in json_extract_scalar with a CASE over the known keys
#   2. a SQL UDF that pulls the value out with regexp_extract
#   3. a JavaScript UDF that calls JSON.parse
#
# Requires the PROJECT_ID and DATASET_ID environment variables, plus
# credentials usable by the google-cloud-bigquery gem.

require 'csv'
require 'json'     # Hash#to_json (was only loaded transitively before)
require 'tempfile' # Tempfile was used below without being required
require 'faker'
require 'google/cloud/bigquery'

# Fail fast with a clear KeyError if the environment is not configured.
PROJECT_ID = ENV.fetch('PROJECT_ID')
DATASET_ID = ENV.fetch('DATASET_ID')
TABLE_ID = 'json_extract_benchmark'

bigquery = Google::Cloud::Bigquery.new(project_id: PROJECT_ID)
dataset = bigquery.dataset(DATASET_ID)
# Reuse the benchmark table if it already exists; otherwise create it.
table = dataset.table(TABLE_ID) || dataset.create_table(TABLE_ID) do |t|
  t.schema do |s|
    s.string 'json', mode: :required
    s.string 'key', mode: :required
    s.string 'value_type', mode: :required
  end
end

# [key to extract, declared type of that key's value]
PROPERTIES = [
  ['building_number', 'int'],
  ['latitude', 'float'],
  ['full_address', 'string'],
].freeze

# Fixed seed so the generated data set is reproducible across runs.
prng = Random.new(42)
Faker::Config.random = prng

# Generate 1M rows of (json, key, value_type) and bulk-load them.
Tempfile.open('json_extract_benchmark') do |f|
  csv = CSV.new(f)
  1_000_000.times do
    key, value_type = PROPERTIES.sample(random: prng)
    csv << [
      {
        building_number: Faker::Address.building_number.to_i,
        latitude: Faker::Address.latitude.to_f,
        full_address: Faker::Address.full_address,
      }.to_json,
      key,
      value_type,
    ]
  end
  f.rewind
  table.load(f, format: 'csv', write: 'truncate')
end

sqls = {
  json_extract_scalar: <<~'SQL' % TABLE_ID,
    #standardSQL
    select
      case key
      when 'building_number' then json_extract_scalar(json, '$.building_number')
      when 'latitude' then json_extract_scalar(json, '$.latitude')
      when 'full_address' then json_extract_scalar(json, '$.full_address')
      end
    from
      %s
  SQL
  f_json_extract_scalar: <<~'SQL' % TABLE_ID,
    #standardSQL
    create temporary function f_json_extract_scalar(json string, key string, value_type string)
    returns string as (
      case value_type
      when 'string' then
        json_extract_scalar(
          -- key should be escaped in advance
          regexp_extract(json, concat('"', key, r'"\s*:\s*("[^"\\]*(?:\\.[^"\\]*)*")'))
        , '$'
        )
      else
        -- key should be escaped in advance
        -- JSON allows an optional '+' in the exponent (RFC 8259), hence [-+]?
        regexp_extract(json, concat('"', key, r'"\s*:\s*(-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)'))
      end
    );
    select
      f_json_extract_scalar(json, key, value_type)
    from
      %s
  SQL
  f_json_extract_scalar_using_js: <<~'SQL' % TABLE_ID,
    #standardSQL
    create temporary function f_json_extract_scalar_using_js(json string, key string)
    returns string
    language js as """
      return JSON.parse(json)[key];
    """;
    select
      f_json_extract_scalar_using_js(json, key)
    from
      %s
  SQL
}

puts ' Name | Elapsed Time (s) | Total Slot Time (s) '
puts '------|------------------|---------------------'
sqls.each do |name, sql|
  # cache: false so every run actually scans the table.
  job = dataset.query_job(sql, cache: false)
  job.wait_until_done!
  elapsed_time = job.ended_at - job.started_at
  total_slot_time = job.statistics['totalSlotMs'].to_i / 1000
  puts [name, elapsed_time, total_slot_time].join(' | ')
end
# Name | Elapsed Time (s) | Total Slot Time (s)
# ------|------------------|---------------------
# json_extract_scalar | 7.642 | 13
# f_json_extract_scalar | 88.171 | 156
# f_json_extract_scalar_using_js | 18.38 | 21
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#standardSQL
-- Correctness check: compare the SQL-UDF and JS-UDF implementations of
-- scalar JSON extraction against edge cases (negative numbers, exponent
-- notation, escaped quotes, whitespace around ':', null values).

-- Extracts the scalar value for `key` from the JSON string `json`.
-- `key` must already be regex-escaped by the caller.
create temporary function f_json_extract_scalar(json string, key string, value_type string)
returns string as (
  case value_type
  when 'string' then
    json_extract_scalar(
      -- key should be escaped in advance.
      -- Capture the quoted value (handling backslash escapes), then let
      -- json_extract_scalar decode the escape sequences.
      regexp_extract(json, concat('"', key, r'"\s*:\s*("[^"\\]*(?:\\.[^"\\]*)*")'))
    , '$'
    )
  else
    -- key should be escaped in advance.
    -- Numeric value: optional sign, fraction, and exponent.
    -- JSON allows an optional '+' in the exponent (RFC 8259), hence [-+]?
    regexp_extract(json, concat('"', key, r'"\s*:\s*(-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)'))
  end
);
-- Reference implementation: let the JS runtime parse the JSON.
create temporary function f_json_extract_scalar_using_js(json string, key string)
returns string
language js as """
  return JSON.parse(json)[key];
""";
with t as (
  select '{"abc":1}' as json, 'abc' as key, 'int' as value_type
  union all
  select '{"abc":-1}' as json, 'abc' as key, 'int' as value_type
  union all
  select '{"abc":1e10}' as json, 'abc' as key, 'float' as value_type
  union all
  select '{"abc":1.2e-1}' as json, 'abc' as key, 'float' as value_type
  union all
  select '{"abc":1E-10}' as json, 'abc' as key, 'float' as value_type
  union all
  select '{"abc":0.1234}' as json, 'abc' as key, 'float' as value_type
  union all
  select '{"abc":"string"}' as json, 'abc' as key, 'string' as value_type
  union all
  select r'{"abc":"s\"tring"}' as json, 'abc' as key, 'string' as value_type
  union all
  select '{"abc" \t: \n"string"}' as json, 'abc' as key, 'string' as value_type
  union all
  select r'{"abc":"s\tring"}' as json, 'abc' as key, 'string' as value_type
  union all
  select r'{"abc":null}' as json, 'abc' as key, 'string' as value_type
), extracted_values as (
  -- Round-trip numeric values through their declared type so both UDFs
  -- are compared on canonicalized output.
  select
    json
    , case value_type
      when 'int' then cast(cast(f_json_extract_scalar(json, key, value_type) as int64) as string)
      when 'float' then cast(cast(f_json_extract_scalar(json, key, value_type) as float64) as string)
      when 'string' then f_json_extract_scalar(json, key, value_type)
      end as sql_udf_value
    , case value_type
      when 'int' then cast(cast(f_json_extract_scalar_using_js(json, key) as int64) as string)
      when 'float' then cast(cast(f_json_extract_scalar_using_js(json, key) as float64) as string)
      when 'string' then f_json_extract_scalar_using_js(json, key)
      end as js_udf_value
  from
    t
)
select
  json
  , sql_udf_value
  , js_udf_value
  -- NULL-safe equality: equal when both sides match or both are NULL.
  , (sql_udf_value = js_udf_value) or (sql_udf_value is null and js_udf_value is null) as equal
from
  extracted_values
;
-- +--------------------+---------------+--------------+-------+
-- | json               | sql_udf_value | js_udf_value | equal |
-- +--------------------+---------------+--------------+-------+
-- | {"abc":1}          | 1             | 1            | true  |
-- | {"abc":-1}         | -1            | -1           | true  |
-- | {"abc":1e10}       | 10000000000   | 10000000000  | true  |
-- | {"abc":1.2e-1}     | 0.12          | 0.12         | true  |
-- | {"abc":1E-10}      | 1e-10         | 1e-10        | true  |
-- | {"abc":0.1234}     | 0.1234        | 0.1234       | true  |
-- | {"abc":"string"}   | string        | string       | true  |
-- | {"abc":"s\"tring"} | s"tring       | s"tring      | true  |
-- | {"abc" :           | string        | string       | true  |
-- | "string"}          |               |              |       |
-- | {"abc":"s\tring"}  | s ring        | s ring       | true  |
-- | {"abc":null}       | NULL          | NULL         | true  |
-- +--------------------+---------------+--------------+-------+
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment