Created
April 9, 2026 11:49
-
-
Save m-rey/5d38522066bf010968bb225183318bc1 to your computer and use it in GitHub Desktop.
Get a grasp of long, repetitive JSON files by returning their underlying structure
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # structure.jq | |
| # ============= | |
| # Summarizes the structure of any JSON input by recursively collapsing | |
| # repeated elements and collecting deduplicated values. | |
| # | |
| # Each scalar leaf becomes: | |
| # {"type": "<jq-type>", "values": [<deduplicated values>]} | |
| # | |
| # Arrays of objects are merged into a single representative object, where | |
| # each key's values are collected across all instances and recursed into. | |
| # | |
| # Arrays of scalars or arrays are unioned and deduplicated into a flat | |
| # {"type": "...", "values": [...]} node. | |
| # | |
| # If a key sometimes holds a bare scalar and sometimes an array of the same | |
| # type (a common shorthand in real-world JSON), both are normalized to the | |
| # array form and unioned. The "type" field will reflect both origins, e.g. | |
| # ["string", "array"], so the shorthand is visible in the output. | |
| # | |
| # Usage Example: | |
| # jq -L 'path/to/directory/containing/structure.jq' 'include "structure"; structure' 'input.json' | |
| # | |
| # ─── Example ──────────────────────────────────────────────────────────────── | |
| # | |
| # Input: | |
| # { | |
| # "users": [ | |
| # { "id": 1, "name": "Alice", "roles": "admin", "active": true }, | |
| # { "id": 2, "name": "Bob", "roles": ["user", "mod"], "active": false }, | |
| # { "id": 3, "name": "Alice", "roles": ["user"], "active": false } | |
| # ], | |
| # "meta": { "count": 3, "tag": "main" }, | |
| # "flags": [true, false, true] | |
| # } | |
| # | |
| # Output: | |
| # { | |
| # "users": [{ | |
| # "id": { "type": "number", "values": [1, 2, 3] }, | |
| # "name": { "type": "string", "values": ["Alice", "Bob"] }, | |
| # "roles": { "type": ["string", "array"], "values": ["admin", "mod", "user"] }, | |
| # "active": { "type": "boolean", "values": [false, true] } | |
| # }], | |
| # "meta": { | |
| # "count": { "type": "number", "values": [3] }, | |
| # "tag": { "type": "string", "values": ["main"] } | |
| # }, | |
| # "flags": { "type": "boolean", "values": [false, true] } | |
| # } | |
| # | |
| # Note how "roles" transparently merges the bare string "admin" with the | |
| # arrays ["user","mod"] and ["user"], signalling via ["string","array"] that | |
| # the shorthand pattern was present in the input. | |
| # ──────────────────────────────────────────────────────────────────────────── | |
# structure: summarize the shape of the JSON value on input (`.`).
#
# Input:  any JSON value.
# Output:
#   * object            -> object with the same keys, each value summarized recursively
#   * scalar            -> {"type": <jq type name>, "values": [<the value>]}
#   * empty array       -> {"type": "array", "values": []}
#   * array of objects  -> [<one merged object>], where every key holds the summary
#                          of all values seen under that key across the instances
#   * array of arrays   -> {"type": "array", "values": <one-level union, deduplicated>}
#   * array of scalars of one type
#                       -> {"type": <that type>, "values": <deduplicated values>}
#   * scalars mixed with arrays of the same scalar type
#                       -> {"type": [<that type>, "array"], "values": <union, deduplicated>}
#   * anything else     -> {"type": [<all element types>], "values": <deduplicated elements>}
#
# "values" arrays are produced by `unique`, so they are sorted as well as deduplicated.
def structure:
  if type == "object" then
    # Recurse into each value, preserving all keys as-is.
    with_entries(.value |= structure)
  elif type == "array" then
    # Compute the three views of the array once instead of once per branch.
    # `as` bindings do not change `.`, so `.` below is still the input array.
    ([.[] | type] | unique) as $types                             # distinct element types
    | [.[] | if type == "array" then .[] else . end] as $flat     # nested arrays unwrapped one level
    | ([$flat[] | type] | unique) as $flat_types                  # distinct types after unwrapping
    | if length == 0 then
        # Nothing to infer from an empty array.
        {"type": "array", "values": []}
      elif $types == ["object"] then
        # Array of objects: merge all instances into one representative object.
        # Each key accumulates its values across instances into a list
        # (null + [x] is [x], so a key's first occurrence starts the list),
        # then that list is summarized recursively.
        # The result is wrapped in [...] to signal "this came from a list of objects".
        [
          reduce .[] as $obj (
            {};
            reduce ($obj | to_entries[]) as $e (.; .[$e.key] += [$e.value])
          )
          | with_entries(.value |= structure)
        ]
      elif $types == ["array"] then
        # Array of arrays: flatten one level and deduplicate.
        # e.g. [["a","b"],["b","c"]] -> ["a","b","c"]
        {"type": "array", "values": ($flat | unique)}
      elif ($flat_types | length) == 1 then
        # Every element is either a value of one base type or an array of such
        # values. Real-world JSON often omits the array wrapper when there is
        # only one value, so "foo" and ["foo","bar"] are treated equivalently.
        {
          "type": (
            if ($types | length) > 1
            then [$flat_types[0], "array"]  # both bare values and arrays were present
            else $flat_types[0]             # only bare values of one type: report that type
                                            # (the arrays-only case was handled above)
            end
          ),
          "values": ($flat | unique)
        }
      else
        # Truly heterogeneous array: no normalization possible.
        # Report all distinct types present so nothing is hidden.
        {"type": $types, "values": unique}
      end
  else
    # Scalar (string, number, boolean, null): wrap in a single-element values
    # array so the output shape is uniform with the array branches above.
    {"type": type, "values": [.]}
  end;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment