Skip to content

Instantly share code, notes, and snippets.

@lmmx
Created August 29, 2025 02:11
Show Gist options
  • Save lmmx/23f9bdda00720639d947c7285a991ff5 to your computer and use it in GitHub Desktop.
Save lmmx/23f9bdda00720639d947c7285a991ff5 to your computer and use it in GitHub Desktop.
Generating an Avro schema from a JSON schema (JSON schema made with genson-cli from generated NDJSON)
import json
# Two sample Wikidata-style entities used as input for schema inference.
# The second row deliberately gives "aliases" as a bare string rather than a
# list, so the two rows disagree on that field's type.
examples = [
    {
        "id": "Q1",
        "aliases": ["Universe", "Cosmos"],  # list of strings
        "labels": {  # categorical map
            "en": "universe",
            "fr": "univers",
            "de": "Universum",
        },
        "description": {  # struct
            "id": 1,
            "text": "all of space and time",
        },
        "claims": [  # list of structs
            {"property": "P31", "value": {"id": "Q223557"}},
            {"property": "P279", "value": {"id": "Q35120"}},
        ],
    },
    {
        "id": "Q2",
        "aliases": "Earth",  # string instead of list[str]
        "labels": {
            "en": "Earth",
            "es": "Tierra",
        },
        "description": {
            "id": 2,
            "text": "our home planet",
        },
        "claims": [
            {"property": "P31", "value": {"id": "Q3504248"}},
        ],
    },
]

# Write one JSON document per line (NDJSON). Because ensure_ascii=False lets
# json.dumps emit raw non-ASCII characters, the file encoding is pinned to
# UTF-8 instead of depending on the platform's locale default.
with open("wikidata_example.ndjson", "w", encoding="utf-8") as f:
    for row in examples:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")
{"id": "Q1", "aliases": ["Universe", "Cosmos"], "labels": {"en": "universe", "fr": "univers", "de": "Universum"}, "description": {"id": 1, "text": "all of space and time"}, "claims": [{"property": "P31", "value": {"id": "Q223557"}}, {"property": "P279", "value": {"id": "Q35120"}}]}
{"id": "Q2", "aliases": "Earth", "labels": {"en": "Earth", "es": "Tierra"}, "description": {"id": 2, "text": "our home planet"}, "claims": [{"property": "P31", "value": {"id": "Q3504248"}}]}
{
"type": "record",
"name": "document",
"namespace": "wikidata_example_schema",
"fields": [
{
"name": "id",
"type": "string"
},
{
"name": "aliases",
"type": [
{
"type": "array",
"items": "string"
},
"string"
]
},
{
"name": "labels",
"type": {
"name": "labels",
"type": "record",
"namespace": "wikidata_example_schema.document_types",
"fields": [
{
"name": "en",
"type": "string"
},
{
"name": "fr",
"type": [
"null",
"string"
]
},
{
"name": "de",
"type": [
"null",
"string"
]
},
{
"name": "es",
"type": [
"null",
"string"
]
}
]
}
},
{
"name": "description",
"type": {
"name": "description",
"type": "record",
"namespace": "wikidata_example_schema.document_types",
"fields": [
{
"name": "id",
"type": "int"
},
{
"name": "text",
"type": "string"
}
]
}
},
{
"name": "claims",
"type": {
"type": "array",
"items": {
"name": "claims",
"type": "record",
"namespace": "wikidata_example_schema.document_types",
"fields": [
{
"name": "property",
"type": "string"
},
{
"name": "value",
"type": {
"name": "value",
"type": "record",
"namespace": "wikidata_example_schema.document_types.claims_types",
"fields": [
{
"name": "id",
"type": "string"
}
]
}
}
]
}
}
}
]
}
{
"$schema": "http://json-schema.org/schema#",
"properties": {
"id": {
"type": "string"
},
"aliases": {
"anyOf": [
{
"type": "array",
"items": {
"type": "string"
}
},
{
"type": "string"
}
]
},
"labels": {
"properties": {
"en": {
"type": "string"
},
"fr": {
"type": "string"
},
"de": {
"type": "string"
},
"es": {
"type": "string"
}
},
"required": [
"en"
],
"type": "object"
},
"description": {
"properties": {
"id": {
"type": "integer"
},
"text": {
"type": "string"
}
},
"required": [
"id",
"text"
],
"type": "object"
},
"claims": {
"type": "array",
"items": {
"properties": {
"property": {
"type": "string"
},
"value": {
"properties": {
"id": {
"type": "string"
}
},
"required": [
"id"
],
"type": "object"
}
},
"required": [
"property",
"value"
],
"type": "object"
}
}
},
"required": [
"aliases",
"claims",
"description",
"id",
"labels"
],
"type": "object"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment