Created
May 23, 2023 01:16
-
-
Save r39132/f5324fa46449f6e6f78114239d0b8035 to your computer and use it in GitHub Desktop.
ChatGPT Json-to-Parquet-Converter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.File;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.EncoderFactory;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
public class JsonToParquetConverter { | |
public static void main(String[] args) throws IOException { | |
// Load the JSON data into a Jackson JSON tree | |
ObjectMapper mapper = new ObjectMapper(); | |
JsonNode rootNode = mapper.readTree(new File("data.json")); | |
// Create an Avro schema to represent the JSON data | |
Schema schema = createAvroSchema(rootNode); | |
// Create a Parquet writer using the Avro schema | |
Configuration conf = new Configuration(); | |
FileSystem fs = FileSystem.get(conf); | |
Path outputPath = new Path("data.parquet"); | |
ParquetWriter<GenericRecord> writer = AvroParquetWriter | |
.<GenericRecord>builder(outputPath) | |
.withSchema(schema) | |
.withConf(conf) | |
.build(); | |
// Convert each JSON record to a GenericRecord and write it to the Parquet file | |
for (JsonNode jsonNode : rootNode) { | |
GenericRecord record = createGenericRecord(jsonNode, schema); | |
writer.write(record); | |
} | |
// Close the Parquet writer | |
writer.close(); | |
} | |
private static Schema createAvroSchema(JsonNode jsonNode) { | |
// Create an Avro schema based on the JSON data structure | |
// This example assumes that all JSON records have the same structure | |
Schema.Parser parser = new Schema.Parser(); | |
Schema schema = parser.parse(jsonNode.get(0).toString()); | |
return schema; | |
} | |
private static GenericRecord createGenericRecord(JsonNode jsonNode, Schema schema) { | |
// Convert a JSON record to a GenericRecord using the Avro schema | |
GenericRecordBuilder recordBuilder = new GenericRecordBuilder(schema); | |
for (Schema.Field field : schema.getFields()) { | |
String fieldName = field.name(); | |
if (jsonNode.has(fieldName)) { | |
JsonNode fieldValue = jsonNode.get(fieldName); | |
recordBuilder.set(fieldName, createAvroValue(fieldValue, field.schema())); | |
} | |
} | |
GenericRecord record = recordBuilder.build(); | |
return record; | |
} | |
private static Object createAvroValue(JsonNode jsonNode, Schema schema) { | |
// Convert a JSON value to an Avro value based on the Avro schema type | |
switch (schema.getType()) { | |
case BOOLEAN: | |
return jsonNode.booleanValue(); | |
case INT: | |
return jsonNode.intValue(); | |
case LONG: | |
return jsonNode.longValue(); | |
case FLOAT: | |
return jsonNode.floatValue(); | |
case DOUBLE: | |
return jsonNode.doubleValue(); | |
case STRING: | |
return jsonNode.textValue(); | |
case BYTES: | |
return jsonNode.binaryValue(); | |
case RECORD: | |
return createGenericRecord(jsonNode, schema); | |
default: | |
throw new IllegalArgumentException("Unsupported Avro schema type: " + schema.getType()); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment