- Created the sample.orc file locally using the code in Creator.java.
- Uploaded it to HDFS and created an external table in Hive pointing to that location (queries in queries.hql; a rough sketch follows this list).
- Created a new table using Hive with a 128 MB stripe size and 64 KB buffer size, and inserted the data into it.
- A metadata dump of the ORC file created by Hive is in orc_dump.json.
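queries.hql itself isn't reproduced here, so the following is only a hypothetical sketch of the Hive steps described above: the table names, HDFS location, and abbreviated column list are all assumptions, and the table properties mirror the 128 MB / 64 KB settings mentioned in the list.

-- Hypothetical sketch; not the actual contents of queries.hql.
-- Table names, the HDFS location, and the (abbreviated) column list are assumptions.
CREATE EXTERNAL TABLE sample_ext (
  long0 BIGINT,
  long1 BIGINT,
  -- ... long2 through long121 (BIGINT), then string0 through string9 (STRING) ...
  string10 STRING
)
STORED AS ORC
LOCATION '/user/example/orc-sample';

-- New ORC table with a 128 MB stripe size and a 64 KB compression buffer,
-- populated from the external table.
CREATE TABLE sample_hive
STORED AS ORC
TBLPROPERTIES ('orc.stripe.size' = '134217728', 'orc.compress.size' = '65536')
AS SELECT * FROM sample_ext;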
ORC sample file creator (Creator.java)
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class Creator {

  public static void main(String[] args) throws IOException {
    TypeDescription schema = createSchema();
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Path testFilePath = new Path("sample.orc");
    fs.delete(testFilePath, false); // remove any leftover file from a previous run

    int batchSize = 50000;
    Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf)
        .setSchema(schema)
        .compress(CompressionKind.ZLIB)
        .stripeSize(128 * 1024 * 1024)  // 128 MB stripes
        .bufferSize(256 * 1024)         // 256 KB compression buffer
        .rowIndexStride(10000)
        .version(OrcFile.Version.V_0_12));

    // Fill one batch at a time and hand it to the writer; reset() reuses the
    // batch's column vectors instead of reallocating them on every iteration.
    VectorizedRowBatch batch = schema.createRowBatch(batchSize);
    int numRows = 200000000;
    int iters = numRows / batchSize;
    for (int iter = 0; iter < iters; iter++) {
      for (int i = 0; i < batchSize; ++i) {
        int row = batch.size++;
        appendRow(batch, row);
      }
      writer.addRowBatch(batch);
      batch.reset();
    }
    writer.close();
  }

  // Populate one row: 122 long columns followed by 11 string columns.
  private static void appendRow(VectorizedRowBatch batch, int row) {
    for (int i = 0; i < 122; i++) {
      ((LongColumnVector) batch.cols[i]).vector[row] = row * 300;
    }
    for (int i = 122; i < 133; i++) {
      ((BytesColumnVector) batch.cols[i]).setVal(row, Integer.toHexString(10 * row % 10000).getBytes());
    }
  }

  // Struct schema with 122 BIGINT fields (long0..long121) and
  // 11 STRING fields (string0..string10), 133 columns in total.
  private static TypeDescription createSchema() {
    TypeDescription td = TypeDescription.createStruct();
    for (int i = 0; i < 122; i++) {
      td.addField("long" + i, TypeDescription.createLong());
    }
    for (int i = 0; i < 11; i++) {
      td.addField("string" + i, TypeDescription.createString());
    }
    return td;
  }
}
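orc_dump.json holds stripe-level metadata of the Hive-written file. As a side note, here is a minimal sketch (not part of the gist) of how the same kind of information can be printed programmatically with the ORC reader API; "sample.orc" matches the writer above, and everything else is illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.StripeInformation;

public class Inspector {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Open the file written by Creator and print per-stripe layout,
    // i.e. the kind of information a metadata dump captures.
    Reader reader = OrcFile.createReader(new Path("sample.orc"),
        OrcFile.readerOptions(conf));
    System.out.println("rows=" + reader.getNumberOfRows()
        + " compression=" + reader.getCompressionKind());
    for (StripeInformation stripe : reader.getStripes()) {
      System.out.println("stripe offset=" + stripe.getOffset()
          + " dataLength=" + stripe.getDataLength()
          + " rows=" + stripe.getNumberOfRows());
    }
  }
}

(Hive's `--orcfiledump` utility produces dumps like orc_dump.json; recent versions accept a `-j` flag for JSON output, though the exact command used here isn't shown.)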
(orc_dump.json itself is too large to display in full here.)