- Created the sample.orc file locally using the code in Creator.java.
- Uploaded it to HDFS and created an external table in Hive pointing to that location (queries in queries.hql; a sketch of the upload follows the Creator.java code).
- Created a new table with Hive, using a 128 MB stripe size and a 64 KB buffer size, and inserted the data into it.
- A metadata dump of the ORC file created by Hive is in orc_dump.json (a sketch of reading the same metadata programmatically follows this list).
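The gist does not show how orc_dump.json was generated; presumably it came from a tool such as Hive's --orcfiledump utility, which can emit JSON. As a minimal sketch, the same kind of file-level and stripe-level metadata can also be read programmatically through the ORC Reader API. The class name and the hard-coded sample.orc path below are illustrative assumptions, not part of the gist:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.StripeInformation;

public class MetadataDump {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Assumed path: the file written by Creator.java below.
        Reader reader = OrcFile.createReader(new Path("sample.orc"),
                OrcFile.readerOptions(conf));
        System.out.println("rows:        " + reader.getNumberOfRows());
        System.out.println("compression: " + reader.getCompressionKind());
        System.out.println("schema:      " + reader.getSchema());
        // One line per stripe: byte offset, data length, row count.
        for (StripeInformation stripe : reader.getStripes()) {
            System.out.printf("stripe @ %d: %d bytes, %d rows%n",
                    stripe.getOffset(), stripe.getDataLength(),
                    stripe.getNumberOfRows());
        }
    }
}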
ORC Sample file creator
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class Creator {

    public static void main(String[] args) throws IOException {
        TypeDescription schema = createSchema();
        Configuration conf = new Configuration();

        // Remove any leftover file from a previous run.
        FileSystem fs = FileSystem.getLocal(conf);
        Path testFilePath = new Path("sample.orc");
        fs.delete(testFilePath, false);

        int batchSize = 50000;

        // ZLIB compression, 128 MB stripes, 256 KB compression buffers,
        // and a row index entry every 10,000 rows.
        Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf)
                .setSchema(schema)
                .compress(CompressionKind.ZLIB)
                .stripeSize(128 * 1024 * 1024)
                .bufferSize(256 * 1024)
                .rowIndexStride(10000)
                .version(OrcFile.Version.V_0_12));

        // Write 200 million rows in batches of 50,000 (4,000 batches).
        VectorizedRowBatch batch = schema.createRowBatch(batchSize);
        int numRows = 200000000;
        int iters = numRows / batchSize;
        for (int iter = 0; iter < iters; iter++) {
            for (int i = 0; i < batchSize; ++i) {
                int row = batch.size++;
                appendRow(batch, row);
            }
            writer.addRowBatch(batch);
            batch.reset();
        }
        writer.close();
    }

    // Fills one row: columns 0-121 are longs, columns 122-132 are strings.
    private static void appendRow(VectorizedRowBatch batch, int row) {
        for (int i = 0; i < 122; i++) {
            ((LongColumnVector) batch.cols[i]).vector[row] = row * 300;
        }
        for (int i = 122; i < 133; i++) {
            ((BytesColumnVector) batch.cols[i]).setVal(row,
                    Integer.toHexString(10 * row % 10000).getBytes(StandardCharsets.UTF_8));
        }
    }

    // 122 long columns (long0..long121) and 11 string columns (string0..string10).
    private static TypeDescription createSchema() {
        TypeDescription td = TypeDescription.createStruct();
        for (int i = 0; i < 122; i++) {
            td.addField("long" + i, TypeDescription.createLong());
        }
        for (int i = 0; i < 11; i++) {
            td.addField("string" + i, TypeDescription.createString());
        }
        return td;
    }
}
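Once Creator.java has written sample.orc locally, the second step above copies it to HDFS before the statements in queries.hql are run. Below is a minimal sketch of that upload using the Hadoop FileSystem API; the destination directory /tmp/orc_sample is a hypothetical placeholder, since the gist does not record the actual path:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class Uploader {
    public static void main(String[] args) throws IOException {
        // Assumes fs.defaultFS in the classpath configuration points at the cluster.
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical destination; the external table in queries.hql
        // would point at whatever directory is actually used.
        fs.copyFromLocalFile(new Path("sample.orc"),
                new Path("/tmp/orc_sample/sample.orc"));
        fs.close();
    }
}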
orc_dump.json (metadata dump of the Hive-written ORC file) is too large to display inline; see the gist for the full file.