ORC Sample file creator
  1. Created the sample.orc file locally using the code in Creator.java.
  2. Uploaded it to HDFS and created an external Hive table pointing at that location (queries in queries.hql).
  3. Created a new table in Hive with a 128 MB stripe size and a 64 KB buffer size, and inserted the data into it.
  4. Dumped the metadata of the ORC file created by Hive to orc_dump.json; a sketch of an equivalent programmatic check follows this list.
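For reference, the file-level metadata captured in orc_dump.json can also be pulled programmatically with the ORC reader API. The following is a minimal sketch and not part of the original gist: the class name MetadataDump is hypothetical, and it assumes the same orc-core dependency used by Creator.java plus the sample.orc file produced in step 1.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.StripeInformation;

public class MetadataDump {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // Open the file written by Creator.java (step 1).
    Reader reader = OrcFile.createReader(new Path("sample.orc"), OrcFile.readerOptions(conf));

    // Top-level file metadata, the same kind of information orc_dump.json captures.
    System.out.println("rows: " + reader.getNumberOfRows());
    System.out.println("compression: " + reader.getCompressionKind());
    System.out.println("schema: " + reader.getSchema());

    // Per-stripe layout: offsets and data sizes show the effective stripe size.
    for (StripeInformation stripe : reader.getStripes()) {
      System.out.println("stripe at " + stripe.getOffset()
          + ": " + stripe.getNumberOfRows() + " rows, "
          + stripe.getDataLength() + " data bytes");
    }
  }
}

Creator.java: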
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class Creator {

  public static void main(String[] args) throws IOException {
    TypeDescription schema = createSchema();

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Path testFilePath = new Path("sample.orc");
    fs.delete(testFilePath, false); // remove any output from a previous run

    int batchSize = 50000;
    // 128 MB stripes, 256 KB compression buffers, row index entry every 10,000 rows.
    Writer writer = OrcFile.createWriter(testFilePath,
        OrcFile.writerOptions(conf)
            .setSchema(schema)
            .compress(CompressionKind.ZLIB)
            .stripeSize(128 * 1024 * 1024)
            .bufferSize(256 * 1024)
            .rowIndexStride(10000)
            .version(OrcFile.Version.V_0_12));

    VectorizedRowBatch batch = schema.createRowBatch(batchSize);
    int numRows = 200000000;
    int iters = numRows / batchSize;
    for (int iter = 0; iter < iters; iter++) {
      for (int i = 0; i < batchSize; ++i) {
        int row = batch.size++;
        appendRow(batch, row);
      }
      writer.addRowBatch(batch);
      batch.reset(); // batch.size goes back to 0, so row indices repeat each iteration
    }
    writer.close();
  }

  // Fills one row: 122 long columns followed by 11 string columns.
  private static void appendRow(VectorizedRowBatch batch, int row) {
    for (int i = 0; i < 122; i++) {
      ((LongColumnVector) batch.cols[i]).vector[row] = row * 300;
    }
    for (int i = 122; i < 133; i++) {
      ((BytesColumnVector) batch.cols[i]).setVal(row, Integer.toHexString(10 * row % 10000).getBytes());
    }
  }

  // Struct schema with 122 long fields (long0..long121) and 11 string fields (string0..string10).
  private static TypeDescription createSchema() {
    TypeDescription td = TypeDescription.createStruct();
    for (int i = 0; i < 122; i++) {
      td.addField("long" + i, TypeDescription.createLong());
    }
    for (int i = 0; i < 11; i++) {
      td.addField("string" + i, TypeDescription.createString());
    }
    return td;
  }
}
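As a quick sanity check of the writer settings above, the rows can be read back with the same vectorized API. This is a minimal sketch and not part of the original gist: the class name ReadBack is hypothetical, and it assumes sample.orc sits in the working directory. Reading in batches mirrors the writer's use of VectorizedRowBatch, which is what keeps both paths efficient at 200 million rows.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;

public class ReadBack {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Reader reader = OrcFile.createReader(new Path("sample.orc"), OrcFile.readerOptions(conf));

    // Iterate over the file batch by batch, counting rows.
    RecordReader rows = reader.rows();
    VectorizedRowBatch batch = reader.getSchema().createRowBatch();
    long count = 0;
    while (rows.nextBatch(batch)) {
      count += batch.size;
    }
    rows.close();
    System.out.println("read " + count + " rows"); // expect 200000000
  }
}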