Skip to content

Instantly share code, notes, and snippets.

@balshor
Created July 29, 2010 21:37

Revisions

  1. balshor revised this gist May 11, 2012. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions TokenizeUDTF.java
    Original file line number Diff line number Diff line change
    @@ -4,7 +4,7 @@
    import java.util.List;

    import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
    import org.apache.hadoop.hive.ql.exec.description;
    import org.apache.hadoop.hive.ql.exec.Description;
    import org.apache.hadoop.hive.ql.metadata.HiveException;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    @@ -13,7 +13,7 @@
    import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

    @description(name = "tokenize", value = "_FUNC_(doc) - emits (token, 1) for each token in the input document")
    @Description(name = "tokenize", value = "_FUNC_(doc) - emits (token, 1) for each token in the input document")
    public class TokenizeUDTF extends GenericUDTF {

    private PrimitiveObjectInspector stringOI = null;
  2. balshor renamed this gist Jul 29, 2010. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  3. balshor renamed this gist Jul 29, 2010. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  4. balshor created this gist Jul 29, 2010.
    60 changes: 60 additions & 0 deletions gistfile1.java
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,60 @@
    package com.bizo.hive.udtf;

    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Pattern;

    import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
    import org.apache.hadoop.hive.ql.exec.description;
    import org.apache.hadoop.hive.ql.metadata.HiveException;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
    import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

    @description(name = "tokenize", value = "_FUNC_(doc) - emits (token, 1) for each token in the input document")
    public class TokenizeUDTF extends GenericUDTF {

        /** Matches a run of one or more whitespace characters; compiled once rather than per row. */
        private static final Pattern WHITESPACE = Pattern.compile("\\s+");

        /** Inspector for the single string argument; assigned in {@link #initialize}. */
        private PrimitiveObjectInspector stringOI = null;

        /**
         * Validates the argument list and declares the output schema.
         *
         * @param args inspectors for the call's arguments; exactly one string argument is accepted
         * @return a struct inspector with two columns: {@code word} (string) and {@code cnt} (int)
         * @throws UDFArgumentException if the argument count is not one, or the argument is not a
         *         primitive string
         */
        @Override
        public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
            if (args.length != 1) {
                throw new UDFArgumentException("tokenize() takes exactly one argument");
            }

            // Reject the argument if it is not primitive OR is a primitive other than STRING.
            // The category test must come first: it short-circuits so the cast below is only
            // reached for primitive inspectors and cannot fail with a ClassCastException.
            if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE
                    || ((PrimitiveObjectInspector) args[0]).getPrimitiveCategory()
                            != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
                throw new UDFArgumentException("tokenize() takes a string as a parameter");
            }

            stringOI = (PrimitiveObjectInspector) args[0];

            List<String> fieldNames = new ArrayList<String>(2);
            List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>(2);
            fieldNames.add("word");
            fieldNames.add("cnt");
            fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
            fieldOIs.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector);
            return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
        }

        /**
         * Splits the input document on whitespace and forwards one {@code (token, 1)} row per
         * non-empty token. A null document produces no rows.
         *
         * @param record the argument values for the current row; {@code record[0]} is the document
         * @throws HiveException if forwarding a row fails
         */
        @Override
        public void process(Object[] record) throws HiveException {
            final String document = (String) stringOI.getPrimitiveJavaObject(record[0]);

            if (document == null) {
                return;
            }

            for (String token : WHITESPACE.split(document)) {
                // An empty document or leading whitespace makes split() yield an empty first
                // element; that is not a real token, so it is not emitted.
                if (token.isEmpty()) {
                    continue;
                }
                forward(new Object[] { token, Integer.valueOf(1) });
            }
        }

        /** No resources are held, so there is nothing to release. */
        @Override
        public void close() throws HiveException {
            // do nothing
        }
    }