Created
July 29, 2010 21:37
Revisions
-
balshor revised this gist
May 11, 2012 . 1 changed file with 2 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -4,7 +4,7 @@ import java.util.List; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -13,7 +13,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; @Description(name = "tokenize", value = "_FUNC_(doc) - emits (token, 1) for each token in the input document") public class TokenizeUDTF extends GenericUDTF { private PrimitiveObjectInspector stringOI = null; -
balshor renamed this gist
Jul 29, 2010 . 1 changed file with 0 additions and 0 deletions.There are no files selected for viewing
File renamed without changes. -
balshor renamed this gist
Jul 29, 2010 . 1 changed file with 0 additions and 0 deletions.There are no files selected for viewing
File renamed without changes. -
balshor created this gist
Jul 29, 2010 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,60 @@ package com.bizo.hive.udtf; import java.util.ArrayList; import java.util.List; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.description; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; @description(name = "tokenize", value = "_FUNC_(doc) - emits (token, 1) for each token in the input document") public class TokenizeUDTF extends GenericUDTF { private PrimitiveObjectInspector stringOI = null; @Override public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException { if (args.length != 1) { throw new UDFArgumentException("tokenize() takes exactly one argument"); } if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE && ((PrimitiveObjectInspector) args[0]).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) { throw new UDFArgumentException("tokenize() takes a string as a parameter"); } stringOI = (PrimitiveObjectInspector) args[0]; List<String> fieldNames = new ArrayList<String>(2); List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>(2); fieldNames.add("word"); fieldNames.add("cnt"); fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); fieldOIs.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector); return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs); } @Override public void process(Object[] record) throws HiveException { final String document = (String) stringOI.getPrimitiveJavaObject(record[0]); if (document == null) { return; } String[] tokens = document.split("\\s+"); for (String token : tokens) { forward(new Object[] { token, Integer.valueOf(1) }); } } @Override public void close() throws HiveException { // do nothing } }