Created
April 10, 2017 09:14
-
-
Save hn5092/7c84f33d395967a2cb33d9b6ba6342bb to your computer and use it in GitHub Desktop.
hive python udf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf-8
import logging | |
import traceback | |
import jieba | |
import jieba.analyse | |
import redis | |
import sys | |
import os | |
import json | |
import codecs | |
# Hive usage:
#   ADD FILE /home/admin/chenyun/tmp/udf_jieba.py;
#   SELECT TRANSFORM (feature) USING 'python udf_jieba.py' AS segment
#   FROM tablename WHERE pt='20160329' LIMIT 10;
# Python 2 only: make UTF-8 the implicit str<->unicode codec so that unicode
# tokens can be written to a piped stdout. On Python 3 `reload` is not a
# builtin and `sys.setdefaultencoding` does not exist; only those two expected
# failures are ignored (the original bare `except:` hid everything).
try:
    reload(sys)
    sys.setdefaultencoding('utf-8')
except (NameError, AttributeError):
    pass

# Load the user dictionary from the Redis list "cy_dict_my". Each entry is
# split on "," and the first two fields are passed to jieba.add_word as
# (word, frequency).
pool = redis.ConnectionPool(host='hostname', port=9000)
r = redis.StrictRedis(connection_pool=pool)
for entry in r.lrange("cy_dict_my", 0, -1):
    # On Python 3 the client returns bytes; on Python 2 bytes is str, so this
    # branch is skipped and the entry is handled exactly as before.
    if isinstance(entry, bytes) and not isinstance(entry, str):
        entry = entry.decode('utf-8')
    fields = entry.split(",")
    word = fields[0]
    if not word:
        # Nothing to register for an empty entry.
        continue
    # An entry without a frequency used to raise IndexError and abort the
    # whole task before any row was processed. Passing None lets jieba pick
    # a frequency itself.
    freq = fields[1] if len(fields) > 1 else None
    jieba.add_word(word, freq or None)

jieba.enable_parallel(8)

# Read rows from stdin (Hive TRANSFORM feeds one row per line) and emit one
# token per output line. Failures are recorded in /tmp/myhive.log; the log is
# truncated on every run because it is opened with mode 'w'.
with codecs.open('/tmp/myhive.log', 'w') as log:
    try:
        for line in sys.stdin:
            # Drop the line terminator first: otherwise it comes back from
            # jieba as a token of its own and produces blank output rows.
            for word in jieba.cut(line.rstrip("\r\n")):
                # sys.stdout.write is valid on both Python 2 and 3, unlike
                # the `print word` statement.
                sys.stdout.write(word + "\n")
    except Exception as e:
        # Best-effort: the error is logged and the script still exits
        # normally, as before. The traceback and sys.path are written on
        # separate lines so the log is actually readable.
        log.write(str(e) + "\n")
        log.write(traceback.format_exc())
        log.write(str(sys.path) + "\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment