Created
October 19, 2017 04:43
-
-
Save zackster/649555aa3d4e6d6b046627d93490b0d6 to your computer and use it in GitHub Desktop.
Serverless fastText implementation. I have a blog post on the Code For Cash blog (blog.codefor.cash) that discusses what else is needed to run fastText in a serverless AWS Lambda environment: compiling fastText on EC2 for Linux, including NLTK in the root directory of the zip, etc.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string | |
import re | |
import nltk | |
def normalize(text):
    """Return *text* reduced to a space-separated string of cleaned tokens.

    Steps, in order: delete every ASCII punctuation character, tokenize with
    NLTK's ``word_tokenize``, lowercase each token, keep only tokens that are
    entirely alphanumeric, and drop English stop words.

    Args:
        text: The raw text to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords

    # Punctuation is deleted outright (not replaced by a space), so
    # characters on either side of a punctuation mark end up adjacent.
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    stripped = re.sub(punctuation_class, '', text)

    # NOTE: if the NLTK data is not on the default search path, it can be
    # registered first via nltk.data.path.append("/nltk_data").
    lowered = [token.lower() for token in word_tokenize(stripped)]

    # Keep alphanumeric tokens only (digits are allowed, not just letters).
    alnum_tokens = [token for token in lowered if token.isalnum()]

    # A set gives constant-time stop-word membership tests.
    english_stop_words = set(stopwords.words('english'))
    kept = [token for token in alnum_tokens if token not in english_stop_words]
    return ' '.join(kept)
def find_tags(lambda_input):
    """Predict tags for a job ad by piping its normalized text to fastText.

    The text is cleaned with ``normalize`` and written to the standard input
    of ``./fasttext predict-prob model_alnum_gom.bin - 4`` (``-`` makes
    fastText read from stdin; the trailing ``4`` is the number of labels
    requested).

    Args:
        lambda_input: The raw job-ad text.

    Returns:
        The fastText process's standard output, stripped of surrounding
        whitespace and decoded as UTF-8 text.
    """
    from subprocess import PIPE, Popen

    job_ad_text = normalize(lambda_input)
    # The pipes are opened in binary mode, so the child must be fed bytes.
    # Encoding explicitly avoids a TypeError on Python 3 and an implicit
    # ASCII encode (which fails on non-ASCII text) on Python 2.
    if not isinstance(job_ad_text, bytes):
        job_ad_text = job_ad_text.encode('utf-8')

    p = Popen(['./fasttext', 'predict-prob', 'model_alnum_gom.bin', '-', '4'],
              stdout=PIPE, stdin=PIPE, stderr=PIPE)
    # communicate() writes the input, closes stdin and drains both output
    # pipes; stderr is captured but not used.
    stdout_data = p.communicate(input=job_ad_text)[0]
    # Decode so the caller receives text rather than raw bytes.
    return stdout_data.strip().decode('utf-8')
def handler(event, context):
    """Entry point: tag the job ad found under ``event['job_ad_text']``.

    Args:
        event: Mapping that must contain the key ``'job_ad_text'``.
        context: Unused by this function.

    Returns:
        Whatever ``find_tags`` returns for the supplied text.
    """
    raw_text = event['job_ad_text']
    return find_tags(raw_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment