Last active
August 29, 2015 14:14
-
-
Save takegue/2ee794dad639cd89e8ef to your computer and use it in GitHub Desktop.
A thin Python wrapper around the MeCab command-line tool, driven through subprocess
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding:utf-8 -*- | |
import subprocess | |
import itertools as itt | |
class MeCab(object):
    """Line-oriented wrapper around a long-running ``mecab`` child process.

    One ``mecab`` process is started per instance and kept alive; every input
    line is written to its stdin and the matching result is read back from
    its stdout.  ``self.parse`` is bound in ``__init__`` to either
    ``wakati_parse`` or ``default_parse`` depending on the options given.

    Input lines must not contain newline characters themselves: one written
    line is assumed to produce exactly one result.

    The instance can be used as a context manager; leaving the ``with`` block
    (or calling ``close()``) shuts the child process down.
    """

    def __init__(self, opts=None):
        """Start ``mecab`` with the given extra command-line options.

        Args:
            opts: iterable of command-line arguments appended after
                ``mecab`` (for example ``['-Owakati']``).  ``None`` means
                no extra arguments.
        """
        # A fresh list per call avoids sharing one mutable default object
        # between every instance created without arguments.
        self.opts = [] if opts is None else opts
        self._process = subprocess.Popen(
            list(itt.chain(['mecab'], self.opts)),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            universal_newlines=True,
        )
        # 'wakati' alone covers the two-argument spelling ['-O', 'wakati'].
        if '-Owakati' in self.opts or 'wakati' in self.opts:
            self.parse = self.wakati_parse
        else:
            self.parse = self.default_parse

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the pipes to the child process and wait for it to exit."""
        for stream in (self._process.stdin, self._process.stdout):
            if stream is not None and not stream.closed:
                stream.close()
        self._process.wait()

    def _send(self, line):
        """Write one input line to the child and push it through the pipe."""
        stdin = self._process.stdin
        stdin.write(line + '\n')
        # The pipe may be buffered; without an explicit flush the child never
        # sees the line and the following readline() would block forever.
        stdin.flush()

    def _receive(self):
        """Read one raw output line, failing loudly if the child is gone."""
        output = self._process.stdout.readline()
        # readline() returns '' only at end of file; a blank line is '\n'.
        if not output:
            raise RuntimeError('mecab closed its output stream unexpectedly')
        return output

    def wakati_parse(self, iterable):
        """Yield, for each input line, the list of whitespace-separated tokens.

        Args:
            iterable: iterable of input strings, one sentence per item.

        Yields:
            list of str: the tokens of the corresponding output line.

        Raises:
            RuntimeError: if the child process stops producing output.
        """
        for line in iterable:
            self._send(line)
            yield self._receive().strip().split()

    def default_parse(self, iterable):
        """Yield, for each input line, the list of parsed ``Morph`` objects.

        Output lines are collected until the end-of-sentence marker line
        ``EOS`` is read.

        Args:
            iterable: iterable of input strings, one sentence per item.

        Yields:
            list of Morph: one entry per output line of the sentence.

        Raises:
            RuntimeError: if the child process stops producing output.
        """
        for line in iterable:
            self._send(line)
            morphs = []
            while True:
                output = self._receive().strip()
                # Compare the whole line: a prefix test would also match a
                # token whose surface merely begins with "EOS" and cut the
                # sentence short, leaving the stream out of step.
                if output == 'EOS':
                    break
                morphs.append(Morph.from_mecab_format(output))
            yield morphs
class Morph(object):
    """A single morpheme: its surface form plus a list of feature strings.

    Attributes are plain instance fields:
      surface -- the token text
      attrs   -- the comma-separated feature fields, as a list of str
      pos     -- the first feature field (``attrs[0]``)
    """

    def __init__(self):
        self.surface = ''
        self.attrs = []
        # Initialised here so that every instance has the attribute, not
        # only those built through from_mecab_format().
        self.pos = ''

    @classmethod
    def from_mecab_format(cls, line):
        """Build a morpheme from one ``surface<TAB>feat1,feat2,...`` line.

        The surface and the feature list are separated at the first tab, so
        feature fields may themselves contain spaces.  A line without any
        tab falls back to splitting at the first run of whitespace.

        Args:
            line: one output line; surrounding whitespace is ignored.

        Returns:
            A new instance of ``cls`` with ``surface``, ``attrs`` and
            ``pos`` filled in.

        Raises:
            ValueError: if the line does not contain two fields.
        """
        text = line.strip()
        surface, tab, features = text.partition('\t')
        if not tab:
            fields = text.split(None, 1)
            if len(fields) != 2:
                raise ValueError('not a MeCab morpheme line: %r' % (line,))
            surface, features = fields
        attrs = features.split(',')
        # cls() rather than Morph() so subclasses get instances of their
        # own type from this alternate constructor.
        morph = cls()
        morph.surface = surface
        morph.attrs = attrs
        morph.pos = attrs[0]
        return morph
def test_mecab():
    """Smoke test: wakati-mode segmentation of a well-known sample sentence.

    Requires the ``mecab`` executable to be available on the PATH.
    """
    expected = ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']
    tokenizer = MeCab(['-Owakati'])
    results = tokenizer.parse(['すもももももももものうち'])
    # Only the first (and only) sentence is pulled from the generator.
    morphs = next(results)
    assert morphs == expected
# Script entry point: run the smoke test when executed directly.
if __name__ == '__main__':
    test_mecab()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment