Last active
August 29, 2015 14:14
Revisions
-
takegue revised this gist
Jan 24, 2015. 1 changed file with 10 additions and 5 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -4,14 +4,17 @@ import subprocess import itertools as itt class MeCab(): def __init__(self, opts=[]): self.opts = opts self._process = subprocess.Popen( list(itt.chain(['mecab'], opts)), stdin=subprocess.PIPE, stdout=subprocess.PIPE, universal_newlines =True, ) if '-Owakati' in opts or 'wakati' in opts: self.parse = self.wakati_parse @@ -30,7 +33,8 @@ def default_parse(self, iterable): self._process.stdin.write(line+'\n') while True: output = self._process.stdout.readline().strip() if output.startswith('EOS'): break buff.append(Morph.from_mecab_format(output)) yield buff buff = [] @@ -53,10 +57,11 @@ def from_mecab_format(cls, line): morph.pos = attrs[0] return morph def test_mecab(): parser = MeCab(['-Owakati']) morphs = next(parser.parse(['すもももももももものうち'])) assert morphs == ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'] if __name__ == '__main__': -
takegue revised this gist
Jan 24, 2015. No changes. There are no files selected for viewing.
-
takegue created this gist
Jan 24, 2015. There are no files selected for viewing.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Thin wrapper around the ``mecab`` command-line morphological analyzer.

One long-lived ``mecab`` child process is started per ``MeCab`` instance.
Each input sentence is written to its stdin as a single line and the
analysis is read back from its stdout.
"""

import subprocess
import itertools as itt

# Public API; keeps ``from <module> import *`` from also exporting the
# self-test below, which needs the real ``mecab`` binary to run.
__all__ = ['MeCab', 'Morph']


class MeCab():
    """Drive a persistent ``mecab`` subprocess.

    ``parse`` is bound at construction time: it is ``wakati_parse`` when the
    options contain ``'-Owakati'`` or ``'wakati'`` and ``default_parse``
    otherwise. Call ``close()`` (or use the instance as a context manager)
    to shut the child process down.
    """

    def __init__(self, opts=None):
        """Start ``mecab`` with the given extra command-line arguments.

        opts: iterable of argument strings such as ``['-Owakati']``.
            ``None`` (the default) means no extra arguments.
        """
        # Copy the options so that neither a shared default nor the
        # caller's own list is aliased by ``self.opts``.
        self.opts = [] if opts is None else list(opts)
        self._process = subprocess.Popen(
            list(itt.chain(['mecab'], self.opts)),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            # Text-mode pipes: ``str`` is written and read, not bytes.
            universal_newlines=True,
        )
        if '-Owakati' in self.opts or 'wakati' in self.opts:
            self.parse = self.wakati_parse
        else:
            self.parse = self.default_parse

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Close both pipes and wait for ``mecab`` to exit.

        Safe to call more than once.
        """
        process = self._process
        for pipe in (process.stdin, process.stdout):
            if pipe is not None and not pipe.closed:
                pipe.close()
        process.wait()

    def _send(self, line):
        """Write one input line to ``mecab`` and push it through the pipe."""
        stdin = self._process.stdin
        stdin.write(line + '\n')
        # The pipe may be buffered; without a flush the child might never
        # see the line and the following readline() would block forever.
        stdin.flush()

    def wakati_parse(self, iterable):
        """Yield, for every input line, the list of space-separated tokens.

        Expects one output line per input line (the wakati format).
        """
        for line in iterable:
            self._send(line)
            output = self._process.stdout.readline()
            yield output.strip().split()

    def default_parse(self, iterable):
        """Yield, for every input line, a list of ``Morph`` objects.

        Output lines are read until a line consisting of ``EOS``.

        Raises:
            ValueError: if the output ends before ``EOS`` is seen, or if a
                line cannot be split into surface and features.
        """
        for line in iterable:
            self._send(line)
            buff = []
            while True:
                output = self._process.stdout.readline()
                if not output:
                    # readline() returns '' only at end of file, i.e. the
                    # child closed its output in the middle of a sentence.
                    raise ValueError('mecab output ended before EOS')
                # Exact match: a token whose surface merely starts with
                # "EOS" must not be mistaken for the sentence terminator.
                if output.strip() == 'EOS':
                    break
                buff.append(Morph.from_mecab_format(output))
            yield buff


class Morph():
    """One morpheme: its surface form plus the comma-separated features."""

    def __init__(self):
        self.surface = ''
        self.attrs = []
        # First feature (part of speech); filled in by from_mecab_format.
        self.pos = ''

    @classmethod
    def from_mecab_format(cls, line):
        """Build a morph from one ``surface<TAB>feat1,feat2,...`` line.

        When the line contains no tab, the first run of whitespace is used
        as the separator instead.

        Raises:
            ValueError: if the line has no separator at all.
        """
        line = line.rstrip()
        if '\t' in line:
            # Split on the tab only, so spaces inside the surface or the
            # features are preserved.
            surface, features = line.split('\t', 1)
        else:
            surface, features = line.split(None, 1)
        attrs = features.split(',')
        morph = cls()
        morph.surface = surface
        morph.attrs = attrs
        morph.pos = attrs[0]
        return morph


def test_mecab():
    """Smoke test; requires the ``mecab`` binary and a Japanese dictionary."""
    parser = MeCab(['-Owakati'])
    try:
        morphs = next(parser.parse(['すもももももももものうち']))
        assert morphs == ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']
    finally:
        parser.close()


if __name__ == '__main__':
    test_mecab()