Skip to content

Instantly share code, notes, and snippets.

@takegue
Last active August 29, 2015 14:14

Revisions

  1. takegue revised this gist Jan 24, 2015. 1 changed file with 10 additions and 5 deletions.
    15 changes: 10 additions & 5 deletions 2015-01-25-005613.py
    Original file line number Diff line number Diff line change
    @@ -4,14 +4,17 @@
    import subprocess
    import itertools as itt


    class MeCab():

    def __init__(self,opts=[]):
    def __init__(self, opts=[]):
    self.opts = opts
    self._process = subprocess.Popen(
    list(itt.chain(['mecab'], opts)),
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE)
    stdout=subprocess.PIPE,
    universal_newlines =True,
    )

    if '-Owakati' in opts or 'wakati' in opts:
    self.parse = self.wakati_parse
    @@ -30,7 +33,8 @@ def default_parse(self, iterable):
    self._process.stdin.write(line+'\n')
    while True:
    output = self._process.stdout.readline().strip()
    if output.startswith('EOS'): break
    if output.startswith('EOS'):
    break
    buff.append(Morph.from_mecab_format(output))
    yield buff
    buff = []
    @@ -53,10 +57,11 @@ def from_mecab_format(cls, line):
    morph.pos = attrs[0]
    return morph


    def test_mecab():
    parser = MeCab(['-Owakati'])
    morphs = next(parser.parse(['すもももももももものうち'] * 10))
    assert morphs == ['すもも','も','もも','も','もも','の','うち']
    morphs = next(parser.parse(['すもももももももものうち']))
    assert morphs == ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']


    if __name__ == '__main__':
  2. takegue revised this gist Jan 24, 2015. No changes.
  3. takegue created this gist Jan 24, 2015.
    63 changes: 63 additions & 0 deletions 2015-01-25-005613.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,63 @@
    #!/usr/bin/env python
    # -*- coding:utf-8 -*-

    import subprocess
    import itertools as itt

    class MeCab():
        """Line-oriented wrapper around a long-running ``mecab`` subprocess.

        One process is started in the constructor and reused for every sentence:
        each input line is written to its stdin and the analysis is read back
        from its stdout.  ``parse`` is bound to ``wakati_parse`` when wakati
        output was requested in ``opts`` and to ``default_parse`` otherwise.

        The pipes are opened in text mode, so input lines must be ``str`` and
        must not contain a newline themselves (one line in, one analysis out).
        NOTE(review): text-mode pipes use the interpreter's default encoding;
        confirm it matches the dictionary's charset (typically UTF-8).

        Can be used as a context manager so the subprocess is shut down
        deterministically.
        """

        def __init__(self, opts=None, command='mecab'):
            """Start the analyser process.

            Args:
                opts: Iterable of extra command-line options for mecab
                    (e.g. ``['-Owakati']``).  ``None`` means no options.
                command: Name or path of the mecab executable.
            """
            # Copy instead of aliasing: avoids the shared mutable default and
            # lets callers pass any iterable.
            self.opts = list(opts) if opts is not None else []
            self._process = subprocess.Popen(
                [command] + self.opts,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                # The parse methods write and read str, not bytes.
                universal_newlines=True,
            )

            if '-Owakati' in self.opts or 'wakati' in self.opts:
                self.parse = self.wakati_parse
            else:
                self.parse = self.default_parse

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            self.close()

        def close(self):
            """Close both pipes and wait for the subprocess to exit."""
            for pipe in (self._process.stdin, self._process.stdout):
                if pipe is not None and not pipe.closed:
                    pipe.close()
            self._process.wait()

        def _send(self, line):
            """Write one input line and flush it so mecab sees it immediately."""
            stdin = self._process.stdin
            stdin.write(line + '\n')
            # Without the flush the line can sit in Python's buffer while we
            # block forever waiting for the reply.
            stdin.flush()

        def wakati_parse(self, iterable):
            """Yield, for each input line, the list of space-separated tokens.

            Intended for wakati output, where mecab answers every input line
            with exactly one output line.
            """
            for line in iterable:
                self._send(line)
                yield self._process.stdout.readline().strip().split()

        def default_parse(self, iterable):
            """Yield, for each input line, a list of ``Morph`` objects.

            Reads output lines until the ``EOS`` line that terminates a
            sentence in mecab's default output format.

            Raises:
                EOFError: if the subprocess closes its output before ``EOS``.
            """
            for line in iterable:
                self._send(line)
                morphs = []
                while True:
                    raw = self._process.stdout.readline()
                    if not raw:
                        raise EOFError(
                            'mecab closed its output before the end of a sentence')
                    # Strip only the line terminator: the surface itself may be
                    # whitespace (e.g. a full-width space).
                    output = raw.rstrip('\r\n')
                    # Exact match: a token whose surface merely starts with
                    # "EOS" must not end the sentence early.
                    if output == 'EOS':
                        break
                    morphs.append(Morph.from_mecab_format(output))
                yield morphs


    class Morph():
        """A single morpheme parsed from one line of mecab output.

        Attributes are plain instance fields:
          surface -- the token text as it appeared in the input
          attrs   -- the comma-separated feature fields, as a list of str
          pos     -- the first feature field (``attrs[0]``)
        """

        def __init__(self):
            self.surface = ''
            self.attrs = []
            # Defined here so a freshly constructed instance always has ``pos``.
            self.pos = ''

        def __repr__(self):
            return '{}(surface={!r}, pos={!r})'.format(
                type(self).__name__, self.surface, self.pos)

        @classmethod
        def from_mecab_format(cls, line):
            """Build an instance from a ``surface<TAB>feature,feature,...`` line.

            Args:
                line: One output line; a trailing line terminator is ignored.

            Returns:
                A new instance of ``cls`` (so subclasses get their own type).

            Raises:
                ValueError: if the line has no tab and does not consist of
                    exactly two whitespace-separated fields.
            """
            line = line.rstrip('\r\n')
            # Split on the first tab only, so a whitespace surface or a feature
            # containing spaces is kept intact.
            surface, tab, feature = line.partition('\t')
            if not tab:
                # No tab present: keep accepting the whitespace-separated
                # two-field form that earlier callers relied on.
                surface, feature = line.split()
            attrs = feature.split(',')

            morph = cls()
            morph.surface = surface
            morph.attrs = attrs
            morph.pos = attrs[0]
            return morph

    def test_mecab():
        """Smoke test: wakati mode splits the sample sentence into its words."""
        tokenizer = MeCab(['-Owakati'])
        sentences = ['すもももももももものうち'] * 10
        expected = ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']
        # Only the first result is pulled from the parser.
        first = next(tokenizer.parse(sentences))
        assert first == expected


    if __name__ == '__main__':
        test_mecab()