Skip to content

Instantly share code, notes, and snippets.

@woxtu
Created February 8, 2015 11:09
Show Gist options
  • Save woxtu/81777eaa73a7b0b31724 to your computer and use it in GitHub Desktop.
import math
import os
import random
import sequtils
import strutils
const libmecab = "libmecab.2.dylib"  # macOS dylib name — adjust for other platforms

type
  mecab_node_t = object
    ## Partial mirror of MeCab's C `mecab_node_t` struct.  Only the leading
    ## fields are declared; that is safe because nodes are only ever handled
    ## through a pointer and no omitted trailing field is read from Nim.
    prev: ptr mecab_node_t
    next: ptr mecab_node_t
    enext: ptr mecab_node_t
    bnext: ptr mecab_node_t
    rpath: pointer
    lpath: pointer
    surface: cstring   # points into the analyzed sentence; NOT NUL-terminated
                       # per token — read exactly `length` bytes
    feature: cstring
    id: cuint
    length: cushort    # byte length of `surface`
    rlength: cushort
    rcAttr: cushort
    lcAttr: cushort
    posid: cushort
    char_type: cuchar
    stat: cuchar       # node kind: 0 = normal, 1 = unknown, 2 = BOS, 3 = EOS
    # ... (remaining C fields omitted; never accessed here)

  mecab_p = pointer                 # opaque tagger handle from mecab_new2
  mecab_node_p = ptr mecab_node_t

# C bindings.  Parameter names are required by Nim's routine syntax; the
# C-side signatures are unchanged.
proc mecab_new2(arg: cstring): mecab_p {.importc, dynlib: libmecab, cdecl.}
proc mecab_destroy(mecab: mecab_p) {.importc, dynlib: libmecab, cdecl.}
proc mecab_sparse_tonode(mecab: mecab_p, str: cstring): mecab_node_p {.importc, dynlib: libmecab, cdecl.}
proc first[T](a: openarray[T]): T = a[0]
proc last[T](a: openarray[T]): T = a[a.len() - 1]
proc drop[T](a: seq[T], n: int): seq[T] = a[n .. a.len() - 1]
proc random_nth[T](a: openarray[T]): T = a[random(a.len())]
proc tokenize(mecab: mecab_p, str: string): seq[string] =
  ## Runs MeCab morphological analysis on `str` and returns the surface
  ## form of every word node, in sentence order.
  ## (Reconstructed indentation — the logic is unchanged.)
  iterator tokenizer(mecab: mecab_p, str: string): string =
    var node = mecab.mecab_sparse_tonode(str.cstring)
    # NOTE(review): a nil return (analysis failure) is not handled and
    # would crash on `node.next` — confirm acceptable for this script.
    while node.next != nil:
      # stat < 2 keeps normal (0) and unknown-word (1) nodes and skips the
      # virtual BOS (2) / EOS (3) sentinel nodes.
      if node.stat.int < 2:
        # `surface` points into the sentence buffer and is not
        # NUL-terminated per token: copy exactly `length` bytes.
        yield ($node.surface)[0 .. node.length.int - 1]
      node = node.next
  to_seq(mecab.tokenizer(str))
proc n_gramize(tokens: seq[string], order: int): seq[seq[string]] =
  ## Returns every consecutive run of `order` tokens from `tokens`.
  ## Empty-string sentinels are appended first so the trailing tokens still
  ## head a gram; the "" sentinel later terminates the Markov-chain walk.
  ## Fix: the removed two-argument `mapIt(type, expr)` sequtils template is
  ## replaced with the modern one-argument form (result type is inferred).
  let padded = tokens & to_seq(0 .. order - 1).map_it("")
  to_seq(0 .. padded.len - order - 1).map_it(padded[it .. it + order - 1])
proc markov_chain(grams: seq[seq[string]], start = ""): string =
  ## Walks the n-gram table as a Markov chain and concatenates the visited
  ## tokens.  The walk begins at `start`, or at a random gram's head when
  ## `start` is "".  The empty-string sentinel appended by `n_gramize`
  ## terminates the recursion.
  ## (Reconstructed indentation — the logic is unchanged.)
  proc recur(node: string, tokens: seq[string]): seq[string] =
    if tokens.last() == "":
      tokens
    else:
      # Pick a random gram whose head matches the current node, append its
      # tail, and continue from the tail's final token.
      # NOTE(review): if no gram matches, random_nth raises on the empty
      # seq — confirm the sentinel padding makes that unreachable.
      let gram = grams.filter_it(it.first() == node).random_nth()
      recur(gram.last(), tokens & gram.drop(1))
  let head = if start != "": start else: grams.random_nth().first()
  recur(head, @[head]).join("")
# --- Entry point ----------------------------------------------------------
# Usage: <program> <seed-text>
# Tokenizes the first CLI argument with MeCab, builds 3-grams, and prints
# one Markov-chain walk over them.
let query = param_str(1)  # raises IndexDefect when no argument is given
let mecab = mecab_new2("")  # default tagger options; NOTE(review): a nil
                            # return (init failure) is not checked — confirm
randomize()  # seed the RNG used by random_nth
echo mecab.tokenize(query).n_gramize(3).markov_chain()
mecab.mecab_destroy()  # release the tagger after the result is printed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment