Last active
May 14, 2020 16:04
-
-
Save k0f1sh/cf32020aae18c0fda0e43f135bdf119d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns gengo04
  (:require [clojure.java.io :as io]
            [clojure.string :as str]
            [incanter.charts :as incanter-charts]
            [incanter.core :as incanter-core]))
;; Exercise list these numbered sections follow: https://nlp100.github.io/ja/ch04.html
;; The input resource is produced beforehand with: $ mecab neko.txt -o neko.txt.mecab
;; 30 | |
(defn- parse-mecab-line
  "Turns one line of MeCab output into a token map with the keys :surface,
  :base, :pos and :pos1.

  Expected line layout:
    surface<TAB>pos,pos-subclass1,pos-subclass2,pos-subclass3,conjugation-type,conjugation-form,base-form,reading,pronunciation

  Returns nil for the \"EOS\" sentence marker and for any line that has no
  tab-separated details part (e.g. a blank line), so callers can simply drop
  nils instead of hitting a NullPointerException while splitting."
  [line]
  (when-not (= line "EOS")
    (let [[surface details-str] (str/split line #"\t")]
      (when details-str
        (let [[pos pos1 _ _ _ _ base] (str/split details-str #",")]
          {:surface surface
           :base base
           :pos pos
           :pos1 pos1})))))

;; Every token of the classpath resource "neko.txt.mecab", in file order, as a
;; fully realized sequence of token maps (see parse-mecab-line).
;; with-open closes the reader once reading is done; doall forces the lazy
;; sequence while the reader is still open.
(def maps
  (with-open [rdr (io/reader (io/resource "neko.txt.mecab"))]
    (doall (keep parse-mecab-line (line-seq rdr)))))
;; 31 | |
;; Set of the distinct :surface values over all tokens in maps.
;; NOTE(review): no part-of-speech filter is applied here; confirm whether the
;; exercise expects only a particular part of speech.
(def surfaces
  (set (map :surface maps)))
;; 32 | |
;; Set of the distinct :base values over all tokens in maps.
;; NOTE(review): no part-of-speech filter is applied here; confirm whether the
;; exercise expects only a particular part of speech.
(def bases
  (set (map :base maps)))
;; 33 | |
;; Lazy sequence of strings built from every window of three consecutive
;; tokens whose middle token has the surface "の" and whose outer tokens both
;; have :pos "名詞"; the string is the three surfaces joined together.
(def no
  (for [[left particle right] (partition 3 1 maps)
        :when (and (= (:surface particle) "の")
                   (= (:pos left) "名詞")
                   (= (:pos right) "名詞"))]
    (str (:surface left) (:surface particle) (:surface right))))
;; 34 | |
;; Vector of maximal runs of consecutive noun tokens (:pos "名詞"), each run
;; being a sequence of token maps in text order.
;; partition-by starts a new group every time "is this a noun?" flips, so each
;; noun group is already a longest match; the non-noun groups are filtered out.
;; This also keeps a noun run that sits at the very end of the input, which a
;; loop that stops as soon as the remaining tokens are empty would drop.
(def nouns-list
  (let [noun? #(= (:pos %) "名詞")]
    (->> maps
         (partition-by noun?)
         (filter (comp noun? first))
         (vec))))
;; 35 | |
;; Sequence of [surface count] entries for every distinct surface in maps,
;; ordered from the highest count to the lowest (ascending sort, then reversed).
(def sorted-surface-freq
  (let [surface-counts (frequencies (map :surface maps))]
    (reverse (sort-by second (seq surface-counts)))))
;; 36 | |
;; Opens a bar chart of the ten most frequent surfaces and their counts.
;; NOTE(review): Incanter chart constructors are macros that may capture the
;; argument forms, so the two (map ... top10) expressions are kept verbatim.
(let [top10 (take 10 sorted-surface-freq)
      chart (incanter-charts/bar-chart
             (map first top10)
             (map second top10)
             :title "頻度上位10語")]
  (incanter-core/view chart))
;; 37 | |
;; Token stream cut into sentences: groups of consecutive tokens between
;; sentence-final punctuation (:pos1 "句点"). The punctuation groups themselves
;; are removed, so each element holds only the tokens of one sentence.
(def sentences
  (let [period? #(= (:pos1 %) "句点")]
    (->> maps
         (partition-by period?)
         (remove (comp period? first)))))
;; [surface count] entries for the tokens that share a sentence with "猫",
;; ordered from the highest count to the lowest. Only sentences containing a
;; token whose surface is "猫" contribute, and the "猫" tokens themselves are
;; left out of the counts.
(def cooccurrence
  (let [cat? #(= (:surface %) "猫")
        companions (->> sentences
                        (filter #(some cat? %))
                        (mapcat #(remove cat? %))
                        (map :surface))]
    (reverse (sort-by second (frequencies companions)))))
;; Opens a bar chart of the ten surfaces that most often co-occur with "猫".
;; NOTE(review): Incanter chart constructors are macros that may capture the
;; argument forms, so the two (map ... top10) expressions are kept verbatim.
(let [top10 (take 10 cooccurrence)
      chart (incanter-charts/bar-chart
             (map first top10)
             (map second top10)
             :title "猫との共起頻度上位10語")]
  (incanter-core/view chart))
;; 38 | |
;; Frequency-of-frequencies table: entries [occurrence-count number-of-surfaces],
;; i.e. how many distinct surfaces appear exactly that many times, ordered from
;; the largest number of surfaces to the smallest.
(def histgram-data
  (let [surface-counts (frequencies (map :surface maps))
        count-histogram (frequencies (map second surface-counts))]
    (reverse (sort-by second count-histogram))))
;; Opens a bar chart of histgram-data: occurrence counts as categories and the
;; number of surfaces with that occurrence count as bar heights.
;; NOTE(review): Incanter chart constructors are macros that may capture the
;; argument forms, so the two (map ... histgram-data) expressions are kept verbatim.
(-> (incanter-charts/bar-chart
     (map first histgram-data)
     (map second histgram-data)
     :title "単語の出現頻度のヒストグラム")
    (incanter-core/view))
;; 39 | |
;; Rank/frequency scatter plot with both axes logarithmic.
;; freqs holds the occurrence count of every distinct surface, highest first,
;; and ranks numbers those counts 1, 2, 3, ... in the same order, so each point
;; is (rank of a surface, how often that surface occurs).
;; A frequency-of-frequencies table is deliberately not used here: it yields
;; "number of surfaces per occurrence count", which is not a rank.
(let [freqs (->> maps
                 (map :surface)
                 (frequencies)
                 (vals)
                 (sort >))
      ranks (range 1 (inc (count freqs)))]
  (incanter-core/view
   (-> (incanter-charts/scatter-plot ranks freqs)
       ;; set-axis swaps in a new axis object, so the label is given to the
       ;; log axis itself.
       (incanter-charts/set-axis :x (incanter-charts/log-axis :label "出現頻度順位"))
       (incanter-charts/set-axis :y (incanter-charts/log-axis :label "出現頻度")))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment