Created
August 11, 2020 23:24
-
-
Save jshirius/b60abbb0e566a75ef6f5c51a2e269cd0 to your computer and use it in GitHub Desktop.
共起語ライブラリ(pyfpgrowth)・networkxを使って共起語のネットワークを作る
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# 共起語ライブラリ(pyfpgrowth)・networkxを使って共起語のネットワークを作る" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#別途JanomeDataSetクラスを以下のURLからダウンロードして設定してください。\n", | |
"#https://github.com/jshirius/nlp_tools/blob/master/janome_data_set.py\n", | |
"from janome_data_set import JanomeDataSet\n", | |
"import twitter\n", | |
"\n", | |
"import pyfpgrowth\n", | |
"\n", | |
"\n", | |
"#Twitter APIにアクセスするための4つの設定\n", | |
"#Twitterより与えられたものを設定する\n", | |
"CONSUMER_KEY = ''\n", | |
"CONSUMER_SECRET = ''\n", | |
"ACCESS_TOKEN = ''\n", | |
"ACCESS_TOKEN_SECRET = ''\n", | |
"\n", | |
"#Twitter APIにアクセスする\n", | |
"api = twitter.Api(consumer_key=CONSUMER_KEY,\n", | |
" consumer_secret=CONSUMER_SECRET,\n", | |
" access_token_key=ACCESS_TOKEN,\n", | |
" access_token_secret=ACCESS_TOKEN_SECRET)\n", | |
"\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"#検索結果を取得する\n", | |
"from urllib.parse import urlencode\n", | |
"query = urlencode({\n", | |
" 'q': 'プログラミングスクール', # 検索ワード\n", | |
" 'result_type': 'recent', # recent/popular/mixed\n", | |
" 'count': 100 # 取得するツイート数(100が最大)\n", | |
" # 'max_id': これを利用して更に過去の情報を取れる\n", | |
"})\n", | |
"\n", | |
"#発言内容を無視するユーザー\n", | |
"#例えば、宣伝目的のツイートを弾くなど\n", | |
"exclude_users = []\n", | |
"\n", | |
"result = api.GetSearch(raw_query=query)\n", | |
"#print(result)\n", | |
"text_list = []\n", | |
"#ページ単位で文章を設定する\n", | |
"page_datas=[]\n", | |
"\n", | |
"for status in result:\n", | |
" \n", | |
" #除外ユーザーか?\n", | |
" #if(status.user.screen_name in exclude_users):\n", | |
" # continue\n", | |
" \n", | |
" if(\"https:\" in status.text):\n", | |
" continue\n", | |
" \n", | |
" #print(status)\n", | |
" print(\"-\" * 50)\n", | |
" print(status.id)\n", | |
" print(status.created_at)\n", | |
" \n", | |
" print(status.user.screen_name)\n", | |
" print(status.user.name)\n", | |
" print(status.text)\n", | |
"\n", | |
" page_datas.append(status.text)\n", | |
" #text_list.append(text)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#共起語用に形態素分析する\n", | |
"documents=[]\n", | |
"\n", | |
"#形態素に分けるときは、neologdを使ったほうが良い\n", | |
"#morpheme_janome = JanomeDataSet()\n", | |
"morpheme_janome = JanomeDataSet('neologd')\n", | |
"for t in page_datas:\n", | |
" #形態素処理\n", | |
" data = morpheme_janome.text_morpheme(t,\"名詞\")\n", | |
" if(len(data) == 0):\n", | |
" continue\n", | |
" #documents.extend(data)\n", | |
" documents.append(data)\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"documents" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#共起語を作成する\n", | |
"patterns = pyfpgrowth.find_frequent_patterns(documents, 5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"patterns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#デバッグ用のログ\n", | |
"documents" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# ここから共起語をnetworkxを使ってネットワーク図を作ってみる" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#数字とラベルの辞書作成\n", | |
"#キーワードをユニーク化する\n", | |
"uniqu_list = []\n", | |
"for keys, v in patterns.items():\n", | |
" #print(keys)\n", | |
" uniqu_list.extend(keys) \n", | |
"uniqu_list = list(set(uniqu_list))\n", | |
"\n", | |
"print(\"キーワードをユニークにする\")\n", | |
"print(uniqu_list)\n", | |
"\n", | |
"#エッジの数字とラベルを紐付ける\n", | |
"dict_label ={} \n", | |
"int_dict_label = {}\n", | |
"for i in range(len(uniqu_list)):\n", | |
" k = uniqu_list[i]\n", | |
" dict_label[k] = i\n", | |
" int_dict_label[i] = k\n", | |
"\n", | |
"\n", | |
"#数字とラベルの紐付け完了\n", | |
"print(dict_label)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#networkxを使って実際にグラフを書いてみよう\n", | |
"import matplotlib.pyplot as plt\n", | |
"import networkx as nx\n", | |
"import matplotlib.font_manager #日本語\n", | |
"\n", | |
"#日本語フォント読み込み\n", | |
"font_path = r'/Users/develop/python/kaggle/Osaka.ttf'\n", | |
"font_prop = matplotlib.font_manager.FontProperties(fname=font_path)\n", | |
"\n", | |
"#network初期化\n", | |
"G=nx.Graph()\n", | |
"\n", | |
"#ネットワーク作成\n", | |
"for keys, v in patterns.items():\n", | |
" #print(keys)\n", | |
" old_k = \"\"\n", | |
" for index, k in enumerate(keys):\n", | |
" if(index == 0):\n", | |
" old_k = k\n", | |
" continue\n", | |
" \n", | |
" no = dict_label[k]\n", | |
" old_no = dict_label[old_k]\n", | |
" G.add_edge(old_no, no)\n", | |
" old_k = k\n", | |
" #uniqu_list.extend(keys)\n", | |
" \n", | |
"pos=nx.spring_layout(G)\n", | |
"nx.draw(G,pos,node_color='#A0CBE2',width=1,edge_cmap=plt.cm.Blues,with_labels=False)\n", | |
"\n", | |
"datas = nx.draw_networkx_labels(G,pos,int_dict_label,font_size=16)\n", | |
"\n", | |
"#日本語に対応できるようにするため、日本語が使えるフォントを設定している\n", | |
"for t in datas.values():\n", | |
" t.set_fontproperties(font_prop)\n", | |
" \n", | |
"plt.show()\n", | |
"#中心まではやる" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment