Created
October 28, 2025 14:03
-
-
Save chottokun/fc0aa9e1ac008e6067cba34051455362 to your computer and use it in GitHub Desktop.
whoosh-reloaded+sudachi_sample.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "provenance": [], | |
| "authorship_tag": "ABX9TyPDiBpBhT0pN23xEhWtPNFr", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/chottokun/fc0aa9e1ac008e6067cba34051455362/whoosh-reloaded-sudachi_sample.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 1. 必要なライブラリのインストール (Colabで最初に実行してください)\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "!pip install whoosh-reloaded sudachipy sudachidict_core\n" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "jdiVCe8v-exO", | |
| "outputId": "a2d63f33-a3c4-4538-90ad-983e44fa5237" | |
| }, | |
| "execution_count": 7, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Requirement already satisfied: whoosh-reloaded in /usr/local/lib/python3.12/dist-packages (2.7.5)\n", | |
| "Requirement already satisfied: sudachipy in /usr/local/lib/python3.12/dist-packages (0.6.10)\n", | |
| "Requirement already satisfied: sudachidict_core in /usr/local/lib/python3.12/dist-packages (20250825)\n", | |
| "Requirement already satisfied: cached-property in /usr/local/lib/python3.12/dist-packages (from whoosh-reloaded) (2.0.1)\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "id": "8JEC2eLv5Tzw" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 2. ライブラリのインポートと設定\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "import os\n", | |
| "import shutil\n", | |
| "import json\n", | |
| "from typing import Generator, Dict, Any, List, Tuple\n", | |
| "\n", | |
| "# Whoosh imports\n", | |
| "from whoosh import index\n", | |
| "from whoosh.fields import Schema, ID, TEXT, KEYWORD\n", | |
| "from whoosh.analysis import Tokenizer, Token\n", | |
| "from whoosh.qparser import MultifieldParser\n", | |
| "from whoosh.query import And, Term\n", | |
| "from whoosh.scoring import BM25F\n", | |
| "\n", | |
| "# Sudachi imports\n", | |
| "from sudachipy import tokenizer, dictionary\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 3. JSONデータの平坦化関数\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "def flatten_json(data: Dict[str, Any], delimiter: str = '_', parent_key: str = '') -> Dict[str, str]:\n", | |
| " \"\"\"ネストされたJSON辞書をフラットなキー/値のペアに変換する。\"\"\"\n", | |
| " items: Dict[str, str] = {}\n", | |
| " for key, value in data.items():\n", | |
| " new_key = f\"{parent_key}{delimiter}{key}\" if parent_key else key\n", | |
| "\n", | |
| " if isinstance(value, dict):\n", | |
| " items.update(flatten_json(value, delimiter, new_key))\n", | |
| " elif isinstance(value, list):\n", | |
| " for i, item in enumerate(value):\n", | |
| " if isinstance(item, dict):\n", | |
| " items.update(flatten_json(item, delimiter, f\"{new_key}{delimiter}{i}\"))\n", | |
| " else:\n", | |
| " items[f\"{new_key}{delimiter}{i}\"] = str(item)\n", | |
| " else:\n", | |
| " items[new_key] = str(value)\n", | |
| " return items\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 4. Whoosh向けカスタムSudachiトークナイザー (エラー完全排除版)\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "class SudachiTokenizer(Tokenizer):\n", | |
| " \"\"\"SudachiPyを利用したWhoosh互換のカスタムトークナイザー。\"\"\"\n", | |
| "\n", | |
| " MODE_STRINGS = {'A': tokenizer.Tokenizer.SplitMode.A,\n", | |
| " 'B': tokenizer.Tokenizer.SplitMode.B,\n", | |
| " 'C': tokenizer.Tokenizer.SplitMode.C}\n", | |
| "\n", | |
| " def __init__(self, mode: tokenizer.Tokenizer.SplitMode):\n", | |
| " self.mode_str = next((k for k, v in self.MODE_STRINGS.items() if v == mode), 'A')\n", | |
| " self.mode_obj = None # __call__で使用するSplitModeオブジェクト\n", | |
| " self._load_sudachi()\n", | |
| "\n", | |
| " def _load_sudachi(self):\n", | |
| " \"\"\"辞書とトークナイザをロードまたは再ロードする内部メソッド。\"\"\"\n", | |
| " self.mode_obj = self.MODE_STRINGS[self.mode_str]\n", | |
| " self.dict = dictionary.Dictionary()\n", | |
| " self.tokenizer_obj = self.dict.create()\n", | |
| "\n", | |
| " def __getstate__(self):\n", | |
| " \"\"\"Pickle化のために状態を取得する際、Whooshが失敗するオブジェクトを除外する。\"\"\"\n", | |
| " state = self.__dict__.copy()\n", | |
| " del state['dict']\n", | |
| " del state['tokenizer_obj']\n", | |
| " if 'mode_obj' in state:\n", | |
| " del state['mode_obj']\n", | |
| " return state\n", | |
| "\n", | |
| " def __setstate__(self, state):\n", | |
| " \"\"\"アンPickle化の際に、辞書オブジェクトを再作成する。\"\"\"\n", | |
| " self.__dict__.update(state)\n", | |
| " self._load_sudachi()\n", | |
| "\n", | |
| " # -----------------------------------------------------------------------------\n", | |
| " # 🚨 修正点:\n", | |
| " # Whooshが内部的に 'mode' などの余計な引数を渡してくるため、\n", | |
| " # **kwargs を追加して、それらの引数を吸収できるようにする。\n", | |
| " # -----------------------------------------------------------------------------\n", | |
| " def __call__(self, value: str, positions: bool = False, chars: bool = False,\n", | |
| " keeporiginal: bool = False, start_pos: int = 0, start_char: int = 0,\n", | |
| " **kwargs) -> Generator[Token, None, None]:\n", | |
| " \"\"\"入力文字列をSudachiでトークン化し、WhooshのTokenオブジェクトを生成する。\"\"\"\n", | |
| "\n", | |
| " # kwargs に 'mode' が入っていても、ここでは無視され、\n", | |
| " # __init__ で設定された self.mode_obj が使用される。\n", | |
| "\n", | |
| " token = Token()\n", | |
| " morphemes = self.tokenizer_obj.tokenize(value, self.mode_obj)\n", | |
| "\n", | |
| " pos = start_pos\n", | |
| " char_pos = start_char\n", | |
| "\n", | |
| " for m in morphemes:\n", | |
| " surface = m.surface()\n", | |
| " token.text = surface\n", | |
| "\n", | |
| " if positions:\n", | |
| " token.pos = pos\n", | |
| " pos += 1\n", | |
| " if chars:\n", | |
| " token.startchar = char_pos\n", | |
| " token.endchar = char_pos + len(surface)\n", | |
| " char_pos = token.endchar\n", | |
| "\n", | |
| " yield token\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 5. Whooshスキーマの定義\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "def create_schema():\n", | |
| " \"\"\"Whooshインデックススキーマを定義する。\"\"\"\n", | |
| " return Schema(\n", | |
| " doc_id=ID(stored=True, unique=True),\n", | |
| " content_fts_b=TEXT(analyzer=SudachiTokenizer(mode=tokenizer.Tokenizer.SplitMode.B), stored=False),\n", | |
| " content_fts_c=TEXT(analyzer=SudachiTokenizer(mode=tokenizer.Tokenizer.SplitMode.C), stored=False),\n", | |
| " metadata_author_name=KEYWORD(stored=True, sortable=True),\n", | |
| " tags_list=KEYWORD(stored=False)\n", | |
| " )\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 6. インデックス作成処理\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "def index_documents(index_dir: str, json_data_list: List[Dict[str, Any]]):\n", | |
| " \"\"\"JSONデータリストをWhooshインデックスに追加する。\"\"\"\n", | |
| " print(f\"--- インデックスディレクトリを準備: {index_dir} ---\")\n", | |
| " if os.path.exists(index_dir):\n", | |
| " shutil.rmtree(index_dir)\n", | |
| " os.makedirs(index_dir, exist_ok=True)\n", | |
| "\n", | |
| " schema = create_schema()\n", | |
| " ix = index.create_in(index_dir, schema)\n", | |
| " writer = ix.writer()\n", | |
| "\n", | |
| " print(f\"--- {len(json_data_list)}件のドキュメントをインデックス中 ---\")\n", | |
| " for i, doc in enumerate(json_data_list):\n", | |
| " flattened = flatten_json(doc)\n", | |
| "\n", | |
| " doc_id = flattened.get('id', str(i + 1))\n", | |
| " content = flattened.get('content', '')\n", | |
| "\n", | |
| " field_values = {\n", | |
| " 'doc_id': doc_id,\n", | |
| " 'content_fts_b': content,\n", | |
| " 'content_fts_c': content,\n", | |
| " 'metadata_author_name': flattened.get('metadata_author_name', ''),\n", | |
| " 'tags_list': flattened.get('tags_list', '')\n", | |
| " }\n", | |
| " writer.add_document(**field_values)\n", | |
| "\n", | |
| " writer.commit()\n", | |
| " print(\"--- インデックス作成完了 ---\")\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 7. 検索実行処理\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "def search_index(index_dir: str, query_str: str, author_filter: str = None, limit: int = 5) -> List[Tuple[str, float, str]]:\n", | |
| " \"\"\"Whooshインデックスを検索し、結果を返す。\"\"\"\n", | |
| " try:\n", | |
| " ix = index.open_dir(index_dir)\n", | |
| " except index.EmptyIndexError:\n", | |
| " print(f\"エラー: インデックスディレクトリ '{index_dir}' が見つからないか空です。\")\n", | |
| " return []\n", | |
| "\n", | |
| " field_boosts = {'content_fts_b': 1.0, 'content_fts_c': 2.0}\n", | |
| "\n", | |
| " with ix.searcher(weighting=BM25F(field_boosts=field_boosts)) as searcher:\n", | |
| " parser = MultifieldParser(['content_fts_b', 'content_fts_c'], ix.schema)\n", | |
| "\n", | |
| " try:\n", | |
| " query = parser.parse(query_str)\n", | |
| " except Exception as e:\n", | |
| " print(f\"クエリ解析エラー: {e}\")\n", | |
| " return []\n", | |
| "\n", | |
| " if author_filter:\n", | |
| " author_query = Term('metadata_author_name', author_filter)\n", | |
| " query = And([query, author_query])\n", | |
| " print(f\" [複合クエリ実行] FTS: '{query_str}' AND 著者: '{author_filter}'\")\n", | |
| " else:\n", | |
| " print(f\" [FTS実行] クエリ: '{query_str}'\")\n", | |
| "\n", | |
| " results = searcher.search(query, limit=limit)\n", | |
| "\n", | |
| " output: List[Tuple[str, float, str]] = []\n", | |
| " for hit in results:\n", | |
| " doc_id = hit['doc_id']\n", | |
| " score = hit.score\n", | |
| " author = hit.get('metadata_author_name', 'N/A')\n", | |
| " output.append((doc_id, score, author))\n", | |
| "\n", | |
| " return output" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# --- A. テスト用データの定義 ---\n", | |
| "sample_data = [\n", | |
| " {\n", | |
| " \"id\": \"1\",\n", | |
| " \"content\": \"東京タワーは日本を代表する観光地です。正式名称は日本電波塔。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author A\"}},\n", | |
| " \"tags\": [\"tourism\", \"Japan\"]\n", | |
| " },\n", | |
| " {\n", | |
| " \"id\": \"2\",\n", | |
| " \"content\": \"消費者安全調査委員会の報告書は、複合的な問題解決を示唆する。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author B\"}},\n", | |
| " \"tags\": [\"government\", \"safety\"]\n", | |
| " },\n", | |
| " {\n", | |
| " \"id\":\"3\",\n", | |
| " \"content\": \"日本電波塔の公式見解が発表された。展望台の改修計画。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author A\"}},\n", | |
| " \"tags\": [\"tourism\", \"news\"]\n", | |
| " },\n", | |
| " {\n", | |
| " \"id\": \"4\",\n", | |
| " \"content\": \"新しい観光ルートの開発に関する会議が開かれました。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author C\"}},\n", | |
| " \"tags\": [\"tourism\", \"planning\"]\n", | |
| " },\n", | |
| " {\n", | |
| " \"id\": \"5\",\n", | |
| " \"content\": \"安全基準の見直しについて、政府機関が検討を開始。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author B\"}},\n", | |
| " \"tags\": [\"government\", \"safety\", \"regulation\"]\n", | |
| " }\n", | |
| "]\n", | |
| "\n", | |
| "INDEX_DIR = \"whoosh_sudachi_index\"\n", | |
| "\n", | |
| "# 1. インデックスの作成とドキュメントの追加\n", | |
| "print(\"==============================================\")\n", | |
| "print(\" 1. インデックス作成\")\n", | |
| "print(\"==============================================\")\n", | |
| "index_documents(INDEX_DIR, sample_data)\n", | |
| "\n", | |
| "# 2. 検索テストの実行\n", | |
| "print(\"\\n==============================================\")\n", | |
| "print(\" 2. 検索テスト開始\")\n", | |
| "print(\" (結果はスコアの高い順に表示されます)\")\n", | |
| "print(\"==============================================\")\n", | |
| "\n", | |
| "# --- Test 1: 固有表現 (Cモード) の正確な一致 ---\n", | |
| "query_a = \"日本電波塔\"\n", | |
| "results_a = search_index(INDEX_DIR, query_a)\n", | |
| "print(f\"\\n[Test 1] 検索: '{query_a}'\")\n", | |
| "for doc_id, score, author in results_a:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, 著者: {author}\")\n", | |
| "\n", | |
| "# --- Test 2: 複合語の部分一致 (Bモード/再現率の確認) ---\n", | |
| "query_b = \"消費者\"\n", | |
| "results_b = search_index(INDEX_DIR, query_b)\n", | |
| "print(f\"\\n[Test 2] 検索: '{query_b}'\")\n", | |
| "for doc_id, score, author in results_b:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, 著者: {author}\")\n", | |
| "\n", | |
| "# --- Test 3: FTSと構造化フィルタリングの複合クエリ ---\n", | |
| "query_c = \"報告書\"\n", | |
| "filter_author = \"Author B\"\n", | |
| "results_c = search_index(INDEX_DIR, query_c, author_filter=filter_author)\n", | |
| "print(f\"\\n[Test 3] 検索: '{query_c}' AND 著者: '{filter_author}'\")\n", | |
| "for doc_id, score, author in results_c:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, 著者: {author}\")\n", | |
| "\n", | |
| "# --- Test 4: 新しいデータに対する検索 ---\n", | |
| "query_d = \"観光ルート\"\n", | |
| "results_d = search_index(INDEX_DIR, query_d)\n", | |
| "print(f\"\\n[Test 4] 検索: '{query_d}'\")\n", | |
| "for doc_id, score, author in results_d:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, 著者: {author}\")\n", | |
| "\n", | |
| "# --- Test 5: 複数のキーワードを含む検索 ---\n", | |
| "query_e = \"安全基準 政府機関\"\n", | |
| "results_e = search_index(INDEX_DIR, query_e)\n", | |
| "print(f\"\\n[Test 5] 検索: '{query_e}'\")\n", | |
| "for doc_id, score, author in results_e:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, 著者: {author}\")\n", | |
| "\n", | |
| "\n", | |
| "# 3. クリーンアップ\n", | |
| "print(\"\\n==============================================\")\n", | |
| "print(\" 3. クリーンアップ\")\n", | |
| "print(\"==============================================\")\n", | |
| "if os.path.exists(INDEX_DIR):\n", | |
| " shutil.rmtree(INDEX_DIR)\n", | |
| " print(f\"--- インデックスディレクトリ '{INDEX_DIR}' を削除しました ---\")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "29f5v7296nQt", | |
| "outputId": "4841b1b0-47d3-402f-8310-5da9fed4a225" | |
| }, | |
| "execution_count": 9, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "==============================================\n", | |
| " 1. インデックス作成\n", | |
| "==============================================\n", | |
| "--- インデックスディレクトリを準備: whoosh_sudachi_index ---\n", | |
| "--- 5件のドキュメントをインデックス中 ---\n", | |
| "--- インデックス作成完了 ---\n", | |
| "\n", | |
| "==============================================\n", | |
| " 2. 検索テスト開始\n", | |
| " (結果はスコアの高い順に表示されます)\n", | |
| "==============================================\n", | |
| " [FTS実行] クエリ: '日本電波塔'\n", | |
| "\n", | |
| "[Test 1] 検索: '日本電波塔'\n", | |
| " - DOC ID: 1, Score: 7.0837, 著者: Author A\n", | |
| " - DOC ID: 3, Score: 5.9482, 著者: Author A\n", | |
| " [FTS実行] クエリ: '消費者'\n", | |
| "\n", | |
| "[Test 2] 検索: '消費者'\n", | |
| " - DOC ID: 2, Score: 3.7243, 著者: Author B\n", | |
| " [複合クエリ実行] FTS: '報告書' AND 著者: 'Author B'\n", | |
| "\n", | |
| "[Test 3] 検索: '報告書' AND 著者: 'Author B'\n", | |
| " [FTS実行] クエリ: '観光ルート'\n", | |
| "\n", | |
| "[Test 4] 検索: '観光ルート'\n", | |
| " - DOC ID: 4, Score: 7.9609, 著者: Author C\n", | |
| " [FTS実行] クエリ: '安全基準 政府機関'\n", | |
| "\n", | |
| "[Test 5] 検索: '安全基準 政府機関'\n", | |
| " - DOC ID: 5, Score: 14.2910, 著者: Author B\n", | |
| "\n", | |
| "==============================================\n", | |
| " 3. クリーンアップ\n", | |
| "==============================================\n", | |
| "--- インデックスディレクトリ 'whoosh_sudachi_index' を削除しました ---\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# 別バージョン" | |
| ], | |
| "metadata": { | |
| "id": "WGNqHDUs7x21" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 必要なライブラリのインポート (このセルで完結するように再定義)\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "import os\n", | |
| "import shutil\n", | |
| "from typing import List, Tuple, Dict, Any, Generator # Flatten JSON関数で必要な型もインポート\n", | |
| "\n", | |
| "# Whoosh imports\n", | |
| "from whoosh import index # <-- 追加\n", | |
| "from whoosh.fields import Schema, ID, TEXT, KEYWORD # <-- 必要に応じて追加 (スキーマ定義がないので不要かも?)\n", | |
| "from whoosh.analysis import Tokenizer, Token # <-- 必要に応じて追加 (SudachiTokenizer定義がないので不要かも?)\n", | |
| "from whoosh.qparser import MultifieldParser # <-- 追加\n", | |
| "from whoosh.query import And, Term # <-- 追加\n", | |
| "from whoosh.scoring import BM25F # <-- 追加\n", | |
| "\n", | |
| "# Sudachi imports (SudachiTokenizer定義がないので不要かも?)\n", | |
| "from sudachipy import tokenizer, dictionary # <-- コメントアウトを解除\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# JSONデータの平坦化関数 (必要に応じて再定義)\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "def flatten_json(data: Dict[str, Any], delimiter: str = '_', parent_key: str = '') -> Dict[str, str]:\n", | |
| " \"\"\"ネストされたJSON辞書をフラットなキー/値のペアに変換する。\"\"\"\n", | |
| " items: Dict[str, str] = {}\n", | |
| " for key, value in data.items():\n", | |
| " new_key = f\"{parent_key}{delimiter}{key}\" if parent_key else key\n", | |
| "\n", | |
| " if isinstance(value, dict):\n", | |
| " items.update(flatten_json(value, delimiter, new_key))\n", | |
| " elif isinstance(value, list):\n", | |
| " for i, item in enumerate(value):\n", | |
| " if isinstance(item, dict):\n", | |
| " items.update(flatten_json(item, delimiter, f\"{new_key}{delimiter}{i}\"))\n", | |
| " else:\n", | |
| " items[f\"{new_key}{delimiter}{i}\"] = str(item)\n", | |
| " else:\n", | |
| " items[new_key] = str(value)\n", | |
| " return items\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# Whoosh向けカスタムSudachiトークナイザー (必要に応じて再定義)\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# from sudachipy import tokenizer, dictionary # <-- SudachiTokenizerが必要ならインポート\n", | |
| "\n", | |
| "class SudachiTokenizer(Tokenizer):\n", | |
| " \"\"\"SudachiPyを利用したWhoosh互換のカスタムトークナイザー。\"\"\"\n", | |
| "\n", | |
| " MODE_STRINGS = {'A': tokenizer.Tokenizer.SplitMode.A,\n", | |
| " 'B': tokenizer.Tokenizer.SplitMode.B,\n", | |
| " 'C': tokenizer.Tokenizer.SplitMode.C}\n", | |
| "\n", | |
| " def __init__(self, mode: tokenizer.Tokenizer.SplitMode):\n", | |
| " self.mode_str = next((k for k, v in self.MODE_STRINGS.items() if v == mode), 'A')\n", | |
| " self.mode_obj = None\n", | |
| " # SudachiTokenizerクラスのインスタンスを作成する前にsudachipy.dictionaryとsudachipy.tokenizerをインポートする必要がある\n", | |
| " from sudachipy import dictionary, tokenizer\n", | |
| " self._load_sudachi()\n", | |
| "\n", | |
| " def _load_sudachi(self):\n", | |
| " from sudachipy import dictionary, tokenizer\n", | |
| " self.mode_obj = self.MODE_STRINGS[self.mode_str]\n", | |
| " self.dict = dictionary.Dictionary()\n", | |
| " self.tokenizer_obj = self.dict.create()\n", | |
| "\n", | |
| " def __getstate__(self):\n", | |
| " state = self.__dict__.copy()\n", | |
| " del state['dict']\n", | |
| " del state['tokenizer_obj']\n", | |
| " if 'mode_obj' in state:\n", | |
| " del state['mode_obj']\n", | |
| " return state\n", | |
| "\n", | |
| " def __setstate__(self, state):\n", | |
| " self.__dict__.update(state)\n", | |
| " self._load_sudachi()\n", | |
| "\n", | |
| " def __call__(self, value: str, positions: bool = False, chars: bool = False,\n", | |
| " keeporiginal: bool = False, start_pos: int = 0, start_char: int = 0,\n", | |
| " **kwargs) -> Generator[Token, None, None]:\n", | |
| "\n", | |
| " from sudachipy import tokenizer\n", | |
| " token = Token()\n", | |
| " morphemes = self.tokenizer_obj.tokenize(value, self.mode_obj)\n", | |
| "\n", | |
| " pos = start_pos\n", | |
| " char_pos = start_char\n", | |
| "\n", | |
| " for m in morphemes:\n", | |
| " surface = m.surface()\n", | |
| " token.text = surface\n", | |
| "\n", | |
| " if positions:\n", | |
| " token.pos = pos\n", | |
| " pos += 1\n", | |
| " if chars:\n", | |
| " token.startchar = char_pos\n", | |
| " token.endchar = char_pos + len(surface)\n", | |
| " char_pos = token.endchar\n", | |
| "\n", | |
| " yield token\n", | |
| "\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# Whooshスキーマの定義 (必要に応じて再定義)\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "def create_schema():\n", | |
| " \"\"\"Whooshインデックススキーマを定義する。\"\"\"\n", | |
| " # SudachiTokenizerが必要なので、そのクラス定義より後に配置\n", | |
| " from sudachipy import tokenizer\n", | |
| " return Schema(\n", | |
| " doc_id=ID(stored=True, unique=True),\n", | |
| " content_fts_b=TEXT(analyzer=SudachiTokenizer(mode=tokenizer.Tokenizer.SplitMode.B), stored=False),\n", | |
| " content_fts_c=TEXT(analyzer=SudachiTokenizer(mode=tokenizer.Tokenizer.SplitMode.C), stored=False),\n", | |
| " metadata_author_name=KEYWORD(stored=True, sortable=True),\n", | |
| " tags_list=KEYWORD(stored=False)\n", | |
| " )\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# インデックス作成処理 (必要に応じて再定義)\n", | |
| "# 前のセルで定義されている場合は不要ですが、セルの独立性を高めるために含めます。\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "def index_documents(index_dir: str, json_data_list: List[Dict[str, Any]]):\n", | |
| " \"\"\"JSONデータリストをWhooshインデックスに追加する。\"\"\"\n", | |
| " print(f\"--- インデックスディレクトリを準備: {index_dir} ---\")\n", | |
| " if os.path.exists(index_dir):\n", | |
| " shutil.rmtree(index_dir)\n", | |
| " os.makedirs(index_dir, exist_ok=True)\n", | |
| "\n", | |
| " schema = create_schema() # create_schemaが必要\n", | |
| " ix = index.create_in(index_dir, schema)\n", | |
| " writer = ix.writer()\n", | |
| "\n", | |
| " print(f\"--- {len(json_data_list)}件のドキュメントをインデックス中 ---\")\n", | |
| " for i, doc in enumerate(json_data_list):\n", | |
| " flattened = flatten_json(doc) # flatten_jsonが必要\n", | |
| "\n", | |
| " doc_id = flattened.get('id', str(i + 1))\n", | |
| " content = flattened.get('content', '')\n", | |
| "\n", | |
| " field_values = {\n", | |
| " 'doc_id': doc_id,\n", | |
| " 'content_fts_b': content,\n", | |
| " 'content_fts_c': content,\n", | |
| " 'metadata_author_name': flattened.get('metadata_author_name', ''),\n", | |
| " 'tags_list': flattened.get('tags_list', '')\n", | |
| " }\n", | |
| " writer.add_document(**field_values)\n", | |
| "\n", | |
| " writer.commit()\n", | |
| " print(\"--- インデックス作成完了 ---\")\n", | |
| "\n", | |
| "\n", | |
| "# 変更後: ixオブジェクトを引数で受け取る\n", | |
| "def search_index(ix: index.Index, query_str: str, author_filter: str = None, limit: int = 5) -> List[Tuple[str, float, str]]:\n", | |
| "\n", | |
| " # 変更点: ix.open_dir() を削除 (呼び出し元で管理)\n", | |
| "\n", | |
| " field_boosts = {'content_fts_b': 1.0, 'content_fts_c': 2.0}\n", | |
| "\n", | |
| " # ix.searcher() は毎回呼び出す (これが軽量なSearcherオブジェクトを生成する正しい方法)\n", | |
| " with ix.searcher(weighting=BM25F(field_boosts=field_boosts)) as searcher:\n", | |
| " parser = MultifieldParser(['content_fts_b', 'content_fts_c'], ix.schema)\n", | |
| "\n", | |
| " try:\n", | |
| " query = parser.parse(query_str)\n", | |
| " except Exception as e:\n", | |
| " print(f\"クエリ解析エラー: {e}\")\n", | |
| " return []\n", | |
| "\n", | |
| " if author_filter:\n", | |
| " author_query = Term('metadata_author_name', author_filter)\n", | |
| " query = And([query, author_query])\n", | |
| " print(f\" [複合クエリ実行] FTS: '{query_str}' AND 著者: '{author_filter}'\")\n", | |
| " else:\n", | |
| " print(f\" [FTS実行] クエリ: '{query_str}'\")\n", | |
| "\n", | |
| " results = searcher.search(query, limit=limit)\n", | |
| "\n", | |
| " output: List[Tuple[str, float, str]] = []\n", | |
| " for hit in results:\n", | |
| " doc_id = hit['doc_id']\n", | |
| " score = hit.score\n", | |
| " author = hit.get('metadata_author_name', 'N/A')\n", | |
| " output.append((doc_id, score, author))\n", | |
| "\n", | |
| " return output\n", | |
| "\n", | |
| "# --- メイン実行ブロック (アプリケーションの起動時を想定したColabテスト用) ---\n", | |
| "\n", | |
| "INDEX_DIR = \"whoosh_sudachi_index\"\n", | |
| "\n", | |
| "# sample_data は前のセルで定義されていることを想定\n", | |
| "# セルの独立性を高めるため、ここで再度定義することも検討\n", | |
| "sample_data = [\n", | |
| " {\n", | |
| " \"id\": \"1\",\n", | |
| " \"content\": \"東京タワーは日本を代表する観光地です。正式名称は日本電波塔。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author A\"}},\n", | |
| " \"tags\": [\"tourism\", \"Japan\"]\n", | |
| " },\n", | |
| " {\n", | |
| " \"id\": \"2\",\n", | |
| " \"content\": \"消費者安全調査委員会の報告書は、複合的な問題解決を示唆する。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author B\"}},\n", | |
| " \"tags\": [\"government\", \"safety\"]\n", | |
| " },\n", | |
| " {\n", | |
| " \"id\":\"3\",\n", | |
| " \"content\": \"日本電波塔の公式見解が発表された。展望台の改修計画。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author A\"}},\n", | |
| " \"tags\": [\"tourism\", \"news\"]\n", | |
| " },\n", | |
| " {\n", | |
| " \"id\": \"4\",\n", | |
| " \"content\": \"新しい観光ルートの開発に関する会議が開かれました。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author C\"}},\n", | |
| " \"tags\": [\"tourism\", \"planning\"]\n", | |
| " },\n", | |
| " {\n", | |
| " \"id\": \"5\",\n", | |
| " \"content\": \"安全基準の見直しについて、政府機関が検討を開始。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author B\"}},\n", | |
| " \"tags\": [\"government\", \"safety\", \"regulation\"]\n", | |
| " }\n", | |
| "]\n", | |
| "\n", | |
| "\n", | |
| "# 1. インデックス作成 (Colabでこのセルを最初に実行する際に一度実行)\n", | |
| "# アプリケーション起動時のインデックス作成/更新処理をシミュレート\n", | |
| "# 既にインデックスがある場合は削除して再作成\n", | |
| "print(\"==============================================\")\n", | |
| "print(\" 1. インデックス作成\")\n", | |
| "print(\"==============================================\")\n", | |
| "index_documents(INDEX_DIR, sample_data) # <- コメントアウトを解除\n", | |
| "\n", | |
| "# 2. 検索の準備 (アプリケーション起動時に一度だけ実行)\n", | |
| "# インデックスオブジェクトをメモリ上に保持\n", | |
| "print(\"--- アプリケーション起動: インデックスをオープンします ---\")\n", | |
| "main_ix = None # main_ix を初期化\n", | |
| "\n", | |
| "# 既にインデックスが開かれているかチェック (Colabでセルを複数回実行する場合を考慮)\n", | |
| "# Whooshのインデックスオブジェクトは、ディレクトリが存在し、有効なインデックスファイルがある場合にのみ開けます。\n", | |
| "# index_documents() で毎回再作成されるため、このチェックは Colab 環境での複数回実行に対応するために残します。\n", | |
| "try:\n", | |
| " # このixオブジェクトをメモリ上に保持し、検索リクエストのたびに使い回す\n", | |
| " main_ix = index.open_dir(INDEX_DIR)\n", | |
| " print(f\"--- インデックス '{INDEX_DIR}' をオープンしました ---\")\n", | |
| "except index.EmptyIndexError:\n", | |
| " print(f\"エラー: インデックスディレクトリ '{INDEX_DIR}' が見つからないか空です。\")\n", | |
| " print(\"先にインデックス作成処理を実行してください。\")\n", | |
| " # ここで処理を中断するか、インデックス作成関数を呼び出すなど、適切なエラーハンドリングが必要です。\n", | |
| " # 今回はColabテストのため、エラーメッセージを出力するだけにします。\n", | |
| " main_ix = None # エラーが発生した場合、main_ix を None に設定\n", | |
| "except Exception as e:\n", | |
| " print(f\"インデックスオープン中にエラーが発生しました: {e}\")\n", | |
| " main_ix = None # エラーが発生した場合、main_ix を None に設定\n", | |
| "\n", | |
| "\n", | |
| "# 3. 検索リクエストの処理 (例)\n", | |
| "print(\"\\n--- 検索リクエスト処理 ---\")\n", | |
| "if main_ix: # main_ix が正常にオープンできた場合のみ検索を実行\n", | |
| " results_a = search_index(main_ix, \"日本電波塔\") # ixオブジェクトを渡す\n", | |
| " print(\"[Test 1] 検索: '日本電波塔'\")\n", | |
| " for doc_id, score, author in results_a:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, 著者: {author}\")\n", | |
| "\n", | |
| " results_b = search_index(main_ix, \"消費者\") # ixオブジェクトを渡す\n", | |
| " print(\"\\n[Test 2] 検索: '消費者'\")\n", | |
| " for doc_id, score, author in results_b:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, 著者: {author}\")\n", | |
| "\n", | |
| " # 他の検索テストもここに追加可能\n", | |
| " results_c = search_index(main_ix, \"報告書\", author_filter=\"Author B\")\n", | |
| " print(\"\\n[Test 3] 検索: '報告書' AND 著者: 'Author B'\")\n", | |
| " for doc_id, score, author in results_c:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, 著者: {author}\")\n", | |
| "\n", | |
| "else:\n", | |
| " print(\"\\n--- 検索スキップ: インデックスが正常にオープンできませんでした ---\")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "NkjqAFK_7Rft", | |
| "outputId": "85a1dd59-9e90-4b05-f069-6d2f6ebe1fa9" | |
| }, | |
| "execution_count": 10, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "==============================================\n", | |
| " 1. インデックス作成\n", | |
| "==============================================\n", | |
| "--- インデックスディレクトリを準備: whoosh_sudachi_index ---\n", | |
| "--- 5件のドキュメントをインデックス中 ---\n", | |
| "--- インデックス作成完了 ---\n", | |
| "--- アプリケーション起動: インデックスをオープンします ---\n", | |
| "--- インデックス 'whoosh_sudachi_index' をオープンしました ---\n", | |
| "\n", | |
| "--- 検索リクエスト処理 ---\n", | |
| " [FTS実行] クエリ: '日本電波塔'\n", | |
| "[Test 1] 検索: '日本電波塔'\n", | |
| " - DOC ID: 1, Score: 7.0837, 著者: Author A\n", | |
| " - DOC ID: 3, Score: 5.9482, 著者: Author A\n", | |
| " [FTS実行] クエリ: '消費者'\n", | |
| "\n", | |
| "[Test 2] 検索: '消費者'\n", | |
| " - DOC ID: 2, Score: 3.7243, 著者: Author B\n", | |
| " [複合クエリ実行] FTS: '報告書' AND 著者: 'Author B'\n", | |
| "\n", | |
| "[Test 3] 検索: '報告書' AND 著者: 'Author B'\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# A,B,Cモードをすべて利用\n" | |
| ], | |
| "metadata": { | |
| "id": "kHCDhqYADp56" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 1. 必要なライブラリのインストール (Colabで最初に実行してください)\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "!pip install whoosh-reloaded sudachipy sudachidict_core\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 2. ライブラリのインポートと設定\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "import os\n", | |
| "import shutil\n", | |
| "import json\n", | |
| "import logging\n", | |
| "from typing import Generator, Dict, Any, List, Tuple\n", | |
| "\n", | |
| "# Whoosh imports\n", | |
| "from whoosh import index\n", | |
| "from whoosh.fields import Schema, ID, TEXT, KEYWORD\n", | |
| "from whoosh.analysis import Tokenizer, Token\n", | |
| "from whoosh.qparser import MultifieldParser\n", | |
| "from whoosh.query import And, Term\n", | |
| "from whoosh.scoring import BM25F\n", | |
| "\n", | |
| "# Sudachi imports\n", | |
| "from sudachipy import tokenizer, dictionary\n", | |
| "\n", | |
| "# ロギング設定 (実運用向け)\n", | |
| "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 3. JSONデータの平坦化関数\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# ... (flatten_json関数は省略。変更なし) ...\n", | |
| "def flatten_json(data: Dict[str, Any], delimiter: str = '_', parent_key: str = '') -> Dict[str, str]:\n", | |
| " \"\"\"ネストされたJSON辞書をフラットなキー/値のペアに変換する。\"\"\"\n", | |
| " items: Dict[str, str] = {}\n", | |
| " for key, value in data.items():\n", | |
| " new_key = f\"{parent_key}{delimiter}{key}\" if parent_key else key\n", | |
| "\n", | |
| " if isinstance(value, dict):\n", | |
| " items.update(flatten_json(value, delimiter, new_key))\n", | |
| " elif isinstance(value, list):\n", | |
| " for i, item in enumerate(value):\n", | |
| " if isinstance(item, dict):\n", | |
| " items.update(flatten_json(item, delimiter, f\"{new_key}{delimiter}{i}\"))\n", | |
| " else:\n", | |
| " items[f\"{new_key}{delimiter}{i}\"] = str(item)\n", | |
| " else:\n", | |
| " items[new_key] = str(value)\n", | |
| " return items\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 4. Whoosh向けカスタムSudachiトークナイザー (Pickleエラー対策済み)\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "class SudachiTokenizer(Tokenizer):\n", | |
| " \"\"\"SudachiPyを利用したWhoosh互換のカスタムトークナイザー。\"\"\"\n", | |
| "\n", | |
| " MODE_STRINGS = {'A': tokenizer.Tokenizer.SplitMode.A,\n", | |
| " 'B': tokenizer.Tokenizer.SplitMode.B,\n", | |
| " 'C': tokenizer.Tokenizer.SplitMode.C}\n", | |
| "\n", | |
| " def __init__(self, mode: tokenizer.Tokenizer.SplitMode):\n", | |
| " self.mode_str = next((k for k, v in self.MODE_STRINGS.items() if v == mode), 'A')\n", | |
| " self.mode_obj = None\n", | |
| " self._load_sudachi()\n", | |
| "\n", | |
| " def _load_sudachi(self):\n", | |
| " \"\"\"辞書とトークナイザをロードまたは再ロードする内部メソッド。\"\"\"\n", | |
| " self.mode_obj = self.MODE_STRINGS[self.mode_str]\n", | |
| " self.dict = dictionary.Dictionary()\n", | |
| " self.tokenizer_obj = self.dict.create()\n", | |
| "\n", | |
| " def __getstate__(self):\n", | |
| " state = self.__dict__.copy()\n", | |
| " del state['dict']\n", | |
| " del state['tokenizer_obj']\n", | |
| " if 'mode_obj' in state:\n", | |
| " del state['mode_obj']\n", | |
| " return state\n", | |
| "\n", | |
| " def __setstate__(self, state):\n", | |
| " self.__dict__.update(state)\n", | |
| " self._load_sudachi()\n", | |
| "\n", | |
| " def __call__(self, value: str, positions: bool = False, chars: bool = False,\n", | |
| " keeporiginal: bool = False, start_pos: int = 0, start_char: int = 0,\n", | |
| " **kwargs) -> Generator[Token, None, None]: # 👈 **kwargsで予期せぬ引数を吸収\n", | |
| " token = Token()\n", | |
| " morphemes = self.tokenizer_obj.tokenize(value, self.mode_obj)\n", | |
| "\n", | |
| " pos = start_pos\n", | |
| " char_pos = start_char\n", | |
| "\n", | |
| " for m in morphemes:\n", | |
| " surface = m.surface()\n", | |
| " token.text = surface\n", | |
| "\n", | |
| " if positions:\n", | |
| " token.pos = pos\n", | |
| " pos += 1\n", | |
| " if chars:\n", | |
| " token.startchar = char_pos\n", | |
| " token.endchar = char_pos + len(surface)\n", | |
| " char_pos = token.endchar\n", | |
| "\n", | |
| " yield token\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 5. Whooshスキーマの定義 (A, B, Cモード複合化)\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "def create_schema():\n", | |
| " \"\"\"Whooshインデックススキーマを定義する。\"\"\"\n", | |
| " return Schema(\n", | |
| " doc_id=ID(stored=True, unique=True),\n", | |
| " # 🚨 Aモード追加: 再現率特化\n", | |
| " content_fts_a=TEXT(analyzer=SudachiTokenizer(mode=tokenizer.Tokenizer.SplitMode.A), stored=False),\n", | |
| " # Bモード: バランス\n", | |
| " content_fts_b=TEXT(analyzer=SudachiTokenizer(mode=tokenizer.Tokenizer.SplitMode.B), stored=False),\n", | |
| " # Cモード: 精度特化\n", | |
| " content_fts_c=TEXT(analyzer=SudachiTokenizer(mode=tokenizer.Tokenizer.SplitMode.C), stored=False),\n", | |
| " metadata_author_name=KEYWORD(stored=True, sortable=True),\n", | |
| " tags_list=KEYWORD(stored=False)\n", | |
| " )\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 6. インデックス作成処理 (実運用向け並列処理を追加)\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "def index_documents(index_dir: str, json_data_list: List[Dict[str, Any]], procs: int = 4):\n", | |
| " \"\"\"JSONデータリストをWhooshインデックスに追加する。(実運用向け並列処理対応)\"\"\"\n", | |
| " logging.info(f\"インデックスディレクトリを準備: {index_dir}\")\n", | |
| " if os.path.exists(index_dir):\n", | |
| " shutil.rmtree(index_dir)\n", | |
| " os.makedirs(index_dir, exist_ok=True)\n", | |
| "\n", | |
| " schema = create_schema()\n", | |
| " ix = index.create_in(index_dir, schema)\n", | |
| "\n", | |
| " # 🚨 実運用向け: マルチプロセスライターを使用し、CPUコア数に応じた並列書き込みを行う\n", | |
| " writer = ix.writer(procs=procs, limitmb=256, multisegment=True)\n", | |
| "\n", | |
| " logging.info(f\"{len(json_data_list)}件のドキュメントをインデックス中 (並列処理/procs={procs})\")\n", | |
| " for i, doc in enumerate(json_data_list):\n", | |
| " flattened = flatten_json(doc)\n", | |
| "\n", | |
| " doc_id = flattened.get('id', str(i + 1))\n", | |
| " content = flattened.get('content', '')\n", | |
| "\n", | |
| " field_values = {\n", | |
| " 'doc_id': doc_id,\n", | |
| " 'content_fts_a': content, # 👈 Aモード追加\n", | |
| " 'content_fts_b': content,\n", | |
| " 'content_fts_c': content,\n", | |
| " 'metadata_author_name': flattened.get('metadata_author_name', ''),\n", | |
| " 'tags_list': flattened.get('tags_list', '')\n", | |
| " }\n", | |
| " writer.add_document(**field_values)\n", | |
| "\n", | |
| " writer.commit()\n", | |
| " logging.info(\"インデックス作成完了\")\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 7. 検索実行処理 (A, B, Cモード複合化とインデックス引数化)\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "def search_index(ix: index.Index, query_str: str, author_filter: str = None, limit: int = 5) -> List[Tuple[str, float, str]]:\n", | |
| " \"\"\"Whooshインデックスを検索し、結果を返す。(インデックスオブジェクトを直接使用)\"\"\"\n", | |
| "\n", | |
| " # 🚨 A, B, Cモードのブースト戦略\n", | |
| " field_boosts = {\n", | |
| " 'content_fts_a': 0.8, # 再現率重視(低重み)\n", | |
| " 'content_fts_b': 1.5, # バランス\n", | |
| " 'content_fts_c': 2.5 # 精度重視(高重み)\n", | |
| " }\n", | |
| "\n", | |
| " with ix.searcher(weighting=BM25F(field_boosts=field_boosts)) as searcher:\n", | |
| " # 🚨 検索対象フィールドにAモードを追加\n", | |
| " parser = MultifieldParser(\n", | |
| " ['content_fts_a', 'content_fts_b', 'content_fts_c'],\n", | |
| " ix.schema\n", | |
| " )\n", | |
| "\n", | |
| " try:\n", | |
| " query = parser.parse(query_str)\n", | |
| " except Exception as e:\n", | |
| " logging.error(f\"クエリ解析エラー: {e}\")\n", | |
| " return []\n", | |
| "\n", | |
| " if author_filter:\n", | |
| " author_query = Term('metadata_author_name', author_filter)\n", | |
| " query = And([query, author_query])\n", | |
| " logging.info(f\"[複合クエリ実行] FTS: '{query_str}' AND 著者: '{author_filter}'\")\n", | |
| " else:\n", | |
| " logging.info(f\"[FTS実行] クエリ: '{query_str}'\")\n", | |
| "\n", | |
| " results = searcher.search(query, limit=limit)\n", | |
| "\n", | |
| " output: List[Tuple[str, float, str]] = []\n", | |
| " for hit in results:\n", | |
| " doc_id = hit['doc_id']\n", | |
| " score = hit.score\n", | |
| " author = hit.get('metadata_author_name', 'N/A')\n", | |
| " output.append((doc_id, score, author))\n", | |
| "\n", | |
| " return output\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 8. メイン実行ブロック (実運用起動フローを模倣)\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "if __name__ == \"__main__\":\n", | |
| "\n", | |
| " sample_data = [\n", | |
| " {\n", | |
| " \"id\": \"1\",\n", | |
| " \"content\": \"東京タワーは日本を代表する観光地です。正式名称は日本電波塔。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author A\"}},\n", | |
| " \"tags\": [\"tourism\", \"Japan\"]\n", | |
| " },\n", | |
| " {\n", | |
| " \"id\": \"2\",\n", | |
| " \"content\": \"消費者安全調査委員会の報告書は、複合的な問題解決を示唆する。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author B\"}},\n", | |
| " \"tags\": [\"government\", \"safety\"]\n", | |
| " },\n", | |
| " {\n", | |
| " \"id\":\"3\",\n", | |
| " \"content\": \"日本電波塔の公式見解が発表された。展望台の改修計画。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author A\"}},\n", | |
| " \"tags\": [\"tourism\", \"news\"]\n", | |
| " },\n", | |
| " {\n", | |
| " \"id\": \"4\",\n", | |
| " \"content\": \"新しい観光ルートの開発に関する会議が開かれました。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author C\"}},\n", | |
| " \"tags\": [\"tourism\", \"planning\"]\n", | |
| " },\n", | |
| " {\n", | |
| " \"id\": \"5\",\n", | |
| " \"content\": \"安全基準の見直しについて、政府機関が検討を開始。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author B\"}},\n", | |
| " \"tags\": [\"government\", \"safety\", \"regulation\"]\n", | |
| " },\n", | |
| " {\n", | |
| " \"id\": \"6\",\n", | |
| " \"content\": \"環境問題に関する国際会議が開催され、持続可能な開発目標について議論されました。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author D\"}},\n", | |
| " \"tags\": [\"environment\", \"international\", \"SDGs\"]\n", | |
| " },\n", | |
| " {\n", | |
| " \"id\": \"7\",\n", | |
| " \"content\": \"最新技術を用いた農業の効率化についての研究論文が発表されました。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author E\"}},\n", | |
| " \"tags\": [\"technology\", \"agriculture\", \"research\"]\n", | |
| " },\n", | |
| " {\n", | |
| " \"id\": \"8\",\n", | |
| " \"content\": \"地域経済の活性化に向けた新しいプロジェクトが始動しました。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author C\"}},\n", | |
| " \"tags\": [\"economy\", \"local\", \"project\"]\n", | |
| " }\n", | |
| " ]\n", | |
| "\n", | |
| " INDEX_DIR = \"whoosh_sudachi_index\"\n", | |
| "\n", | |
| " # 1. インデックスの作成とドキュメントの追加 (バッチ処理を想定)\n", | |
| " print(\"==============================================\")\n", | |
| " print(\" 1. インデックス作成\")\n", | |
| " print(\"==============================================\")\n", | |
| " # CPUコア数に合わせてprocsを変更してください。Colabでは通常2-4コアです。\n", | |
| " index_documents(INDEX_DIR, sample_data, procs=4)\n", | |
| "\n", | |
| " # 2. アプリケーション起動時にインデックスをメモリにオープン (高速化のため)\n", | |
| " print(\"\\n==============================================\")\n", | |
| " print(\" 2. アプリケーション起動: インデックスオープン\")\n", | |
| " print(\"==============================================\")\n", | |
| " try:\n", | |
| " main_ix = index.open_dir(INDEX_DIR)\n", | |
| " logging.info(\"インデックスが正常にオープンされました。\")\n", | |
| " except index.EmptyIndexError:\n", | |
| " logging.error(f\"インデックスディレクトリ '{INDEX_DIR}' が見つかりません。\")\n", | |
| " exit()\n", | |
| "\n", | |
| " # 3. 検索テストの実行 (ixオブジェクトを渡す)\n", | |
| " print(\"\\n==============================================\")\n", | |
| " print(\" 3. 検索テスト開始\")\n", | |
| " print(\"==============================================\")\n", | |
| "\n", | |
| " # Test 1: 固有表現 (Cモード) の正確な一致\n", | |
| " query_a = \"日本電波塔\"\n", | |
| " results_a = search_index(main_ix, query_a)\n", | |
| " print(f\"\\n[Test 1] 検索: '{query_a}'\")\n", | |
| " for doc_id, score, author in results_a:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, 著者: {author}\")\n", | |
| "\n", | |
| " # Test 2: 複合語の部分一致 (A, Bモードの貢献を確認)\n", | |
| " query_b = \"安全調査\" # \"消費者安全調査委員会\"に含まれる複合語\n", | |
| " results_b = search_index(main_ix, query_b)\n", | |
| " print(f\"\\n[Test 2] 検索: '{query_b}'\")\n", | |
| " for doc_id, score, author in results_b:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, 著者: {author}\")\n", | |
| "\n", | |
| " # Test 3: FTSと構造化フィルタリングの複合クエリ\n", | |
| " query_c = \"報告書\"\n", | |
| " filter_author = \"Author B\"\n", | |
| " results_c = search_index(main_ix, query_c, author_filter=filter_author)\n", | |
| " print(f\"\\n[Test 3] 検索: '{query_c}' AND 著者: '{filter_author}'\")\n", | |
| " for doc_id, score, author in results_c:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, 著者: {author}\")\n", | |
| "\n", | |
| " # Test 4: 新しいデータに対する検索 (観光ルート)\n", | |
| " query_d = \"観光ルート\"\n", | |
| " results_d = search_index(main_ix, query_d)\n", | |
| " print(f\"\\n[Test 4] 検索: '{query_d}'\")\n", | |
| " for doc_id, score, author in results_d:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, 著者: {author}\")\n", | |
| "\n", | |
| " # Test 5: 複数のキーワードを含む検索 (安全基準 政府機関)\n", | |
| " query_e = \"安全基準 政府機関\"\n", | |
| " results_e = search_index(main_ix, query_e)\n", | |
| " print(f\"\\n[Test 5] 検索: '{query_e}'\")\n", | |
| " for doc_id, score, author in results_e:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, 著者: {author}\")\n", | |
| "\n", | |
| " # 🚨 追加する新しい検索テスト 🚨\n", | |
| "\n", | |
| " # Test 6: 新しいデータに対する検索 (環境問題 国際会議)\n", | |
| " query_f = \"環境問題 国際会議\"\n", | |
| " results_f = search_index(main_ix, query_f)\n", | |
| " print(f\"\\n[Test 6] 検索: '{query_f}'\")\n", | |
| " for doc_id, score, author in results_f:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, 著者: {author}\")\n", | |
| "\n", | |
| " # Test 7: 新しいデータに対する検索 (農業 効率化)\n", | |
| " query_g = \"農業 効率化\"\n", | |
| " results_g = search_index(main_ix, query_g)\n", | |
| " print(f\"\\n[Test 7] 検索: '{query_g}'\")\n", | |
| " for doc_id, score, author in results_g:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, 著者: {author}\")\n", | |
| "\n", | |
| " # Test 8: 著者フィルタリングと新しいデータに対するFTS (地域経済 Author C)\n", | |
| " query_h = \"地域経済\"\n", | |
| " filter_author_h = \"Author C\"\n", | |
| " results_h = search_index(main_ix, query_h, author_filter=filter_author_h)\n", | |
| " print(f\"\\n[Test 8] 検索: '{query_h}' AND 著者: '{filter_author_h}'\")\n", | |
| " for doc_id, score, author in results_h:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, 著者: {author}\")\n", | |
| "\n", | |
| "\n", | |
| " # 4. クリーンアップ\n", | |
| " print(\"\\n==============================================\")\n", | |
| " print(\" 4. クリーンアップ\")\n", | |
| " print(\"==============================================\")\n", | |
| " # 実際の運用では、アプリケーションを停止する際にインデックスをクローズする処理が必要です。\n", | |
| " # Colabでの一時ファイル削除\n", | |
| " if os.path.exists(INDEX_DIR):\n", | |
| " shutil.rmtree(INDEX_DIR)\n", | |
| " logging.info(f\"インデックスディレクトリ '{INDEX_DIR}' を削除しました\")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "G14ix-2nDuCT", | |
| "outputId": "977b8072-532e-4661-94b3-9ecee7074698" | |
| }, | |
| "execution_count": 13, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Requirement already satisfied: whoosh-reloaded in /usr/local/lib/python3.12/dist-packages (2.7.5)\n", | |
| "Requirement already satisfied: sudachipy in /usr/local/lib/python3.12/dist-packages (0.6.10)\n", | |
| "Requirement already satisfied: sudachidict_core in /usr/local/lib/python3.12/dist-packages (20250825)\n", | |
| "Requirement already satisfied: cached-property in /usr/local/lib/python3.12/dist-packages (from whoosh-reloaded) (2.0.1)\n", | |
| "==============================================\n", | |
| " 1. インデックス作成\n", | |
| "==============================================\n", | |
| "\n", | |
| "==============================================\n", | |
| " 2. アプリケーション起動: インデックスオープン\n", | |
| "==============================================\n", | |
| "\n", | |
| "==============================================\n", | |
| " 3. 検索テスト開始\n", | |
| "==============================================\n", | |
| "\n", | |
| "[Test 1] 検索: '日本電波塔'\n", | |
| " - DOC ID: 1, Score: 16.3936, 著者: Author A\n", | |
| " - DOC ID: 3, Score: 14.1742, 著者: Author A\n", | |
| "\n", | |
| "[Test 2] 検索: '安全調査'\n", | |
| " - DOC ID: 2, Score: 13.1207, 著者: Author B\n", | |
| "\n", | |
| "[Test 3] 検索: '報告書' AND 著者: 'Author B'\n", | |
| "\n", | |
| "[Test 4] 検索: '観光ルート'\n", | |
| " - DOC ID: 4, Score: 15.1919, 著者: Author C\n", | |
| "\n", | |
| "[Test 5] 検索: '安全基準 政府機関'\n", | |
| " - DOC ID: 5, Score: 28.5122, 著者: Author B\n", | |
| "\n", | |
| "[Test 6] 検索: '環境問題 国際会議'\n", | |
| " - DOC ID: 6, Score: 22.7340, 著者: Author D\n", | |
| "\n", | |
| "[Test 7] 検索: '農業 効率化'\n", | |
| " - DOC ID: 7, Score: 14.9077, 著者: Author E\n", | |
| "\n", | |
| "[Test 8] 検索: '地域経済' AND 著者: 'Author C'\n", | |
| "\n", | |
| "==============================================\n", | |
| " 4. クリーンアップ\n", | |
| "==============================================\n" | |
| ] | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment