Created
October 26, 2025 14:55
whoosh-reloaded+sudachi_sample.ipynb
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyNsHKpyVe2zuUuJDJGCs2Vf",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/chottokun/f57cacfab66a9dc2115ebbc34d41d0e3/whoosh-reloaded-sudachi_sample.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
| "# ---------------------------------------------------------------------------------\n", | |
| "# 1. 必要なライブラリのインストール\n", | |
| "# Colab環境で以下のセルを実行してください\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "!pip install whoosh-reloaded sudachipy sudachidict_core\n" | |
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "jdiVCe8v-exO",
"outputId": "779b7b43-1be2-4ddf-aa52-bce766018ca9"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting whoosh-reloaded\n",
"  Downloading Whoosh_Reloaded-2.7.5-py2.py3-none-any.whl.metadata (4.4 kB)\n",
"Collecting sudachipy\n",
"  Downloading SudachiPy-0.6.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
"Collecting sudachidict_core\n",
"  Downloading sudachidict_core-20250825-py3-none-any.whl.metadata (2.7 kB)\n",
"Collecting cached-property (from whoosh-reloaded)\n",
"  Downloading cached_property-2.0.1-py3-none-any.whl.metadata (10 kB)\n",
"Downloading Whoosh_Reloaded-2.7.5-py2.py3-none-any.whl (551 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m551.8/551.8 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading SudachiPy-0.6.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m43.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading sudachidict_core-20250825-py3-none-any.whl (72.2 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m72.2/72.2 MB\u001b[0m \u001b[31m12.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading cached_property-2.0.1-py3-none-any.whl (7.4 kB)\n",
"Installing collected packages: sudachipy, sudachidict_core, cached-property, whoosh-reloaded\n",
"Successfully installed cached-property-2.0.1 sudachidict_core-20250825 sudachipy-0.6.10 whoosh-reloaded-2.7.5\n"
]
}
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8JEC2eLv5Tzw",
"outputId": "a78f54a0-b882-4e0c-c758-d167435ed54f"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: whoosh-reloaded in /usr/local/lib/python3.12/dist-packages (2.7.5)\n",
"Requirement already satisfied: sudachipy in /usr/local/lib/python3.12/dist-packages (0.6.10)\n",
"Requirement already satisfied: sudachidict_core in /usr/local/lib/python3.12/dist-packages (20250825)\n",
"Requirement already satisfied: cached-property in /usr/local/lib/python3.12/dist-packages (from whoosh-reloaded) (2.0.1)\n",
"--- Preparing index directory: whoosh_sudachi_index ---\n",
"--- Indexing 3 documents ---\n",
"--- Indexing complete ---\n",
"\n",
"==============================================\n",
" Search Test Start\n",
"==============================================\n",
" FTS: '日本電波塔'\n",
"\n",
" Search: '日本電波塔'\n",
" - DOC ID: 1, Score: 4.7495, Author: Author A\n",
" - DOC ID: 3, Score: 3.9996, Author: Author A\n",
" FTS: '消費者'\n",
"\n",
" Search: '消費者'\n",
" - DOC ID: 2, Score: 2.7757, Author: Author B\n",
" [Compound query executed] FTS: '報告書' AND Author: 'Author B'\n",
"\n",
" Search: '報告書' AND Author: 'Author B'\n",
" FTS: '電波塔 AND 東京タワー'\n",
"\n",
" Search: '電波塔 AND 東京タワー'\n",
" - DOC ID: 1, Score: 7.6211, Author: Author A\n",
"\n",
"--- Deleted index directory 'whoosh_sudachi_index' ---\n"
]
}
],
| "source": [ | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 1. 必要なライブラリのインストール\n", | |
| "# Colab環境で以下のセルを実行してください\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "!pip install whoosh-reloaded sudachipy sudachidict_core\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 2. ライブラリのインポートと設定\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "import os\n", | |
| "import shutil\n", | |
| "import json\n", | |
| "from typing import Generator, Dict, Any, List, Tuple\n", | |
| "\n", | |
| "# Whoosh imports\n", | |
| "from whoosh import index\n", | |
| "from whoosh.fields import Schema, ID, TEXT, KEYWORD\n", | |
| "from whoosh.analysis import Tokenizer, Token\n", | |
| "from whoosh.qparser import MultifieldParser\n", | |
| "from whoosh.query import And, Term\n", | |
| "from whoosh.scoring import BM25F\n", | |
| "\n", | |
| "# Sudachi imports\n", | |
| "from sudachipy import tokenizer, dictionary\n", | |
| "# Correct way to import SplitMode\n", | |
| "# from sudachipy.tokenizer import SplitMode # Import SplitMode directly\n", | |
| "\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 3. JSONデータの平坦化関数\n", | |
| "# Whooshのフラットなスキーマに対応させるため、ネストされたJSONを平坦化します。\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "def flatten_json(data: Dict[str, Any], delimiter: str = '_', parent_key: str = '') -> Dict[str, str]:\n", | |
| " \"\"\"ネストされたJSON辞書をフラットなキー/値のペアに変換する。\"\"\"\n", | |
| " items: Dict[str, str] = {}\n", | |
| " for key, value in data.items():\n", | |
| " # キーを結合 (例: metadata_author_name)\n", | |
| " new_key = f\"{parent_key}{delimiter}{key}\" if parent_key else key\n", | |
| "\n", | |
| " if isinstance(value, dict):\n", | |
| " items.update(flatten_json(value, delimiter, new_key))\n", | |
| " elif isinstance(value, list):\n", | |
| " # 配列内の要素を処理(この例では辞書を再帰的に処理)\n", | |
| " for i, item in enumerate(value):\n", | |
| " if isinstance(item, dict):\n", | |
| " items.update(flatten_json(item, delimiter, f\"{new_key}{delimiter}{i}\"))\n", | |
| " else:\n", | |
| " items[f\"{new_key}{delimiter}{i}\"] = str(item)\n", | |
| " else:\n", | |
| " # 検索対象とするテキストフィールドは 'content' または 'description' が前提\n", | |
| " items[new_key] = str(value)\n", | |
| " return items\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 4. Whoosh向けカスタムSudachiトークナイザー\n", | |
| "# Sudachiの高性能な形態素解析をWhooshの分析パイプラインに組み込む。\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "class SudachiTokenizer(Tokenizer):\n", | |
| " \"\"\"SudachiPyを利用したWhoosh互換のカスタムトークナイザー。\"\"\"\n", | |
| " def __init__(self, mode: tokenizer.Tokenizer.SplitMode): # Use imported SplitMode\n", | |
| " # 辞書の初期化はインスタンスごとに一度だけ行う\n", | |
| " self.mode = mode\n", | |
| " self.dict = dictionary.Dictionary()\n", | |
| " self.tokenizer_obj = self.dict.create()\n", | |
| "\n", | |
| "\n", | |
| " def __getstate__(self):\n", | |
| " # Return a dictionary of attributes to pickle, excluding non-picklable ones\n", | |
| " # Store the mode name as a string\n", | |
| " mode_string = \"\"\n", | |
| " if self.mode == tokenizer.Tokenizer.SplitMode.A: # Use imported SplitMode\n", | |
| " mode_string = \"A\"\n", | |
| " elif self.mode == tokenizer.Tokenizer.SplitMode.B: # Use imported SplitMode\n", | |
| " mode_string = \"B\"\n", | |
| " elif self.mode == tokenizer.Tokenizer.SplitMode.C: # Use imported SplitMode\n", | |
| " mode_string = \"C\"\n", | |
| " return {'mode_string': mode_string}\n", | |
| "\n", | |
| " def __setstate__(self, state):\n", | |
| " # Restore the state from the pickled attributes\n", | |
| " mode_string = state['mode_string']\n", | |
| " if mode_string == \"A\":\n", | |
| " self.mode = tokenizer.Tokenizer.SplitMode.A # Use imported SplitMode\n", | |
| " elif mode_string == \"B\":\n", | |
| " self.mode = tokenizer.Tokenizer.SplitMode.B # Use imported SplitMode\n", | |
| " elif mode_string == \"C\":\n", | |
| " self.mode = tokenizer.Tokenizer.SplitMode.C # Use imported SplitMode\n", | |
| " else:\n", | |
| " # Default to B mode if something goes wrong\n", | |
| " self.mode = tokenizer.Tokenizer.SplitMode.B # Use imported SplitMode\n", | |
| "\n", | |
| " # Re-initialize the non-picklable objects\n", | |
| " self.dict = dictionary.Dictionary()\n", | |
| " self.tokenizer_obj = self.dict.create()\n", | |
| "\n", | |
| "\n", | |
| " def __call__(self, value: str, positions: bool = False, chars: bool = False,\n", | |
| " keeporiginal: bool = False, start_pos: int = 0, start_char: int = 0, **kwargs) -> Generator:\n", | |
| " \"\"\"入力文字列をSudachiでトークン化し、WhooshのTokenオブジェクトを生成する。\"\"\"\n", | |
| " token = Token()\n", | |
| " # Specified split mode (A/B/C) is used internally from self.mode\n", | |
| " morphemes = self.tokenizer_obj.tokenize(value, self.mode)\n", | |
| "\n", | |
| " pos = start_pos\n", | |
| " char_pos = start_char\n", | |
| "\n", | |
| " for m in morphemes:\n", | |
| " surface = m.surface()\n", | |
| " token.text = surface\n", | |
| "\n", | |
| " # Set position information, important for search accuracy\n", | |
| " if positions:\n", | |
| " token.pos = pos\n", | |
| " pos += 1\n", | |
| " if chars:\n", | |
| " token.startchar = char_pos\n", | |
| " token.endchar = char_pos + len(surface)\n", | |
| " char_pos = token.endchar\n", | |
| "\n", | |
| " yield token\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 5. Whooshスキーマの定義\n", | |
| "# Define a schema including hybrid fields for B and C modes.\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "def sudachi_tokenizer_b_factory():\n", | |
| " return SudachiTokenizer(mode=tokenizer.Tokenizer.SplitMode.B) # Use imported SplitMode\n", | |
| "\n", | |
| "def sudachi_tokenizer_c_factory():\n", | |
| " return SudachiTokenizer(mode=tokenizer.Tokenizer.SplitMode.C) # Use imported SplitMode\n", | |
| "\n", | |
| "def create_schema():\n", | |
| " \"\"\"Define the Whoosh index schema.\"\"\"\n", | |
| " return Schema(\n", | |
| " doc_id=ID(stored=True, unique=True),\n", | |
| " # FTS field: B mode (balanced for recall)\n", | |
| " content_fts_b=TEXT(analyzer=sudachi_tokenizer_b_factory(), stored=False),\n", | |
| " # FTS field: C mode (prioritizing proper nouns and precision)\n", | |
| " content_fts_c=TEXT(analyzer=sudachi_tokenizer_c_factory(), stored=False),\n", | |
| " # Metadata fields: for strict filtering\n", | |
| " metadata_author_name=KEYWORD(stored=True, sortable=True),\n", | |
| " tags_list=KEYWORD(stored=False)\n", | |
| " )\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 6. Indexing process\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "def index_documents(index_dir: str, json_data_list: List[Dict[str, Any]]):\n", | |
| " \"\"\"Add a list of JSON data to the Whoosh index.\"\"\"\n", | |
| " print(f\"--- Preparing index directory: {index_dir} ---\")\n", | |
| " # Clean up the directory for testing in Colab environment\n", | |
| " if os.path.exists(index_dir):\n", | |
| " shutil.rmtree(index_dir)\n", | |
| " os.makedirs(index_dir, exist_ok=True)\n", | |
| "\n", | |
| " schema = create_schema()\n", | |
| " ix = index.create_in(index_dir, schema)\n", | |
| " writer = ix.writer()\n", | |
| "\n", | |
| " print(f\"--- Indexing {len(json_data_list)} documents ---\")\n", | |
| " for i, doc in enumerate(json_data_list):\n", | |
| " flattened = flatten_json(doc)\n", | |
| "\n", | |
| " # Determine doc_id (prioritize 'id' field, otherwise use index number)\n", | |
| " doc_id = flattened.get('id', str(i + 1))\n", | |
| "\n", | |
| " # Field values to write to Whoosh\n", | |
| " field_values = {\n", | |
| " 'doc_id': doc_id,\n", | |
| " # Assuming 'content' is the text field for searching\n", | |
| " 'content_fts_b': flattened.get('content', ''),\n", | |
| " 'content_fts_c': flattened.get('content', ''),\n", | |
| " # Metadata field mapping (using flattened keys)\n", | |
| " 'metadata_author_name': flattened.get('metadata_author_name', ''),\n", | |
| " 'tags_list': flattened.get('tags_list', '')\n", | |
| " }\n", | |
| " writer.add_document(**field_values)\n", | |
| "\n", | |
| " writer.commit()\n", | |
| " print(\"--- Indexing complete ---\")\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 7. Search execution process\n", | |
| "# Apply boost to C mode and combine with filtering using AND query.\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "def search_index(index_dir: str, query_str: str, author_filter: str = None, limit: int = 5) -> List[Tuple[str, float, str]]:\n", | |
| " \"\"\"Search the Whoosh index and return results.\"\"\"\n", | |
| " ix = index.open_dir(index_dir)\n", | |
| "\n", | |
| " # Apply boost to C mode (2.0) to prioritize proper noun matches\n", | |
| " field_boosts = {'content_fts_b': 1.0, 'content_fts_c': 2.0}\n", | |
| "\n", | |
| " with ix.searcher(weighting=BM25F(field_boosts=field_boosts)) as searcher:\n", | |
| " # Set target fields for searching to both B and C modes\n", | |
| " parser = MultifieldParser(['content_fts_b', 'content_fts_c'], ix.schema)\n", | |
| "\n", | |
| " try:\n", | |
| " query = parser.parse(query_str)\n", | |
| " except Exception as e:\n", | |
| " print(f\"Query parsing error: {e}\")\n", | |
| " return []\n", | |
| "\n", | |
| " # Apply structured filtering (author filter)\n", | |
| " if author_filter:\n", | |
| " author_query = Term('metadata_author_name', author_filter)\n", | |
| " # Combine full-text search query and filter with AND\n", | |
| " query = And([query, author_query])\n", | |
| " print(f\" [Compound query executed] FTS: '{query_str}' AND Author: '{author_filter}'\")\n", | |
| " else:\n", | |
| " print(f\" FTS: '{query_str}'\")\n", | |
| "\n", | |
| " # Execute search\n", | |
| " results = searcher.search(query, limit=limit)\n", | |
| "\n", | |
| " # Format results\n", | |
| " output = []\n", | |
| " for hit in results:\n", | |
| " doc_id = hit['doc_id']\n", | |
| " score = hit.score\n", | |
| " # Use stored=True field to get author name from original data\n", | |
| " author = hit.get('metadata_author_name', 'N/A')\n", | |
| " output.append((doc_id, score, author))\n", | |
| "\n", | |
| " return output\n", | |
| "\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "# 8. Main execution block (for Colab testing)\n", | |
| "# ---------------------------------------------------------------------------------\n", | |
| "if __name__ == \"__main__\":\n", | |
| " # Sample JSON data (assuming small data equivalent to about 2000 lines)\n", | |
| " sample_data = [\n", | |
| " {\n", | |
| " \"id\": \"1\",\n", | |
| " \"content\": \"東京タワーは日本を代表する観光地です。正式名称は日本電波塔。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author A\"}},\n", | |
| " \"tags\": [\"tourism\", \"Japan\"]\n", | |
| " },\n", | |
| " {\n", | |
| " \"id\": \"2\",\n", | |
| " \"content\": \"消費者安全調査委員会の報告書は、複合的な問題解決を示唆する。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author B\"}},\n", | |
| " \"tags\": [\"government\", \"safety\"]\n", | |
| " },\n", | |
| " {\n", | |
| " \"id\": \"3\",\n", | |
| " \"content\": \"日本電波塔の公式見解が発表された。展望台の改修計画。\",\n", | |
| " \"metadata\": {\"author\": {\"name\": \"Author A\"}},\n", | |
| " \"tags\": [\"tourism\", \"news\"]\n", | |
| " }\n", | |
| " ]\n", | |
| "\n", | |
| " INDEX_DIR = \"whoosh_sudachi_index\"\n", | |
| "\n", | |
| " # 1. Create index and add documents\n", | |
| " index_documents(INDEX_DIR, sample_data)\n", | |
| "\n", | |
| " # 2. Execute search tests\n", | |
| " print(\"\\n==============================================\")\n", | |
| " print(\" Search Test Start\")\n", | |
| " print(\"==============================================\")\n", | |
| "\n", | |
| " # --- Test 1: Exact match for proper noun (C mode) ---\n", | |
| " query_a = \"日本電波塔\"\n", | |
| " results_a = search_index(INDEX_DIR, query_a)\n", | |
| " print(f\"\\n Search: '{query_a}'\")\n", | |
| " for doc_id, score, author in results_a:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, Author: {author}\")\n", | |
| "\n", | |
| " # --- Test 2: Partial match for compound word (B mode/Recall check) ---\n", | |
| " # In Sudachi's A/B mode, \"消費者\" is split, so partial words also hit\n", | |
| " query_b = \"消費者\"\n", | |
| " results_b = search_index(INDEX_DIR, query_b)\n", | |
| " print(f\"\\n Search: '{query_b}'\")\n", | |
| " for doc_id, score, author in results_b:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, Author: {author}\")\n", | |
| "\n", | |
| " # --- Test 3: Compound query with FTS and structured filtering ---\n", | |
| " query_c = \"報告書\"\n", | |
| " filter_author = \"Author B\"\n", | |
| " results_c = search_index(INDEX_DIR, query_c, author_filter=filter_author)\n", | |
| " print(f\"\\n Search: '{query_c}' AND Author: '{filter_author}'\")\n", | |
| " for doc_id, score, author in results_c:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, Author: {author}\")\n", | |
| "\n", | |
| " # --- Test 4: Complex AND query ---\n", | |
| " query_d = \"電波塔 AND 東京タワー\"\n", | |
| " results_d = search_index(INDEX_DIR, query_d)\n", | |
| " print(f\"\\n Search: '{query_d}'\")\n", | |
| " for doc_id, score, author in results_d:\n", | |
| " print(f\" - DOC ID: {doc_id}, Score: {score:.4f}, Author: {author}\")\n", | |
| "\n", | |
| "\n", | |
| " # 3. Cleanup\n", | |
| " if os.path.exists(INDEX_DIR):\n", | |
| " shutil.rmtree(INDEX_DIR)\n", | |
| " print(f\"\\n--- Deleted index directory '{INDEX_DIR}' ---\")" | |
| ] | |
| } | |
| ] | |
| } |