Last active
April 14, 2025 14:55
-
-
Save chottokun/1530a48adc15bdb5190b08569bfd4bbd to your computer and use it in GitHub Desktop.
ruti-v3.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"gpuType": "T4", | |
"authorship_tag": "ABX9TyPjX1MQYl5a3LmhN4cgAP2c", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
}, | |
"accelerator": "GPU" | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/chottokun/1530a48adc15bdb5190b08569bfd4bbd/ruti-v3.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "rhmttsjLpuNq" | |
}, | |
"outputs": [], | |
"source": [ | |
"# sentence-transformers is imported later in this notebook, so install it explicitly\n", | |
"# instead of relying on it being preinstalled in the runtime.\n", | |
"!pip install -U \"transformers>=4.48.0\" sentence-transformers\n", | |
"# !pip install flash-attn --no-build-isolation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# テストコードを実行してみる。" | |
], | |
"metadata": { | |
"id": "ZF1kdF9wvaFS" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import torch.nn.functional as F\n", | |
"from sentence_transformers import SentenceTransformer\n", | |
"\n", | |
"# Download from the 🤗 Hub\n", | |
"model = SentenceTransformer(\"cl-nagoya/ruri-v3-310m\")\n", | |
"\n", | |
"# Ruri v3 employs a 1+3 prefix scheme to distinguish between different types of text inputs:\n", | |
"# \"\" (empty string) is used for encoding semantic meaning.\n", | |
"# \"トピック: \" is used for classification, clustering, and encoding topical information.\n", | |
"# \"検索クエリ: \" is used for queries in retrieval tasks.\n", | |
"# \"検索文書: \" is used for documents to be retrieved.\n", | |
"sentences = [\n", | |
" \"川べりでサーフボードを持った人たちがいます\",\n", | |
" \"サーファーたちが川べりに立っています\",\n", | |
" \"トピック: 瑠璃色のサーファー\",\n", | |
" \"検索クエリ: 瑠璃色はどんな色?\",\n", | |
" \"検索文書: 瑠璃色(るりいろ)は、紫みを帯びた濃い青。名は、半貴石の瑠璃(ラピスラズリ、英: lapis lazuli)による。JIS慣用色名では「こい紫みの青」(略号 dp-pB)と定義している[1][2]。\",\n", | |
"]\n", | |
"\n", | |
"# Encode every sentence in one call and get the result back as a tensor.\n", | |
"embeddings = model.encode(sentences, convert_to_tensor=True)\n", | |
"print(embeddings.shape)\n", | |
"# [5, 768]\n", | |
"\n", | |
"# Broadcast a (1, N, D) view against an (N, 1, D) view so the result is the\n", | |
"# full N x N matrix of pairwise cosine similarities.\n", | |
"similarities = F.cosine_similarity(embeddings[None, :, :], embeddings[:, None, :], dim=-1)\n", | |
"print(similarities)\n", | |
"# [[1.0000, 0.9603, 0.8157, 0.7074, 0.6916],\n", | |
"# [0.9603, 1.0000, 0.8192, 0.7014, 0.6819],\n", | |
"# [0.8157, 0.8192, 1.0000, 0.8701, 0.8470],\n", | |
"# [0.7074, 0.7014, 0.8701, 1.0000, 0.9746],\n", | |
"# [0.6916, 0.6819, 0.8470, 0.9746, 1.0000]]\n" | |
], | |
"metadata": { | |
"id": "xE-8pexgp7TY" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# LangChainと組み合わせて使ってみる" | |
], | |
"metadata": { | |
"id": "baB9DuA_vJzA" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!pip install -q -U langchain-huggingface" | |
], | |
"metadata": { | |
"id": "taggKdYdqEr-" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from langchain_huggingface import HuggingFaceEmbeddings\n", | |
"\n", | |
"# Wrapper configuration: which model to load, the device it is loaded on,\n", | |
"# and the options forwarded when texts are encoded.\n", | |
"model_name = \"cl-nagoya/ruri-v3-310m\"\n", | |
"model_kwargs = dict(device=\"cuda\")\n", | |
"encode_kwargs = dict(normalize_embeddings=False)\n", | |
"\n", | |
"hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)" | |
], | |
"metadata": { | |
"id": "YbH2Su9vqOQ_" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# テスト用の文章 (例)\n", | |
"texts = [\n", | |
" \"今日は良い天気です。\",\n", | |
" \"明日は雨が降るでしょう。\",\n", | |
" \"週末は旅行に行きます。\",\n", | |
" \"美味しい料理を食べたいです。\",\n", | |
" \"新しい本を読んでいます。\",\n", | |
" \"映画を見に行きました。\",\n", | |
" \"音楽を聴いています。\",\n", | |
" \"ゲームで遊んでいます。\",\n", | |
" \"仕事で疲れています。\",\n", | |
" \"ゆっくり休みたいと思います。\",\n", | |
" \"友達と会いました。\",\n", | |
" \"家族と過ごしました。\",\n", | |
" \"誕生日プレゼントをもらいました。\",\n", | |
" \"クリスマスが楽しみです。\",\n", | |
" \"お正月は実家に帰ります。\",\n", | |
" \"明後日は嵐になります\",\n", | |
" \"桜が咲きました。\",\n", | |
" \"紅葉が綺麗です。\",\n", | |
" \"雪が降りました。\",\n", | |
" \"海に行きました。\",\n", | |
" \"山に登りました。\"\n", | |
"]\n", | |
"\n", | |
"# 文章を埋め込みベクトルに変換\n", | |
"embeddings = hf.embed_documents(texts)\n", | |
"\n", | |
"# 埋め込みベクトルのサイズを確認\n", | |
"print(len(embeddings)) # 21\n", | |
"print(len(embeddings[0])) # 768\n", | |
"\n", | |
"# 埋め込みベクトルを表示 (最初の5次元のみ)\n", | |
"print(embeddings[0][:5])\n", | |
"print(embeddings[1][:5])" | |
], | |
"metadata": { | |
"id": "pw5uBu8sqfrP" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import numpy as np\n", | |
"from sklearn.metrics.pairwise import cosine_similarity\n", | |
"\n", | |
"# Pairwise cosine similarity between all embedded texts.\n", | |
"similarity_matrix = cosine_similarity(embeddings)\n", | |
"\n", | |
"# For each text, list every other text from most to least similar.\n", | |
"for i, anchor in enumerate(texts):\n", | |
"    scores = similarity_matrix[i]\n", | |
"    ranking = np.argsort(scores)[::-1]  # ascending argsort, reversed -> descending\n", | |
"\n", | |
"    print(f\"--- {anchor} ---\")\n", | |
"    for j in ranking:\n", | |
"        if j == i:  # skip the text itself\n", | |
"            continue\n", | |
"        print(f\"- {texts[j]} (類似度: {scores[j]:.4f})\")" | |
], | |
"metadata": { | |
"id": "VWzvnEkEsdTc", | |
"collapsed": true | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Retrieverとして" | |
], | |
"metadata": { | |
"id": "jMGbM88sth_f" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!pip install -q -U langchain-community\n", | |
"!pip install -q -U chromadb huggingface_hub" | |
], | |
"metadata": { | |
"id": "T287EDTctofn" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from langchain_community.vectorstores import Chroma\n", | |
"from langchain_huggingface import HuggingFaceEmbeddings\n", | |
"\n", | |
"# 1. Build the embedding model and the vector store.\n", | |
"# Ruri v3 tells queries and documents apart by a text prefix, so documents are\n", | |
"# encoded with the document prompt and queries with the query prompt.\n", | |
"# A separate name is used so the `embeddings` vector list from the earlier cell\n", | |
"# is not overwritten.\n", | |
"retrieval_encode_kwargs = {\"normalize_embeddings\": False}\n", | |
"retrieval_embeddings = HuggingFaceEmbeddings(\n", | |
"    model_name=\"cl-nagoya/ruri-v3-310m\",\n", | |
"    model_kwargs={\"device\": \"cuda\"},  # requires a GPU runtime\n", | |
"    encode_kwargs={**retrieval_encode_kwargs, \"prompt\": \"検索文書: \"},\n", | |
"    query_encode_kwargs={**retrieval_encode_kwargs, \"prompt\": \"検索クエリ: \"},\n", | |
")\n", | |
"\n", | |
"# `texts` is the sample sentence list defined in an earlier cell. The index is\n", | |
"# persisted under the local \"db\" directory; fixed ids keep a re-run of this cell\n", | |
"# from adding the same sentences a second time.\n", | |
"db = Chroma.from_texts(\n", | |
"    texts,\n", | |
"    retrieval_embeddings,\n", | |
"    ids=[f\"text-{i}\" for i in range(len(texts))],\n", | |
"    persist_directory=\"db\",\n", | |
")\n", | |
"\n", | |
"# 2. Expose the vector store as a retriever.\n", | |
"retriever = db.as_retriever()\n", | |
"\n", | |
"# 3. Run the query and collect the matching documents.\n", | |
"query = \"今日の天気はどうですか?\"\n", | |
"docs = retriever.invoke(query)\n", | |
"\n", | |
"# 4. Print the retrieved documents.\n", | |
"for doc in docs:\n", | |
"    print(doc.page_content)\n", | |
"    print(\"---\")" | |
], | |
"metadata": { | |
"id": "xrvTEYDztiX2" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment