Skip to content

Instantly share code, notes, and snippets.

@chottokun
Last active April 14, 2025 14:55
Show Gist options
  • Save chottokun/1530a48adc15bdb5190b08569bfd4bbd to your computer and use it in GitHub Desktop.
Save chottokun/1530a48adc15bdb5190b08569bfd4bbd to your computer and use it in GitHub Desktop.
ruti-v3.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"authorship_tag": "ABX9TyPjX1MQYl5a3LmhN4cgAP2c",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/chottokun/1530a48adc15bdb5190b08569bfd4bbd/ruti-v3.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rhmttsjLpuNq"
},
"outputs": [],
"source": [
"# Install into the running kernel's environment (%pip rather than !pip).\n",
"# sentence-transformers is imported later in this notebook, so it is installed\n",
"# explicitly here instead of relying on the runtime having it preinstalled.\n",
"%pip install -U \"transformers>=4.48.0\" sentence-transformers\n",
"# Optional, left disabled:\n",
"# %pip install flash-attn --no-build-isolation"
]
},
{
"cell_type": "markdown",
"source": [
"# テストコードを実行してみる"
],
"metadata": {
"id": "ZF1kdF9wvaFS"
}
},
{
"cell_type": "code",
"source": [
"import torch.nn.functional as F\n",
"from sentence_transformers import SentenceTransformer\n",
"\n",
"# Load the embedding model from the Hugging Face Hub.\n",
"model = SentenceTransformer(\"cl-nagoya/ruri-v3-310m\")\n",
"\n",
"# Ruri v3 tells input types apart with a \"1+3\" prefix scheme:\n",
"#   \"\" (no prefix)   -> encode semantic meaning\n",
"#   \"トピック: \"     -> classification, clustering, topical information\n",
"#   \"検索クエリ: \"   -> queries in retrieval tasks\n",
"#   \"検索文書: \"     -> documents to be retrieved\n",
"sentences = [\n",
"    \"川べりでサーフボードを持った人たちがいます\",\n",
"    \"サーファーたちが川べりに立っています\",\n",
"    \"トピック: 瑠璃色のサーファー\",\n",
"    \"検索クエリ: 瑠璃色はどんな色?\",\n",
"    \"検索文書: 瑠璃色(るりいろ)は、紫みを帯びた濃い青。名は、半貴石の瑠璃(ラピスラズリ、英: lapis lazuli)による。JIS慣用色名では「こい紫みの青」(略号 dp-pB)と定義している[1][2]。\",\n",
"]\n",
"\n",
"# Encode every sentence into one tensor of embeddings.\n",
"embeddings = model.encode(sentences, convert_to_tensor=True)\n",
"print(embeddings.shape)\n",
"# [5, 768]\n",
"\n",
"# Pairwise cosine similarity: broadcast a (1, N, D) view against an (N, 1, D)\n",
"# view and reduce over the last (embedding) axis.\n",
"similarities = F.cosine_similarity(embeddings[None, :, :], embeddings[:, None, :], dim=-1)\n",
"print(similarities)\n",
"# [[1.0000, 0.9603, 0.8157, 0.7074, 0.6916],\n",
"# [0.9603, 1.0000, 0.8192, 0.7014, 0.6819],\n",
"# [0.8157, 0.8192, 1.0000, 0.8701, 0.8470],\n",
"# [0.7074, 0.7014, 0.8701, 1.0000, 0.9746],\n",
"# [0.6916, 0.6819, 0.8470, 0.9746, 1.0000]]\n"
],
"metadata": {
"id": "xE-8pexgp7TY"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# langchainと使ってみる"
],
"metadata": {
"id": "baB9DuA_vJzA"
}
},
{
"cell_type": "code",
"source": [
"# %pip installs into the environment of the running kernel.\n",
"%pip install -q -U langchain-huggingface"
],
"metadata": {
"id": "taggKdYdqEr-"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import torch\n",
"from langchain_huggingface import HuggingFaceEmbeddings\n",
"\n",
"model_name = \"cl-nagoya/ruri-v3-310m\"\n",
"# Use the GPU when one is available and fall back to the CPU otherwise,\n",
"# so the cell does not hard-require a CUDA runtime.\n",
"model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}\n",
"# Return the vectors as produced by the model, without normalization.\n",
"encode_kwargs = {'normalize_embeddings': False}\n",
"hf = HuggingFaceEmbeddings(\n",
"    model_name=model_name,\n",
"    model_kwargs=model_kwargs,\n",
"    encode_kwargs=encode_kwargs\n",
")"
],
"metadata": {
"id": "YbH2Su9vqOQ_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Sample sentences used for the embedding test\n",
"texts = [\n",
"    \"今日は良い天気です。\",\n",
"    \"明日は雨が降るでしょう。\",\n",
"    \"週末は旅行に行きます。\",\n",
"    \"美味しい料理を食べたいです。\",\n",
"    \"新しい本を読んでいます。\",\n",
"    \"映画を見に行きました。\",\n",
"    \"音楽を聴いています。\",\n",
"    \"ゲームで遊んでいます。\",\n",
"    \"仕事で疲れています。\",\n",
"    \"ゆっくり休みたいと思います。\",\n",
"    \"友達と会いました。\",\n",
"    \"家族と過ごしました。\",\n",
"    \"誕生日プレゼントをもらいました。\",\n",
"    \"クリスマスが楽しみです。\",\n",
"    \"お正月は実家に帰ります。\",\n",
"    \"明後日は嵐になります\",\n",
"    \"桜が咲きました。\",\n",
"    \"紅葉が綺麗です。\",\n",
"    \"雪が降りました。\",\n",
"    \"海に行きました。\",\n",
"    \"山に登りました。\"\n",
"]\n",
"\n",
"# Convert the sentences into embedding vectors\n",
"embeddings = hf.embed_documents(texts)\n",
"\n",
"# Check the result size: one vector per input sentence\n",
"assert len(embeddings) == len(texts)\n",
"print(len(embeddings)) # 21\n",
"print(len(embeddings[0])) # 768\n",
"\n",
"# Show the first 5 dimensions of the first two vectors\n",
"print(embeddings[0][:5])\n",
"print(embeddings[1][:5])"
],
"metadata": {
"id": "pw5uBu8sqfrP"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import numpy as np\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"\n",
"# Pairwise cosine similarity between all sentence embeddings\n",
"similarity_matrix = cosine_similarity(embeddings)\n",
"\n",
"# For every sentence, list all other sentences from most to least similar\n",
"for i, text in enumerate(texts):\n",
"    scores = similarity_matrix[i]\n",
"    # Indices sorted by descending similarity, with the sentence itself removed\n",
"    ranked = [j for j in np.argsort(scores)[::-1] if j != i]\n",
"\n",
"    print(f\"--- {text} ---\")\n",
"    for j in ranked:\n",
"        print(f\"- {texts[j]} (類似度: {scores[j]:.4f})\")"
],
"metadata": {
"id": "VWzvnEkEsdTc",
"collapsed": true
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Retrieverとして"
],
"metadata": {
"id": "jMGbM88sth_f"
}
},
{
"cell_type": "code",
"source": [
"# Dependencies for the retriever example, installed into the kernel's environment.\n",
"%pip install -q -U langchain-community\n",
"%pip install -q -U chromadb huggingface_hub"
],
"metadata": {
"id": "T287EDTctofn"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import torch\n",
"from langchain_community.vectorstores import Chroma\n",
"from langchain_huggingface import HuggingFaceEmbeddings\n",
"\n",
"# 1. Initialise the embedding model and the vector store.\n",
"# Bound to `embedding_model` so it does not overwrite the `embeddings` vector\n",
"# list that the similarity cell reads.\n",
"embedding_model = HuggingFaceEmbeddings(\n",
"    model_name=\"cl-nagoya/ruri-v3-310m\",\n",
"    # Use the GPU when available, otherwise fall back to the CPU.\n",
"    model_kwargs={\"device\": \"cuda\" if torch.cuda.is_available() else \"cpu\"},\n",
"    encode_kwargs={\"normalize_embeddings\": False},\n",
")\n",
"# NOTE(review): Ruri v3 defines \"検索クエリ: \" / \"検索文書: \" prefixes for retrieval;\n",
"# they are not applied here - consider adding them to the query and documents.\n",
"\n",
"# Index `texts` (the sample sentence list) in Chroma, persisted under ./db.\n",
"# Deterministic ids let a re-run of this cell overwrite the same entries\n",
"# instead of appending duplicate documents to the persisted collection.\n",
"db = Chroma.from_texts(\n",
"    texts,\n",
"    embedding_model,\n",
"    ids=[str(i) for i in range(len(texts))],\n",
"    persist_directory=\"db\",\n",
")\n",
"\n",
"# 2. Get a retriever over the vector store\n",
"retriever = db.as_retriever()\n",
"\n",
"# 3. Run a query and fetch the matching documents\n",
"query = \"今日の天気はどうですか?\"\n",
"docs = retriever.invoke(query)\n",
"\n",
"# 4. Print the retrieved documents\n",
"for doc in docs:\n",
"    print(doc.page_content)\n",
"    print(\"---\")"
],
"metadata": {
"id": "xrvTEYDztiX2"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment