{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu/.local/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import torch\n",
    "import faiss\n",
    "\n",
    "from PIL import Image\n",
    "from transformers import AutoModel, AutoConfig, AutoTokenizer, CLIPImageProcessor, CLIPModel\n",
    "from tqdm import tqdm\n",
    "from llm2vec import LLM2Vec"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Flickr30k dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu/.local/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "dataset = load_dataset(\"nlphuji/flickr30k\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = dataset['test']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LLM2CLIP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards: 25%|██▌ | 1/4 [00:18<00:54, 18.19s/it]"
     ]
    }
   ],
   "source": [
    "# Prepare CLIP part in LLM2CLIP\n",
    "processor = CLIPImageProcessor.from_pretrained(\"openai/clip-vit-large-patch14-336\")\n",
    "model_name_or_path = \"microsoft/LLM2CLIP-Openai-L-14-336\" # or /path/to/local/LLM2CLIP-Openai-L-14-336\n",
    "model = AutoModel.from_pretrained(\n",
    "    model_name_or_path,\n",
    "    torch_dtype=torch.bfloat16,\n",
    "    trust_remote_code=True).to('cuda').eval()\n",
    "\n",
    "# Prepare LLM part in LLM2CLIP\n",
    "llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'\n",
    "config = AutoConfig.from_pretrained(\n",
    "    llm_model_name, trust_remote_code=True\n",
    ")\n",
    "llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)\n",
    "tokenizer = AutoTokenizer.from_pretrained(llm_model_name)\n",
    "llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC\n",
    "l2v = LLM2Vec(llm_model, tokenizer, pooling_mode=\"mean\", max_length=512, doc_max_length=512)"
   ]
  },
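  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional sanity check before running the full retrieval below: a minimal sketch that reuses the objects loaded above (`processor`, `model`, `l2v`) to embed one image and its first caption and print their cosine similarity. The sample index `0` is arbitrary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check (a sketch; assumes the LLM2CLIP objects above are loaded).\n",
    "# Embed one image and one caption, then compare them with cosine similarity.\n",
    "sample = dataset[0]  # arbitrary sample\n",
    "image = sample['image']\n",
    "caption = sample['caption'][0]\n",
    "\n",
    "input_pixels = processor(images=image, return_tensors=\"pt\").pixel_values.to('cuda')\n",
    "text_features = l2v.encode([caption], convert_to_tensor=True).to('cuda')\n",
    "\n",
    "with torch.no_grad(), torch.cuda.amp.autocast():\n",
    "    image_embed = model.get_image_features(input_pixels)\n",
    "    text_embed = model.get_text_features(text_features)\n",
    "\n",
    "# normalize so the dot product is a cosine similarity\n",
    "image_embed = image_embed / image_embed.norm(dim=-1, keepdim=True)\n",
    "text_embed = text_embed / text_embed.norm(dim=-1, keepdim=True)\n",
    "print('caption: ', caption)\n",
    "print('cosine similarity: ', (image_embed @ text_embed.T).item())"
   ]
  },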
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### LLM2CLIP text-to-image and image-to-image similarity search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_dir = './results'\n",
    "model_name = 'llm2clip'\n",
    "batch_size = 32\n",
    "\n",
    "os.makedirs(result_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create embeddings\n",
    "\n",
    "def create_embeddings(model, processor, dataset, save_path: str, l2v=None, mode: str = 'image', batch_size: int = 128):\n",
    "    # result embedding list\n",
    "    embeddings = []\n",
    "\n",
    "    for idx in tqdm(range(0, len(dataset), batch_size)):\n",
    "\n",
    "        # load images to cuda\n",
    "        if mode == 'image':\n",
    "            images = dataset[idx: idx + batch_size]['image']\n",
    "            input_pixels = processor(images=images, return_tensors=\"pt\").pixel_values.to('cuda')\n",
    "\n",
    "            with torch.no_grad(), torch.cuda.amp.autocast():\n",
    "                out = model.get_image_features(input_pixels)\n",
    "        else:\n",
    "            # load caption to cuda\n",
    "            captions = dataset[idx: idx + batch_size]['caption']\n",
    "            captions = [cap[0] for cap in captions]\n",
    "\n",
    "            text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')\n",
    "\n",
    "            with torch.no_grad(), torch.cuda.amp.autocast():\n",
    "                out = model.get_text_features(text_features)\n",
    "\n",
    "        embeddings.append(out.detach().cpu())\n",
    "\n",
    "        del out\n",
    "        torch.cuda.empty_cache()\n",
    "\n",
    "    embeddings = torch.cat(embeddings, dim=0).detach().cpu().float().numpy()\n",
    "    print('embedding shape: ', embeddings.shape)\n",
    "    np.save(save_path, embeddings)\n",
    "\n",
    "    return embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# faiss similarity search\n",
    "\n",
    "def similarity_search(index, embeddings, dataset, result_csv_path, mode='img2img', top_k=5):\n",
    "    # the search itself is identical for 'img2img' and 'txt2img';\n",
    "    # mode only documents which embeddings are passed in\n",
    "    result_dict = {\n",
    "        f'top{str(i)}_similar': [] for i in range(top_k)\n",
    "    }\n",
    "\n",
    "    captions = []\n",
    "    for data in dataset:\n",
    "        captions.append(data['caption'])\n",
    "\n",
    "    result_dict['caption'] = captions\n",
    "\n",
    "    # read the filename column once instead of inside the loop\n",
    "    filenames = dataset['filename']\n",
    "\n",
    "    for embed_idx, embed in tqdm(enumerate(embeddings), total=embeddings.shape[0]):\n",
    "        embed = embed.reshape(1, -1)\n",
    "        faiss.normalize_L2(embed)\n",
    "        distances, ann = index.search(embed, k=top_k)\n",
    "\n",
    "        for k in range(top_k):\n",
    "            idx = ann[0][k]\n",
    "            result_dict[f'top{str(k)}_similar'].append(filenames[idx])\n",
    "\n",
    "    df = pd.DataFrame.from_dict(result_dict)\n",
    "    df.to_csv(result_csv_path, index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "embedding shape: (10000, 1280)\n"
     ]
    }
   ],
   "source": [
    "if os.path.exists(os.path.join(result_dir, f'{model_name}.npy')):\n",
    "    embeddings = np.load(os.path.join(result_dir, f'{model_name}.npy'))\n",
    "    print('embedding shape: ', embeddings.shape)\n",
    "else:\n",
    "    # result embedding list\n",
    "    save_path = os.path.join(result_dir, f'{model_name}.npy')\n",
    "    embeddings = create_embeddings(model=model, processor=processor, dataset=dataset, save_path=save_path, batch_size=batch_size)\n",
    "\n",
    "if os.path.exists(os.path.join(result_dir, f'{model_name}_caption.npy')):\n",
    "    cap_embeddings = np.load(os.path.join(result_dir, f'{model_name}_caption.npy'))\n",
    "    print(cap_embeddings.shape)\n",
    "else:\n",
    "    # caption embeddings\n",
    "    save_path = os.path.join(result_dir, f'{model_name}_caption.npy')\n",
    "    cap_embeddings = create_embeddings(model=model, processor=processor, dataset=dataset, mode='text', l2v=l2v, save_path=save_path, batch_size=batch_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# faiss\n",
    "features = embeddings.astype(np.float32)\n",
    "vec_dim = features.shape[1]\n",
    "top_k = 5\n",
    "\n",
    "# register embeddings to faiss vector store\n",
    "index = faiss.IndexFlatIP(vec_dim)\n",
    "faiss.normalize_L2(features)\n",
    "index.add(features)"
   ]
  },
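  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional: the next cell sketches how to query the image index with a free-form text prompt instead of the pre-computed caption embeddings. It reuses `l2v`, `model`, `index`, and `top_k` from the cells above; the query string itself is only an illustration."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional ad-hoc text query against the image index (illustrative query string)\n",
    "query = 'a dog running on the beach'\n",
    "\n",
    "query_features = l2v.encode([query], convert_to_tensor=True).to('cuda')\n",
    "with torch.no_grad(), torch.cuda.amp.autocast():\n",
    "    query_embed = model.get_text_features(query_features)\n",
    "\n",
    "# faiss expects normalized float32 vectors of shape (n_queries, vec_dim)\n",
    "query_embed = query_embed.detach().cpu().float().numpy().astype(np.float32)\n",
    "faiss.normalize_L2(query_embed)\n",
    "distances, ann = index.search(query_embed, k=top_k)\n",
    "\n",
    "for rank, idx in enumerate(ann[0]):\n",
    "    print(rank, dataset['filename'][idx], distances[0][rank])"
   ]
  },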
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_csv_path = os.path.join(result_dir, f'{model_name}_img2img.csv')\n",
    "similarity_search(index=index, embeddings=embeddings, dataset=dataset, result_csv_path=result_csv_path, top_k=top_k)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_csv_path = os.path.join(result_dir, f'{model_name}_txt2img.csv')\n",
    "similarity_search(index=index, embeddings=cap_embeddings, dataset=dataset, result_csv_path=result_csv_path, top_k=top_k, mode='txt2img')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CLIP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare the original CLIP model as a baseline\n",
    "model_name_or_path = \"openai/clip-vit-large-patch14\"\n",
    "# use the image processor and tokenizer that match this checkpoint (224px, not the 336px variant)\n",
    "processor = CLIPImageProcessor.from_pretrained(model_name_or_path)\n",
    "clip_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n",
    "model = CLIPModel.from_pretrained(\n",
    "    model_name_or_path,\n",
    "    attn_implementation=\"flash_attention_2\",\n",
    "    torch_dtype=torch.float16,\n",
    "    trust_remote_code=True).to('cuda').eval()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### CLIP text-to-image and image-to-image similarity search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_dir = './results'\n",
    "model_name = 'clip'\n",
    "batch_size = 128\n",
    "\n",
    "os.makedirs(result_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_embeddings(model, processor, dataset, save_path: str, mode: str = 'image', batch_size: int = 128):\n",
    "    # result embedding list\n",
    "    embeddings = []\n",
    "\n",
    "    for idx in tqdm(range(0, len(dataset), batch_size)):\n",
    "\n",
    "        # load images to cuda\n",
    "        if mode == 'image':\n",
    "            images = dataset[idx: idx + batch_size]['image']\n",
    "            input_pixels = processor(images=images, return_tensors=\"pt\").pixel_values.to('cuda')\n",
    "\n",
    "            # autocast so the fp32 pixel values work with the fp16 model weights\n",
    "            with torch.no_grad(), torch.cuda.amp.autocast():\n",
    "                out = model.get_image_features(input_pixels)\n",
    "        else:\n",
    "            # load caption to cuda (truncate to CLIP's 77-token context)\n",
    "            captions = dataset[idx: idx + batch_size]['caption']\n",
    "            captions = [cap[0] for cap in captions]\n",
    "            text_features = processor(text=captions, padding=True, truncation=True, return_tensors=\"pt\").to('cuda')\n",
    "\n",
    "            with torch.no_grad(), torch.cuda.amp.autocast():\n",
    "                out = model.get_text_features(**text_features)\n",
    "\n",
    "        embeddings.append(out.detach().cpu())\n",
    "\n",
    "        del out\n",
    "        torch.cuda.empty_cache()\n",
    "\n",
    "    embeddings = torch.cat(embeddings, dim=0).detach().cpu().float().numpy()\n",
    "    print('embedding shape: ', embeddings.shape)\n",
    "    np.save(save_path, embeddings)\n",
    "\n",
    "    return embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if os.path.exists(os.path.join(result_dir, f'{model_name}.npy')):\n",
    "    embeddings = np.load(os.path.join(result_dir, f'{model_name}.npy'))\n",
    "    print('embedding shape: ', embeddings.shape)\n",
    "else:\n",
    "    # result embedding list\n",
    "    save_path = os.path.join(result_dir, f'{model_name}.npy')\n",
    "    embeddings = create_embeddings(model=model, processor=processor, dataset=dataset, save_path=save_path, batch_size=batch_size)\n",
    "\n",
    "if os.path.exists(os.path.join(result_dir, f'{model_name}_caption.npy')):\n",
    "    cap_embeddings = np.load(os.path.join(result_dir, f'{model_name}_caption.npy'))\n",
    "    print(cap_embeddings.shape)\n",
    "else:\n",
    "    # caption embeddings (use the CLIP tokenizer, not the LLM tokenizer from the LLM2CLIP section)\n",
    "    save_path = os.path.join(result_dir, f'{model_name}_caption.npy')\n",
    "    cap_embeddings = create_embeddings(model=model, processor=clip_tokenizer, dataset=dataset, mode='text', save_path=save_path, batch_size=batch_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# faiss\n",
    "features = embeddings.astype(np.float32)\n",
    "vec_dim = features.shape[1]\n",
    "top_k = 4\n",
    "\n",
    "# register embeddings to faiss vector store\n",
    "index = faiss.IndexFlatIP(vec_dim)\n",
    "faiss.normalize_L2(features)\n",
    "index.add(features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_csv_path = os.path.join(result_dir, f'{model_name}_img2img.csv')\n",
    "similarity_search(index=index, embeddings=embeddings, dataset=dataset, result_csv_path=result_csv_path, top_k=top_k)\n",
    "\n",
    "result_csv_path = os.path.join(result_dir, f'{model_name}_txt2img.csv')\n",
    "similarity_search(index=index, embeddings=cap_embeddings, dataset=dataset, result_csv_path=result_csv_path, top_k=top_k, mode='txt2img')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm2clip",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
} |