{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"clip 1.0\n",
"gsheets 0.6.1\n",
"imageio 2.34.0\n",
"matplotlib 3.8.4\n",
"numpy 1.26.4\n",
"opencv-python 4.9.0.80\n",
"pandas 2.2.1\n",
"pycocoevalcap 1.2\n",
"pytorch-lightning 1.6.0\n",
"scipy 1.13.0\n",
"spacy 3.0.0\n",
"torch 1.11.0+cu113\n",
"torchinfo 1.8.0\n",
"torchvision 0.12.0+cu113\n",
"wandb 0.16.6\n"
]
}
],
"source": [
"# print installed package versions for those listed in requirements.txt\n",
"import pkg_resources\n",
"with open('requirements.txt') as f:\n",
"    reqs = [line.split('==')[0] for line in f.read().splitlines()]\n",
"installed_packages = {pkg.key: pkg.version for pkg in pkg_resources.working_set if pkg.key in reqs}\n",
"for k, v in installed_packages.items():\n",
"    print(k, v)"
]
},
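{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell above uses `pkg_resources`, which is deprecated in recent versions of setuptools. The cell below is an equivalent sketch using the standard-library `importlib.metadata` (Python 3.8+); it assumes the same `requirements.txt` format and was not run here."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Alternative sketch: report installed versions via importlib.metadata\n",
"# (standard library, Python >= 3.8) instead of the deprecated pkg_resources.\n",
"from importlib.metadata import version, PackageNotFoundError\n",
"with open('requirements.txt') as f:\n",
"    reqs = [line.split('==')[0] for line in f.read().splitlines() if line.strip()]\n",
"for name in reqs:\n",
"    try:\n",
"        print(name, version(name))\n",
"    except PackageNotFoundError:\n",
"        # listed in requirements.txt but not installed in this environment\n",
"        pass"
]
},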
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CVCL-VIT\n",
"- Clone https://github.com/wkvong/multimodal-baby and place this notebook at the repository root (a setup sketch follows below)\n",
"- Tested on commit ['1dcc72e6f37fabcbac5a04235a3489d7304e644c'](https://github.com/wkvong/multimodal-baby/tree/1dcc72e6f37fabcbac5a04235a3489d7304e644c)"
]
},
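{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal setup sketch for the steps above (assumptions: a fresh clone into the current working directory, with this notebook then placed at the repository root):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Setup sketch (assumption: run once from the directory that should contain the repo)\n",
"!git clone https://github.com/wkvong/multimodal-baby\n",
"%cd multimodal-baby\n",
"# pin to the commit this notebook was tested against\n",
"!git checkout 1dcc72e6f37fabcbac5a04235a3489d7304e644c"
]
},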
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/localstorage/miniconda3/envs/baby/lib/python3.9/site-packages/pytorch_lightning/utilities/parsing.py:244: UserWarning: Attribute 'vision_encoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['vision_encoder'])`.\n",
"  rank_zero_warn(\n",
"/home/localstorage/miniconda3/envs/baby/lib/python3.9/site-packages/pytorch_lightning/utilities/parsing.py:244: UserWarning: Attribute 'text_encoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['text_encoder'])`.\n",
"  rank_zero_warn(\n"
]
},
{
"ename": "RuntimeError",
"evalue": "The size of tensor a (27) must match the size of tensor b (25) at non-singleton dimension 0",
"output_type": "error",
"traceback": [
"---------------------------------------------------------------------------",
"RuntimeError Traceback (most recent call last)",
"Cell In[43], line 26\n     24 texts, texts_len = cvcl.tokenize(texts)\n     25 texts, texts_len = texts.to(device), texts_len.to(device)\n---> 26 texts_features = cvcl.encode_text(texts, texts_len)\n",
"File /home/ssd_satoshi/projects/cvclvit/multimodal/multimodal_lit.py:158, in MultiModalLitModel.encode_text(self, y, y_len)\n    156 def encode_text(self, y, y_len=None):\n    157     \"\"\"Encode text to obtain text features\"\"\"\n--> 158     text_features, _ = self.model.encode_text(y, y_len)\n    159     return text_features\n",
"File /home/ssd_satoshi/projects/cvclvit/multimodal/multimodal.py:740, in MultiModalModel.encode_text(self, text, text_length)\n    739 def encode_text(self, text, text_length):\n--> 740     text_features, text_outputs, attns = self.text_embed(text, text_length)\n    741     if self.normalize_features:\n    742         # normalize text features\n    743         text_features = F.normalize(text_features, p=2, dim=-1)\n",
"File /home/localstorage/miniconda3/envs/baby/lib/python3.9/site-packages/torch/nn/modules/module.py:1110, in Module._call_impl(self, *input, **kwargs)\n   1106 # If we don't have any hooks, we want to skip the rest of the logic in\n   1107 # this function, and just call forward.\n   1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks\n   1109         or _global_forward_hooks or _global_forward_pre_hooks):\n-> 1110     return forward_call(*input, **kwargs)\n   1111 # Do not call functions when jit is used\n   1112 full_backward_hooks, non_full_backward_hooks = [], []\n",
"File /home/ssd_satoshi/projects/cvclvit/multimodal/multimodal.py:563, in TextEncoder.forward(self, x, x_len, image_features, image_feature_map)\n    561 if self.pos_embed_type == \"sinusoidal\" or self.pos_embed_type == \"learned\":\n    562     pos_embed = self.pos_embed[:embedding.size(0), :, :]\n--> 563     embedding = embedding + pos_embed\n    565 raw_output = self.transformer_encoder(embedding, src_key_padding_mask=src_key_padding_mask)\n    567 # transpose back to (B, L, E)\n",
"RuntimeError: The size of tensor a (27) must match the size of tensor b (25) at non-singleton dimension 0"
]
}
],
"source": [
"import torch\n",
"from multimodal.multimodal_lit import MultiModalLitModel\n",
"from huggingface_hub import hf_hub_download\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"from torchvision import transforms\n",
"preprocess = transforms.Compose([\n",
"    transforms.Resize((224, 224)),\n",
"    transforms.ToTensor(),\n",
"    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])\n",
"\n",
"checkpoint_name = \"cvcl_s_dino_vit_embedding\"\n",
"checkpoint = hf_hub_download(repo_id=\"wkvong/\"+checkpoint_name, filename=checkpoint_name+\".ckpt\")\n",
"cvcl = MultiModalLitModel.load_from_checkpoint(checkpoint_path=checkpoint)\n",
"cvcl = cvcl.to(device)\n",
"cvcl.eval()\n",
"\n",
"# create random images to encode\n",
"images = torch.rand(4, 3, 224, 224).to(device)\n",
"image_features = cvcl.encode_image(images)\n",
"\n",
"# create texts to encode\n",
"texts = [\"ball\", \"puzzle\", \"car\"]\n",
"texts, texts_len = cvcl.tokenize(texts)\n",
"texts, texts_len = texts.to(device), texts_len.to(device)\n",
"texts_features = cvcl.encode_text(texts, texts_len)\n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"texts.shape torch.Size([3, 27])\n",
"texts_len tensor([6, 8, 5], device='cuda:0')\n",
"cvcl.text_encoder.pos_embed.shape torch.Size([25, 1, 512])\n"
]
}
],
"source": [
"print(\"texts.shape\",texts.shape)\n",
"print(\"texts_len\",texts_len)\n",
"print(\"cvcl.text_encoder.pos_embed.shape\",cvcl.text_encoder.pos_embed.shape)"
]
},
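{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the numbers printed above line up as follows (a purely numerical observation based on the printed values, not a claim about how `tokenize` is implemented):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Observation: each reported length equals the number of characters in the word plus 2,\n",
"# and the reported sequence length 27 equals the pos_embed length 25 plus 2.\n",
"print([len(w) + 2 for w in [\"ball\", \"puzzle\", \"car\"]])      # [6, 8, 5]\n",
"print(cvcl.text_encoder.pos_embed.shape[0] + 2)             # 27, matches texts.shape[1]"
]
},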
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Questions\n",
"- Why does `texts` have a sequence length of 27 when the positional embedding only has length 25?\n",
"- Why is `texts_len` `[6, 8, 5]`? Shouldn't it be `[3, 3, 3]`, since each sequence should just be `<sos> word <eos>`?\n",
"\n",
"## Suggested Fix\n",
"- Modify the [`tokenize(self, texts)`](https://github.com/wkvong/multimodal-baby/blob/1dcc72e6f37fabcbac5a04235a3489d7304e644c/multimodal/multimodal_lit.py#L161-L178) method of `MultiModalLitModel` as follows:"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"def tokenize_modified(self, texts):\n",
"    \"\"\"Tokenize texts to obtain tokens and token lengths\"\"\"\n",
"    max_seq_len = 23  # reserve 2 slots for <sos> and <eos>\n",
"\n",
"    if isinstance(texts, str):\n",
"        texts = [texts]\n",
"\n",
"    all_tokens = []\n",
"    token_lengths = []\n",
"    for text in texts:\n",
"        doc = self.nlp(text)\n",
"        tokens = [token.text for token in doc]\n",
"        # TODO: might need to truncate texts longer than max_seq_len\n",
"        tokens = [self.vocab[\"<sos>\"]] + [self.vocab.get(token, self.vocab[\"<unk>\"]) for token in tokens] + [self.vocab[\"<eos>\"]] + [self.vocab[\"<pad>\"]] * (max_seq_len - len(tokens))\n",
"        all_tokens.append(tokens)\n",
"        token_lengths.append(len(tokens))\n",
"\n",
"    tokens = torch.tensor(all_tokens, dtype=torch.long)\n",
"    token_lengths = torch.tensor(token_lengths, dtype=torch.long)\n",
"    return tokens, token_lengths"
]
},
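{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (a sketch based on the shapes printed earlier), the modified tokenizer should now pad every sequence to `max_seq_len + 2 = 25` tokens, matching `cvcl.text_encoder.pos_embed`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity-check sketch: every tokenized sequence is padded to 23 + 2 = 25 tokens,\n",
"# which matches cvcl.text_encoder.pos_embed.shape[0] == 25 printed above.\n",
"tokens, lengths = tokenize_modified(cvcl, [\"ball\", \"puzzle\", \"car\"])\n",
"print(tokens.shape)   # expected: torch.Size([3, 25])\n",
"print(lengths)        # note: these are the padded lengths (all 25), not the word lengths\n",
"assert tokens.shape[1] == cvcl.text_encoder.pos_embed.shape[0]"
]
},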
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Then the following code works!"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"checkpoint_name = \"cvcl_s_dino_vit_embedding\"\n",
"checkpoint = hf_hub_download(repo_id=\"wkvong/\"+checkpoint_name, filename=checkpoint_name+\".ckpt\")\n",
"cvcl = MultiModalLitModel.load_from_checkpoint(checkpoint_path=checkpoint)\n",
"cvcl = cvcl.to(device)\n",
"cvcl.eval()\n",
"\n",
"# create random images to encode\n",
"images = torch.rand(4, 3, 224, 224).to(device)\n",
"image_features = cvcl.encode_image(images)\n",
"\n",
"# create texts to encode\n",
"texts = [\"ball\", \"puzzle\", \"car\"]\n",
"texts, texts_len = tokenize_modified(cvcl, texts)\n",
"texts, texts_len = texts.to(device), texts_len.to(device)\n",
"texts_features = cvcl.encode_text(texts, texts_len)"
]
},
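{
"cell_type": "markdown",
"metadata": {},
"source": [
"For completeness, a sketch of how the encoded features could be compared. This mirrors the usual CLIP-style cosine-similarity comparison and is an assumption about intended usage, not something specific to this repo; it also assumes `encode_image`/`encode_text` return one feature vector per input."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch.nn.functional as F\n",
"# Sketch: CLIP-style cosine similarity between L2-normalized image and text features.\n",
"with torch.no_grad():\n",
"    img = F.normalize(image_features, dim=-1)   # (4, D), the random images from above\n",
"    txt = F.normalize(texts_features, dim=-1)   # (3, D), features for \"ball\", \"puzzle\", \"car\"\n",
"    similarity = img @ txt.t()                  # (4, 3) image-text similarity matrix\n",
"print(similarity)"
]
}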
],
"metadata": {
"kernelspec": {
"display_name": "prsclip",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 2
}