jamessdixon · January 13, 2025 01:31
diff --git a/create_dummy_documents.ipynb b/create_dummy_documents.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (not !pip) installs into the environment of the running kernel.\n",
    "%pip install transformers diffusers torch accelerate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import io\n",
    "import os\n",
    "import requests\n",
    "from diffusers import StableDiffusionPipeline\n",
    "import torch\n",
    "from PIL import Image\n",
    "from io import BytesIO"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_prompts():\n",
    "    \"\"\"Build one text prompt per (document type, character) pair.\n",
    "\n",
    "    Returns:\n",
    "        list[str]: Prompts ordered by document type first, then by character.\n",
    "    \"\"\"\n",
    "    kinds_of_document = (\n",
    "        'hawaii drivers license with character name, picture, and address',\n",
    "        'english passport with character name, picture, and address',\n",
    "        'cambridge university student id with character name, picture, and address',\n",
    "    )\n",
    "    cast = ('King Arthur', 'Sir Lancelot', 'Sir Galahad', 'Sir Robin')\n",
    "    return [\n",
    "        f\"Create a {document_type} for {character}.\"\n",
    "        for document_type in kinds_of_document\n",
    "        for character in cast\n",
    "    ]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_pipe(model_id=\"stabilityai/stable-diffusion-2\", device=None):\n",
    "    \"\"\"Load a Stable Diffusion pipeline and place it on a compute device.\n",
    "\n",
    "    Args:\n",
    "        model_id: Model identifier passed to ``from_pretrained``.\n",
    "        device: Target device such as \"mps\", \"cuda\" or \"cpu\". When None, the\n",
    "            first available of MPS, CUDA, then CPU is chosen.\n",
    "\n",
    "    Returns:\n",
    "        The loaded ``StableDiffusionPipeline`` after ``.to(device)``.\n",
    "    \"\"\"\n",
    "    if device is None:\n",
    "        if torch.backends.mps.is_available():\n",
    "            device = \"mps\"\n",
    "        elif torch.cuda.is_available():\n",
    "            device = \"cuda\"\n",
    "        else:\n",
    "            device = \"cpu\"\n",
    "    # Half precision is kept on accelerators; the CPU fallback loads float32 instead.\n",
    "    on_cpu = torch.device(device).type == \"cpu\"\n",
    "    dtype = torch.float32 if on_cpu else torch.float16\n",
    "    pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=dtype)\n",
    "    pipe.to(device)\n",
    "    return pipe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_image(pipe, document_number, prompt):\n",
    "    \"\"\"Generate one image for ``prompt`` and save it as ``./data/<document_number>.png``.\n",
    "\n",
    "    Args:\n",
    "        pipe: Callable pipeline; ``pipe(prompt).images[0]`` must offer ``save(fp, format=...)``.\n",
    "        document_number: Converted with ``str`` and used as the output file stem.\n",
    "        prompt: Text prompt handed to ``pipe``.\n",
    "    \"\"\"\n",
    "    image_dir = os.path.join(os.curdir, 'data')\n",
    "    # Create ./data on demand so a fresh checkout does not fail in open().\n",
    "    os.makedirs(image_dir, exist_ok=True)\n",
    "    image_path = os.path.join(image_dir, str(document_number) + '.png')\n",
    "\n",
    "    generated_image = pipe(prompt).images[0]\n",
    "\n",
    "    # Encode the PNG straight into the destination file.\n",
    "    with open(image_path, \"wb\") as image_file:\n",
    "        generated_image.save(image_file, format='PNG')\n",
    "\n",
    "    print(f\"{document_number} Done.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: \n",
      "```\n",
      "pip install accelerate\n",
      "```\n",
      ".\n",
      "Loading pipeline components...: 100%|██████████| 6/6 [00:02<00:00,  2.14it/s]\n",
      "100%|██████████| 50/50 [00:53<00:00,  1.06s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 Done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 50/50 [00:52<00:00,  1.06s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 Done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 50/50 [00:54<00:00,  1.10s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2 Done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 50/50 [01:01<00:00,  1.23s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3 Done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 50/50 [01:54<00:00,  2.29s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4 Done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 50/50 [00:55<00:00,  1.11s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5 Done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 50/50 [00:58<00:00,  1.17s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6 Done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 50/50 [01:09<00:00,  1.40s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7 Done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 50/50 [01:08<00:00,  1.36s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8 Done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 50/50 [01:04<00:00,  1.29s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "9 Done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 50/50 [01:01<00:00,  1.23s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10 Done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 50/50 [01:01<00:00,  1.22s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "11 Done.\n"
     ]
    }
   ],
   "source": [
    "pipe = create_pipe()\n",
    "prompts = create_prompts()\n",
    "# enumerate supplies the 0-based document number used as the output file name.\n",
    "for document_number, prompt in enumerate(prompts):\n",
    "    create_image(pipe, document_number, prompt)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# %pip (not !pip) installs into the environment of the running kernel.\n",
	"%pip install transformers diffusers torch accelerate"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"import io\n",
	"import os\n",
	"import requests\n",
	"from diffusers import StableDiffusionPipeline\n",
	"import torch\n",
	"from PIL import Image\n",
	"from io import BytesIO"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [],
	"source": [
	"def create_prompts():\n",
	"    \"\"\"Build one text prompt per (document type, character) pair.\n",
	"\n",
	"    Returns:\n",
	"        list[str]: Prompts ordered by document type first, then by character.\n",
	"    \"\"\"\n",
	"    document_types = [\n",
	"        'hawaii drivers license with character name, picture, and address',\n",
	"        'english passport with character name, picture, and address',\n",
	"        'cambridge university student id with character name, picture, and address',\n",
	"    ]\n",
	"    characters = ['King Arthur', 'Sir Lancelot', 'Sir Galahad', 'Sir Robin']\n",
	"    return [\n",
	"        f\"Create a {document_type} for {character}.\"\n",
	"        for document_type in document_types\n",
	"        for character in characters\n",
	"    ]\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"def create_pipe(model_id=\"stabilityai/stable-diffusion-2\", device=None):\n",
	"    \"\"\"Load a Stable Diffusion pipeline and place it on a compute device.\n",
	"\n",
	"    Args:\n",
	"        model_id: Model identifier passed to ``from_pretrained``.\n",
	"        device: Target device such as \"mps\", \"cuda\" or \"cpu\". When None, the\n",
	"            first available of MPS, CUDA, then CPU is chosen.\n",
	"\n",
	"    Returns:\n",
	"        The loaded ``StableDiffusionPipeline`` after ``.to(device)``.\n",
	"    \"\"\"\n",
	"    if device is None:\n",
	"        if torch.backends.mps.is_available():\n",
	"            device = \"mps\"\n",
	"        elif torch.cuda.is_available():\n",
	"            device = \"cuda\"\n",
	"        else:\n",
	"            device = \"cpu\"\n",
	"    # Half precision is kept on accelerators; the CPU fallback loads float32 instead.\n",
	"    on_cpu = torch.device(device).type == \"cpu\"\n",
	"    dtype = torch.float32 if on_cpu else torch.float16\n",
	"    pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=dtype)\n",
	"    pipe.to(device)\n",
	"    return pipe"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [],
	"source": [
	"def create_image(pipe, document_number, prompt):\n",
	"    \"\"\"Generate one image for ``prompt`` and save it as ``./data/<document_number>.png``.\n",
	"\n",
	"    Args:\n",
	"        pipe: Callable pipeline; ``pipe(prompt).images[0]`` must offer ``save(fp, format=...)``.\n",
	"        document_number: Converted with ``str`` and used as the output file stem.\n",
	"        prompt: Text prompt handed to ``pipe``.\n",
	"    \"\"\"\n",
	"    image_dir = os.path.join(os.curdir, 'data')\n",
	"    # Create ./data on demand so a fresh checkout does not fail in open().\n",
	"    os.makedirs(image_dir, exist_ok=True)\n",
	"    image_path = os.path.join(image_dir, str(document_number) + '.png')\n",
	"\n",
	"    generated_image = pipe(prompt).images[0]\n",
	"\n",
	"    # Encode the PNG straight into the destination file.\n",
	"    with open(image_path, \"wb\") as image_file:\n",
	"        generated_image.save(image_file, format='PNG')\n",
	"\n",
	"    print(f\"{document_number} Done.\")\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: \n",
	"```\n",
	"pip install accelerate\n",
	"```\n",
	".\n",
	"Loading pipeline components...: 100%|██████████| 6/6 [00:02<00:00,  2.14it/s]\n",
	"100%|██████████| 50/50 [00:53<00:00,  1.06s/it]\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0 Done.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"100%|██████████| 50/50 [00:52<00:00,  1.06s/it]\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1 Done.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"100%|██████████| 50/50 [00:54<00:00,  1.10s/it]\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"2 Done.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"100%|██████████| 50/50 [01:01<00:00,  1.23s/it]\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"3 Done.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"100%|██████████| 50/50 [01:54<00:00,  2.29s/it]\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"4 Done.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"100%|██████████| 50/50 [00:55<00:00,  1.11s/it]\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"5 Done.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"100%|██████████| 50/50 [00:58<00:00,  1.17s/it]\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"6 Done.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"100%|██████████| 50/50 [01:09<00:00,  1.40s/it]\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"7 Done.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"100%|██████████| 50/50 [01:08<00:00,  1.36s/it]\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"8 Done.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"100%|██████████| 50/50 [01:04<00:00,  1.29s/it]\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"9 Done.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"100%|██████████| 50/50 [01:01<00:00,  1.23s/it]\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"10 Done.\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"100%|██████████| 50/50 [01:01<00:00,  1.22s/it]\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"11 Done.\n"
	]
	}
	],
	"source": [
	"pipe = create_pipe()\n",
	"prompts = create_prompts()\n",
	"# enumerate supplies the 0-based document number used as the output file name.\n",
	"for document_number, prompt in enumerate(prompts):\n",
	"    create_image(pipe, document_number, prompt)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": ".venv",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.12.7"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}