Skip to content

Instantly share code, notes, and snippets.

@Map987
Created October 14, 2024 01:17
Show Gist options
  • Save Map987/fd2fc6071c5c9b6bfecda6b85c644ffc to your computer and use it in GitHub Desktop.
Save Map987/fd2fc6071c5c9b6bfecda6b85c644ffc to your computer and use it in GitHub Desktop.
-untitled57-ipynb-ipynb.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyOxta5udFmSkALj74bs/1/W",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/Map987/fd2fc6071c5c9b6bfecda6b85c644ffc/-untitled57-ipynb-ipynb.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
"#@title exf_set为空表示看exf\n",
"#@markdown 随便输入一个字符,看日语\n",
"!apt-get install exiftool\n",
"\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import subprocess\n",
"import re\n",
"import os\n",
"\n",
"# Silence urllib3 warnings caused by the verify=False requests below.\n",
"requests.packages.urllib3.disable_warnings()\n",
"url = 'https://www.google.com/search?newwindow=1&sca_esv=4cf44482ebf951fe&tbs=simg:CAQSiwIJmxK2AlaXSlsa_1wELELCMpwgaOgo4CAQSFJUb1Au7MYg_14AbCHrMU_1S_1tPekXGhqU-cm8_1euLV_1yBqvRNRs3opp8UhvQcz4DMPSAFMAQMCxCOrv4IGgoKCAgBEgS_1qqUwDAsQne3BCRqfAQonChNmaWN0aW9uYWwgY2hhcmFjdGVy2qWI9gMMCgovbS8wMmg3bGt0ChkKBmNvbWljc9qliPYDCwoJL20vMDEyaDI0ChoKB2ZpY3Rpb27apYj2AwsKCS9qLzRmazlxMwoiChBhbmltYXRlZCBjYXJ0b29u2qWI9gMKCggvbS8wOTViYgoZCgZwb3N0ZXLapYj2AwsKCS9tLzAxbjVqcQw&sxsrf=ADLYWIJOVAOyfF1cAv6lfL55hxfYOWv-MA:1718480494934&q=demon+slayer+film+2&tbm=isch&sa=X&ved=2ahUKEwjiv9Dbrt6GAxVfMjQIHRv9C0IQ2A56BAgFEAI&biw=360&bih=677&dpr=3' # @param {type:\"string\"}\n",
"original_url = url\n",
"match = re.search(r'&start=\\d+&', original_url)\n",
"#@markdown\n",
"\n",
"exf_set = '' # @param {type:\"string\"}\n",
"# Turn the search URL into a pagination template with a {page} placeholder:\n",
"# replace an existing &start=N& parameter, otherwise append start={page}.\n",
"if match:\n",
"    processed_url = re.sub(r'&start=\\d+&', r'&start={page}&', original_url)\n",
"else:\n",
"    if not original_url.endswith('&'):\n",
"        original_url += '&'\n",
"    processed_url = original_url + 'start={page}'\n",
"\n",
"url = processed_url\n",
"ua_list = [\n",
"    'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; LG; Optimus 7)'\n",
"]\n",
"headers = {'User-Agent': ua_list[0]}\n",
"start_page = 0\n",
"end_page = 200\n",
"temp_folder = '/content/temp_ima'\n",
"os.makedirs(temp_folder, exist_ok=True)\n",
"\n",
"image_info_list = []\n",
"for page in range(start_page, end_page, 20):  # Google Images paginates in steps of 20\n",
"    current_url = url.format(page=page)\n",
"    try:\n",
"        # timeout added so a stalled connection cannot hang the whole loop\n",
"        response = requests.get(current_url, headers=headers, verify=False, timeout=30)\n",
"        if response.status_code != 200:\n",
"            continue\n",
"        soup = BeautifulSoup(response.text, 'html.parser')\n",
"        for a in soup.find_all('a', href=True):\n",
"            if 'imgurl' not in a['href']:\n",
"                continue\n",
"            # Pull the real image URL out of Google's redirect link and strip\n",
"            # common thumbnail / CDN-resize suffixes.\n",
"            img_url = (a['href'].split('imgurl=')[1].split('&')[0]).strip().replace('-scaled', '').replace('-683x1024', '').replace('i0.wp.com/', '')\n",
"            # Drop a query string only when it directly follows an image extension.\n",
"            img_url = re.sub(r'(\\.tif|\\.bmp|\\.jpeg|\\.webp|\\.png|\\.jpg)\\?.+', r'\\1', img_url)\n",
"            img_response = requests.get(img_url, stream=True, verify=False, timeout=30)\n",
"            if img_response.status_code != 200:\n",
"                continue\n",
"            img_filename = os.path.join(temp_folder, img_url.split('/')[-1])\n",
"            if len(img_filename) > 255:  # keep the path within filesystem limits\n",
"                img_filename = img_filename[:255]\n",
"            with open(img_filename, 'wb') as f:\n",
"                f.write(img_response.content)\n",
"            image_info_list.append((img_filename, img_url))\n",
"            try:\n",
"                result = subprocess.check_output(['exiftool', '-a', '-G', img_filename], stderr=subprocess.STDOUT)\n",
"                found_xmp = False\n",
"                for line in result.decode('utf-8').split('\\n'):\n",
"                    # FIX: exf_set is a Colab form string field and is never None,\n",
"                    # so the original `exf_set is not None` check was always True\n",
"                    # and the Japanese-text branch could never run. Per the form\n",
"                    # labels: empty field -> show XMP metadata lines, non-empty\n",
"                    # field -> show lines containing Japanese characters.\n",
"                    if exf_set == '':\n",
"                        if re.search(r'xmp', line):\n",
"                            if not found_xmp:\n",
"                                print(f\"Extracted Image URL: {img_url}\")\n",
"                                found_xmp = True\n",
"                            print(line)\n",
"                    else:\n",
"                        # Hiragana, katakana and half-width katakana ranges.\n",
"                        if re.search(r'[\\u3040-\\u309f\\u30a0-\\u30ff\\uff66-\\uff9f]', line):\n",
"                            if not found_xmp:\n",
"                                print(f\"Extracted Image URL: {img_url}\")\n",
"                                found_xmp = True\n",
"                            print(line)\n",
"            except subprocess.CalledProcessError as e:\n",
"                print(\"Error processing file:\", e.output)\n",
"    except Exception as e:\n",
"        print(\"An error occurred, but continuing:\", e)"
],
"metadata": {
"id": "L--iqzB4DE6O",
"cellView": "form"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@markdown\n",
"# Demonstration of the query-string-stripping regex used by the scraper cell.\n",
"# (The broken scratch lines `ok =` / dangling `img_url = ...` that made this\n",
"# cell a SyntaxError have been removed.)\n",
"import re\n",
"\n",
"# Sample URLs: real CDN links plus edge cases (bare query, no extension).\n",
"img_urls = [\n",
"    \"https://superunofficial.co/cdn/shop/files/NEPTUNE-FLAMIN-MOCK_1024x1024.png?v%3D1707456590\",\n",
"    \"https://superunofficial.co/cdn/shop/files/NEPTUNE-FLAMIN-MOCK_1024x1024.jpg?v%3D1707456590\",\n",
"    \".test?170\",\n",
"    \"test?170\",\n",
"    \"png.png?170\",\n",
"    \"https://www.cosrea.com/cdn/shop/products/cosrea-tv-costumes-star-wars-the-clone-wars-ahsoka-tano-cosplay-costume-33615101558953_1024x1024.png?v%3D1629389021\"\n",
"]\n",
"\n",
"# Strip the query string only when it directly follows an image extension,\n",
"# so URLs like \"test?170\" are left untouched.\n",
"cleaned_img_urls = [re.sub(r'(\\.tif|\\.bmp|\\.jpeg|\\.webp|\\.png|\\.jpg)\\?.+', r'\\1', url) for url in img_urls]\n",
"\n",
"for cleaned_img_url in cleaned_img_urls:\n",
"    print(cleaned_img_url)"
],
"metadata": {
"id": "bnQ0-xzZKET8",
"cellView": "form"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import os\n",
"\n",
"# Requires image_info_list, populated by the download cell above.\n",
"# Collect (size_in_MB, filename, url) records for every downloaded image.\n",
"size_records = []\n",
"for path, source_url in image_info_list:\n",
"    try:\n",
"        size_mb = os.path.getsize(path) / (1024 * 1024)\n",
"        size_records.append((size_mb, path, source_url))\n",
"    except FileNotFoundError:\n",
"        print(f\"File not found: {path}\")\n",
"\n",
"# Ascending tuple sort: by size, with ties broken by filename then URL.\n",
"size_records.sort()\n",
"\n",
"# Report the ten largest downloads.\n",
"for size_mb, path, source_url in size_records[-10:]:\n",
"    print(f\"Size: {size_mb:.2f} MB, Filename: {path}, URL: {source_url}\")"
],
"metadata": {
"id": "qQznrKSa61gN"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment