Created
October 14, 2024 01:17
-
-
Save Map987/fd2fc6071c5c9b6bfecda6b85c644ffc to your computer and use it in GitHub Desktop.
-untitled57-ipynb-ipynb.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"authorship_tag": "ABX9TyOxta5udFmSkALj74bs/1/W", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/Map987/fd2fc6071c5c9b6bfecda6b85c644ffc/-untitled57-ipynb-ipynb.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title exf_set为空表示看exf\n", | |
"#@markdown 随便输入一个字符,看日语\n", | |
"!apt-get install exiftool\n", | |
"\n", | |
"import requests\n", | |
"from bs4 import BeautifulSoup\n", | |
"import subprocess\n", | |
"import re\n", | |
"import os\n", | |
"requests.packages.urllib3.disable_warnings()\n", | |
"url = 'https://www.google.com/search?newwindow=1&sca_esv=4cf44482ebf951fe&tbs=simg:CAQSiwIJmxK2AlaXSlsa_1wELELCMpwgaOgo4CAQSFJUb1Au7MYg_14AbCHrMU_1S_1tPekXGhqU-cm8_1euLV_1yBqvRNRs3opp8UhvQcz4DMPSAFMAQMCxCOrv4IGgoKCAgBEgS_1qqUwDAsQne3BCRqfAQonChNmaWN0aW9uYWwgY2hhcmFjdGVy2qWI9gMMCgovbS8wMmg3bGt0ChkKBmNvbWljc9qliPYDCwoJL20vMDEyaDI0ChoKB2ZpY3Rpb27apYj2AwsKCS9qLzRmazlxMwoiChBhbmltYXRlZCBjYXJ0b29u2qWI9gMKCggvbS8wOTViYgoZCgZwb3N0ZXLapYj2AwsKCS9tLzAxbjVqcQw&sxsrf=ADLYWIJOVAOyfF1cAv6lfL55hxfYOWv-MA:1718480494934&q=demon+slayer+film+2&tbm=isch&sa=X&ved=2ahUKEwjiv9Dbrt6GAxVfMjQIHRv9C0IQ2A56BAgFEAI&biw=360&bih=677&dpr=3' # @param {type:\"string\"}\n", | |
"original_url = url\n", | |
"match = re.search(r'&start=\\d+&', original_url)\n", | |
"#@markdown\n", | |
"\n", | |
"exf_set = '' # @param {type:\"string\"}\n", | |
"# 如果找到了匹配项,替换它;否则,在URL末尾添加 '&start={page}'\n", | |
"if match:\n", | |
" # 使用正则表达式替换找到的部分\n", | |
" processed_url = re.sub(r'&start=\\d+&', r'&start={page}&', original_url)\n", | |
"else:\n", | |
" # 在URL末尾添加 '&start={page}'\n", | |
" if not original_url.endswith('&'):\n", | |
" original_url += '&'\n", | |
" processed_url = original_url + 'start={page}'\n", | |
"\n", | |
"url = processed_url\n", | |
"ua_list = [\n", | |
" 'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; LG; Optimus 7)'\n", | |
"]\n", | |
"headers = {'User-Agent': ua_list[0]}\n", | |
"start_page = 0\n", | |
"end_page = 200\n", | |
"temp_folder = '/content/temp_ima'\n", | |
"os.makedirs(temp_folder, exist_ok=True)\n", | |
"\n", | |
"image_info_list = []\n", | |
"for page in range(start_page, end_page, 20):\n", | |
" current_url = url.format(page=page)\n", | |
" try:\n", | |
" response = requests.get(current_url, headers=headers, verify=False)\n", | |
"\n", | |
" if response.status_code == 200:\n", | |
" soup = BeautifulSoup(response.text, 'html.parser')\n", | |
" for a in soup.find_all('a', href=True):\n", | |
" if 'imgurl' in a['href']:\n", | |
" img_url = (a['href'].split('imgurl=')[1].split('&')[0]).strip().replace('-scaled', '').replace('-683x1024', '').replace('i0.wp.com/', '')\n", | |
"\n", | |
" img_url = re.sub(r'(\\.tif|\\.bmp|\\.jpeg|\\.webp|\\.png|\\.jpg)\\?.+', r'\\1', img_url)\n", | |
"\n", | |
" img_response = requests.get(img_url, stream=True, verify=False)\n", | |
" if img_response.status_code == 200:\n", | |
" img_filename = os.path.join(temp_folder, img_url.split('/')[-1])\n", | |
" if len(img_filename) > 255:\n", | |
" img_filename = img_filename[:255]\n", | |
" with open(img_filename, 'wb') as f:\n", | |
" f.write(img_response.content)\n", | |
" image_info_list.append((img_filename, img_url))\n", | |
" try:\n", | |
" result = subprocess.check_output(['exiftool', '-a', '-G', img_filename], stderr=subprocess.STDOUT)\n", | |
" found_xmp = False\n", | |
" for line in result.decode('utf-8').split('\\n'):\n", | |
" if exf_set is not None:\n", | |
" if re.search(r'xmp', line):\n", | |
" if not found_xmp:\n", | |
" print(f\"Extracted Image URL: {img_url}\")\n", | |
" found_xmp = True\n", | |
"\n", | |
" print(line)\n", | |
" else:\n", | |
" if re.search(r'[\\u3040-\\u309f\\u30a0-\\u30ff\\uff66-\\uff9f]', line):\n", | |
" if not found_xmp:\n", | |
" print(f\"Extracted Image URL: {img_url}\")\n", | |
" found_xmp = True\n", | |
" print(line)\n", | |
" except subprocess.CalledProcessError as e:\n", | |
" print(\"Error processing file:\", e.output)\n", | |
" except Exception as e:\n", | |
" print(\"An error occurred, but continuing:\", e)" | |
], | |
"metadata": { | |
"id": "L--iqzB4DE6O", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"ok =\n", | |
"img_url = re.sub(r'\\.(' + '|'.join(['png', 'jpg', 'jpeg', 'gif', 'webp']) + ')\\?', '', img_url)\n", | |
"#@markdown\n", | |
"print(img_url)\n", | |
"\"\"\"\n", | |
"\n", | |
"\"\"\"\n", | |
"import re\n", | |
"\n", | |
"# 原始的URL列表\n", | |
"img_urls = [\n", | |
" \"https://superunofficial.co/cdn/shop/files/NEPTUNE-FLAMIN-MOCK_1024x1024.png?v%3D1707456590\",\n", | |
" \"https://superunofficial.co/cdn/shop/files/NEPTUNE-FLAMIN-MOCK_1024x1024.jpg?v%3D1707456590\",\n", | |
" \".test?170\",\n", | |
" \"test?170\",\n", | |
" \"png.png?170\",\n", | |
" \"https://www.cosrea.com/cdn/shop/products/cosrea-tv-costumes-star-wars-the-clone-wars-ahsoka-tano-cosplay-costume-33615101558953_1024x1024.png?v%3D1629389021\"\n", | |
"]\n", | |
"\n", | |
"# 清理每个URL的正则表达式,只在查询字符串前是.png或.jpg等图片扩展名时进行替换\n", | |
"cleaned_img_urls = [re.sub(r'(\\.tif|\\.bmp|\\.jpeg|\\.webp|\\.png|\\.jpg)\\?.+', r'\\1', url) for url in img_urls]\n", | |
"\n", | |
"# 打印清理后的URL\n", | |
"for cleaned_img_url in cleaned_img_urls:\n", | |
" print(cleaned_img_url)" | |
], | |
"metadata": { | |
"id": "bnQ0-xzZKET8", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import os\n", | |
"\n", | |
"# 假设 image_info_list 已经在第一步中填充好了\n", | |
"image_size_info = []\n", | |
"\n", | |
"for img_filename, img_url in image_info_list:\n", | |
" try:\n", | |
" img_size = os.path.getsize(img_filename) / (1024 * 1024) # 将大小转换为MB\n", | |
" image_size_info.append((img_size, img_filename, img_url))\n", | |
" except FileNotFoundError:\n", | |
" print(f\"File not found: {img_filename}\")\n", | |
"\n", | |
"# 按图片大小排序\n", | |
"image_size_info.sort()\n", | |
"\n", | |
"# 打印前十大小的图片信息\n", | |
"for img_size, img_filename, img_url in image_size_info[-10:]:\n", | |
" print(f\"Size: {img_size:.2f} MB, Filename: {img_filename}, URL: {img_url}\")" | |
], | |
"metadata": { | |
"id": "qQznrKSa61gN" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment