Created
November 9, 2016 23:54
-
-
Save pmallory/671e7e7398af426404edfd3f485b7fa6 to your computer and use it in GitHub Desktop.
Search a page for a representative image (a big, roughly square one). Inspired by the code Reddit uses to pick images to put next to headlines.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io | |
import sys | |
import urllib | |
import bs4 | |
import requests | |
from PIL import Image | |
def get_image_list(url):
    """Return the absolute URLs of the images referenced by an HTML page.

    The page at ``url`` is downloaded and every ``<img>`` tag is inspected.
    Relative ``src`` values are resolved against ``url``.

    The following are left out of the result:

    * ``<img>`` tags without a usable ``src`` attribute,
    * sources that are not plain ``http``/``https`` URLs (e.g. ``data:``
      URIs), since they cannot be fetched with ``requests.get`` later on,
    * SVG files, because PIL cannot open them.

    Args:
        url: Address of the HTML document to scan.

    Returns:
        A list of absolute image URLs, in document order.

    Raises:
        requests.RequestException: If the page cannot be downloaded.
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that
    # the ``urllib.parse`` submodule has been loaded.
    from urllib.parse import urljoin, urlparse

    # Without a timeout an unresponsive server would block forever.
    response = requests.get(url, timeout=10)
    soup = bs4.BeautifulSoup(response.content, "html.parser")

    image_urls = []
    for image_tag in soup.find_all('img'):
        src = image_tag.get('src')
        if not src:
            # Joining a missing/empty src would just yield the page's own
            # URL, which is not an image.
            continue

        # src might be a relative url, urljoin will make a full url if necessary
        full_url = urljoin(url, src)
        parsed = urlparse(full_url)

        if parsed.scheme not in ('http', 'https'):
            continue
        # Check the path only, so "logo.SVG" and "logo.svg?v=2" are caught too.
        if parsed.path.lower().endswith('.svg'):
            continue

        image_urls.append(full_url)
    return image_urls
def image_dimensions(image):
    """Return ``(width, height, area)`` for ``image``.

    ``image`` must expose a ``size`` sequence whose first two items are the
    width and the height; the area is their product.
    """
    dims = image.size
    w, h = dims[0], dims[1]
    return w, h, w * h
def is_square(width, height):
    """Determine if an image is close to square shaped.

    An image counts as "close to square" when neither side is twice as long
    as the other or more, i.e. ``0.5 < width / height < 2``.

    Args:
        width: Image width.
        height: Image height.

    Returns:
        True if the aspect ratio lies strictly between 1:2 and 2:1. Degenerate
        images with a zero (or negative) side are never square.
    """
    if width <= 0 or height <= 0:
        # Guards against ZeroDivisionError for zero-height images.
        return False
    # Division-free form of 0.5 < width / height < 2.
    return height < 2 * width and width < 2 * height
def is_large(area, min_area=5000):
    """Determine if an image is largish.

    Args:
        area: Image area (width times height).
        min_area: Threshold the area must exceed. Defaults to 5000.

    Returns:
        True if ``area`` is strictly greater than ``min_area``.
    """
    return area > min_area
def select_best_image(image_urls):
    """Pick the largest roughly-square image out of ``image_urls``.

    Each URL is downloaded and opened with PIL to read its dimensions. An
    image is a candidate when it passes both ``is_square`` and ``is_large``;
    among the candidates the one with the greatest area wins. On a tie the
    earliest URL is kept.

    Selection is best-effort: URLs that cannot be downloaded, answer with an
    HTTP error status, or do not contain an image PIL can read are skipped
    instead of aborting the whole search.

    Args:
        image_urls: Iterable of absolute image URLs.

    Returns:
        The URL of the best image, or None if no image qualified.
    """
    biggest_image = None
    biggest_size = 0
    for url in image_urls:
        try:
            # Without a timeout a single stalled server would hang the scan.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            # The context manager releases the image once it has been measured.
            with Image.open(io.BytesIO(response.content)) as im:
                width, height, area = image_dimensions(im)
        except (requests.RequestException, OSError):
            # Network failure, HTTP error, or data PIL cannot identify.
            continue

        if not (is_square(width, height) and is_large(area)):
            continue
        if area > biggest_size:
            biggest_image = url
            biggest_size = area
    return biggest_image
# Command-line entry point: print the best image url found on the given page.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit('usage: {} URL'.format(sys.argv[0]))
    url = sys.argv[1]
    images = get_image_list(url)
    print(select_best_image(images))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment