Xifax · July 14, 2020 17:12
diff --git a/onomato.py b/onomato.py
 """
 Parse https://onomatoproject.com/list.html to json dict and download images.
 """
 from typing import Dict, Tuple

 import requests
 from bs4 import BeautifulSoup
 from pathlib import Path
 import json

 BASE = "./images/"


 def download_link(link) -> Tuple[str, Dict]:
    """Scrape one entry page and save its illustration under ``BASE``.

    Args:
        link: Absolute URL of the entry page to fetch.

    Returns:
        A ``(kana, data)`` tuple. ``kana`` is the text of the page's first
        ``<h1>``. ``data`` is a dict with the keys ``"definition"`` (text of
        the ``termdefs`` div), ``"examples"`` (a list of dicts with
        ``"text"``, ``"ruby"`` and ``"translation"``) and ``"image"`` (the
        saved file name, or ``""`` when no image could be obtained).

    Raises:
        requests.HTTPError: If the entry page answers with an error status.
        AttributeError: If the page has no ``<h1>`` or no ``termdefs`` div.
    """
    with requests.get(link) as response:
        # Fail on error pages the same way the list-page fetch does.
        response.raise_for_status()
        # NOTE(review): no parser is named here, so bs4 chooses whichever
        # one is installed; consider pinning one for reproducible output.
        html = BeautifulSoup(response.content)

    kana = html.h1.text
    definition = html.find("div", class_="termdefs").text

    examples = html.find_all("span", class_="standardtext")
    furigana = html.find_all("span", class_="furigana")
    translations = html.find_all("span", class_="english")
    # The three span lists are paired by position. zip() stops at the
    # shortest list, so a page with a missing reading or translation yields
    # fewer examples instead of aborting the whole run with IndexError.
    sentences = [
        {
            "text": example.text,
            "ruby": [str(node) for node in reading.contents],
            "translation": translation.text,
        }
        for example, reading, translation in zip(examples, furigana, translations)
    ]

    return (
        kana,
        {
            "definition": definition,
            "examples": sentences,
            "image": _save_image(html.img),
        },
    )


 def _image_file_name(url: str) -> str:
    """Return the last path segment of *url* without a trailing ``.html``."""
    name = url.split("/")[-1]
    # str.rstrip(".html") removes any run of the characters '.', 'h', 't',
    # 'm', 'l' (e.g. "bath.html" -> "ba"), so cut the exact suffix instead.
    suffix = ".html"
    if name.endswith(suffix):
        name = name[: -len(suffix)]
    return name


 def _save_image(img) -> str:
    """Download the image behind the *img* tag into ``BASE``.

    Returns the stored file name, or ``""`` when the page has no usable
    ``<img src=...>`` or the image request returns an error status.
    """
    src = img.get("src") if img is not None else None
    if not src:
        return ""

    image_url = f"https://onomatoproject.com/{src}"
    image_name = _image_file_name(image_url)

    directory = Path(BASE)
    # open()/write_bytes() cannot create missing parent directories.
    directory.mkdir(parents=True, exist_ok=True)
    path = directory / image_name
    if not path.exists():
        with requests.get(image_url) as response:
            if not response.ok:
                # Do not store an HTTP error body as if it were an image.
                return ""
            path.write_bytes(response.content)
    return image_name


 if __name__ == "__main__":
    # Crawl every entry linked from the list page and collect the parsed
    # entries into one dict keyed by the entry heading.
    results = {}
    with requests.get("https://onomatoproject.com/list.html") as r:
        r.raise_for_status()
        soup = BeautifulSoup(r.content)

    # The list response is fully parsed above, so the connection is released
    # before the (long) per-entry crawl starts.
    anchors = soup.find_all("a", class_="three-col-url")
    hrefs = [anchor.get("href") for anchor in anchors]
    # The final link is skipped on purpose.
    # NOTE(review): the reason is not recorded here - confirm it still holds.
    for href in hrefs[:-1]:
        if not href:
            # An anchor without href would otherwise become ".../None".
            continue
        # Parse each page into a dict entry and download its image.
        url = f"https://onomatoproject.com/{href}"
        print(url)
        entry, data = download_link(url)
        results[entry] = data

    with open("data.json", "w", encoding="utf-8") as f:
        # The file is UTF-8, so write kana as-is instead of \uXXXX escapes.
        json.dump(results, f, ensure_ascii=False)
	"""
	Parse https://onomatoproject.com/list.html to json dict and download images.
	"""
	from typing import Dict, Tuple

	import requests
	from bs4 import BeautifulSoup
	from pathlib import Path
	import json

	BASE = "./images/"


	def download_link(link) -> Tuple[str, Dict]:
	    """Scrape one entry page and save its illustration under ``BASE``.

	    Args:
	        link: Absolute URL of the entry page to fetch.

	    Returns:
	        A ``(kana, data)`` tuple. ``kana`` is the text of the page's first
	        ``<h1>``. ``data`` is a dict with the keys ``"definition"`` (text of
	        the ``termdefs`` div), ``"examples"`` (a list of dicts with
	        ``"text"``, ``"ruby"`` and ``"translation"``) and ``"image"`` (the
	        saved file name, or ``""`` when no image could be obtained).

	    Raises:
	        requests.HTTPError: If the entry page answers with an error status.
	        AttributeError: If the page has no ``<h1>`` or no ``termdefs`` div.
	    """
	    with requests.get(link) as response:
	        # Fail on error pages the same way the list-page fetch does.
	        response.raise_for_status()
	        # NOTE(review): no parser is named here, so bs4 chooses whichever
	        # one is installed; consider pinning one for reproducible output.
	        html = BeautifulSoup(response.content)

	    kana = html.h1.text
	    definition = html.find("div", class_="termdefs").text

	    examples = html.find_all("span", class_="standardtext")
	    furigana = html.find_all("span", class_="furigana")
	    translations = html.find_all("span", class_="english")
	    # The three span lists are paired by position. zip() stops at the
	    # shortest list, so a page with a missing reading or translation yields
	    # fewer examples instead of aborting the whole run with IndexError.
	    sentences = [
	        {
	            "text": example.text,
	            "ruby": [str(node) for node in reading.contents],
	            "translation": translation.text,
	        }
	        for example, reading, translation in zip(examples, furigana, translations)
	    ]

	    return (
	        kana,
	        {
	            "definition": definition,
	            "examples": sentences,
	            "image": _save_image(html.img),
	        },
	    )


	def _image_file_name(url: str) -> str:
	    """Return the last path segment of *url* without a trailing ``.html``."""
	    name = url.split("/")[-1]
	    # str.rstrip(".html") removes any run of the characters '.', 'h', 't',
	    # 'm', 'l' (e.g. "bath.html" -> "ba"), so cut the exact suffix instead.
	    suffix = ".html"
	    if name.endswith(suffix):
	        name = name[: -len(suffix)]
	    return name


	def _save_image(img) -> str:
	    """Download the image behind the *img* tag into ``BASE``.

	    Returns the stored file name, or ``""`` when the page has no usable
	    ``<img src=...>`` or the image request returns an error status.
	    """
	    src = img.get("src") if img is not None else None
	    if not src:
	        return ""

	    image_url = f"https://onomatoproject.com/{src}"
	    image_name = _image_file_name(image_url)

	    directory = Path(BASE)
	    # open()/write_bytes() cannot create missing parent directories.
	    directory.mkdir(parents=True, exist_ok=True)
	    path = directory / image_name
	    if not path.exists():
	        with requests.get(image_url) as response:
	            if not response.ok:
	                # Do not store an HTTP error body as if it were an image.
	                return ""
	            path.write_bytes(response.content)
	    return image_name


	if __name__ == "__main__":
	    # Crawl every entry linked from the list page and collect the parsed
	    # entries into one dict keyed by the entry heading.
	    results = {}
	    with requests.get("https://onomatoproject.com/list.html") as r:
	        r.raise_for_status()
	        soup = BeautifulSoup(r.content)

	    # The list response is fully parsed above, so the connection is released
	    # before the (long) per-entry crawl starts.
	    anchors = soup.find_all("a", class_="three-col-url")
	    hrefs = [anchor.get("href") for anchor in anchors]
	    # The final link is skipped on purpose.
	    # NOTE(review): the reason is not recorded here - confirm it still holds.
	    for href in hrefs[:-1]:
	        if not href:
	            # An anchor without href would otherwise become ".../None".
	            continue
	        # Parse each page into a dict entry and download its image.
	        url = f"https://onomatoproject.com/{href}"
	        print(url)
	        entry, data = download_link(url)
	        results[entry] = data

	    with open("data.json", "w", encoding="utf-8") as f:
	        # The file is UTF-8, so write kana as-is instead of \uXXXX escapes.
	        json.dump(results, f, ensure_ascii=False)