Scraper of the list of Muslim-friendly restaurants on Okayama Health Tourism / Okayama City
'''
This script scrapes okayamahealthtourism.com/food/okayama-city/ and
gathers every restaurant's name and address.

The result is printed to stdout; progress messages and warnings go to
stderr, so redirect stderr if you find them annoying.

The result will need some hand-picking and hand-cleaning.
Though I haven't confirmed it, this should also work for the Maniwa City page.

LICENSE (all files in this Gist): CC0 1.0 Universal
'''
import json
import sys

import requests
from bs4 import BeautifulSoup

page = 'https://okayamahealthtourism.com/food/okayama-city/'

# Present a regular desktop-browser User-Agent so the site serves the page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
}
try:
    res = requests.get(page, headers=headers)
except Exception as e:
    print(f'Failed to get the foods page: {e}', file=sys.stderr)
    sys.exit(1)

if res.status_code != 200:
    print(f'Failed to get the foods page: HTTP {res.status_code}', file=sys.stderr)
    sys.exit(1)
html = BeautifulSoup(res.text, 'html.parser')

# Collect the links to the individual restaurant pages.
urls = html.select('.vc_column-inner > .wpb_wrapper a')
urls = [u.attrs['href'] for u in urls]

addresses = dict()
for url in urls:
    print(f'>> {url}', file=sys.stderr)

    # Fetch each restaurant page, retrying up to three times. Only a
    # successful (200) response is kept; anything else leaves res as None.
    res = None
    for i in range(3):
        try:
            r = requests.get(url, headers=headers)
        except Exception as e:
            print(f'Error: failed to get {url}: {e}', file=sys.stderr)
            continue
        if r.status_code != 200:
            print(f'Got non-200 for {url}', file=sys.stderr)
            continue
        res = r
        break

    if res is None:
        continue
    html = BeautifulSoup(res.text, 'html.parser')

    # The restaurant name is the page title minus the site-wide prefix.
    title = html.find('title').text.replace('Okayama Health Tourism | ', '').strip()
    spans = html.find_all('span')
    found = False

    # First heuristic: the address usually sits in a <span> labeled "ADDRESS".
    for span in spans:
        text = span.text.lower()
        if 'address' in text:
            # Drop the label, keep the first line only, and trim colons,
            # spaces, and non-breaking spaces.
            address = span.text.replace('ADDRESS', '').split('\n')[0].strip(': \u00a0\u0020')
            al = addresses.get(title, [])
            al.append(address)
            addresses[title] = al
            print(f'{title}: {address}', file=sys.stderr)
            found = True
    if not found:
        # Fallback heuristic: take a <p> that mentions Okayama City
        # (or "-shi"), skipping the copyright footer.
        ps = html.find_all('p')
        for p in ps:
            text = p.text.lower()
            if 'copyright' in text:
                continue
            if 'okayama' in text and ('city' in text or 'shi' in text):
                address = p.text.replace('ADDRESS', '').split('\n')[0].strip(': \u00a0\u0020')
                al = addresses.get(title, [])
                al.append(address)
                addresses[title] = al
                print(f'{title}: {address}', file=sys.stderr)
                found = True

    if not found:
        print(f'Warning: no address was found for {title}', file=sys.stderr)
# Record the number of entries under a sentinel key, then dump the whole
# mapping as JSON to stdout.
addresses['__len__'] = len(addresses)
print(json.dumps(addresses, indent=2, ensure_ascii=False))
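Here is a minimal sketch of consuming the script's output, assuming stdout was saved as addresses.json (a hypothetical file name; the script itself only prints):

import json
import sys

# Hypothetical: the scraper's stdout saved beforehand, e.g.
#   python scrape.py 2>/dev/null > addresses.json
with open('addresses.json', encoding='utf-8') as f:
    data = json.load(f)

# The scraper stores its entry count under the sentinel key '__len__'.
count = data.pop('__len__', None)
print(f'{count} restaurants scraped', file=sys.stderr)

for name, addrs in data.items():
    print(f'{name}: {"; ".join(addrs)}')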
requirements.txt:

requests
beautifulsoup4
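Assuming the two dependencies above are saved as requirements.txt, install them with pip install -r requirements.txt before running the script.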