Scraper of the list of Muslim-friendly restaurants on Okayama Health Tourism / Okayama City
'''
This script scrapes okayamahealthtourism.com/food/okayama-city/ and
gathers every restaurant's name and address.

The result is printed to stdout; progress messages and warnings go to
stderr, so redirect stderr if you find them annoying.

The result will need some hand-picking and hand-cleaning.
Though I haven't confirmed it, this should also work for the Maniwa City page.

LICENSE (all files in this Gist): CC0 1.0 Universal
'''
import json
import sys

import requests
from bs4 import BeautifulSoup

page = 'https://okayamahealthtourism.com/food/okayama-city/'

# Present a regular desktop-browser User-Agent so the site serves the page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
}
try:
    res = requests.get(page, headers=headers)
except Exception as e:
    print(f'Failed to get the foods page: {e}', file=sys.stderr)
    sys.exit(1)

if res.status_code != 200:
    print(f'Failed to get the foods page: HTTP {res.status_code}', file=sys.stderr)
    sys.exit(1)
html = BeautifulSoup(res.text, 'html.parser')

# Collect the links to the individual restaurant pages.
urls = html.select('.vc_column-inner > .wpb_wrapper a')
urls = [u.attrs['href'] for u in urls]

addresses = dict()
for url in urls:
    print(f'>> {url}', file=sys.stderr)

    # Fetch each restaurant page, retrying up to three times. Only a
    # successful (200) response is kept; anything else leaves res as None.
    res = None
    for i in range(3):
        try:
            r = requests.get(url, headers=headers)
        except Exception as e:
            print(f'Error: failed to get {url}: {e}', file=sys.stderr)
            continue
        if r.status_code != 200:
            print(f'Got non-200 for {url}', file=sys.stderr)
            continue
        res = r
        break

    if res is None:
        continue
    html = BeautifulSoup(res.text, 'html.parser')

    # The restaurant name is the page title minus the site-wide prefix.
    title = html.find('title').text.replace('Okayama Health Tourism | ', '').strip()
    spans = html.find_all('span')
    found = False

    # First heuristic: the address usually sits in a <span> labeled "ADDRESS".
    for span in spans:
        text = span.text.lower()
        if 'address' in text:
            # Drop the label, keep the first line only, and trim colons,
            # spaces, and non-breaking spaces.
            address = span.text.replace('ADDRESS', '').split('\n')[0].strip(': \u00a0\u0020')
            al = addresses.get(title, [])
            al.append(address)
            addresses[title] = al
            print(f'{title}: {address}', file=sys.stderr)
            found = True
    if not found:
        # Fallback heuristic: take a <p> that mentions Okayama City
        # (or "-shi"), skipping the copyright footer.
        ps = html.find_all('p')
        for p in ps:
            text = p.text.lower()
            if 'copyright' in text:
                continue
            if 'okayama' in text and ('city' in text or 'shi' in text):
                address = p.text.replace('ADDRESS', '').split('\n')[0].strip(': \u00a0\u0020')
                al = addresses.get(title, [])
                al.append(address)
                addresses[title] = al
                print(f'{title}: {address}', file=sys.stderr)
                found = True

    if not found:
        print(f'Warning: no address was found for {title}', file=sys.stderr)
# Record the number of entries under a sentinel key, then dump the whole
# mapping as JSON to stdout.
addresses['__len__'] = len(addresses)
print(json.dumps(addresses, indent=2, ensure_ascii=False))
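Here is a minimal sketch of consuming the script's output, assuming stdout was saved as addresses.json (a hypothetical file name; the script itself only prints):

import json
import sys

# Hypothetical: the scraper's stdout saved beforehand, e.g.
#   python scrape.py 2>/dev/null > addresses.json
with open('addresses.json', encoding='utf-8') as f:
    data = json.load(f)

# The scraper stores its entry count under the sentinel key '__len__'.
count = data.pop('__len__', None)
print(f'{count} restaurants scraped', file=sys.stderr)

for name, addrs in data.items():
    print(f'{name}: {"; ".join(addrs)}')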
requirements.txt:

requests
beautifulsoup4
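Assuming the two dependencies above are saved as requirements.txt, install them with pip install -r requirements.txt before running the script.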