codesample
# Scrape restaurant menus from allmenus.com: menu-page URLs and their scrape
# status live in a dbm database, and scraped rows are appended to a CSV file.
from bs4 import BeautifulSoup
from time import sleep
from tqdm import tqdm
import os | |
import dbm | |
import csv | |
import requests | |
import random | |

# Pool of desktop User-Agent strings to rotate between requests.
desktop_agents = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
]


def random_user_agent():
    return random.choice(desktop_agents)


def make_headers():
    # Browser-like headers; a fresh User-Agent is chosen on every call.
    headers = {
        'User-Agent': random_user_agent(),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Connection': 'keep-alive',
    }
    return headers


# One shared session so TCP connections are reused across requests.
s = requests.Session()


def scrape_menu(db, url):
    response = s.get('https://www.allmenus.com' + url,
                     headers=make_headers(),
                     timeout=31.05)
    if response.status_code != 200:
        db[url] = 'BAD'
        return
    soup = BeautifulSoup(response.content, features="lxml")
    try:
        state, city = [a.text.strip() for a in soup.find(class_="s-list-inline-breadcrumb").findAll("a")]
        address = soup.find(class_="menu-address").text.strip()
        restaurant_name = soup.find('div', class_='restaurant-summary').find('h1').text.strip()
    except (AttributeError, ValueError):
        # AttributeError: an expected element is missing (find() returned None);
        # ValueError: the breadcrumb did not unpack into exactly (state, city).
        # Leave the URL marked UNSCRAPED so it is retried on a later run.
        return
    items = soup.findAll("div", class_="item-main")
    item_dict_list = []
    for item in items:
        item_dict = {
            'restaurant_name': restaurant_name,
            'complete_address': address,
            'city': city,
            'state': state,
            'name': item.find(class_="item-title").text.strip(),
            'price_usd': item.find(class_="item-price").text.strip(),
        }
        item_dict_list.append(item_dict)
    if len(item_dict_list) == 0:
        db[url] = 'NO MENU'
        return
    # newline='' keeps the csv module from inserting blank rows on Windows;
    # fieldnames is the module-level list defined under __main__ below.
    with open('allmenus_menus.csv', 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        for item_dict in item_dict_list:
            writer.writerow(item_dict)
    db[url] = 'SCRAPED'
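
# Usage sketch for calling scrape_menu outside the __main__ loop (the menu
# path below is hypothetical; real keys come from the allmenus_restaurants
# dbm file). Note that scrape_menu reads the module-level `fieldnames` list,
# so it must be defined before a direct call:
#
#     fieldnames = ['restaurant_name', 'complete_address', 'city', 'state',
#                   'name', 'price_usd']
#     with dbm.open('allmenus_restaurants', 'c') as db:
#         scrape_menu(db, '/ny/new-york/1234-example-restaurant/menu')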


if __name__ == '__main__':
    fieldnames = ['restaurant_name', 'complete_address', 'city', 'state',
                  'name', 'price_usd']
    if not os.path.exists("allmenus_menus.csv"):
        # Master list of all the menu items. Unfiltered data.
        with open('allmenus_menus.csv', 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
    with dbm.open('allmenus_restaurants', 'c') as db:
        # Snapshot the keys up front so the db can be updated while iterating.
        restaurants = [(key.decode(), db[key].decode()) for key in db.keys()]
        for url, value in tqdm(restaurants):
            # Third-party delivery links can't be scraped here; tag and skip.
            if 'www.grubhub.com' in url:
                db[url] = 'GRUBHUB'
                continue
            if 'www.seamless.com' in url:
                db[url] = 'SEAMLESS'
                continue
            if value != 'UNSCRAPED':
                continue
            scrape_menu(db, url)
            sleep(.5)  # brief pause between requests to stay polite
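
# Assumption: allmenus_restaurants must already hold menu URL paths, each
# stored with the value 'UNSCRAPED', before this script does anything. A
# minimal seeding sketch, with a hypothetical hard-coded path standing in
# for however the URL list was actually collected:
#
#     seed_urls = ['/ny/new-york/1234-example-restaurant/menu']
#     with dbm.open('allmenus_restaurants', 'c') as db:
#         for url in seed_urls:
#             if db.get(url) is None:
#                 db[url] = 'UNSCRAPED'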