Skip to content

Instantly share code, notes, and snippets.

@CaptainStabs
Created July 26, 2025 18:43
Show Gist options
  • Save CaptainStabs/75e58cd50e0d95db464130d800015d4e to your computer and use it in GitHub Desktop.
Save CaptainStabs/75e58cd50e0d95db464130d800015d4e to your computer and use it in GitHub Desktop.
codesample
from bs4 import BeautifulSoup
from time import sleep
from tqdm import tqdm
import os
import dbm
import csv
import requests
import random
# Pool of real-world desktop browser User-Agent strings. One is drawn at
# random per request so the scraper's traffic looks less uniform.
desktop_agents = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
]


def random_user_agent():
    """Return one User-Agent string chosen uniformly from ``desktop_agents``."""
    return random.choice(desktop_agents)
def make_headers():
    """Build a fresh browser-like header dict for one HTTP request.

    A new User-Agent is drawn on every call, so consecutive requests do
    not all advertise the same browser.
    """
    return {
        'User-Agent': random_user_agent(),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Connection': 'keep-alive',
    }
# Shared Session so keep-alive connections and cookies persist across requests.
s = requests.Session()
def scrape_menu(db, url):
    """Scrape one allmenus.com restaurant page and append its items to the CSV.

    Parameters:
        db:  open dbm mapping of url -> status string; updated in place with
             'BAD' (non-200 response), 'NO MENU' (no items found), or
             'SCRAPED' (items written).
        url: site-relative path, appended to https://www.allmenus.com.

    Uses the module-level session ``s`` and column list ``fieldnames``.
    Returns None; pages with an unexpected layout are skipped silently so the
    crawl can continue.
    """
    response = s.get('https://www.allmenus.com' + url,
                     headers=make_headers(),
                     timeout=31.05)
    if response.status_code != 200:
        db[url] = 'BAD'
        return
    soup = BeautifulSoup(response.content, features="lxml")
    try:
        state, city = [a.text.strip() for a in soup.find(class_="s-list-inline-breadcrumb").findAll("a")]
        address = soup.find(class_="menu-address").text.strip()
        restaurant_name = soup.find('div', class_='restaurant-summary').find('h1').text.strip()
    except (ValueError, AttributeError):
        # ValueError: breadcrumb links did not unpack into exactly (state, city).
        # AttributeError: a required element is missing, so find() returned
        # None (the original only caught ValueError and crashed the whole run
        # on such pages). Either way the layout is unexpected: skip the page
        # without marking it, so a later run can retry.
        return
    items = soup.findAll("div", class_="item-main")
    item_dict_list = []
    for item in items:
        item_dict = {
            'restaurant_name': restaurant_name,
            'complete_address': address,
            'city': city,
            'state': state,
            'name': item.find(class_="item-title").text.strip(),
            'price_usd': item.find(class_="item-price").text.strip(),
        }
        item_dict_list.append(item_dict)
    if not item_dict_list:
        db[url] = 'NO MENU'
        return
    # newline='' is required by the csv module; without it every row is
    # followed by a blank line on Windows.
    with open('allmenus_menus.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerows(item_dict_list)
    db[url] = 'SCRAPED'
if __name__ == '__main__':
    # Column order for the master CSV of all scraped menu items
    # (unfiltered data).
    fieldnames = ['restaurant_name', 'complete_address', 'city', 'state',
                  'name', 'price_usd']
    # Create the CSV with a header row on first run only; later runs append.
    if not os.path.exists("allmenus_menus.csv"):
        # newline='' keeps the csv module from emitting blank rows on Windows.
        with open('allmenus_menus.csv', 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
    with dbm.open('allmenus_restaurants', 'c') as db:
        # Snapshot keys/values first: mutating a dbm while iterating it
        # directly is unsafe.
        restaurants = [(key.decode(), db[key].decode()) for key in db.keys()]
        for url, value in tqdm(restaurants):
            # Third-party delivery links are tagged and skipped outright.
            if 'www.grubhub.com' in url:
                db[url] = 'GRUBHUB'
                continue
            if 'www.seamless.com' in url:
                db[url] = 'SEAMLESS'
                continue
            if value != 'UNSCRAPED':
                continue
            scrape_menu(db, url)
            sleep(.5)  # polite crawl delay between requests
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment