Created
October 12, 2019 19:24
-
-
Save mdzhang/5d6ad5927295a0de1b8ac93140281fb1 to your computer and use it in GitHub Desktop.
Scrape NomadList for data and dump to CSV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Scrape data from NomadList into local CSV. | |
Usage: | |
python scrape.py --cities Austin 'Chiang Mai' Taipei Auckland Ubud 'Buenos Aires' 'Mexico City' | |
""" | |
import argparse | |
import logging | |
import os | |
import re | |
import string | |
import typing as T | |
import pandas as pd | |
from bs4 import BeautifulSoup | |
from selenium import webdriver | |
from tabulate import tabulate | |
# Configure logging once at import time; a plain basicConfig() is enough
# for a one-off CLI script.
logging.basicConfig()
logger = logging.getLogger(__name__)
# DEBUG verbosity is deliberate: this scraper is interactive/diagnostic.
logger.setLevel(logging.DEBUG)
def load_to_record(city):
    """Fetch the NomadList page for *city* and return its score table.

    Args:
        city: Human-readable city name, e.g. ``'Chiang Mai'``.

    Returns:
        Dict mapping raw score names (e.g. Cost, Internet) to their raw
        string values, plus a ``'city'`` key holding the original name.

    Raises:
        ValueError: If the page has no ``tab-ranking`` score table.
    """
    # NomadList URLs are lowercase, hyphen-separated slugs.
    clean_city = re.sub(r"\s+", "-", city.lower())
    url = f"https://nomadlist.com/{clean_city}"

    # Selenium (not plain HTTP) because the page content is JS-rendered.
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        html_source = driver.page_source
    finally:
        # quit() tears down the whole session; close() only closes the
        # window and can leave the geckodriver process running — and
        # without try/finally a scrape error leaked the browser entirely.
        driver.quit()

    soup = BeautifulSoup(html_source, "html.parser")
    nomad_scores = soup.find("div", attrs={"class": "tab-ranking"})
    if nomad_scores is None:
        # Clearer than the IndexError the old find_all(...)[0] raised.
        raise ValueError(f"No ranking table found for city: {city}")

    keys = [e.getText() for e in nomad_scores.find_all("td", attrs={"class": "key"})]
    values = [e.getText() for e in nomad_scores.find_all("td", attrs={"class": "value"})]

    record = dict(zip(keys, values))
    record["city"] = city
    return record
def load_to_df(cities):
    """Scrape each city and assemble a cleaned, sorted DataFrame.

    Cities that fail to scrape are logged and skipped. Cost, Internet and
    Nomad Score are parsed out of their raw strings into numbers; the
    remaining rating columns become ordered categoricals so sorting by
    them is meaningful.

    Args:
        cities: Iterable of city names to scrape.

    Returns:
        DataFrame with one row per successfully scraped city, sorted by
        friendliness/safety columns (descending).
    """

    def skip_fail(city):
        # Best effort: one bad city must not abort the whole run.
        try:
            return load_to_record(city)
        except Exception:
            logger.exception(f"Failed to fetch city: {city}")
            return None

    records = [r for r in (skip_fail(c) for c in cities) if r is not None]
    df = pd.DataFrame.from_dict(records)

    def strip_emojis(s):
        # Headers come back with emoji prefixes; keep printable ASCII only.
        return "".join(filter(lambda x: x in string.printable, s)).strip()

    df.columns = [strip_emojis(col) for col in df.columns]

    top_cols = [
        "LGBT friendly",
        "Female friendly",
        "Safety",
        "Nomad Score",
        "Internet",
        "Walkability",
        "Traffic safety",
        "English speaking",
        "Fun",
        "Happiness",
        "Places to work from",
        "Cost",
        "city",
    ]
    # .copy() makes the column assignments below act on an independent
    # frame instead of a view of `df` (avoids SettingWithCopyWarning and
    # silently-lost writes under chained assignment).
    df2 = df[top_cols].copy()

    def extract_cost(frame):
        # e.g. 'Cost: ... $1,500 / mo' -> 1500
        parts = frame["Cost"].str.split(":", expand=True)
        return (
            parts[1]
            .str.extract(pat=r"\$([\d,]+) \/ mo")[0]
            .str.replace(",", "")
            .astype(int)
        )

    df2["Cost"] = extract_cost(df2)

    def extract_internet(frame):
        # e.g. 'Internet: ... 25Mbps (avg)' -> 25
        parts = frame["Internet"].str.split(":", expand=True)
        return (
            parts[1]
            .str.extract(pat=r"([\d,]+)Mbps \(avg\)")[0]
            .str.replace(",", "")
            .astype(int)
        )

    df2["Internet"] = extract_internet(df2)

    def extract_nomad_score(frame):
        # e.g. '4.5/5' -> 4.5. The original read the enclosing df2 via
        # closure and ignored its parameter; use the parameter.
        return frame["Nomad Score"].str.split("/", expand=True)[0].astype(float)

    df2["Nomad Score"] = extract_nomad_score(df2)

    # Remaining object columns are ratings like 'Good'/'Great'; make them
    # ordered categoricals so sort_values ranks them sensibly.
    # A list indexer (not a set) — pandas deprecated set-based indexing.
    cat_cols = [c for c in df2.columns[df2.dtypes == "object"] if c != "city"]
    levels = ["Bad", "Okay", "Good", "Great"]
    df2[cat_cols] = df2[cat_cols].apply(
        lambda s: s.astype("category").cat.set_categories(levels, ordered=True)
    )

    return df2.sort_values(
        by=["LGBT friendly", "Female friendly", "Walkability", "Safety"],
        ascending=False,
    )
def get_parser():
    """Build the command-line parser for this scraper script."""
    cli = argparse.ArgumentParser(
        description="Fetch data from NomadList and write as CSV"
    )
    cli.add_argument("--cities", nargs="+", help="Cities to fetch data on")
    return cli
def main(cities: T.Optional[T.List[str]] = None) -> None:
    """Scrape (or reuse cached) NomadList data and print it as a table.

    The original signature was ``def main(cities=T.List[str])`` — the
    typing object was used as a *default value* instead of an annotation,
    so a no-arg call would pass ``typing.List[str]`` to load_to_df. Fixed
    to a real annotation with a ``None`` default.

    Args:
        cities: City names to scrape on a cache miss. Ignored when the
            local CSV cache already exists.
    """
    cache_file = "nomadlist.csv"
    if not os.path.exists(cache_file):
        logger.info(f"Fetching contents for first time '{cache_file}'")
        df = load_to_df(cities)
        df.to_csv(cache_file, index=False)
    else:
        logger.info(f"Reusing local '{cache_file}'")
        df = pd.read_csv(cache_file)
    print(tabulate(df, headers="keys", tablefmt="psql"))
if __name__ == "__main__":
    # Parse CLI args and hand the city list straight to main().
    cli_args = get_parser().parse_args()
    main(cities=cli_args.cities)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment