Last active
May 15, 2022 11:44
-
-
Save ohahohah/b7f07813437dd235c5f9829188c6d6d5 to your computer and use it in GitHub Desktop.
스탬프투어참여_박물관미술관리스트 스크래핑 후 엑셀에 저장. 구글 내 지도에서 엑셀파일을 지도 마커로 불러올 수 있음. 결과지도 : https://www.google.com/maps/d/edit?mid=1F_gqeG2V5V6ac07dkczUZ8CpmMRhV9lc&usp=sharing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime | |
from itertools import chain | |
import requests | |
from bs4 import BeautifulSoup | |
from openpyxl import Workbook | |
BASE_URL = 'https://xn--2d3b68pp1a79ecyl.kr'  # punycode for https://뮤지엄위크.kr
YEAR = '2022'  # worksheet title / stamp-tour year
# Region name -> 1-based <li> row index of that region on the list page.
REGION_CODE = {'서울': 1, '경기': 2, '강원/인천': 3, '경상/부산/울산/대구': 4, '전라/광주/제주': 5, '충청/대전/세종': 6}
# Desktop Chrome User-Agent so the site serves the desktop markup the CSS selectors expect.
DESKTOP_CHROME = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
def scrap_museum_urls():
    """Scrape the detail-page URL of every museum, across all regions in REGION_CODE."""
    return [url
            for code in REGION_CODE.values()
            for url in scrap_museum_urls_region(code)]
def scrap_museum_urls_region(region_code):
    """Scrape museum detail-page URLs for one region from the stamp-tour list page.

    Parameters:
        region_code (int): region code = 1-based <li> row index of the region
            on the list page (see REGION_CODE)

    Returns:
        list: de-duplicated absolute museum URLs (order not guaranteed)
    """
    data = requests.get(f'{BASE_URL}/program/stamp', headers=DESKTOP_CHROME)
    soup = BeautifulSoup(data.text, 'html.parser')
    museum_list_tb = f'#app > main > div.container > div > div > div.info09_con > div.contents_con > div ' \
                     f'> table > tbody > tr > td:nth-child(2) > div.list_con > ul > li:nth-child({region_code}) > div' \
                     f'> table >tbody'
    museum_urls = []
    for museum_td in soup.select(f'{museum_list_tb}> tr > td'):
        for museum_li in museum_td.select('div > ul > li'):
            anchor = museum_li.select_one('a')
            # Fix: the original indexed select_one('a')['href'] unconditionally
            # (twice), which raises TypeError on an <li> without an <a> and
            # KeyError on an <a> without href. Look up once and skip safely.
            if anchor is None:
                continue
            href = anchor.get('href')
            if href and is_url(href):
                museum_urls.append(convert_url(BASE_URL, href))
    # set() removes duplicates across table cells.
    return list(set(museum_urls))
def scrap_museum_info(url):
    """Scrape one museum's info from its detail page.

    Parameters:
        url (string): detail-page URL to scrape

    Returns:
        dictionary:
            {'name': 장소명, 'address': 주소, 'phone': 전화번호, 'homepage': 홈페이지,
             'url': 기관 url, 'opening_hrs': {운영시간 정보}}
            Missing or empty fields are reported as the string 'NaN'.
    """
    data = requests.get(url, headers=DESKTOP_CHROME)
    soup = BeautifulSoup(data.text, 'html.parser')
    base_element = '#app > main > div.container.museum.detail > div.museum-header > div.info'

    def _text(selector, strip=False):
        # Text of the element under base_element, or 'NaN' when absent/empty.
        node = soup.select_one(f'{base_element}{selector}')
        if node is None:
            return 'NaN'
        text = node.text.strip() if strip else node.text
        return text if text != '' else 'NaN'

    homepage_raw = soup.select_one(f'{base_element}> ul > li:nth-child(3) > div:nth-child(1) > div > a')
    homepage = homepage_raw['href'] if homepage_raw is not None else 'NaN'
    hr_operation_raw = soup.select_one(f'{base_element}> ul > li.info-group-item.operating')
    if hr_operation_raw is not None:
        # The operating block's text alternates label/value on separate lines;
        # zipping one iterator with itself pairs them into a dict.
        it = iter(filter(bool, hr_operation_raw.text.splitlines()))
        opening_hrs = dict(zip(it, it))
    else:
        opening_hrs = {'평일 관람시간': 'NaN', '공휴일 관람시간': 'NaN', '휴관일': 'NaN'}
    # Fix: the original assembled this dict by introspecting locals(), which is
    # fragile (breaks on any rename). Build it explicitly with the same keys,
    # order, and ''->'NaN' normalization.
    return {
        'name': _text('> div.name'),
        'address': _text('> ul > li:nth-child(1) > div', strip=True),
        'phone': _text('> ul > li:nth-child(2) > div'),
        'homepage': homepage if homepage != '' else 'NaN',
        'url': url if url != '' else 'NaN',
        'opening_hrs': opening_hrs,
    }
def save_xlsx():
    """Save the info of every stamp-tour museum to a timestamped xlsx workbook."""
    xlsx_name = f'museum_stamp_{datetime.now().strftime("%Y%m%d_%H_%M_%S")}.xlsx'
    wb = Workbook()
    work_sheet = wb.active
    work_sheet.title = YEAR
    categories = ['장소', '주소', '전화번호', '홈페이지', '뮤지엄위크 url', '평일 관람시간', '공휴일 관람시간', '휴관일', ]
    # Header row. Idiom fix: enumerate(..., start=1) instead of range(len(...)).
    for col, category in enumerate(categories, start=1):
        work_sheet.cell(row=1, column=col, value=category)
    # One data row per museum, starting directly under the header row.
    for row, m_url in enumerate(scrap_museum_urls(), start=2):
        info = convert_dict_to_list(scrap_museum_info(m_url))
        print(info)
        for col, value in enumerate(info, start=1):
            work_sheet.cell(row=row, column=col, value=value)
    wb.save(xlsx_name)
def convert_dict_to_list(nested_dict):
    """Flatten a museum-info dict into one worksheet row.

    Parameters:
        nested_dict (dict): museum info whose LAST value is itself a dict
            (the 'opening_hrs' sub-dict)

    Returns:
        list: top-level values in insertion order, followed by the values of
            the trailing sub-dict
    """
    values = list(nested_dict.values())
    # Fix: the original wrapped the popped sub-dict in a redundant dict() copy
    # before reading its values; pop it and splice its values directly.
    opening_hrs = values.pop()
    return list(chain(values, opening_hrs.values()))
def convert_url(site_url, str_url):
    """Join the site root and a relative href into one absolute URL string."""
    return site_url + str_url
def is_url(str_url):
    """Return True unless the href is the site's 'javascript:;' placeholder link."""
    # Idiom fix: `True if cond else False` is just the condition itself.
    return str_url != 'javascript:;'
if __name__ == '__main__':
    # Entry point: scrape every region's museums and write the xlsx workbook.
    save_xlsx()
def save_xlsx_with_keys():
    """Save stamp-tour museum info to xlsx, addressing cells by A1-style keys.

    .. deprecated:: 2022.05.15
    """
    museum_urls = scrap_museum_urls()
    xlsx_name = f'museum_stamp_{datetime.now().strftime("%Y%m%d_%H_%M_%S")}.xlsx'
    year = '2022'
    wb = Workbook()
    work_sheet = wb.active
    work_sheet.title = year
    # Column letter -> header label, in worksheet order.
    columns = (('A', '장소'), ('B', '주소'), ('C', '전화번호'), ('D', '홈페이지'),
               ('E', '평일 관람시간'), ('F', '공휴일 관람시간'), ('G', '휴관일'),
               ('H', '뮤지엄위크 url'))
    for letter, label in columns:
        work_sheet[f'{letter}1'] = label
    for row, m_url in enumerate(museum_urls, start=2):
        info = scrap_museum_info(m_url)
        print(info)
        hrs = info['opening_hrs']
        cell_values = (info['name'], info['address'], info['phone'], info['homepage'],
                       hrs['평일 관람시간'], hrs['공휴일 관람시간'], hrs['휴관일'], info['url'])
        for (letter, _), value in zip(columns, cell_values):
            work_sheet[f'{letter}{row}'] = value
    wb.save(xlsx_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment