Last active
July 18, 2022 09:10
-
-
Save jkjung-avt/dd40526ba97665cfbf847858fdd52e12 to your computer and use it in GitHub Desktop.
從公開資訊觀測站下載108年度台灣上市、上櫃公司 "非擔任主管職務之全時員工薪資資訊"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""crawl_salaries.py | |
For crawling average salaries of all public listed companies on Taiwan | |
Stock Exchange. Data source: https://mops.twse.com.tw/mops/web/t100sb15 | |
Source code reference: | |
https://blog.techbridge.cc/2019/07/26/how-to-use-taiwan-salary-data-to-do-python-data-analytics-and-data-visualization/ | |
Usage: | |
$ python3 crawl_salaries.py | |
Outputs: | |
108_sii.csv | |
108_otc.csv | |
""" | |
from collections import OrderedDict | |
import requests | |
import pandas as pd | |
from bs4 import BeautifulSoup | |
class HTMLTableParser:
    """Parse HTML <table> elements from the TWSE salary query page into
    pandas DataFrames."""

    def get_html_tables_from_resp(self, html_text):
        """Return all <table> tags found in the given HTML text.

        # Arguments
            html_text: raw HTML string (e.g. ``resp.text``).

        # Returns
            A list of bs4 Tag objects, one per <table> element.
        """
        soup = BeautifulSoup(html_text, 'html.parser')
        return soup.find_all('table')

    def parse_html_table(self, table):
        """Convert one salary <table> into a pandas DataFrame.

        The table layout (abridged) is:

            <tr>  <!-- group header row -->
              <th rowspan="2">產業類別</th>
              <th rowspan="2">公司代號</th>
              <th rowspan="2">公司名稱</th>
              <th colspan="4">非擔任主管職務之全時員工資訊</th>
              <th colspan="2">同業公司資訊</th>
              <th colspan="4">薪資統計情形</th>
            </tr>
            <tr>  <!-- detail header row -->
              <th>員工薪資總額(仟元)</th> ... <th>公司經營績效...說明</th>
            </tr>
            <tr>  <!-- data rows -->
              <td>資訊服務業</td> <td>8416</td> <td>實威</td> ...
            </tr>

        # Arguments
            table: a bs4 Tag for the <table> element.

        # Returns
            A pandas DataFrame with one row per data <tr>.
        """
        # Column titles come from the <th> tags.  Indices 3, 4 and 5 are the
        # colspan group headers (they span several data columns but are not
        # columns themselves), so they are skipped.
        header_tags = table.find_all('th')
        column_names = [tag.get_text()
                        for idx, tag in enumerate(header_tags)
                        if idx not in (3, 4, 5)]
        # "員工薪資-平均數" and "每股盈餘" appear twice: once for the company
        # itself and once for its industry peers.  Prefix the peer copies so
        # every column name is unique.
        column_names[7] = '同業公司{}'.format(column_names[7])
        column_names[8] = '同業公司{}'.format(column_names[8])
        # Collect one list of stripped cell texts per data row.  Header-only
        # rows contain no <td> and are skipped.  find_all('td') is called once
        # per row (the original called it twice).
        rows = []
        for row_tag in table.find_all('tr'):
            td_tags = row_tag.find_all('td')
            if td_tags:
                rows.append([td.get_text().strip() for td in td_tags])
        # Plain dicts preserve insertion order on Python 3.7+, so the former
        # OrderedDict wrapper added nothing.
        parsed_data = [
            {column_names[idx]: cell for idx, cell in enumerate(row)}
            for row in rows
        ]
        return pd.DataFrame.from_dict(parsed_data)
def get_108_data(typek):
    """Get average salaries data for the specified market in year 108 (2019).

    # Arguments
        typek: 'sii' (上市, listed) or 'otc' (上櫃, over-the-counter)

    # Returns
        A pandas DataFrame containing the requested data, with salary columns
        converted to integers (NT$) and EPS columns converted to floats.
    """
    html_parser = HTMLTableParser()
    payload = {
        # 'encodeURIComponent': 1,
        'step': 1,
        'firstin': 1,
        'TYPEK': typek,
        'RYEAR': 108,
        'code': '',
    }
    # A browser-like User-Agent is required; the site rejects the default
    # python-requests client string.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    # The original timeout of 2 seconds frequently expires on this site;
    # 10 seconds is a more realistic ceiling for the AJAX endpoint.
    resp = requests.post('https://mops.twse.com.tw/mops/web/ajax_t100sb15',
                         data=payload, headers=headers, timeout=10)
    html_tables = html_parser.get_html_tables_from_resp(resp.text)
    df_table = html_parser.parse_html_table(html_tables[0])
    # Rename the (partly duplicated) scraped headers to stable column names:
    # 產業類別,公司代號,公司名稱,員工薪資總額(仟元),員工人數-年度平均(人),
    # 員工薪資-平均數(仟元/人)-108年,員工薪資-平均數(仟元/人)-107年,
    # 員工薪資-中位數(仟元/人)-108年,員工薪資-中位數(仟元/人)-107年,
    # 平均每股盈餘(元/股),
    # 同業公司員工薪資-平均數(仟元/人),同業公司每股盈餘(元/股),
    # plus four flag/remark columns.
    df_table.columns = [
        '產業類別', '公司代號', '公司名稱', '員工薪資總額', '員工人數-年度平均',
        '員工薪資-平均數-108年', '員工薪資-平均數-107年',
        '員工薪資-中位數-108年', '員工薪資-中位數-107年',
        '平均每股盈餘',
        '同業公司員工薪資-平均數', '同業公司每股盈餘',
        'is_under_50w', 'high_eps_low_salary',
        'growth_but_low_salary', 'low_salary_reason']

    def _to_int(series, scale=1):
        # Strip thousands separators, map the placeholder values 'nan' and ''
        # to '0' (Series.replace, exact match), then cast.  Salary figures on
        # the page are in units of 1,000 NT$, hence the scale factor.
        return (series.str.replace(',', '')
                .replace('nan', '0').replace('', '0')
                .astype(int) * scale)

    def _to_float(series):
        # Same placeholder handling for the EPS columns, which are floats.
        return (series.str.replace('nan', '0')
                .replace('', '0')
                .astype(float))

    # Convert salary numbers from strings into integers (headcount is not
    # scaled; all money columns are 仟元 -> NT$).
    df_table['員工人數-年度平均'] = _to_int(df_table['員工人數-年度平均'])
    for col in ('員工薪資總額', '員工薪資-平均數-108年', '員工薪資-平均數-107年',
                '員工薪資-中位數-108年', '員工薪資-中位數-107年',
                '同業公司員工薪資-平均數'):
        df_table[col] = _to_int(df_table[col], scale=1000)
    df_table['平均每股盈餘'] = _to_float(df_table['平均每股盈餘'])
    df_table['同業公司每股盈餘'] = _to_float(df_table['同業公司每股盈餘'])
    return df_table
def main():
    """Crawl year-108 salary data for both markets and save them as CSV."""
    df_sii = get_108_data('sii')
    df_sii.to_csv('108_sii.csv', index=False, encoding='utf-8')
    # The original reused the df_sii name for the OTC data, which was
    # misleading; the OTC frame gets its own name here.
    df_otc = get_108_data('otc')
    df_otc.to_csv('108_otc.csv', index=False, encoding='utf-8')


if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""google_image_links.py | |
For fetching image links google by querying a keyword. | |
# Prerequisite: | |
1. Download "chromedriver" (same version as your Chrome browser). | |
link: https://chromedriver.chromium.org/downloads | |
2. Unzip the downloaded file and put "chromedriver" binary into ${HOME}/bin. | |
2. `pip3 install selenium` | |
""" | |
import os | |
import selenium | |
DRIVER_PATH = os.environ['HOME'] + '/bin/chromedriver' | |
WEB_DRIVER = webdriver.Chrome(executable_path=DRIVER_PATH) | |
def google_image_links(keyword, max_links_to_fetch):
    """Collect image URLs from a Google Images search for *keyword*.

    # Arguments
        keyword: the search term to query Google Images with.
        max_links_to_fetch: stop once at least this many unique URLs are found.

    # Returns
        A list of unique image URL strings (order unspecified — URLs are
        accumulated in a set).
    """
    # The module originally never imported time; a local import keeps this
    # function self-contained either way.
    import time

    def scroll_to_end(wd):
        # Scroll to the bottom so Google lazy-loads the next batch of results.
        wd.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(1)

    # build the google query
    search_url = ('https://www.google.com/search?safe=off&site=&tbm=isch&'
                  'source=hp&q={q}&oq={q}&gs_l=img')
    # load the page
    WEB_DRIVER.get(search_url.format(q=keyword))

    image_urls = set()
    image_count = 0
    results_start = 0
    # NOTE(review): if Google stops returning new thumbnails before
    # max_links_to_fetch is reached, this loop does not terminate — same as
    # the original; confirm whether a retry cap is wanted.
    while image_count < max_links_to_fetch:
        scroll_to_end(WEB_DRIVER)
        # get all image thumbnail results
        thumbnail_results = WEB_DRIVER.find_elements_by_css_selector('img.Q4LuWd')
        number_results = len(thumbnail_results)
        for img in thumbnail_results[results_start:number_results]:
            # Click each thumbnail so the full-size image element is loaded
            # behind it; clicks on stale/obscured elements are simply skipped.
            try:
                img.click()
                time.sleep(1)
            except Exception:
                continue
            # extract image urls
            actual_images = WEB_DRIVER.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)
            image_count = len(image_urls)
            if image_count >= max_links_to_fetch:
                break
        else:
            # Target not reached in this batch: wait, then press the
            # "Show more results" button if it exists.  find_elements
            # (plural) returns an empty list when the button is absent;
            # the original find_element call raised NoSuchElementException
            # here once all results were exhausted.
            time.sleep(30)
            load_more_buttons = WEB_DRIVER.find_elements_by_css_selector('.mye4qd')
            if load_more_buttons:
                WEB_DRIVER.execute_script('document.querySelector(".mye4qd").click();')
        # move the result startpoint further down
        results_start = len(thumbnail_results)
    return list(image_urls)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment