Created
February 20, 2021 21:22
-
-
Save jilmun/b2abccdcbfe124da13b7189c2a447005 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from selenium import webdriver | |
from selenium.webdriver.common.keys import Keys | |
import pandas as pd | |
import numpy as np | |
import time | |
from itertools import groupby | |
def scroll(driver, timeout, pause=1.0):
    """Scroll to the bottom of the page until its height stops growing.

    Fixes the original infinite loop: the `break` there only exited the
    inner `for`, so the outer `while True` restarted it forever once the
    height stabilized.

    Args:
        driver: selenium WebDriver (or any object exposing execute_script).
        timeout: maximum number of scroll-to-bottom attempts.
        pause: seconds to wait after each scroll for content to load
            (default 1.0, matching the original hard-coded sleep).
    """
    # get initial scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(timeout):
        # scroll down
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # wait for page to load
        time.sleep(pause)
        # get new scroll height and compare to last height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # heights are the same: no more content is loading, stop
            break
        last_height = new_height
def scrapeleaders(leaderboard_url, leaderboard_name):
    """Scrape one AICrowd leaderboard page into a pandas DataFrame.

    Uses the module-level selenium `driver`. Two page layouts exist:
      * 'submission' -> table #submissions-div with columns
        [Submission, Individual, Metric, Date]
      * anything else ('rmse' or a weekly profit board) -> table
        #leaderboards-div with columns [Rank, Participant, Metric, Submission]

    Args:
        leaderboard_url: full URL of the leaderboard page.
        leaderboard_name: 'submission', 'rmse', or a week label ('week01', ...);
            also stored in the resulting 'Leaderboard' column.

    Returns:
        pd.DataFrame with a fixed column set depending on the page type.
    """
    driver.get(leaderboard_url)
    scroll(driver, 10)  # increase this when ready to run
    print("... Done scrolling!")
    time.sleep(3)

    def texts(xpath):
        # Visible text of every element matched by the XPath.
        return [el.text for el in driver.find_elements_by_xpath(xpath)]

    if leaderboard_name == 'submission':
        base = '//*[@id="submissions-div"]/tr/td'
        submissions = texts(base + '[1]')
        individuals = texts(base + '[2]')
        metrics = texts(base + '[4]')
        dates = texts(base + '[7]')
        table_name = [leaderboard_name] * len(individuals)
        df = pd.DataFrame(
            list(zip(table_name, submissions, individuals, metrics, dates)),
            columns=['Leaderboard', 'Submission', 'Individual', 'Metric', 'Date'])
    else:  # url is for weekly profit or RMSE leaderboards
        base = '//*[@id="leaderboards-div"]/tr/td'
        ranking = texts(base + '[2]')
        participants = texts(base + '[3]')
        # The RMSE board has one fewer column than the weekly profit boards,
        # shifting the metric and submission-link columns by one.
        if leaderboard_name == 'rmse':
            metric_col, sub_col = 4, 8
        else:
            metric_col, sub_col = 5, 9
        metrics = texts(base + '[%d]' % metric_col)
        links = driver.find_elements_by_xpath(base + '[%d]/a' % sub_col)
        # Last 6 characters of the href are the numeric Submission ID.
        submissions = [a.get_attribute('href')[-6:] for a in links]
        table_name = [leaderboard_name] * len(participants)
        df = pd.DataFrame(
            list(zip(table_name, ranking, participants, metrics, submissions)),
            columns=['Leaderboard', 'Rank', 'Participant', 'Metric', 'Submission'])
    return df
# --- script entry: scrape all leaderboards and export a merged CSV ----------

# Locate chromedriver next to this script. os.path.join replaces the original
# `dir + "\chromedriver.exe"`, which (a) shadowed the builtin `dir` and
# (b) relied on the invalid "\c" escape happening to survive as a literal.
script_dir = os.path.dirname(__file__)
chrome_driver_path = os.path.join(script_dir, "chromedriver.exe")

# create a new Chrome session (module-level: scrapeleaders() reads `driver`)
driver = webdriver.Chrome(executable_path=chrome_driver_path)
driver.implicitly_wait(30)
# driver.maximize_window()

try:
    df_submissions = scrapeleaders("https://www.aicrowd.com/challenges/insurance-pricing-game/submissions", "submission")
    # df_rmse = scrapeleaders("https://www.aicrowd.com/challenges/insurance-pricing-game/leaderboards?challenge_leaderboard_extra_id=467&challenge_round_id=625", "rmse")

    # One leaderboard per weekly round; week07 is the current/default board.
    week_urls = {
        "week01": "https://www.aicrowd.com/challenges/insurance-pricing-game/leaderboards?challenge_leaderboard_extra_id=489&challenge_round_id=625",
        "week02": "https://www.aicrowd.com/challenges/insurance-pricing-game/leaderboards?challenge_leaderboard_extra_id=504&challenge_round_id=625",
        "week03": "https://www.aicrowd.com/challenges/insurance-pricing-game/leaderboards?challenge_leaderboard_extra_id=521&challenge_round_id=625",
        "week04": "https://www.aicrowd.com/challenges/insurance-pricing-game/leaderboards?challenge_leaderboard_extra_id=430&challenge_round_id=625",
        "week05": "https://www.aicrowd.com/challenges/insurance-pricing-game/leaderboards?challenge_leaderboard_extra_id=522&challenge_round_id=625",
        "week06": "https://www.aicrowd.com/challenges/insurance-pricing-game/leaderboards?challenge_leaderboard_extra_id=524&challenge_round_id=625",
        "week07": "https://www.aicrowd.com/challenges/insurance-pricing-game/leaderboards?challenge_round_id=625",
    }
    weekly_frames = [scrapeleaders(url, name) for name, url in week_urls.items()]

    print(len(df_submissions), *[len(frame) for frame in weekly_frames])

    # pd.concat replaces the deprecated DataFrame.append (removed in pandas 2.0);
    # default ignore_index=False matches the original append() behavior.
    df_weeks = pd.concat(weekly_frames)

    # Attach each weekly entry's submission details (RMSE metric, dates).
    df_leaders = pd.merge(df_weeks,
                          df_submissions.drop(columns=['Leaderboard']),
                          on='Submission', how='left')
    print(len(df_leaders))
    df_leaders = df_leaders.rename(columns=
        {"Metric_x": "Profit",   # metric from the weekly (profit) boards
         "Metric_y": "RMSE",     # metric from the submissions table
         "Rank": "Profit_Rank"})

    # clean up dates
    # assumes Date renders as "Weekday, D Mon YYYY\nHH:MM" — TODO confirm on page
    df_leaders[['Wkday_DMY', 'Submit_Time']] = df_leaders.Date.str.split("\n", expand=True)
    df_leaders[['Submit_Weekday', 'Submit_Date']] = df_leaders.Wkday_DMY.str.split(",", expand=True)
    df_leaders = df_leaders.drop(columns=['Date', 'Wkday_DMY'])
    df_leaders['Submit_Date'] = pd.to_datetime(df_leaders['Submit_Date']).dt.date

    # create RMSE rank column (per leaderboard; lower RMSE -> rank 1)
    df_leaders["RMSE"] = pd.to_numeric(df_leaders["RMSE"])
    df_leaders["RMSE_Rank"] = df_leaders.groupby("Leaderboard")["RMSE"].rank("average", ascending=True)

    # rearrange columns
    df_leaders = df_leaders[['Leaderboard', 'Participant',
                             'Individual', 'Submission',
                             'RMSE', 'RMSE_Rank', 'Profit', 'Profit_Rank',
                             'Submit_Date', 'Submit_Weekday', 'Submit_Time']]
    print(df_leaders.head())
    df_leaders.to_csv('leaderboard_20210207.csv', index=False)
finally:
    # close the browser window even if a scrape step raised
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment