marcossilva · August 9, 2021 02:30
diff --git a/icml_scrapper.py b/icml_scrapper.py
 # Download the ICML 2021 poster schedule page with the requests library.
 import requests
 # A timeout keeps the script from waiting forever if the server never answers.
 r = requests.get("https://icml.cc/Conferences/2021/Schedule?type=Poster", timeout=30)
 # Stop on an HTTP error status instead of parsing an error page as if it were the schedule.
 r.raise_for_status()

 # Parse the returned HTML with BeautifulSoup.
 from bs4 import BeautifulSoup
 soup = BeautifulSoup(r.text, 'html.parser')

 # select_one picks the first element with class 'col-xs-12'; select then collects
 # every div inside it that carries an onClick attribute.
 divs = soup.select_one(".col-xs-12").select('div[onClick]')

 # This is a helper function that processes each div and extracts the desired info
 def process_div(d):
     """Build one result row from a single schedule card div.

     The title is read from the nested ``div.maincardBody`` element, the
     authors from ``div.maincardFooter``, and the event URL is derived from
     the div's ``onclick`` attribute.

     Returns a pandas Series with the entries ``title``, ``authors`` and ``url``.
     """
     event_base = "https://icml.cc/Conferences/2021/Schedule?showEvent="

     card_title = d.select_one('div.maincardBody').text
     card_authors = d.select_one('div.maincardFooter').text
     # Drop the first 11 characters and the final one of the onclick value;
     # presumably a `showDetail(` prefix and the closing `)` -- verify against the page.
     event_id = d.attrs.get('onclick')[11:-1]

     return pd.Series(
         [card_title, card_authors, event_base + event_id],
         index=['title', 'authors', 'url'],
     )

 # pandas is used to assemble the rows into a DataFrame and save it as a local CSV.
 import pandas as pd
 # The first two matched divs are skipped (presumably they are not poster entries -- TODO confirm).
 dfs = [process_div(d) for d in divs[2:]]

 # One row per processed div. Naming the columns explicitly keeps this step working
 # when no divs were found, a case where pd.concat would raise on the empty list.
 df = pd.DataFrame(dfs, columns=['title', 'authors', 'url'])

 # The papers' PDFs are not available on the ICML website, so a ready-made Google
 # search URL is also built for each title to make the paper easier to find.
 import urllib.parse
 df['search_url'] = df.title.apply(lambda t : "https://www.google.com/search?q="+urllib.parse.quote_plus(t))
 df.to_csv('icml_2021.csv')
	# Download the ICML 2021 poster schedule page with the requests library.
	import requests
	# A timeout keeps the script from waiting forever if the server never answers.
	r = requests.get("https://icml.cc/Conferences/2021/Schedule?type=Poster", timeout=30)
	# Stop on an HTTP error status instead of parsing an error page as if it were the schedule.
	r.raise_for_status()

	# Parse the returned HTML with BeautifulSoup.
	from bs4 import BeautifulSoup
	soup = BeautifulSoup(r.text, 'html.parser')

	# select_one picks the first element with class 'col-xs-12'; select then collects
	# every div inside it that carries an onClick attribute.
	divs = soup.select_one(".col-xs-12").select('div[onClick]')

	# This is a helper function that processes each div and extracts the desired info
	def process_div(d):
	    """Extract the title, authors and event URL from one schedule card div.

	    Returns a pandas Series with the entries ``title``, ``authors`` and ``url``.
	    """
	    # The title and authors are the text of the nested divs with the respective classes.
	    title = d.select_one('div.maincardBody').text
	    authors = d.select_one('div.maincardFooter').text
	    # The URL is built from the onclick attribute with its first 11 characters and
	    # last character removed; presumably a `showDetail(` prefix and the closing `)`
	    # -- verify against the page.
	    url = "https://icml.cc/Conferences/2021/Schedule?showEvent=" + d.attrs.get('onclick')[11:-1]

	    return pd.Series({'title' : title, 'authors' : authors, 'url' : url})

	# pandas is used to assemble the rows into a DataFrame and save it as a local CSV.
	import pandas as pd
	# The first two matched divs are skipped (presumably they are not poster entries -- TODO confirm).
	dfs = [process_div(d) for d in divs[2:]]

	# One row per processed div. Naming the columns explicitly keeps this step working
	# when no divs were found, a case where pd.concat would raise on the empty list.
	df = pd.DataFrame(dfs, columns=['title', 'authors', 'url'])

	# The papers' PDFs are not available on the ICML website, so a ready-made Google
	# search URL is also built for each title to make the paper easier to find.
	import urllib.parse
	df['search_url'] = df.title.apply(lambda t : "https://www.google.com/search?q="+urllib.parse.quote_plus(t))
	df.to_csv('icml_2021.csv')
No results found