beefy · December 29, 2020 03:56
diff --git a/chess_scrape.py b/chess_scrape.py
 #!/usr/bin/python

 import spynner
 import pyquery
 import urllib
 import os

 # author: Nate Schultz
 # contact: github.com/beefy
 # created: 10/22/16

 # This script downloads all live game PGNs from chess.com and concatenates the resulting files
 # Please note you need your chess.com login credentials in the file path below
 # Please note this may be broken in the future due to chess.com UI changes

 # Path to a two-line text file holding the chess.com credentials:
 #   line 1: username
 #   line 2: password
 LOGIN_CRED_FILE_PATH = './chesslogin.txt'

 # Directory that receives the downloaded PGN files.
 # It is created when this module is loaded if it does not exist yet.
 OUTPUT_PATH = './my_chess_games'

 # Attempt the creation first and inspect the failure afterwards: this leaves
 # no window between an existence check and the creation, and a non-directory
 # sitting on the path is reported here instead of failing later on write.
 try:
 	os.makedirs(OUTPUT_PATH)
 except OSError:
 	if not os.path.isdir(OUTPUT_PATH):
 		raise

 # Archive URLs: the old-site archive, the old-site archive filtered to live
 # games, and the new-site archive.
 url_old_game_archive = "https://www.chess.com/home/my_archive"
 url_old_game_archive_all_live = "https://www.chess.com/home/my_archive?sortby=&show=live&color=all&result=all"
 url_new_game_archive = "https://www.chess.com/games/archive"

 def next_page(page_num):
 	"""Return the live-game archive URL with "&page=<page_num>" appended.

 	page_num -- page number; converted with str() before concatenation
 	"""
 	return url_old_game_archive_all_live + "&page=" + str(page_num)

 # initialize browser
 def init_browser():
 	"""Create a spynner browser, load the chess.com login page and return the browser."""
 	browser = spynner.Browser(debug_level=spynner.INFO)
 	browser.load("https://www.chess.com/login")
 	# NOTE(review): presumably makes spynner inject its bundled jQuery -- confirm against spynner docs
 	browser.load_jquery(True)
 	return browser

 # login from new chess.com
 def login_new(b):
 	"""Log in through the new chess.com login form.

 	Clicks the "switch?request_uri=%2Flogin" link, reads the credentials from
 	LOGIN_CRED_FILE_PATH (line 1: username, line 2: password), fills the
 	username and password inputs with them and clicks the login button.

 	b -- browser object, e.g. the one returned by init_browser()

 	Raises IndexError when the credentials file has fewer than two lines.
 	"""
 	b.click_link('a[href="//www.chess.com/switch?request_uri=%2Flogin"]')
 	# the with-block closes the credentials file as soon as it has been read
 	with open(LOGIN_CRED_FILE_PATH,'rb') as cred_file:
 		credentials = [line.strip() for line in cred_file]
 	b.wk_fill('input[id="username"]',credentials[0])
 	b.wk_fill('input[id="password"]',credentials[1])
 	b.click_link('button[id="login"]')

 # login from old chess.com
 def login_old(b):
 	"""Log in through the old chess.com login form.

 	Reads the credentials from LOGIN_CRED_FILE_PATH (line 1: username,
 	line 2: password), fills the "c1" and "loginpassword" inputs with them
 	and clicks the "btnLogin" button.

 	b -- browser object, e.g. the one returned by init_browser()

 	Raises IndexError when the credentials file has fewer than two lines.
 	"""
 	# the with-block closes the credentials file as soon as it has been read
 	with open(LOGIN_CRED_FILE_PATH,'rb') as cred_file:
 		credentials = [line.strip() for line in cred_file]
 	b.wk_fill('input[name="c1"]',credentials[0])
 	b.wk_fill('input[name="loginpassword"]',credentials[1])
 	b.click_link('button[name="btnLogin"]')

 # load ajax
 def load_ajax(b):
 	"""Append a jQuery 1.9.1 <script> tag to the page's <head>, then call b.wait(5).

 	b -- browser object exposing runjs() and wait()
 	"""
 	# NOTE(review): the script URL is plain http -- confirm it still loads from https pages
 	inject_jquery_js = (
 		'var script = document.createElement("script");'
 		'script.src = "http://ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js";'
 		'document.getElementsByTagName("head")[0].appendChild(script);'
 	)
 	b.runjs(inject_jquery_js)
 	# give the injected script time to load before the caller continues
 	b.wait(5)

 # "deprecated"
 def check_checkboxes(b):
 	"""Click the first ten game checkboxes, then the download-all control.

 	Left over from a first attempt at the usual "tick checkboxes + download"
 	flow on the new chess.com; the download button proved too difficult, so
 	the old chess.com is used instead. Only ten checkboxes are handled, as
 	a test.

 	b -- browser object exposing runjs() and click()
 	"""
 	# tag every matching checkbox with a sequential id: game-checkbox0, game-checkbox1, ...
 	tag_checkboxes_js = "i = 0; $('input[game-checkbox]').each( function() { $(this).attr('id','game-checkbox'+i); i++; });"
 	b.runjs(tag_checkboxes_js)

 	# click the ten checkboxes tagged above, in id order
 	for box_id in ['game-checkbox%d' % n for n in range(10)]:
 		b.click('input[id="%s"]' % box_id)

 	b.click('.pull-right.download-all')

 def _merge_and_remove(extension, merged_path, delimiter):
 	"""Join all files in OUTPUT_PATH ending in `extension` into `merged_path`, then delete them."""
 	# sorted() only makes the merge order repeatable (lexicographic by file name)
 	part_paths = [os.path.join(OUTPUT_PATH, name) for name in sorted(os.listdir(OUTPUT_PATH)) if name.endswith(extension)]
 	parts = []
 	for part_path in part_paths:
 		# each input is closed right after reading so it can be removed below
 		with open(part_path,'rb') as part_file:
 			parts.append(part_file.read())
 	with open(merged_path,'w') as merged_file:
 		merged_file.write(delimiter.join(parts))
 	for part_path in part_paths:
 		os.remove(part_path) # delete merged files

 def scrape(b):
 	"""Download the PGN of each game in the live archive and merge them into one file.

 	On every archive page the 'view' links are tagged with sequential ids,
 	the first links_per_page links are opened one by one, and the href of the
 	a[class="bpgn"] anchor on each game page is downloaded into OUTPUT_PATH.
 	The per-game files of a page are merged into one '.chesspage' file.
 	When loading the next page raises, the page files are merged into
 	OUTPUT_PATH/PGNs.txt and the function returns.

 	b -- browser object, expected to be logged in and showing the first
 	     page of the live-game archive
 	"""

 	links_per_page = 50
 	chess_game_extension = '.chessgame'
 	chess_page_extension = '.chesspage'
 	game_delimiter = '\nwww.thenateschultz.com\ngithub.com/beefy\n'
 	# gives every 'view' link a sequential id (game-view0, game-view1, ...) so it can be clicked by id
 	index_links_js = "i = 0; jQuery('.games.right-4').each( function() { jQuery(this).attr('id','game-view'+i); i++; });"

 	# page iteration
 	page_i = 1
 	page_url = url_old_game_archive_all_live
 	while True:

 		# index view links
 		b.runjs(index_links_js)

 		# game iteration (same # every page)
 		# NOTE(review): a page holding fewer than links_per_page games is not handled here
 		for i in range(links_per_page):
 			b.click_link('#game-view'+str(i)) # click 'view' link
 			# download PGN
 			d = pyquery.PyQuery(b.html)
 			raw_href = d('a[class="bpgn"]').attr("href") # get download link
 			href = urllib.unquote(raw_href)
 			game_path = OUTPUT_PATH+'/PGN_'+str(i)+chess_game_extension
 			# close the file before it is merged so its content is flushed to disk
 			with open(game_path,'w') as game_file:
 				b.download(href, game_file) # write PGN to file
 			# go back to the page being scraped, not always to the first page
 			b.load(page_url)

 			# re-index view links
 			b.runjs(index_links_js)

 		# merge PGNs of this page
 		_merge_and_remove(chess_game_extension, OUTPUT_PATH+'/page'+str(page_i)+chess_page_extension, game_delimiter)

 		# redirect to next page!
 		page_i += 1
 		page_url = next_page(page_i)
 		try:
 			b.load(page_url)
 		except Exception:
 			# I guess there's no more pages
 			# merge pages
 			_merge_and_remove(chess_page_extension, OUTPUT_PATH+'/PGNs.txt', game_delimiter)
 			return

 if __name__ == "__main__":
 	# log in on the old site, open the live-game archive, inject jQuery, scrape
 	browser = init_browser()
 	login_old(browser)
 	browser.load(url_old_game_archive_all_live) # redirect to game archive
 	load_ajax(browser)
 	scrape(browser)
 	# browser.browse() # activate GUI
	#!/usr/bin/python

	import spynner
	import pyquery
	import urllib
	import os

	# author: Nate Schultz
	# contact: github.com/beefy
	# created: 10/22/16

	# This script downloads all live game PGNs from chess.com and concatenates the resulting files
	# Please note you need your chess.com login credentials in the file path below
	# Please note this may be broken in the future due to chess.com UI changes

	# Path to a two-line text file holding the chess.com credentials:
	#   line 1: username
	#   line 2: password
	LOGIN_CRED_FILE_PATH = './chesslogin.txt'

	# Directory that receives the downloaded PGN files.
	# It is created when this module is loaded if it does not exist yet.
	OUTPUT_PATH = './my_chess_games'

	# Attempt the creation first and inspect the failure afterwards: this leaves
	# no window between an existence check and the creation, and a non-directory
	# sitting on the path is reported here instead of failing later on write.
	try:
		os.makedirs(OUTPUT_PATH)
	except OSError:
		if not os.path.isdir(OUTPUT_PATH):
			raise

	# Archive URLs: the old-site archive, the old-site archive filtered to live
	# games, and the new-site archive.
	url_old_game_archive = "https://www.chess.com/home/my_archive"
	url_old_game_archive_all_live = "https://www.chess.com/home/my_archive?sortby=&show=live&color=all&result=all"
	url_new_game_archive = "https://www.chess.com/games/archive"

	def next_page(page_num):
		"""Return the live-game archive URL with "&page=<page_num>" appended.

		page_num -- page number; converted with str() before concatenation
		"""
		return url_old_game_archive_all_live + "&page=" + str(page_num)

	# initialize browser
	def init_browser():
		"""Create a spynner browser, load the chess.com login page and return the browser."""
		b = spynner.Browser(debug_level=spynner.INFO)
		b.load("https://www.chess.com/login")
		# NOTE(review): presumably makes spynner inject its bundled jQuery -- confirm against spynner docs
		b.load_jquery(True)
		return b

	# login from new chess.com
	def login_new(b):
		"""Log in through the new chess.com login form.

		Clicks the "switch?request_uri=%2Flogin" link, reads the credentials from
		LOGIN_CRED_FILE_PATH (line 1: username, line 2: password), fills the
		username and password inputs with them and clicks the login button.

		b -- browser object, e.g. the one returned by init_browser()

		Raises IndexError when the credentials file has fewer than two lines.
		"""
		b.click_link('a[href="//www.chess.com/switch?request_uri=%2Flogin"]')
		# the with-block closes the credentials file as soon as it has been read
		with open(LOGIN_CRED_FILE_PATH,'rb') as cred_file:
			credentials = [line.strip() for line in cred_file]
		b.wk_fill('input[id="username"]',credentials[0])
		b.wk_fill('input[id="password"]',credentials[1])
		b.click_link('button[id="login"]')

	# login from old chess.com
	def login_old(b):
		"""Log in through the old chess.com login form.

		Reads the credentials from LOGIN_CRED_FILE_PATH (line 1: username,
		line 2: password), fills the "c1" and "loginpassword" inputs with them
		and clicks the "btnLogin" button.

		b -- browser object, e.g. the one returned by init_browser()

		Raises IndexError when the credentials file has fewer than two lines.
		"""
		# the with-block closes the credentials file as soon as it has been read
		with open(LOGIN_CRED_FILE_PATH,'rb') as cred_file:
			credentials = [line.strip() for line in cred_file]
		b.wk_fill('input[name="c1"]',credentials[0])
		b.wk_fill('input[name="loginpassword"]',credentials[1])
		b.click_link('button[name="btnLogin"]')

	# load ajax
	def load_ajax(b):
		"""Append a jQuery 1.9.1 <script> tag to the page's <head>, then call b.wait(5).

		b -- browser object exposing runjs() and wait()
		"""
		# NOTE(review): the script URL is plain http -- confirm it still loads from https pages
		js_str = 'var script = document.createElement("script");script.src = "http://ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js";document.getElementsByTagName("head")[0].appendChild(script);'
		b.runjs(js_str)
		b.wait(5) # let the injected script finish loading

	# "deprecated"
	def check_checkboxes(b):
		"""Click the first ten game checkboxes, then the download-all control.

		Left over from a first attempt at the usual "tick checkboxes + download"
		flow on the new chess.com; the download button proved too difficult, so
		the old chess.com is used instead. Only ten checkboxes are handled, as
		a test.

		b -- browser object exposing runjs() and click()
		"""
		# index checkboxes: tag each one with a sequential id (game-checkbox0, game-checkbox1, ...)
		js_str = "i = 0; $('input[game-checkbox]').each( function() { $(this).attr('id','game-checkbox'+i); i++; });"
		b.runjs(js_str)

		# iterate checkboxes
		for i in range(10):
			b.click('input[id="game-checkbox'+str(i)+'"]')

		b.click('.pull-right.download-all')

	def _merge_and_remove(extension, merged_path, delimiter):
		"""Join all files in OUTPUT_PATH ending in `extension` into `merged_path`, then delete them."""
		# sorted() only makes the merge order repeatable (lexicographic by file name)
		part_paths = [os.path.join(OUTPUT_PATH, name) for name in sorted(os.listdir(OUTPUT_PATH)) if name.endswith(extension)]
		parts = []
		for part_path in part_paths:
			# each input is closed right after reading so it can be removed below
			with open(part_path,'rb') as part_file:
				parts.append(part_file.read())
		with open(merged_path,'w') as merged_file:
			merged_file.write(delimiter.join(parts))
		for part_path in part_paths:
			os.remove(part_path) # delete merged files

	def scrape(b):
		"""Download the PGN of each game in the live archive and merge them into one file.

		On every archive page the 'view' links are tagged with sequential ids,
		the first links_per_page links are opened one by one, and the href of the
		a[class="bpgn"] anchor on each game page is downloaded into OUTPUT_PATH.
		The per-game files of a page are merged into one '.chesspage' file.
		When loading the next page raises, the page files are merged into
		OUTPUT_PATH/PGNs.txt and the function returns.

		b -- browser object, expected to be logged in and showing the first
		     page of the live-game archive
		"""

		links_per_page = 50
		chess_game_extension = '.chessgame'
		chess_page_extension = '.chesspage'
		game_delimiter = '\nwww.thenateschultz.com\ngithub.com/beefy\n'
		# gives every 'view' link a sequential id (game-view0, game-view1, ...) so it can be clicked by id
		index_links_js = "i = 0; jQuery('.games.right-4').each( function() { jQuery(this).attr('id','game-view'+i); i++; });"

		# page iteration
		page_i = 1
		page_url = url_old_game_archive_all_live
		while True:

			# index view links
			b.runjs(index_links_js)

			# game iteration (same # every page)
			# NOTE(review): a page holding fewer than links_per_page games is not handled here
			for i in range(links_per_page):
				b.click_link('#game-view'+str(i)) # click 'view' link
				# download PGN
				d = pyquery.PyQuery(b.html)
				raw_href = d('a[class="bpgn"]').attr("href") # get download link
				href = urllib.unquote(raw_href)
				game_path = OUTPUT_PATH+'/PGN_'+str(i)+chess_game_extension
				# close the file before it is merged so its content is flushed to disk
				with open(game_path,'w') as game_file:
					b.download(href, game_file) # write PGN to file
				# go back to the page being scraped, not always to the first page
				b.load(page_url)

				# re-index view links
				b.runjs(index_links_js)

			# merge PGNs of this page
			_merge_and_remove(chess_game_extension, OUTPUT_PATH+'/page'+str(page_i)+chess_page_extension, game_delimiter)

			# redirect to next page!
			page_i += 1
			page_url = next_page(page_i)
			try:
				b.load(page_url)
			except Exception:
				# I guess there's no more pages
				# merge pages
				_merge_and_remove(chess_page_extension, OUTPUT_PATH+'/PGNs.txt', game_delimiter)
				return

	if __name__ == "__main__":
		# log in on the old site, open the live-game archive, inject jQuery, scrape
		b = init_browser()
		login_old(b)
		b.load(url_old_game_archive_all_live) # redirect to game archive
		load_ajax(b)
		scrape(b)
		# b.browse() # activate GUI