trevery · May 19, 2017 02:14
diff --git a/scrap_whole_site_urls.py b/scrap_whole_site_urls.py
 from urllib.request import urlopen
 from bs4 import BeautifulSoup
 import re

 # Every /wiki/ href discovered so far; shared by all getLinks() calls so
 # that a page is never crawled twice.
 pages = set()


 def getLinks(pageUrl):
     """Crawl English Wikipedia depth-first, printing each new /wiki/ link.

     Args:
         pageUrl: Path appended to "http://en.wikipedia.org" to form the
             first page to fetch ("" fetches the site root).

     Every href starting with "/wiki/" that is not yet in the module-level
     ``pages`` set is printed, recorded there, and then crawled in turn
     before the remaining links of the current page are examined.
     Exceptions raised while fetching a page propagate to the caller.
     """
     wikiLink = re.compile("^(/wiki/)")

     def fetchLinks(path):
         # Close the HTTP response as soon as the document has been parsed.
         with urlopen("http://en.wikipedia.org" + path) as html:
             # Name the parser explicitly so the result does not depend on
             # which optional parsers happen to be installed.
             bsObj = BeautifulSoup(html, "html.parser")
         return iter(bsObj.findAll("a", href=wikiLink))

     # An explicit stack of per-page link iterators visits pages in the same
     # order as recursing on every new link would, without running into the
     # interpreter's recursion limit on long chains of links.
     pending = [fetchLinks(pageUrl)]
     while pending:
         link = next(pending[-1], None)
         if link is None:
             # Current page exhausted; resume its parent page.
             pending.pop()
             continue
         if 'href' not in link.attrs:
             continue
         newPage = link.attrs['href']
         if newPage in pages:
             continue
         # We have encountered a new page.
         print(newPage)
         pages.add(newPage)
         pending.append(fetchLinks(newPage))


 getLinks("")
	from urllib.request import urlopen
	from bs4 import BeautifulSoup
	import re

	# Every /wiki/ href discovered so far; shared by all getLinks() calls so
	# that a page is never crawled twice.
	pages = set()


	def getLinks(pageUrl):
	    """Crawl English Wikipedia depth-first, printing each new /wiki/ link.

	    Args:
	        pageUrl: Path appended to "http://en.wikipedia.org" to form the
	            first page to fetch ("" fetches the site root).

	    Every href starting with "/wiki/" that is not yet in the module-level
	    ``pages`` set is printed, recorded there, and then crawled in turn
	    before the remaining links of the current page are examined.
	    Exceptions raised while fetching a page propagate to the caller.
	    """
	    wikiLink = re.compile("^(/wiki/)")

	    def fetchLinks(path):
	        # Close the HTTP response as soon as the document has been parsed.
	        with urlopen("http://en.wikipedia.org" + path) as html:
	            # Name the parser explicitly so the result does not depend on
	            # which optional parsers happen to be installed.
	            bsObj = BeautifulSoup(html, "html.parser")
	        return iter(bsObj.findAll("a", href=wikiLink))

	    # An explicit stack of per-page link iterators visits pages in the same
	    # order as recursing on every new link would, without running into the
	    # interpreter's recursion limit on long chains of links.
	    pending = [fetchLinks(pageUrl)]
	    while pending:
	        link = next(pending[-1], None)
	        if link is None:
	            # Current page exhausted; resume its parent page.
	            pending.pop()
	            continue
	        if 'href' not in link.attrs:
	            continue
	        newPage = link.attrs['href']
	        if newPage in pages:
	            continue
	        # We have encountered a new page.
	        print(newPage)
	        pages.add(newPage)
	        pending.append(fetchLinks(newPage))


	getLinks("")
No results found