Last active: January 10, 2022 03:48
-
-
Save AndrewPardoe/8701800c118e1945a4ea63a65f7f7acb to your computer and use it in GitHub Desktop.
Scrape flat list of UserVoice Ideas with links from a category's UserVoice pages
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# UserVoice doesn't have a search capability that will filter on category. I only care about my category (C++) in a huge
# Visual Studio database. This script scrapes all UserVoice suggestions in my category with links into an HTML document.
# Improvements welcome from those who actually know Python--this is the first Python script I've ever needed to write.
import re
import urllib.request

import requests
from bs4 import BeautifulSoup
# Whack any Unicode characters when printing to file. Not correct, but not crashing. | |
def safewrite(file, string): | |
try: | |
file.write(string) | |
except UnicodeEncodeError: | |
for char in string: | |
try: | |
file.write(char) | |
except UnicodeEncodeError: | |
file.write("?") | |
# Specifics of my UserVoice page and category | |
prefix = 'https://visualstudio.uservoice.com' | |
firstPage = prefix + '/forums/121579-visual-studio-ide/category/30937-languages-c' | |
nextPage = prefix + '/forums/121579-visual-studio-ide/category/30937-languages-c/filters/top?page={}' | |
# Ideas are split across many pages. Find all page numbers from the Pagination control. | |
soup = BeautifulSoup(requests.get(firstPage).content, "lxml") | |
pagination = soup.find("div", attrs={'class':'uvPagination'}) | |
def pageeq(href): | |
return href and re.compile("page=").search(href) | |
redigits=re.compile('\d+') | |
# Loop through and find the last page number. Could probably write a better regex above. | |
i = 0 | |
for pageref in pagination.find_all(href=pageeq): | |
x = redigits.findall(pageref.string) | |
if x: | |
i = x[0] | |
upper = int(i) + 1 | |
# Create a local HTML page with a list of my category's UserVoice links | |
outfile = open('UserVoice.html', 'w') | |
outfile.write("<html>\n<head>\n<title>UserVoice items from {0}</title>\n</head>\n".format(firstPage)) | |
outfile.write("<body>\n<h2>UserVoice items from {0}</h2>\n<ul>\n".format(firstPage)) | |
# Run through every page, find the links, print the prefix, link, and link title | |
for page in range(1, upper): | |
print ("Processing page {0} of {1}".format(page, upper - 1)) | |
soup = BeautifulSoup(requests.get(nextPage.format(page)).content, "lxml") | |
for header in soup.find_all("h2", class_="uvIdeaTitle"): | |
outfile.write("\t<li><a href=\"{0}{1}\">".format(prefix, header.a.get('href'))) | |
safewrite(outfile, header.a.string) | |
outfile.write("</a/></li>\n") | |
# Close out the HTML page | |
outfile.write("</ul>\n</body>\n</html>\n") | |
outfile.close() |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.