# to do: error checking/resumption code (can use pagination cursor of post to determine when script paused?)
# clean up repetition
# timing issue: if fetching by latest, someone can make a comment
# that puts a post out of the date limits before the looping
# has a chance to fetch the post
# do a second sweep after hitting the date limit?
# would have to store the script's start time and figure out
# when it halts due to hitting the lower date limit and
# reprocess comments according to that...
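# a possible shape for that second sweep (sketch only, not implemented
# here): record the script's start time, e.g.
#     scriptStart = datetime.datetime.now(tz=datetime.timezone.utc)
# before the main loop; once the loop halts at the lower date limit,
# fetch the feed again and reprocess any post whose lastActivityAt is
# on or after scriptStart, since new comments may have bumped such a
# post past the date limit while the sweep was still running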
import requests, time, pandas, datetime

# URL of the last report, to link back to it in the current report
lastReportURL = "https://discuit.net/DiscuitMeta/post/GBoECayW"
# set fromDate to "" to get all
fromDate = "20240811"
toDate = "20240818"
# summary tables show top X items
topX = 10
# no point calculating stats for bots
ignoredUsers = ["autotldr", "FlagWaverBot", "Betelgeuse"]
# userId 000000000000000000000000 is an admin account for collecting
# deleted accounts?
#ignoredIds = ["000000000000000000000000"]
# initial feed nextPage parameter--to be used in eventual resumption code
nextPage = ""
baseURL = "https://discuit.net"
#baseURL = "http://localhost:8080"
##########################################################
def dateFormat(date):
    return date[:10].replace("-", "")

def serverDateToDT(s):
    serverDateFormat = '%Y-%m-%dT%H:%M:%S%z'
    return datetime.datetime.strptime(s, serverDateFormat)

def daysAgo(dt):
    currDateTime = datetime.datetime.now(tz=datetime.timezone.utc)
    return max(0, (currDateTime - dt).days)

# title field may have special characters that need to be escaped
def cleanTitle(title):
    return title.translate(str.maketrans({
        "|": r"\|", "[": r"\[", "]": r"\]", "(": r"\(", ")": r"\)",
        "_": r"\_", "*": r"\*"}))
def fetchFeed(feedNext, disc = None, sort = "activity"):
    args = {"sort": sort, "next": feedNext}
    if disc:
        args["communityId"] = disc
    response = requests.get(f"{baseURL}/api/posts", args)
    json = response.json()
    return json["posts"], json["next"]

def getFullPost(post):
    return requests.get(
        f"{baseURL}/api/posts/{post['publicId']}").json()

def commentIsValid(comment):
    if comment["deletedAt"]:
        return False
    if comment["editedAt"]:
        commentDate = dateFormat(comment["editedAt"])
    else:
        commentDate = dateFormat(comment["createdAt"])
    if (fromDate != "" and commentDate < fromDate) or \
            commentDate > toDate:
        return False
    username = comment["username"]
    if username in ignoredUsers:
        return False
    return True
def processComments(post, activeUsers):
    fullPost = getFullPost(post)
    # posts from home feed don't seem to contain comments
    comments = fullPost["comments"]
    commentsNext = fullPost["commentsNext"]
    commentCount = 0
    while comments:
        for comment in comments:
            if not commentIsValid(comment):
                continue
            commentCount += 1
            username = comment["username"]
            if username not in activeUsers.index:
                activeUsers.loc[username] = [0] * len(activeUsers.columns)
            activeUsers.loc[username, "Comments"] += 1
        if commentsNext:
            page = requests.get(
                f"{baseURL}/api/posts/{fullPost['publicId']}/comments",
                {"next": commentsNext}).json()
            comments, commentsNext = page["comments"], page["next"]
        else:
            break
    return commentCount
def postIsValid(post):
    reachedTimeLimit = False
    # hardValid is True/False depending on non-date-associated reasons
    # for validity--ignored users, past oldest date to retrieve,
    # deleted post, banned author
    hardValid = True
    # softValid is True/False based solely on the post date--if
    # the post date falls between the from-to configuration;
    # needed because there could be comments in the post in the
    # date range, but the post itself is not in the date range
    softValid = True
    username = post["username"]
    lastActivityAt = dateFormat(post["lastActivityAt"])
    createdAt = dateFormat(post["createdAt"])
    if not ((fromDate <= lastActivityAt <= toDate) or
            (fromDate <= createdAt <= toDate)):
        softValid = False
    if fromDate != "" and lastActivityAt < fromDate:
        reachedTimeLimit = True
        hardValid = False
    elif post["deleted"]:
        hardValid = False
    elif username in ignoredUsers:
        hardValid = False
    return hardValid, reachedTimeLimit, softValid
def processPosts(posts, activeUsers, activeDiscs, topPosts):
    reachedTimeLimit = False
    lastSuccessfulPostDate = ""
    for post in posts:
        # filter out posts that are out of scope
        username = post["username"]
        lastActivityAt = dateFormat(post["lastActivityAt"])
        hardValid, reachedTimeLimit, softValid = postIsValid(post)
        if not hardValid:
            if reachedTimeLimit:
                break
            # skip the post if it is hard-invalid
            continue
        # need to separate invalid due to deletion vs invalid due to time
        # must be a conditionally valid post after above filtering
        title = cleanTitle(post["title"].replace("\n", " "))
        discName = post["communityName"]
        postType = post["type"]  # "text", "image", "link"
        if username not in activeUsers.index:
            activeUsers.loc[username] = [0] * len(activeUsers.columns)
        postType = postType.title() + "s"
        activeUsers.loc[username, postType] += 1
        numComments = processComments(post, activeUsers)
        # if there are no valid comments in the timeframe of interest
        # and the post dates are not in the timeframe, then
        # conclusively skip the post; otherwise count it
        if not numComments and not softValid:
            continue
        if discName not in activeDiscs.index:
            activeDiscs.loc[discName] = [0] * len(activeDiscs.columns)
        activeDiscs.loc[discName, postType] += 1
        activeDiscs.loc[discName, "Comments"] += numComments
        url = f"{baseURL}/{discName}/post/{post['publicId']}"
        if url not in topPosts.index:
            # can init everything to zero: number cells can be incremented
            # and str cells can overwrite original zero
            topPosts.loc[url] = [0] * len(topPosts.columns)
        topPosts.loc[url, "Type"] = postType
        topPosts.loc[url, "Disc"] = discName
        topPosts.loc[url, "Title"] = title
        topPosts.loc[url, "User"] = username
        topPosts.loc[url, "Comments"] = numComments
        lastSuccessfulPostDate = lastActivityAt
    return lastSuccessfulPostDate, reachedTimeLimit
def generateTables(nextPage):
    lastPostDate = ""
    topPosts = pandas.DataFrame({
        "Rank": [], "Type": [], "Disc": [], "Title": [], "User": [],
        "Comments": []})
    activeUsers = pandas.DataFrame({
        "Rank": [], "Texts": [], "Images": [], "Links": [],
        "TotalPosts": [], "Comments": [], "TotalEngagement": []},
        pandas.Index([], name = "User"))
    activeDiscs = pandas.DataFrame({
        "Rank": [], "Texts": [], "Images": [], "Links": [],
        "TotalPosts": [], "Comments": [], "TotalEngagement": []},
        pandas.Index([], name = "Disc"))
    while True:
        print(f"Pagination parameter is: {nextPage}; last processed post date was: {lastPostDate}")
        posts, nextPage = fetchFeed(nextPage)
        lastPostDate, reachedTimeLimit = processPosts(
            posts, activeUsers, activeDiscs, topPosts)
        if nextPage is None or reachedTimeLimit:
            break
        time.sleep(2)
    return activeUsers, activeDiscs, topPosts
def topXReport(activeUsers, activeDiscs, topPosts):
    sumPostComments = topPosts["Comments"].sum()
    numDiscs = len(topPosts["Disc"].unique())
    print(f"\n\nDiscuit week in review: {fromDate}-{toDate}\n")
    print(f"\n[Last week's report is here]({lastReportURL}).")
    print("\nDiscuit API is [documented here](https://docs.discuit.net/getting-started). "
          "Source code of the script generating the tables is "
          "[available here](https://gist.github.com/reallytiredofclowns/b51f63d042a4b5416ceee282ee524295).")
    registeredAccounts = requests.get(
        f"{baseURL}/api/_initial").json()["noUsers"]
    print(f"\nOver the last week, {len(activeUsers)} users discussed {len(topPosts)} posts in "
          f"{sumPostComments} comments over {numDiscs} total discs. "
          f"At the time of this report, there were {registeredAccounts} accounts.\n")
    print("Felix30 has been [charting some of these numbers here](https://docs.google.com/spreadsheets/d/1H7zV_7YIZar9dwDHbutr0Dm9N6H-1mEXe0irIwSHsx0/edit#gid=1256137398).\n")
    postTypes = topPosts["Type"].unique()
    postTypes.sort()
    for postType in postTypes:
        subset = topPosts.query("Type == @postType").drop(columns = "Type").copy()
        if len(subset):
            subset["User"] = subset["User"].str.replace("_", "\\_")
            subset["Rank"] = subset["Comments"].rank(method = "min", ascending = False)
            subset = subset.query("Rank <= @topX")
            subset = subset.sort_values("Rank")
            subset = subset.reset_index()
            subset["Title"] = "[" + subset["Title"] + "](" + subset["index"] + ")"
            subset = subset.drop(columns = "index")
            print(f"# Top {topX} most engaging {postType}:")
            print(subset.to_markdown(index = False))
            print("\n\n")
    activeDiscs["TotalPosts"] = activeDiscs["Texts"] + activeDiscs["Images"] + activeDiscs["Links"]
    activeDiscs["TotalEngagement"] = activeDiscs["TotalPosts"] + activeDiscs["Comments"]
    activeDiscs["Rank"] = activeDiscs["TotalEngagement"].rank(method = "min", ascending = False)
    # reset the index after filling out the calculations, so
    # the reassignment doesn't break the link with the original
    # input dataframe
    activeDiscs = activeDiscs.reset_index()
    subset = activeDiscs.query("Rank <= @topX")
    subset = subset.sort_values("Rank")
    subset["Disc"] = "[" + subset["Disc"] + f"]({baseURL}/" + subset["Disc"] + ")"
    colOrder = ["Rank"] + [_ for _ in subset.columns if _ != "Rank"]
    subset = subset[colOrder]
    print(f"# Top {topX} most engaging Discs:")
    print(subset.to_markdown(index = False))
    print("\n")
    # remove Ghost user from the active users table
    if "ghost" in activeUsers.index:
        activeUsers.drop("ghost", inplace = True)
    activeUsers["TotalPosts"] = activeUsers["Texts"] + activeUsers["Images"] + activeUsers["Links"]
    activeUsers["TotalEngagement"] = activeUsers["TotalPosts"] + activeUsers["Comments"]
    activeUsers["Rank"] = activeUsers["TotalEngagement"].rank(method = "min", ascending = False)
    # reset the index after filling out the calculations, so
    # the reassignment doesn't break the link with the original
    # input dataframe
    activeUsers = activeUsers.reset_index()
    activeUsers["User"] = activeUsers["User"].str.replace("_", "\\_")
    subset = activeUsers.query("Rank <= @topX")
    subset = subset.sort_values("Rank")
    subset["User"] = "[" + subset["User"] + f"]({baseURL}/@" + subset["User"] + ")"
    colOrder = ["Rank"] + [_ for _ in subset.columns if _ != "Rank"]
    subset = subset[colOrder]
    print(f"# Top {topX} most engaged Discuiteers:")
    print(subset.to_markdown(index = False))
def discLatestActivityReport():
    discActivity = pandas.DataFrame(
        {"DaysSinceLastActivity": []}, pandas.Index([], name = "Disc"))
    communityList = requests.get(f"{baseURL}/api/communities").json()
    for comm in communityList:
        # reset pagination for each disc
        nextPage = ""
        daysSinceActivity = None
        commName = comm["name"]
        commId = comm["id"]
        while True:
            print(commName)
            posts, nextPage = fetchFeed(nextPage, disc = commId)
            if posts:
                # from/to date limit not used here, so not using the postIsValid function
                for post in posts:
                    if post["deletedAt"] or post["author"]["isBanned"] or \
                            post["username"] in ignoredUsers:
                        continue
                    daysSinceActivity = daysAgo(serverDateToDT(post["lastActivityAt"]))
                    break
                if nextPage is None or daysSinceActivity is not None:
                    discActivity.loc[commName] = [daysSinceActivity]
                    break
            else:  # empty disc
                discActivity.loc[commName] = [daysSinceActivity]
                break
            time.sleep(3)
    discActivity.loc[discActivity["DaysSinceLastActivity"] <= 1, "ChartCategory"] = "01) 1 day"
    discActivity.loc[discActivity.query("1 < DaysSinceLastActivity <= 2").index, "ChartCategory"] = "02) 2 days"
    discActivity.loc[discActivity.query("2 < DaysSinceLastActivity <= 3").index, "ChartCategory"] = "03) 3 days"
    discActivity.loc[discActivity.query("3 < DaysSinceLastActivity <= 4").index, "ChartCategory"] = "04) 4 days"
    discActivity.loc[discActivity.query("4 < DaysSinceLastActivity <= 5").index, "ChartCategory"] = "05) 5 days"
    discActivity.loc[discActivity.query("5 < DaysSinceLastActivity <= 6").index, "ChartCategory"] = "06) 6 days"
    discActivity.loc[discActivity.query("6 < DaysSinceLastActivity <= 7").index, "ChartCategory"] = "07) 1 week"
    discActivity.loc[discActivity.query("7 < DaysSinceLastActivity <= 14").index, "ChartCategory"] = "08) 2 weeks"
    discActivity.loc[discActivity.query("14 < DaysSinceLastActivity <= 21").index, "ChartCategory"] = "09) 3 weeks"
    discActivity.loc[discActivity.query("21 < DaysSinceLastActivity <= 28").index, "ChartCategory"] = "10) 4 weeks"
    discActivity.loc[28 < discActivity["DaysSinceLastActivity"], "ChartCategory"] = "11) > 4 weeks"
    discActivity.loc[discActivity["DaysSinceLastActivity"].isna(), "ChartCategory"] = "12) No activity"
    return discActivity
def modActivityReport():
    discActivity = pandas.DataFrame(
        {"CreatedDaysAgo": [], "ActivityDaysAgo": [], "ModActivityDaysAgo": []},
        pandas.Index([], name = "Disc"))
    discList = requests.get(f"{baseURL}/api/communities").json()
    for disc in discList:
        time.sleep(3)
        # reset variables for each disc
        discName = disc["name"]
        print("Looping for", discName)
        discId = disc["id"]
        # communities API doesn't appear to return full data, so do a second request
        discData = requests.get(f"{baseURL}/api/communities/{disc['name']}", {"byName": "true"}).json()
        discMods = discData["mods"]
        discLastActivity = None
        modLastActivity = None
        discCreated = daysAgo(serverDateToDT(discData["createdAt"]))
        posts, _ = fetchFeed("", disc = discId)
        if posts:
            post = posts[0]
            discLastActivity = daysAgo(serverDateToDT(post["lastActivityAt"]))
        modActivityList = []
        for mod in discMods:
            response = requests.get(f"{baseURL}/api/users/{mod['username']}/feed", {"limit": 1})
            # possibility of mod being banned, which would return 403 error... or 401?
            if response.status_code in (401, 403):
                continue
            activityItem = response.json()["items"]
            if not activityItem:
                continue
            activityItem = activityItem[0]
            # seems comments have a postId and posts do not?
            if "postId" in activityItem:
                tempList = [activityItem["item"]["createdAt"],  # activityItem["item"]["lastActivityAt"],
                            activityItem["item"]["editedAt"]]
                tempList = [_ for _ in tempList if _ is not None]
                currModActivity = max(tempList)
            else:  # comment
                tempList = [activityItem["item"]["createdAt"], activityItem["item"]["editedAt"]]
                tempList = [_ for _ in tempList if _ is not None]
                currModActivity = max(tempList)
            modActivityList.append(daysAgo(serverDateToDT(currModActivity)))
        if modActivityList:
            modLastActivity = min(modActivityList)
        discActivity.loc[discName] = [discCreated, discLastActivity, modLastActivity]
    return discActivity
######################################################
activeUsers, activeDiscs, topPosts = generateTables(nextPage)
topXReport(activeUsers, activeDiscs, topPosts)
#discActivity = discLatestActivityReport()
#discModReport = modActivityReport()

I'm happy to look over ideas, but I'm not really interested in turning this into a full-fledged repo for collaboration at this time (mostly because I don't want the management aspect of it). If someone wants to take it over, I'm more than happy to step aside--it would give me some time to go and look at other things.
Thanks for the suggestion for the `re` replacement. I think my original impetus for the escaping was (1) newlines in titles screwing up the tables when the text was converted to markdown, and (2) the same for user-supplied tagging in the titles (e.g., marking a geographic region like [USA] in news articles). The existing code seems sufficient for now, but I can keep your suggestion in mind if the escaping needs to be more aggressive in the future.
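For reference, a broader `re`-based escape might look something like the sketch below (hypothetical; `cleanTitleRe` and its exact character class are illustrative, not part of the gist):

```python
import re

# escape every markdown-significant character in one pass, rather than
# maintaining an explicit per-character translation table
def cleanTitleRe(title):
    # \\\1 emits a backslash followed by whichever character matched
    return re.sub(r"([\\`*_{}\[\]()#+\-.!|])", r"\\\1", title.replace("\n", " "))

# cleanTitleRe("Tariffs [USA] | a *big* deal")
# escapes to: Tariffs \[USA\] \| a \*big\* deal
```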
@reallytiredofclowns Gotcha. I was thinking about the escaping after posting and realized you use the markdown feature in pandas (`DataFrame.to_markdown(...)`). I haven't used the markdown package much, but I'm surprised it doesn't handle the things you mentioned.
Another thing that I wanted to explore with this script was speeding it up a bit (like you mentioned). I can leave another comment with an idea if I come up with something, but will also respect your stance of not messing with it too much.
Thanks for the response! I appreciate the insight.
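One direction the speed-up idea could take (purely illustrative, nothing here is settled in the thread): reuse a single `requests.Session` so all of the API calls share one pooled connection instead of reconnecting on every request. A minimal sketch against the same feed endpoint the script already uses:

```python
import requests

# one shared session with connection pooling; fetchFeed and the other
# helpers would take it as a parameter (or close over it) instead of
# calling requests.get directly
session = requests.Session()

def fetchFeedPooled(session, feedNext, disc=None, sort="activity"):
    args = {"sort": sort, "next": feedNext}
    if disc:
        args["communityId"] = disc
    payload = session.get("https://discuit.net/api/posts", params=args).json()
    return payload["posts"], payload["next"]
```

The `time.sleep` calls between pages would still dominate the runtime, so how much this helps depends on how aggressively the server rate-limits.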
Will this script accept any PRs or tips/suggestions/ideas? For example, will the cleanTitle function work similarly if you perform a broader application of the same idea using Python's built-in `re` package?