Last active
May 1, 2020 16:08
-
-
Save CryogenicPlanet/b2dd54a8c946999e9fe497b33ae2037a to your computer and use it in GitHub Desktop.
A client-side Twitter DOM scraper. Full post: https://medium.com/etwas/twitter-client-side-dom-scrapping-6f5a36ce3243
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Functions

/**
 * Builds a plain tweet record from a rendered tweet element.
 *
 * The element's visible text (`innerText`) is split on newlines and read
 * positionally:
 *   - line 0 is taken as the display name, line 1 as the username;
 *   - the last three lines are taken as the reply / retweet / like counters;
 *   - everything between the timestamp line and those counters is the content.
 *
 * The timestamp line is located by matching the text of the first `<time>`
 * element inside the tweet. When no line matches, or the tweet has no
 * `<time>` element at all, index 4 is used as the fallback position and, in
 * the latter case, `time` is left as an empty string instead of throwing.
 *
 * @param {Element} tweetDom - Tweet container; must provide `innerText` and
 *   `getElementsByTagName`.
 * @returns {Promise<{name: string, username: string, time: (string|null),
 *   content: string, interaction: {reply: string, retweets: string,
 *   like: string}}>} The parsed tweet. `time` is the raw `datetime`
 *   attribute of the `<time>` element.
 */
const tweetParser = async function (tweetDom) {
  const DEFAULT_TIME_LINE = 4;
  const COUNTER_LINES = 3;

  const lines = tweetDom.innerText.split(/\n/);
  // First line that belongs to the trailing reply/retweet/like counters.
  const counterStart = lines.length - COUNTER_LINES;

  // Not every tweet container is guaranteed to hold a <time> element, so the
  // lookup result is checked before it is dereferenced.
  const timeElm = tweetDom.getElementsByTagName("time")[0];

  let timeLine = DEFAULT_TIME_LINE;
  let time = "";
  if (timeElm) {
    time = timeElm.getAttribute("datetime");
    // The last line equal to the displayed time marks the end of the header.
    const matchIndex = lines.lastIndexOf(timeElm.innerText);
    if (matchIndex !== -1) {
      timeLine = matchIndex;
    }
  }

  return {
    name: lines[0],
    username: lines[1],
    time,
    content: lines.slice(timeLine + 1, counterStart).join("\n"),
    interaction: {
      reply: lines[counterStart],
      retweets: lines[counterStart + 1],
      like: lines[counterStart + 2],
    },
  };
};
/**
 * Collects every tweet currently present in the document and parses it.
 *
 * Tweet containers are the `<div>` elements whose `data-testid` attribute is
 * exactly "tweet". For each container, every `<a>` inside it is inspected;
 * a link whose `href` contains `/status/<digits>` yields a tweet ID. The
 * first time an ID is seen, the container is parsed with `tweetParser` and
 * stored under that ID.
 *
 * Only the numeric part after `/status/` is used as the key, so links such
 * as `/user/status/123/photo/1` or `/user/status/123?s=20` map to the same
 * entry as `/user/status/123`. Anchors without an `href` are skipped.
 *
 * @returns {Promise<Object<string, Object>>} Map of tweet ID to the record
 *   returned by `tweetParser`.
 */
async function getTweets() {
  const statusIdPattern = /\/status\/(\d+)/;
  const tweetElements = document.querySelectorAll('div[data-testid="tweet"]');
  const parsedTweets = {};

  for (const tweetElement of tweetElements) {
    for (const anchor of tweetElement.getElementsByTagName("a")) {
      const href = anchor.getAttribute("href");
      // getAttribute returns null for anchors that carry no href.
      const match = href === null ? null : href.match(statusIdPattern);
      if (match === null) {
        continue;
      }
      const tweetId = match[1];
      if (tweetId in parsedTweets) {
        continue;
      }
      // Parsed one at a time so insertion order follows document order.
      parsedTweets[tweetId] = await tweetParser(tweetElement);
    }
  }

  return parsedTweets;
}
// ------- End of Functions ------
// Main

/**
 * Entry point: parses the tweets already on the page, then re-parses on
 * every `scroll` event of `window` and merges the results into a running
 * collection keyed by tweet ID.
 *
 * The collection lives only in this function's closure; progress is reported
 * through `console.log`. Failures inside the scroll handler are logged with
 * `console.error` rather than surfacing as unhandled promise rejections.
 *
 * @returns {Promise<void>} Resolves once the initial parse has finished and
 *   the scroll listener has been registered.
 */
const main = async function () {
  let parsedTweetsGlobal = await getTweets();

  window.addEventListener("scroll", async function () {
    try {
      const newParsedTweets = await getTweets();
      console.log(
        "From Scroll Event Listener ",
        Object.keys(newParsedTweets).length
      );

      // Report each tweet ID that was not in the collection before this scan.
      for (const newTweetID of Object.keys(newParsedTweets)) {
        if (!(newTweetID in parsedTweetsGlobal)) {
          console.log("New Distinct Tweet from Scroll Event");
        }
      }

      // Later keys win, so a re-parsed tweet replaces its older record.
      parsedTweetsGlobal = { ...parsedTweetsGlobal, ...newParsedTweets };
      console.log(
        "New Key Length From Scroll Even",
        Object.keys(parsedTweetsGlobal).length
      );
    } catch (err) {
      console.error("Failed to parse tweets on scroll", err);
    }
  });
};

main().catch((err) => {
  console.error("Failed to start tweet scraper", err);
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment