Created
April 11, 2019 08:26
-
-
Save divamgupta/a3693858dff15351c6b6a361d7000d08 to your computer and use it in GitHub Desktop.
Fetch tweets from Twitter via PhantomJS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Author: Divam Gupta
 * A small script that fetches the tweets found at a given URL (a handle page or a search page).
 */
// PhantomJS built-in modules: a page object to load the site in, file-system
// access for the output file, and `system` for the command-line arguments.
var page = require('webpage').create();
var fs = require('fs');
var system = require('system');

// Identify as a desktop Chrome browser when requesting pages.
var userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36";
page.settings.userAgent = userAgent;
// Only text is extracted from the pages, so image loading is switched off.
page.settings.loadImages = false;

// Accumulates one { text, time } record per scraped tweet across all pages.
// Declared with `var` so it is no longer an implicit global.
var allTweets = [];

// Expected invocation: <script> <url> <output file>.
var args = system.args;
if (args.length !== 3)
{
    console.log( args[0] + ' "https://mobile.twitter.com/search?q=search" tweets.json');
    // Non-zero status so a calling shell can tell the invocation was wrong.
    phantom.exit(1);
}

// The fallbacks cover arguments that were passed as empty strings.
var url = args[1] || 'https://mobile.twitter.com/search?q=mein+hoon';
var path = args[2] || 'tweets.json';
/**
 * Write every tweet collected so far to the output file as a JSON array.
 *
 * Reads the file-level `allTweets` and `path` and overwrites the file
 * (mode 'w') on each call. Any error raised while serialising or writing is
 * logged to the console and not rethrown.
 */
function save()
{
    try
    {
        var content = JSON.stringify(allTweets);
        fs.write(path, content, 'w');
    }
    catch (err)
    {
        console.log(err);
    }
}
/**
 * Load one page of tweets, append what it contains to `allTweets`, and keep
 * following the page's "more" link until there is none left.
 *
 * `save()` is called after every page, so the output file always reflects the
 * tweets gathered so far. PhantomJS exits once the last page is processed, or
 * with status 1 if a page fails to load.
 *
 * @param {string} link  Absolute URL of the page to scrape.
 */
function scrapeMore(link)
{
    page.open(link, function(status) {
        // A failed load has nothing to scrape and must not be mistaken for the
        // end of the results: keep what was collected and exit with an error.
        if (status !== 'success')
        {
            console.log("failed to load " + link + " (" + status + ")");
            save();
            phantom.exit(1);
            return;
        }

        // The site answered with its throttling message: wait five seconds
        // and request the same link again.
        if (page.content.indexOf("You've made a few too many attempts. Please try again later") !== -1)
        {
            console.log("too many attempts");
            setTimeout(function(){
                scrapeMore(link);
            }, 5000);
            return;
        }

        // Tweet bodies and timestamps are read as two parallel lists and
        // paired up by position below. Fall back to an empty list if the
        // in-page evaluation yields nothing.
        var tweet_texts = page.evaluate(function() {
            return [].map.call(document.querySelectorAll('div.tweet-text div.dir-ltr'), function(p) {
                return p.textContent;
            });
        }) || [];
        var tweetTime = page.evaluate(function() {
            return [].map.call(document.querySelectorAll('td.timestamp a'), function(p) {
                return p.textContent;
            });
        }) || [];

        // Index loop rather than for...in, which would also visit any
        // enumerable non-index properties on the array.
        for (var index = 0; index < tweet_texts.length; index++)
        {
            allTweets.push({
                text : tweet_texts[index],
                time : tweetTime[index]
            });
        }

        // href values of the "more" button, relative to the site root.
        var nextLink = page.evaluate(function() {
            return [].map.call(document.querySelectorAll('div.w-button-more a'), function(link) {
                return link.getAttribute('href');
            });
        }) || [];
        console.log(nextLink);

        if (nextLink.length > 0)
        {
            save();
            scrapeMore("https://mobile.twitter.com" + nextLink[0]);
            return;
        }

        console.log("doneeeee");
        save();
        phantom.exit();
    });
}
// Start crawling from the URL supplied on the command line.
scrapeMore(url);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You need PhantomJS version 1.9.8 to run this.
The command to run it is:
phantomjs script.js "https://mobile.twitter.com/search?q=search" out.json