Last active
October 22, 2018 16:12
-
-
Save jwcounts/ebf46df324e3e4704ab60dc01b86eb32 to your computer and use it in GitHub Desktop.
Update to Twitter Analytics Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Browser user agent sent with every request (this one is Firefox 60).
// Swap in any other user-agent string if you prefer.
$user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:60.0) Gecko/20100101 Firefox/60.0';

// Twitter username and password. Fill these in before running.
$username = "";
$password = "";

// Start and end of the reporting window. mktime() returns seconds, so
// appending three zeroes yields the millisecond timestamps that are passed
// as start_time / end_time in the analytics URLs.
$start = mktime( 0, 0, 0, 6, 19, 2018 ).'000';
$end = mktime( 0, 0, 0, 6, 24, 2018 ).'000';

// Cookie jar shared by all requests. Read it up front so we can tell whether
// an auth_token from a previous run already exists. On the very first run the
// file is not there yet, so fall back to an empty string instead of letting
// file_get_contents() emit a warning and return false.
$tw_cookie = "./cookie.txt";
$cookie = is_readable( $tw_cookie ) ? (string) file_get_contents( $tw_cookie ) : '';
// Create the single cURL handle that every request in this script reuses,
// and apply the options common to all of them.
$ch = curl_init();
curl_setopt_array($ch, [
	// Return response bodies from curl_exec() instead of printing them.
	CURLOPT_RETURNTRANSFER => true,
	// Credentials and session cookies travel over this handle, so the TLS
	// certificate and host name must be verified. If verification fails on
	// your machine, point curl.cainfo in php.ini at a CA bundle rather than
	// turning these off.
	CURLOPT_SSL_VERIFYPEER => true,
	CURLOPT_SSL_VERIFYHOST => 2,
	CURLOPT_USERAGENT => $user_agent,
	// Read cookies from, and persist cookies to, the same jar file.
	CURLOPT_COOKIEFILE => $tw_cookie,
	CURLOPT_COOKIEJAR => $tw_cookie,
	CURLOPT_REFERER => "https://twitter.com/",
]);
// Check if the auth_token cookie exists. If not, log in and grab one
if ( !preg_match( '/auth_token\t([a-z0-9]+)/', (string) $cookie ) ) :
	// First call gets hidden form field authenticity_token and session cookie
	curl_setopt($ch, CURLOPT_URL, "https://twitter.com/");
	$html = curl_exec($ch);
	if ( $html === false ) :
		error_log( 'Twitter analytics: could not load the login page: ' . curl_error($ch) );
		exit(1);
	endif;

	// Parse authenticity_token out of the HTML response.
	// NOTE(review): the pattern in the published copy of this script looks like
	// it lost its "&quot" entities; this version accepts both the
	// entity-encoded and the raw-quote form. Confirm against the live page.
	if ( !preg_match( '/formAuthenticityToken(?:&quot;|")\s*:\s*(?:&quot;|")([0-9a-zA-Z]+)(?:&quot;|")/', $html, $match ) ) :
		// Without a token the login POST cannot succeed, and repeated failed
		// logins are a quick way to get the account locked, so stop here.
		error_log( 'Twitter analytics: authenticity token not found in login page' );
		exit(1);
	endif;
	$authenticity_token = $match[1];

	// Build the form body. Values are URL-encoded so that characters such as
	// "&", "+" or "=" in the credentials do not corrupt the request.
	$sPost = "session[username_or_email]=" . urlencode( $username )
		. "&session[password]=" . urlencode( $password )
		. "&return_to_ssl=true&scribe_log=&redirect_after_login=%2F"
		. "&authenticity_token=" . urlencode( $authenticity_token );

	// Second call is a POST and performs the login
	curl_setopt($ch, CURLOPT_URL, "https://twitter.com/sessions");
	curl_setopt($ch, CURLOPT_POST, true);
	curl_setopt($ch, CURLOPT_POSTFIELDS, $sPost);
	curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
	curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
	curl_setopt($ch, CURLOPT_HTTPHEADER, array("Content-type: application/x-www-form-urlencoded"));
	if ( curl_exec($ch) === false ) :
		error_log( 'Twitter analytics: login request failed: ' . curl_error($ch) );
		exit(1);
	endif;
endif;
// Re-read the cookie jar and drop every '#HttpOnly_' prefix before writing it
// back. Per the original author, leaving the prefix in place makes later runs
// treat the auth_token line as a comment and ignore it.
$cookie = file_get_contents( $tw_cookie );
file_put_contents(
	$tw_cookie,
	str_replace( '#HttpOnly_', '', $cookie )
);
// Do a POST request to generate a CSV of the tweets from our time period.
// $sTarget triggers/polls the export; $sTargetBundle is where the finished
// CSV is downloaded from afterwards.
$sTarget = "https://analytics.twitter.com/user/{$username}/tweets/export.json?start_time={$start}&end_time={$end}&lang=en";
$sTargetBundle = "https://analytics.twitter.com/user/$username/tweets/bundle?start_time={$start}&end_time={$end}&lang=en";
curl_setopt($ch, CURLOPT_URL, $sTarget);
curl_setopt($ch, CURLOPT_POST, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, array("Content-type: application/x-www-form-urlencoded"));
curl_setopt($ch, CURLOPT_POSTFIELDS, '');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_HEADER, false);

// Since the CSV takes some time to generate, poll the status a few times.
// The attempt counter is advanced on every pass so the loop is guaranteed to
// stop after 5 requests even if the export never leaves "Pending".
$data_arr = ["status" => "Pending"];
$attempts = 0;
while ( $data_arr['status'] === "Pending" && $attempts < 5 ) :
	$attempts++;
	$data = curl_exec($ch);
	$decoded = is_string( $data ) ? json_decode( $data, true ) : null;

	// A failed request, invalid JSON, or a reply without a "status" key ends
	// the polling (as it did before), but without undefined-index warnings.
	if ( is_array( $decoded ) && isset( $decoded['status'] ) ) :
		$data_arr = $decoded;
	else :
		$data_arr = ["status" => "Unknown"];
	endif;

	// Only wait when another poll is actually going to happen.
	if ( $data_arr['status'] === "Pending" && $attempts < 5 ) :
		sleep(2);
	endif;
endwhile;
// Once the export is completed, download the CSV file and save it.
// Switch the reused handle back to a plain GET for the download.
curl_setopt($ch, CURLOPT_HTTPGET, true);
curl_setopt($ch, CURLOPT_URL, $sTargetBundle);
$data = curl_exec($ch);
$error = curl_error($ch);
$destination = "./tweets.csv";
if ( $data === false ) :
	// Report the transport error and keep the previous file instead of
	// truncating it to zero bytes.
	error_log( "Twitter analytics: CSV download failed: {$error}" );
elseif ( file_put_contents( $destination, $data ) === false ) : // overwrites the previous file
	error_log( "Twitter analytics: could not write {$destination}" );
endif;
// Download the graph data JSON for the same time window
$sTarget = "https://analytics.twitter.com/user/{$username}/tweets/account_stats.json?start_time={$start}&end_time={$end}";
curl_setopt($ch, CURLOPT_URL, $sTarget);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HEADER, false);

// Save the JSON file
$data = curl_exec($ch);
$error = curl_error($ch);
$destination = "./graphs.json";
if ( $data === false ) :
	// Report the transport error and keep the previous file instead of
	// truncating it to zero bytes.
	error_log( "Twitter analytics: graph data download failed: {$error}" );
elseif ( file_put_contents( $destination, $data ) === false ) : // overwrites the previous file
	error_log( "Twitter analytics: could not write {$destination}" );
endif;
// Data for SVG timeline
$sTarget = "https://analytics.twitter.com/user/{$username}/tweets/timeline.json?start_time={$start}&max_id=0&end_time={$end}&page=0&filter=no_replies&metric=clicks&lang=en";
curl_setopt($ch, CURLOPT_URL, $sTarget);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
// Unlike the earlier requests, redirects are not followed for this one.
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);

// Save the timeline data
$data = curl_exec($ch);
$error = curl_error($ch);
$destination = "./timeline.json";
if ( $data === false ) :
	// Report the transport error and keep the previous file instead of
	// truncating it to zero bytes.
	error_log( "Twitter analytics: timeline download failed: {$error}" );
elseif ( file_put_contents( $destination, $data ) === false ) : // overwrites the previous file
	error_log( "Twitter analytics: could not write {$destination}" );
endif;

// Release the shared handle now that all requests are done.
curl_close($ch);
?>
Thanks for updating it! Warning understood. Do you have a guess as to how often it was running when it resulted in a lock-up?
Sorry I missed your question for so long! If you're still interested, I initially ran into problems while testing, since I would tweak the script and test it several times over the course of an hour or so. I then tried to space out runs of the script every couple of hours, but that started backfiring too. Pretty soon, any run of the script, no matter how long I waited, would lock the account, even with the modifications I had made.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Leaving this here because it's an interesting experiment, but be ye warned: it's also a really good way to lock up your account. Apparently this type of automated behavior is against the rules. Granted, I or anyone else wouldn't have to resort to this if Twitter just had an Analytics API that wasn't only about ads or stupid expensive.