-
-
Save Iqlaas/40be545aad7f072f9d14680fda93f8dd to your computer and use it in GitHub Desktop.
Extracting Instagram info from screenshot images, scraping each profile, and putting the results into spreadsheets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Selenium WebDriver entry points used throughout this script.
var webdriver = require('selenium-webdriver'),
    chrome = require('selenium-webdriver/chrome'),
    By = webdriver.By,
    until = webdriver.until,
    Key = webdriver.Key;

// Chrome launch options passed to the driver built below.
var o = new chrome.Options();
// o.addArguments('start-fullscreen');
o.addArguments('disable-infobars');
o.addArguments("disable-notifications");
// NOTE(review): presumably turns off Chrome's "save password" prompt - confirm.
o.setUserPreferences({ credentials_enable_service: false });

// Helpers for name parsing, e-mail extraction, CSV output, file access and OCR.
var parseFullName = require('parse-full-name').parseFullName;
var findEmails = require('find-emails-in-string');
var json2csv = require('json2csv');
var fs = require('fs');
var okrabyte = require("okrabyte");

// Single shared browser session used by find() and goToUser().
var driver = new webdriver.Builder().withCapabilities(webdriver.Capabilities.chrome()).setChromeOptions(o).build();
/**
 * Helper: wait for an element matching a CSS selector and return it.
 *
 * The result of driver.wait() is returned directly instead of being discarded
 * and followed by a separate findElement() call, so the lookup really waits
 * for the element and a timeout is reported to the caller.
 *
 * @param {string} el - CSS selector of the element to locate.
 * @returns the value returned by driver.wait() for the located element;
 *     callers in this file call .getText() on it.
 */
var find = function (el) {
  return driver.wait(until.elementLocated(By.css(el)), 5000, "Could not find " + el);
};
// Read the contents of the directory with screenshot images and print the
// first word recognised in each image.
// Since converting images to text is error prone, this part is run separately
// first so the output can be validated in the console; the result is then
// copy-pasted into the `influencers` array. Ideally this would become a
// function that returns an array ready to be used by the rest of the script.
fs.readdir("imgs/", function (err, files) {
  if (err) {
    throw err;
  }
  // Iterate over each file in the directory.
  files.forEach(function (file) {
    // Read the image synchronously and hand the buffer to okrabyte for OCR.
    okrabyte.decodeBuffer(fs.readFileSync("imgs/" + file), function (error, data) {
      // A failed decode must not crash the whole run: report it and move on.
      if (error || typeof data !== "string") {
        console.error("Could not read text from imgs/" + file, error);
        return;
      }
      // Split the returned string into words (separated by spaces), keep the
      // first one and strip underscores and digits from it.
      var splitWords = data.split(" ");
      var word = splitWords[0].replace(/_|[0-9]/g, "");
      console.log(word);
    });
  });
});
// Array of influencer handles extracted from the images; each entry is
// appended to the Instagram base URL by goToUser(). Paste the validated
// OCR output here.
var influencers = [
];
/**
 * Visit the Instagram profile of every entry in `influencers`, collect the
 * follower count, first/last name, first e-mail found in the bio and the
 * profile URL, then write all rows to 'file.csv'.
 *
 * Profiles are processed strictly one after another through a promise chain,
 * and each profile gets its own row object, so values cannot leak from one
 * profile into the next and no sleep-based timing is needed.
 *
 * @returns {Promise} resolves once 'file.csv' has been written; rejects if a
 *     page element cannot be found or the file cannot be written.
 */
function goToUser() {
  // CSV columns created by json2csv; the keys of each row must match these.
  var fields = ["first name", "last name", "followers", "email", "profile url"];
  // Rows collected so far, one per influencer, in input order.
  var influencerCSV = [];

  // Scrape a single profile into a fresh row and append it to influencerCSV.
  function scrapeProfile(handle) {
    var row = {
      "first name": null,
      "last name": null,
      "followers": null,
      "email": null,
      "profile url": null
    };

    // Using webdriver, go to the profile page for this list member.
    return Promise.resolve(driver.get("http://instagram.com/" + handle))
      .then(function () {
        // Element that contains the follower count.
        return find("._9o0bc li:nth-child(2) ._bkw5z").getText();
      })
      .then(function (txt) {
        row["followers"] = txt;
        // Element holding the user's full name.
        return find("._79dar").getText();
      })
      .then(function (txt) {
        // parseFullName copes with long names, middle initials, etc.
        var name = parseFullName(txt);
        row["first name"] = name.first;
        row["last name"] = name.last;
        // Element holding the bio text, searched for e-mail addresses.
        return find("._bugdy").getText();
      })
      .then(function (txt) {
        var emails = findEmails(txt);
        row["email"] = emails[0] === undefined ? "No email listed" : emails[0];
        // Link to the Instagram user's profile.
        return driver.getCurrentUrl();
      })
      .then(function (url) {
        row["profile url"] = url;
        influencerCSV.push(row);
      });
  }

  // Chain the profiles so each one starts only after the previous finished.
  var queue = influencers.reduce(function (chain, handle) {
    return chain.then(function () {
      return scrapeProfile(handle);
    });
  }, Promise.resolve());

  // Convert the collected rows to a CSV spreadsheet and save it.
  return queue.then(function () {
    var csv = json2csv({ data: influencerCSV, fields: fields });
    return new Promise(function (resolve, reject) {
      fs.writeFile('file.csv', csv, function (err) {
        if (err) {
          reject(err);
          return;
        }
        console.log('file saved');
        resolve();
      });
    });
  });
}
// Run the function that produces the spreadsheet.
goToUser();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment