Last active
May 30, 2022 20:18
-
-
Save Rup1/dbe9788ffe15d9260f352fce74a360c2 to your computer and use it in GitHub Desktop.
Extract instagram info from screenshot, scrape each profile, put data into spreadsheet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
Little background story: | |
One day before going to a brunch party, my girlfriend got an email from her employer with a bunch of screenshots of | |
Instagram influencer accounts. They asked that she put their names, websites, emails and other relevant info in a
spreadsheet. There were hundreds of accounts and this would have taken her at least a full day - ruining our party plans. | |
So I wrote this script to do it for her - I researched packages on the spot and was amazed at how easy it was to find node | |
packages that did what I needed. Then it was just a matter of stringing everything together. It's not pretty, but it took
about an hour and a half to write (targeting the right css selectors was a pain) and got the job done in about 15 minutes. | |
**/ | |
// Selenium WebDriver entry points used by the rest of the script.
var webdriver = require('selenium-webdriver');
var chrome = require('selenium-webdriver/chrome');
var By = webdriver.By;
var until = webdriver.until;
var Key = webdriver.Key;

// Chrome launch options. The flag names suggest they suppress info bars and
// notification prompts; the preference presumably turns off the
// "save password" credential service -- NOTE(review): confirm against the
// Chrome version in use.
var o = new chrome.Options();
// o.addArguments('start-fullscreen');
o.addArguments('disable-infobars');
o.addArguments("disable-notifications");
o.setUserPreferences({ credentials_enable_service: false });

// Third-party helpers: name splitting, e-mail extraction, CSV output, OCR.
var parseFullName = require('parse-full-name').parseFullName;
var findEmails = require('find-emails-in-string');
var json2csv = require('json2csv');
var fs = require('fs');
var okrabyte = require("okrabyte");

// Single shared browser session used by find() and goToUser().
var driver = new webdriver.Builder()
  .withCapabilities(webdriver.Capabilities.chrome())
  .setChromeOptions(o)
  .build();
/**
 * Helper: waits (up to 5 seconds) for an element matching a CSS selector to
 * be located and hands back that element.
 *
 * The result of driver.wait() is returned directly instead of being dropped
 * and followed by a separate findElement() call. That way the lookup is tied
 * to the wait itself and does not depend on the driver implicitly ordering
 * two independent commands.
 *
 * @param {string} el - CSS selector of the element to look up.
 * @returns {*} Whatever driver.wait() returns for an until.elementLocated
 *   condition (in selenium-webdriver this is a WebElementPromise, so calls
 *   such as getText() can be chained on it). The wait fails with
 *   "Could not find <selector>" on timeout.
 */
var find = function(el){
  return driver.wait(until.elementLocated(By.css(el)), 5000, "Could not find " + el);
};
//read contents of directory with images | |
//Since this deals with converting images to text and is error prone, I run this script separately first, so I can validate that it's all correct in the console, | |
//I then literally copy paste the result into an array :) Ideally, I would simply change this to a function that returns an array value ready to be used by the
//rest of the script. | |
//Read the contents of the "imgs/" directory using 'readdir', run OCR on every file found there
//and print the first word of each result (underscores and digits stripped) to the console.
fs.readdir("imgs/", function (err, files) {
  if (err) {
    throw err;
  }
  //forEach gives every OCR callback its own fileName, and avoids the implicit global loop counter.
  files.forEach(function (fileName) {
    //Using okrabyte package read each image file (using readFileSync) and capture text
    okrabyte.decodeBuffer(fs.readFileSync("imgs/" + fileName), function (error, data) {
      //OCR is error prone: report a failed or empty decode and move on to the next image
      //instead of crashing on data.split() with an undefined result.
      if (error || typeof data !== "string") {
        console.error("Could not read text from imgs/" + fileName, error);
        return;
      }
      //split the returned string into an array of words (separated by spaces), and strip unwanted characters
      var splitWords = data.split(" ");
      var word = splitWords[0].replace(/_|[0-9]/g, "");
      console.log(word);
    });
  });
});
//Array of influencer account names extracted from the images. It is filled in by hand:
//paste the names printed by the OCR pass above between the brackets. goToUser() appends
//each entry to "http://instagram.com/" to build the profile URL.
var influencers = [
];
/**
 * Private helper: opens one Instagram profile and collects the data for its
 * spreadsheet row.
 *
 * Every step is chained explicitly, so the values always belong to the
 * profile being visited and nothing depends on the driver implicitly
 * ordering commands or on timed sleeps.
 *
 * @param {string} handle - Account name appended to "http://instagram.com/".
 * @returns {Promise<Object>} Resolves with an object keyed by the CSV column
 *   names ("first name", "last name", "followers", "email", "profile url").
 *   Rejects if navigation or any element lookup fails.
 */
function scrapeProfile(handle) {
  var followers;
  var firstName;
  var lastName;
  var email;

  //using webdriver, go to the profile page for this list member
  return driver.get("http://instagram.com/" + handle)
    .then(function () {
      //element that contains the follower count
      return find("._9o0bc li:nth-child(2) ._bkw5z").getText();
    })
    .then(function (txt) {
      followers = txt;
      //element holding the user's full name
      return find("._79dar").getText();
    })
    .then(function (txt) {
      //use the 'parseFullName' package to cope with long names, middle initials, etc.
      var name = parseFullName(txt);
      firstName = name.first;
      lastName = name.last;
      //element containing the instagram bio
      return find("._bugdy").getText();
    })
    .then(function (txt) {
      //extract the first e-mail address from the bio, if any
      var emails = findEmails(txt);
      email = emails[0] === undefined ? "No email listed" : emails[0];
      //the URL the browser ended up on is recorded as the profile url
      return driver.getCurrentUrl();
    })
    .then(function (url) {
      return {
        "first name": firstName,
        "last name": lastName,
        "followers": followers,
        "email": email,
        "profile url": url
      };
    });
}

/**
 * Visits every account listed in `influencers` one after another, collects
 * one row per profile and writes the rows to 'file.csv' via json2csv.
 *
 * @returns {Promise<void>} Resolves after 'file.csv' has been written (and
 *   'file saved' logged); rejects if scraping a profile or writing the file
 *   fails.
 */
function goToUser() {
  //csv fields to be created by json2csv
  var fields = ["first name", "last name", "followers", "email", "profile url"];
  //rows collected so far, in the same order as `influencers`
  var influencerCSV = [];

  //Build a sequential chain: each profile is scraped only after the previous one has finished,
  //because all profiles share the same browser session.
  var queue = Promise.resolve();
  influencers.forEach(function (handle) {
    queue = queue
      .then(function () {
        return scrapeProfile(handle);
      })
      .then(function (row) {
        influencerCSV.push(row);
      });
  });

  //Convert the collected rows to a CSV spreadsheet once every profile has been processed.
  return queue.then(function () {
    var csv = json2csv({ data: influencerCSV, fields: fields });
    return new Promise(function (resolve, reject) {
      fs.writeFile('file.csv', csv, function (err) {
        if (err) {
          reject(err);
          return;
        }
        console.log('file saved');
        resolve();
      });
    });
  });
}
//Run the function that produces the spreadsheet. Promise.resolve() accepts both a promise and a
//plain (undefined) return value from goToUser(); if a promise is returned and it rejects, the
//error is printed and the process is marked as failed instead of being left unhandled.
Promise.resolve(goToUser()).catch(function (err) {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Wow