Last active
August 12, 2021 12:43
-
-
Save matthew-e-brown/82fe581b522919192420f1c229eb3483 to your computer and use it in GitHub Desktop.
Scrape Instagram post URLs for all images in that post (including 'album' style posts). Doesn't work on videos; it will just return their thumbnail.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Pool of user-agent strings. One entry is picked at random for each scrape;
// the more entries there are, the harder it is for the rate limiter to
// detect you.
const userAgents = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
  "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9",
  "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36",
  "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
];

module.exports = userAgents;
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require('puppeteer'); | |
const express = require('express'); | |
const fetch = require('node-fetch'); | |
const agents = require('./agents'); | |
// Kick off the headless browser as soon as the module loads. `browser` holds
// the pending launch promise, so every consumer has to await it before use.
console.log('Starting Puppeteer...');
const browser = puppeteer.launch();

// The Express application that the routes below are registered on
const app = express();
/**
 * Scrapes an Instagram post URL (instagram.com/p/abcd1234/) and collects the
 * `src` of every image in the post, stepping through 'album' style posts by
 * clicking the (>) button until no new images show up.
 *
 * The page opened for the scrape is always closed again, whether the scrape
 * succeeds or fails.
 *
 * @param {string} postUrl The URL of the instagram post to get the images from
 * @returns {Promise<string[]>} An array of Facebook CDN strings pointing to the
 * images, de-duplicated, in the order they were discovered
 * @throws {Error} If a 3xx response is observed (probably a rate-limit), if the
 * post container or its images cannot be found, or if Puppeteer times out
 */
const getImagesUrls = async (postUrl) => {
  // An album post can only hold 10 images; the loop below uses this as a
  // hard-defined safeguard instead of looping forever
  const MAX_ALBUM_SIZE = 10;
  const POST_SELECTOR = 'article[role="presentation"]';

  // Async counterpart of Array.prototype.find; yields undefined on no match
  // https://stackoverflow.com/a/55601090/10549827
  const findAsync = async (arr, callback) => {
    const results = await Promise.all(arr.map(callback));
    return arr[results.findIndex(Boolean)];
  };

  const page = await (await browser).newPage();

  try {
    // Throwing from inside an event listener would not reject this function
    // (it would surface as an uncaught exception instead), so the listener
    // only records the redirect and the main flow checks the flag.
    let sawRedirect = false;
    page.on('response', res => {
      const code = res.status();
      if (code >= 300 && code <= 399) sawRedirect = true;
    });
    const assertNoRedirect = () => {
      if (sawRedirect) {
        throw new Error("Instagram redirect. Probably a rate-limit.");
      }
    };

    await page.setUserAgent(agents[Math.floor(Math.random() * agents.length)]);
    await page.goto(postUrl);
    assertNoRedirect();

    // Only `visible` is requested: combining it with `hidden` is contradictory
    // and lets the wait succeed even when the element is not there at all
    await page.waitForSelector(POST_SELECTOR, {
      visible: true,
      timeout: 5_000
    });

    const div = await page.$(`${POST_SELECTOR}>div:nth-of-type(2)`);
    if (!div) {
      throw new Error("Could not find images at URL");
    }

    // A Set keeps insertion order and takes care of de-duplication
    const allImages = new Set();

    for (let run = 0; run <= MAX_ALBUM_SIZE; run++) {
      // Find all the currently loaded images in the main img container
      const images = await div.$$eval('img', imgTags => {
        return Array.from(imgTags).map(img => img.src);
      });

      // If we found no images for some reason
      if (images == null || images.length === 0) {
        throw new Error("Could not find images at URL");
      }

      // Add all the images we didn't get before
      const countBefore = allImages.size;
      for (const src of images) {
        allImages.add(src);
      }

      // If we found them all (the 'next' image is always loaded, so we know we
      // can stop if we don't get anything past this one)
      if (allImages.size === countBefore) break;

      // Check if there's any (<) or (>) buttons to push; `$$` resolves to an
      // empty array when there are none, which means this was only one image
      const buttons = await div.$$('div[role="presentation"]~button');
      if (buttons.length === 0) break;

      // Check which of the buttons we found is the (>) button (vs the (<) button)
      const nextButton = await findAsync(buttons, handle => {
        return page.evaluate(b => {
          const style = getComputedStyle(b);
          // the right button is absolutely positioned relative to the right edge,
          // so even if 'left' is not set, it has a value in the *computed* style
          return (parseFloat(style.right) < parseFloat(style.left));
        }, handle);
      });
      if (!nextButton) break; // couldn't find a right button, so we are done

      // Click the button and wait for it to fire a network request
      await Promise.all([
        page.waitForResponse(res => res.status() === 200),
        nextButton.click()
      ]);
      assertNoRedirect();
    }

    return [...allImages];
  } finally {
    // Release the tab even when the scrape failed part-way through
    await page.close();
  }
};
/**
 * GET /image/<post URL>
 *
 * Scrapes the given Instagram post, downloads every image it finds and replies
 * with a small HTML page that embeds them as base64 `data:` URIs.
 *
 * On any failure the reply is a 400 with a JSON body of `{ error: <message> }`.
 *
 * NOTE(review): `req.params.url` is caller-controlled and is handed to the
 * browser as-is, so this endpoint will navigate to arbitrary URLs. Consider
 * restricting it to Instagram hosts before exposing it publicly.
 */
app.get('/image/:url(*)', async (req, res) => {
  console.log(`Received request for '${req.params.url}'...`);

  try {
    const URLs = await getImagesUrls(req.params.url);
    console.log('Found images:', URLs);

    // Download all the images in parallel and base64-encode their bytes
    const base64 = await Promise.all(URLs.map(async url => {
      const response = await fetch(url);
      if (!response.ok) {
        // Don't embed an error page as if it were image data
        throw new Error(`Failed to download image (HTTP ${response.status})`);
      }
      const buffer = await response.buffer();
      return buffer.toString('base64');
    }));

    res.send(`
      <html>
        <head>
          <title>Here's those pics you ordered dawg 🖼</title>
        </head>
        <body>
          ${base64.map(s => `<img src="data:image/jpeg;base64,${s}" />`).join('')}
        </body>
      </html>
    `);
  } catch (error) {
    console.error(`Request for '${req.params.url}' failed:`, error);
    // Error instances have no enumerable properties, so sending one straight
    // through res.json() produces `{}`; send the message explicitly instead
    const message = error instanceof Error ? error.message : String(error);
    res.status(400).json({ error: message });
  }
});
// Once Puppeteer is running, start the server. If the browser fails to launch
// there is nothing useful the server could do, so report it and exit.
browser
  .then(() => {
    const port = process.env.PORT || 3000;

    // Registering a SIGINT listener disables Node's default "exit on Ctrl+C"
    // behaviour, so the process has to be ended explicitly once the browser
    // has been shut down; otherwise the listening server keeps it alive.
    process.on('SIGINT', async () => {
      try {
        await (await browser).close();
      } catch (error) {
        console.error('Failed to close Puppeteer cleanly:', error);
      } finally {
        process.exit(0);
      }
    });

    app.listen(port, () => {
      console.log(`Express is running on port ${port}`);
    });
  })
  .catch(error => {
    console.error('Failed to start Puppeteer:', error);
    process.exit(1);
  });
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "instagram-scrape", | |
"version": "1.0.1", | |
"description": "Scrape some stuff, I guess 🤷🏻♂️", | |
"main": "index.js", | |
"scripts": { | |
"start": "node index.js" | |
}, | |
"keywords": [ "instagram", "scrape" ], | |
"author": { | |
"name": "Matthew Brown", | |
"url": "https://github.com/matthew-e-brown" | |
}, | |
"license": "MIT", | |
"dependencies": { | |
"express": "^4.17.1", | |
"node-fetch": "^2.6.1", | |
"puppeteer": "^10.2.0" | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment