Skip to content

Instantly share code, notes, and snippets.

@matthew-e-brown
Last active August 12, 2021 12:43
Show Gist options
  • Save matthew-e-brown/82fe581b522919192420f1c229eb3483 to your computer and use it in GitHub Desktop.
Save matthew-e-brown/82fe581b522919192420f1c229eb3483 to your computer and use it in GitHub Desktop.
Scrapes an Instagram post URL for all of the images in that post (including 'album'-style posts). Does not work on videos; for those it just returns the thumbnail.
// Pool of browser User-Agent strings, exported as a plain array.
// A user agent is randomly selected from this list for every page the scraper
// opens; adding more will make it harder for the rate limiter to detect you.
// The entries cover different browsers/platforms (Edge, Chrome OS, Safari on
// macOS, Chrome on Windows 7, Firefox on Ubuntu).
module.exports = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
"Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
];
const puppeteer = require('puppeteer');
const express = require('express');
const fetch = require('node-fetch');
const agents = require('./agents');
// The Express application; routes are registered on it further down
const app = express();
// Boot the browser directly on startup. `puppeteer.launch()` is NOT awaited
// here, so `browser` holds a Promise of the browser instance: every consumer
// has to `await browser` (or chain `.then` on it) before using it.
console.log('Starting Puppeteer...');
const browser = puppeteer.launch();
/**
 * Scrapes an Instagram post URL (instagram.com/p/abcd1234/)
 *
 * Opens the post in a fresh Puppeteer page, collects the `src` of every `<img>`
 * inside the post's media container, and keeps clicking the album's (>) button
 * until a pass yields no new image. The page is always closed again, whether
 * the function returns or throws.
 *
 * @param {string} postUrl The URL of the instagram post to get the images from
 * @returns {Promise<string[]>} An array of Facebook CDN strings pointing to the
 * images, in the order they were discovered (no duplicates)
 * @throws {Error} If a 3xx response is observed (probably a rate-limit), if the
 * post does not render within 5 seconds, or if no images can be found
 */
const getImagesUrls = async (postUrl) => {
  const REDIRECT_MESSAGE = "Instagram redirect. Probably a rate-limit.";
  const NO_IMAGES_MESSAGE = "Could not find images at URL";
  // Basically a 'while true' loop, but since there can only be 10 images in an
  // album post, we can add this hard-defined safeguard, just in case
  const MAX_RUNS = 11;

  // Resolves to the first element for which the async callback is truthy, or
  // `undefined` when there is none.
  // https://stackoverflow.com/a/55601090/10549827
  const findAsync = async (arr, callback) => {
    const results = await Promise.all(arr.map(callback));
    return arr[results.findIndex(bool => bool)];
  };

  const page = await (await browser).newPage();

  try {
    // An exception thrown inside an event listener never reaches this async
    // function (it escapes into Puppeteer's event dispatch instead), so the
    // listener only records the redirect and the main flow throws for it.
    let sawRedirect = false;
    page.on('response', res => {
      const code = res.status();
      if (code >= 300 && code <= 399) sawRedirect = true;
    });
    const throwIfRedirected = () => {
      if (sawRedirect) throw new Error(REDIRECT_MESSAGE);
    };

    await page.setUserAgent(agents[Math.floor(Math.random() * agents.length)]);
    await page.goto(postUrl);
    throwIfRedirected();

    // Only `visible` is requested: combining it with `hidden` lets the wait
    // succeed while the element is missing, which defeats its purpose.
    await page.waitForSelector('article[role="presentation"]', {
      visible: true,
      timeout: 5_000
    });

    const div = await page.$('article[role="presentation"]>div:nth-of-type(2)');
    if (!div) throw new Error(NO_IMAGES_MESSAGE);

    // A Set keeps insertion order and de-duplicates in O(1) per image
    const allImages = new Set();

    for (let runs = 0; runs < MAX_RUNS; runs++) {
      // Find all the currently loaded images in the main img container
      const images = await div.$$eval('img', imgTags => {
        return Array.from(imgTags).map(img => img.src);
      });

      // If we found no images for some reason
      if (images == null || images.length === 0) {
        throw new Error(NO_IMAGES_MESSAGE);
      }

      // Add all the images we didn't get before
      const knownBefore = allImages.size;
      images.forEach(src => allImages.add(src));

      // If we found them all (the 'next' image is always loaded, so we know we
      // can stop if we don't get anything past this one)
      if (allImages.size === knownBefore) break;

      // Check if there's any (<) or (>) buttons to push. `$$` always resolves
      // to an array, so "no buttons" means an empty one: single image, done.
      const buttons = await div.$$('div[role="presentation"]~button');
      if (buttons.length === 0) break;

      // Check which of the buttons we found is the (>) button (vs the (<) button)
      const button = await findAsync(buttons, handle => {
        return page.evaluate(b => {
          const style = getComputedStyle(b);
          // the right button is absolutely positioned relative to the right
          // edge, so even if 'left' is not set, it has a value in the
          // *computed* style
          return (parseFloat(style.right) < parseFloat(style.left));
        }, handle);
      });
      if (!button) break; // couldn't find a right button, so we are done

      // Click the button and wait for it to fire a network request
      await Promise.all([
        page.waitForResponse(res => res.status() === 200),
        button.click()
      ]);
      throwIfRedirected();
    }

    return [...allImages];
  } finally {
    // Runs on success and on every error path, so pages never pile up
    await page.close();
  }
};
/**
 * GET /image/<post URL>
 *
 * Scrapes the given Instagram post, downloads every image it finds and answers
 * with an HTML page that embeds them as base64 `data:` URIs. Any failure is
 * reported as HTTP 400 with a JSON body of the form `{ "error": "<message>" }`.
 */
app.get('/image/:url(*)', async (req, res) => {
  console.log(`Received request for '${req.params.url}'...`);

  // The URL comes straight from the client and is handed to a real browser, so
  // only let http(s) URLs on instagram.com through (no file:, no internal hosts)
  let target;
  try {
    target = new URL(req.params.url);
  } catch {
    res.status(400).json({ error: 'Not a valid URL' });
    return;
  }
  const isWeb = target.protocol === 'http:' || target.protocol === 'https:';
  const isInstagram = /(^|\.)instagram\.com$/i.test(target.hostname);
  if (!isWeb || !isInstagram) {
    res.status(400).json({ error: 'Only instagram.com post URLs are supported' });
    return;
  }

  try {
    const URLs = await getImagesUrls(target.href);
    console.log('Found images:', URLs);

    const base64 = await Promise.all(URLs.map(async url => {
      const r = await fetch(url);
      // Without this check an error page from the CDN would be embedded as if
      // it were image data
      if (!r.ok) throw new Error(`Could not download image (HTTP ${r.status})`);
      const buffer = await r.buffer();
      return buffer.toString('base64');
    }));

    res.send(`
<html>
<head>
<title>Here's those pics you ordered dawg 🖼</title>
</head>
<body>
${base64.map(s => `<img src="data:image/jpeg;base64,${s}" />`).join('')}
</body>
</html>
`);
  } catch (error) {
    console.error(`Request for '${req.params.url}' failed:`, error);
    // An Error's `message` is not enumerable, so `res.json(error)` would send
    // `{}`; pick the message out explicitly
    const message = error instanceof Error ? error.message : String(error);
    res.status(400).json({ error: message });
  }
});
// Once Puppeteer is running, start the server. If the browser cannot be
// launched (or the server cannot be started) log the reason and exit non-zero
// instead of leaving an unhandled rejection behind.
browser
  .then(instance => {
    const port = process.env.PORT || 3000;

    // Registering a SIGINT listener turns off Node's default "exit on Ctrl+C",
    // so the process has to be terminated explicitly once the browser is shut
    // down; otherwise the listening server would keep it alive.
    process.on('SIGINT', async () => {
      try {
        await instance.close();
      } catch (error) {
        console.error('Failed to close Puppeteer cleanly:', error);
        process.exitCode = 1;
      } finally {
        process.exit();
      }
    });

    app.listen(port, () => {
      console.log(`Express is running on port ${port}`);
    });
  })
  .catch(error => {
    console.error('Startup failed:', error);
    process.exit(1);
  });
{
"name": "instagram-scrape",
"version": "1.0.1",
"description": "Scrape some stuff, I guess 🤷🏻‍♂️",
"main": "index.js",
"scripts": {
"start": "node index.js"
},
"keywords": [ "instagram", "scrape" ],
"author": {
"name": "Matthew Brown",
"url": "https://github.com/matthew-e-brown"
},
"license": "MIT",
"dependencies": {
"express": "^4.17.1",
"node-fetch": "^2.6.1",
"puppeteer": "^10.2.0"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment