matthew-e-brown · August 12, 2021 12:43
diff --git a/agents.js b/agents.js
 // Pool of browser user-agent strings. One entry is picked at random from this
 // list; the more entries there are, the harder it is for the rate limiter to
 // detect you
 const userAgents = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
  "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9",
  "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36",
  "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
 ];

 module.exports = userAgents;
diff --git a/index.js b/index.js
 const puppeteer = require('puppeteer');
 const express = require('express');
 const fetch = require('node-fetch');

 const agents = require('./agents');

 // Start launching the browser as soon as the module loads. `browser` holds
 // the pending launch promise, so every consumer has to await it before use
 console.log('Starting Puppeteer...');
 const browser = puppeteer.launch();

 // HTTP application the routes below are registered on
 const app = express();

 /**
 * Scrapes an Instagram post URL (instagram.com/p/abcd1234/), stepping through
 * the album carousel until no new images show up.
 * @param {string} postUrl The URL of the instagram post to get the images from
 * @returns {Promise<string[]>} An array of Facebook CDN strings pointing to the images
 * @throws {Error} If a redirect response was seen (probably a rate-limit) or
 * if no images could be found at the URL
 */
 const getImagesUrls = async (postUrl) => {
  const page = await (await browser).newPage();

  // The try/finally guarantees the tab is closed on every exit path; without
  // it each failed scrape would leave an open page behind in the browser
  try {

    // An exception thrown inside an event listener never reaches whoever is
    // awaiting this function, so the listener only records the redirect and
    // the checks below raise the error from the async flow instead
    let sawRedirect = false;
    page.on('response', res => {
      const code = res.status();
      if (code >= 300 && code <= 399) sawRedirect = true;
    });

    const assertNoRedirect = () => {
      if (sawRedirect) {
        throw new Error("Instagram redirect. Probably a rate-limit.");
      }
    };

    await page.setUserAgent(agents[Math.floor(Math.random() * agents.length)]);
    await page.goto(postUrl);
    assertNoRedirect();

    // Only 'visible' is requested: asking for 'hidden' and 'visible' at the
    // same time is contradictory, and the code below needs the article present
    await page.waitForSelector('article[role="presentation"]', {
      visible: true,
      timeout: 5_000
    });

    const div = await page.$('article[role="presentation"]>div:nth-of-type(2)');
    if (!div) {
      // page.$ resolves to null when nothing matches the selector
      throw new Error("Could not find images at URL");
    }

    // Resolves to the first element whose async predicate is truthy, or to
    // undefined when none is (findIndex gives -1, and arr[-1] is undefined)
    // https://stackoverflow.com/a/55601090/10549827
    const findAsync = async (arr, callback) => {
      const results = await Promise.all(arr.map(callback));
      return arr[results.findIndex(Boolean)];
    };

    const allImages = [];

    // Basically a 'while true' loop, but since there can only be 10 images in
    // an album post, we can add this hard-defined safeguard, just in case
    const maxSteps = 10;
    for (let run = 0; run <= maxSteps; run++) {
      assertNoRedirect();

      // Find all the currently loaded images in the main img container
      const images = await div.$$eval('img', imgTags => {
        return Array.from(imgTags).map(img => img.src);
      });

      // If we found no images for some reason
      if (images == null || images.length === 0) {
        throw new Error("Could not find images at URL");
      }

      // Add all the images we didn't get before
      let added = 0;
      for (const src of images) {
        if (!allImages.includes(src)) {
          allImages.push(src);
          added += 1;
        }
      }

      // If we found them all (the 'next' image is always loaded, so we know we
      // can stop if we don't get anything past this one)
      if (added === 0) break;

      // Check if there are any (<) or (>) buttons to push. $$ always resolves
      // to an array, so "no buttons" has to be detected through its length
      const buttons = await div.$$('div[role="presentation"]~button');
      if (buttons.length === 0) break; // this was only one image. done

      // Check which of the buttons we found is the (>) button (vs the (<) button)
      const button = await findAsync(buttons, handle => {
        return page.evaluate(b => {
          const style = getComputedStyle(b);
          // the right button is absolutely positioned relative to the right edge,
          // so even if 'left' is not set, it has a value in the *computed* style
          return (parseFloat(style.right) < parseFloat(style.left));
        }, handle);
      });

      if (!button) break; // couldn't find a right button, so we are done

      // Click the button and wait for it to fire a network request
      await Promise.all([
        page.waitForResponse(res => res.status() === 200),
        button.click()
      ]);

    }

    return allImages;

  } finally {
    await page.close();
  }
 }


 // GET /image/<post url>: scrapes the given post, downloads every image found
 // and answers with an HTML page that embeds them as base64 data URIs.
 // Any failure is reported as HTTP 400 with a JSON body `{ "error": message }`.
 // NOTE(review): the URL comes straight from the request path and is opened by
 // the browser / fetched by this server without validation; restrict it to
 // Instagram hosts before exposing this endpoint beyond trusted users.
 app.get('/image/:url(*)', async (req, res) => {

  console.log(`Received request for '${req.params.url}'...`);

  try {

    const URLs = await getImagesUrls(req.params.url);

    console.log('Found images:', URLs);

    const base64 = await Promise.all(URLs.map(async url => {
      const response = await fetch(url);
      // Without this check an error page would be base64-encoded as an "image"
      if (!response.ok) {
        throw new Error(`Could not download image (HTTP ${response.status})`);
      }
      const buffer = await response.buffer();
      return buffer.toString('base64');
    }));

    res.send(`
      <html>
      <head>
        <title>Here's those pics you ordered dawg 🖼</title>
      </head>
      <body>
        ${base64.map(s => `<img src="data:image/jpeg;base64,${s}" />`).join('')}
      </body>
      </html>
    `);

  } catch (error) {

    console.error(error);

    // An Error's message and stack are non-enumerable, so passing the Error
    // itself to res.json() would send '{}'; send the message explicitly
    const message = error instanceof Error ? error.message : String(error);
    res.status(400).json({ error: message });

  }

 });


 // Once Puppeteer is running, start the server. If the launch fails, log the
 // reason and exit instead of leaving an unhandled promise rejection
 browser.then(() => {
  const port = process.env.PORT || 3000;

  const server = app.listen(port, () => {
    console.log(`Express is running on port ${port}`);
  });

  // Registering a SIGINT listener disables Node's default exit-on-Ctrl+C, so
  // the handler has to stop the server and end the process itself; otherwise
  // the listening socket keeps the process alive after the browser is closed
  process.on('SIGINT', async () => {
    let exitCode = 0;
    server.close();
    try {
      await (await browser).close();
    } catch (error) {
      console.error('Failed to close Puppeteer:', error);
      exitCode = 1;
    }
    process.exit(exitCode);
  });
 }).catch(error => {
  console.error('Failed to start:', error);
  process.exit(1);
 });
diff --git a/package.json b/package.json
 {
  "name": "instagram-scrape",
  "version": "1.0.1",
  "description": "Scrape some stuff, I guess 🤷🏻‍♂️",
  "main": "index.js",
  "scripts": {
    "start": "node index.js"
  },
  "keywords": [ "instagram", "scrape" ],
  "author": {
    "name": "Matthew Brown",
    "url": "https://github.com/matthew-e-brown"
  },
  "license": "MIT",
  "dependencies": {
    "express": "^4.17.1",
    "node-fetch": "^2.6.1",
    "puppeteer": "^10.2.0"
  }
 }
	// Pool of browser user-agent strings. One entry is picked at random from this
	// list; the more entries there are, the harder it is for the rate limiter to
	// detect you
	const userAgents = [
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
		"Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9",
		"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36",
		"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
	];

	module.exports = userAgents;
	const puppeteer = require('puppeteer');
	const express = require('express');
	const fetch = require('node-fetch');

	const agents = require('./agents');

	// Start launching the browser as soon as the module loads. `browser` holds
	// the pending launch promise, so every consumer has to await it before use
	console.log('Starting Puppeteer...');
	const browser = puppeteer.launch();

	// HTTP application the routes below are registered on
	const app = express();

	/**
	* Scrapes an Instagram post URL (instagram.com/p/abcd1234/), stepping through
	* the album carousel until no new images show up.
	* @param {string} postUrl The URL of the instagram post to get the images from
	* @returns {Promise<string[]>} An array of Facebook CDN strings pointing to the images
	* @throws {Error} If a redirect response was seen (probably a rate-limit) or
	* if no images could be found at the URL
	*/
	const getImagesUrls = async (postUrl) => {
		const page = await (await browser).newPage();

		// The try/finally guarantees the tab is closed on every exit path; without
		// it each failed scrape would leave an open page behind in the browser
		try {

			// An exception thrown inside an event listener never reaches whoever is
			// awaiting this function, so the listener only records the redirect and
			// the checks below raise the error from the async flow instead
			let sawRedirect = false;
			page.on('response', res => {
				const code = res.status();
				if (code >= 300 && code <= 399) sawRedirect = true;
			});

			const assertNoRedirect = () => {
				if (sawRedirect) {
					throw new Error("Instagram redirect. Probably a rate-limit.");
				}
			};

			await page.setUserAgent(agents[Math.floor(Math.random() * agents.length)]);
			await page.goto(postUrl);
			assertNoRedirect();

			// Only 'visible' is requested: asking for 'hidden' and 'visible' at the
			// same time is contradictory, and the code below needs the article present
			await page.waitForSelector('article[role="presentation"]', {
				visible: true,
				timeout: 5_000
			});

			const div = await page.$('article[role="presentation"]>div:nth-of-type(2)');
			if (!div) {
				// page.$ resolves to null when nothing matches the selector
				throw new Error("Could not find images at URL");
			}

			// Resolves to the first element whose async predicate is truthy, or to
			// undefined when none is (findIndex gives -1, and arr[-1] is undefined)
			// https://stackoverflow.com/a/55601090/10549827
			const findAsync = async (arr, callback) => {
				const results = await Promise.all(arr.map(callback));
				return arr[results.findIndex(Boolean)];
			};

			const allImages = [];

			// Basically a 'while true' loop, but since there can only be 10 images in
			// an album post, we can add this hard-defined safeguard, just in case
			const maxSteps = 10;
			for (let run = 0; run <= maxSteps; run++) {
				assertNoRedirect();

				// Find all the currently loaded images in the main img container
				const images = await div.$$eval('img', imgTags => {
					return Array.from(imgTags).map(img => img.src);
				});

				// If we found no images for some reason
				if (images == null || images.length === 0) {
					throw new Error("Could not find images at URL");
				}

				// Add all the images we didn't get before
				let added = 0;
				for (const src of images) {
					if (!allImages.includes(src)) {
						allImages.push(src);
						added += 1;
					}
				}

				// If we found them all (the 'next' image is always loaded, so we know we
				// can stop if we don't get anything past this one)
				if (added === 0) break;

				// Check if there are any (<) or (>) buttons to push. $$ always resolves
				// to an array, so "no buttons" has to be detected through its length
				const buttons = await div.$$('div[role="presentation"]~button');
				if (buttons.length === 0) break; // this was only one image. done

				// Check which of the buttons we found is the (>) button (vs the (<) button)
				const button = await findAsync(buttons, handle => {
					return page.evaluate(b => {
						const style = getComputedStyle(b);
						// the right button is absolutely positioned relative to the right edge,
						// so even if 'left' is not set, it has a value in the *computed* style
						return (parseFloat(style.right) < parseFloat(style.left));
					}, handle);
				});

				if (!button) break; // couldn't find a right button, so we are done

				// Click the button and wait for it to fire a network request
				await Promise.all([
					page.waitForResponse(res => res.status() === 200),
					button.click()
				]);

			}

			return allImages;

		} finally {
			await page.close();
		}
	}


	// GET /image/<post url>: scrapes the given post, downloads every image found
	// and answers with an HTML page that embeds them as base64 data URIs.
	// Any failure is reported as HTTP 400 with a JSON body `{ "error": message }`.
	// NOTE(review): the URL comes straight from the request path and is opened by
	// the browser / fetched by this server without validation; restrict it to
	// Instagram hosts before exposing this endpoint beyond trusted users.
	app.get('/image/:url(*)', async (req, res) => {

		console.log(`Received request for '${req.params.url}'...`);

		try {

			const URLs = await getImagesUrls(req.params.url);

			console.log('Found images:', URLs);

			const base64 = await Promise.all(URLs.map(async url => {
				const response = await fetch(url);
				// Without this check an error page would be base64-encoded as an "image"
				if (!response.ok) {
					throw new Error(`Could not download image (HTTP ${response.status})`);
				}
				const buffer = await response.buffer();
				return buffer.toString('base64');
			}));

			res.send(`
	<html>
	<head>
	<title>Here's those pics you ordered dawg 🖼</title>
	</head>
	<body>
	${base64.map(s => `<img src="data:image/jpeg;base64,${s}" />`).join('')}
	</body>
	</html>
	`);

		} catch (error) {

			console.error(error);

			// An Error's message and stack are non-enumerable, so passing the Error
			// itself to res.json() would send '{}'; send the message explicitly
			const message = error instanceof Error ? error.message : String(error);
			res.status(400).json({ error: message });

		}

	});


	// Once Puppeteer is running, start the server. If the launch fails, log the
	// reason and exit instead of leaving an unhandled promise rejection
	browser.then(() => {
		const port = process.env.PORT || 3000;

		const server = app.listen(port, () => {
			console.log(`Express is running on port ${port}`);
		});

		// Registering a SIGINT listener disables Node's default exit-on-Ctrl+C, so
		// the handler has to stop the server and end the process itself; otherwise
		// the listening socket keeps the process alive after the browser is closed
		process.on('SIGINT', async () => {
			let exitCode = 0;
			server.close();
			try {
				await (await browser).close();
			} catch (error) {
				console.error('Failed to close Puppeteer:', error);
				exitCode = 1;
			}
			process.exit(exitCode);
		});
	}).catch(error => {
		console.error('Failed to start:', error);
		process.exit(1);
	});
	{
	"name": "instagram-scrape",
	"version": "1.0.1",
	"description": "Scrape some stuff, I guess 🤷🏻‍♂️",
	"main": "index.js",
	"scripts": {
	"start": "node index.js"
	},
	"keywords": [ "instagram", "scrape" ],
	"author": {
	"name": "Matthew Brown",
	"url": "https://github.com/matthew-e-brown"
	},
	"license": "MIT",
	"dependencies": {
	"express": "^4.17.1",
	"node-fetch": "^2.6.1",
	"puppeteer": "^10.2.0"
	}
	}