neerolyte · March 3, 2014 04:40
diff --git a/google_sub_path_finder.php b/google_sub_path_finder.php
 <?php
 /**
  * Request a URL with cURL and return the response broken into its parts.
  *
  * The response headers are requested together with the body (CURLOPT_HEADER)
  * and the two are separated using the header size reported by curl_getinfo().
  *
  * @param string $url URL to request.
  * @return array Keys, in order: 'info' (result of curl_getinfo()), 'header'
  *               and 'body' (the two parts of the raw response), 'error' and
  *               'errorno' (results of curl_error() and curl_errno()).
  */
 function docurl($url) {
 	$handle = curl_init();
 	curl_setopt_array($handle, array(
 		CURLOPT_RETURNTRANSFER => true,
 		CURLOPT_URL => $url,
 		CURLOPT_HEADER => true,
 	));
 
 	$raw = curl_exec($handle);
 	$info = curl_getinfo($handle);
 
 	// Everything before this offset is the header block, the rest is the body.
 	$headerSize = $info['header_size'];
 
 	$result = array(
 		'info' => $info,
 		'header' => substr($raw, 0, $headerSize),
 		'body' => substr($raw, $headerSize),
 		'error' => curl_error($handle),
 		'errorno' => curl_errno($handle),
 	);
 
 	curl_close($handle);
 
 	return $result;
 }
 
 /**
  * Run one Google "site:" search and collect the sub-paths seen in the results.
  *
  * The query restricts results to $main, appends $filter, and excludes every
  * entry of $ignores with a "-site:" term. The text of each <cite> element in
  * the returned page is cut down to everything before its second slash.
  *
  * @param string $main    Site to search under (used in the "site:" term).
  * @param array  $ignores Sub-paths to exclude from the results.
  * @param string $filter  Extra search terms appended to the query.
  * @param int    $start   Result offset sent as the "start" query parameter.
  * @return array Distinct sub-paths found on the page; empty when the page
  *               could not be fetched or parsed.
  */
 function extractSubPaths($main, $ignores, $filter = '', $start = 0) {
 	$ignores = array_map(function($v) { return "+-site:$v"; }, $ignores);
 	$url = "http://www.google.com/search?q="
 		."site:$main+$filter"
 		.implode('', $ignores)
 		."&start=$start";
 
 	echo "Testing URL: $url\n";
 
 	$res = docurl($url);
 
 	// DOMDocument::loadHTML() rejects an empty string (a ValueError as of
 	// PHP 8), so treat a failed or empty response as "no results".
 	if (!is_string($res['body']) || $res['body'] === '') {
 		error_log("No response body for $url: ".$res['error']);
 		return array();
 	}
 
 	$doc = new DOMDocument();
 	if (!$doc->loadHTML($res['body'])) {
 		return array();
 	}
 	$xpath = new DOMXPath($doc);
 	$nodes = $xpath->query("//cite");
 
 	$cites = array();
 	foreach ($nodes as $node) {
 		$cite = $node->nodeValue;
 		// drop a leading scheme so its "//" is not counted as path slashes
 		$cite = preg_replace('%^[a-z][a-z0-9+.-]*://%i', '', $cite);
 		// remove everything after the second slash
 		$cite = preg_replace('%^([^/]*/[^/]*)/.*$%', '\1', $cite);
 		$cites []= $cite;
 	}
 
 	$cites = array_unique($cites);
 
 	return $cites;
 }

 /**
  * Look up the result count Google reports for a "site:$path" search.
  *
  * The count is read from the first text node of the element with id
  * "resultStats" and parsed from text of the form "About 1,234 results".
  *
  * @param string $path Site or sub-path to count results under.
  * @return int Parsed result count; 0 when the page could not be fetched or
  *             parsed, or when no count could be read from it.
  */
 function extractPathCount($path) {
 	$url = "http://www.google.com/search?q="
 		."site:$path";
 
 	// echo "Extracting count under: $url ... ";
 
 	$res = docurl($url);
 
 	// DOMDocument::loadHTML() rejects an empty string (a ValueError as of
 	// PHP 8), so a failed or empty response is reported as a count of 0.
 	if (!is_string($res['body']) || $res['body'] === '') {
 		error_log("No response body for $url: ".$res['error']);
 		return 0;
 	}
 
 	$doc = new DOMDocument();
 	if (!$doc->loadHTML($res['body'])) {
 		return 0;
 	}
 	$xpath = new DOMXPath($doc);
 	$nodes = $xpath->query("//div[@id='resultStats']/text()");
 
 	// item(0) is null when the page has no result-stats text node.
 	$statsNode = ($nodes === false) ? null : $nodes->item(0);
 	if ($statsNode === null) {
 		return 0;
 	}
 	$countText = $statsNode->wholeText;
 
 	$count = preg_replace('%^About ([0-9,]+) results$%', '\1', $countText);
 	// strip the thousands separators before converting to an integer
 	$count = intval(preg_replace('%,%', '', $count));
 
 	// echo "$count\n";
 
 	return $count;
 }

 // start up
 // Collect libxml parse errors internally instead of emitting PHP warnings.
 libxml_use_internal_errors(true);
 
 // The site to inspect is mandatory; without it every query would be empty.
 if (!isset($argv[1]) || $argv[1] === '') {
 	file_put_contents('php://stderr', "Usage: php {$argv[0]} <site> [filter]\n");
 	exit(1);
 }
 
 $start = $argv[1];
 $filter = isset($argv[2])?$argv[2]:'';
 $paths = array();
 
 do {
 	$knownCount = count($paths);
 
 	// try a couple of offsets every time to pick up a couple more results
 	for ($i = 0; $i < 50; $i+=10) {
 		$newPaths = extractSubPaths($start, $paths, $filter, $i);
 		$paths = array_merge($paths, $newPaths);
 	}
 
 	$paths = array_unique($paths);
 
 	// Keep going only while a pass discovered something new. Checking just
 	// the last offset's result would stop early when that one page is empty
 	// and would never stop when the same paths keep coming back.
 	$foundNew = count($paths) > $knownCount;
 
 	sleep(1);
 } while ($foundNew);
 
 // Report the result count for every sub-path found.
 foreach ($paths as $path) {
 	echo "$path: ".extractPathCount($path)."\n";
 }
 
 // echo implode("\n", $paths);
	<?php
	/**
	 * Request a URL with cURL and return the response broken into its parts.
	 *
	 * The response headers are requested together with the body (CURLOPT_HEADER)
	 * and the two are separated using the header size reported by curl_getinfo().
	 *
	 * @param string $url URL to request.
	 * @return array Keys, in order: 'info' (result of curl_getinfo()), 'header'
	 *               and 'body' (the two parts of the raw response), 'error' and
	 *               'errorno' (results of curl_error() and curl_errno()).
	 */
	function docurl($url) {
		$curl = curl_init();
		curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curl, CURLOPT_URL, $url);
		curl_setopt($curl, CURLOPT_HEADER, true);

		$response = curl_exec($curl);

		$parts = array();
		$parts['info'] = curl_getinfo($curl);

		// Everything before this offset is the header block, the rest is the body.
		$splitAt = $parts['info']['header_size'];
		$parts['header'] = substr($response, 0, $splitAt);
		$parts['body'] = substr($response, $splitAt);

		$parts['error'] = curl_error($curl);
		$parts['errorno'] = curl_errno($curl);

		curl_close($curl);

		return $parts;
	}

	/**
	 * Run one Google "site:" search and collect the sub-paths seen in the results.
	 *
	 * The query restricts results to $main, appends $filter, and excludes every
	 * entry of $ignores with a "-site:" term. The text of each <cite> element in
	 * the returned page is cut down to everything before its second slash.
	 *
	 * @param string $main    Site to search under (used in the "site:" term).
	 * @param array  $ignores Sub-paths to exclude from the results.
	 * @param string $filter  Extra search terms appended to the query.
	 * @param int    $start   Result offset sent as the "start" query parameter.
	 * @return array Distinct sub-paths found on the page; empty when the page
	 *               could not be fetched or parsed.
	 */
	function extractSubPaths($main, $ignores, $filter = '', $start = 0) {
		$ignores = array_map(function($v) { return "+-site:$v"; }, $ignores);
		$url = "http://www.google.com/search?q="
			."site:$main+$filter"
			.implode('', $ignores)
			."&start=$start";

		echo "Testing URL: $url\n";

		$res = docurl($url);

		// DOMDocument::loadHTML() rejects an empty string (a ValueError as of
		// PHP 8), so treat a failed or empty response as "no results".
		if (!is_string($res['body']) || $res['body'] === '') {
			error_log("No response body for $url: ".$res['error']);
			return array();
		}

		$doc = new DOMDocument();
		if (!$doc->loadHTML($res['body'])) {
			return array();
		}
		$xpath = new DOMXPath($doc);
		$nodes = $xpath->query("//cite");

		$cites = array();
		foreach ($nodes as $node) {
			$cite = $node->nodeValue;
			// drop a leading scheme so its "//" is not counted as path slashes
			$cite = preg_replace('%^[a-z][a-z0-9+.-]*://%i', '', $cite);
			// remove everything after the second slash; the "*" quantifiers are
			// required so that segments of any length are matched
			$cite = preg_replace('%^([^/]*/[^/]*)/.*$%', '\1', $cite);
			$cites []= $cite;
		}

		$cites = array_unique($cites);

		return $cites;
	}

	/**
	 * Look up the result count Google reports for a "site:$path" search.
	 *
	 * The count is read from the first text node of the element with id
	 * "resultStats" and parsed from text of the form "About 1,234 results".
	 *
	 * @param string $path Site or sub-path to count results under.
	 * @return int Parsed result count; 0 when the page could not be fetched or
	 *             parsed, or when no count could be read from it.
	 */
	function extractPathCount($path) {
		$url = "http://www.google.com/search?q="
			."site:$path";

		// echo "Extracting count under: $url ... ";

		$res = docurl($url);

		// DOMDocument::loadHTML() rejects an empty string (a ValueError as of
		// PHP 8), so a failed or empty response is reported as a count of 0.
		if (!is_string($res['body']) || $res['body'] === '') {
			error_log("No response body for $url: ".$res['error']);
			return 0;
		}

		$doc = new DOMDocument();
		if (!$doc->loadHTML($res['body'])) {
			return 0;
		}
		$xpath = new DOMXPath($doc);
		$nodes = $xpath->query("//div[@id='resultStats']/text()");

		// item(0) is null when the page has no result-stats text node.
		$statsNode = ($nodes === false) ? null : $nodes->item(0);
		if ($statsNode === null) {
			return 0;
		}
		$countText = $statsNode->wholeText;

		$count = preg_replace('%^About ([0-9,]+) results$%', '\1', $countText);
		// strip the thousands separators before converting to an integer
		$count = intval(preg_replace('%,%', '', $count));

		// echo "$count\n";

		return $count;
	}

	// start up
	// Collect libxml parse errors internally instead of emitting PHP warnings.
	libxml_use_internal_errors(true);

	// The site to inspect is mandatory; without it every query would be empty.
	if (!isset($argv[1]) || $argv[1] === '') {
		file_put_contents('php://stderr', "Usage: php {$argv[0]} <site> [filter]\n");
		exit(1);
	}

	$start = $argv[1];
	$filter = isset($argv[2])?$argv[2]:'';
	$paths = array();

	do {
		$knownCount = count($paths);

		// try a couple of offsets every time to pick up a couple more results
		for ($i = 0; $i < 50; $i+=10) {
			$newPaths = extractSubPaths($start, $paths, $filter, $i);
			$paths = array_merge($paths, $newPaths);
		}

		$paths = array_unique($paths);

		// Keep going only while a pass discovered something new. Checking just
		// the last offset's result would stop early when that one page is empty
		// and would never stop when the same paths keep coming back.
		$foundNew = count($paths) > $knownCount;

		sleep(1);
	} while ($foundNew);

	// Report the result count for every sub-path found.
	foreach ($paths as $path) {
		echo "$path: ".extractPathCount($path)."\n";
	}

	// echo implode("\n", $paths);