Created
March 3, 2014 04:40
-
-
Save neerolyte/9318476 to your computer and use it in GitHub Desktop.
Quick-and-dirty PHP code that tries to extract high-level subpaths from Google's index.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Fetch a URL with cURL and return the response split into headers and body.
 *
 * The transfer is made with CURLOPT_HEADER enabled, so the raw result holds
 * the response headers followed by the body; the two are separated using the
 * "header_size" value reported by curl_getinfo().
 *
 * @param string $url URL to request.
 *
 * @return array Associative array with the keys:
 *               - "info":    result of curl_getinfo() for the transfer
 *               - "header":  raw response headers ('' if the transfer failed)
 *               - "body":    response body ('' if the transfer failed)
 *               - "error":   curl_error() message ('' when there was no error)
 *               - "errorno": curl_errno() code (0 when there was no error)
 */
function docurl($url) {
    $opts = array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_URL => $url,
        CURLOPT_HEADER => true,
    );
    $ch = curl_init();
    curl_setopt_array($ch, $opts);

    $data = array();
    $res = curl_exec($ch);
    $data['info'] = curl_getinfo($ch);

    if ($res === false) {
        // curl_exec() returns false on failure. Do not feed that to substr();
        // hand callers empty strings so "header" and "body" are always strings.
        $data['header'] = '';
        $data['body'] = '';
    } else {
        $headerSize = isset($data['info']['header_size'])
            ? (int) $data['info']['header_size']
            : 0;
        $data['header'] = (string) substr($res, 0, $headerSize);
        // substr() can return false on older PHP versions when the offset is
        // at the end of the string (empty body); normalise that to ''.
        $data['body'] = (string) substr($res, $headerSize);
    }

    $data['error'] = curl_error($ch);
    $data['errorno'] = curl_errno($ch);
    curl_close($ch);

    return $data;
}
/**
 * Run one Google "site:" search and collect the top-level sub-paths that
 * appear in the <cite> elements of the result page.
 *
 * Every entry of $ignores is appended to the query as a "-site:" exclusion.
 * The function prints the URL it is about to request.
 *
 * @param string $main    Site (host, optionally with a path) to search in.
 * @param array  $ignores Paths to exclude from the results.
 * @param string $filter  Extra search terms appended to the query as-is.
 * @param int    $start   Result offset passed as the "start" query parameter.
 *
 * @return array Unique cite values, each cut down to "host/first-segment".
 *               Empty when the request produced no body.
 */
function extractSubPaths($main, $ignores, $filter = '', $start = 0) {
    $ignores = array_map(function($v) { return "+-site:$v"; }, $ignores);
    $url = "http://www.google.com/search?q="
        ."site:$main+$filter"
        .implode('', $ignores)
        ."&start=$start";
    echo "Testing URL: $url\n";

    $res = docurl($url);
    if (!is_string($res['body']) || $res['body'] === '') {
        // Failed or empty response: DOMDocument::loadHTML() rejects an empty
        // string (ValueError on PHP 8), and there is nothing to parse anyway.
        return array();
    }

    $doc = new DOMDocument();
    $doc->loadHTML($res['body']);
    $xpath = new DOMXPath($doc);
    $nodes = $xpath->query("//cite");

    $cites = array();
    foreach ($nodes as $node) {
        $cite = trim($node->nodeValue);
        // Drop a leading "scheme://" first; otherwise its two slashes are
        // counted below and the whole cite collapses to e.g. "https:/".
        $cite = preg_replace('%^[a-z][a-z0-9+.-]*://%i', '', $cite);
        // remove everything after the second slash
        $cite = preg_replace('%^([^/]*/[^/]*)/.*$%', '\1', $cite);
        if ($cite !== '' && $cite !== null) {
            $cites[] = $cite;
        }
    }

    return array_unique($cites);
}
/**
 * Ask Google how many results it reports for "site:$path".
 *
 * The number is read from the first text node of the element with
 * id="resultStats" on the result page.
 *
 * @param string $path Site/path to pass to the "site:" operator.
 *
 * @return int Reported result count, or 0 when the page could not be fetched
 *             or no count could be found in it.
 */
function extractPathCount($path) {
    $url = "http://www.google.com/search?q="
        ."site:$path";

    $res = docurl($url);
    if (!is_string($res['body']) || $res['body'] === '') {
        // Nothing to parse; loadHTML() would reject an empty string.
        return 0;
    }

    $doc = new DOMDocument();
    $doc->loadHTML($res['body']);
    $xpath = new DOMXPath($doc);
    $nodes = $xpath->query("//div[@id='resultStats']/text()");

    // The stats element may be missing from the page.
    $node = ($nodes !== false) ? $nodes->item(0) : null;
    if ($node === null) {
        return 0;
    }

    // Take the first run of digits (with thousands separators) instead of
    // requiring the exact text "About N results"; surrounding whitespace or
    // different wording would otherwise turn a valid count into 0.
    if (!preg_match('%[0-9][0-9,]*%', $node->wholeText, $m)) {
        return 0;
    }

    return intval(str_replace(',', '', $m[0]));
}
// start up
//
// Usage: php <script> <site> [filter]
//
// Repeatedly searches Google for pages under <site>, excluding every sub-path
// already discovered, until a round of searches turns up nothing new. Then
// prints each discovered sub-path together with its reported result count.

// Keep libxml from emitting a PHP warning for every piece of malformed markup
// in the fetched HTML; errors are collected internally instead.
libxml_use_internal_errors(true);

if (!isset($argv[1]) || $argv[1] === '') {
    fwrite(STDERR, "Usage: php ".basename(__FILE__)." <site> [filter]\n");
    exit(1);
}

$start = $argv[1];
$filter = isset($argv[2]) ? $argv[2] : '';
$paths = array();

do {
    $knownBefore = count($paths);

    // try a couple of offsets every time to pick up a couple more results
    for ($i = 0; $i < 50; $i += 10) {
        $newPaths = extractSubPaths($start, $paths, $filter, $i);
        // De-duplicate right away so the exclusion list sent with the next
        // request does not contain the same path several times.
        $paths = array_values(array_unique(array_merge($paths, $newPaths)));
    }

    sleep(1);

    // Continue only while the round as a whole discovered something new.
    // Looking at just the last offset's result would stop too early when that
    // page is empty, and would never stop when it keeps returning known paths.
} while (count($paths) > $knownBefore);

foreach ($paths as $path) {
    echo "$path: ".extractPathCount($path)."\n";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment