Skip to content

Instantly share code, notes, and snippets.

@neerolyte
Last active December 22, 2015 01:19

Revisions

  1. neerolyte revised this gist Sep 17, 2013. 1 changed file with 58 additions and 55 deletions.
    113 changes: 58 additions & 55 deletions gistfile1.php
    Original file line number Diff line number Diff line change
    @@ -1,64 +1,67 @@
    <?php
    /**
     * Fetch a URL with cURL and return the response split into headers and body.
     *
     * @param string $url URL to request.
     * @return array Associative array with the keys:
     *               - 'info'    => result of curl_getinfo() for the transfer,
     *               - 'header'  => raw response headers ('' if the transfer failed),
     *               - 'body'    => response body ('' if the transfer failed),
     *               - 'error'   => curl_error() message ('' on success),
     *               - 'errorno' => curl_errno() code (0 on success).
     */
    function docurl($url) {
        $ch = curl_init();
        curl_setopt_array($ch, array(
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_URL => $url,
            // Headers are returned in the same string as the body and split off below.
            CURLOPT_HEADER => true,
        ));

        $res = curl_exec($ch);

        $data = array();
        $data['info'] = curl_getinfo($ch);

        if ($res === false) {
            // curl_exec() returns false on failure; there is nothing to split,
            // so expose empty strings rather than feeding false to substr().
            $data['header'] = '';
            $data['body'] = '';
        } else {
            $headerSize = $data['info']['header_size'];
            // Cast because older PHP versions make substr() return false for an
            // empty slice; callers should always get strings.
            $data['header'] = (string) substr($res, 0, $headerSize);
            $data['body'] = (string) substr($res, $headerSize);
        }

        $data['error'] = curl_error($ch);
        $data['errorno'] = curl_errno($ch);

        curl_close($ch);

        return $data;
    }

    /**
     * Run one Google "site:" search and collect the host names it cites.
     *
     * The query is "site:$main", optionally followed by $filter, followed by a
     * "-site:<host>" exclusion for every entry in $ignores. The URL that is
     * requested is echoed before the request is made.
     *
     * @param string $main    Domain to search within.
     * @param array  $ignores Host names to exclude from the results.
     * @param string $filter  Optional extra query fragment; appended verbatim
     *                        (not URL-encoded), so it must already be URL-safe.
     * @return array List of unique host names taken from the <cite> elements of
     *               the result page; empty when the page could not be fetched.
     */
    function extractSubs($main, $ignores, $filter = '') {
        // Encode host names so unexpected characters cannot break the query string.
        $query = 'site:' . urlencode($main);
        if ($filter !== '') {
            $query .= "+$filter";
        }
        foreach ($ignores as $ignore) {
            $query .= '+-site:' . urlencode($ignore);
        }
        $url = "http://www.google.com/search?q=" . $query;

        echo "Testing URL: $url\n";

        $res = docurl($url);

        // A failed request yields no usable body; DOMDocument::loadHTML() rejects
        // an empty string, so report "nothing found" instead.
        if (!is_string($res['body']) || $res['body'] === '') {
            return array();
        }

        $doc = new DOMDocument();
        $doc->loadHTML($res['body']);
        $xpath = new DOMXPath($doc);

        $cites = array();
        foreach ($xpath->query("//cite") as $node) {
            // Reduce the cited URL to its host: drop the scheme, then the path.
            $cite = preg_replace('%^https?://%', '', $node->nodeValue);
            $cite = preg_replace("%/.*%", '', $cite);
            if (!in_array($cite, $cites, true)) {
                $cites[] = $cite;
            }
        }

        return $cites;
    }
    // Keep libxml parse errors internal so malformed result pages do not spam warnings.
    libxml_use_internal_errors(true);

    // Usage: <script> <domain> [filter]
    if (!isset($argv[1]) || $argv[1] === '') {
        fwrite(STDERR, "Usage: php <script> <domain> [filter]\n");
        exit(1);
    }

    $start = $argv[1];
    $filter = isset($argv[2]) ? $argv[2] : '';
    $subs = array();

    // Repeat the search, excluding every host found so far, until a pass
    // turns up no host that is not already known.
    do {
        $found = extractSubs($start, $subs, $filter);
        // Only count hosts not seen before; otherwise a search that keeps
        // returning known hosts would grow the list and never terminate.
        $newSubs = array_values(array_diff($found, $subs));
        $subs = array_merge($subs, $newSubs);

        // Pause between requests.
        sleep(1);
    } while (!empty($newSubs));

    echo implode("\n", $subs);
  2. neerolyte renamed this gist Aug 31, 2013. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  3. neerolyte created this gist Aug 31, 2013.
    64 changes: 64 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,64 @@
    <?php
    /**
     * Perform a cURL request for the given URL and return the pieces of the response.
     *
     * @param string $url URL to request.
     * @return array Associative array with the keys:
     *               - 'info'    => result of curl_getinfo() for the transfer,
     *               - 'header'  => raw response headers ('' if the transfer failed),
     *               - 'body'    => response body ('' if the transfer failed),
     *               - 'error'   => curl_error() message ('' on success),
     *               - 'errorno' => curl_errno() code (0 on success).
     */
    function docurl($url) {
        $opts = array(
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_URL => $url,
            // Include the headers in the returned string; they are split off below.
            CURLOPT_HEADER => true,
        );

        $ch = curl_init();
        curl_setopt_array($ch, $opts);

        $res = curl_exec($ch);

        $data = array(
            'info' => curl_getinfo($ch),
            'header' => '',
            'body' => '',
        );

        // curl_exec() returns false on failure; only split a real response so
        // 'header' and 'body' are always strings.
        if (is_string($res)) {
            $headerSize = $data['info']['header_size'];
            // Cast because older PHP versions make substr() return false for an
            // empty slice.
            $data['header'] = (string) substr($res, 0, $headerSize);
            $data['body'] = (string) substr($res, $headerSize);
        }

        $data['error'] = curl_error($ch);
        $data['errorno'] = curl_errno($ch);

        curl_close($ch);

        return $data;
    }

    /**
     * Run one Google "site:" search and collect the host names it cites.
     *
     * The query is "site:$main" followed by a "-site:<host>" exclusion for
     * every entry in $ignores.
     *
     * @param string $main    Domain to search within.
     * @param array  $ignores Host names to exclude from the results.
     * @return array List of unique host names taken from the <cite> elements of
     *               the result page; empty when the page could not be fetched.
     */
    function extractSubs($main, $ignores) {
        // Encode host names so unexpected characters cannot break the query string.
        $exclusions = array_map(
            function($v) { return '+-site:' . urlencode($v); },
            $ignores
        );
        $url = "http://www.google.com/search?q="
            . 'site:' . urlencode($main)
            . implode('', $exclusions);

        $res = docurl($url);

        // A failed request yields no usable body; DOMDocument::loadHTML() rejects
        // an empty string, so report "nothing found" instead.
        if (!is_string($res['body']) || $res['body'] === '') {
            return array();
        }

        $doc = new DOMDocument();
        $doc->loadHTML($res['body']);
        $xpath = new DOMXPath($doc);
        $nodes = $xpath->query("//cite");

        $cites = array();
        foreach ($nodes as $node) {
            // Reduce the cited URL to its host: drop the scheme, then the path.
            $cite = preg_replace('%^https?://%', '', $node->nodeValue);
            $cite = preg_replace("%/.*%", '', $cite);
            if (!in_array($cite, $cites, true)) {
                $cites[] = $cite;
            }
        }

        return $cites;
    }
    // Keep libxml parse errors internal so malformed result pages do not spam warnings.
    libxml_use_internal_errors(true);

    // Usage: <script> <domain>
    if (!isset($argv[1]) || $argv[1] === '') {
        fwrite(STDERR, "Usage: php <script> <domain>\n");
        exit(1);
    }

    $start = $argv[1];
    $subs = array();

    // Repeat the search, excluding every host found so far, until a pass
    // turns up no host that is not already known.
    do {
        $found = extractSubs($start, $subs);
        // Only count hosts not seen before; otherwise a search that keeps
        // returning known hosts would grow the list and never terminate.
        $newSubs = array_values(array_diff($found, $subs));
        $subs = array_merge($subs, $newSubs);

        // Show the accumulated list after every pass.
        print_r($subs);

        // Pause between requests.
        sleep(1);
    } while (!empty($newSubs));