@noodles-v6
Forked from creaktive/mojo-crawler.pl
Created September 18, 2013 05:02

Revisions

  1. @creaktive revised this gist Feb 21, 2013. 1 changed file: mojo-crawler.pl (4 additions, 1 deletion).

     @@ -97,4 +97,7 @@ sub parse_html {
          say '';
      
          return;
     -}
     +}
     +__DATA__
     +Featured at:
     +http://blogs.perl.org/users/stas/2013/01/web-scraping-with-modern-perl-part-1.html
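     The __DATA__ trailer added here costs nothing at runtime: perl stops compiling
     at the __DATA__ token, and the text after it is only visible to code that
     explicitly reads the DATA filehandle, which this crawler never does. A
     standalone sketch of the mechanism:

         #!/usr/bin/env perl
         use strict;
         use warnings;

         # Everything below __DATA__ is skipped by the compiler;
         # it can be read back, line by line, via the DATA filehandle.
         print while <DATA>;

         __DATA__
         Featured at:
         http://blogs.perl.org/users/stas/2013/01/web-scraping-with-modern-perl-part-1.html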
  2. @creaktive revised this gist Jan 21, 2013. 1 changed file: mojo-crawler.pl (0 additions, 3 deletions).

     @@ -12,9 +12,6 @@
          http://sysd.org/page/1/
          http://sysd.org/page/2/
          http://sysd.org/page/3/
     -    http://sysd.org/page/4/
     -    http://sysd.org/page/5/
     -    http://sysd.org/page/6/
      );
      
      # Limit parallel connections to 4
  3. @creaktive revised this gist Jan 21, 2013. 1 changed file: mojo-crawler.pl (1 addition, 0 deletions).

     @@ -88,6 +88,7 @@ sub parse_html {
      
              # Access every link only once
              state $uniq = {};
     +        ++$uniq->{$url->to_string};
              next if ++$uniq->{$link->to_string} > 1;
      
              # Don't visit other hosts
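     The one-line fix records the URL of the page being parsed, not just the links
     found on it. The likely motivation: the user agent follows up to 5 redirects,
     so $tx->req->url can differ from the URL that was queued, and without this
     line a page reached via redirect would be fetched again as soon as some other
     page links to its final address. The state-hash idiom itself, as a minimal
     runnable sketch (seen() and the sample URL are illustrative only):

         use 5.010;
         use strict;
         use warnings;

         sub seen {
             my ($url) = @_;

             # One hash shared across all calls, initialized on first entry
             state $uniq = {};
             return ++$uniq->{$url} > 1;
         }

         say seen('http://sysd.org/') ? 'skip' : 'fetch' for 1 .. 2;  # "fetch", then "skip"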
  4. @creaktive revised this gist Jan 21, 2013. 1 changed file: mojo-crawler.pl (1 addition, 3 deletions).

     @@ -17,7 +17,7 @@
          http://sysd.org/page/6/
      );
      
     -# Limit parallel connections
     +# Limit parallel connections to 4
      my $max_conn = 4;
      
      # User agent following up to 5 redirects

     @@ -30,8 +30,6 @@
      
      Mojo::IOLoop->recurring(
          0 => sub {
     -
     -        # Keep up to 4 parallel crawlers sharing the same user agent
              for ($active + 1 .. $max_conn) {
      
                  # Dequeue or halt if there are no active crawlers anymore
  5. @creaktive revised this gist Jan 21, 2013. 1 changed file: mojo-crawler.pl (59 additions, 48 deletions).

     @@ -9,19 +9,69 @@
      
      # FIFO queue
      my @urls = map { Mojo::URL->new($_) } qw(
     -    http://localhost/manual/
     +    http://sysd.org/page/1/
     +    http://sysd.org/page/2/
     +    http://sysd.org/page/3/
     +    http://sysd.org/page/4/
     +    http://sysd.org/page/5/
     +    http://sysd.org/page/6/
      );
      
     +# Limit parallel connections
     +my $max_conn = 4;
     +
      # User agent following up to 5 redirects
     -my $ua = Mojo::UserAgent->new(max_redirects => 5);
     +my $ua = Mojo::UserAgent
     +    ->new(max_redirects => 5)
     +    ->detect_proxy;
      
     +# Keep track of active connections
     +my $active = 0;
     +
     +Mojo::IOLoop->recurring(
     +    0 => sub {
     +
     +        # Keep up to 4 parallel crawlers sharing the same user agent
     +        for ($active + 1 .. $max_conn) {
     +
     -sub parse {
     -    my ($tx) = @_;
     +            # Dequeue or halt if there are no active crawlers anymore
     +            return ($active or Mojo::IOLoop->stop)
     +                unless my $url = shift @urls;
     +
     +            # Fetch non-blocking just by adding
     +            # a callback and marking as active
     +            ++$active;
     +            $ua->get($url => \&get_callback);
     +        }
     +    }
     +);
     +
     +# Start event loop if necessary
     +Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
     +
     +sub get_callback {
     +    my (undef, $tx) = @_;
     +
     +    # Deactivate
     +    --$active;
     +
     +    # Parse only OK HTML responses
     +    return
     +        if not $tx->res->is_status_class(200)
     +        or $tx->res->headers->content_type !~ m{^text/html\b}ix;
      
          # Request URL
          my $url = $tx->req->url;
      
     -    say "\n$url";
     +    say $url;
     +    parse_html($url, $tx);
     +
     +    return;
     +}
     +
     +sub parse_html {
     +    my ($url, $tx) = @_;
     +
          say $tx->res->dom->at('html title')->text;
      
          # Extract and enqueue URLs

     @@ -33,13 +83,13 @@ sub parse {
      
              # "normalize" link
              $link = $link->to_abs($tx->req->url)->fragment(undef);
     -        next unless $link->protocol =~ /^https?$/x;
     +        next unless grep { $link->protocol eq $_ } qw(http https);
      
              # Don't go deeper than /a/b/c
              next if @{$link->path->parts} > 3;
      
              # Access every link only once
     -        state $uniq;
     +        state $uniq = {};
              next if ++$uniq->{$link->to_string} > 1;
      
              # Don't visit other hosts

     @@ -48,46 +98,7 @@ sub parse {
              push @urls, $link;
              say " -> $link";
          }
     +    say '';
      
          return;
      }
     -
     -# Keep track of active connections
     -my $active = 0;
     -
     -sub get_callback {
     -    my (undef, $tx) = @_;
     -
     -    # Parse only OK HTML responses
     -    $tx->res->code == 200
     -        and
     -    $tx->res->headers->content_type =~ m{^text/html\b}ix
     -        and
     -    parse($tx);
     -
     -    # Deactivate
     -    --$active;
     -
     -    return;
     -}
     -
     -Mojo::IOLoop->recurring(
     -    0 => sub {
     -
     -        # Keep up to 4 parallel crawlers sharing the same user agent
     -        for ($active + 1 .. 4) {
     -
     -            # Dequeue or halt if there are no active crawlers anymore
     -            return ($active or Mojo::IOLoop->stop)
     -                unless my $url = shift @urls;
     -
     -            # Fetch non-blocking just by adding
     -            # a callback and marking as active
     -            ++$active;
     -            $ua->get($url => \&get_callback);
     -        }
     -    }
     -);
     -
     -# Start event loop if necessary
     -Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
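     The interesting change here is structural: the scheduler moves to the top,
     parse() is split into get_callback() and parse_html(), and status checking
     switches to is_status_class(200). The core pattern is a zero-interval
     recurring timer that tops the pool up to $max_conn in-flight requests; every
     callback decrements $active, and the loop stops once the queue is drained and
     the last response has arrived. A minimal runnable sketch of just that
     scheduling skeleton (the example.* URLs and @queue are placeholders, not part
     of the gist):

         use 5.010;
         use strict;
         use warnings;
         use Mojo::IOLoop;
         use Mojo::UserAgent;

         my @queue    = qw(http://example.com/ http://example.org/);
         my $max_conn = 2;
         my $active   = 0;
         my $ua       = Mojo::UserAgent->new;

         Mojo::IOLoop->recurring(0 => sub {

             # Top the pool back up to $max_conn in-flight requests
             for ($active + 1 .. $max_conn) {

                 # Queue drained: stop once the last response is in
                 return ($active or Mojo::IOLoop->stop)
                     unless my $url = shift @queue;

                 ++$active;
                 $ua->get($url => sub {
                     my (undef, $tx) = @_;
                     --$active;
                     say $url, ' -> ', ($tx->res->code // 'no response');
                 });
             }
         });

         Mojo::IOLoop->start unless Mojo::IOLoop->is_running;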
  6. @creaktive revised this gist Jan 17, 2013. 1 changed file: mojo-crawler.pl (9 additions, 8 deletions).

     @@ -8,16 +8,13 @@
      use Mojo::UserAgent;
      
      # FIFO queue
     -my @urls = (Mojo::URL->new('http://localhost/manual/'));
     +my @urls = map { Mojo::URL->new($_) } qw(
     +    http://localhost/manual/
     +);
      
      # User agent following up to 5 redirects
      my $ua = Mojo::UserAgent->new(max_redirects => 5);
      
     -# Track accessed URLs
     -my %uniq;
     -
     -my $active = 0;
     -
      sub parse {
          my ($tx) = @_;

     @@ -42,7 +39,8 @@ sub parse {
              next if @{$link->path->parts} > 3;
      
              # Access every link only once
     -        next if ++$uniq{$link->to_string} > 1;
     +        state $uniq;
     +        next if ++$uniq->{$link->to_string} > 1;
      
              # Don't visit other hosts
              next if $link->host ne $url->host;

     @@ -54,6 +52,9 @@ sub parse {
          return;
      }
      
     +# Keep track of active connections
     +my $active = 0;
     +
      sub get_callback {
          my (undef, $tx) = @_;

     @@ -74,7 +75,7 @@ sub get_callback {
          0 => sub {
      
              # Keep up to 4 parallel crawlers sharing the same user agent
     -        for ($active .. 4 - 1) {
     +        for ($active + 1 .. 4) {
      
                  # Dequeue or halt if there are no active crawlers anymore
                  return ($active or Mojo::IOLoop->stop)
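     The last hunk is cosmetic rather than a bug fix: both spellings of the range
     run the loop 4 - $active times, the new one just reads as "from one more than
     the active count up to the limit". A quick check with the =()= count idiom:

         use 5.010;

         for my $active (0 .. 4) {
             my $old = () = $active     .. 4 - 1;  # element count, old range
             my $new = () = $active + 1 .. 4;      # element count, new range
             say "$active active: old=$old, new=$new";  # the counts always match
         }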
  7. @creaktive created this gist Dec 20, 2012. 1 changed file: mojo-crawler.pl (92 additions). The initial file in full:

     #!/usr/bin/env perl
     use 5.010;
     use open qw(:locale);
     use strict;
     use utf8;
     use warnings qw(all);

     use Mojo::UserAgent;

     # FIFO queue
     my @urls = (Mojo::URL->new('http://localhost/manual/'));

     # User agent following up to 5 redirects
     my $ua = Mojo::UserAgent->new(max_redirects => 5);

     # Track accessed URLs
     my %uniq;

     my $active = 0;

     sub parse {
         my ($tx) = @_;

         # Request URL
         my $url = $tx->req->url;

         say "\n$url";
         say $tx->res->dom->at('html title')->text;

         # Extract and enqueue URLs
         for my $e ($tx->res->dom('a[href]')->each) {

             # Validate href attribute
             my $link = Mojo::URL->new($e->{href});
             next if 'Mojo::URL' ne ref $link;

             # "normalize" link
             $link = $link->to_abs($tx->req->url)->fragment(undef);
             next unless $link->protocol =~ /^https?$/x;

             # Don't go deeper than /a/b/c
             next if @{$link->path->parts} > 3;

             # Access every link only once
             next if ++$uniq{$link->to_string} > 1;

             # Don't visit other hosts
             next if $link->host ne $url->host;

             push @urls, $link;
             say " -> $link";
         }

         return;
     }

     sub get_callback {
         my (undef, $tx) = @_;

         # Parse only OK HTML responses
         $tx->res->code == 200
             and
         $tx->res->headers->content_type =~ m{^text/html\b}ix
             and
         parse($tx);

         # Deactivate
         --$active;

         return;
     }

     Mojo::IOLoop->recurring(
         0 => sub {

             # Keep up to 4 parallel crawlers sharing the same user agent
             for ($active .. 4 - 1) {

                 # Dequeue or halt if there are no active crawlers anymore
                 return ($active or Mojo::IOLoop->stop)
                     unless my $url = shift @urls;

                 # Fetch non-blocking just by adding
                 # a callback and marking as active
                 ++$active;
                 $ua->get($url => \&get_callback);
             }
         }
     );

     # Start event loop if necessary
     Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
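     One sharp edge in this first version, kept in all later revisions: Mojo::DOM's
     at() returns undef when the selector matches nothing, so the ->text call dies
     on any page that lacks a <title> element. A defensive variant of that line one
     might swap into parse(), not part of the gist:

         # at() yields undef on no match, so guard before calling ->text
         my $title = $tx->res->dom->at('html title');
         say defined $title ? $title->text : '(no title)';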