@noodles-v6
Forked from creaktive/mojo-crawler.pl
Created September 18, 2013 05:02

Revisions

  1. @creaktive revised this gist Feb 21, 2013. 1 changed file: mojo-crawler.pl (4 additions, 1 deletion).

     @@ -97,4 +97,7 @@ sub parse_html {
          say '';
      
          return;
     -}
     +}
     +__DATA__
     +Featured at:
     +http://blogs.perl.org/users/stas/2013/01/web-scraping-with-modern-perl-part-1.html
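     The __DATA__ trailer added here costs nothing at runtime: perl stops compiling
     at the __DATA__ token, and the text after it is only visible to code that
     explicitly reads the DATA filehandle, which this crawler never does. A
     standalone sketch of the mechanism:

         #!/usr/bin/env perl
         use strict;
         use warnings;

         # Everything below __DATA__ is skipped by the compiler;
         # it can be read back, line by line, via the DATA filehandle.
         print while <DATA>;

         __DATA__
         Featured at:
         http://blogs.perl.org/users/stas/2013/01/web-scraping-with-modern-perl-part-1.html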
  2. @creaktive revised this gist Jan 21, 2013. 1 changed file: mojo-crawler.pl (0 additions, 3 deletions).

     @@ -12,9 +12,6 @@
          http://sysd.org/page/1/
          http://sysd.org/page/2/
          http://sysd.org/page/3/
     -    http://sysd.org/page/4/
     -    http://sysd.org/page/5/
     -    http://sysd.org/page/6/
      );
      
      # Limit parallel connections to 4
  3. @creaktive revised this gist Jan 21, 2013. 1 changed file: mojo-crawler.pl (1 addition, 0 deletions).

     @@ -88,6 +88,7 @@ sub parse_html {
      
              # Access every link only once
              state $uniq = {};
     +        ++$uniq->{$url->to_string};
              next if ++$uniq->{$link->to_string} > 1;
      
              # Don't visit other hosts
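     The one-line fix records the URL of the page being parsed, not just the links
     found on it. The likely motivation: the user agent follows up to 5 redirects,
     so $tx->req->url can differ from the URL that was queued, and without this
     line a page reached via redirect would be fetched again as soon as some other
     page links to its final address. The state-hash idiom itself, as a minimal
     runnable sketch (seen() and the sample URL are illustrative only):

         use 5.010;
         use strict;
         use warnings;

         sub seen {
             my ($url) = @_;

             # One hash shared across all calls, initialized on first entry
             state $uniq = {};
             return ++$uniq->{$url} > 1;
         }

         say seen('http://sysd.org/') ? 'skip' : 'fetch' for 1 .. 2;  # "fetch", then "skip"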
  4. @creaktive revised this gist Jan 21, 2013. 1 changed file: mojo-crawler.pl (1 addition, 3 deletions).

     @@ -17,7 +17,7 @@
          http://sysd.org/page/6/
      );
      
     -# Limit parallel connections
     +# Limit parallel connections to 4
      my $max_conn = 4;
      
      # User agent following up to 5 redirects

     @@ -30,8 +30,6 @@
      
      Mojo::IOLoop->recurring(
          0 => sub {
     -
     -        # Keep up to 4 parallel crawlers sharing the same user agent
              for ($active + 1 .. $max_conn) {
      
                  # Dequeue or halt if there are no active crawlers anymore
  5. @creaktive revised this gist Jan 21, 2013. 1 changed file: mojo-crawler.pl (59 additions, 48 deletions).

     @@ -9,19 +9,69 @@
      
      # FIFO queue
      my @urls = map { Mojo::URL->new($_) } qw(
     -    http://localhost/manual/
     +    http://sysd.org/page/1/
     +    http://sysd.org/page/2/
     +    http://sysd.org/page/3/
     +    http://sysd.org/page/4/
     +    http://sysd.org/page/5/
     +    http://sysd.org/page/6/
      );
      
     +# Limit parallel connections
     +my $max_conn = 4;
     +
      # User agent following up to 5 redirects
     -my $ua = Mojo::UserAgent->new(max_redirects => 5);
     +my $ua = Mojo::UserAgent
     +    ->new(max_redirects => 5)
     +    ->detect_proxy;
      
     +# Keep track of active connections
     +my $active = 0;
     +
     +Mojo::IOLoop->recurring(
     +    0 => sub {
     +
     +        # Keep up to 4 parallel crawlers sharing the same user agent
     +        for ($active + 1 .. $max_conn) {
     +
     -sub parse {
     -    my ($tx) = @_;
     +            # Dequeue or halt if there are no active crawlers anymore
     +            return ($active or Mojo::IOLoop->stop)
     +                unless my $url = shift @urls;
     +
     +            # Fetch non-blocking just by adding
     +            # a callback and marking as active
     +            ++$active;
     +            $ua->get($url => \&get_callback);
     +        }
     +    }
     +);
     +
     +# Start event loop if necessary
     +Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
     +
     +sub get_callback {
     +    my (undef, $tx) = @_;
     +
     +    # Deactivate
     +    --$active;
     +
     +    # Parse only OK HTML responses
     +    return
     +        if not $tx->res->is_status_class(200)
     +        or $tx->res->headers->content_type !~ m{^text/html\b}ix;
      
          # Request URL
          my $url = $tx->req->url;
      
     -    say "\n$url";
     +    say $url;
     +    parse_html($url, $tx);
     +
     +    return;
     +}
     +
     +sub parse_html {
     +    my ($url, $tx) = @_;
     +
          say $tx->res->dom->at('html title')->text;
      
          # Extract and enqueue URLs

     @@ -33,13 +83,13 @@ sub parse {
      
              # "normalize" link
              $link = $link->to_abs($tx->req->url)->fragment(undef);
     -        next unless $link->protocol =~ /^https?$/x;
     +        next unless grep { $link->protocol eq $_ } qw(http https);
      
              # Don't go deeper than /a/b/c
              next if @{$link->path->parts} > 3;
      
              # Access every link only once
     -        state $uniq;
     +        state $uniq = {};
              next if ++$uniq->{$link->to_string} > 1;
      
              # Don't visit other hosts

     @@ -48,46 +98,7 @@ sub parse {
              push @urls, $link;
              say " -> $link";
          }
     +    say '';
      
          return;
      }
     -
     -# Keep track of active connections
     -my $active = 0;
     -
     -sub get_callback {
     -    my (undef, $tx) = @_;
     -
     -    # Parse only OK HTML responses
     -    $tx->res->code == 200
     -        and
     -    $tx->res->headers->content_type =~ m{^text/html\b}ix
     -        and
     -    parse($tx);
     -
     -    # Deactivate
     -    --$active;
     -
     -    return;
     -}
     -
     -Mojo::IOLoop->recurring(
     -    0 => sub {
     -
     -        # Keep up to 4 parallel crawlers sharing the same user agent
     -        for ($active + 1 .. 4) {
     -
     -            # Dequeue or halt if there are no active crawlers anymore
     -            return ($active or Mojo::IOLoop->stop)
     -                unless my $url = shift @urls;
     -
     -            # Fetch non-blocking just by adding
     -            # a callback and marking as active
     -            ++$active;
     -            $ua->get($url => \&get_callback);
     -        }
     -    }
     -);
     -
     -# Start event loop if necessary
     -Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
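     The interesting change here is structural: the scheduler moves to the top,
     parse() is split into get_callback() and parse_html(), and status checking
     switches to is_status_class(200). The core pattern is a zero-interval
     recurring timer that tops the pool up to $max_conn in-flight requests; every
     callback decrements $active, and the loop stops once the queue is drained and
     the last response has arrived. A minimal runnable sketch of just that
     scheduling skeleton (the example.* URLs and @queue are placeholders, not part
     of the gist):

         use 5.010;
         use strict;
         use warnings;
         use Mojo::IOLoop;
         use Mojo::UserAgent;

         my @queue    = qw(http://example.com/ http://example.org/);
         my $max_conn = 2;
         my $active   = 0;
         my $ua       = Mojo::UserAgent->new;

         Mojo::IOLoop->recurring(0 => sub {

             # Top the pool back up to $max_conn in-flight requests
             for ($active + 1 .. $max_conn) {

                 # Queue drained: stop once the last response is in
                 return ($active or Mojo::IOLoop->stop)
                     unless my $url = shift @queue;

                 ++$active;
                 $ua->get($url => sub {
                     my (undef, $tx) = @_;
                     --$active;
                     say $url, ' -> ', ($tx->res->code // 'no response');
                 });
             }
         });

         Mojo::IOLoop->start unless Mojo::IOLoop->is_running;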
  6. @creaktive revised this gist Jan 17, 2013. 1 changed file: mojo-crawler.pl (9 additions, 8 deletions).

     @@ -8,16 +8,13 @@
      use Mojo::UserAgent;
      
      # FIFO queue
     -my @urls = (Mojo::URL->new('http://localhost/manual/'));
     +my @urls = map { Mojo::URL->new($_) } qw(
     +    http://localhost/manual/
     +);
      
      # User agent following up to 5 redirects
      my $ua = Mojo::UserAgent->new(max_redirects => 5);
      
     -# Track accessed URLs
     -my %uniq;
     -
     -my $active = 0;
     -
      sub parse {
          my ($tx) = @_;

     @@ -42,7 +39,8 @@ sub parse {
              next if @{$link->path->parts} > 3;
      
              # Access every link only once
     -        next if ++$uniq{$link->to_string} > 1;
     +        state $uniq;
     +        next if ++$uniq->{$link->to_string} > 1;
      
              # Don't visit other hosts
              next if $link->host ne $url->host;

     @@ -54,6 +52,9 @@ sub parse {
          return;
      }
      
     +# Keep track of active connections
     +my $active = 0;
     +
      sub get_callback {
          my (undef, $tx) = @_;

     @@ -74,7 +75,7 @@ sub get_callback {
          0 => sub {
      
              # Keep up to 4 parallel crawlers sharing the same user agent
     -        for ($active .. 4 - 1) {
     +        for ($active + 1 .. 4) {
      
                  # Dequeue or halt if there are no active crawlers anymore
                  return ($active or Mojo::IOLoop->stop)
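     The last hunk is cosmetic rather than a bug fix: both spellings of the range
     run the loop 4 - $active times, the new one just reads as "from one more than
     the active count up to the limit". A quick check with the =()= count idiom:

         use 5.010;

         for my $active (0 .. 4) {
             my $old = () = $active     .. 4 - 1;  # element count, old range
             my $new = () = $active + 1 .. 4;      # element count, new range
             say "$active active: old=$old, new=$new";  # the counts always match
         }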
  7. @creaktive created this gist Dec 20, 2012. 1 changed file: mojo-crawler.pl (92 additions). The initial file in full:

     #!/usr/bin/env perl
     use 5.010;
     use open qw(:locale);
     use strict;
     use utf8;
     use warnings qw(all);

     use Mojo::UserAgent;

     # FIFO queue
     my @urls = (Mojo::URL->new('http://localhost/manual/'));

     # User agent following up to 5 redirects
     my $ua = Mojo::UserAgent->new(max_redirects => 5);

     # Track accessed URLs
     my %uniq;

     my $active = 0;

     sub parse {
         my ($tx) = @_;

         # Request URL
         my $url = $tx->req->url;

         say "\n$url";
         say $tx->res->dom->at('html title')->text;

         # Extract and enqueue URLs
         for my $e ($tx->res->dom('a[href]')->each) {

             # Validate href attribute
             my $link = Mojo::URL->new($e->{href});
             next if 'Mojo::URL' ne ref $link;

             # "normalize" link
             $link = $link->to_abs($tx->req->url)->fragment(undef);
             next unless $link->protocol =~ /^https?$/x;

             # Don't go deeper than /a/b/c
             next if @{$link->path->parts} > 3;

             # Access every link only once
             next if ++$uniq{$link->to_string} > 1;

             # Don't visit other hosts
             next if $link->host ne $url->host;

             push @urls, $link;
             say " -> $link";
         }

         return;
     }

     sub get_callback {
         my (undef, $tx) = @_;

         # Parse only OK HTML responses
         $tx->res->code == 200
             and
         $tx->res->headers->content_type =~ m{^text/html\b}ix
             and
         parse($tx);

         # Deactivate
         --$active;

         return;
     }

     Mojo::IOLoop->recurring(
         0 => sub {

             # Keep up to 4 parallel crawlers sharing the same user agent
             for ($active .. 4 - 1) {

                 # Dequeue or halt if there are no active crawlers anymore
                 return ($active or Mojo::IOLoop->stop)
                     unless my $url = shift @urls;

                 # Fetch non-blocking just by adding
                 # a callback and marking as active
                 ++$active;
                 $ua->get($url => \&get_callback);
             }
         }
     );

     # Start event loop if necessary
     Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
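     One sharp edge in this first version, kept in all later revisions: Mojo::DOM's
     at() returns undef when the selector matches nothing, so the ->text call dies
     on any page that lacks a <title> element. A defensive variant of that line one
     might swap into parse(), not part of the gist:

         # at() yields undef on no match, so guard before calling ->text
         my $title = $tx->res->dom->at('html title');
         say defined $title ? $title->text : '(no title)';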