Revisions
-
creaktive revised this gist
Feb 21, 2013 . 1 changed file with 4 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -97,4 +97,7 @@ sub parse_html { say ''; return; } __DATA__ Featured at: http://blogs.perl.org/users/stas/2013/01/web-scraping-with-modern-perl-part-1.html -
creaktive revised this gist
Jan 21, 2013 . 1 changed file with 0 additions and 3 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -12,9 +12,6 @@ http://sysd.org/page/1/ http://sysd.org/page/2/ http://sysd.org/page/3/ ); # Limit parallel connections to 4 -
creaktive revised this gist
Jan 21, 2013 . 1 changed file with 1 addition and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -88,6 +88,7 @@ sub parse_html { # Access every link only once state $uniq = {}; ++$uniq->{$url->to_string}; next if ++$uniq->{$link->to_string} > 1; # Don't visit other hosts -
creaktive revised this gist
Jan 21, 2013 . 1 changed file with 1 addition and 3 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -17,7 +17,7 @@ http://sysd.org/page/6/ ); # Limit parallel connections to 4 my $max_conn = 4; # User agent following up to 5 redirects @@ -30,8 +30,6 @@ Mojo::IOLoop->recurring( 0 => sub { for ($active + 1 .. $max_conn) { # Dequeue or halt if there are no active crawlers anymore -
creaktive revised this gist
Jan 21, 2013 . 1 changed file with 59 additions and 48 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -9,19 +9,69 @@ # FIFO queue my @urls = map { Mojo::URL->new($_) } qw( http://sysd.org/page/1/ http://sysd.org/page/2/ http://sysd.org/page/3/ http://sysd.org/page/4/ http://sysd.org/page/5/ http://sysd.org/page/6/ ); # Limit parallel connections my $max_conn = 4; # User agent following up to 5 redirects my $ua = Mojo::UserAgent ->new(max_redirects => 5) ->detect_proxy; # Keep track of active connections my $active = 0; Mojo::IOLoop->recurring( 0 => sub { # Keep up to 4 parallel crawlers sharing the same user agent for ($active + 1 .. $max_conn) { # Dequeue or halt if there are no active crawlers anymore return ($active or Mojo::IOLoop->stop) unless my $url = shift @urls; # Fetch non-blocking just by adding # a callback and marking as active ++$active; $ua->get($url => \&get_callback); } } ); # Start event loop if necessary Mojo::IOLoop->start unless Mojo::IOLoop->is_running; sub get_callback { my (undef, $tx) = @_; # Deactivate --$active; # Parse only OK HTML responses return if not $tx->res->is_status_class(200) or $tx->res->headers->content_type !~ m{^text/html\b}ix; # Request URL my $url = $tx->req->url; say $url; parse_html($url, $tx); return; } sub parse_html { my ($url, $tx) = @_; say $tx->res->dom->at('html title')->text; # Extract and enqueue URLs @@ -33,13 +83,13 @@ sub parse { # "normalize" link $link = $link->to_abs($tx->req->url)->fragment(undef); next unless grep { $link->protocol eq $_ } qw(http https); # Don't go deeper than /a/b/c next if @{$link->path->parts} > 3; # Access every link only once state $uniq = {}; next if ++$uniq->{$link->to_string} > 1; # Don't visit other hosts @@ -48,46 +98,7 @@ sub parse { push @urls, $link; say " -> $link"; } say ''; return; } -
creaktive revised this gist
Jan 17, 2013 . 1 changed file with 9 additions and 8 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -8,16 +8,13 @@ use Mojo::UserAgent; # FIFO queue my @urls = map { Mojo::URL->new($_) } qw( http://localhost/manual/ ); # User agent following up to 5 redirects my $ua = Mojo::UserAgent->new(max_redirects => 5); sub parse { my ($tx) = @_; @@ -42,7 +39,8 @@ sub parse { next if @{$link->path->parts} > 3; # Access every link only once state $uniq; next if ++$uniq->{$link->to_string} > 1; # Don't visit other hosts next if $link->host ne $url->host; @@ -54,6 +52,9 @@ sub parse { return; } # Keep track of active connections my $active = 0; sub get_callback { my (undef, $tx) = @_; @@ -74,7 +75,7 @@ sub get_callback { 0 => sub { # Keep up to 4 parallel crawlers sharing the same user agent for ($active + 1 .. 4) { # Dequeue or halt if there are no active crawlers anymore return ($active or Mojo::IOLoop->stop) -
creaktive created this gist
Dec 20, 2012 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,92 @@ #!/usr/bin/env perl use 5.010; use open qw(:locale); use strict; use utf8; use warnings qw(all); use Mojo::UserAgent; # FIFO queue my @urls = (Mojo::URL->new('http://localhost/manual/')); # User agent following up to 5 redirects my $ua = Mojo::UserAgent->new(max_redirects => 5); # Track accessed URLs my %uniq; my $active = 0; sub parse { my ($tx) = @_; # Request URL my $url = $tx->req->url; say "\n$url"; say $tx->res->dom->at('html title')->text; # Extract and enqueue URLs for my $e ($tx->res->dom('a[href]')->each) { # Validate href attribute my $link = Mojo::URL->new($e->{href}); next if 'Mojo::URL' ne ref $link; # "normalize" link $link = $link->to_abs($tx->req->url)->fragment(undef); next unless $link->protocol =~ /^https?$/x; # Don't go deeper than /a/b/c next if @{$link->path->parts} > 3; # Access every link only once next if ++$uniq{$link->to_string} > 1; # Don't visit other hosts next if $link->host ne $url->host; push @urls, $link; say " -> $link"; } return; } sub get_callback { my (undef, $tx) = @_; # Parse only OK HTML responses $tx->res->code == 200 and $tx->res->headers->content_type =~ m{^text/html\b}ix and parse($tx); # Deactivate --$active; return; } Mojo::IOLoop->recurring( 0 => sub { # Keep up to 4 parallel crawlers sharing the same user agent for ($active .. 4 - 1) { # Dequeue or halt if there are no active crawlers anymore return ($active or Mojo::IOLoop->stop) unless my $url = shift @urls; # Fetch non-blocking just by adding # a callback and marking as active ++$active; $ua->get($url => \&get_callback); } } ); # Start event loop if necessary Mojo::IOLoop->start unless Mojo::IOLoop->is_running;