-
-
Save kwakwaversal/6c0fc9d5d4c6a8683315f8c907dc4a7f to your computer and use it in GitHub Desktop.
Modularization of my answer from SO on URL queuing for non-blocking ua #mojo #perl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
package Mojo::URLQueue; | |
use Mojo::Base 'Mojo::EventEmitter'; | |
use Mojo::UserAgent; | |
has queue => sub { [] }; | |
has ua => sub { Mojo::UserAgent->new(max_redirects => 5) }; | |
has concurrency => 4; | |
sub start { | |
my ($self, $cb) = @_; | |
return unless @{ $self->queue }; | |
unless ( $self->{delay} ) { | |
$self->{concurrent} = 0; | |
$self->{delay} = Mojo::IOLoop->delay; | |
$self->{delay}->on(finish => sub{ | |
warn("Loop ended before queue depleted\n") if @{ $self->queue }; | |
undef $self->{delay}; | |
$self->$cb() if $cb; | |
}); | |
} | |
$self->_refresh; | |
# Start event loop if necessary | |
$self->{delay}->wait unless $self->{delay}->ioloop->is_running; | |
} | |
sub _refresh { | |
my $self = shift; | |
my $concurrency = $self->concurrency; | |
while ( $self->{concurrent} < $concurrency and my $url = shift @{ $self->queue } ) { | |
$self->{concurrent}++; | |
my $end = $self->{delay}->begin; | |
$self->ua->get($url => sub{ | |
my ($ua, $tx) = @_; | |
$self->emit( process => $tx ); | |
# refresh worker pool | |
$self->{concurrent}--; | |
$self->_refresh; | |
$end->(); | |
}); | |
} | |
} | |
package main; | |
use Mojo::Base -strict; | |
use Mojo::URL; | |
use utf8::all; | |
# FIFO queue | |
my @urls = qw( | |
http://sysd.org/page/1/ | |
http://sysd.org/page/2/ | |
http://sysd.org/page/3/ | |
); | |
my $q = Mojo::URLQueue->new( queue => \@urls ); | |
$q->on( process => \&process ); | |
$q->start(sub { say 'Finished' }); | |
sub process { | |
my ($q, $tx) = @_; | |
my $queue = $q->queue; | |
# Parse only OK HTML responses | |
return unless | |
$tx->res->is_status_class(200) | |
and $tx->res->headers->content_type =~ m{^text/html\b}ix; | |
# Request URL | |
my $url = $tx->req->url; | |
say "Processing $url"; | |
push @$queue, parse_html($url, $tx); | |
} | |
sub parse_html { | |
my ($url, $tx) = @_; | |
state %visited; | |
my @links; | |
my $dom = $tx->res->dom; | |
say $dom->at('html title')->text; | |
# Extract and enqueue URLs | |
$dom->find('a[href]')->each(sub{ | |
# Validate href attribute | |
my $link = Mojo::URL->new($_->{href}); | |
return unless eval { $link->isa('Mojo::URL') }; | |
# "normalize" link | |
$link = $link->to_abs($url)->fragment(undef); | |
return unless grep { $link->protocol eq $_ } qw(http https); | |
# Don't go deeper than /a/b/c | |
return if @{$link->path->parts} > 3; | |
# Access every link only once | |
return if $visited{$link->to_string}++; | |
# Don't visit other hosts | |
return if $link->host ne $url->host; | |
push @links, $link; | |
say " -> $link"; | |
}); | |
say ''; | |
return @links; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment