Skip to content

Instantly share code, notes, and snippets.

@jukworks
Created February 11, 2014 11:54

Revisions

  1. Jonguk Kim created this gist Feb 11, 2014.
    98 changes: 98 additions & 0 deletions fetch_all_files.pl
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,98 @@
    use strict;
    use warnings;
    use autodie;
    use File::Basename qw/dirname/;
    use File::Path qw/make_path/;
    use Getopt::Long;
    use Math::Random qw/random_exponential/;
    use URI;
    use WWW::Mechanize;

    # Command-line settings and their defaults.
    my $url      = '';           # -url      : start URL (required)
    my $interval = 5;            # -interval : average delay between downloads, in seconds
    my $save     = 'fetched';    # -save     : local directory that receives the files
    my $test     = '';           # -test     : true => only list targets, download nothing

    # parse arguments
    GetOptions(
        "url=s"      => \$url,
        "interval=i" => \$interval,
        "save=s"     => \$save,
        "test"       => \$test,
    ) or usage();

    # check if the given URL is valid
    # (an empty or scheme-less string yields no scheme and is rejected here)
    unless ( ( URI->new( $url, "http" ) )->scheme() ) {
        print STDERR "ERROR! Invalid URL: ", $url, "\n\n";
        usage();
    }

    # The interval is later used as the mean of an exponential distribution,
    # which cannot be negative; reject it now instead of failing in the middle
    # of a crawl.
    if ( $interval < 0 ) {
        print STDERR "ERROR! Invalid interval: ", $interval, "\n\n";
        usage();
    }

    # let's get started
    # The start URL must end with a slash: the crawl loop later strips
    # length($url) characters from every target to derive its local path.
    if ( $url !~ /.*\/$/ ) {
        $url = $url . '/';
    }

    # Echo the effective settings.
    print "Given URL: $url\n";
    print "Interval : $interval\n";
    print "Save Dir : $save\n";

    # Create the download directory, except in test mode or when it is
    # already there.
    if ( !$test && !-d $save ) {
        mkdir $save;
    }

    # Set up the browser object and fetch the start page; its links seed the
    # crawl.
    my $mech = WWW::Mechanize->new();
    $mech->agent_alias('Windows Mozilla');
    $mech->show_progress(1);
    $mech->get($url);

    # Breadth-first crawl of everything below the start URL.
    #
    # For the page currently loaded in $mech, each link is resolved to an
    # absolute URL and classified as a sub-directory (ends in "/") or as a
    # wanted media/archive file (by extension).  Files are downloaded right
    # away (or merely listed in test mode); directories are queued and visited
    # in turn until the queue is empty.
    my @links      = $mech->links;
    my @dir_queue  = ();
    my @file_queue = ();

    # Canonical form of the start URL.  Every link is canonicalised the same
    # way, so a plain prefix comparison decides whether it lies inside the
    # tree and the remainder after the prefix is the path relative to $save.
    my $root = URI->new($url)->canonical->as_string;

    # URLs already queued; prevents duplicate downloads and endless loops when
    # pages link back to each other.  The start page counts as visited.
    my %seen = ( $root => 1 );

    # Wanted file types.  The leading dot is required so that a name such as
    # "foopng" is not mistaken for an image.
    my $wanted_ext = qr/\.(?:avi|bmp|gif|jpg|jpeg|mov|mp4|mpg|mpeg|png|zip)$/i;

    while (1) {
        for my $x (@links) {
            next unless defined $x->url;

            # Resolve the link against the page base instead of gluing the
            # two strings together, so "/abs/path", "./x", "../x" and full
            # URLs are all handled correctly.
            my $abs = URI->new( $x->url_abs->as_string )->canonical;
            $abs->fragment(undef);
            my $abs_url = $abs->as_string;

            # Stay inside the requested tree.
            next unless index( $abs_url, $root ) == 0;

            # The relative part becomes a local path; never let remote
            # content smuggle in a ".." segment.
            next if substr( $abs_url, length $root ) =~ m{(?:^|/)\.\.(?:/|$)};

            next if $seen{$abs_url}++;

            if ( $abs_url =~ m{/$} ) {
                push @dir_queue, $abs_url;
            }
            elsif ( $abs_url =~ $wanted_ext ) {
                push @file_queue, $abs_url;
            }
        }

        # One random pause per file, exponentially distributed around
        # $interval.  Declared unconditionally: "my ... unless COND" has
        # undefined behaviour in Perl.
        my @delays
            = $test
            ? ()
            : Math::Random::random_exponential( scalar @file_queue, $interval );

        for my $target (@file_queue) {
            my $save_to = $save . '/' . substr( $target, length $root );
            print "Downloading: $target => $save_to\n";
            next if $test;

            # A page may link straight into a sub-directory that has not been
            # visited yet, so make sure the parent directory exists.
            make_path( dirname($save_to) );

            $mech->get($target);
            $mech->save_content($save_to);

            my $sleep_time = int( shift @delays );
            if ( $sleep_time > 0 ) {
                print "Sleeping: ${sleep_time}s\n";
                sleep($sleep_time);
            }
        }

        last unless @dir_queue;
        my $next = shift @dir_queue;

        # Mirror the remote directory locally (even if it turns out to hold
        # no wanted files), except in test mode.
        unless ($test) {
            my $local_dir = $save . '/' . substr( $next, length $root );
            $local_dir =~ s{/+$}{};
            make_path($local_dir);
        }
        print "The next directory: ", $next, "\n";

        # Load the next directory page and start over with its links.
        $mech->get($next);
        @links      = $mech->links;
        @file_queue = ();
    }

    # Print the command-line help and terminate the script.
    #
    # Takes no arguments and never returns.  It is only called when the
    # command line is unusable, so the text goes to STDERR and the process
    # exits with a non-zero status that calling shells and scripts can detect.
    sub usage {
        print STDERR join "\n",
            'Usage: fetch_all_files.pl -url [URL] -interval [Interval] -save [Save Directory] -test',
            '-url : the target URL (REQUIRED)',
            '-interval : the average time interval (second) (default: 5s)',
            '-save : the directory name for storing files (default: fetched)',
            '-test : the script will not download files (just print out the target files)',
            '';
        exit 1;
    }

    __END__