Skip to content

Instantly share code, notes, and snippets.

@audreyt
Last active July 31, 2018 03:41
Show Gist options
  • Save audreyt/4642830 to your computer and use it in GitHub Desktop.
Save audreyt/4642830 to your computer and use it in GitHub Desktop.
<教育部重編國語辭典修訂本>單字下載腳本雛形
#!/usr/bin/env perl
# Prints a shell script on STDOUT: one curl command per two-byte Big5 code
# point, each followed by "sleep 2".  Every command downloads the dictionary
# page for that character, converts it from Big5 to UTF-8, keeps only the
# result table and stores it as "<character>.html".
use strict;
use warnings;
use utf8;
use Encode;
binmode STDOUT, ':utf8';

# Lead bytes 0xA4..0xF9 combined with trail bytes 0x40..0x7E and 0xA1..0xFE.
for my $x (0xA4 .. 0xF9) {
    for my $y (0x40 .. 0x7E, 0xA1 .. 0xFE) {
        # Percent-encoded form of the byte pair, as used in the query string.
        my $big5 = sprintf('%%%02X%%%02X', $x, $y);
        my $char = Encode::decode(big5 => chr($x) . chr($y));

        # Byte pairs without a Big5 mapping decode to U+FFFD.  Emitting a
        # command for them would only waste a request and make every such
        # pair overwrite the same output file, so skip them.
        next if length($char) != 1 || $char =~ /\x{FFFD}/;

        print qq[curl -m 10 --retry 10 --retry-delay 10 'http://dict.revised.moe.edu.tw/cgi-bin/newDict/dict.sh?idx=dict.idx&cond=%5E$big5%24&pieceLen=100&fld=1&cat=&imgFont=1' | piconv -f big5 -t utf8 | perl -ne 'next unless m!^<table width="90%"! .. m!^</table!; s!<span class="key">(.+?)</span>!\$1!g; print' > "$char.html"\n];
        print "sleep 2\n";
    }
}
@audreyt
Copy link
Author

audreyt commented Jan 26, 2013

#!/usr/bin/env perl
# Disambiguation script, to run after moe-download.pl finishes
use utf8;
use Encode;
# Provides write_file(), used below to store each fetched definition.
use File::Slurp;
# Import get() together with $ua, the user-agent object LWP::Simple exports.
use LWP::Simple qw($ua get);
# Apply a timeout of 10 to that shared user agent, so a stalled request
# fails and can be retried instead of hanging.
$ua->timeout(10);

# Download $url with LWP::Simple::get, retrying until a non-empty body arrives.
#
# Parameters:
#   $url       - address to request
#   $max_tries - optional; number of attempts before giving up (default 100)
#   $delay     - optional; seconds to sleep after each failed attempt
#                (default 10)
#
# Returns the response body on success.  When every attempt fails it returns
# nothing (undef in scalar context, the empty list in list context).  A false
# body (undef, empty string or "0") counts as a failed attempt.
sub fetch {
    my ($url, $max_tries, $delay) = @_;
    $max_tries = 100 unless defined $max_tries;
    $delay     = 10  unless defined $delay;

    for my $attempt (1 .. $max_tries) {
        # eval turns an exception raised during the request into an
        # ordinary failed attempt.
        my $rv = eval { LWP::Simple::get($url) };
        return $rv if $rv;
        print "(Timeout, sleeping $delay)\n";
        sleep $delay;
    }
    print "(Timeout after $max_tries times, giving up)\n";

    # Explicit bare return: without it the sub would hand back the result
    # of the print above (a true value) and look like a success.
    return;
}

binmode STDOUT, ':utf8';

# Endpoint of the dictionary CGI; each request appends a query string to it.
my $dict_url = 'http://dict.revised.moe.edu.tw/cgi-bin/newDict/dict.sh';

# Saved pages in the current directory that contain the text "fetch", i.e.
# pages listing several entries behind javascript:fetch(N) links instead of
# a single definition.
my @pending = `grep -l fetch *.html`;

CHAR:
for my $char (@pending) {
    chomp $char;
    $char =~ s/\.html\z//;    # file name without its suffix is the character
    $char = Encode::decode_utf8($char) unless Encode::is_utf8($char);

    # UTF-8 bytes of the name, for messages written to STDERR.
    my $label = Encode::encode_utf8($char);

    # The query needs the character as exactly two percent-encoded Big5
    # bytes; anything else (e.g. a leftover "X.123" name) cannot be queried.
    my $bytes = Encode::encode(big5 => $char);
    if (length($bytes) != 2) {
        warn "Skipping '$label': not a single two-byte Big5 character\n";
        next CHAR;
    }
    my $big5 = sprintf('%%%02X%%%02X', map { ord } split //, $bytes);

    # Query fragment shared by the index request and the per-entry requests.
    my $query     = "cond=%5E$big5%24&pieceLen=100&fld=1&cat=";
    my $index_url = "$dict_url?idx=dict.idx&$query&imgFont=1";

    print qq[$char ==> $index_url\n];
    my $body = fetch($index_url);
    defined $body
        or die "Could not fetch the index page for '$label' ($index_url)\n";

    # The hidden "ukey" form field must be sent back with every entry request.
    my ($ukey) = $body =~ /<input type="hidden" name="ukey" value="([^"]+)">/
        or die "No ukey field in the index page for '$label' ($index_url)\n";
    sleep 5;

    # Consume the javascript:fetch(N) links one at a time; the substitution
    # removes each link so the loop ends when none are left.
    while ($body =~ s/javascript:fetch\((\d+)//) {
        my $recNo   = $1;
        my $def_url = "$dict_url?$query&ukey=$ukey&serial=1&recNo=$recNo&op=f&imgFont=1";

        my $def = fetch($def_url);
        defined $def
            or die "Could not fetch entry $recNo for '$label' ($def_url)\n";
        $def = Encode::decode(big5 => $def) unless Encode::is_utf8($def);

        # Keep only the result table, then unwrap the highlighted key spans.
        $def =~ m!(<table width="90%".*</table>)!s
            or die "No result table in entry $recNo for '$label' ($def_url)\n";
        $def = $1;
        $def =~ s!<span class="key">(.+?)</span>!$1!g;

        write_file("$char.$recNo.html", Encode::encode_utf8($def));
        print "... $char.$recNo.html\n";
        sleep 5;
    }
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment