Display the 100 most common lines from STDIN. Unlimited input (external sort).
#!/usr/local/bin/perl
use strict;
use warnings;

# JBS 2013
# External sort:
# read lines (possibly billions of lines) from standard input,
# print out the 100 most common lines.

# Maximum number of distinct lines held in memory before a chunk is
# flushed to disk (131072 = 128Ki keys; try 8388608 = 8Mi or
# 134217728 = 128Mi for bigger chunks).
my $max_file_size = 131072;
# Sort the in-memory counts by frequency, descending, and write them
# out as alternating line/count pairs in <id>.dat.
sub write_chunk {
    my ($lines, $file_id) = @_;
    my @tmpkeys = sort { $lines->{$b} <=> $lines->{$a} } keys %$lines;
    open(my $outfile, ">", $file_id . ".dat")
        or die "Failed to open > " . $file_id . ".dat: $!";
    print $outfile "$_\n", $lines->{$_}, "\n" foreach @tmpkeys;
}

sub make_sorted_chunk_files {
    my %lines;
    my $file_id = 0;
    while (<>) {
        chomp;
        $lines{$_}++;
        # When the hash reaches its maximum size, sort it and write it
        # to a chunk file, then empty it and start over.
        if (scalar keys %lines >= $max_file_size) {
            write_chunk(\%lines, $file_id++);
            %lines = ();
        }
    }
    # If there's anything left, write a final chunk.
    write_chunk(\%lines, $file_id++) if scalar keys %lines;
}
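
# Each chunk file holds alternating line/count pairs, sorted by
# descending count; illustrative contents of 0.dat:
#   GET /index.html
#   5123
#   GET /favicon.ico
#   2980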
sub sort_chunks_top100 {
    # Open every chunk file for merging.
    my @files = <*.dat>;
    my @filehandles = map { open my $fh, '<', $_ or die "Failed to open < $_: $!"; $fh } @files;
    my %most_frequent_lines;
    # Prime each stream: the first line is the string, the second is its frequency.
    my @instrings   = map { my $s = <$_>; chomp $s if defined $s; $s } @filehandles;
    my @frequencies = map { my $f = <$_>; chomp $f if defined $f; $f } @filehandles;
    # Collect until we have the 100 most frequent strings (or run dry).
    until ((scalar keys %most_frequent_lines) >= 100) {
        # Find the stream whose current entry has the largest frequency.
        my $largest_index = -1;
        my $max_frequency = -1;
        for my $i (0 .. $#frequencies) {
            next unless defined $frequencies[$i];
            if ($max_frequency < $frequencies[$i]) {
                $largest_index = $i;
                $max_frequency = $frequencies[$i];
            }
        }
        # Every stream is exhausted: stop with fewer than 100 lines.
        last if $largest_index < 0;
        # Insert that item into the hash of largest, summing in case of collision.
        $most_frequent_lines{ $instrings[$largest_index] } += $frequencies[$largest_index];
        # Then advance that stream to its next line/frequency pair,
        # marking it exhausted (undef) at end of file.
        $instrings[$largest_index]   = readline($filehandles[$largest_index]);
        $frequencies[$largest_index] = readline($filehandles[$largest_index]);
        if (defined $frequencies[$largest_index]) {
            chomp $instrings[$largest_index];
            chomp $frequencies[$largest_index];
        } else {
            $instrings[$largest_index]   = undef;
            $frequencies[$largest_index] = undef;
        }
    }
    # Return the collected lines, most frequent first.
    return sort { $most_frequent_lines{$b} <=> $most_frequent_lines{$a} } keys %most_frequent_lines;
}
make_sorted_chunk_files();
{
    # One line per entry, most frequent first, newline-terminated.
    local $, = "\n";
    local $\ = "\n";
    print sort_chunks_top100();
}
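
To use it, pipe or redirect input into the script, e.g. perl top100.pl < big.log (the name top100.pl is just an example; the gist doesn't name the file). Note that the chunk files (0.dat, 1.dat, ...) are left in the working directory afterwards, and any pre-existing *.dat files there will be swept into the merge.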
Note: the script may not work perfectly. The order of the top 100 lines seems incorrect, in spite of being sorted right when they're returned from the function. One plausible cause: a line's count can be split across several chunk files (the hash is flushed whenever it fills), and the merge stops as soon as 100 distinct lines have been collected, so partial counts sitting deeper in the other files are never summed in. Otherwise, I believe it works correctly.
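
One way to make the totals exact, sketched below under a changed assumption: each chunk is written sorted lexically by line content (sort keys %lines) rather than by count. With that ordering, every remaining occurrence of the lexically smallest line sits at the head of some stream, so a k-way merge can compute a line's full count before deciding whether it belongs in the top 100. merge_chunks_top100 is a hypothetical replacement for sort_chunks_top100, not part of the gist.

use strict;
use warnings;

# Sketch of an exact merge. Assumes each *.dat chunk holds alternating
# line/count pairs sorted lexically by LINE (i.e. chunks written with
# `sort keys %lines`, not sorted by count).
sub merge_chunks_top100 {
    my @files = <*.dat>;
    # Each head is [line, count, filehandle] for one chunk stream.
    my @heads;
    for my $file (@files) {
        open my $fh, '<', $file or die "Failed to open < $file: $!";
        my $line  = <$fh>;
        my $count = <$fh>;
        next unless defined $count;
        chomp($line, $count);
        push @heads, [$line, $count, $fh];
    }
    my %top;    # line => exact total count (at most 100 entries kept)
    while (@heads) {
        # The lexically smallest head line: every stream still containing
        # this line has it at its head now, so the summed count is exact.
        my ($min) = sort { $a cmp $b } map { $_->[0] } @heads;
        my $total = 0;
        my @next;
        for my $h (@heads) {
            if ($h->[0] eq $min) {
                $total += $h->[1];
                my $line  = readline($h->[2]);
                my $count = readline($h->[2]);
                if (defined $count) {
                    chomp($line, $count);
                    push @next, [$line, $count, $h->[2]];
                }
            } else {
                push @next, $h;
            }
        }
        @heads = @next;
        # Keep only the 100 largest totals seen so far.
        $top{$min} = $total;
        if (keys %top > 100) {
            my ($smallest) = sort { $top{$a} <=> $top{$b} } keys %top;
            delete $top{$smallest};
        }
    }
    return sort { $top{$b} <=> $top{$a} } keys %top;
}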