baragona · December 16, 2014 05:21
diff --git a/gistfile1.pl b/gistfile1.pl
 #!/usr/bin/perl -w
 # created October 4, 2014
 use strict;

 use Data::Dumper;
 #use JSON::PP;



 $"=" ";
 $|=1;




 sub context_to_string{
  my @arr = @{ shift() };
  
  my $char = ')';
  
  return join $char, @arr;
  
 }

 sub dec2bin {
  my $numBits = shift; # param1
  my $number = shift;  # param2
  my $str = unpack("B32", pack("N", $number));
  #$str =~ s/^0+(?=\d)//;   # otherwise you'll get leading zeros
  return substr($str, 32-$numBits, $numBits);
 }
 sub splitIntoWords{
  my $text = shift;
  my @words = split /\s+/, $text ;
  
  @words = map { /(\w+|\W+)/g  } @words;
  return @words;
 }

 sub build_markov_table{
  my $text = shift;
  my $n_context=shift;
  $text =~ s/[(){}\[\]"']+/ /g;

  my @words = splitIntoWords($text);
  my %words_seen;
  #die Dumper \@words;

  my @last_words;
  push @last_words, "";


  for (1 .. $n_context){
    push @last_words, "";
  }
  for my $word(@words){
    shift @last_words;
    push @last_words, $word;
    my @context = @last_words;
    pop @context;
    
    $words_seen{context_to_string \@context}{ $word }++;

  }
  return \%words_seen;
 }

 sub bitstring_to_bytes{
  my $bitstring=shift;
  $bitstring .= '0' x (8-(length($bitstring)%8)) if length($bitstring)%8>0;
  return pack("B*", $bitstring);
 }
 sub bytes_to_bitstring{
  my $bytes=shift;
  $bytes = unpack("B*",$bytes);
  return $bytes;
 }

 sub slurp_file{
  my $fname = shift;
  local( $/ ) ;
  open my $fh, '<', $fname or die "cant open $!";
  my $text = <$fh>;
  return $text;
 }

 sub encode_or_decode{
  my $mode = shift;
  my %words_seen = %{ shift() };
  my $n_context=shift;
  my $input=shift;
  
  
  my $context =  [ map { "" } (1 .. $n_context)];


  my @encodedWords;
  my $input_plain;
  if($mode eq "decode") {
      @encodedWords = splitIntoWords($input);
  }
  if($mode eq "encode") {
      $input_plain =$input;

  }
  my $input_binary = bytes_to_bitstring($input_plain);

  my $getBit = sub {
    if(not defined $input_binary){
      return;
    }

    my $bit = substr($input_binary,0,1);
    #warn $input_binary;
    if(length($input_binary)==1){
      $input_binary = undef;
    }else{
      $input_binary = substr($input_binary, 1);
    }
    return $bit;
    #return 1;#rand() > .5? 1:0;
  };

  my $input_finished = sub {
    if((not defined $input_binary) or $input_binary eq '' ){
      return 1;
    }
    return 0;
  };

  my $bitString;
  my $textString;
  while(1) {
    my %choices_hash = %{ $words_seen{context_to_string($context)} };
    #print Dumper \%choices_hash;

    my @keys = sort { ($choices_hash{$b} <=> $choices_hash{$a}) or ($a cmp $b) } (keys %choices_hash);
    my $max_bits = int(log(scalar(@keys))/log(2));

    if($mode eq "decode") {
      my $word = shift(@encodedWords);
      if(not defined $word) {
        #print "\n".$bitString."\n";
        return bitstring_to_bytes($bitString);
      }
  
      if($max_bits > 0){
        my $numericValue;
        for(my $i = 0; $i < scalar(@keys); ++$i){
          if($word eq $keys[$i]){
            $numericValue = $i;
          }
        }
        die "numeric value undef BAD" if(not defined $numericValue);
        #print $numericValue;
        #print "\n".dec2bin($max_bits, $numericValue)."\n";
        $bitString .= dec2bin($max_bits, $numericValue);

      }
  
      shift @$context;
      push @$context, $word;
    }
    if($mode eq "encode") {
      my @databits;
      # get databits
      for(1..$max_bits){
        my $bit = $getBit->();        
        push @databits, $bit if defined $bit;
    
      }
      my $numeric_value = oct("0b".join("", @databits));


      #print $max_bits."\n";

      my $freq_sum=0;
      for my $freq(values %choices_hash){
        $freq_sum+=$freq;
      }

  #   my $rand = int(rand()*$freq_sum);
  #
  #   my $lower_limit=0;
  #   my $choice_key;
  #   for my $key(@keys){
  #     $lower_limit += $choices_hash{$key};
  #     if($rand < $lower_limit){
  #     $choice_key=$key;
  #     last;
  #     }
  #   }
  #   if(not defined $choice_key){
  #     warn "no choice index";
  #     $choice_key = $keys[-1];
  #   }

      my $choice_key = $keys[$numeric_value];
      #my $choice = $keys[ int(rand()* scalar(@keys)) ];
      my $choice = $choice_key;
      #die $choice;
      $textString .= " " unless $choice =~ /[.,?!:;]/;
      $textString .= $choice;

      shift @$context;
      push @$context, $choice;

      if($input_finished->()){
        #warn "input finished";
        return $textString;
      }
    }
  }
 }

 sub encode{
  my %words_seen = %{ shift() };
  my $n_context=shift;
  my $input=shift;
  return encode_or_decode('encode', \%words_seen, $n_context, $input);
 }
 sub decode{
  my %words_seen = %{ shift() };
  my $n_context=shift;
  my $input=shift;
  return encode_or_decode('decode', \%words_seen, $n_context, $input);
 }


 #Main script follows

 if("@ARGV" =~ /-(d|e)\s+-sample=(.+?)$/){
    my $m=$1;
    my $sample_f = $2;
    my $mode;



    my $sample_text = slurp_file($sample_f);
    my $n_context=2;

    my %words_seen = %{build_markov_table($sample_text, $n_context)};

    #print Dumper \%words_seen;

    #my $plain_input =  "She sells sea shells by the sea shore.";
    
    #read everything from stdin
    local $/;
    my $plain_input = <STDIN>;
    
    if($m eq 'd'){
        print decode(\%words_seen, 2, $plain_input);
    }elsif($m eq 'e'){
        print encode(\%words_seen, 2, $plain_input);
    }

 }else{
    die "Usage: ./markov.pl [-d or -e] -sample=<FILE>\nReads from STDIN and writes to STDOUT\n";
 }
	#!/usr/bin/perl -w
	# created October 4, 2014
	use strict;

	use Data::Dumper;
	#use JSON::PP;



	$"=" ";
	$\|=1;




	sub context_to_string{
	my @arr = @{ shift() };

	my $char = ')';

	return join $char, @arr;

	}

	sub dec2bin {
	my $numBits = shift; # param1
	my $number = shift; # param2
	my $str = unpack("B32", pack("N", $number));
	#$str =~ s/^0+(?=\d)//; # otherwise you'll get leading zeros
	return substr($str, 32-$numBits, $numBits);
	}
	sub splitIntoWords{
	my $text = shift;
	my @words = split /\s+/, $text ;

	@words = map { /(\w+\|\W+)/g } @words;
	return @words;
	}

	sub build_markov_table{
	my $text = shift;
	my $n_context=shift;
	$text =~ s/[(){}\[\]"']+/ /g;

	my @words = splitIntoWords($text);
	my %words_seen;
	#die Dumper \@words;

	my @last_words;
	push @last_words, "";


	for (1 .. $n_context){
	push @last_words, "";
	}
	for my $word(@words){
	shift @last_words;
	push @last_words, $word;
	my @context = @last_words;
	pop @context;

	$words_seen{context_to_string \@context}{ $word }++;

	}
	return \%words_seen;
	}

	sub bitstring_to_bytes{
	my $bitstring=shift;
	$bitstring .= '0' x (8-(length($bitstring)%8)) if length($bitstring)%8>0;
	return pack("B*", $bitstring);
	}
	sub bytes_to_bitstring{
	my $bytes=shift;
	$bytes = unpack("B*",$bytes);
	return $bytes;
	}

	sub slurp_file{
	my $fname = shift;
	local( $/ ) ;
	open my $fh, '<', $fname or die "cant open $!";
	my $text = <$fh>;
	return $text;
	}

	sub encode_or_decode{
	my $mode = shift;
	my %words_seen = %{ shift() };
	my $n_context=shift;
	my $input=shift;


	my $context = [ map { "" } (1 .. $n_context)];


	my @encodedWords;
	my $input_plain;
	if($mode eq "decode") {
	@encodedWords = splitIntoWords($input);
	}
	if($mode eq "encode") {
	$input_plain =$input;

	}
	my $input_binary = bytes_to_bitstring($input_plain);

	my $getBit = sub {
	if(not defined $input_binary){
	return;
	}

	my $bit = substr($input_binary,0,1);
	#warn $input_binary;
	if(length($input_binary)==1){
	$input_binary = undef;
	}else{
	$input_binary = substr($input_binary, 1);
	}
	return $bit;
	#return 1;#rand() > .5? 1:0;
	};

	my $input_finished = sub {
	if((not defined $input_binary) or $input_binary eq '' ){
	return 1;
	}
	return 0;
	};

	my $bitString;
	my $textString;
	while(1) {
	my %choices_hash = %{ $words_seen{context_to_string($context)} };
	#print Dumper \%choices_hash;

	my @keys = sort { ($choices_hash{$b} <=> $choices_hash{$a}) or ($a cmp $b) } (keys %choices_hash);
	my $max_bits = int(log(scalar(@keys))/log(2));

	if($mode eq "decode") {
	my $word = shift(@encodedWords);
	if(not defined $word) {
	#print "\n".$bitString."\n";
	return bitstring_to_bytes($bitString);
	}

	if($max_bits > 0){
	my $numericValue;
	for(my $i = 0; $i < scalar(@keys); ++$i){
	if($word eq $keys[$i]){
	$numericValue = $i;
	}
	}
	die "numeric value undef BAD" if(not defined $numericValue);
	#print $numericValue;
	#print "\n".dec2bin($max_bits, $numericValue)."\n";
	$bitString .= dec2bin($max_bits, $numericValue);

	}

	shift @$context;
	push @$context, $word;
	}
	if($mode eq "encode") {
	my @databits;
	# get databits
	for(1..$max_bits){
	my $bit = $getBit->();
	push @databits, $bit if defined $bit;

	}
	my $numeric_value = oct("0b".join("", @databits));


	#print $max_bits."\n";

	my $freq_sum=0;
	for my $freq(values %choices_hash){
	$freq_sum+=$freq;
	}

	# my $rand = int(rand()*$freq_sum);
	#
	# my $lower_limit=0;
	# my $choice_key;
	# for my $key(@keys){
	# $lower_limit += $choices_hash{$key};
	# if($rand < $lower_limit){
	# $choice_key=$key;
	# last;
	# }
	# }
	# if(not defined $choice_key){
	# warn "no choice index";
	# $choice_key = $keys[-1];
	# }

	my $choice_key = $keys[$numeric_value];
	#my $choice = $keys[ int(rand()* scalar(@keys)) ];
	my $choice = $choice_key;
	#die $choice;
	$textString .= " " unless $choice =~ /[.,?!:;]/;
	$textString .= $choice;

	shift @$context;
	push @$context, $choice;

	if($input_finished->()){
	#warn "input finished";
	return $textString;
	}
	}
	}
	}

	sub encode{
	my %words_seen = %{ shift() };
	my $n_context=shift;
	my $input=shift;
	return encode_or_decode('encode', \%words_seen, $n_context, $input);
	}
	sub decode{
	my %words_seen = %{ shift() };
	my $n_context=shift;
	my $input=shift;
	return encode_or_decode('decode', \%words_seen, $n_context, $input);
	}


	#Main script follows

	if("@ARGV" =~ /-(d\|e)\s+-sample=(.+?)$/){
	my $m=$1;
	my $sample_f = $2;
	my $mode;



	my $sample_text = slurp_file($sample_f);
	my $n_context=2;

	my %words_seen = %{build_markov_table($sample_text, $n_context)};

	#print Dumper \%words_seen;

	#my $plain_input = "She sells sea shells by the sea shore.";

	#read everything from stdin
	local $/;
	my $plain_input = <STDIN>;

	if($m eq 'd'){
	print decode(\%words_seen, 2, $plain_input);
	}elsif($m eq 'e'){
	print encode(\%words_seen, 2, $plain_input);
	}

	}else{
	die "Usage: ./markov.pl [-d or -e] -sample=<FILE>\nReads from STDIN and writes to STDOUT\n";
	}