|
use strict; |
|
use warnings; |
|
use 5.010; |
|
use Storable qw(dclone); |
|
use List::Util qw(first min max sum product any all none any all); |
|
use List::MoreUtils qw(pairwise natatime mesh zip uniq); |
|
|
|
################## |
|
## Declarations ## |
|
################## |
|
|
|
# Set function declarations |
|
|
|
sub set_new; |
|
sub set_from_list; |
|
sub set_add; |
|
sub set_remove; |
|
sub set_elements; |
|
sub set_next; |
|
sub set_size; |
|
sub set_union; |
|
sub set_intersection; |
|
sub set_difference; |
|
sub set_symmetric_difference; |
|
sub set_is_subset; |
|
sub set_is_empty; |
|
sub set_to_string; |
|
sub set_filter; |
|
sub set_contains; |
|
sub set_iter; |
|
sub set_map; |
|
sub set_fold; |
|
|
|
# Stack function declarations |
|
|
|
sub stack_new; |
|
sub stack_from_list; |
|
sub stack_push; |
|
sub stack_pop; |
|
sub stack_top; |
|
sub stack_is_empty; |
|
sub stack_to_string; |
|
sub stack_contains; |
|
sub stack_filter; |
|
sub stack_iter; |
|
sub stack_map; |
|
sub stack_fold; |
|
|
|
# Queue function declarations |
|
|
|
sub queue_new; |
|
sub queue_from_list; |
|
sub queue_enqueue; |
|
sub queue_dequeue; |
|
sub queue_is_empty; |
|
sub queue_peek; |
|
sub queue_to_string; |
|
sub queue_contains; |
|
sub queue_filter; |
|
sub queue_iter; |
|
sub queue_map; |
|
sub queue_fold; |
|
|
|
# Symbol enum & function declarations |
|
|
|
my $SYMBOL_Epsilon = \do { my $eps; }; |
|
my $SYMBOL_Endmarker = \do { my $endmrk; }; |
|
my $SYMBOL_Terminal = \do { my $term; }; |
|
my $SYMBOL_Nonterminal = \do { my $nterm; }; |
|
my $SYMBOL_Start = \do { my $start; }; |
|
|
|
sub symbol_new; |
|
sub symbol_new_epsilon; |
|
sub symbol_new_endmarker; |
|
sub symbol_new_terminal; |
|
sub symbol_new_nonterminal; |
|
sub symbol_new_start; |
|
sub symbol_is_epsilon; |
|
sub symbol_is_endmarker; |
|
sub symbol_is_terminal; |
|
sub symbol_is_nonterminal; |
|
sub symbol_is_start; |
|
sub symbol_get_value; |
|
sub symbol_list_filter; |
|
|
|
# Production function declarations |
|
|
|
sub prod_new; |
|
sub prod_get_head; |
|
sub prod_get_body; |
|
sub prod_get_all_nterms; |
|
sub prod_get_all_terms; |
|
sub prod_get_all_heads; |
|
sub prod_append_body; |
|
sub prod_list_filter; |
|
sub prod_get_by_head; |
|
|
|
# First/Follow/Predict function declarations |
|
|
|
sub ffp_compute_firsts; |
|
sub ffp_compute_follows; |
|
sub ffp_compute_predicts; |
|
|
|
# Automate State Transitions enum and function declarations |
|
|
|
my $TRANS_Epsilon = \do { my $epstran; }; |
|
my $TRANS_Symbolic = \do { my $symtran; }; |
|
|
|
sub trans_new; |
|
sub trans_new_epsilon; |
|
sub trans_new_symbolic; |
|
sub trans_new_dead; |
|
sub trans_is_epsilon; |
|
sub trans_is_symbolic; |
|
sub trans_is_dead; |
|
sub trans_get_from; |
|
sub trans_get_to; |
|
sub trans_get_symbol; |
|
sub trans_list_filter; |
|
|
|
# RegExp Token function declarations |
|
|
|
my $REGXTOK_Star = \do { my $restar; }; |
|
my $REGXTOK_Plus = \do { my $replus; }; |
|
my $REGXTOK_Opt = \do { my $reopt; }; |
|
my $REGXTOK_Alt = \do { my $realt; }; |
|
my $REGXTOK_Cat = \do { my $recat; }; |
|
my $REGXTOK_Brack = \do { my $rebrack; }; |
|
my $REGXTOK_Wildcard = \do { my $rewc; }; |
|
my $REGXTOK_Group = \do { my $regroup; }; |
|
my $REGXTOK_Bounds = \do { my $rebounds; }; |
|
my $REGXTOK_SOLAnchor = \do { my $resol; }; |
|
my $REGXTOK_EOLAnchor = \do { my $reeol; }; |
|
my $REGXTOK_StartWord = \do { my $resw; }; |
|
my $REGXTOK_EndWord = \do { my $reew; }; |
|
my $REGXTOK_Literal = \do { my $relit; }; |
|
my $REGXTOK_Escape = \do { my $reesc; }; |
|
my $REGXTOK_Trail = \do { my $retrail; }; |
|
my $REGXTOK_Keyword = \do { my $rekw; }; |
|
my $REGXTOK_Class = \do { my $recls; }; |
|
my $REGXTOK_Flag = \do { my $reflg; }; |
|
|
|
sub regxtok_new; |
|
sub rexgtok_is; |
|
sub regxtok_new_star; |
|
sub regxtok_new_plus; |
|
sub regxtok_new_opt; |
|
sub regxtok_new_alt; |
|
sub regxtok_new_cat; |
|
sub regxtok_new_brack; |
|
sub regxtok_new_wildcard; |
|
sub regxtok_new_group; |
|
sub regxtok_new_bounds; |
|
sub regxtok_new_solanchor; |
|
sub regxtok_new_eolanchor; |
|
sub regxtok_new_literal; |
|
sub regxtok_new_escape; |
|
sub regxtok_new_trail; |
|
sub regxtok_new_keyword; |
|
sub regxtok_new_class; |
|
sub regxtok_new_flag; |
|
sub regxtok_new_directive; |
|
sub regxtok_new_macro; |
|
sub regxtok_is_star; |
|
sub regxtok_is_plus; |
|
sub regxtok_is_opt; |
|
sub regxtok_is_alt; |
|
sub regxtok_is_cat; |
|
sub regxtok_is_brack; |
|
sub regxtok_is_wildcard; |
|
sub regxtok_is_group; |
|
sub regxtok_is_bounds; |
|
sub regxtok_is_solanchor; |
|
sub regxtok_is_eolanchor; |
|
sub regxtok_is_literal; |
|
sub regxtok_is_escape; |
|
sub regxtok_is_trail; |
|
sub regxtok_is_keyword; |
|
sub regxtok_is_class; |
|
sub regxtok_is_flag; |
|
sub regxtok_is_directive; |
|
sub regxtok_is_macro; |
|
sub regxtok_is_binary_op; |
|
sub regxtok_is_unary_op; |
|
sub regxtok_is_grouped; |
|
sub regxtok_is_assert; |
|
sub regxtok_is_charrep; |
|
|
|
# Lexer Grammar Parser enum & function declarations |
|
|
|
my $LEXERGRM_Pattern = \do { my $lgpatt; }; |
|
my $LEXERGRM_Directive = \do { my $lgdir; }; |
|
my $LEXERGRM_Decorator = \do { my $decr; }; |
|
|
|
sub lexergrm_parse_patt; |
|
sub lexergrm_parse_directive; |
|
sub lexergrm_parse_decorator; |
|
|
|
# Automaton State enum & function declarations |
|
|
|
my $STATE_Start = \do { my $sstt; }; |
|
my $STATE_Interm = \do { my $sirm; }; |
|
my $STATE_Accept = \do { my $ssacc; }; |
|
|
|
sub state_new; |
|
sub state_new_start; |
|
sub state_new_interm; |
|
sub state_new_accept; |
|
sub state_is_interm; |
|
sub state_is_accept; |
|
sub state_is_start; |
|
sub state_get_id; |
|
sub state_get_from_id; |
|
sub state_list_filter; |
|
sub state_collect_eps_trans; |
|
sub state_collect_moves; |
|
|
|
# NFA function declaration |
|
|
|
sub nfa_new; |
|
sub nfa_get_states; |
|
sub nfa_get_trans; |
|
sub nfa_get_alpha; |
|
sub nfa_get_start_state; |
|
sub nfa_get_accept_states; |
|
sub nfa_append_symbol; |
|
sub nfa_append_state; |
|
sub nfa_append_trans; |
|
sub nfa_epsilon_closure; |
|
sub nfa_to_dfa; |
|
|
|
# DFA function declarations |
|
|
|
sub dfa_new; |
|
sub dfa_complete; |
|
sub dfa_compute_invtrans; |
|
sub dfa_minimize; |
|
sub dfa_compile; |
|
|
|
###################### |
|
## Implementations ## |
|
###################### |
|
|
|
# Set function implementations |
|
|
|
sub set_new { |
|
my %set; |
|
@set{@_} = (1) x @_; |
|
return \%set; |
|
} |
|
|
|
sub set_from_list ($) { |
|
my $listref = shift; |
|
my $set = set_new; |
|
map { set_add $set, $_; } @$listref; |
|
return $set; |
|
} |
|
|
|
sub set_from_array (\@) { |
|
my $arrayref = shift; |
|
return set_from_list $arrayref; |
|
} |
|
|
|
sub set_add ($$) { |
|
my ( $set, $element ) = @_; |
|
$set->{$element} = 1; |
|
return $set; |
|
} |
|
|
|
sub set_remove ($$) { |
|
my ( $set, $element ) = @_; |
|
delete $set->{$element}; |
|
return $set; |
|
} |
|
|
|
sub set_elements ($) { |
|
my $set = shift; |
|
return keys %$set; |
|
} |
|
|
|
sub set_next ($) { |
|
return ( keys %{ $_[0] } )[0]; |
|
} |
|
|
|
sub set_size ($) { |
|
my $set = shift; |
|
return scalar keys %$set; |
|
} |
|
|
|
sub set_union ($$) { |
|
my ( $set1, $set2 ) = @_; |
|
return { %$set1, %$set2 }; |
|
} |
|
|
|
sub set_intersection ($$) { |
|
my ( $set1, $set2 ) = @_; |
|
my %result; |
|
foreach my $element ( keys %$set1 ) { |
|
$result{$element} = 1 if exists $set2->{$element}; |
|
} |
|
return \%result; |
|
} |
|
|
|
sub set_difference ($$) { |
|
my ( $set1, $set2 ) = @_; |
|
my %result; |
|
foreach my $element ( keys %$set1 ) { |
|
$result{$element} = 1 unless exists $set2->{$element}; |
|
} |
|
return \%result; |
|
} |
|
|
|
sub set_symmetric_difference ($$) { |
|
my ( $set1, $set2 ) = @_; |
|
my $diff1 = set_difference( $set1, $set2 ); |
|
my $diff2 = set_difference( $set2, $set1 ); |
|
return set_union( $diff1, $diff2 ); |
|
} |
|
|
|
sub set_is_subset ($$) { |
|
my ( $set1, $set2 ) = @_; |
|
foreach my $element ( keys %$set1 ) { |
|
return 0 unless exists $set2->{$element}; |
|
} |
|
return 1; |
|
} |
|
|
|
sub set_is_empty ($) { |
|
my $set = shift; |
|
return keys(%$set) == 0 ? 1 : 0; |
|
} |
|
|
|
sub set_to_string ($) { |
|
my $set = shift; |
|
return 'Set {' . join( ', ', sort set_elements($set) ) . '}'; |
|
} |
|
|
|
sub set_contains ($$) { |
|
my ( $set, $item ) = @_; |
|
return exists $set->{$item} ? $item : undef; |
|
} |
|
|
|
sub set_filter (&$) { |
|
my ( $pred, $set ) = @_; |
|
return grep $pred->($_), @$set; |
|
} |
|
|
|
sub set_iter (&$) { |
|
my ( $applyfn, $set ) = @_; |
|
foreach my $elt ( @{ set_elements $set } ) { |
|
&applyfn->($elt); |
|
} |
|
} |
|
|
|
sub set_map (&$) { |
|
my ( $mapfn, $set ) = @_; |
|
my $set_prime = set_new; |
|
foreach my $elt ( @{ set_elements $set } ) { |
|
set_add( $set_prime, $mapfn->($elt) ); |
|
} |
|
return $set_prime; |
|
} |
|
|
|
sub set_fold (&$$) { |
|
my ( $foldfn, $initarray, $set ) = @_; |
|
my $set_acc = set_from_list $initarray; |
|
foreach my $elt ( @{ set_elements $set_acc } ) { |
|
set_add( $set_acc, $foldfn->( $set_acc, $elt ) ); |
|
} |
|
return $set_acc; |
|
} |
|
|
|
# Stack function implementation |
|
|
|
sub stack_new { |
|
return []; |
|
} |
|
|
|
sub stack_from_list ($) { |
|
my $list = shift; |
|
my $stack = stack_new; |
|
map { stack_push $stack, $_ } @$list; |
|
return $stack; |
|
} |
|
|
|
sub stack_push ($$) { |
|
my ( $stack, $item ) = @_; |
|
push @$stack, $item; |
|
} |
|
|
|
sub stack_pop ($) { |
|
my $stack = shift; |
|
return pop @$stack; |
|
} |
|
|
|
sub stack_top ($) { |
|
my $stack = shift; |
|
return $stack->[-1]; |
|
} |
|
|
|
sub stack_is_empty ($) { |
|
my $stack = shift; |
|
return scalar(@$stack) == 0; |
|
} |
|
|
|
sub stack_to_string ($) { |
|
my $stack = shift; |
|
my @items = @$stack; |
|
|
|
return "Stack [" . join( " ← ", @items ) . "]"; |
|
} |
|
|
|
sub stack_contains ($$) { |
|
my ( $stack, $item ) = @_; |
|
for ( reverse @$stack ) { |
|
return $_ if $_ == $item; |
|
} |
|
return undef; |
|
} |
|
|
|
sub stack_filter (&$) { |
|
my ( $pred, $stack ) = @_; |
|
return grep $pred->($_), @$stack; |
|
} |
|
|
|
sub stack_iter (&$) { |
|
my ( $applyfn, $stack ) = @_; |
|
foreach my $member (@$stack) { |
|
&applyfn->($member); |
|
} |
|
} |
|
|
|
sub stack_map (&$) { |
|
my ( $mapfn, $stack ) = @_; |
|
my $stack_prime = stack_new; |
|
foreach my $member (@$stack) { |
|
stack_push( $stack_prime, $mapfn->($member) ); |
|
} |
|
return $stack_prime; |
|
} |
|
|
|
sub stack_fold (&$$) { |
|
my ( $foldfn, $initarray, $stack ) = @_; |
|
my $stack_acc = stack_from_list $initarray; |
|
foreach my $member (@$stack_acc) { |
|
stack_push( $stack_acc, $foldfn->( $stack_acc, $member ) ); |
|
} |
|
return $stack_acc; |
|
} |
|
|
|
# Queue function definitions |
|
|
|
sub queue_new { |
|
return []; |
|
} |
|
|
|
sub queue_from_list ($) { |
|
my $list = shift; |
|
my $queue = queue_new; |
|
map { queue_enqueue $queue, $_ } @$list; |
|
return $queue; |
|
} |
|
|
|
sub queue_enqueue ($$) { |
|
my ( $queue, $item ) = @_; |
|
push @$queue, $item; |
|
} |
|
|
|
sub queue_dequeue ($) { |
|
my $queue = shift; |
|
return shift @$queue; |
|
} |
|
|
|
sub queue_is_empty ($) { |
|
my $queue = shift; |
|
return !@$queue; |
|
} |
|
|
|
sub queue_peek ($) { |
|
my $queue = shift; |
|
return $queue->[0]; |
|
} |
|
|
|
sub queue_to_string ($) { |
|
my $queue = shift; |
|
return "Queue [" . join( ", ", @$queue ) . "]"; |
|
} |
|
|
|
sub queue_filter (&$) { |
|
my ( $pred, $queue ) = @_; |
|
return grep $pred->($_), @$queue; |
|
} |
|
|
|
sub queue_contains ($$) { |
|
my ( $queue, $item ) = @_; |
|
for (@$queue) { |
|
return $_ if $_ == $item; |
|
} |
|
return undef; |
|
} |
|
|
|
sub queue_iter (&$) { |
|
my ( $applyfn, $queue ) = @_; |
|
foreach my $member (@$queue) { |
|
&applyfn->($member); |
|
} |
|
} |
|
|
|
sub queue_niter (&$) { |
|
my ( $applyfn, $queue ) = @_; |
|
foreach my $i ( 0 .. $#{$queue} ) { |
|
$applyfn->( $i, $queue->[$i] ); |
|
} |
|
} |
|
|
|
sub queue_map (&$) { |
|
my ( $mapfn, $queue ) = @_; |
|
my $queue_prime = queue_new; |
|
foreach my $member (@$queue) { |
|
queue_enqueue( $queue_prime, $mapfn->($member) ); |
|
} |
|
return $queue_prime; |
|
} |
|
|
|
sub queue_fold (&$$) { |
|
my ( $foldfn, $initarray, $queue ) = @_; |
|
my $queue_acc = queue_from_list $initarray; |
|
foreach my $member (@$queue_acc) { |
|
queue_enqueue( $queue_acc, $foldfn->( $queue_acc, $member ) ); |
|
} |
|
return $queue_acc; |
|
} |
|
|
|
# Symbol function definitions |
|
|
|
sub symbol_new ($$) { |
|
my ( $type, $value ) = @_; |
|
return [ $type, $value ]; |
|
} |
|
|
|
sub symbol_new_epsilon { |
|
return symbol_new( $SYMBOL_Epsilon, undef ); |
|
} |
|
|
|
sub symbol_new_endmarker { |
|
return symbol_new( $SYMBOL_Endmarker, undef ); |
|
} |
|
|
|
sub symbol_new_terminal ($) { |
|
return symbol_new( $SYMBOL_Terminal, shift ); |
|
} |
|
|
|
sub symbol_new_nonterminal ($) { |
|
|
|
return symbol_new( $SYMBOL_Nonterminal, shift ); |
|
} |
|
|
|
sub symbol_new_start ($) { |
|
return symbol_new( $SYMBOL_Start, shift ); |
|
} |
|
|
|
sub symbol_is_epsilon { $_[0][0] == $SYMBOL_Epsilon } |
|
sub symbol_is_endmarker { $_[0][0] == $SYMBOL_Endmarker } |
|
sub symbol_is_terminal { $_[0][0] == $SYMBOL_Terminal } |
|
sub symbol_is_nonterminal { $_[0][0] == $SYMBOL_Nonterminal } |
|
sub symbol_is_start { $_[0][0] == $SYMBOL_Start } |
|
|
|
sub symbol_get_value { $_[0][1]; } |
|
|
|
sub symbol_list_filter (&$) { |
|
my ( $pred, $list ) = @_; |
|
return grep $pred->($_), @$list; |
|
} |
|
|
|
# Production function definitions |
|
|
|
sub prod_new ($$) { |
|
my ( $head, $body ) = @_; |
|
return [ $head, $body ]; |
|
} |
|
|
|
sub prod_get_head { $_[0][0]; } |
|
sub prod_get_body { $_[0][1]; } |
|
|
|
sub prod_get_all_nterms ($) { |
|
my ($prod) = @_; |
|
my $prod_body = prod_get_body $prod; |
|
my @nterms; |
|
|
|
foreach my $sym (@$prod_body) { |
|
push @nterms, $sym if symbol_is_nonterminal $sym; |
|
} |
|
|
|
return \@nterms; |
|
} |
|
|
|
sub prod_get_all_terms ($) { |
|
my ($prod) = @_; |
|
my $prod_body = prod_get_body $prod; |
|
my @terms; |
|
|
|
foreach my $sym (@$prod_body) { |
|
push @terms, $sym if symbol_is_terminal $sym; |
|
} |
|
|
|
return \@terms; |
|
} |
|
|
|
sub prod_get_all_heads { |
|
my ($prods) = @_; |
|
my @heads; |
|
map { push @heads, prod_get_head $_ } @$prods; |
|
return \@heads; |
|
} |
|
|
|
sub prod_append_body ($$) { |
|
my ( $prod, $symbols ) = @_; |
|
my $prod_body = prod_get_body $prod; |
|
map { push @$prod_body, $_ } @$symbols; |
|
return [ prod_get_head $prod, $prod_body ]; |
|
} |
|
|
|
sub prod_list_filter (&$) { |
|
my ( $pred, $list ) = @_; |
|
return grep $pred->($_), @$list; |
|
} |
|
|
|
sub prod_get_by_head ($$) { |
|
my ( $prod_list, $head ) = @_; |
|
my $filtered = prod_list_filter { prod_get_head $_ == $head } $prod_list; |
|
return $filtered; |
|
} |
|
|
|
# First/Follow/Predict function definitions |
|
|
|
sub ffp_compute_firsts ($) { |
|
my ($prods) = @_; |
|
my $firsts = {}; |
|
my $eps = {}; |
|
|
|
my $prod_is_eps = sub { |
|
my ($head) = @_; |
|
return symbol_is_epsilon( prod_get_body($head)->[0] ); |
|
}; |
|
|
|
my $arrays_eq = sub { |
|
my ( $arr1, $arr2 ) = @_; |
|
|
|
foreach my $elt1 (@$arr1) { |
|
foreach my $elt2 (@$arr2) { |
|
return 0 if $elt1 ne $elt2; |
|
} |
|
} |
|
|
|
return 1; |
|
}; |
|
|
|
my $firsts_converged = sub { |
|
my ($old_firsts) = @_; |
|
|
|
foreach my ( $k1, $v1 ) (%$firsts) { |
|
foreach my ( $k2, $v2 ) (%$old_firsts) { |
|
if ( $k1 eq $k2 ) { |
|
return 0 if not $arrays_eq->( $v1, $v2 ); |
|
} |
|
} |
|
} |
|
|
|
return 1; |
|
}; |
|
|
|
my $eps_converged = sub { |
|
my ($old_eps) = @_; |
|
|
|
foreach my ( $k1, $v1 ) (%$eps) { |
|
foreach my ( $k2, $v2 ) (%$old_eps) { |
|
if ( $k1 eq $k2 ) { |
|
return 0 if $v1 != $v2; |
|
} |
|
} |
|
} |
|
|
|
return 1; |
|
}; |
|
|
|
foreach my $prod (@$prods) { |
|
my $curr_body = prod_get_body($prod); |
|
foreach my $sym (@$curr_body) { |
|
$firsts->{$sym} = [$sym]; |
|
$eps->{$sym} = 0; |
|
} |
|
} |
|
|
|
foreach my $head ( @${ prod_get_all_heads $prods } ) { |
|
$eps->{$head} = $prod_is_eps->($head) ? 1 : 0; |
|
$firsts->{$head} = []; |
|
} |
|
|
|
my $old_firsts = {}; |
|
my $old_eps = {}; |
|
until ( $firsts_converged->($old_firsts) && $eps_converged->($old_eps) ) { |
|
$old_firsts = dclone($firsts); |
|
$old_eps = dclone($eps); |
|
|
|
OUTER: |
|
foreach my $prod (@$prods) { |
|
my $curr_head = prod_get_head($prod); |
|
my $curr_body = prod_get_body($prod); |
|
|
|
foreach my $body_sym (@$curr_body) { |
|
push @{ $firsts->{$curr_head} }, $firsts->{$body_sym}; |
|
next OUTER if not $eps->{$body_sym}; |
|
} |
|
$eps->{$curr_head} = 1; |
|
} |
|
} |
|
return [ $firsts, $eps ]; |
|
} |
|
|
|
sub ffp_compute_follows ($) { |
|
my ($prods) = @_; |
|
my ( $firsts, $eps ) = @{ ffp_compute_firsts $prods }; |
|
my $follows = {}; |
|
|
|
my $symstring_is_eps = sub { |
|
my ($symstring) = @_; |
|
foreach my $ss (@$symstring) { |
|
return 0 if not $eps->{$ss}; |
|
} |
|
return 1; |
|
}; |
|
|
|
my $symstring_firsts = sub { |
|
my ($symstring) = @_; |
|
my @result; |
|
foreach my $ss (@$symstring) { |
|
push @result, $firsts->{$ss}; |
|
last if not $eps->{$ss}; |
|
} |
|
return \@result; |
|
}; |
|
|
|
my $arrays_eq = sub { |
|
my ( $arr1, $arr2 ) = @_; |
|
|
|
foreach my $elt1 (@$arr1) { |
|
foreach my $elt2 (@$arr2) { |
|
return 0 if $elt1 ne $elt2; |
|
} |
|
} |
|
|
|
return 1; |
|
}; |
|
|
|
my $arrays_merge = sub { |
|
my ( $arr1, $arr2 ) = @_; |
|
my $merged = dclone($arr1); |
|
|
|
foreach my $elt (@$arr2) { |
|
push @$merged, $elt; |
|
} |
|
|
|
return $merged; |
|
}; |
|
|
|
my $follows_converged = sub { |
|
my ($old_follows) = @_; |
|
|
|
foreach my ( $k1, $v1 ) (%$follows) { |
|
foreach my ( $k2, $v2 ) (%$old_follows) { |
|
if ( $k1 eq $k2 ) { |
|
return 0 if not $arrays_eq->( $v1, $v2 ); |
|
} |
|
} |
|
} |
|
|
|
return 1; |
|
}; |
|
|
|
foreach my $sym ( keys %$firsts ) { |
|
$follows->{$sym} = []; |
|
} |
|
|
|
my $old_follows = {}; |
|
until ( $follows_converged->($old_follows) ) { |
|
$old_follows = dclone($follows); |
|
|
|
foreach my $prod (@$prods) { |
|
my $curr_nterms = prod_get_nterms $prod; |
|
my $curr_body = prod_get_body $prod; |
|
my ( @after, $is_after ) = ( (), 0 ); |
|
|
|
foreach my $nterm (@$curr_nterms) { |
|
foreach my $sym (@$curr_body) { |
|
push @after, $sym if $is_after; |
|
$is_after = 1 if $sym eq $nterm; |
|
} |
|
} |
|
|
|
foreach my $nterm (@$curr_nterms) { |
|
push @{ $follows->{$nterm} }, $symstring_firsts->( \@after ); |
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
# Automaton State Transitions function definitions |
|
|
|
sub trans_new ($$$$) { |
|
my ( $type, $from, $to, $symbol ) = @_; |
|
return [ $type, $from, $to, $symbol ]; |
|
} |
|
|
|
sub trans_new_epsilon ($$) { |
|
my ( $from, $to ) = @_; |
|
return trans_new( $TRANS_Epsilon, undef, undef, undef ); |
|
} |
|
|
|
sub trans_new_symbolic ($$$) { |
|
my ( $from, $to, $symbol ) = @_; |
|
return trans_new( $TRANS_Symbolic, $from, $to, $symbol ); |
|
} |
|
|
|
sub trans_new_dead ($$) { |
|
my ( $from, $symbol ) = @_; |
|
return trans_new( $TRANS_Symbolic, $from, -1, $symbol ); |
|
} |
|
|
|
sub trans_is_epsilon { $_[0][0] == $TRANS_Epsilon; } |
|
sub trans_is_symbolic { $_[0][0] == $TRANS_Symbolic; } |
|
sub trans_is_dead { $_[0][2] == -1; } |
|
|
|
sub trans_get_from { $_[0][1]; } |
|
sub trans_get_to { $_[0][2]; } |
|
sub trans_get_symbol { $_[0][3]; } |
|
|
|
sub trans_list_filter (&$) { |
|
my ( $pred, $list ) = @_; |
|
return grep $pred->($_), @$list; |
|
} |
|
|
|
# RegExp Token function definitions |
|
|
|
sub regxtok_new ($$) { |
|
my ( $type, $value ) = @_; |
|
return [ $type, $value ]; |
|
} |
|
|
|
sub regxtok_is ($$) { $_[0][0] == $_[1] } |
|
|
|
sub regxtok_new_star { regxtok_new( $REGXTOK_Star, $_[0] ) } |
|
sub regxtok_new_plus { regxtok_new( $REGXTOK_Plus, $_[0] ) } |
|
sub regxtok_new_opt { regxtok_new( $REGXTOK_Opt, $_[0] ) } |
|
sub regxtok_new_alt { regxtok_new( $REGXTOK_Alt, $_[0] ) } |
|
sub regxtok_new_cat { regxtok_new( $REGXTOK_Cat, $_[0] ) } |
|
sub regxtok_new_brack { regxtok_new( $REGXTOK_Brack, $_[0] ) } |
|
sub regxtok_new_wildcard { regxtok_new( $REGXTOK_Wildcard, $_[0] ) } |
|
sub regxtok_new_group { regxtok_new( $REGXTOK_Group, $_[0] ) } |
|
sub regxtok_new_bounds { regxtok_new( $REGXTOK_Bounds, $_[0] ) } |
|
sub regxtok_new_solanchor { regxtok_new( $REGXTOK_SOLAnchor, $_[0] ) } |
|
sub regxtok_new_eolanchor { regxtok_new( $REGXTOK_EOLAnchor, $_[0] ) } |
|
sub regxtok_new_literal { regxtok_new( $REGXTOK_Literal, $_[0] ) } |
|
sub regxtok_new_escape { regxtok_new( $REGXTOK_Escape, $_[0] ) } |
|
sub regxtok_new_trail { regxtok_new( $REGXTOK_Trail, $_[0] ) } |
|
sub regxtok_new_keyword { regxtok_new( $REGXTOK_Keyword, $_[0] ) } |
|
sub regxtok_new_class { regxtok_new( $REGXTOK_Class, $_[0] ) } |
|
sub regxtok_new_flag { regxtok_new( $REGXTOK_Flag, $_[0] ) } |
|
|
|
sub regxtok_is_star { regxtok_is( $_[0], $REGXTOK_Star ) } |
|
sub regxtok_is_plus { regxtok_is( $_[0], $REGXTOK_Plus ) } |
|
sub regxtok_is_opt { regxtok_is( $_[0], $REGXTOK_Opt ) } |
|
sub regxtok_is_alt { regxtok_is( $_[0], $REGXTOK_Alt ) } |
|
sub regxtok_is_cat { regxtok_is( $_[0], $REGXTOK_Cat ) } |
|
sub regxtok_is_brack { regxtok_is( $_[0], $REGXTOK_Brack ) } |
|
sub regxtok_is_wildcard { regxtok_is( $_[0], $REGXTOK_Wildcard ) } |
|
sub regxtok_is_group { regxtok_is( $_[0], $REGXTOK_Group ) } |
|
sub regxtok_is_bounds { regxtok_is( $_[0], $REGXTOK_Bounds ) } |
|
sub regxtok_is_solanchor { regxtok_is( $_[0], $REGXTOK_SOLAnchor ) } |
|
sub regxtok_is_eolanchor { regxtok_is( $_[0], $REGXTOK_EOLAnchor ) } |
|
sub regxtok_is_literal { regxtok_is( $_[0], $REGXTOK_Literal ) } |
|
sub regxtok_is_escape { regxtok_is( $_[0], $REGXTOK_Escape ) } |
|
sub regxtok_is_trail { regxtok_is( $_[0], $REGXTOK_Trail ) } |
|
sub regxtok_is_keyword { regxtok_is( $_[0], $REGXTOK_Keyword ) } |
|
sub regxtok_is_class { regxtok_is( $_[0], $REGXTOK_Class ) } |
|
sub regxtok_is_flag { regxtok_is( $_[0], $REGXTOK_Flag ) } |
|
|
|
sub regxtok_is_binary_op { |
|
my ($token) = @_; |
|
my $type = $token->[0]; |
|
return |
|
$type == $REGXTOK_Alt |
|
|| $type == $REGXTOK_Cat |
|
|| $type == $REGXTOK_Trail; |
|
} |
|
|
|
sub regxtok_is_unary_op { |
|
my ($token) = @_; |
|
my $type = $token->[0]; |
|
return |
|
$type == $REGXTOK_Star |
|
|| $type == $REGXTOK_Plus |
|
|| $type == $REGXTOK_Opt |
|
|| $type == $REGXTOK_Bounds; |
|
} |
|
|
|
sub regxtok_is_assert { |
|
my ($token) = @_; |
|
my $type = $token->[0]; |
|
return $type == $REGXTOK_SOLAnchor |
|
|| $type == $REGXTOK_EOLAnchor; |
|
} |
|
|
|
sub regxtok_is_grouped { |
|
my ($token) = @_; |
|
my $type = $token->[0]; |
|
return $type == $REGXTOK_Group |
|
|| $type == $REGXTOK_Brack; |
|
} |
|
|
|
sub regxtok_is_charrep { |
|
my ($token) = @_; |
|
my $type = $token->[0]; |
|
return |
|
$type == $REGXTOK_Literal |
|
|| $type == $REGXTOK_Wildcard |
|
|| $type == $REGXTOK_Keyword |
|
|| $type == $REGXTOK_Class |
|
|| $type == $REGXTOK_Escape; |
|
} |
|
|
|
sub regxtok_list_filter (&$) { |
|
my ( $pred, $regxtok_list ) = @_; |
|
return grep $pred->($_), @$regxtok_list; |
|
} |
|
|
|
# Lexer Grammar Parser functio definitions |
|
|
|
sub lexergrm_parse_pattern { |
|
my ($args) = @_; |
|
my ( $patt, $pattnm, $offs ) = @$args{qw(pattern name offs)}; |
|
my @tokens; |
|
my $len = length($patt); |
|
my $i = 0; |
|
|
|
my $unescape_escape_seq = sub { |
|
my ( $seq, $mode ) = @_; |
|
my %esc_charmap = ( |
|
'n' => "\n", |
|
't' => "\t", |
|
'r' => "\r", |
|
'f' => "\f", |
|
'a' => "\a", |
|
'e' => "\e", |
|
'b' => "\b", |
|
'"' => '"', |
|
"'" => "'", |
|
'?' => '?', |
|
's' => " \t\n\r", |
|
'S' => join( '', '!' .. '~' ), |
|
'w' => join( '', 'a' .. 'z', 'A' .. 'Z' ), |
|
'W' => join( '', ' ' .. '@', '[' .. '`', '{', '~' ), |
|
'D' => join( '', ' ' .. '/', ':' .. '~' ), |
|
'd' => '0' .. '9', |
|
'p' => '~!@#$%^&*()+_-?.,"\':', |
|
'h' => join( '', 'a' .. 'f', 'A' .. 'F', '0' .. '9' ), |
|
'o' => '01234567', |
|
'.' => '.', |
|
'(' => '(', |
|
'$' => '$', |
|
'^' => '^', |
|
'[' => '[', |
|
'{' => '{', |
|
'/' => '/', |
|
'\\' => '\\', |
|
'*' => '*', |
|
'+' => '+', |
|
'~' => '~', |
|
'|' => '|', |
|
); |
|
if ( $mode eq 'CHAR' ) { |
|
return $esc_charmap{$seq} |
|
|| die "Invalid escape at $pattnm, offset $offs, character $i;"; |
|
} |
|
elsif ( $mode eq 'HEX' && $seq =~ m/[a-fA-F0-9]+/ ) { |
|
return pack( 'H2', $seq ); |
|
} |
|
elsif ( $mode eq 'UNICODE' && $seq =~ m/[a-fA-F0-9]+/ ) { |
|
return pack( 'U', hex($seq) ); |
|
} |
|
}; |
|
|
|
my $unescape_literal_seq = sub { |
|
my ($lit_seq) = @_; |
|
my $seq_len = length($lit_seq); |
|
my $unescaped = ""; |
|
my $i = 0; |
|
|
|
while ( $i++ < $seq_len ) { |
|
if ( substr( $lit_seq, $i, 1 ) ne '\\' ) { |
|
$unescaped .= substr( $lit_seq, $i, 1 ); |
|
next; |
|
} |
|
else { |
|
$i++; |
|
my $seq_first = substr( $lit_seq, $i++, 1 ); |
|
|
|
if ( lc($seq_first) eq 'x' ) { |
|
$unescaped .= |
|
$unescape_escape_seq->( substr( $lit_seq, $i, 2 ), |
|
'HEX' ); |
|
$i += 2; |
|
next; |
|
} |
|
elsif ( lc($seq_first) eq 'u' ) { |
|
$unescaped .= $unescape_escape_seq->( |
|
substr( $lit_seq, $i, 4 ), 'UNICODE' |
|
); |
|
$i += 4; |
|
next; |
|
} |
|
else { |
|
$unescaped .= $unescape_escape_seq->( $seq_first, 'CHAR' ); |
|
next; |
|
} |
|
} |
|
return $unescaped; |
|
} |
|
|
|
}; |
|
|
|
my $collect_literal = sub { |
|
my $start = $i; |
|
while ( $i < $len |
|
&& ( substr( $patt, $i, 1 ) !~ m{[\[\]\(\)\{\}\*\+\?\|~\$\./\\]} ) ) |
|
{ |
|
$i++; |
|
} |
|
return if $start == $i; |
|
push @tokens, |
|
regxtok_new_literal( |
|
$unescape_literal_seq->( substr( $patt, $start, $i - $start ) ) ); |
|
}; |
|
|
|
my $eval_brack_expr = sub { |
|
my ($brack_expr) = @_; |
|
my $len = length($brack_expr); |
|
my $evaluated = ""; |
|
my $i = 0; |
|
|
|
while ( $i < $len - 1 ) { |
|
my $c = substr( $brack_expr, $i++, 1 ); |
|
if ( substr( $brack_expr, $i, 1 ) eq '-' ) { |
|
my $c_next = substr( $brack_expr, ++$i, 1 ); |
|
$evaluated .= "$c" .. "$c_next"; |
|
next; |
|
} |
|
else { |
|
$evaluated .= "$c"; |
|
} |
|
|
|
$evaluated .= substr( $brack_expr, $i, $len - 1 ); |
|
} |
|
return $evaluated; |
|
}; |
|
|
|
while ( $i < $len ) { |
|
my $c = substr( $patt, $i, 1 ); |
|
|
|
# BEGINNING OF PATTERN |
|
if ( $i == 0 && $c eq '^' ) { |
|
push @tokens, regxtok_new_solanchor(undef); |
|
$i++; |
|
next; |
|
} |
|
|
|
# POSSIBLE END OF PATTERN 1 |
|
if ( ( $i == $len - 1 || substr( $patt, $i + 1, 2 ) eq '/{' ) |
|
&& $c eq '$' ) |
|
{ |
|
push @tokens, regxtok_new_eolanchor(undef); |
|
$i++; |
|
next; |
|
} |
|
|
|
# POSSIBLE END OF PATTERN 2 |
|
if ( substr( $patt, $i, 2 ) eq "/{" ) { |
|
$i += 2; |
|
my @flags; |
|
while ( $i <= $len - 1 ) { |
|
last if substr( $patt, $i, 1 ) eq '}'; |
|
push @flags, substr( $patt, $i++, 1 ); |
|
} |
|
if ( $i == $len - 1 ) { |
|
die |
|
"Unclosed flags '/{}' in pattern $pattnm, offset $offs, character $i;"; |
|
} |
|
push @tokens, regxtok_new_flag( \@flags ); |
|
last; |
|
} |
|
|
|
# GROUP |
|
if ( $c eq '(' ) { |
|
my $depth = 1; |
|
my $j = $i + 1; |
|
while ( $j < $len && $depth > 0 ) { |
|
my $cc = substr( $patt, $j, 1 ); |
|
$depth++ if $cc eq '('; |
|
$depth-- if $cc eq ')'; |
|
$j++; |
|
} |
|
if ( $depth != 0 ) { |
|
die |
|
"Unmatched '(' in pattern $pattnm, offset $offs, character $i;"; |
|
} |
|
my $group_str = substr( $patt, $i + 1, $j - $i - 2 ); |
|
push @tokens, |
|
regxtok_new_group( |
|
lexergrm_parse_pattern( |
|
{ |
|
pattern => $group_str, |
|
name => $pattnm, |
|
offs => $len - $j |
|
} |
|
) |
|
); |
|
$i = $j; |
|
next; |
|
} |
|
|
|
# CHAR CLASS |
|
if ( $c eq '[' ) { |
|
my $j = $i + 1; |
|
my $in_class = 1; |
|
my $negated = 0; |
|
|
|
if ( substr( $patt, $j, 2 ) eq "[:" ) { |
|
$j += 2; |
|
my $start = $j; |
|
|
|
while ( $j < $len ) { |
|
last if substr( $patt, $j, 3 ) eq ":]]"; |
|
$j++; |
|
} |
|
|
|
if ( $j == $len ) { |
|
die |
|
"Unclosed '[[:' in pattern $pattnm, offset $offs, character $i;"; |
|
} |
|
my $class = substr( $patt, $start, $j - $start ); |
|
push @tokens, regxtok_new_class($class); |
|
$j += 3; |
|
next; |
|
} |
|
|
|
if ( substr( $patt, $j, 1 ) eq '^' ) { |
|
$negated = 1; |
|
$j++; |
|
} |
|
|
|
while ( $j < $len && $in_class ) { |
|
if ( substr( $patt, $j, 1 ) eq ']' ) { |
|
$in_class = 0; |
|
} |
|
$j++; |
|
} |
|
if ($in_class) { |
|
die |
|
"Unmatched '[' in pattern $pattnm, offset $offs, character $i;"; |
|
} |
|
my $cc_str = substr( $patt, $i, $j - $i ); |
|
push @tokens, |
|
regxtok_new_brack( |
|
{ class => $eval_brack_expr->($cc_str), negaged => $negated } ); |
|
$i = $j; |
|
next; |
|
} |
|
|
|
# TRAIL (foo/bar) |
|
if ( $c eq '/' ) { |
|
my $before_trail = pop @tokens; |
|
$i++; |
|
my $after_start = $i; |
|
while ( $i < $len ) { $i++ } |
|
my $after_trail = lexergrm_parse_patt( |
|
{ |
|
pattern => |
|
substr( $patt, $after_start, $len - $after_start ), |
|
name => $pattnm, |
|
offs => $len - $i, |
|
} |
|
); |
|
push @tokens, regxtok_new_trail( [ $before_trail, $after_trail ] ); |
|
last; |
|
} |
|
|
|
# KEYWORD ("foo") |
|
if ( $c eq '"' ) { |
|
my $start = $i + 1; |
|
my $j = ++$i; |
|
while ( $j < $len ) { |
|
last if substr( $patt, $j, 1 ) eq '"'; |
|
$j++; |
|
} |
|
if ( $j == $len ) { |
|
die |
|
"Unmatched '\"' in pattern $pattnm, offset $offs, character $i;"; |
|
} |
|
my $keyword = substr( $patt, $start, $j - $start ); |
|
push @tokens, |
|
regxtok_new_keyword( $unescape_literal_seq->($keyword) ); |
|
$i = $j + 1; |
|
next; |
|
} |
|
|
|
# BOUNDS {m,n} |
|
if ( $c eq '{' ) { |
|
my $j = $i + 1; |
|
while ( $j < $len && substr( $patt, $j, 1 ) ne '}' ) { $j++ } |
|
if ( $j == $len ) { |
|
die |
|
"Unmatched '{' in pattern $pattnm, offset $offs, character $i;"; |
|
} |
|
my $bounds_str = substr( $patt, $i + 1, $j - $i - 1 ); |
|
my @bounds = split /,/, $bounds_str; |
|
@bounds = map { /^\s*(\d+)\s*$/ ? $1 : undef } @bounds; |
|
push @tokens, regxtok_new_bounds( \@bounds ); |
|
$i = $j + 1; |
|
next; |
|
} |
|
|
|
# OPERATORS |
|
if ( $c eq '*' ) { |
|
my $before_closure = pop @tokens; |
|
push @tokens, regxtok_new_star($before_closure); |
|
$i++; |
|
next; |
|
} |
|
if ( $c eq '+' ) { |
|
my $before_plus = pop @tokens; |
|
push @tokens, regxtok_new_plus($before_plus); |
|
$i++; |
|
next; |
|
} |
|
if ( $c eq '?' ) { |
|
my $before_option = pop @tokens; |
|
push @tokens, regxtok_new_opt($before_option); |
|
$i++; |
|
next; |
|
} |
|
if ( $c eq '|' ) { |
|
my $before_alt = pop @tokens; |
|
my $after_start = $i++; |
|
while ( $i < $len ) { $i++; } |
|
my $after_alt = lexergrm_parse_patt( |
|
{ |
|
pattern => substr( $patt, $i, $i - $after_start ), |
|
name => $pattnm, |
|
offs => $len - $i |
|
} |
|
); |
|
push @tokens, regxtok_new_alt( [ $before_alt, $after_alt ] ); |
|
next; |
|
} |
|
if ( $c eq '~' ) { |
|
my $before_cat = pop @tokens; |
|
my $after_start = $i++; |
|
while ( $i < $len ) { $i++ } |
|
my $after_cat = lexergrm_parse_patt( |
|
{ |
|
pattern => substr( $patt, $i, $i - $after_start ), |
|
name => $pattnm, |
|
offs => $len - 1, |
|
} |
|
); |
|
push @tokens, regextok_new_cat( [ $before_cat, $after_cat ] ); |
|
next; |
|
} |
|
if ( $c eq '.' ) { |
|
push @tokens, regxtok_new_wildcard(undef); |
|
$i++; |
|
next; |
|
} |
|
|
|
# LITERAL RUN |
|
$collect_literal->(); |
|
} |
|
|
|
return \@tokens; |
|
} |
|
|
|
our $STATE_ID_COUNTER = 0; |
|
|
|
sub state_new { |
|
return [ $STATE_ID_COUNTER++, shift ]; |
|
} |
|
|
|
sub state_new_start { |
|
return state_new( $STATE_Start, shift ); |
|
} |
|
|
|
sub state_new_interm { |
|
return state_new( $STATE_Interm, shift ); |
|
} |
|
|
|
sub state_new_accept { |
|
return state_new( $STATE_Accept, shift ); |
|
} |
|
|
|
sub state_is_start { $_[0][1] == $STATE_Start; } |
|
sub state_is_interm { $_[0][1] == $STATE_Interm; } |
|
sub state_is_accept { $_[0][1] == $STATE_Accept; } |
|
|
|
sub state_get_id { $_[0][0]; } |
|
|
|
sub state_list_filter (&$) { |
|
my ( $pred, $list ) = @_; |
|
return grep $pred->($_), @$list; |
|
} |
|
|
|
sub state_get_from_id ($$) { |
|
my ( $states_haystack, $id_needle ); |
|
my $filtered = |
|
state_list_filter { state_get_id $_ == $id_needle } $states_haystack; |
|
return $filtered->[0]; |
|
} |
|
|
|
sub state_collect_eps_trans ($$) { |
|
my ( $state, $trans ) = @_; |
|
my $state_id = state_get_id $state; |
|
return trans_list_filter { |
|
trans_is_epsilon $_ |
|
&& ( $state_id == trans_get_from $_ ) |
|
} |
|
$trans; |
|
} |
|
|
|
sub state_collect_noneps_trans ($$) { |
|
my ( $state, $trans ) = @_; |
|
my $state_id = state_get_id $state; |
|
return trans_list_filter { |
|
!trans_is_epsilon($_) |
|
&& ( $state_id == trans_get_from $_ ) |
|
} |
|
$trans; |
|
} |
|
|
|
sub state_collect_moves ($$) { |
|
my ( $state, $trans ) = @_; |
|
my $state_id = state_get_id $state; |
|
my $moves = []; |
|
map { |
|
push @$moves, [ trans_get_to $_, trans_get_symbol $_ ] |
|
if trans_is_symbolic $_ |
|
&& trans_get_from $_ == $state_id; |
|
} $trans; |
|
return $moves; |
|
} |
|
|
|
# NFA function definitions |
|
|
|
sub nfa_new { |
|
my ( $states, $trans, $alpha ) = @_; |
|
return [ $states, $trans, $alpha ]; |
|
} |
|
|
|
sub nfa_get_states { $_[0][0]; } |
|
sub nfa_get_trans { $_[0][1]; } |
|
sub nfa_get_alpha { $_[0][2]; } |
|
|
|
sub nfa_append_symbol ($$) { |
|
my ( $nfa, $nsym ) = @_; |
|
my @nfa_alpha = @{ nfa_get_alpha($nfa) }; |
|
push @nfa_alpha, $nsym; |
|
return [ \@nfa_alpha, nfa_get_states($nfa), nfa_get_states($nfa) ]; |
|
} |
|
|
|
sub nfa_append_state ($$) { |
|
my ( $nfa, $nstate ) = @_; |
|
my @nfa_states = @{ nfa_get_states($nfa) }; |
|
push @nfa_states, $nstate; |
|
return [ nfa_get_alpha($nfa), \@nfa_states, nfa_get_trans($nfa) ]; |
|
} |
|
|
|
sub nfa_append_trans ($$) { |
|
my ( $nfa, $ntrans ) = @_; |
|
my @nfa_trans = @{ nfa_get_trans($nfa) }; |
|
push @nfa_trans, $ntrans; |
|
return [ nfa_get_alpha($nfa), nfa_get_states($nfa), \@nfa_trans ]; |
|
} |
|
|
|
sub nfa_get_start_state ($) { |
|
my $nfa = shift; |
|
my $start_states = |
|
state_list_filter { state_is_start $_ } nfa_get_states($nfa); |
|
return $start_states->[0]; |
|
} |
|
|
|
sub nfa_get_accept_states ($) { |
|
my $nfa = shift; |
|
return state_list_filter { state_is_accept $_ } nfa_get_states($nfa); |
|
} |
|
|
|
sub nfa_epsilon_closure ($$) { |
|
my ( $states, $trans ) = @_; |
|
my $closure = set_from_list $states; |
|
my $queue = queue_from_list $states; |
|
my $visited = set_new; |
|
|
|
until ( queue_is_empty($queue) ) { |
|
my $state = queue_dequeue $queue; |
|
set_add $closure, $state; |
|
foreach my $state_prime ( state_collect_eps_trans( $state, $trans ) ) { |
|
unless ( set_contains( $visited, $state_prime ) ) { |
|
set_add $visited, $state_prime; |
|
queue_enqueue $queue, $state_prime; |
|
} |
|
} |
|
} |
|
return $closure; |
|
} |
|
|
|
sub nfa_to_dfa ($) { |
|
my $nfa = shift; |
|
my $nfa_alpha = nfa_get_alpha $nfa; |
|
my $nfa_states = nfa_get_states $nfa; |
|
my $nfa_trans = nfa_get_trans $nfa; |
|
my $nfa_start_state = nfa_get_start_state $nfa; |
|
my $nfa_accept_states = set_from_list( nfa_get_accept_states $nfa ); |
|
|
|
my $state_is_accpeting = sub { set_contains $nfa_accept_states, $_; }; |
|
my $state_map_moves = sub { |
|
my $moves = shift; |
|
my $map = {}; |
|
for my $i ( 0 .. $#{$moves} ) { |
|
my ( $to_id, $symbol ) = @{ $moves->[$i] }; |
|
push @{ $map->{$symbol} }, state_get_from_id( $nfa_states, $to_id ); |
|
} |
|
return $map; |
|
}; |
|
|
|
my $dfa_states = set_new; |
|
my $dfa_trans = set_new; |
|
my $dfa_start = undef; |
|
my $dfa_accept = set_new; |
|
my $dfa_alpha = $nfa_alpha; |
|
|
|
my $visited = set_new; |
|
my $queue = queue_new; |
|
|
|
my $start_set = nfa_epsilon_closure [$nfa_start_state], $nfa_trans; |
|
$dfa_start = set_from_list $start_set; |
|
|
|
queue_enqueue $queue, $dfa_start; |
|
set_add $visited, $dfa_start; |
|
|
|
until ( queue_is_empty($queue) ) { |
|
my $curr_state = queue_dequeue $queue; |
|
my $curr_state_moves = state_collect_moves $curr_state, $nfa_trans; |
|
my $curr_state_moves_map = &map_states_moves->($curr_state_moves); |
|
set_add $dfa_states, $curr_state; |
|
set_add $dfa_accept, $curr_state if &state_is_accepting->($curr_state); |
|
|
|
foreach my $sym (@$nfa_alpha) { |
|
my $move_set = $curr_state_moves_map->{$sym}; |
|
my $next_state = nfa_epsilon_closure( $move_set, $nfa_trans ); |
|
|
|
foreach my $i ( 0 .. $#{$next_state} ) { |
|
my $ntrans = trans_new_symbolic( state_get_id($curr_state), |
|
state_get_id( $next_state->[$i] ), $sym ); |
|
set_add $dfa_trans, $ntrans; |
|
} |
|
|
|
if ( !set_contains( $visited, $next_state ) ) { |
|
set_add $visited, $next_state; |
|
queue_enqueue $queue, $next_state; |
|
} |
|
|
|
} |
|
} |
|
|
|
return dfa_new( $dfa_states, $dfa_alpha, $dfa_trans, $dfa_start, |
|
$dfa_accept ); |
|
|
|
} |
|
|
|
# DFA function implementations |
|
|
|
sub dfa_new ($$$$$) { |
|
my ( $states, $alpha, $trans, $start, $accept ) = @_; |
|
return [ $states, $alpha, $trans, $start, $accept ]; |
|
} |
|
|
|
sub dfa_complete ($) { |
|
my ( $states, $alpha, $trans, $start, $accept ) = @{ $_[0][0] }; |
|
my $states_new = dclone($states); |
|
|
|
my $trans_existing = set_new; |
|
foreach my $t (@$trans) { |
|
if ( trans_is_symbolic($t) ) { |
|
set_add( $trans_existing, |
|
[ trans_get_from($t), trans_get_symbol($t) ] ); |
|
} |
|
} |
|
|
|
my $trans_new = set_new; |
|
my $dead_state_added = 0; |
|
foreach my $s (@$states) { |
|
foreach my $sym (@$alpha) { |
|
unless ( set_contains( $trans_new, [ state_get_id($s), $sym ] ) ) { |
|
set_add( $trans_new, trans_new_dead( state_get_id($s), $sym ) ); |
|
$dead_state_added = 1; |
|
} |
|
} |
|
} |
|
|
|
if ($dead_state_added) { |
|
push @$states_new, [ $STATE_Interm, -1 ]; |
|
foreach my $sym (@$alpha) { |
|
set_add( $trans_new, trans_new_dead( -1, $sym ) ); |
|
} |
|
} |
|
|
|
return dfa_new( $states_new, $alpha, set_elments($trans_new), $start, |
|
$accept ); |
|
} |
|
|
|
sub dfa_compute_invtrans ($) { |
|
my $new_trans = @_; |
|
|
|
my $invtrans = {}; |
|
foreach my $t (@$new_trans) { |
|
if ( trans_is_symbolic($t) ) { |
|
my $key = [ trans_get_to($t), trans_get_sym($t) ]; |
|
unless ( exists $invtrans->{$key} ) { |
|
$invtrans->{$key} = set_new; |
|
} |
|
set_add $invtrans->{$key}, trans_get_from($t); |
|
} |
|
} |
|
|
|
return $invtrans; |
|
} |
|
|
|
sub dfa_minimize ($) { |
|
my ( $new_states, $alpha, $new_trans, $start, $accept ) = |
|
@{ dfa_complete $_[0][0] }; |
|
my $start_state_id = state_get_id $start; |
|
my $invtrans = dfa_compute_invtrans $new_trans; |
|
|
|
my $states_accepting = set_from_list($accept); |
|
my $states_nonaccepting = |
|
set_difference( set_from_lists($new_states), $states_accepting ); |
|
|
|
my $states_partition = |
|
set_from_list [ $states_accepting, $states_nonaccepting ]; |
|
|
|
my $worklist = queue_from_list [ $states_accepting, $states_nonaccepting ]; |
|
|
|
my $block_of = {}; |
|
set_iter { $block_of->{$_} = set_new } $states_accepting; |
|
set_iter { $block_of->{$_} = set_new } $states_nonaccepting; |
|
|
|
until ( queue_is_empty($worklist) ) { |
|
my $curr_states = queue_dequeue $worklist; |
|
foreach my $sym (@$alpha) { |
|
my $trans_states = set_new; |
|
foreach my $cs (@$curr_states) { |
|
my $key = [ $cs, $sym ]; |
|
set_add( $trans_states, $invtrans->{$key} ) |
|
if ( exists $invtrans->{$key} ); |
|
|
|
my $nblocks = {}; |
|
foreach my $ts (@$trans_states) { |
|
my $y = $block_of->{$ts}; |
|
$nblocks->{$y} = set_new |
|
unless ( exists $nblocks->{$y} ); |
|
set_add $nblocks->{$y}, $ts; |
|
} |
|
|
|
foreach my ( $y, $y1_set ) (%$nblocks) { |
|
my $y1 = dclone($y1_set); |
|
my $y2 = set_difference( $y, $y1 ); |
|
|
|
continue if set_empty($y2); |
|
|
|
set_remove( $states_partition, $y ); |
|
set_add( $states_partition, $y1 ); |
|
set_add( $states_partition, $y2 ); |
|
|
|
set_iter { $block_of->{$_} = $y1 } $y1; |
|
set_iter { $block_of->{$_} = $y2 } $y2; |
|
|
|
queue_iter { |
|
delete $worklist->[ $_[0] ] |
|
if $worklist->[ $_[0] ] == $_[1] |
|
} |
|
$y; |
|
queue_enqueue $worklist, $y1; |
|
queue_enqueue $worklist, $y2; |
|
} |
|
} |
|
} |
|
} |
|
|
|
my $partitions_flattened = []; |
|
set_iter { |
|
foreach my $s ( @{ set_elements $_ } ) { |
|
push @$partitions_flattened, $_ if ( state_is_accepting $s ); |
|
} |
|
} |
|
$states_partition; |
|
|
|
my $min_states = dclone($states_partition); |
|
my $min_start = $block_of->{$start_state_id}; |
|
my $min_accept = set_from_list $partitions_flattened; |
|
|
|
my $min_trans = {}; |
|
my $block_i = 0; |
|
foreach my $block (@$min_states) { |
|
my $s_rep = set_elements($block)->[ $block_i++ ]; |
|
foreach my $sym (@$alpha) { |
|
my $next_state = undef; |
|
INNER: |
|
foreach my $t (@$new_trans) { |
|
if ( trans_is_symbolic $t |
|
&& trans_get_from $t == $s_rep |
|
&& trans_get_sym $t == $sym ) |
|
{ |
|
$next_state = trans_get_to $t; |
|
last INNER; |
|
} |
|
my $key = [ $block, $sym ]; |
|
$min_trans->{$key} = $block_of->{$next_state}; |
|
} |
|
} |
|
} |
|
|
|
return [ $min_states, $min_trans, $min_start, $min_accept ]; |
|
} |