package Regex::List;
use strict;
use warnings;
no warnings 'uninitialized';
our $VERSION = do { my @r = (q$Revision: 1.06 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

our $DEBUG     = 0;

our $FILLER = "\x{fffd}"; # fallback

our $RE_PAREN = $Regex::Optimizer::RE_PAREN;
our $RE_START =
    qr{(?:
    (?!\\)\((?:\?
    (?:
     ([imsx\-\^\|]*:)  | # options
     \<?[\=\!]     | # look(behind|ahead)
     \#[^\)]+      | # comments
     #$RE_PAREN    | # ( condtion )
     #\??$RE_EXPR  | # { expression }
     \>              # independent subexpression
    ))?
    )}xo;

our $RE_XCHAR =
    qr{
       (?:\\
	(?:
	[^0xclupPNLUQEXC]     | # ordinary escaped character
	 0[0-9][0-9]          | # octal
	 x(?:[0-9A-Fa-f]{1,2} | # hex
	   \{[0-9A-Fa-f]+\})  | # unicode hex
	 c.                   | # control char
	 [pP]\{\w+\}          | # unicode properties
	 N\{[\w\ ]+\}         | # unicode name
	 )
	)}xo;

our $RE_PCHAR = 
    qr{
       (?:\\
        (?:
         [XC]  # unicode name
	)
       )}xo;

our $RE_CHAR = 
    qr{(?:
        (?!\\)[^\\]            | # raw character (except \)
        $RE_XCHAR              | # extended character
       )
      }xo;

our %PARAM = 
    (
     _i         => 0,
     _m         => 0,
     _s         => 0,
     _x         => 0,
     _char      => $RE_CHAR,
     _token     => $RE_CHAR,
     _cclass    => $RE_CHAR,
     debug      => $DEBUG,
     capture    => 0,
     lookahead  => 1,
     modifiers  => '',
     optim_cc   => 1,
     optim_cq   => 1,
     optim_sx   => 1,
     po         => '(?:',
     pc         => ')',
     quotemeta  => 1, 
     sort       => 0,
     );

# aliases

sub new{
    my $class = ref $_[0] ? ref shift : shift;
    my $self = bless { %PARAM } => $class;
    $self->set(@_);
}

sub clone{
    my $self = shift;
    my $clone = bless { %$self } => ref $self;
    $clone->set(@_);
}

sub set{
    my $self = shift;
    my %param = @_;
    for (sort keys %param){
	$self->{$_} = $param{$_};
	if ($_ eq 'capture'){
	    $self->{po} = $self->{capture} ? '(' : '(?:';
	    $self->{pc} = ')';
	}
	if ($_ eq 'modifiers'){
	    map { $self->{'_' . $_} = 0 } qw/i m s x/;
	    map { $self->{'_' . $_} = 1 } split //, $self->{$_};
	}
    }
    $self;
}

sub tokens{
    my $self  = shift;
    my $str   = shift;
    grep {$_ ne '' } split /($self->{_token})/, $str;
}

sub regopt{
    my  $re = shift;
    #ref $re eq 'Regexp' or return;
    $re =~ /^($RE_START)/ or return; # die "malformed regexp : $re";
    my $opt = $1;
    $opt =~ s/\(\?//o; $opt =~ s/[-:].*//o;
    $opt;
}

sub expand{
    my $self  = shift;
    my $re   = shift;
    my $isre  = ref $re eq 'Regexp';
    #$isre or $re = qr/$re/;
    my $mod = regopt($re);
    $mod =~ /x/ or $mod .= 'x';
    my ($indent, @indent);
    $re =~ 
	s{
	  ( $RE_START | (?!\\)[\)|])
	 }{
	     my $paren = $1;
	     my $sub = $paren;
	     if  ($paren eq ')'){ # close
		 $indent -= pop @indent;
	     }elsif($paren eq '|'){ # |
		 $sub = " | \n";
		 $sub .= " " x $indent;
	     }else{
		 $sub  = $indent ? "\n" : '';
		 $sub .= " " x $indent . $paren;
		 $indent += length($paren);
		 push @indent, length($paren);
	     }
	     $sub;
	 }geox;
    $isre ? qr/(?$mod:$re)/ : qq/(?$mod:$re)/;
}

sub unexpand{
    my $self = shift;
    my $re   = shift;
    my $isre  = ref $re eq 'Regexp';
    my $mod = regopt($re); 
    $mod =~ s/x//o;
    $re =~ s/\((?!\\)\?\#[^\)]+\)//o; # strip (?#comment)
    $re =~ s/(?!\\)#.*$//mg;          # strip comment
    $re =~ s/(?!\\)[ \t]//g;          # strip space
    # $re =~ s/([^\x00-\xff])/sprintf('\x{%04x}', ord($1))/eg;
    # and finally strip CRLF
    $re =~ s/[\n\r]//g;
    $isre ? 
	$mod ? qr/(?$mod:$re)/ : qr/$re/ :
	$mod ? qq/(?$mod:$re)/ : $re;
}

sub list2re { 
#    use utf8; # for substr
    no warnings 'redefine'; # for cheats
    my $self = shift;
    # trie construction allows no duplicates 
    # so we make sure they are all unique
    my (%list, @list);
    # Unique with order preserved
    if ($self->{_i}){ for (@_){ $_=lc($_); $list{$_}++ or push @list, $_ } }
    else            { for (@_){            $list{$_}++ or push @list, $_ } }
    undef %list; # to save memory
    #$self->{sort} and @list = sort {length($b) <=> length($a) } @list;
    $self->{sort} and @list = sort @list;
    my $result;
    if ($self->{quotemeta}){
	# cheat
	*_head = sub{ substr($_[1], 0, $_[2]*2) };
	*_tail = sub{ substr($_[1], $_[2]*2) };
	$result = _trie_regex($self, map { _metaquote($_) } @list);
	$result =~ tr/\x00//d; 
	#$result =~ tr/\x{FFFd}//d;
    }else{
	*_head = \&_head_re;
	*_tail = \&_tail_re;
	$self->{_x} and @list = map { s/\\? /\\ /g; $_  } @list;
	$result = _trie_regex($self, @list); 
    }
    my $lookahead;
    if ($self->{lookahead}){
	my %lookahead;
	$lookahead{$self->_first($_)}++ for @list;
        my @lookahead = 
	    $self->{quotemeta} 
		?  map { tr/\x00//d; qq/\Q$_/ } keys %lookahead
		    #map { tr/\x{FFFd}//d; qq/\Q$_/ } keys %lookahead
		    : keys %lookahead;
	@lookahead = sort sort grep {!/^\\[luLUEQXC]/} @lookahead;
	if (@lookahead > 1){
	    my $lookahead = join('' => @lookahead);
	    $result = qq/(?=[$lookahead])$result/;
	}
	undef @lookahead;
    }
    my $mod = $self->{modifiers};
    $mod =~ 'x' and return $self->expand($result);
    $result = $self->{as_string} ? 
	$mod ? qq/(?$mod:$result)/ : qq/$result/ :
	    $mod ? qr/(?$mod:$result)/ : qr/$result/;
}

sub _metaquote{
    my $str =
	join '' => 
	    map { my $q=qq/\Q$_/; length($q) == 2 ? $q : "\x00$q" }
	    #map { my $q=qq/\Q$_/; length($q) == 2 ? $q : "\x{FFFd}$q" }
		split // => shift;
    $str;
}

sub _first{
    my $self = shift;
    my $str = shift;
    my $re = $self->{_char};
    $str =~ /^($re)/o;
    return $1;
}

sub _head_re{
    my $self = shift;
    my ($str, $pos) = @_;
    $str eq '' and return '';
    my $token = $self->{_token};
    for (my $p = $pos, pos($str) = 0; $p > 0 ; $p--){
	$str =~ /\G$token/gc or last;
    }
    substr($str, 0, pos($str));
    
}

sub _tail_re{
#    use utf8;
    my $self = shift;
    my ($str, $pos) = @_;
    $str eq '' and return '';
    my $token = $self->{_token};
    for (my $p = $pos, pos($str) = 0; $p > 0 ; $p--){
	$str =~ /\G$token/gcs or last;
    }
    substr($str,pos($str));
}

#use Data::Dumper;
#$Data::Dumper::Indent = 1;

sub _prefixes {
    my $self = shift;
    my (@head, @prefix, %prefix);
    for (@_) {
	my $c = $self->_head($_, 1);
	exists $prefix{$c} or push @prefix, $c; # to preserve order
	$prefix{$c} ||= [];
	push @{$prefix{$c}}, $_;
    }
    for (@prefix) {
	# Find common substring
	my $prefix = $prefix{$_}->[0];
	if (@{$prefix{$_}} == 1){
	    push @head, [ $prefix ]; next ;
	}
	my $l = length($prefix);
	for (@{$prefix{$_}}) {
	    $l -= 1
		while $self->_head($_, $l) ne $self->_head($prefix, $l);
	}
	# Return value
	$prefix = $self->_head($prefix, $l);
	my @suffix = map {$self->_tail($_, $l)} @{$prefix{$_}};
    if ($prefix or $prefix eq '0') {
	    push @head, [$prefix, @suffix];
	} else        {
	    push @head, [@suffix];
	}
    }
    #print Dumper \@head;
    #sleep 1;
    @head;
}


sub _rev{
    my $self = shift;
    my $str = shift;
    if ($self->{quotemeta}){
	return length $str > 2 ?
	    join '' => reverse split /(..)/, $str : $str;
    }else{
	my $re = $self->{_token};
	#return $str =~ /^$re?$/o ?
	#   $str : join '' => reverse split /($re)/, $str;
	$str =~ /^$self->{_token}$/ and return $str;
	my @token;
	$str =~ s{ ($re) }{ push @token, $1 }egx;
	return join '' => reverse @token;
    }
}
sub _trie_regex {
    my $self = shift;
    @_ or return;
    @_ == 1 and return shift;

    $self->{debug} and $self->{_indent}++;
    $self->{debug} and
	print STDERR '>'x $self->{_indent}, " ", join(',' => @_), "\n";

    my (@leaf, @result);

    #
    # Suffixing Optimization
    # - only leaf nodes in the same branch can be suffix-bundled
    #
    if ($self->{optim_sx}){
	for ($self->_prefixes(@_)){
	    my ($prefix, @suffix) = @$_;
	    if (@suffix){
		push @result, $prefix.$self->_trie_regex(@suffix);
	    }else{
		push @leaf, $prefix;
	    }
	}
	for ($self->_prefixes(map { $self->_rev($_) } @leaf)){
	    my ($suffix, @prefix) = @$_;
	    $suffix = $self->_rev($suffix);
	    if (@prefix){
		push @result, 
		    $self->_trie_regex(map { $self->_rev($_) } @prefix)
			. $suffix;
	    }else{
		push @result, $suffix;
	    }
	}
    }else{
	for ($self->_prefixes(@_)){
	    my ($prefix, @suffix) = @$_;
	    push @result, @suffix ? $prefix.$self->_trie_regex(@suffix) : $prefix;
	}
    }

    my $result;

  RESULT:
    {
	@result == 1 and $result = $result[0] and last RESULT;
	my $q = '';
	# alteration check
	# we do linear seach here to preserve order.
	for (my $i = 0; $i < @result; $i++){
	    if ($result[$i] eq ''){
		 splice @result, $i, 1;
		 $q = '?';
		 last;
	    }
	}
	# if ($result[0] eq '') { $q = '?'; shift @result }
	# extract character class
	if ($self->{optim_cc}){
	    my @char; my  $charpos = -1;
	    for (my $i = 0; $i < @result; $i++){
		if ($self->{quotemeta}){
		    if (length($result[$i]) == 2){ 
			$charpos < 0 and $charpos = $i;
			push @char => splice(@result, $i, 1, "");
		    }
		}else{
		    if ($result[$i] =~ /^$self->{_cclass}$/){
			$charpos < 0 and $charpos = $i;
			push @char => splice(@result, $i, 1, "");
		    }
		}
	    }
	    if (@char){
		my $char = $self->_optim_cc(@char);
		splice @result, $charpos, 0, $char;
		@result = grep {$_} @result;
		if (@result == 1){
		    $result = "$result[0]$q" and last RESULT;
		}
	    }
	}
	my $joiner = '|' ;
	if ($self->{optim_cq} and @result == 1 and
	    ($self->{quotemeta} 
	     ? length($result[0]) ==
	     1 : $result[0] =~ /^$self->{_token}$/))
	{
	    $result = qq/$result[0]$q/;
	}else{
	    $result = 
		$self->{po} . join($joiner => @result) . $self->{pc} .  $q;
	}
    }
    $self->{debug} and 
	print STDERR '<'x $self->{_indent}, " ", $result, "\n";
    $self->{_x} || $self->{debug} and $self->{_indent}--;

    $result;
}

sub _optim_cc{
    my $self = shift;
    @_ or return undef;
    if ($self->{quotemeta}){
	return  @_ ? @_ > 1  ? "[".join("",@_)."]" : $_[0] : undef;
    }
    # check '.'
    for (@_){
	$_ eq '.' and return '.';
    }
    my @char = @_;
    my ($positive, $negative) = ('','');
    my ($npos, $nneg) = (0, 0);
    for (@char){
	if    (s/^\[\^(.*)\]$/$1/){
	    $negative .= $_; $nneg += 2; next;
	}
	if (s/^\[(.*)\]$/$1/){
	    $positive .= $_; $npos += 2; next;
	}else{
	    #$positive .= length($_) eq 1 ? qq/\Q$_/ : $_;
	    $positive .= $_ eq '-' ? '\-' : $_;
	    $npos++;
	}
    }
    $nneg > 1 and $negative = qq/[^$negative]/;
    $npos > 1 and $positive = qq/[$positive]/;
    return $negative 
	?  $positive ? "(?:$positive|$negative)" : $negative
	    :  $positive;
}

1;

