/* ,file-id archive://[lord]/419/rx/comp.c/1998-05-18
 */

/*	Copyright (C) 1997 Tom Lord
 * 
 * This program is provided to you under the terms of the Liberty Software
 * License.  You are NOT permitted to redistribute, modify, or use it
 * except in very specific ways described by that license.
 *
 * This software comes with NO WARRANTY.
 * 
 * You should have received a copy of the Liberty Software License
 * along with this software; see the file =LICENSE.  If not, write to
 * the Tom Lord, 1810 Francisco St. #2, Berkeley CA, 94703, USA.  
 */





#include <sys/types.h>
#include <ctype.h>
#include <setjmp.h>
#include "vu/bitset.h"
#include "vu/dstr.h"
#include "errnorx.h"
#include "rexp.h"
#include "comp.h"


/* Parser Error Messages
 */

/* Human-readable text for each parser error code, in the order given by
 * the code names in the left-hand comments.  The first entry is a null
 * pointer rather than a message.
 */
char * rx_error_msg[] =
{
  /* REG_NOUT     */ 0,
  /* REG_NOMATCH  */ "No match",
  /* REG_BADPAT   */ "Invalid regular expression",
  /* REG_ECOLLATE */ "Invalid collation character",
  /* REG_ECTYPE   */ "Invalid character class name",
  /* REG_EESCAPE  */ "Trailing backslash",
  /* REG_ESUBREG  */ "Invalid back reference",
  /* REG_EBRACK   */ "Unmatched [ or [^",
  /* REG_EPAREN   */ "Unmatched ( or \\(",
  /* REG_EBRACE   */ "Unmatched \\{",
  /* REG_BADBR    */ "Invalid content of \\{\\}",
  /* REG_ERANGE   */ "Invalid range end",
  /* REG_ESPACE   */ "Memory exhausted",
  /* REG_BADRPT   */ "Invalid preceding regular expression",
  /* REG_EEND     */ "Premature end of regular expression",
  /* REG_ESIZE    */ "Regular expression too big",
  /* REG_ERPAREN  */ "Unmatched ) or \\)"
};


/* Predefined Translation Tables
 */

/* The identity translation: every byte value maps to itself.
 * Laid out 16 entries per row.
 */
unsigned char rx_id_translation_table[256] =
{
    0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
   16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
   32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
   48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
   64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
   80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
   96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
  128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
  144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
  160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
  176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
  208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
  224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
  240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
};

/* A case-folding translation: the byte values 65..90 ('A'..'Z' in ASCII)
 * map to 97..122 ('a'..'z'); every other byte value maps to itself.
 * Laid out 16 entries per row.
 */
unsigned char rx_case_fold_translation_table[256] =
{
    0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
   16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
   32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
   48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
   64,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,  91,  92,  93,  94,  95,
   96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
  128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
  144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
  160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
  176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
  208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
  224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
  240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
};


/* Functions Used Internally by the Parser 
 */

/* All of the state shared by the recursive-descent parser functions
 * during one call to rx_parse.
 */
struct rx_parse_state
{
  /* Root of the tree being built; freed by rx_parse if parsing fails. */
  struct rx_exp_node * root;

  /* Current read position within the pattern. */
  unsigned char const * pos;

  /* Start of the pattern and its length in bytes. */
  unsigned char const * pattern;
  int size;
  /* Nonzero for extended syntax: operators other than '*' are written
   * without a leading backslash (see rx_expand_op). */
  int extended_p;
  /* Nonzero to remove '\n' from "." and from inverted bracket sets. */
  int no_newline;
  /* Number of members in each character set built by the parser. */
  int cset_size;

  /* Table through which every pattern byte is read. */
  unsigned char * translate;
  /* Cache used by rx_inverse_translation: one bitset per character,
   * a flag per character saying whether its bitset has been computed,
   * and the member count of each computed bitset. */
  subset inv_tr [256 * bitset_numb_subsets(256)];
  char valid_inv_tr [256];
  int n_members [256];

  /* Nonzero at the start of the pattern, just after an opening
   * parenthesis, '|' or '^'; consulted when deciding whether '^' is an
   * anchor and whether a leading iterator is an ordinary character. */
  int at_beg;
  /* backrefs_valid[N] is set once parenthesized subexpression N (< 10)
   * has been closed. */
  char backrefs_valid[10];
  /* Number that the next parenthesized subexpression will receive. */
  int exp_number;

  /* Nonzero to parse without anchors, back references, intervals or
   * numbered subexpressions. */
  int dfa_only;
  /* Value that the next "[[:cut %:]]" will receive. */
  int cut_count;

  /* Error code stored just before longjmp-ing to err_escape. */
  int err;
  jmp_buf err_escape;
};

/* Forward declarations for the mutually recursive parser functions.
 * Each one stores the subtree it parsed in *WHERE.
 */
static void rx_parse_alt (struct rx_exp_node ** where, struct rx_parse_state * state);
static void rx_parse_concat (struct rx_exp_node ** where, struct rx_parse_state * state);
static void rx_parse_repeated (struct rx_exp_node ** where, struct rx_parse_state * state);
static void rx_parse_item (struct rx_exp_node ** where, struct rx_parse_state * state);

/* Return nonzero when the read position is exactly at the end of the
 * pattern.
 */
static int
rx_eop (struct rx_parse_state * state)
{
  unsigned char const * pattern_end;

  pattern_end = state->pattern + state->size;
  return (pattern_end == state->pos);
}

/* Return 1 if the next LEN pattern bytes, read through the translation
 * table, equal TOKEN; return 0 otherwise (including when fewer than LEN
 * bytes remain).  The read position is not moved.
 */
static int
rx_scan_ahead (struct rx_parse_state * state, unsigned char * token, int len)
{
  unsigned char const * cursor;
  unsigned char const * stop;

  if ((state->pattern + state->size) < (state->pos + len))
    return 0;

  cursor = state->pos;
  stop = cursor + len;
  while (cursor < stop)
    {
      if (state->translate[*cursor] != *token)
	return 0;
      ++cursor;
      ++token;
    }
  return 1;
}

/* Write the spelling of operator OP for the current syntax into BUF and
 * return its length: OP alone in extended syntax and for '*', otherwise
 * a backslash followed by OP.  BUF must have room for two bytes.
 */
static int
rx_expand_op (unsigned char * buf, struct rx_parse_state * state, int op)
{
  int used;

  used = 0;
  if ((op != '*') && !state->extended_p)
    buf[used++] = '\\';
  buf[used++] = op;
  return used;
}

/* Return nonzero if operator OP, spelled for the current syntax, is next
 * in the pattern.  The read position is not moved.
 */
static int
rx_scan_op_ahead (struct rx_parse_state * state, int op)
{
  unsigned char spelled[2];
  int spelled_len;

  spelled_len = rx_expand_op (spelled, state, op);
  return rx_scan_ahead (state, spelled, spelled_len);
}

/* If TOKEN (LEN bytes) is next in the pattern, step the read position
 * past it and return 1.  Otherwise return 0 and leave the position alone.
 */
static int
rx_scan (struct rx_parse_state * state, unsigned char * token, int len)
{
  if (!rx_scan_ahead (state, token, len))
    return 0;

  state->pos += len;
  return 1;
}

/* Like rx_scan, but for operator OP spelled for the current syntax.
 */
static int
rx_scan_op (struct rx_parse_state * state, int op)
{
  unsigned char spelled[2];
  int spelled_len;

  spelled_len = rx_expand_op (spelled, state, op);
  return rx_scan (state, spelled, spelled_len);
}


/* Split the last character off the r_string node at **LASTP so that a
 * postfix operator can be applied to that character alone.
 *
 * A one-character string is replaced by a character-set node holding
 * that character.  A longer string is shortened by one and becomes the
 * left child of a new r_concat node whose right child is the
 * character-set node; *LASTP is then advanced to that right child.
 *
 * Returns 0 on success, or -1 if the r_concat node could not be
 * allocated.  In that case the string has already been shortened.
 */
static int
rx_factor_string (struct rx_exp_node *** lastp, int cset_size)
{
  struct rx_exp_node ** expp;
  struct rx_exp_node * exp;
  bitset cs;
  struct rx_exp_node * cset_node;

  expp = *lastp;
  exp = *expp;

  cs = (bitset) alloca (sizeof_bitset (cset_size));
  bitset_clear (cset_size, cs);
  /* Use the character's unsigned value as the bit index: if the string
   * buffer has plain (possibly signed) char elements, a byte >= 0x80
   * would otherwise produce a negative index.
   */
  bitset_adjoin (cs, (unsigned char) exp->cstr.chr[exp->cstr.len - 1]);
  /* NOTE(review): the result of rx_mk_r_cset is used unchecked here,
   * while rx_mk_r_binop is checked below; confirm whether it can
   * return 0.
   */
  cset_node = rx_mk_r_cset (r_cset, cset_size, cs);
  cset_node->observed = 0;
  if (exp->cstr.len == 1)
    {
      rx_free_rexp (exp);
      *expp = cset_node;
      return 0;
    }
  else
    {
      struct rx_exp_node * concat_node;
      exp->cstr.len--;
      concat_node = rx_mk_r_binop (r_concat, exp, cset_node);
      if (!concat_node)
	{
	  rx_free_rexp (cset_node);
	  return -1;
	}
      concat_node->observed = 0;
      *expp = concat_node;
      *lastp = &concat_node->right;
      return 0;
    }
}

/* The compiler keeps an inverted translation table.
 * This looks up (and, on first use, initializes) elements of that table.
 * VALID is an array of booleans that validate CACHE.
 */
static bitset
rx_inverse_translation (int * n_members,
			int cset_size,
			char * valid,
			bitset cache,
			unsigned char * translate,
			int c)
{
  bitset slot;
  int target;
  int count;
  int ch;

  /* Each character owns a fixed-size region of CACHE. */
  slot = cache + c * bitset_numb_subsets (cset_size);

  if (valid[c])
    return slot;

  /* First request for C: collect every byte value that translates to
   * the same thing as C, and remember how many there are.
   */
  target = translate[(unsigned char)c];
  bitset_clear (cset_size, slot);
  count = 0;
  for (ch = 0; ch < 256; ++ch)
    {
      if (translate[ch] != target)
	continue;
      bitset_adjoin (slot, ch);
      ++count;
    }
  valid[c] = 1;
  n_members[c] = count;
  return slot;
}


/* A Recursive Descent Regexp Parser 
 */

/* Parse a concatenation optionally followed by '|' and another
 * alternation.  The resulting tree is stored in *WHERE.  An alternation
 * node is marked observed if either non-empty branch is observed.
 */
static void
rx_parse_alt (struct rx_exp_node ** where, struct rx_parse_state * state)
{
  struct rx_exp_node * node;
  int saw_observed;

  rx_parse_concat (where, state);

  if (!rx_scan_op (state, '|'))
    return;

  node = rx_mk_r_binop (r_alternate, *where, 0);
  /* The right-hand branch starts a fresh expression. */
  state->at_beg = 1;
  rx_parse_alt (&node->right, state);

  saw_observed = 0;
  if (node->left && node->left->observed)
    saw_observed = 1;
  else if (node->right && node->right->observed)
    saw_observed = 1;
  node->observed = saw_observed;

  *where = node;
}

static void
rx_parse_concat (struct rx_exp_node ** where, struct rx_parse_state * state)
{
  rx_parse_repeated (where, state);

  if (   *where
      && !rx_eop (state)
      && !rx_scan_op_ahead (state, '|')
      && !rx_scan_op_ahead (state, ')')
      && !rx_scan_ahead (state, "[[:):]]", 7))
    {
      struct rx_exp_node * concat;
      concat = rx_mk_r_binop (r_concat, *where, 0);
      rx_parse_concat (&concat->right, state);
      if (!concat->right)
	{
	  concat->left = 0;
	  rx_free_rexp (concat);
	}
      else
	{
	  concat->observed = ((   concat->left
			       && concat->left->observed)
			      || (   concat->right
				  && concat->right->observed));
	  *where = concat;
	}
    }
}

static void
rx_parse_repeated (struct rx_exp_node ** where, struct rx_parse_state * state)
{
  char const * saved_pos;
  enum { plus, opt, star, interval } op_type;
  int iv;
  int iv2;

  rx_parse_item (where, state);

  saved_pos = state->pos;

  if (rx_scan_op (state, '+'))
    op_type = plus;
  else if (rx_scan_op (state, '?'))
    op_type = opt;
  else if (rx_scan (state, "*", 1))
    op_type = star;
  else if (!state->dfa_only && rx_scan_op (state, '{'))
    {
      int lo;
      int hi;
      unsigned char const * bound;

      bound = state->pos + state->size;
      if (state->pos == bound)
	{
	not_an_interval:
	  state->pos = saved_pos;
	  return;
	}
      lo = 0;
      while (   (state->pos < bound)
	     && (isdigit (state->translate[*state->pos])))
	lo = lo * 10 + (state->translate[*(state->pos++)] - '0');
      if (state->pos == bound)
	goto not_an_interval;
      if (rx_scan (state, ",", 1))
	{
	  hi = 0;
 	  while (   (state->pos < bound)
		 && (state->translate[isdigit (*state->pos)]))
	    hi = hi * 10 + (state->translate[*(state->pos++)] - '0');
	  if (!(rx_scan_op (state, '}')))
	    goto not_an_interval;
	  goto know_range;
	}
      else if (rx_scan_op (state, '}'))
	{
	  hi = lo;
	  goto know_range;
	}
      else
	goto not_an_interval;
      
    know_range:
      if ((hi < lo) || (hi == 0))
	{
	  state->err = REG_BADBR;
	  longjmp (state->err_escape, 1);
	}
      op_type = interval;
      iv = lo;
      iv2 = hi;
    }
  else
    return;

  while (*where)
    {
      if (((*where)->type == r_concat))
	where = &(*where)->right;
      else if ((*where)->type == r_string)
	rx_factor_string (&where, state->cset_size);
      else
	break;
    }

  {
    struct rx_exp_node * iter;

    switch (op_type)
      {
      case opt:
	{
	  struct rx_exp_node * alt;
	  alt = rx_mk_r_binop (r_alternate, *where, 0);
	  *where = alt;
	  break;
	}

      case plus:
      case star:
	{
	  iter = rx_mk_r_monop (r_star, *where);
	  iter->observed = *where ? (*where)->observed : 0;
	  if (op_type == plus)
	    {
	      struct rx_exp_node * copy;
	      struct rx_exp_node * conc;
	      
	      copy = rx_copy_rexp (256, *where);
	      conc = rx_mk_r_binop (r_concat, copy, iter);
	      iter = conc;
	    }
	  *where = iter;
	  break;
	}

      case interval:
	{
	  iter = rx_mk_r_monop (r_interval, *where);
	  iter->observed = *where ? (*where)->observed : 0;
	  iter->intval = iv;
	  iter->intval2 = iv2;
	  *where = iter;
	  break;
	}
      }
  }
}

/* Identifiers for the named character classes accepted inside bracket
 * expressions as "[:name:]".
 */
enum rx_character_classes
{
  rx_cc_alnum,
  rx_cc_alpha,
  rx_cc_blank,
  rx_cc_cntrl,
  rx_cc_digit,
  rx_cc_graph,
  rx_cc_lower,
  rx_cc_print,
  rx_cc_punct,
  rx_cc_space,
  rx_cc_upper,
  rx_cc_xdigit
};

/* Associates the spelling of a character class name with its identifier.
 */
struct rx_cc_name
{
  char * name;
  enum rx_character_classes class_id;
};

/* Table of the recognized character class names.  The entry with a null
 * name marks the end of the table.
 */
struct rx_cc_name rx_cc_names[] = 
{
  {"alnum", rx_cc_alnum},
  {"alpha", rx_cc_alpha},
  {"blank", rx_cc_blank},
  {"cntrl", rx_cc_cntrl},
  {"digit", rx_cc_digit},
  {"graph", rx_cc_graph},
  {"lower", rx_cc_lower},
  {"print", rx_cc_print},
  {"punct", rx_cc_punct},
  {"space", rx_cc_space},
  {"upper", rx_cc_upper},
  {"xdigit", rx_cc_xdigit},
  {0, 0}
};

/* Parse a single item and store its tree in *WHERE (0 for an empty
 * item).  Errors are reported by storing a code in state->err and
 * longjmp-ing to state->err_escape.
 *
 * An item is one of:
 *   - a parenthesized subexpression; numbered unless dfa_only is set,
 *     or written "[[:(:]]" ... "[[:):]]";
 *   - a cut, "[[:cut N:]]" or "[[:cut %:]]";
 *   - an anchor, '^' or '$' (not in dfa_only mode);
 *   - a character set, "." or a bracket expression;
 *   - a back reference, backslash-digit, to an already closed
 *     subexpression (not in dfa_only mode);
 *   - a literal string of ordinary characters.
 */
static void
rx_parse_item (struct rx_exp_node ** where, struct rx_parse_state * state)
{
  int type;
  int iv;
  int token;
  unsigned char * translate;

  translate = state->translate;

  if (rx_eop (state))
    {
    empty_item:
      *where = 0;
      state->at_beg = 0;
      return;
    }

  /* nested subexpressions */
  if (state->dfa_only && rx_scan_op (state, '('))
    {
      state->at_beg = 1;
      rx_parse_alt (where, state);
      if (!(rx_scan_op (state, ')')))
        {
          state->err = REG_EPAREN;
          longjmp (state->err_escape, 1);
        }
      state->at_beg = 0;
      return;
    }

  if (rx_scan_op (state, '('))
    {
      int exp_number;

      exp_number = state->exp_number;
      ++state->exp_number;
      state->at_beg = 1;
      rx_parse_alt (where, state);
      if (!(rx_scan_op (state, ')')))
        {
          state->err = REG_EPAREN;
          longjmp (state->err_escape, 1);
        }
      state->at_beg = 0;
      if (exp_number < 10)
        state->backrefs_valid[exp_number] = 1;
      {
        struct rx_exp_node * n;
        n = rx_mk_r_monop (r_parens, *where);
        n->intval = exp_number;
        n->observed = 1;
        *where = n;
      }
      return;
    }

  if (rx_scan_op_ahead (state, ')'))
    {
      *where = 0;
      return;
    }

  if (rx_scan (state, "[[:(:]]", 7))
    {
      state->at_beg = 1;
      rx_parse_alt (where, state);
      if (!(rx_scan (state, "[[:):]]", 7)))
        {
          state->err = REG_EPAREN;
          longjmp (state->err_escape, 1);
        }
      state->at_beg = 0;
      return;
    }

  if (rx_scan_ahead (state, "[[:):]]", 7))
    {
      *where = 0;
      return;
    }

  if (rx_scan (state, "[[:cut ", 7))
    {
      unsigned const char * bound;
      int val;
      int sign;

      sign = 1;
      /* End of the pattern, measured from its start.  Measuring from
       * the current position would let the loops below run past the
       * end of the buffer.
       */
      bound = state->pattern + state->size;
      val = 0;

      while (   (state->pos < bound)
             && isspace (translate[*state->pos]))
        ++state->pos;

      if (   (state->pos >= bound)
          || !(   (translate[*state->pos] == '%')
               || (translate[*state->pos] == '-')
               || isdigit (translate[*state->pos])))
        {
        bad_cut:
          state->err = REG_BADPAT;
          longjmp (state->err_escape, 1);
        }

      if (translate[*state->pos] == '%')
        {
          /* '%' asks for the next automatically assigned cut value. */
          val = state->cut_count++;
          ++state->pos;
        }
      else
        {
          if (translate[*state->pos] == '-')
            {
              sign = -1;
              ++state->pos;
            }
          while (   (state->pos < bound)
                 && isdigit (translate [*state->pos]))
            val = val * 10 + (translate [*(state->pos++)] - '0');
          val = val * sign;
        }

      while (   (state->pos < bound)
             && isspace (translate [*state->pos]))
        ++state->pos;

      if (!rx_scan (state, ":]]", 3))
        goto bad_cut;

      *where = rx_mk_r_int (r_cut, val);
      (*where)->observed = 1;
      return;
    }

  /* anchors */
  if (   !state->dfa_only
      && (state->extended_p || state->at_beg)
      && rx_scan (state, "^", 1))
    {
      iv = '^';
    make_context_node:
      type = r_context;
      {
        struct rx_exp_node * n;
        n = rx_mk_r_int (type, iv);
        *where = n;
        n->observed = 1;
        state->at_beg = (iv == '^');
        return;
      }      
    }
  else if (!state->dfa_only && state->extended_p && !state->at_beg && rx_scan (state, "^", 1))
    {
      state->err = REG_BADPAT;
      longjmp (state->err_escape, 1);
    }

  {
    int at_end;

    /* In basic syntax, '$' is an anchor only before "\)", before "\|",
     * or as the very last character of the pattern.
     */
    at_end = (   !state->dfa_only
              && !state->extended_p
              && (   rx_scan_ahead (state, "$\\)", 3)
                  || rx_scan_ahead (state, "$\\|", 3)
                  || (   rx_scan_ahead (state, "$", 1)
                      && (state->pos + 1 == (state->pattern + state->size)))));

    if (!state->dfa_only && (state->extended_p || at_end) && rx_scan (state, "$", 1))
      {
        type = r_context;
        iv = '$';
        goto make_context_node;
      }
    else if (!state->dfa_only && !state->extended_p && !at_end && rx_scan (state, "$", 1))
      {
        state->err = REG_BADPAT;
        longjmp (state->err_escape, 1);
      }
  }

  /* The characters *, ?, +, and { are sometimes valid,
   * sometimes special, and sometimes an error:
   */
  if (token = '*', rx_scan_op_ahead (state, token))
    {
      iv = '*';
      goto got_iterator;
    }
  if (token = '+', rx_scan_op_ahead (state, token))
    {
      iv = '+';
      goto got_iterator;
    }
  if (!state->dfa_only && (token = '{', rx_scan_op_ahead (state, token)))
    {
      iv = '{';
      goto got_iterator;
    }
  if (token = '?', rx_scan_op_ahead (state, token))
    {
      iv = '?';
    got_iterator:
      /* In basic syntax an iterator at the start of an expression is
       * an ordinary character; otherwise the item is empty and the
       * caller consumes the operator.
       */
      if (!state->extended_p && state->at_beg)
        {
          rx_scan_op (state, token);
          goto begin_string;
        }
      else
        goto empty_item;
    }

  /* empty before alt */
  if (rx_scan_op_ahead (state, '|'))
    goto empty_item;

  /* csets */
  if (rx_scan (state, ".", 1))
    {
      bitset cs;
      struct rx_exp_node * n;
      cs = (bitset) alloca (sizeof_bitset (state->cset_size));
      bitset_fill (state->cset_size, cs);
      bitset_remove (cs, 0);
      if (state->no_newline)
        bitset_remove (cs, '\n');
      n = rx_mk_r_cset (r_cset, state->cset_size, cs);
      n->observed = 0;
      *where = n;
      return;
    }


  if (rx_scan (state, "[", 1))
    {
      int invert_it;
      bitset cs;
      struct rx_exp_node * n;

      invert_it = rx_scan (state, "^", 1);
      
      cs = (bitset) alloca (sizeof_bitset (state->cset_size));
      bitset_clear (state->cset_size, cs);

      /* An initial ']' is special. */

      if (rx_scan_ahead (state, "]", 1))
        goto normal_char;

      while (!rx_scan (state, "]", 1))
        {
          if (rx_eop (state))
            {
            short_bracket:
              state->err = REG_EBRACK;
              longjmp (state->err_escape, 1);
            }
          else if (rx_scan (state, "[:", 2))
            {
              int x;
              int cl;

              cl = -1;
              x = 0;
              while (rx_cc_names[x].name)
                {
                  if (rx_scan (state, rx_cc_names[x].name, strlen (rx_cc_names[x].name)))
                    {
                      cl = rx_cc_names[x].class_id;
                      break;
                    }
                  else
                    ++x;
                }

              if (   !rx_cc_names[x].name
                  || !rx_scan (state, ":]", 2))
                {
                  state->err = REG_ECTYPE;
                  longjmp (state->err_escape, 1);
                }

              for (x = 0; x < state->cset_size; ++x)
                {
                  int in;

                  switch (cl)
                    {
                    case rx_cc_alnum:
                      in = isalnum (x);
                      break;

                    case rx_cc_alpha:
                      in = isalpha (x);
                      break;

                    case rx_cc_blank:
                      in = ((x == ' ') || (x == '\t'));
                      break;
                      
                    case rx_cc_cntrl:
                      in = iscntrl (x);
                      break;

                    case rx_cc_digit:
                      in = isdigit (x);
                      break;

                    case rx_cc_graph:
                      in = isgraph (x);
                      break;

                    case rx_cc_lower:
                      in = islower (x);
                      break;

                    case rx_cc_print:
                      in = isprint (x);
                      break;

                    case rx_cc_punct:
                      in = ispunct (x);
                      break;

                    case rx_cc_space:
                      in = isspace (x);
                      break;

                    case rx_cc_upper:
                      in = isupper (x);
                      break;

                    case rx_cc_xdigit:
                      in = isxdigit (x);
                      break;

                    default:
                      in = 0;
                      break;
                    }

                  /* Only members of the class are added.  The inverse
                   * translation of X always contains X itself.
                   */
                  if (in)
                    {
                      bitset it;
                      it = rx_inverse_translation (state->n_members, state->cset_size,
                                                   state->valid_inv_tr, state->inv_tr,
                                                   translate, x);
                      bitset_union (state->cset_size, cs, it);
                    }
                }
            }
          else
            {
              int first;
              int last;

            normal_char:

              first = translate[*state->pos];
              ++state->pos;
              if ((first == '\\') && !rx_eop(state))
                {
                  first = translate[*state->pos];
                  ++state->pos;
                  switch (first)
                    {
                    default:
                      break;
                    case 'n':
                      first = '\n';
                      break;
                    case 'f':
                      first = '\f';
                      break;
                    case 't':
                      first = '\t';
                      break;
                    }
                }
              {
                bitset it;
                it = rx_inverse_translation (state->n_members, state->cset_size,
                                             state->valid_inv_tr, state->inv_tr,
                                             translate, first);
                bitset_union (state->cset_size, cs, it);
              }
              /* A '-' that is not immediately followed by ']' makes a
               * range from FIRST to the next character.
               */
              if (   !rx_scan_ahead (state, "-]", 2)
                  && rx_scan (state, "-", 1))
                {
                  if (rx_eop (state))
                    goto short_bracket;
                  last = translate[*state->pos];
                  ++state->pos;
                  if (first > last)
                    {
                      state->err = REG_ERANGE;
                      longjmp (state->err_escape, 1);
                    }
                  while (first <= last)
                    {
                      {
                        bitset it;
                        it = rx_inverse_translation (state->n_members, state->cset_size,
                                                     state->valid_inv_tr, state->inv_tr,
                                                     translate, first);
                        bitset_union (state->cset_size, cs, it);
                      }
                      ++first;
                    }
                }
            }
        }

      if (invert_it)
        {
          bitset_complement (state->cset_size, cs);
          if (state->no_newline)
            bitset_remove (cs, '\n');
        }

      n = rx_mk_r_cset (r_cset, state->cset_size, cs);
      *where = n;
      n->observed = 0;
      
      return;
    }

  if (rx_scan (state, "\\", 1))
    {
      if (rx_eop (state))
        {
          state->err = REG_EEND;
          longjmp (state->err_escape, 1);
        }

      if (state->dfa_only)
        goto escaped_char_default;
      
      switch (translate[*state->pos])
        {
        default:
        escaped_char_default:
          iv = translate[*state->pos];
          ++state->pos;
          goto begin_string;

        case '0': case '1':  case '2':  case '3':  case '4':
        case '5': case '6':  case '7':  case '8':  case '9':
          iv = translate[*state->pos];
          ++state->pos;
          /* A digit naming a subexpression that has not been closed
           * yet is an ordinary character, not a back reference.
           */
          if (!state->backrefs_valid[iv - '0'])
            goto begin_string;
          else
            goto make_context_node;
        }
    }
  
  /* string */
  iv = translate[*state->pos];
  ++state->pos;

 begin_string:
  {
    bitset it;
    it = rx_inverse_translation (state->n_members, state->cset_size,
                                 state->valid_inv_tr, state->inv_tr,
                                 translate, iv);
    if (state->n_members[iv] == 0)
      {
        *where = 0;
        return;
      }
    else if (state->n_members[iv] > 1)
      {
        /* Several characters translate alike: match any of them with
         * a character set instead of a literal.
         */
        bitset cs;
        struct rx_exp_node * match;

        cs = (bitset) alloca (sizeof_bitset (state->cset_size));
        bitset_assign (state->cset_size, cs, it);
        match = rx_mk_r_cset (r_cset, state->cset_size, cs);
        match->observed = 0;
        *where = match;
        return;
      }
  }
  {
    char c;
    c = iv;
    *where = rx_mk_r_str (r_string, &c, 1);
  }
  (*where)->observed = 0;
  {
    /* Extend the string with following ordinary characters. */
    unsigned const char * bound;
    bound = state->pattern + state->size;
    while (state->pos < bound)
      {
        unsigned const char * item_start;

        /* Remembered so that a consumed backslash can be put back. */
        item_start = state->pos;

        switch (translate[*state->pos])
          {
          default:
          add_simple_char:
            {
              int c;

              /* Decide on the character about to be appended, not on
               * the first character of the string.  A character whose
               * inverse translation has several members must become a
               * character set of its own, so the string ends here and
               * the position goes back to the start of this character
               * (including any backslash already stepped over).
               */
              c = translate[*state->pos];
              (void) rx_inverse_translation (state->n_members, state->cset_size,
                                             state->valid_inv_tr, state->inv_tr,
                                             translate, c);
              if (state->n_members[c] != 1)
                {
                  state->pos = item_start;
                  return;
                }

              dstr_append (&(*where)->cstr, &translate[*state->pos], 1);
              ++state->pos;
              continue;
            }

          case '.':
          case '*':
          case '[':
            return;

          case '{':
          case '^':
            if (state->dfa_only)
              goto add_simple_char;
            /* fall through */
          case '(':
          case ')':
          case '|':
          case '+':
          case '?':
            if (!state->extended_p)
              goto add_simple_char;
            else
              return;

          case '$':
            if (state->dfa_only)
              goto add_simple_char;
            /* Stop wherever the anchor test earlier in this function
             * would accept '$' as an anchor, including before "\)".
             */
            if (   state->extended_p
                || rx_scan_ahead (state, "$\\)", 3)
                || rx_scan_ahead (state, "$\\|", 3)
                || (   rx_scan_ahead (state, "$", 1)
                    && (state->pos + 1 == bound)))
              return;
            else
              goto add_simple_char;

          case '\\':
            if ((state->pos + 1) == bound)
              {
                state->err = REG_BADPAT;
                longjmp (state->err_escape, 1);
              }

            if (state->dfa_only)
              {
                ++state->pos;
                goto add_simple_char;
              }

            if (isdigit (translate[*(state->pos + 1)]))
              return;

            if (state->extended_p)
              {
                ++state->pos;
                goto add_simple_char;
              }

            switch (translate[*(state->pos + 1)])
              {
              default:
                ++state->pos;
                goto add_simple_char;
              case '{':
                if (state->dfa_only)
                  goto add_simple_char;
                else
                  return;
              case '(':
              case ')':
              case '|':
              case '+':
              case '?':
                return;
              }
          }
      }
  }
}


/* Regexp Optimizer
 *
 * This function rewrites an expression returned by the parser,
 * to produce an equivalent expression that can be matched more
 * quickly.
 *
 * The gist of the optimization is to move regexp constructs which are 
 * not regular expressions closer to the root of the tree.
 */

/* Forward declaration: rx_optimize and the rx_optimize_combination_*
 * helpers below call one another.
 */
static struct rx_exp_node * rx_optimize (struct rx_parse_state * state, struct rx_exp_node * n);

/* Optimize N as the left operand of a TYPE node (r_concat or
 * r_alternate) and return the resulting tree.
 *
 * When N is itself an observed TYPE node whose right child is an
 * observed TYPE node with an unobserved right child, the tree is
 * re-associated from (a (b c)) to ((a b) c), which leaves the
 * unobserved operand c as N's right child.
 *
 * A missing (empty) child is never dereferenced; the node is returned
 * without re-association in that case.
 */
static struct rx_exp_node *
rx_optimize_combination_left (struct rx_parse_state * state,
			      enum rx_exp_node_type type, 
			      struct rx_exp_node * n)
{
  if (!n)
    return 0;

  n = rx_optimize (state, n);

  if (n->type != type)
    return n;

  if (!n->observed)
    return n;

  n->right = rx_optimize_combination_left (state, type, n->right);

  /* An alternation may have an empty right branch. */
  if (!n->right || !n->right->observed)
    return n;

  if (   (n->right->type != type)
      || !n->right->right
      || (n->right->right->observed))
    return n;

  {
    struct rx_exp_node * tmp;

    tmp = n->right->right;
    n->right->right = n->right->left;
    n->right->left = n->left;
    n->left = n->right;
    n->right = tmp;
    n->observed = 1;
    n->left->observed = 1;
    return n;
  }
}

/* Optimize N as the right operand of a TYPE node (r_concat or
 * r_alternate) and return the resulting tree.
 *
 * This mirrors rx_optimize_combination_left: when N is an observed TYPE
 * node whose left child is an observed TYPE node with an unobserved
 * left child, the tree is re-associated from ((a b) c) to (a (b c)),
 * which leaves the unobserved operand a as N's left child.
 *
 * A missing (empty) child is never dereferenced; the node is returned
 * without re-association in that case.
 */
static struct rx_exp_node *
rx_optimize_combination_right (struct rx_parse_state * state,
			       enum rx_exp_node_type type, struct rx_exp_node * n)
{
  if (!n)
    return 0;

  n = rx_optimize (state, n);

  if (n->type != type)
    return n;

  if (!n->observed)
    return n;

  n->left = rx_optimize_combination_right (state, type, n->left);

  /* An alternation may have an empty left branch. */
  if (!n->left || !n->left->observed)
    return n;

  if (   (n->left->type != type)
      || !n->left->left
      || (n->left->left->observed))
    return n;

  {
    struct rx_exp_node * tmp;

    tmp = n->left->left;
    n->left->left = n->left->right;
    n->left->right = n->right;
    n->right = n->left;
    n->left = tmp;
    n->observed = 1;
    n->right->observed = 1;
    return n;
  }
}

/* If R is the concatenation of two string nodes, append the right
 * string to the left one, free the concatenation node together with
 * the right string node, and return the left string node.  Otherwise
 * return R unchanged.
 *
 * R and, for a concatenation, both of its children must be non-null.
 */
static struct rx_exp_node *
rx_optimize_strings (struct rx_parse_state * state,
		     struct rx_exp_node * r)
{
  if (   (r->type == r_concat)
      && (r->left->type == r_string)
      && (r->right->type == r_string))
    {
      struct rx_exp_node * t;
      dstr_append (&r->left->cstr,
		   r->right->cstr.chr,
		   r->right->cstr.len);
      t = r->left;
      /* Detach only the surviving left string.  The right string has
       * been copied, so it stays attached and is released along with
       * the concatenation node instead of being leaked.
       * NOTE(review): this relies on rx_free_rexp releasing the
       * children of the node it is given, as the "detach, then free"
       * pattern in rx_parse_concat suggests -- confirm.
       */
      r->left = 0;
      rx_free_rexp (r);
      r = t;
    }
  return r;
}

/* Rewrite the tree N so that unobserved operands of nested
 * concatenations or alternations are grouped together, and return the
 * resulting tree.
 *
 * Unobserved trees are returned untouched.  For observed nodes other
 * than r_concat and r_alternate only the left child is optimized.
 *
 * For an observed r_concat or r_alternate node, both operands are
 * optimized first.  An operand is "raisable" when it has the same type
 * as N and its child adjacent to the other operand is unobserved; such
 * a child is regrouped with an unobserved neighbour into an unobserved
 * subtree.
 *
 * An operand may be missing (an empty alternative, for example).  A
 * missing operand is never dereferenced and takes part in no
 * regrouping.
 */
static struct rx_exp_node *
rx_optimize (struct rx_parse_state * state, struct rx_exp_node * n)
{
  if (!n)
    return 0;

  if (!n->observed)
    return n;

  if (   (n->type != r_alternate)
      && (n->type != r_concat))
    {
      if (n->left)
	n->left = rx_optimize (state, n->left);
      return n;
    }

  {
    struct rx_exp_node * l;
    struct rx_exp_node * r;
    int l_raisable;
    int r_raisable;

    l = rx_optimize_combination_left (state, n->type, n->left);
    r = rx_optimize_combination_right (state, n->type, n->right);

    l_raisable = (   l
		  && (l->type == n->type)
		  && l->right
		  && (!l->right->observed));

    r_raisable = (   r
		  && (r->type == n->type)
		  && r->left
		  && (!r->left->observed));

    if (l && !l->observed && r_raisable)
      {
	/* (l (rl rr)) becomes ((l rl) rr), with (l rl) unobserved. */
	struct rx_exp_node * tmp;
	tmp = r->right;
	r->right = r->left;
	r->left = l;
	r = rx_optimize_strings (state, r);
	n->right = tmp;
	n->left = r;
	r->observed = 0;
	n->observed = 1;
	return n;
      }
    else if (r && !r->observed && l_raisable)
      {
	/* ((ll lr) r) becomes (ll (lr r)), with (lr r) unobserved. */
	struct rx_exp_node * tmp;
	tmp = l->left;
	l->left = l->right;
	l->right = r;
	n->left = tmp;
	n->right = l;
	l->observed = 0;
	n->observed = 1;
	return n;
      }
    else if (l_raisable && r_raisable)
      {
	/* ((ll lr) (rl rr)) becomes (ll ((lr rl) rr)), with (lr rl)
	 * unobserved.
	 */
	struct rx_exp_node * leafs[4];
	leafs[0] = l->left;
	leafs[1] = l->right;
	leafs[2] = r->left;
	leafs[3] = r->right;

	n->left = leafs[0];
	n->right = l;
	l->left = r;
	r->left = leafs[1];
	r->right = leafs[2];
	l->right = leafs[3];

	n->observed = 1;
	l->observed = 1;
	r->observed = 0;
	return n;
      }
    else
      {
	n->left = l;
	n->right = r;
	return n;
      }
  }
}


/* rx_parse
 *
 * Translate a string into a regexp syntax tree.
 * 
 * rx_exp_p : (return parameter) the resulting syntax tree.
 * nsub : (return parameter) the number of parenthesized subexpressions.
 * pattern : the input string.
 * size : the length of that string.
 * extended_p : if 0, use "Posix Basic Syntax", otherwise "Posix Extended".
 * no_newline : if not 0, don't match newline with patterns like "." or "[^a]".
 * dfa_only : if not 0, compile pure regular expressions, not regexps.
 * cset_size : 256
 * translate : an array of cset_size characters, defining a mapping
 *	       from characters to characters.  The compiler reads "pattern"
 *	       through this mapping (i.e., parses according to 
 * 	       "translate[*pattern]" instead of simple "*pattern").
 *	       Additionally, the pattern is modified to achieve the 
 *	       effect of translating a target string through the same
 *	       translation.  Conceptually, the functions that perform
 *	       matching could translate the string being compared to
 *	       the pattern using "translate[*string]" but in fact, the
 *	       same effect is achieved by changing the pattern itself,
 *	       instead.
 */
int
rx_parse (struct rx_exp_node ** rx_exp_p,
	  int *nsub,
	  const char *pattern,
	  int size,
	  int extended_p,
	  int no_newline,
	  int dfa_only,
	  int cset_size,
	  unsigned char *translate)
{
  struct rx_parse_state state;

  /* Start from an all-zero state, then fill in everything that is not
   * zero.  A null TRANSLATE selects the identity table.
   */
  memset (&state, 0, sizeof (state));

  state.pattern = (unsigned char const *)pattern;
  state.pos = (unsigned char const *)pattern;
  state.size = size;
  state.cset_size = cset_size;
  state.translate = translate ? translate : rx_id_translation_table;
  state.extended_p = extended_p;
  state.no_newline = no_newline;
  state.dfa_only = dfa_only;
  state.at_beg = 1;
  state.exp_number = 1;
  state.cut_count = 1;

  /* Every parse error arrives here by longjmp, with the code already
   * stored in state.err.
   */
  if (setjmp (state.err_escape))
    {
      rx_free_rexp (state.root);
      return state.err;
    }

  rx_parse_alt (&state.root, &state);
  *nsub = state.exp_number;

  /* Whatever is left over is an error: either a stray closing
   * parenthesis or some other unparsed text.
   */
  if (rx_scan_op_ahead (&state, ')'))
    {
      state.err = REG_ERPAREN;
      longjmp (state.err_escape, 1);
    }

  if (!rx_eop (&state))
    {
      state.err = REG_BADPAT;
      longjmp (state.err_escape, 1);
    }

  *rx_exp_p = rx_optimize (&state, state.root);
  return 0;
}
