granicus.if.org Git - postgresql/blob - src/backend/regex/regcomp.c

   1 /*-
   2  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
   3  * Copyright (c) 1992, 1993, 1994
   4  *              The Regents of the University of California.  All rights reserved.
   5  *
   6  * This code is derived from software contributed to Berkeley by
   7  * Henry Spencer.
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  * 1. Redistributions of source code must retain the above copyright
  13  *        notice, this list of conditions and the following disclaimer.
  14  * 2. Redistributions in binary form must reproduce the above copyright
  15  *        notice, this list of conditions and the following disclaimer in the
  16  *        documentation and/or other materials provided with the distribution.
  17  * 3. All advertising materials mentioning features or use of this software
  18  *        must display the following acknowledgement:
  19  *              This product includes software developed by the University of
  20  *              California, Berkeley and its contributors.
  21  * 4. Neither the name of the University nor the names of its contributors
  22  *        may be used to endorse or promote products derived from this software
  23  *        without specific prior written permission.
  24  *
  25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  28  * ARE DISCLAIMED.      IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  35  * SUCH DAMAGE.
  36  *
  37  *              @(#)regcomp.c   8.5 (Berkeley) 3/20/94
  38  */
  39
  40 #if defined(LIBC_SCCS) && !defined(lint)
  41 static char sccsid[] = "@(#)regcomp.c   8.5 (Berkeley) 3/20/94";
  42
  43 #endif   /* LIBC_SCCS and not lint */
  44
  45 #include "postgres.h"
  46
  47 #include <sys/types.h>
  48 #include <stdio.h>
  49 #include <string.h>
  50 #include <ctype.h>
  51 #include <limits.h>
  52 #include <stdlib.h>
  53 #include <assert.h>
  54
  55 #include "regex/regex.h"
  56 #include "regex/utils.h"
  57 #include "regex/regex2.h"
  58 #include "regex/cclass.h"
  59 #include "regex/cname.h"
  60
  61 /*
  62  * parse structure, passed up and down to avoid global variables and
  63  * other clumsinesses
  64  */
  65 struct parse
  66 {
  67         pg_wchar   *next;                       /* next character in RE */
  68         pg_wchar   *end;                        /* end of string (-> NUL normally) */
  69         int                     error;                  /* has an error been seen? */
  70         sop                *strip;                      /* malloced strip */
  71         sopno           ssize;                  /* malloced strip size (allocated) */
  72         sopno           slen;                   /* malloced strip length (used) */
  73         int                     ncsalloc;               /* number of csets allocated */
  74         struct re_guts *g;
  75 #define  NPAREN  10                             /* we need to remember () 1-9 for back
  76                                                                  * refs */
  77         sopno           pbegin[NPAREN]; /* -> ( ([0] unused) */
  78         sopno           pend[NPAREN];   /* -> ) ([0] unused) */
  79 };
  80
  81 /* ========= begin header generated by ./mkh ========= */
  82 #ifdef __cplusplus
  83 extern          "C"
  84 {
  85 #endif
  86
  87 /* === regcomp.c === */
  88         static void p_ere(struct parse * p, int stop);
  89         static void p_ere_exp(struct parse * p);
  90         static void p_str(struct parse * p);
  91         static void p_bre(struct parse * p, int end1, int end2);
  92         static int      p_simp_re(struct parse * p, int starordinary);
  93         static int      p_count(struct parse * p);
  94         static void p_bracket(struct parse * p);
  95         static void p_b_term(struct parse * p, cset *cs);
  96         static void p_b_cclass(struct parse * p, cset *cs);
  97         static void p_b_eclass(struct parse * p, cset *cs);
  98         static pg_wchar p_b_symbol(struct parse * p);
  99         static char p_b_coll_elem(struct parse * p, int endc);
 100 #ifdef MULTIBYTE
 101         static unsigned char othercase(int ch);
 102 #else
 103         static char othercase(int ch);
 104 #endif
 105         static void bothcases(struct parse * p, int ch);
 106         static void ordinary(struct parse * p, int ch);
 107         static void nonnewline(struct parse * p);
 108         static void repeat(struct parse * p, sopno start, int from, int to);
 109         static int      seterr(struct parse * p, int e);
 110         static cset *allocset(struct parse * p);
 111         static void freeset(struct parse * p, cset *cs);
 112         static int      freezeset(struct parse * p, cset *cs);
 113         static int      firstch(struct parse * p, cset *cs);
 114         static int      nch(struct parse * p, cset *cs);
 115         static void mcadd(struct parse * p, cset *cs, char *cp);
 116         static void mcinvert(struct parse * p, cset *cs);
 117         static void mccase(struct parse * p, cset *cs);
 118         static int      isinsets(struct re_guts * g, int c);
 119         static int      samesets(struct re_guts * g, int c1, int c2);
 120         static void categorize(struct parse * p, struct re_guts * g);
 121         static sopno dupl(struct parse * p, sopno start, sopno finish);
 122         static void doemit(struct parse * p, sop op, size_t opnd);
 123         static void doinsert(struct parse * p, sop op, size_t opnd, sopno pos);
 124         static void dofwd(struct parse * p, sopno pos, sop value);
 125         static void enlarge(struct parse * p, sopno size);
 126         static void stripsnug(struct parse * p, struct re_guts * g);
 127         static void findmust(struct parse * p, struct re_guts * g);
 128         static sopno pluscount(struct parse * p, struct re_guts * g);
 129         static int      pg_isdigit(int c);
 130         static int      pg_isalpha(int c);
 131         static int      pg_isupper(int c);
 132         static int      pg_islower(int c);
 133
 134 #ifdef __cplusplus
 135 }
 136
 137 #endif
 138 /* ========= end header generated by ./mkh ========= */
 139
 140 static pg_wchar nuls[10];               /* place to point scanner in event of
 141                                                                  * error */
 142
 143 /*
 144  * macros for use with parse structure
 145  * BEWARE:      these know that the parse structure is named `p' !!!
 146  */
 147 #define PEEK()  (*p->next)
 148 #define PEEK2() (*(p->next+1))
 149 #define MORE()  (p->next < p->end)
 150 #define MORE2() (p->next+1 < p->end)
 151 #define SEE(c)  (MORE() && PEEK() == (c))
 152 #define SEETWO(a, b)    (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
 153 #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0)
 154 #define EATTWO(a, b)    ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
 155 #define NEXT()  (p->next++)
 156 #define NEXT2() (p->next += 2)
 157 #define NEXTn(n)                (p->next += (n))
 158 #define GETNEXT()               (*p->next++)
 159 #define SETERROR(e)             seterr(p, (e))
 160 #define REQUIRE(co, e)  if (!(co)) SETERROR(e)
 161 #define MUSTSEE(c, e)   REQUIRE(MORE() && PEEK() == (c), e)
 162 #define MUSTEAT(c, e)   REQUIRE(MORE() && GETNEXT() == (c), e)
 163 #define MUSTNOTSEE(c, e)                REQUIRE(!MORE() || PEEK() != (c), e)
 164 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
 165 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
 166 #define AHEAD(pos)                              dofwd(p, pos, HERE()-(pos))
 167 #define ASTERN(sop, pos)                EMIT(sop, HERE()-pos)
 168 #define HERE()                  (p->slen)
 169 #define THERE()                 (p->slen - 1)
 170 #define THERETHERE()    (p->slen - 2)
 171 #define DROP(n) (p->slen -= (n))
 172
 173 #ifndef NDEBUG
 174 static int      never = 0;                      /* for use in asserts; shuts lint up */
 175
 176 #else
 177 #define never   0                               /* some <assert.h>s have bugs too */
 178 #endif
 179
 180 /*
 181  - regcomp - interface for parser and compilation
 182  = extern int regcomp(regex_t *, const char *, int);
 183  = #define              REG_BASIC               0000
 184  = #define              REG_EXTENDED    0001
 185  = #define              REG_ICASE               0002
 186  = #define              REG_NOSUB               0004
 187  = #define              REG_NEWLINE             0010
 188  = #define              REG_NOSPEC              0020
 189  = #define              REG_PEND                0040
 190  = #define              REG_DUMP                0200
 191  */
 192 int                                                             /* 0 success, otherwise REG_something */
 193 pg95_regcomp(preg, pattern, cflags)
 194 regex_t    *preg;
 195 const char *pattern;
 196 int                     cflags;
 197 {
 198         struct parse pa;
 199         struct re_guts *g;
 200         struct parse *p = &pa;
 201         int                     i;
 202         size_t          len;
 203
 204 #ifdef MULTIBYTE
 205         pg_wchar   *wcp;
 206
 207 #endif
 208
 209 #ifdef REDEBUG
 210 #define  GOODFLAGS(f)    (f)
 211 #else
 212 #define  GOODFLAGS(f)    ((f)&~REG_DUMP)
 213 #endif
 214
 215         cflags = GOODFLAGS(cflags);
 216         if ((cflags & REG_EXTENDED) && (cflags & REG_NOSPEC))
 217                 return REG_INVARG;
 218
 219         if (cflags & REG_PEND)
 220         {
 221 #ifdef MULTIBYTE
 222                 wcp = preg->patsave;
 223                 if (preg->re_endp < wcp)
 224                         return REG_INVARG;
 225                 len = preg->re_endp - wcp;
 226 #else
 227                 if (preg->re_endp < pattern)
 228                         return REG_INVARG;
 229                 len = preg->re_endp - pattern;
 230 #endif
 231         }
 232         else
 233         {
 234 #ifdef MULTIBYTE
 235                 wcp = (pg_wchar *) malloc((strlen(pattern) + 1) * sizeof(pg_wchar));
 236                 if (wcp == NULL)
 237                         return REG_ESPACE;
 238                 preg->patsave = wcp;
 239                 (void) pg_mb2wchar((unsigned char *) pattern, wcp);
 240                 len = pg_wchar_strlen(wcp);
 241 #else
 242
 243                 len = strlen((char *) pattern);
 244 #endif
 245         }
 246
 247         /* do the mallocs early so failure handling is easy */
 248         g = (struct re_guts *) malloc(sizeof(struct re_guts) +
 249                                                                   (NC - 1) * sizeof(cat_t));
 250         if (g == NULL)
 251                 return REG_ESPACE;
 252         p->ssize = len / (size_t) 2 *(size_t) 3 + (size_t) 1;           /* ugh */
 253
 254         p->strip = (sop *) malloc(p->ssize * sizeof(sop));
 255         p->slen = 0;
 256         if (p->strip == NULL)
 257         {
 258                 free((char *) g);
 259                 return REG_ESPACE;
 260         }
 261
 262         /* set things up */
 263         p->g = g;
 264 #ifdef MULTIBYTE
 265         p->next = wcp;
 266 #else
 267         p->next = (pg_wchar *) pattern;         /* convenience; we do not modify
 268                                                                                  * it */
 269 #endif
 270         p->end = p->next + len;
 271         p->error = 0;
 272         p->ncsalloc = 0;
 273         for (i = 0; i < NPAREN; i++)
 274         {
 275                 p->pbegin[i] = 0;
 276                 p->pend[i] = 0;
 277         }
 278         g->csetsize = NC;
 279         g->sets = NULL;
 280         g->setbits = NULL;
 281         g->ncsets = 0;
 282         g->cflags = cflags;
 283         g->iflags = 0;
 284         g->nbol = 0;
 285         g->neol = 0;
 286         g->must = NULL;
 287         g->mlen = 0;
 288         g->nsub = 0;
 289         g->ncategories = 1;                     /* category 0 is "everything else" */
 290         g->categories = &g->catspace[-(CHAR_MIN)];
 291         memset((char *) g->catspace, 0, NC * sizeof(cat_t));
 292         g->backrefs = 0;
 293
 294         /* do it */
 295         EMIT(OEND, 0);
 296         g->firststate = THERE();
 297         if (cflags & REG_EXTENDED)
 298                 p_ere(p, OUT);
 299         else if (cflags & REG_NOSPEC)
 300                 p_str(p);
 301         else
 302                 p_bre(p, OUT, OUT);
 303         EMIT(OEND, 0);
 304         g->laststate = THERE();
 305
 306         /* tidy up loose ends and fill things in */
 307         categorize(p, g);
 308         stripsnug(p, g);
 309         findmust(p, g);
 310         g->nplus = pluscount(p, g);
 311         g->magic = MAGIC2;
 312         preg->re_nsub = g->nsub;
 313         preg->re_g = g;
 314         preg->re_magic = MAGIC1;
 315 #ifndef REDEBUG
 316         /* not debugging, so can't rely on the assert() in regexec() */
 317         if (g->iflags & BAD)
 318                 SETERROR(REG_ASSERT);
 319 #endif
 320
 321         /* win or lose, we're done */
 322         if (p->error != 0)                      /* lose */
 323                 pg95_regfree(preg);
 324         return p->error;
 325 }
 326
 327 /*
 328  - p_ere - ERE parser top level, concatenation and alternation
 329  == static void p_ere(struct parse *p, int stop);
 330  */
 331 static void
 332 p_ere(p, stop)
 333 struct parse *p;
 334 int                     stop;                           /* character this ERE should end at */
 335 {
 336         char            c;
 337         sopno           prevback = 0;
 338         sopno           prevfwd = 0;
 339         sopno           conc;
 340         int                     first = 1;              /* is this the first alternative? */
 341
 342         for (;;)
 343         {
 344                 /* do a bunch of concatenated expressions */
 345                 conc = HERE();
 346                 while (MORE() && (c = PEEK()) != '|' && c != stop)
 347                         p_ere_exp(p);
 348                 REQUIRE(HERE() != conc, REG_EMPTY);             /* require nonempty */
 349
 350                 if (!EAT('|'))
 351                         break;                          /* NOTE BREAK OUT */
 352
 353                 if (first)
 354                 {
 355                         INSERT(OCH_, conc); /* offset is wrong */
 356                         prevfwd = conc;
 357                         prevback = conc;
 358                         first = 0;
 359                 }
 360                 ASTERN(OOR1, prevback);
 361                 prevback = THERE();
 362                 AHEAD(prevfwd);                 /* fix previous offset */
 363                 prevfwd = HERE();
 364                 EMIT(OOR2, 0);                  /* offset is very wrong */
 365         }
 366
 367         if (!first)
 368         {                                                       /* tail-end fixups */
 369                 AHEAD(prevfwd);
 370                 ASTERN(O_CH, prevback);
 371         }
 372
 373         assert(!MORE() || SEE(stop));
 374 }
 375
 376 /*
 377  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
 378  == static void p_ere_exp(struct parse *p);
 379  */
 380 static void
 381 p_ere_exp(p)
 382 struct parse *p;
 383 {
 384         pg_wchar        c;
 385         sopno           pos;
 386         int                     count;
 387         int                     count2;
 388         sopno           subno;
 389         int                     wascaret = 0;
 390
 391         assert(MORE());                         /* caller should have ensured this */
 392         c = GETNEXT();
 393
 394         pos = HERE();
 395         switch (c)
 396         {
 397                 case '(':
 398                         REQUIRE(MORE(), REG_EPAREN);
 399                         p->g->nsub++;
 400                         subno = p->g->nsub;
 401                         if (subno < NPAREN)
 402                                 p->pbegin[subno] = HERE();
 403                         EMIT(OLPAREN, subno);
 404                         if (!SEE(')'))
 405                                 p_ere(p, ')');
 406                         if (subno < NPAREN)
 407                         {
 408                                 p->pend[subno] = HERE();
 409                                 assert(p->pend[subno] != 0);
 410                         }
 411                         EMIT(ORPAREN, subno);
 412                         MUSTEAT(')', REG_EPAREN);
 413                         break;
 414 #ifndef POSIX_MISTAKE
 415                 case ')':                               /* happens only if no current unmatched ( */
 416
 417                         /*
 418                          * You may ask, why the ifndef?  Because I didn't notice this
 419                          * until slightly too late for 1003.2, and none of the other
 420                          * 1003.2 regular-expression reviewers noticed it at all.  So
 421                          * an unmatched ) is legal POSIX, at least until we can get it
 422                          * fixed.
 423                          */
 424                         SETERROR(REG_EPAREN);
 425                         break;
 426 #endif
 427                 case '^':
 428                         EMIT(OBOL, 0);
 429                         p->g->iflags |= USEBOL;
 430                         p->g->nbol++;
 431                         wascaret = 1;
 432                         break;
 433                 case '$':
 434                         EMIT(OEOL, 0);
 435                         p->g->iflags |= USEEOL;
 436                         p->g->neol++;
 437                         break;
 438                 case '|':
 439                         SETERROR(REG_EMPTY);
 440                         break;
 441                 case '*':
 442                 case '+':
 443                 case '?':
 444                         SETERROR(REG_BADRPT);
 445                         break;
 446                 case '.':
 447                         if (p->g->cflags & REG_NEWLINE)
 448                                 nonnewline(p);
 449                         else
 450                                 EMIT(OANY, 0);
 451                         break;
 452                 case '[':
 453                         p_bracket(p);
 454                         break;
 455                 case '\\':
 456                         REQUIRE(MORE(), REG_EESCAPE);
 457                         c = GETNEXT();
 458                         ordinary(p, c);
 459                         break;
 460                 case '{':                               /* okay as ordinary except if digit
 461                                                                  * follows */
 462                         REQUIRE(!MORE() || !pg_isdigit(PEEK()), REG_BADRPT);
 463                         /* FALLTHROUGH */
 464                 default:
 465                         ordinary(p, c);
 466                         break;
 467         }
 468
 469         if (!MORE())
 470                 return;
 471         c = PEEK();
 472         /* we call { a repetition if followed by a digit */
 473         if (!(c == '*' || c == '+' || c == '?' ||
 474                   (c == '{' && MORE2() && pg_isdigit(PEEK2()))))
 475                 return;                                 /* no repetition, we're done */
 476         NEXT();
 477
 478         REQUIRE(!wascaret, REG_BADRPT);
 479         switch (c)
 480         {
 481                 case '*':                               /* implemented as +? */
 482                         /* this case does not require the (y|) trick, noKLUDGE */
 483                         INSERT(OPLUS_, pos);
 484                         ASTERN(O_PLUS, pos);
 485                         INSERT(OQUEST_, pos);
 486                         ASTERN(O_QUEST, pos);
 487                         break;
 488                 case '+':
 489                         INSERT(OPLUS_, pos);
 490                         ASTERN(O_PLUS, pos);
 491                         break;
 492                 case '?':
 493                         /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
 494                         INSERT(OCH_, pos);      /* offset slightly wrong */
 495                         ASTERN(OOR1, pos);      /* this one's right */
 496                         AHEAD(pos);                     /* fix the OCH_ */
 497                         EMIT(OOR2, 0);          /* offset very wrong... */
 498                         AHEAD(THERE());         /* ...so fix it */
 499                         ASTERN(O_CH, THERETHERE());
 500                         break;
 501                 case '{':
 502                         count = p_count(p);
 503                         if (EAT(','))
 504                         {
 505                                 if (pg_isdigit(PEEK()))
 506                                 {
 507                                         count2 = p_count(p);
 508                                         REQUIRE(count <= count2, REG_BADBR);
 509                                 }
 510                                 else
 511 /* single number with comma */
 512                                         count2 = INFINITY;
 513                         }
 514                         else
 515 /* just a single number */
 516                                 count2 = count;
 517                         repeat(p, pos, count, count2);
 518                         if (!EAT('}'))
 519                         {                                       /* error heuristics */
 520                                 while (MORE() && PEEK() != '}')
 521                                         NEXT();
 522                                 REQUIRE(MORE(), REG_EBRACE);
 523                                 SETERROR(REG_BADBR);
 524                         }
 525                         break;
 526         }
 527
 528         if (!MORE())
 529                 return;
 530         c = PEEK();
 531         if (!(c == '*' || c == '+' || c == '?' ||
 532                   (c == '{' && MORE2() && pg_isdigit(PEEK2()))))
 533                 return;
 534         SETERROR(REG_BADRPT);
 535 }
 536
 537 /*
 538  - p_str - string (no metacharacters) "parser"
 539  == static void p_str(struct parse *p);
 540  */
 541 static void
 542 p_str(p)
 543 struct parse *p;
 544 {
 545         REQUIRE(MORE(), REG_EMPTY);
 546         while (MORE())
 547                 ordinary(p, GETNEXT());
 548 }
 549
 550 /*
 551  - p_bre - BRE parser top level, anchoring and concatenation
 552  == static void p_bre(struct parse *p, int end1, \
 553  ==             int end2);
 554  * Giving end1 as OUT essentially eliminates the end1/end2 check.
 555  *
 556  * This implementation is a bit of a kludge, in that a trailing $ is first
 557  * taken as an ordinary character and then revised to be an anchor.  The
 558  * only undesirable side effect is that '$' gets included as a character
 559  * category in such cases.      This is fairly harmless; not worth fixing.
 560  * The amount of lookahead needed to avoid this kludge is excessive.
 561  */
 562 static void
 563 p_bre(p, end1, end2)
 564 struct parse *p;
 565 int                     end1;                           /* first terminating character */
 566 int                     end2;                           /* second terminating character */
 567 {
 568         sopno           start = HERE();
 569         int                     first = 1;              /* first subexpression? */
 570         int                     wasdollar = 0;
 571
 572         if (EAT('^'))
 573         {
 574                 EMIT(OBOL, 0);
 575                 p->g->iflags |= USEBOL;
 576                 p->g->nbol++;
 577         }
 578         while (MORE() && !SEETWO(end1, end2))
 579         {
 580                 wasdollar = p_simp_re(p, first);
 581                 first = 0;
 582         }
 583         if (wasdollar)
 584         {                                                       /* oops, that was a trailing anchor */
 585                 DROP(1);
 586                 EMIT(OEOL, 0);
 587                 p->g->iflags |= USEEOL;
 588                 p->g->neol++;
 589         }
 590
 591         REQUIRE(HERE() != start, REG_EMPTY);            /* require nonempty */
 592 }
 593
 594 /*
 595  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
 596  == static int p_simp_re(struct parse *p, int starordinary);
 597  */
 598 static int                                              /* was the simple RE an unbackslashed $? */
 599 p_simp_re(p, starordinary)
 600 struct parse *p;
 601 int                     starordinary;           /* is a leading * an ordinary character? */
 602 {
 603         int                     c;
 604         int                     count;
 605         int                     count2;
 606         sopno           pos;
 607         int                     i;
 608         sopno           subno;
 609
 610 #define  BACKSL  (1<<24)
 611
 612         pos = HERE();                           /* repetion op, if any, covers from here */
 613
 614         assert(MORE());                         /* caller should have ensured this */
 615         c = GETNEXT();
 616         if (c == '\\')
 617         {
 618                 REQUIRE(MORE(), REG_EESCAPE);
 619 #ifdef MULTIBYTE
 620                 c = BACKSL | (pg_wchar) GETNEXT();
 621 #else
 622                 c = BACKSL | (unsigned char) GETNEXT();
 623 #endif
 624         }
 625         switch (c)
 626         {
 627                 case '.':
 628                         if (p->g->cflags & REG_NEWLINE)
 629                                 nonnewline(p);
 630                         else
 631                                 EMIT(OANY, 0);
 632                         break;
 633                 case '[':
 634                         p_bracket(p);
 635                         break;
 636                 case BACKSL | '{':
 637                         SETERROR(REG_BADRPT);
 638                         break;
 639                 case BACKSL | '(':
 640                         p->g->nsub++;
 641                         subno = p->g->nsub;
 642                         if (subno < NPAREN)
 643                                 p->pbegin[subno] = HERE();
 644                         EMIT(OLPAREN, subno);
 645                         /* the MORE here is an error heuristic */
 646                         if (MORE() && !SEETWO('\\', ')'))
 647                                 p_bre(p, '\\', ')');
 648                         if (subno < NPAREN)
 649                         {
 650                                 p->pend[subno] = HERE();
 651                                 assert(p->pend[subno] != 0);
 652                         }
 653                         EMIT(ORPAREN, subno);
 654                         REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
 655                         break;
 656                 case BACKSL | ')':              /* should not get here -- must be user */
 657                 case BACKSL | '}':
 658                         SETERROR(REG_EPAREN);
 659                         break;
 660                 case BACKSL | '1':
 661                 case BACKSL | '2':
 662                 case BACKSL | '3':
 663                 case BACKSL | '4':
 664                 case BACKSL | '5':
 665                 case BACKSL | '6':
 666                 case BACKSL | '7':
 667                 case BACKSL | '8':
 668                 case BACKSL | '9':
 669                         i = (c & ~BACKSL) - '0';
 670                         assert(i < NPAREN);
 671                         if (p->pend[i] != 0)
 672                         {
 673                                 assert(i <= p->g->nsub);
 674                                 EMIT(OBACK_, i);
 675                                 assert(p->pbegin[i] != 0);
 676                                 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
 677                                 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
 678                                 dupl(p, p->pbegin[i] + 1, p->pend[i]);
 679                                 EMIT(O_BACK, i);
 680                         }
 681                         else
 682                                 SETERROR(REG_ESUBREG);
 683                         p->g->backrefs = 1;
 684                         break;
 685                 case '*':
 686                         REQUIRE(starordinary, REG_BADRPT);
 687                         /* FALLTHROUGH */
 688                 default:
 689                         ordinary(p, c & ~BACKSL);
 690                         break;
 691         }
 692
 693         if (EAT('*'))
 694         {                                                       /* implemented as +? */
 695                 /* this case does not require the (y|) trick, noKLUDGE */
 696                 INSERT(OPLUS_, pos);
 697                 ASTERN(O_PLUS, pos);
 698                 INSERT(OQUEST_, pos);
 699                 ASTERN(O_QUEST, pos);
 700         }
 701         else if (EATTWO('\\', '{'))
 702         {
 703                 count = p_count(p);
 704                 if (EAT(','))
 705                 {
 706                         if (MORE() && pg_isdigit(PEEK()))
 707                         {
 708                                 count2 = p_count(p);
 709                                 REQUIRE(count <= count2, REG_BADBR);
 710                         }
 711                         else
 712 /* single number with comma */
 713                                 count2 = INFINITY;
 714                 }
 715                 else
 716 /* just a single number */
 717                         count2 = count;
 718                 repeat(p, pos, count, count2);
 719                 if (!EATTWO('\\', '}'))
 720                 {                                               /* error heuristics */
 721                         while (MORE() && !SEETWO('\\', '}'))
 722                                 NEXT();
 723                         REQUIRE(MORE(), REG_EBRACE);
 724                         SETERROR(REG_BADBR);
 725                 }
 726         }
 727         else if (c == (unsigned char) '$')      /* $ (but not \$) ends it */
 728                 return 1;
 729
 730         return 0;
 731 }
 732
 733 /*
 734  - p_count - parse a repetition count
 735  == static int p_count(struct parse *p);
 736  */
 737 static int                                              /* the value */
 738 p_count(p)
 739 struct parse *p;
 740 {
 741         int                     count = 0;
 742         int                     ndigits = 0;
 743
 744         while (MORE() && pg_isdigit(PEEK()) && count <= DUPMAX)
 745         {
 746                 count = count * 10 + (GETNEXT() - '0');
 747                 ndigits++;
 748         }
 749
 750         REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
 751         return count;
 752 }
 753
 754 /*
 755  - p_bracket - parse a bracketed character list
 756  == static void p_bracket(struct parse *p);
 757  *
 758  * Note a significant property of this code:  if the allocset() did SETERROR,
 759  * no set operations are done.
 760  */
 761 static void
 762 p_bracket(p)
 763 struct parse *p;
 764 {
 765         cset       *cs = allocset(p);
 766         int                     invert = 0;
 767
 768 #ifdef MULTIBYTE
 769         pg_wchar        sp1[] = {'[', ':', '<', ':', ']', ']'};
 770         pg_wchar        sp2[] = {'[', ':', '>', ':', ']', ']'};
 771
 772 #endif
 773
 774         /* Dept of Truly Sickening Special-Case Kludges */
 775 #ifdef MULTIBYTE
 776         if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp1, 6) == 0)
 777 #else
 778         if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0)
 779 #endif
 780         {
 781                 EMIT(OBOW, 0);
 782                 NEXTn(6);
 783                 return;
 784         }
 785 #ifdef MULTIBYTE
 786         if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp2, 6) == 0)
 787 #else
 788         if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0)
 789 #endif
 790         {
 791                 EMIT(OEOW, 0);
 792                 NEXTn(6);
 793                 return;
 794         }
 795
 796         if (EAT('^'))
 797                 invert++;                               /* make note to invert set at end */
 798         if (EAT(']'))
 799                 CHadd(cs, ']');
 800         else if (EAT('-'))
 801                 CHadd(cs, '-');
 802         while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
 803                 p_b_term(p, cs);
 804         if (EAT('-'))
 805                 CHadd(cs, '-');
 806         MUSTEAT(']', REG_EBRACK);
 807
 808         if (p->error != 0)                      /* don't mess things up further */
 809                 return;
 810
 811         if (p->g->cflags & REG_ICASE)
 812         {
 813                 int                     i;
 814                 int                     ci;
 815
 816                 for (i = p->g->csetsize - 1; i >= 0; i--)
 817                         if (CHIN(cs, i) && pg_isalpha(i))
 818                         {
 819                                 ci = othercase(i);
 820                                 if (ci != i)
 821                                         CHadd(cs, ci);
 822                         }
 823                 if (cs->multis != NULL)
 824                         mccase(p, cs);
 825         }
 826         if (invert)
 827         {
 828                 int                     i;
 829
 830                 for (i = p->g->csetsize - 1; i >= 0; i--)
 831                         if (CHIN(cs, i))
 832                                 CHsub(cs, i);
 833                         else
 834                                 CHadd(cs, i);
 835                 if (p->g->cflags & REG_NEWLINE)
 836                         CHsub(cs, '\n');
 837                 if (cs->multis != NULL)
 838                         mcinvert(p, cs);
 839         }
 840
 841         assert(cs->multis == NULL); /* xxx */
 842
 843         if (nch(p, cs) == 1)
 844         {                                                       /* optimize singleton sets */
 845                 ordinary(p, firstch(p, cs));
 846                 freeset(p, cs);
 847         }
 848         else
 849                 EMIT(OANYOF, freezeset(p, cs));
 850 }
 851
 852 /*
 853  - p_b_term - parse one term of a bracketed character list
 854  == static void p_b_term(struct parse *p, cset *cs);
 855  */
 856 static void
 857 p_b_term(p, cs)
 858 struct parse *p;
 859 cset       *cs;
 860 {
 861         pg_wchar        c;
 862         pg_wchar        start,
 863                                 finish;
 864         int                     i;
 865
 866         /* classify what we've got */
 867         switch ((MORE()) ? PEEK() : '\0')
 868         {
 869                 case '[':
 870                         c = (MORE2()) ? PEEK2() : '\0';
 871                         break;
 872                 case '-':
 873                         SETERROR(REG_ERANGE);
 874                         return;                         /* NOTE RETURN */
 875                         break;
 876                 default:
 877                         c = '\0';
 878                         break;
 879         }
 880
 881         switch (c)
 882         {
 883                 case ':':                               /* character class */
 884                         NEXT2();
 885                         REQUIRE(MORE(), REG_EBRACK);
 886                         c = PEEK();
 887                         REQUIRE(c != '-' && c != ']', REG_ECTYPE);
 888                         p_b_cclass(p, cs);
 889                         REQUIRE(MORE(), REG_EBRACK);
 890                         REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
 891                         break;
 892                 case '=':                               /* equivalence class */
 893                         NEXT2();
 894                         REQUIRE(MORE(), REG_EBRACK);
 895                         c = PEEK();
 896                         REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
 897                         p_b_eclass(p, cs);
 898                         REQUIRE(MORE(), REG_EBRACK);
 899                         REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
 900                         break;
 901                 default:                                /* symbol, ordinary character, or range */
 902 /* xxx revision needed for multichar stuff */
 903                         start = p_b_symbol(p);
 904                         if (SEE('-') && MORE2() && PEEK2() != ']')
 905                         {
 906                                 /* range */
 907                                 NEXT();
 908                                 if (EAT('-'))
 909                                         finish = '-';
 910                                 else
 911                                         finish = p_b_symbol(p);
 912                         }
 913                         else
 914                                 finish = start;
 915 /* xxx what about signed chars here... */
 916                         REQUIRE(start <= finish, REG_ERANGE);
 917 #ifdef MULTIBYTE
 918                         if (CHlc(start) != CHlc(finish))
 919                                 SETERROR(REG_ERANGE);
 920 #endif
 921                         for (i = start; i <= finish; i++)
 922                                 CHadd(cs, i);
 923                         break;
 924         }
 925 }
 926
 927 /*
 928  - p_b_cclass - parse a character-class name and deal with it
 929  == static void p_b_cclass(struct parse *p, cset *cs);
 930  */
 931 static void
 932 p_b_cclass(p, cs)
 933 struct parse *p;
 934 cset       *cs;
 935 {
 936         pg_wchar   *sp = p->next;
 937         struct cclass *cp;
 938         size_t          len;
 939         char       *u;
 940         char            c;
 941
 942         while (MORE() && pg_isalpha(PEEK()))
 943                 NEXT();
 944         len = p->next - sp;
 945         for (cp = cclasses; cp->name != NULL; cp++)
 946 #ifdef MULTIBYTE
 947                 if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
 948 #else
 949                 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
 950 #endif
 951                         break;
 952         if (cp->name == NULL)
 953         {
 954                 /* oops, didn't find it */
 955                 SETERROR(REG_ECTYPE);
 956                 return;
 957         }
 958
 959         u = cp->chars;
 960         while ((c = *u++) != '\0')
 961                 CHadd(cs, c);
 962         for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
 963                 MCadd(p, cs, u);
 964 }
 965
 966 /*
 967  - p_b_eclass - parse an equivalence-class name and deal with it
 968  == static void p_b_eclass(struct parse *p, cset *cs);
 969  *
 970  * This implementation is incomplete. xxx
 971  */
 972 static void
 973 p_b_eclass(p, cs)
 974 struct parse *p;
 975 cset       *cs;
 976 {
 977         char            c;
 978
 979         c = p_b_coll_elem(p, '=');
 980         CHadd(cs, c);
 981 }
 982
 983 /*
 984  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
 985  == static char p_b_symbol(struct parse *p);
 986  */
 987 static pg_wchar                 /* value of symbol */
 988 p_b_symbol(p)
 989 struct parse *p;
 990 {
 991         pg_wchar        value;
 992
 993         REQUIRE(MORE(), REG_EBRACK);
 994         if (!EATTWO('[', '.'))
 995                 return GETNEXT();
 996
 997         /* collating symbol */
 998         value = p_b_coll_elem(p, '.');
 999         REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1000         return value;
1001 }
1002
1003 /*
1004  - p_b_coll_elem - parse a collating-element name and look it up
1005  == static char p_b_coll_elem(struct parse *p, int endc);
1006  */
1007 static char                                             /* value of collating element */
1008 p_b_coll_elem(p, endc)
1009 struct parse *p;
1010 int                     endc;                           /* name ended by endc,']' */
1011 {
1012         pg_wchar   *sp = p->next;
1013         struct cname *cp;
1014         int                     len;
1015
1016         while (MORE() && !SEETWO(endc, ']'))
1017                 NEXT();
1018         if (!MORE())
1019         {
1020                 SETERROR(REG_EBRACK);
1021                 return 0;
1022         }
1023         len = p->next - sp;
1024         for (cp = cnames; cp->name != NULL; cp++)
1025 #ifdef MULTIBYTE
1026                 if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
1027 #else
1028                 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
1029 #endif
1030                         return cp->code;        /* known name */
1031         if (len == 1)
1032                 return *sp;                             /* single character */
1033         SETERROR(REG_ECOLLATE);         /* neither */
1034         return 0;
1035 }
1036
1037 /*
1038  - othercase - return the case counterpart of an alphabetic
1039  == static char othercase(int ch);
1040  */
1041 #ifdef MULTIBYTE
1042 static unsigned char                    /* if no counterpart, return ch */
1043 #else
1044 static char                                             /* if no counterpart, return ch */
1045 #endif
1046 othercase(ch)
1047 int                     ch;
1048 {
1049         assert(pg_isalpha(ch));
1050         if (pg_isupper(ch))
1051 #ifdef MULTIBYTE
1052                 return (unsigned char) tolower(ch);
1053 #else
1054                 return tolower(ch);
1055 #endif
1056         else if (pg_islower(ch))
1057 #ifdef MULTIBYTE
1058                 return (unsigned char) toupper(ch);
1059 #else
1060                 return toupper(ch);
1061 #endif
1062         else
1063 /* peculiar, but could happen */
1064 #ifdef MULTIBYTE
1065                 return (unsigned char) ch;
1066 #else
1067                 return ch;
1068 #endif
1069 }
1070
1071 /*
1072  - bothcases - emit a dualcase version of a two-case character
1073  == static void bothcases(struct parse *p, int ch);
1074  *
1075  * Boy, is this implementation ever a kludge...
1076  */
1077 static void
1078 bothcases(p, ch)
1079 struct parse *p;
1080 int                     ch;
1081 {
1082         pg_wchar   *oldnext = p->next;
1083         pg_wchar   *oldend = p->end;
1084         pg_wchar        bracket[3];
1085
1086         assert(othercase(ch) != ch);/* p_bracket() would recurse */
1087         p->next = bracket;
1088         p->end = bracket + 2;
1089         bracket[0] = ch;
1090         bracket[1] = ']';
1091         bracket[2] = '\0';
1092         p_bracket(p);
1093         assert(p->next == bracket + 2);
1094         p->next = oldnext;
1095         p->end = oldend;
1096 }
1097
1098 /*
1099  - ordinary - emit an ordinary character
1100  == static void ordinary(struct parse *p, int ch);
1101  */
1102 static void
1103 ordinary(p, ch)
1104 struct parse *p;
1105 int                     ch;
1106 {
1107         cat_t      *cap = p->g->categories;
1108
1109         if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch)
1110                 bothcases(p, ch);
1111         else
1112         {
1113 #ifdef MULTIBYTE
1114                 EMIT(OCHAR, (pg_wchar) ch);
1115 #else
1116                 EMIT(OCHAR, (unsigned char) ch);
1117 #endif
1118                 if (ch >= CHAR_MIN && ch <= CHAR_MAX && cap[ch] == 0)
1119                         cap[ch] = p->g->ncategories++;
1120         }
1121 }
1122
1123 /*
1124  - nonnewline - emit REG_NEWLINE version of OANY
1125  == static void nonnewline(struct parse *p);
1126  *
1127  * Boy, is this implementation ever a kludge...
1128  */
1129 static void
1130 nonnewline(p)
1131 struct parse *p;
1132 {
1133         pg_wchar   *oldnext = p->next;
1134         pg_wchar   *oldend = p->end;
1135         pg_wchar        bracket[4];
1136
1137         p->next = bracket;
1138         p->end = bracket + 3;
1139         bracket[0] = '^';
1140         bracket[1] = '\n';
1141         bracket[2] = ']';
1142         bracket[3] = '\0';
1143         p_bracket(p);
1144         assert(p->next == bracket + 3);
1145         p->next = oldnext;
1146         p->end = oldend;
1147 }
1148
1149 /*
1150  - repeat - generate code for a bounded repetition, recursively if needed
1151  == static void repeat(struct parse *p, sopno start, int from, int to);
1152  */
1153 static void
1154 repeat(p, start, from, to)
1155 struct parse *p;
1156 sopno           start;                          /* operand from here to end of strip */
1157 int                     from;                           /* repeated from this number */
1158 int                     to;                                     /* to this number of times (maybe
1159                                                                  * INFINITY) */
1160 {
1161         sopno           finish = HERE();
1162
1163 #define  N               2
1164 #define  INF     3
1165 #define  REP(f, t)               ((f)*8 + (t))
1166 #define  MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1167         sopno           copy;
1168
1169         if (p->error != 0)                      /* head off possible runaway recursion */
1170                 return;
1171
1172         assert(from <= to);
1173
1174         switch (REP(MAP(from), MAP(to)))
1175         {
1176                 case REP(0, 0): /* must be user doing this */
1177                         DROP(finish - start);           /* drop the operand */
1178                         break;
1179                 case REP(0, 1): /* as x{1,1}? */
1180                 case REP(0, N): /* as x{1,n}? */
1181                 case REP(0, INF):               /* as x{1,}? */
1182                         /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1183                         INSERT(OCH_, start);/* offset is wrong... */
1184                         repeat(p, start + 1, 1, to);
1185                         ASTERN(OOR1, start);
1186                         AHEAD(start);           /* ... fix it */
1187                         EMIT(OOR2, 0);
1188                         AHEAD(THERE());
1189                         ASTERN(O_CH, THERETHERE());
1190                         break;
1191                 case REP(1, 1): /* trivial case */
1192                         /* done */
1193                         break;
1194                 case REP(1, N): /* as x?x{1,n-1} */
1195                         /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1196                         INSERT(OCH_, start);
1197                         ASTERN(OOR1, start);
1198                         AHEAD(start);
1199                         EMIT(OOR2, 0);          /* offset very wrong... */
1200                         AHEAD(THERE());         /* ...so fix it */
1201                         ASTERN(O_CH, THERETHERE());
1202                         copy = dupl(p, start + 1, finish + 1);
1203                         assert(copy == finish + 4);
1204                         repeat(p, copy, 1, to - 1);
1205                         break;
1206                 case REP(1, INF):               /* as x+ */
1207                         INSERT(OPLUS_, start);
1208                         ASTERN(O_PLUS, start);
1209                         break;
1210                 case REP(N, N): /* as xx{m-1,n-1} */
1211                         copy = dupl(p, start, finish);
1212                         repeat(p, copy, from - 1, to - 1);
1213                         break;
1214                 case REP(N, INF):               /* as xx{n-1,INF} */
1215                         copy = dupl(p, start, finish);
1216                         repeat(p, copy, from - 1, to);
1217                         break;
1218                 default:                                /* "can't happen" */
1219                         SETERROR(REG_ASSERT);           /* just in case */
1220                         break;
1221         }
1222 }
1223
1224 /*
1225  - seterr - set an error condition
1226  == static int seterr(struct parse *p, int e);
1227  */
1228 static int                                              /* useless but makes type checking happy */
1229 seterr(p, e)
1230 struct parse *p;
1231 int                     e;
1232 {
1233         if (p->error == 0)                      /* keep earliest error condition */
1234                 p->error = e;
1235         p->next = nuls;                         /* try to bring things to a halt */
1236         p->end = nuls;
1237         return 0;                                       /* make the return value well-defined */
1238 }
1239
1240 /*
1241  - allocset - allocate a set of characters for []
1242  == static cset *allocset(struct parse *p);
1243  */
1244 static cset *
1245 allocset(p)
1246 struct parse *p;
1247 {
1248         int                     no = p->g->ncsets++;
1249         size_t          nc;
1250         size_t          nbytes;
1251         cset       *cs;
1252         size_t          css = (size_t) p->g->csetsize;
1253         int                     i;
1254
1255         if (no >= p->ncsalloc)
1256         {                                                       /* need another column of space */
1257                 p->ncsalloc += CHAR_BIT;
1258                 nc = p->ncsalloc;
1259                 assert(nc % CHAR_BIT == 0);
1260                 nbytes = nc / CHAR_BIT * css;
1261                 if (p->g->sets == NULL)
1262                         p->g->sets = (cset *) malloc(nc * sizeof(cset));
1263                 else
1264                         p->g->sets = (cset *) realloc((char *) p->g->sets,
1265                                                                                   nc * sizeof(cset));
1266                 if (p->g->setbits == NULL)
1267                         p->g->setbits = (uch *) malloc(nbytes);
1268                 else
1269                 {
1270                         p->g->setbits = (uch *) realloc((char *) p->g->setbits,
1271                                                                                         nbytes);
1272                         /* xxx this isn't right if setbits is now NULL */
1273                         for (i = 0; i < no; i++)
1274                                 p->g->sets[i].ptr = p->g->setbits + css * (i / CHAR_BIT);
1275                 }
1276                 if (p->g->sets != NULL && p->g->setbits != NULL)
1277                         memset((char *) p->g->setbits + (nbytes - css),
1278                                    0, css);
1279                 else
1280                 {
1281                         no = 0;
1282                         SETERROR(REG_ESPACE);
1283                         /* caller's responsibility not to do set ops */
1284                 }
1285         }
1286
1287         assert(p->g->sets != NULL); /* xxx */
1288         cs = &p->g->sets[no];
1289         cs->ptr = p->g->setbits + css * ((no) / CHAR_BIT);
1290         cs->mask = 1 << ((no) % CHAR_BIT);
1291         cs->hash = 0;
1292         cs->smultis = 0;
1293         cs->multis = NULL;
1294
1295         return cs;
1296 }
1297
1298 /*
1299  - freeset - free a now-unused set
1300  == static void freeset(struct parse *p, cset *cs);
1301  */
1302 static void
1303 freeset(p, cs)
1304 struct parse *p;
1305 cset       *cs;
1306 {
1307         int                     i;
1308         cset       *top = &p->g->sets[p->g->ncsets];
1309         size_t          css = (size_t) p->g->csetsize;
1310
1311         for (i = 0; i < css; i++)
1312                 CHsub(cs, i);
1313         if (cs == top - 1)                      /* recover only the easy case */
1314                 p->g->ncsets--;
1315 }
1316
1317 /*
1318  - freezeset - final processing on a set of characters
1319  == static int freezeset(struct parse *p, cset *cs);
1320  *
1321  * The main task here is merging identical sets.  This is usually a waste
1322  * of time (although the hash code minimizes the overhead), but can win
1323  * big if REG_ICASE is being used.      REG_ICASE, by the way, is why the hash
1324  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1325  * the same value!
1326  */
1327 static int                                              /* set number */
1328 freezeset(p, cs)
1329 struct parse *p;
1330 cset       *cs;
1331 {
1332         uch                     h = cs->hash;
1333         int                     i;
1334         cset       *top = &p->g->sets[p->g->ncsets];
1335         cset       *cs2;
1336         size_t          css = (size_t) p->g->csetsize;
1337
1338         /* look for an earlier one which is the same */
1339         for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1340                 if (cs2->hash == h && cs2 != cs)
1341                 {
1342                         /* maybe */
1343                         for (i = 0; i < css; i++)
1344                                 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1345                                         break;          /* no */
1346                         if (i == css)
1347                                 break;                  /* yes */
1348                 }
1349
1350         if (cs2 < top)
1351         {                                                       /* found one */
1352                 freeset(p, cs);
1353                 cs = cs2;
1354         }
1355
1356         return (int) (cs - p->g->sets);
1357 }
1358
1359 /*
1360  - firstch - return first character in a set (which must have at least one)
1361  == static int firstch(struct parse *p, cset *cs);
1362  */
1363 static int                                              /* character; there is no "none" value */
1364 firstch(p, cs)
1365 struct parse *p;
1366 cset       *cs;
1367 {
1368         int                     i;
1369         size_t          css = (size_t) p->g->csetsize;
1370
1371         for (i = 0; i < css; i++)
1372                 if (CHIN(cs, i))
1373                         return i;
1374         assert(never);
1375         return 0;                                       /* arbitrary */
1376 }
1377
1378 /*
1379  - nch - number of characters in a set
1380  == static int nch(struct parse *p, cset *cs);
1381  */
1382 static int
1383 nch(p, cs)
1384 struct parse *p;
1385 cset       *cs;
1386 {
1387         int                     i;
1388         size_t          css = (size_t) p->g->csetsize;
1389         int                     n = 0;
1390
1391         for (i = 0; i < css; i++)
1392                 if (CHIN(cs, i))
1393                         n++;
1394         return n;
1395 }
1396
1397 /*
1398  - mcadd - add a collating element to a cset
1399  == static void mcadd(struct parse *p, cset *cs, \
1400  ==             char *cp);
1401  */
1402 static void
1403 mcadd(p, cs, cp)
1404 struct parse *p;
1405 cset       *cs;
1406 char       *cp;
1407 {
1408         size_t          oldend = cs->smultis;
1409
1410         cs->smultis += strlen(cp) + 1;
1411         if (cs->multis == NULL)
1412                 cs->multis = malloc(cs->smultis);
1413         else
1414                 cs->multis = realloc(cs->multis, cs->smultis);
1415         if (cs->multis == NULL)
1416         {
1417                 SETERROR(REG_ESPACE);
1418                 return;
1419         }
1420
1421         strcpy(cs->multis + oldend - 1, cp);
1422         cs->multis[cs->smultis - 1] = '\0';
1423 }
1424
1425 /*
1426  - mcsub - subtract a collating element from a cset
1427  == static void mcsub(cset *cs, char *cp);
1428  */
1429 /*
1430 static void
1431 mcsub(cs, cp)
1432 cset *cs;
1433 char *cp;
1434 {
1435                 char *fp = mcfind(cs, cp);
1436                 size_t len = strlen(fp);
1437
1438                 assert(fp != NULL);
1439                 memmove(fp, fp + len + 1,
1440                                                                 cs->smultis - (fp + len + 1 - cs->multis));
1441                 cs->smultis -= len;
1442
1443                 if (cs->smultis == 0) {
1444                                 free(cs->multis);
1445                                 cs->multis = NULL;
1446                                 return;
1447                 }
1448
1449                 cs->multis = realloc(cs->multis, cs->smultis);
1450                 assert(cs->multis != NULL);
1451 }
1452 */
1453
1454 /*
1455  - mcin - is a collating element in a cset?
1456  == static int mcin(cset *cs, char *cp);
1457  */
1458 /*
1459 static int
1460 mcin(cs, cp)
1461 cset *cs;
1462 char *cp;
1463 {
1464                 return(mcfind(cs, cp) != NULL);
1465 }
1466 */
1467
1468 /*
1469  - mcfind - find a collating element in a cset
1470  == static char *mcfind(cset *cs, char *cp);
1471  */
1472 /*
1473 static char *
1474 mcfind(cs, cp)
1475 cset *cs;
1476 char *cp;
1477 {
1478                 char *p;
1479
1480                 if (cs->multis == NULL)
1481                                 return(NULL);
1482                 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1483                                 if (strcmp(cp, p) == 0)
1484                                                 return(p);
1485                 return(NULL);
1486 }
1487 */
1488 /*
1489  - mcinvert - invert the list of collating elements in a cset
1490  == static void mcinvert(struct parse *p, cset *cs);
1491  *
1492  * This would have to know the set of possibilities.  Implementation
1493  * is deferred.
1494  */
1495 static void
1496 mcinvert(p, cs)
1497 struct parse *p;
1498 cset       *cs;
1499 {
1500         assert(cs->multis == NULL); /* xxx */
1501 }
1502
1503 /*
1504  - mccase - add case counterparts of the list of collating elements in a cset
1505  == static void mccase(struct parse *p, cset *cs);
1506  *
1507  * This would have to know the set of possibilities.  Implementation
1508  * is deferred.
1509  */
1510 static void
1511 mccase(p, cs)
1512 struct parse *p;
1513 cset       *cs;
1514 {
1515         assert(cs->multis == NULL); /* xxx */
1516 }
1517
1518 /*
1519  - isinsets - is this character in any sets?
1520  == static int isinsets(struct re_guts *g, int c);
1521  */
1522 static int                                              /* predicate */
1523 isinsets(g, c)
1524 struct re_guts *g;
1525 int                     c;
1526 {
1527         uch                *col;
1528         int                     i;
1529         int                     ncols = (g->ncsets + (CHAR_BIT - 1)) / CHAR_BIT;
1530         unsigned        uc = (unsigned char) c;
1531
1532         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1533                 if (col[uc] != 0)
1534                         return 1;
1535         return 0;
1536 }
1537
1538 /*
1539  - samesets - are these two characters in exactly the same sets?
1540  == static int samesets(struct re_guts *g, int c1, int c2);
1541  */
1542 static int                                              /* predicate */
1543 samesets(g, c1, c2)
1544 struct re_guts *g;
1545 int                     c1;
1546 int                     c2;
1547 {
1548         uch                *col;
1549         int                     i;
1550         int                     ncols = (g->ncsets + (CHAR_BIT - 1)) / CHAR_BIT;
1551         unsigned        uc1 = (unsigned char) c1;
1552         unsigned        uc2 = (unsigned char) c2;
1553
1554         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1555                 if (col[uc1] != col[uc2])
1556                         return 0;
1557         return 1;
1558 }
1559
1560 /*
1561  - categorize - sort out character categories
1562  == static void categorize(struct parse *p, struct re_guts *g);
1563  */
1564 static void
1565 categorize(p, g)
1566 struct parse *p;
1567 struct re_guts *g;
1568 {
1569         cat_t      *cats = g->categories;
1570         int                     c;
1571         int                     c2;
1572         cat_t           cat;
1573
1574         /* avoid making error situations worse */
1575         if (p->error != 0)
1576                 return;
1577
1578         for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1579                 if (cats[c] == 0 && isinsets(g, c))
1580                 {
1581                         cat = g->ncategories++;
1582                         cats[c] = cat;
1583                         for (c2 = c + 1; c2 <= CHAR_MAX; c2++)
1584                                 if (cats[c2] == 0 && samesets(g, c, c2))
1585                                         cats[c2] = cat;
1586                 }
1587 }
1588
1589 /*
1590  - dupl - emit a duplicate of a bunch of sops
1591  == static sopno dupl(struct parse *p, sopno start, sopno finish);
1592  */
1593 static sopno                                    /* start of duplicate */
1594 dupl(p, start, finish)
1595 struct parse *p;
1596 sopno           start;                          /* from here */
1597 sopno           finish;                         /* to this less one */
1598 {
1599         sopno           ret = HERE();
1600         sopno           len = finish - start;
1601
1602         assert(finish >= start);
1603         if (len == 0)
1604                 return ret;
1605         enlarge(p, p->ssize + len); /* this many unexpected additions */
1606         assert(p->ssize >= p->slen + len);
1607         memcpy((char *) (p->strip + p->slen),
1608                    (char *) (p->strip + start), (size_t) len * sizeof(sop));
1609         p->slen += len;
1610         return ret;
1611 }
1612
1613 /*
1614  - doemit - emit a strip operator
1615  == static void doemit(struct parse *p, sop op, size_t opnd);
1616  *
1617  * It might seem better to implement this as a macro with a function as
1618  * hard-case backup, but it's just too big and messy unless there are
1619  * some changes to the data structures.  Maybe later.
1620  */
1621 static void
1622 doemit(p, op, opnd)
1623 struct parse *p;
1624 sop                     op;
1625 size_t          opnd;
1626 {
1627         /* avoid making error situations worse */
1628         if (p->error != 0)
1629                 return;
1630
1631         /* deal with oversize operands ("can't happen", more or less) */
1632         assert(opnd < 1 << OPSHIFT);
1633
1634         /* deal with undersized strip */
1635         if (p->slen >= p->ssize)
1636                 enlarge(p, (p->ssize + 1) / 2 * 3);             /* +50% */
1637         assert(p->slen < p->ssize);
1638
1639         /* finally, it's all reduced to the easy case */
1640         p->strip[p->slen++] = SOP(op, opnd);
1641 }
1642
1643 /*
1644  - doinsert - insert a sop into the strip
1645  == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1646  */
1647 static void
1648 doinsert(p, op, opnd, pos)
1649 struct parse *p;
1650 sop                     op;
1651 size_t          opnd;
1652 sopno           pos;
1653 {
1654         sopno           sn;
1655         sop                     s;
1656         int                     i;
1657
1658         /* avoid making error situations worse */
1659         if (p->error != 0)
1660                 return;
1661
1662         sn = HERE();
1663         EMIT(op, opnd);                         /* do checks, ensure space */
1664         assert(HERE() == sn + 1);
1665         s = p->strip[sn];
1666
1667         /* adjust paren pointers */
1668         assert(pos > 0);
1669         for (i = 1; i < NPAREN; i++)
1670         {
1671                 if (p->pbegin[i] >= pos)
1672                         p->pbegin[i]++;
1673                 if (p->pend[i] >= pos)
1674                         p->pend[i]++;
1675         }
1676
1677         memmove((char *) &p->strip[pos + 1], (char *) &p->strip[pos],
1678                         (HERE() - pos - 1) * sizeof(sop));
1679         p->strip[pos] = s;
1680 }
1681
1682 /*
1683  - dofwd - complete a forward reference
1684  == static void dofwd(struct parse *p, sopno pos, sop value);
1685  */
1686 static void
1687 dofwd(p, pos, value)
1688 struct parse *p;
1689 sopno           pos;
1690 sop                     value;
1691 {
1692         /* avoid making error situations worse */
1693         if (p->error != 0)
1694                 return;
1695
1696         assert(value < 1 << OPSHIFT);
1697         p->strip[pos] = OP(p->strip[pos]) | value;
1698 }
1699
1700 /*
1701  - enlarge - enlarge the strip
1702  == static void enlarge(struct parse *p, sopno size);
1703  */
1704 static void
1705 enlarge(p, size)
1706 struct parse *p;
1707 sopno           size;
1708 {
1709         sop                *sp;
1710
1711         if (p->ssize >= size)
1712                 return;
1713
1714         sp = (sop *) realloc(p->strip, size * sizeof(sop));
1715         if (sp == NULL)
1716         {
1717                 SETERROR(REG_ESPACE);
1718                 return;
1719         }
1720         p->strip = sp;
1721         p->ssize = size;
1722 }
1723
1724 /*
1725  - stripsnug - compact the strip
1726  == static void stripsnug(struct parse *p, struct re_guts *g);
1727  */
1728 static void
1729 stripsnug(p, g)
1730 struct parse *p;
1731 struct re_guts *g;
1732 {
1733         g->nstates = p->slen;
1734         g->strip = (sop *) realloc((char *) p->strip, p->slen * sizeof(sop));
1735         if (g->strip == NULL)
1736         {
1737                 SETERROR(REG_ESPACE);
1738                 g->strip = p->strip;
1739         }
1740 }
1741
1742 /*
1743  - findmust - fill in must and mlen with longest mandatory literal string
1744  == static void findmust(struct parse *p, struct re_guts *g);
1745  *
1746  * This algorithm could do fancy things like analyzing the operands of |
1747  * for common subsequences.  Someday.  This code is simple and finds most
1748  * of the interesting cases.
1749  *
1750  * Note that must and mlen got initialized during setup.
1751  */
1752 static void
1753 findmust(p, g)
1754 struct parse *p;
1755 struct re_guts *g;
1756 {
1757         sop                *scan;
1758         sop                *start = 0;
1759         sop                *newstart = 0;
1760         sopno           newlen;
1761         sop                     s;
1762         pg_wchar   *cp;
1763         sopno           i;
1764
1765         /* avoid making error situations worse */
1766         if (p->error != 0)
1767                 return;
1768
1769         /* find the longest OCHAR sequence in strip */
1770         newlen = 0;
1771         scan = g->strip + 1;
1772         do
1773         {
1774                 s = *scan++;
1775                 switch (OP(s))
1776                 {
1777                         case OCHAR: /* sequence member */
1778                                 if (newlen == 0)/* new sequence */
1779                                         newstart = scan - 1;
1780                                 newlen++;
1781                                 break;
1782                         case OPLUS_:            /* things that don't break one */
1783                         case OLPAREN:
1784                         case ORPAREN:
1785                                 break;
1786                         case OQUEST_:           /* things that must be skipped */
1787                         case OCH_:
1788                                 scan--;
1789                                 do
1790                                 {
1791                                         scan += OPND(s);
1792                                         s = *scan;
1793                                         /* assert() interferes w debug printouts */
1794                                         if (OP(s) != O_QUEST && OP(s) != O_CH &&
1795                                                 OP(s) != OOR2)
1796                                         {
1797                                                 g->iflags |= BAD;
1798                                                 return;
1799                                         }
1800                                 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1801                                 /* fallthrough */
1802                         default:                        /* things that break a sequence */
1803                                 if (newlen > g->mlen)
1804                                 {                               /* ends one */
1805                                         start = newstart;
1806                                         g->mlen = newlen;
1807                                 }
1808                                 newlen = 0;
1809                                 break;
1810                 }
1811         } while (OP(s) != OEND);
1812
1813         if (g->mlen == 0)                       /* there isn't one */
1814                 return;
1815
1816         /* turn it into a character string */
1817 #ifdef MULTIBYTE
1818         g->must = (pg_wchar *) malloc((size_t) (g->mlen + 1) * sizeof(pg_wchar));
1819 #else
1820         g->must = malloc((size_t) g->mlen + 1);
1821 #endif
1822         if (g->must == NULL)
1823         {                                                       /* argh; just forget it */
1824                 g->mlen = 0;
1825                 return;
1826         }
1827         cp = g->must;
1828         scan = start;
1829         for (i = g->mlen; i > 0; i--)
1830         {
1831                 while (OP(s = *scan++) != OCHAR)
1832                         continue;
1833                 assert(cp < g->must + g->mlen);
1834                 *cp++ = (pg_wchar) OPND(s);
1835         }
1836         assert(cp == g->must + g->mlen);
1837         *cp++ = '\0';                           /* just on general principles */
1838 }
1839
1840 /*
1841  - pluscount - count + nesting
1842  == static sopno pluscount(struct parse *p, struct re_guts *g);
1843  */
1844 static sopno                                    /* nesting depth */
1845 pluscount(p, g)
1846 struct parse *p;
1847 struct re_guts *g;
1848 {
1849         sop                *scan;
1850         sop                     s;
1851         sopno           plusnest = 0;
1852         sopno           maxnest = 0;
1853
1854         if (p->error != 0)
1855                 return 0;                               /* there may not be an OEND */
1856
1857         scan = g->strip + 1;
1858         do
1859         {
1860                 s = *scan++;
1861                 switch (OP(s))
1862                 {
1863                         case OPLUS_:
1864                                 plusnest++;
1865                                 break;
1866                         case O_PLUS:
1867                                 if (plusnest > maxnest)
1868                                         maxnest = plusnest;
1869                                 plusnest--;
1870                                 break;
1871                 }
1872         } while (OP(s) != OEND);
1873         if (plusnest != 0)
1874                 g->iflags |= BAD;
1875         return maxnest;
1876 }
1877
1878 /*
1879  * some ctype functions with none-ascii-char guard
1880  */
1881 static int
1882 pg_isdigit(int c)
1883 {
1884 #ifdef MULTIBYTE
1885         return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
1886 #else
1887         return (isdigit(c));
1888 #endif
1889 }
1890
1891 static int
1892 pg_isalpha(int c)
1893 {
1894 #ifdef MULTIBYTE
1895         return (c >= 0 && c <= UCHAR_MAX && isalpha(c));
1896 #else
1897         return (isalpha(c));
1898 #endif
1899 }
1900
1901 static int
1902 pg_isupper(int c)
1903 {
1904 #ifdef MULTIBYTE
1905         return (c >= 0 && c <= UCHAR_MAX && isupper(c));
1906 #else
1907         return (isupper(c));
1908 #endif
1909 }
1910
1911 static int
1912 pg_islower(int c)
1913 {
1914 #ifdef MULTIBYTE
1915         return (c >= 0 && c <= UCHAR_MAX && islower(c));
1916 #else
1917         return (islower(c));
1918 #endif
1919 }