summaryrefslogtreecommitdiffstats
path: root/src/RESearch.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/RESearch.cpp')
-rwxr-xr-xsrc/RESearch.cpp788
1 files changed, 788 insertions, 0 deletions
diff --git a/src/RESearch.cpp b/src/RESearch.cpp
new file mode 100755
index 0000000..00e1830
--- /dev/null
+++ b/src/RESearch.cpp
@@ -0,0 +1,788 @@
+// Scintilla source code edit control
+/** @file RESearch.cxx
+ ** Regular expression search library.
+ **/
+
+/*
+ * regex - Regular expression pattern matching and replacement
+ *
+ * By: Ozan S. Yigit (oz)
+ * Dept. of Computer Science
+ * York University
+ *
+ * Original code available from http://www.cs.yorku.ca/~oz/
+ * Translation to C++ by Neil Hodgson neilh@scintilla.org
+ * Removed all use of register.
+ * Converted to modern function prototypes.
+ * Put all global/static variables into an object so this code can be
+ * used from multiple threads, etc.
+ *
+ * These routines are the PUBLIC DOMAIN equivalents of regex
+ * routines as found in 4.nBSD UN*X, with minor extensions.
+ *
+ * These routines are derived from various implementations found
+ * in software tools books, and Conroy's grep. They are NOT derived
+ * from licensed/restricted software.
+ * For more interesting/academic/complicated implementations,
+ * see Henry Spencer's regexp routines, or GNU Emacs pattern
+ * matching module.
+ *
+ * Modification history removed.
+ *
+ * Interfaces:
+ * RESearch::Compile: compile a regular expression into a NFA.
+ *
+ * const char *RESearch::Compile(const char *pat, int length,
+ * bool caseSensitive, bool posix)
+ *
+ * Returns a short error string if they fail.
+ *
+ * RESearch::Execute: execute the NFA to match a pattern.
+ *
+ * int RESearch::Execute(characterIndexer &ci, int lp, int endp)
+ *
+ * RESearch::Substitute: substitute the matched portions in a new string.
+ *
+ * int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst)
+ *
+ * re_fail: failure routine for RESearch::Execute. (no longer used)
+ *
+ * void re_fail(char *msg, char op)
+ *
+ * Regular Expressions:
+ *
+ * [1] char matches itself, unless it is a special
+ * character (metachar): . \ [ ] * + ^ $
+ * and ( ) if posix option.
+ *
+ * [2] . matches any character.
+ *
+ * [3] \ matches the character following it, except:
+ * - \a, \b, \f, \n, \t, \v match the
+ * corresponding C escape char;
+ * - if not in posix mode, when followed by a
+ * left or right round bracket (see [7]);
+ * - when followed by a digit 1 to 9 (see [8]);
+ * - when followed by a left or right angle bracket
+ * (see [9]).
+ * It is used as an escape character for all
+ * other meta-characters, and itself. When used
+ * in a set ([4]), it is treated as an ordinary
+ * character (except for escape chars).
+ *
+ * [4] [set] matches one of the characters in the set.
+ * If the first character in the set is "^",
+ * it matches a character NOT in the set, i.e.
+ * complements the set. A shorthand S-E (start-end)
+ * is used to specify a set of characters S upto
+ * E, inclusive. The special characters "]" and
+ * "-" have no special meaning if they appear
+ * as the first chars in the set. To include both,
+ * put - first: [-]A-Z]:
+ * [-]|] matches these 2 chars,
+ * []-|] matches from ] to | chars.
+ * examples: match:
+ *
+ * [a-z] any lowercase alpha
+ *
+ * [^-]] any char except - and ]
+ *
+ * [^A-Z] any char except uppercase
+ * alpha
+ *
+ * [a-zA-Z] any alpha
+ *
+ * [5] * any regular expression form [1] to [4], followed by
+ * closure char (*) matches zero or more matches of
+ * that form.
+ *
+ * [6] + same as [5], except it matches one or more.
+ *
+ * [7] a regular expression in the form [1] to [10], enclosed
+ * as \(form\) (or (form) with posix flag) matches what
+ * form matches. The enclosure creates a set of tags,
+ * used for [8] and for pattern substitution.
+ * The tagged forms are numbered starting from 1.
+ *
+ * [8] a \ followed by a digit 1 to 9 matches whatever a
+ * previously tagged regular expression ([7]) matched.
+ *
+ * [9] \< a regular expression starting with a \< construct
+ * \> and/or ending with a \> construct, restricts the
+ * pattern matching to the beginning of a word, and/or
+ * the end of a word. A word is defined to be a character
+ * string beginning and/or ending with the characters
+ * A-Z a-z 0-9 and _. It must also be preceded and/or
+ * followed by any character outside those mentioned.
+ *
+ * [10] a composite regular expression xy where x and y
+ * are in the form [1] to [10] matches the longest
+ * match of x followed by a match for y.
+ *
+ * [11] ^ a regular expression starting with a ^ character
+ * $ and/or ending with a $ character, restricts the
+ * pattern matching to the beginning of the line,
+ * or the end of line. [anchors] Elsewhere in the
+ * pattern, ^ and $ are treated as ordinary characters.
+ *
+ *
+ * Acknowledgements:
+ *
+ * HCR's Hugh Redelmeier has been most helpful in various
+ * stages of development. He convinced me to include BOW
+ * and EOW constructs, originally invented by Rob Pike at
+ * the University of Toronto.
+ *
+ * References:
+ * Software tools Kernighan & Plauger
+ * Software tools in Pascal Kernighan & Plauger
+ * Grep [rsx-11 C dist] David Conroy
+ * ed - text editor Un*x Programmer's Manual
+ * Advanced editing on Un*x B. W. Kernighan
+ * RegExp routines Henry Spencer
+ *
+ * Notes:
+ *
+ * This implementation uses a bit-set representation for character
+ * classes for speed and compactness. Each character is represented
+ * by one bit in a 256-bit block. Thus, CCL always takes a
+ * constant 32 bytes in the internal nfa, and RESearch::Execute does a single
+ * bit comparison to locate the character in the set.
+ *
+ * Examples:
+ *
+ * pattern: foo*.*
+ * compile: CHR f CHR o CLO CHR o END CLO ANY END END
+ * matches: fo foo fooo foobar fobar foxx ...
+ *
+ * pattern: fo[ob]a[rz]
+ * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
+ * matches: fobar fooar fobaz fooaz
+ *
+ * pattern: foo\\+
+ * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
+ * matches: foo\ foo\\ foo\\\ ...
+ *
+ * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
+ * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
+ * matches: foo1foo foo2foo foo3foo
+ *
+ * pattern: \(fo.*\)-\1
+ * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
+ * matches: foo-foo fo-fo fob-fob foobar-foobar ...
+ */
+
+#include "CharClassify.h"
+#include "RESearch.h"
+
+// Shut up annoying Visual C++ warnings:
+#ifdef _MSC_VER
+#pragma warning(disable: 4514)
+#endif
+
+#define OKP 1
+#define NOP 0
+
+#define CHR 1
+#define ANY 2
+#define CCL 3
+#define BOL 4
+#define EOL 5
+#define BOT 6
+#define EOT 7
+#define BOW 8
+#define EOW 9
+#define REF 10
+#define CLO 11
+
+#define END 0
+
+/*
+ * The following defines are not meant to be changeable.
+ * They are for readability only.
+ */
+#define BLKIND 0370
+#define BITIND 07
+
+const char bitarr[] = {1,2,4,8,16,32,64,'\200'};
+
+#define badpat(x) (*nfa = END, x)
+
+/*
+ * Character classification table for word boundary operators BOW
+ * and EOW is passed in by the creator of this object (Scintilla
+ * Document). The Document default state is that word chars are:
+ * 0-9,a-z, A-Z and _
+ */
+
+RESearch::RESearch(CharClassify *charClassTable) {
+ charClass = charClassTable;
+ Init();
+}
+
+RESearch::~RESearch() {
+ Clear();
+}
+
+void RESearch::Init() {
+ sta = NOP; /* status of lastpat */
+ bol = 0;
+ for (int i=0; i<MAXTAG; i++)
+ pat[i] = 0;
+ for (int j=0; j<BITBLK; j++)
+ bittab[j] = 0;
+}
+
+void RESearch::Clear() {
+ for (int i=0; i<MAXTAG; i++) {
+ delete []pat[i];
+ pat[i] = 0;
+ bopat[i] = NOTFOUND;
+ eopat[i] = NOTFOUND;
+ }
+}
+
+bool RESearch::GrabMatches(CharacterIndexer &ci) {
+ bool success = true;
+ for (unsigned int i=0; i<MAXTAG; i++) {
+ if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
+ unsigned int len = eopat[i] - bopat[i];
+ pat[i] = new char[len + 1];
+ if (pat[i]) {
+ for (unsigned int j=0; j<len; j++)
+ pat[i][j] = ci.CharAt(bopat[i] + j);
+ pat[i][len] = '\0';
+ } else {
+ success = false;
+ }
+ }
+ }
+ return success;
+}
+
+void RESearch::ChSet(char c) {
+ bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
+}
+
+void RESearch::ChSetWithCase(char c, bool caseSensitive) {
+ if (caseSensitive) {
+ ChSet(c);
+ } else {
+ if ((c >= 'a') && (c <= 'z')) {
+ ChSet(c);
+ ChSet(static_cast<char>(c - 'a' + 'A'));
+ } else if ((c >= 'A') && (c <= 'Z')) {
+ ChSet(c);
+ ChSet(static_cast<char>(c - 'A' + 'a'));
+ } else {
+ ChSet(c);
+ }
+ }
+}
+
+const char escapeValue(char ch) {
+ switch (ch) {
+ case 'a': return '\a';
+ case 'b': return '\b';
+ case 'f': return '\f';
+ case 'n': return '\n';
+ case 'r': return '\r';
+ case 't': return '\t';
+ case 'v': return '\v';
+ }
+ return 0;
+}
+
+const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, bool posix) {
+ char *mp=nfa; /* nfa pointer */
+ char *lp; /* saved pointer */
+ char *sp=nfa; /* another one */
+ char *mpMax = mp + MAXNFA - BITBLK - 10;
+
+ int tagi = 0; /* tag stack index */
+ int tagc = 1; /* actual tag count */
+
+ int n;
+ char mask; /* xor mask -CCL/NCL */
+ int c1, c2;
+
+ if (!pat || !length)
+ if (sta)
+ return 0;
+ else
+ return badpat("No previous regular expression");
+ sta = NOP;
+
+ const char *p=pat; /* pattern pointer */
+ for (int i=0; i<length; i++, p++) {
+ if (mp > mpMax)
+ return badpat("Pattern too long");
+ lp = mp;
+ switch(*p) {
+
+ case '.': /* match any char */
+ *mp++ = ANY;
+ break;
+
+ case '^': /* match beginning */
+ if (p == pat)
+ *mp++ = BOL;
+ else {
+ *mp++ = CHR;
+ *mp++ = *p;
+ }
+ break;
+
+ case '$': /* match endofline */
+ if (!*(p+1))
+ *mp++ = EOL;
+ else {
+ *mp++ = CHR;
+ *mp++ = *p;
+ }
+ break;
+
+ case '[': /* match char class */
+ *mp++ = CCL;
+
+ i++;
+ if (*++p == '^') {
+ mask = '\377';
+ i++;
+ p++;
+ } else
+ mask = 0;
+
+ if (*p == '-') { /* real dash */
+ i++;
+ ChSet(*p++);
+ }
+ if (*p == ']') { /* real brace */
+ i++;
+ ChSet(*p++);
+ }
+ while (*p && *p != ']') {
+ if (*p == '-' && *(p+1) && *(p+1) != ']') {
+ i++;
+ p++;
+ c1 = *(p-2) + 1;
+ i++;
+ c2 = *p++;
+ while (c1 <= c2) {
+ ChSetWithCase(static_cast<char>(c1++), caseSensitive);
+ }
+ } else if (*p == '\\' && *(p+1)) {
+ i++;
+ p++;
+ char escape = escapeValue(*p);
+ if (escape)
+ ChSetWithCase(escape, caseSensitive);
+ else
+ ChSetWithCase(*p, caseSensitive);
+ i++;
+ p++;
+ } else {
+ i++;
+ ChSetWithCase(*p++, caseSensitive);
+ }
+ }
+ if (!*p)
+ return badpat("Missing ]");
+
+ for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
+ *mp++ = static_cast<char>(mask ^ bittab[n]);
+
+ break;
+
+ case '*': /* match 0 or more... */
+ case '+': /* match 1 or more... */
+ if (p == pat)
+ return badpat("Empty closure");
+ lp = sp; /* previous opcode */
+ if (*lp == CLO) /* equivalence... */
+ break;
+ switch(*lp) {
+
+ case BOL:
+ case BOT:
+ case EOT:
+ case BOW:
+ case EOW:
+ case REF:
+ return badpat("Illegal closure");
+ default:
+ break;
+ }
+
+ if (*p == '+')
+ for (sp = mp; lp < sp; lp++)
+ *mp++ = *lp;
+
+ *mp++ = END;
+ *mp++ = END;
+ sp = mp;
+ while (--mp > lp)
+ *mp = mp[-1];
+ *mp = CLO;
+ mp = sp;
+ break;
+
+ case '\\': /* tags, backrefs... */
+ i++;
+ switch(*++p) {
+
+ case '<':
+ *mp++ = BOW;
+ break;
+ case '>':
+ if (*sp == BOW)
+ return badpat("Null pattern inside \\<\\>");
+ *mp++ = EOW;
+ break;
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ n = *p-'0';
+ if (tagi > 0 && tagstk[tagi] == n)
+ return badpat("Cyclical reference");
+ if (tagc > n) {
+ *mp++ = static_cast<char>(REF);
+ *mp++ = static_cast<char>(n);
+ }
+ else
+ return badpat("Undetermined reference");
+ break;
+ case 'a':
+ case 'b':
+ case 'n':
+ case 'f':
+ case 'r':
+ case 't':
+ case 'v':
+ *mp++ = CHR;
+ *mp++ = escapeValue(*p);
+ break;
+ default:
+ if (!posix && *p == '(') {
+ if (tagc < MAXTAG) {
+ tagstk[++tagi] = tagc;
+ *mp++ = BOT;
+ *mp++ = static_cast<char>(tagc++);
+ }
+ else
+ return badpat("Too many \\(\\) pairs");
+ } else if (!posix && *p == ')') {
+ if (*sp == BOT)
+ return badpat("Null pattern inside \\(\\)");
+ if (tagi > 0) {
+ *mp++ = static_cast<char>(EOT);
+ *mp++ = static_cast<char>(tagstk[tagi--]);
+ }
+ else
+ return badpat("Unmatched \\)");
+ } else {
+ *mp++ = CHR;
+ *mp++ = *p;
+ }
+ }
+ break;
+
+ default : /* an ordinary char */
+ if (posix && *p == '(') {
+ if (tagc < MAXTAG) {
+ tagstk[++tagi] = tagc;
+ *mp++ = BOT;
+ *mp++ = static_cast<char>(tagc++);
+ }
+ else
+ return badpat("Too many () pairs");
+ } else if (posix && *p == ')') {
+ if (*sp == BOT)
+ return badpat("Null pattern inside ()");
+ if (tagi > 0) {
+ *mp++ = static_cast<char>(EOT);
+ *mp++ = static_cast<char>(tagstk[tagi--]);
+ }
+ else
+ return badpat("Unmatched )");
+ } else if (caseSensitive) {
+ *mp++ = CHR;
+ *mp++ = *p;
+ } else {
+ *mp++ = CCL;
+ mask = 0;
+ ChSetWithCase(*p, false);
+ for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
+ *mp++ = static_cast<char>(mask ^ bittab[n]);
+ }
+ break;
+ }
+ sp = lp;
+ }
+ if (tagi > 0)
+ return badpat((posix ? "Unmatched (" : "Unmatched \\("));
+ *mp = END;
+ sta = OKP;
+ return 0;
+}
+
+/*
+ * RESearch::Execute:
+ * execute nfa to find a match.
+ *
+ * special cases: (nfa[0])
+ * BOL
+ * Match only once, starting from the
+ * beginning.
+ * CHR
+ * First locate the character without
+ * calling PMatch, and if found, call
+ * PMatch for the remaining string.
+ * END
+ * RESearch::Compile failed, poor luser did not
+ * check for it. Fail fast.
+ *
+ * If a match is found, bopat[0] and eopat[0] are set
+ * to the beginning and the end of the matched fragment,
+ * respectively.
+ *
+ */
+
+int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
+ char c;
+ int ep = NOTFOUND;
+ char *ap = nfa;
+
+ bol = lp;
+ failure = 0;
+
+ Clear();
+
+ switch(*ap) {
+
+ case BOL: /* anchored: match from BOL only */
+ ep = PMatch(ci, lp, endp, ap);
+ break;
+ case EOL: /* just searching for end of line normal path doesn't work */
+ if (*(ap+1) == END) {
+ lp = endp;
+ ep = lp;
+ break;
+ } else {
+ return 0;
+ }
+ case CHR: /* ordinary char: locate it fast */
+ c = *(ap+1);
+ while ((lp < endp) && (ci.CharAt(lp) != c))
+ lp++;
+ if (lp >= endp) /* if EOS, fail, else fall thru. */
+ return 0;
+ default: /* regular matching all the way. */
+ while (lp < endp) {
+ ep = PMatch(ci, lp, endp, ap);
+ if (ep != NOTFOUND)
+ break;
+ lp++;
+ }
+ break;
+ case END: /* munged automaton. fail always */
+ return 0;
+ }
+ if (ep == NOTFOUND)
+ return 0;
+
+ bopat[0] = lp;
+ eopat[0] = ep;
+ return 1;
+}
+
+/*
+ * PMatch: internal routine for the hard part
+ *
+ * This code is partly snarfed from an early grep written by
+ * David Conroy. The backref and tag stuff, and various other
+ * innovations are by oz.
+ *
+ * special case optimizations: (nfa[n], nfa[n+1])
+ * CLO ANY
+ * We KNOW .* will match everything upto the
+ * end of line. Thus, directly go to the end of
+ * line, without recursive PMatch calls. As in
+ * the other closure cases, the remaining pattern
+ * must be matched by moving backwards on the
+ * string recursively, to find a match for xy
+ * (x is ".*" and y is the remaining pattern)
+ * where the match satisfies the LONGEST match for
+ * x followed by a match for y.
+ * CLO CHR
+ * We can again scan the string forward for the
+ * single char and at the point of failure, we
+ * execute the remaining nfa recursively, same as
+ * above.
+ *
+ * At the end of a successful match, bopat[n] and eopat[n]
+ * are set to the beginning and end of subpatterns matched
+ * by tagged expressions (n = 1 to 9).
+ */
+
+extern void re_fail(char *,char);
+
+#define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
+
+/*
+ * skip values for CLO XXX to skip past the closure
+ */
+
+#define ANYSKIP 2 /* [CLO] ANY END */
+#define CHRSKIP 3 /* [CLO] CHR chr END */
+#define CCLSKIP 34 /* [CLO] CCL 32 bytes END */
+
+int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
+ int op, c, n;
+ int e; /* extra pointer for CLO */
+ int bp; /* beginning of subpat... */
+ int ep; /* ending of subpat... */
+ int are; /* to save the line ptr. */
+
+ while ((op = *ap++) != END)
+ switch(op) {
+
+ case CHR:
+ if (ci.CharAt(lp++) != *ap++)
+ return NOTFOUND;
+ break;
+ case ANY:
+ if (lp++ >= endp)
+ return NOTFOUND;
+ break;
+ case CCL:
+ c = ci.CharAt(lp++);
+ if (!isinset(ap,c))
+ return NOTFOUND;
+ ap += BITBLK;
+ break;
+ case BOL:
+ if (lp != bol)
+ return NOTFOUND;
+ break;
+ case EOL:
+ if (lp < endp)
+ return NOTFOUND;
+ break;
+ case BOT:
+ bopat[*ap++] = lp;
+ break;
+ case EOT:
+ eopat[*ap++] = lp;
+ break;
+ case BOW:
+ if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp)))
+ return NOTFOUND;
+ break;
+ case EOW:
+ if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
+ return NOTFOUND;
+ break;
+ case REF:
+ n = *ap++;
+ bp = bopat[n];
+ ep = eopat[n];
+ while (bp < ep)
+ if (ci.CharAt(bp++) != ci.CharAt(lp++))
+ return NOTFOUND;
+ break;
+ case CLO:
+ are = lp;
+ switch(*ap) {
+
+ case ANY:
+ while (lp < endp)
+ lp++;
+ n = ANYSKIP;
+ break;
+ case CHR:
+ c = *(ap+1);
+ while ((lp < endp) && (c == ci.CharAt(lp)))
+ lp++;
+ n = CHRSKIP;
+ break;
+ case CCL:
+ while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))
+ lp++;
+ n = CCLSKIP;
+ break;
+ default:
+ failure = true;
+ //re_fail("closure: bad nfa.", *ap);
+ return NOTFOUND;
+ }
+
+ ap += n;
+
+ while (lp >= are) {
+ if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)
+ return e;
+ --lp;
+ }
+ return NOTFOUND;
+ default:
+ //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
+ return NOTFOUND;
+ }
+ return lp;
+}
+
+/*
+ * RESearch::Substitute:
+ * substitute the matched portions of the src in dst.
+ *
+ * & substitute the entire matched pattern.
+ *
+ * \digit substitute a subpattern, with the given tag number.
+ * Tags are numbered from 1 to 9. If the particular
+ * tagged subpattern does not exist, null is substituted.
+ */
+int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
+ char c;
+ int pin;
+ int bp;
+ int ep;
+
+ if (!*src || !bopat[0])
+ return 0;
+
+ while ((c = *src++) != 0) {
+ switch(c) {
+
+ case '&':
+ pin = 0;
+ break;
+
+ case '\\':
+ c = *src++;
+ if (c >= '0' && c <= '9') {
+ pin = c - '0';
+ break;
+ }
+
+ default:
+ *dst++ = c;
+ continue;
+ }
+
+ if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) {
+ while (ci.CharAt(bp) && bp < ep)
+ *dst++ = ci.CharAt(bp++);
+ if (bp < ep)
+ return 0;
+ }
+ }
+ *dst = (char) 0;
+ return 1;
+}