OWLNext    7.0
Borland's Object Windows Library for the modern age
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
regexp.cpp
Go to the documentation of this file.
1
2#include <owl/pch.h>
3#include <owl/defs.h>
4
5#include <algorithm>
6
//
// Borland includes this in its run-time library
//
//#if !defined(BI_COMP_BORLANDC)
11
12#include <owl/private/regexp.h>
13
14namespace owl
15{
16
/////////////////////////////////////
// TRegexp Class
19
21
22int makepat(const tchar * exp, PatternType * pat, size_t maxpattern);
23const tchar * matchs(const tchar * str,
24 const PatternType * pat,
25 tchar * * startpat);
26
27const unsigned TRegexp::maxpat=128;
28
30{
31 gen_pattern( str );
32}
33
35{
36 copy_pattern( r );
37}
38
40{
41 delete[] the_pattern;
42}
43
44void TRegexp::copy_pattern(const TRegexp & r)
45{
46 the_pattern = new PatternType[maxpat];
47 memcpy( the_pattern, r.the_pattern, maxpat );
48 stat = r.stat;
49}
50
51void TRegexp::gen_pattern(const tchar * str)
52{
53 the_pattern = new PatternType[maxpat];
54 stat = (StatVal)makepat( str, the_pattern, maxpat );
55}
56
58{
59 delete the_pattern;
60 gen_pattern( cp );
61 return *this;
62}
63
65{
66 if(this != &r)
67 {
68 delete the_pattern;
69 copy_pattern( r );
70 }
71 return *this;
72}
73
75{
76 StatVal temp = stat;
77 stat = OK;
78 return temp;
79}
80
81int makepat(const tchar *exp, PatternType *pat, size_t maxpattern);
82const tchar * matchs( const tchar *str,
83 const PatternType *pat,
84 tchar * *startpat);
85
86size_t TRegexp::find( const tstring & str,
87 size_t *len,
88 size_t i ) const
89{
90 PRECONDITION( stat==OK );
91 tchar * startp;
92 const tchar * s = str.c_str();
93 const tchar * endp = matchs( s+i, the_pattern, &startp );
94 if( endp ){
95 *len = (size_t)((tchar *)endp - startp + 1);
96 return (size_t)(startp - (tchar *)s);
97 }
98 else{
99 *len = 0;
100 return static_cast<size_t>(-1); //JJH added static_cast
101 }
102}
103
/*
 *
 * Author: Allen I. Holub
 *
 * (c) C Gazette. May be used freely as long as author and publication are
 * acknowledged
 *
 */

/*
 *
 * Modified by Borland International to compile without warnings as C++.
 *
 * Modified by Yura Bidus to support UNICODE.
 */
119
/* Metacharacters recognised in the input expression: */
#define BOL     _T('^')   /* start-of-line anchor                    */
#define EOL     _T('$')   /* end-of-line anchor                      */
#define ANY     _T('.')   /* matches any character                   */
#define CCL     _T('[')   /* starts a character class                */
#define CCLEND  _T(']')   /* ends a character class                  */
#define NCCL    _T('^')   /* negates the class when it is its 1st char */
#define CLOSURE _T('*')   /* Kleene closure (0 or more)              */
#define PCLOSE  _T('+')   /* positive closure (1 or more)            */
#define OPT     _T('?')   /* optional closure (0 or 1)               */
130
131//?????????????????
132// 0x80 will not work in UNICODE !!!!!!!!!!!!
133typedef enum action { /* These are put in the pattern string */
134 /* to represent metacharacters. */
135 M_BOL = (0x80 | _T('^')),
136 M_EOL = (0x80 | _T('$')),
137 M_ANY = (0x80 | _T('.')),
138 M_CCL = (0x80 | _T('[')),
139 M_OPT = (0x80 | _T('?')),
140 M_CLOSE = (0x80 | _T('*')),
141 M_PCLOSE = (0x80 | _T('+'))
143
144//typedef unsigned char pattern; /* pattern strings are unsigned char */
146
/* Non-zero when a pattern element is an action code standing for a
 * metacharacter (i.e. bit 0x80 is set). */
#define IS_ACTION(x) ((x)&0x80)

/*----------------------------------------------------------------------*/
/* Number of pattern elements occupied by a character-class bit map. */
#define MAPSIZE 16

/*
 * Step a pattern pointer to the next pattern element. Every element is one
 * slot wide except M_CCL, where both the M_CCL code and the bit map that
 * follows it have to be skipped. The macro modifies `pat` and yields the
 * advanced pointer.
 */

#define ADVANCE(pat) (pat += (*pat == (pattern)M_CCL) ? (MAPSIZE+1) : 1)

/*
 * Bit-map helpers: SETBIT sets bit `b` in `map`, TSTBIT tests whether bit `b`
 * is set. Only the low 7 bits of `b` select a bit.
 */

#define SETBIT(b,map) ((map)[((b) & 0x7f) >>3] |= pattern(1 << ((b) & 0x07)) )
#define TSTBIT(b,map) ((map)[((b) & 0x7f) >>3] & (1<< ((b) & 0x07)) )
/*----------------------------------------------------------------------*/
/* Result codes returned by makepat(): */
#define E_NONE    0   /* pattern template built successfully */
#define E_ILLEGAL 1   /* illegal input pattern               */
#define E_NOMEM   2   /* no pattern buffer supplied          */
#define E_PAT     3   /* pattern too long                    */
175
176/*----------------------------------------------------------------------*/
177
178 static const tchar *doccl(pattern *, const tchar *);
179 static int hex2bin(int);
180 extern int makepat( const tchar *, pattern *, size_t);
181 extern const tchar *matchs( const tchar *, const pattern *, tchar * *);
182 static int oct2bin(int);
183 static int omatch(const tchar * *, const pattern *, const tchar *);
184 extern const tchar *patcmp(const tchar *, const pattern *, const tchar *);
185 extern int esc( const tchar * *);
186
187/*----------------------------------------------------------------------*/
188int
189makepat( const tchar *exp, pattern *pat, size_t maxpat)
190// char *exp; /* Regular expression */
191// pattern *pat; /* Assembled compiled pattern */
192// int maxpat; /* Length of pat */
193{
194 /*
195 * Make a pattern template from the string pointed to by exp. Stop when
196 * '\0' or '\n' is found in exp. The pattern template is assembled
197 * in pat whose length is given by maxpat.
198 *
199 * Return:
200 * E_ILLEGAL Illegal input pattern.
201 * E_NOMEM out of memory.
202 * E_PAT pattern too long.
203 */
204
205 pattern *cur; /* pointer to current pattern element */
206 pattern *prev; /* pointer to previous pattern element */
207 int Error = E_ILLEGAL;
208
209 if (!*exp || *exp == _T('\n'))
210 goto exit;
211
212 if (*exp == CLOSURE || *exp == PCLOSE || *exp == OPT)
213 goto exit;
214
215 Error = E_NOMEM;
216 if (!pat)
217 goto exit; /* Check for bad pat */
218
219 prev = cur = pat;
220 Error = E_PAT;
221
222 while (*exp && *exp != _T('\n')) {
223
224 if (cur >= &pat[maxpat - 1])
225 goto exit;
226
227 switch (*exp) {
228 case ANY:
229 *cur = (pattern)M_ANY;
230 prev = cur++;
231 ++exp;
232 break;
233
234 case BOL:
235 *cur = (cur == pat) ? (pattern)M_BOL : *exp;
236 prev = cur++;
237 ++exp;
238 break;
239
240 case EOL:
241 *cur = (!exp[1] || exp[1] == _T('\n')) ? (pattern)M_EOL : *exp;
242 prev = cur++;
243 ++exp;
244 break;
245
246 case CCL:
247 if (uint((cur - pat) + MAPSIZE) >= maxpat)
248 goto exit; /* not enough room for bit map */
249 prev = cur;
250 *cur++ = (pattern)M_CCL;
251 exp = doccl(cur, exp);
252 cur += MAPSIZE;
253 break;
254
255 case OPT:
256 case CLOSURE:
257 case PCLOSE:
258 switch (*prev) {
259 case M_BOL:
260 case M_EOL:
261 case M_OPT:
262 case M_PCLOSE:
263 case M_CLOSE:
264 goto exit;
265 }
266
267 /* memmove( prev+1, prev, cur-prev ); */
268 {
269 pattern *p = cur;
270 while (p > prev) {
271 *p = *(p - 1);
272 p--;
273 }
274 }
275 *prev = (*exp == OPT) ? (pattern)M_OPT :
277 ++cur;
278 ++exp;
279 break;
280
281 default:
282 prev = cur;
283 *cur++ = (pattern)esc(&exp);
284 break;
285 }
286 }
287
288 *cur = _T('\0');
289 Error = E_NONE;
290
291 exit:
292 return Error;
293}
294
295/*----------------------------------------------------------------------*/
296static const tchar *
297doccl( pattern *map, const tchar * src)
298{
299 /*
300 * Set bits in the map corresponding to characters specified in the src
301 * character class.
302 */
303
304 int first, last, negative;
305 const tchar *start;
306
307 ++src; /* skip past the [ */
308 negative = (*src == NCCL);
309 if (negative) /* check for negative ccl */
310 ++src;
311 start = src; /* start of characters in class */
312 memset(map, 0, MAPSIZE); /* bitmap initially empty */
313
314 while (*src && *src != CCLEND) {
315 if (*src != _T('-')) {
316 first = esc(&src); /* Use temp. to avoid macro */
317 SETBIT(first, map); /* side effects. */
318 }
319 else if (src == start) {
320 SETBIT(_T('-'), map); /* literal dash at start or end */
321 ++src;
322 }
323 else {
324 ++src; /* skip to end-of-sequence char */
325 if (*src < src[-2]) {
326 first = *src;
327 last = src[-2];
328 }
329 else {
330 first = src[-2];
331 last = *src;
332 }
333 while (++first <= last)
334 SETBIT(first, map);
335 src++;
336 }
337 }
338
339 if (*src == CCLEND)
340 ++src; /* Skip CCLEND */
341
342 if (negative)
343 for (first = MAPSIZE; --first >= 0;)
344 *map++ ^= ~0; /* invert all bits */
345
346 return src;
347}
348
349/*----------------------------------------------------------------------*/
350const tchar *
351matchs( const tchar * str, const pattern * pat, tchar * * startpat)
352{
353 const tchar * endp = nullptr;
354 const tchar * start;
355
356 if (!pat)
357 return nullptr;
358
359 if (*str == _T('\0')) {
360 if ((*pat == (pattern)M_EOL) || (*pat == (pattern)M_BOL && (!pat[1] || pat[1] == (pattern)M_EOL)))
361 endp = str;
362 }
363 else {
364 start = str; /* Do a brute-force substring search,
365 * comparing a pattern against the input string */
366 while (*str) {
367 endp = patcmp(str, pat, start);
368 if (endp)
369 break;
370 str++;
371 }
372 }
373 *startpat = (tchar*)str;
374 return endp;
375}
376
377/*----------------------------------------------------------------------*/
378const tchar *
379patcmp( const tchar * str, const pattern * pat, const tchar * start)
380{
381 /*
382 * Like strcmp, but compares str against pat. Each element of str is
383 * compared with the template until either a mis-match is found or the end
384 * of the template is reached. In the former case a 0 is returned; in the
385 * latter, a pointer into str (pointing to the last character in the
386 * matched pattern) is returned. Strstart points at the first character in
387 * the string, which might not be the same thing as line if the search
388 * started in the middle of the string.
389 */
390
391 const tchar * bocl; /* beginning of closure string. */
392 const tchar * end=0; /* return value: end-of-string pointer. */
393
394 if (!pat) /* make sure pattern is valid */
395 return nullptr;
396
397 while (*pat) {
398 if (*pat == (pattern)M_OPT) {
399 /*
400 * Zero or one matches. It doesn't matter if omatch fails---it will
401 * advance str past the character on success, though. Always advance
402 * the pattern past both the M_OPT and the operand.
403 */
404
405 omatch(&str, ++pat, start);
406 ADVANCE(pat);
407 }
408 else if (!(*pat == (pattern)M_CLOSE || *pat == (pattern)M_PCLOSE)) {
409 /*
410 * Do a simple match. Note that omatch() fails if there's still
411 * something in pat but we're at end of string.
412 */
413
414 if (!omatch(&str, pat, start))
415 return nullptr;
416
417 ADVANCE(pat);
418
419 } else { /* Process a Kleene or positive closure */
420
421 if (*pat++ == (pattern)M_PCLOSE) /* one match required */
422 if (!omatch(&str, pat, start))
423 return nullptr;
424
425 /* Match as many as possible, zero is okay */
426
427 bocl = str;
428 while (*str && omatch(&str, pat, start)) { /* do nothing */ }
429
430 /*
431 * 'str' now points to the character that made made us fail. Try to
432 * process the rest of the string. If the character following the
433 * closure could have been in the closure (as in the pattern "[a-z]*t")
434 * the final 't' will be sucked up in the while loop. So, if the match
435 * fails, back up a notch and try to match the rest of the string
436 * again, repeating this process recursively until we get back to the
437 * beginning of the closure. The recursion goes, at most, one levels
438 * deep.
439 */
440
441 if (*ADVANCE(pat)) {
442 for (; bocl <= str; --str) {
443 end = patcmp(str, pat, start);
444 if (end) break;
445 }
446 return end;
447 }
448 break;
449 }
450 }
451
452 /*
453 * omatch() advances str to point at the next character to be matched. So
454 * str points at the character following the last character matched when
455 * you reach the end of the template. The exceptions are templates
456 * containing only a BOLN or EOLN token. In these cases omatch doesn't
457 * advance. Since we must return a pointer to the last matched character,
458 * decrement str to make it point at the end of the matched string, making
459 * sure that the decrement hasn't gone past the beginning of the string.
460 *
461 * Note that $ is a position, not a character, but in the case of a pattern
462 * ^$, a pointer to the end of line character is returned. In ^xyz$, a
463 * pointer to the z is returned.
464 *
465 * The --str is done outside the return statement because __max() was a macro
466 * with side-effects.
467 */
468
469 --str;
470 return (std::max(start, str));
471}
472
473/*----------------------------------------------------------------------*/
474static int
475omatch( const tchar * * strp,
476 const pattern * pat,
477 const tchar * start )
478{
479 /*
480 * Match one pattern element, pointed at by pat, against the character at
481 * **strp. Return 0 on a failure, 1 on success. *strp is advanced to skip
482 * over the matched character on a successful match. Closure is handled one
483 * level up by patcmp().
484 *
485 * "start" points at the character at the left edge of the line. This might
486 * not be the same thing as *strp if the search is starting in the middle
487 * of the string. An end-of- line anchor matches '\n' or '\0'.
488 */
489
490 int advance = -1; /* amount to advance *strp, -1 == error */
491
492 switch (*pat) {
493 case M_BOL: /* First char in string? */
494 if (*strp == start) /* Only one star here. */
495 advance = 0;
496 break;
497
498 case M_ANY: /* . = anything but newline and end-of-string */
499 if (**strp != _T('\n') && **strp != _T('\0'))
500 advance = 1;
501 break;
502
503 case M_EOL:
504 if (**strp == _T('\n') || **strp == _T('\0'))
505 advance = 0;
506 break;
507
508 case M_CCL:
509 if (**strp != _T('\0') && TSTBIT(**strp, pat + 1)) /* the end must never match */
510 advance = 1;
511 break;
512
513 default: /* literal match */
514 if (**strp == *pat)
515 advance = 1;
516 break;
517 }
518
519 if (advance > 0)
520 *strp += advance;
521
522 return (advance + 1);
523}
524
/* True when x is one of 0-9, a-f or A-F. The argument is evaluated more than
 * once, so it must not have side effects. */
#define ISHEXDIGIT(x) ( _istdigit(x)                        \
                     || (_T('a') <= (x) && (x) <= _T('f'))  \
                     || (_T('A') <= (x) && (x) <= _T('F')) )

/* True when x is one of 0-7. The argument is evaluated twice. */
#define ISOCTDIGIT(x) (_T('0') <= (x) && (x) <= _T('7'))
530
531static int hex2bin( int c )
532{
533 /* Convert the hex digit represented by 'c' to an int. 'c'
534 * must be one of: 0123456789abcdefABCDEF
535 */
536 return (_istdigit(c) ? (c)-_T('0') : ((_toupper(c))-_T('A'))+10) & 0xf;
537}
538
539static int oct2bin( int c )
540{
541 /* Convert the hex digit represented by 'c' to an int. 'c'
542 * must be a digit in the range '0'-'7'.
543 */
544 return ( ((c)-_T('0')) & 0x7 );
545}
546
547/*------------------------------------------------------------*/
548
549int esc( const tchar * * s)
550{
551 /* Map escape sequences into their equivalent symbols. Return
552 * the equivalent ASCII character. *s is advanced past the
553 * escape sequence. If no escape sequence is present, the
554 * current character is returned and the string is advanced by
555 * one. The following are recognized:
556 *
557 * \b backspace
558 * \f formfeed
559 * \n newline
560 * \r carriage return
561 * \s space
562 * \t tab
563 * \e ASCII ESC character ('\033')
564 * \DDD number formed of 1-3 octal digits
565 * \xDDD number formed of 1-3 hex digits
566 * \^C C = any letter. Control code
567 */
568
569 int rval;
570
571 if( **s != _T('\\') )
572 rval = *( (*s)++ );
573 else {
574 ++(*s); /* Skip the \ */
575 switch( toupper(**s) ) {
576 case _T('\0'): rval = _T('\\'); break;
577 case _T('B'): rval = _T('\b') ; break;
578 case _T('F'): rval = _T('\f') ; break;
579 case _T('N'): rval = _T('\n') ; break;
580 case _T('R'): rval = _T('\r') ; break;
581 case _T('S'): rval = _T(' ') ; break;
582 case _T('T'): rval = _T('\t') ; break;
583 case _T('E'): rval = _T('\033'); break;
584
585 case _T('^'):
586 rval = *++(*s) ;
587 rval = _toupper(rval) - _T('@') ;
588 break;
589
590 case _T('X'):
591 rval = 0;
592 ++(*s);
593 if( ISHEXDIGIT(**s) ) {
594 rval = hex2bin( *(*s)++ );
595 }
596 if( ISHEXDIGIT(**s) ) {
597 rval <<= 4;
598 rval |= hex2bin( *(*s)++ );
599 }
600 if( ISHEXDIGIT(**s) ) {
601 rval <<= 4;
602 rval |= hex2bin( *(*s)++ );
603 }
604 --(*s);
605 break;
606
607 default:
608 if( !ISOCTDIGIT(**s) )
609 rval = **s;
610 else {
611 ++(*s);
612 rval = oct2bin( *(*s)++ );
613 if( ISOCTDIGIT(**s) ) {
614 rval <<= 3;
615 rval |= oct2bin( *(*s)++ );
616 }
617 if( ISOCTDIGIT(**s) ) {
618 rval <<= 3;
619 rval |= oct2bin( *(*s)++ );
620 }
621 --(*s);
622 }
623 break;
624 }
625 ++(*s);
626 }
627 return rval;
628}
629
630}
631
632//#endif
633
634
635//==============================================================================
636
#define PRECONDITION(condition)
Definition checks.h:227
This class represents regular expressions.
Definition regexp.h:94
TRegexp & operator=(const TRegexp &r)
Definition regexp.cpp:64
StatVal status() noexcept
Definition regexp.cpp:74
StatVal
StatVal enumerates the status conditions returned by TRegexp::status.
Definition regexp.h:97
@ OK
Means the given regular expression is legal.
Definition regexp.h:98
size_t find(const tstring &s, size_t *len, size_t start=0) const
Definition regexp.cpp:86
TRegexp(const tchar *cp)
Definition regexp.cpp:29
#define _istdigit
Definition cygwin.h:70
unsigned char _TUCHAR
Definition cygwin.h:44
#define _T(x)
Definition cygwin.h:51
Object Windows Library (OWLNext Core)
Definition animctrl.h:22
_TUCHAR pattern
Definition regexp.cpp:145
int makepat(const tchar *exp, PatternType *pat, size_t maxpattern)
Definition regexp.cpp:189
utchar PatternType
Definition regexp.cpp:20
char tchar
Definition defs.h:77
const tchar * patcmp(const tchar *, const pattern *, const tchar *)
Definition regexp.cpp:379
int esc(const tchar **)
Definition regexp.cpp:549
std::string tstring
Definition defs.h:79
const tchar * matchs(const tchar *str, const PatternType *pat, tchar **startpat)
Definition regexp.cpp:351
unsigned int uint
Definition number.h:25
unsigned char utchar
Definition defs.h:78
action
Definition regexp.cpp:133
@ M_EOL
Definition regexp.cpp:136
@ M_CLOSE
Definition regexp.cpp:140
@ M_OPT
Definition regexp.cpp:139
@ M_PCLOSE
Definition regexp.cpp:141
@ M_BOL
Definition regexp.cpp:135
@ M_ANY
Definition regexp.cpp:137
@ M_CCL
Definition regexp.cpp:138
General definitions used by all ObjectWindows programs.
#define MAPSIZE
Definition regexp.cpp:151
#define ISHEXDIGIT(x)
Definition regexp.cpp:525
#define OPT
Definition regexp.cpp:129
#define E_NOMEM
Definition regexp.cpp:173
#define BOL
Definition regexp.cpp:121
#define NCCL
Definition regexp.cpp:126
#define PCLOSE
Definition regexp.cpp:128
#define EOL
Definition regexp.cpp:122
#define E_NONE
Definition regexp.cpp:171
#define SETBIT(b, map)
Definition regexp.cpp:168
#define CCLEND
Definition regexp.cpp:125
#define ANY
Definition regexp.cpp:123
#define ISOCTDIGIT(x)
Definition regexp.cpp:529
#define E_ILLEGAL
Definition regexp.cpp:172
#define TSTBIT(b, map)
Definition regexp.cpp:169
#define CCL
Definition regexp.cpp:124
#define E_PAT
Definition regexp.cpp:174
#define CLOSURE
Definition regexp.cpp:127
#define ADVANCE(pat)
Definition regexp.cpp:161