OWLNext    7.0
Borland's Object Windows Library for the modern age
Loading...
Searching...
No Matches
regexp.cpp
Go to the documentation of this file.
1
2#include <owl/pch.h>
3#include <owl/defs.h>
4
5#include <algorithm>
6
7//
8// Borland includes this in run-time library
9//
10//#if !defined(BI_COMP_BORLANDC)
11
12#include <owl/private/regexp.h>
13
14namespace owl
15{
16
17/////////////////////////////////////
18// TRegexp Class
19
21
22int makepat(const tchar * exp, PatternType * pat, size_t maxpattern);
23const tchar * matchs(const tchar * str,
24 const PatternType * pat,
25 tchar * * startpat);
26
27const unsigned TRegexp::maxpat=128;
28
30{
31 gen_pattern( str );
32}
33
35{
36 copy_pattern( r );
37}
38
40{
41 delete[] the_pattern;
42}
43
44void TRegexp::copy_pattern(const TRegexp & r)
45{
46 the_pattern = new PatternType[maxpat];
47 memcpy( the_pattern, r.the_pattern, maxpat );
48 stat = r.stat;
49}
50
51void TRegexp::gen_pattern(const tchar * str)
52{
53 the_pattern = new PatternType[maxpat];
54 stat = (StatVal)makepat( str, the_pattern, maxpat );
55}
56
58{
59 delete the_pattern;
60 gen_pattern( cp );
61 return *this;
62}
63
65{
66 if(this != &r)
67 {
68 delete the_pattern;
69 copy_pattern( r );
70 }
71 return *this;
72}
73
75{
76 StatVal temp = stat;
77 stat = OK;
78 return temp;
79}
80
81int makepat(const tchar *exp, PatternType *pat, size_t maxpattern);
82const tchar * matchs( const tchar *str,
83 const PatternType *pat,
84 tchar * *startpat);
85
86size_t TRegexp::find( const tstring & str,
87 size_t *len,
88 size_t i ) const
89{
90 PRECONDITION( stat==OK );
91 tchar * startp;
92 const tchar * s = str.c_str();
93 const tchar * endp = matchs( s+i, the_pattern, &startp );
94 if( endp ){
95 *len = (size_t)((tchar *)endp - startp + 1);
96 return (size_t)(startp - (tchar *)s);
97 }
98 else{
99 *len = 0;
100 return static_cast<size_t>(-1); //JJH added static_cast
101 }
102}
103
104/*
105 *
106 * Author: Allen I. Holub
107 *
108 * (c) C Gazette. May be used freely as long as author and publication are
109 * acknowledged
110 *
111 */
112
113/*
114 *
115 * Modified by Borland International to compile without warnings as C++.
116 *
117 * Modified by Yura Bidus to support UNICODE.
118 */
119
120/* Metacharacters in the input: */
121#define BOL _T('^') /* start-of-line anchor */
122#define EOL _T('$') /* end-of-line anchor */
123#define ANY _T('.') /* matches any character */
124#define CCL _T('[') /* start a character class */
125#define CCLEND _T(']') /* end a character class */
126#define NCCL _T('^') /* negates character class if 1st char. */
127#define CLOSURE _T('*') /* Kleene closure (matches 0 or more) */
128#define PCLOSE _T('+') /* Positive closure (1 or more) */
129#define OPT _T('?') /* Optional closure (0 or 1) */
130
131//?????????????????
132// 0x80 will not work in UNICODE !!!!!!!!!!!!
133typedef enum action { /* These are put in the pattern string */
134 /* to represent metacharacters. */
135 M_BOL = (0x80 | _T('^')),
136 M_EOL = (0x80 | _T('$')),
137 M_ANY = (0x80 | _T('.')),
138 M_CCL = (0x80 | _T('[')),
139 M_OPT = (0x80 | _T('?')),
140 M_CLOSE = (0x80 | _T('*')),
141 M_PCLOSE = (0x80 | _T('+'))
143
144//typedef unsigned char pattern; /* pattern strings are unsigned char */
146
147#define IS_ACTION(x) ((x)&0x80) /* true => element of pat. string is an */
148 /* action that represents a metacharacter */
149
150/*----------------------------------------------------------------------*/
151#define MAPSIZE 16 /* need this many bytes for character class bit map */
152
153/*
154 * Advance a pointer into the pattern template
155 * to the next pattern element, this is a +1 for
156 * all pattern elements but M_CCL, where you have
157 * to skip past both the M_CCL character and the
158 * bitmap that follows that character
159 */
160
161#define ADVANCE(pat) (pat += (*pat == (pattern)M_CCL) ? (MAPSIZE+1) : 1)
162
163/*
164 * Bitmap functions. Set bit b in the map and
165 * test bit b to see if it was set previously.
166 */
167
168#define SETBIT(b,map) ((map)[((b) & 0x7f) >>3] |= pattern(1 << ((b) & 0x07)) )
169#define TSTBIT(b,map) ((map)[((b) & 0x7f) >>3] & (1<< ((b) & 0x07)) )
170/*----------------------------------------------------------------------*/
171#define E_NONE 0 /* Possible return values from pat_err. */
172#define E_ILLEGAL 1 /* Set in makepat() to indicate prob- */
173#define E_NOMEM 2 /* lems that came up while making the */
174#define E_PAT 3 /* pattern template. */
175
176/*----------------------------------------------------------------------*/
177
178 static const tchar *doccl(pattern *, const tchar *);
179 static int hex2bin(int);
180 extern int makepat( const tchar *, pattern *, size_t);
181 extern const tchar *matchs( const tchar *, const pattern *, tchar * *);
182 static int oct2bin(int);
183 static int omatch(const tchar * *, const pattern *, const tchar *);
184 extern const tchar *patcmp(const tchar *, const pattern *, const tchar *);
185 extern int esc( const tchar * *);
186
187/*----------------------------------------------------------------------*/
188int
189makepat( const tchar *exp, pattern *pat, size_t maxpat)
190// char *exp; /* Regular expression */
191// pattern *pat; /* Assembled compiled pattern */
192// int maxpat; /* Length of pat */
193{
194 /*
195 * Make a pattern template from the string pointed to by exp. Stop when
196 * '\0' or '\n' is found in exp. The pattern template is assembled
197 * in pat whose length is given by maxpat.
198 *
199 * Return:
200 * E_ILLEGAL Illegal input pattern.
201 * E_NOMEM out of memory.
202 * E_PAT pattern too long.
203 */
204
205 pattern *cur; /* pointer to current pattern element */
206 pattern *prev; /* pointer to previous pattern element */
207 int Error = E_ILLEGAL;
208
209 if (!*exp || *exp == _T('\n'))
210 goto exit;
211
212 if (*exp == CLOSURE || *exp == PCLOSE || *exp == OPT)
213 goto exit;
214
215 Error = E_NOMEM;
216 if (!pat)
217 goto exit; /* Check for bad pat */
218
219 prev = cur = pat;
220 Error = E_PAT;
221
222 while (*exp && *exp != _T('\n')) {
223
224 if (cur >= &pat[maxpat - 1])
225 goto exit;
226
227 switch (*exp) {
228 case ANY:
229 *cur = (pattern)M_ANY;
230 prev = cur++;
231 ++exp;
232 break;
233
234 case BOL:
235 *cur = (cur == pat) ? (pattern)M_BOL : *exp;
236 prev = cur++;
237 ++exp;
238 break;
239
240 case EOL:
241 *cur = (!exp[1] || exp[1] == _T('\n')) ? (pattern)M_EOL : *exp;
242 prev = cur++;
243 ++exp;
244 break;
245
246 case CCL:
247 if (uint((cur - pat) + MAPSIZE) >= maxpat)
248 goto exit; /* not enough room for bit map */
249 prev = cur;
250 *cur++ = (pattern)M_CCL;
251 exp = doccl(cur, exp);
252 cur += MAPSIZE;
253 break;
254
255 case OPT:
256 case CLOSURE:
257 case PCLOSE:
258 switch (*prev) {
259 case M_BOL:
260 case M_EOL:
261 case M_OPT:
262 case M_PCLOSE:
263 case M_CLOSE:
264 goto exit;
265 }
266
267 /* memmove( prev+1, prev, cur-prev ); */
268 {
269 pattern *p = cur;
270 while (p > prev) {
271 *p = *(p - 1);
272 p--;
273 }
274 }
275 *prev = (*exp == OPT) ? (pattern)M_OPT :
277 ++cur;
278 ++exp;
279 break;
280
281 default:
282 prev = cur;
283 *cur++ = (pattern)esc(&exp);
284 break;
285 }
286 }
287
288 *cur = _T('\0');
289 Error = E_NONE;
290
291 exit:
292 return Error;
293}
294
295/*----------------------------------------------------------------------*/
296static const tchar *
297doccl( pattern *map, const tchar * src)
298{
299 /*
300 * Set bits in the map corresponding to characters specified in the src
301 * character class.
302 */
303
304 int first, last, negative;
305 const tchar *start;
306
307 ++src; /* skip past the [ */
308 negative = (*src == NCCL);
309 if (negative) /* check for negative ccl */
310 ++src;
311 start = src; /* start of characters in class */
312 memset(map, 0, MAPSIZE); /* bitmap initially empty */
313
314 while (*src && *src != CCLEND) {
315 if (*src != _T('-')) {
316 first = esc(&src); /* Use temp. to avoid macro */
317 SETBIT(first, map); /* side effects. */
318 }
319 else if (src == start) {
320 SETBIT(_T('-'), map); /* literal dash at start or end */
321 ++src;
322 }
323 else {
324 ++src; /* skip to end-of-sequence char */
325 if (*src < src[-2]) {
326 first = *src;
327 last = src[-2];
328 }
329 else {
330 first = src[-2];
331 last = *src;
332 }
333 while (++first <= last)
334 SETBIT(first, map);
335 src++;
336 }
337 }
338
339 if (*src == CCLEND)
340 ++src; /* Skip CCLEND */
341
342 if (negative)
343 for (first = MAPSIZE; --first >= 0;)
344 *map++ ^= ~0; /* invert all bits */
345
346 return src;
347}
348
349/*----------------------------------------------------------------------*/
350const tchar *
351matchs( const tchar * str, const pattern * pat, tchar * * startpat)
352{
353 const tchar * endp = nullptr;
354 const tchar * start;
355
356 if (!pat)
357 return nullptr;
358
359 if (*str == _T('\0')) {
360 if ((*pat == (pattern)M_EOL) || (*pat == (pattern)M_BOL && (!pat[1] || pat[1] == (pattern)M_EOL)))
361 endp = str;
362 }
363 else {
364 start = str; /* Do a brute-force substring search,
365 * comparing a pattern against the input string */
366 while (*str) {
367 endp = patcmp(str, pat, start);
368 if (endp)
369 break;
370 str++;
371 }
372 }
373 *startpat = (tchar*)str;
374 return endp;
375}
376
377/*----------------------------------------------------------------------*/
378const tchar *
379patcmp( const tchar * str, const pattern * pat, const tchar * start)
380{
381 /*
382 * Like strcmp, but compares str against pat. Each element of str is
383 * compared with the template until either a mis-match is found or the end
384 * of the template is reached. In the former case a 0 is returned; in the
385 * latter, a pointer into str (pointing to the last character in the
386 * matched pattern) is returned. Strstart points at the first character in
387 * the string, which might not be the same thing as line if the search
388 * started in the middle of the string.
389 */
390
391 const tchar * bocl; /* beginning of closure string. */
392 const tchar * end=0; /* return value: end-of-string pointer. */
393
394 if (!pat) /* make sure pattern is valid */
395 return nullptr;
396
397 while (*pat) {
398 if (*pat == (pattern)M_OPT) {
399 /*
400 * Zero or one matches. It doesn't matter if omatch fails---it will
401 * advance str past the character on success, though. Always advance
402 * the pattern past both the M_OPT and the operand.
403 */
404
405 omatch(&str, ++pat, start);
406 ADVANCE(pat);
407 }
408 else if (!(*pat == (pattern)M_CLOSE || *pat == (pattern)M_PCLOSE)) {
409 /*
410 * Do a simple match. Note that omatch() fails if there's still
411 * something in pat but we're at end of string.
412 */
413
414 if (!omatch(&str, pat, start))
415 return nullptr;
416
417 ADVANCE(pat);
418
419 } else { /* Process a Kleene or positive closure */
420
421 if (*pat++ == (pattern)M_PCLOSE) /* one match required */
422 if (!omatch(&str, pat, start))
423 return nullptr;
424
425 /* Match as many as possible, zero is okay */
426
427 bocl = str;
428 while (*str && omatch(&str, pat, start)) { /* do nothing */ }
429
430 /*
431 * 'str' now points to the character that made made us fail. Try to
432 * process the rest of the string. If the character following the
433 * closure could have been in the closure (as in the pattern "[a-z]*t")
434 * the final 't' will be sucked up in the while loop. So, if the match
435 * fails, back up a notch and try to match the rest of the string
436 * again, repeating this process recursively until we get back to the
437 * beginning of the closure. The recursion goes, at most, one levels
438 * deep.
439 */
440
441 if (*ADVANCE(pat)) {
442 for (; bocl <= str; --str) {
443 end = patcmp(str, pat, start);
444 if (end) break;
445 }
446 return end;
447 }
448 break;
449 }
450 }
451
452 /*
453 * omatch() advances str to point at the next character to be matched. So
454 * str points at the character following the last character matched when
455 * you reach the end of the template. The exceptions are templates
456 * containing only a BOLN or EOLN token. In these cases omatch doesn't
457 * advance. Since we must return a pointer to the last matched character,
458 * decrement str to make it point at the end of the matched string, making
459 * sure that the decrement hasn't gone past the beginning of the string.
460 *
461 * Note that $ is a position, not a character, but in the case of a pattern
462 * ^$, a pointer to the end of line character is returned. In ^xyz$, a
463 * pointer to the z is returned.
464 *
465 * The --str is done outside the return statement because __max() was a macro
466 * with side-effects.
467 */
468
469 --str;
470 return (std::max(start, str));
471}
472
473/*----------------------------------------------------------------------*/
474static int
475omatch( const tchar * * strp,
476 const pattern * pat,
477 const tchar * start )
478{
479 /*
480 * Match one pattern element, pointed at by pat, against the character at
481 * **strp. Return 0 on a failure, 1 on success. *strp is advanced to skip
482 * over the matched character on a successful match. Closure is handled one
483 * level up by patcmp().
484 *
485 * "start" points at the character at the left edge of the line. This might
486 * not be the same thing as *strp if the search is starting in the middle
487 * of the string. An end-of- line anchor matches '\n' or '\0'.
488 */
489
490 int advance = -1; /* amount to advance *strp, -1 == error */
491
492 switch (*pat) {
493 case M_BOL: /* First char in string? */
494 if (*strp == start) /* Only one star here. */
495 advance = 0;
496 break;
497
498 case M_ANY: /* . = anything but newline and end-of-string */
499 if (**strp != _T('\n') && **strp != _T('\0'))
500 advance = 1;
501 break;
502
503 case M_EOL:
504 if (**strp == _T('\n') || **strp == _T('\0'))
505 advance = 0;
506 break;
507
508 case M_CCL:
509 if (**strp != _T('\0') && TSTBIT(**strp, pat + 1)) /* the end must never match */
510 advance = 1;
511 break;
512
513 default: /* literal match */
514 if (**strp == *pat)
515 advance = 1;
516 break;
517 }
518
519 if (advance > 0)
520 *strp += advance;
521
522 return (advance + 1);
523}
524
525#define ISHEXDIGIT(x) (_istdigit(x) \
526 || (_T('a')<=(x) && (x)<=_T('f')) \
527 || (_T('A')<=(x) && (x)<=_T('F')) )
528
529#define ISOCTDIGIT(x) (_T('0')<=(x) && (x)<=_T('7'))
530
531static int hex2bin( int c )
532{
533 /* Convert the hex digit represented by 'c' to an int. 'c'
534 * must be one of: 0123456789abcdefABCDEF
535 */
536 return (_istdigit(c) ? (c)-_T('0') : ((_toupper(c))-_T('A'))+10) & 0xf;
537}
538
539static int oct2bin( int c )
540{
541 /* Convert the hex digit represented by 'c' to an int. 'c'
542 * must be a digit in the range '0'-'7'.
543 */
544 return ( ((c)-_T('0')) & 0x7 );
545}
546
547/*------------------------------------------------------------*/
548
549int esc( const tchar * * s)
550{
551 /* Map escape sequences into their equivalent symbols. Return
552 * the equivalent ASCII character. *s is advanced past the
553 * escape sequence. If no escape sequence is present, the
554 * current character is returned and the string is advanced by
555 * one. The following are recognized:
556 *
557 * \b backspace
558 * \f formfeed
559 * \n newline
560 * \r carriage return
561 * \s space
562 * \t tab
563 * \e ASCII ESC character ('\033')
564 * \DDD number formed of 1-3 octal digits
565 * \xDDD number formed of 1-3 hex digits
566 * \^C C = any letter. Control code
567 */
568
569 int rval;
570
571 if( **s != _T('\\') )
572 rval = *( (*s)++ );
573 else {
574 ++(*s); /* Skip the \ */
575 switch( toupper(**s) ) {
576 case _T('\0'): rval = _T('\\'); break;
577 case _T('B'): rval = _T('\b') ; break;
578 case _T('F'): rval = _T('\f') ; break;
579 case _T('N'): rval = _T('\n') ; break;
580 case _T('R'): rval = _T('\r') ; break;
581 case _T('S'): rval = _T(' ') ; break;
582 case _T('T'): rval = _T('\t') ; break;
583 case _T('E'): rval = _T('\033'); break;
584
585 case _T('^'):
586 rval = *++(*s) ;
587 rval = _toupper(rval) - _T('@') ;
588 break;
589
590 case _T('X'):
591 rval = 0;
592 ++(*s);
593 if( ISHEXDIGIT(**s) ) {
594 rval = hex2bin( *(*s)++ );
595 }
596 if( ISHEXDIGIT(**s) ) {
597 rval <<= 4;
598 rval |= hex2bin( *(*s)++ );
599 }
600 if( ISHEXDIGIT(**s) ) {
601 rval <<= 4;
602 rval |= hex2bin( *(*s)++ );
603 }
604 --(*s);
605 break;
606
607 default:
608 if( !ISOCTDIGIT(**s) )
609 rval = **s;
610 else {
611 ++(*s);
612 rval = oct2bin( *(*s)++ );
613 if( ISOCTDIGIT(**s) ) {
614 rval <<= 3;
615 rval |= oct2bin( *(*s)++ );
616 }
617 if( ISOCTDIGIT(**s) ) {
618 rval <<= 3;
619 rval |= oct2bin( *(*s)++ );
620 }
621 --(*s);
622 }
623 break;
624 }
625 ++(*s);
626 }
627 return rval;
628}
629
630}
631
632//#endif
633
634
635//==============================================================================
636
#define PRECONDITION(condition)
Definition checks.h:227
This class represents regular expressions.
Definition regexp.h:94
TRegexp & operator=(const TRegexp &r)
Definition regexp.cpp:64
StatVal status() noexcept
Definition regexp.cpp:74
StatVal
StatVal enumerates the status conditions returned by TRegexp::status.
Definition regexp.h:97
@ OK
Means the given regular expression is legal.
Definition regexp.h:98
size_t find(const tstring &s, size_t *len, size_t start=0) const
Definition regexp.cpp:86
TRegexp(const tchar *cp)
Definition regexp.cpp:29
#define _istdigit
Definition cygwin.h:70
unsigned char _TUCHAR
Definition cygwin.h:44
#define _T(x)
Definition cygwin.h:51
Object Windows Library (OWLNext Core)
Definition animctrl.h:22
_TUCHAR pattern
Definition regexp.cpp:145
int makepat(const tchar *exp, PatternType *pat, size_t maxpattern)
Definition regexp.cpp:189
utchar PatternType
Definition regexp.cpp:20
char tchar
Definition defs.h:77
const tchar * patcmp(const tchar *, const pattern *, const tchar *)
Definition regexp.cpp:379
int esc(const tchar **)
Definition regexp.cpp:549
std::string tstring
Definition defs.h:79
const tchar * matchs(const tchar *str, const PatternType *pat, tchar **startpat)
Definition regexp.cpp:351
unsigned int uint
Definition number.h:25
unsigned char utchar
Definition defs.h:78
action
Definition regexp.cpp:133
@ M_EOL
Definition regexp.cpp:136
@ M_CLOSE
Definition regexp.cpp:140
@ M_OPT
Definition regexp.cpp:139
@ M_PCLOSE
Definition regexp.cpp:141
@ M_BOL
Definition regexp.cpp:135
@ M_ANY
Definition regexp.cpp:137
@ M_CCL
Definition regexp.cpp:138
General definitions used by all ObjectWindows programs.
#define MAPSIZE
Definition regexp.cpp:151
#define ISHEXDIGIT(x)
Definition regexp.cpp:525
#define OPT
Definition regexp.cpp:129
#define E_NOMEM
Definition regexp.cpp:173
#define BOL
Definition regexp.cpp:121
#define NCCL
Definition regexp.cpp:126
#define PCLOSE
Definition regexp.cpp:128
#define EOL
Definition regexp.cpp:122
#define E_NONE
Definition regexp.cpp:171
#define SETBIT(b, map)
Definition regexp.cpp:168
#define CCLEND
Definition regexp.cpp:125
#define ANY
Definition regexp.cpp:123
#define ISOCTDIGIT(x)
Definition regexp.cpp:529
#define E_ILLEGAL
Definition regexp.cpp:172
#define TSTBIT(b, map)
Definition regexp.cpp:169
#define CCL
Definition regexp.cpp:124
#define E_PAT
Definition regexp.cpp:174
#define CLOSURE
Definition regexp.cpp:127
#define ADVANCE(pat)
Definition regexp.cpp:161