regex.h

00001 /* Definitions for data structures and routines for the regular
00002   expression library, version 0.12.
00003 
00004   Copyright (C) 1985, 1989, 1990, 1991, 1992, 1993 Free Software Foundation, Inc.
00005 
00006   This program is free software; you can redistribute it and/or modify
00007   it under the terms of the GNU General Public License as published by
00008   the Free Software Foundation; either version 2, or (at your option)
00009   any later version.
00010 
00011   This program is distributed in the hope that it will be useful,
00012   but WITHOUT ANY WARRANTY; without even the implied warranty of
00013   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014   GNU General Public License for more details.
00015 
00016   You should have received a copy of the GNU General Public License
00017   along with this program; if not, write to the Free Software
00018   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
00019 
00020 #ifndef __REGEXP_LIBRARY_H__
00021 #define __REGEXP_LIBRARY_H__
00022 
00023 #ifdef __cplusplus
00024 extern "C"
00025 {
00026 #endif
00027 
00028     /* POSIX says that <sys/types.h> must be included (by the caller) before
00029        <regex.h>.  */
00030 
00031 #ifdef VMS 
00032     /* VMS doesn't have `size_t' in <sys/types.h>, even though POSIX says it
00033        should be there.  */
00034 #include <stddef.h>
00035 #endif
00036 
00037 
00038     /* The following bits are used to determine the regexp syntax we
00039        recognize.  The set/not-set meanings are chosen so that Emacs syntax
00040        remains the value 0.  The bits are given in alphabetical order, and
00041        the definitions shifted by one from the previous bit; thus, when we
00042        add or remove a bit, only one other definition need change.  */
00043     typedef unsigned reg_syntax_t;
00044 
00045     /* If this bit is not set, then \ inside a bracket expression is literal.
00046        If set, then such a \ quotes the following character.  */
00047 #define RE_BACKSLASH_ESCAPE_IN_LISTS (1)
00048 
00049     /* If this bit is not set, then + and ? are operators, and \+ and \? are
00050          literals. 
00051        If set, then \+ and \? are operators and + and ? are literals.  */
00052 #define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1)
00053 
00054     /* If this bit is set, then character classes are supported.  They are:
00055          [:alpha:], [:upper:], [:lower:],  [:digit:], [:alnum:], [:xdigit:],
00056          [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:].
00057        If not set, then character classes are not supported.  */
00058 #define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1)
00059 
00060     /* If this bit is set, then ^ and $ are always anchors (outside bracket
00061          expressions, of course).
00062        If this bit is not set, then it depends:
00063             ^  is an anchor if it is at the beginning of a regular
00064                expression or after an open-group or an alternation operator;
00065             $  is an anchor if it is at the end of a regular expression, or
00066                before a close-group or an alternation operator.  
00067      
00068        This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because
00069        POSIX draft 11.2 says that * etc. in leading positions is undefined.
00070        We already implemented a previous draft which made those constructs
00071        invalid, though, so we haven't changed the code back.  */
00072 #define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1)
00073 
00074     /* If this bit is set, then special characters are always special
00075          regardless of where they are in the pattern.
00076        If this bit is not set, then special characters are special only in
00077          some contexts; otherwise they are ordinary.  Specifically, 
00078          * + ? and intervals are only special when not after the beginning,
00079          open-group, or alternation operator.  */
00080 #define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1)
00081 
00082     /* If this bit is set, then *, +, ?, and { cannot be first in an re or
00083          immediately after an alternation or begin-group operator.  */
00084 #define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1)
00085 
00086     /* If this bit is set, then . matches newline.
00087        If not set, then it doesn't.  */
00088 #define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1)
00089 
00090     /* If this bit is set, then . doesn't match NUL.
00091        If not set, then it does.  */
00092 #define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1)
00093 
00094     /* If this bit is set, nonmatching lists [^...] do not match newline.
00095        If not set, they do.  */
00096 #define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1)
00097 
00098     /* If this bit is set, either \{...\} or {...} defines an
00099          interval, depending on RE_NO_BK_BRACES. 
00100        If not set, \{, \}, {, and } are literals.  */
00101 #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1)
00102 
00103     /* If this bit is set, +, ? and | aren't recognized as operators.
00104        If not set, they are.  */
00105 #define RE_LIMITED_OPS (RE_INTERVALS << 1)
00106 
00107     /* If this bit is set, newline is an alternation operator.
00108        If not set, newline is literal.  */
00109 #define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1)
00110 
00111     /* If this bit is set, then `{...}' defines an interval, and \{ and \}
00112          are literals.
00113       If not set, then `\{...\}' defines an interval.  */
00114 #define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1)
00115 
00116     /* If this bit is set, (...) defines a group, and \( and \) are literals.
00117        If not set, \(...\) defines a group, and ( and ) are literals.  */
00118 #define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1)
00119 
00120     /* If this bit is set, then \<digit> matches <digit>.
00121        If not set, then \<digit> is a back-reference.  */
00122 #define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1)
00123 
00124     /* If this bit is set, then | is an alternation operator, and \| is literal.
00125        If not set, then \| is an alternation operator, and | is literal.  */
00126 #define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1)
00127 
00128     /* If this bit is set, then an ending range point collating lower
00129          than the starting range point, as in [z-a], is invalid.
00130        If not set, then when the ending range point collates lower than
00131          the starting range point, the range is ignored.  */
00132 #define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1)
00133 
00134     /* If this bit is set, then an unmatched ) is ordinary.
00135        If not set, then an unmatched ) is invalid.  */
00136 #define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1)
00137 
00138     /* This global variable defines the particular regexp syntax to use (for
00139        some interfaces).  When a regexp is compiled, the syntax used is
00140        stored in the pattern buffer, so changing this does not affect
00141        already-compiled regexps.  */
00142     extern reg_syntax_t re_syntax_options;
00143     
00144     /* Define combinations of the above bits for the standard possibilities.
00145        (The [[[ comments delimit what gets put into the Texinfo file, so
00146        don't delete them!)  */ 
00147     /* [[[begin syntaxes]]] */
00148 #define RE_SYNTAX_EMACS 0
00149 
00150 #define RE_SYNTAX_AWK                                                   \
00151   (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL                       \
00152    | RE_NO_BK_PARENS            | RE_NO_BK_REFS                         \
00153    | RE_NO_BK_VBAR               | RE_NO_EMPTY_RANGES                   \
00154    | RE_UNMATCHED_RIGHT_PAREN_ORD)
00155 
00156 #define RE_SYNTAX_POSIX_AWK                                             \
00157   (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS)
00158 
00159 #define RE_SYNTAX_GREP                                                  \
00160   (RE_BK_PLUS_QM              | RE_CHAR_CLASSES                         \
00161    | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS                            \
00162    | RE_NEWLINE_ALT)
00163 
00164 #define RE_SYNTAX_EGREP                                                 \
00165   (RE_CHAR_CLASSES        | RE_CONTEXT_INDEP_ANCHORS                    \
00166    | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE                    \
00167    | RE_NEWLINE_ALT       | RE_NO_BK_PARENS                             \
00168    | RE_NO_BK_VBAR)
00169 
00170 #define RE_SYNTAX_POSIX_EGREP                                           \
00171   (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES)
00172 
00173     /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff.  */
00174 #define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC
00175 
00176 #define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC
00177 
00178     /* Syntax bits common to both basic and extended POSIX regex syntax.  */
00179 #define _RE_SYNTAX_POSIX_COMMON                                         \
00180   (RE_CHAR_CLASSES | RE_DOT_NEWLINE      | RE_DOT_NOT_NULL              \
00181    | RE_INTERVALS  | RE_NO_EMPTY_RANGES)
00182 
00183 #define RE_SYNTAX_POSIX_BASIC                                           \
00184   (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM)
00185 
00186     /* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes
00187        RE_LIMITED_OPS, i.e., \? \+ \| are not recognized.  Actually, this
00188        isn't minimal, since other operators, such as \`, aren't disabled.  */
00189 #define RE_SYNTAX_POSIX_MINIMAL_BASIC                                   \
00190   (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS)
00191 
00192 #define RE_SYNTAX_POSIX_EXTENDED                                        \
00193   (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS                   \
00194    | RE_CONTEXT_INDEP_OPS  | RE_NO_BK_BRACES                            \
00195    | RE_NO_BK_PARENS       | RE_NO_BK_VBAR                              \
00196    | RE_UNMATCHED_RIGHT_PAREN_ORD)
00197 
00198     /* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
00199        replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added.  */
00200 #define RE_SYNTAX_POSIX_MINIMAL_EXTENDED                                \
00201   (_RE_SYNTAX_POSIX_COMMON  | RE_CONTEXT_INDEP_ANCHORS                  \
00202    | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES                           \
00203    | RE_NO_BK_PARENS        | RE_NO_BK_REFS                             \
00204    | RE_NO_BK_VBAR          | RE_UNMATCHED_RIGHT_PAREN_ORD) 
00205     /* [[[end syntaxes]]] */
00206     
00207     /* Maximum number of duplicates an interval can allow.  Some systems
00208        (erroneously) define this in other header files, but we want our
00209        value, so remove any previous define.  */
00210 #ifdef RE_DUP_MAX
00211 #undef RE_DUP_MAX
00212 #endif
00213 #define RE_DUP_MAX ((1 << 15) - 1)
00214 
00215 
00216     /* POSIX `cflags' bits (i.e., information for `regcomp').  */
00217 
00218     /* If this bit is set, then use extended regular expression syntax.
00219        If not set, then use basic regular expression syntax.  */
00220 #define REG_EXTENDED 1
00221 
00222     /* If this bit is set, then ignore case when matching.
00223        If not set, then case is significant.  */
00224 #define REG_ICASE (REG_EXTENDED << 1)
00225 
00226     /* If this bit is set, then anchors do match at newline
00227          characters in the string.
00228        If not set, then anchors do not match at newlines.  */
00229 #define REG_NEWLINE (REG_ICASE << 1)
00230 
00231     /* If this bit is set, then report only success or fail in regexec.
00232        If not set, then returns differ between not matching and errors.  */
00233 #define REG_NOSUB (REG_NEWLINE << 1)
00234 
00235 
00236     /* POSIX `eflags' bits (i.e., information for regexec).  */
00237 
00238     /* If this bit is set, then the beginning-of-line operator doesn't match
00239          the beginning of the string (presumably because it's not the
00240          beginning of a line).
00241        If not set, then the beginning-of-line operator does match the
00242          beginning of the string.  */
00243 #define REG_NOTBOL 1
00244 
00245     /* Like REG_NOTBOL, except for the end-of-line.  */
00246 #define REG_NOTEOL (1 << 1)
00247 
00248 
00249     /* If any error codes are removed, changed, or added, update the
00250        `re_error_msg' table in regex.c.  */
00251     typedef enum
00252     {
00253         REG_NOERROR = 0,        /* Success.  */
00254         REG_NOMATCH,            /* Didn't find a match (for regexec).  */
00255 
00256         /* POSIX regcomp return error codes.  (In the order listed in the
00257            standard.)  */
00258         REG_BADPAT,             /* Invalid pattern.  */
00259         REG_ECOLLATE,           /* Not implemented.  */
00260         REG_ECTYPE,             /* Invalid character class name.  */
00261         REG_EESCAPE,            /* Trailing backslash.  */
00262         REG_ESUBREG,            /* Invalid back reference.  */
00263         REG_EBRACK,             /* Unmatched left bracket.  */
00264         REG_EPAREN,             /* Parenthesis imbalance.  */
00265         REG_EBRACE,             /* Unmatched \{.  */
00266         REG_BADBR,              /* Invalid contents of \{\}.  */
00267         REG_ERANGE,             /* Invalid range end.  */
00268         REG_ESPACE,             /* Ran out of memory.  */
00269         REG_BADRPT,             /* No preceding re for repetition op.  */
00270 
00271         /* Error codes we've added.  */
00272         REG_EEND,               /* Premature end.  */
00273         REG_ESIZE,              /* Compiled pattern bigger than 2^16 bytes.  */
00274         REG_ERPAREN             /* Unmatched ) or \); not returned from regcomp.  */
00275     } reg_errcode_t;
00276     
00277     /* This data structure represents a compiled pattern.  Before calling
00278        the pattern compiler, the fields `buffer', `allocated', `fastmap',
00279        `translate', and `no_sub' can be set.  After the pattern has been
00280        compiled, the `re_nsub' field is available.  All other fields are
00281        private to the regex routines.  */
00282 
00283     struct re_pattern_buffer {
00284         /* [[[begin pattern_buffer]]] */
00285         /* Space that holds the compiled pattern.  It is declared as
00286                  `unsigned char *' because its elements are
00287                   sometimes used as array indexes.  */
00288         unsigned char *buffer;
00289 
00290         /* Number of bytes to which `buffer' points.  */
00291         unsigned long allocated;
00292 
00293         /* Number of bytes actually used in `buffer'.  */
00294         unsigned long used;
00295 
00296         /* Syntax setting with which the pattern was compiled.  */
00297         reg_syntax_t syntax;
00298 
00299         /* Pointer to a fastmap, if any, otherwise zero.  re_search uses
00300            the fastmap, if there is one, to skip over impossible
00301            starting points for matches.  */
00302         char *fastmap;
00303 
00304         /* Either a translate table to apply to all characters before
00305            comparing them, or zero for no translation.  The translation
00306            is applied to a pattern when it is compiled and to a string
00307            when it is matched.  */
00308         char *translate;
00309 
00310         /* Number of subexpressions found by the compiler.  */
00311         size_t re_nsub;
00312 
00313         /* Zero if this pattern cannot match the empty string, one else.
00314            Well, in truth it's used only in `re_search_2', to see
00315            whether or not we should use the fastmap, so we don't set
00316            this absolutely perfectly; see `re_compile_fastmap' (the
00317            `duplicate' case).  */
00318 unsigned can_be_null :
00319         1;
00320 
00321         /* If REGS_UNALLOCATED, allocate space in the `regs' structure
00322              for `max (RE_NREGS, re_nsub + 1)' groups.
00323            If REGS_REALLOCATE, reallocate space if necessary.
00324            If REGS_FIXED, use what's there.  */
00325 #define REGS_UNALLOCATED 0
00326 #define REGS_REALLOCATE 1
00327 #define REGS_FIXED 2
00328 
00329 unsigned regs_allocated :
00330         2;
00331 
00332         /* Set to zero when `regex_compile' compiles a pattern; set to one
00333            by `re_compile_fastmap' if it updates the fastmap.  */
00334 unsigned fastmap_accurate :
00335         1;
00336 
00337         /* If set, `re_match_2' does not return information about
00338            subexpressions.  */
00339 unsigned no_sub :
00340         1;
00341 
00342         /* If set, a beginning-of-line anchor doesn't match at the
00343            beginning of the string.  */
00344 unsigned not_bol :
00345         1;
00346 
00347         /* Similarly for an end-of-line anchor.  */
00348 unsigned not_eol :
00349         1;
00350 
00351         /* If true, an anchor at a newline matches.  */
00352 unsigned newline_anchor :
00353         1;
00354 
00355         /* [[[end pattern_buffer]]] */
00356     };
00357 
00358     typedef struct re_pattern_buffer regex_t;
00359 
00360 
00361     /* search.c (search_buffer) in Emacs needs this one opcode value.  It is
00362        defined both in `regex.c' and here.  */
00363 #define RE_EXACTN_VALUE 1
00364     
00365     /* Type for byte offsets within the string.  POSIX mandates this.  */
00366     typedef int regoff_t;
00367 
00368 
00369     /* This is the structure we store register match data in.  See
00370        regex.texinfo for a full description of what registers match.  */
00371     struct re_registers {
00372         unsigned num_regs;  /* Number of registers `start' and `end' hold (cf. NUM_REGS in `re_set_registers').  */
00373         regoff_t *start;    /* Array of byte offsets; presumably where each register's match begins -- confirm in regex.texinfo.  */
00374         regoff_t *end;      /* Array of byte offsets parallel to `start'; presumably where each match ends -- confirm likewise.  */
00375     };
00376 
00377 
00378     /* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
00379        `re_match_2' returns information about at least this many registers
00380        the first time a `regs' structure is passed.  */
00381 #ifndef RE_NREGS
00382 #define RE_NREGS 30
00383 #endif
00384 
00385 
00386     /* POSIX specification for registers.  Aside from the different names than
00387        `re_registers', POSIX uses an array of structures, instead of a
00388        structure of arrays.  */
00389     typedef struct {
00390         regoff_t rm_so;  /* Byte offset from string's start to substring's start.  */
00391         regoff_t rm_eo;  /* Byte offset from string's start to substring's end.  */
00392     }
00393     regmatch_t;
00394     
00395     /* Declarations for routines.  */
00396 
00397     /* To avoid duplicating every routine declaration -- once with a
00398        prototype (if we are ANSI), and once without (if we aren't) -- we
00399        use the following macro to declare argument types.  This
00400        unfortunately clutters up the declarations a bit, but I think it's
00401        worth it.  */
00402 
00403 #if __STDC__ || defined (__cplusplus)
00404     /* C++ needs real prototypes too: there `()' declares no arguments.  */
00405 #define _RE_ARGS(args) args
00406 
00407 #else /* not __STDC__ and not __cplusplus */
00408 
00409 #define _RE_ARGS(args) ()
00410 
00411 #endif /* not __STDC__ and not __cplusplus */
00412 
00413     /* Sets the current default syntax to SYNTAX, and return the old syntax.
00414        You can also simply assign to the `re_syntax_options' variable.  */
00415     extern reg_syntax_t re_set_syntax _RE_ARGS ((reg_syntax_t syntax));
00416 
00417     /* Compile the regular expression PATTERN, with length LENGTH
00418        and syntax given by the global `re_syntax_options', into the buffer
00419        BUFFER.  Return NULL if successful, and an error string if not.  */
00420     extern const char *re_compile_pattern
00421         _RE_ARGS ((const char *pattern, int length,
00422                    struct re_pattern_buffer *buffer));
00423 
00424 
00425     /* Compile a fastmap for the compiled pattern in BUFFER; used to
00426        accelerate searches.  Return 0 if successful and -2 if there was
00427        an internal error.  */
00428     extern int re_compile_fastmap _RE_ARGS ((struct re_pattern_buffer *buffer));
00429 
00430 
00431     /* Search in the string STRING (with length LENGTH) for the pattern
00432        compiled into BUFFER.  Start searching at position START, for RANGE
00433        characters.  Return the starting position of the match, -1 for no
00434        match, or -2 for an internal error.  Also return register
00435        information in REGS (if REGS is nonzero and BUFFER->no_sub is zero).  */
00436     extern int re_search
00437         _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string,
00438                    int length, int start, int range, struct re_registers *regs));
00439 
00440 
00441     /* Like `re_search', but search in the concatenation of STRING1 and
00442        STRING2.  Also, stop searching at index START + STOP.  */
00443     extern int re_search_2
00444         _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1,
00445                    int length1, const char *string2, int length2,
00446                    int start, int range, struct re_registers *regs, int stop));
00447 
00448 
00449     /* Like `re_search', but return how many characters in STRING the regexp
00450        in BUFFER matched, starting at position START.  */
00451     extern int re_match
00452         _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string,
00453                    int length, int start, struct re_registers *regs));
00454 
00455 
00456     /* Relates to `re_match' as `re_search_2' relates to `re_search'.  */
00457     extern int re_match_2
00458         _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1,
00459                    int length1, const char *string2, int length2,
00460                    int start, struct re_registers *regs, int stop));
00461 
00462 
00463     /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
00464        ENDS.  Subsequent matches using BUFFER and REGS will use this memory
00465        for recording register information.  STARTS and ENDS must be
00466        allocated with malloc, and must each be at least `NUM_REGS * sizeof
00467        (regoff_t)' bytes long.
00468      
00469        If NUM_REGS == 0, then subsequent matches should allocate their own
00470        register data.
00471      
00472        Unless this function is called, the first search or match using
00473        BUFFER will allocate its own register data, without
00474        freeing the old data.  */
00475     extern void re_set_registers
00476         _RE_ARGS ((struct re_pattern_buffer *buffer, struct re_registers *regs,
00477                    unsigned num_regs, regoff_t *starts, regoff_t *ends));
00478 
00479     /* 4.2 bsd compatibility.  */
00480     extern char *re_comp _RE_ARGS ((const char *));
00481     extern int re_exec _RE_ARGS ((const char *));
00482 
00483     /* POSIX compatibility.  */
00484     extern int regcomp _RE_ARGS ((regex_t *preg, const char *pattern, int cflags));
00485     extern int regexec
00486         _RE_ARGS ((const regex_t *preg, const char *string, size_t nmatch,
00487                    regmatch_t pmatch[], int eflags));
00488     extern size_t regerror
00489         _RE_ARGS ((int errcode, const regex_t *preg, char *errbuf,
00490                    size_t errbuf_size));
00491     extern void regfree _RE_ARGS ((regex_t *preg));
00492 
00493 #ifdef __cplusplus
00494 }   /* Close extern "C" inside the guard so a repeated include stays balanced.  */
00495 #endif
00496 
00497 #endif /* not __REGEXP_LIBRARY_H__ */
00498 
00499 
00500 /*
00501 Local variables:
00502 make-backup-files: t
00503 version-control: t
00504 trim-versions-without-asking: nil
00505 End:
00506 */

Generated on Thu Feb 14 11:16:02 2008 for Jackdmp by  doxygen 1.5.1