Click here to Skip to main content
15,885,278 members
Articles / Desktop Programming / WTL

Henry Spencer's Regexp Engine Revisited

Rate me:
Please Sign up or sign in to vote.
4.88/5 (19 votes)
2 Jul 2003 · 17 min read · 176.6K · 3.4K · 67
A small, Unicode-aware regular expression engine based on Henry Spencer's early work
/*
 * regsub
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "regexp_int.h"
#include "regexp_custom.h"
#include "regmagic.h"

static int internal_sub(const CHAR_TYPE* s, const CHAR_TYPE* source, regmatch matches[10], CHAR_TYPE* dest)
{
    register int length = 0;
    register int no;
    register const CHAR_TYPE* src = source;
    register int len = 0;
    register CHAR_TYPE* dst = dest;
    register CHAR_TYPE c;
    
	while ((c = *src++) != LIT('\0')) {
		if (c == LIT('&'))
			no = 0;
		else if (c == LIT('\\') && cisdigit(*src))
			no = *src++ - LIT('0');
		else
			no = -1;

		if (no < 0) {	/* Ordinary character. */
			if (c == LIT('\\') && (*src == LIT('\\') || *src == LIT('&')))
				c = *src++;
            ++length;
            if(dst)
                *dst++ = c;
        } else if (matches[no].begin != -1 && matches[no].end != -1 &&
                   matches[no].end > matches[no].begin) {
			len = matches[no].end - matches[no].begin;
            length += len;
            if(dst)
            {
                cstrncpy(dst, s + matches[no].begin, len);
                dst += len;
                if(*(dst - 1) == LIT('\0'))
                    return REGEXP_EEND;
            }
		}
	}
    if(dst)
    {
        *dst++ = LIT('\0');
        return 1;
    }
    else
        return length + 1;
}

int re_subcount_w(const regexp* rp, const CHAR_TYPE* s, const CHAR_TYPE* src, regmatch matches[10])
{
    register int error;
    
	if (rp == NULL || src == NULL || s == NULL || matches == NULL) {
		re_report("NULL parameter to regsub");
		return REGEXP_BADARG;
	}
	if ((UCHAR_TYPE)*(rp->program) != MAGIC) {
		re_report("damaged regexp");
		return REGEXP_BADARG;
	}
    
    if ((error = re_exec_w(rp, s, 10, matches)) < 1)
        return error;

    /* run count */
    return internal_sub(s, src, matches, NULL);
}

int re_dosub_w(const CHAR_TYPE* s, const CHAR_TYPE* src, regmatch matches[10], CHAR_TYPE* dest)
{
	if (src == NULL || s == NULL || matches == NULL || dest == NULL) {
		re_report("NULL parameter to regsub");
		return REGEXP_BADARG;
	}

    return internal_sub(s, src, matches, dest);
}

/*
 - reg_sub_w - perform substitutions
 */
int re_sub_w(const regexp* rp, const CHAR_TYPE* s, const CHAR_TYPE* source, CHAR_TYPE** dest)
{
    /* note: there can only be 10 expressions (\0 to \9) */
    regmatch matches[10];
    int error;

    if(dest)
        *dest = NULL;
	if (rp == NULL || source == NULL || s == NULL || dest == NULL) {
		re_report("NULL parameter to regsub");
		return REGEXP_BADARG;
	}
	if ((UCHAR_TYPE)*(rp->program) != MAGIC) {
		re_report("damaged regexp");
		return REGEXP_BADARG;
	}
    /* figure out how much room is needed */
    if((error = re_subcount_w(rp, s, source, matches)) < 1)
        return error;

    /* allocate memory */
    *dest = re_malloc(error * sizeof(CHAR_TYPE));
    if(!*dest)
    {
        re_report("out of memory allocating substitute destination");
        return REGEXP_ESPACE;
    }
    
    /* do actual substitution */
    if((error = re_dosub_w(s, source, matches, *dest)) < 0)
    {
        re_cfree(*dest);
        *dest = NULL;
        return error;
    }
    /* done */
    return error;
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Web Developer
Canada
I'm a senior software developer, working at Silanis Technology (http://www.silanis.com). I've acquired quite a bit of experience (usually the hard way!) in Win32 and raw COM programming on the job. In my spare time, I like to monkey around with POSIX code.

I'm mostly interested in portable C++ libraries. I'm happiest when I develop portable C++ code--C++ being such a powerful language as long as one keeps clear of the rather nasty subtleties of the language.

I hope the articles I contribute will be of some help to someone. If even one person gains a few hours through use of that code, I'll be very happy.

When not coding, I like to listen to Anime and try to learn Japanese. It's not working too well so far, unfortunately. :{)

Comments and Discussions