Click here to Skip to main content
15,881,044 members
Articles / Programming Languages / Lua

Programming in ObjectScript 1.5

Rate me:
Please Sign up or sign in to vote.
5.00/5 (3 votes)
25 Mar 2013MIT22 min read 18.4K   125   4  
ObjectScript is a new embedded programming language that combines the strengths of JavaScript, Lua, Ruby, Python and PHP. It takes its syntax from JavaScript, multiple return values from Lua, syntactic sugar from Ruby, magic methods from Python, and much more.
#include "../objectscript.h"
#include "../os-binder.h"
#include "os-regexp.h"
#include "pcre.h"

#define REGEXP_PATTERN_ORDER			1
#define REGEXP_SET_ORDER				2
#define REGEXP_OFFSET_CAPTURE			(1<<8)

#define	REGEXP_SPLIT_NO_EMPTY			(1<<0)
#define REGEXP_SPLIT_DELIM_CAPTURE		(1<<1)
#define REGEXP_SPLIT_OFFSET_CAPTURE		(1<<2)

#define REGEXP_GLOBAL					(1<<0)

namespace ObjectScript {

class RegexpOS: public OS
{
public:

	/**
	 * Raise a script-level RegexpException carrying `msg`.
	 *
	 * Fetches the global named "RegexpException", calls it with `msg` as its
	 * single argument (one result requested) and then invokes
	 * OS::setException().
	 */
	static void triggerError(OS * os, const OS::String& msg)
	{
		const int arg_count = 1;
		const int result_count = 1;

		os->getGlobal(OS_TEXT("RegexpException"));	// the callee
		os->pushGlobals();							// pushed between callee and argument (cf. the `this` slot in replaceCallback)
		os->pushString(msg);						// the only argument
		os->call(arg_count, result_count);
		os->setException();
	}

	/** Convenience overload: wraps the C string in an OS::String and forwards it. */
	static void triggerError(OS * os, const OS_CHAR * msg)
	{
		const OS::String wrapped(os, msg);
		triggerError(os, wrapped);
	}

	struct RegexpCache
	{
		OS * os;
		pcre * re;
		pcre_extra *extra;
		int compile_options;
		int regexp_options;
		int ref_count;
#if HAVE_SETLOCALE
		need to be implemented?
		char *locale;
		unsigned const char *tables;
#endif
		RegexpCache(OS * p_os)
		{
			os = p_os;
			re = NULL;
			extra = NULL;
			regexp_options = 0;
			compile_options = 0;
			ref_count = 1;
		}

		~RegexpCache()
		{
			pcre_free(re);
			if (extra) pcre_free(extra);
#if HAVE_SETLOCALE
			need to be implemented?
			if ((void*)pce->tables) pefree((void*)pce->tables, 1);
			pefree(pce->locale, 1);
#endif
		}

		RegexpCache * retain()
		{
			ref_count++;
			return this;
		}

		void release()
		{
			if(--ref_count <= 0){
				OS_ASSERT(ref_count == 0);
				OS * os = this->os;
				this->~RegexpCache();
				os->free(this);
			}		
		}

		static void initLibrary(OS * os);
	};

	static RegexpCache * toRegexpCache(OS*);

	struct Regexp
	{
		RegexpCache * cache;
		
		/** Binds this Regexp to a compiled pattern and takes one reference on it. */
		Regexp(RegexpCache * p_cache)
			: cache(p_cache)
		{
			OS_ASSERT(cache);
			cache->retain();
		}

		/** Gives back the reference on the compiled pattern held by this object. */
		~Regexp()
		{
			RegexpCache * const held = cache;
			OS_ASSERT(held);
			held->release();
		}

		static bool isNumericString(const OS_CHAR * str)
		{
			for(; *str; str++){
				if(!OS_IS_ALNUM(*str)){
					return false;
				}
			}
			return true;
		}

		/**
		 * Build a table mapping capture-group numbers to their names.
		 *
		 * The table has `num_subpats` slots. Slot i points at the name of group i
		 * inside PCRE's own name table (the strings are not copied), or is NULL
		 * when group i is unnamed.
		 *
		 * @param num_subpats number of slots to allocate (capture count + 1 in the callers).
		 * @return the table, to be released by the caller with os->free(); NULL
		 *         after an error has been raised through triggerError().
		 */
		char **makeSubpatsTable(int num_subpats)
		{
			OS_ASSERT(cache);
			OS * os = cache->os;
			pcre_extra *extra = cache->extra;
			int name_cnt = 0, name_size, ni = 0;
			char *name_table;
			unsigned short name_idx;
			char **subpat_names = (char **)os->malloc(num_subpats * sizeof(char *) OS_DBG_FILEPOS);

			/* Callers test every slot for NULL, so unnamed groups must not be
			left holding whatever the allocator returned. */
			for (int i = 0; i < num_subpats; i++) {
				subpat_names[i] = NULL;
			}

			int rc = pcre_fullinfo(cache->re, extra, PCRE_INFO_NAMECOUNT, &name_cnt);
			if (rc < 0) {
				triggerError(os, OS::String::format(os, "PCRE fullinfo (makeSubpatsTable) error code %d", rc));
				os->free(subpat_names);
				return NULL;
			}
			if (name_cnt > 0) {
				int rc1 = pcre_fullinfo(cache->re, extra, PCRE_INFO_NAMETABLE, &name_table);
				int rc2 = pcre_fullinfo(cache->re, extra, PCRE_INFO_NAMEENTRYSIZE, &name_size);
				rc = rc2 ? rc2 : rc1;
				if (rc < 0) {
					triggerError(os, OS::String::format(os, "PCRE fullinfo (makeSubpatsTable #2) error code %d", rc));
					os->free(subpat_names);
					return NULL;
				}

				while (ni++ < name_cnt) {
					/* Each entry starts with the group number as two bytes, most
					significant first, followed by the NUL-terminated name. */
					name_idx = 0x100 * (unsigned char)name_table[0] + (unsigned char)name_table[1];
					subpat_names[name_idx] = name_table + 2;
					if (isNumericString(subpat_names[name_idx])) {
						triggerError(os, "PCRE numeric named subpatterns are not allowed");
						os->free(subpat_names);
						return NULL;
					}
					name_table += name_size;
				}
			}
			return subpat_names;
		}

		void addNextString(const char * str, int len)
		{
			OS_ASSERT(cache);
			OS * os = cache->os;
			os->pushStackValue();
			os->pushString(str, len);
			os->addProperty(false);
		}

		void addNextNumber(int value)
		{
			OS_ASSERT(cache);
			OS * os = cache->os;
			os->pushStackValue();
			os->pushNumber(value);
			os->addProperty(false);
		}

		void addOffsetPair(const char *str, int len, int offset, const char *name)
		{
			OS_ASSERT(cache);
			OS * os = cache->os;
			os->newArray();

			/* Add (match, offset) to the return value */
			addNextString(str, len);
			addNextNumber(offset);

			if (name) {
				os->pushStackValue(-2);
				os->pushStackValue(-2);
				os->setProperty(name, false);
			}
			os->pushStackValue(-2);
			os->pushStackValue(-2);
			os->addProperty(false);

			os->pop();
		}

		void addAssocString(const char * name, const char *str, int len)
		{
			OS_ASSERT(cache);
			OS * os = cache->os;
			os->pushStackValue();
			os->pushString(str, len);
			os->setProperty(name, false);
		}

		/**
		 * Turn a pcre_exec() result code into a RegexpException.
		 *
		 * Four codes have dedicated messages; every other code is reported as an
		 * internal error together with its number.
		 *
		 * @param pcre_code result code returned by pcre_exec().
		 * @return always true: an error has been raised for every input.
		 */
		bool checkExecError(int pcre_code)
		{
			OS_ASSERT(cache);
			OS * os = cache->os;

			const char * known_msg = NULL;
			switch (pcre_code) {
				case PCRE_ERROR_MATCHLIMIT:
					known_msg = "PCRE backtrack limit error";
					break;

				case PCRE_ERROR_RECURSIONLIMIT:
					known_msg = "PCRE recursion limit error";
					break;

				case PCRE_ERROR_BADUTF8:
					known_msg = "PCRE bad utf8 error";
					break;

				case PCRE_ERROR_BADUTF8_OFFSET:
					known_msg = "PCRE bad utf8 offset error";
					break;

				default:
					break;
			}

			if (known_msg) {
				triggerError(os, known_msg);
			} else {
				triggerError(os, OS::String::format(os, "PCRE internal error code %d", pcre_code));
			}
			return true;
		}

		/**
		 * Return the compiled form of `regex`, compiling and caching it on a miss.
		 *
		 * `regex` has the form <delimiter>pattern<delimiter>modifiers. An opening
		 * bracket delimiter ( [ { < is closed by its counterpart and may nest.
		 * Recognised modifiers: i m s x (Perl compatible), A D S U X u (PCRE
		 * specific) and g (sets REGEXP_GLOBAL).
		 *
		 * Compiled patterns are kept in the script object Regexp.cache, keyed by
		 * the complete `regex` string; entries are removed once the object holds
		 * more than 512 of them.
		 *
		 * @param os    script instance.
		 * @param regex delimited pattern with optional modifiers.
		 * @return the cache entry, or NULL after an error has been raised through
		 *         triggerError().
		 */
		static RegexpCache * getRegexpCache(OS * os, const Core::String& regex)
		{
			pcre				*re = NULL;
			pcre_extra			*extra;
			int					 compile_options = 0;
			int					 soptions = 0;
			const char			*error;
			int					 erroffset;
			char				 delimiter;
			char				 start_delimiter;
			char				 end_delimiter;
			const char			*p, *pp;
			// const char			*pattern;
			int					 do_study = 0;
			int					 regexp_options = 0;
			int					 count = 0;
			unsigned const char *tables = NULL;
#if HAVE_SETLOCALE
			need to be implemented?
			char				*locale = setlocale(LC_CTYPE, NULL);
#endif

#if 1
			/* Try to lookup the cached regex entry, and if successful, just pass
			back the compiled pattern, otherwise go on and compile it. */
			os->getGlobal("Regexp");
			os->getProperty("cache", false);
			if(os->isNull()){
				/* NOTE(review): the null fetched above appears to stay on the stack
				below the new object - confirm the stack balance of this path. */
				os->newObject();
				os->getGlobal("Regexp");
				os->pushStackValue(-2);
				os->setProperty("cache", false);
			}
			int cache_id = os->getValueId();
			os->getProperty(regex, false);
			RegexpCache * cache = toRegexpCache(os);
			if(cache){
				/*
				* We use a quick pcre_fullinfo() check to see whether cache is corrupted, and if it
				* is, we flush it and compile the pattern from scratch.
				*/
				if (pcre_fullinfo(cache->re, NULL, PCRE_INFO_CAPTURECOUNT, &count) == PCRE_ERROR_BADMAGIC) {
					/* NOTE(review): this pop plus the one after the enclosing
					if-block may remove one value too many - confirm. */
					os->pushValueById(cache_id);
					os->deleteProperty(regex, false);
					os->pop();
				} else {
#if HAVE_SETLOCALE
					need to be implemented?
					if (!strcmp(cache->locale, locale)) {
#endif
					// touch item
					os->pushValueById(cache_id);
					os->deleteProperty(regex, false);
					
					os->pushValueById(cache_id);
					os->pushStackValue(-2); // pushCtypeValue(os, cache);
					os->setProperty(regex, false);

					os->pop();
					return cache;
#if HAVE_SETLOCALE
					need to be implemented?
					}
#endif
				}
			}
			os->pop();
#endif

			p = regex;

			/* Parse through the leading whitespace, and display a warning if we
			get to the end without encountering a delimiter. */
			while (OS_IS_SPACE(*p)) p++;
			if (*p == 0) {
				triggerError(os, "PCRE empty regular expression");
				return NULL;
			}

			/* Get the delimiter and display a warning if it is alphanumeric
			or a backslash. */
			delimiter = *p++;
			if (OS_IS_ALNUM(delimiter) || delimiter == '\\') {
				triggerError(os, "PCRE delimiter must not be alphanumeric or backslash");
				return NULL;
			}

			/* In the lookup string every opening bracket sits exactly five
			characters before its closing counterpart; all other characters of
			the string map to themselves. */
			start_delimiter = delimiter;
			if ((pp = OS_STRCHR("([{< )]}> )]}>", delimiter)))
				delimiter = pp[5];
			end_delimiter = delimiter;

			if (start_delimiter == end_delimiter) {
				/* We need to iterate through the pattern, searching for the ending delimiter,
				but skipping the backslashed delimiters.  If the ending delimiter is not
				found, display a warning. */
				pp = p;
				while (*pp != 0) {
					if (*pp == '\\' && pp[1] != 0) pp++;
					else if (*pp == delimiter)
						break;
					pp++;
				}
				if (*pp == 0) {
					triggerError(os, OS::String::format(os, "PCRE no ending delimiter '%c' found", delimiter));
					return NULL;
				}
			} else {
				/* We iterate through the pattern, searching for the matching ending
				* delimiter. For each matching starting delimiter, we increment nesting
				* level, and decrement it for each matching ending delimiter. If we
				* reach the end of the pattern without matching, display a warning.
				*/
				int brackets = 1; 	/* brackets nesting level */
				pp = p;
				while (*pp != 0) {
					if (*pp == '\\' && pp[1] != 0) pp++;
					else if (*pp == end_delimiter && --brackets <= 0)
						break;
					else if (*pp == start_delimiter)
						brackets++;
					pp++;
				}
				if (*pp == 0) {
					triggerError(os, OS::String::format(os, "PCRE no ending matching delimiter '%c' found", end_delimiter));
					return NULL;
				}
			}

			/* Make a copy of the actual pattern. */
			Core::String pattern(os, p, (int)(pp-p));

			/* Move on to the options */
			pp++;

			/* Parse through the options, setting appropriate flags.  Display
			a warning if we encounter an unknown modifier. */	
			while (*pp != 0) {
				switch (*pp++) {
					/* Perl compatible options */
				case 'i':	compile_options |= PCRE_CASELESS;		break;
				case 'm':	compile_options |= PCRE_MULTILINE;		break;
				case 's':	compile_options |= PCRE_DOTALL;		break;
				case 'x':	compile_options |= PCRE_EXTENDED;		break;

					/* PCRE specific options */
				case 'A':	compile_options |= PCRE_ANCHORED;		break;
				case 'D':	compile_options |= PCRE_DOLLAR_ENDONLY;break;
				case 'S':	do_study  = 1;					break;
				case 'U':	compile_options |= PCRE_UNGREEDY;		break;
				case 'X':	compile_options |= PCRE_EXTRA;			break;
				case 'u':	compile_options |= PCRE_UTF8;
					/* In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
					characters, even in UTF-8 mode. However, this can be changed by setting
					the PCRE_UCP option. */
#ifdef PCRE_UCP
					compile_options |= PCRE_UCP;
#endif			
					break;

					/* Custom preg options */
				// case 'e':	regexp_options |= REGEXP_REPLACE_EVAL;	break;
				case 'g': regexp_options |= REGEXP_GLOBAL; break;

				/*
				case ' ':
				case '\n':
					break;
				*/

				default:
					triggerError(os, OS::String::format(os, "PCRE unknown modifier '%c'", pp[-1]));
					return NULL;
				}
			}

#if HAVE_SETLOCALE
			need to be implemented?
			if (strcmp(locale, "C"))
				tables = pcre_maketables();
#endif

			/* Compile pattern and display a warning if compilation failed. */
			re = pcre_compile(pattern,
				compile_options,
				&error,
				&erroffset,
				tables);

			if (re == NULL) {
				triggerError(os, OS::String::format(os, "PCRE compilation failed: %s at offset %d", error, erroffset));
				if (tables) {
					/* Tables come from pcre_maketables(), so they go back
					through PCRE's allocator, not the script one. */
					pcre_free((void*)tables);
				}
				return NULL;
			}

			/* If study option was specified, study the pattern and
			store the result in extra for passing to pcre_exec. */
			if (do_study) {
				extra = pcre_study(re, soptions, &error);
				if (extra) {
					extra->flags |= PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
				}
				if (error != NULL) {
					triggerError(os, "PCRE error while studying pattern");
					/* Nothing owns the compiled pattern yet, so release it
					here instead of leaking it. */
					if (extra) pcre_free(extra);
					pcre_free(re);
					return NULL;
				}
			} else {
				extra = NULL;
			}

			/*
			* If we reached cache limit, clean out the items
			*/
			/* NOTE(review): the cache object pushed here does not appear to be
			popped again before returning - confirm the stack balance. */
			os->pushValueById(cache_id);
			count = os->getLen();
			for(; count > 512 && os->nextIteratorStep(); count--){
				os->pushValueById(cache_id);
				os->pushStackValue(-3);
				os->deleteProperty(false);
				os->pop(2);
			}

			/* Store the compiled pattern and extra info in the cache. */
			cache = new (os->malloc(sizeof(*cache) OS_DBG_FILEPOS)) RegexpCache(os); 
			cache->re = re;
			cache->extra = extra;
			cache->regexp_options = regexp_options;
			cache->compile_options = compile_options;
#if HAVE_SETLOCALE
			need to be implemented?
			new_entry.locale = pestrdup(locale, 1);
			new_entry.tables = tables;
#endif

#if 1
			os->pushValueById(cache_id);
			pushCtypeValue(os, cache);
			os->setProperty(regex, false);
#endif
			return cache;
		}

		/**
		 * Run the compiled pattern against `subject` and optionally collect the
		 * captured subpatterns.
		 *
		 * When the pattern carries REGEXP_GLOBAL every match is collected,
		 * otherwise only the first one.
		 *
		 * @param subject      string to search.
		 * @param subpats_id   value id of the script container receiving the
		 *                     captures; 0 to collect nothing.
		 * @param use_flags    non-zero when `flags` should be honoured.
		 * @param flags        low byte selects REGEXP_PATTERN_ORDER or
		 *                     REGEXP_SET_ORDER (global patterns only);
		 *                     REGEXP_OFFSET_CAPTURE stores (text, offset) pairs.
		 * @param start_offset byte offset to start at; a negative value counts
		 *                     from the end of the subject.
		 * @return true when at least one match was found; false when nothing
		 *         matched or after an error has been raised through triggerError().
		 */
		bool match(const OS::String& subject, int subpats_id, int use_flags, int flags, int start_offset)
		{
			OS_ASSERT(cache);
			OS * os = cache->os;
			pcre_extra * extra = cache->extra;	/* Holds results of studying */
			pcre_extra		 extra_data;		/* Used locally for exec options */
			int				 exoptions = 0;		/* Execution options */
			int				 count = 0;			/* Count of matched subpatterns */
			int				*offsets;			/* Array of subpattern offsets */
			int				 num_subpats;		/* Number of captured subpatterns */
			int				 size_offsets;		/* Size of the offsets array */
			int				 matched;			/* Has anything matched */
			int				 g_notempty = 0;	/* If the match should not be empty */
			const char	   **stringlist;		/* Holds list of subpatterns */
			char 		   **subpat_names;		/* Array for named subpatterns */
			int				 i, rc;
			int				 subpats_order;		/* Order of subpattern matches */
			int				 offset_capture;    /* Capture match offsets: yes/no */

			/* RegexpCache * pce = getRegexpCache(subject);
			if(!pce){
				return false;
			} */

			int global = cache->regexp_options & REGEXP_GLOBAL;
			subpats_order = global ? REGEXP_PATTERN_ORDER : 0;

			if (use_flags) {
				offset_capture = flags & REGEXP_OFFSET_CAPTURE;

				/*
				* subpats_order is pre-set to pattern mode so we change it only if
				* necessary.
				*/
				if (flags & 0xff) {
					subpats_order = flags & 0xff;
				}
				if ((global && (subpats_order < REGEXP_PATTERN_ORDER || subpats_order > REGEXP_SET_ORDER)) ||
					(!global && subpats_order != 0)) {
						triggerError(os, "PCRE invalid flag specified");
						return false;
				}
			} else {
				offset_capture = 0;
			}

			/* Negative offset counts from the end of the string. */
			int subject_len = subject.getLen();
			if (start_offset < 0) {
				start_offset = subject_len + start_offset;
				if (start_offset < 0) {
					start_offset = 0;
				}
			}

			if (extra == NULL) {
				extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
				extra = &extra_data;
			}

			/* The limits are read from the script-level Regexp object on every call. */
			os->getGlobal("Regexp");
			extra->match_limit			 = (os->getProperty(-1, "backtrackLimit"), os->popInt());
			extra->match_limit_recursion = (os->getProperty(-1, "recursionLimit"), os->popInt());
			os->pop(); // Regexp

			/* Calculate the size of the offsets array, and allocate memory for it. */
			rc = pcre_fullinfo(cache->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
			if (rc < 0) {
				triggerError(os, OS::String::format(os, "PCRE fullinfo error code %d", rc));
				return false;
			}
			num_subpats++;
			size_offsets = num_subpats * 3;

			/*
			* Build a mapping from subpattern numbers to their names. We will always
			* allocate the table, even though there may be no named subpatterns. This
			* avoids somewhat more complicated logic in the inner loops.
			*/
			subpat_names = makeSubpatsTable(num_subpats);
			if (!subpat_names) {
				return false;
			}

			offsets = (int *)os->malloc(size_offsets * sizeof(int) OS_DBG_FILEPOS);

			/* Allocate match sets array and initialize the values. */
			/* One script object per subpattern, kept alive through retained value
			ids; the destructor releases them on every exit path. */
			struct MatchSets {
				OS * os;
				int * sets;
				int num_subpats;

				MatchSets(OS * p_os)
				{
					os = p_os;
					sets = NULL;
					num_subpats = 0;
				}
				~MatchSets()
				{
					for(int i = 0; i < num_subpats; i++){
						os->releaseValueById(sets[i]);
					}
					os->free(sets);
				}

				void alloc(int p_num_subpats)
				{
					num_subpats = p_num_subpats;
					sets = (int*)os->malloc(num_subpats * sizeof(int) OS_DBG_FILEPOS);
					for (int i = 0; i < num_subpats; i++) {
						os->newObject();
						sets[i] = os->getValueId();
						os->retainValueById(sets[i]);
						os->pop();
					}
				}

			} match_sets(os);
			
			if (global && subpats_id && subpats_order == REGEXP_PATTERN_ORDER) {
				match_sets.alloc(num_subpats);
			}

			matched = 0;
			// PCRE_G(error_code) = PHP_PCRE_NO_ERROR;

			do {
				/* Execute the regular expression. */
				count = pcre_exec(cache->re, extra, subject, subject_len, start_offset,
					exoptions|g_notempty, offsets, size_offsets);

				/* the string was already proved to be valid UTF-8 */
				exoptions |= PCRE_NO_UTF8_CHECK;

				/* Check for too many substrings condition. */
				if (count == 0) {
					triggerError(os, "PCRE matched, but too many substrings");
					/* Release the work buffers on this exit path too. */
					os->free(offsets);
					os->free(subpat_names);
					return false;
					// count = size_offsets/3;
				}

				/* If something has matched */
				if (count > 0) {
					matched++;

					/* If subpatterns array has been passed, fill it in with values. */
					if (subpats_id) {
						/* Try to get the list of substrings and display a warning if failed. */
						if (pcre_get_substring_list(subject, offsets, count, &stringlist) < 0) {
							os->free(subpat_names);
							os->free(offsets);
							// if (match_sets) efree(match_sets);
							triggerError(os, "PCRE get subpatterns list failed");
							return false;
						}

						if (global) {	/* global pattern matching */
							if (subpats_id && subpats_order == REGEXP_PATTERN_ORDER) {
								/* For each subpattern, insert it into the appropriate array. */
								for (i = 0; i < count; i++) {
									os->pushValueById(match_sets.sets[i]);
									if (offset_capture) {
										addOffsetPair((char *)stringlist[i],
											offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL);
									} else {
										addNextString((char *)stringlist[i],
											offsets[(i<<1)+1] - offsets[i<<1]);
									}
									os->pop();
								}
								/*
								* If the number of captured subpatterns on this run is
								* less than the total possible number, pad the result
								* arrays with empty strings.
								*/
								if (count < num_subpats) {
									for (; i < num_subpats; i++) {
										os->pushValueById(match_sets.sets[i]);
										os->pushString("");
										os->addProperty(false);
										// addNextString("", 0);
										// os->pop();
									}
								}
							} else {
								/* Allocate the result set array */
								os->newObject();
								
								/* Add all the subpatterns to it */
								for (i = 0; i < count; i++) {
									if (offset_capture) {
										addOffsetPair((char *)stringlist[i],
											offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], subpat_names[i]);
									} else {
										if (subpat_names[i]) {
											addAssocString(subpat_names[i], (char *)stringlist[i],
												offsets[(i<<1)+1] - offsets[i<<1]);
										}
										addNextString((char *)stringlist[i],
											offsets[(i<<1)+1] - offsets[i<<1]);
									}
								}
								/* And add it to the output array */
								os->pushValueById(subpats_id);
								os->pushStackValue(-2);
								os->addProperty(false);

								os->pop();
							}
						} else {			/* single pattern matching */
							/* For each subpattern, insert it into the subpatterns array. */
							os->pushValueById(subpats_id);
							for (i = 0; i < count; i++) {
								if (offset_capture) {
									addOffsetPair((char *)stringlist[i],
										offsets[(i<<1)+1] - offsets[i<<1],
										offsets[i<<1], subpat_names[i]);
								} else {
									if (subpat_names[i]) {
										addAssocString(subpat_names[i], (char *)stringlist[i],
											offsets[(i<<1)+1] - offsets[i<<1]);
									}
									addNextString((char *)stringlist[i],
										offsets[(i<<1)+1] - offsets[i<<1]);
								}
							}
							os->pop();
						}

						pcre_free((void *) stringlist);
					}
				} else if (count == PCRE_ERROR_NOMATCH) {
					/* If we previously set PCRE_NOTEMPTY after a null match,
					this is not necessarily the end. We need to advance
					the start offset, and continue. Fudge the offset values
					to achieve this, unless we're already at the end of the string. */
					if (g_notempty != 0 && start_offset < subject_len) {
						offsets[0] = start_offset;
						offsets[1] = start_offset + 1;
					} else
						break;
				} else {
					if(checkExecError(count)){
						/* Release the work buffers on this exit path too. */
						os->free(offsets);
						os->free(subpat_names);
						return false;
					}
					OS_ASSERT(false);
					break;
				}

				/* If we have matched an empty string, mimic what Perl's /g options does.
				This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
				the match again at the same point. If this fails (picked up above) we
				advance to the next character. */
				g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;

				/* Advance to the position right after the last full match */
				start_offset = offsets[1];
			} while (global);

			/* Add the match sets to the output array and clean up */
			if (global && subpats_id && subpats_order == REGEXP_PATTERN_ORDER) {
				os->pushValueById(subpats_id);
				for (i = 0; i < num_subpats; i++) {
					if (subpat_names[i]) {
						os->pushStackValue();
						os->pushValueById(match_sets.sets[i]);
						os->setProperty(subpat_names[i], false);
					}
					os->pushStackValue();
					os->pushValueById(match_sets.sets[i]);
					os->addProperty(false);
				}
				os->pop();
				// efree(match_sets);
			}

			os->free(offsets);
			os->free(subpat_names);

			return matched > 0;
		}

		/**
		 * Parse a "$N" or "$NN" back-reference at the start of *str.
		 *
		 * Only decimal digits are accepted, because the reference number is
		 * computed as `character - '0'`.
		 *
		 * @param str     in/out: points at the '$'; on success it is advanced
		 *                past the digits, on failure it is left unchanged.
		 * @param backref out: the parsed group number (one or two digits).
		 * @return true when a back-reference was parsed, false otherwise.
		 */
		static bool getBackRef(const char **str, int *backref)
		{
			const char *walk = *str;
			OS_ASSERT(*walk == '$');
			if (walk[1] == 0)
				return false;

			walk++;
			if(*walk >= '0' && *walk <= '9'){
				*backref = *walk - '0';
				walk++;
			} else
				return false;

			/* An optional second digit extends the number to two places. */
			if(*walk >= '0' && *walk <= '9'){
				*backref = *backref * 10 + *walk - '0';
				walk++;
			}

			*str = walk;
			return true;	
		}

		/**
		 * Call the script function identified by `function_id` for one match
		 * and return its result as a string.
		 *
		 * The function is called with two arguments: an object holding the
		 * captured groups (each added positionally, named groups also under
		 * their name) and the complete subject string.
		 */
		OS::String replaceCallback(int function_id, const char *subject, int *offsets, char **subpat_names, int count, const OS::String& subject_str)
		{
			OS_ASSERT(cache);
			OS * os = cache->os;

			os->pushValueById(function_id);
			OS_ASSERT(os->isFunction());
			os->pushNull(); // this for function

			/* First argument: the captures of this match. */
			os->newObject();
			for (int group = 0; group < count; ++group) {
				const int begin = offsets[group * 2];
				const int length = offsets[group * 2 + 1] - begin;
				const char * const text = subject + begin;

				if (subpat_names[group]) {
					addAssocString(subpat_names[group], text, length);
				}
				addNextString(text, length);
			}

			/* Second argument: the whole subject. */
			os->pushString(subject_str);

			os->call(2, 1);
			return os->popString();
		}

		/**
		 * Replace matches of the compiled pattern in `subject_str`.
		 *
		 * The value identified by `replace_id` is either a script function
		 * (called once per match through replaceCallback()) or is converted to a
		 * string, in which "$N"/"$NN" insert capture group N and "$$" inserts a
		 * literal "$".
		 *
		 * On success the resulting string is pushed onto the script stack.
		 *
		 * @param subject_str   string to process.
		 * @param replace_id    value id of the replacement (function or string).
		 * @param limit         maximum number of replacements; -1 for no limit.
		 * @param replace_count out: number of replacements performed. It is reset
		 *                      only after the set-up steps succeeded.
		 * @return true on success; false after an error has been raised through
		 *         triggerError().
		 */
		bool replace(const OS::String& subject_str, int replace_id, int limit, int& replace_count)
		{
			OS_ASSERT(cache);
			OS * os = cache->os;
			pcre_extra		*extra = cache->extra;/* Holds results of studying */
			pcre_extra		 extra_data;		/* Used locally for exec options */
			int				 exoptions = 0;		/* Execution options */
			int				 count = 0;			/* Count of matched subpatterns */
			int				*offsets;			/* Array of subpattern offsets */
			char 			**subpat_names;		/* Array for named subpatterns */
			int				 num_subpats;		/* Number of captured subpatterns */
			int				 size_offsets;		/* Size of the offsets array */
			int				 backref;			/* Backreference number */
			int				 start_offset;		/* Where the new search starts */
			int				 g_notempty=0;		/* If the match should not be empty */
			int				 replace_len=0;		/* Length of replacement string */
			const char * replace = NULL;		/* Replacement string */
			const char * replace_end = NULL;	/* End of replacement string */
			int				 rc;

			if (extra == NULL) {
				extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
				extra = &extra_data;
			}

			/* The limits are read from the script-level Regexp object on every call. */
			os->getGlobal("Regexp");
			extra->match_limit			 = (os->getProperty(-1, "backtrackLimit"), os->popInt());
			extra->match_limit_recursion = (os->getProperty(-1, "recursionLimit"), os->popInt());
			os->pop(); // Regexp

			/* Decide between callback mode and literal/back-reference mode. */
			os->pushValueById(replace_id);
			bool is_callable_replace = os->isFunction();
			OS::String replace_str = is_callable_replace ? OS::String(os) : os->toString();
			if(!is_callable_replace){
				replace = replace_str.toChar();
				replace_len = replace_str.getLen();
				replace_end = replace + replace_len;
			}
			os->pop(); // replace_id

			/* Calculate the size of the offsets array, and allocate memory for it. */
			rc = pcre_fullinfo(cache->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
			if (rc < 0) {
				triggerError(os, OS::String::format(os, "PCRE internal pcre_fullinfo() error %d", rc));
				return false;
			}
			num_subpats++;
			size_offsets = num_subpats * 3;

			/*
			* Build a mapping from subpattern numbers to their names. We will always
			* allocate the table, even though there may be no named subpatterns. This
			* avoids somewhat more complicated logic in the inner loops.
			*/
			subpat_names = makeSubpatsTable(num_subpats);
			if (!subpat_names) {
				return false;
			}

			offsets = (int *)os->malloc(size_offsets * sizeof(int) OS_DBG_FILEPOS);

			Core::Buffer result(os);

			const char * subject = subject_str;
			int subject_len = subject_str.getLen();
			result.reserveCapacity((2 * subject_len + 1)*sizeof(char));

			/* Initialize */
			replace_count = 0;
			start_offset = 0;
			
			while (1) {
				/* Execute the regular expression. */
				count = pcre_exec(cache->re, extra, subject, subject_len, start_offset,
					exoptions|g_notempty, offsets, size_offsets);

				/* the string was already proved to be valid UTF-8 */
				exoptions |= PCRE_NO_UTF8_CHECK;

				/* Check for too many substrings condition. */
				if (count == 0) {
					triggerError(os, "PCRE matched, but too many substrings");
					os->free(offsets);
					os->free(subpat_names);
					return false;
					// count = size_offsets/3;
				}

				/* Start of the not-yet-copied part of the subject. */
				const char * piece = subject + start_offset;

				if (count > 0 && (limit == -1 || limit > 0)) {
					++replace_count;
					
					/* Set the match location in subject */
					const char * match = subject + offsets[0];

					/* copy the part of the string before the match */
					result.append(piece, (int)(match-piece));

					/* If evaluating or using custom function, copy result to the buffer
					* and clean up. */
					if (is_callable_replace) {
						OS::String callback_result = replaceCallback(replace_id, subject, offsets, subpat_names, count, subject_str);
						result.append(callback_result);
					} else { /* do regular backreference copying */
						const char * walk = replace;
						while (walk < replace_end) {
							if ('$' == *walk) {
								// $&	Inserts the matched substring (not implemented yet)
								// $`	Inserts the portion of the string that precedes the matched substring (not implemented yet)
								// $'	Inserts the portion of the string that follows the matched substring (not implemented yet)
								// $$	Inserts a "$"
								if(walk[1] == '$'){
									result.append(walk, 1);
									walk += 2;
									continue;
								}else if(getBackRef(&walk, &backref)){
									/* A reference to a group beyond this match's
									capture count inserts nothing. */
									if (backref < count){
										int match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
										result.append(subject + offsets[backref<<1], match_len);
									}
									continue;
								}
							}
							/* Any other character, including a '$' that starts no
							back-reference, is copied as is. */
							result.append(walk++, 1);
						}
					}

					if (limit != -1)
						limit--;

				} else if (count == PCRE_ERROR_NOMATCH || limit == 0) {
					/* If we previously set PCRE_NOTEMPTY after a null match,
					this is not necessarily the end. We need to advance
					the start offset, and continue. Fudge the offset values
					to achieve this, unless we're already at the end of the string. */
					if (g_notempty != 0 && start_offset < subject_len) {
						offsets[0] = start_offset;
						offsets[1] = start_offset + 1;
						result.append(piece, 1);
					} else {
						/* stick that last bit of string on our output */
						result.append(piece, subject_len - start_offset);
						break;
					}
				} else {
					/* Any other negative code is an execution error. */
					if(checkExecError(count)){
						os->free(offsets);
						os->free(subpat_names);
						return false;
					}
					OS_ASSERT(false);
					break;
				}

				/* If we have matched an empty string, mimic what Perl's /g options does.
				This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
				the match again at the same point. If this fails (picked up above) we
				advance to the next character. */
				g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;

				/* Advance to the next piece. */
				start_offset = offsets[1];
			}

			os->free(offsets);
			os->free(subpat_names);

			/* The result is handed back on the script stack. */
			os->pushString(result);
			return true;
		}

		/**
		 * Quote regular-expression metacharacters in `str`.
		 *
		 * Each of . \ + * ? [ ^ ] $ ( ) { } = ! > < | : - is prefixed with a
		 * backslash, a NUL byte becomes the four characters \000, and the
		 * delimiter character is prefixed with a backslash as well.
		 *
		 * @param os        script instance used for the returned string.
		 * @param str       text to quote.
		 * @param delimiter its first character is the delimiter to quote;
		 *                  '/' is used when the string is empty.
		 * @return the quoted text; an empty string for empty input.
		 */
		static OS::String escape(OS * os, const OS::String& str, const OS::String& delimiter)
		{
			const char * in_str = str.toChar();
			int in_str_len = str.getLen();
			const char * in_str_end = in_str + in_str_len;

			/* Nothing to do if we got an empty string */
			if (in_str == in_str_end) {
				return OS::String(os);
			}

			char delim_char = '/';
			if (delimiter.getLen() > 0) {
				delim_char = delimiter[0];
			}
	
			/* Allocate enough memory so that even if each character
			   is quoted, we won't run out of room. The worst case is a NUL
			   byte, which expands to four output bytes. */
			Core::Buffer out(os);
			out.reserveCapacity(4 * in_str_len);
	
			/* Go through the string and quote necessary characters */
			for(const char * p = in_str; p < in_str_end; p++) {
				char c = *p;
				switch(c) {
					case '.':
					case '\\':
					case '+':
					case '*':
					case '?':
					case '[':
					case '^':
					case ']':
					case '$':
					case '(':
					case ')':
					case '{':
					case '}':
					case '=':
					case '!':
					case '>':
					case '<':
					case '|':
					case ':':
					case '-':
						out.append("\\", 1);
						out.append(&c, 1);
						break;

					case '\0':
						out.append("\\000", 4);
						break;

					default:
						if(c == delim_char){
							out.append("\\", 1);
						}
						out.append(&c, 1);
						break;
				}
			}
			return out.toStringOS();
		}

		/**
		 * Splits subject_str around matches of the pattern compiled in `cache`.
		 *
		 * A new object is pushed onto the OS stack (os->newObject()) before matching
		 * starts; the pieces are emitted through addNextString() / addOffsetPair(),
		 * which are defined elsewhere in this class (presumably they append to that
		 * object -- confirm against their definitions).
		 *
		 * @param subject_str  String to split.
		 * @param limit        Upper bound on the number of pieces. The trailing
		 *                     remainder counts as one piece; captured delimiters are
		 *                     not counted. 0 is treated like -1, i.e. no limit.
		 * @param flags        Bitmask of REGEXP_SPLIT_NO_EMPTY,
		 *                     REGEXP_SPLIT_DELIM_CAPTURE and
		 *                     REGEXP_SPLIT_OFFSET_CAPTURE.
		 * @return true when splitting ran to completion; false on failure. On the
		 *         failure paths taken after the result object was created, that
		 *         object is popped again before returning.
		 */
		bool split(const OS::String& subject_str, int limit, int flags)
		{
			OS_ASSERT(cache);
			OS * os = cache->os;
			pcre_extra		*extra = NULL;		/* Holds results of studying */
			// pcre			*re_bump = NULL;	/* Regex instance for empty matches */
			// pcre_extra		*extra_bump = NULL;	/* Almost dummy */
			pcre_extra		 extra_data;		/* Used locally for exec options */
			int				*offsets;			/* Array of subpattern offsets */
			int				 size_offsets;		/* Size of the offsets array */
			int				 exoptions = 0;		/* Execution options */
			int				 count = 0;			/* Count of matched subpatterns */
			int				 start_offset;		/* Where the new search starts */
			int				 next_offset;		/* End of the last delimiter match + 1 */
			int				 g_notempty = 0;	/* If the match should not be empty */
			const char		*last_match;		/* Location of last match */
			int				 rc;
			int				 no_empty;			/* If NO_EMPTY flag is set */
			int				 delim_capture; 	/* If delimiters should be captured */
			int				 offset_capture;	/* If offsets should be captured */

			/* Decode the flag bits once; each variable is non-zero when its flag is set. */
			no_empty = flags & REGEXP_SPLIT_NO_EMPTY;
			delim_capture = flags & REGEXP_SPLIT_DELIM_CAPTURE;
			offset_capture = flags & REGEXP_SPLIT_OFFSET_CAPTURE;

			/* A limit of 0 means "no limit", same as -1. */
			if (limit == 0) {
				limit = -1;
			}

			/* NOTE(review): `extra` starts out NULL and is never assigned from
			cache->extra, so this branch is always taken and the main match always
			runs with the local extra_data. Only the two fields announced in
			`flags` are filled in below. */
			if (extra == NULL) {
				extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
				extra = &extra_data;
			}

			/* Read the limits from the properties of the global "Regexp" value. */
			os->getGlobal("Regexp");
			extra->match_limit			 = (os->getProperty(-1, "backtrackLimit"), os->popInt());
			extra->match_limit_recursion = (os->getProperty(-1, "recursionLimit"), os->popInt());
			os->pop(); // Regexp

			/* Calculate the size of the offsets array, and allocate memory for it. */
			rc = pcre_fullinfo(cache->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets);
			if (rc < 0) {
				/* Nothing has been allocated or pushed yet, so no cleanup is needed here. */
				triggerError(os, OS::String::format(os, "PCRE internal pcre_fullinfo() error %d", rc));
				return false;
			}
			/* Three ints per capture group, plus one extra group for the whole match. */
			size_offsets = (size_offsets + 1) * 3;
			offsets = (int *)os->malloc(size_offsets * sizeof(int) OS_DBG_FILEPOS);

			/* Start at the beginning of the string */
			start_offset = 0;
			next_offset = 0;
			last_match = subject_str.toChar();

			// os->pushValueById(return_id);
			/* Result container; every failure path below pops it again. */
			os->newObject();

			/* bump_cache is compiled lazily inside the loop (UTF-8 subjects only).
			The guard holds a reference to the pointer, so whatever ends up in
			bump_cache is released when this function returns by any path. */
			RegexpCache * bump_cache = NULL;
			struct AutoReleaseCache {
				RegexpCache *& cache;
				AutoReleaseCache(RegexpCache *& p_cache): cache(p_cache){}
				~AutoReleaseCache(){ if(cache) cache->release(); }
			} auto_release_cache(bump_cache);

			const char * subject = subject_str.toChar();
			int subject_len = subject_str.getLen();

			/* Get next piece if no limit or limit not yet reached and something matched*/
			while ((limit == -1 || limit > 1)) {
				count = pcre_exec(cache->re, extra, subject,
					subject_len, start_offset,
					exoptions|g_notempty, offsets, size_offsets);

				/* the string was already proved to be valid UTF-8 */
				exoptions |= PCRE_NO_UTF8_CHECK;

				/* Check for too many substrings condition. */
				if (count == 0) {
					triggerError(os, "PCRE matched, but too many substrings");
					os->pop();
					os->free(offsets);
					return false;
					// count = size_offsets/3;
				}

				/* If something matched */
				if (count > 0) {
					/* Emit the text between the previous delimiter and this one,
					unless it is empty and NO_EMPTY was requested. */
					if (!no_empty || &subject[offsets[0]] != last_match) {

						if (offset_capture) {
							/* Add (match, offset) pair to the return value */
							addOffsetPair(last_match, (int)(&subject[offsets[0]]-last_match), next_offset, NULL);
						} else {
							/* Add the piece to the return value */
							addNextString(last_match, (int)(&subject[offsets[0]]-last_match));
						}

						/* One less left to do */
						if (limit != -1)
							limit--;
					}

					/* The next piece starts right after this delimiter. */
					last_match = &subject[offsets[1]];
					next_offset = offsets[1];

					if (delim_capture) {
						int i, match_len;
						/* Groups 1..count-1 are the captured subpatterns of the
						delimiter; offsets[2*i] / offsets[2*i+1] are their start / end. */
						for (i = 1; i < count; i++) {
							match_len = offsets[(i<<1)+1] - offsets[i<<1];
							/* If we have matched a delimiter */
							if (!no_empty || match_len > 0) {
								if (offset_capture) {
									addOffsetPair(&subject[offsets[i<<1]], match_len, offsets[i<<1], NULL);
								} else {
									addNextString(&subject[offsets[i<<1]], match_len);
								}
							}
						}
					}
				} else if (count == PCRE_ERROR_NOMATCH) {
					/* If we previously set PCRE_NOTEMPTY after a null match,
					this is not necessarily the end. We need to advance
					the start offset, and continue. Fudge the offset values
					to achieve this, unless we're already at the end of the string. */
					if (g_notempty != 0 && start_offset < subject_len) {
						if (cache->compile_options & PCRE_UTF8) {
							/* UTF-8 subject: match "/./us" at start_offset so the
							offsets cover one whole character rather than one byte
							(presumably the 'u' and 's' modifiers mean UTF-8 and
							dot-matches-newline -- confirm in getRegexpCache). */
							if (!bump_cache) {
								bump_cache = getRegexpCache(os, OS::String(os, "/./us"));
								if(!bump_cache){
									os->pop();
									os->free(offsets);
									return false;
								}
								/* Balanced by the release() in auto_release_cache. */
								bump_cache->retain();
							}
							count = pcre_exec(bump_cache->re, bump_cache->extra, subject,
								subject_len, start_offset,
								exoptions, offsets, size_offsets);
							if (count < 1) {
								if(checkExecError(count)){
									os->pop();
									os->free(offsets);
									return false;
								}
								/* Not expected; when asserts are compiled out the loop
								just ends and the remainder is appended below. */
								OS_ASSERT(false);
								break;
							}
						} else {
							/* Single-byte subject: step over exactly one byte. */
							offsets[0] = start_offset;
							offsets[1] = start_offset + 1;
						}
					} else
						break;
				} else {
					/* Any other negative return code from pcre_exec. */
					if(checkExecError(count)){
						os->pop();
						os->free(offsets);
						return false;
					}
					OS_ASSERT(false);
					break;
				}

				/* If we have matched an empty string, mimic what Perl's /g options does.
				This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
				the match again at the same point. If this fails (picked up above) we
				advance to the next character. */
				g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;

				/* Advance to the position right after the last full match */
				start_offset = offsets[1];
			}


			start_offset = (int)(last_match - subject); /* the offset might have been incremented, but without further successful matches */

			/* Append whatever follows the last delimiter, unless it is empty and
			NO_EMPTY was requested. */
			if (!no_empty || start_offset < subject_len)
			{
				if (offset_capture) {
					/* Add the last (match, offset) pair to the return value */
					addOffsetPair(&subject[start_offset], subject_len - start_offset, start_offset, NULL);
				} else {
					/* Add the last piece to the return value */
					addNextString(last_match, (int)(subject + subject_len - last_match));
				}
			}


			/* Clean up */
			os->free(offsets);

			return true;
		}

		static void initLibrary(OS * os);
	};

	/** Evaluates a script snippet that declares RegexpException as an empty class extending Exception. */
	static void initLibrary(OS * os)
	{
		// OS_AUTO_TEXT stringifies its argument, which lets the ObjectScript
		// source below be written without quotes.
#define OS_AUTO_TEXT(exp) OS_TEXT(#exp)
		const OS_CHAR * const declaration = OS_AUTO_TEXT(
			RegexpException = extends Exception {
			}
		);
		os->eval(declaration);
	}
};

// Binder traits for RegexpOS::Regexp: the class name is reported as "Regexp"
// and pointer values use the generic user-class conversion.
template <> struct CtypeName<RegexpOS::Regexp>
{
	static const OS_CHAR * getName()
	{
		return OS_TEXT("Regexp");
	}
};
template <> struct CtypeValue<RegexpOS::Regexp*>: public CtypeUserClass<RegexpOS::Regexp*>
{
};
template <> struct UserDataDestructor<RegexpOS::Regexp>
{
	static void dtor(ObjectScript::OS * os, void * data, void * user_param)
	{
		OS_ASSERT(data && dynamic_cast<RegexpOS::Regexp*>((RegexpOS::Regexp*)data));
		RegexpOS::Regexp * obj = (RegexpOS::Regexp*)data;
		obj->~Regexp();
		os->free(obj);
	}
};

// Binder traits for RegexpOS::RegexpCache: the class name is reported as
// "RegexpCache" and pointer values use the generic user-class conversion.
template <> struct CtypeName<RegexpOS::RegexpCache>
{
	static const OS_CHAR * getName()
	{
		return OS_TEXT("RegexpCache");
	}
};
template <> struct CtypeValue<RegexpOS::RegexpCache*>: public CtypeUserClass<RegexpOS::RegexpCache*>
{
};
template <> struct UserDataDestructor<RegexpOS::RegexpCache>
{
	static void dtor(ObjectScript::OS * os, void * data, void * user_param)
	{
		OS_ASSERT(data && dynamic_cast<RegexpOS::RegexpCache*>((RegexpOS::RegexpCache*)data));
		RegexpOS::RegexpCache * buf = (RegexpOS::RegexpCache*)data;
		buf->release();
		// buf->~RegexpCache();
		// os->free(buf);
	}
};

/** Converts the value at stack index -1 to a RegexpCache pointer via the binder's CtypeValue::getArg. */
RegexpOS::RegexpCache * RegexpOS::toRegexpCache(OS * os)
{
	RegexpOS::RegexpCache * const top = CtypeValue<RegexpOS::RegexpCache*>::getArg(os, -1);
	return top;
}

/** Registers the RegexpCache user class; its only method is a constructor that refuses to run. */
void RegexpOS::RegexpCache::initLibrary(OS * os)
{
	struct Handlers
	{
		// Script-side constructor: always reports an error through triggerError().
		static void construct(OS * os)
		{
			const OS_CHAR * const message = OS_TEXT("you should not create new instance of RegexpCache");
			triggerError(os, message);
		}
	};

	// The empty entry terminates the list.
	OS::FuncDef methods[] = {
		def(OS_TEXT("__construct"), Handlers::construct),
		{}
	};

	registerUserClass<RegexpCache>(os, methods);
}

/**
 * Registers the Regexp user class: its script-visible methods and its numeric
 * properties (match/split flag constants plus backtrackLimit / recursionLimit).
 */
void RegexpOS::Regexp::initLibrary(OS * os)
{
	struct Binding
	{
		// Constructor: wraps the RegexpCache obtained for `pattern`; NULL when none is returned.
		static Regexp * construct(OS * os, const OS::String& pattern)
		{
			if(RegexpCache * compiled = getRegexpCache(os, pattern)){
				return new (os->malloc(sizeof(Regexp) OS_DBG_FILEPOS)) Regexp(compiled);
			}
			return NULL;
		}

		// exec(subject [, flags [, offset]]): runs match() into a freshly created object.
		static int exec(OS * os, int params, int, int, void * user_param)
		{
			OS_GET_SELF(Regexp*);

			if(params < 1){
				triggerError(os, "argument required");
				return 0;
			}
			OS::String subject = os->toString(-params);

			// flags default to PATTERN_ORDER; offset is only read when a third argument exists.
			const bool use_flags = params > 1;
			const int flags = use_flags ? os->toInt(-params+1) : REGEXP_PATTERN_ORDER;
			const int offset = params > 2 ? os->toInt(-params+2) : 0;

			// os->newArray();
			os->newObject();
			const int result_id = os->getValueId();
			if(!self->match(subject, result_id, use_flags, flags, offset)){
				return 0;
			}
			return 1;
		}

		// test(subject): pushes the boolean result of match() with no result object.
		static int test(OS * os, int params, int, int, void * user_param)
		{
			OS_GET_SELF(Regexp*);

			if(params < 1){
				triggerError(os, "argument required");
				return 0;
			}
			OS::String subject = os->toString(-params);
			const bool matched = self->match(subject, 0, false, 0, 0);
			os->pushBool(matched);
			return 1;
		}

		// replace(subject, replacement [, limit]): limit defaults to -1.
		static int replace(OS * os, int params, int, int, void * user_param)
		{
			OS_GET_SELF(Regexp*);

			if(params < 2){
				triggerError(os, "two arguments required");
				return 0;
			}
			OS::String subject = os->toString(-params);
			int replace_id = os->getValueId(-params+1);
			int limit = -1;
			if(params >= 3){
				limit = os->toInt(-params+2, -1);
			}
			int replace_count = 0;
			if(!self->replace(subject, replace_id, limit, replace_count)){
				return 0;
			}
			return 1;
		}

		// escape(str [, delimiter]): pushes the result of Regexp::escape(); does not use self.
		static int escape(OS * os, int params, int, int, void * user_param)
		{
			if(params < 1){
				triggerError(os, "argument required");
				return 0;
			}
			OS::String str = os->toString(-params);
			OS::String delimiter = params >= 2 ? os->toString(-params+1) : OS::String(os);
			OS::String quoted = Regexp::escape(os, str, delimiter);
			os->pushString(quoted);
			return 1;
		}

		// split(subject [, limit [, flags]]): limit defaults to -1, flags to 0.
		static int split(OS * os, int params, int, int, void * user_param)
		{
			OS_GET_SELF(Regexp*);

			if(params < 1){
				triggerError(os, "argument required");
				return 0;
			}
			OS::String subject = os->toString(-params);
			int limit = -1;
			if(params >= 2){
				limit = os->toInt(-params+1, -1);
			}
			int flags = 0;
			if(params >= 3){
				flags = os->toInt(-params+2, 0);
			}
			if(!self->split(subject, limit, flags)){
				return 0;
			}
			return 1;
		}
	};

	// Method table; the empty entry terminates the list.
	OS::FuncDef methods[] = {
		def(OS_TEXT("__construct"), Binding::construct),
		{OS_TEXT("exec"), Binding::exec},
		{OS_TEXT("test"), Binding::test},
		{OS_TEXT("replace"), Binding::replace},
		{OS_TEXT("escape"), Binding::escape},
		{OS_TEXT("split"), Binding::split},
		{}
	};

	// Numeric properties; the empty entry terminates the list.
	OS::NumberDef constants[] = {
		{OS_TEXT("PATTERN_ORDER"), REGEXP_PATTERN_ORDER},
		{OS_TEXT("SET_ORDER"), REGEXP_SET_ORDER},
		{OS_TEXT("OFFSET_CAPTURE"), REGEXP_OFFSET_CAPTURE},
		{OS_TEXT("SPLIT_NO_EMPTY"), REGEXP_SPLIT_NO_EMPTY},
		{OS_TEXT("SPLIT_DELIM_CAPTURE"), REGEXP_SPLIT_DELIM_CAPTURE},
		{OS_TEXT("SPLIT_OFFSET_CAPTURE"), REGEXP_SPLIT_OFFSET_CAPTURE},
		{OS_TEXT("backtrackLimit"), 1000000},
		{OS_TEXT("recursionLimit"), 100000},
		{}
	};

	registerUserClass<Regexp>(os, methods, constants);
}

/** Installs the regexp extension into `os` by running each component's initLibrary in turn. */
void initRegexpLibrary(OS* os)
{
	typedef void (*InitStep)(OS *);

	// Executed in this order: Regexp, RegexpCache, then RegexpOS itself.
	const InitStep steps[] = {
		&RegexpOS::Regexp::initLibrary,
		&RegexpOS::RegexpCache::initLibrary,
		&RegexpOS::initLibrary
	};
	const int step_count = (int)(sizeof(steps) / sizeof(steps[0]));
	for(int i = 0; i < step_count; i++){
		steps[i](os);
	}
}

} // namespace ObjectScript

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The MIT License.


Written By
United States
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions