Click here to Skip to main content
15,880,427 members
Articles / Desktop Programming / MFC

Recursive patterned File Globbing

Rate me:
Please Sign up or sign in to vote.
4.93/5 (20 votes)
27 Aug 20029 min read 185.9K   2K   61  
Class and application to recursively or non-recursively match files or directories based on a wildcard pattern.
/**
	\file FileGlobBase.cpp

	This file contains many of the definitions for parts of the FileGlobBase
	class.
**/
#include "FileGlobBase.h"
#include <windows.h>
#include <assert.h>

// Forward declares.
bool WildMatch( const char* pattern, const char *string, bool caseSensitive );


/**
	Constructor.
**/
FileGlobBase::FileGlobBase()
{
	AddIgnorePattern( "./" );
	AddIgnorePattern( "../" );
}							


/**
	Finds all files matching [inPattern].  The wildcard expansion understands
	the following pattern constructs:

	- ?
		- Any single character.
	- *
		- Any character of which there are zero or more of them.
	- **
		- All subdirectories recursively.

	Additionally, if the pattern closes in a single slash, only directories
	are processed.  Forward slashes and backslashes may be used
	interchangeably.

	- *\
		- List all the directories in the current directory.  Can also be
		  *(forwardslash), but it turns off the C++ comment in this message.
	- **\
		- List all directories recursively.

	Wildcards may appear anywhere in the pattern, including directories.

	- *\*\*\*.c

	Note that *.* only matches files that have an extension.  This is different
	than standard DOS behavior.  Use * all by itself to match files, extension
	or not.

	Recursive wildcards can be used anywhere:

	c:/Dir1/**\A*\**\FileDirs*\**.mp3

	This matches all directories under c:\Dir1\ that start with A.  Under all
	of the directories that start with A, directories starting with FileDirs
	are matched recursively.  Finally, all files ending with an mp3 extension
	are matched.

	Any place that has more than two .. for going up a subdirectory is expanded
	a la 4DOS.

	...../StartSearchHere/**

	Expands to:

	../../../../StartSearchHere/**
		
	\param inPattern The pattern to use for matching.
**/
void FileGlobBase::MatchPattern( const char* inPattern )
{
	char pattern[ MAX_PATH ];

	// Give ourselves a local copy of the inPattern with all \ characters
	// changed to / characters and more than two periods expanded.
	const char* srcPtr = inPattern;

	// Is it a Windows network path?  If so, don't convert the opening \\.
	if ( srcPtr[ 0 ] == '\\'  &&  srcPtr[ 1 ] == '\\' )
		srcPtr += 2;

	const char* lastSlashPtr = srcPtr - 1;
	char* destPtr = pattern;
	int numPeriods = 0;
	while ( *srcPtr != '\0' )
	{
		char ch = *srcPtr;
		
		///////////////////////////////////////////////////////////////////////
		// Check for slashes or backslashes.
		if ( ch == '\\'  ||  ch == '/' )
		{
			*destPtr++ = '/';

			lastSlashPtr = srcPtr;
			numPeriods = 0;
		}

		///////////////////////////////////////////////////////////////////////
		// Check for .
		else if ( ch == '.' )
		{
			if ( srcPtr - numPeriods - 1 == lastSlashPtr )
			{
				numPeriods++;
				if ( numPeriods > 2 )
				{
					*destPtr++ = '/';
					*destPtr++ = '.';
					*destPtr++ = '.';
				}
				else
				{
					*destPtr++ = '.';
				}
			}
			else
			{
				*destPtr++ = '.';
			}
		}

		///////////////////////////////////////////////////////////////////////
		// Check for **
		else if ( ch == '*'  &&  srcPtr[ 1 ] == '*' )
		{
			if ( srcPtr - 1 != lastSlashPtr )
			{
				// Something like this:
				//
				// /Dir**/
				//
				// needs to be translated to:
				//
				// /Dir*/**/
				*destPtr++ = '*';
				*destPtr++ = '/';
			}

			srcPtr += 2;

			*destPtr++ = '*';
			*destPtr++ = '*';

			// Did we get a double star this round?
			if ( srcPtr[ 0 ] != '/'  &&  srcPtr[ 0 ] != '\\' )
			{
				// Handle the case that looks like this:
				//
				// /**Text
				//
				// Translate to:
				//
				// /**/*Text
				*destPtr++ = '/';
				*destPtr++ = '*';
			}
			else if ( srcPtr[ 1 ] == '\0'  ||  srcPtr[ 1 ] == '@' )
			{
				srcPtr++;

				*destPtr++ = '/';
				*destPtr++ = '*';
				*destPtr++ = '/';
			}

			// We added one too many in here... the compiler will optimize.
			srcPtr--;
		}

		///////////////////////////////////////////////////////////////////////
		// Check for @
		else if ( ch == '@' )
		{
			// Gonna finish this processing in another loop.
			break;
		}
			
		///////////////////////////////////////////////////////////////////////
		// Everything else.
		else
		{
			*destPtr++ = *srcPtr;
		}

		srcPtr++;
	}

	*destPtr = 0;

	// Check for the @.
	if ( *srcPtr == '@' )
	{
		// Clear out function registered entries.
		m_ignorePatterns.clear();
		AddIgnorePattern( "./" );
		AddIgnorePattern( "../" );

		m_exclusiveFilePatterns.clear();
	}

	while ( *srcPtr == '@' )
	{
		srcPtr++;

		char ch = *srcPtr++;
		if ( ch == '-'  ||  ch == '=' )
		{
			char buffer[ _MAX_PATH ];
			char* ptr = buffer;
			while ( *srcPtr != '@'  &&  *srcPtr != '\0' )
			{
				*ptr++ = *srcPtr++;
			}

			*ptr = 0;

			if ( ch == '-' )
				AddIgnorePattern( buffer );
			else if ( ch == '=' )
				AddExclusivePattern( buffer );
		}
		else
			break;		// Don't know what it is.
	}

	// Start globbing!
	GlobHelper( pattern );
}


/**
	Adds a pattern to the file glob database of exclusive patterns.  If any
	exclusive patterns are registered, the ignore database is completely
	ignored.  Only patterns matching the exclusive patterns will be
	candidates for matching.

	\param name The exclusive pattern.
**/
void FileGlobBase::AddExclusivePattern( const char* pattern )
{
	for ( StringList::iterator it = m_exclusiveFilePatterns.begin();
			it != m_exclusiveFilePatterns.end(); ++it )
		if ( stricmp( (*it).c_str(), pattern ) == 0 )
			return;

	m_exclusiveFilePatterns.push_back( pattern );
}


/**
	Adds a pattern to ignore to the file glob database.  If a pattern of
	the given name is found, its contents will not be considered for further
	matching.  The result is as if the pattern did not exist for the search
	in the first place.

	\param name The pattern to ignore.
**/
void FileGlobBase::AddIgnorePattern( const char* pattern )
{
	for ( StringList::iterator it = m_ignorePatterns.begin();
			it != m_ignorePatterns.end(); ++it )
		if ( stricmp( (*it).c_str(), pattern ) == 0 )
			return;

	m_ignorePatterns.push_back( pattern );
}


/**
	Match an exclusive pattern.

	\param text The text to match an ignore pattern against.
	\return Returns true if the directory should be ignored, false otherwise.
**/
bool FileGlobBase::MatchExclusivePattern( const char* text )
{
	for ( StringList::iterator it = m_exclusiveFilePatterns.begin();
			it != m_exclusiveFilePatterns.end(); ++it )
	{
		if ( WildMatch( (*it).c_str(), text, false  ) )
			return true;
	}

	return false;
}


/**
	Do a case insensitive find for the pattern.

	\param text The text to match an ignore pattern against.
	\return Returns true if the directory should be ignored, false otherwise.
**/
bool FileGlobBase::MatchIgnorePattern( const char* text )
{
	for ( StringList::iterator it = m_ignorePatterns.begin();
			it != m_ignorePatterns.end(); ++it )
	{
		if ( WildMatch( (*it).c_str(), text, false  ) )
			return true;
	}

	return false;
}


/**
	\internal
	\author Jack Handy

	Borrowed from http://www.codeproject.com/string/wildcmp.asp.
	Modified by Joshua Jensen.
**/
bool WildMatch( const char* pattern, const char *string, bool caseSensitive )
{
	// Handle all the letters of the pattern and the string.
	while ( *string != 0  &&  *pattern != '*' )
	{
		if ( *pattern != '?' )
		{
			if ( caseSensitive )
			{
				if ( *pattern != *string )
					return false;
			}
			else
			{
				if ( toupper( *pattern ) != toupper( *string ) )
					return false;
			}
		}

		pattern++;
		string++;
	}

	const char* mp = NULL;
	const char* cp = NULL;
	while ( *string != 0 )
	{
		if (*pattern == '*')
		{
			// It's a match if the wildcard is at the end.
			if ( *++pattern == 0 )
			{
				return true;
			}

			mp = pattern;
			cp = string + 1;
		}
		else
		{
			if ( caseSensitive )
			{
				if ( *pattern == *string  ||  *pattern == '?' )
				{
					pattern++;
					string++;
				}
				else 
				{
					pattern = mp;
					string = cp++;
				}
			}
			else
			{
				if ( toupper( *pattern ) == toupper( *string )  ||  *pattern == '?' )
				{
					pattern++;
					string++;
				}
				else
				{
					pattern = mp;
					string = cp++;
				}
			}
		}
	}

	// Collapse remaining wildcards.
	while ( *pattern == '*' )
		pattern++;

	return !*pattern;
}


/**
	\internal Simple path splicing (assumes no '/' in either part)
	\author Matthias Wandel (MWandel@rim.net) http://http://www.sentex.net/~mwandel/
**/
static void CatPath(char * dest, const char * p1, const char * p2)
{
	size_t len = strlen( p1 );
	if ( len == 0 )
	{
		strcpy( dest, p2 );
	}
	else
	{
		if ( len + strlen( p2 ) > 200 )
		{
			// Path too long.
			assert( 0 );
		}
		memcpy( dest, p1, len + 1 );
		if ( dest[ len - 1 ] != '/'  &&  dest[ len - 1 ] != ':' )
		{
			dest[ len++ ] = '/';
		}
		strcpy( dest + len, p2 );
	}
}


/**
	\internal Does all the actual globbing.
	\author Matthias Wandel (MWandel@rim.net) http://http://www.sentex.net/~mwandel/
	\author Joshua Jensen (jjensen@workspacewhiz.com)

	Matthias Wandel wrote the original C algorithm, which is contained in
	his Exif Jpeg header parser at http://www.sentex.net/~mwandel/jhead/ under
	the filename MyGlob.c.  It should be noted that the MAJORITY of this
	function is his, albeit rebranded style-wise.

	I have made the following extensions:

	-	Support for ignoring directories.
	-	Perforce-style (and DJGPP-style) ... for recursion, instead of **.
	-	Automatic conversion from ...Stuff to .../*Stuff.  Allows lookup of
		files by extension, too: '....h' translates to '.../*.h'.
	-	Ability to handle forward slashes and backslashes.
	-	A minimal C++ class design.
	-	Wildcard matching not based on FindFirstFile().  Should allow greater
		control in the future and patching in of the POSIX fnmatch() function
		on systems that support it.
**/
void FileGlobBase::GlobHelper( const char* inPattern )
{
	char patternBuf[ _MAX_PATH * 2 ];
	strcpy( patternBuf, inPattern );

DoRecursion:
	char basePath[ _MAX_PATH ];
	char* basePathEndPtr = basePath;
	char* recurseAtPtr = NULL;

	// Split the path into base path and pattern to match against.
	bool hasWildcard = false;

	for ( char* pattern = patternBuf; *pattern != '\0'; ++pattern )
	{
		char ch = *pattern;

		// Is it a '?' ?
		if ( ch == '?' )
			hasWildcard = true;

		// Is it a '*' ?
		else if ( ch == '*' )
		{
			hasWildcard = true;

			// Is there a '**'?
			if ( pattern[ 1 ] == '*' )
			{
				// If we're just starting the pattern or the characters immediately
				// preceding the pattern are a drive letter ':' or a directory path
				// '/', then set up the internals for later recursion.
				if ( pattern == patternBuf  ||  pattern[ -1 ] == '/'  ||
					pattern[ -1 ] == ':')
				{
					char ch2 = pattern[ 2 ];
					if ( ch2 == '/' )
					{
						recurseAtPtr = pattern;
						memcpy(pattern, pattern + 3, strlen( pattern ) - 2 );
					}
					else if ( ch2 == '\0' )
					{
						recurseAtPtr = pattern;
						*pattern = '\0';
					}
				}
			}
		}

		// Is there a '/' or ':' in the pattern at this location?
		if ( ch == '/'  ||  ch == ':' )
		{
			if ( hasWildcard )
				break;
			basePathEndPtr = &basePath[ pattern - patternBuf + 1 ];
		}
	}

	// If there is no wildcard this time, then just add the current file and
	// get out of here.
	if ( !hasWildcard )
	{
		// This should refer to a file.
		FoundMatch( patternBuf );
		return;
	}

	// Did we make it to the end of the pattern?  If so, we should match files,
	// since there were no slashes encountered.
	bool matchFiles = *pattern == '\0';

	// Copy the directory down.
	size_t basePathLen = basePathEndPtr - basePath;
	strncpy( basePath, patternBuf, basePathLen );

	// Copy the wildcard matching string.
	char matchPattern[ _MAX_PATH ];
	size_t matchLen = ( pattern - patternBuf ) - basePathLen;
	strncpy( matchPattern, patternBuf + basePathLen, matchLen + 1 );
	if ( matchPattern[ matchLen ] == '/' )
		matchPattern[ matchLen ] = 0;

	StringList fileList;

	// Do the file search with *.* in the directory specified in basePattern.
	strcpy( basePathEndPtr, "*.*" );

	// Start the find.
	WIN32_FIND_DATA fd;
	HANDLE handle = FindFirstFile( basePath, &fd );

	// Clear out the *.* so we can use the original basePattern string.
	*basePathEndPtr = 0;

	// Any files found?
	if ( handle != INVALID_HANDLE_VALUE )
	{
		for ( ;; )
		{
			// Is the file a directory?
			if ( ( fd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY )  &&  !matchFiles )
			{
				// Do a wildcard match.
				if ( WildMatch( matchPattern, fd.cFileName, false ) )
				{
					// It matched.  Let's see if the file should be ignored.
					bool ignore = false;

					// Knock out "." or ".." if they haven't already been.
					int len = strlen( fd.cFileName );
					fd.cFileName[ len ] = '/';
					fd.cFileName[ len + 1 ] = '\0';

					// See if this is a directory to ignore.
					ignore = MatchIgnorePattern( fd.cFileName );

					fd.cFileName[ len ] = 0;

					// Should this file be ignored?
					if ( !ignore )
					{
						// Nope.  Add it to the linked list.
						fileList.push_back( fd.cFileName );
					}
				}
			}
			else if ( !( fd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY )  &&  matchFiles )
			{
				// Do a wildcard match.
				if ( WildMatch( matchPattern, fd.cFileName, false ) )
				{
					// It matched.  Let's see if the file should be ignored.
					bool ignore = MatchIgnorePattern( fd.cFileName );

					// Is this pattern exclusive?
					if ( !ignore  &&  m_exclusiveFilePatterns.begin() != m_exclusiveFilePatterns.end() )
					{
						ignore = !MatchExclusivePattern( fd.cFileName );
					}

					// Should this file be ignored?
					if ( !ignore )
					{
						// Nope.  Add it to the linked list.
						fileList.push_back( fd.cFileName );
					}
				}
			}

			// Look up the next file.
			if ( !FindNextFile( handle, &fd ) )
				break;
		}

		// Close down the file find handle.
		FindClose( handle );
	}

	// Sort the list.
	fileList.sort();

	// Iterate the file list and either recurse or add the file as a found
	// file.
	if ( !matchFiles )
	{
		for ( StringList::iterator it = fileList.begin(); it != fileList.end(); ++it )
		{
			char combinedName[ _MAX_PATH * 2 ];

			// Need more directories.
			CatPath( combinedName, basePath, (*it).c_str() );
			strcat( combinedName, pattern );
			GlobHelper( combinedName );
		}
	}
	else // if ( !matchFiles )
	{
		for ( StringList::iterator it = fileList.begin(); it != fileList.end(); ++it )
		{
			char combinedName[ _MAX_PATH * 2 ];
			CatPath( combinedName, basePath, (*it).c_str());
			FoundMatch( combinedName );
		}
	}

	// Clear out the file list, so the goto statement below can recurse
	// internally.
	fileList.clear();

	// Do we need to recurse?
	if ( !recurseAtPtr )
		return;

	// Copy in the new recursive pattern to match.
	strcpy( matchPattern, recurseAtPtr );
	strcpy( recurseAtPtr, "*/**/" );
	strcat( patternBuf, matchPattern );

	// As this function context is no longer needed, we can just go back
	// to the top of it to avoid adding another context on the stack.
	goto DoRecursion;
}


By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
Web Developer
United States United States
Joshua Jensen is a gamer at heart and as such, creates games for a living. He has the distinct pleasure of creating titles exclusively for the Xbox.

In his spare time, he maintains a Visual C++ add-in called Workspace Whiz! Find it at http://workspacewhiz.com/.

Comments and Discussions