Click here to Skip to main content
15,886,737 members
Articles / Programming Languages / C#

String Extension Method - Mixed Alpha/Numeric Sort Order

Rate me:
Please Sign up or sign in to vote.
4.92/5 (9 votes)
17 Feb 2009CPOL7 min read 80.1K   753   18  
Uses .NET extension method as an example on how to sort mixed alpha/numeric text
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

namespace TrentTobler.AlphaNumericSort
{
    public static class StringExtensions
    {
        #region Fields

        // Use a regular expression to simplify searching for 'integer' strings.
        static Regex integerRegex = new Regex(
            "[0-9]+",
            RegexOptions.Compiled
            | RegexOptions.ExplicitCapture);

        // Use a regular expression to simpify searching for mixed 'integer' and hex strings.
        static Regex integerAndHexRegex = new Regex(
            "((?<=0x)[0-9A-F]+)|([0-9]+)",
            RegexOptions.Compiled
            | RegexOptions.ExplicitCapture
            | RegexOptions.CultureInvariant
            | RegexOptions.IgnoreCase );

        #endregion

        #region Implementation

        private static void AppendLengthCode( StringBuilder sb, int len )
        {
            // I expect most integer sequences to be 9 or fewer digits.  This allows a single digit prefix to determine the
            // relative order of these sequences.
            const int largestSmallMagnitude = 9;
            if( len <= largestSmallMagnitude )
            {
                // Append the single digit prefix to give overall order for the integer segment.
                sb.Append( "012345678"[( len - 1 )] );
            }
            else
            {
                // The sequence is longer than 9 digits.  Code with the special 'this is a long sequence' code that orders these numbers after the shorter
                // numbers.
                sb.Append( '9' );

                // Get a 'length' code.  This is how long the digit sequence is.  Can subtract the largest small magnitude to normalize it to 0 for the smallest 'large' magnitude.
                string lenText = ( len - largestSmallMagnitude - 1 ).ToString();

                // double code the length.
                // This allows coding up to 10^9, without consuming too much extra space (log(n) space)
                sb.Append( lenText.Length.ToString() );
                sb.Append( lenText );
            }
        }

        #endregion

        #region String Extension Methods

        /// <summary>
        /// Get a string token that will sort correctly given mixed numeric (integer) and text strings.
        /// </summary>
        /// <param name="s">The string to convert to an alpha-numeric order token</param>
        /// <returns>A token that when ordered, will give ascending order for text intepreted with integer components.</returns>
        public static string GetAlphaNumericOrderToken( this string s )
        {
            return GetAlphaNumericOrderToken( s, false );
        }

        /// <summary>
        /// Get a string token that will sort correctly given mixed numeric (integer) and text strings.
        /// </summary>
        /// <param name="s">The string to convert to an alpha-numeric order token</param>
        /// <param name="ignoreZeroPadding">Indication whether to ignore left zero padding when generating token for lexical ordering of numeric values.</param>
        /// <returns>A token that when ordered, will give ascending order for text intepreted with integer components.</returns>
        public static string GetAlphaNumericOrderToken( this string s, bool includeHexPatterns )
        {
            return GetAlphaNumericOrderToken( s, includeHexPatterns, true );
        }

        /// <summary>
        /// Get a string token that will sort correctly given mixed numeric (integer) and text strings.
        /// </summary>
        /// <param name="s">The string to convert to an alpha-numeric order token</param>
        /// <param name="ignoreZeroPadding">Indication whether to ignore left zero padding when generating token for lexical ordering of numeric values.</param>
        /// <returns>A token that when ordered, will give ascending order for text intepreted with integer components.</returns>
        public static string GetAlphaNumericOrderToken( this string s, bool includeHexPatterns, bool ignoreZeroPadding )
        {
            // Special case for null.
            if( s == null )
            {
                return s;
            }

            // Any sequence of digits [0-9] needs to be coded to ensure it is ordered correctly.
            MatchCollection matches;
            if( includeHexPatterns )
            {
                // Use the combination hex/integer patterns.
                matches = integerAndHexRegex.Matches( s );
            }
            else
            {
                // Use the integer only hex/integer patterns.
                matches = integerRegex.Matches( s );
            }
            if( matches.Count == 0 )
            {
                // No embedded integer text.  No transform is needed.
                return s;
            }

            // Create a string builder.  The size estimate assumes most numbers will be fewer than 9 digits.
            StringBuilder sb = new StringBuilder( s.Length + matches.Count * 2 );

            // Create a string builder to append terminal zero padding determinations; assume 1 per match.
            StringBuilder zeroPadOrder = null;
            if( ignoreZeroPadding )
                zeroPadOrder = new StringBuilder( matches.Count );

            int pos = 0;
            foreach( Match match in matches )
            {
                // Add text preceding the integer text.
                sb.Append( s, pos, match.Index - pos );

                // Encode the length of integer text, skipping any initial zeros.
                int len = match.Length;

                // Don't count any '0' prefix characters in the length.
                int zeros = 0;
                if( ignoreZeroPadding )
                {
                    while( zeros + 1 < len && s[match.Index + zeros] == '0' )
                    {
                        ++zeros;
                    }
                    len -= zeros;
                }

                // Append length code, the number (without zeros), and the zero length code.
                AppendLengthCode( sb, len );
                sb.Append( s, match.Index + zeros, match.Length - zeros );
                if( ignoreZeroPadding )
                {
                    AppendLengthCode( zeroPadOrder, zeros + 1 );
                }

                // Set the position for non-numeric text.
                pos = match.Index + match.Length;
            }

            // Add any remaining text.
            sb.Append( s, pos, s.Length - pos );

            // Add zero pad order.
            if( ignoreZeroPadding )
            {
                sb.Append( " " );
                sb.Append( zeroPadOrder );
            }

            // Return the token for ordering the string.
            return sb.ToString();
        }

        #endregion
    }
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Software Developer (Senior)
United States United States
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions