Click here to Skip to main content
13,092,310 members (59,962 online)
Click here to Skip to main content
Add your own
alternative version

Tagged as


5 bookmarked
Posted 25 Mar 2010

Strip HTML Tags from Text

, 25 Mar 2010
Rate this:
Please Sign up or sign in to vote.
Have you ever wondered how you would show on a web form text that is stripped of HTML tags but you still want to accept HTML tags when saving to your form? Sounds confusing? To make it clear, I will give a good example...

Have you ever wondered how you would show on a web form text that is stripped of HTML tags but you still want to accept HTML tags when saving to your form? Sounds confusing?

To make it clear, I will give a good example.  Let's say you have a form with a Rich Text Box (FTB or FCK) into which users can paste content that contains HTML tags, so that it displays properly as, say, a Blog Article. In some places, however, you want that content stripped of its HTML tags, such as when displaying a summary in a Grid.  My solution is to strip the HTML tags just before displaying the text wherever plain text is needed.

So when you copy this...

Hello, World! won't show on your grid as this:

 Hello World
 <font size ="4" color="blue">
 Hello, World!

but as this:

"Hello World!"

Now, with a mix of Replace and Regular Expressions, I created a method to handle that, and here it is:

/// <summary>
/// Converts an HTML fragment into plain text by applying, in ascending iID order,
/// the regular-expression rules returned by GetTableDefinition().
/// </summary>
/// <param name="sInputString">The HTML text to clean. May be null or empty.</param>
/// <returns>
/// The text with tags removed and entities replaced according to the rule table,
/// or the input itself, unchanged, when it is null or empty.
/// </returns>
public string StripHTML(string sInputString)
{
    // Nothing to clean. Returning the input as-is keeps the "give back the
    // original string" fallback for null instead of throwing.
    if (string.IsNullOrEmpty(sInputString))
    {
        return sInputString;
    }

    // Initial cleaning step: source line breaks become spaces and source tabs
    // are dropped, so every "\r" or "\t" seen later was produced by a rule.
    string sOutputString = sInputString
        .Replace("\r", " ")
        .Replace("\n", " ")
        .Replace("\t", string.Empty);

    // Tag removal. The sort lives on DefaultView, so the view (not Rows) must
    // be enumerated for the iID ordering to actually apply.
    DataTable myDataTable = GetTableDefinition();
    myDataTable.DefaultView.Sort = "iID ASC";
    foreach (DataRowView drCleaningItem in myDataTable.DefaultView)
    {
        string sOriginalString = drCleaningItem["sOriginalString"].ToString();
        string sReplacementString = drCleaningItem["sReplacementString"].ToString();
        sOutputString = Regex.Replace(
            sOutputString, sOriginalString, sReplacementString, RegexOptions.IgnoreCase);
    }

    // Cap runs of generated whitespace in a single pass each: three or more
    // line breaks collapse to two, five or more tabs collapse to four.
    sOutputString = Regex.Replace(sOutputString, "\r{3,}", "\r\r");
    sOutputString = Regex.Replace(sOutputString, "\t{5,}", "\t\t\t\t");

    return sOutputString;
}

 /// <summary>
 /// Builds the ordered table of clean-up rules used to strip HTML from text.
 /// </summary>
 /// <returns>
 /// A DataTable with the columns iID (int, the order in which rules are meant
 /// to run), sOriginalString (a regular-expression pattern) and
 /// sReplacementString (the text substituted for each match).
 /// </returns>
 private DataTable GetTableDefinition()
 {
     DataTable dtCleaningCollection = new DataTable();
     dtCleaningCollection.Columns.Add("iID", typeof(int));
     dtCleaningCollection.Columns.Add("sOriginalString", typeof(string));
     dtCleaningCollection.Columns.Add("sReplacementString", typeof(string));

     // Replace repeating spaces with a single space
     dtCleaningCollection.Rows.Add(1, @"( )+", " ");

     // Normalise the head tags, then drop the head and its contents.
     // \b keeps "<header>" from being mistaken for "<head>"; the lazy ".*?"
     // stops at the first closing tag instead of the last one.
     dtCleaningCollection.Rows.Add(2, @"<( )*head\b([^>])*>", "<head>");
     dtCleaningCollection.Rows.Add(3, @"(<( )*(/)( )*head( )*>)", "</head>");
     dtCleaningCollection.Rows.Add(4, "(<head>).*?(</head>)", string.Empty);

     // Normalise the script tags, then drop each script block on its own so
     // that text sitting between two blocks survives.
     dtCleaningCollection.Rows.Add(5, @"<( )*script([^>])*>", "<script>");
     dtCleaningCollection.Rows.Add(6, @"(<( )*(/)( )*script( )*>)", "</script>");
     dtCleaningCollection.Rows.Add(7, @"(<script>).*?(</script>)", string.Empty);

     // Same treatment for style blocks
     dtCleaningCollection.Rows.Add(8, @"<( )*style([^>])*>", "<style>");
     dtCleaningCollection.Rows.Add(9, @"(<( )*(/)( )*style( )*>)", "</style>");
     dtCleaningCollection.Rows.Add(10, "(<style>).*?(</style>)", string.Empty);

     // Replace <td> with a tab
     dtCleaningCollection.Rows.Add(11, @"<( )*td([^>])*>", "\t");

     // Replace <br> and <li> with a line break. Attributes and the
     // self-closing forms (<br/>, <br />) are accepted; \b keeps <link> out.
     dtCleaningCollection.Rows.Add(12, @"<( )*br\b([^>])*>", "\r");
     dtCleaningCollection.Rows.Add(13, @"<( )*li\b([^>])*>", "\r");

     // Replace <p>, <div> and <tr> with a double line break
     dtCleaningCollection.Rows.Add(14, @"<( )*div([^>])*>", "\r\r");
     dtCleaningCollection.Rows.Add(15, @"<( )*tr([^>])*>", "\r\r");
     dtCleaningCollection.Rows.Add(16, @"<( )*p([^>])*>", "\r\r");

     // Remove any remaining tags enclosed in < >
     dtCleaningCollection.Rows.Add(17, @"<[^>]*>", string.Empty);

     // Replace special characters. The non-breaking space is matched both as
     // the entity and as the raw U+00A0 character.
     dtCleaningCollection.Rows.Add(18, @"&nbsp;|\u00A0", " ");
     dtCleaningCollection.Rows.Add(19, @"&bull;", " * ");
     dtCleaningCollection.Rows.Add(20, @"&lsaquo;", "<");
     dtCleaningCollection.Rows.Add(21, @"&rsaquo;", ">");
     dtCleaningCollection.Rows.Add(22, @"&trade;", "(tm)");
     dtCleaningCollection.Rows.Add(23, @"&frasl;", "/");
     dtCleaningCollection.Rows.Add(24, @"&lt;", "<");
     dtCleaningCollection.Rows.Add(25, @"&gt;", ">");
     dtCleaningCollection.Rows.Add(26, @"&copy;", "(c)");
     dtCleaningCollection.Rows.Add(27, @"&reg;", "(r)");
     dtCleaningCollection.Rows.Add(28, @"&frac14;", "1/4");
     dtCleaningCollection.Rows.Add(29, @"&frac12;", "1/2");
     dtCleaningCollection.Rows.Add(30, @"&frac34;", "3/4");
     dtCleaningCollection.Rows.Add(31, @"&lsquo;", "'");
     dtCleaningCollection.Rows.Add(32, @"&rsquo;", "'");
     dtCleaningCollection.Rows.Add(33, @"&ldquo;", "\"");
     dtCleaningCollection.Rows.Add(34, @"&rdquo;", "\"");

     // Remove all other remaining entities that have no replacement above.
     // Only real entity syntax (&name; or &#digits;) is matched, so ordinary
     // text such as "Tom & Jerry; ..." is left alone. &amp; is skipped here
     // and decoded last, so an escaped entity like "&amp;lt;" ends up as the
     // literal text "&lt;" rather than being decoded twice or deleted.
     dtCleaningCollection.Rows.Add(35, @"&(?!amp;)(#?[A-Za-z0-9]{2,8});", string.Empty);
     dtCleaningCollection.Rows.Add(36, @"&amp;", "&");

     // Remove extra line breaks and tabs
     dtCleaningCollection.Rows.Add(37, "(\r)( )+(\r)", "\r\r");
     dtCleaningCollection.Rows.Add(38, "(\t)( )+(\t)", "\t\t");
     dtCleaningCollection.Rows.Add(39, "(\t)( )+(\r)", "\t\r");
     dtCleaningCollection.Rows.Add(40, "(\r)( )+(\t)", "\r\t");
     dtCleaningCollection.Rows.Add(41, "(\r)(\t)+(\r)", "\r\r");
     dtCleaningCollection.Rows.Add(42, "(\r)(\t)+", "\r\t");

     return dtCleaningCollection;
 }


This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


About the Author

Raymund Macaalay
Technical Lead
New Zealand New Zealand

You may also be interested in...

Comments and Discussions

QuestionIs it possible to format the text output using the html tags? Pin
andre1234526-Mar-10 2:44
memberandre1234526-Mar-10 2:44 
GeneralHumm... Pin
andre1234525-Mar-10 2:27
memberandre1234525-Mar-10 2:27 
GeneralRe: Humm... Pin
Raymund Macaalay25-Mar-10 8:52
memberRaymund Macaalay25-Mar-10 8:52 
GeneralVery nice Pin
Mohammad Elsheimy25-Mar-10 2:11
memberMohammad Elsheimy25-Mar-10 2:11 

General General    News News    Suggestion Suggestion    Question Question    Bug Bug    Answer Answer    Joke Joke    Praise Praise    Rant Rant    Admin Admin   

Use Ctrl+Left/Right to switch messages, Ctrl+Up/Down to switch threads, Ctrl+Shift+Left/Right to switch pages.

Permalink | Advertise | Privacy | Terms of Use | Mobile
Web04 | 2.8.170813.1 | Last Updated 25 Mar 2010
Article Copyright 2010 by Raymund Macaalay
Everything else Copyright © CodeProject, 1999-2017
Layout: fixed | fluid