Click here to Skip to main content
15,892,575 members
Articles / Programming Languages / C#

Wrapper Class for Parsing Fixed-Width, Multiple Section Files

Rate me:
Please Sign up or sign in to vote.
4.07/5 (4 votes)
21 Apr 2006 · CPOL · 8 min read · 51.6K · 1.1K · 33
An article describing a wrapper class for importing very large multiple-section reports, typically produced by a legacy system, into SQL Server or another modern RDBMS.
using System;
using System.IO;
using System.Text;
using System.Collections;
using System.Data;
using System.Text.RegularExpressions ;

namespace ReportParser
{
	public class FileParser
	{
		/// <summary>
		/// Schema object created by the constructors; its Sections drive table creation and parsing.
		/// </summary>
    	private FileSchema m_FileSchema = null;
		// Path of the data file; ParseToDataSet opens it with a StreamReader.
		private string m_FileName = "";
		// Backing field for the SchemaFile property.
		private string m_SchemaFile = "";
		// Backing field for CurrentLineNumber.
		// NOTE(review): nothing in this class assigns it after this initializer.
		private int m_CurrentLineNumber = 0;
		// Debugging aid: ParseToDataSet writes a Debug message when a line equals this value.
		// NOTE(review): never assigned in this class, so it can only ever match empty lines.
		private string LineToFind= "";

		/// <summary>
		/// Creates a parser for the given data file, paired with a FileSchema built by
		/// the FileSchema parameterless constructor.
		/// </summary>
		/// <param name="fileName">Path of the data file to parse.</param>
		public FileParser(string fileName)
		{
			this.m_FileSchema = new FileSchema();
			this.m_FileName = fileName;
		}

		/// <summary>
		/// Creates a parser for the given data file whose schema is built from a schema file.
		/// </summary>
		/// <param name="fileName">Path of the data file to parse.</param>
		/// <param name="schemaFile">Path handed to the FileSchema constructor.</param>
		public FileParser(string fileName, string schemaFile)
		{
			m_FileName = fileName;
			// Remember the schema path so the SchemaFile property reports what was actually used.
			m_SchemaFile = schemaFile;
			m_FileSchema = new FileSchema(schemaFile);
		}

		/// <summary>
		/// Gets or sets the path of the data file that ParseToDataSet opens.
		/// </summary>
		public string FileName
		{
			get
			{
				return this.m_FileName;
			}
			set
			{
				this.m_FileName = value;
			}
		}

		/// <summary>
		/// Gets or sets the schema file path stored on this parser.
		/// Assigning it only replaces the stored string; the setter does not touch the schema object.
		/// </summary>
		public string SchemaFile
		{
			get
			{
				return this.m_SchemaFile;
			}
			set
			{
				this.m_SchemaFile = value;
			}
		}

		/// <summary>
		/// Gets the value of the line-number counter field.
		/// NOTE(review): no member of this class assigns the counter after its initial 0 - confirm
		/// whether parsing was meant to update it.
		/// </summary>
		public int CurrentLineNumber
		{
			get
			{
				return this.m_CurrentLineNumber;
			}
		}

		/// <summary>
		/// Strips leading and trailing white space from every element of the array, in place.
		/// </summary>
		/// <param name="fields">Array whose elements are replaced by their trimmed form.</param>
		private void TrimFields(ref string[] fields)
		{
			int index = 0;
			while (index < fields.Length)
			{
				string trimmed = fields[index].Trim();
				fields[index] = trimmed;
				index++;
			}
		}

		/// <summary>
		/// Reads the data file section by section, cycling through the schema's sections in order,
		/// and loads the extracted field values into a DataSet that has one table per section.
		/// </summary>
		/// <returns>The DataSet created from the schema, filled with the rows that were read.</returns>
		/// <exception cref="ApplicationException">
		/// Thrown when any error occurs while sections are read or rows are added. The message names
		/// the field and line being processed; the original exception is kept as InnerException.
		/// </exception>
		public DataSet ParseToDataSet()
		{
			DataSet oDS = DataSetFromSchema();

			// The using block closes the reader on every exit path, including a failed parse.
			using (StreamReader reader = new StreamReader(m_FileName))
			{
				int iSection = 0;					// Index of the section currently being read.
				string[] Lines = null;				// Lines returned by ReadSection for the current section.
				int CurrentLine;					// Line index inside the current section.
				DataRow prevRow = null;				// Most recently added row; source for carried-over fields.
				DataRow parentRow = null;			// Row of the first section that later rows are attached to.
				string CurrentFieldName = "";		// Diagnostic only: field being processed.
				string CurrentLineValue = "";		// Diagnostic only: line being processed.
				Section oSection = m_FileSchema.Sections[0];	// Start with the first section.

				try
				{
					while (ReadSection(reader, oSection.Length, oSection, ref Lines))
					{
						CurrentLine = 0;
						foreach (string Line in Lines)
						{
							CurrentLineValue = Line;

							// Debugging hook: fires when the line equals the LineToFind field.
							if (LineToFind == CurrentLineValue)
								System.Diagnostics.Debug.WriteLine("We are Here!!!");

							foreach (TextField tf in oSection.TextFields)
							{
								CurrentFieldName = tf.Name;
								if (tf.LineNumber == CurrentLine)
								{
									// A positive StartIndex means the value is cut out of the current line.
									// Substring throws when the line is shorter than StartIndex + Length;
									// that error is reported through the catch block below.
									// NOTE(review): a StartIndex of 0 is treated as "carry over", not as
									// column 0 - confirm this is intended.
									if (tf.StartIndex > 0)
										tf.Value = Line.Substring(tf.StartIndex, tf.Length).Trim();
									else if (iSection > 0)
										// Sections after the first copy the value from the previous row.
										tf.Value = prevRow[tf.Name];
								}
							}

							if (oSection.Length > 0)
								CurrentLine++;		// Fixed-length section: all its lines form one row.
							else
								prevRow = AddRow(oDS, oSection, parentRow);	// Length 0: every line is its own row.
						}

						// A fixed-length section contributes a single row once all its lines are processed.
						if (oSection.Length > 0)
							prevRow = AddRow(oDS, oSection, parentRow);

						// The row of the first section becomes the parent for the rows that follow.
						if (iSection == 0)
							parentRow = prevRow;

						// Move to the next section, wrapping back to the first after the last one.
						iSection++;
						if (iSection > (m_FileSchema.Sections.Count - 1))
							iSection = 0;
						oSection = m_FileSchema.Sections[iSection];
					}
					return oDS;
				}
				catch (Exception ex)
				{
					string strMessage = "FileParser.ParseToDataSet: The following error occured: " + ex.Message
						+ "\nStack Trace: " + ex.StackTrace
						+ "\nCurrent Field: " + CurrentFieldName
						+ "\nText of Line Causing Error:\n\"" + CurrentLineValue + "\"";
					// Keep the original exception reachable for callers that need its type or full trace.
					throw new ApplicationException(strMessage, ex);
				}
			}
		}

		/// <summary>
		/// Creates the DataSet skeleton: one table per schema section, in schema order, with
		/// AppendDataRelation invoked for every table right after it is added.
		/// </summary>
		/// <returns>A DataSet containing the empty tables and any relations that were created.</returns>
		private DataSet DataSetFromSchema()
		{
			DataSet result = new DataSet();
			foreach (Section current in m_FileSchema.Sections)
			{
				// The table about to be added lands at this index; index 0 is the header table.
				int position = result.Tables.Count;
				bool isFirstSection = (position == 0);

				result.Tables.Add(MakeTable(current, isFirstSection));
				AppendDataRelation(result, position);
			}
			return result;
		}

		/// <summary>
		/// Links table <paramref name="i"/> to the first table of the DataSet when it has a column
		/// with the same name as the first table's first column. Does nothing while the DataSet
		/// holds only one table or when no such column exists.
		/// </summary>
		/// <param name="oDS">DataSet whose tables are examined and which receives the relation.</param>
		/// <param name="i">Index of the candidate child table.</param>
		private void AppendDataRelation(DataSet oDS, int i)
		{
			if (oDS.Tables.Count <= 1)
				return;

			DataTable headerTable = oDS.Tables[0];
			DataColumn keyColumn = headerTable.Columns[0];
			DataTable candidateTable = oDS.Tables[i];

			if (candidateTable.Columns.IndexOf(keyColumn.ColumnName) == -1)
				return;

			// Relation name is "<first table>_<child table>".
			string linkName = headerTable.TableName + "_" + candidateTable.TableName;
			DataColumn matchingColumn = candidateTable.Columns[keyColumn.ColumnName];
			DataRelation link = new DataRelation(linkName, keyColumn, matchingColumn);

			// Nested is set before the relation is added, as in the original author's note
			// that it should only affect XML output.
			link.Nested = true;

			oDS.Relations.Add(link);
		}

		/// <summary>
		/// Builds an empty DataTable for a section: the table is named after the section and
		/// receives one column per text field, typed from the field's TypeCode.
		/// </summary>
		/// <param name="Section">Section that supplies the table name and the field definitions.</param>
		/// <param name="isHeader">
		/// True for the first section; its fields with a StartIndex of -1 become auto-increment
		/// columns with seed -1 and step -1.
		/// </param>
		/// <returns>The new table; it is not added to any DataSet here.</returns>
		private DataTable MakeTable(Section Section, bool isHeader)
		{
			DataTable dt = new DataTable();
			dt.TableName = Section.Name;
			foreach (TextField field in Section.TextFields)
			{
				DataColumn column = new DataColumn(field.Name);

				// Type codes without a mapping leave the column at the DataColumn default type.
				Type columnType = TypeFromTypeCode(field.DataType);
				if (columnType != null)
					column.DataType = columnType;

				if (isHeader && (field.StartIndex == -1))
				{
					column.AutoIncrement = true;
					column.AutoIncrementStep = -1;
					column.AutoIncrementSeed = -1;
				}

				dt.Columns.Add(column);
			}

			return dt;
		}

		/// <summary>
		/// Maps a TypeCode to the matching System.Type, including the signed-byte and unsigned
		/// integer codes. Returns null for codes that have no column type (Empty, DBNull).
		/// </summary>
		private static Type TypeFromTypeCode(TypeCode code)
		{
			switch (code)
			{
				case TypeCode.Boolean:	return typeof(bool);
				case TypeCode.Byte:		return typeof(byte);
				case TypeCode.SByte:	return typeof(sbyte);
				case TypeCode.Char:		return typeof(char);
				case TypeCode.DateTime:	return typeof(DateTime);
				case TypeCode.Decimal:	return typeof(decimal);
				case TypeCode.Double:	return typeof(double);
				case TypeCode.Single:	return typeof(float);
				case TypeCode.Int16:	return typeof(short);
				case TypeCode.Int32:	return typeof(int);
				case TypeCode.Int64:	return typeof(long);
				case TypeCode.UInt16:	return typeof(ushort);
				case TypeCode.UInt32:	return typeof(uint);
				case TypeCode.UInt64:	return typeof(ulong);
				case TypeCode.Object:	return typeof(object);
				case TypeCode.String:	return typeof(string);
				default:				return null;
			}
		}

		/// <summary>
		/// Creates a row in the section's table from the current field values, attaches it to the
		/// parent row when one applies, and resets every field value to an empty string.
		/// </summary>
		/// <param name="oDS">DataSet holding the section tables and their relations.</param>
		/// <param name="oSection">Section whose fields supply the values; its Name selects the table.</param>
		/// <param name="parentRow">Row to attach as parent, or null when there is none.</param>
		/// <returns>The row that was added to the table.</returns>
		private DataRow AddRow(DataSet oDS, Section oSection, DataRow parentRow)
		{
			DataTable oDT = oDS.Tables[oSection.Name];
			DataRow oDR = oDT.NewRow();
			foreach (TextField field in oSection.TextFields)
			{
				if (oDT.Columns[field.Name].AutoIncrement == false)
					oDR[field.Name] = field.Value;
				else
					parentRow = null;	// A table with an auto-increment column gets no parent row.
				field.Value = "";		// Clear the value so it cannot leak into the next row.
			}
			if (parentRow != null)
			{
				foreach (DataRelation oDRN in oDS.Relations)
				{
					// Pick the relation by its child table rather than by searching the relation
					// name for the section name: a name search selects the wrong relation when one
					// section name is contained in another.
					if (oDRN.ChildTable == oDT)
					{
						oDR.SetParentRow(parentRow, oDRN);
						break;
					}
				}
			}
			oDT.Rows.Add(oDR);
			return oDR;
		}

		/// <summary>
		/// Reads the lines of the next occurrence of a section from the reader.
		/// </summary>
		/// <param name="reader">Reader positioned where the section is expected to start.</param>
		/// <param name="Length">
		/// Number of lines to read for a fixed-length section, or 0 to read lines until one starts
		/// with '0' or the file ends.
		/// </param>
		/// <param name="oSection">
		/// Section being read. Its SectionFormat and EndString are used for fixed-length sections;
		/// its ParentSection is re-read when a line starting with '1' interrupts a Length 0 section.
		/// </param>
		/// <param name="Lines">
		/// Receives the lines that were read. Left unchanged when the method returns false because
		/// the reader was already at the end of the file.
		/// </param>
		/// <returns>
		/// For Length 0: true when at least one line was read. For a fixed length: true when exactly
		/// <paramref name="Length"/> lines were read and, for a header section, the last line is
		/// matched in full by the section's EndString pattern. False when the end of the file is
		/// reached first.
		/// </returns>
		/// <exception cref="ApplicationException">
		/// Thrown when no line could be read although the file has not ended, which happens when a
		/// Length 0 section starts directly with a '0' line.
		/// </exception>
		private bool ReadSection(StreamReader reader, int Length, Section oSection, ref string[] Lines)
		{
			// Lines are collected in a list instead of being joined with '|' and split again,
			// so a data line that itself contains '|' stays in one piece.
			ArrayList collected = new ArrayList();

			if (Length > 0)	// Fixed number of lines.
			{
				while ((reader.Peek() != -1) && (collected.Count < Length))
					collected.Add(reader.ReadLine());
			}
			else			// Variable number of lines.
			{
				while (reader.Peek() != -1)
				{
					char marker = (char)reader.Peek();
					if (marker == '0')	// TODO: Should be reading from the configuration.
						break;

					if (marker == '1')
					{
						// Continuation header: consume it by reading the parent section again.
						// Lines still holds the previously read section at this point, so its
						// length is used as the number of lines to consume.
						// NOTE(review): this throws if Lines is null, i.e. if a Length 0 section
						// were ever the first one read - confirm schemas rule that out.
						ReadSection(reader, Lines.Length, oSection.ParentSection, ref Lines);
					}
					else
					{
						collected.Add(reader.ReadLine());
					}
				}
			}

			if (collected.Count == 0)
			{
				// Nothing left in the file: report "no more sections" instead of failing.
				if (reader.Peek() == -1)
					return false;

				// An empty section in the middle of the file stays an error, as it was before.
				throw new ApplicationException("FileParser.ReadSection: no lines could be read for section \""
					+ oSection.Name + "\".");
			}

			Lines = (string[])collected.ToArray(typeof(string));

			if (Length == 0)
				return true;	// At least one line was read.

			if (oSection.SectionFormat == SectionFormat.Header)
			{
				// A header is accepted only when its last line is matched completely by EndString.
				string lastLine = Lines[Lines.Length - 1];
				Regex regex = new Regex(oSection.EndString);
				Match m = regex.Match(lastLine);
				if (m.Success && m.Length == lastLine.Length)
					return (Lines.Length == Length);

				// Not a valid header: skip ahead to the next line starting with '1' and try again.
				while ((reader.Peek() != -1) && ((char)reader.Peek() != '1'))
					reader.ReadLine();

				if (reader.Peek() != -1)
					return ReadSection(reader, Length, oSection, ref Lines);

				return false;
			}

			// Any other format is handled as a footer: discard what follows up to the next '1' line.
			while ((reader.Peek() != -1) && ((char)reader.Peek() != '1'))
				reader.ReadLine();

			return (Lines.Length == Length);
		}

		/// <summary>
		/// Gets or sets the FilePath value of the underlying file schema.
		/// </summary>
		public string FilePath
		{
			get
			{
				return this.m_FileSchema.FilePath;
			}
			set
			{
				this.m_FileSchema.FilePath = value;
			}
		}

		/// <summary>
		/// Gets or sets the delimiter character held by the underlying file schema.
		/// </summary>
		public char Delimeter
		{
			get
			{
				return this.m_FileSchema.Delimeter;
			}
			set
			{
				this.m_FileSchema.Delimeter = value;
			}
		}

		/// <summary>
		/// Gets or sets the quote character held by the underlying file schema.
		/// </summary>
		public char QuoteCharacter
		{
			get
			{
				return this.m_FileSchema.QuoteCharacter;
			}
			set
			{
				this.m_FileSchema.QuoteCharacter = value;
			}
		}

		/// <summary>
		/// Gets or sets the section collection of the underlying file schema.
		/// </summary>
		public SectionCollection Sections
		{
			get
			{
				return this.m_FileSchema.Sections;
			}
			set
			{
				this.m_FileSchema.Sections = value;
			}
		}
	}
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Web Developer
United States United States
Tampa, FL developer with about 11 years of experience.

Comments and Discussions