
Neural Network for Recognition of Handwritten Digits in C#

14 Mar 2012 · MIT license · 9 min read
This article is an example of an artificial neural network designed to recognize handwritten digits.
using System;
using ArchiveSerialization;
using System.Threading;
namespace NeuralNetworkLibrary
{
    // Layer class
    public class NNLayer : IArchiveSerialization
    {
        public NNWeightList m_Weights;
        public NNNeuronList m_Neurons;

        public string label;
        public NNLayer m_pPrevLayer;
        private SigmoidFunction m_sigmoid;
        bool m_bFloatingPointWarning;  // flag for one-time warning (per layer) about potential floating point overflow
        public NNLayer()
        {
            label = "";
            m_pPrevLayer = null;
            m_sigmoid = new SigmoidFunction();
            m_Weights = new NNWeightList();
            m_Neurons = new NNNeuronList();
            Initialize();

        }
        public NNLayer(string str, NNLayer pPrev /* =NULL */)
        {
            label = str;
            m_pPrevLayer = pPrev;
            m_sigmoid = new SigmoidFunction();
            m_Weights = new NNWeightList();
            m_Neurons = new NNNeuronList();
        }
        public void Initialize()
        {
            m_Weights.Clear();
            m_Neurons.Clear();
            m_bFloatingPointWarning = false;
        }
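        // Forward pass computed by Calculate() below, restated in the notation used in Backpropagate():
        //   Yn = Wn . Xnm1   (the first connection of every neuron carries the bias weight)
        //   Xn = F(Yn)       where F is the squashing function applied by m_sigmoid.SIGMOID()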
        public void Calculate()
        {
            foreach (var nit in m_Neurons)
            {
                double dSum = 0;
                foreach (var cit in nit.m_Connections)
                {
                    if (cit == nit.m_Connections[0])
                    {
                        dSum = m_Weights[(int)cit.WeightIndex].value;
                    }
                    else
                    {
                        dSum += m_Weights[(int)cit.WeightIndex].value *
                                m_pPrevLayer.m_Neurons[(int)cit.NeuronIndex].output;
                    }
                }

                nit.output = m_sigmoid.SIGMOID(dSum);
            }
        }
        /////////////
        public void Backpropagate(DErrorsList dErr_wrt_dXn /* in */,
                            DErrorsList dErr_wrt_dXnm1 /* out */,
                            NNNeuronOutputs thisLayerOutput,  // memorized values of this layer's output
                            NNNeuronOutputs prevLayerOutput,  // memorized values of previous layer's output
                            double etaLearningRate)
        {
            // nomenclature (repeated from NeuralNetwork class):
            //
            // Err is output error of the entire neural net
            // Xn is the output vector on the n-th layer
            // Xnm1 is the output vector of the previous layer
            // Wn is the vector of weights of the n-th layer
            // Yn is the activation value of the n-th layer, i.e., the weighted sum of inputs BEFORE the squashing function is applied
            // F is the squashing function: Xn = F(Yn)
            // F' is the derivative of the squashing function
            //   Conveniently, for F = tanh, then F'(Yn) = 1 - Xn^2, i.e., the derivative can be calculated from the output, without knowledge of the input
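            //
            // The code below implements these steps, in order:
            //   dErr_wrt_dYn    = F'(Yn) * dErr_wrt_dXn
            //   dErr_wrt_dWn   += Xnm1   * dErr_wrt_dYn     (accumulated per connection)
            //   dErr_wrt_dXnm1 += Wn     * dErr_wrt_dYn     (handed back to the previous layer)
            //   Wn              = Wn - ( eta / (diagHessian + micron) ) * dErr_wrt_dWn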
            try
            {
                int ii, jj;
                uint kk;
                int nIndex;
                double output;
                DErrorsList dErr_wrt_dYn = new DErrorsList(m_Neurons.Count);
                //
                //	std::vector< double > dErr_wrt_dWn( m_Weights.size(), 0.0 );  // important to initialize to zero
                //////////////////////////////////////////////////
                //
                ///// DESIGN TRADEOFF: REVIEW !!
                // We would prefer (for ease of coding) to use STL vector for the array "dErr_wrt_dWn", which is the 
                // differential of the current pattern's error wrt weights in the layer.  However, for layers with
                // many weights, such as fully-connected layers, there are also many weights.  The STL vector
                // class's allocator is remarkably stupid when allocating large memory chunks, and causes a remarkable 
                // number of page faults, with a consequent slowing of the application's overall execution time.

                // To fix this, I tried using a plain-old C array, by new'ing the needed space from the heap, and 
                // delete[]'ing it at the end of the function.  However, this caused the same number of page-fault
                // errors, and did not improve performance.

                // So I tried a plain-old C array allocated on the stack (i.e., not the heap).  Of course I could not
                // write a statement like 
                //    double dErr_wrt_dWn[ m_Weights.size() ];
                // since the compiler insists upon a compile-time known constant value for the size of the array.  
                // To avoid this requirement, the original C++ implementation used the _alloca function to allocate
                // memory on the stack.  The downside of this is excessive stack usage, and there might be stack
                // overflow problems.  That's why this comment is labeled "REVIEW".
                // (This C# port simply allocates a managed double[] below.)
                double[] dErr_wrt_dWn = new double[m_Weights.Count];
                for (ii = 0; ii < m_Weights.Count; ii++)
                {
                    dErr_wrt_dWn[ii] = 0.0;
                }

                bool bMemorized = (thisLayerOutput != null) && (prevLayerOutput != null);
                // calculate dErr_wrt_dYn = F'(Yn) * dErr_wrt_Xn

                for (ii = 0; ii < m_Neurons.Count; ii++)
                {
                    if (bMemorized)
                    {
                        output = thisLayerOutput[ii];
                    }
                    else
                    {
                        output = m_Neurons[ii].output;
                    }

                    dErr_wrt_dYn.Add(m_sigmoid.DSIGMOID(output) * dErr_wrt_dXn[ii]);
                }

                // calculate dErr_wrt_Wn = Xnm1 * dErr_wrt_Yn
                // For each neuron in this layer, go through the list of connections from the prior layer, and
                // update the differential for the corresponding weight

                ii = 0;
                foreach (NNNeuron nit in m_Neurons)
                {
                    foreach (NNConnection cit in nit.m_Connections)
                    {
                        kk = cit.NeuronIndex;
                        if (kk == 0xffffffff)
                        {
                            output = 1.0;  // this is the bias weight
                        }
                        else
                        {
                            if (bMemorized)
                            {
                                output = prevLayerOutput[(int)kk];
                            }
                            else
                            {
                                output = m_pPrevLayer.m_Neurons[(int)kk].output;
                            }
                        }
                        dErr_wrt_dWn[cit.WeightIndex] += dErr_wrt_dYn[ii] * output;
                    }

                    ii++;
                }
                // calculate dErr_wrt_Xnm1 = Wn * dErr_wrt_dYn, which is needed as the input value of
                // dErr_wrt_Xn for backpropagation of the next (i.e., previous) layer
                // For each neuron in this layer

                ii = 0;
                foreach (NNNeuron nit in m_Neurons)
                {
                    foreach (NNConnection cit in nit.m_Connections)
                    {
                        kk = cit.NeuronIndex;
                        if (kk != 0xffffffff)
                        {
                            // we exclude ULONG_MAX, which signifies the phantom bias neuron with
                            // constant output of "1", since we cannot train the bias neuron

                            nIndex = (int)kk;
                            dErr_wrt_dXnm1[nIndex] += dErr_wrt_dYn[ii] * m_Weights[(int)cit.WeightIndex].value;
                        }

                    }

                    ii++;  // ii tracks the neuron iterator

                }
                // finally, update the weights of this layer neuron using dErr_wrt_dW and the learning rate eta
                // Use an atomic compare-and-exchange operation, which means that another thread might be in 
                // the process of backpropagation and the weights might have shifted slightly
                const double dMicron = 0.10;
                double epsilon, divisor;
                double oldValue;
                double newValue;
                for (jj = 0; jj < m_Weights.Count; ++jj)
                {
                    divisor = m_Weights[jj].diagHessian + dMicron;

                    // the following code has been rendered unnecessary, since the value of the Hessian has been
                    // verified when it was created, so as to ensure that it is strictly
                    // zero-positive.  Thus, it is impossible for the diagHessian to be less than zero,
                    // and it is impossible for the divisor to be less than dMicron
                    /*
                    if ( divisor < dMicron )  
                    {
                    // it should not be possible to reach here, since everything in the second derivative equations 
                    // is strictly zero-positive, and thus "divisor" should definitely be as large as MICRON.
		
                      ASSERT( divisor >= dMicron );
                      divisor = 1.0 ;  // this will limit the size of the update to the same as the size of global eta
                      }
                    */
                    epsilon = etaLearningRate / divisor;
                    oldValue = m_Weights[jj].value;
                    newValue = oldValue - epsilon * dErr_wrt_dWn[jj];
                    while (oldValue != Interlocked.CompareExchange(
                               ref m_Weights[jj].value, newValue, oldValue))
                    {
                        // another thread must have modified the weight.
                        // Obtain its new value, adjust it, and try again
                        oldValue = m_Weights[jj].value;
                        newValue = oldValue - epsilon * dErr_wrt_dWn[jj];
                    }
                }

            }
            catch (Exception)
            {
                // swallow any error for this pattern and abandon the update, as the original code does
                return;
            }


        }


        public void PeriodicWeightSanityCheck()
        {
            // called periodically by the neural net, to request a check on the "reasonableness" of the 
            // weights.  The warning message is given only once per layer



            foreach (NNWeight wit in m_Weights)
            {

                double val = System.Math.Abs(wit.value);

                if ((val > 100.0) && (m_bFloatingPointWarning == false))
                {
                    // 100.0 is an arbitrary value, that no reasonable weight should ever exceed
                    /*
                    string strMess = ""; ;
                    strMess.Format("Caution: Weights are becoming unboundedly large \n"+
                        "Layer: %s \nWeight: %s \nWeight value = %g \nWeight Hessian = %g\n\n"+
                         "Suggest abandoning this backpropagation and investigating",
                        label, wit.label, wit.value, wit.diagHessian );
                    //show message box
                    //MessageBox.show( NULL, strMess, _T( "Problem With Weights" ), MB_ICONEXCLAMATION | MB_OK );
                    */
                    m_bFloatingPointWarning = true;
                }
            }
        }


        public void EraseHessianInformation()
        {
            // goes through all the weights associated with this layer, and sets each of their
            // diagHessian value to zero

            foreach (NNWeight wit in m_Weights)
            {
                wit.diagHessian = 0.0;
            }

        }

        public void DivideHessianInformationBy(double divisor)
        {
            // goes through all the weights associated with this layer, and divides each of their
            // diagHessian value by the indicated divisor


            double dTemp;

            foreach (NNWeight wit in m_Weights)
            {
                dTemp = wit.diagHessian;

                if (dTemp < 0.0)
                {
                    // it should not be possible to reach here, since all calculations for the second
                    // derivative are strictly zero-positive.  However, there are some early indications 
                    // that this check is necessary anyway
                    dTemp = 0.0;
                }

                wit.diagHessian = dTemp / divisor;
            }
        }
        public void BackpropagateSecondDerivatives(DErrorsList d2Err_wrt_dXn /* in */,
                                                    DErrorsList d2Err_wrt_dXnm1 /* out */)
        {
            // nomenclature (repeated from NeuralNetwork class)
            // NOTE: even though we are addressing SECOND derivatives ( and not first derivatives),
            // we use nearly the same notation as if there were first derivatives, since otherwise the
            // ASCII look would be confusing.  We add one "2" but not two "2's", such as "d2Err_wrt_dXn",
            // to give a gentle emphasis that we are using second derivatives
            //
            // Err is output error of the entire neural net
            // Xn is the output vector on the n-th layer
            // Xnm1 is the output vector of the previous layer
            // Wn is the vector of weights of the n-th layer
            // Yn is the activation value of the n-th layer, i.e., the weighted sum of inputs BEFORE the squashing function is applied
            // F is the squashing function: Xn = F(Yn)
            // F' is the derivative of the squashing function
            //   Conveniently, for F = tanh, then F'(Yn) = 1 - Xn^2, i.e., the derivative can be calculated from the output, without knowledge of the input 
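            //
            // The code below propagates squared quantities, which approximate the diagonal second derivatives:
            //   d2Err_wrt_dYn    = ( F'(Yn) )^2 * d2Err_wrt_dXn
            //   d2Err_wrt_dWn    = ( Xnm1 )^2   * d2Err_wrt_dYn
            //   d2Err_wrt_dXnm1 += ( Wn )^2     * d2Err_wrt_dYn
            //   diagHessian     += d2Err_wrt_dWn   (later scaled by DivideHessianInformationBy())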

            int ii, jj;
            uint kk;
            int nIndex;
            double output;
            double dTemp;

            var d2Err_wrt_dYn = new DErrorsList(m_Neurons.Count);
            //
            // std::vector< double > d2Err_wrt_dWn( m_Weights.size(), 0.0 );  // important to initialize to zero
            //////////////////////////////////////////////////
            //
            ///// DESIGN TRADEOFF: REVIEW !!
            //
            // The reasoning here is identical to that in NNLayer::Backpropagate(), from which this
            // BackpropagateSecondDerivatives() function is derived: "d2Err_wrt_dWn" holds the second
            // differential of the current pattern's error with respect to the weights in this layer, and the
            // same allocation concerns (and the same "REVIEW" caveat) apply.  As above, this C# port simply
            // allocates a managed double[] below.

            double[] d2Err_wrt_dWn = new double[m_Weights.Count];
            for (ii = 0; ii < m_Weights.Count; ii++)
            {
                d2Err_wrt_dWn[ii] = 0.0;
            }
            // calculate d2Err_wrt_dYn = ( F'(Yn) )^2 * dErr_wrt_Xn (where dErr_wrt_Xn is actually a second derivative )

            for (ii = 0; ii < m_Neurons.Count; ii++)
            {

                output = m_Neurons[ii].output;
                dTemp = m_sigmoid.DSIGMOID(output);
                d2Err_wrt_dYn.Add(d2Err_wrt_dXn[ii] * dTemp * dTemp);
            }
            // calculate d2Err_wrt_Wn = ( Xnm1 )^2 * d2Err_wrt_Yn (where dE2rr_wrt_Yn is actually a second derivative)
            // For each neuron in this layer, go through the list of connections from the prior layer, and
            // update the differential for the corresponding weight

            ii = 0;
            foreach (NNNeuron nit in m_Neurons)
            {
                foreach (NNConnection cit in nit.m_Connections)
                {
                    try
                    {

                        kk = (uint)cit.NeuronIndex;
                        if (kk == 0xffffffff)
                        {
                            output = 1.0;  // this is the bias connection; implied neuron output of "1"
                        }
                        else
                        {
                            output = m_pPrevLayer.m_Neurons[(int)kk].output;
                        }

                        ////////////	ASSERT( (*cit).WeightIndex < d2Err_wrt_dWn.size() );  // no longer applicable after d2Err_wrt_dWn was changed to a plain array
                        // NOTE: unlike the accumulation (+=) used for the first derivatives in Backpropagate(),
                        // this code assigns the value; the accumulated form is kept here as a comment.
                        //d2Err_wrt_dWn[cit.WeightIndex] += d2Err_wrt_dYn[ii] * output * output;
                        d2Err_wrt_dWn[cit.WeightIndex] = d2Err_wrt_dYn[ii] * output * output;
                    }
                    catch (Exception)
                    {
                        // ignore errors for this connection
                    }
                }

                ii++;
            }
            // calculate d2Err_wrt_Xnm1 = ( Wn )^2 * d2Err_wrt_dYn (where d2Err_wrt_dYn is a second derivative not a first).
            // d2Err_wrt_Xnm1 is needed as the input value of
            // d2Err_wrt_Xn for backpropagation of second derivatives for the next (i.e., previous spatially) layer
            // For each neuron in this layer

            ii = 0;
            foreach (NNNeuron nit in m_Neurons)
            {
                foreach (NNConnection cit in nit.m_Connections)
                {
                    try
                    {
                        kk = cit.NeuronIndex;
                        if (kk != 0xffffffff)
                        {
                            // we exclude ULONG_MAX, which signifies the phantom bias neuron with
                            // constant output of "1", since we cannot train the bias neuron

                            nIndex = (int)kk;
                            dTemp = m_Weights[(int)cit.WeightIndex].value;
                            d2Err_wrt_dXnm1[nIndex] += d2Err_wrt_dYn[ii] * dTemp * dTemp;
                        }
                    }
                    catch (Exception)
                    {
                        // abandon the rest of this second-derivative pass on error
                        return;
                    }
                }

                ii++;  // ii tracks the neuron iterator

            }
            double oldValue, newValue;

            // finally, update the diagonal Hessians for the weights of this layer neuron using dErr_wrt_dW.
            // By design, this function (and its iteration over many patterns, approximately 500) is called while a 
            // single thread has locked the neural network, so there is no possibility that another
            // thread might change the value of the Hessian.  The comment in the original code mentions using an
            // atomic compare-and-exchange operation here anyway; in this port the update below is a plain
            // read-modify-write, relying on that single-thread lock.

            for (jj = 0; jj < m_Weights.Count; jj++)
            {
                oldValue = m_Weights[jj].diagHessian;
                newValue = oldValue + d2Err_wrt_dWn[jj];
                m_Weights[jj].diagHessian = newValue;
            }
        }
        virtual public void Serialize(Archive ar)
        {
            int ii, jj;
	
	        if (ar.IsStoring())
	        {
                // store this layer: label, counts, neurons (with their connections), and weights

                ar.Write(label);
                //ar.WriteString(_T("\r\n"));  // ar.ReadString will look for \r\n when loading from the archive
                ar.Write(m_Neurons.Count);
                ar.Write(m_Weights.Count);



                foreach (NNNeuron nit in m_Neurons)
                {
                    ar.Write(nit.label);
                    ar.Write(nit.m_Connections.Count);

                    foreach (NNConnection cit in nit.m_Connections)
                    {
                        ar.Write(cit.NeuronIndex);
                        ar.Write(cit.WeightIndex);
                    }
                }

                foreach (NNWeight wit in m_Weights)
                {
                    ar.Write(wit.label);
                    ar.Write(wit.value);
                }
		    }
	        else
	        {
                // load this layer

                string str;
                // read the layer's label
                ar.Read(out str);
                label = str;
		
		        int iNumNeurons, iNumWeights, iNumConnections;
		        double value;
		
		        NNNeuron pNeuron;
		        NNWeight pWeight;
                
                // read the number of neurons and weights
		        ar.Read(out iNumNeurons);
		        ar.Read(out iNumWeights);
                if (iNumNeurons != 0 )
                {
                    //clear neuron list and weight list.
                    m_Neurons.Clear();
                    m_Neurons = new NNNeuronList(iNumNeurons);
                    m_Weights.Clear();
                    m_Weights = new NNWeightList(iNumWeights);

                    for (ii = 0; ii < iNumNeurons; ii++)
                    {
                        // read the neuron's label
                        ar.Read(out str);
                        // read the neuron's connection count
                        ar.Read(out iNumConnections);
                        pNeuron = new NNNeuron(str, iNumConnections);
                        pNeuron.label = str;
                        m_Neurons.Add(pNeuron);
                        for (jj = 0; jj < iNumConnections; jj++)
                        {
                            var conn = new NNConnection();
                            ar.Read(out conn.NeuronIndex);
                            ar.Read(out conn.WeightIndex);
                            pNeuron.AddConnection(conn);
                        }
                    }

                    for (jj = 0; jj < iNumWeights; jj++)
                    {
                        ar.Read(out str);
                        ar.Read(out value);

                        pWeight = new NNWeight(str, value);
                        m_Weights.Add(pWeight);
                    }
                }
		
	        }
        }
    }
}
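
The listing above contains only the layer class. For orientation, here is a minimal, hypothetical sketch of how two NNLayer objects could be wired together and run forward. The NNNeuron, NNWeight and NNConnection constructors and the AddConnection() call follow their usage in Serialize() above; the assumption that NeuronIndex and WeightIndex can be set directly, and all labels and numeric values, are illustrative only and not part of the article's training code.

using NeuralNetworkLibrary;

class LayerDemo
{
    static void Main()
    {
        // input layer: two neurons whose outputs we set by hand
        var input = new NNLayer("Layer00", null);
        input.m_Neurons.Add(new NNNeuron("L0_N0", 0));
        input.m_Neurons.Add(new NNNeuron("L0_N1", 0));
        input.m_Neurons[0].output = 0.5;
        input.m_Neurons[1].output = -0.25;

        // next layer: one neuron with a bias weight plus one weight per input neuron
        var hidden = new NNLayer("Layer01", input);
        hidden.m_Weights.Add(new NNWeight("L1_W_bias", 0.1));
        hidden.m_Weights.Add(new NNWeight("L1_W0", 0.3));
        hidden.m_Weights.Add(new NNWeight("L1_W1", -0.2));

        var neuron = new NNNeuron("L1_N0", 3);
        // the first connection carries the bias weight; 0xffffffff marks the phantom bias neuron
        neuron.AddConnection(new NNConnection { NeuronIndex = 0xffffffff, WeightIndex = 0 });
        neuron.AddConnection(new NNConnection { NeuronIndex = 0, WeightIndex = 1 });
        neuron.AddConnection(new NNConnection { NeuronIndex = 1, WeightIndex = 2 });
        hidden.m_Neurons.Add(neuron);

        // forward pass: dSum = bias + sum(weight * previous output), output = SIGMOID(dSum)
        hidden.Calculate();
        System.Console.WriteLine(hidden.m_Neurons[0].output);
    }
}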

License

This article, along with any associated source code and files, is licensed under The MIT License


Written By
Vietnam Maritime University
Vietnam