Click here to Skip to main content
Click here to Skip to main content
Add your own
alternative version

Neural Network for Recognition of Handwritten Digits

, 5 Dec 2006
A convolutional neural network achieves 99.26% accuracy on a modified NIST database of hand-written digits.
// NeuralNetwork.cpp: implementation of the NeuralNetwork class.
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "NeuralNetwork.h"
#include "MNist.h"  // for the _Intelocked functions
#include <malloc.h>  // for the _alloca function


#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif


///////////////////////////////////////////////////////////////////////
//
//  NeuralNetwork class definition

NeuralNetwork::NeuralNetwork()
{
	Initialize();
}

void NeuralNetwork::Initialize()
{
	// delete all layers
	
	VectorLayers::iterator it;
	
	for( it=m_Layers.begin(); it<m_Layers.end(); it++ )
	{
		delete *it;
	}
	
	m_Layers.clear();
	
	m_etaLearningRate = .001;  // arbitrary, so that brand-new NNs can be serialized with a non-ridiculous number
	m_cBackprops = 0;
	
}

NeuralNetwork::~NeuralNetwork()
{
	// call Initialize(); makes sense if you think
	
	Initialize();
}

void NeuralNetwork::Calculate(double* inputVector, UINT iCount, 
							  double* outputVector /* =NULL */, UINT oCount /* =0 */,
							  std::vector< std::vector< double > >* pNeuronOutputs /* =NULL */ )
{
	VectorLayers::iterator lit = m_Layers.begin();
	VectorNeurons::iterator nit;
	
	// first layer is imput layer: directly set outputs of all of its neurons
	// to the input vector
	
	if ( lit<m_Layers.end() )  
	{
		nit = (*lit)->m_Neurons.begin();
		int count = 0;
		
		ASSERT( iCount == (*lit)->m_Neurons.size() );  // there should be exactly one neuron per input
		
		while( ( nit < (*lit)->m_Neurons.end() ) && ( count < iCount ) )
		{
			(*nit)->output = inputVector[ count ];
			nit++;
			count++;
		}
	}
	
	
	
	for( lit++; lit<m_Layers.end(); lit++ )
	{
		(*lit)->Calculate();
	}
	
	// load up output vector with results
	
	if ( outputVector != NULL )
	{
		lit = m_Layers.end();
		lit--;
		
		nit = (*lit)->m_Neurons.begin();
		
		for ( int ii=0; ii<oCount; ++ii )
		{
			outputVector[ ii ] = (*nit)->output;
			nit++;
		}
	}
	
	// load up neuron output values with results
	
	if ( pNeuronOutputs != NULL )
	{
		// check for first time use (re-use is expected)
		
		if ( pNeuronOutputs->empty() != FALSE )
		{
			// it's empty, so allocate memory for its use
			
			pNeuronOutputs->clear();  // for safekeeping
			
			int ii = 0;
			for( lit=m_Layers.begin(); lit<m_Layers.end(); lit++ )
			{
				std::vector< double > layerOut;
				
				for ( ii=0; ii<(*lit)->m_Neurons.size(); ++ii )
				{
					layerOut.push_back( (*lit)->m_Neurons[ ii ]->output );
				}
				
				pNeuronOutputs->push_back( layerOut);
			}
		}
		else
		{
			// it's not empty, so assume it's been used in a past iteration and memory for
			// it has already been allocated internally.  Simply store the values
			
			int ii, jj = 0;
			for( lit=m_Layers.begin(); lit<m_Layers.end(); lit++ )
			{
				for ( ii=0; ii<(*lit)->m_Neurons.size(); ++ii )
				{
					(*pNeuronOutputs)[ jj ][ ii ] = (*lit)->m_Neurons[ ii ]->output ;
				}
				
				++jj;
			}
			
		}
		
	}
	
}


void NeuralNetwork::Backpropagate(double *actualOutput, double *desiredOutput, UINT count,
								  std::vector< std::vector< double > >* pMemorizedNeuronOutputs )
{
	// backpropagates through the neural net
	
	ASSERT( ( actualOutput != NULL ) && ( desiredOutput != NULL ) && ( count < 256 ) );
	
	ASSERT( m_Layers.size() >= 2 );  // there must be at least two layers in the net
	
	if ( ( actualOutput == NULL ) || ( desiredOutput == NULL ) || ( count >= 256 ) )
		return;
	
	
	// check if it's time for a weight sanity check
	
	m_cBackprops++;
	
	if ( (m_cBackprops % 10000) == 0 )
	{
		// every 10000 backprops
		
		PeriodicWeightSanityCheck();
	}
	
	
	// proceed from the last layer to the first, iteratively
	// We calculate the last layer separately, and first, since it provides the needed derviative
	// (i.e., dErr_wrt_dXnm1) for the previous layers
	
	// nomenclature:
	//
	// Err is output error of the entire neural net
	// Xn is the output vector on the n-th layer
	// Xnm1 is the output vector of the previous layer
	// Wn is the vector of weights of the n-th layer
	// Yn is the activation value of the n-th layer, i.e., the weighted sum of inputs BEFORE the squashing function is applied
	// F is the squashing function: Xn = F(Yn)
	// F' is the derivative of the squashing function
	//   Conveniently, for F = tanh, then F'(Yn) = 1 - Xn^2, i.e., the derivative can be calculated from the output, without knowledge of the input
	
	
	VectorLayers::iterator lit = m_Layers.end() - 1;
	
	std::vector< double > dErr_wrt_dXlast( (*lit)->m_Neurons.size() );
	std::vector< std::vector< double > > differentials;
	
	int iSize = m_Layers.size();
	
	differentials.resize( iSize );
	
	int ii;
	
	// start the process by calculating dErr_wrt_dXn for the last layer.
	// for the standard MSE Err function (i.e., 0.5*sumof( (actual-target)^2 ), this differential is simply
	// the difference between the target and the actual
	
	for ( ii=0; ii<(*lit)->m_Neurons.size(); ++ii )
	{
		dErr_wrt_dXlast[ ii ] = actualOutput[ ii ] - desiredOutput[ ii ];
	}
	
	
	// store Xlast and reserve memory for the remaining vectors stored in differentials
	
	differentials[ iSize-1 ] = dErr_wrt_dXlast;  // last one
	
	for ( ii=0; ii<iSize-1; ++ii )
	{
		differentials[ ii ].resize( m_Layers[ii]->m_Neurons.size(), 0.0 );
	}
	
	// now iterate through all layers including the last but excluding the first, and ask each of
	// them to backpropagate error and adjust their weights, and to return the differential
	// dErr_wrt_dXnm1 for use as the input value of dErr_wrt_dXn for the next iterated layer
	
	BOOL bMemorized = ( pMemorizedNeuronOutputs != NULL );
	
	lit = m_Layers.end() - 1;  // re-initialized to last layer for clarity, although it should already be this value
	
	ii = iSize - 1;
	for ( lit; lit>m_Layers.begin(); lit--)
	{
		if ( bMemorized != FALSE )
		{
			(*lit)->Backpropagate( differentials[ ii ], differentials[ ii - 1 ], 
				&(*pMemorizedNeuronOutputs)[ ii ], &(*pMemorizedNeuronOutputs)[ ii - 1 ], m_etaLearningRate );
		}
		else
		{
			(*lit)->Backpropagate( differentials[ ii ], differentials[ ii - 1 ], 
				NULL, NULL, m_etaLearningRate );
		}
		
		--ii;
	}
	
	
	differentials.clear();
	
}

								  
void NeuralNetwork::PeriodicWeightSanityCheck()
{
	// fucntion that simply goes through all weights, and tests them against an arbitrary
	// "reasonable" upper limit.  If the upper limit is exceeded, a warning is displayed
	
	VectorLayers::iterator lit;
	
	for ( lit=m_Layers.begin(); lit<m_Layers.end(); lit++)
	{
		(*lit)->PeriodicWeightSanityCheck();
	}
	
}




void NeuralNetwork::EraseHessianInformation()
{
	// controls each layer to erase (set to value of zero) all its diagonal Hessian info
	
	VectorLayers::iterator lit;
	
	for ( lit=m_Layers.begin(); lit<m_Layers.end(); lit++ )
	{
		(*lit)->EraseHessianInformation();
	}
	
}


void NeuralNetwork::DivideHessianInformationBy( double divisor )
{
	// controls each layer to divide its current diagonal Hessian info by a common divisor. 
	// A check is also made to ensure that each Hessian is strictly zero-positive
	
	VectorLayers::iterator lit;
	
	for ( lit=m_Layers.begin(); lit<m_Layers.end(); lit++ )
	{
		(*lit)->DivideHessianInformationBy( divisor );
	}
	
}


void NeuralNetwork::BackpropagateSecondDervatives( double* actualOutputVector, 
												  double* targetOutputVector, UINT count )
{
	// calculates the second dervatives (for diagonal Hessian) and backpropagates
	// them through neural net
	
	ASSERT( ( actualOutputVector != NULL ) && ( targetOutputVector != NULL ) && ( count < 256 ) );
	
	ASSERT( m_Layers.size() >= 2 );  // there must be at least two layers in the net
	
	if ( ( actualOutputVector == NULL ) || ( targetOutputVector == NULL ) || ( count >= 256 ) )
		return;
	
	// we use nearly the same nomenclature as above (e.g., "dErr_wrt_dXnm1") even though everything here
	// is actually second derivatives and not first derivatives, since otherwise the ASCII would 
	// become too confusing.  To emphasize that these are second derivatives, we insert a "2"
	// such as "d2Err_wrt_dXnm1".  We don't insert the second "2" that's conventional for designating
	// second derivatives
	
	
	VectorLayers::iterator lit;
	
	lit = m_Layers.end() - 1;  // set to last layer
	
	std::vector< double > d2Err_wrt_dXlast( (*lit)->m_Neurons.size() );
	std::vector< std::vector< double > > differentials;
	
	int iSize = m_Layers.size();
	
	differentials.resize( iSize );
	
	int ii;
	
	// start the process by calculating the second derivative dErr_wrt_dXn for the last layer.
	// for the standard MSE Err function (i.e., 0.5*sumof( (actual-target)^2 ), this differential is 
	// exactly one
	
	lit = m_Layers.end() - 1;  // point to last layer
	
	for ( ii=0; ii<(*lit)->m_Neurons.size(); ++ii )
	{
		d2Err_wrt_dXlast[ ii ] = 1.0;
	}
	
	
	// store Xlast and reserve memory for the remaining vectors stored in differentials
	
	differentials[ iSize-1 ] = d2Err_wrt_dXlast;  // last one
	
	for ( ii=0; ii<iSize-1; ++ii )
	{
		differentials[ ii ].resize( m_Layers[ii]->m_Neurons.size(), 0.0 );
	}
	
	
	// now iterate through all layers including the last but excluding the first, starting from
	// the last, and ask each of
	// them to backpropagate the second derviative and accumulate the diagonal Hessian, and also to
	// return the second dervative
	// d2Err_wrt_dXnm1 for use as the input value of dErr_wrt_dXn for the next iterated layer (which
	// is the previous layer spatially)
	
	lit = m_Layers.end() - 1;  // re-initialized to last layer for clarity, although it should already be this value
	
	ii = iSize - 1;
	for ( lit; lit>m_Layers.begin(); lit--)
	{
		(*lit)->BackpropagateSecondDerivatives( differentials[ ii ], differentials[ ii - 1 ] );
		
		--ii;
	}
	
	differentials.clear();
}




void NeuralNetwork::Serialize(CArchive &ar)
{
	if (ar.IsStoring())
	{
		// TODO: add storing code here
		
		ar << m_etaLearningRate;
		
		ar << m_Layers.size();
		
		VectorLayers::iterator lit;
		
		for( lit=m_Layers.begin(); lit<m_Layers.end(); lit++ )
		{
			(*lit)->Serialize( ar );
		}
		
	}
	else
	{
		// TODO: add loading code here
		
		double eta; 
		ar >> eta;
		m_etaLearningRate = eta;  // two-step storage is needed since m_etaLearningRate is "volatile"
		
		int nLayers;
		NNLayer* pLayer = NULL;
		
		ar >> nLayers;
		
		for ( int ii=0; ii<nLayers; ++ii )
		{
			pLayer = new NNLayer( _T(""), pLayer );
			
			m_Layers.push_back( pLayer );
			
			pLayer->Serialize( ar );
		}
		
	}
	
}










///////////////////////////////////////////////////////////////////////
//
//  NNLayer class definition


NNLayer::NNLayer() :
label( _T("") ), m_pPrevLayer( NULL )
{
	Initialize();
}

NNLayer::NNLayer( LPCTSTR str, NNLayer* pPrev /* =NULL */ ) :
label( str ), m_pPrevLayer( pPrev )
{
	Initialize();
}


void NNLayer::Initialize()
{
	VectorWeights::iterator wit;
	VectorNeurons::iterator nit;
	
	for( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
	{
		delete *nit;
	}
	
	for( wit=m_Weights.begin(); wit<m_Weights.end(); wit++ )
	{
		delete *wit;
	}
	
	m_Weights.clear();
	m_Neurons.clear();
	
	m_bFloatingPointWarning = false;
	
}

NNLayer::~NNLayer()
{
	// call Initialize(); makes sense if you think
	
	Initialize();
}

void NNLayer::Calculate()
{
	ASSERT( m_pPrevLayer != NULL );
	
	VectorNeurons::iterator nit;
	VectorConnections::iterator cit;
	
	double dSum;
	
	for( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
	{
		NNNeuron& n = *(*nit);  // to ease the terminology
		
		cit = n.m_Connections.begin();
		
		ASSERT( (*cit).WeightIndex < m_Weights.size() );
		
		dSum = m_Weights[ (*cit).WeightIndex ]->value;  // weight of the first connection is the bias; neuron is ignored
		
		for ( cit++ ; cit<n.m_Connections.end(); cit++ )
		{
			ASSERT( (*cit).WeightIndex < m_Weights.size() );
			ASSERT( (*cit).NeuronIndex < m_pPrevLayer->m_Neurons.size() );
			
			dSum += ( m_Weights[ (*cit).WeightIndex ]->value ) * 
				( m_pPrevLayer->m_Neurons[ (*cit).NeuronIndex ]->output );
		}
		
		n.output = SIGMOID( dSum );
		
	}
	
}



void NNLayer::Backpropagate( std::vector< double >& dErr_wrt_dXn /* in */, 
							std::vector< double >& dErr_wrt_dXnm1 /* out */, 
							std::vector< double >* thisLayerOutput,  // memorized values of this layer's output
							std::vector< double >* prevLayerOutput,  // memorized values of previous layer's output
							double etaLearningRate )
{
	// nomenclature (repeated from NeuralNetwork class):
	//
	// Err is output error of the entire neural net
	// Xn is the output vector on the n-th layer
	// Xnm1 is the output vector of the previous layer
	// Wn is the vector of weights of the n-th layer
	// Yn is the activation value of the n-th layer, i.e., the weighted sum of inputs BEFORE the squashing function is applied
	// F is the squashing function: Xn = F(Yn)
	// F' is the derivative of the squashing function
	//   Conveniently, for F = tanh, then F'(Yn) = 1 - Xn^2, i.e., the derivative can be calculated from the output, without knowledge of the input
	
	ASSERT( dErr_wrt_dXn.size() == m_Neurons.size() );
	ASSERT( m_pPrevLayer != NULL );
	ASSERT( dErr_wrt_dXnm1.size() == m_pPrevLayer->m_Neurons.size() );
	
	int ii, jj;
	UINT kk;
	int nIndex;
	double output;
	
	std::vector< double > dErr_wrt_dYn( m_Neurons.size() );
	//
	//	std::vector< double > dErr_wrt_dWn( m_Weights.size(), 0.0 );  // important to initialize to zero
	//////////////////////////////////////////////////
	//
	///// DESIGN TRADEOFF: REVIEW !!
	// We would prefer (for ease of coding) to use STL vector for the array "dErr_wrt_dWn", which is the 
	// differential of the current pattern's error wrt weights in the layer.  However, for layers with
	// many weights, such as fully-connected layers, there are also many weights.  The STL vector
	// class's allocator is remarkably stupid when allocating large memory chunks, and causes a remarkable 
	// number of page faults, with a consequent slowing of the application's overall execution time.
	
	// To fix this, I tried using a plain-old C array, by new'ing the needed space from the heap, and 
	// delete[]'ing it at the end of the function.  However, this caused the same number of page-fault
	// errors, and did not improve performance.
	
	// So I tried a plain-old C array allocated on the stack (i.e., not the heap).  Of course I could not
	// write a statement like 
	//    double dErr_wrt_dWn[ m_Weights.size() ];
	// since the compiler insists upon a compile-time known constant value for the size of the array.  
	// To avoid this requirement, I used the _alloca function, to allocate memory on the stack.
	// The downside of this is excessive stack usage, and there might be stack overflow probelms.  That's why
	// this comment is labeled "REVIEW"
	
	double* dErr_wrt_dWn = (double*)( _alloca( sizeof(double) *  m_Weights.size() ) );
	
	for ( ii=0; ii<m_Weights.size(); ++ii )
	{
		dErr_wrt_dWn[ ii ] =0.0;
	}
	
	
	VectorNeurons::iterator nit;
	VectorConnections::iterator cit;
	
	
	BOOL bMemorized = ( thisLayerOutput != NULL ) && ( prevLayerOutput != NULL );
	
	
	// calculate dErr_wrt_dYn = F'(Yn) * dErr_wrt_Xn
	
	for ( ii=0; ii<m_Neurons.size(); ++ii )
	{
		ASSERT( ii<dErr_wrt_dYn.size() );
		ASSERT( ii<dErr_wrt_dXn.size() );
		
		if ( bMemorized != FALSE )
		{
			output = (*thisLayerOutput)[ ii ];
		}
		else
		{
			output = m_Neurons[ ii ]->output;
		}
		
		dErr_wrt_dYn[ ii ] = DSIGMOID( output ) * dErr_wrt_dXn[ ii ];
	}
	
	// calculate dErr_wrt_Wn = Xnm1 * dErr_wrt_Yn
	// For each neuron in this layer, go through the list of connections from the prior layer, and
	// update the differential for the corresponding weight
	
	ii = 0;
	for ( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
	{
		NNNeuron& n = *(*nit);  // for simplifying the terminology
		
		for ( cit=n.m_Connections.begin(); cit<n.m_Connections.end(); cit++ )
		{
			kk = (*cit).NeuronIndex;
			if ( kk == ULONG_MAX )
			{
				output = 1.0;  // this is the bias weight
			}
			else
			{
				ASSERT( kk<m_pPrevLayer->m_Neurons.size() );
				
				if ( bMemorized != FALSE )
				{
					output = (*prevLayerOutput)[ kk ];
				}
				else
				{
					output = m_pPrevLayer->m_Neurons[ kk ]->output;
				}
			}
			
			////////////	ASSERT( (*cit).WeightIndex < dErr_wrt_dWn.size() );  // since after changing dErr_wrt_dWn to a C-style array, the size() function this won't work
			ASSERT( ii<dErr_wrt_dYn.size() );
			dErr_wrt_dWn[ (*cit).WeightIndex ] += dErr_wrt_dYn[ ii ] * output;
		}
		
		ii++;
	}
	
	
	// calculate dErr_wrt_Xnm1 = Wn * dErr_wrt_dYn, which is needed as the input value of
	// dErr_wrt_Xn for backpropagation of the next (i.e., previous) layer
	// For each neuron in this layer
	
	ii = 0;
	for ( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
	{
		NNNeuron& n = *(*nit);  // for simplifying the terminology
		
		for ( cit=n.m_Connections.begin(); cit<n.m_Connections.end(); cit++ )
		{
			kk=(*cit).NeuronIndex;
			if ( kk != ULONG_MAX )
			{
				// we exclude ULONG_MAX, which signifies the phantom bias neuron with
				// constant output of "1", since we cannot train the bias neuron
				
				nIndex = kk;
				
				ASSERT( nIndex<dErr_wrt_dXnm1.size() );
				ASSERT( ii<dErr_wrt_dYn.size() );
				ASSERT( (*cit).WeightIndex<m_Weights.size() );
				
				dErr_wrt_dXnm1[ nIndex ] += dErr_wrt_dYn[ ii ] * m_Weights[ (*cit).WeightIndex ]->value;
			}
			
		}
		
		ii++;  // ii tracks the neuron iterator
		
	}
	
	struct DOUBLE_UNION
	{
		union 
		{
			double dd;
			unsigned __int64 ullong;
		};
	};
	
	DOUBLE_UNION oldValue, newValue;
	
	// finally, update the weights of this layer neuron using dErr_wrt_dW and the learning rate eta
	// Use an atomic compare-and-exchange operation, which means that another thread might be in 
	// the process of backpropagation and the weights might have shifted slightly
	
	
	double dMicron = ::GetPreferences().m_dMicronLimitParameter;
	double epsilon, divisor;
	
	for ( jj=0; jj<m_Weights.size(); ++jj )
	{
		divisor = m_Weights[ jj ]->diagHessian + dMicron ; 
		
		
		// the following code has been rendered unnecessary, since the value of the Hessian has been
		// verified when it was created, so as to ensure that it is strictly
		// zero-positve.  Thus, it is impossible for the diagHessian to be less than zero,
		// and it is impossible for the divisor to be less than dMicron
		/*
		if ( divisor < dMicron )  
		{
		// it should not be possible to reach here, since everything in the second derviative equations 
		// is strictly zero-positive, and thus "divisor" should definitely be as large as MICRON.
		
		  ASSERT( divisor >= dMicron );
		  divisor = 1.0 ;  // this will limit the size of the update to the same as the size of gloabal eta
		  }
		*/
		
		epsilon = etaLearningRate / divisor;
		oldValue.dd = m_Weights[ jj ]->value;
		newValue.dd = oldValue.dd - epsilon * dErr_wrt_dWn[ jj ];
		
		
		while ( oldValue.ullong != _InterlockedCompareExchange64( (unsigned __int64*)(&m_Weights[ jj ]->value), 
			newValue.ullong, oldValue.ullong ) ) 
		{
			// another thread must have modified the weight.  Obtain its new value, adjust it, and try again
			
			oldValue.dd = m_Weights[ jj ]->value;
			newValue.dd = oldValue.dd - epsilon * dErr_wrt_dWn[ jj ];
		}
		
	}
	
}


void NNLayer::PeriodicWeightSanityCheck()
{
	// called periodically by the neural net, to request a check on the "reasonableness" of the 
	// weights.  The warning message is given only once per layer
	
	VectorWeights::iterator wit;
	
	for ( wit=m_Weights.begin(); wit<m_Weights.end(); wit++ )
	{
		NNWeight& ww = *(*wit);
		double val = fabs( ww.value );
		
		if ( (val>100.0) && (m_bFloatingPointWarning == false) )
		{
			// 100.0 is an arbitrary value, that no reasonable weight should ever exceed
			
			CString strMess;
			strMess.Format( _T( "Caution: Weights are becoming unboundedly large \n" )
				_T( "Layer: %s \nWeight: %s \nWeight value = %g \nWeight Hessian = %g\n\n" )
				_T( "Suggest abandoning this backpropagation and investigating" ),
				label.c_str(), ww.label.c_str(), ww.value, ww.diagHessian );
			
			::MessageBox( NULL, strMess, _T( "Problem With Weights" ), MB_ICONEXCLAMATION | MB_OK );
			
			m_bFloatingPointWarning = true;
		}
	}
}



void NNLayer::EraseHessianInformation()
{
	// goes through all the weights associated with this layer, and sets each of their
	// diagHessian value to zero
	
	VectorWeights::iterator wit;
	
	for ( wit=m_Weights.begin(); wit<m_Weights.end(); wit++ )
	{
		(*wit)->diagHessian = 0.0;
	}
	
}

void NNLayer::DivideHessianInformationBy(double divisor)
{
	// goes through all the weights associated with this layer, and divides each of their
	// diagHessian value by the indicated divisor
	
	VectorWeights::iterator wit;
	double dTemp;
	
	for ( wit=m_Weights.begin(); wit<m_Weights.end(); wit++ )
	{
		dTemp = (*wit)->diagHessian;
		
		if ( dTemp < 0.0 )
		{
			// it should not be possible to reach here, since all calculations for the second
			// derviative are strictly zero-positive.  However, there are some early indications 
			// that this check is necessary anyway
			
			ASSERT ( dTemp >= 0.0 );  // will break in debug mode
			dTemp = 0.0;
		}
		
		(*wit)->diagHessian = dTemp / divisor ;
	}
}


void NNLayer::BackpropagateSecondDerivatives( std::vector< double >& d2Err_wrt_dXn /* in */, 
											 std::vector< double >& d2Err_wrt_dXnm1 /* out */)
{
	// nomenclature (repeated from NeuralNetwork class)
	// NOTE: even though we are addressing SECOND derivatives ( and not first derivatives),
	// we use nearly the same notation as if there were first derivatives, since otherwise the
	// ASCII look would be confusing.  We add one "2" but not two "2's", such as "d2Err_wrt_dXn",
	// to give a gentle emphasis that we are using second derivatives
	//
	// Err is output error of the entire neural net
	// Xn is the output vector on the n-th layer
	// Xnm1 is the output vector of the previous layer
	// Wn is the vector of weights of the n-th layer
	// Yn is the activation value of the n-th layer, i.e., the weighted sum of inputs BEFORE the squashing function is applied
	// F is the squashing function: Xn = F(Yn)
	// F' is the derivative of the squashing function
	//   Conveniently, for F = tanh, then F'(Yn) = 1 - Xn^2, i.e., the derivative can be calculated from the output, without knowledge of the input
	
	ASSERT( d2Err_wrt_dXn.size() == m_Neurons.size() );
	ASSERT( m_pPrevLayer != NULL );
	ASSERT( d2Err_wrt_dXnm1.size() == m_pPrevLayer->m_Neurons.size() );

	int ii, jj;
	UINT kk;
	int nIndex;
	double output;
	double dTemp;
		
	std::vector< double > d2Err_wrt_dYn( m_Neurons.size() );
	//
	// std::vector< double > d2Err_wrt_dWn( m_Weights.size(), 0.0 );  // important to initialize to zero
	//////////////////////////////////////////////////
	//
	///// DESIGN TRADEOFF: REVIEW !!
	//
	// Note that the reasoning of this comment is identical to that in the NNLayer::Backpropagate() 
	// function, from which the instant BackpropagateSecondDerivatives() function is derived from
	//
	// We would prefer (for ease of coding) to use STL vector for the array "d2Err_wrt_dWn", which is the 
	// second differential of the current pattern's error wrt weights in the layer.  However, for layers with
	// many weights, such as fully-connected layers, there are also many weights.  The STL vector
	// class's allocator is remarkably stupid when allocating large memory chunks, and causes a remarkable 
	// number of page faults, with a consequent slowing of the application's overall execution time.
	
	// To fix this, I tried using a plain-old C array, by new'ing the needed space from the heap, and 
	// delete[]'ing it at the end of the function.  However, this caused the same number of page-fault
	// errors, and did not improve performance.
	
	// So I tried a plain-old C array allocated on the stack (i.e., not the heap).  Of course I could not
	// write a statement like 
	//    double d2Err_wrt_dWn[ m_Weights.size() ];
	// since the compiler insists upon a compile-time known constant value for the size of the array.  
	// To avoid this requirement, I used the _alloca function, to allocate memory on the stack.
	// The downside of this is excessive stack usage, and there might be stack overflow probelms.  That's why
	// this comment is labeled "REVIEW"
	
	double* d2Err_wrt_dWn = (double*)( _alloca( sizeof(double) *  m_Weights.size() ) );
	
	for ( ii=0; ii<m_Weights.size(); ++ii )
	{
		d2Err_wrt_dWn[ ii ] =0.0;
	}

	VectorNeurons::iterator nit;
	VectorConnections::iterator cit;

	
	// calculate d2Err_wrt_dYn = ( F'(Yn) )^2 * dErr_wrt_Xn (where dErr_wrt_Xn is actually a second derivative )
	
	for ( ii=0; ii<m_Neurons.size(); ++ii )
	{
		ASSERT( ii<d2Err_wrt_dYn.size() );
		ASSERT( ii<d2Err_wrt_dXn.size() );
		
		output = m_Neurons[ ii ]->output;
		
		dTemp = DSIGMOID( output ) ;
		d2Err_wrt_dYn[ ii ] = d2Err_wrt_dXn[ ii ] * dTemp * dTemp;
	}
	
	// calculate d2Err_wrt_Wn = ( Xnm1 )^2 * d2Err_wrt_Yn (where dE2rr_wrt_Yn is actually a second derivative)
	// For each neuron in this layer, go through the list of connections from the prior layer, and
	// update the differential for the corresponding weight
	
	ii = 0;
	for ( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
	{
		NNNeuron& n = *(*nit);  // for simplifying the terminology
		
		for ( cit=n.m_Connections.begin(); cit<n.m_Connections.end(); cit++ )
		{
			kk = (*cit).NeuronIndex;
			if ( kk == ULONG_MAX )
			{
				output = 1.0;  // this is the bias connection; implied neuron output of "1"
			}
			else
			{
				ASSERT( kk<m_pPrevLayer->m_Neurons.size() );
				
				output = m_pPrevLayer->m_Neurons[ kk ]->output;
			}
			
			////////////	ASSERT( (*cit).WeightIndex < d2Err_wrt_dWn.size() );  // since after changing d2Err_wrt_dWn to a C-style array, the size() function this won't work
			ASSERT( ii<d2Err_wrt_dYn.size() );
			d2Err_wrt_dWn[ (*cit).WeightIndex ] += d2Err_wrt_dYn[ ii ] * output * output ;
		}
		
		ii++;
	}
	
	
	// calculate d2Err_wrt_Xnm1 = ( Wn )^2 * d2Err_wrt_dYn (where d2Err_wrt_dYn is a second derivative not a first).
	// d2Err_wrt_Xnm1 is needed as the input value of
	// d2Err_wrt_Xn for backpropagation of second derivatives for the next (i.e., previous spatially) layer
	// For each neuron in this layer
	
	ii = 0;
	for ( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
	{
		NNNeuron& n = *(*nit);  // for simplifying the terminology
		
		for ( cit=n.m_Connections.begin(); cit<n.m_Connections.end(); cit++ )
		{
			kk=(*cit).NeuronIndex;
			if ( kk != ULONG_MAX )
			{
				// we exclude ULONG_MAX, which signifies the phantom bias neuron with
				// constant output of "1", since we cannot train the bias neuron
				
				nIndex = kk;
				
				ASSERT( nIndex<d2Err_wrt_dXnm1.size() );
				ASSERT( ii<d2Err_wrt_dYn.size() );
				ASSERT( (*cit).WeightIndex<m_Weights.size() );
				
				dTemp = m_Weights[ (*cit).WeightIndex ]->value ; 
				
				d2Err_wrt_dXnm1[ nIndex ] += d2Err_wrt_dYn[ ii ] * dTemp * dTemp ;
			}
			
		}
		
		ii++;  // ii tracks the neuron iterator
		
	}
	
	struct DOUBLE_UNION
	{
		union 
		{
			double dd;
			unsigned __int64 ullong;
		};
	};
	
	DOUBLE_UNION oldValue, newValue;
	
	// finally, update the diagonal Hessians for the weights of this layer neuron using dErr_wrt_dW.
	// By design, this function (and its iteration over many (approx 500 patterns) is called while a 
	// single thread has locked the nueral network, so there is no possibility that another
	// thread might change the value of the Hessian.  Nevertheless, since it's easy to do, we
	// use an atomic compare-and-exchange operation, which means that another thread might be in 
	// the process of backpropagation of second derivatives and the Hessians might have shifted slightly
	
	for ( jj=0; jj<m_Weights.size(); ++jj )
	{
		oldValue.dd = m_Weights[ jj ]->diagHessian;
		newValue.dd = oldValue.dd + d2Err_wrt_dWn[ jj ];
		
		while ( oldValue.ullong != _InterlockedCompareExchange64( (unsigned __int64*)(&m_Weights[ jj ]->diagHessian), 
			newValue.ullong, oldValue.ullong ) ) 
		{
			// another thread must have modified the weight.  Obtain its new value, adjust it, and try again
			
			oldValue.dd = m_Weights[ jj ]->diagHessian;
			newValue.dd = oldValue.dd + d2Err_wrt_dWn[ jj ];
		}
		
	}
	
}



void NNLayer::Serialize(CArchive &ar)
{
	VectorNeurons::iterator nit;
	VectorWeights::iterator wit;
	VectorConnections::iterator cit;
	
	int ii, jj;
	
	if (ar.IsStoring())
	{
		// TODO: add storing code here
		
		ar.WriteString( label.c_str() );
		ar.WriteString( _T("\r\n") );  // ar.ReadString will look for \r\n when loading from the archive
		ar << m_Neurons.size();
		ar << m_Weights.size();
		
		
		
		for ( nit=m_Neurons.begin(); nit<m_Neurons.end(); nit++ )
		{
			NNNeuron& n = *(*nit);
			ar.WriteString( n.label.c_str() );
			ar.WriteString( _T("\r\n") );
			ar << n.m_Connections.size();
			
			for ( cit=n.m_Connections.begin(); cit<n.m_Connections.end(); cit++ )
			{
				ar << (*cit).NeuronIndex;
				ar << (*cit).WeightIndex;
			}
		}
		
		for ( wit=m_Weights.begin(); wit<m_Weights.end(); wit++ )
		{
			ar.WriteString( (*wit)->label.c_str() );
			ar.WriteString( _T("\r\n") );
			ar << (*wit)->value;
		}
		
		
	}
	else
	{
		// TODO: add loading code here
		
		CString str;
		ar.ReadString( str );
		
		label = str;
		
		int iNumNeurons, iNumWeights, iNumConnections;
		double value;
		
		NNNeuron* pNeuron;
		NNWeight* pWeight;
		NNConnection conn;
		
		ar >> iNumNeurons;
		ar >> iNumWeights;
		
		for ( ii=0; ii<iNumNeurons; ++ii )
		{
			ar.ReadString( str );
			pNeuron = new NNNeuron( (LPCTSTR)str );
			m_Neurons.push_back( pNeuron );
			
			ar >> iNumConnections;
			
			for ( jj=0; jj<iNumConnections; ++jj )
			{
				ar >> conn.NeuronIndex;
				ar >> conn.WeightIndex;
				
				pNeuron->AddConnection( conn );
			}
		}
		
		for ( jj=0; jj<iNumWeights; ++jj )
		{
			ar.ReadString( str );
			ar >> value;
			
			pWeight = new NNWeight( (LPCTSTR)str, value );
			m_Weights.push_back( pWeight );
		}
		
	}
	
}











///////////////////////////////////////////////////////////////////////
//
//  NNWeight

NNWeight::NNWeight() : 
label( _T("") ),
value( 0.0 ), diagHessian( 0.0 )
{
	Initialize();
}

NNWeight::NNWeight( LPCTSTR str, double val /* =0.0 */ ) :
label( str ),
value( val ), diagHessian( 0.0 )
{
	Initialize();
}


void NNWeight::Initialize()
{
	
}

NNWeight::~NNWeight()
{
	
}








///////////////////////////////////////////////////////////////////////
//
//  NNNeuron


NNNeuron::NNNeuron() :
label( _T("") ), output( 0.0 )
{
	Initialize();
}

NNNeuron::NNNeuron( LPCTSTR str ) : 
label( str ), output( 0.0 )
{
	Initialize();
}


void NNNeuron::Initialize()
{
	m_Connections.clear();
}

NNNeuron::~NNNeuron()
{
	Initialize();
}


void NNNeuron::AddConnection( UINT iNeuron, UINT iWeight )
{
	m_Connections.push_back( NNConnection( iNeuron, iWeight ) );
}


void NNNeuron::AddConnection( NNConnection const & conn )
{
	m_Connections.push_back( conn );
}













By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here

Share

About the Author

Mike O'Neill

United States United States
Mike O'Neill is a patent attorney in Southern California, where he specializes in computer and software-related patents. He programs as a hobby, and in a vain attempt to keep up with and understand the technology of his clients.

| Advertise | Privacy | Mobile
Web01 | 2.8.140827.1 | Last Updated 5 Dec 2006
Article Copyright 2006 by Mike O'Neill
Everything else Copyright © CodeProject, 1999-2014
Terms of Service
Layout: fixed | fluid