using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using ANN.Perceptron.ArchiveSerialization;
using ANN.Perceptron.Layers;
using ANN.Perceptron.Neurons;
using System.IO;
using System.Threading.Tasks;
namespace ANN.Perceptron.Network
{
    // Backpropagation trainer: extends ForwardPropagation with first- and second-derivative
    // (diagonal Hessian) backward passes through a ConvolutionNetwork
public class BackPropagation : ForwardPropagation
{
#region Properties
public uint Backprops; // counter used in connection with Weight sanity check
#endregion
public BackPropagation():base()
{
EtaLearningRate = .001; // arbitrary, so that brand-new NNs can be serialized with a non-ridiculous number
Backprops = 0;
network = null;
}
public BackPropagation(ConvolutionNetwork net):base(net)
{
EtaLearningRate = .001; // arbitrary, so that brand-new NNs can be serialized with a non-ridiculous number
Backprops = 0;
network = net;
}
public void Backpropagate(double[] actualOutput, double[] desiredOutput, int count, NeuronOutputs[] pMemorizedNeuronOutputs)
{
// backpropagates through the neural net
            if (network.LayerCount < 2) // there must be at least two layers in the net
            {
                return;
            }
            if ((actualOutput == null) || (desiredOutput == null) || (count >= 256)) // sanity check on the output vectors
            {
                return;
            }
            // check if it's time for a weight sanity check
            Backprops++; // count this call; without the increment, the modulo test below would fire every time
            if ((Backprops % 10000) == 0)
            {
                // every 10000 backprops
                PeriodicWeightSanityCheck();
            }
            // proceed from the last layer to the first, iteratively.
            // We calculate the last layer separately, and first, since it provides the needed derivative
            // (i.e., dErr_wrt_dXnm1) for the previous network.Layers
// nomenclature:
//
// Err is output error of the entire neural net
// Xn is the output vector on the n-th layer
// Xnm1 is the output vector of the previous layer
// Wn is the vector of weights of the n-th layer
// Yn is the activation value of the n-th layer, i.e., the weighted sum of inputs BEFORE the squashing function is applied
// F is the squashing function: Xn = F(Yn)
// F' is the derivative of the squashing function
// Conveniently, for F = tanh, then F'(Yn) = 1 - Xn^2, i.e., the derivative can be calculated from the output, without knowledge of the input
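            //
            // Reference sketch (assumption: the actual arithmetic lives in each layer's
            // Backpropagate; these are the standard chain-rule identities it is expected to apply):
            //
            //    dErr_wrt_dYn   = F'(Yn) * dErr_wrt_dXn      (through the squashing function)
            //    dErr_wrt_dWn   = Xnm1   * dErr_wrt_dYn      (gradient for each weight)
            //    dErr_wrt_dXnm1 = Wn^T   * dErr_wrt_dYn      (error returned to the previous layer)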
            int iSize = network.LayerCount;
            int lastNeuronCount = network.Layers[network.LayerCount - 1].NeuronCount; // evaluated once; layer sizes do not change during backpropagation
            var dErr_wrt_dXlast = new double[lastNeuronCount];
            var differentials = new List<double[]>(iSize);
            // start the process by calculating dErr_wrt_dXn for the last layer.
            // for the standard MSE Err function (i.e., 0.5*sumof( (actual-target)^2 )), this differential is simply
            // the difference between the actual and the target
            Parallel.For(0, lastNeuronCount, ii =>
            {
                dErr_wrt_dXlast[ii] = actualOutput[ii] - desiredOutput[ii];
            });
            // reserve zero-initialized vectors for the differentials of all layers but the last
            // (newly allocated double[] arrays are already zeroed, so no explicit fill is needed);
            // dErr_wrt_dXlast is appended after this loop
            for (int ii = 0; ii < iSize - 1; ii++)
            {
                differentials.Add(new double[network.Layers[ii].NeuronCount]);
            }
differentials.Add(dErr_wrt_dXlast); // last one
// now iterate through all layers including the last but excluding the first, and ask each of
// them to backpropagate error and adjust their weights, and to return the differential
// dErr_wrt_dXnm1 for use as the input value of dErr_wrt_dXn for the next iterated layer
bool bMemorized = ( pMemorizedNeuronOutputs != null );
            for (int jj = iSize - 1; jj > 0; jj--)
            {
                if (bMemorized)
                {
                    network.Layers[jj].Backpropagate(differentials[jj], differentials[jj - 1],
                        pMemorizedNeuronOutputs[jj], pMemorizedNeuronOutputs[jj - 1], EtaLearningRate);
                }
                else
                {
                    network.Layers[jj].Backpropagate(differentials[jj], differentials[jj - 1],
                        null, null, EtaLearningRate);
                }
            }
differentials.Clear();
}
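        /* Usage sketch (hypothetical caller; how the forward pass is invoked is an
           assumption, since that lives in ForwardPropagation, not in this file):

               var trainer = new BackPropagation(net);
               var actual = new double[outputCount];
               // ... run the forward pass for the current sample, filling `actual` ...
               trainer.Backpropagate(actual, target, outputCount, null); // null => no memorized outputs
        */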
        public void EraseHessianInformation()
        {
            // instructs each layer to reset its accumulated diagonal Hessian information to zero
            foreach (var lit in network.Layers)
            {
                lit.EraseHessianInformation();
            }
        }
public void DivideHessianInformationBy(double divisor)
{
            // instructs each layer to divide its current diagonal Hessian info by a common divisor.
            // A check is also made to ensure that each Hessian entry is non-negative
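            // In the stochastic diagonal Levenberg-Marquardt scheme this supports, the averaged
            // diagonal Hessian entries h_kk typically give each weight its own effective step,
            //     eta_k = EtaLearningRate / (h_kk + mu),
            // with mu a small safety constant (assumption: the per-weight division is performed
            // in the layer/neuron classes, which are not shown in this file)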
foreach (var lit in network.Layers)
{
lit.DivideHessianInformationBy(divisor);
}
}
public void BackpropagateSecondDervatives(double[] actualOutputVector, double[] targetOutputVector, int outputCount)
{
            // calculates the second derivatives (for the diagonal Hessian) and backpropagates
            // them through the neural net
            if (network.LayerCount < 2) // there must be at least two layers in the net
            {
                return;
            }
            if ((actualOutputVector == null) || (targetOutputVector == null) || (outputCount >= 256))
            {
                return;
            }
// we use nearly the same nomenclature as above (e.g., "dErr_wrt_dXnm1") even though everything here
// is actually second derivatives and not first derivatives, since otherwise the ASCII would
// become too confusing. To emphasize that these are second derivatives, we insert a "2"
// such as "d2Err_wrt_dXnm1". We don't insert the second "2" that's conventional for designating
// second derivatives"
int iSize = network.LayerCount;
int neuronCount = network.Layers[network.LayerCount - 1].NeuronCount;
var d2Err_wrt_dXlast = new double[neuronCount];
var differentials = new List<double[]>(iSize);
            // start the process by calculating the second derivative d2Err_wrt_dXn for the last layer.
            // for the standard MSE Err function (i.e., 0.5*sumof( (actual-target)^2 )), this differential is
            // exactly one
            // the Parallel.For bound reuses neuronCount, captured once above (layer sizes do not change mid-pass)
            Parallel.For(0, neuronCount, ParallelOption, ii =>
            {
                d2Err_wrt_dXlast[ii] = 1.0;
            });
            // reserve zero-initialized vectors for the differentials of all layers but the last
            // (newly allocated double[] arrays are already zeroed, so no explicit fill is needed);
            // d2Err_wrt_dXlast is appended after this loop
            for (int ii = 0; ii < iSize - 1; ii++)
            {
                differentials.Add(new double[network.Layers[ii].NeuronCount]);
            }
differentials.Add(d2Err_wrt_dXlast); // last one
            // now iterate through all layers including the last but excluding the first, starting from
            // the last, and ask each of them to backpropagate the second derivative and accumulate the
            // diagonal Hessian, and also to return the second derivative d2Err_wrt_dXnm1 for use as the
            // input value of d2Err_wrt_dXn for the next iterated layer (which is the previous layer spatially)
            for (int ii = iSize - 1; ii > 0; ii--)
            {
                network.Layers[ii].BackpropagateSecondDerivatives(differentials[ii], differentials[ii - 1]);
            }
differentials.Clear();
}
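        /* Usage sketch for estimating the diagonal Hessian over a batch (hypothetical
           caller; sampleCount and the forward-pass step are assumptions):

               trainer.EraseHessianInformation();
               for (int i = 0; i < sampleCount; i++)
               {
                   // ... forward pass for sample i, filling `actual` ...
                   trainer.BackpropagateSecondDervatives(actual, target, outputCount);
               }
               trainer.DivideHessianInformationBy(sampleCount); // average over the batch
        */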
protected void PeriodicWeightSanityCheck()
{
            // goes through all weights and tests them against an arbitrary "reasonable"
            // upper limit; if the upper limit is exceeded, a warning is displayed
foreach (var lit in network.Layers)
{
lit.PeriodicWeightSanityCheck();
}
}
override public void Serialize(Archive ar)
{
if (ar.IsStoring())
{
                // store: the learning rate, the layer count, then each layer in order
ar.Write(EtaLearningRate);
ar.Write(network.LayerCount);
foreach (var lit in network.Layers)
{
lit.Serialize( ar );
}
}
else
{
                // load: the learning rate, the layer count, then reconstruct and deserialize each layer
                double eta;
                ar.Read(out eta);
                EtaLearningRate = eta; // two-step read is needed since the backing field is volatile, and a volatile field cannot be passed as an out argument
                int nLayers;
                CommonLayer pLayer = null;
                ar.Read(out nLayers);
network.Layers = new CommonLayer[nLayers];
for ( int ii=0; ii<nLayers; ii++ )
{
pLayer = new CommonLayer( "", pLayer );
network.Layers[ii]=pLayer;
pLayer.Serialize( ar );
}
}
}
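        /* Round-trip sketch (hypothetical; how an Archive is constructed for storing
           versus loading is an assumption not shown in this file):

               trainer.Serialize(storingArchive); // writes EtaLearningRate, layer count, then each layer
               trainer.Serialize(loadingArchive); // reads them back and rebuilds network.Layers
        */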
}
}