From 4eac0a6e0d7deb8711db0ecbbdece7e9e241c3c4 Mon Sep 17 00:00:00 2001 From: moneta <lorenzo.moneta@cern.ch> Date: Fri, 20 Apr 2018 13:18:15 +0200 Subject: [PATCH] Fix backpropagation through time for both reference and CPU. Improve backprop test --- tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h | 6 +- .../TMVA/DNN/Architectures/Cpu/CpuMatrix.h | 2 +- tmva/tmva/inc/TMVA/DNN/RNN/RNNLayer.h | 5 +- .../src/DNN/Architectures/Cpu/Arithmetic.cxx | 14 +- .../Cpu/RecurrentPropagation.cxx | 34 ++- .../Reference/RecurrentPropagation.cxx | 32 ++- tmva/tmva/src/MethodDL.cxx | 3 +- tmva/tmva/test/DNN/RNN/CMakeLists.txt | 3 + .../DNN/RNN/TestRecurrentBackpropagation.cxx | 14 +- .../DNN/RNN/TestRecurrentBackpropagation.h | 199 +++++++++++++++--- .../RNN/TestRecurrentBackpropagationCpu.cxx | 11 +- 11 files changed, 263 insertions(+), 60 deletions(-) diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h index 5bc872b9cc2..116946b1ba2 100644 --- a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h @@ -430,7 +430,8 @@ public: */ static void TransposeMultiply(TCpuMatrix<Scalar_t> &output, const TCpuMatrix<Scalar_t> &input, - const TCpuMatrix<Scalar_t> &Weights); + const TCpuMatrix<Scalar_t> &Weights, + Scalar_t alpha = 1.0, Scalar_t beta = 0.); /** In-place Hadamard (element-wise) product of matrices \p A and \p B * with the result being written into \p A. */ @@ -441,7 +442,8 @@ public: * m elements in \p A. */ static void SumColumns(TCpuMatrix<Scalar_t> &B, - const TCpuMatrix<Scalar_t> &A); + const TCpuMatrix<Scalar_t> &A, + Scalar_t alpha = 1.0, Scalar_t beta = 0.); /** Compute the sum of all elements in \p A */ static Scalar_t Sum(const TCpuMatrix<Scalar_t> &A); diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuMatrix.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuMatrix.h index 069ccfd7bcd..831655d9c68 100644 --- a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuMatrix.h +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuMatrix.h @@ -25,7 +25,7 @@ #include "CpuBuffer.h" #include <TMVA/Config.h> -//#define DEBUG_TMVA_TCPUMATRIX +#define DEBUG_TMVA_TCPUMATRIX #if defined(DEBUG_TMVA_TCPUMATRIX) #define PrintMatrix(mat, text) \ { \ diff --git a/tmva/tmva/inc/TMVA/DNN/RNN/RNNLayer.h b/tmva/tmva/inc/TMVA/DNN/RNN/RNNLayer.h index cc5757563f7..6cbbae05ea2 100644 --- a/tmva/tmva/inc/TMVA/DNN/RNN/RNNLayer.h +++ b/tmva/tmva/inc/TMVA/DNN/RNN/RNNLayer.h @@ -298,7 +298,8 @@ auto inline TBasicRNNLayer<Architecture_t>::Backward(Tensor_t &gradients_backwar // gradients_backward is activationGradients of layer before it, which is input layer // currently gradient_backward is for input(x) and not for state // TODO use this to change initial state??
- + + bool dummy = false; if (gradients_backward.size() == 0 || gradients_backward[0].GetNrows() == 0 || gradients_backward[0].GetNcols() == 0) { dummy = true; @@ -313,7 +314,7 @@ auto inline TBasicRNNLayer<Architecture_t>::Backward(Tensor_t &gradients_backwar Tensor_t arr_activations_backward; for (size_t t = 0; t < fTimeSteps; ++t) arr_activations_backward.emplace_back(this->GetBatchSize(), this->GetInputSize()); // T x B x D Architecture_t::Rearrange(arr_activations_backward, activations_backward); - + Matrix_t state_gradients_backward(this->GetBatchSize(), fStateSize); // B x H DNN::initialize<Architecture_t>(state_gradients_backward, DNN::EInitialization::kZero); diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/Arithmetic.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/Arithmetic.cxx index b04e3f236c0..07aff00f6ef 100644 --- a/tmva/tmva/src/DNN/Architectures/Cpu/Arithmetic.cxx +++ b/tmva/tmva/src/DNN/Architectures/Cpu/Arithmetic.cxx @@ -55,7 +55,8 @@ void TCpu<Real_t>::Multiply(TCpuMatrix<Real_t> &C, template<typename Real_t> void TCpu<Real_t>::TransposeMultiply(TCpuMatrix<Real_t> &C, const TCpuMatrix<Real_t> &A, - const TCpuMatrix<Real_t> &B) + const TCpuMatrix<Real_t> &B, + Real_t alpha, Real_t beta) { int m = (int) A.GetNcols(); int k = (int) A.GetNrows(); @@ -68,8 +69,8 @@ void TCpu<Real_t>::TransposeMultiply(TCpuMatrix<Real_t> &C, char transa = 'T'; char transb = 'N'; - Real_t alpha = 1.0; - Real_t beta = 0.0; + //Real_t alpha = 1.0; + //Real_t beta = 0.0; const Real_t *APointer = A.GetRawDataPointer(); const Real_t *BPointer = B.GetRawDataPointer(); @@ -112,14 +113,15 @@ void TCpu<Real_t>::Hadamard(TCpuMatrix<Real_t> &B, //____________________________________________________________________________ template<typename Real_t> void TCpu<Real_t>::SumColumns(TCpuMatrix<Real_t> &B, - const TCpuMatrix<Real_t> &A) + const TCpuMatrix<Real_t> &A, + Real_t alpha, Real_t beta) { int m = (int) A.GetNrows(); int n = (int) A.GetNcols(); int inc = 1; - Real_t alpha = 1.0; - Real_t beta = 0.0; + // Real_t alpha = 1.0; + //Real_t beta = 0.0; char trans = 'T'; const Real_t * APointer = A.GetRawDataPointer(); diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/RecurrentPropagation.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/RecurrentPropagation.cxx index 4f7311d570a..ab43f052416 100644 --- a/tmva/tmva/src/DNN/Architectures/Cpu/RecurrentPropagation.cxx +++ b/tmva/tmva/src/DNN/Architectures/Cpu/RecurrentPropagation.cxx @@ -37,6 +37,14 @@ auto TCpu<AFloat>::RecurrentLayerBackward(TCpuMatrix<AFloat> & state_gradients_b TCpuMatrix<AFloat> & input_gradient) -> TCpuMatrix<AFloat> & { + + // std::cout << "Recurrent Propo" << std::endl; + // PrintMatrix(df,"DF"); + // PrintMatrix(state_gradients_backward,"State grad"); + // PrintMatrix(input_weight_gradients,"input w grad"); + // PrintMatrix(state,"state"); + // PrintMatrix(input,"input"); + // Compute element-wise product. Hadamard(df, state_gradients_backward); // B x H @@ -48,18 +56,30 @@ auto TCpu<AFloat>::RecurrentLayerBackward(TCpuMatrix<AFloat> & state_gradients_b // Weights gradients if (input_weight_gradients.GetNElements() > 0) { - TCpuMatrix<AFloat> tmp(input_weight_gradients); - TransposeMultiply(input_weight_gradients, df, input); // H x B . B x D - ScaleAdd(input_weight_gradients, tmp, 1); + //TCpuMatrix<AFloat> tmp(input_weight_gradients); + TransposeMultiply(input_weight_gradients, df, input, 1. , 1.); // H x B . 
B x D + //ScaleAdd(input_weight_gradients, tmp, 1); } if (state_weight_gradients.GetNElements() > 0) { - TCpuMatrix<AFloat> tmp(state_weight_gradients); - TransposeMultiply(state_weight_gradients, df, state); // H x B . B x H - ScaleAdd(state_weight_gradients, tmp, 1); + //TCpuMatrix<AFloat> tmp(state_weight_gradients); + TransposeMultiply(state_weight_gradients, df, state, 1. , 1. ); // H x B . B x H + //ScaleAdd(state_weight_gradients, tmp, 1); } // Bias gradients. - if (bias_gradients.GetNElements() > 0) SumColumns(bias_gradients, df); + if (bias_gradients.GetNElements() > 0) { + //TCpuMatrix<AFloat> tmp(bias_gradients); + SumColumns(bias_gradients, df, 1., 1.); // could be probably do all here + //ScaleAdd(bias_gradients, tmp, 1); + } + + //std::cout << "RecurrentPropo: end " << std::endl; + + // PrintMatrix(state_gradients_backward,"State grad"); + // PrintMatrix(input_weight_gradients,"input w grad"); + // PrintMatrix(bias_gradients,"bias grad"); + // PrintMatrix(input_gradient,"input grad"); + return input_gradient; } diff --git a/tmva/tmva/src/DNN/Architectures/Reference/RecurrentPropagation.cxx b/tmva/tmva/src/DNN/Architectures/Reference/RecurrentPropagation.cxx index 2841df44330..c89d1455d76 100644 --- a/tmva/tmva/src/DNN/Architectures/Reference/RecurrentPropagation.cxx +++ b/tmva/tmva/src/DNN/Architectures/Reference/RecurrentPropagation.cxx @@ -35,6 +35,19 @@ auto TReference<Scalar_t>::RecurrentLayerBackward(TMatrixT<Scalar_t> & state_gra TMatrixT<Scalar_t> & input_gradient) -> Matrix_t & { + + // std::cout << "Reference Recurrent Propo" << std::endl; + // std::cout << "df\n"; + // df.Print(); + // std::cout << "state gradient\n"; + // state_gradients_backward.Print(); + // std::cout << "inputw gradient\n"; + // input_weight_gradients.Print(); + // std::cout << "state\n"; + // state.Print(); + // std::cout << "input\n"; + // input.Print(); + // Compute element-wise product. for (size_t i = 0; i < (size_t) df.GetNrows(); i++) { for (size_t j = 0; j < (size_t) df.GetNcols(); j++) { @@ -48,7 +61,7 @@ auto TReference<Scalar_t>::RecurrentLayerBackward(TMatrixT<Scalar_t> & state_gra } // State gradients if (state_gradients_backward.GetNoElements() > 0) { - state_gradients_backward.MultT(df, weights_state); // B x H . H x H = B x H + state_gradients_backward.Mult(df, weights_state); // B x H . H x H = B x H } // Weights gradients. @@ -65,14 +78,29 @@ auto TReference<Scalar_t>::RecurrentLayerBackward(TMatrixT<Scalar_t> & state_gra // Bias gradients. 
B x H -> H x 1 if (bias_gradients.GetNoElements() > 0) { + // this loops on state size for (size_t j = 0; j < (size_t) df.GetNcols(); j++) { Scalar_t sum = 0.0; + // this loops on batch size summing all gradient contributions in a batch for (size_t i = 0; i < (size_t) df.GetNrows(); i++) { sum += df(i,j); } - bias_gradients(j,0) = sum; + bias_gradients(j,0) += sum; } } + + // std::cout << "RecurrentPropo: end " << std::endl; + + // std::cout << "state gradient\n"; + // state_gradients_backward.Print(); + // std::cout << "inputw gradient\n"; + // input_weight_gradients.Print(); + // std::cout << "bias gradient\n"; + // bias_gradients.Print(); + // std::cout << "input gradient\n"; + // input_gradient.Print(); + + return input_gradient; } diff --git a/tmva/tmva/src/MethodDL.cxx b/tmva/tmva/src/MethodDL.cxx index 609fdd7d577..1acce247c70 100644 --- a/tmva/tmva/src/MethodDL.cxx +++ b/tmva/tmva/src/MethodDL.cxx @@ -471,8 +471,7 @@ void MethodDL::CreateDeepNet(DNN::TDeepNet<Architecture_t, Layer_t> &deepNet, } else if (strLayerType == "RESHAPE") { ParseReshapeLayer(deepNet, nets, layerString->GetString(), subDelimiter); } else if (strLayerType == "RNN") { - Log() << kFATAL << "RNN Layer is not yet fully implemented" << Endl; - //ParseRnnLayer(deepNet, nets, layerString->GetString(), subDelimiter); + ParseRnnLayer(deepNet, nets, layerString->GetString(), subDelimiter); } else if (strLayerType == "LSTM") { Log() << kFATAL << "LSTM Layer is not yet fully implemented" << Endl; //ParseLstmLayer(deepNet, nets, layerString->GetString(), subDelimiter); diff --git a/tmva/tmva/test/DNN/RNN/CMakeLists.txt b/tmva/tmva/test/DNN/RNN/CMakeLists.txt index 14bfbf4dadd..5bb1d3da02c 100644 --- a/tmva/tmva/test/DNN/RNN/CMakeLists.txt +++ b/tmva/tmva/test/DNN/RNN/CMakeLists.txt @@ -48,4 +48,7 @@ if (BLAS_FOUND AND imt) ROOT_EXECUTABLE(testForwardPassCpu TestForwardPassCpu.cxx LIBRARIES ${Libraries}) ROOT_ADD_TEST(TMVA-DNN-RNN-Forward-Cpu COMMAND testForwardPassCpu) + ROOT_EXECUTABLE(testRecurrentBackpropagationCpu TestRecurrentBackpropagationCpu.cxx LIBRARIES ${Libraries}) + ROOT_ADD_TEST(TMVA-DNN-RNN-BackpropagationCpu COMMAND testRecurrentBackpropagationCpu) + endif (BLAS_FOUND AND imt) diff --git a/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagation.cxx b/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagation.cxx index 168f5c37844..bc8b6fef7e6 100644 --- a/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagation.cxx +++ b/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagation.cxx @@ -24,11 +24,19 @@ int main() { std::cout << "Testing RNN backward pass\n"; // timesteps, batchsize, statesize, inputsize - testRecurrentBackpropagationWeights<TReference<double>>(2, 2, 1, 2, 1e-5); + testRecurrentBackpropagation<TReference<double>>(1, 2, 1, 10, 1e-5); - testRecurrentBackpropagationBiases<TReference<double>>(1, 2, 3, 2, 1e-5); + + testRecurrentBackpropagation<TReference<double>>(2, 2, 1, 10, 1e-5); - testRecurrentBackpropagationWeights<TReference<double>>(2, 3, 4, 5, 1e-5); + testRecurrentBackpropagation<TReference<double>>(1, 2, 2, 10, 1e-5); + + + testRecurrentBackpropagation<TReference<double>>(2, 1, 2, 5, 1e-10); + + testRecurrentBackpropagation<TReference<double>>(4, 2, 3, 10, 1e-10); + // using a fixed input + testRecurrentBackpropagation<TReference<double>>(3, 1, 4, 5, 1e-10, false); return 0; } diff --git a/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagation.h b/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagation.h index 964e621a442..f8b83a11bd0 100644 --- a/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagation.h +++ 
b/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagation.h @@ -20,6 +20,8 @@ #include <vector> #include "../Utility.h" +#include "Math/Functor.h" +#include "Math/RichardsonDerivator.h" #include "TMVA/DNN/Functions.h" #include "TMVA/DNN/DeepNet.h" @@ -63,14 +65,15 @@ auto printTensor(const typename Architecture::Matrix_t &A, const std::string nam template <typename Architecture> auto evaluate_net_weight(TDeepNet<Architecture> &net, std::vector<typename Architecture::Matrix_t> & X, const typename Architecture::Matrix_t &Y, const typename Architecture::Matrix_t &W, size_t l, - size_t k, size_t i, size_t j, typename Architecture::Scalar_t dx) -> + size_t k, size_t i, size_t j, typename Architecture::Scalar_t xvalue) -> typename Architecture::Scalar_t { using Scalar_t = typename Architecture::Scalar_t; - net.GetLayerAt(l)->GetWeightsAt(k).operator()(i,j) += dx; + Scalar_t prev_value = net.GetLayerAt(l)->GetWeightsAt(k).operator()(i,j); + net.GetLayerAt(l)->GetWeightsAt(k).operator()(i,j) = xvalue; Scalar_t res = net.Loss(X, Y, W, false, false); - net.GetLayerAt(l)->GetWeightsAt(k).operator()(i,j) -= dx; + net.GetLayerAt(l)->GetWeightsAt(k).operator()(i,j) = prev_value; return res; } @@ -80,60 +83,127 @@ auto evaluate_net_weight(TDeepNet<Architecture> &net, std::vector<typename Archi template <typename Architecture> auto evaluate_net_bias(TDeepNet<Architecture> &net, std::vector<typename Architecture::Matrix_t> & X, const typename Architecture::Matrix_t &Y, const typename Architecture::Matrix_t &W, size_t l, - size_t k, size_t i, typename Architecture::Scalar_t dx) -> typename Architecture::Scalar_t + size_t k, size_t i, typename Architecture::Scalar_t xvalue) -> typename Architecture::Scalar_t { using Scalar_t = typename Architecture::Scalar_t; - - net.GetLayerAt(l)->GetBiasesAt(k).operator()(i,0) += dx; + + Scalar_t prev_value = net.GetLayerAt(l)->GetBiasesAt(k).operator()(i,0); + net.GetLayerAt(l)->GetBiasesAt(k).operator()(i,0) = xvalue; Scalar_t res = net.Loss(X, Y, W, false, false); - net.GetLayerAt(l)->GetBiasesAt(k).operator()(i,0) -= dx; + net.GetLayerAt(l)->GetBiasesAt(k).operator()(i,0) = prev_value; return res; } /*! 
Generate a DeepNet, test backward pass */ //______________________________________________________________________________ template <typename Architecture> -auto testRecurrentBackpropagationWeights(size_t timeSteps, size_t batchSize, size_t stateSize, - size_t inputSize, typename Architecture::Scalar_t dx) +auto testRecurrentBackpropagation(size_t timeSteps, size_t batchSize, size_t stateSize, + size_t inputSize, typename Architecture::Scalar_t dx = 1.E-5, bool randomInput = true) -> Double_t { + bool debug = false; + using Matrix_t = typename Architecture::Matrix_t; using Tensor_t = std::vector<Matrix_t>; using RNNLayer_t = TBasicRNNLayer<Architecture>; using Net_t = TDeepNet<Architecture>; using Scalar_t = typename Architecture::Scalar_t; - std::vector<TMatrixT<Double_t>> XRef(batchSize, TMatrixT<Double_t>(timeSteps, inputSize)); // B x T x D - Tensor_t XArch; - //for (size_t i = 0; i < batchSize; ++i) XArch.emplace_back(timeSteps, inputSize); // B x T x D + //std::vector<Matrix_t<Double_t>> XRef(batchSize, Matrix_t<Double_t>(timeSteps, inputSize)); // B x T x D + Tensor_t XArch; // B x T x D for (size_t i = 0; i < batchSize; ++i) { - //randomMatrix(XRef[i]); - for (size_t l = 0; l < XRef[i].GetNrows(); ++l) { - for (size_t m = 0; m < XRef[i].GetNcols(); ++m) { - XRef[i](l, m) = i + l + m; - } - } - XArch.emplace_back(XRef[i]); + XArch.emplace_back(timeSteps, inputSize); } + // for random input (default) + if (randomInput) { + for (size_t i = 0; i < batchSize; ++i) { + for (size_t l = 0; l < XArch[i].GetNrows(); ++l) { + for (size_t m = 0; m < XArch[i].GetNcols(); ++m) { + //XArch[i](l, m) = i + l + m; + XArch[i](l, m) = gRandom->Uniform(-1,1); + } + } + } + } + else { + debug = true; + R__ASSERT(inputSize <= 6); + R__ASSERT(timeSteps <= 3); + R__ASSERT(batchSize <= 1); + double x_input[] = {-1, 1, -2, 2, -3, 3 , + -0.5, 0.5,-0.8,0.9, -2, 1.5, + -0.2, 0.1,-0.5,0.4, -1, 1.}; + + TMatrixD Input(3,6,x_input); + for (size_t i = 0; i < batchSize; ++i) { + auto & mat = XArch[i]; + // time 0 + for (int l = 0; l < timeSteps; ++l) { + for (int m = 0; m < inputSize; ++m) { + mat(l,m) = Input(l,m); + } + } + } + gRandom->SetSeed(1); // for weights initizialization + } + printTensor<Architecture>(XArch,"input"); + Matrix_t Y(batchSize, timeSteps * stateSize), weights(batchSize, 1); //randomMatrix(Y); for (size_t i = 0; i < Y.GetNrows(); ++i) { for (size_t j = 0; j < Y.GetNcols(); ++j) { - Y(i, j) = (i + j)/2.0 - 0.75; + Y(i, j) = 1;// (i + j)/2.0 - 0.75; } } fillMatrix(weights, 1.0); + std::cout << "Testing Weight Backprop using RNN with batchsize = " << batchSize << " input = " << inputSize << " state = " << stateSize << " time = " << timeSteps << std::endl; Net_t rnn(batchSize, batchSize, timeSteps, inputSize, 0, 0, 0, ELossFunction::kMeanSquaredError, EInitialization::kGauss); RNNLayer_t* layer = rnn.AddBasicRNNLayer(stateSize, inputSize, timeSteps); + //layer->Print(); rnn.AddReshapeLayer(1, timeSteps, stateSize, true); rnn.Initialize(); + + + auto & wi = layer->GetWeights()[0]; + for (int i = 0; i < stateSize; ++i) { + for (int j = 0; j < inputSize; ++j) { + wi(i,j) = gRandom->Uniform(-1,1); + } + wi(i,i) = 1.; + } + + auto & wh = layer->GetWeights()[1]; + for (int i = 0; i < stateSize; ++i) { + for (int j = 0; j < stateSize; ++j) { + wh(i,j) = gRandom->Uniform(-1,1); + } + wh(i,i) = 0.5; + } + auto & b = layer->GetBiases()[0]; + for (int i = 0; i < b.GetNrows(); ++i) { + for (int j = 0; j < b.GetNcols(); ++j) { + b(i,j) = gRandom->Uniform(-0.5,0.5); + } + } + + rnn.Forward(XArch); 
rnn.Backward(XArch, Y, weights); - + + if (debug) { + auto & out = layer->GetOutput(); + printTensor<Architecture>(out,"output"); + } + + Scalar_t maximum_error = 0.0; + std::string maxerrorType; + + ROOT::Math::RichardsonDerivator deriv; + // Weights Input, k = 0 auto &Wi = layer->GetWeightsAt(0); @@ -143,26 +213,37 @@ auto testRecurrentBackpropagationWeights(size_t timeSteps, size_t batchSize, siz auto f = [&rnn, &XArch, &Y, &weights, i, j](Scalar_t x) { return evaluate_net_weight(rnn, XArch, Y, weights, 0, 0, i, j, x); }; - Scalar_t dy = finiteDifference(f, dx) / (2.0 * dx); + ROOT::Math::Functor1D func(f); + double dy = deriv.Derivative1(func, Wi(i,j), 1.E-5); Scalar_t dy_ref = dWi(i, j); // Compute the relative error if dy != 0. Scalar_t error; + std::string errorType; if (std::fabs(dy_ref) > 1e-15) { error = std::fabs((dy - dy_ref) / dy_ref); + errorType = "relative"; } else { error = std::fabs(dy - dy_ref); + errorType = "absolute"; } - maximum_error = std::max(error, maximum_error); + if (debug) std::cout << "Weight-input gradient (" << i << "," << j << ") : (comp, ref) " << dy << " , " << dy_ref << std::endl; + + if (error >= maximum_error) { + maximum_error = error; + maxerrorType = errorType; + } } } std::cout << "\rTesting weight input gradients: "; - std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl; + std::cout << "maximum error (" << maxerrorType << "): " << print_error(maximum_error) << std::endl; + /// testing weight state gradient + // Weights State, k = 1 - Scalar_t smaximum_error = 0.0; + maximum_error = 0; auto &Ws = layer->GetWeightsAt(1); auto &dWs = layer->GetWeightGradientsAt(1); for (Int_t i = 0; i < Ws.GetNrows(); ++i) { @@ -170,25 +251,68 @@ auto testRecurrentBackpropagationWeights(size_t timeSteps, size_t batchSize, siz auto f = [&rnn, &XArch, &Y, &weights, i, j](Scalar_t x) { return evaluate_net_weight(rnn, XArch, Y, weights, 0, 1, i, j, x); }; - Scalar_t dy = finiteDifference(f, dx) / (2.0 * dx); + ROOT::Math::Functor1D func(f); + double dy = deriv.Derivative1(func, Ws(i,j), 1.E-5); Scalar_t dy_ref = dWs(i, j); // Compute the relative error if dy != 0. Scalar_t error; + std::string errorType; if (std::fabs(dy_ref) > 1e-15) { error = std::fabs((dy - dy_ref) / dy_ref); + errorType = "relative"; } else { error = std::fabs(dy - dy_ref); + errorType = "absolute"; } - smaximum_error = std::max(error, smaximum_error); + if ( error >= maximum_error) { + maximum_error = error; + maxerrorType = errorType; + } + if (debug) std::cout << "Weight-state gradient (" << i << "," << j << ") : (comp, ref) " << dy << " , " << dy_ref << std::endl; } } std::cout << "\rTesting weight state gradients: "; - std::cout << "maximum relative error: " << print_error(smaximum_error) << std::endl; + std::cout << "maximum error (" << maxerrorType << "): " << print_error(maximum_error) << std::endl; + + // testing bias gradients + maximum_error = 0; + auto &B = layer->GetBiasesAt(0); + auto &dB = layer->GetBiasGradientsAt(0); + for (Int_t i = 0; i < B.GetNrows(); ++i) { + auto f = [&rnn, &XArch, &Y, &weights, i](Scalar_t x) { + return evaluate_net_bias(rnn, XArch, Y, weights, 0, 0, i, x); + }; + ROOT::Math::Functor1D func(f); + double dy = deriv.Derivative1(func, B(i,0), 1.E-5); + Scalar_t dy_ref = dB(i, 0); + + // Compute the relative error if dy != 0. 
+ Scalar_t error; + std::string errorType; + if (std::fabs(dy_ref) > 1e-15) { + error = std::fabs((dy - dy_ref) / dy_ref); + errorType = "relative"; + } else { + error = std::fabs(dy - dy_ref); + errorType = "absolute"; + } + + if ( error >= maximum_error) { + maximum_error = error; + maxerrorType = errorType; + } + if (debug) std::cout << "Bias gradient (" << i << ") : (comp, ref) " << dy << " , " << dy_ref << std::endl; + } - return std::max(maximum_error, smaximum_error); + std::cout << "\rTesting bias gradients: "; + std::cout << "maximum error (" << maxerrorType << "): " << print_error(maximum_error) << std::endl; + + + //return std::max(maximum_error, smaximum_error); + return maximum_error; } /*! Generate a DeepNet, test backward pass */ @@ -216,6 +340,8 @@ auto testRecurrentBackpropagationBiases(size_t timeSteps, size_t batchSize, size randomMatrix(Y); fillMatrix(weights, 1.0); + std::cout << "Testing Bias Backprop using RNN with batchsize = " << batchSize << " input = " << inputSize << " state = " << stateSize << " time = " << timeSteps << std::endl; + Net_t rnn(batchSize, batchSize, timeSteps, inputSize, 0, 0, 0, ELossFunction::kMeanSquaredError, EInitialization::kGauss); RNNLayer_t* layer = rnn.AddBasicRNNLayer(stateSize, inputSize, timeSteps); rnn.AddReshapeLayer(1, timeSteps, stateSize, true); @@ -225,7 +351,8 @@ auto testRecurrentBackpropagationBiases(size_t timeSteps, size_t batchSize, size rnn.Backward(XArch, Y, weights); Scalar_t maximum_error = 0.0; - + std::string merrorType; + auto &B = layer->GetBiasesAt(0); auto &dB = layer->GetBiasGradientsAt(0); for (Int_t i = 0; i < B.GetNrows(); ++i) { @@ -237,17 +364,25 @@ auto testRecurrentBackpropagationBiases(size_t timeSteps, size_t batchSize, size // Compute the relative error if dy != 0. Scalar_t error; + std::string errorType; if (std::fabs(dy_ref) > 1e-15) { error = std::fabs((dy - dy_ref) / dy_ref); + errorType = "relative"; } else { error = std::fabs(dy - dy_ref); + errorType = "absolute"; } - maximum_error = std::max(error, maximum_error); + if ( error >= maximum_error) { + maximum_error = error; + merrorType = errorType; + } + std::cout << "Bias gradient (" << i << ") : (comp, ref) " << dy << " , " << dy_ref << std::endl; + //maximum_error = std::max(error, maximum_error); } std::cout << "\rTesting bias gradients: "; - std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl; + std::cout << "maximum error (" << merrorType << "): " << print_error(maximum_error) << std::endl; return maximum_error; } diff --git a/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagationCpu.cxx b/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagationCpu.cxx index 776f7b60d9e..a73c1df39cb 100644 --- a/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagationCpu.cxx +++ b/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagationCpu.cxx @@ -25,11 +25,16 @@ int main() { using Scalar_t = Double_t; // timesteps, batchsize, statesize, inputsize - testRecurrentBackpropagationWeights<TCpu<Scalar_t>>(1, 2, 1, 2, 1e-5); + testRecurrentBackpropagation<TCpu<Scalar_t>>(1, 2, 1, 2, 1e-5); - testRecurrentBackpropagationBiases<TCpu<Scalar_t>>(1, 2, 3, 2, 1e-5); + testRecurrentBackpropagation<TCpu<Scalar_t>>(1, 2, 3, 2, 1e-5); + + testRecurrentBackpropagation<TCpu<Scalar_t>>(2, 3, 4, 5, 1e-5); + + + // using a fixed input + testRecurrentBackpropagation<TCpu<Scalar_t>>(3, 1, 4, 5, 1e-10, false); - testRecurrentBackpropagationWeights<TCpu<Scalar_t>>(2, 3, 4, 5, 1e-5); return 0; } -- GitLab
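
A note on the core of the fix: in backpropagation through time the input weights, the state weights and the biases are shared by every time step, so each per-step call to RecurrentLayerBackward must add its contribution into the shared gradient buffers. The reference implementation previously overwrote the bias gradients at every step (bias_gradients(j,0) = sum) and used the transposed state weights for the state gradient; the CPU implementation now performs the accumulation directly inside the BLAS call (alpha = 1, beta = 1 passed to TransposeMultiply / SumColumns) instead of going through a temporary copy and ScaleAdd. The standalone sketch below is not TMVA code; it uses toy sizes and made-up values purely to illustrate the accumulation rule for the input-weight gradient:

#include <cstdio>

int main()
{
   const int T = 2;  // time steps
   const int Bs = 2; // batch size
   const int H = 1;  // state size
   const int D = 2;  // input size

   // df[t] ~ B x H : element-wise gradient at time step t (after the Hadamard
   // product with the incoming state gradient); x[t] ~ B x D : input at step t.
   double df[T][Bs][H] = {{{0.1}, {0.2}}, {{-0.3}, {0.4}}};
   double x[T][Bs][D]  = {{{1.0, -1.0}, {2.0, 0.5}}, {{-2.0, 1.0}, {0.5, 3.0}}};

   double dWAccum[H][D]     = {{0.0, 0.0}}; // accumulate over t (correct)
   double dWOverwrite[H][D] = {{0.0, 0.0}}; // overwrite at each t (wrong)

   for (int t = 0; t < T; ++t) {
      // per-step contribution: df[t]^T . x[t]   (H x B . B x D)
      double step[H][D] = {{0.0, 0.0}};
      for (int i = 0; i < H; ++i)
         for (int j = 0; j < D; ++j)
            for (int b = 0; b < Bs; ++b)
               step[i][j] += df[t][b][i] * x[t][b][j];

      for (int i = 0; i < H; ++i)
         for (int j = 0; j < D; ++j) {
            dWAccum[i][j] += step[i][j];    // weights are shared across time steps
            dWOverwrite[i][j] = step[i][j]; // keeps only the last time step
         }
   }

   std::printf("accumulated dW_input = [% f, % f]\n", dWAccum[0][0], dWAccum[0][1]);
   std::printf("overwritten dW_input = [% f, % f]\n", dWOverwrite[0][0], dWOverwrite[0][1]);
   return 0;
}

The same reasoning applies to the state-weight and bias gradients, which is why the updated test checks the analytic gradients of the input weights, state weights and biases against numerical derivatives computed with ROOT::Math::RichardsonDerivator.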