From 4eac0a6e0d7deb8711db0ecbbdece7e9e241c3c4 Mon Sep 17 00:00:00 2001
From: moneta <lorenzo.moneta@cern.ch>
Date: Fri, 20 Apr 2018 13:18:15 +0200
Subject: [PATCH] Fix backpropagation through time for both reference and CPU.
 Improve backprop test

---
 tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h    |   6 +-
 .../TMVA/DNN/Architectures/Cpu/CpuMatrix.h    |   2 +-
 tmva/tmva/inc/TMVA/DNN/RNN/RNNLayer.h         |   5 +-
 .../src/DNN/Architectures/Cpu/Arithmetic.cxx  |  14 +-
 .../Cpu/RecurrentPropagation.cxx              |  34 ++-
 .../Reference/RecurrentPropagation.cxx        |  32 ++-
 tmva/tmva/src/MethodDL.cxx                    |   3 +-
 tmva/tmva/test/DNN/RNN/CMakeLists.txt         |   3 +
 .../DNN/RNN/TestRecurrentBackpropagation.cxx  |  14 +-
 .../DNN/RNN/TestRecurrentBackpropagation.h    | 199 +++++++++++++++---
 .../RNN/TestRecurrentBackpropagationCpu.cxx   |  11 +-
 11 files changed, 263 insertions(+), 60 deletions(-)
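
Note on the fix: TCpu::TransposeMultiply and TCpu::SumColumns gain alpha/beta
parameters that are passed to the underlying BLAS calls, so the recurrent
backward pass can accumulate the weight and bias gradients over the time steps
(beta = 1) instead of copying into a temporary and calling ScaleAdd. Below is a
minimal sketch of that accumulate semantics, C = alpha * A^T * B + beta * C;
the Matrix alias and the helper name transposeMultiplyAccumulate are purely
illustrative, not the TMVA types:

    #include <cstddef>
    #include <vector>

    using Matrix = std::vector<std::vector<double>>; // illustrative row-major matrix

    // C = alpha * A^T * B + beta * C. With beta = 1 each time step's gradient
    // contribution is added on top of the previous ones, as BPTT requires.
    void transposeMultiplyAccumulate(Matrix &C, const Matrix &A, const Matrix &B,
                                     double alpha = 1.0, double beta = 0.0)
    {
       const std::size_t k = A.size();      // rows of A (batch size)
       const std::size_t m = A[0].size();   // columns of A -> rows of C
       const std::size_t n = B[0].size();   // columns of B -> columns of C
       for (std::size_t i = 0; i < m; ++i) {
          for (std::size_t j = 0; j < n; ++j) {
             double sum = 0.0;
             for (std::size_t l = 0; l < k; ++l) sum += A[l][i] * B[l][j];
             C[i][j] = alpha * sum + beta * C[i][j];
          }
       }
    }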

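Note on the test: the backprop tests now compare the analytic gradients against
a numerical derivative computed with ROOT::Math::RichardsonDerivator applied to
a Functor1D that evaluates the net loss as a function of a single weight or
bias (evaluate_net_weight / evaluate_net_bias). A small sketch of that check,
assuming a caller-supplied callable lossAt(x); the helper name
gradientCheckError is hypothetical:

    #include "Math/Functor.h"
    #include "Math/RichardsonDerivator.h"
    #include <cmath>
    #include <functional>

    // Relative error of the analytic gradient w.r.t. a numerical derivative of
    // lossAt at x0 (absolute error if the reference gradient is ~0).
    double gradientCheckError(double analytic,
                              const std::function<double(double)> &lossAt, double x0)
    {
       ROOT::Math::Functor1D func(lossAt);
       ROOT::Math::RichardsonDerivator deriv;
       const double numeric = deriv.Derivative1(func, x0, 1.E-5); // same step as in the test
       return (std::fabs(analytic) > 1e-15) ? std::fabs((numeric - analytic) / analytic)
                                            : std::fabs(numeric - analytic);
    }
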
diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h
index 5bc872b9cc2..116946b1ba2 100644
--- a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h
+++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h
@@ -430,7 +430,8 @@ public:
     */
    static void TransposeMultiply(TCpuMatrix<Scalar_t> &output,
                                  const TCpuMatrix<Scalar_t> &input,
-                                 const TCpuMatrix<Scalar_t> &Weights);
+                                 const TCpuMatrix<Scalar_t> &Weights,
+                                 Scalar_t alpha = 1.0, Scalar_t beta = 0.);
    /** In-place Hadamard (element-wise) product of matrices \p A and \p B
     *  with the result being written into \p A.
     */
@@ -441,7 +442,8 @@ public:
     * m elements in \p A.
     */
    static void SumColumns(TCpuMatrix<Scalar_t> &B,
-                          const TCpuMatrix<Scalar_t> &A);
+                          const TCpuMatrix<Scalar_t> &A,
+                          Scalar_t alpha = 1.0, Scalar_t beta = 0.);
 
    /** Compute the sum of all elements in \p A */
    static Scalar_t Sum(const TCpuMatrix<Scalar_t> &A);
diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuMatrix.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuMatrix.h
index 069ccfd7bcd..831655d9c68 100644
--- a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuMatrix.h
+++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuMatrix.h
@@ -25,7 +25,7 @@
 #include "CpuBuffer.h"
 #include <TMVA/Config.h>
 
-//#define DEBUG_TMVA_TCPUMATRIX
+#define DEBUG_TMVA_TCPUMATRIX
 #if defined(DEBUG_TMVA_TCPUMATRIX)
 #define PrintMatrix(mat, text)                                                                             \
    {                                                                                                       \
diff --git a/tmva/tmva/inc/TMVA/DNN/RNN/RNNLayer.h b/tmva/tmva/inc/TMVA/DNN/RNN/RNNLayer.h
index cc5757563f7..6cbbae05ea2 100644
--- a/tmva/tmva/inc/TMVA/DNN/RNN/RNNLayer.h
+++ b/tmva/tmva/inc/TMVA/DNN/RNN/RNNLayer.h
@@ -298,7 +298,8 @@ auto inline TBasicRNNLayer<Architecture_t>::Backward(Tensor_t &gradients_backwar
    // gradients_backward is activationGradients of layer before it, which is input layer
    // currently gradient_backward is for input(x) and not for state
    // TODO use this to change initial state??
-   
+
+
   bool dummy = false;
   if (gradients_backward.size() == 0 || gradients_backward[0].GetNrows() == 0 || gradients_backward[0].GetNcols() == 0) {
      dummy = true;
@@ -313,7 +314,7 @@ auto inline TBasicRNNLayer<Architecture_t>::Backward(Tensor_t &gradients_backwar
   Tensor_t arr_activations_backward;
   for (size_t t = 0; t < fTimeSteps; ++t) arr_activations_backward.emplace_back(this->GetBatchSize(), this->GetInputSize());  // T x B x D
   Architecture_t::Rearrange(arr_activations_backward, activations_backward);
-   
+
    Matrix_t state_gradients_backward(this->GetBatchSize(), fStateSize);  // B x H
    DNN::initialize<Architecture_t>(state_gradients_backward,  DNN::EInitialization::kZero);
 
diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/Arithmetic.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/Arithmetic.cxx
index b04e3f236c0..07aff00f6ef 100644
--- a/tmva/tmva/src/DNN/Architectures/Cpu/Arithmetic.cxx
+++ b/tmva/tmva/src/DNN/Architectures/Cpu/Arithmetic.cxx
@@ -55,7 +55,8 @@ void TCpu<Real_t>::Multiply(TCpuMatrix<Real_t> &C,
 template<typename Real_t>
 void TCpu<Real_t>::TransposeMultiply(TCpuMatrix<Real_t> &C,
                                      const TCpuMatrix<Real_t> &A,
-                                     const TCpuMatrix<Real_t> &B)
+                                     const TCpuMatrix<Real_t> &B,
+                                     Real_t alpha, Real_t beta)
 {
     int m = (int) A.GetNcols();
     int k = (int) A.GetNrows();
@@ -68,8 +69,8 @@ void TCpu<Real_t>::TransposeMultiply(TCpuMatrix<Real_t> &C,
     char transa = 'T';
     char transb = 'N';
 
-    Real_t alpha = 1.0;
-    Real_t beta  = 0.0;
+    //Real_t alpha = 1.0;
+    //Real_t beta  = 0.0;
 
     const Real_t *APointer = A.GetRawDataPointer();
     const Real_t *BPointer = B.GetRawDataPointer();
@@ -112,14 +113,15 @@ void TCpu<Real_t>::Hadamard(TCpuMatrix<Real_t> &B,
 //____________________________________________________________________________
 template<typename Real_t>
 void TCpu<Real_t>::SumColumns(TCpuMatrix<Real_t> &B,
-                                           const TCpuMatrix<Real_t> &A)
+                              const TCpuMatrix<Real_t> &A,
+                              Real_t alpha, Real_t beta)
 {
    int m = (int) A.GetNrows();
    int n = (int) A.GetNcols();
    int inc = 1;
 
-   Real_t alpha = 1.0;
-   Real_t beta  = 0.0;
+   // Real_t alpha = 1.0;
+   //Real_t beta  = 0.0;
    char   trans   = 'T';
 
    const Real_t * APointer = A.GetRawDataPointer();
diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/RecurrentPropagation.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/RecurrentPropagation.cxx
index 4f7311d570a..ab43f052416 100644
--- a/tmva/tmva/src/DNN/Architectures/Cpu/RecurrentPropagation.cxx
+++ b/tmva/tmva/src/DNN/Architectures/Cpu/RecurrentPropagation.cxx
@@ -37,6 +37,14 @@ auto TCpu<AFloat>::RecurrentLayerBackward(TCpuMatrix<AFloat> & state_gradients_b
                                           TCpuMatrix<AFloat> & input_gradient)
 -> TCpuMatrix<AFloat> &
 {
+
+   // std::cout << "Recurrent Propo" << std::endl;
+   // PrintMatrix(df,"DF");
+   // PrintMatrix(state_gradients_backward,"State grad");
+   // PrintMatrix(input_weight_gradients,"input w grad");
+   // PrintMatrix(state,"state");
+   // PrintMatrix(input,"input");
+   
    // Compute element-wise product.
    Hadamard(df, state_gradients_backward);  // B x H 
    
@@ -48,18 +56,30 @@ auto TCpu<AFloat>::RecurrentLayerBackward(TCpuMatrix<AFloat> & state_gradients_b
 
    // Weights gradients
    if (input_weight_gradients.GetNElements() > 0) {
-      TCpuMatrix<AFloat> tmp(input_weight_gradients);
-      TransposeMultiply(input_weight_gradients, df, input); // H x B . B x D
-      ScaleAdd(input_weight_gradients, tmp, 1);
+      //TCpuMatrix<AFloat> tmp(input_weight_gradients);
+      TransposeMultiply(input_weight_gradients, df, input, 1. , 1.); // H x B . B x D
+      //ScaleAdd(input_weight_gradients, tmp, 1);
    }
    if (state_weight_gradients.GetNElements() > 0) {
-      TCpuMatrix<AFloat> tmp(state_weight_gradients);
-      TransposeMultiply(state_weight_gradients, df, state); // H x B . B x H
-      ScaleAdd(state_weight_gradients, tmp, 1);
+      //TCpuMatrix<AFloat> tmp(state_weight_gradients);
+      TransposeMultiply(state_weight_gradients, df, state, 1. , 1. ); // H x B . B x H
+      //ScaleAdd(state_weight_gradients, tmp, 1);
    }
 
    // Bias gradients.
-   if (bias_gradients.GetNElements() > 0) SumColumns(bias_gradients, df);
+   if (bias_gradients.GetNElements() > 0) {
+      //TCpuMatrix<AFloat> tmp(bias_gradients);
+      SumColumns(bias_gradients, df, 1., 1.);  // we could probably do it all here
+      //ScaleAdd(bias_gradients, tmp, 1);
+   }
+
+   //std::cout << "RecurrentPropo: end " << std::endl;
+
+   // PrintMatrix(state_gradients_backward,"State grad");
+   // PrintMatrix(input_weight_gradients,"input w grad");
+   // PrintMatrix(bias_gradients,"bias grad");
+   // PrintMatrix(input_gradient,"input grad");
+
    return input_gradient;
 }
 
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/RecurrentPropagation.cxx b/tmva/tmva/src/DNN/Architectures/Reference/RecurrentPropagation.cxx
index 2841df44330..c89d1455d76 100644
--- a/tmva/tmva/src/DNN/Architectures/Reference/RecurrentPropagation.cxx
+++ b/tmva/tmva/src/DNN/Architectures/Reference/RecurrentPropagation.cxx
@@ -35,6 +35,19 @@ auto TReference<Scalar_t>::RecurrentLayerBackward(TMatrixT<Scalar_t> & state_gra
                                                   TMatrixT<Scalar_t> & input_gradient)
 -> Matrix_t &
 {
+
+   // std::cout << "Reference Recurrent Propo" << std::endl;
+   // std::cout << "df\n";
+   // df.Print();
+   // std::cout << "state gradient\n";
+   // state_gradients_backward.Print();
+   // std::cout << "inputw gradient\n";
+   // input_weight_gradients.Print(); 
+   // std::cout << "state\n";
+   // state.Print();
+   // std::cout << "input\n";
+   // input.Print();
+   
    // Compute element-wise product.
    for (size_t i = 0; i < (size_t) df.GetNrows(); i++) {
       for (size_t j = 0; j < (size_t) df.GetNcols(); j++) {
@@ -48,7 +61,7 @@ auto TReference<Scalar_t>::RecurrentLayerBackward(TMatrixT<Scalar_t> & state_gra
    }
    // State gradients
    if (state_gradients_backward.GetNoElements() > 0) {
-      state_gradients_backward.MultT(df, weights_state);  // B x H . H x H = B x H
+      state_gradients_backward.Mult(df, weights_state);  // B x H . H x H = B x H
    }
    
    // Weights gradients.
@@ -65,14 +78,29 @@ auto TReference<Scalar_t>::RecurrentLayerBackward(TMatrixT<Scalar_t> & state_gra
    
    // Bias gradients. B x H -> H x 1
    if (bias_gradients.GetNoElements() > 0) {
+      // loop over the state size
       for (size_t j = 0; j < (size_t) df.GetNcols(); j++) {
          Scalar_t sum = 0.0;
+         // loop over the batch size, summing all gradient contributions in the batch
          for (size_t i = 0; i < (size_t) df.GetNrows(); i++) {
             sum += df(i,j);
          }
-         bias_gradients(j,0) = sum;
+         bias_gradients(j,0) += sum;
       }
    }
+
+   // std::cout << "RecurrentPropo: end " << std::endl;
+
+   // std::cout << "state gradient\n";
+   // state_gradients_backward.Print();
+   // std::cout << "inputw gradient\n";
+   // input_weight_gradients.Print(); 
+   // std::cout << "bias gradient\n";
+   // bias_gradients.Print(); 
+   // std::cout << "input gradient\n";
+   // input_gradient.Print(); 
+
+   
    return input_gradient;
 }
 
diff --git a/tmva/tmva/src/MethodDL.cxx b/tmva/tmva/src/MethodDL.cxx
index 609fdd7d577..1acce247c70 100644
--- a/tmva/tmva/src/MethodDL.cxx
+++ b/tmva/tmva/src/MethodDL.cxx
@@ -471,8 +471,7 @@ void MethodDL::CreateDeepNet(DNN::TDeepNet<Architecture_t, Layer_t> &deepNet,
       } else if (strLayerType == "RESHAPE") {
          ParseReshapeLayer(deepNet, nets, layerString->GetString(), subDelimiter);
       } else if (strLayerType == "RNN") {
-          Log() << kFATAL << "RNN Layer is not yet fully implemented" << Endl;
-         //ParseRnnLayer(deepNet, nets, layerString->GetString(), subDelimiter);
+         ParseRnnLayer(deepNet, nets, layerString->GetString(), subDelimiter);
       } else if (strLayerType == "LSTM") {
          Log() << kFATAL << "LSTM Layer is not yet fully implemented" << Endl;
          //ParseLstmLayer(deepNet, nets, layerString->GetString(), subDelimiter);
diff --git a/tmva/tmva/test/DNN/RNN/CMakeLists.txt b/tmva/tmva/test/DNN/RNN/CMakeLists.txt
index 14bfbf4dadd..5bb1d3da02c 100644
--- a/tmva/tmva/test/DNN/RNN/CMakeLists.txt
+++ b/tmva/tmva/test/DNN/RNN/CMakeLists.txt
@@ -48,4 +48,7 @@ if (BLAS_FOUND AND imt)
   ROOT_EXECUTABLE(testForwardPassCpu TestForwardPassCpu.cxx LIBRARIES ${Libraries})
   ROOT_ADD_TEST(TMVA-DNN-RNN-Forward-Cpu COMMAND testForwardPassCpu)
 
+  ROOT_EXECUTABLE(testRecurrentBackpropagationCpu TestRecurrentBackpropagationCpu.cxx LIBRARIES ${Libraries})
+  ROOT_ADD_TEST(TMVA-DNN-RNN-BackpropagationCpu COMMAND testRecurrentBackpropagationCpu)
+
 endif (BLAS_FOUND AND imt)
diff --git a/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagation.cxx b/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagation.cxx
index 168f5c37844..bc8b6fef7e6 100644
--- a/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagation.cxx
+++ b/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagation.cxx
@@ -24,11 +24,19 @@ int main() {
    std::cout << "Testing RNN backward pass\n";
 
    // timesteps, batchsize, statesize, inputsize
-   testRecurrentBackpropagationWeights<TReference<double>>(2, 2, 1, 2, 1e-5);
+   testRecurrentBackpropagation<TReference<double>>(1, 2, 1, 10, 1e-5);
 
-   testRecurrentBackpropagationBiases<TReference<double>>(1, 2, 3, 2, 1e-5); 
+   
+   testRecurrentBackpropagation<TReference<double>>(2, 2, 1, 10, 1e-5);
 
-   testRecurrentBackpropagationWeights<TReference<double>>(2, 3, 4, 5, 1e-5);
+   testRecurrentBackpropagation<TReference<double>>(1, 2, 2, 10, 1e-5);
+
+
+   testRecurrentBackpropagation<TReference<double>>(2, 1, 2, 5, 1e-10);
+
+   testRecurrentBackpropagation<TReference<double>>(4, 2, 3, 10, 1e-10);
+   // using a fixed input 
+   testRecurrentBackpropagation<TReference<double>>(3, 1, 4, 5, 1e-10, false);
 
    return 0;
 }
diff --git a/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagation.h b/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagation.h
index 964e621a442..f8b83a11bd0 100644
--- a/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagation.h
+++ b/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagation.h
@@ -20,6 +20,8 @@
 #include <vector>
 
 #include "../Utility.h"
+#include "Math/Functor.h"
+#include "Math/RichardsonDerivator.h"
 
 #include "TMVA/DNN/Functions.h"
 #include "TMVA/DNN/DeepNet.h"
@@ -63,14 +65,15 @@ auto printTensor(const typename Architecture::Matrix_t &A, const std::string nam
 template <typename Architecture>
 auto evaluate_net_weight(TDeepNet<Architecture> &net, std::vector<typename Architecture::Matrix_t> & X,
                          const typename Architecture::Matrix_t &Y, const typename Architecture::Matrix_t &W, size_t l,
-                         size_t k, size_t i, size_t j, typename Architecture::Scalar_t dx) ->
+                         size_t k, size_t i, size_t j, typename Architecture::Scalar_t xvalue) ->
    typename Architecture::Scalar_t
 {
     using Scalar_t = typename Architecture::Scalar_t;
 
-    net.GetLayerAt(l)->GetWeightsAt(k).operator()(i,j) += dx;
+    Scalar_t prev_value = net.GetLayerAt(l)->GetWeightsAt(k).operator()(i,j);
+    net.GetLayerAt(l)->GetWeightsAt(k).operator()(i,j) = xvalue;
     Scalar_t res = net.Loss(X, Y, W, false, false);
-    net.GetLayerAt(l)->GetWeightsAt(k).operator()(i,j) -= dx;
+    net.GetLayerAt(l)->GetWeightsAt(k).operator()(i,j) = prev_value;
     return res;
 }
 
@@ -80,60 +83,127 @@ auto evaluate_net_weight(TDeepNet<Architecture> &net, std::vector<typename Archi
 template <typename Architecture>
 auto evaluate_net_bias(TDeepNet<Architecture> &net, std::vector<typename Architecture::Matrix_t> & X,
                        const typename Architecture::Matrix_t &Y, const typename Architecture::Matrix_t &W, size_t l,
-                       size_t k, size_t i, typename Architecture::Scalar_t dx) -> typename Architecture::Scalar_t
+                       size_t k, size_t i, typename Architecture::Scalar_t xvalue) -> typename Architecture::Scalar_t
 {
     using Scalar_t = typename Architecture::Scalar_t;
-
-    net.GetLayerAt(l)->GetBiasesAt(k).operator()(i,0) += dx;
+ 
+    Scalar_t prev_value = net.GetLayerAt(l)->GetBiasesAt(k).operator()(i,0);
+    net.GetLayerAt(l)->GetBiasesAt(k).operator()(i,0) = xvalue;
     Scalar_t res = net.Loss(X, Y, W, false, false);
-    net.GetLayerAt(l)->GetBiasesAt(k).operator()(i,0) -= dx;
+    net.GetLayerAt(l)->GetBiasesAt(k).operator()(i,0) = prev_value; 
     return res;
 }
 
 /*! Generate a DeepNet, test backward pass */
 //______________________________________________________________________________
 template <typename Architecture>
-auto testRecurrentBackpropagationWeights(size_t timeSteps, size_t batchSize, size_t stateSize, 
-                                         size_t inputSize, typename Architecture::Scalar_t dx)
+auto testRecurrentBackpropagation(size_t timeSteps, size_t batchSize, size_t stateSize, 
+                                  size_t inputSize, typename Architecture::Scalar_t dx = 1.E-5, bool randomInput = true)
 -> Double_t
 {
+   bool debug = false; 
+   
    using Matrix_t   = typename Architecture::Matrix_t;
    using Tensor_t   = std::vector<Matrix_t>;
    using RNNLayer_t = TBasicRNNLayer<Architecture>; 
    using Net_t      = TDeepNet<Architecture>;
    using Scalar_t = typename Architecture::Scalar_t;
 
-   std::vector<TMatrixT<Double_t>> XRef(batchSize, TMatrixT<Double_t>(timeSteps, inputSize));    // B x T x D
-   Tensor_t XArch;
-   //for (size_t i = 0; i < batchSize; ++i) XArch.emplace_back(timeSteps, inputSize); // B x T x D
+   //std::vector<Matrix_t<Double_t>> XRef(batchSize, Matrix_t<Double_t>(timeSteps, inputSize));    // B x T x D
+   Tensor_t XArch;  // B x T x D
    for (size_t i = 0; i < batchSize; ++i) {
-      //randomMatrix(XRef[i]);
-      for (size_t l = 0; l < XRef[i].GetNrows(); ++l) {
-        for (size_t m = 0; m < XRef[i].GetNcols(); ++m) {
-          XRef[i](l, m) = i + l + m;
-        }
-      } 
-      XArch.emplace_back(XRef[i]);
+      XArch.emplace_back(timeSteps, inputSize);
    }
 
+   // for random input (default) 
+   if (randomInput) { 
+   for (size_t i = 0; i < batchSize; ++i) {
+         for (size_t l = 0; l < XArch[i].GetNrows(); ++l) {
+            for (size_t m = 0; m < XArch[i].GetNcols(); ++m) {
+               //XArch[i](l, m) = i + l + m;
+               XArch[i](l, m) = gRandom->Uniform(-1,1);
+            }
+         } 
+      }
+   }
+   else { 
+      debug = true; 
+      R__ASSERT(inputSize <= 6); 
+      R__ASSERT(timeSteps <= 3); 
+      R__ASSERT(batchSize <= 1); 
+      double x_input[] = {-1,   1,  -2,  2, -3,  3 ,
+                          -0.5, 0.5,-0.8,0.9, -2, 1.5,
+                          -0.2, 0.1,-0.5,0.4, -1, 1.};
+
+      TMatrixD Input(3,6,x_input);
+      for (size_t i = 0; i < batchSize; ++i) {
+         auto & mat = XArch[i];
+         // copy the fixed input for all time steps
+         for (int l = 0; l < timeSteps; ++l) {
+            for (int m = 0; m < inputSize; ++m) {
+               mat(l,m) = Input(l,m);
+            }
+         }
+      }
+      gRandom->SetSeed(1); // for weight initialization
+   }
+   printTensor<Architecture>(XArch,"input"); 
+   
    Matrix_t Y(batchSize, timeSteps * stateSize), weights(batchSize, 1);
    //randomMatrix(Y);
    for (size_t i = 0; i < Y.GetNrows(); ++i) {
      for (size_t j = 0; j < Y.GetNcols(); ++j) {
-       Y(i, j) = (i + j)/2.0 - 0.75;
+        Y(i, j) = 1;// (i + j)/2.0 - 0.75;
      }
    }
    fillMatrix(weights, 1.0);
 
+   std::cout << "Testing Weight Backprop using RNN with batchsize = " << batchSize << " input = " << inputSize << " state = " << stateSize << " time = " << timeSteps << std::endl;
    Net_t rnn(batchSize, batchSize, timeSteps, inputSize, 0, 0, 0, ELossFunction::kMeanSquaredError, EInitialization::kGauss);
    RNNLayer_t* layer = rnn.AddBasicRNNLayer(stateSize, inputSize, timeSteps);
+   //layer->Print(); 
    rnn.AddReshapeLayer(1, timeSteps, stateSize, true);
 
    rnn.Initialize();
+
+
+   auto & wi = layer->GetWeights()[0];
+   for (int i = 0; i < stateSize; ++i) { 
+      for (int j = 0; j < inputSize; ++j) { 
+         wi(i,j) =  gRandom->Uniform(-1,1);
+      }
+         wi(i,i) = 1.; 
+   }
+   
+   auto & wh = layer->GetWeights()[1];
+   for (int i = 0; i < stateSize; ++i) { 
+      for (int j = 0; j < stateSize; ++j) { 
+         wh(i,j) = gRandom->Uniform(-1,1);
+      }
+         wh(i,i) = 0.5; 
+   }
+   auto & b = layer->GetBiases()[0];
+   for (int i = 0; i < b.GetNrows(); ++i) { 
+      for (int j = 0; j < b.GetNcols(); ++j) { 
+         b(i,j) = gRandom->Uniform(-0.5,0.5);
+      }
+   }
+
+   
    rnn.Forward(XArch);
    rnn.Backward(XArch, Y, weights);
-  
+
+   if (debug)  {
+      auto & out = layer->GetOutput();
+      printTensor<Architecture>(out,"output"); 
+   }
+
+   
    Scalar_t maximum_error = 0.0;
+   std::string maxerrorType; 
+
+   ROOT::Math::RichardsonDerivator deriv;
+   
 
    // Weights Input, k = 0
    auto &Wi = layer->GetWeightsAt(0);
@@ -143,26 +213,37 @@ auto testRecurrentBackpropagationWeights(size_t timeSteps, size_t batchSize, siz
          auto f = [&rnn, &XArch, &Y, &weights, i, j](Scalar_t x) {
              return evaluate_net_weight(rnn, XArch, Y, weights, 0, 0, i, j, x);
          };
-         Scalar_t dy = finiteDifference(f, dx) / (2.0 * dx);
+         ROOT::Math::Functor1D func(f);
+         double dy = deriv.Derivative1(func, Wi(i,j), 1.E-5);
          Scalar_t dy_ref = dWi(i, j);
 
          // Compute the relative error if dy != 0.
          Scalar_t error;
+         std::string errorType; 
          if (std::fabs(dy_ref) > 1e-15) {
             error = std::fabs((dy - dy_ref) / dy_ref);
+            errorType = "relative";
          } else {
             error = std::fabs(dy - dy_ref);
+            errorType = "absolute";
          }
 
-         maximum_error = std::max(error, maximum_error);
+         if (debug) std::cout << "Weight-input gradient (" << i << "," << j << ") : (comp, ref) " << dy << " , " << dy_ref << std::endl;
+
+         if (error >= maximum_error) {
+            maximum_error = error;
+            maxerrorType = errorType; 
+         }
       }
    }
 
    std::cout << "\rTesting weight input gradients:      ";
-   std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl;
+   std::cout << "maximum error (" << maxerrorType << "): "  << print_error(maximum_error) << std::endl;
 
+   /// testing weight state gradient
+   
    // Weights State, k = 1
-   Scalar_t smaximum_error = 0.0;
+   maximum_error = 0; 
    auto &Ws = layer->GetWeightsAt(1);
    auto &dWs = layer->GetWeightGradientsAt(1);
    for (Int_t i = 0; i < Ws.GetNrows(); ++i) {
@@ -170,25 +251,68 @@ auto testRecurrentBackpropagationWeights(size_t timeSteps, size_t batchSize, siz
          auto f = [&rnn, &XArch, &Y, &weights, i, j](Scalar_t x) {
              return evaluate_net_weight(rnn, XArch, Y, weights, 0, 1, i, j, x);
          };
-         Scalar_t dy = finiteDifference(f, dx) / (2.0 * dx);
+         ROOT::Math::Functor1D func(f);
+         double dy = deriv.Derivative1(func, Ws(i,j), 1.E-5);
          Scalar_t dy_ref = dWs(i, j);
 
          // Compute the relative error if dy != 0.
          Scalar_t error;
+         std::string errorType;  
          if (std::fabs(dy_ref) > 1e-15) {
             error = std::fabs((dy - dy_ref) / dy_ref);
+            errorType = "relative";
          } else {
             error = std::fabs(dy - dy_ref);
+            errorType = "absolute";
          }
 
-         smaximum_error = std::max(error, smaximum_error);
+         if ( error >= maximum_error) {
+            maximum_error = error; 
+            maxerrorType = errorType; 
+         }
+         if (debug) std::cout << "Weight-state gradient (" << i << "," << j << ") : (comp, ref) " << dy << " , " << dy_ref << std::endl;
       }
    }
 
    std::cout << "\rTesting weight state gradients:      ";
-   std::cout << "maximum relative error: " << print_error(smaximum_error) << std::endl;
+   std::cout << "maximum error (" << maxerrorType << "): "  << print_error(maximum_error) << std::endl;
+
+   // testing bias gradients
+   maximum_error = 0; 
+   auto &B = layer->GetBiasesAt(0);
+   auto &dB = layer->GetBiasGradientsAt(0);
+   for (Int_t i = 0; i < B.GetNrows(); ++i) {
+      auto f = [&rnn, &XArch, &Y, &weights, i](Scalar_t x) {
+          return evaluate_net_bias(rnn, XArch, Y, weights, 0, 0, i, x);
+      };
+      ROOT::Math::Functor1D func(f);
+      double dy = deriv.Derivative1(func, B(i,0), 1.E-5);
+      Scalar_t dy_ref = dB(i, 0);
+
+      // Compute the relative error if dy != 0.
+      Scalar_t error;
+      std::string errorType;  
+      if (std::fabs(dy_ref) > 1e-15) {
+         error = std::fabs((dy - dy_ref) / dy_ref);
+         errorType = "relative";
+      } else {
+         error = std::fabs(dy - dy_ref);
+         errorType = "absolute";
+      }
+
+      if ( error >= maximum_error) {
+            maximum_error = error; 
+            maxerrorType = errorType; 
+      }
+      if (debug) std::cout << "Bias gradient (" << i << ") : (comp, ref) " << dy << " , " << dy_ref << std::endl;
+   }
 
-   return std::max(maximum_error, smaximum_error);
+   std::cout << "\rTesting bias gradients:      ";
+   std::cout << "maximum error (" << maxerrorType << "): "  << print_error(maximum_error) << std::endl;
+
+
+   //return std::max(maximum_error, smaximum_error);
+   return maximum_error; 
 }
 
 /*! Generate a DeepNet, test backward pass */
@@ -216,6 +340,8 @@ auto testRecurrentBackpropagationBiases(size_t timeSteps, size_t batchSize, size
    randomMatrix(Y);
    fillMatrix(weights, 1.0);
 
+   std::cout << "Testing Bias Backprop using RNN with batchsize = " << batchSize << " input = " << inputSize << " state = " << stateSize << " time = " << timeSteps << std::endl;
+
    Net_t rnn(batchSize, batchSize, timeSteps, inputSize, 0, 0, 0, ELossFunction::kMeanSquaredError, EInitialization::kGauss);
    RNNLayer_t* layer = rnn.AddBasicRNNLayer(stateSize, inputSize, timeSteps);
    rnn.AddReshapeLayer(1, timeSteps, stateSize, true);
@@ -225,7 +351,8 @@ auto testRecurrentBackpropagationBiases(size_t timeSteps, size_t batchSize, size
    rnn.Backward(XArch, Y, weights);
   
    Scalar_t maximum_error = 0.0;
-
+   std::string merrorType;  
+      
    auto &B = layer->GetBiasesAt(0);
    auto &dB = layer->GetBiasGradientsAt(0);
    for (Int_t i = 0; i < B.GetNrows(); ++i) {
@@ -237,17 +364,25 @@ auto testRecurrentBackpropagationBiases(size_t timeSteps, size_t batchSize, size
 
       // Compute the relative error if dy != 0.
       Scalar_t error;
+      std::string errorType;  
       if (std::fabs(dy_ref) > 1e-15) {
          error = std::fabs((dy - dy_ref) / dy_ref);
+         errorType = "relative";
       } else {
          error = std::fabs(dy - dy_ref);
+         errorType = "absolute";
       }
 
-      maximum_error = std::max(error, maximum_error);
+      if ( error >= maximum_error) {
+            maximum_error = error; 
+            merrorType = errorType; 
+      }
+      std::cout << "Bias gradient (" << i << ") : (comp, ref) " << dy << " , " << dy_ref << std::endl;
+      //maximum_error = std::max(error, maximum_error);
    }
 
    std::cout << "\rTesting bias gradients:      ";
-   std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl;
+   std::cout << "maximum error (" << merrorType << "): "  << print_error(maximum_error) << std::endl;
 
    return maximum_error;
 }
diff --git a/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagationCpu.cxx b/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagationCpu.cxx
index 776f7b60d9e..a73c1df39cb 100644
--- a/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagationCpu.cxx
+++ b/tmva/tmva/test/DNN/RNN/TestRecurrentBackpropagationCpu.cxx
@@ -25,11 +25,16 @@ int main() {
    using Scalar_t = Double_t;
 
    // timesteps, batchsize, statesize, inputsize
-   testRecurrentBackpropagationWeights<TCpu<Scalar_t>>(1, 2, 1, 2, 1e-5);
+   testRecurrentBackpropagation<TCpu<Scalar_t>>(1, 2, 1, 2, 1e-5);
 
-   testRecurrentBackpropagationBiases<TCpu<Scalar_t>>(1, 2, 3, 2, 1e-5); 
+   testRecurrentBackpropagation<TCpu<Scalar_t>>(1, 2, 3, 2, 1e-5); 
+
+   testRecurrentBackpropagation<TCpu<Scalar_t>>(2, 3, 4, 5, 1e-5);
+
+
+   // using a fixed input 
+   testRecurrentBackpropagation<TCpu<Scalar_t>>(3, 1, 4, 5, 1e-10, false);
 
-   testRecurrentBackpropagationWeights<TCpu<Scalar_t>>(2, 3, 4, 5, 1e-5);
 
    return 0;
 }
-- 
GitLab