diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f5b35c541e049213f12f1d7a218e53ff642dd7f9..ccabaf87954af8a042366859112f04f6fdfa98f9 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -145,6 +145,16 @@ ROOT_ADD_TEST(test-stressvector-interpreted COMMAND ${ROOT_root_CMD} -b -q -l ${
               FAILREGEX "FAILED|Error in" DEPENDS test-stressvector)
 
 #--stressTMVA--------------------------------------------------------------------------------------
+FIND_PACKAGE(CUDA)
+if (CUDA_FOUND)
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDNNCUDA")
+endif (CUDA_FOUND)
+
+FIND_PACKAGE(BLAS)
+if (BLAS_FOUND AND imt)
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDNNCPU")
+endif (BLAS_FOUND AND imt)
+
 if(ROOT_tmva_FOUND)
   ROOT_EXECUTABLE(stressTMVA stressTMVA.cxx LIBRARIES TMVA)
   ROOT_ADD_TEST(test-stresstmva COMMAND stressTMVA -b)
diff --git a/test/stressTMVA.cxx b/test/stressTMVA.cxx
index 262b11c6052755c81724d0ff652c0e0f50b62543..5ab1a98e8b243588eb3548624c01408b56d6542e 100644
--- a/test/stressTMVA.cxx
+++ b/test/stressTMVA.cxx
@@ -2921,6 +2921,36 @@ void addClassificationTests( UnitTestSuite& TMVA_test, bool full=true)
                                                                 "!H:!V:NTrees=400:nEventsMin=200:MaxDepth=3:BoostType=AdaBoost:SeparationType=GiniIndex:nCuts=10:PruneMethod=NoPruning:VarTransform=Decorrelate" , 0.88, 0.98) );
    if (full) TMVA_test.addTest(new MethodUnitTestWithROCLimits( TMVA::Types::kRuleFit, "RuleFit",
                                                                 "H:!V:RuleFitModule=RFTMVA:Model=ModRuleLinear:MinImp=0.001:RuleMinDist=0.001:NTrees=20:fEventsMin=0.01:fEventsMax=0.5:GDTau=-1.0:GDTauPrec=0.01:GDStep=0.01:GDNSteps=10000:GDErrScale=1.02" , 0.88, 0.98) );
+
+   TString config = "!H:V:VarTransform=N:ErrorStrategy=CROSSENTROPY"
+      ":WeightInitialization=XAVIER"
+      ":Layout=LINEAR|64,LINEAR|64,LINEAR|64,LINEAR"
+      ":TrainingStrategy=LearningRate=0.1,Momentum=0.9, ConvergenceSteps=20,"
+      "BatchSize=256,Regularization=None,TestRepetitions=5, Multithreading=True"
+      "|LearningRate=0.01,Momentum=0.5,ConvergenceSteps=20,BatchSize=256,"
+      "Regularization=None,TestRepetitions=5, Multithreading=True"
+      "|LearningRate=0.003,Momentum=0.5,ConvergenceSteps=20,BatchSize=256,"
+      "Regularization=None,TestRepetitions=5, Multithreading=True"
+      "|LearningRate=0.001,Momentum=0.0,ConvergenceSteps=20,BatchSize=256,"
+      "Regularization=None,TestRepetitions=5, Multithreading=True";
+   TString configStandard = "Architecture=STANDARD:" + config;
+   TString configCpu      = "Architecture=CPU:" + config;
+   TString configGpu      = "Architecture=GPU:" + config;
+
+
+   TMVA_test.addTest(new MethodUnitTestWithROCLimits(
+                         TMVA::Types::kDNN, "DNN Standard",
+                         configStandard, 0.85, 0.98));
+   #ifdef DNNCPU
+   TMVA_test.addTest(new MethodUnitTestWithROCLimits(
+                         TMVA::Types::kDNN, "DNN CPU", configCpu, 0.85, 0.98)
+                     );
+   #endif
+   #ifdef DNNCUDA
+   TMVA_test.addTest(new MethodUnitTestWithROCLimits(
+                         TMVA::Types::kDNN, "DNN GPU", configGpu, 0.85, 0.98)
+                     );
+   #endif
 }
 
 void addRegressionTests( UnitTestSuite& TMVA_test, bool full=true)
diff --git a/tmva/tmva/CMakeLists.txt b/tmva/tmva/CMakeLists.txt
index 06966edd91990c597da7d3fbbea8a726a26b20e4..0c5637f01ab65bacfde24482c8e5df15bc34eef2 100644
--- a/tmva/tmva/CMakeLists.txt
+++ b/tmva/tmva/CMakeLists.txt
@@ -36,8 +36,10 @@ set(headers4 TNeuron.h TSynapse.h TActivationChooser.h TActivation.h TActivation
 	     VariableTransformBase.h VariableIdentityTransform.h VariableDecorrTransform.h VariablePCATransform.h 
 	     VariableGaussTransform.h VariableNormalizeTransform.h VariableRearrangeTransform.h VariableTransform.h ROCCalc.h ROCCurve.h)
 
-set(headers5 Event.h Results.h ResultsClassification.h ResultsRegression.h ResultsMulticlass.h VariableInfo.h ClassInfo.h
-             DataLoader.h DataSet.h DataSetInfo.h DataInputHandler.h DataSetManager.h DataSetFactory.h)
+set(headers5 Event.h Results.h ResultsClassification.h ResultsRegression.h
+    ResultsMulticlass.h VariableInfo.h ClassInfo.h DataLoader.h DataSet.h
+    DataSetInfo.h DataInputHandler.h DataSetManager.h DataSetFactory.h)
+
 
 #---Need to suffix each header name by TMVA/  -----------------
 foreach(hs headers1 headers2 headers3 headers4 headers5)
@@ -46,9 +48,40 @@ foreach(hs headers1 headers2 headers3 headers4 headers5)
   endforeach()
 endforeach()
 
+SET(DNN_FILES      src/DNN/Architectures/Reference.cxx)
+SET(DNN_CUDA_FILES src/DNN/Architectures/Cuda.cu
+                   src/DNN/Architectures/Cuda/CudaBuffers.cxx
+                   src/DNN/Architectures/Cuda/CudaMatrix.cu)
+SET(DNN_CPU_FILES  src/DNN/Architectures/Cpu.cxx
+                   src/DNN/Architectures/Cpu/CpuBuffer.cxx
+                   src/DNN/Architectures/Cpu/CpuMatrix.cxx)
+
+#---Handle CUDA dependent code. -----------------
+FIND_PACKAGE(CUDA)
+IF   (CUDA_FOUND)
+  CUDA_INCLUDE_DIRECTORIES(${ROOT_INCLUDE_DIRS})
+  CUDA_ADD_LIBRARY(dnn_cuda ${DNN_CUDA_FILES})
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDNNCUDA")
+  SET(DNN_CUDA_LIBRARIES dnn_cuda ${CUDA_CUBLAS_LIBRARIES})
+ELSE (CUDA_FOUND)
+  SET(DNN_CUDA_LIBRARIES)
+ENDIF(CUDA_FOUND)
+
+#---Handle BLAS dependent code. -----------------
+FIND_PACKAGE(BLAS)
+IF (BLAS_FOUND AND imt)
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDNNCPU")
+  SET(DNN_CPU_LIBRARIES MathCore Matrix ${BLAS_LIBRARIES} tbb
+      ${CMAKE_THREAD_LIBS_INIT})
+ELSE (BLAS_FOUND AND imt)
+  SET(DNN_CPU_LIBRARIES)
+  SET(DNN_CPU_FILES)
+ENDIF(BLAS_FOUND AND imt)
+
 ROOT_GENERATE_DICTIONARY(G__TMVA ${theaders1} ${theaders2} ${theaders3} ${theaders4} ${theaders5}  MODULE TMVA LINKDEF LinkDef.h OPTIONS "-writeEmptyRootPCM")
 
-ROOT_LINKER_LIBRARY(TMVA *.cxx G__TMVA.cxx LIBRARIES Core
+ROOT_LINKER_LIBRARY(TMVA *.cxx G__TMVA.cxx ${DNN_FILES} ${DNN_CPU_FILES}
+                    LIBRARIES Core ${DNN_CUDA_LIBRARIES} ${DNN_CPU_LIBRARIES}
                     DEPENDENCIES RIO Hist Tree TreePlayer MLP Minuit XMLIO)
 
 install(DIRECTORY inc/TMVA/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/TMVA
@@ -66,7 +99,7 @@ if(NOT gnuinstall)
                  PATTERN "data" EXCLUDE)
 endif()
 
-#ROOT_ADD_TEST_SUBDIRECTORY(test)
+ROOT_ADD_TEST_SUBDIRECTORY(test/DNN)
 
 
 
diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb3217aadac03a2eb88e6a5d9a58416fa0c52043
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h
@@ -0,0 +1,287 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 05/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////
+// Definition of the TCpu architecture class, which provides a  //
+// multi-threaded CPU implementation of the low-level interface //
+// for neural networks using tbb.                               //
+//////////////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_ARCHITECTURES_CPU
+#define TMVA_DNN_ARCHITECTURES_CPU
+
+#include "Cpu/CpuBuffer.h"
+#include "Cpu/CpuMatrix.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+/** The TCpu architecture class.
+ *
+ * Low-level interface class for multi-threaded CPU architectures. Contains as
+ * public types the declaration of the scalar, matrix and data loader types
+ * for this architecture as well as the remaining functions in the low-level
+ * interface in the form of static members.
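+ *
+ * A minimal usage sketch (types and sizes chosen for illustration only):
+ * \code
+ * using Architecture_t = TCpu<Double_t>;
+ * Architecture_t::Matrix_t A(4, 8), B(8, 16), C(4, 16);
+ * Architecture_t::Multiply(C, A, B);   // C = A * B
+ * \endcode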
+ */
+template<typename AReal>
+class TCpu
+{
+public:
+
+   using Scalar_t       = AReal;
+   using Matrix_t       = TCpuMatrix<AReal>;
+   using HostBuffer_t   = TCpuBuffer<AReal>;
+   using DeviceBuffer_t = TCpuBuffer<AReal>;
+
+   //____________________________________________________________________________
+   //
+   // Propagation
+   //____________________________________________________________________________
+
+   /** @name Forward Propagation
+    * Low-level functions required for the forward propagation of activations
+    * through the network.
+    */
+   ///@{
+   /** Matrix-multiply \p input with the transpose of \p weights and
+    *  write the results into \p output. */
+   static void MultiplyTranspose(TCpuMatrix<Scalar_t> &output,
+                                 const TCpuMatrix<Scalar_t> &input,
+                                 const TCpuMatrix<Scalar_t> &weights);
+   /** Add the vector \p biases row-wise to the matrix \p output. */
+   static void AddRowWise(TCpuMatrix<Scalar_t> &output,
+                          const TCpuMatrix<Scalar_t> &biases);
+   ///@}
+
+   /** @name Backward Propagation
+    * Low-level functions required for the backward propagation of gradients
+    * through the network.
+    */
+   ///@{
+   /** Perform the complete backward propagation step. If the provided
+    *  \p activationGradientsBackward matrix is not empty, compute the
+    *  gradients of the objective function with respect to the activations
+    *  of the previous layer (backward direction).
+    *  Also compute the weight and the bias gradients. Modifies the values
+    *  in \p df and thus produces a valid result only the first time it is
+    *  called after the corresponding forward propagation has been
+    *  performed. */
+   static void Backward(TCpuMatrix<Scalar_t> & activationGradientsBackward,
+                        TCpuMatrix<Scalar_t> & weightGradients,
+                        TCpuMatrix<Scalar_t> & biasGradients,
+                        TCpuMatrix<Scalar_t> & df,
+                        const TCpuMatrix<Scalar_t> & activationGradients,
+                        const TCpuMatrix<Scalar_t> & weights,
+                        const TCpuMatrix<Scalar_t> & activationBackward);
+   /** Adds the elements in matrix \p B scaled by \p beta to the elements in
+    *  the matrix \p A. This is required for the weight update in the gradient
+    *  descent step. */
+   static void ScaleAdd(TCpuMatrix<Scalar_t> & A,
+                        const TCpuMatrix<Scalar_t> & B,
+                        Scalar_t beta = 1.0);
+
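+   /** Copy the elements of matrix A into matrix B. */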
+   static void Copy(TCpuMatrix<Scalar_t> & B,
+                    const TCpuMatrix<Scalar_t> & A);
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Activation Functions
+   //____________________________________________________________________________
+
+   /** @name Activation Functions
+    * For each activation function, the low-level interface contains two routines.
+    * One that applies the activation function to a matrix and one that evaluates
+    * the derivatives of the activation function at the elements of a given matrix
+    * and writes the results into the result matrix.
+    */
+   ///@{
+   static void IdentityDerivative(TCpuMatrix<Scalar_t> & B,
+                                  const TCpuMatrix<Scalar_t> &A);
+
+   static void Relu(TCpuMatrix<Scalar_t> & B);
+   static void ReluDerivative(TCpuMatrix<Scalar_t> & B,
+                              const TCpuMatrix<Scalar_t> & A);
+
+   static void Sigmoid(TCpuMatrix<Scalar_t> & B);
+   static void SigmoidDerivative(TCpuMatrix<Scalar_t> & B,
+                                 const TCpuMatrix<Scalar_t> & A);
+
+   static void Tanh(TCpuMatrix<Scalar_t> & B);
+   static void TanhDerivative(TCpuMatrix<Scalar_t> & B,
+                              const TCpuMatrix<Scalar_t> & A);
+
+   static void SymmetricRelu(TCpuMatrix<Scalar_t> & B);
+   static void SymmetricReluDerivative(TCpuMatrix<Scalar_t> & B,
+                                       const TCpuMatrix<Scalar_t> & A);
+
+   static void SoftSign(TCpuMatrix<Scalar_t> & B);
+   static void SoftSignDerivative(TCpuMatrix<Scalar_t> & B,
+                                  const TCpuMatrix<Scalar_t> & A);
+
+   static void Gauss(TCpuMatrix<Scalar_t> & B);
+   static void GaussDerivative(TCpuMatrix<Scalar_t> & B,
+                               const TCpuMatrix<Scalar_t> & A);
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Loss Functions
+   //____________________________________________________________________________
+
+   /** @name Loss Functions
+    * Loss functions compute a scalar value given the \p output of the network
+    * for a given training input and the expected network prediction \p Y that
+    * quantifies the quality of the prediction. For each function a routine
+    * that computes the gradients (suffixed by Gradients) must also be provided
+    * to start the backpropagation algorithm.
+    */
+   ///@{
+
+   static Scalar_t MeanSquaredError(const TCpuMatrix<Scalar_t> &Y,
+                                    const TCpuMatrix<Scalar_t> &output);
+   static void MeanSquaredErrorGradients(TCpuMatrix<Scalar_t> & dY,
+                                         const TCpuMatrix<Scalar_t> &Y,
+                                         const TCpuMatrix<Scalar_t> &output);
+
+   /** Sigmoid transformation is implicitly applied, thus \p output should
+    *  hold the linear activations of the last layer in the net. */
+   static Scalar_t CrossEntropy(const TCpuMatrix<Scalar_t> &Y,
+                                const TCpuMatrix<Scalar_t> &output);
+
+   static void CrossEntropyGradients(TCpuMatrix<Scalar_t> & dY,
+                                     const TCpuMatrix<Scalar_t> & Y,
+                                     const TCpuMatrix<Scalar_t> & output);
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Output Functions
+   //____________________________________________________________________________
+
+   /** @name Output Functions
+    * Output functions transform the activations \p output of the
+    * output layer in the network to a valid prediction \p YHat for
+    * the desired usage of the network, e.g.  the identity function
+    * for regression or the sigmoid transformation for two-class
+    * classification.
+    */
+   ///@{
+   static void Sigmoid(TCpuMatrix<Scalar_t> &YHat,
+                        const TCpuMatrix<Scalar_t> & );
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Regularization
+   //____________________________________________________________________________
+
+   /** @name Regularization
+    * For each regularization type two functions are required, one named
+    * <tt><Type>Regularization</tt> that evaluates the corresponding
+    * regularization functional for a given weight matrix and the
+    * <tt>Add<Type>RegularizationGradients</tt>, which adds the regularization
+    * component of the gradients to the provided matrix.
+    */
+   ///@{
+
+   static Scalar_t L1Regularization(const TCpuMatrix<Scalar_t> & W);
+   static void AddL1RegularizationGradients(TCpuMatrix<Scalar_t> & A,
+                                            const TCpuMatrix<Scalar_t> & W,
+                                            Scalar_t weightDecay);
+
+   static Scalar_t L2Regularization(const TCpuMatrix<Scalar_t> & W);
+   static void AddL2RegularizationGradients(TCpuMatrix<Scalar_t> & A,
+                                            const TCpuMatrix<Scalar_t> & W,
+                                            Scalar_t weightDecay);
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Initialization
+   //____________________________________________________________________________
+
+   /** @name Initialization
+    * For each initialization method, one function in the low-level interface
+    * is provided. The naming scheme is <tt>Initialize<Type></tt> for a given
+    * initialization method Type.
+    */
+   ///@{
+
+   static void InitializeGauss(TCpuMatrix<Scalar_t> & A);
+   static void InitializeUniform(TCpuMatrix<Scalar_t> & A);
+   static void InitializeIdentity(TCpuMatrix<Scalar_t> & A);
+   static void InitializeZero(TCpuMatrix<Scalar_t> & A);
+
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Dropout
+   //____________________________________________________________________________
+
+   /** @name Dropout
+    */
+   ///@{
+
+   /** Apply dropout with activation probability \p p to the given
+    *  matrix \p A and scale the result by reciprocal of \p p. */
+   static void Dropout(TCpuMatrix<Scalar_t> & A, Scalar_t p);
+
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Additional Arithmetic Functions
+   //____________________________________________________________________________
+
+   /** @name Additional Arithmetic Functions
+    *
+    * Additional arithmetic on CPU matrices used to implement the low-level
+    * interface.
+    */
+   ///@{
+
+   /** Standard multiplication of two matrices \p A and \p B with the result being
+    *  written into C.
+    */
+   static void Multiply(TCpuMatrix<Scalar_t> &C,
+                        const TCpuMatrix<Scalar_t> &A,
+                        const TCpuMatrix<Scalar_t> &B);
+   /** Matrix multiplication of two matrices \p A and \p B^T (transposed) with the
+    *  result being written into C.
+    */
+   static void TransposeMultiply(TCpuMatrix<Scalar_t> &output,
+                                 const TCpuMatrix<Scalar_t> &input,
+                                 const TCpuMatrix<Scalar_t> &Weights);
+   /** In-place Hadamard (element-wise) product of matrices \p A and \p B
+    *  with the result being written into \p A.
+    */
+   static void Hadamard(TCpuMatrix<Scalar_t> &A,
+                        const TCpuMatrix<Scalar_t> &B);
+
+   /** Sum columns of the (m x n) matrix \p A and write the results into the
+    * first m elements of \p B.
+    */
+   static void SumColumns(TCpuMatrix<Scalar_t> &B,
+                          const TCpuMatrix<Scalar_t> &A);
+
+   /** Compute the sum of all elements in \p A */
+   static Scalar_t Sum(const TCpuMatrix<Scalar_t> &A);
+
+};
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/Blas.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/Blas.h
new file mode 100644
index 0000000000000000000000000000000000000000..8288db18ba341150eb37f26affb3c9f9dc6839c0
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/Blas.h
@@ -0,0 +1,171 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 20/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Declarations of the BLAS functions used for the forward and   //
+// backward propagation of activation through neural networks on //
+// CPUs.                                                         //
+///////////////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_ARCHITECTURES_CPU_BLAS
+#define TMVA_DNN_ARCHITECTURES_CPU_BLAS
+
+#include <iostream>
+
+// External Library Routines
+//____________________________________________________________________________
+extern "C" void saxpy_(const int * n, const float * alpha, const float * x,
+                       const int * incx, float * y,   const int * incy);
+extern "C" void daxpy_(const int * n, const double * alpha, const double * x,
+                       const int * incx, double * y, const int * incy);
+extern "C" void sger_(const int * m, const int * n, const float * alpha,
+                      const float * x, const int * incx,
+                      const float * y, const int * incy,
+                      float * A, const int * lda);
+extern "C" void dger_(const int * m, const int * n, const double * alpha,
+                      const double * x, const int * incx,
+                      const double * y, const int * incy,
+                      double * A, const int * lda);
+extern "C" void sgemv_(const char * trans, const int * m, const int * n,
+                       const float * alpha,  const float * A, const int * lda,
+                       const float * x, const int * incx,
+                       const float * beta, float * y, const int * incy);
+extern "C" void dgemv_(const char * trans, const int * m, const int * n,
+                       const double * alpha,  const double * A, const int * lda,
+                       const double * x, const int * incx,
+                       const double * beta, double * y, const int * incy);
+extern "C" void dgemm_(const char * transa, const char * transb,
+                       const int * m, const int * n, const int * k,
+                       const double * alpha, const double * A, const int * lda,
+                       const double * B, const int * ldb, const double * beta,
+                       double * C, const int * ldc);
+extern "C" void sgemm_(const char * transa, const char * transb,
+                       const int * m, const int * n, const int * k,
+                       const float * alpha, const float * A, const int * lda,
+                       const float * B, const int * ldb, const float * beta,
+                       float * C, const int * ldc);
+
+namespace TMVA
+{
+namespace DNN
+{
+namespace Blas
+{
+
+// Type-Generic Wrappers
+//____________________________________________________________________________
+/** Add the vector \p x scaled by \p alpha to the vector \p y. */
+template <typename Real_t>
+inline void Axpy(const int * n, const Real_t * alpha,
+                 const Real_t * x, const int * incx,
+                 Real_t * y, const int * incy);
+
+/** Multiply the vector \p x with the matrix \p A and store the result in \p y. */
+template <typename Real_t>
+inline void Gemv(const char *trans, const int * m, const int * n,
+                 const Real_t * alpha, const Real_t * A, const int * lda,
+                 const Real_t * x, const int * incx,
+                 const Real_t * beta, Real_t * y, const int * incy);
+
+/** Multiply the matrix \p A with the matrix \p B and store the result in \p C. */
+template <typename Real_t>
+inline void Gemm(const char *transa, const char *transb,
+                 const int * m, const int * n, const int* k,
+                 const Real_t * alpha, const Real_t * A, const int * lda,
+                 const Real_t * B, const int * ldb, const Real_t * beta,
+                 Real_t * C, const int * ldc);
+
+/** Add the outer product of \p x and \p y to the matrix \p A. */
+template <typename Real_t>
+inline void Ger(const int * m, const int * n, const Real_t * alpha,
+                const Real_t * x, const int * incx,
+                const Real_t * y, const int * incy,
+                Real_t * A, const int * lda);
+
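+// A minimal usage sketch of the type-generic wrappers (all pointer names and
+// sizes below are purely illustrative): computing C = A * B for column-major
+// matrices A (m x k), B (k x n) and C (m x n) could look like
+//
+//    int m = 2, n = 3, k = 4;
+//    double alpha = 1.0, beta = 0.0;
+//    TMVA::DNN::Blas::Gemm("N", "N", &m, &n, &k, &alpha, A, &m,
+//                          B, &k, &beta, C, &m);
+//
+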
+// Specializations
+//____________________________________________________________________________
+template<>
+inline void Axpy<double>(const int * n, const double * alpha,
+                         const double * x, const int * incx,
+                         double * y, const int * incy)
+{
+    daxpy_(n, alpha, x, incx, y, incy);
+}
+
+template<>
+inline void Axpy<float>(const int * n, const float * alpha,
+                        const float * x, const int * incx,
+                        float * y, const int * incy)
+{
+   saxpy_(n, alpha, x, incx, y, incy);
+}
+
+template<>
+inline void Gemv<double>(const char *trans, const int * m, const int * n,
+                         const double * alpha, const double * A, const int * lda,
+                         const double * x, const int * incx,
+                         const double * beta, double * y, const int * incy)
+{
+   dgemv_(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+template<>
+inline void Gemv<float>(const char *trans, const int * m, const int * n,
+                        const float * alpha, const float * A, const int * lda,
+                        const float * x, const int * incx,
+                        const float * beta, float * y, const int * incy)
+{
+   sgemv_(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+template<>
+inline void Gemm<double>(const char *transa, const char *transb,
+                         const int * m, const int * n, const int* k,
+                         const double * alpha, const double * A, const int * lda,
+                         const double * B, const int * ldb, const double * beta,
+                         double * C, const int * ldc)
+{
+    dgemm_(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+template<>
+inline void Gemm<float>(const char *transa, const char *transb,
+                        const int * m, const int * n, const int* k,
+                        const float * alpha, const float * A, const int * lda,
+                        const float * B, const int * ldb, const float * beta,
+                        float * C, const int * ldc)
+{
+    sgemm_(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+template <>
+inline void Ger<double>(const int * m, const int * n, const double * alpha,
+                        const double * x, const int * incx,
+                        const double * y, const int * incy,
+                        double * A, const int * lda)
+{
+   dger_(m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+template <>
+inline void Ger<float>(const int * m, const int * n, const float * alpha,
+                       const float * x, const int * incx,
+                       const float * y, const int * incy,
+                       float * A, const int * lda)
+{
+   sger_(m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+} // namespace Blas
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuBuffer.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuBuffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..192ece11b6983c78ec42f33a8f89ca809a64d123
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuBuffer.h
@@ -0,0 +1,86 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 12/08/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////
+// CPU Buffer interface class for the generic data loader. //
+/////////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_ARCHITECTURES_CPU_CPUBUFFER
+#define TMVA_DNN_ARCHITECTURES_CPU_CPUBUFFER
+
+#include "TMVA/DNN/DataLoader.h"
+#include <vector>
+#include <memory>
+
+namespace TMVA
+{
+namespace DNN
+{
+
+/** TCpuBuffer
+ *
+ * Since the memory on the CPU is homogeneous, only one buffer class is required.
+ * The host and device buffer classes are the same and copying between the host
+ * and device buffer is achieved by simply swapping the memory pointers.
+ *
+ * Memory is handled as a shared pointer to a pointer of type AFloat, which is
+ * the floating point type used for the implementation.
+ *
+ * Copying and assignment of TCpuBuffer objects perform only a shallow copy,
+ * meaning that the underlying data is shared between those objects.
+ *
+ * \tparam AFloat The floating point type used for the computations.
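+ *
+ * A minimal usage sketch (sizes chosen for illustration only):
+ * \code
+ * TCpuBuffer<Double_t> buffer(100);    // allocate 100 elements
+ * buffer[0] = 1.0;                     // element access
+ * TCpuBuffer<Double_t> view = buffer.GetSubBuffer(10, 20); // 20 elements from offset 10
+ * \endcode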
+ */
+template<typename AFloat>
+class TCpuBuffer
+{
+private:
+
+   size_t fSize;
+   size_t fOffset;
+   std::shared_ptr<AFloat *> fBuffer;
+
+   struct TDestructor
+   {
+       void operator()(AFloat ** pointer);
+       friend TCpuBuffer;
+   } fDestructor;
+
+public:
+
+   /** Construct buffer to hold \p size numbers of type \p AFloat.*/
+    TCpuBuffer(size_t size);
+    TCpuBuffer(const TCpuBuffer  &) = default;
+    TCpuBuffer(      TCpuBuffer &&) = default;
+    TCpuBuffer & operator=(const TCpuBuffer  &) = default;
+    TCpuBuffer & operator=(      TCpuBuffer &&) = default;
+
+    operator AFloat * () const {return (* fBuffer) + fOffset;}
+
+    /** Return a sub-buffer of size \p start starting at element \p offset. */
+    TCpuBuffer GetSubBuffer(size_t offset, size_t start);
+
+    AFloat & operator[](size_t i)       {return (*fBuffer.get())[fOffset + i];}
+    AFloat   operator[](size_t i) const {return (*fBuffer.get())[fOffset + i];}
+
+    /** Copy data from another buffer. No real copying is performed, only the
+     *  data pointers are swapped. */
+    void CopyFrom(TCpuBuffer &);
+    /** Copy data to another buffer. No real copying is performed, only the
+     *  data pointers are swapped. */
+    void CopyTo(TCpuBuffer &);
+};
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
+
diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuMatrix.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuMatrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..66aa444b28fff41afd5e4ec61996599dcdddf20d
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuMatrix.h
@@ -0,0 +1,157 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 20/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////
+// Definition of the CpuMatrix class used to represent  //
+// weight and bias matrices in neural nets.             //
+//////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_ARCHITECTURES_CPU_CPUMATRIX
+#define TMVA_DNN_ARCHITECTURES_CPU_CPUMATRIX
+
+#include <cstddef>
+#include <vector>
+#include "tbb/tbb.h"
+
+#include "TMatrix.h"
+#include "CpuBuffer.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+/** The TCpuMatrix class.
+ *
+ * Matrix class for multi-threaded CPU architectures. Uses the TCpuBuffer
+ * class to store the matrices in column-major format for compatibility with
+ * BLAS. Provides Map and MapFrom member functions to simplify the application of
+ * activation functions and derivatives to matrices.
+ *
+ * Copying and assignment of TCpuMatrix objects only performs shallow copies, i.e.
+ * copying is fast and the resulting objects share the element data.
+ *
+ * \tparam AFloat The floating point type used to represent the matrix elements.
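+ *
+ * A minimal sketch of the Map interface (function and sizes are illustrative):
+ * \code
+ * TCpuMatrix<Double_t> A(2, 3);
+ * auto square = [](Double_t x) {return x * x;};
+ * A.Map(square);   // apply the function to every element in parallel
+ * \endcode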
+ */
+//______________________________________________________________________________
+template<typename AFloat>
+class TCpuMatrix
+{
+private:
+
+   static std::vector<AFloat> fOnes;  ///< Vector filled with ones used for BLAS calls.
+
+   TCpuBuffer<AFloat> fBuffer; ///< The buffer holding the matrix elements
+                              ///< in column-major format.
+   size_t            fNCols;
+   size_t            fNRows;
+
+public:
+
+   /** Returns a pointer to a vector of ones whose length is guaranteed to be
+    *  at least the number of columns of every instantiated TCpuMatrix object. */
+   static const AFloat * GetOnePointer() {return fOnes.data();}
+
+   /** Construct matrix and allocate space for its elements. */
+   TCpuMatrix(size_t nRows, size_t nCols);
+   /** Construct a TCpuMatrix object by (deeply) copying from a
+    *  TMatrixT<Double_t> matrix. */
+   TCpuMatrix(const TMatrixT<Double_t> &);
+   /** Construct an m-times-n matrix from the given buffer. The buffer size
+    *  must match. */
+   TCpuMatrix(const TCpuBuffer<AFloat> &buffer, size_t m, size_t n);
+
+   TCpuMatrix(const TCpuMatrix  &)             = default;
+   TCpuMatrix(      TCpuMatrix &&)             = default;
+   TCpuMatrix & operator=(const TCpuMatrix &)  = default;
+   TCpuMatrix & operator=(TCpuMatrix &&)       = default;
+   ~TCpuMatrix()                               = default;
+
+   /** Convert to a TMatrixT<Double_t> object. Performs a deep copy of the matrix
+    *  elements. */
+   operator TMatrixT<Double_t>() const;
+
+   /** Map the given function over the matrix elements. Executed in parallel
+    *  using tbb. */
+   template <typename Function_t>
+   void Map(Function_t &f);
+
+   /** Same as Map, but takes the input values from the matrix \p A and writes
+    *  the results into this matrix. */
+   template <typename Function_t>
+   void MapFrom(Function_t &f, const TCpuMatrix & A);
+
+   size_t GetNrows() const {return fNRows;}
+   size_t GetNcols() const {return fNCols;}
+   size_t GetNElements() const {return fNRows * fNCols;}
+
+   /** Return matrix element in row \p i and column \p j. */
+   AFloat   operator()(size_t i, size_t j) const {return fBuffer[j * fNRows + i];}
+   AFloat & operator()(size_t i, size_t j)       {return fBuffer[j * fNRows + i];}
+
+   /** Return raw pointer to the elements stored contiguously in column-major
+    *  order. */
+   AFloat *       GetRawDataPointer()        {return fBuffer;}
+   const AFloat * GetRawDataPointer()  const {return fBuffer;}
+
+private:
+
+   void Initialize();
+
+};
+
+// Inline Functions.
+//______________________________________________________________________________
+template<typename AFloat>
+template<typename Function_t>
+inline void TCpuMatrix<AFloat>::Map(Function_t &f)
+{
+   AFloat __restrict__ *data = GetRawDataPointer();
+
+   auto fRange = [data, &f](const tbb::blocked_range<size_t> & range)
+   {
+      size_t rangeBegin = range.begin();
+      size_t rangeEnd   = range.end();
+
+      for (size_t i = rangeBegin; i != rangeEnd; ++i) {
+         data[i] = f(data[i]);
+      }
+   };
+
+   tbb::blocked_range<size_t> range(0, GetNElements());
+   parallel_for(range, fRange);
+}
+
+template<typename AFloat>
+template<typename Function_t>
+inline void TCpuMatrix<AFloat>::MapFrom(Function_t &f, const TCpuMatrix &A)
+{
+         AFloat __restrict__ *dataB = GetRawDataPointer();
+   const AFloat __restrict__ *dataA = A.GetRawDataPointer();
+
+   auto fRange = [&dataB, &dataA, &f](const tbb::blocked_range<size_t> & range)
+   {
+      size_t rangeBegin = range.begin();
+      size_t rangeEnd   = range.end();
+
+      for (size_t i = rangeBegin; i != rangeEnd; ++i) {
+         dataB[i] = f(dataA[i]);
+      }
+   };
+
+   tbb::blocked_range<size_t> range(0, GetNElements());
+   parallel_for(range, fRange);
+}
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..751d5264ea60127b448796aa5747f0af0cb44023
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h
@@ -0,0 +1,289 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 05/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Definition of the TCuda architecture class, which provides an //
+// implementation of the low-level functionality for neural      //
+// networks for the CUDA computing architectures.                //
+///////////////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_ARCHITECTURES_CUDA
+#define TMVA_DNN_ARCHITECTURES_CUDA
+
+#include "cuda.h"
+#include "Cuda/CudaBuffers.h"
+#include "Cuda/CudaMatrix.h"
+#include "TMVA/DNN/DataLoader.h"
+#include <utility>
+
+namespace TMVA
+{
+namespace DNN
+{
+
+/** The TCuda architecture class.
+ *
+ * Low-level interface class for CUDA computing architectures. Contains as
+ * public types the declaration of the scalar, matrix and buffer types
+ * for this architecture as well as the remaining functions in the low-level
+ * interface in the form of static members.
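+ *
+ * A minimal usage sketch (types and sizes chosen for illustration only):
+ * \code
+ * using Architecture_t = TCuda<Double_t>;
+ * Architecture_t::Matrix_t W(10, 10);
+ * Architecture_t::InitializeGauss(W);   // random Gaussian initialization
+ * Architecture_t::Relu(W);              // apply ReLU in place
+ * \endcode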
+ */
+template<typename AFloat = Double_t>
+class TCuda
+{
+
+public:
+
+    using Scalar_t       = AFloat;
+    using Matrix_t       = TCudaMatrix<AFloat>;
+    using DeviceBuffer_t = TCudaDeviceBuffer<AFloat>;
+    using HostBuffer_t   = TCudaHostBuffer<AFloat>;
+
+   //____________________________________________________________________________
+   //
+   // Propagation
+   //____________________________________________________________________________
+
+   /** @name Forward Propagation
+    * Low-level functions required for the forward propagation of activations
+    * through the network.
+    */
+   ///@{
+   /** Matrix-multiply \p input with the transpose of \p weights and
+    *  write the results into \p output. */
+   static void MultiplyTranspose(TCudaMatrix<AFloat> &output,
+                                 const TCudaMatrix<AFloat> &input,
+                                 const TCudaMatrix<AFloat> &weights);
+   /** Add the vector \p biases row-wise to the matrix \p output. */
+   static void AddRowWise(TCudaMatrix<AFloat> &output,
+                          const TCudaMatrix<AFloat> &biases);
+   ///@}
+
+   /** @name Backward Propagation
+    * Low-level functions required for the backward propagation of gradients
+    * through the network.
+    */
+   ///@{
+   /** Perform the complete backward propagation step. If the provided
+    *  \p activationGradientsBackward matrix is not empty, compute the
+    *  gradients of the objective function with respect to the activations
+    *  of the previous layer (backward direction).
+    *  Also compute the weight and the bias gradients. Modifies the values
+    *  in \p df and thus produces a valid result only the first time it is
+    *  called after the corresponding forward propagation has been
+    *  performed. */
+   static void Backward(TCudaMatrix<AFloat> & activationGradientsBackward,
+                        TCudaMatrix<AFloat> & weightGradients,
+                        TCudaMatrix<AFloat> & biasGradients,
+                        TCudaMatrix<AFloat> & df,
+                        const TCudaMatrix<AFloat> & activationGradients,
+                        const TCudaMatrix<AFloat> & weights,
+                        const TCudaMatrix<AFloat> & activationBackward);
+   /** Adds the elements in matrix \p B scaled by \p beta to the elements in
+    *  the matrix \p A. This is required for the weight update in the gradient
+    *  descent step. */
+   static void ScaleAdd(TCudaMatrix<AFloat> & A,
+                        const TCudaMatrix<AFloat> & B,
+                        Scalar_t beta = 1.0);
+   /** Copy the elements of matrix A into matrix B. */
+   static void Copy(TCudaMatrix<AFloat> & B,
+                    const TCudaMatrix<AFloat> & A);
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Activation Functions
+   //____________________________________________________________________________
+
+   /** @name Activation Functions
+    * For each activation function, the low-level interface contains two routines.
+    * One that applies the activation function to a matrix and one that evaluates
+    * the derivatives of the activation function at the elements of a given matrix
+    * and writes the results into the result matrix.
+    */
+   ///@{
+   static void Identity(TCudaMatrix<AFloat> & B);
+   static void IdentityDerivative(TCudaMatrix<AFloat> & B,
+                                  const TCudaMatrix<AFloat> & A);
+
+   static void Relu(TCudaMatrix<AFloat> & B);
+   static void ReluDerivative(TCudaMatrix<AFloat> & B,
+                              const TCudaMatrix<AFloat> & A);
+
+   static void Sigmoid(TCudaMatrix<AFloat> & B);
+   static void SigmoidDerivative(TCudaMatrix<AFloat> & B,
+                                 const TCudaMatrix<AFloat> & A);
+
+   static void Tanh(TCudaMatrix<AFloat> & B);
+   static void TanhDerivative(TCudaMatrix<AFloat> & B,
+                              const TCudaMatrix<AFloat> & A);
+
+   static void SymmetricRelu(TCudaMatrix<AFloat> & B);
+   static void SymmetricReluDerivative(TCudaMatrix<AFloat> & B,
+                                       const TCudaMatrix<AFloat> & A);
+
+   static void SoftSign(TCudaMatrix<AFloat> & B);
+   static void SoftSignDerivative(TCudaMatrix<AFloat> & B,
+                                  const TCudaMatrix<AFloat> & A);
+
+   static void Gauss(TCudaMatrix<AFloat> & B);
+   static void GaussDerivative(TCudaMatrix<AFloat> & B,
+                               const TCudaMatrix<AFloat> & A);
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Loss Functions
+   //____________________________________________________________________________
+
+   /** @name Loss Functions
+    * Loss functions compute a scalar value given the \p output of the network
+    * for a given training input and the expected network prediction \p Y that
+    * quantifies the quality of the prediction. For each function a routine
+    * that computes the gradients (suffixed by Gradients) must also be provided
+    * to start the backpropagation algorithm.
+    */
+   ///@{
+
+   static AFloat MeanSquaredError(const TCudaMatrix<AFloat> &Y,
+                                  const TCudaMatrix<AFloat> &output);
+   static void MeanSquaredErrorGradients(TCudaMatrix<AFloat> & dY,
+                                         const TCudaMatrix<AFloat> &Y,
+                                         const TCudaMatrix<AFloat> &output);
+
+   /** Sigmoid transformation is implicitly applied, thus \p output should
+    *  hold the linear activations of the last layer in the net. */
+   static AFloat CrossEntropy(const TCudaMatrix<AFloat> &Y,
+                              const TCudaMatrix<AFloat> &output);
+
+   static void CrossEntropyGradients(TCudaMatrix<AFloat> & dY,
+                                     const TCudaMatrix<AFloat> & Y,
+                                     const TCudaMatrix<AFloat> & output);
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Output Functions
+   //____________________________________________________________________________
+
+   /** @name Output Functions
+    * Output functions transform the activations \p output of the
+    * output layer in the network to a valid prediction \p YHat for
+    * the desired usage of the network, e.g.  the identity function
+    * for regression or the sigmoid transformation for two-class
+    * classification.
+    */
+   ///@{
+   static void Sigmoid(TCudaMatrix<AFloat> &YHat,
+                       const TCudaMatrix<AFloat> & );
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Regularization
+   //____________________________________________________________________________
+
+   /** @name Regularization
+    * For each regularization type two functions are required, one named
+    * <tt><Type>Regularization</tt> that evaluates the corresponding
+    * regularization functional for a given weight matrix and the
+    * <tt>Add<Type>RegularizationGradients</tt>, which adds the regularization
+    * component of the gradients to the provided matrix.
+    */
+   ///@{
+
+   static AFloat L1Regularization(const TCudaMatrix<AFloat> & W);
+   static void AddL1RegularizationGradients(TCudaMatrix<AFloat> & A,
+                                            const TCudaMatrix<AFloat> & W,
+                                            AFloat weightDecay);
+
+   static AFloat L2Regularization(const TCudaMatrix<AFloat> & W);
+   static void AddL2RegularizationGradients(TCudaMatrix<AFloat> & A,
+                                            const TCudaMatrix<AFloat> & W,
+                                            AFloat weightDecay);
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Initialization
+   //____________________________________________________________________________
+
+   /** @name Initialization
+    * For each initialization method, one function in the low-level interface
+    * is provided. The naming scheme is <tt>Initialize<Type></tt> for a given
+    * initialization method Type.
+    */
+   ///@{
+
+   static void InitializeGauss(TCudaMatrix<AFloat> & A);
+   static void InitializeUniform(TCudaMatrix<AFloat> & A);
+   static void InitializeIdentity(TCudaMatrix<AFloat> & A);
+   static void InitializeZero(TCudaMatrix<AFloat> & A);
+
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Dropout
+   //____________________________________________________________________________
+
+   /** @name Dropout
+    */
+   ///@{
+
+   /** Apply dropout with activation probability \p p to the given
+    *  matrix \p A and scale the result by reciprocal of \p p. */
+   static void Dropout(TCudaMatrix<AFloat> & A, AFloat p);
+
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Additional Arithmetic Functions
+   //____________________________________________________________________________
+
+   /** @name Additional Arithmetic Functions
+    *
+    * Additional arithmetic on CUDA matrices used to implement the low-level
+    * interface.
+    */
+   ///@{
+
+   /** Standard multiplication of two matrices \p A and \p B with the result being
+    *  written into C.
+    */
+   static void Multiply(TCudaMatrix<AFloat> & C,
+                        const TCudaMatrix<AFloat> & A,
+                        const TCudaMatrix<AFloat> & B);
+   /** Matrix multiplication of two matrices \p A and \p B^T (transposed) with the
+    *  result being written into C.
+    */
+   static void TransposeMultiply(TCudaMatrix<AFloat> & output,
+                                 const TCudaMatrix<AFloat> & input,
+                                 const TCudaMatrix<AFloat> & Weights);
+   /** In-place Hadamard (element-wise) product of matrices \p A and \p B
+    *  with the result being written into \p A.
+    */
+   static void Hadamard(TCudaMatrix<AFloat> & A, const TCudaMatrix<AFloat> & B);
+
+   /** Sum columns of the (m x n) matrix \p A and write the results into the
+    * first m elements of \p B.
+    */
+   static void SumColumns(TCudaMatrix<AFloat> & B, const TCudaMatrix<AFloat> & A);
+
+   /** Compute the sum of all elements in \p A */
+   static AFloat Sum(const TCudaMatrix<AFloat> &A);
+};
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaBuffers.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaBuffers.h
new file mode 100644
index 0000000000000000000000000000000000000000..f03483ff444a1042f0e489523be55361c3475040
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaBuffers.h
@@ -0,0 +1,157 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 07/08/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////
+// Device and host buffer for CUDA architectures. //
+////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDABUFFERS
+#define TMVA_DNN_ARCHITECTURES_CUDA_CUDABUFFERS
+
+#include "cuda.h"
+#include "cuda_runtime.h"
+#include <memory>
+
+namespace TMVA {
+namespace DNN  {
+
+template<typename AFloat>
+class TCudaDeviceBuffer;
+
+/** TCudaHostBuffer
+ *
+ * Wrapper class for pinned memory buffers on the host. Uses a
+ * std::shared_ptr with a custom destructor to ensure consistent
+ * memory management and allow for easy copying/moving of the
+ * buffers. Copying is asynchronous and will set the cudaStream of the
+ * device buffer so that subsequent computations on the device buffer
+ * can be performed on the same stream.
+ *
+ * \tparam AFloat The floating point type to be stored in the buffers.
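+ *
+ * A minimal transfer sketch (sizes are illustrative only):
+ * \code
+ * TCudaHostBuffer<float>   host(256);
+ * TCudaDeviceBuffer<float> device(256);
+ * host[0] = 1.0f;          // fill pinned host memory
+ * device.CopyFrom(host);   // asynchronous transfer on the device stream
+ * \endcode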
+ */
+template<typename AFloat>
+class TCudaHostBuffer
+{
+private:
+
+   size_t                    fOffset;        ///< Offset for sub-buffers
+   mutable cudaStream_t      fComputeStream; ///< cudaStream for data transfer
+   std::shared_ptr<AFloat *> fHostPointer;   ///< Pointer to the buffer data
+
+   // Custom destructor required to free pinned host memory using cudaFreeHost.
+   struct TDestructor
+   {
+       TDestructor()                     = default;
+       TDestructor(const TDestructor  &) = default;
+       TDestructor(      TDestructor &&) = default;
+       TDestructor & operator=(const TDestructor  &) = default;
+       TDestructor & operator=(      TDestructor &&) = default;
+       void operator()(AFloat ** devicePointer);
+   } fDestructor;
+
+   friend TCudaDeviceBuffer<AFloat>;
+
+public:
+
+   TCudaHostBuffer(size_t size);
+   TCudaHostBuffer(AFloat *);
+   TCudaHostBuffer() = default;
+   TCudaHostBuffer(const TCudaHostBuffer  &) = default;
+   TCudaHostBuffer(      TCudaHostBuffer &&) = default;
+   TCudaHostBuffer & operator=(const TCudaHostBuffer  &) = default;
+   TCudaHostBuffer & operator=(      TCudaHostBuffer &&) = default;
+
+   /** Return sub-buffer of the current buffer. */
+   TCudaHostBuffer GetSubBuffer(size_t offset, size_t size);
+
+   operator AFloat * () const;
+
+   inline AFloat & operator[](size_t index);
+   inline AFloat   operator[](size_t index) const;
+
+};
+
+/** TCudaDeviceBuffer
+ *
+ *  Service class for on-device memory buffers. Uses a
+ *  std::shared_ptr with a custom destructor to ensure consistent
+ *  memory management and allow for easy copying/moving. A device
+ *  buffer has an associated CUDA compute stream, which is used for
+ *  implicit synchronization of data transfers.
+ *
+ * \tparam AFloat The floating point type to be stored in the buffers.
+ */
+template<typename AFloat>
+class TCudaDeviceBuffer
+{
+private:
+
+   size_t                    fOffset;        ///< Offset for sub-buffers
+   size_t                    fSize;
+   cudaStream_t              fComputeStream; ///< cudaStream for data transfer
+   std::shared_ptr<AFloat *> fDevicePointer; ///< Pointer to the buffer data
+
+   // Custom destructor required to free the device memory using cudaFree.
+   struct TDestructor
+   {
+       TDestructor()                     = default;
+       TDestructor(const TDestructor  &) = default;
+       TDestructor(      TDestructor &&) = default;
+       TDestructor & operator=(const TDestructor  &) = default;
+       TDestructor & operator=(      TDestructor &&) = default;
+       void operator()(AFloat ** devicePointer);
+       friend TCudaDeviceBuffer;
+   } fDestructor;
+
+public:
+
+   TCudaDeviceBuffer(size_t size);
+   TCudaDeviceBuffer(size_t size,    cudaStream_t stream);
+   TCudaDeviceBuffer(AFloat *, size_t size, cudaStream_t stream);
+   TCudaDeviceBuffer() = default;
+   TCudaDeviceBuffer(const TCudaDeviceBuffer  &) = default;
+   TCudaDeviceBuffer(      TCudaDeviceBuffer &&) = default;
+   TCudaDeviceBuffer & operator=(const TCudaDeviceBuffer  &) = default;
+   TCudaDeviceBuffer & operator=(      TCudaDeviceBuffer &&) = default;
+
+   /** Return sub-buffer of the current buffer. */
+   TCudaDeviceBuffer GetSubBuffer(size_t offset, size_t size);
+   /** Convert to raw device data pointer.*/
+   operator AFloat * () const;
+
+   void CopyFrom(const TCudaHostBuffer<AFloat> &) const;
+   void CopyTo(const TCudaHostBuffer<AFloat> &)   const;
+
+   cudaStream_t GetComputeStream() const {return fComputeStream;}
+   void SetComputeStream(cudaStream_t stream) {fComputeStream = stream;}
+
+};
+
+//
+// Inline Functions.
+//______________________________________________________________________________
+
+template<typename AFloat>
+AFloat & TCudaHostBuffer<AFloat>::operator[](size_t index)
+{
+   return (*fHostPointer + fOffset)[index];
+}
+
+template<typename AFloat>
+AFloat   TCudaHostBuffer<AFloat>::operator[](size_t index)   const
+{
+   return (*fHostPointer + fOffset)[index];
+}
+
+
+} // namespace DNN
+} // namespace TMVA
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaMatrix.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaMatrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..f46b05e1fee8b7ca7880e5642cfe55c136ed8fd3
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaMatrix.h
@@ -0,0 +1,299 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////////
+// Contains the TCudaMatrix class for the representation of matrices //
+// on CUDA devices as well as the TCudaDeviceReference class which   //
+// is a helper class to emulate lvalue references to floating point  //
+// values on the device.                                             //
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
+#define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
+
+#include "cuda.h"
+#include "cuda_runtime.h"
+#include "cublas_v2.h"
+#include "curand_kernel.h"
+
+#include "TMatrixT.h"
+#include "CudaBuffers.h"
+
+#define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); }
+
+namespace TMVA {
+namespace DNN {
+
+/** Function to check cuda return code. Taken from
+ * http://stackoverflow.com/questions/14038589/
+ */
+inline void cudaError(cudaError_t code, const char *file, int line, bool abort=true);
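+// Typical use (illustrative only): wrap CUDA API calls so that failures are
+// reported with file and line information, e.g.
+//    CUDACHECK(cudaMalloc((void **) &devicePointer, size * sizeof(float)));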
+
+//____________________________________________________________________________
+//
+// Cuda Device Reference
+//____________________________________________________________________________
+
+/** TCudaDeviceReference
+ *
+ * Helper class emulating lvalue references for AFloat values that are
+ * physically on the device. Allows for example to assign to matrix elements.
+ * Note that device access through CudaDeviceReferences enforces synchronization
+ * with all streams and thus qualifies as performance killer. Only used for
+ * testing.
+ */
+template<typename AFloat>
+class TCudaDeviceReference
+{
+private:
+
+    AFloat * fDevicePointer;
+
+public:
+
+    TCudaDeviceReference(AFloat * devicePointer);
+
+    operator AFloat();
+
+    void operator=(const TCudaDeviceReference &other);
+    void operator=(AFloat value);
+    void operator+=(AFloat value);
+    void operator-=(AFloat value);
+};
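+
+// Usage sketch (illustrative only): TCudaDeviceReference lets host code read
+// and write a single device element through ordinary assignment syntax; each
+// access issues a cudaMemcpy. The pointer name below is a placeholder.
+//
+//    TCudaDeviceReference<float> ref(devicePtr);   // devicePtr: a device address
+//    ref = 1.0f;            // host -> device copy
+//    ref += 0.5f;           // read on host, modify, write back
+//    float value = ref;     // device -> host copy via operator AFloat()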
+
+//____________________________________________________________________________
+//
+// Cuda Matrix
+//____________________________________________________________________________
+
+/** TCudaMatrix Class
+ *
+ * The TCudaMatrix class represents matrices on a CUDA device. The elements
+ * of the matrix are stored in a TCudaDeviceBuffer object which takes care of
+ * the allocation and freeing of the device memory. TCudaMatrix objects are
+ * lightweight: assignment and copy construction perform only a shallow copy,
+ * and no new element buffer is allocated. To perform a deep copy, use the
+ * static Copy method of the TCuda architecture class.
+ *
+ * The TCudaDeviceBuffer has an associated cuda stream, on which the data is
+ * transferred to the device. This stream can be accessed through the
+ * GetComputeStream member function and used to synchronize computations.
+ *
+ * The TCudaMatrix class also holds static references to CUDA resources.
+ * These are the cuBLAS handle, a buffer of cuRAND states for the generation
+ * of random numbers, and a vector of ones, which is used for summing matrix
+ * columns via matrix-vector multiplication. The class also
+ * has a static buffer for returning results from the device.
+ *
+ */
+template<typename AFloat>
+class TCudaMatrix
+{
+public:
+
+private:
+
+   static size_t          fInstances;    ///< Current number of matrix instances.
+   static cublasHandle_t  fCublasHandle;
+   static AFloat        * fDeviceReturn; ///< Buffer for kernel return values.
+   static AFloat        * fOnes;         ///< Vector used for summations of columns.
+   static size_t          fNOnes;        ///< Current length of the one vector.
+   static curandState_t * fCurandStates;
+   static size_t          fNCurandStates;
+
+   size_t                    fNRows;
+   size_t                    fNCols;
+   TCudaDeviceBuffer<AFloat> fElementBuffer;
+
+public:
+
+   static AFloat * GetOnes() {return fOnes;}
+
+   TCudaMatrix();
+   TCudaMatrix(size_t i, size_t j);
+   TCudaMatrix(const TMatrixT<Double_t> &);
+   TCudaMatrix(TCudaDeviceBuffer<AFloat> buffer, size_t m, size_t n);
+
+   TCudaMatrix(const TCudaMatrix  &) = default;
+   TCudaMatrix(      TCudaMatrix &&) = default;
+   TCudaMatrix & operator=(const TCudaMatrix  &) = default;
+   TCudaMatrix & operator=(      TCudaMatrix &&) = default;
+   ~TCudaMatrix() = default;
+
+   /** Convert cuda matrix to Root TMatrix. Performs synchronous data transfer. */
+   operator TMatrixT<Double_t>() const;
+
+   inline cudaStream_t GetComputeStream() const;
+   inline void         SetComputeStream(cudaStream_t stream);
+   /** Set the return buffer on the device to the specified value. This is
+    * required for example for reductions in order to initialize the
+    * accumulator. */
+   inline static void ResetDeviceReturn(AFloat value = 0.0);
+   /** Transfer the value in the device return buffer to the host. This
+    *  transfer is synchronous. */
+   inline static AFloat GetDeviceReturn();
+   /** Return device pointer to the device return buffer */
+   inline static AFloat *        GetDeviceReturnPointer() {return fDeviceReturn;}
+   inline static curandState_t * GetCurandStatesPointer() {return fCurandStates;}
+
+   /** Blocking synchronization with the associated compute stream, if it's
+    * not the default stream. */
+   inline void Synchronize(const TCudaMatrix &) const;
+
+   size_t GetNrows() const {return fNRows;}
+   size_t GetNcols() const {return fNCols;}
+   size_t GetNoElements() const {return fNRows * fNCols;}
+   const AFloat * GetDataPointer() const {return fElementBuffer;}
+   AFloat *       GetDataPointer()       {return fElementBuffer;}
+   const cublasHandle_t & GetCublasHandle() const    {return fCublasHandle;}
+
+   /** Access to elements of device matrices provided through TCudaDeviceReference
+    *  class. Note that access is synchronous and enforces device synchronization
+    *  on all streams. Only used for testing. */
+   TCudaDeviceReference<AFloat> operator()(size_t i, size_t j) const;
+
+private:
+
+   /** Initializes all shared device resources and makes sure that a sufficient
+    *  number of cuRAND states are allocated on the device and initialized, and
+    *  that the one-vector used for the summation over columns has the right
+    *  size. */
+   void InitializeCuda();
+   void InitializeCurandStates();
+
+};
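+
+// Usage sketch (illustrative only): constructing a device matrix from a ROOT
+// matrix and accessing elements through TCudaDeviceReference. Copies are
+// shallow; a deep copy must go through the architecture's static Copy method.
+//
+//    TMatrixT<Double_t> host(2, 2);
+//    TCudaMatrix<float> A(host);           // synchronous host -> device transfer
+//    A(0, 1) = 3.0;                        // element access, slow, testing only
+//    TCudaMatrix<float> B = A;             // shallow copy: B shares A's buffer
+//    TMatrixT<Double_t> back = A;          // synchronous device -> host transfer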
+
+//
+// Inline Functions.
+//______________________________________________________________________________
+inline void cudaError(cudaError_t code, const char *file, int line, bool abort)
+{
+   if (code != cudaSuccess)
+   {
+      fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);
+   }
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+TCudaDeviceReference<AFloat>::TCudaDeviceReference(AFloat * devicePointer)
+    : fDevicePointer(devicePointer)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+TCudaDeviceReference<AFloat>::operator AFloat()
+{
+    AFloat buffer;
+    cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
+               cudaMemcpyDeviceToHost);
+    return buffer;
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCudaDeviceReference<AFloat>::operator=(const TCudaDeviceReference &other)
+{
+   cudaMemcpy(fDevicePointer, other.fDevicePointer, sizeof(AFloat),
+              cudaMemcpyDeviceToDevice);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCudaDeviceReference<AFloat>::operator=(AFloat value)
+{
+   AFloat buffer = value;
+   cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
+              cudaMemcpyHostToDevice);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCudaDeviceReference<AFloat>::operator+=(AFloat value)
+{
+   AFloat buffer;
+   cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
+              cudaMemcpyDeviceToHost);
+   buffer += value;
+   cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
+              cudaMemcpyHostToDevice);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCudaDeviceReference<AFloat>::operator-=(AFloat value)
+{
+   AFloat buffer;
+   cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
+              cudaMemcpyDeviceToHost);
+   buffer -= value;
+   cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
+              cudaMemcpyHostToDevice);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+inline cudaStream_t TCudaMatrix<AFloat>::GetComputeStream() const
+{
+   return fElementBuffer.GetComputeStream();
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+inline void TCudaMatrix<AFloat>::SetComputeStream(cudaStream_t stream)
+{
+   fElementBuffer.SetComputeStream(stream);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+inline void TCudaMatrix<AFloat>::Synchronize(const TCudaMatrix &A) const
+{
+   cudaEvent_t event;
+   cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
+   cudaEventRecord(event, A.GetComputeStream());
+   cudaStreamWaitEvent(fElementBuffer.GetComputeStream(), event, 0);
+   cudaEventDestroy(event);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+inline void TCudaMatrix<AFloat>::ResetDeviceReturn(AFloat value)
+{
+   AFloat buffer = value;
+   cudaMemcpy(fDeviceReturn, & buffer, sizeof(AFloat), cudaMemcpyHostToDevice);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+inline AFloat TCudaMatrix<AFloat>::GetDeviceReturn()
+{
+   AFloat buffer;
+   cudaMemcpy(& buffer, fDeviceReturn, sizeof(AFloat), cudaMemcpyDeviceToHost);
+   return buffer;
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+TCudaDeviceReference<AFloat> TCudaMatrix<AFloat>::operator()(size_t i, size_t j) const
+{
+    AFloat * elementPointer = fElementBuffer;
+    elementPointer += j * fNRows + i;
+    return TCudaDeviceReference<AFloat>(elementPointer);
+}
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ca30b748078ff3ca8da3079f1513c92a99fa6ee
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h
@@ -0,0 +1,86 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////
+// Defines the TDevice class encapsulating device-specific    //
+// settings for the launching of threads.                     //
+////////////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_ARCHITECTURES_CUDA_DEVICE
+#define TMVA_DNN_ARCHITECTURES_CUDA_DEVICE
+
+#include "cuda.h"
+#include "vector_types.h" // definition of dim3
+#include "CudaMatrix.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+/** TDevice
+ *
+ * The TDevice class provides static functions for the generation of CUDA
+ * grids for kernel launches and is used to encapsulate the distribution
+ * of threads and blocks over the data.
+ *
+ */
+class TDevice
+{
+public:
+   /* Number of threads per block along the first dimension. */
+   static constexpr int BlockDimX = 1;
+   /* Number of threads per block along the second dimension. */
+   static constexpr int BlockDimY = 32;
+   /* Resulting block size. */
+   static constexpr int BlockSize = BlockDimX * BlockDimY;
+
+   /* Return a dim3 object representing a BlockDimX x BlockDimY 2D
+    * block. */
+   static dim3 BlockDims()
+   {
+      return dim3(BlockDimX, BlockDimY);
+   }
+
+   /* Return 2D dim3 object representing the block grid consisting of two-dimensional
+    * BlockDimX x BlockDimY blocks covering the matrix A */
+   template<typename AFloat>
+   static dim3 GridDims(const TCudaMatrix<AFloat> &A)
+   {
+      int gridDimX = A.GetNcols() / TDevice::BlockDimX;
+      if ((A.GetNcols() % TDevice::BlockDimX) != 0)
+          gridDimX += 1;
+      int gridDimY = A.GetNrows() / TDevice::BlockDimY;
+      if ((A.GetNrows() % TDevice::BlockDimY) != 0)
+          gridDimY += 1;
+      return dim3(gridDimX, gridDimY);
+   }
+
+   /* Return the number of threads that will be launched for a given matrix \p A */
+   template<typename AFloat>
+   static int NThreads(const TCudaMatrix<AFloat> &A)
+   {
+      int gridDimX = A.GetNcols() / TDevice::BlockDimX;
+      if ((A.GetNcols() % TDevice::BlockDimX) != 0) {
+         gridDimX += 1;
+      }
+      int gridDimY = A.GetNrows() / TDevice::BlockDimY;
+      if ((A.GetNrows() % TDevice::BlockDimY) != 0) {
+         gridDimY += 1;
+      }
+      return gridDimX * gridDimY * TDevice::BlockDimX * TDevice::BlockDimY;
+   }
+};
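+
+// Usage sketch (illustrative only): launching a 2D kernel over a TCudaMatrix
+// A, with one thread per matrix element; `MyKernel` is a placeholder name,
+// not part of this interface.
+//
+//    dim3 blockDims = TDevice::BlockDims();
+//    dim3 gridDims  = TDevice::GridDims(A);          // grid covering all of A
+//    cudaStream_t s = A.GetComputeStream();
+//    MyKernel<<<gridDims, blockDims, 0, s>>>(A.GetDataPointer(),
+//                                            (int) A.GetNrows(),
+//                                            (int) A.GetNcols());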
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h
new file mode 100644
index 0000000000000000000000000000000000000000..7200f199c278045ef3b090d47aa4e04373e595be
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h
@@ -0,0 +1,250 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 20/06/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////////
+// Declaration of the TReference architecture, which provides a      //
+// reference implementation of the low-level interface for the DNN   //
+// implementation based on ROOT's TMatrixT matrix type.              //
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_ARCHITECTURES_REFERENCE
+#define TMVA_DNN_ARCHITECTURES_REFERENCE
+
+#include "TMatrix.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+/*! The reference architecture class.
+*
+* Class template that contains the reference implementation of the low-level
+* interface for the DNN implementation. The reference implementation uses the
+* TMatrixT class template to represent matrices.
+*
+* \tparam Real_t The floating point type used to represent scalars.
+*/
+template<typename Real_t>
+class TReference
+{
+public:
+
+   using Scalar_t     = Real_t;
+   using Matrix_t     = TMatrixT<Real_t>;
+
+   //____________________________________________________________________________
+   //
+   // Propagation
+   //____________________________________________________________________________
+
+   /** @name Forward Propagation
+    * Low-level functions required for the forward propagation of activations
+    * through the network.
+    */
+   ///@{
+   /** Matrix-multiply \p input with the transpose of \p weights and
+    *  write the results into \p output. */
+   static void MultiplyTranspose(TMatrixT<Scalar_t> &output,
+                                 const TMatrixT<Scalar_t> &input,
+                                 const TMatrixT<Scalar_t> &weights);
+   /** Add the vector \p biases row-wise to the matrix \p output. */
+   static void AddRowWise(TMatrixT<Scalar_t> &output,
+                          const TMatrixT<Scalar_t> &biases);
+   ///@}
+
+   /** @name Backward Propagation
+    * Low-level functions required for the backward propagation of gradients
+    * through the network.
+    */
+   ///@{
+   /** Perform the complete backward propagation step. If the provided
+    *  \p activationGradientsBackward matrix is not empty, compute the
+    *  gradients of the objective function with respect to the activations
+    *  of the previous layer (backward direction).
+    *  Also compute the weight and the bias gradients. Modifies the values
+    *  in \p df and thus produces a valid result only the first time it is
+    *  called after the corresponding forward propagation has been performed. */
+   static void Backward(TMatrixT<Scalar_t> & activationGradientsBackward,
+                        TMatrixT<Scalar_t> & weightGradients,
+                        TMatrixT<Scalar_t> & biasGradients,
+                        TMatrixT<Scalar_t> & df,
+                        const TMatrixT<Scalar_t> & activationGradients,
+                        const TMatrixT<Scalar_t> & weights,
+                        const TMatrixT<Scalar_t> & activationBackward);
+   /** Add the elements of matrix \p B, scaled by \p beta, to the elements of
+    *  matrix \p A. This is required for the weight update in the gradient
+    *  descent step. */
+   static void ScaleAdd(TMatrixT<Scalar_t> & A,
+                        const TMatrixT<Scalar_t> & B,
+                        Scalar_t beta = 1.0);
+
+   static void Copy(TMatrixT<Scalar_t> & A,
+                    const TMatrixT<Scalar_t> & B);
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Activation Functions
+   //____________________________________________________________________________
+
+   /** @name Activation Functions
+    * For each activation function, the low-level interface contains two routines.
+    * One that applies the activation function to a matrix and one that evaluates
+    * the derivatives of the activation function at the elements of a given matrix
+    * and writes the results into the result matrix.
+    */
+   ///@{
+   static void Identity(TMatrixT<Real_t> & B);
+   static void IdentityDerivative(TMatrixT<Real_t> & B,
+                                  const TMatrixT<Real_t> & A);
+
+   static void Relu(TMatrixT<Real_t> & B);
+   static void ReluDerivative(TMatrixT<Real_t> & B,
+                              const TMatrixT<Real_t> & A);
+
+   static void Sigmoid(TMatrixT<Real_t> & B);
+   static void SigmoidDerivative(TMatrixT<Real_t> & B,
+                                 const TMatrixT<Real_t> & A);
+
+   static void Tanh(TMatrixT<Real_t> & B);
+   static void TanhDerivative(TMatrixT<Real_t> & B,
+                              const TMatrixT<Real_t> & A);
+
+   static void SymmetricRelu(TMatrixT<Real_t> & B);
+   static void SymmetricReluDerivative(TMatrixT<Real_t> & B,
+                                       const TMatrixT<Real_t> & A);
+
+   static void SoftSign(TMatrixT<Real_t> & B);
+   static void SoftSignDerivative(TMatrixT<Real_t> & B,
+                                  const TMatrixT<Real_t> & A);
+
+   static void Gauss(TMatrixT<Real_t> & B);
+   static void GaussDerivative(TMatrixT<Real_t> & B,
+                               const TMatrixT<Real_t> & A);
+
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Loss Functions
+   //____________________________________________________________________________
+
+   /** @name Loss Functions
+    * Loss functions compute a scalar value given the \p output of the network
+    * for a given training input and the expected network prediction \p Y that
+    * quantifies the quality of the prediction. For each loss function, a routine
+    * that computes the gradients (suffixed by Gradients) must also be provided
+    * to start the backpropagation algorithm.
+    */
+   ///@{
+
+   static Real_t MeanSquaredError(const TMatrixT<Real_t> &Y,
+                                  const TMatrixT<Real_t> &output);
+   static void MeanSquaredErrorGradients(TMatrixT<Real_t> & dY,
+                                         const TMatrixT<Real_t> &Y,
+                                         const TMatrixT<Real_t> &output);
+
+    /** Sigmoid transformation is implicitly applied, thus \p output should
+     *  hold the linear activations of the last layer in the net. */
+   static Real_t CrossEntropy(const TMatrixT<Real_t> &Y,
+                              const TMatrixT<Real_t> &output);
+
+   static void CrossEntropyGradients(TMatrixT<Real_t> & dY,
+                                     const TMatrixT<Real_t> & Y,
+                                     const TMatrixT<Real_t> & output);
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Output Functions
+   //____________________________________________________________________________
+
+   /** @name Output Functions
+    * Output functions transform the activations \p output of the
+    * output layer in the network to a valid prediction \p YHat for
+    * the desired usage of the network, e.g.  the identity function
+    * for regression or the sigmoid transformation for two-class
+    * classification.
+    */
+   ///@{
+   static void Sigmoid(TMatrixT<Real_t> &YHat,
+                        const TMatrixT<Real_t> & );
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Regularization
+   //____________________________________________________________________________
+
+   /** @name Regularization
+    * For each regularization type two functions are required, one named
+    * <tt><Type>Regularization</tt> that evaluates the corresponding
+    * regularization functional for a given weight matrix and the
+    * <tt>Add<Type>RegularizationGradients</tt>, that adds the regularization
+    * component in the gradients to the provided matrix.
+    */
+   ///@{
+
+   static Real_t L1Regularization(const TMatrixT<Real_t> & W);
+   static void AddL1RegularizationGradients(TMatrixT<Real_t> & A,
+                                            const TMatrixT<Real_t> & W,
+                                            Real_t weightDecay);
+
+   static Real_t L2Regularization(const TMatrixT<Real_t> & W);
+   static void AddL2RegularizationGradients(TMatrixT<Real_t> & A,
+                                            const TMatrixT<Real_t> & W,
+                                            Real_t weightDecay);
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Initialization
+   //____________________________________________________________________________
+
+   /** @name Initialization
+    * For each initialization method, one function in the low-level interface
+    * is provided. The naming scheme is <tt>Initialize<Type></tt> for a given
+    * initialization method Type.
+    */
+   ///@{
+
+   static void InitializeGauss(TMatrixT<Real_t> & A);
+
+   static void InitializeUniform(TMatrixT<Real_t> & A);
+
+   static void InitializeIdentity(TMatrixT<Real_t> & A);
+
+   static void InitializeZero(TMatrixT<Real_t> & A);
+
+   ///@}
+
+   //____________________________________________________________________________
+   //
+   // Dropout
+   //____________________________________________________________________________
+
+   /** @name Dropout
+    */
+   ///@{
+
+   /** Apply dropout with activation probability \p dropoutProbability to the
+    *  given matrix \p A and scale the result by the reciprocal of that
+    *  probability. */
+   static void Dropout(TMatrixT<Real_t> & A, Real_t dropoutProbability);
+
+   ///@}
+};
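+
+// Usage sketch (illustrative only): a single forward step with the reference
+// architecture. Dimensions follow the conventions used by the layer classes:
+// output is (batchSize x width), weights is (width x inputWidth).
+//
+//    using Arch = TMVA::DNN::TReference<Double_t>;
+//    TMatrixT<Double_t> input(4, 3), weights(2, 3), biases(2, 1), output(4, 2);
+//    Arch::MultiplyTranspose(output, input, weights); // output = input * weights^T
+//    Arch::AddRowWise(output, biases);                // add bias to each row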
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/DataLoader.h b/tmva/tmva/inc/TMVA/DNN/DataLoader.h
new file mode 100644
index 0000000000000000000000000000000000000000..a3a960fe3514d461905d84f6abf8d2f0a16c9d97
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/DataLoader.h
@@ -0,0 +1,260 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 08/08/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////////
+// Generic data loader for neural network input data. Provides a   //
+// high level abstraction for the transfer of training data to the //
+// device.                                                         //
+/////////////////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_DATALOADER
+#define TMVA_DNN_DATALOADER
+
+#include "TMatrix.h"
+#include <vector>
+#include <iostream>
+#include <algorithm> // std::random_shuffle, used in TDataLoader::Shuffle
+#include <utility>   // std::pair, used for MatrixInput_t
+
+#include "TMVA/Event.h"
+
+namespace TMVA {
+namespace DNN  {
+
+//
+// Input Data Types
+//______________________________________________________________________________
+using MatrixInput_t    = std::pair<const TMatrixT<Double_t> &,
+                                   const TMatrixT<Double_t> &>;
+using TMVAInput_t      = std::vector<Event*>;
+
+using IndexIterator_t = typename std::vector<size_t>::iterator;
+
+/** TBatch
+ *
+ * Class representing training batches consisting of a matrix of input data
+ * and a matrix of output data. The input and output data can be accessed using
+ * the GetInput() and GetOutput() member functions.
+ *
+ * \tparam AArchitecture The underlying architecture.
+ */
+//______________________________________________________________________________
+template <typename AArchitecture>
+class TBatch
+{
+private:
+
+   using Matrix_t       = typename AArchitecture::Matrix_t;
+
+   Matrix_t fInputMatrix;
+   Matrix_t fOutputMatrix;
+
+public:
+
+   TBatch(Matrix_t &, Matrix_t &);
+   TBatch(const TBatch  &) = default;
+   TBatch(      TBatch &&) = default;
+   TBatch & operator=(const TBatch  &) = default;
+   TBatch & operator=(      TBatch &&) = default;
+
+   /** Return the matrix representing the input data. */
+   Matrix_t & GetInput()  {return fInputMatrix;}
+   /** Return the matrix representing the output data. */
+   Matrix_t & GetOutput() {return fOutputMatrix;}
+};
+
+template<typename Data_t, typename AArchitecture> class TDataLoader;
+
+/** TBatchIterator
+ *
+ * Simple iterator class for the iterations over the training batches in
+ * a given data set represented by a TDataLoader object.
+ *
+ * \tparam AData         The input data type.
+ * \tparam AArchitecture The underlying architecture type.
+ */
+template<typename Data_t, typename AArchitecture>
+class TBatchIterator
+{
+private:
+
+   TDataLoader<Data_t, AArchitecture> & fDataLoader;
+   size_t fBatchIndex;
+
+public:
+
+   TBatchIterator(TDataLoader<Data_t, AArchitecture> & dataLoader, size_t index = 0)
+      : fDataLoader(dataLoader), fBatchIndex(index)
+   {
+      // Nothing to do here.
+   }
+
+   TBatch<AArchitecture> operator*() {return fDataLoader.GetBatch();}
+   TBatchIterator operator++() {fBatchIndex++; return *this;}
+   bool operator!=(const TBatchIterator & other) {
+      return fBatchIndex != other.fBatchIndex;
+   }
+};
+
+/** TDataLoader
+ *
+ * Service class managing the streaming of the training data from the input data
+ * type to the accelerator device or the CPU. A TDataLoader object manages a number
+ * of host and device buffer pairs that are used in a round-robin manner for the
+ * transfer of batches to the device.
+ *
+ * Each TDataLoader object has an associated batch size and a number of total
+ * samples in the dataset. One epoch is the number of buffers required to transfer
+ * the complete training set. Using the begin() and end() member functions allows
+ * the user to iterate over the batches in one epoch.
+ *
+ * \tparam AData The input data type.
+ * \tparam AArchitecture The architecture class of the underlying architecture.
+ */
+template<typename Data_t, typename AArchitecture>
+class TDataLoader
+{
+private:
+
+   using HostBuffer_t    = typename AArchitecture::HostBuffer_t;
+   using DeviceBuffer_t  = typename AArchitecture::DeviceBuffer_t;
+   using Matrix_t        = typename AArchitecture::Matrix_t;
+   using BatchIterator_t = TBatchIterator<Data_t, AArchitecture>;
+
+   const Data_t  & fData;
+
+   size_t fNSamples;
+   size_t fBatchSize;
+   size_t fNInputFeatures;
+   size_t fNOutputFeatures;
+   size_t fBatchIndex;
+
+   size_t fNStreams;                            ///< Number of buffer pairs.
+   std::vector<DeviceBuffer_t> fDeviceBuffers;
+   std::vector<HostBuffer_t>   fHostBuffers;
+
+   std::vector<size_t> fSampleIndices; ///< Ordering of the samples in the epoch.
+
+public:
+
+   TDataLoader(const Data_t & data, size_t nSamples, size_t batchSize,
+               size_t nInputFeatures, size_t nOutputFeatures, size_t nStreams = 1);
+   TDataLoader(const TDataLoader  &) = default;
+   TDataLoader(      TDataLoader &&) = default;
+   TDataLoader & operator=(const TDataLoader  &) = default;
+   TDataLoader & operator=(      TDataLoader &&) = default;
+
+   /** Copy input matrix into the given host buffer. Function to be specialized by
+    *  the architecture-specific backend. */
+   void  CopyInput(HostBuffer_t &buffer, IndexIterator_t begin, size_t batchSize);
+   /** Copy output matrix into the given host buffer. Function to be specialized
+    * by the architecture-specific backend. */
+   void CopyOutput(HostBuffer_t &buffer, IndexIterator_t begin, size_t batchSize);
+
+   BatchIterator_t begin() {return TBatchIterator<Data_t, AArchitecture>(*this);}
+   BatchIterator_t end()
+   {
+      return TBatchIterator<Data_t, AArchitecture>(*this, fNSamples / fBatchSize);
+   }
+
+   /** Shuffle the order of the samples in the data set. The shuffling is
+    *  indirect, i.e. only the indices are shuffled. No input data is moved by
+    *  this routine. */
+   void Shuffle();
+
+   /** Return the next batch from the training set. The TDataLoader object
+    *  keeps an internal counter that cycles over the batches in the training
+    *  set. */
+   TBatch<AArchitecture> GetBatch();
+
+};
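+
+// Usage sketch (illustrative only): iterating over one epoch of batches.
+// `Arch` stands for any architecture class providing Matrix_t, HostBuffer_t
+// and DeviceBuffer_t; `data`, `nSamples` etc. are placeholders.
+//
+//    TDataLoader<MatrixInput_t, Arch> loader(data, nSamples, batchSize,
+//                                            nInputFeatures, nOutputFeatures);
+//    loader.Shuffle();                      // reshuffle the sample order
+//    for (auto batch : loader) {            // one pass over all batches = one epoch
+//       auto &X = batch.GetInput();         // batchSize x nInputFeatures
+//       auto &Y = batch.GetOutput();        // batchSize x nOutputFeatures
+//    }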
+
+//
+// TBatch Class.
+//______________________________________________________________________________
+template<typename AArchitecture>
+TBatch<AArchitecture>::TBatch(Matrix_t & inputMatrix, Matrix_t & outputMatrix)
+    : fInputMatrix(inputMatrix), fOutputMatrix(outputMatrix)
+{
+    // Nothing to do here.
+}
+
+//
+// TDataLoader Class.
+//______________________________________________________________________________
+template<typename Data_t, typename AArchitecture>
+TDataLoader<Data_t, AArchitecture>::TDataLoader(
+    const Data_t & data, size_t nSamples, size_t batchSize,
+    size_t nInputFeatures, size_t nOutputFeatures, size_t nStreams)
+    : fData(data), fNSamples(nSamples), fBatchSize(batchSize),
+      fNInputFeatures(nInputFeatures), fNOutputFeatures(nOutputFeatures),
+      fBatchIndex(0), fNStreams(nStreams), fDeviceBuffers(), fHostBuffers(),
+      fSampleIndices()
+{
+   size_t inputMatrixSize  = fBatchSize * fNInputFeatures;
+   size_t outputMatrixSize = fBatchSize * fNOutputFeatures;
+
+   for (size_t i = 0; i < fNStreams; i++)
+   {
+      fHostBuffers.push_back(HostBuffer_t(inputMatrixSize + outputMatrixSize));
+      fDeviceBuffers.push_back(DeviceBuffer_t(inputMatrixSize + outputMatrixSize));
+   }
+
+   fSampleIndices.reserve(fNSamples);
+   for (size_t i = 0; i < fNSamples; i++) {
+      fSampleIndices.push_back(i);
+   }
+}
+
+//______________________________________________________________________________
+template<typename Data_t, typename AArchitecture>
+TBatch<AArchitecture> TDataLoader<Data_t, AArchitecture>::GetBatch()
+{
+   fBatchIndex %= (fNSamples / fBatchSize); // Cycle through samples.
+
+
+   size_t inputMatrixSize  = fBatchSize * fNInputFeatures;
+   size_t outputMatrixSize = fBatchSize * fNOutputFeatures;
+
+   size_t streamIndex = fBatchIndex % fNStreams;
+   HostBuffer_t   & hostBuffer   = fHostBuffers[streamIndex];
+   DeviceBuffer_t & deviceBuffer = fDeviceBuffers[streamIndex];
+
+   HostBuffer_t inputHostBuffer  = hostBuffer.GetSubBuffer(0, inputMatrixSize);
+   HostBuffer_t outputHostBuffer = hostBuffer.GetSubBuffer(inputMatrixSize,
+                                                           outputMatrixSize);
+
+   DeviceBuffer_t inputDeviceBuffer  = deviceBuffer.GetSubBuffer(0, inputMatrixSize);
+   DeviceBuffer_t outputDeviceBuffer = deviceBuffer.GetSubBuffer(inputMatrixSize,
+                                                                 outputMatrixSize);
+   size_t sampleIndex = fBatchIndex * fBatchSize;
+   IndexIterator_t sampleIndexIterator = fSampleIndices.begin() + sampleIndex;
+
+   CopyInput(inputHostBuffer,   sampleIndexIterator, fBatchSize);
+   CopyOutput(outputHostBuffer, sampleIndexIterator, fBatchSize);
+
+   deviceBuffer.CopyFrom(hostBuffer);
+   Matrix_t  inputMatrix(inputDeviceBuffer,  fBatchSize, fNInputFeatures);
+   Matrix_t outputMatrix(outputDeviceBuffer, fBatchSize, fNOutputFeatures);
+
+   fBatchIndex++;
+   return TBatch<AArchitecture>(inputMatrix, outputMatrix);
+}
+
+//______________________________________________________________________________
+template<typename Data_t, typename AArchitecture>
+void TDataLoader<Data_t, AArchitecture>::Shuffle()
+{
+   std::random_shuffle(fSampleIndices.begin(), fSampleIndices.end());
+}
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Functions.h b/tmva/tmva/inc/TMVA/DNN/Functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e2f09da46dff9cde43a7c01d94793383a272ed9
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Functions.h
@@ -0,0 +1,266 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 20/06/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////////
+// Contains function enums for activation and output functions, as //
+// well as generic evaluation functions that delegate the call to  //
+// the corresponding evaluation kernel.                            //
+/////////////////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_FUNCTIONS
+#define TMVA_DNN_FUNCTIONS
+
+namespace TMVA
+{
+namespace DNN
+{
+//______________________________________________________________________________
+//
+//  Enum Definitions
+//______________________________________________________________________________
+
+/*! Enum that represents layer activation functions. */
+enum class EActivationFunction
+{
+   kIdentity = 0,
+   kRelu     = 1,
+   kSigmoid  = 2,
+   kTanh     = 3,
+   kSymmRelu = 4,
+   kSoftSign = 5,
+   kGauss    = 6
+};
+
+/*! Enum that represents output functions */
+enum class EOutputFunction
+{
+   kIdentity = 'I',
+   kSigmoid  = 'S'
+};
+
+/*! Enum that represents objective functions for the net, i.e. functions
+*  that take the output from the last layer in the net together with the
+*  truths and return the objective function value that is to be minimized
+*  in the training process. */
+enum class ELossFunction
+{
+    kCrossEntropy     = 'C',
+    kMeanSquaredError = 'R'
+};
+
+/*! Enum representing the regularization type applied for a given layer */
+enum class ERegularization
+{
+    kNone = '0',
+    kL1   = '1',
+    kL2   = '2'
+};
+
+/* Enum that represents the initialization method used for this layer. */
+enum class EInitialization {
+    kGauss    = 'G',
+    kUniform  = 'U',
+    kIdentity = 'I',
+    kZero = 'Z'
+};
+
+//______________________________________________________________________________
+//
+//  Activation Functions
+//______________________________________________________________________________
+
+/*! Apply the given activation function to each value in the given
+*  matrix A. */
+template<typename Architecture_t>
+inline void evaluate(typename Architecture_t::Matrix_t &A,
+                    EActivationFunction f)
+{
+    switch(f)
+    {
+    case EActivationFunction::kIdentity : break;
+    case EActivationFunction::kRelu :     Architecture_t::Relu(A);
+        break;
+    case EActivationFunction::kSigmoid  :  Architecture_t::Sigmoid(A);
+        break;
+    case EActivationFunction::kTanh     :  Architecture_t::Tanh(A);
+        break;
+    case EActivationFunction::kSymmRelu :  Architecture_t::SymmetricRelu(A);
+        break;
+    case EActivationFunction::kSoftSign :  Architecture_t::SoftSign(A);
+        break;
+    case EActivationFunction::kGauss    :  Architecture_t::Gauss(A);
+        break;
+    }
+}
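+
+// Usage sketch (illustrative only): applying an activation function in place
+// through the generic dispatch, here with the reference architecture as the
+// backend; any class implementing the low-level interface works the same way.
+//
+//    using Arch = TMVA::DNN::TReference<Double_t>;
+//    Arch::Matrix_t A(4, 8);
+//    evaluate<Arch>(A, EActivationFunction::kRelu);   // A <- ReLU(A) element-wise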
+
+
+/*! Compute the first partial derivative of the activation function for
+*  the values given in matrix A and write the results into B. */
+//______________________________________________________________________________
+template<typename Architecture_t>
+inline void evaluateDerivative(typename Architecture_t::Matrix_t & B,
+                                EActivationFunction f,
+                                const typename Architecture_t::Matrix_t & A)
+{
+    switch(f)
+    {
+    case EActivationFunction::kIdentity : Architecture_t::IdentityDerivative(B, A);
+        break;
+    case EActivationFunction::kRelu     : Architecture_t::ReluDerivative(B, A);
+        break;
+    case EActivationFunction::kSigmoid  : Architecture_t::SigmoidDerivative(B, A);
+        break;
+    case EActivationFunction::kTanh     : Architecture_t::TanhDerivative(B, A);
+        break;
+    case EActivationFunction::kSymmRelu : Architecture_t::SymmetricReluDerivative(B, A);
+        break;
+    case EActivationFunction::kSoftSign : Architecture_t::SoftSignDerivative(B, A);
+        break;
+    case EActivationFunction::kGauss    : Architecture_t::GaussDerivative(B, A);
+        break;
+    }
+}
+
+//______________________________________________________________________________
+//
+//  Output Functions
+//______________________________________________________________________________
+
+/*! Apply the given output function to each value in the given
+*  matrix A. */
+template<typename Architecture_t>
+inline void evaluate(typename Architecture_t::Matrix_t &A,
+                    EOutputFunction f,
+                    const typename Architecture_t::Matrix_t &X)
+{
+    switch(f)
+    {
+    case EOutputFunction::kIdentity : Architecture_t::Copy(A, X);
+                                      break;
+    case EOutputFunction::kSigmoid  : Architecture_t::Sigmoid(A, X);
+                                      break;
+    }
+}
+
+//______________________________________________________________________________
+//
+//  Loss Functions
+//______________________________________________________________________________
+
+/*! Compute the value of the objective function f for given activations
+*  of the output layer and the truth Y. */
+template<typename Architecture_t>
+inline auto evaluate(ELossFunction f,
+                    const typename Architecture_t::Matrix_t & Y,
+                    const typename Architecture_t::Matrix_t & output)
+-> decltype(Architecture_t::CrossEntropy(Y,output))
+{
+    switch(f)
+    {
+    case ELossFunction::kCrossEntropy :
+        return Architecture_t::CrossEntropy(Y, output);
+    case ELossFunction::kMeanSquaredError :
+        return Architecture_t::MeanSquaredError(Y, output);
+    }
+    return 0.0;
+}
+
+/*! Compute the gradient of the given loss function f for the given activations
+*  output of the output layer and truth Y and write the results into dY. */
+//______________________________________________________________________________
+template<typename Architecture_t>
+inline void evaluateGradients(typename Architecture_t::Matrix_t & dY,
+                                ELossFunction f,
+                                const typename Architecture_t::Matrix_t &Y,
+                                const typename Architecture_t::Matrix_t &output)
+{
+    switch(f)
+    {
+    case ELossFunction::kCrossEntropy :
+        Architecture_t::CrossEntropyGradients(dY, Y, output);
+        break;
+    case ELossFunction::kMeanSquaredError :
+        Architecture_t::MeanSquaredErrorGradients(dY, Y, output);
+        break;
+    }
+}
+
+
+//______________________________________________________________________________
+//
+// Regularization
+//______________________________________________________________________________
+
+/*! Evaluate the regularization functional for a given weight matrix. */
+template<typename Architecture_t>
+inline auto regularization(const typename Architecture_t::Matrix_t &A,
+                    ERegularization R)
+-> decltype(Architecture_t::L1Regularization(A))
+{
+    switch(R)
+    {
+    case ERegularization::kNone :
+        return 0.0;
+    case ERegularization::kL1 :
+        return Architecture_t::L1Regularization(A);
+    case ERegularization::kL2 :
+        return Architecture_t::L2Regularization(A);
+    }
+    return 0.0;
+}
+
+/*! Add the regularization gradient corresponding to weight matrix W, to
+*  the matrix A. */
+//______________________________________________________________________________
+template<typename Architecture_t>
+inline void addRegularizationGradients(typename Architecture_t::Matrix_t &A,
+                                       const typename Architecture_t::Matrix_t &W,
+                                       typename Architecture_t::Scalar_t weightDecay,
+                                       ERegularization R)
+{
+    switch(R)
+    {
+    case ERegularization::kNone :
+        break;
+    case ERegularization::kL1 :
+        Architecture_t::AddL1RegularizationGradients(A, W, weightDecay);
+        break;
+    case ERegularization::kL2 :
+        Architecture_t::AddL2RegularizationGradients(A, W, weightDecay);
+        break;
+    }
+}
+
+//______________________________________________________________________________
+//
+// Initialization
+//______________________________________________________________________________
+
+template<typename Architecture_t>
+inline void initialize(typename Architecture_t::Matrix_t & A,
+                       EInitialization m)
+{
+   switch(m) {
+   case EInitialization::kGauss    : Architecture_t::InitializeGauss(A);
+       break;
+   case EInitialization::kUniform  : Architecture_t::InitializeUniform(A);
+       break;
+   case EInitialization::kIdentity : Architecture_t::InitializeIdentity(A);
+       break;
+   case EInitialization::kZero     : Architecture_t::InitializeZero(A);
+       break;
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Layer.h b/tmva/tmva/inc/TMVA/DNN/Layer.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbf1b69ce75437e1b888aab67ff18280037e1d6c
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Layer.h
@@ -0,0 +1,388 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 20/06/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Contains Layer and SharedLayer classes that represent layers in  //
+// neural networks.                                                 //
+//////////////////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_LAYER
+#define TMVA_DNN_LAYER
+
+#include <iostream>
+
+#include "TMatrix.h"
+#include "Functions.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+//
+//  The Layer Class
+//______________________________________________________________________________
+
+/** \class TLayer
+
+    Generic layer class.
+
+    This generic layer class represents a layer of a neural network with
+    a given width n and activation function f. The activations of the layer
+    are computed as \f$f(\mathbf{u})\f$ with pre-activations \f$\mathbf{u} =
+    \mathbf{W}\mathbf{x} + \boldsymbol{\theta}\f$.
+
+    In addition to the weight and bias matrices, each layer allocates memory
+    for its activations and the corresponding first partial fDerivatives of
+    the activation function as well as the gradients of the fWeights and fBiases.
+
+    The layer provides member functions for the forward propagation of
+    activations through the given layer.
+*/
+template<typename Architecture_t>
+   class TLayer
+{
+
+public:
+   using Scalar_t = typename Architecture_t::Scalar_t;
+   using Matrix_t = typename Architecture_t::Matrix_t;
+
+private:
+
+   size_t fBatchSize;  ///< Batch size used for training and evaluation.
+   size_t fInputWidth; ///< Number of neurons of the previous layer.
+   size_t fWidth;      ///< Number of neurons of this layer.
+
+   Scalar_t fDropoutProbability;  ///< Probability that an input is active.
+
+   Matrix_t fWeights;             ///< The fWeights of this layer.
+   Matrix_t fBiases;              ///< The bias values of this layer.
+   Matrix_t fOutput;              ///< Activations of this layer.
+   Matrix_t fDerivatives;         ///< First fDerivatives of the activations of this layer.
+   Matrix_t fWeightGradients;     ///< Gradients w.r.t. the weights of this layer.
+   Matrix_t fBiasGradients;       ///< Gradients w.r.t. the bias values of this layer.
+   Matrix_t fActivationGradients; ///< Gradients w.r.t. the activations of this layer.
+
+   EActivationFunction fF; ///< Activation function of the layer.
+
+public:
+
+   TLayer(size_t             BatchSize,
+          size_t             InputWidth,
+          size_t             Width,
+          EActivationFunction f,
+          Scalar_t           dropoutProbability);
+   TLayer(const TLayer &);
+
+   /*! Initialize fWeights according to the given initialization
+    *  method. */
+   void Initialize(EInitialization m);
+   /*! Compute activation of the layer for the given input. The input
+    * must be in matrix form with the different rows corresponding to
+    * different events in the batch. Computes activations as well as
+    * the first partial derivative of the activation function at those
+    * activations. */
+   void inline Forward(Matrix_t & input, bool applyDropout = false);
+   /*! Compute weight, bias and activation gradients. Uses the precomputed
+    *  first partial derivatives of the activation function computed during
+    *  forward propagation and modifies them. Must only be called directly
+    *  after the corresponding call to Forward(...). */
+   void inline Backward(Matrix_t & gradients_backward,
+                        const Matrix_t & activations_backward,
+                        ERegularization r,
+                        Scalar_t weightDecay);
+
+   void Print() const;
+
+   size_t GetBatchSize()          const {return fBatchSize;}
+   size_t GetInputWidth()         const {return fInputWidth;}
+   size_t GetWidth()              const {return fWidth;}
+   size_t GetDropoutProbability() const {return fDropoutProbability;}
+
+   void SetDropoutProbability(Scalar_t p) {fDropoutProbability = p;}
+
+   EActivationFunction GetActivationFunction() const {return fF;}
+
+   Matrix_t       & GetOutput()        {return fOutput;}
+   const Matrix_t & GetOutput() const  {return fOutput;}
+   Matrix_t       & GetWeights()       {return fWeights;}
+   const Matrix_t & GetWeights() const {return fWeights;}
+   Matrix_t       & GetBiases()       {return fBiases;}
+   const Matrix_t & GetBiases() const {return fBiases;}
+   Matrix_t       & GetActivationGradients()       {return fActivationGradients;}
+   const Matrix_t & GetActivationGradients() const {return fActivationGradients;}
+   Matrix_t       & GetBiasGradients()       {return fBiasGradients;}
+   const Matrix_t & GetBiasGradients() const {return fBiasGradients;}
+   Matrix_t       & GetWeightGradients()       {return fWeightGradients;}
+   const Matrix_t & GetWeightGradients() const {return fWeightGradients;}
+
+};
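+
+// Usage sketch (illustrative only): building a single layer and running a
+// forward pass with the reference architecture; all values are placeholders.
+//
+//    using Arch = TMVA::DNN::TReference<Double_t>;
+//    TLayer<Arch> layer(/*batchSize*/ 32, /*inputWidth*/ 10, /*width*/ 5,
+//                       EActivationFunction::kTanh, /*dropoutProbability*/ 1.0);
+//    layer.Initialize(EInitialization::kGauss);
+//    Arch::Matrix_t input(32, 10);
+//    layer.Forward(input);                  // fills layer.GetOutput() (32 x 5)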
+
+//______________________________________________________________________________
+//
+//  The Shared Layer Class
+//______________________________________________________________________________
+
+/** \class TSharedLayer
+
+    Layer class with shared weight and bias matrices.
+
+    Same as the TLayer class, except that the weight and bias matrices are
+    shared between different instances of the net, which can be used to
+    implement 'Hogwild'-style multithreaded training.
+*/
+
+template<typename Architecture_t>
+class TSharedLayer
+{
+
+public:
+
+   using Scalar_t = typename Architecture_t::Scalar_t;
+   using Matrix_t = typename Architecture_t::Matrix_t;
+
+private:
+
+   size_t fBatchSize;  ///< Batch size used for training and evaluation.
+   size_t fInputWidth; ///< Number of neurons of the previous layer.
+   size_t fWidth;      ///< Number of neurons of this layer.
+
+   Scalar_t fDropoutProbability;  ///< Probability that an input is active.
+
+   Matrix_t & fWeights;           ///< Reference to the weight matrix of this layer.
+   Matrix_t & fBiases;            ///< Reference to the bias vectors of this layer.
+   Matrix_t fOutput;              ///< Activations of this layer.
+   Matrix_t fDerivatives;         ///< First fDerivatives of the activations of this layer.
+   Matrix_t fWeightGradients;     ///< Gradients w.r.t. the weights of this layer.
+   Matrix_t fBiasGradients;       ///< Gradients w.r.t. the bias values of this layer.
+   Matrix_t fActivationGradients; ///< Gradients w.r.t. the activations of this layer.
+
+   EActivationFunction fF; ///< Activation function of the layer.
+
+public:
+
+   TSharedLayer(size_t fBatchSize,
+                TLayer<Architecture_t> & layer);
+   TSharedLayer(const TSharedLayer & layer);
+
+   /*! Compute activation of the layer for the given input. The input
+    * must be in matrix form with the different rows corresponding to
+    * different events in the batch. Computes activations as well as
+    * the first partial derivative of the activation function at those
+    * activations. */
+   void inline Forward(Matrix_t & input, bool applyDropout = false);
+   /*! Compute weight, bias and activation gradients. Uses the precomputed
+    *  first partial derivatives of the activation function computed during
+    *  forward propagation and modifies them. Must only be called directly
+    *  after the corresponding call to Forward(...). */
+   void inline Backward(Matrix_t & gradients_backward,
+                        const Matrix_t & activations_backward,
+                        ERegularization r,
+                        Scalar_t weightDecay);
+
+   void Print() const;
+
+   size_t GetBatchSize()          const {return fBatchSize;}
+   size_t GetInputWidth()         const {return fInputWidth;}
+   size_t GetWidth()              const {return fWidth;}
+   size_t GetDropoutProbability() const {return fDropoutProbability;}
+
+   void SetDropoutProbability(Scalar_t p) {fDropoutProbability = p;}
+
+   EActivationFunction GetActivationFunction() const {return fF;}
+
+   Matrix_t       & GetOutput()        {return fOutput;}
+   const Matrix_t & GetOutput() const  {return fOutput;}
+   Matrix_t       & GetWeights() const {return fWeights;}
+   Matrix_t       & GetBiases()       {return fBiases;}
+   const Matrix_t & GetBiases() const {return fBiases;}
+   Matrix_t       & GetActivationGradients()       {return fActivationGradients;}
+   const Matrix_t & GetActivationGradients() const {return fActivationGradients;}
+   Matrix_t       & GetBiasGradients()       {return fBiasGradients;}
+   const Matrix_t & GetBiasGradients() const {return fBiasGradients;}
+   Matrix_t       & GetWeightGradients()       {return fWeightGradients;}
+   const Matrix_t & GetWeightGradients() const {return fWeightGradients;}
+
+};
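+
+// Usage sketch (illustrative only): sharing the weights of a master layer with
+// a per-thread worker layer, as used for 'Hogwild'-style training; `Arch` is a
+// placeholder architecture type.
+//
+//    TLayer<Arch>       master(32, 10, 5, EActivationFunction::kRelu, 1.0);
+//    TSharedLayer<Arch> worker(/*batchSize*/ 32, master);  // references master's
+//                                                          // weight & bias matrices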
+
+//______________________________________________________________________________
+//
+//  The Layer Class - Implementation
+//______________________________________________________________________________
+
+template<typename Architecture_t>
+   TLayer<Architecture_t>::TLayer(size_t batchSize,
+                                  size_t inputWidth,
+                                  size_t width,
+                                  EActivationFunction f,
+                                  Scalar_t dropoutProbability)
+   : fBatchSize(batchSize), fInputWidth(inputWidth), fWidth(width),
+     fDropoutProbability(dropoutProbability), fWeights(width, fInputWidth),
+     fBiases(width, 1), fOutput(fBatchSize, width), fDerivatives(fBatchSize, width),
+     fWeightGradients(width, fInputWidth), fBiasGradients(width, 1),
+     fActivationGradients(fBatchSize, width), fF(f)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+TLayer<Architecture_t>::TLayer(const TLayer &layer)
+    : fBatchSize(layer.fBatchSize),
+    fInputWidth(layer.fInputWidth), fWidth(layer.fWidth),
+    fWeights(layer.fWidth, layer.fInputWidth), fBiases(layer.fWidth, 1),
+    fOutput(layer.fBatchSize, layer.fWidth),
+    fDerivatives(layer.fBatchSize, layer.fWidth),
+    fWeightGradients(layer.fWidth, layer.fInputWidth),
+    fBiasGradients(layer.fWidth, 1),
+    fActivationGradients(layer.fBatchSize, layer.fWidth),
+    fF(layer.fF)
+{
+   Architecture_t::Copy(fWeights, layer.GetWeights());
+   Architecture_t::Copy(fBiases,  layer.GetBiases());
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+auto TLayer<Architecture_t>::Initialize(EInitialization m)
+-> void
+{
+   initialize<Architecture_t>(fWeights, m);
+   initialize<Architecture_t>(fBiases,  EInitialization::kZero);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+auto inline TLayer<Architecture_t>::Forward(Matrix_t & input,
+                                            bool applyDropout)
+-> void
+{
+   if (applyDropout && (fDropoutProbability != 1.0)) {
+      Architecture_t::Dropout(input, fDropoutProbability);
+   }
+   Architecture_t::MultiplyTranspose(fOutput, input, fWeights);
+   Architecture_t::AddRowWise(fOutput, fBiases);
+   evaluateDerivative<Architecture_t>(fDerivatives, fF, fOutput);
+   evaluate<Architecture_t>(fOutput, fF);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+auto TLayer<Architecture_t>::Backward(Matrix_t & gradients_backward,
+                                    const Matrix_t & activations_backward,
+                                    ERegularization r,
+                                    Scalar_t weightDecay)
+-> void
+{
+   Architecture_t::Backward(gradients_backward,
+                            fWeightGradients,
+                            fBiasGradients,
+                            fDerivatives,
+                            fActivationGradients,
+                            fWeights,
+                            activations_backward);
+   addRegularizationGradients<Architecture_t>(fWeightGradients,
+                                              fWeights,
+                                              weightDecay, r);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+   void TLayer<Architecture_t>::Print() const
+{
+   std::cout << "Width = " << fWeights.GetNrows();
+   std::cout << ", Activation Function = ";
+   std::cout << static_cast<int>(fF) << std::endl;
+}
+
+//______________________________________________________________________________
+//
+//  The Shared Layer Class - Implementation
+//______________________________________________________________________________
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+TSharedLayer<Architecture_t>::TSharedLayer(size_t BatchSize,
+                                         TLayer<Architecture_t> &layer)
+: fBatchSize(BatchSize),
+fInputWidth(layer.GetInputWidth()), fWidth(layer.GetWidth()),
+fDropoutProbability(layer.GetDropoutProbability()),
+fWeights(layer.GetWeights()), fBiases(layer.GetBiases()),
+fOutput(fBatchSize, fWidth), fDerivatives(fBatchSize, fWidth),
+fWeightGradients(fWidth, fInputWidth), fBiasGradients(fWidth, 1),
+fActivationGradients(fBatchSize, fWidth), fF(layer.GetActivationFunction())
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+TSharedLayer<Architecture_t>::TSharedLayer(const TSharedLayer &layer)
+    : fBatchSize(layer.fBatchSize),
+    fInputWidth(layer.GetInputWidth()), fWidth(layer.GetWidth()),
+    fWeights(layer.fWeights), fBiases(layer.fBiases),
+    fOutput(layer.fBatchSize, fWidth), fDerivatives(layer.fBatchSize, fWidth),
+    fWeightGradients(fWidth, fInputWidth), fBiasGradients(fWidth, 1),
+    fActivationGradients(layer.fBatchSize, fWidth),
+    fF(layer.fF)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+auto inline TSharedLayer<Architecture_t>::Forward(Matrix_t & input,
+                                                  bool applyDropout)
+-> void
+{
+   if (applyDropout && (fDropoutProbability != 1.0)) {
+      Architecture_t::Dropout(input, fDropoutProbability);
+   }
+   Architecture_t::MultiplyTranspose(fOutput, input, fWeights);
+   Architecture_t::AddRowWise(fOutput, fBiases);
+   evaluateDerivative<Architecture_t>(fDerivatives, fF, fOutput);
+   evaluate<Architecture_t>(fOutput, fF);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+auto inline TSharedLayer<Architecture_t>::Backward(Matrix_t & gradients_backward,
+                                                 const Matrix_t & activations_backward,
+                                                 ERegularization r,
+                                                 Scalar_t weightDecay)
+-> void
+{
+   Architecture_t::Backward(gradients_backward,
+                            fWeightGradients,
+                            fBiasGradients,
+                            fDerivatives,
+                            fActivationGradients,
+                            fWeights,
+                            activations_backward);
+   addRegularizationGradients<Architecture_t>(fWeightGradients,
+                                              fWeights,
+                                              weightDecay, r);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+void TSharedLayer<Architecture_t>::Print() const
+{
+   std::cout << "Width = " << fWeights.GetNrows();
+   std::cout << ", Activation Function = ";
+   std::cout << static_cast<int>(fF) << std::endl;
+}
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Minimizers.h b/tmva/tmva/inc/TMVA/DNN/Minimizers.h
new file mode 100644
index 0000000000000000000000000000000000000000..561ecc0bdf802158c6744b49c1402beedbd5605a
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Minimizers.h
@@ -0,0 +1,748 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh 21/06/16
+
+/*************************************************************************
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+#ifndef TMVA_DNN_MINIMIZERS
+#define TMVA_DNN_MINIMIZERS
+
+#include "DataLoader.h"
+#include "Functions.h"
+#include <chrono>
+
+namespace TMVA {
+namespace DNN {
+
+//______________________________________________________________________________
+//
+// Generic Gradient Descent Class
+//______________________________________________________________________________
+//
+
+/** \class TGradientDescent
+*
+*   Generic implementation of gradient descent minimization.
+*
+*   The TGradientDescent class implements an architecture and input data
+*   independent implementation of the gradient descent minimization algorithm.
+*
+*   Provides Train(...) and TrainMomentum(...) functions that perform a complete
+*   training of a neural network. Those are mainly used for testing, since for
+*   production a more fine-grained control of the training process is desirable.
+*   This is provided by the Step(...), StepMomentum(...) and StepNesterov(...)
+*   functions that perform a single minimization step.
+*
+*   The main training characteristics are defined by the provided learning rate,
+*   the test interval and the number of convergence steps required for
+*   convergence. The test interval defines how often the error on the validation
+*   set is computed, and is also the value by which the step counter is increased
+*   each time the HasConverged() member function is called. A convergence step is
+*   defined as a step in which the test error is NOT less than 0.999 times the
+*   current minimal test error that has been reached. If between two subsequent
+*   calls to HasConverged(Double_t) the test error has not been sufficiently
+*   reduced, it is assumed that a number of convergence steps equal to the test
+*   interval has been performed.
+*
+*/
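+//
+// A minimal usage sketch (illustrative only; `MyArchitecture`, `net` and the
+// data objects are placeholders and not part of this header):
+//
+//    TGradientDescent<MyArchitecture> minimizer(/*learningRate=*/0.01,
+//                                               /*convergenceSteps=*/20,
+//                                               /*testInterval=*/5);
+//    minimizer.Train(trainingData, nTrainingSamples,
+//                    testData, nTestSamples, net, /*nThreads=*/1);
+//
+// For finer-grained control, call Step(...) once per batch and
+// HasConverged(testError) whenever the validation error has been evaluated.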
+template<typename Architecture_t>
+class TGradientDescent
+{
+public:
+   using Scalar_t = typename Architecture_t::Scalar_t;
+   using Matrix_t = typename Architecture_t::Matrix_t;
+
+private:
+   size_t   fBatchSize; ///< Batch size to use for the training.
+   size_t   fStepCount; ///< Number of steps performed in the current
+   ///< training session.
+   size_t   fConvergenceSteps; ///< Number of training epochs without considerable
+   ///< decrease in the test error for convergence.
+   size_t   fConvergenceCount; ///< Current number of training epochs without
+   ///< considerable decrease in the test error.
+   size_t   fTestInterval; ///< Interval for the computation of the test error.
+   Scalar_t fTrainingError;///< Holds the most recently computed training loss.
+   Scalar_t fTestError;    ///< Holds the most recently computed test loss.
+   Scalar_t fLearningRate; ///< Learning rate \f$\alpha\f$
+   Scalar_t fMinimumError; ///< The minimum test error achieved during the
+   ///< current training session.
+
+public:
+   TGradientDescent();
+   TGradientDescent(Scalar_t learningRate,
+                    size_t   convergenceSteps,
+                    size_t   testInterval);
+   /** Reset minimizer object to initial state. Does nothing for this minimizer. */
+   void Reset() {};
+
+   /** Train the given net using the given training input data (events), training
+       output data (labels), test input data (events), test output data (labels). */
+   template <typename Data_t, typename Net_t>
+   Scalar_t Train(const Data_t & TrainingDataIn, size_t nTrainingSamples,
+                  const Data_t & TestDataIn, size_t nTestSamples,
+                  Net_t & net, size_t nThreads = 1);
+
+   /** Same as Train(...) but uses the given momentum.*/
+   template <typename Data_t, typename Net_t>
+   Scalar_t TrainMomentum(const Data_t & TrainingDataIn, size_t nTrainingSamples,
+                          const Data_t & TestDataIn, size_t nTestSamples,
+                          Net_t & net, Scalar_t momentum, size_t nThreads = 1);
+
+   /** Perform a single optimization step on a given batch. Propagates the input
+       matrix foward through the net, evaluates the loss and propagates the gradients
+       backward through the net. The computed gradients are scaled by the learning
+       rate \f$\alpha\f$ and subtracted from the weights and bias values of each
+       layer. */
+   template <typename Net_t>
+   void Step(Net_t &net, Matrix_t &input, const Matrix_t &output);
+
+   /** Same as Step(...) but also evaluate the loss on the given training data.
+    *  Note that this requires synchronization between host and device. */
+   template <typename Net_t>
+   Scalar_t StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output);
+
+   /** Perform multiple optimization steps simultaneously. Performs the
+    *  backprop algorithm on the input batches given in \p batches on
+    *  the neural networks given in \p nets. The forward and backward propagation
+    *  steps are executed in an interleaving manner in order to exploit potential
+    *  batch-level parallelism for asynchronous device calls.
+    */
+   template <typename Net_t>
+   void Step(Net_t &master,
+             std::vector<Net_t> &nets,
+             std::vector<TBatch<Architecture_t>> &batches);
+
+   /** Same as the Step(...) method for multiple batches but uses momentum. */
+   template <typename Net_t>
+   void StepMomentum(Net_t &master,
+                     std::vector<Net_t> &nets,
+                     std::vector<TBatch<Architecture_t>> &batches,
+                     Scalar_t momentum);
+
+   /** Same as the Step(...) method for multiple batches but uses Nesterov
+    *  momentum. */
+   template <typename Net_t>
+   void StepNesterov(Net_t &master,
+                     std::vector<Net_t> &nets,
+                     std::vector<TBatch<Architecture_t>> &batches,
+                     Scalar_t momentum);
+
+   /** Does not evaluate the loss and therefore does not trigger a possible
+    *  synchronization with the device. Trains the weights of each layer, but
+    *  only the bias terms of the first layer, for compatibility with the
+    *  previous implementation. */
+   template <typename Net_t>
+   void StepReducedWeights(Net_t &net, Matrix_t &input, const Matrix_t &output);
+
+   /** Similar to StepReducedWeights(...) but also evaluates the loss. May trigger
+    * synchronization with the device. */
+   template <typename Net_t>
+   Scalar_t StepReducedWeightsLoss(Net_t &net,
+                                   Matrix_t &input,
+                                   const Matrix_t &output);
+   /** Increases the minimization step counter by the test error evaluation
+    *  period and uses the current internal value of the test error to
+    *  determine if the minimization has converged. */
+   bool HasConverged();
+   /** Increases the minimization step counter by the test error evaluation
+    *  period and uses the provided test error value to determine whether
+    *  the minimization has converged. */
+   bool HasConverged(Scalar_t testError);
+
+   size_t   GetConvergenceCount() const {return fConvergenceCount;}
+   size_t   GetConvergenceSteps() const {return fConvergenceSteps;}
+   Scalar_t GetTrainingError() const {return fTrainingError;}
+   Scalar_t GetTestError() const     {return fTestError;}
+   size_t   GetTestInterval() const  {return fTestInterval;}
+
+   void SetConvergenceSteps(size_t steps) {fConvergenceSteps = steps;}
+   void SetTestInterval(size_t interval)  {fTestInterval = interval;}
+   void SetLearningRate(Scalar_t rate)    {fLearningRate = rate;}
+   void SetBatchSize(size_t batchSize)    {fBatchSize    = batchSize;}
+};
+
+//
+// Implementation
+//______________________________________________________________________________
+template<typename Architecture_t>
+    TGradientDescent<Architecture_t>::TGradientDescent()
+   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0),
+     fConvergenceCount(0), fTestInterval(0), fLearningRate(0),
+     fMinimumError(1e100)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate,
+                                                   size_t   convergenceSteps,
+                                                   size_t   testInterval)
+   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps),
+     fConvergenceCount(0), fTestInterval(testInterval), fLearningRate(learningRate),
+     fMinimumError(1e100)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+template <typename Data_t, typename Net_t>
+    auto TGradientDescent<Architecture_t>::Train(const Data_t & trainingData,
+                                                 size_t nTrainingSamples,
+                                                 const Data_t & testData,
+                                                 size_t nTestSamples,
+                                                 Net_t & net,
+                                                 size_t nThreads)
+   -> Scalar_t
+{
+   // Reset iteration state.
+   fMinimumError = 1e100;
+   fConvergenceCount = 0;
+   fStepCount = 0;
+
+   // Prepare training data.
+   bool converged = false;
+
+   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
+                                                   net.GetBatchSize(),
+                                                   net.GetInputWidth(),
+                                                   net.GetOutputWidth(), nThreads);
+   auto testNet = net.CreateClone(nTestSamples);
+   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
+                                                  testNet.GetBatchSize(),
+                                                  testNet.GetInputWidth(),
+                                                  net.GetOutputWidth());
+   std::vector<Net_t> nets{};
+   nets.reserve(nThreads);
+   for (size_t i = 0; i < nThreads; i++) {
+       nets.push_back(net);
+       for (size_t j = 0; j < net.GetDepth(); j++)
+       {
+           auto &masterLayer = net.GetLayer(j);
+           auto &layer = nets.back().GetLayer(j);
+           Architecture_t::Copy(layer.GetWeights(),
+                                masterLayer.GetWeights());
+           Architecture_t::Copy(layer.GetBiases(),
+                                masterLayer.GetBiases());
+       }
+   }
+
+   std::chrono::time_point<std::chrono::system_clock> start, end;
+   start = std::chrono::system_clock::now();
+
+   while (!converged)
+   {
+      fStepCount++;
+
+      trainLoader.Shuffle();
+      std::vector<TBatch<Architecture_t>> batches{};
+      for (size_t i = 0; i < nTrainingSamples / net.GetBatchSize(); i += nThreads) {
+         batches.clear();
+         batches.reserve(nThreads);
+         for (size_t j = 0; j < nThreads; j++) {
+            batches.push_back(trainLoader.GetBatch());
+         }
+         Step(net, nets, batches);
+      }
+
+      // Compute test error.
+      if ((fStepCount % fTestInterval) == 0) {
+
+         end   = std::chrono::system_clock::now();
+         std::chrono::duration<double> elapsed_seconds = end - start;
+         start = std::chrono::system_clock::now();
+         double seconds = elapsed_seconds.count();
+         double batchesInEpoch = (double) (nTrainingSamples / net.GetBatchSize());
+         double nFlops  = batchesInEpoch * fTestInterval;
+         nFlops *= net.GetNFlops();
+         std::cout << "Elapsed time for " << fTestInterval << " Epochs: "
+                   << seconds << " [s] => " << nFlops * 1e-9 / seconds
+                   << " GFlop/s" << std::endl;
+
+         auto b = *testLoader.begin();
+         auto inputMatrix  = b.GetInput();
+         auto outputMatrix = b.GetOutput();
+         fTestError = testNet.Loss(inputMatrix, outputMatrix);
+
+         std::cout << "Step " << fStepCount << ": Test Error = "
+                   << fTestError << std::endl;
+         converged = HasConverged();
+      }
+
+   }
+   return fMinimumError;
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+template <typename Data_t, typename Net_t>
+auto TGradientDescent<Architecture_t>::TrainMomentum(const Data_t & trainingData,
+                                                     size_t nTrainingSamples,
+                                                     const Data_t & testData,
+                                                     size_t nTestSamples,
+                                                     Net_t & net,
+                                                     Scalar_t momentum,
+                                                     size_t nThreads)
+   -> Scalar_t
+{
+   // Reset iteration state.
+   fMinimumError = 1e100;
+   fConvergenceCount = 0;
+   fStepCount = 0;
+
+   // Prepare training data.
+   bool converged = false;
+
+   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
+                                                   net.GetBatchSize(),
+                                                   net.GetInputWidth(),
+                                                   net.GetOutputWidth(), nThreads);
+   auto testNet = net.CreateClone(net.GetBatchSize());
+   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
+                                                  testNet.GetBatchSize(),
+                                                  testNet.GetInputWidth(),
+                                                  net.GetOutputWidth());
+
+   net.InitializeGradients();
+   std::vector<Net_t> nets{};
+   nets.reserve(nThreads);
+   for (size_t i = 0; i < nThreads; i++) {
+       nets.push_back(net);
+       for (size_t j = 0; j < net.GetDepth(); j++)
+       {
+           auto &masterLayer = net.GetLayer(j);
+           auto &layer = nets.back().GetLayer(j);
+           Architecture_t::Copy(layer.GetWeights(),
+                                masterLayer.GetWeights());
+           Architecture_t::Copy(layer.GetBiases(),
+                                masterLayer.GetBiases());
+       }
+   }
+
+   std::chrono::time_point<std::chrono::system_clock> start, end;
+   start = std::chrono::system_clock::now();
+
+   while (!converged)
+   {
+      fStepCount++;
+
+      trainLoader.Shuffle();
+      // Iterate over epoch.
+      std::vector<TBatch<Architecture_t>> batches{};
+      for (size_t i = 0; i < nTrainingSamples / net.GetBatchSize(); i += nThreads) {
+         batches.clear();
+         batches.reserve(nThreads);
+         for (size_t j = 0; j < nThreads; j++) {
+            batches.push_back(trainLoader.GetBatch());
+         }
+         if (momentum != 0.0) {
+            StepMomentum(net, nets, batches, momentum);
+         } else {
+            Step(net, nets, batches);
+         }
+      }
+
+      // Compute test error.
+      if ((fStepCount % fTestInterval) == 0) {
+         fTestError = 0.0;
+         for (size_t i = 0; i < nTestSamples / net.GetBatchSize(); i += nThreads) {
+            auto b = testLoader.GetBatch();
+            auto inputMatrix  = b.GetInput();
+            auto outputMatrix = b.GetOutput();
+            fTestError += testNet.Loss(inputMatrix, outputMatrix);
+         }
+         fTestError /= (Double_t) nTestSamples / net.GetBatchSize();
+         converged = HasConverged();
+      }
+
+   }
+   return fMinimumError;
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+    template <typename Net_t>
+    void inline TGradientDescent<Architecture_t>::Step(Net_t & net,
+                                                       Matrix_t &input,
+                                                       const Matrix_t &output)
+{
+   // The loss is deliberately not evaluated here; use StepLoss(...) when the
+   // training error is needed.
+   net.Forward(input);
+   net.Backward(input, output);
+
+   for (size_t i = 0; i < net.GetDepth(); i++)
+   {
+      auto &layer = net.GetLayer(i);
+      Architecture_t::ScaleAdd(layer.GetWeights(),
+                               layer.GetWeightGradients(),
+                               -fLearningRate);
+      Architecture_t::ScaleAdd(layer.GetBiases(),
+                               layer.GetBiasGradients(),
+                               -fLearningRate);
+   }
+}
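+// The loop above implements plain stochastic gradient descent: assuming
+// ScaleAdd(A, B, c) computes A += c * B, every layer's parameters are updated as
+//
+//    W <- W - alpha * dL/dW   and   b <- b - alpha * dL/db,
+//
+// with alpha = fLearningRate and the gradients taken from the preceding
+// Backward(...) call.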
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+template <typename Net_t>
+auto inline TGradientDescent<Architecture_t>::StepLoss(Net_t & net,
+                                                       Matrix_t &input,
+                                                       const Matrix_t &output)
+   -> Scalar_t
+{
+   Scalar_t loss = net.Loss(input, output);
+   net.Backward(input, output);
+
+   for (size_t i = 0; i < net.GetDepth(); i++)
+   {
+      auto &layer = net.GetLayer(i);
+      Architecture_t::ScaleAdd(layer.GetWeights(),
+                               layer.GetWeightGradients(),
+                               -fLearningRate);
+      Architecture_t::ScaleAdd(layer.GetBiases(),
+                               layer.GetBiasGradients(),
+                               -fLearningRate);
+   }
+   return loss;
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+    template <typename Net_t>
+    void inline TGradientDescent<Architecture_t>::Step(
+        Net_t & master,
+        std::vector<Net_t> & nets,
+        std::vector<TBatch<Architecture_t>> & batches)
+{
+   typename Architecture_t::Matrix_t dummy(0,0);
+   size_t depth = master.GetDepth();
+
+   // Forward
+   for (size_t j = 0; j < nets.size(); j++) {
+      nets[j].GetLayer(0).Forward(batches[j].GetInput());
+   }
+
+   for (size_t i = 1; i < depth; i++)
+   {
+      for (size_t j = 0; j < nets.size(); j++) {
+         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput());
+      }
+   }
+   // Gradients
+   for (size_t j = 0; j < nets.size(); j++) {
+      evaluateGradients<Architecture_t>(
+          nets[j].GetLayer(depth-1).GetActivationGradients(),
+          nets[j].GetLossFunction(),
+          batches[j].GetOutput(),
+          nets[j].GetLayer(depth-1).GetOutput());
+   }
+   // Backward
+   for (size_t i = depth - 1; i > 0; i--)
+   {
+      for (size_t j = 0; j < nets.size(); j++) {
+         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
+                                      nets[j].GetLayer(i-1).GetOutput(),
+                                      nets[j].GetRegularization(),
+                                      nets[j].GetWeightDecay());
+      }
+   }
+   for (size_t j = 0; j < nets.size(); j++) {
+      nets[j].GetLayer(0).Backward(dummy,
+                                   batches[j].GetInput(),
+                                   nets[j].GetRegularization(),
+                                   nets[j].GetWeightDecay());
+   }
+
+   for (size_t j = 0; j < nets.size(); j++) {
+      for (size_t i = 0; i < depth; i++)
+      {
+         auto &masterLayer = master.GetLayer(i);
+         auto &layer       = nets[j].GetLayer(i);
+         Architecture_t::ScaleAdd(masterLayer.GetWeights(),
+                                  layer.GetWeightGradients(),
+                                  -fLearningRate);
+         Architecture_t::Copy(layer.GetWeights(),
+                              masterLayer.GetWeights());
+         Architecture_t::ScaleAdd(masterLayer.GetBiases(),
+                                  layer.GetBiasGradients(),
+                                  -fLearningRate);
+         Architecture_t::Copy(layer.GetBiases(),
+                              masterLayer.GetBiases());
+      }
+   }
+}
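+// In the multi-batch Step above each worker net computes gradients on its own
+// batch; the master parameters are then updated sequentially with every
+// worker's gradients (again assuming ScaleAdd(A, B, c) computes A += c * B) and
+// copied back to the workers, so that all nets hold identical weights at the
+// start of the next step.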
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+template <typename Net_t>
+void inline TGradientDescent<Architecture_t>::StepMomentum(
+        Net_t & master,
+        std::vector<Net_t> & nets,
+        std::vector<TBatch<Architecture_t>> & batches,
+        Scalar_t momentum)
+{
+   typename Architecture_t::Matrix_t dummy(0,0);
+   size_t depth = master.GetDepth();
+
+   // Forward
+   for (size_t j = 0; j < nets.size(); j++) {
+      nets[j].GetLayer(0).Forward(batches[j].GetInput());
+   }
+
+   for (size_t i = 1; i < depth; i++)
+   {
+      for (size_t j = 0; j < nets.size(); j++) {
+         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput());
+      }
+   }
+   // Gradients
+   for (size_t j = 0; j < nets.size(); j++) {
+      evaluateGradients<Architecture_t>(
+          nets[j].GetLayer(depth-1).GetActivationGradients(),
+          nets[j].GetLossFunction(),
+          batches[j].GetOutput(),
+          nets[j].GetLayer(depth-1).GetOutput());
+   }
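+   // In the backward pass below, the gradient matrices of the master net are
+   // reused as the momentum velocity v. Assuming ScaleAdd(A, B, c) computes
+   // A += c * B, the per-layer sequence
+   //
+   //    v += (-fLearningRate / momentum) * g_j   (for every worker net j)
+   //    v += (momentum - 1) * v                  (i.e. v *= momentum)
+   //
+   // amounts to v <- momentum * v - fLearningRate * sum_j g_j; the final loop
+   // then adds v to the master weights and biases.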
+   // Backward
+   for (size_t i = depth - 1; i > 0; i--)
+   {
+      for (size_t j = 0; j < nets.size(); j++) {
+         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
+                                      nets[j].GetLayer(i-1).GetOutput(),
+                                      nets[j].GetRegularization(),
+                                      nets[j].GetWeightDecay());
+         Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
+                                  nets[j].GetLayer(i).GetWeightGradients(),
+                                  - fLearningRate / momentum);
+         Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
+                                  nets[j].GetLayer(i).GetBiasGradients(),
+                                  - fLearningRate / momentum);
+      }
+      Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
+                               master.GetLayer(i).GetWeightGradients(),
+                               momentum - 1.0);
+      Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
+                               master.GetLayer(i).GetBiasGradients(),
+                               momentum - 1.0);
+   }
+   for (size_t j = 0; j < nets.size(); j++) {
+      nets[j].GetLayer(0).Backward(dummy,
+                                   batches[j].GetInput(),
+                                   nets[j].GetRegularization(),
+                                   nets[j].GetWeightDecay());
+      Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
+                               nets[j].GetLayer(0).GetWeightGradients(),
+                               - fLearningRate / momentum);
+      Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
+                               nets[j].GetLayer(0).GetBiasGradients(),
+                               - fLearningRate / momentum);
+   }
+
+   Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
+                            master.GetLayer(0).GetWeightGradients(),
+                            momentum - 1.0);
+   Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
+                            master.GetLayer(0).GetBiasGradients(),
+                            momentum - 1.0);
+
+   for (size_t i = 0; i < depth; i++)
+   {
+       auto &masterLayer = master.GetLayer(i);
+       Architecture_t::ScaleAdd(masterLayer.GetWeights(),
+                                masterLayer.GetWeightGradients(),
+                                1.0);
+       Architecture_t::ScaleAdd(masterLayer.GetBiases(),
+                                masterLayer.GetBiasGradients(),
+                                1.0);
+       for (size_t j = 0; j < nets.size(); j++) {
+         auto &layer       = nets[j].GetLayer(i);
+         Architecture_t::Copy(layer.GetWeights(),
+                              masterLayer.GetWeights());
+         Architecture_t::Copy(layer.GetBiases(),
+                              masterLayer.GetBiases());
+       }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+template <typename Net_t>
+void inline TGradientDescent<Architecture_t>::StepNesterov(
+        Net_t & master,
+        std::vector<Net_t> & nets,
+        std::vector<TBatch<Architecture_t>> & batches,
+        Scalar_t momentum)
+{
+   typename Architecture_t::Matrix_t dummy(0,0);
+   size_t depth = master.GetDepth();
+
+   // Forward
+   for (size_t j = 0; j < nets.size(); j++) {
+      nets[j].GetLayer(0).Forward(batches[j].GetInput());
+   }
+
+   for (size_t i = 1; i < depth; i++)
+   {
+      for (size_t j = 0; j < nets.size(); j++) {
+         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput());
+      }
+   }
+
+   // Gradients
+   for (size_t j = 0; j < nets.size(); j++) {
+      evaluateGradients<Architecture_t>(
+          nets[j].GetLayer(depth-1).GetActivationGradients(),
+          nets[j].GetLossFunction(),
+          batches[j].GetOutput(),
+          nets[j].GetLayer(depth-1).GetOutput());
+   }
+
+   // Backward
+   for (size_t i = depth - 1; i > 0; i--)
+   {
+      for (size_t j = 0; j < nets.size(); j++) {
+         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
+                                      nets[j].GetLayer(i-1).GetOutput(),
+                                      nets[j].GetRegularization(),
+                                      nets[j].GetWeightDecay());
+      }
+   }
+
+   for (size_t j = 0; j < nets.size(); j++) {
+      nets[j].GetLayer(0).Backward(dummy,
+                                   batches[j].GetInput(),
+                                   nets[j].GetRegularization(),
+                                   nets[j].GetWeightDecay());
+   }
+
+   for (size_t i = 0; i < depth; i++)
+   {
+      auto &masterLayer = master.GetLayer(i);
+      for (size_t j = 0; j < nets.size(); j++) {
+         auto &layer       = nets[j].GetLayer(i);
+         Architecture_t::Copy(layer.GetWeights(),
+                              masterLayer.GetWeights());
+         Architecture_t::Copy(layer.GetBiases(),
+                              masterLayer.GetBiases());
+         Architecture_t::ScaleAdd(layer.GetWeights(),
+                                  masterLayer.GetWeightGradients(),
+                                  1.0);
+         Architecture_t::ScaleAdd(layer.GetBiases(),
+                                  masterLayer.GetBiasGradients(),
+                                  1.0);
+      }
+      for (size_t j = 0; j < nets.size(); j++) {
+         auto &layer       = nets[j].GetLayer(i);
+         Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
+                                  layer.GetWeightGradients(),
+                                  - fLearningRate / momentum);
+         Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
+                                  layer.GetBiasGradients(),
+                                  - fLearningRate / momentum);
+      }
+      Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
+                               masterLayer.GetWeightGradients(),
+                               momentum - 1.0);
+      Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
+                               masterLayer.GetBiasGradients(),
+                               momentum - 1.0);
+      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
+                               masterLayer.GetWeightGradients(),
+                               1.0);
+      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
+                               masterLayer.GetBiasGradients(),
+                               1.0);
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+template <typename Net_t>
+void inline TGradientDescent<Architecture_t>::StepReducedWeights(
+    Net_t & net,
+    Matrix_t &input,
+    const Matrix_t &output)
+{
+   net.Forward(input);
+   net.Backward(input, output);
+
+   for (size_t i = 0; i < net.GetDepth(); i++)
+   {
+      auto &layer = net.GetLayer(i);
+      Architecture_t::ScaleAdd(layer.GetWeights(),
+                               layer.GetWeightGradients(),
+                               -fLearningRate);
+      if (i == 0) {
+         Architecture_t::ScaleAdd(layer.GetBiases(),
+                                  layer.GetBiasGradients(),
+                                  -fLearningRate);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+    template <typename Net_t>
+    auto inline TGradientDescent<Architecture_t>::StepReducedWeightsLoss(
+        Net_t & net,
+        Matrix_t &input,
+        const Matrix_t &output)
+    -> Scalar_t
+{
+   Scalar_t loss = net.Loss(input, output);
+   fTrainingError = loss;
+   net.Backward(input, output);
+
+   for (size_t i = 0; i < net.GetDepth(); i++)
+   {
+      auto &layer = net.GetLayer(i);
+      Architecture_t::ScaleAdd(layer.GetWeights(),
+                               layer.GetWeightGradients(),
+                               -fLearningRate);
+      if (i == 0) {
+         Architecture_t::ScaleAdd(layer.GetBiases(),
+                                  layer.GetBiasGradients(),
+                                  -fLearningRate);
+      }
+   }
+   return loss;
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+bool inline TGradientDescent<Architecture_t>::HasConverged()
+{
+   if (fTestError < fMinimumError * 0.999) {
+      fConvergenceCount = 0;
+      fMinimumError     = fTestError;
+   } else {
+      fConvergenceCount++;
+   }
+
+   return (fConvergenceCount >= fConvergenceSteps);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t>
+bool inline TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
+{
+   fTestError = testError;
+   if (fTestError < fMinimumError * 0.999) {
+      fConvergenceCount = 0;
+      fMinimumError     = fTestError;
+   } else {
+      fConvergenceCount += fTestInterval;
+   }
+   return (fConvergenceCount >= fConvergenceSteps);
+}
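+// Worked example of the convergence criterion (illustrative numbers): with
+// fConvergenceSteps = 20 and fTestInterval = 5, a current minimum of 0.500
+// requires a new test error below 0.4995 (= 0.999 * 0.500) to reset the
+// counter. Otherwise fConvergenceCount grows by the test interval, so four
+// consecutive test evaluations without sufficient improvement lead to
+// convergence being reported.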
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/Net.h b/tmva/tmva/inc/TMVA/DNN/Net.h
new file mode 100644
index 0000000000000000000000000000000000000000..0fa4121f0d7817477837abc55b7c7c929e5c926f
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/DNN/Net.h
@@ -0,0 +1,408 @@
+// @(#)root/tmva: $Id$
+// Author: Simon Pfreundschuh 20/06/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+#ifndef TMVA_DNN_NET
+#define TMVA_DNN_NET
+
+#include <vector>
+#include <iostream>
+
+#include "Layer.h"
+
+namespace TMVA {
+namespace DNN  {
+
+/** \class TNet
+
+    Generic neural network class.
+
+    This generic neural network class represents a concrete neural
+    network through a vector of layers and coordinates the forward
+    and backward propagation through the net.
+
+    The net takes as input a batch from the training data given in
+    matrix form, with each row corresponding to a certain training
+    event.
+
+    On construction, the neural network allocates all the memory
+    required for the training of the neural net and keeps it until
+    its destruction.
+
+    The Architecture type argument simply holds the
+    architecture-specific data types, which are just the matrix type
+    Matrix_t and the used scalar type Scalar_t.
+
+    \tparam Architecture_t The architecture type, which holds the data types
+    (Matrix_t, Scalar_t) for a given architecture.
+    \tparam Layer_t The type used for the layers. Can be either
+    TLayer<Architecture_t> or TSharedLayer<Architecture_t>.
+*/
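+//
+// A minimal construction sketch (illustrative only; the architecture type is a
+// placeholder and the activation/initialization enum values are assumptions):
+//
+//    TNet<MyArchitecture> net(/*batchSize=*/256, /*inputWidth=*/4,
+//                             ELossFunction::kMeanSquaredError);
+//    net.AddLayer(64, EActivationFunction::kTanh);
+//    net.AddLayer(1,  EActivationFunction::kIdentity);
+//    net.Initialize(EInitialization::kGauss);
+//    // Scalar_t loss = net.Loss(X, Y);   // forward pass + loss for a batch X, Y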
+template<typename Architecture_t, typename Layer_t = TLayer<Architecture_t>>
+   class TNet {
+
+public:
+   using Matrix_t         = typename Architecture_t::Matrix_t;
+   using Scalar_t         = typename Architecture_t::Scalar_t;
+   using LayerIterator_t  = typename std::vector<Layer_t>::iterator;
+
+private:
+   size_t fBatchSize;  ///< Batch size for training and evaluation of the Network.
+   size_t fInputWidth; ///< Number of features in a single input event.
+
+   std::vector<Layer_t> fLayers; ///< Layers in the network.
+
+   Matrix_t fDummy;       ///< Empty matrix for last step in back propagation.
+   ELossFunction fJ;      ///< The loss function of the network.
+   ERegularization fR;    ///< The regularization used for the network.
+   Scalar_t fWeightDecay; ///< The weight decay factor.
+
+public:
+   TNet();
+   TNet(const TNet & other);
+   template<typename OtherArchitecture_t>
+   TNet(size_t batchSize, const TNet<OtherArchitecture_t> &);
+   /*! Construct a neural net for a given batch size and input width, with the
+    *  given loss function and regularization. */
+   TNet(size_t batchSize,
+        size_t inputWidth,
+        ELossFunction fJ,
+        ERegularization fR = ERegularization::kNone,
+        Scalar_t fWeightDecay = 0.0);
+   /*! Create a clone that uses the same weight and bias matrices but
+    *  potentially a different batch size. */
+   TNet<Architecture_t, TSharedLayer<Architecture_t>> CreateClone(size_t batchSize);
+
+   /*! Add a layer of the given size to the neural net. */
+   void AddLayer(size_t width, EActivationFunction f,
+                 Scalar_t dropoutProbability = 1.0);
+
+   /*! Remove all layers from the network.*/
+   void Clear();
+
+   /*! Add a layer which shares its weights with another TNet instance. */
+   template <typename SharedLayer>
+   void AddLayer(SharedLayer & layer);
+
+   /*! Iterator to the first layer of the net. */
+   LayerIterator_t LayersBegin() {return fLayers.begin();}
+
+   /*! Iterator past the last layer of the net. */
+   LayerIterator_t LayersEnd() {return fLayers.end();}
+
+   /*! Initialize the weights in the net with the
+    *  initialization method. */
+   inline void Initialize(EInitialization m);
+
+   /*! Initialize the gradients in the net to zero. Required if net is
+    *  used to store velocities of momentum-based minimization techniques. */
+   inline void InitializeGradients();
+
+   /*! Forward a given input through the neural net. Computes
+    *  all layer activations up to the output layer */
+   inline void Forward(Matrix_t& X, bool applyDropout = false);
+
+   /*! Compute the weight gradients in the net from the given training
+    * samples X and training labels Y. */
+   inline void Backward(const Matrix_t &X, const Matrix_t &Y);
+
+   /*! Evaluate the loss function of the net using the activations
+    *  that are currently stored in the output layer. */
+   inline Scalar_t Loss(const Matrix_t &Y) const;
+
+   /*! Propagate the input batch X through the net and evaluate the
+    *  error function for the resulting activations of the output
+    *  layer */
+   inline Scalar_t Loss(Matrix_t &X, const Matrix_t &Y, bool applyDropout = false);
+
+   /*! Compute the neural network prediction obtained from forwarding the
+    *  batch X through the neural network and applying the output function
+    *  f to the activation of the last layer in the network. */
+   inline void Prediction(Matrix_t &Y_hat, Matrix_t &X, EOutputFunction f);
+
+   /*! Compute the neural network prediction obtained from applying the output
+    * function f to the activation of the last layer in the network. */
+   inline void Prediction(Matrix_t &Y_hat, EOutputFunction f) const;
+
+   Scalar_t            GetNFlops();
+
+   size_t              GetDepth() const          {return fLayers.size();}
+   size_t              GetBatchSize() const      {return fBatchSize;}
+   Layer_t &           GetLayer(size_t i)        {return fLayers[i];}
+   const Layer_t &     GetLayer(size_t i) const  {return fLayers[i];}
+   ELossFunction       GetLossFunction() const   {return fJ;}
+   Matrix_t &          GetOutput()               {return fLayers.back().GetOutput();}
+   size_t              GetInputWidth() const     {return fInputWidth;}
+   size_t              GetOutputWidth() const    {return fLayers.back().GetWidth();}
+   ERegularization     GetRegularization() const {return fR;}
+   Scalar_t            GetWeightDecay() const    {return fWeightDecay;}
+
+   void SetBatchSize(size_t batchSize)       {fBatchSize = batchSize;}
+   void SetInputWidth(size_t inputWidth)     {fInputWidth = inputWidth;}
+   void SetRegularization(ERegularization R) {fR = R;}
+   void SetLossFunction(ELossFunction J)     {fJ = J;}
+   void SetWeightDecay(Scalar_t weightDecay) {fWeightDecay = weightDecay;}
+   void SetDropoutProbabilities(const std::vector<Double_t> & probabilities);
+
+   void Print();
+};
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   TNet<Architecture_t, Layer_t>::TNet()
+   : fBatchSize(0), fInputWidth(0), fDummy(0,0),
+   fJ(ELossFunction::kMeanSquaredError), fR(ERegularization::kNone),
+   fWeightDecay(0.0)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   TNet<Architecture_t, Layer_t>::TNet(const TNet & other)
+   : fBatchSize(other.fBatchSize), fInputWidth(other.fInputWidth),
+   fLayers(other.fLayers), fDummy(0,0), fJ(other.fJ), fR(other.fR),
+   fWeightDecay(other.fWeightDecay)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+template<typename OtherArchitecture_t>
+TNet<Architecture_t, Layer_t>::TNet(size_t batchSize,
+                                    const TNet<OtherArchitecture_t> & other)
+   : fBatchSize(batchSize), fInputWidth(other.GetInputWidth()),
+     fDummy(0,0), fJ(other.GetLossFunction()), fR(other.GetRegularization()),
+     fWeightDecay(other.GetWeightDecay())
+{
+   fLayers.reserve(other.GetDepth());
+   for (size_t i = 0; i < other.GetDepth(); i++) {
+      AddLayer(other.GetLayer(i).GetWidth(),
+               other.GetLayer(i).GetActivationFunction(),
+               other.GetLayer(i).GetDropoutProbability());
+      fLayers[i].GetWeights() = (TMatrixT<Double_t>) other.GetLayer(i).GetWeights();
+      fLayers[i].GetBiases()  = (TMatrixT<Double_t>) other.GetLayer(i).GetBiases();
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   TNet<Architecture_t, Layer_t>::TNet(size_t        batchSize,
+                                       size_t        inputWidth,
+                                       ELossFunction J,
+                                       ERegularization R,
+                                       Scalar_t weightDecay)
+   : fBatchSize(batchSize), fInputWidth(inputWidth), fDummy(0,0),
+     fJ(J), fR(R), fWeightDecay(weightDecay)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   auto TNet<Architecture_t, Layer_t>::CreateClone(size_t batchSize)
+   -> TNet<Architecture_t, TSharedLayer<Architecture_t>>
+{
+   TNet<Architecture_t, TSharedLayer<Architecture_t>> other(batchSize,
+                                                             fInputWidth,
+                                                             fJ, fR, fWeightDecay);
+   for (auto &l : fLayers) {
+      other.AddLayer(l);
+   }
+   return other;
+}
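+// A typical use of CreateClone(...) appears in the minimizers: a test network
+// that shares the training weights but is sized to the test set, e.g.
+//
+//    auto testNet = net.CreateClone(nTestSamples);
+//
+// so that evaluating the test loss always uses the current training weights.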
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   void TNet<Architecture_t, Layer_t>::AddLayer(size_t width,
+                                                EActivationFunction f,
+                                                Scalar_t dropoutProbability)
+{
+   if (fLayers.size() == 0) {
+      fLayers.emplace_back(fBatchSize, fInputWidth, width, f, dropoutProbability);
+   } else {
+      size_t prevWidth = fLayers.back().GetWidth();
+      fLayers.emplace_back(fBatchSize, prevWidth, width, f, dropoutProbability);
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   void TNet<Architecture_t, Layer_t>::Clear()
+{
+   fLayers.clear();
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   template<typename SharedLayer_t>
+   inline void TNet<Architecture_t, Layer_t>::AddLayer(SharedLayer_t & layer)
+{
+   fLayers.emplace_back(fBatchSize, layer);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   inline void TNet<Architecture_t, Layer_t>::Initialize(EInitialization m)
+{
+   for (auto &l : fLayers) {
+      l.Initialize(m);
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   inline void TNet<Architecture_t, Layer_t>::InitializeGradients()
+{
+   for (auto &l : fLayers) {
+      initialize<Architecture_t>(l.GetWeightGradients(), EInitialization::kZero);
+      initialize<Architecture_t>(l.GetBiasGradients(),   EInitialization::kZero);
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+inline void TNet<Architecture_t, Layer_t>::Forward(Matrix_t &input,
+                                                   bool applyDropout)
+{
+   fLayers.front().Forward(input, applyDropout);
+
+   for (size_t i = 1; i < fLayers.size(); i++) {
+      fLayers[i].Forward(fLayers[i-1].GetOutput(), applyDropout);
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   inline void TNet<Architecture_t, Layer_t>::Backward(const Matrix_t &X,
+                                                       const Matrix_t &Y)
+{
+   evaluateGradients<Architecture_t>(fLayers.back().GetActivationGradients(),
+                                     fJ, Y, fLayers.back().GetOutput());
+
+   for (size_t i = fLayers.size()-1; i > 0; i--) {
+      auto & activation_gradient_backward
+         = fLayers[i-1].GetActivationGradients();
+      auto & activations_backward
+         = fLayers[i-1].GetOutput();
+      fLayers[i].Backward(activation_gradient_backward,
+                          activations_backward, fR, fWeightDecay);
+   }
+   fLayers[0].Backward(fDummy, X, fR, fWeightDecay);
+
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   inline auto TNet<Architecture_t, Layer_t>::Loss(const Matrix_t &Y) const
+   -> Scalar_t
+{
+   auto loss = evaluate<Architecture_t>(fJ, Y, fLayers.back().GetOutput());
+   for (auto &l : fLayers) {
+      loss += fWeightDecay * regularization<Architecture_t>(l.GetWeights(), fR);
+   }
+   return loss;
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   inline auto TNet<Architecture_t, Layer_t>::Loss(Matrix_t &X,
+                                                   const Matrix_t &Y,
+                                                   bool applyDropout)
+   -> Scalar_t
+{
+   Forward(X, applyDropout);
+   return Loss(Y);
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   inline void TNet<Architecture_t, Layer_t>::Prediction(Matrix_t &Yhat,
+                                                         Matrix_t &X,
+                                                         EOutputFunction f)
+{
+   Forward(X, false);
+   evaluate<Architecture_t>(Yhat, f, fLayers.back().GetOutput());
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   inline void TNet<Architecture_t, Layer_t>::Prediction(Matrix_t &Y_hat,
+                                                         EOutputFunction f) const
+{
+   evaluate<Architecture_t>(Y_hat, f, fLayers.back().GetOutput());
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+auto TNet<Architecture_t, Layer_t>::GetNFlops()
+   -> Scalar_t
+{
+   Scalar_t flops = 0;
+
+   Scalar_t nb  = (Scalar_t) fBatchSize;
+   Scalar_t nlp = (Scalar_t) fInputWidth;
+
+   for(size_t i = 0; i < fLayers.size(); i++) {
+      Layer_t & layer = fLayers[i];
+      Scalar_t nl = (Scalar_t) layer.GetWidth();
+
+      // Forward propagation.
+      flops += nb * nl * (2.0 * nlp - 1); // Matrix mult.
+      flops += nb * nl;                   // Add bias values.
+      flops += 2 * nb * nl;               // Apply activation function and compute
+                                          // derivative.
+      // Backward propagation.
+      flops += nb * nl;                      // Hadamard
+      flops += nlp * nl * (2.0 * nb - 1.0);  // Weight gradients
+      flops += nl * (nb - 1);                // Bias gradients
+      if (i > 0) {
+         flops += nlp * nb * (2.0 * nl  - 1.0); // Previous layer gradients.
+      }
+      nlp = nl;
+   }
+   return flops;
+}
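+// Rough worked example (illustrative numbers): for a batch size of 256 and a
+// single 64 -> 64 layer, the forward part alone contributes
+// 256*64*(2*64 - 1) + 256*64 + 2*256*64 ~ 2.13e6 floating point operations;
+// this is the quantity used for the GFlop/s estimate printed during training.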
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+void TNet<Architecture_t, Layer_t>::SetDropoutProbabilities(
+    const std::vector<Double_t> & probabilities)
+{
+   for (size_t i = 0; i < fLayers.size(); i++) {
+      if (i < probabilities.size()) {
+         fLayers[i].SetDropoutProbability(probabilities[i]);
+      } else {
+         fLayers[i].SetDropoutProbability(1.0);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+   void TNet<Architecture_t, Layer_t>::Print()
+{
+   std::cout << "DEEP NEURAL NETWORK:";
+   std::cout << " Loss function = " << static_cast<char>(fJ);
+   std::cout << ", Depth = " << fLayers.size() << std::endl;
+
+   size_t i = 1;
+   for (auto & l : fLayers) {
+      std::cout << "DNN Layer " << i << ":" << std::endl;
+      l.Print();
+      i++;
+   }
+
+}
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/MethodDNN.h b/tmva/tmva/inc/TMVA/MethodDNN.h
index 3b6bdc13971bbf1082c793b2a004556123ce383b..970fe203be38f9a7fd50a2150ec966086edda4da 100644
--- a/tmva/tmva/inc/TMVA/MethodDNN.h
+++ b/tmva/tmva/inc/TMVA/MethodDNN.h
@@ -11,7 +11,8 @@
  *      NeuralNetwork                                                             *
  *                                                                                *
  * Authors (alphabetical):                                                        *
- *      Peter Speckmayer      <peter.speckmayer@gmx.at> - CERN, Switzerland       *
+ *      Peter Speckmayer      <peter.speckmayer@gmx.at>  - CERN, Switzerland      *
+ *      Simon Pfreundschuh    <s.pfreundschuh@gmail.com> - CERN, Switzerland      *
  *                                                                                *
  * Copyright (c) 2005-2015:                                                       *
  *      CERN, Switzerland                                                         *
@@ -57,117 +58,165 @@
 #include "TMVA/NeuralNet.h"
 #endif
 
+#include "TMVA/Tools.h"
+#include <sstream>
 
+#include "TMVA/DNN/Net.h"
+#include "TMVA/DNN/Minimizers.h"
+#include "TMVA/DNN/Architectures/Reference.h"
 
-namespace TMVA {
-
-   class MethodDNN : public MethodBase
-   {
-
-   public:
-
-      // standard constructors
-      MethodDNN ( const TString& jobName,
-                  const TString&  methodTitle,
-                  DataSetInfo& theData,
-                  const TString& theOption);
-
-      MethodDNN ( DataSetInfo& theData,
-                  const TString& theWeightFile );
-
-      virtual ~MethodDNN();
-
-      virtual Bool_t HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets );
-      std::vector<std::pair<int,TMVA::DNN::EnumFunction>> ParseLayoutString(TString layerSpec);
-      std::vector<std::map<TString,TString>> ParseKeyValueString(TString parseString, TString blockDelim, TString tokenDelim);
-
-      void Train();
-
-      virtual Double_t GetMvaValue( Double_t* err=0, Double_t* errUpper=0 );
-      virtual const std::vector<Float_t>& GetRegressionValues();
-      virtual const std::vector<Float_t>& GetMulticlassValues();
-
-      using MethodBase::ReadWeightsFromStream;
-
-      // write weights to stream
-      void AddWeightsXMLTo     ( void* parent ) const;
-
-      // read weights from stream
-      void ReadWeightsFromStream( std::istream & i );
-      void ReadWeightsFromXML   ( void* wghtnode );
-
-      // ranking of input variables
-      const Ranking* CreateRanking();
-
-      // nice output
-      void PrintCoefficients( void );
-
-      // write classifier-specific monitoring information to target file
-      virtual void     WriteMonitoringHistosToFile() const;
-
-   protected:
-
-
-      // make ROOT-independent C++ class for classifier response (classifier-specific implementation)
-      void MakeClassSpecific( std::ostream&, const TString& ) const;
-
-      // get help message text
-      void GetHelpMessage() const;
-
-
-   private:
-
-      void checkGradients ();
-
-      // the option handling methods
-      void DeclareOptions();
-      void ProcessOptions();
+#ifdef DNNCPU
+#include "TMVA/DNN/Architectures/Cpu.h"
+#endif
 
-      // general helper functions
-      void     Init();
+#ifdef DNNCUDA
+#include "TMVA/DNN/Architectures/Cuda.h"
+#endif
 
+using namespace TMVA::DNN;
 
-   private:
-      TMVA::DNN::Net fNet;
-      std::vector<double> fWeights;
+namespace TMVA {
 
-      TString  fLayoutString;
-      std::vector<std::pair<int,TMVA::DNN::EnumFunction>> fLayout;
-      TString  fErrorStrategy;
-      TString  fTrainingStrategy;
-      TMVA::DNN::ModeErrorFunction fModeErrorFunction;
-      std::shared_ptr<TMVA::Monitoring> fMonitoring;
-      double   fSumOfSigWeights_test;
-      double   fSumOfBkgWeights_test;
-      bool     fResume;
-      TString  fWeightInitializationStrategyString;
-      TMVA::DNN::WeightInitializationStrategy fWeightInitializationStrategy;
+class MethodDNN : public MethodBase
+{
+    using Architecture_t = TReference<Double_t>;
+    using Net_t          = TNet<Architecture_t>;
+    using Matrix_t       = typename Architecture_t::Matrix_t;
 
-      std::vector<std::shared_ptr<TMVA::DNN::Settings>> fSettings;
+private:
 
-      TString  fFileName;
-      double fScaleToNumEvents;
+   using LayoutVector_t   = std::vector<std::pair<int, EActivationFunction>>;
+   using KeyValueVector_t = std::vector<std::map<TString, TString>>;
 
-      ClassDef(MethodDNN,0); // neural network 
+   struct TTrainingSettings
+   {
+       size_t                batchSize;
+       size_t                testInterval;
+       size_t                convergenceSteps;
+       ERegularization       regularization;
+       Double_t              learningRate;
+       Double_t              momentum;
+       Double_t              weightDecay;
+       std::vector<Double_t> dropoutProbabilities;
+       bool                  multithreading;
    };
 
-} // namespace TMVA
-
-
-// make_unqiue is only available with C++14
-template <typename T, typename... Args>
-   std::unique_ptr<T> make_unique (Args&&... args)
+   // the option handling methods
+   void DeclareOptions();
+   void ProcessOptions();
+
+   // general helper functions
+   void     Init();
+
+   Net_t             fNet;
+   EInitialization   fWeightInitialization;
+   EOutputFunction   fOutputFunction;
+
+   TString                        fLayoutString;
+   TString                        fErrorStrategy;
+   TString                        fTrainingStrategyString;
+   TString                        fWeightInitializationString;
+   TString                        fArchitectureString;
+   LayoutVector_t                 fLayout;
+   std::vector<TTrainingSettings> fTrainingSettings;
+   bool                           fResume;
+
+   KeyValueVector_t fSettings;
+
+   ClassDef(MethodDNN,0); // neural network
+
+   static inline void WriteMatrixXML(void *parent, const char *name,
+                                     const TMatrixT<Double_t> &X);
+   static inline void ReadMatrixXML(void *xml, const char *name,
+                                    TMatrixT<Double_t> &X);
+protected:
+
+   void MakeClassSpecific( std::ostream&, const TString& ) const;
+   void GetHelpMessage() const;
+
+public:
+
+   // Standard Constructors
+   MethodDNN(const TString& jobName,
+             const TString&  methodTitle,
+             DataSetInfo& theData,
+             const TString& theOption);
+   MethodDNN(DataSetInfo& theData,
+             const TString& theWeightFile);
+   virtual ~MethodDNN();
+
+   virtual Bool_t HasAnalysisType(Types::EAnalysisType type,
+                                  UInt_t numberClasses,
+                                  UInt_t numberTargets );
+   LayoutVector_t   ParseLayoutString(TString layerSpec);
+   KeyValueVector_t ParseKeyValueString(TString parseString,
+                                        TString blockDelim,
+                                        TString tokenDelim);
+   void Train();
+   void TrainGpu();
+   template <typename AFloat>
+   void TrainCpu();
+
+   virtual Double_t GetMvaValue( Double_t* err=0, Double_t* errUpper=0 );
+   virtual const std::vector<Float_t>& GetRegressionValues();
+   virtual const std::vector<Float_t>& GetMulticlassValues();
+
+   using MethodBase::ReadWeightsFromStream;
+
+   // write weights to stream
+   void AddWeightsXMLTo     ( void* parent ) const;
+
+   // read weights from stream
+   void ReadWeightsFromStream( std::istream & i );
+   void ReadWeightsFromXML   ( void* wghtnode );
+
+   // ranking of input variables
+   const Ranking* CreateRanking();
+
+};
+
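+// Helpers that (de)serialize a weight matrix to and from the XML weight
+// file: the matrix is stored as a node with "rows" and "cols" attributes
+// and a single raw text line holding the elements in row-major order with
+// 16-digit scientific precision.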
+inline void MethodDNN::WriteMatrixXML(void *parent,
+                                      const char *name,
+                                      const TMatrixT<Double_t> &X)
 {
-   return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+   std::stringstream matrixStringStream("");
+   matrixStringStream.precision( 16 );
+
+   for (size_t i = 0; i < (size_t) X.GetNrows(); i++)
+   {
+      for (size_t j = 0; j < (size_t) X.GetNcols(); j++)
+      {
+         matrixStringStream << std::scientific << X(i,j) << " ";
+      }
+   }
+   std::string s = matrixStringStream.str();
+   void* matxml = gTools().xmlengine().NewChild(parent, 0, name);
+   gTools().xmlengine().NewAttr(matxml, 0, "rows",
+                                gTools().StringFromInt((int)X.GetNrows()));
+   gTools().xmlengine().NewAttr(matxml, 0, "cols",
+                                gTools().StringFromInt((int)X.GetNcols()));
+   gTools().xmlengine().AddRawLine (matxml, s.c_str());
 }
 
-// make_shared is only available with C++14
-template <typename T, typename... Args>
-   std::shared_ptr<T> make_shared (Args&&... args)
+inline void MethodDNN::ReadMatrixXML(void *xml,
+                                     const char *name,
+                                     TMatrixT<Double_t> &X)
 {
-   return std::shared_ptr<T>(new T(std::forward<Args>(args)...));
-}
+   void *matrixXML = gTools().GetChild(xml, name);
+   size_t rows, cols;
+   gTools().ReadAttr(matrixXML, "rows", rows);
+   gTools().ReadAttr(matrixXML, "cols", cols);
 
+   const char * matrixString = gTools().xmlengine().GetNodeContent(matrixXML);
+   std::stringstream matrixStringStream(matrixString);
 
+   for (size_t i = 0; i < rows; i++)
+   {
+      for (size_t j = 0; j < cols; j++)
+      {
+         matrixStringStream >> X(i,j);
+      }
+   }
+}
+} // namespace TMVA
 
 #endif
diff --git a/tmva/tmva/src/DNN/Architectures/Cpu.cxx b/tmva/tmva/src/DNN/Architectures/Cpu.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..42d947d009a64eafb2d6ea3671e4af447c0233a6
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cpu.cxx
@@ -0,0 +1,32 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 20/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////
+// Explicit instantiation of the CPU architecture class. //
+///////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cpu.h"
+
+#include "Cpu/ActivationFunctions.cxx"
+#include "Cpu/Arithmetic.cxx"
+#include "Cpu/Dropout.cxx"
+#include "Cpu/Initialization.cxx"
+#include "Cpu/LossFunctions.cxx"
+#include "Cpu/OutputFunctions.cxx"
+#include "Cpu/Propagation.cxx"
+#include "Cpu/Regularization.cxx"
+
+namespace TMVA {
+namespace DNN  {
+template class TCpu<Double_t>;
+template class TCpu<Real_t>;
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/ActivationFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/ActivationFunctions.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..72b65b49a384166d6bff794fd52acd0d68b398e7
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cpu/ActivationFunctions.cxx
@@ -0,0 +1,149 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 19/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ ///////////////////////////////////////////////////////////////////
+ // Implementation of the activation functions for multi-threaded //
+ // CPU architectures using tbb and BLAS.                         //
+ ///////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cpu.h"
+#include <math.h>
+
+namespace TMVA
+{
+namespace DNN
+{
+
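+// All activation functions below are expressed through the TCpuMatrix
+// mapping helpers: B.Map(f) applies f element-wise in place, while
+// B.MapFrom(f, A) sets B(i,j) = f(A(i,j)). The *Derivative variants are
+// thus evaluated on the pre-activation values held in A and written into B.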
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::IdentityDerivative(TCpuMatrix<AFloat> & B,
+                                      const TCpuMatrix<AFloat> &/*A*/)
+{
+   auto f = [](AFloat) {return 1.0;};
+   B.Map(f);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::Relu(TCpuMatrix<AFloat> & B)
+{
+   auto f = [](AFloat x) {return (x < 0.0) ? 0.0 : x;};
+   B.Map(f);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::ReluDerivative(TCpuMatrix<AFloat> & B,
+                                  const TCpuMatrix<AFloat> &A)
+{
+   auto f = [](AFloat x) {return (x < 0.0) ? 0.0 : 1.0;};
+   B.MapFrom(f, A);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::Sigmoid(TCpuMatrix<AFloat> & B)
+{
+   auto f = [](AFloat x) {return 1.0 / (1.0 + exp(-x));};
+   B.Map(f);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::SigmoidDerivative(TCpuMatrix<AFloat> & B,
+                                     const TCpuMatrix<AFloat> &A)
+{
+   auto f = [](AFloat x) {
+      AFloat sig = 1.0 / (1.0 + exp(-x));
+      return sig * (1.0 - sig);
+   };
+   B.MapFrom(f, A);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::Tanh(TCpuMatrix<AFloat> & B)
+{
+   auto f = [](AFloat x) {return tanh(x);};
+   B.Map(f);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::TanhDerivative(TCpuMatrix<AFloat> & B,
+                                  const TCpuMatrix<AFloat> &A)
+{
+   auto f = [](AFloat x) {
+      AFloat t = tanh(x);
+      return 1 - t * t;
+   };
+   B.MapFrom(f, A);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::SymmetricRelu(TCpuMatrix<AFloat> & B)
+{
+   auto f = [](AFloat x) {return fabs(x);};
+   B.Map(f);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::SymmetricReluDerivative(TCpuMatrix<AFloat> & B,
+                                           const TCpuMatrix<AFloat> &A)
+{
+   auto f = [](AFloat x) {
+      return (x < 0.0) ? -1.0 : 1.0;
+   };
+   B.MapFrom(f, A);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::SoftSign(TCpuMatrix<AFloat> & B)
+{
+   auto f = [](AFloat x) {return x / (1 + fabs(x));};
+   B.Map(f);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::SoftSignDerivative(TCpuMatrix<AFloat> & B,
+                                      const TCpuMatrix<AFloat> &A)
+{
+   auto f = [](AFloat x) {
+      x = 1.0 + fabs(x);
+      x = 1.0 / (x * x);
+      return x;
+   };
+   B.MapFrom(f, A);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::Gauss(TCpuMatrix<AFloat> & B)
+{
+   auto f = [](AFloat x) {return exp(- x * x);};
+   B.Map(f);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::GaussDerivative(TCpuMatrix<AFloat> & B,
+                                   const TCpuMatrix<AFloat> &A)
+{
+   auto f = [](AFloat x) {return - 2.0 * x * exp(- x * x);};
+   B.MapFrom(f, A);
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/Arithmetic.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/Arithmetic.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..4ea33431879b91c153199ae6e84d088f08bf729d
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cpu/Arithmetic.cxx
@@ -0,0 +1,142 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 20/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////
+// Implementation of helper arithmetic functions for the  //
+// multi-threaded CPU implementation of DNNs.             //
+////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cpu.h"
+#include "TMVA/DNN/Architectures/Cpu/Blas.h"
+#include "tbb/tbb.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//____________________________________________________________________________
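+// Standard matrix multiplication C = A * B, delegated to BLAS xGEMM. The
+// leading dimensions (lda = nrows(A), ldb = nrows(B)) reflect the
+// column-major storage of TCpuMatrix. Illustrative use (hypothetical
+// dimensions, not part of this file):
+//    TCpuMatrix<Double_t> A(3, 4), B(4, 2), C(3, 2);
+//    TCpu<Double_t>::Multiply(C, A, B);   // C(i,j) = sum_k A(i,k) * B(k,j)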
+template<typename Real_t>
+void TCpu<Real_t>::Multiply(TCpuMatrix<Real_t> &C,
+                            const TCpuMatrix<Real_t> &A,
+                            const TCpuMatrix<Real_t> &B)
+{
+    int m = (int) A.GetNrows();
+    int k = (int) A.GetNcols();
+    int n = (int) B.GetNcols();
+
+    char transa = 'N';
+    char transb = 'N';
+
+    Real_t alpha = 1.0;
+    Real_t beta  = 0.0;
+
+    const Real_t * APointer = A.GetRawDataPointer();
+    const Real_t * BPointer = B.GetRawDataPointer();
+          Real_t * CPointer = C.GetRawDataPointer();
+
+    ::TMVA::DNN::Blas::Gemm(&transa, &transb, &m, &n, &k, &alpha,
+                            APointer, &m, BPointer, &k, &beta, CPointer, &m);
+}
+
+//____________________________________________________________________________
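+// Computes C = A^T * B via BLAS xGEMM with transa = 'T'; used in the
+// backward pass to build the weight gradients from the batch activations.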
+template<typename Real_t>
+void TCpu<Real_t>::TransposeMultiply(TCpuMatrix<Real_t> &C,
+                                     const TCpuMatrix<Real_t> &A,
+                                     const TCpuMatrix<Real_t> &B)
+{
+    int m = (int) A.GetNcols();
+    int k = (int) A.GetNrows();
+    int n = (int) B.GetNcols();
+
+    char transa = 'T';
+    char transb = 'N';
+
+    Real_t alpha = 1.0;
+    Real_t beta  = 0.0;
+
+    const Real_t *APointer = A.GetRawDataPointer();
+    const Real_t *BPointer = B.GetRawDataPointer();
+          Real_t *CPointer = C.GetRawDataPointer();
+
+    ::TMVA::DNN::Blas::Gemm(&transa, &transb, &m, &n, &k, &alpha,
+                            APointer, &k, BPointer, &k, &beta, CPointer, &m);
+}
+
+//____________________________________________________________________________
+template<typename Real_t>
+void TCpu<Real_t>::Hadamard(TCpuMatrix<Real_t> &B,
+                            const TCpuMatrix<Real_t> &A)
+{
+   const Real_t __restrict__ *dataA      = A.GetRawDataPointer();
+         Real_t __restrict__ *dataB      = B.GetRawDataPointer();
+
+   auto f = [&dataA, &dataB](const tbb::blocked_range<size_t> & range)
+   {
+      size_t rangeBegin = range.begin();
+      size_t rangeEnd   = range.end();
+
+      for (size_t i = rangeBegin; i != rangeEnd; ++i) {
+         dataB[i] *= dataA[i];
+      }
+   };
+
+   tbb::blocked_range<size_t> range(0, A.GetNElements());
+   parallel_for(range, f);
+}
+
+//____________________________________________________________________________
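+// Sums the entries of each column of A into B, implemented as the
+// matrix-vector product B = A^T * ones via BLAS xGEMV; used to accumulate
+// the bias gradients over a batch.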
+template<typename Real_t>
+void TCpu<Real_t>::SumColumns(TCpuMatrix<Real_t> &B,
+                              const TCpuMatrix<Real_t> &A)
+{
+   int m = (int) A.GetNrows();
+   int n = (int) A.GetNcols();
+   int inc = 1;
+
+   Real_t alpha = 1.0;
+   Real_t beta  = 0.0;
+   char   trans   = 'T';
+
+   const Real_t * APointer = A.GetRawDataPointer();
+         Real_t * BPointer = B.GetRawDataPointer();
+
+   ::TMVA::DNN::Blas::Gemv(&trans, &m, &n, &alpha, APointer, &m,
+                           TCpuMatrix<Real_t>::GetOnePointer(), &inc,
+                           &beta, BPointer, &inc);
+}
+
+//____________________________________________________________________________
+template<typename Real_t>
+void TCpu<Real_t>::ScaleAdd(TCpuMatrix<Real_t> &B,
+                            const TCpuMatrix<Real_t> &A,
+                            Real_t alpha)
+{
+   int n = (int) (A.GetNcols() * A.GetNrows());
+   int inc = 1;
+
+   const Real_t *x = A.GetRawDataPointer();
+   Real_t *y = B.GetRawDataPointer();
+
+   ::TMVA::DNN::Blas::Axpy(&n, &alpha, x, &inc, y, &inc);
+}
+
+//____________________________________________________________________________
+template<typename Real_t>
+void TCpu<Real_t>::Copy(TCpuMatrix<Real_t> &B,
+                        const TCpuMatrix<Real_t> &A)
+{
+   auto f = [](Real_t x) {return x;};
+   B.MapFrom(f, A);
+}
+
+} // DNN
+} // TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/CpuBuffer.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/CpuBuffer.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..c28a738ef9a40f6879085ff689f0cccd4a614ecd
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cpu/CpuBuffer.cxx
@@ -0,0 +1,255 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 12/08/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////
+// CPU Buffer interface class for the generic data loader. //
+/////////////////////////////////////////////////////////////
+
+#include <vector>
+#include <memory>
+#include "TMVA/DNN/DataLoader.h"
+#include "TMVA/DNN/Architectures/Cpu.h"
+#include "Rtypes.h"
+#include <iostream>
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+template<typename AReal>
+void TCpuBuffer<AReal>::TDestructor::operator()(AReal ** pointer)
+{
+   delete[] * pointer;
+   delete[] pointer;
+}
+
+//______________________________________________________________________________
+template<typename AReal>
+TCpuBuffer<AReal>::TCpuBuffer(size_t size)
+    : fSize(size), fOffset(0)
+{
+   AReal ** pointer = new AReal * [1];
+   * pointer        = new AReal[size];
+   fBuffer          = std::shared_ptr<AReal *>(pointer, fDestructor);
+}
+
+//______________________________________________________________________________
+template<typename AReal>
+TCpuBuffer<AReal> TCpuBuffer<AReal>::GetSubBuffer(size_t offset, size_t size)
+{
+   TCpuBuffer buffer = *this;
+   buffer.fOffset = offset;
+   buffer.fSize   = size;
+   return buffer;
+}
+
+//______________________________________________________________________________
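+// Note that CopyFrom and CopyTo do not perform a deep copy: both exchange
+// the underlying heap buffers of the two TCpuBuffer objects via std::swap
+// instead of copying element by element.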
+template<typename AReal>
+void TCpuBuffer<AReal>::CopyFrom(TCpuBuffer & other)
+{
+   std::swap(*this->fBuffer, *other.fBuffer);
+}
+
+//______________________________________________________________________________
+template<typename AReal>
+void TCpuBuffer<AReal>::CopyTo(TCpuBuffer & other)
+{
+   std::swap(*this->fBuffer, *other.fBuffer);
+}
+
+//______________________________________________________________________________
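+// The CopyInput/CopyOutput specializations below fill the flat buffer in
+// column-major batch layout: feature j of batch event i is written to
+// buffer[j * batchSize + i]. For classification data without regression
+// targets the single output column holds 1 for signal (class 0) and 0
+// otherwise.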
+template<>
+void TDataLoader<MatrixInput_t, TCpu<Real_t>>::CopyInput(
+    TCpuBuffer<Real_t> & buffer,
+    IndexIterator_t sampleIterator,
+    size_t batchSize)
+{
+   const TMatrixT<Real_t> &inputMatrix  = std::get<0>(fData);
+   size_t n = inputMatrix.GetNcols();
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = *sampleIterator;
+      for (size_t j = 0; j < n; j++) {
+         size_t bufferIndex = j * batchSize + i;
+         buffer[bufferIndex] = static_cast<Real_t>(inputMatrix(sampleIndex, j));
+      }
+      sampleIterator++;
+   }
+}
+
+//______________________________________________________________________________
+template<>
+void TDataLoader<MatrixInput_t, TCpu<Real_t>>::CopyOutput(
+    TCpuBuffer<Real_t> & buffer,
+    IndexIterator_t sampleIterator,
+    size_t batchSize)
+{
+   const TMatrixT<Real_t> &outputMatrix  = std::get<1>(fData);
+   size_t n = outputMatrix.GetNcols();
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = *sampleIterator;
+      for (size_t j = 0; j < n; j++) {
+         size_t bufferIndex = j * batchSize + i;
+         buffer[bufferIndex] = static_cast<Real_t>(outputMatrix(sampleIndex, j));
+      }
+      sampleIterator++;
+   }
+}
+
+//______________________________________________________________________________
+template<>
+void TDataLoader<MatrixInput_t, TCpu<Double_t>>::CopyInput(
+    TCpuBuffer<Double_t> & buffer,
+    IndexIterator_t sampleIterator,
+    size_t batchSize)
+{
+   const TMatrixT<Double_t> &inputMatrix  = std::get<0>(fData);
+   size_t n = inputMatrix.GetNcols();
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = *sampleIterator;
+      for (size_t j = 0; j < n; j++) {
+         size_t bufferIndex = j * batchSize + i;
+         buffer[bufferIndex] = inputMatrix(sampleIndex, j);
+      }
+      sampleIterator++;
+   }
+}
+
+//______________________________________________________________________________
+template<>
+void TDataLoader<MatrixInput_t, TCpu<Double_t>>::CopyOutput(
+    TCpuBuffer<Double_t> & buffer,
+    IndexIterator_t sampleIterator,
+    size_t batchSize)
+{
+   const TMatrixT<Double_t> &outputMatrix  = std::get<1>(fData);
+   size_t n = outputMatrix.GetNcols();
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = *sampleIterator;
+      for (size_t j = 0; j < n; j++) {
+         size_t bufferIndex = j * batchSize + i;
+         buffer[bufferIndex] = outputMatrix(sampleIndex, j);
+      }
+      sampleIterator++;
+   }
+}
+
+//______________________________________________________________________________
+template<>
+void TDataLoader<TMVAInput_t, TCpu<Double_t>>::CopyInput(
+    TCpuBuffer<Double_t> & buffer,
+    IndexIterator_t sampleIterator,
+    size_t batchSize)
+{
+   Event * event  = fData.front();
+   size_t n  = event->GetNVariables();
+
+   // Copy input variables.
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = * sampleIterator++;
+      event = fData[sampleIndex];
+      for (size_t j = 0; j < n; j++) {
+         size_t bufferIndex = j * batchSize + i;
+         buffer[bufferIndex] = event->GetValue(j);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<>
+void TDataLoader<TMVAInput_t, TCpu<Double_t>>::CopyOutput(
+    TCpuBuffer<Double_t> & buffer,
+    IndexIterator_t sampleIterator,
+    size_t batchSize)
+{
+   Event * event  = fData.front();
+   size_t n       = (event->GetNTargets() == 0) ? 1 : event->GetNTargets();
+
+   // Copy target(s).
+
+   for (size_t i = 0; i < batchSize; i++) {
+       size_t sampleIndex = * sampleIterator++;
+       event = fData[sampleIndex];
+      for (size_t j = 0; j < n; j++) {
+         // Copy output matrices.
+         size_t bufferIndex = j * batchSize + i;
+         if (event->GetNTargets() == 0) {
+            buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
+         } else {
+            buffer[bufferIndex] = event->GetTarget(j);
+         }
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<>
+void TDataLoader<TMVAInput_t, TCpu<Real_t>>::CopyInput(
+    TCpuBuffer<Real_t> & buffer,
+    IndexIterator_t sampleIterator,
+    size_t batchSize)
+{
+   Event * event  = fData.front();
+   size_t n  = event->GetNVariables();
+
+   // Copy input variables.
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = * sampleIterator++;
+      event = fData[sampleIndex];
+      for (size_t j = 0; j < n; j++) {
+         size_t bufferIndex = j * batchSize + i;
+         buffer[bufferIndex] = static_cast<Real_t>(event->GetValue(j));
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<>
+void TDataLoader<TMVAInput_t, TCpu<Real_t>>::CopyOutput(
+    TCpuBuffer<Real_t> & buffer,
+    IndexIterator_t sampleIterator,
+    size_t batchSize)
+{
+   Event * event  = fData.front();
+   size_t n       = (event->GetNTargets() == 0) ? 1 : event->GetNTargets();
+
+   // Copy target(s).
+
+   for (size_t i = 0; i < batchSize; i++) {
+       size_t sampleIndex = * sampleIterator++;
+       event = fData[sampleIndex];
+      for (size_t j = 0; j < n; j++) {
+         // Copy output matrices.
+         size_t bufferIndex = j * batchSize + i;
+         if (event->GetNTargets() == 0) {
+            buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
+         } else {
+            buffer[bufferIndex] = static_cast<Real_t>(event->GetTarget(j));
+         }
+      }
+   }
+}
+
+// Explicit instantiations.
+template class TCpuBuffer<Double_t>;
+template class TCpuBuffer<Real_t>;
+
+} // namespace DNN
+} // namespace TMVA
+
+
diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/CpuMatrix.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/CpuMatrix.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..f21d6832031b3901a6f0f43e98d6274f3f5c6b2a
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cpu/CpuMatrix.cxx
@@ -0,0 +1,87 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 19/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////
+// Implementation of the TCpuMatrix class. //
+/////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cpu/CpuMatrix.h"
+
+namespace TMVA {
+namespace DNN  {
+
+template<typename AReal>
+std::vector<AReal> TCpuMatrix<AReal>::fOnes{};
+
+//____________________________________________________________________________
+template<typename AReal>
+TCpuMatrix<AReal>::TCpuMatrix(size_t nRows, size_t nCols)
+    : fBuffer(nRows * nCols), fNCols(nCols), fNRows(nRows)
+{
+   Initialize();
+}
+
+//____________________________________________________________________________
+template<typename AReal>
+TCpuMatrix<AReal>::TCpuMatrix(const TMatrixT<Double_t> & B)
+    : fBuffer(B.GetNoElements()), fNCols(B.GetNcols()), fNRows(B.GetNrows())
+{
+   Initialize();
+   for (size_t j = 0; j < fNCols; j++) {
+      for (size_t i = 0; i < fNRows; i++) {
+          (*this)(i,j) = B(i,j);
+      }
+   }
+}
+
+//____________________________________________________________________________
+template<typename AReal>
+TCpuMatrix<AReal>::TCpuMatrix(const TCpuBuffer<AReal> & buffer,
+                               size_t m,
+                               size_t n)
+    : fBuffer(buffer), fNCols(n), fNRows(m)
+{
+   Initialize();
+}
+
+//____________________________________________________________________________
+template<typename AReal>
+TCpuMatrix<AReal>::operator TMatrixT<Double_t>() const
+{
+   TMatrixT<Double_t> B(fNRows, fNCols);
+
+   for (size_t j = 0; j < fNCols; j++) {
+      for (size_t i = 0; i < fNRows; i++) {
+         B(i,j) = (*this)(i, j);
+      }
+   }
+   return B;
+}
+
+
+//____________________________________________________________________________
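+// Initialize() only ensures that the static fOnes vector contains at least
+// fNRows ones; its data pointer (GetOnePointer) serves as the constant ones
+// vector in the BLAS calls, e.g. for bias additions and column sums.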
+template<typename AReal>
+void TCpuMatrix<AReal>::Initialize()
+{
+   if (fNRows > fOnes.size()) {
+      fOnes.reserve(fNRows);
+      for (size_t i = fOnes.size(); i < fNRows; i++) {
+         fOnes.push_back(1.0);
+      }
+   }
+}
+
+// Explicit instantiations.
+template class TCpuMatrix<Real_t>;
+template class TCpuMatrix<Double_t>;
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/DataLoader.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/DataLoader.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..448000c546d764867b74c51c37bc007e5e215f26
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cpu/DataLoader.cxx
@@ -0,0 +1,209 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 21/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////
+// Implementation of the DataLoader for the multi-threaded      //
+// CPU implementation of DNNs.                                  //
+//////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cpu/DataLoader.h"
+#include "TMVA/Event.h"
+#include <iostream>
+
+namespace TMVA
+{
+namespace DNN
+{
+
+// TCpuBatchIterator
+//______________________________________________________________________________
+template<typename Data_t, typename Real_t>
+TCpuBatchIterator<Data_t, Real_t>::TCpuBatchIterator(
+    TCpuDataLoader<Data_t, Real_t> & dataLoader,
+    size_t batchIndex)
+    : fDataLoader(dataLoader), fBatchIndex(batchIndex)
+{
+    // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Data_t, typename Real_t>
+TCpuBatch<Real_t> TCpuBatchIterator<Data_t, Real_t>::operator*()
+{
+   return fDataLoader.GetBatch(fBatchIndex);
+}
+
+//______________________________________________________________________________
+template<typename Data_t, typename Real_t>
+TCpuBatchIterator<Data_t, Real_t> & TCpuBatchIterator<Data_t, Real_t>::operator++()
+{
+    fBatchIndex++;
+    return *this;
+}
+
+//______________________________________________________________________________
+template<typename Data_t, typename Real_t>
+bool TCpuBatchIterator<Data_t, Real_t>::operator!=(const TCpuBatchIterator & other)
+{
+    return fBatchIndex != other.GetBatchIndex();
+}
+
+//______________________________________________________________________________
+template<typename Data_t, typename Real_t>
+bool TCpuBatchIterator<Data_t, Real_t>::operator==(const TCpuBatchIterator & other)
+{
+    return fBatchIndex == other.GetBatchIndex();
+}
+
+// TCpuDataLoader
+//______________________________________________________________________________
+template<typename Data_t, typename Real_t>
+TCpuDataLoader<Data_t, Real_t>::TCpuDataLoader(const Data_t &input,
+                                               size_t nsamples,
+                                               size_t batchSize,
+                                               size_t ninputFeatures,
+                                               size_t noutputFeatures,
+                                               size_t bufferSize)
+    : fInput(input), fNSamples(nsamples), fBatchSize(batchSize),
+      fBufferSize(bufferSize), fNInputFeatures(ninputFeatures),
+      fNOutputFeatures(noutputFeatures), fNBatches(nsamples / batchSize),
+      fInputMatrices(), fOutputMatrices(), fSampleIndices()
+{
+   fInputMatrices.reserve(fBufferSize);
+   fOutputMatrices.reserve(fBufferSize);
+   for (size_t i = 0; i < fBufferSize; i++) {
+      fInputMatrices.emplace_back(fBatchSize, fNInputFeatures);
+      fOutputMatrices.emplace_back(fBatchSize, fNOutputFeatures);
+   }
+
+   fSampleIndices.reserve(fNBatches);
+   for (size_t i = 0; i < fNSamples; i++) {
+      fSampleIndices.emplace_back(i);
+   }
+}
+
+//______________________________________________________________________________
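+// CopyData refills the ring of fBufferSize input/output matrix pairs:
+// batches [batchIndex, batchIndex + fBufferSize) are copied in parallel
+// with tbb, one batch per task. GetBatch below triggers such a refill
+// whenever the requested batch index wraps around the buffer.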
+template<typename Data_t, typename Real_t>
+inline void TCpuDataLoader<Data_t, Real_t>::CopyData(size_t batchIndex)
+{
+   auto copy = [this](const tbb::blocked_range<size_t> & range)
+   {
+      size_t rangeBegin = range.begin();
+      size_t rangeEnd   = range.end();
+      size_t sampleIndex = rangeBegin * this->fBatchSize;
+
+      for (size_t batchIndex = rangeBegin; batchIndex != rangeEnd; ++batchIndex) {
+         CopyBatch(this->fInputMatrices[batchIndex % this->fBufferSize],
+                   this->fOutputMatrices[batchIndex % this->fBufferSize],
+                   this->fInput,
+                   this->fSampleIndices.begin() + sampleIndex,
+                   this->fSampleIndices.begin() + sampleIndex + this->fBatchSize);
+         sampleIndex += this->fBatchSize;
+      }
+   };
+
+   size_t end   = std::min(batchIndex + fBufferSize, fNBatches);
+   size_t start = batchIndex;
+   tbb::blocked_range<size_t> range(start, end);
+   parallel_for(range, copy);
+}
+
+//______________________________________________________________________________
+template<typename Data_t, typename Real_t>
+TCpuBatch<Real_t> TCpuDataLoader<Data_t, Real_t>::GetBatch(size_t batchIndex)
+{
+   size_t bufferIndex = batchIndex % fBufferSize;
+   if (bufferIndex == 0) {
+      CopyData(batchIndex);
+   }
+   return TCpuBatch<Real_t>(fInputMatrices[bufferIndex],
+                            fOutputMatrices[bufferIndex]);
+}
+
+//______________________________________________________________________________
+template<typename Data_t, typename Real_t>
+auto TCpuDataLoader<Data_t, Real_t>::begin()
+    -> BatchIterator_t
+{
+   random_shuffle(fSampleIndices.begin(), fSampleIndices.end());
+   return BatchIterator_t(*this, 0);
+}
+
+//______________________________________________________________________________
+template<typename Data_t, typename Real_t>
+auto TCpuDataLoader<Data_t, Real_t>::end()
+    -> BatchIterator_t
+{
+   return BatchIterator_t(*this, fNBatches);
+}
+
+//______________________________________________________________________________
+template <>
+void TCpuDataLoader<MatrixInput_t, Double_t>::CopyBatch(
+    Matrix_t &inputMatrix,
+    Matrix_t &outputMatrix,
+    const MatrixInput_t &input,
+    IndexIterator_t indexBegin,
+    IndexIterator_t indexEnd)
+{
+   auto &in  = std::get<0>(input);
+   auto &out = std::get<1>(input);
+
+   size_t batchIndex = 0;
+   for (IndexIterator_t i = indexBegin; i != indexEnd; i++) {
+      size_t index = *i;
+      for (size_t j = 0; j < (size_t) in.GetNcols(); j++) {
+         inputMatrix(batchIndex, j) = in(index, j);
+      }
+      for (size_t j = 0; j < (size_t) out.GetNcols(); j++) {
+         outputMatrix(batchIndex, j) = out(index, j);
+      }
+      batchIndex++;
+   }
+}
+
+//______________________________________________________________________________
+template <>
+void TCpuDataLoader<TMVAInput_t, Double_t>::CopyBatch(
+    Matrix_t &inputMatrix,
+    Matrix_t &outputMatrix,
+    const TMVAInput_t &input,
+    IndexIterator_t indexBegin,
+    IndexIterator_t indexEnd)
+{
+   size_t batchIndex = 0;
+   for (IndexIterator_t i = indexBegin; i != indexEnd; i++) {
+      size_t index = *i;
+      Event *event = input.at(index);
+      for (size_t j = 0; j < event->GetNVariables(); j++) {
+         inputMatrix(batchIndex, j) = event->GetValue(j);
+      }
+      if (event->GetNTargets() > 0) {
+         for (size_t j = 0; j < event->GetNTargets(); j++) {
+            outputMatrix(batchIndex, j) = event->GetTarget(j);
+         }
+      } else {
+         outputMatrix(batchIndex, 0) = (event->GetClass() == 0) ? 1.0 : 0.0;
+      }
+      batchIndex++;
+   }
+}
+
+// Explicit instantiation.
+//______________________________________________________________________________
+template class TCpuDataLoader<MatrixInput_t, Double_t>;
+template class TCpuDataLoader<TMVAInput_t, Double_t>;
+template class TCpuBatchIterator<MatrixInput_t, Double_t>;
+template class TCpuBatchIterator<TMVAInput_t, Double_t>;
+template class TCpuBatch<Double_t>;
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/Dropout.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/Dropout.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..ff81d034d24575b9f7b95f8ad662283e2473ee50
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cpu/Dropout.cxx
@@ -0,0 +1,47 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 21/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+#include "TMVA/DNN/Architectures/Cpu.h"
+#include "TRandom.h"
+
+/////////////////////////////////////////////////////////////////////
+// Implementation of Dropout for multi-threaded CPU architectures. //
+/////////////////////////////////////////////////////////////////////
+
+namespace TMVA {
+namespace DNN  {
+
+//____________________________________________________________________________
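+// Inverted dropout: each element is kept with probability
+// dropoutProbability and scaled by 1 / dropoutProbability, so that the
+// expected activation stays unchanged; dropped elements are set to zero.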
+template<typename AFloat>
+void TCpu<AFloat>::Dropout(TCpuMatrix<AFloat> &A,
+                           AFloat dropoutProbability)
+{
+   AFloat __restrict__ *data = A.GetRawDataPointer();
+
+   auto fRange = [&data, dropoutProbability](const tbb::blocked_range<size_t> & range)
+   {
+      size_t rangeBegin = range.begin();
+      size_t rangeEnd   = range.end();
+
+      TRandom rand(time(nullptr) + rangeBegin);
+
+      for (size_t i = rangeBegin; i != rangeEnd; ++i) {
+          AFloat r = rand.Uniform();
+          data[i] = (r > dropoutProbability) ? 0.0 : data[i] / dropoutProbability;
+      }
+   };
+
+   tbb::blocked_range<size_t> range(0, A.GetNElements());
+   parallel_for(range, fRange);
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/Initialization.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/Initialization.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..4875d4580f944751b29b456e16b3b944dbd1822c
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cpu/Initialization.cxx
@@ -0,0 +1,98 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 21/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////
+ // Implementation of the DNN initialization methods for the //
+ // multi-threaded CPU backend.                              //
+ //////////////////////////////////////////////////////////////
+
+#include "TRandom.h"
+#include "TMVA/DNN/Architectures/Cpu.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
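+// Gaussian weight initialization: entries are drawn from a zero-mean
+// normal distribution with sigma = sqrt(2 / n), where n = A.GetNcols() is
+// the fan-in of each unit.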
+template<typename AFloat>
+void TCpu<AFloat>::InitializeGauss(TCpuMatrix<AFloat> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   TRandom rand(time(nullptr));
+
+   AFloat sigma = sqrt(2.0 / ((AFloat) n));
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         A(i,j) = rand.Gaus(0.0, sigma);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::InitializeUniform(TCpuMatrix<AFloat> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   TRandom rand(time(nullptr));
+
+   AFloat range = sqrt(2.0 / ((AFloat) n));
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         A(i,j) = rand.Uniform(-range, range);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::InitializeIdentity(TCpuMatrix<AFloat> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n ; j++) {
+         A(i,j) = 0.0;
+      }
+
+      if (i < n) {
+         A(i,i) = 1.0;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::InitializeZero(TCpuMatrix<AFloat> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n ; j++) {
+         A(i,j) = 0.0;
+      }
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/LossFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/LossFunctions.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..bad4155c9cf4dcffd5d89978ba8c6446e152971e
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cpu/LossFunctions.cxx
@@ -0,0 +1,147 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 20/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ /////////////////////////////////////////////////////////////////////
+ // Implementation of the loss functions for the multi-threaded CPU //
+ // implementation using tbb and BLAS.                              //
+ /////////////////////////////////////////////////////////////////////
+
+#include "tbb/tbb.h"
+#include "TMVA/DNN/Architectures/Reference.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
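+// Mean squared error over the whole batch,
+//    MSE = 1 / (m * n) * sum_{i,j} (Y(i,j) - output(i,j))^2,
+// evaluated as a tbb::parallel_reduce over the flat data arrays.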
+template<typename AFloat>
+AFloat TCpu<AFloat>::MeanSquaredError(const TCpuMatrix<AFloat> &Y,
+                                      const TCpuMatrix<AFloat> &output)
+{
+   const AFloat __restrict__ *dataY      = Y.GetRawDataPointer();
+   const AFloat __restrict__ *dataOutput = output.GetRawDataPointer();
+
+   auto f = [&dataY, &dataOutput](const tbb::blocked_range<size_t> & range,
+                                  AFloat partialSum)
+   {
+      size_t rangeBegin = range.begin();
+      size_t rangeEnd   = range.end();
+
+      AFloat sum = partialSum;
+      for (size_t i = rangeBegin; i != rangeEnd; ++i) {
+          AFloat error = dataY[i] - dataOutput[i];
+          sum += error * error;
+      }
+
+      return sum;
+   };
+
+   auto reduction = [](AFloat sum1, AFloat sum2)
+   {
+      return sum1 + sum2;
+   };
+
+   AFloat norm = 1.0 / ((AFloat) Y.GetNcols() * Y.GetNrows());
+   tbb::blocked_range<size_t> range(0, Y.GetNElements());
+   return norm * parallel_reduce(range, 0.0, f, reduction);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::MeanSquaredErrorGradients(
+    TCpuMatrix<AFloat> & dY,
+    const TCpuMatrix<AFloat> & Y,
+    const TCpuMatrix<AFloat> & output)
+{
+
+         AFloat __restrict__ *dataDY     = dY.GetRawDataPointer();
+   const AFloat __restrict__ *dataY      = Y.GetRawDataPointer();
+   const AFloat __restrict__ *dataOutput = output.GetRawDataPointer();
+   AFloat norm = 1.0 / ((AFloat) Y.GetNrows() * Y.GetNcols());
+
+   auto f = [&dataDY, &dataY, &dataOutput, norm](const tbb::blocked_range<size_t> & range)
+   {
+      size_t rangeBegin = range.begin();
+      size_t rangeEnd   = range.end();
+
+      for (size_t i = rangeBegin; i != rangeEnd; ++i) {
+         dataDY[i] = - 2.0 * norm * (dataY[i] - dataOutput[i]);
+      }
+   };
+
+   tbb::blocked_range<size_t> range(0, Y.GetNElements());
+   parallel_for(range, f);
+}
+
+//______________________________________________________________________________
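+// Sigmoid cross entropy: with sig = 1 / (1 + exp(-output)),
+//    CE = -1 / (m * n) * sum_{i,j} [ Y * log(sig) + (1 - Y) * log(1 - sig) ],
+// i.e. the output matrix is expected to hold the pre-sigmoid activations.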
+template<typename AFloat>
+AFloat TCpu<AFloat>::CrossEntropy(const TCpuMatrix<AFloat> &Y,
+                                  const TCpuMatrix<AFloat> &output)
+{
+   const AFloat __restrict__ *dataY      = Y.GetRawDataPointer();
+   const AFloat __restrict__ *dataOutput = output.GetRawDataPointer();
+
+   auto f = [&dataY, &dataOutput](const tbb::blocked_range<size_t> & range,
+                                  AFloat partialSum)
+   {
+      size_t rangeBegin = range.begin();
+      size_t rangeEnd   = range.end();
+
+      AFloat sum = partialSum;
+      for (size_t i = rangeBegin; i != rangeEnd; ++i) {
+         AFloat y   = dataY[i];
+         AFloat sig = 1.0 / (1.0 + exp(- dataOutput[i]));
+         sum += y * log(sig) + (1.0 - y) * log(1.0 - sig);
+      }
+      return sum;
+   };
+
+   auto reduction = [](AFloat sum1, AFloat sum2)
+   {
+      return sum1 + sum2;
+   };
+
+   tbb::blocked_range<size_t> range(0, Y.GetNElements());
+   AFloat norm = 1.0 / ((AFloat) Y.GetNcols() * Y.GetNrows());
+   return - norm * parallel_reduce(range, 0.0, f, reduction);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::CrossEntropyGradients(
+    TCpuMatrix<AFloat> & dY,
+    const TCpuMatrix<AFloat> & Y,
+    const TCpuMatrix<AFloat> & output)
+{
+         AFloat __restrict__ *dataDY     = dY.GetRawDataPointer();
+   const AFloat __restrict__ *dataY      = Y.GetRawDataPointer();
+   const AFloat __restrict__ *dataOutput = output.GetRawDataPointer();
+   AFloat norm = 1.0 / ((AFloat) Y.GetNrows() * Y.GetNcols());
+
+   auto f = [&dataDY, &dataY, &dataOutput, norm](const tbb::blocked_range<size_t> & range)
+   {
+      size_t rangeBegin = range.begin();
+      size_t rangeEnd   = range.end();
+
+      for (size_t i = rangeBegin; i != rangeEnd; ++i) {
+         AFloat y   = dataY[i];
+         AFloat sig = 1.0 / (1.0 + exp(- dataOutput[i]));
+         dataDY[i] = norm * (sig - y);
+      }
+   };
+
+   tbb::blocked_range<size_t> range(0, Y.GetNElements());
+   parallel_for(range, f);
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/OutputFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/OutputFunctions.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..4c3fd3994f7b9bf847e97275ff00c160b969d343
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cpu/OutputFunctions.cxx
@@ -0,0 +1,33 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 21/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////
+// Implementation of output functions for multi-threaded CPU //
+// architectures.                                            //
+///////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cpu.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+template<typename AFloat>
+void TCpu<AFloat>::Sigmoid(TCpuMatrix<AFloat> & B,
+                           const TCpuMatrix<AFloat> & A)
+{
+   auto f = [](AFloat x) {return 1.0 / (1.0 + exp(-x));};
+   B.MapFrom(f, A);
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/Propagation.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/Propagation.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..ef6e7a4168418d3135830666c0370d754ccf4135
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cpu/Propagation.cxx
@@ -0,0 +1,94 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Implementation of the functions required for the forward and     //
+// backward propagation of activations through a neural network for //
+// the multi-threaded CPU implementation.                           //
+//////////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cpu.h"
+#include "TMVA/DNN/Architectures/Cpu/Blas.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
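+// Forward propagation through a fully connected layer: the batch of input
+// activations (one event per row) is multiplied with the transposed weight
+// matrix, output = input * Weights^T, via BLAS xGEMM. AddRowWise below then
+// adds the bias vector to every row of the result as a rank-1 update (xGER
+// with the constant ones vector).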
+template<typename AFloat>
+void TCpu<AFloat>::MultiplyTranspose(TCpuMatrix<AFloat> &output,
+                                     const TCpuMatrix<AFloat> &input,
+                                     const TCpuMatrix<AFloat> &Weights)
+{
+    int m = (int) input.GetNrows();
+    int k = (int) input.GetNcols();
+    int n = (int) Weights.GetNrows();
+
+    char transa = 'N';
+    char transb = 'T';
+
+    AFloat alpha = 1.0;
+    AFloat beta  = 0.0;
+
+    const AFloat *A = input.GetRawDataPointer();
+    const AFloat *B = Weights.GetRawDataPointer();
+          AFloat *C = output.GetRawDataPointer();
+
+    ::TMVA::DNN::Blas::Gemm(&transa, &transb, &m, &n, &k, &alpha,
+                            A, &m, B, &n, &beta, C, &m);
+}
+
+template<typename AFloat>
+void TCpu<AFloat>::AddRowWise(
+    TCpuMatrix<AFloat> &output,
+    const TCpuMatrix<AFloat> &biases)
+{
+    int m = (int) output.GetNrows();
+    int n = (int) output.GetNcols();
+
+    int inc = 1;
+    AFloat alpha = 1.0;
+
+          AFloat * A = output.GetRawDataPointer();
+    const AFloat * x = TCpuMatrix<AFloat>::GetOnePointer();
+    const AFloat * y = biases.GetRawDataPointer();
+
+    ::TMVA::DNN::Blas::Ger(&m, &n, &alpha, x, &inc, y, &inc, A, &m);
+}
+
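+// Backward propagation through a single layer. On entry df holds the
+// activation function derivatives f'(Z); it is first multiplied
+// element-wise with the incoming activation gradients, after which
+//    activationGradientsBackward = df * weights,
+//    weightGradients             = df^T * activationsBackward,
+//    biasGradients               = column sums of df.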
+template<typename AFloat>
+void TCpu<AFloat>::Backward(
+    TCpuMatrix<AFloat> & activationGradientsBackward,
+    TCpuMatrix<AFloat> & weightGradients,
+    TCpuMatrix<AFloat> & biasGradients,
+    TCpuMatrix<AFloat> & df,
+    const TCpuMatrix<AFloat> & activationGradients,
+    const TCpuMatrix<AFloat> & weights,
+    const TCpuMatrix<AFloat> & activationsBackward)
+{
+   // Compute element-wise product.
+   Hadamard(df, activationGradients);
+
+   // Activation gradients.
+   if (activationGradientsBackward.GetNElements() > 0)
+       Multiply(activationGradientsBackward, df, weights);
+
+   // Weight gradients.
+   if (weightGradients.GetNElements() > 0)
+       TransposeMultiply(weightGradients, df, activationsBackward);
+
+   // Bias gradients.
+   if (biasGradients.GetNElements() > 0)
+       SumColumns(biasGradients, df);
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/Regularization.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/Regularization.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..404464d10de4085a296263ac242e12597352fdc8
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cpu/Regularization.cxx
@@ -0,0 +1,133 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 21/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////////
+// Implementation of the regularization functionals and gradients //
+// for the multi-threaded CPU implementation using tbb.           //
+////////////////////////////////////////////////////////////////////
+
+#include "tbb/tbb.h"
+#include "TMVA/DNN/Architectures/Reference.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
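+// L1 and L2 regularization terms and their gradient contributions:
+//    R_L1 = sum |w|,    dR_L1/dw = weightDecay * sign(w),
+//    R_L2 = sum w^2,    dR_L2/dw = 2 * weightDecay * w,
+// each computed with a tbb parallel reduction or parallel_for over the
+// flat weight array.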
+//______________________________________________________________________________
+template<typename AFloat>
+AFloat TCpu<AFloat>::L1Regularization(const TCpuMatrix<AFloat> &Weights)
+{
+   const AFloat __restrict__ *data = Weights.GetRawDataPointer();
+
+   auto f = [&data](const tbb::blocked_range<size_t> & range,
+                    AFloat partialSum)
+   {
+      size_t rangeBegin = range.begin();
+      size_t rangeEnd   = range.end();
+
+      AFloat sum = partialSum;
+      for (size_t i = rangeBegin; i != rangeEnd; ++i) {
+         sum += fabs(data[i]);
+      }
+      return sum;
+   };
+
+   auto reduction = [](AFloat sum1, AFloat sum2)
+   {
+      return sum1 + sum2;
+   };
+
+   tbb::blocked_range<size_t> range(0, Weights.GetNElements());
+   return parallel_reduce(range, 0.0, f, reduction);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::AddL1RegularizationGradients(
+    TCpuMatrix<AFloat> & B,
+    const TCpuMatrix<AFloat> & A,
+    AFloat weightDecay)
+{
+
+         AFloat __restrict__ *dataB     =  B.GetRawDataPointer();
+   const AFloat __restrict__ *dataA      = A.GetRawDataPointer();
+
+   auto f = [&dataA, &dataB, weightDecay](const tbb::blocked_range<size_t> & range)
+   {
+      size_t rangeBegin = range.begin();
+      size_t rangeEnd   = range.end();
+
+      for (size_t i = rangeBegin; i != rangeEnd; ++i) {
+         AFloat sign = (dataA[i] < 0.0) ? -1.0 : 1.0;
+         dataB[i] += weightDecay * sign;
+      }
+   };
+
+   tbb::blocked_range<size_t> range(0, A.GetNElements());
+   parallel_for(range, f);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+AFloat TCpu<AFloat>::L2Regularization(const TCpuMatrix<AFloat> &Weights)
+{
+   const AFloat __restrict__ *data = Weights.GetRawDataPointer();
+
+   auto f = [&data](const tbb::blocked_range<size_t> & range,
+                    AFloat partialSum)
+   {
+      size_t rangeBegin = range.begin();
+      size_t rangeEnd   = range.end();
+
+      AFloat sum = partialSum;
+      for (size_t i = rangeBegin; i != rangeEnd; ++i) {
+          sum += data[i] * data[i];
+      }
+      return sum;
+   };
+
+   auto reduction = [](AFloat sum1, AFloat sum2)
+   {
+      return sum1 + sum2;
+   };
+
+   tbb::blocked_range<size_t> range(0, Weights.GetNElements());
+   return parallel_reduce(range, 0.0, f, reduction);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::AddL2RegularizationGradients(
+    TCpuMatrix<AFloat> & B,
+    const TCpuMatrix<AFloat> & A,
+    AFloat weightDecay)
+{
+
+         AFloat __restrict__ *dataB     =  B.GetRawDataPointer();
+   const AFloat __restrict__ *dataA      = A.GetRawDataPointer();
+
+   auto f = [&dataA, &dataB, weightDecay](const tbb::blocked_range<size_t> & range)
+   {
+      size_t rangeBegin = range.begin();
+      size_t rangeEnd   = range.end();
+
+      for (size_t i = rangeBegin; i != rangeEnd; ++i) {
+         dataB[i] += 2.0 * weightDecay * dataA[i];
+      }
+   };
+
+   tbb::blocked_range<size_t> range(0, A.GetNElements());
+   parallel_for(range, f);
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda.cu b/tmva/tmva/src/DNN/Architectures/Cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7f791c613b487e279a18672c0ba9095bf4f9e291
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda.cu
@@ -0,0 +1,34 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////
+// Explicit instantiation of the TCuda architecture class for  //
+// the Double_t and Real_t floating point types.               //
+/////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "Cuda/Propagation.cu"
+#include "Cuda/Arithmetic.cu"
+#include "Cuda/ActivationFunctions.cu"
+#include "Cuda/OutputFunctions.cu"
+#include "Cuda/LossFunctions.cu"
+#include "Cuda/Regularization.cu"
+#include "Cuda/Initialization.cu"
+#include "Cuda/Dropout.cu"
+
+namespace TMVA {
+namespace DNN  {
+
+template class TCuda<Real_t>;
+template class TCuda<Double_t>;
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5654680158b38968fe624c4a969ba106c1294e5b
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu
@@ -0,0 +1,216 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////
+// Implementation of the activation functions for the TCuda     //
+// implementation of the low-level interface.                   //
+//////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "Kernels.cuh"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::IdentityDerivative(TCudaMatrix<AFloat> & B,
+                                           const TCudaMatrix<AFloat> & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::IdentityDerivative<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       (int) B.GetNrows(),
+       (int) B.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::Relu(TCudaMatrix<AFloat> & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Relu<<<gridDims, blockDims, 0, s>>>(
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::ReluDerivative(TCudaMatrix<AFloat> & B,
+                                       const TCudaMatrix<AFloat> & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::ReluDerivative<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::Sigmoid(TCudaMatrix<AFloat> & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Sigmoid<<<gridDims, blockDims, 0, s>>>(
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::SigmoidDerivative(TCudaMatrix<AFloat> & B,
+                                          const TCudaMatrix<AFloat> & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::SigmoidDerivative<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::Tanh(TCudaMatrix<AFloat> & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Tanh<<<gridDims, blockDims, 0, s>>>(
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::TanhDerivative(TCudaMatrix<AFloat> & B,
+                                       const TCudaMatrix<AFloat> & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::TanhDerivative<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::SymmetricRelu(TCudaMatrix<AFloat> & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::SymmetricRelu<<<gridDims, blockDims, 0, s>>>(
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::SymmetricReluDerivative(TCudaMatrix<AFloat> & B,
+                                                const TCudaMatrix<AFloat> & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::SymmetricReluDerivative<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::SoftSign(TCudaMatrix<AFloat> & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::SoftSign<<<gridDims, blockDims, 0, s>>>(
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::SoftSignDerivative(TCudaMatrix<AFloat> & B,
+                                           const TCudaMatrix<AFloat> & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::SoftSignDerivative<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::Gauss(TCudaMatrix<AFloat> & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Gauss<<<gridDims, blockDims, 0, s>>>(
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::GaussDerivative(TCudaMatrix<AFloat> & B,
+                                    const TCudaMatrix<AFloat> & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::GaussDerivative<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f61391687f5ed7bd1e0c6eaa741d416b7e6c7ddc
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu
@@ -0,0 +1,238 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Contains additional arithmetic functions required by the CUDA //
+// neural network implementation.                                //
+///////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "Kernels.cuh"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//____________________________________________________________________________
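+// TCudaMatrix stores its elements in column-major order, matching the
+// convention expected by cuBLAS; the leading dimensions passed to the gemm
+// calls below are therefore simply the row counts of the respective matrices.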
+template<>
+void TCuda<float>::Multiply(TCudaMatrix<float> &C,
+                             const TCudaMatrix<float> &A,
+                             const TCudaMatrix<float> &B)
+{
+   int m, n, k;
+   m = A.GetNrows();
+   k = A.GetNcols();
+   n = B.GetNcols();
+   float alpha = 1.0, beta = 0.0;
+
+   cudaStream_t s = A.GetComputeStream();
+   cublasSetStream(A.GetCublasHandle(), s);
+
+   // Compute C = beta * C + alpha * (A * B)
+   cublasSgemm(A.GetCublasHandle(),
+               CUBLAS_OP_N, CUBLAS_OP_N,
+               m, n, k, & alpha,
+               A.GetDataPointer(), m,   // *A, lda
+               B.GetDataPointer(), k,   // *B, ldb
+               & beta,                  // beta
+               C.GetDataPointer(), m);  // *C, ldc
+
+   C.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+template<>
+void TCuda<double>::Multiply(TCudaMatrix<double> &C,
+                             const TCudaMatrix<double> &A,
+                             const TCudaMatrix<double> &B)
+{
+   int m, n, k;
+   m = A.GetNrows();
+   k = A.GetNcols();
+   n = B.GetNcols();
+   double alpha = 1.0, beta = 0.0;
+
+   cudaStream_t s = A.GetComputeStream();
+   cublasSetStream(A.GetCublasHandle(), s);
+
+   // Compute C = beta * C + alpha * (A * B)
+   cublasDgemm(A.GetCublasHandle(),
+               CUBLAS_OP_N, CUBLAS_OP_N,
+               m, n, k, & alpha,
+               A.GetDataPointer(), m,   // *A, lda
+               B.GetDataPointer(), k,   // *B, ldb
+               & beta,                  // beta
+               C.GetDataPointer(), m);  // *C, ldc
+
+   C.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+template<>
+void TCuda<float>::TransposeMultiply(TCudaMatrix<float> & C,
+                                      const TCudaMatrix<float> & A,
+                                      const TCudaMatrix<float> & B)
+{
+   int m, n, k;
+   k = A.GetNrows();
+   m = A.GetNcols();
+   n = B.GetNcols();
+   float alpha = 1.0, beta = 0.0;
+
+   cudaStream_t s = A.GetComputeStream();
+   cublasSetStream(A.GetCublasHandle(), s);
+
+   // Compute C = beta * C + alpha * (A^T * B)
+   cublasSgemm(A.GetCublasHandle(),
+               CUBLAS_OP_T, CUBLAS_OP_N,
+               m, n, k, & alpha,
+               A.GetDataPointer(), k,     // *A, lda
+               B.GetDataPointer(), k,     // *B, ldb
+               & beta,                    // beta
+               C.GetDataPointer(), m);    // *C, ldc
+
+   C.SetComputeStream(s);
+}
+//____________________________________________________________________________
+template<>
+void TCuda<double>::TransposeMultiply(TCudaMatrix<double> & C,
+                                      const TCudaMatrix<double> & A,
+                                      const TCudaMatrix<double> & B)
+{
+   int m, n, k;
+   k = A.GetNrows();
+   m = A.GetNcols();
+   n = B.GetNcols();
+   double alpha = 1.0, beta = 0.0;
+
+   cudaStream_t s = A.GetComputeStream();
+   cublasSetStream(A.GetCublasHandle(), s);
+
+   // Compute C = beta * C + alpha * (A^T * B)
+   cublasDgemm(A.GetCublasHandle(),
+               CUBLAS_OP_T, CUBLAS_OP_N,
+               m, n, k, & alpha,
+               A.GetDataPointer(), k,     // *A, lda
+               B.GetDataPointer(), k,     // *B, ldb
+               & beta,                    // beta
+               C.GetDataPointer(), m);    // *C, ldc
+
+   C.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::Hadamard(TCudaMatrix<AFloat> & B,
+                             const TCudaMatrix<AFloat> &A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Hadamard<<<gridDims, blockDims, 0, s>>>(B.GetDataPointer(),
+                                                              A.GetDataPointer(),
+                                                              A.GetNrows(),
+                                                              A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+AFloat TCuda<AFloat>::Sum(const TCudaMatrix<AFloat> & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+
+   TCudaMatrix<AFloat>::ResetDeviceReturn();
+   ::TMVA::DNN::Cuda::ReduceMatrix<<<gridDims, blockDims, 0, s>>>(
+       TCudaMatrix<AFloat>::GetDeviceReturnPointer(),
+       A.GetDataPointer(),
+       A.GetNrows(),
+       A.GetNcols());
+   return TCudaMatrix<AFloat>::GetDeviceReturn();
+}
+
+//____________________________________________________________________________
+template<>
+void TCuda<float>::SumColumns(TCudaMatrix<float> & B,
+                               const TCudaMatrix<float> & A)
+{
+   int m, n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+   float alpha = 1.0, beta = 0.0;
+
+   cudaStream_t s = A.GetComputeStream();
+   cublasSetStream(A.GetCublasHandle(), s);
+
+   // Compute B = beta * B + alpha * (A^T * ones), i.e. the column sums of A
+   cublasSgemv(A.GetCublasHandle(), CUBLAS_OP_T,
+               m, n, & alpha,
+               A.GetDataPointer(), m,             // *A, lda
+               TCudaMatrix<float>::GetOnes(), 1, // *x, incx
+               & beta, B.GetDataPointer(), 1);    // beta, *y, incy
+
+   B.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+template<>
+void TCuda<double>::SumColumns(TCudaMatrix<double> & B,
+                               const TCudaMatrix<double> & A)
+{
+   int m, n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+   double alpha = 1.0, beta = 0.0;
+
+   cudaStream_t s = A.GetComputeStream();
+   cublasSetStream(A.GetCublasHandle(), s);
+
+   // Compute B = beta * B + alpha * (A^T * ones), i.e. the column sums of A
+   cublasDgemv(A.GetCublasHandle(), CUBLAS_OP_T,
+               m, n, & alpha,
+               A.GetDataPointer(), m,             // *A, lda
+               TCudaMatrix<double>::GetOnes(), 1, // *x, incx
+               & beta, B.GetDataPointer(), 1);    // beta, *y, incy
+
+   B.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+template<>
+void TCuda<float>::ScaleAdd(TCudaMatrix<float> & B,
+                            const TCudaMatrix<float> & A,
+                            float alpha)
+{
+   cudaStream_t s = 0;
+   cublasSetStream(A.GetCublasHandle(), s);
+   cublasSaxpy(A.GetCublasHandle(), A.GetNoElements(), &alpha,
+               A.GetDataPointer(), 1,
+               B.GetDataPointer(), 1);
+}
+
+//____________________________________________________________________________
+template<>
+void TCuda<double>::ScaleAdd(TCudaMatrix<double> & B,
+                             const TCudaMatrix<double> & A,
+                             double alpha)
+{
+   cudaStream_t s = 0;
+   cublasSetStream(A.GetCublasHandle(), s);
+   cublasDaxpy(A.GetCublasHandle(), A.GetNoElements(), &alpha,
+               A.GetDataPointer(), 1,
+               B.GetDataPointer(), 1);
+}
+
+} // DNN
+} // TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/CudaBuffers.cxx b/tmva/tmva/src/DNN/Architectures/Cuda/CudaBuffers.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..68be42d775101357c9c20c8bb74ae3197c0c0146
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/CudaBuffers.cxx
@@ -0,0 +1,334 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 07/08/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////////////
+// Implementation of device and host buffers for CUDA architectures.  //
+////////////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/DataLoader.h"
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/CudaBuffers.h"
+#include "cuda_runtime.h"
+#include <iostream>
+
+namespace TMVA {
+namespace DNN  {
+
+//
+// TCudaHostBuffer
+//______________________________________________________________________________
+template<typename AFloat>
+void TCudaHostBuffer<AFloat>::TDestructor::operator()(AFloat ** devicePointer)
+{
+   cudaFreeHost(*devicePointer);
+   delete[] devicePointer;
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+TCudaHostBuffer<AFloat>::TCudaHostBuffer(size_t size)
+    : fOffset(0), fComputeStream(0), fDestructor()
+{
+   AFloat ** pointer = new AFloat * [1];
+   cudaMallocHost(pointer, size * sizeof(AFloat));
+   fHostPointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+TCudaHostBuffer<AFloat>::operator AFloat * () const
+{
+   return *fHostPointer + fOffset;
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+TCudaHostBuffer<AFloat> TCudaHostBuffer<AFloat>::GetSubBuffer(size_t offset,
+                                                              size_t /*size*/)
+{
+   TCudaHostBuffer buffer = *this;
+   buffer.fOffset         = offset;
+   return buffer;
+}
+
+//
+// TCudaDeviceBuffer
+//______________________________________________________________________________
+template<typename AFloat>
+void TCudaDeviceBuffer<AFloat>::TDestructor::operator()(AFloat ** devicePointer)
+{
+   cudaFree(*devicePointer);
+   delete[] devicePointer;
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size)
+    : fOffset(0), fSize(size), fDestructor()
+{
+   AFloat ** pointer = new AFloat * [1];
+   cudaMalloc(pointer, size * sizeof(AFloat));
+   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
+   cudaStreamCreate(&fComputeStream);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size,
+                                                 cudaStream_t stream)
+    : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
+{
+   AFloat ** pointer = new AFloat * [1];
+   cudaMalloc(pointer, size * sizeof(AFloat));
+   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(AFloat * devicePointer,
+                                                 size_t size,
+                                                 cudaStream_t stream)
+    : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
+{
+   AFloat ** pointer = new AFloat * [1];
+   *pointer       = devicePointer;
+   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+TCudaDeviceBuffer<AFloat> TCudaDeviceBuffer<AFloat>::GetSubBuffer(size_t offset,
+                                                                  size_t size)
+{
+   TCudaDeviceBuffer buffer = *this;
+   buffer.fOffset           = offset;
+   buffer.fSize             = size;
+   return buffer;
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+TCudaDeviceBuffer<AFloat>::operator AFloat * () const
+{
+    return *fDevicePointer + fOffset;
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCudaDeviceBuffer<AFloat>::CopyFrom(const TCudaHostBuffer<AFloat> &buffer) const
+{
+   cudaStreamSynchronize(fComputeStream);
+   cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat),
+                   cudaMemcpyHostToDevice, fComputeStream);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCudaDeviceBuffer<AFloat>::CopyTo(const TCudaHostBuffer<AFloat> &buffer) const
+{
+   cudaMemcpyAsync(buffer, *this, fSize * sizeof(AFloat),
+                   cudaMemcpyDeviceToHost, fComputeStream);
+   buffer.fComputeStream = fComputeStream;
+}
+
+//______________________________________________________________________________
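+// CopyInput and CopyOutput fill the pinned host buffer in column-major order:
+// element (event i, variable j) of a batch is written to index
+// j * batchSize + i, so that the buffer maps directly onto a
+// (batchSize x nVariables) TCudaMatrix.
+//
+// A minimal usage sketch (illustrative only; batchSize and nVariables are
+// example names, not part of this interface):
+//
+//    TCudaHostBuffer<float>   host(batchSize * nVariables);
+//    TCudaDeviceBuffer<float> device(batchSize * nVariables);
+//    // ... fill host, e.g. through CopyInput ...
+//    device.CopyFrom(host);                          // asynchronous H2D copy
+//    TCudaMatrix<float> batch(device, batchSize, nVariables);
+//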
+template<>
+void TDataLoader<MatrixInput_t, TCuda<float>>::CopyInput(
+    TCudaHostBuffer<float> & buffer,
+    IndexIterator_t sampleIterator,
+    size_t batchSize)
+{
+   const TMatrixT<Double_t> &inputMatrix  = std::get<0>(fData);
+   size_t n = inputMatrix.GetNcols();
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = *sampleIterator;
+      for (size_t j = 0; j < n; j++) {
+         size_t bufferIndex = j * batchSize + i;
+         buffer[bufferIndex] = static_cast<float>(inputMatrix(sampleIndex, j));
+      }
+      sampleIterator++;
+   }
+}
+
+//______________________________________________________________________________
+template<>
+void TDataLoader<MatrixInput_t, TCuda<float>>::CopyOutput(
+    TCudaHostBuffer<float> & buffer,
+    IndexIterator_t sampleIterator,
+    size_t batchSize)
+{
+   const TMatrixT<Double_t> &outputMatrix  = std::get<1>(fData);
+   size_t n = outputMatrix.GetNcols();
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = *sampleIterator;
+      for (size_t j = 0; j < n; j++) {
+         size_t bufferIndex = j * batchSize + i;
+         buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
+      }
+      sampleIterator++;
+   }
+}
+
+//______________________________________________________________________________
+template<>
+void TDataLoader<TMVAInput_t, TCuda<float>>::CopyInput(
+    TCudaHostBuffer<float> & buffer,
+    IndexIterator_t sampleIterator,
+    size_t batchSize)
+{
+   Event * event  = fData.front();
+   size_t n  = event->GetNVariables();
+
+   // Copy input variables.
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = * sampleIterator++;
+      event = fData[sampleIndex];
+      for (size_t j = 0; j < n; j++) {
+         size_t bufferIndex = j * batchSize + i;
+         buffer[bufferIndex] = static_cast<float>(event->GetValue(j));
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<>
+void TDataLoader<TMVAInput_t, TCuda<float>>::CopyOutput(
+    TCudaHostBuffer<float> & buffer,
+    IndexIterator_t sampleIterator,
+    size_t batchSize)
+{
+   Event * event  = fData.front();
+   size_t n       = (event->GetNTargets() == 0) ? 1 : event->GetNTargets();
+
+   // Copy target(s).
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = * sampleIterator++;
+      event = fData[sampleIndex];
+      for (size_t j = 0; j < n; j++) {
+         // Copy output matrices.
+         size_t bufferIndex = j * batchSize + i;
+         if (event->GetNTargets() == 0) {
+            buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
+         } else {
+            buffer[bufferIndex] = static_cast<float>(event->GetTarget(j));
+         }
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<>
+void TDataLoader<MatrixInput_t, TCuda<double>>::CopyInput(
+    TCudaHostBuffer<double> & buffer,
+    IndexIterator_t sampleIterator,
+    size_t batchSize)
+{
+   const TMatrixT<Double_t> &inputMatrix  = std::get<0>(fData);
+   size_t n = inputMatrix.GetNcols();
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = *sampleIterator;
+      for (size_t j = 0; j < n; j++) {
+         size_t bufferIndex = j * batchSize + i;
+         buffer[bufferIndex] = inputMatrix(sampleIndex, j);
+      }
+      sampleIterator++;
+   }
+}
+
+//______________________________________________________________________________
+template<>
+void TDataLoader<MatrixInput_t, TCuda<double>>::CopyOutput(
+    TCudaHostBuffer<double> & buffer,
+    IndexIterator_t sampleIterator,
+    size_t batchSize)
+{
+   const TMatrixT<Double_t> &outputMatrix  = std::get<1>(fData);
+   size_t n = outputMatrix.GetNcols();
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = *sampleIterator;
+      for (size_t j = 0; j < n; j++) {
+         size_t bufferIndex = j * batchSize + i;
+         buffer[bufferIndex] = outputMatrix(sampleIndex, j);
+      }
+      sampleIterator++;
+   }
+}
+
+//______________________________________________________________________________
+template<>
+void TDataLoader<TMVAInput_t, TCuda<double>>::CopyInput(
+    TCudaHostBuffer<double> & buffer,
+    IndexIterator_t sampleIterator,
+    size_t batchSize)
+{
+   Event * event  = fData.front();
+   size_t n  = event->GetNVariables();
+
+   // Copy input variables.
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = * sampleIterator++;
+      event = fData[sampleIndex];
+      for (size_t j = 0; j < n; j++) {
+         size_t bufferIndex = j * batchSize + i;
+         buffer[bufferIndex] = event->GetValue(j);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<>
+void TDataLoader<TMVAInput_t, TCuda<double>>::CopyOutput(
+    TCudaHostBuffer<double> & buffer,
+    IndexIterator_t sampleIterator,
+    size_t batchSize)
+{
+   Event * event  = fData.front();
+   size_t n       = (event->GetNTargets() == 0) ? 1 : event->GetNTargets();
+
+   // Copy target(s).
+
+   for (size_t i = 0; i < batchSize; i++) {
+      size_t sampleIndex = * sampleIterator++;
+      event = fData[sampleIndex];
+      for (size_t j = 0; j < n; j++) {
+         // Copy output matrices.
+         size_t bufferIndex = j * batchSize + i;
+         if (event->GetNTargets() == 0) {
+            buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
+         } else {
+            buffer[bufferIndex] = event->GetTarget(j);
+         }
+      }
+   }
+}
+
+// Explicit Instantiations.
+
+template class TCudaDeviceBuffer<float>;
+template class TCudaDeviceBuffer<double>;
+
+template class TCudaHostBuffer<float>;
+template class TCudaHostBuffer<double>;
+
+template class TDataLoader<MatrixInput_t, TCuda<float>>;
+template class TDataLoader<TMVAInput_t,   TCuda<float>>;
+template class TDataLoader<MatrixInput_t, TCuda<double>>;
+template class TDataLoader<TMVAInput_t,   TCuda<double>>;
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu b/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0a3f27b3f8a04db7a0c3c71d18cba39dd4b38ec1
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu
@@ -0,0 +1,167 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////
+// Implementation of the TCudaMatrix class. //
+/////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda/CudaMatrix.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+
+namespace TMVA {
+namespace DNN  {
+
+//____________________________________________________________________________
+__global__ void CurandInitializationKernel(unsigned long long seed,
+                                           curandState_t *state)
+{
+   int i   = blockDim.y * blockIdx.y + threadIdx.y;
+   int j   = blockDim.x * blockIdx.x + threadIdx.x;
+   int tid = i * gridDim.x + j;
+   curand_init(seed + tid, 0, tid, state + tid);
+}
+
+// Static members.
+//____________________________________________________________________________
+template<typename AFloat>
+size_t          TCudaMatrix<AFloat>::fInstances     = 0;
+template<typename AFloat>
+cublasHandle_t  TCudaMatrix<AFloat>::fCublasHandle  = nullptr;
+template<typename AFloat>
+AFloat        * TCudaMatrix<AFloat>::fDeviceReturn  = nullptr;
+template<typename AFloat>
+AFloat        * TCudaMatrix<AFloat>::fOnes          = nullptr;
+template<typename AFloat>
+curandState_t * TCudaMatrix<AFloat>::fCurandStates  = nullptr;
+template<typename AFloat>
+size_t          TCudaMatrix<AFloat>::fNCurandStates = 0;
+template<typename AFloat>
+size_t          TCudaMatrix<AFloat>::fNOnes         = 0;
+
+// Constructors.
+//____________________________________________________________________________
+template<typename AFloat>
+TCudaMatrix<AFloat>::TCudaMatrix()
+    : fNRows(0), fNCols(0), fElementBuffer()
+{
+   InitializeCuda();
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+TCudaMatrix<AFloat>::TCudaMatrix(size_t m, size_t n)
+    : fNRows(m), fNCols(n), fElementBuffer(m * n, 0)
+{
+   InitializeCuda();
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+TCudaMatrix<AFloat>::TCudaMatrix(const TMatrixT<Double_t> & Host)
+    : fNRows(Host.GetNrows()), fNCols(Host.GetNcols()),
+      fElementBuffer(Host.GetNoElements(), 0)
+{
+   InitializeCuda();
+
+   AFloat * buffer = new AFloat[fNRows * fNCols];
+   size_t index = 0;
+   for (size_t j = 0; j < fNCols; j++) {
+      for (size_t i = 0; i < fNRows; i++) {
+         buffer[index] = static_cast<AFloat>(Host(i, j));
+         index++;
+      }
+   }
+
+   cudaMemcpy(fElementBuffer, buffer, fNRows * fNCols * sizeof(AFloat),
+              cudaMemcpyHostToDevice);
+   delete[] buffer;
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+TCudaMatrix<AFloat>::TCudaMatrix(TCudaDeviceBuffer<AFloat> buffer,
+                         size_t m, size_t n)
+    : fNRows(m), fNCols(n), fElementBuffer(buffer)
+{
+   InitializeCuda();
+}
+
+//____________________________________________________________________________
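+// Performs the lazy, shared setup for all TCudaMatrix instances: the cuBLAS
+// handle and the device-side return value are created for the first instance,
+// and the shared curand-state and ones buffers are grown whenever a matrix
+// with more threads or rows than previously seen is constructed.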
+template <typename AFloat>
+inline void TCudaMatrix<AFloat>::InitializeCuda()
+{
+   if (fInstances == 0) {
+       cublasCreate(&fCublasHandle);
+       CUDACHECK(cudaMalloc(& fDeviceReturn, sizeof(AFloat)));
+       CUDACHECK(cudaMalloc(& fCurandStates,
+                            TDevice::NThreads(*this) * sizeof(curandState_t)));
+   }
+   if (TDevice::NThreads(*this) > (int) fNCurandStates) {
+       fNCurandStates = TDevice::NThreads(*this);
+       if (fCurandStates) {
+           cudaFree(fCurandStates);
+       }
+       cudaMalloc(&fCurandStates, TDevice::NThreads(*this) * sizeof(curandState_t));
+       InitializeCurandStates();
+   }
+   if (fNRows >  fNOnes) {
+      fNOnes = fNRows;
+      if (fOnes) {
+         cudaFree(fOnes);
+      }
+      cudaMalloc(&fOnes, fNRows * sizeof(AFloat));
+      AFloat * buffer = new AFloat[fNRows];
+      for (size_t i = 0; i < fNRows; i++) {
+         buffer[i] = 1.0;
+      }
+      cudaMemcpy(fOnes, buffer, fNRows * sizeof(AFloat),
+                 cudaMemcpyHostToDevice);
+   }
+   fInstances++;
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+void TCudaMatrix<AFloat>::InitializeCurandStates()
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(*this);
+   CurandInitializationKernel<<<gridDims, blockDims>>>(time(nullptr), fCurandStates);
+}
+
+// Conversion to TMatrixT.
+//____________________________________________________________________________
+template<typename AFloat>
+TCudaMatrix<AFloat>::operator TMatrixT<Double_t>() const
+{
+   TMatrixT<Double_t> hostMatrix(GetNrows(), GetNcols());
+
+   AFloat * buffer = new AFloat[fNRows * fNCols];
+   cudaMemcpy(buffer, fElementBuffer, fNRows * fNCols * sizeof(AFloat),
+              cudaMemcpyDeviceToHost);
+
+   size_t index = 0;
+   for (size_t j = 0; j < fNCols; j++) {
+      for (size_t i = 0; i < fNRows; i++) {
+         hostMatrix(i, j) = static_cast<Double_t>(buffer[index]);
+         index++;
+      }
+   }
+
+   delete[] buffer;
+   return hostMatrix;
+}
+
+// Explicit Instantiations.
+
+template class TCudaMatrix<float>;
+template class TCudaMatrix<double>;
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu
new file mode 100644
index 0000000000000000000000000000000000000000..501e2e6d2492080b41f12fde17c3d1c31ddd87e0
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu
@@ -0,0 +1,40 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 14/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "Kernels.cuh"
+
+/////////////////////////////////////////////////////////////////////
+// Implementation of the Dropout function for TCuda architectures. //
+/////////////////////////////////////////////////////////////////////
+
+namespace TMVA {
+namespace DNN  {
+
+//____________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::Dropout(TCudaMatrix<AFloat> &A,
+                            AFloat dropoutProbability)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Dropout<<<gridDims, blockDims, 0, s>>>(
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols(),
+       dropoutProbability,
+       TCudaMatrix<AFloat>::GetCurandStatesPointer());
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Initialization.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Initialization.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1492d1473534c39e6a24d6cbdbce9949fe19c826
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Initialization.cu
@@ -0,0 +1,108 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 14/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ /////////////////////////////////////////////////////////////
+ // Implementation of the initialization functions for CUDA //
+ // architectures.                                          //
+ /////////////////////////////////////////////////////////////
+
+#include "TRandom.h"
+#include "TMatrix.h"
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "Kernels.cuh"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::InitializeGauss(TCudaMatrix<AFloat> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   TRandom rand(time(nullptr));
+   TMatrixT<Double_t> B(m, n);
+
+   Double_t sigma = sqrt(2.0 / ((Double_t) n));
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         B(i,j) = rand.Gaus(0.0, sigma);
+      }
+   }
+   A = B;
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::InitializeUniform(TCudaMatrix<AFloat> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   TRandom rand(time(nullptr));
+   TMatrixT<Double_t> B(m, n);
+
+   Double_t range = sqrt(2.0 / ((Double_t) n));
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         B(i,j) = rand.Uniform(-range, range);
+      }
+   }
+   A = B;
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::InitializeIdentity(TCudaMatrix<AFloat> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+   TMatrixT<Double_t> B(m, n);
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n ; j++) {
+         B(i,j) = 0.0;
+      }
+
+      if (i < n) {
+         B(i,i) = 1.0;
+      }
+   }
+   A = B;
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::InitializeZero(TCudaMatrix<AFloat> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+   TMatrixT<Double_t> B(m, n);
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n ; j++) {
+         B(i,j) = 0.0;
+      }
+   }
+   A = B;
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh b/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..a0ace5f1bbdb8c68f4f893c9f1d0a79a1b09be71
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh
@@ -0,0 +1,672 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////////////
+// Implementation of the device kernels for the CUDA implementation of //
+// the low-level interface.                                            //
+/////////////////////////////////////////////////////////////////////////
+
+#ifndef TMVA_DNN_ARCHITECTURES_CUDA_KERNELS
+#define TMVA_DNN_ARCHITECTURES_CUDA_KERNELS
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "cuda.h"
+#include "math.h"
+
+namespace TMVA {
+namespace DNN  {
+namespace Cuda {
+
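+// All kernels below use the same column-major addressing convention as
+// TCudaMatrix: a thread with row index i = blockDim.y * blockIdx.y + threadIdx.y
+// and column index j = blockDim.x * blockIdx.x + threadIdx.x accesses element
+// A[j * m + i] of an m x n matrix.
+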
+//____________________________________________________________________________
+template<typename AFloat>
+__device__ AFloat AtomicAdd(AFloat* address, AFloat val);
+
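+// Double-precision atomicAdd implemented in terms of atomicCAS, following the
+// reference implementation in the CUDA C Programming Guide; the native
+// instruction for doubles is only available on compute capability 6.0 and up.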
+template<>
+__device__ double AtomicAdd(double* address, double val)
+{
+   unsigned long long int* address_as_ull = (unsigned long long int*)address;
+   unsigned long long int old = *address_as_ull, assumed;
+   do {
+      assumed = old;
+      old = atomicCAS(address_as_ull, assumed,
+                      __double_as_longlong(val +
+                                           __longlong_as_double(assumed)));
+   } while (assumed != old);
+   return __longlong_as_double(old);
+}
+
+template<>
+__device__ float AtomicAdd(float* address, float val)
+{
+   return atomicAdd(address, val);
+}
+
+//____________________________________________________________________________
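+// Column-wise tree reduction of the shared-memory tile sdata: each step halves
+// the number of active rows, and the surviving row-0 threads atomically add
+// their column sums into the result array.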
+template<typename AFloat>
+__device__ void ReduceSumVertical(AFloat *result,
+                                  AFloat * sdata,
+                                  int n)
+{
+   // i,j are block row and column indices.
+   int i = threadIdx.y;
+   int j = threadIdx.x;
+   int index = i * blockDim.x + j;
+
+   __syncthreads();
+   if ((blockDim.y > 512) && (i < 512)) {
+      if ((i + 512) < blockDim.y) {
+         sdata[index] += sdata[index + 512 * blockDim.x];
+      }
+   }
+
+   __syncthreads();
+   if ((blockDim.y > 256) && (i < 256)) {
+      if ((i + 256) < blockDim.y) {
+         sdata[index] += sdata[index + 256 * blockDim.x];
+      }
+   }
+   __syncthreads();
+   if ((blockDim.y > 128) && (i < 128)) {
+      if ((i + 128) < blockDim.y) {
+         sdata[index] += sdata[index + 128 * blockDim.x];
+      }
+   }
+   __syncthreads();
+   if ((blockDim.y > 64) && (i < 64)) {
+      if ((i + 64) < blockDim.y) {
+         sdata[index] += sdata[index + 64 * blockDim.x];
+      }
+   }
+   __syncthreads();
+   if ((blockDim.y > 32) && (i < 32)) {
+      if ((i + 32) < blockDim.y) {
+         sdata[index] += sdata[index + 32 * blockDim.x];
+      }
+   }
+   __syncthreads();
+   if ((blockDim.y > 16) && (i < 16)) {
+      if ((i + 16) < blockDim.y) {
+         sdata[index] += sdata[index + 16 * blockDim.x];
+      }
+   }
+   __syncthreads();
+   if ((blockDim.y > 8) && (i < 8)) {
+      if ((i + 8) < blockDim.y) {
+         sdata[index] += sdata[index + 8 * blockDim.x];
+      }
+   }
+   __syncthreads();
+   if ((blockDim.y > 4) && (i < 4)) {
+      if ((i + 4) < blockDim.y) {
+         sdata[index] += sdata[index + 4 * blockDim.x];
+      }
+   }
+   __syncthreads();
+   if ((blockDim.y > 2) && (i < 2)) {
+      if ((i + 2) < blockDim.y) {
+         sdata[index] += sdata[index + 2 * blockDim.x];
+      }
+   }
+   __syncthreads();
+   if ((blockDim.y > 1) && (i < 1)) {
+      if ((i + 1) < blockDim.y) {
+         sdata[index] += sdata[index + 1 * blockDim.x];
+      }
+   }
+   __syncthreads();
+   if ((i == 0) && ((blockIdx.x * blockDim.x + threadIdx.x) < n)) {
+      AtomicAdd(result + j, sdata[index]);
+   }
+   __syncthreads();
+}
+
+//____________________________________________________________________________
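+// Block-wide tree reduction in shared memory: the stride is halved from 512
+// down to 1 with a __syncthreads() barrier between steps, and thread 0 then
+// adds the block's partial sum atomically into the global result.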
+template<typename AFloat>
+__device__ void ReduceSum(AFloat *result, AFloat * sdata)
+{
+   int tid = threadIdx.x + threadIdx.y * blockDim.x;
+
+   __syncthreads();
+   if ((TDevice::BlockSize > 512) && (tid < 512)) {
+      if ((tid + 512) < TDevice::BlockSize) {
+         sdata[tid] += sdata[tid + 512];
+      }
+   }
+
+   __syncthreads();
+   if ((TDevice::BlockSize > 256) && (tid < 256)) {
+      if ((tid + 256) < TDevice::BlockSize) {
+         sdata[tid] += sdata[tid + 256];
+      }
+   }
+   __syncthreads();
+   if ((TDevice::BlockSize > 128) && (tid < 128)) {
+      if ((tid + 128) < TDevice::BlockSize) {
+         sdata[tid] += sdata[tid + 128];
+      }
+   }
+   __syncthreads();
+   if ((TDevice::BlockSize > 64) && (tid < 64)) {
+      if ((tid + 64) < TDevice::BlockSize) {
+         sdata[tid] += sdata[tid + 64];
+      }
+   }
+   __syncthreads();
+   if ((TDevice::BlockSize > 32) && (tid < 32)) {
+      if ((tid + 32) < TDevice::BlockSize) {
+         sdata[tid] += sdata[tid + 32];
+      }
+   }
+   __syncthreads();
+   if ((TDevice::BlockSize > 16) && (tid < 16)) {
+      if ((tid + 16) < TDevice::BlockSize) {
+         sdata[tid] += sdata[tid + 16];
+      }
+   }
+   __syncthreads();
+   if ((TDevice::BlockSize > 8) && (tid < 8)) {
+      if ((tid + 8) < TDevice::BlockSize) {
+         sdata[tid] += sdata[tid + 8];
+      }
+   }
+   __syncthreads();
+   if ((TDevice::BlockSize > 4) && (tid < 4)) {
+      if ((tid + 4) < TDevice::BlockSize) {
+         sdata[tid] += sdata[tid + 4];
+      }
+   }
+   __syncthreads();
+   if ((TDevice::BlockSize > 2) && (tid < 2)) {
+      if ((tid + 2) < TDevice::BlockSize) {
+         sdata[tid] += sdata[tid + 2];
+      }
+   }
+   __syncthreads();
+   if ((TDevice::BlockSize > 1) && (tid < 1)) {
+      if ((tid + 1) < TDevice::BlockSize) {
+         sdata[tid] += sdata[tid + 1];
+      }
+   }
+   if (tid == 0) {
+       AtomicAdd(result, sdata[0]);
+   }
+
+   __syncthreads();
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void AddRowWise(AFloat * W,
+                           const AFloat * theta,
+                           int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n))
+       W[index] += theta[j];
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void Hadamard(AFloat * B,
+                         const AFloat * A,
+                         int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n))
+       B[index] *= A[index];
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void IdentityDerivative(AFloat * A,
+                                   int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n))
+       A[index] = 1.0;
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void Relu(AFloat * A,
+                     int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n)) {
+      AFloat x = A[index];
+      A[index] = (x < 0.0) ? 0.0 : x;
+   }
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void ReluDerivative(AFloat * B,
+                               const AFloat * A, int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n)) {
+      AFloat x = A[index];
+      B[index] = (x < 0.0) ? 0.0 : 1.0;
+   }
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void Sigmoid(AFloat * A,
+                        int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n)) {
+      AFloat sig = 1.0 / (1.0 + exp(-A[index]));
+      A[index] = sig;
+   }
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void Sigmoid(AFloat * B,
+                        const AFloat * A,
+                        int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n)) {
+      AFloat sig = 1.0 / (1.0 + exp(-A[index]));
+      B[index] = sig;
+   }
+}
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void SigmoidDerivative(AFloat * B,
+                                  const AFloat * A,
+                                  int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n)) {
+      AFloat sig = 1.0 / (1.0 + exp(-A[index]));
+      B[index] = sig * (1.0 - sig);
+   }
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void Tanh(AFloat * A,
+                     int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n)) {
+      AFloat t = ::tanh(A[index]);
+      A[index] = t;
+   }
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void TanhDerivative(AFloat * B,
+                               const AFloat * A,
+                               int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n)) {
+      AFloat t = ::tanh(A[index]);
+      B[index] = 1 - t*t;
+   }
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void SymmetricRelu(AFloat * A,
+                              int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n)) {
+      A[index] = abs(A[index]);
+   }
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void SymmetricReluDerivative(AFloat * B,
+                                        const AFloat * A,
+                                        int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n)) {
+      B[index] = (A[index] < 0.0) ? -1.0 : 1.0;
+   }
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void SoftSign(AFloat * A,
+                          int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n)) {
+      AFloat x = A[index];
+      A[index] = x / (1.0 + abs(x));
+   }
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void SoftSignDerivative(AFloat * B,
+                                   const AFloat * A,
+                                   int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n)) {
+      AFloat x = 1.0 + fabs(A[index]);
+      B[index] = 1 / (x * x);
+   }
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void Gauss(AFloat * A,
+                      int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n)) {
+      AFloat x = A[index];
+      A[index] = exp(- x * x);
+   }
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void GaussDerivative(AFloat * B,
+                                const AFloat * A,
+                                int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n)) {
+      AFloat x = A[index];
+      B[index] = - 2.0 * x * exp(- x * x);
+   }
+}
+
+//____________________________________________________________________________
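+// Computes the mean squared error 1/(m*n) * sum_ij (Y_ij - output_ij)^2 by
+// accumulating the per-element contributions of this block in shared memory
+// and reducing them into result.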
+template<typename AFloat>
+__global__ void MeanSquaredError(AFloat * result,
+                                 const AFloat * Y,
+                                 const AFloat * output,
+                                 int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int tid   = blockDim.x * threadIdx.y + threadIdx.x;
+   int index = j * m + i;
+
+   __shared__ AFloat sdata[TDevice::BlockSize];
+
+   if ((i < m) && (j < n)) {
+       AFloat norm = 1 / ((AFloat) (m * n));
+       AFloat e   = Y[index] - output[index];
+       sdata[tid] = norm * e * e;
+   } else {
+       sdata[tid] = 0.0;
+   }
+   ReduceSum(result, sdata);
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void SquaredSum(AFloat * result,
+                           const AFloat * A,
+                           int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int tid   = blockDim.x * threadIdx.y + threadIdx.x;
+   int index = j * m + i;
+
+   __shared__ AFloat sdata[TDevice::BlockSize];
+
+   if ((i < m) && (j < n)) {
+       AFloat e = A[index];
+       sdata[tid] = e * e;
+   } else {
+       sdata[tid] = 0.0;
+   }
+   ReduceSum(result, sdata);
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void AbsoluteSum(AFloat * result,
+                            const AFloat * A,
+                            int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int tid   = blockDim.x * threadIdx.y + threadIdx.x;
+   int index = j * m + i;
+
+   __shared__ AFloat sdata[TDevice::BlockSize];
+
+   if ((i < m) && (j < n)) {
+       sdata[tid] = abs(A[index]);
+   } else {
+       sdata[tid] = 0.0;
+   }
+   ReduceSum(result, sdata);
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void MeanSquaredErrorGradients(AFloat * dY,
+                                          const AFloat * Y,
+                                          const AFloat * output,
+                                          int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n))
+       dY[index] = 2.0 / ((AFloat) (m * n)) * (output[index] - Y[index]);
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void AddL1RegularizationGradients(AFloat * A,
+                                             const AFloat * B,
+                                             AFloat weightDecay,
+                                             int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n)) {
+       AFloat sign = (B[index] < 0.0) ? -1.0 : 1.0;
+       A[index] += sign * weightDecay;
+   }
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void AddL2RegularizationGradients(AFloat * A,
+                                             const AFloat * B,
+                                             AFloat weightDecay,
+                                             int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n)) {
+       A[index] += 2.0 * weightDecay * B[index];
+   }
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void CrossEntropy(AFloat * result,
+                             const AFloat * Y,
+                             const AFloat * output,
+                             int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int tid   = blockDim.x * threadIdx.y + threadIdx.x;
+   int index = j * m + i;
+
+   __shared__ AFloat sdata[TDevice::BlockSize];
+
+   if ((i < m) && (j < n)) {
+       AFloat norm = 1 / ((AFloat) (m * n));
+       AFloat sig  = 1.0 / (1.0 + exp(-output[index]));
+       AFloat ce   = Y[index] * log(sig) + (1.0 - Y[index]) * log(1.0 - sig);
+       sdata[tid]        = - norm * ce;
+   } else {
+       sdata[tid] = 0.0;
+   }
+
+   ReduceSum(result, sdata);
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void CrossEntropyGradients(AFloat * dY,
+                                      const AFloat * Y,
+                                      const AFloat * output,
+                                      int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int index = j * m + i;
+
+   if ((i < m) && (j < n)) {
+      AFloat norm = 1 / ((AFloat) (m * n));
+      AFloat y = Y[index];
+      AFloat sig = 1.0 / (1.0 + exp(-output[index]));
+      dY[index] = norm * (sig - y);
+   }
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void ReduceMatrix(AFloat *result,
+                             const AFloat *A,
+                             int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int tid = threadIdx.y * blockDim.x + threadIdx.x;
+   int index = j * m + i;
+
+   __shared__ AFloat smem[TDevice::BlockSize];
+   if ((i < m) && (j < n))
+       smem[tid] = A[index];
+   else
+       smem[tid] = 0.0;
+
+   ReduceSum(result, smem);
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void SumColumns(AFloat *B,
+                            const AFloat *A,
+                            int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int j = blockDim.x * blockIdx.x + threadIdx.x;
+   int matrixIndex = j * m + i;
+   int blockIndex  = blockDim.x * threadIdx.y + threadIdx.x;
+
+
+   __shared__ AFloat smem[TDevice::BlockSize];
+
+   if ((i < m) && (j < n)) {
+       smem[blockIndex] = A[matrixIndex];
+   } else {
+       smem[blockIndex] = 0.0;
+   }
+
+   ReduceSumVertical(B + blockDim.x * blockIdx.x, smem, n);
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void Dropout(AFloat *A,
+                        int m, int n,
+                        AFloat dropoutProbability,
+                        curandState_t *state)
+{
+   int i   = blockDim.y * blockIdx.y + threadIdx.y;
+   int j   = blockDim.x * blockIdx.x + threadIdx.x;
+   int tid = i * gridDim.x + j;
+   if ((i < m) && (j < n)) {
+      float r = curand_uniform(state + tid);
+      if (r > dropoutProbability) {
+         A[j * m + i] = 0.0;
+      } else {
+         A[j * m + i] /= dropoutProbability;
+      }
+   }
+}
+
+} // namespace Cuda
+} // namespace DNN
+} // namespace TMVA
+
+#endif
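All of the reduction kernels above follow the same pattern: a 2D grid of threads is mapped onto a column-major m x n matrix via index = j * m + i, each thread writes one partial value into a shared-memory buffer of TDevice::BlockSize entries, and ReduceSum collapses that buffer into the per-block contribution. A minimal host-side sketch of the same indexing and of the sum computed by SquaredSum is given below; the helper names (At, SquaredSumHost) are illustrative only and are not part of this patch.

// Hypothetical CPU cross-check for the column-major indexing and the
// SquaredSum reduction used by the kernels above (illustrative names only).
#include <cstddef>
#include <iostream>
#include <vector>

// Element (i, j) of an m x n column-major matrix lives at index j * m + i.
inline double At(const std::vector<double> &A, std::size_t m, std::size_t i, std::size_t j)
{
   return A[j * m + i];
}

// Host reference for the SquaredSum kernel: the sum of squares of all elements.
double SquaredSumHost(const std::vector<double> &A, std::size_t m, std::size_t n)
{
   double sum = 0.0;
   for (std::size_t j = 0; j < n; j++)
      for (std::size_t i = 0; i < m; i++)
         sum += At(A, m, i, j) * At(A, m, i, j);
   return sum;
}

int main()
{
   std::vector<double> A = {1, 2, 3, 4, 5, 6};   // 2 x 3 matrix, column-major
   std::cout << SquaredSumHost(A, 2, 3) << "\n"; // prints 91
   return 0;
}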
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8a242468ff7cf8092721ad0ca6294907e037bfdb
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu
@@ -0,0 +1,99 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////////
+// Implementation of the loss functions for the TCuda implementation //
+// of the low-level interface.                                       //
+///////////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "Kernels.cuh"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//____________________________________________________________________________
+template<typename AFloat>
+AFloat TCuda<AFloat>::MeanSquaredError(const TCudaMatrix<AFloat> & Y,
+                                       const TCudaMatrix<AFloat> & output)
+{
+    dim3 blockDims = TDevice::BlockDims();
+    dim3 gridDims  = TDevice::GridDims(Y);
+    cudaStream_t s = Y.GetComputeStream();
+    TCudaMatrix<AFloat>::ResetDeviceReturn();
+    ::TMVA::DNN::Cuda::MeanSquaredError<<<gridDims, blockDims, 0, s>>>(
+        TCudaMatrix<AFloat>::GetDeviceReturnPointer(),
+        Y.GetDataPointer(),
+        output.GetDataPointer(),
+        (int) Y.GetNrows(),
+        (int) Y.GetNcols());
+    return TCudaMatrix<AFloat>::GetDeviceReturn();
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::MeanSquaredErrorGradients(TCudaMatrix<AFloat> & dY,
+                                              const TCudaMatrix<AFloat> & Y,
+                                              const TCudaMatrix<AFloat> & output)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(Y);
+   cudaStream_t s = output.GetComputeStream();
+   ::TMVA::DNN::Cuda::MeanSquaredErrorGradients<<<gridDims, blockDims, 0, s>>>(
+       dY.GetDataPointer(),
+       Y.GetDataPointer(),
+       output.GetDataPointer(),
+       (int) Y.GetNrows(),
+       (int) Y.GetNcols());
+   dY.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+AFloat TCuda<AFloat>::CrossEntropy(const TCudaMatrix<AFloat> & Y,
+                                   const TCudaMatrix<AFloat> & output)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(Y);
+   TCudaMatrix<AFloat>::ResetDeviceReturn();
+   cudaStream_t s = Y.GetComputeStream();
+   ::TMVA::DNN::Cuda::CrossEntropy<<<gridDims, blockDims, 0, s>>>(
+       TCudaMatrix<AFloat>::GetDeviceReturnPointer(),
+       Y.GetDataPointer(),
+       output.GetDataPointer(),
+       (int) Y.GetNrows(),
+       (int) Y.GetNcols());
+   return TCudaMatrix<AFloat>::GetDeviceReturn();
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::CrossEntropyGradients(TCudaMatrix<AFloat> & dY,
+                                          const TCudaMatrix<AFloat> & Y,
+                                          const TCudaMatrix<AFloat> & output)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(Y);
+   cudaStream_t s = output.GetComputeStream();
+   ::TMVA::DNN::Cuda::CrossEntropyGradients<<<gridDims, blockDims, 0, s>>>(
+       dY.GetDataPointer(),
+       Y.GetDataPointer(),
+       output.GetDataPointer(),
+       (int) Y.GetNrows(),
+       (int) Y.GetNcols());
+   dY.SetComputeStream(s);
+}
+
+} // namespace DNN
+} // namespace TMVA
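The host-side wrappers above launch the kernels on the matrix's compute stream and read the scalar loss back through the static device-return slot (ResetDeviceReturn / GetDeviceReturn). For reference, the quantities they compute are the element-averaged squared error and the element-averaged sigmoid cross entropy; a small CPU sketch of both, using plain std::vector storage and illustrative names, can be used to validate the device results on small matrices.

// Illustrative CPU reference (assumed names, not part of this patch) for the
// losses computed by the MeanSquaredError and CrossEntropy wrappers above.
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// 1/(m*n) * sum over all elements of (Y - output)^2.
double MeanSquaredErrorHost(const std::vector<double> &Y, const std::vector<double> &output)
{
   double sum = 0.0;
   for (std::size_t k = 0; k < Y.size(); k++) {
      double e = Y[k] - output[k];
      sum += e * e;
   }
   return sum / static_cast<double>(Y.size());
}

// -1/(m*n) * sum of Y*log(sig) + (1-Y)*log(1-sig), with sig the sigmoid of the raw output.
double CrossEntropyHost(const std::vector<double> &Y, const std::vector<double> &output)
{
   double sum = 0.0;
   for (std::size_t k = 0; k < Y.size(); k++) {
      double sig = 1.0 / (1.0 + std::exp(-output[k]));
      sum += Y[k] * std::log(sig) + (1.0 - Y[k]) * std::log(1.0 - sig);
   }
   return -sum / static_cast<double>(Y.size());
}

int main()
{
   std::vector<double> Y = {1.0, 0.0}, output = {2.0, -1.0};
   std::cout << MeanSquaredErrorHost(Y, output) << " " << CrossEntropyHost(Y, output) << "\n";
   return 0;
}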
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu
new file mode 100644
index 0000000000000000000000000000000000000000..039fb27a8e36795add5e1fb5a60c56ef4e6ad138
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu
@@ -0,0 +1,41 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 11/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Implementation of the output functions for the TCuda             //
+// implementation of the low-level interface.                       //
+//////////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "Kernels.cuh"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+template<typename AFloat>
+void TCuda<AFloat>::Sigmoid(TCudaMatrix<AFloat> & B,
+                            const TCudaMatrix<AFloat> & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Sigmoid<<<gridDims, blockDims, 0, s>>>(B.GetDataPointer(),
+                                                             A.GetDataPointer(),
+                                                             (int) A.GetNrows(),
+                                                             (int) A.GetNcols());
+   B.SetComputeStream(s);
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu
new file mode 100644
index 0000000000000000000000000000000000000000..047c6411b52cd9153d707cd79d1eef3e4c4fdc6b
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu
@@ -0,0 +1,132 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////////
+ // Implementation of the functions required for the forward and //
+ // backward propagation of activations through a neural network //
+ // for CUDA architectures.                                      //
+ //////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "Kernels.cuh"
+
+namespace TMVA {
+namespace DNN  {
+
+//____________________________________________________________________________
+template<>
+void TCuda<float>::MultiplyTranspose(TCudaMatrix<float> &output,
+                                     const TCudaMatrix<float> &input,
+                                     const TCudaMatrix<float> &Weights)
+{
+   int m, n, k;
+   k = input.GetNcols();
+   m = input.GetNrows();
+   n = Weights.GetNrows();
+   float alpha = 1.0, beta = 0.0;
+
+   // Compute C = beta * C + alpha * (A * B^T)
+   cudaStream_t s = input.GetComputeStream();
+   cublasSetStream(input.GetCublasHandle(), s);
+   cublasSgemm(input.GetCublasHandle(),
+               CUBLAS_OP_N, CUBLAS_OP_T,
+               m, n, k, & alpha,
+               input.GetDataPointer(), m,     // *A, lda
+               Weights.GetDataPointer(), n,   // *B, ldb
+               & beta,                        // beta
+               output.GetDataPointer(), m);   // *C, ldc
+   output.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+template<>
+void TCuda<double>::MultiplyTranspose(TCudaMatrix<double> &output,
+                                      const TCudaMatrix<double> &input,
+                                      const TCudaMatrix<double> &Weights)
+{
+   int m, n, k;
+   k = input.GetNcols();
+   m = input.GetNrows();
+   n = Weights.GetNrows();
+   double alpha = 1.0, beta = 0.0;
+
+   // Compute C = beta * C + alpha * (A * B^T)
+   cudaStream_t s = input.GetComputeStream();
+   cublasSetStream(input.GetCublasHandle(), s);
+   cublasDgemm(input.GetCublasHandle(),
+               CUBLAS_OP_N, CUBLAS_OP_T,
+               m, n, k, & alpha,
+               input.GetDataPointer(), m,     // *A, lda
+               Weights.GetDataPointer(), n,   // *B, ldb
+               & beta,                        // beta
+               output.GetDataPointer(), m);   // *C, ldc
+   output.SetComputeStream(s);
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::AddRowWise(TCudaMatrix<AFloat> &Weights,
+                               const TCudaMatrix<AFloat> &theta)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(Weights);
+   cudaStream_t s = Weights.GetComputeStream();
+   ::TMVA::DNN::Cuda::AddRowWise<<<gridDims, blockDims, 0, s>>>(
+       Weights.GetDataPointer(),
+       theta.GetDataPointer(),
+       Weights.GetNrows(),
+       Weights.GetNcols());
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::Backward(TCudaMatrix<AFloat> & activation_gradients_backward,
+                             TCudaMatrix<AFloat> & weight_gradients,
+                             TCudaMatrix<AFloat> & bias_gradients,
+                             TCudaMatrix<AFloat> & df,
+                             const TCudaMatrix<AFloat> & activation_gradients,
+                             const TCudaMatrix<AFloat> & weights,
+                             const TCudaMatrix<AFloat> & activation_backward)
+{
+   // Compute element-wise product.
+   TCuda<AFloat>::Hadamard(df, activation_gradients);
+
+   // Activation gradients.
+   if (activation_gradients_backward.GetNoElements() > 0) {
+      TCuda<AFloat>::Multiply(activation_gradients_backward, df, weights);
+   }
+
+   // Weight gradients.
+   if (weight_gradients.GetNoElements() > 0) {
+      TCuda<AFloat>::TransposeMultiply(weight_gradients, df, activation_backward);
+   }
+
+   // Bias gradients.
+   if (bias_gradients.GetNoElements() > 0) {
+      TCuda<AFloat>::SumColumns(bias_gradients, df);
+   }
+
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::Copy(TCudaMatrix<AFloat> & B,
+                             const TCudaMatrix<AFloat> & A)
+{
+   size_t m = B.GetNrows();
+   size_t n = B.GetNcols();
+   cudaMemcpyAsync(B.GetDataPointer(), A.GetDataPointer(),
+                   m * n * sizeof(AFloat), cudaMemcpyDeviceToDevice, 0);
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu
new file mode 100644
index 0000000000000000000000000000000000000000..67851eaef8bb32df3d99c07d30a32086a494e2b4
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu
@@ -0,0 +1,92 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 13/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////
+// Contains the definitions of the kernel calling functions for //
+// computation of regularization functionals and gradients      //
+// functions for CUDA architectures.                            //
+//////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Cuda/Device.h"
+#include "Kernels.cuh"
+
+namespace TMVA {
+namespace DNN  {
+
+//______________________________________________________________________________
+template<typename AFloat>
+AFloat TCuda<AFloat>::L1Regularization(const TCudaMatrix<AFloat> & A)
+{
+    dim3 blockDims = TDevice::BlockDims();
+    dim3 gridDims  = TDevice::GridDims(A);
+    cudaStream_t s = A.GetComputeStream();
+    TCudaMatrix<AFloat>::ResetDeviceReturn();
+    ::TMVA::DNN::Cuda::AbsoluteSum<<<gridDims, blockDims, 0, s>>>(
+        TCudaMatrix<AFloat>::GetDeviceReturnPointer(),
+        A.GetDataPointer(),
+        (int) A.GetNrows(),
+        (int) A.GetNcols());
+    return TCudaMatrix<AFloat>::GetDeviceReturn();
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::AddL1RegularizationGradients(TCudaMatrix<AFloat> & B,
+                                                 const TCudaMatrix<AFloat> & A,
+                                                 AFloat weightDecay)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::AddL1RegularizationGradients<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       weightDecay,
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+AFloat TCuda<AFloat>::L2Regularization(const TCudaMatrix<AFloat> & A)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(A);
+   cudaStream_t s = A.GetComputeStream();
+   TCudaMatrix<AFloat>::ResetDeviceReturn();
+   ::TMVA::DNN::Cuda::SquaredSum<<<gridDims, blockDims, 0, s>>>(
+       TCudaMatrix<AFloat>::GetDeviceReturnPointer(),
+       A.GetDataPointer(),
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+   return TCudaMatrix<AFloat>::GetDeviceReturn();
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::AddL2RegularizationGradients(TCudaMatrix<AFloat> & B,
+                                                 const TCudaMatrix<AFloat> & A,
+                                                 AFloat weightDecay)
+{
+   dim3 blockDims = TDevice::BlockDims();
+   dim3 gridDims  = TDevice::GridDims(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::AddL2RegularizationGradients<<<gridDims, blockDims, 0, s>>>(
+       B.GetDataPointer(),
+       A.GetDataPointer(),
+       weightDecay,
+       (int) A.GetNrows(),
+       (int) A.GetNcols());
+}
+
+} // namespace DNN
+} // namespace TMVA
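The regularization wrappers reuse the same reduction machinery: L1Regularization sums |w| and L2Regularization sums w^2 over all matrix elements, while the gradient kernels add sign(w) * weightDecay and 2 * weightDecay * w, respectively, to an existing gradient matrix. A hedged CPU sketch of the L1 term and the L2 gradient update follows; the names are illustrative only.

// Host-side sketch (assumed names) of the regularization term and gradient
// update mirrored by the CUDA kernels above.
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// L1 term: sum of absolute values of all weights.
double L1Host(const std::vector<double> &W)
{
   double sum = 0.0;
   for (double w : W)
      sum += std::fabs(w);
   return sum;
}

// Adds d/dw (weightDecay * w^2) = 2 * weightDecay * w to an existing gradient,
// matching AddL2RegularizationGradients.
void AddL2GradientsHost(std::vector<double> &grad, const std::vector<double> &W, double weightDecay)
{
   for (std::size_t k = 0; k < W.size(); k++)
      grad[k] += 2.0 * weightDecay * W[k];
}

int main()
{
   std::vector<double> W = {0.5, -0.25}, grad = {0.0, 0.0};
   AddL2GradientsHost(grad, W, 0.01);                                  // grad becomes {0.01, -0.005}
   std::cout << L1Host(W) << " " << grad[0] << " " << grad[1] << "\n"; // 0.75 0.01 -0.005
   return 0;
}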
diff --git a/tmva/tmva/src/DNN/Architectures/Reference.cxx b/tmva/tmva/src/DNN/Architectures/Reference.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..0d8eaf2b422fe0c1f853da8d9bf2ece8d7f8d47e
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference.cxx
@@ -0,0 +1,32 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////
+// Explicit instantiation of the TReference architecture class //
+// template for Double_t scalar types.                        //
+////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Reference.h"
+
+#include "Reference/Propagation.cxx"
+#include "Reference/ActivationFunctions.cxx"
+#include "Reference/OutputFunctions.cxx"
+#include "Reference/LossFunctions.cxx"
+#include "Reference/Regularization.cxx"
+#include "Reference/Initialization.cxx"
+#include "Reference/Dropout.cxx"
+
+namespace TMVA {
+namespace DNN  {
+template class TReference<Double_t>;
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/ActivationFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Reference/ActivationFunctions.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..e4c2caa52d49903fdee801caed582e51780829a2
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/ActivationFunctions.cxx
@@ -0,0 +1,237 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////////
+ // Implementation of the activation functions for the reference //
+ // implementation.                                              //
+ //////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Reference.h"
+#include <math.h>
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::IdentityDerivative(TMatrixT<Real_t> & B,
+                                            const TMatrixT<Real_t> &/*A*/)
+{
+   size_t m,n;
+   m = B.GetNrows();
+   n = B.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         B(i,j) = 1.0;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::Relu(TMatrixT<Real_t> &A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         A(i,j) = std::max((Real_t) 0.0, A(i,j));
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::ReluDerivative(TMatrixT<Real_t> & B,
+                                              const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++)
+   {
+      for (size_t j = 0; j < n; j++)
+      {
+         B(i,j) = (A(i,j) < 0) ? 0.0 : 1.0;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::Sigmoid(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t sig = 1.0 / (1.0 + std::exp(-A(i,j)));
+         A(i,j) = sig;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::SigmoidDerivative(TMatrixT<Real_t> & B,
+                                                 const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t sig = 1.0 / (1.0 + std::exp(-A(i,j)));
+         B(i,j) = sig * (1.0 - sig);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::Tanh(TMatrixT<Real_t> & B)
+{
+   size_t m,n;
+   m = B.GetNrows();
+   n = B.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t t = tanh(B(i,j));
+         B(i,j) = t;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::TanhDerivative(TMatrixT<Real_t> & B,
+                                              const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t t = tanh(A(i,j));
+         B(i,j) = 1 - t * t;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::SymmetricRelu(TMatrixT<Real_t> & B)
+{
+   size_t m,n;
+   m = B.GetNrows();
+   n = B.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         B(i,j) = fabs(B(i,j));
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::SymmetricReluDerivative(TMatrixT<Real_t> & B,
+                                                       const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         B(i,j) = (A(i,j) < 0.0) ? -1.0 : 1.0;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::SoftSign(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t x = A(i,j);
+         A(i,j)   = x / (1 + fabs(x));
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::SoftSignDerivative(TMatrixT<Real_t> & B,
+                                                  const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t x = 1.0 + fabs(A(i,j));
+         B(i,j)   = 1.0 / (x * x);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::Gauss(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t x = A(i,j);
+         A(i,j)   = exp(- x * x);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+inline void TReference<Real_t>::GaussDerivative(TMatrixT<Real_t> & B,
+                                               const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t x = A(i,j);
+         B(i,j)   = - 2.0 * x * exp(- x * x);
+      }
+   }
+}
+} // namespace DNN
+} // namespace TMVA
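Each activation function above comes with an analytic derivative; SigmoidDerivative, for example, writes sig * (1 - sig) element-wise. A quick way to validate such pairs is a central finite-difference check. The minimal standalone sketch below does this for the sigmoid and is illustrative only, not part of the patch.

// Finite-difference check (illustrative sketch) for an analytic derivative
// such as the one implemented in SigmoidDerivative above.
#include <cmath>
#include <iostream>

int main()
{
   const double x   = 0.3;
   const double eps = 1e-6;

   auto sigmoid = [](double v) { return 1.0 / (1.0 + std::exp(-v)); };

   double analytic = sigmoid(x) * (1.0 - sigmoid(x));                    // as in SigmoidDerivative
   double numeric  = (sigmoid(x + eps) - sigmoid(x - eps)) / (2.0 * eps);

   std::cout << "analytic = " << analytic
             << ", numeric = " << numeric
             << ", |diff| = " << std::fabs(analytic - numeric) << "\n";
   return 0;
}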
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/Dropout.cxx b/tmva/tmva/src/DNN/Architectures/Reference/Dropout.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..04e14811436c0518bdbc677cba1cd3a63372d12c
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/Dropout.cxx
@@ -0,0 +1,50 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////////
+ // Implementation of the dropout function for the reference     //
+ // implementation.                                              //
+ //////////////////////////////////////////////////////////////////
+
+
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TRandom.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+
+template<typename Real_t>
+void TReference<Real_t>::Dropout(TMatrixT<Real_t> & B, Real_t dropoutProbability)
+{
+   size_t m,n;
+   m = B.GetNrows();
+   n = B.GetNcols();
+
+   TRandom rand(time(nullptr));
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t r = rand.Uniform();
+         if (r >= dropoutProbability) {
+            B(i,j) = 0.0;
+         } else {
+            B(i,j) /= dropoutProbability;
+         }
+      }
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
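Note that in both the reference version above and the CUDA Dropout kernel, dropoutProbability is the probability of keeping a unit: an element survives with probability p and is rescaled by 1/p, so the expected activation is unchanged (inverted dropout). A hedged standard-library sketch of the same convention, with illustrative names, is shown below.

// Illustrative sketch of the inverted-dropout convention used above:
// keepProbability plays the role of dropoutProbability in the patch.
#include <random>
#include <vector>

void DropoutHost(std::vector<double> &a, double keepProbability, unsigned seed = 0)
{
   std::mt19937 gen(seed);
   std::uniform_real_distribution<double> uni(0.0, 1.0);
   for (double &v : a) {
      if (uni(gen) >= keepProbability)
         v = 0.0;                  // dropped with probability 1 - p
      else
         v /= keepProbability;     // kept and rescaled by 1/p
   }
}

int main()
{
   std::vector<double> a = {1.0, 2.0, 3.0, 4.0};
   DropoutHost(a, 0.5);            // roughly half the entries become 0, the rest are doubled
   return 0;
}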
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/Initialization.cxx b/tmva/tmva/src/DNN/Architectures/Reference/Initialization.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..789f08773abcdfb2db099315d40f408484024a9a
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/Initialization.cxx
@@ -0,0 +1,97 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////////////
+ // Implementation of the initialization functions for the reference //
+ // implementation.                                                  //
+ //////////////////////////////////////////////////////////////////////
+
+#include "TRandom.h"
+#include "TMVA/DNN/Architectures/Reference.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::InitializeGauss(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   TRandom rand(time(nullptr));
+
+   Real_t sigma = sqrt(2.0 / ((Real_t) n));
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         A(i,j) = rand.Gaus(0.0, sigma);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::InitializeUniform(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   TRandom rand(time(nullptr));
+
+   Real_t range = sqrt(2.0 / ((Real_t) n));
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         A(i,j) = rand.Uniform(-range, range);
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::InitializeIdentity(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         A(i,j) = 0.0;
+      }
+
+      if (i < n) {
+         A(i,i) = 1.0;
+      }
+   }
+}
+
+template<typename Real_t>
+void TReference<Real_t>::InitializeZero(TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m ; i++) {
+      for (size_t j = 0; j < n ; j++) {
+         A(i,j) = 0.0;
+      }
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
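InitializeGauss and InitializeUniform both scale the weight spread with sqrt(2 / n), where n is the number of columns of the weight matrix, i.e. the layer's fan-in, which keeps the spread of the pre-activations roughly independent of the layer width. A small standard-library sketch of the Gaussian variant, with assumed names, follows.

// Hedged sketch of the fan-in scaled Gaussian initialization above, using the
// standard library instead of TRandom (names are illustrative only).
#include <cmath>
#include <cstddef>
#include <random>
#include <vector>

void InitializeGaussHost(std::vector<std::vector<double>> &A, unsigned seed = 0)
{
   std::size_t n = A.empty() ? 0 : A[0].size();              // fan-in: number of columns
   double sigma  = std::sqrt(2.0 / static_cast<double>(n));

   std::mt19937 gen(seed);
   std::normal_distribution<double> gauss(0.0, sigma);

   for (auto &row : A)
      for (double &w : row)
         w = gauss(gen);
}

int main()
{
   std::vector<std::vector<double>> W(4, std::vector<double>(8, 0.0));  // 4 x 8 weight matrix
   InitializeGaussHost(W);
   return 0;
}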
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..aa0b144be2053fa411cb04a6a2a82c9b425ceb89
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx
@@ -0,0 +1,101 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ ////////////////////////////////////////////////////////////
+ // Implementation of the loss functions for the reference //
+ // implementation.                                        //
+ ////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Reference.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+//______________________________________________________________________________
+template<typename Real_t>
+Real_t TReference<Real_t>::MeanSquaredError(const TMatrixT<Real_t> &Y,
+                                           const TMatrixT<Real_t> &output)
+{
+   size_t m,n;
+   m = Y.GetNrows();
+   n = Y.GetNcols();
+   Real_t result = 0.0;
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t dY = (Y(i,j) - output(i,j));
+         result += dY * dY;
+      }
+   }
+   result /= (Real_t) (m * n);
+   return result;
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::MeanSquaredErrorGradients(TMatrixT<Real_t> & dY,
+                                                  const TMatrixT<Real_t> & Y,
+                                                  const TMatrixT<Real_t> & output)
+{
+   size_t m,n;
+   m = Y.GetNrows();
+   n = Y.GetNcols();
+
+   dY.Minus(Y, output);
+   dY *= - 2.0 / ((Real_t) (m*n));
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+Real_t TReference<Real_t>::CrossEntropy(const TMatrixT<Real_t> &Y,
+                                       const TMatrixT<Real_t> &output)
+{
+   size_t m,n;
+   m = Y.GetNrows();
+   n = Y.GetNcols();
+   Real_t result = 0.0;
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t sig = 1.0 / (1.0 + std::exp(-output(i,j)));
+         result += Y(i,j) * std::log(sig)
+                   + (1.0 - Y(i,j)) * std::log(1.0 - sig);
+      }
+   }
+   result /= - (Real_t) (m * n);
+   return result;
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::CrossEntropyGradients(TMatrixT<Real_t> & dY,
+                                              const TMatrixT<Real_t> & Y,
+                                              const TMatrixT<Real_t> & output)
+{
+   size_t m,n;
+   m = Y.GetNrows();
+   n = Y.GetNcols();
+
+   Real_t norm = 1.0 / ((Real_t) (m * n));
+   for (size_t i = 0; i < m; i++)
+   {
+      for (size_t j = 0; j < n; j++)
+      {
+         Real_t y   = Y(i,j);
+         Real_t sig = 1.0 / (1.0 + std::exp(-output(i,j)));
+         dY(i,j) = norm * (sig - y);
+      }
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..731c95713d404a49942e8ecc6c86e9f2c51c340f
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx
@@ -0,0 +1,37 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 11/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////
+// Implementation of the output functions for the reference    //
+// implementation.                                              //
+/////////////////////////////////////////////////////////////////
+
+namespace TMVA {
+namespace DNN  {
+
+template<typename Real_t>
+void TReference<Real_t>::Sigmoid(TMatrixT<Real_t> & B,
+                                const TMatrixT<Real_t> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         Real_t sig = 1.0 / (1.0 + std::exp(-A(i,j)));
+         B(i,j) = sig;
+      }
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/Propagation.cxx b/tmva/tmva/src/DNN/Architectures/Reference/Propagation.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..aa4d3515d8ccd8162d49f8275344da167e450363
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/Propagation.cxx
@@ -0,0 +1,102 @@
+// @(#)root/tmva/tmva/dnn:$Id$ // Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////////
+// Implementation of the functions required for the forward and    //
+// backward propagation of activations through a neural network in //
+// the reference implementation.                                   //
+/////////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Reference.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+template<typename Scalar_t>
+void TReference<Scalar_t>::MultiplyTranspose(TMatrixT<Scalar_t> &output,
+                                            const TMatrixT<Scalar_t> &input,
+                                            const TMatrixT<Scalar_t> &weights)
+{
+    output.MultT(input, weights);
+}
+
+template<typename Scalar_t>
+void TReference<Scalar_t>::AddRowWise(TMatrixT<Scalar_t> &output,
+                                     const TMatrixT<Scalar_t> &biases)
+{
+   for (size_t i = 0; i < (size_t) output.GetNrows(); i++) {
+      for (size_t j = 0; j < (size_t) output.GetNcols(); j++) {
+         output(i,j) += biases(j,0);
+      }
+   }
+}
+
+template<typename Scalar_t>
+void TReference<Scalar_t>::Backward(TMatrixT<Scalar_t> & activation_gradients_backward,
+                                   TMatrixT<Scalar_t> & weight_gradients,
+                                   TMatrixT<Scalar_t> & bias_gradients,
+                                   TMatrixT<Scalar_t> & df,
+                                   const TMatrixT<Scalar_t> & activation_gradients,
+                                   const TMatrixT<Scalar_t> & weights,
+                                   const TMatrixT<Scalar_t> & activations_backward)
+{
+
+   // Compute element-wise product.
+   for (size_t i = 0; i < (size_t) df.GetNrows(); i++) {
+      for (size_t j = 0; j < (size_t) df.GetNcols(); j++) {
+         df(i,j) *= activation_gradients(i,j);
+      }
+   }
+
+   // Activation gradients.
+   if (activation_gradients_backward.GetNoElements() > 0) {
+       activation_gradients_backward.Mult(df, weights);
+   }
+
+   // Weights gradients.
+   if (weight_gradients.GetNoElements() > 0) {
+      weight_gradients.TMult(df, activations_backward);
+   }
+
+   // Bias gradients.
+   if (bias_gradients.GetNoElements() > 0) {
+      for (size_t j = 0; j < (size_t) df.GetNcols(); j++) {
+         Scalar_t sum = 0.0;
+         for (size_t i = 0; i < (size_t) df.GetNrows(); i++) {
+            sum += df(i,j);
+         }
+         bias_gradients(j,0) = sum;
+      }
+   }
+}
+
+template<typename Scalar_t>
+void TReference<Scalar_t>::ScaleAdd(TMatrixT<Scalar_t> & A,
+                                   const TMatrixT<Scalar_t> & B,
+                                   Scalar_t beta)
+{
+   for (size_t i = 0; i < (size_t) A.GetNrows(); i++) {
+      for (size_t j = 0; j < (size_t) A.GetNcols(); j++) {
+         A(i,j) += beta * B(i,j);
+      }
+   }
+}
+
+template<typename Scalar_t>
+void TReference<Scalar_t>::Copy(TMatrixT<Scalar_t> & A,
+                                const TMatrixT<Scalar_t> & B)
+{
+   A = B;
+}
+
+} // namespace DNN
+} // namespace TMVA
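For orientation, the Backward step above combines four operations: the element-wise product df = df ⊙ activation_gradients, the gradients with respect to the previous activations df * weights, the weight gradients df^T * activations_backward, and the bias gradients as column sums of df. The sketch below spells these out for plain nested std::vector matrices with the shapes used by the reference code (df: batch x width, weights: width x widthPrev, activations_backward: batch x widthPrev); all names are illustrative and it is not the TMVA implementation.

// Minimal sketch (assumed names) of the four backward-pass steps above for
// nested std::vector matrices.
#include <cstddef>
#include <vector>

using Matrix = std::vector<std::vector<double>>;

void BackwardHost(Matrix &actGradBackward,        // batch x widthPrev
                  Matrix &weightGrad,             // width x widthPrev
                  std::vector<double> &biasGrad,  // width
                  Matrix &df,                     // batch x width
                  const Matrix &actGrad,          // batch x width
                  const Matrix &weights,          // width x widthPrev
                  const Matrix &actBackward)      // batch x widthPrev
{
   std::size_t batch = df.size(), width = df[0].size(), widthPrev = weights[0].size();

   // 1) Hadamard product with the incoming activation gradients.
   for (std::size_t i = 0; i < batch; i++)
      for (std::size_t j = 0; j < width; j++)
         df[i][j] *= actGrad[i][j];

   // 2) Gradients w.r.t. the previous layer's activations: df * weights.
   for (std::size_t i = 0; i < batch; i++)
      for (std::size_t k = 0; k < widthPrev; k++) {
         double s = 0.0;
         for (std::size_t j = 0; j < width; j++) s += df[i][j] * weights[j][k];
         actGradBackward[i][k] = s;
      }

   // 3) Weight gradients: df^T * activations_backward.
   for (std::size_t j = 0; j < width; j++)
      for (std::size_t k = 0; k < widthPrev; k++) {
         double s = 0.0;
         for (std::size_t i = 0; i < batch; i++) s += df[i][j] * actBackward[i][k];
         weightGrad[j][k] = s;
      }

   // 4) Bias gradients: column sums of df.
   for (std::size_t j = 0; j < width; j++) {
      double s = 0.0;
      for (std::size_t i = 0; i < batch; i++) s += df[i][j];
      biasGrad[j] = s;
   }
}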
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/Regularization.cxx b/tmva/tmva/src/DNN/Architectures/Reference/Regularization.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..ffe0f19ec7db19e5401bc304711c0ec3a2d6ddb8
--- /dev/null
+++ b/tmva/tmva/src/DNN/Architectures/Reference/Regularization.cxx
@@ -0,0 +1,98 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 10/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+ //////////////////////////////////////////////////////////////////////
+ // Implementation of the regularization functions for the reference //
+ // implementation.                                                  //
+ //////////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Reference.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+//______________________________________________________________________________
+template<typename Real_t>
+Real_t TReference<Real_t>::L1Regularization(const TMatrixT<Real_t> & W)
+{
+   size_t m,n;
+   m = W.GetNrows();
+   n = W.GetNcols();
+
+   Real_t result = 0.0;
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         result += std::abs(W(i,j));
+      }
+   }
+   return result;
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::AddL1RegularizationGradients(TMatrixT<Real_t> & A,
+                                                     const TMatrixT<Real_t> & W,
+                                                     Real_t weightDecay)
+{
+   size_t m,n;
+   m = W.GetNrows();
+   n = W.GetNcols();
+
+   Real_t sign = 0.0;
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         sign = (W(i,j) > 0.0) ? 1.0 : -1.0;
+         A(i,j) += sign * weightDecay;
+      }
+   }
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+Real_t TReference<Real_t>::L2Regularization(const TMatrixT<Real_t> & W)
+{
+   size_t m,n;
+   m = W.GetNrows();
+   n = W.GetNcols();
+
+   Real_t result = 0.0;
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         result += W(i,j) * W(i,j);
+      }
+   }
+   return result;
+}
+
+//______________________________________________________________________________
+template<typename Real_t>
+void TReference<Real_t>::AddL2RegularizationGradients(TMatrixT<Real_t> & A,
+                                                     const TMatrixT<Real_t> & W,
+                                                     Real_t weightDecay)
+{
+   size_t m,n;
+   m = W.GetNrows();
+   n = W.GetNcols();
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         A(i,j) += weightDecay * 2.0 * W(i,j);
+      }
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/DNN/DataLoader.cxx b/tmva/tmva/src/DNN/DataLoader.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..f07258dcb881e1c6c707fb7418014b8313f20c7f
--- /dev/null
+++ b/tmva/tmva/src/DNN/DataLoader.cxx
@@ -0,0 +1,20 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 08/08/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////////////
+// Implementation of the generic data loader for neural network input data. //
+//////////////////////////////////////////////////////////////////////////////
+
+namespace TMVA {
+namespace DNN  {
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/src/MethodDNN.cxx b/tmva/tmva/src/MethodDNN.cxx
index d3f0e547ca3497ead30a589f0041157fb8685ae6..7ee53e1a9313b34aff23e5c286b8872c5774afc6 100644
--- a/tmva/tmva/src/MethodDNN.cxx
+++ b/tmva/tmva/src/MethodDNN.cxx
@@ -4,14 +4,15 @@
 /**********************************************************************************
  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *
  * Package: TMVA                                                                  *
- * Class  : MethodDNN                                                              *
+ * Class  : MethodDNN                                                             *
  * Web    : http://tmva.sourceforge.net                                           *
  *                                                                                *
  * Description:                                                                   *
  *      A neural network implementation                                           *
  *                                                                                *
  * Authors (alphabetical):                                                        *
- *      Peter Speckmayer      <peter.speckmayer@gmx.ch> - CERN, Switzerland       *
+ *      Simon Pfreundschuh    <s.pfreundschuh@gmail.com> - CERN, Switzerland      *
+ *      Peter Speckmayer      <peter.speckmayer@gmx.ch>  - CERN, Switzerland      *
  *                                                                                *
  * Copyright (c) 2005-2015:                                                       *
  *      CERN, Switzerland                                                         *
@@ -24,10 +25,10 @@
  * (http://tmva.sourceforge.net/LICENSE)                                          *
  **********************************************************************************/
 
-//_______________________________________________________________________
+//______________________________________________________________________________
 //
-// neural network implementation
-//_______________________________________________________________________
+// Deep Neural Network Implementation
+//______________________________________________________________________________
 
 #include "TString.h"
 #include "TTree.h"
@@ -42,44 +43,34 @@
 #include "TMVA/Config.h"
 #include "TMVA/Ranking.h"
 
+#include "TMVA/DNN/Net.h"
+#include "TMVA/DNN/Architectures/Reference.h"
+
 #include "TMVA/NeuralNet.h"
 #include "TMVA/Monitoring.h"
 
 #include <algorithm>
 #include <iostream>
+#include <string>
+#include <iomanip>
 
 REGISTER_METHOD(DNN)
 
 ClassImp(TMVA::MethodDNN)
 
+using TMVA::DNN::EActivationFunction;
+using TMVA::DNN::ELossFunction;
+using TMVA::DNN::EInitialization;
+using TMVA::DNN::EOutputFunction;
 
-
-
-   namespace TMVA
-   {
-      namespace DNN
-      {
-         template <typename Container, typename T>
-         void gaussDistribution (Container& container, T mean, T sigma)
-         {
-            for (auto it = begin (container), itEnd = end (container); it != itEnd; ++it)
-               {
-                  (*it) = DNN::gaussDouble (mean, sigma);
-               }
-         }
-      };
-   };
-
-
-
-
-
+namespace TMVA
+{
 
 //______________________________________________________________________________
-TMVA::MethodDNN::MethodDNN( const TString& jobName,
-                            const TString& methodTitle,
-                            DataSetInfo& theData,
-                            const TString& theOption )
+TMVA::MethodDNN::MethodDNN(const TString& jobName,
+                           const TString& methodTitle,
+                           DataSetInfo& theData,
+                           const TString& theOption)
    : MethodBase( jobName, Types::kDNN, methodTitle, theData, theOption)
    , fResume (false)
 {
@@ -87,10 +78,9 @@ TMVA::MethodDNN::MethodDNN( const TString& jobName,
 }
 
 //______________________________________________________________________________
-TMVA::MethodDNN::MethodDNN( DataSetInfo& theData,
-                            const TString& theWeightFile)
-   : MethodBase( Types::kDNN, theData, theWeightFile)
-   , fResume (false)
+TMVA::MethodDNN::MethodDNN(DataSetInfo& theData,
+                           const TString& theWeightFile)
+   : MethodBase( Types::kDNN, theData, theWeightFile), fResume (false)
 {
    // constructor from a weight file
 }
@@ -102,10 +92,13 @@ TMVA::MethodDNN::~MethodDNN()
    // nothing to be done
 }
 
-//_______________________________________________________________________
-Bool_t TMVA::MethodDNN::HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t /*numberTargets*/ )
+//______________________________________________________________________________
+Bool_t TMVA::MethodDNN::HasAnalysisType(Types::EAnalysisType type,
+                                        UInt_t numberClasses,
+                                        UInt_t /*numberTargets*/ )
 {
-   // MLP can handle classification with 2 classes and regression with one regression-target
+   // MLP can handle classification with 2 classes and regression with
+   // one regression-target
    if (type == Types::kClassification && numberClasses == 2 ) return kTRUE;
    if (type == Types::kMulticlass ) return kTRUE;
    if (type == Types::kRegression ) return kTRUE;
@@ -119,221 +112,254 @@ void TMVA::MethodDNN::Init()
    // default initializations
 }
 
-//_______________________________________________________________________
+//______________________________________________________________________________
 void TMVA::MethodDNN::DeclareOptions()
 {
-   // define the options (their key words) that can be set in the option string
-   // know options:
-   // TrainingMethod  <string>     Training method
-   //    available values are:         BP   Back-Propagation <default>
-   //                                  GA   Genetic Algorithm (takes a LONG time)
-   //
-   // LearningRate    <float>      DNN learning rate parameter
-   // DecayRate       <float>      Decay rate for learning parameter
-   // TestRate        <int>        Test for overtraining performed at each #th epochs
+   // Options to be set in the option string:
    //
-   // BPMode          <string>     Back-propagation learning mode
-   //    available values are:         sequential <default>
-   //                                  batch
-   //
-   // BatchSize       <int>        Batch size: number of events/batch, only set if in Batch Mode,
-   //                                          -1 for BatchSize=number_of_events
-
-   // DeclareOptionRef(fTrainMethodS="SD", "TrainingMethod",
-   //                  "Train with back propagation steepest descend");
-   // AddPreDefVal(TString("SD"));
-
-   //   DeclareOptionRef(fLayoutString="TANH|(N+30)*2,TANH|(N+30),LINEAR",    "Layout",    "neural network layout");
-   // DeclareOptionRef(fLayoutString="RELU|(N+20)*2,RELU|(N+10)*2,LINEAR",    "Layout",    "neural network layout");
-   DeclareOptionRef(fLayoutString="SOFTSIGN|(N+100)*2,LINEAR",    "Layout",    "neural network layout");
-
-
-   DeclareOptionRef(fErrorStrategy="CROSSENTROPY",    "ErrorStrategy",    "error strategy (regression: sum of squares; classification: crossentropy; multiclass: crossentropy/mutual exclusive cross entropy");
+   // LearningRate    <float>      DNN learning rate parameter.
+   // DecayRate       <float>      Decay rate for learning parameter.
+   // TestRate        <int>        Period of validation set error computation.
+   // BatchSize       <int>        Number of events per batch.
+
+   DeclareOptionRef(fLayoutString="SOFTSIGN|(N+100)*2,LINEAR",
+                                  "Layout",
+                                  "Layou of the network.");
+
+   DeclareOptionRef(fErrorStrategy="CROSSENTROPY",
+                    "ErrorStrategy",
+                    "Loss function: Mean squared error (regression)"
+                    " or cross entropy (binary classifcation).");
    AddPreDefVal(TString("CROSSENTROPY"));
    AddPreDefVal(TString("SUMOFSQUARES"));
-   AddPreDefVal(TString("MUTUALEXCLUSIVE"));
-   AddPreDefVal(TString("CHECKGRADIENTS"));
 
-
-   DeclareOptionRef(fWeightInitializationStrategyString="XAVIER",    "WeightInitialization",    "Weight initialization strategy");
+   DeclareOptionRef(fWeightInitializationString="XAVIER",
+                    "WeightInitialization",
+                    "Weight initialization strategy");
    AddPreDefVal(TString("XAVIER"));
    AddPreDefVal(TString("XAVIERUNIFORM"));
-   AddPreDefVal(TString("LAYERSIZE"));
-
-
-   DeclareOptionRef(fTrainingStrategy="LearningRate=1e-1,Momentum=0.3,Repetitions=3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Renormalize=L2,DropConfig=0.0,DropRepetitions=5|LearningRate=1e-4,Momentum=0.3,Repetitions=3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Renormalize=L2,DropConfig=0.0+0.5+0.5,DropRepetitions=5,Multithreading=True",    "TrainingStrategy",    "defines the training strategies");
 
-   DeclareOptionRef(fSumOfSigWeights_test=1000.0,    "SignalWeightsSum",    "Sum of weights of signal; Is used to compute the significance on the fly");
-   DeclareOptionRef(fSumOfBkgWeights_test=1000.0,    "BackgroundWeightsSum",    "Sum of weights of background; Is used to compute the significance on the fly");
+   DeclareOptionRef(fArchitectureString="STANDARD",
+                    "Architecture",
+                    "Which architecture to perform the training on.");
+   AddPreDefVal(TString("STANDARD"));
+   AddPreDefVal(TString("CPU"));
+   AddPreDefVal(TString("GPU"));
+   AddPreDefVal(TString("OPENCL"));
+
+   DeclareOptionRef(
+       fTrainingStrategyString = "LearningRate=1e-1,"
+                                 "Momentum=0.3,"
+                                 "Repetitions=3,"
+                                 "ConvergenceSteps=50,"
+                                 "BatchSize=30,"
+                                 "TestRepetitions=7,"
+                                 "WeightDecay=0.0,"
+                                 "Renormalize=L2,"
+                                 "DropConfig=0.0,"
+                                 "DropRepetitions=5|LearningRate=1e-4,"
+                                 "Momentum=0.3,"
+                                 "Repetitions=3,"
+                                 "ConvergenceSteps=50,"
+                                 "BatchSize=20,"
+                                 "TestRepetitions=7,"
+                                 "WeightDecay=0.001,"
+                                 "Renormalize=L2,"
+                                 "DropConfig=0.0+0.5+0.5,"
+                                 "DropRepetitions=5,"
+                                 "Multithreading=True",
+       "TrainingStrategy",
+       "Defines the training strategies.");
 }
 
-
-std::vector<std::pair<int,TMVA::DNN::EnumFunction>> TMVA::MethodDNN::ParseLayoutString(TString layerSpec)
+//______________________________________________________________________________
+auto TMVA::MethodDNN::ParseLayoutString(TString layoutString)
+    -> LayoutVector_t
 {
    // parse layout specification string and return a vector, each entry
    // containing the number of neurons to go in each successive layer
-   std::vector<std::pair<int,TMVA::DNN::EnumFunction>> layout;
-   const TString delim_Layer (",");
-   const TString delim_Sub ("|");
+   LayoutVector_t layout;
+   const TString layerDelimiter(",");
+   const TString subDelimiter("|");
 
-   const size_t inputSize = GetNvar ();
+   const size_t inputSize = GetNvar();
 
-   TObjArray* layerStrings = layerSpec.Tokenize (delim_Layer);
-   TIter nextLayer (layerStrings);
+   TObjArray* layerStrings = layoutString.Tokenize(layerDelimiter);
+   TIter       nextLayer (layerStrings);
    TObjString* layerString = (TObjString*)nextLayer ();
-   for (; layerString != NULL; layerString = (TObjString*)nextLayer ())
-      {
-         int numNodes = 0;
-         TMVA::DNN::EnumFunction eActivationFunction = DNN::EnumFunction::TANH;
-
-         TObjArray* subStrings = layerString->GetString ().Tokenize (delim_Sub);
-         TIter nextToken (subStrings);
-         TObjString* token = (TObjString*)nextToken ();
-         int idxToken = 0;
-         for (; token != NULL; token = (TObjString*)nextToken ())
-            {
-               switch (idxToken)
-                  {
-                  case 0:
-                     {
-                        TString strActFnc (token->GetString ());
-                        if (strActFnc == "RELU")
-                           eActivationFunction = DNN::EnumFunction::RELU;
-                        else if (strActFnc == "TANH")
-                           eActivationFunction = DNN::EnumFunction::TANH;
-                        else if (strActFnc == "SYMMRELU")
-                           eActivationFunction = DNN::EnumFunction::SYMMRELU;
-                        else if (strActFnc == "SOFTSIGN")
-                           eActivationFunction = DNN::EnumFunction::SOFTSIGN;
-                        else if (strActFnc == "SIGMOID")
-                           eActivationFunction = DNN::EnumFunction::SIGMOID;
-                        else if (strActFnc == "LINEAR")
-                           eActivationFunction = DNN::EnumFunction::LINEAR;
-                        else if (strActFnc == "GAUSS")
-                           eActivationFunction = DNN::EnumFunction::GAUSS;
-                     }
-                     break;
-                  case 1: // number of nodes
-                     {
-                        TString strNumNodes (token->GetString ());
-                        TString strN ("x");
-                        strNumNodes.ReplaceAll ("N", strN);
-                        strNumNodes.ReplaceAll ("n", strN);
-                        TFormula fml ("tmp",strNumNodes);
-                        numNodes = fml.Eval (inputSize);
-                     }
-                     break;
-                  }
-               ++idxToken;
+
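+   // Each ","-separated token specifies one layer as ACTIVATION|NODES
+   // (e.g. "TANH|(N+30)*2"). The node-count expression may use N (or n) as a
+   // placeholder for the number of input variables; it is evaluated with
+   // TFormula below.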
+   for (; layerString != nullptr; layerString = (TObjString*) nextLayer()) {
+      int numNodes = 0;
+      EActivationFunction activationFunction = EActivationFunction::kTanh;
+
+      TObjArray* subStrings = layerString->GetString().Tokenize(subDelimiter);
+      TIter nextToken (subStrings);
+      TObjString* token = (TObjString *) nextToken();
+      int idxToken = 0;
+      for (; token != nullptr; token = (TObjString *) nextToken()) {
+         switch (idxToken)
+         {
+         case 0:
+         {
+            TString strActFnc (token->GetString ());
+            if (strActFnc == "RELU") {
+                activationFunction = DNN::EActivationFunction::kRelu;
+            } else if (strActFnc == "TANH") {
+                activationFunction = DNN::EActivationFunction::kTanh;
+            } else if (strActFnc == "SYMMRELU") {
+                activationFunction = DNN::EActivationFunction::kSymmRelu;
+            } else if (strActFnc == "SOFTSIGN") {
+                activationFunction = DNN::EActivationFunction::kSoftSign;
+            } else if (strActFnc == "SIGMOID") {
+                activationFunction = DNN::EActivationFunction::kSigmoid;
+            } else if (strActFnc == "LINEAR") {
+                activationFunction = DNN::EActivationFunction::kIdentity;
+            } else if (strActFnc == "GAUSS") {
+                activationFunction = DNN::EActivationFunction::kGauss;
             }
-         layout.push_back (std::make_pair (numNodes,eActivationFunction));
+         }
+         break;
+         case 1: // number of nodes
+         {
+            TString strNumNodes (token->GetString ());
+            TString strN ("x");
+            strNumNodes.ReplaceAll ("N", strN);
+            strNumNodes.ReplaceAll ("n", strN);
+            TFormula fml ("tmp",strNumNodes);
+            numNodes = fml.Eval (inputSize);
+         }
+         break;
+         }
+         ++idxToken;
+      }
+      layout.push_back(std::make_pair(numNodes, activationFunction));
       }
    return layout;
 }
 
-
-
 // parse key value pairs in blocks -> return vector of blocks with map of key value pairs
-std::vector<std::map<TString,TString>> TMVA::MethodDNN::ParseKeyValueString(TString parseString, TString blockDelim, TString tokenDelim)
+//______________________________________________________________________________
+auto TMVA::MethodDNN::ParseKeyValueString(TString parseString,
+                                          TString blockDelim,
+                                          TString tokenDelim)
+    -> KeyValueVector_t
 {
-   std::vector<std::map<TString,TString>> blockKeyValues;
+   KeyValueVector_t blockKeyValues;
    const TString keyValueDelim ("=");
 
-   //    const size_t inputSize = GetNvar ();
-
    TObjArray* blockStrings = parseString.Tokenize (blockDelim);
    TIter nextBlock (blockStrings);
-   TObjString* blockString = (TObjString*)nextBlock ();
-   for (; blockString != NULL; blockString = (TObjString*)nextBlock ())
+   TObjString* blockString = (TObjString *) nextBlock();
+
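+   // Each block (separated by blockDelim, e.g. "|") becomes one map of
+   // KEY=value pairs (separated by tokenDelim, e.g. ","). Keys are
+   // upper-cased so that option lookups are case-insensitive.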
+   for (; blockString != nullptr; blockString = (TObjString *) nextBlock())
+   {
+      blockKeyValues.push_back (std::map<TString,TString>());
+      std::map<TString,TString>& currentBlock = blockKeyValues.back ();
+
+      TObjArray* subStrings = blockString->GetString ().Tokenize (tokenDelim);
+      TIter nextToken (subStrings);
+      TObjString* token = (TObjString*)nextToken ();
+
+      for (; token != nullptr; token = (TObjString *)nextToken())
       {
-         blockKeyValues.push_back (std::map<TString,TString> ()); // new block
-         std::map<TString,TString>& currentBlock = blockKeyValues.back ();
-
-         TObjArray* subStrings = blockString->GetString ().Tokenize (tokenDelim);
-         TIter nextToken (subStrings);
-         TObjString* token = (TObjString*)nextToken ();
-       
-         for (; token != NULL; token = (TObjString*)nextToken ())
-            {
-               TString strKeyValue (token->GetString ());
-               int delimPos = strKeyValue.First (keyValueDelim.Data ());
-               if (delimPos <= 0)
-                  continue;
-
-               TString strKey = TString (strKeyValue (0, delimPos));
-               strKey.ToUpper ();
-               TString strValue = TString (strKeyValue (delimPos+1, strKeyValue.Length ()));
-
-               strKey.Strip (TString::kBoth, ' ');
-               strValue.Strip (TString::kBoth, ' ');
-
-               currentBlock.insert (std::make_pair (strKey, strValue));
-            }
+         TString strKeyValue (token->GetString ());
+         int delimPos = strKeyValue.First (keyValueDelim.Data ());
+         if (delimPos <= 0)
+             continue;
+
+         TString strKey = TString (strKeyValue (0, delimPos));
+         strKey.ToUpper();
+         TString strValue = TString (strKeyValue (delimPos+1, strKeyValue.Length ()));
+
+         // TString::Strip returns the stripped substring without modifying
+         // the original, so the result has to be assigned back.
+         strKey   = strKey.Strip (TString::kBoth, ' ');
+         strValue = strValue.Strip (TString::kBoth, ' ');
+
+         currentBlock.insert (std::make_pair (strKey, strValue));
       }
+   }
    return blockKeyValues;
 }
 
-
-TString fetchValue (const std::map<TString, TString>& keyValueMap, TString _key)
+//______________________________________________________________________________
+TString fetchValue (const std::map<TString, TString>& keyValueMap, TString key)
 {
-   TString key (_key);
    key.ToUpper ();
    std::map<TString, TString>::const_iterator it = keyValueMap.find (key);
-   if (it == keyValueMap.end ())
+   if (it == keyValueMap.end()) {
       return TString ("");
+   }
    return it->second;
 }
 
+//______________________________________________________________________________
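+// Typed overloads of fetchValue: look up 'key' in the key/value map and
+// convert the stored string to the requested type, returning 'defaultValue'
+// if the key is not present.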
 template <typename T>
-T fetchValue (const std::map<TString,TString>& keyValueMap, TString key, T defaultValue);
+T fetchValue(const std::map<TString,TString>& keyValueMap,
+              TString key,
+              T defaultValue);
 
+//______________________________________________________________________________
 template <>
-int fetchValue (const std::map<TString,TString>& keyValueMap, TString key, int defaultValue)
+int fetchValue(const std::map<TString,TString>& keyValueMap,
+               TString key,
+               int defaultValue)
 {
    TString value (fetchValue (keyValueMap, key));
-   if (value == "")
+   if (value == "") {
       return defaultValue;
+   }
    return value.Atoi ();
 }
 
+//______________________________________________________________________________
 template <>
-double fetchValue (const std::map<TString,TString>& keyValueMap, TString key, double defaultValue)
+double fetchValue (const std::map<TString,TString>& keyValueMap,
+                   TString key, double defaultValue)
 {
    TString value (fetchValue (keyValueMap, key));
-   if (value == "")
+   if (value == "") {
       return defaultValue;
+   }
    return value.Atof ();
 }
 
+//______________________________________________________________________________
 template <>
-TString fetchValue (const std::map<TString,TString>& keyValueMap, TString key, TString defaultValue)
+TString fetchValue (const std::map<TString,TString>& keyValueMap,
+                    TString key, TString defaultValue)
 {
    TString value (fetchValue (keyValueMap, key));
-   if (value == "")
+   if (value == "") {
       return defaultValue;
+   }
    return value;
 }
 
+//______________________________________________________________________________
 template <>
-bool fetchValue (const std::map<TString,TString>& keyValueMap, TString key, bool defaultValue)
+bool fetchValue (const std::map<TString,TString>& keyValueMap,
+                 TString key, bool defaultValue)
 {
    TString value (fetchValue (keyValueMap, key));
-   if (value == "")
+   if (value == "") {
       return defaultValue;
+   }
    value.ToUpper ();
-   if (value == "TRUE" ||
-       value == "T" ||
-       value == "1")
+   if (value == "TRUE" || value == "T" || value == "1") {
       return true;
+   }
    return false;
 }
 
+//______________________________________________________________________________
 template <>
-std::vector<double> fetchValue (const std::map<TString,TString>& keyValueMap, TString key, std::vector<double> defaultValue)
+std::vector<double> fetchValue(const std::map<TString, TString> & keyValueMap,
+                               TString key,
+                               std::vector<double> defaultValue)
 {
    TString parseString (fetchValue (keyValueMap, key));
-   if (parseString == "")
+   if (parseString == "") {
       return defaultValue;
+   }
    parseString.ToUpper ();
    std::vector<double> values;
 
@@ -341,610 +367,776 @@ std::vector<double> fetchValue (const std::map<TString,TString>& keyValueMap, TS
    TObjArray* tokenStrings = parseString.Tokenize (tokenDelim);
    TIter nextToken (tokenStrings);
    TObjString* tokenString = (TObjString*)nextToken ();
-   for (; tokenString != NULL; tokenString = (TObjString*)nextToken ())
-      {
-         std::stringstream sstr;
-         double currentValue;
-         sstr << tokenString->GetString ().Data ();
-         sstr >> currentValue;
-         values.push_back (currentValue);
-      }
+   for (; tokenString != nullptr; tokenString = (TObjString*)nextToken ()) {
+      std::stringstream sstr;
+      double currentValue;
+      sstr << tokenString->GetString ().Data ();
+      sstr >> currentValue;
+      values.push_back (currentValue);
+   }
    return values;
 }
 
-
-
-//_______________________________________________________________________
+//______________________________________________________________________________
 void TMVA::MethodDNN::ProcessOptions()
 {
-   // process user options
-   //   MethodBase::ProcessOptions();
-
-   if (fErrorStrategy == "CHECKGRADIENTS") 
-      return checkGradients ();
-
-
-   
    if (IgnoreEventsWithNegWeightsInTraining()) {
-      Log() << kINFO 
+      Log() << kINFO
             << "Will ignore negative events in training!"
             << Endl;
    }
 
+   //
+   // Set network structure.
+   //
+
    fLayout = TMVA::MethodDNN::ParseLayoutString (fLayoutString);
+   size_t inputSize = GetNVariables ();
+   size_t outputSize = (GetNTargets() == 0) ? 1 : GetNTargets();
 
-   //                                                                                         block-delimiter  token-delimiter
-   std::vector<std::map<TString,TString>> strategyKeyValues = ParseKeyValueString (fTrainingStrategy, TString ("|"), TString (","));
+   fNet.SetBatchSize(1);
+   fNet.SetInputWidth(inputSize);
 
+   auto itLayout    = std::begin (fLayout);
+   auto itLayoutEnd = std::end (fLayout)-1;
+   for ( ; itLayout != itLayoutEnd; ++itLayout) {
+      fNet.AddLayer((*itLayout).first, (*itLayout).second);
+   }
+   fNet.AddLayer(outputSize, EActivationFunction::kIdentity);
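+   // The output layer uses the identity activation; the actual output
+   // transformation (e.g. sigmoid for classification) is applied separately
+   // via fOutputFunction, set below.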
 
-   if (fWeightInitializationStrategyString == "XAVIER")
-      fWeightInitializationStrategy = TMVA::DNN::WeightInitializationStrategy::XAVIER;
-   if (fWeightInitializationStrategyString == "XAVIERUNIFORM")
-      fWeightInitializationStrategy = TMVA::DNN::WeightInitializationStrategy::XAVIERUNIFORM;
-   else if (fWeightInitializationStrategyString == "LAYERSIZE")
-      fWeightInitializationStrategy = TMVA::DNN::WeightInitializationStrategy::LAYERSIZE;
-   else if (fWeightInitializationStrategyString == "TEST")
-      fWeightInitializationStrategy = TMVA::DNN::WeightInitializationStrategy::TEST;
-   else
-      fWeightInitializationStrategy = TMVA::DNN::WeightInitializationStrategy::XAVIER;
+   //
+   // Loss function and output.
+   //
 
-   // create settings
+   fOutputFunction = EOutputFunction::kSigmoid;
    if (fAnalysisType == Types::kClassification)
-      {
-         if (fErrorStrategy == "SUMOFSQUARES") fModeErrorFunction = TMVA::DNN::ModeErrorFunction::SUMOFSQUARES;
-         if (fErrorStrategy == "CROSSENTROPY") fModeErrorFunction = TMVA::DNN::ModeErrorFunction::CROSSENTROPY;
-         if (fErrorStrategy == "MUTUALEXCLUSIVE") fModeErrorFunction = TMVA::DNN::ModeErrorFunction::CROSSENTROPY_MUTUALEXCLUSIVE;
+   {
+      if (fErrorStrategy == "SUMOFSQUARES") {
+         fNet.SetLossFunction(ELossFunction::kMeanSquaredError);
       }
-   else if (fAnalysisType == Types::kMulticlass)
-      {
-         if (fErrorStrategy == "SUMOFSQUARES") fModeErrorFunction = TMVA::DNN::ModeErrorFunction::SUMOFSQUARES;
-         if (fErrorStrategy == "CROSSENTROPY") fModeErrorFunction = TMVA::DNN::ModeErrorFunction::CROSSENTROPY;
-         if (fErrorStrategy == "MUTUALEXCLUSIVE") fModeErrorFunction = TMVA::DNN::ModeErrorFunction::CROSSENTROPY_MUTUALEXCLUSIVE;
+      if (fErrorStrategy == "CROSSENTROPY") {
+         fNet.SetLossFunction(ELossFunction::kCrossEntropy);
       }
-   else if (fAnalysisType == Types::kRegression)
-      {
-         if (fErrorStrategy != "SUMOFSQUARES")
-            {
-               Log () << kWARNING 
-                      << "For regression only SUMOFSQUARES is a valid neural net error function."
-                      << "Setting error function to SUMOFSQUARES now."
-                      << Endl;
-            }
-         fModeErrorFunction = TMVA::DNN::ModeErrorFunction::SUMOFSQUARES;
+      fOutputFunction = EOutputFunction::kSigmoid;
+   } else if (fAnalysisType == Types::kRegression) {
+      if (fErrorStrategy != "SUMOFSQUARES") {
+         Log () << kWARNING << "For regression only SUMOFSQUARES is a valid "
+                << "neural net error function. Setting error function to "
+                << "SUMOFSQUARES now." << Endl;
       }
-   
-   for (auto& block : strategyKeyValues)
-      {
-         size_t convergenceSteps = fetchValue (block, "ConvergenceSteps", 100);
-         int batchSize = fetchValue (block, "BatchSize", 30);
-         int testRepetitions = fetchValue (block, "TestRepetitions", 7);
-         double factorWeightDecay = fetchValue (block, "WeightDecay", 0.0);
-         TString regularization = fetchValue (block, "Regularization", TString ("NONE"));
-         double learningRate = fetchValue (block, "LearningRate", 1e-5);
-         double momentum = fetchValue (block, "Momentum", 0.3);
-         int repetitions = fetchValue (block, "Repetitions", 3);
-         TString strMultithreading = fetchValue (block, "Multithreading", TString ("True"));
-         std::vector<double> dropConfig;
-         dropConfig = fetchValue (block, "DropConfig", dropConfig);
-         int dropRepetitions = fetchValue (block, "DropRepetitions", 3);
-
-         TMVA::DNN::EnumRegularization eRegularization = TMVA::DNN::EnumRegularization::NONE;
-         if (regularization == "L1")
-            eRegularization = TMVA::DNN::EnumRegularization::L1;
-         else if (regularization == "L2")
-            eRegularization = TMVA::DNN::EnumRegularization::L2;
-         else if (regularization == "L1MAX")
-            eRegularization = TMVA::DNN::EnumRegularization::L1MAX;
-
-
-         strMultithreading.ToUpper ();
-         bool multithreading = true;
-         if (strMultithreading.BeginsWith ("T"))
-            multithreading = true;
-         else
-            multithreading = false;
-           
-
-         if (fAnalysisType == Types::kClassification)
-            {
-               std::shared_ptr<TMVA::DNN::ClassificationSettings> ptrSettings = make_shared <TMVA::DNN::ClassificationSettings> (
-                                                                                                                                 GetName  (),
-                                                                                                                                 convergenceSteps, batchSize, 
-                                                                                                                                 testRepetitions, factorWeightDecay,
-                                                                                                                                 eRegularization, fScaleToNumEvents, TMVA::DNN::MinimizerType::fSteepest,
-                                                                                                                                 learningRate, 
-                                                                                                                                 momentum, repetitions, multithreading);
-               ptrSettings->setWeightSums (fSumOfSigWeights_test, fSumOfBkgWeights_test);
-               fSettings.push_back (ptrSettings);
-            }
-         else if (fAnalysisType == Types::kMulticlass)
-            {
-               std::shared_ptr<TMVA::DNN::Settings> ptrSettings = make_shared <TMVA::DNN::Settings> (
-                                                                                                     GetName  (),
-                                                                                                     convergenceSteps, batchSize, 
-                                                                                                     testRepetitions, factorWeightDecay,
-                                                                                                     eRegularization, TMVA::DNN::MinimizerType::fSteepest,
-                                                                                                     learningRate, 
-                                                                                                     momentum, repetitions, multithreading);
-               fSettings.push_back (ptrSettings);
-            }
-         else if (fAnalysisType == Types::kRegression)
-            {
-               std::shared_ptr<TMVA::DNN::Settings> ptrSettings = make_shared <TMVA::DNN::Settings> (
-                                                                                                     GetName  (),
-                                                                                                     convergenceSteps, batchSize, 
-                                                                                                     testRepetitions, factorWeightDecay,
-                                                                                                     eRegularization, TMVA::DNN::MinimizerType::fSteepest,
-                                                                                                     learningRate, 
-                                                                                                     momentum, repetitions, multithreading);
-               fSettings.push_back (ptrSettings);
-            }
+      fNet.SetLossFunction(ELossFunction::kMeanSquaredError);
+      fOutputFunction = EOutputFunction::kIdentity;
+   } else if (fAnalysisType == Types::kMulticlass) {
+      if (fErrorStrategy == "SUMOFSQUARES") {
+         fNet.SetLossFunction(ELossFunction::kMeanSquaredError);
+      }
+      if (fErrorStrategy == "CROSSENTROPY") {
+         fNet.SetLossFunction(ELossFunction::kCrossEntropy);
+      }
+      if (fErrorStrategy == "MUTUALEXCLUSIVE") {
+         Log () << kFatal << "MUTUALEXCLUSIVE not yet implemented." << Endl;
+      }
+      fOutputFunction = EOutputFunction::kSigmoid;
+   }
 
-           
-         if (dropRepetitions > 0 && !dropConfig.empty ())
-            {
-               fSettings.back ()->setDropOut (std::begin (dropConfig), std::end (dropConfig), dropRepetitions);
-            }
-           
+   //
+   // Initialization
+   //
+
+   if (fWeightInitializationString == "XAVIER") {
+      fWeightInitialization = DNN::EInitialization::kGauss;
+   }
+   else if (fWeightInitializationString == "XAVIERUNIFORM") {
+      fWeightInitialization = DNN::EInitialization::kUniform;
+   }
+   else {
+      fWeightInitialization = DNN::EInitialization::kGauss;
+   }
+
+   //
+   // Training settings.
+   //
+
+   KeyValueVector_t strategyKeyValues = ParseKeyValueString(fTrainingStrategyString,
+                                                            TString ("|"),
+                                                            TString (","));
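+
+   // Each "|"-separated block of the TrainingStrategy option defines one
+   // training phase with its own learning rate, batch size, regularization
+   // and dropout configuration.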
+   for (auto& block : strategyKeyValues) {
+      TTrainingSettings settings;
+
+      settings.convergenceSteps = fetchValue(block, "ConvergenceSteps", 100);
+      settings.batchSize        = fetchValue(block, "BatchSize", 30);
+      settings.testInterval     = fetchValue(block, "TestRepetitions", 7);
+      settings.weightDecay      = fetchValue(block, "WeightDecay", 0.0);
+      settings.learningRate         = fetchValue(block, "LearningRate", 1e-5);
+      settings.momentum             = fetchValue(block, "Momentum", 0.3);
+      settings.dropoutProbabilities = fetchValue(block, "DropConfig",
+                                                 std::vector<Double_t>());
+
+      TString regularization = fetchValue(block, "Regularization",
+                                          TString ("NONE"));
+      if (regularization == "L1") {
+         settings.regularization = DNN::ERegularization::kL1;
+      } else if (regularization == "L2") {
+         settings.regularization = DNN::ERegularization::kL2;
+      }
+
+      TString strMultithreading = fetchValue(block, "Multithreading",
+                                             TString ("True"));
+      if (strMultithreading.BeginsWith ("T")) {
+         settings.multithreading = true;
+      } else {
+         settings.multithreading = false;
       }
+
+      fTrainingSettings.push_back(settings);
+   }
 }
 
 //______________________________________________________________________________
 void TMVA::MethodDNN::Train()
 {
-    
-   fMonitoring = NULL;
-   // if (!fMonitoring)
-   // {
-   //     fMonitoring = make_shared<Monitoring>();
-   //     fMonitoring->Start ();
-   // }
-
-   // INITIALIZATION
-   // create pattern
+   if (fArchitectureString == "GPU") {
+      TrainGpu();
+      return;
+   } else if (fArchitectureString == "OPENCL") {
+      Log() << kFATAL << "OpenCL backend not yet supported." << Endl;
+      return;
+   } else if (fArchitectureString == "CPU") {
+      TrainCpu<Double_t>();
+      return;
+   }
+
+   Log() << kINFO << "Using Standard Implementation." << Endl;
+
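+   // Reference (STANDARD) implementation: convert the event collections into
+   // Pattern objects (with an additional bias input) and train the legacy
+   // TMVA::DNN::Net constructed further below.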
    std::vector<Pattern> trainPattern;
    std::vector<Pattern> testPattern;
 
    const std::vector<TMVA::Event*>& eventCollectionTraining = GetEventCollection (Types::kTraining);
    const std::vector<TMVA::Event*>& eventCollectionTesting  = GetEventCollection (Types::kTesting);
 
-   for (size_t iEvt = 0, iEvtEnd = eventCollectionTraining.size (); iEvt < iEvtEnd; ++iEvt)
-      {
-         const TMVA::Event* event = eventCollectionTraining.at (iEvt);
-         const std::vector<Float_t>& values  = event->GetValues  ();
-         if (fAnalysisType == Types::kClassification)
-            {
-               double outputValue = event->GetClass () == 0 ? 0.9 : 0.1;
-               trainPattern.push_back (Pattern (values.begin  (), values.end (), outputValue, event->GetWeight ()));
-               trainPattern.back ().addInput (1.0); // bias node
-            }
-         else
-            {
-               const std::vector<Float_t>& targets = event->GetTargets ();
-               trainPattern.push_back (Pattern (values.begin  (), values.end (), targets.begin (), targets.end (), event->GetWeight ()));
-               trainPattern.back ().addInput (1.0); // bias node
-            }
-      }
-
-   for (size_t iEvt = 0, iEvtEnd = eventCollectionTesting.size (); iEvt < iEvtEnd; ++iEvt)
-      {
-         const TMVA::Event* event = eventCollectionTesting.at (iEvt);
-         const std::vector<Float_t>& values  = event->GetValues  ();
-         if (fAnalysisType == Types::kClassification)
-            {
-               double outputValue = event->GetClass () == 0 ? 0.9 : 0.1;
-               testPattern.push_back (Pattern (values.begin  (), values.end (), outputValue, event->GetWeight ()));
-               testPattern.back ().addInput (1.0); // bias node
-            }
-         else
-            {
-               const std::vector<Float_t>& targets = event->GetTargets ();
-               testPattern.push_back (Pattern (values.begin  (), values.end (), targets.begin (), targets.end (), event->GetWeight ()));
-               testPattern.back ().addInput (1.0); // bias node
-            }
+   for (auto &event : eventCollectionTraining) {
+      const std::vector<Float_t>& values = event->GetValues();
+      if (fAnalysisType == Types::kClassification) {
+         double outputValue = event->GetClass () == 0 ? 0.9 : 0.1;
+         trainPattern.push_back(Pattern (values.begin(),
+                                         values.end(),
+                                         outputValue,
+                                         event->GetWeight()));
+         trainPattern.back().addInput(1.0);
+      } else {
+         const std::vector<Float_t>& targets = event->GetTargets ();
+         trainPattern.push_back(Pattern(values.begin(),
+                                        values.end(),
+                                        targets.begin(),
+                                        targets.end(),
+                                        event->GetWeight ()));
+         trainPattern.back ().addInput (1.0); // bias node
       }
+   }
 
-   if (trainPattern.empty () || testPattern.empty ())
-      return;
-
-   // create net and weights
-   fNet.clear ();
-   fWeights.clear ();
-
-   // if "resume" from saved weights
-   if (fResume)
-      {
-         std::cout << ".. resume" << std::endl;
-         //        std::tie (fNet, fWeights) = ReadWeights (fFileName);
+   for (auto &event : eventCollectionTesting) {
+      const std::vector<Float_t>& values = event->GetValues();
+      if (fAnalysisType == Types::kClassification) {
+         double outputValue = event->GetClass () == 0 ? 0.9 : 0.1;
+         testPattern.push_back(Pattern (values.begin(),
+                                         values.end(),
+                                         outputValue,
+                                         event->GetWeight()));
+         testPattern.back().addInput(1.0);
+      } else {
+         const std::vector<Float_t>& targets = event->GetTargets ();
+         testPattern.push_back(Pattern(values.begin(),
+                                        values.end(),
+                                        targets.begin(),
+                                        targets.end(),
+                                        event->GetWeight ()));
+         testPattern.back ().addInput (1.0); // bias node
       }
-   else // initialize weights and net
-      {
-         size_t inputSize = GetNVariables (); //trainPattern.front ().input ().size ();
-         size_t outputSize = fAnalysisType == Types::kClassification ? 1 : GetNTargets (); //trainPattern.front ().output ().size ();
-         fNet.setInputSize (inputSize + 1); // num vars + bias node
-         fNet.setOutputSize (outputSize); // num vars + bias node
-        
-         // configure neural net
-         auto itLayout = std::begin (fLayout), itLayoutEnd = std::end (fLayout)-1; // all layers except the last one
-         for ( ; itLayout != itLayoutEnd; ++itLayout)
-            {
-               fNet.addLayer (DNN::Layer ((*itLayout).first, (*itLayout).second)); 
-               Log() << kINFO 
-                     << "Add Layer with " << (*itLayout).first << " nodes." 
-                     << Endl;
-            }
+   }
 
-         DNN::ModeOutputValues eModeOutputValues = DNN::ModeOutputValues::SIGMOID;
-         if (fAnalysisType == Types::kRegression)
-            {
-               eModeOutputValues = DNN::ModeOutputValues::DIRECT;
-            }
-         else if ((fAnalysisType == Types::kClassification ||
-                   fAnalysisType == Types::kMulticlass) &&
-                  fModeErrorFunction == TMVA::DNN::ModeErrorFunction::SUMOFSQUARES)
-            {
-               eModeOutputValues = DNN::ModeOutputValues::DIRECT;
-            }
-         fNet.addLayer (DNN::Layer (outputSize, (*itLayout).second, eModeOutputValues)); 
-         Log() << kINFO 
-               << "Add Layer with " << outputSize << " nodes." 
-               << Endl << Endl;
-         fNet.setErrorFunction (fModeErrorFunction); 
-
-         size_t numWeights = fNet.numWeights ();
-         Log() << kINFO 
-               << "Total number of Synapses = " 
-               << numWeights
-               << Endl;
-
-         // initialize weights
-         fNet.initializeWeights (fWeightInitializationStrategy, 
-                                 std::back_inserter (fWeights));
+   TMVA::DNN::Net      net;
+   std::vector<double> weights;
+
+   net.setInputSize(fNet.GetInputWidth() + 1);
+   net.setOutputSize(fNet.GetOutputWidth() + 1);
+
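+   // Translate the layer structure of fNet into the legacy Net/Layer classes:
+   // map each EActivationFunction to the corresponding EnumFunction and attach
+   // the output mode (direct or sigmoid) to the last layer only.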
+   for (size_t i = 0; i < fNet.GetDepth(); i++) {
+      EActivationFunction f = fNet.GetLayer(i).GetActivationFunction();
+      EnumFunction        g = EnumFunction::LINEAR;
+      switch(f) {
+         case EActivationFunction::kIdentity: g = EnumFunction::LINEAR;   break;
+         case EActivationFunction::kRelu:     g = EnumFunction::RELU;     break;
+         case EActivationFunction::kSigmoid:  g = EnumFunction::SIGMOID;  break;
+         case EActivationFunction::kTanh:     g = EnumFunction::TANH;     break;
+         case EActivationFunction::kSymmRelu: g = EnumFunction::SYMMRELU; break;
+         case EActivationFunction::kSoftSign: g = EnumFunction::SOFTSIGN; break;
+         case EActivationFunction::kGauss:    g = EnumFunction::GAUSS;    break;
       }
-
-
-   // loop through settings 
-   // and create "settings" and minimizer 
-   int idxSetting = 0;
-   for (auto itSettings = std::begin (fSettings), itSettingsEnd = std::end (fSettings); itSettings != itSettingsEnd; ++itSettings, ++idxSetting)
-      {
-         std::shared_ptr<TMVA::DNN::Settings> ptrSettings = *itSettings;
-         ptrSettings->setMonitoring (fMonitoring);
-         Log() << kINFO
-               << "Training with learning rate = " << ptrSettings->learningRate ()
-               << ", momentum = " << ptrSettings->momentum ()
-               << ", repetitions = " << ptrSettings->repetitions ()
-               << Endl;
-
-         ptrSettings->setProgressLimits ((idxSetting)*100.0/(fSettings.size ()), (idxSetting+1)*100.0/(fSettings.size ()));
-
-         const std::vector<double>& dropConfig = ptrSettings->dropFractions ();
-         if (!dropConfig.empty ())
-            {
-               Log () << kINFO << "Drop configuration" << Endl
-                      << "    drop repetitions = " << ptrSettings->dropRepetitions () << Endl;
-            }
-         int idx = 0;
-         for (auto f : dropConfig)
-            {
-               Log () << kINFO << "    Layer " << idx << " = " << f << Endl;
-               ++idx;
-            }
-         Log () << kINFO << Endl;
-        
-         if (ptrSettings->minimizerType () == TMVA::DNN::MinimizerType::fSteepest)
-            {
-               DNN::Steepest minimizer (ptrSettings->learningRate (), ptrSettings->momentum (), ptrSettings->repetitions ());
-               /*E =*/fNet.train (fWeights, trainPattern, testPattern, minimizer, *ptrSettings.get ());
-            }
-         ptrSettings.reset ();
-         Log () << kINFO << Endl;
+      if (i < fNet.GetDepth() - 1) {
+         net.addLayer(Layer(fNet.GetLayer(i).GetWidth(), g));
+      } else {
+         ModeOutputValues h = ModeOutputValues::DIRECT;
+         switch(fOutputFunction) {
+            case EOutputFunction::kIdentity: h = ModeOutputValues::DIRECT;  break;
+            case EOutputFunction::kSigmoid:  h = ModeOutputValues::SIGMOID; break;
+         }
+         net.addLayer(Layer(fNet.GetLayer(i).GetWidth(), g, h));
       }
-   fMonitoring = 0;
-}
-
+   }
 
+   switch(fNet.GetLossFunction()) {
+      case ELossFunction::kMeanSquaredError:
+         net.setErrorFunction(ModeErrorFunction::SUMOFSQUARES);
+         break;
+      case ELossFunction::kCrossEntropy:
+         net.setErrorFunction(ModeErrorFunction::CROSSENTROPY);
+         break;
+   }
 
+   switch(fWeightInitialization) {
+      case EInitialization::kGauss:
+          net.initializeWeights(WeightInitializationStrategy::XAVIER,
+                                std::back_inserter(weights));
+          break;
+      case EInitialization::kUniform:
+          net.initializeWeights(WeightInitializationStrategy::XAVIERUNIFORM,
+                                std::back_inserter(weights));
+          break;
+      default:
+          net.initializeWeights(WeightInitializationStrategy::XAVIER,
+                                std::back_inserter(weights));
+          break;
+   }
 
 
-//_______________________________________________________________________
-Double_t TMVA::MethodDNN::GetMvaValue( Double_t* /*errLower*/, Double_t* /*errUpper*/ )
-{
-   if (fWeights.empty ())
-      return 0.0;
-
-   const std::vector<Float_t>& inputValues = GetEvent ()->GetValues ();
-   std::vector<double> input (inputValues.begin (), inputValues.end ());
-   input.push_back (1.0); // bias node
-   std::vector<double> output = fNet.compute (input, fWeights);
-   if (output.empty ())
-      return 0.0;
+   int idxSetting = 0;
+   for (auto s : fTrainingSettings) {
 
-   return output.at (0);
-}
+      EnumRegularization r = EnumRegularization::NONE;
+      switch(s.regularization) {
+         case ERegularization::kNone: r = EnumRegularization::NONE; break;
+         case ERegularization::kL1:   r = EnumRegularization::L1;   break;
+         case ERegularization::kL2:   r = EnumRegularization::L2;   break;
+      }
 
-////////////////////////////////////////////////////////////////////////////////
-/// get the regression value generated by the DNN
+      Settings * settings = new Settings(TString(), s.convergenceSteps, s.batchSize,
+                                         s.testInterval, s.weightDecay, r,
+                                         MinimizerType::fSteepest, s.learningRate,
+                                         s.momentum, 1, s.multithreading);
+      std::shared_ptr<Settings> ptrSettings(settings);
+      ptrSettings->setMonitoring (0);
+      Log() << kINFO
+            << "Training with learning rate = " << ptrSettings->learningRate ()
+            << ", momentum = " << ptrSettings->momentum ()
+            << ", repetitions = " << ptrSettings->repetitions ()
+            << Endl;
 
-const std::vector<Float_t> &TMVA::MethodDNN::GetRegressionValues() 
-{
-   assert (!fWeights.empty ());
-   if (fWeights.empty ())
-      return *fRegressionReturnVal;
-
-   const Event * ev = GetEvent();
-    
-   const std::vector<Float_t>& inputValues = ev->GetValues ();
-   std::vector<double> input (inputValues.begin (), inputValues.end ());
-   input.push_back (1.0); // bias node
-   std::vector<double> output = fNet.compute (input, fWeights);
-
-   if (fRegressionReturnVal == NULL) fRegressionReturnVal = new std::vector<Float_t>();
-   fRegressionReturnVal->clear();
+      ptrSettings->setProgressLimits (idxSetting*100.0/fTrainingSettings.size(),
+                                      (idxSetting+1)*100.0/fTrainingSettings.size());
 
-   assert (!output.empty ());
-   if (output.empty ())
-      return *fRegressionReturnVal;
+      const std::vector<double>& dropConfig = ptrSettings->dropFractions ();
+      if (!dropConfig.empty ()) {
+         Log () << kINFO << "Drop configuration" << Endl
+                << "    drop repetitions = " << ptrSettings->dropRepetitions()
+                << Endl;
+      }
 
-   Event * evT = new Event(*ev);
-   UInt_t ntgts = fNet.outputSize ();
-   for (UInt_t itgt = 0; itgt < ntgts; ++itgt) {
-      evT->SetTarget(itgt,output.at (itgt));
+      int idx = 0;
+      for (auto f : dropConfig) {
+         Log () << kINFO << "    Layer " << idx << " = " << f << Endl;
+         ++idx;
+      }
+      Log () << kINFO << Endl;
+
+      DNN::Steepest minimizer(ptrSettings->learningRate(),
+                              ptrSettings->momentum(),
+                              ptrSettings->repetitions());
+      net.train(weights, trainPattern, testPattern, minimizer, *ptrSettings.get());
+      ptrSettings.reset();
+      Log () << kINFO << Endl;
+      idxSetting++;
    }
-
-   const Event* evT2 = GetTransformationHandler().InverseTransform( evT );
-   for (UInt_t itgt = 0; itgt < ntgts; ++itgt) {
-      fRegressionReturnVal->push_back( evT2->GetTarget(itgt) );
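+   // Copy the flat weight vector produced by the legacy trainer back into the
+   // layer-wise weight and bias matrices of fNet. Only the first layer
+   // receives bias values (the legacy net models the bias as an extra input
+   // node); the biases of deeper layers are set to zero.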
+   size_t weightIndex = 0;
+   for (size_t l = 0; l < fNet.GetDepth(); l++) {
+      auto & layerWeights = fNet.GetLayer(l).GetWeights();
+      for (size_t j = 0; j < layerWeights.GetNcols(); j++) {
+         for (size_t i = 0; i < layerWeights.GetNrows(); i++) {
+            layerWeights(i,j) = weights[weightIndex];
+            weightIndex++;
+         }
+      }
+      auto & layerBiases = fNet.GetLayer(l).GetBiases();
+      if (l == 0) {
+         for (size_t i = 0; i < layerBiases.GetNrows(); i++) {
+            layerBiases(i,0) = weights[weightIndex];
+            weightIndex++;
+         }
+      } else {
+         for (size_t i = 0; i < layerBiases.GetNrows(); i++) {
+            layerBiases(i,0) = 0.0;
+         }
+      }
    }
-
-   delete evT;
-
-   return *fRegressionReturnVal;
 }
 
+//______________________________________________________________________________
+void TMVA::MethodDNN::TrainGpu()
+{
 
+#ifdef DNNCUDA // Included only if DNNCUDA flag is set.
+
+   size_t nTrainingSamples = GetEventCollection(Types::kTraining).size();
+   size_t nTestSamples     = GetEventCollection(Types::kTesting).size();
+
+   Log() << kINFO << "Start of neural network training on GPU." << Endl;
+
+   size_t trainingPhase = 1;
+   fNet.Initialize(fWeightInitialization);
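+   // Run one training phase per entry in fTrainingSettings: each phase sets up
+   // a CUDA network with its own batch size, weight decay, regularization and
+   // dropout probabilities, plus a clone that is used to evaluate the test
+   // error.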
+   for (TTrainingSettings & settings : fTrainingSettings) {
+
+      TNet<TCuda<>> net(settings.batchSize, fNet);
+      net.SetWeightDecay(settings.weightDecay);
+      net.SetRegularization(settings.regularization);
+      net.SetDropoutProbabilities(settings.dropoutProbabilities);
+      net.InitializeGradients();
+      auto testNet = net.CreateClone(settings.batchSize);
+
+      Log() << kINFO << "Training phase " << trainingPhase << " of "
+            << fTrainingSettings.size() << ":" << Endl;
+      trainingPhase++;
+
+      using DataLoader_t = TDataLoader<TMVAInput_t, TCuda<>>;
+
+      size_t nThreads = 1;
+      DataLoader_t trainingData(GetEventCollection(Types::kTraining),
+                                nTrainingSamples,
+                                net.GetBatchSize(),
+                                net.GetInputWidth(),
+                                net.GetOutputWidth(), nThreads);
+      DataLoader_t testData(GetEventCollection(Types::kTesting),
+                            nTestSamples,
+                            testNet.GetBatchSize(),
+                            net.GetInputWidth(),
+                            net.GetOutputWidth(), nThreads);
+      DNN::TGradientDescent<TCuda<>> minimizer(settings.learningRate,
+                                             settings.convergenceSteps,
+                                             settings.testInterval);
+
+      std::vector<TNet<TCuda<>>> nets{};
+      std::vector<TBatch<TCuda<>>> batches{};
+      nets.reserve(nThreads);
+      for (size_t i = 0; i < nThreads; i++) {
+         nets.push_back(net);
+         for (size_t j = 0; j < net.GetDepth(); j++)
+         {
+            auto &masterLayer = net.GetLayer(j);
+            auto &layer = nets.back().GetLayer(j);
+            TCuda<>::Copy(layer.GetWeights(),
+                          masterLayer.GetWeights());
+            TCuda<>::Copy(layer.GetBiases(),
+                          masterLayer.GetBiases());
+         }
+      }
 
+      bool   converged = false;
+      size_t stepCount = 0;
+      size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
 
+      std::chrono::time_point<std::chrono::system_clock> start, end;
+      start = std::chrono::system_clock::now();
 
+      Log() << std::setw(10) << "Epoch" << " | "
+            << std::setw(12) << "Train Err."
+            << std::setw(12) << "Test  Err."
+            << std::setw(12) << "GFLOP/s"
+            << std::setw(12) << "Conv. Steps" << Endl;
+      std::string separator(62, '-');
+      Log() << separator << Endl;
 
+      while (!converged)
+      {
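+         // One pass of this loop corresponds to one epoch: shuffle the
+         // training data, perform a (momentum) gradient-descent step per
+         // batch, and every testInterval epochs evaluate the test error to
+         // check for convergence.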
+         stepCount++;
+
+         // Perform minimization steps for a full epoch.
+         trainingData.Shuffle();
+         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
+             batches.clear();
+             batches.reserve(nThreads);
+             for (size_t j = 0; j < nThreads; j++) {
+                 batches.push_back(trainingData.GetBatch());
+             }
+             if (settings.momentum > 0.0) {
+                 minimizer.StepMomentum(net, nets, batches, settings.momentum);
+             } else {
+                 minimizer.Step(net, nets, batches);
+             }
+         }
 
+         if ((stepCount % minimizer.GetTestInterval()) == 0) {
 
+            // Compute test error.
+            Double_t testError = 0.0;
+            for (auto batch : testData) {
+               auto inputMatrix  = batch.GetInput();
+               auto outputMatrix = batch.GetOutput();
+               testError += testNet.Loss(inputMatrix, outputMatrix);
+            }
+            testError /= (Double_t) (nTestSamples / settings.batchSize);
 
-////////////////////////////////////////////////////////////////////////////////
-/// get the multiclass classification values generated by the DNN
+            end   = std::chrono::system_clock::now();
 
-const std::vector<Float_t> &TMVA::MethodDNN::GetMulticlassValues()
-{
-   if (fWeights.empty ())
-      return *fRegressionReturnVal;
-
-   const std::vector<Float_t>& inputValues = GetEvent ()->GetValues ();
-   std::vector<double> input (inputValues.begin (), inputValues.end ());
-   input.push_back (1.0); // bias node
-   std::vector<double> output = fNet.compute (input, fWeights);
-
-   // check the output of the network
- 
-   if (fMulticlassReturnVal == NULL) fMulticlassReturnVal = new std::vector<Float_t>();
-   fMulticlassReturnVal->clear();
-   std::vector<Float_t> temp;
-
-   UInt_t nClasses = DataInfo().GetNClasses();
-   assert (nClasses == output.size());
-   for (UInt_t icls = 0; icls < nClasses; icls++) {
-      temp.push_back (output.at (icls));
-   }
-   
-   for(UInt_t iClass=0; iClass<nClasses; iClass++){
-      Double_t norm = 0.0;
-      for(UInt_t j=0;j<nClasses;j++){
-         if(iClass!=j)
-            norm+=exp(temp[j]-temp[iClass]);
+            // Compute training error.
+            Double_t trainingError = 0.0;
+            for (auto batch : trainingData) {
+               auto inputMatrix  = batch.GetInput();
+               auto outputMatrix = batch.GetOutput();
+               trainingError += net.Loss(inputMatrix, outputMatrix);
+            }
+            trainingError /= (Double_t) (nTrainingSamples / settings.batchSize);
+
+            // Compute numerical throughput.
+            std::chrono::duration<double> elapsed_seconds = end - start;
+            double seconds = elapsed_seconds.count();
+            double nFlops  = (double) (settings.testInterval * batchesInEpoch);
+            nFlops *= net.GetNFlops() * 1e-9;
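+            // Note: GetNFlops() presumably reports the floating-point
+            // operations of a single training step, so nFlops estimates the
+            // work done since the last test pass (in GFLOP).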
+
+            converged = minimizer.HasConverged(testError);
+            start = std::chrono::system_clock::now();
+
+            Log() << std::setw(10) << stepCount << " | "
+                  << std::setw(12) << trainingError
+                  << std::setw(12) << testError
+                  << std::setw(12) << nFlops / seconds
+                  << std::setw(12) << minimizer.GetConvergenceCount() << Endl;
+            if (converged) {
+               Log() << Endl;
+            }
+         }
+      }
+      for (size_t l = 0; l < net.GetDepth(); l++) {
+         fNet.GetLayer(l).GetWeights() = (TMatrixT<Double_t>) net.GetLayer(l).GetWeights();
+         fNet.GetLayer(l).GetBiases()  = (TMatrixT<Double_t>) net.GetLayer(l).GetBiases();
       }
-      (*fMulticlassReturnVal).push_back(1.0/(1.0+norm));
    }
 
+#else // DNNCUDA flag not set.
 
-   
-   return *fMulticlassReturnVal;
+   Log() << kFATAL << "CUDA backend not enabled. Please make sure "
+                      "you have CUDA installed and that it was successfully "
+                      "detected by CMake." << Endl;
+#endif // DNNCUDA
 }
 
-
-
-
-
-
-//_______________________________________________________________________
-void TMVA::MethodDNN::AddWeightsXMLTo( void* parent ) const 
+//______________________________________________________________________________
+template<typename AFloat>
+void TMVA::MethodDNN::TrainCpu()
 {
-   // create XML description of DNN classifier
-   // for all layers
 
-   void* nn = gTools().xmlengine().NewChild(parent, 0, "Weights");
-   void* xmlLayout = gTools().xmlengine().NewChild(nn, 0, "Layout");
-   Int_t numLayers = fNet.layers ().size ();
-   gTools().xmlengine().NewAttr(xmlLayout, 0, "NumberLayers", gTools().StringFromInt (numLayers) );
-   for (Int_t i = 0; i < numLayers; i++) 
-      {
-         const TMVA::DNN::Layer& layer = fNet.layers ().at (i);
-         int numNodes = layer.numNodes ();
-         char activationFunction = (char)(layer.activationFunctionType ());
-         int outputMode = (int)layer.modeOutputValues ();
-
-         TString outputModeStr;
-         outputModeStr.Form ("%d", outputMode);
-
-         void* layerxml = gTools().xmlengine().NewChild(xmlLayout, 0, "Layer");
-         gTools().xmlengine().NewAttr(layerxml, 0, "Connection",    TString("FULL") );
-         gTools().xmlengine().NewAttr(layerxml, 0, "Nodes",    gTools().StringFromInt(numNodes) );
-         gTools().xmlengine().NewAttr(layerxml, 0, "ActivationFunction",    TString (activationFunction) );
-         gTools().xmlengine().NewAttr(layerxml, 0, "OutputMode",    outputModeStr);
+#ifdef DNNCPU // Included only if DNNCPU flag is set.
+
+   size_t nTrainingSamples = GetEventCollection(Types::kTraining).size();
+   size_t nTestSamples     = GetEventCollection(Types::kTesting).size();
+
+   Log() << kINFO << "Start of neural network training on CPU." << Endl << Endl;
+
+   fNet.Initialize(fWeightInitialization);
+
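+   // The CPU path mirrors TrainGpu() but instantiates the network on the
+   // multi-threaded TCpu architecture, parametrized on the floating-point
+   // type AFloat.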
+   size_t trainingPhase = 1;
+   for (TTrainingSettings & settings : fTrainingSettings) {
+
+      Log() << "Training phase " << trainingPhase << " of "
+            << fTrainingSettings.size() << ":" << Endl;
+      trainingPhase++;
+
+      TNet<TCpu<AFloat>> net(settings.batchSize, fNet);
+      net.SetWeightDecay(settings.weightDecay);
+      net.SetRegularization(settings.regularization);
+      net.SetDropoutProbabilities(settings.dropoutProbabilities);
+      net.InitializeGradients();
+      auto testNet = net.CreateClone(settings.batchSize);
+
+      using DataLoader_t = TDataLoader<TMVAInput_t, TCpu<AFloat>>;
+
+      size_t nThreads = 1;
+      DataLoader_t trainingData(GetEventCollection(Types::kTraining),
+                                nTrainingSamples,
+                                net.GetBatchSize(),
+                                net.GetInputWidth(),
+                                net.GetOutputWidth(), nThreads);
+      DataLoader_t testData(GetEventCollection(Types::kTesting),
+                            nTestSamples,
+                            testNet.GetBatchSize(),
+                            net.GetInputWidth(),
+                            net.GetOutputWidth(), nThreads);
+      DNN::TGradientDescent<TCpu<AFloat>> minimizer(settings.learningRate,
+                                               settings.convergenceSteps,
+                                               settings.testInterval);
+
+      std::vector<TNet<TCpu<AFloat>>>   nets{};
+      std::vector<TBatch<TCpu<AFloat>>> batches{};
+      nets.reserve(nThreads);
+      for (size_t i = 0; i < nThreads; i++) {
+         nets.push_back(net);
+         for (size_t j = 0; j < net.GetDepth(); j++)
+         {
+            auto &masterLayer = net.GetLayer(j);
+            auto &layer = nets.back().GetLayer(j);
+            TCpu<AFloat>::Copy(layer.GetWeights(),
+                               masterLayer.GetWeights());
+            TCpu<AFloat>::Copy(layer.GetBiases(),
+                               masterLayer.GetBiases());
+         }
       }
 
+      bool   converged = false;
+      size_t stepCount = 0;
+      size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
 
-   void* weightsxml = gTools().xmlengine().NewChild(nn, 0, "Synapses");
-   gTools().xmlengine().NewAttr (weightsxml, 0, "InputSize", gTools().StringFromInt((int)fNet.inputSize ()));
-   gTools().xmlengine().NewAttr (weightsxml, 0, "OutputSize", gTools().StringFromInt((int)fNet.outputSize ()));
-   gTools().xmlengine().NewAttr (weightsxml, 0, "NumberSynapses", gTools().StringFromInt((int)fWeights.size ()));
-   std::stringstream s("");
-   s.precision( 16 );
-   for (std::vector<double>::const_iterator it = fWeights.begin (), itEnd = fWeights.end (); it != itEnd; ++it)
-      {
-         s << std::scientific << (*it) << " ";
-      }
-   gTools().xmlengine().AddRawLine (weightsxml, s.str().c_str());
-}
+      std::chrono::time_point<std::chrono::system_clock> start, end;
+      start = std::chrono::system_clock::now();
 
+      Log() << std::setw(10) << "Epoch" << " | "
+            << std::setw(12) << "Train Err."
+            << std::setw(12) << "Test  Err."
+            << std::setw(12) << "GFLOP/s"
+            << std::setw(12) << "Conv. Steps" << Endl;
+      std::string separator(62, '-');
+      Log() << separator << Endl;
 
-//_______________________________________________________________________
-void TMVA::MethodDNN::ReadWeightsFromXML( void* wghtnode )
-{
-   // read MLP from xml weight file
-   fNet.clear ();
-
-   void* nn = gTools().GetChild(wghtnode, "Weights");
-   if (!nn)
-      {
-         //       std::cout << "no node \"Weights\" in XML, use weightnode" << std::endl;
-         nn = wghtnode;
-      }
-   
-   void* xmlLayout = NULL;
-   xmlLayout = gTools().GetChild(wghtnode, "Layout");
-   if (!xmlLayout)
+      while (!converged)
       {
-         std::cout << "no node Layout in XML" << std::endl;
-         return;
-      }
+         stepCount++;
+         // Perform minimization steps for a full epoch.
+         trainingData.Shuffle();
+         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
+             batches.clear();
+             batches.reserve(nThreads);
+             for (size_t j = 0; j < nThreads; j++) {
+                 batches.push_back(trainingData.GetBatch());
+             }
+             if (settings.momentum > 0.0) {
+                 minimizer.StepMomentum(net, nets, batches, settings.momentum);
+             } else {
+                 minimizer.Step(net, nets, batches);
+             }
+         }
 
+         if ((stepCount % minimizer.GetTestInterval()) == 0) {
 
-   
-   //   std::cout << "read layout from XML" << std::endl;
-   void* ch = gTools().xmlengine().GetChild (xmlLayout);
-   TString connection;
-   UInt_t numNodes;
-   TString activationFunction;
-   TString outputMode;
-   fNet.clear ();
-   while (ch) 
-      {
-         gTools().ReadAttr (ch, "Connection", connection);
-         gTools().ReadAttr (ch, "Nodes", numNodes);
-         gTools().ReadAttr (ch, "ActivationFunction", activationFunction);
-         gTools().ReadAttr (ch, "OutputMode", outputMode);
-         ch = gTools().GetNextChild(ch);
+            // Compute test error.
+            AFloat testError = 0.0;
+            for (auto batch : testData) {
+               auto inputMatrix  = batch.GetInput();
+               auto outputMatrix = batch.GetOutput();
+               testError += testNet.Loss(inputMatrix, outputMatrix);
+            }
+            testError /= (Double_t) (nTestSamples / settings.batchSize);
 
-         fNet.addLayer (DNN::Layer (numNodes, (TMVA::DNN::EnumFunction)activationFunction (0), (DNN::ModeOutputValues)outputMode.Atoi ()));
-      }
+            end   = std::chrono::system_clock::now();
 
-   //   std::cout << "read weights XML" << std::endl;
+            // Compute training error.
+            AFloat trainingError = 0.0;
+            for (auto batch : trainingData) {
+               auto inputMatrix  = batch.GetInput();
+               auto outputMatrix = batch.GetOutput();
+               trainingError += net.Loss(inputMatrix, outputMatrix);
+            }
+            trainingError /= (Double_t) (nTrainingSamples / settings.batchSize);
+
+            // Compute numerical throughput.
+            std::chrono::duration<double> elapsed_seconds = end - start;
+            double seconds = elapsed_seconds.count();
+            double nFlops  = (double) (settings.testInterval * batchesInEpoch);
+            nFlops *= net.GetNFlops() * 1e-9;
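+            // nFlops now holds the work done since the last report in GFLOP,
+            // so nFlops / seconds printed below is the throughput in GFLOP/s.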
+
+            converged = minimizer.HasConverged(testError);
+            start = std::chrono::system_clock::now();
+
+            Log() << std::setw(10) << stepCount << " | "
+                  << std::setw(12) << trainingError
+                  << std::setw(12) << testError
+                  << std::setw(12) << nFlops / seconds
+                  << std::setw(12) << minimizer.GetConvergenceCount() << Endl;
+            if (converged) {
+               Log() << Endl;
+            }
+         }
+      }
 
-   void* xmlWeights  = NULL;
-   xmlWeights = gTools().GetChild(wghtnode, "Synapses");
-   if (!xmlWeights)
-      return;
 
-   Int_t numWeights (0);
-   Int_t inputSize (0);
-   Int_t outputSize (0);
-   gTools().ReadAttr (xmlWeights, "NumberSynapses", numWeights);
-   gTools().ReadAttr (xmlWeights, "InputSize", inputSize);
-   gTools().ReadAttr (xmlWeights, "OutputSize", outputSize);
-   fNet.setInputSize (inputSize);
-   fNet.setOutputSize (outputSize); // num vars + bias node
-
-   const char* content = gTools().GetContent (xmlWeights);
-   std::stringstream sstr (content);
-   for (Int_t iWeight = 0; iWeight<numWeights; ++iWeight) 
-      { // synapses
-         Double_t weight;
-         sstr >> weight;
-         fWeights.push_back (weight);
+      for (size_t l = 0; l < net.GetDepth(); l++) {
+         auto & layer = fNet.GetLayer(l);
+         layer.GetWeights() = (TMatrixT<Double_t>) net.GetLayer(l).GetWeights();
+         layer.GetBiases()  = (TMatrixT<Double_t>) net.GetLayer(l).GetBiases();
       }
-}
+   }
 
+#else // DNNCPU flag not set.
+   Log() << kFATAL << "Multi-core CPU backend not enabled. Please make sure "
+                      "you have a BLAS implementation  and tbb installed and"
+                      " it was successfully detected by CMAKE." << Endl;
+#endif // DNNCPU
+}
 
-//_______________________________________________________________________
-void TMVA::MethodDNN::ReadWeightsFromStream( std::istream & /*istr*/)
+//______________________________________________________________________________
+Double_t TMVA::MethodDNN::GetMvaValue( Double_t* /*errLower*/, Double_t* /*errUpper*/ )
 {
-   // // destroy/clear the network then read it back in from the weights file
-
-   // // delete network so we can reconstruct network from scratch
-
-   // TString dummy;
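+   // Evaluate the net on a single event: copy the event variables into a
+   // one-row input matrix and apply the output function to the net response.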
+   size_t nVariables = GetEvent()->GetNVariables();
+   Matrix_t X(1, nVariables);
+   Matrix_t YHat(1, 1);
 
-   // // synapse weights
-   // Double_t weight;
-   // std::vector<Double_t>* weights = new std::vector<Double_t>();
-   // istr>> dummy;
-   // while (istr>> dummy >> weight) weights->push_back(weight); // use w/ slower write-out
-
-   // ForceWeights(weights);
-   
+   const std::vector<Float_t>& inputValues = GetEvent()->GetValues();
+   for (size_t i = 0; i < nVariables; i++) {
+      X(0,i) = inputValues[i];
+   }
 
-   // delete weights;
+   fNet.Prediction(YHat, X, fOutputFunction);
+   return YHat(0,0);
 }
 
-//_______________________________________________________________________
-const TMVA::Ranking* TMVA::MethodDNN::CreateRanking()
+//______________________________________________________________________________
+const std::vector<Float_t> &TMVA::MethodDNN::GetRegressionValues()
 {
-   // compute ranking of input variables by summing function of weights
-
-   // create the ranking object
-   fRanking = new Ranking( GetName(), "Importance" );
+   size_t nVariables = GetEvent()->GetNVariables();
+   Matrix_t X(1, nVariables);
 
-   for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
-      fRanking->AddRank( Rank( GetInputLabel(ivar), 1.0));
+   const Event *ev = GetEvent();
+   const std::vector<Float_t>& inputValues = ev->GetValues();
+   for (size_t i = 0; i < nVariables; i++) {
+       X(0,i) = inputValues[i];
    }
 
-   // TNeuron*  neuron;
-   // TSynapse* synapse;
-   // Double_t  importance, avgVal;
-   // TString varName;
-
-   // for (UInt_t ivar = 0; ivar < GetNvar(); ivar++) {
-
-   //    neuron = GetInputNeuron(ivar);
-   //    Int_t numSynapses = neuron->NumPostLinks();
-   //    importance = 0;
-   //    varName = GetInputVar(ivar); // fix this line
-
-   //    // figure out average value of variable i
-   //    Double_t meanS, meanB, rmsS, rmsB, xmin, xmax;
-   //    Statistics( TMVA::Types::kTraining, varName, 
-   //                meanS, meanB, rmsS, rmsB, xmin, xmax );
+   size_t nTargets = std::max(1u, ev->GetNTargets());
+   Matrix_t YHat(1, nTargets);
+   std::vector<Float_t> output(nTargets);
+   auto net = fNet.CreateClone(1);
+   net.Prediction(YHat, X, fOutputFunction);
 
-   //    avgVal = (TMath::Abs(meanS) + TMath::Abs(meanB))/2.0;
-   //    double meanrms = (TMath::Abs(rmsS) + TMath::Abs(rmsB))/2.;
-   //    if (avgVal<meanrms) avgVal = meanrms;      
-   //    if (IsNormalised()) avgVal = 0.5*(1 + gTools().NormVariable( avgVal, GetXmin( ivar ), GetXmax( ivar ))); 
+   for (size_t i = 0; i < nTargets; i++)
+       output[i] = YHat(0, i);
 
-   //    for (Int_t j = 0; j < numSynapses; j++) {
-   //       synapse = neuron->PostLinkAt(j);
-   //       importance += synapse->GetWeight() * synapse->GetWeight();
-   //    }
-      
-   //    importance *= avgVal * avgVal;
+   if (fRegressionReturnVal == NULL) {
+       fRegressionReturnVal = new std::vector<Float_t>();
+   }
+   fRegressionReturnVal->clear();
 
-   //    fRanking->AddRank( Rank( varName, importance ) );
-   // }
+   Event * evT = new Event(*ev);
+   for (size_t i = 0; i < nTargets; ++i) {
+      evT->SetTarget(i, output[i]);
+   }
 
-   return fRanking;
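+   // Transform the predicted targets back to the original scale before
+   // returning them.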
+   const Event* evT2 = GetTransformationHandler().InverseTransform(evT);
+   for (size_t i = 0; i < nTargets; ++i) {
+      fRegressionReturnVal->push_back(evT2->GetTarget(i));
+   }
+   delete evT;
+   return *fRegressionReturnVal;
 }
 
+const std::vector<Float_t> &TMVA::MethodDNN::GetMulticlassValues()
+{
+   Log() << kFATAL << "ERROR: Multiclass classification not yet implemented."
+         << Endl;
+   return *fMulticlassReturnVal;
+}
+//______________________________________________________________________________
+void TMVA::MethodDNN::AddWeightsXMLTo( void* parent ) const 
+{
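+   // Serialize the network: global attributes on the <Weights> node, then one
+   // <Layer> child per layer holding its weight and bias matrices. The
+   // resulting structure is roughly (attribute values are only illustrative):
+   //
+   //   <Weights InputWidth="4" Depth="3" LossFunction="C" OutputFunction="S">
+   //     <Layer ActivationFunction="T">
+   //       <Weights rows="64" cols="4"> ... </Weights>
+   //       <Biases rows="64" cols="1"> ... </Biases>
+   //     </Layer>
+   //     ...
+   //   </Weights>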
+   void* nn = gTools().xmlengine().NewChild(parent, 0, "Weights");
+   Int_t inputWidth = fNet.GetInputWidth();
+   Int_t depth      = fNet.GetDepth();
+   char  lossFunction = static_cast<char>(fNet.GetLossFunction());
+   gTools().xmlengine().NewAttr(nn, 0, "InputWidth",
+                                gTools().StringFromInt(inputWidth));
+   gTools().xmlengine().NewAttr(nn, 0, "Depth", gTools().StringFromInt(depth));
+   gTools().xmlengine().NewAttr(nn, 0, "LossFunction", TString(lossFunction));
+   gTools().xmlengine().NewAttr(nn, 0, "OutputFunction",
+                                TString(static_cast<char>(fOutputFunction)));
+
+   for (Int_t i = 0; i < depth; i++) {
+      const auto& layer = fNet.GetLayer(i);
+      auto layerxml = gTools().xmlengine().NewChild(nn, 0, "Layer");
+      char activationFunction = static_cast<char>(layer.GetActivationFunction());
+      gTools().xmlengine().NewAttr(layerxml, 0, "ActivationFunction",
+                                   TString (activationFunction));
+      WriteMatrixXML(layerxml, "Weights", layer.GetWeights());
+      WriteMatrixXML(layerxml, "Biases",  layer.GetBiases());
+   }
+}
 
+//______________________________________________________________________________
+void TMVA::MethodDNN::ReadWeightsFromXML(void* rootXML)
+{
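+   // Rebuild the network from the XML produced by AddWeightsXMLTo: read the
+   // global attributes first, then add one layer per <Layer> child and load
+   // its weight and bias matrices.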
+   auto netXML = gTools().GetChild(rootXML, "Weights");
+   if (!netXML){
+      netXML = rootXML;
+   }
 
+   fNet.Clear();
+   fNet.SetBatchSize(1);
+
+   size_t inputWidth, depth;
+   gTools().ReadAttr(netXML, "InputWidth", inputWidth);
+   gTools().ReadAttr(netXML, "Depth", depth);
+   char lossFunctionChar;
+   gTools().ReadAttr(netXML, "LossFunction", lossFunctionChar);
+   char outputFunctionChar;
+   gTools().ReadAttr(netXML, "OutputFunction", outputFunctionChar);
+
+   fNet.SetInputWidth(inputWidth);
+   fNet.SetLossFunction(static_cast<ELossFunction>(lossFunctionChar));
+   fOutputFunction = static_cast<EOutputFunction>(outputFunctionChar);
+
+   size_t previousWidth = inputWidth;
+   auto layerXML = gTools().xmlengine().GetChild(netXML, "Layer");
+   for (size_t i = 0; i < depth; i++) {
+      TString fString;
+      EActivationFunction f;
+
+      // Read activation function.
+      gTools().ReadAttr(layerXML, "ActivationFunction", fString);
+      f = static_cast<EActivationFunction>(fString(0));
+
+      // Read number of neurons.
+      size_t width;
+      auto matrixXML = gTools().GetChild(layerXML, "Weights");
+      gTools().ReadAttr(matrixXML, "rows", width);
+
+      fNet.AddLayer(width, f);
+      TMatrixT<Double_t> weights(width, previousWidth);
+      TMatrixT<Double_t> biases(width, 1);
+      ReadMatrixXML(layerXML, "Weights", weights);
+      ReadMatrixXML(layerXML, "Biases",  biases);
+      fNet.GetLayer(i).GetWeights() = weights;
+      fNet.GetLayer(i).GetBiases()  = biases;
+
+      layerXML = gTools().GetNextChild(layerXML);
+      previousWidth = width;
+   }
+}
 
+//______________________________________________________________________________
+void TMVA::MethodDNN::ReadWeightsFromStream( std::istream & /*istr*/)
+{
+}
 
+//______________________________________________________________________________
+const TMVA::Ranking* TMVA::MethodDNN::CreateRanking()
+{
+   fRanking = new Ranking( GetName(), "Importance" );
+   for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
+      fRanking->AddRank( Rank( GetInputLabel(ivar), 1.0));
+   }
+   return fRanking;
+}
 
-//_______________________________________________________________________
-void TMVA::MethodDNN::MakeClassSpecific( std::ostream& /*fout*/, const TString& /*className*/ ) const
+//______________________________________________________________________________
+void TMVA::MethodDNN::MakeClassSpecific( std::ostream& /*fout*/,
+                                         const TString& /*className*/ ) const
 {
-   // write specific classifier response
-   //   MethodADNNBase::MakeClassSpecific(fout, className);
 }
 
-//_______________________________________________________________________
+//______________________________________________________________________________
 void TMVA::MethodDNN::GetHelpMessage() const
 {
    // get help message text
@@ -1026,10 +1218,10 @@ reduction of overfitting: \n \
              Random values scaled by the layer size \n \
  \n \
          \"TrainingStrategy\" \n \
-           - example: \"LearningRate=1e-1,Momentum=0.3,Repetitions=3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Renormalize=L2,DropConfig=0.0,DropRepetitions=5|LearningRate=1e-4,Momentum=0.3,Repetitions=3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Renormalize=L2,DropFraction=0.0,DropRepetitions=5\" \n \
+           - example: \"LearningRate=1e-1,Momentum=0.3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Renormalize=L2,DropConfig=0.0,DropRepetitions=5|LearningRate=1e-4,Momentum=0.3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Renormalize=L2,DropFraction=0.0,DropRepetitions=5\" \n \
            - explanation: two stacked training settings separated by \"|\" \n \
-             . first training setting: \"LearningRate=1e-1,Momentum=0.3,Repetitions=3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Renormalize=L2,DropConfig=0.0,DropRepetitions=5\" \n \
-             . second training setting : \"LearningRate=1e-4,Momentum=0.3,Repetitions=3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Renormalize=L2,DropFractions=0.0,DropRepetitions=5\" \n \
+             . first training setting: \"LearningRate=1e-1,Momentum=0.3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Renormalize=L2,DropConfig=0.0,DropRepetitions=5\" \n \
+             . second training setting : \"LearningRate=1e-4,Momentum=0.3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Renormalize=L2,DropFractions=0.0,DropRepetitions=5\" \n \
              . LearningRate :  \n \
                - recommended for classification: 0.1 initially, 1e-4 later \n \
                - recommended for regression: 1e-4 and less \n \
@@ -1064,135 +1256,7 @@ reduction of overfitting: \n \
              . Multithreading \n \
                turn on multithreading [recommended: True] \n \
                \n";
-          
    Log () << txt << Endl;
-   
-}
-
-
-
-//_______________________________________________________________________
-void  TMVA::MethodDNN::WriteMonitoringHistosToFile( void ) const
-{
-   // write histograms and PDFs to file for monitoring purposes
-
-   Log() << kINFO << "Write monitoring histograms to file: " << BaseDir()->GetPath() << Endl;
-   BaseDir()->cd();
-}
-
-
-
-
-void TMVA::MethodDNN::checkGradients ()
-{
-   size_t inputSize = 1;
-   size_t outputSize = 1;
-
-   fNet.clear ();
-
-   fNet.setInputSize (inputSize);
-   fNet.setOutputSize (outputSize);
-   fNet.addLayer (DNN::Layer (100, DNN::EnumFunction::SOFTSIGN)); 
-   fNet.addLayer (DNN::Layer (30, DNN::EnumFunction::SOFTSIGN)); 
-   fNet.addLayer (DNN::Layer (outputSize, DNN::EnumFunction::LINEAR, DNN::ModeOutputValues::SIGMOID)); 
-   fNet.setErrorFunction (DNN::ModeErrorFunction::CROSSENTROPY);
-   //    net.setErrorFunction (ModeErrorFunction::SUMOFSQUARES);
-
-   size_t numWeights = fNet.numWeights (inputSize);
-   std::vector<double> weights (numWeights);
-   //weights.at (0) = 1000213.2;
-
-   std::vector<Pattern> pattern;
-   for (size_t iPat = 0, iPatEnd = 10; iPat < iPatEnd; ++iPat)
-      {
-         std::vector<double> input;
-         std::vector<double> output;
-         for (size_t i = 0; i < inputSize; ++i)
-            {
-               input.push_back (TMVA::DNN::gaussDouble (0.1, 4));
-            }
-         for (size_t i = 0; i < outputSize; ++i)
-            {
-               output.push_back (TMVA::DNN::gaussDouble (0, 3));
-            }
-         pattern.push_back (Pattern (input,output));
-      }
-
-
-   DNN::Settings settings (TString ("checkGradients"), /*_convergenceSteps*/ 15, /*_batchSize*/ 1, /*_testRepetitions*/ 7, /*_factorWeightDecay*/ 0, /*regularization*/ TMVA::DNN::EnumRegularization::NONE);
-
-   size_t improvements = 0;
-   size_t worsenings = 0;
-   size_t smallDifferences = 0;
-   size_t largeDifferences = 0;
-   for (size_t iTest = 0; iTest < 1000; ++iTest)
-      {
-         TMVA::DNN::uniformDouble (weights, 0.7);
-         std::vector<double> gradients (numWeights, 0);
-         DNN::Batch batch (begin (pattern), end (pattern));
-         DNN::DropContainer dropContainer;
-         std::tuple<DNN::Settings&, DNN::Batch&, DNN::DropContainer&> settingsAndBatch (settings, batch, dropContainer);
-         double E = fNet (settingsAndBatch, weights, gradients);
-         std::vector<double> changedWeights;
-         changedWeights.assign (weights.begin (), weights.end ());
-
-         int changeWeightPosition = TMVA::DNN::randomInt (numWeights);
-         double dEdw = gradients.at (changeWeightPosition);
-         while (dEdw == 0.0)
-            {
-               changeWeightPosition = TMVA::DNN::randomInt (numWeights);
-               dEdw = gradients.at (changeWeightPosition);
-            }
-
-         const double gamma = 0.01;
-         double delta = gamma*dEdw;
-         changedWeights.at (changeWeightPosition) += delta;
-         if (dEdw == 0.0)
-            {
-               std::cout << "dEdw == 0.0 ";
-               continue;
-            }
-        
-         assert (dEdw != 0.0);
-         double Echanged = fNet (settingsAndBatch, changedWeights);
-
-         //       double difference = fabs((E-Echanged) - delta*dEdw);
-         double difference = fabs ((E+delta - Echanged)/E);
-         bool direction = (E-Echanged)>0 ? true : false;
-         //       bool directionGrad = delta>0 ? true : false;
-         bool isOk = difference < 0.3 && difference != 0;
-
-         if (direction)
-            ++improvements;
-         else
-            ++worsenings;
-
-         if (isOk)
-            ++smallDifferences;
-         else
-            ++largeDifferences;
-
-         if (true || !isOk)
-            {
-               if (!direction)
-                  std::cout << "=================" << std::endl;
-               std::cout << "E = " << E << " Echanged = " << Echanged << " delta = " << delta << "   pos=" << changeWeightPosition << "   dEdw=" << dEdw << "  difference= " << difference << "  dirE= " << direction << std::endl;
-            }
-         if (isOk)
-            {
-            }
-         else
-            {
-               //            for_each (begin (weights), end (weights), [](double w){ std::cout << w << ", "; });
-               //            std::cout << std::endl;
-               //            assert (isOk);
-            }
-      }
-   std::cout << "improvements = " << improvements << std::endl;
-   std::cout << "worsenings = " << worsenings << std::endl;
-   std::cout << "smallDifferences = " << smallDifferences << std::endl;
-   std::cout << "largeDifferences = " << largeDifferences << std::endl;
-
-   std::cout << "check gradients done" << std::endl;
 }
 
+} // namespace TMVA
diff --git a/tmva/tmva/test/CMakeLists.txt b/tmva/tmva/test/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fc2a3e633d359cd76dc2b19d8e8db6a66827428a
--- /dev/null
+++ b/tmva/tmva/test/CMakeLists.txt
@@ -0,0 +1,9 @@
+############################################################################
+# CMakeLists.txt file for building ROOT TMVA tests.
+# @author Simon Pfreundschuh
+############################################################################
+
+project(tmva-tests)
+find_package(ROOT REQUIRED)
+
+ROOT_ADD_TEST_SUBDIRECTORY(DNN)
diff --git a/tmva/tmva/test/DNN/CMakeLists.txt b/tmva/tmva/test/DNN/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4d3c36c65fbec452f23479450e0e4697335d6b5a
--- /dev/null
+++ b/tmva/tmva/test/DNN/CMakeLists.txt
@@ -0,0 +1,92 @@
+############################################################################
+# CMakeLists.txt file for building TMVA/DNN tests.
+# @author Simon Pfreundschuh
+############################################################################
+
+project(tmva-tests)
+find_package(ROOT REQUIRED)
+
+set(Libraries Core MathCore Matrix TMVA)
+include_directories(${ROOT_INCLUDE_DIRS})
+
+#--- CUDA tests. ---------------------------
+find_package(CUDA)
+if (CUDA_FOUND)
+
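+  # Libraries shared by all CUDA tests: dnn_cuda is assumed to be the CUDA
+  # backend library target provided by the TMVA build; the cuBLAS libraries
+  # are located by FindCUDA.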
+  SET(DNN_CUDA_LIBRARIES dnn_cuda ${CUDA_CUBLAS_LIBRARIES})
+
+  # DNN - Activation Functions Cuda
+  CUDA_ADD_EXECUTABLE(testActivationFunctionsCuda TestActivationFunctionsCuda.cxx)
+  TARGET_LINK_LIBRARIES(testActivationFunctionsCuda ${Libraries} ${DNN_CUDA_LIBRARIES})
+  ROOT_ADD_TEST(TMVA-DNN-ActivationFunctionsCuda COMMAND testActivationFunctionsCuda)
+
+  # DNN - Loss Functions Cuda
+  CUDA_ADD_EXECUTABLE(testLossFunctionsCuda TestLossFunctionsCuda.cxx)
+  TARGET_LINK_LIBRARIES(testLossFunctionsCuda ${Libraries} ${DNN_CUDA_LIBRARIES})
+  ROOT_ADD_TEST(TMVA-DNN-LossFunctionsCuda COMMAND testLossFunctionsCuda)
+
+  # DNN - Derivatives Cuda
+  CUDA_ADD_EXECUTABLE(testDerivativesCuda TestDerivativesCuda.cxx)
+  TARGET_LINK_LIBRARIES(testDerivativesCuda ${Libraries} ${DNN_CUDA_LIBRARIES})
+  ROOT_ADD_TEST(TMVA-DNN-DerivativesCuda COMMAND testDerivativesCuda)
+
+  # DNN - Backpropagation Cuda
+  CUDA_ADD_EXECUTABLE(testBackpropagationCuda TestBackpropagationCuda.cxx)
+  TARGET_LINK_LIBRARIES(testBackpropagationCuda ${Libraries} ${DNN_CUDA_LIBRARIES})
+  ROOT_ADD_TEST(TMVA-DNN-BackpropagationCuda COMMAND testBackpropagationCuda)
+
+  # DNN - Minimization Cuda
+  CUDA_ADD_EXECUTABLE(testMinimizationCuda TestMinimizationCuda.cxx)
+  TARGET_LINK_LIBRARIES(testMinimizationCuda ${Libraries} ${DNN_CUDA_LIBRARIES})
+  ROOT_ADD_TEST(TMVA-DNN-MinimizationCuda COMMAND testMinimizationCuda)
+
+  # DNN - Arithmetic Cuda
+  CUDA_ADD_EXECUTABLE(testArithmeticCuda TestMatrixArithmeticCuda.cxx)
+  TARGET_LINK_LIBRARIES(testArithmeticCuda ${Libraries} ${DNN_CUDA_LIBRARIES})
+  ROOT_ADD_TEST(TMVA-DNN-ArithmeticCuda COMMAND testArithmeticCuda)
+
+  # DNN - DataLoader Cuda
+  CUDA_ADD_EXECUTABLE(testDataLoaderCuda TestDataLoaderCuda.cxx)
+  TARGET_LINK_LIBRARIES(testDataLoaderCuda ${Libraries} ${DNN_CUDA_LIBRARIES})
+endif (CUDA_FOUND)
+
+#--- CPU tests. ----------------------------
+find_package(BLAS)
+if (BLAS_FOUND AND imt)
+
+  # DNN - Arithmetic Functions CPU
+  ROOT_EXECUTABLE(testArithmeticCpu TestMatrixArithmeticCpu.cxx
+    LIBRARIES ${Libraries})
+  ROOT_ADD_TEST(TMVA-DNN-Arithmetic-Cpu COMMAND testArithmeticCpu)
+
+  # DNN - Activation Functions CPU
+  ROOT_EXECUTABLE(testActivationFunctionsCpu TestActivationFunctionsCpu.cxx
+    LIBRARIES ${Libraries})
+  ROOT_ADD_TEST(TMVA-DNN-Activation-Functions-Cpu COMMAND testActivationFunctionsCpu)
+
+  # DNN - Loss Functions CPU
+  ROOT_EXECUTABLE(testLossFunctionsCpu TestLossFunctionsCpu.cxx
+    LIBRARIES ${Libraries})
+  ROOT_ADD_TEST(TMVA-DNN-Loss-Functions-Cpu COMMAND testLossFunctionsCpu)
+
+  # DNN - Derivatives CPU
+  ROOT_EXECUTABLE(testDerivativesCpu TestDerivativesCpu.cxx
+    LIBRARIES ${Libraries})
+  ROOT_ADD_TEST(TMVA-DNN-Derivatives-Cpu COMMAND testDerivativesCpu)
+
+  # DNN - Backpropagation CPU
+  ROOT_EXECUTABLE(testBackpropagationCpu TestBackpropagationCpu.cxx
+    LIBRARIES ${Libraries})
+  ROOT_ADD_TEST(TMVA-DNN-Backpropagation-Cpu COMMAND testBackpropagationCpu)
+
+  # DNN - DataLoader CPU
+  ROOT_EXECUTABLE(testDataLoaderCpu TestDataLoaderCpu.cxx
+    LIBRARIES ${Libraries})
+  ROOT_ADD_TEST(TMVA-DNN-Data-Loader-Cpu COMMAND testDataLoaderCpu)
+
+  # DNN - Minimization CPU
+  ROOT_EXECUTABLE(testMinimizationCpu TestMinimizationCpu.cxx
+    LIBRARIES ${Libraries} ${BLAS_openblas_LIBRARY} tbb)
+  ROOT_ADD_TEST(TMVA-DNN-Minimization-Cpu COMMAND testMinimizationCpu)
+
+endif (BLAS_FOUND AND imt)
diff --git a/tmva/tmva/test/DNN/TestActivationFunctions.cxx b/tmva/tmva/test/DNN/TestActivationFunctions.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..aed3980a7ed10c9a32faa9f642b3ae02b37f58e2
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestActivationFunctions.cxx
@@ -0,0 +1,128 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+//  Concrete instantiation of the generic activation function test  //
+//  for the reference architecture.                                 //
+//////////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TestActivationFunctions.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+    using Scalar_t = Double_t;
+    std::cout << "Testing Activation Functions:" << std::endl;
+
+    Scalar_t error;
+
+    // Identity.
+
+    error = testIdentity<TReference<Scalar_t>>(10);
+    std::cout << "Testing identity activation:               ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testIdentityDerivative<TReference<Scalar_t>>(10);
+    std::cout << "Testing identity activation derivative:    ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    // ReLU.
+
+    error = testRelu<TReference<Scalar_t>>(10);
+    std::cout << "Testing ReLU activation:                   ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testReluDerivative<TReference<Scalar_t>>(10);
+    std::cout << "Testing ReLU activation derivative:        ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    // Sigmoid.
+
+    error = testSigmoid<TReference<Scalar_t>>(10);
+    std::cout << "Testing Sigmoid activation:                ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testSigmoidDerivative<TReference<Scalar_t>>(10);
+    std::cout << "Testing Sigmoid activation derivative:     ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    // TanH.
+
+    error = testTanh<TReference<Scalar_t>>(10);
+    std::cout << "Testing TanH activation:                   ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testTanhDerivative<TReference<Scalar_t>>(10);
+    std::cout << "Testing TanH activation derivative:        ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    // Symmetric ReLU.
+
+    error = testSymmetricRelu<TReference<Scalar_t>>(10);
+    std::cout << "Testing Symm. ReLU activation:             ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testSymmetricReluDerivative<TReference<Scalar_t>>(10);
+    std::cout << "Testing Symm. ReLU activation derivative:  ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    // Soft Sign.
+
+    error = testSoftSign<TReference<Scalar_t>>(10);
+    std::cout << "Testing Soft Sign activation:              ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testSoftSignDerivative<TReference<Scalar_t>>(10);
+    std::cout << "Testing Soft Sign activation derivative:   ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    // Gauss.
+
+    error = testGauss<TReference<Scalar_t>>(10);
+    std::cout << "Testing Gauss activation:                  ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testGaussDerivative<TReference<Scalar_t>>(10);
+    std::cout << "Testing Gauss activation derivative:       ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestActivationFunctions.h b/tmva/tmva/test/DNN/TestActivationFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..c3f479eb706a7c56a7ff0eca7180744c4a0ddfa8
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestActivationFunctions.h
@@ -0,0 +1,457 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Generic tests of the layer activation functions                  //
+//                                                                  //
+// Contains tests for each of the layer activation functions that   //
+// test the evaluation of the function using the evaluate(...)      //
+// method and the computation of the derivatives using              //
+// evaluateDerivative(...) on a randomly generated matrix. Each     //
+// function returns the maximum relative error between the expected //
+// result and the result obtained for the given architecture.       //
+//////////////////////////////////////////////////////////////////////
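+//
+// A typical invocation, as used by the concrete test programs, looks like
+//
+//    auto error = testRelu<TReference<Double_t>>(10);
+//    if (error > 1e-10) { /* report failure */ }
+//
+// where the architecture type is expected to provide the Matrix_t and
+// Scalar_t typedefs used below.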
+
+#ifndef TMVA_TEST_DNN_TEST_ACTIVATION_FUNCTIONS
+#define TMVA_TEST_DNN_TEST_ACTIVATION_FUNCTIONS
+
+#include "TMatrixT.h"
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TMVA/DNN/Functions.h"
+#include "TMVA/DNN/Net.h"
+#include "Utility.h"
+
+using namespace TMVA::DNN;
+
+//______________________________________________________________________________
+//
+//  Identity Activation Function
+//______________________________________________________________________________
+
+/*! Test application of identity function to matrix. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testIdentity(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef);
+
+      evaluate<Architecture>(AArch, EActivationFunction::kIdentity);
+
+      TMatrixT<Double_t> A = AArch;
+      Double_t error = maximumRelativeError(A, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+/*! Test computation of the first derivative of the identity function. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testIdentityDerivative(size_t ntests)
+    -> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n), BRef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef), BArch(BRef);
+
+      evaluateDerivative<Architecture>(BArch, EActivationFunction::kIdentity, AArch);
+      evaluateDerivative<TReference<Double_t>>(BRef, EActivationFunction::kIdentity,
+                                               ARef);
+
+      TMatrixT<Double_t> B = BArch;
+      Double_t error = maximumRelativeError(B, BRef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+//______________________________________________________________________________
+//
+//  ReLU Activation Function
+//______________________________________________________________________________
+
+/*! Test application of ReLU function to matrix. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testRelu(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef);
+
+      evaluate<Architecture>(AArch, EActivationFunction::kRelu);
+      applyMatrix(ARef, [](double x){return x < 0.0 ? 0.0 : x;});
+
+      TMatrixT<Double_t> A = AArch;
+      Double_t error = maximumRelativeError(A, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+/*! Test computation of the first derivative of the ReLU function. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testReluDerivative(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n), BRef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef), BArch(BRef);
+
+      evaluateDerivative<Architecture>(BArch, EActivationFunction::kRelu, AArch);
+      applyMatrix(ARef, [](double x){return x > 0.0 ? 1.0 : 0.0;});
+
+      TMatrixT<Double_t> B = BArch;
+      Double_t error = maximumRelativeError(B, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+//______________________________________________________________________________
+//
+//  Sigmoid Activation Function
+//______________________________________________________________________________
+
+/*! Test application of Sigmoid function to matrix. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testSigmoid(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef);
+
+      evaluate<Architecture>(AArch, EActivationFunction::kSigmoid);
+      applyMatrix(ARef, [](double x){return 1.0 / (1.0 + std::exp(-x));});
+
+      TMatrixT<Double_t> A = AArch;
+      Double_t error = maximumRelativeError(A, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+/*! Test computation of the first derivative of the sigmoid function. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testSigmoidDerivative(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n), BRef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef), BArch(BRef);
+
+      evaluateDerivative<Architecture>(BArch, EActivationFunction::kSigmoid, AArch);
+      applyMatrix(ARef, [](Double_t x){
+             Double_t sig = 1.0 / (1.0 + std::exp(-x));
+             return sig * (1.0 - sig);
+          });
+
+      TMatrixT<Double_t> B = BArch;
+      Double_t error = maximumRelativeError(B, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+//______________________________________________________________________________
+//
+//  Tanh Activation Function
+//______________________________________________________________________________
+
+/*! Test application of tanh function to matrix. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testTanh(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef);
+
+      evaluate<Architecture>(AArch, EActivationFunction::kTanh);
+      applyMatrix(ARef, [](double x){return tanh(x);});
+
+      TMatrixT<Double_t> A = AArch;
+      Double_t error = maximumRelativeError(A, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+/*! Test computation of the first derivative of the tanh function. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testTanhDerivative(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n), BRef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef), BArch(BRef);
+
+      evaluateDerivative<Architecture>(BArch, EActivationFunction::kTanh, AArch);
+      applyMatrix(ARef, [](Double_t x){
+             Double_t t = tanh(x);
+             return 1 - t * t;
+          });
+
+      TMatrixT<Double_t> B = BArch;
+      Double_t error = maximumRelativeError(B, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+//______________________________________________________________________________
+//
+//  Symmetric ReLU Activation Function
+//______________________________________________________________________________
+
+/*! Test application of symmetric ReLU function to matrix. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testSymmetricRelu(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef);
+
+      evaluate<Architecture>(AArch, EActivationFunction::kSymmRelu);
+      applyMatrix(ARef, [](double x){return fabs(x);});
+
+      TMatrixT<Double_t> A = AArch;
+      Double_t error = maximumRelativeError(A, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+/*! Test computation of the first derivative of the symmetric ReLU function. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testSymmetricReluDerivative(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n), BRef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef), BArch(BRef);
+
+      evaluateDerivative<Architecture>(BArch, EActivationFunction::kSymmRelu, AArch);
+      applyMatrix(ARef, [](Double_t x){
+             return (x < 0) ? -1.0 : 1.0;
+          });
+
+      TMatrixT<Double_t> B = BArch;
+      Double_t error = maximumRelativeError(B, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+//______________________________________________________________________________
+//
+//  Soft Sign Activation Function
+//______________________________________________________________________________
+
+/*! Test application of symmetric soft sign function to matrix. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testSoftSign(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef);
+
+      evaluate<Architecture>(AArch, EActivationFunction::kSoftSign);
+      applyMatrix(ARef, [](double x){return x / (1 + fabs(x));});
+
+      TMatrixT<Double_t> A = AArch;
+      Double_t error = maximumRelativeError(A, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+/*! Test computation of the first derivative of the soft sign function. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testSoftSignDerivative(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n), BRef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef), BArch(BRef);
+
+      evaluateDerivative<Architecture>(BArch, EActivationFunction::kSoftSign, AArch);
+      applyMatrix(ARef, [](Double_t x){
+             Double_t y = 1 + fabs(x);
+             return 1.0 / (y * y);
+          });
+
+      TMatrixT<Double_t> B = BArch;
+      Double_t error = maximumRelativeError(B, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+//______________________________________________________________________________
+//
+//  Gauss Activation Functions
+//______________________________________________________________________________
+
+/*! Test application of Gauss activation function to matrix. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testGauss(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef);
+
+      evaluate<Architecture>(AArch, EActivationFunction::kGauss);
+      applyMatrix(ARef, [](double x){return exp(- x * x);});
+
+      TMatrixT<Double_t> A = AArch;
+      Double_t error = maximumRelativeError(A, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+/*! Test computation of the first derivative of the Gauss activation function. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testGaussDerivative(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m, n), BRef(m, n);
+      randomMatrix(ARef);
+      Matrix_t AArch(ARef), BArch(BRef);
+
+      evaluateDerivative<Architecture>(BArch, EActivationFunction::kGauss, AArch);
+      applyMatrix(ARef, [](Double_t x){return -2.0 * x * exp(- x * x);});
+
+      TMatrixT<Double_t> B = BArch;
+      Double_t error = maximumRelativeError(B, ARef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+#endif
diff --git a/tmva/tmva/test/DNN/TestActivationFunctionsCpu.cxx b/tmva/tmva/test/DNN/TestActivationFunctionsCpu.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..998f8bc747d46f5a4e7dd67395f08bc3dec0079e
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestActivationFunctionsCpu.cxx
@@ -0,0 +1,131 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+//  Concrete instantiation of the generic activation function test  //
+//  for the multi-threaded CPU implementation.                      //
+//////////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Cpu.h"
+#include "Utility.h"
+#include "TestActivationFunctions.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+   using Scalar_t = Double_t;
+
+   std::cout << "Testing Activation Functions:" << std::endl;
+
+   double error;
+
+   // Identity.
+
+   error = testIdentity<TCpu<Scalar_t>>(10);
+   std::cout << "Testing identity activation:            ";
+   std::cout << "maximum relative error = " << error << std::endl;
+   if (error > 1e-10)
+       return 1;
+
+   error = testIdentityDerivative<TCpu<Scalar_t>>(10);
+   std::cout << "Testing identity activation derivative: ";
+   std::cout << "maximum relative error = " << error << std::endl;
+   if (error > 1e-10)
+       return 1;
+
+   // ReLU.
+
+   error = testRelu<TCpu<Scalar_t>>(10);
+   std::cout << "Testing ReLU activation:                ";
+   std::cout << "maximum relative error = " << error << std::endl;
+   if (error > 1e-10)
+       return 1;
+
+   error = testReluDerivative<TCpu<Scalar_t>>(10);
+   std::cout << "Testing ReLU activation derivative:     ";
+   std::cout << "maximum relative error = " << error << std::endl;
+   if (error > 1e-10)
+       return 1;
+
+   // Sigmoid.
+
+   error = testSigmoid<TCpu<Scalar_t>>(10);
+   std::cout << "Testing Sigmoid activation:             ";
+   std::cout << "maximum relative error = " << error << std::endl;
+   if (error > 1e-10)
+       return 1;
+
+   error = testSigmoidDerivative<TCpu<Scalar_t>>(10);
+   std::cout << "Testing Sigmoid activation derivative:  ";
+   std::cout << "maximum relative error = " << error << std::endl;
+   if (error > 1e-10)
+       return 1;
+
+   // TanH.
+
+   error = testTanh<TCpu<Scalar_t>>(10);
+   std::cout << "Testing TanH activation:                   ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-10)
+       return 1;
+
+   error = testTanhDerivative<TCpu<Scalar_t>>(10);
+   std::cout << "Testing TanH activation derivative:        ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-10)
+       return 1;
+
+   // Symmetric ReLU.
+
+   error = testSymmetricRelu<TCpu<Scalar_t>>(10);
+   std::cout << "Testing Symm. ReLU activation:             ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-10)
+       return 1;
+
+   error = testSymmetricReluDerivative<TCpu<Scalar_t>>(10);
+   std::cout << "Testing Symm. ReLU activation derivative:  ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-10)
+       return 1;
+
+   // Soft Sign.
+
+   error = testSoftSign<TCpu<Scalar_t>>(10);
+   std::cout << "Testing Soft Sign activation:              ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-10)
+       return 1;
+
+   error = testSoftSignDerivative<TCpu<Scalar_t>>(10);
+   std::cout << "Testing Soft Sign activation derivative:   ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-10)
+       return 1;
+
+   // Gauss.
+
+   error = testGauss<TCpu<Scalar_t>>(10);
+   std::cout << "Testing Gauss activation:                  ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-10)
+       return 1;
+
+   error = testGaussDerivative<TCpu<Scalar_t>>(10);
+   std::cout << "Testing Gauss activation derivative:       ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-10)
+       return 1;
+
+   return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestActivationFunctionsCuda.cxx b/tmva/tmva/test/DNN/TestActivationFunctionsCuda.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..adcb765098cb50a2c065ed17016abc24ed4df18c
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestActivationFunctionsCuda.cxx
@@ -0,0 +1,74 @@
+// @(#)root/tmva/tmva/test/dnn $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Concrete instantiation of the generic activation function test   //
+// for the TCuda implementation.                                    //
+//////////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "Utility.h"
+#include "TestActivationFunctions.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+    using Scalar_t = Double_t;
+
+    std::cout << "Testing Activation Functions:" << std::endl;
+
+    double error;
+
+    // Identity.
+
+    error = testIdentity<TCuda<Scalar_t>>(10);
+    std::cout << "Testing identity activation:            ";
+    std::cout << "maximum relative error = " << error << std::endl;
+    if (error > 1e-5)
+        return 1;
+
+    error = testIdentityDerivative<TCuda<Scalar_t>>(10);
+    std::cout << "Testing identity activation derivative: ";
+    std::cout << "maximum relative error = " << error << std::endl;
+    if (error > 1e-5)
+        return 1;
+
+    // ReLU.
+
+    error = testRelu<TCuda<Scalar_t>>(10);
+    std::cout << "Testing ReLU activation:                ";
+    std::cout << "maximum relative error = " << error << std::endl;
+    if (error > 1e-5)
+        return 1;
+
+    error = testReluDerivative<TCuda<Scalar_t>>(10);
+    std::cout << "Testing ReLU activation derivative:     ";
+    std::cout << "maximum relative error = " << error << std::endl;
+    if (error > 1e-5)
+        return 1;
+
+    // Sigmoid.
+
+    error = testSigmoid<TCuda<Scalar_t>>(10);
+    std::cout << "Testing Sigmoid activation:             ";
+    std::cout << "maximum relative error = " << error << std::endl;
+    if (error > 1e-5)
+        return 1;
+
+    error = testSigmoidDerivative<TCuda<Scalar_t>>(10);
+    std::cout << "Testing Sigmoid activation derivative:  ";
+    std::cout << "maximum relative error = " << error << std::endl;
+    if (error > 1e-5)
+        return 1;
+    return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestBackpropagation.cxx b/tmva/tmva/test/DNN/TestBackpropagation.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..391aae7e3cd23a5ff17903c1d96b4f8301afc257
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestBackpropagation.cxx
@@ -0,0 +1,50 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////////
+// Concrete instantiation of the generic backpropagation test for //
+// the reference architecture.                                    //
+////////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TestBackpropagation.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+    std::cout << "Testing Backpropagation:" << std::endl;
+
+    double error;
+
+    //
+    // Test backpropagation for linear net.
+    //
+
+    error = testBackpropagationWeightsLinear<TReference<double>>(1.0);
+    if (error > 1e-3)
+        return 1;
+
+    error = testBackpropagationL1Regularization<TReference<double>>(1e-2);
+    if (error > 1e-3)
+        return 1;
+
+    error = testBackpropagationL2Regularization<TReference<double>>(1.0);
+    if (error > 1e-3)
+        return 1;
+
+    error = testBackpropagationBiasesLinear<TReference<double>>(1.0);
+    if (error > 1e-3)
+        return 1;
+
+    return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestBackpropagation.h b/tmva/tmva/test/DNN/TestBackpropagation.h
new file mode 100644
index 0000000000000000000000000000000000000000..0988764ee21f781b8dd37b364c2d25d61cb1cc46
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestBackpropagation.h
@@ -0,0 +1,361 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////////
+// Generic tests of the backpropagation algorithm.                //
+//                                                                //
+// All tests randomly generate a net with identity activation     //
+// functions, i.e. a net that is completely linear, and then test //
+// the computed gradients for each layer using numerical          //
+// differentiation. The restriction to linear nets avoids the     //
+// required division by the finite difference interval used to    //
+// approximate the numerical derivatives, which would otherwise   //
+// cause precision loss.                                          //
+////////////////////////////////////////////////////////////////////
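+//
+// For each selected weight w the analytic gradient dL/dw obtained from
+// Backward() is compared against the symmetric finite difference
+//
+//    (L(w + dx) - L(w - dx)) / (2 * dx),
+//
+// which is what finiteDifference(f, dx) / (2.0 * dx) in the tests below is
+// assumed to evaluate; the helper comes from the test Utility.h header.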
+
+#include <iostream>
+#include "TMVA/DNN/Functions.h"
+#include "TMVA/DNN/Net.h"
+#include "Utility.h"
+
+using namespace TMVA::DNN;
+
+/*! Compute the loss of the net as a function of the weight at index (i,j) in
+ *  layer l. dx is added as an offset to the current value of the weight. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto evaluate_net_weight(TNet<Architecture> &net,
+                               typename Architecture::Matrix_t &X,
+                         const typename Architecture::Matrix_t &Y,
+                         size_t l,
+                         size_t i,
+                         size_t j,
+                         typename Architecture::Scalar_t dx)
+    -> typename Architecture::Scalar_t
+{
+    using Scalar_t = typename Architecture::Scalar_t;
+
+    net.GetLayer(l).GetWeights().operator()(i,j) += dx;
+    Scalar_t res = net.Loss(X, Y);
+    net.GetLayer(l).GetWeights().operator()(i,j) -= dx;
+    return res;
+}
+
+/*! Compute the loss of the net as a function of the bias at index i in
+ *  layer l. dx is added as an offset to the current value of the bias. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto evaluate_net_bias(TNet<Architecture> &net,
+                               typename Architecture::Matrix_t &X,
+                         const typename Architecture::Matrix_t &Y,
+                         size_t l,
+                         size_t i,
+                         typename Architecture::Scalar_t dx)
+    -> typename Architecture::Scalar_t
+{
+    using Scalar_t = typename Architecture::Scalar_t;
+
+    net.GetLayer(l).GetBiases().operator()(i,0) += dx;
+    Scalar_t res = net.Loss(X, Y);
+    net.GetLayer(l).GetBiases().operator()(i,0) -= dx;
+    return res;
+}
+
+/*! Generate a random net, perform forward and backward propagation and check
+ *  the weight gradients using numerical differentiation. Returns the maximum
+ *  relative gradient error and also prints it to stdout. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testBackpropagationWeightsLinear(typename Architecture::Scalar_t dx)
+-> typename Architecture::Scalar_t
+{
+    using Scalar_t = typename Architecture::Scalar_t;
+    using Matrix_t = typename Architecture::Matrix_t;
+    using Net_t    = TNet<Architecture>;
+
+
+    Net_t net(50, 50, ELossFunction::kMeanSquaredError);
+
+    // Random net.
+    constructRandomLinearNet(net);
+    net.Initialize(EInitialization::kGauss);
+
+    // Random training data.
+    Matrix_t X(50, 50);
+    randomBatch(X);
+
+    Matrix_t Y(50, net.GetOutputWidth());
+    randomMatrix(Y);
+
+    net.Forward(X);
+    net.Backward(X,Y);
+
+    Scalar_t maximum_error = 0.0;
+
+    // Compute derivatives for all weights using finite differences and
+    // compare to result obtained from backpropagation.
+    for (size_t l = 0; l < net.GetDepth(); l++)
+    {
+        std::cout << "\rTesting weight gradients:      layer: "
+                  << l << " / " << net.GetDepth();
+        std::cout << std::flush;
+        auto & layer = net.GetLayer(l);
+        auto & W     = layer.GetWeightGradients();
+
+        for (size_t i = 0; i < layer.GetWidth(); i++)
+        {
+            for (size_t j = 0; j < layer.GetInputWidth(); j++)
+            {
+                auto f = [& net, & X, &Y, l, i, j](Scalar_t x)
+                    {
+                        return evaluate_net_weight(net, X, Y, l, i, j, x);
+                    };
+                Scalar_t dy     = finiteDifference(f, dx) / (2.0 * dx);
+                Scalar_t dy_ref = W(i,j);
+
+                // Compute the relative error if dy != 0.
+                Scalar_t error;
+                if (std::fabs(dy_ref) > 1e-15)
+                {
+                    error = std::fabs((dy - dy_ref) / dy_ref);
+                }
+                else
+                {
+                    error = std::fabs(dy - dy_ref);
+                }
+
+                maximum_error = std::max(error, maximum_error);
+            }
+        }
+    }
+
+    std::cout << "\rTesting weight gradients:      ";
+    std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl;
+    return maximum_error;
+}
+
+/*! Generate a random, linear net, perform forward and backward propagation with
+ *  L1 regularization and check the weight gradients using numerical
+ *  differentiation. Returns the maximum relative gradient error and
+ *  also prints it to stdout. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testBackpropagationL1Regularization(typename Architecture::Scalar_t dx)
+-> typename Architecture::Scalar_t
+{
+   using Scalar_t = typename Architecture::Scalar_t;
+   using Matrix_t = typename Architecture::Matrix_t;
+   using Net_t    = TNet<Architecture>;
+
+   Net_t net(50, 50, ELossFunction::kMeanSquaredError, ERegularization::kL1, 0.1);
+
+   // Random net.
+   constructRandomLinearNet(net);
+   net.Initialize(EInitialization::kGauss);
+
+   // Random training data.
+   Matrix_t X(50, 50);
+   randomBatch(X);
+
+   Matrix_t Y(50, net.GetOutputWidth());
+   randomMatrix(Y);
+
+   net.Forward(X);
+   net.Backward(X,Y);
+
+   Scalar_t maximum_error = 0.0;
+
+   // Compute derivatives for all weights using finite differences and
+   // compare to result obtained from backpropagation.
+   for (size_t l = 0; l < net.GetDepth(); l++)
+   {
+      std::cout << "\rTesting weight gradients (L1): layer: "
+                << l << " / " << net.GetDepth();
+      std::cout << std::flush;
+      auto & layer = net.GetLayer(l);
+      auto & W     = layer.GetWeights();
+      auto & dW    = layer.GetWeightGradients();
+
+      for (size_t i = 0; i < layer.GetWidth(); i++) {
+         for (size_t j = 0; j < layer.GetInputWidth(); j++) {
+            // Avoid running into the non-differentiable point at 0.0.
+            if (std::abs(W(i,j)) > dx) {
+               auto f = [& net, & X, &Y, l, i, j](Scalar_t x)
+               {
+                  return evaluate_net_weight(net, X, Y, l, i, j, x);
+               };
+               Scalar_t dy     = finiteDifference(f, dx) / (2.0 * dx);
+               Scalar_t dy_ref = dW(i,j);
+
+               // Compute the relative error if dy != 0.
+               Scalar_t error;
+               if (std::fabs(dy_ref) > 1e-15)
+               {
+                  error = std::fabs((dy - dy_ref) / dy_ref);
+               }
+               else
+               {
+                  error = std::fabs(dy - dy_ref);
+               }
+
+               maximum_error = std::max(error, maximum_error);
+            }
+         }
+      }
+   }
+
+   std::cout << "\rTesting weight gradients (L1): ";
+   std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl;
+   return maximum_error;
+}
+
+/*! Generate a random, linear net, perform forward and backward propagation with
+ *  L2 regularization and check the weight gradients using numerical
+ *  differentiation. Returns the maximum relative gradient error and
+ *  also prints it to stdout. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testBackpropagationL2Regularization(typename Architecture::Scalar_t dx)
+-> typename Architecture::Scalar_t
+{
+   using Scalar_t = typename Architecture::Scalar_t;
+   using Matrix_t = typename Architecture::Matrix_t;
+   using Net_t    = TNet<Architecture>;
+
+   Net_t net(50, 50, ELossFunction::kMeanSquaredError, ERegularization::kL2, 0.1);
+
+   // Random net.
+   constructRandomLinearNet(net);
+   net.Initialize(EInitialization::kGauss);
+
+   // Random training data.
+   Matrix_t X(50, 50);
+   randomBatch(X);
+
+   Matrix_t Y(50, net.GetOutputWidth());
+   randomMatrix(Y);
+
+   net.Forward(X);
+   net.Backward(X,Y);
+
+   Scalar_t maximum_error = 0.0;
+
+   // Compute derivatives for all weights using finite differences and
+   // compare to result obtained from backpropagation.
+   for (size_t l = 0; l < net.GetDepth(); l++)
+   {
+      std::cout << "\rTesting weight gradients (L2): layer: "
+                << l << " / " << net.GetDepth();
+      std::cout << std::flush;
+      auto & layer = net.GetLayer(l);
+      auto & W     = layer.GetWeightGradients();
+
+      for (size_t i = 0; i < layer.GetWidth(); i++)
+      {
+         for (size_t j = 0; j < layer.GetInputWidth(); j++)
+         {
+            auto f = [& net, & X, &Y, l, i, j](Scalar_t x)
+            {
+               return evaluate_net_weight(net, X, Y, l, i, j, x);
+            };
+            Scalar_t dy     = finiteDifference(f, dx) / (2.0 * dx);
+            Scalar_t dy_ref = W(i,j);
+
+            // Compute the relative error if dy != 0.
+            Scalar_t error;
+            if (std::fabs(dy_ref) > 1e-15)
+            {
+               error = std::fabs((dy - dy_ref) / dy_ref);
+            }
+            else
+            {
+               error = std::fabs(dy - dy_ref);
+            }
+
+            maximum_error = std::max(error, maximum_error);
+         }
+      }
+   }
+
+   std::cout << "\rTesting weight gradients (L2): ";
+   std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl;
+   return maximum_error;
+}
+
+/*! Generate a random net, perform forward and backward propagation and check
+ *  the bias gradients using numerical differentiation. Returns the maximum
+ *  relative gradient error and also prints it to stdout. */
+//______________________________________________________________________________
+template <typename Architecture>
+auto testBackpropagationBiasesLinear(typename Architecture::Scalar_t dx)
+-> typename Architecture::Scalar_t
+{
+   using Net_t    = TNet<Architecture>;
+   using Scalar_t   = typename Architecture::Scalar_t;
+   using Matrix_t = typename Architecture::Matrix_t;
+
+
+   Net_t net(50, 50, ELossFunction::kMeanSquaredError);
+
+   // Random net.
+   constructRandomLinearNet(net);
+   net.Initialize(EInitialization::kGauss);
+
+   // Random training data.
+   Matrix_t X(50, 50);
+   randomBatch(X);
+
+   Matrix_t Y(50, net.GetOutputWidth());
+   randomMatrix(Y);
+
+   net.Forward(X);
+   net.Backward(X,Y);
+
+   Scalar_t maximum_error = 0.0;
+
+   // Compute derivatives for all bias terms using finite differences and
+   // compare to result obtained from backpropagation.
+   for (size_t l = 0; l < net.GetDepth(); l++)
+   {
+      std::cout << "\rTesting bias gradients:       layer: "
+                << l << " / " << net.GetDepth();
+      std::cout << std::flush;
+      auto & layer = net.GetLayer(l);
+      auto & dtheta = layer.GetBiasGradients();
+
+      for (size_t i = 0; i < layer.GetWidth(); i++)
+      {
+         auto f = [& net, & X, &Y, l, i](Scalar_t x)
+         {
+            return evaluate_net_bias(net, X, Y, l, i, x);
+         };
+         Scalar_t dy     = finiteDifference(f, dx);
+         Scalar_t dy_ref = dtheta(i,0) * 2.0 * dx;
+
+         // Compute the relative error if dy != 0.
+         Scalar_t error;
+         if (std::fabs(dy_ref) > 1e-10)
+         {
+            error = std::fabs((dy - dy_ref) / dy_ref);
+         }
+         else
+         {
+            error = std::fabs(dy - dy_ref);
+         }
+
+         maximum_error = std::max(error, maximum_error);
+      }
+   }
+
+   std::cout << "\rTesting bias gradients:        ";
+   std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl;
+   return maximum_error;
+}
diff --git a/tmva/tmva/test/DNN/TestBackpropagationCpu.cxx b/tmva/tmva/test/DNN/TestBackpropagationCpu.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..c44405b4f6a786561cd0d5500055570f9ebe2abc
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestBackpropagationCpu.cxx
@@ -0,0 +1,47 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////////
+// Concrete instantiation of the generic backpropagation test for //
+// multi-threaded CPU architectures.                              //
+////////////////////////////////////////////////////////////////////
+
+#include "TMatrix.h"
+#include "TMVA/DNN/Architectures/Cpu.h"
+#include "TestBackpropagation.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+   using Scalar_t = Double_t;
+   std::cout << "Testing Backpropagation:" << std::endl;
+
+   double error;
+
+   error = testBackpropagationWeightsLinear<TCpu<Scalar_t>>(1.0);
+   if (error > 1e-3)
+       return 1;
+
+   error = testBackpropagationL1Regularization<TCpu<Scalar_t>>(1e-2);
+   if (error > 1e-3)
+       return 1;
+
+   error = testBackpropagationL2Regularization<TCpu<Scalar_t>>(1.0);
+   if (error > 1e-3)
+       return 1;
+
+   error = testBackpropagationBiasesLinear<TCpu<Scalar_t>>(1.0);
+   if (error > 1e-3)
+       return 1;
+
+   return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestBackpropagationCuda.cxx b/tmva/tmva/test/DNN/TestBackpropagationCuda.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..55178a612a22bd005a7abbfb47f3cc5b6404aeb6
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestBackpropagationCuda.cxx
@@ -0,0 +1,43 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////////
+// Concrete instantiation of the generic backpropagation test for //
+// CUDA architectures.                                            //
+////////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMatrix.h"
+#include "TestBackpropagation.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+    using Scalar_t = Double_t;
+
+    std::cout << "Testing Backpropagation:" << std::endl;
+    double error;
+    error = testBackpropagationWeightsLinear<TCuda<Scalar_t>>(1.0);
+    if (error > 1e-3)
+        return 1;
+    error = testBackpropagationL1Regularization<TCuda<Scalar_t>>(1e-2);
+    if (error > 1e-3)
+        return 1;
+    error = testBackpropagationL2Regularization<TCuda<Scalar_t>>(1.0);
+    if (error > 1e-3)
+        return 1;
+    error = testBackpropagationBiasesLinear<TCuda<Scalar_t>>(1.0);
+    if (error > 1e-3)
+        return 1;
+    return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestCuda.cxx b/tmva/tmva/test/DNN/TestCuda.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..46c4597f979a7c57f8646a9a1fb4c58153bce2c6
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestCuda.cxx
@@ -0,0 +1,196 @@
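+// Elementary tests of the low-level CUDA matrix arithmetic (multiplication,
+// row-wise addition, Hadamard product, reductions and ScaleAdd) against
+// TMatrixT and the reference implementation.
+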
+#include "Utility.h"
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Architectures/Reference.h"
+#include <stdlib.h>
+
+using namespace TMVA::DNN;
+
+//_________________________________________________________________________________
+Double_t testMultiply()
+{
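+    // Compare plain, transposed-first-operand and transposed-second-operand
+    // matrix multiplication on the GPU with the corresponding TMatrixT /
+    // reference results for random matrix sizes.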
+    const size_t ntests = 100;
+
+    Double_t maximumError = 0;
+
+    for (size_t i = 0; i < ntests; i++) {
+        size_t m, n, k;
+        m = rand() % 50 + 1;
+        n = rand() % 50 + 1;
+        k = rand() % 50 + 1;
+
+        TMatrixT<Double_t> A(m,k), AT(k,m) , B(k,n), BT(n,k), C(m,n);
+        randomMatrix(A);
+        randomMatrix(AT);
+        randomMatrix(B);
+        randomMatrix(BT);
+        TCudaMatrix ACuda(A), ATCuda(AT), BCuda(B), BTCuda(BT),  CCuda(C);
+
+        TReference<Double_t>::MultiplyTranspose(C, A, BT);
+        TCuda<false>::MultiplyTranspose(CCuda, ACuda, BTCuda);
+        TMatrixT<Double_t> CRef(CCuda);
+        Double_t error = maximumRelativeError(C, CRef);
+        maximumError   = std::max(error, maximumError);
+
+        C.Mult(A,B);
+        TCuda<false>::Multiply(CCuda, ACuda, BCuda);
+        CRef = CCuda;
+        error = maximumRelativeError(C, CRef);
+        maximumError   = std::max(error, maximumError);
+
+        C.TMult(AT,B);
+        TCuda<false>::TransposeMultiply(CCuda, ATCuda, BCuda);
+        CRef = CCuda;
+        error = maximumRelativeError(C, CRef);
+        maximumError   = std::max(error, maximumError);
+    }
+    return maximumError;
+}
+
+//_________________________________________________________________________________
+Double_t testAddRowWise()
+{
+   const size_t ntests = 10;
+
+   Double_t maximumError = 0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m, n;
+      m = rand() % 50 + 1;
+      n = rand() % 50 + 1;
+
+      TMatrixT<Double_t> A(m,n), B(m,n), theta(n,1);
+      //randomMatrix(A);
+      randomMatrix(theta);
+      TCudaMatrix ACuda(A), BCuda(B), thetaCuda(theta);
+
+      TReference<Double_t>::AddRowWise(A, theta);
+      TCuda<false>::AddRowWise(ACuda,thetaCuda);
+      TMatrixT<Double_t> ARef(ACuda);
+
+      Double_t error = maximumRelativeError(A, ARef);
+      maximumError   = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+//_________________________________________________________________________________
+Double_t testHadamard()
+{
+   const size_t ntests = 10;
+   Double_t maximumError = 0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m, n;
+      m = rand() % 10 + 1;
+      n = rand() % 10 + 1;
+
+      TMatrixT<Double_t> A(m,n), B(m,n);
+      randomMatrix(A);
+      randomMatrix(B);
+      TCudaMatrix ACuda(A), BCuda(B);
+
+      for (size_t j = 0; j < (size_t) A.GetNrows(); j++) {
+         for (size_t k = 0; k < (size_t) A.GetNcols(); k++) {
+             A(j,k) *= B(j,k);
+         }
+      }
+
+      TCuda<false>::Hadamard(ACuda, BCuda);
+      TMatrixT<Double_t> ARef(ACuda);
+      Double_t error = maximumRelativeError(A, ARef);
+      maximumError   = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+//_________________________________________________________________________________
+Double_t testReduction()
+{
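+   // A matrix filled with ones has total sum m * n and column sums equal to
+   // m; this is used to check Sum() and SumColumns() on the GPU.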
+   const size_t ntests = 10;
+   Double_t maximumError = 0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m, n;
+      m = rand() % 1000 + 1;
+      n = rand() % 1000 + 1;
+
+      TMatrixT<Double_t> A(m,n);
+
+      for (size_t j = 0; j < m; j++) {
+         for (size_t k = 0; k < n; k++) {
+            A(j,k) = 1.0;
+         }
+      }
+      TCudaMatrix ACuda(A);
+
+      TCudaMatrix BCuda(1,n);
+      TCuda<false>::InitializeZero(BCuda);
+      Double_t s  = TCuda<false>::Sum(ACuda);
+      TCuda<false>::SumColumns(BCuda, ACuda);
+      TMatrixT<Double_t> B(BCuda);
+
+      Double_t error = std::abs(s - ((Double_t) m * n));
+      maximumError   = std::max(error, maximumError);
+
+      for (size_t j = 0; j < n; j++) {
+         //std::cout << B(0,j) << " / " << j * m << std::endl;
+         error = std::abs(B(0,j) - m);
+         maximumError   = std::max(error, maximumError);
+      }
+   }
+   return maximumError;
+}
+
+//_________________________________________________________________________________
+Double_t testScaleAdd()
+{
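+   // Check ScaleAdd with a random scale factor beta against the reference
+   // implementation on random matrices.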
+   const size_t ntests   = 10;
+   Double_t maximumError = 0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m, n;
+      m = rand() % 1000 + 1;
+      n = rand() % 1000 + 1;
+
+      TMatrixT<Double_t> A(m,n), B(m,n);
+
+      randomMatrix(A);
+      randomMatrix(B);
+
+      TCudaMatrix ACuda(A);
+      TCudaMatrix BCuda(B);
+
+      Double_t beta = ((Double_t) rand()) / ((Double_t) RAND_MAX);
+      TReference<Double_t>::ScaleAdd(A, B, beta);
+      TCuda<false>::ScaleAdd(ACuda, BCuda, beta);
+
+      Double_t error = maximumRelativeError(A, (TMatrixT<Double_t>) ACuda);
+      maximumError   = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+//_________________________________________________________________________________
+int main()
+{
+    Double_t error;
+    error = testReduction();
+    std::cout << "Testing reduction: max. rel. error = ";
+    std::cout << error << std::endl;
+
+    error = testScaleAdd();
+    std::cout << "Testing scale_add: max. rel. error = ";
+    std::cout << error << std::endl;
+
+    error = testHadamard();
+    std::cout << "Testing hadamard: max. rel. error = ";
+    std::cout << error << std::endl;
+
+    error = testMultiply();
+    std::cout << "Testing multiplication: max. rel. error = ";
+    std::cout << error << std::endl;
+
+    error = testAddRowWise();
+    std::cout << "Testing add_row_wise: max. rel. error = ";
+    std::cout << error << std::endl;
+}
diff --git a/tmva/tmva/test/DNN/TestDataLoader.cxx b/tmva/tmva/test/DNN/TestDataLoader.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..283e94a03ddb3643f991a01950041489d27c641a
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestDataLoader.cxx
@@ -0,0 +1,26 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 12/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////
+// Test the reference data loader implementation. //
+////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TestDataLoader.h"
+
+using namespace TMVA::DNN;
+
+int main ()
+{
+   Double_t error = testIdentity<TReference<Double_t>>();
+   std::cout << "Testing reference data loader: Mex. rel. error = " << error;
+   std::cout << std::endl;
+}
diff --git a/tmva/tmva/test/DNN/TestDataLoader.h b/tmva/tmva/test/DNN/TestDataLoader.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a1f901204589ff964a159d81de072a3498535bd
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestDataLoader.h
@@ -0,0 +1,93 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 12/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////
+// Generic test for DataLoader implementations. //
+//////////////////////////////////////////////////
+
+#include "TMVA/DNN/Net.h"
+#include "TMVA/DNN/DataLoader.h"
+#include "Utility.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+/** Test that the data loader loads all data in the data set by summing up
+ *  all elements batch-wise and comparing the result to the sum over the
+ *  complete data set. */
+//______________________________________________________________________________
+template <typename Architecture_t>
+auto testSum()
+    -> typename Architecture_t::Scalar_t
+{
+   using Scalar_t     = typename Architecture_t::Scalar_t;
+   using Matrix_t     = typename Architecture_t::Matrix_t;
+   using DataLoader_t = TDataLoader<MatrixInput_t, Architecture_t>;
+
+   size_t nSamples = 10000;
+   TMatrixT<Double_t> X(nSamples,1);
+   randomMatrix(X);
+   for (size_t i = 0; i < nSamples; i++) {
+      X(i,0) = i;
+   }
+   MatrixInput_t input(X, X);
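+   // The loader arguments after the data set appear to be the number of
+   // samples, the batch size and the input/output widths (inferred from the
+   // matrix shapes used here and in testIdentity below), i.e. batches of
+   // 5 single-feature samples.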
+   DataLoader_t  loader(input, nSamples, 5, 1, 1);
+
+   Matrix_t XArch(X), Sum(1,1), SumTotal(1,1);
+   Scalar_t sum = 0.0, sumTotal = 0.0;
+
+   for (auto b : loader) {
+      Architecture_t::SumColumns(Sum, b.GetInput());
+      sum += Sum(0, 0);
+   }
+
+   Architecture_t::SumColumns(SumTotal, XArch);
+   sumTotal = SumTotal(0,0);
+
+   return fabs(sumTotal - sum) / sumTotal;
+}
+
+/** Test the data loader by loading identical input and output data, running
+ *  it through an identity neural network and computing the mean squared
+ *  error, which should obviously be zero. */
+//______________________________________________________________________________
+template <typename Architecture_t>
+auto testIdentity()
+    -> typename Architecture_t::Scalar_t
+{
+   using Scalar_t     = typename Architecture_t::Scalar_t;
+   using Net_t        = TNet<Architecture_t>;
+   using DataLoader_t = TDataLoader<MatrixInput_t, Architecture_t>;
+
+   TMatrixT<Double_t> X(2000, 100); randomMatrix(X);
+   MatrixInput_t input(X, X);
+   DataLoader_t loader(input, 2000, 20, 100, 100);
+
+   Net_t net(20, 100, ELossFunction::kMeanSquaredError);
+   net.AddLayer(100,  EActivationFunction::kIdentity);
+   net.AddLayer(100,  EActivationFunction::kIdentity);
+   net.Initialize(EInitialization::kIdentity);
+
+   Scalar_t maximumError = 0.0;
+   for (auto b : loader) {
+       auto inputMatrix  = b.GetInput();
+       auto outputMatrix = b.GetOutput();
+       Scalar_t error = net.Loss(inputMatrix, outputMatrix);
+       maximumError = std::max(error, maximumError);
+   }
+
+   return maximumError;
+}
+
+} // namespace DNN
+} // namespace TMVA
diff --git a/tmva/tmva/test/DNN/TestDataLoaderCpu.cxx b/tmva/tmva/test/DNN/TestDataLoaderCpu.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..8e4a787682165546943f76077241390fac667ad4
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestDataLoaderCpu.cxx
@@ -0,0 +1,39 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 21/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////
+// Test the multi-threaded CPU data loader implementation. //
+/////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cpu.h"
+#include "TestDataLoader.h"
+
+using namespace TMVA::DNN;
+
+int main ()
+{
+   using Scalar_t = Real_t;
+
+   std::cout << "Testing data loader:" << std::endl;
+
+   Scalar_t maximumError = 0.0;
+
+   Scalar_t error = testSum<TCpu<Scalar_t>>();
+   std::cout << "Sum:      Maximum relative error = " << error << std::endl;
+   maximumError = std::max(error, maximumError);
+   error = testIdentity<TCpu<Scalar_t>>();
+   std::cout << "Identity: Maximum relative error = " << error << std::endl;
+   maximumError = std::max(error, maximumError);
+
+   if (maximumError > 1e-3) {
+      return 1;
+   }
+}
diff --git a/tmva/tmva/test/DNN/TestDataLoaderCuda.cxx b/tmva/tmva/test/DNN/TestDataLoaderCuda.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..38efb25427234d1c9aac1bf7dd0fce103d6cc89d
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestDataLoaderCuda.cxx
@@ -0,0 +1,45 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Simon Pfreundschuh 08/08/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////
+// Test the generic data loader for the CUDA implementation. //
+///////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TestDataLoader.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+   std::cout << "Testing data loader:" << std::endl;
+   using Scalar_t = Real_t;
+
+   Scalar_t maximumError = 0.0;
+
+   Scalar_t error = testSum<TCuda<Scalar_t>>();
+   std::cout << "Sum:      Maximum relative error = " << error << std::endl;
+   maximumError = std::max(error, maximumError);
+   error = testIdentity<TCuda<Scalar_t>>();
+   std::cout << "Identity: Maximum relative error = " << error << std::endl;
+   maximumError = std::max(error, maximumError);
+
+   if (maximumError > 1e-3) {
+      return 1;
+   }
+   return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestDerivatives.cxx b/tmva/tmva/test/DNN/TestDerivatives.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..3a3e742469a692c1025fec6f6db05f403a5148f1
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestDerivatives.cxx
@@ -0,0 +1,65 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Concrete instantiation of the generic derivative test for the //
+//  reference implementation.                                    //
+///////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TestDerivatives.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+
+    double error;
+
+    //
+    // Activation Functions
+    //
+
+    std::cout << "Activation Functions:" << std::endl;
+    error = testActivationFunctionDerivatives<TReference<double>>();
+    std::cout << "Total    : ";
+    std::cout << "Maximum Relative Error = " << print_error(error);
+    std::cout << std::endl << std::endl;
+    if (error > 1e-5)
+        return 1;
+
+    //
+    // Loss Functions
+    //
+
+    std::cout << "Loss Functions:" << std::endl;
+    error = testLossFunctionGradients<TReference<double>>();
+    std::cout << "Total    : ";
+    std::cout << "Maximum Relative Error = " << print_error(error);
+    std::cout << std::endl << std::endl;
+    if (error > 1e-5)
+        return 1;
+
+    //
+    // Regularization Functions
+    //
+
+    std::cout << "Regularization:" << std::endl;
+    error = testRegularizationGradients<TReference<double>>();
+    std::cout << "Total    : ";
+    std::cout << "Maximum Relative Error = " << print_error(error);
+    std::cout << std::endl << std::endl;
+    if (error > 1e-5)
+        return 1;
+
+    return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestDerivatives.h b/tmva/tmva/test/DNN/TestDerivatives.h
new file mode 100644
index 0000000000000000000000000000000000000000..7240e23a95183f5b2b95b147894d11d8e625a0dd
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestDerivatives.h
@@ -0,0 +1,253 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Generic tests for the derivatives and gradients of activation,   //
+// loss and regularization functions. Each test generates a random  //
+// 10 x 10 matrix and uses a central finite difference to           //
+// numerically compute the derivative of the function w.r.t. a      //
+// single element. The result is compared to the result obtained    //
+// from the corresponding analytic derivative implemented by the    //
+// evaluateDerivative(...), evaluateGradients(...) and              //
+// addRegularizationGradients(...) functions.                       //
+//////////////////////////////////////////////////////////////////////
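+//
+// The numerical derivative used throughout is the central difference
+//
+//     f'(x) ~ (f(x + dx) - f(x - dx)) / (2 * dx).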
+
+#include <iostream>
+#include "TMVA/DNN/Functions.h"
+#include "TMVA/DNN/Net.h"
+#include "Utility.h"
+
+using namespace TMVA::DNN;
+
+//______________________________________________________________________________
+//
+//  Activation Functions
+//______________________________________________________________________________
+
+/*! Generic function that numerically computes the derivative of a matrix
+ *  function f and compares it to the analytic derivative provided by df.
+ *  The function signatures are assumed to be
+ *  - void f(Matrix_t &X)
+ *  - void df(Matrix_t &Y, const Matrix_t &X)
+ *  The function f is expected to apply the corresponding mathematical function
+ *  to each element in the provided matrix X. The function df is expected to
+ *  set each element in Y to the derivative of the corresponding mathematical
+ *  function evaluated at the corresponding element in X.
+ */
+template<typename Architecture, typename F, typename dF>
+    auto testDerivatives(F f, dF df,
+                         typename Architecture::Scalar_t dx)
+    -> typename Architecture::Scalar_t
+{
+   using Scalar_t   = typename Architecture::Scalar_t;
+   using Matrix_t   = typename Architecture::Matrix_t;
+
+   Scalar_t maximum_error = 0.0;
+
+   for (size_t i = 0; i < 100; i++)
+   {
+      Matrix_t X(10,10), Y(10,10);
+      randomMatrix(Y);
+
+      df(X, Y);
+      Scalar_t dy = X(0,0);
+
+      copyMatrix(X, Y);
+      X(0,0) += dx;
+      f(X);
+      Scalar_t y1 = X(0,0);
+      copyMatrix(X, Y);
+      X(0,0) -= dx;
+      f(X);
+      Scalar_t y0 = X(0,0);
+      Scalar_t dy_num = (y1 - y0) / (2.0 * dx);
+      Scalar_t error = relativeError(dy_num, dy);
+      maximum_error = std::max(maximum_error, error);
+   }
+
+   return maximum_error;
+}
+
+/*! Test derivatives of all activation functions and return the maximum relative
+ *  error. Prints the result for each function to the stdout. */
+//______________________________________________________________________________
+template<typename Architecture>
+auto testActivationFunctionDerivatives()
+    -> typename Architecture::Scalar_t
+{
+   using Scalar_t   = typename Architecture::Scalar_t;
+   using Matrix_t = typename Architecture::Matrix_t;
+
+   // Test only differentiable activation functions.
+   std::vector<EActivationFunction> EActivationFunctions
+   = {EActivationFunction::kIdentity,
+      EActivationFunction::kSigmoid,
+      EActivationFunction::kTanh,
+      EActivationFunction::kSoftSign,
+      EActivationFunction::kGauss};
+
+   Scalar_t error, maximum_error;
+   maximum_error = 0.0;
+
+   for (auto & af : EActivationFunctions)
+   {
+      auto f  = [& af](Matrix_t &X){ evaluate<Architecture>(X, af);};
+      auto df = [& af](Matrix_t &X, const Matrix_t &Y)
+      {
+         evaluateDerivative<Architecture>(X, af, Y);
+      };
+      error = testDerivatives<Architecture>(f, df, 5e-3);
+
+      std::cout << "Testing " << static_cast<int>(af) << ": ";
+      std::cout << "Maximum Relative Error = " << error << std::endl;
+
+      maximum_error = std::max(maximum_error, error);
+   }
+
+   return maximum_error;
+}
+
+//______________________________________________________________________________
+//
+//  Loss functions.
+//______________________________________________________________________________
+
+/*! Similar to testDerivatives, except that here the mathematical function is
+ *  expected to be a matrix functional, i.e. to map a matrix to a scalar
+ *  value. The scalar value is computed by the provided function object f,
+ *  while the gradient function object df behaves as described above. */
+template<typename Architecture, typename F, typename dF>
+    auto testGradients(F f, dF df,
+                       typename Architecture::Scalar_t dx)
+    -> typename Architecture::Scalar_t
+{
+    using Scalar_t   = typename Architecture::Scalar_t;
+    using Matrix_t = typename Architecture::Matrix_t;
+
+    Scalar_t maximum_error = 0.0;
+
+    for (size_t i = 0; i < 100; i++)
+    {
+        Matrix_t X(10,10), Y(10,10), Z(10,10);
+        randomMatrix(X);
+        randomMatrix(Y);
+
+        df(Z, Y, X);
+        Scalar_t dy = Z(0,0);
+
+        X(0,0) += dx;
+        Scalar_t y1 = f(Y,X);
+        X(0,0) -= 2.0 * dx;
+        Scalar_t y0 = f(Y,X);
+        Scalar_t dy_num = (y1 - y0) / (2.0 * dx);
+
+        Scalar_t error = 0.0;
+        if (std::fabs(dy) > 0)
+        {
+            error = std::fabs((dy_num - dy) / dy);
+        }
+        else
+        {
+            error = std::fabs(dy_num - dy);
+        }
+
+        maximum_error = std::max(maximum_error, error);
+    }
+
+    return maximum_error;
+}
+
+/*! Test gradients of all loss function for the given architecture type and
+ *  return the maximum relative error. Prints results for each function to
+ *  standard out. */
+//______________________________________________________________________________
+template<typename Architecture>
+auto testLossFunctionGradients()
+    -> typename Architecture::Scalar_t
+{
+    using Scalar_t   = typename Architecture::Scalar_t;
+    using Matrix_t = typename Architecture::Matrix_t;
+
+    std::vector<ELossFunction> LossFunctions
+        = {ELossFunction::kMeanSquaredError,
+           ELossFunction::kCrossEntropy};
+
+    Scalar_t error, maximum_error;
+    maximum_error = 0.0;
+
+    for (auto & lf : LossFunctions)
+    {
+        auto f  = [lf](const Matrix_t &Y, const Matrix_t &Z)
+            {
+                return evaluate<Architecture>(lf, Y, Z);
+            };
+        auto df = [& lf](Matrix_t &X,
+                         const Matrix_t &Y,
+                         const Matrix_t &Z)
+            {
+                evaluateGradients<Architecture>(X, lf, Y, Z);
+            };
+
+        error = testGradients<Architecture>(f, df, 5e-6);
+
+        std::cout << "Testing " << static_cast<char>(lf) << ": ";
+        std::cout << "Maximum Relative Error = " << error << std::endl;
+
+        maximum_error = std::max(maximum_error, error);
+    }
+
+    return maximum_error;
+}
+
+//______________________________________________________________________________
+//
+//  Regularization.
+//______________________________________________________________________________
+
+/*! Test the computation of gradients for all differentiable regularization
+ *  types, which so far are only L2 and no regularization, and print the
+ *  results to standard output. */
+template<typename Architecture>
+auto testRegularizationGradients()
+    -> typename Architecture::Scalar_t
+{
+    using Scalar_t   = typename Architecture::Scalar_t;
+    using Matrix_t = typename Architecture::Matrix_t;
+
+    std::vector<ERegularization> Regularizations
+        = {ERegularization::kNone,
+           ERegularization::kL2};
+
+    Scalar_t error, maximum_error;
+    maximum_error = 0.0;
+
+    for (auto & r : Regularizations)
+    {
+        auto f  = [r](const Matrix_t & , const Matrix_t & Y)
+            {
+                return regularization<Architecture>(Y, r);
+            };
+        auto df = [& r](Matrix_t &X,
+                         const Matrix_t & ,
+                         const Matrix_t & Y)
+            {
+                applyMatrix(X, [](double){return 0.0;});
+                addRegularizationGradients<Architecture>(X, Y, (Scalar_t) 1.0, r);
+            };
+
+        error = testGradients<Architecture>(f, df, 1.0);
+
+        std::cout << "Testing " << static_cast<char>(r) << ": ";
+        std::cout << "Maximum Relative Error = " << error << std::endl;
+
+        maximum_error = std::max(maximum_error, error);
+    }
+
+    return maximum_error;
+}
diff --git a/tmva/tmva/test/DNN/TestDerivativesCpu.cxx b/tmva/tmva/test/DNN/TestDerivativesCpu.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..e94e3223809183c2422108922945c5cabf60fc06
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestDerivativesCpu.cxx
@@ -0,0 +1,66 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Concrete instantiation of the generic derivative test for the //
+//  multi-threaded CPU implementation.                           //
+///////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Cpu.h"
+#include "TestDerivatives.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+    using Scalar_t = Double_t;
+
+    double error;
+
+    //
+    // Activation Functions
+    //
+
+    std::cout << "Activation Functions:" << std::endl;
+    error = testActivationFunctionDerivatives<TCpu<Scalar_t>>();
+    std::cout << "Total    : ";
+    std::cout << "Maximum Relative Error = " << error;
+    std::cout << std::endl << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    //
+    // Loss Functions
+    //
+
+    std::cout << "Loss Functions:" << std::endl;
+    error = testLossFunctionGradients<TCpu<Scalar_t>>();
+    std::cout << "Total    : ";
+    std::cout << "Maximum Relative Error = " << error;
+    std::cout << std::endl << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    //
+    // Regularization Functions
+    //
+
+    std::cout << "Regularization:" << std::endl;
+    error = testRegularizationGradients<TCpu<Scalar_t>>();
+    std::cout << "Total    : ";
+    std::cout << "Maximum Relative Error = " << error;
+    std::cout << std::endl << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestDerivativesCuda.cxx b/tmva/tmva/test/DNN/TestDerivativesCuda.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..345c11ef11380cf45fc2862a1f69bdb28093cfd3
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestDerivativesCuda.cxx
@@ -0,0 +1,65 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Concrete instantiation of the generic derivative test for the //
+// CUDA implementation.                                          //
+///////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TestDerivatives.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+    using Scalar_t = Double_t;
+    Double_t error;
+
+    //
+    // Activation Functions
+    //
+
+    std::cout << "Activation Functions:" << std::endl;
+    error = testActivationFunctionDerivatives<TCuda<Scalar_t>>();
+    std::cout << "Total    : ";
+    std::cout << "Maximum Relative Error = " << error;
+    std::cout << std::endl << std::endl;
+    if (error > 1e-2)
+        return 1;
+
+    //
+    // Loss Functions
+    //
+
+    std::cout << "Loss Functions:" << std::endl;
+    error = testLossFunctionGradients<TCuda<Scalar_t>>();
+    std::cout << "Total    : ";
+    std::cout << "Maximum Relative Error = " << error;
+    std::cout << std::endl << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    //
+    // Regularization Functions
+    //
+
+    std::cout << "Regularization:" << std::endl;
+    error = testRegularizationGradients<TCuda<Scalar_t>>();
+    std::cout << "Total    : ";
+    std::cout << "Maximum Relative Error = " << error;
+    std::cout << std::endl << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestLossFunctions.cxx b/tmva/tmva/test/DNN/TestLossFunctions.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..9ae39ae5ba6b703ffc7bee711014bc4d2834a898
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestLossFunctions.cxx
@@ -0,0 +1,60 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Test for the loss function reference implementation using the //
+// generic test defined in TestLossFunctions.h.                  //
+///////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TestLossFunctions.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+    std::cout << "Testing Loss Functions:" << std::endl << std::endl;
+
+    double error;
+
+    //
+    // Mean Squared Error.
+    //
+
+    error = testMeanSquaredError<TReference<double>>(10);
+    std::cout << "Testing mean squared error loss:     ";
+    std::cout << "maximum relative error = " << error << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testMeanSquaredErrorGradients<TReference<double>>(10);
+    std::cout << "Testing mean squared error gradient: ";
+    std::cout << "maximum relative error = " << error << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    //
+    // Cross Entropy.
+    //
+
+    error = testCrossEntropy<TReference<double>>(10);
+    std::cout << "Testing cross entropy loss:          ";
+    std::cout << "maximum relative error = " << error << std::endl;
+    if (error > 1e-10)
+        return 1;
+
+    error = testCrossEntropyGradients<TReference<double>>(10);
+    std::cout << "Testing mean squared error gradient: ";
+    std::cout << "maximum relative error = " << error << std::endl;
+    if (error > 1e-10)
+        return 1;
+}
diff --git a/tmva/tmva/test/DNN/TestLossFunctions.h b/tmva/tmva/test/DNN/TestLossFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..d81bc3f052653b4fdf2dcd22e1edafc81e3a2509
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestLossFunctions.h
@@ -0,0 +1,193 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Generic tests of the loss functions                              //
+//                                                                  //
+// Contains generic test for architecture-specific implementations  //
+// of the loss functions. Requires the architecture-specific matrix //
+// type to be constructible and convertible from/to the             //
+// TMatrixT<Double_t> type.                                         //
+//////////////////////////////////////////////////////////////////////
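+//
+// The reference values are computed element-wise with TMatrixT, e.g.
+//
+//     MSE(Y, Yhat) = mean_{i,j} (Y(i,j) - Yhat(i,j))^2
+//     CE(Y, X)     = -mean_{i,j} [ Y log(sig(X)) + (1 - Y) log(1 - sig(X)) ]
+//
+// with sig(x) = 1 / (1 + exp(-x)), matching the lambdas used below.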
+
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TMVA/DNN/Functions.h"
+#include "TMVA/DNN/Net.h"
+#include "Utility.h"
+
+using namespace TMVA::DNN;
+
+//______________________________________________________________________________
+//
+//  Mean Squared Error
+//______________________________________________________________________________
+
+template <typename Architecture>
+auto testMeanSquaredError(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   using Scalar_t   = typename Architecture::Scalar_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> X(m, n);
+      TMatrixT<Double_t> Y(m, n);
+      TMatrixT<Double_t> Z(m, n);
+
+      randomMatrix(X);
+      randomMatrix(Y);
+
+      Matrix_t XArch(X);
+      Matrix_t YArch(Y);
+
+      Scalar_t mse = evaluate<Architecture>(ELossFunction::kMeanSquaredError,
+                                            YArch, XArch);
+      zipWithMatrix(Z, [](Scalar_t x, Scalar_t y){return x - y;}, X, Y);
+      auto squaredSum = [](Scalar_t x, Scalar_t y){return x + y * y;};
+      Scalar_t mseReference = reduceMean(squaredSum, 0.0, Z);
+
+      Double_t error;
+      if (mseReference != 0.0)
+          error = std::fabs((mse - mseReference) / mseReference);
+      else
+          error = std::fabs(mse - mseReference);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+//______________________________________________________________________________
+template <typename Architecture>
+auto testMeanSquaredErrorGradients(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   using Scalar_t   = typename Architecture::Scalar_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> X(m, n);
+      TMatrixT<Double_t> Y(m, n);
+      TMatrixT<Double_t> ZRef(m, n);
+
+      randomMatrix(X);
+      randomMatrix(Y);
+
+      Matrix_t XArch(X);
+      Matrix_t YArch(Y);
+      Matrix_t ZArch(Y);
+
+      evaluateGradients<Architecture>(ZArch, ELossFunction::kMeanSquaredError,
+                                     XArch, YArch);
+      auto normedDifference = [m, n](Scalar_t x, Scalar_t y) {
+         return 2.0 * (y - x) / (m * n);
+      };
+      zipWithMatrix(ZRef, normedDifference, X, Y);
+      TMatrixT<Double_t> Z(ZArch);
+      Double_t error = maximumRelativeError(Z, ZRef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+//______________________________________________________________________________
+//
+//  Cross Entropy
+//______________________________________________________________________________
+
+template <typename Architecture>
+auto testCrossEntropy(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   using Scalar_t   = typename Architecture::Scalar_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> X(m, n);
+      TMatrixT<Double_t> Y(m, n);
+      TMatrixT<Double_t> Z(m, n);
+
+      randomMatrix(X);
+      randomMatrix(Y);
+
+      Matrix_t XArch(X);
+      Matrix_t YArch(Y);
+
+      Scalar_t ce = evaluate<Architecture>(ELossFunction::kCrossEntropy,
+                                           YArch, XArch);
+
+      auto crossCorrelation = [](Scalar_t x, Scalar_t y) {
+         Scalar_t sig = 1.0 / (1.0 + std::exp(-x));
+         return y * std::log(sig) + (1 - y) * std::log(1 - sig);
+      };
+      zipWithMatrix(Z, crossCorrelation, X, Y);
+      auto sum = [](Scalar_t x, Scalar_t y) {return x + y;};
+      Scalar_t ceReference = - reduceMean(sum, 0.0, Z);
+
+      Double_t error;
+      if (ceReference != 0.0)
+          error = std::fabs((ce - ceReference) / ceReference);
+      else
+          error = std::fabs(ce - ceReference);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+//______________________________________________________________________________
+template <typename Architecture>
+auto testCrossEntropyGradients(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   using Scalar_t   = typename Architecture::Scalar_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = 8; //rand() % 100 + 1;
+      size_t n = 8; //rand() % 100 + 1;
+
+      TMatrixT<Double_t> X(m, n);
+      TMatrixT<Double_t> Y(m, n);
+      TMatrixT<Double_t> ZRef(m, n);
+
+      randomMatrix(X);
+      randomMatrix(Y);
+
+      Matrix_t XArch(X);
+      Matrix_t YArch(Y);
+      Matrix_t ZArch(Y);
+
+      evaluateGradients<Architecture>(ZArch, ELossFunction::kCrossEntropy,
+                                     YArch, XArch);
+      auto crossCorrelationGradient = [m, n](Scalar_t x, Scalar_t y) {
+         Scalar_t sig  = 1.0 / (1.0 + std::exp(-x));
+         Scalar_t norm = 1.0 / ((Scalar_t) m * n);
+         return (sig - y) * norm;};
+      zipWithMatrix(ZRef, crossCorrelationGradient, X, Y);
+
+      TMatrixT<Double_t> Z(ZArch);
+      Double_t error = maximumRelativeError(Z, ZRef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
diff --git a/tmva/tmva/test/DNN/TestLossFunctionsCpu.cxx b/tmva/tmva/test/DNN/TestLossFunctionsCpu.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..94b426d780f629128c0d4c691507b7fba1cb5e74
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestLossFunctionsCpu.cxx
@@ -0,0 +1,63 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////
+// Test for the loss function implementations for the           //
+// multi-threaded CPU version using the generic test defined in //
+// TestLossFunctions.h.                                         //
+//////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Cpu.h"
+#include "TestLossFunctions.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+    using Scalar_t = Double_t;
+
+    std::cout << "Testing Loss Functions:" << std::endl << std::endl;
+
+    double error;
+
+    //
+    // Mean Squared Error.
+    //
+
+    error = testMeanSquaredError<TCpu<Scalar_t>>(10);
+    std::cout << "Testing mean squared error loss:     ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    error = testMeanSquaredErrorGradients<TCpu<Scalar_t>>(10);
+    std::cout << "Testing mean squared error gradient: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    //
+    // Cross Entropy.
+    //
+
+    error = testCrossEntropy<TCpu<Scalar_t>>(10);
+    std::cout << "Testing cross entropy loss:          ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    error = testCrossEntropyGradients<TCpu<Scalar_t>>(10);
+    std::cout << "Testing mean squared error gradient: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-3)
+        return 1;
+}
diff --git a/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx b/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..1bf9ccae98322975e1d668982270d86b1ce84959
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx
@@ -0,0 +1,61 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Test for the loss function CUDA implementation using the      //
+// generic test defined in TestLossFunctions.h.                  //
+///////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TestLossFunctions.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+    using Scalar_t = Double_t;
+    std::cout << "Testing Loss Functions:" << std::endl << std::endl;
+
+    double error;
+
+    //
+    // Mean Squared Error.
+    //
+
+    error = testMeanSquaredError<TCuda<Scalar_t>>(10);
+    std::cout << "Testing mean squared error loss:     ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    error = testMeanSquaredErrorGradients<TCuda<Scalar_t>>(10);
+    std::cout << "Testing mean squared error gradient: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    //
+    // Cross Entropy.
+    //
+
+    error = testCrossEntropy<TCuda<Scalar_t>>(10);
+    std::cout << "Testing cross entropy loss:          ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    error = testCrossEntropyGradients<TCuda<Scalar_t>>(10);
+    std::cout << "Testing mean squared error gradient: ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-3)
+        return 1;
+}
diff --git a/tmva/tmva/test/DNN/TestMatrixArithmetic.h b/tmva/tmva/test/DNN/TestMatrixArithmetic.h
new file mode 100644
index 0000000000000000000000000000000000000000..10aed0ed1d372bc4ec176de11281fa8b3922dadf
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMatrixArithmetic.h
@@ -0,0 +1,119 @@
+// @(#)root/tmva/tmva/dnn:$Id$ // Author: Simon Pfreundschuh 20/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Test arithmetic functions defined on matrices and compare the //
+// results to the reference implementation.                      //
+///////////////////////////////////////////////////////////////////
+
+#include "TMatrix.h"
+#include "Utility.h"
+#include "TMVA/DNN/Architectures/Reference.h"
+
+/** Test multiplication (standard, transposed, hadamard) operation on
+ *  architecture specific matrix types and compare with results
+ *  obtained with TMatrixT.
+ */
+//______________________________________________________________________________
+template<typename Architecture_t>
+auto testMultiplication(size_t ntests)
+    -> typename Architecture_t::Scalar_t
+{
+
+   using Scalar_t = typename Architecture_t::Scalar_t;
+   using Matrix_t = typename Architecture_t::Matrix_t;
+
+   Scalar_t maximumError = 0.0;
+
+   for (size_t t = 0; t < ntests; t++) {
+      size_t m, n, k;
+      m = rand() % 100 + 1;
+      n = rand() % 100 + 1;
+      k = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m,k), A2Ref(m,k), ATRef(k,m) , BRef(k,n),
+          BTRef(n,k), CRef(m,n);
+      TMVA::DNN::randomMatrix(ARef);
+      TMVA::DNN::randomMatrix(A2Ref);
+      TMVA::DNN::randomMatrix(ATRef);
+      TMVA::DNN::randomMatrix(BRef);
+      TMVA::DNN::randomMatrix(BTRef);
+      Matrix_t A(ARef), A2(A2Ref), AT(ATRef), B(BRef), BT(BTRef),  C(CRef);
+
+      // A * B
+      CRef.Mult(ARef,BRef);
+      Architecture_t::Multiply(C, A, B);
+      Scalar_t error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) C, CRef);
+      maximumError   = std::max(error, maximumError);
+
+      // A^T * B
+      CRef.TMult(ATRef,BRef);
+      Architecture_t::TransposeMultiply(C, AT, B);
+      error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) C, CRef);
+      maximumError   = std::max(error, maximumError);
+
+      // A * B^T
+      CRef.MultT(ARef,BTRef);
+      Architecture_t::MultiplyTranspose(C, A, BT);
+      error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) C, CRef);
+      maximumError   = std::max(error, maximumError);
+
+      // A .* B
+      for (size_t i = 0; i < (size_t) ARef.GetNrows(); i++) {
+         for (size_t j = 0; j < (size_t) ARef.GetNcols(); j++) {
+            ARef(i,j) *= A2Ref(i,j);
+         }
+      }
+      Architecture_t::Hadamard(A, A2);
+      error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) A, ARef);
+      maximumError   = std::max(error, maximumError);
+   }
+
+   return maximumError;
+}
+
+/** Test the column-sum operation by comparing against the known column
+ *  sums of a matrix whose elements are the column indices.
+ */
+//______________________________________________________________________________
+template<typename Architecture_t>
+auto testSumColumns(size_t ntests)
+    -> typename Architecture_t::Scalar_t
+{
+
+   using Scalar_t = typename Architecture_t::Scalar_t;
+   using Matrix_t = typename Architecture_t::Matrix_t;
+
+   Scalar_t maximumError = 0.0;
+   for (size_t t = 0; t < ntests; t++) {
+
+      Scalar_t error;
+
+      size_t m, n;
+      m = rand() % 100 + 1;
+      n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> ARef(m,n), BRef(n,1);
+
+      for (size_t i = 0; i < (size_t) ARef.GetNrows(); i++) {
+         for (size_t j = 0; j < (size_t) ARef.GetNcols(); j++) {
+            ARef(i,j) = j;
+            if (i == 0) BRef(j, 0) = m * j;
+         }
+      }
+
+      Matrix_t A(ARef), B(n, 1);
+      Architecture_t::SumColumns(B, A);
+
+      error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) B ,BRef);
+      maximumError   = std::max(error, maximumError);
+   }
+   return maximumError;
+}
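
In testSumColumns above, the matrix A has A(i,j) = j, so column j sums to m * j, which is exactly what BRef holds. A small reference sketch (plain TMatrixT only, no architecture classes; not part of the patch) of the expected column sums, against which any architecture-specific SumColumns result could also be compared:

#include "TMatrixT.h"

// Sketch: compute the column sums of an m x n matrix with plain loops.
// For the matrix used in testSumColumns (A(i,j) = j) the j-th entry is m * j.
template <typename AFloat>
TMatrixT<AFloat> referenceColumnSums(const TMatrixT<AFloat> &A)
{
   Int_t m = A.GetNrows();
   Int_t n = A.GetNcols();
   TMatrixT<AFloat> sums(n, 1);   // matches the (n x 1) shape produced by SumColumns
   for (Int_t j = 0; j < n; j++) {
      AFloat s = 0.0;
      for (Int_t i = 0; i < m; i++) {
         s += A(i, j);
      }
      sums(j, 0) = s;
   }
   return sums;
}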
diff --git a/tmva/tmva/test/DNN/TestMatrixArithmeticCpu.cxx b/tmva/tmva/test/DNN/TestMatrixArithmeticCpu.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..c7aa073325b14bdfee6ae6628fbb742f4e4ac4cc
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMatrixArithmeticCpu.cxx
@@ -0,0 +1,46 @@
+// @(#)root/tmva/tmva/dnn:$Id$ // Author: Simon Pfreundschuh 20/07/16
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Test arithmetic on CpuMatrix class using the generic tests in //
+// TestMatrixArithmetic.h                                        //
+///////////////////////////////////////////////////////////////////
+
+#include "TMVA/DNN/Architectures/Cpu.h"
+#include "TestMatrixArithmetic.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+    std::cout << "Testing CPU matrix arithmetic (double):" << std::endl;
+
+    Double_t error = testMultiplication<TCpu<Double_t>>(10);
+    std::cout << "Multiplication: " << "Max. rel. error: " << error << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    error = testSumColumns<TCpu<Double_t>>(1);
+    std::cout << "Column Sum:     " << "Max. rel. error: " << error << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    std::cout << "Testing CPU matrix arithmetic (float):" << std::endl;
+
+    error = testMultiplication<TCpu<Real_t>>(10);
+    std::cout << "Multiplication: " << "Max. rel. error: " << error << std::endl;
+    if (error > 1e-1)
+        return 1;
+
+    error = testSumColumns<TCpu<Real_t>>(1);
+    std::cout << "Column Sum:     " << "Max. rel. error: " << error << std::endl;
+    if (error > 1e-1)
+        return 1;
+}
diff --git a/tmva/tmva/test/DNN/TestMatrixArithmeticCuda.cxx b/tmva/tmva/test/DNN/TestMatrixArithmeticCuda.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..cf50a6d9f12826f98885e73eef952ae04def3e78
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMatrixArithmeticCuda.cxx
@@ -0,0 +1,48 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////////
+// Concrete instantiation of the generic matrix arithmetic tests  //
+// defined in TestMatrixArithmetic.h for CUDA architectures.      //
+////////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TestMatrixArithmetic.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+    std::cout << "Testing CUDA matrix arithmetic (double):" << std::endl;
+
+    Double_t error = testMultiplication<TCuda<Double_t>>(10);
+    std::cout << "Multiplication: " << "Max. rel. error: " << error << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    error = testSumColumns<TCuda<Double_t>>(1);
+    std::cout << "Column Sum:     " << "Max. rel. error: " << error << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    std::cout << "Testing CUDA matrix arithmetic (float):" << std::endl;
+
+    error = testMultiplication<TCuda<Real_t>>(10);
+    std::cout << "Multiplication: " << "Max. rel. error: " << error << std::endl;
+    if (error > 1)
+        return 1;
+
+    error = testSumColumns<TCuda<Real_t>>(1);
+    std::cout << "Column Sum:     " << "Max. rel. error: " << error << std::endl;
+    if (error > 1e-3)
+        return 1;
+}
diff --git a/tmva/tmva/test/DNN/TestMinimization.cxx b/tmva/tmva/test/DNN/TestMinimization.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..a2e5b36b7a249ad4bb22522675dd295c8a616c06
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMinimization.cxx
@@ -0,0 +1,29 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////
+// Test the Neural Network training using the reference   //
+// implementation.                                        //
+//                                                        //
+// Calls the generic testMinimization function defined in //
+// TestMinimization.h for the reference architecture.     //
+////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TestMinimization.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+    testMinimization<TReference<double>>();
+}
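
The reference driver above only exercises the training and always returns 0. A variant that also checks and reports the returned error, mirroring the CPU and CUDA drivers below, could look like the following sketch (the 1e-3 tolerance is an assumption, not taken from the patch):

#include <iostream>
#include "TMVA/DNN/Architectures/Reference.h"
#include "TestMinimization.h"

using namespace TMVA::DNN;

int main()
{
   // Sketch: report and check the error like the CPU/CUDA drivers do.
   Double_t error = testMinimization<TReference<double>>();
   std::cout << "Gradient Descent: Maximum relative error = " << error << std::endl;
   return (error > 1e-3) ? 1 : 0;   // assumed tolerance
}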
diff --git a/tmva/tmva/test/DNN/TestMinimization.h b/tmva/tmva/test/DNN/TestMinimization.h
new file mode 100644
index 0000000000000000000000000000000000000000..98803b950bb2ad9abd9f272e016d26248fb21fb4
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMinimization.h
@@ -0,0 +1,121 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////////
+// Test Standard Minimizer                                         //
+//                                                                 //
+// This test trains a linear neural network on a linear function   //
+// F(x) = W * x and computes the relative error between the matrix //
+// W' representing the linear function learned by the net and the  //
+// original matrix W.                                              //
+/////////////////////////////////////////////////////////////////////
+
+#include "TMatrix.h"
+#include "TMVA/DNN/Minimizers.h"
+#include "TMVA/DNN/Net.h"
+#include "Utility.h"
+
+using namespace TMVA::DNN;
+
+/** Train a linear neural network on a randomly generated linear mapping
+ *  from a 20-dimensional input space to a 1-dimensional output space.
+ *  Returns the maximum relative error between the network's response to
+ *  the identity matrix and the 20x1 matrix W generating the mapping.
+ */
+template <typename Architecture>
+   auto testMinimization()
+   -> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   using Net_t    = TNet<Architecture>;
+
+   size_t nSamples  = 10000;
+   size_t nFeatures = 20;
+   size_t batchSize = 256;
+
+   TMatrixT<Double_t> XTrain(nSamples, nFeatures), YTrain(nSamples, 1),
+   XTest(batchSize, nFeatures), YTest(batchSize, 1), W(nFeatures, 1);
+
+   randomMatrix(W);
+   randomMatrix(XTrain);
+   randomMatrix(XTest);
+   YTrain.Mult(XTrain, W);
+   YTest.Mult(XTest, W);
+
+   Net_t net(batchSize, nFeatures, ELossFunction::kMeanSquaredError);
+   net.AddLayer(64, EActivationFunction::kIdentity);
+   net.AddLayer(64, EActivationFunction::kIdentity);
+   net.AddLayer(64, EActivationFunction::kIdentity);
+   net.AddLayer(1, EActivationFunction::kIdentity);
+   net.Initialize(EInitialization::kGauss);
+
+   TGradientDescent<Architecture> minimizer(0.0001, 5, 1);
+   MatrixInput_t trainingData(XTrain, YTrain);
+   MatrixInput_t testData(XTest, YTest);
+   minimizer.TrainMomentum(trainingData, nSamples, testData, batchSize, net, 0.8, 1);
+
+   TMatrixT<Double_t> I(nFeatures, nFeatures);
+   for (size_t i = 0; i < nFeatures; i++) {
+      I(i, i) = 1.0;
+   }
+   Matrix_t Id(I);
+   auto clone = net.CreateClone(nFeatures);
+   clone.Forward(Id);
+   TMatrixT<Double_t> Y(clone.GetOutput());
+
+   return maximumRelativeError(Y, W);
+}
+
+/** Similar to testMinimization() above, except that momentum is used
+ *  for the training. */
+template <typename Architecture>
+   auto testMinimizationMomentum()
+   -> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   using Net_t    = TNet<Architecture>;
+
+   size_t nSamples  = 10000;
+   size_t nFeatures = 20;
+   size_t batchSize = 256;
+
+   TMatrixT<Double_t> XTrain(nSamples, nFeatures), YTrain(nSamples, 1),
+   XTest(batchSize, nFeatures), YTest(batchSize, 1), W(nFeatures, 1);
+
+   randomMatrix(W);
+   randomMatrix(XTrain);
+   randomMatrix(XTest);
+   YTrain.Mult(XTrain, W);
+   YTest.Mult(XTest, W);
+
+   Net_t net(batchSize, nFeatures, ELossFunction::kMeanSquaredError);
+   net.AddLayer(64, EActivationFunction::kIdentity);
+   net.AddLayer(64, EActivationFunction::kIdentity);
+   net.AddLayer(64, EActivationFunction::kIdentity);
+   net.AddLayer(1, EActivationFunction::kIdentity);
+   net.Initialize(EInitialization::kGauss);
+
+   TGradientDescent<Architecture> minimizer(0.0001, 5, 5);
+   MatrixInput_t trainingData(XTrain, YTrain);
+   MatrixInput_t testData(XTest, YTest);
+   minimizer.TrainMomentum(trainingData, nSamples, testData, batchSize, net, 0.9, 1);
+
+   TMatrixT<Double_t> I(nFeatures, nFeatures);
+   for (size_t i = 0; i < nFeatures; i++) {
+      I(i, i) = 1.0;
+   }
+   Matrix_t Id(I);
+   auto clone = net.CreateClone(nFeatures);
+   clone.Forward(Id);
+   TMatrixT<Double_t> Y(clone.GetOutput());
+
+   return maximumRelativeError(Y, W);
+}
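
Both tests read off the mapping learned by the net by forwarding the identity matrix: with identity activations every layer is linear, so (ignoring bias terms, which should train towards zero for this offset-free target) the whole net computes f(X) = X * M for some composite matrix M, and therefore f(I) = M, which after training should be close to the generating matrix W. A toy sketch with plain TMatrixT (hypothetical sizes, no TMVA classes) illustrating the identity trick:

#include <algorithm>
#include <cmath>
#include <iostream>
#include "TMatrixT.h"

int main()
{
   const Int_t nFeatures = 4;

   // Two "layers" of a purely linear net and the identity input.
   TMatrixT<Double_t> W1(nFeatures, 3), W2(3, 1), I(nFeatures, nFeatures);
   for (Int_t i = 0; i < nFeatures; i++) {
      I(i, i) = 1.0;
      for (Int_t j = 0; j < 3; j++) W1(i, j) = 0.1 * (i + 1) * (j + 1);
   }
   for (Int_t j = 0; j < 3; j++) W2(j, 0) = j + 1.0;

   // Forward pass on the identity: (I * W1) * W2.
   TMatrixT<Double_t> H(nFeatures, 3), Y(nFeatures, 1), M(nFeatures, 1);
   H.Mult(I, W1);
   Y.Mult(H, W2);

   // The composite map W1 * W2 is recovered exactly (up to round-off).
   M.Mult(W1, W2);
   Double_t maxDiff = 0.0;
   for (Int_t i = 0; i < nFeatures; i++)
      maxDiff = std::max(maxDiff, std::abs(Y(i, 0) - M(i, 0)));
   std::cout << "max |f(I) - W1*W2| = " << maxDiff << std::endl;
   return 0;
}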
diff --git a/tmva/tmva/test/DNN/TestMinimizationCpu.cxx b/tmva/tmva/test/DNN/TestMinimizationCpu.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..86c4de988f020bfa335acdc170c3223cbddf1dc0
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMinimizationCpu.cxx
@@ -0,0 +1,55 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////////
+// Train the multi-threaded CPU implementation of DNNs on a random //
+// linear mapping. In the linear case the minimization problem is  //
+// convex and the gradient descent training should converge to the //
+// global minimum.                                                 //
+/////////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Cpu.h"
+#include "TestMinimization.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+
+   std::cout << "Testing minimization: (single precision)" << std::endl;
+
+   Double_t error = testMinimization<TCpu<Real_t>>();
+   std::cout << "Gradient Descent: Maximum relative error = " << error << std::endl;
+   if (error > 1e-3) {
+       return 1;
+   }
+
+   error = testMinimizationMomentum<TCpu<Real_t>>();
+   std::cout << "Momentum:         Maximum relative error = " << error << std::endl;
+   if (error > 1e-3) {
+       return 1;
+   }
+   std::cout << std::endl << "Testing minimization: (double precision)" << std::endl;
+
+   error = testMinimization<TCpu<Double_t>>();
+   std::cout << "Gradient Descent: Maximum relative error = " << error << std::endl;
+   if (error > 1e-5) {
+       return 1;
+   }
+
+   error = testMinimizationMomentum<TCpu<Double_t>>();
+   std::cout << "Momentum:         Maximum relative error = " << error << std::endl;
+   if (error > 1e-5) {
+       return 1;
+   }
+   return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestMinimizationCuda.cxx b/tmva/tmva/test/DNN/TestMinimizationCuda.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..117ef5fce3a4ec01c0acc18ad548983377231100
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMinimizationCuda.cxx
@@ -0,0 +1,53 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////////
+// Use the generic tests defined in TestMinimization.h to test the //
+// training of Neural Networks for CUDA architectures.             //
+/////////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TMVA/DNN/Minimizers.h"
+#include "TestMinimization.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+   std::cout << "Testing minimization: (single precision)" << std::endl;
+
+   Double_t error = testMinimization<TCuda<Real_t>>();
+   std::cout << "Gradient Descent: Maximum relative error = " << error << std::endl;
+   if (error > 1) {
+       return 1;
+   }
+
+   error = testMinimizationMomentum<TCuda<Real_t>>();
+   std::cout << "Momentum:         Maximum relative error = " << error << std::endl;
+   if (error > 1) {
+       return 1;
+   }
+   std::cout << std::endl << "Testing minimization: (double precision)" << std::endl;
+
+   error = testMinimization<TCuda<Double_t>>();
+   std::cout << "Gradient Descent: Maximum relative error = " << error << std::endl;
+   if (error > 1e-3) {
+       return 1;
+   }
+
+   error = testMinimizationMomentum<TCuda<Double_t>>();
+   std::cout << "Momentum:         Maximum relative error = " << error << std::endl;
+   if (error > 1e-3) {
+       return 1;
+   }
+}
diff --git a/tmva/tmva/test/DNN/Utility.h b/tmva/tmva/test/DNN/Utility.h
new file mode 100644
index 0000000000000000000000000000000000000000..46077fc6a14c9406cf36ff3b7a6b443b31fe6b29
--- /dev/null
+++ b/tmva/tmva/test/DNN/Utility.h
@@ -0,0 +1,269 @@
+#ifndef TMVA_TEST_DNN_UTILITY
+#define TMVA_TEST_DNN_UTILITY
+
+#include <iostream>
+#include <sstream>
+#include <type_traits>
+#include "stdlib.h"
+#include "TRandom.h"
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TMVA/DNN/Functions.h"
+#include "TMVA/DNN/Net.h"
+
+namespace TMVA
+{
+namespace DNN
+{
+
+/** Construct a random linear neural network with up to five layers.*/
+//______________________________________________________________________________
+template <typename AArchitecture>
+void constructRandomLinearNet(TNet<AArchitecture> & net)
+{
+    int nlayers = rand() % 5 + 1;
+
+    std::vector<EActivationFunction> ActivationFunctions
+        = {EActivationFunction::kIdentity};
+
+    for (int i = 0; i < nlayers; i++) {
+        int width = rand() % 20 + 1;
+        EActivationFunction f =
+            ActivationFunctions[rand() % ActivationFunctions.size()];
+        net.AddLayer(width, f);
+    }
+}
+
+/*! Set matrix to the identity matrix */
+//______________________________________________________________________________
+template <typename AMatrix>
+void identityMatrix(AMatrix &X)
+{
+    size_t m, n;
+    m = X.GetNrows();
+    n = X.GetNcols();
+
+    for (size_t i = 0; i < m; i++) {
+        for (size_t j = 0; j < n; j++) {
+            X(i,j) = 0.0;
+        }
+        if (i < n) {
+            X(i,i) = 1.0;
+        }
+    }
+}
+
+/*! Fill matrix with random, Gaussian-distributed values. */
+//______________________________________________________________________________
+template <typename AMatrix>
+void randomMatrix(AMatrix &X)
+{
+    size_t m,n;
+    m = X.GetNrows();
+    n = X.GetNcols();
+
+    TRandom rand(clock());
+
+    Double_t sigma = sqrt(10.0);
+
+    for (size_t i = 0; i < m; i++) {
+        for (size_t j = 0; j < n; j++) {
+            X(i,j) = rand.Gaus(0.0, sigma);
+        }
+    }
+}
+
+/*! Generate a random batch as input for a neural net. */
+//______________________________________________________________________________
+template <typename AMatrix>
+void randomBatch(AMatrix &X)
+{
+    randomMatrix(X);
+}
+
+/*! Copy the matrix Y into the matrix X. */
+//______________________________________________________________________________
+template <typename AMatrix>
+void copyMatrix(AMatrix &X, const AMatrix &Y)
+{
+    size_t m,n;
+    m = X.GetNrows();
+    n = X.GetNcols();
+
+    for (size_t i = 0; i < m; i++) {
+        for (size_t j = 0; j < n; j++) {
+            X(i,j) = Y(i,j);
+        }
+    }
+}
+
+/*! Apply functional to each element in the matrix. */
+//______________________________________________________________________________
+template <typename AMatrix, typename F>
+void applyMatrix(AMatrix &X, F f)
+{
+    size_t m,n;
+    m = X.GetNrows();
+    n = X.GetNcols();
+
+    for (size_t i = 0; i < m; i++) {
+        for (size_t j = 0; j < n; j++) {
+            X(i,j) = f(X(i,j));
+        }
+    }
+}
+
+/*! Combine elements of two given matrices into a single matrix using
+ *  the given function f. */
+//______________________________________________________________________________
+template <typename AMatrix, typename F>
+void zipWithMatrix(AMatrix &Z,
+                   F f,
+                   const AMatrix &X,
+                   const AMatrix &Y)
+{
+    size_t m,n;
+    m = X.GetNrows();
+    n = X.GetNcols();
+
+    for (size_t i = 0; i < m; i++) {
+        for (size_t j = 0; j < n; j++) {
+            Z(i,j) = f(X(i,j), Y(i,j));
+        }
+    }
+}
+
+/** Fold the function f over all elements of the matrix X, starting from
+ *  the value start. */
+//______________________________________________________________________________
+template <typename AMatrix, typename AFloat, typename F>
+AFloat reduce(F f, AFloat start, const AMatrix &X)
+{
+    size_t m,n;
+    m = X.GetNrows();
+    n = X.GetNcols();
+
+    AFloat result = start;
+
+    for (size_t i = 0; i < m; i++) {
+        for (size_t j = 0; j < n; j++) {
+            result = f(result, X(i,j));
+        }
+    }
+    return result;
+}
+
+/** Fold the function f over all elements of the matrix X (as in reduce)
+ *  and divide the result by the number of elements. */
+//______________________________________________________________________________
+template <typename AMatrix, typename AFloat, typename F>
+AFloat reduceMean(F f, AFloat start, const AMatrix &X)
+{
+    size_t m,n;
+    m = X.GetNrows();
+    n = X.GetNcols();
+
+    AFloat result = start;
+
+    for (size_t i = 0; i < m; i++) {
+        for (size_t j = 0; j < n; j++) {
+            result = f(result, X(i,j));
+        }
+    }
+    return result / (AFloat) (m * n);
+}
+
+/** Compute the relative error of x and y normalized by y. Specialized for
+ *  float and double to make sure both arguments are above expected machine
+ *  precision (1e-5 and 1e-10). */
+//______________________________________________________________________________
+template <typename AFloat>
+inline AFloat relativeError(const AFloat &x,
+                            const AFloat &y);
+
+
+//______________________________________________________________________________
+template <>
+inline Double_t relativeError(const Double_t &x,
+                              const Double_t &y)
+{
+    if ((std::abs(x) > 1e-10) && (std::abs(y) > 1e-10)) {
+        return std::fabs((x - y) / y);
+    } else {
+        return std::fabs(x - y);
+    }
+}
+
+//______________________________________________________________________________
+template <>
+inline Real_t relativeError(const Real_t &x,
+                            const Real_t &y)
+{
+    if ((std::abs(x) > 1e-5) && (std::abs(y) > 1e-5)) {
+        return std::fabs((x - y) / y);
+    } else {
+        return std::fabs(x - y);
+    }
+}
+
+/*! Compute the maximum, element-wise relative error of the matrices
+ *  X and Y normalized by the element of Y. Protected against division
+ *  by zero. */
+//______________________________________________________________________________
+template <typename AMatrix>
+auto maximumRelativeError(const AMatrix &X,
+                          const AMatrix &Y)
+-> decltype(X(0,0))
+{
+
+    using AFloat = decltype(X(0,0));
+
+    size_t m,n;
+    m = X.GetNrows();
+    n = X.GetNcols();
+
+    AFloat maximumError = 0.0;
+
+    for (size_t i = 0; i < m; i++) {
+        for (size_t j = 0; j < n; j++) {
+            AFloat error = relativeError(X(i,j), Y(i,j));
+            maximumError = std::max(error, maximumError);
+        }
+    }
+    return maximumError;
+}
+
+/*! Compute the central difference f(dx) - f(-dx) of the functional f,
+ *  used for numerical derivative checks (note: not divided by 2*dx). */
+//______________________________________________________________________________
+template <typename F, typename AFloat>
+inline AFloat finiteDifference(F f, AFloat dx)
+{
+    return f(dx) - f(0.0 - dx);
+}
+
+/*! Color code error. */
+//______________________________________________________________________________
+template <typename AFloat>
+std::string print_error(AFloat &e)
+{
+    std::ostringstream out{};
+
+    out << ("\e[");
+
+    if (e > 1e-5)
+        out << "31m";
+    else if (e > 1e-9)
+        out << "33m";
+    else
+        out << "32m";
+
+    out << e;
+    out << "\e[39m";
+
+    return out.str();
+}
+
+}
+}
+
+#endif
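
A minimal usage sketch for the helpers above (assuming it is compiled alongside Utility.h; not part of the patch): fill a reference matrix, apply a small relative perturbation to a copy, and report the color-coded maximum relative error the same way the tests do:

#include <iostream>
#include "TMatrixT.h"
#include "Utility.h"

int main()
{
   TMatrixT<Double_t> A(5, 5), B(5, 5);
   TMVA::DNN::randomMatrix(A);          // Gaussian-distributed entries
   TMVA::DNN::copyMatrix(B, A);         // B <- A
   B(0, 0) *= (1.0 + 1e-7);             // simulate a small numerical difference

   Double_t error = TMVA::DNN::maximumRelativeError(A, B);
   std::cout << "maximum relative error = "
             << TMVA::DNN::print_error(error) << std::endl;
   return (error < 1e-5) ? 0 : 1;
}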