From 4ebe8d8898f14ca69f09c87094948ced6283b807 Mon Sep 17 00:00:00 2001
From: Simon Pfreundschuh <simonpf@chalmers.se>
Date: Wed, 12 Oct 2016 06:26:01 +0200
Subject: [PATCH] Implemented multi-class classification for the CUDA and CPU
 backends.

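This adds a softmax output function (EOutputFunction::kSoftmax) and a
softmax cross-entropy loss (ELossFunction::kSoftmaxCrossEntropy) to the
reference, CPU and CUDA architectures, lets the data loaders fill one-hot
target matrices when the output width is larger than one, and wires the
MUTUALEXCLUSIVE error strategy of MethodDNN to the new loss and the
softmax output function.

As a sketch of the conventions used by the new loss functions (Y denotes
the m x n target matrix for a batch of m events and n classes, u the
linear activations of the last layer; the symbols are descriptive only
and do not appear in the code):

   softmax(u)_ij = exp(u_ij) / sum_k exp(u_ik)
   L             = -(1/m) sum_ij Y_ij log(softmax(u)_ij)
   dL/du_ij      = (1/m) (softmax(u)_ij sum_k Y_ik - Y_ij)

For one-hot targets sum_k Y_ik = 1, so the gradient reduces to
softmax(u)_ij - Y_ij scaled by 1/m. The CUDA kernels use one thread per
matrix row (1D thread blocks along the y-dimension); the CPU
implementation parallelizes over the rows of the batch.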
---
 tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h    |  10 ++
 .../TMVA/DNN/Architectures/Cpu/CpuBuffer.h    |   2 +
 tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h   |  10 ++
 .../TMVA/DNN/Architectures/Cuda/CudaBuffers.h |   3 +
 .../inc/TMVA/DNN/Architectures/Cuda/Device.h  |  25 +++-
 .../inc/TMVA/DNN/Architectures/Reference.h    | 110 ++++++++++--------
 tmva/tmva/inc/TMVA/DNN/Functions.h            |  21 +++-
 tmva/tmva/inc/TMVA/NeuralNet.icc              |   1 +
 .../src/DNN/Architectures/Cpu/CpuBuffer.cxx   |  38 ++++--
 .../DNN/Architectures/Cpu/LossFunctions.cxx   |  68 +++++++++++
 .../DNN/Architectures/Cpu/OutputFunctions.cxx |  24 ++++
 .../Architectures/Cuda/ActivationFunctions.cu |  52 ++++-----
 .../src/DNN/Architectures/Cuda/Arithmetic.cu  |   8 +-
 .../DNN/Architectures/Cuda/CudaBuffers.cxx    |  33 +++++-
 .../src/DNN/Architectures/Cuda/CudaMatrix.cu  |   4 +-
 .../src/DNN/Architectures/Cuda/Dropout.cu     |   4 +-
 .../src/DNN/Architectures/Cuda/Kernels.cuh    |  73 ++++++++++++
 .../DNN/Architectures/Cuda/LossFunctions.cu   |  52 +++++++--
 .../DNN/Architectures/Cuda/OutputFunctions.cu |  19 ++-
 .../src/DNN/Architectures/Cuda/Propagation.cu |   4 +-
 .../DNN/Architectures/Cuda/Regularization.cu  |  16 +--
 .../Architectures/Reference/LossFunctions.cxx |  96 +++++++++++----
 .../Reference/OutputFunctions.cxx             |  27 ++++-
 tmva/tmva/src/MethodDNN.cxx                   |  53 +++++++--
 tmva/tmva/test/DNN/TestDerivatives.h          |   3 +-
 tmva/tmva/test/DNN/TestLossFunctions.cxx      |   2 +-
 tmva/tmva/test/DNN/TestLossFunctions.h        |  99 ++++++++++++++++
 tmva/tmva/test/DNN/TestLossFunctionsCpu.cxx   |  20 +++-
 tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx  |  24 +++-
 tutorials/tmva/TMVAClassification.C           |  93 ++++++++-------
 tutorials/tmva/TMVAMulticlass.C               |  16 +++
 31 files changed, 798 insertions(+), 212 deletions(-)
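
A usage sketch (not part of the patch): with this change the MUTUALEXCLUSIVE
error strategy of MethodDNN selects the softmax cross-entropy loss together
with a softmax output layer, so a multiclass DNN can be booked roughly as in
the existing tutorials. The Layout string and the factory/dataloader objects
below are assumptions taken from the TMVA DNN tutorials, not from this patch:

   // Sketch: book a multiclass DNN; the remaining options (training
   // strategy, weight initialization, architecture) follow the tutorials.
   factory->BookMethod(dataloader, TMVA::Types::kDNN, "DNN",
                       "Layout=TANH|100,TANH|50,LINEAR:"
                       "ErrorStrategy=MUTUALEXCLUSIVE");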

diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h
index 8ef17292082..6cce3b68f66 100644
--- a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h
+++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h
@@ -162,6 +162,14 @@ public:
    static void CrossEntropyGradients(TCpuMatrix<Scalar_t> & dY,
                                      const TCpuMatrix<Scalar_t> & Y,
                                      const TCpuMatrix<Scalar_t> & output);
+
+    /** Softmax transformation is implicitly applied, thus \p output should
+     *  hold the linear activations of the last layer in the net. */
+   static Scalar_t SoftmaxCrossEntropy(const TCpuMatrix<Scalar_t> &Y,
+                                       const TCpuMatrix<Scalar_t> &output);
+   static void SoftmaxCrossEntropyGradients(TCpuMatrix<Scalar_t> & dY,
+                                            const TCpuMatrix<Scalar_t> & Y,
+                                            const TCpuMatrix<Scalar_t> & output);
    ///@}
 
    //____________________________________________________________________________
@@ -179,6 +187,8 @@ public:
    ///@{
    static void Sigmoid(TCpuMatrix<Scalar_t> &YHat,
                         const TCpuMatrix<Scalar_t> & );
+   static void Softmax(TCpuMatrix<Scalar_t> &YHat,
+                       const TCpuMatrix<Scalar_t> & );
    ///@}
 
    //____________________________________________________________________________
diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuBuffer.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuBuffer.h
index 192ece11b69..f09bd0a70f3 100644
--- a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuBuffer.h
+++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuBuffer.h
@@ -77,6 +77,8 @@ public:
     /** Copy data to another buffer. No real copying is performed, only the
      *  data pointers are swapped. */
     void CopyTo(TCpuBuffer &);
+
+    size_t GetSize() const {return fSize;}
 };
 
 } // namespace DNN
diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h
index 751d5264ea6..5eb9b34f1ba 100644
--- a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h
+++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h
@@ -167,6 +167,14 @@ public:
    static void CrossEntropyGradients(TCudaMatrix<AFloat> & dY,
                                      const TCudaMatrix<AFloat> & Y,
                                      const TCudaMatrix<AFloat> & output);
+
+    /** Softmax transformation is implicitly applied, thus \p output should
+     *  hold the linear activations of the last layer in the net. */
+   static AFloat SoftmaxCrossEntropy(const TCudaMatrix<AFloat> &Y,
+                                     const TCudaMatrix<AFloat> &output);
+   static void SoftmaxCrossEntropyGradients(TCudaMatrix<AFloat> & dY,
+                                            const TCudaMatrix<AFloat> & Y,
+                                            const TCudaMatrix<AFloat> & output);
    ///@}
 
    //____________________________________________________________________________
@@ -184,6 +192,8 @@ public:
    ///@{
    static void Sigmoid(TCudaMatrix<AFloat> &YHat,
                        const TCudaMatrix<AFloat> & );
+   static void Softmax(TCudaMatrix<AFloat> &YHat,
+                       const TCudaMatrix<AFloat> & );
    ///@}
 
    //____________________________________________________________________________
diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaBuffers.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaBuffers.h
index f03483ff444..9a12b48fdd2 100644
--- a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaBuffers.h
+++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaBuffers.h
@@ -43,6 +43,7 @@ class TCudaHostBuffer
 private:
 
    size_t                    fOffset;        ///< Offset for sub-buffers
+   size_t                    fSize;          ///< Size of the buffer
    mutable cudaStream_t      fComputeStream; ///< cudaStream for data transfer
    std::shared_ptr<AFloat *> fHostPointer;   ///< Pointer to the buffer data
 
@@ -77,6 +78,8 @@ public:
    inline AFloat & operator[](size_t index);
    inline AFloat   operator[](size_t index) const;
 
+   size_t GetSize() const {return fSize;}
+
 };
 
 /** TCudaDeviceBuffer
diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h
index 3ca30b74807..4f32ac973ab 100644
--- a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h
+++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h
@@ -43,17 +43,36 @@ public:
    /* Resulting block size. */
    static constexpr int BlockSize = BlockDimX * BlockDimY;
 
-   /* Return dim3 object representing the a BlockDimX x BlockDimY 2D
+   /* Return dim3 object representing a 1D block of size 1 along the
+    * x-dimension and BlockSize along the y-dimension. */
+   static dim3 BlockDims1D()
+   {
+       return dim3(1, BlockSize);
+   }
+
+   /* Return dim3 object representing a BlockDimX x BlockDimY 2D
     * block */
-   static dim3 BlockDims()
+   static dim3 BlockDims2D()
    {
       return dim3(BlockDimX, BlockDimY);
    }
 
+   /* Return 1D dim3 object representing the block grid covering the row-range
+    * of the matrix A along the y-dimension. */
+   template<typename AFloat>
+   static dim3 GridDims1D(const TCudaMatrix<AFloat> &A)
+   {
+      int gridDim = A.GetNrows() / TDevice::BlockSize;
+      if ((A.GetNrows() % TDevice::BlockSize) != 0) {
+          gridDim += 1;
+      }
+      return dim3(1, gridDim);
+   }
+
    /* Return 2D dim3 object representing the block grid consisting of two-dimensional
     * BlockDimX x BlockDimY blocks covering the matrix A */
    template<typename AFloat>
-   static dim3 GridDims(const TCudaMatrix<AFloat> &A)
+   static dim3 GridDims2D(const TCudaMatrix<AFloat> &A)
    {
       int gridDimX = A.GetNcols() / TDevice::BlockDimX;
       if ((A.GetNcols() % TDevice::BlockDimX) != 0)
diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h
index 7200f199c27..2c31d0f99e0 100644
--- a/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h
+++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h
@@ -31,15 +31,15 @@ namespace DNN
 * interface for the DNN implementation. The reference implementation uses the
 * TMatrixT class template to represent matrices.
 *
-* \tparam Real_t The floating point type used to represent scalars.
+* \tparam AReal The floating point type used to represent scalars.
 */
-template<typename Real_t>
+template<typename AReal>
 class TReference
 {
 public:
 
-   using Scalar_t     = Real_t;
-   using Matrix_t     = TMatrixT<Real_t>;
+   using Scalar_t     = AReal;
+   using Matrix_t     = TMatrixT<AReal>;
 
    //____________________________________________________________________________
    //
@@ -104,33 +104,33 @@ public:
     * and writes the results into the result matrix.
     */
    ///@{
-   static void Identity(TMatrixT<Real_t> & B);
-   static void IdentityDerivative(TMatrixT<Real_t> & B,
-                                  const TMatrixT<Real_t> & A);
+   static void Identity(TMatrixT<AReal> & B);
+   static void IdentityDerivative(TMatrixT<AReal> & B,
+                                  const TMatrixT<AReal> & A);
 
-   static void Relu(TMatrixT<Real_t> & B);
-   static void ReluDerivative(TMatrixT<Real_t> & B,
-                              const TMatrixT<Real_t> & A);
+   static void Relu(TMatrixT<AReal> & B);
+   static void ReluDerivative(TMatrixT<AReal> & B,
+                              const TMatrixT<AReal> & A);
 
-   static void Sigmoid(TMatrixT<Real_t> & B);
-   static void SigmoidDerivative(TMatrixT<Real_t> & B,
-                                 const TMatrixT<Real_t> & A);
+   static void Sigmoid(TMatrixT<AReal> & B);
+   static void SigmoidDerivative(TMatrixT<AReal> & B,
+                                 const TMatrixT<AReal> & A);
 
-   static void Tanh(TMatrixT<Real_t> & B);
-   static void TanhDerivative(TMatrixT<Real_t> & B,
-                              const TMatrixT<Real_t> & A);
+   static void Tanh(TMatrixT<AReal> & B);
+   static void TanhDerivative(TMatrixT<AReal> & B,
+                              const TMatrixT<AReal> & A);
 
-   static void SymmetricRelu(TMatrixT<Real_t> & B);
-   static void SymmetricReluDerivative(TMatrixT<Real_t> & B,
-                                       const TMatrixT<Real_t> & A);
+   static void SymmetricRelu(TMatrixT<AReal> & B);
+   static void SymmetricReluDerivative(TMatrixT<AReal> & B,
+                                       const TMatrixT<AReal> & A);
 
-   static void SoftSign(TMatrixT<Real_t> & B);
-   static void SoftSignDerivative(TMatrixT<Real_t> & B,
-                                  const TMatrixT<Real_t> & A);
+   static void SoftSign(TMatrixT<AReal> & B);
+   static void SoftSignDerivative(TMatrixT<AReal> & B,
+                                  const TMatrixT<AReal> & A);
 
-   static void Gauss(TMatrixT<Real_t> & B);
-   static void GaussDerivative(TMatrixT<Real_t> & B,
-                               const TMatrixT<Real_t> & A);
+   static void Gauss(TMatrixT<AReal> & B);
+   static void GaussDerivative(TMatrixT<AReal> & B,
+                               const TMatrixT<AReal> & A);
 
    ///@}
 
@@ -148,20 +148,28 @@ public:
     */
    ///@{
 
-   static Real_t MeanSquaredError(const TMatrixT<Real_t> &Y,
-                                  const TMatrixT<Real_t> &output);
-   static void MeanSquaredErrorGradients(TMatrixT<Real_t> & dY,
-                                         const TMatrixT<Real_t> &Y,
-                                         const TMatrixT<Real_t> &output);
+   static AReal MeanSquaredError(const TMatrixT<AReal> &Y,
+                                  const TMatrixT<AReal> &output);
+   static void MeanSquaredErrorGradients(TMatrixT<AReal> & dY,
+                                         const TMatrixT<AReal> &Y,
+                                         const TMatrixT<AReal> &output);
 
     /** Sigmoid transformation is implicitly applied, thus \p output should
      *  hold the linear activations of the last layer in the net. */
-   static Real_t CrossEntropy(const TMatrixT<Real_t> &Y,
-                              const TMatrixT<Real_t> &output);
+   static AReal CrossEntropy(const TMatrixT<AReal> &Y,
+                              const TMatrixT<AReal> &output);
 
-   static void CrossEntropyGradients(TMatrixT<Real_t> & dY,
-                                     const TMatrixT<Real_t> & Y,
-                                     const TMatrixT<Real_t> & output);
+   static void CrossEntropyGradients(TMatrixT<AReal> & dY,
+                                     const TMatrixT<AReal> & Y,
+                                     const TMatrixT<AReal> & output);
+
+    /** Softmax transformation is implicitly applied, thus \p output should
+     *  hold the linear activations of the last layer in the net. */
+   static AReal SoftmaxCrossEntropy(const TMatrixT<AReal> &Y,
+                                    const TMatrixT<AReal> &output);
+   static void SoftmaxCrossEntropyGradients(TMatrixT<AReal> & dY,
+                                            const TMatrixT<AReal> & Y,
+                                            const TMatrixT<AReal> & output);
    ///@}
 
    //____________________________________________________________________________
@@ -177,8 +185,10 @@ public:
     * classification.
     */
    ///@{
-   static void Sigmoid(TMatrixT<Real_t> &YHat,
-                        const TMatrixT<Real_t> & );
+   static void Sigmoid(TMatrixT<AReal> &YHat,
+                       const TMatrixT<AReal> & );
+   static void Softmax(TMatrixT<AReal> &YHat,
+                       const TMatrixT<AReal> & );
    ///@}
 
    //____________________________________________________________________________
@@ -195,15 +205,15 @@ public:
     */
    ///@{
 
-   static Real_t L1Regularization(const TMatrixT<Real_t> & W);
-   static void AddL1RegularizationGradients(TMatrixT<Real_t> & A,
-                                            const TMatrixT<Real_t> & W,
-                                            Real_t weightDecay);
+   static AReal L1Regularization(const TMatrixT<AReal> & W);
+   static void AddL1RegularizationGradients(TMatrixT<AReal> & A,
+                                            const TMatrixT<AReal> & W,
+                                            AReal weightDecay);
 
-   static Real_t L2Regularization(const TMatrixT<Real_t> & W);
-   static void AddL2RegularizationGradients(TMatrixT<Real_t> & A,
-                                            const TMatrixT<Real_t> & W,
-                                            Real_t weightDecay);
+   static AReal L2Regularization(const TMatrixT<AReal> & W);
+   static void AddL2RegularizationGradients(TMatrixT<AReal> & A,
+                                            const TMatrixT<AReal> & W,
+                                            AReal weightDecay);
    ///@}
 
    //____________________________________________________________________________
@@ -218,13 +228,13 @@ public:
     */
    ///@{
 
-   static void InitializeGauss(TMatrixT<Real_t> & A);
+   static void InitializeGauss(TMatrixT<AReal> & A);
 
-   static void InitializeUniform(TMatrixT<Real_t> & A);
+   static void InitializeUniform(TMatrixT<AReal> & A);
 
-   static void InitializeIdentity(TMatrixT<Real_t> & A);
+   static void InitializeIdentity(TMatrixT<AReal> & A);
 
-   static void InitializeZero(TMatrixT<Real_t> & A);
+   static void InitializeZero(TMatrixT<AReal> & A);
 
    ///@}
 
@@ -239,7 +249,7 @@ public:
 
    /** Apply dropout with activation probability \p p to the given
     *  matrix \p A and scale the result by reciprocal of \p p. */
-   static void Dropout(TMatrixT<Real_t> & A, Real_t dropoutProbability);
+   static void Dropout(TMatrixT<AReal> & A, AReal dropoutProbability);
 
    ///@}
 };
diff --git a/tmva/tmva/inc/TMVA/DNN/Functions.h b/tmva/tmva/inc/TMVA/DNN/Functions.h
index 5e2f09da46d..847d96880b1 100644
--- a/tmva/tmva/inc/TMVA/DNN/Functions.h
+++ b/tmva/tmva/inc/TMVA/DNN/Functions.h
@@ -43,7 +43,8 @@ enum class EActivationFunction
 enum class EOutputFunction
 {
    kIdentity = 'I',
-   kSigmoid  = 'S'
+   kSigmoid  = 'S',
+   kSoftmax  = 'M'
 };
 
 /*! Enum that represents objective functions for the net, i.e. functions
@@ -52,8 +53,9 @@ enum class EOutputFunction
 *  in the training process. */
 enum class ELossFunction
 {
-    kCrossEntropy     = 'C',
-    kMeanSquaredError = 'R'
+    kCrossEntropy        = 'C',
+    kMeanSquaredError    = 'R',
+    kSoftmaxCrossEntropy = 'S'
 };
 
 /*! Enum representing the regularization type applied for a given layer */
@@ -147,6 +149,8 @@ inline void evaluate(typename Architecture_t::Matrix_t &A,
                                       break;
     case EOutputFunction::kSigmoid  : Architecture_t::Sigmoid(A, X);
                                       break;
+    case EOutputFunction::kSoftmax  : Architecture_t::Softmax(A, X);
+                                      break;
     }
 }
 
@@ -169,6 +173,8 @@ inline auto evaluate(ELossFunction f,
         return Architecture_t::CrossEntropy(Y, output);
     case ELossFunction::kMeanSquaredError :
         return Architecture_t::MeanSquaredError(Y, output);
+    case ELossFunction::kSoftmaxCrossEntropy :
+        return Architecture_t::SoftmaxCrossEntropy(Y, output);
     }
     return 0.0;
 }
@@ -178,9 +184,9 @@ inline auto evaluate(ELossFunction f,
 //______________________________________________________________________________
 template<typename Architecture_t>
 inline void evaluateGradients(typename Architecture_t::Matrix_t & dY,
-                                ELossFunction f,
-                                const typename Architecture_t::Matrix_t &Y,
-                                const typename Architecture_t::Matrix_t &output)
+                              ELossFunction f,
+                              const typename Architecture_t::Matrix_t &Y,
+                              const typename Architecture_t::Matrix_t &output)
 {
     switch(f)
     {
@@ -190,6 +196,9 @@ inline void evaluateGradients(typename Architecture_t::Matrix_t & dY,
     case ELossFunction::kMeanSquaredError :
         Architecture_t::MeanSquaredErrorGradients(dY, Y, output);
         break;
+    case ELossFunction::kSoftmaxCrossEntropy :
+        Architecture_t::SoftmaxCrossEntropyGradients(dY, Y, output);
+        break;
     }
 }
 
diff --git a/tmva/tmva/inc/TMVA/NeuralNet.icc b/tmva/tmva/inc/TMVA/NeuralNet.icc
index 1bf4bb80770..0797345b867 100644
--- a/tmva/tmva/inc/TMVA/NeuralNet.icc
+++ b/tmva/tmva/inc/TMVA/NeuralNet.icc
@@ -1606,6 +1606,7 @@ template <typename LAYERDATA>
             }
             case ModeErrorFunction::CROSSENTROPY_MUTUALEXCLUSIVE:
             {
+                std::cout << "softmax." << std::endl;
                 assert (!TMVA::DNN::isFlagSet (ModeOutputValues::DIRECT, layerData.outputMode ()));
                 std::vector<double> probabilities = layerData.probabilities ();
                 error = softMaxCrossEntropy (begin (probabilities), end (probabilities), 
diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/CpuBuffer.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/CpuBuffer.cxx
index c28a738ef9a..20581fdf0dd 100644
--- a/tmva/tmva/src/DNN/Architectures/Cpu/CpuBuffer.cxx
+++ b/tmva/tmva/src/DNN/Architectures/Cpu/CpuBuffer.cxx
@@ -177,20 +177,30 @@ void TDataLoader<TMVAInput_t, TCpu<Double_t>>::CopyOutput(
     size_t batchSize)
 {
    Event * event  = fData.front();
-   size_t n       = (event->GetNTargets() == 0) ? 1 : event->GetNTargets();
+   size_t n       = buffer.GetSize() / batchSize;
 
    // Copy target(s).
 
    for (size_t i = 0; i < batchSize; i++) {
-       size_t sampleIndex = * sampleIterator++;
-       event = fData[sampleIndex];
+      size_t sampleIndex = * sampleIterator++;
+      event = fData[sampleIndex];
       for (size_t j = 0; j < n; j++) {
          // Copy output matrices.
          size_t bufferIndex = j * batchSize + i;
+         // Classification
          if (event->GetNTargets() == 0) {
-            buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
+            if (n == 1) {
+               // Binary.
+               buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
+            } else {
+               // Multiclass.
+               buffer[bufferIndex] = 0.0;
+               if (j == event->GetClass()) {
+                  buffer[bufferIndex] = 1.0;
+               }
+            }
          } else {
-            buffer[bufferIndex] = event->GetTarget(j);
+            buffer[bufferIndex] = static_cast<Real_t>(event->GetTarget(j));
          }
       }
    }
@@ -226,18 +236,28 @@ void TDataLoader<TMVAInput_t, TCpu<Real_t>>::CopyOutput(
     size_t batchSize)
 {
    Event * event  = fData.front();
-   size_t n       = (event->GetNTargets() == 0) ? 1 : event->GetNTargets();
+   size_t n       = buffer.GetSize() / batchSize;
 
    // Copy target(s).
 
    for (size_t i = 0; i < batchSize; i++) {
-       size_t sampleIndex = * sampleIterator++;
-       event = fData[sampleIndex];
+      size_t sampleIndex = * sampleIterator++;
+      event = fData[sampleIndex];
       for (size_t j = 0; j < n; j++) {
          // Copy output matrices.
          size_t bufferIndex = j * batchSize + i;
+         // Classification
          if (event->GetNTargets() == 0) {
-            buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
+            if (n == 1) {
+               // Binary.
+               buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
+            } else {
+               // Multiclass.
+               buffer[bufferIndex] = 0.0;
+               if (j == event->GetClass()) {
+                  buffer[bufferIndex] = 1.0;
+               }
+            }
          } else {
             buffer[bufferIndex] = static_cast<Real_t>(event->GetTarget(j));
          }
diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/LossFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/LossFunctions.cxx
index 43af30423ad..3800e9b45b5 100644
--- a/tmva/tmva/src/DNN/Architectures/Cpu/LossFunctions.cxx
+++ b/tmva/tmva/src/DNN/Architectures/Cpu/LossFunctions.cxx
@@ -119,5 +119,73 @@ void TCpu<AFloat>::CrossEntropyGradients(
    Y.GetThreadExecutor().Map(f, ROOT::TSeqI(Y.GetNElements()));
 }
 
+//______________________________________________________________________________
+template<typename AFloat>
+AFloat TCpu<AFloat>::SoftmaxCrossEntropy(
+    const TCpuMatrix<AFloat> &Y,
+    const TCpuMatrix<AFloat> &output)
+{
+   const AFloat  *dataY      = Y.GetRawDataPointer();
+   const AFloat  *dataOutput = output.GetRawDataPointer();
+   std::vector<AFloat> temp(Y.GetNrows());
+   size_t m = Y.GetNrows();
+   size_t n = Y.GetNcols();
+   AFloat norm = 1.0 / ((AFloat) m);
+
+   auto f = [&dataY, &dataOutput, &temp, n, m](UInt_t workerID)
+   {
+      AFloat sum = 0.0;
+      for (size_t j = 0; j < n; j++) {
+         sum += exp(dataOutput[workerID + j * m]);
+      }
+      for (size_t j = 0; j < n; j++) {
+         temp[workerID] -=
+            dataY[workerID + j * m] * log(exp(dataOutput[workerID + j * m]) / sum);
+      }
+      return 0;
+   };
+
+   auto reduction = [](AFloat sum1, AFloat sum2)
+   {
+      return sum1 + sum2;
+   };
+
+   Y.GetThreadExecutor().Map(f, ROOT::TSeqI(Y.GetNrows()));
+   return norm * Y.GetThreadExecutor().Reduce(temp, reduction);
+}
+
+//______________________________________________________________________________
+template<typename AFloat>
+void TCpu<AFloat>::SoftmaxCrossEntropyGradients(
+    TCpuMatrix<AFloat> & dY,
+    const TCpuMatrix<AFloat> & Y,
+    const TCpuMatrix<AFloat> & output)
+{
+         AFloat  *dataDY     = dY.GetRawDataPointer();
+   const AFloat  *dataY      = Y.GetRawDataPointer();
+   const AFloat  *dataOutput = output.GetRawDataPointer();
+   size_t m = Y.GetNrows();
+   size_t n = Y.GetNcols();
+   AFloat norm = 1.0 / ((AFloat) m);
+
+   auto f = [&dataDY, &dataY, &dataOutput, norm, n, m](UInt_t workerID)
+   {
+      AFloat sum  = 0.0;
+      AFloat sumY = 0.0;
+      for (size_t j = 0; j < n; j++) {
+         sum  += exp(dataOutput[workerID + j * m]);
+         sumY += dataY[workerID + j * m];
+      }
+      for (size_t j = 0; j < n; j++) {
+         dataDY[workerID + j * m] =
+            norm * (exp(dataOutput[workerID + j * m]) / sum * sumY - dataY[workerID + j * m]);
+
+      }
+      return 0;
+   };
+
+   Y.GetThreadExecutor().Map(f, ROOT::TSeqI(Y.GetNrows()));
+}
+
 } // namespace DNN
 } // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/OutputFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/OutputFunctions.cxx
index 4c3fd3994f7..96ad8cc1fb2 100644
--- a/tmva/tmva/src/DNN/Architectures/Cpu/OutputFunctions.cxx
+++ b/tmva/tmva/src/DNN/Architectures/Cpu/OutputFunctions.cxx
@@ -29,5 +29,29 @@ void TCpu<AFloat>::Sigmoid(TCpuMatrix<AFloat> & B,
    B.MapFrom(f, A);
 }
 
+template<typename AFloat>
+void TCpu<AFloat>::Softmax(TCpuMatrix<AFloat> & B,
+                           const TCpuMatrix<AFloat> & A)
+{
+   const AFloat  *dataA = A.GetRawDataPointer();
+         AFloat  *dataB = B.GetRawDataPointer();
+   size_t n = A.GetNcols();
+   size_t m = A.GetNrows();
+
+   auto f = [&dataA, &dataB, n, m](UInt_t workerID)
+   {
+      AFloat sum = 0.0;
+      for (size_t i = 0; i < n; i++) {
+         sum += exp(dataA[workerID + i * m]);
+      }
+      for (size_t i = 0; i < n; i++) {
+         dataB[workerID + i * m] = exp(dataA[workerID + i * m]) / sum;
+      }
+      return 0;
+   };
+
+   B.GetThreadExecutor().Map(f, ROOT::TSeqI(A.GetNrows()));
+}
+
 } // namespace DNN
 } // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu
index 5654680158b..d9f0c7ae854 100644
--- a/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu
@@ -28,8 +28,8 @@ template<typename AFloat>
 void TCuda<AFloat>::IdentityDerivative(TCudaMatrix<AFloat> & B,
                                            const TCudaMatrix<AFloat> & A)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(B);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(B);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::IdentityDerivative<<<gridDims, blockDims, 0, s>>>(
        B.GetDataPointer(),
@@ -42,8 +42,8 @@ void TCuda<AFloat>::IdentityDerivative(TCudaMatrix<AFloat> & B,
 template<typename AFloat>
 void TCuda<AFloat>::Relu(TCudaMatrix<AFloat> & A)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(A);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(A);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::Relu<<<gridDims, blockDims, 0, s>>>(
        A.GetDataPointer(),
@@ -56,8 +56,8 @@ template<typename AFloat>
 void TCuda<AFloat>::ReluDerivative(TCudaMatrix<AFloat> & B,
                                        const TCudaMatrix<AFloat> & A)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(B);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(B);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::ReluDerivative<<<gridDims, blockDims, 0, s>>>(
        B.GetDataPointer(),
@@ -71,8 +71,8 @@ void TCuda<AFloat>::ReluDerivative(TCudaMatrix<AFloat> & B,
 template<typename AFloat>
 void TCuda<AFloat>::Sigmoid(TCudaMatrix<AFloat> & A)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(A);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(A);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::Sigmoid<<<gridDims, blockDims, 0, s>>>(
        A.GetDataPointer(),
@@ -85,8 +85,8 @@ template<typename AFloat>
 void TCuda<AFloat>::SigmoidDerivative(TCudaMatrix<AFloat> & B,
                                           const TCudaMatrix<AFloat> & A)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(B);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(B);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::SigmoidDerivative<<<gridDims, blockDims, 0, s>>>(
        B.GetDataPointer(),
@@ -100,8 +100,8 @@ void TCuda<AFloat>::SigmoidDerivative(TCudaMatrix<AFloat> & B,
 template<typename AFloat>
 void TCuda<AFloat>::Tanh(TCudaMatrix<AFloat> & A)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(A);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(A);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::Tanh<<<gridDims, blockDims, 0, s>>>(
        A.GetDataPointer(),
@@ -114,8 +114,8 @@ template<typename AFloat>
 void TCuda<AFloat>::TanhDerivative(TCudaMatrix<AFloat> & B,
                                        const TCudaMatrix<AFloat> & A)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(B);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(B);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::TanhDerivative<<<gridDims, blockDims, 0, s>>>(
        B.GetDataPointer(),
@@ -129,8 +129,8 @@ void TCuda<AFloat>::TanhDerivative(TCudaMatrix<AFloat> & B,
 template<typename AFloat>
 void TCuda<AFloat>::SymmetricRelu(TCudaMatrix<AFloat> & A)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(A);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(A);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::SymmetricRelu<<<gridDims, blockDims, 0, s>>>(
        A.GetDataPointer(),
@@ -143,8 +143,8 @@ template<typename AFloat>
 void TCuda<AFloat>::SymmetricReluDerivative(TCudaMatrix<AFloat> & B,
                                                 const TCudaMatrix<AFloat> & A)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(B);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(B);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::SymmetricReluDerivative<<<gridDims, blockDims, 0, s>>>(
        B.GetDataPointer(),
@@ -158,8 +158,8 @@ void TCuda<AFloat>::SymmetricReluDerivative(TCudaMatrix<AFloat> & B,
 template<typename AFloat>
 void TCuda<AFloat>::SoftSign(TCudaMatrix<AFloat> & A)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(A);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(A);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::SoftSign<<<gridDims, blockDims, 0, s>>>(
        A.GetDataPointer(),
@@ -172,8 +172,8 @@ template<typename AFloat>
 void TCuda<AFloat>::SoftSignDerivative(TCudaMatrix<AFloat> & B,
                                            const TCudaMatrix<AFloat> & A)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(B);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(B);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::SoftSignDerivative<<<gridDims, blockDims, 0, s>>>(
        B.GetDataPointer(),
@@ -187,8 +187,8 @@ void TCuda<AFloat>::SoftSignDerivative(TCudaMatrix<AFloat> & B,
 template<typename AFloat>
 void TCuda<AFloat>::Gauss(TCudaMatrix<AFloat> & A)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(A);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(A);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::Gauss<<<gridDims, blockDims, 0, s>>>(
        A.GetDataPointer(),
@@ -201,8 +201,8 @@ template<typename AFloat>
 void TCuda<AFloat>::GaussDerivative(TCudaMatrix<AFloat> & B,
                                     const TCudaMatrix<AFloat> & A)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(B);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(B);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::GaussDerivative<<<gridDims, blockDims, 0, s>>>(
        B.GetDataPointer(),
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu
index f61391687f5..43ffa15d327 100644
--- a/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu
@@ -135,8 +135,8 @@ template<typename AFloat>
 void TCuda<AFloat>::Hadamard(TCudaMatrix<AFloat> & B,
                              const TCudaMatrix<AFloat> &A)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(B);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(B);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::Hadamard<<<gridDims, blockDims, 0, s>>>(B.GetDataPointer(),
                                                               A.GetDataPointer(),
@@ -149,8 +149,8 @@ void TCuda<AFloat>::Hadamard(TCudaMatrix<AFloat> & B,
 template<typename AFloat>
 AFloat TCuda<AFloat>::Sum(const TCudaMatrix<AFloat> & A)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(A);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(A);
    cudaStream_t s = A.GetComputeStream();
 
    TCudaMatrix<AFloat>::ResetDeviceReturn();
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/CudaBuffers.cxx b/tmva/tmva/src/DNN/Architectures/Cuda/CudaBuffers.cxx
index 68be42d7751..28bfcd936c9 100644
--- a/tmva/tmva/src/DNN/Architectures/Cuda/CudaBuffers.cxx
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/CudaBuffers.cxx
@@ -35,7 +35,7 @@ void TCudaHostBuffer<AFloat>::TDestructor::operator()(AFloat ** devicePointer)
 //______________________________________________________________________________
 template<typename AFloat>
 TCudaHostBuffer<AFloat>::TCudaHostBuffer(size_t size)
-    : fOffset(0), fComputeStream(0), fDestructor()
+    : fOffset(0), fSize(size), fComputeStream(0), fDestructor()
 {
    AFloat ** pointer = new AFloat * [1];
    cudaMallocHost(pointer, size * sizeof(AFloat));
@@ -52,10 +52,11 @@ TCudaHostBuffer<AFloat>::operator AFloat * () const
 //______________________________________________________________________________
 template<typename AFloat>
 TCudaHostBuffer<AFloat> TCudaHostBuffer<AFloat>::GetSubBuffer(size_t offset,
-                                                              size_t /*size*/)
+                                                              size_t size)
 {
    TCudaHostBuffer buffer = *this;
    buffer.fOffset         = offset;
+   buffer.fSize           = size;
    return buffer;
 }
 
@@ -209,7 +210,7 @@ void TDataLoader<TMVAInput_t, TCuda<float>>::CopyOutput(
     size_t batchSize)
 {
    Event * event  = fData.front();
-   size_t n       = (event->GetNTargets() == 0) ? 1 : event->GetNTargets();
+   size_t n       = buffer.GetSize() / batchSize;
 
    // Copy target(s).
 
@@ -219,8 +220,18 @@ void TDataLoader<TMVAInput_t, TCuda<float>>::CopyOutput(
       for (size_t j = 0; j < n; j++) {
          // Copy output matrices.
          size_t bufferIndex = j * batchSize + i;
+         // Classification
          if (event->GetNTargets() == 0) {
-            buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
+            if (n == 1) {
+               // Binary.
+               buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
+            } else {
+               // Multiclass.
+               buffer[bufferIndex] = 0.0;
+               if (j == event->GetClass()) {
+                  buffer[bufferIndex] = 1.0;
+               }
+            }
          } else {
             buffer[bufferIndex] = static_cast<float>(event->GetTarget(j));
          }
@@ -298,7 +309,7 @@ void TDataLoader<TMVAInput_t, TCuda<double>>::CopyOutput(
     size_t batchSize)
 {
    Event * event  = fData.front();
-   size_t n       = (event->GetNTargets() == 0) ? 1 : event->GetNTargets();
+   size_t n       = buffer.GetSize() / batchSize;
 
    // Copy target(s).
 
@@ -308,8 +319,18 @@ void TDataLoader<TMVAInput_t, TCuda<double>>::CopyOutput(
       for (size_t j = 0; j < n; j++) {
          // Copy output matrices.
          size_t bufferIndex = j * batchSize + i;
+         // Classification
          if (event->GetNTargets() == 0) {
-            buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
+            if (n == 1) {
+               // Binary.
+               buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
+            } else {
+               // Multiclass.
+               buffer[bufferIndex] = 0.0;
+               if (j == event->GetClass()) {
+                  buffer[bufferIndex] = 1.0;
+               }
+            }
          } else {
             buffer[bufferIndex] = event->GetTarget(j);
          }
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu b/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu
index 0a3f27b3f8a..9d40161b371 100644
--- a/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu
@@ -130,8 +130,8 @@ inline void TCudaMatrix<AFloat>::InitializeCuda()
 template<typename AFloat>
 void TCudaMatrix<AFloat>::InitializeCurandStates()
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(*this);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(*this);
    CurandInitializationKernel<<<gridDims, blockDims>>>(time(nullptr), fCurandStates);
 }
 
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu
index 501e2e6d249..6ae9213a0b7 100644
--- a/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu
@@ -25,8 +25,8 @@ template<typename AFloat>
 void TCuda<AFloat>::Dropout(TCudaMatrix<AFloat> &A,
                             AFloat dropoutProbability)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(A);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(A);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::Dropout<<<gridDims, blockDims, 0, s>>>(
        A.GetDataPointer(),
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh b/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh
index a0ace5f1bbd..e828b9038bd 100644
--- a/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh
@@ -304,6 +304,7 @@ __global__ void Sigmoid(AFloat * B,
       B[index] = sig;
    }
 }
+
 //____________________________________________________________________________
 template<typename AFloat>
 __global__ void SigmoidDerivative(AFloat * B,
@@ -320,6 +321,25 @@ __global__ void SigmoidDerivative(AFloat * B,
    }
 }
 
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void Softmax(AFloat * B,
+                        const AFloat * A,
+                        int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+
+   if (i < m) {
+      AFloat sum = 0.0;
+      for (int j = 0; j < n; j++) {
+         sum += exp(A[i + j * m]);
+      }
+      for (int j = 0; j < n; j++) {
+         B[i + j * m] = exp(A[i + j * m]) / sum;
+      }
+   }
+}
+
 //____________________________________________________________________________
 template<typename AFloat>
 __global__ void Tanh(AFloat * A,
@@ -602,6 +622,59 @@ __global__ void CrossEntropyGradients(AFloat * dY,
    }
 }
 
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void SoftmaxCrossEntropy(AFloat * result,
+                                    const AFloat * Y,
+                                    const AFloat * output,
+                                    int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   int tid = threadIdx.y;
+
+   __shared__ AFloat sdata[TDevice::BlockSize];
+   AFloat norm = 1.0 / ((AFloat) m);
+
+   sdata[tid] = 0.0;
+   if (i < m) {
+      AFloat sum  = 0.0;
+      for (int j = 0; j < n; j++) {
+         sum  += exp(output[i + j * m]);
+      }
+      for (int j = 0; j < n; j++) {
+         sdata[tid] += Y[i + j * m] * log(exp(output[i + j * m]) / sum);
+      }
+      sdata[tid] *= - norm;
+   } else {
+      sdata[tid] = 0.0;
+   }
+
+   ReduceSum(result, sdata);
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+__global__ void SoftmaxCrossEntropyGradients(AFloat * dY,
+                                             const AFloat * Y,
+                                             const AFloat * output,
+                                             int m, int n)
+{
+   int i = blockDim.y * blockIdx.y + threadIdx.y;
+   AFloat norm = 1.0 / ((AFloat) m);
+
+   if (i < m) {
+      AFloat sum  = 0.0;
+      AFloat sumY = 0.0;
+      for (int j = 0; j < n; j++) {
+         sum  += exp(output[i + j * m]);
+         sumY += Y[i + j * m];
+      }
+      for (int j = 0; j < n; j++) {
+         dY[i + j * m] = norm * (sumY * exp(output[i + j * m]) / sum - Y[i + j * m]);
+      }
+   }
+}
+
 //____________________________________________________________________________
 template<typename AFloat>
 __global__ void ReduceMatrix(AFloat *result,
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu
index 8a242468ff7..c65bf9b8de3 100644
--- a/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu
@@ -28,8 +28,8 @@ template<typename AFloat>
 AFloat TCuda<AFloat>::MeanSquaredError(const TCudaMatrix<AFloat> & Y,
                                        const TCudaMatrix<AFloat> & output)
 {
-    dim3 blockDims = TDevice::BlockDims();
-    dim3 gridDims  = TDevice::GridDims(Y);
+    dim3 blockDims = TDevice::BlockDims2D();
+    dim3 gridDims  = TDevice::GridDims2D(Y);
     cudaStream_t s = Y.GetComputeStream();
     TCudaMatrix<AFloat>::ResetDeviceReturn();
     ::TMVA::DNN::Cuda::MeanSquaredError<<<gridDims, blockDims, 0, s>>>(
@@ -47,8 +47,8 @@ void TCuda<AFloat>::MeanSquaredErrorGradients(TCudaMatrix<AFloat> & dY,
                                               const TCudaMatrix<AFloat> & Y,
                                               const TCudaMatrix<AFloat> & output)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(Y);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(Y);
    cudaStream_t s = output.GetComputeStream();
    ::TMVA::DNN::Cuda::MeanSquaredErrorGradients<<<gridDims, blockDims, 0, s>>>(
        dY.GetDataPointer(),
@@ -64,8 +64,8 @@ template<typename AFloat>
 AFloat TCuda<AFloat>::CrossEntropy(const TCudaMatrix<AFloat> & Y,
                                    const TCudaMatrix<AFloat> & output)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(Y);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(Y);
    TCudaMatrix<AFloat>::ResetDeviceReturn();
    cudaStream_t s = Y.GetComputeStream();
    ::TMVA::DNN::Cuda::CrossEntropy<<<gridDims, blockDims, 0, s>>>(
@@ -83,8 +83,8 @@ void TCuda<AFloat>::CrossEntropyGradients(TCudaMatrix<AFloat> & dY,
                                           const TCudaMatrix<AFloat> & Y,
                                           const TCudaMatrix<AFloat> & output)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(Y);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(Y);
    cudaStream_t s = output.GetComputeStream();
    ::TMVA::DNN::Cuda::CrossEntropyGradients<<<gridDims, blockDims, 0, s>>>(
        dY.GetDataPointer(),
@@ -95,5 +95,41 @@ void TCuda<AFloat>::CrossEntropyGradients(TCudaMatrix<AFloat> & dY,
    dY.SetComputeStream(s);
 }
 
+//____________________________________________________________________________
+template<typename AFloat>
+AFloat TCuda<AFloat>::SoftmaxCrossEntropy(const TCudaMatrix<AFloat> & Y,
+                                          const TCudaMatrix<AFloat> & output)
+{
+   dim3 blockDims = TDevice::BlockDims1D();
+   dim3 gridDims  = TDevice::GridDims1D(Y);
+   TCudaMatrix<AFloat>::ResetDeviceReturn();
+   cudaStream_t s = Y.GetComputeStream();
+   ::TMVA::DNN::Cuda::SoftmaxCrossEntropy<<<gridDims, blockDims, 0, s>>>(
+       TCudaMatrix<AFloat>::GetDeviceReturnPointer(),
+       Y.GetDataPointer(),
+       output.GetDataPointer(),
+       (int) Y.GetNrows(),
+       (int) Y.GetNcols());
+   return TCudaMatrix<AFloat>::GetDeviceReturn();
+}
+
+//____________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::SoftmaxCrossEntropyGradients(TCudaMatrix<AFloat> & dY,
+                                                 const TCudaMatrix<AFloat> & Y,
+                                                 const TCudaMatrix<AFloat> & output)
+{
+   dim3 blockDims = TDevice::BlockDims1D();
+   dim3 gridDims  = TDevice::GridDims1D(Y);
+   cudaStream_t s = output.GetComputeStream();
+   ::TMVA::DNN::Cuda::SoftmaxCrossEntropyGradients<<<gridDims, blockDims, 0, s>>>(
+       dY.GetDataPointer(),
+       Y.GetDataPointer(),
+       output.GetDataPointer(),
+       (int) Y.GetNrows(),
+       (int) Y.GetNcols());
+   dY.SetComputeStream(s);
+}
+
 } // namespace DNN
 } // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu
index 039fb27a8e3..303cc4a429d 100644
--- a/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu
@@ -27,8 +27,8 @@ template<typename AFloat>
 void TCuda<AFloat>::Sigmoid(TCudaMatrix<AFloat> & B,
                             const TCudaMatrix<AFloat> & A)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(B);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(B);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::Sigmoid<<<gridDims, blockDims, 0, s>>>(B.GetDataPointer(),
                                                              A.GetDataPointer(),
@@ -37,5 +37,20 @@ void TCuda<AFloat>::Sigmoid(TCudaMatrix<AFloat> & B,
    B.SetComputeStream(s);
 }
 
+//______________________________________________________________________________
+template<typename AFloat>
+void TCuda<AFloat>::Softmax(TCudaMatrix<AFloat> & B,
+                            const TCudaMatrix<AFloat> & A)
+{
+   dim3 blockDims = TDevice::BlockDims1D();
+   dim3 gridDims  = TDevice::GridDims1D(B);
+   cudaStream_t s = A.GetComputeStream();
+   ::TMVA::DNN::Cuda::Softmax<<<gridDims, blockDims, 0, s>>>(B.GetDataPointer(),
+                                                             A.GetDataPointer(),
+                                                             (int) A.GetNrows(),
+                                                             (int) A.GetNcols());
+   B.SetComputeStream(s);
+}
+
 } // namespace DNN
 } // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu
index 047c6411b52..35331b7855f 100644
--- a/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu
@@ -77,8 +77,8 @@ template<typename AFloat>
 void TCuda<AFloat>::AddRowWise(TCudaMatrix<AFloat> &Weights,
                                const TCudaMatrix<AFloat> &theta)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(Weights);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(Weights);
    cudaStream_t s = Weights.GetComputeStream();
    ::TMVA::DNN::Cuda::AddRowWise<<<gridDims, blockDims, 0, s>>>(
        Weights.GetDataPointer(),
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu
index 67851eaef8b..dce384ebb0b 100644
--- a/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu
+++ b/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu
@@ -26,8 +26,8 @@ namespace DNN  {
 template<typename AFloat>
 AFloat TCuda<AFloat>::L1Regularization(const TCudaMatrix<AFloat> & A)
 {
-    dim3 blockDims = TDevice::BlockDims();
-    dim3 gridDims  = TDevice::GridDims(A);
+    dim3 blockDims = TDevice::BlockDims2D();
+    dim3 gridDims  = TDevice::GridDims2D(A);
     cudaStream_t s = A.GetComputeStream();
     TCudaMatrix<AFloat>::ResetDeviceReturn();
     ::TMVA::DNN::Cuda::AbsoluteSum<<<gridDims, blockDims, 0, s>>>(
@@ -44,8 +44,8 @@ void TCuda<AFloat>::AddL1RegularizationGradients(TCudaMatrix<AFloat> & B,
                                                  const TCudaMatrix<AFloat> & A,
                                                  AFloat weightDecay)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(B);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(B);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::AddL1RegularizationGradients<<<gridDims, blockDims, 0, s>>>(
        B.GetDataPointer(),
@@ -59,8 +59,8 @@ void TCuda<AFloat>::AddL1RegularizationGradients(TCudaMatrix<AFloat> & B,
 template<typename AFloat>
 AFloat TCuda<AFloat>::L2Regularization(const TCudaMatrix<AFloat> & A)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(A);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(A);
    cudaStream_t s = A.GetComputeStream();
    TCudaMatrix<AFloat>::ResetDeviceReturn();
    ::TMVA::DNN::Cuda::SquaredSum<<<gridDims, blockDims, 0, s>>>(
@@ -77,8 +77,8 @@ void TCuda<AFloat>::AddL2RegularizationGradients(TCudaMatrix<AFloat> & B,
                                                  const TCudaMatrix<AFloat> & A,
                                                  AFloat weightDecay)
 {
-   dim3 blockDims = TDevice::BlockDims();
-   dim3 gridDims  = TDevice::GridDims(B);
+   dim3 blockDims = TDevice::BlockDims2D();
+   dim3 gridDims  = TDevice::GridDims2D(B);
    cudaStream_t s = A.GetComputeStream();
    ::TMVA::DNN::Cuda::AddL2RegularizationGradients<<<gridDims, blockDims, 0, s>>>(
        B.GetDataPointer(),
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx
index aa0b144be20..e52943d272c 100644
--- a/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx
+++ b/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx
@@ -21,81 +21,129 @@ namespace TMVA
 namespace DNN
 {
 //______________________________________________________________________________
-template<typename Real_t>
-Real_t TReference<Real_t>::MeanSquaredError(const TMatrixT<Real_t> &Y,
-                                           const TMatrixT<Real_t> &output)
+template<typename AReal>
+AReal TReference<AReal>::MeanSquaredError(const TMatrixT<AReal> &Y,
+                                           const TMatrixT<AReal> &output)
 {
    size_t m,n;
    m = Y.GetNrows();
    n = Y.GetNcols();
-   Real_t result = 0.0;
+   AReal result = 0.0;
 
    for (size_t i = 0; i < m; i++) {
       for (size_t j = 0; j < n; j++) {
-         Real_t dY = (Y(i,j) - output(i,j));
+         AReal dY = (Y(i,j) - output(i,j));
          result += dY * dY;
       }
    }
-   result /= (Real_t) (m * n);
+   result /= (AReal) (m * n);
    return result;
 }
 
 //______________________________________________________________________________
-template<typename Real_t>
-void TReference<Real_t>::MeanSquaredErrorGradients(TMatrixT<Real_t> & dY,
-                                                  const TMatrixT<Real_t> & Y,
-                                                  const TMatrixT<Real_t> & output)
+template<typename AReal>
+void TReference<AReal>::MeanSquaredErrorGradients(TMatrixT<AReal> & dY,
+                                                  const TMatrixT<AReal> & Y,
+                                                  const TMatrixT<AReal> & output)
 {
    size_t m,n;
    m = Y.GetNrows();
    n = Y.GetNcols();
 
    dY.Minus(Y, output);
-   dY *= - 2.0 / ((Real_t) (m*n));
+   dY *= - 2.0 / ((AReal) (m*n));
 }
 
 //______________________________________________________________________________
-template<typename Real_t>
-Real_t TReference<Real_t>::CrossEntropy(const TMatrixT<Real_t> &Y,
-                                       const TMatrixT<Real_t> &output)
+template<typename AReal>
+AReal TReference<AReal>::CrossEntropy(const TMatrixT<AReal> &Y,
+                                       const TMatrixT<AReal> &output)
 {
    size_t m,n;
    m = Y.GetNrows();
    n = Y.GetNcols();
-   Real_t result = 0.0;
+   AReal result = 0.0;
 
    for (size_t i = 0; i < m; i++) {
       for (size_t j = 0; j < n; j++) {
-         Real_t sig = 1.0 / (1.0 + std::exp(-output(i,j)));
+         AReal sig = 1.0 / (1.0 + std::exp(-output(i,j)));
          result      += Y(i,j) * std::log(sig)
          + (1.0 - Y(i,j)) * std::log(1.0 - sig);
       }
    }
-   result /= - (Real_t) (m * n);
+   result /= - (AReal) (m * n);
    return result;
 }
 
 //______________________________________________________________________________
-template<typename Real_t>
-void TReference<Real_t>::CrossEntropyGradients(TMatrixT<Real_t> & dY,
-                                              const TMatrixT<Real_t> & Y,
-                                              const TMatrixT<Real_t> & output)
+template<typename AReal>
+void TReference<AReal>::CrossEntropyGradients(TMatrixT<AReal> & dY,
+                                              const TMatrixT<AReal> & Y,
+                                              const TMatrixT<AReal> & output)
 {
    size_t m,n;
    m = Y.GetNrows();
    n = Y.GetNcols();
 
-   Real_t norm = 1.0 / ((Real_t) (m * n));
+   AReal norm = 1.0 / ((AReal) (m * n));
    for (size_t i = 0; i < m; i++)
    {
       for (size_t j = 0; j < n; j++)
       {
-         Real_t y   = Y(i,j);
-         Real_t sig = 1.0 / (1.0 + std::exp(-output(i,j)));
+         AReal y   = Y(i,j);
+         AReal sig = 1.0 / (1.0 + std::exp(-output(i,j)));
          dY(i,j) = norm * (sig - y);
       }
    }
 }
 
+//______________________________________________________________________________
+template<typename AReal>
+AReal TReference<AReal>::SoftmaxCrossEntropy(const TMatrixT<AReal> &Y,
+                                               const TMatrixT<AReal> &output)
+{
+   size_t m,n;
+   m = Y.GetNrows();
+   n = Y.GetNcols();
+   AReal result = 0.0;
+
+   for (size_t i = 0; i < m; i++) {
+      AReal sum = 0.0;
+      for (size_t j = 0; j < n; j++) {
+         sum += std::exp(output(i,j));
+      }
+      for (size_t j = 0; j < n; j++) {
+         result += Y(i,j) * std::log(std::exp(output(i,j)) / sum);
+      }
+   }
+   result /= - (AReal) m;
+   return result;
+}
+
+//______________________________________________________________________________
+template<typename AReal>
+void TReference<AReal>::SoftmaxCrossEntropyGradients(TMatrixT<AReal> & dY,
+                                                      const TMatrixT<AReal> & Y,
+                                                      const TMatrixT<AReal> & output)
+{
+   size_t m,n;
+   m = Y.GetNrows();
+   n = Y.GetNcols();
+   AReal norm = 1.0 / ((AReal) m);
+
+   for (size_t i = 0; i < m; i++)
+   {
+      AReal sum  = 0.0;
+      AReal sumY = 0.0;
+      for (size_t j = 0; j < n; j++) {
+         sum  += std::exp(output(i,j));
+         sumY += Y(i,j);
+      }
+      for (size_t j = 0; j < n; j++) {
+         dY(i,j) = norm * (std::exp(output(i,j)) / sum * sumY - Y(i,j));
+      }
+   }
+}
+
 } // namespace DNN
 } // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx
index 731c95713d4..b635532a053 100644
--- a/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx
+++ b/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx
@@ -17,9 +17,9 @@
 namespace TMVA {
 namespace DNN  {
 
-template<typename Real_t>
-void TReference<Real_t>::Sigmoid(TMatrixT<Real_t> & B,
-                                const TMatrixT<Real_t> & A)
+template<typename AReal>
+void TReference<AReal>::Sigmoid(TMatrixT<AReal> & B,
+                                const TMatrixT<AReal> & A)
 {
    size_t m,n;
    m = A.GetNrows();
@@ -27,11 +27,30 @@ void TReference<Real_t>::Sigmoid(TMatrixT<Real_t> & B,
 
    for (size_t i = 0; i < m; i++) {
       for (size_t j = 0; j < n; j++) {
-         Real_t sig = 1.0 / (1.0 + std::exp(-A(i,j)));
+         AReal sig = 1.0 / (1.0 + std::exp(-A(i,j)));
          B(i,j) = sig;
       }
    }
 }
 
+template<typename AReal>
+void TReference<AReal>::Softmax(TMatrixT<AReal> & B,
+                                const TMatrixT<AReal> & A)
+{
+   size_t m,n;
+   m = A.GetNrows();
+   n = A.GetNcols();
+
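+   // Row-wise softmax: B(i,j) = exp(A(i,j)) / sum_k exp(A(i,k)).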
+   for (size_t i = 0; i < m; i++) {
+      AReal sum = 0.0;
+      for (size_t j = 0; j < n; j++) {
+         sum += std::exp(A(i,j));
+      }
+      for (size_t j = 0; j < n; j++) {
+         B(i,j) = std::exp(A(i,j)) / sum;
+      }
+   }
+}
+
 } // namespace TMVA
 } // namespace DNN
diff --git a/tmva/tmva/src/MethodDNN.cxx b/tmva/tmva/src/MethodDNN.cxx
index 045b96cd27c..dcd6ef02b38 100644
--- a/tmva/tmva/src/MethodDNN.cxx
+++ b/tmva/tmva/src/MethodDNN.cxx
@@ -401,7 +401,12 @@ void TMVA::MethodDNN::ProcessOptions()
 
    fLayout = TMVA::MethodDNN::ParseLayoutString (fLayoutString);
    size_t inputSize = GetNVariables ();
-   size_t outputSize = (GetNTargets() == 0) ? 1 : GetNTargets();
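+   // Default to a single output node; use the number of (regression) targets if set,
+   // otherwise one node per class for multi-class problems.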
+   size_t outputSize = 1;
+   if (GetNTargets() != 0) {
+      outputSize = GetNTargets();
+   } else if (DataInfo().GetNClasses() > 2) {
+      outputSize = DataInfo().GetNClasses();
+   }
 
    fNet.SetBatchSize(1);
    fNet.SetInputWidth(inputSize);
@@ -443,9 +448,9 @@ void TMVA::MethodDNN::ProcessOptions()
          fNet.SetLossFunction(ELossFunction::kCrossEntropy);
       }
       if (fErrorStrategy == "MUTUALEXCLUSIVE") {
-         Log () << kFatal << "MUTUALEXCLUSIVE not yet implemented." << Endl;
+         fNet.SetLossFunction(ELossFunction::kSoftmaxCrossEntropy);
       }
-      fOutputFunction = EOutputFunction::kSigmoid;
+      fOutputFunction = EOutputFunction::kSoftmax;
    }
 
    //
@@ -543,6 +548,13 @@ void TMVA::MethodDNN::Train()
                                          outputValue,
                                          event->GetWeight()));
          trainPattern.back().addInput(1.0);
+      } else if (fAnalysisType == Types::kMulticlass) {
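+         // Multi-class training: encode the event class as a one-hot target vector.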
+         std::vector<Float_t> oneHot(DataInfo().GetNClasses(), 0.0);
+         oneHot[event->GetClass()] = 1.0;
+         trainPattern.push_back(Pattern (values.begin(), values.end(),
+                                        oneHot.cbegin(), oneHot.cend(),
+                                        event->GetWeight()));
+         trainPattern.back().addInput(1.0);
       } else {
          const std::vector<Float_t>& targets = event->GetTargets ();
          trainPattern.push_back(Pattern(values.begin(),
@@ -563,6 +575,13 @@ void TMVA::MethodDNN::Train()
                                          outputValue,
                                          event->GetWeight()));
          testPattern.back().addInput(1.0);
+      } else if (fAnalysisType == Types::kMulticlass) {
+         std::vector<Float_t> oneHot(DataInfo().GetNClasses(), 0.0);
+         oneHot[event->GetClass()] = 1.0;
+         testPattern.push_back(Pattern (values.begin(), values.end(),
+                                        oneHot.cbegin(), oneHot.cend(),
+                                        event->GetWeight()));
+         testPattern.back().addInput(1.0);
       } else {
          const std::vector<Float_t>& targets = event->GetTargets ();
          testPattern.push_back(Pattern(values.begin(),
@@ -601,6 +620,7 @@ void TMVA::MethodDNN::Train()
          switch(fOutputFunction) {
             case EOutputFunction::kIdentity: h = ModeOutputValues::DIRECT;  break;
             case EOutputFunction::kSigmoid:  h = ModeOutputValues::SIGMOID; break;
+            case EOutputFunction::kSoftmax:  h = ModeOutputValues::SOFTMAX; break;
          }
          net.addLayer(Layer(fNet.GetLayer(i).GetWidth(), g, h));
       }
@@ -613,6 +633,9 @@ void TMVA::MethodDNN::Train()
       case ELossFunction::kCrossEntropy:
          net.setErrorFunction(ModeErrorFunction::CROSSENTROPY);
          break;
+      case ELossFunction::kSoftmaxCrossEntropy:
+         net.setErrorFunction(ModeErrorFunction::CROSSENTROPY_MUTUALEXCLUSIVE);
+         break;
    }
 
    switch(fWeightInitialization) {
@@ -630,7 +653,6 @@ void TMVA::MethodDNN::Train()
           break;
    }
 
-
    int idxSetting = 0;
    for (auto s : fTrainingSettings) {
 
@@ -1039,7 +1061,7 @@ Double_t TMVA::MethodDNN::GetMvaValue( Double_t* /*errLower*/, Double_t* /*errUp
 }
 
 //______________________________________________________________________________
-const std::vector<Float_t> &TMVA::MethodDNN::GetRegressionValues()
+const std::vector<Float_t> & TMVA::MethodDNN::GetRegressionValues()
 {
    size_t nVariables = GetEvent()->GetNVariables();
    Matrix_t X(1, nVariables);
@@ -1077,12 +1099,27 @@ const std::vector<Float_t> &TMVA::MethodDNN::GetRegressionValues()
    return *fRegressionReturnVal;
 }
 
-const std::vector<Float_t> &TMVA::MethodDNN::GetMulticlassValues()
+const std::vector<Float_t> & TMVA::MethodDNN::GetMulticlassValues()
 {
-   Log() << kFATAL << "ERROR: Multiclass classification not yet implemented."
-         << Endl;
+   size_t nVariables = GetEvent()->GetNVariables();
+   Matrix_t X(1, nVariables);
+   Matrix_t YHat(1, DataInfo().GetNClasses());
+   if (fMulticlassReturnVal == NULL) {
+      fMulticlassReturnVal = new std::vector<Float_t>(DataInfo().GetNClasses());
+   }
+
+   const std::vector<Float_t>& inputValues = GetEvent()->GetValues();
+   for (size_t i = 0; i < nVariables; i++) {
+      X(0,i) = inputValues[i];
+   }
+
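+   // Forward pass; with a softmax output function YHat holds the per-class probabilities.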
+   fNet.Prediction(YHat, X, fOutputFunction);
+   for (size_t i = 0; i < (size_t) YHat.GetNcols(); i++) {
+      (*fMulticlassReturnVal)[i] = YHat(0, i);
+   }
    return *fMulticlassReturnVal;
 }
+
 //______________________________________________________________________________
 void TMVA::MethodDNN::AddWeightsXMLTo( void* parent ) const 
 {
diff --git a/tmva/tmva/test/DNN/TestDerivatives.h b/tmva/tmva/test/DNN/TestDerivatives.h
index 7240e23a951..1acf46f86bd 100644
--- a/tmva/tmva/test/DNN/TestDerivatives.h
+++ b/tmva/tmva/test/DNN/TestDerivatives.h
@@ -176,7 +176,8 @@ auto testLossFunctionGradients()
 
     std::vector<ELossFunction> LossFunctions
         = {ELossFunction::kMeanSquaredError,
-           ELossFunction::kCrossEntropy};
+           ELossFunction::kCrossEntropy,
+           ELossFunction::kSoftmaxCrossEntropy};
 
     Scalar_t error, maximum_error;
     maximum_error = 0.0;
diff --git a/tmva/tmva/test/DNN/TestLossFunctions.cxx b/tmva/tmva/test/DNN/TestLossFunctions.cxx
index 9ae39ae5ba6..0acc959acfb 100644
--- a/tmva/tmva/test/DNN/TestLossFunctions.cxx
+++ b/tmva/tmva/test/DNN/TestLossFunctions.cxx
@@ -53,7 +53,7 @@ int main()
         return 1;
 
     error = testCrossEntropyGradients<TReference<double>>(10);
-    std::cout << "Testing mean squared error gradient: ";
+    std::cout << "Testing cross entropy gradient:      ";
     std::cout << "maximum relative error = " << error << std::endl;
     if (error > 1e-10)
         return 1;
diff --git a/tmva/tmva/test/DNN/TestLossFunctions.h b/tmva/tmva/test/DNN/TestLossFunctions.h
index d81bc3f0526..89b1118ce38 100644
--- a/tmva/tmva/test/DNN/TestLossFunctions.h
+++ b/tmva/tmva/test/DNN/TestLossFunctions.h
@@ -191,3 +191,102 @@ auto testCrossEntropyGradients(size_t ntests)
    }
    return maximumError;
 }
+
+//______________________________________________________________________________
+//
+//  Softmax Cross Entropy
+//______________________________________________________________________________
+
+template <typename Architecture>
+auto testSoftmaxCrossEntropy(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   using Scalar_t = typename Architecture::Scalar_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> X(m, n);
+      TMatrixT<Double_t> Y(m, n);
+      TMatrixT<Double_t> Z(m, n);
+
+      randomMatrix(X);
+      randomMatrix(Y);
+
+      Matrix_t XArch(X);
+      Matrix_t YArch(Y);
+
+      Scalar_t ce = evaluate<Architecture>(ELossFunction::kSoftmaxCrossEntropy,
+                                           YArch, XArch);
+
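+      // Recompute the loss directly from its definition as a reference value.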
+      Scalar_t ceReference = 0.0;
+      for (size_t j = 0; j < m; j++) {
+         Scalar_t sum  = 0.0;
+         for (size_t k = 0; k < n; k++) {
+            sum  += exp(X(j,k));
+         }
+         for (size_t k = 0; k < n; k++) {
+            ceReference -= Y(j,k) * log(exp(X(j,k)) / sum);
+         }
+      }
+      ceReference /= (Scalar_t) m;
+
+      Double_t error;
+      if (ceReference != 0.0)
+          error = std::fabs((ce - ceReference) / ceReference);
+      else
+          error = std::fabs(ce - ceReference);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
+
+//______________________________________________________________________________
+template <typename Architecture>
+auto testSoftmaxCrossEntropyGradients(size_t ntests)
+-> typename Architecture::Scalar_t
+{
+   using Matrix_t = typename Architecture::Matrix_t;
+   using Scalar_t = typename Architecture::Scalar_t;
+   Double_t maximumError = 0.0;
+
+   for (size_t i = 0; i < ntests; i++) {
+      size_t m = rand() % 100 + 1;
+      size_t n = rand() % 100 + 1;
+
+      TMatrixT<Double_t> X(m, n);
+      TMatrixT<Double_t> Y(m, n);
+      TMatrixT<Double_t> ZRef(m, n);
+
+      randomMatrix(X);
+      randomMatrix(Y);
+
+      Matrix_t XArch(X);
+      Matrix_t YArch(Y);
+      Matrix_t ZArch(Y);
+
+      evaluateGradients<Architecture>(ZArch, ELossFunction::kSoftmaxCrossEntropy,
+                                     YArch, XArch);
+
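+      // Reference gradient: (softmax(X) * sum_k Y_k - Y) / m, the analytic derivative of the loss.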
+      for (size_t j = 0; j < m; j++) {
+         Scalar_t sum  = 0.0;
+         Scalar_t sumY = 0.0;
+         for (size_t k = 0; k < n; k++) {
+            sum  += exp(X(j,k));
+            sumY += Y(j,k);
+         }
+         for (size_t k = 0; k < n; k++) {
+            Scalar_t sig = exp(X(j,k)) / sum;
+            ZRef(j,k) = (sig * sumY - Y(j,k)) / ((Scalar_t) m);
+         }
+      }
+
+      TMatrixT<Double_t> Z(ZArch);
+      Double_t error = maximumRelativeError(Z, ZRef);
+      maximumError = std::max(error, maximumError);
+   }
+   return maximumError;
+}
diff --git a/tmva/tmva/test/DNN/TestLossFunctionsCpu.cxx b/tmva/tmva/test/DNN/TestLossFunctionsCpu.cxx
index 94b426d780f..5313dec72e9 100644
--- a/tmva/tmva/test/DNN/TestLossFunctionsCpu.cxx
+++ b/tmva/tmva/test/DNN/TestLossFunctionsCpu.cxx
@@ -50,13 +50,29 @@ int main()
     //
 
     error = testCrossEntropy<TCpu<Scalar_t>>(10);
-    std::cout << "Testing cross entropy loss:          ";
+    std::cout << "Testing cross entropy loss:             ";
     std::cout << "maximum relative error = " << print_error(error) << std::endl;
     if (error > 1e-3)
         return 1;
 
     error = testCrossEntropyGradients<TCpu<Scalar_t>>(10);
-    std::cout << "Testing mean squared error gradient: ";
+    std::cout << "Testing mean squared error gradient:    ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    //
+    // Softmax Cross Entropy.
+    //
+
+    error = testSoftmaxCrossEntropy<TCpu<Scalar_t>>(10);
+    std::cout << "Testing softmax cross entropy loss:     ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    error = testSoftmaxCrossEntropyGradients<TCpu<Scalar_t>>(10);
+    std::cout << "Testing softmax cross entropy gradient: ";
     std::cout << "maximum relative error = " << print_error(error) << std::endl;
     if (error > 1e-3)
         return 1;
diff --git a/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx b/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx
index 1bf9ccae983..dd566b22dcb 100644
--- a/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx
+++ b/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx
@@ -32,13 +32,13 @@ int main()
     //
 
     error = testMeanSquaredError<TCuda<Scalar_t>>(10);
-    std::cout << "Testing mean squared error loss:     ";
+    std::cout << "Testing mean squared error loss:        ";
     std::cout << "maximum relative error = " << print_error(error) << std::endl;
     if (error > 1e-3)
         return 1;
 
     error = testMeanSquaredErrorGradients<TCuda<Scalar_t>>(10);
-    std::cout << "Testing mean squared error gradient: ";
+    std::cout << "Testing mean squared error gradient:    ";
     std::cout << "maximum relative error = " << print_error(error) << std::endl;
     if (error > 1e-3)
         return 1;
@@ -48,13 +48,29 @@ int main()
     //
 
     error = testCrossEntropy<TCuda<Scalar_t>>(10);
-    std::cout << "Testing cross entropy loss:          ";
+    std::cout << "Testing cross entropy loss:             ";
     std::cout << "maximum relative error = " << print_error(error) << std::endl;
     if (error > 1e-3)
         return 1;
 
     error = testCrossEntropyGradients<TCuda<Scalar_t>>(10);
-    std::cout << "Testing mean squared error gradient: ";
+    std::cout << "Testing mean squared error gradient:    ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    //
+    // Softmax Cross Entropy.
+    //
+
+    error = testSoftmaxCrossEntropy<TCuda<Scalar_t>>(10);
+    std::cout << "Testing softmax cross entropy loss:     ";
+    std::cout << "maximum relative error = " << print_error(error) << std::endl;
+    if (error > 1e-3)
+        return 1;
+
+    error = testSoftmaxCrossEntropyGradients<TCuda<Scalar_t>>(10);
+    std::cout << "Testing softmax cross entropy gradient: ";
     std::cout << "maximum relative error = " << print_error(error) << std::endl;
     if (error > 1e-3)
         return 1;
diff --git a/tutorials/tmva/TMVAClassification.C b/tutorials/tmva/TMVAClassification.C
index 0ee3b503a18..6b32c45df37 100644
--- a/tutorials/tmva/TMVAClassification.C
+++ b/tutorials/tmva/TMVAClassification.C
@@ -85,36 +85,36 @@ int TMVAClassification( TString myMethodList = "" )
    std::map<std::string,int> Use;
 
    // Cut optimisation
-   Use["Cuts"]            = 1;
-   Use["CutsD"]           = 1;
+   Use["Cuts"]            = 0;
+   Use["CutsD"]           = 0;
    Use["CutsPCA"]         = 0;
    Use["CutsGA"]          = 0;
    Use["CutsSA"]          = 0;
    //
    // 1-dimensional likelihood ("naive Bayes estimator")
-   Use["Likelihood"]      = 1;
+   Use["Likelihood"]      = 0;
    Use["LikelihoodD"]     = 0; // the "D" extension indicates decorrelated input variables (see option strings)
-   Use["LikelihoodPCA"]   = 1; // the "PCA" extension indicates PCA-transformed input variables (see option strings)
+   Use["LikelihoodPCA"]   = 0; // the "PCA" extension indicates PCA-transformed input variables (see option strings)
    Use["LikelihoodKDE"]   = 0;
    Use["LikelihoodMIX"]   = 0;
    //
    // Mutidimensional likelihood and Nearest-Neighbour methods
-   Use["PDERS"]           = 1;
+   Use["PDERS"]           = 0;
    Use["PDERSD"]          = 0;
    Use["PDERSPCA"]        = 0;
-   Use["PDEFoam"]         = 1;
+   Use["PDEFoam"]         = 0;
    Use["PDEFoamBoost"]    = 0; // uses generalised MVA method boosting
    Use["KNN"]             = 1; // k-nearest neighbour method
    //
    // Linear Discriminant Analysis
-   Use["LD"]              = 1; // Linear Discriminant identical to Fisher
+   Use["LD"]              = 0; // Linear Discriminant identical to Fisher
    Use["Fisher"]          = 0;
    Use["FisherG"]         = 0;
    Use["BoostedFisher"]   = 0; // uses generalised MVA method boosting
    Use["HMatrix"]         = 0;
    //
    // Function Discriminant analysis
-   Use["FDA_GA"]          = 1; // minimisation of user-defined function using Genetics Algorithm
+   Use["FDA_GA"]          = 0; // minimisation of user-defined function using Genetics Algorithm
    Use["FDA_SA"]          = 0;
    Use["FDA_MC"]          = 0;
    Use["FDA_MT"]          = 0;
@@ -124,16 +124,18 @@ int TMVAClassification( TString myMethodList = "" )
    // Neural Networks (all are feed-forward Multilayer Perceptrons)
    Use["MLP"]             = 0; // Recommended ANN
    Use["MLPBFGS"]         = 0; // Recommended ANN with optional training method
-   Use["MLPBNN"]          = 1; // Recommended ANN with BFGS training method and bayesian regulator
+   Use["MLPBNN"]          = 0; // Recommended ANN with BFGS training method and bayesian regulator
    Use["CFMlpANN"]        = 0; // Depreciated ANN from ALEPH
    Use["TMlpANN"]         = 0; // ROOT's own ANN
-   Use["DNN"]             = 0; // improved implementation of a NN
+   Use["DNN"]             = 0;     // Deep Neural Network
+   Use["DNN_GPU"]         = 0; // CUDA-accelerated DNN training.
+   Use["DNN_CPU"]         = 0; // Multi-core accelerated DNN.
    //
    // Support Vector Machine
    Use["SVM"]             = 1;
    //
    // Boosted Decision Trees
-   Use["BDT"]             = 1; // uses Adaptive Boost
+   Use["BDT"]             = 0; // uses Adaptive Boost
    Use["BDTG"]            = 0; // uses Gradient Boost
    Use["BDTB"]            = 0; // uses Bagging
    Use["BDTD"]            = 0; // decorrelation + Adaptive Boost
@@ -436,39 +438,50 @@ int TMVAClassification( TString myMethodList = "" )
       factory->BookMethod( dataloader, TMVA::Types::kMLP, "MLPBNN", "H:!V:NeuronType=tanh:VarTransform=N:NCycles=600:HiddenLayers=N+5:TestRate=5:TrainingMethod=BFGS:UseRegulator" ); // BFGS training with bayesian regulators
 
 
-   // improved neural network implementation
+   // Multi-architecture DNN implementation.
    if (Use["DNN"])
    {
-    /*
-       TString layoutString ("Layout=TANH|(N+100)*2,LINEAR");
-       TString layoutString ("Layout=SOFTSIGN|100,SOFTSIGN|50,SOFTSIGN|20,LINEAR");
-       TString layoutString ("Layout=RELU|300,RELU|100,RELU|30,RELU|10,LINEAR");
-       TString layoutString ("Layout=SOFTSIGN|50,SOFTSIGN|30,SOFTSIGN|20,SOFTSIGN|10,LINEAR");
-       TString layoutString ("Layout=TANH|50,TANH|30,TANH|20,TANH|10,LINEAR");
-       TString layoutString ("Layout=SOFTSIGN|50,SOFTSIGN|20,LINEAR");
-    */
-       TString layoutString ("Layout=TANH|100,TANH|50,TANH|10,LINEAR");
-
-       TString training0 ("LearningRate=1e-1,Momentum=0.0,Repetitions=1,ConvergenceSteps=300,BatchSize=20,TestRepetitions=15,WeightDecay=0.001,Regularization=NONE,DropConfig=0.0+0.5+0.5+0.5,DropRepetitions=1,Multithreading=True");
-       TString training1 ("LearningRate=1e-2,Momentum=0.5,Repetitions=1,ConvergenceSteps=300,BatchSize=30,TestRepetitions=7,WeightDecay=0.001,Regularization=L2,Multithreading=True,DropConfig=0.0+0.1+0.1+0.1,DropRepetitions=1");
-       TString training2 ("LearningRate=1e-2,Momentum=0.3,Repetitions=1,ConvergenceSteps=300,BatchSize=40,TestRepetitions=7,WeightDecay=0.0001,Regularization=L2,Multithreading=True");
-       TString training3 ("LearningRate=1e-3,Momentum=0.1,Repetitions=1,ConvergenceSteps=200,BatchSize=70,TestRepetitions=7,WeightDecay=0.0001,Regularization=NONE,Multithreading=True");
-
-       TString trainingStrategyString ("TrainingStrategy=");
-       trainingStrategyString += training0 + "|" + training1 + "|" + training2 + "|" + training3;
-
-
-       // TString nnOptions ("!H:V:VarTransform=Normalize:ErrorStrategy=CROSSENTROPY");
-       TString nnOptions ("!H:V:ErrorStrategy=CROSSENTROPY:VarTransform=G:WeightInitialization=XAVIERUNIFORM");
-       // TString nnOptions ("!H:V:VarTransform=Normalize:ErrorStrategy=CHECKGRADIENTS");
-       nnOptions.Append (":"); nnOptions.Append (layoutString);
-       nnOptions.Append (":"); nnOptions.Append (trainingStrategyString);
-
-       factory->BookMethod(dataloader, TMVA::Types::kDNN, "DNN", nnOptions ); // NN
+      // General layout.
+      TString layoutString ("Layout=TANH|128,TANH|128,TANH|128,LINEAR");
+
+      // Training strategies.
+      TString training0("LearningRate=1e-1,Momentum=0.9,Repetitions=1,"
+                        "ConvergenceSteps=20,BatchSize=256,TestRepetitions=10,"
+                        "WeightDecay=1e-4,Regularization=L2,"
+                        "DropConfig=0.0+0.5+0.5+0.5, Multithreading=True");
+      TString training1("LearningRate=1e-2,Momentum=0.9,Repetitions=1,"
+                        "ConvergenceSteps=20,BatchSize=256,TestRepetitions=10,"
+                        "WeightDecay=1e-4,Regularization=L2,"
+                        "DropConfig=0.0+0.0+0.0+0.0, Multithreading=True");
+      TString training2("LearningRate=1e-3,Momentum=0.0,Repetitions=1,"
+                        "ConvergenceSteps=20,BatchSize=256,TestRepetitions=10,"
+                        "WeightDecay=1e-4,Regularization=L2,"
+                        "DropConfig=0.0+0.0+0.0+0.0, Multithreading=True");
+      TString trainingStrategyString ("TrainingStrategy=");
+      trainingStrategyString += training0 + "|" + training1 + "|" + training2;
+
+      // General Options.
+      TString dnnOptions ("!H:V:ErrorStrategy=CROSSENTROPY:VarTransform=N:"
+                          "WeightInitialization=XAVIERUNIFORM");
+      dnnOptions.Append (":"); dnnOptions.Append (layoutString);
+      dnnOptions.Append (":"); dnnOptions.Append (trainingStrategyString);
+
+      // Standard implementation, no dependencies.
+      TString stdOptions = dnnOptions + ":Architecture=STANDARD";
+      factory->BookMethod(dataloader, TMVA::Types::kDNN, "DNN", stdOptions);
+
+      // Cuda implementation.
+      if (Use["DNN_GPU"]) {
+         TString gpuOptions = dnnOptions + ":Architecture=GPU";
+         factory->BookMethod(dataloader, TMVA::Types::kDNN, "DNN GPU", gpuOptions);
+      }
+      // Multi-core CPU implementation.
+      if (Use["DNN_CPU"]) {
+         TString cpuOptions = dnnOptions + ":Architecture=CPU";
+         factory->BookMethod(dataloader, TMVA::Types::kDNN, "DNN CPU", cpuOptions);
+      }
    }
 
-
-
    // CF(Clermont-Ferrand)ANN
    if (Use["CFMlpANN"])
       factory->BookMethod( dataloader, TMVA::Types::kCFMlpANN, "CFMlpANN", "!H:!V:NCycles=2000:HiddenLayers=N+1,N"  ); // n_cycles:#nodes:#nodes:...
diff --git a/tutorials/tmva/TMVAMulticlass.C b/tutorials/tmva/TMVAMulticlass.C
index 5693874f99a..129723834f9 100644
--- a/tutorials/tmva/TMVAMulticlass.C
+++ b/tutorials/tmva/TMVAMulticlass.C
@@ -51,6 +51,7 @@ void TMVAMulticlass( TString myMethodList = "" )
    std::map<std::string,int> Use;
    Use["MLP"]             = 1;
    Use["BDTG"]            = 1;
+   Use["DNN"]             = 0;
    Use["FDA_GA"]          = 0;
    Use["PDEFoam"]         = 0;
    //---------------------------------------------------------------
@@ -130,6 +131,21 @@ void TMVAMulticlass( TString myMethodList = "" )
    if (Use["PDEFoam"]) // PDE-Foam approach
       factory->BookMethod( dataloader,  TMVA::Types::kPDEFoam, "PDEFoam", "!H:!V:TailCut=0.001:VolFrac=0.0666:nActiveCells=500:nSampl=2000:nBin=5:Nmin=100:Kernel=None:Compress=T" );
 
+   if (Use["DNN"]) {
+       TString layoutString ("Layout=TANH|100,TANH|50,TANH|10,LINEAR");
+       TString training0 ("LearningRate=1e-1, Momentum=0.5, Repetitions=1, ConvergenceSteps=10,"
+                          " BatchSize=256, TestRepetitions=10, Multithreading=True");
+       TString training1 ("LearningRate=1e-2, Momentum=0.0, Repetitions=1, ConvergenceSteps=10,"
+                          " BatchSize=256, TestRepetitions=7, Multithreading=True");
+       TString trainingStrategyString ("TrainingStrategy=");
+       trainingStrategyString += training0 + "|" + training1;
+       TString nnOptions ("!H:V:ErrorStrategy=CROSSENTROPY:VarTransform=N:"
+                          "WeightInitialization=XAVIERUNIFORM:Architecture=STANDARD");
+       nnOptions.Append (":"); nnOptions.Append (layoutString);
+       nnOptions.Append (":"); nnOptions.Append (trainingStrategyString);
+       factory->BookMethod(dataloader, TMVA::Types::kDNN, "DNN", nnOptions );
+   }
+
    // Train MVAs using the set of training events
    factory->TrainAllMethods();
 
-- 
GitLab