From 4ebe8d8898f14ca69f09c87094948ced6283b807 Mon Sep 17 00:00:00 2001 From: Simon Pfreundschuh <simonpf@chalmers.se> Date: Wed, 12 Oct 2016 06:26:01 +0200 Subject: [PATCH] Implemented multi-class classification for CUDA and CPU backend. --- tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h | 10 ++ .../TMVA/DNN/Architectures/Cpu/CpuBuffer.h | 2 + tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h | 10 ++ .../TMVA/DNN/Architectures/Cuda/CudaBuffers.h | 3 + .../inc/TMVA/DNN/Architectures/Cuda/Device.h | 25 +++- .../inc/TMVA/DNN/Architectures/Reference.h | 110 ++++++++++-------- tmva/tmva/inc/TMVA/DNN/Functions.h | 21 +++- tmva/tmva/inc/TMVA/NeuralNet.icc | 1 + .../src/DNN/Architectures/Cpu/CpuBuffer.cxx | 38 ++++-- .../DNN/Architectures/Cpu/LossFunctions.cxx | 68 +++++++++++ .../DNN/Architectures/Cpu/OutputFunctions.cxx | 24 ++++ .../Architectures/Cuda/ActivationFunctions.cu | 52 ++++----- .../src/DNN/Architectures/Cuda/Arithmetic.cu | 8 +- .../DNN/Architectures/Cuda/CudaBuffers.cxx | 33 +++++- .../src/DNN/Architectures/Cuda/CudaMatrix.cu | 4 +- .../src/DNN/Architectures/Cuda/Dropout.cu | 4 +- .../src/DNN/Architectures/Cuda/Kernels.cuh | 73 ++++++++++++ .../DNN/Architectures/Cuda/LossFunctions.cu | 52 +++++++-- .../DNN/Architectures/Cuda/OutputFunctions.cu | 19 ++- .../src/DNN/Architectures/Cuda/Propagation.cu | 4 +- .../DNN/Architectures/Cuda/Regularization.cu | 16 +-- .../Architectures/Reference/LossFunctions.cxx | 96 +++++++++++---- .../Reference/OutputFunctions.cxx | 27 ++++- tmva/tmva/src/MethodDNN.cxx | 53 +++++++-- tmva/tmva/test/DNN/TestDerivatives.h | 3 +- tmva/tmva/test/DNN/TestLossFunctions.cxx | 2 +- tmva/tmva/test/DNN/TestLossFunctions.h | 99 ++++++++++++++++ tmva/tmva/test/DNN/TestLossFunctionsCpu.cxx | 20 +++- tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx | 24 +++- tutorials/tmva/TMVAClassification.C | 93 ++++++++------- tutorials/tmva/TMVAMulticlass.C | 16 +++ 31 files changed, 798 insertions(+), 212 deletions(-) diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h index 8ef17292082..6cce3b68f66 100644 --- a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h @@ -162,6 +162,14 @@ public: static void CrossEntropyGradients(TCpuMatrix<Scalar_t> & dY, const TCpuMatrix<Scalar_t> & Y, const TCpuMatrix<Scalar_t> & output); + + /** Softmax transformation is implicitly applied, thus \p output should + * hold the linear activations of the last layer in the net. */ + static Scalar_t SoftmaxCrossEntropy(const TCpuMatrix<Scalar_t> &Y, + const TCpuMatrix<Scalar_t> &output); + static void SoftmaxCrossEntropyGradients(TCpuMatrix<Scalar_t> & dY, + const TCpuMatrix<Scalar_t> & Y, + const TCpuMatrix<Scalar_t> & output); ///@} //____________________________________________________________________________ @@ -179,6 +187,8 @@ public: ///@{ static void Sigmoid(TCpuMatrix<Scalar_t> &YHat, const TCpuMatrix<Scalar_t> & ); + static void Softmax(TCpuMatrix<Scalar_t> &YHat, + const TCpuMatrix<Scalar_t> & ); ///@} //____________________________________________________________________________ diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuBuffer.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuBuffer.h index 192ece11b69..f09bd0a70f3 100644 --- a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuBuffer.h +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuBuffer.h @@ -77,6 +77,8 @@ public: /** Copy data to another buffer. No real copying is performed, only the * data pointers are swapped. 
*/ void CopyTo(TCpuBuffer &); + + size_t GetSize() const {return fSize;} }; } // namespace DNN diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h index 751d5264ea6..5eb9b34f1ba 100644 --- a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h @@ -167,6 +167,14 @@ public: static void CrossEntropyGradients(TCudaMatrix<AFloat> & dY, const TCudaMatrix<AFloat> & Y, const TCudaMatrix<AFloat> & output); + + /** Softmax transformation is implicitly applied, thus \p output should + * hold the linear activations of the last layer in the net. */ + static AFloat SoftmaxCrossEntropy(const TCudaMatrix<AFloat> &Y, + const TCudaMatrix<AFloat> &output); + static void SoftmaxCrossEntropyGradients(TCudaMatrix<AFloat> & dY, + const TCudaMatrix<AFloat> & Y, + const TCudaMatrix<AFloat> & output); ///@} //____________________________________________________________________________ @@ -184,6 +192,8 @@ public: ///@{ static void Sigmoid(TCudaMatrix<AFloat> &YHat, const TCudaMatrix<AFloat> & ); + static void Softmax(TCudaMatrix<AFloat> &YHat, + const TCudaMatrix<AFloat> & ); ///@} //____________________________________________________________________________ diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaBuffers.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaBuffers.h index f03483ff444..9a12b48fdd2 100644 --- a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaBuffers.h +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaBuffers.h @@ -43,6 +43,7 @@ class TCudaHostBuffer private: size_t fOffset; ///< Offset for sub-buffers + size_t fSize; mutable cudaStream_t fComputeStream; ///< cudaStream for data transfer std::shared_ptr<AFloat *> fHostPointer; ///< Pointer to the buffer data @@ -77,6 +78,8 @@ public: inline AFloat & operator[](size_t index); inline AFloat operator[](size_t index) const; + size_t GetSize() const {return fSize;} + }; /** TCudaDeviceBuffer diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h index 3ca30b74807..4f32ac973ab 100644 --- a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h @@ -43,17 +43,36 @@ public: /* Resulting block size. */ static constexpr int BlockSize = BlockDimX * BlockDimY; - /* Return dim3 object representing the a BlockDimX x BlockDimY 2D + /* Return 1D block of size 1 along the x-dimension and BlockSize along + * the y-dimension. */ + static dim3 BlockDims1D() + { + return dim3(1, BlockSize); + } + + /* Return dim3 object representing a BlockDimX x BlockDimY 2D * block */ - static dim3 BlockDims() + static dim3 BlockDims2D() { return dim3(BlockDimX, BlockDimY); } + /* Return 1D dim3 object representing the block grid covering the row-range + * of the matrix A along the y-dimension. 
*/ + template<typename AFloat> + static dim3 GridDims1D(const TCudaMatrix<AFloat> &A) + { + int gridDim = A.GetNrows() / TDevice::BlockSize; + if ((A.GetNrows() % TDevice::BlockSize) != 0) { + gridDim += 1; + } + return dim3(1, gridDim); + } + /* Return 2D dim3 object representing the block grid consisting of two-dimensional * BlockDimX x BlockDimY blocks covering the matrix A */ template<typename AFloat> - static dim3 GridDims(const TCudaMatrix<AFloat> &A) + static dim3 GridDims2D(const TCudaMatrix<AFloat> &A) { int gridDimX = A.GetNcols() / TDevice::BlockDimX; if ((A.GetNcols() % TDevice::BlockDimX) != 0) diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h index 7200f199c27..2c31d0f99e0 100644 --- a/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h @@ -31,15 +31,15 @@ namespace DNN * interface for the DNN implementation. The reference implementation uses the * TMatrixT class template to represent matrices. * -* \tparam Real_t The floating point type used to represent scalars. +* \tparam AReal The floating point type used to represent scalars. */ -template<typename Real_t> +template<typename AReal> class TReference { public: - using Scalar_t = Real_t; - using Matrix_t = TMatrixT<Real_t>; + using Scalar_t = AReal; + using Matrix_t = TMatrixT<AReal>; //____________________________________________________________________________ // @@ -104,33 +104,33 @@ public: * and writes the results into the result matrix. */ ///@{ - static void Identity(TMatrixT<Real_t> & B); - static void IdentityDerivative(TMatrixT<Real_t> & B, - const TMatrixT<Real_t> & A); + static void Identity(TMatrixT<AReal> & B); + static void IdentityDerivative(TMatrixT<AReal> & B, + const TMatrixT<AReal> & A); - static void Relu(TMatrixT<Real_t> & B); - static void ReluDerivative(TMatrixT<Real_t> & B, - const TMatrixT<Real_t> & A); + static void Relu(TMatrixT<AReal> & B); + static void ReluDerivative(TMatrixT<AReal> & B, + const TMatrixT<AReal> & A); - static void Sigmoid(TMatrixT<Real_t> & B); - static void SigmoidDerivative(TMatrixT<Real_t> & B, - const TMatrixT<Real_t> & A); + static void Sigmoid(TMatrixT<AReal> & B); + static void SigmoidDerivative(TMatrixT<AReal> & B, + const TMatrixT<AReal> & A); - static void Tanh(TMatrixT<Real_t> & B); - static void TanhDerivative(TMatrixT<Real_t> & B, - const TMatrixT<Real_t> & A); + static void Tanh(TMatrixT<AReal> & B); + static void TanhDerivative(TMatrixT<AReal> & B, + const TMatrixT<AReal> & A); - static void SymmetricRelu(TMatrixT<Real_t> & B); - static void SymmetricReluDerivative(TMatrixT<Real_t> & B, - const TMatrixT<Real_t> & A); + static void SymmetricRelu(TMatrixT<AReal> & B); + static void SymmetricReluDerivative(TMatrixT<AReal> & B, + const TMatrixT<AReal> & A); - static void SoftSign(TMatrixT<Real_t> & B); - static void SoftSignDerivative(TMatrixT<Real_t> & B, - const TMatrixT<Real_t> & A); + static void SoftSign(TMatrixT<AReal> & B); + static void SoftSignDerivative(TMatrixT<AReal> & B, + const TMatrixT<AReal> & A); - static void Gauss(TMatrixT<Real_t> & B); - static void GaussDerivative(TMatrixT<Real_t> & B, - const TMatrixT<Real_t> & A); + static void Gauss(TMatrixT<AReal> & B); + static void GaussDerivative(TMatrixT<AReal> & B, + const TMatrixT<AReal> & A); ///@} @@ -148,20 +148,28 @@ public: */ ///@{ - static Real_t MeanSquaredError(const TMatrixT<Real_t> &Y, - const TMatrixT<Real_t> &output); - static void MeanSquaredErrorGradients(TMatrixT<Real_t> & 
dY, - const TMatrixT<Real_t> &Y, - const TMatrixT<Real_t> &output); + static AReal MeanSquaredError(const TMatrixT<AReal> &Y, + const TMatrixT<AReal> &output); + static void MeanSquaredErrorGradients(TMatrixT<AReal> & dY, + const TMatrixT<AReal> &Y, + const TMatrixT<AReal> &output); /** Sigmoid transformation is implicitly applied, thus \p output should * hold the linear activations of the last layer in the net. */ - static Real_t CrossEntropy(const TMatrixT<Real_t> &Y, - const TMatrixT<Real_t> &output); + static AReal CrossEntropy(const TMatrixT<AReal> &Y, + const TMatrixT<AReal> &output); - static void CrossEntropyGradients(TMatrixT<Real_t> & dY, - const TMatrixT<Real_t> & Y, - const TMatrixT<Real_t> & output); + static void CrossEntropyGradients(TMatrixT<AReal> & dY, + const TMatrixT<AReal> & Y, + const TMatrixT<AReal> & output); + + /** Softmax transformation is implicitly applied, thus \p output should + * hold the linear activations of the last layer in the net. */ + static AReal SoftmaxCrossEntropy(const TMatrixT<AReal> &Y, + const TMatrixT<AReal> &output); + static void SoftmaxCrossEntropyGradients(TMatrixT<AReal> & dY, + const TMatrixT<AReal> & Y, + const TMatrixT<AReal> & output); ///@} //____________________________________________________________________________ @@ -177,8 +185,10 @@ public: * classification. */ ///@{ - static void Sigmoid(TMatrixT<Real_t> &YHat, - const TMatrixT<Real_t> & ); + static void Sigmoid(TMatrixT<AReal> &YHat, + const TMatrixT<AReal> & ); + static void Softmax(TMatrixT<AReal> &YHat, + const TMatrixT<AReal> & ); ///@} //____________________________________________________________________________ @@ -195,15 +205,15 @@ public: */ ///@{ - static Real_t L1Regularization(const TMatrixT<Real_t> & W); - static void AddL1RegularizationGradients(TMatrixT<Real_t> & A, - const TMatrixT<Real_t> & W, - Real_t weightDecay); + static AReal L1Regularization(const TMatrixT<AReal> & W); + static void AddL1RegularizationGradients(TMatrixT<AReal> & A, + const TMatrixT<AReal> & W, + AReal weightDecay); - static Real_t L2Regularization(const TMatrixT<Real_t> & W); - static void AddL2RegularizationGradients(TMatrixT<Real_t> & A, - const TMatrixT<Real_t> & W, - Real_t weightDecay); + static AReal L2Regularization(const TMatrixT<AReal> & W); + static void AddL2RegularizationGradients(TMatrixT<AReal> & A, + const TMatrixT<AReal> & W, + AReal weightDecay); ///@} //____________________________________________________________________________ @@ -218,13 +228,13 @@ public: */ ///@{ - static void InitializeGauss(TMatrixT<Real_t> & A); + static void InitializeGauss(TMatrixT<AReal> & A); - static void InitializeUniform(TMatrixT<Real_t> & A); + static void InitializeUniform(TMatrixT<AReal> & A); - static void InitializeIdentity(TMatrixT<Real_t> & A); + static void InitializeIdentity(TMatrixT<AReal> & A); - static void InitializeZero(TMatrixT<Real_t> & A); + static void InitializeZero(TMatrixT<AReal> & A); ///@} @@ -239,7 +249,7 @@ public: /** Apply dropout with activation probability \p p to the given * matrix \p A and scale the result by reciprocal of \p p. 
*/ - static void Dropout(TMatrixT<Real_t> & A, Real_t dropoutProbability); + static void Dropout(TMatrixT<AReal> & A, AReal dropoutProbability); ///@} }; diff --git a/tmva/tmva/inc/TMVA/DNN/Functions.h b/tmva/tmva/inc/TMVA/DNN/Functions.h index 5e2f09da46d..847d96880b1 100644 --- a/tmva/tmva/inc/TMVA/DNN/Functions.h +++ b/tmva/tmva/inc/TMVA/DNN/Functions.h @@ -43,7 +43,8 @@ enum class EActivationFunction enum class EOutputFunction { kIdentity = 'I', - kSigmoid = 'S' + kSigmoid = 'S', + kSoftmax = 'M' }; /*! Enum that represents objective functions for the net, i.e. functions @@ -52,8 +53,9 @@ enum class EOutputFunction * in the training process. */ enum class ELossFunction { - kCrossEntropy = 'C', - kMeanSquaredError = 'R' + kCrossEntropy = 'C', + kMeanSquaredError = 'R', + kSoftmaxCrossEntropy = 'S' }; /*! Enum representing the regularization type applied for a given layer */ @@ -147,6 +149,8 @@ inline void evaluate(typename Architecture_t::Matrix_t &A, break; case EOutputFunction::kSigmoid : Architecture_t::Sigmoid(A, X); break; + case EOutputFunction::kSoftmax : Architecture_t::Softmax(A, X); + break; } } @@ -169,6 +173,8 @@ inline auto evaluate(ELossFunction f, return Architecture_t::CrossEntropy(Y, output); case ELossFunction::kMeanSquaredError : return Architecture_t::MeanSquaredError(Y, output); + case ELossFunction::kSoftmaxCrossEntropy : + return Architecture_t::SoftmaxCrossEntropy(Y, output); } return 0.0; } @@ -178,9 +184,9 @@ inline auto evaluate(ELossFunction f, //______________________________________________________________________________ template<typename Architecture_t> inline void evaluateGradients(typename Architecture_t::Matrix_t & dY, - ELossFunction f, - const typename Architecture_t::Matrix_t &Y, - const typename Architecture_t::Matrix_t &output) + ELossFunction f, + const typename Architecture_t::Matrix_t &Y, + const typename Architecture_t::Matrix_t &output) { switch(f) { @@ -190,6 +196,9 @@ inline void evaluateGradients(typename Architecture_t::Matrix_t & dY, case ELossFunction::kMeanSquaredError : Architecture_t::MeanSquaredErrorGradients(dY, Y, output); break; + case ELossFunction::kSoftmaxCrossEntropy : + Architecture_t::SoftmaxCrossEntropyGradients(dY, Y, output); + break; } } diff --git a/tmva/tmva/inc/TMVA/NeuralNet.icc b/tmva/tmva/inc/TMVA/NeuralNet.icc index 1bf4bb80770..0797345b867 100644 --- a/tmva/tmva/inc/TMVA/NeuralNet.icc +++ b/tmva/tmva/inc/TMVA/NeuralNet.icc @@ -1606,6 +1606,7 @@ template <typename LAYERDATA> } case ModeErrorFunction::CROSSENTROPY_MUTUALEXCLUSIVE: { + std::cout << "softmax." << std::endl; assert (!TMVA::DNN::isFlagSet (ModeOutputValues::DIRECT, layerData.outputMode ())); std::vector<double> probabilities = layerData.probabilities (); error = softMaxCrossEntropy (begin (probabilities), end (probabilities), diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/CpuBuffer.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/CpuBuffer.cxx index c28a738ef9a..20581fdf0dd 100644 --- a/tmva/tmva/src/DNN/Architectures/Cpu/CpuBuffer.cxx +++ b/tmva/tmva/src/DNN/Architectures/Cpu/CpuBuffer.cxx @@ -177,20 +177,30 @@ void TDataLoader<TMVAInput_t, TCpu<Double_t>>::CopyOutput( size_t batchSize) { Event * event = fData.front(); - size_t n = (event->GetNTargets() == 0) ? 1 : event->GetNTargets(); + size_t n = buffer.GetSize() / batchSize; // Copy target(s). 
for (size_t i = 0; i < batchSize; i++) { - size_t sampleIndex = * sampleIterator++; - event = fData[sampleIndex]; + size_t sampleIndex = * sampleIterator++; + event = fData[sampleIndex]; for (size_t j = 0; j < n; j++) { // Copy output matrices. size_t bufferIndex = j * batchSize + i; + // Classification if (event->GetNTargets() == 0) { - buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0; + if (n == 1) { + // Binary. + buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0; + } else { + // Multiclass. + buffer[bufferIndex] = 0.0; + if (j == event->GetClass()) { + buffer[bufferIndex] = 1.0; + } + } } else { - buffer[bufferIndex] = event->GetTarget(j); + buffer[bufferIndex] = static_cast<Real_t>(event->GetTarget(j)); } } } @@ -226,18 +236,28 @@ void TDataLoader<TMVAInput_t, TCpu<Real_t>>::CopyOutput( size_t batchSize) { Event * event = fData.front(); - size_t n = (event->GetNTargets() == 0) ? 1 : event->GetNTargets(); + size_t n = buffer.GetSize() / batchSize; // Copy target(s). for (size_t i = 0; i < batchSize; i++) { - size_t sampleIndex = * sampleIterator++; - event = fData[sampleIndex]; + size_t sampleIndex = * sampleIterator++; + event = fData[sampleIndex]; for (size_t j = 0; j < n; j++) { // Copy output matrices. size_t bufferIndex = j * batchSize + i; + // Classification if (event->GetNTargets() == 0) { - buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0; + if (n == 1) { + // Binary. + buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0; + } else { + // Multiclass. + buffer[bufferIndex] = 0.0; + if (j == event->GetClass()) { + buffer[bufferIndex] = 1.0; + } + } } else { buffer[bufferIndex] = static_cast<Real_t>(event->GetTarget(j)); } diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/LossFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/LossFunctions.cxx index 43af30423ad..3800e9b45b5 100644 --- a/tmva/tmva/src/DNN/Architectures/Cpu/LossFunctions.cxx +++ b/tmva/tmva/src/DNN/Architectures/Cpu/LossFunctions.cxx @@ -119,5 +119,73 @@ void TCpu<AFloat>::CrossEntropyGradients( Y.GetThreadExecutor().Map(f, ROOT::TSeqI(Y.GetNElements())); } +//______________________________________________________________________________ +template<typename AFloat> +AFloat TCpu<AFloat>::SoftmaxCrossEntropy( + const TCpuMatrix<AFloat> &Y, + const TCpuMatrix<AFloat> &output) +{ + const AFloat *dataY = Y.GetRawDataPointer(); + const AFloat *dataOutput = output.GetRawDataPointer(); + std::vector<AFloat> temp(Y.GetNrows()); + size_t m = Y.GetNrows(); + size_t n = Y.GetNcols(); + AFloat norm = 1.0 / ((AFloat) m); + + auto f = [&dataY, &dataOutput, &temp, n, m](UInt_t workerID) + { + AFloat sum = 0.0; + for (size_t j = 0; j < n; j++) { + sum += exp(dataOutput[workerID + j * m]); + } + for (size_t j = 0; j < n; j++) { + temp[workerID] -= + dataY[workerID + j * m] * log(exp(dataOutput[workerID + j * m]) / sum); + } + return 0; + }; + + auto reduction = [](AFloat sum1, AFloat sum2) + { + return sum1 + sum2; + }; + + Y.GetThreadPool().Map(f, ROOT::TSeqI(Y.GetNrows())); + return norm * Y.GetThreadPool().Reduce(temp, reduction); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::SoftmaxCrossEntropyGradients( + TCpuMatrix<AFloat> & dY, + const TCpuMatrix<AFloat> & Y, + const TCpuMatrix<AFloat> & output) +{ + AFloat *dataDY = dY.GetRawDataPointer(); + const AFloat *dataY = Y.GetRawDataPointer(); + const AFloat *dataOutput = output.GetRawDataPointer(); + size_t m = Y.GetNrows(); + size_t n = Y.GetNcols(); + AFloat 
norm = 1.0 / ((AFloat) m); + + auto f = [&dataDY, &dataY, &dataOutput, norm, n, m](UInt_t workerID) + { + AFloat sum = 0.0; + AFloat sumY = 0.0; + for (size_t j = 0; j < n; j++) { + sum += exp(dataOutput[workerID + j * m]); + sumY += dataY[workerID + j * m]; + } + for (size_t j = 0; j < n; j++) { + dataDY[workerID + j * m] = + norm * (exp(dataOutput[workerID + j * m]) / sum * sumY - dataY[workerID + j * m]); + + } + return 0; + }; + + Y.GetThreadPool().Map(f, ROOT::TSeqI(Y.GetNrows())); +} + } // namespace DNN } // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/OutputFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/OutputFunctions.cxx index 4c3fd3994f7..96ad8cc1fb2 100644 --- a/tmva/tmva/src/DNN/Architectures/Cpu/OutputFunctions.cxx +++ b/tmva/tmva/src/DNN/Architectures/Cpu/OutputFunctions.cxx @@ -29,5 +29,29 @@ void TCpu<AFloat>::Sigmoid(TCpuMatrix<AFloat> & B, B.MapFrom(f, A); } +template<typename AFloat> +void TCpu<AFloat>::Softmax(TCpuMatrix<AFloat> & B, + const TCpuMatrix<AFloat> & A) +{ + const AFloat *dataA = A.GetRawDataPointer(); + AFloat *dataB = B.GetRawDataPointer(); + size_t n = A.GetNcols(); + size_t m = A.GetNrows(); + + auto f = [&dataA, &dataB, n, m](UInt_t workerID) + { + AFloat sum = 0.0; + for (size_t i = 0; i < n; i++) { + sum += exp(dataA[workerID + i * m]); + } + for (size_t i = 0; i < n; i++) { + dataB[workerID + i * m] = exp(dataA[workerID + i * m]) / sum; + } + return 0; + }; + + B.GetThreadExecutor().Map(f, ROOT::TSeqI(A.GetNrows())); +} + } // namespace DNN } // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu index 5654680158b..d9f0c7ae854 100644 --- a/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu +++ b/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu @@ -28,8 +28,8 @@ template<typename AFloat> void TCuda<AFloat>::IdentityDerivative(TCudaMatrix<AFloat> & B, const TCudaMatrix<AFloat> & A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(B); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(B); cudaStream_t s = A.GetComputeStream(); ::TMVA::DNN::Cuda::IdentityDerivative<<<gridDims, blockDims, 0, s>>>( B.GetDataPointer(), @@ -42,8 +42,8 @@ void TCuda<AFloat>::IdentityDerivative(TCudaMatrix<AFloat> & B, template<typename AFloat> void TCuda<AFloat>::Relu(TCudaMatrix<AFloat> & A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(A); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(A); cudaStream_t s = A.GetComputeStream(); ::TMVA::DNN::Cuda::Relu<<<gridDims, blockDims, 0, s>>>( A.GetDataPointer(), @@ -56,8 +56,8 @@ template<typename AFloat> void TCuda<AFloat>::ReluDerivative(TCudaMatrix<AFloat> & B, const TCudaMatrix<AFloat> & A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(B); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(B); cudaStream_t s = A.GetComputeStream(); ::TMVA::DNN::Cuda::ReluDerivative<<<gridDims, blockDims, 0, s>>>( B.GetDataPointer(), @@ -71,8 +71,8 @@ void TCuda<AFloat>::ReluDerivative(TCudaMatrix<AFloat> & B, template<typename AFloat> void TCuda<AFloat>::Sigmoid(TCudaMatrix<AFloat> & A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(A); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(A); cudaStream_t s = A.GetComputeStream(); 
::TMVA::DNN::Cuda::Sigmoid<<<gridDims, blockDims, 0, s>>>( A.GetDataPointer(), @@ -85,8 +85,8 @@ template<typename AFloat> void TCuda<AFloat>::SigmoidDerivative(TCudaMatrix<AFloat> & B, const TCudaMatrix<AFloat> & A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(B); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(B); cudaStream_t s = A.GetComputeStream(); ::TMVA::DNN::Cuda::SigmoidDerivative<<<gridDims, blockDims, 0, s>>>( B.GetDataPointer(), @@ -100,8 +100,8 @@ void TCuda<AFloat>::SigmoidDerivative(TCudaMatrix<AFloat> & B, template<typename AFloat> void TCuda<AFloat>::Tanh(TCudaMatrix<AFloat> & A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(A); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(A); cudaStream_t s = A.GetComputeStream(); ::TMVA::DNN::Cuda::Tanh<<<gridDims, blockDims, 0, s>>>( A.GetDataPointer(), @@ -114,8 +114,8 @@ template<typename AFloat> void TCuda<AFloat>::TanhDerivative(TCudaMatrix<AFloat> & B, const TCudaMatrix<AFloat> & A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(B); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(B); cudaStream_t s = A.GetComputeStream(); ::TMVA::DNN::Cuda::TanhDerivative<<<gridDims, blockDims, 0, s>>>( B.GetDataPointer(), @@ -129,8 +129,8 @@ void TCuda<AFloat>::TanhDerivative(TCudaMatrix<AFloat> & B, template<typename AFloat> void TCuda<AFloat>::SymmetricRelu(TCudaMatrix<AFloat> & A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(A); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(A); cudaStream_t s = A.GetComputeStream(); ::TMVA::DNN::Cuda::SymmetricRelu<<<gridDims, blockDims, 0, s>>>( A.GetDataPointer(), @@ -143,8 +143,8 @@ template<typename AFloat> void TCuda<AFloat>::SymmetricReluDerivative(TCudaMatrix<AFloat> & B, const TCudaMatrix<AFloat> & A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(B); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(B); cudaStream_t s = A.GetComputeStream(); ::TMVA::DNN::Cuda::SymmetricReluDerivative<<<gridDims, blockDims, 0, s>>>( B.GetDataPointer(), @@ -158,8 +158,8 @@ void TCuda<AFloat>::SymmetricReluDerivative(TCudaMatrix<AFloat> & B, template<typename AFloat> void TCuda<AFloat>::SoftSign(TCudaMatrix<AFloat> & A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(A); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(A); cudaStream_t s = A.GetComputeStream(); ::TMVA::DNN::Cuda::SoftSign<<<gridDims, blockDims, 0, s>>>( A.GetDataPointer(), @@ -172,8 +172,8 @@ template<typename AFloat> void TCuda<AFloat>::SoftSignDerivative(TCudaMatrix<AFloat> & B, const TCudaMatrix<AFloat> & A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(B); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(B); cudaStream_t s = A.GetComputeStream(); ::TMVA::DNN::Cuda::SoftSignDerivative<<<gridDims, blockDims, 0, s>>>( B.GetDataPointer(), @@ -187,8 +187,8 @@ void TCuda<AFloat>::SoftSignDerivative(TCudaMatrix<AFloat> & B, template<typename AFloat> void TCuda<AFloat>::Gauss(TCudaMatrix<AFloat> & A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(A); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(A); cudaStream_t s = A.GetComputeStream(); 
::TMVA::DNN::Cuda::Gauss<<<gridDims, blockDims, 0, s>>>( A.GetDataPointer(), @@ -201,8 +201,8 @@ template<typename AFloat> void TCuda<AFloat>::GaussDerivative(TCudaMatrix<AFloat> & B, const TCudaMatrix<AFloat> & A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(B); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(B); cudaStream_t s = A.GetComputeStream(); ::TMVA::DNN::Cuda::GaussDerivative<<<gridDims, blockDims, 0, s>>>( B.GetDataPointer(), diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu index f61391687f5..43ffa15d327 100644 --- a/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu +++ b/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu @@ -135,8 +135,8 @@ template<typename AFloat> void TCuda<AFloat>::Hadamard(TCudaMatrix<AFloat> & B, const TCudaMatrix<AFloat> &A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(B); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(B); cudaStream_t s = A.GetComputeStream(); ::TMVA::DNN::Cuda::Hadamard<<<gridDims, blockDims, 0, s>>>(B.GetDataPointer(), A.GetDataPointer(), @@ -149,8 +149,8 @@ void TCuda<AFloat>::Hadamard(TCudaMatrix<AFloat> & B, template<typename AFloat> AFloat TCuda<AFloat>::Sum(const TCudaMatrix<AFloat> & A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(A); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(A); cudaStream_t s = A.GetComputeStream(); TCudaMatrix<AFloat>::ResetDeviceReturn(); diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/CudaBuffers.cxx b/tmva/tmva/src/DNN/Architectures/Cuda/CudaBuffers.cxx index 68be42d7751..28bfcd936c9 100644 --- a/tmva/tmva/src/DNN/Architectures/Cuda/CudaBuffers.cxx +++ b/tmva/tmva/src/DNN/Architectures/Cuda/CudaBuffers.cxx @@ -35,7 +35,7 @@ void TCudaHostBuffer<AFloat>::TDestructor::operator()(AFloat ** devicePointer) //______________________________________________________________________________ template<typename AFloat> TCudaHostBuffer<AFloat>::TCudaHostBuffer(size_t size) - : fOffset(0), fComputeStream(0), fDestructor() + : fOffset(0), fSize(size), fComputeStream(0), fDestructor() { AFloat ** pointer = new AFloat * [1]; cudaMallocHost(pointer, size * sizeof(AFloat)); @@ -52,10 +52,11 @@ TCudaHostBuffer<AFloat>::operator AFloat * () const //______________________________________________________________________________ template<typename AFloat> TCudaHostBuffer<AFloat> TCudaHostBuffer<AFloat>::GetSubBuffer(size_t offset, - size_t /*size*/) + size_t size) { TCudaHostBuffer buffer = *this; buffer.fOffset = offset; + buffer.fSize = size; return buffer; } @@ -209,7 +210,7 @@ void TDataLoader<TMVAInput_t, TCuda<float>>::CopyOutput( size_t batchSize) { Event * event = fData.front(); - size_t n = (event->GetNTargets() == 0) ? 1 : event->GetNTargets(); + size_t n = buffer.GetSize() / batchSize; // Copy target(s). @@ -219,8 +220,18 @@ void TDataLoader<TMVAInput_t, TCuda<float>>::CopyOutput( for (size_t j = 0; j < n; j++) { // Copy output matrices. size_t bufferIndex = j * batchSize + i; + // Classification if (event->GetNTargets() == 0) { - buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0; + if (n == 1) { + // Binary. + buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0; + } else { + // Multiclass. 
+ buffer[bufferIndex] = 0.0; + if (j == event->GetClass()) { + buffer[bufferIndex] = 1.0; + } + } } else { buffer[bufferIndex] = static_cast<float>(event->GetTarget(j)); } @@ -298,7 +309,7 @@ void TDataLoader<TMVAInput_t, TCuda<double>>::CopyOutput( size_t batchSize) { Event * event = fData.front(); - size_t n = (event->GetNTargets() == 0) ? 1 : event->GetNTargets(); + size_t n = buffer.GetSize() / batchSize; // Copy target(s). @@ -308,8 +319,18 @@ void TDataLoader<TMVAInput_t, TCuda<double>>::CopyOutput( for (size_t j = 0; j < n; j++) { // Copy output matrices. size_t bufferIndex = j * batchSize + i; + // Classification if (event->GetNTargets() == 0) { - buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0; + // Binary. + if (n == 1) { + buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0; + } else { + // Multiclass. + buffer[bufferIndex] = 0.0; + if (j == event->GetClass()) { + buffer[bufferIndex] = 1.0; + } + } } else { buffer[bufferIndex] = event->GetTarget(j); } diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu b/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu index 0a3f27b3f8a..9d40161b371 100644 --- a/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu +++ b/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu @@ -130,8 +130,8 @@ inline void TCudaMatrix<AFloat>::InitializeCuda() template<typename AFloat> void TCudaMatrix<AFloat>::InitializeCurandStates() { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(*this); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(*this); CurandInitializationKernel<<<gridDims, blockDims>>>(time(nullptr), fCurandStates); } diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu index 501e2e6d249..6ae9213a0b7 100644 --- a/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu +++ b/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu @@ -25,8 +25,8 @@ template<typename AFloat> void TCuda<AFloat>::Dropout(TCudaMatrix<AFloat> &A, AFloat dropoutProbability) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(A); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(A); cudaStream_t s = A.GetComputeStream(); ::TMVA::DNN::Cuda::Dropout<<<gridDims, blockDims, 0, s>>>( A.GetDataPointer(), diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh b/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh index a0ace5f1bbd..e828b9038bd 100644 --- a/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh +++ b/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh @@ -304,6 +304,7 @@ __global__ void Sigmoid(AFloat * B, B[index] = sig; } } + //____________________________________________________________________________ template<typename AFloat> __global__ void SigmoidDerivative(AFloat * B, @@ -320,6 +321,25 @@ __global__ void SigmoidDerivative(AFloat * B, } } +//____________________________________________________________________________ +template<typename AFloat> +__global__ void Softmax(AFloat * B, + const AFloat * A, + int m, int n) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < m) { + AFloat sum = 0.0; + for (int j = 0; j < n; j++) { + sum += exp(A[i + j * n]); + } + for (int j = 0; j < n; j++) { + B[i + j * n] = exp(A[i * n + j]) / sum; + } + } +} + //____________________________________________________________________________ template<typename AFloat> __global__ void Tanh(AFloat * A, @@ -602,6 +622,59 @@ __global__ void CrossEntropyGradients(AFloat * dY, } } 
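// For reference, the softmax cross entropy kernels added below compute, per
// event row i (column-major storage with m rows = events and n columns =
// classes, matching the indexing used by the surrounding kernels):
//
//   p_ij  = exp(output_ij) / sum_k exp(output_ik)        (softmax)
//   loss  = -(1/m) * sum_i sum_j Y_ij * log(p_ij)
//   dY_ij =  (1/m) * (p_ij * sum_k Y_ik - Y_ij)
//
// For one-hot encoded targets Y the gradient reduces to (p_ij - Y_ij) / m.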
+//____________________________________________________________________________ +template<typename AFloat> +__global__ void SoftmaxCrossEntropy(AFloat * result, + const AFloat * Y, + const AFloat * output, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int tid = threadIdx.y; + + __shared__ AFloat sdata[TDevice::BlockSize]; + AFloat norm = 1.0 / ((AFloat) m); + + sdata[tid] = 0.0; + if (i < m) { + AFloat sum = 0.0; + for (int j = 0; j < n; j++) { + sum += exp(output[i + j * m]); + } + for (int j = 0; j < n; j++) { + sdata[tid] += Y[i + j * m] * log(exp(output[i + j * m]) / sum); + } + sdata[tid] *= - norm; + } else { + sdata[tid] = 0.0; + } + + ReduceSum(result, sdata); +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void SoftmaxCrossEntropyGradients(AFloat * dY, + const AFloat * Y, + const AFloat * output, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + AFloat norm = 1.0 / ((AFloat) m); + + if (i < m) { + AFloat sum = 0.0; + AFloat sumY = 0.0; + for (int j = 0; j < n; j++) { + sum += exp(output[i + j * m]); + sumY += Y[i + j * m]; + } + for (int j = 0; j < n; j++) { + dY[i + j * m] = norm * (sumY * exp(output[i + j * m]) / sum - Y[i + j * m]); + } + } +} + //____________________________________________________________________________ template<typename AFloat> __global__ void ReduceMatrix(AFloat *result, diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu index 8a242468ff7..c65bf9b8de3 100644 --- a/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu +++ b/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu @@ -28,8 +28,8 @@ template<typename AFloat> AFloat TCuda<AFloat>::MeanSquaredError(const TCudaMatrix<AFloat> & Y, const TCudaMatrix<AFloat> & output) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(Y); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(Y); cudaStream_t s = Y.GetComputeStream(); TCudaMatrix<AFloat>::ResetDeviceReturn(); ::TMVA::DNN::Cuda::MeanSquaredError<<<gridDims, blockDims, 0, s>>>( @@ -47,8 +47,8 @@ void TCuda<AFloat>::MeanSquaredErrorGradients(TCudaMatrix<AFloat> & dY, const TCudaMatrix<AFloat> & Y, const TCudaMatrix<AFloat> & output) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(Y); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(Y); cudaStream_t s = output.GetComputeStream(); ::TMVA::DNN::Cuda::MeanSquaredErrorGradients<<<gridDims, blockDims, 0, s>>>( dY.GetDataPointer(), @@ -64,8 +64,8 @@ template<typename AFloat> AFloat TCuda<AFloat>::CrossEntropy(const TCudaMatrix<AFloat> & Y, const TCudaMatrix<AFloat> & output) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(Y); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(Y); TCudaMatrix<AFloat>::ResetDeviceReturn(); cudaStream_t s = Y.GetComputeStream(); ::TMVA::DNN::Cuda::CrossEntropy<<<gridDims, blockDims, 0, s>>>( @@ -83,8 +83,8 @@ void TCuda<AFloat>::CrossEntropyGradients(TCudaMatrix<AFloat> & dY, const TCudaMatrix<AFloat> & Y, const TCudaMatrix<AFloat> & output) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(Y); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(Y); cudaStream_t s = output.GetComputeStream(); ::TMVA::DNN::Cuda::CrossEntropyGradients<<<gridDims, blockDims, 0, 
s>>>( dY.GetDataPointer(), @@ -95,5 +95,41 @@ void TCuda<AFloat>::CrossEntropyGradients(TCudaMatrix<AFloat> & dY, dY.SetComputeStream(s); } +//____________________________________________________________________________ +template<typename AFloat> +AFloat TCuda<AFloat>::SoftmaxCrossEntropy(const TCudaMatrix<AFloat> & Y, + const TCudaMatrix<AFloat> & output) +{ + dim3 blockDims = TDevice::BlockDims1D(); + dim3 gridDims = TDevice::GridDims1D(Y); + TCudaMatrix<AFloat>::ResetDeviceReturn(); + cudaStream_t s = Y.GetComputeStream(); + ::TMVA::DNN::Cuda::SoftmaxCrossEntropy<<<gridDims, blockDims, 0, s>>>( + TCudaMatrix<AFloat>::GetDeviceReturnPointer(), + Y.GetDataPointer(), + output.GetDataPointer(), + (int) Y.GetNrows(), + (int) Y.GetNcols()); + return TCudaMatrix<AFloat>::GetDeviceReturn(); +} + +//____________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::SoftmaxCrossEntropyGradients(TCudaMatrix<AFloat> & dY, + const TCudaMatrix<AFloat> & Y, + const TCudaMatrix<AFloat> & output) +{ + dim3 blockDims = TDevice::BlockDims1D(); + dim3 gridDims = TDevice::GridDims1D(Y); + cudaStream_t s = output.GetComputeStream(); + ::TMVA::DNN::Cuda::SoftmaxCrossEntropyGradients<<<gridDims, blockDims, 0, s>>>( + dY.GetDataPointer(), + Y.GetDataPointer(), + output.GetDataPointer(), + (int) Y.GetNrows(), + (int) Y.GetNcols()); + dY.SetComputeStream(s); +} + } // namespace DNN } // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu index 039fb27a8e3..303cc4a429d 100644 --- a/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu +++ b/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu @@ -27,8 +27,8 @@ template<typename AFloat> void TCuda<AFloat>::Sigmoid(TCudaMatrix<AFloat> & B, const TCudaMatrix<AFloat> & A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(B); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(B); cudaStream_t s = A.GetComputeStream(); ::TMVA::DNN::Cuda::Sigmoid<<<gridDims, blockDims, 0, s>>>(B.GetDataPointer(), A.GetDataPointer(), @@ -37,5 +37,20 @@ void TCuda<AFloat>::Sigmoid(TCudaMatrix<AFloat> & B, B.SetComputeStream(s); } +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::Softmax(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims1D(); + dim3 gridDims = TDevice::GridDims1D(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::Softmax<<<gridDims, blockDims, 0, s>>>(B.GetDataPointer(), + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); + B.SetComputeStream(s); +} + } // namespace DNN } // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu index 047c6411b52..35331b7855f 100644 --- a/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu +++ b/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu @@ -77,8 +77,8 @@ template<typename AFloat> void TCuda<AFloat>::AddRowWise(TCudaMatrix<AFloat> &Weights, const TCudaMatrix<AFloat> &theta) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(Weights); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(Weights); cudaStream_t s = Weights.GetComputeStream(); ::TMVA::DNN::Cuda::AddRowWise<<<gridDims, blockDims, 0, s>>>( Weights.GetDataPointer(), diff --git 
a/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu index 67851eaef8b..dce384ebb0b 100644 --- a/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu +++ b/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu @@ -26,8 +26,8 @@ namespace DNN { template<typename AFloat> AFloat TCuda<AFloat>::L1Regularization(const TCudaMatrix<AFloat> & A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(A); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(A); cudaStream_t s = A.GetComputeStream(); TCudaMatrix<AFloat>::ResetDeviceReturn(); ::TMVA::DNN::Cuda::AbsoluteSum<<<gridDims, blockDims, 0, s>>>( @@ -44,8 +44,8 @@ void TCuda<AFloat>::AddL1RegularizationGradients(TCudaMatrix<AFloat> & B, const TCudaMatrix<AFloat> & A, AFloat weightDecay) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(B); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(B); cudaStream_t s = A.GetComputeStream(); ::TMVA::DNN::Cuda::AddL1RegularizationGradients<<<gridDims, blockDims, 0, s>>>( B.GetDataPointer(), @@ -59,8 +59,8 @@ void TCuda<AFloat>::AddL1RegularizationGradients(TCudaMatrix<AFloat> & B, template<typename AFloat> AFloat TCuda<AFloat>::L2Regularization(const TCudaMatrix<AFloat> & A) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(A); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(A); cudaStream_t s = A.GetComputeStream(); TCudaMatrix<AFloat>::ResetDeviceReturn(); ::TMVA::DNN::Cuda::SquaredSum<<<gridDims, blockDims, 0, s>>>( @@ -77,8 +77,8 @@ void TCuda<AFloat>::AddL2RegularizationGradients(TCudaMatrix<AFloat> & B, const TCudaMatrix<AFloat> & A, AFloat weightDecay) { - dim3 blockDims = TDevice::BlockDims(); - dim3 gridDims = TDevice::GridDims(B); + dim3 blockDims = TDevice::BlockDims2D(); + dim3 gridDims = TDevice::GridDims2D(B); cudaStream_t s = A.GetComputeStream(); ::TMVA::DNN::Cuda::AddL2RegularizationGradients<<<gridDims, blockDims, 0, s>>>( B.GetDataPointer(), diff --git a/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx index aa0b144be20..e52943d272c 100644 --- a/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx +++ b/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx @@ -21,81 +21,129 @@ namespace TMVA namespace DNN { //______________________________________________________________________________ -template<typename Real_t> -Real_t TReference<Real_t>::MeanSquaredError(const TMatrixT<Real_t> &Y, - const TMatrixT<Real_t> &output) +template<typename AReal> +AReal TReference<AReal>::MeanSquaredError(const TMatrixT<AReal> &Y, + const TMatrixT<AReal> &output) { size_t m,n; m = Y.GetNrows(); n = Y.GetNcols(); - Real_t result = 0.0; + AReal result = 0.0; for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < n; j++) { - Real_t dY = (Y(i,j) - output(i,j)); + AReal dY = (Y(i,j) - output(i,j)); result += dY * dY; } } - result /= (Real_t) (m * n); + result /= (AReal) (m * n); return result; } //______________________________________________________________________________ -template<typename Real_t> -void TReference<Real_t>::MeanSquaredErrorGradients(TMatrixT<Real_t> & dY, - const TMatrixT<Real_t> & Y, - const TMatrixT<Real_t> & output) +template<typename AReal> +void TReference<AReal>::MeanSquaredErrorGradients(TMatrixT<AReal> & dY, + const TMatrixT<AReal> & Y, + const TMatrixT<AReal> & 
output) { size_t m,n; m = Y.GetNrows(); n = Y.GetNcols(); dY.Minus(Y, output); - dY *= - 2.0 / ((Real_t) (m*n)); + dY *= - 2.0 / ((AReal) (m*n)); } //______________________________________________________________________________ -template<typename Real_t> -Real_t TReference<Real_t>::CrossEntropy(const TMatrixT<Real_t> &Y, - const TMatrixT<Real_t> &output) +template<typename AReal> +AReal TReference<AReal>::CrossEntropy(const TMatrixT<AReal> &Y, + const TMatrixT<AReal> &output) { size_t m,n; m = Y.GetNrows(); n = Y.GetNcols(); - Real_t result = 0.0; + AReal result = 0.0; for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < n; j++) { - Real_t sig = 1.0 / (1.0 + std::exp(-output(i,j))); + AReal sig = 1.0 / (1.0 + std::exp(-output(i,j))); result += Y(i,j) * std::log(sig) + (1.0 - Y(i,j)) * std::log(1.0 - sig); } } - result /= - (Real_t) (m * n); + result /= - (AReal) (m * n); return result; } //______________________________________________________________________________ -template<typename Real_t> -void TReference<Real_t>::CrossEntropyGradients(TMatrixT<Real_t> & dY, - const TMatrixT<Real_t> & Y, - const TMatrixT<Real_t> & output) +template<typename AReal> +void TReference<AReal>::CrossEntropyGradients(TMatrixT<AReal> & dY, + const TMatrixT<AReal> & Y, + const TMatrixT<AReal> & output) { size_t m,n; m = Y.GetNrows(); n = Y.GetNcols(); - Real_t norm = 1.0 / ((Real_t) (m * n)); + AReal norm = 1.0 / ((AReal) (m * n)); for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < n; j++) { - Real_t y = Y(i,j); - Real_t sig = 1.0 / (1.0 + std::exp(-output(i,j))); + AReal y = Y(i,j); + AReal sig = 1.0 / (1.0 + std::exp(-output(i,j))); dY(i,j) = norm * (sig - y); } } } +//______________________________________________________________________________ +template<typename AReal> +AReal TReference<AReal>::SoftmaxCrossEntropy(const TMatrixT<AReal> &Y, + const TMatrixT<AReal> &output) +{ + size_t m,n; + m = Y.GetNrows(); + n = Y.GetNcols(); + AReal result = 0.0; + + for (size_t i = 0; i < m; i++) { + AReal sum = 0.0; + for (size_t j = 0; j < n; j++) { + sum += exp(output(i,j)); + } + for (size_t j = 0; j < n; j++) { + result += Y(i,j) * log(exp(output(i,j)) / sum); + } + } + result /= - m; + return result; +} + +//______________________________________________________________________________ +template<typename AReal> +void TReference<AReal>::SoftmaxCrossEntropyGradients(TMatrixT<AReal> & dY, + const TMatrixT<AReal> & Y, + const TMatrixT<AReal> & output) +{ + size_t m,n; + m = Y.GetNrows(); + n = Y.GetNcols(); + AReal norm = 1.0 / m ; + + for (size_t i = 0; i < m; i++) + { + AReal sum = 0.0; + AReal sumY = 0.0; + for (size_t j = 0; j < n; j++) { + sum += exp(output(i,j)); + sumY += Y(i,j); + } + for (size_t j = 0; j < n; j++) { + dY(i,j) = norm * (exp(output(i,j)) / sum * sumY - Y(i,j)); + } + } +} + } // namespace DNN } // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx index 731c95713d4..b635532a053 100644 --- a/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx +++ b/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx @@ -17,9 +17,9 @@ namespace TMVA { namespace DNN { -template<typename Real_t> -void TReference<Real_t>::Sigmoid(TMatrixT<Real_t> & B, - const TMatrixT<Real_t> & A) +template<typename AReal> +void TReference<AReal>::Sigmoid(TMatrixT<AReal> & B, + const TMatrixT<AReal> & A) { size_t m,n; m = A.GetNrows(); @@ -27,11 +27,30 @@ void 
TReference<Real_t>::Sigmoid(TMatrixT<Real_t> & B, for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < n; j++) { - Real_t sig = 1.0 / (1.0 + std::exp(-A(i,j))); + AReal sig = 1.0 / (1.0 + std::exp(-A(i,j))); B(i,j) = sig; } } } +template<typename AReal> +void TReference<AReal>::Softmax(TMatrixT<AReal> & B, + const TMatrixT<AReal> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + for (size_t i = 0; i < m; i++) { + AReal sum = 0.0; + for (size_t j = 0; j < n; j++) { + sum += exp(A(i,j)); + } + for (size_t j = 0; j < n; j++) { + B(i,j) = exp(A(i,j)) / sum; + } + } +} + } // namespace TMVA } // namespace DNN diff --git a/tmva/tmva/src/MethodDNN.cxx b/tmva/tmva/src/MethodDNN.cxx index 045b96cd27c..dcd6ef02b38 100644 --- a/tmva/tmva/src/MethodDNN.cxx +++ b/tmva/tmva/src/MethodDNN.cxx @@ -401,7 +401,12 @@ void TMVA::MethodDNN::ProcessOptions() fLayout = TMVA::MethodDNN::ParseLayoutString (fLayoutString); size_t inputSize = GetNVariables (); - size_t outputSize = (GetNTargets() == 0) ? 1 : GetNTargets(); + size_t outputSize = 1; + if (GetNTargets() != 0) { + outputSize = GetNTargets(); + } else if (DataInfo().GetNClasses() > 2) { + outputSize = DataInfo().GetNClasses(); + } fNet.SetBatchSize(1); fNet.SetInputWidth(inputSize); @@ -443,9 +448,9 @@ void TMVA::MethodDNN::ProcessOptions() fNet.SetLossFunction(ELossFunction::kCrossEntropy); } if (fErrorStrategy == "MUTUALEXCLUSIVE") { - Log () << kFatal << "MUTUALEXCLUSIVE not yet implemented." << Endl; + fNet.SetLossFunction(ELossFunction::kSoftmaxCrossEntropy); } - fOutputFunction = EOutputFunction::kSigmoid; + fOutputFunction = EOutputFunction::kSoftmax; } // @@ -543,6 +548,13 @@ void TMVA::MethodDNN::Train() outputValue, event->GetWeight())); trainPattern.back().addInput(1.0); + } else if (fAnalysisType == Types::kMulticlass) { + std::vector<Float_t> oneHot(DataInfo().GetNClasses(), 0.0); + oneHot[event->GetClass()] = 1.0; + trainPattern.push_back(Pattern (values.begin(), values.end(), + oneHot.cbegin(), oneHot.cend(), + event->GetWeight())); + trainPattern.back().addInput(1.0); } else { const std::vector<Float_t>& targets = event->GetTargets (); trainPattern.push_back(Pattern(values.begin(), @@ -563,6 +575,13 @@ void TMVA::MethodDNN::Train() outputValue, event->GetWeight())); testPattern.back().addInput(1.0); + } else if (fAnalysisType == Types::kMulticlass) { + std::vector<Float_t> oneHot(DataInfo().GetNClasses(), 0.0); + oneHot[event->GetClass()] = 1.0; + testPattern.push_back(Pattern (values.begin(), values.end(), + oneHot.cbegin(), oneHot.cend(), + event->GetWeight())); + testPattern.back().addInput(1.0); } else { const std::vector<Float_t>& targets = event->GetTargets (); testPattern.push_back(Pattern(values.begin(), @@ -601,6 +620,7 @@ void TMVA::MethodDNN::Train() switch(fOutputFunction) { case EOutputFunction::kIdentity: h = ModeOutputValues::DIRECT; break; case EOutputFunction::kSigmoid: h = ModeOutputValues::SIGMOID; break; + case EOutputFunction::kSoftmax: h = ModeOutputValues::SOFTMAX; break; } net.addLayer(Layer(fNet.GetLayer(i).GetWidth(), g, h)); } @@ -613,6 +633,9 @@ void TMVA::MethodDNN::Train() case ELossFunction::kCrossEntropy: net.setErrorFunction(ModeErrorFunction::CROSSENTROPY); break; + case ELossFunction::kSoftmaxCrossEntropy: + net.setErrorFunction(ModeErrorFunction::CROSSENTROPY_MUTUALEXCLUSIVE); + break; } switch(fWeightInitialization) { @@ -630,7 +653,6 @@ void TMVA::MethodDNN::Train() break; } - int idxSetting = 0; for (auto s : fTrainingSettings) { @@ -1039,7 +1061,7 @@ Double_t 
TMVA::MethodDNN::GetMvaValue( Double_t* /*errLower*/, Double_t* /*errUp } //______________________________________________________________________________ -const std::vector<Float_t> &TMVA::MethodDNN::GetRegressionValues() +const std::vector<Float_t> & TMVA::MethodDNN::GetRegressionValues() { size_t nVariables = GetEvent()->GetNVariables(); Matrix_t X(1, nVariables); @@ -1077,12 +1099,27 @@ const std::vector<Float_t> &TMVA::MethodDNN::GetRegressionValues() return *fRegressionReturnVal; } -const std::vector<Float_t> &TMVA::MethodDNN::GetMulticlassValues() +const std::vector<Float_t> & TMVA::MethodDNN::GetMulticlassValues() { - Log() << kFATAL << "ERROR: Multiclass classification not yet implemented." - << Endl; + size_t nVariables = GetEvent()->GetNVariables(); + Matrix_t X(1, nVariables); + Matrix_t YHat(1, DataInfo().GetNClasses()); + if (fMulticlassReturnVal == NULL) { + fMulticlassReturnVal = new std::vector<Float_t>(DataInfo().GetNClasses()); + } + + const std::vector<Float_t>& inputValues = GetEvent()->GetValues(); + for (size_t i = 0; i < nVariables; i++) { + X(0,i) = inputValues[i]; + } + + fNet.Prediction(YHat, X, fOutputFunction); + for (size_t i = 0; i < (size_t) YHat.GetNcols(); i++) { + (*fMulticlassReturnVal)[i] = YHat(0, i); + } return *fMulticlassReturnVal; } + //______________________________________________________________________________ void TMVA::MethodDNN::AddWeightsXMLTo( void* parent ) const { diff --git a/tmva/tmva/test/DNN/TestDerivatives.h b/tmva/tmva/test/DNN/TestDerivatives.h index 7240e23a951..1acf46f86bd 100644 --- a/tmva/tmva/test/DNN/TestDerivatives.h +++ b/tmva/tmva/test/DNN/TestDerivatives.h @@ -176,7 +176,8 @@ auto testLossFunctionGradients() std::vector<ELossFunction> LossFunctions = {ELossFunction::kMeanSquaredError, - ELossFunction::kCrossEntropy}; + ELossFunction::kCrossEntropy, + ELossFunction::kSoftmaxCrossEntropy}; Scalar_t error, maximum_error; maximum_error = 0.0; diff --git a/tmva/tmva/test/DNN/TestLossFunctions.cxx b/tmva/tmva/test/DNN/TestLossFunctions.cxx index 9ae39ae5ba6..0acc959acfb 100644 --- a/tmva/tmva/test/DNN/TestLossFunctions.cxx +++ b/tmva/tmva/test/DNN/TestLossFunctions.cxx @@ -53,7 +53,7 @@ int main() return 1; error = testCrossEntropyGradients<TReference<double>>(10); - std::cout << "Testing mean squared error gradient: "; + std::cout << "Testing cross entropy gradient: "; std::cout << "maximum relative error = " << error << std::endl; if (error > 1e-10) return 1; diff --git a/tmva/tmva/test/DNN/TestLossFunctions.h b/tmva/tmva/test/DNN/TestLossFunctions.h index d81bc3f0526..89b1118ce38 100644 --- a/tmva/tmva/test/DNN/TestLossFunctions.h +++ b/tmva/tmva/test/DNN/TestLossFunctions.h @@ -191,3 +191,102 @@ auto testCrossEntropyGradients(size_t ntests) } return maximumError; } + +//______________________________________________________________________________ +// +// Softmax Cross Entropy +//______________________________________________________________________________ + +template <typename Architecture> +auto testSoftmaxCrossEntropy(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + using Scalar_t = typename Architecture::Scalar_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> X(m, n); + TMatrixT<Double_t> Y(m, n); + TMatrixT<Double_t> Z(m, n); + + randomMatrix(X); + randomMatrix(Y); + + Matrix_t XArch(X); + Matrix_t YArch(Y); + + Scalar_t ce = 
evaluate<Architecture>(ELossFunction::kSoftmaxCrossEntropy, + YArch, XArch); + + Scalar_t ceReference = 0.0; + for (size_t j = 0; j < m; j++) { + Scalar_t sum = 0.0; + for (size_t k = 0; k < n; k++) { + sum += exp(X(j,k)); + } + for (size_t k = 0; k < n; k++) { + ceReference -= Y(j,k) * log(exp(X(j,k)) / sum); + } + } + ceReference /= (Scalar_t) m; + + Double_t error; + if (ceReference != 0.0) + error = std::fabs((ce - ceReference) / ceReference); + else + error = std::fabs(ce - ceReference); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +template <typename Architecture> +auto testSoftmaxCrossEntropyGradients(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + using Scalar_t = typename Architecture::Scalar_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = 8; //rand() % 100 + 1; + size_t n = 8; //rand() % 100 + 1; + + TMatrixT<Double_t> X(m, n); + TMatrixT<Double_t> Y(m, n); + TMatrixT<Double_t> ZRef(m, n); + + randomMatrix(X); + randomMatrix(Y); + + Matrix_t XArch(X); + Matrix_t YArch(Y); + Matrix_t ZArch(Y); + + evaluateGradients<Architecture>(ZArch, ELossFunction::kSoftmaxCrossEntropy, + YArch, XArch); + + for (size_t j = 0; j < m; j++) { + Scalar_t sum = 0.0; + Scalar_t sumY = 0.0; + for (size_t k = 0; k < n; k++) { + sum += exp(X(j,k)); + sumY += Y(j,k); + } + for (size_t k = 0; k < n; k++) { + Scalar_t sig = exp(X(j,k)) / sum; + ZRef(j,k) = (sig * sumY - Y(j,k)) / ((Scalar_t) m); + } + } + + TMatrixT<Double_t> Z(ZArch); + Double_t error = maximumRelativeError(Z, ZRef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} diff --git a/tmva/tmva/test/DNN/TestLossFunctionsCpu.cxx b/tmva/tmva/test/DNN/TestLossFunctionsCpu.cxx index 94b426d780f..5313dec72e9 100644 --- a/tmva/tmva/test/DNN/TestLossFunctionsCpu.cxx +++ b/tmva/tmva/test/DNN/TestLossFunctionsCpu.cxx @@ -50,13 +50,29 @@ int main() // error = testCrossEntropy<TCpu<Scalar_t>>(10); - std::cout << "Testing cross entropy loss: "; + std::cout << "Testing cross entropy loss: "; std::cout << "maximum relative error = " << print_error(error) << std::endl; if (error > 1e-3) return 1; error = testCrossEntropyGradients<TCpu<Scalar_t>>(10); - std::cout << "Testing mean squared error gradient: "; + std::cout << "Testing mean squared error gradient: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-3) + return 1; + + // + // Softmax Cross Entropy. 
+   //
+
+   error = testSoftmaxCrossEntropy<TCpu<Scalar_t>>(10);
+   std::cout << "Testing softmax cross entropy loss:     ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-3)
+      return 1;
+
+   error = testSoftmaxCrossEntropyGradients<TCpu<Scalar_t>>(10);
+   std::cout << "Testing softmax cross entropy gradient: ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-3)
+      return 1;
diff --git a/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx b/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx
index 1bf9ccae983..dd566b22dcb 100644
--- a/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx
+++ b/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx
@@ -32,13 +32,13 @@ int main()
    //
 
    error = testMeanSquaredError<TCuda<Scalar_t>>(10);
-   std::cout << "Testing mean squared error loss: ";
+   std::cout << "Testing mean squared error loss:        ";
    std::cout << "maximum relative error = " << print_error(error) << std::endl;
    if (error > 1e-3)
      return 1;
 
    error = testMeanSquaredErrorGradients<TCuda<Scalar_t>>(10);
-   std::cout << "Testing mean squared error gradient: ";
+   std::cout << "Testing mean squared error gradient:    ";
    std::cout << "maximum relative error = " << print_error(error) << std::endl;
    if (error > 1e-3)
       return 1;
@@ -48,13 +48,29 @@ int main()
    //
 
    error = testCrossEntropy<TCuda<Scalar_t>>(10);
-   std::cout << "Testing cross entropy loss: ";
+   std::cout << "Testing cross entropy loss:             ";
    std::cout << "maximum relative error = " << print_error(error) << std::endl;
    if (error > 1e-3)
       return 1;
 
    error = testCrossEntropyGradients<TCuda<Scalar_t>>(10);
-   std::cout << "Testing mean squared error gradient: ";
+   std::cout << "Testing cross entropy gradient:         ";
    std::cout << "maximum relative error = " << print_error(error) << std::endl;
    if (error > 1e-3)
       return 1;
+
+   //
+   // Softmax Cross Entropy.
+   //
+
+   error = testSoftmaxCrossEntropy<TCuda<Scalar_t>>(10);
+   std::cout << "Testing softmax cross entropy loss:     ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-3)
+      return 1;
+
+   error = testSoftmaxCrossEntropyGradients<TCuda<Scalar_t>>(10);
+   std::cout << "Testing softmax cross entropy gradient: ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-3)
+      return 1;
diff --git a/tutorials/tmva/TMVAClassification.C b/tutorials/tmva/TMVAClassification.C
index 0ee3b503a18..6b32c45df37 100644
--- a/tutorials/tmva/TMVAClassification.C
+++ b/tutorials/tmva/TMVAClassification.C
@@ -85,36 +85,36 @@ int TMVAClassification( TString myMethodList = "" )
    std::map<std::string,int> Use;
 
    // Cut optimisation
-   Use["Cuts"]            = 1;
-   Use["CutsD"]           = 1;
+   Use["Cuts"]            = 0;
+   Use["CutsD"]           = 0;
    Use["CutsPCA"]         = 0;
    Use["CutsGA"]          = 0;
    Use["CutsSA"]          = 0;
    //
    // 1-dimensional likelihood ("naive Bayes estimator")
-   Use["Likelihood"]      = 1;
+   Use["Likelihood"]      = 0;
    Use["LikelihoodD"]     = 0; // the "D" extension indicates decorrelated input variables (see option strings)
-   Use["LikelihoodPCA"]   = 1; // the "PCA" extension indicates PCA-transformed input variables (see option strings)
+   Use["LikelihoodPCA"]   = 0; // the "PCA" extension indicates PCA-transformed input variables (see option strings)
    Use["LikelihoodKDE"]   = 0;
    Use["LikelihoodMIX"]   = 0;
    //
    // Mutidimensional likelihood and Nearest-Neighbour methods
-   Use["PDERS"]           = 1;
+   Use["PDERS"]           = 0;
    Use["PDERSD"]          = 0;
    Use["PDERSPCA"]        = 0;
-   Use["PDEFoam"]         = 1;
+   Use["PDEFoam"]         = 0;
    Use["PDEFoamBoost"]    = 0; // uses generalised MVA method boosting
    Use["KNN"]             = 1; // k-nearest neighbour method
    //
    // Linear Discriminant Analysis
-   Use["LD"]              = 1; // Linear Discriminant identical to Fisher
+   Use["LD"]              = 0; // Linear Discriminant identical to Fisher
    Use["Fisher"]          = 0;
    Use["FisherG"]         = 0;
    Use["BoostedFisher"]   = 0; // uses generalised MVA method boosting
    Use["HMatrix"]         = 0;
    //
    // Function Discriminant analysis
-   Use["FDA_GA"]          = 1; // minimisation of user-defined function using Genetics Algorithm
+   Use["FDA_GA"]          = 0; // minimisation of user-defined function using Genetics Algorithm
    Use["FDA_SA"]          = 0;
    Use["FDA_MC"]          = 0;
    Use["FDA_MT"]          = 0;
@@ -124,16 +124,18 @@ int TMVAClassification( TString myMethodList = "" )
    // Neural Networks (all are feed-forward Multilayer Perceptrons)
    Use["MLP"]             = 0; // Recommended ANN
    Use["MLPBFGS"]         = 0; // Recommended ANN with optional training method
-   Use["MLPBNN"]          = 1; // Recommended ANN with BFGS training method and bayesian regulator
+   Use["MLPBNN"]          = 0; // Recommended ANN with BFGS training method and bayesian regulator
    Use["CFMlpANN"]        = 0; // Depreciated ANN from ALEPH
    Use["TMlpANN"]         = 0; // ROOT's own ANN
-   Use["DNN"]             = 0; // improved implementation of a NN
+   Use["DNN"]             = 0; // Deep Neural Network
+   Use["DNN_GPU"]         = 0; // CUDA-accelerated DNN training.
+   Use["DNN_CPU"]         = 0; // Multi-core accelerated DNN.
    //
    // Support Vector Machine
    Use["SVM"]             = 1;
    //
    // Boosted Decision Trees
-   Use["BDT"]             = 1; // uses Adaptive Boost
+   Use["BDT"]             = 0; // uses Adaptive Boost
    Use["BDTG"]            = 0; // uses Gradient Boost
    Use["BDTB"]            = 0; // uses Bagging
    Use["BDTD"]            = 0; // decorrelation + Adaptive Boost
@@ -436,39 +438,50 @@ int TMVAClassification( TString myMethodList = "" )
       factory->BookMethod( dataloader, TMVA::Types::kMLP, "MLPBNN", "H:!V:NeuronType=tanh:VarTransform=N:NCycles=600:HiddenLayers=N+5:TestRate=5:TrainingMethod=BFGS:UseRegulator" ); // BFGS training with bayesian regulators
 
-   // improved neural network implementation
+   // Multi-architecture DNN implementation.
    if (Use["DNN"]) {
-      /*
-      TString layoutString ("Layout=TANH|(N+100)*2,LINEAR");
-      TString layoutString ("Layout=SOFTSIGN|100,SOFTSIGN|50,SOFTSIGN|20,LINEAR");
-      TString layoutString ("Layout=RELU|300,RELU|100,RELU|30,RELU|10,LINEAR");
-      TString layoutString ("Layout=SOFTSIGN|50,SOFTSIGN|30,SOFTSIGN|20,SOFTSIGN|10,LINEAR");
-      TString layoutString ("Layout=TANH|50,TANH|30,TANH|20,TANH|10,LINEAR");
-      TString layoutString ("Layout=SOFTSIGN|50,SOFTSIGN|20,LINEAR");
-      */
-      TString layoutString ("Layout=TANH|100,TANH|50,TANH|10,LINEAR");
-
-      TString training0 ("LearningRate=1e-1,Momentum=0.0,Repetitions=1,ConvergenceSteps=300,BatchSize=20,TestRepetitions=15,WeightDecay=0.001,Regularization=NONE,DropConfig=0.0+0.5+0.5+0.5,DropRepetitions=1,Multithreading=True");
-      TString training1 ("LearningRate=1e-2,Momentum=0.5,Repetitions=1,ConvergenceSteps=300,BatchSize=30,TestRepetitions=7,WeightDecay=0.001,Regularization=L2,Multithreading=True,DropConfig=0.0+0.1+0.1+0.1,DropRepetitions=1");
-      TString training2 ("LearningRate=1e-2,Momentum=0.3,Repetitions=1,ConvergenceSteps=300,BatchSize=40,TestRepetitions=7,WeightDecay=0.0001,Regularization=L2,Multithreading=True");
-      TString training3 ("LearningRate=1e-3,Momentum=0.1,Repetitions=1,ConvergenceSteps=200,BatchSize=70,TestRepetitions=7,WeightDecay=0.0001,Regularization=NONE,Multithreading=True");
-
-      TString trainingStrategyString ("TrainingStrategy=");
-      trainingStrategyString += training0 + "|" + training1 + "|" + training2 + "|" + training3;
-
-
-      // TString nnOptions ("!H:V:VarTransform=Normalize:ErrorStrategy=CROSSENTROPY");
-      TString nnOptions ("!H:V:ErrorStrategy=CROSSENTROPY:VarTransform=G:WeightInitialization=XAVIERUNIFORM");
-      // TString nnOptions ("!H:V:VarTransform=Normalize:ErrorStrategy=CHECKGRADIENTS");
-      nnOptions.Append (":"); nnOptions.Append (layoutString);
-      nnOptions.Append (":"); nnOptions.Append (trainingStrategyString);
-
-      factory->BookMethod(dataloader, TMVA::Types::kDNN, "DNN", nnOptions ); // NN
+      // General layout.
+      TString layoutString ("Layout=TANH|128,TANH|128,TANH|128,LINEAR");
+
+      // Training strategies.
+      TString training0("LearningRate=1e-1,Momentum=0.9,Repetitions=1,"
+                        "ConvergenceSteps=20,BatchSize=256,TestRepetitions=10,"
+                        "WeightDecay=1e-4,Regularization=L2,"
+                        "DropConfig=0.0+0.5+0.5+0.5, Multithreading=True");
+      TString training1("LearningRate=1e-2,Momentum=0.9,Repetitions=1,"
+                        "ConvergenceSteps=20,BatchSize=256,TestRepetitions=10,"
+                        "WeightDecay=1e-4,Regularization=L2,"
+                        "DropConfig=0.0+0.0+0.0+0.0, Multithreading=True");
+      TString training2("LearningRate=1e-3,Momentum=0.0,Repetitions=1,"
+                        "ConvergenceSteps=20,BatchSize=256,TestRepetitions=10,"
+                        "WeightDecay=1e-4,Regularization=L2,"
+                        "DropConfig=0.0+0.0+0.0+0.0, Multithreading=True");
+      TString trainingStrategyString ("TrainingStrategy=");
+      trainingStrategyString += training0 + "|" + training1 + "|" + training2;
+
+      // General Options.
+      TString dnnOptions ("!H:V:ErrorStrategy=CROSSENTROPY:VarTransform=N:"
+                          "WeightInitialization=XAVIERUNIFORM");
+      dnnOptions.Append (":"); dnnOptions.Append (layoutString);
+      dnnOptions.Append (":"); dnnOptions.Append (trainingStrategyString);
+
+      // Standard implementation, no dependencies.
+      TString stdOptions = dnnOptions + ":Architecture=STANDARD";
+      factory->BookMethod(dataloader, TMVA::Types::kDNN, "DNN", stdOptions);
+
+      // Cuda implementation.
+      if (Use["DNN_GPU"]) {
+         TString gpuOptions = dnnOptions + ":Architecture=GPU";
+         factory->BookMethod(dataloader, TMVA::Types::kDNN, "DNN GPU", gpuOptions);
+      }
+      // Multi-core CPU implementation.
+      if (Use["DNN_CPU"]) {
+         TString cpuOptions = dnnOptions + ":Architecture=CPU";
+         factory->BookMethod(dataloader, TMVA::Types::kDNN, "DNN CPU", cpuOptions);
+      }
    }
-
-
    // CF(Clermont-Ferrand)ANN
    if (Use["CFMlpANN"])
       factory->BookMethod( dataloader, TMVA::Types::kCFMlpANN, "CFMlpANN", "!H:!V:NCycles=2000:HiddenLayers=N+1,N" ); // n_cycles:#nodes:#nodes:...
diff --git a/tutorials/tmva/TMVAMulticlass.C b/tutorials/tmva/TMVAMulticlass.C
index 5693874f99a..129723834f9 100644
--- a/tutorials/tmva/TMVAMulticlass.C
+++ b/tutorials/tmva/TMVAMulticlass.C
@@ -51,6 +51,7 @@ void TMVAMulticlass( TString myMethodList = "" )
    std::map<std::string,int> Use;
    Use["MLP"]             = 1;
    Use["BDTG"]            = 1;
+   Use["DNN"]             = 0;
    Use["FDA_GA"]          = 0;
    Use["PDEFoam"]         = 0;
    //---------------------------------------------------------------
@@ -130,6 +131,21 @@ void TMVAMulticlass( TString myMethodList = "" )
    if (Use["PDEFoam"]) // PDE-Foam approach
       factory->BookMethod( dataloader, TMVA::Types::kPDEFoam, "PDEFoam", "!H:!V:TailCut=0.001:VolFrac=0.0666:nActiveCells=500:nSampl=2000:nBin=5:Nmin=100:Kernel=None:Compress=T" );
 
+   if (Use["DNN"]) {
+      TString layoutString ("Layout=TANH|100,TANH|50,TANH|10,LINEAR");
+      TString training0 ("LearningRate=1e-1, Momentum=0.5, Repetitions=1, ConvergenceSteps=10,"
+                         " BatchSize=256, TestRepetitions=10, Multithreading=True");
+      TString training1 ("LearningRate=1e-2, Momentum=0.0, Repetitions=1, ConvergenceSteps=10,"
+                         " BatchSize=256, TestRepetitions=7, Multithreading=True");
+      TString trainingStrategyString ("TrainingStrategy=");
+      trainingStrategyString += training0 + "|" + training1;
+      TString nnOptions ("!H:V:ErrorStrategy=CROSSENTROPY:VarTransform=N:"
+                         "WeightInitialization=XAVIERUNIFORM:Architecture=STANDARD");
+      nnOptions.Append (":"); nnOptions.Append (layoutString);
+      nnOptions.Append (":"); nnOptions.Append (trainingStrategyString);
+      factory->BookMethod(dataloader, TMVA::Types::kDNN, "DNN", nnOptions );
+   }
+
    // Train MVAs using the set of training events
    factory->TrainAllMethods();
 
-- 
GitLab
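For reference, the quantity exercised by testSoftmaxCrossEntropy and testSoftmaxCrossEntropyGradients above is the mean softmax cross entropy over the rows of the linear activations X with targets Y, and its gradient dL/dX(j,k) = (softmax(X)(j,k) * sum_l Y(j,l) - Y(j,k)) / m, which reduces to the familiar "softmax minus target" form when each row of Y is one-hot. The standalone sketch below is illustrative only and is not part of the patch: it uses plain C++ containers instead of TMatrixT/TCpuMatrix, and the helper names softmaxCrossEntropy and softmaxCrossEntropyGradient are hypothetical, chosen just to mirror the reference loops in the tests.

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

using Matrix = std::vector<std::vector<double>>;

// Mean softmax cross entropy over the rows of X (linear activations) with
// targets Y, matching the reference loop in testSoftmaxCrossEntropy:
//   L = -(1/m) * sum_j sum_k Y(j,k) * log( exp(X(j,k)) / sum_l exp(X(j,l)) )
double softmaxCrossEntropy(const Matrix &X, const Matrix &Y)
{
   double loss = 0.0;
   for (std::size_t j = 0; j < X.size(); j++) {
      double sum = 0.0;
      for (double x : X[j]) sum += std::exp(x);
      for (std::size_t k = 0; k < X[j].size(); k++)
         loss -= Y[j][k] * std::log(std::exp(X[j][k]) / sum);
   }
   return loss / X.size();
}

// Gradient with respect to X(j,k), matching ZRef in
// testSoftmaxCrossEntropyGradients:
//   dL/dX(j,k) = ( softmax(X)(j,k) * sum_l Y(j,l) - Y(j,k) ) / m
double softmaxCrossEntropyGradient(const Matrix &X, const Matrix &Y,
                                   std::size_t j, std::size_t k)
{
   double sum = 0.0, sumY = 0.0;
   for (double x : X[j]) sum += std::exp(x);
   for (double y : Y[j]) sumY += y;
   const double p = std::exp(X[j][k]) / sum;
   return (p * sumY - Y[j][k]) / static_cast<double>(X.size());
}

int main()
{
   // Two events, three classes; one-hot targets as used for multi-class
   // classification.
   Matrix X = {{1.0, 2.0, 0.5}, {0.0, 0.0, 3.0}};
   Matrix Y = {{0.0, 1.0, 0.0}, {0.0, 0.0, 1.0}};
   std::printf("loss       = %f\n", softmaxCrossEntropy(X, Y));
   std::printf("dL/dX(0,0) = %f\n", softmaxCrossEntropyGradient(X, Y, 0, 0));
   return 0;
}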