diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f5b35c541e049213f12f1d7a218e53ff642dd7f9..ccabaf87954af8a042366859112f04f6fdfa98f9 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -145,6 +145,16 @@ ROOT_ADD_TEST(test-stressvector-interpreted COMMAND ${ROOT_root_CMD} -b -q -l ${ FAILREGEX "FAILED|Error in" DEPENDS test-stressvector) #--stressTMVA-------------------------------------------------------------------------------------- +FIND_PACKAGE(CUDA) +if (CUDA_FOUND) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDNNCUDA") +endif (CUDA_FOUND) + +FIND_PACKAGE(BLAS) +if (BLAS_FOUND AND imt) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDNNCPU") +endif (BLAS_FOUND AND imt) + if(ROOT_tmva_FOUND) ROOT_EXECUTABLE(stressTMVA stressTMVA.cxx LIBRARIES TMVA) ROOT_ADD_TEST(test-stresstmva COMMAND stressTMVA -b) diff --git a/test/stressTMVA.cxx b/test/stressTMVA.cxx index 262b11c6052755c81724d0ff652c0e0f50b62543..5ab1a98e8b243588eb3548624c01408b56d6542e 100644 --- a/test/stressTMVA.cxx +++ b/test/stressTMVA.cxx @@ -2921,6 +2921,36 @@ void addClassificationTests( UnitTestSuite& TMVA_test, bool full=true) "!H:!V:NTrees=400:nEventsMin=200:MaxDepth=3:BoostType=AdaBoost:SeparationType=GiniIndex:nCuts=10:PruneMethod=NoPruning:VarTransform=Decorrelate" , 0.88, 0.98) ); if (full) TMVA_test.addTest(new MethodUnitTestWithROCLimits( TMVA::Types::kRuleFit, "RuleFit", "H:!V:RuleFitModule=RFTMVA:Model=ModRuleLinear:MinImp=0.001:RuleMinDist=0.001:NTrees=20:fEventsMin=0.01:fEventsMax=0.5:GDTau=-1.0:GDTauPrec=0.01:GDStep=0.01:GDNSteps=10000:GDErrScale=1.02" , 0.88, 0.98) ); + + TString config = "!H:V:VarTransform=N:ErrorStrategy=CROSSENTROPY" + ":WeightInitialization=XAVIER" + ":Layout=LINEAR|64,LINEAR|64,LINEAR|64,LINEAR" + ":TrainingStrategy=LearningRate=0.1,Momentum=0.9, ConvergenceSteps=20," + "BatchSize=256,Regularization=None,TestRepetitions=5, Multithreading=True" + "|LearningRate=0.01,Momentum=0.5,ConvergenceSteps=20,BatchSize=256," + "Regularization=None,TestRepetitions=5, Multithreading=True" + "|LearningRate=0.003,Momentum=0.5,ConvergenceSteps=20,BatchSize=256," + "Regularization=None,TestRepetitions=5, Multithreading=True" + "|LearningRate=0.001,Momentum=0.0,ConvergenceSteps=20,BatchSize=256," + "Regularization=None,TestRepetitions=5, Multithreading=True"; + TString configStandard = "Architecture=STANDARD:" + config; + TString configCpu = "Architecture=CPU:" + config; + TString configGpu = "Architecture=GPU:" + config; + + + TMVA_test.addTest(new MethodUnitTestWithROCLimits( + TMVA::Types::kDNN, "DNN Standard", + configStandard, 0.85, 0.98)); + #ifdef DNNCPU + TMVA_test.addTest(new MethodUnitTestWithROCLimits( + TMVA::Types::kDNN, "DNN CPU", configCpu, 0.85, 0.98) + ); + #endif + #ifdef DNNCUDA + TMVA_test.addTest(new MethodUnitTestWithROCLimits( + TMVA::Types::kDNN, "DNN GPU", configGpu, 0.85, 0.98) + ); + #endif } void addRegressionTests( UnitTestSuite& TMVA_test, bool full=true) diff --git a/tmva/tmva/CMakeLists.txt b/tmva/tmva/CMakeLists.txt index 06966edd91990c597da7d3fbbea8a726a26b20e4..0c5637f01ab65bacfde24482c8e5df15bc34eef2 100644 --- a/tmva/tmva/CMakeLists.txt +++ b/tmva/tmva/CMakeLists.txt @@ -36,8 +36,10 @@ set(headers4 TNeuron.h TSynapse.h TActivationChooser.h TActivation.h TActivation VariableTransformBase.h VariableIdentityTransform.h VariableDecorrTransform.h VariablePCATransform.h VariableGaussTransform.h VariableNormalizeTransform.h VariableRearrangeTransform.h VariableTransform.h ROCCalc.h ROCCurve.h) -set(headers5 Event.h Results.h ResultsClassification.h 
ResultsRegression.h ResultsMulticlass.h VariableInfo.h ClassInfo.h - DataLoader.h DataSet.h DataSetInfo.h DataInputHandler.h DataSetManager.h DataSetFactory.h) +set(headers5 Event.h Results.h ResultsClassification.h ResultsRegression.h + ResultsMulticlass.h VariableInfo.h ClassInfo.h DataLoader.h DataSet.h + DataSetInfo.h DataInputHandler.h DataSetManager.h DataSetFactory.h) + #---Need to suffix each header name by TMVA/ ----------------- foreach(hs headers1 headers2 headers3 headers4 headers5) @@ -46,9 +48,40 @@ foreach(hs headers1 headers2 headers3 headers4 headers5) endforeach() endforeach() +SET(DNN_FILES src/DNN/Architectures/Reference.cxx) +SET(DNN_CUDA_FILES src/DNN/Architectures/Cuda.cu + src/DNN/Architectures/Cuda/CudaBuffers.cxx + src/DNN/Architectures/Cuda/CudaMatrix.cu) +SET(DNN_CPU_FILES src/DNN/Architectures/Cpu.cxx + src/DNN/Architectures/Cpu/CpuBuffer.cxx + src/DNN/Architectures/Cpu/CpuMatrix.cxx) + +#---Handle CUDA dependent code. ----------------- +FIND_PACKAGE(CUDA) +IF (CUDA_FOUND) + CUDA_INCLUDE_DIRECTORIES(${ROOT_INCLUDE_DIRS}) + CUDA_ADD_LIBRARY(dnn_cuda ${DNN_CUDA_FILES}) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDNNCUDA") + SET(DNN_CUDA_LIBRARIES dnn_cuda ${CUDA_CUBLAS_LIBRARIES}) +ELSE (CUDA_FOUND) + SET(DNN_CUDA_LIBRARIES) +ENDIF(CUDA_FOUND) + +#---Handle BLAS dependent code. ----------------- +FIND_PACKAGE(BLAS) +IF (BLAS_FOUND AND imt) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDNNCPU") + SET(DNN_CPU_LIBRARIES MathCore Matrix ${BLAS_LIBRARIES} tbb + ${CMAKE_THREAD_LIBS_INIT}) +ELSE (BLAS_FOUND AND imt) + SET(DNN_CPU_LIBRARIES) + SET(DNN_CPU_FILES) +ENDIF(BLAS_FOUND AND imt) + ROOT_GENERATE_DICTIONARY(G__TMVA ${theaders1} ${theaders2} ${theaders3} ${theaders4} ${theaders5} MODULE TMVA LINKDEF LinkDef.h OPTIONS "-writeEmptyRootPCM") -ROOT_LINKER_LIBRARY(TMVA *.cxx G__TMVA.cxx LIBRARIES Core +ROOT_LINKER_LIBRARY(TMVA *.cxx G__TMVA.cxx ${DNN_FILES} ${DNN_CPU_FILES} + LIBRARIES Core ${DNN_CUDA_LIBRARIES} ${DNN_CPU_LIBRARIES} DEPENDENCIES RIO Hist Tree TreePlayer MLP Minuit XMLIO) install(DIRECTORY inc/TMVA/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/TMVA @@ -66,7 +99,7 @@ if(NOT gnuinstall) PATTERN "data" EXCLUDE) endif() -#ROOT_ADD_TEST_SUBDIRECTORY(test) +ROOT_ADD_TEST_SUBDIRECTORY(test/DNN) diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h new file mode 100644 index 0000000000000000000000000000000000000000..eb3217aadac03a2eb88e6a5d9a58416fa0c52043 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h @@ -0,0 +1,287 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 05/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + + ////////////////////////////////////////////////////////////////// +// Definition of the TCpu architecture, which provides a // + // multi-threaded CPU implementation of the low-level interface // + // networks for Cpus using tbb // + ////////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CPU +#define TMVA_DNN_ARCHITECTURES_CPU + +#include "Cpu/CpuBuffer.h" +#include "Cpu/CpuMatrix.h" + +namespace TMVA +{ +namespace DNN +{ + +/** The TCpu architecture class. + * + * Low-level interface class for multi-threaded CPU architectures. 
Contains as + * public types the declaration of the scalar, matrix and data loader types + * for this architecture as well as the remaining functions in the low-level + * interface in the form of static members. + */ +template<typename AReal> +class TCpu +{ +public: + + using Scalar_t = AReal; + using Matrix_t = TCpuMatrix<AReal>; + using HostBuffer_t = TCpuBuffer<AReal>; + using DeviceBuffer_t = TCpuBuffer<AReal>; + + //____________________________________________________________________________ + // + // Propagation + //____________________________________________________________________________ + + /** @name Forward Propagation + * Low-level functions required for the forward propagation of activations + * through the network. + */ + ///@{ + /** Matrix-multiply \p input with the transpose of \pweights and + * write the results into \p output. */ + static void MultiplyTranspose(TCpuMatrix<Scalar_t> &output, + const TCpuMatrix<Scalar_t> &input, + const TCpuMatrix<Scalar_t> &weights); + /** Add the vectors biases row-wise to the matrix output */ + static void AddRowWise(TCpuMatrix<Scalar_t> &output, + const TCpuMatrix<Scalar_t> &biases); + ///@} + + /** @name Backward Propagation + * Low-level functions required for the forward propagation of activations + * through the network. + */ + ///@{ + /** Perform the complete backward propagation step. If the provided + * \p activationGradientsBackward matrix is not empty, compute the + * gradients of the objective function with respect to the activations + * of the previous layer (backward direction). + * Also compute the weight and the bias gradients. Modifies the values + * in \p df and thus produces only a valid result, if it is applied the + * first time after the corresponding forward propagation has been per- + * formed. */ + static void Backward(TCpuMatrix<Scalar_t> & activationGradientsBackward, + TCpuMatrix<Scalar_t> & weightGradients, + TCpuMatrix<Scalar_t> & biasGradients, + TCpuMatrix<Scalar_t> & df, + const TCpuMatrix<Scalar_t> & activationGradients, + const TCpuMatrix<Scalar_t> & weights, + const TCpuMatrix<Scalar_t> & activationBackward); + /** Adds a the elements in matrix B scaled by c to the elements in + * the matrix A. This is required for the weight update in the gradient + * descent step.*/ + static void ScaleAdd(TCpuMatrix<Scalar_t> & A, + const TCpuMatrix<Scalar_t> & B, + Scalar_t beta = 1.0); + + static void Copy(TCpuMatrix<Scalar_t> & B, + const TCpuMatrix<Scalar_t> & A); + ///@} + + //____________________________________________________________________________ + // + // Activation Functions + //____________________________________________________________________________ + + /** @name Activation Functions + * For each activation function, the low-level interface contains two routines. + * One that applies the acitvation function to a matrix and one that evaluate + * the derivatives of the activation function at the elements of a given matrix + * and writes the results into the result matrix. 
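+ *
+ * A minimal usage sketch (illustrative only; the matrix shape and the use of
+ * Double_t are assumptions, not part of the interface):
+ * \code
+ * TCpuMatrix<Double_t> A(32, 16);          // layer activations
+ * TCpuMatrix<Double_t> dA(32, 16);         // buffer for the derivatives
+ * TCpu<Double_t>::ReluDerivative(dA, A);   // dA_ij = f'(A_ij)
+ * TCpu<Double_t>::Relu(A);                 // A_ij  = f(A_ij), applied in place
+ * \endcode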
+ */ + ///@{ + static void IdentityDerivative(TCpuMatrix<Scalar_t> & B, + const TCpuMatrix<Scalar_t> &A); + + static void Relu(TCpuMatrix<Scalar_t> & B); + static void ReluDerivative(TCpuMatrix<Scalar_t> & B, + const TCpuMatrix<Scalar_t> & A); + + static void Sigmoid(TCpuMatrix<Scalar_t> & B); + static void SigmoidDerivative(TCpuMatrix<Scalar_t> & B, + const TCpuMatrix<Scalar_t> & A); + + static void Tanh(TCpuMatrix<Scalar_t> & B); + static void TanhDerivative(TCpuMatrix<Scalar_t> & B, + const TCpuMatrix<Scalar_t> & A); + + static void SymmetricRelu(TCpuMatrix<Scalar_t> & B); + static void SymmetricReluDerivative(TCpuMatrix<Scalar_t> & B, + const TCpuMatrix<Scalar_t> & A); + + static void SoftSign(TCpuMatrix<Scalar_t> & B); + static void SoftSignDerivative(TCpuMatrix<Scalar_t> & B, + const TCpuMatrix<Scalar_t> & A); + + static void Gauss(TCpuMatrix<Scalar_t> & B); + static void GaussDerivative(TCpuMatrix<Scalar_t> & B, + const TCpuMatrix<Scalar_t> & A); + ///@} + + //____________________________________________________________________________ + // + // Loss Functions + //____________________________________________________________________________ + + /** @name Loss Functions + * Loss functions compute a scalar value given the \p output of the network + * for a given training input and the expected network prediction \p Y that + * quantifies the quality of the prediction. For each function also a routing + * that computes the gradients (suffixed by Gradients) must be provided for + * the starting of the backpropagation algorithm. + */ + ///@{ + + static Scalar_t MeanSquaredError(const TCpuMatrix<Scalar_t> &Y, + const TCpuMatrix<Scalar_t> &output); + static void MeanSquaredErrorGradients(TCpuMatrix<Scalar_t> & dY, + const TCpuMatrix<Scalar_t> &Y, + const TCpuMatrix<Scalar_t> &output); + + /** Sigmoid transformation is implicitly applied, thus \p output should + * hold the linear activations of the last layer in the net. */ + static Scalar_t CrossEntropy(const TCpuMatrix<Scalar_t> &Y, + const TCpuMatrix<Scalar_t> &output); + + static void CrossEntropyGradients(TCpuMatrix<Scalar_t> & dY, + const TCpuMatrix<Scalar_t> & Y, + const TCpuMatrix<Scalar_t> & output); + ///@} + + //____________________________________________________________________________ + // + // Output Functions + //____________________________________________________________________________ + + /** @name Output Functions + * Output functions transform the activations \p output of the + * output layer in the network to a valid prediction \p YHat for + * the desired usage of the network, e.g. the identity function + * for regression or the sigmoid transformation for two-class + * classification. + */ + ///@{ + static void Sigmoid(TCpuMatrix<Scalar_t> &YHat, + const TCpuMatrix<Scalar_t> & ); + ///@} + + //____________________________________________________________________________ + // + // Regularization + //____________________________________________________________________________ + + /** @name Regularization + * For each regularization type two functions are required, one named + * <tt><Type>Regularization</tt> that evaluates the corresponding + * regularization functional for a given weight matrix and the + * <tt>Add<Type>RegularizationGradients</tt>, that adds the regularization + * component in the gradients to the provided matrix. 
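+ *
+ * Sketch of how the pair is typically combined (illustrative only; W and dW
+ * are assumed weight and weight-gradient matrices, lambda a hypothetical
+ * weight-decay factor):
+ * \code
+ * Double_t lambda  = 1e-4;
+ * Double_t penalty = TCpu<Double_t>::L2Regularization(W);
+ * TCpu<Double_t>::AddL2RegularizationGradients(dW, W, lambda);
+ * \endcode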
+ */ + ///@{ + + static Scalar_t L1Regularization(const TCpuMatrix<Scalar_t> & W); + static void AddL1RegularizationGradients(TCpuMatrix<Scalar_t> & A, + const TCpuMatrix<Scalar_t> & W, + Scalar_t weightDecay); + + static Scalar_t L2Regularization(const TCpuMatrix<Scalar_t> & W); + static void AddL2RegularizationGradients(TCpuMatrix<Scalar_t> & A, + const TCpuMatrix<Scalar_t> & W, + Scalar_t weightDecay); + ///@} + + //____________________________________________________________________________ + // + // Initialization + //____________________________________________________________________________ + + /** @name Initialization + * For each initialization method, one function in the low-level interface + * is provided. The naming scheme is <p>Initialize<Type></p> for a given + * initialization method Type. + */ + ///@{ + + static void InitializeGauss(TCpuMatrix<Scalar_t> & A); + static void InitializeUniform(TCpuMatrix<Scalar_t> & A); + static void InitializeIdentity(TCpuMatrix<Scalar_t> & A); + static void InitializeZero(TCpuMatrix<Scalar_t> & A); + + ///@} + + //____________________________________________________________________________ + // + // Dropout + //____________________________________________________________________________ + + /** @name Dropout + */ + ///@{ + + /** Apply dropout with activation probability \p p to the given + * matrix \p A and scale the result by reciprocal of \p p. */ + static void Dropout(TCpuMatrix<Scalar_t> & A, Scalar_t p); + + ///@} + + //____________________________________________________________________________ + // + // Additional Arithmetic Functions + //____________________________________________________________________________ + + /** @name Additional Arithmetic Functions + * + * Additional arithmetic on CUDA matrices used to implement the low-level + * interface. + */ + ///@{ + + /** Standard multiplication of two matrices \p A and \p B with the result being + * written into C. + */ + static void Multiply(TCpuMatrix<Scalar_t> &C, + const TCpuMatrix<Scalar_t> &A, + const TCpuMatrix<Scalar_t> &B); + /** Matrix multiplication of two matrices \p A and \p B^T (transposed) with the + * result being written into C. + */ + static void TransposeMultiply(TCpuMatrix<Scalar_t> &output, + const TCpuMatrix<Scalar_t> &input, + const TCpuMatrix<Scalar_t> &Weights); + /** In-place Hadamard (element-wise) product of matrices \p A and \p B + * with the result being written into \p A. + */ + static void Hadamard(TCpuMatrix<Scalar_t> &A, + const TCpuMatrix<Scalar_t> &B); + + /** Sum columns of (m x n) matrixx \p A and write the results into the first + * m elements in \p A. + */ + static void SumColumns(TCpuMatrix<Scalar_t> &B, + const TCpuMatrix<Scalar_t> &A); + + /** Compute the sum of all elements in \p A */ + static Scalar_t Sum(const TCpuMatrix<Scalar_t> &A); + +}; + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/Blas.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/Blas.h new file mode 100644 index 0000000000000000000000000000000000000000..8288db18ba341150eb37f26affb3c9f9dc6839c0 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/Blas.h @@ -0,0 +1,171 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 20/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. 
* + *************************************************************************/ + +/////////////////////////////////////////////////////////////////// +// Declarations of the BLAS functions used for the forward and // +// backward propagation of activation through neural networks on // +// CPUs. // +/////////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CPU_BLAS +#define TMVA_DNN_ARCHITECTURES_CPU_BLAS + +#include <iostream> + +// External Library Routines +//____________________________________________________________________________ +extern "C" void saxpy_(const int * n, const float * alpha, const float * x, + const int * incx, float * y, const int * incy); +extern "C" void daxpy_(const int * n, const double * alpha, const double * x, + const int * incx, double * y, const int * incy); +extern "C" void sger_(const int * m, const int * n, const float * alpha, + const float * x, const int * incx, + const float * y, const int * incy, + float * A, const int * lda); +extern "C" void dger_(const int * m, const int * n, const double * alpha, + const double * x, const int * incx, + const double * y, const int * incy, + double * A, const int * lda); +extern "C" void sgemv_(const char * trans, const int * m, const int * n, + const float * alpha, const float * A, const int * lda, + const float * x, const int * incx, + const float * beta, float * y, const int * incy); +extern "C" void dgemv_(const char * trans, const int * m, const int * n, + const double * alpha, const double * A, const int * lda, + const double * x, const int * incx, + const double * beta, double * y, const int * incy); +extern "C" void dgemm_(const char * transa, const char * transb, + const int * m, const int * n, const int * k, + const double * alpha, const double * A, const int * lda, + const double * B, const int * ldb, const double * beta, + double * C, const int * ldc); +extern "C" void sgemm_(const char * transa, const char * transb, + const int * m, const int * n, const int * k, + const float * alpha, const float * A, const int * lda, + const float * B, const int * ldb, const float * beta, + float * C, const int * ldc); + +namespace TMVA +{ +namespace DNN +{ +namespace Blas +{ + +// Type-Generic Wrappers +//____________________________________________________________________________ +/** Add the vector \p x scaled by \p alpha to \p y scaled by \beta */ +template <typename Real_t> +inline void Axpy(const int * n, const Real_t * alpha, + const Real_t * x, const int * incx, + Real_t * y, const int * incy); + +/** Multiply the vector \p x with the matrix \p A and store the result in \p y. */ +template <typename Real_t> +inline void Gemv(const char *trans, const int * m, const int * n, + const Real_t * alpha, const Real_t * A, const int * lda, + const Real_t * x, const int * incx, + const Real_t * beta, Real_t * y, const int * incy); + +/** Multiply the matrix \p A with the matrix \p B and store the result in \p C. */ +template <typename Real_t> +inline void Gemm(const char *transa, const char *transb, + const int * m, const int * n, const int* k, + const Real_t * alpha, const Real_t * A, const int * lda, + const Real_t * B, const int * ldb, const Real_t * beta, + Real_t * C, const int * ldc); + +/** Add the outer product of \p x and \p y to the matrix \p A. 
*/ +template <typename Real_t> +inline void Ger(const int * m, const int * n, const Real_t * alpha, + const Real_t * x, const int * incx, + const Real_t * y, const int * incy, + Real_t * A, const int * lda); + +// Specializations +//____________________________________________________________________________ +template<> +inline void Axpy<double>(const int * n, const double * alpha, + const double * x, const int * incx, + double * y, const int * incy) +{ + daxpy_(n, alpha, x, incx, y, incy); +} + +template<> +inline void Axpy<float>(const int * n, const float * alpha, + const float * x, const int * incx, + float * y, const int * incy) +{ + saxpy_(n, alpha, x, incx, y, incy); +} + +template<> +inline void Gemv<double>(const char *trans, const int * m, const int * n, + const double * alpha, const double * A, const int * lda, + const double * x, const int * incx, + const double * beta, double * y, const int * incy) +{ + dgemv_(trans, m, n, alpha, A, lda, x, incx, beta, y, incy); +} + +template<> +inline void Gemv<float>(const char *trans, const int * m, const int * n, + const float * alpha, const float * A, const int * lda, + const float * x, const int * incx, + const float * beta, float * y, const int * incy) +{ + sgemv_(trans, m, n, alpha, A, lda, x, incx, beta, y, incy); +} + +template<> +inline void Gemm<double>(const char *transa, const char *transb, + const int * m, const int * n, const int* k, + const double * alpha, const double * A, const int * lda, + const double * B, const int * ldb, const double * beta, + double * C, const int * ldc) +{ + dgemm_(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +template<> +inline void Gemm<float>(const char *transa, const char *transb, + const int * m, const int * n, const int* k, + const float * alpha, const float * A, const int * lda, + const float * B, const int * ldb, const float * beta, + float * C, const int * ldc) +{ + sgemm_(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +template <> +inline void Ger<double>(const int * m, const int * n, const double * alpha, + const double * x, const int * incx, + const double * y, const int * incy, + double * A, const int * lda) +{ + dger_(m, n, alpha, x, incx, y, incy, A, lda); +} + +template <> +inline void Ger<float>(const int * m, const int * n, const float * alpha, + const float * x, const int * incx, + const float * y, const int * incy, + float * A, const int * lda) +{ + sger_(m, n, alpha, x, incx, y, incy, A, lda); +} + +} // namespace Blas +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuBuffer.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuBuffer.h new file mode 100644 index 0000000000000000000000000000000000000000..192ece11b6983c78ec42f33a8f89ca809a64d123 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuBuffer.h @@ -0,0 +1,86 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 12/08/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +///////////////////////////////////////////////////////////// +// CPU Buffer interface class for the generic data loader. 
// +///////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CPU_CPUBUFFER +#define TMVA_DNN_ARCHITECTURES_CPU_CPUBUFFER + +#include "TMVA/DNN/DataLoader.h" +#include <vector> +#include <memory> + +namespace TMVA +{ +namespace DNN +{ + +/** TCpuBuffer + * + * Since the memory on the CPU is homogeneous, only one buffer class is required. + * The host and device buffer classes are the same and copying between the host + * and device buffer is achieved by simply swapping the memory pointers. + * + * Memory is handled as a shared pointer to a pointer of type AFloat, which is + * the floating point type used for the implementation. + * + * Copying and assignment of TCpuBuffer objects performs only a shallow copy + * meaning the underlying data is shared between those objects. + * + * \tparam AFloat The floating point type used for the computations. + */ +template<typename AFloat> +class TCpuBuffer +{ +private: + + size_t fSize; + size_t fOffset; + std::shared_ptr<AFloat *> fBuffer; + + struct TDestructor + { + void operator()(AFloat ** pointer); + friend TCpuBuffer; + } fDestructor; + +public: + + /** Construct buffer to hold \p size numbers of type \p AFloat.*/ + TCpuBuffer(size_t size); + TCpuBuffer(const TCpuBuffer &) = default; + TCpuBuffer( TCpuBuffer &&) = default; + TCpuBuffer & operator=(const TCpuBuffer &) = default; + TCpuBuffer & operator=( TCpuBuffer &&) = default; + + operator AFloat * () const {return (* fBuffer) + fOffset;} + + /** Return subbuffer of siez \p start starting at element \p offset. */ + TCpuBuffer GetSubBuffer(size_t offset, size_t start); + + AFloat & operator[](size_t i) {return (*fBuffer.get())[fOffset + i];} + AFloat operator[](size_t i) const {return (*fBuffer.get())[fOffset + i];} + + /** Copy data from another buffer. No real copying is performed, only the + * data pointers are swapped. */ + void CopyFrom(TCpuBuffer &); + /** Copy data to another buffer. No real copying is performed, only the + * data pointers are swapped. */ + void CopyTo(TCpuBuffer &); +}; + +} // namespace DNN +} // namespace TMVA + +#endif + diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuMatrix.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuMatrix.h new file mode 100644 index 0000000000000000000000000000000000000000..66aa444b28fff41afd5e4ec61996599dcdddf20d --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cpu/CpuMatrix.h @@ -0,0 +1,157 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 20/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////// +// Definition of the CpuMatrix class used to represent // +// weight and bias matrices in neural nets. // +////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CPU_CPUMATRIX +#define TMVA_DNN_ARCHITECTURES_CPU_CPUMATRIX + +#include <cstddef> +#include <vector> +#include "tbb/tbb.h" + +#include "TMatrix.h" +#include "CpuBuffer.h" + +namespace TMVA +{ +namespace DNN +{ + +/** The TCpuMatrix class. + * + * Matrix class for multi-threaded CPU architectures. Uses the TCpuBuffer + * class to store the matrices in column-major format for compatibility with + * BLAS. 
Provides Map and MapFrom member functions to simplify the application of + * activation functions and derivatives to matrices. + * + * Copying and assignment of TCpuMatrix objects only performs shallow copies, i.e. + * copying is fast and the resulting objects share the element data. + * + * \tparam AFloat The floating point type used to represent the matrix elements. + */ +//______________________________________________________________________________ +template<typename AFloat> +class TCpuMatrix +{ +private: + + static std::vector<AFloat> fOnes; ///< Vector filled with ones used for BLAS calls. + + TCpuBuffer<AFloat> fBuffer; ///< The buffer holding the matrix elements + ///< in column-major format. + size_t fNCols; + size_t fNRows; + +public: + + /** Returns pointer to a vector holding only ones with a guaranteed length + * of the number of columns of every instantiated CpuMatrix object. */ + static const AFloat * GetOnePointer() {return fOnes.data();} + + /** Construct matrix and allocate space for its elements. */ + TCpuMatrix(size_t nRows, size_t nCols); + /** Construct a TCpuMatrix object by (deeply) copying from a + * TMatrixT<Double_t> matrix. */ + TCpuMatrix(const TMatrixT<Double_t> &); + /** Construct a m-times-n matrix from the given buffer. The size must of + * course match. */ + TCpuMatrix(const TCpuBuffer<AFloat> &buffer, size_t m, size_t n); + + TCpuMatrix(const TCpuMatrix &) = default; + TCpuMatrix( TCpuMatrix &&) = default; + TCpuMatrix & operator=(const TCpuMatrix &) = default; + TCpuMatrix & operator=(TCpuMatrix &&) = default; + ~TCpuMatrix() = default; + + /** Convert to a TMatrixT<Double_t> object. Performs a deep copy of the matrix + * elements. */ + operator TMatrixT<Double_t>() const; + + /** Map the given function over the matrix elements. Executed in parallel + * using tbb. */ + template <typename Function_t> + void Map(Function_t &f); + + /** Same as maps but takes the input values from the matrix \p A and writes + * the results in this matrix. */ + template <typename Function_t> + void MapFrom(Function_t &f, const TCpuMatrix & A); + + size_t GetNrows() const {return fNRows;} + size_t GetNcols() const {return fNCols;} + size_t GetNElements() const {return fNRows * fNCols;} + + /** Return matrix element in row \p i and column \p j. */ + AFloat operator()(size_t i, size_t j) const {return fBuffer[j * fNRows + i];} + AFloat & operator()(size_t i, size_t j) {return fBuffer[j * fNRows + i];} + + /** Return raw pointer to the elements stored contiguously in column-major + * order. */ + AFloat * GetRawDataPointer() {return fBuffer;} + const AFloat * GetRawDataPointer() const {return fBuffer;} + +private: + + void Initialize(); + +}; + +// Inline Functions. 
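+// Usage sketch for Map (illustrative only; the matrix size and the lambda
+// are arbitrary assumptions):
+//
+//   TCpuMatrix<Double_t> A(4, 4);
+//   auto relu = [](Double_t x) { return x < 0.0 ? 0.0 : x; };
+//   A.Map(relu);   // applies the lambda element-wise, parallelized with tbb
+//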
+//______________________________________________________________________________ +template<typename AFloat> +template<typename Function_t> +inline void TCpuMatrix<AFloat>::Map(Function_t &f) +{ + AFloat __restrict__ *data = GetRawDataPointer(); + + auto fRange = [data, &f](const tbb::blocked_range<size_t> & range) + { + size_t rangeBegin = range.begin(); + size_t rangeEnd = range.end(); + + for (size_t i = rangeBegin; i != rangeEnd; ++i) { + data[i] = f(data[i]); + } + }; + + tbb::blocked_range<size_t> range(0, GetNElements()); + parallel_for(range, fRange); +} + +template<typename AFloat> +template<typename Function_t> +inline void TCpuMatrix<AFloat>::MapFrom(Function_t &f, const TCpuMatrix &A) +{ + AFloat __restrict__ *dataB = GetRawDataPointer(); + const AFloat __restrict__ *dataA = A.GetRawDataPointer(); + + auto fRange = [&dataB, &dataA, &f](const tbb::blocked_range<size_t> & range) + { + size_t rangeBegin = range.begin(); + size_t rangeEnd = range.end(); + + for (size_t i = rangeBegin; i != rangeEnd; ++i) { + dataB[i] = f(dataA[i]); + } + }; + + tbb::blocked_range<size_t> range(0, GetNElements()); + parallel_for(range, fRange); +} + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..751d5264ea60127b448796aa5747f0af0cb44023 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h @@ -0,0 +1,289 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 05/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////// +// Definition of the TCuda architecture class, which provides an // +// implementation of the low-level functionality for neural // +// networks for the CUDA computing architectures. // +/////////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CUDA +#define TMVA_DNN_ARCHITECTURES_CUDA + +#include "cuda.h" +#include "Cuda/CudaBuffers.h" +#include "Cuda/CudaMatrix.h" +#include "TMVA/DNN/DataLoader.h" +#include <utility> + +namespace TMVA +{ +namespace DNN +{ + +/** The TCuda architecture class. + * + * Low-level interface class for CUDA computing architectures. Contains as + * public types the declaration of the scalar, matrix and buffer types + * for this architecture as well as the remaining functions in the low-level + * interface in the form of static members. + */ +template<typename AFloat = Double_t> +class TCuda +{ + +public: + + using Scalar_t = AFloat; + using Matrix_t = TCudaMatrix<AFloat>; + using DeviceBuffer_t = TCudaDeviceBuffer<AFloat>; + using HostBuffer_t = TCudaHostBuffer<AFloat>; + + //____________________________________________________________________________ + // + // Propagation + //____________________________________________________________________________ + + /** @name Forward Propagation + * Low-level functions required for the forward propagation of activations + * through the network. + */ + ///@{ + /** Matrix-multiply \p input with the transpose of \pweights and + * write the results into \p output. 
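+ *
+ * Forward-propagation sketch for a single layer (illustrative only; the
+ * batch size of 32, 16 input features and 8 neurons are assumed shapes):
+ * \code
+ * TCudaMatrix<Double_t> input(32, 16);
+ * TCudaMatrix<Double_t> weights(8, 16);
+ * TCudaMatrix<Double_t> biases(8, 1);
+ * TCudaMatrix<Double_t> output(32, 8);
+ * TCuda<Double_t>::MultiplyTranspose(output, input, weights); // output = input * weights^T
+ * TCuda<Double_t>::AddRowWise(output, biases);                // add the bias to every row
+ * \endcode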
*/ + static void MultiplyTranspose(TCudaMatrix<AFloat> &output, + const TCudaMatrix<AFloat> &input, + const TCudaMatrix<AFloat> &weights); + /** Add the vectors biases row-wise to the matrix output */ + static void AddRowWise(TCudaMatrix<AFloat> &output, + const TCudaMatrix<AFloat> &biases); + ///@} + + /** @name Backward Propagation + * Low-level functions required for the forward propagation of activations + * through the network. + */ + ///@{ + /** Perform the complete backward propagation step. If the provided + * \p activationGradientsBackward matrix is not empty, compute the + * gradients of the objective function with respect to the activations + * of the previous layer (backward direction). + * Also compute the weight and the bias gradients. Modifies the values + * in \p df and thus produces only a valid result, if it is applied the + * first time after the corresponding forward propagation has been per- + * formed. */ + static void Backward(TCudaMatrix<AFloat> & activationGradientsBackward, + TCudaMatrix<AFloat> & weightGradients, + TCudaMatrix<AFloat> & biasGradients, + TCudaMatrix<AFloat> & df, + const TCudaMatrix<AFloat> & activationGradients, + const TCudaMatrix<AFloat> & weights, + const TCudaMatrix<AFloat> & activationBackward); + /** Adds a the elements in matrix B scaled by c to the elements in + * the matrix A. This is required for the weight update in the gradient + * descent step.*/ + static void ScaleAdd(TCudaMatrix<AFloat> & A, + const TCudaMatrix<AFloat> & B, + Scalar_t beta = 1.0); + /** Copy the elements of matrix A into matrix B. */ + static void Copy(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A); + ///@} + + //____________________________________________________________________________ + // + // Activation Functions + //____________________________________________________________________________ + + /** @name Activation Functions + * For each activation function, the low-level interface contains two routines. + * One that applies the acitvation function to a matrix and one that evaluate + * the derivatives of the activation function at the elements of a given matrix + * and writes the results into the result matrix. 
+ */ + ///@{ + static void Identity(TCudaMatrix<AFloat> & B); + static void IdentityDerivative(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A); + + static void Relu(TCudaMatrix<AFloat> & B); + static void ReluDerivative(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A); + + static void Sigmoid(TCudaMatrix<AFloat> & B); + static void SigmoidDerivative(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A); + + static void Tanh(TCudaMatrix<AFloat> & B); + static void TanhDerivative(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A); + + static void SymmetricRelu(TCudaMatrix<AFloat> & B); + static void SymmetricReluDerivative(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A); + + static void SoftSign(TCudaMatrix<AFloat> & B); + static void SoftSignDerivative(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A); + + static void Gauss(TCudaMatrix<AFloat> & B); + static void GaussDerivative(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A); + ///@} + + //____________________________________________________________________________ + // + // Loss Functions + //____________________________________________________________________________ + + /** @name Loss Functions + * Loss functions compute a scalar value given the \p output of the network + * for a given training input and the expected network prediction \p Y that + * quantifies the quality of the prediction. For each function also a routing + * that computes the gradients (suffixed by Gradients) must be provided for + * the starting of the backpropagation algorithm. + */ + ///@{ + + static AFloat MeanSquaredError(const TCudaMatrix<AFloat> &Y, + const TCudaMatrix<AFloat> &output); + static void MeanSquaredErrorGradients(TCudaMatrix<AFloat> & dY, + const TCudaMatrix<AFloat> &Y, + const TCudaMatrix<AFloat> &output); + + /** Sigmoid transformation is implicitly applied, thus \p output should + * hold the linear activations of the last layer in the net. */ + static AFloat CrossEntropy(const TCudaMatrix<AFloat> &Y, + const TCudaMatrix<AFloat> &output); + + static void CrossEntropyGradients(TCudaMatrix<AFloat> & dY, + const TCudaMatrix<AFloat> & Y, + const TCudaMatrix<AFloat> & output); + ///@} + + //____________________________________________________________________________ + // + // Output Functions + //____________________________________________________________________________ + + /** @name Output Functions + * Output functions transform the activations \p output of the + * output layer in the network to a valid prediction \p YHat for + * the desired usage of the network, e.g. the identity function + * for regression or the sigmoid transformation for two-class + * classification. + */ + ///@{ + static void Sigmoid(TCudaMatrix<AFloat> &YHat, + const TCudaMatrix<AFloat> & ); + ///@} + + //____________________________________________________________________________ + // + // Regularization + //____________________________________________________________________________ + + /** @name Regularization + * For each regularization type two functions are required, one named + * <tt><Type>Regularization</tt> that evaluates the corresponding + * regularization functional for a given weight matrix and the + * <tt>Add<Type>RegularizationGradients</tt>, that adds the regularization + * component in the gradients to the provided matrix. 
+ */ + ///@{ + + static AFloat L1Regularization(const TCudaMatrix<AFloat> & W); + static void AddL1RegularizationGradients(TCudaMatrix<AFloat> & A, + const TCudaMatrix<AFloat> & W, + AFloat weightDecay); + + static AFloat L2Regularization(const TCudaMatrix<AFloat> & W); + static void AddL2RegularizationGradients(TCudaMatrix<AFloat> & A, + const TCudaMatrix<AFloat> & W, + AFloat weightDecay); + ///@} + + //____________________________________________________________________________ + // + // Initialization + //____________________________________________________________________________ + + /** @name Initialization + * For each initialization method, one function in the low-level interface + * is provided. The naming scheme is <p>Initialize<Type></p> for a given + * initialization method Type. + */ + ///@{ + + static void InitializeGauss(TCudaMatrix<AFloat> & A); + static void InitializeUniform(TCudaMatrix<AFloat> & A); + static void InitializeIdentity(TCudaMatrix<AFloat> & A); + static void InitializeZero(TCudaMatrix<AFloat> & A); + + ///@} + + //____________________________________________________________________________ + // + // Dropout + //____________________________________________________________________________ + + /** @name Dropout + */ + ///@{ + + /** Apply dropout with activation probability \p p to the given + * matrix \p A and scale the result by reciprocal of \p p. */ + static void Dropout(TCudaMatrix<AFloat> & A, AFloat p); + + ///@} + + //____________________________________________________________________________ + // + // Additional Arithmetic Functions + //____________________________________________________________________________ + + /** @name Additional Arithmetic Functions + * + * Additional arithmetic on CUDA matrices used to implement the low-level + * interface. + */ + ///@{ + + /** Standard multiplication of two matrices \p A and \p B with the result being + * written into C. + */ + static void Multiply(TCudaMatrix<AFloat> & C, + const TCudaMatrix<AFloat> & A, + const TCudaMatrix<AFloat> & B); + /** Matrix multiplication of two matrices \p A and \p B^T (transposed) with the + * result being written into C. + */ + static void TransposeMultiply(TCudaMatrix<AFloat> & output, + const TCudaMatrix<AFloat> & input, + const TCudaMatrix<AFloat> & Weights); + /** In-place Hadamard (element-wise) product of matrices \p A and \p B + * with the result being written into \p A. + */ + static void Hadamard(TCudaMatrix<AFloat> & A, const TCudaMatrix<AFloat> & B); + + /** Sum columns of (m x n) matrixx \p A and write the results into the first + * m elements in \p A. + */ + static void SumColumns(TCudaMatrix<AFloat> & B, const TCudaMatrix<AFloat> & A); + + /** Compute the sum of all elements in \p A */ + static AFloat Sum(const TCudaMatrix<AFloat> &A); +}; + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaBuffers.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaBuffers.h new file mode 100644 index 0000000000000000000000000000000000000000..f03483ff444a1042f0e489523be55361c3475040 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaBuffers.h @@ -0,0 +1,157 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 07/08/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. 
* + *************************************************************************/ + +//////////////////////////////////////////////////// +// Device and host buffer for CUDA architectures. // +//////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDABUFFERS +#define TMVA_DNN_ARCHITECTURES_CUDA_CUDABUFFERS + +#include "cuda.h" +#include "cuda_runtime.h" +#include <memory> + +namespace TMVA { +namespace DNN { + +template<typename AFloat> +class TCudaDeviceBuffer; + +/** TCudaHostBuffer + * + * Wrapper class for pinned memory buffers on the host. Uses + * std::shared_pointer with custom destructor to ensure consistent + * memory management and allow for easy copying/moving of the + * buffers. Copying is asynchronous and will set the cudaStream of the + * device buffer so that subsequent computations on the device buffer + * can be performed on the same stream. + * + * \tparam AFloat The floating point type to be stored in the buffers. + */ +template<typename AFloat> +class TCudaHostBuffer +{ +private: + + size_t fOffset; ///< Offset for sub-buffers + mutable cudaStream_t fComputeStream; ///< cudaStream for data transfer + std::shared_ptr<AFloat *> fHostPointer; ///< Pointer to the buffer data + + // Custom destructor required to free pinned host memory using cudaFree. + struct TDestructor + { + TDestructor() = default; + TDestructor(const TDestructor &) = default; + TDestructor( TDestructor &&) = default; + TDestructor & operator=(const TDestructor &) = default; + TDestructor & operator=( TDestructor &&) = default; + void operator()(AFloat ** devicePointer); + } fDestructor; + + friend TCudaDeviceBuffer<AFloat>; + +public: + + TCudaHostBuffer(size_t size); + TCudaHostBuffer(AFloat *); + TCudaHostBuffer() = default; + TCudaHostBuffer(const TCudaHostBuffer &) = default; + TCudaHostBuffer( TCudaHostBuffer &&) = default; + TCudaHostBuffer & operator=(const TCudaHostBuffer &) = default; + TCudaHostBuffer & operator=( TCudaHostBuffer &&) = default; + + /** Return sub-buffer of the current buffer. */ + TCudaHostBuffer GetSubBuffer(size_t offset, size_t size); + + operator AFloat * () const; + + inline AFloat & operator[](size_t index); + inline AFloat operator[](size_t index) const; + +}; + +/** TCudaDeviceBuffer + * + * Service class for on-device memory buffers. Uses + * std::shared_pointer with custom destructor to ensure consistent + * memory management and allow for easy copying/moving. A device + * buffer has an associated CUDA compute stream , which is used for + * implicit synchronization of data transfers. + * + * \tparam AFloat The floating point type to be stored in the buffers. + */ +template<typename AFloat> +class TCudaDeviceBuffer +{ +private: + + size_t fOffset; ///< Offset for sub-buffers + size_t fSize; + cudaStream_t fComputeStream; ///< cudaStream for data transfer + std::shared_ptr<AFloat *> fDevicePointer; ///< Pointer to the buffer data + + // Custom destructor required to free pinned host memory using cudaFree. 
+ struct TDestructor + { + TDestructor() = default; + TDestructor(const TDestructor &) = default; + TDestructor( TDestructor &&) = default; + TDestructor & operator=(const TDestructor &) = default; + TDestructor & operator=( TDestructor &&) = default; + void operator()(AFloat ** devicePointer); + friend TCudaDeviceBuffer; + } fDestructor; + +public: + + TCudaDeviceBuffer(size_t size); + TCudaDeviceBuffer(size_t size, cudaStream_t stream); + TCudaDeviceBuffer(AFloat *, size_t size, cudaStream_t stream); + TCudaDeviceBuffer() = default; + TCudaDeviceBuffer(const TCudaDeviceBuffer &) = default; + TCudaDeviceBuffer( TCudaDeviceBuffer &&) = default; + TCudaDeviceBuffer & operator=(const TCudaDeviceBuffer &) = default; + TCudaDeviceBuffer & operator=( TCudaDeviceBuffer &&) = default; + + /** Return sub-buffer of the current buffer. */ + TCudaDeviceBuffer GetSubBuffer(size_t offset, size_t size); + /** Convert to raw device data pointer.*/ + operator AFloat * () const; + + void CopyFrom(const TCudaHostBuffer<AFloat> &) const; + void CopyTo(const TCudaHostBuffer<AFloat> &) const; + + cudaStream_t GetComputeStream() const {return fComputeStream;} + void SetComputeStream(cudaStream_t stream) {fComputeStream = stream;} + +}; + +// +// Inline Functions. +//______________________________________________________________________________ + +template<typename AFloat> +AFloat & TCudaHostBuffer<AFloat>::operator[](size_t index) +{ + return (*fHostPointer + fOffset)[index]; +} + +template<typename AFloat> +AFloat TCudaHostBuffer<AFloat>::operator[](size_t index) const +{ + return (*fHostPointer + fOffset)[index]; +} + + +} // namespace DNN +} // namespace TMVA +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaMatrix.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaMatrix.h new file mode 100644 index 0000000000000000000000000000000000000000..f46b05e1fee8b7ca7880e5642cfe55c136ed8fd3 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaMatrix.h @@ -0,0 +1,299 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////////// +// Contains the TCudaMatrix class for the representation of matrices // +// on CUDA devices as well as the TCudaDeviceReference class which // +// is a helper class to emulate lvalue references to floating point // +// values on the device. // +/////////////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX +#define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX + +#include "cuda.h" +#include "cuda_runtime.h" +#include "cublas_v2.h" +#include "curand_kernel.h" + +#include "TMatrixT.h" +#include "CudaBuffers.h" + +#define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); } + +namespace TMVA { +namespace DNN { + +/** Function to check cuda return code. 
Taken from + * http://stackoverflow.com/questions/14038589/ + */ +inline void cudaError(cudaError_t code, const char *file, int line, bool abort=true); + +//____________________________________________________________________________ +// +// Cuda Device Reference +//____________________________________________________________________________ + +/** TCudaDeviceReference + * + * Helper class emulating lvalue references for AFloat values that are + * physically on the device. Allows for example to assign to matrix elements. + * Note that device access through CudaDeviceReferences enforces synchronization + * with all streams and thus qualifies as performance killer. Only used for + * testing. + */ +template<typename AFloat> +class TCudaDeviceReference +{ +private: + + AFloat * fDevicePointer; + +public: + + TCudaDeviceReference(AFloat * devicePointer); + + operator AFloat(); + + void operator=(const TCudaDeviceReference &other); + void operator=(AFloat value); + void operator+=(AFloat value); + void operator-=(AFloat value); +}; + +//____________________________________________________________________________ +// +// Cuda Matrix +//____________________________________________________________________________ + +/** TCudaMatrix Class + * + * The TCudaMatrix class represents matrices on a CUDA device. The elements + * of the matrix are stored in a TCudaDeviceBuffer object which takes care of + * the allocation and freeing of the device memory. TCudaMatrices are lightweight + * object, that means on assignment and copy creation only a shallow copy is + * performed and no new element buffer allocated. To perform a deep copy use + * the static Copy method of the TCuda architecture class. + * + * The TCudaDeviceBuffer has an associated cuda stream, on which the data is + * transferred to the device. This stream can be accessed through the + * GetComputeStream member function and used to synchronize computations. + * + * The TCudaMatrix class also holds static references to CUDA resources. + * Those are the cublas handle, a buffer of curand states for the generation + * of random numbers as well as a vector containing ones, which is used for + * summing column matrices using matrix-vector multiplication. The class also + * has a static buffer for returning results from the device. + * + */ +template<typename AFloat> +class TCudaMatrix +{ +public: + +private: + + static size_t fInstances; ///< Current number of matrix instances. + static cublasHandle_t fCublasHandle; + static AFloat * fDeviceReturn; ///< Buffer for kernel return values. + static AFloat * fOnes; ///< Vector used for summations of columns. + static size_t fNOnes; ///< Current length of the one vector. + static curandState_t * fCurandStates; + static size_t fNCurandStates; + + size_t fNRows; + size_t fNCols; + TCudaDeviceBuffer<AFloat> fElementBuffer; + +public: + + static AFloat * GetOnes() {return fOnes;} + + TCudaMatrix(); + TCudaMatrix(size_t i, size_t j); + TCudaMatrix(const TMatrixT<Double_t> &); + TCudaMatrix(TCudaDeviceBuffer<AFloat> buffer, size_t m, size_t n); + + TCudaMatrix(const TCudaMatrix &) = default; + TCudaMatrix( TCudaMatrix &&) = default; + TCudaMatrix & operator=(const TCudaMatrix &) = default; + TCudaMatrix & operator=( TCudaMatrix &&) = default; + ~TCudaMatrix() = default; + + /** Convert cuda matrix to Root TMatrix. Performs synchronous data transfer. 
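+ * For example (sketch):
+ * \code
+ * TCudaMatrix<Double_t> devA(2, 3);    // element buffer lives on the device
+ * TMatrixT<Double_t> hostA = devA;     // synchronous device-to-host copy
+ * \endcode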
*/ + operator TMatrixT<Double_t>() const; + + inline cudaStream_t GetComputeStream() const; + inline void SetComputeStream(cudaStream_t stream); + /** Set the return buffer on the device to the specified value. This is + * required for example for reductions in order to initialize the + * accumulator. */ + inline static void ResetDeviceReturn(AFloat value = 0.0); + /** Transfer the value in the device return buffer to the host. This + * tranfer is synchronous */ + inline static AFloat GetDeviceReturn(); + /** Return device pointer to the device return buffer */ + inline static AFloat * GetDeviceReturnPointer() {return fDeviceReturn;} + inline static curandState_t * GetCurandStatesPointer() {return fCurandStates;} + + /** Blocking synchronization with the associated compute stream, if it's + * not the default stream. */ + inline void Synchronize(const TCudaMatrix &) const; + + size_t GetNrows() const {return fNRows;} + size_t GetNcols() const {return fNCols;} + size_t GetNoElements() const {return fNRows * fNCols;} + const AFloat * GetDataPointer() const {return fElementBuffer;} + AFloat * GetDataPointer() {return fElementBuffer;} + const cublasHandle_t & GetCublasHandle() const {return fCublasHandle;} + + /** Access to elements of device matrices provided through TCudaDeviceReference + * class. Note that access is synchronous end enforces device synchronization + * on all streams. Only used for testing. */ + TCudaDeviceReference<AFloat> operator()(size_t i, size_t j) const; + +private: + + /** Initializes all shared devices resource and makes sure that a sufficient + * number of curand states are allocated on the device and initialized as + * well as that the one-vector for the summation over columns has the right + * size. */ + void InitializeCuda(); + void InitializeCurandStates(); + +}; + +// +// Inline Functions. +//______________________________________________________________________________ +inline void cudaError(cudaError_t code, const char *file, int line, bool abort) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +//______________________________________________________________________________ +template<typename AFloat> +TCudaDeviceReference<AFloat>::TCudaDeviceReference(AFloat * devicePointer) + : fDevicePointer(devicePointer) +{ + // Nothing to do here. 
+} + +//______________________________________________________________________________ +template<typename AFloat> +TCudaDeviceReference<AFloat>::operator AFloat() +{ + AFloat buffer; + cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat), + cudaMemcpyDeviceToHost); + return buffer; +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCudaDeviceReference<AFloat>::operator=(const TCudaDeviceReference &other) +{ + cudaMemcpy(fDevicePointer, other.fDevicePointer, sizeof(AFloat), + cudaMemcpyDeviceToDevice); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCudaDeviceReference<AFloat>::operator=(AFloat value) +{ + AFloat buffer = value; + cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat), + cudaMemcpyHostToDevice); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCudaDeviceReference<AFloat>::operator+=(AFloat value) +{ + AFloat buffer; + cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat), + cudaMemcpyDeviceToHost); + buffer += value; + cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat), + cudaMemcpyHostToDevice); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCudaDeviceReference<AFloat>::operator-=(AFloat value) +{ + AFloat buffer; + cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat), + cudaMemcpyDeviceToHost); + buffer -= value; + cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat), + cudaMemcpyHostToDevice); +} + +//______________________________________________________________________________ +template<typename AFloat> +inline cudaStream_t TCudaMatrix<AFloat>::GetComputeStream() const +{ + return fElementBuffer.GetComputeStream(); +} + +//______________________________________________________________________________ +template<typename AFloat> +inline void TCudaMatrix<AFloat>::SetComputeStream(cudaStream_t stream) +{ + return fElementBuffer.SetComputeStream(stream); +} + +//______________________________________________________________________________ +template<typename AFloat> +inline void TCudaMatrix<AFloat>::Synchronize(const TCudaMatrix &A) const +{ + cudaEvent_t event; + cudaEventCreateWithFlags(&event, cudaEventDisableTiming); + cudaEventRecord(event, A.GetComputeStream()); + cudaStreamWaitEvent(fElementBuffer.GetComputeStream(), event, 0); + cudaEventDestroy(event); +} + +//______________________________________________________________________________ +template<typename AFloat> +inline void TCudaMatrix<AFloat>::ResetDeviceReturn(AFloat value) +{ + AFloat buffer = value; + cudaMemcpy(fDeviceReturn, & buffer, sizeof(AFloat), cudaMemcpyHostToDevice); +} + +//______________________________________________________________________________ +template<typename AFloat> +inline AFloat TCudaMatrix<AFloat>::GetDeviceReturn() +{ + AFloat buffer; + cudaMemcpy(& buffer, fDeviceReturn, sizeof(AFloat), cudaMemcpyDeviceToHost); + return buffer; +} + +//______________________________________________________________________________ +template<typename AFloat> +TCudaDeviceReference<AFloat> TCudaMatrix<AFloat>::operator()(size_t i, size_t j) const +{ + AFloat * elementPointer = fElementBuffer; + elementPointer += j * fNRows + i; + return TCudaDeviceReference<AFloat>(elementPointer); +} + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h 
b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h new file mode 100644 index 0000000000000000000000000000000000000000..3ca30b748078ff3ca8da3079f1513c92a99fa6ee --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/Device.h @@ -0,0 +1,86 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////////////// +// Defines the TDevice class which encapsules device specific // +// settings for the launching of threads. // +//////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CUDA_DEVICE +#define TMVA_DNN_ARCHITECTURES_CUDA_DEVICE + +#include "cuda.h" +#include "vector_types.h" // definition of dim3 +#include "CudaMatrix.h" + +namespace TMVA +{ +namespace DNN +{ + +/** TDevice + * + * The TDevice class provides static functions for the generation of CUDA + * grids for kernel launches and is used to encapsulate the distribution + * of threads and blocks over the data. + * + */ +class TDevice +{ +public: + /* Number of threads per block along first dimensions. */ + static constexpr int BlockDimX = 1; + /* Number of threads per block along second dimensions. */ + static constexpr int BlockDimY = 32; + /* Resulting block size. */ + static constexpr int BlockSize = BlockDimX * BlockDimY; + + /* Return dim3 object representing the a BlockDimX x BlockDimY 2D + * block */ + static dim3 BlockDims() + { + return dim3(BlockDimX, BlockDimY); + } + + /* Return 2D dim3 object representing the block grid consisting of two-dimensional + * BlockDimX x BlockDimY blocks covering the matrix A */ + template<typename AFloat> + static dim3 GridDims(const TCudaMatrix<AFloat> &A) + { + int gridDimX = A.GetNcols() / TDevice::BlockDimX; + if ((A.GetNcols() % TDevice::BlockDimX) != 0) + gridDimX += 1; + int gridDimY = A.GetNrows() / TDevice::BlockDimY; + if ((A.GetNrows() % TDevice::BlockDimY) != 0) + gridDimY += 1; + return dim3(gridDimX, gridDimY); + } + + /* Return the number of threads that will be launched for a given matrix \p A */ + template<typename AFloat> + static int NThreads(const TCudaMatrix<AFloat> &A) + { + int gridDimX = A.GetNcols() / TDevice::BlockDimX; + if ((A.GetNcols() % TDevice::BlockDimX) != 0) { + gridDimX += 1; + } + int gridDimY = A.GetNrows() / TDevice::BlockDimY; + if ((A.GetNrows() % TDevice::BlockDimY) != 0) { + gridDimY += 1; + } + return gridDimX * gridDimY * TDevice::BlockDimX * TDevice::BlockDimY; + } +}; + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h b/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h new file mode 100644 index 0000000000000000000000000000000000000000..7200f199c278045ef3b090d47aa4e04373e595be --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h @@ -0,0 +1,250 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 20/06/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. 
* + *************************************************************************/ + +/////////////////////////////////////////////////////////////////////// +// Declaration of the TReference architecture, which provides a // +// reference implementation of the low-level interface for the DNN // +// implementation based on ROOT's TMatrixT matrix type. // +/////////////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_REFERENCE +#define TMVA_DNN_ARCHITECTURES_REFERENCE + +#include "TMatrix.h" + +namespace TMVA +{ +namespace DNN +{ + +/*! The reference architecture class. +* +* Class template that contains the reference implementation of the low-level +* interface for the DNN implementation. The reference implementation uses the +* TMatrixT class template to represent matrices. +* +* \tparam Real_t The floating point type used to represent scalars. +*/ +template<typename Real_t> +class TReference +{ +public: + + using Scalar_t = Real_t; + using Matrix_t = TMatrixT<Real_t>; + + //____________________________________________________________________________ + // + // Propagation + //____________________________________________________________________________ + + /** @name Forward Propagation + * Low-level functions required for the forward propagation of activations + * through the network. + */ + ///@{ + /** Matrix-multiply \p input with the transpose of \pweights and + * write the results into \p output. */ + static void MultiplyTranspose(TMatrixT<Scalar_t> &output, + const TMatrixT<Scalar_t> &input, + const TMatrixT<Scalar_t> &weights); + /** Add the vectors biases row-wise to the matrix output */ + static void AddRowWise(TMatrixT<Scalar_t> &output, + const TMatrixT<Scalar_t> &biases); + ///@} + + /** @name Backward Propagation + * Low-level functions required for the forward propagation of activations + * through the network. + */ + ///@{ + /** Perform the complete backward propagation step. If the provided + * \p activationGradientsBackward matrix is not empty, compute the + * gradients of the objective function with respect to the activations + * of the previous layer (backward direction). + * Also compute the weight and the bias gradients. Modifies the values + * in \p df and thus produces only a valid result, if it is applied the + * first time after the corresponding forward propagation has been per- + * formed. */ + static void Backward(TMatrixT<Scalar_t> & activationGradientsBackward, + TMatrixT<Scalar_t> & weightGradients, + TMatrixT<Scalar_t> & biasGradients, + TMatrixT<Scalar_t> & df, + const TMatrixT<Scalar_t> & activationGradients, + const TMatrixT<Scalar_t> & weights, + const TMatrixT<Scalar_t> & activationBackward); + /** Adds a the elements in matrix B scaled by c to the elements in + * the matrix A. This is required for the weight update in the gradient + * descent step.*/ + static void ScaleAdd(TMatrixT<Scalar_t> & A, + const TMatrixT<Scalar_t> & B, + Scalar_t beta = 1.0); + + static void Copy(TMatrixT<Scalar_t> & A, + const TMatrixT<Scalar_t> & B); + ///@} + + //____________________________________________________________________________ + // + // Activation Functions + //____________________________________________________________________________ + + /** @name Activation Functions + * For each activation function, the low-level interface contains two routines. 
+ * One that applies the acitvation function to a matrix and one that evaluate + * the derivatives of the activation function at the elements of a given matrix + * and writes the results into the result matrix. + */ + ///@{ + static void Identity(TMatrixT<Real_t> & B); + static void IdentityDerivative(TMatrixT<Real_t> & B, + const TMatrixT<Real_t> & A); + + static void Relu(TMatrixT<Real_t> & B); + static void ReluDerivative(TMatrixT<Real_t> & B, + const TMatrixT<Real_t> & A); + + static void Sigmoid(TMatrixT<Real_t> & B); + static void SigmoidDerivative(TMatrixT<Real_t> & B, + const TMatrixT<Real_t> & A); + + static void Tanh(TMatrixT<Real_t> & B); + static void TanhDerivative(TMatrixT<Real_t> & B, + const TMatrixT<Real_t> & A); + + static void SymmetricRelu(TMatrixT<Real_t> & B); + static void SymmetricReluDerivative(TMatrixT<Real_t> & B, + const TMatrixT<Real_t> & A); + + static void SoftSign(TMatrixT<Real_t> & B); + static void SoftSignDerivative(TMatrixT<Real_t> & B, + const TMatrixT<Real_t> & A); + + static void Gauss(TMatrixT<Real_t> & B); + static void GaussDerivative(TMatrixT<Real_t> & B, + const TMatrixT<Real_t> & A); + + ///@} + + //____________________________________________________________________________ + // + // Loss Functions + //____________________________________________________________________________ + + /** @name Loss Functions + * Loss functions compute a scalar value given the \p output of the network + * for a given training input and the expected network prediction \p Y that + * quantifies the quality of the prediction. For each function also a routing + * that computes the gradients (suffixed by Gradients) must be provided for + * the starting of the backpropagation algorithm. + */ + ///@{ + + static Real_t MeanSquaredError(const TMatrixT<Real_t> &Y, + const TMatrixT<Real_t> &output); + static void MeanSquaredErrorGradients(TMatrixT<Real_t> & dY, + const TMatrixT<Real_t> &Y, + const TMatrixT<Real_t> &output); + + /** Sigmoid transformation is implicitly applied, thus \p output should + * hold the linear activations of the last layer in the net. */ + static Real_t CrossEntropy(const TMatrixT<Real_t> &Y, + const TMatrixT<Real_t> &output); + + static void CrossEntropyGradients(TMatrixT<Real_t> & dY, + const TMatrixT<Real_t> & Y, + const TMatrixT<Real_t> & output); + ///@} + + //____________________________________________________________________________ + // + // Output Functions + //____________________________________________________________________________ + + /** @name Output Functions + * Output functions transform the activations \p output of the + * output layer in the network to a valid prediction \p YHat for + * the desired usage of the network, e.g. the identity function + * for regression or the sigmoid transformation for two-class + * classification. + */ + ///@{ + static void Sigmoid(TMatrixT<Real_t> &YHat, + const TMatrixT<Real_t> & ); + ///@} + + //____________________________________________________________________________ + // + // Regularization + //____________________________________________________________________________ + + /** @name Regularization + * For each regularization type two functions are required, one named + * <tt><Type>Regularization</tt> that evaluates the corresponding + * regularization functional for a given weight matrix and the + * <tt>Add<Type>RegularizationGradients</tt>, that adds the regularization + * component in the gradients to the provided matrix. 
+ */ + ///@{ + + static Real_t L1Regularization(const TMatrixT<Real_t> & W); + static void AddL1RegularizationGradients(TMatrixT<Real_t> & A, + const TMatrixT<Real_t> & W, + Real_t weightDecay); + + static Real_t L2Regularization(const TMatrixT<Real_t> & W); + static void AddL2RegularizationGradients(TMatrixT<Real_t> & A, + const TMatrixT<Real_t> & W, + Real_t weightDecay); + ///@} + + //____________________________________________________________________________ + // + // Initialization + //____________________________________________________________________________ + + /** @name Initialization + * For each initialization method, one function in the low-level interface + * is provided. The naming scheme is <p>Initialize<Type></p> for a given + * initialization method Type. + */ + ///@{ + + static void InitializeGauss(TMatrixT<Real_t> & A); + + static void InitializeUniform(TMatrixT<Real_t> & A); + + static void InitializeIdentity(TMatrixT<Real_t> & A); + + static void InitializeZero(TMatrixT<Real_t> & A); + + ///@} + + //____________________________________________________________________________ + // + // Dropout + //____________________________________________________________________________ + + /** @name Dropout + */ + ///@{ + + /** Apply dropout with activation probability \p p to the given + * matrix \p A and scale the result by reciprocal of \p p. */ + static void Dropout(TMatrixT<Real_t> & A, Real_t dropoutProbability); + + ///@} +}; + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/DataLoader.h b/tmva/tmva/inc/TMVA/DNN/DataLoader.h new file mode 100644 index 0000000000000000000000000000000000000000..a3a960fe3514d461905d84f6abf8d2f0a16c9d97 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/DataLoader.h @@ -0,0 +1,260 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 08/08/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +///////////////////////////////////////////////////////////////////// +// Generic data loader for neural network input data. Provides a // +// high level abstraction for the transfer of training data to the // +// device. // +///////////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_DATALOADER +#define TMVA_DNN_DATALOADER + +#include "TMatrix.h" +#include <vector> +#include <iostream> + +#include "TMVA/Event.h" + +namespace TMVA { +namespace DNN { + +// +// Input Data Types +//______________________________________________________________________________ +using MatrixInput_t = std::pair<const TMatrixT<Double_t> &, + const TMatrixT<Double_t> &>; +using TMVAInput_t = std::vector<Event*>; + +using IndexIterator_t = typename std::vector<size_t>::iterator; + +/** TBatch + * + * Class representing training batches consisting of a matrix of input data + * and a matrix of output data. The input and output data can be accessed using + * the GetInput() and GetOutput() member functions. + * + * \tparam AArchitecture The underlying architecture. 
+ */ +//______________________________________________________________________________ +template <typename AArchitecture> +class TBatch +{ +private: + + using Matrix_t = typename AArchitecture::Matrix_t; + + Matrix_t fInputMatrix; + Matrix_t fOutputMatrix; + +public: + + TBatch(Matrix_t &, Matrix_t &); + TBatch(const TBatch &) = default; + TBatch( TBatch &&) = default; + TBatch & operator=(const TBatch &) = default; + TBatch & operator=( TBatch &&) = default; + + /** Return the matrix representing the input data. */ + Matrix_t & GetInput() {return fInputMatrix;} + /** Return the matrix representing the output data. */ + Matrix_t & GetOutput() {return fOutputMatrix;} +}; + +template<typename Data_t, typename AArchitecture> class TDataLoader; + +/** TBatchIterator + * + * Simple iterator class for the iterations over the training batches in + * a given data set represented by a TDataLoader object. + * + * \tparam AData The input data type. + * \tparam AArchitecture The underlying architecture type. + */ +template<typename Data_t, typename AArchitecture> +class TBatchIterator +{ +private: + + TDataLoader<Data_t, AArchitecture> & fDataLoader; + size_t fBatchIndex; + +public: + +TBatchIterator(TDataLoader<Data_t, AArchitecture> & dataLoader, size_t index = 0) +: fDataLoader(dataLoader), fBatchIndex(index) +{ + // Nothing to do here. +} + + TBatch<AArchitecture> operator*() {return fDataLoader.GetBatch();} + TBatchIterator operator++() {fBatchIndex++; return *this;} + bool operator!=(const TBatchIterator & other) { + return fBatchIndex != other.fBatchIndex; + } +}; + +/** TDataLoader + * + * Service class managing the streaming of the training data from the input data + * type to the accelerator device or the CPU. A TDataLoader object manages a number + * of host and device buffer pairs that are used in a round-robin manner for the + * transfer of batches to the device. + * + * Each TDataLoader object has an associated batch size and a number of total + * samples in the dataset. One epoch is the number of buffers required to transfer + * the complete training set. Using the begin() and end() member functions allows + * the user to iterate over the batches in one epoch. + * + * \tparam AData The input data type. + * \tparam AArchitecture The achitecture class of the underlying architecture. + */ +template<typename Data_t, typename AArchitecture> +class TDataLoader +{ +private: + + using HostBuffer_t = typename AArchitecture::HostBuffer_t; + using DeviceBuffer_t = typename AArchitecture::DeviceBuffer_t; + using Matrix_t = typename AArchitecture::Matrix_t; + using BatchIterator_t = TBatchIterator<Data_t, AArchitecture>; + + const Data_t & fData; + + size_t fNSamples; + size_t fBatchSize; + size_t fNInputFeatures; + size_t fNOutputFeatures; + size_t fBatchIndex; + + size_t fNStreams; ///< Number of buffer pairs. + std::vector<DeviceBuffer_t> fDeviceBuffers; + std::vector<HostBuffer_t> fHostBuffers; + + std::vector<size_t> fSampleIndices; ///< Ordering of the samples in the epoch. + +public: + + TDataLoader(const Data_t & data, size_t nSamples, size_t batchSize, + size_t nInputFeatures, size_t nOutputFeatures, size_t nStreams = 1); + TDataLoader(const TDataLoader &) = default; + TDataLoader( TDataLoader &&) = default; + TDataLoader & operator=(const TDataLoader &) = default; + TDataLoader & operator=( TDataLoader &&) = default; + + /** Copy input matrix into the given host buffer. Function to be specialized by + * the architecture-specific backend. 
*/ + void CopyInput(HostBuffer_t &buffer, IndexIterator_t begin, size_t batchSize); + /** Copy output matrix into the given host buffer. Function to be specialized + * by the architecture-spcific backend. */ + void CopyOutput(HostBuffer_t &buffer, IndexIterator_t begin, size_t batchSize); + + BatchIterator_t begin() {return TBatchIterator<Data_t, AArchitecture>(*this);} + BatchIterator_t end() + { + return TBatchIterator<Data_t, AArchitecture>(*this, fNSamples / fBatchSize); + } + + /** Shuffle the order of the samples in the batch. The shuffling is indirect, + * i.e. only the indices are shuffled. No input data is moved by this + * routine. */ + void Shuffle(); + + /** Return the next batch from the training set. The TDataLoader object + * keeps an internal counter that cycles over the batches in the training + * set. */ + TBatch<AArchitecture> GetBatch(); + +}; + +// +// TBatch Class. +//______________________________________________________________________________ +template<typename AArchitecture> +TBatch<AArchitecture>::TBatch(Matrix_t & inputMatrix, Matrix_t & outputMatrix) + : fInputMatrix(inputMatrix), fOutputMatrix(outputMatrix) +{ + // Nothing to do here. +} + +// +// TDataLoader Class. +//______________________________________________________________________________ +template<typename Data_t, typename AArchitecture> +TDataLoader<Data_t, AArchitecture>::TDataLoader( + const Data_t & data, size_t nSamples, size_t batchSize, + size_t nInputFeatures, size_t nOutputFeatures, size_t nStreams) + : fData(data), fNSamples(nSamples), fBatchSize(batchSize), + fNInputFeatures(nInputFeatures), fNOutputFeatures(nOutputFeatures), + fBatchIndex(0), fNStreams(nStreams), fDeviceBuffers(), fHostBuffers(), + fSampleIndices() +{ + size_t inputMatrixSize = fBatchSize * fNInputFeatures; + size_t outputMatrixSize = fBatchSize * fNOutputFeatures; + + for (size_t i = 0; i < fNStreams; i++) + { + fHostBuffers.push_back(HostBuffer_t(inputMatrixSize + outputMatrixSize)); + fDeviceBuffers.push_back(DeviceBuffer_t(inputMatrixSize + outputMatrixSize)); + } + + fSampleIndices.reserve(fNSamples); + for (size_t i = 0; i < fNSamples; i++) { + fSampleIndices.push_back(i); + } +} + +//______________________________________________________________________________ +template<typename Data_t, typename AArchitecture> +TBatch<AArchitecture> TDataLoader<Data_t, AArchitecture>::GetBatch() +{ + fBatchIndex %= (fNSamples / fBatchSize); // Cycle through samples. 
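+   // The host/device buffer pair for this batch is picked round-robin via
+   // fBatchIndex % fNStreams below, so with nStreams > 1 the transfer of one
+   // batch can in principle overlap with computation on another. Illustrative
+   // epoch loop (a sketch; assumes an architecture type providing the required
+   // HostBuffer_t/DeviceBuffer_t types and a minimizer set up elsewhere):
+   //
+   //    TDataLoader<MatrixInput_t, Architecture_t> loader(data, nSamples,
+   //                                                      batchSize, nin, nout);
+   //    loader.Shuffle();
+   //    for (auto batch : loader) {
+   //       minimizer.Step(net, batch.GetInput(), batch.GetOutput());
+   //    }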
+ + + size_t inputMatrixSize = fBatchSize * fNInputFeatures; + size_t outputMatrixSize = fBatchSize * fNOutputFeatures; + + size_t streamIndex = fBatchIndex % fNStreams; + HostBuffer_t & hostBuffer = fHostBuffers[streamIndex]; + DeviceBuffer_t & deviceBuffer = fDeviceBuffers[streamIndex]; + + HostBuffer_t inputHostBuffer = hostBuffer.GetSubBuffer(0, inputMatrixSize); + HostBuffer_t outputHostBuffer = hostBuffer.GetSubBuffer(inputMatrixSize, + outputMatrixSize); + + DeviceBuffer_t inputDeviceBuffer = deviceBuffer.GetSubBuffer(0, inputMatrixSize); + DeviceBuffer_t outputDeviceBuffer = deviceBuffer.GetSubBuffer(inputMatrixSize, + outputMatrixSize); + size_t sampleIndex = fBatchIndex * fBatchSize; + IndexIterator_t sampleIndexIterator = fSampleIndices.begin() + sampleIndex; + + CopyInput(inputHostBuffer, sampleIndexIterator, fBatchSize); + CopyOutput(outputHostBuffer, sampleIndexIterator, fBatchSize); + + deviceBuffer.CopyFrom(hostBuffer); + Matrix_t inputMatrix(inputDeviceBuffer, fBatchSize, fNInputFeatures); + Matrix_t outputMatrix(outputDeviceBuffer, fBatchSize, fNOutputFeatures); + + fBatchIndex++; + return TBatch<AArchitecture>(inputMatrix, outputMatrix); +} + +//______________________________________________________________________________ +template<typename Data_t, typename AArchitecture> +void TDataLoader<Data_t, AArchitecture>::Shuffle() +{ + std::random_shuffle(fSampleIndices.begin(), fSampleIndices.end()); +} + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Functions.h b/tmva/tmva/inc/TMVA/DNN/Functions.h new file mode 100644 index 0000000000000000000000000000000000000000..5e2f09da46dff9cde43a7c01d94793383a272ed9 --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Functions.h @@ -0,0 +1,266 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 20/06/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +///////////////////////////////////////////////////////////////////// +// Contains function enums for activation and output functions, as // +// well as generic evaluation functions, that delegate the call to // +// the corresponding evaluation kernel. // +///////////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_FUNCTIONS +#define TMVA_DNN_FUNCTIONS + +namespace TMVA +{ +namespace DNN +{ +//______________________________________________________________________________ +// +// Enum Definitions +//______________________________________________________________________________ + +/*! Enum that represents layer activation functions. */ +enum class EActivationFunction +{ + kIdentity = 0, + kRelu = 1, + kSigmoid = 2, + kTanh = 3, + kSymmRelu = 4, + kSoftSign = 5, + kGauss = 6 +}; + +/*! Enum that represents output functions */ +enum class EOutputFunction +{ + kIdentity = 'I', + kSigmoid = 'S' +}; + +/*! Enum that represents objective functions for the net, i.e. functions +* that take the output from the last layer in the net together with the +* truths and return the objective function values that is to be minimized +* in the training process. */ +enum class ELossFunction +{ + kCrossEntropy = 'C', + kMeanSquaredError = 'R' +}; + +/*! 
Enum representing the regularization type applied for a given layer */ +enum class ERegularization +{ + kNone = '0', + kL1 = '1', + kL2 = '2' + }; + +/* Enum represnting the initialization method used for this layer. */ +enum class EInitialization { + kGauss = 'G', + kUniform = 'U', + kIdentity = 'I', + kZero = 'Z' +}; + +//______________________________________________________________________________ +// +// Activation Functions +//______________________________________________________________________________ + +/*! Apply the given activation function to each value in the given +* matrix A. */ +template<typename Architecture_t> +inline void evaluate(typename Architecture_t::Matrix_t &A, + EActivationFunction f) +{ + switch(f) + { + case EActivationFunction::kIdentity : break; + case EActivationFunction::kRelu : Architecture_t::Relu(A); + break; + case EActivationFunction::kSigmoid : Architecture_t::Sigmoid(A); + break; + case EActivationFunction::kTanh : Architecture_t::Tanh(A); + break; + case EActivationFunction::kSymmRelu : Architecture_t::SymmetricRelu(A); + break; + case EActivationFunction::kSoftSign : Architecture_t::SoftSign(A); + break; + case EActivationFunction::kGauss : Architecture_t::Gauss(A); + break; + } +} + + +/*! Compute the first partial derivative of the activation function for +* the values given in matrix A and write the results into B. */ +//______________________________________________________________________________ +template<typename Architecture_t> +inline void evaluateDerivative(typename Architecture_t::Matrix_t & B, + EActivationFunction f, + const typename Architecture_t::Matrix_t & A) +{ + switch(f) + { + case EActivationFunction::kIdentity : Architecture_t::IdentityDerivative(B, A); + break; + case EActivationFunction::kRelu : Architecture_t::ReluDerivative(B, A); + break; + case EActivationFunction::kSigmoid : Architecture_t::SigmoidDerivative(B, A); + break; + case EActivationFunction::kTanh : Architecture_t::TanhDerivative(B, A); + break; + case EActivationFunction::kSymmRelu : Architecture_t::SymmetricReluDerivative(B, A); + break; + case EActivationFunction::kSoftSign : Architecture_t::SoftSignDerivative(B, A); + break; + case EActivationFunction::kGauss : Architecture_t::GaussDerivative(B, A); + break; + } +} + +//______________________________________________________________________________ +// +// Output Functions +//______________________________________________________________________________ + +/*! Apply the given output function to each value in the given +* matrix A. */ +template<typename Architecture_t> +inline void evaluate(typename Architecture_t::Matrix_t &A, + EOutputFunction f, + const typename Architecture_t::Matrix_t &X) +{ + switch(f) + { + case EOutputFunction::kIdentity : Architecture_t::Copy(A, X); + break; + case EOutputFunction::kSigmoid : Architecture_t::Sigmoid(A, X); + break; + } +} + +//______________________________________________________________________________ +// +// Loss Functions +//______________________________________________________________________________ + +/*! Compute the value of the objective function f for given activations +* of the ouput layer and the truth Y. 
*/ +template<typename Architecture_t> +inline auto evaluate(ELossFunction f, + const typename Architecture_t::Matrix_t & Y, + const typename Architecture_t::Matrix_t & output) +-> decltype(Architecture_t::CrossEntropy(Y,output)) +{ + switch(f) + { + case ELossFunction::kCrossEntropy : + return Architecture_t::CrossEntropy(Y, output); + case ELossFunction::kMeanSquaredError : + return Architecture_t::MeanSquaredError(Y, output); + } + return 0.0; +} + +/*! Compute the gradient of the given output function f for given activations +* output of the output layer and truth Y and write the results into dY. */ +//______________________________________________________________________________ +template<typename Architecture_t> +inline void evaluateGradients(typename Architecture_t::Matrix_t & dY, + ELossFunction f, + const typename Architecture_t::Matrix_t &Y, + const typename Architecture_t::Matrix_t &output) +{ + switch(f) + { + case ELossFunction::kCrossEntropy : + Architecture_t::CrossEntropyGradients(dY, Y, output); + break; + case ELossFunction::kMeanSquaredError : + Architecture_t::MeanSquaredErrorGradients(dY, Y, output); + break; + } +} + + +//______________________________________________________________________________ +// +// Regularization +//______________________________________________________________________________ + +/*! Evaluate the regularization functional for a given weight matrix. */ +template<typename Architecture_t> +inline auto regularization(const typename Architecture_t::Matrix_t &A, + ERegularization R) +-> decltype(Architecture_t::L1Regularization(A)) +{ + switch(R) + { + case ERegularization::kNone : + return 0.0; + case ERegularization::kL1 : + return Architecture_t::L1Regularization(A); + case ERegularization::kL2 : + return Architecture_t::L2Regularization(A); + } + return 0.0; +} + +/*! Add the regularization gradient corresponding to weight matrix W, to +* the matrix A. 
*/ +//______________________________________________________________________________ +template<typename Architecture_t> +inline void addRegularizationGradients(typename Architecture_t::Matrix_t &A, + const typename Architecture_t::Matrix_t &W, + typename Architecture_t::Scalar_t weightDecay, + ERegularization R) +{ + switch(R) + { + case ERegularization::kNone : + break; + case ERegularization::kL1 : + Architecture_t::AddL1RegularizationGradients(A, W, weightDecay); + break; + case ERegularization::kL2 : + Architecture_t::AddL2RegularizationGradients(A, W, weightDecay); + break; + } +} + +//______________________________________________________________________________ +// +// Initialization +//______________________________________________________________________________ + +template<typename Architecture_t> +inline void initialize(typename Architecture_t::Matrix_t & A, + EInitialization m) +{ + switch(m) { + case EInitialization::kGauss : Architecture_t::InitializeGauss(A); + break; + case EInitialization::kUniform : Architecture_t::InitializeUniform(A); + break; + case EInitialization::kIdentity : Architecture_t::InitializeIdentity(A); + break; + case EInitialization::kZero : Architecture_t::InitializeZero(A); + break; + } +} + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Layer.h b/tmva/tmva/inc/TMVA/DNN/Layer.h new file mode 100644 index 0000000000000000000000000000000000000000..fbf1b69ce75437e1b888aab67ff18280037e1d6c --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Layer.h @@ -0,0 +1,388 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 20/06/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////////// +// Contains Layer and SharedLayer classes, that represent layers in // +// neural networks. // +////////////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_LAYER +#define TMVA_DNN_LAYER + +#include <iostream> + +#include "TMatrix.h" +#include "Functions.h" + +namespace TMVA +{ +namespace DNN +{ + +//______________________________________________________________________________ +// +// The Layer Class +//______________________________________________________________________________ + +/** \class TLayer + + Generic layer class. + + This generic layer class represents a layer of a neural network with + a given width n and activation function f. The activation + function of each layer is given by \f$\mathbf{u} = + \mathbf{W}\mathbf{x} + \boldsymbol{\theta}\f$. + + In addition to the weight and bias matrices, each layer allocates memory + for its activations and the corresponding first partial fDerivatives of + the activation function as well as the gradients of the fWeights and fBiases. + + The layer provides member functions for the forward propagation of + activations through the given layer. +*/ +template<typename Architecture_t> + class TLayer +{ + +public: + using Scalar_t = typename Architecture_t::Scalar_t; + using Matrix_t = typename Architecture_t::Matrix_t; + +private: + + size_t fBatchSize; ///< Batch size used for training and evaluation. + size_t fInputWidth; ///< Number of neurons of the previous layer. 
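+   // (Shape conventions, derived from the constructor below: for a layer of
+   // width k fed by n = fInputWidth inputs on batches of m = fBatchSize events,
+   // fWeights is k x n, fBiases is k x 1, and fOutput, fDerivatives and
+   // fActivationGradients are m x k; Forward() computes
+   // fOutput = f(input * fWeights^T) with the biases added to every row.)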
+ size_t fWidth; ///< Number of neurons of this layer. + + Scalar_t fDropoutProbability; ///< Probability that an input is active. + + Matrix_t fWeights; ///< The fWeights of this layer. + Matrix_t fBiases; ///< The bias values of this layer. + Matrix_t fOutput; ///< Activations of this layer. + Matrix_t fDerivatives; ///< First fDerivatives of the activations of this layer. + Matrix_t fWeightGradients; ///< Gradients w.r.t. the weigths of this layer. + Matrix_t fBiasGradients; ///< Gradients w.r.t. the bias values of this layer. + Matrix_t fActivationGradients; ///< Gradients w.r.t. the activations of this layer. + + EActivationFunction fF; ///< Activation function of the layer. + +public: + + TLayer(size_t BatchSize, + size_t InputWidth, + size_t Width, + EActivationFunction f, + Scalar_t dropoutProbability); + TLayer(const TLayer &); + + /*! Initialize fWeights according to the given initialization + * method. */ + void Initialize(EInitialization m); + /*! Compute activation of the layer for the given input. The input + * must be in matrix form with the different rows corresponding to + * different events in the batch. Computes activations as well as + * the first partial derivative of the activation function at those + * activations. */ + void inline Forward(Matrix_t & input, bool applyDropout = false); + /*! Compute weight, bias and activation gradients. Uses the precomputed + * first partial derviatives of the activation function computed during + * forward propagation and modifies them. Must only be called directly + * a the corresponding call to Forward(...). */ + void inline Backward(Matrix_t & gradients_backward, + const Matrix_t & activations_backward, + ERegularization r, + Scalar_t weightDecay); + + void Print() const; + + size_t GetBatchSize() const {return fBatchSize;} + size_t GetInputWidth() const {return fInputWidth;} + size_t GetWidth() const {return fWidth;} + size_t GetDropoutProbability() const {return fDropoutProbability;} + + void SetDropoutProbability(Scalar_t p) {fDropoutProbability = p;} + + EActivationFunction GetActivationFunction() const {return fF;} + + Matrix_t & GetOutput() {return fOutput;} + const Matrix_t & GetOutput() const {return fOutput;} + Matrix_t & GetWeights() {return fWeights;} + const Matrix_t & GetWeights() const {return fWeights;} + Matrix_t & GetBiases() {return fBiases;} + const Matrix_t & GetBiases() const {return fBiases;} + Matrix_t & GetActivationGradients() {return fActivationGradients;} + const Matrix_t & GetActivationGradients() const {return fActivationGradients;} + Matrix_t & GetBiasGradients() {return fBiasGradients;} + const Matrix_t & GetBiasGradients() const {return fBiasGradients;} + Matrix_t & GetWeightGradients() {return fWeightGradients;} + const Matrix_t & GetWeightGradients() const {return fWeightGradients;} + +}; + +//______________________________________________________________________________ +// +// The Shared Layer Class +//______________________________________________________________________________ + +/** \class TSharedLayer + + Layer class width shared weight and bias layers. + + Like the Layer class only that weight matrices are shared between + different instances of the net, which can be used to implement + multithreading 'Hogwild' style. +*/ + +template<typename Architecture_t> +class TSharedLayer +{ + +public: + + using Scalar_t = typename Architecture_t::Scalar_t; + using Matrix_t = typename Architecture_t::Matrix_t; + +private: + + size_t fBatchSize; ///< Batch size used for training and evaluation. 
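+   // (Note: unlike in TLayer, fWeights and fBiases below are references, so all
+   // TSharedLayer objects constructed from the same TLayer read and update one
+   // common set of parameters, while the output and gradient matrices remain
+   // private to each instance.)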
+ size_t fInputWidth; ///< Number of neurons of the previous layer. + size_t fWidth; ///< Number of neurons of this layer. + + Scalar_t fDropoutProbability; ///< Probability that an input is active. + + Matrix_t & fWeights; ///< Reference to the weight matrix of this layer. + Matrix_t & fBiases; ///< Reference to the bias vectors of this layer. + Matrix_t fOutput; ///< Activations of this layer. + Matrix_t fDerivatives; ///< First fDerivatives of the activations of this layer. + Matrix_t fWeightGradients; ///< Gradients w.r.t. the weigths of this layer. + Matrix_t fBiasGradients; ///< Gradients w.r.t. the bias values of this layer. + Matrix_t fActivationGradients; ///< Gradients w.r.t. the activations of this layer. + + EActivationFunction fF; ///< Activation function of the layer. + +public: + + TSharedLayer(size_t fBatchSize, + TLayer<Architecture_t> & layer); + TSharedLayer(const TSharedLayer & layer); + + /*! Compute activation of the layer for the given input. The input + * must be in matrix form with the different rows corresponding to + * different events in the batch. Computes activations as well as + * the first partial derivative of the activation function at those + * activations. */ + void inline Forward(Matrix_t & input, bool applyDropout = false); + /*! Compute weight, bias and activation gradients. Uses the precomputed + * first partial derviatives of the activation function computed during + * forward propagation and modifies them. Must only be called directly + * a the corresponding call to Forward(...). */ + void inline Backward(Matrix_t & gradients_backward, + const Matrix_t & activations_backward, + ERegularization r, + Scalar_t weightDecay); + + void Print() const; + + size_t GetBatchSize() const {return fBatchSize;} + size_t GetInputWidth() const {return fInputWidth;} + size_t GetWidth() const {return fWidth;} + size_t GetDropoutProbability() const {return fDropoutProbability;} + + void SetDropoutProbability(Scalar_t p) {fDropoutProbability = p;} + + EActivationFunction GetActivationFunction() const {return fF;} + + Matrix_t & GetOutput() {return fOutput;} + const Matrix_t & GetOutput() const {return fOutput;} + Matrix_t & GetWeights() const {return fWeights;} + Matrix_t & GetBiases() {return fBiases;} + const Matrix_t & GetBiases() const {return fBiases;} + Matrix_t & GetActivationGradients() {return fActivationGradients;} + const Matrix_t & GetActivationGradients() const {return fActivationGradients;} + Matrix_t & GetBiasGradients() {return fBiasGradients;} + const Matrix_t & GetBiasGradients() const {return fBiasGradients;} + Matrix_t & GetWeightGradients() {return fWeightGradients;} + const Matrix_t & GetWeightGradients() const {return fWeightGradients;} + +}; + +//______________________________________________________________________________ +// +// The Layer Class - Implementation +//______________________________________________________________________________ + +template<typename Architecture_t> + TLayer<Architecture_t>::TLayer(size_t batchSize, + size_t inputWidth, + size_t width, + EActivationFunction f, + Scalar_t dropoutProbability) + : fBatchSize(batchSize), fInputWidth(inputWidth), fWidth(width), + fDropoutProbability(dropoutProbability), fWeights(width, fInputWidth), + fBiases(width, 1), fOutput(fBatchSize, width), fDerivatives(fBatchSize, width), + fWeightGradients(width, fInputWidth), fBiasGradients(width, 1), + fActivationGradients(fBatchSize, width), fF(f) +{ + // Nothing to do here. 
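+   // The constructor only allocates the matrices; Initialize(EInitialization)
+   // must be called before the first Forward() pass to set the weights (the
+   // biases are zero-initialized there).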
+} + +//______________________________________________________________________________ +template<typename Architecture_t> +TLayer<Architecture_t>::TLayer(const TLayer &layer) + : fBatchSize(layer.fBatchSize), + fInputWidth(layer.fInputWidth), fWidth(layer.fWidth), + fWeights(layer.fWidth, layer.fInputWidth), fBiases(layer.fWidth, 1), + fOutput(layer.fBatchSize, layer.fWidth), + fDerivatives(layer.fBatchSize, layer.fWidth), + fWeightGradients(layer.fWidth, layer.fInputWidth), + fBiasGradients(layer.fWidth, 1), + fActivationGradients(layer.fBatchSize, layer.fWidth), + fF(layer.fF) +{ + Architecture_t::Copy(fWeights, layer.GetWeights()); + Architecture_t::Copy(fBiases, layer.GetBiases()); +} + +//______________________________________________________________________________ +template<typename Architecture_t> +auto TLayer<Architecture_t>::Initialize(EInitialization m) +-> void +{ + initialize<Architecture_t>(fWeights, m); + initialize<Architecture_t>(fBiases, EInitialization::kZero); +} + +//______________________________________________________________________________ +template<typename Architecture_t> +auto inline TLayer<Architecture_t>::Forward(Matrix_t & input, + bool applyDropout) +-> void +{ + if (applyDropout && (fDropoutProbability != 1.0)) { + Architecture_t::Dropout(input, fDropoutProbability); + } + Architecture_t::MultiplyTranspose(fOutput, input, fWeights); + Architecture_t::AddRowWise(fOutput, fBiases); + evaluateDerivative<Architecture_t>(fDerivatives, fF, fOutput); + evaluate<Architecture_t>(fOutput, fF); +} + +//______________________________________________________________________________ +template<typename Architecture_t> +auto TLayer<Architecture_t>::Backward(Matrix_t & gradients_backward, + const Matrix_t & activations_backward, + ERegularization r, + Scalar_t weightDecay) +-> void +{ + Architecture_t::Backward(gradients_backward, + fWeightGradients, + fBiasGradients, + fDerivatives, + fActivationGradients, + fWeights, + activations_backward); + addRegularizationGradients<Architecture_t>(fWeightGradients, + fWeights, + weightDecay, r); +} + +//______________________________________________________________________________ +template<typename Architecture_t> + void TLayer<Architecture_t>::Print() const +{ + std::cout << "Width = " << fWeights.GetNrows(); + std::cout << ", Activation Function = "; + std::cout << static_cast<int>(fF) << std::endl; +} + +//______________________________________________________________________________ +// +// The Shared Layer Class - Implementation +//______________________________________________________________________________ + +//______________________________________________________________________________ +template<typename Architecture_t> +TSharedLayer<Architecture_t>::TSharedLayer(size_t BatchSize, + TLayer<Architecture_t> &layer) +: fBatchSize(BatchSize), +fInputWidth(layer.GetInputWidth()), fWidth(layer.GetWidth()), +fDropoutProbability(layer.GetDropoutProbability()), +fWeights(layer.GetWeights()), fBiases(layer.GetBiases()), +fOutput(fBatchSize, fWidth), fDerivatives(fBatchSize, fWidth), +fWeightGradients(fWidth, fInputWidth), fBiasGradients(fWidth, 1), +fActivationGradients(fBatchSize, fWidth), fF(layer.GetActivationFunction()) +{ + // Nothing to do here. 
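+   // fWeights and fBiases were bound to the matrices of the given layer in the
+   // initializer list, so any update to that layer's parameters is immediately
+   // visible through this object.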
+} + +//______________________________________________________________________________ +template<typename Architecture_t> +TSharedLayer<Architecture_t>::TSharedLayer(const TSharedLayer &layer) + : fBatchSize(layer.fBatchSize), + fInputWidth(layer.GetInputWidth()), fWidth(layer.GetWidth()), + fWeights(layer.fWeights), fBiases(layer.fBiases), + fOutput(layer.fBatchSize, fWidth), fDerivatives(layer.fBatchSize, fWidth), + fWeightGradients(fWidth, fInputWidth), fBiasGradients(fWidth, 1), + fActivationGradients(layer.fBatchSize, fWidth), + fF(layer.fF) +{ + // Nothing to do here. +} + +//______________________________________________________________________________ +template<typename Architecture_t> +auto inline TSharedLayer<Architecture_t>::Forward(Matrix_t & input, + bool applyDropout) +-> void +{ + if (applyDropout && (fDropoutProbability != 1.0)) { + Architecture_t::Dropout(input, fDropoutProbability); + } + Architecture_t::MultiplyTranspose(fOutput, input, fWeights); + Architecture_t::AddRowWise(fOutput, fBiases); + evaluateDerivative<Architecture_t>(fDerivatives, fF, fOutput); + evaluate<Architecture_t>(fOutput, fF); +} + +//______________________________________________________________________________ +template<typename Architecture_t> +auto inline TSharedLayer<Architecture_t>::Backward(Matrix_t & gradients_backward, + const Matrix_t & activations_backward, + ERegularization r, + Scalar_t weightDecay) +-> void +{ + Architecture_t::Backward(gradients_backward, + fWeightGradients, + fBiasGradients, + fDerivatives, + fActivationGradients, + fWeights, + activations_backward); + addRegularizationGradients<Architecture_t>(fWeightGradients, + fWeights, + weightDecay, r); +} + +//______________________________________________________________________________ +template<typename Architecture_t> +void TSharedLayer<Architecture_t>::Print() const +{ + std::cout << "Width = " << fWeights.GetNrows(); + std::cout << ", Activation Function = "; + std::cout << static_cast<int>(fF) << std::endl; +} + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Minimizers.h b/tmva/tmva/inc/TMVA/DNN/Minimizers.h new file mode 100644 index 0000000000000000000000000000000000000000..561ecc0bdf802158c6744b49c1402beedbd5605a --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Minimizers.h @@ -0,0 +1,748 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh 21/06/16 + +/************************************************************************* + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +#ifndef TMVA_DNN_MINIMIZERS +#define TMVA_DNN_MINIMIZERS + +#include "DataLoader.h" +#include "Functions.h" +#include <chrono> + +namespace TMVA { +namespace DNN { + +//______________________________________________________________________________ +// +// Generic Gradient Descent Class +//______________________________________________________________________________ +// + +/*** \class TGradientDescent +* +* Generic implementation of gradient descent minimization. +* +* The TGradientDescent class implements an architecture and input data +* independent implementation of the gradient descent minimization algorithm. +* +* Provides Train(...) and TrainMomentum(...) functions that perform a complete +* training of a neural network. 
Thos are mainly used for testing since for +* production a more fine grained control of the training process is desirable. +* This is provided by the Step(...), StepMomentum(...) and StepNesterov(...) +* functions that perform a single minimization step. +* +* The main training characteristics are defined by the provided learning rate, +* the test interval and the convergence steps required for convergence. The +* test interval defines how often the error on the validation set is computed +* and is the values with which the step counter is increased each time +* the HasConverged() member function is called. A convergence step is defined as +* a step in which the test error is NOT less thatn 0.995 times the current +* minimal test error that has been reached. If between two subsequent calls +* to HasConverged(Double_t) the test error has not been sufficiently reduced +* it is assumed that a number of convergence steps equal to the test interval +* has been performed. +* +*/ +template<typename Architecture_t> +class TGradientDescent +{ +public: + using Scalar_t = typename Architecture_t::Scalar_t; + using Matrix_t = typename Architecture_t::Matrix_t; + +private: + size_t fBatchSize; ///< Batch size to use for the training. + size_t fStepCount; ///< Number of steps performed in the current + ///< training sessiong. + size_t fConvergenceSteps; ///< Number of training epochs without considerable + ///< decrease in the test error for convergence. + size_t fConvergenceCount; ///< Current number of training epochs without + ///< considerable decrease in the test error. + size_t fTestInterval; ///< Interval for the computation of the test error. + Scalar_t fTrainingError;///< Holds the most recently computed training loss. + Scalar_t fTestError; ///< Holds the most recently computed test loss. + Scalar_t fLearningRate; ///< Learning rate \f$\alpha\f$ + Scalar_t fMinimumError; ///< The minimum loss achieved on the training set + ///< during the current traning session. + +public: + TGradientDescent(); + TGradientDescent(Scalar_t learningRate, + size_t convergenceSteps, + size_t testInterval); + /** Reset minimizer object to initial state. Does nothing for this minimizer. */ + void Reset() {}; + + /** Train the given net using the given training input data (events), training + output data (labels), test input data (events), test output data (labels). */ + template <typename Data_t, typename Net_t> + Scalar_t Train(const Data_t & TrainingDataIn, size_t nTrainingSamples, + const Data_t & TestDataIn, size_t nTestSamples, + Net_t & net, size_t nThreads = 1); + + /** Same as Train(...) but uses the given momentum.*/ + template <typename Data_t, typename Net_t> + Scalar_t TrainMomentum(const Data_t & TrainingDataIn, size_t nTrainingSamples, + const Data_t & TestDataIn, size_t nTestSamples, + Net_t & net, Scalar_t momentum, size_t nThreads = 1); + + /** Perform a single optimization step on a given batch. Propagates the input + matrix foward through the net, evaluates the loss and propagates the gradients + backward through the net. The computed gradients are scaled by the learning + rate \f$\alpha\f$ and subtracted from the weights and bias values of each + layer. */ + template <typename Net_t> + void Step(Net_t &net, Matrix_t &input, const Matrix_t &output); + + /** Same as Step(...) but also evaluate the loss on the given training data. + * Note that this requires synchronization between host and device. 
*/ + template <typename Net_t> + Scalar_t StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output); + + /** Perform multiple optimization steps simultaneously. Performs the + * backprop algorithm on the input batches given in \p batches on + * the neural networks given in \p nets. The forward and backward propagation + * steps are executed in an interleaving manner in order to exploit potential + * batch-level parallelism for asynchronous device calls. + */ + template <typename Net_t> + void Step(Net_t &master, + std::vector<Net_t> &nets, + std::vector<TBatch<Architecture_t>> &batches); + + /** Same as the Step(...) method for multiple batches but uses momentum. */ + template <typename Net_t> + void StepMomentum(Net_t &master, + std::vector<Net_t> &nets, + std::vector<TBatch<Architecture_t>> &batches, + Scalar_t momentum); + template <typename Net_t> + + /** Same as the Step(...) method for multiple batches but uses Nesterov + * momentum. */ + void StepNesterov(Net_t &master, + std::vector<Net_t> &nets, + std::vector<TBatch<Architecture_t>> &batches, + Scalar_t momentum); + + /** Does not evaluate the loss and therefore not trigger a possible synchronization + * with the device. Trains the weights of each layer, but only the bias terms of + * the first layer for compatibility with the previous implementation. */ + template <typename Net_t> + void StepReducedWeights(Net_t &net, Matrix_t &input, const Matrix_t &output); + + /** Similar to StepReducedWeights(...) but also evaluates the loss. May trigger + * synchronization with the device. */ + template <typename Net_t> + Scalar_t StepReducedWeightsLoss(Net_t &net, + Matrix_t &input, + const Matrix_t &output); + /** Increases the minimization step counter by the test error evaluation + * period and uses the current internal value of the test error to + * determine if the minimization has converged. */ + bool HasConverged(); + /** Increases the minimization step counter by the test error evaluation + * period and uses the provided test error value of to determine if + * the minimization has converged. */ + bool HasConverged(Scalar_t testError); + + size_t GetConvergenceCount() const {return fConvergenceCount;} + size_t GetConvergenceSteps() const {return fConvergenceSteps;} + Scalar_t GetTrainingError() const {return fTrainingError;} + Scalar_t GetTestError() const {return fTestError;} + size_t GetTestInterval() const {return fTestInterval;} + + void SetConvergenceSteps(size_t steps) {fConvergenceSteps = steps;} + void SetTestInterval(size_t interval) {fTestInterval = interval;} + void SetLearningRate(Scalar_t rate) {fLearningRate = rate;} + void SetBatchSize(Scalar_t rate) {fBatchSize = rate;} +}; + +// +// Implementation +//______________________________________________________________________________ +template<typename Architecture_t> + TGradientDescent<Architecture_t>::TGradientDescent() + : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), + fConvergenceCount(0), fTestInterval(0), fLearningRate(0), + fMinimumError(1e100) +{ + // Nothing to do here. +} + +//______________________________________________________________________________ +template<typename Architecture_t> +TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate, + size_t convergenceSteps, + size_t testInterval) + : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps), + fConvergenceCount(0), fTestInterval(testInterval), fLearningRate(learningRate), + fMinimumError(1e100) +{ + // Nothing to do here. 
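+   //
+   // Illustrative use (a minimal sketch; Net_t, the data objects and the sample
+   // counts are assumed to be set up elsewhere):
+   //
+   //    TGradientDescent<Architecture_t> minimizer(/*learningRate=*/ 0.01,
+   //                                               /*convergenceSteps=*/ 20,
+   //                                               /*testInterval=*/ 5);
+   //    minimizer.Train(trainData, nTrainSamples, testData, nTestSamples,
+   //                    net, /*nThreads=*/ 2);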
+} + +//______________________________________________________________________________ +template<typename Architecture_t> +template <typename Data_t, typename Net_t> + auto TGradientDescent<Architecture_t>::Train(const Data_t & trainingData, + size_t nTrainingSamples, + const Data_t & testData, + size_t nTestSamples, + Net_t & net, + size_t nThreads) + -> Scalar_t +{ + // Reset iteration state. + fMinimumError = 1e100; + fConvergenceCount = 0; + fStepCount = 0; + + // Prepare training data. + bool converged = false; + + TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples, + net.GetBatchSize(), + net.GetInputWidth(), + net.GetOutputWidth(), nThreads); + auto testNet = net.CreateClone(nTestSamples); + TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples, + testNet.GetBatchSize(), + testNet.GetInputWidth(), + net.GetOutputWidth()); + std::vector<Net_t> nets{}; + nets.reserve(nThreads); + for (size_t i = 0; i < nThreads; i++) { + nets.push_back(net); + for (size_t j = 0; j < net.GetDepth(); j++) + { + auto &masterLayer = net.GetLayer(j); + auto &layer = nets.back().GetLayer(j); + Architecture_t::Copy(layer.GetWeights(), + masterLayer.GetWeights()); + Architecture_t::Copy(layer.GetBiases(), + masterLayer.GetBiases()); + } + } + + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + + while (!converged) + { + fStepCount++; + + trainLoader.Shuffle(); + std::vector<TBatch<Architecture_t>> batches{}; + for (size_t i = 0; i < nTrainingSamples / net.GetBatchSize(); i += nThreads) { + batches.clear(); + for (size_t j = 0; j < nThreads; j++) { + batches.reserve(nThreads); + batches.push_back(trainLoader.GetBatch()); + } + Step(net, nets, batches); + } + + // Compute test error. + if ((fStepCount % fTestInterval) == 0) { + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + start = std::chrono::system_clock::now(); + double seconds = elapsed_seconds.count(); + double batchesInEpoch = (double) (nTrainingSamples / net.GetBatchSize()); + double nFlops = batchesInEpoch * fTestInterval; + nFlops *= net.GetNFlops(); + std::cout << "Elapsed time for " << fTestInterval << " Epochs: " + << seconds << " [s] => " << nFlops * 1e-9 / seconds + << " GFlop/s" << std::endl; + + auto b = *testLoader.begin(); + auto inputMatrix = b.GetInput(); + auto outputMatrix = b.GetOutput(); + Scalar_t loss = testNet.Loss(inputMatrix, outputMatrix); + + std::cout << "Step " << fStepCount << ": Training Error = " + << loss << std::endl; + converged = HasConverged(); + } + + } + return fMinimumError; +} + +//______________________________________________________________________________ +template<typename Architecture_t> +template <typename Data_t, typename Net_t> +auto TGradientDescent<Architecture_t>::TrainMomentum(const Data_t & trainingData, + size_t nTrainingSamples, + const Data_t & testData, + size_t nTestSamples, + Net_t & net, + Scalar_t momentum, + size_t nThreads) + -> Scalar_t +{ + // Reset iteration state. + fMinimumError = 1e100; + fConvergenceCount = 0; + fStepCount = 0; + + // Prepare training data. 
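+   // One loader streams training batches to the nThreads worker nets created
+   // below; a second loader and a batch-sized clone of the net are used only to
+   // compute the test error every fTestInterval epochs.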
+ bool converged = false; + + TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples, + net.GetBatchSize(), + net.GetInputWidth(), + net.GetOutputWidth(), nThreads); + auto testNet = net.CreateClone(net.GetBatchSize()); + TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples, + testNet.GetBatchSize(), + testNet.GetInputWidth(), + net.GetOutputWidth()); + + net.InitializeGradients(); + std::vector<Net_t> nets{}; + nets.reserve(nThreads); + for (size_t i = 0; i < nThreads; i++) { + nets.push_back(net); + for (size_t j = 0; j < net.GetDepth(); j++) + { + auto &masterLayer = net.GetLayer(j); + auto &layer = nets.back().GetLayer(j); + Architecture_t::Copy(layer.GetWeights(), + masterLayer.GetWeights()); + Architecture_t::Copy(layer.GetBiases(), + masterLayer.GetBiases()); + } + } + + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + + while (!converged) + { + fStepCount++; + + trainLoader.Shuffle(); + // Iterate over epoch. + std::vector<TBatch<Architecture_t>> batches{}; + for (size_t i = 0; i < nTrainingSamples / net.GetBatchSize(); i += nThreads) { + batches.clear(); + batches.reserve(nThreads); + for (size_t j = 0; j < nThreads; j++) { + batches.push_back(trainLoader.GetBatch()); + } + if (momentum != 0.0) { + StepMomentum(net, nets, batches, momentum); + } else { + Step(net, nets, batches); + } + } + + // Compute test error. + if ((fStepCount % fTestInterval) == 0) { + fTestError = 0.0; + for (size_t i = 0; i < nTestSamples / net.GetBatchSize(); i += nThreads) { + auto b = testLoader.GetBatch(); + auto inputMatrix = b.GetInput(); + auto outputMatrix = b.GetOutput(); + fTestError += testNet.Loss(inputMatrix, outputMatrix); + } + fTestError /= (Double_t) nTestSamples / net.GetBatchSize(); + converged = HasConverged(); + } + + } + return fMinimumError; +} + +//______________________________________________________________________________ +template<typename Architecture_t> + template <typename Net_t> + void inline TGradientDescent<Architecture_t>::Step(Net_t & net, + Matrix_t &input, + const Matrix_t &output) +{ + //Scalar_t loss = net.Loss(input, output); + //fTrainingError = loss; + net.Forward(input); + net.Backward(input, output); + + for (size_t i = 0; i < net.GetDepth(); i++) + { + auto &layer = net.GetLayer(i); + Architecture_t::ScaleAdd(layer.GetWeights(), + layer.GetWeightGradients(), + -fLearningRate); + Architecture_t::ScaleAdd(layer.GetBiases(), + layer.GetBiasGradients(), + -fLearningRate); + } +} + +//______________________________________________________________________________ +template<typename Architecture_t> +template <typename Net_t> +auto inline TGradientDescent<Architecture_t>::StepLoss(Net_t & net, + Matrix_t &input, + const Matrix_t &output) + -> Scalar_t +{ + //Scalar_t loss = net.Loss(input, output); + //fTrainingError = loss; + Scalar_t loss = net.Loss(input, output); + net.Backward(input, output); + + for (size_t i = 0; i < net.GetDepth(); i++) + { + auto &layer = net.GetLayer(i); + Architecture_t::ScaleAdd(layer.GetWeights(), + layer.GetWeightGradients(), + -fLearningRate); + Architecture_t::ScaleAdd(layer.GetBiases(), + layer.GetBiasGradients(), + -fLearningRate); + } + return loss; +} + +//______________________________________________________________________________ +template<typename Architecture_t> + template <typename Net_t> + void inline TGradientDescent<Architecture_t>::Step( + Net_t & master, + std::vector<Net_t> & nets, + std::vector<TBatch<Architecture_t>> 
& batches) +{ + typename Architecture_t::Matrix_t dummy(0,0); + size_t depth = master.GetDepth(); + + // Forward + for (size_t j = 0; j < nets.size(); j++) { + nets[j].GetLayer(0).Forward(batches[j].GetInput()); + } + + for (size_t i = 1; i < depth; i++) + { + for (size_t j = 0; j < nets.size(); j++) { + nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput()); + } + } + // Gradients + for (size_t j = 0; j < nets.size(); j++) { + evaluateGradients<Architecture_t>( + nets[j].GetLayer(depth-1).GetActivationGradients(), + nets[j].GetLossFunction(), + batches[j].GetOutput(), + nets[j].GetLayer(depth-1).GetOutput()); + } + // Backward + for (size_t i = depth - 1; i > 0; i--) + { + for (size_t j = 0; j < nets.size(); j++) { + nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(), + nets[j].GetLayer(i-1).GetOutput(), + nets[j].GetRegularization(), + nets[j].GetWeightDecay()); + } + } + for (size_t j = 0; j < nets.size(); j++) { + nets[j].GetLayer(0).Backward(dummy, + batches[j].GetInput(), + nets[j].GetRegularization(), + nets[j].GetWeightDecay()); + } + + for (size_t j = 0; j < nets.size(); j++) { + for (size_t i = 0; i < depth; i++) + { + auto &masterLayer = master.GetLayer(i); + auto &layer = nets[j].GetLayer(i); + Architecture_t::ScaleAdd(masterLayer.GetWeights(), + layer.GetWeightGradients(), + -fLearningRate); + Architecture_t::Copy(layer.GetWeights(), + masterLayer.GetWeights()); + Architecture_t::ScaleAdd(masterLayer.GetBiases(), + layer.GetBiasGradients(), + -fLearningRate); + Architecture_t::Copy(layer.GetBiases(), + masterLayer.GetBiases()); + } + } +} + +//______________________________________________________________________________ +template<typename Architecture_t> +template <typename Net_t> +void inline TGradientDescent<Architecture_t>::StepMomentum( + Net_t & master, + std::vector<Net_t> & nets, + std::vector<TBatch<Architecture_t>> & batches, + Scalar_t momentum) +{ + typename Architecture_t::Matrix_t dummy(0,0); + size_t depth = master.GetDepth(); + + // Forward + for (size_t j = 0; j < nets.size(); j++) { + nets[j].GetLayer(0).Forward(batches[j].GetInput()); + } + + for (size_t i = 1; i < depth; i++) + { + for (size_t j = 0; j < nets.size(); j++) { + nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput()); + } + } + // Gradients + for (size_t j = 0; j < nets.size(); j++) { + evaluateGradients<Architecture_t>( + nets[j].GetLayer(depth-1).GetActivationGradients(), + nets[j].GetLossFunction(), + batches[j].GetOutput(), + nets[j].GetLayer(depth-1).GetOutput()); + } + // Backward + for (size_t i = depth - 1; i > 0; i--) + { + for (size_t j = 0; j < nets.size(); j++) { + nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(), + nets[j].GetLayer(i-1).GetOutput(), + nets[j].GetRegularization(), + nets[j].GetWeightDecay()); + Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(), + nets[j].GetLayer(i).GetWeightGradients(), + - fLearningRate / momentum); + Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(), + nets[j].GetLayer(i).GetBiasGradients(), + - fLearningRate / momentum); + } + Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(), + master.GetLayer(i).GetWeightGradients(), + momentum - 1.0); + Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(), + master.GetLayer(i).GetBiasGradients(), + momentum - 1.0); + } + for (size_t j = 0; j < nets.size(); j++) { + nets[j].GetLayer(0).Backward(dummy, + batches[j].GetInput(), + nets[j].GetRegularization(), + nets[j].GetWeightDecay()); + 
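+      // Accumulate the layer-0 gradients of each worker net into the master
+      // gradient matrices (the remaining layers were handled in the loop above),
+      // scaled by -fLearningRate / momentum. Together with the rescaling by
+      // (momentum - 1.0) below, this turns the master gradient matrices into
+      // the momentum velocity v <- momentum * v - learningRate * sum_j g_j.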
Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(), + nets[j].GetLayer(0).GetWeightGradients(), + - fLearningRate / momentum); + Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(), + nets[j].GetLayer(0).GetBiasGradients(), + - fLearningRate / momentum); + } + + Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(), + master.GetLayer(0).GetWeightGradients(), + momentum - 1.0); + Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(), + master.GetLayer(0).GetBiasGradients(), + momentum - 1.0); + + for (size_t i = 0; i < depth; i++) + { + auto &masterLayer = master.GetLayer(i); + Architecture_t::ScaleAdd(masterLayer.GetWeights(), + masterLayer.GetWeightGradients(), + 1.0); + Architecture_t::ScaleAdd(masterLayer.GetBiases(), + masterLayer.GetBiasGradients(), + 1.0); + for (size_t j = 0; j < nets.size(); j++) { + auto &layer = nets[j].GetLayer(i); + Architecture_t::Copy(layer.GetWeights(), + masterLayer.GetWeights()); + Architecture_t::Copy(layer.GetBiases(), + masterLayer.GetBiases()); + } + } +} + +//______________________________________________________________________________ +template<typename Architecture_t> +template <typename Net_t> +void inline TGradientDescent<Architecture_t>::StepNesterov( + Net_t & master, + std::vector<Net_t> & nets, + std::vector<TBatch<Architecture_t>> & batches, + Scalar_t momentum) +{ + typename Architecture_t::Matrix_t dummy(0,0); + size_t depth = master.GetDepth(); + + // Forward + for (size_t j = 0; j < nets.size(); j++) { + nets[j].GetLayer(0).Forward(batches[j].GetInput()); + } + + for (size_t i = 1; i < depth; i++) + { + for (size_t j = 0; j < nets.size(); j++) { + nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput()); + } + } + + // Gradients + for (size_t j = 0; j < nets.size(); j++) { + evaluateGradients<Architecture_t>( + nets[j].GetLayer(depth-1).GetActivationGradients(), + nets[j].GetLossFunction(), + batches[j].GetOutput(), + nets[j].GetLayer(depth-1).GetOutput()); + } + + // Backward + for (size_t i = depth - 1; i > 0; i--) + { + for (size_t j = 0; j < nets.size(); j++) { + nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(), + nets[j].GetLayer(i-1).GetOutput(), + nets[j].GetRegularization(), + nets[j].GetWeightDecay()); + } + } + + for (size_t j = 0; j < nets.size(); j++) { + nets[j].GetLayer(0).Backward(dummy, + batches[j].GetInput(), + nets[j].GetRegularization(), + nets[j].GetWeightDecay()); + } + + for (size_t i = 0; i < depth; i++) + { + auto &masterLayer = master.GetLayer(i); + for (size_t j = 0; j < nets.size(); j++) { + auto &layer = nets[j].GetLayer(i); + Architecture_t::Copy(layer.GetWeights(), + masterLayer.GetWeights()); + Architecture_t::Copy(layer.GetBiases(), + masterLayer.GetBiases()); + Architecture_t::ScaleAdd(layer.GetWeights(), + masterLayer.GetWeightGradients(), + 1.0); + Architecture_t::ScaleAdd(layer.GetBiases(), + masterLayer.GetBiasGradients(), + 1.0); + } + for (size_t j = 0; j < nets.size(); j++) { + auto &layer = nets[j].GetLayer(i); + Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(), + layer.GetWeightGradients(), + - fLearningRate / momentum); + Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(), + layer.GetBiasGradients(), + - fLearningRate / momentum); + } + Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(), + masterLayer.GetWeightGradients(), + momentum - 1.0); + Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(), + masterLayer.GetBiasGradients(), + momentum - 1.0); + 
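+      // The master gradient matrices of this layer now hold the updated
+      // velocity; apply it to the master weights and biases. The worker nets
+      // were already set to the master weights plus the stored velocity at the
+      // top of this loop, so the next gradients are evaluated at that shifted
+      // (look-ahead) position, which is what distinguishes this Nesterov
+      // variant from plain momentum.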
Architecture_t::ScaleAdd(masterLayer.GetWeights(), + masterLayer.GetWeightGradients(), + 1.0); + Architecture_t::ScaleAdd(masterLayer.GetBiases(), + masterLayer.GetBiasGradients(), + 1.0); + } +} + +//______________________________________________________________________________ +template<typename Architecture_t> +template <typename Net_t> +void inline TGradientDescent<Architecture_t>::StepReducedWeights( + Net_t & net, + Matrix_t &input, + const Matrix_t &output) +{ + net.Forward(input); + net.Backward(input, output); + + for (size_t i = 0; i < net.GetDepth(); i++) + { + auto &layer = net.GetLayer(i); + Architecture_t::ScaleAdd(layer.GetWeights(), + layer.GetWeightGradients(), + -fLearningRate); + if (i == 0) { + Architecture_t::ScaleAdd(layer.GetBiases(), + layer.GetBiasGradients(), + -fLearningRate); + } + } +} + +//______________________________________________________________________________ +template<typename Architecture_t> + template <typename Net_t> + auto inline TGradientDescent<Architecture_t>::StepReducedWeightsLoss( + Net_t & net, + Matrix_t &input, + const Matrix_t &output) + -> Scalar_t +{ + Scalar_t loss = net.Loss(input, output); + fTrainingError = loss; + net.Backward(input, output); + + for (size_t i = 0; i < net.GetDepth(); i++) + { + auto &layer = net.GetLayer(i); + Architecture_t::ScaleAdd(layer.GetWeights(), + layer.GetWeightGradients(), + -fLearningRate); + if (i == 0) { + Architecture_t::ScaleAdd(layer.GetBiases(), + layer.GetBiasGradients(), + -fLearningRate); + } + } + return loss; +} + +//______________________________________________________________________________ +template<typename Architecture_t> +bool inline TGradientDescent<Architecture_t>::HasConverged() +{ + if (fTestError < fMinimumError * 0.999) { + fConvergenceCount = 0; + fMinimumError = fTestError; + } else { + fConvergenceCount++; + } + + return (fConvergenceCount >= fConvergenceSteps); +} + +//______________________________________________________________________________ +template<typename Architecture_t> +bool inline TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError) +{ + fTestError = testError; + if (fTestError < fMinimumError * 0.999) { + fConvergenceCount = 0; + fMinimumError = fTestError; + } else { + fConvergenceCount += fTestInterval; + } + return (fConvergenceCount >= fConvergenceSteps); +} +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/DNN/Net.h b/tmva/tmva/inc/TMVA/DNN/Net.h new file mode 100644 index 0000000000000000000000000000000000000000..0fa4121f0d7817477837abc55b7c7c929e5c926f --- /dev/null +++ b/tmva/tmva/inc/TMVA/DNN/Net.h @@ -0,0 +1,408 @@ +// @(#)root/tmva: $Id$ +// Author: Simon Pfreundschuh 20/06/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +#ifndef TMVA_DNN_NET +#define TMVA_DNN_NET + +#include <vector> +#include <iostream> + +#include "Layer.h" + +namespace TMVA { +namespace DNN { + +/** \class TNet + + Generic neural network class. + + This generic neural network class represents a concrete neural + network through a vector of layers and coordinates the forward + and backward propagation through the net. 
+
+   The net takes as input a batch from the training data given in
+   matrix form, with each row corresponding to a certain training
+   event.
+
+   On construction, the neural network allocates all the memory
+   required for the training of the neural net and keeps it until
+   its destruction.
+
+   The Architecture_t type argument holds the architecture-specific
+   data types, i.e. the matrix type Matrix_t and the scalar type
+   Scalar_t.
+
+   \tparam Architecture_t The architecture type, which provides the
+                          data types Matrix_t and Scalar_t.
+   \tparam Layer_t The type used for the layers. Can be either
+                   TLayer<Architecture_t> or TSharedLayer<Architecture_t>.
+*/
+template<typename Architecture_t, typename Layer_t = TLayer<Architecture_t>>
+class TNet {
+
+public:
+   using Matrix_t        = typename Architecture_t::Matrix_t;
+   using Scalar_t        = typename Architecture_t::Scalar_t;
+   using LayerIterator_t = typename std::vector<Layer_t>::iterator;
+
+private:
+   size_t fBatchSize;  ///< Batch size for training and evaluation of the Network.
+   size_t fInputWidth; ///< Number of features in a single input event.
+
+   std::vector<Layer_t> fLayers; ///< Layers in the network.
+
+   Matrix_t        fDummy;       ///< Empty matrix for last step in back propagation.
+   ELossFunction   fJ;           ///< The loss function of the network.
+   ERegularization fR;           ///< The regularization used for the network.
+   Scalar_t        fWeightDecay; ///< The weight decay factor.
+
+public:
+   TNet();
+   TNet(const TNet & other);
+   template<typename OtherArchitecture_t>
+   TNet(size_t batchSize, const TNet<OtherArchitecture_t> &);
+   /*! Construct a neural net for a given batch size and input width, with
+    * the given loss function and regularization. */
+   TNet(size_t batchSize,
+        size_t inputWidth,
+        ELossFunction fJ,
+        ERegularization fR = ERegularization::kNone,
+        Scalar_t fWeightDecay = 0.0);
+   /*! Create a clone that uses the same weight and bias matrices but
+    * potentially a different batch size. */
+   TNet<Architecture_t, TSharedLayer<Architecture_t>> CreateClone(size_t batchSize);
+
+   /*! Add a layer of the given size to the neural net. */
+   void AddLayer(size_t width, EActivationFunction f,
+                 Scalar_t dropoutProbability = 1.0);
+
+   /*! Remove all layers from the network. */
+   void Clear();
+
+   /*! Add a layer which shares its weights with another TNet instance. */
+   template <typename SharedLayer>
+   void AddLayer(SharedLayer & layer);
+
+   /*! Iterator to the first layer of the net. */
+   LayerIterator_t LayersBegin() {return fLayers.begin();}
+
+   /*! Iterator past the last layer of the net. */
+   LayerIterator_t LayersEnd() {return fLayers.end();}
+
+   /*! Initialize the weights in the net with the given
+    * initialization method. */
+   inline void Initialize(EInitialization m);
+
+   /*! Initialize the gradients in the net to zero. Required if the net is
+    * used to store velocities of momentum-based minimization techniques. */
+   inline void InitializeGradients();
+
+   /*! Forward a given input through the neural net. Computes
+    * all layer activations up to the output layer. */
+   inline void Forward(Matrix_t& X, bool applyDropout = false);
+
+   /*! Compute the weight gradients in the net from the given training
+    * samples X and training labels Y. */
+   inline void Backward(const Matrix_t &X, const Matrix_t &Y);
+
+   /*! Evaluate the loss function of the net using the activations
+    * that are currently stored in the output layer. */
+   inline Scalar_t Loss(const Matrix_t &Y) const;
+
+   /*!
+    * Propagate the input batch X through the net and evaluate the
+    * error function for the resulting activations of the output
+    * layer. */
+   inline Scalar_t Loss(Matrix_t &X, const Matrix_t &Y, bool applyDropout = false);
+
+   /*! Compute the neural network prediction obtained from forwarding the
+    * batch X through the neural network and applying the output function
+    * f to the activation of the last layer in the network. */
+   inline void Prediction(Matrix_t &Y_hat, Matrix_t &X, EOutputFunction f);
+
+   /*! Compute the neural network prediction obtained from applying the output
+    * function f to the activation of the last layer in the network. */
+   inline void Prediction(Matrix_t &Y_hat, EOutputFunction f) const;
+
+   Scalar_t GetNFlops();
+
+   size_t GetDepth() const                   {return fLayers.size();}
+   size_t GetBatchSize() const               {return fBatchSize;}
+   Layer_t & GetLayer(size_t i)              {return fLayers[i];}
+   const Layer_t & GetLayer(size_t i) const  {return fLayers[i];}
+   ELossFunction GetLossFunction() const     {return fJ;}
+   Matrix_t & GetOutput()                    {return fLayers.back().GetOutput();}
+   size_t GetInputWidth() const              {return fInputWidth;}
+   size_t GetOutputWidth() const             {return fLayers.back().GetWidth();}
+   ERegularization GetRegularization() const {return fR;}
+   Scalar_t GetWeightDecay() const           {return fWeightDecay;}
+
+   void SetBatchSize(size_t batchSize)       {fBatchSize = batchSize;}
+   void SetInputWidth(size_t inputWidth)     {fInputWidth = inputWidth;}
+   void SetRegularization(ERegularization R) {fR = R;}
+   void SetLossFunction(ELossFunction J)     {fJ = J;}
+   void SetWeightDecay(Scalar_t weightDecay) {fWeightDecay = weightDecay;}
+   void SetDropoutProbabilities(const std::vector<Double_t> & probabilities);
+
+   void Print();
+};
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+TNet<Architecture_t, Layer_t>::TNet()
+   : fBatchSize(0), fInputWidth(0), fDummy(0,0),
+     fJ(ELossFunction::kMeanSquaredError), fR(ERegularization::kNone),
+     fWeightDecay(0.0)
+{
+   // Nothing to do here.
+}
+
+//______________________________________________________________________________
+template<typename Architecture_t, typename Layer_t>
+TNet<Architecture_t, Layer_t>::TNet(const TNet & other)
+   : fBatchSize(other.fBatchSize), fInputWidth(other.fInputWidth),
+     fLayers(other.fLayers), fDummy(0,0), fJ(other.fJ), fR(other.fR),
+     fWeightDecay(other.fWeightDecay)
+{
+   // Nothing to do here.
+} + +//______________________________________________________________________________ +template<typename Architecture_t, typename Layer_t> +template<typename OtherArchitecture_t> +TNet<Architecture_t, Layer_t>::TNet(size_t batchSize, + const TNet<OtherArchitecture_t> & other) + : fBatchSize(batchSize), fInputWidth(other.GetInputWidth()), + fDummy(0,0), fJ(other.GetLossFunction()), fR(other.GetRegularization()) +{ + fLayers.reserve(other.GetDepth()); + for (size_t i = 0; i < other.GetDepth(); i++) { + AddLayer(other.GetLayer(i).GetWidth(), + other.GetLayer(i).GetActivationFunction(), + other.GetLayer(i).GetDropoutProbability()); + fLayers[i].GetWeights() = (TMatrixT<Double_t>) other.GetLayer(i).GetWeights(); + fLayers[i].GetBiases() = (TMatrixT<Double_t>) other.GetLayer(i).GetBiases(); + } +} + +//______________________________________________________________________________ +template<typename Architecture_t, typename Layer_t> + TNet<Architecture_t, Layer_t>::TNet(size_t batchSize, + size_t inputWidth, + ELossFunction J, + ERegularization R, + Scalar_t weightDecay) + : fBatchSize(batchSize), fInputWidth(inputWidth), fDummy(0,0), + fJ(J), fR(R), fWeightDecay(weightDecay) +{ + // Nothing to do here. +} + +//______________________________________________________________________________ +template<typename Architecture_t, typename Layer_t> + auto TNet<Architecture_t, Layer_t>::CreateClone(size_t BatchSize) + -> TNet<Architecture_t, TSharedLayer<Architecture_t>> +{ + TNet<Architecture_t, TSharedLayer<Architecture_t>> other(BatchSize, + fInputWidth, + fJ, fR); + for (auto &l : fLayers) { + other.AddLayer(l); + } + return other; +} + +//______________________________________________________________________________ +template<typename Architecture_t, typename Layer_t> + void TNet<Architecture_t, Layer_t>::AddLayer(size_t width, + EActivationFunction f, + Scalar_t dropoutProbability) +{ + if (fLayers.size() == 0) { + fLayers.emplace_back(fBatchSize, fInputWidth, width, f, dropoutProbability); + } else { + size_t prevWidth = fLayers.back().GetWidth(); + fLayers.emplace_back(fBatchSize, prevWidth, width, f, dropoutProbability); + } +} + +//______________________________________________________________________________ +template<typename Architecture_t, typename Layer_t> + void TNet<Architecture_t, Layer_t>::Clear() +{ + fLayers.clear(); +} + +//______________________________________________________________________________ +template<typename Architecture_t, typename Layer_t> + template<typename SharedLayer_t> + inline void TNet<Architecture_t, Layer_t>::AddLayer(SharedLayer_t & layer) +{ + fLayers.emplace_back(fBatchSize, layer); +} + +//______________________________________________________________________________ +template<typename Architecture_t, typename Layer_t> + inline void TNet<Architecture_t, Layer_t>::Initialize(EInitialization m) +{ + for (auto &l : fLayers) { + l.Initialize(m); + } +} + +//______________________________________________________________________________ +template<typename Architecture_t, typename Layer_t> + inline void TNet<Architecture_t, Layer_t>::InitializeGradients() +{ + for (auto &l : fLayers) { + initialize<Architecture_t>(l.GetWeightGradients(), EInitialization::kZero); + initialize<Architecture_t>(l.GetBiasGradients(), EInitialization::kZero); + } +} + +//______________________________________________________________________________ +template<typename Architecture_t, typename Layer_t> +inline void TNet<Architecture_t, Layer_t>::Forward(Matrix_t &input, + bool 
applyDropout) +{ + fLayers.front().Forward(input, applyDropout); + + for (size_t i = 1; i < fLayers.size(); i++) { + fLayers[i].Forward(fLayers[i-1].GetOutput(), applyDropout); + } +} + +//______________________________________________________________________________ +template<typename Architecture_t, typename Layer_t> + inline void TNet<Architecture_t, Layer_t>::Backward(const Matrix_t &X, + const Matrix_t &Y) +{ + + evaluateGradients<Architecture_t>(fLayers.back().GetActivationGradients(), + fJ, Y, fLayers.back().GetOutput()); + + for (size_t i = fLayers.size()-1; i > 0; i--) { + auto & activation_gradient_backward + = fLayers[i-1].GetActivationGradients(); + auto & activations_backward + = fLayers[i-1].GetOutput(); + fLayers[i].Backward(activation_gradient_backward, + activations_backward, fR, fWeightDecay); + } + fLayers[0].Backward(fDummy, X, fR, fWeightDecay); + +} + +//______________________________________________________________________________ +template<typename Architecture_t, typename Layer_t> + inline auto TNet<Architecture_t, Layer_t>::Loss(const Matrix_t &Y) const + -> Scalar_t +{ + auto loss = evaluate<Architecture_t>(fJ, Y, fLayers.back().GetOutput()); + for (auto &l : fLayers) { + loss += fWeightDecay * regularization<Architecture_t>(l.GetWeights(), fR); + } + return loss; +} + +//______________________________________________________________________________ +template<typename Architecture_t, typename Layer_t> + inline auto TNet<Architecture_t, Layer_t>::Loss(Matrix_t &X, + const Matrix_t &Y, + bool applyDropout) + -> Scalar_t +{ + Forward(X, applyDropout); + return Loss(Y); +} + +//______________________________________________________________________________ +template<typename Architecture_t, typename Layer_t> + inline void TNet<Architecture_t, Layer_t>::Prediction(Matrix_t &Yhat, + Matrix_t &X, + EOutputFunction f) +{ + Forward(X, false); + evaluate<Architecture_t>(Yhat, f, fLayers.back().GetOutput()); +} + +//______________________________________________________________________________ +template<typename Architecture_t, typename Layer_t> + inline void TNet<Architecture_t, Layer_t>::Prediction(Matrix_t &Y_hat, + EOutputFunction f) const +{ + evaluate<Architecture_t>(Y_hat, f, fLayers.back().GetOutput()); +} + +//______________________________________________________________________________ +template<typename Architecture_t, typename Layer_t> +auto TNet<Architecture_t, Layer_t>::GetNFlops() + -> Scalar_t +{ + Scalar_t flops = 0; + + Scalar_t nb = (Scalar_t) fBatchSize; + Scalar_t nlp = (Scalar_t) fInputWidth; + + for(size_t i = 0; i < fLayers.size(); i++) { + Layer_t & layer = fLayers[i]; + Scalar_t nl = (Scalar_t) layer.GetWidth(); + + // Forward propagation. + flops += nb * nl * (2.0 * nlp - 1); // Matrix mult. + flops += nb * nl; // Add bias values. + flops += 2 * nb * nl; // Apply activation function and compute + // derivative. + // Backward propagation. + flops += nb * nl; // Hadamard + flops += nlp * nl * (2.0 * nb - 1.0); // Weight gradients + flops += nl * (nb - 1); // Bias gradients + if (i > 0) { + flops += nlp * nb * (2.0 * nl - 1.0); // Previous layer gradients. 
+ } + nlp = nl; + } + return flops; +} + +//______________________________________________________________________________ +template<typename Architecture_t, typename Layer_t> +void TNet<Architecture_t, Layer_t>::SetDropoutProbabilities( + const std::vector<Double_t> & probabilities) +{ + for (size_t i = 0; i < fLayers.size(); i++) { + if (i < probabilities.size()) { + fLayers[i].SetDropoutProbability(probabilities[i]); + } else { + fLayers[i].SetDropoutProbability(1.0); + } + } +} + +//______________________________________________________________________________ +template<typename Architecture_t, typename Layer_t> + void TNet<Architecture_t, Layer_t>::Print() +{ + std::cout << "DEEP NEURAL NETWORK:"; + std::cout << " Loss function = " << static_cast<char>(fJ); + std::cout << ", Depth = " << fLayers.size() << std::endl; + + size_t i = 1; + for (auto & l : fLayers) { + std::cout << "DNN Layer " << i << ":" << std::endl; + l.Print(); + i++; + } + +} + +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/inc/TMVA/MethodDNN.h b/tmva/tmva/inc/TMVA/MethodDNN.h index 3b6bdc13971bbf1082c793b2a004556123ce383b..970fe203be38f9a7fd50a2150ec966086edda4da 100644 --- a/tmva/tmva/inc/TMVA/MethodDNN.h +++ b/tmva/tmva/inc/TMVA/MethodDNN.h @@ -11,7 +11,8 @@ * NeuralNetwork * * * * Authors (alphabetical): * - * Peter Speckmayer <peter.speckmayer@gmx.at> - CERN, Switzerland * + * Peter Speckmayer <peter.speckmayer@gmx.at> - CERN, Switzerland * + * Simon Pfreundschuh <s.pfreundschuh@gmail.com> - CERN, Switzerland * * * * Copyright (c) 2005-2015: * * CERN, Switzerland * @@ -57,117 +58,165 @@ #include "TMVA/NeuralNet.h" #endif +#include "TMVA/Tools.h" +#include "TMVA/DNN/Net.h" +#include "TMVA/DNN/Minimizers.h" +#include "TMVA/DNN/Architectures/Reference.h" -namespace TMVA { - - class MethodDNN : public MethodBase - { - - public: - - // standard constructors - MethodDNN ( const TString& jobName, - const TString& methodTitle, - DataSetInfo& theData, - const TString& theOption); - - MethodDNN ( DataSetInfo& theData, - const TString& theWeightFile ); - - virtual ~MethodDNN(); - - virtual Bool_t HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets ); - std::vector<std::pair<int,TMVA::DNN::EnumFunction>> ParseLayoutString(TString layerSpec); - std::vector<std::map<TString,TString>> ParseKeyValueString(TString parseString, TString blockDelim, TString tokenDelim); - - void Train(); - - virtual Double_t GetMvaValue( Double_t* err=0, Double_t* errUpper=0 ); - virtual const std::vector<Float_t>& GetRegressionValues(); - virtual const std::vector<Float_t>& GetMulticlassValues(); - - using MethodBase::ReadWeightsFromStream; - - // write weights to stream - void AddWeightsXMLTo ( void* parent ) const; - - // read weights from stream - void ReadWeightsFromStream( std::istream & i ); - void ReadWeightsFromXML ( void* wghtnode ); - - // ranking of input variables - const Ranking* CreateRanking(); - - // nice output - void PrintCoefficients( void ); - - // write classifier-specific monitoring information to target file - virtual void WriteMonitoringHistosToFile() const; - - protected: - - - // make ROOT-independent C++ class for classifier response (classifier-specific implementation) - void MakeClassSpecific( std::ostream&, const TString& ) const; - - // get help message text - void GetHelpMessage() const; - - - private: - - void checkGradients (); - - // the option handling methods - void DeclareOptions(); - void ProcessOptions(); +#ifdef DNNCPU +#include 
"TMVA/DNN/Architectures/Cpu.h" +#endif - // general helper functions - void Init(); +#ifdef DNNCUDA +#include "TMVA/DNN/Architectures/Cuda.h" +#endif +using namespace TMVA::DNN; - private: - TMVA::DNN::Net fNet; - std::vector<double> fWeights; +namespace TMVA { - TString fLayoutString; - std::vector<std::pair<int,TMVA::DNN::EnumFunction>> fLayout; - TString fErrorStrategy; - TString fTrainingStrategy; - TMVA::DNN::ModeErrorFunction fModeErrorFunction; - std::shared_ptr<TMVA::Monitoring> fMonitoring; - double fSumOfSigWeights_test; - double fSumOfBkgWeights_test; - bool fResume; - TString fWeightInitializationStrategyString; - TMVA::DNN::WeightInitializationStrategy fWeightInitializationStrategy; +class MethodDNN : public MethodBase +{ + using Architecture_t = TReference<Double_t>; + using Net_t = TNet<Architecture_t>; + using Matrix_t = typename Architecture_t::Matrix_t; - std::vector<std::shared_ptr<TMVA::DNN::Settings>> fSettings; +private: - TString fFileName; - double fScaleToNumEvents; + using LayoutVector_t = std::vector<std::pair<int, EActivationFunction>>; + using KeyValueVector_t = std::vector<std::map<TString, TString>>; - ClassDef(MethodDNN,0); // neural network + struct TTrainingSettings + { + size_t batchSize; + size_t testInterval; + size_t convergenceSteps; + ERegularization regularization; + Double_t learningRate; + Double_t momentum; + Double_t weightDecay; + std::vector<Double_t> dropoutProbabilities; + bool multithreading; }; -} // namespace TMVA - - -// make_unqiue is only available with C++14 -template <typename T, typename... Args> - std::unique_ptr<T> make_unique (Args&&... args) + // the option handling methods + void DeclareOptions(); + void ProcessOptions(); + + // general helper functions + void Init(); + + Net_t fNet; + EInitialization fWeightInitialization; + EOutputFunction fOutputFunction; + + TString fLayoutString; + TString fErrorStrategy; + TString fTrainingStrategyString; + TString fWeightInitializationString; + TString fArchitectureString; + LayoutVector_t fLayout; + std::vector<TTrainingSettings> fTrainingSettings; + bool fResume; + + KeyValueVector_t fSettings; + + ClassDef(MethodDNN,0); // neural network + + static inline void WriteMatrixXML(void *parent, const char *name, + const TMatrixT<Double_t> &X); + static inline void ReadMatrixXML(void *xml, const char *name, + TMatrixT<Double_t> &X); +protected: + + void MakeClassSpecific( std::ostream&, const TString& ) const; + void GetHelpMessage() const; + +public: + + // Standard Constructors + MethodDNN(const TString& jobName, + const TString& methodTitle, + DataSetInfo& theData, + const TString& theOption); + MethodDNN(DataSetInfo& theData, + const TString& theWeightFile); + virtual ~MethodDNN(); + + virtual Bool_t HasAnalysisType(Types::EAnalysisType type, + UInt_t numberClasses, + UInt_t numberTargets ); + LayoutVector_t ParseLayoutString(TString layerSpec); + KeyValueVector_t ParseKeyValueString(TString parseString, + TString blockDelim, + TString tokenDelim); + void Train(); + void TrainGpu(); + template <typename AFloat> + void TrainCpu(); + + virtual Double_t GetMvaValue( Double_t* err=0, Double_t* errUpper=0 ); + virtual const std::vector<Float_t>& GetRegressionValues(); + virtual const std::vector<Float_t>& GetMulticlassValues(); + + using MethodBase::ReadWeightsFromStream; + + // write weights to stream + void AddWeightsXMLTo ( void* parent ) const; + + // read weights from stream + void ReadWeightsFromStream( std::istream & i ); + void ReadWeightsFromXML ( void* wghtnode ); + + // ranking of 
input variables + const Ranking* CreateRanking(); + +}; + +inline void MethodDNN::WriteMatrixXML(void *parent, + const char *name, + const TMatrixT<Double_t> &X) { - return std::unique_ptr<T>(new T(std::forward<Args>(args)...)); + std::stringstream matrixStringStream(""); + matrixStringStream.precision( 16 ); + + for (size_t i = 0; i < (size_t) X.GetNrows(); i++) + { + for (size_t j = 0; j < (size_t) X.GetNcols(); j++) + { + matrixStringStream << std::scientific << X(i,j) << " "; + } + } + std::string s = matrixStringStream.str(); + void* matxml = gTools().xmlengine().NewChild(parent, 0, name); + gTools().xmlengine().NewAttr(matxml, 0, "rows", + gTools().StringFromInt((int)X.GetNrows())); + gTools().xmlengine().NewAttr(matxml, 0, "cols", + gTools().StringFromInt((int)X.GetNcols())); + gTools().xmlengine().AddRawLine (matxml, s.c_str()); } -// make_shared is only available with C++14 -template <typename T, typename... Args> - std::shared_ptr<T> make_shared (Args&&... args) +inline void MethodDNN::ReadMatrixXML(void *xml, + const char *name, + TMatrixT<Double_t> &X) { - return std::shared_ptr<T>(new T(std::forward<Args>(args)...)); -} + void *matrixXML = gTools().GetChild(xml, name); + size_t rows, cols; + gTools().ReadAttr(matrixXML, "rows", rows); + gTools().ReadAttr(matrixXML, "cols", cols); + const char * matrixString = gTools().xmlengine().GetNodeContent(matrixXML); + std::stringstream matrixStringStream(matrixString); + for (size_t i = 0; i < rows; i++) + { + for (size_t j = 0; j < cols; j++) + { + matrixStringStream >> X(i,j); + } + } +} +} // namespace TMVA #endif diff --git a/tmva/tmva/src/DNN/Architectures/Cpu.cxx b/tmva/tmva/src/DNN/Architectures/Cpu.cxx new file mode 100644 index 0000000000000000000000000000000000000000..42d947d009a64eafb2d6ea3671e4af447c0233a6 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cpu.cxx @@ -0,0 +1,32 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 20/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////// +// Explicit instantiation of the CPU architecture class. // +/////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cpu.h" + +#include "Cpu/ActivationFunctions.cxx" +#include "Cpu/Arithmetic.cxx" +#include "Cpu/Dropout.cxx" +#include "Cpu/Initialization.cxx" +#include "Cpu/LossFunctions.cxx" +#include "Cpu/OutputFunctions.cxx" +#include "Cpu/Propagation.cxx" +#include "Cpu/Regularization.cxx" + +namespace TMVA { +namespace DNN { +template class TCpu<Double_t>; +template class TCpu<Real_t>; +} // namespace TMVA +} // namespace DNN diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/ActivationFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/ActivationFunctions.cxx new file mode 100644 index 0000000000000000000000000000000000000000..72b65b49a384166d6bff794fd52acd0d68b398e7 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cpu/ActivationFunctions.cxx @@ -0,0 +1,149 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 19/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. 
* + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + + /////////////////////////////////////////////////////////////////// + // Implementation of the activation functions for multi-threaded // + // CPU architectures using tbb and BLAS. // + /////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cpu.h" +#include <math.h> + +namespace TMVA +{ +namespace DNN +{ + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::IdentityDerivative(TCpuMatrix<AFloat> & B, + const TCpuMatrix<AFloat> &/*A*/) +{ + auto f = [](AFloat) {return 1.0;}; + B.Map(f); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::Relu(TCpuMatrix<AFloat> & B) +{ + auto f = [](AFloat x) {return (x < 0.0) ? 0.0 : x;}; + B.Map(f); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::ReluDerivative(TCpuMatrix<AFloat> & B, + const TCpuMatrix<AFloat> &A) +{ + auto f = [](AFloat x) {return (x < 0.0) ? 0.0 : 1.0;}; + B.MapFrom(f, A); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::Sigmoid(TCpuMatrix<AFloat> & B) +{ + auto f = [](AFloat x) {return 1.0 / (1.0 + exp(-x));}; + B.Map(f); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::SigmoidDerivative(TCpuMatrix<AFloat> & B, + const TCpuMatrix<AFloat> &A) +{ + auto f = [](AFloat x) { + AFloat sig = 1.0 / (1.0 + exp(-x)); + return sig * (1.0 - sig); + }; + B.MapFrom(f, A); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::Tanh(TCpuMatrix<AFloat> & B) +{ + auto f = [](AFloat x) {return tanh(x);}; + B.Map(f); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::TanhDerivative(TCpuMatrix<AFloat> & B, + const TCpuMatrix<AFloat> &A) +{ + auto f = [](AFloat x) { + AFloat t = tanh(x); + return 1 - t * t; + }; + B.MapFrom(f, A); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::SymmetricRelu(TCpuMatrix<AFloat> & B) +{ + auto f = [](AFloat x) {return fabs(x);}; + B.Map(f); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::SymmetricReluDerivative(TCpuMatrix<AFloat> & B, + const TCpuMatrix<AFloat> &A) +{ + auto f = [](AFloat x) { + return (x < 0.0) ? 
-1.0 : 1.0; + }; + B.MapFrom(f, A); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::SoftSign(TCpuMatrix<AFloat> & B) +{ + auto f = [](AFloat x) {return x / (1 + fabs(x));}; + B.Map(f); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::SoftSignDerivative(TCpuMatrix<AFloat> & B, + const TCpuMatrix<AFloat> &A) +{ + auto f = [](AFloat x) { + x = 1.0 + fabs(x); + x = 1.0 / (x * x); + return x; + }; + B.MapFrom(f, A); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::Gauss(TCpuMatrix<AFloat> & B) +{ + auto f = [](AFloat x) {return exp(- x * x);}; + B.Map(f); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::GaussDerivative(TCpuMatrix<AFloat> & B, + const TCpuMatrix<AFloat> &A) +{ + auto f = [](AFloat x) {return - 2.0 * x * exp(- x * x);}; + B.MapFrom(f, A); +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/Arithmetic.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/Arithmetic.cxx new file mode 100644 index 0000000000000000000000000000000000000000..4ea33431879b91c153199ae6e84d088f08bf729d --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cpu/Arithmetic.cxx @@ -0,0 +1,142 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 20/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////////// +// Implementation of Helper arithmetic functions for the // +// multi-threaded CPU implementation of DNNs. 
// +//////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cpu.h" +#include "TMVA/DNN/Architectures/Cpu/Blas.h" +#include "tbb/tbb.h" + +namespace TMVA +{ +namespace DNN +{ + +//____________________________________________________________________________ +template<typename Real_t> +void TCpu<Real_t>::Multiply(TCpuMatrix<Real_t> &C, + const TCpuMatrix<Real_t> &A, + const TCpuMatrix<Real_t> &B) +{ + int m = (int) A.GetNrows(); + int k = (int) A.GetNcols(); + int n = (int) B.GetNcols(); + + char transa = 'N'; + char transb = 'N'; + + Real_t alpha = 1.0; + Real_t beta = 0.0; + + const Real_t * APointer = A.GetRawDataPointer(); + const Real_t * BPointer = B.GetRawDataPointer(); + Real_t * CPointer = C.GetRawDataPointer(); + + ::TMVA::DNN::Blas::Gemm(&transa, &transb, &m, &n, &k, &alpha, + APointer, &m, BPointer, &k, &beta, CPointer, &m); +} + +//____________________________________________________________________________ +template<typename Real_t> +void TCpu<Real_t>::TransposeMultiply(TCpuMatrix<Real_t> &C, + const TCpuMatrix<Real_t> &A, + const TCpuMatrix<Real_t> &B) +{ + int m = (int) A.GetNcols(); + int k = (int) A.GetNrows(); + int n = (int) B.GetNcols(); + + char transa = 'T'; + char transb = 'N'; + + Real_t alpha = 1.0; + Real_t beta = 0.0; + + const Real_t *APointer = A.GetRawDataPointer(); + const Real_t *BPointer = B.GetRawDataPointer(); + Real_t *CPointer = C.GetRawDataPointer(); + + ::TMVA::DNN::Blas::Gemm(&transa, &transb, &m, &n, &k, &alpha, + APointer, &k, BPointer, &k, &beta, CPointer, &m); +} + +//____________________________________________________________________________ +template<typename Real_t> +void TCpu<Real_t>::Hadamard(TCpuMatrix<Real_t> &B, + const TCpuMatrix<Real_t> &A) +{ + const Real_t __restrict__ *dataA = A.GetRawDataPointer(); + Real_t __restrict__ *dataB = B.GetRawDataPointer(); + + auto f = [&dataA, &dataB](const tbb::blocked_range<size_t> & range) + { + size_t rangeBegin = range.begin(); + size_t rangeEnd = range.end(); + + for (size_t i = rangeBegin; i != rangeEnd; ++i) { + dataB[i] *= dataA[i]; + } + }; + + tbb::blocked_range<size_t> range(0, A.GetNElements()); + parallel_for(range, f); +} + +//____________________________________________________________________________ +template<typename Real_t> +void TCpu<Real_t>::SumColumns(TCpuMatrix<Real_t> &B, + const TCpuMatrix<Real_t> &A) +{ + int m = (int) A.GetNrows(); + int n = (int) A.GetNcols(); + int inc = 1; + + Real_t alpha = 1.0; + Real_t beta = 0.0; + char trans = 'T'; + + const Real_t * APointer = A.GetRawDataPointer(); + Real_t * BPointer = B.GetRawDataPointer(); + + ::TMVA::DNN::Blas::Gemv(&trans, &m, &n, &alpha, APointer, &m, + TCpuMatrix<Real_t>::GetOnePointer(), &inc, + &beta, BPointer, &inc); +} + +//____________________________________________________________________________ +template<typename Real_t> +void TCpu<Real_t>::ScaleAdd(TCpuMatrix<Real_t> &B, + const TCpuMatrix<Real_t> &A, + Real_t alpha) +{ + int n = (int) (A.GetNcols() * A.GetNrows()); + int inc = 1; + + const Real_t *x = A.GetRawDataPointer(); + Real_t *y = B.GetRawDataPointer(); + + ::TMVA::DNN::Blas::Axpy(&n, &alpha, x, &inc, y, &inc); +} + +//____________________________________________________________________________ +template<typename Real_t> +void TCpu<Real_t>::Copy(TCpuMatrix<Real_t> &B, + const TCpuMatrix<Real_t> &A) +{ + auto f = [](Real_t x) {return x;}; + B.MapFrom(f, A); +} + +} // DNN +} // TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/CpuBuffer.cxx 
b/tmva/tmva/src/DNN/Architectures/Cpu/CpuBuffer.cxx new file mode 100644 index 0000000000000000000000000000000000000000..c28a738ef9a40f6879085ff689f0cccd4a614ecd --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cpu/CpuBuffer.cxx @@ -0,0 +1,255 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 12/08/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +///////////////////////////////////////////////////////////// +// CPU Buffer interface class for the generic data loader. // +///////////////////////////////////////////////////////////// + +#include <vector> +#include <memory> +#include "TMVA/DNN/DataLoader.h" +#include "TMVA/DNN/Architectures/Cpu.h" +#include "Rtypes.h" +#include <iostream> + +namespace TMVA +{ +namespace DNN +{ + +//______________________________________________________________________________ +template<typename AReal> +void TCpuBuffer<AReal>::TDestructor::operator()(AReal ** pointer) +{ + delete[] * pointer; + delete[] pointer; +} + +//______________________________________________________________________________ +template<typename AReal> +TCpuBuffer<AReal>::TCpuBuffer(size_t size) + : fSize(size), fOffset(0) +{ + AReal ** pointer = new AReal * [1]; + * pointer = new AReal[size]; + fBuffer = std::shared_ptr<AReal *>(pointer, fDestructor); +} + +//______________________________________________________________________________ +template<typename AReal> +TCpuBuffer<AReal> TCpuBuffer<AReal>::GetSubBuffer(size_t offset, size_t size) +{ + TCpuBuffer buffer = *this; + buffer.fOffset = offset; + buffer.fSize = size; + return buffer; +} + +//______________________________________________________________________________ +template<typename AReal> +void TCpuBuffer<AReal>::CopyFrom(TCpuBuffer & other) +{ + std::swap(*this->fBuffer, *other.fBuffer); +} + +//______________________________________________________________________________ +template<typename AReal> +void TCpuBuffer<AReal>::CopyTo(TCpuBuffer & other) +{ + std::swap(*this->fBuffer, *other.fBuffer); +} + +//______________________________________________________________________________ +template<> +void TDataLoader<MatrixInput_t, TCpu<Real_t>>::CopyInput( + TCpuBuffer<Real_t> & buffer, + IndexIterator_t sampleIterator, + size_t batchSize) +{ + const TMatrixT<Real_t> &inputMatrix = std::get<0>(fData); + size_t n = inputMatrix.GetNcols(); + + for (size_t i = 0; i < batchSize; i++) { + size_t sampleIndex = *sampleIterator; + for (size_t j = 0; j < n; j++) { + size_t bufferIndex = j * batchSize + i; + buffer[bufferIndex] = static_cast<Real_t>(inputMatrix(sampleIndex, j)); + } + sampleIterator++; + } +} + +//______________________________________________________________________________ +template<> +void TDataLoader<MatrixInput_t, TCpu<Real_t>>::CopyOutput( + TCpuBuffer<Real_t> & buffer, + IndexIterator_t sampleIterator, + size_t batchSize) +{ + const TMatrixT<Real_t> &outputMatrix = std::get<1>(fData); + size_t n = outputMatrix.GetNcols(); + + for (size_t i = 0; i < batchSize; i++) { + size_t sampleIndex = *sampleIterator; + for (size_t j = 0; j < n; j++) { + size_t bufferIndex = j * batchSize + i; + buffer[bufferIndex] = static_cast<Real_t>(outputMatrix(sampleIndex, j)); + } + sampleIterator++; + } +} + 
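+// Note: the CopyInput/CopyOutput specializations in this file fill the batch
+// buffer with bufferIndex = j * batchSize + i, i.e. column by column, so the
+// values of one input variable (or target) for all events of the batch are
+// contiguous in memory, matching the column-major layout used by the
+// BLAS-backed TCpuMatrix operations.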
+//______________________________________________________________________________ +template<> +void TDataLoader<MatrixInput_t, TCpu<Double_t>>::CopyInput( + TCpuBuffer<Double_t> & buffer, + IndexIterator_t sampleIterator, + size_t batchSize) +{ + const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData); + size_t n = inputMatrix.GetNcols(); + + for (size_t i = 0; i < batchSize; i++) { + size_t sampleIndex = *sampleIterator; + for (size_t j = 0; j < n; j++) { + size_t bufferIndex = j * batchSize + i; + buffer[bufferIndex] = inputMatrix(sampleIndex, j); + } + sampleIterator++; + } +} + +//______________________________________________________________________________ +template<> +void TDataLoader<MatrixInput_t, TCpu<Double_t>>::CopyOutput( + TCpuBuffer<Double_t> & buffer, + IndexIterator_t sampleIterator, + size_t batchSize) +{ + const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData); + size_t n = outputMatrix.GetNcols(); + + for (size_t i = 0; i < batchSize; i++) { + size_t sampleIndex = *sampleIterator; + for (size_t j = 0; j < n; j++) { + size_t bufferIndex = j * batchSize + i; + buffer[bufferIndex] = outputMatrix(sampleIndex, j); + } + sampleIterator++; + } +} + +//______________________________________________________________________________ +template<> +void TDataLoader<TMVAInput_t, TCpu<Double_t>>::CopyInput( + TCpuBuffer<Double_t> & buffer, + IndexIterator_t sampleIterator, + size_t batchSize) +{ + Event * event = fData.front(); + size_t n = event->GetNVariables(); + + // Copy input variables. + + for (size_t i = 0; i < batchSize; i++) { + size_t sampleIndex = * sampleIterator++; + event = fData[sampleIndex]; + for (size_t j = 0; j < n; j++) { + size_t bufferIndex = j * batchSize + i; + buffer[bufferIndex] = event->GetValue(j); + } + } +} + +//______________________________________________________________________________ +template<> +void TDataLoader<TMVAInput_t, TCpu<Double_t>>::CopyOutput( + TCpuBuffer<Double_t> & buffer, + IndexIterator_t sampleIterator, + size_t batchSize) +{ + Event * event = fData.front(); + size_t n = (event->GetNTargets() == 0) ? 1 : event->GetNTargets(); + + // Copy target(s). + + for (size_t i = 0; i < batchSize; i++) { + size_t sampleIndex = * sampleIterator++; + event = fData[sampleIndex]; + for (size_t j = 0; j < n; j++) { + // Copy output matrices. + size_t bufferIndex = j * batchSize + i; + if (event->GetNTargets() == 0) { + buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0; + } else { + buffer[bufferIndex] = event->GetTarget(j); + } + } + } +} + +//______________________________________________________________________________ +template<> +void TDataLoader<TMVAInput_t, TCpu<Real_t>>::CopyInput( + TCpuBuffer<Real_t> & buffer, + IndexIterator_t sampleIterator, + size_t batchSize) +{ + Event * event = fData.front(); + size_t n = event->GetNVariables(); + + // Copy input variables. + + for (size_t i = 0; i < batchSize; i++) { + size_t sampleIndex = * sampleIterator++; + event = fData[sampleIndex]; + for (size_t j = 0; j < n; j++) { + size_t bufferIndex = j * batchSize + i; + buffer[bufferIndex] = static_cast<Real_t>(event->GetValue(j)); + } + } +} + +//______________________________________________________________________________ +template<> +void TDataLoader<TMVAInput_t, TCpu<Real_t>>::CopyOutput( + TCpuBuffer<Real_t> & buffer, + IndexIterator_t sampleIterator, + size_t batchSize) +{ + Event * event = fData.front(); + size_t n = (event->GetNTargets() == 0) ? 1 : event->GetNTargets(); + + // Copy target(s). 
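+  // For events without regression targets a single output column is filled
+  // with 1.0 for events of class 0 and 0.0 otherwise.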
+ + for (size_t i = 0; i < batchSize; i++) { + size_t sampleIndex = * sampleIterator++; + event = fData[sampleIndex]; + for (size_t j = 0; j < n; j++) { + // Copy output matrices. + size_t bufferIndex = j * batchSize + i; + if (event->GetNTargets() == 0) { + buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0; + } else { + buffer[bufferIndex] = static_cast<Real_t>(event->GetTarget(j)); + } + } + } +} + +// Explicit instantiations. +template class TCpuBuffer<Double_t>; +template class TCpuBuffer<Real_t>; + +} // namespace DNN +} // namespace TMVA + + diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/CpuMatrix.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/CpuMatrix.cxx new file mode 100644 index 0000000000000000000000000000000000000000..f21d6832031b3901a6f0f43e98d6274f3f5c6b2a --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cpu/CpuMatrix.cxx @@ -0,0 +1,87 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 19/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +///////////////////////////////////////////// +// Implementation of the TCpuMatrix class. // +///////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cpu/CpuMatrix.h" + +namespace TMVA { +namespace DNN { + +template<typename AReal> +std::vector<AReal> TCpuMatrix<AReal>::fOnes{}; + +//____________________________________________________________________________ +template<typename AReal> +TCpuMatrix<AReal>::TCpuMatrix(size_t nRows, size_t nCols) + : fBuffer(nRows * nCols), fNCols(nCols), fNRows(nRows) +{ + Initialize(); +} + +//____________________________________________________________________________ +template<typename AReal> +TCpuMatrix<AReal>::TCpuMatrix(const TMatrixT<Double_t> & B) + : fBuffer(B.GetNoElements()), fNCols(B.GetNcols()), fNRows(B.GetNrows()) +{ + Initialize(); + for (size_t j = 0; j < fNCols; j++) { + for (size_t i = 0; i < fNRows; i++) { + (*this)(i,j) = B(i,j); + } + } +} + +//____________________________________________________________________________ +template<typename AReal> +TCpuMatrix<AReal>::TCpuMatrix(const TCpuBuffer<AReal> & buffer, + size_t m, + size_t n) + : fBuffer(buffer), fNCols(n), fNRows(m) +{ + Initialize(); +} + +//____________________________________________________________________________ +template<typename AReal> +TCpuMatrix<AReal>::operator TMatrixT<Double_t>() const +{ + TMatrixT<AReal> B(fNRows, fNCols); + + for (size_t j = 0; j < fNCols; j++) { + for (size_t i = 0; i < fNRows; i++) { + B(i,j) = (*this)(i, j); + } + } + return B; +} + + +//____________________________________________________________________________ +template<typename AReal> +void TCpuMatrix<AReal>::Initialize() +{ + if (fNRows > fOnes.size()) { + fOnes.reserve(fNRows); + for (size_t i = fOnes.size(); i < fNRows; i++) { + fOnes.push_back(1.0); + } + } +} + +// Explicit instantiations. 
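+// Only the floating point types used by the CPU backend (Real_t and Double_t)
+// are instantiated; their member definitions stay in this translation unit
+// instead of the header.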
+template class TCpuMatrix<Real_t>; +template class TCpuMatrix<Double_t>; + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/DataLoader.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/DataLoader.cxx new file mode 100644 index 0000000000000000000000000000000000000000..448000c546d764867b74c51c37bc007e5e215f26 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cpu/DataLoader.cxx @@ -0,0 +1,209 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 21/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////// +// Implementation for the DataLoader for the the multi-threaded // +// CPU implementation of DNNs. // +////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cpu/DataLoader.h" +#include "TMVA/Event.h" +#include <iostream> + +namespace TMVA +{ +namespace DNN +{ + +// TCpuBatchIterator +//______________________________________________________________________________ +template<typename Data_t, typename Real_t> +TCpuBatchIterator<Data_t, Real_t>::TCpuBatchIterator( + TCpuDataLoader<Data_t, Real_t> & dataLoader, + size_t batchIndex) + : fDataLoader(dataLoader), fBatchIndex(batchIndex) +{ + // Nothing to do here. +} + +//______________________________________________________________________________ +template<typename Data_t, typename Real_t> +TCpuBatch<Real_t> TCpuBatchIterator<Data_t, Real_t>::operator*() +{ + return fDataLoader.GetBatch(fBatchIndex); +} + +//______________________________________________________________________________ +template<typename Data_t, typename Real_t> +TCpuBatchIterator<Data_t, Real_t> & TCpuBatchIterator<Data_t, Real_t>::operator++() +{ + fBatchIndex++; + return *this; +} + +//______________________________________________________________________________ +template<typename Data_t, typename Real_t> +bool TCpuBatchIterator<Data_t, Real_t>::operator!=(const TCpuBatchIterator & other) +{ + return fBatchIndex != other.GetBatchIndex(); +} + +//______________________________________________________________________________ +template<typename Data_t, typename Real_t> +bool TCpuBatchIterator<Data_t, Real_t>::operator==(const TCpuBatchIterator & other) +{ + return fBatchIndex == other.GetBatchIndex(); +} + +// TCpuDataLoader +//______________________________________________________________________________ +template<typename Data_t, typename Real_t> +TCpuDataLoader<Data_t, Real_t>::TCpuDataLoader(const Data_t &input, + size_t nsamples, + size_t batchSize, + size_t ninputFeatures, + size_t noutputFeatures, + size_t bufferSize) + : fInput(input), fNSamples(nsamples), fBatchSize(batchSize), + fBufferSize(bufferSize), fNInputFeatures(ninputFeatures), + fNOutputFeatures(noutputFeatures), fNBatches(nsamples / batchSize), + fInputMatrices(), fOutputMatrices(), fSampleIndices() +{ + fInputMatrices.reserve(fBufferSize); + fOutputMatrices.reserve(fBufferSize); + for (size_t i = 0; i < fBufferSize; i++) { + fInputMatrices.emplace_back(fBatchSize, fNInputFeatures); + fOutputMatrices.emplace_back(fBatchSize, fNOutputFeatures); + } + + fSampleIndices.reserve(fNBatches); + for (size_t i = 0; i < fNSamples; i++) { + 
fSampleIndices.emplace_back(i); + } +} + +//______________________________________________________________________________ +template<typename Data_t, typename Real_t> +inline void TCpuDataLoader<Data_t, Real_t>::CopyData(size_t batchIndex) +{ + auto copy = [this](const tbb::blocked_range<size_t> & range) + { + size_t rangeBegin = range.begin(); + size_t rangeEnd = range.end(); + size_t sampleIndex = rangeBegin * this->fBatchSize; + + for (size_t batchIndex = rangeBegin; batchIndex != rangeEnd; ++batchIndex) { + CopyBatch(this->fInputMatrices[batchIndex % this->fBufferSize], + this->fOutputMatrices[batchIndex % this->fBufferSize], + this->fInput, + this->fSampleIndices.begin() + sampleIndex, + this->fSampleIndices.begin() + sampleIndex + this->fBatchSize); + sampleIndex += this->fBatchSize; + } + }; + + size_t end = std::min(batchIndex + fBufferSize, fNBatches); + size_t start = batchIndex; + tbb::blocked_range<size_t> range(start, end); + parallel_for(range, copy); +} + +//______________________________________________________________________________ +template<typename Data_t, typename Real_t> +TCpuBatch<Real_t> TCpuDataLoader<Data_t, Real_t>::GetBatch(size_t batchIndex) +{ + size_t fBufferIndex = batchIndex % fBufferSize; + if (fBufferIndex == 0) { + CopyData(batchIndex); + } + return TCpuBatch<Real_t>(fInputMatrices[fBufferIndex], + fOutputMatrices[fBufferIndex]); +} + +//______________________________________________________________________________ +template<typename Data_t, typename Real_t> +auto TCpuDataLoader<Data_t, Real_t>::begin() + -> BatchIterator_t +{ + random_shuffle(fSampleIndices.begin(), fSampleIndices.end()); + return BatchIterator_t(*this, 0); +} + +//______________________________________________________________________________ +template<typename Data_t, typename Real_t> +auto TCpuDataLoader<Data_t, Real_t>::end() + -> BatchIterator_t +{ + return BatchIterator_t(*this, fNBatches); +} + +//______________________________________________________________________________ +template <> +void TCpuDataLoader<MatrixInput_t, Double_t>::CopyBatch( + Matrix_t &inputMatrix, + Matrix_t &outputMatrix, + const MatrixInput_t &input, + IndexIterator_t indexBegin, + IndexIterator_t indexEnd) +{ + auto &in = std::get<0>(input); + auto &out = std::get<1>(input); + + size_t batchIndex = 0; + for (IndexIterator_t i = indexBegin; i != indexEnd; i++) { + size_t index = *i; + for (size_t j = 0; j < (size_t) in.GetNcols(); j++) { + inputMatrix(batchIndex, j) = in(index, j); + } + for (size_t j = 0; j < (size_t) out.GetNcols(); j++) { + outputMatrix(batchIndex, j) = out(index, j); + } + batchIndex++; + } +} + +//______________________________________________________________________________ +template <> +void TCpuDataLoader<TMVAInput_t, Double_t>::CopyBatch( + Matrix_t &inputMatrix, + Matrix_t &outputMatrix, + const TMVAInput_t &input, + IndexIterator_t indexBegin, + IndexIterator_t indexEnd) +{ + size_t batchIndex = 0; + for (IndexIterator_t i = indexBegin; i != indexEnd; i++) { + size_t index = *i; + Event *event = input.at(index); + for (size_t j = 0; j < event->GetNVariables(); j++) { + inputMatrix(batchIndex, j) = event->GetValue(j); + } + if (event->GetNTargets() > 0) { + for (size_t j = 0; j < event->GetNTargets(); j++) { + outputMatrix(batchIndex, j) = event->GetTarget(j); + } + } else { + outputMatrix(batchIndex, 0) = (event->GetClass() == 0) ? 1.0 : 0.0; + batchIndex++; + } + } +} + +// Explicit instantiation. 
+//______________________________________________________________________________ +template class TCpuDataLoader<MatrixInput_t, Double_t>; +template class TCpuDataLoader<TMVAInput_t, Double_t>; +template class TCpuBatchIterator<MatrixInput_t, Double_t>; +template class TCpuBatchIterator<TMVAInput_t, Double_t>; +template class TCpuBatch<Double_t>; + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/Dropout.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/Dropout.cxx new file mode 100644 index 0000000000000000000000000000000000000000..ff81d034d24575b9f7b95f8ad662283e2473ee50 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cpu/Dropout.cxx @@ -0,0 +1,47 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 21/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +#include "TMVA/DNN/Architectures/Cpu.h" +#include "TRandom.h" + +///////////////////////////////////////////////////////////////////// +// Implementation of Dropout for multi-threaded CPU architectures. // +///////////////////////////////////////////////////////////////////// + +namespace TMVA { +namespace DNN { + +//____________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::Dropout(TCpuMatrix<AFloat> &A, + AFloat dropoutProbability) +{ + AFloat __restrict__ *data = A.GetRawDataPointer(); + + auto fRange = [&data, dropoutProbability](const tbb::blocked_range<size_t> & range) + { + size_t rangeBegin = range.begin(); + size_t rangeEnd = range.end(); + + TRandom rand(time(nullptr) + rangeBegin); + + for (size_t i = rangeBegin; i != rangeEnd; ++i) { + AFloat r = rand.Uniform(); + data[i] = (r > dropoutProbability) ? 0.0 : data[i] / dropoutProbability; + } + }; + + tbb::blocked_range<size_t> range(0, A.GetNElements()); + parallel_for(range, fRange); +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/Initialization.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/Initialization.cxx new file mode 100644 index 0000000000000000000000000000000000000000..4875d4580f944751b29b456e16b3b944dbd1822c --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cpu/Initialization.cxx @@ -0,0 +1,98 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 21/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + + ////////////////////////////////////////////////////////////// + // Implementation of the DNN initialization methods for the // + // multi-threaded CPU backend. 
// + ////////////////////////////////////////////////////////////// + +#include "TRandom.h" +#include "TMVA/DNN/Architectures/Cpu.h" + +namespace TMVA +{ +namespace DNN +{ + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::InitializeGauss(TCpuMatrix<AFloat> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + TRandom rand(time(nullptr)); + + AFloat sigma = sqrt(2.0 / ((AFloat) n)); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + A(i,j) = rand.Gaus(0.0, sigma); + } + } +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::InitializeUniform(TCpuMatrix<AFloat> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + TRandom rand(time(nullptr)); + + AFloat range = sqrt(2.0 / ((AFloat) n)); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + A(i,j) = rand.Uniform(-range, range); + } + } +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::InitializeIdentity(TCpuMatrix<AFloat> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n ; j++) { + A(i,j) = 0.0; + } + + if (i < n) { + A(i,i) = 1.0; + } + } +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::InitializeZero(TCpuMatrix<AFloat> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n ; j++) { + A(i,j) = 0.0; + } + } +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/LossFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/LossFunctions.cxx new file mode 100644 index 0000000000000000000000000000000000000000..bad4155c9cf4dcffd5d89978ba8c6446e152971e --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cpu/LossFunctions.cxx @@ -0,0 +1,147 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 20/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + + ///////////////////////////////////////////////////////////////////// + // Implementation of the loss functions for the multi-threaded CPU // + // implementation using tbb and BLAS. 
// + ///////////////////////////////////////////////////////////////////// + +#include "tbb/tbb.h" +#include "TMVA/DNN/Architectures/Reference.h" + +namespace TMVA +{ +namespace DNN +{ + +//______________________________________________________________________________ +template<typename AFloat> +AFloat TCpu<AFloat>::MeanSquaredError(const TCpuMatrix<AFloat> &Y, + const TCpuMatrix<AFloat> &output) +{ + const AFloat __restrict__ *dataY = Y.GetRawDataPointer(); + const AFloat __restrict__ *dataOutput = output.GetRawDataPointer(); + + auto f = [&dataY, &dataOutput](const tbb::blocked_range<size_t> & range, + AFloat partialSum) + { + size_t rangeBegin = range.begin(); + size_t rangeEnd = range.end(); + + AFloat sum = partialSum; + for (size_t i = rangeBegin; i != rangeEnd; ++i) { + AFloat error = dataY[i] - dataOutput[i]; + sum += error * error; + } + + return sum; + }; + + auto reduction = [](AFloat sum1, AFloat sum2) + { + return sum1 + sum2; + }; + + AFloat norm = 1.0 / ((AFloat) Y.GetNcols() * Y.GetNrows()); + tbb::blocked_range<size_t> range(0, Y.GetNElements()); + return norm * parallel_reduce(range, 0.0, f, reduction); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::MeanSquaredErrorGradients( + TCpuMatrix<AFloat> & dY, + const TCpuMatrix<AFloat> & Y, + const TCpuMatrix<AFloat> & output) +{ + + AFloat __restrict__ *dataDY = dY.GetRawDataPointer(); + const AFloat __restrict__ *dataY = Y.GetRawDataPointer(); + const AFloat __restrict__ *dataOutput = output.GetRawDataPointer(); + AFloat norm = 1.0 / ((AFloat) Y.GetNrows() * Y.GetNcols()); + + auto f = [&dataDY, &dataY, &dataOutput, norm](const tbb::blocked_range<size_t> & range) + { + size_t rangeBegin = range.begin(); + size_t rangeEnd = range.end(); + + for (size_t i = rangeBegin; i != rangeEnd; ++i) { + dataDY[i] = - 2.0 * norm * (dataY[i] - dataOutput[i]); + } + }; + + tbb::blocked_range<size_t> range(0, Y.GetNElements()); + parallel_for(range, f); +} + +//______________________________________________________________________________ +template<typename AFloat> +AFloat TCpu<AFloat>::CrossEntropy(const TCpuMatrix<AFloat> &Y, + const TCpuMatrix<AFloat> &output) +{ + const AFloat __restrict__ *dataY = Y.GetRawDataPointer(); + const AFloat __restrict__ *dataOutput = output.GetRawDataPointer(); + + auto f = [&dataY, &dataOutput](const tbb::blocked_range<size_t> & range, + AFloat partialSum) + { + size_t rangeBegin = range.begin(); + size_t rangeEnd = range.end(); + + AFloat sum = partialSum; + for (size_t i = rangeBegin; i != rangeEnd; ++i) { + AFloat y = dataY[i]; + AFloat sig = 1.0 / (1.0 + exp(- dataOutput[i])); + sum += y * log(sig) + (1.0 - y) * log(1.0 - sig); + } + return sum; + }; + + auto reduction = [](AFloat sum1, AFloat sum2) + { + return sum1 + sum2; + }; + + tbb::blocked_range<size_t> range(0, Y.GetNElements()); + AFloat norm = 1.0 / ((AFloat) Y.GetNcols() * Y.GetNrows()); + return - norm * parallel_reduce(range, 0.0, f, reduction); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::CrossEntropyGradients( + TCpuMatrix<AFloat> & dY, + const TCpuMatrix<AFloat> & Y, + const TCpuMatrix<AFloat> & output) +{ + AFloat __restrict__ *dataDY = dY.GetRawDataPointer(); + const AFloat __restrict__ *dataY = Y.GetRawDataPointer(); + const AFloat __restrict__ *dataOutput = output.GetRawDataPointer(); + AFloat norm = 1.0 / ((AFloat) Y.GetNrows() * Y.GetNcols()); + + auto f = 
[&dataDY, &dataY, &dataOutput, norm](const tbb::blocked_range<size_t> & range) + { + size_t rangeBegin = range.begin(); + size_t rangeEnd = range.end(); + + for (size_t i = rangeBegin; i != rangeEnd; ++i) { + AFloat y = dataY[i]; + AFloat sig = 1.0 / (1.0 + exp(- dataOutput[i])); + dataDY[i] = norm * (sig - y); + } + }; + + tbb::blocked_range<size_t> range(0, Y.GetNElements()); + parallel_for(range, f); +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/OutputFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/OutputFunctions.cxx new file mode 100644 index 0000000000000000000000000000000000000000..4c3fd3994f7b9bf847e97275ff00c160b969d343 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cpu/OutputFunctions.cxx @@ -0,0 +1,33 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 21/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////// +// Implementation of output functions for multi-threaded CPU // +// architectures. // +/////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cpu.h" + +namespace TMVA +{ +namespace DNN +{ + +template<typename AFloat> +void TCpu<AFloat>::Sigmoid(TCpuMatrix<AFloat> & B, + const TCpuMatrix<AFloat> & A) +{ + auto f = [](AFloat x) {return 1.0 / (1.0 + exp(-x));}; + B.MapFrom(f, A); +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/Propagation.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/Propagation.cxx new file mode 100644 index 0000000000000000000000000000000000000000..ef6e7a4168418d3135830666c0370d754ccf4135 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cpu/Propagation.cxx @@ -0,0 +1,94 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 10/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////////// +// Implementation of the functions required for the forward and // +// backward propagation of activations through a neural network for // +// the reference implementation. 
// +////////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cpu.h" +#include "TMVA/DNN/Architectures/Cpu/Blas.h" + +namespace TMVA +{ +namespace DNN +{ + +template<typename AFloat> +void TCpu<AFloat>::MultiplyTranspose(TCpuMatrix<AFloat> &output, + const TCpuMatrix<AFloat> &input, + const TCpuMatrix<AFloat> &Weights) +{ + int m = (int) input.GetNrows(); + int k = (int) input.GetNcols(); + int n = (int) Weights.GetNrows(); + + char transa = 'N'; + char transb = 'T'; + + AFloat alpha = 1.0; + AFloat beta = 0.0; + + const AFloat *A = input.GetRawDataPointer(); + const AFloat *B = Weights.GetRawDataPointer(); + AFloat *C = output.GetRawDataPointer(); + + ::TMVA::DNN::Blas::Gemm(&transa, &transb, &m, &n, &k, &alpha, + A, &m, B, &n, &beta, C, &m); +} + +template<typename AFloat> +void TCpu<AFloat>::AddRowWise( + TCpuMatrix<AFloat> &output, + const TCpuMatrix<AFloat> &biases) +{ + int m = (int) output.GetNrows(); + int n = (int) output.GetNcols(); + + int inc = 1.0; + AFloat alpha = 1.0; + + AFloat * A = output.GetRawDataPointer(); + const AFloat * x = TCpuMatrix<AFloat>::GetOnePointer(); + const AFloat * y = biases.GetRawDataPointer(); + + ::TMVA::DNN::Blas::Ger(&m, &n, &alpha, x, &inc, y, &inc, A, &m); +} + +template<typename AFloat> +void TCpu<AFloat>::Backward( + TCpuMatrix<AFloat> & activationGradientsBackward, + TCpuMatrix<AFloat> & weightGradients, + TCpuMatrix<AFloat> & biasGradients, + TCpuMatrix<AFloat> & df, + const TCpuMatrix<AFloat> & activationGradients, + const TCpuMatrix<AFloat> & weights, + const TCpuMatrix<AFloat> & activationsBackward) +{ + // Compute element-wise product. + Hadamard(df, activationGradients); + + // Activation gradients. + if (activationGradientsBackward.GetNElements() > 0) + Multiply(activationGradientsBackward, df, weights); + + // Weight gradients. + if (weightGradients.GetNElements() > 0) + TransposeMultiply(weightGradients, df, activationsBackward); + + // Bias gradients. + if (biasGradients.GetNElements() > 0) + SumColumns(biasGradients, df); +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cpu/Regularization.cxx b/tmva/tmva/src/DNN/Architectures/Cpu/Regularization.cxx new file mode 100644 index 0000000000000000000000000000000000000000..404464d10de4085a296263ac242e12597352fdc8 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cpu/Regularization.cxx @@ -0,0 +1,133 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 21/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////////////////// +// Implementation of the regularization functionals and gradients // +// for the multi-threaded CPU implementation using tbb. 
// +//////////////////////////////////////////////////////////////////// + +#include "tbb/tbb.h" +#include "TMVA/DNN/Architectures/Reference.h" + +namespace TMVA +{ +namespace DNN +{ + +//______________________________________________________________________________ +template<typename AFloat> +AFloat TCpu<AFloat>::L1Regularization(const TCpuMatrix<AFloat> &Weights) +{ + const AFloat __restrict__ *data = Weights.GetRawDataPointer(); + + auto f = [&data](const tbb::blocked_range<size_t> & range, + AFloat partialSum) + { + size_t rangeBegin = range.begin(); + size_t rangeEnd = range.end(); + + AFloat sum = partialSum; + for (size_t i = rangeBegin; i != rangeEnd; ++i) { + sum += fabs(data[i]); + } + return sum; + }; + + auto reduction = [](AFloat sum1, AFloat sum2) + { + return sum1 + sum2; + }; + + tbb::blocked_range<size_t> range(0, Weights.GetNElements()); + return parallel_reduce(range, 0.0, f, reduction); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::AddL1RegularizationGradients( + TCpuMatrix<AFloat> & B, + const TCpuMatrix<AFloat> & A, + AFloat weightDecay) +{ + + AFloat __restrict__ *dataB = B.GetRawDataPointer(); + const AFloat __restrict__ *dataA = A.GetRawDataPointer(); + + auto f = [&dataA, &dataB, weightDecay](const tbb::blocked_range<size_t> & range) + { + size_t rangeBegin = range.begin(); + size_t rangeEnd = range.end(); + + for (size_t i = rangeBegin; i != rangeEnd; ++i) { + AFloat sign = (dataA[i] < 0.0) ? -1.0 : 1.0; + dataB[i] += weightDecay * sign; + } + }; + + tbb::blocked_range<size_t> range(0, A.GetNElements()); + parallel_for(range, f); +} + +//______________________________________________________________________________ +template<typename AFloat> +AFloat TCpu<AFloat>::L2Regularization(const TCpuMatrix<AFloat> &Weights) +{ + const AFloat __restrict__ *data = Weights.GetRawDataPointer(); + + auto f = [&data](const tbb::blocked_range<size_t> & range, + AFloat partialSum) + { + size_t rangeBegin = range.begin(); + size_t rangeEnd = range.end(); + + AFloat sum = partialSum; + for (size_t i = rangeBegin; i != rangeEnd; ++i) { + sum += data[i] * data[i]; + } + return sum; + }; + + auto reduction = [](AFloat sum1, AFloat sum2) + { + return sum1 + sum2; + }; + + tbb::blocked_range<size_t> range(0, Weights.GetNElements()); + return parallel_reduce(range, 0.0, f, reduction); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCpu<AFloat>::AddL2RegularizationGradients( + TCpuMatrix<AFloat> & B, + const TCpuMatrix<AFloat> & A, + AFloat weightDecay) +{ + + AFloat __restrict__ *dataB = B.GetRawDataPointer(); + const AFloat __restrict__ *dataA = A.GetRawDataPointer(); + + auto f = [&dataA, &dataB, weightDecay](const tbb::blocked_range<size_t> & range) + { + size_t rangeBegin = range.begin(); + size_t rangeEnd = range.end(); + + for (size_t i = rangeBegin; i != rangeEnd; ++i) { + dataB[i] += 2.0 * weightDecay * dataA[i]; + } + }; + + tbb::blocked_range<size_t> range(0, A.GetNElements()); + parallel_for(range, f); +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cuda.cu b/tmva/tmva/src/DNN/Architectures/Cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..7f791c613b487e279a18672c0ba9095bf4f9e291 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cuda.cu @@ -0,0 +1,34 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 10/07/16 + 
+/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +///////////////////////////////////////////////////////////////// +// Explicit instantiation of the TCuda architecture class with // +// for Double_t and Real_t floating point types. // +///////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cuda.h" +#include "Cuda/Propagation.cu" +#include "Cuda/Arithmetic.cu" +#include "Cuda/ActivationFunctions.cu" +#include "Cuda/OutputFunctions.cu" +#include "Cuda/LossFunctions.cu" +#include "Cuda/Regularization.cu" +#include "Cuda/Initialization.cu" +#include "Cuda/Dropout.cu" + +namespace TMVA { +namespace DNN { + +template class TCuda<Real_t>; +template class TCuda<Double_t>; + +} // namespace tmva +} // namespace dnn diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu new file mode 100644 index 0000000000000000000000000000000000000000..5654680158b38968fe624c4a969ba106c1294e5b --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cuda/ActivationFunctions.cu @@ -0,0 +1,216 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + + ////////////////////////////////////////////////////////////////// + // Implementation of the activation functions for the TCuda // + // implementation of the low-level interface. 
// + ////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMVA/DNN/Architectures/Cuda/Device.h" +#include "Kernels.cuh" + +namespace TMVA +{ +namespace DNN +{ + +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::IdentityDerivative(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::IdentityDerivative<<<gridDims, blockDims, 0, s>>>( + B.GetDataPointer(), + (int) B.GetNrows(), + (int) B.GetNcols()); + B.SetComputeStream(s); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::Relu(TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(A); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::Relu<<<gridDims, blockDims, 0, s>>>( + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::ReluDerivative(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::ReluDerivative<<<gridDims, blockDims, 0, s>>>( + B.GetDataPointer(), + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); + B.SetComputeStream(s); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::Sigmoid(TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(A); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::Sigmoid<<<gridDims, blockDims, 0, s>>>( + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::SigmoidDerivative(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::SigmoidDerivative<<<gridDims, blockDims, 0, s>>>( + B.GetDataPointer(), + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); + B.SetComputeStream(s); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::Tanh(TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(A); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::Tanh<<<gridDims, blockDims, 0, s>>>( + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::TanhDerivative(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::TanhDerivative<<<gridDims, blockDims, 0, s>>>( + B.GetDataPointer(), + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); + B.SetComputeStream(s); +} + 
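Every activation above follows the same recipe: derive the launch dimensions from the matrix shape, reuse the matrix's compute stream, and dispatch an element-wise kernel over the column-major data. Below is a hedged sketch of that recipe for a hypothetical extra element-wise op; the names SquareKernel/Square are illustrative and not part of the interface.

// Sketch only: a made-up element-wise op in the style of the launchers above.
#include "TMVA/DNN/Architectures/Cuda.h"
#include "TMVA/DNN/Architectures/Cuda/Device.h"

namespace TMVA {
namespace DNN {

template<typename AFloat>
__global__ void SquareKernel(AFloat *A, int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;   // row index
   int j = blockDim.x * blockIdx.x + threadIdx.x;   // column index
   if ((i < m) && (j < n)) {
      int index = j * m + i;                        // column-major layout
      A[index] = A[index] * A[index];
   }
}

template<typename AFloat>
void Square(TCudaMatrix<AFloat> &A)
{
   dim3 blockDims = TDevice::BlockDims();
   dim3 gridDims  = TDevice::GridDims(A);
   cudaStream_t s = A.GetComputeStream();           // reuse A's compute stream
   SquareKernel<<<gridDims, blockDims, 0, s>>>(A.GetDataPointer(),
                                               (int) A.GetNrows(),
                                               (int) A.GetNcols());
}

} // namespace DNN
} // namespace TMVA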
+//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::SymmetricRelu(TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(A); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::SymmetricRelu<<<gridDims, blockDims, 0, s>>>( + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::SymmetricReluDerivative(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::SymmetricReluDerivative<<<gridDims, blockDims, 0, s>>>( + B.GetDataPointer(), + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); + B.SetComputeStream(s); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::SoftSign(TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(A); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::SoftSign<<<gridDims, blockDims, 0, s>>>( + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::SoftSignDerivative(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::SoftSignDerivative<<<gridDims, blockDims, 0, s>>>( + B.GetDataPointer(), + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); + B.SetComputeStream(s); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::Gauss(TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(A); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::Gauss<<<gridDims, blockDims, 0, s>>>( + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::GaussDerivative(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::GaussDerivative<<<gridDims, blockDims, 0, s>>>( + B.GetDataPointer(), + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); + B.SetComputeStream(s); +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu new file mode 100644 index 0000000000000000000000000000000000000000..f61391687f5ed7bd1e0c6eaa741d416b7e6c7ddc --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu @@ -0,0 +1,238 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. 
* + *************************************************************************/ + +/////////////////////////////////////////////////////////////////// +// Contains additional arithmetic functions required by the CUDA // +// neural network implementation. // +/////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMVA/DNN/Architectures/Cuda/Device.h" +#include "Kernels.cuh" + +namespace TMVA +{ +namespace DNN +{ + +//____________________________________________________________________________ +template<> +void TCuda<float>::Multiply(TCudaMatrix<float> &C, + const TCudaMatrix<float> &A, + const TCudaMatrix<float> &B) +{ + int m, n, k; + m = A.GetNrows(); + k = A.GetNcols(); + n = B.GetNcols(); + float alpha = 1.0, beta = 0.0; + + cudaStream_t s = A.GetComputeStream(); + cublasSetStream(A.GetCublasHandle(), s); + + // Compute C = beta * C + alpha * (A * B) + cublasSgemm(A.GetCublasHandle(), + CUBLAS_OP_N, CUBLAS_OP_N, + m, n, k, & alpha, + A.GetDataPointer(), m, // *A, lda + B.GetDataPointer(), k, // *B, ldb + & beta, // beta + C.GetDataPointer(), m); // *C, ldc + + C.SetComputeStream(s); +} + +//____________________________________________________________________________ +template<> +void TCuda<double>::Multiply(TCudaMatrix<double> &C, + const TCudaMatrix<double> &A, + const TCudaMatrix<double> &B) +{ + int m, n, k; + m = A.GetNrows(); + k = A.GetNcols(); + n = B.GetNcols(); + double alpha = 1.0, beta = 0.0; + + cudaStream_t s = A.GetComputeStream(); + cublasSetStream(A.GetCublasHandle(), s); + + // Compute C = beta * C + alpha * (A * B) + cublasDgemm(A.GetCublasHandle(), + CUBLAS_OP_N, CUBLAS_OP_N, + m, n, k, & alpha, + A.GetDataPointer(), m, // *A, lda + B.GetDataPointer(), k, // *B, ldb + & beta, // beta + C.GetDataPointer(), m); // *C, ldc + + C.SetComputeStream(s); +} + +//____________________________________________________________________________ +template<> +void TCuda<float>::TransposeMultiply(TCudaMatrix<float> & C, + const TCudaMatrix<float> & A, + const TCudaMatrix<float> & B) +{ + int m, n, k; + k = A.GetNrows(); + m = A.GetNcols(); + n = B.GetNcols(); + float alpha = 1.0, beta = 0.0; + + cudaStream_t s = A.GetComputeStream(); + cublasSetStream(A.GetCublasHandle(), s); + + // Compute C = beta * C + alpha * (A^T * B) + cublasSgemm(A.GetCublasHandle(), + CUBLAS_OP_T, CUBLAS_OP_N, + m, n, k, & alpha, + A.GetDataPointer(), k, // *A, lda + B.GetDataPointer(), k, // *B, ldb + & beta, // beta + C.GetDataPointer(), m); // *C, ldc + + C.SetComputeStream(s); +} +//____________________________________________________________________________ +template<> +void TCuda<double>::TransposeMultiply(TCudaMatrix<double> & C, + const TCudaMatrix<double> & A, + const TCudaMatrix<double> & B) +{ + int m, n, k; + k = A.GetNrows(); + m = A.GetNcols(); + n = B.GetNcols(); + double alpha = 1.0, beta = 0.0; + + cudaStream_t s = A.GetComputeStream(); + cublasSetStream(A.GetCublasHandle(), s); + + // Compute C = beta * C + alpha * (A^T * B) + cublasDgemm(A.GetCublasHandle(), + CUBLAS_OP_T, CUBLAS_OP_N, + m, n, k, & alpha, + A.GetDataPointer(), k, // *A, lda + B.GetDataPointer(), k, // *B, ldb + & beta, // beta + C.GetDataPointer(), m); // *C, ldc + + C.SetComputeStream(s); +} + +//____________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::Hadamard(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> &A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = 
TDevice::GridDims(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::Hadamard<<<gridDims, blockDims, 0, s>>>(B.GetDataPointer(), + A.GetDataPointer(), + A.GetNrows(), + A.GetNcols()); + B.SetComputeStream(s); +} + +//____________________________________________________________________________ +template<typename AFloat> +AFloat TCuda<AFloat>::Sum(const TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(A); + cudaStream_t s = A.GetComputeStream(); + + TCudaMatrix<AFloat>::ResetDeviceReturn(); + ::TMVA::DNN::Cuda::ReduceMatrix<<<gridDims, blockDims, 0, s>>>( + TCudaMatrix<AFloat>::GetDeviceReturnPointer(), + A.GetDataPointer(), + A.GetNrows(), + A.GetNcols()); + return TCudaMatrix<AFloat>::GetDeviceReturn(); +} + +//____________________________________________________________________________ +template<> +void TCuda<float>::SumColumns(TCudaMatrix<float> & B, + const TCudaMatrix<float> & A) +{ + int m, n; + m = A.GetNrows(); + n = A.GetNcols(); + float alpha = 1.0, beta = 0.0; + + cudaStream_t s = A.GetComputeStream(); + cublasSetStream(A.GetCublasHandle(), s); + + // Compute C = beta * C + alpha * (A * B) + cublasSgemv(A.GetCublasHandle(), CUBLAS_OP_T, + m, n, & alpha, + A.GetDataPointer(), m, // *A, lda + TCudaMatrix<float>::GetOnes(), 1, // *x, incx + & beta, B.GetDataPointer(), 1); // beta, *y, incy + + B.SetComputeStream(s); +} + +//____________________________________________________________________________ +template<> +void TCuda<double>::SumColumns(TCudaMatrix<double> & B, + const TCudaMatrix<double> & A) +{ + int m, n; + m = A.GetNrows(); + n = A.GetNcols(); + double alpha = 1.0, beta = 0.0; + + cudaStream_t s = A.GetComputeStream(); + cublasSetStream(A.GetCublasHandle(), s); + + // Compute C = beta * C + alpha * (A * B) + cublasDgemv(A.GetCublasHandle(), CUBLAS_OP_T, + m, n, & alpha, + A.GetDataPointer(), m, // *A, lda + TCudaMatrix<double>::GetOnes(), 1, // *x, incx + & beta, B.GetDataPointer(), 1); // beta, *y, incy + + B.SetComputeStream(s); +} + +//____________________________________________________________________________ +template<> +void TCuda<float>::ScaleAdd(TCudaMatrix<float> & B, + const TCudaMatrix<float> & A, + float alpha) +{ + cudaStream_t s = 0; + cublasSetStream(A.GetCublasHandle(), s); + cublasSaxpy(A.GetCublasHandle(), A.GetNoElements(), &alpha, + A.GetDataPointer(), 1, + B.GetDataPointer(), 1); +} + +//____________________________________________________________________________ +template<> +void TCuda<double>::ScaleAdd(TCudaMatrix<double> & B, + const TCudaMatrix<double> & A, + double alpha) +{ + cudaStream_t s = 0; + cublasSetStream(A.GetCublasHandle(), s); + cublasDaxpy(A.GetCublasHandle(), A.GetNoElements(), &alpha, + A.GetDataPointer(), 1, + B.GetDataPointer(), 1); +} + +} // DNN +} // TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/CudaBuffers.cxx b/tmva/tmva/src/DNN/Architectures/Cuda/CudaBuffers.cxx new file mode 100644 index 0000000000000000000000000000000000000000..68be42d775101357c9c20c8bb74ae3197c0c0146 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cuda/CudaBuffers.cxx @@ -0,0 +1,334 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 07/08/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. 
* + *************************************************************************/ + +//////////////////////////////////////////////////////////////////////// +// Implementation of device and host buffers for CUDA architectures. // +//////////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/DataLoader.h" +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMVA/DNN/Architectures/Cuda/CudaBuffers.h" +#include "cuda_runtime.h" +#include <iostream> + +namespace TMVA { +namespace DNN { + +// +// TCudaHostBuffer +//______________________________________________________________________________ +template<typename AFloat> +void TCudaHostBuffer<AFloat>::TDestructor::operator()(AFloat ** devicePointer) +{ + cudaFreeHost(*devicePointer); + delete[] devicePointer; +} + +//______________________________________________________________________________ +template<typename AFloat> +TCudaHostBuffer<AFloat>::TCudaHostBuffer(size_t size) + : fOffset(0), fComputeStream(0), fDestructor() +{ + AFloat ** pointer = new AFloat * [1]; + cudaMallocHost(pointer, size * sizeof(AFloat)); + fHostPointer = std::shared_ptr<AFloat *>(pointer, fDestructor); +} + +//______________________________________________________________________________ +template<typename AFloat> +TCudaHostBuffer<AFloat>::operator AFloat * () const +{ + return *fHostPointer + fOffset; +} + +//______________________________________________________________________________ +template<typename AFloat> +TCudaHostBuffer<AFloat> TCudaHostBuffer<AFloat>::GetSubBuffer(size_t offset, + size_t /*size*/) +{ + TCudaHostBuffer buffer = *this; + buffer.fOffset = offset; + return buffer; +} + +// +// TCudaDevicePointer +//______________________________________________________________________________ +template<typename AFloat> +void TCudaDeviceBuffer<AFloat>::TDestructor::operator()(AFloat ** devicePointer) +{ + cudaFree(*devicePointer); + delete[] devicePointer; +} + +//______________________________________________________________________________ +template<typename AFloat> +TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size) + : fOffset(0), fSize(size), fDestructor() +{ + AFloat ** pointer = new AFloat * [1]; + cudaMalloc(pointer, size * sizeof(AFloat)); + fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor); + cudaStreamCreate(&fComputeStream); +} + +//______________________________________________________________________________ +template<typename AFloat> +TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size, + cudaStream_t stream) + : fOffset(0), fSize(size), fComputeStream(stream), fDestructor() +{ + AFloat ** pointer = new AFloat * [1]; + cudaMalloc(pointer, size * sizeof(AFloat)); + fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor); +} + +//______________________________________________________________________________ +template<typename AFloat> +TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(AFloat * devicePointer, + size_t size, + cudaStream_t stream) + : fOffset(0), fSize(size), fComputeStream(stream), fDestructor() +{ + AFloat ** pointer = new AFloat * [1]; + *pointer = devicePointer; + fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor); +} + +//______________________________________________________________________________ +template<typename AFloat> +TCudaDeviceBuffer<AFloat> TCudaDeviceBuffer<AFloat>::GetSubBuffer(size_t offset, + size_t size) +{ + TCudaDeviceBuffer buffer = *this; + buffer.fOffset = offset; + buffer.fSize = size; + return buffer; +} + 
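GetSubBuffer above returns a view that shares the parent's shared_ptr and only shifts the offset, so a view keeps the underlying cudaMalloc allocation alive for as long as it exists. A hedged usage sketch follows; the sizes are illustrative, and it relies on the buffer-taking TCudaMatrix constructor declared in CudaMatrix.h.

// Sketch: carving one device allocation into two matrix views without copies.
#include "TMVA/DNN/Architectures/Cuda/CudaBuffers.h"
#include "TMVA/DNN/Architectures/Cuda/CudaMatrix.h"

void SliceExample()
{
   using namespace TMVA::DNN;

   const size_t batchSize = 32, nFeatures = 8;
   TCudaDeviceBuffer<double> pool(2 * batchSize * nFeatures);

   // Two non-overlapping views into the same allocation.
   TCudaDeviceBuffer<double> inputView  = pool.GetSubBuffer(0, batchSize * nFeatures);
   TCudaDeviceBuffer<double> outputView = pool.GetSubBuffer(batchSize * nFeatures,
                                                            batchSize * nFeatures);

   // Wrap the views as matrices; both share pool's memory and compute stream.
   TCudaMatrix<double> input (inputView,  batchSize, nFeatures);
   TCudaMatrix<double> output(outputView, batchSize, nFeatures);
}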
+//______________________________________________________________________________ +template<typename AFloat> +TCudaDeviceBuffer<AFloat>::operator AFloat * () const +{ + return *fDevicePointer + fOffset; +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCudaDeviceBuffer<AFloat>::CopyFrom(const TCudaHostBuffer<AFloat> &buffer) const +{ + cudaStreamSynchronize(fComputeStream); + cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat), + cudaMemcpyHostToDevice, fComputeStream); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCudaDeviceBuffer<AFloat>::CopyTo(const TCudaHostBuffer<AFloat> &buffer) const +{ + cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat), + cudaMemcpyDeviceToHost, fComputeStream); + buffer.fComputeStream = fComputeStream; +} + +//______________________________________________________________________________ +template<> +void TDataLoader<MatrixInput_t, TCuda<float>>::CopyInput( + TCudaHostBuffer<float> & buffer, + IndexIterator_t sampleIterator, + size_t batchSize) +{ + const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData); + size_t n = inputMatrix.GetNcols(); + + for (size_t i = 0; i < batchSize; i++) { + size_t sampleIndex = *sampleIterator; + for (size_t j = 0; j < n; j++) { + size_t bufferIndex = j * batchSize + i; + buffer[bufferIndex] = static_cast<float>(inputMatrix(sampleIndex, j)); + } + sampleIterator++; + } +} + +//______________________________________________________________________________ +template<> +void TDataLoader<MatrixInput_t, TCuda<float>>::CopyOutput( + TCudaHostBuffer<float> & buffer, + IndexIterator_t sampleIterator, + size_t batchSize) +{ + const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData); + size_t n = outputMatrix.GetNcols(); + + for (size_t i = 0; i < batchSize; i++) { + size_t sampleIndex = *sampleIterator; + for (size_t j = 0; j < n; j++) { + size_t bufferIndex = j * batchSize + i; + buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j)); + } + sampleIterator++; + } +} + +//______________________________________________________________________________ +template<> +void TDataLoader<TMVAInput_t, TCuda<float>>::CopyInput( + TCudaHostBuffer<float> & buffer, + IndexIterator_t sampleIterator, + size_t batchSize) +{ + Event * event = fData.front(); + size_t n = event->GetNVariables(); + + // Copy input variables. + + for (size_t i = 0; i < batchSize; i++) { + size_t sampleIndex = * sampleIterator++; + event = fData[sampleIndex]; + for (size_t j = 0; j < n; j++) { + size_t bufferIndex = j * batchSize + i; + buffer[bufferIndex] = static_cast<float>(event->GetValue(j)); + } + } +} + +//______________________________________________________________________________ +template<> +void TDataLoader<TMVAInput_t, TCuda<float>>::CopyOutput( + TCudaHostBuffer<float> & buffer, + IndexIterator_t sampleIterator, + size_t batchSize) +{ + Event * event = fData.front(); + size_t n = (event->GetNTargets() == 0) ? 1 : event->GetNTargets(); + + // Copy target(s). + + for (size_t i = 0; i < batchSize; i++) { + size_t sampleIndex = * sampleIterator++; + event = fData[sampleIndex]; + for (size_t j = 0; j < n; j++) { + // Copy output matrices. + size_t bufferIndex = j * batchSize + i; + if (event->GetNTargets() == 0) { + buffer[bufferIndex] = (event->GetClass() == 0) ? 
1.0 : 0.0; + } else { + buffer[bufferIndex] = static_cast<float>(event->GetTarget(j)); + } + } + } +} + +//______________________________________________________________________________ +template<> +void TDataLoader<MatrixInput_t, TCuda<double>>::CopyInput( + TCudaHostBuffer<double> & buffer, + IndexIterator_t sampleIterator, + size_t batchSize) +{ + const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData); + size_t n = inputMatrix.GetNcols(); + + for (size_t i = 0; i < batchSize; i++) { + size_t sampleIndex = *sampleIterator; + for (size_t j = 0; j < n; j++) { + size_t bufferIndex = j * batchSize + i; + buffer[bufferIndex] = inputMatrix(sampleIndex, j); + } + sampleIterator++; + } +} + +//______________________________________________________________________________ +template<> +void TDataLoader<MatrixInput_t, TCuda<double>>::CopyOutput( + TCudaHostBuffer<double> & buffer, + IndexIterator_t sampleIterator, + size_t batchSize) +{ + const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData); + size_t n = outputMatrix.GetNcols(); + + for (size_t i = 0; i < batchSize; i++) { + size_t sampleIndex = *sampleIterator; + for (size_t j = 0; j < n; j++) { + size_t bufferIndex = j * batchSize + i; + buffer[bufferIndex] = outputMatrix(sampleIndex, j); + } + sampleIterator++; + } +} + +//______________________________________________________________________________ +template<> +void TDataLoader<TMVAInput_t, TCuda<double>>::CopyInput( + TCudaHostBuffer<double> & buffer, + IndexIterator_t sampleIterator, + size_t batchSize) +{ + Event * event = fData.front(); + size_t n = event->GetNVariables(); + + // Copy input variables. + + for (size_t i = 0; i < batchSize; i++) { + size_t sampleIndex = * sampleIterator++; + event = fData[sampleIndex]; + for (size_t j = 0; j < n; j++) { + size_t bufferIndex = j * batchSize + i; + buffer[bufferIndex] = event->GetValue(j); + } + } +} + +//______________________________________________________________________________ +template<> +void TDataLoader<TMVAInput_t, TCuda<double>>::CopyOutput( + TCudaHostBuffer<double> & buffer, + IndexIterator_t sampleIterator, + size_t batchSize) +{ + Event * event = fData.front(); + size_t n = (event->GetNTargets() == 0) ? 1 : event->GetNTargets(); + + // Copy target(s). + + for (size_t i = 0; i < batchSize; i++) { + size_t sampleIndex = * sampleIterator++; + event = fData[sampleIndex]; + for (size_t j = 0; j < n; j++) { + // Copy output matrices. + size_t bufferIndex = j * batchSize + i; + if (event->GetNTargets() == 0) { + buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0; + } else { + buffer[bufferIndex] = event->GetTarget(j); + } + } + } +} + +// Explicit Instantiations. 
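All of the CopyInput/CopyOutput specializations above stage the host buffer column-major, writing sample i's feature j to index j * batchSize + i, so the staged block can be copied straight into a device matrix that cuBLAS reads in column-major order. A small illustration of the mapping; the helper name is made up.

// Sketch: the staging rule used above. For batchSize = 3 and two features,
// (sample, feature) pairs land at (0,0)->0 (1,0)->1 (2,0)->2 (0,1)->3 (1,1)->4 ...
#include <cstddef>

inline size_t StagingIndex(size_t sample, size_t feature, size_t batchSize)
{
   return feature * batchSize + sample;   // each feature fills one contiguous column
}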
+ +template class TCudaDeviceBuffer<float>; +template class TCudaDeviceBuffer<double>; + +template class TCudaHostBuffer<float>; +template class TCudaHostBuffer<double>; + +template class TDataLoader<MatrixInput_t, TCuda<float>>; +template class TDataLoader<TMVAInput_t, TCuda<float>>; +template class TDataLoader<MatrixInput_t, TCuda<double>>; +template class TDataLoader<TMVAInput_t, TCuda<double>>; + +} // TMVA +} // DNN diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu b/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu new file mode 100644 index 0000000000000000000000000000000000000000..0a3f27b3f8a04db7a0c3c71d18cba39dd4b38ec1 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cuda/CudaMatrix.cu @@ -0,0 +1,167 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +///////////////////////////////////////////// +// Implementation of the TCudaMatrix class. // +///////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cuda/CudaMatrix.h" +#include "TMVA/DNN/Architectures/Cuda/Device.h" + +namespace TMVA { +namespace DNN { + +//____________________________________________________________________________ +__global__ void CurandInitializationKernel(unsigned long long seed, + curandState_t *state) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = i * gridDim.x + j; + curand_init(seed + tid, 0, tid, state + tid); +} + +// Static members. +//____________________________________________________________________________ +template<typename AFloat> +size_t TCudaMatrix<AFloat>::fInstances = 0; +template<typename AFloat> +cublasHandle_t TCudaMatrix<AFloat>::fCublasHandle = nullptr; +template<typename AFloat> +AFloat * TCudaMatrix<AFloat>::fDeviceReturn = nullptr; +template<typename AFloat> +AFloat * TCudaMatrix<AFloat>::fOnes = nullptr; +template<typename AFloat> +curandState_t * TCudaMatrix<AFloat>::fCurandStates = nullptr; +template<typename AFloat> +size_t TCudaMatrix<AFloat>::fNCurandStates = 0; +template<typename AFloat> +size_t TCudaMatrix<AFloat>::fNOnes = 0; + +// Constructors. 
+//____________________________________________________________________________ +template<typename AFloat> +TCudaMatrix<AFloat>::TCudaMatrix() + : fNRows(0), fNCols(0), fElementBuffer() +{ + InitializeCuda(); +} + +//____________________________________________________________________________ +template<typename AFloat> +TCudaMatrix<AFloat>::TCudaMatrix(size_t m, size_t n) + : fNRows(m), fNCols(n), fElementBuffer(m * n, 0) +{ + InitializeCuda(); +} + +//____________________________________________________________________________ +template<typename AFloat> +TCudaMatrix<AFloat>::TCudaMatrix(const TMatrixT<Double_t> & Host) + : fNRows(Host.GetNrows()), fNCols(Host.GetNcols()), + fElementBuffer(Host.GetNoElements(), 0) +{ + InitializeCuda(); + + AFloat * buffer = new AFloat[fNRows * fNCols]; + size_t index = 0; + for (size_t j = 0; j < fNCols; j++) { + for (size_t i = 0; i < fNRows; i++) { + buffer[index] = static_cast<AFloat>(Host(i, j)); + index++; + } + } + + cudaMemcpy(fElementBuffer, buffer, fNRows * fNCols * sizeof(AFloat), + cudaMemcpyHostToDevice); +} + +//____________________________________________________________________________ +template<typename AFloat> +TCudaMatrix<AFloat>::TCudaMatrix(TCudaDeviceBuffer<AFloat> buffer, + size_t m, size_t n) + : fNRows(m), fNCols(n), fElementBuffer(buffer) +{ + InitializeCuda(); +} + +//____________________________________________________________________________ +template <typename AFloat> +inline void TCudaMatrix<AFloat>::InitializeCuda() +{ + if (fInstances == 0) { + cublasCreate(&fCublasHandle); + CUDACHECK(cudaMalloc(& fDeviceReturn, sizeof(AFloat))); + CUDACHECK(cudaMalloc(& fCurandStates, TDevice::NThreads(*this))); + } + if (TDevice::NThreads(*this) > (int) fNCurandStates) { + fNCurandStates = TDevice::NThreads(*this); + if (fCurandStates) { + cudaFree(fCurandStates); + } + cudaMalloc(&fCurandStates, TDevice::NThreads(*this) * sizeof(curandState_t)); + InitializeCurandStates(); + } + if (fNRows > fNOnes) { + fNOnes = fNRows; + if (fOnes) { + cudaFree(fOnes); + } + cudaMalloc(&fOnes, fNRows * sizeof(AFloat)); + AFloat * buffer = new AFloat[fNRows]; + for (size_t i = 0; i < fNRows; i++) { + buffer[i] = 1.0; + } + cudaMemcpy(fOnes, buffer, fNRows * sizeof(AFloat), + cudaMemcpyHostToDevice); + } + fInstances++; +} + +//____________________________________________________________________________ +template<typename AFloat> +void TCudaMatrix<AFloat>::InitializeCurandStates() +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(*this); + CurandInitializationKernel<<<gridDims, blockDims>>>(time(nullptr), fCurandStates); +} + +// Conversion to TMatrixT. +//____________________________________________________________________________ +template<typename AFloat> +TCudaMatrix<AFloat>::operator TMatrixT<Double_t>() const +{ + TMatrixT<Double_t> hostMatrix(GetNrows(), GetNcols()); + + AFloat * buffer = new AFloat[fNRows * fNCols]; + cudaMemcpy(buffer, fElementBuffer, fNRows * fNCols * sizeof(AFloat), + cudaMemcpyDeviceToHost); + + size_t index = 0; + for (size_t j = 0; j < fNCols; j++) { + for (size_t i = 0; i < fNRows; i++) { + hostMatrix(i, j) = static_cast<Double_t>(buffer[index]); + index++; + } + } + + delete[] buffer; + return hostMatrix; +} + +// Explicit Instantiations. 
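Constructing a TCudaMatrix from a TMatrixT and converting back each go through a host staging buffer plus a cudaMemcpy, so these conversions are intended for loading data and inspecting results rather than for per-iteration use. A hedged round-trip sketch, assuming the headers used elsewhere in this file:

// Sketch: host -> device -> host round trip using the conversions above.
#include "TMatrix.h"
#include "TMVA/DNN/Architectures/Cuda/CudaMatrix.h"

void RoundTripExample()
{
   TMatrixT<Double_t> host(4, 4);
   host(1, 2) = 3.14;                               // fill on the host

   TMVA::DNN::TCudaMatrix<Double_t> device(host);   // copies host -> device
   TMatrixT<Double_t> back = device;                // copies device -> host

   // back(1, 2) now holds 3.14 again.
}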
+ +template class TCudaMatrix<float>; +template class TCudaMatrix<double>; + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu new file mode 100644 index 0000000000000000000000000000000000000000..501e2e6d2492080b41f12fde17c3d1c31ddd87e0 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cuda/Dropout.cu @@ -0,0 +1,40 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 14/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMVA/DNN/Architectures/Cuda/Device.h" +#include "Kernels.cuh" + +///////////////////////////////////////////////////////////////////// +// Implementation of the Dropout function for TCuda architectures. // +///////////////////////////////////////////////////////////////////// + +namespace TMVA { +namespace DNN { + +//____________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::Dropout(TCudaMatrix<AFloat> &A, + AFloat dropoutProbability) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(A); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::Dropout<<<gridDims, blockDims, 0, s>>>( + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols(), + dropoutProbability, + TCudaMatrix<AFloat>::GetCurandStatesPointer()); +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Initialization.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Initialization.cu new file mode 100644 index 0000000000000000000000000000000000000000..1492d1473534c39e6a24d6cbdbce9949fe19c826 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cuda/Initialization.cu @@ -0,0 +1,108 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 14/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. 
* + *************************************************************************/ + + ///////////////////////////////////////////////////////////// + // Implementation of the initialization functions for CUDA // + // Architectures // + ///////////////////////////////////////////////////////////// + +#include "TRandom.h" +#include "TMatrix.h" +#include "TMVA/DNN/Architectures/Cuda.h" +#include "Kernels.cuh" + +namespace TMVA +{ +namespace DNN +{ + +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::InitializeGauss(TCudaMatrix<AFloat> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + TRandom rand(time(nullptr)); + TMatrixT<Double_t> B(m, n); + + Double_t sigma = sqrt(2.0 / ((Double_t) n)); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + B(i,j) = rand.Gaus(0.0, sigma); + } + } + A = B; +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::InitializeUniform(TCudaMatrix<AFloat> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + TRandom rand(time(nullptr)); + TMatrixT<Double_t> B(m, n); + + Double_t range = sqrt(2.0 / ((Double_t) n)); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + B(i,j) = rand.Uniform(-range, range); + } + } + A = B; +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::InitializeIdentity(TCudaMatrix<AFloat> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + TMatrixT<Double_t> B(m, n); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n ; j++) { + B(i,j) = 0.0; + } + + if (i < n) { + B(i,i) = 1.0; + } + } + A = B; +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::InitializeZero(TCudaMatrix<AFloat> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + TMatrixT<Double_t> B(m, n); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n ; j++) { + B(i,j) = 0.0; + } + } + A = B; +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh b/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh new file mode 100644 index 0000000000000000000000000000000000000000..a0ace5f1bbdb8c68f4f893c9f1d0a79a1b09be71 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh @@ -0,0 +1,672 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +///////////////////////////////////////////////////////////////////////// +// Implementation of the device kernels for the CUDA implementation of // +// the low-level interface. 
// +///////////////////////////////////////////////////////////////////////// + +#ifndef TMVA_DNN_ARCHITECTURES_CUDA_KERNELS +#define TMVA_DNN_ARCHITECTURES_CUDA_KERNELS + +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMVA/DNN/Architectures/Cuda/Device.h" +#include "cuda.h" +#include "math.h" + +namespace TMVA { +namespace DNN { +namespace Cuda { + +//____________________________________________________________________________ +template<typename AFloat> +__device__ AFloat AtomicAdd(AFloat* address, AFloat val); + +template<> +__device__ double AtomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} + +template<> +__device__ float AtomicAdd(float* address, float val) +{ + return atomicAdd(address, val); +} + +//____________________________________________________________________________ +template<typename AFloat> +__device__ void ReduceSumVertical(AFloat *result, + AFloat * sdata, + int n) +{ + // i,j are block row and column indices. + int i = threadIdx.y; + int j = threadIdx.x; + int index = i * blockDim.x + j; + + __syncthreads(); + if ((blockDim.y > 512) && (i < 512)) { + if ((i + 512) < blockDim.y) { + sdata[index] += sdata[index + 512 * blockDim.x]; + } + } + + __syncthreads(); + if ((blockDim.y > 256) && (i < 256)) { + if ((i + 256) < blockDim.y) { + sdata[index] += sdata[index + 256 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 128) && (i < 128)) { + if ((i + 128) < blockDim.y) { + sdata[index] += sdata[index + 128 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 64) && (i < 64)) { + if ((i + 64) < blockDim.y) { + sdata[index] += sdata[index + 64 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 32) && (i < 32)) { + if ((i + 32) < blockDim.y) { + sdata[index] += sdata[index + 32 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 16) && (i < 16)) { + if ((i + 16) < blockDim.y) { + sdata[index] += sdata[index + 16 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 8) && (i < 8)) { + if ((i + 8) < blockDim.y) { + sdata[index] += sdata[index + 8 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 4) && (i < 4)) { + if ((i + 4) < blockDim.y) { + sdata[index] += sdata[index + 4 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 2) && (i < 2)) { + if ((i + 2) < blockDim.y) { + sdata[index] += sdata[index + 2 * blockDim.x]; + } + } + __syncthreads(); + if ((blockDim.y > 1) && (i < 1)) { + if ((i + 1) < blockDim.y) { + sdata[index] += sdata[index + 1 * blockDim.x]; + } + } + __syncthreads(); + if ((i == 0) && ((blockIdx.x * blockDim.x + threadIdx.x) < n)) { + AtomicAdd(result + j, sdata[index]); + } + __syncthreads(); +} + +//____________________________________________________________________________ +template<typename AFloat> +__device__ void ReduceSum(AFloat *result, AFloat * sdata) +{ + int tid = threadIdx.x + threadIdx.y * blockDim.x; + + __syncthreads(); + if ((TDevice::BlockSize > 512) && (tid < 512)) { + if ((tid + 512) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 512]; + } + } + + __syncthreads(); + if ((TDevice::BlockSize > 256) && (tid < 256)) { + if ((tid + 256) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 256]; + } + } + __syncthreads(); + if ((TDevice::BlockSize 
> 128) && (tid < 128)) { + if ((tid + 128) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 128]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 64) && (tid < 64)) { + if ((tid + 64) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 64]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 32) && (tid < 32)) { + if ((tid + 32) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 32]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 16) && (tid < 16)) { + if ((tid + 16) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 16]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 8) && (tid < 8)) { + if ((tid + 8) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 8]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 4) && (tid < 4)) { + if ((tid + 4) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 4]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 2) && (tid < 2)) { + if ((tid + 2) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 2]; + } + } + __syncthreads(); + if ((TDevice::BlockSize > 1) && (tid < 1)) { + if ((tid + 1) < TDevice::BlockSize) { + sdata[tid] += sdata[tid + 1]; + } + } + if (tid == 0) { + AtomicAdd(result, sdata[0]); + } + + __syncthreads(); +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void AddRowWise(AFloat * W, + const AFloat * theta, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) + W[index] += theta[j]; +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void Hadamard(AFloat * B, + const AFloat * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) + B[index] *= A[index]; +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void IdentityDerivative(AFloat * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) + A[index] = 1.0; +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void Relu(AFloat * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + AFloat x = A[index]; + A[index] = (x < 0.0) ? 0.0 : x; + } +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void ReluDerivative(AFloat * B, + const AFloat * A, int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + AFloat x = A[index]; + B[index] = (x < 0.0) ? 
0.0 : 1.0; + } +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void Sigmoid(AFloat * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + AFloat sig = 1.0 / (1.0 + exp(-A[index])); + A[index] = sig; + } +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void Sigmoid(AFloat * B, + const AFloat * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + AFloat sig = 1.0 / (1.0 + exp(-A[index])); + B[index] = sig; + } +} +//____________________________________________________________________________ +template<typename AFloat> +__global__ void SigmoidDerivative(AFloat * B, + const AFloat * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + AFloat sig = 1.0 / (1.0 + exp(-A[index])); + B[index] = sig * (1.0 - sig); + } +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void Tanh(AFloat * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + AFloat t = ::tanh(A[index]); + A[index] = t; + } +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void TanhDerivative(AFloat * B, + const AFloat * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + AFloat t = ::tanh(A[index]); + B[index] = 1 - t*t; + } +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void SymmetricRelu(AFloat * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + A[index] = abs(A[index]); + } +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void SymmetricReluDerivative(AFloat * B, + const AFloat * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + B[index] = (A[index] < 0.0) ? 
-1.0 : 1.0; + } +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void SoftSign(AFloat * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + AFloat x = A[index]; + A[index] = x / (1.0 + abs(x)); + } +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void SoftSignDerivative(AFloat * B, + const AFloat * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + AFloat x = 1.0 + fabs(A[index]); + B[index] = 1 / (x * x); + } +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void Gauss(AFloat * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + AFloat x = A[index]; + A[index] = exp(- x * x); + } +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void GaussDerivative(AFloat * B, + const AFloat * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + AFloat x = A[index]; + B[index] = - 2.0 * x * exp(- x * x); + } +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void MeanSquaredError(AFloat * result, + const AFloat * Y, + const AFloat * output, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = blockDim.x * threadIdx.y + threadIdx.x; + int index = j * m + i; + + __shared__ AFloat sdata[TDevice::BlockSize]; + + if ((i < m) && (j < n)) { + AFloat norm = 1 / ((AFloat) (m * n)); + AFloat e = Y[index] - output[index]; + sdata[tid] = norm * e * e; + } else { + sdata[tid] = 0.0; + } + ReduceSum(result, sdata); +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void SquaredSum(AFloat * result, + const AFloat * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = blockDim.x * threadIdx.y + threadIdx.x; + int index = j * m + i; + + __shared__ AFloat sdata[TDevice::BlockSize]; + + if ((i < m) && (j < n)) { + AFloat e = A[index]; + sdata[tid] = e * e; + } else { + sdata[tid] = 0.0; + } + ReduceSum(result, sdata); +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void AbsoluteSum(AFloat * result, + const AFloat * A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = blockDim.x * threadIdx.y + threadIdx.x; + int index = j * m + i; + + __shared__ AFloat sdata[TDevice::BlockSize]; + + if ((i < m) && (j < n)) { + sdata[tid] = abs(A[index]); + } else { + sdata[tid] = 0.0; + } + ReduceSum(result, sdata); +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void MeanSquaredErrorGradients(AFloat * dY, + const AFloat * Y, + const 
AFloat * output, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) + dY[index] = 2.0 / ((AFloat) (m * n)) * (output[index] - Y[index]); +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void AddL1RegularizationGradients(AFloat * A, + const AFloat * B, + AFloat weightDecay, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + AFloat sign = (B[index] < 0.0) ? -1.0 : 1.0; + A[index] += sign * weightDecay; + } +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void AddL2RegularizationGradients(AFloat * A, + const AFloat * B, + AFloat weightDecay, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + A[index] += 2.0 * weightDecay * B[index]; + } +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void CrossEntropy(AFloat * result, + const AFloat * Y, + const AFloat * output, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = blockDim.x * threadIdx.y + threadIdx.x; + int index = j * m + i; + + __shared__ AFloat sdata[TDevice::BlockSize]; + + if ((i < m) && (j < n)) { + AFloat norm = 1 / ((AFloat) (m * n)); + AFloat sig = 1.0 / (1.0 + exp(-output[index])); + AFloat ce = Y[index] * log(sig) + (1.0 - Y[index]) * log(1.0 - sig); + sdata[tid] = - norm * ce; + } else { + sdata[tid] = 0.0; + } + + ReduceSum(result, sdata); +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void CrossEntropyGradients(AFloat * dY, + const AFloat * Y, + const AFloat * output, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int index = j * m + i; + + if ((i < m) && (j < n)) { + AFloat norm = 1 / ((AFloat) (m * n)); + AFloat y = Y[index]; + AFloat sig = 1.0 / (1.0 + exp(-output[index])); + dY[index] = norm * (sig - y); + } +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void ReduceMatrix(AFloat *result, + const AFloat *A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = threadIdx.y * blockDim.x + threadIdx.x; + int index = j * m + i; + + __shared__ AFloat smem[TDevice::BlockSize]; + if ((i < m) && (j < n)) + smem[tid] = A[index]; + else + smem[tid] = 0.0; + + ReduceSum(result, smem); +} + +//____________________________________________________________________________ +template<typename AFloat> +__global__ void SumColumns(AFloat *B, + const AFloat *A, + int m, int n) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int matrixIndex = j * m + i; + int blockIndex = blockDim.x * threadIdx.y + threadIdx.x; + + + __shared__ AFloat smem[TDevice::BlockSize]; + + if ((i < m) && (j < n)) { + smem[blockIndex] = A[matrixIndex]; + } else { + smem[blockIndex] = 0.0; + } + + ReduceSumVertical(B + blockDim.x * blockIdx.x, smem, n); +} + 
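+// The Dropout kernel below treats dropoutProbability as the probability of
+// keeping an element: entries whose uniform random draw exceeds this threshold
+// are set to zero, and surviving entries are rescaled by 1 / dropoutProbability
+// (inverted dropout), so the expected value of each activation is preserved.
+// This is the same convention as TReference<Real_t>::Dropout.
+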
+//____________________________________________________________________________ +template<typename AFloat> +__global__ void Dropout(AFloat *A, + int m, int n, + AFloat dropoutProbability, + curandState_t *state) +{ + int i = blockDim.y * blockIdx.y + threadIdx.y; + int j = blockDim.x * blockIdx.x + threadIdx.x; + int tid = i * gridDim.x + j; + if ((i < m) && (j < n)) { + float r = curand_uniform(state + tid); + if (r > dropoutProbability) { + A[j * m + i] = 0.0; + } else { + A[j * m + i] /= dropoutProbability; + } + } +} + +} // namespace Cuda +} // namespace DNN +} // namespace TMVA + +#endif diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu new file mode 100644 index 0000000000000000000000000000000000000000..8a242468ff7cf8092721ad0ca6294907e037bfdb --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cuda/LossFunctions.cu @@ -0,0 +1,99 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////////// +// Implementation of the loss functions for the TCuda implementation // +// of the low-level interface. // +/////////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMVA/DNN/Architectures/Cuda/Device.h" +#include "Kernels.cuh" + +namespace TMVA +{ +namespace DNN +{ + +//____________________________________________________________________________ +template<typename AFloat> +AFloat TCuda<AFloat>::MeanSquaredError(const TCudaMatrix<AFloat> & Y, + const TCudaMatrix<AFloat> & output) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(Y); + cudaStream_t s = Y.GetComputeStream(); + TCudaMatrix<AFloat>::ResetDeviceReturn(); + ::TMVA::DNN::Cuda::MeanSquaredError<<<gridDims, blockDims, 0, s>>>( + TCudaMatrix<AFloat>::GetDeviceReturnPointer(), + Y.GetDataPointer(), + output.GetDataPointer(), + (int) Y.GetNrows(), + (int) Y.GetNcols()); + return TCudaMatrix<AFloat>::GetDeviceReturn(); +} + +//____________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::MeanSquaredErrorGradients(TCudaMatrix<AFloat> & dY, + const TCudaMatrix<AFloat> & Y, + const TCudaMatrix<AFloat> & output) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(Y); + cudaStream_t s = output.GetComputeStream(); + ::TMVA::DNN::Cuda::MeanSquaredErrorGradients<<<gridDims, blockDims, 0, s>>>( + dY.GetDataPointer(), + Y.GetDataPointer(), + output.GetDataPointer(), + (int) Y.GetNrows(), + (int) Y.GetNcols()); + dY.SetComputeStream(s); +} + +//____________________________________________________________________________ +template<typename AFloat> +AFloat TCuda<AFloat>::CrossEntropy(const TCudaMatrix<AFloat> & Y, + const TCudaMatrix<AFloat> & output) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(Y); + TCudaMatrix<AFloat>::ResetDeviceReturn(); + cudaStream_t s = Y.GetComputeStream(); + ::TMVA::DNN::Cuda::CrossEntropy<<<gridDims, blockDims, 0, s>>>( + TCudaMatrix<AFloat>::GetDeviceReturnPointer(), + Y.GetDataPointer(), + 
output.GetDataPointer(), + (int) Y.GetNrows(), + (int) Y.GetNcols()); + return TCudaMatrix<AFloat>::GetDeviceReturn(); +} + +//____________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::CrossEntropyGradients(TCudaMatrix<AFloat> & dY, + const TCudaMatrix<AFloat> & Y, + const TCudaMatrix<AFloat> & output) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(Y); + cudaStream_t s = output.GetComputeStream(); + ::TMVA::DNN::Cuda::CrossEntropyGradients<<<gridDims, blockDims, 0, s>>>( + dY.GetDataPointer(), + Y.GetDataPointer(), + output.GetDataPointer(), + (int) Y.GetNrows(), + (int) Y.GetNcols()); + dY.SetComputeStream(s); +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu b/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu new file mode 100644 index 0000000000000000000000000000000000000000..039fb27a8e36795add5e1fb5a60c56ef4e6ad138 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cuda/OutputFunctions.cu @@ -0,0 +1,41 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 11/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////////////// +// Explicit instantiation of the Reference architecture class // +// template for Double_t scalar types. // +//////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMVA/DNN/Architectures/Cuda/Device.h" +#include "Kernels.cuh" + +namespace TMVA +{ +namespace DNN +{ + +template<typename AFloat> +void TCuda<AFloat>::Sigmoid(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::Sigmoid<<<gridDims, blockDims, 0, s>>>(B.GetDataPointer(), + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); + B.SetComputeStream(s); +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu new file mode 100644 index 0000000000000000000000000000000000000000..047c6411b52cd9153d707cd79d1eef3e4c4fdc6b --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu @@ -0,0 +1,132 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + + ////////////////////////////////////////////////////////////////// + // Implementation of the functions required for the forward and // + // backward propagation of activations through a neural network // + // for CUDA architectures. 
// + ////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMVA/DNN/Architectures/Cuda/Device.h" +#include "Kernels.cuh" + +namespace TMVA { +namespace DNN { + +//____________________________________________________________________________ +template<> +void TCuda<float>::MultiplyTranspose(TCudaMatrix<float> &output, + const TCudaMatrix<float> &input, + const TCudaMatrix<float> &Weights) +{ + int m, n, k; + k = input.GetNcols(); + m = input.GetNrows(); + n = Weights.GetNrows(); + float alpha = 1.0, beta = 0.0; + + // Compute C = beta * C + alpha * (A * B^T) + cudaStream_t s = input.GetComputeStream(); + cublasSetStream(input.GetCublasHandle(), s); + cublasSgemm(input.GetCublasHandle(), + CUBLAS_OP_N, CUBLAS_OP_T, + m, n, k, & alpha, + input.GetDataPointer(), m, // *A, lda + Weights.GetDataPointer(), n, // *B, ldb + & beta, // beta + output.GetDataPointer(), m); // *C, ldc + output.SetComputeStream(s); +} + +//____________________________________________________________________________ +template<> +void TCuda<double>::MultiplyTranspose(TCudaMatrix<double> &output, + const TCudaMatrix<double> &input, + const TCudaMatrix<double> &Weights) +{ + int m, n, k; + k = input.GetNcols(); + m = input.GetNrows(); + n = Weights.GetNrows(); + double alpha = 1.0, beta = 0.0; + + // Compute C = beta * C + alpha * (A * B^T) + cudaStream_t s = input.GetComputeStream(); + cublasSetStream(input.GetCublasHandle(), s); + cublasDgemm(input.GetCublasHandle(), + CUBLAS_OP_N, CUBLAS_OP_T, + m, n, k, & alpha, + input.GetDataPointer(), m, // *A, lda + Weights.GetDataPointer(), n, // *B, ldb + & beta, // beta + output.GetDataPointer(), m); // *C, ldc + output.SetComputeStream(s); +} + +//____________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::AddRowWise(TCudaMatrix<AFloat> &Weights, + const TCudaMatrix<AFloat> &theta) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(Weights); + cudaStream_t s = Weights.GetComputeStream(); + ::TMVA::DNN::Cuda::AddRowWise<<<gridDims, blockDims, 0, s>>>( + Weights.GetDataPointer(), + theta.GetDataPointer(), + Weights.GetNrows(), + Weights.GetNcols()); +} + +//____________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::Backward(TCudaMatrix<AFloat> & activation_gradients_backward, + TCudaMatrix<AFloat> & weight_gradients, + TCudaMatrix<AFloat> & bias_gradients, + TCudaMatrix<AFloat> & df, + const TCudaMatrix<AFloat> & activation_gradients, + const TCudaMatrix<AFloat> & weights, + const TCudaMatrix<AFloat> & activation_backward) +{ + // Compute element-wise product. + TCuda<AFloat>::Hadamard(df, activation_gradients); + + // Activation gradients. + if (activation_gradients_backward.GetNoElements() > 0) { + TCuda<AFloat>::Multiply(activation_gradients_backward, df, weights); + } + + // Weight gradients. + if (weight_gradients.GetNoElements() > 0) { + TCuda<AFloat>::TransposeMultiply(weight_gradients, df, activation_backward); + } + + // Bias gradients. 
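+   // The bias gradient is the column-wise sum of df (a sum over the events in
+   // the batch); the SumColumns kernel performs this reduction on the device
+   // and writes one entry per neuron into bias_gradients.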
+ if (bias_gradients.GetNoElements() > 0) { + TCuda<AFloat>::SumColumns(bias_gradients, df); + } + +} + +//____________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::Copy(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A) +{ + size_t m = B.GetNrows(); + size_t n = B.GetNcols(); + cudaMemcpyAsync(B.GetDataPointer(), A.GetDataPointer(), + m * n * sizeof(AFloat), cudaMemcpyDeviceToDevice, 0); +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu new file mode 100644 index 0000000000000000000000000000000000000000..67851eaef8bb32df3d99c07d30a32086a494e2b4 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Cuda/Regularization.cu @@ -0,0 +1,92 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 13/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////// +// Contains the definitions of the kernel calling functions for // +// computation of regularization functionals and gradients // +// functions for CUDA architectures. // +////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMVA/DNN/Architectures/Cuda/Device.h" +#include "Kernels.cuh" + +namespace TMVA { +namespace DNN { + +//______________________________________________________________________________ +template<typename AFloat> +AFloat TCuda<AFloat>::L1Regularization(const TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(A); + cudaStream_t s = A.GetComputeStream(); + TCudaMatrix<AFloat>::ResetDeviceReturn(); + ::TMVA::DNN::Cuda::AbsoluteSum<<<gridDims, blockDims, 0, s>>>( + TCudaMatrix<AFloat>::GetDeviceReturnPointer(), + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); + return TCudaMatrix<AFloat>::GetDeviceReturn(); +} + +//______________________________________________________________________________ +template<typename AFloat> +void TCuda<AFloat>::AddL1RegularizationGradients(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A, + AFloat weightDecay) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::AddL1RegularizationGradients<<<gridDims, blockDims, 0, s>>>( + B.GetDataPointer(), + A.GetDataPointer(), + weightDecay, + (int) A.GetNrows(), + (int) A.GetNcols()); +} + +//______________________________________________________________________________ +template<typename AFloat> +AFloat TCuda<AFloat>::L2Regularization(const TCudaMatrix<AFloat> & A) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(A); + cudaStream_t s = A.GetComputeStream(); + TCudaMatrix<AFloat>::ResetDeviceReturn(); + ::TMVA::DNN::Cuda::SquaredSum<<<gridDims, blockDims, 0, s>>>( + TCudaMatrix<AFloat>::GetDeviceReturnPointer(), + A.GetDataPointer(), + (int) A.GetNrows(), + (int) A.GetNcols()); + return TCudaMatrix<AFloat>::GetDeviceReturn(); +} + +//______________________________________________________________________________ +template<typename 
AFloat> +void TCuda<AFloat>::AddL2RegularizationGradients(TCudaMatrix<AFloat> & B, + const TCudaMatrix<AFloat> & A, + AFloat weightDecay) +{ + dim3 blockDims = TDevice::BlockDims(); + dim3 gridDims = TDevice::GridDims(B); + cudaStream_t s = A.GetComputeStream(); + ::TMVA::DNN::Cuda::AddL2RegularizationGradients<<<gridDims, blockDims, 0, s>>>( + B.GetDataPointer(), + A.GetDataPointer(), + weightDecay, + (int) A.GetNrows(), + (int) A.GetNcols()); +} + +} // namspace DNN +} // namspace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Reference.cxx b/tmva/tmva/src/DNN/Architectures/Reference.cxx new file mode 100644 index 0000000000000000000000000000000000000000..0d8eaf2b422fe0c1f853da8d9bf2ece8d7f8d47e --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Reference.cxx @@ -0,0 +1,32 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 10/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////////////// +// Explicit instantiation of the TReference architecture class // +// template for Double_t scalar types. // +//////////////////////////////////////////////////////////////// + +#include <iostream> +#include "TMVA/DNN/Architectures/Reference.h" + +#include "Reference/Propagation.cxx" +#include "Reference/ActivationFunctions.cxx" +#include "Reference/OutputFunctions.cxx" +#include "Reference/LossFunctions.cxx" +#include "Reference/Regularization.cxx" +#include "Reference/Initialization.cxx" +#include "Reference/Dropout.cxx" + +namespace TMVA { +namespace DNN { +template class TReference<Double_t>; +} // namespace TMVA +} // namespace DNN diff --git a/tmva/tmva/src/DNN/Architectures/Reference/ActivationFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Reference/ActivationFunctions.cxx new file mode 100644 index 0000000000000000000000000000000000000000..e4c2caa52d49903fdee801caed582e51780829a2 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Reference/ActivationFunctions.cxx @@ -0,0 +1,237 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 10/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + + ////////////////////////////////////////////////////////////////// + // Implementation of the activation functions for the reference // + // implementation. 
// + ////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Reference.h" +#include <math.h> + +namespace TMVA +{ +namespace DNN +{ + +//______________________________________________________________________________ +template<typename Real_t> +void TReference<Real_t>::IdentityDerivative(TMatrixT<Real_t> & B, + const TMatrixT<Real_t> &/*A*/) +{ + size_t m,n; + m = B.GetNrows(); + n = B.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + B(i,j) = 1.0; + } + } +} + +//______________________________________________________________________________ +template<typename Real_t> +void TReference<Real_t>::Relu(TMatrixT<Real_t> &A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + A(i,j) = std::max((Real_t) 0.0, A(i,j)); + } + } +} + +//______________________________________________________________________________ +template<typename Real_t> +inline void TReference<Real_t>::ReluDerivative(TMatrixT<Real_t> & B, + const TMatrixT<Real_t> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + for (size_t i = 0; i < m; i++) + { + for (size_t j = 0; j < n; j++) + { + B(i,j) = (A(i,j) < 0) ? 0.0 : 1.0; + } + } +} + +//______________________________________________________________________________ +template<typename Real_t> +void TReference<Real_t>::Sigmoid(TMatrixT<Real_t> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + Real_t sig = 1.0 / (1.0 + std::exp(-A(i,j))); + A(i,j) = sig; + } + } +} + +//______________________________________________________________________________ +template<typename Real_t> +inline void TReference<Real_t>::SigmoidDerivative(TMatrixT<Real_t> & B, + const TMatrixT<Real_t> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + Real_t sig = 1.0 / (1.0 + std::exp(-A(i,j))); + B(i,j) = sig * (1.0 - sig); + } + } +} + +//______________________________________________________________________________ +template<typename Real_t> +inline void TReference<Real_t>::Tanh(TMatrixT<Real_t> & B) +{ + size_t m,n; + m = B.GetNrows(); + n = B.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + Real_t t = tanh(B(i,j)); + B(i,j) = t; + } + } +} + +//______________________________________________________________________________ +template<typename Real_t> +inline void TReference<Real_t>::TanhDerivative(TMatrixT<Real_t> & B, + const TMatrixT<Real_t> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + Real_t t = tanh(A(i,j)); + B(i,j) = 1 - t * t; + } + } +} + +//______________________________________________________________________________ +template<typename Real_t> +inline void TReference<Real_t>::SymmetricRelu(TMatrixT<Real_t> & B) +{ + size_t m,n; + m = B.GetNrows(); + n = B.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + B(i,j) = fabs(B(i,j)); + } + } +} + +//______________________________________________________________________________ +template<typename Real_t> +inline void TReference<Real_t>::SymmetricReluDerivative(TMatrixT<Real_t> & B, + const TMatrixT<Real_t> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + B(i,j) = (A(i,j) < 
0.0) ? -1.0 : 1.0; + } + } +} + +//______________________________________________________________________________ +template<typename Real_t> +inline void TReference<Real_t>::SoftSign(TMatrixT<Real_t> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + Real_t x = A(i,j); + A(i,j) = x / (1 + fabs(x)); + } + } +} + +//______________________________________________________________________________ +template<typename Real_t> +inline void TReference<Real_t>::SoftSignDerivative(TMatrixT<Real_t> & B, + const TMatrixT<Real_t> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + Real_t x = 1.0 + fabs(A(i,j)); + B(i,j) = 1.0 / (x * x); + } + } +} + +//______________________________________________________________________________ +template<typename Real_t> +inline void TReference<Real_t>::Gauss(TMatrixT<Real_t> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + Real_t x = A(i,j); + A(i,j) = exp(- x * x); + } + } +} + +//______________________________________________________________________________ +template<typename Real_t> +inline void TReference<Real_t>::GaussDerivative(TMatrixT<Real_t> & B, + const TMatrixT<Real_t> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + Real_t x = A(i,j); + B(i,j) = - 2.0 * x * exp(- x * x); + } + } +} +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Reference/Dropout.cxx b/tmva/tmva/src/DNN/Architectures/Reference/Dropout.cxx new file mode 100644 index 0000000000000000000000000000000000000000..04e14811436c0518bdbc677cba1cd3a63372d12c --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Reference/Dropout.cxx @@ -0,0 +1,50 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 10/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + + ////////////////////////////////////////////////////////////////// + // Implementation of the activation functions for the reference // + // implementation. 
// + ////////////////////////////////////////////////////////////////// + + +#include "TMVA/DNN/Architectures/Reference.h" +#include "TRandom.h" + +namespace TMVA +{ +namespace DNN +{ + +//______________________________________________________________________________ + +template<typename Real_t> +void TReference<Real_t>::Dropout(TMatrixT<Real_t> & B, Real_t dropoutProbability) +{ + size_t m,n; + m = B.GetNrows(); + n = B.GetNcols(); + + TRandom rand(time(nullptr)); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + Real_t r = rand.Uniform(); + if (r >= dropoutProbability) { + B(i,j) = 0.0; + } else { + B(i,j) /= dropoutProbability; + } + } + } +} + +} +} diff --git a/tmva/tmva/src/DNN/Architectures/Reference/Initialization.cxx b/tmva/tmva/src/DNN/Architectures/Reference/Initialization.cxx new file mode 100644 index 0000000000000000000000000000000000000000..789f08773abcdfb2db099315d40f408484024a9a --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Reference/Initialization.cxx @@ -0,0 +1,97 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 10/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + + ////////////////////////////////////////////////////////////////////// + // Implementation of the initialization functions for the reference // + // implementation. // + ////////////////////////////////////////////////////////////////////// + +#include "TRandom.h" +#include "TMVA/DNN/Architectures/Reference.h" + +namespace TMVA +{ +namespace DNN +{ + +//______________________________________________________________________________ +template<typename Real_t> +void TReference<Real_t>::InitializeGauss(TMatrixT<Real_t> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + TRandom rand(time(nullptr)); + + Real_t sigma = sqrt(2.0 / ((Real_t) n)); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + A(i,j) = rand.Gaus(0.0, sigma); + } + } +} + +//______________________________________________________________________________ +template<typename Real_t> +void TReference<Real_t>::InitializeUniform(TMatrixT<Real_t> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + TRandom rand(time(nullptr)); + + Real_t range = sqrt(2.0 / ((Real_t) n)); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + A(i,j) = rand.Uniform(-range, range); + } + } +} + +//______________________________________________________________________________ +template<typename Real_t> +void TReference<Real_t>::InitializeIdentity(TMatrixT<Real_t> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + A(i,j) = 0.0; + } + + if (i < n) { + A(i,i) = 1.0; + } + } +} + +template<typename Real_t> +void TReference<Real_t>::InitializeZero(TMatrixT<Real_t> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + for (size_t i = 0; i < m ; i++) { + for (size_t j = 0; j < n ; j++) { + A(i,j) = 0.0; + } + } +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx new file mode 100644 index 
0000000000000000000000000000000000000000..aa0b144be2053fa411cb04a6a2a82c9b425ceb89 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Reference/LossFunctions.cxx @@ -0,0 +1,101 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 10/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + + //////////////////////////////////////////////////////////// + // Implementation of the loss functions for the reference // + // implementation. // + //////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Reference.h" + +namespace TMVA +{ +namespace DNN +{ +//______________________________________________________________________________ +template<typename Real_t> +Real_t TReference<Real_t>::MeanSquaredError(const TMatrixT<Real_t> &Y, + const TMatrixT<Real_t> &output) +{ + size_t m,n; + m = Y.GetNrows(); + n = Y.GetNcols(); + Real_t result = 0.0; + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + Real_t dY = (Y(i,j) - output(i,j)); + result += dY * dY; + } + } + result /= (Real_t) (m * n); + return result; +} + +//______________________________________________________________________________ +template<typename Real_t> +void TReference<Real_t>::MeanSquaredErrorGradients(TMatrixT<Real_t> & dY, + const TMatrixT<Real_t> & Y, + const TMatrixT<Real_t> & output) +{ + size_t m,n; + m = Y.GetNrows(); + n = Y.GetNcols(); + + dY.Minus(Y, output); + dY *= - 2.0 / ((Real_t) (m*n)); +} + +//______________________________________________________________________________ +template<typename Real_t> +Real_t TReference<Real_t>::CrossEntropy(const TMatrixT<Real_t> &Y, + const TMatrixT<Real_t> &output) +{ + size_t m,n; + m = Y.GetNrows(); + n = Y.GetNcols(); + Real_t result = 0.0; + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + Real_t sig = 1.0 / (1.0 + std::exp(-output(i,j))); + result += Y(i,j) * std::log(sig) + + (1.0 - Y(i,j)) * std::log(1.0 - sig); + } + } + result /= - (Real_t) (m * n); + return result; +} + +//______________________________________________________________________________ +template<typename Real_t> +void TReference<Real_t>::CrossEntropyGradients(TMatrixT<Real_t> & dY, + const TMatrixT<Real_t> & Y, + const TMatrixT<Real_t> & output) +{ + size_t m,n; + m = Y.GetNrows(); + n = Y.GetNcols(); + + Real_t norm = 1.0 / ((Real_t) (m * n)); + for (size_t i = 0; i < m; i++) + { + for (size_t j = 0; j < n; j++) + { + Real_t y = Y(i,j); + Real_t sig = 1.0 / (1.0 + std::exp(-output(i,j))); + dY(i,j) = norm * (sig - y); + } + } +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx b/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx new file mode 100644 index 0000000000000000000000000000000000000000..731c95713d404a49942e8ecc6c86e9f2c51c340f --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Reference/OutputFunctions.cxx @@ -0,0 +1,37 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 11/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. 
* + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////////////// +// Explicit instantiation of the TReference architecture class // +// template for Double_t scalar types. // +//////////////////////////////////////////////////////////////// + +namespace TMVA { +namespace DNN { + +template<typename Real_t> +void TReference<Real_t>::Sigmoid(TMatrixT<Real_t> & B, + const TMatrixT<Real_t> & A) +{ + size_t m,n; + m = A.GetNrows(); + n = A.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + Real_t sig = 1.0 / (1.0 + std::exp(-A(i,j))); + B(i,j) = sig; + } + } +} + +} // namespace TMVA +} // namespace DNN diff --git a/tmva/tmva/src/DNN/Architectures/Reference/Propagation.cxx b/tmva/tmva/src/DNN/Architectures/Reference/Propagation.cxx new file mode 100644 index 0000000000000000000000000000000000000000..aa4d3515d8ccd8162d49f8275344da167e450363 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Reference/Propagation.cxx @@ -0,0 +1,102 @@ +// @(#)root/tmva/tmva/dnn:$Id$ // Author: Simon Pfreundschuh 10/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +///////////////////////////////////////////////////////////////////// +// Implementation of the functions required for the forward and // +// backward propagation of activations through a neural network in // +// the reference implementation. // +///////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Reference.h" + +namespace TMVA +{ +namespace DNN +{ + +template<typename Scalar_t> +void TReference<Scalar_t>::MultiplyTranspose(TMatrixT<Scalar_t> &output, + const TMatrixT<Scalar_t> &input, + const TMatrixT<Scalar_t> &weights) +{ + output.MultT(input, weights); +} + +template<typename Scalar_t> +void TReference<Scalar_t>::AddRowWise(TMatrixT<Scalar_t> &output, + const TMatrixT<Scalar_t> &biases) +{ + for (size_t i = 0; i < (size_t) output.GetNrows(); i++) { + for (size_t j = 0; j < (size_t) output.GetNcols(); j++) { + output(i,j) += biases(j,0); + } + } +} + +template<typename Scalar_t> +void TReference<Scalar_t>::Backward(TMatrixT<Scalar_t> & activation_gradients_backward, + TMatrixT<Scalar_t> & weight_gradients, + TMatrixT<Scalar_t> & bias_gradients, + TMatrixT<Scalar_t> & df, + const TMatrixT<Scalar_t> & activation_gradients, + const TMatrixT<Scalar_t> & weights, + const TMatrixT<Scalar_t> & activations_backward) +{ + + // Compute element-wise product. + for (size_t i = 0; i < (size_t) df.GetNrows(); i++) { + for (size_t j = 0; j < (size_t) df.GetNcols(); j++) { + df(i,j) *= activation_gradients(i,j); + } + } + + // Activation gradients. + if (activation_gradients_backward.GetNoElements() > 0) { + activation_gradients_backward.Mult(df, weights); + } + + // Weights gradients. + if (weight_gradients.GetNoElements() > 0) { + weight_gradients.TMult(df, activations_backward); + } + + // Bias gradients. 
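+   // Each bias gradient is accumulated explicitly as a column sum of df,
+   // i.e. bias_gradients(j,0) = sum_i df(i,j), summing over the events i.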
+ if (bias_gradients.GetNoElements() > 0) { + for (size_t j = 0; j < (size_t) df.GetNcols(); j++) { + Scalar_t sum = 0.0; + for (size_t i = 0; i < (size_t) df.GetNrows(); i++) { + sum += df(i,j); + } + bias_gradients(j,0) = sum; + } + } +} + +template<typename Scalar_t> +void TReference<Scalar_t>::ScaleAdd(TMatrixT<Scalar_t> & A, + const TMatrixT<Scalar_t> & B, + Scalar_t beta) +{ + for (size_t i = 0; i < (size_t) A.GetNrows(); i++) { + for (size_t j = 0; j < (size_t) A.GetNcols(); j++) { + A(i,j) += beta * B(i,j); + } + } +} + +template<typename Scalar_t> +void TReference<Scalar_t>::Copy(TMatrixT<Scalar_t> & A, + const TMatrixT<Scalar_t> & B) +{ + A = B; +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/Architectures/Reference/Regularization.cxx b/tmva/tmva/src/DNN/Architectures/Reference/Regularization.cxx new file mode 100644 index 0000000000000000000000000000000000000000..ffe0f19ec7db19e5401bc304711c0ec3a2d6ddb8 --- /dev/null +++ b/tmva/tmva/src/DNN/Architectures/Reference/Regularization.cxx @@ -0,0 +1,98 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 10/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + + ////////////////////////////////////////////////////////////////////// + // Implementation of the regularization functions for the reference // + // implementation. // + ////////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Reference.h" + +namespace TMVA +{ +namespace DNN +{ + +//______________________________________________________________________________ +template<typename Real_t> +Real_t TReference<Real_t>::L1Regularization(const TMatrixT<Real_t> & W) +{ + size_t m,n; + m = W.GetNrows(); + n = W.GetNcols(); + + Real_t result = 0.0; + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + result += std::abs(W(i,j)); + } + } + return result; +} + +//______________________________________________________________________________ +template<typename Real_t> +void TReference<Real_t>::AddL1RegularizationGradients(TMatrixT<Real_t> & A, + const TMatrixT<Real_t> & W, + Real_t weightDecay) +{ + size_t m,n; + m = W.GetNrows(); + n = W.GetNcols(); + + Real_t sign = 0.0; + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + sign = (W(i,j) > 0.0) ? 
1.0 : -1.0; + A(i,j) += sign * weightDecay; + } + } +} + +//______________________________________________________________________________ +template<typename Real_t> +Real_t TReference<Real_t>::L2Regularization(const TMatrixT<Real_t> & W) +{ + size_t m,n; + m = W.GetNrows(); + n = W.GetNcols(); + + Real_t result = 0.0; + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + result += W(i,j) * W(i,j); + } + } + return result; +} + +//______________________________________________________________________________ +template<typename Real_t> +void TReference<Real_t>::AddL2RegularizationGradients(TMatrixT<Real_t> & A, + const TMatrixT<Real_t> & W, + Real_t weightDecay) +{ + size_t m,n; + m = W.GetNrows(); + n = W.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + A(i,j) += weightDecay * 2.0 * W(i,j); + } + } +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/src/DNN/DataLoader.cxx b/tmva/tmva/src/DNN/DataLoader.cxx new file mode 100644 index 0000000000000000000000000000000000000000..f07258dcb881e1c6c707fb7418014b8313f20c7f --- /dev/null +++ b/tmva/tmva/src/DNN/DataLoader.cxx @@ -0,0 +1,18 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 08/08/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////////////////// +// Implementation of the generic data loader for neural network input data. // +////////////////////////////////////////////////////////////////////////////// + +namespace TMVA { +namespace DNN { + diff --git a/tmva/tmva/src/MethodDNN.cxx b/tmva/tmva/src/MethodDNN.cxx index d3f0e547ca3497ead30a589f0041157fb8685ae6..7ee53e1a9313b34aff23e5c286b8872c5774afc6 100644 --- a/tmva/tmva/src/MethodDNN.cxx +++ b/tmva/tmva/src/MethodDNN.cxx @@ -4,14 +4,15 @@ /********************************************************************************** * Project: TMVA - a Root-integrated toolkit for multivariate data analysis * * Package: TMVA * - * Class : MethodDNN * + * Class : MethodDNN * * Web : http://tmva.sourceforge.net * * * * Description: * * A neural network implementation * * * * Authors (alphabetical): * - * Peter Speckmayer <peter.speckmayer@gmx.ch> - CERN, Switzerland * + * Simon Pfreundschuh <s.pfreundschuh@gmail.com> - CERN, Switzerland * + * Peter Speckmayer <peter.speckmayer@gmx.ch> - CERN, Switzerland * * * * Copyright (c) 2005-2015: * * CERN, Switzerland * @@ -24,10 +25,10 @@ * (http://tmva.sourceforge.net/LICENSE) * **********************************************************************************/ -//_______________________________________________________________________ +//______________________________________________________________________________ // -// neural network implementation -//_______________________________________________________________________ +// Deep Neural Network Implementation +//______________________________________________________________________________ #include "TString.h" #include "TTree.h" @@ -42,44 +43,34 @@ #include "TMVA/Config.h" #include "TMVA/Ranking.h" +#include "TMVA/DNN/Net.h" +#include "TMVA/DNN/Architectures/Reference.h" + #include "TMVA/NeuralNet.h" #include "TMVA/Monitoring.h" #include <algorithm> 
#include <iostream> +#include <string> +#include <iomanip> REGISTER_METHOD(DNN) ClassImp(TMVA::MethodDNN) +using TMVA::DNN::EActivationFunction; +using TMVA::DNN::ELossFunction; +using TMVA::DNN::EInitialization; +using TMVA::DNN::EOutputFunction; - - - namespace TMVA - { - namespace DNN - { - template <typename Container, typename T> - void gaussDistribution (Container& container, T mean, T sigma) - { - for (auto it = begin (container), itEnd = end (container); it != itEnd; ++it) - { - (*it) = DNN::gaussDouble (mean, sigma); - } - } - }; - }; - - - - - +namespace TMVA +{ //______________________________________________________________________________ -TMVA::MethodDNN::MethodDNN( const TString& jobName, - const TString& methodTitle, - DataSetInfo& theData, - const TString& theOption ) +TMVA::MethodDNN::MethodDNN(const TString& jobName, + const TString& methodTitle, + DataSetInfo& theData, + const TString& theOption) : MethodBase( jobName, Types::kDNN, methodTitle, theData, theOption) , fResume (false) { @@ -87,10 +78,9 @@ TMVA::MethodDNN::MethodDNN( const TString& jobName, } //______________________________________________________________________________ -TMVA::MethodDNN::MethodDNN( DataSetInfo& theData, - const TString& theWeightFile) - : MethodBase( Types::kDNN, theData, theWeightFile) - , fResume (false) +TMVA::MethodDNN::MethodDNN(DataSetInfo& theData, + const TString& theWeightFile) + : MethodBase( Types::kDNN, theData, theWeightFile), fResume (false) { // constructor from a weight file } @@ -102,10 +92,13 @@ TMVA::MethodDNN::~MethodDNN() // nothing to be done } -//_______________________________________________________________________ -Bool_t TMVA::MethodDNN::HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t /*numberTargets*/ ) +//______________________________________________________________________________ +Bool_t TMVA::MethodDNN::HasAnalysisType(Types::EAnalysisType type, + UInt_t numberClasses, + UInt_t /*numberTargets*/ ) { - // MLP can handle classification with 2 classes and regression with one regression-target + // MLP can handle classification with 2 classes and regression with + // one regression-target if (type == Types::kClassification && numberClasses == 2 ) return kTRUE; if (type == Types::kMulticlass ) return kTRUE; if (type == Types::kRegression ) return kTRUE; @@ -119,221 +112,254 @@ void TMVA::MethodDNN::Init() // default initializations } -//_______________________________________________________________________ +//______________________________________________________________________________ void TMVA::MethodDNN::DeclareOptions() { - // define the options (their key words) that can be set in the option string - // know options: - // TrainingMethod <string> Training method - // available values are: BP Back-Propagation <default> - // GA Genetic Algorithm (takes a LONG time) - // - // LearningRate <float> DNN learning rate parameter - // DecayRate <float> Decay rate for learning parameter - // TestRate <int> Test for overtraining performed at each #th epochs + // Options to be set in the option string: // - // BPMode <string> Back-propagation learning mode - // available values are: sequential <default> - // batch - // - // BatchSize <int> Batch size: number of events/batch, only set if in Batch Mode, - // -1 for BatchSize=number_of_events - - // DeclareOptionRef(fTrainMethodS="SD", "TrainingMethod", - // "Train with back propagation steepest descend"); - // AddPreDefVal(TString("SD")); - - // 
DeclareOptionRef(fLayoutString="TANH|(N+30)*2,TANH|(N+30),LINEAR", "Layout", "neural network layout"); - // DeclareOptionRef(fLayoutString="RELU|(N+20)*2,RELU|(N+10)*2,LINEAR", "Layout", "neural network layout"); - DeclareOptionRef(fLayoutString="SOFTSIGN|(N+100)*2,LINEAR", "Layout", "neural network layout"); - - - DeclareOptionRef(fErrorStrategy="CROSSENTROPY", "ErrorStrategy", "error strategy (regression: sum of squares; classification: crossentropy; multiclass: crossentropy/mutual exclusive cross entropy"); + // LearningRate <float> DNN learning rate parameter. + // DecayRate <float> Decay rate for learning parameter. + // TestRate <int> Period of validation set error computation. + // BatchSize <int> Number of events per batch. + + DeclareOptionRef(fLayoutString="SOFTSIGN|(N+100)*2,LINEAR", + "Layout", + "Layout of the network."); + + DeclareOptionRef(fErrorStrategy="CROSSENTROPY", + "ErrorStrategy", + "Loss function: Mean squared error (regression)" + " or cross entropy (binary classification)."); AddPreDefVal(TString("CROSSENTROPY")); AddPreDefVal(TString("SUMOFSQUARES")); - AddPreDefVal(TString("MUTUALEXCLUSIVE")); - AddPreDefVal(TString("CHECKGRADIENTS")); - - DeclareOptionRef(fWeightInitializationStrategyString="XAVIER", "WeightInitialization", "Weight initialization strategy"); + DeclareOptionRef(fWeightInitializationString="XAVIER", + "WeightInitialization", + "Weight initialization strategy"); AddPreDefVal(TString("XAVIER")); AddPreDefVal(TString("XAVIERUNIFORM")); - AddPreDefVal(TString("LAYERSIZE")); - - - DeclareOptionRef(fTrainingStrategy="LearningRate=1e-1,Momentum=0.3,Repetitions=3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Renormalize=L2,DropConfig=0.0,DropRepetitions=5|LearningRate=1e-4,Momentum=0.3,Repetitions=3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Renormalize=L2,DropConfig=0.0+0.5+0.5,DropRepetitions=5,Multithreading=True", "TrainingStrategy", "defines the training strategies"); - DeclareOptionRef(fSumOfSigWeights_test=1000.0, "SignalWeightsSum", "Sum of weights of signal; Is used to compute the significance on the fly"); - DeclareOptionRef(fSumOfBkgWeights_test=1000.0, "BackgroundWeightsSum", "Sum of weights of background; Is used to compute the significance on the fly"); + DeclareOptionRef(fArchitectureString="STANDARD", + "Architecture", + "Which architecture to perform the training on."); + AddPreDefVal(TString("STANDARD")); + AddPreDefVal(TString("CPU")); + AddPreDefVal(TString("GPU")); + AddPreDefVal(TString("OPENCL")); + + DeclareOptionRef( + fTrainingStrategyString = "LearningRate=1e-1," + "Momentum=0.3," + "Repetitions=3," + "ConvergenceSteps=50," + "BatchSize=30," + "TestRepetitions=7," + "WeightDecay=0.0," + "Renormalize=L2," + "DropConfig=0.0," + "DropRepetitions=5|LearningRate=1e-4," + "Momentum=0.3," + "Repetitions=3," + "ConvergenceSteps=50," + "BatchSize=20," + "TestRepetitions=7," + "WeightDecay=0.001," + "Renormalize=L2," + "DropConfig=0.0+0.5+0.5," + "DropRepetitions=5," + "Multithreading=True", + "TrainingStrategy", + "Defines the training strategies."); } - -std::vector<std::pair<int,TMVA::DNN::EnumFunction>> TMVA::MethodDNN::ParseLayoutString(TString layerSpec) +//______________________________________________________________________________ +auto TMVA::MethodDNN::ParseLayoutString(TString layoutString) + -> LayoutVector_t { // parse layout specification string and return a vector, each entry // containing the number of neurons to go in each successive layer - 
std::vector<std::pair<int,TMVA::DNN::EnumFunction>> layout; - const TString delim_Layer (","); - const TString delim_Sub ("|"); + LayoutVector_t layout; + const TString layerDelimiter(","); + const TString subDelimiter("|"); - const size_t inputSize = GetNvar (); + const size_t inputSize = GetNvar(); - TObjArray* layerStrings = layerSpec.Tokenize (delim_Layer); - TIter nextLayer (layerStrings); + TObjArray* layerStrings = layoutString.Tokenize(layerDelimiter); + TIter nextLayer (layerStrings); TObjString* layerString = (TObjString*)nextLayer (); - for (; layerString != NULL; layerString = (TObjString*)nextLayer ()) - { - int numNodes = 0; - TMVA::DNN::EnumFunction eActivationFunction = DNN::EnumFunction::TANH; - - TObjArray* subStrings = layerString->GetString ().Tokenize (delim_Sub); - TIter nextToken (subStrings); - TObjString* token = (TObjString*)nextToken (); - int idxToken = 0; - for (; token != NULL; token = (TObjString*)nextToken ()) - { - switch (idxToken) - { - case 0: - { - TString strActFnc (token->GetString ()); - if (strActFnc == "RELU") - eActivationFunction = DNN::EnumFunction::RELU; - else if (strActFnc == "TANH") - eActivationFunction = DNN::EnumFunction::TANH; - else if (strActFnc == "SYMMRELU") - eActivationFunction = DNN::EnumFunction::SYMMRELU; - else if (strActFnc == "SOFTSIGN") - eActivationFunction = DNN::EnumFunction::SOFTSIGN; - else if (strActFnc == "SIGMOID") - eActivationFunction = DNN::EnumFunction::SIGMOID; - else if (strActFnc == "LINEAR") - eActivationFunction = DNN::EnumFunction::LINEAR; - else if (strActFnc == "GAUSS") - eActivationFunction = DNN::EnumFunction::GAUSS; - } - break; - case 1: // number of nodes - { - TString strNumNodes (token->GetString ()); - TString strN ("x"); - strNumNodes.ReplaceAll ("N", strN); - strNumNodes.ReplaceAll ("n", strN); - TFormula fml ("tmp",strNumNodes); - numNodes = fml.Eval (inputSize); - } - break; - } - ++idxToken; + + for (; layerString != nullptr; layerString = (TObjString*) nextLayer()) { + int numNodes = 0; + EActivationFunction activationFunction = EActivationFunction::kTanh; + + TObjArray* subStrings = layerString->GetString().Tokenize(subDelimiter); + TIter nextToken (subStrings); + TObjString* token = (TObjString *) nextToken(); + int idxToken = 0; + for (; token != nullptr; token = (TObjString *) nextToken()) { + switch (idxToken) + { + case 0: + { + TString strActFnc (token->GetString ()); + if (strActFnc == "RELU") { + activationFunction = DNN::EActivationFunction::kRelu; + } else if (strActFnc == "TANH") { + activationFunction = DNN::EActivationFunction::kTanh; + } else if (strActFnc == "SYMMRELU") { + activationFunction = DNN::EActivationFunction::kSymmRelu; + } else if (strActFnc == "SOFTSIGN") { + activationFunction = DNN::EActivationFunction::kSoftSign; + } else if (strActFnc == "SIGMOID") { + activationFunction = DNN::EActivationFunction::kSigmoid; + } else if (strActFnc == "LINEAR") { + activationFunction = DNN::EActivationFunction::kIdentity; + } else if (strActFnc == "GAUSS") { + activationFunction = DNN::EActivationFunction::kGauss; } - layout.push_back (std::make_pair (numNodes,eActivationFunction)); + } + break; + case 1: // number of nodes + { + TString strNumNodes (token->GetString ()); + TString strN ("x"); + strNumNodes.ReplaceAll ("N", strN); + strNumNodes.ReplaceAll ("n", strN); + TFormula fml ("tmp",strNumNodes); + numNodes = fml.Eval (inputSize); + } + break; + } + ++idxToken; + } + layout.push_back(std::make_pair(numNodes, activationFunction)); } return layout; } - - // parse key 
value pairs in blocks -> return vector of blocks with map of key value pairs -std::vector<std::map<TString,TString>> TMVA::MethodDNN::ParseKeyValueString(TString parseString, TString blockDelim, TString tokenDelim) +//______________________________________________________________________________ +auto TMVA::MethodDNN::ParseKeyValueString(TString parseString, + TString blockDelim, + TString tokenDelim) + -> KeyValueVector_t { - std::vector<std::map<TString,TString>> blockKeyValues; + KeyValueVector_t blockKeyValues; const TString keyValueDelim ("="); - // const size_t inputSize = GetNvar (); - TObjArray* blockStrings = parseString.Tokenize (blockDelim); TIter nextBlock (blockStrings); - TObjString* blockString = (TObjString*)nextBlock (); - for (; blockString != NULL; blockString = (TObjString*)nextBlock ()) + TObjString* blockString = (TObjString *) nextBlock(); + + for (; blockString != nullptr; blockString = (TObjString *) nextBlock()) + { + blockKeyValues.push_back (std::map<TString,TString>()); + std::map<TString,TString>& currentBlock = blockKeyValues.back (); + + TObjArray* subStrings = blockString->GetString ().Tokenize (tokenDelim); + TIter nextToken (subStrings); + TObjString* token = (TObjString*)nextToken (); + + for (; token != nullptr; token = (TObjString *)nextToken()) { - blockKeyValues.push_back (std::map<TString,TString> ()); // new block - std::map<TString,TString>& currentBlock = blockKeyValues.back (); - - TObjArray* subStrings = blockString->GetString ().Tokenize (tokenDelim); - TIter nextToken (subStrings); - TObjString* token = (TObjString*)nextToken (); - - for (; token != NULL; token = (TObjString*)nextToken ()) - { - TString strKeyValue (token->GetString ()); - int delimPos = strKeyValue.First (keyValueDelim.Data ()); - if (delimPos <= 0) - continue; - - TString strKey = TString (strKeyValue (0, delimPos)); - strKey.ToUpper (); - TString strValue = TString (strKeyValue (delimPos+1, strKeyValue.Length ())); - - strKey.Strip (TString::kBoth, ' '); - strValue.Strip (TString::kBoth, ' '); - - currentBlock.insert (std::make_pair (strKey, strValue)); - } + TString strKeyValue (token->GetString ()); + int delimPos = strKeyValue.First (keyValueDelim.Data ()); + if (delimPos <= 0) + continue; + + TString strKey = TString (strKeyValue (0, delimPos)); + strKey.ToUpper(); + TString strValue = TString (strKeyValue (delimPos+1, strKeyValue.Length ())); + + strKey.Strip (TString::kBoth, ' '); + strValue.Strip (TString::kBoth, ' '); + + currentBlock.insert (std::make_pair (strKey, strValue)); } + } return blockKeyValues; } - -TString fetchValue (const std::map<TString, TString>& keyValueMap, TString _key) +//______________________________________________________________________________ +TString fetchValue (const std::map<TString, TString>& keyValueMap, TString key) { - TString key (_key); key.ToUpper (); std::map<TString, TString>::const_iterator it = keyValueMap.find (key); - if (it == keyValueMap.end ()) + if (it == keyValueMap.end()) { return TString (""); + } return it->second; } +//______________________________________________________________________________ template <typename T> -T fetchValue (const std::map<TString,TString>& keyValueMap, TString key, T defaultValue); +T fetchValue(const std::map<TString,TString>& keyValueMap, + TString key, + T defaultValue); +//______________________________________________________________________________ template <> -int fetchValue (const std::map<TString,TString>& keyValueMap, TString key, int defaultValue) +int fetchValue(const 
std::map<TString,TString>& keyValueMap, + TString key, + int defaultValue) { TString value (fetchValue (keyValueMap, key)); - if (value == "") + if (value == "") { return defaultValue; + } return value.Atoi (); } +//______________________________________________________________________________ template <> -double fetchValue (const std::map<TString,TString>& keyValueMap, TString key, double defaultValue) +double fetchValue (const std::map<TString,TString>& keyValueMap, + TString key, double defaultValue) { TString value (fetchValue (keyValueMap, key)); - if (value == "") + if (value == "") { return defaultValue; + } return value.Atof (); } +//______________________________________________________________________________ template <> -TString fetchValue (const std::map<TString,TString>& keyValueMap, TString key, TString defaultValue) +TString fetchValue (const std::map<TString,TString>& keyValueMap, + TString key, TString defaultValue) { TString value (fetchValue (keyValueMap, key)); - if (value == "") + if (value == "") { return defaultValue; + } return value; } +//______________________________________________________________________________ template <> -bool fetchValue (const std::map<TString,TString>& keyValueMap, TString key, bool defaultValue) +bool fetchValue (const std::map<TString,TString>& keyValueMap, + TString key, bool defaultValue) { TString value (fetchValue (keyValueMap, key)); - if (value == "") + if (value == "") { return defaultValue; + } value.ToUpper (); - if (value == "TRUE" || - value == "T" || - value == "1") + if (value == "TRUE" || value == "T" || value == "1") { return true; + } return false; } +//______________________________________________________________________________ template <> -std::vector<double> fetchValue (const std::map<TString,TString>& keyValueMap, TString key, std::vector<double> defaultValue) +std::vector<double> fetchValue(const std::map<TString, TString> & keyValueMap, + TString key, + std::vector<double> defaultValue) { TString parseString (fetchValue (keyValueMap, key)); - if (parseString == "") + if (parseString == "") { return defaultValue; + } parseString.ToUpper (); std::vector<double> values; @@ -341,610 +367,776 @@ std::vector<double> fetchValue (const std::map<TString,TString>& keyValueMap, TS TObjArray* tokenStrings = parseString.Tokenize (tokenDelim); TIter nextToken (tokenStrings); TObjString* tokenString = (TObjString*)nextToken (); - for (; tokenString != NULL; tokenString = (TObjString*)nextToken ()) - { - std::stringstream sstr; - double currentValue; - sstr << tokenString->GetString ().Data (); - sstr >> currentValue; - values.push_back (currentValue); - } + for (; tokenString != NULL; tokenString = (TObjString*)nextToken ()) { + std::stringstream sstr; + double currentValue; + sstr << tokenString->GetString ().Data (); + sstr >> currentValue; + values.push_back (currentValue); + } return values; } - - -//_______________________________________________________________________ +//______________________________________________________________________________ void TMVA::MethodDNN::ProcessOptions() { - // process user options - // MethodBase::ProcessOptions(); - - if (fErrorStrategy == "CHECKGRADIENTS") - return checkGradients (); - - - if (IgnoreEventsWithNegWeightsInTraining()) { - Log() << kINFO + Log() << kINFO << "Will ignore negative events in training!" << Endl; } + // + // Set network structure. 
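+   // (For illustration only, not the default: a layout string such as "TANH|128,TANH|64,LINEAR" would give two tanh hidden layers of 128 and 64 neurons; the final entry is replaced below by an identity output layer whose width equals the number of targets, or 1 for classification.)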
+ // + fLayout = TMVA::MethodDNN::ParseLayoutString (fLayoutString); + size_t inputSize = GetNVariables (); + size_t outputSize = (GetNTargets() == 0) ? 1 : GetNTargets(); - // block-delimiter token-delimiter - std::vector<std::map<TString,TString>> strategyKeyValues = ParseKeyValueString (fTrainingStrategy, TString ("|"), TString (",")); + fNet.SetBatchSize(1); + fNet.SetInputWidth(inputSize); + auto itLayout = std::begin (fLayout); + auto itLayoutEnd = std::end (fLayout)-1; + for ( ; itLayout != itLayoutEnd; ++itLayout) { + fNet.AddLayer((*itLayout).first, (*itLayout).second); + } + fNet.AddLayer(outputSize, EActivationFunction::kIdentity); - if (fWeightInitializationStrategyString == "XAVIER") - fWeightInitializationStrategy = TMVA::DNN::WeightInitializationStrategy::XAVIER; - if (fWeightInitializationStrategyString == "XAVIERUNIFORM") - fWeightInitializationStrategy = TMVA::DNN::WeightInitializationStrategy::XAVIERUNIFORM; - else if (fWeightInitializationStrategyString == "LAYERSIZE") - fWeightInitializationStrategy = TMVA::DNN::WeightInitializationStrategy::LAYERSIZE; - else if (fWeightInitializationStrategyString == "TEST") - fWeightInitializationStrategy = TMVA::DNN::WeightInitializationStrategy::TEST; - else - fWeightInitializationStrategy = TMVA::DNN::WeightInitializationStrategy::XAVIER; + // + // Loss function and output. + // - // create settings + fOutputFunction = EOutputFunction::kSigmoid; if (fAnalysisType == Types::kClassification) - { - if (fErrorStrategy == "SUMOFSQUARES") fModeErrorFunction = TMVA::DNN::ModeErrorFunction::SUMOFSQUARES; - if (fErrorStrategy == "CROSSENTROPY") fModeErrorFunction = TMVA::DNN::ModeErrorFunction::CROSSENTROPY; - if (fErrorStrategy == "MUTUALEXCLUSIVE") fModeErrorFunction = TMVA::DNN::ModeErrorFunction::CROSSENTROPY_MUTUALEXCLUSIVE; + { + if (fErrorStrategy == "SUMOFSQUARES") { + fNet.SetLossFunction(ELossFunction::kMeanSquaredError); } - else if (fAnalysisType == Types::kMulticlass) - { - if (fErrorStrategy == "SUMOFSQUARES") fModeErrorFunction = TMVA::DNN::ModeErrorFunction::SUMOFSQUARES; - if (fErrorStrategy == "CROSSENTROPY") fModeErrorFunction = TMVA::DNN::ModeErrorFunction::CROSSENTROPY; - if (fErrorStrategy == "MUTUALEXCLUSIVE") fModeErrorFunction = TMVA::DNN::ModeErrorFunction::CROSSENTROPY_MUTUALEXCLUSIVE; + if (fErrorStrategy == "CROSSENTROPY") { + fNet.SetLossFunction(ELossFunction::kCrossEntropy); } - else if (fAnalysisType == Types::kRegression) - { - if (fErrorStrategy != "SUMOFSQUARES") - { - Log () << kWARNING - << "For regression only SUMOFSQUARES is a valid neural net error function." - << "Setting error function to SUMOFSQUARES now." - << Endl; - } - fModeErrorFunction = TMVA::DNN::ModeErrorFunction::SUMOFSQUARES; + fOutputFunction = EOutputFunction::kSigmoid; + } else if (fAnalysisType == Types::kRegression) { + if (fErrorStrategy != "SUMOFSQUARES") { + Log () << kWARNING << "For regression only SUMOFSQUARES is a valid " + << " neural net error function. Setting error function to " + << " SUMOFSQUARES now." 
<< Endl; } - - for (auto& block : strategyKeyValues) - { - size_t convergenceSteps = fetchValue (block, "ConvergenceSteps", 100); - int batchSize = fetchValue (block, "BatchSize", 30); - int testRepetitions = fetchValue (block, "TestRepetitions", 7); - double factorWeightDecay = fetchValue (block, "WeightDecay", 0.0); - TString regularization = fetchValue (block, "Regularization", TString ("NONE")); - double learningRate = fetchValue (block, "LearningRate", 1e-5); - double momentum = fetchValue (block, "Momentum", 0.3); - int repetitions = fetchValue (block, "Repetitions", 3); - TString strMultithreading = fetchValue (block, "Multithreading", TString ("True")); - std::vector<double> dropConfig; - dropConfig = fetchValue (block, "DropConfig", dropConfig); - int dropRepetitions = fetchValue (block, "DropRepetitions", 3); - - TMVA::DNN::EnumRegularization eRegularization = TMVA::DNN::EnumRegularization::NONE; - if (regularization == "L1") - eRegularization = TMVA::DNN::EnumRegularization::L1; - else if (regularization == "L2") - eRegularization = TMVA::DNN::EnumRegularization::L2; - else if (regularization == "L1MAX") - eRegularization = TMVA::DNN::EnumRegularization::L1MAX; - - - strMultithreading.ToUpper (); - bool multithreading = true; - if (strMultithreading.BeginsWith ("T")) - multithreading = true; - else - multithreading = false; - - - if (fAnalysisType == Types::kClassification) - { - std::shared_ptr<TMVA::DNN::ClassificationSettings> ptrSettings = make_shared <TMVA::DNN::ClassificationSettings> ( - GetName (), - convergenceSteps, batchSize, - testRepetitions, factorWeightDecay, - eRegularization, fScaleToNumEvents, TMVA::DNN::MinimizerType::fSteepest, - learningRate, - momentum, repetitions, multithreading); - ptrSettings->setWeightSums (fSumOfSigWeights_test, fSumOfBkgWeights_test); - fSettings.push_back (ptrSettings); - } - else if (fAnalysisType == Types::kMulticlass) - { - std::shared_ptr<TMVA::DNN::Settings> ptrSettings = make_shared <TMVA::DNN::Settings> ( - GetName (), - convergenceSteps, batchSize, - testRepetitions, factorWeightDecay, - eRegularization, TMVA::DNN::MinimizerType::fSteepest, - learningRate, - momentum, repetitions, multithreading); - fSettings.push_back (ptrSettings); - } - else if (fAnalysisType == Types::kRegression) - { - std::shared_ptr<TMVA::DNN::Settings> ptrSettings = make_shared <TMVA::DNN::Settings> ( - GetName (), - convergenceSteps, batchSize, - testRepetitions, factorWeightDecay, - eRegularization, TMVA::DNN::MinimizerType::fSteepest, - learningRate, - momentum, repetitions, multithreading); - fSettings.push_back (ptrSettings); - } + fNet.SetLossFunction(ELossFunction::kMeanSquaredError); + fOutputFunction = EOutputFunction::kIdentity; + } else if (fAnalysisType == Types::kMulticlass) { + if (fErrorStrategy == "SUMOFSQUARES") { + fNet.SetLossFunction(ELossFunction::kMeanSquaredError); + } + if (fErrorStrategy == "CROSSENTROPY") { + fNet.SetLossFunction(ELossFunction::kCrossEntropy); + } + if (fErrorStrategy == "MUTUALEXCLUSIVE") { + Log () << kFatal << "MUTUALEXCLUSIVE not yet implemented." 
<< Endl; + } + fOutputFunction = EOutputFunction::kSigmoid; + } - - if (dropRepetitions > 0 && !dropConfig.empty ()) - { - fSettings.back ()->setDropOut (std::begin (dropConfig), std::end (dropConfig), dropRepetitions); - } - + // + // Initialization + // + + if (fWeightInitializationString == "XAVIER") { + fWeightInitialization = DNN::EInitialization::kGauss; + } + else if (fWeightInitializationString == "XAVIERUNIFORM") { + fWeightInitialization = DNN::EInitialization::kUniform; + } + else { + fWeightInitialization = DNN::EInitialization::kGauss; + } + + // + // Training settings. + // + + KeyValueVector_t strategyKeyValues = ParseKeyValueString(fTrainingStrategyString, + TString ("|"), + TString (",")); + for (auto& block : strategyKeyValues) { + TTrainingSettings settings; + + settings.convergenceSteps = fetchValue(block, "ConvergenceSteps", 100); + settings.batchSize = fetchValue(block, "BatchSize", 30); + settings.testInterval = fetchValue(block, "TestRepetitions", 7); + settings.weightDecay = fetchValue(block, "WeightDecay", 0.0); + settings.learningRate = fetchValue(block, "LearningRate", 1e-5); + settings.momentum = fetchValue(block, "Momentum", 0.3); + settings.dropoutProbabilities = fetchValue(block, "DropConfig", + std::vector<Double_t>()); + + TString regularization = fetchValue(block, "Regularization", + TString ("NONE")); + if (regularization == "L1") { + settings.regularization = DNN::ERegularization::kL1; + } else if (regularization == "L2") { + settings.regularization = DNN::ERegularization::kL2; + } + + TString strMultithreading = fetchValue(block, "Multithreading", + TString ("True")); + if (strMultithreading.BeginsWith ("T")) { + settings.multithreading = true; + } else { + settings.multithreading = false; } + + fTrainingSettings.push_back(settings); + } } //______________________________________________________________________________ void TMVA::MethodDNN::Train() { - - fMonitoring = NULL; - // if (!fMonitoring) - // { - // fMonitoring = make_shared<Monitoring>(); - // fMonitoring->Start (); - // } - - // INITIALIZATION - // create pattern + if (fArchitectureString == "GPU") { + TrainGpu(); + return; + } else if (fArchitectureString == "OpenCL") { + Log() << kFATAL << "OpenCL backend not yet supported." << Endl; + return; + } else if (fArchitectureString == "CPU") { + TrainCpu<Double_t>(); + return; + } + + Log() << kINFO << "Using Standard Implementation." << Endl; + std::vector<Pattern> trainPattern; std::vector<Pattern> testPattern; const std::vector<TMVA::Event*>& eventCollectionTraining = GetEventCollection (Types::kTraining); const std::vector<TMVA::Event*>& eventCollectionTesting = GetEventCollection (Types::kTesting); - for (size_t iEvt = 0, iEvtEnd = eventCollectionTraining.size (); iEvt < iEvtEnd; ++iEvt) - { - const TMVA::Event* event = eventCollectionTraining.at (iEvt); - const std::vector<Float_t>& values = event->GetValues (); - if (fAnalysisType == Types::kClassification) - { - double outputValue = event->GetClass () == 0 ? 
0.9 : 0.1; - trainPattern.push_back (Pattern (values.begin (), values.end (), outputValue, event->GetWeight ())); - trainPattern.back ().addInput (1.0); // bias node - } - else - { - const std::vector<Float_t>& targets = event->GetTargets (); - trainPattern.push_back (Pattern (values.begin (), values.end (), targets.begin (), targets.end (), event->GetWeight ())); - trainPattern.back ().addInput (1.0); // bias node - } - } - - for (size_t iEvt = 0, iEvtEnd = eventCollectionTesting.size (); iEvt < iEvtEnd; ++iEvt) - { - const TMVA::Event* event = eventCollectionTesting.at (iEvt); - const std::vector<Float_t>& values = event->GetValues (); - if (fAnalysisType == Types::kClassification) - { - double outputValue = event->GetClass () == 0 ? 0.9 : 0.1; - testPattern.push_back (Pattern (values.begin (), values.end (), outputValue, event->GetWeight ())); - testPattern.back ().addInput (1.0); // bias node - } - else - { - const std::vector<Float_t>& targets = event->GetTargets (); - testPattern.push_back (Pattern (values.begin (), values.end (), targets.begin (), targets.end (), event->GetWeight ())); - testPattern.back ().addInput (1.0); // bias node - } + for (auto &event : eventCollectionTraining) { + const std::vector<Float_t>& values = event->GetValues(); + if (fAnalysisType == Types::kClassification) { + double outputValue = event->GetClass () == 0 ? 0.9 : 0.1; + trainPattern.push_back(Pattern (values.begin(), + values.end(), + outputValue, + event->GetWeight())); + trainPattern.back().addInput(1.0); + } else { + const std::vector<Float_t>& targets = event->GetTargets (); + trainPattern.push_back(Pattern(values.begin(), + values.end(), + targets.begin(), + targets.end(), + event->GetWeight ())); + trainPattern.back ().addInput (1.0); // bias node } + } - if (trainPattern.empty () || testPattern.empty ()) - return; - - // create net and weights - fNet.clear (); - fWeights.clear (); - - // if "resume" from saved weights - if (fResume) - { - std::cout << ".. resume" << std::endl; - // std::tie (fNet, fWeights) = ReadWeights (fFileName); + for (auto &event : eventCollectionTesting) { + const std::vector<Float_t>& values = event->GetValues(); + if (fAnalysisType == Types::kClassification) { + double outputValue = event->GetClass () == 0 ? 0.9 : 0.1; + testPattern.push_back(Pattern (values.begin(), + values.end(), + outputValue, + event->GetWeight())); + testPattern.back().addInput(1.0); + } else { + const std::vector<Float_t>& targets = event->GetTargets (); + testPattern.push_back(Pattern(values.begin(), + values.end(), + targets.begin(), + targets.end(), + event->GetWeight ())); + testPattern.back ().addInput (1.0); // bias node } - else // initialize weights and net - { - size_t inputSize = GetNVariables (); //trainPattern.front ().input ().size (); - size_t outputSize = fAnalysisType == Types::kClassification ? 1 : GetNTargets (); //trainPattern.front ().output ().size (); - fNet.setInputSize (inputSize + 1); // num vars + bias node - fNet.setOutputSize (outputSize); // num vars + bias node - - // configure neural net - auto itLayout = std::begin (fLayout), itLayoutEnd = std::end (fLayout)-1; // all layers except the last one - for ( ; itLayout != itLayoutEnd; ++itLayout) - { - fNet.addLayer (DNN::Layer ((*itLayout).first, (*itLayout).second)); - Log() << kINFO - << "Add Layer with " << (*itLayout).first << " nodes." 
- << Endl; - } + } - DNN::ModeOutputValues eModeOutputValues = DNN::ModeOutputValues::SIGMOID; - if (fAnalysisType == Types::kRegression) - { - eModeOutputValues = DNN::ModeOutputValues::DIRECT; - } - else if ((fAnalysisType == Types::kClassification || - fAnalysisType == Types::kMulticlass) && - fModeErrorFunction == TMVA::DNN::ModeErrorFunction::SUMOFSQUARES) - { - eModeOutputValues = DNN::ModeOutputValues::DIRECT; - } - fNet.addLayer (DNN::Layer (outputSize, (*itLayout).second, eModeOutputValues)); - Log() << kINFO - << "Add Layer with " << outputSize << " nodes." - << Endl << Endl; - fNet.setErrorFunction (fModeErrorFunction); - - size_t numWeights = fNet.numWeights (); - Log() << kINFO - << "Total number of Synapses = " - << numWeights - << Endl; - - // initialize weights - fNet.initializeWeights (fWeightInitializationStrategy, - std::back_inserter (fWeights)); + TMVA::DNN::Net net; + std::vector<double> weights; + + net.setInputSize(fNet.GetInputWidth() + 1); + net.setOutputSize(fNet.GetOutputWidth() + 1); + + for (size_t i = 0; i < fNet.GetDepth(); i++) { + EActivationFunction f = fNet.GetLayer(i).GetActivationFunction(); + EnumFunction g = EnumFunction::LINEAR; + switch(f) { + case EActivationFunction::kIdentity: g = EnumFunction::LINEAR; break; + case EActivationFunction::kRelu: g = EnumFunction::RELU; break; + case EActivationFunction::kSigmoid: g = EnumFunction::SIGMOID; break; + case EActivationFunction::kTanh: g = EnumFunction::TANH; break; + case EActivationFunction::kSymmRelu: g = EnumFunction::SYMMRELU; break; + case EActivationFunction::kSoftSign: g = EnumFunction::SOFTSIGN; break; + case EActivationFunction::kGauss: g = EnumFunction::GAUSS; break; } - - - // loop through settings - // and create "settings" and minimizer - int idxSetting = 0; - for (auto itSettings = std::begin (fSettings), itSettingsEnd = std::end (fSettings); itSettings != itSettingsEnd; ++itSettings, ++idxSetting) - { - std::shared_ptr<TMVA::DNN::Settings> ptrSettings = *itSettings; - ptrSettings->setMonitoring (fMonitoring); - Log() << kINFO - << "Training with learning rate = " << ptrSettings->learningRate () - << ", momentum = " << ptrSettings->momentum () - << ", repetitions = " << ptrSettings->repetitions () - << Endl; - - ptrSettings->setProgressLimits ((idxSetting)*100.0/(fSettings.size ()), (idxSetting+1)*100.0/(fSettings.size ())); - - const std::vector<double>& dropConfig = ptrSettings->dropFractions (); - if (!dropConfig.empty ()) - { - Log () << kINFO << "Drop configuration" << Endl - << " drop repetitions = " << ptrSettings->dropRepetitions () << Endl; - } - int idx = 0; - for (auto f : dropConfig) - { - Log () << kINFO << " Layer " << idx << " = " << f << Endl; - ++idx; - } - Log () << kINFO << Endl; - - if (ptrSettings->minimizerType () == TMVA::DNN::MinimizerType::fSteepest) - { - DNN::Steepest minimizer (ptrSettings->learningRate (), ptrSettings->momentum (), ptrSettings->repetitions ()); - /*E =*/fNet.train (fWeights, trainPattern, testPattern, minimizer, *ptrSettings.get ()); - } - ptrSettings.reset (); - Log () << kINFO << Endl; + if (i < fNet.GetDepth() - 1) { + net.addLayer(Layer(fNet.GetLayer(i).GetWidth(), g)); + } else { + ModeOutputValues h = ModeOutputValues::DIRECT; + switch(fOutputFunction) { + case EOutputFunction::kIdentity: h = ModeOutputValues::DIRECT; break; + case EOutputFunction::kSigmoid: h = ModeOutputValues::SIGMOID; break; + } + net.addLayer(Layer(fNet.GetLayer(i).GetWidth(), g, h)); } - fMonitoring = 0; -} - + } + switch(fNet.GetLossFunction()) { + case 
ELossFunction::kMeanSquaredError: + net.setErrorFunction(ModeErrorFunction::SUMOFSQUARES); + break; + case ELossFunction::kCrossEntropy: + net.setErrorFunction(ModeErrorFunction::CROSSENTROPY); + break; + } + switch(fWeightInitialization) { + case EInitialization::kGauss: + net.initializeWeights(WeightInitializationStrategy::XAVIER, + std::back_inserter(weights)); + break; + case EInitialization::kUniform: + net.initializeWeights(WeightInitializationStrategy::XAVIERUNIFORM, + std::back_inserter(weights)); + break; + default: + net.initializeWeights(WeightInitializationStrategy::XAVIER, + std::back_inserter(weights)); + break; + } -//_______________________________________________________________________ -Double_t TMVA::MethodDNN::GetMvaValue( Double_t* /*errLower*/, Double_t* /*errUpper*/ ) -{ - if (fWeights.empty ()) - return 0.0; - - const std::vector<Float_t>& inputValues = GetEvent ()->GetValues (); - std::vector<double> input (inputValues.begin (), inputValues.end ()); - input.push_back (1.0); // bias node - std::vector<double> output = fNet.compute (input, fWeights); - if (output.empty ()) - return 0.0; + int idxSetting = 0; + for (auto s : fTrainingSettings) { - return output.at (0); -} + EnumRegularization r = EnumRegularization::NONE; + switch(s.regularization) { + case ERegularization::kNone: r = EnumRegularization::NONE; break; + case ERegularization::kL1: r = EnumRegularization::L1; break; + case ERegularization::kL2: r = EnumRegularization::L2; break; + } -//////////////////////////////////////////////////////////////////////////////// -/// get the regression value generated by the DNN + Settings * settings = new Settings(TString(), s.convergenceSteps, s.batchSize, + s.testInterval, s.weightDecay, r, + MinimizerType::fSteepest, s.learningRate, + s.momentum, 1, s.multithreading); + std::shared_ptr<Settings> ptrSettings(settings); + ptrSettings->setMonitoring (0); + Log() << kINFO + << "Training with learning rate = " << ptrSettings->learningRate () + << ", momentum = " << ptrSettings->momentum () + << ", repetitions = " << ptrSettings->repetitions () + << Endl; -const std::vector<Float_t> &TMVA::MethodDNN::GetRegressionValues() -{ - assert (!fWeights.empty ()); - if (fWeights.empty ()) - return *fRegressionReturnVal; - - const Event * ev = GetEvent(); - - const std::vector<Float_t>& inputValues = ev->GetValues (); - std::vector<double> input (inputValues.begin (), inputValues.end ()); - input.push_back (1.0); // bias node - std::vector<double> output = fNet.compute (input, fWeights); - - if (fRegressionReturnVal == NULL) fRegressionReturnVal = new std::vector<Float_t>(); - fRegressionReturnVal->clear(); + ptrSettings->setProgressLimits ((idxSetting)*100.0/(fSettings.size ()), + (idxSetting+1)*100.0/(fSettings.size ())); - assert (!output.empty ()); - if (output.empty ()) - return *fRegressionReturnVal; + const std::vector<double>& dropConfig = ptrSettings->dropFractions (); + if (!dropConfig.empty ()) { + Log () << kINFO << "Drop configuration" << Endl + << " drop repetitions = " << ptrSettings->dropRepetitions() + << Endl; + } - Event * evT = new Event(*ev); - UInt_t ntgts = fNet.outputSize (); - for (UInt_t itgt = 0; itgt < ntgts; ++itgt) { - evT->SetTarget(itgt,output.at (itgt)); + int idx = 0; + for (auto f : dropConfig) { + Log () << kINFO << " Layer " << idx << " = " << f << Endl; + ++idx; + } + Log () << kINFO << Endl; + + DNN::Steepest minimizer(ptrSettings->learningRate(), + ptrSettings->momentum(), + ptrSettings->repetitions()); + net.train(weights, trainPattern, 
testPattern, minimizer, *ptrSettings.get()); + ptrSettings.reset(); + Log () << kINFO << Endl; + idxSetting++; } - - const Event* evT2 = GetTransformationHandler().InverseTransform( evT ); - for (UInt_t itgt = 0; itgt < ntgts; ++itgt) { - fRegressionReturnVal->push_back( evT2->GetTarget(itgt) ); + size_t weightIndex = 0; + for (size_t l = 0; l < fNet.GetDepth(); l++) { + auto & layerWeights = fNet.GetLayer(l).GetWeights(); + for (size_t j = 0; j < layerWeights.GetNcols(); j++) { + for (size_t i = 0; i < layerWeights.GetNrows(); i++) { + layerWeights(i,j) = weights[weightIndex]; + weightIndex++; + } + } + auto & layerBiases = fNet.GetLayer(l).GetBiases(); + if (l == 0) { + for (size_t i = 0; i < layerBiases.GetNrows(); i++) { + layerBiases(i,0) = weights[weightIndex]; + weightIndex++; + } + } else { + for (size_t i = 0; i < layerBiases.GetNrows(); i++) { + layerBiases(i,0) = 0.0; + } + } } - - delete evT; - - return *fRegressionReturnVal; } +//______________________________________________________________________________ +void TMVA::MethodDNN::TrainGpu() +{ +#ifdef DNNCUDA // Included only if DNNCUDA flag is set. + + size_t nTrainingSamples = GetEventCollection(Types::kTraining).size(); + size_t nTestSamples = GetEventCollection(Types::kTesting).size(); + + Log() << kINFO << "Start of neural network training on GPU." << Endl; + + size_t trainingPhase = 1; + fNet.Initialize(fWeightInitialization); + for (TTrainingSettings & settings : fTrainingSettings) { + + TNet<TCuda<>> net(settings.batchSize, fNet); + net.SetWeightDecay(settings.weightDecay); + net.SetRegularization(settings.regularization); + net.SetDropoutProbabilities(settings.dropoutProbabilities); + net.InitializeGradients(); + auto testNet = net.CreateClone(settings.batchSize); + + Log() << kINFO << "Training phase " << trainingPhase << " of " + << fTrainingSettings.size() << ":" << Endl; + trainingPhase++; + + using DataLoader_t = TDataLoader<TMVAInput_t, TCuda<>>; + + size_t nThreads = 1; + DataLoader_t trainingData(GetEventCollection(Types::kTraining), + nTrainingSamples, + net.GetBatchSize(), + net.GetInputWidth(), + net.GetOutputWidth(), nThreads); + DataLoader_t testData(GetEventCollection(Types::kTesting), + nTestSamples, + testNet.GetBatchSize(), + net.GetInputWidth(), + net.GetOutputWidth(), nThreads); + DNN::TGradientDescent<TCuda<>> minimizer(settings.learningRate, + settings.convergenceSteps, + settings.testInterval); + + std::vector<TNet<TCuda<>>> nets{}; + std::vector<TBatch<TCuda<>>> batches{}; + nets.reserve(nThreads); + for (size_t i = 0; i < nThreads; i++) { + nets.push_back(net); + for (size_t j = 0; j < net.GetDepth(); j++) + { + auto &masterLayer = net.GetLayer(j); + auto &layer = nets.back().GetLayer(j); + TCuda<>::Copy(layer.GetWeights(), + masterLayer.GetWeights()); + TCuda<>::Copy(layer.GetBiases(), + masterLayer.GetBiases()); + } + } + bool converged = false; + size_t stepCount = 0; + size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize(); + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + Log() << std::setw(10) << "Epoch" << " | " + << std::setw(12) << "Train Err." + << std::setw(12) << "Test Err." + << std::setw(12) << "GFLOP/s" + << std::setw(12) << "Conv. Steps" << Endl; + std::string separator(62, '-'); + Log() << separator << Endl; + while (!converged) + { + stepCount++; + + // Perform minimization steps for a full epoch. 
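+         // One epoch corresponds to nTrainingSamples / batchSize batches: they are drawn from the reshuffled training data and distributed over the nThreads worker nets, using the momentum step variant whenever momentum > 0.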
+ trainingData.Shuffle(); + for (size_t i = 0; i < batchesInEpoch; i += nThreads) { + batches.clear(); + for (size_t j = 0; j < nThreads; j++) { + batches.reserve(nThreads); + batches.push_back(trainingData.GetBatch()); + } + if (settings.momentum > 0.0) { + minimizer.StepMomentum(net, nets, batches, settings.momentum); + } else { + minimizer.Step(net, nets, batches); + } + } + if ((stepCount % minimizer.GetTestInterval()) == 0) { + // Compute test error. + Double_t testError = 0.0; + for (auto batch : testData) { + auto inputMatrix = batch.GetInput(); + auto outputMatrix = batch.GetOutput(); + testError += testNet.Loss(inputMatrix, outputMatrix); + } + testError /= (Double_t) (nTestSamples / settings.batchSize); -//////////////////////////////////////////////////////////////////////////////// -/// get the multiclass classification values generated by the DNN + end = std::chrono::system_clock::now(); -const std::vector<Float_t> &TMVA::MethodDNN::GetMulticlassValues() -{ - if (fWeights.empty ()) - return *fRegressionReturnVal; - - const std::vector<Float_t>& inputValues = GetEvent ()->GetValues (); - std::vector<double> input (inputValues.begin (), inputValues.end ()); - input.push_back (1.0); // bias node - std::vector<double> output = fNet.compute (input, fWeights); - - // check the output of the network - - if (fMulticlassReturnVal == NULL) fMulticlassReturnVal = new std::vector<Float_t>(); - fMulticlassReturnVal->clear(); - std::vector<Float_t> temp; - - UInt_t nClasses = DataInfo().GetNClasses(); - assert (nClasses == output.size()); - for (UInt_t icls = 0; icls < nClasses; icls++) { - temp.push_back (output.at (icls)); - } - - for(UInt_t iClass=0; iClass<nClasses; iClass++){ - Double_t norm = 0.0; - for(UInt_t j=0;j<nClasses;j++){ - if(iClass!=j) - norm+=exp(temp[j]-temp[iClass]); + // Compute training error. + Double_t trainingError = 0.0; + for (auto batch : trainingData) { + auto inputMatrix = batch.GetInput(); + auto outputMatrix = batch.GetOutput(); + trainingError += net.Loss(inputMatrix, outputMatrix); + } + trainingError /= (Double_t) (nTrainingSamples / settings.batchSize); + + // Compute numerical throughput. + std::chrono::duration<double> elapsed_seconds = end - start; + double seconds = elapsed_seconds.count(); + double nFlops = (double) (settings.testInterval * batchesInEpoch); + nFlops *= net.GetNFlops() * 1e-9; + + converged = minimizer.HasConverged(testError); + start = std::chrono::system_clock::now(); + + Log() << std::setw(10) << stepCount << " | " + << std::setw(12) << trainingError + << std::setw(12) << testError + << std::setw(12) << nFlops / seconds + << std::setw(12) << minimizer.GetConvergenceCount() << Endl; + if (converged) { + Log() << Endl; + } + } + } + for (size_t l = 0; l < net.GetDepth(); l++) { + fNet.GetLayer(l).GetWeights() = (TMatrixT<Double_t>) net.GetLayer(l).GetWeights(); + fNet.GetLayer(l).GetBiases() = (TMatrixT<Double_t>) net.GetLayer(l).GetBiases(); } - (*fMulticlassReturnVal).push_back(1.0/(1.0+norm)); } +#else // DNNCUDA flag not set. - - return *fMulticlassReturnVal; + Log() << kFATAL << "CUDA backend not enabled. Please make sure " + "you have CUDA installed and it was successfully " + "detected by CMAKE." 
<< Endl; +#endif // DNNCUDA } - - - - - -//_______________________________________________________________________ -void TMVA::MethodDNN::AddWeightsXMLTo( void* parent ) const +//______________________________________________________________________________ +template<typename AFloat> +void TMVA::MethodDNN::TrainCpu() { - // create XML description of DNN classifier - // for all layers - void* nn = gTools().xmlengine().NewChild(parent, 0, "Weights"); - void* xmlLayout = gTools().xmlengine().NewChild(nn, 0, "Layout"); - Int_t numLayers = fNet.layers ().size (); - gTools().xmlengine().NewAttr(xmlLayout, 0, "NumberLayers", gTools().StringFromInt (numLayers) ); - for (Int_t i = 0; i < numLayers; i++) - { - const TMVA::DNN::Layer& layer = fNet.layers ().at (i); - int numNodes = layer.numNodes (); - char activationFunction = (char)(layer.activationFunctionType ()); - int outputMode = (int)layer.modeOutputValues (); - - TString outputModeStr; - outputModeStr.Form ("%d", outputMode); - - void* layerxml = gTools().xmlengine().NewChild(xmlLayout, 0, "Layer"); - gTools().xmlengine().NewAttr(layerxml, 0, "Connection", TString("FULL") ); - gTools().xmlengine().NewAttr(layerxml, 0, "Nodes", gTools().StringFromInt(numNodes) ); - gTools().xmlengine().NewAttr(layerxml, 0, "ActivationFunction", TString (activationFunction) ); - gTools().xmlengine().NewAttr(layerxml, 0, "OutputMode", outputModeStr); +#ifdef DNNCPU // Included only if DNNCPU flag is set. + + size_t nTrainingSamples = GetEventCollection(Types::kTraining).size(); + size_t nTestSamples = GetEventCollection(Types::kTesting).size(); + + Log() << kINFO << "Start of neural network training on CPU." << Endl << Endl; + + fNet.Initialize(fWeightInitialization); + + size_t trainingPhase = 1; + for (TTrainingSettings & settings : fTrainingSettings) { + + Log() << "Training phase " << trainingPhase << " of " + << fTrainingSettings.size() << ":" << Endl; + trainingPhase++; + + TNet<TCpu<AFloat>> net(settings.batchSize, fNet); + net.SetWeightDecay(settings.weightDecay); + net.SetRegularization(settings.regularization); + net.SetDropoutProbabilities(settings.dropoutProbabilities); + net.InitializeGradients(); + auto testNet = net.CreateClone(settings.batchSize); + + using DataLoader_t = TDataLoader<TMVAInput_t, TCpu<AFloat>>; + + size_t nThreads = 1; + DataLoader_t trainingData(GetEventCollection(Types::kTraining), + nTrainingSamples, + net.GetBatchSize(), + net.GetInputWidth(), + net.GetOutputWidth(), nThreads); + DataLoader_t testData(GetEventCollection(Types::kTesting), + nTestSamples, + testNet.GetBatchSize(), + net.GetInputWidth(), + net.GetOutputWidth(), nThreads); + DNN::TGradientDescent<TCpu<AFloat>> minimizer(settings.learningRate, + settings.convergenceSteps, + settings.testInterval); + + std::vector<TNet<TCpu<AFloat>>> nets{}; + std::vector<TBatch<TCpu<AFloat>>> batches{}; + nets.reserve(nThreads); + for (size_t i = 0; i < nThreads; i++) { + nets.push_back(net); + for (size_t j = 0; j < net.GetDepth(); j++) + { + auto &masterLayer = net.GetLayer(j); + auto &layer = nets.back().GetLayer(j); + TCpu<AFloat>::Copy(layer.GetWeights(), + masterLayer.GetWeights()); + TCpu<AFloat>::Copy(layer.GetBiases(), + masterLayer.GetBiases()); + } } + bool converged = false; + size_t stepCount = 0; + size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize(); - void* weightsxml = gTools().xmlengine().NewChild(nn, 0, "Synapses"); - gTools().xmlengine().NewAttr (weightsxml, 0, "InputSize", gTools().StringFromInt((int)fNet.inputSize ())); - 
gTools().xmlengine().NewAttr (weightsxml, 0, "OutputSize", gTools().StringFromInt((int)fNet.outputSize ())); - gTools().xmlengine().NewAttr (weightsxml, 0, "NumberSynapses", gTools().StringFromInt((int)fWeights.size ())); - std::stringstream s(""); - s.precision( 16 ); - for (std::vector<double>::const_iterator it = fWeights.begin (), itEnd = fWeights.end (); it != itEnd; ++it) - { - s << std::scientific << (*it) << " "; - } - gTools().xmlengine().AddRawLine (weightsxml, s.str().c_str()); -} + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + Log() << std::setw(10) << "Epoch" << " | " + << std::setw(12) << "Train Err." + << std::setw(12) << "Test Err." + << std::setw(12) << "GFLOP/s" + << std::setw(12) << "Conv. Steps" << Endl; + std::string separator(62, '-'); + Log() << separator << Endl; -//_______________________________________________________________________ -void TMVA::MethodDNN::ReadWeightsFromXML( void* wghtnode ) -{ - // read MLP from xml weight file - fNet.clear (); - - void* nn = gTools().GetChild(wghtnode, "Weights"); - if (!nn) - { - // std::cout << "no node \"Weights\" in XML, use weightnode" << std::endl; - nn = wghtnode; - } - - void* xmlLayout = NULL; - xmlLayout = gTools().GetChild(wghtnode, "Layout"); - if (!xmlLayout) + while (!converged) { - std::cout << "no node Layout in XML" << std::endl; - return; - } + stepCount++; + // Perform minimization steps for a full epoch. + trainingData.Shuffle(); + for (size_t i = 0; i < batchesInEpoch; i += nThreads) { + batches.clear(); + for (size_t j = 0; j < nThreads; j++) { + batches.reserve(nThreads); + batches.push_back(trainingData.GetBatch()); + } + if (settings.momentum > 0.0) { + minimizer.StepMomentum(net, nets, batches, settings.momentum); + } else { + minimizer.Step(net, nets, batches); + } + } + if ((stepCount % minimizer.GetTestInterval()) == 0) { - - // std::cout << "read layout from XML" << std::endl; - void* ch = gTools().xmlengine().GetChild (xmlLayout); - TString connection; - UInt_t numNodes; - TString activationFunction; - TString outputMode; - fNet.clear (); - while (ch) - { - gTools().ReadAttr (ch, "Connection", connection); - gTools().ReadAttr (ch, "Nodes", numNodes); - gTools().ReadAttr (ch, "ActivationFunction", activationFunction); - gTools().ReadAttr (ch, "OutputMode", outputMode); - ch = gTools().GetNextChild(ch); + // Compute test error. + AFloat testError = 0.0; + for (auto batch : testData) { + auto inputMatrix = batch.GetInput(); + auto outputMatrix = batch.GetOutput(); + testError += testNet.Loss(inputMatrix, outputMatrix); + } + testError /= (Double_t) (nTestSamples / settings.batchSize); - fNet.addLayer (DNN::Layer (numNodes, (TMVA::DNN::EnumFunction)activationFunction (0), (DNN::ModeOutputValues)outputMode.Atoi ())); - } + end = std::chrono::system_clock::now(); - // std::cout << "read weights XML" << std::endl; + // Compute training error. + AFloat trainingError = 0.0; + for (auto batch : trainingData) { + auto inputMatrix = batch.GetInput(); + auto outputMatrix = batch.GetOutput(); + trainingError += net.Loss(inputMatrix, outputMatrix); + } + trainingError /= (Double_t) (nTrainingSamples / settings.batchSize); + + // Compute numerical throughput. 
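+            // Throughput estimate: (test interval x batches per epoch x FLOPs per pass of the net) divided by the wall-clock time since the last test step, reported below in GFLOP/s.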
+ std::chrono::duration<double> elapsed_seconds = end - start; + double seconds = elapsed_seconds.count(); + double nFlops = (double) (settings.testInterval * batchesInEpoch); + nFlops *= net.GetNFlops() * 1e-9; + + converged = minimizer.HasConverged(testError); + start = std::chrono::system_clock::now(); + + Log() << std::setw(10) << stepCount << " | " + << std::setw(12) << trainingError + << std::setw(12) << testError + << std::setw(12) << nFlops / seconds + << std::setw(12) << minimizer.GetConvergenceCount() << Endl; + if (converged) { + Log() << Endl; + } + } + } - void* xmlWeights = NULL; - xmlWeights = gTools().GetChild(wghtnode, "Synapses"); - if (!xmlWeights) - return; - Int_t numWeights (0); - Int_t inputSize (0); - Int_t outputSize (0); - gTools().ReadAttr (xmlWeights, "NumberSynapses", numWeights); - gTools().ReadAttr (xmlWeights, "InputSize", inputSize); - gTools().ReadAttr (xmlWeights, "OutputSize", outputSize); - fNet.setInputSize (inputSize); - fNet.setOutputSize (outputSize); // num vars + bias node - - const char* content = gTools().GetContent (xmlWeights); - std::stringstream sstr (content); - for (Int_t iWeight = 0; iWeight<numWeights; ++iWeight) - { // synapses - Double_t weight; - sstr >> weight; - fWeights.push_back (weight); + for (size_t l = 0; l < net.GetDepth(); l++) { + auto & layer = fNet.GetLayer(l); + layer.GetWeights() = (TMatrixT<Double_t>) net.GetLayer(l).GetWeights(); + layer.GetBiases() = (TMatrixT<Double_t>) net.GetLayer(l).GetBiases(); } -} + } +#else // DNNCPU flag not set. + Log() << kFATAL << "Multi-core CPU backend not enabled. Please make sure " + "you have a BLAS implementation and tbb installed and" + " it was successfully detected by CMAKE." << Endl; +#endif // DNNCPU +} -//_______________________________________________________________________ -void TMVA::MethodDNN::ReadWeightsFromStream( std::istream & /*istr*/) +//______________________________________________________________________________ +Double_t TMVA::MethodDNN::GetMvaValue( Double_t* /*errLower*/, Double_t* /*errUpper*/ ) { - // // destroy/clear the network then read it back in from the weights file - - // // delete network so we can reconstruct network from scratch - - // TString dummy; + size_t nVariables = GetEvent()->GetNVariables(); + Matrix_t X(1, nVariables); + Matrix_t YHat(1, 1); - // // synapse weights - // Double_t weight; - // std::vector<Double_t>* weights = new std::vector<Double_t>(); - // istr>> dummy; - // while (istr>> dummy >> weight) weights->push_back(weight); // use w/ slower write-out - - // ForceWeights(weights); - + const std::vector<Float_t>& inputValues = GetEvent()->GetValues(); + for (size_t i = 0; i < nVariables; i++) { + X(0,i) = inputValues[i]; + } - // delete weights; + fNet.Prediction(YHat, X, fOutputFunction); + return YHat(0,0); } -//_______________________________________________________________________ -const TMVA::Ranking* TMVA::MethodDNN::CreateRanking() +//______________________________________________________________________________ +const std::vector<Float_t> &TMVA::MethodDNN::GetRegressionValues() { - // compute ranking of input variables by summing function of weights - - // create the ranking object - fRanking = new Ranking( GetName(), "Importance" ); + size_t nVariables = GetEvent()->GetNVariables(); + Matrix_t X(1, nVariables); - for (UInt_t ivar=0; ivar<GetNvar(); ivar++) { - fRanking->AddRank( Rank( GetInputLabel(ivar), 1.0)); + const Event *ev = GetEvent(); + const std::vector<Float_t>& inputValues = ev->GetValues(); + for (size_t i = 
0; i < nVariables; i++) { + X(0,i) = inputValues[i]; } - // TNeuron* neuron; - // TSynapse* synapse; - // Double_t importance, avgVal; - // TString varName; - - // for (UInt_t ivar = 0; ivar < GetNvar(); ivar++) { - - // neuron = GetInputNeuron(ivar); - // Int_t numSynapses = neuron->NumPostLinks(); - // importance = 0; - // varName = GetInputVar(ivar); // fix this line - - // // figure out average value of variable i - // Double_t meanS, meanB, rmsS, rmsB, xmin, xmax; - // Statistics( TMVA::Types::kTraining, varName, - // meanS, meanB, rmsS, rmsB, xmin, xmax ); + size_t nTargets = std::max(1u, ev->GetNTargets()); + Matrix_t YHat(1, nTargets); + std::vector<Float_t> output(nTargets); + auto net = fNet.CreateClone(1); + net.Prediction(YHat, X, fOutputFunction); - // avgVal = (TMath::Abs(meanS) + TMath::Abs(meanB))/2.0; - // double meanrms = (TMath::Abs(rmsS) + TMath::Abs(rmsB))/2.; - // if (avgVal<meanrms) avgVal = meanrms; - // if (IsNormalised()) avgVal = 0.5*(1 + gTools().NormVariable( avgVal, GetXmin( ivar ), GetXmax( ivar ))); + for (size_t i = 0; i < nTargets; i++) + output[i] = YHat(0, i); - // for (Int_t j = 0; j < numSynapses; j++) { - // synapse = neuron->PostLinkAt(j); - // importance += synapse->GetWeight() * synapse->GetWeight(); - // } - - // importance *= avgVal * avgVal; + if (fRegressionReturnVal == NULL) { + fRegressionReturnVal = new std::vector<Float_t>(); + } + fRegressionReturnVal->clear(); - // fRanking->AddRank( Rank( varName, importance ) ); - // } + Event * evT = new Event(*ev); + for (size_t i = 0; i < nTargets; ++i) { + evT->SetTarget(i, output[i]); + } - return fRanking; + const Event* evT2 = GetTransformationHandler().InverseTransform(evT); + for (size_t i = 0; i < nTargets; ++i) { + fRegressionReturnVal->push_back(evT2->GetTarget(i)); + } + delete evT; + return *fRegressionReturnVal; } +const std::vector<Float_t> &TMVA::MethodDNN::GetMulticlassValues() +{ + Log() << kFATAL << "ERROR: Multiclass classification not yet implemented." 
+ << Endl; + return *fMulticlassReturnVal; +} +//______________________________________________________________________________ +void TMVA::MethodDNN::AddWeightsXMLTo( void* parent ) const +{ + void* nn = gTools().xmlengine().NewChild(parent, 0, "Weights"); + Int_t inputWidth = fNet.GetInputWidth(); + Int_t depth = fNet.GetDepth(); + char lossFunction = static_cast<char>(fNet.GetLossFunction()); + gTools().xmlengine().NewAttr(nn, 0, "InputWidth", + gTools().StringFromInt(inputWidth)); + gTools().xmlengine().NewAttr(nn, 0, "Depth", gTools().StringFromInt(depth)); + gTools().xmlengine().NewAttr(nn, 0, "LossFunction", TString(lossFunction)); + gTools().xmlengine().NewAttr(nn, 0, "OutputFunction", + TString(static_cast<char>(fOutputFunction))); + + for (Int_t i = 0; i < depth; i++) { + const auto& layer = fNet.GetLayer(i); + auto layerxml = gTools().xmlengine().NewChild(nn, 0, "Layer"); + char activationFunction = static_cast<char>(layer.GetActivationFunction()); + gTools().xmlengine().NewAttr(layerxml, 0, "ActivationFunction", + TString (activationFunction)); + WriteMatrixXML(layerxml, "Weights", layer.GetWeights()); + WriteMatrixXML(layerxml, "Biases", layer.GetBiases()); + } +} +//______________________________________________________________________________ +void TMVA::MethodDNN::ReadWeightsFromXML(void* rootXML) +{ + auto netXML = gTools().GetChild(rootXML, "Weights"); + if (!netXML){ + netXML = rootXML; + } + fNet.Clear(); + fNet.SetBatchSize(1); + + size_t inputWidth, depth; + gTools().ReadAttr(netXML, "InputWidth", inputWidth); + gTools().ReadAttr(netXML, "Depth", depth); + char lossFunctionChar; + gTools().ReadAttr(netXML, "LossFunction", lossFunctionChar); + char outputFunctionChar; + gTools().ReadAttr(netXML, "OutputFunction", outputFunctionChar); + + fNet.SetInputWidth(inputWidth); + fNet.SetLossFunction(static_cast<ELossFunction>(lossFunctionChar)); + fOutputFunction = static_cast<EOutputFunction>(outputFunctionChar); + + size_t previousWidth = inputWidth; + auto layerXML = gTools().xmlengine().GetChild(netXML, "Layer"); + for (size_t i = 0; i < depth; i++) { + TString fString; + EActivationFunction f; + + // Read activation function. + gTools().ReadAttr(layerXML, "ActivationFunction", fString); + f = static_cast<EActivationFunction>(fString(0)); + + // Read number of neurons. 
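+      // The layer width is not written as an attribute of its own; it is recovered from the row count of the stored weight matrix.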
+ size_t width; + auto matrixXML = gTools().GetChild(layerXML, "Weights"); + gTools().ReadAttr(matrixXML, "rows", width); + + fNet.AddLayer(width, f); + TMatrixT<Double_t> weights(width, previousWidth); + TMatrixT<Double_t> biases(width, 1); + ReadMatrixXML(layerXML, "Weights", weights); + ReadMatrixXML(layerXML, "Biases", biases); + fNet.GetLayer(i).GetWeights() = weights; + fNet.GetLayer(i).GetBiases() = biases; + + layerXML = gTools().GetNextChild(layerXML); + previousWidth = width; + } +} +//______________________________________________________________________________ +void TMVA::MethodDNN::ReadWeightsFromStream( std::istream & /*istr*/) +{ +} +//______________________________________________________________________________ +const TMVA::Ranking* TMVA::MethodDNN::CreateRanking() +{ + fRanking = new Ranking( GetName(), "Importance" ); + for (UInt_t ivar=0; ivar<GetNvar(); ivar++) { + fRanking->AddRank( Rank( GetInputLabel(ivar), 1.0)); + } + return fRanking; +} -//_______________________________________________________________________ -void TMVA::MethodDNN::MakeClassSpecific( std::ostream& /*fout*/, const TString& /*className*/ ) const +//______________________________________________________________________________ +void TMVA::MethodDNN::MakeClassSpecific( std::ostream& /*fout*/, + const TString& /*className*/ ) const { - // write specific classifier response - // MethodADNNBase::MakeClassSpecific(fout, className); } -//_______________________________________________________________________ +//______________________________________________________________________________ void TMVA::MethodDNN::GetHelpMessage() const { // get help message text @@ -1026,10 +1218,10 @@ reduction of overfitting: \n \ Random values scaled by the layer size \n \ \n \ \"TrainingStrategy\" \n \ - - example: \"LearningRate=1e-1,Momentum=0.3,Repetitions=3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Renormalize=L2,DropConfig=0.0,DropRepetitions=5|LearningRate=1e-4,Momentum=0.3,Repetitions=3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Renormalize=L2,DropFraction=0.0,DropRepetitions=5\" \n \ + - example: \"LearningRate=1e-1,Momentum=0.3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Renormalize=L2,DropConfig=0.0,DropRepetitions=5|LearningRate=1e-4,Momentum=0.3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Renormalize=L2,DropFraction=0.0,DropRepetitions=5\" \n \ - explanation: two stacked training settings separated by \"|\" \n \ - . first training setting: \"LearningRate=1e-1,Momentum=0.3,Repetitions=3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Renormalize=L2,DropConfig=0.0,DropRepetitions=5\" \n \ - . second training setting : \"LearningRate=1e-4,Momentum=0.3,Repetitions=3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Renormalize=L2,DropFractions=0.0,DropRepetitions=5\" \n \ + . first training setting: \"LearningRate=1e-1,Momentum=0.3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Renormalize=L2,DropConfig=0.0,DropRepetitions=5\" \n \ + . second training setting : \"LearningRate=1e-4,Momentum=0.3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Renormalize=L2,DropFractions=0.0,DropRepetitions=5\" \n \ . LearningRate : \n \ - recommended for classification: 0.1 initially, 1e-4 later \n \ - recommended for regression: 1e-4 and less \n \ @@ -1064,135 +1256,7 @@ reduction of overfitting: \n \ . 
Multithreading \n \ turn on multithreading [recommended: True] \n \ \n"; - Log () << txt << Endl; - -} - - - -//_______________________________________________________________________ -void TMVA::MethodDNN::WriteMonitoringHistosToFile( void ) const -{ - // write histograms and PDFs to file for monitoring purposes - - Log() << kINFO << "Write monitoring histograms to file: " << BaseDir()->GetPath() << Endl; - BaseDir()->cd(); -} - - - - -void TMVA::MethodDNN::checkGradients () -{ - size_t inputSize = 1; - size_t outputSize = 1; - - fNet.clear (); - - fNet.setInputSize (inputSize); - fNet.setOutputSize (outputSize); - fNet.addLayer (DNN::Layer (100, DNN::EnumFunction::SOFTSIGN)); - fNet.addLayer (DNN::Layer (30, DNN::EnumFunction::SOFTSIGN)); - fNet.addLayer (DNN::Layer (outputSize, DNN::EnumFunction::LINEAR, DNN::ModeOutputValues::SIGMOID)); - fNet.setErrorFunction (DNN::ModeErrorFunction::CROSSENTROPY); - // net.setErrorFunction (ModeErrorFunction::SUMOFSQUARES); - - size_t numWeights = fNet.numWeights (inputSize); - std::vector<double> weights (numWeights); - //weights.at (0) = 1000213.2; - - std::vector<Pattern> pattern; - for (size_t iPat = 0, iPatEnd = 10; iPat < iPatEnd; ++iPat) - { - std::vector<double> input; - std::vector<double> output; - for (size_t i = 0; i < inputSize; ++i) - { - input.push_back (TMVA::DNN::gaussDouble (0.1, 4)); - } - for (size_t i = 0; i < outputSize; ++i) - { - output.push_back (TMVA::DNN::gaussDouble (0, 3)); - } - pattern.push_back (Pattern (input,output)); - } - - - DNN::Settings settings (TString ("checkGradients"), /*_convergenceSteps*/ 15, /*_batchSize*/ 1, /*_testRepetitions*/ 7, /*_factorWeightDecay*/ 0, /*regularization*/ TMVA::DNN::EnumRegularization::NONE); - - size_t improvements = 0; - size_t worsenings = 0; - size_t smallDifferences = 0; - size_t largeDifferences = 0; - for (size_t iTest = 0; iTest < 1000; ++iTest) - { - TMVA::DNN::uniformDouble (weights, 0.7); - std::vector<double> gradients (numWeights, 0); - DNN::Batch batch (begin (pattern), end (pattern)); - DNN::DropContainer dropContainer; - std::tuple<DNN::Settings&, DNN::Batch&, DNN::DropContainer&> settingsAndBatch (settings, batch, dropContainer); - double E = fNet (settingsAndBatch, weights, gradients); - std::vector<double> changedWeights; - changedWeights.assign (weights.begin (), weights.end ()); - - int changeWeightPosition = TMVA::DNN::randomInt (numWeights); - double dEdw = gradients.at (changeWeightPosition); - while (dEdw == 0.0) - { - changeWeightPosition = TMVA::DNN::randomInt (numWeights); - dEdw = gradients.at (changeWeightPosition); - } - - const double gamma = 0.01; - double delta = gamma*dEdw; - changedWeights.at (changeWeightPosition) += delta; - if (dEdw == 0.0) - { - std::cout << "dEdw == 0.0 "; - continue; - } - - assert (dEdw != 0.0); - double Echanged = fNet (settingsAndBatch, changedWeights); - - // double difference = fabs((E-Echanged) - delta*dEdw); - double difference = fabs ((E+delta - Echanged)/E); - bool direction = (E-Echanged)>0 ? true : false; - // bool directionGrad = delta>0 ? 
true : false; - bool isOk = difference < 0.3 && difference != 0; - - if (direction) - ++improvements; - else - ++worsenings; - - if (isOk) - ++smallDifferences; - else - ++largeDifferences; - - if (true || !isOk) - { - if (!direction) - std::cout << "=================" << std::endl; - std::cout << "E = " << E << " Echanged = " << Echanged << " delta = " << delta << " pos=" << changeWeightPosition << " dEdw=" << dEdw << " difference= " << difference << " dirE= " << direction << std::endl; - } - if (isOk) - { - } - else - { - // for_each (begin (weights), end (weights), [](double w){ std::cout << w << ", "; }); - // std::cout << std::endl; - // assert (isOk); - } - } - std::cout << "improvements = " << improvements << std::endl; - std::cout << "worsenings = " << worsenings << std::endl; - std::cout << "smallDifferences = " << smallDifferences << std::endl; - std::cout << "largeDifferences = " << largeDifferences << std::endl; - - std::cout << "check gradients done" << std::endl; } +} // namespace TMVA diff --git a/tmva/tmva/test/CMakeLists.txt b/tmva/tmva/test/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc2a3e633d359cd76dc2b19d8e8db6a66827428a --- /dev/null +++ b/tmva/tmva/test/CMakeLists.txt @@ -0,0 +1,9 @@ +############################################################################ +# CMakeLists.txt file for building ROOT TMVA tests. +# @author Simon Pfreundschuh +############################################################################ + +project(tmva-tests) +find_package(ROOT REQUIRED) + +ROOT_ADD_TEST_SUBDIRECTORY(DNN) diff --git a/tmva/tmva/test/DNN/CMakeLists.txt b/tmva/tmva/test/DNN/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d3c36c65fbec452f23479450e0e4697335d6b5a --- /dev/null +++ b/tmva/tmva/test/DNN/CMakeLists.txt @@ -0,0 +1,92 @@ +############################################################################ +# CMakeLists.txt file for building TMVA/DNN tests. +# @author Simon Pfreundschuh +############################################################################ + +project(tmva-tests) +find_package(ROOT REQUIRED) + +set(Libraries Core MathCore Matrix TMVA) +include_directories(${ROOT_INCLUDE_DIRS}) + +#--- CUDA tests. 
--------------------------- +find_package(CUDA) +if (CUDA_FOUND) + + SET(DNN_CUDA_LIBRARIES dnn_cuda ${CUDA_CUBLAS_LIBRARIES}) + + # DNN - Activation Functions Cuda + CUDA_ADD_EXECUTABLE(testActivationFunctionsCuda TestActivationFunctionsCuda.cxx) + TARGET_LINK_LIBRARIES(testActivationFunctionsCuda ${Libraries} ${DNN_CUDA_LIBRARIES}) + ROOT_ADD_TEST(TMVA-DNN-ActivationFunctionsCuda COMMAND testActivationFunctionsCuda) + + # DNN - Loss Functions Cuda + CUDA_ADD_EXECUTABLE(testLossFunctionsCuda TestLossFunctionsCuda.cxx) + TARGET_LINK_LIBRARIES(testLossFunctionsCuda ${Libraries} ${DNN_CUDA_LIBRARIES}) + ROOT_ADD_TEST(TMVA-DNN-LossFunctionsCuda COMMAND testLossFunctionsCuda) + + # DNN - Derivatives Cuda + CUDA_ADD_EXECUTABLE(testDerivativesCuda TestDerivativesCuda.cxx) + TARGET_LINK_LIBRARIES(testDerivativesCuda ${Libraries} ${DNN_CUDA_LIBRARIES}) + ROOT_ADD_TEST(TMVA-DNN-DerivativesCuda COMMAND testDerivativesCuda) + + # DNN - Backpropagation Cuda + CUDA_ADD_EXECUTABLE(testBackpropagationCuda TestBackpropagationCuda.cxx) + TARGET_LINK_LIBRARIES(testBackpropagationCuda ${Libraries} ${DNN_CUDA_LIBRARIES}) + ROOT_ADD_TEST(TMVA-DNN-BackpropagationCuda COMMAND testBackpropagationCuda) + + # DNN - Minimization Cuda + CUDA_ADD_EXECUTABLE(testMinimizationCuda TestMinimizationCuda.cxx) + TARGET_LINK_LIBRARIES(testMinimizationCuda ${Libraries} ${DNN_CUDA_LIBRARIES}) + ROOT_ADD_TEST(TMVA-DNN-MinimizationCuda COMMAND testMinimizationCuda) + + # DNN - Arithmetic Cuda + CUDA_ADD_EXECUTABLE(testArithmeticCuda TestMatrixArithmeticCuda.cxx) + TARGET_LINK_LIBRARIES(testArithmeticCuda ${Libraries} ${DNN_CUDA_LIBRARIES}) + ROOT_ADD_TEST(TMVA-DNN-ArithmeticCuda COMMAND testArithmeticCuda) + + # DNN - DataLoader Cuda + CUDA_ADD_EXECUTABLE(testDataLoaderCuda TestDataLoaderCuda.cxx) + TARGET_LINK_LIBRARIES(testDataLoaderCuda ${Libraries} ${DNN_CUDA_LIBRARIES}) +endif (CUDA_FOUND) + +#--- CPU tests. 
---------------------------- +find_package(BLAS) +if (BLAS_FOUND AND imt) + + # DNN - Arithmetic Functions CPU + ROOT_EXECUTABLE(testArithmeticCpu TestMatrixArithmeticCpu.cxx + LIBRARIES ${Libraries}) + ROOT_ADD_TEST(TMVA-DNN-Arithmetic-Cpu COMMAND testArithmeticCpu) + + # DNN - Activation Functions CPU + ROOT_EXECUTABLE(testActivationFunctionsCpu TestActivationFunctionsCpu.cxx + LIBRARIES ${Libraries}) + ROOT_ADD_TEST(TMVA-DNN-Activation-Functions-Cpu COMMAND testActivationFunctionsCpu) + + # DNN - Loss Functions CPU + ROOT_EXECUTABLE(testLossFunctionsCpu TestLossFunctionsCpu.cxx + LIBRARIES ${Libraries}) + ROOT_ADD_TEST(TMVA-DNN-Loss-Functions-Cpu COMMAND testLossFunctionsCpu) + + # DNN - Derivatives CPU + ROOT_EXECUTABLE(testDerivativesCpu TestDerivativesCpu.cxx + LIBRARIES ${Libraries}) + ROOT_ADD_TEST(TMVA-DNN-Derivatives-Cpu COMMAND testDerivativesCpu) + + # DNN - Backpropagation CPU + ROOT_EXECUTABLE(testBackpropagationCpu TestBackpropagationCpu.cxx + LIBRARIES ${Libraries}) + ROOT_ADD_TEST(TMVA-DNN-Backpropagation-Cpu COMMAND testBackpropagationCpu) + + # DNN - DataLoader CPU + ROOT_EXECUTABLE(testDataLoaderCpu TestDataLoaderCpu.cxx + LIBRARIES ${Libraries}) + ROOT_ADD_TEST(TMVA-DNN-Data-Loader-Cpu COMMAND testDataLoaderCpu) + + # DNN - Minimization CPU + ROOT_EXECUTABLE(testMinimizationCpu TestMinimizationCpu.cxx + LIBRARIES ${Libraries} ${BLAS_openblas_LIBRARY} tbb) + ROOT_ADD_TEST(TMVA-DNN-Minimization-Cpu COMMAND testMinimizationCpu) + +endif (BLAS_FOUND AND imt) diff --git a/tmva/tmva/test/DNN/TestActivationFunctions.cxx b/tmva/tmva/test/DNN/TestActivationFunctions.cxx new file mode 100644 index 0000000000000000000000000000000000000000..aed3980a7ed10c9a32faa9f642b3ae02b37f58e2 --- /dev/null +++ b/tmva/tmva/test/DNN/TestActivationFunctions.cxx @@ -0,0 +1,128 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////////// +// Concrete instantiation of the generic activation function test // +// for the reference architecture. // +////////////////////////////////////////////////////////////////////// + +#include <iostream> +#include "TestActivationFunctions.h" + +using namespace TMVA::DNN; + +int main() +{ + using Scalar_t = Double_t; + std::cout << "Testing Activation Functions:" << std::endl; + + Scalar_t error; + + // Identity. + + error = testIdentity<TReference<Scalar_t>>(10); + std::cout << "Testing identity activation: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + error = testIdentityDerivative<TReference<Scalar_t>>(10); + std::cout << "Testing identity activation derivative: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + // ReLU. 
+ + error = testRelu<TReference<Scalar_t>>(10); + std::cout << "Testing ReLU activation: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + error = testReluDerivative<TReference<Scalar_t>>(10); + std::cout << "Testing ReLU activation derivative: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + // Sigmoid. + + error = testSigmoid<TReference<Scalar_t>>(10); + std::cout << "Testing Sigmoid activation: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + error = testSigmoidDerivative<TReference<Scalar_t>>(10); + std::cout << "Testing Sigmoid activation derivative: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + // TanH. + + error = testTanh<TReference<Scalar_t>>(10); + std::cout << "Testing TanH activation: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + error = testTanhDerivative<TReference<Scalar_t>>(10); + std::cout << "Testing TanH activation derivative: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + // Symmetric ReLU. + + error = testSymmetricRelu<TReference<Scalar_t>>(10); + std::cout << "Testing Symm. ReLU activation: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + error = testSymmetricReluDerivative<TReference<Scalar_t>>(10); + std::cout << "Testing Symm. ReLU activation derivative: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + // Soft Sign. + + error = testSoftSign<TReference<Scalar_t>>(10); + std::cout << "Testing Soft Sign activation: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + error = testSoftSignDerivative<TReference<Scalar_t>>(10); + std::cout << "Testing Soft Sign activation derivative: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + // Gauss. + + error = testGauss<TReference<Scalar_t>>(10); + std::cout << "Testing Gauss activation: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + error = testGaussDerivative<TReference<Scalar_t>>(10); + std::cout << "Testing Gauss activation derivative: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + return 0; +} diff --git a/tmva/tmva/test/DNN/TestActivationFunctions.h b/tmva/tmva/test/DNN/TestActivationFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..c3f479eb706a7c56a7ff0eca7180744c4a0ddfa8 --- /dev/null +++ b/tmva/tmva/test/DNN/TestActivationFunctions.h @@ -0,0 +1,457 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS.
* + *************************************************************************/ + +////////////////////////////////////////////////////////////////////// +// Generic tests of the layer activation functions // +// // +// Contains tests for each of the layer activation functions that // +// test the evaluation of the function using the evaluate(...) // +// method and the computation of the derivatives using // +// evaluateDerivative(...) on a randomly generated matrix. Each // +// function returns the maximum relative error between the expected // +// result and the result obtained for the given architecture. // +////////////////////////////////////////////////////////////////////// + +#ifndef TMVA_TEST_DNN_TEST_ACTIVATION_FUNCTIONS +#define TMVA_TEST_DNN_TEST_ACTIVATION_FUNCTIONS + +#include "TMatrixT.h" +#include "TMVA/DNN/Architectures/Reference.h" +#include "TMVA/DNN/Functions.h" +#include "TMVA/DNN/Net.h" +#include "Utility.h" + +using namespace TMVA::DNN; + +//______________________________________________________________________________ +// +// Identity Activation Function +//______________________________________________________________________________ + +/*! Test application of identity function to matrix. */ +//______________________________________________________________________________ +template <typename Architecture> +auto testIdentity(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> ARef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef); + + evaluate<Architecture>(AArch, EActivationFunction::kIdentity); + + TMatrixT<Double_t> A = AArch; + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +/*! Test computation of the first derivative of the identity function. */ +//______________________________________________________________________________ +template <typename Architecture> +auto testIdentityDerivative(size_t ntests) + -> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> ARef(m, n), BRef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef), BArch(BRef); + + evaluateDerivative<Architecture>(BArch, EActivationFunction::kIdentity, AArch); + evaluateDerivative<TReference<Double_t>>(BRef, EActivationFunction::kIdentity, + ARef); + + TMatrixT<Double_t> B = BArch; + Double_t error = maximumRelativeError(B, BRef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +// +// ReLU Activation Function +//______________________________________________________________________________ + +/*! Test application of ReLU function to matrix.
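+ *  Expected element-wise result: ReLU(x) = max(0, x), which is what the lambda applied to the reference matrix below computes.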
*/ +//______________________________________________________________________________ +template <typename Architecture> +auto testRelu(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> ARef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef); + + evaluate<Architecture>(AArch, EActivationFunction::kRelu); + applyMatrix(ARef, [](double x){return x < 0.0 ? 0.0 : x;}); + + TMatrixT<Double_t> A = AArch; + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +/*! Test computation of the first derivative of the ReLU function. */ +//______________________________________________________________________________ +template <typename Architecture> +auto testReluDerivative(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> ARef(m, n), BRef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef), BArch(BRef); + + evaluateDerivative<Architecture>(BArch, EActivationFunction::kRelu, AArch); + applyMatrix(ARef, [](double x){return x > 0.0 ? 1.0 : 0.0;}); + + TMatrixT<Double_t> B = BArch; + Double_t error = maximumRelativeError(B, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +// +// Sigmoid Activation Function +//______________________________________________________________________________ + +/*! Test application of Sigmoid function to matrix. */ +//______________________________________________________________________________ +template <typename Architecture> +auto testSigmoid(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> ARef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef); + + evaluate<Architecture>(AArch, EActivationFunction::kSigmoid); + applyMatrix(ARef, [](double x){return 1.0 / (1.0 + std::exp(-x));}); + + TMatrixT<Double_t> A = AArch; + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +/*! Test computation of the first derivative of the sigmoid function.
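+ *  The expected derivative is sig(x) * (1 - sig(x)) with sig(x) = 1 / (1 + exp(-x)), matching the lambda applied to the reference matrix below.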
*/ +//______________________________________________________________________________ +template <typename Architecture> +auto testSigmoidDerivative(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> ARef(m, n), BRef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef), BArch(BRef); + + evaluateDerivative<Architecture>(BArch, EActivationFunction::kSigmoid, AArch); + applyMatrix(ARef, [](Double_t x){ + Double_t sig = 1.0 / (1.0 + std::exp(-x)); + return sig * (1.0 - sig); + }); + + TMatrixT<Double_t> B = BArch; + Double_t error = maximumRelativeError(B, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +// +// Tanh Activation Function +//______________________________________________________________________________ + +/*! Test application of tanh function to matrix. */ +//______________________________________________________________________________ +template <typename Architecture> +auto testTanh(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> ARef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef); + + evaluate<Architecture>(AArch, EActivationFunction::kTanh); + applyMatrix(ARef, [](double x){return tanh(x);}); + + TMatrixT<Double_t> A = AArch; + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +/*! Test computation of the first derivative of the tanh function. */ +//______________________________________________________________________________ +template <typename Architecture> +auto testTanhDerivative(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> ARef(m, n), BRef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef), BArch(BRef); + + evaluateDerivative<Architecture>(BArch, EActivationFunction::kTanh, AArch); + applyMatrix(ARef, [](Double_t x){ + Double_t t = tanh(x); + return 1 - t * t; + }); + + TMatrixT<Double_t> B = BArch; + Double_t error = maximumRelativeError(B, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +// +// Symmetric ReLU Activation Function +//______________________________________________________________________________ + +/*! Test application of symmetric ReLU function to matrix. 
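+ *  The symmetric ReLU tested here is the absolute value |x|, as applied by the lambda below.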
*/ +//______________________________________________________________________________ +template <typename Architecture> +auto testSymmetricRelu(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> ARef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef); + + evaluate<Architecture>(AArch, EActivationFunction::kSymmRelu); + applyMatrix(ARef, [](double x){return fabs(x);}); + + TMatrixT<Double_t> A = AArch; + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +/*! Test computation of the first derivative of the symmetric ReLU function. */ +//______________________________________________________________________________ +template <typename Architecture> +auto testSymmetricReluDerivative(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> ARef(m, n), BRef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef), BArch(BRef); + + evaluateDerivative<Architecture>(BArch, EActivationFunction::kSymmRelu, AArch); + applyMatrix(ARef, [](Double_t x){ + return (x < 0) ? -1.0 : 1.0; + }); + + TMatrixT<Double_t> B = BArch; + Double_t error = maximumRelativeError(B, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +// +// Soft Sign Activation Function +//______________________________________________________________________________ + +/*! Test application of symmetric soft sign function to matrix. */ +//______________________________________________________________________________ +template <typename Architecture> +auto testSoftSign(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> ARef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef); + + evaluate<Architecture>(AArch, EActivationFunction::kSoftSign); + applyMatrix(ARef, [](double x){return x / (1 + fabs(x));}); + + TMatrixT<Double_t> A = AArch; + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +/*! Test computation of the first derivative of the soft sign function. 
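+ *  Soft sign is x / (1 + |x|); its derivative 1 / (1 + |x|)^2 is what the lambda below applies to the reference matrix.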
*/ +//______________________________________________________________________________ +template <typename Architecture> +auto testSoftSignDerivative(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> ARef(m, n), BRef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef), BArch(BRef); + + evaluateDerivative<Architecture>(BArch, EActivationFunction::kSoftSign, AArch); + applyMatrix(ARef, [](Double_t x){ + Double_t y = 1 + fabs(x); + return 1.0 / (y * y); + }); + + TMatrixT<Double_t> B = BArch; + Double_t error = maximumRelativeError(B, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +// +// Gauss Activation Functions +//______________________________________________________________________________ + +/*! Test application of Gauss activation function to matrix. */ +//______________________________________________________________________________ +template <typename Architecture> +auto testGauss(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> ARef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef); + + evaluate<Architecture>(AArch, EActivationFunction::kGauss); + applyMatrix(ARef, [](double x){return exp(- x * x);}); + + TMatrixT<Double_t> A = AArch; + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +/*! Test computation of the first derivative of the Gauss activation function. */ +//______________________________________________________________________________ +template <typename Architecture> +auto testGaussDerivative(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> ARef(m, n), BRef(m, n); + randomMatrix(ARef); + Matrix_t AArch(ARef), BArch(BRef); + + evaluateDerivative<Architecture>(BArch, EActivationFunction::kGauss, AArch); + applyMatrix(ARef, [](Double_t x){return -2.0 * x * exp(- x * x);}); + + TMatrixT<Double_t> B = BArch; + Double_t error = maximumRelativeError(B, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} +#endif diff --git a/tmva/tmva/test/DNN/TestActivationFunctionsCpu.cxx b/tmva/tmva/test/DNN/TestActivationFunctionsCpu.cxx new file mode 100644 index 0000000000000000000000000000000000000000..998f8bc747d46f5a4e7dd67395f08bc3dec0079e --- /dev/null +++ b/tmva/tmva/test/DNN/TestActivationFunctionsCpu.cxx @@ -0,0 +1,131 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. 
* + *************************************************************************/ + +////////////////////////////////////////////////////////////////////// +// Concrete instantiation of the generic activation function test // +// for the multi-threaded CPU implementation. // +////////////////////////////////////////////////////////////////////// + +#include <iostream> +#include "TMVA/DNN/Architectures/Cpu.h" +#include "Utility.h" +#include "TestActivationFunctions.h" + +using namespace TMVA::DNN; + +int main() +{ + using Scalar_t = Double_t; + + std::cout << "Testing Activation Functions:" << std::endl; + + double error; + + // Identity. + + error = testIdentity<TCpu<Scalar_t>>(10); + std::cout << "Testing identity activation: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + error = testIdentityDerivative<TCpu<Scalar_t>>(10); + std::cout << "Testing identity activation derivative: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + // ReLU. + + error = testRelu<TCpu<Scalar_t>>(10); + std::cout << "Testing ReLU activation: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + error = testReluDerivative<TCpu<Scalar_t>>(10); + std::cout << "Testing ReLU activation derivative: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + // Sigmoid. + + error = testSigmoid<TCpu<Scalar_t>>(10); + std::cout << "Testing Sigmoid activation: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + error = testSigmoidDerivative<TCpu<Scalar_t>>(10); + std::cout << "Testing Sigmoid activation derivative: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + // TanH. + + error = testTanh<TCpu<Scalar_t>>(10); + std::cout << "Testing TanH activation: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + error = testTanhDerivative<TCpu<Scalar_t>>(10); + std::cout << "Testing TanH activation derivative: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + // Symmetric ReLU. + + error = testSymmetricRelu<TCpu<Scalar_t>>(10); + std::cout << "Testing Symm. ReLU activation: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + error = testSymmetricReluDerivative<TCpu<Scalar_t>>(10); + std::cout << "Testing Symm. ReLU activation derivative: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + // Soft Sign. + + error = testSoftSign<TCpu<Scalar_t>>(10); + std::cout << "Testing Soft Sign activation: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + error = testSoftSignDerivative<TCpu<Scalar_t>>(10); + std::cout << "Testing Soft Sign activation derivative: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + // Gauss. 
+ + error = testGauss<TCpu<Scalar_t>>(10); + std::cout << "Testing Gauss activation: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + error = testGaussDerivative<TCpu<Scalar_t>>(10); + std::cout << "Testing Gauss activation derivative: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-10) + return 1; + + return 0; +} diff --git a/tmva/tmva/test/DNN/TestActivationFunctionsCuda.cxx b/tmva/tmva/test/DNN/TestActivationFunctionsCuda.cxx new file mode 100644 index 0000000000000000000000000000000000000000..adcb765098cb50a2c065ed17016abc24ed4df18c --- /dev/null +++ b/tmva/tmva/test/DNN/TestActivationFunctionsCuda.cxx @@ -0,0 +1,74 @@ +// @(#)root/tmva/tmva/test/dnn $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////////// +// Concrete instantiation of the generic activation function test // +// for the TCuda implementation. // +////////////////////////////////////////////////////////////////////// + +#include <iostream> +#include "TMVA/DNN/Architectures/Cuda.h" +#include "Utility.h" +#include "TestActivationFunctions.h" + +using namespace TMVA::DNN; + +int main() +{ + using Scalar_t = Double_t; + + std::cout << "Testing Activation Functions:" << std::endl; + + double error; + + // Identity. + + error = testIdentity<TCuda<Scalar_t>>(10); + std::cout << "Testing identity activation: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-5) + return 1; + + error = testIdentityDerivative<TCuda<Scalar_t>>(10); + std::cout << "Testing identity activation derivative: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-5) + return 1; + + // ReLU. + + error = testRelu<TCuda<Scalar_t>>(10); + std::cout << "Testing ReLU activation: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-5) + return 1; + + error = testReluDerivative<TCuda<Scalar_t>>(10); + std::cout << "Testing ReLU activation derivative: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-5) + return 1; + + // Sigmoid. + + error = testSigmoid<TCuda<Scalar_t>>(10); + std::cout << "Testing Sigmoid activation: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-5) + return 1; + + error = testSigmoidDerivative<TCuda<Scalar_t>>(10); + std::cout << "Testing Sigmoid activation derivative: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-5) + return 1; + return 0; +} diff --git a/tmva/tmva/test/DNN/TestBackpropagation.cxx b/tmva/tmva/test/DNN/TestBackpropagation.cxx new file mode 100644 index 0000000000000000000000000000000000000000..391aae7e3cd23a5ff17903c1d96b4f8301afc257 --- /dev/null +++ b/tmva/tmva/test/DNN/TestBackpropagation.cxx @@ -0,0 +1,50 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. 
* + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////////////////// +// Concrete instantiation of the generic backpropagation test for // +// the reference architecture. // +//////////////////////////////////////////////////////////////////// + +#include <iostream> +#include "TMVA/DNN/Architectures/Reference.h" +#include "TestBackpropagation.h" + +using namespace TMVA::DNN; + +int main() +{ + std::cout << "Testing Backpropagation:" << std::endl; + + double error; + + // + // Test backpropagation for linear net. + // + + error = testBackpropagationWeightsLinear<TReference<double>>(1.0); + if (error > 1e-3) + return 1; + + error = testBackpropagationL1Regularization<TReference<double>>(1e-2); + if (error > 1e-3) + return 1; + + error = testBackpropagationL2Regularization<TReference<double>>(1.0); + if (error > 1e-3) + return 1; + + error = testBackpropagationBiasesLinear<TReference<double>>(1.0); + if (error > 1e-3) + return 1; + + return 0; +} diff --git a/tmva/tmva/test/DNN/TestBackpropagation.h b/tmva/tmva/test/DNN/TestBackpropagation.h new file mode 100644 index 0000000000000000000000000000000000000000..0988764ee21f781b8dd37b364c2d25d61cb1cc46 --- /dev/null +++ b/tmva/tmva/test/DNN/TestBackpropagation.h @@ -0,0 +1,361 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////////////////// +// Generic tests of the backpropagation algorithm. // +// // +// All tests randomly generate a net with identity activation // +// functions, i.e. which is completely linear and then tests the // +// computed gradients for each layer using numerical // +// derivation. The restriction to linear nets is to avoid the // +// required division by the finite difference interval used to // +// approximate the numerical derivatives, which would otherwise // +// cause precision loss. // +//////////////////////////////////////////////////////////////////// + +#include <iostream> +#include "TMVA/DNN/Functions.h" +#include "TMVA/DNN/Net.h" +#include "Utility.h" + +using namespace TMVA::DNN; + +/*! Compute the loss of the net as a function of the weight at index (i,j) in + * layer l. dx is added as an offset to the current value of the weight. */ +//______________________________________________________________________________ +template <typename Architecture> +auto evaluate_net_weight(TNet<Architecture> &net, + typename Architecture::Matrix_t &X, + const typename Architecture::Matrix_t &Y, + size_t l, + size_t i, + size_t j, + typename Architecture::Scalar_t dx) + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + + net.GetLayer(l).GetWeights().operator()(i,j) += dx; + Scalar_t res = net.Loss(X, Y); + net.GetLayer(l).GetWeights().operator()(i,j) -= dx; + return res; +} + +/*! Compute the loss of the net as a function of the weight at index i in + * layer l. dx is added as an offset to the current value of the weight. 
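+ *  This is the bias-term analogue of evaluate_net_weight above: the offset dx is applied to bias i of layer l before the loss is evaluated.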
*/ +//______________________________________________________________________________ +template <typename Architecture> +auto evaluate_net_bias(TNet<Architecture> &net, + typename Architecture::Matrix_t &X, + const typename Architecture::Matrix_t &Y, + size_t l, + size_t i, + typename Architecture::Scalar_t dx) + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + + net.GetLayer(l).GetBiases().operator()(i,0) += dx; + Scalar_t res = net.Loss(X, Y); + net.GetLayer(l).GetBiases().operator()(i,0) -= dx; + return res; +} + +/*! Generate a random net, perform forward and backward propagation and check + * the weight gradients using numerical differentiation. Returns the maximum + * relative gradient error and also prints it to stdout. */ +//______________________________________________________________________________ +template <typename Architecture> +auto testBackpropagationWeightsLinear(typename Architecture::Scalar_t dx) +-> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + using Net_t = TNet<Architecture>; + + + Net_t net(50, 50, ELossFunction::kMeanSquaredError); + + // Random net. + constructRandomLinearNet(net); + net.Initialize(EInitialization::kGauss); + + // Random training data. + Matrix_t X(50, 50); + randomBatch(X); + + Matrix_t Y(50, net.GetOutputWidth()); + randomMatrix(Y); + + net.Forward(X); + net.Backward(X,Y); + + Scalar_t maximum_error = 0.0; + + // Compute derivatives for all weights using finite differences and + // compare to result obtained from backpropagation. + for (size_t l = 0; l < net.GetDepth(); l++) + { + std::cout << "\rTesting weight gradients: layer: " + << l << " / " << net.GetDepth(); + std::cout << std::flush; + auto & layer = net.GetLayer(l); + auto & W = layer.GetWeightGradients(); + + for (size_t i = 0; i < layer.GetWidth(); i++) + { + for (size_t j = 0; j < layer.GetInputWidth(); j++) + { + auto f = [& net, & X, &Y, l, i, j](Scalar_t x) + { + return evaluate_net_weight(net, X, Y, l, i, j, x); + }; + Scalar_t dy = finiteDifference(f, dx) / (2.0 * dx); + Scalar_t dy_ref = W(i,j); + + // Compute the relative error if dy != 0. + Scalar_t error; + if (std::fabs(dy_ref) > 1e-15) + { + error = std::fabs((dy - dy_ref) / dy_ref); + } + else + { + error = std::fabs(dy - dy_ref); + } + + maximum_error = std::max(error, maximum_error); + } + } + } + + std::cout << "\rTesting weight gradients: "; + std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl; + return maximum_error; +} + +/*! Generate a random, linear net, perform forward and backward propagation with + * L1 regularization and check the weight gradients using numerical + * differentiation. Returns the maximum relative gradient error and + * also prints it to stdout. */ +//______________________________________________________________________________ +template <typename Architecture> +auto testBackpropagationL1Regularization(typename Architecture::Scalar_t dx) +-> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + using Net_t = TNet<Architecture>; + + Net_t net(50, 50, ELossFunction::kMeanSquaredError, ERegularization::kL1, 0.1); + + // Random net. + constructRandomLinearNet(net); + net.Initialize(EInitialization::kGauss); + + // Random training data. 
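+   // A batch of 50 events with 50 input features each; the target matrix is sized to the net's output width.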
+ Matrix_t X(50, 50); + randomBatch(X); + + Matrix_t Y(50, net.GetOutputWidth()); + randomMatrix(Y); + + net.Forward(X); + net.Backward(X,Y); + + Scalar_t maximum_error = 0.0; + + // Compute derivatives for all weights using finite differences and + // compare to result obtained from backpropagation. + for (size_t l = 0; l < net.GetDepth(); l++) + { + std::cout << "\rTesting weight gradients (L1): layer: " + << l << " / " << net.GetDepth(); + std::cout << std::flush; + auto & layer = net.GetLayer(l); + auto & W = layer.GetWeights(); + auto & dW = layer.GetWeightGradients(); + + for (size_t i = 0; i < layer.GetWidth(); i++) { + for (size_t j = 0; j < layer.GetInputWidth(); j++) { + // Avoid running into the non-derivable point at 0.0. + if (std::abs(W(i,j)) > dx) { + auto f = [& net, & X, &Y, l, i, j](Scalar_t x) + { + return evaluate_net_weight(net, X, Y, l, i, j, x); + }; + Scalar_t dy = finiteDifference(f, dx) / (2.0 * dx); + Scalar_t dy_ref = dW(i,j); + + // Compute the relative error if dy != 0. + Scalar_t error; + if (std::fabs(dy_ref) > 1e-15) + { + error = std::fabs((dy - dy_ref) / dy_ref); + } + else + { + error = std::fabs(dy - dy_ref); + } + + maximum_error = std::max(error, maximum_error); + } + } + } + } + + std::cout << "\rTesting weight gradients (L1): "; + std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl; + return maximum_error; +} + +/*! Generate a random, linear net, perform forward and backward propagation with + * L2 regularization and check the weight gradients using numerical + * differentiation. Returns the maximum relative gradient error and + * also prints it to stdout. */ +//______________________________________________________________________________ +template <typename Architecture> +auto testBackpropagationL2Regularization(typename Architecture::Scalar_t dx) +-> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + using Net_t = TNet<Architecture>; + + Net_t net(50, 50, ELossFunction::kMeanSquaredError, ERegularization::kL2, 0.1); + + // Random net. + constructRandomLinearNet(net); + net.Initialize(EInitialization::kGauss); + + // Random training data. + Matrix_t X(50, 50); + randomBatch(X); + + Matrix_t Y(50, net.GetOutputWidth()); + randomMatrix(Y); + + net.Forward(X); + net.Backward(X,Y); + + Scalar_t maximum_error = 0.0; + + // Compute derivatives for all weights using finite differences and + // compare to result obtained from backpropagation. + for (size_t l = 0; l < net.GetDepth(); l++) + { + std::cout << "\rTesting weight gradients (L2): layer: " + << l << " / " << net.GetDepth(); + std::cout << std::flush; + auto & layer = net.GetLayer(l); + auto & W = layer.GetWeightGradients(); + + for (size_t i = 0; i < layer.GetWidth(); i++) + { + for (size_t j = 0; j < layer.GetInputWidth(); j++) + { + auto f = [& net, & X, &Y, l, i, j](Scalar_t x) + { + return evaluate_net_weight(net, X, Y, l, i, j, x); + }; + Scalar_t dy = finiteDifference(f, dx) / (2.0 * dx); + Scalar_t dy_ref = W(i,j); + + // Compute the relative error if dy != 0. + Scalar_t error; + if (std::fabs(dy_ref) > 1e-15) + { + error = std::fabs((dy - dy_ref) / dy_ref); + } + else + { + error = std::fabs(dy - dy_ref); + } + + maximum_error = std::max(error, maximum_error); + } + } + } + + std::cout << "\rTesting weight gradients (L2): "; + std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl; + return maximum_error; +} + +/*! 
Generate a random net, perform forward and backward propagation and check + * the bias gradients using numerical differentiation. Returns the maximum + * relative gradient error and also prints it to stdout. */ +//______________________________________________________________________________ +template <typename Architecture> +auto testBackpropagationBiasesLinear(typename Architecture::Scalar_t dx) +-> typename Architecture::Scalar_t +{ + using Net_t = TNet<Architecture>; + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + + Net_t net(50, 50, ELossFunction::kMeanSquaredError); + + // Random net. + constructRandomLinearNet(net); + net.Initialize(EInitialization::kGauss); + + // Random training data. + Matrix_t X(50, 50); + randomBatch(X); + + Matrix_t Y(50, net.GetOutputWidth()); + randomMatrix(Y); + + net.Forward(X); + net.Backward(X,Y); + + Scalar_t maximum_error = 0.0; + + // Compute derivatives for all bias terms using finite differences and + // compare to result obtained from backpropagation. + for (size_t l = 0; l < net.GetDepth(); l++) + { + std::cout << "\rTesting bias gradients: layer: " + << l << " / " << net.GetDepth(); + std::cout << std::flush; + auto & layer = net.GetLayer(l); + auto & dtheta = layer.GetBiasGradients(); + + for (size_t i = 0; i < layer.GetWidth(); i++) + { + auto f = [& net, & X, &Y, l, i](Scalar_t x) + { + return evaluate_net_bias(net, X, Y, l, i, x); + }; + Scalar_t dy = finiteDifference(f, dx); + Scalar_t dy_ref = dtheta(i,0) * 2.0 * dx; + + // Compute the relative error if dy != 0. + Scalar_t error; + if (std::fabs(dy_ref) > 1e-10) + { + error = std::fabs((dy - dy_ref) / dy_ref); + } + else + { + error = std::fabs(dy - dy_ref); + } + + maximum_error = std::max(error, maximum_error); + } + } + + std::cout << "\rTesting bias gradients: "; + std::cout << "maximum relative error: " << print_error(maximum_error) << std::endl; + return maximum_error; +} diff --git a/tmva/tmva/test/DNN/TestBackpropagationCpu.cxx b/tmva/tmva/test/DNN/TestBackpropagationCpu.cxx new file mode 100644 index 0000000000000000000000000000000000000000..c44405b4f6a786561cd0d5500055570f9ebe2abc --- /dev/null +++ b/tmva/tmva/test/DNN/TestBackpropagationCpu.cxx @@ -0,0 +1,47 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////////////////// +// Concrete instantiation of the generic backpropagation test for // +// multi-threaded CPU architectures. 
// +//////////////////////////////////////////////////////////////////// + +#include "TMatrix.h" +#include "TMVA/DNN/Architectures/Cpu.h" +#include "TestBackpropagation.h" + +using namespace TMVA::DNN; + +int main() +{ + using Scalar_t = Double_t; + std::cout << "Testing Backpropagation:" << std::endl; + + double error; + + error = testBackpropagationWeightsLinear<TCpu<Scalar_t>>(1.0); + if (error > 1e-3) + return 1; + + error = testBackpropagationL1Regularization<TCpu<Scalar_t>>(1e-2); + if (error > 1e-3) + return 1; + + error = testBackpropagationL2Regularization<TCpu<Scalar_t>>(1.0); + if (error > 1e-3) + return 1; + + error = testBackpropagationBiasesLinear<TCpu<Scalar_t>>(1.0); + if (error > 1e-3) + return 1; + + return 0; +} diff --git a/tmva/tmva/test/DNN/TestBackpropagationCuda.cxx b/tmva/tmva/test/DNN/TestBackpropagationCuda.cxx new file mode 100644 index 0000000000000000000000000000000000000000..55178a612a22bd005a7abbfb47f3cc5b6404aeb6 --- /dev/null +++ b/tmva/tmva/test/DNN/TestBackpropagationCuda.cxx @@ -0,0 +1,43 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////////////////////// +// Concrete instantiation of the generic backpropagation test for // +// CUDA architectures. // +//////////////////////////////////////////////////////////////////// + +#include <iostream> +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMatrix.h" +#include "TestBackpropagation.h" + +using namespace TMVA::DNN; + +int main() +{ + using Scalar_t = Double_t; + + std::cout << "Testing Backpropagation:" << std::endl; + double error; + error = testBackpropagationWeightsLinear<TCuda<Scalar_t>>(1.0); + if (error > 1e-3) + return 1; + error = testBackpropagationL1Regularization<TCuda<Scalar_t>>(1e-2); + if (error > 1e-3) + return 1; + error = testBackpropagationL2Regularization<TCuda<Scalar_t>>(1.0); + if (error > 1e-3) + return 1; + error = testBackpropagationBiasesLinear<TCuda<Scalar_t>>(1.0); + if (error > 1e-3) + return 1; + return 0; +} diff --git a/tmva/tmva/test/DNN/TestCuda.cxx b/tmva/tmva/test/DNN/TestCuda.cxx new file mode 100644 index 0000000000000000000000000000000000000000..46c4597f979a7c57f8646a9a1fb4c58153bce2c6 --- /dev/null +++ b/tmva/tmva/test/DNN/TestCuda.cxx @@ -0,0 +1,196 @@ +#include "Utility.h" +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMVA/DNN/Architectures/Reference.h" +#include <stdlib.h> + +using namespace TMVA::DNN; + +//_________________________________________________________________________________ +Double_t testMultiply() +{ + const size_t ntests = 100; + + Double_t maximumError = 0; + + for (size_t i = 0; i < ntests; i++) { + size_t m, n, k; + m = rand() % 50 + 1; + n = rand() % 50 + 1; + k = rand() % 50 + 1; + + TMatrixT<Double_t> A(m,k), AT(k,m) , B(k,n), BT(n,k), C(m,n); + randomMatrix(A); + randomMatrix(AT); + randomMatrix(B); + randomMatrix(BT); + TCudaMatrix ACuda(A), ATCuda(AT), BCuda(B), BTCuda(BT), CCuda(C); + + TReference<Double_t>::MultiplyTranspose(C, A, BT); + TCuda<false>::MultiplyTranspose(CCuda, ACuda, BTCuda); + TMatrixT<Double_t> CRef(CCuda); + Double_t error = maximumRelativeError(C, CRef); + maximumError = std::max(error, maximumError); + + 
C.Mult(A,B); + TCuda<false>::Multiply(CCuda, ACuda, BCuda); + CRef = CCuda; + error = maximumRelativeError(C, CRef); + maximumError = std::max(error, maximumError); + + C.TMult(AT,B); + TCuda<false>::TransposeMultiply(CCuda, ATCuda, BCuda); + CRef = CCuda; + error = maximumRelativeError(C, CRef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//_________________________________________________________________________________ +Double_t testAddRowWise() +{ + const size_t ntests = 10; + + Double_t maximumError = 0; + + for (size_t i = 0; i < ntests; i++) { + size_t m, n; + m = rand() % 50 + 1; + n = rand() % 50 + 1; + + TMatrixT<Double_t> A(m,n), B(m,n), theta(n,1); + //randomMatrix(A); + randomMatrix(theta); + TCudaMatrix ACuda(A), BCuda(B), thetaCuda(theta); + + TReference<Double_t>::AddRowWise(A, theta); + TCuda<false>::AddRowWise(ACuda,thetaCuda); + TMatrixT<Double_t> ARef(ACuda); + + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//_________________________________________________________________________________ +Double_t testHadamard() +{ + const size_t ntests = 10; + Double_t maximumError = 0; + + for (size_t i = 0; i < ntests; i++) { + size_t m, n; + m = rand() % 10 + 1; + n = rand() % 10 + 1; + + TMatrixT<Double_t> A(m,n), B(m,n); + randomMatrix(A); + randomMatrix(B); + TCudaMatrix ACuda(A), BCuda(B); + + for (size_t j = 0; j < (size_t) A.GetNrows(); j++) { + for (size_t k = 0; k < (size_t) A.GetNcols(); k++) { + A(j,k) *= B(j,k); + } + } + + TCuda<false>::Hadamard(ACuda, BCuda); + TMatrixT<Double_t> ARef(ACuda); + Double_t error = maximumRelativeError(A, ARef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//_________________________________________________________________________________ +Double_t testReduction() +{ + const size_t ntests = 10; + Double_t maximumError = 0; + + for (size_t i = 0; i < ntests; i++) { + size_t m, n; + m = rand() % 1000 + 1; + n = rand() % 1000 + 1; + + TMatrixT<Double_t> A(m,n); + + for (size_t j = 0; j < m; j++) { + for (size_t k = 0; k < n; k++) { + A(j,k) = 1.0; + } + } + TCudaMatrix ACuda(A); + + TCudaMatrix BCuda(1,n); + TCuda<false>::InitializeZero(BCuda); + Double_t s = TCuda<false>::Sum(A); + TCuda<false>::SumColumns(BCuda, ACuda); + TMatrixT<Double_t> B(BCuda); + + Double_t error = s - ((Double_t) m * n); + maximumError = std::max(error, maximumError); + + for (size_t j = 0; j < n; j++) { + //std::cout << B(0,j) << " / " << j * m << std::endl; + error = std::abs(B(0,j) - m); + maximumError = std::max(error, maximumError); + } + } + return maximumError; +} + +//_________________________________________________________________________________ +Double_t testScaleAdd() +{ + const size_t ntests = 10; + Double_t maximumError = 0; + + for (size_t i = 0; i < ntests; i++) { + size_t m, n; + m = rand() % 1000 + 1; + n = rand() % 1000 + 1; + + TMatrixT<Double_t> A(m,n), B(m,n); + + randomMatrix(A); + randomMatrix(B); + + TCudaMatrix ACuda(A); + TCudaMatrix BCuda(B); + + Double_t beta = ((Double_t) rand()) / ((Double_t) RAND_MAX); + TReference<Double_t>::ScaleAdd(A, B, beta); + TCuda<false>::ScaleAdd(ACuda, BCuda, beta); + + Double_t error = maximumRelativeError(A, (TMatrixT<Double_t>) ACuda); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//_________________________________________________________________________________ +int main() +{ + Double_t error; + error = 
testReduction(); + std::cout << "Testing reduction: max. rel. error = "; + std::cout << error << std::endl; + + error = testScaleAdd(); + std::cout << "Testing scale_add: max. rel. error = "; + std::cout << error << std::endl; + + error = testHadamard(); + std::cout << "Testing hadamard: max. rel. error = "; + std::cout << error << std::endl; + + error = testMultiply(); + std::cout << "Testing multiplication: max. rel. error = "; + std::cout << error << std::endl; + + error = testAddRowWise(); + std::cout << "Testing add_row_wise: max. rel. error = "; + std::cout << error << std::endl; +} diff --git a/tmva/tmva/test/DNN/TestDataLoader.cxx b/tmva/tmva/test/DNN/TestDataLoader.cxx new file mode 100644 index 0000000000000000000000000000000000000000..283e94a03ddb3643f991a01950041489d27c641a --- /dev/null +++ b/tmva/tmva/test/DNN/TestDataLoader.cxx @@ -0,0 +1,26 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 12/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +//////////////////////////////////////////////////// +// Test the reference data loader implementation. // +//////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Reference.h" +#include "TestDataLoader.h" + +using namespace TMVA::DNN; + +int main () +{ + Double_t error = testIdentity<TReference<Double_t>>(); + std::cout << "Testing reference data loader: Max. rel. error = " << error; + std::cout << std::endl; +} diff --git a/tmva/tmva/test/DNN/TestDataLoader.h b/tmva/tmva/test/DNN/TestDataLoader.h new file mode 100644 index 0000000000000000000000000000000000000000..4a1f901204589ff964a159d81de072a3498535bd --- /dev/null +++ b/tmva/tmva/test/DNN/TestDataLoader.h @@ -0,0 +1,93 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 12/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////// +// Generic test for DataLoader implementations. // +////////////////////////////////////////////////// + +#include "TMVA/DNN/Net.h" +#include "TMVA/DNN/DataLoader.h" +#include "Utility.h" + +namespace TMVA +{ +namespace DNN +{ + +/** Test that the data loader loads all data in the data set by summing + * up all elements batch wise and comparing to the result over the complete + * data set.
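+ *  The relative deviation between the batch-wise sum and the sum over the full sample is returned.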
*/ +//______________________________________________________________________________ +template <typename Architecture_t> +auto testSum() + -> typename Architecture_t::Scalar_t +{ + using Scalar_t = typename Architecture_t::Scalar_t; + using Matrix_t = typename Architecture_t::Matrix_t; + using DataLoader_t = TDataLoader<MatrixInput_t, Architecture_t>; + + size_t nSamples = 10000; + TMatrixT<Double_t> X(nSamples,1); + randomMatrix(X); + for (size_t i = 0; i < 10000; i++) { + X(i,0) = i; + } + MatrixInput_t input(X, X); + DataLoader_t loader(input, nSamples, 5, 1, 1); + + Matrix_t XArch(X), Sum(1,1), SumTotal(1,1); + Scalar_t sum = 0.0, sumTotal = 0.0; + + for (auto b : loader) { + Architecture_t::SumColumns(Sum, b.GetInput()); + sum += Sum(0, 0); + } + + Architecture_t::SumColumns(SumTotal, XArch); + sumTotal = SumTotal(0,0); + + return fabs(sumTotal - sum) / sumTotal; +} + +/** Test the data loader by loading identical input and output data, running it + * through an identity neural network and computing the mean squared error. + * Should obviously be zero. */ +//______________________________________________________________________________ +template <typename Architecture_t> +auto testIdentity() + -> typename Architecture_t::Scalar_t +{ + using Scalar_t = typename Architecture_t::Scalar_t; + using Net_t = TNet<Architecture_t>; + using DataLoader_t = TDataLoader<MatrixInput_t, Architecture_t>; + + TMatrixT<Double_t> X(2000, 100); randomMatrix(X); + MatrixInput_t input(X, X); + DataLoader_t loader(input, 2000, 20, 100, 100); + + Net_t net(20, 100, ELossFunction::kMeanSquaredError); + net.AddLayer(100, EActivationFunction::kIdentity); + net.AddLayer(100, EActivationFunction::kIdentity); + net.Initialize(EInitialization::kIdentity); + + Scalar_t maximumError = 0.0; + for (auto b : loader) { + auto inputMatrix = b.GetInput(); + auto outputMatrix = b.GetOutput(); + Scalar_t error = net.Loss(inputMatrix, outputMatrix); + maximumError = std::max(error, maximumError); + } + + return maximumError; +} + +} // namespace DNN +} // namespace TMVA diff --git a/tmva/tmva/test/DNN/TestDataLoaderCpu.cxx b/tmva/tmva/test/DNN/TestDataLoaderCpu.cxx new file mode 100644 index 0000000000000000000000000000000000000000..8e4a787682165546943f76077241390fac667ad4 --- /dev/null +++ b/tmva/tmva/test/DNN/TestDataLoaderCpu.cxx @@ -0,0 +1,39 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 21/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +///////////////////////////////////////////////////////////// +// Test the multi-threaded CPU data loader implementation.
// +///////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cpu.h" +#include "TestDataLoader.h" + +using namespace TMVA::DNN; + +int main () +{ + using Scalar_t = Real_t; + + std::cout << "Testing data loader:" << std::endl; + + Scalar_t maximumError = 0.0; + + Scalar_t error = testSum<TCpu<Scalar_t>>(); + std::cout << "Sum: Maximum relative error = " << error << std::endl; + maximumError = std::max(error, maximumError); + error = testIdentity<TCpu<Scalar_t>>(); + std::cout << "Identity: Maximum relative error = " << error << std::endl; + maximumError = std::max(error, maximumError); + + if (maximumError > 1e-3) { + return 1; + } +} diff --git a/tmva/tmva/test/DNN/TestDataLoaderCuda.cxx b/tmva/tmva/test/DNN/TestDataLoaderCuda.cxx new file mode 100644 index 0000000000000000000000000000000000000000..38efb25427234d1c9aac1bf7dd0fce103d6cc89d --- /dev/null +++ b/tmva/tmva/test/DNN/TestDataLoaderCuda.cxx @@ -0,0 +1,45 @@ +// @(#)root/tmva/tmva/dnn:$Id$ +// Author: Simon Pfreundschuh 08/08/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////// +// Test the generic data loader for the CUDA implementation. // +/////////////////////////////////////////////////////////////// + +#include <iostream> +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TestDataLoader.h" + +using namespace TMVA::DNN; + +int main() +{ + std::cout << "Testing data loader:" << std::endl; + using Scalar_t = Real_t; + + Scalar_t maximumError = 0.0; + + Scalar_t error = testSum<TCuda<Scalar_t>>(); + std::cout << "Sum: Maximum relative error = " << error << std::endl; + maximumError = std::max(error, maximumError); + error = testIdentity<TCuda<Scalar_t>>(); + std::cout << "Identity: Maximum relative error = " << error << std::endl; + maximumError = std::max(error, maximumError); + + if (maximumError > 1e-3) { + return 1; + } + return 0; +} + + + + + diff --git a/tmva/tmva/test/DNN/TestDerivatives.cxx b/tmva/tmva/test/DNN/TestDerivatives.cxx new file mode 100644 index 0000000000000000000000000000000000000000..3a3e742469a692c1025fec6f6db05f403a5148f1 --- /dev/null +++ b/tmva/tmva/test/DNN/TestDerivatives.cxx @@ -0,0 +1,65 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////// +// Concrete instantiation of the generic derivative test for the // +// reference implementation. 
//
+///////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TestDerivatives.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+
+   double error;
+
+   //
+   // Activation Functions
+   //
+
+   std::cout << "Activation Functions:" << std::endl;
+   error = testActivationFunctionDerivatives<TReference<double>>();
+   std::cout << "Total : ";
+   std::cout << "Maximum Relative Error = " << print_error(error);
+   std::cout << std::endl << std::endl;
+   if (error > 1e-5)
+       return 1;
+
+   //
+   // Loss Functions
+   //
+
+   std::cout << "Loss Functions:" << std::endl;
+   error = testLossFunctionGradients<TReference<double>>();
+   std::cout << "Total : ";
+   std::cout << "Maximum Relative Error = " << print_error(error);
+   std::cout << std::endl << std::endl;
+   if (error > 1e-5)
+       return 1;
+
+   //
+   // Regularization Functions
+   //
+
+   std::cout << "Regularization:" << std::endl;
+   error = testRegularizationGradients<TReference<double>>();
+   std::cout << "Total : ";
+   std::cout << "Maximum Relative Error = " << print_error(error);
+   std::cout << std::endl << std::endl;
+   if (error > 1e-5)
+       return 1;
+
+   return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestDerivatives.h b/tmva/tmva/test/DNN/TestDerivatives.h
new file mode 100644
index 0000000000000000000000000000000000000000..7240e23a95183f5b2b95b147894d11d8e625a0dd
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestDerivatives.h
@@ -0,0 +1,253 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////////
+// Generic tests for the derivatives and gradients of activation,   //
+// loss and regularization functions. Each test generates a random  //
+// 10 x 10 matrix and uses a central finite difference to           //
+// numerically compute the derivative of the function w.r.t. a      //
+// single element. The result is compared to the value obtained     //
+// from the corresponding analytic derivative implemented by the    //
+// evaluateDerivative(...), evaluateGradients(...) and              //
+// addRegularizationGradients(...) functions.                       //
+//////////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Functions.h"
+#include "TMVA/DNN/Net.h"
+#include "Utility.h"
+
+using namespace TMVA::DNN;
+
+//______________________________________________________________________________
+//
+// Activation Functions
+//______________________________________________________________________________
+
+/*! Generic function that compares the numerically computed derivative of a
+ *  matrix function f with the analytic derivative provided by df. The function
+ *  signatures are assumed to be
+ *  - void f(Matrix_t &X)
+ *  - void df(Matrix_t &Y, const Matrix_t &X)
+ *  The function f is supposed to apply the corresponding mathematical function
+ *  to each element in the provided matrix X. The function df is expected to
+ *  set each element in Y to the derivative of the corresponding mathematical
+ *  function evaluated at the corresponding element in X.
+ */ +template<typename Architecture, typename F, typename dF> + auto testDerivatives(F f, dF df, + typename Architecture::Scalar_t dx) + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + Scalar_t maximum_error = 0.0; + + for (size_t i = 0; i < 100; i++) + { + Matrix_t X(10,10), Y(10,10); + randomMatrix(Y); + + df(X, Y); + Scalar_t dy = X(0,0); + + copyMatrix(X, Y); + X(0,0) += dx; + f(X); + Scalar_t y1 = X(0,0); + copyMatrix(X, Y); + X(0,0) -= dx; + f(X); + Scalar_t y0 = X(0,0); + Scalar_t dy_num = (y1 - y0) / (2.0 * dx); + Scalar_t error = relativeError(dy_num, dy); + maximum_error = std::max(maximum_error, error); + } + + return maximum_error; +} + +/*! Test derivatives of all activation functions and return the maximum relative + * error. Prints the result for each function to the stdout. */ +//______________________________________________________________________________ +template<typename Architecture> +auto testActivationFunctionDerivatives() + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + // Test only differentiable activation functions. + std::vector<EActivationFunction> EActivationFunctions + = {EActivationFunction::kIdentity, + EActivationFunction::kSigmoid, + EActivationFunction::kTanh, + EActivationFunction::kSoftSign, + EActivationFunction::kGauss}; + + Scalar_t error, maximum_error; + maximum_error = 0.0; + + for (auto & af : EActivationFunctions) + { + auto f = [& af](Matrix_t &X){ evaluate<Architecture>(X, af);}; + auto df = [& af](Matrix_t &X, const Matrix_t &Y) + { + evaluateDerivative<Architecture>(X, af, Y); + }; + error = testDerivatives<Architecture>(f, df, 5e-3); + + std::cout << "Testing " << static_cast<int>(af) << ": "; + std::cout << "Maximum Relative Error = " << error << std::endl; + + maximum_error = std::max(maximum_error, error); + } + + return maximum_error; +} + +//______________________________________________________________________________ +// +// Loss functions. +//______________________________________________________________________________ + +/*! Similar to testDerivatives only that here the mathematical function is + * expected to be a matrix functional, i.e. to be mapping a matrix to a + * scalar value. The scalar value is supposed to be computed by the provided + * function object f, while the function object is just like above. */ +template<typename Architecture, typename F, typename dF> + auto testGradients(F f, dF df, + typename Architecture::Scalar_t dx) + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + Scalar_t maximum_error = 0.0; + + for (size_t i = 0; i < 100; i++) + { + Matrix_t X(10,10), Y(10,10), Z(10,10); + randomMatrix(X); + randomMatrix(Y); + + df(Z, Y, X); + Scalar_t dy = Z(0,0); + + X(0,0) += dx; + Scalar_t y1 = f(Y,X); + X(0,0) -= 2.0 * dx; + Scalar_t y0 = f(Y,X); + Scalar_t dy_num = (y1 - y0) / (2.0 * dx); + + Scalar_t error = 0.0; + if (std::fabs(dy) > 0) + { + error = std::fabs((dy_num - dy) / dy); + } + else + error = dy_num - dy; + + maximum_error = std::max(maximum_error, error); + } + + return maximum_error; +} + +/*! Test gradients of all loss function for the given architecture type and + * return the maximum relative error. Prints results for each function to + * standard out. 
*/ +//______________________________________________________________________________ +template<typename Architecture> +auto testLossFunctionGradients() + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + std::vector<ELossFunction> LossFunctions + = {ELossFunction::kMeanSquaredError, + ELossFunction::kCrossEntropy}; + + Scalar_t error, maximum_error; + maximum_error = 0.0; + + for (auto & lf : LossFunctions) + { + auto f = [lf](const Matrix_t &Y, const Matrix_t &Z) + { + return evaluate<Architecture>(lf, Y, Z); + }; + auto df = [& lf](Matrix_t &X, + const Matrix_t &Y, + const Matrix_t &Z) + { + evaluateGradients<Architecture>(X, lf, Y, Z); + }; + + error = testGradients<Architecture>(f, df, 5e-6); + + std::cout << "Testing " << static_cast<char>(lf) << ": "; + std::cout << "Maximum Relative Error = " << error << std::endl; + + maximum_error = std::max(maximum_error, error); + } + + return maximum_error; +} + +//______________________________________________________________________________ +// +// Regularization. +//______________________________________________________________________________ + +/*! Test the computation of gradients for all differentiable regularization types, + * which is so far only L2 and no regularization and print the results to standard + * out */ +template<typename Architecture> +auto testRegularizationGradients() + -> typename Architecture::Scalar_t +{ + using Scalar_t = typename Architecture::Scalar_t; + using Matrix_t = typename Architecture::Matrix_t; + + std::vector<ERegularization> Regularizations + = {ERegularization::kNone, + ERegularization::kL2}; + + Scalar_t error, maximum_error; + maximum_error = 0.0; + + for (auto & r : Regularizations) + { + auto f = [r](const Matrix_t & , const Matrix_t & Y) + { + return regularization<Architecture>(Y, r); + }; + auto df = [& r](Matrix_t &X, + const Matrix_t & , + const Matrix_t & Y) + { + applyMatrix(X, [](double){return 0.0;}); + addRegularizationGradients<Architecture>(X, Y, (Scalar_t) 1.0, r); + }; + + error = testGradients<Architecture>(f, df, 1.0); + + std::cout << "Testing " << static_cast<char>(r) << ": "; + std::cout << "Maximum Relative Error = " << error << std::endl; + + maximum_error = std::max(maximum_error, error); + } + + return maximum_error; +} diff --git a/tmva/tmva/test/DNN/TestDerivativesCpu.cxx b/tmva/tmva/test/DNN/TestDerivativesCpu.cxx new file mode 100644 index 0000000000000000000000000000000000000000..e94e3223809183c2422108922945c5cabf60fc06 --- /dev/null +++ b/tmva/tmva/test/DNN/TestDerivativesCpu.cxx @@ -0,0 +1,66 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////// +// Concrete instantiation of the generic derivative test for the // +// multi-threaded CPU implementation. 
//
+///////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Cpu.h"
+#include "TestDerivatives.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+   using Scalar_t = Double_t;
+
+   double error;
+
+   //
+   // Activation Functions
+   //
+
+   std::cout << "Activation Functions:" << std::endl;
+   error = testActivationFunctionDerivatives<TCpu<Scalar_t>>();
+   std::cout << "Total : ";
+   std::cout << "Maximum Relative Error = " << error;
+   std::cout << std::endl << std::endl;
+   if (error > 1e-3)
+       return 1;
+
+   //
+   // Loss Functions
+   //
+
+   std::cout << "Loss Functions:" << std::endl;
+   error = testLossFunctionGradients<TCpu<Scalar_t>>();
+   std::cout << "Total : ";
+   std::cout << "Maximum Relative Error = " << error;
+   std::cout << std::endl << std::endl;
+   if (error > 1e-3)
+       return 1;
+
+   //
+   // Regularization Functions
+   //
+
+   std::cout << "Regularization:" << std::endl;
+   error = testRegularizationGradients<TCpu<Scalar_t>>();
+   std::cout << "Total : ";
+   std::cout << "Maximum Relative Error = " << error;
+   std::cout << std::endl << std::endl;
+   if (error > 1e-3)
+       return 1;
+
+   return 0;
+}
diff --git a/tmva/tmva/test/DNN/TestDerivativesCuda.cxx b/tmva/tmva/test/DNN/TestDerivativesCuda.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..345c11ef11380cf45fc2862a1f69bdb28093cfd3
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestDerivativesCuda.cxx
@@ -0,0 +1,65 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Concrete instantiation of the generic derivative test for the //
+// CUDA implementation.
// +/////////////////////////////////////////////////////////////////// + +#include <iostream> +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TestDerivatives.h" + +using namespace TMVA::DNN; + +int main() +{ + using Scalar_t = Double_t; + Double_t error; + + // + // Activation Functions + // + + std::cout << "Activation Functions:" << std::endl; + error = testActivationFunctionDerivatives<TCuda<Scalar_t>>(); + std::cout << "Total : "; + std::cout << "Maximum Relative Error = " << error; + std::cout << std::endl << std::endl; + if (error > 1e-2) + return 1; + + // + // Loss Functions + // + + std::cout << "Loss Functions:" << std::endl; + error = testLossFunctionGradients<TCuda<Scalar_t>>(); + std::cout << "Total : "; + std::cout << "Maximum Relative Error = " << error; + std::cout << std::endl << std::endl; + if (error > 1e-3) + return 1; + + // + // Regularization Functions + // + + std::cout << "Regularization:" << std::endl; + error = testRegularizationGradients<TCuda<Scalar_t>>(); + std::cout << "Total : "; + std::cout << "Maximum Relative Error = " << error; + std::cout << std::endl << std::endl; + if (error > 1e-3) + return 1; + + return 0; +} diff --git a/tmva/tmva/test/DNN/TestLossFunctions.cxx b/tmva/tmva/test/DNN/TestLossFunctions.cxx new file mode 100644 index 0000000000000000000000000000000000000000..9ae39ae5ba6b703ffc7bee711014bc4d2834a898 --- /dev/null +++ b/tmva/tmva/test/DNN/TestLossFunctions.cxx @@ -0,0 +1,60 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////// +// Test for the loss function reference implementation using the // +// generic test defined in TestLossFunctions.h. // +/////////////////////////////////////////////////////////////////// + +#include <iostream> +#include "TMVA/DNN/Architectures/Reference.h" +#include "TestLossFunctions.h" + +using namespace TMVA::DNN; + +int main() +{ + std::cout << "Testing Loss Functions:" << std::endl << std::endl; + + double error; + + // + // Mean Squared Error. + // + + error = testMeanSquaredError<TReference<double>>(10); + std::cout << "Testing mean squared error loss: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + error = testMeanSquaredErrorGradients<TReference<double>>(10); + std::cout << "Testing mean squared error gradient: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + // + // Cross Entropy. 
+ // + + error = testCrossEntropy<TReference<double>>(10); + std::cout << "Testing cross entropy loss: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; + + error = testCrossEntropyGradients<TReference<double>>(10); + std::cout << "Testing mean squared error gradient: "; + std::cout << "maximum relative error = " << error << std::endl; + if (error > 1e-10) + return 1; +} diff --git a/tmva/tmva/test/DNN/TestLossFunctions.h b/tmva/tmva/test/DNN/TestLossFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..d81bc3f052653b4fdf2dcd22e1edafc81e3a2509 --- /dev/null +++ b/tmva/tmva/test/DNN/TestLossFunctions.h @@ -0,0 +1,193 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +////////////////////////////////////////////////////////////////////// +// Generic tests of the loss functions // +// // +// Contains generic test for architecture-specific implementations // +// of the loss functions. Requires the architecture-specific matrix // +// type to be constructible and convertible from/to the // +// TMatrixT<Double_t> type. // +////////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Reference.h" +#include "TMVA/DNN/Functions.h" +#include "TMVA/DNN/Net.h" +#include "Utility.h" + +using namespace TMVA::DNN; + +//______________________________________________________________________________ +// +// Mean Squared Error +//______________________________________________________________________________ + +template <typename Architecture> +auto testMeanSquaredError(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + using Scalar_t = typename Architecture::Scalar_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> X(m, n); + TMatrixT<Double_t> Y(m, n); + TMatrixT<Double_t> Z(m, n); + + randomMatrix(X); + randomMatrix(Y); + + Matrix_t XArch(X); + Matrix_t YArch(Y); + + Scalar_t mse = evaluate<Architecture>(ELossFunction::kMeanSquaredError, + YArch, XArch); + zipWithMatrix(Z, [](Scalar_t x, Scalar_t y){return x - y;}, X, Y); + auto squaredSum = [](Scalar_t x, Scalar_t y){return x + y * y;}; + Scalar_t mseReference = reduceMean(squaredSum, 0.0, Z); + + Double_t error; + if (mseReference != 0.0) + error = std::fabs((mse - mseReference) / mseReference); + else + error = std::fabs(mse - mseReference); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +template <typename Architecture> +auto testMeanSquaredErrorGradients(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + using Scalar_t = typename Architecture::Scalar_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> X(m, n); + TMatrixT<Double_t> Y(m, n); + TMatrixT<Double_t> ZRef(m, n); + + randomMatrix(X); + randomMatrix(Y); + + Matrix_t XArch(X); + 
Matrix_t YArch(Y); + Matrix_t ZArch(Y); + + evaluateGradients<Architecture>(ZArch, ELossFunction::kMeanSquaredError, + XArch, YArch); + auto normedDifference = [m, n](Scalar_t x, Scalar_t y) { + return 2.0 * (y - x) / (m * n); + }; + zipWithMatrix(ZRef, normedDifference, X, Y); + TMatrixT<Double_t> Z(ZArch); + Double_t error = maximumRelativeError(Z, ZRef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +// +// Cross Entropy +//______________________________________________________________________________ + +template <typename Architecture> +auto testCrossEntropy(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + using Scalar_t = typename Architecture::Scalar_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = rand() % 100 + 1; + size_t n = rand() % 100 + 1; + + TMatrixT<Double_t> X(m, n); + TMatrixT<Double_t> Y(m, n); + TMatrixT<Double_t> Z(m, n); + + randomMatrix(X); + randomMatrix(Y); + + Matrix_t XArch(X); + Matrix_t YArch(Y); + + Scalar_t ce = evaluate<Architecture>(ELossFunction::kCrossEntropy, + YArch, XArch); + + auto crossCorrelation = [](Scalar_t x, Scalar_t y) { + Scalar_t sig = 1.0 / (1.0 + std::exp(-x)); + return y * std::log(sig) + (1 - y) * std::log(1 - sig); + }; + zipWithMatrix(Z, crossCorrelation, X, Y); + auto sum = [](Scalar_t x, Scalar_t y) {return x + y;}; + Scalar_t ceReference = - reduceMean(sum, 0.0, Z); + + Double_t error; + if (ceReference != 0.0) + error = std::fabs((ce - ceReference) / ceReference); + else + error = std::fabs(ce - ceReference); + maximumError = std::max(error, maximumError); + } + return maximumError; +} + +//______________________________________________________________________________ +template <typename Architecture> +auto testCrossEntropyGradients(size_t ntests) +-> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + using Scalar_t = typename Architecture::Scalar_t; + Double_t maximumError = 0.0; + + for (size_t i = 0; i < ntests; i++) { + size_t m = 8; //rand() % 100 + 1; + size_t n = 8; //rand() % 100 + 1; + + TMatrixT<Double_t> X(m, n); + TMatrixT<Double_t> Y(m, n); + TMatrixT<Double_t> ZRef(m, n); + + randomMatrix(X); + randomMatrix(Y); + + Matrix_t XArch(X); + Matrix_t YArch(Y); + Matrix_t ZArch(Y); + + evaluateGradients<Architecture>(ZArch, ELossFunction::kCrossEntropy, + YArch, XArch); + auto crossCorrelationGradient = [m, n](Scalar_t x, Scalar_t y) { + Scalar_t sig = 1.0 / (1.0 + std::exp(-x)); + Scalar_t norm = 1.0 / ((Scalar_t) m * n); + return (sig - y) * norm;}; + zipWithMatrix(ZRef, crossCorrelationGradient, X, Y); + + TMatrixT<Double_t> Z(ZArch); + Double_t error = maximumRelativeError(Z, ZRef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} diff --git a/tmva/tmva/test/DNN/TestLossFunctionsCpu.cxx b/tmva/tmva/test/DNN/TestLossFunctionsCpu.cxx new file mode 100644 index 0000000000000000000000000000000000000000..94b426d780f629128c0d4c691507b7fba1cb5e74 --- /dev/null +++ b/tmva/tmva/test/DNN/TestLossFunctionsCpu.cxx @@ -0,0 +1,63 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. 
*
+ *************************************************************************/
+
+//////////////////////////////////////////////////////////////////
+// Test for the loss function implementations for the           //
+// multi-threaded CPU version using the generic test defined in //
+// TestLossFunctions.h.                                          //
+//////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Cpu.h"
+#include "TestLossFunctions.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+   using Scalar_t = Double_t;
+
+   std::cout << "Testing Loss Functions:" << std::endl << std::endl;
+
+   double error;
+
+   //
+   // Mean Squared Error.
+   //
+
+   error = testMeanSquaredError<TCpu<Scalar_t>>(10);
+   std::cout << "Testing mean squared error loss: ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-3)
+       return 1;
+
+   error = testMeanSquaredErrorGradients<TCpu<Scalar_t>>(10);
+   std::cout << "Testing mean squared error gradient: ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-3)
+       return 1;
+
+   //
+   // Cross Entropy.
+   //
+
+   error = testCrossEntropy<TCpu<Scalar_t>>(10);
+   std::cout << "Testing cross entropy loss: ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-3)
+       return 1;
+
+   error = testCrossEntropyGradients<TCpu<Scalar_t>>(10);
+   std::cout << "Testing cross entropy gradient: ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-3)
+       return 1;
+}
diff --git a/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx b/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..1bf9ccae98322975e1d668982270d86b1ce84959
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestLossFunctionsCuda.cxx
@@ -0,0 +1,61 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+///////////////////////////////////////////////////////////////////
+// Test for the loss function CUDA implementation using the      //
+// generic test defined in TestLossFunctions.h.                   //
+///////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TestLossFunctions.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+   using Scalar_t = Double_t;
+   std::cout << "Testing Loss Functions:" << std::endl << std::endl;
+
+   double error;
+
+   //
+   // Mean Squared Error.
+   //
+
+   error = testMeanSquaredError<TCuda<Scalar_t>>(10);
+   std::cout << "Testing mean squared error loss: ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-3)
+       return 1;
+
+   error = testMeanSquaredErrorGradients<TCuda<Scalar_t>>(10);
+   std::cout << "Testing mean squared error gradient: ";
+   std::cout << "maximum relative error = " << print_error(error) << std::endl;
+   if (error > 1e-3)
+       return 1;
+
+   //
+   // Cross Entropy.
+ // + + error = testCrossEntropy<TCuda<Scalar_t>>(10); + std::cout << "Testing cross entropy loss: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-3) + return 1; + + error = testCrossEntropyGradients<TCuda<Scalar_t>>(10); + std::cout << "Testing mean squared error gradient: "; + std::cout << "maximum relative error = " << print_error(error) << std::endl; + if (error > 1e-3) + return 1; +} diff --git a/tmva/tmva/test/DNN/TestMatrixArithmetic.h b/tmva/tmva/test/DNN/TestMatrixArithmetic.h new file mode 100644 index 0000000000000000000000000000000000000000..10aed0ed1d372bc4ec176de11281fa8b3922dadf --- /dev/null +++ b/tmva/tmva/test/DNN/TestMatrixArithmetic.h @@ -0,0 +1,119 @@ +// @(#)root/tmva/tmva/dnn:$Id$ // Author: Simon Pfreundschuh 20/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////// +// Test arithmetic functions defined on matrices and compare the // +// results to the reference implementation. // +/////////////////////////////////////////////////////////////////// + +#include "TMatrix.h" +#include "Utility.h" +#include "TMVA/DNN/Architectures/Reference.h" + +/** Test multiplication (standard, transposed, hadamard) operation on + * architecture specific matrix types and compare with results + * obtained with TMatrixT. + */ +//______________________________________________________________________________ +template<typename Architecture_t> +auto testMultiplication(size_t ntests) + -> typename Architecture_t::Scalar_t +{ + + using Scalar_t = typename Architecture_t::Scalar_t; + using Matrix_t = typename Architecture_t::Matrix_t; + + Scalar_t maximumError = 0.0; + + for (size_t t = 0; t < ntests; t++) { + size_t m, n, k; + m = rand() % 100 + 1; + n = rand() % 100 + 1; + k = rand() % 100 + 1; + + TMatrixT<Double_t> ARef(m,k), A2Ref(m,k), ATRef(k,m) , BRef(k,n), + BTRef(n,k), CRef(m,n); + TMVA::DNN::randomMatrix(ARef); + TMVA::DNN::randomMatrix(A2Ref); + TMVA::DNN::randomMatrix(ATRef); + TMVA::DNN::randomMatrix(BRef); + TMVA::DNN::randomMatrix(BTRef); + Matrix_t A(ARef), A2(A2Ref), AT(ATRef), B(BRef), BT(BTRef), C(CRef); + + // A * B + CRef.Mult(ARef,BRef); + Architecture_t::Multiply(C, A, B); + Scalar_t error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) C, CRef); + maximumError = std::max(error, maximumError); + + // A^T * B + CRef.TMult(ATRef,BRef); + Architecture_t::TransposeMultiply(C, AT, B); + error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) C, CRef); + maximumError = std::max(error, maximumError); + + // A * B^T + CRef.MultT(ARef,BTRef); + Architecture_t::MultiplyTranspose(C, A, BT); + error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) C, CRef); + maximumError = std::max(error, maximumError); + + // A .* B + for (size_t i = 0; i < (size_t) ARef.GetNrows(); i++) { + for (size_t j = 0; j < (size_t) ARef.GetNcols(); j++) { + ARef(i,j) *= A2Ref(i,j); + } + } + Architecture_t::Hadamard(A, A2); + error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) A, ARef); + maximumError = std::max(error, maximumError); + } + + return maximumError; +} + +/** Test the summing over columns by summing by the sums obtained + * from a matrix filled with 
column indices as elements. + */ +//______________________________________________________________________________ +template<typename Architecture_t> +auto testSumColumns(size_t ntests) + -> typename Architecture_t::Scalar_t +{ + + using Scalar_t = typename Architecture_t::Scalar_t; + using Matrix_t = typename Architecture_t::Matrix_t; + + Scalar_t maximumError = 0.0; + for (size_t t = 0; t < ntests; t++) { + + Scalar_t error; + + size_t m, n; + m = rand() % 100 + 1; + n = rand() % 100 + 1; + + TMatrixT<Double_t> ARef(m,n), BRef(n,1); + + for (size_t i = 0; i < (size_t) ARef.GetNrows(); i++) { + for (size_t j = 0; j < (size_t) ARef.GetNcols(); j++) { + ARef(i,j) = j; + if (i == 0) BRef(j, 0) = m * j; + } + } + + Matrix_t A(ARef), B(n, 1); + Architecture_t::SumColumns(B, A); + + error = TMVA::DNN::maximumRelativeError((TMatrixT<Double_t>) B ,BRef); + maximumError = std::max(error, maximumError); + } + return maximumError; +} diff --git a/tmva/tmva/test/DNN/TestMatrixArithmeticCpu.cxx b/tmva/tmva/test/DNN/TestMatrixArithmeticCpu.cxx new file mode 100644 index 0000000000000000000000000000000000000000..c7aa073325b14bdfee6ae6628fbb742f4e4ac4cc --- /dev/null +++ b/tmva/tmva/test/DNN/TestMatrixArithmeticCpu.cxx @@ -0,0 +1,46 @@ +// @(#)root/tmva/tmva/dnn:$Id$ // Author: Simon Pfreundschuh 20/07/16 + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +/////////////////////////////////////////////////////////////////// +// Test arithmetic on CpuMatrix class using the generic tests in // +// TestArithmetic.h // +/////////////////////////////////////////////////////////////////// + +#include "TMVA/DNN/Architectures/Cpu.h" +#include "TestMatrixArithmetic.h" + +using namespace TMVA::DNN; + +int main() +{ + std::cout << "Testing CPU matrix arithmetic (double):" << std::endl; + + Double_t error = testMultiplication<TCpu<Double_t>>(10); + std::cout << "Multiplication: " << "Max. rel. error: " << error << std::endl; + if (error > 1e-3) + return 1; + + error = testSumColumns<TCpu<Double_t>>(1); + std::cout << "Column Sum: " << "Max. rel. error: " << error << std::endl; + if (error > 1e-3) + return 1; + + std::cout << "Testing CPU matrix arithmetic (float):" << std::endl; + + error = testMultiplication<TCpu<Real_t>>(10); + std::cout << "Multiplication: " << "Max. rel. error: " << error << std::endl; + if (error > 1e-1) + return 1; + + error = testSumColumns<TCpu<Real_t>>(1); + std::cout << "Column Sum: " << "Max. rel. error: " << error << std::endl; + if (error > 1e-1) + return 1; +} diff --git a/tmva/tmva/test/DNN/TestMatrixArithmeticCuda.cxx b/tmva/tmva/test/DNN/TestMatrixArithmeticCuda.cxx new file mode 100644 index 0000000000000000000000000000000000000000..cf50a6d9f12826f98885e73eef952ae04def3e78 --- /dev/null +++ b/tmva/tmva/test/DNN/TestMatrixArithmeticCuda.cxx @@ -0,0 +1,48 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. 
*
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////////////
+// Concrete instantiation of the generic matrix arithmetic test   //
+// for CUDA architectures.                                         //
+////////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Cuda.h"
+#include "TestMatrixArithmetic.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+   std::cout << "Testing CUDA matrix arithmetic (double):" << std::endl;
+
+   Double_t error = testMultiplication<TCuda<Double_t>>(10);
+   std::cout << "Multiplication: " << "Max. rel. error: " << error << std::endl;
+   if (error > 1e-3)
+       return 1;
+
+   error = testSumColumns<TCuda<Double_t>>(1);
+   std::cout << "Column Sum: " << "Max. rel. error: " << error << std::endl;
+   if (error > 1e-3)
+       return 1;
+
+   std::cout << "Testing CUDA matrix arithmetic (float):" << std::endl;
+
+   error = testMultiplication<TCuda<Real_t>>(10);
+   std::cout << "Multiplication: " << "Max. rel. error: " << error << std::endl;
+   if (error > 1)
+       return 1;
+
+   error = testSumColumns<TCuda<Real_t>>(1);
+   std::cout << "Column Sum: " << "Max. rel. error: " << error << std::endl;
+   if (error > 1e-3)
+       return 1;
+}
diff --git a/tmva/tmva/test/DNN/TestMinimization.cxx b/tmva/tmva/test/DNN/TestMinimization.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..a2e5b36b7a249ad4bb22522675dd295c8a616c06
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMinimization.cxx
@@ -0,0 +1,29 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+////////////////////////////////////////////////////////////
+// Test the Neural Network training using the reference   //
+// implementation.                                         //
+//                                                         //
+// Calls the generic testMinimization function defined in  //
+// TestMinimization.h for the reference architecture.      //
+////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include "TMVA/DNN/Architectures/Reference.h"
+#include "TestMinimization.h"
+
+using namespace TMVA::DNN;
+
+int main()
+{
+   testMinimization<TReference<double>>();
+}
diff --git a/tmva/tmva/test/DNN/TestMinimization.h b/tmva/tmva/test/DNN/TestMinimization.h
new file mode 100644
index 0000000000000000000000000000000000000000..98803b950bb2ad9abd9f272e016d26248fb21fb4
--- /dev/null
+++ b/tmva/tmva/test/DNN/TestMinimization.h
@@ -0,0 +1,121 @@
+// @(#)root/tmva $Id$
+// Author: Simon Pfreundschuh
+
+/*************************************************************************
+ * Copyright (C) 2016, Simon Pfreundschuh                                *
+ * All rights reserved.                                                  *
+ *                                                                       *
+ * For the licensing terms see $ROOTSYS/LICENSE.                         *
+ * For the list of contributors see $ROOTSYS/README/CREDITS.             *
+ *************************************************************************/
+
+/////////////////////////////////////////////////////////////////////
+// Test Standard Minimizer                                          //
+//                                                                  //
+// This test trains a linear neural network on a linear function   //
+// F(x) = W * x and computes the relative error between the matrix //
+// W' representing the linear function learned by the net and the  //
+// original matrix W.
// +///////////////////////////////////////////////////////////////////// + +#include "TMatrix.h" +#include "TMVA/DNN/Minimizers.h" +#include "TMVA/DNN/Net.h" +#include "Utility.h" + +using namespace TMVA::DNN; + +/** Train a linear neural network on a randomly generated linear mapping + * from a 20-dimensional input space to a 1-dimensional output space. + * Returns the error of the response of the network to the input containing + * only ones to the 1x20 matrix generating the mapping. + */ +template <typename Architecture> + auto testMinimization() + -> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + using Net_t = TNet<Architecture>; + + size_t nSamples = 10000; + size_t nFeatures = 20; + size_t batchSize = 256; + + TMatrixT<Double_t> XTrain(nSamples, nFeatures), YTrain(nSamples, 1), + XTest(batchSize, nFeatures), YTest(batchSize, 1), W(nFeatures, 1); + + randomMatrix(W); + randomMatrix(XTrain); + randomMatrix(XTest); + YTrain.Mult(XTrain, W); + YTest.Mult(XTest, W); + + Net_t net(batchSize, nFeatures, ELossFunction::kMeanSquaredError); + net.AddLayer(64, EActivationFunction::kIdentity); + net.AddLayer(64, EActivationFunction::kIdentity); + net.AddLayer(64, EActivationFunction::kIdentity); + net.AddLayer(1, EActivationFunction::kIdentity); + net.Initialize(EInitialization::kGauss); + + TGradientDescent<Architecture> minimizer(0.0001, 5, 1); + MatrixInput_t trainingData(XTrain, YTrain); + MatrixInput_t testData(XTest, YTest); + minimizer.TrainMomentum(trainingData, nSamples, testData, batchSize, net, 0.8, 1); + + TMatrixT<Double_t> I(nFeatures, nFeatures); + for (size_t i = 0; i < nFeatures; i++) { + I(i, i) = 1.0; + } + Matrix_t Id(I); + auto clone = net.CreateClone(nFeatures); + clone.Forward(Id); + TMatrixT<Double_t> Y(clone.GetOutput()); + + return maximumRelativeError(Y, W); +} + +/** Similar to testMinimization() as the function above except that + * it uses momentum for the training */ +template <typename Architecture> + auto testMinimizationMomentum() + -> typename Architecture::Scalar_t +{ + using Matrix_t = typename Architecture::Matrix_t; + using Net_t = TNet<Architecture>; + + size_t nSamples = 10000; + size_t nFeatures = 20; + size_t batchSize = 256; + + TMatrixT<Double_t> XTrain(nSamples, nFeatures), YTrain(nSamples, 1), + XTest(batchSize, nFeatures), YTest(batchSize, 1), W(nFeatures, 1); + + randomMatrix(W); + randomMatrix(XTrain); + randomMatrix(XTest); + YTrain.Mult(XTrain, W); + YTest.Mult(XTest, W); + + Net_t net(batchSize, nFeatures, ELossFunction::kMeanSquaredError); + net.AddLayer(64, EActivationFunction::kIdentity); + net.AddLayer(64, EActivationFunction::kIdentity); + net.AddLayer(64, EActivationFunction::kIdentity); + net.AddLayer(1, EActivationFunction::kIdentity); + net.Initialize(EInitialization::kGauss); + + TGradientDescent<Architecture> minimizer(0.0001, 5, 5); + MatrixInput_t trainingData(XTrain, YTrain); + MatrixInput_t testData(XTest, YTest); + minimizer.TrainMomentum(trainingData, nSamples, testData, batchSize, net, 0.9, 1); + + TMatrixT<Double_t> I(nFeatures, nFeatures); + for (size_t i = 0; i < nFeatures; i++) { + I(i, i) = 1.0; + } + Matrix_t Id(I); + auto clone = net.CreateClone(nFeatures); + clone.Forward(Id); + TMatrixT<Double_t> Y(clone.GetOutput()); + + return maximumRelativeError(Y, W); +} diff --git a/tmva/tmva/test/DNN/TestMinimizationCpu.cxx b/tmva/tmva/test/DNN/TestMinimizationCpu.cxx new file mode 100644 index 0000000000000000000000000000000000000000..86c4de988f020bfa335acdc170c3223cbddf1dc0 --- 
/dev/null +++ b/tmva/tmva/test/DNN/TestMinimizationCpu.cxx @@ -0,0 +1,55 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +///////////////////////////////////////////////////////////////////// +// Train the multi-threaded CPU implementation of DNNs on a random // +// linear mapping. In the linear case the minimization problem is // +// convex and the gradient descent training should converge to the // +// global minimum. // +///////////////////////////////////////////////////////////////////// + +#include <iostream> +#include "TMVA/DNN/Architectures/Cpu.h" +#include "TestMinimization.h" + +using namespace TMVA::DNN; + +int main() +{ + + std::cout << "Testing minimization: (single precision)" << std::endl; + + Double_t error = testMinimization<TCpu<Real_t>>(); + std::cout << "Gradient Descent: Maximum relative error = " << error << std::endl; + if (error > 1e-3) { + return 1; + } + + error = testMinimizationMomentum<TCpu<Real_t>>(); + std::cout << "Momentum: Maximum relative error = " << error << std::endl; + if (error > 1e-3) { + return 1; + } + std::cout << std::endl << "Testing minimization: (double precision)" << std::endl; + + error = testMinimization<TCpu<Double_t>>(); + std::cout << "Gradient Descent: Maximum relative error = " << error << std::endl; + if (error > 1e-5) { + return 1; + } + + error = testMinimizationMomentum<TCpu<Double_t>>(); + std::cout << "Momentum: Maximum relative error = " << error << std::endl; + if (error > 1e-5) { + return 1; + } + return 0; +} diff --git a/tmva/tmva/test/DNN/TestMinimizationCuda.cxx b/tmva/tmva/test/DNN/TestMinimizationCuda.cxx new file mode 100644 index 0000000000000000000000000000000000000000..117ef5fce3a4ec01c0acc18ad548983377231100 --- /dev/null +++ b/tmva/tmva/test/DNN/TestMinimizationCuda.cxx @@ -0,0 +1,53 @@ +// @(#)root/tmva $Id$ +// Author: Simon Pfreundschuh + +/************************************************************************* + * Copyright (C) 2016, Simon Pfreundschuh + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +///////////////////////////////////////////////////////////////////// +// Use the generic tests defined in TestMinimization.h to test the // +// training of Neural Networks for CUDA architectures. 
// +///////////////////////////////////////////////////////////////////// + +#include <iostream> +#include "TMVA/DNN/Architectures/Reference.h" +#include "TMVA/DNN/Architectures/Cuda.h" +#include "TMVA/DNN/Minimizers.h" +#include "TestMinimization.h" + +using namespace TMVA::DNN; + +int main() +{ + std::cout << "Testing minimization: (single precision)" << std::endl; + + Double_t error = testMinimization<TCuda<Real_t>>(); + std::cout << "Gradient Descent: Maximum relative error = " << error << std::endl; + if (error > 1) { + return 1; + } + + error = testMinimizationMomentum<TCuda<Real_t>>(); + std::cout << "Momentum: Maximum relative error = " << error << std::endl; + if (error > 1) { + return 1; + } + std::cout << std::endl << "Testing minimization: (double precision)" << std::endl; + + error = testMinimization<TCuda<Double_t>>(); + std::cout << "Gradient Descent: Maximum relative error = " << error << std::endl; + if (error > 1e-3) { + return 1; + } + + error = testMinimizationMomentum<TCuda<Double_t>>(); + std::cout << "Momentum: Maximum relative error = " << error << std::endl; + if (error > 1e-3) { + return 1; + } +} diff --git a/tmva/tmva/test/DNN/Utility.h b/tmva/tmva/test/DNN/Utility.h new file mode 100644 index 0000000000000000000000000000000000000000..46077fc6a14c9406cf36ff3b7a6b443b31fe6b29 --- /dev/null +++ b/tmva/tmva/test/DNN/Utility.h @@ -0,0 +1,269 @@ +#ifndef TMVA_TEST_DNN_UTILITY +#define TMVA_TEST_DNN_UTILITY + +#include <iostream> +#include <sstream> +#include <type_traits> +#include "stdlib.h" +#include "TRandom.h" +#include "TMVA/DNN/Architectures/Reference.h" +#include "TMVA/DNN/Functions.h" +#include "TMVA/DNN/Net.h" + +namespace TMVA +{ +namespace DNN +{ + +/** Construct a random linear neural network with up to five layers.*/ +//______________________________________________________________________________ +template <typename AArchitecture> +void constructRandomLinearNet(TNet<AArchitecture> & net) +{ + int nlayers = rand() % 5 + 1; + + std::vector<EActivationFunction> ActivationFunctions + = {EActivationFunction::kIdentity}; + + for (int i = 0; i < nlayers; i++) { + int width = rand() % 20 + 1; + EActivationFunction f = + ActivationFunctions[rand() % ActivationFunctions.size()]; + net.AddLayer(width, f); + } +} + +/*! Set matrix to the identity matrix */ +//______________________________________________________________________________ +template <typename AMatrix> +void identityMatrix(AMatrix &X) +{ + size_t m, n; + m = X.GetNrows(); + n = X.GetNcols(); + + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + X(i,j) = 0.0; + } + if (i < n) { + X(i,i) = 1.0; + } + } +} + +/*! Fill matrix with random, Gaussian-distributed values. */ +//______________________________________________________________________________ +template <typename AMatrix> +void randomMatrix(AMatrix &X) +{ + size_t m,n; + m = X.GetNrows(); + n = X.GetNcols(); + + TRandom rand(clock()); + + Double_t sigma = sqrt(10.0); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + X(i,j) = rand.Gaus(0.0, sigma); + } + } +} + +/*! Generate a random batch as input for a neural net. */ +//______________________________________________________________________________ +template <typename AMatrix> +void randomBatch(AMatrix &X) +{ + randomMatrix(X); +} + +/*! Generate a random batch as input for a neural net. 
*/ +//______________________________________________________________________________ +template <typename AMatrix> +void copyMatrix(AMatrix &X, const AMatrix &Y) +{ + size_t m,n; + m = X.GetNrows(); + n = X.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + X(i,j) = Y(i,j); + } + } +} + +/*! Apply functional to each element in the matrix. */ +//______________________________________________________________________________ +template <typename AMatrix, typename F> +void applyMatrix(AMatrix &X, F f) +{ + size_t m,n; + m = X.GetNrows(); + n = X.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + X(i,j) = f(X(i,j)); + } + } +} + +/*! Combine elements of two given matrices into a single matrix using + * the given function f. */ +//______________________________________________________________________________ +template <typename AMatrix, typename F> +void zipWithMatrix(AMatrix &Z, + F f, + const AMatrix &X, + const AMatrix &Y) +{ + size_t m,n; + m = X.GetNrows(); + n = X.GetNcols(); + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + Z(i,j) = f(X(i,j), Y(i,j)); + } + } +} + +/** Generate a random batch as input for a neural net. */ +//______________________________________________________________________________ +template <typename AMatrix, typename AFloat, typename F> +AFloat reduce(F f, AFloat start, const AMatrix &X) +{ + size_t m,n; + m = X.GetNrows(); + n = X.GetNcols(); + + AFloat result = start; + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + result = f(result, X(i,j)); + } + } + return result; +} + +/** Apply function to matrix element-wise and compute the mean of the resulting + * element values */ +//______________________________________________________________________________ +template <typename AMatrix, typename AFloat, typename F> +AFloat reduceMean(F f, AFloat start, const AMatrix &X) +{ + size_t m,n; + m = X.GetNrows(); + n = X.GetNcols(); + + AFloat result = start; + + for (size_t i = 0; i < m; i++) { + for (size_t j = 0; j < n; j++) { + result = f(result, X(i,j)); + } + } + return result / (AFloat) (m * n); +} + +/** Compute the relative error of x and y normalized by y. Specialized for + * float and double to make sure both arguments are above expected machine + * precision (1e-5 and 1e-10). */ +//______________________________________________________________________________ +template <typename AFloat> +inline AFloat relativeError(const AFloat &x, + const AFloat &y); + + +//______________________________________________________________________________ +template <> +inline Double_t relativeError(const Double_t &x, + const Double_t &y) +{ + if ((std::abs(x) > 1e-10) && (std::abs(y) > 1e-10)) { + return std::fabs((x - y) / y); + } else { + return std::fabs(x - y); + } +} + +//______________________________________________________________________________ +template <> +inline Real_t relativeError(const Real_t &x, + const Real_t &y) +{ + if ((std::abs(x) > 1e-5) && (std::abs(y) > 1e-5)) { + return std::fabs((x - y) / y); + } else { + return std::fabs(x - y); + } +} + +/*! Compute the maximum, element-wise relative error of the matrices +* X and Y normalized by the element of Y. Protected against division +* by zero. 
*/
+//______________________________________________________________________________
+template <typename AMatrix>
+auto maximumRelativeError(const AMatrix &X,
+                          const AMatrix &Y)
+-> decltype(X(0,0))
+{
+
+   using AFloat = decltype(X(0,0));
+
+   size_t m,n;
+   m = X.GetNrows();
+   n = X.GetNcols();
+
+   AFloat maximumError = 0.0;
+
+   for (size_t i = 0; i < m; i++) {
+      for (size_t j = 0; j < n; j++) {
+         AFloat error = relativeError(X(i,j), Y(i,j));
+         maximumError = std::max(error, maximumError);
+      }
+   }
+   return maximumError;
+}
+
+/*! Numerically compute the symmetric difference f(dx) - f(-dx) of the
+ *  functional f, as used for finite-difference derivative checks. */
+//______________________________________________________________________________
+template <typename F, typename AFloat>
+inline AFloat finiteDifference(F f, AFloat dx)
+{
+   return f(dx) - f(0.0 - dx);
+}
+
+/*! Color-code an error value: red above 1e-5, yellow above 1e-9, green below. */
+//______________________________________________________________________________
+template <typename AFloat>
+std::string print_error(AFloat &e)
+{
+   std::ostringstream out{};
+
+   out << "\033[";
+
+   if (e > 1e-5)
+      out << "31m";
+   else if (e > 1e-9)
+      out << "33m";
+   else
+      out << "32m";
+
+   out << e;
+   out << "\033[39m";
+
+   return out.str();
+}
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
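
For orientation, the derivative and gradient tests in TestDerivatives.h all rely on the same central-difference check: perturb a single matrix element by +/- dx, re-evaluate the function, and compare (f(x + dx) - f(x - dx)) / (2 * dx) against the analytic derivative. A minimal standalone sketch of that check, using std::tanh and its derivative as assumed example functions (not taken from the patch itself), is:

// Standalone sketch of the central finite-difference check used in
// TestDerivatives.h; std::tanh is an example choice, not part of the TMVA code.
#include <cmath>
#include <cstdio>

int main()
{
   auto f  = [](double x) { return std::tanh(x); };
   auto df = [](double x) { double t = std::tanh(x); return 1.0 - t * t; };

   const double x  = 0.3;    // element that gets perturbed
   const double dx = 5e-3;   // step size, same value as used in testDerivatives()

   // Central finite difference compared to the analytic derivative.
   double dyNum = (f(x + dx) - f(x - dx)) / (2.0 * dx);
   double error = std::fabs(dyNum - df(x)) / std::fabs(df(x));

   std::printf("Maximum Relative Error = %g\n", error);
   return (error < 1e-5) ? 0 : 1;
}

The same pattern underlies testDerivatives() and testGradients() above, with the matrix element X(0,0) playing the role of x and the relative error accumulated over 100 random matrices.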