From 8f5faa7c601f4d109780e9a82c83422cf6063f82 Mon Sep 17 00:00:00 2001
From: Stefan Wunsch <stefan.wunsch@cern.ch>
Date: Wed, 3 Jul 2019 12:33:28 +0200
Subject: [PATCH] [TMVA experimental] Add RStandardScaler preprocessing method

---
 tmva/tmva/CMakeLists.txt                 |   1 +
 tmva/tmva/inc/TMVA/RStandardScaler.hxx   | 117 +++++++++++++++++++++
 tmva/tmva/inc/TMVA/RTensor.hxx           |  61 ++++++++---
 tmva/tmva/test/CMakeLists.txt            |   2 +
 tmva/tmva/test/rstandardscaler.cxx       | 128 +++++++++++++++++++++++
 tutorials/tmva/tmva004_RStandardScaler.C |  52 +++++++++
 6 files changed, 348 insertions(+), 13 deletions(-)
 create mode 100644 tmva/tmva/inc/TMVA/RStandardScaler.hxx
 create mode 100644 tmva/tmva/test/rstandardscaler.cxx
 create mode 100644 tutorials/tmva/tmva004_RStandardScaler.C

diff --git a/tmva/tmva/CMakeLists.txt b/tmva/tmva/CMakeLists.txt
index 068a524493e..62007169475 100644
--- a/tmva/tmva/CMakeLists.txt
+++ b/tmva/tmva/CMakeLists.txt
@@ -24,6 +24,7 @@ if(dataframe)
   set(TMVA_EXTRA_HEADERS
      TMVA/RTensor.hxx
      TMVA/RTensorUtils.hxx
+     TMVA/RStandardScaler.hxx
   )
   set(TMVA_EXTRA_DEPENDENCIES
      ROOTDataFrame
diff --git a/tmva/tmva/inc/TMVA/RStandardScaler.hxx b/tmva/tmva/inc/TMVA/RStandardScaler.hxx
new file mode 100644
index 00000000000..9fd79b4d2a4
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/RStandardScaler.hxx
@@ -0,0 +1,117 @@
+#ifndef TMVA_RSTANDARDSCALER
+#define TMVA_RSTANDARDSCALER
+
+#include <TFile.h>
+
+#include <TMVA/RTensor.hxx>
+#include <ROOT/RStringView.hxx>
+
+#include <cmath>
+#include <vector>
+
+namespace TMVA {
+namespace Experimental {
+
+template <typename T>
+class RStandardScaler {
+private:
+   std::vector<T> fMeans;
+   std::vector<T> fStds;
+
+public:
+   RStandardScaler() = default;
+   RStandardScaler(std::string_view title, std::string_view filename);
+   void Fit(const RTensor<T>& x);
+   std::vector<T> Compute(const std::vector<T>& x);
+   RTensor<T> Compute(const RTensor<T>& x);
+   std::vector<T> GetMeans() const { return fMeans; }
+   std::vector<T> GetStds() const { return fStds; }
+   void Save(std::string_view title, std::string_view filename);
+};
+
+template <typename T>
+inline RStandardScaler<T>::RStandardScaler(std::string_view title, std::string_view filename) {
+   auto file = TFile::Open(filename.data(), "READ");
+   RStandardScaler<T>* obj;
+   file->GetObject(title.data(), obj);
+   fMeans = obj->GetMeans();
+   fStds = obj->GetStds();
+   delete obj;
+   file->Close();
+}
+
+template <typename T>
+inline void RStandardScaler<T>::Save(std::string_view title, std::string_view filename) {
+   auto file = TFile::Open(filename.data(), "UPDATE");
+   file->WriteObject<RStandardScaler<T>>(this, title.data());
+   file->Write();
+   file->Close();
+}
+
+template <typename T>
+inline void RStandardScaler<T>::Fit(const RTensor<T>& x) {
+   const auto shape = x.GetShape();
+   if (shape.size() != 2)
+      throw std::runtime_error("Can only fit to input tensor of rank 2.");
+   fMeans.clear();
+   fMeans.resize(shape[1]);
+   fStds.clear();
+   fStds.resize(shape[1]);
+
+   // Compute means
+   for (std::size_t i = 0; i < shape[0]; i++) {
+      for (std::size_t j = 0; j < shape[1]; j++) {
+         fMeans[j] += x(i, j);
+      }
+   }
+   for (std::size_t i = 0; i < shape[1]; i++) {
+      fMeans[i] /= shape[0];
+   }
+
+   // Compute standard deviations using unbiased estimator
+   for (std::size_t i = 0; i < shape[0]; i++) {
+      for (std::size_t j = 0; j < shape[1]; j++) {
+         fStds[j] += (x(i, j) - fMeans[j]) * (x(i, j) - fMeans[j]);
+      }
+   }
+   for (std::size_t i = 0; i < shape[1]; i++) {
+      fStds[i] = std::sqrt(fStds[i] / (shape[0] - 1));
+   }
+}
+
+template <typename T>
+inline std::vector<T> RStandardScaler<T>::Compute(const std::vector<T>& x) {
+   const auto size = x.size();
+   if (size != fMeans.size())
+      throw std::runtime_error("Size of input vector is not equal to number of fitted variables.");
+
+   std::vector<T> y(size);
+   for (std::size_t i = 0; i < size; i++) {
+      y[i] = (x[i] - fMeans[i]) / fStds[i];
+   }
+
+   return y;
+}
+
+template <typename T>
+inline RTensor<T> RStandardScaler<T>::Compute(const RTensor<T>& x) {
+   const auto shape = x.GetShape();
+   if (shape.size() != 2)
+      throw std::runtime_error("Can only compute output for input tensor of rank 2.");
+   if (shape[1] != fMeans.size())
+      throw std::runtime_error("Second dimension of input tensor is not equal to number of fitted variables.");
+
+   RTensor<T> y(shape);
+   for (std::size_t i = 0; i < shape[0]; i++) {
+      for (std::size_t j = 0; j < shape[1]; j++) {
+         y(i, j) = (x(i, j) - fMeans[j]) / fStds[j];
+      }
+   }
+
+   return y;
+}
+
+} // namespace Experimental
+} // namespace TMVA
+
+#endif // TMVA_RSTANDARDSCALER
diff --git a/tmva/tmva/inc/TMVA/RTensor.hxx b/tmva/tmva/inc/TMVA/RTensor.hxx
index ff0e181a33c..e1f24d6d077 100644
--- a/tmva/tmva/inc/TMVA/RTensor.hxx
+++ b/tmva/tmva/inc/TMVA/RTensor.hxx
@@ -91,6 +91,21 @@ inline T ComputeIndicesFromGlobalIndex(const T& shape, MemoryLayout layout, cons
    return indices;
 }
 
+/// \brief Compute global index from indices
+/// \param[in] strides Strides vector
+/// \param[in] idx Index vector
+/// \return Global index
+template <typename U, typename V>
+inline std::size_t ComputeGlobalIndex(const U& strides, const V& idx)
+{
+   std::size_t globalIndex = 0;
+   const auto size = idx.size();
+   for (std::size_t i = 0; i < size; i++) {
+      globalIndex += strides[size - 1 - i] * idx[size - 1 - i];
+   }
+   return globalIndex;
+}
+
 /// \brief Type checking for all types of a parameter pack, e.g., used in combination with std::is_convertible
 template <class... Ts>
 struct and_types : std::true_type {
@@ -204,17 +219,19 @@ public:
 
    // Access elements
    Value_t &operator()(const Index_t &idx);
+   const Value_t &operator() (const Index_t &idx) const;
    template <typename... Idx> Value_t &operator()(Idx... idx);
+   template <typename... Idx> const Value_t &operator() (Idx... idx) const;
 
    // Access properties
-   std::size_t GetSize() { return fSize; }
-   Shape_t GetShape() { return fShape; }
-   Shape_t GetStrides() { return fStrides; }
-   Value_t *GetData() { return fData; }
-   std::shared_ptr<Container_t> GetContainer() { return fContainer; }
-   MemoryLayout GetMemoryLayout() { return fLayout; }
-   bool IsView() { return fContainer == NULL; }
-   bool IsOwner() { return !IsView(); }
+   std::size_t GetSize() const { return fSize; }
+   Shape_t GetShape() const { return fShape; }
+   Shape_t GetStrides() const { return fStrides; }
+   Value_t *GetData() const { return fData; }
+   std::shared_ptr<Container_t> GetContainer() const { return fContainer; }
+   MemoryLayout GetMemoryLayout() const { return fLayout; }
+   bool IsView() const { return fContainer == NULL; }
+   bool IsOwner() const { return !IsView(); }
 
    // Copy
    RTensor<Value_t, Container_t> Copy(MemoryLayout layout = MemoryLayout::RowMajor);
@@ -278,11 +295,17 @@ public:
 template <typename Value_t, typename Container_t>
 inline Value_t &RTensor<Value_t, Container_t>::operator()(const Index_t &idx)
 {
-   std::size_t globalIndex = 0;
-   const auto size = idx.size();
-   for (std::size_t i = 0; i < size; i++) {
-      globalIndex += fStrides[size - 1 - i] * idx[size - 1 - i];
-   }
+   const auto globalIndex = Internal::ComputeGlobalIndex(fStrides, idx);
+   return fData[globalIndex];
+}
+
+/// \brief Access elements
+/// \param[in] idx Index vector
+/// \return Reference to element
+template <typename Value_t, typename Container_t>
+inline const Value_t &RTensor<Value_t, Container_t>::operator() (const Index_t &idx) const
+{
+   const auto globalIndex = Internal::ComputeGlobalIndex(fStrides, idx);
    return fData[globalIndex];
 }
 
@@ -298,6 +321,18 @@ Value_t &RTensor<Value_t, Container_t>::operator()(Idx... idx)
    return operator()({static_cast<std::size_t>(idx)...});
 }
 
+/// \brief Access elements
+/// \param[in] idx Indices
+/// \return Reference to element
+template <typename Value_t, typename Container_t>
+template <typename... Idx>
+const Value_t &RTensor<Value_t, Container_t>::operator() (Idx... idx) const
+{
+   static_assert(Internal::and_types<std::is_convertible<Idx, std::size_t>...>{},
+                 "Indices are not convertible to std::size_t.");
+   return operator()({static_cast<std::size_t>(idx)...});
+}
+
 /// \brief Transpose
 /// \returns New RTensor
 /// The tensor is transposed by inverting the associated memory layout from row-
diff --git a/tmva/tmva/test/CMakeLists.txt b/tmva/tmva/test/CMakeLists.txt
index fe285b0cf5e..20cd441d495 100644
--- a/tmva/tmva/test/CMakeLists.txt
+++ b/tmva/tmva/test/CMakeLists.txt
@@ -21,6 +21,8 @@ if(dataframe)
   ROOT_ADD_GTEST(rtensor rtensor.cxx LIBRARIES ROOTVecOps TMVA)
   ROOT_ADD_GTEST(rtensor-iterator rtensor_iterator.cxx LIBRARIES ROOTVecOps TMVA)
   ROOT_ADD_GTEST(rtensor-utils rtensor_utils.cxx LIBRARIES ROOTVecOps TMVA ROOTDataFrame)
+  # RStandardScaler
+  ROOT_ADD_GTEST(rstandardscaler rstandardscaler.cxx LIBRARIES ROOTVecOps TMVA ROOTDataFrame)
 endif()
 
 project(tmva-tests)
diff --git a/tmva/tmva/test/rstandardscaler.cxx b/tmva/tmva/test/rstandardscaler.cxx
new file mode 100644
index 00000000000..0c6af93eecc
--- /dev/null
+++ b/tmva/tmva/test/rstandardscaler.cxx
@@ -0,0 +1,128 @@
+#include <gtest/gtest.h>
+
+#include <TMVA/RStandardScaler.hxx>
+#include <TMVA/RTensorUtils.hxx>
+
+#include <cmath>
+
+using namespace TMVA::Experimental;
+
+static const std::string filename_ = "http://root.cern.ch/files/tmva_class_example.root";
+static const std::vector<std::string> variables = {"var1", "var2", "var3", "var4"};
+
+TEST(RStandardScaler, TensorOutputShape)
+{
+   ROOT::RDataFrame df("TreeS", filename_);
+   auto x = AsTensor<float>(df, variables);
+
+   RStandardScaler<float> scaler;
+   scaler.Fit(x);
+   auto y = scaler.Compute(x);
+
+   EXPECT_EQ(y.GetShape().size(), 2ul);
+   EXPECT_EQ(x.GetShape()[0], y.GetShape()[0]);
+   EXPECT_EQ(x.GetShape()[1], y.GetShape()[1]);
+}
+
+TEST(RStandardScaler, VectorOutputShape)
+{
+   ROOT::RDataFrame df("TreeS", filename_);
+   auto x = AsTensor<float>(df, variables);
+
+   RStandardScaler<float> scaler;
+   scaler.Fit(x);
+   auto y = scaler.Compute({1.0, 2.0, 3.0, 4.0});
+
+   EXPECT_EQ(x.GetShape()[1], y.size());
+}
+
+TEST(RStandardScaler, GetMeans)
+{
+   ROOT::RDataFrame df("TreeS", filename_);
+   auto x = AsTensor<float>(df, variables);
+
+   RStandardScaler<float> scaler;
+   scaler.Fit(x);
+   const auto y = scaler.GetMeans();
+
+   for (std::size_t j = 0; j < x.GetShape()[1]; j++) {
+      float m = 0.0;
+      for (std::size_t i = 0; i < x.GetShape()[0]; i++) {
+         m += x(i, j);
+      }
+      m /= x.GetShape()[0];
+      ASSERT_NEAR(y[j], m, 1e-4);
+   }
+}
+
+TEST(RStandardScaler, ComputeMeans)
+{
+   ROOT::RDataFrame df("TreeS", filename_);
+   auto x = AsTensor<float>(df, variables);
+
+   RStandardScaler<float> scaler;
+   scaler.Fit(x);
+   const auto y = scaler.Compute(x);
+
+   for (std::size_t j = 0; j < x.GetShape()[1]; j++) {
+      float m = 0.0;
+      for (std::size_t i = 0; i < x.GetShape()[0]; i++) {
+         m += y(i, j);
+      }
+      m /= x.GetShape()[0];
+      ASSERT_NEAR(0.0, m, 1e-4);
+   }
+}
+
+TEST(RStandardScaler, ComputeStds)
+{
+   ROOT::RDataFrame df("TreeS", filename_);
+   auto x = AsTensor<float>(df, variables);
+
+   RStandardScaler<float> scaler;
+   scaler.Fit(x);
+   const auto y = scaler.Compute(x);
+
+   for (std::size_t j = 0; j < x.GetShape()[1]; j++) {
+      float s = 0.0;
+      for (std::size_t i = 0; i < x.GetShape()[0]; i++) {
+         s += y(i, j) * y(i, j);
+      }
+      s = std::sqrt(s / (x.GetShape()[0] - 1));
+      ASSERT_NEAR(1.0, s, 1e-4);
+   }
+}
+
+TEST(RStandardScaler, CompareVectorTensorOutput)
+{
+   ROOT::RDataFrame df("TreeS", filename_);
+   auto x = AsTensor<float>(df, variables);
+
+   RStandardScaler<float> scaler;
+   scaler.Fit(x);
+   const auto y = scaler.Compute(x);
+
+   for (std::size_t i = 0; i < x.GetShape()[0]; i++) {
+      const auto y2 = scaler.Compute({x(i, 0), x(i, 1), x(i, 2), x(i, 3)});
+      for (std::size_t j = 0; j < x.GetShape()[1]; j++) {
+         EXPECT_EQ(y2[j], y(i, j));
+      }
+   }
+}
+
+TEST(RStandardScaler, SaveLoad)
+{
+   ROOT::RDataFrame df("TreeS", filename_);
+   auto x = AsTensor<float>(df, variables);
+
+   RStandardScaler<float> scaler;
+   scaler.Fit(x);
+   scaler.Save("foo", "RStandardScalerSaveLoad.root");
+
+   RStandardScaler<float> scaler2("foo", "RStandardScalerSaveLoad.root");
+   auto y1 = scaler.Compute({1.0, 2.0, 3.0, 4.0});
+   auto y2 = scaler2.Compute({1.0, 2.0, 3.0, 4.0});
+   for (std::size_t i = 0; i < 4; i++) {
+      EXPECT_EQ(y1[i], y2[i]);
+   }
+}
diff --git a/tutorials/tmva/tmva004_RStandardScaler.C b/tutorials/tmva/tmva004_RStandardScaler.C
new file mode 100644
index 00000000000..6e02dd8c3f7
--- /dev/null
+++ b/tutorials/tmva/tmva004_RStandardScaler.C
@@ -0,0 +1,52 @@
+/// \file
+/// \ingroup tutorial_tmva
+/// \notebook -nodraw
+/// This tutorial illustrates the usage of the standard scaler as a preprocessing
+/// method.
+///
+/// \macro_code
+/// \macro_output
+///
+/// \date July 2019
+/// \author Stefan Wunsch
+
+using namespace TMVA::Experimental;
+
+void tmva004_RStandardScaler()
+{
+   // Load data used to fit the parameters
+   ROOT::RDataFrame df("TreeS", "https://root.cern/files/tmva_class_example.root");
+   auto x = AsTensor<float>(df);
+
+   // Create standard scaler and fit to data
+   RStandardScaler<float> scaler;
+   scaler.Fit(x);
+
+   // Compute transformation
+   auto y = scaler.Compute(x);
+
+   // Plot the fourth variable scaled and unscaled
+   TH1F h1("h1", ";x_{4};N_{Events}", 20, -4, 4);
+   TH1F h2("h2", ";x_{4};N_{Events}", 20, -4, 4);
+   for (std::size_t i = 0; i < x.GetShape()[0]; i++) {
+      h1.Fill(x(i, 3));
+      h2.Fill(y(i, 3));
+   }
+   h1.SetLineWidth(2);
+   h1.SetLineColor(kRed);
+   h2.SetLineWidth(2);
+   h2.SetLineColor(kBlue);
+
+   gStyle->SetOptStat(0);
+   auto c = new TCanvas("", "", 800, 800);
+   h2.Draw("HIST");
+   h1.Draw("HIST SAME");
+
+   TLegend legend(0.7, 0.7, 0.89, 0.89);
+   legend.SetBorderSize(0);
+   legend.AddEntry("h1", "Unscaled", "l");
+   legend.AddEntry("h2", "Scaled", "l");
+   legend.Draw();
+
+   c->DrawClone();
+}
--
GitLab
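
Appendix (editorial note, not part of the patch): a minimal usage sketch of the new class, assuming the patch above has been applied. The dataset URL, tree name and branch names are taken from the tests and tutorial in this patch; the macro name rstandardscaler_example and the output file scaler.root are illustrative assumptions.

// Minimal usage sketch for RStandardScaler (assumes this patch is applied).
// The dataset, tree and branch names match the tests above; "scaler.root"
// and the macro name are only examples.
#include <TMVA/RStandardScaler.hxx>
#include <TMVA/RTensorUtils.hxx>
#include <ROOT/RDataFrame.hxx>

#include <iostream>
#include <string>
#include <vector>

void rstandardscaler_example()
{
   using namespace TMVA::Experimental;

   // Read the four example variables into a rank-2 tensor (events x variables)
   ROOT::RDataFrame df("TreeS", "http://root.cern.ch/files/tmva_class_example.root");
   std::vector<std::string> variables = {"var1", "var2", "var3", "var4"};
   auto x = AsTensor<float>(df, variables);

   // Fit the scaler, i.e. compute the per-variable means and standard deviations
   RStandardScaler<float> scaler;
   scaler.Fit(x);

   // Transform the full dataset at once ...
   auto y = scaler.Compute(x);

   // ... or a single event passed as a vector
   auto event = scaler.Compute({1.f, 2.f, 3.f, 4.f});

   // Persist the fitted parameters and restore them in a second instance
   scaler.Save("scaler", "scaler.root");
   RStandardScaler<float> restored("scaler", "scaler.root");
   auto event2 = restored.Compute({1.f, 2.f, 3.f, 4.f});

   // The restored scaler reproduces the original transformation
   for (std::size_t i = 0; i < event.size(); i++)
      std::cout << event[i] << " == " << event2[i] << std::endl;
}

Because Save() stores only the fitted means and standard deviations in a small ROOT file, the same transformation can be reproduced later, e.g. at inference time, without access to the original training data.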