From 8f5faa7c601f4d109780e9a82c83422cf6063f82 Mon Sep 17 00:00:00 2001
From: Stefan Wunsch <stefan.wunsch@cern.ch>
Date: Wed, 3 Jul 2019 12:33:28 +0200
Subject: [PATCH] [TMVA experimental] Add RStandardScaler preprocessing method
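
The RStandardScaler fits per-column means and standard deviations on a
rank-2 RTensor and standardizes data via y = (x - mean) / std, either
for a full tensor or for a single event passed as a vector. The fitted
parameters can be saved to and restored from a ROOT file.

Minimal usage sketch (the input file, tree and branch names below are
illustrative only):

    #include <TMVA/RStandardScaler.hxx>
    #include <TMVA/RTensorUtils.hxx>
    #include <ROOT/RDataFrame.hxx>

    using namespace TMVA::Experimental;

    void example()
    {
       // Hypothetical input file and tree, used only for illustration
       ROOT::RDataFrame df("Events", "data.root");
       auto x = AsTensor<float>(df, {"var1", "var2"});

       RStandardScaler<float> scaler;
       scaler.Fit(x);                        // compute per-column mean and std
       auto y = scaler.Compute(x);           // standardized copy of x

       scaler.Save("scaler", "scaler.root"); // persist fitted parameters
       RStandardScaler<float> restored("scaler", "scaler.root");
       auto z = restored.Compute({1.f, 2.f}); // single-event transformation
    }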

---
 tmva/tmva/CMakeLists.txt                 |   1 +
 tmva/tmva/inc/TMVA/RStandardScaler.hxx   | 118 +++++++++++++++++++++
 tmva/tmva/inc/TMVA/RTensor.hxx           |  61 ++++++++---
 tmva/tmva/test/CMakeLists.txt            |   2 +
 tmva/tmva/test/rstandardscaler.cxx       | 128 +++++++++++++++++++++++
 tutorials/tmva/tmva004_RStandardScaler.C |  52 +++++++++
 6 files changed, 349 insertions(+), 13 deletions(-)
 create mode 100644 tmva/tmva/inc/TMVA/RStandardScaler.hxx
 create mode 100644 tmva/tmva/test/rstandardscaler.cxx
 create mode 100644 tutorials/tmva/tmva004_RStandardScaler.C

diff --git a/tmva/tmva/CMakeLists.txt b/tmva/tmva/CMakeLists.txt
index 068a524493e..62007169475 100644
--- a/tmva/tmva/CMakeLists.txt
+++ b/tmva/tmva/CMakeLists.txt
@@ -24,6 +24,7 @@ if(dataframe)
     set(TMVA_EXTRA_HEADERS
         TMVA/RTensor.hxx
         TMVA/RTensorUtils.hxx
+        TMVA/RStandardScaler.hxx
     )
     set(TMVA_EXTRA_DEPENDENCIES
         ROOTDataFrame
diff --git a/tmva/tmva/inc/TMVA/RStandardScaler.hxx b/tmva/tmva/inc/TMVA/RStandardScaler.hxx
new file mode 100644
index 00000000000..9fd79b4d2a4
--- /dev/null
+++ b/tmva/tmva/inc/TMVA/RStandardScaler.hxx
@@ -0,0 +1,118 @@
+#ifndef TMVA_RSTANDARDSCALER
+#define TMVA_RSTANDARDSCALER
+
+#include <TFile.h>
+
+#include <TMVA/RTensor.hxx>
+#include <ROOT/RStringView.hxx>
+
+#include <cmath>
+#include <stdexcept>
+#include <vector>
+
+namespace TMVA {
+namespace Experimental {
+
+template <typename T>
+class RStandardScaler {
+private:
+   std::vector<T> fMeans;
+   std::vector<T> fStds;
+
+public:
+   RStandardScaler() = default;
+   RStandardScaler(std::string_view title, std::string_view filename);
+   void Fit(const RTensor<T>& x);
+   std::vector<T> Compute(const std::vector<T>& x) const;
+   RTensor<T> Compute(const RTensor<T>& x) const;
+   std::vector<T> GetMeans() const { return fMeans; }
+   std::vector<T> GetStds() const { return fStds; }
+   void Save(std::string_view title, std::string_view filename);
+};
+
+template <typename T>
+inline RStandardScaler<T>::RStandardScaler(std::string_view title, std::string_view filename) {
+   auto file = TFile::Open(filename.data(), "READ");
+   RStandardScaler<T>* obj;
+   file->GetObject(title.data(), obj);
+   fMeans = obj->GetMeans();
+   fStds = obj->GetStds();
+   delete obj;
+   file->Close();
+}
+
+template <typename T>
+inline void RStandardScaler<T>::Save(std::string_view title, std::string_view filename) {
+   auto file = TFile::Open(filename.data(), "UPDATE");
+   file->WriteObject<RStandardScaler<T>>(this, title.data());
+   file->Write();
+   file->Close();
+}
+
+template <typename T>
+inline void RStandardScaler<T>::Fit(const RTensor<T>& x) {
+   const auto shape = x.GetShape();
+   if (shape.size() != 2)
+      throw std::runtime_error("Can only fit to input tensor of rank 2.");
+   fMeans.clear();
+   fMeans.resize(shape[1]);
+   fStds.clear();
+   fStds.resize(shape[1]);
+
+   // Compute means
+   for (std::size_t i = 0; i < shape[0]; i++) {
+      for (std::size_t j = 0; j < shape[1]; j++) {
+         fMeans[j] += x(i, j);
+      }
+   }
+   for (std::size_t i = 0; i < shape[1]; i++) {
+      fMeans[i] /= shape[0];
+   }
+
+   // Compute standard deviations using unbiased estimator
+   for (std::size_t i = 0; i < shape[0]; i++) {
+      for (std::size_t j = 0; j < shape[1]; j++) {
+         fStds[j] += (x(i, j) - fMeans[j]) * (x(i, j) - fMeans[j]);
+      }
+   }
+   for (std::size_t i = 0; i < shape[1]; i++) {
+      fStds[i] = std::sqrt(fStds[i] / (shape[0] - 1));
+   }
+}
+
+template <typename T>
+inline std::vector<T> RStandardScaler<T>::Compute(const std::vector<T>& x) const {
+   const auto size = x.size();
+   if (size != fMeans.size())
+      throw std::runtime_error("Size of input vector is not equal to number of fitted variables.");
+
+   std::vector<T> y(size);
+   for (std::size_t i = 0; i < size; i++) {
+      y[i] = (x[i] - fMeans[i]) / fStds[i];
+   }
+
+   return y;
+}
+
+template <typename T>
+inline RTensor<T> RStandardScaler<T>::Compute(const RTensor<T>& x) const {
+   const auto shape = x.GetShape();
+   if (shape.size() != 2)
+      throw std::runtime_error("Can only compute output for input tensor of rank 2.");
+   if (shape[1] != fMeans.size())
+      throw std::runtime_error("Second dimension of input tensor is not equal to number of fitted variables.");
+
+   RTensor<T> y(shape);
+   for (std::size_t i = 0; i < shape[0]; i++) {
+      for (std::size_t j = 0; j < shape[1]; j++) {
+         y(i, j) = (x(i, j) - fMeans[j]) / fStds[j];
+      }
+   }
+
+   return y;
+}
+
+} // namespace Experimental
+} // namespace TMVA
+
+#endif // TMVA_RSTANDARDSCALER
diff --git a/tmva/tmva/inc/TMVA/RTensor.hxx b/tmva/tmva/inc/TMVA/RTensor.hxx
index ff0e181a33c..e1f24d6d077 100644
--- a/tmva/tmva/inc/TMVA/RTensor.hxx
+++ b/tmva/tmva/inc/TMVA/RTensor.hxx
@@ -91,6 +91,21 @@ inline T ComputeIndicesFromGlobalIndex(const T& shape, MemoryLayout layout, cons
     return indices;
 }
 
+/// \brief Compute global index from indices
+/// \param[in] strides Strides vector
+/// \param[in] idx Index vector
+/// \return Global index
+template <typename U, typename V>
+inline std::size_t ComputeGlobalIndex(const U& strides, const V& idx)
+{
+   std::size_t globalIndex = 0;
+   const auto size = idx.size();
+   for (std::size_t i = 0; i < size; i++) {
+      globalIndex += strides[size - 1 - i] * idx[size - 1 - i];
+   }
+   return globalIndex;
+}
+
 /// \brief Type checking for all types of a parameter pack, e.g., used in combination with std::is_convertible
 template <class... Ts>
 struct and_types : std::true_type {
@@ -204,17 +219,19 @@ public:
 
    // Access elements
    Value_t &operator()(const Index_t &idx);
+   const Value_t &operator()(const Index_t &idx) const;
    template <typename... Idx> Value_t &operator()(Idx... idx);
+   template <typename... Idx> const Value_t &operator()(Idx... idx) const;
 
    // Access properties
-   std::size_t GetSize() { return fSize; }
-   Shape_t GetShape() { return fShape; }
-   Shape_t GetStrides() { return fStrides; }
-   Value_t *GetData() { return fData; }
-   std::shared_ptr<Container_t> GetContainer() { return fContainer; }
-   MemoryLayout GetMemoryLayout() { return fLayout; }
-   bool IsView() { return fContainer == NULL; }
-   bool IsOwner() { return !IsView(); }
+   std::size_t GetSize() const { return fSize; }
+   Shape_t GetShape() const { return fShape; }
+   Shape_t GetStrides() const { return fStrides; }
+   Value_t *GetData() const { return fData; }
+   std::shared_ptr<Container_t> GetContainer() const { return fContainer; }
+   MemoryLayout GetMemoryLayout() const { return fLayout; }
+   bool IsView() const { return fContainer == NULL; }
+   bool IsOwner() const { return !IsView(); }
 
    // Copy
    RTensor<Value_t, Container_t> Copy(MemoryLayout layout = MemoryLayout::RowMajor);
@@ -278,11 +295,17 @@ public:
 template <typename Value_t, typename Container_t>
 inline Value_t &RTensor<Value_t, Container_t>::operator()(const Index_t &idx)
 {
-   std::size_t globalIndex = 0;
-   const auto size = idx.size();
-   for (std::size_t i = 0; i < size; i++) {
-      globalIndex += fStrides[size - 1 - i] * idx[size - 1 - i];
-   }
+   const auto globalIndex = Internal::ComputeGlobalIndex(fStrides, idx);
+   return fData[globalIndex];
+}
+
+/// \brief Access elements
+/// \param[in] idx Index vector
+/// \return Reference to element
+template <typename Value_t, typename Container_t>
+inline const Value_t &RTensor<Value_t, Container_t>::operator()(const Index_t &idx) const
+{
+   const auto globalIndex = Internal::ComputeGlobalIndex(fStrides, idx);
    return fData[globalIndex];
 }
 
@@ -298,6 +321,18 @@ Value_t &RTensor<Value_t, Container_t>::operator()(Idx... idx)
    return operator()({static_cast<std::size_t>(idx)...});
 }
 
+/// \brief Access elements
+/// \param[in] idx Indices
+/// \return Reference to element
+template <typename Value_t, typename Container_t>
+template <typename... Idx>
+const Value_t &RTensor<Value_t, Container_t>::operator()(Idx... idx) const
+{
+   static_assert(Internal::and_types<std::is_convertible<Idx, std::size_t>...>{},
+                 "Indices are not convertible to std::size_t.");
+   return operator()({static_cast<std::size_t>(idx)...});
+}
+
 /// \brief Transpose
 /// \returns New RTensor
 /// The tensor is transposed by inverting the associated memory layout from row-
diff --git a/tmva/tmva/test/CMakeLists.txt b/tmva/tmva/test/CMakeLists.txt
index fe285b0cf5e..20cd441d495 100644
--- a/tmva/tmva/test/CMakeLists.txt
+++ b/tmva/tmva/test/CMakeLists.txt
@@ -21,6 +21,8 @@ if(dataframe)
     ROOT_ADD_GTEST(rtensor rtensor.cxx LIBRARIES ROOTVecOps TMVA)
     ROOT_ADD_GTEST(rtensor-iterator rtensor_iterator.cxx LIBRARIES ROOTVecOps TMVA)
     ROOT_ADD_GTEST(rtensor-utils rtensor_utils.cxx LIBRARIES ROOTVecOps TMVA ROOTDataFrame)
+    # RStandardScaler
+    ROOT_ADD_GTEST(rstandardscaler rstandardscaler.cxx LIBRARIES ROOTVecOps TMVA ROOTDataFrame)
 endif()
 
 project(tmva-tests)
diff --git a/tmva/tmva/test/rstandardscaler.cxx b/tmva/tmva/test/rstandardscaler.cxx
new file mode 100644
index 00000000000..0c6af93eecc
--- /dev/null
+++ b/tmva/tmva/test/rstandardscaler.cxx
@@ -0,0 +1,128 @@
+#include <gtest/gtest.h>
+
+#include <TMVA/RStandardScaler.hxx>
+#include <TMVA/RTensorUtils.hxx>
+
+#include <cmath>
+
+using namespace TMVA::Experimental;
+
+static const std::string filename_ = "http://root.cern.ch/files/tmva_class_example.root";
+static const std::vector<std::string> variables = {"var1", "var2", "var3", "var4"};
+
+TEST(RStandardScaler, TensorOutputShape)
+{
+   ROOT::RDataFrame df("TreeS", filename_);
+   auto x = AsTensor<float>(df, variables);
+
+   RStandardScaler<float> scaler;
+   scaler.Fit(x);
+   auto y = scaler.Compute(x);
+
+   EXPECT_EQ(y.GetShape().size(), 2ul);
+   EXPECT_EQ(x.GetShape()[0], y.GetShape()[0]);
+   EXPECT_EQ(x.GetShape()[1], y.GetShape()[1]);
+}
+
+TEST(RStandardScaler, VectorOutputShape)
+{
+   ROOT::RDataFrame df("TreeS", filename_);
+   auto x = AsTensor<float>(df, variables);
+
+   RStandardScaler<float> scaler;
+   scaler.Fit(x);
+   auto y = scaler.Compute({1.0, 2.0, 3.0, 4.0});
+
+   EXPECT_EQ(x.GetShape()[1], y.size());
+}
+
+TEST(RStandardScaler, GetMeans)
+{
+   ROOT::RDataFrame df("TreeS", filename_);
+   auto x = AsTensor<float>(df, variables);
+
+   RStandardScaler<float> scaler;
+   scaler.Fit(x);
+   const auto y = scaler.GetMeans();
+
+   for (std::size_t j = 0; j < x.GetShape()[1]; j++) {
+      float m = 0.0;
+      for (std::size_t i = 0; i < x.GetShape()[0]; i++) {
+         m += x(i, j);
+      }
+      m /= x.GetShape()[0];
+      ASSERT_NEAR(y[j], m, 1e-4);
+   }
+}
+
+TEST(RStandardScaler, ComputeMeans)
+{
+   ROOT::RDataFrame df("TreeS", filename_);
+   auto x = AsTensor<float>(df, variables);
+
+   RStandardScaler<float> scaler;
+   scaler.Fit(x);
+   const auto y = scaler.Compute(x);
+
+   for (std::size_t j = 0; j < x.GetShape()[1]; j++) {
+      float m = 0.0;
+      for (std::size_t i = 0; i < x.GetShape()[0]; i++) {
+         m += y(i, j);
+      }
+      m /= x.GetShape()[0];
+      ASSERT_NEAR(0.0, m, 1e-4);
+   }
+}
+
+TEST(RStandardScaler, ComputeStds)
+{
+   ROOT::RDataFrame df("TreeS", filename_);
+   auto x = AsTensor<float>(df, variables);
+
+   RStandardScaler<float> scaler;
+   scaler.Fit(x);
+   const auto y = scaler.Compute(x);
+
+   for (std::size_t j = 0; j < x.GetShape()[1]; j++) {
+      float s = 0.0;
+      for (std::size_t i = 0; i < x.GetShape()[0]; i++) {
+         s += y(i, j) * y(i, j);
+      }
+      s = std::sqrt(s / (x.GetShape()[0] - 1));
+      ASSERT_NEAR(1.0, s, 1e-4);
+   }
+}
+
+TEST(RStandardScaler, CompareVectorTensorOutput)
+{
+   ROOT::RDataFrame df("TreeS", filename_);
+   auto x = AsTensor<float>(df, variables);
+
+   RStandardScaler<float> scaler;
+   scaler.Fit(x);
+   const auto y = scaler.Compute(x);
+
+   for (std::size_t i = 0; i < x.GetShape()[0]; i++) {
+      const auto y2 = scaler.Compute({x(i, 0), x(i, 1), x(i, 2), x(i, 3)});
+      for (std::size_t j = 0; j < x.GetShape()[1]; j++) {
+         EXPECT_EQ(y2[j], y(i, j));
+      }
+   }
+}
+
+TEST(RStandardScaler, SaveLoad)
+{
+   ROOT::RDataFrame df("TreeS", filename_);
+   auto x = AsTensor<float>(df, variables);
+
+   RStandardScaler<float> scaler;
+   scaler.Fit(x);
+   scaler.Save("foo", "RStandardScalerSaveLoad.root");
+
+   RStandardScaler<float> scaler2("foo", "RStandardScalerSaveLoad.root");
+   auto y1 = scaler.Compute({1.0, 2.0, 3.0, 4.0});
+   auto y2 = scaler2.Compute({1.0, 2.0, 3.0, 4.0});
+   for (std::size_t i = 0; i < 4; i++) {
+      EXPECT_EQ(y1[i], y2[i]);
+   }
+}
diff --git a/tutorials/tmva/tmva004_RStandardScaler.C b/tutorials/tmva/tmva004_RStandardScaler.C
new file mode 100644
index 00000000000..6e02dd8c3f7
--- /dev/null
+++ b/tutorials/tmva/tmva004_RStandardScaler.C
@@ -0,0 +1,52 @@
+/// \file
+/// \ingroup tutorial_tmva
+/// \notebook -nodraw
+/// This tutorial illustrates how to use the standard scaler as a preprocessing
+/// method.
+///
+/// \macro_code
+/// \macro_output
+///
+/// \date July 2019
+/// \author Stefan Wunsch
+
+using namespace TMVA::Experimental;
+
+void tmva004_RStandardScaler()
+{
+   // Load data used to fit the parameters
+   ROOT::RDataFrame df("TreeS", "https://root.cern/files/tmva_class_example.root");
+   auto x = AsTensor<float>(df);
+
+   // Create standard scaler and fit to data
+   RStandardScaler<float> scaler;
+   scaler.Fit(x);
+
+   // Compute transformation
+   auto y = scaler.Compute(x);
+
+   // Plot the fourth variable, scaled and unscaled
+   TH1F h1("h1", ";x_{4};N_{Events}", 20, -4, 4);
+   TH1F h2("h2", ";x_{4};N_{Events}", 20, -4, 4);
+   for (std::size_t i = 0; i < x.GetShape()[0]; i++) {
+      h1.Fill(x(i, 3));
+      h2.Fill(y(i, 3));
+   }
+   h1.SetLineWidth(2);
+   h1.SetLineColor(kRed);
+   h2.SetLineWidth(2);
+   h2.SetLineColor(kBlue);
+
+   gStyle->SetOptStat(0);
+   auto c = new TCanvas("", "", 800, 800);
+   h2.Draw("HIST");
+   h1.Draw("HIST SAME");
+
+   TLegend legend(0.7, 0.7, 0.89, 0.89);
+   legend.SetBorderSize(0);
+   legend.AddEntry("h1", "Unscaled", "l");
+   legend.AddEntry("h2", "Scaled", "l");
+   legend.Draw();
+
+   c->DrawClone();
+}
-- 
GitLab