From 06cd6dce8fde1599f15acad7e0829dbca2408a1f Mon Sep 17 00:00:00 2001
From: Guilherme Amadio <amadio@cern.ch>
Date: Tue, 3 Oct 2017 13:34:27 +0200
Subject: [PATCH] [TDF] Add tutorials to show how to use jitted defines and
 filters

---
 tutorials/dataframe/tdf012_using_jit.C  | 63 +++++++++++++++++++++++++
 tutorials/dataframe/tdf012_using_jit.py | 51 ++++++++++++++++++++
 2 files changed, 114 insertions(+)
 create mode 100644 tutorials/dataframe/tdf012_using_jit.C
 create mode 100644 tutorials/dataframe/tdf012_using_jit.py

diff --git a/tutorials/dataframe/tdf012_using_jit.C b/tutorials/dataframe/tdf012_using_jit.C
new file mode 100644
index 00000000000..feafd831bc9
--- /dev/null
+++ b/tutorials/dataframe/tdf012_using_jit.C
@@ -0,0 +1,63 @@
+/// \file
+/// \ingroup tutorial_tdataframe
+/// \notebook -nodraw
+///
+/// This tutorial illustrates how to save some typing when using TDataFrame
+/// by invoking functions that perform jit-compiling at runtime.
+///
+/// \macro_code
+///
+/// \date October 2017
+/// \author Guilherme Amadio
+
+#include "TRandom.h"
+#include "ROOT/TDataFrame.hxx"
+
+void tdf012_using_jit()
+{
+   // We will inefficiently calculate an approximation of pi by generating
+   // some data and doing very simple filtering and analysis on it
+
+   // We start by creating an empty dataframe where we will insert 10 million
+   // random points in a square of side 2.0 (that is, with an inscribed circle
+   // of radius 1.0)
+
+   size_t npoints = 10000000;
+   ROOT::Experimental::TDataFrame tdf(npoints);
+
+   // Make generators known to interpreter
+   gInterpreter->ProcessLine("TRandom rx, ry;");
+
+   // Use different seeds for independent streams
+   gInterpreter->ProcessLine("rx.SetSeed(1);");
+   gInterpreter->ProcessLine("ry.SetSeed(2);");
+
+   // Define what we want inside the dataframe. We do not need to define p as an array,
+   // but we do it here to demonstrate how to use jitting with TDataFrame
+
+   // NOTE: The loop variable below is deliberately named "w". Although it's
+   // possible to use "for (auto&& x : p)", that would shadow the data column
+   // "x", and may cause compilation failures if the local variable and the
+   // data column are of different types or x is declared at the lambda's top scope
+
+   auto pidf = tdf.Define("x", "rx.Uniform(-1.0, 1.0)")
+                  .Define("y", "ry.Uniform(-1.0, 1.0)")
+                  .Define("p", "std::array<double, 2> v{x, y}; return v;")
+                  .Define("r", "double r2 = 0.0; for (auto&& w : p) r2 += w*w; return sqrt(r2);");
+
+   // Now we have a dataframe with columns x, y, p (which is a point based on x
+   // and y), and the radius r = sqrt(x*x + y*y). In order to approximate pi, we
+   // need to know how many of our data points fall inside the unit circle compared
+   // with the total number of points. The ratio of the areas is
+   //
+   //     A_circle / A_square = (pi * r*r) / (l*l), where r = 1.0, and l = 2.0
+   //
+   // Therefore, we can approximate pi with 4 times the number of points inside the
+   // unit circle over the total number of points in our dataframe:
+
+   auto incircle = *(pidf.Filter("r <= 1.0").Count());
+
+   double pi_approx = 4.0 * incircle / npoints;
+
+   std::cout << "pi is approximately equal to " << pi_approx << std::endl;
+}
diff --git a/tutorials/dataframe/tdf012_using_jit.py b/tutorials/dataframe/tdf012_using_jit.py
new file mode 100644
index 00000000000..cdadb7bbb5e
--- /dev/null
+++ b/tutorials/dataframe/tdf012_using_jit.py
@@ -0,0 +1,51 @@
+## \file
+## \ingroup tutorial_tdataframe
+## \notebook -nodraw
+##
+## This tutorial illustrates how to use jit-compiling features of TDataFrame
+## to define data using C++ code in a Python script
+##
+## \macro_code
+##
+## \date October 2017
+## \author Guilherme Amadio
+
+import ROOT
+
+## We will inefficiently calculate an approximation of pi by generating
+## some data and doing very simple filtering and analysis on it.
+
+## We start by creating an empty dataframe where we will insert 10 million
+## random points in a square of side 2.0 (that is, with an inscribed unit
+## circle).
+
+npoints = 10000000
+tdf = ROOT.ROOT.Experimental.TDataFrame(npoints)
+
+ROOT.gInterpreter.ProcessLine("TRandom rx, ry;")
+ROOT.gInterpreter.ProcessLine("rx.SetSeed(1);")
+ROOT.gInterpreter.ProcessLine("ry.SetSeed(2);")
+
+## Define what data we want inside the dataframe. We do not need to define p
+## as an array, but we do it here to demonstrate how to use jitting with TDataFrame
+
+pidf = tdf.Define("x", "rx.Uniform(-1.0, 1.0)") \
+          .Define("y", "ry.Uniform(-1.0, 1.0)") \
+          .Define("p", "std::array<double, 2> v{x, y}; return v;") \
+          .Define("r", "double r2 = 0.0; for (auto&& w : p) r2 += w*w; return sqrt(r2);")
+
+## Now we have a dataframe with columns x, y, p (which is a point based on x
+## and y), and the radius r = sqrt(x*x + y*y). In order to approximate pi, we
+## need to know how many of our data points fall inside the circle of radius
+## one compared with the total number of points. The ratio of the areas is
+##
+##     A_circle / A_square = pi r*r / l * l, where r = 1.0, and l = 2.0
+##
+## Therefore, we can approximate pi with 4 times the number of points inside
+## the unit circle over the total number of points:
+
+incircle = pidf.Filter("r <= 1.0").Count().GetValue()
+
+pi_approx = 4.0 * incircle / npoints
+
+print("pi is approximately equal to %g" % (pi_approx))
-- 
GitLab