From 989619c9a105e8199cce76b7bf27f8fe164b6439 Mon Sep 17 00:00:00 2001 From: Stefan Wunsch <stefan.wunsch@cern.ch> Date: Mon, 8 Apr 2019 14:17:57 +0200 Subject: [PATCH] [PyROOT exp] Introduce MakeNumpyDataFrame to read numpy arrays with RDF --- .../pyroot_experimental/PyROOT/CMakeLists.txt | 2 + .../PyROOT/inc/ROOT/RNumpyDS.hxx | 236 ++++++++++++++++++ .../python/ROOT/pythonization/_rdataframe.py | 5 + .../PyROOT/src/PyROOTModule.cxx | 2 + .../PyROOT/src/PyROOTPythonize.h | 2 + .../PyROOT/src/RDataFramePyz.cxx | 111 ++++++++ .../PyROOT/test/CMakeLists.txt | 1 + .../PyROOT/test/rdataframe_makenumpy.py | 138 ++++++++++ 8 files changed, 497 insertions(+) create mode 100644 bindings/pyroot_experimental/PyROOT/inc/ROOT/RNumpyDS.hxx create mode 100644 bindings/pyroot_experimental/PyROOT/src/RDataFramePyz.cxx create mode 100644 bindings/pyroot_experimental/PyROOT/test/rdataframe_makenumpy.py diff --git a/bindings/pyroot_experimental/PyROOT/CMakeLists.txt b/bindings/pyroot_experimental/PyROOT/CMakeLists.txt index a696d356f47..33df4980e61 100644 --- a/bindings/pyroot_experimental/PyROOT/CMakeLists.txt +++ b/bindings/pyroot_experimental/PyROOT/CMakeLists.txt @@ -36,6 +36,7 @@ set(sources src/PyROOTStrings.cxx src/PyROOTWrapper.cxx src/GenericPyz.cxx + src/RDataFramePyz.cxx src/RVecPyz.cxx src/TClassPyz.cxx src/TClonesArrayPyz.cxx @@ -56,5 +57,6 @@ foreach(py_source ${py_sources}) endforeach() ROOT_LINKER_LIBRARY(ROOTPython ${sources} LIBRARIES Core Tree cppyy) +ROOT_INSTALL_HEADERS(inc) ROOT_ADD_TEST_SUBDIRECTORY(test) diff --git a/bindings/pyroot_experimental/PyROOT/inc/ROOT/RNumpyDS.hxx b/bindings/pyroot_experimental/PyROOT/inc/ROOT/RNumpyDS.hxx new file mode 100644 index 00000000000..2013b29c998 --- /dev/null +++ b/bindings/pyroot_experimental/PyROOT/inc/ROOT/RNumpyDS.hxx @@ -0,0 +1,236 @@ +// Author: Stefan Wunsch CERN 04/2019 + +/************************************************************************* + * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +#include "ROOT/RIntegerSequence.hxx" +#include "ROOT/RMakeUnique.hxx" +#include "ROOT/RDataSource.hxx" +#include "ROOT/TSeq.hxx" +#include "ROOT/RVec.hxx" + +#include <algorithm> +#include <map> +#include <tuple> +#include <string> +#include <typeinfo> +#include <vector> + +#include "Python.h" + +#ifndef ROOT_RNUMPYDS +#define ROOT_RNUMPYDS + +namespace ROOT { + +namespace Internal { + +namespace RDF { + +//////////////////////////////////////////////////////////////////////////////////////////////// +/// \brief A RDataSource implementation which takes a collection of RVecs, which +/// are able to adopt data from Numpy arrays +/// +/// This component allows to create a data source on a set of columns with data +/// coming from RVecs. The adoption of externally provided data, e.g., via Numpy +/// arrays, with RVecs allows to read arbitrary data from memory. +/// In addition, the data source has to keep a reference on the Python owned data +/// so that the lifetime of the data is tied to the datasource. +template <typename... ColumnTypes> +class RNumpyDS final : public ROOT::RDF::RDataSource { + using PointerHolderPtrs_t = std::vector<ROOT::Internal::TDS::TPointerHolder *>; + + std::tuple<ROOT::RVec<ColumnTypes>*...> fColumns; + const std::vector<std::string> fColNames; + const std::map<std::string, std::string> fColTypesMap; + // The role of the fPointerHoldersModels is to be initialised with the pack + // of arguments in the constrcutor signature at construction time + // Once the number of slots is known, the fPointerHolders are initialised + // according to the models. + const PointerHolderPtrs_t fPointerHoldersModels; + std::vector<PointerHolderPtrs_t> fPointerHolders; + std::vector<std::pair<ULong64_t, ULong64_t>> fEntryRanges{}; + unsigned int fNSlots{0}; + // Pointer to PyObject holding RVecs + // The RVecs itself hold a reference to the associated Numpy arrays so that + // the data cannot go out of scope as long as the datasource survives. + PyObject* fPyRVecs; + + Record_t GetColumnReadersImpl(std::string_view colName, const std::type_info &id) + { + auto colNameStr = std::string(colName); + // This could be optimised and done statically + const auto idName = ROOT::Internal::RDF::TypeID2TypeName(id); + auto it = fColTypesMap.find(colNameStr); + if (fColTypesMap.end() == it) { + std::string err = "The specified column name, \"" + colNameStr + "\" is not known to the data source."; + throw std::runtime_error(err); + } + + const auto colIdName = it->second; + if (colIdName != idName) { + std::string err = "Column " + colNameStr + " has type " + colIdName + + " while the id specified is associated to type " + idName; + throw std::runtime_error(err); + } + + const auto colBegin = fColNames.begin(); + const auto colEnd = fColNames.end(); + const auto namesIt = std::find(colBegin, colEnd, colName); + const auto index = std::distance(colBegin, namesIt); + + Record_t ret(fNSlots); + for (auto slot : ROOT::TSeqU(fNSlots)) { + ret[slot] = fPointerHolders[index][slot]->GetPointerAddr(); + } + return ret; + } + + size_t GetEntriesNumber() { return std::get<0>(fColumns)->size(); } + template <std::size_t... S> + void SetEntryHelper(unsigned int slot, ULong64_t entry, std::index_sequence<S...>) + { + std::initializer_list<int> expander{ + (*static_cast<ColumnTypes *>(fPointerHolders[S][slot]->GetPointer()) = (*std::get<S>(fColumns))[entry], 0)...}; + (void)expander; // avoid unused variable warnings + } + + template <std::size_t... S> + void ColLenghtChecker(std::index_sequence<S...>) + { + if (sizeof...(S) < 2) + return; + + const std::vector<size_t> colLengths{std::get<S>(fColumns)->size()...}; + const auto expectedLen = colLengths[0]; + std::string err; + for (auto i : TSeqI(1, colLengths.size())) { + if (expectedLen != colLengths[i]) { + err += "Column \"" + fColNames[i] + "\" and column \"" + fColNames[0] + + "\" have different lengths: " + std::to_string(expectedLen) + " and " + + std::to_string(colLengths[i]); + } + } + if (!err.empty()) { + throw std::runtime_error(err); + } + } + +protected: + std::string AsString() { return "Numpy data source"; }; + +public: + RNumpyDS(PyObject* pyRVecs, + std::pair<std::string, ROOT::RVec<ColumnTypes>*>... colsNameVals) + : fColumns(std::tuple<ROOT::RVec<ColumnTypes>*...>(colsNameVals.second...)), + fColNames({colsNameVals.first...}), + fColTypesMap({{colsNameVals.first, ROOT::Internal::RDF::TypeID2TypeName(typeid(ColumnTypes))}...}), + fPointerHoldersModels({new ROOT::Internal::TDS::TTypedPointerHolder<ColumnTypes>(new ColumnTypes())...}), + fPyRVecs(pyRVecs) + { + // Take a reference to the data associated with this data source + Py_INCREF(fPyRVecs); + } + + ~RNumpyDS() + { + for (auto &&ptrHolderv : fPointerHolders) { + for (auto &&ptrHolder : ptrHolderv) { + delete ptrHolder; + } + } + // Release the data associated to this data source + Py_DECREF(fPyRVecs); + } + + const std::vector<std::string> &GetColumnNames() const { return fColNames; } + + std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() + { + auto entryRanges(std::move(fEntryRanges)); // empty fEntryRanges + return entryRanges; + } + + std::string GetTypeName(std::string_view colName) const + { + const auto key = std::string(colName); + return fColTypesMap.at(key); + } + + bool HasColumn(std::string_view colName) const + { + const auto key = std::string(colName); + const auto endIt = fColTypesMap.end(); + return endIt != fColTypesMap.find(key); + } + + bool SetEntry(unsigned int slot, ULong64_t entry) + { + SetEntryHelper(slot, entry, std::index_sequence_for<ColumnTypes...>()); + return true; + } + + void SetNSlots(unsigned int nSlots) + { + fNSlots = nSlots; + const auto nCols = fColNames.size(); + fPointerHolders.resize(nCols); // now we need to fill it with the slots, all of the same type + auto colIndex = 0U; + for (auto &&ptrHolderv : fPointerHolders) { + for (auto slot : ROOT::TSeqI(fNSlots)) { + auto ptrHolder = fPointerHoldersModels[colIndex]->GetDeepCopy(); + ptrHolderv.emplace_back(ptrHolder); + (void)slot; + } + colIndex++; + } + for (auto &&ptrHolder : fPointerHoldersModels) + delete ptrHolder; + } + + void Initialise() + { + ColLenghtChecker(std::index_sequence_for<ColumnTypes...>()); + const auto nEntries = GetEntriesNumber(); + const auto nEntriesInRange = nEntries / fNSlots; // between integers. Should make smaller? + auto reminder = 1U == fNSlots ? 0 : nEntries % fNSlots; + fEntryRanges.resize(fNSlots); + auto init = 0ULL; + auto end = 0ULL; + for (auto &&range : fEntryRanges) { + end = init + nEntriesInRange; + if (0 != reminder) { // Distribute the reminder among the first chunks + reminder--; + end += 1; + } + range.first = init; + range.second = end; + init = end; + } + } + + std::string GetLabel() { return "RNumpyDS"; } +}; + +// Factory to create datasource able to read Numpy arrays through RVecs +// Note that we have to return the object on the heap so that the interpreter +// does not clean it up during shutdown and causes a double delete. +template <typename... ColumnTypes> +RDataFrame* MakeNumpyDataFrame(PyObject* pyRVecs, + std::pair<std::string, ROOT::RVec<ColumnTypes>*> &&... colNameProxyPairs) +{ + return new RDataFrame(std::make_unique<RNumpyDS<ColumnTypes...>>( + std::forward<PyObject*>(pyRVecs), + std::forward<std::pair<std::string, ROOT::RVec<ColumnTypes>*>>(colNameProxyPairs)...)); +} + +} // namespace RDF +} // namespace Internal +} // namespace ROOT + +#endif // ROOT_RNUMPYDS diff --git a/bindings/pyroot_experimental/PyROOT/python/ROOT/pythonization/_rdataframe.py b/bindings/pyroot_experimental/PyROOT/python/ROOT/pythonization/_rdataframe.py index 05a421ee6da..9b303bc13c1 100644 --- a/bindings/pyroot_experimental/PyROOT/python/ROOT/pythonization/_rdataframe.py +++ b/bindings/pyroot_experimental/PyROOT/python/ROOT/pythonization/_rdataframe.py @@ -9,6 +9,7 @@ ################################################################################ from ROOT import pythonization +from libROOTPython import MakeNumpyDataFrame def RDataFrameAsNumpy(df, columns=None, exclude=None): @@ -96,3 +97,7 @@ def pythonize_rdataframe(klass, name): klass.AsNumpy = RDataFrameAsNumpy return True + +# Add MakeNumpyDataFrame feature as free function to the ROOT module +import cppyy +cppyy.gbl.ROOT.RDF.MakeNumpyDataFrame = MakeNumpyDataFrame diff --git a/bindings/pyroot_experimental/PyROOT/src/PyROOTModule.cxx b/bindings/pyroot_experimental/PyROOT/src/PyROOTModule.cxx index 63d560af6d0..228cd873e11 100644 --- a/bindings/pyroot_experimental/PyROOT/src/PyROOTModule.cxx +++ b/bindings/pyroot_experimental/PyROOT/src/PyROOTModule.cxx @@ -67,6 +67,8 @@ static PyMethodDef gPyROOTMethods[] = {{(char *)"AddDirectoryWritePyz", (PyCFunc (char *)"Get class to wrap Python callable as C++ callable"}, {(char *)"AsRVec", (PyCFunction)PyROOT::AsRVec, METH_O, (char *)"Get object with array interface as RVec"}, + {(char *)"MakeNumpyDataFrame", (PyCFunction)PyROOT::MakeNumpyDataFrame, METH_O, + (char *)"Make RDataFrame from dictionary of numpy arrays"}, {NULL, NULL, 0, NULL}}; #if PY_VERSION_HEX >= 0x03000000 diff --git a/bindings/pyroot_experimental/PyROOT/src/PyROOTPythonize.h b/bindings/pyroot_experimental/PyROOT/src/PyROOTPythonize.h index 682245338ea..d5291698f55 100644 --- a/bindings/pyroot_experimental/PyROOT/src/PyROOTPythonize.h +++ b/bindings/pyroot_experimental/PyROOT/src/PyROOTPythonize.h @@ -39,6 +39,8 @@ PyObject *GetEndianess(PyObject *self, PyObject *args); PyObject *GetVectorDataPointer(PyObject *self, PyObject *args); PyObject *GetSizeOfType(PyObject *self, PyObject *args); +PyObject *MakeNumpyDataFrame(PyObject *self, PyObject *obj); + } // namespace PyROOT #endif // !PYROOT_PYTHONIZE_H diff --git a/bindings/pyroot_experimental/PyROOT/src/RDataFramePyz.cxx b/bindings/pyroot_experimental/PyROOT/src/RDataFramePyz.cxx new file mode 100644 index 00000000000..fccce3c01b4 --- /dev/null +++ b/bindings/pyroot_experimental/PyROOT/src/RDataFramePyz.cxx @@ -0,0 +1,111 @@ +// Author: Stefan Wunsch CERN 04/2019 +// Original PyROOT code by Wim Lavrijsen, LBL + +/************************************************************************* + * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +#include "CPyCppyy.h" +#include "CPPInstance.h" +#include "ProxyWrappers.h" +#include "PyROOTPythonize.h" +#include "RConfig.h" +#include "TInterpreter.h" +#include "TPython.h" + +#include <utility> // std::pair +#include <sstream> // std::stringstream + +//////////////////////////////////////////////////////////////////////////// +/// \brief Make an RDataFrame from a dictionary of numpy arrays +/// \param[in] pydata Dictionary with numpy arrays +/// +/// This function takes a dictionary of numpy arrays and creates an RDataFrame +/// using the keys as column names and the numpy arrays as data. +PyObject *PyROOT::MakeNumpyDataFrame(PyObject * /*self*/, PyObject * pydata) +{ + if (!pydata) { + PyErr_SetString(PyExc_RuntimeError, "Object not convertible: Invalid Python object."); + return NULL; + } + + if (!PyDict_Check(pydata)) { + PyErr_SetString(PyExc_RuntimeError, "Object not convertible: Python object is not a dictionary."); + return NULL; + } + + if (PyDict_Size(pydata) == 0) { + PyErr_SetString(PyExc_RuntimeError, "Object not convertible: Dictionary is empty."); + return NULL; + } + + + // Add PyObject (dictionary) holding RVecs to data source + std::stringstream code; + code << "ROOT::Internal::RDF::MakeNumpyDataFrame("; + std::stringstream pyaddress; + auto pyvecs = PyDict_New(); + pyaddress << pyvecs; + code << "reinterpret_cast<PyObject*>(" << pyaddress.str() << "), "; + + // Iterate over dictionary, convert numpy arrays to RVecs and put together interpreter code + PyObject *key, *value; + Py_ssize_t pos = 0; + const auto size = PyObject_Size(pydata); + auto counter = 0u; + while (PyDict_Next(pydata, &pos, &key, &value)) { + // Get name of key + if (!CPyCppyy_PyUnicode_Check(key)) { + PyErr_SetString(PyExc_RuntimeError, "Object not convertible: Dictionary key is not convertible to a string."); + return NULL; + } + std::string keystr = CPyCppyy_PyUnicode_AsString(key); + + // Convert value to RVec and attach to dictionary + auto pyvec = PyROOT::AsRVec(NULL, value); + if (pyvec == NULL) { + PyErr_SetString(PyExc_RuntimeError, + ("Object not convertible: Dictionary entry " + keystr + " is not convertible with AsRVec.").c_str()); + return NULL; + } + PyDict_SetItem(pyvecs, key, pyvec); + Py_DECREF(pyvec); + + // Add pairs of column name and associated RVec to signature + std::string vectype = Cppyy::GetScopedFinalName(((CPyCppyy::CPPInstance*)pyvec)->ObjectIsA()); + std::stringstream vecaddress; + vecaddress << ((CPyCppyy::CPPInstance*)pyvec)->GetObject(); + code << "std::pair<std::string, " << vectype << "*>(\"" + keystr + << "\", reinterpret_cast<" << vectype+ "*>(" << vecaddress.str() << "))"; + if (counter != size - 1) { + code << ","; + } else { + code << ");"; + } + counter++; + } + + // Create RDataFrame and build Python proxy + const auto err = gInterpreter->Declare("#include \"ROOT/RNumpyDS.hxx\""); + if (!err) { + PyErr_SetString(PyExc_RuntimeError, "Failed to find \"ROOT/RNumpyDS.hxx\"."); + return NULL; + } + const auto codeStr = code.str(); + auto address = (void*) gInterpreter->Calc(codeStr.c_str()); + const auto pythonOwns = true; + auto pyobj = TPython::CPPInstance_FromVoidPtr(address, "ROOT::RDataFrame", pythonOwns); + + // Bind pyobject holding adopted memory to the RVec + if (PyObject_SetAttrString(pyobj, "__data__", pyvecs)) { + PyErr_SetString(PyExc_RuntimeError, "Object not convertible: Failed to set dictionary as attribute __data__."); + return NULL; + } + Py_DECREF(pyvecs); + + return pyobj; +} diff --git a/bindings/pyroot_experimental/PyROOT/test/CMakeLists.txt b/bindings/pyroot_experimental/PyROOT/test/CMakeLists.txt index 6d5f3c6459a..1dcde5356e5 100644 --- a/bindings/pyroot_experimental/PyROOT/test/CMakeLists.txt +++ b/bindings/pyroot_experimental/PyROOT/test/CMakeLists.txt @@ -76,6 +76,7 @@ ROOT_ADD_PYUNITTEST(pyroot_pyz_rvec_asrvec rvec_asrvec.py) # RDataFrame and subclasses pythonizations ROOT_ADD_PYUNITTEST(pyroot_pyz_rdataframe_asnumpy rdataframe_asnumpy.py) +ROOT_ADD_PYUNITTEST(pyroot_pyz_rdataframe_makenumpy rdataframe_makenumpy.py) # Passing Python callables to ROOT.TF ROOT_ADD_PYUNITTEST(pyroot_pyz_tf_pycallables tf_pycallables.py) diff --git a/bindings/pyroot_experimental/PyROOT/test/rdataframe_makenumpy.py b/bindings/pyroot_experimental/PyROOT/test/rdataframe_makenumpy.py new file mode 100644 index 00000000000..43173b8fa97 --- /dev/null +++ b/bindings/pyroot_experimental/PyROOT/test/rdataframe_makenumpy.py @@ -0,0 +1,138 @@ +import unittest +import ROOT +import numpy as np +import sys + + +class MakeNumpyDataFrame(unittest.TestCase): + """ + Tests for the MakeNumpyDataFrame feature enabling to read numpy arrays + with RDataFrame. + """ + + dtypes = [ + "int32", "int64", "uint32", "uint64", "float32", "float64" + ] + + def test_dtypes(self): + """ + Test reading different datatypes + """ + for dtype in self.dtypes: + data = {"x": np.array([1, 2, 3], dtype=dtype)} + df = ROOT.ROOT.RDF.MakeNumpyDataFrame(data) + self.assertEqual(df.Mean("x").GetValue(), 2) + + def test_multiple_columns(self): + """ + Test reading multiple columns + """ + data = {} + for dtype in self.dtypes: + data[dtype] = np.array([1, 2, 3], dtype=dtype) + df = ROOT.ROOT.RDF.MakeNumpyDataFrame(data) + colnames = df.GetColumnNames() + # Test column names + for dtype in colnames: + self.assertIn(dtype, self.dtypes) + # Test mean + for dtype in self.dtypes: + self.assertEqual(df.Mean(dtype).GetValue(), 2) + + def test_refcount(self): + """ + Check refcounts of associated PyObjects + """ + data = {"x": np.array([1, 2, 3], dtype="float32")} + self.assertEqual(sys.getrefcount(data), 2) + self.assertEqual(sys.getrefcount(data["x"]), 2) + + df = ROOT.ROOT.RDF.MakeNumpyDataFrame(data) + self.assertTrue(hasattr(df, "__data__")) + self.assertEqual(sys.getrefcount(df), 2) + + self.assertEqual(sys.getrefcount(data["x"]), 3) + + def test_transformations(self): + """ + Test the use of transformations + """ + data = {"x": np.array([1, 2, 3], dtype="float32")} + df = ROOT.ROOT.RDF.MakeNumpyDataFrame(data) + df2 = df.Filter("x>1").Define("y", "2*x") + self.assertEqual(df2.Mean("x").GetValue(), 2.5) + self.assertEqual(df2.Mean("y").GetValue(), 5) + + def test_delete_dict(self): + """ + Test behaviour with data dictionary going out of scope + """ + data = {"x": np.array([1, 2, 3], dtype="float32")} + df = ROOT.ROOT.RDF.MakeNumpyDataFrame(data) + del data + self.assertEqual(df.Mean("x").GetValue(), 2) + + def test_delete_numpy_array(self): + """ + Test behaviour with numpy array going out of scope + """ + x = np.array([1, 2, 3], dtype="float32") + data = {"x": x} + df = ROOT.ROOT.RDF.MakeNumpyDataFrame(data) + del x + self.assertEqual(df.Mean("x").GetValue(), 2) + + def test_inplace_dict(self): + """ + Test behaviour with inplace dictionary + """ + df = ROOT.ROOT.RDF.MakeNumpyDataFrame({"x": np.array([1, 2, 3], dtype="float32")}) + self.assertEqual(df.Mean("x").GetValue(), 2) + + def test_lifetime_numpy_array(self): + """ + Test lifetime of numpy array + """ + x = np.array([1, 2, 3], dtype="float32") + ref1 = sys.getrefcount(x) + + df = ROOT.ROOT.RDF.MakeNumpyDataFrame({"x": x}) + ref2 = sys.getrefcount(x) + self.assertEqual(ref2, ref1 + 1) + + del df + ref3 = sys.getrefcount(x) + self.assertEqual(ref1, ref3) + + def test_lifetime_datasource(self): + """ + Test lifetime of datasource + + Datasource survives until last node of the graph goes out of scope + """ + x = np.array([1, 2, 3], dtype="float32") + ref1 = sys.getrefcount(x) + + # Data source has dictionary with RVecs attached, which take a reference + # to the numpy array + df = ROOT.ROOT.RDF.MakeNumpyDataFrame({"x": x}) + m = df.Mean("x") + ref2 = sys.getrefcount(x) + self.assertEqual(ref1 + 1, ref2) + + # Deleting the root node does not change anything since the datasource + # owns the RVecs + del df + self.assertEqual(m.GetValue(), 2) + ref3 = sys.getrefcount(x) + self.assertEqual(ref1 + 1, ref3) + + # Deleting the last node releases the RVecs and releases the reference + # to the numpy array + del m + ref4 = sys.getrefcount(x) + self.assertEqual(ref1, ref4) + + +if __name__ == '__main__': + unittest.main() -- GitLab