From 61f60e3724e4c124d2285306c7bac5516ab070d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?= Date: Sun, 7 Apr 2024 21:27:53 +0200 Subject: [PATCH] guide done for now --- docs/source/general_guide.rst | 224 +++++++++++++++++---- docs/source/hepconvert.add_histograms.rst | 2 +- docs/source/hepconvert.copy_root.rst | 2 +- docs/source/hepconvert.merge_root.rst | 2 +- docs/source/hepconvert.parquet_to_root.rst | 2 +- docs/source/hepconvert.root_to_parquet.rst | 2 +- docs/source/index.rst | 15 +- src/hepconvert/copy_root.py | 7 +- src/hepconvert/histogram_adding.py | 8 +- src/hepconvert/merge.py | 6 +- src/hepconvert/parquet_to_root.py | 4 +- 11 files changed, 202 insertions(+), 72 deletions(-) diff --git a/docs/source/general_guide.rst b/docs/source/general_guide.rst index 1549ec1..d1a5fa8 100644 --- a/docs/source/general_guide.rst +++ b/docs/source/general_guide.rst @@ -1,72 +1,109 @@ General Guide and Examples: =========================== -Is something missing from this guide? Please let us know on the discussions page! +Is something missing from this guide? Please post your questions on the `discussions page `__! -Adding Histograms ------------------ -(similar to hadd) +Features of all (or most) functions: +---------------------------------------- -hepconvert's function ``hepconvert.add_histograms()`` adds the values of many histograms and writes the summed histograms to an output file. +**Automatic handling of Uproot duplicate counter issue:** +If you are using a hepconvert function that goes ROOT -> ROOT (both the input and output files are ROOT) +and working with data in jagged arrays, if branches have the same "fLeafCount", hepconvert +will group branches automatically so that Uproot will not create a `counter branch for each branch `__. 
-This function can be run in the command-line, see the `add cli guide <>`__ +**Quick Modifications of ROOT files and TTrees:** -**Parameters of note:** +Functions ``copy_root``, ``merge_root``, and ``root_to_parquet`` have a few options for applying quick +modifications to ROOT files and TTree data. +**Branch slimming:** + Parameters ``keep_branches`` or ``drop_branches`` (list or dict) control branch slimming. + Examples: -Memory: -This function will + .. code:: python + >>> hepconvert.root_to_parquet("out_file.root", "in_file.root", keep_branches="x*", progress_bar=True, force=True) -Merging TTrees --------------- -(also similar to hadd) + # Before: -**Parameters of note:** + # name | typename | interpretation + # ---------------------+--------------------------+------------------------------- + # x1 | int64_t | AsDtype('>i8') + # x2 | int64_t | AsDtype('>i8') + # y1 | int64_t | AsDtype('>i8') + # y2 | int64_t | AsDtype('>i8') -Parquet to ROOT ---------------- + # After: -Writes the data from a single Parquet file to one TTree in a ROOT file. -This function creates a new TTree (which can be named) + # name | typename | interpretation + # ---------------------+--------------------------+------------------------------- + # x1 | int64_t | AsDtype('>i8') + # x2 | int64_t | AsDtype('>i8') -**Parameters of note:** + .. code:: python -``name`` str, will be the name of the new TTree. Defaults to "tree" -``progress_bar`` bool or tdqm object. If True, a basic progress bar will appear. Defaults to ``False``. + >>> hepconvert.root_to_parquet("out_file.root", "in_file.root", keep_branches={"tree1": ["branch2", "branch3"], "tree2": ["branch2"]}, progress_bar=True, force=True) -ROOT to Parquet ---------------- + # Before: -Writes the data from a single flat TTree to a Parquet file. 
+ # Tree1: + # name | typename | interpretation + # ---------------------+--------------------------+------------------------------- + # branch1 | int64_t | AsDtype('>i8') + # branch2 | int64_t | AsDtype('>i8') + # branch3 | int64_t | AsDtype('>i8') -**Parameters of note:** + # Tree2: + # name | typename | interpretation + # ---------------------+--------------------------+------------------------------- + # branch1 | int64_t | AsDtype('>i8') + # branch2 | int64_t | AsDtype('>i8') + # branch3 | int64_t | AsDtype('>i8') -``tree`` str, If there are multiple TTrees in the ROOT file being read, pass the name of one TTree to write. + # After: -Branch slimming: - ``keep_branches`` or ``drop_branches`` (list or dict): + # Tree1: + # name | typename | interpretation + # ---------------------+--------------------------+------------------------------- + # branch2 | int64_t | AsDtype('>i8') + # branch3 | int64_t | AsDtype('>i8') - .. code:: python + # Tree2: + # name | typename | interpretation + # ---------------------+--------------------------+------------------------------- + # branch2 | int64_t | AsDtype('>i8') + + +**Branch skimming:** + Parameters ``cut`` and ``expressions`` control branch skimming. Both of these parameters go to Uproot's `iterate + `__ + function. See Uproot's documentation for more details. - hepconvert.root_to_parquet("out_file.root", "in_file.root", keep_branches=[], progress_bar=True, force=True) + Basic example: .. code:: python - hepconvert.root_to_parquet("out_file.root", "in_file.root", keep_branches={"tree1": ["branch2", "branch3"], "tree2": ["branch2"]}, progress_bar=True, force=True) + hepconvert.copy_root("skimmed_HZZ.root", "HZZ.root", keep_branches="Jet_", + force=True, expressions="Jet_Px", cut="Jet_Px >= 10",) -Branch skimming: - ``cut`` - and - ``expressions`` +**Remove TTrees:** + Use parameters ``keep_trees`` or ``drop_trees`` to remove TTrees. -Remove TTrees: - ``keep_ttrees`` or ``drop_ttrees`` + .. 
code:: python + + # Creating example data: + with uproot.recreate("two_trees.root") as file: + file["tree"] = {"x": np.array([1, 2, 3])} + file["tree1"] = {"x": np.array([1, 2, 3])} + + hepconvert.copy_root("one_tree.root", "two_trees.root", keep_trees="tree", + force=True, expressions="Jet_Px", cut="Jet_Px >= 10",) -How hepconvert works with ROOT ------------------------------- + +**How hepconvert works with ROOT** hepconvert uses Uproot for reading and writing ROOT files; it also has the same limitations. +It currently only works with flat TTrees (nanoAOD-like data), and cannot yet read or write RNTuples. As described in Uproot's documentation: @@ -81,9 +118,112 @@ As described in Uproot's documentation: * histograms that satisfy the `Universal Histogram Interface `__ (UHI) with 3 dimensions or fewer; this includes `boost-histogram `__ and `hist `__ * PyROOT objects -hepconvert currently works with flat TTrees (nanoAOD-like data). +**Memory Management** + +Most hepconvert functions have automatic and customizable memory management for working with large files. -Progress Bars -------------- -hepconvert uses the package tqdm for progress bars. +Functions reading **ROOT** files will read in batches controlled by the parameter ``step_size``. +Set ``step_size`` to either an `int` to set the batch size to a number of entries, or a `string` in the +form "100 MB". + + +**Progress Bars** +hepconvert uses the package tqdm for progress bars; if you do not have the package installed, an error message will provide installation instructions. They are controlled with the ``progress_bar`` argument. +For example, to use a default progress bar with copy_root, set progress_bar to True: + +.. code:: python + + hepconvert.copy_root("out_file.root", "in_file.root", progress_bar=True) + + +Some functions can handle a customized tqdm progress bar. +To use a customized tqdm progress bar, make a progress bar object and pass it to the hepconvert function like so: + +.. 
code:: python + + >>> import tqdm + + >>> bar_obj = tqdm.tqdm(colour="GREEN", desc="Description") + >>> hepconvert.add_histograms("out_file.root", "path/in_files/", progress_bar=bar_obj) + +.. image:: https://raw.githubusercontent.com/scikit-hep/hepconvert/main/docs/docs-img/progress_bar.png + :width: 450px + :alt: hepconvert + :target: https://github.com/scikit-hep/hepconvert + + +Some types of tqdm progress bar objects may not work in this way. + + +**Command Line Interface** + +All functions are able to be run in the command line. See the "Command Line Interface Instructions" tab on the left to see CLI +instructions on individual functions. + +Adding Histograms +----------------- +``hepconvert.add_histograms`` adds the values of many histograms +and writes the summed histograms to an output file (like ROOT's hadd, but limited +to histograms). + + +**Parameters of note:** + +``union`` If True, adds the histograms that have the same name and appends all others +to the new file. + +``append`` If True, appends histograms to an existing file. Force and append +cannot both be True. + +``same_names`` If True, only adds together histograms which have the same name (key). If False, +histograms are added together based on TTree structure (bins must be equal). + +Memory: +``add_histograms`` has no memory customization available currently. To maintain +performance it stores the summed histograms in memory until all files have +been read, then the summed histograms are written to the output file. Only +one input ROOT file is read and kept in memory at a time. + + +Merging TTrees +-------------- +``hepconvert.merge_root`` merges TTrees in multiple ROOT files together. The end result is a single file containing data from all input files (again like ROOT's hadd, but can handle flat TTrees and histograms). + +.. warning:: + At the moment, hepconvert.merge can only merge TTrees that have the same + number of branches, with the same names and datatypes. 
+ We are working on adding backfill capabilities for mismatched TTrees. + +**Features:** +merge_root has parameters ``cut``, ``expressions``, ``drop_branches``, ``keep_branches``, ``drop_trees`` and ``keep_trees``. + + +Copying TTrees +-------------- +``hepconvert.copy_root`` copies the TTrees of one ROOT file into a new ROOT file. + +.. warning:: + At the moment, hepconvert.merge can only merge TTrees that have the same + number of branches, with the same names and datatypes. + We are working on adding backfill capabilities for mismatched TTrees. + +**Features:** +copy_root has parameters ``cut``, ``expressions``, ``drop_branches``, ``keep_branches``, ``drop_trees`` and ``keep_trees``. + + +Parquet to ROOT +--------------- + +Writes the data from a single Parquet file to one TTree in a ROOT file. +This function creates a new TTree (name the new tree with parameter ``name``, which defaults to "tree"). + + +ROOT to Parquet +--------------- + +Writes the data from one TTree in a ROOT file to a single Parquet file. +If there are multiple TTrees in the file, specify one TTree to write to the Parquet file using the ``tree`` parameter. + +**Features:** +root_to_parquet has parameters ``cut``, ``expressions``, ``drop_branches``, ``keep_branches``. diff --git a/docs/source/hepconvert.add_histograms.rst b/docs/source/hepconvert.add_histograms.rst index 14814b3..0a8af51 100644 --- a/docs/source/hepconvert.add_histograms.rst +++ b/docs/source/hepconvert.add_histograms.rst @@ -1,6 +1,6 @@ hepconvert.add_histograms ========================= -Defined in `hepconvert.histogram_adding `__ on `line 345 `__. +Defined in `hepconvert.histogram_adding `__ on `line 345 `__. .. 
autofunction:: hepconvert.add_histograms diff --git a/docs/source/hepconvert.copy_root.rst b/docs/source/hepconvert.copy_root.rst index 52a6622..ff9d871 100644 --- a/docs/source/hepconvert.copy_root.rst +++ b/docs/source/hepconvert.copy_root.rst @@ -1,6 +1,6 @@ hepconvert.copy_root ==================== -Defined in `hepconvert.copy_root `__ on `line 15 `__. +Defined in `hepconvert.copy_root `__ on `line 15 `__. .. autofunction:: hepconvert.copy_root diff --git a/docs/source/hepconvert.merge_root.rst b/docs/source/hepconvert.merge_root.rst index a12914d..74b82c7 100644 --- a/docs/source/hepconvert.merge_root.rst +++ b/docs/source/hepconvert.merge_root.rst @@ -1,6 +1,6 @@ hepconvert.merge_root ===================== -Defined in `hepconvert.merge `__ on `line 17 `__. +Defined in `hepconvert.merge `__ on `line 17 `__. .. autofunction:: hepconvert.merge_root diff --git a/docs/source/hepconvert.parquet_to_root.rst b/docs/source/hepconvert.parquet_to_root.rst index 486d83d..338b8bb 100644 --- a/docs/source/hepconvert.parquet_to_root.rst +++ b/docs/source/hepconvert.parquet_to_root.rst @@ -1,6 +1,6 @@ hepconvert.parquet_to_root ========================== -Defined in `hepconvert.parquet_to_root `__ on `line 11 `__. +Defined in `hepconvert.parquet_to_root `__ on `line 11 `__. .. autofunction:: hepconvert.parquet_to_root diff --git a/docs/source/hepconvert.root_to_parquet.rst b/docs/source/hepconvert.root_to_parquet.rst index cd6a296..eeaee89 100644 --- a/docs/source/hepconvert.root_to_parquet.rst +++ b/docs/source/hepconvert.root_to_parquet.rst @@ -1,6 +1,6 @@ hepconvert.root_to_parquet ========================== -Defined in `hepconvert.root_to_parquet `__ on `line 9 `__. +Defined in `hepconvert.root_to_parquet `__ on `line 9 `__. .. autofunction:: hepconvert.root_to_parquet diff --git a/docs/source/index.rst b/docs/source/index.rst index cccb4e7..513ad1e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -10,21 +10,12 @@ :caption: Modules :hidden: -.. 
|br| raw:: html - -
- -.. image:: https://github.com/scikit-hep/hepconvert/blob/main/docs/docs-img/hepconvert_logo.svg - ::width: 300px +.. image:: https://raw.githubusercontent.com/scikit-hep/hepconvert/main/docs/docs-img/hepconvert_logo.svg + :width: 450px :alt: hepconvert :target: https://github.com/scikit-hep/hepconvert -.. role:: raw-html(raw) - :format: html - -|br| - -:raw-html:`

` +| Welcome to hepconvert's documentation! ====================================== diff --git a/src/hepconvert/copy_root.py b/src/hepconvert/copy_root.py index 252cd63..e59a7e3 100644 --- a/src/hepconvert/copy_root.py +++ b/src/hepconvert/copy_root.py @@ -216,11 +216,10 @@ def copy_root( ) raise ValueError(msg) - if len(trees) > 1 and progress_bar: + if len(trees) > 1 and progress_bar is not False: + number_of_items = len(trees) if progress_bar is True: tqdm = _utils.check_tqdm() - number_of_items = len(trees) - progress_bar = tqdm.tqdm(desc="Trees copied") progress_bar.reset(total=number_of_items) for t in trees: @@ -281,6 +280,6 @@ def copy_root( out_file[tree.name].extend(chunk) except AssertionError: msg = "Are the branch-names correct?" - if len(trees) > 1 and progress_bar: + if len(trees) > 1 and progress_bar is not False: progress_bar.update(n=1) f.close() diff --git a/src/hepconvert/histogram_adding.py b/src/hepconvert/histogram_adding.py index 10fde4e..0d57450 100644 --- a/src/hepconvert/histogram_adding.py +++ b/src/hepconvert/histogram_adding.py @@ -451,12 +451,12 @@ def add_histograms( with uproot.open(files[0]) as file: keys = file.keys(filter_classname="TH[1|2|3][I|S|F|D|C]", cycle=False) - if progress_bar: + if progress_bar is not False: + tqdm = _utils.check_tqdm() file_bar = progress_bar - hist_bar = progress_bar + hist_bar = tqdm.tqdm(desc="Histograms added") + number_of_items = len(files) if progress_bar is True: - tqdm = _utils.check_tqdm() - number_of_items = len(files) file_bar = tqdm.tqdm(desc="Files added") hist_bar = tqdm.tqdm(desc="Histograms added") diff --git a/src/hepconvert/merge.py b/src/hepconvert/merge.py index 63bf0b7..66a0c70 100644 --- a/src/hepconvert/merge.py +++ b/src/hepconvert/merge.py @@ -245,7 +245,7 @@ def merge_root( destination, ) raise ValueError(msg) - if progress_bar: + if progress_bar is not False: if progress_bar is True: tqdm = _utils.check_tqdm() number_of_items = len(files) @@ -308,7 +308,7 @@ def merge_root( 
out_file[tree.name].extend(chunk) except AssertionError: msg = "TTrees must have the same structure to be merged. Are the branch_names correct?" - if progress_bar: + if progress_bar is not False: progress_bar.update(n=1) f.close() @@ -383,6 +383,6 @@ def merge_root( for key in hist_keys: out_file[key] = writable_hists[key] - if progress_bar: + if progress_bar is not False: progress_bar.update(n=1) f.close() diff --git a/src/hepconvert/parquet_to_root.py b/src/hepconvert/parquet_to_root.py index 61492ce..fc1a135 100644 --- a/src/hepconvert/parquet_to_root.py +++ b/src/hepconvert/parquet_to_root.py @@ -107,9 +107,9 @@ def parquet_to_root( ), ) metadata = ak.metadata_from_parquet(file) - if progress_bar: + if progress_bar is not False: + number_of_items = metadata["num_row_groups"] if progress_bar is True: - number_of_items = metadata["num_row_groups"] tqdm = _utils.check_tqdm() progress_bar = tqdm.tqdm(desc="Row-groups written")