diff --git a/README.md b/README.md index 15ebef7..20854b0 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# hepconvert + [![Actions Status][actions-badge]][actions-link] [![Documentation Status][rtd-badge]][rtd-link] @@ -24,7 +24,7 @@ [rtd-badge]: https://readthedocs.org/projects/hepconvert/badge/?version=latest [rtd-link]: https://hepconvert.readthedocs.io/en/latest/ -The hepconvert library is a bridge between columnar file formats, currently **ROOT, and Parquet** and soon eventually include **Feather, and HDF5.** It aims to simplify file conversions in Python, replacing what is usually a multi-step process with one line of code, with builtin features for managing large datasets and choosing compression levels. +The hepconvert library is a bridge between columnar file formats, currently **ROOT, and Parquet** and soon will include **Feather, and HDF5.** It aims to simplify file conversions in Python, replacing what is usually a multi-step process with one line of code, with builtin features for managing large datasets and choosing compression levels. # Installation diff --git a/docs/source/add.rst b/docs/source/add.rst new file mode 100644 index 0000000..b9a1dfb --- /dev/null +++ b/docs/source/add.rst @@ -0,0 +1,43 @@ +CLI Guide for add_histograms (add) +================================== + +Instructions for function `add_histograms `__. + +Command: +-------- + +.. code-block:: bash + + hepconvert add [options] [OUT_FILE] [IN_FILES] + + +Examples: +--------- + +.. code-block:: bash + + hepconvert add -f --progress-bar --union summed_hists.root hist1.root hist2.root hist3.root + +Or, if files are in a directory: + +.. code-block:: bash + + hepconvert add --append --same-names summed_hists.root path/directory/ + + +Options: +-------- + +``--force``, ``-f`` Use flag to overwrite a file if it already exists. + +``--progress-bar`` Will show a basic progress bar to show how many histograms have been summed, and how many files have been read. 
+ +``--append``, ``-a`` Will append histograms to an existing file. + +``--compression``, ``-c`` Compression type. Options are "lzma", "zlib", "lz4", and "zstd". Default is "zlib". + +``--compression-level`` Level of compression set by an integer. Default is 1. + +``--union`` Use flag to add together histograms that have the same name and append all others to the new file. + +``--same-names`` Use flag to only add histograms together if they have the same name. diff --git a/docs/source/cli.toctree b/docs/source/cli.toctree new file mode 100644 index 0000000..ed5fe37 --- /dev/null +++ b/docs/source/cli.toctree @@ -0,0 +1,9 @@ +.. toctree:: + :caption: Command Line Interface Instructions + :hidden: + + parquet-to-root + root-to-parquet + copy-root + merge-root + add (add_histograms) diff --git a/docs/source/conf.py b/docs/source/conf.py index 688e5ef..fe52563 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -36,4 +36,4 @@ # Additional stuff master_doc = "index" -# exec(open("prepare_docstrings.py").read(), dict(globals())) +exec(open("prepare_docstrings.py").read(), dict(globals())) diff --git a/docs/source/copy_root.rst b/docs/source/copy_root.rst new file mode 100644 index 0000000..30d5c5d --- /dev/null +++ b/docs/source/copy_root.rst @@ -0,0 +1,57 @@ +Command Line Interface Guide: copy_root +======================================= + +Instructions for function `hepconvert.copy_root `__ + +Command: +-------- + +.. code-block:: bash + + hepconvert copy-root [options] [OUT_FILE] [IN_FILE] + + +Examples: +--------- + +.. code-block:: bash + + hepconvert copy-root -f --progress-bar --keep-branches 'Jet_*' out_file.root in_file.root + + +Branch skimming using ``cut``: + +.. code-block:: bash + + hepconvert copy-root -f --keep-branches 'Jet_*' --cut 'Jet_Px > 5' out_file.root in_file.root + +Options: +-------- + +``--drop-branches``, ``-db`` and ``--keep-branches``, ``-kb`` list, str or dict. Specify branch names to remove from the ROOT file. 
Either a str, list of str (for multiple branches), or a dict with form {'tree': 'branches'} to remove branches from certain ttrees. Wildcarding accepted. + +``--drop-trees``, ``-dt`` and ``--keep-trees``, ``-kt`` list of str, or str. Specify tree names to remove/keep TTrees in the ROOT files. Wildcarding accepted. + +``--cut`` For branch skimming, passed to `uproot.iterate `__. str, if not None, this expression filters all of the expressions. + +``--expressions`` For branch skimming, passed to `uproot.iterate `__. Names of TBranches or aliases to convert to arrays or mathematical expressions of them. If None, all TBranches selected by the filters are included. + +``--force``, ``-f`` Use flag to overwrite a file if it already exists. + +``--progress-bar`` Will show a basic progress bar to show how many TTrees have been copied and written. + +``--append``, ``-a`` Will append new TTree to an existing file. + +``--compression``, ``-c`` Compression type. Options are "lzma", "zlib", "lz4", and "zstd". Default is "zlib". + +``--compression-level`` Level of compression set by an integer. Default is 1. + +``--name`` Give a name to the new TTree. Default is "tree". + +``--title`` Give a title to the new TTree. + +``--initial-basket-capacity`` (int) Number of TBaskets that can be written to the TTree without rewriting the TTree metadata to make room. Default is 10. + +``--resize-factor`` (float) When the TTree metadata needs to be rewritten, this specifies how many more TBasket slots to allocate as a multiplicative factor. Default is 10.0. + +``--step-size`` Size of batches of data to read and write. If an integer, the maximum number of entries to include in each iteration step; if a string, the maximum memory size to include. The string must be a number followed by a memory unit, such as “100 MB”. 
Default is "100 MB" diff --git a/docs/source/general_guide.rst b/docs/source/general_guide.rst new file mode 100644 index 0000000..d1a5fa8 --- /dev/null +++ b/docs/source/general_guide.rst @@ -0,0 +1,229 @@ +General Guide and Examples: +=========================== +Is something missing from this guide? Please post your questions on the `discussions page `__! + +Features of all (or most) functions: +---------------------------------------- + +**Automatic handling of Uproot duplicate counter issue:** +If you are using a hepconvert function that goes ROOT -> ROOT (both the input and output files are ROOT) +and working with data in jagged arrays, if branches have the same "fLeafCount", hepconvert +will group branches automatically so that Uproot will not create a `counter branch for each branch `__. + +**Quick Modifications of ROOT files and TTrees:** + +Functions ``copy_root``, ``merge_root``, and ``root_to_parquet`` have a few options for applying quick +modifications to ROOT files and TTree data. + +**Branch slimming:** + Parameters ``keep_branches`` or ``drop_branches`` (list or dict) control branch slimming. + Examples: + + .. code:: python + + >>> hepconvert.root_to_parquet("out_file.root", "in_file.root", keep_branches="x*", progress_bar=True, force=True) + + # Before: + + # name | typename | interpretation + # ---------------------+--------------------------+------------------------------- + # x1 | int64_t | AsDtype('>i8') + # x2 | int64_t | AsDtype('>i8') + # y1 | int64_t | AsDtype('>i8') + # y2 | int64_t | AsDtype('>i8') + + # After: + + # name | typename | interpretation + # ---------------------+--------------------------+------------------------------- + # x1 | int64_t | AsDtype('>i8') + # x2 | int64_t | AsDtype('>i8') + + .. 
code:: python + + >>> hepconvert.root_to_parquet("out_file.root", "in_file.root", keep_branches={"tree1": ["branch2", "branch3"], "tree2": ["branch2"]}, progress_bar=True, force=True) + + # Before: + + # Tree1: + # name | typename | interpretation + # ---------------------+--------------------------+------------------------------- + # branch1 | int64_t | AsDtype('>i8') + # branch2 | int64_t | AsDtype('>i8') + # branch3 | int64_t | AsDtype('>i8') + + # Tree2: + # name | typename | interpretation + # ---------------------+--------------------------+------------------------------- + # branch1 | int64_t | AsDtype('>i8') + # branch2 | int64_t | AsDtype('>i8') + # branch3 | int64_t | AsDtype('>i8') + + # After: + + # Tree1: + # name | typename | interpretation + # ---------------------+--------------------------+------------------------------- + # branch2 | int64_t | AsDtype('>i8') + # branch3 | int64_t | AsDtype('>i8') + + # Tree2: + # name | typename | interpretation + # ---------------------+--------------------------+------------------------------- + # branch2 | int64_t | AsDtype('>i8') + + +**Branch skimming:** + Parameters ``cut`` and ``expressions`` control branch skimming. Both of these parameters go to Uproot's `iterate + `__ + function. See Uproot's documentation for more details. + + Basic example: + + .. code:: python + + hepconvert.copy_root("skimmed_HZZ.root", "HZZ.root", keep_branches="Jet_*", + force=True, expressions="Jet_Px", cut="Jet_Px >= 10",) + + +**Remove TTrees:** + Use parameters ``keep_trees`` or ``drop_trees`` to remove TTrees. + + .. 
code:: python + + # Creating example data: + with uproot.recreate("two_trees.root") as file: + file["tree"] = {"x": np.array([1, 2, 3])} + file["tree1"] = {"x": np.array([1, 2, 3])} + + hepconvert.copy_root("one_tree.root", "two_trees.root", keep_trees="tree", + force=True, expressions="Jet_Px", cut="Jet_Px >= 10",) + + +**How hepconvert works with ROOT** + +hepconvert uses Uproot for reading and writing ROOT files, so it has the same limitations. +It currently only works with flat TTrees (nanoAOD-like data), and cannot yet read or write RNTuples. + +As described in Uproot's documentation: + +.. note:: + + A small but growing list of data types can be written to files: + + * strings: TObjString + * histograms: TH1*, TH2*, TH3* + * profile plots: TProfile, TProfile2D, TProfile3D + * NumPy histograms created with `np.histogram `__, `np.histogram2d `__, and `np.histogramdd `__ with 3 dimensions or fewer + * histograms that satisfy the `Universal Histogram Interface `__ (UHI) with 3 dimensions or fewer; this includes `boost-histogram `__ and `hist `__ + * PyROOT objects + +**Memory Management** + +Each hepconvert function has automatic and customizable memory management for working with large files. + +Functions reading **ROOT** files will read in batches controlled by the parameter ``step_size``. +Set ``step_size`` to either an `int` to set the batch size to a number of entries, or a `string` in the +form of "100 MB". + + +**Progress Bars** +hepconvert uses the package tqdm for progress bars; if you do not have the package installed, an error message will provide installation instructions. +They are controlled with the ``progress_bar`` argument. +For example, to use a default progress bar with copy_root, set progress_bar to True: + +.. code:: python + + hepconvert.copy_root("out_file.root", "in_file.root", progress_bar=True) + + +Some functions can handle a customized tqdm progress bar. 
+To use a customized tqdm progress bar, make a progress bar object and pass it to the hepconvert function like so, + +.. code:: python + + >>> import tqdm + + >>> bar_obj = tqdm.tqdm(colour="GREEN", desc="Description") + >>> hepconvert.add_histograms("out_file.root", "path/in_files/", progress_bar=bar_obj) + +.. image:: https://raw.githubusercontent.com/scikit-hep/hepconvert/main/docs/docs-img/progress_bar.png + :width: 450px + :alt: hepconvert + :target: https://github.com/scikit-hep/hepconvert + + +Some types of tqdm progress bar objects may not work in this way. + + +**Command Line Interface** + +All functions are able to be run in the command line. See the "Command Line Interface Instructions" tab on the left to see CLI +instructions on individual functions. + +Adding Histograms +----------------- +``hepconvert.add_histograms`` adds the values of many histograms +and writes the summed histograms to an output file (like ROOT's hadd, but limited +to histograms). + + +**Parameters of note:** + +``union`` If True, adds the histograms that have the same name and appends all others +to the new file. + +``append`` If True, appends histograms to an existing file. Force and append +cannot both be True. + +``same_names`` If True, only adds together histograms which have the same name (key). If False, +histograms are added together based on TTree structure (bins must be equal). + +Memory: +``add_histograms`` has no memory customization available currently. To maintain +performance it stores the summed histograms in memory until all files have +been read, then the summed histograms are written to the output file. Only +one input ROOT file is read and kept in memory at a time. + + +Merging TTrees +-------------- +``hepconvert.merge_root`` merges TTrees in multiple ROOT files together. The end result is a single file containing data from all input files (again like ROOT's hadd, but can handle flat TTrees and histograms). + +.. 
warning:: + At the moment, hepconvert.merge_root can only merge TTrees that have the same + number of branches, with the same names and datatypes. + We are working on adding backfill capabilities for mismatched TTrees. + +**Features:** +merge_root has parameters ``cut``, ``expressions``, ``drop_branches``, ``keep_branches``, ``drop_trees`` and ``keep_trees``. + + +Copying TTrees +-------------- +``hepconvert.copy_root`` copies TTrees from one ROOT file into a new ROOT file. + +.. warning:: + At the moment, hepconvert.merge_root can only merge TTrees that have the same + number of branches, with the same names and datatypes. + We are working on adding backfill capabilities for mismatched TTrees. + +**Features:** +copy_root has parameters ``cut``, ``expressions``, ``drop_branches``, ``keep_branches``, ``drop_trees`` and ``keep_trees``. + + +Parquet to ROOT +--------------- + +Writes the data from a single Parquet file to one TTree in a ROOT file. +This function creates a new TTree (name the new tree with parameter ``name``). + + +ROOT to Parquet +--------------- + +Writes the data from one TTree in a ROOT file to a single Parquet file. +If there are multiple TTrees in the file, specify one TTree to write to the Parquet file using the ``tree`` parameter. + +**Features:** +root_to_parquet has parameters ``cut``, ``expressions``, ``drop_branches``, ``keep_branches``. diff --git a/docs/source/guide.toctree b/docs/source/guide.toctree new file mode 100644 index 0000000..ec2445f --- /dev/null +++ b/docs/source/guide.toctree @@ -0,0 +1,5 @@ +.. toctree:: + :caption: Guide with Examples + :hidden: + + general_guide diff --git a/docs/source/hepconvert.add_histograms.rst b/docs/source/hepconvert.add_histograms.rst new file mode 100644 index 0000000..0a8af51 --- /dev/null +++ b/docs/source/hepconvert.add_histograms.rst @@ -0,0 +1,6 @@ +hepconvert.add_histograms +========================= + +Defined in `hepconvert.histogram_adding `__ on `line 345 `__. + +.. 
autofunction:: hepconvert.add_histograms diff --git a/docs/source/hepconvert.copy_root.copy_root.rst b/docs/source/hepconvert.copy_root.copy_root.rst deleted file mode 100644 index f5e6d40..0000000 --- a/docs/source/hepconvert.copy_root.copy_root.rst +++ /dev/null @@ -1,6 +0,0 @@ -hepconvert.copy_root -==================== - -Defined in `hepconvert.copy_root `__ on `line 13 `__. - -.. autofunction:: hepconvert.copy_root.copy_root diff --git a/docs/source/hepconvert.copy_root.rst b/docs/source/hepconvert.copy_root.rst new file mode 100644 index 0000000..ff9d871 --- /dev/null +++ b/docs/source/hepconvert.copy_root.rst @@ -0,0 +1,6 @@ +hepconvert.copy_root +==================== + +Defined in `hepconvert.copy_root `__ on `line 15 `__. + +.. autofunction:: hepconvert.copy_root diff --git a/docs/source/hepconvert.copy_root.toctree b/docs/source/hepconvert.copy_root.toctree index 268bc9a..a6c3308 100644 --- a/docs/source/hepconvert.copy_root.toctree +++ b/docs/source/hepconvert.copy_root.toctree @@ -2,5 +2,4 @@ :caption: copy_root :hidden: - hepconvert.copy_root (module) - hepconvert.copy_root.copy_root + hepconvert.copy_root diff --git a/docs/source/hepconvert.histogram_adding.add_histograms.rst b/docs/source/hepconvert.histogram_adding.add_histograms.rst deleted file mode 100644 index 38e33a9..0000000 --- a/docs/source/hepconvert.histogram_adding.add_histograms.rst +++ /dev/null @@ -1,6 +0,0 @@ -hepconvert.add_histograms -========================= - -Defined in `hepconvert.histogram_adding `__ on `line 374 `__. - -.. 
autofunction:: hepconvert.histogram_adding.add_histograms diff --git a/docs/source/hepconvert.histogram_adding.toctree b/docs/source/hepconvert.histogram_adding.toctree index 02bc725..c8f63e5 100644 --- a/docs/source/hepconvert.histogram_adding.toctree +++ b/docs/source/hepconvert.histogram_adding.toctree @@ -2,5 +2,4 @@ :caption: histogram_adding :hidden: - hepconvert.histogram_adding (module) - hepconvert.histogram_adding.add_histograms + hepconvert.add_histograms diff --git a/docs/source/hepconvert.merge.merge_root.rst b/docs/source/hepconvert.merge.merge_root.rst deleted file mode 100644 index 314985e..0000000 --- a/docs/source/hepconvert.merge.merge_root.rst +++ /dev/null @@ -1,6 +0,0 @@ -hepconvert.merge_root -===================== - -Defined in `hepconvert.merge `__ on `line 11 `__. - -.. autofunction:: hepconvert.merge.merge_root diff --git a/docs/source/hepconvert.merge.toctree b/docs/source/hepconvert.merge.toctree index 6406e9a..272c6d7 100644 --- a/docs/source/hepconvert.merge.toctree +++ b/docs/source/hepconvert.merge.toctree @@ -2,5 +2,4 @@ :caption: merge :hidden: - hepconvert.merge (module) - hepconvert.merge.merge_root + hepconvert.merge_root diff --git a/docs/source/hepconvert.merge_root.rst b/docs/source/hepconvert.merge_root.rst new file mode 100644 index 0000000..74b82c7 --- /dev/null +++ b/docs/source/hepconvert.merge_root.rst @@ -0,0 +1,6 @@ +hepconvert.merge_root +===================== + +Defined in `hepconvert.merge `__ on `line 17 `__. + +.. autofunction:: hepconvert.merge_root diff --git a/docs/source/hepconvert.parquet_to_root.parquet_to_root.rst b/docs/source/hepconvert.parquet_to_root.parquet_to_root.rst deleted file mode 100644 index 58a1e31..0000000 --- a/docs/source/hepconvert.parquet_to_root.parquet_to_root.rst +++ /dev/null @@ -1,6 +0,0 @@ -hepconvert.parquet_to_root -========================== - -Defined in `hepconvert.parquet_to_root `__ on `line 9 `__. - -.. 
autofunction:: hepconvert.parquet_to_root.parquet_to_root diff --git a/docs/source/hepconvert.parquet_to_root.rst b/docs/source/hepconvert.parquet_to_root.rst new file mode 100644 index 0000000..338b8bb --- /dev/null +++ b/docs/source/hepconvert.parquet_to_root.rst @@ -0,0 +1,6 @@ +hepconvert.parquet_to_root +========================== + +Defined in `hepconvert.parquet_to_root `__ on `line 11 `__. + +.. autofunction:: hepconvert.parquet_to_root diff --git a/docs/source/hepconvert.parquet_to_root.toctree b/docs/source/hepconvert.parquet_to_root.toctree index e26a68d..5889b70 100644 --- a/docs/source/hepconvert.parquet_to_root.toctree +++ b/docs/source/hepconvert.parquet_to_root.toctree @@ -2,5 +2,4 @@ :caption: parquet_to_root :hidden: - hepconvert.parquet_to_root (module) - hepconvert.parquet_to_root.parquet_to_root + hepconvert.parquet_to_root diff --git a/docs/source/hepconvert.root_to_parquet.root_to_parquet.rst b/docs/source/hepconvert.root_to_parquet.root_to_parquet.rst deleted file mode 100644 index 24d7b04..0000000 --- a/docs/source/hepconvert.root_to_parquet.root_to_parquet.rst +++ /dev/null @@ -1,6 +0,0 @@ -hepconvert.root_to_parquet -========================== - -Defined in `hepconvert.root_to_parquet `__ on `line 9 `__. - -.. autofunction:: hepconvert.root_to_parquet.root_to_parquet diff --git a/docs/source/hepconvert.root_to_parquet.rst b/docs/source/hepconvert.root_to_parquet.rst new file mode 100644 index 0000000..eeaee89 --- /dev/null +++ b/docs/source/hepconvert.root_to_parquet.rst @@ -0,0 +1,6 @@ +hepconvert.root_to_parquet +========================== + +Defined in `hepconvert.root_to_parquet `__ on `line 9 `__. + +.. 
autofunction:: hepconvert.root_to_parquet diff --git a/docs/source/hepconvert.root_to_parquet.toctree b/docs/source/hepconvert.root_to_parquet.toctree index ea852d6..b0a7fab 100644 --- a/docs/source/hepconvert.root_to_parquet.toctree +++ b/docs/source/hepconvert.root_to_parquet.toctree @@ -2,5 +2,4 @@ :caption: root_to_parquet :hidden: - hepconvert.root_to_parquet (module) - hepconvert.root_to_parquet.root_to_parquet + hepconvert.root_to_parquet diff --git a/docs/source/hepconvert.toctree b/docs/source/hepconvert.toctree index f04ab24..43dfbae 100644 --- a/docs/source/hepconvert.toctree +++ b/docs/source/hepconvert.toctree @@ -1,6 +1,3 @@ .. toctree:: :caption: Detailed Reference :hidden: - - hepconvert (module) - hepconvert.merge.merge_root diff --git a/docs/source/index.rst b/docs/source/index.rst index 41b4c42..513ad1e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -4,11 +4,18 @@ contain the root `toctree` directive. .. include:: main.toctree - +.. include:: guide.toctree +.. include:: cli.toctree .. toctree:: :caption: Modules :hidden: +.. image:: https://raw.githubusercontent.com/scikit-hep/hepconvert/main/docs/docs-img/hepconvert_logo.svg + :width: 450px + :alt: hepconvert + :target: https://github.com/scikit-hep/hepconvert + +| Welcome to hepconvert's documentation! 
====================================== diff --git a/docs/source/main.toctree b/docs/source/main.toctree index 973542b..3dde1c6 100644 --- a/docs/source/main.toctree +++ b/docs/source/main.toctree @@ -2,8 +2,8 @@ :caption: Main Interface :hidden: - hepconvert.parquet_to_root.parquet_to_root - hepconvert.root_to_parquet.root_to_parquet - hepconvert.copy_root.copy_root - hepconvert.merge.merge_root - hepconvert.histogram_adding.add_histograms + hepconvert.parquet_to_root + hepconvert.root_to_parquet + hepconvert.copy_root + hepconvert.merge_root + hepconvert.add_histograms diff --git a/docs/source/merge_root.rst b/docs/source/merge_root.rst new file mode 100644 index 0000000..57ce9f8 --- /dev/null +++ b/docs/source/merge_root.rst @@ -0,0 +1,64 @@ +Command Line Interface Guide: merge_root +======================================== + +Instructions for function `hepconvert.merge_root `__. + +Command: +-------- + +.. code-block:: bash + + hepconvert merge-root [options] [OUT_FILE] [IN_FILES] + + +Examples: +--------- + +.. code-block:: bash + + hepconvert merge-root -f --progress-bar --keep-branches 'Jet_*' out_file.root file1.root file2.root file3.root + + +Or with files in a directory: + +.. code-block:: bash + + hepconvert merge-root -f --progress-bar --drop-branches {'tree1': 'branch1', 'branch2'} out_file.root directory/in_files/ + + +Branch skimming using ``cut``: + +.. code-block:: bash + + hepconvert merge-root -f --keep-branches 'Jet_*' --cut 'Jet_Px > 5' out_file.root directory/in_files + +Options: +-------- + +``--drop-branches``, ``--keep-branches``, ``-db`` or ``-kb`` (list, str or dict) Specify branch names to remove from the ROOT file. Either a str, list of str (for multiple branches), or a dict with form {'tree': 'branches'} to remove branches from certain ttrees. Wildcarding accepted. + +``--drop-trees``, ``--keep-trees`` (list of str, or str) Specify tree names to remove/keep TTrees in the ROOT files. Wildcarding accepted. 
+ +``--cut`` For branch skimming, passed to `uproot.iterate `__. str, if not None, this expression filters all of the expressions. + +``--expressions`` For branch skimming, passed to `uproot.iterate `__. Names of TBranches or aliases to convert to arrays or mathematical expressions of them. If None, all TBranches selected by the filters are included. + +``--force``, ``-f`` Use flag to overwrite a file if it already exists. + +``--progress-bar`` Will show a basic progress bar to show how many TTrees have been merged and written. + +``--append``, ``-a`` Will append new TTree to an existing file. + +``--compression``, ``-c`` Compression type. Options are "lzma", "zlib", "lz4", and "zstd". Default is "zlib". + +``--compression-level`` Level of compression set by an integer. Default is 1. + +``--name`` Give a name to the new TTree. Default is "tree". + +``--title`` Give a title to the new TTree. + +``--initial-basket-capacity`` (int) Number of TBaskets that can be written to the TTree without rewriting the TTree metadata to make room. Default is 10. + +``--resize-factor`` (float) When the TTree metadata needs to be rewritten, this specifies how many more TBasket slots to allocate as a multiplicative factor. Default is 10.0. + +``--step-size`` (str or int) Size of batches of data to read and write. If an integer, the maximum number of entries to include in each iteration step; if a string, the maximum memory size to include. The string must be a number followed by a memory unit, such as “100 MB”. Default is "100 MB". diff --git a/docs/source/parquet_to_root.rst b/docs/source/parquet_to_root.rst new file mode 100644 index 0000000..bba507b --- /dev/null +++ b/docs/source/parquet_to_root.rst @@ -0,0 +1,37 @@ +Command Line Interface Guide: parquet_to_root +============================================= + +Instructions for function `parquet_to_root `__. + +.. code-block:: bash + + hepconvert parquet-to-root [options] [OUT_FILE] [IN_FILE] + +Example: + +.. 
code-block:: bash + + hepconvert parquet-to-root -f --progress-bar True --name new_tree out_file.root in_file.parquet + +This will write the data from a Parquet file to a flat TTree with the name "new_tree". + +Options: +-------- + +``--force``, ``-f`` Use flag to overwrite a file if it already exists. + +``--progress-bar`` Will create a basic progress bar to show how many row-groups have been written. + +``--append`` Will append new TTree to an existing file. + +``--compression``, ``-c`` Compression type. Options are "lzma", "zlib", "lz4", and "zstd". Default is "zlib". + +``--compression-level`` Level of compression set by an integer. Default is 1. + +``--name`` Give a name to the new TTree. Default is "tree". + +``--title`` Give a title to the new TTree. + +``--initial-basket-capacity`` (int) Number of TBaskets that can be written to the TTree without rewriting the TTree metadata to make room. Default is 10. + +``--resize-factor`` (float) When the TTree metadata needs to be rewritten, this specifies how many more TBasket slots to allocate as a multiplicative factor. Default is 10.0. diff --git a/docs/source/prepare_docstrings.py b/docs/source/prepare_docstrings.py index c6bc1c9..91096f2 100644 --- a/docs/source/prepare_docstrings.py +++ b/docs/source/prepare_docstrings.py @@ -19,11 +19,11 @@ ] common = [ - "hepconvert.parquet_to_root.parquet_to_root", - "hepconvert.root_to_parquet.root_to_parquet", - "hepconvert.copy_root.copy_root", - "hepconvert.merge.merge_root", - "hepconvert.histogram_adding.add_histograms", + "hepconvert.parquet_to_root", + "hepconvert.root_to_parquet", + "hepconvert.copy_root", + "hepconvert.merge_root", + "hepconvert.add_histograms", ] latest_commit = ( @@ -70,11 +70,11 @@ def handle_module(modulename, module): .. 
automodule:: {0} """.format(modulename, "=" * len(modulename)) - ensure(modulename + ".rst", content) - if toctree2 is None: - toctree.write(" " + modulename + " (module) <" + modulename + ">\n") - else: - toctree2.write(" " + modulename + " (module) <" + modulename + ">\n") + # ensure(modulename + ".rst", content) + # if toctree2 is None: + # toctree.write(" " + modulename + " (module) <" + modulename + ">\n") + # else: + # toctree2.write(" " + modulename + " (module) <" + modulename + ">\n") if modulename != "hepconvert" and all( not x.startswith("test") and not x.startswith("_") @@ -100,7 +100,8 @@ def line_order(pair): if inspect.isclass(obj): handle_class(modulename + "." + name, obj) elif inspect.isfunction(obj): - handle_function(modulename + "." + name, obj) + ensure("hepconvert." + name + ".rst", content) + handle_function("hepconvert." + name, obj) def handle_class(classname, cls): @@ -223,7 +224,6 @@ def prettymro(c): classname, "\n".join([text for index, line, text in sorted(methods.values())]), ) - ensure(classname + ".rst", content) if upfront or toctree2 is None: if classname not in common: diff --git a/docs/source/root_to_parquet.rst b/docs/source/root_to_parquet.rst new file mode 100644 index 0000000..97b9d11 --- /dev/null +++ b/docs/source/root_to_parquet.rst @@ -0,0 +1,160 @@ +Command Line Interface Guide: root_to_parquet +============================================= + +Instructions for function `hepconvert.root_to_parquet `__ + +Command: +-------- + +.. code-block:: bash + + hepconvert root-to-parquet [options] [OUT_FILE] [IN_FILE] + + +Examples: +--------- + +.. code-block:: bash + + hepconvert root-to-parquet -f --progress-bar --tree 'tree1' out_file.parquet in_file.root + + +Options: +-------- + +``tree`` (str) if there are multiple TTrees in the input file, specify the name of the TTree to copy. + +``--drop-branches``, ``-db``, and ``--keep-branches``, ``-kb`` (list) str or dict Specify branch names to remove from the ROOT file. 
Either a str, list of str (for multiple branches), or a dict with form {'tree': 'branches'} to remove branches from certain ttrees. Wildcarding accepted. + +``--cut`` For branch skimming, passed to `uproot.iterate `__. str, if not None, this expression filters all of the expressions. + +``--expressions`` For branch skimming, passed to `uproot.iterate `__. Names of TBranches or aliases to convert to arrays or mathematical expressions of them. If None, all TBranches selected by the filters are included. + +``--force`` or ``-f`` Use flag to overwrite a file if it already exists. + +``--step-size`` (str or int) Size of batches of data to read and write. If an integer, the maximum number of entries to include in each iteration step; if a string, the maximum memory size to include. The string must be a number followed by a memory unit, such as “100 MB”. Default is "100 MB". + +``--compression`` or ``-c`` (str) Compression type. Options are "lzma", "zlib", "lz4", and "zstd". Default is "zlib". + +``--compression-level`` (int) Level of compression set by an integer. Default is 1. + +Options passed to `ak.to_parquet `__: +---------------------------------------------------------------------------------------------------------------- + +``--list-to32`` (bool) If True, convert Awkward lists into 32-bit Arrow lists +if they're small enough, even if it means an extra conversion. Otherwise, +signed 32-bit **ak.types.ListType** maps to Arrow `ListType`, +signed 64-bit **ak.types.ListType** maps to Arrow `LargeListType`, +and unsigned 32-bit **ak.types.ListType** picks whichever Arrow type its +values fit into. + +``--string-to32`` (bool) Same as the above for Arrow `string` and `large_string`. + +``--bytestring-to32`` (bool) Same as the above for Arrow `binary` and `large_binary`. + +``--emptyarray-to`` (None or dtype) If None, **ak.types.UnknownType** maps to Arrow's +null type; otherwise, it is converted to a given numeric dtype. 
+ +``--categorical-as-dictionary`` (bool) If True, **ak.contents.IndexedArray** and +**ak.contents.IndexedOptionArray** labeled with `__array__ = "categorical"` +are mapped to Arrow `DictionaryArray`; otherwise, the projection is +evaluated before conversion (always the case without +`__array__ = "categorical"`). + +``--extensionarray`` (bool) If True, this function returns extended Arrow arrays +(at all levels of nesting), which preserve metadata so that Awkward to +Arrow to Awkward preserves the array's **ak.types.Type** (though not +the **ak.forms.Form**). If False, this function returns generic Arrow arrays +that might be needed for third-party tools that don't recognize Arrow's +extensions. Even with `extensionarray=False`, the values produced by +Arrow's `to_pylist` method are the same as the values produced by Awkward's +**ak.to_list**. + +``--count-nulls`` (bool) If True, count the number of missing values at each level +and include these in the resulting Arrow array, which makes some downstream +applications faster. If False, skip the up-front cost of counting them. + +``-c`` or ``--compression`` (None, str, or dict) Compression algorithm name, passed to +`pyarrow.parquet.ParquetWriter `__. +Parquet supports `{"NONE", "SNAPPY", "GZIP", "BROTLI", "LZ4", "ZSTD"}` +(where `"GZIP"` is also known as "zlib" or "deflate"). If a dict, the keys +are column names (the same column names that **ak.forms.Form.columns** returns +and **ak.forms.Form.select_columns** accepts) and the values are compression +algorithm names, to compress each column differently. + +``--compression-level`` (None, int, or dict) Compression level, passed to +`pyarrow.parquet.ParquetWriter `__. +Compression levels have different meanings for different compression +algorithms: GZIP ranges from 1 to 9, but ZSTD ranges from -7 to 22, for +example. Generally, higher numbers provide slower but smaller compression. + +``--row-group-size`` (int or None) Will be overwritten by ``step_size``. 
+ +``--data-page-size`` (None or int) Number of bytes in each data page, passed to +`pyarrow.parquet.ParquetWriter `__. +If None, the Parquet default of 1 MiB is used. + +``--parquet-flavor`` (None or `"spark"`) If None, the output Parquet file will follow +Arrow conventions; if `"spark"`, it will follow Spark conventions. Some +systems, such as Spark and Google BigQuery, might need Spark conventions, +while others might need Arrow conventions. Passed to +`pyarrow.parquet.ParquetWriter `__ +as `flavor`. + +``--parquet-version`` (`"1.0"`, `"2.4"`, or `"2.6"`) Parquet file format version. +Passed to `pyarrow.parquet.ParquetWriter `__ +as `version`. + +``--parquet-page-version`` (`"1.0"` or `"2.0"`) Parquet page format version. +Passed to `pyarrow.parquet.ParquetWriter `__ +as `data_page_version`. + +``--parquet-metadata-statistics`` (bool or dict) If True, include summary +statistics for each data page in the Parquet metadata, which lets some +applications search for data more quickly (by skipping pages). If a dict +mapping column names to bool, include summary statistics on only the +specified columns. Passed to +`pyarrow.parquet.ParquetWriter `__ +as `write_statistics`. + +``--parquet-dictionary-encoding`` (bool or dict) If True, allow Parquet to pre-compress +with dictionary encoding. If a dict mapping column names to bool, only +use dictionary encoding on the specified columns. Passed to +`pyarrow.parquet.ParquetWriter `__ +as `use_dictionary`. + +``--parquet-byte-stream-split`` (bool or dict) If True, pre-compress floating +point fields (`float32` or `float64`) with byte stream splitting, which +collects all mantissas in one part of the stream and exponents in another. +Passed to `pyarrow.parquet.ParquetWriter <https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html>`__ +as `use_byte_stream_split`. 
+ +``--parquet-coerce-timestamps`` (None, `"ms"`, or `"us"`) If None, any timestamps +(`datetime64` data) are coerced to a given resolution depending on +`parquet_version`: version `"1.0"` and `"2.4"` are coerced to microseconds, +but later versions use the `datetime64`'s own units. If `"ms"` is explicitly +specified, timestamps are coerced to milliseconds; if `"us"`, microseconds. +Passed to `pyarrow.parquet.ParquetWriter `__ +as `coerce_timestamps`. + +``--parquet-old-int96-timestamps`` (None or bool) If True, use Parquet's INT96 format +for any timestamps (`datetime64` data), taking priority over `parquet_coerce_timestamps`. +If None, let the `parquet_flavor` decide. Passed to +`pyarrow.parquet.ParquetWriter `__ +as `use_deprecated_int96_timestamps`. + +``--parquet-compliant-nested`` (bool) If True, use the Spark/BigQuery/Parquet +`convention for nested lists `__, +in which each list is a one-field record with field name "`element`"; +otherwise, use the Arrow convention, in which the field name is "`item`". +Passed to `pyarrow.parquet.ParquetWriter `__ +as `use_compliant_nested_type`. + +``--parquet-extra-options`` (None or dict) +Any additional options to pass to +`pyarrow.parquet.ParquetWriter `__. + +``--storage-options`` (None or dict) +Any additional options to pass to +`fsspec.core.url_to_fs `__ +to open a remote file for writing. 
diff --git a/src/hepconvert/__main__.py b/src/hepconvert/__main__.py index 500f5d2..1341247 100644 --- a/src/hepconvert/__main__.py +++ b/src/hepconvert/__main__.py @@ -9,16 +9,16 @@ def main() -> None: """ Must provide a subcommand: - parquet-to-root, root-to-parquet, copy-root, add-and-merge, or add + parquet-to-root, root-to-parquet, copy-root, add, or add """ @main.command() @click.argument("destination", type=click.Path()) @click.argument("file") -@click.option("--progress-bar", default=None, type=bool, required=False) -@click.option("--name", required=False, default="") -@click.option("--title", required=False, default="") +@click.option("--progress-bar", is_flag=True) +@click.option("--name", type=str, required=False, default="") +@click.option("--title", type=str, required=False, default="") @click.option( "--initial-basket-capacity", default=10, @@ -30,6 +30,7 @@ def main() -> None: help="When the TTree metadata needs to be rewritten, this specifies how many more TBasket slots to allocate as a multiplicative factor.", ) @click.option( + "-c", "--compression", default="zlib", help='Sets compression level for root file to write to. Can be one of "ZLIB", "LZMA", "LZ4", or "ZSTD". By default the compression algorithm is "LZ4".', @@ -51,7 +52,7 @@ def parquet_to_root( *, name="tree", branch_types=None, - progress_bar=False, + progress_bar, title="", field_name=lambda outer, inner: inner if outer == "" else outer + "_" + inner, initial_basket_capacity=10, @@ -87,17 +88,36 @@ def parquet_to_root( @click.argument("destination", type=click.Path()) @click.argument("file") @click.option( - "--drop-branches", "-db", default=None, type=list or dict or str, required=False + "-db", + "--drop-branches", + default=None, + type=list or dict or str, + required=False, + help="Specify branch names to remove from the ROOT file. Either a str, list of str (for multiple branches), or a dict with form {'tree': 'branches'} to remove branches from certain ttrees. 
Wildcarding accepted.", +) +@click.option( + "-kb", "--keep-branches", default=None, type=list or dict or str, required=False +) +@click.option( + "-dt", + "--drop-trees", + default=None, + type=list or str, + required=False, + help="Specify tree names to remove from the ROOT file. Wildcarding accepted.", ) @click.option( - "--keep-branches", "-kb", default=None, type=list or dict or str, required=False + "-kt", + "--keep-trees", + default=None, + type=list or str, + required=False, + help="Specify tree names to keep in the ROOT file. All others will be removed. Wildcarding accepted.", ) -@click.option("--drop-trees", "-dt", default=None, type=list or str, required=False) -@click.option("--keep-trees", "kt", default=None, type=list or str, required=False) -@click.option("--progress-bar", default=None, type=bool, required=False) +@click.option("--progress-bar", is_flag=True) @click.option("--cut", default=None, type=str or list, required=False) @click.option("--expressions", default=None, type=str or list, required=False) -@click.option("--title", required=False, default="") +@click.option("--title", type=str, required=False, default="") @click.option( "--initial-basket-capacity", default=10, @@ -124,7 +144,7 @@ def copy_root( keep_trees=None, cut=None, expressions=None, - progress_bar=None, + progress_bar, force, title="", field_name=lambda outer, inner: inner if outer == "" else outer + "_" + inner, @@ -171,48 +191,53 @@ def copy_root( is_flag=True, help="Overwrite destination file if it already exists", ) -@click.option("--progress-bar", default=None, type=bool, required=False) -@click.option("--append", default=False, help="Append histograms to an existing file") +@click.option("--progress-bar", is_flag=True) +@click.option( + "-a", "--append", is_flag=True, help="Append histograms to an existing file" +) @click.option( + "-c", "--compression", default="zlib", - help='Sets compression level for root file to write to. 
Can be one of "ZLIB", "LZMA", "LZ4", or "ZSTD". By default the compression algorithm is "LZ4".', + type=str, + help='Sets compression level for root file to write to. Can be one of "ZLIB", "LZMA", "LZ4", or "ZSTD". By default the compression algorithm is "ZLIB".', ) @click.option( "--compression-level", default=1, + type=int, help="Use a compression level particular to the chosen compressor. By default the compression level is 1.", ) @click.option( "--skip-bad-files", - default=False, + is_flag=True, help="Skip corrupt or non-existent files without exiting", ) @click.option( "--union", - default=True, + is_flag=True, help="Adds the histograms that have the same name and appends all others to the new file", ) @click.option( "--same-names", - default=False, + is_flag=True, help="Only adds histograms together if they have the same name", ) def add( destination, files, *, - progress_bar=False, + progress_bar, force, - append=False, + append, compression="zlib", compression_level=1, - skip_bad_files=False, - union=True, - same_names=False, + skip_bad_files, + union, + same_names, ): """ - Hadd files. + Sums histograms and writes them to a new file. """ import hepconvert.histogram_adding # pylint: disable=import-outside-toplevel @@ -246,21 +271,51 @@ def add( ) @click.option( "--step-size", - default=100, + default="100 MB", + type=int or str, help="If an integer, the maximum number of entries to include in each iteration step; if a string, the maximum memory size to include. 
The string must be a number followed by a memory unit, such as “100 MB”.", ) -@click.option("--drop-branches", default=None, type=list or dict or str, required=False) -@click.option("--keep-branches", default=None, type=list or dict or str, required=False) -@click.option("--drop-trees", default=None, type=list or str, required=False) -@click.option("--keep-trees", default=None, type=list or str, required=False) -@click.option("--progress-bar", default=None, type=bool, required=False) +@click.option( + "-db", + "--drop-branches", + default=None, + type=list or dict or str, + required=False, + help="Specify branch names to remove from the ROOT file. Either a str, list of str (for multiple branches), or a dict with form {'tree': 'branches'} to remove branches from certain ttrees. Wildcarding accepted.", +) +@click.option( + "-kb", "--keep-branches", default=None, type=list or dict or str, required=False +) +@click.option( + "-dt", + "--drop-trees", + default=None, + type=list or str, + required=False, + help="Specify tree names to remove from the ROOT file. Wildcarding accepted.", +) +@click.option( + "-kt", + "--keep-trees", + default=None, + type=list or str, + required=False, + help="Specify tree names to keep in the ROOT file.. Wildcarding accepted.", +) +@click.option("--progress-bar", is_flag=True) @click.option("--cut", default=None, type=str or list, required=False) @click.option("--expressions", default=None, type=str or list, required=False) @click.option( - "--force", is_flag=True, help="Overwrite destination file if it already exists" + "-f", + "--force", + is_flag=True, + help="Overwrite destination file if it already exists", +) +@click.option( + "-a", "--append", is_flag=True, help="Append histograms to an existing file" ) -@click.option("--append", default=False, help="Append histograms to an existing file") @click.option( + "-c", "--compression", default="zlib", help='Sets compression level for root file to write to. 
Can be one of "ZLIB", "LZMA", "LZ4", or "ZSTD". By default the compression algorithm is "LZ4".', @@ -288,13 +343,13 @@ def merge_root( keep_trees=None, cut=None, expressions=None, - progress_bar=False, + progress_bar, initial_basket_capacity=10, resize_factor=10.0, counter_name=lambda counted: "n" + counted, step_size="100 MB", force, - append=False, + append, compression="LZ4", compression_level=1, skip_bad_files=False, @@ -339,6 +394,24 @@ def merge_root( type=bool, help="Specify the name of a tree to write to Parquet, if there are multiple trees in the ROOT file.", ) +@click.option( + "-db", + "--drop-branches", + default=None, + type=list or dict or str, + required=False, + help="Specify branch names to remove from the ROOT file. Either a str, list of str (for multiple branches), or a dict with form {'tree': 'branches'} to remove branches from certain ttrees. Wildcarding accepted.", +) +@click.option( + "-kb", + "--keep-branches", + default=None, + type=list or dict or str, + required=False, + help="Specify branch names to keep in the ROOT file. Either a str, list of str (for multiple branches), or a dict with form {'tree': 'branches'} to keep only certain branches in certain ttrees. Wildcarding accepted.", +) +@click.option("--cut", default=None, type=str or list, required=False) +@click.option("--expressions", default=None, type=str or list, required=False) @click.option( "-f", "--force", @@ -349,6 +422,7 @@ def merge_root( @click.option( "-s", "--step-size", + type=int or str, default="100 MB", help="Specify batch size for reading ROOT file. If an integer, the maximum number of entries to include in each iteration step; if a string, the maximum memory size to include.", ) @@ -393,6 +467,7 @@ def merge_root( help="Count the number of missing values at each level and include these in the resulting Arrow array, which makes some downstream applications faster. 
If False, skip the up-front cost of counting them.", ) @click.option( + "-c", "--compression", default=False, type=bool, @@ -480,6 +555,13 @@ def root_to_parquet( in_file=None, out_file=None, *, + tree=None, + drop_branches=None, + keep_branches=None, + cut=None, + expressions=None, + force=False, + step_size="100 MB", list_to32=False, string_to32=True, bytestring_to32=True, @@ -502,9 +584,6 @@ def root_to_parquet( parquet_compliant_nested=False, parquet_extra_options=None, storage_options=None, - tree=None, - force, - step_size=100, ): """ Convert ROOT to Parquet. @@ -514,6 +593,13 @@ def root_to_parquet( hepconvert.root_to_parquet( in_file=in_file, out_file=out_file, + tree=tree, + drop_branches=drop_branches, + keep_branches=keep_branches, + cut=cut, + expressions=expressions, + force=force, + step_size=step_size, list_to32=list_to32, string_to32=string_to32, bytestring_to32=bytestring_to32, @@ -536,9 +622,6 @@ def root_to_parquet( parquet_compliant_nested=parquet_compliant_nested, parquet_extra_options=parquet_extra_options, storage_options=storage_options, - tree=tree, - force=force, - step_size=step_size, ) diff --git a/src/hepconvert/copy_root.py b/src/hepconvert/copy_root.py index 321adc7..e59a7e3 100644 --- a/src/hepconvert/copy_root.py +++ b/src/hepconvert/copy_root.py @@ -18,7 +18,7 @@ def copy_root( *, keep_branches=None, drop_branches=None, - # add_branches=None, #TO-DO: add functionality for this, just specify about the counter issue + # add_branches=None, #TODO: add functionality for this, just specify about the counter issue? keep_trees=None, drop_trees=None, cut=None, @@ -26,7 +26,7 @@ def copy_root( progress_bar=None, force=False, fieldname_separator="_", - # fix_duplicate_counters=False, #TO-DO: ask about this? + # fix_duplicate_counters=False, #TODO: ask about this? 
title="", field_name=lambda outer, inner: inner if outer == "" else outer + "_" + inner, initial_basket_capacity=10, @@ -112,7 +112,9 @@ def copy_root( -------------------------- This function can be run from the command line. Use command - >>> hepconvert copy-root [options] [OUT_FILE] [IN_FILE] + .. code-block:: bash + + hepconvert copy-root [options] [OUT_FILE] [IN_FILE] """ if compression in ("ZLIB", "zlib"): @@ -214,11 +216,10 @@ def copy_root( ) raise ValueError(msg) - if len(trees) > 1 and progress_bar: + if len(trees) > 1 and progress_bar is not False: + number_of_items = len(trees) if progress_bar is True: tqdm = _utils.check_tqdm() - number_of_items = len(trees) - progress_bar = tqdm.tqdm(desc="Trees copied") progress_bar.reset(total=number_of_items) for t in trees: @@ -279,6 +280,6 @@ def copy_root( out_file[tree.name].extend(chunk) except AssertionError: msg = "Are the branch-names correct?" - if len(trees) > 1 and progress_bar: + if len(trees) > 1 and progress_bar is not False: progress_bar.update(n=1) f.close() diff --git a/src/hepconvert/histogram_adding.py b/src/hepconvert/histogram_adding.py index 3cf8ae6..0d57450 100644 --- a/src/hepconvert/histogram_adding.py +++ b/src/hepconvert/histogram_adding.py @@ -397,7 +397,9 @@ def add_histograms( -------------------------- This function can be run from the command line. Use command - >>> hepconvert add [options] [OUT_FILE] [IN_FILES] + .. 
code-block:: bash + + hepconvert add [options] [OUT_FILE] [IN_FILES] """ if compression in ("ZLIB", "zlib"): @@ -449,12 +451,12 @@ def add_histograms( with uproot.open(files[0]) as file: keys = file.keys(filter_classname="TH[1|2|3][I|S|F|D|C]", cycle=False) - if progress_bar: + if progress_bar is not False: + tqdm = _utils.check_tqdm() file_bar = progress_bar - hist_bar = progress_bar + hist_bar = tqdm.tqdm(desc="Histograms added") + number_of_items = len(files) if progress_bar is True: - tqdm = _utils.check_tqdm() - number_of_items = len(files) file_bar = tqdm.tqdm(desc="Files added") hist_bar = tqdm.tqdm(desc="Histograms added") diff --git a/src/hepconvert/merge.py b/src/hepconvert/merge.py index febbdfa..66a0c70 100644 --- a/src/hepconvert/merge.py +++ b/src/hepconvert/merge.py @@ -109,7 +109,9 @@ def merge_root( -------------------------- This function can be run from the command line. Use command - >>> hepconvert merge [options] [OUT_FILE] [IN_FILES] + .. code-block:: bash + + hepconvert merge [options] [OUT_FILE] [IN_FILES] """ @@ -243,7 +245,7 @@ def merge_root( destination, ) raise ValueError(msg) - if progress_bar: + if progress_bar is not False: if progress_bar is True: tqdm = _utils.check_tqdm() number_of_items = len(files) @@ -306,7 +308,7 @@ def merge_root( out_file[tree.name].extend(chunk) except AssertionError: msg = "TTrees must have the same structure to be merged. Are the branch_names correct?" 
- if progress_bar: + if progress_bar is not False: progress_bar.update(n=1) f.close() @@ -381,6 +383,6 @@ def merge_root( for key in hist_keys: out_file[key] = writable_hists[key] - if progress_bar: + if progress_bar is not False: progress_bar.update(n=1) f.close() diff --git a/src/hepconvert/parquet_to_root.py b/src/hepconvert/parquet_to_root.py index 220680b..fc1a135 100644 --- a/src/hepconvert/parquet_to_root.py +++ b/src/hepconvert/parquet_to_root.py @@ -13,16 +13,17 @@ def parquet_to_root( file, *, name="tree", + force=False, branch_types=None, progress_bar=False, + append=False, title="", field_name=lambda outer, inner: inner if outer == "" else outer + "_" + inner, initial_basket_capacity=10, counter_name=lambda counted: "n" + counted, resize_factor=10.0, - compression="zlib", + compression="ZLIB", compression_level=1, - force=True, ): """Converts a Parquet file into a ROOT file. Data is stored in one TTree, which has a name defined by argument ``name``. @@ -66,7 +67,9 @@ def parquet_to_root( -------------------------- This function can be run from the command line. Use command - >>> hepconvert parquet-to-root [options] [OUT_FILE] [IN_FILE] + .. code-block:: bash + + hepconvert parquet-to-root [options] [OUT_FILE] [IN_FILE] """ if compression in ("LZMA", "lzma"): @@ -82,21 +85,35 @@ def parquet_to_root( raise ValueError(msg) path = Path(destination) if Path.is_file(path) and not force: - raise FileExistsError + msg = f"File {path} already exists. To overwrite it, set force=True." + raise FileExistsError(msg) + if append: + if Path.is_file(path): + out_file = uproot.update( + destination, + compression=uproot.compression.Compression.from_code_pair( + compression_code, compression_level + ), + ) + else: + msg = "Cannot append to a non-existent file." 
+ raise FileNotFoundError(msg) + + else: + out_file = uproot.recreate( + destination, + compression=uproot.compression.Compression.from_code_pair( + compression_code, compression_level + ), + ) metadata = ak.metadata_from_parquet(file) - if progress_bar: + if progress_bar is not False: + number_of_items = metadata["num_row_groups"] if progress_bar is True: - number_of_items = metadata["num_row_groups"] tqdm = _utils.check_tqdm() progress_bar = tqdm.tqdm(desc="Row-groups written") progress_bar.reset(number_of_items) - out_file = uproot.recreate( - destination, - compression=uproot.compression.Compression.from_code_pair( - compression_code, compression_level - ), - ) chunk = ak.from_parquet(file, row_groups=[0]) if not branch_types: diff --git a/src/hepconvert/root_to_parquet.py b/src/hepconvert/root_to_parquet.py index a550ede..324c867 100644 --- a/src/hepconvert/root_to_parquet.py +++ b/src/hepconvert/root_to_parquet.py @@ -43,7 +43,7 @@ def root_to_parquet( """Converts ROOT to Parquet file using Uproot and awkward.to_parquet. Data read from 1 tree, converted to single Parquet file. :param in_file: Local ROOT file to convert to Parquet. May contain glob patterns. - :type in_file: str + :type in_file: path-like :param out_file: Name of the output file or file path. :type out_file: path-like :param tree: If there are multiple trees in the ROOT file, specify the name of one to write to Parquet. @@ -95,7 +95,7 @@ def root_to_parquet( Command line option: ``--count-nulls``. :type count_nulls: bool :param compression: Compression algorithm name, passed to - [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + `pyarrow.parquet.ParquetWriter `__. Parquet supports `{"NONE", "SNAPPY", "GZIP", "BROTLI", "LZ4", "ZSTD"}` (where `"GZIP"` is also known as "zlib" or "deflate"). 
If a dict, the keys are column names (the same column names that #ak.forms.Form.columns returns @@ -103,33 +103,33 @@ def root_to_parquet( algorithm names, to compress each column differently. Command line option: ``--compression``. :type compression: None, str, or dict :param compression_level: Compression level, passed to - [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + `pyarrow.parquet.ParquetWriter `__. Compression levels have different meanings for different compression algorithms: GZIP ranges from 1 to 9, but ZSTD ranges from -7 to 22, for example. Generally, higher numbers provide slower but smaller compression. Command line option ``--compression-level``. :type compression_level: None, int, or dict None :param row_group_size: Maximum number of entries in each row group, - passed to [pyarrow.parquet.ParquetWriter.write_table](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html#pyarrow.parquet.ParquetWriter.write_table). + passed to `pyarrow.parquet.ParquetWriter.write_table `__. If None, the Parquet default of 64 MiB is used. Command line options: ``-rg`` or ``--row-group-size``. :type row_group_size: int or None :param data_page_size: Number of bytes in each data page, passed to - [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + `pyarrow.parquet.ParquetWriter `__. If None, the Parquet default of 1 MiB is used. Command line option: ``--data-page-size``. :type data_page_size: None or int :param parquet_flavor: If None, the output Parquet file will follow Arrow conventions; if `"spark"`, it will follow Spark conventions. Some systems, such as Spark and Google BigQuery, might need Spark conventions, while others might need Arrow conventions. Passed to - [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). 
+ `pyarrow.parquet.ParquetWriter `__. as `flavor`. Command line option: ``--parquet-flavor``. :type parquet_flavor: None or `"spark"` :param parquet_version: Parquet file format version. - Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + Passed to `pyarrow.parquet.ParquetWriter `__. as `version`. Command line option: ``--parquet-version``. :type parquet_version: `"1.0"`, `"2.4"`, or `"2.6"` :param parquet_page_version: Parquet page format version. - Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + Passed to `pyarrow.parquet.ParquetWriter `__. as `data_page_version`. Command line option: ``--parquet-page-version``. :type parquet_page_version: `"1.0"` or `"2.0"` :param parquet_metadata_statistics: If True, include summary @@ -137,19 +137,19 @@ def root_to_parquet( applications search for data more quickly (by skipping pages). If a dict mapping column names to bool, include summary statistics on only the specified columns. Passed to - [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + `pyarrow.parquet.ParquetWriter `__. as `write_statistics`. Command line option: ``--parquet-metadata-statistics``. :type parquet_metadata_statistics: bool or dict :param parquet_dictionary_encoding: If True, allow Parquet to pre-compress with dictionary encoding. If a dict mapping column names to bool, only use dictionary encoding on the specified columns. Passed to - [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + `pyarrow.parquet.ParquetWriter `__. as `use_dictionary`. Command line option: ``--parquet-dictionary-encoding``. 
:type parquet_dictionary_encoding: bool or dict :param parquet_byte_stream_split: If True, pre-compress floating point fields (`float32` or `float64`) with byte stream splitting, which collects all mantissas in one part of the stream and exponents in another. - Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + Passed to `pyarrow.parquet.ParquetWriter `__. as `use_byte_stream_split`. Command line option: ``--parquet-byte-stream-split``. :type parquet_byte_stream_split: bool or dict :param parquet_coerce_timestamps: If None, any timestamps @@ -157,27 +157,27 @@ def root_to_parquet( `parquet_version`: version `"1.0"` and `"2.4"` are coerced to microseconds, but later versions use the `datetime64`'s own units. If `"ms"` is explicitly specified, timestamps are coerced to milliseconds; if `"us"`, microseconds. - Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + Passed to `pyarrow.parquet.ParquetWriter `__. as `coerce_timestamps`. Command line option: ``--parquet-coerce-timestamps``. :type parquet_coerce_timestamps: None, `"ms"`, or `"us"` :param parquet_old_int96_timestamps: If True, use Parquet's INT96 format for any timestamps (`datetime64` data), taking priority over `parquet_coerce_timestamps`. If None, let the `parquet_flavor` decide. Passed to - [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + `pyarrow.parquet.ParquetWriter `__. as `use_deprecated_int96_timestamps`. Command line option: ``--parquet-old-int96-timestamps``. 
:type parquet_old_int96_timestamps: None or bool :param parquet_compliant_nested: If True, use the Spark/BigQuery/Parquet - [convention for nested lists](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types), + `convention for nested lists `__, in which each list is a one-field record with field name "`element`"; otherwise, use the Arrow convention, in which the field name is "`item`". - Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + Passed to `pyarrow.parquet.ParquetWriter `__. as `use_compliant_nested_type`. Command line option: ``--parquet-compliant-nested``. :type parquet_compliated_nested: bool :param parquet_extra_options: Any additional options to pass to - [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + `pyarrow.parquet.ParquetWriter `__. :type parquet_extra_options: None or dict :param storage_options: Any additional options to pass to - [fsspec.core.url_to_fs](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.core.url_to_fs) + `fsspec.core.url_to_fs `__ to open a remote file for writing. :type storage_options: None or dict @@ -191,7 +191,9 @@ def root_to_parquet( -------------------------- This function can be run from the command line. Use command - >>> hepconvert root-to-parquet [options] [OUT_FILE] [IN_FILE] + .. 
code-block:: bash + + hepconvert root-to-parquet [options] [OUT_FILE] [IN_FILE] """ path = Path(out_file) diff --git a/src/hepconvert/write_root.py b/src/hepconvert/write_root.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_add_histograms.py b/tests/test_add_histograms.py index 01cdedf..139e12f 100644 --- a/tests/test_add_histograms.py +++ b/tests/test_add_histograms.py @@ -71,12 +71,12 @@ def test_simple(tmp_path): ).all -def mult_1D(tmp_path): +def mult_1D(tmp_path, file_paths): gauss_1 = ROOT.TH1I("name1", "title", 5, -4, 4) gauss_1.FillRandom("gaus") gauss_1.Sumw2() gauss_1.SetDirectory(0) - outHistFile = ROOT.TFile.Open(os.path.join(tmp_path, "file1.root"), "RECREATE") + outHistFile = ROOT.TFile.Open(file_paths[0], "RECREATE") outHistFile.cd() gauss_1.Write() outHistFile.Close() @@ -86,7 +86,7 @@ def mult_1D(tmp_path): gauss_2.FillRandom("gaus") gauss_2.Sumw2() gauss_2.SetDirectory(0) - outHistFile = ROOT.TFile.Open(os.path.join(tmp_path, "file1.root"), "UPDATE") + outHistFile = ROOT.TFile.Open(file_paths[0], "UPDATE") outHistFile.cd() gauss_2.Write() outHistFile.Close() @@ -96,7 +96,7 @@ def mult_1D(tmp_path): gauss_3.FillRandom("gaus") gauss_3.Sumw2() gauss_3.SetDirectory(0) - outHistFile = ROOT.TFile.Open(os.path.join(tmp_path, "file2.root"), "RECREATE") + outHistFile = ROOT.TFile.Open(file_paths[1], "RECREATE") outHistFile.cd() gauss_3.Write() outHistFile.Close() @@ -106,7 +106,7 @@ def mult_1D(tmp_path): gauss_4.FillRandom("gaus") gauss_4.Sumw2() gauss_4.SetDirectory(0) - outHistFile = ROOT.TFile.Open(os.path.join(tmp_path, "file2.root"), "UPDATE") + outHistFile = ROOT.TFile.Open(file_paths[1], "UPDATE") outHistFile.cd() gauss_4.Write() outHistFile.Close() @@ -116,7 +116,7 @@ def mult_1D(tmp_path): gauss_5.FillRandom("gaus") gauss_5.Sumw2() gauss_5.SetDirectory(0) - outHistFile = ROOT.TFile.Open(os.path.join(tmp_path, "file3.root"), "RECREATE") + outHistFile = ROOT.TFile.Open(file_paths[2], "RECREATE") outHistFile.cd() 
gauss_5.Write() outHistFile.Close() @@ -126,23 +126,14 @@ def mult_1D(tmp_path): gauss_6.FillRandom("gaus") gauss_6.Sumw2() gauss_6.SetDirectory(0) - outHistFile = ROOT.TFile.Open(os.path.join(tmp_path, "file3.root"), "UPDATE") + outHistFile = ROOT.TFile.Open(file_paths[2], "UPDATE") outHistFile.cd() gauss_6.Write() outHistFile.Close() h6 = uproot.from_pyroot(gauss_6) destination = os.path.join(tmp_path, "destination.root") - hepconvert.add_histograms( - destination, - [ - os.path.join(tmp_path, "file1.root"), - os.path.join(tmp_path, "file2.root"), - os.path.join(tmp_path, "file3.root"), - ], - force=True, - same_names=False, - ) + hepconvert.add_histograms(destination, file_paths, force=True, same_names=False) with uproot.open(destination) as file: added = uproot.from_pyroot(