diff --git a/README.md b/README.md
index 15ebef7..20854b0 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# hepconvert
+
[![Actions Status][actions-badge]][actions-link]
[![Documentation Status][rtd-badge]][rtd-link]
@@ -24,7 +24,7 @@
[rtd-badge]: https://readthedocs.org/projects/hepconvert/badge/?version=latest
[rtd-link]: https://hepconvert.readthedocs.io/en/latest/
-The hepconvert library is a bridge between columnar file formats, currently **ROOT, and Parquet** and soon eventually include **Feather, and HDF5.** It aims to simplify file conversions in Python, replacing what is usually a multi-step process with one line of code, with builtin features for managing large datasets and choosing compression levels.
+The hepconvert library is a bridge between columnar file formats, currently **ROOT and Parquet**, and will soon include **Feather and HDF5.** It aims to simplify file conversions in Python, replacing what is usually a multi-step process with one line of code, with builtin features for managing large datasets and choosing compression levels.
# Installation
diff --git a/docs/source/add.rst b/docs/source/add.rst
new file mode 100644
index 0000000..b9a1dfb
--- /dev/null
+++ b/docs/source/add.rst
@@ -0,0 +1,43 @@
+CLI Guide for add_histograms (add)
+==================================
+
+Instructions for function `add_histograms `__.
+
+Command:
+--------
+
+.. code-block:: bash
+
+ hepconvert add [options] [OUT_FILE] [IN_FILES]
+
+
+Examples:
+---------
+
+.. code-block:: bash
+
+ hepconvert add -f --progress-bar --union summed_hists.root hist1.root hist2.root hist3.root
+
+Or, if files are in a directory:
+
+.. code-block:: bash
+
+    hepconvert add -f --append --same-names summed_hists.root path/directory/
+
+
+Options:
+--------
+
+``--force``, ``-f`` Use flag to overwrite a file if it already exists.
+
+``--progress-bar`` Will show a basic progress bar to show how many histograms have been summed, and how many files have been read.
+
+``--append``, ``-a`` Will append histograms to an existing file.
+
+``--compression``, ``-c`` Compression type. Options are "lzma", "zlib", "lz4", and "zstd". Default is "zlib".
+
+``--compression-level`` Level of compression set by an integer. Default is 1.
+
+``--union`` Use flag to add together histograms that have the same name and append all others to the new file.
+
+``--same-names`` Use flag to only add histograms together if they have the same name.
diff --git a/docs/source/cli.toctree b/docs/source/cli.toctree
new file mode 100644
index 0000000..ed5fe37
--- /dev/null
+++ b/docs/source/cli.toctree
@@ -0,0 +1,9 @@
+.. toctree::
+ :caption: Command Line Interface Instructions
+ :hidden:
+
+ parquet-to-root
+ root-to-parquet
+ copy-root
+ merge-root
+    add_histograms (add) <add>
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 688e5ef..fe52563 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -36,4 +36,4 @@
# Additional stuff
master_doc = "index"
-# exec(open("prepare_docstrings.py").read(), dict(globals()))
+exec(open("prepare_docstrings.py").read(), dict(globals()))
diff --git a/docs/source/copy_root.rst b/docs/source/copy_root.rst
new file mode 100644
index 0000000..30d5c5d
--- /dev/null
+++ b/docs/source/copy_root.rst
@@ -0,0 +1,57 @@
+Command Line Interface Guide: copy_root
+=======================================
+
+Instructions for function `hepconvert.copy_root `__
+
+Command:
+--------
+
+.. code-block:: bash
+
+ hepconvert copy-root [options] [OUT_FILE] [IN_FILE]
+
+
+Examples:
+---------
+
+.. code-block:: bash
+
+ hepconvert copy-root -f --progress-bar --keep-branches 'Jet_*' out_file.root in_file.root
+
+
+Branch skimming using ``cut``:
+
+.. code-block:: bash
+
+ hepconvert copy-root -f --keep-branches 'Jet_*' --cut 'Jet_Px > 5' out_file.root in_file.root
+
+Options:
+--------
+
+``--drop-branches``, ``-db`` and ``--keep-branches``, ``-kb`` list, str or dict. Specify branch names to remove from the ROOT file. Either a str, list of str (for multiple branches), or a dict with form {'tree': 'branches'} to remove branches from certain ttrees. Wildcarding accepted.
+
+``--drop-trees``, ``-dt`` and ``--keep-trees``, ``-kt`` list of str, or str. Specify tree names to remove/keep TTrees in the ROOT files. Wildcarding accepted.
+
+``--cut`` For branch skimming, passed to `uproot.iterate `__. str, if not None, this expression filters all of the expressions.
+
+``--expressions`` For branch skimming, passed to `uproot.iterate `__. Names of TBranches or aliases to convert to arrays or mathematical expressions of them. If None, all TBranches selected by the filters are included.
+
+``--force``, ``-f`` Use flag to overwrite a file if it already exists.
+
+``--progress-bar`` Will show a basic progress bar to show how many TTrees have been merged and written.
+
+``--append``, ``-a`` Will append new TTree to an existing file.
+
+``--compression``, ``-c`` Compression type. Options are "lzma", "zlib", "lz4", and "zstd". Default is "zlib".
+
+``--compression-level`` Level of compression set by an integer. Default is 1.
+
+``--name`` Give a name to the new TTree. Default is "tree".
+
+``--title`` Give a title to the new TTree.
+
+``--initial-basket-capacity`` (int) Number of TBaskets that can be written to the TTree without rewriting the TTree metadata to make room. Default is 10.
+
+``--resize-factor`` (float) When the TTree metadata needs to be rewritten, this specifies how many more TBasket slots to allocate as a multiplicative factor. Default is 10.0.
+
+``--step-size`` Size of batches of data to read and write. If an integer, the maximum number of entries to include in each iteration step; if a string, the maximum memory size to include. The string must be a number followed by a memory unit, such as “100 MB”. Default is "100 MB"
diff --git a/docs/source/general_guide.rst b/docs/source/general_guide.rst
new file mode 100644
index 0000000..d1a5fa8
--- /dev/null
+++ b/docs/source/general_guide.rst
@@ -0,0 +1,229 @@
+General Guide and Examples:
+===========================
+Is something missing from this guide? Please post your questions on the `discussions page `__!
+
+Features of all (or most) functions:
+----------------------------------------
+
+**Automatic handling of Uproot duplicate counter issue:**
+If you are using a hepconvert function that goes ROOT -> ROOT (both the input and output files are ROOT)
+and working with data in jagged arrays, if branches have the same "fLeafCount", hepconvert
+will group branches automatically so that Uproot will not create a `counter branch for each branch `__.
+
+**Quick Modifications of ROOT files and TTrees:**
+
+Functions ``copy_root``, ``merge_root``, and ``root_to_parquet`` have a few options for applying quick
+modifications to ROOT files and TTree data.
+
+**Branch slimming:**
+ Parameters ``keep_branches`` or ``drop_branches`` (list or dict) control branch slimming.
+ Examples:
+
+ .. code:: python
+
+ >>> hepconvert.root_to_parquet("out_file.root", "in_file.root", keep_branches="x*", progress_bar=True, force=True)
+
+ # Before:
+
+ # name | typename | interpretation
+ # ---------------------+--------------------------+-------------------------------
+ # x1 | int64_t | AsDtype('>i8')
+ # x2 | int64_t | AsDtype('>i8')
+ # y1 | int64_t | AsDtype('>i8')
+ # y2 | int64_t | AsDtype('>i8')
+
+ # After:
+
+ # name | typename | interpretation
+ # ---------------------+--------------------------+-------------------------------
+ # x1 | int64_t | AsDtype('>i8')
+ # x2 | int64_t | AsDtype('>i8')
+
+ .. code:: python
+
+ >>> hepconvert.root_to_parquet("out_file.root", "in_file.root", keep_branches={"tree1": ["branch2", "branch3"], "tree2": ["branch2"]}, progress_bar=True, force=True)
+
+ # Before:
+
+ # Tree1:
+ # name | typename | interpretation
+ # ---------------------+--------------------------+-------------------------------
+ # branch1 | int64_t | AsDtype('>i8')
+ # branch2 | int64_t | AsDtype('>i8')
+ # branch3 | int64_t | AsDtype('>i8')
+
+ # Tree2:
+ # name | typename | interpretation
+ # ---------------------+--------------------------+-------------------------------
+ # branch1 | int64_t | AsDtype('>i8')
+ # branch2 | int64_t | AsDtype('>i8')
+ # branch3 | int64_t | AsDtype('>i8')
+
+ # After:
+
+ # Tree1:
+ # name | typename | interpretation
+ # ---------------------+--------------------------+-------------------------------
+ # branch2 | int64_t | AsDtype('>i8')
+ # branch3 | int64_t | AsDtype('>i8')
+
+ # Tree2:
+ # name | typename | interpretation
+ # ---------------------+--------------------------+-------------------------------
+ # branch2 | int64_t | AsDtype('>i8')
+
+
+**Branch skimming:**
+ Parameters ``cut`` and ``expressions`` control branch skimming. Both of these parameters go to Uproot's `iterate
+ `__
+ function. See Uproot's documentation for more details.
+
+ Basic example:
+
+ .. code:: python
+
+ hepconvert.copy_root("skimmed_HZZ.root", "HZZ.root", keep_branches="Jet_",
+ force=True, expressions="Jet_Px", cut="Jet_Px >= 10",)
+
+
+**Remove TTrees:**
+ Use parameters ``keep_ttrees`` or ``drop_ttrees`` to remove TTrees.
+
+ .. code:: python
+
+ # Creating example data:
+ with uproot.recreate("two_trees.root") as file:
+ file["tree"] = {"x": np.array([1, 2, 3])}
+ file["tree1"] = {"x": np.array([1, 2, 3])}
+
+        hepconvert.copy_root("one_tree.root", "two_trees.root", keep_trees="tree",
+ force=True, expressions="Jet_Px", cut="Jet_Px >= 10",)
+
+
+**How hepconvert works with ROOT**
+
+hepconvert uses Uproot for reading and writing ROOT files; it also has the same limitations.
+It currently only works with flat TTrees (nanoAOD-like data), and cannot yet read or write RNTuples.
+
+As described in Uproot's documentation:
+
+.. note::
+
+ A small but growing list of data types can be written to files:
+
+ * strings: TObjString
+ * histograms: TH1*, TH2*, TH3*
+ * profile plots: TProfile, TProfile2D, TProfile3D
+ * NumPy histograms created with `np.histogram `__, `np.histogram2d `__, and `np.histogramdd `__ with 3 dimensions or fewer
+ * histograms that satisfy the `Universal Histogram Interface `__ (UHI) with 3 dimensions or fewer; this includes `boost-histogram `__ and `hist `__
+ * PyROOT objects
+
+**Memory Management**
+
+Each hepconvert function has automatic and customizable memory management for working with large files.
+
+Functions reading **ROOT** files will read in batches controlled by the parameter ``step_size``.
+Set ``step_size`` to either an `int` to set the batch size to a number of entries, or a `string` in
+form of "100 MB".
+
+
+**Progress Bars**
+hepconvert uses the package tqdm for progress bars; if you do not have the package installed, an error message will provide installation instructions.
+They are controlled with the ``progress_bar`` argument.
+For example, to use a default progress bar with copy_root, set progress_bar to True:
+
+.. code:: python
+
+ hepconvert.copy_root("out_file.root", "in_file.root", progress_bar=True)
+
+
+Some functions can handle a customized tqdm progress bar.
+To use a customized tqdm progress bar, make a progress bar object and pass it to the hepconvert function like so,
+
+.. code:: python
+
+ >>> import tqdm
+
+ >>> bar_obj = tqdm.tqdm(colour="GREEN", desc="Description")
+ >>> hepconvert.add_histograms("out_file.root", "path/in_files/", progress_bar=bar_obj)
+
+.. image:: https://raw.githubusercontent.com/scikit-hep/hepconvert/main/docs/docs-img/progress_bar.png
+ :width: 450px
+ :alt: hepconvert
+ :target: https://github.com/scikit-hep/hepconvert
+
+
+Some types of tqdm progress bar objects may not work in this way.
+
+
+**Command Line Interface**
+
+All functions are able to be run in the command line. See the "Command Line Interface Instructions" tab on the left to see CLI
+instructions on individual functions.
+
+Adding Histograms
+-----------------
+``hepconvert.add_histograms`` adds the values of many histograms
+and writes the summed histograms to an output file (like ROOT's hadd, but limited
+to histograms).
+
+
+**Parameters of note:**
+
+``union`` If True, adds the histograms that have the same name and appends all others
+to the new file.
+
+``append`` If True, appends histograms to an existing file. Force and append
+cannot both be True.
+
+``same_names`` If True, only adds together histograms which have the same name (key). If False,
+histograms are added together based on TTree structure (bins must be equal).
+
+Memory:
+``add_histograms`` has no memory customization available currently. To maintain
+performance it stores the summed histograms in memory until all files have
+been read, then the summed histograms are written to the output file. Only
+one input ROOT file is read and kept in memory at a time.
+
+
+Merging TTrees
+--------------
+``hepconvert.merge_root`` merges TTrees in multiple ROOT files together. The end result is a single file containing data from all input files (again like ROOT's hadd, but can handle flat TTrees and histograms).
+
+.. warning::
+ At the moment, hepconvert.merge can only merge TTrees that have the same
+ number of branches, with the same names and datatypes.
+ We are working on adding backfill capabilities for mismatched TTrees.
+
+**Features:**
+merge_root has parameters ``cut``, ``expressions``, ``drop_branches``, ``keep_branches``, ``drop_trees`` and ``keep_trees``.
+
+
+Copying TTrees
+--------------
+``hepconvert.copy_root`` copies the TTrees of one ROOT file to a new file.
+
+.. warning::
+ At the moment, hepconvert.merge can only merge TTrees that have the same
+ number of branches, with the same names and datatypes.
+ We are working on adding backfill capabilities for mismatched TTrees.
+
+**Features:**
+copy_root has parameters ``cut``, ``expressions``, ``drop_branches``, ``keep_branches``, ``drop_trees`` and ``keep_trees``.
+
+
+Parquet to ROOT
+---------------
+
+Writes the data from a single Parquet file to one TTree in a ROOT file.
+This function creates a new TTree (name the new tree with parameter ``tree``).
+
+
+ROOT to Parquet
+---------------
+
+Writes the data from one TTree in a ROOT file to a single Parquet file.
+If there are multiple TTrees in the file, specify one TTree to write to the Parquet file using the ``tree`` parameter.
+
+**Features:**
+root_to_parquet has parameters ``cut``, ``expressions``, ``drop_branches``, ``keep_branches``.
diff --git a/docs/source/guide.toctree b/docs/source/guide.toctree
new file mode 100644
index 0000000..ec2445f
--- /dev/null
+++ b/docs/source/guide.toctree
@@ -0,0 +1,5 @@
+.. toctree::
+ :caption: Guide with Examples
+ :hidden:
+
+ general_guide
diff --git a/docs/source/hepconvert.add_histograms.rst b/docs/source/hepconvert.add_histograms.rst
new file mode 100644
index 0000000..0a8af51
--- /dev/null
+++ b/docs/source/hepconvert.add_histograms.rst
@@ -0,0 +1,6 @@
+hepconvert.add_histograms
+=========================
+
+Defined in `hepconvert.histogram_adding `__ on `line 345 `__.
+
+.. autofunction:: hepconvert.add_histograms
diff --git a/docs/source/hepconvert.copy_root.copy_root.rst b/docs/source/hepconvert.copy_root.copy_root.rst
deleted file mode 100644
index f5e6d40..0000000
--- a/docs/source/hepconvert.copy_root.copy_root.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-hepconvert.copy_root
-====================
-
-Defined in `hepconvert.copy_root `__ on `line 13 `__.
-
-.. autofunction:: hepconvert.copy_root.copy_root
diff --git a/docs/source/hepconvert.copy_root.rst b/docs/source/hepconvert.copy_root.rst
new file mode 100644
index 0000000..ff9d871
--- /dev/null
+++ b/docs/source/hepconvert.copy_root.rst
@@ -0,0 +1,6 @@
+hepconvert.copy_root
+====================
+
+Defined in `hepconvert.copy_root `__ on `line 15 `__.
+
+.. autofunction:: hepconvert.copy_root
diff --git a/docs/source/hepconvert.copy_root.toctree b/docs/source/hepconvert.copy_root.toctree
index 268bc9a..a6c3308 100644
--- a/docs/source/hepconvert.copy_root.toctree
+++ b/docs/source/hepconvert.copy_root.toctree
@@ -2,5 +2,4 @@
:caption: copy_root
:hidden:
- hepconvert.copy_root (module)
- hepconvert.copy_root.copy_root
+ hepconvert.copy_root
diff --git a/docs/source/hepconvert.histogram_adding.add_histograms.rst b/docs/source/hepconvert.histogram_adding.add_histograms.rst
deleted file mode 100644
index 38e33a9..0000000
--- a/docs/source/hepconvert.histogram_adding.add_histograms.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-hepconvert.add_histograms
-=========================
-
-Defined in `hepconvert.histogram_adding `__ on `line 374 `__.
-
-.. autofunction:: hepconvert.histogram_adding.add_histograms
diff --git a/docs/source/hepconvert.histogram_adding.toctree b/docs/source/hepconvert.histogram_adding.toctree
index 02bc725..c8f63e5 100644
--- a/docs/source/hepconvert.histogram_adding.toctree
+++ b/docs/source/hepconvert.histogram_adding.toctree
@@ -2,5 +2,4 @@
:caption: histogram_adding
:hidden:
- hepconvert.histogram_adding (module)
- hepconvert.histogram_adding.add_histograms
+ hepconvert.add_histograms
diff --git a/docs/source/hepconvert.merge.merge_root.rst b/docs/source/hepconvert.merge.merge_root.rst
deleted file mode 100644
index 314985e..0000000
--- a/docs/source/hepconvert.merge.merge_root.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-hepconvert.merge_root
-=====================
-
-Defined in `hepconvert.merge `__ on `line 11 `__.
-
-.. autofunction:: hepconvert.merge.merge_root
diff --git a/docs/source/hepconvert.merge.toctree b/docs/source/hepconvert.merge.toctree
index 6406e9a..272c6d7 100644
--- a/docs/source/hepconvert.merge.toctree
+++ b/docs/source/hepconvert.merge.toctree
@@ -2,5 +2,4 @@
:caption: merge
:hidden:
- hepconvert.merge (module)
- hepconvert.merge.merge_root
+ hepconvert.merge_root
diff --git a/docs/source/hepconvert.merge_root.rst b/docs/source/hepconvert.merge_root.rst
new file mode 100644
index 0000000..74b82c7
--- /dev/null
+++ b/docs/source/hepconvert.merge_root.rst
@@ -0,0 +1,6 @@
+hepconvert.merge_root
+=====================
+
+Defined in `hepconvert.merge `__ on `line 17 `__.
+
+.. autofunction:: hepconvert.merge_root
diff --git a/docs/source/hepconvert.parquet_to_root.parquet_to_root.rst b/docs/source/hepconvert.parquet_to_root.parquet_to_root.rst
deleted file mode 100644
index 58a1e31..0000000
--- a/docs/source/hepconvert.parquet_to_root.parquet_to_root.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-hepconvert.parquet_to_root
-==========================
-
-Defined in `hepconvert.parquet_to_root `__ on `line 9 `__.
-
-.. autofunction:: hepconvert.parquet_to_root.parquet_to_root
diff --git a/docs/source/hepconvert.parquet_to_root.rst b/docs/source/hepconvert.parquet_to_root.rst
new file mode 100644
index 0000000..338b8bb
--- /dev/null
+++ b/docs/source/hepconvert.parquet_to_root.rst
@@ -0,0 +1,6 @@
+hepconvert.parquet_to_root
+==========================
+
+Defined in `hepconvert.parquet_to_root `__ on `line 11 `__.
+
+.. autofunction:: hepconvert.parquet_to_root
diff --git a/docs/source/hepconvert.parquet_to_root.toctree b/docs/source/hepconvert.parquet_to_root.toctree
index e26a68d..5889b70 100644
--- a/docs/source/hepconvert.parquet_to_root.toctree
+++ b/docs/source/hepconvert.parquet_to_root.toctree
@@ -2,5 +2,4 @@
:caption: parquet_to_root
:hidden:
- hepconvert.parquet_to_root (module)
- hepconvert.parquet_to_root.parquet_to_root
+ hepconvert.parquet_to_root
diff --git a/docs/source/hepconvert.root_to_parquet.root_to_parquet.rst b/docs/source/hepconvert.root_to_parquet.root_to_parquet.rst
deleted file mode 100644
index 24d7b04..0000000
--- a/docs/source/hepconvert.root_to_parquet.root_to_parquet.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-hepconvert.root_to_parquet
-==========================
-
-Defined in `hepconvert.root_to_parquet `__ on `line 9 `__.
-
-.. autofunction:: hepconvert.root_to_parquet.root_to_parquet
diff --git a/docs/source/hepconvert.root_to_parquet.rst b/docs/source/hepconvert.root_to_parquet.rst
new file mode 100644
index 0000000..eeaee89
--- /dev/null
+++ b/docs/source/hepconvert.root_to_parquet.rst
@@ -0,0 +1,6 @@
+hepconvert.root_to_parquet
+==========================
+
+Defined in `hepconvert.root_to_parquet `__ on `line 9 `__.
+
+.. autofunction:: hepconvert.root_to_parquet
diff --git a/docs/source/hepconvert.root_to_parquet.toctree b/docs/source/hepconvert.root_to_parquet.toctree
index ea852d6..b0a7fab 100644
--- a/docs/source/hepconvert.root_to_parquet.toctree
+++ b/docs/source/hepconvert.root_to_parquet.toctree
@@ -2,5 +2,4 @@
:caption: root_to_parquet
:hidden:
- hepconvert.root_to_parquet (module)
- hepconvert.root_to_parquet.root_to_parquet
+ hepconvert.root_to_parquet
diff --git a/docs/source/hepconvert.toctree b/docs/source/hepconvert.toctree
index f04ab24..43dfbae 100644
--- a/docs/source/hepconvert.toctree
+++ b/docs/source/hepconvert.toctree
@@ -1,6 +1,3 @@
.. toctree::
:caption: Detailed Reference
:hidden:
-
- hepconvert (module)
- hepconvert.merge.merge_root
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 41b4c42..513ad1e 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -4,11 +4,18 @@
contain the root `toctree` directive.
.. include:: main.toctree
-
+.. include:: guide.toctree
+.. include:: cli.toctree
.. toctree::
:caption: Modules
:hidden:
+.. image:: https://raw.githubusercontent.com/scikit-hep/hepconvert/main/docs/docs-img/hepconvert_logo.svg
+ :width: 450px
+ :alt: hepconvert
+ :target: https://github.com/scikit-hep/hepconvert
+
+|
Welcome to hepconvert's documentation!
======================================
diff --git a/docs/source/main.toctree b/docs/source/main.toctree
index 973542b..3dde1c6 100644
--- a/docs/source/main.toctree
+++ b/docs/source/main.toctree
@@ -2,8 +2,8 @@
:caption: Main Interface
:hidden:
- hepconvert.parquet_to_root.parquet_to_root
- hepconvert.root_to_parquet.root_to_parquet
- hepconvert.copy_root.copy_root
- hepconvert.merge.merge_root
- hepconvert.histogram_adding.add_histograms
+ hepconvert.parquet_to_root
+ hepconvert.root_to_parquet
+ hepconvert.copy_root
+ hepconvert.merge_root
+ hepconvert.add_histograms
diff --git a/docs/source/merge_root.rst b/docs/source/merge_root.rst
new file mode 100644
index 0000000..57ce9f8
--- /dev/null
+++ b/docs/source/merge_root.rst
@@ -0,0 +1,64 @@
+Command Line Interface Guide: merge_root
+========================================
+
+Instructions for function `hepconvert.merge_root `__.
+
+Command:
+--------
+
+.. code-block:: bash
+
+ hepconvert merge-root [options] [OUT_FILE] [IN_FILES]
+
+
+Examples:
+---------
+
+.. code-block:: bash
+
+ hepconvert merge-root -f --progress-bar --keep-branches 'Jet_*' out_file.root file1.root file2.root file3.root
+
+
+Or with files in a directory:
+
+.. code-block:: bash
+
+    hepconvert merge-root -f --progress-bar --drop-branches "{'tree1': ['branch1', 'branch2']}" out_file.root directory/in_files/
+
+
+Branch skimming using ``cut``:
+
+.. code-block:: bash
+
+ hepconvert merge-root -f --keep-branches 'Jet_*' --cut 'Jet_Px > 5' out_file.root directory/in_files
+
+Options:
+--------
+
+``--drop-branches``, ``--keep-branches``, ``-db`` or ``-kb`` (list, str or dict) Specify branch names to remove from the ROOT file. Either a str, list of str (for multiple branches), or a dict with form {'tree': 'branches'} to remove branches from certain ttrees. Wildcarding accepted.
+
+``--drop-trees``, ``--keep-trees`` (list of str, or str) Specify tree names to remove/keep TTrees in the ROOT files. Wildcarding accepted.
+
+``--cut`` For branch skimming, passed to `uproot.iterate `__. str, if not None, this expression filters all of the expressions.
+
+``--expressions`` For branch skimming, passed to `uproot.iterate `__. Names of TBranches or aliases to convert to arrays or mathematical expressions of them. If None, all TBranches selected by the filters are included.
+
+``--force``, ``-f`` Use flag to overwrite a file if it already exists.
+
+``--progress-bar`` Will show a basic progress bar to show how many TTrees have been merged and written.
+
+``--append``, ``-a`` Will append new TTree to an existing file.
+
+``--compression``, ``-c`` Compression type. Options are "lzma", "zlib", "lz4", and "zstd". Default is "zlib".
+
+``--compression-level`` Level of compression set by an integer. Default is 1.
+
+``--name`` Give a name to the new TTree. Default is "tree".
+
+``--title`` Give a title to the new TTree.
+
+``--initial-basket-capacity`` (int) Number of TBaskets that can be written to the TTree without rewriting the TTree metadata to make room. Default is 10.
+
+``--resize-factor`` (float) When the TTree metadata needs to be rewritten, this specifies how many more TBasket slots to allocate as a multiplicative factor. Default is 10.0.
+
+``--step-size`` (str or int) Size of batches of data to read and write. If an integer, the maximum number of entries to include in each iteration step; if a string, the maximum memory size to include. The string must be a number followed by a memory unit, such as “100 MB”. Default is "100 MB"
diff --git a/docs/source/parquet_to_root.rst b/docs/source/parquet_to_root.rst
new file mode 100644
index 0000000..bba507b
--- /dev/null
+++ b/docs/source/parquet_to_root.rst
@@ -0,0 +1,37 @@
+Command Line Interface Guide: parquet_to_root
+=============================================
+
+Instructions for function `parquet_to_root `__.
+
+.. code-block:: bash
+
+ hepconvert parquet-to-root [options] [OUT_FILE] [IN_FILE]
+
+Example:
+
+.. code-block:: bash
+
+ hepconvert parquet-to-root -f --progress-bar True --name new_tree out_file.root in_file.parquet
+
+This will write the data from a Parquet file to a flat TTree with the name "new_tree".
+
+Options:
+--------
+
+``--force``, ``-f`` Use flag to overwrite a file if it already exists.
+
+``--progress-bar`` Will create a basic progress bar to show how many row-groups have been written.
+
+``--append`` Will append new TTree to an existing file.
+
+``--compression``, ``-c`` Compression type. Options are "lzma", "zlib", "lz4", and "zstd". Default is "zlib".
+
+``--compression-level`` Level of compression set by an integer. Default is 1.
+
+``--name`` Give a name to the new TTree. Default is "tree".
+
+``--title`` Give a title to the new TTree.
+
+``--initial-basket-capacity`` (int) Number of TBaskets that can be written to the TTree without rewriting the TTree metadata to make room. Default is 10.
+
+``--resize-factor`` (float) When the TTree metadata needs to be rewritten, this specifies how many more TBasket slots to allocate as a multiplicative factor. Default is 10.0.
diff --git a/docs/source/prepare_docstrings.py b/docs/source/prepare_docstrings.py
index c6bc1c9..91096f2 100644
--- a/docs/source/prepare_docstrings.py
+++ b/docs/source/prepare_docstrings.py
@@ -19,11 +19,11 @@
]
common = [
- "hepconvert.parquet_to_root.parquet_to_root",
- "hepconvert.root_to_parquet.root_to_parquet",
- "hepconvert.copy_root.copy_root",
- "hepconvert.merge.merge_root",
- "hepconvert.histogram_adding.add_histograms",
+ "hepconvert.parquet_to_root",
+ "hepconvert.root_to_parquet",
+ "hepconvert.copy_root",
+ "hepconvert.merge_root",
+ "hepconvert.add_histograms",
]
latest_commit = (
@@ -70,11 +70,11 @@ def handle_module(modulename, module):
.. automodule:: {0}
""".format(modulename, "=" * len(modulename))
- ensure(modulename + ".rst", content)
- if toctree2 is None:
- toctree.write(" " + modulename + " (module) <" + modulename + ">\n")
- else:
- toctree2.write(" " + modulename + " (module) <" + modulename + ">\n")
+ # ensure(modulename + ".rst", content)
+ # if toctree2 is None:
+ # toctree.write(" " + modulename + " (module) <" + modulename + ">\n")
+ # else:
+ # toctree2.write(" " + modulename + " (module) <" + modulename + ">\n")
if modulename != "hepconvert" and all(
not x.startswith("test") and not x.startswith("_")
@@ -100,7 +100,8 @@ def line_order(pair):
if inspect.isclass(obj):
handle_class(modulename + "." + name, obj)
elif inspect.isfunction(obj):
- handle_function(modulename + "." + name, obj)
+ ensure("hepconvert." + name + ".rst", content)
+ handle_function("hepconvert." + name, obj)
def handle_class(classname, cls):
@@ -223,7 +224,6 @@ def prettymro(c):
classname,
"\n".join([text for index, line, text in sorted(methods.values())]),
)
-
ensure(classname + ".rst", content)
if upfront or toctree2 is None:
if classname not in common:
diff --git a/docs/source/root_to_parquet.rst b/docs/source/root_to_parquet.rst
new file mode 100644
index 0000000..97b9d11
--- /dev/null
+++ b/docs/source/root_to_parquet.rst
@@ -0,0 +1,160 @@
+Command Line Interface Guide: root_to_parquet
+=============================================
+
+Instructions for function `hepconvert.root_to_parquet `__
+
+Command:
+--------
+
+.. code-block:: bash
+
+ hepconvert root-to-parquet [options] [OUT_FILE] [IN_FILE]
+
+
+Examples:
+---------
+
+.. code-block:: bash
+
+ hepconvert root-to-parquet -f --progress-bar --tree 'tree1' out_file.parquet in_file.root
+
+
+Options:
+--------
+
+``--tree`` (str) If there are multiple TTrees in the input file, specify the name of the TTree to copy.
+
+``--drop-branches``, ``-db``, and ``--keep-branches``, ``-kb`` (list) str or dict Specify branch names to remove from the ROOT file. Either a str, list of str (for multiple branches), or a dict with form {'tree': 'branches'} to remove branches from certain ttrees. Wildcarding accepted.
+
+``--cut`` For branch skimming, passed to `uproot.iterate `__. str, if not None, this expression filters all of the expressions.
+
+``--expressions`` For branch skimming, passed to `uproot.iterate `__. Names of TBranches or aliases to convert to arrays or mathematical expressions of them. If None, all TBranches selected by the filters are included.
+
+``--force`` or ``-f`` Use flag to overwrite a file if it already exists.
+
+``--step-size`` (int) Size of batches of data to read and write. If an integer, the maximum number of entries to include in each iteration step; if a string, the maximum memory size to include. The string must be a number followed by a memory unit, such as “100 MB”. Default is "100 MB"
+
+``--compression`` or ``-c`` (str) Compression type. Options are "lzma", "zlib", "lz4", and "zstd". Default is "zlib".
+
+``--compression-level`` (int) Level of compression set by an integer. Default is 1.
+
+Options passed to `ak.to_parquet `__:
+----------------------------------------------------------------------------------------------------------------
+
+``--list-to32`` (bool) If True, convert Awkward lists into 32-bit Arrow lists
+if they're small enough, even if it means an extra conversion. Otherwise,
+signed 32-bit **ak.types.ListType** maps to Arrow `ListType`,
+signed 64-bit **ak.types.ListType** maps to Arrow `LargeListType`,
+and unsigned 32-bit **ak.types.ListType** picks whichever Arrow type its
+values fit into.
+
+``--string-to32`` (bool) Same as the above for Arrow `string` and `large_string`.
+
+``--bytestring-to32`` (bool) Same as the above for Arrow `binary` and `large_binary`.
+
+``--emptyarray-to`` (None or dtype) If None, **ak.types.UnknownType** maps to Arrow's
+null type; otherwise, it is converted to a given numeric dtype.
+
+``--categorical-as-dictionary`` (bool) If True, **ak.contents.IndexedArray** and
+#ak.contents.IndexedOptionArray labeled with `__array__ = "categorical"`
+are mapped to Arrow `DictionaryArray`; otherwise, the projection is
+evaluated before conversion (always the case without
+`__array__ = "categorical"`).
+
+``--extensionarray`` (bool) If True, this function returns extended Arrow arrays
+(at all levels of nesting), which preserve metadata so that Awkward to
+Arrow to Awkward preserves the array's **ak.types.Type** (though not
+the #ak.forms.Form). If False, this function returns generic Arrow arrays
+that might be needed for third-party tools that don't recognize Arrow's
+extensions. Even with `extensionarray=False`, the values produced by
+Arrow's `to_pylist` method are the same as the values produced by Awkward's
+#ak.to_list.
+
+``--count-nulls`` (bool) If True, count the number of missing values at each level
+and include these in the resulting Arrow array, which makes some downstream
+applications faster. If False, skip the up-front cost of counting them.
+
+``-c`` or ``--compression`` (None, str, or dict) Compression algorithm name, passed to
+`pyarrow.parquet.ParquetWriter `__.
+Parquet supports `{"NONE", "SNAPPY", "GZIP", "BROTLI", "LZ4", "ZSTD"}`
+(where `"GZIP"` is also known as "zlib" or "deflate"). If a dict, the keys
+are column names (the same column names that #ak.forms.Form.columns returns
+and #ak.forms.Form.select_columns accepts) and the values are compression
+algorithm names, to compress each column differently.
+
+``--compression-level`` (None, int, or dict None) Compression level, passed to
+`pyarrow.parquet.ParquetWriter `__.
+Compression levels have different meanings for different compression
+algorithms: GZIP ranges from 1 to 9, but ZSTD ranges from -7 to 22, for
+example. Generally, higher numbers provide slower but smaller compression.
+
+``--row-group-size`` (int or None) Will be overwritten by ``step_size``.
+
+``--data-page-size`` (None or int) Number of bytes in each data page, passed to
+`pyarrow.parquet.ParquetWriter `__.
+If None, the Parquet default of 1 MiB is used.
+
+``--parquet-flavor`` (None or `"spark"`) If None, the output Parquet file will follow
+Arrow conventions; if `"spark"`, it will follow Spark conventions. Some
+systems, such as Spark and Google BigQuery, might need Spark conventions,
+while others might need Arrow conventions. Passed to
+`pyarrow.parquet.ParquetWriter `__.
+as `flavor`.
+
+``--parquet-version`` (`"1.0"`, `"2.4"`, or `"2.6"`) Parquet file format version.
+Passed to `pyarrow.parquet.ParquetWriter `__.
+as `version`.
+
+``--parquet-page-version`` (`"1.0"` or `"2.0"`) Parquet page format version.
+Passed to `pyarrow.parquet.ParquetWriter `__.
+as `data_page_version`.
+
+``--parquet-metadata-statistics`` (bool or dict) If True, include summary
+statistics for each data page in the Parquet metadata, which lets some
+applications search for data more quickly (by skipping pages). If a dict
+mapping column names to bool, include summary statistics on only the
+specified columns. Passed to
+`pyarrow.parquet.ParquetWriter `__.
+as `write_statistics`.
+
+``--parquet-dictionary-encoding`` (bool or dict) If True, allow Parquet to pre-compress
+with dictionary encoding. If a dict mapping column names to bool, only
+use dictionary encoding on the specified columns. Passed to
+`pyarrow.parquet.ParquetWriter `__.
+as `use_dictionary`.
+
+``--parquet-byte-stream-split`` (bool or dict) If True, pre-compress floating
+point fields (`float32` or `float64`) with byte stream splitting, which
+collects all mantissas in one part of the stream and exponents in another.
+Passed to `pyarrow.parquet.ParquetWriter <https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html>`__.
+as `use_byte_stream_split`.
+
+``--parquet-coerce-timestamps`` (None, `"ms"`, or `"us"`) If None, any timestamps
+(`datetime64` data) are coerced to a given resolution depending on
+`parquet_version`: version `"1.0"` and `"2.4"` are coerced to microseconds,
+but later versions use the `datetime64`'s own units. If `"ms"` is explicitly
+specified, timestamps are coerced to milliseconds; if `"us"`, microseconds.
+Passed to `pyarrow.parquet.ParquetWriter `__.
+as `coerce_timestamps`.
+
+``--parquet-old-int96-timestamps`` (None or bool) If True, use Parquet's INT96 format
+for any timestamps (`datetime64` data), taking priority over `parquet_coerce_timestamps`.
+If None, let the `parquet_flavor` decide. Passed to
+`pyarrow.parquet.ParquetWriter `__
+as `use_deprecated_int96_timestamps`.
+
+``--parquet-compliant-nested`` (bool) If True, use the Spark/BigQuery/Parquet
+`convention for nested lists `__,
+in which each list is a one-field record with field name "`element`";
+otherwise, use the Arrow convention, in which the field name is "`item`".
+Passed to `pyarrow.parquet.ParquetWriter `__
+as `use_compliant_nested_type`.
+
+``--parquet-extra-options`` (None or dict)
+Any additional options to pass to
+`pyarrow.parquet.ParquetWriter `__.
+
+``--storage-options`` (None or dict)
+Any additional options to pass to
+`fsspec.core.url_to_fs `__
+to open a remote file for writing.
diff --git a/src/hepconvert/__main__.py b/src/hepconvert/__main__.py
index 500f5d2..1341247 100644
--- a/src/hepconvert/__main__.py
+++ b/src/hepconvert/__main__.py
@@ -9,16 +9,16 @@
def main() -> None:
"""
Must provide a subcommand:
- parquet-to-root, root-to-parquet, copy-root, add-and-merge, or add
+ parquet-to-root, root-to-parquet, copy-root, merge, or add
"""
@main.command()
@click.argument("destination", type=click.Path())
@click.argument("file")
-@click.option("--progress-bar", default=None, type=bool, required=False)
-@click.option("--name", required=False, default="")
-@click.option("--title", required=False, default="")
+@click.option("--progress-bar", is_flag=True)
+@click.option("--name", type=str, required=False, default="")
+@click.option("--title", type=str, required=False, default="")
@click.option(
"--initial-basket-capacity",
default=10,
@@ -30,6 +30,7 @@ def main() -> None:
help="When the TTree metadata needs to be rewritten, this specifies how many more TBasket slots to allocate as a multiplicative factor.",
)
@click.option(
+ "-c",
"--compression",
default="zlib",
help='Sets compression level for root file to write to. Can be one of "ZLIB", "LZMA", "LZ4", or "ZSTD". By default the compression algorithm is "LZ4".',
@@ -51,7 +52,7 @@ def parquet_to_root(
*,
name="tree",
branch_types=None,
- progress_bar=False,
+ progress_bar,
title="",
field_name=lambda outer, inner: inner if outer == "" else outer + "_" + inner,
initial_basket_capacity=10,
@@ -87,17 +88,36 @@ def parquet_to_root(
@click.argument("destination", type=click.Path())
@click.argument("file")
@click.option(
- "--drop-branches", "-db", default=None, type=list or dict or str, required=False
+ "-db",
+ "--drop-branches",
+ default=None,
+ type=list or dict or str,
+ required=False,
+ help="Specify branch names to remove from the ROOT file. Either a str, list of str (for multiple branches), or a dict with form {'tree': 'branches'} to remove branches from certain ttrees. Wildcarding accepted.",
+)
+@click.option(
+ "-kb", "--keep-branches", default=None, type=list or dict or str, required=False
+)
+@click.option(
+ "-dt",
+ "--drop-trees",
+ default=None,
+ type=list or str,
+ required=False,
+ help="Specify tree names to remove from the ROOT file. Wildcarding accepted.",
)
@click.option(
- "--keep-branches", "-kb", default=None, type=list or dict or str, required=False
+ "-kt",
+ "--keep-trees",
+ default=None,
+ type=list or str,
+ required=False,
+ help="Specify tree names to keep in the ROOT file. All others will be removed. Wildcarding accepted.",
)
-@click.option("--drop-trees", "-dt", default=None, type=list or str, required=False)
-@click.option("--keep-trees", "kt", default=None, type=list or str, required=False)
-@click.option("--progress-bar", default=None, type=bool, required=False)
+@click.option("--progress-bar", is_flag=True)
@click.option("--cut", default=None, type=str or list, required=False)
@click.option("--expressions", default=None, type=str or list, required=False)
-@click.option("--title", required=False, default="")
+@click.option("--title", type=str, required=False, default="")
@click.option(
"--initial-basket-capacity",
default=10,
@@ -124,7 +144,7 @@ def copy_root(
keep_trees=None,
cut=None,
expressions=None,
- progress_bar=None,
+ progress_bar,
force,
title="",
field_name=lambda outer, inner: inner if outer == "" else outer + "_" + inner,
@@ -171,48 +191,53 @@ def copy_root(
is_flag=True,
help="Overwrite destination file if it already exists",
)
-@click.option("--progress-bar", default=None, type=bool, required=False)
-@click.option("--append", default=False, help="Append histograms to an existing file")
+@click.option("--progress-bar", is_flag=True)
+@click.option(
+ "-a", "--append", is_flag=True, help="Append histograms to an existing file"
+)
@click.option(
+ "-c",
"--compression",
default="zlib",
- help='Sets compression level for root file to write to. Can be one of "ZLIB", "LZMA", "LZ4", or "ZSTD". By default the compression algorithm is "LZ4".',
+ type=str,
+ help='Sets compression level for root file to write to. Can be one of "ZLIB", "LZMA", "LZ4", or "ZSTD". By default the compression algorithm is "ZLIB".',
)
@click.option(
"--compression-level",
default=1,
+ type=int,
help="Use a compression level particular to the chosen compressor. By default the compression level is 1.",
)
@click.option(
"--skip-bad-files",
- default=False,
+ is_flag=True,
help="Skip corrupt or non-existent files without exiting",
)
@click.option(
"--union",
- default=True,
+ is_flag=True,
help="Adds the histograms that have the same name and appends all others to the new file",
)
@click.option(
"--same-names",
- default=False,
+ is_flag=True,
help="Only adds histograms together if they have the same name",
)
def add(
destination,
files,
*,
- progress_bar=False,
+ progress_bar,
force,
- append=False,
+ append,
compression="zlib",
compression_level=1,
- skip_bad_files=False,
- union=True,
- same_names=False,
+ skip_bad_files,
+ union,
+ same_names,
):
"""
- Hadd files.
+ Sums histograms and writes them to a new file.
"""
import hepconvert.histogram_adding # pylint: disable=import-outside-toplevel
@@ -246,21 +271,51 @@ def add(
)
@click.option(
"--step-size",
- default=100,
+ default="100 MB",
+ type=int or str,
help="If an integer, the maximum number of entries to include in each iteration step; if a string, the maximum memory size to include. The string must be a number followed by a memory unit, such as “100 MB”.",
)
-@click.option("--drop-branches", default=None, type=list or dict or str, required=False)
-@click.option("--keep-branches", default=None, type=list or dict or str, required=False)
-@click.option("--drop-trees", default=None, type=list or str, required=False)
-@click.option("--keep-trees", default=None, type=list or str, required=False)
-@click.option("--progress-bar", default=None, type=bool, required=False)
+@click.option(
+ "-db",
+ "--drop-branches",
+ default=None,
+ type=list or dict or str,
+ required=False,
+ help="Specify branch names to remove from the ROOT file. Either a str, list of str (for multiple branches), or a dict with form {'tree': 'branches'} to remove branches from certain ttrees. Wildcarding accepted.",
+)
+@click.option(
+ "-kb", "--keep-branches", default=None, type=list or dict or str, required=False
+)
+@click.option(
+ "-dt",
+ "--drop-trees",
+ default=None,
+ type=list or str,
+ required=False,
+ help="Specify tree names to remove from the ROOT file. Wildcarding accepted.",
+)
+@click.option(
+ "-kt",
+ "--keep-trees",
+ default=None,
+ type=list or str,
+ required=False,
+ help="Specify tree names to keep in the ROOT file. Wildcarding accepted.",
+)
+@click.option("--progress-bar", is_flag=True)
@click.option("--cut", default=None, type=str or list, required=False)
@click.option("--expressions", default=None, type=str or list, required=False)
@click.option(
- "--force", is_flag=True, help="Overwrite destination file if it already exists"
+ "-f",
+ "--force",
+ is_flag=True,
+ help="Overwrite destination file if it already exists",
+)
+@click.option(
+ "-a", "--append", is_flag=True, help="Append histograms to an existing file"
)
-@click.option("--append", default=False, help="Append histograms to an existing file")
@click.option(
+ "-c",
"--compression",
default="zlib",
help='Sets compression level for root file to write to. Can be one of "ZLIB", "LZMA", "LZ4", or "ZSTD". By default the compression algorithm is "LZ4".',
@@ -288,13 +343,13 @@ def merge_root(
keep_trees=None,
cut=None,
expressions=None,
- progress_bar=False,
+ progress_bar,
initial_basket_capacity=10,
resize_factor=10.0,
counter_name=lambda counted: "n" + counted,
step_size="100 MB",
force,
- append=False,
+ append,
compression="LZ4",
compression_level=1,
skip_bad_files=False,
@@ -339,6 +394,24 @@ def merge_root(
type=bool,
help="Specify the name of a tree to write to Parquet, if there are multiple trees in the ROOT file.",
)
+@click.option(
+ "-db",
+ "--drop-branches",
+ default=None,
+ type=list or dict or str,
+ required=False,
+ help="Specify branch names to remove from the ROOT file. Either a str, list of str (for multiple branches), or a dict with form {'tree': 'branches'} to remove branches from certain ttrees. Wildcarding accepted.",
+)
+@click.option(
+ "-kb",
+ "--keep-branches",
+ default=None,
+ type=list or dict or str,
+ required=False,
+ help="Specify branch names to keep in the ROOT file. Either a str, list of str (for multiple branches), or a dict with form {'tree': 'branches'} to keep only certain branches in certain ttrees. Wildcarding accepted.",
+)
+@click.option("--cut", default=None, type=str or list, required=False)
+@click.option("--expressions", default=None, type=str or list, required=False)
@click.option(
"-f",
"--force",
@@ -349,6 +422,7 @@ def merge_root(
@click.option(
"-s",
"--step-size",
+ type=int or str,
default="100 MB",
help="Specify batch size for reading ROOT file. If an integer, the maximum number of entries to include in each iteration step; if a string, the maximum memory size to include.",
)
@@ -393,6 +467,7 @@ def merge_root(
help="Count the number of missing values at each level and include these in the resulting Arrow array, which makes some downstream applications faster. If False, skip the up-front cost of counting them.",
)
@click.option(
+ "-c",
"--compression",
default=False,
type=bool,
@@ -480,6 +555,13 @@ def root_to_parquet(
in_file=None,
out_file=None,
*,
+ tree=None,
+ drop_branches=None,
+ keep_branches=None,
+ cut=None,
+ expressions=None,
+ force=False,
+ step_size="100 MB",
list_to32=False,
string_to32=True,
bytestring_to32=True,
@@ -502,9 +584,6 @@ def root_to_parquet(
parquet_compliant_nested=False,
parquet_extra_options=None,
storage_options=None,
- tree=None,
- force,
- step_size=100,
):
"""
Convert ROOT to Parquet.
@@ -514,6 +593,13 @@ def root_to_parquet(
hepconvert.root_to_parquet(
in_file=in_file,
out_file=out_file,
+ tree=tree,
+ drop_branches=drop_branches,
+ keep_branches=keep_branches,
+ cut=cut,
+ expressions=expressions,
+ force=force,
+ step_size=step_size,
list_to32=list_to32,
string_to32=string_to32,
bytestring_to32=bytestring_to32,
@@ -536,9 +622,6 @@ def root_to_parquet(
parquet_compliant_nested=parquet_compliant_nested,
parquet_extra_options=parquet_extra_options,
storage_options=storage_options,
- tree=tree,
- force=force,
- step_size=step_size,
)
diff --git a/src/hepconvert/copy_root.py b/src/hepconvert/copy_root.py
index 321adc7..e59a7e3 100644
--- a/src/hepconvert/copy_root.py
+++ b/src/hepconvert/copy_root.py
@@ -18,7 +18,7 @@ def copy_root(
*,
keep_branches=None,
drop_branches=None,
- # add_branches=None, #TO-DO: add functionality for this, just specify about the counter issue
+ # add_branches=None, #TODO: add functionality for this, just specify about the counter issue?
keep_trees=None,
drop_trees=None,
cut=None,
@@ -26,7 +26,7 @@ def copy_root(
progress_bar=None,
force=False,
fieldname_separator="_",
- # fix_duplicate_counters=False, #TO-DO: ask about this?
+ # fix_duplicate_counters=False, #TODO: ask about this?
title="",
field_name=lambda outer, inner: inner if outer == "" else outer + "_" + inner,
initial_basket_capacity=10,
@@ -112,7 +112,9 @@ def copy_root(
--------------------------
This function can be run from the command line. Use command
- >>> hepconvert copy-root [options] [OUT_FILE] [IN_FILE]
+ .. code-block:: bash
+
+ hepconvert copy-root [options] [OUT_FILE] [IN_FILE]
"""
if compression in ("ZLIB", "zlib"):
@@ -214,11 +216,10 @@ def copy_root(
)
raise ValueError(msg)
- if len(trees) > 1 and progress_bar:
+ if len(trees) > 1 and progress_bar is not False:
+ number_of_items = len(trees)
if progress_bar is True:
tqdm = _utils.check_tqdm()
- number_of_items = len(trees)
-
progress_bar = tqdm.tqdm(desc="Trees copied")
progress_bar.reset(total=number_of_items)
for t in trees:
@@ -279,6 +280,6 @@ def copy_root(
out_file[tree.name].extend(chunk)
except AssertionError:
msg = "Are the branch-names correct?"
- if len(trees) > 1 and progress_bar:
+ if len(trees) > 1 and progress_bar is not False:
progress_bar.update(n=1)
f.close()
diff --git a/src/hepconvert/histogram_adding.py b/src/hepconvert/histogram_adding.py
index 3cf8ae6..0d57450 100644
--- a/src/hepconvert/histogram_adding.py
+++ b/src/hepconvert/histogram_adding.py
@@ -397,7 +397,9 @@ def add_histograms(
--------------------------
This function can be run from the command line. Use command
- >>> hepconvert add [options] [OUT_FILE] [IN_FILES]
+ .. code-block:: bash
+
+ hepconvert add [options] [OUT_FILE] [IN_FILES]
"""
if compression in ("ZLIB", "zlib"):
@@ -449,12 +451,12 @@ def add_histograms(
with uproot.open(files[0]) as file:
keys = file.keys(filter_classname="TH[1|2|3][I|S|F|D|C]", cycle=False)
- if progress_bar:
+ if progress_bar is not False:
+ tqdm = _utils.check_tqdm()
file_bar = progress_bar
- hist_bar = progress_bar
+ hist_bar = tqdm.tqdm(desc="Histograms added")
+ number_of_items = len(files)
if progress_bar is True:
- tqdm = _utils.check_tqdm()
- number_of_items = len(files)
file_bar = tqdm.tqdm(desc="Files added")
hist_bar = tqdm.tqdm(desc="Histograms added")
diff --git a/src/hepconvert/merge.py b/src/hepconvert/merge.py
index febbdfa..66a0c70 100644
--- a/src/hepconvert/merge.py
+++ b/src/hepconvert/merge.py
@@ -109,7 +109,9 @@ def merge_root(
--------------------------
This function can be run from the command line. Use command
- >>> hepconvert merge [options] [OUT_FILE] [IN_FILES]
+ .. code-block:: bash
+
+ hepconvert merge [options] [OUT_FILE] [IN_FILES]
"""
@@ -243,7 +245,7 @@ def merge_root(
destination,
)
raise ValueError(msg)
- if progress_bar:
+ if progress_bar is not False:
if progress_bar is True:
tqdm = _utils.check_tqdm()
number_of_items = len(files)
@@ -306,7 +308,7 @@ def merge_root(
out_file[tree.name].extend(chunk)
except AssertionError:
msg = "TTrees must have the same structure to be merged. Are the branch_names correct?"
- if progress_bar:
+ if progress_bar is not False:
progress_bar.update(n=1)
f.close()
@@ -381,6 +383,6 @@ def merge_root(
for key in hist_keys:
out_file[key] = writable_hists[key]
- if progress_bar:
+ if progress_bar is not False:
progress_bar.update(n=1)
f.close()
diff --git a/src/hepconvert/parquet_to_root.py b/src/hepconvert/parquet_to_root.py
index 220680b..fc1a135 100644
--- a/src/hepconvert/parquet_to_root.py
+++ b/src/hepconvert/parquet_to_root.py
@@ -13,16 +13,17 @@ def parquet_to_root(
file,
*,
name="tree",
+ force=False,
branch_types=None,
progress_bar=False,
+ append=False,
title="",
field_name=lambda outer, inner: inner if outer == "" else outer + "_" + inner,
initial_basket_capacity=10,
counter_name=lambda counted: "n" + counted,
resize_factor=10.0,
- compression="zlib",
+ compression="ZLIB",
compression_level=1,
- force=True,
):
"""Converts a Parquet file into a ROOT file. Data is stored in one TTree, which has a name defined by argument ``name``.
@@ -66,7 +67,9 @@ def parquet_to_root(
--------------------------
This function can be run from the command line. Use command
- >>> hepconvert parquet-to-root [options] [OUT_FILE] [IN_FILE]
+ .. code-block:: bash
+
+ hepconvert parquet-to-root [options] [OUT_FILE] [IN_FILE]
"""
if compression in ("LZMA", "lzma"):
@@ -82,21 +85,35 @@ def parquet_to_root(
raise ValueError(msg)
path = Path(destination)
if Path.is_file(path) and not force:
- raise FileExistsError
+ msg = f"File {path} already exists. To overwrite it, set force=True."
+ raise FileExistsError(msg)
+ if append:
+ if Path.is_file(path):
+ out_file = uproot.update(
+ destination,
+ compression=uproot.compression.Compression.from_code_pair(
+ compression_code, compression_level
+ ),
+ )
+ else:
+ msg = "Cannot append to a non-existent file."
+ raise FileNotFoundError(msg)
+
+ else:
+ out_file = uproot.recreate(
+ destination,
+ compression=uproot.compression.Compression.from_code_pair(
+ compression_code, compression_level
+ ),
+ )
metadata = ak.metadata_from_parquet(file)
- if progress_bar:
+ if progress_bar is not False:
+ number_of_items = metadata["num_row_groups"]
if progress_bar is True:
- number_of_items = metadata["num_row_groups"]
tqdm = _utils.check_tqdm()
progress_bar = tqdm.tqdm(desc="Row-groups written")
progress_bar.reset(number_of_items)
- out_file = uproot.recreate(
- destination,
- compression=uproot.compression.Compression.from_code_pair(
- compression_code, compression_level
- ),
- )
chunk = ak.from_parquet(file, row_groups=[0])
if not branch_types:
diff --git a/src/hepconvert/root_to_parquet.py b/src/hepconvert/root_to_parquet.py
index a550ede..324c867 100644
--- a/src/hepconvert/root_to_parquet.py
+++ b/src/hepconvert/root_to_parquet.py
@@ -43,7 +43,7 @@ def root_to_parquet(
"""Converts ROOT to Parquet file using Uproot and awkward.to_parquet. Data read from 1 tree, converted to single Parquet file.
:param in_file: Local ROOT file to convert to Parquet. May contain glob patterns.
- :type in_file: str
+ :type in_file: path-like
:param out_file: Name of the output file or file path.
:type out_file: path-like
:param tree: If there are multiple trees in the ROOT file, specify the name of one to write to Parquet.
@@ -95,7 +95,7 @@ def root_to_parquet(
Command line option: ``--count-nulls``.
:type count_nulls: bool
:param compression: Compression algorithm name, passed to
- [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).
+ `pyarrow.parquet.ParquetWriter `__.
Parquet supports `{"NONE", "SNAPPY", "GZIP", "BROTLI", "LZ4", "ZSTD"}`
(where `"GZIP"` is also known as "zlib" or "deflate"). If a dict, the keys
are column names (the same column names that #ak.forms.Form.columns returns
@@ -103,33 +103,33 @@ def root_to_parquet(
algorithm names, to compress each column differently. Command line option: ``--compression``.
:type compression: None, str, or dict
:param compression_level: Compression level, passed to
- [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).
+ `pyarrow.parquet.ParquetWriter `__.
Compression levels have different meanings for different compression
algorithms: GZIP ranges from 1 to 9, but ZSTD ranges from -7 to 22, for
example. Generally, higher numbers provide slower but smaller compression. Command line option
``--compression-level``.
:type compression_level: None, int, or dict None
:param row_group_size: Maximum number of entries in each row group,
- passed to [pyarrow.parquet.ParquetWriter.write_table](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html#pyarrow.parquet.ParquetWriter.write_table).
+ passed to `pyarrow.parquet.ParquetWriter.write_table `__.
If None, the Parquet default of 64 MiB is used. Command line options: ``-rg`` or ``--row-group-size``.
:type row_group_size: int or None
:param data_page_size: Number of bytes in each data page, passed to
- [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).
+ `pyarrow.parquet.ParquetWriter `__.
If None, the Parquet default of 1 MiB is used. Command line option: ``--data-page-size``.
:type data_page_size: None or int
:param parquet_flavor: If None, the output Parquet file will follow
Arrow conventions; if `"spark"`, it will follow Spark conventions. Some
systems, such as Spark and Google BigQuery, might need Spark conventions,
while others might need Arrow conventions. Passed to
- [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).
+ `pyarrow.parquet.ParquetWriter `__.
as `flavor`. Command line option: ``--parquet-flavor``.
:type parquet_flavor: None or `"spark"`
:param parquet_version: Parquet file format version.
- Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).
+ Passed to `pyarrow.parquet.ParquetWriter `__.
as `version`. Command line option: ``--parquet-version``.
:type parquet_version: `"1.0"`, `"2.4"`, or `"2.6"`
:param parquet_page_version: Parquet page format version.
- Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).
+ Passed to `pyarrow.parquet.ParquetWriter `__.
as `data_page_version`. Command line option: ``--parquet-page-version``.
:type parquet_page_version: `"1.0"` or `"2.0"`
:param parquet_metadata_statistics: If True, include summary
@@ -137,19 +137,19 @@ def root_to_parquet(
applications search for data more quickly (by skipping pages). If a dict
mapping column names to bool, include summary statistics on only the
specified columns. Passed to
- [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).
+ `pyarrow.parquet.ParquetWriter `__.
as `write_statistics`. Command line option: ``--parquet-metadata-statistics``.
:type parquet_metadata_statistics: bool or dict
:param parquet_dictionary_encoding: If True, allow Parquet to pre-compress
with dictionary encoding. If a dict mapping column names to bool, only
use dictionary encoding on the specified columns. Passed to
- [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).
+ `pyarrow.parquet.ParquetWriter `__.
as `use_dictionary`. Command line option: ``--parquet-dictionary-encoding``.
:type parquet_dictionary_encoding: bool or dict
:param parquet_byte_stream_split: If True, pre-compress floating
point fields (`float32` or `float64`) with byte stream splitting, which
collects all mantissas in one part of the stream and exponents in another.
- Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).
+ Passed to `pyarrow.parquet.ParquetWriter `__.
as `use_byte_stream_split`. Command line option: ``--parquet-byte-stream-split``.
:type parquet_byte_stream_split: bool or dict
:param parquet_coerce_timestamps: If None, any timestamps
@@ -157,27 +157,27 @@ def root_to_parquet(
`parquet_version`: version `"1.0"` and `"2.4"` are coerced to microseconds,
but later versions use the `datetime64`'s own units. If `"ms"` is explicitly
specified, timestamps are coerced to milliseconds; if `"us"`, microseconds.
- Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).
+ Passed to `pyarrow.parquet.ParquetWriter `__.
as `coerce_timestamps`. Command line option: ``--parquet-coerce-timestamps``.
:type parquet_coerce_timestamps: None, `"ms"`, or `"us"`
:param parquet_old_int96_timestamps: If True, use Parquet's INT96 format
for any timestamps (`datetime64` data), taking priority over `parquet_coerce_timestamps`.
If None, let the `parquet_flavor` decide. Passed to
- [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).
+ `pyarrow.parquet.ParquetWriter `__.
as `use_deprecated_int96_timestamps`. Command line option: ``--parquet-old-int96-timestamps``.
:type parquet_old_int96_timestamps: None or bool
:param parquet_compliant_nested: If True, use the Spark/BigQuery/Parquet
- [convention for nested lists](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types),
+ `convention for nested lists `__,
in which each list is a one-field record with field name "`element`";
otherwise, use the Arrow convention, in which the field name is "`item`".
- Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).
+ Passed to `pyarrow.parquet.ParquetWriter `__.
as `use_compliant_nested_type`. Command line option: ``--parquet-compliant-nested``.
:type parquet_compliated_nested: bool
:param parquet_extra_options: Any additional options to pass to
- [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).
+ `pyarrow.parquet.ParquetWriter `__.
:type parquet_extra_options: None or dict
:param storage_options: Any additional options to pass to
- [fsspec.core.url_to_fs](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.core.url_to_fs)
+ `fsspec.core.url_to_fs `__
to open a remote file for writing.
:type storage_options: None or dict
@@ -191,7 +191,9 @@ def root_to_parquet(
--------------------------
This function can be run from the command line. Use command
- >>> hepconvert root-to-parquet [options] [OUT_FILE] [IN_FILE]
+ .. code-block:: bash
+
+ hepconvert root-to-parquet [options] [OUT_FILE] [IN_FILE]
"""
path = Path(out_file)
diff --git a/src/hepconvert/write_root.py b/src/hepconvert/write_root.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/test_add_histograms.py b/tests/test_add_histograms.py
index 01cdedf..139e12f 100644
--- a/tests/test_add_histograms.py
+++ b/tests/test_add_histograms.py
@@ -71,12 +71,12 @@ def test_simple(tmp_path):
).all
-def mult_1D(tmp_path):
+def mult_1D(tmp_path, file_paths):
gauss_1 = ROOT.TH1I("name1", "title", 5, -4, 4)
gauss_1.FillRandom("gaus")
gauss_1.Sumw2()
gauss_1.SetDirectory(0)
- outHistFile = ROOT.TFile.Open(os.path.join(tmp_path, "file1.root"), "RECREATE")
+ outHistFile = ROOT.TFile.Open(file_paths[0], "RECREATE")
outHistFile.cd()
gauss_1.Write()
outHistFile.Close()
@@ -86,7 +86,7 @@ def mult_1D(tmp_path):
gauss_2.FillRandom("gaus")
gauss_2.Sumw2()
gauss_2.SetDirectory(0)
- outHistFile = ROOT.TFile.Open(os.path.join(tmp_path, "file1.root"), "UPDATE")
+ outHistFile = ROOT.TFile.Open(file_paths[0], "UPDATE")
outHistFile.cd()
gauss_2.Write()
outHistFile.Close()
@@ -96,7 +96,7 @@ def mult_1D(tmp_path):
gauss_3.FillRandom("gaus")
gauss_3.Sumw2()
gauss_3.SetDirectory(0)
- outHistFile = ROOT.TFile.Open(os.path.join(tmp_path, "file2.root"), "RECREATE")
+ outHistFile = ROOT.TFile.Open(file_paths[1], "RECREATE")
outHistFile.cd()
gauss_3.Write()
outHistFile.Close()
@@ -106,7 +106,7 @@ def mult_1D(tmp_path):
gauss_4.FillRandom("gaus")
gauss_4.Sumw2()
gauss_4.SetDirectory(0)
- outHistFile = ROOT.TFile.Open(os.path.join(tmp_path, "file2.root"), "UPDATE")
+ outHistFile = ROOT.TFile.Open(file_paths[1], "UPDATE")
outHistFile.cd()
gauss_4.Write()
outHistFile.Close()
@@ -116,7 +116,7 @@ def mult_1D(tmp_path):
gauss_5.FillRandom("gaus")
gauss_5.Sumw2()
gauss_5.SetDirectory(0)
- outHistFile = ROOT.TFile.Open(os.path.join(tmp_path, "file3.root"), "RECREATE")
+ outHistFile = ROOT.TFile.Open(file_paths[2], "RECREATE")
outHistFile.cd()
gauss_5.Write()
outHistFile.Close()
@@ -126,23 +126,14 @@ def mult_1D(tmp_path):
gauss_6.FillRandom("gaus")
gauss_6.Sumw2()
gauss_6.SetDirectory(0)
- outHistFile = ROOT.TFile.Open(os.path.join(tmp_path, "file3.root"), "UPDATE")
+ outHistFile = ROOT.TFile.Open(file_paths[2], "UPDATE")
outHistFile.cd()
gauss_6.Write()
outHistFile.Close()
h6 = uproot.from_pyroot(gauss_6)
destination = os.path.join(tmp_path, "destination.root")
- hepconvert.add_histograms(
- destination,
- [
- os.path.join(tmp_path, "file1.root"),
- os.path.join(tmp_path, "file2.root"),
- os.path.join(tmp_path, "file3.root"),
- ],
- force=True,
- same_names=False,
- )
+ hepconvert.add_histograms(destination, file_paths, force=True, same_names=False)
with uproot.open(destination) as file:
added = uproot.from_pyroot(