diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..608def7
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,5 @@
+---
+Language: Cpp
+BasedOnStyle:  LLVM
+ColumnLimit:     80
+---
diff --git a/.github/workflows/tests.yaml b/.github/workflows/ci.yaml
similarity index 55%
rename from .github/workflows/tests.yaml
rename to .github/workflows/ci.yaml
index 8f5109a..eaee8b4 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/ci.yaml
@@ -1,24 +1,30 @@
-name: tests
+name: GitHub CI
 
-on: [push]
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
 
 jobs:
   build:
 
-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.os }}
     strategy:
+      fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        os: [ubuntu-latest, macos-latest]
+        python-version: ['3.9', '3.10']
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}.
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies.
       run: |
-        python -m pip install --upgrade pip
+        python -m pip install .[test] -v
         pip install numpy
         pip install pytest pytest-cov
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
@@ -26,5 +32,4 @@ jobs:
       env:
         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       run: |
-        make install
-        make tests
+        python -m pytest --cov=./ --cov-report=xml
diff --git a/.gitignore b/.gitignore
index 3951c67..e563265 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,16 +1,242 @@
-dist
-build
-.egg
-.tox
-.vscode
-.coverage
-__pycache__
-.mypy_cache
-.pytest_cache
-.ipynb_checkpoints
+# Created by https://www.toptal.com/developers/gitignore/api/c++,python
+# Edit at https://www.toptal.com/developers/gitignore?templates=c++,python
 
-*.py[cod]
+### C++ ###
+# Prerequisites
+*.d
+
+# Compiled Object files
+*.slo
+*.lo
 *.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
 *.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+*.smod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
 *.egg
-*.egg-info
\ No newline at end of file
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### macOS Patch ###
+# iCloud generated files
+*.icloud
+
+# End of https://www.toptal.com/developers/gitignore/api/c++,python
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..7129a0a
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,37 @@
+cmake_minimum_required(VERSION 3.15...3.27)
+
+# Scikit-build-core sets these values for you, or you can just hard-code the
+# name and version.
+project(
+  ${SKBUILD_PROJECT_NAME}
+  VERSION ${SKBUILD_PROJECT_VERSION}
+  LANGUAGES CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+# NOTE(review): GCC/Clang-style flags; confirm -march=native suits shipped wheels.
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -DNDEBUG -march=native -ffast-math")
+
+# Find the module development requirements (requires FindPython from 3.17 or
+# scikit-build-core's built-in backport)
+set(PYBIND11_NEWPYTHON ON)
+find_package(OpenMP) # not REQUIRED: the build continues without OpenMP
+find_package(pybind11 CONFIG REQUIRED)
+pybind11_add_module(numbits src/numbits.cpp)
+
+target_link_libraries(numbits PRIVATE pybind11::pybind11)
+
+# OpenMP is optional: when found, link it and define USE_OPENMP for the source
+if(OpenMP_CXX_FOUND)
+    target_link_libraries(numbits PRIVATE OpenMP::OpenMP_CXX)
+    target_compile_definitions(numbits PRIVATE USE_OPENMP)
+endif()
+
+# This is passing in the version as a define just as an example
+target_compile_definitions(numbits PRIVATE VERSION_INFO=${PROJECT_VERSION})
+
+# The install directory is the output (wheel) directory
+install(TARGETS numbits DESTINATION .)
\ No newline at end of file
diff --git a/README.md b/README.md
index 7a76e32..9174e40 100644
--- a/README.md
+++ b/README.md
@@ -42,18 +42,26 @@ or you can:
     python setup.py install
     ```
 
-### Test call
+### Usage
 
 ```python
 import numpy as np
 import numbits
 a = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype='uint8')
-b = numbits.unpack(a, nbits=2)
+b = numbits.unpack(a, nbits=2, bitorder="big", parallel=False)
 
 >>> array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 1, 0, 0, 0,
            1, 1, 0, 0, 1, 2, 0, 0, 1, 3, 0, 0, 2, 0], dtype=uint8)
 ```
 
+### Benchmarks
+|           |           |
+| --------- | --------- |
+| ![](tests/benchmarks/benchmark_unpack_1bit_little.png) | ![](tests/benchmarks/benchmark_unpack_1bit_big.png) |
+| ![](tests/benchmarks/benchmark_pack_1bit_little.png) | ![](tests/benchmarks/benchmark_pack_1bit_big.png) |
+
+
+
 [tests]: https://github.com/telegraphic/numbits/actions/workflows/tests.yaml/badge.svg
 [pybind]: https://github.com/pybind/pybind11
 [sigpyproc]: https://github.com/FRBs/sigpyproc3
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..bb8134d
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,47 @@
+[build-system]
+requires = ["scikit-build-core", "pybind11"]
+build-backend = "scikit_build_core.build"
+
+
+[project]
+name = "numbits"
+version = "0.0.3"
+description="Pack and unpack 1, 2 and 4 bit data to/from 8-bit numpy arrays."
+readme = "README.md"
+license = {text = "MIT"}
+authors = [{ name = "Danny Price", email = "dancpr@berkeley.edu" }]
+requires-python = ">=3.8"
+dependencies = ["numpy"]
+classifiers = [
+  "License :: OSI Approved :: MIT License",
+  "Programming Language :: Python :: 3 :: Only",
+  "Programming Language :: Python :: 3.8",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Topic :: Scientific/Engineering :: Astronomy",
+]
+
+[project.urls]
+Github = "https://github.com/telegraphic/numbits"
+
+
+[project.optional-dependencies]
+test = ["pytest"]
+
+
+[tool.scikit-build]
+cmake.build-type = "Release"
+wheel.expand-macos-universal-tags = true
+cmake.verbose = true
+logging.level = "INFO"
+
+
+[tool.cibuildwheel]
+test-command = "python -m pytest {project}/tests -v"
+test-extras = ["test"]
+test-skip = ["pp* *-musllinux* *-manylinux_i686", "*universal2:arm64"]
+build-verbosity = 1
+
+
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 0792c86..0000000
--- a/setup.cfg
+++ /dev/null
@@ -1,2 +0,0 @@
-[metadata]
-license_files = LICENSE
\ No newline at end of file
diff --git a/setup.py b/setup.py
deleted file mode 100644
index c7da65d..0000000
--- a/setup.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# type: ignore
-
-import os
-import sys
-import pathlib
-import setuptools
-
-from setuptools.command.build_ext import build_ext
-from setuptools import setup, find_packages, Extension
-
-
-__version__ = "0.0.2"
-
-
-class get_pybind_include(object):
-
-    """
-    Helper class to determine the pybind11 include path
-    The purpose of this class is to postpone importing pybind11
-    until it is actually installed, so that the ``get_include()``
-    method can be invoked.
-    """
-
-    def __init__(self, user=False):
-        self.user = user
-
-    def __str__(self):
-        import pybind11
-
-        return pybind11.get_include(self.user)
-
-
-ext_modules = [
-    Extension(
-        "numbits",
-        sorted(["src/numbits.cpp"]),
-        include_dirs=[
-            get_pybind_include(),
-            get_pybind_include(user=True),
-        ],
-        language="c++",
-    ),
-]
-
-
-# cf http://bugs.python.org/issue26689
-def has_flag(
-    compiler,
-    flagname,
-) -> bool:
-
-    """
-    Return a boolean indicating whether a flag name is supported on
-    the specified compiler.
-    """
-
-    import os
-    import tempfile
-
-    with tempfile.NamedTemporaryFile(
-        "w",
-        suffix=".cpp",
-        delete=False,
-    ) as f:
-        f.write("int main (int argc, char **argv) { return 0; }")
-        fname = f.name
-    try:
-        compiler.compile([fname], extra_postargs=[flagname])
-    except setuptools.distutils.errors.CompileError:
-        return False
-    finally:
-        try:
-            os.remove(fname)
-        except OSError:
-            pass
-    return True
-
-
-def cpp_flag(compiler):
-
-    """
-    Return the -std=c++[11/14/17] compiler flag.
-    The newer version is prefered over c++11 (when it is available).
-    """
-
-    flags = ["-std=c++17", "-std=c++14", "-std=c++11"]
-    for flag in flags:
-        if has_flag(compiler, flag):
-            return flag
-    raise RuntimeError("Unsupported compiler -- at least C++11 support is needed!")
-
-
-class BuildExt(build_ext):
-
-    """
-    A custom build extension for adding compiler-specific options.
-    """
-
-    c_opts = {
-        "msvc": ["/EHsc"],
-        "unix": ["-O3", "-march=native", "-ffast-math"],
-    }
-    l_opts = {
-        "msvc": [],
-        "unix": [],
-    }
-
-    if sys.platform == "darwin":
-        darwin_opts = ["-stdlib=libc++", "-mmacosx-version-min=10.7"]
-        c_opts["unix"] += darwin_opts
-        l_opts["unix"] += darwin_opts
-
-    def build_extensions(self) -> None:
-        ct = self.compiler.compiler_type
-        opts = self.c_opts.get(ct, [])
-        link_opts = self.l_opts.get(ct, [])
-        if ct == "unix":
-            opts.append(cpp_flag(self.compiler))
-            if has_flag(self.compiler, "-fvisibility=hidden"):
-                opts.append("-fvisibility=hidden")
-
-        for ext in self.extensions:
-            ext.define_macros = [
-                ("VERSION_INFO", '"{}"'.format(self.distribution.get_version()))
-            ]
-            ext.extra_compile_args = opts
-            ext.extra_link_args = link_opts
-        build_ext.build_extensions(self)
-
-
-here = pathlib.Path(__file__).parent.resolve()
-long_description = (here / "README.md").read_text(encoding="utf-8")
-install_requires = []
-setup_requires = ["pybind11>=2.5.0"]
-
-
-setup(
-    name="numbits",
-    version=__version__,
-    description="Pack and unpack 1, 2 and 4 bit data to/from 8-bit numpy arrays.",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/telegraphic/numbits",
-    author="Danny Price",
-    author_email="dancpr@berkeley.edu",
-    classifiers=[
-        "License :: OSI Approved :: MIT License",
-        "Programming Language :: Python :: 3.6",
-        "Programming Language :: Python :: 3.7",
-        "Programming Language :: Python :: 3.8",
-        "Programming Language :: Python :: 3.9",
-        "Topic :: Scientific/Engineering :: Astronomy",
-    ],
-    package_dir={"": "src"},
-    packages=find_packages(where="src"),
-    install_package_data=True,
-    python_requires=">=3.5, <4",
-    setup_requires=setup_requires,
-    install_requires=install_requires,
-    project_urls={
-        "Source": "https://github.com/telegraphic/numbits",
-        "Bug Reports": "https://github.com/telegraphic/numbits/issues",
-    },
-    ext_modules=ext_modules,
-    cmdclass={"build_ext": BuildExt},
-    zip_safe=False,
-)
diff --git a/src/numbits.cpp b/src/numbits.cpp
index 06d29cf..81bc9e1 100644
--- a/src/numbits.cpp
+++ b/src/numbits.cpp
@@ -1,7 +1,15 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <math.h>
+#include <array>
+#include <cstddef>
+#include <stdexcept>
+#include <string>
+
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#ifdef USE_OPENMP
+#include <omp.h>
+#endif
+
+namespace py = pybind11;
 
 #define HI4BITS 240
 #define LO4BITS 15
@@ -10,134 +18,392 @@
 #define LOMED2BITS 12
 #define LO2BITS 3
 
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
-namespace py = pybind11;
-
 /*----------------------------------------------------------------------------*/
+// Table mapping each byte value to its 8 / NBits unpacked fields (one per byte)
+template <size_t NBits, bool BigEndian> struct unpack_lookup_table {
+  static constexpr size_t Size = 256; // one row per possible byte value
+  static constexpr size_t Elements = 8 / NBits; // fields stored in one byte
+  alignas(64) uint8_t data[Size][Elements]{}; // 256 * 8/NBits bytes
 
-/*
-Function to unpack 1,2 and 4 bit data
-data is unpacked into an empty buffer
-NOTE: Only unpacks big endian bit ordering
-*/
-py::array_t<uint8_t> unpack(py::array_t<uint8_t> inarray, int nbits)
-{
-  // Setup input/output buffers.
-  py::buffer_info inbuf = inarray.request();
-  int nbytes = inbuf.size;
-
-  auto outarray = py::array_t<uint8_t>(inbuf.size * 8 / nbits);
-  py::buffer_info outbuf = outarray.request();
-
-  uint8_t *indata = (uint8_t *)inbuf.ptr;
-  uint8_t *outdata = (uint8_t *)outbuf.ptr;
-
-  int ii, jj;
-  switch (nbits)
-  {
-  case 1:
-    for (ii = 0; ii < nbytes; ii++)
-    {
-      for (jj = 0; jj < 8; jj++)
-      {
-        outdata[(ii * 8) + (7 - jj)] = (indata[ii] >> jj) & 1;
+  constexpr unpack_lookup_table() { // constexpr so tables can be built statically
+    for (size_t ii = 0; ii < Size; ii++) { // ii: the packed byte value
+      for (size_t jj = 0; jj < Elements; jj++) { // jj: field index, LSB first
+        if constexpr (BigEndian) { // mirrored: lowest field is stored last
+          data[ii][Elements - 1 - jj] =
+              (ii >> (jj * NBits)) & ((1 << NBits) - 1);
+        } else { // lowest field is stored first
+          data[ii][jj] = (ii >> (jj * NBits)) & ((1 << NBits) - 1);
+        }
       }
     }
-    break;
-  case 2:
-    for (ii = 0; ii < nbytes; ii++)
-    {
-      outdata[(ii * 4) + 3] = indata[ii] & LO2BITS;
-      outdata[(ii * 4) + 2] = (indata[ii] & LOMED2BITS) >> 2;
-      outdata[(ii * 4) + 1] = (indata[ii] & UPMED2BITS) >> 4;
-      outdata[(ii * 4) + 0] = (indata[ii] & HI2BITS) >> 6;
+  }
+};
+
+// Compile-time tables, one per supported (bit width, bit order) combination
+constexpr unpack_lookup_table<1, false> unpack_lookup_table_1bit_little{};
+constexpr unpack_lookup_table<1, true> unpack_lookup_table_1bit_big{};
+constexpr unpack_lookup_table<2, false> unpack_lookup_table_2bit_little{};
+constexpr unpack_lookup_table<2, true> unpack_lookup_table_2bit_big{};
+constexpr unpack_lookup_table<4, false> unpack_lookup_table_4bit_little{};
+constexpr unpack_lookup_table<4, true> unpack_lookup_table_4bit_big{};
+
+template <bool parallel, bool BigEndian>
+void unpack_1bit_lookup(const uint8_t *inbuffer, uint8_t *outbuffer,
+                        size_t nbytes) { // nbytes: number of input bytes read
+#ifdef USE_OPENMP
+#pragma omp parallel for if (parallel)
+#endif
+  for (size_t ii = 0; ii < nbytes; ii++) { // writes 8 output bytes per input
+    if constexpr (BigEndian) { // table selected at compile time
+      std::copy(&unpack_lookup_table_1bit_big.data[inbuffer[ii]][0],
+                &unpack_lookup_table_1bit_big.data[inbuffer[ii]][8],
+                &outbuffer[ii * 8]);
+    } else {
+      std::copy(&unpack_lookup_table_1bit_little.data[inbuffer[ii]][0],
+                &unpack_lookup_table_1bit_little.data[inbuffer[ii]][8],
+                &outbuffer[ii * 8]);
     }
-    break;
-  case 4:
-    for (ii = 0; ii < nbytes; ii++)
-    {
-      outdata[(ii * 2) + 1] = indata[ii] & LO4BITS;
-      outdata[(ii * 2) + 0] = (indata[ii] & HI4BITS) >> 4;
+  }
+}
+
+template <bool parallel, bool BigEndian>
+void unpack_2bit_lookup(const uint8_t *inbuffer, uint8_t *outbuffer,
+                        size_t nbytes) { // nbytes: number of input bytes read
+#ifdef USE_OPENMP
+#pragma omp parallel for if (parallel)
+#endif
+  for (size_t ii = 0; ii < nbytes; ii++) { // writes 4 output bytes per input
+    if constexpr (BigEndian) { // table selected at compile time
+      std::copy(&unpack_lookup_table_2bit_big.data[inbuffer[ii]][0],
+                &unpack_lookup_table_2bit_big.data[inbuffer[ii]][4],
+                &outbuffer[ii * 4]);
+    } else {
+      std::copy(&unpack_lookup_table_2bit_little.data[inbuffer[ii]][0],
+                &unpack_lookup_table_2bit_little.data[inbuffer[ii]][4],
+                &outbuffer[ii * 4]);
     }
-    break;
   }
-  return outarray;
 }
 
-/*
-Function to pack bit data into an empty buffer
-*/
-py::array_t<uint8_t> pack(py::array_t<uint8_t> inarray, int nbits)
-{
-  // Setup input/output buffers.
-  py::buffer_info inbuf = inarray.request();
-  int nbytes = inbuf.size;
-
-  auto outarray = py::array_t<uint8_t>(inbuf.size * nbits / 8);
-  py::buffer_info outbuf = outarray.request();
-
-  uint8_t *indata = (uint8_t *)inbuf.ptr;
-  uint8_t *outdata = (uint8_t *)outbuf.ptr;
-
-  int ii, pos;
-  int bitfact = 8 / nbits;
-  unsigned char val;
-
-  switch (nbits)
-  {
-  case 1:
-    for (ii = 0; ii < nbytes / bitfact; ii++)
-    {
-      pos = ii * 8;
-      val = (indata[pos + 0] << 7) |
-            (indata[pos + 1] << 6) |
-            (indata[pos + 2] << 5) |
-            (indata[pos + 3] << 4) |
-            (indata[pos + 4] << 3) |
-            (indata[pos + 5] << 2) |
-            (indata[pos + 6] << 1) |
-            indata[pos + 7];
-      outdata[ii] = val;
+template <bool parallel, bool BigEndian>
+void unpack_4bit_lookup(const uint8_t *inbuffer, uint8_t *outbuffer,
+                        size_t nbytes) { // nbytes: number of input bytes read
+#ifdef USE_OPENMP
+#pragma omp parallel for if (parallel)
+#endif
+  for (size_t ii = 0; ii < nbytes; ii++) { // writes 2 output bytes per input
+    if constexpr (BigEndian) { // table selected at compile time
+      std::copy(&unpack_lookup_table_4bit_big.data[inbuffer[ii]][0],
+                &unpack_lookup_table_4bit_big.data[inbuffer[ii]][2],
+                &outbuffer[ii * 2]);
+    } else {
+      std::copy(&unpack_lookup_table_4bit_little.data[inbuffer[ii]][0],
+                &unpack_lookup_table_4bit_little.data[inbuffer[ii]][2],
+                &outbuffer[ii * 2]);
     }
-    break;
-  case 2:
-    for (ii = 0; ii < nbytes / bitfact; ii++)
-    {
-      pos = ii * 4;
-      val = (indata[pos] << 6) |
-            (indata[pos + 1] << 4) |
-            (indata[pos + 2] << 2) |
-            indata[pos + 3];
-      outdata[ii] = val;
+  }
+}
+
+// Expand each input byte into eight 0/1 bytes; bigEndian emits the MSB first.
+template <bool parallel, bool bigEndian>
+void unpack_1bit(const uint8_t *inbuffer, uint8_t *outbuffer, size_t nbytes) {
+#ifdef USE_OPENMP
+#pragma omp parallel for if (parallel)
+#endif
+  for (size_t byte = 0; byte < nbytes; byte++) {
+    uint8_t *const dst = outbuffer + byte * 8; // 8 outputs per input byte
+    for (size_t bit = 0; bit < 8; bit++) {
+      // Bit `bit` (LSB = 0) lands in the mirrored slot when MSB-first.
+      const size_t slot = bigEndian ? 7 - bit : bit;
+      dst[slot] = (inbuffer[byte] >> bit) & 1;
+    }
+  }
+}
+
+template <bool parallel, bool bigEndian>
+void unpack_2bit(const uint8_t *inbuffer, uint8_t *outbuffer, size_t nbytes) {
+#ifdef USE_OPENMP
+#pragma omp parallel for if (parallel)
+#endif
+  for (size_t ii = 0; ii < nbytes; ii++) { // ii << 2: 4 outputs per input byte
+    if constexpr (bigEndian) { // highest bit pair goes to the first slot
+      outbuffer[(ii << 2) + 3] = inbuffer[ii] & LO2BITS;
+      outbuffer[(ii << 2) + 2] = (inbuffer[ii] & LOMED2BITS) >> 2;
+      outbuffer[(ii << 2) + 1] = (inbuffer[ii] & UPMED2BITS) >> 4;
+      outbuffer[(ii << 2) + 0] = (inbuffer[ii] & HI2BITS) >> 6;
+    } else { // lowest bit pair goes to the first slot
+      outbuffer[(ii << 2) + 0] = inbuffer[ii] & LO2BITS;
+      outbuffer[(ii << 2) + 1] = (inbuffer[ii] & LOMED2BITS) >> 2;
+      outbuffer[(ii << 2) + 2] = (inbuffer[ii] & UPMED2BITS) >> 4;
+      outbuffer[(ii << 2) + 3] = (inbuffer[ii] & HI2BITS) >> 6;
     }
-    break;
-  case 4:
-    for (ii = 0; ii < nbytes / bitfact; ii++)
-    {
-      pos = ii * 2;
-      val = (indata[pos] << 4) | indata[pos + 1];
-      outdata[ii] = val;
+  }
+}
+
+// Split each byte into two nibbles; bigEndian emits the high nibble first.
+template <bool parallel, bool bigEndian>
+void unpack_4bit(const uint8_t *inbuffer, uint8_t *outbuffer, size_t nbytes) {
+#ifdef USE_OPENMP
+#pragma omp parallel for if (parallel)
+#endif
+  for (size_t byte = 0; byte < nbytes; byte++) {
+    uint8_t *const dst = outbuffer + byte * 2; // 2 outputs per input byte
+    // Slot of the low nibble: second when MSB-first, first otherwise.
+    constexpr size_t lowSlot = bigEndian ? 1 : 0;
+    // LO4BITS / HI4BITS are the nibble masks defined at file scope.
+    dst[lowSlot] = inbuffer[byte] & LO4BITS;
+    dst[1 - lowSlot] = (inbuffer[byte] & HI4BITS) >> 4;
+  }
+}
+
+// Pack groups of eight 0/1 bytes into one byte each (MSB-first if bigEndian).
+template <bool parallel, bool bigEndian>
+void pack_1bit(const uint8_t *inbuffer, uint8_t *outbuffer, size_t nbytes) {
+#ifdef USE_OPENMP
+#pragma omp parallel for if (parallel)
+#endif
+  for (size_t ii = 0; ii < nbytes / 8; ii++) {
+    const size_t pos = ii * 8; // loop-local: a shared pos races under OpenMP
+    if constexpr (bigEndian) {
+      outbuffer[ii] = (inbuffer[pos + 0] << 7) | (inbuffer[pos + 1] << 6) |
+                      (inbuffer[pos + 2] << 5) | (inbuffer[pos + 3] << 4) |
+                      (inbuffer[pos + 4] << 3) | (inbuffer[pos + 5] << 2) |
+                      (inbuffer[pos + 6] << 1) | inbuffer[pos + 7];
+    } else {
+      outbuffer[ii] = inbuffer[pos + 0] | (inbuffer[pos + 1] << 1) |
+                      (inbuffer[pos + 2] << 2) | (inbuffer[pos + 3] << 3) |
+                      (inbuffer[pos + 4] << 4) | (inbuffer[pos + 5] << 5) |
+                      (inbuffer[pos + 6] << 6) | (inbuffer[pos + 7] << 7);
+    }
+  }
+}
+
+// Pack groups of four 2-bit values into one byte each (MSB-first if bigEndian).
+template <bool parallel, bool bigEndian>
+void pack_2bit(const uint8_t *inbuffer, uint8_t *outbuffer, size_t nbytes) {
+#ifdef USE_OPENMP
+#pragma omp parallel for if (parallel)
+#endif
+  for (size_t ii = 0; ii < nbytes / 4; ii++) {
+    const size_t pos = ii * 4; // loop-local: a shared pos races under OpenMP
+    if constexpr (bigEndian) {
+      outbuffer[ii] = (inbuffer[pos + 0] << 6) | (inbuffer[pos + 1] << 4) |
+                      (inbuffer[pos + 2] << 2) | inbuffer[pos + 3];
+    } else {
+      outbuffer[ii] = inbuffer[pos + 0] | (inbuffer[pos + 1] << 2) |
+                      (inbuffer[pos + 2] << 4) | (inbuffer[pos + 3] << 6);
+    }
+  }
+}
+
+// Pack pairs of 4-bit values into one byte each (high nibble first if bigEndian).
+template <bool parallel, bool bigEndian>
+void pack_4bit(const uint8_t *inbuffer, uint8_t *outbuffer, size_t nbytes) {
+#ifdef USE_OPENMP
+#pragma omp parallel for if (parallel)
+#endif
+  for (size_t ii = 0; ii < nbytes / 2; ii++) {
+    const size_t pos = ii * 2; // loop-local: a shared pos races under OpenMP
+    if constexpr (bigEndian) {
+      outbuffer[ii] = (inbuffer[pos] << 4) | inbuffer[pos + 1];
+    } else {
+      outbuffer[ii] = inbuffer[pos] | (inbuffer[pos + 1] << 4);
     }
-    break;
   }
+}
+
+using PackUnpackFunc = void (*)(const uint8_t *, uint8_t *, size_t);
+// Tables below are indexed [nbits >> 1][bitorder: 0 little, 1 big][parallel].
+constexpr std::array<std::array<std::array<PackUnpackFunc, 2>, 2>, 3>
+    unpackLookupDispatcher = {{{{
+                                   {unpack_1bit_lookup<false, false>,
+                                    unpack_1bit_lookup<true, false>}, // little
+                                   {unpack_1bit_lookup<false, true>,
+                                    unpack_1bit_lookup<true, true>} // big
+                               }},
+                               {{
+                                   {unpack_2bit_lookup<false, false>,
+                                    unpack_2bit_lookup<true, false>}, // little
+                                   {unpack_2bit_lookup<false, true>,
+                                    unpack_2bit_lookup<true, true>} // big
+                               }},
+                               {{
+                                   {unpack_4bit_lookup<false, false>,
+                                    unpack_4bit_lookup<true, false>}, // little
+                                   {unpack_4bit_lookup<false, true>,
+                                    unpack_4bit_lookup<true, true>} // big
+                               }}}};
+
+constexpr std::array<std::array<std::array<PackUnpackFunc, 2>, 2>, 3>
+    unpackDispatcher = {
+        {{{
+             {unpack_1bit<false, false>, unpack_1bit<true, false>}, // little
+             {unpack_1bit<false, true>, unpack_1bit<true, true>}    // big
+         }},
+         {{
+             {unpack_2bit<false, false>, unpack_2bit<true, false>}, // little
+             {unpack_2bit<false, true>, unpack_2bit<true, true>}    // big
+         }},
+         {{
+             {unpack_4bit<false, false>, unpack_4bit<true, false>}, // little
+             {unpack_4bit<false, true>, unpack_4bit<true, true>}    // big
+         }}}};
+
+constexpr std::array<std::array<std::array<PackUnpackFunc, 2>, 2>, 3>
+    packDispatcher = {
+        {{{
+             {pack_1bit<false, false>, pack_1bit<true, false>}, // little
+             {pack_1bit<false, true>, pack_1bit<true, true>}    // big
+         }},
+         {{
+             {pack_2bit<false, false>, pack_2bit<true, false>}, // little
+             {pack_2bit<false, true>, pack_2bit<true, true>}    // big
+         }},
+         {{
+             {pack_4bit<false, false>, pack_4bit<true, false>}, // little
+             {pack_4bit<false, true>, pack_4bit<true, true>}    // big
+         }}}};
+
+// Dispatcher index for a bit order: 0 for "l...", 1 for "b..."; else throws.
+size_t get_bitorder_index(const std::string &bitorder) {
+  const char first = bitorder.empty() ? '\0' : bitorder.front();
+  if (first == 'b') return 1;
+  if (first == 'l') return 0;
+  throw std::invalid_argument("Invalid bitorder. Must begin with 'l' or 'b'.");
+}
+
+/*
+Unpack 1, 2 or 4 bit fields into a new 8-bit array using the lookup tables.
+*/
+py::array_t<uint8_t>
+unpack_lookup(const py::array_t<uint8_t, py::array::c_style> &inarray,
+              size_t nbits, const std::string &bitorder,
+              bool parallel = false) {
+  if (nbits != 1 && nbits != 2 && nbits != 4) {
+    throw std::invalid_argument(
+        "Invalid number of bits. Supported values are 1, 2, and 4.");
+  }
+  size_t bitorder_idx = get_bitorder_index(bitorder); // throws on bad bitorder
+  size_t nbits_idx = nbits >> 1; // maps 1, 2, 4 onto table rows 0, 1, 2
+  size_t nbytes = inarray.size();
+  auto outarray = py::array_t<uint8_t>(nbytes * 8 / nbits); // 8/nbits per byte
+
+  PackUnpackFunc unpackFunc =
+      unpackLookupDispatcher[nbits_idx][bitorder_idx][parallel ? 1 : 0];
+  unpackFunc(inarray.data(), outarray.mutable_data(), nbytes);
+
   return outarray;
 }
 
-PYBIND11_MODULE(numbits, m)
-{
-  // Optional module docstring.
-  m.doc() = "Pack and unpack 1, 2 and 4 bit data";
-
-  m.def("unpack",
-        &unpack,
-        py::arg("array"),
-        py::arg("nbits"),
-        "Unpack 1, 2 and 4 bit data into an 8-bit numpy array.");
-
-  m.def("pack",
-        &pack,
-        py::arg("array"),
-        py::arg("nbits"),
-        "Pack 1, 2 and 4 bit data into an 8-bit numpy array.");
+// Unpack 1, 2 or 4 bit fields of `inarray` into a new array, one per byte.
+py::array_t<uint8_t>
+unpack(const py::array_t<uint8_t, py::array::c_style> &inarray, size_t nbits,
+       const std::string &bitorder, bool parallel = false) {
+  const bool supported = (nbits == 1) || (nbits == 2) || (nbits == 4);
+  if (!supported) {
+    throw std::invalid_argument(
+        "Invalid number of bits. Supported values are 1, 2, and 4.");
+  }
+  const size_t order = get_bitorder_index(bitorder); // 0 little, 1 big
+  const size_t count = inarray.size();
+  py::array_t<uint8_t> result(count * 8 / nbits);
+  // nbits >> 1 maps 1, 2, 4 onto table rows 0, 1, 2.
+  const PackUnpackFunc kernel =
+      unpackDispatcher[nbits >> 1][order][parallel ? 1 : 0];
+  kernel(inarray.data(), result.mutable_data(), count);
+  return result;
+}
+
+// Unpack into caller-provided `outarray`; its size must be size * 8 / nbits.
+void unpack_buffered(const py::array_t<uint8_t, py::array::c_style> &inarray,
+                     py::array_t<uint8_t, py::array::c_style> &outarray,
+                     size_t nbits, const std::string &bitorder,
+                     bool parallel = false) {
+  const bool supported = (nbits == 1) || (nbits == 2) || (nbits == 4);
+  if (!supported) {
+    throw std::invalid_argument(
+        "Invalid number of bits. Supported values are 1, 2, and 4.");
+  }
+  const size_t order = get_bitorder_index(bitorder); // 0 little, 1 big
+  const size_t count = inarray.size();
+  if (static_cast<size_t>(outarray.size()) != count * 8 / nbits) {
+    throw std::invalid_argument("Output buffer size is not correct.");
+  }
+  // nbits >> 1 maps 1, 2, 4 onto table rows 0, 1, 2.
+  const PackUnpackFunc kernel =
+      unpackDispatcher[nbits >> 1][order][parallel ? 1 : 0];
+  kernel(inarray.data(), outarray.mutable_data(), count);
+}
+
+/*
+Pack 1, 2 or 4 bit values (one per input byte) into a new 8-bit array.
+*/
+py::array_t<uint8_t>
+pack(const py::array_t<uint8_t, py::array::c_style> &inarray, size_t nbits,
+     const std::string &bitorder, bool parallel = false) {
+  const bool supported = (nbits == 1) || (nbits == 2) || (nbits == 4);
+  if (!supported) {
+    throw std::invalid_argument(
+        "Invalid number of bits. Supported values are 1, 2, and 4.");
+  }
+  const size_t order = get_bitorder_index(bitorder); // 0 little, 1 big
+  const size_t count = inarray.size();
+  // Integer division: a trailing partial group of inputs yields no output byte.
+  py::array_t<uint8_t> result(count * nbits / 8);
+  // nbits >> 1 maps 1, 2, 4 onto table rows 0, 1, 2.
+  const PackUnpackFunc kernel =
+      packDispatcher[nbits >> 1][order][parallel ? 1 : 0];
+  kernel(inarray.data(), result.mutable_data(), count);
+  return result;
+}
+
+// Pack into caller-provided `outarray`; its size must be size * nbits / 8.
+void pack_buffered(const py::array_t<uint8_t, py::array::c_style> &inarray,
+                   py::array_t<uint8_t, py::array::c_style> &outarray,
+                   size_t nbits, const std::string &bitorder,
+                   bool parallel = false) {
+  const bool supported = (nbits == 1) || (nbits == 2) || (nbits == 4);
+  if (!supported) {
+    throw std::invalid_argument(
+        "Invalid number of bits. Supported values are 1, 2, and 4.");
+  }
+  const size_t order = get_bitorder_index(bitorder); // 0 little, 1 big
+  const size_t count = inarray.size();
+  if (static_cast<size_t>(outarray.size()) != count * nbits / 8) {
+    throw std::invalid_argument("Output buffer size is not correct.");
+  }
+  // nbits >> 1 maps 1, 2, 4 onto table rows 0, 1, 2.
+  const PackUnpackFunc kernel =
+      packDispatcher[nbits >> 1][order][parallel ? 1 : 0];
+  kernel(inarray.data(), outarray.mutable_data(), count);
+}
+
+PYBIND11_MODULE(numbits, m) { // registers the Python-visible functions
+  m.doc() = "Pack and unpack 1, 2 and 4 bit data into/from an 8-bit array.";
+
+  m.def(
+      "unpack_lookup", &unpack_lookup,
+      "Unpack 1, 2 and 4-bit data from an 8-bit numpy array using lookup table",
+      py::arg("inarray"), py::arg("nbits"), py::arg("bitorder") = "big",
+      py::arg("parallel") = false);
+
+  m.def("unpack", &unpack,
+        "Unpack 1, 2 and 4-bit data from an 8-bit numpy array",
+        py::arg("inarray"), py::arg("nbits"), py::arg("bitorder") = "big",
+        py::arg("parallel") = false);
+  // noconvert(): reject arrays pybind11 would copy, so writes reach the caller.
+  m.def("unpack_buffered", &unpack_buffered,
+        "Unpack 1, 2 and 4-bit data from an 8-bit numpy array into a "
+        "pre-allocated buffer",
+        py::arg("inarray"), py::arg("outarray").noconvert(), py::arg("nbits"),
+        py::arg("bitorder") = "big", py::arg("parallel") = false);
+
+  m.def("pack", &pack, "Pack 1, 2 and 4-bit data into an 8-bit numpy array",
+        py::arg("inarray"), py::arg("nbits"), py::arg("bitorder") = "big",
+        py::arg("parallel") = false);
+  // noconvert(): the output must be filled in place, never in a converted copy.
+  m.def("pack_buffered", &pack_buffered,
+        "Pack 1, 2 and 4-bit data into an pre-allocated 8-bit numpy array",
+        py::arg("inarray"), py::arg("outarray").noconvert(), py::arg("nbits"),
+        py::arg("bitorder") = "big", py::arg("parallel") = false);
 }
\ No newline at end of file
diff --git a/tests/benchmark.py b/tests/benchmark.py
new file mode 100644
index 0000000..c5ed59f
--- /dev/null
+++ b/tests/benchmark.py
@@ -0,0 +1,113 @@
+import numpy as np
+import perfplot
+import click
+
+import numbits
+
+
+@click.command()
+@click.option("--test", default="unpack", help="Choose between 'unpack' and 'pack'")
+@click.option("--bitorder", default="big", help="Choose between 'big' and 'little'")
+@click.option(
+    "--nbits",
+    default=1,
+    type=click.IntRange(min=1, max=4),  # NOTE: 3 passes here but numbits rejects it
+    help="Number of bits to pack/unpack",
+)
+def main(test="unpack", bitorder="big", nbits=1):
+    """Benchmark the numbits pack/unpack kernels with perfplot and save the plot."""
+    if test == "unpack":
+        kernels = [
+            lambda arr, out: numbits.unpack(
+                arr, nbits, parallel=False, bitorder=bitorder
+            ),
+            lambda arr, out: numbits.unpack(
+                arr, nbits, parallel=True, bitorder=bitorder
+            ),
+            lambda arr, out: numbits.unpack_lookup(
+                arr, nbits, parallel=False, bitorder=bitorder
+            ),
+            lambda arr, out: numbits.unpack_lookup(
+                arr, nbits, parallel=True, bitorder=bitorder
+            ),
+            lambda arr, out: numbits.unpack_buffered(
+                arr, out, nbits, parallel=False, bitorder=bitorder
+            ),
+            lambda arr, out: numbits.unpack_buffered(
+                arr, out, nbits, parallel=True, bitorder=bitorder
+            ),
+        ]
+        labels = [
+            "numbits",
+            "numbits_parallel",
+            "numbits_lookup",
+            "numbits_lookup_parallel",
+            "numbits_buffered",
+            "numbits_buffered_parallel",
+        ]
+        if nbits == 1:  # NumPy baseline is added for 1-bit data only
+            kernels.insert(0, lambda arr, out: np.unpackbits(arr, bitorder=bitorder))
+            labels.insert(0, "numpy")
+        bench_stat = perfplot.bench(
+            setup=lambda n: (
+                np.random.randint(256, size=n, dtype="uint8"),
+                np.zeros(n * 8 // nbits, dtype="uint8"),
+            ),
+            n_range=[2**k for k in range(0, 24)],
+            kernels=kernels,
+            labels=labels,
+            xlabel="n",
+            title=f"Unpack {nbits} bit ({bitorder} endian)",
+            target_time_per_measurement=1,
+            equality_check=None,
+        )
+        bench_stat.save(
+            f"benchmark_unpack_{nbits}bit_{bitorder}.png",
+            transparent=False,
+            bbox_inches="tight",
+        )
+    else:  # any other --test value benchmarks packing
+        kernels = [
+            lambda arr, out: numbits.pack(
+                arr, nbits, parallel=False, bitorder=bitorder
+            ),
+            lambda arr, out: numbits.pack(arr, nbits, parallel=True, bitorder=bitorder),
+            lambda arr, out: numbits.pack_buffered(
+                arr, out, nbits, parallel=False, bitorder=bitorder
+            ),
+            lambda arr, out: numbits.pack_buffered(
+                arr, out, nbits, parallel=True, bitorder=bitorder
+            ),
+        ]
+        labels = [
+            "numbits",
+            "numbits_parallel",
+            "numbits_buffered",
+            "numbits_buffered_parallel",
+        ]
+        if nbits == 1:  # NumPy baseline is added for 1-bit data only
+            kernels.insert(0, lambda arr, out: np.packbits(arr, bitorder=bitorder))
+            labels.insert(0, "numpy")
+        bench_stat = perfplot.bench(
+            setup=lambda n: (
+                np.random.randint(1 << nbits, size=n, dtype="uint8"),  # high is exclusive
+                np.zeros(n * nbits // 8, dtype="uint8"),
+            ),
+            n_range=[2**k for k in range(3, 24)],
+            kernels=kernels,
+            labels=labels,
+            xlabel="n",
+            title=f"Pack {nbits} bit ({bitorder} endian)",
+            target_time_per_measurement=1,
+            equality_check=None,
+        )
+        bench_stat.save(
+            f"benchmark_pack_{nbits}bit_{bitorder}.png",
+            transparent=False,
+            bbox_inches="tight",
+        )
+    bench_stat.show()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/benchmarks/benchmark_pack_1bit_big.png b/tests/benchmarks/benchmark_pack_1bit_big.png
new file mode 100644
index 0000000..94ac062
Binary files /dev/null and b/tests/benchmarks/benchmark_pack_1bit_big.png differ
diff --git a/tests/benchmarks/benchmark_pack_1bit_little.png b/tests/benchmarks/benchmark_pack_1bit_little.png
new file mode 100644
index 0000000..4aeb27a
Binary files /dev/null and b/tests/benchmarks/benchmark_pack_1bit_little.png differ
diff --git a/tests/benchmarks/benchmark_pack_2bit_big.png b/tests/benchmarks/benchmark_pack_2bit_big.png
new file mode 100644
index 0000000..b744c2a
Binary files /dev/null and b/tests/benchmarks/benchmark_pack_2bit_big.png differ
diff --git a/tests/benchmarks/benchmark_pack_4bit_big.png b/tests/benchmarks/benchmark_pack_4bit_big.png
new file mode 100644
index 0000000..c0cfac6
Binary files /dev/null and b/tests/benchmarks/benchmark_pack_4bit_big.png differ
diff --git a/tests/benchmarks/benchmark_unpack_1bit_big.png b/tests/benchmarks/benchmark_unpack_1bit_big.png
new file mode 100644
index 0000000..d9557e1
Binary files /dev/null and b/tests/benchmarks/benchmark_unpack_1bit_big.png differ
diff --git a/tests/benchmarks/benchmark_unpack_1bit_little.png b/tests/benchmarks/benchmark_unpack_1bit_little.png
new file mode 100644
index 0000000..948418f
Binary files /dev/null and b/tests/benchmarks/benchmark_unpack_1bit_little.png differ
diff --git a/tests/benchmarks/benchmark_unpack_2bit_big.png b/tests/benchmarks/benchmark_unpack_2bit_big.png
new file mode 100644
index 0000000..459a63a
Binary files /dev/null and b/tests/benchmarks/benchmark_unpack_2bit_big.png differ
diff --git a/tests/benchmarks/benchmark_unpack_2bit_little.png b/tests/benchmarks/benchmark_unpack_2bit_little.png
new file mode 100644
index 0000000..3d5404e
Binary files /dev/null and b/tests/benchmarks/benchmark_unpack_2bit_little.png differ
diff --git a/tests/benchmarks/benchmark_unpack_4bit_big.png b/tests/benchmarks/benchmark_unpack_4bit_big.png
new file mode 100644
index 0000000..c7120ed
Binary files /dev/null and b/tests/benchmarks/benchmark_unpack_4bit_big.png differ
diff --git a/tests/benchmarks/benchmark_unpack_4bit_little.png b/tests/benchmarks/benchmark_unpack_4bit_little.png
new file mode 100644
index 0000000..8f1c042
Binary files /dev/null and b/tests/benchmarks/benchmark_unpack_4bit_little.png differ
diff --git a/tests/test_numbits.py b/tests/test_numbits.py
index 2ca3c77..05fb82d 100644
--- a/tests/test_numbits.py
+++ b/tests/test_numbits.py
@@ -1,47 +1,186 @@
-import numbits
 import pytest
+import numbits
 import numpy as np
 
+
+def unpack_bits(arr: np.ndarray, nbits: int, bitorder: str = "big") -> np.ndarray:
+    """Reference unpacker: split every byte of ``arr`` into ``8 // nbits`` fields."""
+    assert arr.dtype == np.uint8
+    assert nbits in {1, 2, 4}
+    field_mask = (1 << nbits) - 1
+    offsets = np.arange(0, 8, nbits)
+    # "big" walks the shifts downwards, so the highest bits of a byte come first.
+    offsets = offsets[::-1] if bitorder == "big" else offsets
+    fields = (arr[..., np.newaxis] >> offsets) & field_mask
+    return fields.reshape(-1).astype(np.uint8)
+
+
+def pack_bits(arr: np.ndarray, nbits: int, bitorder: str = "big") -> np.ndarray:
+    """Reference packer: fold ``8 // nbits`` consecutive values into one byte."""
+    assert arr.dtype == np.uint8
+    assert nbits in {1, 2, 4}
+    per_byte = 8 // nbits
+    out = np.zeros(arr.size * nbits // 8, dtype=np.uint8)
+    offsets = np.arange(0, 8, nbits)
+    offsets = offsets[::-1] if bitorder == "big" else offsets
+    # Field k of every output byte comes from input elements k, k + per_byte, ...
+    for k, offset in enumerate(offsets):
+        out |= arr[k::per_byte] << offset
+    return out
+
+
 class Testnumbits(object):
-    def test_unpackbits(self):
-        input_arr = np.array([0, 2, 7, 23], dtype=np.uint8)
-        expected_bit1 = np.unpackbits(input_arr, bitorder="big")
-        expected_bit2 = np.array(
-            [0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 3, 0, 1, 1, 3], dtype=np.uint8
+    @pytest.mark.parametrize("nbits", [1, 2, 4])
+    @pytest.mark.parametrize("bitorder", ["big", "little"])
+    @pytest.mark.parametrize("parallel", [False, True])
+    @pytest.mark.parametrize("funcn", ["unpack", "unpack_lookup"])
+    def test_unpack(self, nbits, bitorder, parallel, funcn):  # vs. NumPy reference
+        rng = np.random.default_rng()
+        arr = rng.integers(256, size=2**10, dtype=np.uint8)  # high is exclusive
+        expected = unpack_bits(arr, nbits, bitorder)
+        output = getattr(numbits, funcn)(
+            arr, nbits=nbits, bitorder=bitorder, parallel=parallel
         )
-        expected_bit4 = np.array([0, 0, 0, 2, 0, 7, 1, 7], dtype=np.uint8)
-        np.testing.assert_array_equal(expected_bit1, numbits.unpack(input_arr, nbits=1))
-        np.testing.assert_array_equal(expected_bit2, numbits.unpack(input_arr, nbits=2))
-        np.testing.assert_array_equal(expected_bit4, numbits.unpack(input_arr, nbits=4))
+        np.testing.assert_array_equal(output, expected, strict=True)
+
+    @pytest.mark.parametrize("bitorder", ["big", "little"])
+    @pytest.mark.parametrize("parallel", [False, True])
+    @pytest.mark.parametrize("funcn", ["unpack", "unpack_lookup"])
+    def test_unpack_invalid(self, bitorder, parallel, funcn):  # nbits=3 must raise
+        arr = np.arange(255, dtype=np.uint8)
+        with pytest.raises(ValueError):
+            getattr(numbits, funcn)(arr, nbits=3, bitorder=bitorder, parallel=parallel)
 
     @pytest.mark.parametrize("nbits", [1, 2, 4])
-    def test_unpackbits_empty(self, nbits):
-        input_arr = np.empty((0,), dtype=np.uint8)
-        output = numbits.unpack(input_arr, nbits=nbits)
-        np.testing.assert_array_equal(input_arr, output)
+    @pytest.mark.parametrize("bitorder", ["big", "little"])
+    @pytest.mark.parametrize("parallel", [False, True])
+    @pytest.mark.parametrize("funcn", ["unpack", "unpack_lookup"])
+    def test_unpack_empty(self, nbits, bitorder, parallel, funcn):  # empty in/out
+        arr = np.empty((0,), dtype=np.uint8)
+        output = getattr(numbits, funcn)(
+            arr, nbits=nbits, bitorder=bitorder, parallel=parallel
+        )
+        np.testing.assert_array_equal(output, arr, strict=True)
 
     @pytest.mark.parametrize("nbits", [1, 2, 4])
-    def test_packbits(self, nbits):
-        input_arr = np.arange(255, dtype=np.uint8)
-        output = numbits.pack(numbits.unpack(input_arr, nbits=nbits), nbits=nbits)
-        np.testing.assert_array_equal(input_arr, output)
+    @pytest.mark.parametrize("bitorder", ["big", "little"])
+    @pytest.mark.parametrize("parallel", [False, True])
+    def test_unpack_buffered(self, nbits, bitorder, parallel):  # vs. NumPy reference
+        rng = np.random.default_rng()
+        arr = rng.integers(256, size=2**10, dtype=np.uint8)  # high is exclusive
+        expected = unpack_bits(arr, nbits, bitorder)
+        output = np.zeros(arr.size * 8 // nbits, dtype=np.uint8)
+        numbits.unpack_buffered(
+            arr, output, nbits=nbits, bitorder=bitorder, parallel=parallel
+        )
+        np.testing.assert_array_equal(output, expected, strict=True)
 
-def test_numbits():
+    @pytest.mark.parametrize("bitorder", ["big", "little"])
+    @pytest.mark.parametrize("parallel", [False, True])
+    def test_unpack_buffered_invalid(self, bitorder, parallel):  # nbits=3 must raise
+        arr = np.arange(255, dtype=np.uint8)
+        output = np.zeros(arr.size * 8 // 3, dtype=np.uint8)
+        with pytest.raises(ValueError):
+            numbits.unpack_buffered(
+                arr, output, nbits=3, bitorder=bitorder, parallel=parallel
+            )
 
-    """
-    Test the pack and unpack functions from the numbits package.
-    """
+    @pytest.mark.parametrize("nbits", [1, 2, 4])
+    @pytest.mark.parametrize("bitorder", ["big", "little"])
+    @pytest.mark.parametrize("parallel", [False, True])
+    def test_unpack_buffered_empty(self, nbits, bitorder, parallel):  # empty in/out
+        arr = np.empty((0,), dtype=np.uint8)
+        output = np.empty((0,), dtype=np.uint8)
+        numbits.unpack_buffered(
+            arr, output, nbits=nbits, bitorder=bitorder, parallel=parallel
+        )
+        np.testing.assert_array_equal(output, arr, strict=True)
 
-    a = np.arange(255, dtype="uint8")
+    @pytest.mark.parametrize("nbits", [1, 2, 4])
+    @pytest.mark.parametrize("bitorder", ["big", "little"])
+    @pytest.mark.parametrize("parallel", [False, True])
+    def test_pack(self, nbits, bitorder, parallel):  # vs. NumPy reference
+        rng = np.random.default_rng()
+        arr = rng.integers(1 << nbits, size=2**10, dtype=np.uint8)  # high is exclusive
+        expected = pack_bits(arr, nbits, bitorder)
+        output = numbits.pack(arr, nbits=nbits, bitorder=bitorder, parallel=parallel)
+        np.testing.assert_array_equal(output, expected, strict=True)
+
+    @pytest.mark.parametrize("bitorder", ["big", "little"])
+    @pytest.mark.parametrize("parallel", [False, True])
+    def test_pack_invalid(self, bitorder, parallel):  # nbits=3 must raise
+        arr = np.arange((1 << 3) - 1, dtype=np.uint8)
+        with pytest.raises(ValueError):
+            numbits.pack(arr, nbits=3, bitorder=bitorder, parallel=parallel)
+
+    @pytest.mark.parametrize("nbits", [1, 2, 4])
+    @pytest.mark.parametrize("bitorder", ["big", "little"])
+    @pytest.mark.parametrize("parallel", [False, True])
+    def test_pack_empty(self, nbits, bitorder, parallel):  # empty in/out
+        arr = np.empty((0,), dtype=np.uint8)
+        output = numbits.pack(arr, nbits=nbits, bitorder=bitorder, parallel=parallel)
+        np.testing.assert_array_equal(output, arr, strict=True)
 
-    b1 = numbits.unpack(a, nbits=1)
-    c1 = numbits.pack(b1, nbits=1)
-    np.allclose(a, c1)
+    @pytest.mark.parametrize("nbits", [1, 2, 4])
+    @pytest.mark.parametrize("bitorder", ["big", "little"])
+    @pytest.mark.parametrize("parallel", [False, True])
+    def test_pack_buffered(self, nbits, bitorder, parallel):  # vs. NumPy reference
+        rng = np.random.default_rng()
+        arr = rng.integers(1 << nbits, size=2**10, dtype=np.uint8)  # high is exclusive
+        expected = pack_bits(arr, nbits, bitorder)
+        output = np.zeros(arr.size * nbits // 8, dtype=np.uint8)
+        numbits.pack_buffered(
+            arr, output, nbits=nbits, bitorder=bitorder, parallel=parallel
+        )
+        np.testing.assert_array_equal(output, expected, strict=True)
 
-    b2 = numbits.unpack(a, nbits=2)
-    c2 = numbits.pack(b2, nbits=2)
-    np.allclose(a, c2)
+    @pytest.mark.parametrize("bitorder", ["big", "little"])
+    @pytest.mark.parametrize("parallel", [False, True])
+    def test_pack_buffered_invalid(self, bitorder, parallel):  # nbits=3 must raise
+        arr = np.arange((1 << 3) - 1, dtype=np.uint8)
+        output = np.zeros(arr.size * 3 // 8, dtype=np.uint8)
+        with pytest.raises(ValueError):
+            numbits.pack_buffered(
+                arr, output, nbits=3, bitorder=bitorder, parallel=parallel
+            )
 
-    b4 = numbits.unpack(a, nbits=4)
-    c4 = numbits.pack(b4, nbits=4)
-    np.allclose(a, c4)
\ No newline at end of file
+    @pytest.mark.parametrize("nbits", [1, 2, 4])
+    @pytest.mark.parametrize("bitorder", ["big", "little"])
+    @pytest.mark.parametrize("parallel", [False, True])
+    def test_pack_buffered_empty(self, nbits, bitorder, parallel):  # empty in/out
+        arr = np.empty((0,), dtype=np.uint8)
+        output = np.empty((0,), dtype=np.uint8)
+        numbits.pack_buffered(
+            arr, output, nbits=nbits, bitorder=bitorder, parallel=parallel
+        )
+        np.testing.assert_array_equal(output, arr, strict=True)
+
+    @pytest.mark.parametrize("nbits", [1, 2, 4])
+    @pytest.mark.parametrize("bitorder", ["big", "little"])
+    @pytest.mark.parametrize("parallel", [False, True])
+    def test_pack_unpack(self, nbits, bitorder, parallel):  # round trip
+        rng = np.random.default_rng()
+        arr = rng.integers(256, size=2**10, dtype=np.uint8)  # high is exclusive
+        output = numbits.pack(
+            numbits.unpack(arr, nbits=nbits, bitorder=bitorder, parallel=parallel),
+            nbits=nbits,
+            bitorder=bitorder,
+            parallel=parallel,
+        )
+        np.testing.assert_array_equal(output, arr, strict=True)
+
+    @pytest.mark.parametrize("nbits", [1, 2, 4])
+    @pytest.mark.parametrize("bitorder", ["big", "little"])
+    @pytest.mark.parametrize("parallel", [False, True])
+    def test_pack_unpack_buffered(self, nbits, bitorder, parallel):  # round trip
+        rng = np.random.default_rng()
+        arr = rng.integers(256, size=2**10, dtype=np.uint8)  # high is exclusive
+        tmp = np.zeros(arr.size * 8 // nbits, dtype=np.uint8)
+        numbits.unpack_buffered(
+            arr, tmp, nbits=nbits, bitorder=bitorder, parallel=parallel
+        )
+        output = np.zeros_like(arr)
+        numbits.pack_buffered(
+            tmp, output, nbits=nbits, bitorder=bitorder, parallel=parallel
+        )
+        np.testing.assert_array_equal(output, arr, strict=True)