diff --git a/.github/workflows/compare-post-benchmark.yaml b/.github/workflows/compare-post-benchmark.yaml
new file mode 100644
index 00000000..0e6655f4
--- /dev/null
+++ b/.github/workflows/compare-post-benchmark.yaml
@@ -0,0 +1,202 @@
+name: "Run Compare Post Benchmark"
+on: [push, pull_request]
+#on:
+#  workflow_run:
+#    workflows: [Tests]
+#    types:
+#      - in_progress
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  benchmark:
+    # Job that checks out pyuvsim, sets up a conda environment with the necessary dependencies,
+    # then locally installs pyuvsim and additionally installs pytest-benchmark and requests from
+    # PyPI. Runs in parallel as a matrix over the individual reference simulations. The "id"
+    # input is passed as a flag to pytest, which parametrizes the reference simulation test
+    # function. pytest-benchmark output is saved as an artifact with the current workflow run
+    # and attempt as part of its name key.
+    #
+    # Discussion of artifacts:
+    # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/storing-and-sharing-data-from-a-workflow#about-workflow-artifacts
+    # https://github.com/actions/upload-artifact
+    #
+    # uncomment when Tests is working and done testing
+    #if: github.event.workflow_run.conclusion == 'success'
+    name: Performance Benchmark
+    env:
+      ENV_NAME: pyuvsim_tests_mpich
+      PYTHON: "3.12"
+    runs-on: ubuntu-latest
+
+    strategy:
+      # all jobs should run in parallel
+      matrix:
+        id: [1.1_uniform, 1.1_gauss, 1.1_mwa, 1.2_uniform, 1.2_gauss, 1.3_uniform, 1.3_gauss]
+
+    defaults:
+      run:
+        # Adding -l {0} helps ensure conda can be found properly.
+        shell: bash -l {0}
+    steps:
+      - uses: actions/checkout@main
+
+      - name: Setup Miniforge
+        uses: conda-incubator/setup-miniconda@v3
+        with:
+          miniforge-version: latest
+          python-version: ${{ env.PYTHON }}
+          environment-file: ci/${{ env.ENV_NAME }}.yaml
+          activate-environment: ${{ env.ENV_NAME }}
+          run-post: false
+
+      - name: Conda Info
+        run: |
+          conda info -a
+          conda list
+          PYVER=`python -c "import sys; print('{:d}.{:d}'.format(sys.version_info.major, sys.version_info.minor))"`
+          if [[ $PYVER != $PYTHON ]]; then
+            exit 1;
+          fi
+
+      # pip install the benchmark utility and requests from PyPI, and install pyuvsim locally
+      - name: Install
+        run: |
+          pip install pytest-benchmark
+          pip install requests
+          pip install .
+
+      # make the artifacts directory, then run pytest using mpiexec with only 1 node and core,
+      # specifying the reference simulation to run using the "refsim" flag. Save the
+      # pytest-benchmark output in artifacts/ with a sufficiently unique name.
+      - name: Run benchmark
+        run: |
+          mkdir artifacts/
+          mpiexec -n 1 -np 1 pytest --refsim=${{ matrix.id }} --benchmark-only --benchmark-json artifacts/output_${{ matrix.id }}.json -s
+
+      # upload the benchmark output as an artifact with a name key corresponding to the current
+      # workflow run and attempt; only store artifacts for 1 day
+      - name: Upload result artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.id }}
+          path: artifacts/
+          if-no-files-found: error
+          include-hidden-files: true
+          retention-days: 1
+
+  collate-post-benchmark:
+    # Job that loads the saved artifacts corresponding to the specific workflow run and attempt
+    # id, then creates a net benchmark output file named output.json with a python script
+    # action.
+    # The net benchmark file should still be accurate, except that the explicit machine info
+    # will be mostly lost. The net benchmark file is then fed to github-action-benchmark, which
+    # compares the current benchmark output with the latest data in the gh-pages branch. If the
+    # current workflow is a push to main, github-action-benchmark then pushes the current
+    # benchmark output to gh-pages. If a performance regression occurs during the benchmark
+    # comparison, a comment with the benchmark comparison output is made on the workflow and
+    # this job fails.
+    #
+    # Inspired by this workflow by yewstack/yew and the github-action-benchmark README:
+    # https://github.com/yewstack/yew/blob/master/.github/workflows/benchmark.yml
+    # https://github.com/yewstack/yew/blob/master/.github/workflows/post-benchmark.yml
+    # https://github.com/benchmark-action/github-action-benchmark
+    # https://github.com/actions/download-artifact
+
+    name: Concatenate and Post Benchmark Results
+    needs: benchmark
+    runs-on: ubuntu-latest
+
+    steps:
+      # Checkout repo for the github-action-benchmark action
+      - uses: actions/checkout@v4
+
+      # setup python
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.x'
+
+      # only downloads artifacts from the current workflow run and attempt via the pattern
+      # matching; loads the saved benchmark artifacts from running the benchmark matrix into
+      # artifacts/
+      - name: Download result artifacts
+        uses: actions/download-artifact@v4
+        with:
+          github-token: "${{ secrets.GITHUB_TOKEN }}"
+          pattern: ${{ github.run_id }}-${{ github.run_attempt }}-*
+          merge-multiple: true
+          path: artifacts
+
+      # prints directory info recursively; removable
+      # (could maybe swap this to exa or lsd because base ls doesn't do tree)
+      - name: Display structure of downloaded files
+        run: ls -R
+
+      # approach to putting all the benchmark output in one file, with the machine/run info of
+      # only one of the pytest-benchmark runs. Loads all the benchmark output artifact files,
+      # then takes the benchmark timing info from the other n-1 files and adds it to the first
+      # file. With this approach, the benchmark comparison output is only a single table, and
+      # we only have one comment on alert in the workflow.
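+      # (For reference, a pytest-benchmark JSON file holds top-level "machine_info",
+      # "commit_info", and "benchmarks" keys, so the merged output.json built below keeps the
+      # machine/commit info of the first file read and gains one "benchmarks" entry per
+      # reference simulation.)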
+      - uses: jannekem/run-python-script-action@v1
+        with:
+          script: |
+            import os
+            import json
+
+            # make a list of paths to the artifact files, excluding hidden files
+            filepath_arr = [os.path.join('artifacts', bench) for bench in os.listdir('artifacts') if not bench.startswith('.')]
+            print(filepath_arr)
+
+            output_jsons = []
+
+            # open each filepath in filepath_arr, load it as a json, and append it to the list
+            for filepath in filepath_arr:
+                with open(filepath) as f:
+                    output_jsons.append(json.load(f))
+
+            # choose the first json as the one to modify to contain all the benchmark data
+            net_json = output_jsons[0]
+
+            # iterate through the other jsons and append their benchmark data to net_json
+            for json_out in output_jsons[1:]:
+                net_json['benchmarks'].append(json_out['benchmarks'][0])
+
+            # save net_json as output.json in the current working directory
+            with open('output.json', 'w') as f:
+                json.dump(net_json, f)
+
+      # Print the GitHub event_name and ref_name, and the boolean check for whether gh-pages
+      # should be updated
+      - name: Print Event, Ref, and Upload Boolean
+        run: |
+          echo "Event Name: ${{ github.event_name }}"
+          echo "Ref Name: ${{ github.ref_name }}"
+          echo "Update gh-pages: ${{ github.event_name == 'push' && github.ref_name == 'main' }}"
+
+      # Compares the data from the specified "output-file-path" with the latest data from the
+      # gh-pages branch. If a performance regression occurs, fails the test and alerts. Will
+      # only comment if a performance regression has occurred.
+      # NOTE: it is important that this does not modify gh-pages on pull requests
+      # https://github.com/benchmark-action/github-action-benchmark?tab=readme-ov-file#caveats
+      # This only updates gh-pages if a push to main occurs
+      - name: Compare benchmarks
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          # What benchmark tool output.json came from
+          tool: 'pytest'
+          # Where the output from the benchmark tool is stored
+          output-file-path: output.json
+          # Alert (and fail) if the benchmark is at least 20% slower than the stored data;
+          # a regression of that size should fail consistently
+          alert-threshold: "120%"
+          # Workflow will fail when an alert happens
+          fail-on-alert: true
+          # Comment on the PR if the branch is not a fork
+          comment-on-alert: true
+          # Enable Job Summary for PRs
+          summary-always: true
+          # Always leave a comment
+          #comment-always: true
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          # Push and deploy GitHub pages branch automatically
+          auto-push: ${{ github.event_name == 'push' && github.ref_name == 'main' }}
+          save-data-file: ${{ github.event_name == 'push' && github.ref_name == 'main' }}
diff --git a/README.md b/README.md
index 448e9797..7c0317fe 100644
--- a/README.md
+++ b/README.md
@@ -127,6 +127,10 @@ One other package, pytest-xdist, is not required, but can be used to speed up ru
 the test suite by running tests in parallel. To use it call pytest with the
 ```-n auto``` option.
 
+Two additional packages, pytest-benchmark and requests, are required if you need to locally run
+single-core regression testing of the reference simulations. For more realistic benchmarking at
+any level of scale, and for instructions on regression testing with pytest, see
+[Benchmarking](https://pyuvsim.readthedocs.io/en/latest/developers.html#benchmarking).
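+
+For example, a single reference simulation can be benchmarked locally with the same invocation
+the CI workflow uses (any of the reference simulation IDs listed in the docs can replace
+1.1_uniform):
+
+```
+mpiexec -n 1 -np 1 pytest --refsim=1.1_uniform --benchmark-only
+```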
+
 One way to ensure you have all the needed packages is to use the included
 `environment.yaml` file to create a new environment that will contain
 all the optional dependencies along with dependencies required for
diff --git a/docs/developers.rst b/docs/developers.rst
index 6e19c5b2..8c58bf0c 100644
--- a/docs/developers.rst
+++ b/docs/developers.rst
@@ -31,6 +31,9 @@ For more details, see `reference_simulations/README.md
 `_.
+
+Running a Reference Simulation with pytest-benchmark
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To run a single-core regression test of the reference simulations, you need to specify a
+reference simulation with the ``refsim`` flag and use ``--benchmark-only``. Additionally, you
+need to use mpiexec to run pytest as follows:
+
+.. code-block:: console
+
+   # use mpiexec to run pytest, specifying one core
+   > mpiexec -n 1 -np 1 pytest --refsim=1.1_uniform --benchmark-only
+
+Here "1.1_uniform" is the specific reference simulation being tested. You can use the ``refsim``
+flag multiple times to parametrize multiple reference simulations:
+``--refsim=refsim1 --refsim=refsim2``.
+
+We run single-core regression tests of the available reference simulations with pytest and
+pytest-benchmark via our GitHub CI workflow on every push and pull request, to ensure output
+and runtime consistency. As we only run the simulations with a single core, the benchmarking
+aspect of these tests is only relevant for linear operations and is not a test of any
+parallelism.
+
+The available ``refsim`` values are:
+
+* 1.1_uniform
+* 1.1_gauss
+* 1.1_mwa
+* 1.2_uniform
+* 1.2_gauss
+* 1.3_uniform
+* 1.3_gauss
diff --git a/environment.yml b/environment.yml
index 97c83da7..47bb4f50 100644
--- a/environment.yml
+++ b/environment.yml
@@ -15,6 +15,8 @@ dependencies:
   - pytest
   - pytest-cov>=5.0.0
   - pytest-xdist
+  - pytest-benchmark
+  - requests
   - pyuvdata>=3.1.2
   - pyyaml>=5.4.1
   - scipy>=1.8
diff --git a/src/pyuvsim/data/baseline_lite.csv b/src/pyuvsim/data/baseline_lite.csv
new file mode 100644
index 00000000..b9cfa199
--- /dev/null
+++ b/src/pyuvsim/data/baseline_lite.csv
@@ -0,0 +1,6 @@
+Name Number BeamID E N U
+
+ANT1 0 0 0.0000 0.0000 0.0000
+ANT2 1 0 50.000 0.0000 0.0000
+ANT3 2 0 0.0000 -50.00 0.0000
+ANT4 3 0 26.000 -26.00 0.0000
diff --git a/src/pyuvsim/data/bl_lite_gauss.yaml b/src/pyuvsim/data/bl_lite_gauss.yaml
new file mode 100644
index 00000000..8684972a
--- /dev/null
+++ b/src/pyuvsim/data/bl_lite_gauss.yaml
@@ -0,0 +1,6 @@
+beam_paths:
+  0 : !AnalyticBeam
+    class: GaussianBeam
+    sigma: 0.08449
+telescope_location: (-30.72152777777791, 21.428305555555557, 1073.0000000093132)
+telescope_name: BLLITE
diff --git a/src/pyuvsim/data/bl_lite_uniform.yaml b/src/pyuvsim/data/bl_lite_uniform.yaml
new file mode 100644
index 00000000..d534300b
--- /dev/null
+++ b/src/pyuvsim/data/bl_lite_uniform.yaml
@@ -0,0 +1,5 @@
+beam_paths:
+  0 : !AnalyticBeam
+    class: UniformBeam
+telescope_location: (-30.72152777777791, 21.428305555555557, 1073.0000000093132)
+telescope_name: BLLITE
diff --git a/src/pyuvsim/data/mwa88_nocore_config_MWA.yaml b/src/pyuvsim/data/mwa88_nocore_config_MWA.yaml
new file mode 100644
index 00000000..75921c5d
--- /dev/null
+++ b/src/pyuvsim/data/mwa88_nocore_config_MWA.yaml
@@ -0,0 +1,7 @@
+beam_paths:
+  0: !UVBeam
+    filename: mwa_full_embedded_element_pattern.h5
+    path_variable: pyuvsim.data.DATA_PATH
+    pixels_per_deg: 1
+telescope_location: (-30.72152777777791, 21.428305555555557, 1073.0000000093132)
+telescope_name: MWA
diff --git a/src/pyuvsim/data/mwa88_nocore_config_gauss.yaml b/src/pyuvsim/data/mwa88_nocore_config_gauss.yaml
new file mode 100644
index 00000000..892b71fb
--- /dev/null
+++ b/src/pyuvsim/data/mwa88_nocore_config_gauss.yaml
@@ -0,0 +1,6 @@
+beam_paths:
+  0 : !AnalyticBeam
+    class: GaussianBeam
+    sigma: 0.08449
+telescope_location: (-30.72152777777791, 21.428305555555557, 1073.0000000093132)
+telescope_name: MWA
diff --git a/src/pyuvsim/data/test_catalogs/letter_R_12pt_2458098.38824015.txt b/src/pyuvsim/data/test_catalogs/letter_R_12pt_2458098.38824015.txt
new file mode 100644
index 00000000..21bae804
--- /dev/null
+++ b/src/pyuvsim/data/test_catalogs/letter_R_12pt_2458098.38824015.txt
@@ -0,0 +1,13 @@
+SOURCE_ID RA_ICRS [deg] Dec_ICRS [deg] Flux [Jy] Frequency [Hz]
+HERATEST5 59.37045 -28.778843 1 100000000.0
+HERATEST6 57.08925 -28.74223 1 100000000.0
+HERATEST12 59.38125 -27.778828 1 100000000.0
+HERATEST13 58.25100 -27.765359 1 100000000.0
+HERATEST21 59.39115 -26.779049 1 100000000.0
+HERATEST22 58.27125 -26.765736 1 100000000.0
+HERATEST23 57.15150 -26.743624 1 100000000.0
+HERATEST30 59.40120 -25.779269 1 100000000.0
+HERATEST31 57.18090 -25.744495 1 100000000.0
+HERATEST39 59.41035 -24.779242 1 100000000.0
+HERATEST40 58.30965 -24.766704 1 100000000.0
+HERATEST41 57.20820 -24.744905 1 100000000.0
diff --git a/src/pyuvsim/data/test_catalogs/mock_catalog_heratext_2458098.38824015.txt b/src/pyuvsim/data/test_catalogs/mock_catalog_heratext_2458098.38824015.txt
new file mode 100644
index 00000000..5cc5360b
--- /dev/null
+++ b/src/pyuvsim/data/test_catalogs/mock_catalog_heratext_2458098.38824015.txt
@@ -0,0 +1,44 @@
+SOURCE_ID RA_ICRS [deg] Dec_ICRS [deg] Flux [Jy] Frequency [Hz]
+HERATEST0 68.48535 -28.559917 1 100000000.0
+HERATEST1 66.21075 -28.669444 1 100000000.0
+HERATEST2 63.93300 -28.742866 1 100000000.0
+HERATEST3 62.79210 -28.76516 1 100000000.0
+HERATEST4 61.65180 -28.779055 1 100000000.0
+HERATEST5 59.37045 -28.778843 1 100000000.0
+HERATEST6 57.08925 -28.74223 1 100000000.0
+HERATEST7 54.81165 -28.668388 1 100000000.0
+HERATEST8 52.53720 -28.558443 1 100000000.0
+HERATEST9 68.41275 -27.564489 1 100000000.0
+HERATEST10 66.15885 -27.671835 1 100000000.0
+HERATEST11 63.90090 -27.743365 1 100000000.0
+HERATEST12 59.38125 -27.778828 1 100000000.0
+HERATEST13 58.25100 -27.765359 1 100000000.0
+HERATEST14 54.86385 -27.670802 1 100000000.0
+HERATEST15 52.60995 -27.563048 1 100000000.0
+HERATEST16 68.34300 -26.568897 1 100000000.0
+HERATEST17 67.22640 -26.625843 1 100000000.0
+HERATEST18 66.10875 -26.674063 1 100000000.0
+HERATEST19 63.87120 -26.744231 1 100000000.0
+HERATEST20 62.75160 -26.766141 1 100000000.0
+HERATEST21 59.39115 -26.779049 1 100000000.0
+HERATEST22 58.27125 -26.765736 1 100000000.0
+HERATEST23 57.15150 -26.743624 1 100000000.0
+HERATEST24 54.91395 -26.673054 1 100000000.0
+HERATEST25 53.79645 -26.624634 1 100000000.0
+HERATEST26 52.67985 -26.56749 1 100000000.0
+HERATEST27 68.27595 -25.573194 1 100000000.0
+HERATEST28 66.06075 -25.676232 1 100000000.0
+HERATEST29 63.84210 -25.745088 1 100000000.0
+HERATEST30 59.40120 -25.779269 1 100000000.0
+HERATEST31 57.18090 -25.744495 1 100000000.0
+HERATEST32 54.96225 -25.675246 1 100000000.0
+HERATEST33 52.74720 -25.571818 1 100000000.0
+HERATEST34 68.21160 -24.577407 1 100000000.0
+HERATEST35 66.01455 -24.678446 1 100000000.0
+HERATEST36 63.81510 -24.745485 1 100000000.0
+HERATEST37 62.71350 -24.76709 1 100000000.0
+HERATEST38 61.61280 -24.779435 1 100000000.0
+HERATEST39 59.41035 -24.779242 1 100000000.0
+HERATEST40 58.30965 -24.766704 1 100000000.0
+HERATEST41 57.20820 -24.744905 1 100000000.0
+HERATEST42 53.90925 -24.630704 1 100000000.0
diff --git a/src/pyuvsim/data/test_catalogs/two_distant_points_2458098.38824015.txt b/src/pyuvsim/data/test_catalogs/two_distant_points_2458098.38824015.txt
new file mode 100644
index 00000000..b0d11243
--- /dev/null
+++ b/src/pyuvsim/data/test_catalogs/two_distant_points_2458098.38824015.txt
@@ -0,0 +1,3 @@
+SOURCE_ID RA_ICRS [deg] Dec_ICRS [deg] Flux [Jy] Frequency [Hz]
+TWOPOINT0 108.05159 -0.9894784 1 100000000.0
+TWOPOINT1 291.49730 -1.0600652 1 100000000.0
diff --git a/src/pyuvsim/data/test_config/obsparam_ref_1.1_gauss.yaml b/src/pyuvsim/data/test_config/obsparam_ref_1.1_gauss.yaml
new file mode 100644
index 00000000..bba05d8f
--- /dev/null
+++ b/src/pyuvsim/data/test_config/obsparam_ref_1.1_gauss.yaml
@@ -0,0 +1,20 @@
+filing:
+  outdir: '.'
+  outfile_name: 'ref_1.1_gauss'
+  output_format: 'uvh5'
+freq:
+  Nfreqs: 1
+  channel_width: 80000.0
+  start_freq: 100000000.0
+sources:
+  catalog: '../test_catalogs/mock_catalog_heratext_2458098.38824015.txt'
+telescope:
+  array_layout: '../mwa_nocore_layout.csv'
+  telescope_config_name: '../mwa88_nocore_config_gauss.yaml'
+time:
+  Ntimes: 1
+  integration_time: 11.0
+  start_time: 2458098.38824015
+ordering:
+  conjugation_convention: ant1