Skip to content

Commit

Permalink
compute-engine gpu: add qs and allreduce runs
Browse files Browse the repository at this point in the history
Signed-off-by: vsoch <[email protected]>
  • Loading branch information
vsoch committed Sep 9, 2024
1 parent 24a1c01 commit 7401973
Show file tree
Hide file tree
Showing 34 changed files with 1,348 additions and 328 deletions.
8 changes: 2 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@ This study will test HPC application performance across three clouds. The reposi
- [Amazon Web Services](experiments/aws) includes Parallel Cluster (EC2), and EKS (Kubernetes) for each of CPU and GPU
- [Microsoft Azure](experiments/azure) includes CycleCloud (VMs), and AKS (Kubernetes) for each of CPU and GPU.

## Timing

This is a checklist for the setups we have tested and timed:

## Experiments

### "Bare Metal"
Expand Down Expand Up @@ -44,14 +40,14 @@ This is a checklist for the setups we have tested and timed:
- [x] size 64 (vsoch done 8/26/2024)
- [x] size 128 (vsoch done 8/27/2024)
- [x] size 256 (vsoch done 8/27/2024)
- [ ] Google Compute Engine GPU
- [x] Google Compute Engine GPU
- done on llnl-flux
- [x] New VM and automation needed with Terraform (vsoch, early 9/2024)
- [x] size 4 (vsoch 9/6/2024)
- [x] size 8 (vsoch 9/7/2024)
- [x] size 16 (vsoch 9/8/2024)
- [x] size 32 (vsoch 9/8/2024)
- [ ] quicksilver and osu all reduce need runs at all sizes.
- [x] quicksilver and osu all reduce need runs at all sizes (vsoch 9/9/2024)

### Kubernetes

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,20 +104,30 @@ cd flux-sched
./configure --prefix=/usr --sysconfdir=/etc
make -j 8 && sudo make install && sudo ldconfig

# install openmpi with cuda
# Note that UCX (and a rebuild of open mpi) was done after to get OSU/quicksilver working

# Build UCX from a fresh git checkout under /opt/ucx and install it to /usr.
# The checkout is cloned with sudo, then handed to $USER so the configure and
# compile steps run unprivileged; only the final install uses sudo.
# The build is configured out-of-tree (build/) with CUDA taken from /usr/local/cuda.
cd /opt
sudo git clone https://github.com/openucx/ucx && \
sudo chown -R $USER ./ucx && cd ucx/ && \
git clean -xfd && \
./autogen.sh && mkdir build && cd build && \
../configure --prefix=/usr --enable-debug --with-cuda=/usr/local/cuda --enable-mt --disable-cma && \
make -j && sudo make install

# If already existed - remove
sudo rm -rf /usr/local/pancakes/

# install openmpi with cuda and ucx
cd /opt
sudo mkdir -p /usr/local/pancakes && \
sudo wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.2.tar.gz && \
sudo wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.2.tar.gz || true && \
sudo tar -xzvf openmpi-4.1.2.tar.gz && \
cd openmpi-4.1.2 && \
sudo chown -R $USER $(pwd) && \
./configure --with-cuda --prefix=/usr/local/pancakes && \
make -j 20 && sudo make install

# TODO check these, should be provided in flux environment later
# ENV CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/pancakes/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
# ENV LD_LIBRARY_PATH=/usr/local/pancakes/lib:/opt/miniconda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
make distclean || true && \
mkdir build && cd build
../configure --with-cuda=/usr/local/cuda --with-ucx=/usr/ --prefix=/usr/local/pancakes
make -j && sudo make install

cd /opt

Expand Down Expand Up @@ -384,3 +394,47 @@ sudo apt-get install -y --no-install-recommends --allow-change-held-packages apt
# Register Google's fast-socket apt repository and install the pinned package.
# The write to /etc/apt is done by tee, so tee (not echo) must run under sudo.
echo "deb https://packages.cloud.google.com/apt google-fast-socket main" | sudo tee /etc/apt/sources.list.d/google-fast-socket.list
# apt-key add needs "-" to read the key from stdin instead of a file.
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
sudo apt update && sudo apt install -y --no-install-recommends google-fast-socket=0.0.5


# Install additional apps for bare metal, osu and quicksilver and multi-gpu-models
54 make -j
55 make install
56 cd /opt/osu-benchmark/
57 rm -rf build.openmpi/
58 export OSU_VERSION=5.8
59 mkdir -p build.openmpi && cd build.openmpi && ../src/osu-micro-benchmarks-${OSU_VERSION}/configure CC=mpicc CXX=mpicxx CFLAGS=-I$(pwd)/../src/osu-micro-benchmarks-${OSU_VERSION}/util --prefix=$(pwd) --enable-cuda --with-cuda=/usr/local/cuda && make && make install
60 export PATH=/usr/local/pancakes/bin:$PATH
61 make && make install
62 cd /opt/containers/
63 cd /root

# OSU benchmarks
# Fetch the ULHPC tutorial helpers and the OSU micro-benchmarks tarball, then
# build the benchmarks with CUDA support into /opt/osu-benchmark/build.openmpi.
export OSU_VERSION=5.8
# Put the Open MPI installed under /usr/local/pancakes first on PATH so that
# mpicc/mpicxx resolve to that build rather than any other MPI on the system.
export PATH=/usr/local/pancakes/bin:$PATH
sudo git clone --depth 1 https://github.com/ULHPC/tutorials /opt/tutorials && \
sudo mkdir -p /opt/osu-benchmark && \
sudo chown -R "$USER" /opt/tutorials /opt/osu-benchmark && \
cd /opt/osu-benchmark && \
ln -s /opt/tutorials/parallel/mpi/OSU_MicroBenchmarks ref.d && \
ln -s ref.d/Makefile . && \
ln -s ref.d/scripts . && \
mkdir -p src && \
cd src && \
wget --no-check-certificate "http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-${OSU_VERSION}.tgz" && \
tar xf "osu-micro-benchmarks-${OSU_VERSION}.tgz" && \
cd /opt/osu-benchmark && \
# Compile based on openmpi with cuda/ucx
mkdir -p build.openmpi && cd build.openmpi && \
"../src/osu-micro-benchmarks-${OSU_VERSION}/configure" CC=mpicc CXX=mpicxx CFLAGS="-I$(pwd)/../src/osu-micro-benchmarks-${OSU_VERSION}/util" --prefix="$(pwd)" --enable-cuda --with-cuda=/usr/local/cuda && \
make && make install

# Quicksilver
# Clone to an absolute path so the chown and cd below do not depend on the
# directory the previous step happened to leave us in.
sudo git clone https://github.com/LLNL/Quicksilver /opt/quicksilver
sudo chown -R "$USER" /opt/quicksilver
cd /opt/quicksilver/src
# Fetch the study's GPU Makefile into the source directory that make runs in.
# NOTE(review): assumes this Makefile is meant to replace src/Makefile - confirm.
wget -O Makefile https://raw.githubusercontent.com/converged-computing/performance-study/main/docker/google/gpu/quicksilver/Makefile
# "a || b && c" groups as "(a || b) && c": if make fails, the objects it did
# build are linked by hand with nvcc, and qs is installed after either succeeds.
make || nvcc -DHAVE_CUDA -std=c++11 -O2 -Xptxas -v -gencode=arch=compute_70,code=\"sm_70,compute_70\" --compiler-bindir=/usr/local/pancakes/bin/mpicxx -L/usr/local/cuda/lib64/ -lcuda -lcudart -lm -o qs CollisionEvent.o CoralBenchmark.o CycleTracking.o DecompositionObject.o DirectionCosine.o EnergySpectrum.o GlobalFccGrid.o GridAssignmentObject.o InputBlock.o MCT.o MC_Adjacent_Facet.o MC_Base_Particle.o MC_Domain.o MC_Facet_Crossing_Event.o MC_Fast_Timer.o MC_Load_Particle.o MC_Location.o MC_Particle_Buffer.o MC_RNG_State.o MC_Segment_Outcome.o MC_SourceNow.o MacroscopicCrossSection.o MeshPartition.o MonteCarlo.o MpiCommObject.o NuclearData.o Parameters.o ParticleVault.o ParticleVaultContainer.o PopulationControl.o SendQueue.o SharedMemoryCommObject.o Tallies.o cmdLineParser.o cudaFunctions.o initMC.o main.o parseUtils.o utils.o utilsMpi.o && sudo cp qs /usr/bin/qs

# Multi-gpu-models
# Clone NVIDIA's multi-GPU programming models, build the MPI jacobi example,
# and install the binary to /usr/local/bin.
sudo git clone https://github.com/NVIDIA/multi-gpu-programming-models /opt/multi-gpu-programming-models && \
# The clone is root-owned; hand it to $USER so the unprivileged make can write.
sudo chown -R "$USER" /opt/multi-gpu-programming-models && \
# Use the absolute path: the clone target is under /opt regardless of the cwd.
cd /opt/multi-gpu-programming-models/mpi && \
make && sudo mv jacobi /usr/local/bin
6 changes: 6 additions & 0 deletions experiments/google/compute-engine/gpu/debug/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ And from the outside and within the container.
strace -f flux run -opmi=pmix --env OMPI_COMM_WORLD_LOCAL_RANK=0 -N 1 -n 8 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif /bin/bash -c "/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H" 2> flux-singularity-trace-f.txt
```

### [flux-singularity-ltrace-f.txt](flux-singularity-ltrace-f.txt)

```bash
ltrace -f flux run -opmi=pmix --env OMPI_COMM_WORLD_LOCAL_RANK=0 -N 1 -n 8 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif /bin/bash -c "/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H" 2> flux-singularity-ltrace-f.txt
```

### [flux-singularity-trace-s-f.txt](flux-singularity-trace-s-f.txt)

```bash
Expand Down
85 changes: 6 additions & 79 deletions experiments/google/compute-engine/gpu/size16/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
- Cluster coming up at 4:35pm Mountain, $320/hour
- Cluster coming down at 6:00pm

I'm not running OSU all reduce or quicksilver.

## Experiment

Shell in:
Expand Down Expand Up @@ -270,108 +268,37 @@ mkdir -p $output
# We should do H H - better values across the board
./flux-run-combinations.sh 16 $app

# D D and H H errors (bad results but we ran anyway):
# The call to cuMemHostRegister(0x78407fe00008, 134217728, 0) failed.
# Host: flux-004
# cuMemHostRegister return value: 1
# Registration cache: smcuda

# Note that osu_latency had worse values with D D. H H seems better across the board.
cho "Running iteration $i"

# -d cuda H H/D D slowest and has errors for allreduce

# These were run separately
export app=osu-allreduce
export output=results/$app
mkdir -p $output

# I skipped these for now because we need to debug the GPU issue, don't
# want to spend the money credits on crappy results
# confirmed using all 8 gpu, but just a little, mostly memory (~312MiB)
for i in $(seq 2 2); do

# original command for 4, 2m 36 seconds
time flux run -opmi=pmix -N 8 -n 64 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task \
--setattr=user.study_id=$app-4-DD-iter-$i \
singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif \
bash -c "ulimit -m 9999999999 ; /opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda D D"

# 2m 41 seconds
time flux run -opmi=pmix -N 4 -n 32 -g 1 --setattr=user.study_id=$app-4-HH-iter-$i -o cpu-affinity=per-task -o gpu-affinity=per-task \
singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif \
/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H

# 2m 19 seconds
time flux run -opmi=pmix -N 4 -n 32 -g 1 --setattr=user.study_id=$app-4-iter-$i -o cpu-affinity=per-task -o gpu-affinity=per-task \
singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif \
/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce
# fastest with D D and the OMPI envar.
for i in $(seq 1 5); do
time flux run --env OMPI_COMM_WORLD_LOCAL_RANK=0 --env OMPI_MCA_pml=ucx --env OMPI_MCA_btl=tcp -N 16 -n 128 -g 1 --setattr=user.study_id=$app-16-iter-$i -o cpu-affinity=per-task -o gpu-affinity=per-task /opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H
done

# Not tested yet! There are still errors (and much slower times) with any cuda flags
sflux run --setattr=user.study_id=$app-8-iter-$i -N 8 -n 64 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task \
singularity exec --nv /opt/containers/metric-osu-gpu_google-gpu.sif \
/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce

flux run --setattr=user.study_id=$app-16-iter-$i -N 16 -n 128 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task \
singularity exec --nv /opt/containers/metric-osu-gpu_google-gpu.sif \
/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce

flux run --setattr=user.study_id=$app-32-iter-$i -N 32 -n 256 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task \
singularity exec --nv /opt/containers/metric-osu-gpu_google-gpu.sif \
/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce

# When they are done:
./save.sh $output
oras push ghcr.io/converged-computing/metrics-operator-experiments/performance:compute-engine-gpu-16-$app $output
```

#### Quicksilver

Testing:

```console
# This is the only app that didn't run (I tried a lot of different configs)
# The call to cuMemHostRegister(0x7fbb82200008, 134217728, 0) failed.
# Host: flux-004
# cuMemHostRegister return value: 1
# Registration cache: smcuda

# testing smcuda snake error
flux run -opmi=pmix -o gpu-affinity=per-task --env OMP_NUM_THREADS=1 -o cpu-affinity=per-task -N2 -n 16 -g 1 singularity exec --nv /opt/containers/metric-quicksilver-gpu_google-gpu.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 32 -Y 32 -Z 16 -x 32 -y 32 -z 16 -I 4 -J 2 -K 2 -n 26214400

# only works on one node, ssh is not allowed
mpirun -n 8 --map-by ppr:8:node singularity exec --nv /opt/containers/metric-quicksilver-gpu_google-gpu.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 16 -Y 16 -Z 16 -x 16 -y 16 -z 16 -I 4 -J 4 -K 2 -n 163840

time flux run -o gpu-affinity=per-task -o cpu-affinity=per-task --exclusive --env OMP_NUM_THREADS=1 -N2 -n 16 -g 1 singularity exec --nv /opt/containers/metric-quicksilver-gpu_google-gpu.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 32 -Y 32 -Z 16 -x 32 -y 32 -z 16 -I 4 -J 4 -K 2 -n 26214400
```

Run attempt:

```console
export app=quicksilver
export output=results/$app
mkdir -p $output

# Error:
# --------------------------------------------------------------------------
# The call to cuMemHostRegister(0x7e21e1c00008, 134217728, 0) failed.
# Host: flux-001
# cuMemHostRegister return value: 1
# Registration cache: smcuda
# --------------------------------------------------------------------------

# confirmed using all 8 GPU, 100%, despite error above
# Allowing 10 minutes to see output, and if none, cancelling.
# confirmed using all 8 GPU, 100%
# Allowing 15 minutes to run then cancel
for i in $(seq 1 1); do
echo "Running iteration $i"
# Try this and see if completes
time flux run -o gpu-affinity=per-task -o cpu-affinity=per-task --exclusive --env OMP_NUM_THREADS=1 --setattr=user.study_id=$app-4-iter-$i -N4 -n 32 -g 1 singularity exec --nv /opt/containers/metric-quicksilver-gpu_google-gpu.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 32 -Y 32 -Z 32 -x 32 -y 32 -z 32 -I 4 -J 4 -K 2 -n 52428800
time flux run --exclusive --env OMP_NUM_THREADS=1 --env OMPI_MCA_pml=ucx --env OMPI_MCA_btl=tcp -N 16 -n 128 -g 1 --setattr=user.study_id=$app-16-iter-$i -o cpu-affinity=per-task -o gpu-affinity=per-task qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 64 -Y 64 -Z 32 -x 64 -y 64 -z 32 -I 8 -J 4 -K 4 -n 209715200
done

# When they are done:
./save.sh $output

oras push ghcr.io/converged-computing/metrics-operator-experiments/performance:compute-engine-gpu-16-$app $output
```

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
58.145s: job.exception type=cancel severity=0 interrupted by ctrl-C
58.552s: job.exception type=cancel severity=0 interrupted by ctrl-C
flux-job: task(s) Terminated

# OSU MPI-CUDA Allreduce Latency Test v5.8
# Size Avg Latency(us)
4 337.07
8 333.06
16 334.79
32 340.88
64 333.40
128 341.31
256 343.44
512 356.03
1024 373.93
2048 411.85
4096 451.33
8192 746.66
16384 1038.96
32768 1323.51
65536 1420.76
131072 2002.24
262144 1318.68
524288 2188.49
1048576 3982.10
START OF JOBSPEC
{"resources": [{"type": "node", "count": 16, "with": [{"type": "slot", "count": 8, "with": [{"type": "core", "count": 1}, {"type": "gpu", "count": 1}], "label": "task"}]}], "tasks": [{"command": ["/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce", "-d", "cuda", "H", "H"], "slot": "task", "count": {"per_slot": 1}}], "attributes": {"system": {"duration": 0, "cwd": "/home/sochat1_llnl_gov", "shell": {"options": {"rlimit": {"cpu": -1, "fsize": -1, "data": -1, "stack": 8388608, "core": 0, "nofile": 1048576, "as": -1, "rss": -1, "nproc": -1}, "cpu-affinity": "per-task", "gpu-affinity": "per-task"}}}, "user": {"study_id": "osu-allreduce-16-iter-1"}}, "version": 1}
START OF EVENTLOG
{"timestamp":1725922820.0990932,"name":"init"}
{"timestamp":1725922820.1000268,"name":"starting"}
{"timestamp":1725922820.779249,"name":"shell.init","context":{"service":"501043911-shell-f2cZAePdh","leader-rank":0,"size":16}}
{"timestamp":1725922820.9687486,"name":"shell.start","context":{"taskmap":{"version":1,"map":[[0,16,8,1]]}}}
{"timestamp":1725922878.296582,"name":"shell.task-exit","context":{"localid":1,"rank":65,"state":"Exited","pid":1487,"wait_status":15,"signaled":15,"exitcode":143}}
{"timestamp":1725922878.8372781,"name":"complete","context":{"status":36608}}
{"timestamp":1725922878.837321,"name":"done"}

Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
23.953s: job.exception type=cancel severity=0 interrupted by ctrl-C
24.155s: job.exception type=cancel severity=0 interrupted by ctrl-C
24.348s: job.exception type=cancel severity=0 interrupted by ctrl-C
flux-job: task(s) Terminated

# OSU MPI-CUDA Allreduce Latency Test v5.8
# Size Avg Latency(us)
4 346.93
8 343.36
16 345.98
32 345.99
64 345.13
128 343.23
256 359.02
512 366.02
1024 379.76
2048 423.79
4096 467.01
8192 738.21
16384 1113.73
32768 1372.31
65536 1449.55
131072 2002.28
262144 1318.81
524288 2209.33
1048576 4025.37
START OF JOBSPEC
{"resources": [{"type": "node", "count": 16, "with": [{"type": "slot", "count": 8, "with": [{"type": "core", "count": 1}, {"type": "gpu", "count": 1}], "label": "task"}]}], "tasks": [{"command": ["/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce", "-d", "cuda", "H", "H"], "slot": "task", "count": {"per_slot": 1}}], "attributes": {"system": {"duration": 0, "cwd": "/home/sochat1_llnl_gov", "shell": {"options": {"rlimit": {"cpu": -1, "fsize": -1, "data": -1, "stack": 8388608, "core": 0, "nofile": 1048576, "as": -1, "rss": -1, "nproc": -1}, "cpu-affinity": "per-task", "gpu-affinity": "per-task"}}}, "user": {"study_id": "osu-allreduce-16-iter-2"}}, "version": 1}
START OF EVENTLOG
{"timestamp":1725922887.4867351,"name":"init"}
{"timestamp":1725922887.4880421,"name":"starting"}
{"timestamp":1725922887.6449196,"name":"shell.init","context":{"service":"501043911-shell-f38FgPx4w","leader-rank":0,"size":16}}
{"timestamp":1725922887.6735122,"name":"shell.start","context":{"taskmap":{"version":1,"map":[[0,16,8,1]]}}}
{"timestamp":1725922911.4788885,"name":"shell.task-exit","context":{"localid":0,"rank":16,"state":"Exited","pid":1540,"wait_status":15,"signaled":15,"exitcode":143}}
{"timestamp":1725922911.8023748,"name":"complete","context":{"status":36608}}
{"timestamp":1725922911.8023956,"name":"done"}

Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
28.598s: job.exception type=cancel severity=0 interrupted by ctrl-C
28.759s: job.exception type=cancel severity=0 interrupted by ctrl-C
28.880s: job.exception type=cancel severity=0 interrupted by ctrl-C
flux-job: task(s) Terminated

# OSU MPI-CUDA Allreduce Latency Test v5.8
# Size Avg Latency(us)
4 343.81
8 340.60
16 342.57
32 341.13
64 349.80
128 352.47
256 366.56
512 372.22
1024 389.31
2048 433.20
4096 469.32
8192 772.67
16384 1108.57
32768 1423.65
65536 1486.61
131072 1971.11
262144 1310.67
524288 2192.83
1048576 4009.07
START OF JOBSPEC
{"resources": [{"type": "node", "count": 16, "with": [{"type": "slot", "count": 8, "with": [{"type": "core", "count": 1}, {"type": "gpu", "count": 1}], "label": "task"}]}], "tasks": [{"command": ["/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce", "-d", "cuda", "H", "H"], "slot": "task", "count": {"per_slot": 1}}], "attributes": {"system": {"duration": 0, "cwd": "/home/sochat1_llnl_gov", "shell": {"options": {"rlimit": {"cpu": -1, "fsize": -1, "data": -1, "stack": 8388608, "core": 0, "nofile": 1048576, "as": -1, "rss": -1, "nproc": -1}, "cpu-affinity": "per-task", "gpu-affinity": "per-task"}}}, "user": {"study_id": "osu-allreduce-16-iter-3"}}, "version": 1}
START OF EVENTLOG
{"timestamp":1725922915.0641441,"name":"init"}
{"timestamp":1725922915.0654099,"name":"starting"}
{"timestamp":1725922915.1928067,"name":"shell.init","context":{"service":"501043911-shell-f3LQaYUUF","leader-rank":0,"size":16}}
{"timestamp":1725922915.2170558,"name":"shell.start","context":{"taskmap":{"version":1,"map":[[0,16,8,1]]}}}
{"timestamp":1725922943.6773355,"name":"shell.task-exit","context":{"localid":0,"rank":64,"state":"Exited","pid":1619,"wait_status":15,"signaled":15,"exitcode":143}}
{"timestamp":1725922944.1458526,"name":"complete","context":{"status":36608}}
{"timestamp":1725922944.1458805,"name":"done"}

Loading

0 comments on commit 7401973

Please sign in to comment.