forked from WEHI-ResearchComputing/rag
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathollama-submit.sh
executable file
·42 lines (34 loc) · 1.66 KB
/
ollama-submit.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/bin/bash
#SBATCH -c 12 --mem 24G
#SBATCH --gres gpu:1 -p gpuq
#SBATCH --output ollama-server.log
#SBATCH --prefer P100 # use P100s for now

# Slurm batch script that starts an Ollama server inside an Apptainer
# container on the allocated GPU node.
#
# Usage: sbatch ollama-submit.sh
#
# Job output is written to ollama-server.log (see --output above). The server
# is told to listen on the node named by SLURM_NODELIST, and models are stored
# under the submitting user's scratch area.
#
# Environment read:
#   USER            - used to build the per-user scratch paths
#   SLURM_NODELIST  - node name passed to the server as OLLAMA_HOST
#   TMPDIR          - optional; bound to /tmp in the container when set

set -eu

# Fail early with a clear message when not running inside a Slurm job,
# instead of a bare "unbound variable" error further down.
: "${SLURM_NODELIST:?SLURM_NODELIST is not set; submit this script with sbatch}"

module purge
module load apptainer/1.2.3

ollama_models="/vast/scratch/users/${USER}/ollama-models"
ollama_tmp="/vast/scratch/users/${USER}/tmp"
mkdir -p -- "$ollama_models" "$ollama_tmp"

# Host directory mounted as /tmp inside the container: the job's TMPDIR when
# it is defined, otherwise the per-user scratch tmp directory created above
# (under `set -u` an unset TMPDIR would otherwise abort the job).
container_tmp="${TMPDIR:-$ollama_tmp}"

# Relevant Ollama environment variables to set:
#   OLLAMA_DEBUG              Show additional debug information (e.g. OLLAMA_DEBUG=1)
#   OLLAMA_HOST               IP Address for the ollama server (default 127.0.0.1:11434)
#   OLLAMA_KEEP_ALIVE         The duration that models stay loaded in memory (default "5m")
#   OLLAMA_MAX_LOADED_MODELS  Maximum number of loaded models (default 1)
#   OLLAMA_MAX_QUEUE          Maximum number of queued requests
#   OLLAMA_MODELS             The path to the models directory
#   OLLAMA_NUM_PARALLEL       Maximum number of parallel requests (default 1)
#   OLLAMA_NOPRUNE            Do not prune model blobs on startup
#   OLLAMA_ORIGINS            A comma separated list of allowed origins
#   OLLAMA_TMPDIR             Location for temporary files
#   OLLAMA_FLASH_ATTENTION    Enable flash attention
#   OLLAMA_LLM_LIBRARY        Set LLM library to bypass autodetection
#   OLLAMA_MAX_VRAM           Maximum VRAM
#
# The settings are defined once here and reused for the explicit --env flags
# below, so the job environment and the container cannot drift apart.
export OLLAMA_HOST="$SLURM_NODELIST"
export OLLAMA_MODELS="$ollama_models"
export OLLAMA_MAX_LOADED_MODELS=2

# --nv enables NVIDIA GPU support in the container; /vast and /stornext are
# bound so the models directory (and other site storage) is visible inside.
apptainer exec --nv \
  -B "${container_tmp}:/tmp" \
  -B /vast,/stornext \
  --env "OLLAMA_HOST=${OLLAMA_HOST}" \
  --env "OLLAMA_MODELS=${OLLAMA_MODELS}" \
  --env "OLLAMA_MAX_LOADED_MODELS=${OLLAMA_MAX_LOADED_MODELS}" \
  oras://ghcr.io/wehi-researchcomputing/rag:0.1.0 \
  ollama serve