# The RamaLama configuration file specifies all of the available configuration
# command-line options/flags for RamaLama, but in a TOML format that can be
# easily modified and versioned.
#
# Please refer to ramalama.conf(5) for details of all configuration options.
# Not all container engines implement all of the options.
#
# All of the options have hard-coded defaults; the settings in this file
# override those built-in defaults, and users can in turn override these
# settings on the command line. RamaLama reads ramalama.conf files in up to
# three locations, in the following order:
#
#  1. /usr/share/ramalama/ramalama.conf
#  2. /etc/ramalama/ramalama.conf
#  3. $XDG_CONFIG_HOME/ramalama/ramalama.conf, or
#     $HOME/.config/ramalama/ramalama.conf if $XDG_CONFIG_HOME is not set
#
# Items specified in a later ramalama.conf, if present, override settings from
# an earlier ramalama.conf as well as the default settings.
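#
# For example (illustrative, with an assumed value), a user-level override in
# $HOME/.config/ramalama/ramalama.conf that only raises the prompt context
# size could contain:
#
#   [ramalama]
#   ctx_size = 4096
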
[ramalama]

# OCI model car image
# Image to use when building and pushing --type=car models
#
#carimage = "registry.access.redhat.com/ubi9-micro:latest"

# Run RamaLama in the default container.
#
#container = true

# Size of the prompt context (0 = loaded from model)
#
#ctx_size = 2048

# Run RamaLama using the specified container engine.
# Valid options: podman, docker
#
#engine = "podman"

# OCI container image to run with the specified AI model
#
#image = "quay.io/ramalama/ramalama:latest"

# IP address for llama.cpp to listen on.
#
#host = "0.0.0.0"

# Pass `--group-add keep-groups` to Podman when Podman is the container engine.
# In some cases this is needed to access the GPU from a rootless container.
#
#keep_groups = false

# Default number of layers offloaded to the GPU
#
#ngl = 999

# Specify the default port for services to listen on
#
#port = "8080"

# Specify the AI runtime to use.
# Valid options: llama.cpp, vllm (default: llama.cpp)
#
#runtime = "llama.cpp"

# Store AI Models in the specified directory
#
#store = "$HOME/.local/share/ramalama"

# Temperature of the response from the AI model.
# llama.cpp explains this as:
#
#   The lower the number, the more deterministic the response.
#   The higher the number, the more creative the response, but it is more
#   likely to hallucinate when set too high.
#
# Usage: lower values suit virtual assistants that need deterministic
# responses; higher values suit roleplay or creative tasks such as editing
# stories.
#
#temp = 0.8
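#
# For example (illustrative values, not project defaults), a deterministic
# assistant setup might use:
#
#   temp = 0.1
#
# while a creative-writing setup might use:
#
#   temp = 1.2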

# Specify the default transport to be used for pulling and pushing AI Models.
# Valid options: oci, ollama, huggingface
#
#transport = "ollama"
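#
# For example (illustrative), to make Hugging Face the default source for
# pulls and pushes:
#
#   transport = "huggingface"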