Skip to content

Commit

Permalink
Merge branch 'ComputeCanada:main' into generalize_local_users
Browse files Browse the repository at this point in the history
  • Loading branch information
mboisson authored Jan 24, 2025
2 parents 8229a66 + 820953c commit a200f41
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 25 deletions.
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ The `profile::` sections list the available classes, their role and their parame
- [`profile::rsyslog::server`](#profilersyslogserver)
- [`profile::vector`](#profilervector)
- [`profile::slurm::base`](#profileslurmbase)
- [`profile::slurm::accounting`](#profileslurmaccounting)
- [`profile::slurm::controller`](#profileslurmcontroller)
- [`profile::slurm::node`](#profileslurmnode)
Expand Down Expand Up @@ -659,6 +660,20 @@ For VGPU, the driver source is cloud provider specific and has to be specified
via either `profile::gpu::install::vgpu::rpm::source` for rpms or
`profile::gpu::install::vgpu::bin::source` for binary installer.

### parameters

| Variable | Description | Type |
| :--------------------- | :------------------------------------------------------------- | :------------ |
| `restrict_profiling` | Restrict access to NVIDIA GPU Performance Counters to root | Boolean |

<details>
<summary>default values</summary>

```yaml
profile::gpu::restrict_profiling: false
```
</details>

## `profile::jupyterhub::hub`

> JupyterHub is a multi-user server for Jupyter Notebooks. It is designed to support many users by
Expand Down Expand Up @@ -957,6 +972,22 @@ When `profile::slurm::base` is included, these classes are included too:
- [`profile::consul`](#profileconsul)
- [`profile::base::powertools`](#profilebasepowertools)

## `profile::slurm::node`

This class configures the Slurm compute nodes, i.e. the nodes where `slurmd` runs.

### parameters

| Variable            | Description                                                      | Type          |
| :------------------ | :--------------------------------------------------------------- | :------------ |
| `pam_access_groups` | Groups whose members can access the node regardless of Slurm jobs | Array[String] |

<details>
<summary>default values</summary>

```yaml
profile::slurm::node::pam_access_groups: ['wheel']
```
</details>


## `profile::slurm::accounting`

Expand Down
1 change: 1 addition & 0 deletions data/common.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ profile::reverse_proxy::subdomains:
profile::jupyterhub::hub::register_url: "https://mokey.%{lookup('terraform.data.domain_name')}/auth/signup"
profile::jupyterhub::hub::reset_pw_url: "https://mokey.%{lookup('terraform.data.domain_name')}/auth/forgotpw"

profile::gpu::restrict_profiling: false
profile::gpu::install::passthrough::packages:
- nvidia-driver-cuda-libs
- nvidia-driver
Expand Down
77 changes: 58 additions & 19 deletions site/profile/manifests/gpu.pp
Original file line number Diff line number Diff line change
@@ -1,37 +1,54 @@
class profile::gpu {
class profile::gpu (
Boolean $restrict_profiling,
) {
if $facts['nvidia_gpu_count'] > 0 {
require profile::gpu::install
if ! $facts['nvidia_grid_vgpu'] {
service { 'nvidia-persistenced':
ensure => 'running',
enable => true,
}
service { 'nvidia-dcgm':
ensure => 'running',
enable => true,
}
} else {
service { 'nvidia-gridd':
ensure => 'running',
enable => true,
}
}
include profile::gpu::install
include profile::gpu::services
}
}

class profile::gpu::install (
String $lib_symlink_path = undef
String $lib_symlink_path = undef,
) {
$restrict_profiling = lookup('profile::gpu::restrict_profiling')
ensure_resource('file', '/etc/nvidia', { 'ensure' => 'directory' })
ensure_packages(['kernel-devel'], { 'name' => "kernel-devel-${facts['kernelrelease']}" })
ensure_packages(['kernel-headers'], { 'name' => "kernel-headers-${facts['kernelrelease']}" })
ensure_packages(['dkms'], { 'require' => [Package['kernel-devel'], Yumrepo['epel']] })
$nvidia_kmod = ['nvidia', 'nvidia_modeset', 'nvidia_drm', 'nvidia_uvm']

# Ensure the 'nvidia-gpu' SELinux module is present, using the nvidia-gpu.pp
# file served from the profile module as its source.
# NOTE(review): source_pp presumably expects a compiled policy package rather
# than a Puppet manifest, despite the .pp extension — confirm.
selinux::module { 'nvidia-gpu':
ensure => 'present',
source_pp => 'puppet:///modules/profile/gpu/nvidia-gpu.pp',
}

# Modprobe configuration file that receives the options of the nvidia kernel
# module (see the file_line resource targeting this path).
# It is a plain configuration file, not a program, so it is created without
# execute bits: 0644 instead of 0755.
file { '/etc/modprobe.d/nvidia.conf':
  ensure => file,
  owner  => 'root',
  group  => 'root',
  mode   => '0644',
}

# Ensure /etc/modprobe.d/nvidia.conf contains the
# NVreg_RestrictProfilingToAdminUsers option line for the nvidia module, with
# its value set to $restrict_profiling converted to an Integer.
# NOTE(review): the option is presumably only read when the kernel module is
# loaded, which would explain why a change triggers a service stop and a driver
# unload — confirm.
file_line { 'nvidia_restrict_profiling':
path => '/etc/modprobe.d/nvidia.conf',
# An existing line for this option is matched and replaced by `line`.
match => '^options nvidia NVreg_RestrictProfilingToAdminUsers',
line => "options nvidia NVreg_RestrictProfilingToAdminUsers=${Integer($restrict_profiling)}",
# The target file must be managed before the line is edited.
require => File['/etc/modprobe.d/nvidia.conf'],
# A change to the line refreshes both execs; Exec['unload_nvidia_drivers']
# itself requires Exec['stop_nvidia_services'].
notify => [
Exec['stop_nvidia_services'],
Exec['unload_nvidia_drivers'],
],
}

# Unload the NVIDIA kernel modules listed in $nvidia_kmod with a single rmmod
# call, in the reverse of the order in which they are listed.
# The command runs only when the resource is notified (refreshonly) and only if
# /proc/modules has an entry for the "nvidia" module. It is ordered after
# Exec['stop_nvidia_services'] and notifies Kmod::Load[$nvidia_kmod].
exec { 'unload_nvidia_drivers':
command => sprintf('rmmod %s', $nvidia_kmod.reverse.join(' ')),
# Skip the unload when the nvidia module is not currently loaded.
onlyif => 'grep -qE "^nvidia " /proc/modules',
refreshonly => true,
require => Exec['stop_nvidia_services'],
notify => Kmod::Load[$nvidia_kmod],
path => ['/bin', '/sbin'],
}

if ! $facts['nvidia_grid_vgpu'] {
include profile::gpu::install::passthrough
Class['profile::gpu::install::passthrough'] -> Exec['dkms_nvidia']
Expand All @@ -42,7 +59,6 @@

# Binary installer do not build drivers with DKMS
$installer = lookup('profile::gpu::install::vgpu::installer', undef, undef, '')
$nvidia_kmod = ['nvidia', 'nvidia_drm', 'nvidia_modeset', 'nvidia_uvm']
if ! $facts['nvidia_grid_vgpu'] or $installer != 'bin' {
exec { 'dkms_nvidia':
command => "dkms autoinstall -m nvidia -k ${facts['kernelrelease']}",
Expand Down Expand Up @@ -78,6 +94,7 @@
Package<| tag == profile::gpu::install |> ~> Exec['nvidia-symlink']
Exec<| tag == profile::gpu::install::vgpu::bin |> ~> Exec['nvidia-symlink']
}
Kmod::Load[$nvidia_kmod] ~> Service<| tag == profile::gpu::services |>
}

class profile::gpu::install::passthrough (
Expand Down Expand Up @@ -298,3 +315,25 @@
source => $gridd_source,
}
}

# Manages the NVIDIA services that match the kind of GPU exposed to the node.
#
# When the nvidia_grid_vgpu fact is false or unset, nvidia-persistenced and
# nvidia-dcgm are managed; otherwise nvidia-gridd is. The selected services are
# kept running and enabled.
class profile::gpu::services {
  if ! $facts['nvidia_grid_vgpu'] {
    $gpu_services = ['nvidia-persistenced', 'nvidia-dcgm']
  } else {
    $gpu_services = ['nvidia-gridd']
  }

  service { $gpu_services:
    ensure => 'running',
    enable => true,
  }

  # Stops the selected services, last listed first. It only runs when another
  # resource notifies it (refreshonly) and when `systemctl is-active` succeeds
  # for the selected services.
  exec { 'stop_nvidia_services':
    command     => sprintf('systemctl stop %s', $gpu_services.reverse.join(' ')),
    onlyif      => sprintf('systemctl is-active %s', $gpu_services.reverse.join(' ')),
    refreshonly => true,
    path        => ['/usr/bin'],
  }

  # The services are managed only after the resources tagged as driver
  # installation (packages, or the execs of the vGPU binary installer).
  # Both relationships target Service[$gpu_services]: no Exec resources are
  # titled after the services, so chaining to Exec[$gpu_services] would refer to
  # resources that are not declared.
  Package<| tag == profile::gpu::install |> -> Service[$gpu_services]
  Exec<| tag == profile::gpu::install::vgpu::bin |> -> Service[$gpu_services]
}
13 changes: 8 additions & 5 deletions site/profile/manifests/slurm.pp
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,7 @@
# Slurm node class. This is where slurmd is ran.
class profile::slurm::node (
Boolean $enable_tmpfs_mounts = true,
Array[String] $pam_access_groups = ['wheel'],
) {
contain profile::slurm::base

Expand Down Expand Up @@ -581,17 +582,19 @@
require => Pam['Add pam_slurm_adopt']
}

$access_conf = '
$access_conf = @(END)
# Allow root cronjob
+ : root : cron crond :0 tty1 tty2 tty3 tty4 tty5 tty6
# Allow admin to connect, deny all other
+:wheel:ALL
# Allow other groups if any
<% $pam_access_groups.each | $group | { %>
+:<%= $group %>:ALL
<% } %>
-:ALL:ALL
'
|END

file { '/etc/security/access.conf':
ensure => present,
content => $access_conf
content => inline_epp($access_conf, { 'pam_access_groups' => $pam_access_groups }),
}

selinux::module { 'sshd_pam_slurm_adopt':
Expand Down
4 changes: 3 additions & 1 deletion site/profile/manifests/users.pp
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,9 @@
ensure => present,
condition => "User ${name}",
key => 'AuthenticationMethods',
value => $authenticationmethods
value => $authenticationmethods,
target => '/etc/ssh/sshd_config.d/50-authenticationmethods.conf',
notify => Service['sshd']
}
}
}

0 comments on commit a200f41

Please sign in to comment.