---
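# Build a reusable Slurm compute-node image on Jetstream/TACC: boot a CentOS 7
# base instance, provision it with OpenHPC and the Slurm client stack, snapshot
# it, then remove the builder instance. The playbook assumes it runs on the
# cluster headnode (it reads the local 10.0.0.x IP, slurm UID, munge key, and
# slurm.conf from the control host).
#
# A typical invocation (assuming clouds.yaml sits beside this playbook; the
# exact setup is an assumption, not fixed by this file):
#   ansible-playbook compute_build_base_img.yml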
- hosts: localhost
  vars:
    compute_base_image: "JS-API-Featured-CentOS7-May-20-2019"
    sec_group_global: "{{ clouds.tacc.auth.username }}-global-ssh"
    sec_group_internal: "{{ clouds.tacc.auth.username }}-cluster-internal"
    compute_base_size: "m1.small"
    network_name: "{{ clouds.tacc.auth.username }}-elastic-net"
    JS_ssh_keyname: "{{ clouds.tacc.auth.username }}-{{ clouds.tacc.auth.project_name }}-slurm-key"
  vars_files:
    - clouds.yaml
  tasks:
    - name: build compute base instance
      os_server:
        timeout: 300
        state: present
        name: "compute-{{ clouds.tacc.auth.username }}-base-instance"
        cloud: "tacc"
        image: "{{ compute_base_image }}"
        key_name: "{{ JS_ssh_keyname }}"
        security_groups: "{{ sec_group_global }},{{ sec_group_internal }}"
        flavor: "{{ compute_base_size }}"
        meta: { compute: "base" }
        auto_ip: "no"
        user_data: |
          #cloud-config
          packages: []
          package_update: false
          package_upgrade: false
          package_reboot_if_required: false
          final_message: "Boot completed in $UPTIME seconds"
        network: "{{ network_name }}"
        wait: yes
      register: os_host
    - debug:
        var: os_host
    - name: add compute instance to inventory
      add_host:
        name: "{{ os_host['openstack']['name'] }}"
        groups: "compute-base"
        ansible_host: "{{ os_host.openstack.private_v4 }}"
    - name: pause for ssh to come up
      pause:
        seconds: 90
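# Second play: configure the freshly booted instance as an OpenHPC/Slurm
# compute node, pulling shared state (headnode IP, slurm UID, munge key,
# slurm.conf) from the headnode running this playbook.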
- hosts: compute-base
  vars:
    compute_base_package_list:
      - "libselinux-python"
      - "telnet"
      - "bind-utils"
      - "vim"
      - "openmpi-gnu-ohpc"
      - "ohpc-slurm-client"
      - "lmod-ohpc"
  tasks:
    - name: Get the headnode private IP
      local_action:
        module: shell ip addr | grep -Eo '10\.0\.0\.[0-9]+' | head -1
      register: headnode_private_ip
      become: False # run as the regular user; no sudo on localhost
    - name: Get the slurmctld uid
      local_action:
        module: shell getent passwd slurm | awk -F':' '{print $3}'
      register: headnode_slurm_uid
      become: False # run as the regular user; no sudo on localhost
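    # headnode_private_ip feeds the NFS fstab entries below; headnode_slurm_uid
    # lets the compute node's slurm user match the headnode's UID, so
    # slurm-owned files on the shared filesystems keep consistent ownership.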
    - name: Add OpenHPC 1.3.5 repo
      yum:
        name: "https://github.com/openhpc/ohpc/releases/download/v1.3.GA/ohpc-release-1.3-1.el7.x86_64.rpm"
        state: present
        lock_timeout: 900
    - name: remove environment modules package
      yum:
        name: "environment-modules"
        state: absent
        lock_timeout: 300
    - name: install basic packages
      yum:
        name: "{{ compute_base_package_list }}"
        state: present
        lock_timeout: 300
    - name: fix slurm user uid
      user:
        name: slurm
        uid: "{{ headnode_slurm_uid.stdout }}"
        shell: "/sbin/nologin"
        home: "/etc/slurm"
    - name: change ownership of slurm files
      file:
        path: "{{ item }}"
        owner: slurm
        group: slurm
      with_items:
        - "/var/log/slurm_jobacct.log"
        - "/var/spool/slurm"
        - "/var/spool/slurm/ctld"
    - name: disable selinux
      selinux: state=permissive policy=targeted
# - name: allow use_nfs_home_dirs
# seboolean: name=use_nfs_home_dirs state=yes persistent=yes
    - name: import /home on compute nodes
      lineinfile:
        dest: /etc/fstab
        line: "{{ headnode_private_ip.stdout }}:/home /home nfs defaults,nfsvers=4.0 0 0"
        state: present
    - name: ensure /opt/ohpc/pub exists
      file: path=/opt/ohpc/pub state=directory mode=777 recurse=yes
    - name: import /opt/ohpc/pub on compute nodes
      lineinfile:
        dest: /etc/fstab
        line: "{{ headnode_private_ip.stdout }}:/opt/ohpc/pub /opt/ohpc/pub nfs defaults,nfsvers=4.0 0 0"
        state: present
    - name: ensure /export exists
      file: path=/export state=directory mode=777
    - name: import /export on compute nodes
      lineinfile:
        dest: /etc/fstab
        line: "{{ headnode_private_ip.stdout }}:/export /export nfs defaults,nfsvers=4.0 0 0"
        state: present
    - name: fix sda1 mount in fstab
      lineinfile:
        dest: /etc/fstab
        regex: "/ xfs defaults"
        line: "/dev/sda1 / xfs defaults 0 0"
        state: present
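    # These fstab entries take effect on the next boot of an instance built
    # from this image; the commented-out "mount -a" task at the end of this
    # play is the alternative if the mounts are needed immediately.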
    - name: add local users to compute node
      script: /tmp/add_users.sh
      ignore_errors: True
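    # Push cluster-wide files from the headnode: the munge key (the shared
    # authentication secret - it must be identical on every node), slurm.conf,
    # and the Slurm prolog script.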
    - name: copy munge key from headnode
      synchronize:
        mode: push
        src: /etc/munge/munge.key
        dest: /etc/munge/munge.key
        set_remote_user: no
        use_ssh_args: yes
    - name: fix perms on munge key
      file:
        path: /etc/munge/munge.key
        owner: munge
        group: munge
        mode: 0600
    - name: copy slurm.conf from headnode
      synchronize:
        mode: push
        src: /etc/slurm/slurm.conf
        dest: /etc/slurm/slurm.conf
        set_remote_user: no
        use_ssh_args: yes
    - name: copy slurm_prolog.sh from headnode
      synchronize:
        mode: push
        src: /usr/local/sbin/slurm_prolog.sh
        dest: /usr/local/sbin/slurm_prolog.sh
        set_remote_user: no
        use_ssh_args: yes
    - name: enable munge
      service: name=munge.service enabled=yes
    - name: enable slurmd
      service: name=slurmd enabled=yes
    # For reference, /etc/systemd/system/multi-user.target.wants/slurmd.service
    # ships as:
    # [Unit]
    # Description=Slurm node daemon
    # After=network.target munge.service   <- changed below to: sshd.service remote-fs.target
    # ConditionPathExists=/etc/slurm/slurm.conf
    #
    # [Service]
    # Type=forking
    # EnvironmentFile=-/etc/sysconfig/slurmd
    # ExecStart=/usr/sbin/slurmd $SLURMD_OPTIONS
    # ExecReload=/bin/kill -HUP $MAINPID
    # PIDFile=/var/run/slurmd.pid
    # KillMode=process
    # LimitNOFILE=51200
    # LimitMEMLOCK=infinity
    # LimitSTACK=infinity
    # Delegate=yes
    #
    # [Install]
    # WantedBy=multi-user.target
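    # slurmd must not start until sshd is up and the NFS shares above are
    # mounted, so rewrite the unit's After= line and add a matching Requires=.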
    - name: change slurmd service "After" to sshd and remote filesystems
      command: sed -i 's/network.target/sshd.service remote-fs.target/' /usr/lib/systemd/system/slurmd.service
    - name: add slurmd service "Requires" of sshd and remote filesystems
      # match '^After=' here: the previous task already rewrote
      # 'After=network.target', so an /After=network/ address would no
      # longer match anything
      command: sed -i '/^After=/aRequires=sshd.service remote-fs.target' /usr/lib/systemd/system/slurmd.service
    # - name: mount -a on compute nodes
    #   command: "mount -a"
- hosts: localhost
  vars_files:
    - clouds.yaml
  tasks:
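    # Snapshot the provisioned instance into the compute image, then tear the
    # builder instance down. compute_take_snapshot.sh prompts for the OpenStack
    # password, which expect supplies from clouds.yaml.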
    - name: create compute instance snapshot
      expect:
        command: ./compute_take_snapshot.sh
        responses:
          Password: "{{ clouds.tacc.auth.password }}"
        timeout: null # disables the expect timeout; it governs the wait for the Password prompt, not a separate whole-command limit
      no_log: true # keeps OS_PASSWORD out of logs - comment this out and re-run if you need to see error output!
    - name: remove compute instance
      os_server:
        timeout: 200
        state: absent
        name: "compute-{{ clouds.tacc.auth.username }}-base-instance"
        cloud: "tacc"