diff --git a/images/base/Dockerfile b/images/base/Dockerfile
index e9440cb1f1..8d8ed8d7f6 100644
--- a/images/base/Dockerfile
+++ b/images/base/Dockerfile
@@ -97,6 +97,11 @@ RUN echo "Enabling / Disabling services ... " \
 RUN echo "Ensuring /etc/kubernetes/manifests" \
     && mkdir -p /etc/kubernetes/manifests
 
+# Used as mount points for private copies of proc and sys filesystems in entrypoint.
+RUN echo "Ensuring /kind/private" \
+    && mkdir -p /kind/private/proc /kind/private/sys \
+    && chmod 0700 /kind/private /kind/private/proc /kind/private/sys
+
 # shared stage to setup go version for building binaries
 # NOTE we will be cross-compiling for performance reasons
 # This is also why we start again FROM the same base image but a different
diff --git a/images/base/files/usr/local/bin/entrypoint b/images/base/files/usr/local/bin/entrypoint
index 015036481c..c819527c98 100755
--- a/images/base/files/usr/local/bin/entrypoint
+++ b/images/base/files/usr/local/bin/entrypoint
@@ -172,13 +172,24 @@ fix_mount() {
     sync
   fi
 
+  # Mount sysfs and proc as read-write, on a known, but kind specific location.
+  # This allows bind mounting below, and is also required to run some workloads
+  # which need to mount proc and sysfs themselves (this avoids the proc and
+  # sysfs mounts being "masked", as far as the kernel is concerned).
+  # XXX, better ref for fs_fully_visible than kernel code?
+  # https://github.com/torvalds/linux/commit/1b852bceb0d1
+  log_info 'mounting /kind/private filesystems'
+  mount -t sysfs -o rw sysfs /kind/private/sys
+  mount -t proc -o rw proc /kind/private/proc
+
   log_info 'remounting /sys read-only'
   # systemd-in-a-container should have read only /sys
   # https://systemd.io/CONTAINER_INTERFACE/
   # however, we need other things from `docker run --privileged` ...
   # and this flag also happens to make /sys rw, amongst other things
   #
-  # This step is ignored when running inside UserNS, because it fails with EACCES.
+  # This step is ignored when running inside UserNS, because it can fail with
+  # EACCES.
   if ! mount -o remount,ro /sys; then
     if [[ -n "$userns" ]]; then
       log_info 'UserNS: ignoring mount fail'
@@ -187,6 +198,24 @@ fix_mount() {
     fi
   fi
 
+  log_info 'making /proc/sys read-only, with known sysctls read-write'
+  mount --rbind -o ro /proc/sys /proc/sys
+  # These are the sysctls known to be namespaced in the kernel, list taken from Kubernetes:
+  # https://github.com/kubernetes/kubernetes/blob/master/staging/src/k8s.io/component-helpers/node/util/sysctl/namespace.go
+  # In addition the kubelet attempts to set some sysctl to particular settings, we allow those:
+  # https://github.com/search?q=repo%3Akubernetes/kubernetes%20setupKernelTunables&type=code
+  for mount_point in \
+    kernel/shmall kernel/shmmax kernel/shmmni kernel/shm_rmid_forced kernel/msgmax kernel/msgmnb kernel/msgmni \
+    fs/mqueue \
+    net \
+    vm/overcommit_memory vm/panic_on_oom kernel/panic kernel/panic_on_oops \
+    kernel/keys/root_maxkeys kernel/keys/root_maxbytes; do
+    # Test with -e, not -f: fs/mqueue and net are directories, which -f would skip.
+    if [[ -e /kind/private/proc/sys/"${mount_point}" ]]; then
+      mount --bind -o rw /kind/private/proc/sys/"${mount_point}" /proc/sys/"${mount_point}"
+    fi
+  done
+
   log_info 'making mounts shared'
   # for mount propagation
   mount --make-rshared /