diff --git a/ChangeLog b/ChangeLog index a34e5f3..4ee800a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ +- Add a bindexec command. - Add the variable SINGCVMFS_LOGDIR to override the location of the cvmfs logs. +- Stop using $TMPDIR as a temporary variable name in cvmfsexec because it + might be already set and exported. cvmfsexec-4.42 - 24 September 2024 - Add rhel9-aarch64 and rhel9-ppc64le machine types. diff --git a/README.md b/README.md index 04da9da..a95d67d 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,10 @@ do this in 4 different ways: unprivileged user namespaces enabled, this can also be used with unprivileged singularity or apptainer. +In addition, this package contains a related tool called +[bindexec](#bindexec) which starts a new user namespace with given +bind mounts added. + # Supported operating systems Operating systems currently supported by this package are Red Hat @@ -356,3 +360,40 @@ $ mkfs.ext3 -F -O ^has_journal -d tmp scratch.img By default the cvmfs logs are written to a top-level `log` directory, alongside the top-level `dist` directory. The variable `SINGCVMFS_LOGDIR` can be used to write them to a different directory, which will be created if it doesn't exist. + +# bindexec + +As a bonus, this package also includes a separate tool called `bindexec` +that accepts any set of bind mounts to add into a new unprivileged user +mount namespace. The usage is much like `cvmfsexec` except that instead +of cvmfs repository names you give it `src:dest` pairs where `src` is a +source directory or file and `dest` is a destination path. For example: + +``` +$ bindexec /etc/motd:/var/lib/mydir/motd -- ls /var/lib/mydir +motd +``` + +Like `cvmfsexec`, if no command is supplied after `--` it runs an +interactive shell. + +Bind mounts require target destinations to exist, but if they are +missing `bindexec` will automatically create them. 
This requires the +fuse-overlayfs command to be in the PATH, although if there is demand +for it a script for making that easily distributable as well will be +supplied (probably through a `makedist` option). + +Some system directories (`/proc`, `/sys`, `/dev`, and `/run`) are +included as-is on top of the overlay so anything bound into those +directories will not appear. In addition, any `nfs` filesystem types +are automatically added on top of the overlay because they don't work +properly through overlay, so no bind mounts will appear in those paths +either. + +`bindexec` always creates a new process namespace because that's the +easiest way to make sure that the fuse-overlayfs process will exit when +the command exits. This means that processes start over at pid 1 and no +process can be seen outside of the namespace. Also because it is using +an unprivileged user namespace, any files owned by anyone other than the +current user will show up as being owned by `nobody` (just as it does in +`cvmfsexec`). diff --git a/bindexec b/bindexec new file mode 100755 index 0000000..dc213df --- /dev/null +++ b/bindexec @@ -0,0 +1,170 @@ +#!/bin/bash +# Add bind mounts in a user namespace and change to that space. +# Requires being able to run unshare -rm and the ability to do fuse mounts +# (kernel >= 4.18) and requires fuse-overlayfs. +# Written by Dave Dykstra November 2024, based heavily on cvmfsexec. + +#set -x +#PS4='c$$+ ' + +VERSION=4.42 + +usage() +{ + echo "Usage: bindexec [-v] [src:dest ...] 
-- [command]" >&2 + echo " Bind mount each src to dest in new user mount namespace" >&2 + echo " -v: print current version and exit" >&2 + exit 1 +} + +# needed for pivot_root +PATH=$PATH:/usr/sbin + +TMPD="$(mktemp -d /dev/shm/bindexec.XXXXXXXXXX)" +trap "rm -rf $TMPD" 0 # note that trap does not carry past exec +STARTFIFO=$TMPD/start +WAITFIFO=$TMPD/wait +mkfifo $STARTFIFO $WAITFIFO + +# bash syntax {NAME}<&N doesn't work on older bashes such as the +# version 3.2.x on macOS Big Sur, and in fact it fails with an error +# message but not an error code, so test for it first to be able to +# gracefully die + +if [ -n "$({TESTX}<&0 2>&1)" ]; then + echo "Cannot assign file descriptors to variables, bash version too old" >&2 + exit 1 +fi + +# make a copy of stdin fd, for sending to the final command +exec {STDINCOPYFD}<&0 + +ORIGPWD=$PWD + +# can't use OPTIND because it can't distinguish between -- there or missing +NOPTS=0 +while getopts "v" OPTION; do + let NOPTS+=1 + case $OPTION in + v) echo "$VERSION" + exit + ;; + \?) usage + ;; + esac +done +shift $NOPTS + +BINDS="" +for ARG; do + if [ "$ARG" == "--" ]; then + break + fi + if [[ "$ARG" != *:* ]]; then + usage + fi + BINDS="$BINDS $ARG" + shift +done + +if [ "$ARG" != "--" ]; then + usage +fi +shift + +ORIGUID="$(id -u)" +ORIGGID="$(id -g)" + +# Note that within the HERE document, unprotected $ substitutions are +# done by the surrounding shell, and \$ is within the unshare shell +unshare -rm -pf /bin/bash /dev/stdin "${@:-$SHELL}" <&2 + fi + mount --bind \$SRC $TMPD/upper/\$DST + done + + # Leave this bash running as PID 1, because most other + # programs won't handle signals & child reaping correctly. + # Note that all other processes in the namespaces will get + # a SIGKILL when PID 1 exits. 
+ trap "" 1 2 3 15 # ignore all ordinary signals + + fuse-overlayfs -o lowerdir=/,upperdir=$TMPD/upper,workdir=$TMPD/work $TMPD/overlay 2> >(grep -v lazytime >&2) + # put original system dirs on top of the overlay + mount -t proc proc $TMPD/overlay/proc + mount --rbind /sys $TMPD/overlay/sys + mount --rbind /dev $TMPD/overlay/dev + mount --rbind /run $TMPD/overlay/run + + # also overlay nfs mounts because they don't work through overlay + mount|while read FROM X TO X TYPE REST; do + if [[ \$TYPE = nfs* ]]; then + mkdir -p $TMPD/overlay/\$TO + mount --bind \$TO $TMPD/overlay/\$TO + fi + done + + # Start a second fake root namespace so we don't interfere with the + # fuse-overlayfs mount space when we do the pivot_root. + # Quoting the HERE document's delimiter makes this nested shell not + # interpret $ substitutions, but the previous one still does, so + # \$ is needed wherever the first shell must not expand. + unshare -rm /bin/bash /dev/stdin "\${@:-$SHELL}" <<'!EOF-2!' + #set -x + #PS4='c\$$+ ' + + ( + # This is a background process for setting up the child's uid map + trap "" 1 2 3 15 # ignore ordinary signals + read PID + # set up uid/gid map + echo "$ORIGGID 0 1" >/proc/"\$PID"/gid_map + echo "$ORIGUID 0 1" >/proc/"\$PID"/uid_map + echo "ready" >$WAITFIFO + ) <$STARTFIFO & + + # Change to the new root. Would use chroot but it doesn't work. + mount --rbind $TMPD/overlay $TMPD/overlay # pivot_root requires this + cd $TMPD/overlay + mkdir -p .old-root + pivot_root . .old-root + cd $ORIGPWD + + # Finally, start the user namespace with the original uid/gid + # This HERE document is also quoted and so the shell does not expand + exec unshare -U /bin/bash /dev/stdin "\${@:-$SHELL}" <<'!EOF-3!' + #set -x + #PS4='c\$$+ ' + + # now in the user namespace + + echo "\$$" >$STARTFIFO + # wait for the uid/gid maps to be set up + read X <$WAITFIFO + + exec "\$@" <&$STDINCOPYFD $STDINCOPYFD<&- +!EOF-3! + +!EOF-2! + +!EOF-1!
diff --git a/cvmfsexec b/cvmfsexec index 7de4894..6f5fae4 100755 --- a/cvmfsexec +++ b/cvmfsexec @@ -39,13 +39,13 @@ elif [ "$MAJORKERN" -eq 3 -a "$MINORKERN" -eq 10 -a "$REVKERN" -ge 1127 ]; then USERFUSE=true fi -TMPDIR=$(mktemp -d) -trap "rm -rf $TMPDIR" 0 # note that trap does not carry past exec -CMDFIFO1=$TMPDIR/cmd1 -WAITFIFO1=$TMPDIR/wait1 -CMDFIFO2=$TMPDIR/cmd2 -WAITFIFO2=$TMPDIR/wait2 -FUNCS=$TMPDIR/funcs +TMPD=$(mktemp -d) +trap "rm -rf $TMPD" 0 # note that trap does not carry past exec +CMDFIFO1=$TMPD/cmd1 +WAITFIFO1=$TMPD/wait1 +CMDFIFO2=$TMPD/cmd2 +WAITFIFO2=$TMPD/wait2 +FUNCS=$TMPD/funcs # create the fifos used for interprocess communication mkfifo $CMDFIFO1 $WAITFIFO1 $CMDFIFO2 $WAITFIFO2 @@ -238,7 +238,7 @@ else fi ./umountrepo $REPO >/dev/null done - rm -rf $TMPDIR + rm -rf $TMPD ) & fi @@ -252,7 +252,7 @@ unshare -rm $UNSHAREOPTS /bin/bash /dev/stdin "${@:-$SHELL}" <