From 5da30c0b04fbfb8b72af0f7437988f2b66a5bb02 Mon Sep 17 00:00:00 2001 From: Tomash Sidei <43379202+tomashibm@users.noreply.github.com> Date: Mon, 29 Aug 2022 20:20:47 +0300 Subject: [PATCH] Implement Cassandra backup and restore. (#418) * First draft of icarus sidecar container usage. * Refactor auth logic to use a separate struct that holds all auth data. * Implement TLS support for the icarus container. Pass the proper JMX credentials for icarus. * Create a skeleton for the cassandrabackup controller. * Fix auth logic. A fresh cluster couldn't init the admin user. * Fix unit tests. * Create a separate icarus client. The generated one had bugs and was inconvenient to work with. * Implement the first draft of the backup controller. Tested and works only with the file storage type. * Track backup progress. Shown as an integer value from 0-100 in status. Do not use float as it's not recommended by the controller-gen tool. Requeue until the backup is finished to keep the status updated (sketched below). * Create the skeleton for the restore controller. * Regenerate manifests. * Move icarus backup-related methods into a separate file. * Add restore methods for the icarus client. * Get the list of restores to later check if there's one existing already. * Fix the check if the backup with the requested snapshot name exists already. Fix typo. * Implement restore logic. Tested with the file storage type only. * Create a serviceaccount with the necessary roles for cassandra pods. Needed for icarus to allow reading k8s secrets. Expose the secret name arg option for icarus to support storage providers other than file. * Add descriptions to the backup CR fields. * Add backup duration option. * Add bandwidth option. * Add concurrentConnections option. * Add dc option. * Add entities, timeout and metadataDirective options. * Add the rest of the backup options. * Generate assets. * Add most of the fields for the restore CRD. * Implement failed backup process restart if the user changed the config and a failed backup exists in icarus. If the backup request is absent in icarus, tell the user to recreate the CR. * Implement failed restore process restart if the user changed the config and a failed restore exists in icarus. If the restore request is absent in icarus, tell the user to recreate the CR. * Implement a validating webhook for cassandrabackup. * Implement a validating webhook for cassandrarestore. * Validate the storage location in both controllers. * Validate duration. * Add more CRD field validations. * Fix docs. * Move related backup search and failed backup reconcile logic into separate functions. * Move status reconcile into a separate func. * Break up the main func into smaller ones. * Split the controller into smaller functions. * Move code around, rename vars and move icarus-related funcs into a separate file. * Move the main restore logic into a separate file. * Refactor restore logic. * Track cluster readiness in the CassandraCluster status. * Use the CassandraCluster readiness status field in the backup and restore controllers to block execution before the cluster becomes ready.
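The progress-tracking bullet above maps onto the usual controller-runtime requeue pattern. Below is a minimal sketch of that pattern only, not the controller code added by this patch; the `progressReconciler` type, the 10-second requeue interval, and the `"COMPLETED"` terminal-state name are illustrative assumptions.

```go
// Sketch only: requeue-until-finished status handling for a CassandraBackup.
package cassandrabackup

import (
	"context"
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	"github.com/ibm/cassandra-operator/api/v1alpha1"
)

// progressReconciler is a trimmed-down, hypothetical stand-in for the real backup reconciler.
type progressReconciler struct {
	client.Client
}

// reconcileProgress writes the latest progress (an integer 0-100) and state into the
// CassandraBackup status and requeues until the backup reaches a terminal state.
func (r *progressReconciler) reconcileProgress(ctx context.Context, backup *v1alpha1.CassandraBackup, progress int, state string) (ctrl.Result, error) {
	backup.Status.Progress = progress
	backup.Status.State = state
	if err := r.Status().Update(ctx, backup); err != nil {
		return ctrl.Result{}, err
	}
	if state != "COMPLETED" { // assumed terminal state name; poll the sidecar again later
		return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
	}
	return ctrl.Result{}, nil
}
```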
* Remove restorationStrategyType as only HARDLINKS can be supported. IMPORT is available only on Cassandra 4 and IN_PLACE can be used only on a node that's down. We support only alive clusters (at least for now). Remove the singlePhase field as we don't plan to support (at least not yet) single-phase restores. For that reason the restorationPhase is also removed. Only INIT can be supported if singlePhase is false. Remove the actualSnapshotTag status field from backup since icarus supports specifying only the tag name without the appended schema version and timestamp. * Add schemaVersion and exactSchemaVersion fields. * Fix updating the active admin secret with the wrong role and password. * Drop support for file storage. * Fix tests and add checks for the icarus container. * Fix lint issues. * Make the backup and restore controllers more testable. Implement new controller initialization for the test manager. Create an icarus mock. Create a simple test for backup logic. * Cover the failure scenario with tests. * Add restore tests. Hardcode the downloaded sstables location on restore. Fix CassandraRestore cleanup in tests. * Add docs. * Implement storage secret validation. * make manifests * Fix a few bugs and descriptions. * Fix tests and lint issues. * Fix not using the duration field. Remove an unused field. * Allow overriding the snapshotTag name. * Build and push the icarus image in CI. * Fix trivy vulnerability issue. * Fix the Dockerfile for icarus. * Run tests against k8s 1.24.2. * Don't run against old k8s versions. * Fix CRD cleanup in the CI script. * Choose the container in `execPod`. It stopped working since we have 2 containers now; the container needs to be chosen for the request to succeed. Make `utils.MergeMap` resilient to nil maps. * Allow more processes during e2e tests. * Rename vars to avoid struct name shadowing. Don't mix value and pointer receiver method declarations. * Fix misuse of util.MergeMap. It relied on a side effect of a bug that populated the map passed as the first argument, but only the resulting map should have the merged elements. The args should not change. * Return a nil map if the inputs are nil in `MergeMap` (see the sketch below). * Use .Before instead of comparing timestamps. * Fix compile errors after merging main. * Revert to running integration tests against 1.20.2. * Don't output debug logs into stdout on failed e2e tests since it became very verbose and hard to read. Users should download the logs from the artifacts on GitHub Actions or look at the /tmp/debug-logs folder if running tests locally. * Use constants to identify storage providers. * Remove commented code. * Don't parse time twice. * Mark the network policies test as Serial since it uses host ports. * Fix the network policies e2e test. Set the correct container name. * Fix circular dependencies. * Fix the networkpolicy for icarus and test it in the networkpolicy test. * Fix deprecated io/ioutil package usage. * Fix the networkpolicy integration test. * Upgrade icarus and re-enable the trivy scanner for the image. * Fix the proxy registry URL. * Replace string literals with constants.
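The two MergeMap bullets above pin down a contract: the inputs must never be mutated, and all-nil inputs should yield a nil map. A minimal sketch of a merge that satisfies that contract follows; it is an illustration only, not the actual `controllers/util` implementation, and the variadic string-to-string signature is an assumption.

```go
package util

// MergeMap returns a new map containing the entries of every non-nil input,
// with later inputs winning on key conflicts. The inputs are never modified,
// and the result is nil when all inputs are nil (sketch of the contract only).
func MergeMap(maps ...map[string]string) map[string]string {
	var merged map[string]string
	for _, m := range maps {
		if m == nil {
			continue
		}
		if merged == nil {
			merged = make(map[string]string, len(m))
		}
		for k, v := range m {
			merged[k] = v
		}
	}
	return merged
}
```

Under these assumptions, `MergeMap(nil, nil)` returns nil, while `MergeMap(a, nil)` returns a copy of `a` and leaves `a` untouched.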
* Apply suggestions from code review Co-authored-by: Craig Ingram * Apply suggestions from code review Co-authored-by: Craig Ingram Co-authored-by: Craig Ingram --- .github/workflows/pull_request.yml | 75 ++++- .github/workflows/release.yml | 71 ++++- Makefile | 5 +- api/v1alpha1/cassandrabackup_types.go | 208 ++++++++++++ api/v1alpha1/cassandrabackup_webhook.go | 124 ++++++++ api/v1alpha1/cassandracluster_types.go | 12 +- api/v1alpha1/cassandracluster_webhook.go | 12 +- api/v1alpha1/cassandrarestore_types.go | 121 +++++++ api/v1alpha1/cassandrarestore_webhook.go | 74 +++++ api/v1alpha1/zz_generated.deepcopy.go | 295 ++++++++++++++++++ .../crds/db.ibm.com_cassandrabackups.yaml | 203 ++++++++++++ ...yaml => db.ibm.com_cassandraclusters.yaml} | 43 +++ .../crds/db.ibm.com_cassandrarestores.yaml | 220 +++++++++++++ cassandra-operator/templates/clusterrole.yaml | 40 +++ cassandra-operator/templates/deployment.yaml | 2 + cassandra-operator/values.yaml | 1 + .../bases/db.ibm.com_cassandrabackups.yaml | 203 ++++++++++++ .../bases/db.ibm.com_cassandraclusters.yaml | 42 +++ .../bases/db.ibm.com_cassandrarestores.yaml | 220 +++++++++++++ config/crd/kustomization.yaml | 1 + controllers/admin_auth.go | 160 +++++----- controllers/cassandra_icarus_container.go | 73 +++++ controllers/cassandra_pods_config_test.go | 2 +- controllers/cassandra_rbac.go | 165 ++++++++++ controllers/cassandra_scaling.go | 8 +- controllers/cassandra_service.go | 7 + controllers/cassandra_statefulset.go | 12 +- controllers/cassandra_tls.go | 45 ++- controllers/cassandrabackup/backup.go | 126 ++++++++ controllers/cassandrabackup/controller.go | 122 ++++++++ controllers/cassandrabackup/icarus.go | 118 +++++++ controllers/cassandrabackup/status.go | 40 +++ controllers/cassandrarestore/controller.go | 141 +++++++++ controllers/cassandrarestore/icarus.go | 141 +++++++++ controllers/cassandrarestore/restore.go | 95 ++++++ controllers/cassandrarestore/status.go | 39 +++ controllers/config/config.go | 6 +- controllers/controller.go | 44 ++- controllers/controller_test.go | 2 +- controllers/defaults.go | 11 + controllers/events/events.go | 28 +- controllers/helpers.go | 3 +- controllers/icarus/backup.go | 141 +++++++++ controllers/icarus/icarus.go | 32 ++ controllers/icarus/restore.go | 140 +++++++++ controllers/keyspaces.go | 2 +- controllers/names/names.go | 12 + controllers/network_policies.go | 9 +- controllers/nodectl/jolokia/jolokia.go | 4 +- controllers/prober/prober.go | 24 +- controllers/reaper.go | 2 +- controllers/reaper/reaper.go | 12 +- controllers/reaper/repair_schedules.go | 10 +- controllers/reaper/repairs.go | 8 +- controllers/role_admin.go | 49 +-- controllers/util/utils.go | 14 +- controllers/webhooks/webhook_certificates.go | 10 +- controllers/webhooks/webhook_validating.go | 68 +++- docs/docs/backup-restore.md | 78 +++++ docs/docs/cassandrabackup-configuration.md | 32 ++ docs/docs/cassandrarestore-configuration.md | 41 +++ icarus/Dockerfile | 13 + main.go | 50 ++- tests/e2e/auth_test.go | 8 +- tests/e2e/managed_regions_test.go | 2 +- tests/e2e/network_policies_test.go | 57 +++- tests/e2e/roles_test.go | 4 +- tests/e2e/suite_test.go | 3 +- tests/e2e/unmanaged_region_test.go | 2 +- tests/e2e/utils_test.go | 36 +-- tests/e2e/zone_as_racks_test.go | 4 +- tests/integration/cassandrabackup_test.go | 141 +++++++++ tests/integration/cassandracluster_test.go | 32 +- tests/integration/cassandrarestore_test.go | 167 ++++++++++ tests/integration/mocks_test.go | 89 ++++++ tests/integration/network_policies_test.go | 16 + 
tests/integration/suite_test.go | 73 ++++- 77 files changed, 4457 insertions(+), 288 deletions(-) create mode 100644 api/v1alpha1/cassandrabackup_types.go create mode 100644 api/v1alpha1/cassandrabackup_webhook.go create mode 100644 api/v1alpha1/cassandrarestore_types.go create mode 100644 api/v1alpha1/cassandrarestore_webhook.go create mode 100644 cassandra-operator/crds/db.ibm.com_cassandrabackups.yaml rename cassandra-operator/crds/{cassandracluster.yaml => db.ibm.com_cassandraclusters.yaml} (98%) create mode 100644 cassandra-operator/crds/db.ibm.com_cassandrarestores.yaml create mode 100644 config/crd/bases/db.ibm.com_cassandrabackups.yaml create mode 100644 config/crd/bases/db.ibm.com_cassandrarestores.yaml create mode 100644 controllers/cassandra_icarus_container.go create mode 100644 controllers/cassandra_rbac.go create mode 100644 controllers/cassandrabackup/backup.go create mode 100644 controllers/cassandrabackup/controller.go create mode 100644 controllers/cassandrabackup/icarus.go create mode 100644 controllers/cassandrabackup/status.go create mode 100644 controllers/cassandrarestore/controller.go create mode 100644 controllers/cassandrarestore/icarus.go create mode 100644 controllers/cassandrarestore/restore.go create mode 100644 controllers/cassandrarestore/status.go create mode 100644 controllers/icarus/backup.go create mode 100644 controllers/icarus/icarus.go create mode 100644 controllers/icarus/restore.go create mode 100644 docs/docs/backup-restore.md create mode 100644 docs/docs/cassandrabackup-configuration.md create mode 100644 docs/docs/cassandrarestore-configuration.md create mode 100644 icarus/Dockerfile create mode 100644 tests/integration/cassandrabackup_test.go create mode 100644 tests/integration/cassandrarestore_test.go diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 9b9dfac..8a44a96 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -6,6 +6,7 @@ on: env: GO_VERSION: 1.18 HELM_VERSION: v3.9.2 + ICARUS_VERSION: 2.0.4 PYTHON_VERSION: 3.7 # required for helm tester IBM_CLOUD_API_KEY: ${{ secrets.IBM_CLOUD_API_KEY }} IBM_CLOUD_REGION: us-south @@ -59,7 +60,7 @@ jobs: strategy: fail-fast: true matrix: - k8s: [1.20.2, 1.21.4, 1.22.1, 1.23.5, 1.24.2] + k8s: [1.20.2, 1.21.2, 1.22.1, 1.23.1, 1.24.2] steps: - name: Checkout uses: actions/checkout@v3 @@ -281,6 +282,56 @@ jobs: retention-days: 1 + build-icarus: + runs-on: ubuntu-latest + needs: [run-unit-tests, run-integration-tests, validate-helm-charts] + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Inject slug/short variables + uses: rlespinasse/github-slug-action@v4 + + - name: Modify GITHUB_REF_SLUG + run: echo "GITHUB_REF_SLUG=$GITHUB_REF_SLUG-${{ github.run_id }}" >> $GITHUB_ENV + + - name: Setup Buildx + uses: docker/setup-buildx-action@v2 + + - name: Authenticate to Docker Proxy Registry + uses: docker/login-action@v2 + with: + registry: ${{ secrets.DOCKER_PROXY_REGISTRY }} + username: ${{ secrets.ARTIFACTORY_USER }} + password: ${{ secrets.ARTIFACTORY_PASS }} + + - name: Build icarus image + uses: docker/build-push-action@v3 + with: + file: ./icarus/Dockerfile + context: ./icarus + build-args: | + ICARUS_VERSION: ${{ env.ICARUS_VERSION }} + DOCKER_PROXY_REGISTRY=${{ secrets.DOCKER_PROXY_REGISTRY }}/ + tags: us.icr.io/${{ env.ICR_NAMESPACE }}/icarus:${{ env.GITHUB_REF_SLUG }} + outputs: type=docker,dest=icarus.tar + + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@0.6.2 + with: + 
input: "icarus.tar" + exit-code: "1" + ignore-unfixed: true + severity: ${{ env.TRIVY_SEVERITY }} + + - name: Upload jolokia image artifact + uses: actions/upload-artifact@v3 + with: + name: icarus + path: icarus.tar + retention-days: 1 + + validate-helm-charts: runs-on: ubuntu-latest steps: @@ -337,7 +388,7 @@ jobs: push-images-for-e2e: if: "!contains(github.event.head_commit.message, 'e2e skip')" - needs: [build-operator, build-cassandra, build-prober, build-jolokia, validate-helm-charts] + needs: [build-operator, build-cassandra, build-prober, build-jolokia, build-icarus, validate-helm-charts] runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 @@ -387,18 +438,25 @@ jobs: with: name: jolokia + - name: Download icarus image artifact + uses: actions/download-artifact@v3 + with: + name: icarus + - name: Load container images run: | docker load -i cassandra-operator.tar docker load -i cassandra.tar docker load -i prober.tar docker load -i jolokia.tar + docker load -i icarus.tar - name: Push Images to ICR run: | docker push "us.icr.io/${{ env.ICR_NAMESPACE }}/cassandra-operator:$GITHUB_REF_SLUG" docker push "us.icr.io/${{ env.ICR_NAMESPACE }}/prober:$GITHUB_REF_SLUG" docker push "us.icr.io/${{ env.ICR_NAMESPACE }}/cassandra:$GITHUB_REF_SLUG" docker push "us.icr.io/${{ env.ICR_NAMESPACE }}/jolokia:$GITHUB_REF_SLUG" + docker push "us.icr.io/${{ env.ICR_NAMESPACE }}/icarus:$GITHUB_REF_SLUG" run-e2e-tests: needs: [push-images-for-e2e] @@ -459,6 +517,7 @@ jobs: --set "proberImage=us.icr.io/${{ env.ICR_NAMESPACE }}/prober:$GITHUB_REF_SLUG" \ --set "cassandraImage=us.icr.io/${{ env.ICR_NAMESPACE }}/cassandra:$GITHUB_REF_SLUG" \ --set "jolokiaImage=us.icr.io/${{ env.ICR_NAMESPACE }}/jolokia:$GITHUB_REF_SLUG" \ + --set "icarusImage=us.icr.io/${{ env.ICR_NAMESPACE }}/icarus:$GITHUB_REF_SLUG" \ --set "logFormat=console" \ --set "logLevel=debug" \ --set "container.imagePullSecret=$IMAGE_PULL_SECRET" @@ -492,16 +551,22 @@ jobs: run: helm uninstall cassandra-operator - name: Remove CassandraCluster CRD if: ${{ always() }} - run: kubectl delete -f cassandra-operator/crds/cassandracluster.yaml + run: kubectl delete -f cassandra-operator/crds/db.ibm.com_cassandraclusters.yaml + - name: Remove CassandraBackup CRD + if: ${{ always() }} + run: kubectl delete -f cassandra-operator/crds/db.ibm.com_cassandrabackups.yaml + - name: Remove CassandraRestore CRD + if: ${{ always() }} + run: kubectl delete -f cassandra-operator/crds/db.ibm.com_cassandrarestores.yaml # We have below logic bc when multiple tags exist for the same image digest within a repository, the ibmcloud cr image-rm command removes the underlying image and all its tags. 
See details: https://cloud.ibm.com/docs/container-registry-cli-plugin?topic=container-registry-cli-plugin-containerregcli#bx_cr_image_rm # We can also add a check if commit message contains `no_image_del` then skip the image deletion step - - name: Clenaup k8s namespace + - name: Cleanup k8s namespace if: ${{ always() }} run: kubectl delete namespace $IKS_NAMESPACE - name: Cleanup Images if: ${{ always() }} run: | - for image_name in cassandra-operator prober cassandra jolokia; do + for image_name in cassandra-operator prober cassandra jolokia icarus; do image_digest=$(ibmcloud cr image-list --restrict ${{ env.ICR_NAMESPACE }} --format "{{if and (eq .Repository \"us.icr.io/cassandra-operator/$image_name\") (eq .Tag \"$GITHUB_REF_SLUG\")}}{{.Digest}}{{end}}" --no-trunc) image_tags=$(ibmcloud cr image-digests --restrict ${{ env.ICR_NAMESPACE }} --format "{{if and (eq .Digest \"$image_digest\")}}{{.Tags}}{{end}}" | sed -e 's/\[//g' -e 's/\]//g') image_tags_arr=($image_tags) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 6c1969c..5de39af 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,6 +14,7 @@ env: HELM_REPO_PASS: ${{ secrets.ARTIFACTORY_PASS }} HELM_REPO: ${{ secrets.ARTIFACTORY_HELM_REPO }} CASSANDRA_VERSION: 3.11.13 + ICARUS_VERSION: 2.0.4 JMX_EXPORTER_VERSION: 0.17.0 jobs: @@ -286,9 +287,76 @@ jobs: labels: ${{ steps.meta.outputs.labels }} + icarus: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Prepare image metadata + uses: docker/metadata-action@v4 + id: meta + with: + images: | + us.icr.io/${{ env.ICR_NAMESPACE }}/icarus + uk.icr.io/${{ env.ICR_NAMESPACE }}/icarus + de.icr.io/${{ env.ICR_NAMESPACE }}/icarus + au.icr.io/${{ env.ICR_NAMESPACE }}/icarus + jp.icr.io/${{ env.ICR_NAMESPACE }}/icarus + tags: type=ref,event=tag + + - name: Setup Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to IBM Cloud Container Registry US + uses: docker/login-action@v2 + with: + registry: us.icr.io + username: ${{ env.ICR_USERNAME }} + password: ${{ env.ICR_PASSWORD }} + + - name: Login to IBM Cloud Container Registry UK + uses: docker/login-action@v2 + with: + registry: uk.icr.io + username: ${{ env.ICR_USERNAME }} + password: ${{ env.ICR_PASSWORD }} + + - name: Login to IBM Cloud Container Registry DE + uses: docker/login-action@v2 + with: + registry: de.icr.io + username: ${{ env.ICR_USERNAME }} + password: ${{ env.ICR_PASSWORD }} + + - name: Login to IBM Cloud Container Registry AU + uses: docker/login-action@v2 + with: + registry: au.icr.io + username: ${{ env.ICR_USERNAME }} + password: ${{ env.ICR_PASSWORD }} + + - name: Login to IBM Cloud Container Registry JP + uses: docker/login-action@v2 + with: + registry: jp.icr.io + username: ${{ env.ICR_USERNAME }} + password: ${{ env.ICR_PASSWORD }} + + - name: Build and push icarus image + uses: docker/build-push-action@v3 + with: + push: true + file: ./icarus/Dockerfile + context: ./icarus + build-args: | + ICARUS_VERSION: ${{ env.ICARUS_VERSION }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + + helm-release: runs-on: ubuntu-latest - needs: [operator, cassandra, prober, jolokia] + needs: [operator, cassandra, prober, jolokia, icarus] outputs: tag: ${{ steps.get_release_tag.outputs.tag }} steps: @@ -311,6 +379,7 @@ jobs: ./bin/yq w -i cassandra-operator/values.yaml 'proberImage' $(./bin/yq r cassandra-operator/values.yaml 'proberImage' | sed "s/:.*/:${{ steps.get_release_tag.outputs.tag }}/") ./bin/yq 
w -i cassandra-operator/values.yaml 'cassandraImage' $(./bin/yq r cassandra-operator/values.yaml 'cassandraImage' | sed "s/:.*/:${{ env.CASSANDRA_VERSION }}-${{ steps.get_release_tag.outputs.tag }}/") ./bin/yq w -i cassandra-operator/values.yaml 'jolokiaImage' $(./bin/yq r cassandra-operator/values.yaml 'jolokiaImage' | sed "s/:.*/:${{ steps.get_release_tag.outputs.tag }}/") + ./bin/yq w -i cassandra-operator/values.yaml 'icarusImage' $(./bin/yq r cassandra-operator/values.yaml 'icarusImage' | sed "s/:.*/:${{ steps.get_release_tag.outputs.tag }}/") ./bin/yq w -i cassandra-operator/Chart.yaml 'appVersion' $(./bin/yq r cassandra-operator/Chart.yaml 'appVersion' | sed "s/:.*/:${{ steps.get_release_tag.outputs.tag }}/") ./bin/yq w -i cassandra-operator/Chart.yaml 'version' $(./bin/yq r cassandra-operator/Chart.yaml 'version' | sed "s/:.*/:${{ steps.get_release_tag.outputs.tag }}/") diff --git a/Makefile b/Makefile index 17f8c7b..6ee829d 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ integration-tests: # Run e2e tests e2e-tests: - ginkgo -v --procs 11 --timeout=$(E2E_TIMEOUT) --always-emit-ginkgo-writer --progress --fail-fast ./tests/e2e/ -- \ + ginkgo -v --procs 20 --timeout=$(E2E_TIMEOUT) --always-emit-ginkgo-writer --progress --fail-fast ./tests/e2e/ -- \ -test.v -test.timeout=$(E2E_TIMEOUT) \ -operatorNamespace=$(K8S_NAMESPACE) \ -imagePullSecret=$(IMAGE_PULL_SECRET) \ @@ -82,7 +82,7 @@ deploy: manifests kustomize # Generate manifests e.g. CRD, RBAC etc. manifests: controller-gen $(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=manager-role output:rbac:none paths="./..." output:crd:artifacts:config=config/crd/bases - kustomize build $(ROOT_DIR)config/crd > $(ROOT_DIR)cassandra-operator/crds/cassandracluster.yaml + $(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=manager-role output:rbac:none paths="./..." output:crd:artifacts:config=$(ROOT_DIR)cassandra-operator/crds $(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=cassandra-operator paths="./..." 
output:crd:none output:rbac:stdout > $(ROOT_DIR)cassandra-operator/templates/clusterrole.yaml # Run go fmt against code @@ -100,6 +100,7 @@ generate: controller-gen mockgen -package=mocks -source=./controllers/prober/prober.go -destination=./controllers/mocks/mock_prober.go mockgen -package=mocks -source=./controllers/reaper/reaper.go -destination=./controllers/mocks/mock_reaper.go mockgen -package=mocks -source=./controllers/nodectl/nodectl.go -destination=./controllers/mocks/mock_nodectl.go + mockgen -package=mocks -source=./controllers/icarus/icarus.go -destination=./controllers/mocks/mock_icarus.go # Build the docker image docker-build: diff --git a/api/v1alpha1/cassandrabackup_types.go b/api/v1alpha1/cassandrabackup_types.go new file mode 100644 index 0000000..73f9ce5 --- /dev/null +++ b/api/v1alpha1/cassandrabackup_types.go @@ -0,0 +1,208 @@ +package v1alpha1 + +import ( + "fmt" + "github.com/ibm/cassandra-operator/controllers/util" + "go.uber.org/zap" + v1 "k8s.io/api/core/v1" + "strings" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +type StorageProvider string + +const ( + StorageProviderS3 StorageProvider = "s3" + StorageProviderGCP StorageProvider = "gcp" + StorageProviderAzure StorageProvider = "azure" + StorageProviderMinio StorageProvider = "minio" + StorageProviderCeph StorageProvider = "ceph" + StorageProviderOracle StorageProvider = "oracle" +) + +type CassandraBackupSpec struct { + // CassandraCluster that is being backed up + CassandraCluster string `json:"cassandraCluster"` + // example: gcp://myBucket + // location where SSTables will be uploaded. + // A value of the storageLocation property has to have exact format which is 'protocol://bucket-name + // protocol is either 'gcp', 's3', 'azure', 'minio', 'ceph' or 'oracle'. + StorageLocation string `json:"storageLocation"` + // Name of the secret from which credentials used for the communication to cloud storage providers are read. + SecretName string `json:"secretName"` + // Tag name that identifies the backup. Defaulted to the name of the CassandraBackup. + SnapshotTag string `json:"snapshotTag,omitempty"` + // Based on this field, there will be throughput per second computed based on what size data we want to upload we have. + // The formula is "size / duration". The lower the duration is, the higher throughput per second we will need and vice versa. + // This will influence e.g. responsiveness of a node to its business requests so one can control how much bandwidth is used for backup purposes in case a cluster is fully operational. + // The format of this field is "amount unit". 'unit' is just a (case-insensitive) java.util.concurrent.TimeUnit enum value. + // If not used, there will not be any restrictions as how fast an upload can be. + Duration string `json:"duration,omitempty"` + // bandwidth used during uploads + Bandwidth *DataRate `json:"bandwidth,omitempty"` + // number of threads used for upload, there might be at most so many uploading threads at any given time, when not set, it defaults to 10 + // +kubebuilder:validation:Minimum=1 + ConcurrentConnections int64 `json:"concurrentConnections,omitempty"` + // name of datacenter to backup, nodes in the other datacenter(s) will not be involved + DC string `json:"dc,omitempty"` + // database entities to backup, it might be either only keyspaces or only tables (from different keyspaces if needed), + // e.g. 'k1,k2' if one wants to backup whole keyspaces and 'ks1.t1,ks2,t2' if one wants to backup tables. 
+ // These formats can not be used together so 'k1,k2.t2' is invalid. If this field is empty, all keyspaces are backed up. + Entities string `json:"entities,omitempty"` + // number of hours to wait until backup is considered failed if not finished already + // +kubebuilder:validation:Minimum=1 + Timeout int64 `json:"timeout,omitempty"` + // Relevant during upload to S3-like bucket only. Specifies whether the metadata is copied from the source object or replaced with metadata provided in the request. + // Defaults to COPY. Consult com.amazonaws.services.s3.model.MetadatDirective for more information. + // +kubebuilder:validation:Enum=COPY;REPLACE + MetadataDirective string `json:"metadataDirective,omitempty"` + // Relevant during upload to S3-like bucket only. If true, communication is done via HTTP instead of HTTPS. Defaults to false. + Insecure bool `json:"insecure,omitempty"` + // Automatically creates a bucket if it does not exist. If a bucket does not exist, backup operation will fail. Defaults to false. + CreateMissingBucket bool `json:"createMissingBucket,omitempty"` + // Do not check the existence of a bucket. + // Some storage providers (e.g. S3) requires a special permissions to be able to list buckets or query their existence which might not be allowed. + // This flag will skip that check. Keep in mind that if that bucket does not exist, the whole backup operation will fail. + SkipBucketVerification bool `json:"skipBucketVerification,omitempty"` + // If set to true, refreshment of an object in a remote bucket (e.g. for s3) will be skipped. + // This might help upon backuping to specific s3 storage providers like Dell ECS storage. + // You will also skip versioning creating new versions when turned off as refreshment creates new version of files as a side effect. + SkipRefreshing bool `json:"skipRefreshing,omitempty"` + Retry Retry `json:"retry,omitempty"` +} + +type Retry struct { + // Defaults to false if not specified. If false, retry mechanism on upload / download operations in case they fail will not be used. + Enabled bool `json:"enabled,omitempty"` + // Time gap between retries, linear strategy will have always this gap constant, exponential strategy will make the gap bigger exponentially (power of 2) on each attempt + // +kubebuilder:validation:Minimum=1 + Interval int64 `json:"interval,omitempty"` + // Strategy how retry should be driven, might be either 'LINEAR' or 'EXPONENTIAL' + // +kubebuilder:validation:Enum=LINEAR;EXPONENTIAL + Strategy string `json:"strategy,omitempty"` + // Number of repetitions of an upload / download operation in case it fails before giving up completely. + // +kubebuilder:validation:Minimum=1 + MaxAttempts int64 `json:"maxAttempts,omitempty"` +} + +type DataRate struct { + // +kubebuilder:validation:Minimum=1 + Value int64 `json:"value"` + // +kubebuilder:validation:Enum=BPS;KBPS;MBPS;GBPS + Unit string `json:"unit"` +} + +type CassandraBackupStatus struct { + // The current state of the backup + State string `json:"state,omitempty"` + // Errors that occurred during backup process. 
Errors from all nodes are aggregated here + Errors []BackupError `json:"errors,omitempty"` + // A value from 0 to 100 indicating the progress of the backup as a percentage + Progress int `json:"progress,omitempty"` +} + +type BackupError struct { + // Name of the node where the error occurred + Source string `json:"source,omitempty"` + // The error message + Message string `json:"message,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status + +// CassandraBackup is the Schema for the CassandraBackups API +type CassandraBackup struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec CassandraBackupSpec `json:"spec"` + Status CassandraBackupStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// CassandraBackupList contains a list of CassandraBackup +type CassandraBackupList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []CassandraBackup `json:"items"` +} + +func init() { + SchemeBuilder.Register(&CassandraBackup{}, &CassandraBackupList{}) +} + +func (in *CassandraBackup) StorageProvider() StorageProvider { + return storageProvider(in.Spec.StorageLocation) +} + +func storageProvider(storageLocation string) StorageProvider { + if strings.HasPrefix(storageLocation, "gcp://") { + return StorageProviderGCP + } + if strings.HasPrefix(storageLocation, "s3://") { + return StorageProviderS3 + } + if strings.HasPrefix(storageLocation, "azure://") { + return StorageProviderAzure + } + if strings.HasPrefix(storageLocation, "oracle://") { + return StorageProviderOracle + } + if strings.HasPrefix(storageLocation, "minio://") { + return StorageProviderMinio + } + if strings.HasPrefix(storageLocation, "ceph://") { + return StorageProviderCeph + } + + return "" +} + +func ValidateStorageSecret(logger *zap.SugaredLogger, secret *v1.Secret, storageProvider StorageProvider) error { + if util.Contains([]string{ + string(StorageProviderS3), + string(StorageProviderMinio), + string(StorageProviderOracle), + string(StorageProviderCeph), + }, string(storageProvider)) { + if len(secret.Data["awssecretaccesskey"]) == 0 { + logger.Info(fmt.Sprintf("'awssecretaccesskey' key for secret %s is not set, "+ + "will try to use AWS compatible env vars to obtain credentials", secret.Name)) + } + + if len(secret.Data["awsaccesskeyid"]) == 0 { + logger.Info(fmt.Sprintf("'awssecretaccesskey' key for secret %s is not set, "+ + "will try to use AWS compatible env vars to obtain credentials", secret.Name)) + } + + if len(secret.Data["awssecretaccesskey"]) != 0 && len(secret.Data["awsaccesskeyid"]) != 0 { + if len(secret.Data["awsregion"]) == 0 { + return fmt.Errorf("there is no 'awsregion' property "+ + "while you have set both 'awssecretaccesskey' and 'awsaccesskeyid in %s secret", secret.Name) + } + } + + if len(secret.Data["awsendpoint"]) != 0 && len(secret.Data["awsregion"]) == 0 { + return fmt.Errorf("'awsendpoint' is specified but 'awsregion' is not set in %s secret", secret.Name) + } + } + + if storageProvider == StorageProviderGCP && len(secret.Data["gcp"]) == 0 { + return fmt.Errorf("storage provider is GCP but key 'gpc' for secret %s is not set", secret.Name) + } + + if storageProvider == StorageProviderAzure { + if len(secret.Data["azurestorageaccount"]) == 0 { + return fmt.Errorf("'azurestorageaccount' key for secret %s is not set", secret.Name) + } + + if len(secret.Data["azurestoragekey"]) == 0 { + return fmt.Errorf("'azurestoragekey' key for secret %s is not set", 
secret.Name) + } + } + + return nil +} diff --git a/api/v1alpha1/cassandrabackup_webhook.go b/api/v1alpha1/cassandrabackup_webhook.go new file mode 100644 index 0000000..3d2509e --- /dev/null +++ b/api/v1alpha1/cassandrabackup_webhook.go @@ -0,0 +1,124 @@ +/* + + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + "errors" + "fmt" + "strconv" + "strings" + + "github.com/ibm/cassandra-operator/controllers/util" + + "k8s.io/apimachinery/pkg/runtime" + kerrors "k8s.io/apimachinery/pkg/util/errors" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/webhook" +) + +func (cb *CassandraBackup) SetupWebhookWithManager(mgr ctrl.Manager) error { + return ctrl.NewWebhookManagedBy(mgr). + For(cb). + Complete() +} + +var _ webhook.Validator = &CassandraBackup{} + +// ValidateCreate implements webhook.Validator so a webhook will be registered for the type +func (cb *CassandraBackup) ValidateCreate() error { + webhookLogger.Debugf("Validating webhook has been called on create request for backup: %s", cb.Name) + + return kerrors.NewAggregate(validateBackupCreateUpdate(cb)) +} + +// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type +func (cb *CassandraBackup) ValidateUpdate(old runtime.Object) error { + webhookLogger.Debugf("Validating webhook has been called on update request for backup: %s", cb.Name) + + cbOld, ok := old.(*CassandraBackup) + if !ok { + return fmt.Errorf("old casandra cluster object: (%s) is not of type CassandraBackup", cbOld.Name) + } + + return kerrors.NewAggregate(validateBackupCreateUpdate(cb)) +} + +// ValidateDelete implements webhook.Validator so a webhook will be registered for the type +func (cb *CassandraBackup) ValidateDelete() error { + webhookLogger.Debugf("Validating webhook has been called on delete request for backup: %s", cb.Name) + return nil +} + +func validateBackupCreateUpdate(cb *CassandraBackup) (verrors []error) { + if err := validateStorageLocation(cb.Spec.StorageLocation); err != nil { + verrors = append(verrors, err) + } + + if err := validateDuration(cb.Spec.Duration); err != nil { + verrors = append(verrors, err) + } + + return verrors +} + +func validateDuration(durationStr string) error { + if len(durationStr) == 0 { + return nil + } + + allowedDurationUnits := []string{"days", "hours", "microseconds", "milliseconds", "minutes", "nanoseconds", "seconds"} + duration := strings.Split(durationStr, " ") + validationErr := fmt.Errorf( + "duration should be in format \"amount unit\", where amount is an integer value and unit is one of the following values: %v", + allowedDurationUnits, + ) + if len(duration) != 2 { + return validationErr + } + + if _, err := strconv.ParseInt(duration[0], 10, 64); err != nil { + return validationErr + } + + if !util.Contains(allowedDurationUnits, strings.TrimSpace(strings.ToLower(duration[1]))) { + return validationErr + } + + return nil +} + +func validateStorageLocation(location string) error { + index := 0 + if index = strings.Index(location, "://"); index 
< 0 { + return errors.New("storage location should be in format 'protocol://backup/location'") + } + + supportedProtocols := []string{ + string(StorageProviderS3), + string(StorageProviderMinio), + string(StorageProviderOracle), + string(StorageProviderCeph), + string(StorageProviderGCP), + string(StorageProviderAzure), + } + requestedProtocol := location[:index] + if !util.Contains(supportedProtocols, requestedProtocol) { + return fmt.Errorf("protocol %s is not supported. Should be one of the following: %v", requestedProtocol, supportedProtocols) + } + + return nil +} diff --git a/api/v1alpha1/cassandracluster_types.go b/api/v1alpha1/cassandracluster_types.go index 7ef329a..2fea8b1 100755 --- a/api/v1alpha1/cassandracluster_types.go +++ b/api/v1alpha1/cassandracluster_types.go @@ -41,8 +41,6 @@ const ( CassandraDefaultPassword = "cassandra" CassandraOperatorAdminRole = "admin-role" CassandraOperatorAdminPassword = "admin-password" - CassandraOperatorJmxUsername = "jmx-username" - CassandraOperatorJmxPassword = "jmx-password" CassandraOperatorInstance = "operator" CassandraOperatorInstanceName = "cassandra-operator" @@ -62,6 +60,7 @@ const ( DatastaxPort = 9103 ThriftPort = 9160 InstaclustrPort = 9500 + IcarusPort = 4567 ReaperReplicasNumber = 1 reaperRepairIntensityMin = 0.1 @@ -94,6 +93,7 @@ type CassandraClusterSpec struct { SystemKeyspaces SystemKeyspaces `json:"systemKeyspaces,omitempty"` Ingress Ingress `json:"ingress,omitempty"` ExternalRegions ExternalRegions `json:"externalRegions,omitempty"` + Icarus Icarus `json:"icarus,omitempty"` Prober Prober `json:"prober,omitempty"` Reaper *Reaper `json:"reaper,omitempty"` HostPort HostPort `json:"hostPort,omitempty"` @@ -298,6 +298,13 @@ type Persistence struct { CommitLogVolumeClaimSpec v1.PersistentVolumeClaimSpec `json:"commitLogVolumeClaimSpec,omitempty"` } +type Icarus struct { + Image string `json:"image,omitempty"` + // +kubebuilder:validation:Enum=Always;Never;IfNotPresent + ImagePullPolicy v1.PullPolicy `json:"imagePullPolicy,omitempty"` + Resources v1.ResourceRequirements `json:"resources,omitempty"` +} + type Prober struct { Image string `json:"image,omitempty"` // +kubebuilder:validation:Enum=Always;Never;IfNotPresent @@ -359,6 +366,7 @@ type SystemKeyspaceDC struct { // CassandraClusterStatus defines the observed state of CassandraCluster type CassandraClusterStatus struct { MaintenanceState []Maintenance `json:"maintenanceState,omitempty"` + Ready bool `json:"ready,omitempty"` } // +kubebuilder:object:root=true diff --git a/api/v1alpha1/cassandracluster_webhook.go b/api/v1alpha1/cassandracluster_webhook.go index c89ae08..d854835 100644 --- a/api/v1alpha1/cassandracluster_webhook.go +++ b/api/v1alpha1/cassandracluster_webhook.go @@ -18,9 +18,11 @@ package v1alpha1 import ( "fmt" + "strconv" + "time" + "github.com/google/go-cmp/cmp" "github.com/ibm/cassandra-operator/controllers/util" - "time" "go.uber.org/zap" "k8s.io/apimachinery/pkg/runtime" @@ -28,7 +30,6 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/webhook" "sigs.k8s.io/yaml" - "strconv" ) var webhookLogger = zap.NewNop().Sugar() @@ -49,7 +50,7 @@ var _ webhook.Validator = &CassandraCluster{} func (cc *CassandraCluster) ValidateCreate() error { webhookLogger.Infof("Validating webhook has been called on create request for cluster: %s", cc.Name) - return kerrors.NewAggregate(validateCreateUpdate(cc, nil)) + return kerrors.NewAggregate(validateClusterCreateUpdate(cc, nil)) } // ValidateUpdate implements webhook.Validator so a webhook 
will be registered for the type @@ -61,7 +62,7 @@ func (cc *CassandraCluster) ValidateUpdate(old runtime.Object) error { return fmt.Errorf("old casandra cluster object: (%s) is not of type CassandraCluster", ccOld.Name) } - return kerrors.NewAggregate(validateCreateUpdate(cc, ccOld)) + return kerrors.NewAggregate(validateClusterCreateUpdate(cc, ccOld)) } // ValidateDelete implements webhook.Validator so a webhook will be registered for the type @@ -70,7 +71,7 @@ func (cc *CassandraCluster) ValidateDelete() error { return nil } -func validateCreateUpdate(cc *CassandraCluster, ccOld *CassandraCluster) (errors []error) { +func validateClusterCreateUpdate(cc *CassandraCluster, ccOld *CassandraCluster) (errors []error) { err := validateImmutableFields(cc, ccOld) if err != nil { errors = append(errors, err...) @@ -301,6 +302,7 @@ func validateNetworkPolicies(cc *CassandraCluster) (errors []error) { strconv.Itoa(JmxPort), strconv.Itoa(CqlPort), strconv.Itoa(ThriftPort), + strconv.Itoa(IcarusPort), } for _, rule := range cc.Spec.NetworkPolicies.ExtraCassandraRules { diff --git a/api/v1alpha1/cassandrarestore_types.go b/api/v1alpha1/cassandrarestore_types.go new file mode 100644 index 0000000..f9f0c76 --- /dev/null +++ b/api/v1alpha1/cassandrarestore_types.go @@ -0,0 +1,121 @@ +package v1alpha1 + +import metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + +type CassandraRestoreSpec struct { + CassandraCluster string `json:"cassandraCluster"` + CassandraBackup string `json:"cassandraBackup,omitempty"` + // example: gcp://myBucket + // location of SSTables + // A value of the storageLocation property has to have exact format which is 'protocol://bucket-name + // protocol is either 'gcp', 's3', 'azure', 'minio', 'ceph' or 'oracle'. + // If empty, the value is retrieved from the CassandraBackup spec + StorageLocation string `json:"storageLocation,omitempty"` + // Name of the snapshot tag to restore. Can be used to manually set the snapshot tag. Retrieved from CassandraBackup if not specified + SnapshotTag string `json:"snapshotTag,omitempty"` + // Name of the secret from which credentials used for the communication to cloud storage providers are read. + // The secret from the backup spec is used when empty + SecretName string `json:"secretName,omitempty"` + // number of threads used for download, there might be at most so many downloading threads at any given time, + // when not set, it defaults to 10 + // +kubebuilder:validation:Minimum=1 + ConcurrentConnections int64 `json:"concurrentConnections,omitempty"` + // Name of datacenter(s) against which restore will be done. It means that nodes in a different DC will not receive restore requests. + // Multiple dcs are separated by comma + DC string `json:"dc,omitempty"` + // database entities to backup, it might be either only keyspaces or only tables (from different keyspaces if needed), + // e.g. 'k1,k2' if one wants to backup whole keyspaces and 'ks1.t1,ks2,t2' if one wants to backup tables. + // These formats can not be used together so 'k1,k2.t2' is invalid. If this field is empty, all keyspaces are backed up. 
+ Entities string `json:"entities,omitempty"` + // flag saying if we should not delete truncated SSTables after they are imported, as part of CLEANUP phase, defaults to false + NoDeleteTruncates bool `json:"noDeleteTruncates,omitempty"` + // flag saying if we should not delete downloaded SSTables from remote location, as part of CLEANUP phase, defaults to false + NoDeleteDownloads bool `json:"noDeleteDownloads,omitempty"` + // flag saying if we should not download data from remote location as we expect them to be there already, defaults to false, + // setting this to true has sense only in case noDeleteDownloads was set to true in previous restoration requests + NoDownloadData bool `json:"noDownloadData,omitempty"` + // object used upon restoration, + // keyspace and table fields do not need to be set when restoration strategy type is IMPORT or HARDLINKS as this object will be initialised for each entities entry with right keyspace and table. + // 'sourceDir' property is used for pointing to a directory where we expect to find downloaded SSTables. + // This in turn means that all SSTables and other meta files will be downloaded into this directory (from which they will be fed to CFSMB). + // All other fields are taken from ColumnFamilyStoreMBean#importNewSSTables + Import RestoreImport `json:"import,omitempty"` + // number of hours to wait until restore is considered failed if not finished already + // +kubebuilder:validation:Minimum=1 + Timeout int64 `json:"timeout,omitempty"` + // if set to true, host id of node to restore will be resolved from remote topology file located in a bucket by translating it from provided nodeId of storageLocation field + ResolveHostIdFromTopology bool `json:"resolveHostIdFromTopology,omitempty"` + // Relevant during upload to S3-like bucket only. If true, communication is done via HTTP instead of HTTPS. Defaults to false. + Insecure bool `json:"insecure,omitempty"` + // Do not check the existence of a bucket. + // Some storage providers (e.g. S3) requires a special permissions to be able to list buckets or query their existence which might not be allowed. + // This flag will skip that check. Keep in mind that if that bucket does not exist, the whole backup operation will fail. + SkipBucketVerification bool `json:"skipBucketVerification,omitempty"` + Retry Retry `json:"retry,omitempty"` + // Map of key and values where keys and values are in format "keyspace.table", if key is "ks1.tb1" and value is "ks1.tb2", + // it means that upon restore, table ks1.tb1 will be restored into table ks1.tb2. + // This in practice means that table ks1.tb2 will be truncated and populated with data from ks1.tb1. + // The source table, ks1.tb1, will not be touched. It is expected that user knows that schema of both tables is compatible. + // There is not any check done in this regard. + Rename map[string]string `json:"rename,omitempty"` + // version of schema we want to restore from. + // Upon backup, a schema version is automatically appended to snapshot name and its manifest is uploaded under that name (plus timestamp at the end). + // In case we have two snapshots having same name, we might distinguish between them by this schema version. + // If schema version is not specified, we expect that there will be one and only one backup taken with respective snapshot name. + // This schema version has to match the version of a Cassandra nodes. 
+ SchemaVersion string `json:"schemaVersion,omitempty"` + // flag saying if we indeed want a schema version of a running node match with schema version a snapshot is taken on. + // There might be cases when we want to restore a table for which its CQL schema has not changed, + // but it has changed for other table / keyspace but a schema for that node has changed by doing that. + ExactSchemaVersion bool `json:"exactSchemaVersion,omitempty"` +} + +type RestoreImport struct { + KeepLevel bool `json:"keepLevel,omitempty"` + NoVerify bool `json:"noVerify,omitempty"` + NoVerifyTokens bool `json:"noVerifyTokens,omitempty"` + NoInvalidateCaches bool `json:"noInvalidateCaches,omitempty"` + Quick bool `json:"quick,omitempty"` + ExtendedVerify bool `json:"extendedVerify,omitempty"` + KeepRepaired bool `json:"keepRepaired,omitempty"` +} + +type CassandraRestoreStatus struct { + State string `json:"state,omitempty"` + Progress int `json:"progress,omitempty"` + Errors []RestoreError `json:"errors,omitempty"` +} + +type RestoreError struct { + Source string `json:"source,omitempty"` + Message string `json:"message,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status + +// CassandraRestore is the Schema for the CassandraRestores API +type CassandraRestore struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec CassandraRestoreSpec `json:"spec"` + Status CassandraRestoreStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// CassandraRestoreList contains a list of CassandraRestore +type CassandraRestoreList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []CassandraRestore `json:"items"` +} + +func init() { + SchemeBuilder.Register(&CassandraRestore{}, &CassandraRestoreList{}) +} + +func (in *CassandraRestore) StorageProvider() StorageProvider { + return storageProvider(in.Spec.StorageLocation) +} diff --git a/api/v1alpha1/cassandrarestore_webhook.go b/api/v1alpha1/cassandrarestore_webhook.go new file mode 100644 index 0000000..58545ff --- /dev/null +++ b/api/v1alpha1/cassandrarestore_webhook.go @@ -0,0 +1,74 @@ +/* + + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + "fmt" + + "github.com/pkg/errors" + "k8s.io/apimachinery/pkg/runtime" + kerrors "k8s.io/apimachinery/pkg/util/errors" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/webhook" +) + +func (cr *CassandraRestore) SetupWebhookWithManager(mgr ctrl.Manager) error { + return ctrl.NewWebhookManagedBy(mgr). + For(cr). 
+ Complete() +} + +var _ webhook.Validator = &CassandraRestore{} + +// ValidateCreate implements webhook.Validator so a webhook will be registered for the type +func (cr *CassandraRestore) ValidateCreate() error { + webhookLogger.Debugf("Validating webhook has been called on create request for restore: %s", cr.Name) + + return kerrors.NewAggregate(validateRestoreCreateUpdate(cr)) +} + +// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type +func (cr *CassandraRestore) ValidateUpdate(old runtime.Object) error { + webhookLogger.Debugf("Validating webhook has been called on update request for restore: %s", cr.Name) + + cbOld, ok := old.(*CassandraRestore) + if !ok { + return fmt.Errorf("old cassandra cluster object: (%s) is not of type CassandraRestore", cbOld.Name) + } + + return kerrors.NewAggregate(validateRestoreCreateUpdate(cr)) +} + +// ValidateDelete implements webhook.Validator so a webhook will be registered for the type +func (cr *CassandraRestore) ValidateDelete() error { + webhookLogger.Debugf("Validating webhook has been called on delete request for restore: %s", cr.Name) + return nil +} + +func validateRestoreCreateUpdate(cr *CassandraRestore) (verrors []error) { + if len(cr.Spec.CassandraBackup) == 0 { + if len(cr.Spec.StorageLocation) == 0 || len(cr.Spec.SnapshotTag) == 0 || len(cr.Spec.SecretName) == 0 { + verrors = append(verrors, errors.New(".spec.storageLocation, .spec.snapshotTag and .spec.secretName should be set if .spec.cassandraBackup is not set")) + } else { + if err := validateStorageLocation(cr.Spec.StorageLocation); err != nil { + verrors = append(verrors, err) + } + } + } + + return verrors +} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index cbe8ae6..08f20f2 100755 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -47,6 +47,21 @@ func (in *AutoScheduling) DeepCopy() *AutoScheduling { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *BackupError) DeepCopyInto(out *BackupError) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BackupError. +func (in *BackupError) DeepCopy() *BackupError { + if in == nil { + return nil + } + out := new(BackupError) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *CATLSSecret) DeepCopyInto(out *CATLSSecret) { *out = *in @@ -102,6 +117,106 @@ func (in *Cassandra) DeepCopy() *Cassandra { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CassandraBackup) DeepCopyInto(out *CassandraBackup) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CassandraBackup. +func (in *CassandraBackup) DeepCopy() *CassandraBackup { + if in == nil { + return nil + } + out := new(CassandraBackup) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
+func (in *CassandraBackup) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CassandraBackupList) DeepCopyInto(out *CassandraBackupList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]CassandraBackup, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CassandraBackupList. +func (in *CassandraBackupList) DeepCopy() *CassandraBackupList { + if in == nil { + return nil + } + out := new(CassandraBackupList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *CassandraBackupList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CassandraBackupSpec) DeepCopyInto(out *CassandraBackupSpec) { + *out = *in + if in.Bandwidth != nil { + in, out := &in.Bandwidth, &out.Bandwidth + *out = new(DataRate) + **out = **in + } + out.Retry = in.Retry +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CassandraBackupSpec. +func (in *CassandraBackupSpec) DeepCopy() *CassandraBackupSpec { + if in == nil { + return nil + } + out := new(CassandraBackupSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CassandraBackupStatus) DeepCopyInto(out *CassandraBackupStatus) { + *out = *in + if in.Errors != nil { + in, out := &in.Errors, &out.Errors + *out = make([]BackupError, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CassandraBackupStatus. +func (in *CassandraBackupStatus) DeepCopy() *CassandraBackupStatus { + if in == nil { + return nil + } + out := new(CassandraBackupStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *CassandraCluster) DeepCopyInto(out *CassandraCluster) { *out = *in @@ -191,6 +306,7 @@ func (in *CassandraClusterSpec) DeepCopyInto(out *CassandraClusterSpec) { in.SystemKeyspaces.DeepCopyInto(&out.SystemKeyspaces) in.Ingress.DeepCopyInto(&out.Ingress) in.ExternalRegions.DeepCopyInto(&out.ExternalRegions) + in.Icarus.DeepCopyInto(&out.Icarus) in.Prober.DeepCopyInto(&out.Prober) if in.Reaper != nil { in, out := &in.Reaper, &out.Reaper @@ -234,6 +350,109 @@ func (in *CassandraClusterStatus) DeepCopy() *CassandraClusterStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CassandraRestore) DeepCopyInto(out *CassandraRestore) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CassandraRestore. 
+func (in *CassandraRestore) DeepCopy() *CassandraRestore { + if in == nil { + return nil + } + out := new(CassandraRestore) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *CassandraRestore) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CassandraRestoreList) DeepCopyInto(out *CassandraRestoreList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]CassandraRestore, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CassandraRestoreList. +func (in *CassandraRestoreList) DeepCopy() *CassandraRestoreList { + if in == nil { + return nil + } + out := new(CassandraRestoreList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *CassandraRestoreList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CassandraRestoreSpec) DeepCopyInto(out *CassandraRestoreSpec) { + *out = *in + out.Import = in.Import + out.Retry = in.Retry + if in.Rename != nil { + in, out := &in.Rename, &out.Rename + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CassandraRestoreSpec. +func (in *CassandraRestoreSpec) DeepCopy() *CassandraRestoreSpec { + if in == nil { + return nil + } + out := new(CassandraRestoreSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CassandraRestoreStatus) DeepCopyInto(out *CassandraRestoreStatus) { + *out = *in + if in.Errors != nil { + in, out := &in.Errors, &out.Errors + *out = make([]RestoreError, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CassandraRestoreStatus. +func (in *CassandraRestoreStatus) DeepCopy() *CassandraRestoreStatus { + if in == nil { + return nil + } + out := new(CassandraRestoreStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ClientEncryption) DeepCopyInto(out *ClientEncryption) { *out = *in @@ -293,6 +512,21 @@ func (in *DC) DeepCopy() *DC { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DataRate) DeepCopyInto(out *DataRate) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DataRate. +func (in *DataRate) DeepCopy() *DataRate { + if in == nil { + return nil + } + out := new(DataRate) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *Encryption) DeepCopyInto(out *Encryption) { *out = *in @@ -357,6 +591,22 @@ func (in *HostPort) DeepCopy() *HostPort { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Icarus) DeepCopyInto(out *Icarus) { + *out = *in + in.Resources.DeepCopyInto(&out.Resources) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Icarus. +func (in *Icarus) DeepCopy() *Icarus { + if in == nil { + return nil + } + out := new(Icarus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Ingress) DeepCopyInto(out *Ingress) { *out = *in @@ -700,6 +950,51 @@ func (in *RepairSchedules) DeepCopy() *RepairSchedules { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RestoreError) DeepCopyInto(out *RestoreError) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RestoreError. +func (in *RestoreError) DeepCopy() *RestoreError { + if in == nil { + return nil + } + out := new(RestoreError) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RestoreImport) DeepCopyInto(out *RestoreImport) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RestoreImport. +func (in *RestoreImport) DeepCopy() *RestoreImport { + if in == nil { + return nil + } + out := new(RestoreImport) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Retry) DeepCopyInto(out *Retry) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Retry. +func (in *Retry) DeepCopy() *Retry { + if in == nil { + return nil + } + out := new(Retry) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ServerEncryption) DeepCopyInto(out *ServerEncryption) { *out = *in diff --git a/cassandra-operator/crds/db.ibm.com_cassandrabackups.yaml b/cassandra-operator/crds/db.ibm.com_cassandrabackups.yaml new file mode 100644 index 0000000..984d18b --- /dev/null +++ b/cassandra-operator/crds/db.ibm.com_cassandrabackups.yaml @@ -0,0 +1,203 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.9.2 + creationTimestamp: null + name: cassandrabackups.db.ibm.com +spec: + group: db.ibm.com + names: + kind: CassandraBackup + listKind: CassandraBackupList + plural: cassandrabackups + singular: cassandrabackup + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: CassandraBackup is the Schema for the CassandraBackups API + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
+            type: string
+          kind:
+            description: 'Kind is a string value representing the REST resource this
+              object represents. Servers may infer this from the endpoint the client
+              submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
+            type: string
+          metadata:
+            type: object
+          spec:
+            properties:
+              bandwidth:
+                description: bandwidth used during uploads
+                properties:
+                  unit:
+                    enum:
+                    - BPS
+                    - KBPS
+                    - MBPS
+                    - GBPS
+                    type: string
+                  value:
+                    format: int64
+                    minimum: 1
+                    type: integer
+                required:
+                - unit
+                - value
+                type: object
+              cassandraCluster:
+                description: CassandraCluster that is being backed up
+                type: string
+              concurrentConnections:
+                description: number of threads used for upload; there will be at most
+                  this many uploading threads at any given time. When not set, it
+                  defaults to 10
+                format: int64
+                minimum: 1
+                type: integer
+              createMissingBucket:
+                description: Automatically creates a bucket if it does not exist. If
+                  disabled and the bucket does not exist, the backup operation will
+                  fail. Defaults to false.
+                type: boolean
+              dc:
+                description: name of the datacenter to back up; nodes in the other
+                  datacenter(s) will not be involved
+                type: string
+              duration:
+                description: Based on this field, a throughput per second is computed
+                  from the size of the data to be uploaded. The formula is "size /
+                  duration". The lower the duration, the higher the required throughput
+                  per second, and vice versa. This influences e.g. the responsiveness
+                  of a node to its business requests, so one can control how much
+                  bandwidth is used for backup purposes while a cluster is fully
+                  operational. The format of this field is "amount unit". 'unit' is a
+                  (case-insensitive) java.util.concurrent.TimeUnit enum value. If not
+                  used, there will not be any restrictions on how fast an upload can be.
+                type: string
+              entities:
+                description: database entities to back up; it might be either only
+                  keyspaces or only tables (from different keyspaces if needed), e.g.
+                  'k1,k2' if one wants to back up whole keyspaces and 'ks1.t1,ks2.t2'
+                  if one wants to back up tables. These formats cannot be used together,
+                  so 'k1,k2.t2' is invalid. If this field is empty, all keyspaces
+                  are backed up.
+                type: string
+              insecure:
+                description: Relevant during upload to S3-like bucket only. If true,
+                  communication is done via HTTP instead of HTTPS. Defaults to false.
+                type: boolean
+              metadataDirective:
+                description: Relevant during upload to S3-like bucket only. Specifies
+                  whether the metadata is copied from the source object or replaced
+                  with metadata provided in the request. Defaults to COPY. Consult
+                  com.amazonaws.services.s3.model.MetadataDirective for more information.
+                enum:
+                - COPY
+                - REPLACE
+                type: string
+              retry:
+                properties:
+                  enabled:
+                    description: Defaults to false if not specified. If false, the
+                      retry mechanism on upload / download operations in case they
+                      fail will not be used.
+                    type: boolean
+                  interval:
+                    description: Time gap between retries; the linear strategy keeps
+                      this gap constant, while the exponential strategy makes the gap
+                      exponentially bigger (power of 2) on each attempt
+                    format: int64
+                    minimum: 1
+                    type: integer
+                  maxAttempts:
+                    description: Number of repetitions of an upload / download operation
+                      in case it fails before giving up completely.
+ format: int64 + minimum: 1 + type: integer + strategy: + description: Strategy how retry should be driven, might be either + 'LINEAR' or 'EXPONENTIAL' + enum: + - LINEAR + - EXPONENTIAL + type: string + type: object + secretName: + description: Name of the secret from which credentials used for the + communication to cloud storage providers are read. + type: string + skipBucketVerification: + description: Do not check the existence of a bucket. Some storage + providers (e.g. S3) requires a special permissions to be able to + list buckets or query their existence which might not be allowed. + This flag will skip that check. Keep in mind that if that bucket + does not exist, the whole backup operation will fail. + type: boolean + skipRefreshing: + description: If set to true, refreshment of an object in a remote + bucket (e.g. for s3) will be skipped. This might help upon backuping + to specific s3 storage providers like Dell ECS storage. You will + also skip versioning creating new versions when turned off as refreshment + creates new version of files as a side effect. + type: boolean + snapshotTag: + description: Tag name that identifies the backup. Defaulted to the + name of the CassandraBackup. + type: string + storageLocation: + description: 'example: gcp://myBucket location where SSTables will + be uploaded. A value of the storageLocation property has to have + exact format which is ''protocol://bucket-name protocol is either + ''gcp'', ''s3'', ''azure'', ''minio'', ''ceph'' or ''oracle''.' + type: string + timeout: + description: number of hours to wait until backup is considered failed + if not finished already + format: int64 + minimum: 1 + type: integer + required: + - cassandraCluster + - secretName + - storageLocation + type: object + status: + properties: + errors: + description: Errors that occurred during backup process. Errors from + all nodes are aggregated here + items: + properties: + message: + description: The error message + type: string + source: + description: Name of the node where the error occurred + type: string + type: object + type: array + progress: + description: A value from 0 to 100 indicating the progress of the + backup as a percentage + type: integer + state: + description: The current state of the backup + type: string + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/cassandra-operator/crds/cassandracluster.yaml b/cassandra-operator/crds/db.ibm.com_cassandraclusters.yaml similarity index 98% rename from cassandra-operator/crds/cassandracluster.yaml rename to cassandra-operator/crds/db.ibm.com_cassandraclusters.yaml index 602dcfa..5d53667 100644 --- a/cassandra-operator/crds/cassandracluster.yaml +++ b/cassandra-operator/crds/db.ibm.com_cassandraclusters.yaml @@ -1,3 +1,4 @@ +--- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -1610,6 +1611,46 @@ spec: useExternalHostIP: type: boolean type: object + icarus: + properties: + image: + type: string + imagePullPolicy: + description: PullPolicy describes a policy for if/when to pull + a container image + enum: + - Always + - Never + - IfNotPresent + type: string + resources: + description: ResourceRequirements describes the compute resource + requirements. 
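For orientation, a hypothetical CassandraBackup manifest conforming to the cassandrabackups CRD above might look like the following. This is illustrative only and not part of the patch; the cluster, bucket and secret names are placeholders and only a subset of the optional fields is shown:

apiVersion: db.ibm.com/v1alpha1
kind: CassandraBackup
metadata:
  name: nightly-backup
spec:
  cassandraCluster: example-cluster      # CassandraCluster to back up (placeholder name)
  storageLocation: s3://example-backups  # must follow the 'protocol://bucket-name' format
  secretName: example-storage-creds      # secret holding the cloud storage credentials
  snapshotTag: nightly                   # optional, defaults to the CR name
  concurrentConnections: 4
  bandwidth:
    value: 100
    unit: MBPS
  retry:
    enabled: true
    strategy: EXPONENTIAL
    maxAttempts: 5
    interval: 10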
+ properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Limits describes the maximum amount of compute + resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Requests describes the minimum amount of compute + resources required. If Requests is omitted for a container, + it defaults to Limits if that is explicitly specified, otherwise + to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + type: object + type: object imagePullSecretName: minLength: 1 type: string @@ -3282,6 +3323,8 @@ spec: - dc type: object type: array + ready: + type: boolean type: object required: - spec diff --git a/cassandra-operator/crds/db.ibm.com_cassandrarestores.yaml b/cassandra-operator/crds/db.ibm.com_cassandrarestores.yaml new file mode 100644 index 0000000..cb905dd --- /dev/null +++ b/cassandra-operator/crds/db.ibm.com_cassandrarestores.yaml @@ -0,0 +1,220 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.9.2 + creationTimestamp: null + name: cassandrarestores.db.ibm.com +spec: + group: db.ibm.com + names: + kind: CassandraRestore + listKind: CassandraRestoreList + plural: cassandrarestores + singular: cassandrarestore + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: CassandraRestore is the Schema for the CassandraRestores API + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + properties: + cassandraBackup: + type: string + cassandraCluster: + type: string + concurrentConnections: + description: number of threads used for download, there might be at + most so many downloading threads at any given time, when not set, + it defaults to 10 + format: int64 + minimum: 1 + type: integer + dc: + description: Name of datacenter(s) against which restore will be done. + It means that nodes in a different DC will not receive restore requests. + Multiple dcs are separated by comma + type: string + entities: + description: database entities to backup, it might be either only + keyspaces or only tables (from different keyspaces if needed), e.g. + 'k1,k2' if one wants to backup whole keyspaces and 'ks1.t1,ks2,t2' + if one wants to backup tables. 
These formats can not be used together + so 'k1,k2.t2' is invalid. If this field is empty, all keyspaces + are backed up. + type: string + exactSchemaVersion: + description: flag saying if we indeed want a schema version of a running + node match with schema version a snapshot is taken on. There might + be cases when we want to restore a table for which its CQL schema + has not changed, but it has changed for other table / keyspace but + a schema for that node has changed by doing that. + type: boolean + import: + description: object used upon restoration, keyspace and table fields + do not need to be set when restoration strategy type is IMPORT or + HARDLINKS as this object will be initialised for each entities entry + with right keyspace and table. 'sourceDir' property is used for + pointing to a directory where we expect to find downloaded SSTables. + This in turn means that all SSTables and other meta files will be + downloaded into this directory (from which they will be fed to CFSMB). + All other fields are taken from ColumnFamilyStoreMBean#importNewSSTables + properties: + extendedVerify: + type: boolean + keepLevel: + type: boolean + keepRepaired: + type: boolean + noInvalidateCaches: + type: boolean + noVerify: + type: boolean + noVerifyTokens: + type: boolean + quick: + type: boolean + type: object + insecure: + description: Relevant during upload to S3-like bucket only. If true, + communication is done via HTTP instead of HTTPS. Defaults to false. + type: boolean + noDeleteDownloads: + description: flag saying if we should not delete downloaded SSTables + from remote location, as part of CLEANUP phase, defaults to false + type: boolean + noDeleteTruncates: + description: flag saying if we should not delete truncated SSTables + after they are imported, as part of CLEANUP phase, defaults to false + type: boolean + noDownloadData: + description: flag saying if we should not download data from remote + location as we expect them to be there already, defaults to false, + setting this to true has sense only in case noDeleteDownloads was + set to true in previous restoration requests + type: boolean + rename: + additionalProperties: + type: string + description: Map of key and values where keys and values are in format + "keyspace.table", if key is "ks1.tb1" and value is "ks1.tb2", it + means that upon restore, table ks1.tb1 will be restored into table + ks1.tb2. This in practice means that table ks1.tb2 will be truncated + and populated with data from ks1.tb1. The source table, ks1.tb1, + will not be touched. It is expected that user knows that schema + of both tables is compatible. There is not any check done in this + regard. + type: object + resolveHostIdFromTopology: + description: if set to true, host id of node to restore will be resolved + from remote topology file located in a bucket by translating it + from provided nodeId of storageLocation field + type: boolean + retry: + properties: + enabled: + description: Defaults to false if not specified. If false, retry + mechanism on upload / download operations in case they fail + will not be used. + type: boolean + interval: + description: Time gap between retries, linear strategy will have + always this gap constant, exponential strategy will make the + gap bigger exponentially (power of 2) on each attempt + format: int64 + minimum: 1 + type: integer + maxAttempts: + description: Number of repetitions of an upload / download operation + in case it fails before giving up completely. 
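Similarly, a hypothetical CassandraRestore manifest matching the cassandrarestores CRD might look as follows. All names are placeholders; snapshotTag and storageLocation are omitted so that, per the field descriptions, they are taken from the referenced CassandraBackup:

apiVersion: db.ibm.com/v1alpha1
kind: CassandraRestore
metadata:
  name: restore-nightly
spec:
  cassandraCluster: example-cluster   # cluster to restore into (required)
  cassandraBackup: nightly-backup     # CassandraBackup CR supplying the snapshot and storage location
  dc: dc1                             # optional, restrict the restore to one datacenter
  concurrentConnections: 4
  entities: ks1.tb1                   # optional, restore a single table
  rename:
    ks1.tb1: ks1.tb1_restored         # restore ks1.tb1 into the pre-existing table ks1.tb1_restored
  retry:
    enabled: true
    strategy: LINEAR
    maxAttempts: 3
    interval: 10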
+ format: int64 + minimum: 1 + type: integer + strategy: + description: Strategy how retry should be driven, might be either + 'LINEAR' or 'EXPONENTIAL' + enum: + - LINEAR + - EXPONENTIAL + type: string + type: object + schemaVersion: + description: version of schema we want to restore from. Upon backup, + a schema version is automatically appended to snapshot name and + its manifest is uploaded under that name (plus timestamp at the + end). In case we have two snapshots having same name, we might distinguish + between them by this schema version. If schema version is not specified, + we expect that there will be one and only one backup taken with + respective snapshot name. This schema version has to match the version + of a Cassandra nodes. + type: string + secretName: + description: Name of the secret from which credentials used for the + communication to cloud storage providers are read. The secret from + the backup spec is used when empty + type: string + skipBucketVerification: + description: Do not check the existence of a bucket. Some storage + providers (e.g. S3) requires a special permissions to be able to + list buckets or query their existence which might not be allowed. + This flag will skip that check. Keep in mind that if that bucket + does not exist, the whole backup operation will fail. + type: boolean + snapshotTag: + description: Name of the snapshot tag to restore. Can be used to manually + set the snapshot tag. Retrieved from CassandraBackup if specified + type: string + storageLocation: + description: 'example: gcp://myBucket location of SSTables A value + of the storageLocation property has to have exact format which is + ''protocol://bucket-name protocol is either ''gcp'', ''s3'', ''azure'', + ''minio'', ''ceph'' or ''oracle''. 
If empty, the value is retrieved + from the CassandraBackup spec' + type: string + timeout: + description: number of hours to wait until restore is considered failed + if not finished already + format: int64 + minimum: 1 + type: integer + required: + - cassandraCluster + type: object + status: + properties: + errors: + items: + properties: + message: + type: string + source: + type: string + type: object + type: array + progress: + type: integer + state: + type: string + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/cassandra-operator/templates/clusterrole.yaml b/cassandra-operator/templates/clusterrole.yaml index ce6517c..a73d474 100644 --- a/cassandra-operator/templates/clusterrole.yaml +++ b/cassandra-operator/templates/clusterrole.yaml @@ -157,6 +157,26 @@ rules: - patch - update - watch +- apiGroups: + - db.ibm.com + resources: + - cassandrabackups + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - db.ibm.com + resources: + - cassandrabackups/status + verbs: + - get + - patch + - update - apiGroups: - db.ibm.com resources: @@ -177,6 +197,26 @@ rules: - get - patch - update +- apiGroups: + - db.ibm.com + resources: + - cassandrarestores + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - db.ibm.com + resources: + - cassandrarestores/status + verbs: + - get + - patch + - update - apiGroups: - monitoring.coreos.com resources: diff --git a/cassandra-operator/templates/deployment.yaml b/cassandra-operator/templates/deployment.yaml index f460e04..df211fc 100644 --- a/cassandra-operator/templates/deployment.yaml +++ b/cassandra-operator/templates/deployment.yaml @@ -50,5 +50,7 @@ spec: value: {{ .Values.jolokiaImage | quote }} - name: DEFAULT_REAPER_IMAGE value: {{ .Values.reaperImage | quote }} + - name: DEFAULT_ICARUS_IMAGE + value: {{ .Values.icarusImage | quote }} - name: WEBHOOKS_ENABLED value: {{ .Values.admissionWebhooks.enabled | quote }} diff --git a/cassandra-operator/values.yaml b/cassandra-operator/values.yaml index f1c0168..9b9b813 100644 --- a/cassandra-operator/values.yaml +++ b/cassandra-operator/values.yaml @@ -17,6 +17,7 @@ proberImage: us.icr.io/cassandra-operator/prober:0.4.0 # this value will be upda jolokiaImage: us.icr.io/cassandra-operator/jolokia:0.4.0 # this value will be updated on next release in GHA cassandraImage: us.icr.io/cassandra-operator/cassandra:3.11.13-0.4.0 # this value will be updated on next release in GHA reaperImage: thelastpickle/cassandra-reaper:3.2.0 +icarusImage: us.icr.io/cassandra-operator/icarus:0.3.0 clusterDashboards: enabled: [] namespace: "" diff --git a/config/crd/bases/db.ibm.com_cassandrabackups.yaml b/config/crd/bases/db.ibm.com_cassandrabackups.yaml new file mode 100644 index 0000000..984d18b --- /dev/null +++ b/config/crd/bases/db.ibm.com_cassandrabackups.yaml @@ -0,0 +1,203 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.9.2 + creationTimestamp: null + name: cassandrabackups.db.ibm.com +spec: + group: db.ibm.com + names: + kind: CassandraBackup + listKind: CassandraBackupList + plural: cassandrabackups + singular: cassandrabackup + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: CassandraBackup is the Schema for the CassandraBackups API + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this 
representation
+              of an object. Servers should convert recognized schemas to the latest
+              internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
+            type: string
+          kind:
+            description: 'Kind is a string value representing the REST resource this
+              object represents. Servers may infer this from the endpoint the client
+              submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
+            type: string
+          metadata:
+            type: object
+          spec:
+            properties:
+              bandwidth:
+                description: bandwidth used during uploads
+                properties:
+                  unit:
+                    enum:
+                    - BPS
+                    - KBPS
+                    - MBPS
+                    - GBPS
+                    type: string
+                  value:
+                    format: int64
+                    minimum: 1
+                    type: integer
+                required:
+                - unit
+                - value
+                type: object
+              cassandraCluster:
+                description: CassandraCluster that is being backed up
+                type: string
+              concurrentConnections:
+                description: number of threads used for upload; there will be at most
+                  this many uploading threads at any given time. When not set, it
+                  defaults to 10
+                format: int64
+                minimum: 1
+                type: integer
+              createMissingBucket:
+                description: Automatically creates a bucket if it does not exist. If
+                  disabled and the bucket does not exist, the backup operation will
+                  fail. Defaults to false.
+                type: boolean
+              dc:
+                description: name of the datacenter to back up; nodes in the other
+                  datacenter(s) will not be involved
+                type: string
+              duration:
+                description: Based on this field, a throughput per second is computed
+                  from the size of the data to be uploaded. The formula is "size /
+                  duration". The lower the duration, the higher the required throughput
+                  per second, and vice versa. This influences e.g. the responsiveness
+                  of a node to its business requests, so one can control how much
+                  bandwidth is used for backup purposes while a cluster is fully
+                  operational. The format of this field is "amount unit". 'unit' is a
+                  (case-insensitive) java.util.concurrent.TimeUnit enum value. If not
+                  used, there will not be any restrictions on how fast an upload can be.
+                type: string
+              entities:
+                description: database entities to back up; it might be either only
+                  keyspaces or only tables (from different keyspaces if needed), e.g.
+                  'k1,k2' if one wants to back up whole keyspaces and 'ks1.t1,ks2.t2'
+                  if one wants to back up tables. These formats cannot be used together,
+                  so 'k1,k2.t2' is invalid. If this field is empty, all keyspaces
+                  are backed up.
+                type: string
+              insecure:
+                description: Relevant during upload to S3-like bucket only. If true,
+                  communication is done via HTTP instead of HTTPS. Defaults to false.
+                type: boolean
+              metadataDirective:
+                description: Relevant during upload to S3-like bucket only. Specifies
+                  whether the metadata is copied from the source object or replaced
+                  with metadata provided in the request. Defaults to COPY. Consult
+                  com.amazonaws.services.s3.model.MetadataDirective for more information.
+                enum:
+                - COPY
+                - REPLACE
+                type: string
+              retry:
+                properties:
+                  enabled:
+                    description: Defaults to false if not specified. If false, the
+                      retry mechanism on upload / download operations in case they
+                      fail will not be used.
+ type: boolean + interval: + description: Time gap between retries, linear strategy will have + always this gap constant, exponential strategy will make the + gap bigger exponentially (power of 2) on each attempt + format: int64 + minimum: 1 + type: integer + maxAttempts: + description: Number of repetitions of an upload / download operation + in case it fails before giving up completely. + format: int64 + minimum: 1 + type: integer + strategy: + description: Strategy how retry should be driven, might be either + 'LINEAR' or 'EXPONENTIAL' + enum: + - LINEAR + - EXPONENTIAL + type: string + type: object + secretName: + description: Name of the secret from which credentials used for the + communication to cloud storage providers are read. + type: string + skipBucketVerification: + description: Do not check the existence of a bucket. Some storage + providers (e.g. S3) requires a special permissions to be able to + list buckets or query their existence which might not be allowed. + This flag will skip that check. Keep in mind that if that bucket + does not exist, the whole backup operation will fail. + type: boolean + skipRefreshing: + description: If set to true, refreshment of an object in a remote + bucket (e.g. for s3) will be skipped. This might help upon backuping + to specific s3 storage providers like Dell ECS storage. You will + also skip versioning creating new versions when turned off as refreshment + creates new version of files as a side effect. + type: boolean + snapshotTag: + description: Tag name that identifies the backup. Defaulted to the + name of the CassandraBackup. + type: string + storageLocation: + description: 'example: gcp://myBucket location where SSTables will + be uploaded. A value of the storageLocation property has to have + exact format which is ''protocol://bucket-name protocol is either + ''gcp'', ''s3'', ''azure'', ''minio'', ''ceph'' or ''oracle''.' + type: string + timeout: + description: number of hours to wait until backup is considered failed + if not finished already + format: int64 + minimum: 1 + type: integer + required: + - cassandraCluster + - secretName + - storageLocation + type: object + status: + properties: + errors: + description: Errors that occurred during backup process. Errors from + all nodes are aggregated here + items: + properties: + message: + description: The error message + type: string + source: + description: Name of the node where the error occurred + type: string + type: object + type: array + progress: + description: A value from 0 to 100 indicating the progress of the + backup as a percentage + type: integer + state: + description: The current state of the backup + type: string + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/bases/db.ibm.com_cassandraclusters.yaml b/config/crd/bases/db.ibm.com_cassandraclusters.yaml index 1c35149..5d53667 100755 --- a/config/crd/bases/db.ibm.com_cassandraclusters.yaml +++ b/config/crd/bases/db.ibm.com_cassandraclusters.yaml @@ -1611,6 +1611,46 @@ spec: useExternalHostIP: type: boolean type: object + icarus: + properties: + image: + type: string + imagePullPolicy: + description: PullPolicy describes a policy for if/when to pull + a container image + enum: + - Always + - Never + - IfNotPresent + type: string + resources: + description: ResourceRequirements describes the compute resource + requirements. 
+ properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Limits describes the maximum amount of compute + resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Requests describes the minimum amount of compute + resources required. If Requests is omitted for a container, + it defaults to Limits if that is explicitly specified, otherwise + to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + type: object + type: object imagePullSecretName: minLength: 1 type: string @@ -3283,6 +3323,8 @@ spec: - dc type: object type: array + ready: + type: boolean type: object required: - spec diff --git a/config/crd/bases/db.ibm.com_cassandrarestores.yaml b/config/crd/bases/db.ibm.com_cassandrarestores.yaml new file mode 100644 index 0000000..cb905dd --- /dev/null +++ b/config/crd/bases/db.ibm.com_cassandrarestores.yaml @@ -0,0 +1,220 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.9.2 + creationTimestamp: null + name: cassandrarestores.db.ibm.com +spec: + group: db.ibm.com + names: + kind: CassandraRestore + listKind: CassandraRestoreList + plural: cassandrarestores + singular: cassandrarestore + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: CassandraRestore is the Schema for the CassandraRestores API + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + properties: + cassandraBackup: + type: string + cassandraCluster: + type: string + concurrentConnections: + description: number of threads used for download, there might be at + most so many downloading threads at any given time, when not set, + it defaults to 10 + format: int64 + minimum: 1 + type: integer + dc: + description: Name of datacenter(s) against which restore will be done. + It means that nodes in a different DC will not receive restore requests. + Multiple dcs are separated by comma + type: string + entities: + description: database entities to backup, it might be either only + keyspaces or only tables (from different keyspaces if needed), e.g. + 'k1,k2' if one wants to backup whole keyspaces and 'ks1.t1,ks2,t2' + if one wants to backup tables. These formats can not be used together + so 'k1,k2.t2' is invalid. 
If this field is empty, all keyspaces + are backed up. + type: string + exactSchemaVersion: + description: flag saying if we indeed want a schema version of a running + node match with schema version a snapshot is taken on. There might + be cases when we want to restore a table for which its CQL schema + has not changed, but it has changed for other table / keyspace but + a schema for that node has changed by doing that. + type: boolean + import: + description: object used upon restoration, keyspace and table fields + do not need to be set when restoration strategy type is IMPORT or + HARDLINKS as this object will be initialised for each entities entry + with right keyspace and table. 'sourceDir' property is used for + pointing to a directory where we expect to find downloaded SSTables. + This in turn means that all SSTables and other meta files will be + downloaded into this directory (from which they will be fed to CFSMB). + All other fields are taken from ColumnFamilyStoreMBean#importNewSSTables + properties: + extendedVerify: + type: boolean + keepLevel: + type: boolean + keepRepaired: + type: boolean + noInvalidateCaches: + type: boolean + noVerify: + type: boolean + noVerifyTokens: + type: boolean + quick: + type: boolean + type: object + insecure: + description: Relevant during upload to S3-like bucket only. If true, + communication is done via HTTP instead of HTTPS. Defaults to false. + type: boolean + noDeleteDownloads: + description: flag saying if we should not delete downloaded SSTables + from remote location, as part of CLEANUP phase, defaults to false + type: boolean + noDeleteTruncates: + description: flag saying if we should not delete truncated SSTables + after they are imported, as part of CLEANUP phase, defaults to false + type: boolean + noDownloadData: + description: flag saying if we should not download data from remote + location as we expect them to be there already, defaults to false, + setting this to true has sense only in case noDeleteDownloads was + set to true in previous restoration requests + type: boolean + rename: + additionalProperties: + type: string + description: Map of key and values where keys and values are in format + "keyspace.table", if key is "ks1.tb1" and value is "ks1.tb2", it + means that upon restore, table ks1.tb1 will be restored into table + ks1.tb2. This in practice means that table ks1.tb2 will be truncated + and populated with data from ks1.tb1. The source table, ks1.tb1, + will not be touched. It is expected that user knows that schema + of both tables is compatible. There is not any check done in this + regard. + type: object + resolveHostIdFromTopology: + description: if set to true, host id of node to restore will be resolved + from remote topology file located in a bucket by translating it + from provided nodeId of storageLocation field + type: boolean + retry: + properties: + enabled: + description: Defaults to false if not specified. If false, retry + mechanism on upload / download operations in case they fail + will not be used. + type: boolean + interval: + description: Time gap between retries, linear strategy will have + always this gap constant, exponential strategy will make the + gap bigger exponentially (power of 2) on each attempt + format: int64 + minimum: 1 + type: integer + maxAttempts: + description: Number of repetitions of an upload / download operation + in case it fails before giving up completely. 
+ format: int64 + minimum: 1 + type: integer + strategy: + description: Strategy how retry should be driven, might be either + 'LINEAR' or 'EXPONENTIAL' + enum: + - LINEAR + - EXPONENTIAL + type: string + type: object + schemaVersion: + description: version of schema we want to restore from. Upon backup, + a schema version is automatically appended to snapshot name and + its manifest is uploaded under that name (plus timestamp at the + end). In case we have two snapshots having same name, we might distinguish + between them by this schema version. If schema version is not specified, + we expect that there will be one and only one backup taken with + respective snapshot name. This schema version has to match the version + of a Cassandra nodes. + type: string + secretName: + description: Name of the secret from which credentials used for the + communication to cloud storage providers are read. The secret from + the backup spec is used when empty + type: string + skipBucketVerification: + description: Do not check the existence of a bucket. Some storage + providers (e.g. S3) requires a special permissions to be able to + list buckets or query their existence which might not be allowed. + This flag will skip that check. Keep in mind that if that bucket + does not exist, the whole backup operation will fail. + type: boolean + snapshotTag: + description: Name of the snapshot tag to restore. Can be used to manually + set the snapshot tag. Retrieved from CassandraBackup if specified + type: string + storageLocation: + description: 'example: gcp://myBucket location of SSTables A value + of the storageLocation property has to have exact format which is + ''protocol://bucket-name protocol is either ''gcp'', ''s3'', ''azure'', + ''minio'', ''ceph'' or ''oracle''. 
If empty, the value is retrieved + from the CassandraBackup spec' + type: string + timeout: + description: number of hours to wait until restore is considered failed + if not finished already + format: int64 + minimum: 1 + type: integer + required: + - cassandraCluster + type: object + status: + properties: + errors: + items: + properties: + message: + type: string + source: + type: string + type: object + type: array + progress: + type: integer + state: + type: string + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml index 3a29e29..2e8d24e 100644 --- a/config/crd/kustomization.yaml +++ b/config/crd/kustomization.yaml @@ -1,2 +1,3 @@ resources: - bases/db.ibm.com_cassandraclusters.yaml +- bases/db.ibm.com_cassandrabackups.yaml diff --git a/controllers/admin_auth.go b/controllers/admin_auth.go index 60d1a80..5a45806 100644 --- a/controllers/admin_auth.go +++ b/controllers/admin_auth.go @@ -1,7 +1,6 @@ package controllers import ( - "bytes" "context" "fmt" @@ -12,7 +11,6 @@ import ( "github.com/ibm/cassandra-operator/controllers/events" "github.com/ibm/cassandra-operator/controllers/labels" "github.com/ibm/cassandra-operator/controllers/names" - "github.com/ibm/cassandra-operator/controllers/prober" "github.com/pkg/errors" v1 "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" @@ -22,58 +20,67 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) -func (r *CassandraClusterReconciler) reconcileAdminAuth(ctx context.Context, cc *dbv1alpha1.CassandraCluster, adminSecret *v1.Secret, proberAuth prober.Auth) error { +type credentials struct { + activeRole string + activePassword string + desiredRole string + desiredPassword string +} + +func (r *CassandraClusterReconciler) reconcileAdminAuth(ctx context.Context, cc *dbv1alpha1.CassandraCluster, desiredRole, desiredRolePassword string) (credentials, error) { + auth := credentials{ + desiredRole: desiredRole, + desiredPassword: desiredRolePassword, + } actualActiveAdminSecret := &v1.Secret{} err := r.Get(ctx, types.NamespacedName{Name: names.ActiveAdminSecret(cc.Name), Namespace: cc.Namespace}, actualActiveAdminSecret) if err != nil && kerrors.IsNotFound(err) { r.Log.Infof("Secret `%s` doesn't exist. 
Assuming it's the first cluster deployment.", names.ActiveAdminSecret(cc.Name)) - err := r.createClusterAdminSecrets(ctx, cc, adminSecret, proberAuth) + activeRole, activePassword, err := r.createClusterAdminSecrets(ctx, cc, desiredRole, desiredRolePassword) if err != nil { - return errors.Wrap(err, "failed to create admin secrets for newly created cluster") + return credentials{}, errors.Wrap(err, "failed to create admin secrets for newly created cluster") } - return nil + auth.activeRole = activeRole + auth.activePassword = activePassword + return auth, nil } else if err != nil { - return errors.Wrapf(err, "failed to get active admin Secret `%s`", names.ActiveAdminSecret(cc.Name)) + return credentials{}, errors.Wrapf(err, "failed to get active admin Secret `%s`", names.ActiveAdminSecret(cc.Name)) } + auth.activeRole = string(actualActiveAdminSecret.Data[dbv1alpha1.CassandraOperatorAdminRole]) + auth.activePassword = string(actualActiveAdminSecret.Data[dbv1alpha1.CassandraOperatorAdminPassword]) - activeAdminRoleName := string(actualActiveAdminSecret.Data[dbv1alpha1.CassandraOperatorAdminRole]) - activeAdminRolePassword := string(actualActiveAdminSecret.Data[dbv1alpha1.CassandraOperatorAdminPassword]) - baseAdminRoleName := string(actualActiveAdminSecret.Data[dbv1alpha1.CassandraOperatorAdminRole]) - baseAdminRolePassword := string(actualActiveAdminSecret.Data[dbv1alpha1.CassandraOperatorAdminPassword]) - defaultRoleInUse := activeAdminRoleName == dbv1alpha1.CassandraDefaultRole && activeAdminRolePassword == dbv1alpha1.CassandraDefaultPassword - defaultRoleIsDesired := baseAdminRoleName == dbv1alpha1.CassandraDefaultRole && baseAdminRolePassword == dbv1alpha1.CassandraDefaultPassword + defaultRoleInUse := auth.activeRole == dbv1alpha1.CassandraDefaultRole && auth.activePassword == dbv1alpha1.CassandraDefaultPassword + defaultRoleIsDesired := auth.desiredRole == dbv1alpha1.CassandraDefaultRole && auth.desiredPassword == dbv1alpha1.CassandraDefaultPassword - if defaultRoleInUse && defaultRoleIsDesired { // don't initialize if the default user is in use, unless that's what is desired - return nil + if defaultRoleInUse && defaultRoleIsDesired { // no need for password change + return auth, nil + } + + if defaultRoleInUse { // don't proceed to password change logic if still bootstrapping + return auth, nil } // Please note: your changes to Base Admin Secret won't have affect until ALL DCs are ready - passCompareRes := bytes.Compare(actualActiveAdminSecret.Data[dbv1alpha1.CassandraOperatorAdminPassword], adminSecret.Data[dbv1alpha1.CassandraOperatorAdminPassword]) - nameCompareRes := bytes.Compare(actualActiveAdminSecret.Data[dbv1alpha1.CassandraOperatorAdminRole], adminSecret.Data[dbv1alpha1.CassandraOperatorAdminRole]) - if passCompareRes != 0 || nameCompareRes != 0 { - r.Log.Info("User role changed in the secret") + if auth.activeRole != auth.desiredRole || auth.activePassword != auth.desiredPassword { + r.Log.Info("Admin role changed in the secret") - err = r.handleAdminRoleChange(ctx, cc, adminSecret, actualActiveAdminSecret) + err = r.handleAdminRoleChange(ctx, cc, auth) if err != nil { - return errors.Wrap(err, "failed to update operator admin role") + return credentials{}, errors.Wrap(err, "failed to update operator admin role") } + auth.activeRole = auth.desiredRole + auth.activePassword = auth.desiredPassword - return nil + return auth, nil } - return nil + return auth, nil } -func (r *CassandraClusterReconciler) createClusterAdminSecrets(ctx context.Context, cc 
*dbv1alpha1.CassandraCluster, adminSecret *v1.Secret, proberAuth prober.Auth) error { - secretRoleName, secretRolePassword, err := extractCredentials(adminSecret) - if err != nil { - return err - } - - desiredRoleName := dbv1alpha1.CassandraDefaultRole - desiredRolePassword := dbv1alpha1.CassandraDefaultPassword - desiredSecretData := adminSecret.Data +func (r *CassandraClusterReconciler) createClusterAdminSecrets(ctx context.Context, cc *dbv1alpha1.CassandraCluster, desiredRole, desiredPassword string) (string, string, error) { + activeRoleName := dbv1alpha1.CassandraDefaultRole + activeRolePassword := dbv1alpha1.CassandraDefaultPassword if len(cc.Spec.ExternalRegions.Managed) > 0 || len(cc.Spec.ExternalRegions.Unmanaged) > 0 { if cc.Spec.Encryption.Server.InternodeEncryption == dbv1alpha1.InternodeEncryptionNone { @@ -86,9 +93,9 @@ func (r *CassandraClusterReconciler) createClusterAdminSecrets(ctx context.Conte storageExists := false if cc.Spec.Cassandra.Persistence.Enabled { pvcs := &v1.PersistentVolumeClaimList{} - err = r.List(ctx, pvcs, client.InNamespace(cc.Namespace), client.MatchingLabels(labels.ComponentLabels(cc, dbv1alpha1.CassandraClusterComponentCassandra))) + err := r.List(ctx, pvcs, client.InNamespace(cc.Namespace), client.MatchingLabels(labels.ComponentLabels(cc, dbv1alpha1.CassandraClusterComponentCassandra))) if err != nil { - return errors.Wrap(err, "can't get pvcs") + return "", "", errors.Wrap(err, "can't get pvcs") } if len(pvcs.Items) > 0 { // cluster existed before. Use the credentials from the provided secret to recreate the cluster. @@ -99,7 +106,7 @@ func (r *CassandraClusterReconciler) createClusterAdminSecrets(ctx context.Conte joiningExistingManagedRegion := false if len(cc.Spec.ExternalRegions.Managed) > 0 { - proberClient := r.ProberClient(proberURL(cc), proberAuth) + proberClient := r.ProberClient(proberURL(cc), desiredRole, desiredPassword) for _, managedRegion := range cc.Spec.ExternalRegions.Managed { regionHost := names.ProberIngressDomain(cc, managedRegion) regionReady, err := proberClient.RegionReady(ctx, regionHost) @@ -115,26 +122,25 @@ func (r *CassandraClusterReconciler) createClusterAdminSecrets(ctx context.Conte if storageExists || len(cc.Spec.ExternalRegions.Unmanaged) > 0 || joiningExistingManagedRegion { r.Log.Info("using user provided credentials to bootstrap the region") //use the user provided credentials, not cassandra/cassandra - desiredRoleName = secretRoleName - desiredRolePassword = secretRolePassword + activeRoleName = desiredRole + activeRolePassword = desiredPassword } else { r.Log.Info("using cassandra/cassandra user to bootstrap the region") } - desiredSecretData[dbv1alpha1.CassandraOperatorAdminRole] = []byte(desiredRoleName) - desiredSecretData[dbv1alpha1.CassandraOperatorAdminPassword] = []byte(desiredRolePassword) - - if cc.Spec.JMXAuth == jmxAuthenticationLocalFiles { - desiredSecretData[dbv1alpha1.CassandraOperatorJmxUsername] = []byte(secretRoleName) - desiredSecretData[dbv1alpha1.CassandraOperatorJmxPassword] = []byte(secretRolePassword) + auth := credentials{ + desiredRole: desiredRole, + desiredPassword: desiredPassword, + activeRole: activeRoleName, + activePassword: activeRolePassword, } - err = r.reconcileAdminSecrets(ctx, cc, desiredSecretData) + err := r.reconcileAdminSecrets(ctx, cc, auth) if err != nil { - return errors.Wrap(err, "failed to reconcile active admin secrets") + return "", "", errors.Wrap(err, "failed to reconcile active admin secrets") } - return nil + return activeRoleName, activeRolePassword, nil 
} func extractCredentials(baseAdminSecret *v1.Secret) (string, string, error) { @@ -155,14 +161,9 @@ func (r *CassandraClusterReconciler) adminRoleSecret(ctx context.Context, cc *db return baseAdminSecret, nil } -func (r *CassandraClusterReconciler) handleAdminRoleChange(ctx context.Context, cc *dbv1alpha1.CassandraCluster, actualBaseAdminSecret, actualActiveAdminSecret *v1.Secret) error { +func (r *CassandraClusterReconciler) handleAdminRoleChange(ctx context.Context, cc *dbv1alpha1.CassandraCluster, auth credentials) error { r.Log.Info("Updating admin role") - cassandraOperatorAdminRole := string(actualActiveAdminSecret.Data[dbv1alpha1.CassandraOperatorAdminRole]) - cassandraOperatorAdminPassword := string(actualActiveAdminSecret.Data[dbv1alpha1.CassandraOperatorAdminPassword]) - newCassandraOperatorAdminPassword := string(actualBaseAdminSecret.Data[dbv1alpha1.CassandraOperatorAdminPassword]) - newCassandraOperatorAdminRole := string(actualBaseAdminSecret.Data[dbv1alpha1.CassandraOperatorAdminRole]) - - err := r.updateAdminRoleInCassandra(cc, cassandraOperatorAdminRole, newCassandraOperatorAdminRole, cassandraOperatorAdminPassword, newCassandraOperatorAdminPassword) + err := r.updateAdminRoleInCassandra(cc, auth) if err != nil { errMsg := "failed to update admin role in cassandra" r.Events.Warning(cc, events.EventAdminRoleUpdateFailed, errMsg) @@ -174,7 +175,7 @@ func (r *CassandraClusterReconciler) handleAdminRoleChange(ctx context.Context, var cqlClientTestCon cql.CqlClient err = r.doWithRetry(func() error { - cqlClientTestCon, err = r.CqlClient(newCassandraConfig(cc, newCassandraOperatorAdminRole, newCassandraOperatorAdminPassword, r.Log)) + cqlClientTestCon, err = r.CqlClient(newCassandraConfig(cc, auth.desiredRole, auth.desiredPassword, r.Log)) if err != nil { return err } @@ -190,13 +191,10 @@ func (r *CassandraClusterReconciler) handleAdminRoleChange(ctx context.Context, r.Log.Info("Logged in successfully with new credentials. 
Updating active admin secret.") defer cqlClientTestCon.CloseSession() - if cc.Spec.JMXAuth == jmxAuthenticationLocalFiles { - actualBaseAdminSecret.Data[dbv1alpha1.CassandraOperatorJmxUsername] = []byte(newCassandraOperatorAdminRole) - actualBaseAdminSecret.Data[dbv1alpha1.CassandraOperatorJmxPassword] = []byte(newCassandraOperatorAdminPassword) - } - + auth.activeRole = auth.desiredRole + auth.activePassword = auth.desiredPassword r.Events.Normal(cc, events.EventAdminRoleChanged, "admin role has been successfully changed") - err = r.reconcileAdminSecrets(ctx, cc, actualBaseAdminSecret.Data) + err = r.reconcileAdminSecrets(ctx, cc, auth) if err != nil { return errors.Wrap(err, "failed to update admin secret") } @@ -204,8 +202,8 @@ func (r *CassandraClusterReconciler) handleAdminRoleChange(ctx context.Context, return nil } -func (r *CassandraClusterReconciler) updateAdminRoleInCassandra(cc *dbv1alpha1.CassandraCluster, oldAdminRoleName, newAdminRoleName, oldAdminPassword, newAdminPassword string) error { - cqlClient, err := r.CqlClient(newCassandraConfig(cc, newAdminRoleName, newAdminPassword, r.Log)) +func (r *CassandraClusterReconciler) updateAdminRoleInCassandra(cc *dbv1alpha1.CassandraCluster, auth credentials) error { + cqlClient, err := r.CqlClient(newCassandraConfig(cc, auth.desiredRole, auth.desiredPassword, r.Log)) if err == nil { r.Log.Info("Admin role has been already updated by a different region") cqlClient.CloseSession() @@ -213,8 +211,8 @@ func (r *CassandraClusterReconciler) updateAdminRoleInCassandra(cc *dbv1alpha1.C return nil } - r.Log.Info("Establishing cql session with role " + oldAdminRoleName) - cqlClient, err = r.CqlClient(newCassandraConfig(cc, oldAdminRoleName, oldAdminPassword, r.Log)) + r.Log.Info("Establishing cql session with role " + auth.activeRole) + cqlClient, err = r.CqlClient(newCassandraConfig(cc, auth.activeRole, auth.activePassword, r.Log)) if err != nil { return errors.Wrap(err, "Could not log in with existing credentials") } @@ -222,21 +220,21 @@ func (r *CassandraClusterReconciler) updateAdminRoleInCassandra(cc *dbv1alpha1.C r.Log.Info("Updating admin role") - if oldAdminRoleName == newAdminRoleName { - if err = cqlClient.UpdateRolePassword(oldAdminRoleName, newAdminPassword); err != nil { - return errors.Wrap(err, "Can't update role"+oldAdminRoleName) + if auth.activeRole == auth.desiredRole { + if err = cqlClient.UpdateRolePassword(auth.activeRole, auth.desiredPassword); err != nil { + return errors.Wrap(err, "Can't update role"+auth.activeRole) } r.Log.Info("Admin password in cassandra cluster is successfully updated") } else { r.Log.Info("Admin role name changed. Creating a new admin role in cassandra") cassOperatorAdminRole := cql.Role{ - Role: newAdminRoleName, + Role: auth.desiredRole, Super: true, Login: true, - Password: newAdminPassword, + Password: auth.desiredPassword, } if err = cqlClient.CreateRole(cassOperatorAdminRole); err != nil { - return errors.Wrap(err, "Can't create admin role "+oldAdminRoleName) + return errors.Wrap(err, "Can't create admin role "+auth.desiredRole) } r.Log.Info("New admin role created. Old admin role was NOT removed. 
Manual removal is required.") } @@ -244,7 +242,7 @@ func (r *CassandraClusterReconciler) updateAdminRoleInCassandra(cc *dbv1alpha1.C return nil } -func (r *CassandraClusterReconciler) reconcileActiveAdminSecret(ctx context.Context, cc *dbv1alpha1.CassandraCluster, data map[string][]byte) error { +func (r *CassandraClusterReconciler) reconcileActiveAdminSecret(ctx context.Context, cc *dbv1alpha1.CassandraCluster, auth credentials) error { desiredActiveAdminSecret := &v1.Secret{ ObjectMeta: metav1.ObjectMeta{ Name: names.ActiveAdminSecret(cc.Name), @@ -253,7 +251,10 @@ func (r *CassandraClusterReconciler) reconcileActiveAdminSecret(ctx context.Cont }, Immutable: proto.Bool(true), Type: v1.SecretTypeOpaque, - Data: data, + Data: map[string][]byte{ + dbv1alpha1.CassandraOperatorAdminRole: []byte(auth.activeRole), + dbv1alpha1.CassandraOperatorAdminPassword: []byte(auth.activePassword), + }, } if err := controllerutil.SetControllerReference(cc, desiredActiveAdminSecret, r.Scheme); err != nil { @@ -287,7 +288,7 @@ func (r *CassandraClusterReconciler) reconcileActiveAdminSecret(ctx context.Cont return nil } -func (r *CassandraClusterReconciler) reconcileAdminAuthConfigSecret(ctx context.Context, cc *dbv1alpha1.CassandraCluster, secretData map[string][]byte) error { +func (r *CassandraClusterReconciler) reconcileAdminAuthConfigSecret(ctx context.Context, cc *dbv1alpha1.CassandraCluster, auth credentials) error { desiredAdminAuthConfigSecret := &v1.Secret{ ObjectMeta: metav1.ObjectMeta{ Name: names.AdminAuthConfigSecret(cc.Name), @@ -299,18 +300,15 @@ func (r *CassandraClusterReconciler) reconcileAdminAuthConfigSecret(ctx context. data := make(map[string][]byte) - cassandraAdminRole := string(secretData[dbv1alpha1.CassandraOperatorAdminRole]) - cassandraAdminPassword := string(secretData[dbv1alpha1.CassandraOperatorAdminPassword]) + data["icarus-jmx"] = []byte(fmt.Sprintf("username=%s\npassword=%s\n", auth.desiredRole, auth.desiredPassword)) if cc.Spec.JMXAuth == jmxAuthenticationLocalFiles { - jmxUsername := string(secretData[dbv1alpha1.CassandraOperatorJmxUsername]) - jmxPassword := string(secretData[dbv1alpha1.CassandraOperatorJmxPassword]) - data["jmxremote.password"] = []byte(fmt.Sprintf("%s %s\n", jmxUsername, jmxPassword)) + data["jmxremote.password"] = []byte(fmt.Sprintf("%s %s\n", auth.desiredRole, auth.desiredPassword)) // jmxremote.access file is not hot-reload in runtime, so we need to set the cassandra role before the start data["jmxremote.access"] = []byte(fmt.Sprintf(`%s readwrite \ create javax.management.monitor.*, javax.management.timer.* \ unregister -`, jmxUsername)) +`, auth.desiredRole)) } if cc.Spec.Encryption.Client.Enabled { @@ -344,8 +342,8 @@ usercert = %s/%s data["cqlshrc"] = []byte(cqlshConfig) } - data[dbv1alpha1.CassandraOperatorAdminRole] = []byte(cassandraAdminRole) - data[dbv1alpha1.CassandraOperatorAdminPassword] = []byte(cassandraAdminPassword) + data[dbv1alpha1.CassandraOperatorAdminRole] = []byte(auth.activeRole) + data[dbv1alpha1.CassandraOperatorAdminPassword] = []byte(auth.activePassword) desiredAdminAuthConfigSecret.Data = data diff --git a/controllers/cassandra_icarus_container.go b/controllers/cassandra_icarus_container.go new file mode 100644 index 0000000..3853f02 --- /dev/null +++ b/controllers/cassandra_icarus_container.go @@ -0,0 +1,73 @@ +package controllers + +import ( + dbv1alpha1 "github.com/ibm/cassandra-operator/api/v1alpha1" + v1 "k8s.io/api/core/v1" +) + +func icarusContainer(cc *dbv1alpha1.CassandraCluster) v1.Container { + container := 
v1.Container{ + Name: "icarus", + Image: cc.Spec.Icarus.Image, + ImagePullPolicy: cc.Spec.Icarus.ImagePullPolicy, + Args: []string{"--jmx-credentials=/etc/cassandra-auth-config/icarus-jmx", "--jmx-client-auth=true"}, + VolumeMounts: []v1.VolumeMount{ + cassandraDataVolumeMount(), + authVolumeMount(), + }, + TerminationMessagePath: "/dev/termination-log", + TerminationMessagePolicy: v1.TerminationMessageReadFile, + } + + icarusPort := v1.ContainerPort{ + Name: "icarus", + ContainerPort: dbv1alpha1.IcarusPort, + Protocol: v1.ProtocolTCP, + HostPort: 0, + } + + if cc.Spec.HostPort.Enabled { + icarusPort.HostPort = dbv1alpha1.IcarusPort + } + + if cc.Spec.Encryption.Client.Enabled { + tlsArgs := []string{ + "--jmx-truststore=/etc/cassandra-client-tls/" + cc.Spec.Encryption.Client.NodeTLSSecret.TruststoreFileKey, + "--jmx-truststore-password=$(ICARUS_TRUSTSTORE_PASSWORD)", + "--jmx-keystore=/etc/cassandra-client-tls/" + cc.Spec.Encryption.Client.NodeTLSSecret.KeystoreFileKey, + "--jmx-keystore-password=$(ICARUS_KEYSTORE_PASSWORD)", + } + + container.Env = append(container.Env, + v1.EnvVar{ + Name: "ICARUS_TRUSTSTORE_PASSWORD", + ValueFrom: &v1.EnvVarSource{ + SecretKeyRef: &v1.SecretKeySelector{ + LocalObjectReference: v1.LocalObjectReference{ + Name: cc.Spec.Encryption.Client.NodeTLSSecret.Name, + }, + Key: cc.Spec.Encryption.Client.NodeTLSSecret.TruststorePasswordKey, + }, + }, + }, + v1.EnvVar{ + Name: "ICARUS_KEYSTORE_PASSWORD", + ValueFrom: &v1.EnvVarSource{ + SecretKeyRef: &v1.SecretKeySelector{ + LocalObjectReference: v1.LocalObjectReference{ + Name: cc.Spec.Encryption.Client.NodeTLSSecret.Name, + }, + Key: cc.Spec.Encryption.Client.NodeTLSSecret.KeystorePasswordKey, + }, + }, + }, + ) + + container.Args = append(container.Args, tlsArgs...) + container.VolumeMounts = append(container.VolumeMounts, cassandraClientTLSVolumeMount()) + } + + container.Ports = append(container.Ports, icarusPort) + + return container +} diff --git a/controllers/cassandra_pods_config_test.go b/controllers/cassandra_pods_config_test.go index 9f9a716..92aae6a 100755 --- a/controllers/cassandra_pods_config_test.go +++ b/controllers/cassandra_pods_config_test.go @@ -1311,7 +1311,7 @@ export PAUSE_REASON="pod is not paused" reconciler := &CassandraClusterReconciler{ Client: tClient, - ProberClient: func(url *url.URL, auth prober.Auth) prober.ProberClient { + ProberClient: func(url *url.URL, user, password string) prober.ProberClient { return proberClient }, Scheme: baseScheme, diff --git a/controllers/cassandra_rbac.go b/controllers/cassandra_rbac.go new file mode 100644 index 0000000..38b697f --- /dev/null +++ b/controllers/cassandra_rbac.go @@ -0,0 +1,165 @@ +package controllers + +import ( + "context" + + "github.com/ibm/cassandra-operator/api/v1alpha1" + "github.com/ibm/cassandra-operator/controllers/compare" + "github.com/ibm/cassandra-operator/controllers/labels" + "github.com/ibm/cassandra-operator/controllers/names" + + "github.com/pkg/errors" + + v1 "k8s.io/api/core/v1" + rbac "k8s.io/api/rbac/v1" + kerrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" +) + +func (r *CassandraClusterReconciler) reconcileCassandraRBAC(ctx context.Context, cc *v1alpha1.CassandraCluster) error { + if err := r.reconcileCassandraRole(ctx, cc); err != nil { + return err + } + if err := r.reconcileCassandraRoleBinding(ctx, cc); err != nil { + return err + } + if err := 
r.reconcileCassandraServiceAccount(ctx, cc); err != nil { + return err + } + + return nil +} + +func (r *CassandraClusterReconciler) reconcileCassandraRole(ctx context.Context, cc *v1alpha1.CassandraCluster) error { + desiredRole := &rbac.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: names.CassandraRole(cc.Name), + Namespace: cc.Namespace, + Labels: labels.CombinedComponentLabels(cc, v1alpha1.CassandraClusterComponentCassandra), + }, + Rules: []rbac.PolicyRule{ + { + APIGroups: []string{""}, + Resources: []string{"secrets"}, + Verbs: []string{"get", "list", "watch"}, + }, + }, + } + + // Set controller reference for role + if err := controllerutil.SetControllerReference(cc, desiredRole, r.Scheme); err != nil { + return errors.Wrap(err, "Cannot set controller reference") + } + + actualRole := &rbac.Role{} + + err := r.Get(ctx, types.NamespacedName{Name: desiredRole.Name, Namespace: desiredRole.Namespace}, actualRole) + if err != nil && kerrors.IsNotFound(err) { + r.Log.Info("Creating cassandra Role") + if err = r.Create(ctx, desiredRole); err != nil { + return errors.Wrap(err, "Unable to create cassandra role") + } + } else if err != nil { + return errors.Wrap(err, "Could not Get cassandra role") + } else if !compare.EqualRole(actualRole, desiredRole) { + r.Log.Info("Updating cassandra Role") + r.Log.Debug(compare.DiffRole(actualRole, desiredRole)) + actualRole.Rules = desiredRole.Rules + actualRole.Labels = desiredRole.Labels + if err = r.Update(ctx, actualRole); err != nil { + return errors.Wrap(err, "Could not Update cassandra role") + } + } else { + r.Log.Debug("No updates for cassandra Role") + + } + return nil +} + +func (r *CassandraClusterReconciler) reconcileCassandraRoleBinding(ctx context.Context, cc *v1alpha1.CassandraCluster) error { + desiredRoleBinding := &rbac.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: names.CassandraRoleBinding(cc.Name), + Namespace: cc.Namespace, + Labels: labels.CombinedComponentLabels(cc, v1alpha1.CassandraClusterComponentCassandra), + }, + Subjects: []rbac.Subject{ + { + Kind: "ServiceAccount", + Name: names.CassandraServiceAccount(cc.Name), + Namespace: cc.Namespace, + }, + }, + RoleRef: rbac.RoleRef{ + Kind: "Role", + Name: names.CassandraRole(cc.Name), + APIGroup: "rbac.authorization.k8s.io", + }, + } + + if err := controllerutil.SetControllerReference(cc, desiredRoleBinding, r.Scheme); err != nil { + return errors.Wrap(err, "Cannot set controller reference") + } + + actualRoleBinding := &rbac.RoleBinding{} + + err := r.Get(ctx, types.NamespacedName{Name: desiredRoleBinding.Name, Namespace: desiredRoleBinding.Namespace}, actualRoleBinding) + if err != nil && kerrors.IsNotFound(err) { + r.Log.Info("Creating cassandra RoleBinding") + if err = r.Create(ctx, desiredRoleBinding); err != nil { + return errors.Wrap(err, "Unable to create cassandra roleBinding") + } + } else if err != nil { + return errors.Wrap(err, "Could not Get cassandra roleBinding") + } else if !compare.EqualRoleBinding(actualRoleBinding, desiredRoleBinding) { + r.Log.Info("Updating cassandra RoleBinding") + r.Log.Debug(compare.DiffRoleBinding(actualRoleBinding, desiredRoleBinding)) + actualRoleBinding.Subjects = desiredRoleBinding.Subjects + actualRoleBinding.RoleRef = desiredRoleBinding.RoleRef + actualRoleBinding.Labels = desiredRoleBinding.Labels + if err = r.Update(ctx, actualRoleBinding); err != nil { + return errors.Wrap(err, "Could not Update cassandra roleBinding") + } + } else { + r.Log.Debugw("No updates for cassandra Rolebinding") + } + return nil +} + +func (r 
*CassandraClusterReconciler) reconcileCassandraServiceAccount(ctx context.Context, cc *v1alpha1.CassandraCluster) error { + saName := names.CassandraServiceAccount(cc.Name) + desiredServiceAccount := &v1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: saName, + Namespace: cc.Namespace, + Labels: labels.CombinedComponentLabels(cc, v1alpha1.CassandraClusterComponentCassandra), + }, + } + + if err := controllerutil.SetControllerReference(cc, desiredServiceAccount, r.Scheme); err != nil { + return errors.Wrap(err, "Cannot set controller reference") + } + + actualServiceAccount := &v1.ServiceAccount{} + err := r.Get(ctx, types.NamespacedName{Name: desiredServiceAccount.Name, Namespace: desiredServiceAccount.Namespace}, actualServiceAccount) + if err != nil && kerrors.IsNotFound(err) { + r.Log.Info("Creating Service cassandra account") + if err = r.Create(ctx, desiredServiceAccount); err != nil { + return errors.Wrap(err, "Unable to create cassandra Service account") + } + } else if err != nil { + return errors.Wrap(err, "Could not get cassandra Service account") + } else if !compare.EqualServiceAccount(actualServiceAccount, desiredServiceAccount) { + r.Log.Info("Updating cassandra Service account") + r.Log.Debugf(compare.DiffServiceAccount(actualServiceAccount, desiredServiceAccount)) + actualServiceAccount.Labels = desiredServiceAccount.Labels + if err = r.Update(ctx, actualServiceAccount); err != nil { + return errors.Wrap(err, "Unable to update cassandra service account") + } + } else { + r.Log.Debug("No updates for cassandra Service account") + } + return nil +} diff --git a/controllers/cassandra_scaling.go b/controllers/cassandra_scaling.go index 177f879..943fda4 100644 --- a/controllers/cassandra_scaling.go +++ b/controllers/cassandra_scaling.go @@ -98,7 +98,7 @@ func (r *CassandraClusterReconciler) reconcileCassandraScaling(ctx context.Conte return true, nil } -func (r CassandraClusterReconciler) handleDCsDecommission(ctx context.Context, cc *dbv1alpha1.CassandraCluster, stsNames []string, stsList map[string]appsv1.StatefulSet, allDCs []dbv1alpha1.DC, adminRoleSecret *v1.Secret, broadcastAddresses map[string]string, podList *v1.PodList) error { +func (r *CassandraClusterReconciler) handleDCsDecommission(ctx context.Context, cc *dbv1alpha1.CassandraCluster, stsNames []string, stsList map[string]appsv1.StatefulSet, allDCs []dbv1alpha1.DC, adminRoleSecret *v1.Secret, broadcastAddresses map[string]string, podList *v1.PodList) error { r.Log.Infof("Decommissioning DCs %v", stsNames) keyspacesToReconcile := desiredKeyspacesToReconcile(cc) @@ -175,7 +175,7 @@ func (r CassandraClusterReconciler) handleDCsDecommission(ctx context.Context, c return nil } -func (r CassandraClusterReconciler) handlePodDecommission(ctx context.Context, cc *dbv1alpha1.CassandraCluster, sts appsv1.StatefulSet, broadcastAddresses map[string]string, decommissionPodName string, podList *v1.PodList) error { +func (r *CassandraClusterReconciler) handlePodDecommission(ctx context.Context, cc *dbv1alpha1.CassandraCluster, sts appsv1.StatefulSet, broadcastAddresses map[string]string, decommissionPodName string, podList *v1.PodList) error { jobName := "pod-decommission-" + decommissionPodName if r.Jobs.Exists(jobName) && r.Jobs.IsRunning(jobName) { r.Log.Infof("decommission in progress, waiting to finish") @@ -257,7 +257,7 @@ func (r CassandraClusterReconciler) handlePodDecommission(ctx context.Context, c return nil } -func (r CassandraClusterReconciler) podDecommissioned(ctx context.Context, cc 
*dbv1alpha1.CassandraCluster, nctl nodectl.Nodectl, pods []v1.Pod, decommissionPod v1.Pod, broadcastAddresses map[string]string) (bool, error) { +func (r *CassandraClusterReconciler) podDecommissioned(ctx context.Context, cc *dbv1alpha1.CassandraCluster, nctl nodectl.Nodectl, pods []v1.Pod, decommissionPod v1.Pod, broadcastAddresses map[string]string) (bool, error) { quorum := 0 //number of pods that need to agree that the pods is decommissioned for _, pod := range pods { podDC := pod.Labels[dbv1alpha1.CassandraClusterDC] @@ -296,7 +296,7 @@ func (r CassandraClusterReconciler) podDecommissioned(ctx context.Context, cc *d return notLiveView >= quorum, nil } -func (r CassandraClusterReconciler) removeDC(ctx context.Context, cc *dbv1alpha1.CassandraCluster, sts appsv1.StatefulSet) error { +func (r *CassandraClusterReconciler) removeDC(ctx context.Context, cc *dbv1alpha1.CassandraCluster, sts appsv1.StatefulSet) error { dcName := sts.Labels[dbv1alpha1.CassandraClusterDC] reaperDeploy := &appsv1.Deployment{} diff --git a/controllers/cassandra_service.go b/controllers/cassandra_service.go index d39367c..c46ca3c 100644 --- a/controllers/cassandra_service.go +++ b/controllers/cassandra_service.go @@ -62,6 +62,13 @@ func (r *CassandraClusterReconciler) reconcileDCService(ctx context.Context, cc TargetPort: intstr.FromInt(dbv1alpha1.ThriftPort), NodePort: 0, }, + { + Name: "icarus", + Protocol: v1.ProtocolTCP, + Port: dbv1alpha1.IcarusPort, + TargetPort: intstr.FromInt(dbv1alpha1.IcarusPort), + NodePort: 0, + }, }, ClusterIP: v1.ClusterIPNone, Type: v1.ServiceTypeClusterIP, diff --git a/controllers/cassandra_statefulset.go b/controllers/cassandra_statefulset.go index 0434608..850848f 100644 --- a/controllers/cassandra_statefulset.go +++ b/controllers/cassandra_statefulset.go @@ -65,7 +65,7 @@ func (r *CassandraClusterReconciler) reconcileDCStatefulSet(ctx context.Context, desiredSts.Spec.Selector = actualSts.Spec.Selector desiredSts.Spec.Template.Labels = actualSts.Spec.Template.Labels // annotation can be used by things like `kubectl rollout sts restart` so don't overwrite it - desiredSts.Spec.Template.Annotations = actualSts.Spec.Template.Annotations + desiredSts.Spec.Template.Annotations = util.MergeMap(actualSts.Spec.Template.Annotations, desiredSts.Spec.Template.Annotations) // scaling is handled by the scaling logic desiredSts.Spec.Replicas = actualSts.Spec.Replicas if !compare.EqualStatefulSet(desiredSts, actualSts) { @@ -111,10 +111,15 @@ func cassandraStatefulSet(cc *dbv1alpha1.CassandraCluster, dc dbv1alpha1.DC, res Template: v1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: stsLabels, + Annotations: map[string]string{ + // https://kubernetes.io/docs/reference/labels-annotations-taints/#kubectl-kubernetes-io-default-container + "kubectl.kubernetes.io/default-container": "cassandra", + }, }, Spec: v1.PodSpec{ Containers: []v1.Container{ cassandraContainer(cc, dc, restartChecksum, clientTLSSecret), + icarusContainer(cc), }, InitContainers: []v1.Container{ privilegedInitContainer(cc), @@ -128,6 +133,7 @@ func cassandraStatefulSet(cc *dbv1alpha1.CassandraCluster, dc dbv1alpha1.DC, res podsConfigVolume(cc), authVolume(cc), }, + ServiceAccountName: names.CassandraServiceAccount(cc.Name), Affinity: dc.Affinity, Tolerations: dc.Tolerations, RestartPolicy: v1.RestartPolicyAlways, @@ -268,6 +274,10 @@ func authVolume(cc *dbv1alpha1.CassandraCluster) v1.Volume { Key: dbv1alpha1.CassandraOperatorAdminPassword, Path: dbv1alpha1.CassandraOperatorAdminPassword, }, + { + Key: "icarus-jmx", + 
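util.MergeMap itself is not part of this diff, so its exact implementation is an assumption; the statefulset template hunk above only relies on it returning a fresh map (nil when both inputs are nil) in which the second argument wins, without mutating either input. A minimal sketch of that contract:

package sketch

// mergeMapSketch illustrates the assumed behaviour of util.MergeMap:
// neither input is modified, nil is returned only when both inputs are nil,
// and values from the second map take precedence on key conflicts.
func mergeMapSketch(existing, desired map[string]string) map[string]string {
	if existing == nil && desired == nil {
		return nil
	}
	merged := make(map[string]string, len(existing)+len(desired))
	for k, v := range existing {
		merged[k] = v // e.g. kubectl.kubernetes.io/restartedAt written by `kubectl rollout restart`
	}
	for k, v := range desired {
		merged[k] = v // operator-managed keys such as kubectl.kubernetes.io/default-container
	}
	return merged
}

With these semantics an annotation added out of band (the restart timestamp) survives reconciliation while the operator's own annotations are still enforced.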
Path: "icarus-jmx", + }, } if cc.Spec.Encryption.Client.Enabled { diff --git a/controllers/cassandra_tls.go b/controllers/cassandra_tls.go index 5d831d7..ee84f27 100644 --- a/controllers/cassandra_tls.go +++ b/controllers/cassandra_tls.go @@ -4,7 +4,6 @@ import ( "context" "crypto/x509" "fmt" - "io/ioutil" "log" "os" "regexp" @@ -93,23 +92,23 @@ func newTLSSecret(cc *dbv1alpha1.CassandraCluster, secretType tlsSecretType) *tl } func newTLSSecretRequiredFields(cc *dbv1alpha1.CassandraCluster, secretType tlsSecretType) ([]string, error) { - var tlsSecret dbv1alpha1.NodeTLSSecret + var nodeTLSSecret dbv1alpha1.NodeTLSSecret switch secretType { case clientNode: - tlsSecret = cc.Spec.Encryption.Client.NodeTLSSecret + nodeTLSSecret = cc.Spec.Encryption.Client.NodeTLSSecret case serverNode: - tlsSecret = cc.Spec.Encryption.Server.NodeTLSSecret + nodeTLSSecret = cc.Spec.Encryption.Server.NodeTLSSecret default: return nil, errors.New(fmt.Sprintf("only node secrets should check required fields. secretType: %v", secretType)) } return []string{ - tlsSecret.CACrtFileKey, - tlsSecret.CrtFileKey, - tlsSecret.FileKey, - tlsSecret.KeystoreFileKey, - tlsSecret.TruststoreFileKey, - tlsSecret.KeystorePasswordKey, - tlsSecret.TruststorePasswordKey, + nodeTLSSecret.CACrtFileKey, + nodeTLSSecret.CrtFileKey, + nodeTLSSecret.FileKey, + nodeTLSSecret.KeystoreFileKey, + nodeTLSSecret.TruststoreFileKey, + nodeTLSSecret.KeystorePasswordKey, + nodeTLSSecret.TruststorePasswordKey, }, nil } @@ -130,30 +129,30 @@ func (r *CassandraClusterReconciler) validateTLSFields(cc *dbv1alpha1.CassandraC func (r *CassandraClusterReconciler) reconcileNodeTLSSecret(ctx context.Context, cc *dbv1alpha1.CassandraCluster, restartChecksum checksumContainer, secretType tlsSecretType) (*v1.Secret, error) { secret := newTLSSecret(cc, secretType) - tlsSecret, err := r.getTLSSecret(ctx, cc, secret, false) + k8sTLSSecret, err := r.getTLSSecret(ctx, cc, secret, false) if err != nil { return nil, err } - if err = r.validateTLSFields(cc, tlsSecret, secretType); err != nil { + if err = r.validateTLSFields(cc, k8sTLSSecret, secretType); err != nil { return nil, errors.Wrapf(err, "failed to validate %s: %s fields", secret.fieldPath, *secret.secretName) } annotations := make(map[string]string) annotations[dbv1alpha1.CassandraClusterInstance] = cc.Name - err = r.reconcileAnnotations(ctx, tlsSecret, annotations) + err = r.reconcileAnnotations(ctx, k8sTLSSecret, annotations) if err != nil { - return nil, errors.Wrapf(err, "failed to reconcile annotations for secret `%s`", tlsSecret.Name) + return nil, errors.Wrapf(err, "failed to reconcile annotations for secret `%s`", k8sTLSSecret.Name) } - restartChecksum[secret.annotation] = fmt.Sprintf("%v", tlsSecret.Data) - return tlsSecret, nil + restartChecksum[secret.annotation] = fmt.Sprintf("%v", k8sTLSSecret.Data) + return k8sTLSSecret, nil } func (r *CassandraClusterReconciler) getTLSSecret(ctx context.Context, cc *dbv1alpha1.CassandraCluster, secret *tlsSecret, allowDefaulting bool) (*v1.Secret, error) { - tlsSecret := &v1.Secret{} + k8sTLSSecret := &v1.Secret{} secretName := secret.secretName - err := r.Get(ctx, types.NamespacedName{Name: *secretName, Namespace: cc.Namespace}, tlsSecret) + err := r.Get(ctx, types.NamespacedName{Name: *secretName, Namespace: cc.Namespace}, k8sTLSSecret) if err != nil && kerrors.IsNotFound(err) { errMsg := fmt.Sprintf("%s: %s was not found", secret.fieldPath, *secretName) if allowDefaulting { @@ -164,7 +163,7 @@ func (r *CassandraClusterReconciler) getTLSSecret(ctx 
context.Context, cc *dbv1a } else if err != nil { return nil, errors.Wrapf(err, "failed to get secret: %s: %s", secret.fieldPath, *secretName) } - return tlsSecret, nil + return k8sTLSSecret, nil } func (r *CassandraClusterReconciler) reconcileServerEncryption(ctx context.Context, cc *dbv1alpha1.CassandraCluster) error { @@ -456,17 +455,17 @@ func (r *CassandraClusterReconciler) setupClientTLSFiles(ctx context.Context, cc return errors.Wrapf(err, "failed to create directory: %s", names.OperatorClientTLSDir(cc)) } - err = ioutil.WriteFile(fmt.Sprintf("%s/%s", names.OperatorClientTLSDir(cc), cc.Spec.Encryption.Client.NodeTLSSecret.CACrtFileKey), clientTLSSecret.Data[cc.Spec.Encryption.Client.NodeTLSSecret.CACrtFileKey], 0600) + err = os.WriteFile(fmt.Sprintf("%s/%s", names.OperatorClientTLSDir(cc), cc.Spec.Encryption.Client.NodeTLSSecret.CACrtFileKey), clientTLSSecret.Data[cc.Spec.Encryption.Client.NodeTLSSecret.CACrtFileKey], 0600) if err != nil { return errors.Wrapf(err, "failed to write CA certificate into file %s/%s", names.OperatorClientTLSDir(cc), cc.Spec.Encryption.Client.NodeTLSSecret.CACrtFileKey) } - err = ioutil.WriteFile(fmt.Sprintf("%s/%s", names.OperatorClientTLSDir(cc), cc.Spec.Encryption.Client.NodeTLSSecret.FileKey), clientTLSSecret.Data[cc.Spec.Encryption.Client.NodeTLSSecret.FileKey], 0600) + err = os.WriteFile(fmt.Sprintf("%s/%s", names.OperatorClientTLSDir(cc), cc.Spec.Encryption.Client.NodeTLSSecret.FileKey), clientTLSSecret.Data[cc.Spec.Encryption.Client.NodeTLSSecret.FileKey], 0600) if err != nil { return errors.Wrapf(err, "failed to write private key into file %s/%s", names.OperatorClientTLSDir(cc), cc.Spec.Encryption.Client.NodeTLSSecret.FileKey) } - err = ioutil.WriteFile(fmt.Sprintf("%s/%s", names.OperatorClientTLSDir(cc), cc.Spec.Encryption.Client.NodeTLSSecret.CrtFileKey), clientTLSSecret.Data[cc.Spec.Encryption.Client.NodeTLSSecret.CrtFileKey], 0600) + err = os.WriteFile(fmt.Sprintf("%s/%s", names.OperatorClientTLSDir(cc), cc.Spec.Encryption.Client.NodeTLSSecret.CrtFileKey), clientTLSSecret.Data[cc.Spec.Encryption.Client.NodeTLSSecret.CrtFileKey], 0600) if err != nil { return errors.Wrapf(err, "failed to write certificate into file %s/%s", names.OperatorClientTLSDir(cc), cc.Spec.Encryption.Client.NodeTLSSecret.CrtFileKey) } diff --git a/controllers/cassandrabackup/backup.go b/controllers/cassandrabackup/backup.go new file mode 100644 index 0000000..f3c6ef9 --- /dev/null +++ b/controllers/cassandrabackup/backup.go @@ -0,0 +1,126 @@ +package cassandrabackup + +import ( + "context" + "strings" + "time" + + "github.com/ibm/cassandra-operator/api/v1alpha1" + "github.com/ibm/cassandra-operator/controllers/icarus" + ctrl "sigs.k8s.io/controller-runtime" +) + +func (r *CassandraBackupReconciler) reconcileBackup(ctx context.Context, ic icarus.Icarus, cb *v1alpha1.CassandraBackup, cc *v1alpha1.CassandraCluster) (ctrl.Result, error) { + existingBackups, err := ic.Backups(ctx) + if err != nil { + return ctrl.Result{}, err + } + + icarusBackup, relatedIcarusBackupFound := r.findRelatedBackup(cb, existingBackups) + + if cb.Status.State == icarus.StateFailed { + if !relatedIcarusBackupFound { + r.Log.Infof("Backup %s/%s has failed and no respecting backup record found in icarus. "+ + "Recreate the CassandraBackup resource to start a new backup attempt", cb.Namespace, cb.Name) + return ctrl.Result{}, err + } + + return ctrl.Result{}, r.reconcileFailedBackup(ctx, ic, icarusBackup, cc, cb) + } + + // create if not found or found, but it's a new backup with the same tag (e.g. 
incremental backup) + if !relatedIcarusBackupFound || (relatedIcarusBackupFound && len(cb.Status.State) == 0) { + icarusBackup, err = ic.Backup(ctx, createBackupRequest(cc, cb)) + if err != nil { + return ctrl.Result{}, err + } + + r.Log.Debugf("Backup request sent") + } + + err = r.reconcileStatus(ctx, cb, icarusBackup) + if err != nil { + return ctrl.Result{}, err + } + + return ctrl.Result{RequeueAfter: r.Cfg.RetryDelay}, nil +} + +func (r *CassandraBackupReconciler) findRelatedBackup(cb *v1alpha1.CassandraBackup, icarusBackups []icarus.Backup) (icarus.Backup, bool) { + if len(icarusBackups) == 0 { + return icarus.Backup{}, false + } + + tagName := cb.Spec.SnapshotTag + if len(tagName) == 0 { + tagName = cb.Name + } + var relatedBackups []icarus.Backup //there may be several related backups if we retried a failed one + for _, item := range icarusBackups { + if !strings.Contains(item.SnapshotTag, tagName+"-"+item.SchemaVersion) && tagName != item.SnapshotTag { + continue + } + + if !item.GlobalRequest { //only look at coordinator requests + continue + } + relatedBackups = append(relatedBackups, item) + } + + if len(relatedBackups) == 0 { + return icarus.Backup{}, false + } + + var existingBackup icarus.Backup + if len(relatedBackups) == 1 { + existingBackup = relatedBackups[0] + } else { // more than one related backups + for i, relatedBackup := range relatedBackups { + candidateCreatedTime, err := time.Parse(time.RFC3339, relatedBackup.CreationTime) + if err != nil { + r.Log.Warnf("Couldn't parse creation time for backup with id %s. Skipping it", relatedBackup.ID) + continue + } + if len(existingBackup.CreationTime) == 0 { + existingBackup = relatedBackups[i] + continue + } + + existingBackupCreatedTime, err := time.Parse(time.RFC3339, existingBackup.CreationTime) + if err != nil { + r.Log.Warnf("skipping backup with ID %s and tag %s. Couldn't parse create time to determine if it's the most recent backup: %s", + relatedBackup.ID, relatedBackup.SnapshotTag, err.Error()) + continue + } + if existingBackupCreatedTime.Before(candidateCreatedTime) { + //this one is created later, use it + existingBackup = relatedBackups[i] + } + } + } + + return existingBackup, true +} + +func (r *CassandraBackupReconciler) reconcileFailedBackup(ctx context.Context, ic icarus.Icarus, existingBackup icarus.Backup, + cc *v1alpha1.CassandraCluster, cb *v1alpha1.CassandraBackup) error { + newBackupRequest := createBackupRequest(cc, cb) + if r.backupConfigChanged(existingBackup, newBackupRequest) { + r.Log.Info("Detected a configuration change for backup %s/%s, sending a new backup request", cb.Namespace, cb.Name) + icarusBackup, err := ic.Backup(ctx, newBackupRequest) + if err != nil { + return err + } + + cb.Status = v1alpha1.CassandraBackupStatus{} // reset status since we're restarting backup in Icarus + err = r.reconcileStatus(ctx, cb, icarusBackup) + if err != nil { + return err + } + } else { + r.Log.Infof("Backup %s/%s has failed. Assuming configuration error. 
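The "newest related backup wins" selection above boils down to RFC3339 parsing plus time.Before. A compact standalone illustration follows; the struct is a stand-in carrying only the fields the comparison needs, not the operator's icarus.Backup type:

package main

import (
	"fmt"
	"time"
)

// backupStub stands in for icarus.Backup with just the fields the
// "pick the most recent related backup" comparison uses.
type backupStub struct {
	ID           string
	CreationTime string // RFC3339, as reported by icarus
}

// newestBackup returns the entry with the latest CreationTime, skipping
// entries whose timestamp cannot be parsed, mirroring the warning path above.
func newestBackup(candidates []backupStub) (backupStub, bool) {
	var newest backupStub
	var newestTime time.Time
	found := false
	for _, c := range candidates {
		t, err := time.Parse(time.RFC3339, c.CreationTime)
		if err != nil {
			continue
		}
		if !found || newestTime.Before(t) {
			newest, newestTime, found = c, t, true
		}
	}
	return newest, found
}

func main() {
	latest, ok := newestBackup([]backupStub{
		{ID: "a", CreationTime: "2022-08-01T10:00:00Z"},
		{ID: "b", CreationTime: "2022-08-02T10:00:00Z"},
	})
	fmt.Println(latest.ID, ok) // b true
}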
"+ + "Apply the fixed CassandraBackup configuration to trigger a new retry", cb.Namespace, cb.Name) + } + + return nil +} diff --git a/controllers/cassandrabackup/controller.go b/controllers/cassandrabackup/controller.go new file mode 100644 index 0000000..4d05e6f --- /dev/null +++ b/controllers/cassandrabackup/controller.go @@ -0,0 +1,122 @@ +package cassandrabackup + +import ( + "context" + "fmt" + + v1 "k8s.io/api/core/v1" + + "github.com/pkg/errors" + "go.uber.org/zap" + + "github.com/ibm/cassandra-operator/api/v1alpha1" + "github.com/ibm/cassandra-operator/controllers/config" + "github.com/ibm/cassandra-operator/controllers/events" + "github.com/ibm/cassandra-operator/controllers/icarus" + "github.com/ibm/cassandra-operator/controllers/names" + + kerrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +// CassandraBackupReconciler reconciles a CassandraCluster object +type CassandraBackupReconciler struct { + client.Client + Log *zap.SugaredLogger + Scheme *runtime.Scheme + Cfg config.Config + Events *events.EventRecorder + IcarusClient func(coordinatorPodURL string) icarus.Icarus +} + +// +kubebuilder:rbac:groups=db.ibm.com,resources=cassandrabackups,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=db.ibm.com,resources=cassandrabackups/status,verbs=get;update;patch + +func (r *CassandraBackupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + cb := &v1alpha1.CassandraBackup{} + err := r.Get(ctx, req.NamespacedName, cb) + if err != nil { + if kerrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + if cb.Status.State == icarus.StateCompleted { + r.Log.Debugf("Backup %v is compeleted", cb.Name) + return ctrl.Result{}, nil + } + + cc := &v1alpha1.CassandraCluster{} + err = r.Get(ctx, types.NamespacedName{Name: cb.Spec.CassandraCluster, Namespace: cb.Namespace}, cc) + if err != nil { + if kerrors.IsNotFound(err) { + errMsg := fmt.Sprintf("Failed to create backup for cluster %q. Cluster not found.", cb.Spec.CassandraCluster) + r.Log.Warn(errMsg) + r.Events.Warning(cb, events.EventCassandraClusterNotFound, errMsg) + return ctrl.Result{RequeueAfter: r.Cfg.RetryDelay}, nil + } + return ctrl.Result{}, err + } + + if !cc.Status.Ready { + r.Log.Warnf("CassandraCluster %s/%s is not ready. Not starting backup, trying again in %s...", cc.Namespace, cc.Name, r.Cfg.RetryDelay) + return ctrl.Result{RequeueAfter: r.Cfg.RetryDelay}, nil + } + + storageCredentials := &v1.Secret{} + err = r.Get(ctx, types.NamespacedName{Name: cb.Spec.SecretName, Namespace: cb.Namespace}, storageCredentials) + if err != nil { + if kerrors.IsNotFound(err) { + errMsg := fmt.Sprintf("Failed to create backup for cluster %q. 
Storage credentials secret %q not found.", cb.Spec.CassandraCluster, cb.Spec.SecretName) + r.Log.Warn(errMsg) + r.Events.Warning(cb, events.EventStorageCredentialsSecretNotFound, errMsg) + return ctrl.Result{RequeueAfter: r.Cfg.RetryDelay}, nil + } + + return ctrl.Result{}, err + } + + err = v1alpha1.ValidateStorageSecret(r.Log, storageCredentials, cb.StorageProvider()) + if err != nil { + errMsg := fmt.Sprintf("Storage credentials secret %q is invalid: %s", cb.Spec.SecretName, err.Error()) + r.Log.Warn(errMsg) + r.Events.Warning(cb, events.EventStorageCredentialsSecretNotFound, errMsg) + return ctrl.Result{RequeueAfter: r.Cfg.RetryDelay}, nil + } + + dcName := cc.Spec.DCs[0].Name + svc := names.DC(cc.Name, dcName) + //always use the same pod as the coordinator as only that pod has the global request info + coordinatorPodURL := fmt.Sprintf("http://%s-0.%s.%s.svc.cluster.local:%d", svc, svc, cc.Namespace, v1alpha1.IcarusPort) + + ic := r.IcarusClient(coordinatorPodURL) + + res, err := r.reconcileBackup(ctx, ic, cb, cc) + if err != nil { + if statusErr, ok := errors.Cause(err).(*kerrors.StatusError); ok && statusErr.ErrStatus.Reason == metav1.StatusReasonConflict { + r.Log.Info("Conflict occurred. Retrying...", zap.Error(err)) + return ctrl.Result{Requeue: true}, nil //retry but do not treat conflicts as errors + } + + r.Log.Errorf("%+v", err) + return ctrl.Result{}, err + } + + return res, nil +} + +func SetupCassandraBackupReconciler(r reconcile.Reconciler, mgr manager.Manager) error { + builder := ctrl.NewControllerManagedBy(mgr). + Named("cassandrabackup"). + For(&v1alpha1.CassandraBackup{}) + + return builder.Complete(r) +} diff --git a/controllers/cassandrabackup/icarus.go b/controllers/cassandrabackup/icarus.go new file mode 100644 index 0000000..6357d00 --- /dev/null +++ b/controllers/cassandrabackup/icarus.go @@ -0,0 +1,118 @@ +package cassandrabackup + +import ( + "fmt" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/ibm/cassandra-operator/api/v1alpha1" + "github.com/ibm/cassandra-operator/controllers/icarus" +) + +func createBackupRequest(cc *v1alpha1.CassandraCluster, backup *v1alpha1.CassandraBackup) icarus.BackupRequest { + storageLocation := backup.Spec.StorageLocation + if storageLocation[len(storageLocation):] != "/" { + storageLocation += "/" + } + + storageLocation = fmt.Sprintf("%s%s/%s/1", storageLocation, cc.Name, cc.Spec.DCs[0].Name) + tagName := backup.Spec.SnapshotTag + if len(tagName) == 0 { + tagName = backup.Name + } + backupRequest := icarus.BackupRequest{ + Type: "backup", + StorageLocation: storageLocation, + DataDirs: []string{"/var/lib/cassandra/data"}, + GlobalRequest: true, + SnapshotTag: tagName, + K8sNamespace: backup.Namespace, + K8sSecretName: backup.Spec.SecretName, + ConcurrentConnections: backup.Spec.ConcurrentConnections, + DC: backup.Spec.DC, + Entities: backup.Spec.Entities, + Timeout: backup.Spec.Timeout, + Duration: backup.Spec.Duration, + MetadataDirective: backup.Spec.MetadataDirective, + Insecure: backup.Spec.Insecure, + CreateMissingBucket: backup.Spec.CreateMissingBucket, + SkipBucketVerification: backup.Spec.SkipBucketVerification, + SkipRefreshing: backup.Spec.SkipRefreshing, + Retry: icarus.Retry{ + Interval: backup.Spec.Retry.Interval, + Strategy: backup.Spec.Retry.Strategy, + MaxAttempts: backup.Spec.Retry.MaxAttempts, + Enabled: backup.Spec.Retry.Enabled, + }, + } + + if backup.Spec.Bandwidth != nil && backup.Spec.Bandwidth.Value > 0 { + backupRequest.Bandwidth = &icarus.DataRate{ + Value: 
backup.Spec.Bandwidth.Value, + Unit: backup.Spec.Bandwidth.Unit, + } + } + + if backupRequest.ConcurrentConnections == 0 { + backupRequest.ConcurrentConnections = 10 + } + + if backupRequest.Timeout == 0 { + backupRequest.Timeout = 5 + } + + if backupRequest.MetadataDirective == "" { + backupRequest.MetadataDirective = "COPY" + } + + if backupRequest.Retry.Interval == 0 { + backupRequest.Retry.Interval = 10 + } + + if backupRequest.Retry.Strategy == "" { + backupRequest.Retry.Strategy = "LINEAR" + } + + if backupRequest.Retry.MaxAttempts == 0 { + backupRequest.Retry.MaxAttempts = 3 + } + + return backupRequest +} + +func (r *CassandraBackupReconciler) backupConfigChanged(existingBackup icarus.Backup, backupReq icarus.BackupRequest) bool { + oldReq := icarus.BackupRequest{ + Type: "backup", + StorageLocation: existingBackup.StorageLocation, + DataDirs: existingBackup.DataDirs, + GlobalRequest: true, + SnapshotTag: existingBackup.SnapshotTag, + K8sNamespace: existingBackup.K8sNamespace, + K8sSecretName: existingBackup.K8sSecretName, + Duration: existingBackup.Duration, + Bandwidth: existingBackup.Bandwidth, + ConcurrentConnections: existingBackup.ConcurrentConnections, + DC: existingBackup.DC, + Entities: existingBackup.Entities, + Timeout: existingBackup.Timeout, + MetadataDirective: existingBackup.MetadataDirective, + Insecure: existingBackup.Insecure, + CreateMissingBucket: existingBackup.CreateMissingBucket, + SkipRefreshing: existingBackup.SkipRefreshing, + SkipBucketVerification: existingBackup.SkipBucketVerification, + Retry: icarus.Retry{ + Interval: existingBackup.Retry.Interval, + MaxAttempts: existingBackup.Retry.MaxAttempts, + Enabled: existingBackup.Retry.Enabled, + Strategy: existingBackup.Retry.Strategy, + }, + } + + cmpIgnoreFields := cmpopts.IgnoreFields(icarus.BackupRequest{}, "SnapshotTag") + if !cmp.Equal(oldReq, backupReq, cmpIgnoreFields) { + r.Log.Debugf(cmp.Diff(oldReq, backupReq, cmpIgnoreFields)) + return true + } + + return false +} diff --git a/controllers/cassandrabackup/status.go b/controllers/cassandrabackup/status.go new file mode 100644 index 0000000..05e2578 --- /dev/null +++ b/controllers/cassandrabackup/status.go @@ -0,0 +1,40 @@ +package cassandrabackup + +import ( + "context" + + "github.com/google/go-cmp/cmp" + "github.com/ibm/cassandra-operator/api/v1alpha1" + "github.com/ibm/cassandra-operator/controllers/icarus" +) + +func (r *CassandraBackupReconciler) reconcileStatus(ctx context.Context, cb *v1alpha1.CassandraBackup, relatedIcarusBackup icarus.Backup) error { + backupStatus := cb.DeepCopy() + //found, update state + backupStatus.Status.Progress = int(relatedIcarusBackup.Progress * 100) + if cb.Status.State != relatedIcarusBackup.State { + backupStatus.Status.State = relatedIcarusBackup.State + if relatedIcarusBackup.State == icarus.StateFailed { + var backupErrors []v1alpha1.BackupError + for _, backupError := range relatedIcarusBackup.Errors { + backupErrors = append(backupErrors, v1alpha1.BackupError{ + Source: backupError.Source, + Message: backupError.Message, + }) + } + + backupStatus.Status.Errors = backupErrors + } + } + + if !cmp.Equal(cb.Status, backupStatus.Status) { + r.Log.Info("Updating backup status") + r.Log.Debugf(cmp.Diff(cb.Status, backupStatus.Status)) + err := r.Status().Update(ctx, backupStatus) + if err != nil { + return err + } + } + + return nil +} diff --git a/controllers/cassandrarestore/controller.go b/controllers/cassandrarestore/controller.go new file mode 100644 index 0000000..a12a458 --- /dev/null +++ 
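Both request builders derive the final storage location by appending "<cluster>/<dc>/1" to the user-supplied bucket path. A standalone sketch of that normalization, using strings.HasSuffix for the trailing-slash check (the slice expression storageLocation[len(storageLocation):] always yields an empty string, so HasSuffix expresses the intended test more reliably):

package main

import (
	"fmt"
	"strings"
)

// normalizedStorageLocation appends "<cluster>/<dc>/1" to the configured
// bucket path, making sure exactly one slash separates the two parts.
func normalizedStorageLocation(base, clusterName, dcName string) string {
	if !strings.HasSuffix(base, "/") {
		base += "/"
	}
	return fmt.Sprintf("%s%s/%s/1", base, clusterName, dcName)
}

func main() {
	fmt.Println(normalizedStorageLocation("s3://backups-bucket", "example-cluster", "dc1"))
	// s3://backups-bucket/example-cluster/dc1/1
}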
b/controllers/cassandrarestore/controller.go @@ -0,0 +1,141 @@ +package cassandrarestore + +import ( + "context" + "fmt" + + v1 "k8s.io/api/core/v1" + + "github.com/pkg/errors" + "go.uber.org/zap" + + "github.com/ibm/cassandra-operator/api/v1alpha1" + "github.com/ibm/cassandra-operator/controllers/config" + "github.com/ibm/cassandra-operator/controllers/events" + "github.com/ibm/cassandra-operator/controllers/icarus" + "github.com/ibm/cassandra-operator/controllers/names" + + kerrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +// CassandraRestoreReconciler reconciles a CassandraRestore object +type CassandraRestoreReconciler struct { + client.Client + Log *zap.SugaredLogger + Scheme *runtime.Scheme + Cfg config.Config + Events *events.EventRecorder + IcarusClient func(coordinatorPodURL string) icarus.Icarus +} + +// +kubebuilder:rbac:groups=db.ibm.com,resources=cassandrarestores,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=db.ibm.com,resources=cassandrarestores/status,verbs=get;update;patch + +func (r *CassandraRestoreReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + cr := &v1alpha1.CassandraRestore{} + err := r.Get(ctx, req.NamespacedName, cr) + if err != nil { + if kerrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + if cr.Status.State == icarus.StateCompleted { + r.Log.Debugf("Restore %s is completed", cr.Name) + return ctrl.Result{}, nil + } + + cc := &v1alpha1.CassandraCluster{} + err = r.Get(ctx, types.NamespacedName{Name: cr.Spec.CassandraCluster, Namespace: cr.Namespace}, cc) + if err != nil { + if kerrors.IsNotFound(err) { + errMsg := fmt.Sprintf("Restore failed. CassandraCluster %s not found", cr.Spec.CassandraCluster) + r.Log.Warn(errMsg) + r.Events.Warning(cr, events.EventCassandraClusterNotFound, errMsg) + return ctrl.Result{RequeueAfter: r.Cfg.RetryDelay}, nil + } + return ctrl.Result{}, err + } + + if !cc.Status.Ready { + r.Log.Warnf("CassandraCluster %s/%s is not ready. Not starting backup, trying again in %s...", cc.Namespace, cc.Name, r.Cfg.RetryDelay) + return ctrl.Result{RequeueAfter: r.Cfg.RetryDelay}, nil + } + + cb := &v1alpha1.CassandraBackup{} + if len(cr.Spec.CassandraBackup) > 0 { + err = r.Get(ctx, types.NamespacedName{Name: cr.Spec.CassandraBackup, Namespace: cr.Namespace}, cb) + if err != nil { + if kerrors.IsNotFound(err) { + errMsg := fmt.Sprintf("Restore failed. CassandraBackup %s not found", cr.Spec.CassandraBackup) + r.Log.Warn(errMsg) + r.Events.Warning(cr, events.EventCassandraBackupNotFound, errMsg) + return ctrl.Result{RequeueAfter: r.Cfg.RetryDelay}, nil + } + return ctrl.Result{}, err + } + } + + secretName := cr.Spec.SecretName + if len(secretName) == 0 { + secretName = cb.Spec.SecretName + } + + storageCredentials := &v1.Secret{} + err = r.Get(ctx, types.NamespacedName{Name: secretName, Namespace: cr.Namespace}, storageCredentials) + if err != nil { + if kerrors.IsNotFound(err) { + errMsg := fmt.Sprintf("Failed to create backup for cluster %q. 
Storage credentials secret %q not found.", cb.Spec.CassandraCluster, cb.Spec.SecretName) + r.Log.Warn(errMsg) + r.Events.Warning(cb, events.EventStorageCredentialsSecretNotFound, errMsg) + return ctrl.Result{RequeueAfter: r.Cfg.RetryDelay}, nil + } + + return ctrl.Result{}, err + } + + err = v1alpha1.ValidateStorageSecret(r.Log, storageCredentials, cb.StorageProvider()) + if err != nil { + errMsg := fmt.Sprintf("Storage credentials secret %q is invalid: %s", cb.Spec.SecretName, err.Error()) + r.Log.Warn(errMsg) + r.Events.Warning(cb, events.EventStorageCredentialsSecretNotFound, errMsg) + return ctrl.Result{RequeueAfter: r.Cfg.RetryDelay}, nil + } + + dcName := cc.Spec.DCs[0].Name + svc := names.DC(cc.Name, dcName) + //always use the same pod as the coordinator as only that pod has the global request info + coordinatorPodURL := fmt.Sprintf("http://%s-0.%s.%s.svc.cluster.local:%d", svc, svc, cc.Namespace, v1alpha1.IcarusPort) + + ic := r.IcarusClient(coordinatorPodURL) + + res, err := r.reconcileRestore(ctx, ic, cr, cb, cc) + if err != nil { + if statusErr, ok := errors.Cause(err).(*kerrors.StatusError); ok && statusErr.ErrStatus.Reason == metav1.StatusReasonConflict { + r.Log.Info("Conflict occurred. Retrying...", zap.Error(err)) + return ctrl.Result{Requeue: true}, nil //retry but do not treat conflicts as errors + } + + r.Log.Errorf("%+v", err) + return ctrl.Result{}, err + } + + return res, nil +} + +func SetupCassandraRestoreReconciler(r reconcile.Reconciler, mgr manager.Manager) error { + builder := ctrl.NewControllerManagedBy(mgr). + Named("cassandrarestore"). + For(&v1alpha1.CassandraRestore{}) + + return builder.Complete(r) +} diff --git a/controllers/cassandrarestore/icarus.go b/controllers/cassandrarestore/icarus.go new file mode 100644 index 0000000..46e1e46 --- /dev/null +++ b/controllers/cassandrarestore/icarus.go @@ -0,0 +1,141 @@ +package cassandrarestore + +import ( + "fmt" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/ibm/cassandra-operator/api/v1alpha1" + "github.com/ibm/cassandra-operator/controllers/icarus" +) + +func createRestoreReq(cc *v1alpha1.CassandraCluster, backup *v1alpha1.CassandraBackup, restore *v1alpha1.CassandraRestore) icarus.RestoreRequest { + storageLocation := restore.Spec.StorageLocation + if len(storageLocation) == 0 { + storageLocation = backup.Spec.StorageLocation + } + if storageLocation[len(storageLocation):] != "/" { + storageLocation += "/" + } + + storageLocation = fmt.Sprintf("%s%s/%s/1", storageLocation, cc.Name, cc.Spec.DCs[0].Name) + snapshotTag := restore.Spec.SnapshotTag + if len(snapshotTag) == 0 { + snapshotTag = backup.Name + } + + secretName := restore.Spec.SecretName + if len(secretName) == 0 { + secretName = backup.Spec.SecretName + } + + restoreReq := icarus.RestoreRequest{ + Type: "restore", + DC: restore.Spec.DC, + StorageLocation: storageLocation, + SnapshotTag: snapshotTag, + DataDirs: []string{"/var/lib/cassandra/data"}, + GlobalRequest: true, + RestorationStrategyType: "HARDLINKS", + RestorationPhase: "INIT", + Import: icarus.RestoreImport{ + Type: "import", + SourceDir: "file:///var/lib/cassandra/downloadedsstables", + KeepLevel: restore.Spec.Import.KeepLevel, + NoVerify: restore.Spec.Import.NoVerify, + NoVerifyTokens: restore.Spec.Import.NoVerifyTokens, + NoInvalidateCaches: restore.Spec.Import.NoInvalidateCaches, + Quick: restore.Spec.Import.Quick, + ExtendedVerify: restore.Spec.Import.ExtendedVerify, + KeepRepaired: restore.Spec.Import.KeepRepaired, + }, + K8sNamespace: 
restore.Namespace, + K8sSecretName: secretName, + Entities: restore.Spec.Entities, + SinglePhase: false, + Retry: icarus.Retry{ + Interval: restore.Spec.Retry.Interval, + Strategy: restore.Spec.Retry.Strategy, + MaxAttempts: restore.Spec.Retry.MaxAttempts, + Enabled: restore.Spec.Retry.Enabled, + }, + ConcurrentConnections: restore.Spec.ConcurrentConnections, + SkipBucketVerification: restore.Spec.SkipBucketVerification, + Timeout: restore.Spec.Timeout, + NoDeleteDownloads: restore.Spec.NoDeleteDownloads, + NoDeleteTruncates: restore.Spec.NoDeleteTruncates, + NoDownloadData: restore.Spec.NoDownloadData, + Insecure: restore.Spec.Insecure, + Rename: restore.Spec.Rename, + ResolveHostIdFromTopology: restore.Spec.ResolveHostIdFromTopology, + ExactSchemaVersion: restore.Spec.ExactSchemaVersion, + SchemaVersion: restore.Spec.SchemaVersion, + } + + if restoreReq.ConcurrentConnections == 0 { + restoreReq.ConcurrentConnections = 10 + } + + if restoreReq.Timeout == 0 { + restoreReq.Timeout = 5 + } + + if restoreReq.Retry.Interval == 0 { + restoreReq.Retry.Interval = 10 + } + + if restoreReq.Retry.MaxAttempts == 0 { + restoreReq.Retry.MaxAttempts = 3 + } + + if len(restoreReq.Retry.Strategy) == 0 { + restoreReq.Retry.Strategy = "LINEAR" + } + + if restoreReq.Rename == nil { + restoreReq.Rename = make(map[string]string) + } + + return restoreReq +} + +func (r *CassandraRestoreReconciler) restoreConfigChanged(existingRestore icarus.Restore, restoreReq icarus.RestoreRequest) bool { + oldReq := icarus.RestoreRequest{ + Type: "restore", + StorageLocation: existingRestore.StorageLocation, + DataDirs: existingRestore.DataDirs, + GlobalRequest: true, + SnapshotTag: existingRestore.SnapshotTag, + K8sNamespace: existingRestore.K8sNamespace, + K8sSecretName: existingRestore.K8sSecretName, + ConcurrentConnections: existingRestore.ConcurrentConnections, + DC: existingRestore.DC, + Entities: existingRestore.Entities, + Timeout: existingRestore.Timeout, + Insecure: existingRestore.Insecure, + SkipBucketVerification: existingRestore.SkipBucketVerification, + Retry: icarus.Retry{ + Interval: existingRestore.Retry.Interval, + MaxAttempts: existingRestore.Retry.MaxAttempts, + Enabled: existingRestore.Retry.Enabled, + Strategy: existingRestore.Retry.Strategy, + }, + ResolveHostIdFromTopology: existingRestore.ResolveHostIdFromTopology, + Rename: existingRestore.Rename, + NoDownloadData: existingRestore.NoDownloadData, + NoDeleteTruncates: existingRestore.NoDeleteTruncates, + NoDeleteDownloads: existingRestore.NoDeleteDownloads, + SinglePhase: existingRestore.SinglePhase, + Import: existingRestore.Import, + RestorationPhase: existingRestore.RestorationPhase, + RestorationStrategyType: existingRestore.RestorationStrategyType, + } + + cmpIgnoreFields := cmpopts.IgnoreFields(icarus.RestoreRequest{}, "SnapshotTag") + if !cmp.Equal(oldReq, restoreReq, cmpIgnoreFields) { + r.Log.Debugf(cmp.Diff(oldReq, restoreReq, cmpIgnoreFields)) + return true + } + + return false +} diff --git a/controllers/cassandrarestore/restore.go b/controllers/cassandrarestore/restore.go new file mode 100644 index 0000000..9dad1b0 --- /dev/null +++ b/controllers/cassandrarestore/restore.go @@ -0,0 +1,95 @@ +package cassandrarestore + +import ( + "context" + + "github.com/ibm/cassandra-operator/api/v1alpha1" + "github.com/ibm/cassandra-operator/controllers/icarus" + "github.com/pkg/errors" + ctrl "sigs.k8s.io/controller-runtime" +) + +func (r *CassandraRestoreReconciler) reconcileRestore(ctx context.Context, ic icarus.Icarus, + cr 
*v1alpha1.CassandraRestore, cb *v1alpha1.CassandraBackup, cc *v1alpha1.CassandraCluster) (ctrl.Result, error) { + snapshotTag := cr.Spec.SnapshotTag + if len(snapshotTag) == 0 { + if cb == nil { + return ctrl.Result{}, errors.New("No snapshotTag specified. " + + "It should be in the CassandraRestore spec (.spec.snapshotTag) or a CassandraBackup should be specified (.spec.cassandraBackup)") + } + snapshotTag = cb.Name + } + + icarusRestores, err := ic.Restores(ctx) + if err != nil { + return ctrl.Result{}, err + } + + relatedIcarusRestore, relatedIcarusRestoreFound := findRelatedIcarusRestore(icarusRestores, snapshotTag) + + if cr.Status.State == icarus.StateFailed { + if !relatedIcarusRestoreFound { + r.Log.Infof("Restore %s/%s has failed and no matching restore request found in icarus. "+ + "Recreate the CassandraRestore resource to start a new restore attempt", cr.Namespace, cr.Name) + return ctrl.Result{}, nil + } + return ctrl.Result{}, r.reconcileFailedRestore(ctx, ic, cr, cb, cc, relatedIcarusRestore) + } + + if !relatedIcarusRestoreFound { // doesn't exist yet, create it + err = ic.Restore(ctx, createRestoreReq(cc, cb, cr)) + if err != nil { + return ctrl.Result{}, err + } + + r.Log.Info("Restore request sent") + return ctrl.Result{RequeueAfter: r.Cfg.RetryDelay}, nil + } + + err = r.reconcileStatus(ctx, cr, relatedIcarusRestore) + if err != nil { + return ctrl.Result{}, err + } + + return ctrl.Result{RequeueAfter: r.Cfg.RetryDelay}, nil +} + +func findRelatedIcarusRestore(icarusRestores []icarus.Restore, snapshotTag string) (icarus.Restore, bool) { + for i, existingRestore := range icarusRestores { + if !existingRestore.GlobalRequest { //filter out non coordinator requests + continue + } + + if existingRestore.SnapshotTag != snapshotTag { + continue + } + + return icarusRestores[i], true + } + + return icarus.Restore{}, false +} + +func (r *CassandraRestoreReconciler) reconcileFailedRestore(ctx context.Context, ic icarus.Icarus, cr *v1alpha1.CassandraRestore, + cb *v1alpha1.CassandraBackup, cc *v1alpha1.CassandraCluster, relatedIcarusRestore icarus.Restore) error { + newRestoreRequest := createRestoreReq(cc, cb, cr) + if r.restoreConfigChanged(relatedIcarusRestore, newRestoreRequest) { + r.Log.Info("Detected a configuration change for restore %s/%s, sending a new restore request", cb.Namespace, cb.Name) + err := ic.Restore(ctx, newRestoreRequest) + if err != nil { + return err + } + + cr.Status = v1alpha1.CassandraRestoreStatus{} // reset status since we're restarting restore in Icarus + err = r.Status().Update(ctx, cr) + if err != nil { + return err + } + + return nil + } else { + r.Log.Debugf("Backup %s/%s has failed. Assuming configration error. 
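The drift detection in the *ConfigChanged helpers is plain go-cmp usage: rebuild a request from what icarus reports, compare it against the freshly generated one, and ignore the fields that are allowed to differ. A minimal self-contained version of the pattern (the request struct here is a stand-in, not the operator's type):

package main

import (
	"fmt"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
)

// requestStub stands in for icarus.BackupRequest; only a few fields are kept.
type requestStub struct {
	StorageLocation string
	SnapshotTag     string
	Timeout         int64
}

// configChanged reports whether anything except the snapshot tag differs.
func configChanged(existing, desired requestStub) bool {
	ignoreTag := cmpopts.IgnoreFields(requestStub{}, "SnapshotTag")
	if cmp.Equal(existing, desired, ignoreTag) {
		return false
	}
	fmt.Println(cmp.Diff(existing, desired, ignoreTag)) // the controllers log this diff at debug level
	return true
}

func main() {
	old := requestStub{StorageLocation: "s3://bucket/a/dc1/1", SnapshotTag: "tag-123", Timeout: 5}
	updated := requestStub{StorageLocation: "s3://bucket/a/dc1/1", SnapshotTag: "tag-456", Timeout: 10}
	fmt.Println(configChanged(old, updated)) // true: Timeout changed; the tag alone would not have counted
}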
"+ + "Apply the fixed CassandraRestore configuration to trigger a new retry", cb.Namespace, cb.Name) + } + return nil +} diff --git a/controllers/cassandrarestore/status.go b/controllers/cassandrarestore/status.go new file mode 100644 index 0000000..27cf946 --- /dev/null +++ b/controllers/cassandrarestore/status.go @@ -0,0 +1,39 @@ +package cassandrarestore + +import ( + "context" + + "github.com/google/go-cmp/cmp" + "github.com/ibm/cassandra-operator/api/v1alpha1" + "github.com/ibm/cassandra-operator/controllers/icarus" +) + +func (r *CassandraRestoreReconciler) reconcileStatus(ctx context.Context, cr *v1alpha1.CassandraRestore, relatedIcarusRestore icarus.Restore) error { + restoreStatus := cr.DeepCopy() + restoreStatus.Status.Progress = int(relatedIcarusRestore.Progress * 100) + if cr.Status.State != relatedIcarusRestore.State { + restoreStatus.Status.State = relatedIcarusRestore.State + if relatedIcarusRestore.State == icarus.StateFailed { + var restoreErrors []v1alpha1.RestoreError + for _, restoreError := range relatedIcarusRestore.Errors { + restoreErrors = append(restoreErrors, v1alpha1.RestoreError{ + Source: restoreError.Source, + Message: restoreError.Message, + }) + } + + restoreStatus.Status.Errors = restoreErrors + } + } + + if !cmp.Equal(cr.Status, restoreStatus.Status) { + r.Log.Info("Updating restore status") + r.Log.Debugf(cmp.Diff(cr.Status, restoreStatus.Status)) + err := r.Status().Update(ctx, restoreStatus) + if err != nil { + return err + } + } + + return nil +} diff --git a/controllers/config/config.go b/controllers/config/config.go index 4216e89..369e4ce 100644 --- a/controllers/config/config.go +++ b/controllers/config/config.go @@ -1,11 +1,12 @@ package config import ( + "reflect" + "time" + "github.com/caarlos0/env/v6" "github.com/pkg/errors" "go.uber.org/zap/zapcore" - "reflect" - "time" ) var ( @@ -29,6 +30,7 @@ type Config struct { DefaultProberImage string `env:"DEFAULT_PROBER_IMAGE,required"` DefaultJolokiaImage string `env:"DEFAULT_JOLOKIA_IMAGE,required"` DefaultReaperImage string `env:"DEFAULT_REAPER_IMAGE,required"` + DefaultIcarusImage string `env:"DEFAULT_ICARUS_IMAGE,required"` } func LoadConfig() (*Config, error) { diff --git a/controllers/controller.go b/controllers/controller.go index ea75817..39cfe47 100755 --- a/controllers/controller.go +++ b/controllers/controller.go @@ -82,7 +82,7 @@ type CassandraClusterReconciler struct { Cfg config.Config Events *events.EventRecorder Jobs *jobs.JobManager - ProberClient func(url *url.URL, auth prober.Auth) prober.ProberClient + ProberClient func(url *url.URL, user, password string) prober.ProberClient CqlClient func(cluster *gocql.ClusterConfig) (cql.CqlClient, error) ReaperClient func(url *url.URL, clusterName string, defaultRepairThreadCount int32) reaper.ReaperClient NodectlClient func(jolokiaAddr, jmxUser, jmxPassword string, logr *zap.SugaredLogger) nodectl.Nodectl @@ -127,7 +127,6 @@ func (r *CassandraClusterReconciler) Reconcile(ctx context.Context, req ctrl.Req func (r *CassandraClusterReconciler) reconcileWithContext(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { cc := &v1alpha1.CassandraCluster{} - err := r.Get(ctx, types.NamespacedName{Name: req.Name, Namespace: req.Namespace}, cc) if err != nil { if apierrors.IsNotFound(err) { //do not react to CRD delete events @@ -143,6 +142,22 @@ func (r *CassandraClusterReconciler) reconcileWithContext(ctx context.Context, r return ctrl.Result{}, errors.Wrap(err, "Failed to cleanup network policies") } + clusterReady := false + ccStatus := 
cc.DeepCopy() + defer func() { + if ccStatus.Status.Ready != clusterReady { + ccStatus.Status.Ready = clusterReady + statusErr := r.Status().Update(ctx, ccStatus) + if statusErr != nil { + r.Log.Errorf("Failed to update cluster readiness state: %#v", statusErr) + } + } + }() + err = r.reconcileCassandraRBAC(ctx, cc) + if err != nil { + return ctrl.Result{}, err + } + if err = r.reconcileTLSSecrets(ctx, cc); err != nil { return ctrl.Result{}, errors.Wrap(err, "Error reconciling TLS Secrets") } @@ -161,7 +176,12 @@ func (r *CassandraClusterReconciler) reconcileWithContext(ctx context.Context, r return ctrl.Result{}, errors.Wrapf(err, "Failed to get secret: %s", names.ActiveAdminSecret(cc.Name)) } - adminRole, adminPassword, err := extractCredentials(baseAdminSecret) + err = r.reconcileAnnotations(ctx, baseAdminSecret, map[string]string{v1alpha1.CassandraClusterInstance: cc.Name}) + if err != nil { + return ctrl.Result{}, errors.Wrapf(err, "failed to reconcile annotations") + } + + desiredAdminRole, desiredAdminPassword, err := extractCredentials(baseAdminSecret) if err != nil { errMsg := fmt.Sprintf("admin secret %q is invalid: %s", cc.Spec.AdminRoleSecretName, err.Error()) r.Log.Warn(errMsg) @@ -169,12 +189,7 @@ func (r *CassandraClusterReconciler) reconcileWithContext(ctx context.Context, r return ctrl.Result{RequeueAfter: r.Cfg.RetryDelay}, nil } - proberAuth := prober.Auth{ - Username: adminRole, - Password: adminPassword, - } - - err = r.reconcileAdminAuth(ctx, cc, baseAdminSecret, proberAuth) + auth, err := r.reconcileAdminAuth(ctx, cc, desiredAdminRole, desiredAdminPassword) if err != nil { return ctrl.Result{}, errors.Wrap(err, "Error reconciling Admin Auth Secrets") } @@ -199,7 +214,7 @@ func (r *CassandraClusterReconciler) reconcileWithContext(ctx context.Context, r return ctrl.Result{}, errors.Wrap(err, "Error reconciling prober") } - proberClient := r.ProberClient(proberURL(cc), proberAuth) + proberClient := r.ProberClient(proberURL(cc), auth.desiredRole, auth.desiredPassword) proberReady, err := proberClient.Ready(ctx) if err != nil { @@ -230,7 +245,7 @@ func (r *CassandraClusterReconciler) reconcileWithContext(ctx context.Context, r nodeList := &v1.NodeList{} //optimization - node info needed only if hostport or zoneAsRacks enabled if cc.Spec.HostPort.Enabled || cc.Spec.Cassandra.ZonesAsRacks { - err := r.List(ctx, nodeList) + err = r.List(ctx, nodeList) if err != nil { return ctrl.Result{}, errors.Wrap(err, "can't get list of nodes") } @@ -290,17 +305,18 @@ func (r *CassandraClusterReconciler) reconcileWithContext(ctx context.Context, r } } - ready, err := r.clusterReady(ctx, cc, proberClient) + clusterReady, err = r.clusterReady(ctx, cc, proberClient) if err != nil { + clusterReady = false return ctrl.Result{}, err } - if !ready { + if !clusterReady { r.Log.Infof("Cluster not ready. Trying again in %s...", r.Cfg.RetryDelay) return ctrl.Result{RequeueAfter: r.Cfg.RetryDelay}, nil } - cqlClient, err := r.reconcileAdminRole(ctx, cc, allDCs) + cqlClient, err := r.reconcileAdminRole(ctx, cc, auth, allDCs) if err != nil { return ctrl.Result{}, errors.Wrap(err, "Failed to reconcile Admin Role") } diff --git a/controllers/controller_test.go b/controllers/controller_test.go index aa356ce..a60e995 100644 --- a/controllers/controller_test.go +++ b/controllers/controller_test.go @@ -44,7 +44,7 @@ func createMockedReconciler(t *testing.T) (*CassandraClusterReconciler, *gomock. 
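The readiness handling added to reconcileWithContext leans on a Go detail worth spelling out: the deferred closure reads the final value of the captured clusterReady variable when the function returns, so any return taken before the readiness check records "not ready", and readiness is persisted only after the variable has been set to true. A stripped-down illustration of that capture behaviour (it just prints instead of calling r.Status().Update):

package main

import "fmt"

// reconcile mimics the structure used above: a readiness flag captured by a
// deferred closure that runs on every return path.
func reconcile(fail bool) bool {
	clusterReady := false
	defer func() {
		// Runs last and sees the final value of clusterReady,
		// whichever return statement was taken.
		fmt.Println("persisting readiness:", clusterReady)
	}()

	if fail {
		return false // early exit: the deferred update still reports "false"
	}

	clusterReady = true
	return clusterReady
}

func main() {
	reconcile(true)  // persisting readiness: false
	reconcile(false) // persisting readiness: true
}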
Scheme: scheme.Scheme, Cfg: config.Config{}, Events: events.NewEventRecorder(&record.FakeRecorder{}), - ProberClient: func(url *url.URL, auth prober.Auth) prober.ProberClient { + ProberClient: func(url *url.URL, user, password string) prober.ProberClient { return proberClientMock }, CqlClient: func(clusterConfig *gocql.ClusterConfig) (cql.CqlClient, error) { diff --git a/controllers/defaults.go b/controllers/defaults.go index f3c8615..28b9c8c 100644 --- a/controllers/defaults.go +++ b/controllers/defaults.go @@ -30,6 +30,7 @@ func (r *CassandraClusterReconciler) defaultCassandraCluster(cc *dbv1alpha1.Cass r.defaultCassandra(cc) r.defaultProber(cc) + r.defaultIcarus(cc) r.defaultReaper(cc) if len(cc.Spec.Maintenance) > 0 { @@ -165,6 +166,16 @@ func (r *CassandraClusterReconciler) defaultProber(cc *dbv1alpha1.CassandraClust } } +func (r *CassandraClusterReconciler) defaultIcarus(cc *dbv1alpha1.CassandraCluster) { + if cc.Spec.Icarus.Image == "" { + cc.Spec.Icarus.Image = r.Cfg.DefaultIcarusImage + } + + if cc.Spec.Icarus.ImagePullPolicy == "" { + cc.Spec.Icarus.ImagePullPolicy = v1.PullIfNotPresent + } +} + func (r *CassandraClusterReconciler) defaultCassandra(cc *dbv1alpha1.CassandraCluster) { if cc.Spec.Cassandra == nil { cc.Spec.Cassandra = &dbv1alpha1.Cassandra{} diff --git a/controllers/events/events.go b/controllers/events/events.go index 16f64fe..3307bab 100644 --- a/controllers/events/events.go +++ b/controllers/events/events.go @@ -9,18 +9,22 @@ import ( const ( EventRecorderNameCassandraCluster = "cassandra-cluster" // appears in the 'From' column of the events list - EventAdminRoleSecretNotFound = "AdminRoleSecretNotFound" - EventAdminRoleSecretInvalid = "AdminRoleSecretInvalid" - EventAdminRoleUpdateFailed = "AdminRoleUpdateFailed" - EventAdminRoleCreated = "AdminRoleCreated" - EventDefaultAdminRoleDropped = "DefaultAdminRoleRemoved" - EventRoleSecretNotFound = "RolesSecretNotFound" - EventTLSSecretNotFound = "TLSSecretNotFound" - EventTLSSecretInvalid = "TLSSecretInvalid" - EventInsecureSetup = "InsecureSetup" - EventInvalidRole = "InvalidRole" - CassandraConfigInvalid = "InvalidCassandraConfig" - EventDCDecommissionBlocked = "DCDecommissionBlocked" + EventAdminRoleSecretNotFound = "AdminRoleSecretNotFound" + EventAdminRoleSecretInvalid = "AdminRoleSecretInvalid" + EventAdminRoleUpdateFailed = "AdminRoleUpdateFailed" + EventAdminRoleCreated = "AdminRoleCreated" + EventDefaultAdminRoleDropped = "DefaultAdminRoleRemoved" + EventRoleSecretNotFound = "RolesSecretNotFound" + EventTLSSecretNotFound = "TLSSecretNotFound" + EventTLSSecretInvalid = "TLSSecretInvalid" + EventInsecureSetup = "InsecureSetup" + EventInvalidRole = "InvalidRole" + CassandraConfigInvalid = "InvalidCassandraConfig" + EventDCDecommissionBlocked = "DCDecommissionBlocked" + EventCassandraClusterNotFound = "CassandraClusterNotFound" + EventCassandraBackupNotFound = "CassandraBackupNotFound" + EventStorageCredentialsSecretNotFound = "StorageCredentialsSecretNotFound" + EventStorageCredentialsSecretInvalid = "StorageCredentialsSecretInvalid" EventAdminRoleChanged = "AdminRoleChanged" EventRegionInit = "RegionInit" diff --git a/controllers/helpers.go b/controllers/helpers.go index 60cec3a..24965d5 100644 --- a/controllers/helpers.go +++ b/controllers/helpers.go @@ -103,8 +103,7 @@ func (r *CassandraClusterReconciler) reconcileAnnotations(ctx context.Context, o if currentAnnotations == nil { object.SetAnnotations(annotations) } else { - util.MergeMap(currentAnnotations, annotations) - 
object.SetAnnotations(currentAnnotations) + object.SetAnnotations(util.MergeMap(currentAnnotations, annotations)) } err := r.Update(ctx, object) diff --git a/controllers/icarus/backup.go b/controllers/icarus/backup.go new file mode 100644 index 0000000..675ba3c --- /dev/null +++ b/controllers/icarus/backup.go @@ -0,0 +1,141 @@ +package icarus + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" +) + +const ( + StatePending = "PENDING" + StateRunning = "RUNNING" + StateCompleted = "COMPLETED" + StateCancelled = "CANCELLED" + StateFailed = "FAILED" +) + +type BackupRequest struct { + Type string `json:"type"` + StorageLocation string `json:"storageLocation"` + DataDirs []string `json:"dataDirs"` + GlobalRequest bool `json:"globalRequest"` + SnapshotTag string `json:"snapshotTag"` + K8sNamespace string `json:"k8sNamespace,omitempty"` + K8sSecretName string `json:"k8sSecretName,omitempty"` + Duration string `json:"duration,omitempty"` + Bandwidth *DataRate `json:"bandwidth,omitempty"` + ConcurrentConnections int64 `json:"concurrentConnections,omitempty"` + DC string `json:"dc,omitempty"` + Entities string `json:"entities,omitempty"` + Timeout int64 `json:"timeout,omitempty"` + MetadataDirective string `json:"metadataDirective,omitempty"` + Insecure bool `json:"insecure"` + CreateMissingBucket bool `json:"createMissingBucket"` + SkipRefreshing bool `json:"skipRefreshing"` + SkipBucketVerification bool `json:"skipBucketVerification"` + Retry Retry `json:"retry,omitempty"` +} + +type DataRate struct { + Value int64 `json:"value"` + Unit string `json:"unit"` +} + +type Backup struct { + ID string `json:"id"` + CreationTime string `json:"creationTime"` + State string `json:"state"` + Errors []Error `json:"errors"` + Progress float64 `json:"progress"` + StartTime string `json:"startTime"` + Type string `json:"type"` + StorageLocation string `json:"storageLocation"` + ConcurrentConnections int64 `json:"concurrentConnections"` + MetadataDirective string `json:"metadataDirective"` + Entities string `json:"entities"` + SnapshotTag string `json:"snapshotTag"` + GlobalRequest bool `json:"globalRequest"` + Timeout int64 `json:"timeout"` + Insecure bool `json:"insecure"` + SchemaVersion string `json:"schemaVersion"` + CreateMissingBucket bool `json:"createMissingBucket"` + SkipBucketVerification bool `json:"skipBucketVerification"` + UploadClusterTopology bool `json:"uploadClusterTopology"` + Retry Retry `json:"retry"` + SkipRefreshing bool `json:"skipRefreshing"` + DataDirs []string `json:"dataDirs"` + DC string `json:"dc"` + K8sNamespace string `json:"k8sNamespace"` + K8sSecretName string `json:"k8sSecretName"` + Duration string `json:"duration"` + Bandwidth *DataRate `json:"bandwidth"` +} + +type Error struct { + Source string `json:"source"` + Message string `json:"message"` +} + +func (c *client) Backup(ctx context.Context, backupReq BackupRequest) (Backup, error) { + backupReq.Type = "backup" + body, err := json.Marshal(backupReq) + if err != nil { + return Backup{}, err + } + req, err := http.NewRequestWithContext(ctx, "POST", c.addr+"/operations", bytes.NewReader(body)) + if err != nil { + return Backup{}, err + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "application/json") + + resp, err := c.httpClient.Do(req) + if err != nil { + return Backup{}, err + } + b, _ := io.ReadAll(resp.Body) + if resp.StatusCode != http.StatusCreated { + return Backup{}, fmt.Errorf("backup request failed: code: %d, body: %s", resp.StatusCode, string(b)) + 
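One thing the hand-written client leaves out, in the calls shown here, is closing response bodies, which keeps the underlying connections from being reused. Not the PR's code, but a general shape for such JSON calls that handles the close and the status check in one place:

package sketch

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

// getJSON performs a GET, ensures the body is closed, checks the status code
// and decodes the JSON payload into out. A sketch, not the operator's client.
func getJSON(ctx context.Context, httpClient *http.Client, url string, out interface{}) error {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return err
	}
	req.Header.Set("Accept", "application/json")

	resp, err := httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close() // always release the connection

	if resp.StatusCode != http.StatusOK {
		b, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("request failed: code: %d, body: %s", resp.StatusCode, string(b))
	}
	return json.NewDecoder(resp.Body).Decode(out)
}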
} + + backup := Backup{} + err = json.Unmarshal(b, &backup) + if err != nil { + return Backup{}, err + } + + return backup, nil +} + +func (c *client) Backups(ctx context.Context) ([]Backup, error) { + req, err := http.NewRequestWithContext(ctx, "GET", c.addr+"/operations?type=backup", nil) + if err != nil { + return nil, err + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "application/json") + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, err + } + if resp.StatusCode != http.StatusOK { + b, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("backup request failed: code: %d, body: %s", resp.StatusCode, string(b)) + } + b, err := io.ReadAll(resp.Body) + if err != nil { + return nil, err + } + + var backups []Backup + err = json.Unmarshal(b, &backups) + if err != nil { + return nil, err + } + + return backups, nil +} diff --git a/controllers/icarus/icarus.go b/controllers/icarus/icarus.go new file mode 100644 index 0000000..5a42148 --- /dev/null +++ b/controllers/icarus/icarus.go @@ -0,0 +1,32 @@ +package icarus + +import ( + "context" + "net/http" +) + +type Retry struct { + Interval int64 `json:"interval"` + Strategy string `json:"strategy"` + MaxAttempts int64 `json:"maxAttempts"` + Enabled bool `json:"enabled"` +} + +type Icarus interface { + Backup(ctx context.Context, req BackupRequest) (Backup, error) + Backups(ctx context.Context) ([]Backup, error) + Restore(ctx context.Context, req RestoreRequest) error + Restores(ctx context.Context) ([]Restore, error) +} + +type client struct { + addr string + httpClient *http.Client +} + +func New(addr string) Icarus { + return &client{ + addr: addr, + httpClient: &http.Client{}, + } +} diff --git a/controllers/icarus/restore.go b/controllers/icarus/restore.go new file mode 100644 index 0000000..135e950 --- /dev/null +++ b/controllers/icarus/restore.go @@ -0,0 +1,140 @@ +package icarus + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io/ioutil" + "net/http" +) + +type RestoreRequest struct { + Type string `json:"type"` + StorageLocation string `json:"storageLocation"` + SnapshotTag string `json:"snapshotTag"` + DataDirs []string `json:"dataDirs"` + GlobalRequest bool `json:"globalRequest"` + RestorationStrategyType string `json:"restorationStrategyType"` + RestorationPhase string `json:"restorationPhase"` + Import RestoreImport `json:"import"` + K8sNamespace string `json:"k8sNamespace,omitempty"` + K8sSecretName string `json:"k8sSecretName,omitempty"` + ConcurrentConnections int64 `json:"concurrentConnections,omitempty"` + Entities string `json:"entities,omitempty"` + NoDeleteTruncates bool `json:"noDeleteTruncates,omitempty"` + NoDeleteDownloads bool `json:"noDeleteDownloads,omitempty"` + NoDownloadData bool `json:"noDownloadData,omitempty"` + Timeout int64 `json:"timeout,omitempty"` + ResolveHostIdFromTopology bool `json:"resolveHostIdFromTopology,omitempty"` + Insecure bool `json:"insecure,omitempty"` + SkipBucketVerification bool `json:"skipBucketVerification,omitempty"` + Retry Retry `json:"retry,omitempty"` + Rename map[string]string `json:"rename,omitempty"` + SinglePhase bool `json:"singlePhase,omitempty"` + DC string `json:"dc,omitempty"` + SchemaVersion string `json:"schemaVersion,omitempty"` + ExactSchemaVersion bool `json:"exactSchemaVersion,omitempty"` +} + +type Restore struct { + Id string `json:"id"` + CreationTime string `json:"creationTime"` + State string `json:"state"` + Errors []Error `json:"errors"` + Progress float64 `json:"progress"` + StartTime 
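For orientation, a minimal sketch of how a caller might drive the client constructed by `New` above. The coordinator pod URL, snapshot tag and storage location are placeholders, not values the operator actually uses:

```go
package example

import (
	"context"
	"fmt"

	"github.com/ibm/cassandra-operator/controllers/icarus"
)

// submitBackup is an illustrative sketch only; all literal values are placeholders.
func submitBackup(ctx context.Context, coordinatorPodURL string) error {
	ic := icarus.New(coordinatorPodURL) // e.g. "http://<coordinator-pod-ip>:<icarus-port>"

	backup, err := ic.Backup(ctx, icarus.BackupRequest{
		StorageLocation:       "s3://bucket-name/backup/location",
		SnapshotTag:           "example-backup",
		GlobalRequest:         true, // let Icarus coordinate the backup across the whole cluster
		ConcurrentConnections: 10,
	})
	if err != nil {
		return err
	}

	fmt.Printf("backup %s accepted, state=%s\n", backup.ID, backup.State)
	return nil
}
```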
string `json:"startTime"` + Type string `json:"type"` + StorageLocation string `json:"storageLocation"` + ConcurrentConnections int64 `json:"concurrentConnections"` + CassandraConfigDirectory string `json:"cassandraConfigDirectory"` + RestoreSystemKeyspace bool `json:"restoreSystemKeyspace"` + SnapshotTag string `json:"snapshotTag"` + Entities string `json:"entities"` + RestorationStrategyType string `json:"restorationStrategyType"` + RestorationPhase string `json:"restorationPhase"` + Import RestoreImport `json:"import"` + NoDeleteTruncates bool `json:"noDeleteTruncates"` + NoDeleteDownloads bool `json:"noDeleteDownloads"` + NoDownloadData bool `json:"noDownloadData"` + ExactSchemaVersion bool `json:"exactSchemaVersion"` + GlobalRequest bool `json:"globalRequest"` + Timeout int64 `json:"timeout"` + ResolveHostIdFromTopology bool `json:"resolveHostIdFromTopology"` + Insecure bool `json:"insecure"` + SkipBucketVerification bool `json:"skipBucketVerification"` + Retry Retry `json:"retry"` + SinglePhase bool `json:"singlePhase"` + DataDirs []string `json:"dataDirs"` + DC string `json:"dc"` + K8sNamespace string `json:"k8sNamespace"` + K8sSecretName string `json:"k8sSecretName"` + Rename map[string]string `json:"rename"` +} + +type RestoreImport struct { + Type string `json:"type"` + SourceDir string `json:"sourceDir"` + KeepLevel bool `json:"keepLevel,omitempty"` + NoVerify bool `json:"noVerify,omitempty"` + NoVerifyTokens bool `json:"noVerifyTokens,omitempty"` + NoInvalidateCaches bool `json:"noInvalidateCaches,omitempty"` + Quick bool `json:"quick,omitempty"` + ExtendedVerify bool `json:"extendedVerify,omitempty"` + KeepRepaired bool `json:"keepRepaired,omitempty"` +} + +func (c *client) Restore(ctx context.Context, restoreRequest RestoreRequest) error { + restoreRequest.Type = "restore" + body, err := json.Marshal(restoreRequest) + if err != nil { + return err + } + req, err := http.NewRequestWithContext(ctx, "POST", c.addr+"/operations", bytes.NewReader(body)) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "application/json") + + resp, err := c.httpClient.Do(req) + if err != nil { + return err + } + if resp.StatusCode != http.StatusCreated { + b, _ := ioutil.ReadAll(resp.Body) + return fmt.Errorf("backup request failed: code: %d, body: %s", resp.StatusCode, string(b)) + } + + return nil +} + +func (c *client) Restores(ctx context.Context) ([]Restore, error) { + req, err := http.NewRequestWithContext(ctx, "GET", c.addr+"/operations?type=restore", nil) + if err != nil { + return nil, err + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "application/json") + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, err + } + if resp.StatusCode != http.StatusOK { + b, _ := ioutil.ReadAll(resp.Body) + return nil, fmt.Errorf("backup request failed: code: %d, body: %s", resp.StatusCode, string(b)) + } + b, err := ioutil.ReadAll(resp.Body) + if err != nil { + return nil, err + } + + var backups []Restore + err = json.Unmarshal(b, &backups) + if err != nil { + return nil, err + } + + return backups, nil +} diff --git a/controllers/keyspaces.go b/controllers/keyspaces.go index 01c26c6..804d3f4 100644 --- a/controllers/keyspaces.go +++ b/controllers/keyspaces.go @@ -156,7 +156,7 @@ func (r *CassandraClusterReconciler) reconcileSystemAuthKeyspace(ctx context.Con return nil } -func (r CassandraClusterReconciler) reconcileSystemAuthIfReady(ctx context.Context, cc 
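And the matching restore path, again with placeholder values; listing existing operations via `Restores` first is one way to avoid submitting a duplicate request for the same snapshot:

```go
package example

import (
	"context"

	"github.com/ibm/cassandra-operator/controllers/icarus"
)

// submitRestore is an illustrative sketch; the values do not reflect the
// operator's exact request.
func submitRestore(ctx context.Context, ic icarus.Icarus) error {
	// Skip if a restore of this snapshot is already running.
	existing, err := ic.Restores(ctx)
	if err != nil {
		return err
	}
	for _, r := range existing {
		if r.SnapshotTag == "example-backup" && r.State == icarus.StateRunning {
			return nil
		}
	}

	return ic.Restore(ctx, icarus.RestoreRequest{
		StorageLocation: "s3://bucket-name/backup/location",
		SnapshotTag:     "example-backup",
		GlobalRequest:   true,
		K8sNamespace:    "default",
		K8sSecretName:   "backup-restore-credentials",
	})
}
```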
*dbv1alpha1.CassandraCluster, allDCs []dbv1alpha1.DC) error { +func (r *CassandraClusterReconciler) reconcileSystemAuthIfReady(ctx context.Context, cc *dbv1alpha1.CassandraCluster, allDCs []dbv1alpha1.DC) error { adminRoleSecret := &v1.Secret{} err := r.Get(ctx, types.NamespacedName{Namespace: cc.Namespace, Name: cc.Spec.AdminRoleSecretName}, adminRoleSecret) if err != nil { diff --git a/controllers/names/names.go b/controllers/names/names.go index 85cefbd..dba2038 100644 --- a/controllers/names/names.go +++ b/controllers/names/names.go @@ -151,6 +151,18 @@ func CassandraClientTLSNode(clusterName string) string { return clusterName + "-client-tls-node" } +func CassandraRole(clusterName string) string { + return clusterName + "-cassandra-role" +} + +func CassandraRoleBinding(clusterName string) string { + return clusterName + "-cassandra-rolebinding" +} + +func CassandraServiceAccount(clusterName string) string { + return clusterName + "-cassandra-serviceaccount" +} + func CassandraClusterNetworkPolicyName(clusterName string) string { return clusterName + "-cassandra-cluster-policies" } diff --git a/controllers/network_policies.go b/controllers/network_policies.go index 8479e7a..a7ab364 100644 --- a/controllers/network_policies.go +++ b/controllers/network_policies.go @@ -3,6 +3,8 @@ package controllers import ( "context" "fmt" + "sort" + dbv1alpha1 "github.com/ibm/cassandra-operator/api/v1alpha1" "github.com/ibm/cassandra-operator/controllers/compare" "github.com/ibm/cassandra-operator/controllers/events" @@ -19,7 +21,6 @@ import ( "k8s.io/apimachinery/pkg/util/intstr" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sort" ) func (r *CassandraClusterReconciler) reconcileNetworkPolicies(ctx context.Context, cc *dbv1alpha1.CassandraCluster, proberClient prober.ProberClient, podList *v1.PodList) error { @@ -174,7 +175,7 @@ func (r *CassandraClusterReconciler) reconcileProberNetworkPolicies(ctx context. return nil } -func (r CassandraClusterReconciler) reconcileReaperNetworkPolicies(ctx context.Context, cc *dbv1alpha1.CassandraCluster) error { +func (r *CassandraClusterReconciler) reconcileReaperNetworkPolicies(ctx context.Context, cc *dbv1alpha1.CassandraCluster) error { var err error desiredReaperPolicy := &nwv1.NetworkPolicy{ @@ -284,6 +285,7 @@ func (r *CassandraClusterReconciler) cassandraClusterPolicy(ctx context.Context, Ports: []nwv1.NetworkPolicyPort{ nwPolicyPort(dbv1alpha1.TlsPort), nwPolicyPort(dbv1alpha1.IntraPort), + nwPolicyPort(dbv1alpha1.IcarusPort), }, From: []nwv1.NetworkPolicyPeer{ nwPolicyPeer(map[string]string{dbv1alpha1.CassandraClusterComponent: dbv1alpha1.CassandraClusterComponentCassandra}, cc.Namespace), @@ -293,6 +295,7 @@ func (r *CassandraClusterReconciler) cassandraClusterPolicy(ctx context.Context, { Ports: []nwv1.NetworkPolicyPort{ nwPolicyPort(dbv1alpha1.CqlPort), + nwPolicyPort(dbv1alpha1.IcarusPort), }, From: []nwv1.NetworkPolicyPeer{ nwPolicyPeer(dbv1alpha1.CassandraOperatorPodLabels, r.Cfg.Namespace), @@ -354,6 +357,7 @@ func (r *CassandraClusterReconciler) cassandraClusterHostportPolicy(ctx context. 
Ports: []nwv1.NetworkPolicyPort{ nwPolicyPort(dbv1alpha1.TlsPort), nwPolicyPort(dbv1alpha1.IntraPort), + nwPolicyPort(dbv1alpha1.IcarusPort), }, From: generatePeers(curRegionNodeIPs), }) @@ -379,6 +383,7 @@ func (r *CassandraClusterReconciler) cassandraClusterExternalManagedRegionsPolic Ports: []nwv1.NetworkPolicyPort{ nwPolicyPort(dbv1alpha1.TlsPort), nwPolicyPort(dbv1alpha1.IntraPort), + nwPolicyPort(dbv1alpha1.IcarusPort), }, From: generatePeers(casIPs), }) diff --git a/controllers/nodectl/jolokia/jolokia.go b/controllers/nodectl/jolokia/jolokia.go index d31021f..5f6877c 100644 --- a/controllers/nodectl/jolokia/jolokia.go +++ b/controllers/nodectl/jolokia/jolokia.go @@ -5,7 +5,7 @@ import ( "context" "encoding/json" "fmt" - "io/ioutil" + "io" "net/http" "github.com/ibm/cassandra-operator/api/v1alpha1" @@ -92,7 +92,7 @@ func (j *Client) Post(ctx context.Context, jmxReq JMXRequest, ip string) (JMXRes defer func() { _ = resp.Body.Close() }() - responseBody, err := ioutil.ReadAll(resp.Body) + responseBody, err := io.ReadAll(resp.Body) if err != nil { return JMXResponse{}, err } diff --git a/controllers/prober/prober.go b/controllers/prober/prober.go index f1addb4..66169c2 100644 --- a/controllers/prober/prober.go +++ b/controllers/prober/prober.go @@ -5,13 +5,12 @@ import ( "context" "encoding/json" "fmt" - "io/ioutil" + "io" "net/http" "net/url" "strconv" "github.com/ibm/cassandra-operator/api/v1alpha1" - "github.com/pkg/errors" ) @@ -42,11 +41,14 @@ type Auth struct { Password string } -func NewProberClient(url *url.URL, client *http.Client, auth Auth) ProberClient { - return &proberClient{url, client, auth} +func NewProberClient(url *url.URL, client *http.Client, user, password string) ProberClient { + return &proberClient{url, client, Auth{ + Username: user, + Password: password, + }} } -func (p proberClient) url(path string) string { +func (p *proberClient) url(path string) string { return p.baseUrl.String() + path } @@ -93,7 +95,7 @@ func (p *proberClient) GetSeeds(ctx context.Context, host string) ([]string, err http.StatusText(resp.StatusCode), resp.StatusCode, http.StatusText(http.StatusOK)) } - body, err := ioutil.ReadAll(resp.Body) + body, err := io.ReadAll(resp.Body) if err != nil { return []string{}, errors.Wrap(err, "Unable to read response body") } @@ -138,7 +140,7 @@ func (p *proberClient) GetDCs(ctx context.Context, host string) ([]v1alpha1.DC, http.StatusText(resp.StatusCode), resp.StatusCode, http.StatusText(http.StatusOK)) } - body, err := ioutil.ReadAll(resp.Body) + body, err := io.ReadAll(resp.Body) if err != nil { return nil, errors.Wrap(err, "Unable to read response body") } @@ -194,7 +196,7 @@ func (p *proberClient) RegionReady(ctx context.Context, host string) (bool, erro http.StatusText(resp.StatusCode), resp.StatusCode, http.StatusText(http.StatusOK)) } - body, err := ioutil.ReadAll(resp.Body) + body, err := io.ReadAll(resp.Body) if err != nil { return false, err } @@ -236,7 +238,7 @@ func (p *proberClient) ReaperReady(ctx context.Context, host string) (bool, erro http.StatusText(resp.StatusCode), resp.StatusCode, http.StatusText(http.StatusOK)) } - body, err := ioutil.ReadAll(resp.Body) + body, err := io.ReadAll(resp.Body) if err != nil { return false, err } @@ -281,7 +283,7 @@ func (p *proberClient) GetRegionIPs(ctx context.Context, host string) ([]string, http.StatusText(resp.StatusCode), resp.Status, http.StatusText(http.StatusOK)) } - body, err := ioutil.ReadAll(resp.Body) + body, err := io.ReadAll(resp.Body) if err != nil { return []string{}, 
errors.Wrap(err, "Unable to read response body") } @@ -326,7 +328,7 @@ func (p *proberClient) GetReaperIPs(ctx context.Context, host string) ([]string, http.StatusText(resp.StatusCode), resp.Status, http.StatusText(http.StatusOK)) } - body, err := ioutil.ReadAll(resp.Body) + body, err := io.ReadAll(resp.Body) if err != nil { return []string{}, errors.Wrap(err, "Unable to read response body") } diff --git a/controllers/reaper.go b/controllers/reaper.go index da39a80..aa68827 100644 --- a/controllers/reaper.go +++ b/controllers/reaper.go @@ -344,7 +344,7 @@ func reaperVolumes(cc *dbv1alpha1.CassandraCluster) []v1.Volume { return volume } -func (r CassandraClusterReconciler) reaperInitialization(ctx context.Context, cc *dbv1alpha1.CassandraCluster, reaperClient reaper.ReaperClient) error { +func (r *CassandraClusterReconciler) reaperInitialization(ctx context.Context, cc *dbv1alpha1.CassandraCluster, reaperClient reaper.ReaperClient) error { seed := getSeedHostname(cc, cc.Spec.DCs[0].Name, 0, true) clusterExists, err := reaperClient.ClusterExists(ctx) if err != nil { diff --git a/controllers/reaper/reaper.go b/controllers/reaper/reaper.go index 25a0dc0..3c91007 100644 --- a/controllers/reaper/reaper.go +++ b/controllers/reaper/reaper.go @@ -4,7 +4,7 @@ import ( "context" "encoding/json" "fmt" - "io/ioutil" + "io" "net/http" "net/url" @@ -94,7 +94,7 @@ func (r *reaperClient) ClusterExists(ctx context.Context) (bool, error) { return false, err } defer resp.Body.Close() - b, _ := ioutil.ReadAll(resp.Body) + b, _ := io.ReadAll(resp.Body) if resp.StatusCode >= 300 { if resp.StatusCode == http.StatusNotFound { return false, nil @@ -119,7 +119,7 @@ func (r *reaperClient) AddCluster(ctx context.Context, seed string) error { if err != nil { return err } - b, _ := ioutil.ReadAll(resp.Body) + b, _ := io.ReadAll(resp.Body) defer resp.Body.Close() if resp.StatusCode >= 300 { if resp.StatusCode == http.StatusNotFound { @@ -142,7 +142,7 @@ func (r *reaperClient) Clusters(ctx context.Context) ([]string, error) { if err != nil { return nil, err } - b, _ := ioutil.ReadAll(resp.Body) + b, _ := io.ReadAll(resp.Body) defer resp.Body.Close() if resp.StatusCode >= 300 { return nil, &requestFailedWithStatus{code: resp.StatusCode, message: string(b)} @@ -194,7 +194,7 @@ func (r *reaperClient) DeleteCluster(ctx context.Context) error { if err != nil { return err } - b, _ := ioutil.ReadAll(resp.Body) + b, _ := io.ReadAll(resp.Body) defer resp.Body.Close() if resp.StatusCode >= 300 { return &requestFailedWithStatus{code: resp.StatusCode, message: string(b)} @@ -223,7 +223,7 @@ func (r *reaperClient) deleteRepairRun(ctx context.Context, repairRun RepairRun) if err != nil { return err } - b, _ := ioutil.ReadAll(resp.Body) + b, _ := io.ReadAll(resp.Body) defer resp.Body.Close() if resp.StatusCode >= 300 { return &requestFailedWithStatus{code: resp.StatusCode, message: string(b)} diff --git a/controllers/reaper/repair_schedules.go b/controllers/reaper/repair_schedules.go index fe931f4..127756f 100644 --- a/controllers/reaper/repair_schedules.go +++ b/controllers/reaper/repair_schedules.go @@ -4,7 +4,7 @@ import ( "context" "encoding/json" "fmt" - "io/ioutil" + "io" "net/http" "net/url" @@ -46,7 +46,7 @@ func (r *reaperClient) CreateRepairSchedule(ctx context.Context, repair dbv1alph return err } defer resp.Body.Close() - b, _ := ioutil.ReadAll(resp.Body) + b, _ := io.ReadAll(resp.Body) if resp.StatusCode >= 300 { return &requestFailedWithStatus{code: resp.StatusCode, message: string(b)} } @@ -71,7 +71,7 @@ func (r 
*reaperClient) RepairSchedules(ctx context.Context) ([]RepairSchedule, e } defer resp.Body.Close() - b, err := ioutil.ReadAll(resp.Body) + b, err := io.ReadAll(resp.Body) if err != nil { return nil, err } @@ -106,7 +106,7 @@ func (r *reaperClient) DeleteRepairSchedule(ctx context.Context, repairScheduleI } defer resp.Body.Close() - b, _ := ioutil.ReadAll(resp.Body) + b, _ := io.ReadAll(resp.Body) if resp.StatusCode >= 300 { return &requestFailedWithStatus{code: resp.StatusCode, message: string(b)} } @@ -137,7 +137,7 @@ func (r *reaperClient) SetRepairScheduleState(ctx context.Context, repairSchedul } defer resp.Body.Close() - b, _ := ioutil.ReadAll(resp.Body) + b, _ := io.ReadAll(resp.Body) if resp.StatusCode >= 300 { return &requestFailedWithStatus{code: resp.StatusCode, message: string(b)} } diff --git a/controllers/reaper/repairs.go b/controllers/reaper/repairs.go index 6ed3749..6b39b75 100644 --- a/controllers/reaper/repairs.go +++ b/controllers/reaper/repairs.go @@ -4,7 +4,7 @@ import ( "context" "encoding/json" "fmt" - "io/ioutil" + "io" "net/http" "net/url" ) @@ -63,7 +63,7 @@ func (r *reaperClient) createRepairRun(ctx context.Context, keyspace, cause stri return RepairRun{}, err } defer resp.Body.Close() - body, err := ioutil.ReadAll(resp.Body) + body, err := io.ReadAll(resp.Body) if err != nil { return RepairRun{}, err } @@ -101,7 +101,7 @@ func (r *reaperClient) getRepairRuns(ctx context.Context, keyspace string) ([]Re return nil, err } defer resp.Body.Close() - body, err := ioutil.ReadAll(resp.Body) + body, err := io.ReadAll(resp.Body) if err != nil { return nil, err } @@ -132,7 +132,7 @@ func (r *reaperClient) setRepairState(ctx context.Context, runID, state string) return err } defer resp.Body.Close() - b, _ := ioutil.ReadAll(resp.Body) + b, _ := io.ReadAll(resp.Body) if resp.StatusCode >= 300 { return &requestFailedWithStatus{code: resp.StatusCode, message: string(b)} } diff --git a/controllers/role_admin.go b/controllers/role_admin.go index 55eea29..92db797 100644 --- a/controllers/role_admin.go +++ b/controllers/role_admin.go @@ -2,42 +2,23 @@ package controllers import ( "context" + dbv1alpha1 "github.com/ibm/cassandra-operator/api/v1alpha1" "github.com/ibm/cassandra-operator/controllers/cql" "github.com/ibm/cassandra-operator/controllers/events" "github.com/pkg/errors" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/types" ) -func (r *CassandraClusterReconciler) reconcileAdminRole(ctx context.Context, cc *dbv1alpha1.CassandraCluster, allDCs []dbv1alpha1.DC) (cql.CqlClient, error) { - adminRoleSecret := &v1.Secret{} - err := r.Get(ctx, types.NamespacedName{Namespace: cc.Namespace, Name: cc.Spec.AdminRoleSecretName}, adminRoleSecret) - if err != nil { - return nil, err - } - - err = r.reconcileAnnotations(ctx, adminRoleSecret, map[string]string{dbv1alpha1.CassandraClusterInstance: cc.Name}) - if err != nil { - return nil, errors.Wrap(err, "failed to reconcile annotations") - } - - cassandraOperatorAdminRole := string(adminRoleSecret.Data[dbv1alpha1.CassandraOperatorAdminRole]) - cassandraOperatorAdminPassword := string(adminRoleSecret.Data[dbv1alpha1.CassandraOperatorAdminPassword]) - - if cc.Spec.JMXAuth == jmxAuthenticationLocalFiles { - adminRoleSecret.Data[dbv1alpha1.CassandraOperatorJmxUsername] = []byte(cassandraOperatorAdminRole) - adminRoleSecret.Data[dbv1alpha1.CassandraOperatorJmxPassword] = []byte(cassandraOperatorAdminPassword) - } - - r.Log.Debug("Establishing cql session with role " + cassandraOperatorAdminRole) - cqlClient, err := 
r.CqlClient(newCassandraConfig(cc, cassandraOperatorAdminRole, cassandraOperatorAdminPassword, r.Log)) +func (r *CassandraClusterReconciler) reconcileAdminRole(ctx context.Context, cc *dbv1alpha1.CassandraCluster, auth credentials, allDCs []dbv1alpha1.DC) (cql.CqlClient, error) { + r.Log.Debug("Establishing cql session with role " + auth.desiredRole) + cqlClient, err := r.CqlClient(newCassandraConfig(cc, auth.desiredRole, auth.desiredPassword, r.Log)) if err == nil { // operator admin role exists if err = r.reconcileSystemAuthKeyspace(ctx, cc, cqlClient, allDCs); err != nil { return nil, err } - - err = r.reconcileAdminSecrets(ctx, cc, adminRoleSecret.Data) //make sure the secrets have the correct credentials + auth.activeRole = auth.desiredRole + auth.activePassword = auth.desiredPassword + err = r.reconcileAdminSecrets(ctx, cc, auth) //make sure the secrets have the correct credentials if err != nil { return nil, err } @@ -52,14 +33,14 @@ func (r *CassandraClusterReconciler) reconcileAdminRole(ctx context.Context, cc defaultUserCQLClient.CloseSession() r.Log.Info("The default admin role is in use. Going to create the secure role and delete the default...") - err = r.createAdminRoleInCassandra(ctx, cc, cassandraOperatorAdminRole, cassandraOperatorAdminPassword, allDCs) + err = r.createAdminRoleInCassandra(ctx, cc, auth.desiredRole, auth.desiredPassword, allDCs) if err != nil { return nil, errors.Wrap(err, "can't create admin role") } - r.Log.Debug("Establishing cql session with role " + cassandraOperatorAdminRole) + r.Log.Debug("Establishing cql session with role " + auth.desiredRole) err = r.doWithRetry(func() error { - cqlClient, err = r.CqlClient(newCassandraConfig(cc, cassandraOperatorAdminRole, cassandraOperatorAdminPassword, r.Log)) + cqlClient, err = r.CqlClient(newCassandraConfig(cc, auth.desiredRole, auth.desiredPassword, r.Log)) if err != nil { return err } @@ -67,7 +48,9 @@ func (r *CassandraClusterReconciler) reconcileAdminRole(ctx context.Context, cc }) r.Events.Normal(cc, events.EventAdminRoleCreated, "secure admin role is created") - err = r.reconcileAdminSecrets(ctx, cc, adminRoleSecret.Data) + auth.activeRole = auth.desiredRole + auth.activePassword = auth.desiredPassword + err = r.reconcileAdminSecrets(ctx, cc, auth) if err != nil { return nil, errors.Wrap(err, "failed to update admin secrets with new password") } @@ -105,13 +88,13 @@ func (r *CassandraClusterReconciler) createAdminRoleInCassandra(ctx context.Cont return nil } -func (r *CassandraClusterReconciler) reconcileAdminSecrets(ctx context.Context, cc *dbv1alpha1.CassandraCluster, secretData map[string][]byte) error { - err := r.reconcileActiveAdminSecret(ctx, cc, secretData) +func (r *CassandraClusterReconciler) reconcileAdminSecrets(ctx context.Context, cc *dbv1alpha1.CassandraCluster, auth credentials) error { + err := r.reconcileActiveAdminSecret(ctx, cc, auth) if err != nil { return errors.Wrap(err, "failed to update active admin secret") } - if err = r.reconcileAdminAuthConfigSecret(ctx, cc, secretData); err != nil { + if err = r.reconcileAdminAuthConfigSecret(ctx, cc, auth); err != nil { return errors.Wrap(err, "failed to reconcile admin auth secret") } diff --git a/controllers/util/utils.go b/controllers/util/utils.go index e2b76f6..4a3eafa 100644 --- a/controllers/util/utils.go +++ b/controllers/util/utils.go @@ -3,16 +3,24 @@ package util import ( "crypto/sha1" "fmt" - v1 "k8s.io/api/core/v1" "math/rand" "time" + + v1 "k8s.io/api/core/v1" ) func MergeMap(a, b map[string]string) map[string]string { + if 
a == nil && b == nil { + return nil + } + res := make(map[string]string, len(a)+len(b)) + for k, v := range a { + res[k] = v + } for k, v := range b { - a[k] = v + res[k] = v } - return a + return res } func Contains(s []string, str string) bool { diff --git a/controllers/webhooks/webhook_certificates.go b/controllers/webhooks/webhook_certificates.go index 923ceaa..411bdc5 100644 --- a/controllers/webhooks/webhook_certificates.go +++ b/controllers/webhooks/webhook_certificates.go @@ -2,13 +2,13 @@ package webhooks import ( "fmt" + "os" + "path/filepath" + "github.com/ibm/cassandra-operator/controllers/certs" "github.com/ibm/cassandra-operator/controllers/config" "github.com/ibm/cassandra-operator/controllers/names" "github.com/pkg/errors" - "io/ioutil" - "os" - "path/filepath" ) func setupWebhookTLS(operatorConfig *config.Config) (*certs.Keypair, error) { @@ -47,11 +47,11 @@ func writeWebhookTLS(dir string, kp certs.Keypair) error { fMode := os.FileMode(0600) - if err := ioutil.WriteFile(filepath.Join(dir, "tls.crt"), kp.Crt, fMode); err != nil { + if err := os.WriteFile(filepath.Join(dir, "tls.crt"), kp.Crt, fMode); err != nil { return errors.Wrap(err, "failed to write Webhook TLS certificate to container's filesystem") } - if err := ioutil.WriteFile(filepath.Join(dir, "tls.key"), kp.Pk, fMode); err != nil { + if err := os.WriteFile(filepath.Join(dir, "tls.key"), kp.Pk, fMode); err != nil { return errors.Wrap(err, "failed to write Webhook TLS key to container's filesystem") } diff --git a/controllers/webhooks/webhook_validating.go b/controllers/webhooks/webhook_validating.go index 41e3b1f..42b1375 100644 --- a/controllers/webhooks/webhook_validating.go +++ b/controllers/webhooks/webhook_validating.go @@ -62,7 +62,9 @@ func CreateValidatingWebhookConf(namespace string, clusterRole *rbac.ClusterRole sideEffectNone = admissionv1.SideEffectClassNone failurePolicyType = admissionv1.Fail namespacedScope = admissionv1.NamespacedScope - webhookPath = "/validate-db-ibm-com-v1alpha1-cassandracluster" + ccWebhookPath = "/validate-db-ibm-com-v1alpha1-cassandracluster" + cbWebhookPath = "/validate-db-ibm-com-v1alpha1-cassandrabackup" + crWebhookPath = "/validate-db-ibm-com-v1alpha1-cassandrarestore" ) return admissionv1.ValidatingWebhookConfiguration{ @@ -88,7 +90,7 @@ func CreateValidatingWebhookConf(namespace string, clusterRole *rbac.ClusterRole Service: &admissionv1.ServiceReference{ Namespace: namespace, Name: names.WebhooksServiceName(), - Path: &webhookPath, + Path: &ccWebhookPath, Port: proto.Int32(443), }, CABundle: caCrtBytes, @@ -112,6 +114,68 @@ func CreateValidatingWebhookConf(namespace string, clusterRole *rbac.ClusterRole TimeoutSeconds: nil, AdmissionReviewVersions: []string{"v1", "v1beta1"}, }, + { + Name: "vcassandrabackup.kb.io", + ClientConfig: admissionv1.WebhookClientConfig{ + URL: nil, + Service: &admissionv1.ServiceReference{ + Namespace: namespace, + Name: names.WebhooksServiceName(), + Path: &cbWebhookPath, + Port: proto.Int32(443), + }, + CABundle: caCrtBytes, + }, + Rules: []admissionv1.RuleWithOperations{ + { + Operations: []admissionv1.OperationType{admissionv1.Create, admissionv1.Update}, + Rule: admissionv1.Rule{ + APIGroups: []string{"db.ibm.com"}, + APIVersions: []string{"v1alpha1"}, + Resources: []string{"cassandrabackups"}, + Scope: &namespacedScope, + }, + }, + }, + FailurePolicy: &failurePolicyType, + MatchPolicy: nil, + NamespaceSelector: nil, + ObjectSelector: nil, + SideEffects: &sideEffectNone, + TimeoutSeconds: nil, + AdmissionReviewVersions: []string{"v1", 
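Stepping back to the `util.MergeMap` change shown above: a short, hypothetical snippet (not part of the test suite) illustrating the intended semantics, namely that the inputs are never mutated and that two nil inputs yield a nil result:

```go
package main

import (
	"fmt"

	"github.com/ibm/cassandra-operator/controllers/util"
)

func main() {
	a := map[string]string{"app": "cassandra"}
	b := map[string]string{"app": "icarus", "dc": "dc1"}

	merged := util.MergeMap(a, b)
	fmt.Println(merged) // map[app:icarus dc:dc1] - values from the second map win
	fmt.Println(a)      // map[app:cassandra]     - the inputs are left untouched

	fmt.Println(util.MergeMap(nil, nil) == nil) // true - nil in, nil out
}
```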
"v1beta1"}, + }, + { + Name: "vcassandrarestore.kb.io", + ClientConfig: admissionv1.WebhookClientConfig{ + URL: nil, + Service: &admissionv1.ServiceReference{ + Namespace: namespace, + Name: names.WebhooksServiceName(), + Path: &crWebhookPath, + Port: proto.Int32(443), + }, + CABundle: caCrtBytes, + }, + Rules: []admissionv1.RuleWithOperations{ + { + Operations: []admissionv1.OperationType{admissionv1.Create, admissionv1.Update}, + Rule: admissionv1.Rule{ + APIGroups: []string{"db.ibm.com"}, + APIVersions: []string{"v1alpha1"}, + Resources: []string{"cassandrarestores"}, + Scope: &namespacedScope, + }, + }, + }, + FailurePolicy: &failurePolicyType, + MatchPolicy: nil, + NamespaceSelector: nil, + ObjectSelector: nil, + SideEffects: &sideEffectNone, + TimeoutSeconds: nil, + AdmissionReviewVersions: []string{"v1", "v1beta1"}, + }, }, } } diff --git a/docs/docs/backup-restore.md b/docs/docs/backup-restore.md new file mode 100644 index 0000000..1e11545 --- /dev/null +++ b/docs/docs/backup-restore.md @@ -0,0 +1,78 @@ +--- +title: Cassandra backup and restore +slug: /cassandra-backup-restore +--- + +The Cassandra operator uses [Icarus](https://github.com/instaclustr/icarus) to perform backups and restores. +Refer to its documentation for more details on how the backup and restore procedures work internally. + +Backup and restore creation and configuration is done by creating CassandraBackup and CassandraRestore custom resources. + +S3, Azure and GCP storage providers are supported. The type is determined by the `storageLocation` field, which should be in the following format: +`protocol://backup/location`. So an S3 provider would look like to following: `s3://location/to/the/backup` + +To provide credentials, the `secretName` field should be used to refer to a secret in the following format: + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: cassandra-backup-restore-secret-cluster-my-cluster +type: Opaque +stringData: + awssecretaccesskey: _AWS secret key_ + awsaccesskeyid: _AWS access id_ + awsregion: e.g. eu-central-1 + awsendpoint: endpoint + azurestorageaccount: _Azure storage account_ + azurestoragekey: _Azure storage key_ + gcp: 'whole json with service account' +``` + +Only fields for a particular provider used should be set. + +### CassandraBackup + +To create a backup simply create a CassandraBackup resource: + +```yaml +apiVersion: db.ibm.com/v1alpha1 +kind: CassandraBackup +metadata: + name: example-backup +spec: + cassandraCluster: test-cluster + storageLocation: s3://bucket-name/backup/location + secretName: backup-restore-credentials +``` + +To track progress of the backup process you can see the status of the object, where you can see the state, progress and other information about the backup. If a backup failed you'll see the errors in the status object as well. + +See [all fields description](cassandrabackup-configuration.md) for more information + +#### Restarting a failed backup + +If a misconfigured backup has failed, the operator will retry only when a configuration is changed. If a retry is needed without a configuration change, simply recreate the resource. + +### CassandraRestore + +To restore a backup a CassandraRestore should be created which will start the restore process. + +The backup can be referenced either by setting the corresponding CassandraBackup resource name or by manually setting the `storageLocation` and `snapshotTag` fields. 
+ +```yaml +apiVersion: db.ibm.com/v1alpha1 +kind: CassandraRestore +metadata: + name: rest3 +spec: + cassandraCluster: test-cluster + cassandraBackup: example-backup + # or the following if no corresponding cassandraBackup available + # storageLocation: s3://bucket-name/backup/location + # snapshotTag: example-backup //the name of the CassandraBackup if the backup was created using the Cassandra Operator +``` + +The Cassandra Operator will update the progress of the restore in the status field of CassandraRestores CR object. + +See [all fields description](cassandrarestore-configuration.md) for more information. \ No newline at end of file diff --git a/docs/docs/cassandrabackup-configuration.md b/docs/docs/cassandrabackup-configuration.md new file mode 100644 index 0000000..512ca64 --- /dev/null +++ b/docs/docs/cassandrabackup-configuration.md @@ -0,0 +1,32 @@ +--- +title: CassandraBackup Configuration +slug: /cassandrabackup-configuration +--- + +## CassandraBackup Field Specification Reference + +| Field | Description | Is Required | Default | +|--------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------|---------------| +| `cassandraCluster` | CassandraCluster name that the backup is created for | `Y` | | +| `storageLocation` | Location where SSTables will be uploaded. example: protocol://myBucket. protocol can be `gcp`, `s3`, `azure` or `oracle` | `Y` | | +| `secretName` | Name of the secret where cloud storage credentials are located | `Y` | | +| `duration` | Based on this field, there will be throughput per second computed based on what size data we want to upload we have. | `N` | | +| `bandwidth` | bandwidth used during uploads | `N` | | +| `bandwidth.value` | the bandwidth to use during upload | `Y` | | +| `bandwidth.unit` | unit used for the value. Can be `BPS`, `KBPS`, `MBPS`, `GBPS`. | `Y` | | +| `concurrentConnections` | number of threads used for upload, there might be at most so many uploading threads at any given time | `N` | `10` | +| `dc` | name of datacenter to backup, nodes in the other datacenter(s) will not be involved | `N` | | +| `entities` | database entities to backup, it might be either only keyspaces or only tables (from different keyspaces if needed). E.g. 'k1,k2' if one wants to backup whole keyspaces and 'ks1.t1,ks2,t2' if one wants to backup tables. | `N` | All keyspaces | +| `timeout` | number of hours to wait until backup is considered failed if not finished already | `N` | 5 | +| `metadataDirective` | Relevant during upload to S3-like bucket only. Specifies whether the metadata is copied from the source object or replaced with metadata provided in the request. Can be `COPY` or `REPLACE` | `N` | `COPY` | +| `insecure` | Relevant during upload to S3-like bucket only. If true, communication is done via HTTP instead of HTTPS. | `N` | false | +| `createMissingBucket` | Automatically creates a bucket if it does not exist. If a bucket does not exist, backup operation will fail. | `N` | false | +| `skipBucketVerification` | Do not check the existence of a bucket. | `N` | false | +| `skipRefreshing` | If set to true, refreshment of an object in a remote bucket (e.g. for s3) will be skipped. 
| `N` | false | +| `retry` | Retry configuration for backup procedure | `N` | | +| `retry.enabled` | Enable or disable retry mechanism on failure | `N` | false | +| `retry.interval` | Time gap between retries, linear strategy will have always this gap constant, exponential strategy will make the gap bigger exponentially (power of 2) on each attempt | `N` | | +| `retry.strategy` | Strategy how retry should be driven, might be either 'LINEAR' or 'EXPONENTIAL' | `N` | | +| `retry.maxAttempts` | Number of repetitions of an upload / download operation in case it fails before giving up completely. | `N` | | + +See [icarus](https://github.com/instaclustr/icarus)/[esop](https://github.com/instaclustr/esop) documentation for more information on the fields as most of them are passed directly to icarus. \ No newline at end of file diff --git a/docs/docs/cassandrarestore-configuration.md b/docs/docs/cassandrarestore-configuration.md new file mode 100644 index 0000000..46500df --- /dev/null +++ b/docs/docs/cassandrarestore-configuration.md @@ -0,0 +1,41 @@ +--- +title: CassandraRestore Configuration +slug: /cassandrarestore-configuration +--- + +## CassandraRestore Field Specification Reference + +| Field | Description | Is Required | Default | +|-----------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------|---------------| +| `cassandraCluster` | The CassandraCluster the restore is going to be used on | `Y` | | +| `cassandraBackup` | The CassandraBackup the operator is going to restore to the cluster. If omitted the `storageLocation`, `snapshotTag` and `secretName` should be set. | `N` | | +| `storageLocation` | Location of SSTables. Example: protocol://myBucket. protocol can be `gcp`, `s3`, `azure` or `oracle` | `N` | | +| `secretName` | Name of the secret where cloud storage credentials are located | `Y` | | +| `concurrentConnections` | number of threads used for upload, there might be at most so many uploading threads at any given time | `N` | `10` | +| `dc` | Name of datacenter(s) against which restore will be done. It means that nodes in a different DC will not receive restore requests. | `N` | | +| `entities` | database entities to backup, it might be either only keyspaces or only tables (from different keyspaces if needed). E.g. 'k1,k2' if one wants to backup whole keyspaces and 'ks1.t1,ks2,t2' if one wants to backup tables. | `N` | All keyspaces | +| `timeout` | number of hours to wait until backup is considered failed if not finished already | `N` | 5 | +| `noDeleteTruncates` | flag saying if we should not delete truncated SSTables after they are imported, as part of CLEANUP phase, defaults to false | `N` | false | +| `noDeleteDownloads` | flag saying if we should not delete downloaded SSTables from remote location, as part of CLEANUP phase, defaults to false | `N` | false | +| `noDownloadData` | flag saying if we should not download data from remote location as we expect them to be there already, defaults to false | `N` | false | +| `import` | controls the SSTables import process. 
fields are taken from ColumnFamilyStoreMBean#importNewSSTables | `N` | false | +| `import.keepLevel` | | `N` | false | +| `import.noVerify` | | `N` | false | +| `import.noVerifyTokens` | | `N` | false | +| `import.noInvalidateCaches` | | `N` | false | +| `import.quick` | if true, noVerifyTokens, noInvalidateCaches and noVerify will be set to true automatically | `N` | false | +| `import.extendedVerify` | | `N` | false | +| `import.keepRepaired` | | `N` | false | +| `resolveHostIdFromTopology` | if set to true, host id of node to restore will be resolved from remote topology file located in a bucket by translating it from provided nodeId of storageLocation field | `N` | false | +| `insecure` | Relevant during upload to S3-like bucket only. If true, communication is done via HTTP instead of HTTPS. | `N` | false | +| `skipBucketVerification` | Do not check the existence of a bucket. | `N` | false | +| `retry` | Retry configuration for restore procedure | `N` | | +| `retry.enabled` | Enable or disable retry mechanism on failure | `N` | false | +| `retry.interval` | Time gap between retries, linear strategy will have always this gap constant, exponential strategy will make the gap bigger exponentially (power of 2) on each attempt | `N` | | +| `retry.strategy` | Strategy how retry should be driven, might be either 'LINEAR' or 'EXPONENTIAL' | `N` | | +| `retry.maxAttempts` | Number of repetitions of an upload / download operation in case it fails before giving up completely. | `N` | | +| `rename` | Map of key and values where keys and values are in format "keyspace.table", if key is "ks1.tb1" and value is "ks1.tb2", it means that upon restore, table ks1.tb1 will be restored into table ks1.tb2. | `N` | | +| `schemaVersion` | version of schema we want to restore from | `N` | | +| `exactSchemaVersion` | flag saying if we indeed want a schema version of a running node match with schema version a snapshot is taken on | `N` | false | + +See [icarus](https://github.com/instaclustr/icarus)/[esop](https://github.com/instaclustr/esop) documentation for more information on the fields as most of them are passed directly to icarus. 
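For completeness, the same resources can also be created programmatically. A minimal Go sketch mirroring the earlier YAML backup example; only fields that appear in this patch are set, and all values are placeholders:

```go
package example

import (
	"github.com/ibm/cassandra-operator/api/v1alpha1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// exampleBackup mirrors the YAML backup example; values are placeholders.
var exampleBackup = v1alpha1.CassandraBackup{
	ObjectMeta: metav1.ObjectMeta{Name: "example-backup", Namespace: "default"},
	Spec: v1alpha1.CassandraBackupSpec{
		CassandraCluster: "test-cluster",
		StorageLocation:  "s3://bucket-name/backup/location",
		SecretName:       "backup-restore-credentials",
	},
}
```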
\ No newline at end of file diff --git a/icarus/Dockerfile b/icarus/Dockerfile new file mode 100644 index 0000000..957d770 --- /dev/null +++ b/icarus/Dockerfile @@ -0,0 +1,13 @@ +ARG DOCKER_PROXY_REGISTRY="" +FROM ${DOCKER_PROXY_REGISTRY}openjdk:11-jre-bullseye +ARG ICARUS_VERSION=2.0.4 + +RUN wget https://github.com/instaclustr/icarus/releases/download/icarus-${ICARUS_VERSION}/icarus.jar + +RUN addgroup --gid 999 cassandra && adduser --uid 999 --gid 999 cassandra + +RUN apt-get update && apt-get -y upgrade && rm -rf /var/lib/apt/lists/* + +USER cassandra + +ENTRYPOINT ["java", "-jar", "icarus.jar", "icarus"] diff --git a/main.go b/main.go index 9569326..a06daef 100644 --- a/main.go +++ b/main.go @@ -23,6 +23,8 @@ import ( "os" "time" + "github.com/ibm/cassandra-operator/controllers/icarus" + "github.com/go-logr/zapr" "github.com/gocql/gocql" "go.uber.org/zap" @@ -40,6 +42,8 @@ import ( dbv1alpha1 "github.com/ibm/cassandra-operator/api/v1alpha1" "github.com/ibm/cassandra-operator/controllers" + "github.com/ibm/cassandra-operator/controllers/cassandrabackup" + "github.com/ibm/cassandra-operator/controllers/cassandrarestore" operatorCfg "github.com/ibm/cassandra-operator/controllers/config" "github.com/ibm/cassandra-operator/controllers/cql" "github.com/ibm/cassandra-operator/controllers/events" @@ -127,8 +131,8 @@ func main() { Scheme: mgr.GetScheme(), Cfg: *operatorConfig, Events: eventRecorder, - ProberClient: func(url *url.URL, auth prober.Auth) prober.ProberClient { - return prober.NewProberClient(url, httpClient, auth) + ProberClient: func(url *url.URL, user, password string) prober.ProberClient { + return prober.NewProberClient(url, httpClient, user, password) }, CqlClient: func(cluster *gocql.ClusterConfig) (cql.CqlClient, error) { return cql.NewCQLClient(cluster) }, NodectlClient: func(jolokiaAddr, jmxUser, jmxPassword string, logr *zap.SugaredLogger) nodectl.Nodectl { @@ -145,6 +149,38 @@ func main() { os.Exit(1) } + cassandraBackupReconciler := &cassandrabackup.CassandraBackupReconciler{ + Client: mgr.GetClient(), + Log: logr, + Scheme: mgr.GetScheme(), + Cfg: *operatorConfig, + Events: eventRecorder, + IcarusClient: func(coordinatorPodURL string) icarus.Icarus { + return icarus.New(coordinatorPodURL) + }, + } + err = cassandrabackup.SetupCassandraBackupReconciler(cassandraBackupReconciler, mgr) + if err != nil { + logr.With(zap.Error(err)).Error("unable to create controller", "controller", "CassandraBackup") + os.Exit(1) + } + + cassandraRestoreReconciler := &cassandrarestore.CassandraRestoreReconciler{ + Client: mgr.GetClient(), + Log: logr, + Scheme: mgr.GetScheme(), + Cfg: *operatorConfig, + Events: eventRecorder, + IcarusClient: func(coordinatorPodURL string) icarus.Icarus { + return icarus.New(coordinatorPodURL) + }, + } + err = cassandrarestore.SetupCassandraRestoreReconciler(cassandraRestoreReconciler, mgr) + if err != nil { + logr.With(zap.Error(err)).Error("unable to create controller", "controller", "CassandraRestore") + os.Exit(1) + } + // We use k8s.io/client-go client due to the fact that we require Create and Update operations. // So we can't use sigs.k8s.io/controller-runtime rest client as it requires running manager with cache. 
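One detail worth noting in the reconciler wiring above: `IcarusClient` is injected as a factory function rather than a concrete client, so tests can swap in a double without touching the reconciler. A hedged sketch of that pattern; `fakeIcarus` is a hypothetical test double, and the remaining reconciler fields are elided:

```go
package example

import (
	"github.com/ibm/cassandra-operator/controllers/cassandrabackup"
	"github.com/ibm/cassandra-operator/controllers/icarus"
)

// newTestBackupReconciler shows only the injection point; fakeIcarus is any
// test double that satisfies the icarus.Icarus interface.
func newTestBackupReconciler(fakeIcarus icarus.Icarus) *cassandrabackup.CassandraBackupReconciler {
	return &cassandrabackup.CassandraBackupReconciler{
		// ...Client, Log, Scheme, Cfg and Events wired as in main.go...
		IcarusClient: func(coordinatorPodURL string) icarus.Icarus {
			return fakeIcarus
		},
	}
}
```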
kubeClient, err := kubernetes.NewForConfig(restCfg) @@ -164,7 +200,15 @@ func main() { mgr.GetWebhookServer().Port = int(operatorConfig.WebhooksPort) mgr.GetWebhookServer().CertDir = names.OperatorWebhookTLSDir() if err = (&dbv1alpha1.CassandraCluster{}).SetupWebhookWithManager(mgr); err != nil { - logr.With(zap.Error(err)).Fatal("failed to setup webhook with manager") + logr.With(zap.Error(err)).Fatal("failed to setup webhook with manager for cassandracluster") + os.Exit(1) + } + if err = (&dbv1alpha1.CassandraBackup{}).SetupWebhookWithManager(mgr); err != nil { + logr.With(zap.Error(err)).Fatal("failed to setup webhook with manager for cassandrabackup") + os.Exit(1) + } + if err = (&dbv1alpha1.CassandraRestore{}).SetupWebhookWithManager(mgr); err != nil { + logr.With(zap.Error(err)).Fatal("failed to setup webhook with manager for cassandrarestore") os.Exit(1) } dbv1alpha1.SetWebhookLogger(logr) diff --git a/tests/e2e/auth_test.go b/tests/e2e/auth_test.go index 1ce7b1a..415479e 100644 --- a/tests/e2e/auth_test.go +++ b/tests/e2e/auth_test.go @@ -113,7 +113,7 @@ CREATE TABLE e2e_tests.e2e_tests_table ( fmt.Sprintf("cqlsh -u %s -p \"%s\" -e \"DESCRIBE KEYSPACES\"", testAdminRole, testAdminPassword), } Eventually(func() (string, error) { - execResult, err := execPod(pod.Name, pod.Namespace, selectQueryCmd) + execResult, err := execPod(pod.Name, pod.Namespace, selectQueryCmd, "cassandra") if err != nil { return "", err } @@ -211,7 +211,7 @@ CREATE TABLE e2e_tests.e2e_tests_table ( } Eventually(func() error { - _, err := execPod(pod.Name, pod.Namespace, cmd) + _, err := execPod(pod.Name, pod.Namespace, cmd, "cassandra") return err }, 3*time.Minute, 15*time.Second).ShouldNot(Succeed()) @@ -236,7 +236,7 @@ CREATE TABLE e2e_tests.e2e_tests_table ( "-c", fmt.Sprintf("sysctl -n %s", key), } - execResult, err := execPod(pod.Name, pod.Namespace, cmd) + execResult, err := execPod(pod.Name, pod.Namespace, cmd, "cassandra") Expect(err).ToNot(HaveOccurred()) Expect(execResult.stderr).To(BeEmpty()) Expect(strings.TrimSpace(execResult.stdout)).To(Equal(value)) @@ -278,7 +278,7 @@ func testCQLLogin(cc *dbv1alpha1.CassandraCluster, podName, podNamespace, roleNa } stdout := "" Eventually(func() error { - execResult, err := execPod(podName, podNamespace, cmd) + execResult, err := execPod(podName, podNamespace, cmd, "cassandra") stdout = execResult.stdout return err }, 5*time.Minute, 15*time.Second).Should(Succeed()) diff --git a/tests/e2e/managed_regions_test.go b/tests/e2e/managed_regions_test.go index e1a5b5e..fab5d2e 100644 --- a/tests/e2e/managed_regions_test.go +++ b/tests/e2e/managed_regions_test.go @@ -161,7 +161,7 @@ func expectNumberOfNodes(podName, podNamespace, roleName, rolePassword string, e } var stdout, stderr string Eventually(func() error { - execResult, err := execPod(podName, podNamespace, cmd) + execResult, err := execPod(podName, podNamespace, cmd, "cassandra") stdout = execResult.stdout stderr = execResult.stderr return err diff --git a/tests/e2e/network_policies_test.go b/tests/e2e/network_policies_test.go index 5867a5a..6572809 100644 --- a/tests/e2e/network_policies_test.go +++ b/tests/e2e/network_policies_test.go @@ -2,6 +2,8 @@ package e2e import ( "fmt" + "time" + "github.com/gogo/protobuf/proto" dbv1alpha1 "github.com/ibm/cassandra-operator/api/v1alpha1" "github.com/ibm/cassandra-operator/controllers/labels" @@ -10,12 +12,11 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - "time" 
"sigs.k8s.io/controller-runtime/pkg/client" ) -var _ = Describe("Network policies in multi-region cluster", func() { +var _ = Describe("Network policies in multi-region cluster", Serial, func() { ccName := "netpol-multi" adminRoleName := ccName + "-admin-role" @@ -149,21 +150,21 @@ var _ = Describe("Network policies in multi-region cluster", func() { By("Cassandra shouldn't accept connections from external clients") - cmd := []string{ - "bash", - "-c", - fmt.Sprintf("cqlsh --connect-timeout 1 -u %s -p \"%s\" %s-cassandra-dc1.%s.svc.cluster.local", testAdminRole, testAdminPassword, ccName, namespaceName1), - } - Eventually(func() error { - if _, err := execPod(testPodName, namespaceName1, []string{"bash", "-c", "ls"}); err != nil { + if _, err := execPod(testPodName, namespaceName1, []string{"bash", "-c", "ls"}, "test-container"); err != nil { return err } return nil }, time.Minute*1, time.Second*10).Should(Succeed()) + cmd := []string{ + "bash", + "-c", + fmt.Sprintf("cqlsh --connect-timeout 1 -u %s -p \"%s\" %s-cassandra-dc1.%s.svc.cluster.local", testAdminRole, testAdminPassword, ccName, namespaceName1), + } + Eventually(func() string { - execResult, _ := execPod(testPodName, namespaceName1, cmd) + execResult, _ := execPod(testPodName, namespaceName1, cmd, "test-container") return execResult.stderr }, time.Minute*1, time.Second*10).Should(ContainSubstring("Connection error")) @@ -176,7 +177,7 @@ var _ = Describe("Network policies in multi-region cluster", func() { } Eventually(func() string { - execResult, _ := execPod(testPodName, namespaceName1, cmd) + execResult, _ := execPod(testPodName, namespaceName1, cmd, "test-container") return execResult.stderr }, time.Minute*1, time.Second*10).Should(ContainSubstring("Connection timed out")) @@ -189,7 +190,20 @@ var _ = Describe("Network policies in multi-region cluster", func() { } Eventually(func() string { - execResult, _ := execPod(testPodName, namespaceName1, cmd) + execResult, _ := execPod(testPodName, namespaceName1, cmd, "test-container") + return execResult.stderr + }, time.Minute*1, time.Second*10).Should(ContainSubstring("Connection timed out")) + + By("Icarus shouldn't accept connections from external clients") + + cmd = []string{ + "bash", + "-c", + fmt.Sprintf("curl --show-error --connect-timeout 3 %s-cassandra-dc1.%s.svc.cluster.local:%d/operations", ccName, namespaceName1, dbv1alpha1.IcarusPort), + } + + Eventually(func() string { + execResult, _ := execPod(testPodName, namespaceName1, cmd, "test-container") return execResult.stderr }, time.Minute*1, time.Second*10).Should(ContainSubstring("Connection timed out")) @@ -199,7 +213,7 @@ var _ = Describe("Network policies in multi-region cluster", func() { Expect(kubeClient.Update(ctx, testPod)).To(Succeed()) Eventually(func() error { - if _, err := execPod(testPodName, namespaceName1, []string{"bash", "-c", "ls"}); err != nil { + if _, err := execPod(testPodName, namespaceName1, []string{"bash", "-c", "ls"}, "test-container"); err != nil { return err } return nil @@ -215,6 +229,7 @@ var _ = Describe("Network policies in multi-region cluster", func() { NamespaceSelector: &metav1.LabelSelector{ MatchLabels: map[string]string{"kubernetes.io/metadata.name": namespaceName1}, }, + Ports: []int32{dbv1alpha1.IcarusPort, dbv1alpha1.CqlPort}, }, } @@ -227,13 +242,25 @@ var _ = Describe("Network policies in multi-region cluster", func() { } Eventually(func() (string, error) { - execResult, err := execPod(testPodName, namespaceName1, cmd) + execResult, err := execPod(testPodName, 
namespaceName1, cmd, "test-container") if err != nil { return "", err } return execResult.stdout, nil }, time.Minute*1, time.Second*10).Should(ContainSubstring("system")) + By("Icarus should accept connections from external clients") + cmd = []string{ + "bash", + "-c", + fmt.Sprintf("curl --show-error --connect-timeout 3 %s-cassandra-dc1.%s.svc.cluster.local:%d/operations", ccName, namespaceName1, dbv1alpha1.IcarusPort), + } + + Eventually(func() string { + execResult, _ := execPod(testPodName, namespaceName1, cmd, "test-container") + return execResult.stdout + }, time.Minute*1, time.Second*10).Should(ContainSubstring("[ ]")) + By("Cassandra should accept connection for prometheus agent") Expect(kubeClient.Get(ctx, types.NamespacedName{Name: testPodName, Namespace: namespaceName1}, testPod)).To(Succeed()) testPod.Labels = podSelector @@ -261,7 +288,7 @@ var _ = Describe("Network policies in multi-region cluster", func() { } Eventually(func() (string, error) { - execResult, err := execPod(testPodName, namespaceName1, cmd) + execResult, err := execPod(testPodName, namespaceName1, cmd, "test-container") if err != nil { return "", err } diff --git a/tests/e2e/roles_test.go b/tests/e2e/roles_test.go index 8f7f235..34c9fe7 100644 --- a/tests/e2e/roles_test.go +++ b/tests/e2e/roles_test.go @@ -48,7 +48,7 @@ var _ = Describe("user provided roles", func() { stdout := "" Eventually(func() error { - execResult, err := execPod(pod.Name, pod.Namespace, cmd) + execResult, err := execPod(pod.Name, pod.Namespace, cmd, "cassandra") stdout = execResult.stdout return err }, 30*time.Second, 5*time.Second).Should(Succeed()) @@ -66,7 +66,7 @@ var _ = Describe("user provided roles", func() { Expect(kubeClient.Update(ctx, rolesSecret)).To(Succeed()) Eventually(func() error { - _, err := execPod(pod.Name, pod.Namespace, cmd) + _, err := execPod(pod.Name, pod.Namespace, cmd, "cassandra") return err }, 30*time.Second, 5*time.Second).ShouldNot(Succeed()) }) diff --git a/tests/e2e/suite_test.go b/tests/e2e/suite_test.go index 39f8821..a94b87f 100644 --- a/tests/e2e/suite_test.go +++ b/tests/e2e/suite_test.go @@ -122,7 +122,8 @@ var _ = JustAfterEach(func() { showPodLogs(map[string]string{v1alpha1.CassandraClusterInstance: cc.Name}, cc.Namespace) } - showClusterEvents() + writeClusterEvents() + GinkgoWriter.Println("Wrote logs to %s directory", debugLogsDir) } }) diff --git a/tests/e2e/unmanaged_region_test.go b/tests/e2e/unmanaged_region_test.go index 9b9781b..3fee0ee 100644 --- a/tests/e2e/unmanaged_region_test.go +++ b/tests/e2e/unmanaged_region_test.go @@ -150,7 +150,7 @@ func checkBroadcastAddressOnAllPods(podList *v1.PodList, nodeList *v1.NodeList, for _, node := range nodeList.Items { if node.Name == nodeName { cassandraIP := util.GetNodeIP(addressType, node.Status.Addresses) - execResult, err := execPod(pod.Name, pod.Namespace, cmd) + execResult, err := execPod(pod.Name, pod.Namespace, cmd, "cassandra") execResult.stdout = strings.TrimSuffix(execResult.stdout, "\n") execResult.stdout = strings.TrimSpace(execResult.stdout) Expect(err).ToNot(HaveOccurred()) diff --git a/tests/e2e/utils_test.go b/tests/e2e/utils_test.go index 9516cf6..bb90b65 100644 --- a/tests/e2e/utils_test.go +++ b/tests/e2e/utils_test.go @@ -4,9 +4,9 @@ import ( "bytes" "fmt" "io" - "io/ioutil" "net/http" "net/url" + "os" "strings" "time" @@ -185,7 +185,7 @@ func doHTTPRequest(method string, url string) ([]byte, int, error) { defer func() { _ = resp.Body.Close() }() // Parse response - body, err := ioutil.ReadAll(resp.Body) + body, err := 
io.ReadAll(resp.Body) if err != nil { return nil, 0, err } @@ -222,7 +222,7 @@ func getPodLogs(pod v1.Pod, podLogOpts v1.PodLogOptions) (string, error) { return str, err } -func execPod(podName string, namespace string, cmd []string) (ExecResult, error) { +func execPod(podName string, namespace string, cmd []string, containerName string) (ExecResult, error) { req := k8sClientset.CoreV1().RESTClient().Post().Resource("pods").Name(podName). Namespace(namespace).SubResource("exec") option := &v1.PodExecOptions{ @@ -233,6 +233,10 @@ func execPod(podName string, namespace string, cmd []string) (ExecResult, error) TTY: false, } + if len(containerName) != 0 { + option.Container = containerName + } + req.VersionedParams( option, scheme.ParameterCodec, @@ -300,31 +304,22 @@ func showPodLogs(labels map[string]string, namespace string) { } for _, pod := range podList.Items { - fmt.Println("Logs from pod: ", pod.Name) - for _, container := range pod.Spec.Containers { logFileName := fmt.Sprintf("%s%s-%s-%s.txt", debugLogsDir, pod.Namespace, pod.Name, container.Name) - str, err := getPodLogs(pod, v1.PodLogOptions{TailLines: &cfg.tailLines, Container: container.Name}) - if err != nil { - fileContent := []byte(fmt.Sprintf("couldn't get logs for pod %s/%s: %s", pod.Namespace, pod.Name, err.Error())) - Expect(ioutil.WriteFile(logFileName, fileContent, 0777)).To(Succeed()) - continue - } - fmt.Println(str) allLogs, err := getPodLogs(pod, v1.PodLogOptions{Container: container.Name}) if err != nil { fileContent := []byte(fmt.Sprintf("couldn't get logs for pod %s/%s container %s: %s", pod.Namespace, pod.Name, container.Name, err.Error())) - Expect(ioutil.WriteFile(logFileName, fileContent, 0777)).To(Succeed()) + Expect(os.WriteFile(logFileName, fileContent, 0777)).To(Succeed()) continue } - Expect(ioutil.WriteFile(logFileName, []byte(allLogs), 0777)).To(Succeed()) + Expect(os.WriteFile(logFileName, []byte(allLogs), 0777)).To(Succeed()) } } } -// showClusterEvents shows all events from the cluster. Helpful if the pods were not able to be schedule -func showClusterEvents() { +// writeClusterEvents write all events from the cluster to a file. 
Helpful if the pods were not able to be schedule +func writeClusterEvents() { eventsList := &v1.EventList{} err := kubeClient.List(ctx, eventsList) if err != nil { @@ -342,14 +337,7 @@ func showClusterEvents() { )) } - startIndex := len(eventsOutput) - int(cfg.tailLines) - if startIndex < 0 { - startIndex = 0 - } - - fmt.Println("Kubernetes events: ") - fmt.Print(strings.Join(eventsOutput[startIndex:], "\n")) - Expect(ioutil.WriteFile(debugLogsDir+"cluster-events.txt", []byte(strings.Join(eventsOutput, "\n")), 0777)).To(Succeed()) + Expect(os.WriteFile(debugLogsDir+"cluster-events.txt", []byte(strings.Join(eventsOutput, "\n")), 0777)).To(Succeed()) } diff --git a/tests/e2e/zone_as_racks_test.go b/tests/e2e/zone_as_racks_test.go index ba4fb53..941d22c 100644 --- a/tests/e2e/zone_as_racks_test.go +++ b/tests/e2e/zone_as_racks_test.go @@ -46,7 +46,7 @@ var _ = Describe("Cassandra cluster", func() { } for _, p := range podList.Items { - r, err := execPod(p.Name, p.Namespace, cmd) + r, err := execPod(p.Name, p.Namespace, cmd, "cassandra") Expect(err).ToNot(HaveOccurred()) r.stdout = strings.TrimSuffix(r.stdout, "\n") @@ -83,7 +83,7 @@ var _ = Describe("Cassandra cluster", func() { } for _, p := range podList.Items { - r, err := execPod(p.Name, p.Namespace, cmd) + r, err := execPod(p.Name, p.Namespace, cmd, "cassandra") Expect(err).ToNot(HaveOccurred()) r.stdout = strings.TrimSuffix(r.stdout, "\n") diff --git a/tests/integration/cassandrabackup_test.go b/tests/integration/cassandrabackup_test.go new file mode 100644 index 0000000..b81907e --- /dev/null +++ b/tests/integration/cassandrabackup_test.go @@ -0,0 +1,141 @@ +package integration + +import ( + "github.com/gogo/protobuf/proto" + "github.com/ibm/cassandra-operator/api/v1alpha1" + "github.com/ibm/cassandra-operator/controllers/icarus" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" +) + +var _ = Describe("created cassandrabackup", func() { + ccTpl := &v1alpha1.CassandraCluster{ + ObjectMeta: cassandraObjectMeta, + Spec: v1alpha1.CassandraClusterSpec{ + DCs: []v1alpha1.DC{ + { + Name: "dc1", + Replicas: proto.Int32(6), + }, + }, + AdminRoleSecretName: "admin-role", + ImagePullSecretName: "pullSecretName", + }, + } + + storageSecretTpl := &v1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "storage-credentials", Namespace: cassandraObjectMeta.Namespace}, + Data: map[string][]byte{ + "awsaccesskeyid": []byte("key-id"), + "awssecretaccesskey": []byte("access-key"), + "awsregion": []byte("us-east"), + "awsendpoint": []byte("https://s3.us-east.cloud-object-storage.appdomain.cloud"), + }, + } + + cbTpl := &v1alpha1.CassandraBackup{ + ObjectMeta: cassandraBackupObjectMeta, + Spec: v1alpha1.CassandraBackupSpec{ + CassandraCluster: cassandraObjectMeta.Name, + StorageLocation: "s3://bucket", + SecretName: storageSecretTpl.Name, + }, + } + + It("should send an icarus backup request and track progress", func() { + cc := ccTpl.DeepCopy() + cb := cbTpl.DeepCopy() + createReadyCluster(cc) + Expect(k8sClient.Create(ctx, storageSecretTpl.DeepCopy())).To(Succeed()) + Expect(k8sClient.Create(ctx, cb)).To(Succeed()) + Eventually(func() []icarus.Backup { + return mockIcarusClient.backups + }, mediumTimeout, mediumRetry).Should(HaveLen(1)) + Eventually(func() string { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cb.Namespace, Name: cb.Name}, cb)).To(Succeed()) + return cb.Status.State + }, mediumTimeout, mediumRetry).Should(Equal(icarus.StateRunning)) + Eventually(func() int { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cb.Namespace, Name: cb.Name}, cb)).To(Succeed()) + return cb.Status.Progress + }, mediumTimeout, mediumRetry).Should(Equal(0)) + + mockIcarusClient.backups[0].Progress = 0.43253 + + Eventually(func() int { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cb.Namespace, Name: cb.Name}, cb)).To(Succeed()) + return cb.Status.Progress + }, mediumTimeout, mediumRetry).Should(Equal(43)) + + mockIcarusClient.backups[0].Progress = 1.0 + mockIcarusClient.backups[0].State = icarus.StateCompleted + + Eventually(func() string { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cb.Namespace, Name: cb.Name}, cb)).To(Succeed()) + return cb.Status.State + }, mediumTimeout, mediumRetry).Should(Equal(icarus.StateCompleted)) + Eventually(func() int { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cb.Namespace, Name: cb.Name}, cb)).To(Succeed()) + return cb.Status.Progress + }, mediumTimeout, mediumRetry).Should(Equal(100)) + }) + + Context("with failed backup", func() { + It("should reflect errors in the status", func() { + cc := ccTpl.DeepCopy() + cb := cbTpl.DeepCopy() + Expect(k8sClient.Create(ctx, storageSecretTpl.DeepCopy())).To(Succeed()) + createReadyCluster(cc) + Expect(k8sClient.Create(ctx, cb)).To(Succeed()) + Eventually(func() []icarus.Backup { + return mockIcarusClient.backups + }, mediumTimeout, mediumRetry).Should(HaveLen(1)) + Eventually(func() string { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cb.Namespace, Name: cb.Name}, cb)).To(Succeed()) + return cb.Status.State + }, mediumTimeout, mediumRetry).Should(Equal(icarus.StateRunning)) + Eventually(func() int { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cb.Namespace, Name: cb.Name}, cb)).To(Succeed()) + 
return cb.Status.Progress + }, mediumTimeout, mediumRetry).Should(Equal(0)) + + mockIcarusClient.backups[0].Progress = 1 + mockIcarusClient.backups[0].State = icarus.StateFailed + mockIcarusClient.backups[0].Errors = []icarus.Error{ + { + Source: "pod-1", + Message: "some error", + }, + { + Source: "pod-2", + Message: "another error", + }, + } + + Eventually(func() string { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cb.Namespace, Name: cb.Name}, cb)).To(Succeed()) + return cb.Status.State + }, mediumTimeout, mediumRetry).Should(Equal(icarus.StateFailed)) + Eventually(func() int { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cb.Namespace, Name: cb.Name}, cb)).To(Succeed()) + return cb.Status.Progress + }, mediumTimeout, mediumRetry).Should(Equal(100)) + Eventually(func() []v1alpha1.BackupError { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cb.Namespace, Name: cb.Name}, cb)).To(Succeed()) + return cb.Status.Errors + }, mediumTimeout, mediumRetry).Should(BeEquivalentTo([]v1alpha1.BackupError{ + { + Source: "pod-1", + Message: "some error", + }, + { + Source: "pod-2", + Message: "another error", + }, + })) + }) + }) +}) diff --git a/tests/integration/cassandracluster_test.go b/tests/integration/cassandracluster_test.go index 0e7f8f4..a100134 100644 --- a/tests/integration/cassandracluster_test.go +++ b/tests/integration/cassandracluster_test.go @@ -94,7 +94,9 @@ var _ = Describe("prober, statefulsets and reaper", func() { }, mediumTimeout, mediumRetry).Should(Succeed()) By("Cassandra run command should be set correctly") - Expect(sts.Spec.Template.Spec.Containers[0].Args).To(BeEquivalentTo([]string{ + cassandraContainer, found := getContainerByName(sts.Spec.Template.Spec, "cassandra") + Expect(found).To(BeTrue()) + Expect(cassandraContainer.Args).To(BeEquivalentTo([]string{ "bash", "-c", fmt.Sprintf("rm -rf /var/lib/cassandra/data/system/peers*\n" + @@ -125,6 +127,29 @@ var _ = Describe("prober, statefulsets and reaper", func() { "-Djava.security.auth.login.config=$CASSANDRA_HOME/conf/cassandra-jaas.config " + "-Dcassandra.jmx.authorizer=org.apache.cassandra.auth.jmx.AuthorizationProxy"), })) + + icarusContainer, found := getContainerByName(sts.Spec.Template.Spec, "icarus") + Expect(found).To(BeTrue()) + Expect(icarusContainer.Args).To(BeEquivalentTo([]string{"--jmx-credentials=/etc/cassandra-auth-config/icarus-jmx", "--jmx-client-auth=true"})) + Expect(icarusContainer.VolumeMounts).To(BeEquivalentTo([]v1.VolumeMount{ + { + Name: "data", + ReadOnly: false, + MountPath: "/var/lib/cassandra", + SubPath: "", + MountPropagation: nil, + SubPathExpr: "", + }, + { + Name: "auth-config", + ReadOnly: false, + MountPath: "/etc/cassandra-auth-config/", + SubPath: "", + MountPropagation: nil, + SubPathExpr: "", + }, + })) + Expect(sts.Spec.Template.Spec.TopologySpreadConstraints).To(BeEquivalentTo([]v1.TopologySpreadConstraint{ { TopologyKey: v1.LabelTopologyZone, @@ -291,6 +316,11 @@ var _ = Describe("prober, statefulsets and reaper", func() { Eventually(func() []string { return mockReaperClient.clusters }, shortTimeout, shortRetry).Should(BeEquivalentTo([]string{cc.Name})) + + Eventually(func() bool { + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: cc.Name, Namespace: cc.Namespace}, cc)).To(Succeed()) + return cc.Status.Ready + }).Should(BeTrue()) }) }) }) diff --git a/tests/integration/cassandrarestore_test.go b/tests/integration/cassandrarestore_test.go new file mode 100644 index 0000000..b56e5cf --- /dev/null +++ 
b/tests/integration/cassandrarestore_test.go @@ -0,0 +1,167 @@ +package integration + +import ( + "github.com/gogo/protobuf/proto" + "github.com/ibm/cassandra-operator/api/v1alpha1" + "github.com/ibm/cassandra-operator/controllers/icarus" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" +) + +var _ = Describe("created cassandrarestore", func() { + ccTpl := &v1alpha1.CassandraCluster{ + ObjectMeta: cassandraObjectMeta, + Spec: v1alpha1.CassandraClusterSpec{ + DCs: []v1alpha1.DC{ + { + Name: "dc1", + Replicas: proto.Int32(6), + }, + }, + AdminRoleSecretName: "admin-role", + ImagePullSecretName: "pullSecretName", + }, + } + + storageSecretTpl := &v1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "storage-credentials", Namespace: cassandraObjectMeta.Namespace}, + Data: map[string][]byte{ + "awsaccesskeyid": []byte("key-id"), + "awssecretaccesskey": []byte("access-key"), + "awsregion": []byte("us-east"), + "awsendpoint": []byte("https://s3.us-east.cloud-object-storage.appdomain.cloud"), + }, + } + + cbTpl := &v1alpha1.CassandraBackup{ + ObjectMeta: cassandraBackupObjectMeta, + Spec: v1alpha1.CassandraBackupSpec{ + CassandraCluster: cassandraObjectMeta.Name, + StorageLocation: "s3://bucket", + SecretName: storageSecretTpl.Name, + }, + } + + crTpl := &v1alpha1.CassandraRestore{ + ObjectMeta: cassandraRestoreObjectMeta, + Spec: v1alpha1.CassandraRestoreSpec{ + CassandraCluster: cassandraObjectMeta.Name, + CassandraBackup: cassandraBackupObjectMeta.Name, + }, + } + + It("should send an icarus restore request and track progress", func() { + cc := ccTpl.DeepCopy() + cr := crTpl.DeepCopy() + cb := cbTpl.DeepCopy() + createReadyCluster(cc) + Expect(k8sClient.Create(ctx, storageSecretTpl.DeepCopy())).To(Succeed()) + Expect(k8sClient.Create(ctx, cb)).To(Succeed()) + Eventually(func() []icarus.Backup { + return mockIcarusClient.backups + }, mediumTimeout, mediumRetry).Should(HaveLen(1)) + + mockIcarusClient.backups[0].Progress = 1.0 + mockIcarusClient.backups[0].State = icarus.StateCompleted + + Expect(k8sClient.Create(ctx, cr)).To(Succeed()) + Eventually(func() []icarus.Restore { + return mockIcarusClient.restores + }, mediumTimeout, mediumRetry).Should(HaveLen(1)) + Eventually(func() string { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cr.Namespace, Name: cr.Name}, cr)).To(Succeed()) + return cr.Status.State + }, mediumTimeout, mediumRetry).Should(Equal(icarus.StateRunning)) + Eventually(func() int { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cr.Namespace, Name: cr.Name}, cr)).To(Succeed()) + return cr.Status.Progress + }, mediumTimeout, mediumRetry).Should(Equal(0)) + + mockIcarusClient.restores[0].Progress = 0.53 + + Eventually(func() int { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cr.Namespace, Name: cr.Name}, cr)).To(Succeed()) + return cr.Status.Progress + }, mediumTimeout, mediumRetry).Should(Equal(53)) + + mockIcarusClient.restores[0].Progress = 1 + mockIcarusClient.restores[0].State = icarus.StateCompleted + + Eventually(func() string { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cr.Namespace, Name: cr.Name}, cr)).To(Succeed()) + return cr.Status.State + }, mediumTimeout, mediumRetry).Should(Equal(icarus.StateCompleted)) + Eventually(func() int { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cr.Namespace, Name: cr.Name}, cr)).To(Succeed()) + return cr.Status.Progress + }, mediumTimeout, 
mediumRetry).Should(Equal(100)) + }) + + Context("with failed restore", func() { + It("should reflect errors in the status", func() { + cc := ccTpl.DeepCopy() + cr := crTpl.DeepCopy() + cb := cbTpl.DeepCopy() + createReadyCluster(cc) + Expect(k8sClient.Create(ctx, storageSecretTpl.DeepCopy())).To(Succeed()) + Expect(k8sClient.Create(ctx, cb)).To(Succeed()) + Eventually(func() []icarus.Backup { + return mockIcarusClient.backups + }, mediumTimeout, mediumRetry).Should(HaveLen(1)) + + mockIcarusClient.backups[0].Progress = 1.0 + mockIcarusClient.backups[0].State = icarus.StateCompleted + + Expect(k8sClient.Create(ctx, cr)).To(Succeed()) + Eventually(func() []icarus.Restore { + return mockIcarusClient.restores + }, mediumTimeout, mediumRetry).Should(HaveLen(1)) + Eventually(func() string { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cr.Namespace, Name: cr.Name}, cr)).To(Succeed()) + return cr.Status.State + }, mediumTimeout, mediumRetry).Should(Equal(icarus.StateRunning)) + Eventually(func() int { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cr.Namespace, Name: cr.Name}, cr)).To(Succeed()) + return cr.Status.Progress + }, mediumTimeout, mediumRetry).Should(Equal(0)) + + mockIcarusClient.restores[0].Progress = 1 + mockIcarusClient.restores[0].State = icarus.StateFailed + mockIcarusClient.restores[0].Errors = []icarus.Error{ + { + Source: "pod-1", + Message: "some error", + }, + { + Source: "pod-2", + Message: "another error", + }, + } + + Eventually(func() string { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cr.Namespace, Name: cr.Name}, cr)).To(Succeed()) + return cr.Status.State + }, mediumTimeout, mediumRetry).Should(Equal(icarus.StateFailed)) + Eventually(func() int { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cr.Namespace, Name: cr.Name}, cr)).To(Succeed()) + return cr.Status.Progress + }, mediumTimeout, mediumRetry).Should(Equal(100)) + Eventually(func() []v1alpha1.RestoreError { + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: cr.Namespace, Name: cr.Name}, cr)).To(Succeed()) + return cr.Status.Errors + }, mediumTimeout, mediumRetry).Should(BeEquivalentTo([]v1alpha1.RestoreError{ + { + Source: "pod-1", + Message: "some error", + }, + { + Source: "pod-2", + Message: "another error", + }, + })) + }) + }) +}) diff --git a/tests/integration/mocks_test.go b/tests/integration/mocks_test.go index d86ad7c..2b9faaa 100644 --- a/tests/integration/mocks_test.go +++ b/tests/integration/mocks_test.go @@ -4,6 +4,9 @@ import ( "context" "reflect" "strconv" + "time" + + "github.com/ibm/cassandra-operator/controllers/icarus" "github.com/ibm/cassandra-operator/controllers/nodectl" @@ -47,6 +50,92 @@ type reaperMock struct { err error } +type icarusMock struct { + backups []icarus.Backup + restores []icarus.Restore + error +} + +func (i *icarusMock) Backup(ctx context.Context, req icarus.BackupRequest) (icarus.Backup, error) { + backup := icarus.Backup{ + ID: "random_id", + State: icarus.StateRunning, + Progress: 0.0, + Type: "backup", + CreationTime: time.Now().Format(time.RFC3339), + StartTime: time.Now().Format(time.RFC3339), + Errors: nil, + SchemaVersion: "schema-1", + + StorageLocation: req.StorageLocation, + Duration: req.Duration, + SnapshotTag: req.SnapshotTag, + ConcurrentConnections: req.ConcurrentConnections, + Entities: req.Entities, + K8sSecretName: req.K8sSecretName, + K8sNamespace: req.K8sNamespace, + Retry: req.Retry, + MetadataDirective: req.MetadataDirective, + Timeout: req.Timeout, + SkipBucketVerification: 
req.SkipBucketVerification, + SkipRefreshing: req.SkipRefreshing, + CreateMissingBucket: req.CreateMissingBucket, + DC: req.DC, + Insecure: req.Insecure, + DataDirs: req.DataDirs, + Bandwidth: req.Bandwidth, + GlobalRequest: req.GlobalRequest, + } + + i.backups = append(i.backups, backup) + return backup, i.error +} + +func (i *icarusMock) Backups(ctx context.Context) ([]icarus.Backup, error) { + return i.backups, i.error +} + +func (i *icarusMock) Restore(ctx context.Context, req icarus.RestoreRequest) error { + restore := icarus.Restore{ + Id: "random_id", + CreationTime: time.Now().Format(time.RFC3339), + State: icarus.StateRunning, + Errors: nil, + Progress: 0.0, + StartTime: time.Now().Format(time.RFC3339), + Type: "restore", + StorageLocation: req.StorageLocation, + ConcurrentConnections: req.ConcurrentConnections, + SnapshotTag: req.SnapshotTag, + Entities: req.Entities, + RestorationStrategyType: req.RestorationStrategyType, + RestorationPhase: req.RestorationPhase, + Import: req.Import, + NoDeleteTruncates: req.NoDeleteTruncates, + NoDeleteDownloads: req.NoDeleteDownloads, + NoDownloadData: req.NoDownloadData, + ExactSchemaVersion: req.ExactSchemaVersion, + GlobalRequest: req.GlobalRequest, + Timeout: req.Timeout, + ResolveHostIdFromTopology: req.ResolveHostIdFromTopology, + Insecure: req.Insecure, + SkipBucketVerification: req.SkipBucketVerification, + Retry: req.Retry, + SinglePhase: req.SinglePhase, + DataDirs: req.DataDirs, + DC: req.DC, + K8sNamespace: req.K8sNamespace, + K8sSecretName: req.K8sSecretName, + Rename: req.Rename, + } + i.restores = append(i.restores, restore) + return i.error +} + +func (i *icarusMock) Restores(ctx context.Context) ([]icarus.Restore, error) { + return i.restores, i.error +} + func (r proberMock) Ready(ctx context.Context) (bool, error) { return r.ready, r.err } diff --git a/tests/integration/network_policies_test.go b/tests/integration/network_policies_test.go index c56f1bf..d8eee12 100644 --- a/tests/integration/network_policies_test.go +++ b/tests/integration/network_policies_test.go @@ -159,6 +159,10 @@ var _ = Describe("network policies", func() { Port: &intstr.IntOrString{IntVal: dbv1alpha1.IntraPort}, Protocol: &protocolTCP, }, + { + Port: &intstr.IntOrString{IntVal: dbv1alpha1.IcarusPort}, + Protocol: &protocolTCP, + }, }, From: []nwv1.NetworkPolicyPeer{ { @@ -177,6 +181,10 @@ var _ = Describe("network policies", func() { Port: &intstr.IntOrString{IntVal: dbv1alpha1.CqlPort}, Protocol: &protocolTCP, }, + { + Port: &intstr.IntOrString{IntVal: dbv1alpha1.IcarusPort}, + Protocol: &protocolTCP, + }, }, From: []nwv1.NetworkPolicyPeer{ { @@ -254,6 +262,10 @@ var _ = Describe("network policies", func() { Port: &intstr.IntOrString{IntVal: dbv1alpha1.IntraPort}, Protocol: &protocolTCP, }, + { + Port: &intstr.IntOrString{IntVal: dbv1alpha1.IcarusPort}, + Protocol: &protocolTCP, + }, }, From: []nwv1.NetworkPolicyPeer{ { @@ -285,6 +297,10 @@ var _ = Describe("network policies", func() { Port: &intstr.IntOrString{IntVal: dbv1alpha1.IntraPort}, Protocol: &protocolTCP, }, + { + Port: &intstr.IntOrString{IntVal: dbv1alpha1.IcarusPort}, + Protocol: &protocolTCP, + }, }, From: []nwv1.NetworkPolicyPeer{ { diff --git a/tests/integration/suite_test.go b/tests/integration/suite_test.go index 035eeda..a05c487 100644 --- a/tests/integration/suite_test.go +++ b/tests/integration/suite_test.go @@ -29,6 +29,12 @@ import ( "testing" "time" + "github.com/ibm/cassandra-operator/controllers/icarus" + + "github.com/ibm/cassandra-operator/controllers/cassandrarestore" + 
+ "github.com/ibm/cassandra-operator/controllers/cassandrabackup" + "github.com/ibm/cassandra-operator/controllers/nodectl" "sigs.k8s.io/controller-runtime/pkg/event" @@ -84,6 +90,7 @@ var mockNodectlClient = &nodectlMock{} var mockNodetoolClient = &nodetoolMock{} var mockCQLClient = &cqlMock{} var mockReaperClient = &reaperMock{} +var mockIcarusClient = &icarusMock{} var operatorConfig = config.Config{} var ctx = context.Background() var logr = zap.NewNop() @@ -104,6 +111,14 @@ var ( Namespace: "default", Name: "test-cassandra-cluster", } + cassandraBackupObjectMeta = metav1.ObjectMeta{ + Namespace: "default", + Name: "test-cassandra-backup", + } + cassandraRestoreObjectMeta = metav1.ObjectMeta{ + Namespace: "default", + Name: "test-cassandra-restore", + } reaperDeploymentLabels = map[string]string{ v1alpha1.CassandraClusterComponent: v1alpha1.CassandraClusterComponentReaper, @@ -182,6 +197,7 @@ var _ = BeforeSuite(func() { DefaultProberImage: "prober/image", DefaultJolokiaImage: "jolokia/image", DefaultReaperImage: "reaper/image", + DefaultIcarusImage: "icarus/image", } sch := scheme.Scheme k8sClient, err = client.New(cfg, client.Options{Scheme: sch}) @@ -206,13 +222,21 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) Expect(mgr).ToNot(BeNil()) + err = (&v1alpha1.CassandraBackup{}).SetupWebhookWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + Expect(mgr).ToNot(BeNil()) + + err = (&v1alpha1.CassandraRestore{}).SetupWebhookWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + Expect(mgr).ToNot(BeNil()) + cassandraCtrl := &controllers.CassandraClusterReconciler{ Log: logr.Sugar(), Scheme: sch, Client: k8sClient, Cfg: operatorConfig, Events: events.NewEventRecorder(&record.FakeRecorder{}), - ProberClient: func(url *url.URL, auth prober.Auth) prober.ProberClient { + ProberClient: func(url *url.URL, user, password string) prober.ProberClient { return mockProberClient }, CqlClient: func(clusterConfig *gocql.ClusterConfig) (cql.CqlClient, error) { @@ -241,9 +265,34 @@ var _ = BeforeSuite(func() { }, } + cassandraBackupCtrl := &cassandrabackup.CassandraBackupReconciler{ + Log: logr.Sugar(), + Scheme: sch, + Client: k8sClient, + Cfg: operatorConfig, + Events: events.NewEventRecorder(&record.FakeRecorder{}), + IcarusClient: func(coordinatorPodURL string) icarus.Icarus { + return mockIcarusClient + }, + } + + cassandraRestoreCtrl := &cassandrarestore.CassandraRestoreReconciler{ + Log: logr.Sugar(), + Scheme: sch, + Client: k8sClient, + Cfg: operatorConfig, + Events: events.NewEventRecorder(&record.FakeRecorder{}), + IcarusClient: func(coordinatorPodURL string) icarus.Icarus { + return mockIcarusClient + }, + } + testReconciler := SetupTestReconcile(cassandraCtrl) - err = controllers.SetupCassandraReconciler(testReconciler, mgr, zap.NewNop().Sugar(), make(chan event.GenericEvent)) - Expect(err).ToNot(HaveOccurred()) + Expect(controllers.SetupCassandraReconciler(testReconciler, mgr, zap.NewNop().Sugar(), make(chan event.GenericEvent))).To(Succeed()) + testBackupReconciler := SetupTestReconcile(cassandraBackupCtrl) + Expect(cassandrabackup.SetupCassandraBackupReconciler(testBackupReconciler, mgr)).To(Succeed()) + testRestoreReconciler := SetupTestReconcile(cassandraRestoreCtrl) + Expect(cassandrarestore.SetupCassandraRestoreReconciler(testRestoreReconciler, mgr)).To(Succeed()) mgrStopCh = StartTestManager(mgr) }) @@ -266,11 +315,29 @@ var _ = AfterEach(func() { return reconcileInProgress }, longTimeout, mediumRetry).Should(BeFalse(), "Test didn't stop triggering reconcile events. 
See operator logs for more details.") CleanUpCreatedResources(cassandraObjectMeta.Name, cassandraObjectMeta.Namespace) + backup := &v1alpha1.CassandraBackup{} + err := k8sClient.Get(ctx, types.NamespacedName{Name: cassandraBackupObjectMeta.Name, Namespace: cassandraObjectMeta.Namespace}, backup) + if err == nil { + Expect(deleteResource(types.NamespacedName{Name: backup.Spec.SecretName, Namespace: cassandraBackupObjectMeta.Namespace}, &v1.Secret{})).To(Succeed()) + expectResourceIsDeleted(types.NamespacedName{Name: backup.Spec.SecretName, Namespace: cassandraBackupObjectMeta.Namespace}, &v1.Secret{}) + Expect(k8sClient.Delete(ctx, backup)).To(Succeed()) + } + + restore := &v1alpha1.CassandraRestore{} + err = k8sClient.Get(ctx, types.NamespacedName{Name: cassandraRestoreObjectMeta.Name, Namespace: cassandraRestoreObjectMeta.Namespace}, restore) + if err == nil { + if len(restore.Spec.SecretName) > 0 { + Expect(deleteResource(types.NamespacedName{Name: restore.Spec.SecretName, Namespace: cassandraRestoreObjectMeta.Namespace}, &v1.Secret{})).To(Succeed()) + expectResourceIsDeleted(types.NamespacedName{Name: restore.Spec.SecretName, Namespace: cassandraRestoreObjectMeta.Namespace}, &v1.Secret{}) + } + Expect(k8sClient.Delete(ctx, restore)).To(Succeed()) + } mockProberClient = &proberMock{} mockNodectlClient = &nodectlMock{} mockNodetoolClient = &nodetoolMock{} mockCQLClient = &cqlMock{} mockReaperClient = &reaperMock{} + mockIcarusClient = &icarusMock{} testFinished = false })