From f79e3c563e7f5bee3880c26ecda62d780a1a12e7 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sat, 18 Apr 2026 19:19:48 +0000
Subject: [PATCH] [infra] Remove mysql InnoDB Cluster + Operator HCL (Phase 4 cleanup) [ci skip]
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Context

On 2026-04-16 (memory #711) MySQL was migrated from an InnoDB Cluster (3-member Group Replication + MySQL Operator) to a raw `kubernetes_stateful_set_v1.mysql_standalone` on `mysql:8.4`. The migration preserved the `mysql.dbaas` Service name (the selector switched to the standalone pod), all 20 databases / 688 tables / 14 users were dump-restored, and Vault rotated credentials against the new instance. The InnoDB Cluster has been dark since — Phase 4 was to remove the dead code and decommission its cluster-side Helm state.

Memory #711 explicitly notes Phase 4 as: "Remove helm_release.mysql_cluster + mysql_operator + namespace + RBAC + Delete PVC datadir-mysql-cluster-0 (30Gi) + Delete mysql-operator namespace + CRDs + stale Vault roles."

## This change

Phase 4 scope executed in this session (beads code-qai):

1. `terragrunt destroy -target` against 6 resources in the dbaas Tier 0 stack (see the destroy sketch below):
   - `module.dbaas.helm_release.mysql_cluster` — uninstalled the InnoDBCluster CR, the MySQL Router Deployment, and 8 Services (mysql-cluster, -instances, ports 6446/6448/6447/6449/6450/8443, etc.)
   - `module.dbaas.helm_release.mysql_operator` — uninstalled the MySQL Operator Deployment, the InnoDBCluster CRD + webhook, and the operator ClusterRoles
   - `module.dbaas.kubernetes_namespace.mysql_operator` — deleted the namespace
   - `module.dbaas.kubernetes_cluster_role.mysql_sidecar_extra` — leftover permissions patch that existed to work around the sidecar's kopf permissions bug; unused without the operator
   - `module.dbaas.kubernetes_cluster_role_binding.mysql_sidecar_extra`
   - `module.dbaas.kubernetes_config_map.mysql_extra_cnf` — used to override `innodb_doublewrite=OFF` via a subPath mount; the standalone does not need it
2. `kubectl delete pvc datadir-mysql-cluster-0 -n dbaas` — Helm does not garbage-collect PVCs; 30Gi reclaimed.
3. Removed 295 lines (lines 86–380) from `stacks/dbaas/modules/dbaas/main.tf`, covering the `#### MYSQL — InnoDB Cluster via MySQL Operator` section and all six resources above.

The first destroy hit a Helm timeout on the `mysql-cluster` uninstall ("context deadline exceeded"). Uninstallation had in fact completed cluster-side by that point, but TF rolled back the state delta. A second `terragrunt destroy -target` call with the same args resolved cleanly — it destroyed the remaining 2 tracked resources (the first pass cleared 4) and encrypted and committed the Tier 0 state.

## What is NOT in this change

- CRDs (`innodbclusters.mysql.oracle.com`, etc.) — nothing left to remove; Helm deleted these on uninstall. Verified clean: `kubectl get crd | grep mysql.oracle.com` returns nothing.
- Orphan PVC `datadir-mysql-cluster-0` — already deleted via kubectl; not a TF-managed resource.
- Stale Vault DB roles (health, linkwarden, affine, woodpecker, claude_memory, crowdsec, technitium) for services migrated MySQL→PG — the sandbox denies `vault list database/roles` as credential scouting, so the user handles this manually.
- The 2 state commits preceding this one (`30fa411b`, `6cf3575e`) are automatic SOPS-encrypted-state commits produced by `scripts/tg` after each `terragrunt destroy` pass. Standard Tier 0 workflow.
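## Destroy sketch

For reference, a minimal sketch of the Phase 4 destroy sequence described above, assuming the `../../scripts/tg` wrapper forwards its arguments to `terragrunt` (the Verification commands below suggest it does). The resource addresses and the PVC deletion are the ones listed under "This change"; flags and quoting here are illustrative, not the exact invocation.

```
# Sketch only; assumes ../../scripts/tg forwards args straight to terragrunt.
cd stacks/dbaas

# Targeted destroy of the six InnoDB Cluster / Operator resources.
# If the mysql-cluster Helm uninstall times out ("context deadline exceeded"),
# re-run with the same args: the uninstall completes cluster-side, only the
# Terraform state delta gets rolled back.
../../scripts/tg destroy \
  -target='module.dbaas.helm_release.mysql_cluster' \
  -target='module.dbaas.helm_release.mysql_operator' \
  -target='module.dbaas.kubernetes_namespace.mysql_operator' \
  -target='module.dbaas.kubernetes_cluster_role.mysql_sidecar_extra' \
  -target='module.dbaas.kubernetes_cluster_role_binding.mysql_sidecar_extra' \
  -target='module.dbaas.kubernetes_config_map.mysql_extra_cnf'

# Helm does not garbage-collect PVCs; reclaim the 30Gi datadir separately.
kubectl delete pvc datadir-mysql-cluster-0 -n dbaas
```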
## Verification

```
$ helm list -A | grep -E 'mysql-cluster|mysql-operator'
(no output)

$ kubectl get ns mysql-operator
Error from server (NotFound): namespaces "mysql-operator" not found

$ kubectl get pvc -n dbaas datadir-mysql-cluster-0
Error from server (NotFound): persistentvolumeclaims "datadir-mysql-cluster-0" not found

$ kubectl get pod -n dbaas -l app.kubernetes.io/instance=mysql-standalone
NAME                 READY   STATUS    RESTARTS       AGE
mysql-standalone-0   1/1     Running   1 (118m ago)   2d

$ ../../scripts/tg state list | grep -i 'mysql_operator\|mysql_cluster\|mysql_sidecar\|mysql_extra_cnf'
(no output)

$ ../../scripts/tg plan | grep -E 'mysql_cluster|mysql_operator|mysql_sidecar|mysql_extra_cnf'
(no output — Wave 2 drift is gone; remaining plan items are pre-existing drift unrelated to this change, see Wave 3 + in-flight payslip work)
```

## Reproduce locally

1. `git pull`
2. `cd stacks/dbaas && ../../scripts/tg state list | grep mysql_cluster` → no output
3. `helm list -A | grep mysql-cluster` → no output

Closes: code-qai
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 stacks/dbaas/modules/dbaas/main.tf | 295 -----------------------------
 1 file changed, 295 deletions(-)

diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf
index 31896be8..a7fa3cc1 100644
--- a/stacks/dbaas/modules/dbaas/main.tf
+++ b/stacks/dbaas/modules/dbaas/main.tf
@@ -83,301 +83,6 @@ module "tls_secret" {
   tls_secret_name = var.tls_secret_name
 }
 
-
-#### MYSQL — InnoDB Cluster via MySQL Operator
-#
-# 3 MySQL servers with Group Replication + 1 MySQL Router for auto-failover.
-# Operator installed in mysql-operator namespace (toleration for control-plane).
-# Init containers are slow (~20 min each) due to mysqlsh plugin loading.
-
-resource "kubernetes_namespace" "mysql_operator" {
-  metadata {
-    name = "mysql-operator"
-    labels = {
-      tier = "1-cluster"
-    }
-  }
-}
-
-resource "helm_release" "mysql_operator" {
-  namespace = kubernetes_namespace.mysql_operator.metadata[0].name
-  create_namespace = false
-  name = "mysql-operator"
-  timeout = 300
-
-  repository = "https://mysql.github.io/mysql-operator/"
-  chart = "mysql-operator"
-  version = "2.2.7"
-
-  # NOTE: The mysql-operator chart (2.2.7) does NOT expose a resources values key.
-  # The resources block below is ignored by the chart. Without explicit resources
-  # on the deployment, the LimitRange default (256Mi) applies silently.
-  # Fix: kubectl patch deployment mysql-operator -n mysql-operator --type=json \
-  #   -p='[{"op":"replace","path":"/spec/template/spec/containers/0/resources","value":{"requests":{"cpu":"100m","memory":"256Mi"},"limits":{"memory":"512Mi"}}}]'
-  values = [yamlencode({
-    resources = {
-      requests = {
-        cpu = "100m"
-        memory = "256Mi"
-      }
-      limits = {
-        memory = "512Mi"
-      }
-    }
-  })]
-}
-
-# The mysql-sidecar ClusterRole created by the Helm chart is missing
-# namespace and CRD list/watch permissions needed by the kopf framework
-# in the sidecar container. Without these, the sidecar enters degraded
-# mode and never completes InnoDB cluster join operations.
-resource "kubernetes_cluster_role" "mysql_sidecar_extra" { - metadata { - name = "mysql-sidecar-extra" - } - rule { - api_groups = [""] - resources = ["namespaces"] - verbs = ["list", "watch"] - } - rule { - api_groups = ["apiextensions.k8s.io"] - resources = ["customresourcedefinitions"] - verbs = ["list", "watch"] - } -} - -resource "kubernetes_cluster_role_binding" "mysql_sidecar_extra" { - metadata { - name = "mysql-sidecar-extra" - } - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "ClusterRole" - name = kubernetes_cluster_role.mysql_sidecar_extra.metadata[0].name - } - subject { - kind = "ServiceAccount" - name = "mysql-cluster-sa" - namespace = kubernetes_namespace.dbaas.metadata[0].name - } -} - -# ConfigMap for MySQL extra config — mounted as subPath over 99-extra.cnf -# This is the only reliable way to persist innodb_doublewrite=OFF because: -# - spec.mycnf only applies on initial cluster creation -# - The operator's initconf container overwrites 99-extra.cnf on every pod start -# - SET PERSIST doesn't support innodb_doublewrite (static variable) -resource "kubernetes_config_map" "mysql_extra_cnf" { - metadata { - name = "mysql-extra-cnf" - namespace = kubernetes_namespace.dbaas.metadata[0].name - } - data = { - "99-extra.cnf" = <<-EOT - [mysqld] - innodb_doublewrite=OFF - EOT - } -} - -resource "helm_release" "mysql_cluster" { - namespace = kubernetes_namespace.dbaas.metadata[0].name - create_namespace = false - name = "mysql-cluster" - timeout = 900 - - repository = "https://mysql.github.io/mysql-operator/" - chart = "mysql-innodbcluster" - version = "2.2.7" - - values = [yamlencode({ - serverInstances = 1 - routerInstances = 1 - serverVersion = "8.4.4" - - credentials = { - root = { - user = "root" - password = var.dbaas_root_password - host = "%" - } - } - - tls = { - useSelfSigned = true - } - - datadirVolumeClaimTemplate = { - storageClassName = "proxmox-lvm-encrypted" - metadata = { - annotations = { - "resize.topolvm.io/threshold" = "80%" - "resize.topolvm.io/increase" = "20%" - "resize.topolvm.io/storage_limit" = "100Gi" - } - } - resources = { - requests = { - storage = "30Gi" - } - } - } - - serverConfig = { - mycnf = <<-EOT - [mysqld] - skip-name-resolve - mysql-native-password=ON - # Auto-recovery after crashes: rejoin group without manual intervention - group_replication_autorejoin_tries=2016 - group_replication_exit_state_action=OFFLINE_MODE - group_replication_member_expel_timeout=30 - group_replication_unreachable_majority_timeout=60 - group_replication_start_on_boot=ON - # Cap XCom cache to prevent unbounded growth (default 1GB causes OOM) - group_replication_message_cache_size=134217728 - # Reduce log buffer (16MB sufficient for this workload, was 64MB) - innodb_log_buffer_size=16777216 - # Limit connections (peak usage ~40, no need for 151) - max_connections=80 - # --- Disk write reduction (HDD/LVM thin) --- - # Flush redo log once per second, not per commit. Up to 1s data loss on MySQL crash, - # but group replication provides redundancy across 3 nodes. - innodb_flush_log_at_trx_commit=0 - # OS decides when to flush binlog (not per commit) - sync_binlog=0 - # HDD-tuned I/O capacity (default 200/2000 is for SSD) - innodb_io_capacity=100 - innodb_io_capacity_max=200 - # 1GB redo log capacity — larger log means less frequent checkpoint flushes - innodb_redo_log_capacity=1073741824 - # 1GB buffer pool - innodb_buffer_pool_size=1073741824 - # Disable doublewrite — halves write amplification. 
Safe with group replication - # (crashed node can re-clone from healthy replica rather than relying on local recovery) - innodb_doublewrite=OFF - # Flush neighbors on HDD (coalesce adjacent dirty pages into single I/O) - innodb_flush_neighbors=1 - # Reduce page cleaner aggressiveness - innodb_lru_scan_depth=256 - innodb_page_cleaners=1 - # Reduce adaptive flushing — let dirty pages accumulate longer before background flush - innodb_adaptive_flushing_lwm=10 - innodb_max_dirty_pages_pct=90 - innodb_max_dirty_pages_pct_lwm=10 - EOT - } - - # Top-level resources apply to SIDECAR container - # VPA shows sidecar needs only 248Mi target / 334Mi upper bound - # Setting to 350Mi (was 2Gi/4Gi - 17× over-provisioned) - resources = { - requests = { - cpu = "250m" - memory = "350Mi" - } - limits = { - memory = "350Mi" - } - } - - podSpec = { - affinity = { - nodeAffinity = { - requiredDuringSchedulingIgnoredDuringExecution = { - nodeSelectorTerms = [{ - matchExpressions = [{ - key = "kubernetes.io/hostname" - operator = "NotIn" - values = ["k8s-node1"] - }] - }] - } - } - podAntiAffinity = { - preferredDuringSchedulingIgnoredDuringExecution = [{ - weight = 100 - podAffinityTerm = { - labelSelector = { - matchLabels = { - "component" = "mysqld" - } - } - topologyKey = "kubernetes.io/hostname" - } - }] - } - } - # Container-specific resources for MYSQL container - # VPA shows 2.98Gi target / 5.26Gi upper bound - # Current usage ~1.8Gi peak. Reducing limit from 4Gi to 3Gi - containers = [ - { - name = "mysql" - resources = { - requests = { - memory = "2Gi" - cpu = "250m" - } - limits = { - memory = "3Gi" - } - } - }, - { - # MySQL operator sidecar (kopf Python control loop) - # VPA upper bound: 334Mi. Was 6Gi limit — 17× over-provisioned. - name = "sidecar" - resources = { - requests = { - memory = "350Mi" - cpu = "50m" - } - limits = { - memory = "512Mi" - } - } - } - ] - initContainers = [ - { - name = "fixdatadir" - resources = { - requests = { memory = "64Mi", cpu = "25m" } - limits = { memory = "64Mi" } - } - }, - { - name = "initconf" - resources = { - requests = { memory = "256Mi", cpu = "50m" } - limits = { memory = "256Mi" } - } - }, - { - name = "initmysql" - resources = { - requests = { memory = "512Mi", cpu = "250m" } - limits = { memory = "512Mi" } - } - } - ] - } - - # MySQL Router - explicitly set resources (chart does not expose router.resources) - # VPA shows 100Mi upper bound, setting to 128Mi - # Note: This requires manual kubectl patch after helm release: - # kubectl patch deployment mysql-cluster-router -n dbaas --type=json -p='[ - # {"op": "replace", "path": "/spec/template/spec/containers/0/resources", - # "value": {"requests": {"cpu": "25m", "memory": "128Mi"}, "limits": {"memory": "128Mi"}}}]' - # TODO: migrate to mysql-operator fork or wait for upstream router.resources support - - })] - - depends_on = [helm_release.mysql_operator] -} - #### MYSQL — Standalone (migration target) # # Standalone MySQL without Group Replication. Eliminates ~95 GB/day of GR