fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]

6d224861 came from a --no-checkout worktree whose empty index made the
commit drop every file except two. This restores 05b50d2b's full tree and
correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su
entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the
live infra was never applied from the broken commit.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-09 08:45:33 +00:00
parent 6d224861c4
commit fd0f4a0365
1166 changed files with 358546 additions and 0 deletions

152
stacks/nextcloud/.terraform.lock.hcl generated Normal file
View file

@ -0,0 +1,152 @@
# This file is maintained automatically by "terraform init".
# Manual edits may be lost in future updates.
provider "registry.terraform.io/cloudflare/cloudflare" {
version = "4.52.7"
constraints = "~> 4.0"
hashes = [
"h1:pPItIWii5oymR+geZB219ROSPuSODPLTlM4S/u8xLvM=",
"zh:0c904ce31a4c6c4a5b3bf7ff1560e77c0cc7e2450c8553ded8e8c90398e1418b",
"zh:36183d310c36373fe4cb936b83c595c6fd3b0a94bc7827f28e5789ccbf59752e",
"zh:556a568a6f0235e8f41647de9e4d3a1e7b1d6502df8b19b54ec441f1c653ea10",
"zh:633ebbd5b0245e75e500ef9be4d9e62288f97e8da3baaa51323892a786d90285",
"zh:6acfe60cf52a65ba8f044f748548d2119e7f4fd7f8ebcb14698960d87c68f529",
"zh:890df766e9b839623b1f0437355032a3c006226a6c200cd911e15ee1a9014e9f",
"zh:904acc31ebb9d6ef68c792074b30532ee61bf515f19e0a3c75b46f126cca1f13",
"zh:a1d0a81246afc8750286d3f6fe7a8fbe6460dd2662407b28dbfbabb612e5fa9d",
"zh:a41a36fe253fc365fe2b7ffc749624688b2693b4634862fda161179ab100029f",
"zh:a7ef269e77ffa8715c8945a2c14322c7ff159ea44c15f62505f3cbb2cae3b32d",
"zh:b01aa3bed30610633b762df64332b26f8844a68c3960cebcb30f04918efc67fe",
"zh:b069cc2cd18cae10757df3ae030508eac8d55de7e49eda7a5e3e11f2f7fe6455",
"zh:b2d2c6313729ebb7465dceece374049e2d08bda34473901be9ff46a8836d42b2",
"zh:db0e114edaf4bc2f3d4769958807c83022bfbc619a00bdf4c4bd17faa4ab2d8b",
"zh:ecc0aa8b9044f664fd2aaf8fa992d976578f78478980555b4b8f6148e8d1a5fe",
]
}
provider "registry.terraform.io/gavinbunney/kubectl" {
version = "1.19.0"
constraints = "~> 1.14"
hashes = [
"h1:9QkxPjp0x5FZFfJbE+B7hBOoads9gmdfj9aYu5N4Sfc=",
"zh:1dec8766336ac5b00b3d8f62e3fff6390f5f60699c9299920fc9861a76f00c71",
"zh:43f101b56b58d7fead6a511728b4e09f7c41dc2e3963f59cf1c146c4767c6cb7",
"zh:4c4fbaa44f60e722f25cc05ee11dfaec282893c5c0ffa27bc88c382dbfbaa35c",
"zh:51dd23238b7b677b8a1abbfcc7deec53ffa5ec79e58e3b54d6be334d3d01bc0e",
"zh:5afc2ebc75b9d708730dbabdc8f94dd559d7f2fc5a31c5101358bd8d016916ba",
"zh:6be6e72d4663776390a82a37e34f7359f726d0120df622f4a2b46619338a168e",
"zh:72642d5fcf1e3febb6e5d4ae7b592bb9ff3cb220af041dbda893588e4bf30c0c",
"zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425",
"zh:a1da03e3239867b35812ee031a1060fed6e8d8e458e2eaca48b5dd51b35f56f7",
"zh:b98b6a6728fe277fcd133bdfa7237bd733eae233f09653523f14460f608f8ba2",
"zh:bb8b071d0437f4767695c6158a3cb70df9f52e377c67019971d888b99147511f",
"zh:dc89ce4b63bfef708ec29c17e85ad0232a1794336dc54dd88c3ba0b77e764f71",
"zh:dd7dd18f1f8218c6cd19592288fde32dccc743cde05b9feeb2883f37c2ff4b4e",
"zh:ec4bd5ab3872dedb39fe528319b4bba609306e12ee90971495f109e142d66310",
"zh:f610ead42f724c82f5463e0e71fa735a11ffb6101880665d93f48b4a67b9ad82",
]
}
provider "registry.terraform.io/goauthentik/authentik" {
version = "2024.12.1"
constraints = "~> 2024.10"
hashes = [
"h1:roBMd+gi+TGgikH/bMzEI8JfvJiMAQWt+8FmokCrQIs=",
"zh:090260dc7889ea822ec1d899344e1ee23eba5290461989c0796149c9511f2316",
"zh:13c2655ff824b0dc4b9bb832b5ca6d41dba97cb280330258c5fef4115e236209",
"zh:166a73c3a810c9c895d68a8ff968158f339f8a2c1c03e20ec9fc5ed99cc64e20",
"zh:203777eae1cdc711233315499643180604cff2324411b186b7cf07fdbe16f655",
"zh:3b2f18c9a8d28dac74dc6bbf168c946855ab9c68f053578d4630c50d5eaf30a0",
"zh:4822275985f6b74b6196c47112316a4252db22cf4ceaef7c9ab4c66d488abf2f",
"zh:53ea97562666c8a5a2f6d63d418a302a7f8ee4b7bb7da35dedaa89aa5708b7f0",
"zh:56b8a230901e3550c92a1d3f58ee9dafe9853f30fe4315af3ab28ae63262e15d",
"zh:6293ab7b1fd8206a0c853591f50186aca4a1eff117b2a773e10760a23a2c83e9",
"zh:9433970f79fb92d8aae3ee436db5630ab312c78b6dc9df9c1db3273a18f8aaa1",
"zh:95df406214f79b3b98222d7c7fe8fc319a3d90b7a9d53e1d5abbda5dfb8b9436",
"zh:a85880da0552a42c8f449390fbd7d8b03541d1a13e04bba9f1404fa658754260",
"zh:a95f6e9bd62c67e70eba1b1a14728856b9a6a28cd1e5e3be54a7718882c87e7f",
"zh:dd599b51c5beb34a4c6feece244fde07d2558d69929449ab1fd39a5ebe738781",
]
}
provider "registry.terraform.io/hashicorp/helm" {
version = "3.1.1"
hashes = [
"h1:47CqNwkxctJtL/N/JuEj+8QMg8mRNI/NWeKO5/ydfZU=",
"h1:5b2ojWKT0noujHiweCds37ZreRFRQLNaErdJLusJN88=",
"zh:1a6d5ce931708aec29d1f3d9e360c2a0c35ba5a54d03eeaff0ce3ca597cd0275",
"zh:3411919ba2a5941801e677f0fea08bdd0ae22ba3c9ce3309f55554699e06524a",
"zh:81b36138b8f2320dc7f877b50f9e38f4bc614affe68de885d322629dd0d16a29",
"zh:95a2a0a497a6082ee06f95b38bd0f0d6924a65722892a856cfd914c0d117f104",
"zh:9d3e78c2d1bb46508b972210ad706dd8c8b106f8b206ecf096cd211c54f46990",
"zh:a79139abf687387a6efdbbb04289a0a8e7eaca2bd91cdc0ce68ea4f3286c2c34",
"zh:aaa8784be125fbd50c48d84d6e171d3fb6ef84a221dbc5165c067ce05faab4c8",
"zh:afecd301f469975c9d8f350cc482fe656e082b6ab0f677d1a816c3c615837cc1",
"zh:c54c22b18d48ff9053d899d178d9ffef7d9d19785d9bf310a07d648b7aac075b",
"zh:db2eefd55aea48e73384a555c72bac3f7d428e24147bedb64e1a039398e5b903",
"zh:ee61666a233533fd2be971091cecc01650561f1585783c381b6f6e8a390198a4",
"zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c",
]
}
provider "registry.terraform.io/hashicorp/kubernetes" {
version = "3.1.0"
hashes = [
"h1:oodIAuFMikXNmEtil5MQgP4dfSctUBYQiGJfjbsF3NY=",
"zh:0215c5c60be62028c09a2f22458e89cda3ef5830a632299f1d401eb3538874b0",
"zh:09ebb9f442431e278a310a9423f32caf467cb4b3cad3fe59573ca71fa7b14e20",
"zh:0c4e5912f83bb35846ae0a9ae54fc320706ee61894cd21cc6b4181b1c5a2fa5c",
"zh:1678c982853ad461e65ccb5e79d585e13ed109dd47dab2a66d3a7a304faeef65",
"zh:1c050a5c15e330457a9c18caacf61a923c59d663e13f2962e4b32f04fef523a0",
"zh:2c55bcec83be58ec132c7cb0a1ac644758b800d794fdc636d53a0eada0358a3a",
"zh:a062bb0aa316c08d8460c66a5d68da71da40de5d3bc3b31abcf3a1a9a19650f1",
"zh:a26fdea0afaa9b247c73c0b42843ca51ba7db0ac2571f9d3d50dcabd20ca1b98",
"zh:c872c9385a78d502bf5823d61cd3bb0f9a0585030e025eb12585c83451beeaa1",
"zh:f180879af931182beee4c8c0d9dab62b81d86f17ddcbe3786ef4c7cec9163a4e",
"zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c",
"zh:f70f5789264069e0eef06f9b5d5fde955ef7206f7d446d1ce51a4c37a3f3e02f",
]
}
provider "registry.terraform.io/hashicorp/vault" {
version = "4.8.0"
constraints = "~> 4.0"
hashes = [
"h1:GPfhH6dr1LY0foPBDYv9bEGifx7eSwYqFcEAOWOUxLk=",
"h1:aHqgWQhDBMeZO9iUKwJYMlh4q+xNMUlMIcjRbF4d02Y=",
"zh:269ab13433f67684012ae7e15876532b0312f5d0d2002a9cf9febb1279ce5ea6",
"zh:4babc95bf0c40eb85005db1dc2ca403c46be4a71dd3e409db3711a56f7a5ca0e",
"zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
"zh:86e27c1c625ecc24446a11eeffc3ac319b36c2b4e51251db8579256a0dbcf136",
"zh:a32f31da94824009e26b077374440b52098aecb93c92ff55dc3d31dd37c4ea25",
"zh:be0a18c6c0425518bab4fbffd82078b82036a88503b5d76064de551c9f646cbf",
"zh:be5a77fdfd36863ebeec79cd12b1d13322ffad6821d157a0b279789fa06b5937",
"zh:be8317d142a3caad74c7d936039ae27076a1b2b8312ef5208e2871a5f525977c",
"zh:c94a84895a3d9954b80e983eed4603330a5cdbbd8eef5b3c99278c2d1402ef3c",
"zh:de1fb712784dd8415f011ca5346a34f87fab6046c730557615247e511dbc7d98",
"zh:e3eafae7da550f86cae395d6660b2a0e93ec8d2b0e0e5ef982ec762e961fc952",
"zh:ff35fb1ab6add288f0f368981e56f780b50405accd1937131cba1137999c8d83",
]
}
provider "registry.terraform.io/telmate/proxmox" {
version = "3.0.2-rc07"
constraints = "3.0.2-rc07"
hashes = [
"h1:zp5hpQJQ4t4zROSLqdltVpBO+Riy9VugtfFbpyTw1aM=",
"zh:2ee860cd0a368b3eaa53f4a9ea46f16dab8a97929e813ea6ef55183f8112c2ca",
"zh:415965fd915bae2040d7f79e45f64d6e3ae61149c10114efeac1b34687d7296c",
"zh:6584b2055df0e32062561c615e3b6b2c291ca8c959440adda09ef3ec1e1436bd",
"zh:65dcfad71928e0a8dd9befc22524ed686be5020b0024dc5cca5184c7420eeb6b",
"zh:7253dc29bd265d33f2791ac4f779c5413f16720bb717de8e6c5fcb2c858648ea",
"zh:7ec8993da10a47606670f9f67cfd10719a7580641d11c7aa761121c4a2bd66fb",
"zh:999a3f7a9dcf517967fc537e6ec930a8172203642fb01b8e1f78f908373db210",
"zh:a50e6df7280eb6584a5fd2456e3f5b6df13b2ec8a7fa4605511e438e1863be42",
"zh:b25b329a1e42681c509d027fee0365414f0cc5062b65690cfc3386aab16132ae",
"zh:c028877fdb438ece48f7bc02b65bbae9ca7b7befbd260e519ccab6c0cbb39f26",
"zh:cf0eaa3ea9fcc6d62793637947f1b8d7c885b6ad74695ab47e134e4ff132190f",
"zh:d5ade3fae031cc629b7c512a7b60e46570f4c41665e88a595d7efd943dde5ab2",
"zh:f388c15ad1ecfc09e7361e3b98bae9b627a3a85f7b908c9f40650969c949901c",
"zh:f415cc6f735a3971faae6ac24034afdb9ee83373ef8de19a9631c187d5adc7db",
]
}

View file

@ -0,0 +1,191 @@
# image.tag is not hard-coded: it is the templatefile variable `image_tag`,
# which stacks/nextcloud/main.tf (`data.kubernetes_resources.nextcloud_live`
# + locals) fills with the tag of the CURRENT live Deployment, falling back to
# var.nextcloud_image_tag_floor (32.0.9) on fresh install / DR. In steady state
# this makes helm upgrades image-no-ops, and a re-render can NEVER downgrade
# below the Keel-bumped live tag (the 2026-06-01 CrashLoop: a pinned 32.0.3
# lost to live 32.0.9 and Nextcloud refused the downgrade). Keel bumps the live
# tag upward within 32.0.x (keel.sh/policy=patch, stamped by Kyverno — see the
# namespace comments in main.tf); the next apply just follows it.
# flavor=apache renders the bare apache-default tag (live image is
# `nextcloud:<tag>`, no -apache suffix).
image:
  flavor: apache
  tag: "${image_tag}"

nextcloud:
  host: nextcloud.viktorbarzin.me
  trustedDomains:
    - nextcloud.viktorbarzin.me
  # mail:
  #   enabled: true
  #   # the user we send email as
  #   fromAddress: nextcloud@viktorbarzin.me
  #   # the domain we send email from
  #   domain: viktorbarzin.me
  #   smtp:
  #     host: mail.viktorbarzin.me
  #     secure: starttls
  #     port: 587
  #     authtype: LOGIN
  #     name: nextcloud@viktorbarzin.me
  #     password:
  extraEnv:
    - name: TRUSTED_PROXIES
      value: "10.0.0.0/8"
    - name: PHP_MEMORY_LIMIT
      value: "512M"
    - name: PHP_UPLOAD_LIMIT
      value: "16G"
    # - name: mail_smtpdebug
    #   value: "true"
    # - name: loglevel
    #   value: "0"

  # Extra config.php fragments, one file per key.
  configs:
    zzz-redis.config.php: |
      <?php
      // Redis via HAProxy master-only service. HAProxy (3 replicas, PDB
      // minAvailable=2) health-checks all v2 pods via `INFO replication` and
      // routes to the current role:master. Sentinel failover takes <30s, and
      // HAProxy detects the new master via its 1s tcp-check interval and
      // starts routing within ~3s of detection. Removed the old in-process
      // sentinel-query loop on 2026-04-19 after the Redis rework — see
      // beads code-v2b and infra/docs/architecture/databases.md.
      $CONFIG = array(
      'memcache.distributed' => '\\OC\\Memcache\\Redis',
      'memcache.locking' => '\\OC\\Memcache\\Redis',
      'redis' => array(
      'host' => 'redis-master.redis.svc.cluster.local',
      'port' => 6379,
      'password' => '',
      'timeout' => 1.5,
      'read_timeout' => 1.5,
      ),
      );
    performance.config.php: |
      <?php
      $CONFIG = array(
      'loglevel' => 2,
      // Cap + rotate nextcloud.log. Without this it grew unbounded to
      // 10GB+ and bloated every backup (2026-06-01 space incident).
      // At 10MB the log rotates to nextcloud.log.1 (1 kept) → ~20MB max.
      'log_rotate_size' => 10485760,
      'mail_smtpdebug' => false,
      );
    zzz-mysql.config.php: |
      <?php
      $CONFIG = array(
      'mysql.utf8mb4' => true,
      );

  # Extra PHP ini fragments.
  phpConfigs:
    zzz-custom.ini: |
      max_execution_time = 300
      max_input_time = 300
      default_socket_timeout = 300
      opcache.enable_file_override = 1
      apc.shm_size = 128M

  # Volumes backed by ConfigMaps / PVCs that main.tf (and its modules) create.
  extraVolumes:
    - name: apache-tuning
      configMap:
        name: nextcloud-apache-tuning
    - name: db-password-sync
      configMap:
        name: nextcloud-db-password-sync
        defaultMode: 0755
    - name: pve-nfs
      persistentVolumeClaim:
        claimName: nextcloud-pve-nfs-root
    - name: pve-nfs-ssd
      persistentVolumeClaim:
        claimName: nextcloud-pve-nfs-ssd-root
  extraVolumeMounts:
    - name: apache-tuning
      mountPath: /etc/apache2/mods-available/mpm_prefork.conf
      subPath: mpm_prefork.conf
    - name: db-password-sync
      mountPath: /docker-entrypoint-hooks.d/before-starting
    - name: pve-nfs
      mountPath: /mnt/pve-nfs
    - name: pve-nfs-ssd
      mountPath: /mnt/pve-nfs-ssd

internalDatabase:
  enabled: false

externalRedis:
  enabled: false

# MySQL host comes from the templatefile variable `mysql_host`; credentials are
# read from the nextcloud-db-creds Secret (keys db-username / DB_PASSWORD).
externalDatabase:
  enabled: true
  type: mysql
  host: ${mysql_host}
  user: nextcloud
  database: nextcloud
  existingSecret:
    secretName: nextcloud-db-creds
    usernameKey: db-username
    passwordKey: DB_PASSWORD

persistence:
  enabled: true
  existingClaim: nextcloud-data-encrypted
  accessMode: ReadWriteOnce
  size: 20Gi

startupProbe:
  enabled: true
  initialDelaySeconds: 30
  periodSeconds: 10
  timeoutSeconds: 30
  failureThreshold: 60
  successThreshold: 1

livenessProbe:
  enabled: true
  initialDelaySeconds: 30
  periodSeconds: 60
  timeoutSeconds: 30
  failureThreshold: 10
  successThreshold: 1

readinessProbe:
  enabled: true
  initialDelaySeconds: 30
  periodSeconds: 60
  timeoutSeconds: 30
  failureThreshold: 5
  successThreshold: 1

podAnnotations:
  diun.enable: "true"
  diun.include_tags: "^[0-9]+(?:.[0-9]+)?(?:.[0-9]+)?.*"
  dependency.kyverno.io/wait-for: "mysql.dbaas:3306,redis-master.redis:6379"
  secret.reloader.stakater.com/reload: "nextcloud-db-creds"

# OnRootMismatch: kubelet only recursively chowns the volume to fsGroup if the
# root dir's GID doesn't already match. Without this, every pod restart triggers
# a ~30-min recursive chown of /srv/nfs and /srv/nfs-ssd (600k+ files) — the
# default policy "Always" recurses every time. Locks fsGroup=33 explicitly so
# this block fully replaces the chart's default {fsGroup: 33}.
securityContext:
  fsGroup: 33
  fsGroupChangePolicy: OnRootMismatch

collabora:
  enabled: false # Using onlyoffice instead

resources:
  limits:
    memory: 8Gi
  requests:
    cpu: 50m
    memory: 256Mi

cronjob:
  enabled: true
  resources:
    limits:
      memory: 384Mi
    requests:
      cpu: 25m
      memory: 384Mi

View file

@ -0,0 +1,322 @@
# Nextcloud Files External bootstrap — mount-per-archive + applicable_users model.
# Creates two admin-only root browser mounts (PVE NFS Pool, PVE NFS-SSD Pool)
# pointing at the NFS roots mounted at /mnt/pve-nfs and /mnt/pve-nfs-ssd inside
# the Nextcloud container, plus per-archive mounts visible only to the named
# users. Safe to re-run — the bootstrap Job is idempotent.
#
# ACL model (verified via context7 + NC docs):
# Mount visibility is controlled by `occ files_external:applicable`.
# A mount with no applicable users/groups is visible to ALL users, so we
# always set at least one applicable group (admin) or user list.
#
# occ commands used (syntax verified via context7):
# files_external:create <mountPoint> local null::null --config "datadir=<dir>"
# files_external:list --output=json   (returns an array; each entry has numeric
# .mount_id, .applicable_users [], .applicable_groups [])
# files_external:applicable <mountId> --add-user=<user>
# files_external:applicable <mountId> --remove-user=<user>
# files_external:applicable <mountId> --add-group=<group>
# files_external:applicable <mountId> --remove-group=<group>
#
# Note: `files_external:applicable` has NO --output=json flag (write-only command).
# Current applicable state is read from files_external:list --output=json instead.
#
# NO Files Access Control. Drop the workflow-engine machinery entirely.
# External storage manifest (JSON)
# Declarative list of Files External mounts, serialized to manifest.json and
# consumed by the bootstrap Job (mounted at /manifest).
resource "kubernetes_config_map_v1" "nextcloud_external_storage_manifest" {
  metadata {
    name      = "nextcloud-external-storage-manifest"
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
  }

  data = {
    "manifest.json" = jsonencode({
      # enableSharing: lets users right-click a folder inside the mount and
      # share it with another NC user/group/public link. NC defaults to false
      # for local-backend mounts; we opt-in per-mount. Currently true on the
      # admin pool browsers (admin uses them as a "share-from picker"); false
      # on /anca-elements (anca manages her own re-sharing inside her view).

      # Whole-pool browsers, restricted to a single applicable group each.
      rootMounts = [
        {
          mountPoint      = "/PVE NFS Pool"
          dataDir         = "/mnt/pve-nfs"
          applicableGroup = "admin"
          enableSharing   = true
        },
        {
          mountPoint      = "/PVE NFS-SSD Pool"
          dataDir         = "/mnt/pve-nfs-ssd"
          applicableGroup = "admin"
          enableSharing   = true
        },
      ]

      # Per-archive mounts, restricted to explicit user and/or group lists.
      archiveMounts = [
        {
          mountPoint = "/anca-elements"
          dataDir    = "/mnt/pve-nfs/anca-elements"
          # NC usernames (not display names): admin is Viktor, anca is Anca.
          applicableUsers  = ["anca", "admin"]
          applicableGroups = []
          enableSharing    = false
        },
      ]
    })
  }
}
# RBAC for the bootstrap Job
# Identity the bootstrap Job's pod runs as; bound to the Role of the same name.
resource "kubernetes_service_account" "nextcloud_external_storage_bootstrap" {
  metadata {
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
    name      = "nextcloud-external-storage-bootstrap"
  }
}
# Namespace-scoped permissions for the bootstrap Job: find/watch pods and
# open an exec session into them. Nothing else is granted.
resource "kubernetes_role" "nextcloud_external_storage_bootstrap" {
  metadata {
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
    name      = "nextcloud-external-storage-bootstrap"
  }

  # Read access to pods (core API group).
  rule {
    api_groups = [""]
    resources  = ["pods"]
    verbs      = ["list", "get", "watch"]
  }

  # `kubectl exec` is a create on the pods/exec subresource.
  rule {
    api_groups = [""]
    resources  = ["pods/exec"]
    verbs      = ["create"]
  }
}
# Grants the bootstrap Role to the bootstrap ServiceAccount.
resource "kubernetes_role_binding" "nextcloud_external_storage_bootstrap" {
  metadata {
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
    name      = "nextcloud-external-storage-bootstrap"
  }

  subject {
    kind      = "ServiceAccount"
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
    name      = kubernetes_service_account.nextcloud_external_storage_bootstrap.metadata[0].name
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "Role"
    name      = kubernetes_role.nextcloud_external_storage_bootstrap.metadata[0].name
  }
}
# Bootstrap Job
# One-shot Job that reconciles Nextcloud "Files External" mounts against
# manifest.json (from the nextcloud-external-storage-manifest ConfigMap) by
# exec'ing `occ` inside the running Nextcloud pod. Re-running is safe: existing
# mounts are reused and only the applicable-user/group diff is applied.
resource "kubernetes_job_v1" "nextcloud_external_storage_bootstrap" {
  # The bootstrap script (below) waits up to 10m for the NC pod to be Ready.
  # kubernetes_job_v1's default create timeout is only 1m, which spuriously
  # fails the apply whenever the NC pod takes >1m to come up — e.g. now that
  # Keel auto-upgrades nextcloud, a bump mid-apply runs `occ upgrade` in the
  # entrypoint and delays readiness past 1m (observed 2026-06-01). Match the
  # script's 10m wait plus margin.
  timeouts {
    create = "12m"
  }

  metadata {
    name      = "nextcloud-external-storage-bootstrap"
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
  }

  spec {
    backoff_limit              = 5
    ttl_seconds_after_finished = 600

    template {
      metadata {}

      spec {
        restart_policy       = "OnFailure"
        service_account_name = kubernetes_service_account.nextcloud_external_storage_bootstrap.metadata[0].name

        container {
          name = "bootstrap"
          # bitnami/kubectl (debian-12 base) ships jq — no apt-get needed.
          # NOTE(review): `latest` is a mutable tag, so the kubectl/jq versions
          # this Job runs with are not reproducible — consider pinning.
          image = "bitnami/kubectl:latest"

          # HCL heredoc: only $${...} needs escaping; bare $VAR and $(...)
          # are passed through unchanged by HCL. No nested heredocs used.
          command = ["/bin/bash", "-c", <<-EOF
            set -euo pipefail
            trap 'echo "[bootstrap] FAIL at line $LINENO — exit $?"' ERR

            MANIFEST=/manifest/manifest.json
            NC_NS=nextcloud
            NC_LABEL="app.kubernetes.io/name=nextcloud"

            # 1. Wait for NC pod to be Ready
            echo "[bootstrap] Waiting for NC pod Ready (timeout 10m)..."
            kubectl wait -n "$NC_NS" pod \
              -l "$NC_LABEL" \
              --for=condition=Ready \
              --timeout=600s
            echo "[bootstrap] Pod is Ready."

            # 2. Resolve pod name
            NC_POD=$(kubectl get pods -n "$NC_NS" -l "$NC_LABEL" \
              -o jsonpath='{.items[0].metadata.name}')
            echo "[bootstrap] Target pod: $NC_POD"

            # 3. occ helper — must run as www-data
            nc_occ() {
              kubectl exec -n "$NC_NS" "$NC_POD" -c nextcloud -- \
                runuser -u www-data -- php /var/www/html/occ "$@"
            }

            # 4. Enable files_external (idempotent)
            nc_occ app:enable files_external || true
            # NO files_accesscontrol — that app is not used in this model.

            # 5. Helpers

            # get_mount_id <mountPoint>
            #   Reads files_external:list --output=json (array of mount objects).
            #   Each object has a numeric "mount_id" and a string "mount_point".
            #   Prints the id of the first match, or nothing when there is none.
            get_mount_id() {
              local MP="$1"
              nc_occ files_external:list --output=json 2>/dev/null \
                | jq -r --arg mp "$MP" \
                    '.[] | select(.mount_point == $mp) | .mount_id' \
                | head -1
            }

            # ensure_mount <mountPoint> <dataDir> — echoes the numeric mount id
            #   Callers capture stdout via $(...), so stdout must carry ONLY the
            #   id; every other message (including occ's own output) goes to
            #   stderr. bash clears errexit inside command substitutions, so
            #   failures are propagated explicitly with `|| return 1`.
            ensure_mount() {
              local MP="$1" DIR="$2"
              local MID
              # A failed lookup must not be mistaken for "mount does not exist",
              # otherwise a transient occ error would create a duplicate mount.
              MID=$(get_mount_id "$MP") || return 1
              if [ -z "$MID" ]; then
                echo "[bootstrap] Creating mount '$MP' -> $DIR" >&2
                # occ prints a human-readable confirmation on stdout; send it to
                # stderr so it cannot end up in the captured mount id.
                nc_occ files_external:create "$MP" local null::null \
                  --config "datadir=$DIR" >&2 || return 1
                MID=$(get_mount_id "$MP") || return 1
              else
                echo "[bootstrap] Mount '$MP' already exists (id=$MID)" >&2
              fi
              # Fail here with a clear message instead of letting a bad id break
              # `jq --argjson` later in sync_applicable.
              case "$MID" in
                ''|*[!0-9]*)
                  echo "[bootstrap] Could not resolve a numeric mount id for '$MP' (got '$MID')" >&2
                  return 1
                  ;;
              esac
              echo "$MID"
            }

            # sync_applicable <mountId> <desiredUsersJSON> <desiredGroupsJSON>
            #   Reads current applicable state from files_external:list --output=json
            #   (fields: applicable_users [], applicable_groups []).
            #   Diffs against desired sets; adds missing, removes extras.
            #   When no applicable users + no groups are set, NC treats the mount
            #   as visible to ALL — so desired sets must always be non-empty.
            #   For the same reason every addition (users AND groups) is applied
            #   before any removal: the applicable list never becomes empty
            #   part-way through a sync (e.g. when moving a mount from a user
            #   list to a group).
            #
            #   Process substitution `< <(jq ...)` feeds the loops directly: when
            #   jq emits no rows (already-synced state), the body never runs and
            #   the loop returns 0 — avoiding a set -e exit on a no-op re-run.
            sync_applicable() {
              local MID="$1" DESIRED_USERS_JSON="$2" DESIRED_GROUPS_JSON="$3"

              # Read current state from files_external:list --output=json
              local MOUNT_JSON
              MOUNT_JSON=$(nc_occ files_external:list --output=json 2>/dev/null \
                | jq -c --argjson mid "$MID" '.[] | select(.mount_id == $mid)')

              local CURRENT_USERS_JSON CURRENT_GROUPS_JSON
              CURRENT_USERS_JSON=$(echo "$MOUNT_JSON" \
                | jq -c '.applicable_users // []')
              CURRENT_GROUPS_JSON=$(echo "$MOUNT_JSON" \
                | jq -c '.applicable_groups // []')

              # --- additions first ---
              while IFS= read -r U; do
                nc_occ files_external:applicable "$MID" --add-user="$U"
              done < <(jq -rn \
                --argjson d "$DESIRED_USERS_JSON" \
                --argjson c "$CURRENT_USERS_JSON" \
                '($d - $c)[]')

              while IFS= read -r G; do
                nc_occ files_external:applicable "$MID" --add-group="$G"
              done < <(jq -rn \
                --argjson d "$DESIRED_GROUPS_JSON" \
                --argjson c "$CURRENT_GROUPS_JSON" \
                '($d - $c)[]')

              # --- removals last ---
              while IFS= read -r U; do
                nc_occ files_external:applicable "$MID" --remove-user="$U"
              done < <(jq -rn \
                --argjson d "$DESIRED_USERS_JSON" \
                --argjson c "$CURRENT_USERS_JSON" \
                '($c - $d)[]')

              while IFS= read -r G; do
                nc_occ files_external:applicable "$MID" --remove-group="$G"
              done < <(jq -rn \
                --argjson d "$DESIRED_GROUPS_JSON" \
                --argjson c "$CURRENT_GROUPS_JSON" \
                '($c - $d)[]')
            }

            # sync_option <mountId> <key> <value>
            #   Reconciles a single mount option. occ files_external:option is
            #   idempotent (no error on setting same value), so we always write.
            sync_option() {
              nc_occ files_external:option "$1" "$2" "$3" >/dev/null
            }

            # 6. Process root mounts (admin group only)
            ROOT_COUNT=$(jq '.rootMounts | length' "$MANIFEST")
            for i in $(seq 0 $((ROOT_COUNT - 1))); do
              MP=$(jq -r ".rootMounts[$i].mountPoint" "$MANIFEST")
              DIR=$(jq -r ".rootMounts[$i].dataDir" "$MANIFEST")
              GROUP=$(jq -r ".rootMounts[$i].applicableGroup" "$MANIFEST")
              ENABLE_SHARING=$(jq -r ".rootMounts[$i].enableSharing // false" "$MANIFEST")
              MID=$(ensure_mount "$MP" "$DIR")
              sync_applicable "$MID" '[]' "[\"$GROUP\"]"
              sync_option "$MID" enable_sharing "$ENABLE_SHARING"
            done

            # 7. Process archive mounts (per-user / per-group)
            ARCH_COUNT=$(jq '.archiveMounts | length' "$MANIFEST")
            for i in $(seq 0 $((ARCH_COUNT - 1))); do
              MP=$(jq -r ".archiveMounts[$i].mountPoint" "$MANIFEST")
              DIR=$(jq -r ".archiveMounts[$i].dataDir" "$MANIFEST")
              USERS_JSON=$(jq -c ".archiveMounts[$i].applicableUsers // []" "$MANIFEST")
              GROUPS_JSON=$(jq -c ".archiveMounts[$i].applicableGroups // []" "$MANIFEST")
              ENABLE_SHARING=$(jq -r ".archiveMounts[$i].enableSharing // false" "$MANIFEST")
              MID=$(ensure_mount "$MP" "$DIR")
              sync_applicable "$MID" "$USERS_JSON" "$GROUPS_JSON"
              sync_option "$MID" enable_sharing "$ENABLE_SHARING"
            done

            echo "[bootstrap] Bootstrap complete."
          EOF
          ]

          volume_mount {
            name       = "manifest"
            mount_path = "/manifest"
          }
        }

        volume {
          name = "manifest"
          config_map {
            name = kubernetes_config_map_v1.nextcloud_external_storage_manifest.metadata[0].name
          }
        }
      }
    }
  }

  # occ can only be exec'd once the chart's Deployment exists.
  depends_on = [helm_release.nextcloud]

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].template[0].spec[0].dns_config]
  }
}

729
stacks/nextcloud/main.tf Normal file
View file

@ -0,0 +1,729 @@
# Stack inputs. None of them has a default, so the caller must supply each one.

# Name of the TLS secret; handed to the tls_secret module and to the chart
# values template.
variable "tls_secret_name" {
  type      = string
  sensitive = true
}

variable "nfs_server" {
  type = string
}

variable "redis_host" {
  type = string
}

# Rendered into chart_values.yaml as externalDatabase.host.
variable "mysql_host" {
  type = string
}
# FLOOR only — Keel bumps the LIVE image tag upward (minor policy); the
# data source below renders the current live tag so a helm apply never
# downgrades below what Keel installed. This floor only wins on a fresh
# install / DR (no live Deployment) or after deliberately restoring an
# OLDER DB snapshot (bump this to match — see comment on the data source).
variable "nextcloud_image_tag_floor" {
  default = "32.0.9"
  type    = string
}
# Vault KV v2 secret secret/nextcloud; read by locals (homepage_credentials).
data "vault_kv_secret_v2" "secrets" {
  name  = "nextcloud"
  mount = "secret"
}
# Render the CURRENT live image tag so helm upgrades are image-no-ops and
# can NEVER downgrade below the Keel-bumped live tag (failure mode F2: the
# 2026-06-01 CrashLoop where a pinned 32.0.3 re-render lost to live 32.0.9).
# Helm-managed workloads can't use the raw-Deployment KEEL_IGNORE_IMAGE
# `lifecycle.ignore_changes` trick (immich/freshrss main.tf), so we feed the
# live tag back into the chart instead.
#
# Use the PLURAL `kubernetes_resources` (field-selected to name=nextcloud), NOT
# the singular `kubernetes_resource`: in kubernetes provider 3.1.0 the singular
# data source ERRORS ("Provider produced null object") when the target is
# absent, and try() can't rescue it (the failure is at the provider read, not
# the expression). The plural returns an empty `objects` list on no match, so
# objects[0] + try() cleanly falls back to var.nextcloud_image_tag_floor on
# fresh install / DR. (Verified empirically against provider 3.1.0.)
#
# namespace is the LITERAL "nextcloud", NOT
# kubernetes_namespace.nextcloud.metadata[0].name, on purpose: referencing the
# namespace resource makes Terraform defer this data read to apply time
# whenever the namespace has a pending change (e.g. the keel.sh/enrolled label
# add) — "(depends on a resource ... with changes pending)" — which leaves the
# tag unknown at plan, turning every helm plan into an unverifiable
# (known after apply) values churn. A static namespace decouples the read so it
# resolves at plan time.
data "kubernetes_resources" "nextcloud_live" {
  kind           = "Deployment"
  api_version    = "apps/v1"
  namespace      = "nextcloud"
  field_selector = "metadata.name=nextcloud"
}
locals {
  # JSON document stored under the "homepage_credentials" key of the Vault
  # secret, decoded into an HCL value.
  homepage_credentials = jsondecode(data.vault_kv_secret_v2.secrets.data["homepage_credentials"])

  # Image reference of the first container of the live Deployment, or "" when
  # no Deployment matched (fresh install / DR).
  _live_image = try(data.kubernetes_resources.nextcloud_live.objects[0].spec.template.spec.containers[0].image, "")

  # Tag portion of the live image reference, or "" when it has no usable tag.
  # The pattern is anchored on the full reference shape
  #   [registry[:port]/][path/]name:tag[-apache][@digest]
  # so that, unlike a plain "last colon-segment" split, it
  #   - ignores a trailing @sha256:... digest instead of returning its hex,
  #   - never mistakes a registry port or a bare repository name for a tag.
  # The optional `-apache` flavor suffix is stripped so the value round-trips
  # through the chart's `image.flavor=apache` (which renders the bare
  # apache-default tag). regex() raises on no match; try() turns that into "".
  _live_tag = try(regex("^(?:[^@]*/)?[^/:@]+:([^/:@]+?)(?:-apache)?(?:@.+)?$", local._live_image)[0], "")

  # Live tag when one could be parsed, otherwise the configured floor.
  nextcloud_image_tag = local._live_tag != "" ? local._live_tag : var.nextcloud_image_tag_floor
}
# Shared module call: receives the target namespace and the TLS secret name.
module "tls_secret" {
  source = "../../modules/kubernetes/setup_tls_secret"

  tls_secret_name = var.tls_secret_name
  namespace       = kubernetes_namespace.nextcloud.metadata[0].name
}
# The nextcloud namespace. Its labels are what opt the stack into the
# cluster-wide governance and Keel policies described below.
resource "kubernetes_namespace" "nextcloud" {
  metadata {
    name = "nextcloud"

    labels = {
      "istio-injection"                       = "disabled"
      tier                                    = local.tiers.edge
      "resource-governance/custom-limitrange" = "true"
      "resource-governance/custom-quota"      = "true"

      # Keel re-enabled 2026-06-01 (was disabled after the 2026-05-26 bump
      # 32.0.3→32.0.9 stuck the pod in maintenance mode for ~22h). Two
      # safeguards make auto-upgrade safe, engineered around BOTH failure modes:
      #   F1 — interrupted `occ upgrade` (entrypoint copies version.php before
      #        occ upgrade finishes, so a probe-restart mid-upgrade leaves the
      #        DB half-migrated → 503): the nextcloud-watchdog CronJob below
      #        self-heals by running `occ upgrade` when occ reports
      #        needsDbUpgrade=true.
      #   F2 — helm re-renders a tag BELOW the Keel-bumped live image →
      #        Nextcloud refuses the downgrade → CrashLoop (the 2026-06-01
      #        incident): chart_values renders the live tag with a floor, so a
      #        re-render is never below live.
      # Scope: the shared Kyverno `inject-keel-annotations` policy stamps
      # keel.sh/policy=patch (+ trigger=poll + pollSchedule) on enrolled
      # workloads. For Nextcloud patch == minor in practice — it only ships
      # 32.0.x maintenance releases (never 32.1.x), and major 33 needs `major`
      # policy and stays manual (the entrypoint's +1-major limit enforces that
      # anyway). We deliberately do NOT override the policy per-workload — see
      # the note where the old override resources used to live, below.
      "keel.sh/enrolled" = "true"
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace
    ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
  }
}
# No per-workload Keel override resources here, on purpose. Nextcloud is
# enrolled via the namespace label above; the shared Kyverno
# `inject-keel-annotations` policy then stamps keel.sh/policy=patch +
# trigger=poll + pollSchedule, and Keel auto-upgrades within 32.0.x.
#
# This stack used to carry kubernetes_labels + kubernetes_annotations
# resources forcing keel.sh/policy=minor (and before that =never, for the
# opt-out). Both were removed 2026-06-01 after re-enabling Keel because each
# produced perpetual drift:
# - Kyverno's background-controller overwrites a TF-set policy back to
# `patch` despite the policy's `+(keel.sh/policy)` add-if-missing anchor
# (observed live: the annotation's field manager was background-controller
# with value patch right after a Keel-bump admission).
# - The helm release strips the deployment's keel.sh/policy LABEL on every
# roll, so TF re-added it on every apply.
# patch == minor for Nextcloud (32.0.x only; major 33 needs `major` and stays
# manual), so letting Kyverno own the keel annotations exactly like every
# other enrolled workload (immich, freshrss) is both correct and drift-free.
# ExternalSecret that extracts every property of the Vault KV entry "nextcloud"
# (ClusterSecretStore vault-kv) into the Secret nextcloud-secrets, refreshed
# every 15 minutes.
resource "kubernetes_manifest" "external_secret" {
  depends_on = [kubernetes_namespace.nextcloud]

  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      namespace = "nextcloud"
      name      = "nextcloud-secrets"
    }

    spec = {
      refreshInterval = "15m"

      secretStoreRef = {
        kind = "ClusterSecretStore"
        name = "vault-kv"
      }

      target = {
        name = "nextcloud-secrets"
      }

      dataFrom = [
        {
          extract = {
            key = "nextcloud"
          }
        },
      ]
    }
  }
}
# DB credentials from Vault database engine (rotated every 24h)
# Nextcloud Helm chart reads password at runtime via existingSecret reference
#
# The target Secret nextcloud-db-creds is templated with exactly the two keys
# chart_values.yaml points at: db-username (fixed "nextcloud") and DB_PASSWORD
# (the fetched password).
resource "kubernetes_manifest" "db_external_secret" {
  depends_on = [kubernetes_namespace.nextcloud]

  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      namespace = "nextcloud"
      name      = "nextcloud-db-creds"
    }

    spec = {
      refreshInterval = "15m"

      secretStoreRef = {
        kind = "ClusterSecretStore"
        name = "vault-database"
      }

      # Fetch the password property of the static MySQL role ...
      data = [
        {
          secretKey = "password"
          remoteRef = {
            key      = "static-creds/mysql-nextcloud"
            property = "password"
          }
        },
      ]

      # ... and render it into the keys the chart expects.
      target = {
        name = "nextcloud-db-creds"
        template = {
          data = {
            db-username = "nextcloud"
            DB_PASSWORD = "{{ .password }}"
          }
        }
      }
    }
  }
}
# Namespace-wide ceiling on aggregate CPU/memory requests, memory limits and
# pod count.
resource "kubernetes_resource_quota" "nextcloud" {
  metadata {
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
    name      = "nextcloud-quota"
  }

  spec {
    hard = {
      pods              = "10"
      "requests.cpu"    = "4"
      "requests.memory" = "8Gi"
      "limits.memory"   = "16Gi"
    }
  }
}
# Per-container defaults and ceiling for the namespace.
resource "kubernetes_limit_range" "nextcloud" {
  metadata {
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
    name      = "nextcloud-limits"
  }

  spec {
    limit {
      type = "Container"

      # Applied when a container declares no request.
      default_request = {
        cpu    = "25m"
        memory = "64Mi"
      }

      # Applied when a container declares no limit.
      default = {
        memory = "256Mi"
      }

      # Upper bound for any container's memory limit.
      max = {
        memory = "8Gi"
      }
    }
  }
}
# The Nextcloud chart itself. Values come from chart_values.yaml, rendered with
# the TLS secret name, the MySQL host and the computed image tag
# (local.nextcloud_image_tag).
resource "helm_release" "nextcloud" {
  name      = "nextcloud"
  namespace = kubernetes_namespace.nextcloud.metadata[0].name

  repository = "https://nextcloud.github.io/helm/"
  chart      = "nextcloud"
  version    = "8.8.1"

  atomic  = true
  timeout = 6000

  values = [
    templatefile("${path.module}/chart_values.yaml", {
      tls_secret_name = var.tls_secret_name
      mysql_host      = var.mysql_host
      image_tag       = local.nextcloud_image_tag
    }),
  ]

  # The chart reads DB credentials from the Secret this ExternalSecret produces.
  depends_on = [kubernetes_manifest.db_external_secret]
}
# Apache prefork MPM settings. chart_values.yaml mounts this ConfigMap's
# mpm_prefork.conf key over /etc/apache2/mods-available/mpm_prefork.conf.
resource "kubernetes_config_map" "apache_tuning" {
  metadata {
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
    name      = "nextcloud-apache-tuning"
  }

  data = {
    "mpm_prefork.conf" = <<-EOF
      # Tuned for Nextcloud on MySQL
      # Capped MaxRequestWorkers to prevent runaway Apache consuming all node CPU
      <IfModule mpm_prefork_module>
      StartServers 5
      MinSpareServers 3
      MaxSpareServers 10
      MaxRequestWorkers 30
      MaxConnectionsPerChild 500
      </IfModule>
    EOF
  }
}
# resource "kubernetes_config_map" "config" {
# metadata {
# name = "config"
# namespace = kubernetes_namespace.nextcloud.metadata[0].name
# annotations = {
# "reloader.stakater.com/match" = "true"
# }
# }
# data = {
# "conf.yml" = file("${path.module}/conf.yml")
# }
# }
resource "kubernetes_persistent_volume_claim" "nextcloud_data_encrypted" {
  # Do not block the apply waiting for the claim to bind.
  wait_until_bound = false

  metadata {
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
    name      = "nextcloud-data-encrypted"

    # Auto-resize settings: threshold, growth step and the overall size cap.
    annotations = {
      "resize.topolvm.io/threshold"     = "10%"
      "resize.topolvm.io/increase"      = "20%"
      "resize.topolvm.io/storage_limit" = "100Gi"
    }
  }

  spec {
    storage_class_name = "proxmox-lvm-encrypted"
    access_modes       = ["ReadWriteOnce"]

    resources {
      requests = {
        storage = "20Gi"
      }
    }
  }

  lifecycle {
    # The autoresizer grows requests.storage (up to storage_limit) and a PVC
    # can never shrink. If Terraform tracked this field, every apply would
    # try to revert to the spec value, K8s would reject the shrink, and the
    # PVC would end up in Terminating-but-in-use limbo.
    ignore_changes = [spec[0].resources[0].requests]
  }
}
# NFS-backed volume that receives the Nextcloud backups.
module "nfs_nextcloud_backup_host" {
  source = "../../modules/kubernetes/nfs_volume"

  namespace  = kubernetes_namespace.nextcloud.metadata[0].name
  name       = "nextcloud-backup-host"
  nfs_server = "192.168.1.127"
  nfs_path   = "/srv/nfs/nextcloud-backup"
}
# NFS-backed volume exposing the whole /srv/nfs export.
module "nfs_pve_root_host" {
  source = "../../modules/kubernetes/nfs_volume"

  namespace  = kubernetes_namespace.nextcloud.metadata[0].name
  name       = "nextcloud-pve-nfs-root"
  nfs_server = "192.168.1.127"
  nfs_path   = "/srv/nfs"
  storage    = "3000Gi"
}
# NFS-backed volume exposing the whole /srv/nfs-ssd export.
module "nfs_pve_ssd_root_host" {
  source = "../../modules/kubernetes/nfs_volume"

  namespace  = kubernetes_namespace.nextcloud.metadata[0].name
  name       = "nextcloud-pve-nfs-ssd-root"
  nfs_server = "192.168.1.127"
  nfs_path   = "/srv/nfs-ssd"
  storage    = "100Gi"
}
module "ingress" {
  source = "../../modules/kubernetes/ingress_factory"

  name            = "nextcloud"
  namespace       = kubernetes_namespace.nextcloud.metadata[0].name
  port            = 8080
  tls_secret_name = var.tls_secret_name
  dns_type        = "proxied"

  # auth = "app": native WebDAV / CalDAV / CardDAV clients (Nextcloud
  # desktop + mobile apps, calendar sync) use HTTP basic auth + app passwords,
  # not browser sessions. Nextcloud enforces its own app-layer authentication.
  auth = "app"

  # Homepage dashboard entry and widget configuration.
  extra_annotations = {
    "gethomepage.dev/enabled"         = "true"
    "gethomepage.dev/group"           = "Productivity"
    "gethomepage.dev/name"            = "Nextcloud"
    "gethomepage.dev/description"     = "Cloud productivity suite"
    "gethomepage.dev/icon"            = "nextcloud.png"
    "gethomepage.dev/pod-selector"    = ""
    "gethomepage.dev/widget.type"     = "nextcloud"
    "gethomepage.dev/widget.url"      = "https://nextcloud.viktorbarzin.me"
    "gethomepage.dev/widget.username" = local.homepage_credentials["nextcloud"]["username"]
    "gethomepage.dev/widget.password" = local.homepage_credentials["nextcloud"]["password"]
  }
}
# Hook script: sync DB password from env var into config.php on every pod start.
# Closes the Vault rotation gap: Vault rotates MySQL password -> ESO syncs to K8s Secret
# -> Reloader restarts pod -> this hook patches config.php with the current MYSQL_PASSWORD.
resource "kubernetes_config_map" "db_password_sync_hook" {
  metadata {
    name      = "nextcloud-db-password-sync"
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
  }

  data = {
    # Shell hook: compares the dbpassword stored in config.php with the
    # MYSQL_PASSWORD env var and, when they differ, delegates the rewrite to
    # patch-db-pw.php. Exits 0 without changes when the env var is unset,
    # config.php does not exist yet, or the two values already match.
    "sync-db-password.sh" = <<-EOF
      #!/bin/bash
      set -e
      CONFIG="/var/www/html/config/config.php"
      if [ -z "$MYSQL_PASSWORD" ]; then
        echo "MYSQL_PASSWORD not set, skipping config.php sync"
        exit 0
      fi
      if [ ! -f "$CONFIG" ]; then
        echo "config.php not found, skipping (first install)"
        exit 0
      fi
      CURRENT_PW=$(php -r "include '$CONFIG'; echo \$CONFIG['dbpassword'] ?? '';")
      if [ "$CURRENT_PW" = "$MYSQL_PASSWORD" ]; then
        echo "DB password in config.php already matches MYSQL_PASSWORD"
        exit 0
      fi
      echo "Updating DB password in config.php to match MYSQL_PASSWORD..."
      php /docker-entrypoint-hooks.d/before-starting/patch-db-pw.php "$CONFIG" "$MYSQL_PASSWORD"
      echo "DB password updated successfully"
    EOF

    # PHP helper: rewrites the 'dbpassword' => '...' entry of the given file.
    # Usage: php patch-db-pw.php <path-to-config.php> <new-password>
    # Exits non-zero (leaving the file untouched) when the file cannot be
    # read, the regex fails, or no dbpassword entry is found.
    "patch-db-pw.php" = <<-EOF
      <?php
      $file = $argv[1];
      $newPw = $argv[2];
      $content = file_get_contents($file);
      if ($content === false) {
        fwrite(STDERR, "patch-db-pw: cannot read " . $file . "\n");
        exit(1);
      }
      // Escape backslashes BEFORE quotes: str_replace applies its pairs in
      // order, so escaping the quote first would double the backslash that
      // was just inserted and break out of the single-quoted PHP literal.
      $escaped = str_replace(["\\", "'"], ["\\\\", "\\'"], $newPw);
      $entry = "'dbpassword' => '" . $escaped . "'";
      // Use a callback so the new value is inserted verbatim; a plain
      // preg_replace replacement string would treat "$1" or "\1" inside the
      // password as a backreference. The pattern also accepts an existing
      // value that contains escaped characters.
      $patched = preg_replace_callback(
        "/'dbpassword'\\s*=>\\s*'(?:[^'\\\\]|\\\\.)*'/s",
        function ($m) use ($entry) { return $entry; },
        $content,
        -1,
        $count
      );
      // Never write back a null/unchanged result: that would either wipe
      // config.php or falsely report a successful update.
      if ($patched === null || $count < 1) {
        fwrite(STDERR, "patch-db-pw: no 'dbpassword' entry patched in " . $file . "\n");
        exit(1);
      }
      if (file_put_contents($file, $patched) === false) {
        fwrite(STDERR, "patch-db-pw: cannot write " . $file . "\n");
        exit(1);
      }
    EOF
  }
}
resource "kubernetes_config_map" "backup-script" {
  metadata {
    name      = "nextcloud-backup-script"
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
  }

  data = {
    # backup.sh: copies /nextcloud-data into a new /backup/<timestamp>
    # directory with rsync, then deletes every dated directory except the
    # lexically-last (newest) one.
    "backup.sh" = <<-EOF
      #!/bin/bash
      set -e
      BACKUP_DIR="/backup"
      DATA_DIR="/nextcloud-data"
      DATE=$(date +%Y%m%d_%H%M%S)
      BACKUP_PATH="$BACKUP_DIR/$DATE"
      echo "Starting Nextcloud backup at $(date)"
      # Note: Maintenance mode is skipped because occ is not available in the NFS mount.
      # For a proper backup with maintenance mode, exec into the nextcloud pod:
      # kubectl exec -n nextcloud deployment/nextcloud -- php occ maintenance:mode --on
      # Create backup directory
      mkdir -p "$BACKUP_PATH"
      # Backs up config/, data/, custom_apps/ (the irreplaceable bits).
      # Exclusions (2026-06-01 space fix):
      # - html/: the Nextcloud app code, reproducible from the pinned image
      #   (real config is at config/config.php; html/config/config.php is empty).
      # - nextcloud.log*: capped at source via log_rotate_size; previously grew
      #   to 10GB+ and bloated every dated copy (backups hit 20G each).
      # - preview cache: regenerable thumbnails, no need to back up.
      echo "Backing up Nextcloud installation..."
      # The source is live (no maintenance mode), so files can disappear while
      # rsync walks the tree. rsync reports that as exit code 24; treat it as
      # success instead of letting `set -e` abort before the cleanup below.
      rc=0
      rsync -a \
        --exclude='/html/' \
        --exclude='nextcloud.log' \
        --exclude='nextcloud.log.*' \
        --exclude='data/appdata_*/preview/' \
        "$DATA_DIR/" "$BACKUP_PATH/" || rc=$?
      if [ "$rc" -ne 0 ] && [ "$rc" -ne 24 ]; then
        echo "rsync failed with exit code $rc" >&2
        exit "$rc"
      fi
      # Keep only the latest backup. The version history lives in daily-backup's
      # pvc-data (4 weekly snapshot-consistent copies of this same encrypted PVC),
      # so this browsable app-level copy only needs the most recent. Keeping the
      # whole installation (incl. logs) x7 here was the bulk of the 87G that
      # filled the offsite Synology.
      #
      # Sort by NAME, not mtime: dirs are YYYYMMDD_HHMMSS so lexical order is
      # chronological. `rsync -a` stamps the backup dir with the SOURCE dir's
      # mtime, which made the old `ls -dt | tail` delete the freshest backup and
      # keep a stale one; keep the lexically-last (newest) instead.
      echo "Cleaning old backups (keep latest)..."
      cd "$BACKUP_DIR"
      ls -d */ 2>/dev/null | sort | head -n -1 | xargs -r rm -rf
      echo "Backup completed at $(date)"
      echo "Backup stored at: $BACKUP_PATH"
    EOF

    # restore.sh: manual restore of one dated backup back into
    # /nextcloud-data. Lists the available backups when called without an
    # argument.
    "restore.sh" = <<-EOF
      #!/bin/bash
      # Restore script - run manually when needed
      # Usage: ./restore.sh <backup_date>
      # Example: ./restore.sh 20250117_030000
      #
      # Before restoring, enable maintenance mode:
      # kubectl exec -n nextcloud deployment/nextcloud -- php occ maintenance:mode --on
      # After restoring, disable it:
      # kubectl exec -n nextcloud deployment/nextcloud -- php occ maintenance:mode --off
      set -e
      if [ -z "$1" ]; then
        echo "Usage: $0 <backup_date>"
        echo "Available backups:"
        ls -1 /backup/
        exit 1
      fi
      BACKUP_PATH="/backup/$1"
      DATA_DIR="/nextcloud-data"
      if [ ! -d "$BACKUP_PATH" ]; then
        echo "Backup not found: $BACKUP_PATH"
        exit 1
      fi
      echo "Restoring from $BACKUP_PATH"
      # Restore everything
      echo "Restoring Nextcloud installation..."
      rsync -a "$BACKUP_PATH/" "$DATA_DIR/"
      echo "Restore completed!"
      echo "Remember to run: kubectl exec -n nextcloud deployment/nextcloud -- php occ maintenance:mode --off"
    EOF
  }
}
# Watchdog: runs every 5 minutes with two jobs:
# 1. Apache runaway recovery: if >40 workers (normal 5-15), rollout-restart
#    to recover node CPU.
# 2. F1 Keel self-heal: if occ reports needsDbUpgrade=true (an interrupted
#    `occ upgrade` after a Keel image bump left the app in maintenance mode),
#    re-run `occ upgrade` and clear maintenance mode.
# Identity the watchdog CronJob pods run as.
resource "kubernetes_service_account" "nextcloud_watchdog" {
  metadata {
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
    name      = "nextcloud-watchdog"
  }
}
# Namespace-scoped permissions for the watchdog.
resource "kubernetes_role" "nextcloud_watchdog" {
  metadata {
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
    name      = "nextcloud-watchdog"
  }

  # Read and patch deployments.
  rule {
    api_groups = ["apps"]
    resources  = ["deployments"]
    verbs      = ["get", "patch"]
  }

  # Look up pods.
  rule {
    api_groups = [""]
    resources  = ["pods"]
    verbs      = ["list", "get"]
  }

  # Exec into pods.
  rule {
    api_groups = [""]
    resources  = ["pods/exec"]
    verbs      = ["create"]
  }
}
# Grants the watchdog Role to the watchdog ServiceAccount.
resource "kubernetes_role_binding" "nextcloud_watchdog" {
  metadata {
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
    name      = "nextcloud-watchdog"
  }

  subject {
    kind      = "ServiceAccount"
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
    name      = kubernetes_service_account.nextcloud_watchdog.metadata[0].name
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "Role"
    name      = kubernetes_role.nextcloud_watchdog.metadata[0].name
  }
}
# CronJob that checks the nextcloud pod every 5 minutes: restarts the
# deployment when the apache2 process count exceeds 40, and re-runs
# `occ upgrade` when occ reports needsDbUpgrade=true.
resource "kubernetes_cron_job_v1" "nextcloud_watchdog" {
  metadata {
    name      = "nextcloud-watchdog"
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
  }

  spec {
    schedule                      = "*/5 * * * *"
    successful_jobs_history_limit = 1
    failed_jobs_history_limit     = 3
    concurrency_policy            = "Forbid"

    job_template {
      metadata {}

      spec {
        # 600s (was 120s) so the F1 self-heal `occ upgrade` isn't killed
        # mid-migration. concurrency_policy=Forbid prevents overlap.
        active_deadline_seconds = 600

        template {
          metadata {}

          spec {
            service_account_name = kubernetes_service_account.nextcloud_watchdog.metadata[0].name
            restart_policy       = "Never"

            container {
              name = "watchdog"
              # NOTE(review): mutable `latest` tag; consider pinning a version.
              image   = "bitnami/kubectl:latest"
              command = ["/bin/bash", "-c", <<-EOF
                set -e
                # Find the nextcloud pod. `|| true` keeps `set -e` from aborting
                # the script when the lookup fails (e.g. the jsonpath index on an
                # empty pod list), so the skip branch below is actually reachable.
                POD=$(kubectl get pods -n nextcloud -l app.kubernetes.io/name=nextcloud -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
                if [ -z "$POD" ]; then
                  echo "No nextcloud pod found, skipping"
                  exit 0
                fi
                # Count apache2 processes in the container (pgrep -c counts every
                # match, the parent process included). pgrep -c prints "0" AND
                # exits non-zero when nothing matches, so do not append a second
                # "0" on failure; normalise anything non-numeric to 0 instead.
                WORKERS=$(kubectl exec -n nextcloud "$POD" -c nextcloud -- pgrep -c apache2 2>/dev/null || true)
                case "$WORKERS" in
                  ''|*[!0-9]*) WORKERS=0 ;;
                esac
                echo "$(date): Apache worker count: $WORKERS"
                # Normal operation: 5-15 workers. Runaway threshold: 40+
                if [ "$WORKERS" -gt 40 ]; then
                  echo "RUNAWAY DETECTED: $WORKERS Apache workers (threshold: 40)"
                  echo "Restarting nextcloud deployment..."
                  kubectl rollout restart deployment nextcloud -n nextcloud
                  echo "Restart triggered at $(date)"
                else
                  echo "Apache workers within normal range ($WORKERS <= 40)"
                fi
                # F1 self-heal: a Keel image bump runs `occ upgrade` in the
                # entrypoint, but if that's interrupted (e.g. a probe restart
                # mid-upgrade) occ reports needsDbUpgrade=true and the app sits
                # in maintenance mode (503). Re-run the upgrade and clear
                # maintenance mode. Gated on needsDbUpgrade only, so a
                # deliberate manual maintenance window is left untouched.
                ST=$(kubectl exec -n nextcloud "$POD" -c nextcloud -- php occ status --output=json 2>/dev/null || true)
                if echo "$ST" | grep -q '"needsDbUpgrade":true'; then
                  echo "$(date): needsDbUpgrade=true → running occ upgrade"
                  kubectl exec -n nextcloud "$POD" -c nextcloud -- php occ upgrade --no-interaction || true
                  kubectl exec -n nextcloud "$POD" -c nextcloud -- php occ maintenance:mode --off || true
                  echo "$(date): self-heal occ upgrade complete"
                else
                  echo "$(date): occ status healthy (no DB upgrade pending)"
                fi
              EOF
              ]
            }
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}
# Weekly CronJob that runs backup.sh against the Nextcloud data PVC and
# writes the copy to the NFS backup volume.
resource "kubernetes_cron_job_v1" "nextcloud-backup" {
  metadata {
    namespace = kubernetes_namespace.nextcloud.metadata[0].name
    name      = "nextcloud-backup"
  }

  spec {
    schedule                      = "0 3 * * 0" # Sunday at 3 AM
    concurrency_policy            = "Forbid"
    successful_jobs_history_limit = 3
    failed_jobs_history_limit     = 3

    job_template {
      metadata {}

      spec {
        template {
          metadata {}

          spec {
            restart_policy = "OnFailure"

            # Backup mounts the same RWO PVC (proxmox-lvm-encrypted) as the
            # main nextcloud pod, so it MUST schedule on the same node: the
            # volume cannot attach to two nodes simultaneously. Without this
            # the backup pod is stuck in ContainerCreating until cron retries.
            affinity {
              pod_affinity {
                required_during_scheduling_ignored_during_execution {
                  namespaces   = [kubernetes_namespace.nextcloud.metadata[0].name]
                  topology_key = "kubernetes.io/hostname"

                  label_selector {
                    match_labels = {
                      "app.kubernetes.io/instance" = "nextcloud"
                      "app.kubernetes.io/name"     = "nextcloud"
                    }
                  }
                }
              }
            }

            # Source data: the encrypted Nextcloud PVC.
            volume {
              name = "nextcloud-data"
              persistent_volume_claim {
                claim_name = kubernetes_persistent_volume_claim.nextcloud_data_encrypted.metadata[0].name
              }
            }

            # Destination: the NFS backup claim.
            volume {
              name = "backup"
              persistent_volume_claim {
                claim_name = module.nfs_nextcloud_backup_host.claim_name
              }
            }

            # backup.sh / restore.sh from the ConfigMap, mounted executable.
            volume {
              name = "scripts"
              config_map {
                name         = kubernetes_config_map.backup-script.metadata[0].name
                default_mode = "0755"
              }
            }

            container {
              name  = "backup"
              image = "alpine:latest"

              # Install rsync and bash at start-up, then run the backup script.
              command = ["/bin/sh", "-c", "apk add --no-cache rsync bash && /scripts/backup.sh"]

              volume_mount {
                name       = "nextcloud-data"
                mount_path = "/nextcloud-data"
              }

              volume_mount {
                name       = "backup"
                mount_path = "/backup"
              }

              volume_mount {
                name       = "scripts"
                mount_path = "/scripts"
              }
            }
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}
# CI retrigger 2026-05-16T13:42:57+00:00 bulk enrollment apply (pipeline #689 killed)
# CI retrigger v2 2026-05-16T13:46:35+00:00

View file

@ -0,0 +1,53 @@
# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa
terraform {
  # Provider sources and version constraints for this stack.
  required_providers {
    authentik = {
      source  = "goauthentik/authentik"
      version = "~> 2024.10"
    }

    cloudflare = {
      source  = "cloudflare/cloudflare"
      version = "~> 4"
    }

    # kubectl (gavinbunney): workaround for hashicorp/kubernetes
    # `kubernetes_manifest` panics on Kyverno CRDs. See beads code-e2dp.
    # Declared for all stacks but only used where opted-in.
    kubectl = {
      source  = "gavinbunney/kubectl"
      version = "~> 1.14"
    }

    # Pinned to an exact release candidate.
    proxmox = {
      source  = "telmate/proxmox"
      version = "3.0.2-rc07"
    }

    vault = {
      source  = "hashicorp/vault"
      version = "~> 4.0"
    }
  }
}
# Path to the kubeconfig used by the kubernetes, helm and kubectl providers.
variable "kube_config_path" {
  default = "~/.kube/config"
  type    = string
}
# Kubernetes provider, authenticated from the configured kubeconfig file.
provider "kubernetes" {
  config_path = var.kube_config_path
}
# Helm provider, pointed at the same kubeconfig as the kubernetes provider.
provider "helm" {
  kubernetes = {
    config_path = var.kube_config_path
  }
}
# Vault provider; child-token creation is skipped.
provider "vault" {
  skip_child_token = true
  address          = "https://vault.viktorbarzin.me"
}
# kubectl provider, loading the configured kubeconfig file.
provider "kubectl" {
  load_config_file = true
  config_path      = var.kube_config_path
}

1
stacks/nextcloud/secrets Symbolic link
View file

@ -0,0 +1 @@
../../secrets

View file

@ -0,0 +1,13 @@
# Pull in the configuration located by find_in_parent_folders().
include "root" {
  path = find_in_parent_folders()
}
# Dependency on the platform stack; its outputs are not read
# (presumably declared for ordering only; confirm).
dependency "platform" {
  skip_outputs = true
  config_path  = "../platform"
}
# Dependency on the vault stack; its outputs are not read
# (presumably declared for ordering only; confirm).
dependency "vault" {
  skip_outputs = true
  config_path  = "../vault"
}