fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]

6d224861 came from a --no-checkout worktree whose empty index made the
commit drop every file except two. This restores 05b50d2b's full tree and
correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su
entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the
live infra was never applied from the broken commit.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-09 08:45:33 +00:00
parent 6d224861c4
commit fd0f4a0365
1166 changed files with 358546 additions and 0 deletions

View file

@ -0,0 +1,185 @@
#cloud-config
# Shared first-boot configuration for every VM cloned from the template.
# No hostname is set here on purpose: cloud-init takes it from the meta-data
# Proxmox generates (driven by `qm set --name <X>`), so this one snippet can
# be reused unchanged for every node.
manage_etc_hosts: true

users:
# Admin account: passwordless sudo, SSH key login and password login.
- name: wizard
  shell: /bin/bash
  sudo: ALL=(ALL) NOPASSWD:ALL
  lock_passwd: false  # keep password login enabled
  passwd: ${passwd}
  ssh_authorized_keys:
  - ${authorized_ssh_key}
# Refresh the package index and upgrade installed packages on first boot.
package_update: true
package_upgrade: true

packages:
# general tooling
- htop
- vim
- curl
- jq
- tcpdump
- tree
- tmux
- wget
- net-tools
- zsh
- apt-transport-https
- ca-certificates
- gpg
- isc-dhcp-client
- cloud-guest-utils  # provides growpart, used to resize the disk
- qemu-guest-agent
- nginx
# docker (from the docker apt source declared under `apt:`)
- docker-ce
- docker-ce-cli
- containerd.io
- docker-buildx-plugin
- docker-compose-plugin
%{if is_k8s_template}
# kubernetes (from the kubernetes apt source declared under `apt:`)
- kubeadm
- kubelet
# iSCSI client for CSI-backed database storage
- open-iscsi
%{endif}
# Extra apt repositories. Each source names its signing key by fingerprint
# and the list file it is written to.
apt:
  sources:
%{if is_k8s_template}
    # Kubernetes packages, pinned to the v1.34 package stream.
    kubernetes:
      filename: kubernetes.list
      keyid: "DE15B14486CD377B9E876E1A234654DA9A296436"
      source: "deb https://pkgs.k8s.io/core:/stable:/v1.34/deb/ /"
%{endif}
    # Docker CE packages for Ubuntu noble.
    docker:
      filename: docker.list
      keyid: "9DC858229FC7DD38854AE2D88D81803C0EBFCD88"
      source: "deb https://download.docker.com/linux/ubuntu noble stable"
%{if is_k8s_template}
# The two node scripts are delivered base64-encoded (the module does the
# encoding) so YAML whitespace handling can never alter the heredoc bodies
# inside them. This replaced an earlier `indent(6, …)` approach, which pushed
# the `[plugins.*]` TOML sections to column 6 inside the
# `cat >> /etc/containerd/config.toml` heredocs; containerd refused to parse
# the result and the node5 v1 boot failed there (2026-05-26).
# Source: modules/create-template-vm/k8s-node-containerd-setup.sh
write_files:
# Runs before the cluster join (containerd + systemd preparation).
- path: /usr/local/bin/k8s-node-containerd-setup.sh
  owner: root:root
  permissions: "0755"
  encoding: b64
  content: ${k8s_node_setup_script_b64}
# Runs after the cluster join (kubelet tuning).
- path: /usr/local/bin/k8s-node-post-join-tune.sh
  owner: root:root
  permissions: "0755"
  encoding: b64
  content: ${k8s_node_post_join_script_b64}
%{endif}
runcmd:
# Weekly TRIM/discard so freed blocks are returned to the LVM thin pool.
- systemctl enable --now fstrim.timer
# Persistent journald storage for crash forensics, capped in size and
# retention to limit disk wear. All settings are applied in one sed pass;
# each expression uncomments and sets one key.
- mkdir -p /var/log/journal
- >-
  sed -i
  -e 's/#Storage=auto/Storage=persistent/'
  -e 's/#SystemMaxUse=/SystemMaxUse=500M/'
  -e 's/#MaxRetentionSec=/MaxRetentionSec=7day/'
  -e 's/#MaxFileSec=/MaxFileSec=1day/'
  -e 's/#Compress=yes/Compress=yes/'
  /etc/systemd/journald.conf
- systemctl restart systemd-journald
%{if is_k8s_template}
# Global DNS for systemd-resolved. Otherwise only the link-level DNS handed
# out by Proxmox's `qm set --nameserver` (Technitium, 10.0.20.201) is used,
# and Technitium answers NXDOMAIN for forgejo.viktorbarzin.me, which breaks
# kubelet image pulls from the Forgejo registry. Public resolvers first with
# Technitium as fallback mirrors the manual setup on k8s-node1..4.
- mkdir -p /etc/systemd/resolved.conf.d
- printf '%s\n' '[Resolve]' 'DNS=8.8.8.8 1.1.1.1' 'FallbackDNS=10.0.20.201' > /etc/systemd/resolved.conf.d/global-dns.conf
- systemctl restart systemd-resolved
# unattended-upgrades, re-enabled 2026-05-10 with guard rails:
#   * a tight Allowed-Origins list,
#   * a Package-Blacklist covering k8s / containerd / runc / calico,
#   * Automatic-Reboot disabled (kured + sentinel-gate perform reboots in a
#     24h-soaked rolling window, gated by Prometheus alerts),
#   * apt-mark hold on the k8s and container-runtime packages.
# Background: the March 2026 outage was a kernel update followed by
# containerd overlayfs corruption.
#
# Escaping note: the doubled dollar sign in front of an opening brace is the
# template escape that yields a literal placeholder for apt to expand. The
# regex end anchors below use a single dollar sign; a doubled one there is
# not an escape and would be written to the file verbatim.
- apt-get install -y unattended-upgrades update-notifier-common
- |
  cat > /etc/apt/apt.conf.d/52unattended-upgrades-k8s <<'EOF'
  Unattended-Upgrade::Allowed-Origins {
      "$${distro_id}:$${distro_codename}";
      "$${distro_id}:$${distro_codename}-security";
      "$${distro_id}:$${distro_codename}-updates";
      "$${distro_id}ESMApps:$${distro_codename}-apps-security";
      "$${distro_id}ESM:$${distro_codename}-infra-security";
  };
  Unattended-Upgrade::Package-Blacklist {
      "^containerd(\.io)?$";
      "^runc$";
      "^cri-tools$";
      "^kubernetes-cni$";
      "^calico-.*";
      "^cni-plugins-.*";
      "^docker-ce$";
  };
  Unattended-Upgrade::DevRelease "false";
  Unattended-Upgrade::Automatic-Reboot "false";
  EOF
- |
  cat > /etc/apt/apt.conf.d/20auto-upgrades <<'EOF'
  APT::Periodic::Update-Package-Lists "1";
  APT::Periodic::Unattended-Upgrade "1";
  EOF
# unmask is best-effort: the unit may not be masked at all.
- systemctl unmask unattended-upgrades 2>/dev/null || true
- systemctl enable --now unattended-upgrades
- apt-mark hold kubelet kubeadm kubectl
# best-effort: not every one of these package names is installed.
- apt-mark hold containerd containerd.io runc 2>/dev/null || true
- systemctl stop kubelet
# Start from containerd's stock config; the setup script patches it.
- containerd config default | sudo tee /etc/containerd/config.toml
# The containerd/kubelet preparation lives in
# /usr/local/bin/k8s-node-containerd-setup.sh, delivered by the write_files
# block earlier in this file. It is run as one bash invocation so cloud-init
# only ever sees a single-line runcmd item. (The previous inline
# `- $${containerd_config_update_command}` broke YAML parsing because the
# heredoc contains mixed-indent inner shell heredocs.)
- bash /usr/local/bin/k8s-node-containerd-setup.sh
- systemctl restart containerd
- systemctl enable --now iscsid
# iSCSI hardening: longer recovery timeout (300s instead of the 120s
# default), explicit NOP-out keepalive settings, and CRC32C header/data
# digests to catch bit flips on the network. Guards against SQLite
# corruption from transient iSCSI session drops.
- >-
  sed -i
  -e 's/^node.session.timeo.replacement_timeout = .*/node.session.timeo.replacement_timeout = 300/'
  -e 's/^node.conn\[0\].timeo.noop_out_interval = .*/node.conn[0].timeo.noop_out_interval = 10/'
  -e 's/^node.conn\[0\].timeo.noop_out_timeout = .*/node.conn[0].timeo.noop_out_timeout = 15/'
  /etc/iscsi/iscsid.conf
# Append the digest settings only once (skipped when already present).
- |
  grep -q '^node.conn\[0\].iscsi.HeaderDigest' /etc/iscsi/iscsid.conf || {
    echo 'node.conn[0].iscsi.HeaderDigest = CRC32C,None' >> /etc/iscsi/iscsid.conf
    echo 'node.conn[0].iscsi.DataDigest = CRC32C,None' >> /etc/iscsi/iscsid.conf
  }
- systemctl restart iscsid
# /sentinel is used by the kured reboot gating (sentinel gate DaemonSet).
- mkdir -p /sentinel
# Swap must be off: kubelet defaults to failSwapOn=true and refuses to start
# otherwise. (An earlier version of this snippet created a 4G swapfile for
# "memory pressure relief" without also setting failSwapOn=false /
# memorySwap.swapBehavior, so the join consistently bricked kubelet —
# observed on node6 boot v3 2026-05-26.)
- swapoff -a
- sed -i '/ swap / s/^/#/' /etc/fstab
- ${k8s_join_command}
- systemctl enable kubelet
- systemctl start kubelet
# Kubelet tuning has to come after kubeadm join, because that is when
# /var/lib/kubelet/config.yaml gets written. The script restarts kubelet at
# the end so the patched config takes effect.
- bash /usr/local/bin/k8s-node-post-join-tune.sh
%{ endif }
# Caller-supplied extra commands, appended to runcmd in the order given.
# Each command is emitted as a JSON string, which is also a valid YAML
# double-quoted scalar, so text such as a colon-space, a space-hash, a
# leading bracket/asterisk, or an embedded newline inside a command cannot
# change the YAML structure or be silently truncated.
%{ for provision_cmd in provision_cmds ~}
- ${jsonencode(provision_cmd)}
%{ endfor ~}

View file

@ -0,0 +1,147 @@
#!/usr/bin/env bash
#
# K8s node containerd + kubelet bootstrap. Runs once via cloud-init runcmd.
# Embedded into the cloud-init snippet base64-encoded by main.tf so YAML
# whitespace handling never touches the heredoc bodies — TOML / Python
# blocks below land in /etc/containerd/config.toml etc. with their leading
# whitespace intact.
#
# Layout (matches the numbered sections below):
# 1. /etc/containerd/config.toml — point config_path at /etc/containerd/certs.d
# 2. /etc/containerd/certs.d/*/hosts.toml — per-registry mirror configs
# 3. /etc/containerd/config.toml — parallel pulls + GC tuning
# 4. (kubelet config tuning is NOT done here — see k8s-node-post-join-tune.sh)
# 5. /etc/systemd/logind.conf.d + kubelet.service.d — graceful shutdown
# 6. (master-only) /etc/kubernetes/manifests — apiserver + controller flags
set -euo pipefail

# 1. config_path — rewrite the empty value in BOTH quote styles. containerd
#    v1 emits `""`, containerd v2.x emits `''`. If the v2 form is missed the
#    hosts.toml mirror config is silently ignored — observed 2026-05-26 on
#    k8s-node4 (containerd v2.2.4) and reproduced on k8s-node5 v1 boot.
sed -i \
  -e 's|config_path = ""|config_path = "/etc/containerd/certs.d"|g' \
  -e "s|config_path = ''|config_path = \"/etc/containerd/certs.d\"|g" \
  /etc/containerd/config.toml
# 2. Per-registry hosts.toml — pull-through caches on docker-registry VM
# (10.0.20.10) for high-traffic registries, Traefik LB (10.0.20.200) for
# forgejo. Low-traffic registries (registry.k8s.io, reg.kyverno.io) skip
# the cache and pull direct because past pull-through cache attempts
# truncated downloads and broke VPA certgen + Kyverno image pulls.
# NOTE(review): the paragraph above says registry.k8s.io pulls direct, yet a
# hosts.toml pointing it at http://10.0.20.10:5030 is written further down in
# this section — confirm which of the two is intended.
# Each heredoc below is quoted, so its body is written to disk verbatim.
# docker.io: cache on :5000 first, then the upstream registry.
mkdir -p /etc/containerd/certs.d/docker.io
cat > /etc/containerd/certs.d/docker.io/hosts.toml <<'DOCKERIO'
server = "https://registry-1.docker.io"
[host."http://10.0.20.10:5000"]
capabilities = ["pull", "resolve"]
[host."https://registry-1.docker.io"]
capabilities = ["pull", "resolve"]
DOCKERIO
# ghcr.io: cache on :5010 first, then the upstream registry.
mkdir -p /etc/containerd/certs.d/ghcr.io
cat > /etc/containerd/certs.d/ghcr.io/hosts.toml <<'GHCR'
server = "https://ghcr.io"
[host."http://10.0.20.10:5010"]
capabilities = ["pull", "resolve"]
[host."https://ghcr.io"]
capabilities = ["pull", "resolve"]
GHCR
# Forgejo OCI registry: prefer in-cluster Traefik LB (10.0.20.200) to
# avoid hairpin NAT. Traefik serves the *.viktorbarzin.me wildcard so
# SNI verification succeeds. If the mirror is unreachable, fall back to
# public DNS resolution (needs the global DNS fallback set up below).
# NOTE(review): the host entry written here is https://10.0.20.203 with
# skip_verify = true, not 10.0.20.200 with verification as the comment
# describes — confirm the address and whether TLS verification is meant to be
# skipped. The "global DNS fallback" is not configured anywhere in this
# script; presumably it refers to the cloud-init runcmd step — verify.
mkdir -p /etc/containerd/certs.d/forgejo.viktorbarzin.me
cat > /etc/containerd/certs.d/forgejo.viktorbarzin.me/hosts.toml <<'FORGEJO'
server = "https://forgejo.viktorbarzin.me"
[host."https://10.0.20.203"]
capabilities = ["pull", "resolve"]
skip_verify = true
FORGEJO
# quay.io + registry.k8s.io: include mirror configs that match node4's
# layout (no real pull-through cache today, server line is the direct
# upstream). Keeping these present makes the per-node config uniform and
# lets us flip a cache on later by editing only the [host."..."] block.
# Unlike docker.io / ghcr.io, these two files list only the 10.0.20.10 host
# entry and no explicit upstream [host] block.
mkdir -p /etc/containerd/certs.d/quay.io
cat > /etc/containerd/certs.d/quay.io/hosts.toml <<'QUAY'
server = "https://quay.io"
[host."http://10.0.20.10:5020"]
capabilities = ["pull", "resolve"]
QUAY
mkdir -p /etc/containerd/certs.d/registry.k8s.io
cat > /etc/containerd/certs.d/registry.k8s.io/hosts.toml <<'K8SREG'
server = "https://registry.k8s.io"
[host."http://10.0.20.10:5030"]
capabilities = ["pull", "resolve"]
K8SREG
# 3. containerd tuning: parallel pulls + selective GC overrides.
#    containerd v2's `config default` ALREADY emits the
#    `[plugins.'io.containerd.gc.v1.scheduler']`,
#    `[plugins.'io.containerd.runtime.v2.task']` and
#    `[plugins.'io.containerd.metadata.v1.bolt']` sections — declaring them
#    again fails with `toml: table … already exists` (observed on node6 boot
#    2026-05-26). So the existing keys are patched in place instead.
#
#    Every expression matches the key regardless of its current value and
#    keeps the line's original indentation (\1). The max_concurrent_downloads
#    rule previously matched only the literal default `3`, so it silently did
#    nothing for any other value and would have turned `30` into `200`.
#
#    max_concurrent_downloads -> 20       (parallel layer downloads)
#    pause_threshold          -> 0.02     (GC scheduler)
#    schedule_delay           -> '1800s'  (30 min cool-down between GC runs)
#    exit_timeout             -> '5m'     (container cleanup)
sed -i \
  -e 's/^\([[:space:]]*\)max_concurrent_downloads = .*/\1max_concurrent_downloads = 20/' \
  -e 's/^\([[:space:]]*\)pause_threshold = .*/\1pause_threshold = 0.02/' \
  -e "s/^\([[:space:]]*\)schedule_delay = .*/\1schedule_delay = '1800s'/" \
  -e "s/^\([[:space:]]*\)exit_timeout = .*/\1exit_timeout = '5m'/" \
  /etc/containerd/config.toml
# 4. (No kubelet config tuning in this script: /var/lib/kubelet/config.yaml
#    only exists AFTER kubeadm join. That work is done by
#    k8s-node-post-join-tune.sh, which cloud-init runs as a separate runcmd
#    step once the join has completed.)

# 5. logind + kubelet systemd unit. Total kubelet shutdown is 310s, so both
#    logind's InhibitDelayMaxSec and kubelet's TimeoutStopSec must exceed it.
mkdir -p /etc/systemd/logind.conf.d
printf '%s\n' '[Login]' 'InhibitDelayMaxSec=480' \
  > /etc/systemd/logind.conf.d/kubelet-shutdown.conf
systemctl restart systemd-logind

mkdir -p /etc/systemd/system/kubelet.service.d
printf '%s\n' '[Service]' 'TimeoutStopSec=420s' \
  > /etc/systemd/system/kubelet.service.d/20-shutdown.conf
systemctl daemon-reload
# 6. (master-only) faster pod eviction + attach-detach reconcile.
#
# set_static_pod_flags MANIFEST FLAG...
#   Rewrites the first container's `command` list in a static-pod manifest so
#   that each FLAG (`--name=value`) is present exactly once: any existing
#   occurrence of the same flag name is dropped, then FLAG is appended.
#   Only an exact `--name` or a `--name=...` argument counts as the same
#   flag, so a different flag that merely starts with the same text is left
#   alone. The manifest is re-serialised with PyYAML.
set_static_pod_flags() {
  local manifest=$1
  shift
  python3 - "$manifest" "$@" <<'FLAG_PATCH'
import sys

import yaml

manifest, flags = sys.argv[1], sys.argv[2:]

with open(manifest) as f:
    pod = yaml.safe_load(f)

command = pod['spec']['containers'][0]['command']
for flag in flags:
    name = flag.split('=', 1)[0]
    command = [arg for arg in command
               if arg != name and not arg.startswith(name + '=')]
    command.append(flag)
pod['spec']['containers'][0]['command'] = command

with open(manifest, 'w') as f:
    yaml.dump(pod, f, default_flow_style=False)
FLAG_PATCH
}

# Each manifest is patched only if it exists, so worker nodes (which have
# neither file) skip this section and a missing file cannot abort the script.
if [ -f /etc/kubernetes/manifests/kube-controller-manager.yaml ]; then
  set_static_pod_flags /etc/kubernetes/manifests/kube-controller-manager.yaml \
    --attach-detach-reconcile-sync-period=15s
fi
if [ -f /etc/kubernetes/manifests/kube-apiserver.yaml ]; then
  set_static_pod_flags /etc/kubernetes/manifests/kube-apiserver.yaml \
    --default-unreachable-toleration-seconds=60 \
    --default-not-ready-toleration-seconds=60
fi

View file

@ -0,0 +1,78 @@
#!/usr/bin/env bash
#
# Runs AFTER `kubeadm join` has written /var/lib/kubelet/config.yaml.
# Patches kubelet config in place (parallel image pulls, eviction
# thresholds, priority-based shutdown grace, container log rotation)
# and then restarts kubelet. (The master-only controller-manager /
# apiserver flag patches live in k8s-node-containerd-setup.sh, not here.)
#
# Embedded into the cloud-init snippet base64-encoded by main.tf so
# YAML whitespace doesn't touch the heredoc bodies inside.
set -euo pipefail

KUBELET_CONFIG=/var/lib/kubelet/config.yaml

# kubeadm join is what creates the kubelet config; without it there is
# nothing to patch.
if [ ! -f "$KUBELET_CONFIG" ]; then
  echo "post-join-tune: /var/lib/kubelet/config.yaml not found — was kubeadm join run?" >&2
  exit 1
fi

# All edits are made on the parsed document rather than with sed/append.
# Assigning keys replaces whatever was there before, so the script can be
# re-run safely. The earlier text-based version deleted only the
# `systemReserved:` / `kubeReserved:` header lines on a second run, leaving
# their indented children behind and producing invalid YAML.
python3 - "$KUBELET_CONFIG" <<'KUBELET_PATCH'
import sys

import yaml

path = sys.argv[1]
with open(path) as f:
    cfg = yaml.safe_load(f) or {}

# Parallel image pulls.
cfg['serializeImagePulls'] = False
cfg['maxParallelImagePulls'] = 50

# Resource reservations and memory / disk eviction. Aggressive disk
# thresholds (15%/20%) prevent the 2026-03-13 containerd image-store
# corruption that took down k8s-node2.
cfg['systemReserved'] = {'memory': '512Mi', 'cpu': '200m'}
cfg['kubeReserved'] = {'memory': '512Mi', 'cpu': '200m'}
cfg['evictionHard'] = {
    'memory.available': '500Mi',
    'nodefs.available': '15%',
    'imagefs.available': '20%',
}
cfg['evictionSoft'] = {
    'memory.available': '1Gi',
    'nodefs.available': '20%',
    'imagefs.available': '25%',
}
cfg['evictionSoftGracePeriod'] = {
    'memory.available': '30s',
    'nodefs.available': '60s',
    'imagefs.available': '30s',
}
# NOTE(review): the cloud-init snippet turns swap off before the join, so
# this setting presumably has no effect unless swap is re-enabled — confirm
# it is still wanted.
cfg['memorySwap'] = {'swapBehavior': 'LimitedSwap'}

# Container log rotation.
cfg['containerLogMaxSize'] = '10Mi'
cfg['containerLogMaxFiles'] = 3

# Priority-based shutdown grace replaces the two flat grace-period settings,
# so those are removed. The per-priority values add up to 310 seconds.
for key in ('shutdownGracePeriod', 'shutdownGracePeriodCriticalPods'):
    cfg.pop(key, None)
cfg['shutdownGracePeriodByPodPriority'] = [
    {'priority': 0, 'shutdownGracePeriodSeconds': 20},
    {'priority': 200000, 'shutdownGracePeriodSeconds': 20},
    {'priority': 400000, 'shutdownGracePeriodSeconds': 30},
    {'priority': 600000, 'shutdownGracePeriodSeconds': 30},
    {'priority': 800000, 'shutdownGracePeriodSeconds': 90},
    {'priority': 1000000, 'shutdownGracePeriodSeconds': 30},
    {'priority': 1200000, 'shutdownGracePeriodSeconds': 30},
    {'priority': 2000000000, 'shutdownGracePeriodSeconds': 30},
    {'priority': 2000001000, 'shutdownGracePeriodSeconds': 30},
]

with open(path, 'w') as f:
    yaml.dump(cfg, f, default_flow_style=False)
KUBELET_PATCH

# kubelet was already started by the preceding cloud-init runcmd line, so
# restart (not start) it to load the new config.
systemctl restart kubelet

View file

@ -0,0 +1,104 @@
# ---------------------------------------------------------------------------
# Inputs. Secret material is marked sensitive so Terraform redacts it in
# plan/apply output.
# ---------------------------------------------------------------------------
variable "proxmox_host" {
  type        = string
  description = "Proxmox host the provisioners connect to over SSH."
}

variable "proxmox_user" {
  type        = string
  description = "SSH user on the Proxmox host."
}

variable "cloud_image_url" {
  type        = string
  description = "URL the cloud image is downloaded from when image_path does not exist yet."
}

variable "image_path" {
  type        = string
  description = "Path on the Proxmox host where the cloud image is stored."
}

variable "template_id" {
  type        = number
  default     = 8000
  description = "VMID of the template VM."
}

variable "template_name" {
  type        = string
  description = "Name given to the template VM."
}

variable "snippet_name" {
  type        = string
  description = "File name of the rendered cloud-init snippet under /var/lib/vz/snippets."
}

variable "user_passwd" {
  type        = string
  sensitive   = true
  description = "Hashed password for the cloud-init user."
}

variable "k8s_join_command" {
  type        = string
  default     = ""
  description = "Command cloud-init runs to join the cluster (used when is_k8s_template is true)."
}

variable "containerd_config_update_command" {
  type        = string
  default     = ""
  description = "DEPRECATED: was inlined into write_files via indent(); the heredoc-TOML interaction broke containerd config parsing on node5 v1 boot 2026-05-26. The k8s setup script is now bundled inside the module at k8s-node-containerd-setup.sh — pass nothing here. Kept to avoid breaking stacks that still reference it; ignored when is_k8s_template=true."
}

variable "is_k8s_template" {
  type        = bool
  description = "When true, the Kubernetes sections of the cloud-init template and both node scripts are included."
}

variable "ssh_private_key" {
  type        = string
  default     = ""
  sensitive   = true
  description = "Private key used for the SSH connection to the Proxmox host."
}

variable "ssh_public_key" {
  type        = string
  default     = ""
  description = "Public key added to the cloud-init user's authorized keys."
}

variable "provision_cmds" {
  type        = list(string)
  default     = []
  description = "Extra commands appended to the cloud-init runcmd list."
}
# Creates the cloud-init VM template on the Proxmox host over SSH. Nothing is
# done when a VM with var.template_id already exists.
resource "null_resource" "create_template_remote" {
  # SSH connection to Proxmox
  connection {
    type        = "ssh"
    user        = var.proxmox_user
    host        = var.proxmox_host
    private_key = var.ssh_private_key
  }

  # Commands executed *on Proxmox host*
  provisioner "remote-exec" {
    inline = [
      "set -e",
      # Download the cloud image if missing. The download goes to a .part
      # file and is renamed only after wget succeeded: `wget -O` creates its
      # output file even when the transfer fails, and a truncated file at
      # image_path would make every later run skip the download and import a
      # broken image. wget and mv are separate statements so `set -e` aborts
      # on a failed download.
      "if [ ! -f ${var.image_path} ]; then wget -O ${var.image_path}.part ${var.cloud_image_url}; mv ${var.image_path}.part ${var.image_path}; fi",
      # create template only if not existing
      "if ! qm status ${var.template_id} >/dev/null 2>&1; then",
      "  echo 'Creating cloud-init template...';",
      "  qm create ${var.template_id} --name ${var.template_name} --memory 8192 --cores 8 --net0 virtio,bridge=vmbr0;",
      "  qm importdisk ${var.template_id} ${var.image_path} local-lvm;",
      "  qm set ${var.template_id} --scsihw virtio-scsi-pci --scsi0 local-lvm:vm-${var.template_id}-disk-0;",
      "  qm set ${var.template_id} --ide2 local-lvm:cloudinit;",
      "  qm set ${var.template_id} --boot c --bootdisk scsi0;",
      "  qm set ${var.template_id} --serial0 socket --vga serial0;",
      "  qm template ${var.template_id};",
      "else",
      "  echo 'Template ${var.template_id} already exists — skipping.';",
      "fi"
    ]
  }
}
# Renders cloud_init.yaml and uploads the result to the Proxmox snippets
# directory as var.snippet_name.
resource "null_resource" "upload_cloud_init" {
  # Force recreate (and therefore re-upload) when the template, either node
  # script, or any rendered input changes.
  triggers = {
    file_hash             = filesha256("${path.module}/cloud_init.yaml")
    setup_script_hash     = var.is_k8s_template ? filesha256("${path.module}/k8s-node-containerd-setup.sh") : ""
    post_join_script_hash = var.is_k8s_template ? filesha256("${path.module}/k8s-node-post-join-tune.sh") : ""
    provision_cmds        = join(", ", var.provision_cmds)
    is_k8s_template       = var.is_k8s_template
    passwd                = var.user_passwd
    k8s_join_command      = var.k8s_join_command
    ssh_public_key        = var.ssh_public_key
  }

  connection {
    type        = "ssh"
    host        = var.proxmox_host
    user        = var.proxmox_user
    private_key = var.ssh_private_key
  }

  # The snippets directory must exist before the file is uploaded.
  provisioner "remote-exec" {
    inline = ["mkdir -p /var/lib/vz/snippets"]
  }

  provisioner "file" {
    destination = "/var/lib/vz/snippets/${var.snippet_name}"
    content = templatefile("${path.module}/cloud_init.yaml", {
      is_k8s_template    = var.is_k8s_template
      authorized_ssh_key = var.ssh_public_key
      passwd             = var.user_passwd
      provision_cmds     = var.provision_cmds
      k8s_join_command   = var.k8s_join_command
      # The node scripts are only read and base64-encoded for k8s templates.
      k8s_node_setup_script_b64     = var.is_k8s_template ? base64encode(file("${path.module}/k8s-node-containerd-setup.sh")) : ""
      k8s_node_post_join_script_b64 = var.is_k8s_template ? base64encode(file("${path.module}/k8s-node-post-join-tune.sh")) : ""
    })
  }
}

313
modules/create-vm/main.tf Normal file
View file

@ -0,0 +1,313 @@
# ---------------------------------------------------------------------------
# Variables — required
# ---------------------------------------------------------------------------
variable "vm_name" { type = string }

variable "vmid" {
  type    = number
  default = 0
}

variable "cisnippet_name" {
  type    = string
  default = ""
}

variable "bridge" { type = string }

# ---------------------------------------------------------------------------
# Variables — VM sizing
# ---------------------------------------------------------------------------
variable "vm_cpus" {
  type    = number
  default = 4
}

variable "cpu_sockets" {
  type    = number
  default = 1
}

variable "vm_mem_mb" {
  type    = number
  default = 8192
}

variable "vm_disk_size" {
  type    = string
  default = "64G"
}

variable "balloon" {
  type    = number
  default = 0 # 0 = disabled (recommended for k8s nodes)
}

# ---------------------------------------------------------------------------
# Variables — VM identity & networking
# ---------------------------------------------------------------------------
variable "vm_mac_address" {
  type    = string
  default = null
}

variable "vlan_tag" {
  type    = string
  default = null
}

variable "ipconfig0" {
  type    = string
  default = "ip=dhcp,ip6=dhcp"
}

# ---------------------------------------------------------------------------
# Variables — boot & hardware
# ---------------------------------------------------------------------------
variable "template_name" {
  type    = string
  default = "" # empty = no clone (for importing existing VMs)
}

variable "scsihw" {
  type    = string
  default = "virtio-scsi-pci"
}

variable "boot" {
  type    = string
  default = "order=scsi0"
}

variable "boot_disk" {
  type    = string
  default = "" # e.g., "scsi0"; only set if boot = "c" (legacy)
}

variable "disk_slot" {
  type    = string
  default = "scsi0" # which SCSI slot the OS disk is on
}

variable "agent" {
  type    = number
  default = 1
}

variable "qemu_os" {
  type    = string
  default = "l26"
}

# NOTE(review): not referenced by the proxmox_vm_qemu resource in this file.
variable "numa" {
  type    = bool
  default = false
}

variable "machine" {
  type    = string
  default = "" # empty = provider default. Use "q35" for GPU passthrough
}

# ---------------------------------------------------------------------------
# Variables — startup/shutdown ordering
# ---------------------------------------------------------------------------
variable "startup_order" {
  type    = number
  default = -1
}

variable "startup_delay" {
  type    = number
  default = -1
}

variable "shutdown_timeout" {
  type    = number
  default = -1
}

# ---------------------------------------------------------------------------
# Variables — cloud-init (optional; disable for non-cloud-init VMs)
# ---------------------------------------------------------------------------
variable "use_cloud_init" {
  type    = bool
  default = true
}

variable "ssh_keys" {
  type    = string
  default = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDHLhYDfyx237eJgOGVoJRECpUS95+7rEBS9vacsIxtx devvm"
}

# ---------------------------------------------------------------------------
# Variables — GPU / PCI passthrough
# ---------------------------------------------------------------------------
# NOTE(review): not referenced by the proxmox_vm_qemu resource in this file.
variable "hostpci0" {
  type    = string
  default = "" # e.g., "0000:06:00.0" for Tesla T4 passthrough
}

# ---------------------------------------------------------------------------
# Variables — disk I/O throttling (0 = uncapped)
# ---------------------------------------------------------------------------
# Caps a single VM's share of the underlying disk so that a runaway workload
# (e.g. the 2026-05-23/26 alloy IO storm, memory id=2726) cannot wedge the
# whole Proxmox host's sdc thin pool. Values are inferred from the PVE RRD
# p99/max observed in /nodes/pve/qemu/<vmid>/rrddata.
# NOTE(review): the original header gave the unit as bytes/sec, but the
# variable names say "mbps" — confirm which unit the provider expects.
variable "mbps_rd" {
  type    = number
  default = 0
}

variable "mbps_wr" {
  type    = number
  default = 0
}
# ---------------------------------------------------------------------------
# Resource
# ---------------------------------------------------------------------------
# One Proxmox QEMU VM on node "pve". Optional string inputs use "" as "not
# set" and are turned into null so the provider default applies. Every
# cloud-init argument is null when use_cloud_init is false.
resource "proxmox_vm_qemu" "cloudinit-vm" {
  # Identity and placement
  vmid        = var.vmid
  name        = var.vm_name
  target_node = "pve"

  # Clone source (no clone when template_name is empty)
  clone      = var.template_name != "" ? var.template_name : null
  full_clone = var.template_name != ""

  # Hardware
  memory   = var.vm_mem_mb
  balloon  = var.balloon
  scsihw   = var.scsihw
  machine  = var.machine != "" ? var.machine : null
  agent    = var.agent
  qemu_os  = var.qemu_os
  boot     = var.boot
  bootdisk = var.boot_disk != "" ? var.boot_disk : null

  # Power state
  vm_state           = "running"
  start_at_node_boot = true
  automatic_reboot   = false # never let Terraform reboot VMs; use /reboot-server skill instead

  # Cloud-Init configuration (only when use_cloud_init = true)
  # NOTE(review): ciuser / cipassword are hard-coded to "root" — confirm this
  # is acceptable for every VM built from this module.
  os_type      = var.use_cloud_init ? "cloud-init" : null
  cicustom     = var.use_cloud_init && var.cisnippet_name != "" ? "vendor=local:snippets/${var.cisnippet_name}" : null
  ciupgrade    = var.use_cloud_init ? true : null
  ciuser       = var.use_cloud_init ? "root" : null
  cipassword   = var.use_cloud_init ? "root" : null
  sshkeys      = var.use_cloud_init ? var.ssh_keys : null
  ipconfig0    = var.use_cloud_init ? var.ipconfig0 : null
  skip_ipv6    = var.use_cloud_init ? true : null
  nameserver   = var.use_cloud_init ? "1.1.1.1 8.8.8.8" : null
  searchdomain = var.use_cloud_init ? "viktorbarzin.lan" : null

  cpu {
    cores   = var.vm_cpus
    sockets = var.cpu_sockets
    type    = "host"
  }

  startup_shutdown {
    order            = var.startup_order
    shutdown_timeout = var.shutdown_timeout
    startup_delay    = var.startup_delay
  }

  serial {
    id = 0
  }

  disks {
    scsi {
      # The OS disk is declared on exactly one slot, chosen by var.disk_slot.
      dynamic "scsi0" {
        for_each = var.disk_slot == "scsi0" ? [1] : []
        content {
          disk {
            storage            = "local-lvm"
            size               = var.vm_disk_size
            discard            = true # TRIM passthrough to the LVM thin pool; reduces CoW overhead
            mbps_r_concurrent  = var.mbps_rd
            mbps_wr_concurrent = var.mbps_wr
          }
        }
      }
      dynamic "scsi1" {
        for_each = var.disk_slot == "scsi1" ? [1] : []
        content {
          disk {
            storage            = "local-lvm"
            size               = var.vm_disk_size
            discard            = true
            mbps_r_concurrent  = var.mbps_rd
            mbps_wr_concurrent = var.mbps_wr
          }
        }
      }
    }

    # The cloud-init drive is attached only for cloud-init VMs.
    dynamic "ide" {
      for_each = var.use_cloud_init ? [1] : []
      content {
        ide1 {
          cloudinit {
            storage = "local-lvm"
          }
        }
      }
    }
  }

  network {
    id      = 0
    bridge  = var.bridge
    model   = "virtio"
    macaddr = var.vm_mac_address
    tag     = var.vlan_tag
  }

  # Safety: never destroy the VM through Terraform, and ignore attributes
  # that legitimately change outside of it (CSI-attached PVC disks,
  # cloud-init values that drift after initial provisioning, UI edits).
  lifecycle {
    prevent_destroy = true
    ignore_changes = [
      # proxmox-csi dynamically attaches/detaches PVC disks. K8s workers
      # have up to ~30 slots in use simultaneously (k8s-node1: scsi1-29 +
      # unused0-29). The k8s-master only uses scsi0 (boot), so most of
      # these are no-ops for that VM but harmless.
      disks[0].scsi[0].scsi1,
      disks[0].scsi[0].scsi2,
      disks[0].scsi[0].scsi3,
      disks[0].scsi[0].scsi4,
      disks[0].scsi[0].scsi5,
      disks[0].scsi[0].scsi6,
      disks[0].scsi[0].scsi7,
      disks[0].scsi[0].scsi8,
      disks[0].scsi[0].scsi9,
      disks[0].scsi[0].scsi10,
      disks[0].scsi[0].scsi11,
      disks[0].scsi[0].scsi12,
      disks[0].scsi[0].scsi13,
      disks[0].scsi[0].scsi14,
      disks[0].scsi[0].scsi15,
      disks[0].scsi[0].scsi16,
      disks[0].scsi[0].scsi17,
      disks[0].scsi[0].scsi18,
      disks[0].scsi[0].scsi19,
      disks[0].scsi[0].scsi20,
      disks[0].scsi[0].scsi21,
      disks[0].scsi[0].scsi22,
      disks[0].scsi[0].scsi23,
      disks[0].scsi[0].scsi24,
      disks[0].scsi[0].scsi25,
      disks[0].scsi[0].scsi26,
      disks[0].scsi[0].scsi27,
      disks[0].scsi[0].scsi28,
      disks[0].scsi[0].scsi29,
      # cloud-init config may drift after first boot
      cicustom,
      ciupgrade,
      ciuser,
      cipassword,
      sshkeys,
      # SMBIOS UUID and vmgenid are auto-generated
      smbios,
      # Tags and description may be edited in Proxmox UI
      tags,
      desc,
      # Provider defaults that differ from imported state
      define_connection_info,
      full_clone,
      # scsihw varies per VM (virtio-scsi-pci / virtio-scsi-single / lsi)
      # and changing it on a running VM is risky; leave whatever's live.
      scsihw,
      # qemu_os is a hint to qemu about the guest OS; some live VMs have
      # "other" (unset originally) and the module's "l26" default would
      # otherwise force an unnecessary write on apply.
      qemu_os,
    ]
  }
}

View file

@ -0,0 +1,9 @@
# Provider requirements for this module. The Proxmox provider is pinned to
# one exact release-candidate version.
terraform {
  required_providers {
    proxmox = {
      source  = "telmate/proxmox"
      version = "3.0.2-rc07"
    }
  }
}

View file

@ -0,0 +1,49 @@
#!/usr/bin/env python3
"""Keeps only the N most recent tags per image in pull-through cache registries.
Deletes old tag links directly from the filesystem since the API doesn't support
DELETE on proxy registries. Run garbage-collect after to reclaim blob storage."""
import os
import shutil
import sys
def find_tag_dirs(storage):
    """Yield each `<repo>/_manifests/tags` directory found under `storage`.

    The walk does not descend into a tags directory once it has been found:
    its children are the per-tag directories, which the caller may delete.
    """
    for root, dirs, _ in os.walk(storage):
        parent = os.path.dirname(root)
        if os.path.basename(root) == "tags" and os.path.basename(parent) == "_manifests":
            dirs[:] = []  # prune: nothing below a tags dir needs visiting
            yield root


def prune_tags(base, keep):
    """Delete all but the `keep` most recently modified tags of every repo.

    Args:
        base: directory holding one sub-directory per registry; each registry
            keeps its repositories under `docker/registry/v2/repositories`.
        keep: number of tags to retain per repository (newest by directory
            mtime). Must be >= 0; 0 removes every tag.

    Returns:
        The number of tag directories that were removed.

    Raises:
        ValueError: if `keep` is negative (a negative slice index would
            otherwise delete an arbitrary subset of tags).
        OSError: if `base` cannot be listed or a tag cannot be removed.
    """
    if keep < 0:
        raise ValueError(f"keep must be >= 0, got {keep}")

    total_deleted = 0
    for registry_name in sorted(os.listdir(base)):
        storage = os.path.join(base, registry_name, "docker/registry/v2/repositories")
        if not os.path.isdir(storage):
            continue
        for tags_dir in find_tag_dirs(storage):
            # Repository name = path between the storage root and _manifests.
            repo = os.path.relpath(os.path.dirname(os.path.dirname(tags_dir)), storage)
            tag_times = []
            for tag in os.listdir(tags_dir):
                tag_path = os.path.join(tags_dir, tag)
                if os.path.isdir(tag_path):
                    tag_times.append((os.path.getmtime(tag_path), tag, tag_path))
            if len(tag_times) <= keep:
                continue
            tag_times.sort(reverse=True)  # newest first
            to_delete = tag_times[keep:]
            print(f"[{registry_name}/{repo}] {len(tag_times)} tags -> keeping {keep}, deleting {len(to_delete)}")
            for _, tag, tag_path in to_delete:
                shutil.rmtree(tag_path)
                total_deleted += 1
            print(" done")
    return total_deleted


def main(argv):
    """CLI entry point: `script [KEEP] [BASE]` (defaults: 10, /opt/registry/data)."""
    sys.stdout.reconfigure(line_buffering=True)
    keep = int(argv[1]) if len(argv) > 1 else 10
    base = argv[2] if len(argv) > 2 else "/opt/registry/data"
    if not os.path.isdir(base):
        sys.exit(f"error: registry data directory not found: {base}")
    total_deleted = prune_tags(base, keep)
    print(f"\nDeleted {total_deleted} tags. Run garbage-collect to reclaim space.")


if __name__ == "__main__":
    main(sys.argv)

View file

@ -0,0 +1,31 @@
# Registry configuration for the private registry: filesystem storage,
# htpasswd authentication, listening on port 5000.
version: 0.1

log:
  fields:
    service: registry-private

storage:
  cache:
    blobdescriptor: inmemory
  filesystem:
    rootdirectory: /var/lib/registry
    # NOTE(review): confirm `maxsize` is a parameter the filesystem driver
    # actually honours.
    maxsize: 100GiB
  delete:
    enabled: true
  maintenance:
    # Purge abandoned uploads older than 7 days, checking every 4 hours.
    uploadpurging:
      enabled: true
      age: 168h
      interval: 4h
      dryrun: false

auth:
  htpasswd:
    realm: 'Registry Realm'
    path: /auth/htpasswd

http:
  addr: ':5000'
  headers:
    X-Content-Type-Options: [nosniff]

health:
  storagedriver:
    enabled: true
    interval: 10s
    threshold: 3

View file

@ -0,0 +1,30 @@
# Template for a pull-through cache registry. `name` labels the log output
# and `remote_url` is the upstream registry being proxied.
version: 0.1

log:
  fields:
    service: registry-${name}

storage:
  cache:
    blobdescriptor: inmemory
  filesystem:
    rootdirectory: /var/lib/registry
  delete:
    enabled: true
  maintenance:
    # Purge abandoned uploads older than 24 hours, checking every 4 hours.
    uploadpurging:
      enabled: true
      age: 24h
      interval: 4h
      dryrun: false

http:
  addr: ':5000'
  draintimeout: 60s
  headers:
    X-Content-Type-Options: [nosniff]

health:
  storagedriver:
    enabled: true
    interval: 10s
    threshold: 3

proxy:
  remoteurl: ${remote_url}
  ttl: 0

View file

@ -0,0 +1,41 @@
# Registry configuration for the Docker Hub pull-through cache. Serves the
# registry API on port 5000 and debug / Prometheus metrics on port 5001.
version: 0.1

log:
  fields:
    service: registry

storage:
  cache:
    blobdescriptor: inmemory
  filesystem:
    rootdirectory: /var/lib/registry
  delete:
    enabled: true
  maintenance:
    # Purge abandoned uploads older than 24 hours, checking every 4 hours.
    uploadpurging:
      enabled: true
      age: 24h
      interval: 4h
      dryrun: false
    readonly:
      enabled: false

http:
  addr: ':5000'
  draintimeout: 60s
  headers:
    X-Content-Type-Options: [nosniff]
  debug:
    addr: ':5001'
    # https://ops.tips/gists/retrieving-docker-registry-metrics-using-prometheus/
    prometheus:
      enabled: true
      path: /metrics

health:
  storagedriver:
    enabled: true
    interval: 10s
    threshold: 3

# Enable proxy on nodes - https://github.com/containerd/containerd/blob/main/docs/cri/registry.md
proxy:
  remoteurl: https://registry-1.docker.io
  # Credentials are quoted so the substituted value is always read as one
  # string, even if it starts with a YAML indicator character, contains a
  # colon-space or space-hash sequence, or looks like a number or boolean.
  # (A value containing a single quote would need that quote doubled.)
  username: 'vbarzin@gmail.com'
  password: '${password}'
  ttl: 0

View file

@ -0,0 +1,158 @@
# Compose stack for the registry VM: five pull-through cache registries, an
# nginx front end and a web UI, all attached to one bridge network.
networks:
registry:
driver: bridge
services:
# registry:2 is pinned after the 2026-04-13 + 2026-04-19 orphan-index incidents.
# Floating tags were swapping to regressed versions between GC runs. Upgrade
# path: bump all five registry-* cache services in lockstep and bounce via
# `systemctl restart docker-compose-registry.service`.
# (Five, not six: registry-private has been removed from this file.)
registry-dockerhub:
image: registry:2.8.3
container_name: registry-dockerhub
restart: always
volumes:
- /opt/registry/data/dockerhub:/var/lib/registry
- /opt/registry/config-dockerhub.yml:/etc/docker/registry/config.yml:ro
networks:
- registry
ports:
# Only container port 5001 is published to the host; port 5000 is reachable
# solely over the `registry` network.
# NOTE(review): presumably the registry debug/metrics listener — confirm.
- "5001:5001"
healthcheck:
# Healthy when GET /v2/ on the registry's own port 5000 succeeds.
test: ["CMD", "sh", "-c", "wget -qO- http://127.0.0.1:5000/v2/ >/dev/null 2>&1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 10s
# Cache registry storing under data/ghcr and configured by config-ghcr.yml.
# No host ports are published; it is reached over the `registry` network.
registry-ghcr:
image: registry:2.8.3
container_name: registry-ghcr
restart: always
volumes:
- /opt/registry/data/ghcr:/var/lib/registry
- /opt/registry/config-ghcr.yml:/etc/docker/registry/config.yml:ro
networks:
- registry
healthcheck:
# Healthy when GET /v2/ on the registry's own port 5000 succeeds.
test: ["CMD", "sh", "-c", "wget -qO- http://127.0.0.1:5000/v2/ >/dev/null 2>&1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 10s
# Cache registry storing under data/quay and configured by config-quay.yml.
# No host ports are published; it is reached over the `registry` network.
registry-quay:
image: registry:2.8.3
container_name: registry-quay
restart: always
volumes:
- /opt/registry/data/quay:/var/lib/registry
- /opt/registry/config-quay.yml:/etc/docker/registry/config.yml:ro
networks:
- registry
healthcheck:
# Healthy when GET /v2/ on the registry's own port 5000 succeeds.
test: ["CMD", "sh", "-c", "wget -qO- http://127.0.0.1:5000/v2/ >/dev/null 2>&1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 10s
# Cache registry storing under data/k8s and configured by config-k8s.yml.
# No host ports are published; it is reached over the `registry` network.
registry-k8s:
image: registry:2.8.3
container_name: registry-k8s
restart: always
volumes:
- /opt/registry/data/k8s:/var/lib/registry
- /opt/registry/config-k8s.yml:/etc/docker/registry/config.yml:ro
networks:
- registry
healthcheck:
# Healthy when GET /v2/ on the registry's own port 5000 succeeds.
test: ["CMD", "sh", "-c", "wget -qO- http://127.0.0.1:5000/v2/ >/dev/null 2>&1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 10s
# Cache registry storing under data/kyverno and configured by config-kyverno.yml.
# No host ports are published; it is reached over the `registry` network.
registry-kyverno:
image: registry:2.8.3
container_name: registry-kyverno
restart: always
volumes:
- /opt/registry/data/kyverno:/var/lib/registry
- /opt/registry/config-kyverno.yml:/etc/docker/registry/config.yml:ro
networks:
- registry
healthcheck:
# Healthy when GET /v2/ on the registry's own port 5000 succeeds.
test: ["CMD", "sh", "-c", "wget -qO- http://127.0.0.1:5000/v2/ >/dev/null 2>&1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 10s
# registry-private decommissioned in Phase 4 of
# forgejo-registry-consolidation 2026-05-07 — image migration completed,
# cluster flipped to forgejo.viktorbarzin.me/viktor/<image>. The remaining
# five services on this VM are pull-through caches for upstream registries.
# After 1 week of no incidents, `rm -rf /opt/registry/data/private/` on the
# VM frees ~2.6 GB. The tarball break-glass under
# /opt/registry/data/private/_breakglass/ stays — it's how we recover
# infra-ci if Forgejo ever goes fully down.
# NOTE(review): the `rm -rf` above, as written, would also delete
# _breakglass/ — presumably everything EXCEPT that subdirectory is meant; confirm.
#
# nginx front end: publishes one host port per cache registry and starts only
# after all five registries report healthy.
nginx:
image: nginx:alpine
container_name: registry-nginx
restart: always
# 5050 dropped Phase 4 of forgejo-registry-consolidation 2026-05-07.
# NOTE(review): the nginx.conf in this same change only defines `listen 5000`
# and `listen 5010` servers — confirm 5020/5030/5040 have listeners somewhere.
ports:
- "5000:5000"
- "5010:5010"
- "5020:5020"
- "5030:5030"
- "5040:5040"
volumes:
- /opt/registry/nginx.conf:/etc/nginx/nginx.conf:ro
- /opt/registry/tls:/etc/nginx/tls:ro
# Named volume (declared at the bottom of the file) holding nginx's cache dir.
- nginx-cache:/var/cache/nginx
networks:
- registry
depends_on:
registry-dockerhub:
condition: service_healthy
registry-ghcr:
condition: service_healthy
registry-quay:
condition: service_healthy
registry-k8s:
condition: service_healthy
registry-kyverno:
condition: service_healthy
healthcheck:
# Probes /v2/ on nginx's own port 5000 listener only; the other published
# ports are not covered by this check.
test: ["CMD", "sh", "-c", "wget -qO- http://127.0.0.1:5000/v2/ >/dev/null 2>&1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
# Web UI (host port 8080) pointed at the registry-dockerhub instance only
# (SINGLE_REGISTRY=true); it waits for that registry to be healthy.
# NOTE(review): `:latest` is a floating tag, which the pinning comment at the
# top of this file warns against for registry images — consider pinning here too.
registry-ui:
image: joxit/docker-registry-ui:latest
container_name: registry-ui
restart: always
ports:
- "8080:80"
environment:
- NGINX_PROXY_PASS_URL=http://registry-dockerhub:5000
- DELETE_IMAGES=true
- SINGLE_REGISTRY=true
- SHOW_CONTENT_DIGEST=true
- SHOW_CATALOG_NB_TAGS=true
- CATALOG_ELEMENTS_LIMIT=1000
- TAGLIST_PAGE_SIZE=100
- REGISTRY_TITLE=viktorbarzin.me
networks:
- registry
depends_on:
registry-dockerhub:
condition: service_healthy
# Named volumes; nginx-cache is mounted by the nginx service at /var/cache/nginx.
volumes:
nginx-cache:

View file

@ -0,0 +1,158 @@
#!/usr/bin/env python3
"""Registry integrity scanner — two classes of brokenness.
1. Orphaned layer links: the cleanup-tags.sh + garbage-collect cycle can delete
blob data while leaving _layers/ link files intact. The registry then returns
HTTP 200 with 0 bytes for those layers (it finds the link, trusts the blob
exists, but the data is gone). Containerd sees "unexpected EOF".
Action: delete the orphan link so the next pull re-fetches cleanly.
2. Orphaned OCI-index children: an image index (multi-platform manifest list)
references child manifests by digest. If a child's blob has been deleted —
by a cleanup-tags.sh tag rmtree followed by garbage-collect walking the
children wrong (distribution/distribution#3324 class), or by an incomplete
`buildx --push` whose partial blob was later purged by `uploadpurging` —
the index survives but pulls fail with `manifest unknown`.
Action: log loudly. Deleting an index is a conscious decision (the image
was published; removing it breaks downstream consumers), so we surface
the problem and leave repair to a human or to the rebuild runbook.
Run after garbage-collect (Sunday 03:30) and daily (Mon-Sat 02:30).
"""
import argparse
import json
import os
import sys
# Line-buffer stdout so each print() is flushed as soon as its line is complete.
sys.stdout.reconfigure(line_buffering=True)
# CLI: optional positional data directory plus a --dry-run switch.
parser = argparse.ArgumentParser(description="Scan registry for orphaned blobs and indexes")
parser.add_argument("base", nargs="?", default="/opt/registry/data", help="Registry data directory")
parser.add_argument("--dry-run", action="store_true", help="Report but don't delete")
args = parser.parse_args()
BASE = args.base
DRY_RUN = args.dry_run
# Manifest media types treated as a multi-platform index / manifest list by Scan 2.
INDEX_MEDIA_TYPES = (
"application/vnd.oci.image.index.v1+json",
"application/vnd.docker.distribution.manifest.list.v2+json",
)
# Only the private R/W registry is authoritative for every child of every
# index it stores — we pushed those indexes ourselves, so a missing child is
# always a bug (the 2026-04-13 + 2026-04-19 failure mode).
#
# Pull-through caches (dockerhub, ghcr, quay, k8s, kyverno) are ALLOWED to
# have missing children: they only fetch what someone actually pulls.
# Uncached arm64 / arm / attestation variants of a multi-platform index are
# normal partial state, not orphans. Scanning them generates hundreds of
# false-positive warnings — noise that would mask the real signal from the
# private registry. Scan 2 is therefore private-only.
# NOTE(review): docker-compose.yml in this same change says registry-private
# was decommissioned 2026-05-07; Scan 2 only finds work while a `private`
# directory still exists under BASE — confirm whether it is still wanted.
INDEX_SCAN_REGISTRIES = ("private",)
# Running totals reported in the summary printed at the end of the run.
total_layer_removed = 0
total_layer_checked = 0
total_index_scanned = 0
total_index_orphans = 0
def load_manifest_blob(blobs_root, digest_hex):
    """Load a manifest blob from the registry blob store and parse it as JSON.

    Args:
        blobs_root: Path of the ``.../blobs/sha256`` directory.
        digest_hex: Hex digest (no ``sha256:`` prefix) of the blob to load; the
            blob is read from ``<blobs_root>/<first 2 hex chars>/<digest>/data``.

    Returns:
        The parsed manifest as a ``dict``, or ``None`` when the blob file is
        missing, unreadable, not valid JSON, or valid JSON that is not an
        object. Callers use ``.get(...)`` on the result, so a list, number or
        string must be rejected here instead of raising AttributeError later.
    """
    blob_path = os.path.join(blobs_root, digest_hex[:2], digest_hex, "data")
    if not os.path.isfile(blob_path):
        return None
    try:
        with open(blob_path, "rb") as f:
            # Read at most 1 MiB; anything larger is truncated, fails to parse
            # below and is therefore treated as "not a manifest".
            raw = f.read(1024 * 1024)
    except OSError:
        return None
    try:
        manifest = json.loads(raw)
    except (json.JSONDecodeError, UnicodeDecodeError):
        return None
    # Only a JSON object can be a manifest.
    return manifest if isinstance(manifest, dict) else None
# Main scan: every entry under BASE that contains a
# docker/registry/v2/repositories directory is treated as one registry's data dir.
for registry_name in sorted(os.listdir(BASE)):
repos_dir = os.path.join(BASE, registry_name, "docker/registry/v2/repositories")
blobs_root = os.path.join(BASE, registry_name, "docker/registry/v2/blobs/sha256")
if not os.path.isdir(repos_dir):
continue
# Walk the whole repository tree; only directories whose path ends in one of
# the two suffixes tested below are acted on.
for root, _, _ in os.walk(repos_dir):
# --- Scan 1: orphan layer links ----------------------------------------
if root.endswith("/_layers/sha256"):
# Repository name = path between repos_dir and the /_layers/sha256 suffix.
repo = root.replace(repos_dir + "/", "").replace("/_layers/sha256", "")
for digest_dir in os.listdir(root):
link_file = os.path.join(root, digest_dir, "link")
if not os.path.isfile(link_file):
continue
total_layer_checked += 1
# A layer link is healthy when the blob it names still has a data file.
blob_data = os.path.join(blobs_root, digest_dir[:2], digest_dir, "data")
if os.path.isfile(blob_data):
continue
prefix = "[DRY RUN] " if DRY_RUN else ""
print(f"{prefix}[{registry_name}/{repo}] removing orphaned layer link: {digest_dir[:12]}...")
if not DRY_RUN:
# Removes the whole <digest>/ link directory, not just the link file.
import shutil
shutil.rmtree(os.path.join(root, digest_dir))
# NOTE(review): indentation is not visible in this view — confirm whether this
# counter is also meant to advance in dry-run mode (the summary line reports it
# under the "DRY RUN" prefix as well).
total_layer_removed += 1
# --- Scan 2: orphan OCI-index children (private registry only) --------
elif root.endswith("/_manifests/revisions/sha256") and registry_name in INDEX_SCAN_REGISTRIES:
repo = root.replace(repos_dir + "/", "").replace("/_manifests/revisions/sha256", "")
for digest_dir in os.listdir(root):
# Manifest revision entry. Load the blob it points to.
manifest = load_manifest_blob(blobs_root, digest_dir)
if manifest is None:
continue
# Only image indexes / manifest lists have children to verify.
media_type = manifest.get("mediaType", "")
if media_type not in INDEX_MEDIA_TYPES:
continue
total_index_scanned += 1
# Per-repo revision links — serving a child manifest via the API
# requires <repo>/_manifests/revisions/sha256/<child-digest>/link
# to exist. The blob data alone is not enough: cleanup-tags.sh
# rmtrees tag dirs (which on 2.8.x also orphans the per-repo
# revision links for index children), while the upstream blob
# data survives in /blobs/. That's exactly the 2026-04-19
# failure mode — the probe sees 404 even though the blob file
# is still on disk.
revisions_root = os.path.dirname(root) # …/_manifests/revisions
for child in manifest.get("manifests", []):
child_digest = child.get("digest", "")
# Children with a non-sha256 (or missing) digest are skipped, not reported.
if not child_digest.startswith("sha256:"):
continue
child_hex = child_digest[len("sha256:"):]
child_link = os.path.join(revisions_root, "sha256", child_hex, "link")
if os.path.isfile(child_link):
continue
# Missing revision link: gather platform and blob state for the warning.
platform = child.get("platform", {})
arch = platform.get("architecture", "?")
os_ = platform.get("os", "?")
child_blob = os.path.join(blobs_root, child_hex[:2], child_hex, "data")
blob_state = "blob-data-present" if os.path.isfile(child_blob) else "blob-data-gone"
print(
f"WARNING [{registry_name}/{repo}] ORPHAN INDEX: "
f"{digest_dir[:12]} references missing child {child_hex[:12]} "
f"({arch}/{os_}, {blob_state}) — registry returns 404, rebuild required"
)
total_index_orphans += 1
# Summary. Orphaned index children are only reported, never deleted.
mode = "DRY RUN — " if DRY_RUN else ""
print(f"\n{mode}Layer scan: checked {total_layer_checked} links, removed {total_layer_removed} orphaned.")
print(f"{mode}Index scan: inspected {total_index_scanned} image indexes, found {total_index_orphans} orphaned children.")
if total_index_orphans > 0:
print(f"\nACTION REQUIRED: {total_index_orphans} orphan index child(ren) detected. "
"See docs/runbooks/registry-rebuild-image.md — the affected image must be rebuilt "
"(a registry DELETE on an index is a conscious decision, not an automated repair).")

View file

@ -0,0 +1,174 @@
# Process-level settings: one worker per core, warnings and above to the error
# log, and the pid file kept under /tmp.
worker_processes auto;
error_log /var/log/nginx/error.log warn;
pid /tmp/nginx.pid;

events {
    worker_connections 1024;
}
http {
# Shared on-disk cache zone "registry" used by the blob locations below:
# 500 MB of key metadata, at most 50 GB of data, entries dropped after 24h idle.
proxy_cache_path /var/cache/nginx/registry
    levels=1:2
    keys_zone=registry:500m
    max_size=50g
    inactive=24h
    use_temp_path=off;

# Access-log format that also records the chosen upstream, its response time
# and the cache status of each request.
log_format registry '$remote_addr [$time_local] "$request" '
                    '$status $body_bytes_sent '
                    'upstream=$upstream_addr time=$upstream_response_time '
                    'cache=$upstream_cache_status';
access_log /var/log/nginx/access.log registry;
# --- Upstreams ---
# NOTE(review): docker-compose.yml in this same change also publishes
# 5020/5030/5040 on this container, but only the two upstreams/servers for
# 5000 and 5010 are defined in this file — confirm the others are not missing.
upstream dockerhub {
    server registry-dockerhub:5000;
    keepalive 32;
}
# GHCR cache registry backend; up to 32 idle keepalive connections per worker.
upstream ghcr {
    server registry-ghcr:5000;
    keepalive 32;
}
# `upstream private` removed in Phase 4 of forgejo-registry-consolidation
# 2026-05-07. The /v2/ private registry is now Forgejo at
# forgejo.viktorbarzin.me/viktor/.
# --- Docker Hub (port 5000) ---
# Front end for the `dockerhub` upstream: blob requests are cached on disk,
# every other /v2/ request is passed straight through.
server {
    listen 5000;
    server_name _;

    # No request-body size cap; request bodies are streamed, responses buffered.
    client_max_body_size 0;
    proxy_request_buffering off;
    proxy_buffering on;

    # Blobs are addressed by sha256 digest, so they never change and can be cached.
    location ~ /v2/.*/blobs/ {
        proxy_pass http://dockerhub;
        proxy_http_version 1.1;
        proxy_set_header Host $host;
        proxy_set_header Connection "";

        proxy_read_timeout 900;
        proxy_send_timeout 900;

        # Upstream 502/503/504 responses are intercepted and answered by
        # the @upstream_error location instead of being relayed.
        proxy_intercept_errors on;
        error_page 502 503 504 = @upstream_error;

        # Cache only GET 200 responses (for 24h), starting from the second
        # request for a key; concurrent misses for one key wait on a lock.
        proxy_cache registry;
        proxy_cache_methods GET;
        proxy_cache_min_uses 2;
        proxy_cache_valid 200 24h;
        proxy_cache_valid any 0;
        proxy_cache_use_stale updating;
        proxy_cache_lock on;
        proxy_cache_lock_timeout 5m;
        proxy_cache_lock_age 5m;
    }

    # Everything else under /v2/ (manifests, tags — mutable) is never cached.
    location /v2/ {
        proxy_pass http://dockerhub;
        proxy_http_version 1.1;
        proxy_set_header Host $host;
        proxy_set_header Connection "";

        proxy_cache off;

        proxy_read_timeout 900;
        proxy_send_timeout 900;
    }

    # Target of the error_page redirect in the blob location.
    location @upstream_error {
        return 502 "upstream error";
    }

    # Health probe: proxies to the registry's /v2/ with short timeouts, unlogged.
    location /healthz {
        proxy_pass http://dockerhub/v2/;
        proxy_connect_timeout 3s;
        proxy_read_timeout 5s;
        access_log off;
    }

    # Any other path gets a static 200.
    location / {
        return 200 'ok';
        add_header Content-Type text/plain;
    }
}
# --- GHCR (port 5010) ---
# Front end for the `ghcr` upstream: blob requests are cached on disk,
# every other /v2/ request is passed straight through.
server {
    listen 5010;
    server_name _;

    # No request-body size cap; request bodies are streamed, responses buffered.
    client_max_body_size 0;
    proxy_request_buffering off;
    proxy_buffering on;

    # Blobs are addressed by sha256 digest, so they never change and can be cached.
    location ~ /v2/.*/blobs/ {
        proxy_pass http://ghcr;
        proxy_http_version 1.1;
        proxy_set_header Host $host;
        proxy_set_header Connection "";

        proxy_read_timeout 900;
        proxy_send_timeout 900;

        # Upstream 502/503/504 responses are intercepted and answered by
        # the @upstream_error location instead of being relayed.
        proxy_intercept_errors on;
        error_page 502 503 504 = @upstream_error;

        # Cache only GET 200 responses (for 24h), starting from the second
        # request for a key; concurrent misses for one key wait on a lock.
        proxy_cache registry;
        proxy_cache_methods GET;
        proxy_cache_min_uses 2;
        proxy_cache_valid 200 24h;
        proxy_cache_valid any 0;
        proxy_cache_use_stale updating;
        proxy_cache_lock on;
        proxy_cache_lock_timeout 5m;
        proxy_cache_lock_age 5m;
    }

    # Everything else under /v2/ (manifests, tags — mutable) is never cached.
    location /v2/ {
        proxy_pass http://ghcr;
        proxy_http_version 1.1;
        proxy_set_header Host $host;
        proxy_set_header Connection "";

        proxy_cache off;

        proxy_read_timeout 900;
        proxy_send_timeout 900;
    }

    # Target of the error_page redirect in the blob location.
    location @upstream_error {
        return 502 "upstream error";
    }

    # Health probe: proxies to the registry's /v2/ with short timeouts, unlogged.
    location /healthz {
        proxy_pass http://ghcr/v2/;
        proxy_connect_timeout 3s;
        proxy_read_timeout 5s;
        access_log off;
    }

    # Any other path gets a static 200.
    location / {
        return 200 'ok';
        add_header Content-Type text/plain;
    }
}
# --- Private R/W Registry (port 5050) decommissioned Phase 4 2026-05-07 ---
# The TLS port 5050 server block previously fronted `registry-private`.
# Migrated to Forgejo at forgejo.viktorbarzin.me/viktor/. Both
# docker-compose.yml and this nginx config no longer reference port 5050.
}

View file

@ -0,0 +1,474 @@
# Provider requirements: this module only needs the hashicorp/kubernetes provider.
terraform {
  required_providers {
    kubernetes = {
      source = "hashicorp/kubernetes"
    }
  }
}
# Per-site Anubis reverse proxy.
# Sits between Traefik and the real backend. On first visit, serves a
# proof-of-work challenge; on success, drops a long-lived JWT cookie and
# proxies the request through to `target_url`.
#
# Sharing a single ed25519 signing key across instances + COOKIE_DOMAIN at
# the registrable domain means a token solved on one viktorbarzin.me subdomain
# is honoured by every other Anubis-fronted site.
# --- Identity and routing ---------------------------------------------------

variable "name" {
  type        = string
  description = "Short logical name (e.g. \"blog\"). Used to derive Service / Deployment / Secret names as anubis-<name>."
}

variable "namespace" {
  type        = string
  description = "Namespace to deploy into — typically the same as the protected backend service."
}

variable "target_url" {
  type        = string
  description = "Backend URL Anubis forwards passing requests to (e.g. http://blog.website.svc.cluster.local)."
}

# --- Challenge / cookie behaviour -------------------------------------------

variable "cookie_domain" {
  type        = string
  default     = "viktorbarzin.me"
  description = "Cookie domain — set to the registrable domain so a single PoW solve covers every Anubis-fronted subdomain."
}

variable "difficulty" {
  type        = number
  default     = 2
  description = "PoW difficulty (leading-zero hex chars). 2 = ~250ms desktop / ~700ms mobile. Bump for stronger filtering."
}

variable "cookie_expiration_hours" {
  type        = number
  default     = 720 # 30 days
  description = "Lifetime of the issued JWT cookie in hours."
}

# --- Image ------------------------------------------------------------------

variable "image_tag" {
  type        = string
  default     = "v1.25.0"
  description = "ghcr.io/techarohq/anubis tag — pin to a release, never :latest."
}
# Replica count override. Only null, 1 or 2 pass validation; null lets
# local.effective_replicas choose based on shared_store_url.
variable "replicas" {
  type        = number
  default     = null
  description = "Optional replica count override. When null, defaults to 1 if shared_store_url is null and 2 otherwise. Capped at 2 — Redis can handle more but anti-affinity assumes ≤2 replicas per Anubis instance on a 5-node cluster."

  validation {
    condition     = var.replicas == null ? true : (var.replicas >= 1 && var.replicas <= 2)
    error_message = "replicas must be 1 or 2 (or null to auto-pick from shared_store_url presence)."
  }
}

# Optional shared challenge store. Validation requires the exact
# redis://host:port/<db-index> shape, including the numeric DB index.
variable "shared_store_url" {
  type        = string
  default     = null
  description = "If set, Anubis stores in-flight challenge state in this Valkey/Redis-protocol URL instead of in-process memory, enabling HA across replicas. Format: redis://host:port/<db-index>. The DB index MUST be unique per Anubis instance (this module assumes 16 DBs available, common in standalone Redis). Cluster Redis is redis-master.redis.svc.cluster.local:6379 with HA via Sentinel + haproxy. Without this, replicas>1 causes ~50% PoW failures (challenge issued by pod A, solved against pod B → 500)."

  validation {
    condition     = var.shared_store_url == null || can(regex("^redis://[a-zA-Z0-9_.-]+:[0-9]+/[0-9]+$", var.shared_store_url))
    error_message = "shared_store_url must look like redis://host:port/<db-index> (explicit DB index required)."
  }
}
# Used for both the memory request and the memory limit of the container.
variable "memory" {
  type        = string
  default     = "128Mi"
  description = "requests==limits memory. Anubis docs suggest 128Mi handles many concurrent clients."
}

# When null, local.default_policy_yaml is used instead.
variable "policy_yaml" {
  type        = string
  default     = null
  description = "Override the strict default bot-policy YAML. Leave null to use the catch-all CHALLENGE policy."
}

# CPU request only; the deployment sets no CPU limit.
variable "cpu_request" {
  type        = string
  default     = "20m"
  description = "CPU request. PoW verification is server-cheap (just hash check)."
}
# Derived names, labels, replica count and the rendered bot-policy document.
locals {
  full_name = "anubis-${var.name}"
  labels = {
    "app"                          = local.full_name
    "app.kubernetes.io/name"       = "anubis"
    "app.kubernetes.io/instance"   = local.full_name
    "app.kubernetes.io/component"  = "ai-bot-challenge"
    "app.kubernetes.io/managed-by" = "terraform"
  }

  # Effective replicas: caller-override > shared-store-aware default.
  effective_replicas = coalesce(var.replicas, var.shared_store_url == null ? 1 : 2)

  # Anubis store config. With backend=valkey, multiple Anubis pods can share
  # in-flight PoW state and a challenge issued by pod A is verifiable by pod
  # B. Default backend is in-process memory which only works at replicas=1.
  # Empty string when no shared store is configured.
  store_yaml_block = var.shared_store_url == null ? "" : <<-EOT
    store:
    backend: valkey
    parameters:
    url: "${var.shared_store_url}"
  EOT

  # Strict bot policy. Default Anubis policy only WEIGHs Mozilla|Opera UAs
  # and lets unmatched UAs (curl, wget, Python-requests, scrapy, headless
  # CLI scrapers) fall through to ALLOW. We import the same upstream
  # snippets and append a catch-all CHALLENGE so anyone without JS+PoW
  # capability is filtered.
  default_policy_yaml = <<-EOT
    bots:
    # Hard-deny known-bad bots first runs before the method bypass so
    # a declared bad bot can't sneak through by sending a POST.
    - import: (data)/bots/_deny-pathological.yaml
    - import: (data)/bots/aggressive-brazilian-scrapers.yaml
    # Hard-deny declared AI/LLM crawlers (ClaudeBot, GPTBot, Bytespider, ).
    - import: (data)/meta/ai-block-aggressive.yaml
    # Whitelist legitimate search-engine crawlers (Googlebot, Bingbot, ).
    - import: (data)/crawlers/_allow-good.yaml
    # Challenge Firefox AI previews specifically.
    - import: (data)/clients/x-firefox-ai.yaml
    # Allow /.well-known, /robots.txt, /favicon.*, /sitemap.xml keeps
    # the internet working for benign crawlers and discovery clients.
    - import: (data)/common/keep-internet-working.yaml
    # Allow every non-GET request through. Rationale: AI scrapers steal
    # the body of GETs (page content) they don't POST. State-mutating
    # methods come from app XHRs (PrivateBin paste creation, Komga
    # uploads, SPA actions) and CORS preflight (OPTIONS). Challenging
    # those breaks the app, because the JS expects JSON and gets the
    # Anubis HTML challenge page. CrowdSec + rate-limit + per-app auth
    # already cover abuse on these methods.
    - name: allow-non-get-methods
    action: ALLOW
    expression: method != "GET"
    # Catch-all: every remaining (GET) request must solve the challenge.
    # This closes the "unmatched UA falls through to ALLOW" gap that
    # lets curl/wget/Python-requests scrape non-CDN-fronted hosts.
    - name: catchall-challenge
    path_regex: .*
    action: CHALLENGE
  EOT

  # Policy body before the store block: caller override, else the default.
  base_policy_yaml = coalesce(var.policy_yaml, local.default_policy_yaml)

  # Final policy YAML: defaults (or caller override) plus an optional store
  # block when shared_store_url is set. Store block is module-managed and
  # appended universally — callers passing a custom policy_yaml shouldn't
  # include their own `store:` block (they would collide).
  #
  # The store block is joined on exactly one newline: a caller policy without
  # a trailing newline used to get `store:` glued onto its last line. Output is
  # unchanged when there is no store block or the policy already ends in "\n".
  rendered_policy_yaml = (
    local.store_yaml_block == ""
    ? local.base_policy_yaml
    : "${trimsuffix(local.base_policy_yaml, "\n")}\n${local.store_yaml_block}"
  )
}
# ConfigMap carrying the rendered bot policy under the key botPolicies.yaml.
# The deployment mounts it at /config and points POLICY_FNAME at that file.
resource "kubernetes_config_map" "policy" {
  metadata {
    name      = "${local.full_name}-policy"
    namespace = var.namespace
    labels    = local.labels
  }

  data = {
    "botPolicies.yaml" = local.rendered_policy_yaml
  }
}
# ExternalSecret that materialises the ED25519 signing key as Secret
# "<full_name>-key" (data key `key`). The value comes from the `vault-kv`
# ClusterSecretStore, remote key `viktor`, property `anubis_ed25519_key`, and
# is refreshed hourly. Every instance reads the same key so JWTs are
# cross-validatable, enabling cross-subdomain SSO.
resource "kubernetes_manifest" "ed25519_secret" {
  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      name      = "${local.full_name}-key"
      namespace = var.namespace
    }

    spec = {
      refreshInterval = "1h"

      secretStoreRef = {
        name = "vault-kv"
        kind = "ClusterSecretStore"
      }

      target = {
        name           = "${local.full_name}-key"
        creationPolicy = "Owner"
      }

      data = [{
        secretKey = "key"
        remoteRef = {
          key      = "viktor"
          property = "anubis_ed25519_key"
        }
      }]
    }
  }
}
# The Anubis reverse-proxy Deployment: one container configured entirely
# through environment variables, with the signing key and the bot policy
# mounted read-only.
resource "kubernetes_deployment" "anubis" {
  metadata {
    name      = local.full_name
    namespace = var.namespace
    labels    = local.labels
  }

  spec {
    replicas = local.effective_replicas

    selector {
      match_labels = {
        app = local.full_name
      }
    }

    # Surge one new pod before taking an old one away.
    strategy {
      type = "RollingUpdate"
      rolling_update {
        max_surge       = 1
        max_unavailable = 0
      }
    }

    template {
      metadata {
        labels = local.labels
        annotations = {
          # Roll the deployment whenever the policy YAML changes — Anubis
          # reads the policy at startup, so a ConfigMap update alone
          # doesn't take effect until pods restart.
          "checksum/policy" = sha256(local.rendered_policy_yaml)
        }
      }

      spec {
        # Spread replicas across nodes to survive a single node failure.
        # DoNotSchedule (not ScheduleAnyway) so 2 replicas are forced onto
        # different hosts — otherwise the scheduler may pile them on the
        # same node and a single node reboot takes the whole Anubis instance
        # down despite replicas=2. On a 5-node cluster the spread is always
        # satisfiable; the worst case (4 nodes unavailable) leaves one
        # replica Pending, but the other keeps serving.
        topology_spread_constraint {
          max_skew           = 1
          topology_key       = "kubernetes.io/hostname"
          when_unsatisfiable = "DoNotSchedule"
          label_selector {
            match_labels = {
              app = local.full_name
            }
          }
        }

        container {
          name  = "anubis"
          image = "ghcr.io/techarohq/anubis:${var.image_tag}"

          # Proxy traffic on 8923, Prometheus metrics on 9090.
          port {
            name           = "http"
            container_port = 8923
          }
          port {
            name           = "metrics"
            container_port = 9090
          }

          env {
            name  = "BIND"
            value = ":8923"
          }
          env {
            name  = "METRICS_BIND"
            value = ":9090"
          }
          env {
            name  = "TARGET"
            value = var.target_url
          }
          env {
            name  = "DIFFICULTY"
            value = tostring(var.difficulty)
          }
          env {
            name  = "COOKIE_EXPIRATION_TIME"
            value = "${var.cookie_expiration_hours}h"
          }
          # Cross-subdomain SSO: cookie scoped to the registrable domain so
          # a JWT solved on any Anubis-fronted subdomain is honoured on every
          # other one. (COOKIE_DOMAIN and COOKIE_DYNAMIC_DOMAIN are mutually
          # exclusive — picking the explicit form.)
          env {
            name  = "COOKIE_DOMAIN"
            value = var.cookie_domain
          }
          env {
            name  = "COOKIE_SECURE"
            value = "true"
          }
          env {
            name  = "COOKIE_SAME_SITE"
            value = "Lax"
          }
          # Built-in robots.txt that disallows known AI scrapers — well-behaved
          # bots get blocked here without ever paying the PoW cost.
          env {
            name  = "SERVE_ROBOTS_TXT"
            value = "true"
          }
          # Drop cluster-internal IPs from XFF so Anubis sees the real client.
          env {
            name  = "XFF_STRIP_PRIVATE"
            value = "true"
          }
          env {
            name  = "SLOG_LEVEL"
            value = "INFO"
          }
          env {
            name = "ED25519_PRIVATE_KEY_HEX_FILE"
            # Mounted from the ESO-managed Secret below.
            value = "/keys/key"
          }
          env {
            name  = "POLICY_FNAME"
            value = "/config/botPolicies.yaml"
          }

          volume_mount {
            name       = "ed25519-key"
            mount_path = "/keys"
            read_only  = true
          }
          volume_mount {
            name       = "policy"
            mount_path = "/config"
            read_only  = true
          }

          # Memory request equals the memory limit; CPU has a request only.
          resources {
            requests = {
              cpu    = var.cpu_request
              memory = var.memory
            }
            limits = {
              memory = var.memory
            }
          }

          # Liveness + readiness on the metrics endpoint (zero auth, always 200).
          liveness_probe {
            http_get {
              path = "/metrics"
              port = "metrics"
            }
            initial_delay_seconds = 10
            period_seconds        = 30
            failure_threshold     = 3
          }
          readiness_probe {
            http_get {
              path = "/metrics"
              port = "metrics"
            }
            initial_delay_seconds = 2
            period_seconds        = 5
            failure_threshold     = 2
          }

          # Non-root uid/gid 1000, read-only root filesystem, no privilege
          # escalation and every Linux capability dropped.
          security_context {
            run_as_non_root            = true
            run_as_user                = 1000
            run_as_group               = 1000
            allow_privilege_escalation = false
            read_only_root_filesystem  = true
            capabilities {
              drop = ["ALL"]
            }
          }
        }

        # Signing key: only the `key` entry of the Secret is projected.
        volume {
          name = "ed25519-key"
          secret {
            secret_name = "${local.full_name}-key"
            items {
              key  = "key"
              path = "key"
            }
          }
        }
        volume {
          name = "policy"
          config_map {
            name = kubernetes_config_map.policy.metadata[0].name
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].template[0].spec[0].dns_config]
  }

  # The ExternalSecret must be applied before the Deployment.
  depends_on = [kubernetes_manifest.ed25519_secret]
}
# Service in front of the Anubis pods: 8080 -> container 8923 for proxied
# traffic and 9090 -> 9090 for metrics, annotated for Prometheus scraping.
resource "kubernetes_service" "anubis" {
  metadata {
    name      = local.full_name
    namespace = var.namespace
    labels    = local.labels
    annotations = {
      "prometheus.io/scrape" = "true"
      "prometheus.io/path"   = "/metrics"
      "prometheus.io/port"   = "9090"
    }
  }

  spec {
    selector = {
      app = local.full_name
    }

    port {
      name        = "http"
      port        = 8080
      target_port = 8923
      protocol    = "TCP"
    }

    port {
      name        = "metrics"
      port        = 9090
      target_port = 9090
      protocol    = "TCP"
    }
  }
}
# PodDisruptionBudget for the Anubis pods.
resource "kubernetes_pod_disruption_budget_v1" "anubis" {
  metadata {
    name      = local.full_name
    namespace = var.namespace
  }

  spec {
    # max_unavailable=1 means: at most one pod can be voluntarily disrupted
    # at a time. With replicas=2 this allows clean rolling drains (one pod
    # goes down, the other serves traffic, the first is recreated elsewhere).
    # With replicas=1 (no shared store) this is functionally equivalent to no
    # PDB: the drain proceeds, there is a brief outage, and a new pod
    # schedules elsewhere.
    # Was min_available=1 before 2026-05-16 which deadlocked drains on
    # single-replica instances (eviction API can never satisfy the
    # constraint at replicas=1). See PM-2026-05-11.
    max_unavailable = "1"

    selector {
      match_labels = {
        app = local.full_name
      }
    }
  }
}
# Outputs consumed by the ingress module so traffic is routed through Anubis.
output "service_name" {
  description = "ClusterIP service name. Pass this to ingress_factory's `service_name` so Traefik routes through Anubis."
  value       = kubernetes_service.anubis.metadata[0].name
}

output "service_port" {
  description = "Service port. Anubis listens on 8923 inside; the Service exposes 8080."
  value       = 8080
}

View file

@ -0,0 +1,431 @@
# Provider requirements: Cloudflare (4.x series) and Kubernetes.
terraform {
  required_providers {
    cloudflare = {
      source  = "cloudflare/cloudflare"
      version = "~> 4"
    }
    kubernetes = {
      source = "hashicorp/kubernetes"
    }
  }
}
# --- Basic identity / backend wiring ----------------------------------------

variable "name" {
  type = string
}

variable "service_name" {
  type    = string
  default = null # defaults to name
}

# Hostname label; local.effective_host falls back to `name` when this is null.
variable "host" {
  type    = string
  default = null
}

variable "namespace" {
  type = string
}

variable "external_name" {
  type    = string
  default = null
}

# Untyped; the default is the string "80".
variable "port" {
  default = "80"
}

# Required: no type and no default are declared.
variable "tls_secret_name" {}

variable "backend_protocol" {
  default = "HTTP"
}
# Auth posture selector. Accepts exactly "required" (default), "app", "public"
# or "none" — enforced by the validation block at the end. The long heredoc
# below is the user-facing description and is kept verbatim.
variable "auth" {
type = string
default = "required"
description = <<-EOT
Auth posture for this ingress. Pick by asking "what gates the app?":
* "required" (default, fail-closed): Authentik forward-auth gates every
request. Pick this when the backend has NO built-in user auth and
Authentik is the only thing standing between strangers and the app.
Examples: prowlarr, qbittorrent, netbox, phpipam, k8s-dashboard, any
admin UI shipped without its own login.
* "app": the backend handles its own user authentication (NextAuth,
Django sessions, OAuth, bearer-token API, etc.) and Authentik would
only get in the way. No Authentik middleware is attached; the app's
own login is the gate. Examples: immich, linkwarden, tandoor,
freshrss, affine, actualbudget, audiobookshelf, novelapp.
**Functionally identical to "none"** the distinct name exists to
record intent at the call site so future readers don't have to guess.
* "public": Authentik anonymous binding via the `public` outpost.
Strangers are auto-bound to the `guest` Authentik user; logged-in
users keep their identity in X-authentik-username. Only works for
top-level browser navigation CORS preflight rejects XHR/fetch and
automation can't replay the cookie dance. Audit trail, not a gate.
* "none": no Authentik middleware, no own-auth claim explicitly
public or unauthenticated-by-design. Use for: Anubis-fronted content
sites (where Anubis is the gate), native-client APIs that auth
themselves (Git, /v2/, WebDAV/CalDAV, CardDAV), webhook receivers,
OAuth callbacks, and Authentik outposts themselves.
**Anti-exposure rule** (the reason "app" exists as a distinct mode):
only pick "app" or "none" AFTER you have verified the app has its own
user auth (for "app") OR the endpoint is intentionally public (for
"none"). Picking either of these on a naked admin UI exposes it to the
internet. The default is "required" specifically so accidental omission
fails closed.
**Convention**: when using "app" or "none", add a comment line above
the `auth = "..."` line stating what gates the app or why it's public.
Future-you reads the call site, not the module description.
EOT
# Reject anything outside the four documented modes at plan time.
validation {
condition = contains(["required", "app", "public", "none"], var.auth)
error_message = "auth must be one of: required, app, public, none."
}
}
# --- Routing / middleware knobs ---------------------------------------------

variable "ingress_path" {
  type    = list(string)
  default = ["/"]
}

variable "max_body_size" {
  type        = string
  default     = null
  description = "Maximum request body size, e.g. '5g'. null = no limit (Traefik default). When set, a per-ingress Buffering middleware is created and attached."
}

variable "extra_annotations" {
  default = {}
}

variable "ssl_redirect" {
  type    = bool
  default = true
}

variable "allow_local_access_only" {
  type    = bool
  default = false
}

variable "root_domain" {
  type    = string
  default = "viktorbarzin.me"
}

variable "custom_content_security_policy" {
  type    = string
  default = null
}

variable "exclude_crowdsec" {
  type    = bool
  default = false
}

# When set, used verbatim as the host instead of "<host or name>.<root_domain>".
variable "full_host" {
  type    = string
  default = null
}

variable "extra_middlewares" {
  type    = list(string)
  default = []
}

variable "skip_default_rate_limit" {
  type    = bool
  default = false
}

variable "anti_ai_scraping" {
  type    = bool
  default = null # null = auto (enabled when not protected, disabled when protected)
}

# --- DNS --------------------------------------------------------------------

variable "dns_type" {
  type        = string
  default     = "none"
  description = "Cloudflare DNS: 'proxied' (CNAME to tunnel), 'non-proxied' (A/AAAA to public IP), or 'none'"

  validation {
    condition     = contains(["proxied", "non-proxied", "none"], var.dns_type)
    error_message = "dns_type must be 'proxied', 'non-proxied', or 'none'."
  }
}
# Uptime Kuma external monitor: when true, annotate the ingress so the
# external-monitor-sync CronJob creates a `[External] <name>` monitor pointing
# at https://<host>. Null means "follow dns_type": enabled whenever dns_type is
# not "none" (i.e. both 'proxied' and 'non-proxied'), which is what
# local.effective_external_monitor computes.
variable "external_monitor" {
  type        = bool
  default     = null
  description = "Enable Uptime Kuma external monitor. null = auto (enabled when dns_type != 'none', i.e. 'proxied' or 'non-proxied')."
}
variable "external_monitor_name" {
  type        = string
  default     = null
  description = "Override the monitor label. Defaults to the ingress hostname label (e.g. 'dawarich' for dawarich.viktorbarzin.me)."
}

# Cloudflare config defaults — override via variables if these change.
# Source of truth: config.tfvars (cloudflare_zone_id, cloudflare_tunnel_id, public_ip, public_ipv6)
variable "cloudflare_zone_id" {
  type    = string
  default = "fd2c5dd4efe8fe38958944e74d0ced6d"
}

variable "cloudflare_tunnel_id" {
  type    = string
  default = "75182cd7-bb91-4310-b961-5d8967da8b41"
}

variable "public_ip" {
  type    = string
  default = "176.12.22.76"
}

variable "public_ipv6" {
  type    = string
  default = "2001:470:6e:43d::2"
}

# --- Homepage dashboard -----------------------------------------------------

variable "homepage_group" {
  type    = string
  default = null # auto-detect from namespace
}

variable "homepage_enabled" {
  type    = bool
  default = true
}
locals {
  # Hostname served by this ingress: var.full_host wins when given; otherwise
  # it is "<host>.<root_domain>", with var.name standing in for a null var.host.
  effective_host = var.full_host == null ? "${var.host == null ? var.name : var.host}.${var.root_domain}" : var.full_host

  # Anti-AI middlewares: an explicit var.anti_ai_scraping always wins. Left
  # null, they default ON when no Authentik middleware fronts the ingress
  # (auth = "app": the app gates users itself; auth = "none": intentionally
  # public) and OFF for every other auth value, where the Authentik flow
  # already discourages bots.
  effective_anti_ai = var.anti_ai_scraping == null ? (var.auth == "app" || var.auth == "none") : var.anti_ai_scraping

  # Authentik forward-auth middleware, if any. Only "required" and "public"
  # attach one; "app" (backend has its own user auth) and "none" (public /
  # native-client API / webhook) both resolve to null. That distinction exists
  # for readers at the call site; the runtime effect is identical.
  auth_middleware = (
    var.auth == "public" ? "traefik-authentik-forward-auth-public@kubernetescrd" :
    var.auth == "required" ? "traefik-authentik-forward-auth@kubernetescrd" :
    null
  )

  # External monitoring: an explicit bool overrides; otherwise it is enabled
  # whenever the ingress gets a DNS record from this module (CF-proxied or
  # direct A/AAAA), i.e. dns_type is anything but "none".
  effective_external_monitor = var.external_monitor == null ? (var.dns_type != "none") : var.external_monitor

  # Annotations consumed by the monitor sync job:
  #   - effective value true            -> "true" (plus the optional name override)
  #   - caller explicitly passed false  -> "false" (explicit opt-out)
  #   - null with dns_type = "none"     -> nothing, so the sync script's default
  #     opt-in (any *.viktorbarzin.me ingress) keeps monitoring services that
  #     are publicly reachable via routes not managed here (e.g.
  #     helm-provisioned ingresses, services behind the cloudflared tunnel with
  #     DNS set elsewhere).
  external_monitor_annotations = local.effective_external_monitor ? merge(
    { "uptime.viktorbarzin.me/external-monitor" = "true" },
    var.external_monitor_name != null ? { "uptime.viktorbarzin.me/external-monitor-name" = var.external_monitor_name } : {},
    ) : (
    var.external_monitor == false ? { "uptime.viktorbarzin.me/external-monitor" = "false" } : {}
  )

  # Namespace -> Homepage dashboard group, listed group by group.
  ns_to_group = {
    # Infrastructure
    monitoring     = "Infrastructure"
    prometheus     = "Infrastructure"
    technitium     = "Infrastructure"
    traefik        = "Infrastructure"
    metallb-system = "Infrastructure"
    kyverno        = "Infrastructure"
    dbaas          = "Infrastructure"
    mailserver     = "Infrastructure"

    # Identity & Security
    authentik = "Identity & Security"
    crowdsec  = "Identity & Security"

    # Development & CI
    woodpecker = "Development & CI"
    forgejo    = "Development & CI"

    # Media & Entertainment
    immich    = "Media & Entertainment"
    servarr   = "Media & Entertainment"
    navidrome = "Media & Entertainment"

    # Smart Home
    frigate        = "Smart Home"
    home-assistant = "Smart Home"
    reverse-proxy  = "Smart Home"

    # Everything else
    ollama          = "AI & Data"
    nextcloud       = "Productivity"
    n8n             = "Automation"
    changedetection = "Automation"
    finance         = "Finance & Personal"
    homepage        = "Core Platform"
  }

  # Explicit var.homepage_group first, then the namespace mapping, then "Other".
  homepage_group = coalesce(
    var.homepage_group,
    lookup(local.ns_to_group, var.namespace, "Other"),
  )

  # Record name for Cloudflare: "@" for the zone apex, otherwise the host with
  # ".<root_domain>" removed.
  dns_name = local.effective_host != var.root_domain ? replace(local.effective_host, ".${var.root_domain}", "") : "@"

  # gethomepage.dev annotations; empty when the Homepage entry is disabled.
  # The display name swaps "-" and "_" for spaces; the icon file name drops "-".
  homepage_defaults = var.homepage_enabled ? {
    "gethomepage.dev/enabled" = "true"
    "gethomepage.dev/name"    = replace(replace(var.name, "-", " "), "_", " ")
    "gethomepage.dev/group"   = local.homepage_group
    "gethomepage.dev/href"    = "https://${local.effective_host}"
    "gethomepage.dev/icon"    = "${replace(var.name, "-", "")}.png"
  } : {}

  # Parse "5g"/"50m"/"1024k"/"42" into bytes, because Traefik's Buffering
  # middleware takes maxRequestBodyBytes as an integer. Capture group 0 is the
  # digits, group 1 the optional unit letter; an empty unit means bytes.
  body_size_match = var.max_body_size != null ? regex("^([0-9]+)([kmgKMG]?)$", var.max_body_size) : null
  body_size_unit_multiplier = var.max_body_size == null ? 0 : lookup(
    { g = 1073741824, m = 1048576, k = 1024 },
    lower(local.body_size_match[1]),
    1,
  )
  max_body_size_bytes = var.max_body_size != null ? tonumber(local.body_size_match[0]) * local.body_size_unit_multiplier : 0
}
# ExternalName Service backing the ingress. It is only created when
# var.external_name is set; the Service is named var.name (which the ingress
# backend uses by default) and resolves to var.external_name.
resource "kubernetes_service" "proxied-service" {
  count = var.external_name == null ? 0 : 1

  metadata {
    name      = var.name
    namespace = var.namespace
    labels = {
      "app" = var.name
    }
  }

  spec {
    type = var.external_name != null ? "ExternalName" : "ClusterIP"
    # Must be the external target, not var.name: pointing an ExternalName
    # Service at its own name never reaches the intended backend.
    external_name = var.external_name

    port {
      name        = "${var.name}-web"
      port        = var.port
      protocol    = "TCP"
      target_port = var.port
    }
  }
}
# Traefik ingress for the service: one TLS-terminated host rule on the
# websecure entrypoint, with the middleware chain and the Homepage / DNS /
# monitoring annotations derived from the module inputs.
resource "kubernetes_ingress_v1" "proxied-ingress" {
  metadata {
    namespace = var.namespace
    name      = var.name

    # Later arguments to merge() override earlier ones, so caller-supplied
    # extra_annotations can override the Homepage defaults and the Traefik keys.
    annotations = merge(
      {
        "traefik.ingress.kubernetes.io/router.entrypoints" = "websecure"
        # Middleware order is significant and kept as-is. Entries that evaluate
        # to null are dropped by compact(); caller extras come last.
        "traefik.ingress.kubernetes.io/router.middlewares" = join(",", compact(concat(
          [
            "traefik-retry@kubernetescrd",
            "traefik-error-pages@kubernetescrd",
            !var.skip_default_rate_limit ? "traefik-rate-limit@kubernetescrd" : null,
            var.custom_content_security_policy != null ? null : "traefik-csp-headers@kubernetescrd",
            !var.exclude_crowdsec ? "traefik-crowdsec@kubernetescrd" : null,
            local.effective_anti_ai ? "traefik-ai-bot-block@kubernetescrd" : null,
            local.effective_anti_ai ? "traefik-anti-ai-headers@kubernetescrd" : null,
            local.auth_middleware,
            var.allow_local_access_only ? "traefik-local-only@kubernetescrd" : null,
            var.custom_content_security_policy == null ? null : "${var.namespace}-custom-csp-${var.name}@kubernetescrd",
            var.max_body_size == null ? null : "${var.namespace}-buffering-${var.name}@kubernetescrd",
          ],
          var.extra_middlewares,
        )))
      },
      local.homepage_defaults,
      var.extra_annotations,
      var.dns_type != "none" ? { "cloudflare.viktorbarzin.me/dns-type" = var.dns_type } : {},
      local.external_monitor_annotations,
    )
  }

  spec {
    ingress_class_name = "traefik"

    tls {
      secret_name = var.tls_secret_name
      hosts       = [local.effective_host]
    }

    rule {
      host = local.effective_host

      http {
        # One path entry per element of var.ingress_path, all routed to the
        # same backend service and port.
        dynamic "path" {
          for_each = var.ingress_path
          iterator = route

          content {
            path = route.value

            backend {
              service {
                # Falls back to var.name when no explicit service name is given.
                name = var.service_name == null ? var.name : var.service_name

                port {
                  number = var.port
                }
              }
            }
          }
        }
      }
    }
  }
}
# Per-service CSP middleware. It exists only when
# var.custom_content_security_policy is set, and is referenced from the ingress
# as "<namespace>-custom-csp-<name>@kubernetescrd".
resource "kubernetes_manifest" "custom_csp" {
  count = var.custom_content_security_policy == null ? 0 : 1

  manifest = {
    apiVersion = "traefik.io/v1alpha1"
    kind       = "Middleware"

    metadata = {
      namespace = var.namespace
      name      = "custom-csp-${var.name}"
    }

    spec = {
      headers = {
        contentSecurityPolicy = var.custom_content_security_policy
      }
    }
  }
}
# Per-service Buffering middleware. It exists only when var.max_body_size is
# set. Traefik's default is unlimited; setting maxRequestBodyBytes enforces a
# limit (e.g. Forgejo container pushes can ship multi-GB layer blobs). The byte
# value comes from local.max_body_size_bytes.
resource "kubernetes_manifest" "buffering" {
  count = var.max_body_size == null ? 0 : 1

  manifest = {
    apiVersion = "traefik.io/v1alpha1"
    kind       = "Middleware"

    metadata = {
      namespace = var.namespace
      name      = "buffering-${var.name}"
    }

    spec = {
      buffering = {
        maxRequestBodyBytes = local.max_body_size_bytes
      }
    }
  }
}
# Cloudflare DNS records are created automatically according to var.dns_type.
# Proxied: one CNAME to the Cloudflare tunnel. Non-proxied: A + AAAA records
# pointing at the public IPs.

# dns_type = "proxied": proxied CNAME to <tunnel id>.cfargotunnel.com.
resource "cloudflare_record" "proxied" {
  count = var.dns_type == "proxied" ? 1 : 0

  zone_id         = var.cloudflare_zone_id
  name            = local.dns_name
  type            = "CNAME"
  content         = "${var.cloudflare_tunnel_id}.cfargotunnel.com"
  proxied         = true
  ttl             = 1
  allow_overwrite = true
}
# dns_type = "non-proxied": unproxied A record pointing at var.public_ip.
resource "cloudflare_record" "non_proxied_a" {
  count = var.dns_type == "non-proxied" ? 1 : 0

  zone_id         = var.cloudflare_zone_id
  name            = local.dns_name
  type            = "A"
  content         = var.public_ip
  proxied         = false
  ttl             = 1
  allow_overwrite = true
}
# dns_type = "non-proxied": unproxied AAAA record pointing at var.public_ipv6.
resource "cloudflare_record" "non_proxied_aaaa" {
  count = var.dns_type == "non-proxied" ? 1 : 0

  zone_id         = var.cloudflare_zone_id
  name            = local.dns_name
  type            = "AAAA"
  content         = var.public_ipv6
  proxied         = false
  ttl             = 1
  allow_overwrite = true
}

View file

@ -0,0 +1,88 @@
# Inputs for the NFS-backed PersistentVolume / PersistentVolumeClaim pair.

# Shared by the PV, the PVC and the CSI volume handle.
variable "name" {
  type        = string
  description = "Unique name for PV and PVC (convention: <service>-<purpose>)"
}

variable "namespace" {
  type        = string
  description = "Kubernetes namespace for the PVC"
}

variable "nfs_server" {
  type        = string
  description = "NFS server address"
}

variable "nfs_path" {
  type        = string
  description = "NFS export path (e.g. /mnt/main/myservice)"
}

# Applied to both the PV capacity and the PVC request.
variable "storage" {
  type        = string
  default     = "10Gi"
  description = "Storage capacity (informational for NFS)"
}

# Applied to both the PV and the PVC.
variable "access_modes" {
  type        = list(string)
  default     = ["ReadWriteMany"]
  description = "PV/PVC access modes"
}
# PersistentVolume in the "nfs-truenas" storage class, served through the
# nfs.csi.k8s.io CSI driver from var.nfs_server:var.nfs_path. The reclaim
# policy is Retain.
resource "kubernetes_persistent_volume" "this" {
  metadata {
    name = var.name
  }

  spec {
    storage_class_name               = "nfs-truenas"
    volume_mode                      = "Filesystem"
    persistent_volume_reclaim_policy = "Retain"
    access_modes                     = var.access_modes

    capacity = {
      storage = var.storage
    }

    # NFS mount options applied to the volume.
    mount_options = [
      "nfsvers=4",
      "soft",
      "timeo=30",
      "retrans=3",
      "actimeo=5",
    ]

    persistent_volume_source {
      csi {
        driver        = "nfs.csi.k8s.io"
        volume_handle = var.name

        volume_attributes = {
          server = var.nfs_server
          share  = var.nfs_path
        }
      }
    }
  }
}
# Claim bound explicitly (via volume_name) to the PersistentVolume declared in
# this module, using the same storage class, access modes and size.
resource "kubernetes_persistent_volume_claim" "this" {
  metadata {
    namespace = var.namespace
    name      = var.name
  }

  spec {
    storage_class_name = "nfs-truenas"
    volume_name        = kubernetes_persistent_volume.this.metadata[0].name
    access_modes       = var.access_modes

    resources {
      requests = {
        storage = var.storage
      }
    }
  }
}
# Exposes the PVC's name so callers can reference the claim from a pod spec.
output "claim_name" {
  value       = kubernetes_persistent_volume_claim.this.metadata[0].name
  description = "PVC name to use in pod spec persistent_volume_claim blocks"
}

View file

@ -0,0 +1,25 @@
# Namespace in which the TLS Secret is created.
variable "namespace" {
  type = string
}

# Name of the kubernetes.io/tls Secret.
variable "tls_secret_name" {}

# PEM certificate. The empty default makes the Secret read
# secrets/fullchain.pem under the root module instead.
variable "tls_crt" {
  default = ""
}

# PEM private key. The empty default makes the Secret read
# secrets/privkey.pem under the root module instead.
variable "tls_key" {
  default = ""
}
# kubernetes.io/tls Secret holding the certificate/key pair, taken from the
# variables when provided and from the root module's secrets/ directory
# otherwise.
resource "kubernetes_secret" "tls_secret" {
  type = "kubernetes.io/tls"

  metadata {
    namespace = var.namespace
    name      = var.tls_secret_name
  }

  data = {
    # A variable default cannot call a function, so the fallback to the files
    # on disk is expressed here.
    "tls.crt" = var.tls_crt != "" ? var.tls_crt : file("${path.root}/secrets/fullchain.pem")
    "tls.key" = var.tls_key != "" ? var.tls_key : file("${path.root}/secrets/privkey.pem")
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: the sync-tls-secret policy stamps generate.kyverno.io/* + app.kubernetes.io/managed-by labels on this generated Secret
    ignore_changes = [metadata[0].labels]
  }
}

View file

@ -0,0 +1,89 @@
#!/usr/bin/env sh
# Renews the viktorbarzin.me certificate (apex + wildcard) with certbot's
# manual DNS challenge, using generated auth/cleanup hooks that manage the
# _acme-challenge TXT records through the Cloudflare API.
#
# Reads from the environment: CLOUDFLARE_ZONE_ID, CLOUDFLARE_TOKEN,
# TECHNITIUM_API_KEY.

# Abort on the first failing command.
set -e

# Scratch locations for certbot state and the generated hook scripts.
export le_dir="/tmp/le/"
export config_dir="$le_dir/out/config"
export technitium_token="$TECHNITIUM_API_KEY"
export certbot_auth="$le_dir/certbot_auth.sh"
export certbot_cleanup="$le_dir/certbot_cleanup.sh"

# -p keeps a leftover directory from a previous run from aborting the script
# under `set -e`.
mkdir -p "$le_dir"
echo "Creating $certbot_auth"
# Render the certbot --manual-auth-hook script. The heredoc delimiter is
# unquoted on purpose: plain $VARS (zone id, API tokens) are expanded NOW and
# baked into the file, while \$VARS (CERTBOT_DOMAIN, CERTBOT_VALIDATION) stay
# literal and are resolved when certbot runs the hook.
cat << EOF > "$certbot_auth"
#!/usr/bin/env sh
# Generate API token from DNS web console
# Create challenge TXT record
# TECHNITIUM
#API_TOKEN="$technitium_token"
# curl "http://technitium-web.technitium.svc.cluster.local:5380/api/zones/records/add?token=\$API_TOKEN&domain=_acme-challenge.\$CERTBOT_DOMAIN&type=TXT&ttl=60&text=\$CERTBOT_VALIDATION"
# CLOUDFLARE
curl https://api.cloudflare.com/client/v4/zones/$CLOUDFLARE_ZONE_ID/dns_records \
-H 'Content-Type: application/json' \
-H "Authorization: Bearer $CLOUDFLARE_TOKEN" \
-d "{
\"comment\": \"certbot temporary challenge\",
\"content\": \"\$CERTBOT_VALIDATION\",
\"name\": \"_acme-challenge.\$CERTBOT_DOMAIN\",
\"proxied\": false,
\"ttl\": 60,
\"type\": \"TXT\"
}"
# Sleep to make sure the change has time to propagate from primary to secondary name servers
sleep 25
EOF
chmod 700 "$certbot_auth"
# Print the rendered hook for debugging, but mask the API tokens that were
# baked into it so they do not end up in the job log.
sed -E 's/(Bearer |API_TOKEN=")[^"]*/\1[REDACTED]/' "$certbot_auth"
echo "Creating $certbot_cleanup"
# Render the certbot --manual-cleanup-hook script. As with the auth hook, the
# heredoc delimiter is unquoted: plain $VARS are expanded NOW and baked into the
# file, while \$record_id stays literal for the hook's own loop. The hook
# deletes every record in the zone whose name contains "acme".
cat << EOF > "$certbot_cleanup"
#!/usr/bin/env sh
# Generate API token from DNS web console
# Delete challenge TXT record
# TECHNIUM
#API_TOKEN="$technitium_token"
#curl "http://technitium-web.technitium.svc.cluster.local:5380/api/zones/records/delete?token=\$API_TOKEN&domain=_acme-challenge.\$CERTBOT_DOMAIN&type=TXT&text=\$CERTBOT_VALIDATION"
# CLOUDFLARE
curl https://api.cloudflare.com/client/v4/zones/$CLOUDFLARE_ZONE_ID/dns_records -H "Authorization: Bearer $CLOUDFLARE_TOKEN" | jq -r '.result[] | select(.name | contains("acme")) | .id' | while read -r record_id; do
curl https://api.cloudflare.com/client/v4/zones/$CLOUDFLARE_ZONE_ID/dns_records/\$record_id \
-X DELETE \
-H "Authorization: Bearer $CLOUDFLARE_TOKEN"
done
EOF
chmod 700 "$certbot_cleanup"
# Print the rendered hook for debugging, but mask the API tokens that were
# baked into it so they do not end up in the job log.
sed -E 's/(Bearer |API_TOKEN=")[^"]*/\1[REDACTED]/' "$certbot_cleanup"
echo "Cleaning up stale _acme-challenge TXT records from Cloudflare"
# List the TXT records named _acme-challenge.viktorbarzin.me and delete each
# one by id before requesting a new certificate.
cf_dns_records="https://api.cloudflare.com/client/v4/zones/$CLOUDFLARE_ZONE_ID/dns_records"
curl -s -H "Authorization: Bearer $CLOUDFLARE_TOKEN" \
  "$cf_dns_records?type=TXT&name=_acme-challenge.viktorbarzin.me" \
  | jq -r '.result[].id' \
  | while read -r stale_id; do
    echo "Deleting stale record $stale_id"
    curl -s -H "Authorization: Bearer $CLOUDFLARE_TOKEN" \
      -X DELETE "$cf_dns_records/$stale_id" > /dev/null
  done
echo "Executing certbot renew command"
# Request a certificate for the apex and the wildcard via the manual DNS
# challenge, driven by the generated auth/cleanup hooks. The wildcard is quoted
# so the shell cannot glob-expand it against files in the working directory.
certbot certonly --manual --preferred-challenges=dns \
  --email me@viktorbarzin.me \
  --server https://acme-v02.api.letsencrypt.org/directory \
  --agree-tos --no-eff-email \
  --manual-auth-hook "$certbot_auth" \
  --manual-cleanup-hook "$certbot_cleanup" \
  --config-dir "$config_dir" \
  --work-dir "$le_dir/workdir" \
  --logs-dir "$le_dir/logsdir" \
  -d viktorbarzin.me -d '*.viktorbarzin.me'

# Show the issued chain (public data). The private key is deliberately NOT
# printed: echoing it would leak the wildcard key into the job log.
cat "$config_dir/live/viktorbarzin.me/fullchain.pem"

# Replace the files in ./secrets with the fresh certificate and key.
cp --remove-destination "$config_dir/live/viktorbarzin.me/fullchain.pem" ./secrets
cp --remove-destination "$config_dir/live/viktorbarzin.me/privkey.pem" ./secrets

# printf instead of echo: whether echo interprets "\n" differs between shells.
printf 'Done renewing cert. Output certificates stored in ./secrets\n\n'
ls ./secrets