From 8b7c77c7949d7942006e3aea4e92c3cdcf79d6eb Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Thu, 11 Jun 2026 19:38:54 +0000 Subject: [PATCH] =?UTF-8?q?android-emulator:=20new=20stack=20=E2=80=94=20s?= =?UTF-8?q?hared=20in-cluster=20Android=2016=20testing=20instance?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Viktor is setting up an Android app development pipeline (tripit is the first app) and wants agents to natively test changes on Android before shipping. This adds the testing environment: an API-36 Google emulator under KVM as a privileged pod (namespace joins the Kyverno exclude list), SDK/system-image/AVD on a proxmox-lvm PVC, adb on the shared MetalLB IP 10.0.20.200:5555 (LAN only), noVNC screen view at android-emulator.viktorbarzin.lan. Image is built manually from the stack's docker/ dir (rare rebuilds; off-infra-CI rule targets repeated builds). First infra ADR records the trade-offs (devvm/VM/redroid/budtmo rejected). --- .claude/reference/service-catalog.md | 1 + docs/adr/0001-android-emulator-in-cluster.md | 42 ++++ stacks/android-emulator/README.md | 65 +++++ stacks/android-emulator/docker/Dockerfile | 43 ++++ stacks/android-emulator/docker/entrypoint.sh | 68 ++++++ stacks/android-emulator/main.tf | 231 ++++++++++++++++++ stacks/android-emulator/secrets | 1 + stacks/android-emulator/terragrunt.hcl | 3 + stacks/android-emulator/variables.tf | 10 + .../modules/kyverno/security-policies.tf | 9 +- 10 files changed, 469 insertions(+), 4 deletions(-) create mode 100644 docs/adr/0001-android-emulator-in-cluster.md create mode 100644 stacks/android-emulator/README.md create mode 100644 stacks/android-emulator/docker/Dockerfile create mode 100644 stacks/android-emulator/docker/entrypoint.sh create mode 100644 stacks/android-emulator/main.tf create mode 120000 stacks/android-emulator/secrets create mode 100644 stacks/android-emulator/terragrunt.hcl create mode 100644 stacks/android-emulator/variables.tf diff 
--git a/.claude/reference/service-catalog.md b/.claude/reference/service-catalog.md index eb442d61..0b3a108c 100644 --- a/.claude/reference/service-catalog.md +++ b/.claude/reference/service-catalog.md @@ -41,6 +41,7 @@ | shadowsocks | Proxy | shadowsocks | | webhook_handler | Webhook processing | webhook_handler | | tuya-bridge | Smart home bridge | tuya-bridge | +| android-emulator | Shared Android 16 test emulator (adb 10.0.20.200:5555, noVNC android-emulator.viktorbarzin.lan) | android-emulator | | dawarich | Location history | dawarich | | owntracks | Location tracking | owntracks | | nextcloud | File sync/share | nextcloud | diff --git a/docs/adr/0001-android-emulator-in-cluster.md b/docs/adr/0001-android-emulator-in-cluster.md new file mode 100644 index 00000000..aec44e9d --- /dev/null +++ b/docs/adr/0001-android-emulator-in-cluster.md @@ -0,0 +1,42 @@ +--- +status: accepted +--- + +# The Android testing environment is a privileged KVM emulator pod in-cluster + +Viktor's apps are growing Android clients (first: tripit's Capacitor shell — +see tripit ADR-0013/0014), and agents need a native Android instance to test +changes against before shipping. All K8s nodes already run with CPU type +`host`, so `/dev/kvm` works inside the cluster. + +Decision (2026-06-11): one shared **Android 16 (API 36) Google-emulator +instance** runs as a privileged pod in namespace `android-emulator` +(stack `stacks/android-emulator`), with `/dev/kvm` via hostPath, adb exposed +LAN-only on the shared MetalLB IP (10.0.20.200:5555), and a noVNC screen view +at android-emulator.viktorbarzin.lan. The SDK/system-image/AVD live on a PVC; +the image is a slim manually-built shell. + +## Considered options + +- **devvm-local docker emulator** — rejected as the durable home: shared + 24GB workstation, ~13GB free disk, per-machine, not shared across agents. +- **Dedicated Proxmox VM** — rejected: burns scarce PVE host headroom 24/7 + and adds a whole VM lifecycle for one emulator. 
+- **redroid (container-native Android)** — rejected: requires binder kernel + modules on every node (documented binderfs incompatibilities), max + Android 15; most invasive for the least version coverage. +- **budtmo/docker-android** — rejected: turnkey but capped at Android 14; + the native features driving the Android work (Live Updates, background + GPS) are Android 16 behaviors, matching the real target device. +- **/dev/kvm device plugin instead of privileged** — deferred: a new + cluster component to avoid one namespace-scoped exclude-list entry; the + exclude pattern (kured/woodpecker/frigate/changedetection) already exists. + +## Consequences + +- `android-emulator` joins the Kyverno `security_policy_exclude_namespaces` + list (privileged allowed; registry policy also bypassed in-namespace). +- adb is unauthenticated by design — the LB IP must remain LAN-only. +- Single shared instance: concurrent agent sessions share Android state; + long destructive work should presence-claim `service:android-emulator`. +- Rendering is swiftshader (CPU) — the contended T4 stays out of the path. diff --git a/stacks/android-emulator/README.md b/stacks/android-emulator/README.md new file mode 100644 index 00000000..3cbde526 --- /dev/null +++ b/stacks/android-emulator/README.md @@ -0,0 +1,65 @@ +# android-emulator — shared in-cluster Android testing instance + +Android 16 (API 36, `google_apis/x86_64`) emulator running under KVM in the +cluster, so agents can natively test app/PWA changes before shipping (first +tenant: tripit). Decision record: `docs/adr/0001-android-emulator-in-cluster.md`. 
+ +## Endpoints + +| What | Where | +|---|---| +| adb | `adb connect 10.0.20.200:5555` (LAN only; adb is unauthenticated — never expose publicly) | +| Screen (noVNC) | https://android-emulator.viktorbarzin.lan (LAN only) | + +## Agent quickstart (from a devvm) + +```bash +# one-time: user-local platform-tools +wget -qO /tmp/pt.zip https://dl.google.com/android/repository/platform-tools-latest-linux.zip +unzip -q /tmp/pt.zip -d ~/android-sdk # → ~/android-sdk/platform-tools/adb + +adb="$HOME/android-sdk/platform-tools/adb" +$adb connect 10.0.20.200:5555 +$adb -s 10.0.20.200:5555 install app-debug.apk # install an APK +$adb -s 10.0.20.200:5555 shell am start -a android.intent.action.VIEW -d https://tripit.viktorbarzin.me # open a URL +$adb -s 10.0.20.200:5555 shell input tap 540 1200 # drive the UI +$adb -s 10.0.20.200:5555 exec-out screencap -p > /tmp/screen.png # screenshot +``` + +The emulator is a single shared instance — check what is installed with `adb shell pm list packages`, +uninstall your test app when done, and presence-claim +(`presence claim service:android-emulator`) for long destructive sessions +(wipes, system-image changes). + +## How it works + +- The container image (built from `docker/`) holds only JDK 17, cmdline-tools, + emulator native libs, Xvfb/x11vnc/noVNC and socat — ~1GB. +- The SDK proper (platform-tools, emulator, system image, AVD, snapshots) + lives on the `android-emulator-sdk` PVC (`proxmox-lvm`); the entrypoint + installs it idempotently. **First boot downloads ~2.5GB (≈9GB unpacked on the PVC) and takes ~15 min** + (startup probe allows 30); subsequent restarts boot in ~1–2 min. +- The emulator renders via swiftshader (CPU) — deliberately NOT scheduled on + the contended T4 GPU node. + +## Rebuilding the image (rare — tool/library bumps only) + +```bash +cd stacks/android-emulator/docker +docker build -t forgejo.viktorbarzin.me/viktor/android-emulator:<tag> . 
+docker push forgejo.viktorbarzin.me/viktor/android-emulator:<tag> +# then bump var.image_tag default in variables.tf and land via CI +``` + +Built manually from a devvm on purpose: it changes rarely, and a one-off push +doesn't warrant CI plumbing (the off-infra-CI rule targets *repeated* build IO). + +## Troubleshooting + +- Pod CrashLoops with `FATAL: /dev/kvm not present` → node lost the device or + the privileged/Kyverno exclude regressed (`android-emulator` must be in + `security_policy_exclude_namespaces`, stacks/kyverno). +- Wedged Android (won't boot, storage full) → delete the PVC + pod: next boot + re-downloads cleanly. Snapshots/AVD state are disposable by design. +- Different API level: set `API_LEVEL` env on the deployment (entrypoint + installs that system image on the same PVC) and set a new `AVD_NAME` or delete the old AVD — the entrypoint only creates an AVD that does not exist yet, so an existing one keeps its old image. diff --git a/stacks/android-emulator/docker/Dockerfile b/stacks/android-emulator/docker/Dockerfile new file mode 100644 index 00000000..25f8a367 --- /dev/null +++ b/stacks/android-emulator/docker/Dockerfile @@ -0,0 +1,43 @@ +# Android emulator runner — slim on purpose: the SDK proper (platform-tools, +# emulator, system image, AVD) lives on the stack's PVC and is installed by +# entrypoint.sh on first boot, so this image carries only the JDK, +# cmdline-tools and the native libraries the emulator needs at runtime. +# +# Rebuild + push (rare — only when tool/library versions bump): +# docker build -t forgejo.viktorbarzin.me/viktor/android-emulator:api36-v1 . 
+# docker push forgejo.viktorbarzin.me/viktor/android-emulator:api36-v1 +FROM eclipse-temurin:17-jdk-jammy + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && apt-get install -y --no-install-recommends \ + # emulator runtime deps (Qt window into Xvfb) + libpulse0 libgl1 libglu1-mesa libnss3 libasound2 libfontconfig1 \ + libx11-6 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 \ + libxfixes3 libxi6 libxrandr2 libxrender1 libxtst6 libxkbcommon0 \ + libxkbfile1 libsm6 libice6 libdbus-1-3 \ + # virtual display + browser viewing + xvfb x11vnc novnc websockify openbox \ + # adb TCP forwarding + fetch tooling + socat wget unzip ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Android cmdline-tools (sdkmanager/avdmanager). Pinned; SDK packages they +# install land on the PVC at /sdk, not in this image. +ARG CMDLINE_TOOLS_VERSION=13114758 +RUN mkdir -p /opt/android/cmdline-tools && \ + wget -q "https://dl.google.com/android/repository/commandlinetools-linux-${CMDLINE_TOOLS_VERSION}_latest.zip" -O /tmp/clt.zip && \ + unzip -q /tmp/clt.zip -d /opt/android/cmdline-tools && \ + mv /opt/android/cmdline-tools/cmdline-tools /opt/android/cmdline-tools/latest && \ + rm /tmp/clt.zip + +ENV ANDROID_SDK_ROOT=/sdk \ + ANDROID_USER_HOME=/sdk/.android \ + ANDROID_AVD_HOME=/sdk/.android/avd \ + PATH="/opt/android/cmdline-tools/latest/bin:/sdk/platform-tools:/sdk/emulator:${PATH}" + +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +# 5555 adb (socat → emulator adbd), 6080 noVNC web UI +EXPOSE 5555 6080 +ENTRYPOINT ["/entrypoint.sh"] diff --git a/stacks/android-emulator/docker/entrypoint.sh b/stacks/android-emulator/docker/entrypoint.sh new file mode 100644 index 00000000..4f32e1f3 --- /dev/null +++ b/stacks/android-emulator/docker/entrypoint.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +# Boot sequence: ensure SDK + AVD on the PVC (/sdk), bring up a virtual +# display with browser viewing (Xvfb → x11vnc → noVNC :6080), start the +# emulator windowed into 
it, and expose its adbd on :5555 for the LAN. +set -euo pipefail + +API_LEVEL="${API_LEVEL:-36}" +SYSTEM_IMAGE="system-images;android-${API_LEVEL};google_apis;x86_64" +AVD_NAME="${AVD_NAME:-lab}" +EMULATOR_RAM_MB="${EMULATOR_RAM_MB:-4096}" +SCREEN_GEOMETRY="${SCREEN_GEOMETRY:-1080x2280x24}" + +[ -e /dev/kvm ] || { echo "FATAL: /dev/kvm not present — pod needs the kvm hostPath + privileged"; exit 1; } + +mkdir -p "$ANDROID_USER_HOME" + +# --- SDK packages on the PVC (idempotent; first boot downloads ~2.5GB) ------ +if [ ! -x /sdk/platform-tools/adb ] || [ ! -x /sdk/emulator/emulator ] || \ + [ ! -d "/sdk/system-images/android-${API_LEVEL}" ]; then + echo "Installing SDK packages into /sdk (first boot)..." + # (yes || true): yes dies of SIGPIPE (141) when sdkmanager stops reading, + # which set -o pipefail would otherwise turn into a fatal error. + (yes || true) | sdkmanager --sdk_root=/sdk --licenses >/dev/null + sdkmanager --sdk_root=/sdk "platform-tools" "emulator" "$SYSTEM_IMAGE" +fi + +# --- AVD (idempotent) -------------------------------------------------------- +if ! avdmanager list avd -c | grep -qx "$AVD_NAME"; then + echo "Creating AVD '$AVD_NAME' (${SYSTEM_IMAGE}, pixel_7)..." + (echo no || true) | avdmanager create avd -n "$AVD_NAME" -k "$SYSTEM_IMAGE" --device pixel_7 + cat >> "${ANDROID_AVD_HOME}/${AVD_NAME}.avd/config.ini" </dev/null | tr -d '\r')" = "1" ]; do + sleep 3 +done +echo "Boot completed." + +# Expose the emulator's adbd (localhost:5555) to the pod network. Plain TCP, +# no auth — reachable only inside the LAN via the MetalLB IP. +socat TCP-LISTEN:5555,fork,reuseaddr TCP:127.0.0.1:5555 & + +# Supervise: if any background process dies, exit so the pod restarts. +wait -n +echo "A supervised process exited; restarting pod." 
>&2 +exit 1 diff --git a/stacks/android-emulator/main.tf b/stacks/android-emulator/main.tf new file mode 100644 index 00000000..154cd7f6 --- /dev/null +++ b/stacks/android-emulator/main.tf @@ -0,0 +1,231 @@ +# Android emulator — shared in-cluster Android 16 (API 36) testing instance. +# Agents drive it over adb (10.0.20.200:5555); humans watch the screen at +# https://android-emulator.viktorbarzin.lan (noVNC). The SDK + system image + +# AVD live on the PVC; the container image is just JDK + cmdline-tools + libs +# (built manually from docker/, see README.md). +# +# Decision record: docs/adr/0001-android-emulator-in-cluster.md +# - privileged + /dev/kvm hostPath (namespace is on the Kyverno exclude list) +# - swiftshader rendering — deliberately NOT on the contended T4 GPU node + +resource "kubernetes_namespace" "android-emulator" { + metadata { + name = "android-emulator" + labels = { + "istio-injection" : "disabled" + tier = local.tiers.cluster + } + } + lifecycle { + ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]] + } +} + +module "tls_secret" { + source = "../../modules/kubernetes/setup_tls_secret" + namespace = kubernetes_namespace.android-emulator.metadata[0].name + tls_secret_name = var.tls_secret_name +} + +# SDK + system image + AVD + snapshots. First boot downloads ~2.5GB (≈9GB +# unpacked) into here; +# subsequent pod restarts reuse it (boot in ~1 min instead of ~15). +# DELIBERATE deviation from the proxmox-lvm backup convention: no backup +# CronJob — everything on this PVC is a regenerable download/cache (wipe the +# PVC and the next boot rebuilds it; that's also the documented recovery path). 
+resource "kubernetes_persistent_volume_claim" "sdk" { + wait_until_bound = false + metadata { + name = "android-emulator-sdk" + namespace = kubernetes_namespace.android-emulator.metadata[0].name + annotations = { + "resize.topolvm.io/threshold" = "10%" + "resize.topolvm.io/increase" = "50%" + "resize.topolvm.io/storage_limit" = "60Gi" + } + } + spec { + access_modes = ["ReadWriteOnce"] + storage_class_name = "proxmox-lvm" + resources { + requests = { + storage = "30Gi" + } + } + } + lifecycle { + # Autoresizer grows requests.storage up to storage_limit; PVCs can't shrink. + ignore_changes = [spec[0].resources[0].requests] + } +} + +resource "kubernetes_deployment" "android-emulator" { + metadata { + name = "android-emulator" + namespace = kubernetes_namespace.android-emulator.metadata[0].name + labels = { + app = "android-emulator" + } + } + spec { + replicas = 1 + strategy { + type = "Recreate" # RWO PVC — old pod must release it first + } + selector { + match_labels = { app = "android-emulator" } + } + template { + metadata { + labels = { app = "android-emulator" } + } + spec { + image_pull_secrets { + name = "registry-credentials" + } + container { + name = "emulator" + image = "forgejo.viktorbarzin.me/viktor/android-emulator:${var.image_tag}" + + security_context { + privileged = true # /dev/kvm access + } + + port { + name = "adb" + container_port = 5555 + } + port { + name = "novnc" + container_port = 6080 + } + + volume_mount { + name = "sdk" + mount_path = "/sdk" + } + volume_mount { + name = "kvm" + mount_path = "/dev/kvm" + } + + resources { + # No CPU limit (cluster-wide rule — CFS throttling); requests=limits + # on memory. Emulator peak: qemu (-memory 4096) + guest overhead + + # Xvfb/VNC + JVM sdkmanager on first boot. + requests = { + cpu = "2" + memory = "8Gi" + } + limits = { + memory = "8Gi" + } + } + + # First boot downloads the system image + cold-boots Android: allow + # up to ~30 min before the pod is declared failed. 
+ startup_probe { + exec { + command = ["/bin/bash", "-c", "/sdk/platform-tools/adb shell getprop sys.boot_completed | grep -q 1"] + } + period_seconds = 20 + failure_threshold = 90 + } + readiness_probe { + exec { + command = ["/bin/bash", "-c", "/sdk/platform-tools/adb shell getprop sys.boot_completed | grep -q 1"] + } + period_seconds = 30 + failure_threshold = 3 + } + liveness_probe { + exec { + command = ["/bin/bash", "-c", "/sdk/platform-tools/adb shell getprop sys.boot_completed | grep -q 1"] + } + period_seconds = 60 + failure_threshold = 10 # generous — don't reboot mid-test on a hiccup + } + } + + volume { + name = "sdk" + persistent_volume_claim { + claim_name = kubernetes_persistent_volume_claim.sdk.metadata[0].name + } + } + volume { + name = "kvm" + host_path { + path = "/dev/kvm" + type = "CharDevice" + } + } + } + } + } + lifecycle { + ignore_changes = [spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1 + } +} + +# adb endpoint for agents/devvms: `adb connect 10.0.20.200:5555`. +# Unauthenticated by nature — LAN-only via MetalLB, never exposed publicly. +resource "kubernetes_service" "adb" { + metadata { + name = "android-emulator-adb" + namespace = kubernetes_namespace.android-emulator.metadata[0].name + annotations = { + "metallb.universe.tf/loadBalancerIPs" = "10.0.20.200" + "metallb.io/allow-shared-ip" = "shared" + } + } + spec { + type = "LoadBalancer" + selector = { + app = "android-emulator" + } + port { + name = "adb" + port = 5555 + target_port = 5555 + } + } +} + +resource "kubernetes_service" "novnc" { + metadata { + name = "android-emulator" + namespace = kubernetes_namespace.android-emulator.metadata[0].name + labels = { + app = "android-emulator" + } + } + spec { + selector = { + app = "android-emulator" + } + port { + name = "http" + port = 80 + target_port = 6080 + } + } +} + +# Browser screen view (noVNC) — LAN only. 
+module "ingress-internal" { + source = "../../modules/kubernetes/ingress_factory" + # auth = "none": LAN-only (allow_local_access_only) noVNC screen view of the + # shared test emulator — no user data behind it; Authentik would break the + # websocket flow agents and users rely on. + auth = "none" + namespace = kubernetes_namespace.android-emulator.metadata[0].name + name = "android-emulator" + root_domain = "viktorbarzin.lan" + tls_secret_name = var.tls_secret_name + allow_local_access_only = true + ssl_redirect = false + extra_annotations = { + "gethomepage.dev/enabled" = "false" + } +} diff --git a/stacks/android-emulator/secrets b/stacks/android-emulator/secrets new file mode 120000 index 00000000..ca54a7cf --- /dev/null +++ b/stacks/android-emulator/secrets @@ -0,0 +1 @@ +../../secrets \ No newline at end of file diff --git a/stacks/android-emulator/terragrunt.hcl b/stacks/android-emulator/terragrunt.hcl new file mode 100644 index 00000000..e147285f --- /dev/null +++ b/stacks/android-emulator/terragrunt.hcl @@ -0,0 +1,3 @@ +include "root" { + path = find_in_parent_folders() +} diff --git a/stacks/android-emulator/variables.tf b/stacks/android-emulator/variables.tf new file mode 100644 index 00000000..5d4927b6 --- /dev/null +++ b/stacks/android-emulator/variables.tf @@ -0,0 +1,10 @@ +variable "tls_secret_name" { + type = string + sensitive = true +} + +variable "image_tag" { + type = string + default = "api36-v1" + description = "android-emulator image tag at forgejo.viktorbarzin.me/viktor/android-emulator. Built + pushed manually from stacks/android-emulator/docker/ (see README.md) — bump this when the image is rebuilt." 
+} diff --git a/stacks/kyverno/modules/kyverno/security-policies.tf b/stacks/kyverno/modules/kyverno/security-policies.tf index 2bee1757..08902d64 100644 --- a/stacks/kyverno/modules/kyverno/security-policies.tf +++ b/stacks/kyverno/modules/kyverno/security-policies.tf @@ -23,10 +23,11 @@ locals { "xray", "infra-maintenance", "metrics-server", "tigera-operator", "frigate", # Additions discovered during wave 1 enforce flip — these contain workloads # that legitimately need privileged / hostNetwork / SYS_ADMIN: - "kured", # kured DaemonSet is privileged (manages node reboots) - "default", # etcd backup + defrag CronJobs use hostNetwork - "changedetection", # uses SYS_ADMIN for chromium sandbox - "woodpecker", # CI pipeline pods (wp-*) run privileged docker builds + "kured", # kured DaemonSet is privileged (manages node reboots) + "default", # etcd backup + defrag CronJobs use hostNetwork + "changedetection", # uses SYS_ADMIN for chromium sandbox + "woodpecker", # CI pipeline pods (wp-*) run privileged docker builds + "android-emulator", # emulator pod is privileged for /dev/kvm (ADR-0001) ] }