diff --git a/stacks/android-emulator/README.md b/stacks/android-emulator/README.md index 50c90f96..0a8c0469 100644 --- a/stacks/android-emulator/README.md +++ b/stacks/android-emulator/README.md @@ -4,6 +4,21 @@ Android 16 (API 36, `google_apis/x86_64`) emulator running under KVM in the cluster, so agents can natively test app/PWA changes before shipping (first tenant: tripit). Decision record: `docs/adr/0001-android-emulator-in-cluster.md`. +## On-demand lifecycle (since 2026-06-12) + +The emulator **scales to zero when idle** (no adb/VNC connections for ~1h, +checked by the `android-emulator-idle-sleeper` CronJob) and **wakes on +visit**: the wake gate owns `/` on both hostnames. Warm boot is ~90s. + +- Humans: open https://android-emulator.viktorbarzin.me — it wakes the + emulator if needed, shows a self-refreshing boot page, then hands over to + the noVNC screen. +- Agents (before adb): wake + poll, then connect: + + curl -ks --resolve android-emulator.viktorbarzin.lan:443:10.0.20.203 https://android-emulator.viktorbarzin.lan/wake + until curl -ks --resolve android-emulator.viktorbarzin.lan:443:10.0.20.203 https://android-emulator.viktorbarzin.lan/status | grep -q '"ready": 1'; do sleep 5; done + adb connect 10.0.20.200:5555 + ## Endpoints | What | Where | @@ -39,8 +54,10 @@ uninstall your test app when done, and presence-claim lives on the `android-emulator-sdk` PVC (`proxmox-lvm`); the entrypoint installs it idempotently. **First boot downloads ~2.5GB (≈9GB unpacked on the PVC) and takes ~15 min** (startup probe allows 30); subsequent restarts boot in ~1–2 min. -- The emulator renders via swiftshader (CPU) — deliberately NOT scheduled on - the contended T4 GPU node. +- The emulator runs on the GPU node (k8s-node1) with a T4 time-slice and + `-gpu host` hardware rendering (~0.5–1 GiB VRAM while awake — scale-to-zero + keeps it transient); if GPU init fails it falls back to swiftshader (CPU) + automatically. 
## Rebuilding the image (rare — tool/library bumps only) diff --git a/stacks/android-emulator/docker/Dockerfile b/stacks/android-emulator/docker/Dockerfile index c8542e22..ef08b245 100644 --- a/stacks/android-emulator/docker/Dockerfile +++ b/stacks/android-emulator/docker/Dockerfile @@ -4,8 +4,8 @@ # cmdline-tools and the native libraries the emulator needs at runtime. # # Rebuild + push (rare — only when tool/library versions bump): -# docker build -t forgejo.viktorbarzin.me/viktor/android-emulator:api36-v4 . -# docker push forgejo.viktorbarzin.me/viktor/android-emulator:api36-v4 +# docker build -t forgejo.viktorbarzin.me/viktor/android-emulator:api36-v5 . +# docker push forgejo.viktorbarzin.me/viktor/android-emulator:api36-v5 FROM eclipse-temurin:17-jdk-jammy ENV DEBIAN_FRONTEND=noninteractive @@ -14,6 +14,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libpulse0 libgl1 libglu1-mesa libnss3 libasound2 libfontconfig1 \ libx11-6 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 \ libxfixes3 libxi6 libxrandr2 libxrender1 libxtst6 libxkbcommon0 \ + libegl1 libgles2 \ libxkbfile1 libsm6 libice6 libdbus-1-3 \ # virtual display + browser viewing xvfb x11vnc novnc websockify openbox \ diff --git a/stacks/android-emulator/docker/entrypoint.sh b/stacks/android-emulator/docker/entrypoint.sh index 530f758b..ff2638f2 100644 --- a/stacks/android-emulator/docker/entrypoint.sh +++ b/stacks/android-emulator/docker/entrypoint.sh @@ -83,12 +83,32 @@ x11vnc -display :0 -nopw -forever -shared -quiet -bg websockify --web /usr/share/novnc 6080 localhost:5900 & # --- emulator ----------------------------------------------------------------- -# swiftshader = CPU rendering (no GPU dependency); KVM does the heavy lifting. 
-emulator -avd "$AVD_NAME" \ - -gpu swiftshader_indirect -accel on \ - -memory "$EMULATOR_RAM_MB" \ - -no-audio -no-boot-anim \ - & +# Use the host GPU when the NVIDIA runtime injected one (driver libs + +# /dev/nvidia* appear when the pod requests nvidia.com/gpu), otherwise +# swiftshader (CPU rendering). If the GPU launch dies early, fall back to +# swiftshader automatically so the worst case equals CPU rendering. +GPU_FLAG="swiftshader_indirect" +[ -e /dev/nvidiactl ] && GPU_FLAG="host" +echo "Emulator GPU mode: $GPU_FLAG" + +launch_emulator() { + emulator -avd "$AVD_NAME" \ + -gpu "$1" -accel on \ + -memory "$EMULATOR_RAM_MB" \ + -no-audio -no-boot-anim \ + & + EMU_PID=$! +} + +launch_emulator "$GPU_FLAG" +if [ "$GPU_FLAG" = "host" ]; then + sleep 25 + if ! kill -0 "$EMU_PID" 2>/dev/null; then + echo "GPU launch (-gpu host) died early — falling back to swiftshader." >&2 + rm -f "${ANDROID_AVD_HOME}/${AVD_NAME}.avd"/*.lock + launch_emulator swiftshader_indirect + fi +fi adb wait-for-device echo "Emulator up; waiting for boot completion..." diff --git a/stacks/android-emulator/gate.py b/stacks/android-emulator/gate.py new file mode 100644 index 00000000..2301042e --- /dev/null +++ b/stacks/android-emulator/gate.py @@ -0,0 +1,112 @@ +"""Wake gate for the android-emulator deployment. + +Owns `/` on the emulator hostnames: if the emulator is up, redirect to the +noVNC screen; if it is scaled to zero, scale it to 1 and show a self-refreshing +"waking up" page. Agents use GET /status (JSON) + GET /wake. Pure stdlib — +runs on a stock python image with no installs. 
+""" +import json +import os +import ssl +import urllib.error +import urllib.request +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer + +NS = os.environ.get("NAMESPACE", "android-emulator") +DEPLOY = os.environ.get("DEPLOYMENT", "android-emulator") +API = "https://kubernetes.default.svc" +TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token" +CA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" +IDLE_ANNOTATION = "emulator.viktorbarzin.me/idle-checks" +VNC_PATH = "/vnc.html?autoconnect=1&resize=scale" + +WAKING_PAGE = """Android emulator + + +

📱 Waking the emulator…

+

Boot takes about 90 seconds from a warm disk.

+

This page refreshes automatically and will hand over to the screen when ready.

+

state: {state}

""" + + +def kube(method: str, path: str, body=None): + with open(TOKEN_PATH) as f: + token = f.read() + req = urllib.request.Request(API + path, method=method) + req.add_header("Authorization", "Bearer " + token) + data = None + if body is not None: + data = json.dumps(body).encode() + req.add_header("Content-Type", "application/strategic-merge-patch+json") + ctx = ssl.create_default_context(cafile=CA_PATH) + with urllib.request.urlopen(req, data=data, context=ctx, timeout=10) as r: + return json.load(r) + + +def deployment_state(): + d = kube("GET", f"/apis/apps/v1/namespaces/{NS}/deployments/{DEPLOY}") + spec = d["spec"].get("replicas") or 0 + ready = d["status"].get("readyReplicas") or 0 + return spec, ready + + +def wake(): + kube( + "PATCH", + f"/apis/apps/v1/namespaces/{NS}/deployments/{DEPLOY}", + { + "spec": {"replicas": 1}, + "metadata": {"annotations": {IDLE_ANNOTATION: "0"}}, + }, + ) + + +class Handler(BaseHTTPRequestHandler): + def _respond(self, code: int, body: bytes, ctype: str, extra=None): + self.send_response(code) + self.send_header("Content-Type", ctype) + self.send_header("Cache-Control", "no-store") + for k, v in (extra or {}).items(): + self.send_header(k, v) + self.end_headers() + self.wfile.write(body) + + def do_GET(self): # noqa: N802 (stdlib naming) + if self.path == "/healthz": + return self._respond(200, b"ok", "text/plain") + try: + spec, ready = deployment_state() + if self.path.startswith("/status"): + return self._respond( + 200, + json.dumps({"replicas": spec, "ready": ready}).encode(), + "application/json", + ) + woke = False + if spec == 0: + wake() + woke = True + if self.path.startswith("/wake"): + return self._respond( + 200, + json.dumps({"replicas": 1, "ready": ready, "woke": woke}).encode(), + "application/json", + ) + # default: human path + if ready >= 1: + return self._respond(302, b"", "text/plain", {"Location": VNC_PATH}) + state = "starting" if not woke else "scaled up just now" + page = 
WAKING_PAGE.replace("{state}", state) + return self._respond(200, page.encode(), "text/html") + except urllib.error.HTTPError as e: + return self._respond(502, f"kube api error: {e.code}".encode(), "text/plain") + except Exception as e: # surface anything else readably + return self._respond(500, f"gate error: {e}".encode(), "text/plain") + + def log_message(self, fmt, *args): + print("%s - %s" % (self.address_string(), fmt % args), flush=True) + + +if __name__ == "__main__": + ThreadingHTTPServer(("0.0.0.0", 8080), Handler).serve_forever() diff --git a/stacks/android-emulator/gate.tf b/stacks/android-emulator/gate.tf new file mode 100644 index 00000000..77e11b57 --- /dev/null +++ b/stacks/android-emulator/gate.tf @@ -0,0 +1,233 @@ +# On-demand lifecycle: the emulator scales to ZERO when idle and wakes on +# visit. The gate (tiny stdlib-python HTTP server) owns `/` on both emulator +# hostnames — it scales the deployment up and hands the browser to noVNC once +# ready; agents use GET /wake + /status. The idle CronJob scales back to zero +# after ~1h with no adb/VNC connections. Decision: Viktor 2026-06-12 — +# dev-only usage, and an always-on GPU emulator would permanently hold T4 +# VRAM that the LLM jobs need. 
+ +resource "kubernetes_service_account" "gate" { + metadata { + name = "android-emulator-gate" + namespace = kubernetes_namespace.android-emulator.metadata[0].name + } +} + +resource "kubernetes_role" "gate" { + metadata { + name = "android-emulator-gate" + namespace = kubernetes_namespace.android-emulator.metadata[0].name + } + rule { + api_groups = ["apps"] + resources = ["deployments"] + resource_names = ["android-emulator"] + verbs = ["get", "patch"] + } + rule { + api_groups = [""] + resources = ["pods"] + verbs = ["get", "list"] + } + rule { + api_groups = [""] + resources = ["pods/exec"] + verbs = ["create"] + } +} + +resource "kubernetes_role_binding" "gate" { + metadata { + name = "android-emulator-gate" + namespace = kubernetes_namespace.android-emulator.metadata[0].name + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "Role" + name = kubernetes_role.gate.metadata[0].name + } + subject { + kind = "ServiceAccount" + name = kubernetes_service_account.gate.metadata[0].name + namespace = kubernetes_namespace.android-emulator.metadata[0].name + } +} + +resource "kubernetes_config_map" "gate" { + metadata { + name = "android-emulator-gate" + namespace = kubernetes_namespace.android-emulator.metadata[0].name + } + data = { + "gate.py" = file("${path.module}/gate.py") + } +} + +resource "kubernetes_deployment" "gate" { + metadata { + name = "android-emulator-gate" + namespace = kubernetes_namespace.android-emulator.metadata[0].name + labels = { + app = "android-emulator-gate" + } + } + spec { + replicas = 1 + selector { + match_labels = { app = "android-emulator-gate" } + } + template { + metadata { + labels = { app = "android-emulator-gate" } + annotations = { + "checksum/gate" = sha1(file("${path.module}/gate.py")) + } + } + spec { + service_account_name = kubernetes_service_account.gate.metadata[0].name + container { + name = "gate" + image = "python:3.12-alpine" + command = ["python", "/app/gate.py"] + env { + name = "NAMESPACE" + value = 
kubernetes_namespace.android-emulator.metadata[0].name + } + env { + name = "DEPLOYMENT" + value = "android-emulator" + } + port { + container_port = 8080 + } + volume_mount { + name = "app" + mount_path = "/app" + } + resources { + requests = { + cpu = "10m" + memory = "64Mi" + } + limits = { + memory = "64Mi" + } + } + readiness_probe { + http_get { + path = "/healthz" + port = 8080 + } + period_seconds = 10 + } + } + volume { + name = "app" + config_map { + name = kubernetes_config_map.gate.metadata[0].name + } + } + } + } + } + lifecycle { + ignore_changes = [spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1 + } +} + +resource "kubernetes_service" "gate" { + metadata { + name = "android-emulator-gate" + namespace = kubernetes_namespace.android-emulator.metadata[0].name + } + spec { + selector = { + app = "android-emulator-gate" + } + port { + name = "http" + port = 80 + target_port = 8080 + } + } +} + +# Sleep side: every 15 min, look at established TCP connections to the +# emulator's adb (5555) and noVNC (6080) ports from OUTSIDE the pod +# (remote != 127.0.0.1 — the in-container adb server holds a permanent +# loopback connection to adbd that must not count as activity). Four +# consecutive idle checks (~1h) scale the deployment to zero. 
+resource "kubernetes_cron_job_v1" "idle_sleeper" { + metadata { + name = "android-emulator-idle-sleeper" + namespace = kubernetes_namespace.android-emulator.metadata[0].name + } + spec { + schedule = "*/15 * * * *" + concurrency_policy = "Forbid" + successful_jobs_history_limit = 1 + failed_jobs_history_limit = 2 + job_template { + metadata {} + spec { + backoff_limit = 0 + ttl_seconds_after_finished = 3600 + template { + metadata {} + spec { + service_account_name = kubernetes_service_account.gate.metadata[0].name + restart_policy = "Never" + container { + name = "sleeper" + image = "bitnami/kubectl:latest" + command = ["/bin/bash", "-c"] + args = [<<-EOT + set -euo pipefail + NS=android-emulator DEPLOY=android-emulator ANN=emulator.viktorbarzin.me/idle-checks + spec=$(kubectl -n $NS get deploy $DEPLOY -o jsonpath='{.spec.replicas}') + [ "$spec" = "0" ] && { echo "already asleep"; exit 0; } + pod=$(kubectl -n $NS get pods -l app=$DEPLOY --field-selector=status.phase=Running -o name | head -1) + [ -z "$pod" ] && { echo "no running pod (booting?) — not counting"; exit 0; } + # /proc/net/tcp: count ESTABLISHED (st=01) conns with local port + # 5555 (0x15B3) or 6080 (0x17C0) whose remote is not loopback. 
+ est=$(kubectl -n $NS exec $${pod#pod/} -- cat /proc/net/tcp | awk ' + $4 == "01" { + split($2, l, ":"); split($3, r, ":") + if ((l[2] == "15B3" || l[2] == "17C0") && r[1] != "0100007F") n++ + } END { print n+0 }') + if [ "$est" -gt 0 ]; then + echo "$est active connection(s) — resetting idle counter" + kubectl -n $NS annotate deploy $DEPLOY $ANN=0 --overwrite + exit 0 + fi + n=$(kubectl -n $NS get deploy $DEPLOY -o jsonpath="{.metadata.annotations['emulator\.viktorbarzin\.me/idle-checks']}") + n=$(( $${n:-0} + 1 )) + if [ "$n" -ge 4 ]; then + echo "idle for $n checks (~1h) — scaling to zero" + kubectl -n $NS patch deploy $DEPLOY --type=merge -p '{"spec":{"replicas":0}}' # patch, not `scale`: the Role grants deployments get/patch but not deployments/scale + kubectl -n $NS annotate deploy $DEPLOY $ANN=0 --overwrite + else + echo "idle check $n/4" + kubectl -n $NS annotate deploy $DEPLOY $ANN=$n --overwrite + fi + EOT + ] + resources { + requests = { + cpu = "10m" + memory = "64Mi" + } + limits = { + memory = "128Mi" + } + } + } + } + } + } + } + } + lifecycle { + ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1 + } +} diff --git a/stacks/android-emulator/main.tf b/stacks/android-emulator/main.tf index 10220999..27686add 100644 --- a/stacks/android-emulator/main.tf +++ b/stacks/android-emulator/main.tf @@ -80,6 +80,15 @@ resource "kubernetes_deployment" "android-emulator" { labels = { app = "android-emulator" } } spec { + node_selector = { + "nvidia.com/gpu.present" : "true" + } + toleration { + key = "nvidia.com/gpu" + operator = "Equal" + value = "true" + effect = "NoSchedule" + } image_pull_secrets { name = "registry-credentials" } @@ -121,7 +130,8 @@ resource "kubernetes_deployment" "android-emulator" { memory = "3Gi" } limits = { - memory = "8Gi" + memory = "8Gi" + "nvidia.com/gpu" = "1" # T4 time-slice; ~0.5-1GiB VRAM while awake } } @@ -167,7 +177,12 @@ resource "kubernetes_deployment" "android-emulator" { } } lifecycle { - ignore_changes = [spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1 + 
ignore_changes = [ + spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1 + # the wake gate + idle sleeper own replicas (scale-to-zero on demand); + # an apply must not resurrect or kill the emulator. + spec[0].replicas, + ] } } @@ -215,45 +230,82 @@ resource "kubernetes_service" "novnc" { } } -# Browser screen view (noVNC) — LAN only. -module "ingress-internal" { +# Ingress layout, same on both hostnames: the wake gate owns `/` (visiting +# wakes a sleeping emulator), while the noVNC asset/socket paths go straight +# to the emulator service. LAN (.lan) is unauthenticated local-only for +# agents; public (.me) is Authentik-gated for humans. +locals { + novnc_paths = [ + "/vnc.html", "/app", "/core", "/vendor", + "/websockify", "/package.json", "/defaults.json", "/mandatory.json", + ] +} + +module "ingress-internal-gate" { source = "../../modules/kubernetes/ingress_factory" - # auth = "none": LAN-only (allow_local_access_only) noVNC screen view of the - # shared test emulator — no user data behind it; Authentik would break the - # websocket flow agents and users rely on. + # auth = "none": LAN-only (allow_local_access_only) wake gate + screen for + # the shared test emulator — no user data behind it; agents need cookie-free + # curl access and Authentik would break the noVNC websocket flow. auth = "none" namespace = kubernetes_namespace.android-emulator.metadata[0].name name = "android-emulator" root_domain = "viktorbarzin.lan" + service_name = kubernetes_service.gate.metadata[0].name tls_secret_name = var.tls_secret_name allow_local_access_only = true ssl_redirect = false extra_annotations = { "gethomepage.dev/enabled" = "false" } - # noVNC loads ~60 unbundled ES modules in parallel; the default 10/50 - # limiter 429s the tail and the loader hangs forever. Dedicated limiter, - # same pattern as actualbudget/immich. 
- skip_default_rate_limit = true - extra_middlewares = ["traefik-android-emulator-rate-limit@kubernetescrd"] } -# Remote (off-LAN) screen access — Authentik-gated at the edge; WebSockets -# work through forward-auth same-origin (proven by stacks/terminal's ttyd). -# adb (5555) deliberately stays LAN-only: it is unauthenticated and must -# never be exposed publicly. -module "ingress-public" { +module "ingress-internal-novnc" { + source = "../../modules/kubernetes/ingress_factory" + # auth = "none": LAN-only noVNC paths (see ingress-internal-gate above). + auth = "none" + namespace = kubernetes_namespace.android-emulator.metadata[0].name + name = "android-emulator-novnc" + host = "android-emulator" + root_domain = "viktorbarzin.lan" + service_name = kubernetes_service.novnc.metadata[0].name + ingress_path = local.novnc_paths + tls_secret_name = var.tls_secret_name + allow_local_access_only = true + ssl_redirect = false + # noVNC loads ~60 unbundled ES modules in parallel; the default 10/50 + # limiter 429s the tail and the loader hangs forever. + skip_default_rate_limit = true + extra_middlewares = ["traefik-android-emulator-rate-limit@kubernetescrd"] + extra_annotations = { + "gethomepage.dev/enabled" = "false" + } +} + +# Remote (off-LAN) access — Authentik-gated at the edge; WebSockets work +# through forward-auth same-origin (proven by stacks/terminal's ttyd). +# adb (5555) deliberately stays LAN-only: it is unauthenticated. +module "ingress-public-gate" { source = "../../modules/kubernetes/ingress_factory" auth = "required" dns_type = "proxied" namespace = kubernetes_namespace.android-emulator.metadata[0].name name = "android-emulator-public" host = "android-emulator" - service_name = kubernetes_service.novnc.metadata[0].name + service_name = kubernetes_service.gate.metadata[0].name tls_secret_name = var.tls_secret_name - # noVNC loads ~60 unbundled ES modules in parallel; the default 10/50 - # limiter 429s the tail and the loader hangs forever. 
Dedicated limiter, - # same pattern as actualbudget/immich. +} + +module "ingress-public-novnc" { + source = "../../modules/kubernetes/ingress_factory" + auth = "required" + namespace = kubernetes_namespace.android-emulator.metadata[0].name + name = "android-emulator-public-novnc" + host = "android-emulator" + service_name = kubernetes_service.novnc.metadata[0].name + ingress_path = local.novnc_paths + tls_secret_name = var.tls_secret_name + # see ingress-internal-novnc — noVNC's parallel module storm needs the + # dedicated limiter. skip_default_rate_limit = true extra_middlewares = ["traefik-android-emulator-rate-limit@kubernetescrd"] } diff --git a/stacks/android-emulator/variables.tf b/stacks/android-emulator/variables.tf index efd8841f..bf9062d5 100644 --- a/stacks/android-emulator/variables.tf +++ b/stacks/android-emulator/variables.tf @@ -5,6 +5,6 @@ variable "tls_secret_name" { variable "image_tag" { type = string - default = "api36-v4" + default = "api36-v5" description = "android-emulator image tag at forgejo.viktorbarzin.me/viktor/android-emulator. Built + pushed manually from stacks/android-emulator/docker/ (see README.md) — bump this when the image is rebuilt." }