Move Immich ML into a separate deployment and ask it to use the GPU [ci skip]

2025-12-14 09:47:36 +00:00 · 2025-12-14 09:47:36 +00:00 · fcbb5971f3
commit fcbb5971f3
parent b284cfe293
2 changed files with 103 additions and 3 deletions
--- a/modules/kubernetes/immich/chart_values.tpl
+++ b/modules/kubernetes/immich/chart_values.tpl
@ -29,7 +29,7 @@ env:
  # IMMICH_MACHINE_LEARNING_URL: "http://immich-machine-learning.immich.svc.cluster.local:3003"

 image:
-  tag: v2.3.1
+  tag: ${version}

 immich:
  persistence:
@ -55,7 +55,8 @@ server:

 # increase liveness and readiness checks to allow enough time for downloading models
 machine-learning:
-  enabled: true
+  # enabled: true
+  enabled: false
  image:
    repository: ghcr.io/immich-app/immich-machine-learning
    pullPolicy: IfNotPresent
--- a/modules/kubernetes/immich/main.tf
+++ b/modules/kubernetes/immich/main.tf
@ -1,6 +1,12 @@
 variable "tls_secret_name" {}
 variable "postgresql_password" {}
 variable "homepage_token" {}
+# Immich release tag shared by the Helm chart and the ML deployment; bump the default to upgrade.
+variable "immich_version" {
+  default = "v2.3.1"
+  type    = string
+}
+

 module "tls_secret" {
  source          = "../setup_tls_secret"
@ -181,7 +187,100 @@ resource "helm_release" "immich" {
  version    = "0.9.3"
  timeout    = 6000

-  values = [templatefile("${path.module}/chart_values.tpl", { postgresql_password = var.postgresql_password })]
+  values = [templatefile("${path.module}/chart_values.tpl", { postgresql_password = var.postgresql_password, version = var.immich_version })]
+}
+
+# The Helm chart's machine-learning workload cannot be customized with
+# affinity settings, so it is disabled in chart_values.tpl and deployed here
+# instead, pinned to the GPU node via node_selector.
+resource "kubernetes_deployment" "immich-machine-learning" {
+  metadata {
+    name      = "immich-machine-learning"
+    namespace = "immich"
+  }
+  spec {
+    replicas = 1
+    selector {
+      match_labels = {
+        app = "immich-machine-learning"
+      }
+    }
+    # Recreate instead of RollingUpdate: with a single replica that claims a
+    # whole GPU, a rolling update starts the replacement pod while the old one
+    # still holds the device, so the new pod can stay Pending on a node with
+    # only one GPU. Recreate also avoids two pods writing the shared NFS cache
+    # at the same time.
+    strategy {
+      type = "Recreate"
+    }
+    template {
+      metadata {
+        labels = {
+          app = "immich-machine-learning"
+        }
+      }
+      spec {
+        # Schedule only on nodes labelled gpu=true.
+        node_selector = {
+          "gpu" = "true"
+        }
+        container {
+          # NOTE(review): the plain image tag is deployed while a GPU is
+          # reserved below; the "-cuda" variant is left commented out.
+          # Confirm which image is intended.
+          # image = "ghcr.io/immich-app/immich-machine-learning:${var.immich_version}-cuda"
+          image = "ghcr.io/immich-app/immich-machine-learning:${var.immich_version}"
+          name  = "immich-machine-learning"
+          port {
+            container_port = 3003
+            protocol       = "TCP"
+            name           = "immich-ml"
+          }
+          # All cache-related paths point under /cache, the NFS mount below.
+          env {
+            name  = "TRANSFORMERS_CACHE"
+            value = "/cache"
+          }
+          env {
+            name  = "HF_XET_CACHE"
+            value = "/cache/huggingface-xet"
+          }
+          env {
+            name  = "MPLCONFIGDIR"
+            value = "/cache/matplotlib-config"
+          }
+          # Presumably makes the service load this CLIP model at startup
+          # rather than on first request -- TODO confirm.
+          env {
+            name  = "MACHINE_LEARNING_PRELOAD__CLIP"
+            value = "ViT-B-16-SigLIP2__webli"
+          }
+
+          volume_mount {
+            name       = "cache"
+            mount_path = "/cache"
+          }
+          resources {
+            limits = {
+              "nvidia.com/gpu" = "1" # Used for inference
+            }
+          }
+        }
+        # Cache directory backed by an NFS export.
+        volume {
+          name = "cache"
+          nfs {
+            path   = "/mnt/main/immich/machine-learning"
+            server = "10.0.10.15"
+          }
+        }
+      }
+    }
+  }
+}
+
+# Exposes the pods labelled app=immich-machine-learning on port 3003.
+resource "kubernetes_service" "immich-machine-learning" {
+  metadata {
+    namespace = "immich"
+    name      = "immich-machine-learning"
+    labels = {
+      app = "immich-machine-learning"
+    }
+  }
+
+  spec {
+    port {
+      port = 3003
+    }
+    selector = {
+      "app" = "immich-machine-learning"
+    }
+  }
 }

 resource "kubernetes_ingress_v1" "ingress" {