homelab: v0.5.0 — net/dns/metrics/logs probes (endpoint resolution)

The remaining verbs that pass the "saves reasoning, not just typing" test the
user posed mid-session: each encodes the non-obvious which-endpoint-reached-how
resolution otherwise re-derived every time. (Same test deprioritized node-ssh
and secret-get aliasing — thin wrappers over commands already known.)

- net check <host> [path]: two-legged reachability — external (public DNS→CF)
  vs internal (Traefik LB) — so you see WHERE a break is, not just that one path
  works. (live: surfaced the LB at 6ms vs CF 77ms.)
- dns lookup <name> [type]: Technitium (10.0.20.201) vs public (1.1.1.1) diff.
- metrics query "<promql>" / metrics alerts: Prometheus via the LB
  (prometheus-query.viktorbarzin.lan); alerts uses the synthetic ALERTS series
  since the query frontend has no /api/v1/alerts and Alertmanager has no ingress.
- logs query "<logql>" [--since 1h] [--limit N]: Loki range query via the LB.

All reach auth-free internal ingresses through the LB (Go form of
curl --resolve host:443:10.0.20.203) — no port-forward, no kubectl. In-cluster-
only endpoints (Alertmanager v2) deliberately out of scope. Verified live before
building; all five smoke-tested green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-19 11:27:31 +00:00
parent 9189560ac3
commit e91e1612dd
9 changed files with 466 additions and 3 deletions

197
cli/cmd_obs.go Normal file
View file

@ -0,0 +1,197 @@
package main
import (
"encoding/json"
"fmt"
"net/url"
"sort"
"strconv"
"strings"
"time"
)
const (
promHost = "prometheus-query.viktorbarzin.lan"
lokiHost = "loki.viktorbarzin.lan"
)
func obsCommands() []Command {
return []Command{
{Path: []string{"metrics", "query"}, Tier: TierRead,
Summary: `Prometheus instant query: metrics query "<promql>" [--json]`, Run: metricsQuery},
{Path: []string{"metrics", "alerts"}, Tier: TierRead,
Summary: "list currently firing Prometheus alerts", Run: metricsAlerts},
{Path: []string{"logs", "query"}, Tier: TierRead,
Summary: `Loki query (last --since, default 1h): logs query "<logql>" [--since 1h] [--limit N] [--json]`, Run: logsQuery},
}
}
// queryArg joins non-flag args into the query (PromQL/LogQL should normally be
// passed as a single quoted argument; this also tolerates unquoted multi-token).
func queryArg(args []string, valueFlags map[string]bool) string {
var parts []string
for i := 0; i < len(args); i++ {
a := args[i]
if valueFlags[a] {
i++
continue
}
if strings.HasPrefix(a, "-") {
continue
}
parts = append(parts, a)
}
return strings.Join(parts, " ")
}
func labelStr(m map[string]string) string {
name := m["__name__"]
var kv []string
for k, v := range m {
if k != "__name__" {
kv = append(kv, k+"="+v)
}
}
sort.Strings(kv)
return name + "{" + strings.Join(kv, ",") + "}"
}
func metricsQuery(args []string) error {
q := queryArg(args, nil)
if q == "" {
return fmt.Errorf(`usage: homelab metrics query "<promql>" [--json]`)
}
v := url.Values{}
v.Set("query", q)
body, err := lbGetBody(promHost, "/api/v1/query", v)
if err != nil {
return err
}
if containsArg(args, "--json") {
fmt.Println(string(body))
return nil
}
var r struct {
Data struct {
Result []struct {
Metric map[string]string `json:"metric"`
Value []interface{} `json:"value"`
} `json:"result"`
} `json:"data"`
}
if err := json.Unmarshal(body, &r); err != nil {
fmt.Println(string(body))
return nil
}
if len(r.Data.Result) == 0 {
fmt.Println("(no series)")
return nil
}
for _, s := range r.Data.Result {
val := ""
if len(s.Value) == 2 {
val = fmt.Sprint(s.Value[1])
}
fmt.Printf("%-14s %s\n", val, labelStr(s.Metric))
}
return nil
}
func metricsAlerts(args []string) error {
// prometheus-query is a query-only frontend (no /api/v1/alerts); the firing
// set is exposed as the synthetic ALERTS series, queryable the normal way.
v := url.Values{}
v.Set("query", `ALERTS{alertstate="firing"}`)
body, err := lbGetBody(promHost, "/api/v1/query", v)
if err != nil {
return err
}
if containsArg(args, "--json") {
fmt.Println(string(body))
return nil
}
var r struct {
Data struct {
Result []struct {
Metric map[string]string `json:"metric"`
} `json:"result"`
} `json:"data"`
}
if err := json.Unmarshal(body, &r); err != nil {
fmt.Println(string(body))
return nil
}
if len(r.Data.Result) == 0 {
fmt.Println("(no firing alerts)")
return nil
}
for _, a := range r.Data.Result {
m := a.Metric
scope := ""
for _, k := range []string{"namespace", "deployment", "instance", "job", "node"} {
if v := m[k]; v != "" {
scope = k + "=" + v
break
}
}
fmt.Printf("%-9s %-34s %s\n", m["severity"], m["alertname"], scope)
}
return nil
}
func logsQuery(args []string) error {
q := queryArg(args, map[string]bool{"--since": true, "--limit": true})
if q == "" {
return fmt.Errorf(`usage: homelab logs query "<logql>" [--since 1h] [--limit N] [--json]`)
}
since := flagValue(args, "--since")
if since == "" {
since = "1h"
}
dur, err := time.ParseDuration(since)
if err != nil {
return fmt.Errorf("bad --since %q: %w", since, err)
}
limit := flagValue(args, "--limit")
if limit == "" {
limit = "100"
}
end := time.Now()
v := url.Values{}
v.Set("query", q)
v.Set("limit", limit)
v.Set("start", strconv.FormatInt(end.Add(-dur).UnixNano(), 10))
v.Set("end", strconv.FormatInt(end.UnixNano(), 10))
body, err := lbGetBody(lokiHost, "/loki/api/v1/query_range", v)
if err != nil {
return err
}
if containsArg(args, "--json") {
fmt.Println(string(body))
return nil
}
var r struct {
Data struct {
Result []struct {
Values [][]string `json:"values"`
} `json:"result"`
} `json:"data"`
}
if err := json.Unmarshal(body, &r); err != nil {
fmt.Println(string(body))
return nil
}
n := 0
for _, s := range r.Data.Result {
for _, val := range s.Values {
if len(val) == 2 {
fmt.Println(val[1])
n++
}
}
}
if n == 0 {
fmt.Println("(no log lines)")
}
return nil
}