homelab: v0.5.0 — net/dns/metrics/logs probes (endpoint resolution)
The remaining verbs that pass the "saves reasoning, not just typing" test the user posed mid-session: each encodes the non-obvious which-endpoint-reached-how resolution that would otherwise be re-derived every time. (The same test deprioritized node-ssh and secret-get aliasing — thin wrappers over commands already known.)

- net check <host> [path]: two-legged reachability — external (public DNS→CF) vs internal (Traefik LB) — so you see WHERE a break is, not just that one path works. (live: surfaced the LB at 6ms vs CF 77ms.)
- dns lookup <name> [type]: Technitium (10.0.20.201) vs public (1.1.1.1) diff.
- metrics query "<promql>" / metrics alerts: Prometheus via the LB (prometheus-query.viktorbarzin.lan); alerts uses the synthetic ALERTS series since the query frontend has no /api/v1/alerts and Alertmanager has no ingress.
- logs query "<logql>" [--since 1h] [--limit N]: Loki range query via the LB.

All reach auth-free internal ingresses through the LB (Go form of curl --resolve host:443:10.0.20.203) — no port-forward, no kubectl. In-cluster-only endpoints (Alertmanager v2) are deliberately out of scope.

Verified live before building; all five smoke-tested green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
9189560ac3
commit
e91e1612dd
9 changed files with 466 additions and 3 deletions
|
|
@ -112,6 +112,25 @@ remote, with retries that ride Woodpecker's intermittent empty responses.
|
|||
step) is deferred to v0.4.1 — Woodpecker's per-pipeline detail/log endpoints were
|
||||
the least reliable; `status`/`watch` use the list endpoint that works.
|
||||
|
||||
### v0.5 verbs — net / dns / metrics / logs
|
||||
|
||||
Reachability + observability probes. Their value is *endpoint resolution* — the
|
||||
non-obvious "which host, public or LB, what auth, what URL shape" reasoning you'd
|
||||
otherwise re-derive every time — not the HTTP call itself. All reach internal
|
||||
ingresses through the Traefik LB (the Go form of `curl --resolve host:443:10.0.20.203`).
|
||||
|
||||
| Command | Tier | What it does |
|
||||
|---|---|---|
|
||||
| `net check <host> [path]` | read | probes the host two ways — external (public DNS → Cloudflare) vs internal (Traefik LB) — with status + latency, so you can tell *where* a break is (CF? app? the LB path?) |
|
||||
| `dns lookup <name> [type]` | read | resolves via Technitium (`10.0.20.201`) and public (`1.1.1.1`), diffed — surfaces split-horizon vs propagation gaps |
|
||||
| `metrics query "<promql>"` | read | Prometheus instant query (`prometheus-query.viktorbarzin.lan`); prints `value {labels}` or `--json` |
|
||||
| `metrics alerts` | read | currently-firing alerts (via the synthetic `ALERTS` series — the query frontend has no `/api/v1/alerts`) |
|
||||
| `logs query "<logql>" [--since 1h] [--limit N]` | read | Loki range query (`loki.viktorbarzin.lan`); prints log lines or `--json` |
|
||||
|
||||
Quote the PromQL/LogQL. These hit auth-free internal ingresses — no port-forward,
|
||||
no kubectl. (In-cluster-only endpoints like Alertmanager stay out of scope; the
|
||||
firing set is reachable via `ALERTS` instead.)
|
||||
|
||||
## Build / install
|
||||
|
||||
Built from source to `/usr/local/bin/homelab` during devvm provisioning
|
||||
|
|
@ -131,4 +150,4 @@ original flag-based path unchanged, so the webhook handler is unaffected.
|
|||
|
||||
## Design
|
||||
|
||||
See `infra/docs/adr/0004`–`0009` for the architecture decisions.
|
||||
See `infra/docs/adr/0004`–`0010` for the architecture decisions.
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
v0.4.0
|
||||
v0.5.0
|
||||
|
|
|
|||
83
cli/cmd_net.go
Normal file
83
cli/cmd_net.go
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
func netCommands() []Command {
|
||||
return []Command{
|
||||
{Path: []string{"net", "check"}, Tier: TierRead,
|
||||
Summary: "reachability of <host>[/path]: external (public DNS→CF) vs internal (Traefik LB)", Run: netCheck},
|
||||
{Path: []string{"dns", "lookup"}, Tier: TierRead,
|
||||
Summary: "resolve <name> via Technitium (10.0.20.201) and public (1.1.1.1), diffed", Run: dnsLookup},
|
||||
}
|
||||
}
|
||||
|
||||
// fmtProbe renders the outcome of one probe for display: "ERR <message>" when
// the request failed, otherwise "HTTP <status> <elapsed>ms" with the elapsed
// time truncated to whole milliseconds.
func fmtProbe(code int, d time.Duration, err error) string {
	if err == nil {
		elapsedMs := d.Milliseconds()
		return fmt.Sprintf("HTTP %d %dms", code, elapsedMs)
	}
	return "ERR " + err.Error()
}
|
||||
|
||||
func netCheck(args []string) error {
|
||||
host, rest := firstPositional(args)
|
||||
if host == "" {
|
||||
return fmt.Errorf("usage: homelab net check <host> [path]")
|
||||
}
|
||||
path := "/"
|
||||
if len(rest) > 0 && !strings.HasPrefix(rest[0], "-") {
|
||||
path = rest[0]
|
||||
if !strings.HasPrefix(path, "/") {
|
||||
path = "/" + path
|
||||
}
|
||||
}
|
||||
u := "https://" + host + path
|
||||
fmt.Printf("%s\n", u)
|
||||
|
||||
// external leg: resolve via public DNS, dial the public IP (tests the real CF path)
|
||||
pubOut, _ := dig(hostOnly(host), "1.1.1.1", "")
|
||||
if pubIP := firstLine(pubOut); pubIP != "" {
|
||||
c, d, e := probeURL(clientDialingIP(pubIP, 10*time.Second), u)
|
||||
fmt.Printf(" external (public %-15s) %s\n", pubIP, fmtProbe(c, d, e))
|
||||
} else {
|
||||
fmt.Println(" external (public) no public A record")
|
||||
}
|
||||
// internal leg: dial the Traefik LB directly
|
||||
c, d, e := probeURL(clientDialingIP(internalLBIP, 10*time.Second), u)
|
||||
fmt.Printf(" internal (LB %-15s) %s\n", internalLBIP, fmtProbe(c, d, e))
|
||||
return nil
|
||||
}
|
||||
|
||||
func dnsLookup(args []string) error {
|
||||
name, rest := firstPositional(args)
|
||||
if name == "" {
|
||||
return fmt.Errorf("usage: homelab dns lookup <name> [A|AAAA|TXT|MX|PTR]")
|
||||
}
|
||||
rr := ""
|
||||
if len(rest) > 0 {
|
||||
rr = rest[0]
|
||||
}
|
||||
tech, _ := dig(name, "10.0.20.201", rr)
|
||||
pub, _ := dig(name, "1.1.1.1", rr)
|
||||
fmt.Printf("technitium (10.0.20.201): %s\n", oneLineList(tech))
|
||||
fmt.Printf("public (1.1.1.1) : %s\n", oneLineList(pub))
|
||||
if strings.TrimSpace(tech) != strings.TrimSpace(pub) {
|
||||
fmt.Println("⚠ mismatch — split-horizon (expected for internal-only apps) or a propagation gap")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// hostOnly returns h up to (not including) its first "/", dropping any path
// that was passed along with the host name. h is returned unchanged when it
// contains no "/".
func hostOnly(h string) string {
	if slash := strings.IndexByte(h, '/'); slash >= 0 {
		return h[:slash]
	}
	return h
}
|
||||
|
||||
// oneLineList flattens multi-line output onto a single line: surrounding
// whitespace is trimmed and each newline becomes ", ". Input that is empty
// after trimming is rendered as "(none)".
func oneLineList(s string) string {
	trimmed := strings.TrimSpace(s)
	if len(trimmed) == 0 {
		return "(none)"
	}
	return strings.Join(strings.Split(trimmed, "\n"), ", ")
}
|
||||
197
cli/cmd_obs.go
Normal file
197
cli/cmd_obs.go
Normal file
|
|
@ -0,0 +1,197 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Virtual hosts of the observability ingresses queried by the metrics and
// logs verbs. Both are requested through lbGetBody, i.e. via the internal LB.
const (
	// promHost is the Prometheus query frontend used for /api/v1/query.
	promHost = "prometheus-query.viktorbarzin.lan"
	// lokiHost is the Loki ingress used for /loki/api/v1/query_range.
	lokiHost = "loki.viktorbarzin.lan"
)
|
||||
|
||||
func obsCommands() []Command {
|
||||
return []Command{
|
||||
{Path: []string{"metrics", "query"}, Tier: TierRead,
|
||||
Summary: `Prometheus instant query: metrics query "<promql>" [--json]`, Run: metricsQuery},
|
||||
{Path: []string{"metrics", "alerts"}, Tier: TierRead,
|
||||
Summary: "list currently firing Prometheus alerts", Run: metricsAlerts},
|
||||
{Path: []string{"logs", "query"}, Tier: TierRead,
|
||||
Summary: `Loki query (last --since, default 1h): logs query "<logql>" [--since 1h] [--limit N] [--json]`, Run: logsQuery},
|
||||
}
|
||||
}
|
||||
|
||||
// queryArg joins the non-flag args into the query string, separated by single
// spaces. PromQL/LogQL should normally be passed as one quoted argument; this
// also tolerates an unquoted multi-token query.
//
// Args listed in valueFlags are skipped together with the arg that follows
// them (their value). Any other arg that looks like a flag is skipped on its
// own. An arg only counts as a flag when it starts with "--" or with "-"
// followed by an ASCII letter, so query tokens such as a bare "-" operator, a
// negative literal ("-1") or a quoted query beginning with one ("-1 * up")
// are kept instead of being silently dropped. A query that starts with "-"
// followed by a letter (e.g. "-up") is still indistinguishable from a flag
// and is skipped; write it as "-(up)".
//
// valueFlags may be nil. The result is "" when no query token remains.
func queryArg(args []string, valueFlags map[string]bool) string {
	isFlag := func(a string) bool {
		if len(a) < 2 || a[0] != '-' {
			return false
		}
		c := a[1]
		return c == '-' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
	}

	parts := make([]string, 0, len(args))
	for i := 0; i < len(args); i++ {
		a := args[i]
		switch {
		case valueFlags[a]:
			i++ // also skip the flag's value
		case isFlag(a):
			// boolean flag such as --json: not part of the query
		default:
			parts = append(parts, a)
		}
	}
	return strings.Join(parts, " ")
}
|
||||
|
||||
// labelStr formats a label set as name{k=v,k=v}: the "__name__" label becomes
// the prefix (empty when absent) and the remaining labels are listed as
// unquoted k=v pairs in sorted order, so the output is deterministic.
func labelStr(m map[string]string) string {
	pairs := make([]string, 0, len(m))
	for key, val := range m {
		if key == "__name__" {
			continue
		}
		pairs = append(pairs, key+"="+val)
	}
	sort.Strings(pairs)

	var b strings.Builder
	b.WriteString(m["__name__"])
	b.WriteByte('{')
	b.WriteString(strings.Join(pairs, ","))
	b.WriteByte('}')
	return b.String()
}
|
||||
|
||||
func metricsQuery(args []string) error {
|
||||
q := queryArg(args, nil)
|
||||
if q == "" {
|
||||
return fmt.Errorf(`usage: homelab metrics query "<promql>" [--json]`)
|
||||
}
|
||||
v := url.Values{}
|
||||
v.Set("query", q)
|
||||
body, err := lbGetBody(promHost, "/api/v1/query", v)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if containsArg(args, "--json") {
|
||||
fmt.Println(string(body))
|
||||
return nil
|
||||
}
|
||||
var r struct {
|
||||
Data struct {
|
||||
Result []struct {
|
||||
Metric map[string]string `json:"metric"`
|
||||
Value []interface{} `json:"value"`
|
||||
} `json:"result"`
|
||||
} `json:"data"`
|
||||
}
|
||||
if err := json.Unmarshal(body, &r); err != nil {
|
||||
fmt.Println(string(body))
|
||||
return nil
|
||||
}
|
||||
if len(r.Data.Result) == 0 {
|
||||
fmt.Println("(no series)")
|
||||
return nil
|
||||
}
|
||||
for _, s := range r.Data.Result {
|
||||
val := ""
|
||||
if len(s.Value) == 2 {
|
||||
val = fmt.Sprint(s.Value[1])
|
||||
}
|
||||
fmt.Printf("%-14s %s\n", val, labelStr(s.Metric))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func metricsAlerts(args []string) error {
|
||||
// prometheus-query is a query-only frontend (no /api/v1/alerts); the firing
|
||||
// set is exposed as the synthetic ALERTS series, queryable the normal way.
|
||||
v := url.Values{}
|
||||
v.Set("query", `ALERTS{alertstate="firing"}`)
|
||||
body, err := lbGetBody(promHost, "/api/v1/query", v)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if containsArg(args, "--json") {
|
||||
fmt.Println(string(body))
|
||||
return nil
|
||||
}
|
||||
var r struct {
|
||||
Data struct {
|
||||
Result []struct {
|
||||
Metric map[string]string `json:"metric"`
|
||||
} `json:"result"`
|
||||
} `json:"data"`
|
||||
}
|
||||
if err := json.Unmarshal(body, &r); err != nil {
|
||||
fmt.Println(string(body))
|
||||
return nil
|
||||
}
|
||||
if len(r.Data.Result) == 0 {
|
||||
fmt.Println("(no firing alerts)")
|
||||
return nil
|
||||
}
|
||||
for _, a := range r.Data.Result {
|
||||
m := a.Metric
|
||||
scope := ""
|
||||
for _, k := range []string{"namespace", "deployment", "instance", "job", "node"} {
|
||||
if v := m[k]; v != "" {
|
||||
scope = k + "=" + v
|
||||
break
|
||||
}
|
||||
}
|
||||
fmt.Printf("%-9s %-34s %s\n", m["severity"], m["alertname"], scope)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func logsQuery(args []string) error {
|
||||
q := queryArg(args, map[string]bool{"--since": true, "--limit": true})
|
||||
if q == "" {
|
||||
return fmt.Errorf(`usage: homelab logs query "<logql>" [--since 1h] [--limit N] [--json]`)
|
||||
}
|
||||
since := flagValue(args, "--since")
|
||||
if since == "" {
|
||||
since = "1h"
|
||||
}
|
||||
dur, err := time.ParseDuration(since)
|
||||
if err != nil {
|
||||
return fmt.Errorf("bad --since %q: %w", since, err)
|
||||
}
|
||||
limit := flagValue(args, "--limit")
|
||||
if limit == "" {
|
||||
limit = "100"
|
||||
}
|
||||
end := time.Now()
|
||||
v := url.Values{}
|
||||
v.Set("query", q)
|
||||
v.Set("limit", limit)
|
||||
v.Set("start", strconv.FormatInt(end.Add(-dur).UnixNano(), 10))
|
||||
v.Set("end", strconv.FormatInt(end.UnixNano(), 10))
|
||||
body, err := lbGetBody(lokiHost, "/loki/api/v1/query_range", v)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if containsArg(args, "--json") {
|
||||
fmt.Println(string(body))
|
||||
return nil
|
||||
}
|
||||
var r struct {
|
||||
Data struct {
|
||||
Result []struct {
|
||||
Values [][]string `json:"values"`
|
||||
} `json:"result"`
|
||||
} `json:"data"`
|
||||
}
|
||||
if err := json.Unmarshal(body, &r); err != nil {
|
||||
fmt.Println(string(body))
|
||||
return nil
|
||||
}
|
||||
n := 0
|
||||
for _, s := range r.Data.Result {
|
||||
for _, val := range s.Values {
|
||||
if len(val) == 2 {
|
||||
fmt.Println(val[1])
|
||||
n++
|
||||
}
|
||||
}
|
||||
}
|
||||
if n == 0 {
|
||||
fmt.Println("(no log lines)")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
@ -18,6 +18,8 @@ func buildRegistry() []Command {
|
|||
reg = append(reg, memoryCommands()...)
|
||||
reg = append(reg, ciCommands()...)
|
||||
reg = append(reg, deployCommands()...)
|
||||
reg = append(reg, netCommands()...)
|
||||
reg = append(reg, obsCommands()...)
|
||||
return reg
|
||||
}
|
||||
|
||||
|
|
|
|||
76
cli/probe.go
Normal file
76
cli/probe.go
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// internalLBIP is the dedicated Traefik LB; every internal ingress routes through it.
// Requests to internal hosts are dialled to this address while keeping the
// URL's host name (the Go form of `curl --resolve host:443:10.0.20.203`).
const internalLBIP = "10.0.20.203"
|
||||
|
||||
// clientDialingIP returns an http.Client that dials ip for ANY host while keeping
// the URL host as SNI (so the cert matches) — the Go form of `curl --resolve
// host:443:ip`. TLS verification is skipped (these are reachability/observability
// probes, not security checks; internal .lan vhosts may serve a non-matching cert).
//
// ip may be an IPv4 or IPv6 literal, with or without surrounding brackets; the
// port of the original request is kept. timeout bounds the whole request,
// while connection establishment is additionally capped at 8 seconds.
func clientDialingIP(ip string, timeout time.Duration) *http.Client {
	// Brackets are re-added by JoinHostPort where the literal needs them.
	target := strings.Trim(ip, "[]")
	d := &net.Dialer{Timeout: 8 * time.Second}
	tr := &http.Transport{
		DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
			// Swap only the host part. Splitting and joining via the net
			// helpers keeps the address valid for IPv6 literals, which a
			// plain "ip + :port" concatenation would not.
			if _, port, err := net.SplitHostPort(addr); err == nil {
				addr = net.JoinHostPort(target, port)
			}
			return d.DialContext(ctx, network, addr)
		},
		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
	}
	return &http.Client{Timeout: timeout, Transport: tr}
}
|
||||
|
||||
// probeURL issues a GET for rawurl with c and reports the response status code
// together with the time the request took. On failure the status code is 0
// and the error from the client is returned alongside the elapsed time. The
// response body is closed without being read.
func probeURL(c *http.Client, rawurl string) (int, time.Duration, error) {
	began := time.Now()
	resp, err := c.Get(rawurl)
	elapsed := time.Since(began)
	if err == nil {
		resp.Body.Close()
		return resp.StatusCode, elapsed, nil
	}
	return 0, elapsed, err
}
|
||||
|
||||
// lbGetBody GETs https://<host><path>?<q> through the internal LB and returns the body.
|
||||
func lbGetBody(host, path string, q url.Values) ([]byte, error) {
|
||||
u := "https://" + host + path
|
||||
if len(q) > 0 {
|
||||
u += "?" + q.Encode()
|
||||
}
|
||||
resp, err := clientDialingIP(internalLBIP, 20*time.Second).Get(u)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
if resp.StatusCode >= 300 {
|
||||
return nil, fmt.Errorf("%s -> %d: %s", path, resp.StatusCode, strings.TrimSpace(string(body)))
|
||||
}
|
||||
return body, nil
|
||||
}
|
||||
|
||||
// dig runs `dig +short +time=3 +tries=1 [rrtype] <name> @<server>` and returns
// its standard output with surrounding whitespace trimmed. rrtype is left out
// of the command line when empty. The error from running the command is
// returned together with whatever output was captured.
func dig(name, server, rrtype string) (string, error) {
	argv := make([]string, 0, 6)
	argv = append(argv, "+short", "+time=3", "+tries=1")
	if rrtype != "" {
		argv = append(argv, rrtype)
	}
	argv = append(argv, name, "@"+server)

	raw, err := exec.Command("dig", argv...).Output()
	return strings.TrimSpace(string(raw)), err
}
|
||||
49
cli/probe_test.go
Normal file
49
cli/probe_test.go
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
package main
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestQueryArg(t *testing.T) {
|
||||
if got := queryArg([]string{"up"}, nil); got != "up" {
|
||||
t.Errorf(`queryArg(["up"]) = %q, want "up"`, got)
|
||||
}
|
||||
if got := queryArg([]string{"up", "--json"}, nil); got != "up" {
|
||||
t.Errorf(`--json should be dropped, got %q`, got)
|
||||
}
|
||||
// single quoted PromQL arrives as one token
|
||||
if got := queryArg([]string{"count by (node) (up)", "--json"}, nil); got != "count by (node) (up)" {
|
||||
t.Errorf(`quoted query mangled: %q`, got)
|
||||
}
|
||||
// value-flags and their values are skipped, query survives
|
||||
vf := map[string]bool{"--since": true, "--limit": true}
|
||||
if got := queryArg([]string{`{app="x"}`, "--since", "1h", "--limit", "50"}, vf); got != `{app="x"}` {
|
||||
t.Errorf(`value-flag skipping failed: %q`, got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLabelStr(t *testing.T) {
|
||||
got := labelStr(map[string]string{"__name__": "up", "job": "x", "instance": "y"})
|
||||
if got != "up{instance=y,job=x}" { // __name__ extracted, rest sorted
|
||||
t.Errorf("labelStr = %q", got)
|
||||
}
|
||||
if got := labelStr(map[string]string{"alertname": "Foo"}); got != "{alertname=Foo}" {
|
||||
t.Errorf("labelStr (no __name__) = %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestOneLineList(t *testing.T) {
|
||||
if got := oneLineList(" "); got != "(none)" {
|
||||
t.Errorf("empty = %q, want (none)", got)
|
||||
}
|
||||
if got := oneLineList("a\nb"); got != "a, b" {
|
||||
t.Errorf("multi = %q, want 'a, b'", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHostOnly(t *testing.T) {
|
||||
if got := hostOnly("foo.me/path"); got != "foo.me" {
|
||||
t.Errorf("hostOnly = %q", got)
|
||||
}
|
||||
if got := hostOnly("foo.me"); got != "foo.me" {
|
||||
t.Errorf("hostOnly = %q", got)
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue