fan-control: presence-aware IPMI fan curve for the R730 PVE host
The iDRAC stock curve runs the CPU at ~72°C on the 7080 RPM floor even under load (optimises for quiet, not cool). Add a bash daemon + systemd unit that drives the chassis fans from CPU temp on two curves, picked by garage occupancy (the server is in the garage): COOL when empty (measured ~58-65°C under load), QUIET near the silent floor when the ha-sofia garage door shows someone is there (open, or <15min since last activity). Manual fan mode is backstopped: bash EXIT trap + systemd ExecStopPost hand fans back to Dell auto on stop/crash; CPU>=83°C or repeated IPMI failures do the same. Pushgateway metrics (job=fan_control). 36 unit tests cover the pure curve/hysteresis/presence/parse logic; DRY_RUN + RUN_ONCE for integration checks. Deployed and verified on 192.168.1.127 (CPU 70->58°C in cool mode, hysteresis stepping confirmed). Design: docs/plans/2026-06-04-pve-fan-control-design.md Runbook: docs/runbooks/fan-control.md [ci skip] Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
c6f27fa172
commit
90ad6b9125
60 changed files with 640 additions and 9563 deletions
|
|
@ -1,301 +0,0 @@
|
|||
"""Stream health checker - verifies extracted streams are live and responsive.
|
||||
|
||||
Performs GET requests against m3u8 URLs to verify they contain valid HLS
|
||||
playlists (#EXTM3U header), measures response times for quality ranking,
|
||||
and supports concurrent checking of multiple streams.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# How long to wait for a single health check (seconds)
|
||||
HEALTH_CHECK_TIMEOUT = 10.0
|
||||
|
||||
# Maximum bytes to read when verifying m3u8 content
|
||||
# We only need to see the #EXTM3U header and a few lines
|
||||
MAX_CONTENT_BYTES = 8192
|
||||
|
||||
# User-Agent to send with health check requests
|
||||
USER_AGENT = (
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/120.0.0.0 Safari/537.36"
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class StreamHealth:
|
||||
"""Result of a single stream health check."""
|
||||
|
||||
url: str
|
||||
is_live: bool
|
||||
response_time_ms: int # Lower = better quality indicator
|
||||
checked_at: str = field(
|
||||
default_factory=lambda: datetime.now(timezone.utc).isoformat()
|
||||
)
|
||||
error: str = "" # Error message if not live
|
||||
bitrate: int = 0 # Bitrate in bps if detectable from playlist
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Serialize to a plain dictionary for JSON responses."""
|
||||
return {
|
||||
"url": self.url,
|
||||
"is_live": self.is_live,
|
||||
"response_time_ms": self.response_time_ms,
|
||||
"checked_at": self.checked_at,
|
||||
"error": self.error,
|
||||
"bitrate": self.bitrate,
|
||||
}
|
||||
|
||||
|
||||
def _extract_bitrate(content: str) -> int:
|
||||
"""Try to extract bitrate from m3u8 playlist content.
|
||||
|
||||
Looks for BANDWIDTH= in #EXT-X-STREAM-INF tags. Returns the highest
|
||||
bitrate found, or 0 if none detected.
|
||||
"""
|
||||
max_bitrate = 0
|
||||
for line in content.splitlines():
|
||||
if "BANDWIDTH=" in line:
|
||||
try:
|
||||
# Parse BANDWIDTH=<number> from the tag
|
||||
for part in line.split(","):
|
||||
part = part.strip()
|
||||
if part.startswith("BANDWIDTH="):
|
||||
bw = int(part.split("=", 1)[1])
|
||||
max_bitrate = max(max_bitrate, bw)
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
return max_bitrate
|
||||
|
||||
|
||||
class StreamHealthChecker:
|
||||
"""Background health checker for extracted streams.
|
||||
|
||||
Verifies streams are live by performing a partial GET on the m3u8 URL,
|
||||
checking for valid HLS content (#EXTM3U header), and measuring response
|
||||
time as a quality indicator.
|
||||
"""
|
||||
|
||||
def __init__(self, timeout: float = HEALTH_CHECK_TIMEOUT) -> None:
|
||||
self._timeout = timeout
|
||||
|
||||
async def check_stream(self, url: str) -> StreamHealth:
|
||||
"""Check if a stream URL is live by doing a partial GET on the m3u8.
|
||||
|
||||
Verification steps:
|
||||
1. GET the m3u8 URL (not just HEAD - need to verify playlist content)
|
||||
2. Check if response contains #EXTM3U header
|
||||
3. Measure response time as a quality indicator
|
||||
4. Extract bitrate info if available
|
||||
|
||||
Args:
|
||||
url: The m3u8 stream URL to check.
|
||||
|
||||
Returns:
|
||||
StreamHealth with is_live, response_time_ms, checked_at, and
|
||||
optional bitrate and error information.
|
||||
"""
|
||||
start_time = time.monotonic()
|
||||
checked_at = datetime.now(timezone.utc).isoformat()
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(
|
||||
timeout=self._timeout,
|
||||
follow_redirects=True,
|
||||
headers={
|
||||
"User-Agent": USER_AGENT,
|
||||
"Accept": "*/*",
|
||||
},
|
||||
) as client:
|
||||
# Use a partial GET with Range header to limit download
|
||||
# but fall back to reading limited bytes if Range not supported
|
||||
response = await client.get(
|
||||
url,
|
||||
headers={"Range": f"bytes=0-{MAX_CONTENT_BYTES - 1}"},
|
||||
)
|
||||
|
||||
elapsed_ms = int((time.monotonic() - start_time) * 1000)
|
||||
|
||||
# Accept 200 (full content) or 206 (partial content)
|
||||
if response.status_code not in (200, 206):
|
||||
return StreamHealth(
|
||||
url=url,
|
||||
is_live=False,
|
||||
response_time_ms=elapsed_ms,
|
||||
checked_at=checked_at,
|
||||
error=f"HTTP {response.status_code}",
|
||||
)
|
||||
|
||||
content = response.text[:MAX_CONTENT_BYTES]
|
||||
|
||||
# Verify it's a valid HLS playlist
|
||||
if "#EXTM3U" not in content:
|
||||
return StreamHealth(
|
||||
url=url,
|
||||
is_live=False,
|
||||
response_time_ms=elapsed_ms,
|
||||
checked_at=checked_at,
|
||||
error="Response does not contain #EXTM3U header",
|
||||
)
|
||||
|
||||
# Extract bitrate info if available
|
||||
bitrate = _extract_bitrate(content)
|
||||
|
||||
# If this is a master playlist, validate at least one variant
|
||||
if "#EXT-X-STREAM-INF:" in content:
|
||||
variant_ok = await self._check_first_variant(
|
||||
content, url, client
|
||||
)
|
||||
if not variant_ok:
|
||||
return StreamHealth(
|
||||
url=url,
|
||||
is_live=False,
|
||||
response_time_ms=elapsed_ms,
|
||||
checked_at=checked_at,
|
||||
bitrate=bitrate,
|
||||
error="Master playlist OK but variant playlists are unreachable",
|
||||
)
|
||||
|
||||
return StreamHealth(
|
||||
url=url,
|
||||
is_live=True,
|
||||
response_time_ms=elapsed_ms,
|
||||
checked_at=checked_at,
|
||||
bitrate=bitrate,
|
||||
)
|
||||
|
||||
except httpx.TimeoutException:
|
||||
elapsed_ms = int((time.monotonic() - start_time) * 1000)
|
||||
logger.debug("Health check timed out for %s", url)
|
||||
return StreamHealth(
|
||||
url=url,
|
||||
is_live=False,
|
||||
response_time_ms=elapsed_ms,
|
||||
checked_at=checked_at,
|
||||
error="Timeout",
|
||||
)
|
||||
except httpx.HTTPError as e:
|
||||
elapsed_ms = int((time.monotonic() - start_time) * 1000)
|
||||
logger.debug("Health check HTTP error for %s: %s", url, e)
|
||||
return StreamHealth(
|
||||
url=url,
|
||||
is_live=False,
|
||||
response_time_ms=elapsed_ms,
|
||||
checked_at=checked_at,
|
||||
error=f"HTTP error: {e}",
|
||||
)
|
||||
except Exception as e:
|
||||
elapsed_ms = int((time.monotonic() - start_time) * 1000)
|
||||
logger.exception("Unexpected error during health check for %s", url)
|
||||
return StreamHealth(
|
||||
url=url,
|
||||
is_live=False,
|
||||
response_time_ms=elapsed_ms,
|
||||
checked_at=checked_at,
|
||||
error=f"Unexpected error: {e}",
|
||||
)
|
||||
|
||||
async def _check_first_variant(
|
||||
self, content: str, base_url: str, client: httpx.AsyncClient
|
||||
) -> bool:
|
||||
"""Check that at least one variant playlist in a master playlist is reachable.
|
||||
|
||||
Extracts the first variant URI from a master playlist and does a HEAD
|
||||
request to verify it returns 200/206. This catches streams where the
|
||||
master playlist is valid but all variant playlists are 404.
|
||||
|
||||
Args:
|
||||
content: The master playlist text content.
|
||||
base_url: The URL of the master playlist (for resolving relative URIs).
|
||||
client: An existing httpx client to reuse.
|
||||
|
||||
Returns:
|
||||
True if at least one variant is reachable, False otherwise.
|
||||
"""
|
||||
lines = content.splitlines()
|
||||
for i, line in enumerate(lines):
|
||||
if not line.strip().startswith("#EXT-X-STREAM-INF:"):
|
||||
continue
|
||||
# Next non-empty, non-comment line is the variant URI
|
||||
for j in range(i + 1, len(lines)):
|
||||
variant_uri = lines[j].strip()
|
||||
if variant_uri and not variant_uri.startswith("#"):
|
||||
# Resolve relative URI
|
||||
if not variant_uri.startswith(("http://", "https://")):
|
||||
variant_uri = urljoin(base_url, variant_uri)
|
||||
try:
|
||||
resp = await client.head(variant_uri)
|
||||
if resp.status_code in (200, 206):
|
||||
return True
|
||||
# HEAD might not be supported, try GET
|
||||
resp = await client.get(
|
||||
variant_uri,
|
||||
headers={"Range": f"bytes=0-{MAX_CONTENT_BYTES - 1}"},
|
||||
)
|
||||
if resp.status_code in (200, 206):
|
||||
return True
|
||||
logger.debug(
|
||||
"Variant playlist %s returned HTTP %d",
|
||||
variant_uri, resp.status_code,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug(
|
||||
"Variant check failed for %s: %s", variant_uri, e
|
||||
)
|
||||
# Only check the first variant
|
||||
return False
|
||||
# No variants found (shouldn't happen if #EXT-X-STREAM-INF was detected)
|
||||
return True
|
||||
|
||||
async def check_all(
|
||||
self, streams: list[dict],
|
||||
) -> dict[str, StreamHealth]:
|
||||
"""Check all streams concurrently, return health map keyed by URL.
|
||||
|
||||
Args:
|
||||
streams: List of stream dicts (must have a "url" key).
|
||||
|
||||
Returns:
|
||||
Dictionary mapping stream URL to its StreamHealth result.
|
||||
"""
|
||||
urls = [s["url"] for s in streams if "url" in s]
|
||||
|
||||
if not urls:
|
||||
return {}
|
||||
|
||||
logger.info("Running health checks on %d stream(s)...", len(urls))
|
||||
|
||||
# Run all checks concurrently
|
||||
tasks = [self.check_stream(url) for url in urls]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
health_map: dict[str, StreamHealth] = {}
|
||||
for url, result in zip(urls, results):
|
||||
if isinstance(result, Exception):
|
||||
logger.error("Health check task failed for %s: %s", url, result)
|
||||
health_map[url] = StreamHealth(
|
||||
url=url,
|
||||
is_live=False,
|
||||
response_time_ms=0,
|
||||
error=f"Task error: {result}",
|
||||
)
|
||||
else:
|
||||
health_map[url] = result
|
||||
|
||||
live_count = sum(1 for h in health_map.values() if h.is_live)
|
||||
logger.info(
|
||||
"Health checks complete: %d/%d streams are live",
|
||||
live_count,
|
||||
len(health_map),
|
||||
)
|
||||
|
||||
return health_map
|
||||
Loading…
Add table
Add a link
Reference in a new issue