[ci skip] Move Terraform modules into stack directories

Move all 88 service modules (66 individual + 22 platform) from modules/kubernetes/<service>/ into their corresponding stack directories:
- Service stacks: stacks/<service>/module/
- Platform stack: stacks/platform/modules/<service>/

This colocates each module's source code with its Terragrunt definition. Only shared utility modules remain in modules/kubernetes/: ingress_factory, setup_tls_secret, dockerhub_secret, oauth-proxy. All cross-references to shared modules were updated to use the correct relative paths.

Verified with terragrunt run --all -- plan: 0 adds, 0 destroys across all 68 stacks.
2026-02-22 14:38:14 +00:00 · 2026-02-22 14:38:14 +00:00 · e225e81ebf
commit e225e81ebf
parent 73cb696f12
614 changed files with 12075 additions and 352 deletions
--- a/stacks/f1-stream/module/files/internal/scraper/reddit.go
+++ b/stacks/f1-stream/module/files/internal/scraper/reddit.go
@ -0,0 +1,327 @@
+package scraper
+
+import (
+	"crypto/rand"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log"
+	"math"
+	"net/http"
+	"net/url"
+	"regexp"
+	"strings"
+	"time"
+
+	"f1-stream/internal/models"
+)
+
+// Scraper request settings:
+//   - subredditURL is the JSON listing endpoint fetched by scrapeReddit; its
+//     query string limits the listing to 25 posts.
+//   - userAgent is set as the User-Agent header on the requests built by
+//     fetchJSON and HasVideoContent.
+//   - requestDelay is how long scrapeReddit sleeps around each per-post
+//     comments request.
+const (
+	subredditURL = "https://www.reddit.com/r/motorsportsstreams2/new.json?limit=25"
+	userAgent    = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+	requestDelay = 1 * time.Second
+)
+
+var (
+	// urlRe matches http:// and https:// URLs, stopping at whitespace or at
+	// one of the characters ) ] > ".
+	urlRe = regexp.MustCompile(`https?://[^\s\)\]\>"]+`)
+
+	// Keywords in post title that indicate F1 content (matched case-insensitively)
+	f1Keywords = []string{
+		"f1",
+		"formula 1",
+		"formula one",
+		"formula1",
+		"grand prix",
+		"gp qualifying",
+		"gp race",
+		"gp sprint",
+		"gp practice",
+	}
+
+	// Keywords that veto a title: isF1Post checks these first and rejects the
+	// post if any of them occurs, even when an f1Keywords entry also matches.
+	f1NegativeKeywords = []string{
+		"f1 key",
+		"function 1",
+		"help f1",
+	}
+
+	// URLs to filter out (not stream sources). extractURLs looks up the
+	// parsed URL's hostname in this set.
+	filteredDomains = map[string]bool{
+		"reddit.com":     true,
+		"www.reddit.com": true,
+		"imgur.com":      true,
+		"i.imgur.com":    true,
+		"redd.it":        true,
+		"i.redd.it":      true,
+		"v.redd.it":      true,
+		"youtu.be":       true,
+		"youtube.com":    true,
+		"twitter.com":    true,
+		"x.com":          true,
+	}
+)
+
+// redditPostData holds the per-post fields decoded from a subreddit listing.
+type redditPostData struct {
+	Title      string  `json:"title"`
+	SelfText   string  `json:"selftext"`
+	Permalink  string  `json:"permalink"`
+	CreatedUTC float64 `json:"created_utc"`
+}
+
+// redditListing is the subset of a subreddit listing response that the
+// scraper decodes: a "data" object whose "children" each wrap one post.
+type redditListing struct {
+	Data struct {
+		Children []struct {
+			Data redditPostData `json:"data"`
+		} `json:"children"`
+	} `json:"data"`
+}
+
+// redditCommentData holds the fields decoded for a single comment. Replies is
+// kept as raw JSON so that walkComments can decide whether it contains a
+// nested listing object before decoding it.
+type redditCommentData struct {
+	Body    string          `json:"body"`
+	Replies json.RawMessage `json:"replies"`
+}
+
+// redditComments is the subset of a comments response that the scraper
+// decodes: a JSON array of listings, each with "data.children" entries.
+type redditComments []struct {
+	Data struct {
+		Children []struct {
+			Data redditCommentData `json:"data"`
+		} `json:"children"`
+	} `json:"data"`
+}
+
+// scrapeReddit fetches the subreddit listing, keeps the posts whose title
+// passes isF1Post, and collects the URLs found in each such post's selftext
+// and comment tree. Links are de-duplicated by normalizeURL across the whole
+// run. A failure to fetch the listing is returned as an error; a failure to
+// fetch one post's comments is logged and that post's comments are skipped.
+func scrapeReddit() ([]models.ScrapedLink, error) {
+	httpClient := &http.Client{Timeout: 15 * time.Second}
+
+	var (
+		collected []models.ScrapedLink
+		known     = make(map[string]bool)
+	)
+	// collect appends each candidate whose normalised URL was not seen before.
+	collect := func(candidates []models.ScrapedLink) {
+		for _, candidate := range candidates {
+			key := normalizeURL(candidate.URL)
+			if known[key] {
+				continue
+			}
+			known[key] = true
+			collected = append(collected, candidate)
+		}
+	}
+
+	log.Printf("scraper: fetching listing from %s", subredditURL)
+	listing, err := fetchJSON[redditListing](httpClient, subredditURL)
+	if err != nil {
+		return nil, fmt.Errorf("fetch listing: %w", err)
+	}
+
+	posts := listing.Data.Children
+	matched := 0
+	log.Printf("scraper: got %d posts from listing", len(posts))
+
+	for i := range posts {
+		title := posts[i].Data.Title
+		selfText := posts[i].Data.SelfText
+		permalink := posts[i].Data.Permalink
+
+		if !isF1Post(title) {
+			log.Printf("scraper: skipped post: %s", truncate(title, 60))
+			continue
+		}
+		matched++
+		log.Printf("scraper: matched post: %s", truncate(title, 60))
+
+		fromSelftext := extractURLs(selfText, title)
+		log.Printf("scraper: extracted %d URLs from selftext of %q", len(fromSelftext), truncate(title, 40))
+		collect(fromSelftext)
+
+		// Pause before hitting Reddit again for this post's comment thread.
+		time.Sleep(requestDelay)
+		thread, err := fetchJSONWithRetry[redditComments](httpClient, fmt.Sprintf("https://www.reddit.com%s.json", permalink), 3)
+		if err != nil {
+			log.Printf("scraper: failed to fetch comments for %s: %v", permalink, err)
+			continue
+		}
+
+		fromComments := 0
+		walkComments(*thread, func(body string) {
+			found := extractURLs(body, title)
+			fromComments += len(found)
+			collect(found)
+		})
+		log.Printf("scraper: extracted %d URLs from comments of %q", fromComments, truncate(title, 40))
+
+		time.Sleep(requestDelay)
+	}
+
+	log.Printf("scraper: summary — matched %d/%d posts, extracted %d unique URLs", matched, len(posts), len(collected))
+	return collected, nil
+}
+
+func fetchJSON[T any](client *http.Client, rawURL string) (*T, error) {
+	req, err := http.NewRequest("GET", rawURL, nil)
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Set("User-Agent", userAgent)
+
+	resp, err := client.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	log.Printf("scraper: GET %s -> %d", truncate(rawURL, 80), resp.StatusCode)
+
+	if resp.StatusCode != 200 {
+		return nil, fmt.Errorf("status %d", resp.StatusCode)
+	}
+
+	body, err := io.ReadAll(io.LimitReader(resp.Body, 5*1024*1024))
+	if err != nil {
+		return nil, err
+	}
+
+	var result T
+	if err := json.Unmarshal(body, &result); err != nil {
+		return nil, err
+	}
+	return &result, nil
+}
+
+// fetchJSONWithRetry calls fetchJSON up to maxRetries+1 times.
+//
+// Retryable failures are recognised from the "status <code>" text produced by
+// fetchJSON: a 429 waits a fixed 30s, a 502 or 503 waits 2^attempt seconds.
+// Any other error is returned immediately. When the attempts are exhausted the
+// last error is wrapped and returned. No wait is performed after the final
+// attempt, since no further request would follow it. A negative maxRetries is
+// treated as 0, i.e. a single attempt.
+func fetchJSONWithRetry[T any](client *http.Client, rawURL string, maxRetries int) (*T, error) {
+	if maxRetries < 0 {
+		maxRetries = 0
+	}
+
+	var lastErr error
+	for attempt := 0; attempt <= maxRetries; attempt++ {
+		result, err := fetchJSON[T](client, rawURL)
+		if err == nil {
+			return result, nil
+		}
+		lastErr = err
+
+		var (
+			backoff     time.Duration
+			rateLimited bool
+		)
+		errMsg := err.Error()
+		switch {
+		case strings.Contains(errMsg, "status 429"):
+			backoff, rateLimited = 30*time.Second, true
+		case strings.Contains(errMsg, "status 502"), strings.Contains(errMsg, "status 503"):
+			backoff = time.Duration(math.Pow(2, float64(attempt))) * time.Second
+		default:
+			// Not a transient condition: retrying would not help.
+			return nil, err
+		}
+
+		// Out of retries: report the failure now instead of sleeping first.
+		if attempt == maxRetries {
+			break
+		}
+
+		if rateLimited {
+			log.Printf("scraper: rate limited on %s, backing off 30s", truncate(rawURL, 60))
+		} else {
+			log.Printf("scraper: server error on %s, retry %d/%d in %v", truncate(rawURL, 60), attempt+1, maxRetries, backoff)
+		}
+		time.Sleep(backoff)
+	}
+	return nil, fmt.Errorf("after %d retries: %w", maxRetries, lastErr)
+}
+
+// deobfuscateText normalises obfuscated URLs commonly posted on Reddit to
+// evade auto-moderation.  Examples:
+//   - "pitsport . xyz/watch/f1" → "https://pitsport.xyz/watch/f1"
+//   - "dlhd dot link"           → "https://dlhd.link"
+func deobfuscateText(text string) string {
+	// Common TLDs used in streaming links.
+	tlds := `(?:com|net|org|xyz|link|info|live|tv|me|cc|to|io|co|stream|site|fun|top|club|watch|racing)`
+
+	// 1. Replace " dot " (case-insensitive) between word-like parts that
+	//    look like domain components:  "dlhd dot link" → "dlhd.link"
+	dotWord := regexp.MustCompile(`(?i)(\b\w[\w-]*)\s+dot\s+(` + tlds + `\b)`)
+	text = dotWord.ReplaceAllString(text, "${1}.${2}")
+
+	// 2. Collapse spaces around dots in domain-like strings:
+	//    "pitsport . xyz" → "pitsport.xyz"
+	spaceDot := regexp.MustCompile(`(\b\w[\w-]*)\s*\.\s*(` + tlds + `\b)`)
+	text = spaceDot.ReplaceAllString(text, "${1}.${2}")
+
+	// 3. Prepend https:// to bare domain-like strings that the URL regex
+	//    would otherwise miss (no scheme present).
+	bareDomain := regexp.MustCompile(`(?:^|[\s(>\[])(\w[\w-]*\.` + tlds + `(?:/[^\s)\]<"]*)?)`)
+	text = bareDomain.ReplaceAllStringFunc(text, func(m string) string {
+		// Preserve the leading whitespace/punctuation character.
+		trimmed := strings.TrimLeft(m, " \t\n(>[")
+		prefix := m[:len(m)-len(trimmed)]
+		if strings.HasPrefix(trimmed, "http://") || strings.HasPrefix(trimmed, "https://") {
+			return m
+		}
+		return prefix + "https://" + trimmed
+	})
+
+	return text
+}
+
+// extractURLs de-obfuscates text, finds every http(s) URL in it and returns
+// one ScrapedLink per URL whose host is not filtered. Each link gets a random
+// 128-bit hex ID, postTitle as its title and the current time as ScrapedAt.
+// URLs that fail to parse, or for which no random ID could be generated, are
+// skipped. The result is nil when nothing is kept.
+func extractURLs(text, postTitle string) []models.ScrapedLink {
+	text = deobfuscateText(text)
+	matches := urlRe.FindAllString(text, -1)
+	var links []models.ScrapedLink
+	filtered := 0
+	for _, u := range matches {
+		// Drop sentence punctuation that the URL pattern swallowed.
+		u = strings.TrimRight(u, ".,;:!?)")
+
+		parsed, err := url.Parse(u)
+		if err != nil {
+			continue
+		}
+		if isFilteredHost(parsed.Hostname()) {
+			filtered++
+			continue
+		}
+
+		id := make([]byte, 16)
+		if _, err := rand.Read(id); err != nil {
+			continue
+		}
+
+		links = append(links, models.ScrapedLink{
+			ID:        fmt.Sprintf("%x", id),
+			URL:       u,
+			Title:     postTitle,
+			Source:    "r/motorsportsstreams2",
+			ScrapedAt: time.Now(),
+		})
+	}
+	if filtered > 0 {
+		log.Printf("scraper: filtered %d URLs from known domains in %q", filtered, truncate(postTitle, 40))
+	}
+	return links
+}
+
+// isFilteredHost reports whether host, or any parent domain of host, is listed
+// in filteredDomains. The comparison ignores case and a trailing dot, so
+// "WWW.YouTube.com" and "old.reddit.com" are filtered just like "youtube.com"
+// and "reddit.com".
+func isFilteredHost(host string) bool {
+	host = strings.ToLower(strings.TrimSuffix(host, "."))
+	for host != "" {
+		if filteredDomains[host] {
+			return true
+		}
+		_, parent, found := strings.Cut(host, ".")
+		if !found {
+			return false
+		}
+		host = parent
+	}
+	return false
+}
+
+// walkComments calls fn with the body of every comment in comments that has a
+// non-empty body, depth-first: a comment's own body is visited before its
+// replies. Replies are only descended into when the raw JSON is an object
+// (starts with '{'); a replies value that fails to decode is skipped.
+func walkComments(comments redditComments, fn func(string)) {
+	for i := range comments {
+		children := comments[i].Data.Children
+		for j := range children {
+			comment := &children[j].Data
+			if comment.Body != "" {
+				fn(comment.Body)
+			}
+
+			raw := comment.Replies
+			if len(raw) == 0 || raw[0] != '{' {
+				continue
+			}
+			// Wrap the single listing object in an array so it decodes as
+			// a redditComments value and can be walked recursively.
+			var nested redditComments
+			if err := json.Unmarshal([]byte("["+string(raw)+"]"), &nested); err != nil {
+				continue
+			}
+			walkComments(nested, fn)
+		}
+	}
+}
+
+// normalizeURL returns the de-duplication key for u: scheme, lower-cased
+// host and the path with trailing slashes removed. Query string and fragment
+// are not part of the key. If u cannot be parsed, the lower-cased input is
+// returned instead.
+func normalizeURL(u string) string {
+	parsed, err := url.Parse(u)
+	if err != nil {
+		return strings.ToLower(u)
+	}
+	host := strings.ToLower(parsed.Host)
+	trimmedPath := strings.TrimRight(parsed.Path, "/")
+	return fmt.Sprintf("%s://%s%s", parsed.Scheme, host, trimmedPath)
+}
+
+// isF1Post reports whether title looks like an F1 post. Matching is done on
+// the lower-cased title by substring: any f1NegativeKeywords entry rejects
+// the title outright; otherwise any f1Keywords entry accepts it.
+func isF1Post(title string) bool {
+	lowered := strings.ToLower(title)
+	containsAny := func(words []string) bool {
+		for i := range words {
+			if strings.Contains(lowered, words[i]) {
+				return true
+			}
+		}
+		return false
+	}
+
+	if containsAny(f1NegativeKeywords) {
+		return false
+	}
+	return containsAny(f1Keywords)
+}
+
+// truncate shortens s to at most maxLen bytes and appends "..." when anything
+// was cut off; strings of maxLen bytes or fewer are returned unchanged.
+//
+// The cut is moved back to the nearest rune boundary at or before maxLen, so
+// a multi-byte UTF-8 character is never split in half. A negative maxLen
+// yields just "..." for a non-empty s.
+func truncate(s string, maxLen int) string {
+	if len(s) <= maxLen {
+		return s
+	}
+	cut := 0
+	// Ranging over a string yields the byte offset of each rune start.
+	for i := range s {
+		if i > maxLen {
+			break
+		}
+		cut = i
+	}
+	return s[:cut] + "..."
+}
--- a/stacks/f1-stream/module/files/internal/scraper/scraper.go
+++ b/stacks/f1-stream/module/files/internal/scraper/scraper.go
@ -0,0 +1,105 @@
+package scraper
+
+import (
+	"context"
+	"log"
+	"sync"
+	"time"
+
+	"f1-stream/internal/models"
+	"f1-stream/internal/store"
+)
+
+// Scraper periodically scrapes Reddit for stream links and persists them
+// through its store. Use New to construct one and Run to start the loop.
+type Scraper struct {
+	store           *store.Store  // used by scrape to load, save and publish links
+	interval        time.Duration // ticker period between scheduled scrapes in Run
+	validateTimeout time.Duration // set by New; not read by the code visible here
+	mu              sync.Mutex    // held for the whole of scrape so runs never overlap
+}
+
+// New returns a Scraper that persists through s, scrapes every interval when
+// Run is called, and records validateTimeout.
+func New(s *store.Store, interval time.Duration, validateTimeout time.Duration) *Scraper {
+	sc := new(Scraper)
+	sc.store = s
+	sc.interval = interval
+	sc.validateTimeout = validateTimeout
+	return sc
+}
+
+// Run performs one scrape immediately and then one per interval until ctx is
+// cancelled. It blocks for its whole lifetime. Cancellation is only observed
+// between scrapes, not during one.
+func (s *Scraper) Run(ctx context.Context) {
+	log.Printf("scraper: starting with interval %v", s.interval)
+
+	// First pass happens right away rather than after a full interval.
+	s.scrape()
+
+	tick := time.NewTicker(s.interval)
+	defer tick.Stop()
+
+	done := ctx.Done()
+	for {
+		select {
+		case <-tick.C:
+			s.scrape()
+		case <-done:
+			log.Println("scraper: shutting down")
+			return
+		}
+	}
+}
+
+// TriggerScrape starts a scrape in a new goroutine and returns immediately.
+// The scrape waits on the Scraper's mutex if another one is in progress.
+func (s *Scraper) TriggerScrape() {
+	go func() {
+		s.scrape()
+	}()
+}
+
+// scrape runs one full cycle under s.mu: scrape Reddit, merge the result into
+// the stored links (dropping stored entries whose title no longer passes
+// isF1Post and de-duplicating by normalizeURL), save the merged list, and
+// publish every link from this scrape as a stream. Errors are logged, not
+// returned; a scrape or save failure ends the cycle early.
+func (s *Scraper) scrape() {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	began := time.Now()
+	elapsed := func() time.Duration {
+		return time.Since(began).Round(time.Millisecond)
+	}
+
+	log.Println("scraper: starting scrape")
+	links, err := scrapeReddit()
+	if err != nil {
+		log.Printf("scraper: error after %v: %v", elapsed(), err)
+		return
+	}
+	log.Printf("scraper: reddit scrape completed in %v, got %d links", elapsed(), len(links))
+
+	// Merge with existing links, filtering out non-F1 entries
+	stored, err := s.store.LoadScrapedLinks()
+	if err != nil {
+		// Carry on with an empty history rather than abandoning the fresh links.
+		log.Printf("scraper: failed to load existing links: %v", err)
+		stored = nil
+	}
+
+	known := make(map[string]bool)
+	var merged []models.ScrapedLink
+	for _, old := range stored {
+		if isF1Post(old.Title) {
+			known[normalizeURL(old.URL)] = true
+			merged = append(merged, old)
+		}
+	}
+
+	added := 0
+	for _, fresh := range links {
+		key := normalizeURL(fresh.URL)
+		if known[key] {
+			continue
+		}
+		merged = append(merged, fresh)
+		known[key] = true
+		added++
+	}
+
+	if err := s.store.SaveScrapedLinks(merged); err != nil {
+		log.Printf("scraper: failed to save: %v", err)
+		return
+	}
+
+	// Auto-publish the links from this scrape as streams.
+	// NOTE(review): nothing in this function calls validateLinks or reads
+	// s.validateTimeout, so links are published unvalidated — confirm intended.
+	for _, l := range links {
+		if err := s.store.PublishScrapedStream(l.URL, l.Title); err != nil {
+			log.Printf("scraper: failed to auto-publish %s: %v", truncate(l.URL, 80), err)
+		}
+	}
+
+	log.Printf("scraper: done in %v, added %d new links (total: %d)", elapsed(), added, len(merged))
+}
--- a/stacks/f1-stream/module/files/internal/scraper/validate.go
+++ b/stacks/f1-stream/module/files/internal/scraper/validate.go
@ -0,0 +1,142 @@
+package scraper
+
+import (
+	"io"
+	"log"
+	"net/http"
+	"strings"
+	"time"
+
+	"f1-stream/internal/models"
+)
+
+// videoMarkers are substrings that indicate the presence of a video player
+// or streaming manifest in an HTML body. containsVideoMarkers tests them with
+// plain substring containment against a body its caller has already
+// lowercased, which is why every entry here is lowercase.
+var videoMarkers = []string{
+	// HTML5 video element
+	"<video",
+	// HLS manifests
+	".m3u8",
+	"application/x-mpegurl",
+	"application/vnd.apple.mpegurl",
+	// DASH manifests
+	".mpd",
+	"application/dash+xml",
+	// Player libraries
+	"hls.js",
+	"hls.min.js",
+	"dash.js",
+	"dash.all.min.js",
+	"video.js",
+	"video.min.js",
+	"videojs",
+	"jwplayer",
+	"clappr",
+	"flowplayer",
+	"plyr",
+	"shaka-player",
+	"mediaelement",
+	"fluidplayer",
+}
+
+// videoContentTypes are Content-Type substrings that indicate a direct video
+// response (no HTML inspection needed). isDirectVideoContentType matches them
+// with substring containment, so trailing parameters such as "; charset=..."
+// do not prevent a match.
+var videoContentTypes = []string{
+	"video/",
+	"application/x-mpegurl",
+	"application/vnd.apple.mpegurl",
+	"application/dash+xml",
+}
+
+// validateBodyLimit caps how many bytes of an HTML response HasVideoContent
+// reads when looking for markers; anything past the limit is never inspected.
+const validateBodyLimit = 2 * 1024 * 1024 // 2 MiB
+
+// validateLinks fetches each link in turn with a client bounded by timeout
+// and returns only those for which HasVideoContent reports true, preserving
+// input order. Discarded links are logged. The client stops following
+// redirects once three requests have already been made and uses the last
+// response as-is.
+func validateLinks(links []models.ScrapedLink, timeout time.Duration) []models.ScrapedLink {
+	limitRedirects := func(req *http.Request, via []*http.Request) error {
+		if len(via) < 3 {
+			return nil
+		}
+		return http.ErrUseLastResponse
+	}
+	client := &http.Client{Timeout: timeout, CheckRedirect: limitRedirects}
+
+	var kept []models.ScrapedLink
+	for i := range links {
+		if !HasVideoContent(client, links[i].URL) {
+			log.Printf("scraper: discarded %s (no video markers)", truncate(links[i].URL, 60))
+			continue
+		}
+		kept = append(kept, links[i])
+	}
+	return kept
+}
+
+// HasVideoContent performs a GET request for rawURL and reports whether the
+// response is a direct video resource (judged by Content-Type) or an HTML or
+// XHTML page whose first validateBodyLimit bytes contain at least one video
+// marker. Request, transport and read errors are logged and reported as
+// false, as is any status outside the 200-399 range.
+func HasVideoContent(client *http.Client, rawURL string) bool {
+	request, err := http.NewRequest(http.MethodGet, rawURL, nil)
+	if err != nil {
+		log.Printf("scraper: validate request error for %s: %v", truncate(rawURL, 60), err)
+		return false
+	}
+	request.Header.Set("User-Agent", userAgent)
+
+	response, err := client.Do(request)
+	if err != nil {
+		log.Printf("scraper: validate fetch error for %s: %v", truncate(rawURL, 60), err)
+		return false
+	}
+	defer response.Body.Close()
+
+	if code := response.StatusCode; code < 200 || code >= 400 {
+		return false
+	}
+
+	contentType := strings.ToLower(response.Header.Get("Content-Type"))
+
+	// A video Content-Type is conclusive on its own; skip the body.
+	if isDirectVideoContentType(contentType) {
+		return true
+	}
+
+	// Marker inspection only makes sense for HTML-like documents.
+	isHTML := strings.Contains(contentType, "text/html") || strings.Contains(contentType, "application/xhtml")
+	if !isHTML {
+		return false
+	}
+
+	page, err := io.ReadAll(io.LimitReader(response.Body, validateBodyLimit))
+	if err != nil {
+		log.Printf("scraper: validate read error for %s: %v", truncate(rawURL, 60), err)
+		return false
+	}
+
+	// Markers are lowercase, so compare against a lowercased body.
+	lowered := strings.ToLower(string(page))
+	return containsVideoMarkers(lowered)
+}
+
+// containsVideoMarkers reports whether loweredBody contains at least one of
+// the videoMarkers substrings. The argument is compared as given, so the
+// caller is responsible for lowercasing it first.
+func containsVideoMarkers(loweredBody string) bool {
+	for i := range videoMarkers {
+		if strings.Contains(loweredBody, videoMarkers[i]) {
+			return true
+		}
+	}
+	return false
+}
+
+// isDirectVideoContentType reports whether ct contains one of the
+// videoContentTypes substrings. ct is lowercased here, so callers may pass
+// the header value in any case.
+func isDirectVideoContentType(ct string) bool {
+	lowered := strings.ToLower(ct)
+	for i := range videoContentTypes {
+		if strings.Contains(lowered, videoContentTypes[i]) {
+			return true
+		}
+	}
+	return false
+}
--- a/stacks/f1-stream/module/files/internal/scraper/validate_test.go
+++ b/stacks/f1-stream/module/files/internal/scraper/validate_test.go
@ -0,0 +1,124 @@
+package scraper
+
+import "testing"
+
+func TestContainsVideoMarkers(t *testing.T) {
+	tests := []struct {
+		name string
+		body string
+		want bool
+	}{
+		// Positive cases
+		{
+			name: "video tag",
+			body: `<div><video src="stream.mp4"></video></div>`,
+			want: true,
+		},
+		{
+			name: "HLS manifest reference",
+			body: `var url = "https://cdn.example.com/live.m3u8";`,
+			want: true,
+		},
+		{
+			name: "DASH manifest reference",
+			body: `<source src="stream.mpd" type="application/dash+xml">`,
+			want: true,
+		},
+		{
+			name: "HLS.js library",
+			body: `<script src="/js/hls.min.js"></script>`,
+			want: true,
+		},
+		{
+			name: "Video.js library",
+			body: `<script src="https://cdn.example.com/video.js"></script>`,
+			want: true,
+		},
+		{
+			name: "JW Player",
+			body: `<div id="jwplayer-container"></div><script>jwplayer("jwplayer-container")</script>`,
+			want: true,
+		},
+		{
+			name: "Clappr player",
+			body: `<script src="clappr.min.js"></script>`,
+			want: true,
+		},
+		{
+			name: "Flowplayer",
+			body: `<script>flowplayer("#player")</script>`,
+			want: true,
+		},
+		{
+			name: "Plyr player",
+			body: `<link rel="stylesheet" href="plyr.css"><script src="plyr.js"></script>`,
+			want: true,
+		},
+		{
+			name: "Shaka Player",
+			body: `<script src="shaka-player.compiled.js"></script>`,
+			want: true,
+		},
+		// Negative cases
+		{
+			name: "plain HTML",
+			body: `<html><body><p>Hello world</p></body></html>`,
+			want: false,
+		},
+		{
+			name: "reddit link page",
+			body: `<html><body><a href="https://example.com">Click here</a></body></html>`,
+			want: false,
+		},
+		{
+			name: "blog post",
+			body: `<html><body><article>F1 race results and analysis...</article></body></html>`,
+			want: false,
+		},
+		{
+			name: "empty string",
+			body: "",
+			want: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := containsVideoMarkers(tt.body)
+			if got != tt.want {
+				t.Errorf("containsVideoMarkers(%q) = %v, want %v", truncate(tt.body, 60), got, tt.want)
+			}
+		})
+	}
+}
+
+func TestIsDirectVideoContentType(t *testing.T) {
+	tests := []struct {
+		name string
+		ct   string
+		want bool
+	}{
+		// Positive cases
+		{name: "video/mp4", ct: "video/mp4", want: true},
+		{name: "video/webm", ct: "video/webm", want: true},
+		{name: "HLS content type", ct: "application/x-mpegurl", want: true},
+		{name: "Apple HLS content type", ct: "application/vnd.apple.mpegurl", want: true},
+		{name: "DASH content type", ct: "application/dash+xml", want: true},
+		{name: "video with params", ct: "video/mp4; charset=utf-8", want: true},
+		// Negative cases
+		{name: "text/html", ct: "text/html", want: false},
+		{name: "application/json", ct: "application/json", want: false},
+		{name: "image/png", ct: "image/png", want: false},
+		{name: "text/plain", ct: "text/plain", want: false},
+		{name: "empty string", ct: "", want: false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := isDirectVideoContentType(tt.ct)
+			if got != tt.want {
+				t.Errorf("isDirectVideoContentType(%q) = %v, want %v", tt.ct, got, tt.want)
+			}
+		})
+	}
+}