infra/modules/kubernetes/f1-stream/files/internal/scraper/reddit.go
Viktor Barzin 450dfc28e4
[ci skip] Add reverse proxy mode to f1-stream
Replace CPU-intensive headless Chrome + WebRTC pipeline with a
lightweight Go reverse proxy that strips anti-framing headers
(X-Frame-Options, CSP) and embeds streaming sites in iframes.

- New internal/proxy package with URL rewriting for HTML/CSS
- JS shim injection to intercept fetch/XHR/WebSocket/createElement
- Referer reconstruction for correct cross-origin auth (HLS streams)
- Inline iframe viewer preserving site navigation (not fullscreen overlay)
2026-02-21 21:23:21 +00:00

327 lines
8.5 KiB
Go

package scraper
import (
"crypto/rand"
"encoding/json"
"fmt"
"io"
"log"
"math"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"f1-stream/internal/models"
)
const (
subredditURL = "https://www.reddit.com/r/motorsportsstreams2/new.json?limit=25"
userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
requestDelay = 1 * time.Second
)
var (
urlRe = regexp.MustCompile(`https?://[^\s\)\]\>"]+`)
// Keywords in post title that indicate F1 content (matched case-insensitively)
f1Keywords = []string{
"f1",
"formula 1",
"formula one",
"formula1",
"grand prix",
"gp qualifying",
"gp race",
"gp sprint",
"gp practice",
}
f1NegativeKeywords = []string{
"f1 key",
"function 1",
"help f1",
}
// URLs to filter out (not stream sources)
filteredDomains = map[string]bool{
"reddit.com": true,
"www.reddit.com": true,
"imgur.com": true,
"i.imgur.com": true,
"redd.it": true,
"i.redd.it": true,
"v.redd.it": true,
"youtu.be": true,
"youtube.com": true,
"twitter.com": true,
"x.com": true,
}
)
type redditListing struct {
Data struct {
Children []struct {
Data struct {
Title string `json:"title"`
SelfText string `json:"selftext"`
Permalink string `json:"permalink"`
CreatedUTC float64 `json:"created_utc"`
} `json:"data"`
} `json:"children"`
} `json:"data"`
}
type redditComments []struct {
Data struct {
Children []struct {
Data struct {
Body string `json:"body"`
Replies json.RawMessage `json:"replies"`
} `json:"data"`
} `json:"children"`
} `json:"data"`
}
func scrapeReddit() ([]models.ScrapedLink, error) {
client := &http.Client{Timeout: 15 * time.Second}
var allLinks []models.ScrapedLink
seen := make(map[string]bool)
log.Printf("scraper: fetching listing from %s", subredditURL)
listing, err := fetchJSON[redditListing](client, subredditURL)
if err != nil {
return nil, fmt.Errorf("fetch listing: %w", err)
}
totalPosts := len(listing.Data.Children)
matchedPosts := 0
log.Printf("scraper: got %d posts from listing", totalPosts)
for _, child := range listing.Data.Children {
post := child.Data
if !isF1Post(post.Title) {
log.Printf("scraper: skipped post: %s", truncate(post.Title, 60))
continue
}
matchedPosts++
log.Printf("scraper: matched post: %s", truncate(post.Title, 60))
selftextLinks := extractURLs(post.SelfText, post.Title)
log.Printf("scraper: extracted %d URLs from selftext of %q", len(selftextLinks), truncate(post.Title, 40))
for _, link := range selftextLinks {
norm := normalizeURL(link.URL)
if !seen[norm] {
seen[norm] = true
allLinks = append(allLinks, link)
}
}
time.Sleep(requestDelay)
commentsURL := fmt.Sprintf("https://www.reddit.com%s.json", post.Permalink)
comments, err := fetchJSONWithRetry[redditComments](client, commentsURL, 3)
if err != nil {
log.Printf("scraper: failed to fetch comments for %s: %v", post.Permalink, err)
continue
}
commentURLCount := 0
walkComments(*comments, func(body string) {
links := extractURLs(body, post.Title)
commentURLCount += len(links)
for _, link := range links {
norm := normalizeURL(link.URL)
if !seen[norm] {
seen[norm] = true
allLinks = append(allLinks, link)
}
}
})
log.Printf("scraper: extracted %d URLs from comments of %q", commentURLCount, truncate(post.Title, 40))
time.Sleep(requestDelay)
}
log.Printf("scraper: summary — matched %d/%d posts, extracted %d unique URLs", matchedPosts, totalPosts, len(allLinks))
return allLinks, nil
}
func fetchJSON[T any](client *http.Client, rawURL string) (*T, error) {
req, err := http.NewRequest("GET", rawURL, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", userAgent)
resp, err := client.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()
log.Printf("scraper: GET %s -> %d", truncate(rawURL, 80), resp.StatusCode)
if resp.StatusCode != 200 {
return nil, fmt.Errorf("status %d", resp.StatusCode)
}
body, err := io.ReadAll(io.LimitReader(resp.Body, 5*1024*1024))
if err != nil {
return nil, err
}
var result T
if err := json.Unmarshal(body, &result); err != nil {
return nil, err
}
return &result, nil
}
func fetchJSONWithRetry[T any](client *http.Client, rawURL string, maxRetries int) (*T, error) {
var lastErr error
for attempt := 0; attempt <= maxRetries; attempt++ {
result, err := fetchJSON[T](client, rawURL)
if err == nil {
return result, nil
}
lastErr = err
errMsg := err.Error()
if strings.Contains(errMsg, "status 429") {
log.Printf("scraper: rate limited on %s, backing off 30s", truncate(rawURL, 60))
time.Sleep(30 * time.Second)
continue
}
if strings.Contains(errMsg, "status 502") || strings.Contains(errMsg, "status 503") {
backoff := time.Duration(math.Pow(2, float64(attempt))) * time.Second
log.Printf("scraper: server error on %s, retry %d/%d in %v", truncate(rawURL, 60), attempt+1, maxRetries, backoff)
time.Sleep(backoff)
continue
}
return nil, err
}
return nil, fmt.Errorf("after %d retries: %w", maxRetries, lastErr)
}
// deobfuscateText normalises obfuscated URLs commonly posted on Reddit to
// evade auto-moderation. Examples:
// - "pitsport . xyz/watch/f1" → "https://pitsport.xyz/watch/f1"
// - "dlhd dot link" → "https://dlhd.link"
func deobfuscateText(text string) string {
// Common TLDs used in streaming links.
tlds := `(?:com|net|org|xyz|link|info|live|tv|me|cc|to|io|co|stream|site|fun|top|club|watch|racing)`
// 1. Replace " dot " (case-insensitive) between word-like parts that
// look like domain components: "dlhd dot link" → "dlhd.link"
dotWord := regexp.MustCompile(`(?i)(\b\w[\w-]*)\s+dot\s+(` + tlds + `\b)`)
text = dotWord.ReplaceAllString(text, "${1}.${2}")
// 2. Collapse spaces around dots in domain-like strings:
// "pitsport . xyz" → "pitsport.xyz"
spaceDot := regexp.MustCompile(`(\b\w[\w-]*)\s*\.\s*(` + tlds + `\b)`)
text = spaceDot.ReplaceAllString(text, "${1}.${2}")
// 3. Prepend https:// to bare domain-like strings that the URL regex
// would otherwise miss (no scheme present).
bareDomain := regexp.MustCompile(`(?:^|[\s(>\[])(\w[\w-]*\.` + tlds + `(?:/[^\s)\]<"]*)?)`)
text = bareDomain.ReplaceAllStringFunc(text, func(m string) string {
// Preserve the leading whitespace/punctuation character.
trimmed := strings.TrimLeft(m, " \t\n(>[")
prefix := m[:len(m)-len(trimmed)]
if strings.HasPrefix(trimmed, "http://") || strings.HasPrefix(trimmed, "https://") {
return m
}
return prefix + "https://" + trimmed
})
return text
}
func extractURLs(text, postTitle string) []models.ScrapedLink {
text = deobfuscateText(text)
matches := urlRe.FindAllString(text, -1)
var links []models.ScrapedLink
filtered := 0
for _, u := range matches {
u = strings.TrimRight(u, ".,;:!?)")
parsed, err := url.Parse(u)
if err != nil {
continue
}
if filteredDomains[parsed.Hostname()] {
filtered++
continue
}
id := make([]byte, 16)
if _, err := rand.Read(id); err != nil {
continue
}
links = append(links, models.ScrapedLink{
ID: fmt.Sprintf("%x", id),
URL: u,
Title: postTitle,
Source: "r/motorsportsstreams2",
ScrapedAt: time.Now(),
})
}
if filtered > 0 {
log.Printf("scraper: filtered %d URLs from known domains in %q", filtered, truncate(postTitle, 40))
}
return links
}
func walkComments(comments redditComments, fn func(string)) {
for _, listing := range comments {
for _, child := range listing.Data.Children {
if child.Data.Body != "" {
fn(child.Data.Body)
}
// Recurse into replies
if len(child.Data.Replies) > 0 && child.Data.Replies[0] == '{' {
var nested redditComments
if err := json.Unmarshal([]byte("["+string(child.Data.Replies)+"]"), &nested); err == nil {
walkComments(nested, fn)
}
}
}
}
}
func normalizeURL(u string) string {
parsed, err := url.Parse(u)
if err != nil {
return strings.ToLower(u)
}
parsed.Host = strings.ToLower(parsed.Host)
path := strings.TrimRight(parsed.Path, "/")
return fmt.Sprintf("%s://%s%s", parsed.Scheme, parsed.Host, path)
}
func isF1Post(title string) bool {
lower := strings.ToLower(title)
for _, neg := range f1NegativeKeywords {
if strings.Contains(lower, neg) {
return false
}
}
for _, kw := range f1Keywords {
if strings.Contains(lower, kw) {
return true
}
}
return false
}
func truncate(s string, maxLen int) string {
if len(s) <= maxLen {
return s
}
return s[:maxLen] + "..."
}