Replace CPU-intensive headless Chrome + WebRTC pipeline with a lightweight Go reverse proxy that strips anti-framing headers (X-Frame-Options, CSP) and embeds streaming sites in iframes. - New internal/proxy package with URL rewriting for HTML/CSS - JS shim injection to intercept fetch/XHR/WebSocket/createElement - Referer reconstruction for correct cross-origin auth (HLS streams) - Inline iframe viewer preserving site navigation (not fullscreen overlay)
327 lines
8.5 KiB
Go
327 lines
8.5 KiB
Go
package scraper
|
|
|
|
import (
|
|
"crypto/rand"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"math"
|
|
"net/http"
|
|
"net/url"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
|
|
"f1-stream/internal/models"
|
|
)
|
|
|
|
const (
|
|
subredditURL = "https://www.reddit.com/r/motorsportsstreams2/new.json?limit=25"
|
|
userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
|
requestDelay = 1 * time.Second
|
|
)
|
|
|
|
var (
|
|
urlRe = regexp.MustCompile(`https?://[^\s\)\]\>"]+`)
|
|
|
|
// Keywords in post title that indicate F1 content (matched case-insensitively)
|
|
f1Keywords = []string{
|
|
"f1",
|
|
"formula 1",
|
|
"formula one",
|
|
"formula1",
|
|
"grand prix",
|
|
"gp qualifying",
|
|
"gp race",
|
|
"gp sprint",
|
|
"gp practice",
|
|
}
|
|
|
|
f1NegativeKeywords = []string{
|
|
"f1 key",
|
|
"function 1",
|
|
"help f1",
|
|
}
|
|
|
|
// URLs to filter out (not stream sources)
|
|
filteredDomains = map[string]bool{
|
|
"reddit.com": true,
|
|
"www.reddit.com": true,
|
|
"imgur.com": true,
|
|
"i.imgur.com": true,
|
|
"redd.it": true,
|
|
"i.redd.it": true,
|
|
"v.redd.it": true,
|
|
"youtu.be": true,
|
|
"youtube.com": true,
|
|
"twitter.com": true,
|
|
"x.com": true,
|
|
}
|
|
)
|
|
|
|
type redditListing struct {
|
|
Data struct {
|
|
Children []struct {
|
|
Data struct {
|
|
Title string `json:"title"`
|
|
SelfText string `json:"selftext"`
|
|
Permalink string `json:"permalink"`
|
|
CreatedUTC float64 `json:"created_utc"`
|
|
} `json:"data"`
|
|
} `json:"children"`
|
|
} `json:"data"`
|
|
}
|
|
|
|
type redditComments []struct {
|
|
Data struct {
|
|
Children []struct {
|
|
Data struct {
|
|
Body string `json:"body"`
|
|
Replies json.RawMessage `json:"replies"`
|
|
} `json:"data"`
|
|
} `json:"children"`
|
|
} `json:"data"`
|
|
}
|
|
|
|
func scrapeReddit() ([]models.ScrapedLink, error) {
|
|
client := &http.Client{Timeout: 15 * time.Second}
|
|
var allLinks []models.ScrapedLink
|
|
seen := make(map[string]bool)
|
|
|
|
log.Printf("scraper: fetching listing from %s", subredditURL)
|
|
listing, err := fetchJSON[redditListing](client, subredditURL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("fetch listing: %w", err)
|
|
}
|
|
|
|
totalPosts := len(listing.Data.Children)
|
|
matchedPosts := 0
|
|
log.Printf("scraper: got %d posts from listing", totalPosts)
|
|
|
|
for _, child := range listing.Data.Children {
|
|
post := child.Data
|
|
|
|
if !isF1Post(post.Title) {
|
|
log.Printf("scraper: skipped post: %s", truncate(post.Title, 60))
|
|
continue
|
|
}
|
|
|
|
matchedPosts++
|
|
log.Printf("scraper: matched post: %s", truncate(post.Title, 60))
|
|
|
|
selftextLinks := extractURLs(post.SelfText, post.Title)
|
|
log.Printf("scraper: extracted %d URLs from selftext of %q", len(selftextLinks), truncate(post.Title, 40))
|
|
for _, link := range selftextLinks {
|
|
norm := normalizeURL(link.URL)
|
|
if !seen[norm] {
|
|
seen[norm] = true
|
|
allLinks = append(allLinks, link)
|
|
}
|
|
}
|
|
|
|
time.Sleep(requestDelay)
|
|
commentsURL := fmt.Sprintf("https://www.reddit.com%s.json", post.Permalink)
|
|
comments, err := fetchJSONWithRetry[redditComments](client, commentsURL, 3)
|
|
if err != nil {
|
|
log.Printf("scraper: failed to fetch comments for %s: %v", post.Permalink, err)
|
|
continue
|
|
}
|
|
|
|
commentURLCount := 0
|
|
walkComments(*comments, func(body string) {
|
|
links := extractURLs(body, post.Title)
|
|
commentURLCount += len(links)
|
|
for _, link := range links {
|
|
norm := normalizeURL(link.URL)
|
|
if !seen[norm] {
|
|
seen[norm] = true
|
|
allLinks = append(allLinks, link)
|
|
}
|
|
}
|
|
})
|
|
log.Printf("scraper: extracted %d URLs from comments of %q", commentURLCount, truncate(post.Title, 40))
|
|
|
|
time.Sleep(requestDelay)
|
|
}
|
|
|
|
log.Printf("scraper: summary — matched %d/%d posts, extracted %d unique URLs", matchedPosts, totalPosts, len(allLinks))
|
|
return allLinks, nil
|
|
}
|
|
|
|
func fetchJSON[T any](client *http.Client, rawURL string) (*T, error) {
|
|
req, err := http.NewRequest("GET", rawURL, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
req.Header.Set("User-Agent", userAgent)
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
log.Printf("scraper: GET %s -> %d", truncate(rawURL, 80), resp.StatusCode)
|
|
|
|
if resp.StatusCode != 200 {
|
|
return nil, fmt.Errorf("status %d", resp.StatusCode)
|
|
}
|
|
|
|
body, err := io.ReadAll(io.LimitReader(resp.Body, 5*1024*1024))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var result T
|
|
if err := json.Unmarshal(body, &result); err != nil {
|
|
return nil, err
|
|
}
|
|
return &result, nil
|
|
}
|
|
|
|
func fetchJSONWithRetry[T any](client *http.Client, rawURL string, maxRetries int) (*T, error) {
|
|
var lastErr error
|
|
for attempt := 0; attempt <= maxRetries; attempt++ {
|
|
result, err := fetchJSON[T](client, rawURL)
|
|
if err == nil {
|
|
return result, nil
|
|
}
|
|
lastErr = err
|
|
|
|
errMsg := err.Error()
|
|
if strings.Contains(errMsg, "status 429") {
|
|
log.Printf("scraper: rate limited on %s, backing off 30s", truncate(rawURL, 60))
|
|
time.Sleep(30 * time.Second)
|
|
continue
|
|
}
|
|
if strings.Contains(errMsg, "status 502") || strings.Contains(errMsg, "status 503") {
|
|
backoff := time.Duration(math.Pow(2, float64(attempt))) * time.Second
|
|
log.Printf("scraper: server error on %s, retry %d/%d in %v", truncate(rawURL, 60), attempt+1, maxRetries, backoff)
|
|
time.Sleep(backoff)
|
|
continue
|
|
}
|
|
|
|
return nil, err
|
|
}
|
|
return nil, fmt.Errorf("after %d retries: %w", maxRetries, lastErr)
|
|
}
|
|
|
|
// deobfuscateText normalises obfuscated URLs commonly posted on Reddit to
|
|
// evade auto-moderation. Examples:
|
|
// - "pitsport . xyz/watch/f1" → "https://pitsport.xyz/watch/f1"
|
|
// - "dlhd dot link" → "https://dlhd.link"
|
|
func deobfuscateText(text string) string {
|
|
// Common TLDs used in streaming links.
|
|
tlds := `(?:com|net|org|xyz|link|info|live|tv|me|cc|to|io|co|stream|site|fun|top|club|watch|racing)`
|
|
|
|
// 1. Replace " dot " (case-insensitive) between word-like parts that
|
|
// look like domain components: "dlhd dot link" → "dlhd.link"
|
|
dotWord := regexp.MustCompile(`(?i)(\b\w[\w-]*)\s+dot\s+(` + tlds + `\b)`)
|
|
text = dotWord.ReplaceAllString(text, "${1}.${2}")
|
|
|
|
// 2. Collapse spaces around dots in domain-like strings:
|
|
// "pitsport . xyz" → "pitsport.xyz"
|
|
spaceDot := regexp.MustCompile(`(\b\w[\w-]*)\s*\.\s*(` + tlds + `\b)`)
|
|
text = spaceDot.ReplaceAllString(text, "${1}.${2}")
|
|
|
|
// 3. Prepend https:// to bare domain-like strings that the URL regex
|
|
// would otherwise miss (no scheme present).
|
|
bareDomain := regexp.MustCompile(`(?:^|[\s(>\[])(\w[\w-]*\.` + tlds + `(?:/[^\s)\]<"]*)?)`)
|
|
text = bareDomain.ReplaceAllStringFunc(text, func(m string) string {
|
|
// Preserve the leading whitespace/punctuation character.
|
|
trimmed := strings.TrimLeft(m, " \t\n(>[")
|
|
prefix := m[:len(m)-len(trimmed)]
|
|
if strings.HasPrefix(trimmed, "http://") || strings.HasPrefix(trimmed, "https://") {
|
|
return m
|
|
}
|
|
return prefix + "https://" + trimmed
|
|
})
|
|
|
|
return text
|
|
}
|
|
|
|
func extractURLs(text, postTitle string) []models.ScrapedLink {
|
|
text = deobfuscateText(text)
|
|
matches := urlRe.FindAllString(text, -1)
|
|
var links []models.ScrapedLink
|
|
filtered := 0
|
|
for _, u := range matches {
|
|
u = strings.TrimRight(u, ".,;:!?)")
|
|
|
|
parsed, err := url.Parse(u)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if filteredDomains[parsed.Hostname()] {
|
|
filtered++
|
|
continue
|
|
}
|
|
|
|
id := make([]byte, 16)
|
|
if _, err := rand.Read(id); err != nil {
|
|
continue
|
|
}
|
|
|
|
links = append(links, models.ScrapedLink{
|
|
ID: fmt.Sprintf("%x", id),
|
|
URL: u,
|
|
Title: postTitle,
|
|
Source: "r/motorsportsstreams2",
|
|
ScrapedAt: time.Now(),
|
|
})
|
|
}
|
|
if filtered > 0 {
|
|
log.Printf("scraper: filtered %d URLs from known domains in %q", filtered, truncate(postTitle, 40))
|
|
}
|
|
return links
|
|
}
|
|
|
|
func walkComments(comments redditComments, fn func(string)) {
|
|
for _, listing := range comments {
|
|
for _, child := range listing.Data.Children {
|
|
if child.Data.Body != "" {
|
|
fn(child.Data.Body)
|
|
}
|
|
// Recurse into replies
|
|
if len(child.Data.Replies) > 0 && child.Data.Replies[0] == '{' {
|
|
var nested redditComments
|
|
if err := json.Unmarshal([]byte("["+string(child.Data.Replies)+"]"), &nested); err == nil {
|
|
walkComments(nested, fn)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func normalizeURL(u string) string {
|
|
parsed, err := url.Parse(u)
|
|
if err != nil {
|
|
return strings.ToLower(u)
|
|
}
|
|
parsed.Host = strings.ToLower(parsed.Host)
|
|
path := strings.TrimRight(parsed.Path, "/")
|
|
return fmt.Sprintf("%s://%s%s", parsed.Scheme, parsed.Host, path)
|
|
}
|
|
|
|
func isF1Post(title string) bool {
|
|
lower := strings.ToLower(title)
|
|
for _, neg := range f1NegativeKeywords {
|
|
if strings.Contains(lower, neg) {
|
|
return false
|
|
}
|
|
}
|
|
for _, kw := range f1Keywords {
|
|
if strings.Contains(lower, kw) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func truncate(s string, maxLen int) string {
|
|
if len(s) <= maxLen {
|
|
return s
|
|
}
|
|
return s[:maxLen] + "..."
|
|
}
|