// infra/modules/kubernetes/f1-stream/files/internal/scraper/reddit.go

package scraper

import (
	"crypto/rand"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"math"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"time"

	"f1-stream/internal/models"
)

// Fixed settings used by the Reddit scraper.
const (
	// subredditURL is the JSON listing that is fetched for new posts; its
	// query string asks for 25 entries.
	subredditURL = "https://www.reddit.com/r/motorsportsstreams2/new.json?limit=25"

	// userAgent is the value sent in the User-Agent request header.
	userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

	// requestDelay is the pause taken around each per-post comment fetch.
	requestDelay = time.Second
)

var (
	// urlRe finds http(s) URLs in free text. A match runs until whitespace
	// or one of the characters ) ] > " is reached.
	urlRe = regexp.MustCompile(`https?://[^\s\)\]\>"]+`)

	// f1Keywords lists the substrings that mark a post title as F1 content.
	// Titles are lower-cased before they are compared, so every entry is
	// written in lower case.
	f1Keywords = []string{
		"f1",
		"formula 1",
		"formula one",
		"formula1",
		"grand prix",
		"gp qualifying",
		"gp race",
		"gp sprint",
		"gp practice",
	}

	// f1NegativeKeywords lists substrings that rule a title out even when it
	// also contains one of f1Keywords.
	f1NegativeKeywords = []string{
		"f1 key",
		"function 1",
		"help f1",
	}

	// filteredDomains holds hostnames whose links are dropped because they
	// are not stream sources.
	filteredDomains = map[string]bool{
		"reddit.com":     true,
		"www.reddit.com": true,
		"redd.it":        true,
		"i.redd.it":      true,
		"v.redd.it":      true,
		"imgur.com":      true,
		"i.imgur.com":    true,
		"youtube.com":    true,
		"youtu.be":       true,
		"twitter.com":    true,
		"x.com":          true,
	}
)

// redditListing mirrors the parts of a subreddit listing response that are
// decoded: for every child post its title, self text, permalink and the
// numeric "created_utc" value.
type redditListing struct {
	Data struct {
		Children []struct {
			Data struct {
				Title      string  `json:"title"`
				SelfText   string  `json:"selftext"`
				Permalink  string  `json:"permalink"`
				CreatedUTC float64 `json:"created_utc"`
			} `json:"data"`
		} `json:"children"`
	} `json:"data"`
}

// redditComments mirrors the array returned for a post's comment page. Each
// element carries a list of children; for every child the comment body is
// decoded, while the "replies" value is kept as raw JSON so it can be
// inspected and decoded later.
type redditComments []struct {
	Data struct {
		Children []struct {
			Data struct {
				Body    string          `json:"body"`
				Replies json.RawMessage `json:"replies"`
			} `json:"data"`
		} `json:"children"`
	} `json:"data"`
}

// scrapeReddit collects candidate stream links from the subreddit listing.
//
// It fetches the listing, keeps the posts whose title passes isF1Post, and
// gathers URLs from each post's self text and from every comment on the
// post. Links are de-duplicated across the whole run by their normalizeURL
// key. A post whose comments cannot be fetched is logged and contributes
// only its self-text links. An error is returned only when the listing
// itself cannot be fetched.
func scrapeReddit() ([]models.ScrapedLink, error) {
	httpClient := &http.Client{Timeout: 15 * time.Second}

	var unique []models.ScrapedLink
	known := make(map[string]bool)
	// keepNew appends every link whose normalized URL has not been recorded yet.
	keepNew := func(candidates []models.ScrapedLink) {
		for _, candidate := range candidates {
			key := normalizeURL(candidate.URL)
			if known[key] {
				continue
			}
			known[key] = true
			unique = append(unique, candidate)
		}
	}

	log.Printf("scraper: fetching listing from %s", subredditURL)
	listing, err := fetchJSON[redditListing](httpClient, subredditURL)
	if err != nil {
		return nil, fmt.Errorf("fetch listing: %w", err)
	}

	posts := listing.Data.Children
	matched := 0
	log.Printf("scraper: got %d posts from listing", len(posts))

	for i := range posts {
		title, permalink := posts[i].Data.Title, posts[i].Data.Permalink

		if !isF1Post(title) {
			log.Printf("scraper: skipped post: %s", truncate(title, 60))
			continue
		}
		matched++
		log.Printf("scraper: matched post: %s", truncate(title, 60))

		fromSelfText := extractURLs(posts[i].Data.SelfText, title)
		log.Printf("scraper: extracted %d URLs from selftext of %q", len(fromSelfText), truncate(title, 40))
		keepNew(fromSelfText)

		// Pause before asking for the post's comment page.
		time.Sleep(requestDelay)
		threadURL := fmt.Sprintf("https://www.reddit.com%s.json", permalink)
		thread, err := fetchJSONWithRetry[redditComments](httpClient, threadURL, 3)
		if err != nil {
			log.Printf("scraper: failed to fetch comments for %s: %v", permalink, err)
			continue
		}

		fromComments := 0
		walkComments(*thread, func(body string) {
			found := extractURLs(body, title)
			fromComments += len(found)
			keepNew(found)
		})
		log.Printf("scraper: extracted %d URLs from comments of %q", fromComments, truncate(title, 40))

		// Pause again before moving on to the next post.
		time.Sleep(requestDelay)
	}

	log.Printf("scraper: summary — matched %d/%d posts, extracted %d unique URLs", matched, len(posts), len(unique))
	return unique, nil
}

// fetchJSON issues a GET for rawURL with the scraper's User-Agent header and
// decodes the JSON response body into a new T.
//
// Any status other than 200 is reported as an error of the form
// "status <code>". At most 5 MiB of the body is read before decoding.
func fetchJSON[T any](client *http.Client, rawURL string) (*T, error) {
	// Upper bound on the number of body bytes handed to the decoder.
	const maxBodyBytes = 5 * 1024 * 1024

	request, err := http.NewRequest(http.MethodGet, rawURL, nil)
	if err != nil {
		return nil, err
	}
	request.Header.Set("User-Agent", userAgent)

	response, err := client.Do(request)
	if err != nil {
		return nil, err
	}
	defer response.Body.Close()

	status := response.StatusCode
	log.Printf("scraper: GET %s -> %d", truncate(rawURL, 80), status)
	if status != http.StatusOK {
		return nil, fmt.Errorf("status %d", status)
	}

	payload, err := io.ReadAll(io.LimitReader(response.Body, maxBodyBytes))
	if err != nil {
		return nil, err
	}

	decoded := new(T)
	if err := json.Unmarshal(payload, decoded); err != nil {
		return nil, err
	}
	return decoded, nil
}

// fetchJSONWithRetry calls fetchJSON and retries on transient HTTP failures.
//
// The request is tried once and then retried up to maxRetries more times (a
// negative maxRetries is treated as 0). Failures are classified by the
// "status <code>" text of the error returned by fetchJSON:
//   - status 429: wait 30 seconds, then retry;
//   - status 502 or 503: wait 2^attempt seconds, then retry;
//   - anything else: returned immediately without retrying.
//
// No wait is performed after the final attempt; the last error is returned
// wrapped as "after <maxRetries> retries: ...".
func fetchJSONWithRetry[T any](client *http.Client, rawURL string, maxRetries int) (*T, error) {
	if maxRetries < 0 {
		// Always make at least one attempt so lastErr is never nil below.
		maxRetries = 0
	}

	var lastErr error
	for attempt := 0; attempt <= maxRetries; attempt++ {
		result, err := fetchJSON[T](client, rawURL)
		if err == nil {
			return result, nil
		}
		lastErr = err

		errMsg := err.Error()
		rateLimited := strings.Contains(errMsg, "status 429")
		serverError := strings.Contains(errMsg, "status 502") || strings.Contains(errMsg, "status 503")
		if !rateLimited && !serverError {
			return nil, err
		}

		// Out of retries: give up right away instead of sleeping (and
		// announcing a retry) that will never be followed by a request.
		if attempt == maxRetries {
			break
		}

		if rateLimited {
			log.Printf("scraper: rate limited on %s, backing off 30s", truncate(rawURL, 60))
			time.Sleep(30 * time.Second)
			continue
		}

		backoff := time.Duration(math.Pow(2, float64(attempt))) * time.Second
		log.Printf("scraper: server error on %s, retry %d/%d in %v", truncate(rawURL, 60), attempt+1, maxRetries, backoff)
		time.Sleep(backoff)
	}
	return nil, fmt.Errorf("after %d retries: %w", maxRetries, lastErr)
}

// deobfuscationTLDs is the alternation of top-level domains recognised when
// looking for obfuscated or scheme-less links.
const deobfuscationTLDs = `(?:com|net|org|xyz|link|info|live|tv|me|cc|to|io|co|stream|site|fun|top|club|watch|racing)`

// Patterns used by deobfuscateText, compiled once at package initialisation
// rather than on every call.
var (
	// dotWordRe matches "<word> dot <tld>" (case-insensitive).
	dotWordRe = regexp.MustCompile(`(?i)(\b\w[\w-]*)\s+dot\s+(` + deobfuscationTLDs + `\b)`)

	// spaceDotRe matches "<word> . <tld>" with optional spaces around the dot.
	spaceDotRe = regexp.MustCompile(`(\b\w[\w-]*)\s*\.\s*(` + deobfuscationTLDs + `\b)`)

	// bareDomainRe matches a scheme-less "<word>.<tld>[/path]" at the start
	// of the text or right after whitespace, "(", ">" or "[". Group 1 is
	// that leading character (possibly empty), group 2 the domain and path.
	bareDomainRe = regexp.MustCompile(`(^|[\s(>\[])(\w[\w-]*\.` + deobfuscationTLDs + `(?:/[^\s)\]<"]*)?)`)
)

// deobfuscateText normalises obfuscated URLs commonly posted on Reddit to
// evade auto-moderation.  Examples:
//   - "pitsport . xyz/watch/f1" → "https://pitsport.xyz/watch/f1"
//   - "dlhd dot link"           → "https://dlhd.link"
//
// The rewritten text is returned; text that contains none of the recognised
// patterns is returned unchanged.
func deobfuscateText(text string) string {
	// 1. Replace " dot " (case-insensitive) between word-like parts that
	//    look like domain components:  "dlhd dot link" → "dlhd.link"
	text = dotWordRe.ReplaceAllString(text, "${1}.${2}")

	// 2. Collapse spaces around dots in domain-like strings:
	//    "pitsport . xyz" → "pitsport.xyz"
	text = spaceDotRe.ReplaceAllString(text, "${1}.${2}")

	// 3. Prepend https:// to bare domain-like strings that the URL regex
	//    would otherwise miss (no scheme present). The leading character is
	//    re-emitted from its own capture group, so every character the
	//    pattern accepts there (including \r, \f and \v) stays in front of
	//    the inserted scheme.
	return bareDomainRe.ReplaceAllString(text, "${1}https://${2}")
}

// extractURLs returns one ScrapedLink per usable URL found in text.
//
// The text is first passed through deobfuscateText, then scanned with urlRe.
// Trailing punctuation is trimmed from every match. A match is dropped when
// it does not parse, has no host (for example "https://..." trims down to a
// bare scheme), or belongs to a filtered domain. Each surviving URL gets a
// random 16-byte hex ID, postTitle as its title and the current time as its
// scrape time. The number of domain-filtered URLs is logged when non-zero.
func extractURLs(text, postTitle string) []models.ScrapedLink {
	text = deobfuscateText(text)

	var links []models.ScrapedLink
	filtered := 0
	for _, u := range urlRe.FindAllString(text, -1) {
		u = strings.TrimRight(u, ".,;:!?)")

		parsed, err := url.Parse(u)
		if err != nil {
			continue
		}
		host := parsed.Hostname()
		if host == "" {
			// Nothing left but a scheme after trimming; not a usable link.
			continue
		}
		if isFilteredHost(host) {
			filtered++
			continue
		}

		id := make([]byte, 16)
		if _, err := rand.Read(id); err != nil {
			// Without an ID the link cannot be recorded; skip it.
			continue
		}

		links = append(links, models.ScrapedLink{
			ID:        fmt.Sprintf("%x", id),
			URL:       u,
			Title:     postTitle,
			Source:    "r/motorsportsstreams2",
			ScrapedAt: time.Now(),
		})
	}
	if filtered > 0 {
		log.Printf("scraper: filtered %d URLs from known domains in %q", filtered, truncate(postTitle, 40))
	}
	return links
}

// isFilteredHost reports whether host, or any parent domain of host, is
// listed in filteredDomains. The comparison ignores case and a trailing dot,
// so "WWW.YouTube.com" and "old.reddit.com" are filtered like their listed
// base domains.
func isFilteredHost(host string) bool {
	host = strings.ToLower(strings.TrimSuffix(host, "."))
	for {
		if filteredDomains[host] {
			return true
		}
		// Drop the left-most label and try the parent domain.
		_, parent, ok := strings.Cut(host, ".")
		if !ok {
			return false
		}
		host = parent
	}
}

// walkComments calls fn with the body of every comment in comments that has
// a non-empty body, descending into nested replies.
//
// A child's raw "replies" value is followed only when it starts with '{'
// (a JSON object). Replies that fail to decode are skipped silently.
func walkComments(comments redditComments, fn func(string)) {
	for i := range comments {
		children := comments[i].Data.Children
		for j := range children {
			node := &children[j].Data
			if body := node.Body; body != "" {
				fn(body)
			}

			raw := node.Replies
			if len(raw) == 0 || raw[0] != '{' {
				continue
			}

			// Wrap the single reply object in brackets so it decodes as a
			// one-element redditComments and can be walked recursively.
			wrapped := make([]byte, 0, len(raw)+2)
			wrapped = append(wrapped, '[')
			wrapped = append(wrapped, raw...)
			wrapped = append(wrapped, ']')

			var nested redditComments
			if json.Unmarshal(wrapped, &nested) != nil {
				continue
			}
			walkComments(nested, fn)
		}
	}
}

// normalizeURL reduces u to the key used for de-duplication:
// "<scheme>://<lower-cased host><path without trailing slashes>".
// Query string and fragment are not part of the key. If u cannot be parsed,
// the lower-cased input is returned instead.
func normalizeURL(u string) string {
	loc, parseErr := url.Parse(u)
	if parseErr != nil {
		return strings.ToLower(u)
	}

	host := strings.ToLower(loc.Host)
	trimmedPath := strings.TrimRight(loc.Path, "/")
	return fmt.Sprintf("%s://%s%s", loc.Scheme, host, trimmedPath)
}

// isF1Post reports whether a post title looks like F1 content. The title is
// lower-cased first; it is rejected if it contains any entry of
// f1NegativeKeywords and accepted if it then contains any entry of
// f1Keywords.
func isF1Post(title string) bool {
	lowered := strings.ToLower(title)

	// containsAny reports whether the lower-cased title includes any needle.
	containsAny := func(needles []string) bool {
		for i := range needles {
			if strings.Contains(lowered, needles[i]) {
				return true
			}
		}
		return false
	}

	return !containsAny(f1NegativeKeywords) && containsAny(f1Keywords)
}

// truncate shortens s to at most maxLen bytes and appends "..." when
// anything was cut off. Strings that already fit are returned unchanged.
//
// The cut is moved back to the start of a character when maxLen falls inside
// a multi-byte UTF-8 sequence, so the result never ends in a partial
// character. A negative maxLen is treated as 0.
func truncate(s string, maxLen int) string {
	if maxLen < 0 {
		maxLen = 0
	}
	if len(s) <= maxLen {
		return s
	}

	// Ranging over a string yields the byte offset at which each character
	// starts; keep the largest such offset that does not exceed maxLen.
	cut := 0
	for i := range s {
		if i > maxLen {
			break
		}
		cut = i
	}
	return s[:cut] + "..."
}