feat(meet-kevin): caption extractor via yt-dlp

- Implement CaptionResult frozen dataclass for structured caption data
- Add parse_srt() to parse SubRip format with flexible timestamp handling
- Add extract_captions() async function using yt-dlp subprocess wrapper
- Prefer manual captions over auto-generated; clean up SRT files after parsing
- Add 16 comprehensive tests covering edge cases (empty input, malformed SRT,
  timestamp variations, language extraction, manual vs auto selection)
- Type-safe implementation with full mypy --strict compliance
- Add sample.srt fixture with 3 segments mentioning NVDA for test reference
This commit is contained in:
Viktor Barzin 2026-05-21 19:40:52 +00:00
parent 8ce3ede09c
commit 145f7dbec5
3 changed files with 589 additions and 0 deletions

11
tests/fixtures/sample.srt vendored Normal file
View file

@@ -0,0 +1,11 @@
1
00:00:01,000 --> 00:00:04,500
Welcome back to Meet Kevin

2
00:00:04,500 --> 00:00:09,000
Today we are talking about NVDA and AMD earnings

3
00:00:09,000 --> 00:00:14,250
You will want to watch this until the end

View file

@@ -0,0 +1,329 @@
"""Tests for caption extraction via yt-dlp."""
from pathlib import Path
from unittest.mock import patch
import pytest
from services.meet_kevin_watcher.caption_extractor import (
CaptionResult,
parse_srt,
extract_captions,
)
@pytest.fixture
def sample_srt() -> str:
    """Return the text of the shared ``sample.srt`` fixture.

    The path is built relative to this test module (three directory levels
    up, then ``fixtures/sample.srt``), so it does not depend on the working
    directory pytest is started from.
    """
    fixture_path = Path(__file__).parents[2] / "fixtures" / "sample.srt"
    # Decode explicitly: the default for read_text() is locale-dependent.
    return fixture_path.read_text(encoding="utf-8")
class TestParseSrt:
    """Test parse_srt with valid, empty and malformed input.

    Inline SRT samples are written as explicit ``"\\n"``-joined literals so
    their content does not depend on source indentation, and consecutive
    cues are separated by the blank line the SubRip format requires.
    """

    def test_parse_srt_valid_fixture(self, sample_srt: str) -> None:
        """parse_srt extracts segments from valid SRT with correct timing."""
        segments = parse_srt(sample_srt)

        assert isinstance(segments, list)
        assert len(segments) == 3

        # (start, end, phrases that must appear in the segment text)
        expected = [
            (1.0, 4.5, ["Welcome back to Meet Kevin"]),
            (4.5, 9.0, ["NVDA", "AMD earnings"]),
            (9.0, 14.25, ["watch this until the end"]),
        ]
        for segment, (start, end, phrases) in zip(segments, expected):
            assert segment["start"] == pytest.approx(start)
            assert segment["end"] == pytest.approx(end)
            for phrase in phrases:
                assert phrase in segment["text"]

    def test_parse_srt_empty_input(self) -> None:
        """parse_srt returns empty list on empty input."""
        assert parse_srt("") == []

    def test_parse_srt_whitespace_only(self) -> None:
        """parse_srt returns empty list on whitespace-only input."""
        assert parse_srt(" \n\n \t ") == []

    def test_parse_srt_invalid_format(self) -> None:
        """parse_srt returns empty list on malformed SRT."""
        assert parse_srt("not valid srt content") == []

    def test_parse_srt_timestamp_with_period(self) -> None:
        """parse_srt handles timestamps with period instead of comma."""
        srt = (
            "1\n"
            "00:00:01.000 --> 00:00:04.500\n"
            "With period separator\n"
            "\n"
            "2\n"
            "00:00:05.000 --> 00:00:10.000\n"
            "Second segment"
        )

        segments = parse_srt(srt)

        assert len(segments) == 2
        assert segments[0]["start"] == pytest.approx(1.0)
        assert segments[0]["end"] == pytest.approx(4.5)
        assert segments[1]["start"] == pytest.approx(5.0)

    def test_parse_srt_multiline_text(self) -> None:
        """parse_srt handles multiline subtitle text."""
        srt = (
            "1\n"
            "00:00:01,000 --> 00:00:05,000\n"
            "First line\n"
            "Second line\n"
            "Third line"
        )

        segments = parse_srt(srt)

        assert len(segments) == 1
        assert "First line\nSecond line\nThird line" in segments[0]["text"]

    def test_parse_srt_various_durations(self) -> None:
        """parse_srt correctly converts various timestamp formats."""
        srt = (
            "1\n"
            "00:00:00,000 --> 00:00:00,500\n"
            "Short\n"
            "\n"
            "2\n"
            "01:23:45,678 --> 01:23:46,789\n"
            "Long"
        )

        segments = parse_srt(srt)

        assert len(segments) == 2
        # First: very short (0.5 seconds)
        assert segments[0]["start"] == pytest.approx(0.0)
        assert segments[0]["end"] == pytest.approx(0.5)
        # Second: 1 hour 23 min 45+ sec
        assert segments[1]["start"] == pytest.approx(1 * 3600 + 23 * 60 + 45.678)
        assert segments[1]["end"] == pytest.approx(1 * 3600 + 23 * 60 + 46.789)
class TestExtractCaptions:
    """Test extract_captions with mocked yt-dlp.

    Every async test patches ``_run_yt_dlp`` so no subprocess is started and,
    where a subtitle is expected, pre-creates the ``.srt`` file in pytest's
    ``tmp_path`` that the extractor is then pointed at. Inline SRT samples
    are explicit ``"\\n"``-joined literals with a blank line between cues.
    """

    # Dotted path of the subprocess wrapper replaced by a mock in each test.
    _RUN_YT_DLP = "services.meet_kevin_watcher.caption_extractor._run_yt_dlp"
    # Video id used to build the subtitle file names.
    _VIDEO_ID = "dQw4w9WgXcQ"

    @pytest.mark.asyncio
    async def test_extract_captions_success(self, tmp_path: Path) -> None:
        """extract_captions returns CaptionResult when yt-dlp succeeds."""
        # Pre-populate the SRT file that yt-dlp would create.
        srt_file = tmp_path / f"{self._VIDEO_ID}.en.srt"
        srt_file.write_text(
            "1\n"
            "00:00:01,000 --> 00:00:04,500\n"
            "Welcome\n"
            "\n"
            "2\n"
            "00:00:04,500 --> 00:00:09,000\n"
            "NVDA discussion",
            encoding="utf-8",
        )

        with patch(self._RUN_YT_DLP) as mock_run:
            mock_run.return_value = 0
            result = await extract_captions(self._VIDEO_ID, str(tmp_path))

        assert result is not None
        assert isinstance(result, CaptionResult)
        assert result.source == "youtube"
        assert result.language == "en"
        assert result.raw_text is not None
        assert len(result.segments) == 2
        assert result.word_count > 0

        # Verify yt-dlp was called with the expected arguments.
        mock_run.assert_called_once()
        call_args = mock_run.call_args[0][0]  # First positional arg is cmd list
        assert "yt-dlp" in call_args[0]
        for expected_arg in (
            "--write-auto-sub",
            "--write-sub",
            "--sub-lang",
            "en.*",
            "--skip-download",
            "--convert-subs",
            "srt",
        ):
            assert expected_arg in call_args

    @pytest.mark.asyncio
    async def test_extract_captions_no_srt_produced(self, tmp_path: Path) -> None:
        """extract_captions returns None when no SRT file is created."""
        with patch(self._RUN_YT_DLP) as mock_run:
            mock_run.return_value = 0
            # No SRT file is created in tmp_path.
            result = await extract_captions(self._VIDEO_ID, str(tmp_path))

        assert result is None

    @pytest.mark.asyncio
    async def test_extract_captions_yt_dlp_failure(self, tmp_path: Path) -> None:
        """extract_captions returns None when yt-dlp exits with non-zero."""
        # NOTE(review): no SRT file exists in tmp_path here, so this would
        # also pass if the exit code were ignored; consider pre-creating a
        # subtitle file once the intended behaviour is confirmed.
        with patch(self._RUN_YT_DLP) as mock_run:
            mock_run.return_value = 1  # Non-zero exit
            result = await extract_captions(self._VIDEO_ID, str(tmp_path))

        assert result is None

    @pytest.mark.asyncio
    async def test_extract_captions_prefers_manual_subs(self, tmp_path: Path) -> None:
        """extract_captions prefers manual subs over auto-generated."""
        # Create both manual and auto subs.
        manual_srt = tmp_path / f"{self._VIDEO_ID}.en.srt"
        manual_srt.write_text(
            "1\n"
            "00:00:01,000 --> 00:00:05,000\n"
            "Manual subtitle",
            encoding="utf-8",
        )
        auto_srt = tmp_path / f"{self._VIDEO_ID}.en.auto.srt"
        auto_srt.write_text(
            "1\n"
            "00:00:01,000 --> 00:00:05,000\n"
            "Auto subtitle",
            encoding="utf-8",
        )

        with patch(self._RUN_YT_DLP) as mock_run:
            mock_run.return_value = 0
            result = await extract_captions(self._VIDEO_ID, str(tmp_path))

        # Should use manual (without .auto.)
        assert result is not None
        assert "Manual subtitle" in result.raw_text

    @pytest.mark.asyncio
    async def test_extract_captions_fallback_to_auto(self, tmp_path: Path) -> None:
        """extract_captions falls back to auto-subs if no manual subs."""
        # Create only auto subs.
        auto_srt = tmp_path / f"{self._VIDEO_ID}.en.auto.srt"
        auto_srt.write_text(
            "1\n"
            "00:00:01,000 --> 00:00:05,000\n"
            "Auto subtitle",
            encoding="utf-8",
        )

        with patch(self._RUN_YT_DLP) as mock_run:
            mock_run.return_value = 0
            result = await extract_captions(self._VIDEO_ID, str(tmp_path))

        # Should use auto-subs when manual not available
        assert result is not None
        assert "Auto subtitle" in result.raw_text

    @pytest.mark.asyncio
    async def test_extract_captions_cleans_up_srt(self, tmp_path: Path) -> None:
        """extract_captions deletes SRT files after parsing."""
        srt_file = tmp_path / f"{self._VIDEO_ID}.en.srt"
        srt_file.write_text(
            "1\n"
            "00:00:01,000 --> 00:00:05,000\n"
            "Test",
            encoding="utf-8",
        )

        with patch(self._RUN_YT_DLP) as mock_run:
            mock_run.return_value = 0
            result = await extract_captions(self._VIDEO_ID, str(tmp_path))

        assert result is not None
        # SRT file should be deleted after parsing
        assert not srt_file.exists()

    @pytest.mark.asyncio
    async def test_extract_captions_custom_lang_glob(self, tmp_path: Path) -> None:
        """extract_captions accepts custom language glob pattern."""
        # Create a German subtitle file.
        de_srt = tmp_path / f"{self._VIDEO_ID}.de.srt"
        de_srt.write_text(
            "1\n"
            "00:00:01,000 --> 00:00:05,000\n"
            "Deutscher Untertitel",
            encoding="utf-8",
        )

        with patch(self._RUN_YT_DLP) as mock_run:
            mock_run.return_value = 0
            result = await extract_captions(
                self._VIDEO_ID, str(tmp_path), sub_lang_glob="de.*"
            )

        # Should find the German subtitle
        assert result is not None
        assert result.language == "de"
        assert "Deutscher Untertitel" in result.raw_text

    @pytest.mark.asyncio
    async def test_extract_captions_word_count(self, tmp_path: Path) -> None:
        """extract_captions calculates word count correctly."""
        # Create SRT with known word count.
        srt_file = tmp_path / f"{self._VIDEO_ID}.en.srt"
        srt_file.write_text(
            "1\n"
            "00:00:01,000 --> 00:00:05,000\n"
            "One two three\n"
            "\n"
            "2\n"
            "00:00:05,000 --> 00:00:10,000\n"
            "Four five",
            encoding="utf-8",
        )

        with patch(self._RUN_YT_DLP) as mock_run:
            mock_run.return_value = 0
            result = await extract_captions(self._VIDEO_ID, str(tmp_path))

        assert result is not None
        # 5 words total: "One two three Four five"
        assert result.word_count == 5

    def test_caption_result_is_frozen(self) -> None:
        """CaptionResult is frozen (immutable)."""
        result = CaptionResult(
            source="youtube",
            language="en",
            raw_text="Test subtitle",
            segments=({"start": 0.0, "end": 1.0, "text": "Test"},),
            word_count=1,
        )

        # Assigning to a field must raise (frozen=True).
        with pytest.raises(AttributeError):
            result.source = "other"  # type: ignore