infra/modules/kubernetes/ebook2audiobook/audiblez-web/backend/services/epub_parser.py
Viktor Barzin bcad200a23 chore: add untracked stacks, scripts, and agent configs
- New stacks: beads-server, hermes-agent
- Terragrunt tiers.tf for infra, phpipam, status-page
- Secrets symlinks for vault, phpipam, hermes-agent
- Scripts: cluster_manager, image_pull, containerd pullthrough setup
- Frigate config, audiblez-web app source, n8n workflows dir
- Claude agent: service-upgrade, reference: upgrade-config.json
- Removed: claudeception skill, excalidraw empty submodule, temp listings

[ci skip]

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 09:33:06 +00:00

166 lines
4.9 KiB
Python

"""EPUB chapter extraction service.
This parser attempts to match audiblez's chapter detection logic to ensure
the extracted chapters align with the WAV files audiblez produces.
audiblez iterates through EPUB ITEM_DOCUMENTs and uses is_chapter() to determine
if a document is a chapter based on content length (100+ chars) and filename patterns.
"""
import re
from dataclasses import dataclass
from pathlib import Path
from bs4 import BeautifulSoup
from ebooklib import epub, ITEM_DOCUMENT
@dataclass
class Chapter:
    """One chapter detected in an EPUB.

    Only ``title`` and ``index`` are required; the three timing fields
    default to zero.
    """

    # Display title of the chapter.
    title: str
    # Zero-based position of the chapter among the detected chapters.
    index: int
    # Timing values in milliseconds (going by the ``_ms`` suffix). Nothing in
    # this module fills them in -- presumably set later by the caller once
    # audio lengths are known; verify against the call site.
    duration_ms: int = 0
    start_ms: int = 0
    end_ms: int = 0
def sanitize_title(title: str) -> str:
    """Make a chapter title safe to embed in an FFmpeg FFMETADATA file.

    Characters that are special in that format are replaced or dropped
    (they are not backslash-escaped): an equals sign or semicolon becomes a
    hyphen, a hash or backslash is removed, a newline becomes a space and a
    carriage return is removed. Surrounding whitespace is then stripped.

    Args:
        title: Raw chapter title; may be empty or ``None``.

    Returns:
        The cleaned title, or ``"Untitled"`` when the input is empty or when
        nothing is left after cleaning (for example a title of only ``#``
        characters or whitespace).
    """
    if not title:
        return "Untitled"

    # Single pass instead of a chain of .replace() calls; None deletes the char.
    replacements = str.maketrans({
        '=': '-',
        ';': '-',
        '#': None,
        '\\': None,
        '\n': ' ',
        '\r': None,
    })
    cleaned = title.translate(replacements).strip()

    # Cleaning can consume the whole title; fall back rather than hand the
    # caller an empty chapter name.
    return cleaned or "Untitled"
def is_chapter(text: str, filename: str) -> bool:
    """Decide whether an EPUB document should count as a chapter.

    Intended to mirror audiblez's own ``is_chapter()`` check. The rules are:

    - text shorter than 100 characters is never a chapter;
    - otherwise the document is a chapter if its lower-cased filename
      contains one of the known chapter-like patterns;
    - failing that, it is still a chapter if the text is longer than
      1000 characters.

    Args:
        text: Extracted text of the document.
        filename: Name of the document inside the EPUB.

    Returns:
        ``True`` if the document is treated as a chapter.
    """
    if len(text) < 100:
        return False

    name = filename.lower()
    filename_hints = (
        r'chapter',
        r'part[_-]?\d+',
        r'split[_-]?\d+',
        r'ch[_-]?\d+',
        r'chap[_-]?\d+',
        r'sect',  # section
        r'content',
        r'text',
    )
    if any(re.search(hint, name) for hint in filename_hints):
        return True

    # No filename hint: only substantial documents qualify.
    return len(text) > 1000
def extract_title_from_content(soup: BeautifulSoup, filename: str, index: int) -> str:
    """Pick a human-readable title for a chapter document.

    The first non-empty ``<title>``, ``<h1>``, ``<h2>`` or ``<h3>`` element
    (checked in that order) wins; titles over 100 characters are cut to 97
    characters plus ``"..."``. If none is found, the title is derived from
    the filename, and as a last resort a generic ``"Chapter N"`` is used.

    Args:
        soup: Parsed document to search for heading elements.
        filename: Name of the document inside the EPUB, used as a fallback.
        index: Zero-based chapter index, used for the generic fallback.

    Returns:
        The chosen title (not yet sanitized for FFmpeg metadata).
    """
    for tag in ('title', 'h1', 'h2', 'h3'):
        element = soup.find(tag)
        heading = element.get_text(strip=True) if element else ""
        if heading:
            # Truncate long titles
            if len(heading) > 100:
                heading = heading[:97] + "..."
            return heading

    # Fallback: use filename without extension
    stem = Path(filename).stem
    # Expand a leading chapter/chap/ch prefix only when it really is a prefix,
    # i.e. followed by a separator, a digit, or the end of the stem. Without
    # that guard any name starting with "ch" was mangled
    # (e.g. "christmas" -> "Chapter ristmas").
    stem = re.sub(
        r'^(?:chapter|chap|ch)(?:[_-]+|(?=\d)|$)',
        'Chapter ',
        stem,
        flags=re.IGNORECASE,
    )
    # Separators become spaces; strip so a leading/trailing separator does not
    # leave stray whitespace in the title.
    stem = re.sub(r'[_-]', ' ', stem).strip()
    if stem and len(stem) < 50:
        return stem.title()
    return f"Chapter {index + 1}"
def extract_chapters(epub_path: Path) -> list[Chapter]:
    """Build the chapter list for an EPUB using audiblez-style detection.

    Every ITEM_DOCUMENT in the book is parsed, its text is gathered from
    title, paragraph, heading and list-item elements, and ``is_chapter()``
    decides whether it counts. The aim is for the number of chapters found
    here to line up with the WAV files audiblez produces.

    Reading and per-document failures are best-effort: they are reported
    with ``print`` and do not raise.

    Args:
        epub_path: Path to the EPUB file

    Returns:
        List of Chapter objects with title and index; empty if the EPUB
        could not be read.
    """
    try:
        book = epub.read_epub(str(epub_path))
    except Exception as e:
        print(f"Failed to read EPUB: {e}")
        return []

    # Elements whose text counts towards a document's content.
    text_tags = ['title', 'p', 'h1', 'h2', 'h3', 'h4', 'li']
    chapters: list[Chapter] = []

    for item in book.get_items():
        if item.get_type() != ITEM_DOCUMENT:
            continue

        try:
            soup = BeautifulSoup(item.get_content(), features='lxml')
            pieces = [
                piece
                for tag in soup.find_all(text_tags)
                if (piece := tag.get_text(strip=True))
            ]
            full_text = ' '.join(pieces)
            filename = item.get_name() or ""

            if not is_chapter(full_text, filename):
                continue

            # The next chapter's index is simply how many we have so far.
            position = len(chapters)
            raw_title = extract_title_from_content(soup, filename, position)
            chapters.append(Chapter(title=sanitize_title(raw_title), index=position))
        except Exception as e:
            # One bad document should not abort the whole book.
            print(f"Error processing document {item.get_name()}: {e}")
            continue

    print(f"Extracted {len(chapters)} chapters from EPUB (audiblez-style detection)")

    # Debug: show the first few chapters and summarize the rest.
    for number, ch in enumerate(chapters[:5], start=1):
        print(f" {number}. {ch.title}")
    remaining = len(chapters) - 5
    if remaining > 0:
        print(f" ... and {remaining} more")

    return chapters