infra/modules/kubernetes/ebook2audiobook/audiblez-web/backend/services/converter.py
Viktor Barzin bcad200a23 chore: add untracked stacks, scripts, and agent configs
- New stacks: beads-server, hermes-agent
- Terragrunt tiers.tf for infra, phpipam, status-page
- Secrets symlinks for vault, phpipam, hermes-agent
- Scripts: cluster_manager, image_pull, containerd pullthrough setup
- Frigate config, audiblez-web app source, n8n workflows dir
- Claude agent: service-upgrade, reference: upgrade-config.json
- Removed: claudeception skill, excalidraw empty submodule, temp listings

[ci skip]

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 09:33:06 +00:00

291 lines
12 KiB
Python

import asyncio
import uuid
import os
from datetime import datetime
from pathlib import Path
from typing import Callable, Optional
import subprocess
import json
from models.schemas import Job, JobStatus, JobProgress, ChapterInfo
from services.epub_parser import extract_chapters, Chapter
from services.chapter_embedder import (
get_chapter_audio_durations,
generate_ffmpeg_metadata,
embed_chapters_in_m4b
)
class JobManager:
"""Manages conversion jobs and their state with user isolation."""
def __init__(self, storage_path: str = "/mnt"):
self.storage_path = Path(storage_path)
self.jobs: dict[str, Job] = {}
self.progress_callbacks: dict[str, list[Callable]] = {}
def get_user_uploads_dir(self, user_id: str) -> Path:
"""Get the uploads directory for a specific user."""
user_dir = self.storage_path / "users" / user_id / "uploads"
user_dir.mkdir(parents=True, exist_ok=True)
return user_dir
def get_user_outputs_dir(self, user_id: str) -> Path:
"""Get the outputs directory for a specific user."""
user_dir = self.storage_path / "users" / user_id / "outputs"
user_dir.mkdir(parents=True, exist_ok=True)
return user_dir
def create_job(self, user_id: str, filename: str, voice: str, speed: float, use_gpu: bool) -> Job:
"""Create a new conversion job for a user."""
job_id = str(uuid.uuid4())
now = datetime.now()
job = Job(
id=job_id,
user_id=user_id,
filename=filename,
voice=voice,
speed=speed,
use_gpu=use_gpu,
status=JobStatus.PENDING,
created_at=now,
updated_at=now,
)
self.jobs[job_id] = job
return job
def get_job(self, job_id: str, user_id: Optional[str] = None) -> Optional[Job]:
"""Get a job by ID. If user_id is provided, verify ownership."""
job = self.jobs.get(job_id)
if job and user_id and job.user_id != user_id:
return None # User doesn't own this job
return job
def get_user_jobs(self, user_id: str) -> list[Job]:
"""Get all jobs for a specific user."""
return [job for job in self.jobs.values() if job.user_id == user_id]
def get_all_jobs(self) -> list[Job]:
"""Get all jobs (admin use only)."""
return list(self.jobs.values())
def update_job_status(self, job_id: str, status: JobStatus, error: Optional[str] = None):
"""Update job status."""
if job_id in self.jobs:
self.jobs[job_id].status = status
self.jobs[job_id].updated_at = datetime.now()
if error:
self.jobs[job_id].error = error
def update_job_progress(self, job_id: str, progress: float, current_chapter: Optional[str] = None, eta: Optional[str] = None):
"""Update job progress."""
if job_id in self.jobs:
self.jobs[job_id].progress = progress
self.jobs[job_id].updated_at = datetime.now()
# Notify callbacks
if job_id in self.progress_callbacks:
progress_data = JobProgress(
progress=progress,
eta=eta,
current_chapter=current_chapter,
status=self.jobs[job_id].status
)
for callback in self.progress_callbacks[job_id]:
try:
asyncio.create_task(callback(progress_data))
except Exception as e:
print(f"Error in progress callback: {e}")
def register_progress_callback(self, job_id: str, callback: Callable):
"""Register a callback for progress updates."""
if job_id not in self.progress_callbacks:
self.progress_callbacks[job_id] = []
self.progress_callbacks[job_id].append(callback)
def unregister_progress_callback(self, job_id: str, callback: Callable):
"""Unregister a progress callback."""
if job_id in self.progress_callbacks:
self.progress_callbacks[job_id].remove(callback)
if not self.progress_callbacks[job_id]:
del self.progress_callbacks[job_id]
async def run_conversion(self, job_id: str):
"""Run the audiblez conversion in the background."""
job = self.jobs.get(job_id)
if not job:
return
try:
self.update_job_status(job_id, JobStatus.PROCESSING)
# Prepare user-specific paths
input_path = self.get_user_uploads_dir(job.user_id) / job.filename
output_dir = self.get_user_outputs_dir(job.user_id) / job_id
output_dir.mkdir(parents=True, exist_ok=True)
# Extract chapters from EPUB before conversion
chapters: list[Chapter] = []
if input_path.suffix.lower() == '.epub':
chapters = extract_chapters(input_path)
self.jobs[job_id].total_chapters = len(chapters)
print(f"Extracted {len(chapters)} chapters from EPUB")
# Build audiblez command - use the venv python/audiblez
cmd = [
"/app/audiblez/bin/audiblez",
str(input_path),
"-o", str(output_dir),
"-v", job.voice,
"-s", str(job.speed),
]
if job.use_gpu:
cmd.append("-c") # --cuda flag for GPU
# Run conversion
process = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
cwd=str(self.storage_path)
)
# Monitor progress from stderr/stdout
async def read_stream(stream, is_stderr=False):
while True:
line = await stream.readline()
if not line:
break
line_str = line.decode().strip()
print(f"{'[STDERR]' if is_stderr else '[STDOUT]'} {line_str}")
# Parse progress from output
# Audiblez outputs progress in various formats
# We'll do a simple pattern match
if "%" in line_str:
try:
# Try to extract percentage
parts = line_str.split("%")
if len(parts) > 1:
# Find the number before %
num_str = parts[0].split()[-1]
progress = float(num_str)
self.update_job_progress(job_id, progress)
except:
pass
# Check for chapter info
if "Chapter" in line_str or "chapter" in line_str:
self.update_job_progress(
job_id,
job.progress,
current_chapter=line_str
)
# Read both streams concurrently
await asyncio.gather(
read_stream(process.stdout),
read_stream(process.stderr, is_stderr=True)
)
# Wait for completion
returncode = await process.wait()
if returncode == 0:
# Find output file
output_files = list(output_dir.glob("*.m4b"))
if not output_files:
output_files = list(output_dir.glob("*.mp3"))
if output_files:
output_file = output_files[0]
# Embed chapter metadata if we have chapters and WAV files
if chapters:
try:
durations = get_chapter_audio_durations(output_dir)
print(f"Found {len(durations)} chapter audio durations")
if durations:
# Match chapter count with duration count
num_chapters = min(len(chapters), len(durations))
if num_chapters != len(chapters):
print(f"Warning: chapter count ({len(chapters)}) != duration count ({len(durations)})")
metadata = generate_ffmpeg_metadata(chapters[:num_chapters], durations[:num_chapters])
embed_chapters_in_m4b(output_file, metadata)
# Store chapter info in job for API access
self.jobs[job_id].chapters = [
ChapterInfo(
title=c.title,
start_ms=c.start_ms,
end_ms=c.end_ms
)
for c in chapters[:num_chapters]
]
print(f"Embedded {num_chapters} chapters in M4B")
else:
print("No WAV files found for chapter duration calculation")
except Exception as e:
print(f"Failed to embed chapters: {e}")
# Continue without chapter embedding - non-fatal error
self.jobs[job_id].output_file = str(output_file.name)
self.update_job_status(job_id, JobStatus.COMPLETED)
self.update_job_progress(job_id, 100.0)
else:
self.update_job_status(job_id, JobStatus.FAILED, "No output file generated")
else:
self.update_job_status(job_id, JobStatus.FAILED, f"Conversion failed with code {returncode}")
except Exception as e:
print(f"Conversion error: {e}")
self.update_job_status(job_id, JobStatus.FAILED, str(e))
def delete_job(self, job_id: str, user_id: str) -> bool:
"""Delete a job and its output files."""
job = self.get_job(job_id, user_id)
if not job:
return False
# Delete output directory if exists
output_dir = self.get_user_outputs_dir(user_id) / job_id
if output_dir.exists():
import shutil
shutil.rmtree(output_dir)
# Remove from jobs dict
del self.jobs[job_id]
return True
def get_user_audiobooks(self, user_id: str) -> list[dict]:
"""List all completed audiobooks for a user."""
outputs_dir = self.get_user_outputs_dir(user_id)
audiobooks = []
if outputs_dir.exists():
for job_dir in outputs_dir.iterdir():
if job_dir.is_dir():
# Look for m4b or mp3 files
audio_files = list(job_dir.glob("*.m4b")) + list(job_dir.glob("*.mp3"))
for audio_file in audio_files:
stat = audio_file.stat()
audiobooks.append({
"id": job_dir.name,
"filename": audio_file.name,
"size": stat.st_size,
"created_at": stat.st_mtime,
})
# Sort by creation time, newest first
audiobooks.sort(key=lambda x: x["created_at"], reverse=True)
return audiobooks
# Global job manager instance
job_manager = JobManager()