Add crawl job progress drawer with phase tracking and live logs
- Add phase-aware progress reporting across all crawl phases (splitting, fetching, filtering, processing) with per-step counters
- Add TaskProgressDrawer component with phase timeline stepper, detail counters, progress bar with ETA, and live worker log viewer
- Add on_step_complete callback to ListingProcessor for granular tracking of details/images/OCR steps
- Extend QuerySplitter on_progress callback with structured counter data
- Capture Celery worker logs via a ring-buffer handler and inject them into task state updates for frontend display
- Guard taskResult updates with a phase presence check to prevent the drawer from blanking during state transitions
This commit is contained in:
parent
4018503723
commit
b4837e1603
6 changed files with 617 additions and 24 deletions
|
|
@ -7,6 +7,7 @@ import { useEffect, useState } from 'react';
|
||||||
import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from './ui/tooltip';
|
import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from './ui/tooltip';
|
||||||
import { Button } from './ui/button';
|
import { Button } from './ui/button';
|
||||||
import { Loader2, CheckCircle2, XCircle, X, Trash2 } from 'lucide-react';
|
import { Loader2, CheckCircle2, XCircle, X, Trash2 } from 'lucide-react';
|
||||||
|
import { TaskProgressDrawer } from './TaskProgressDrawer';
|
||||||
|
|
||||||
interface TaskIndicatorProps {
|
interface TaskIndicatorProps {
|
||||||
taskID: string | null;
|
taskID: string | null;
|
||||||
|
|
@ -19,8 +20,10 @@ export function TaskIndicator({ taskID, onTaskCancelled }: TaskIndicatorProps) {
|
||||||
const [processed, setProcessed] = useState<number | null>(null);
|
const [processed, setProcessed] = useState<number | null>(null);
|
||||||
const [total, setTotal] = useState<number | null>(null);
|
const [total, setTotal] = useState<number | null>(null);
|
||||||
const [taskStatus, setTaskStatus] = useState<TaskStatus | null>(null);
|
const [taskStatus, setTaskStatus] = useState<TaskStatus | null>(null);
|
||||||
|
const [taskResult, setTaskResult] = useState<TaskResult | null>(null);
|
||||||
const [isCancelling, setIsCancelling] = useState(false);
|
const [isCancelling, setIsCancelling] = useState(false);
|
||||||
const [isClearing, setIsClearing] = useState(false);
|
const [isClearing, setIsClearing] = useState(false);
|
||||||
|
const [drawerOpen, setDrawerOpen] = useState(false);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
getUser().then(setUser);
|
getUser().then(setUser);
|
||||||
|
|
@ -29,6 +32,7 @@ export function TaskIndicator({ taskID, onTaskCancelled }: TaskIndicatorProps) {
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
if (!user || !taskID) {
|
if (!user || !taskID) {
|
||||||
setTaskStatus(null);
|
setTaskStatus(null);
|
||||||
|
setTaskResult(null);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -37,6 +41,7 @@ export function TaskIndicator({ taskID, onTaskCancelled }: TaskIndicatorProps) {
|
||||||
setProgressPercentage(0);
|
setProgressPercentage(0);
|
||||||
setProcessed(null);
|
setProcessed(null);
|
||||||
setTotal(null);
|
setTotal(null);
|
||||||
|
setTaskResult(null);
|
||||||
|
|
||||||
const pollTaskStatus = async () => {
|
const pollTaskStatus = async () => {
|
||||||
try {
|
try {
|
||||||
|
|
@ -46,6 +51,20 @@ export function TaskIndicator({ taskID, onTaskCancelled }: TaskIndicatorProps) {
|
||||||
|
|
||||||
if (status === TaskStatus.SUCCESS) {
|
if (status === TaskStatus.SUCCESS) {
|
||||||
setProgressPercentage(100);
|
setProgressPercentage(100);
|
||||||
|
// Parse final result for the drawer to show completed state.
|
||||||
|
// Only update taskResult if the new result has phase info;
|
||||||
|
// otherwise keep the last in-progress result which has richer data
|
||||||
|
// than the bare SUCCESS return value.
|
||||||
|
if (data.result) {
|
||||||
|
try {
|
||||||
|
const parsedResult: TaskResult = JSON.parse(data.result);
|
||||||
|
if (parsedResult.phase) {
|
||||||
|
setTaskResult(parsedResult);
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Ignore parsing errors
|
||||||
|
}
|
||||||
|
}
|
||||||
return true; // Stop polling
|
return true; // Stop polling
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -57,7 +76,18 @@ export function TaskIndicator({ taskID, onTaskCancelled }: TaskIndicatorProps) {
|
||||||
if (data.result) {
|
if (data.result) {
|
||||||
try {
|
try {
|
||||||
const parsedResult: TaskResult = JSON.parse(data.result);
|
const parsedResult: TaskResult = JSON.parse(data.result);
|
||||||
setProgressPercentage(parsedResult.progress * 100);
|
// Only update taskResult if the parsed data has a phase field.
|
||||||
|
// This prevents blanking the drawer when the backend sends a
|
||||||
|
// state update without phase info (e.g. during brief transitions).
|
||||||
|
if (parsedResult.phase) {
|
||||||
|
setTaskResult(parsedResult);
|
||||||
|
}
|
||||||
|
// Only update progress/processed/total when the fields are
|
||||||
|
// actually present — otherwise keep the previous values so
|
||||||
|
// the UI doesn't flash back to 0 during phase transitions.
|
||||||
|
if (parsedResult.progress !== undefined) {
|
||||||
|
setProgressPercentage(parsedResult.progress * 100);
|
||||||
|
}
|
||||||
if (parsedResult.processed !== undefined) {
|
if (parsedResult.processed !== undefined) {
|
||||||
setProcessed(parsedResult.processed);
|
setProcessed(parsedResult.processed);
|
||||||
}
|
}
|
||||||
|
|
@ -113,6 +143,7 @@ export function TaskIndicator({ taskID, onTaskCancelled }: TaskIndicatorProps) {
|
||||||
const result = await clearAllTasks(user);
|
const result = await clearAllTasks(user);
|
||||||
if (result.success) {
|
if (result.success) {
|
||||||
setTaskStatus(null);
|
setTaskStatus(null);
|
||||||
|
setTaskResult(null);
|
||||||
onTaskCancelled?.();
|
onTaskCancelled?.();
|
||||||
}
|
}
|
||||||
} catch {
|
} catch {
|
||||||
|
|
@ -144,18 +175,27 @@ export function TaskIndicator({ taskID, onTaskCancelled }: TaskIndicatorProps) {
|
||||||
if (processed !== null && total !== null && total > 0) {
|
if (processed !== null && total !== null && total > 0) {
|
||||||
return `${processed} / ${total}`;
|
return `${processed} / ${total}`;
|
||||||
}
|
}
|
||||||
|
if (taskResult?.phase && taskResult.phase !== 'processing') {
|
||||||
|
const phaseLabels: Record<string, string> = {
|
||||||
|
splitting: 'Splitting',
|
||||||
|
splitting_complete: 'Split done',
|
||||||
|
fetching: 'Fetching',
|
||||||
|
filtering: 'Filtering',
|
||||||
|
};
|
||||||
|
return phaseLabels[taskResult.phase] ?? `${Math.round(progressPercentage)}%`;
|
||||||
|
}
|
||||||
return `${Math.round(progressPercentage)}%`;
|
return `${Math.round(progressPercentage)}%`;
|
||||||
};
|
};
|
||||||
|
|
||||||
const getTooltipContent = () => {
|
const getTooltipContent = () => {
|
||||||
if (isInProgress) {
|
if (isInProgress) {
|
||||||
if (processed !== null && total !== null && total > 0) {
|
if (processed !== null && total !== null && total > 0) {
|
||||||
return `Processing: ${processed} / ${total} listings (${Math.round(progressPercentage)}%)`;
|
return `Processing: ${processed} / ${total} listings (${Math.round(progressPercentage)}%) — click for details`;
|
||||||
}
|
}
|
||||||
return `Task running: ${Math.round(progressPercentage)}%`;
|
return `Task running: ${getProgressText()} — click for details`;
|
||||||
}
|
}
|
||||||
if (taskStatus === TaskStatus.SUCCESS) {
|
if (taskStatus === TaskStatus.SUCCESS) {
|
||||||
return 'Task completed successfully';
|
return 'Task completed successfully — click for details';
|
||||||
}
|
}
|
||||||
if (taskStatus === TaskStatus.REVOKED) {
|
if (taskStatus === TaskStatus.REVOKED) {
|
||||||
return 'Task was cancelled';
|
return 'Task was cancelled';
|
||||||
|
|
@ -168,7 +208,10 @@ export function TaskIndicator({ taskID, onTaskCancelled }: TaskIndicatorProps) {
|
||||||
<div className="flex items-center gap-2">
|
<div className="flex items-center gap-2">
|
||||||
<Tooltip>
|
<Tooltip>
|
||||||
<TooltipTrigger asChild>
|
<TooltipTrigger asChild>
|
||||||
<div className="flex items-center gap-2 cursor-default">
|
<div
|
||||||
|
className="flex items-center gap-2 cursor-pointer"
|
||||||
|
onClick={() => setDrawerOpen(true)}
|
||||||
|
>
|
||||||
{getStatusIcon()}
|
{getStatusIcon()}
|
||||||
{isInProgress && (
|
{isInProgress && (
|
||||||
<div className="flex items-center gap-2">
|
<div className="flex items-center gap-2">
|
||||||
|
|
@ -230,6 +273,15 @@ export function TaskIndicator({ taskID, onTaskCancelled }: TaskIndicatorProps) {
|
||||||
</TooltipContent>
|
</TooltipContent>
|
||||||
</Tooltip>
|
</Tooltip>
|
||||||
</div>
|
</div>
|
||||||
|
<TaskProgressDrawer
|
||||||
|
open={drawerOpen}
|
||||||
|
onOpenChange={setDrawerOpen}
|
||||||
|
taskResult={taskResult}
|
||||||
|
taskStatus={taskStatus}
|
||||||
|
taskID={taskID}
|
||||||
|
onCancel={handleCancel}
|
||||||
|
isCancelling={isCancelling}
|
||||||
|
/>
|
||||||
</TooltipProvider>
|
</TooltipProvider>
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
363
crawler/frontend/src/components/TaskProgressDrawer.tsx
Normal file
363
crawler/frontend/src/components/TaskProgressDrawer.tsx
Normal file
|
|
@ -0,0 +1,363 @@
|
||||||
|
import { TaskStatus, type TaskPhase, type TaskResult } from '@/types';
|
||||||
|
import {
|
||||||
|
Sheet,
|
||||||
|
SheetContent,
|
||||||
|
SheetHeader,
|
||||||
|
SheetTitle,
|
||||||
|
SheetDescription,
|
||||||
|
SheetFooter,
|
||||||
|
} from './ui/sheet';
|
||||||
|
import { Button } from './ui/button';
|
||||||
|
import { CheckCircle2, Circle, Loader2, XCircle } from 'lucide-react';
|
||||||
|
import { useEffect, useRef } from 'react';
|
||||||
|
|
||||||
|
/** Props accepted by {@link TaskProgressDrawer}. */
interface TaskProgressDrawerProps {
  /** Whether the drawer is currently shown. */
  open: boolean;
  /** Invoked with the requested open state when the sheet wants to open or close. */
  onOpenChange: (open: boolean) => void;

  /** Latest parsed task payload, or `null` when none is available. */
  taskResult: TaskResult | null;
  /** Current task status, or `null` when no task is being tracked. */
  taskStatus: TaskStatus | null;
  /** Identifier of the tracked task; only its first 8 characters are displayed. */
  taskID: string | null;

  /** Handler wired to the "Cancel Job" button. */
  onCancel: () => void;
  /** While `true`, the cancel button is disabled and shows a spinner. */
  isCancelling: boolean;
}
|
||||||
|
|
||||||
|
/**
 * Pipeline stages rendered by the timeline, in execution order.
 * A stage's position in this array is the index returned by `getPhaseIndex`.
 */
const PHASES: { key: TaskPhase; label: string }[] = [
  {
    key: 'splitting',
    label: 'Splitting queries',
  },
  {
    key: 'fetching',
    label: 'Fetching listings',
  },
  {
    key: 'filtering',
    label: 'Filtering results',
  },
  {
    key: 'processing',
    label: 'Processing listings',
  },
];
|
||||||
|
|
||||||
|
/**
 * Maps a task phase to its position in `PHASES`.
 *
 * @param phase - Phase reported by the task, if any.
 * @returns `-1` when no phase is given (or the phase is not listed),
 *   `PHASES.length` for `'completed'`, otherwise the index of the active stage.
 */
function getPhaseIndex(phase: TaskPhase | undefined): number {
  if (!phase) {
    return -1;
  }

  switch (phase) {
    case 'splitting_complete':
      // Splitting has finished, so the next stage (fetching) is the active one.
      return 1;
    case 'completed':
      // One past the last stage: every stage counts as done.
      return PHASES.length;
    default:
      return PHASES.findIndex((entry) => entry.key === phase);
  }
}
|
||||||
|
|
||||||
|
/**
 * Formats a remaining-time estimate for display under the progress bar.
 *
 * @param seconds - Estimated seconds remaining; may be fractional.
 * @returns An empty string when there is no usable estimate (undefined,
 *   non-finite, zero or negative); otherwise `~Xm Ys remaining`, or
 *   `~Ys remaining` when under a minute.
 */
function formatEta(seconds: number | undefined): string {
  if (seconds === undefined || !Number.isFinite(seconds) || seconds <= 0) return '';

  // Round the whole duration first. Rounding only the remainder could yield
  // 60 and produce output such as "~1m 60s remaining".
  const totalSeconds = Math.round(seconds);
  const mins = Math.floor(totalSeconds / 60);
  const secs = totalSeconds % 60;

  return mins > 0 ? `~${mins}m ${secs}s remaining` : `~${secs}s remaining`;
}
|
||||||
|
|
||||||
|
/**
 * Small pill summarising the task status in the drawer header.
 *
 * Renders nothing when there is no status. SUCCESS, REVOKED and FAILURE each
 * get their own badge; every other status is shown as "Running".
 */
function StatusBadge({ status }: { status: TaskStatus | null }) {
  if (!status) {
    return null;
  }

  switch (status) {
    case TaskStatus.SUCCESS:
      return (
        <span className="inline-flex items-center gap-1 rounded-full bg-green-100 px-2 py-0.5 text-xs font-medium text-green-700">
          <CheckCircle2 className="h-3 w-3" />
          Complete
        </span>
      );

    case TaskStatus.REVOKED:
      return (
        <span className="inline-flex items-center gap-1 rounded-full bg-yellow-100 px-2 py-0.5 text-xs font-medium text-yellow-700">
          <XCircle className="h-3 w-3" />
          Cancelled
        </span>
      );

    case TaskStatus.FAILURE:
      return (
        <span className="inline-flex items-center gap-1 rounded-full bg-red-100 px-2 py-0.5 text-xs font-medium text-red-700">
          <XCircle className="h-3 w-3" />
          Failed
        </span>
      );

    default:
      // Any non-terminal status counts as still running.
      return (
        <span className="inline-flex items-center gap-1 rounded-full bg-blue-100 px-2 py-0.5 text-xs font-medium text-blue-700">
          <Loader2 className="h-3 w-3 animate-spin" />
          Running
        </span>
      );
  }
}
|
||||||
|
|
||||||
|
/**
 * Vertical stepper listing every entry of `PHASES` with an icon for its state.
 *
 * - Stages before the current one show a green check.
 * - The current stage shows a spinner while the task is running.
 * - If the task failed or was cancelled, the stage it stopped in shows an X
 *   (red for failure, yellow for cancellation) and later stages stay pending.
 * - Only a SUCCESS status marks every stage as completed.
 *
 * @param currentPhase - Phase reported by the latest task result, if any.
 * @param taskStatus - Current task status, or `null` when unknown.
 */
function PhaseTimeline({
  currentPhase,
  taskStatus,
}: {
  currentPhase: TaskPhase | undefined;
  taskStatus: TaskStatus | null;
}) {
  const isSuccess = taskStatus === TaskStatus.SUCCESS;
  const isStopped =
    taskStatus === TaskStatus.FAILURE || taskStatus === TaskStatus.REVOKED;

  // A successful run has finished every stage. A failed or cancelled run has
  // not, so it keeps the index of the stage it was in when it stopped instead
  // of being rendered as fully completed.
  const activeIdx = isSuccess ? PHASES.length : getPhaseIndex(currentPhase);

  return (
    <div className="flex flex-col gap-1">
      {PHASES.map((phase, idx) => {
        const isCompleted = idx < activeIdx;
        const isCurrent = idx === activeIdx;
        const isActive = isCurrent && !isStopped;
        const isHalted = isCurrent && isStopped;
        const isFuture = idx > activeIdx;

        return (
          <div key={phase.key} className="flex items-center gap-2">
            {isCompleted && (
              <CheckCircle2 className="h-4 w-4 text-green-500 shrink-0" />
            )}
            {isActive && (
              <Loader2 className="h-4 w-4 animate-spin text-blue-500 shrink-0" />
            )}
            {isHalted && (
              <XCircle
                className={
                  taskStatus === TaskStatus.REVOKED
                    ? 'h-4 w-4 text-yellow-500 shrink-0'
                    : 'h-4 w-4 text-red-500 shrink-0'
                }
              />
            )}
            {isFuture && (
              <Circle className="h-4 w-4 text-muted-foreground/40 shrink-0" />
            )}
            <span
              className={
                isCurrent
                  ? 'text-sm font-medium text-foreground'
                  : isCompleted
                    ? 'text-sm text-muted-foreground'
                    : 'text-sm text-muted-foreground/40'
              }
            >
              {phase.label}
            </span>
          </div>
        );
      })}
    </div>
  );
}
|
||||||
|
|
||||||
|
/**
 * One label/value line inside a phase detail panel.
 *
 * Renders nothing when `value` is undefined. When `total` is provided the
 * value is shown as `value / total`.
 */
function CounterRow({ label, value, total }: { label: string; value?: number; total?: number }) {
  if (value === undefined) {
    return null;
  }

  // Optional " / total" suffix; omitted entirely when no total is known.
  const totalSuffix = total === undefined ? null : ` / ${total}`;

  return (
    <div className="flex justify-between text-sm">
      <span className="text-muted-foreground">{label}</span>
      <span className="font-mono tabular-nums">
        {value}
        {totalSuffix}
      </span>
    </div>
  );
}
|
||||||
|
|
||||||
|
/**
 * Counter panel for the phase the task is currently in.
 *
 * Each of splitting, fetching, filtering and processing has its own set of
 * counters; any other phase (or none) renders nothing.
 */
function PhaseDetails({ result }: { result: TaskResult }) {
  const panelClass = 'rounded-md border p-3 space-y-1';
  const headingClass =
    'text-xs font-medium text-muted-foreground uppercase tracking-wide mb-2';

  switch (result.phase) {
    case 'splitting':
    case 'splitting_complete':
      return (
        <div className={panelClass}>
          <p className={headingClass}>Splitting</p>
          <CounterRow
            label="Subqueries probed"
            value={result.subqueries_probed}
            total={result.subqueries_initial}
          />
          {result.subqueries_total !== undefined && (
            <CounterRow label="Final subqueries" value={result.subqueries_total} />
          )}
          {result.estimated_results !== undefined && (
            <CounterRow label="Estimated results" value={result.estimated_results} />
          )}
        </div>
      );

    case 'fetching':
      return (
        <div className={panelClass}>
          <p className={headingClass}>Fetching</p>
          <CounterRow
            label="Subqueries completed"
            value={result.subqueries_completed}
            total={result.subqueries_total}
          />
          <CounterRow label="IDs collected" value={result.ids_collected} />
          <CounterRow label="Pages fetched" value={result.pages_fetched} />
        </div>
      );

    case 'filtering':
      return (
        <div className={panelClass}>
          <p className={headingClass}>Filtering</p>
          <CounterRow label="Total from API" value={result.total_found} />
          <CounterRow label="Already in DB" value={result.existing_in_db} />
          <CounterRow label="New to process" value={result.new_listings} />
        </div>
      );

    case 'processing':
      return (
        <div className={panelClass}>
          <p className={headingClass}>Processing</p>
          <CounterRow
            label="Details fetched"
            value={result.details_fetched}
            total={result.total}
          />
          <CounterRow label="Images downloaded" value={result.images_downloaded} />
          <CounterRow label="OCR completed" value={result.ocr_completed} />
          {/* The failure row only appears once at least one listing has failed. */}
          {(result.failed ?? 0) > 0 && (
            <div className="flex justify-between text-sm">
              <span className="text-red-500">Failed</span>
              <span className="font-mono tabular-nums text-red-500">{result.failed}</span>
            </div>
          )}
        </div>
      );

    default:
      return null;
  }
}
|
||||||
|
|
||||||
|
/**
 * Scrollable pane showing worker log lines.
 *
 * The pane follows new output while the user is within 30px of the bottom;
 * scrolling further up pauses following until they return to the bottom.
 */
function LogViewer({ logs }: { logs: string[] }) {
  const containerRef = useRef<HTMLDivElement>(null);
  // Kept in a ref so toggling it does not trigger a re-render.
  const followTail = useRef(true);

  useEffect(() => {
    const node = containerRef.current;
    if (followTail.current && node) {
      node.scrollTop = node.scrollHeight;
    }
  }, [logs]);

  function onPaneScroll() {
    const node = containerRef.current;
    if (!node) {
      return;
    }
    const distanceFromBottom = node.scrollHeight - node.scrollTop - node.clientHeight;
    followTail.current = distanceFromBottom < 30;
  }

  const content =
    logs.length === 0 ? (
      <span className="text-zinc-500 italic">Waiting for logs...</span>
    ) : (
      logs.map((line, i) => (
        <div key={i} className="whitespace-pre-wrap break-all">
          {line}
        </div>
      ))
    );

  return (
    <div
      ref={containerRef}
      onScroll={onPaneScroll}
      className="rounded-md bg-zinc-950 p-3 overflow-y-auto font-mono text-[11px] leading-4 text-zinc-300 min-h-[100px] h-full"
    >
      {content}
    </div>
  );
}
|
||||||
|
|
||||||
|
/**
 * Right-hand sheet showing the progress of a crawl job.
 *
 * Layout, top to bottom: header with status badge and shortened task ID,
 * phase timeline, counters for the current phase, a progress bar with ETA
 * (processing phase only), the task message, a log pane filling the remaining
 * height, and a cancel button while the task has not reached a terminal status.
 */
export function TaskProgressDrawer({
  open,
  onOpenChange,
  taskResult,
  taskStatus,
  taskID,
  onCancel,
  isCancelling,
}: TaskProgressDrawerProps) {
  const isTerminal =
    taskStatus === TaskStatus.SUCCESS ||
    taskStatus === TaskStatus.FAILURE ||
    taskStatus === TaskStatus.REVOKED;
  const isInProgress = taskStatus !== null && !isTerminal;

  // `progress` is a 0..1 fraction; a missing value counts as 0 and the bar is capped at 100%.
  const progressPercent = taskResult
    ? Math.min((taskResult.progress ?? 0) * 100, 100)
    : 0;

  // The bar and ETA are only shown during the processing phase.
  const progressSection =
    taskResult && taskResult.phase === 'processing' ? (
      <div className="space-y-1">
        <div className="w-full h-2 bg-primary/20 rounded-full overflow-hidden">
          <div
            className="h-full bg-primary transition-all duration-300 ease-out rounded-full"
            style={{ width: `${progressPercent}%` }}
          />
        </div>
        <div className="flex justify-between text-xs text-muted-foreground">
          <span>
            {taskResult.processed ?? 0} / {taskResult.total ?? '?'}
          </span>
          <span>{formatEta(taskResult.eta_seconds)}</span>
        </div>
      </div>
    ) : null;

  const cancelLabel = isCancelling ? (
    <>
      <Loader2 className="mr-2 h-4 w-4 animate-spin" />
      Cancelling...
    </>
  ) : (
    'Cancel Job'
  );

  return (
    <Sheet open={open} onOpenChange={onOpenChange}>
      <SheetContent side="right" className="flex flex-col w-full sm:!max-w-lg">
        <SheetHeader>
          <div className="flex items-center justify-between pr-6">
            <SheetTitle>Crawl Job Progress</SheetTitle>
            <StatusBadge status={taskStatus} />
          </div>
          {taskID && (
            <SheetDescription>
              Task ID: {taskID.slice(0, 8)}...
            </SheetDescription>
          )}
        </SheetHeader>

        {/* Non-shrinking summary: timeline, counters, progress bar, message */}
        <div className="space-y-3 px-4 shrink-0">
          <PhaseTimeline currentPhase={taskResult?.phase} taskStatus={taskStatus} />

          {taskResult && <PhaseDetails result={taskResult} />}

          {progressSection}

          {taskResult?.message && (
            <p className="text-sm text-muted-foreground">{taskResult.message}</p>
          )}
        </div>

        {/* Log pane takes whatever vertical space is left */}
        <div className="flex-1 min-h-0 flex flex-col gap-1 px-4 pb-2">
          <p className="text-xs font-medium text-muted-foreground uppercase tracking-wide shrink-0">
            Worker Logs
          </p>
          <div className="flex-1 min-h-0">
            <LogViewer logs={taskResult?.logs ?? []} />
          </div>
        </div>

        {isInProgress && (
          <SheetFooter>
            <Button
              variant="destructive"
              onClick={onCancel}
              disabled={isCancelling}
              className="w-full"
            >
              {cancelLabel}
            </Button>
          </SheetFooter>
        )}
      </SheetContent>
    </Sheet>
  );
}
|
||||||
|
|
@ -48,13 +48,41 @@ export enum TaskStatus {
|
||||||
|
|
||||||
export interface TaskStatusResponse {
|
export interface TaskStatusResponse {
|
||||||
status: TaskStatus;
|
status: TaskStatus;
|
||||||
result: string; // JSON string containing { progress: number }
|
result: string; // JSON string containing TaskResult
|
||||||
|
message?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export type TaskPhase = 'splitting' | 'splitting_complete' | 'fetching' | 'filtering' | 'processing' | 'completed';
|
||||||
|
|
||||||
export interface TaskResult {
|
export interface TaskResult {
|
||||||
progress: number;
|
progress: number;
|
||||||
processed?: number;
|
processed?: number;
|
||||||
total?: number;
|
total?: number;
|
||||||
|
phase?: TaskPhase;
|
||||||
|
message?: string;
|
||||||
|
// Splitting phase
|
||||||
|
subqueries_probed?: number;
|
||||||
|
subqueries_initial?: number;
|
||||||
|
estimated_results?: number;
|
||||||
|
subqueries_total?: number;
|
||||||
|
// Fetching phase
|
||||||
|
subqueries_completed?: number;
|
||||||
|
ids_collected?: number;
|
||||||
|
pages_fetched?: number;
|
||||||
|
// Filtering phase
|
||||||
|
total_found?: number;
|
||||||
|
existing_in_db?: number;
|
||||||
|
new_listings?: number;
|
||||||
|
// Processing phase
|
||||||
|
details_fetched?: number;
|
||||||
|
images_downloaded?: number;
|
||||||
|
ocr_completed?: number;
|
||||||
|
failed?: number;
|
||||||
|
elapsed_seconds?: number;
|
||||||
|
rate_per_second?: number;
|
||||||
|
eta_seconds?: number;
|
||||||
|
// Live logs
|
||||||
|
logs?: string[];
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface RefreshListingsResponse {
|
export interface RefreshListingsResponse {
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
import asyncio
|
import asyncio
|
||||||
|
from collections.abc import Callable
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import logging
|
import logging
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
|
|
@ -22,6 +23,13 @@ class ListingProcessor:
|
||||||
process_steps: list[Step]
|
process_steps: list[Step]
|
||||||
listing_repository: ListingRepository
|
listing_repository: ListingRepository
|
||||||
|
|
||||||
|
# Map step class names to short names for progress reporting
|
||||||
|
STEP_NAMES: dict[str, str] = {
|
||||||
|
"FetchListingDetailsStep": "details",
|
||||||
|
"FetchImagesStep": "images",
|
||||||
|
"DetectFloorplanStep": "ocr",
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(self, listing_repository: ListingRepository):
|
def __init__(self, listing_repository: ListingRepository):
|
||||||
self.semaphore = asyncio.Semaphore(20)
|
self.semaphore = asyncio.Semaphore(20)
|
||||||
self.listing_repository = listing_repository
|
self.listing_repository = listing_repository
|
||||||
|
|
@ -33,19 +41,28 @@ class ListingProcessor:
|
||||||
DetectFloorplanStep(listing_repository),
|
DetectFloorplanStep(listing_repository),
|
||||||
]
|
]
|
||||||
|
|
||||||
async def process_listing(self, listing_id: int) -> Listing | None:
|
async def process_listing(
|
||||||
|
self,
|
||||||
|
listing_id: int,
|
||||||
|
on_step_complete: Callable[[str], None] | None = None,
|
||||||
|
) -> Listing | None:
|
||||||
await self.listing_repository.mark_seen(listing_id)
|
await self.listing_repository.mark_seen(listing_id)
|
||||||
listing = None
|
listing = None
|
||||||
for step in self.process_steps:
|
for step in self.process_steps:
|
||||||
if await step.needs_processing(listing_id):
|
if await step.needs_processing(listing_id):
|
||||||
async with self.semaphore:
|
async with self.semaphore:
|
||||||
step_name = step.__class__.__name__
|
step_class_name = step.__class__.__name__
|
||||||
try:
|
try:
|
||||||
listing = await step.process(listing_id)
|
listing = await step.process(listing_id)
|
||||||
logger.debug(f"[{listing_id}] {step_name} completed")
|
logger.debug(f"[{listing_id}] {step_class_name} completed")
|
||||||
|
if on_step_complete:
|
||||||
|
short_name = self.STEP_NAMES.get(
|
||||||
|
step_class_name, step_class_name
|
||||||
|
)
|
||||||
|
on_step_complete(short_name)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"[{listing_id}] {step_name} failed: {e}")
|
logger.error(f"[{listing_id}] {step_class_name} failed: {e}")
|
||||||
celery_logger.error(f"[{listing_id}] {step_name} failed: {e}")
|
celery_logger.error(f"[{listing_id}] {step_class_name} failed: {e}")
|
||||||
return None
|
return None
|
||||||
return listing
|
return listing
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -238,6 +238,8 @@ class QuerySplitter:
|
||||||
parameters: Original query parameters to split.
|
parameters: Original query parameters to split.
|
||||||
session: aiohttp session for making requests.
|
session: aiohttp session for making requests.
|
||||||
on_progress: Optional callback for progress updates.
|
on_progress: Optional callback for progress updates.
|
||||||
|
Called as on_progress(phase, message, **kwargs) where kwargs
|
||||||
|
contains structured data like subqueries_probed, etc.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of SubQuery objects, each under the result threshold.
|
List of SubQuery objects, each under the result threshold.
|
||||||
|
|
@ -260,19 +262,32 @@ class QuerySplitter:
|
||||||
on_progress(
|
on_progress(
|
||||||
phase="splitting",
|
phase="splitting",
|
||||||
message=f"Created {len(initial_subqueries)} initial subqueries",
|
message=f"Created {len(initial_subqueries)} initial subqueries",
|
||||||
|
subqueries_initial=len(initial_subqueries),
|
||||||
|
subqueries_probed=0,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Phase 2: Probe and adaptively split
|
# Phase 2: Probe and adaptively split
|
||||||
semaphore = asyncio.Semaphore(self.config.max_concurrent_requests)
|
semaphore = asyncio.Semaphore(self.config.max_concurrent_requests)
|
||||||
refined_subqueries: list[SubQuery] = []
|
refined_subqueries: list[SubQuery] = []
|
||||||
|
probed_count = 0
|
||||||
|
|
||||||
# Probe all initial subqueries in parallel
|
# Probe all initial subqueries in parallel
|
||||||
async def probe_and_split(sq: SubQuery) -> list[SubQuery]:
|
async def probe_and_split(sq: SubQuery) -> list[SubQuery]:
|
||||||
|
nonlocal probed_count
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
await asyncio.sleep(self.config.request_delay_ms / 1000)
|
await asyncio.sleep(self.config.request_delay_ms / 1000)
|
||||||
count = await self.probe_result_count(sq, session, parameters)
|
count = await self.probe_result_count(sq, session, parameters)
|
||||||
|
|
||||||
sq = replace(sq, estimated_results=count)
|
sq = replace(sq, estimated_results=count)
|
||||||
|
probed_count += 1
|
||||||
|
|
||||||
|
if on_progress:
|
||||||
|
on_progress(
|
||||||
|
phase="splitting",
|
||||||
|
message=f"Probed {probed_count}/{len(initial_subqueries)} subqueries",
|
||||||
|
subqueries_initial=len(initial_subqueries),
|
||||||
|
subqueries_probed=probed_count,
|
||||||
|
)
|
||||||
|
|
||||||
if count > self.config.split_threshold:
|
if count > self.config.split_threshold:
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|
@ -294,10 +309,14 @@ class QuerySplitter:
|
||||||
f"Refined to {len(refined_subqueries)} subqueries after splitting"
|
f"Refined to {len(refined_subqueries)} subqueries after splitting"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
total_estimated = self.calculate_total_estimated_results(refined_subqueries)
|
||||||
|
|
||||||
if on_progress:
|
if on_progress:
|
||||||
on_progress(
|
on_progress(
|
||||||
phase="splitting_complete",
|
phase="splitting_complete",
|
||||||
message=f"Refined to {len(refined_subqueries)} subqueries",
|
message=f"Refined to {len(refined_subqueries)} subqueries",
|
||||||
|
subqueries_total=len(refined_subqueries),
|
||||||
|
estimated_results=total_estimated,
|
||||||
)
|
)
|
||||||
|
|
||||||
return refined_subqueries
|
return refined_subqueries
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
|
from collections import deque
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from celery import Task
|
from celery import Task
|
||||||
from celery.schedules import crontab
|
from celery.schedules import crontab
|
||||||
|
|
@ -31,6 +32,21 @@ if not celery_logger.handlers:
|
||||||
celery_logger.setLevel(logging.INFO)
|
celery_logger.setLevel(logging.INFO)
|
||||||
|
|
||||||
SCRAPE_LOCK_NAME = "scrape_listings"
|
SCRAPE_LOCK_NAME = "scrape_listings"
|
||||||
|
LOG_BUFFER_MAX_LINES = 200
|
||||||
|
|
||||||
|
|
||||||
|
class TaskLogHandler(logging.Handler):
    """Logging handler that appends each formatted record to a caller-supplied deque.

    The handler does not own or bound the buffer; whatever ``maxlen`` the
    caller gave the deque decides how many lines are retained.
    """

    def __init__(self, buffer: deque[str]) -> None:
        """Store the target buffer.

        Args:
            buffer: Deque that receives one formatted string per log record.
        """
        super().__init__()
        self.buffer = buffer

    def emit(self, record: logging.LogRecord) -> None:
        """Format ``record`` and append it to the buffer.

        A failure while formatting or appending never propagates to the code
        that issued the log call. It is routed through ``handleError`` (the
        standard ``logging`` hook) so it is reported according to
        ``logging.raiseExceptions`` rather than being dropped silently.
        """
        try:
            self.buffer.append(self.format(record))
        except Exception:
            self.handleError(record)
|
||||||
|
|
||||||
|
|
||||||
@app.task(bind=True, pydantic=True)
|
@app.task(bind=True, pydantic=True)
|
||||||
|
|
@ -49,9 +65,9 @@ def dump_listings_task(self: Task, parameters_json: str) -> dict[str, Any]:
|
||||||
parsed_parameters = QueryParameters.model_validate_json(parameters_json)
|
parsed_parameters = QueryParameters.model_validate_json(parameters_json)
|
||||||
celery_logger.info(f"Starting scrape with parameters: {parsed_parameters}")
|
celery_logger.info(f"Starting scrape with parameters: {parsed_parameters}")
|
||||||
|
|
||||||
self.update_state(state="Starting...", meta={"progress": 0})
|
self.update_state(state="Starting...", meta={"phase": "splitting", "progress": 0})
|
||||||
asyncio.run(dump_listings_full(task=self, parameters=parsed_parameters))
|
asyncio.run(dump_listings_full(task=self, parameters=parsed_parameters))
|
||||||
return {"progress": 0}
|
return {"phase": "completed", "progress": 1}
|
||||||
|
|
||||||
|
|
||||||
async def async_dump_listings_task(parameters_json: str) -> dict[str, Any]:
|
async def async_dump_listings_task(parameters_json: str) -> dict[str, Any]:
|
||||||
|
|
@ -70,6 +86,39 @@ async def dump_listings_full(
|
||||||
*, task: Task, parameters: QueryParameters
|
*, task: Task, parameters: QueryParameters
|
||||||
) -> list[Listing]:
|
) -> list[Listing]:
|
||||||
"""Fetches all listings, images as well as detects floorplans"""
|
"""Fetches all listings, images as well as detects floorplans"""
|
||||||
|
# Set up log capture: a ring buffer handler that we inject into every
|
||||||
|
# task.update_state() call so the frontend can display live logs.
|
||||||
|
log_buffer: deque[str] = deque(maxlen=LOG_BUFFER_MAX_LINES)
|
||||||
|
log_handler = TaskLogHandler(log_buffer)
|
||||||
|
log_handler.setFormatter(
|
||||||
|
logging.Formatter("%(asctime)s %(message)s", datefmt="%H:%M:%S")
|
||||||
|
)
|
||||||
|
celery_logger.addHandler(log_handler)
|
||||||
|
|
||||||
|
# Wrap task.update_state so every call automatically includes logs
|
||||||
|
_original_update_state = task.update_state
|
||||||
|
|
||||||
|
def _update_state_with_logs(
|
||||||
|
state: str | None = None, meta: dict[str, Any] | None = None, **kwargs: Any
|
||||||
|
) -> None:
|
||||||
|
if meta is None:
|
||||||
|
meta = {}
|
||||||
|
meta["logs"] = list(log_buffer)
|
||||||
|
_original_update_state(state=state, meta=meta, **kwargs)
|
||||||
|
|
||||||
|
task.update_state = _update_state_with_logs # type: ignore[assignment]
|
||||||
|
|
||||||
|
try:
|
||||||
|
return await _dump_listings_full_inner(task=task, parameters=parameters)
|
||||||
|
finally:
|
||||||
|
celery_logger.removeHandler(log_handler)
|
||||||
|
task.update_state = _original_update_state # type: ignore[assignment]
|
||||||
|
|
||||||
|
|
||||||
|
async def _dump_listings_full_inner(
|
||||||
|
*, task: Task, parameters: QueryParameters
|
||||||
|
) -> list[Listing]:
|
||||||
|
"""Inner implementation — called with log-capturing update_state wrapper."""
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
celery_logger.info("=" * 60)
|
celery_logger.info("=" * 60)
|
||||||
celery_logger.info("PHASE 1: Initializing listing fetch")
|
celery_logger.info("PHASE 1: Initializing listing fetch")
|
||||||
|
|
@ -77,7 +126,7 @@ async def dump_listings_full(
|
||||||
|
|
||||||
repository = ListingRepository(engine)
|
repository = ListingRepository(engine)
|
||||||
|
|
||||||
task.update_state(state="Identifying missing listings", meta={"progress": 0})
|
task.update_state(state="Identifying missing listings", meta={"phase": "splitting", "progress": 0})
|
||||||
celery_logger.info("Querying Rightmove API to identify new listings...")
|
celery_logger.info("Querying Rightmove API to identify new listings...")
|
||||||
ids_to_process = await get_ids_to_process(
|
ids_to_process = await get_ids_to_process(
|
||||||
parameters=parameters, repository=repository, task=task
|
parameters=parameters, repository=repository, task=task
|
||||||
|
|
@ -92,7 +141,7 @@ async def dump_listings_full(
|
||||||
invalidate_cache()
|
invalidate_cache()
|
||||||
task.update_state(
|
task.update_state(
|
||||||
state="No new listings found",
|
state="No new listings found",
|
||||||
meta={"progress": 1, "processed": 0, "total": 0, "message": "All listings are up to date"},
|
meta={"phase": "completed", "progress": 1, "processed": 0, "total": 0, "message": "All listings are up to date"},
|
||||||
)
|
)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
@ -115,6 +164,18 @@ async def dump_listings_full(
|
||||||
|
|
||||||
invalidate_cache()
|
invalidate_cache()
|
||||||
|
|
||||||
|
# Send final state so the frontend has rich data even after task completes
|
||||||
|
task.update_state(
|
||||||
|
state="Completed",
|
||||||
|
meta={
|
||||||
|
"phase": "completed",
|
||||||
|
"progress": 1,
|
||||||
|
"processed": len(result),
|
||||||
|
"total": len(ids_to_process),
|
||||||
|
"message": f"Processed {len(result)} listings in {elapsed:.0f}s",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -124,11 +185,26 @@ async def dump_listings_and_monitor(
|
||||||
task_progress = {missing_id: 0 for missing_id in missing_ids}
|
task_progress = {missing_id: 0 for missing_id in missing_ids}
|
||||||
processed_count = 0
|
processed_count = 0
|
||||||
failed_count = 0
|
failed_count = 0
|
||||||
|
details_fetched = 0
|
||||||
|
images_downloaded = 0
|
||||||
|
ocr_completed = 0
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
async def process(missing_id: int) -> Listing | None:
|
async def process(missing_id: int) -> Listing | None:
|
||||||
nonlocal processed_count, failed_count
|
nonlocal processed_count, failed_count
|
||||||
listing = await listing_processor.process_listing(missing_id)
|
|
||||||
|
def step_callback(step_name: str) -> None:
|
||||||
|
nonlocal details_fetched, images_downloaded, ocr_completed
|
||||||
|
if step_name == "details":
|
||||||
|
details_fetched += 1
|
||||||
|
elif step_name == "images":
|
||||||
|
images_downloaded += 1
|
||||||
|
elif step_name == "ocr":
|
||||||
|
ocr_completed += 1
|
||||||
|
|
||||||
|
listing = await listing_processor.process_listing(
|
||||||
|
missing_id, on_step_complete=step_callback
|
||||||
|
)
|
||||||
task_progress[missing_id] = 1
|
task_progress[missing_id] = 1
|
||||||
if listing is not None:
|
if listing is not None:
|
||||||
processed_count += 1
|
processed_count += 1
|
||||||
|
|
@ -141,12 +217,12 @@ async def dump_listings_and_monitor(
|
||||||
while (progress := sum(task_progress.values())) < len(missing_ids):
|
while (progress := sum(task_progress.values())) < len(missing_ids):
|
||||||
progress_ratio = round(progress / len(missing_ids), 2)
|
progress_ratio = round(progress / len(missing_ids), 2)
|
||||||
|
|
||||||
|
elapsed = time.time() - start_time
|
||||||
|
rate = progress / elapsed if elapsed > 0 else 0
|
||||||
|
eta = (len(missing_ids) - progress) / rate if rate > 0 else 0
|
||||||
|
|
||||||
# Log every 10% progress or at least every update
|
# Log every 10% progress or at least every update
|
||||||
if progress_ratio >= last_progress + 0.1 or progress == 1:
|
if progress_ratio >= last_progress + 0.1 or progress == 1:
|
||||||
elapsed = time.time() - start_time
|
|
||||||
rate = progress / elapsed if elapsed > 0 else 0
|
|
||||||
eta = (len(missing_ids) - progress) / rate if rate > 0 else 0
|
|
||||||
|
|
||||||
celery_logger.info(
|
celery_logger.info(
|
||||||
f"Progress: {progress_ratio * 100:.0f}% "
|
f"Progress: {progress_ratio * 100:.0f}% "
|
||||||
f"({progress}/{len(missing_ids)}) "
|
f"({progress}/{len(missing_ids)}) "
|
||||||
|
|
@ -158,7 +234,19 @@ async def dump_listings_and_monitor(
|
||||||
|
|
||||||
task.update_state(
|
task.update_state(
|
||||||
state=f"Processing: {progress_ratio * 100:.0f}% ({progress}/{len(missing_ids)})",
|
state=f"Processing: {progress_ratio * 100:.0f}% ({progress}/{len(missing_ids)})",
|
||||||
meta={"progress": progress_ratio, "processed": progress, "total": len(missing_ids)},
|
meta={
|
||||||
|
"phase": "processing",
|
||||||
|
"progress": progress_ratio,
|
||||||
|
"processed": progress,
|
||||||
|
"total": len(missing_ids),
|
||||||
|
"details_fetched": details_fetched,
|
||||||
|
"images_downloaded": images_downloaded,
|
||||||
|
"ocr_completed": ocr_completed,
|
||||||
|
"failed": failed_count,
|
||||||
|
"elapsed_seconds": round(elapsed, 1),
|
||||||
|
"rate_per_second": round(rate, 2),
|
||||||
|
"eta_seconds": round(eta, 1),
|
||||||
|
},
|
||||||
)
|
)
|
||||||
await asyncio.sleep(1)
|
await asyncio.sleep(1)
|
||||||
|
|
||||||
|
|
@ -224,8 +312,10 @@ async def get_ids_to_process(
|
||||||
# Reset throttle metrics
|
# Reset throttle metrics
|
||||||
reset_throttle_metrics()
|
reset_throttle_metrics()
|
||||||
|
|
||||||
def on_progress(phase: str, message: str) -> None:
|
def on_progress(phase: str, message: str, **kwargs: Any) -> None:
|
||||||
task.update_state(state=message, meta={"phase": phase})
|
meta: dict[str, Any] = {"phase": phase, "message": message}
|
||||||
|
meta.update(kwargs)
|
||||||
|
task.update_state(state=message, meta=meta)
|
||||||
celery_logger.info(f"[{phase}] {message}")
|
celery_logger.info(f"[{phase}] {message}")
|
||||||
|
|
||||||
celery_logger.info("Starting query splitting and probing...")
|
celery_logger.info("Starting query splitting and probing...")
|
||||||
|
|
@ -254,7 +344,10 @@ async def get_ids_to_process(
|
||||||
state=f"Fetching listings from {len(subqueries)} subqueries...",
|
state=f"Fetching listings from {len(subqueries)} subqueries...",
|
||||||
meta={
|
meta={
|
||||||
"phase": "fetching",
|
"phase": "fetching",
|
||||||
"subqueries": len(subqueries),
|
"subqueries_completed": 0,
|
||||||
|
"subqueries_total": len(subqueries),
|
||||||
|
"ids_collected": 0,
|
||||||
|
"pages_fetched": 0,
|
||||||
"estimated_results": total_estimated,
|
"estimated_results": total_estimated,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
@ -275,6 +368,16 @@ async def get_ids_to_process(
|
||||||
estimated = sq.estimated_results or 0
|
estimated = sq.estimated_results or 0
|
||||||
if estimated == 0:
|
if estimated == 0:
|
||||||
completed_subqueries += 1
|
completed_subqueries += 1
|
||||||
|
task.update_state(
|
||||||
|
state=f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
|
||||||
|
meta={
|
||||||
|
"phase": "fetching",
|
||||||
|
"subqueries_completed": completed_subqueries,
|
||||||
|
"subqueries_total": len(subqueries),
|
||||||
|
"ids_collected": len(identifiers),
|
||||||
|
"pages_fetched": total_pages_fetched,
|
||||||
|
},
|
||||||
|
)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
# Fetch pages up to max_pages_per_query or until no more results
|
# Fetch pages up to max_pages_per_query or until no more results
|
||||||
|
|
@ -333,6 +436,16 @@ async def get_ids_to_process(
|
||||||
break
|
break
|
||||||
|
|
||||||
completed_subqueries += 1
|
completed_subqueries += 1
|
||||||
|
task.update_state(
|
||||||
|
state=f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
|
||||||
|
meta={
|
||||||
|
"phase": "fetching",
|
||||||
|
"subqueries_completed": completed_subqueries,
|
||||||
|
"subqueries_total": len(subqueries),
|
||||||
|
"ids_collected": len(identifiers),
|
||||||
|
"pages_fetched": total_pages_fetched,
|
||||||
|
},
|
||||||
|
)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
# Fetch all subqueries concurrently
|
# Fetch all subqueries concurrently
|
||||||
|
|
@ -391,6 +504,7 @@ async def get_ids_to_process(
|
||||||
meta={
|
meta={
|
||||||
"phase": "filtering",
|
"phase": "filtering",
|
||||||
"total_found": len(identifiers),
|
"total_found": len(identifiers),
|
||||||
|
"existing_in_db": len(all_listing_ids),
|
||||||
"new_listings": len(new_ids),
|
"new_listings": len(new_ids),
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue