Files
Mangalord/backend/src/crawler/browser_manager.rs
MechaCat02 9fe0f26d75 feat: in-process crawler daemon with cron and worker pool (0.28.0)
The backend now boots an internal crawler daemon that runs a daily
metadata pass (CRAWLER_DAILY_AT in CRAWLER_TZ, advisory-lock guarded
for multi-replica safety) and drains SyncChapterContent jobs from
crawler_jobs through a worker pool. Chromium launches lazily on first
job and is torn down after CRAWLER_IDLE_TIMEOUT_S seconds of inactivity.

Modules:
- crawler::browser_manager — lazy-launch / idle-teardown wrapper
  around browser::Handle, with an on_launch hook that re-injects
  PHPSESSID on every fresh Chromium spawn.
- crawler::pipeline — run_metadata_pass (the shared discover/upsert
  /cover/sync-chapters loop) and the enqueue_bookmarked_pending helper
  used by the cron tick.
- crawler::daemon — cron task + worker pool, behind two trait seams
  (MetadataPass, ChapterDispatcher) so tests can inject stubs without
  standing up Chromium or a live source.

Behavior:
- CRAWLER_DAEMON=false skips daemon spawn entirely (default for tests).
- Catch-up tick fires on startup if the last persisted slot was missed.
- A SyncOutcome::SessionExpired sets a sticky AtomicBool; workers then
  stay idle until an operator restarts the process with a refreshed PHPSESSID.
- Worker dispatch wrapped in catch_unwind so a panicking handler
  marks the job failed instead of taking down the worker.
- Migration 0015 adds a small crawler_state k-v table for the
  last_metadata_tick_at watermark.

Dep additions: chrono-tz (IANA TZ parsing).

CLI (bin/crawler) reuses pipeline::run_metadata_pass and now holds
the browser via BrowserManager so the on_launch session injection
flow stays in one place. Inline chapter-content sync semantics are
unchanged — the queue is for the daemon, force-refetches and manual
backfills still bypass it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 20:32:02 +02:00

263 lines
8.2 KiB
Rust

//! Lazy-launch / idle-teardown Chromium manager for the daemon.
//!
//! The first worker that calls [`BrowserManager::acquire`] triggers a real
//! Chromium launch (and the `on_launch` hook — used to re-inject the
//! PHPSESSID cookie on every fresh process). Each acquire bumps an active
//! counter; the returned [`BrowserLease`] decrements it on drop.
//!
//! When the active counter hits zero, a background reaper task waits
//! `idle_timeout`. If still zero on wake, it closes Chromium and clears the
//! cached handle. The next acquire re-launches.
//!
//! `idle_timeout = Duration::ZERO` disables the reaper — Chromium stays alive
//! until [`BrowserManager::shutdown`].
use std::ops::Deref;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::time::Duration;
use anyhow::Context;
use chromiumoxide::browser::Browser;
use futures_util::future::BoxFuture;
use tokio::sync::{Mutex, Notify};
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use crate::crawler::browser::{self, LaunchOptions};
/// Hook invoked on every fresh launch with the new browser.
///
/// The hook receives a shared handle to the just-launched browser and
/// returns a boxed future resolving to `anyhow::Result<()>`. Typically used
/// to re-inject PHPSESSID + run the session probe.
///
/// If the hook returns an error, the `acquire` that triggered the launch
/// closes the new browser and fails with that error; nothing is cached, so
/// the next acquire will re-launch.
pub type OnLaunch =
Arc<dyn Fn(Arc<Browser>) -> BoxFuture<'static, anyhow::Result<()>> + Send + Sync>;
/// Returns an `OnLaunch` that does nothing — useful when no session is
/// configured (e.g. CLI metadata-only runs).
pub fn noop_on_launch() -> OnLaunch {
Arc::new(|_| Box::pin(async { Ok(()) }))
}
/// Decoupled active-lease tracker. Owns the atomic counter and the idle
/// notifier so the wiring is unit-testable without standing up a real
/// `BrowserManager` (which would require launching Chromium).
///
/// `Default` yields a tracker with a zero count.
#[derive(Default)]
pub(crate) struct ActiveTracker {
// Number of leases currently outstanding (bumped by `acquire`, decremented
// by `release`).
counter: AtomicUsize,
// Notified by `release` when the count drops from 1 to 0; the idle reaper
// waits on it.
idle_signal: Notify,
}
impl ActiveTracker {
    /// Creates a tracker with no outstanding leases, already wrapped in an
    /// `Arc` so it can be shared between the manager and its leases.
    pub(crate) fn new() -> Arc<Self> {
        let tracker = Self::default();
        Arc::new(tracker)
    }

    /// Registers one more outstanding lease.
    pub(crate) fn acquire(self: &Arc<Self>) {
        self.counter.fetch_add(1, Ordering::AcqRel);
    }

    /// Registers the end of a lease. When the lease being released was the
    /// last one (the count goes 1 -> 0), the idle signal is notified.
    pub(crate) fn release(self: &Arc<Self>) {
        let before = self.counter.fetch_sub(1, Ordering::AcqRel);
        let was_last_lease = before == 1;
        if was_last_lease {
            self.idle_signal.notify_one();
        }
    }

    /// Snapshot of the number of outstanding leases.
    pub(crate) fn current(&self) -> usize {
        self.counter.load(Ordering::Acquire)
    }

    /// The notifier that `release` fires on the 1 -> 0 transition.
    pub(crate) fn idle_signal(&self) -> &Notify {
        &self.idle_signal
    }
}
/// Lazily launches Chromium on demand and hands out [`BrowserLease`]s that
/// share the single cached browser.
///
/// Constructed via [`BrowserManager::new`], which returns the manager inside
/// an `Arc`.
pub struct BrowserManager {
// Cached browser state (owning handle + shared `Arc<Browser>`), guarded by
// an async mutex that `acquire` holds across launch and the on-launch hook.
inner: Mutex<Inner>,
// Lease counter shared with every `BrowserLease` handed out.
active: Arc<ActiveTracker>,
// Launch options, cloned for every fresh launch.
launch_opts: LaunchOptions,
// How long the reaper waits at zero leases before closing the browser;
// zero disables idle teardown.
idle_timeout: Duration,
// Hook run against each freshly launched browser before it is cached.
on_launch: OnLaunch,
}
/// Mutex-protected browser state of a [`BrowserManager`].
///
/// Both fields are set together after a successful launch in `acquire` and
/// cleared together by `shutdown` and the idle reaper.
struct Inner {
// Owning handle of the running browser; used to close it. `None` means no
// browser is cached and the next acquire must launch one.
handle: Option<browser::Handle>,
// Shared browser obtained from `handle.shared()`; cloned into each lease.
shared: Option<Arc<Browser>>,
}
impl BrowserManager {
    /// Creates a manager with no browser running yet.
    ///
    /// * `launch_opts` — options cloned for every Chromium launch.
    /// * `idle_timeout` — idle period consumed by the reaper; zero disables it.
    /// * `on_launch` — hook run on each freshly launched browser.
    pub fn new(
        launch_opts: LaunchOptions,
        idle_timeout: Duration,
        on_launch: OnLaunch,
    ) -> Arc<Self> {
        let inner = Mutex::new(Inner {
            handle: None,
            shared: None,
        });
        let manager = Self {
            inner,
            active: ActiveTracker::new(),
            launch_opts,
            idle_timeout,
            on_launch,
        };
        Arc::new(manager)
    }

    /// Hands out a lease on the shared browser.
    ///
    /// When no browser is cached, a fresh Chromium is launched and the
    /// `on_launch` hook is run against it before it is cached; otherwise the
    /// cached browser is reused. Either way the active-lease counter is
    /// bumped while the state mutex is still held.
    ///
    /// # Errors
    ///
    /// Fails when the launch fails, or when the `on_launch` hook fails — in
    /// the latter case the new browser is closed and nothing is cached.
    pub async fn acquire(&self) -> anyhow::Result<BrowserLease> {
        let mut state = self.inner.lock().await;

        if state.handle.is_none() {
            let launched = browser::launch(self.launch_opts.clone())
                .await
                .context("BrowserManager: launch chromium")?;
            let fresh = launched.shared();

            // The hook runs before the browser is published, so a failed
            // session probe never leaves a half-initialized browser cached.
            match (self.on_launch)(Arc::clone(&fresh)).await {
                Ok(()) => {
                    state.handle = Some(launched);
                    state.shared = Some(fresh);
                }
                Err(hook_err) => {
                    // Best-effort cleanup of the browser we will not use.
                    let _ = launched.close().await;
                    return Err(hook_err.context("BrowserManager: on_launch hook failed"));
                }
            }
        }

        let browser = Arc::clone(state.shared.as_ref().expect("shared set above"));
        self.active.acquire();

        Ok(BrowserLease {
            browser,
            active: Arc::clone(&self.active),
        })
    }

    /// Closes the cached browser no matter how many leases are outstanding.
    ///
    /// Intended for daemon shutdown. Once this returns, the cache is empty
    /// and the next acquire launches from scratch.
    pub async fn shutdown(&self) {
        let mut state = self.inner.lock().await;
        let running = state.handle.take();
        state.shared = None;
        if let Some(to_close) = running {
            // Best-effort: a failing close is ignored.
            let _ = to_close.close().await;
        }
    }

    /// The configured idle timeout.
    fn idle_timeout(&self) -> Duration {
        self.idle_timeout
    }

    /// A new reference to the shared lease tracker.
    fn active(&self) -> Arc<ActiveTracker> {
        self.active.clone()
    }
}
/// Spawns the background idle reaper for `mgr` and returns its join handle.
///
/// The spawned task runs until `cancel` fires. When the manager's
/// `idle_timeout` is zero the task performs no teardown at all and simply
/// parks until cancellation. Otherwise it loops:
/// 1. Waits on the idle signal (fired when the active count hits zero).
/// 2. Counts down `idle_timeout`. A new idle signal during the countdown
///    means the browser was leased and released again, so the countdown is
///    restarted — the browser is only closed after a full, uninterrupted
///    `idle_timeout` without leases.
/// 3. Re-checks the counter under the state mutex — if still zero, takes the
///    handle and closes it; if a lease is outstanding, goes back to step 1.
pub fn spawn_idle_reaper(mgr: Arc<BrowserManager>, cancel: CancellationToken) -> JoinHandle<()> {
    tokio::spawn(async move {
        let idle_timeout = mgr.idle_timeout();
        if idle_timeout.is_zero() {
            // Reaping disabled: block until cancellation, then exit.
            cancel.cancelled().await;
            return;
        }
        let active = mgr.active();
        loop {
            // Phase 1: wait for the active count to drop to zero.
            tokio::select! {
                _ = cancel.cancelled() => return,
                _ = active.idle_signal().notified() => {}
            }
            // Phase 2: idle countdown, restarted by every new 1 -> 0 transition.
            loop {
                if active.current() > 0 {
                    // Busy again; the eventual release re-arms phase 1.
                    break;
                }
                tokio::select! {
                    _ = cancel.cancelled() => return,
                    // A lease came and went while we were counting down:
                    // the browser was just used, so start the countdown over.
                    _ = active.idle_signal().notified() => continue,
                    _ = tokio::time::sleep(idle_timeout) => {}
                }
                // `acquire` bumps the counter while holding this mutex, so
                // checking under the lock cannot race with a new lease.
                let mut guard = mgr.inner.lock().await;
                if active.current() > 0 {
                    // A worker grabbed a lease during the sleep — abort teardown.
                    break;
                }
                let handle = guard.handle.take();
                guard.shared = None;
                // Release the lock before the (potentially slow) close.
                drop(guard);
                if let Some(h) = handle {
                    let _ = h.close().await;
                    tracing::info!("BrowserManager: idle teardown — Chromium closed");
                }
                break;
            }
        }
    })
}
/// A worker-side handle that keeps the browser alive while in scope.
/// `Deref<Target = Browser>` so callers can pass `&*lease` to APIs that
/// expect `&Browser`.
///
/// Created by [`BrowserManager::acquire`], which bumps the active-lease
/// counter; dropping the lease decrements it again.
pub struct BrowserLease {
// Shared reference to the browser this lease was issued for.
browser: Arc<Browser>,
// Tracker whose count this lease contributes to; released on drop.
active: Arc<ActiveTracker>,
}
impl Deref for BrowserLease {
    type Target = Browser;

    /// Borrows the leased browser.
    fn deref(&self) -> &Self::Target {
        &*self.browser
    }
}
impl Drop for BrowserLease {
    /// Gives the lease back to the tracker, decrementing the active count
    /// (and firing the idle signal if this was the last lease).
    fn drop(&mut self) {
        let tracker = &self.active;
        tracker.release();
    }
}
// Unit tests for the pieces that can run without launching Chromium.
#[cfg(test)]
mod tests {
use super::*;
use std::sync::atomic::AtomicBool;
// Compile-time check: the no-op hook can be shared across threads.
#[test]
fn noop_on_launch_is_send_sync() {
fn assert_send_sync<T: Send + Sync>(_: &T) {}
let h = noop_on_launch();
assert_send_sync(&h);
}
// The idle signal must fire when the count goes 1 -> 0, and not on a
// release that leaves leases outstanding (2 -> 1).
#[tokio::test]
async fn active_tracker_signals_idle_only_on_zero_transition() {
let tracker = ActiveTracker::new();
let signaled = Arc::new(AtomicBool::new(false));
{
// Background waiter: flips `signaled` once the idle signal is received.
let s = Arc::clone(&signaled);
let t = Arc::clone(&tracker);
tokio::spawn(async move {
t.idle_signal().notified().await;
s.store(true, Ordering::Release);
});
}
tracker.acquire();
tracker.acquire();
assert_eq!(tracker.current(), 2);
// 2 -> 1: a lease is still outstanding, so no signal is expected.
tracker.release();
assert_eq!(tracker.current(), 1);
// Give the waiter task a chance to run before checking the flag.
tokio::time::sleep(Duration::from_millis(20)).await;
assert!(!signaled.load(Ordering::Acquire), "no idle signal at count 1");
// 1 -> 0: the last lease is gone, so the waiter should be woken.
tracker.release();
tokio::time::sleep(Duration::from_millis(20)).await;
assert_eq!(tracker.current(), 0);
assert!(
signaled.load(Ordering::Acquire),
"idle signal fires on 1 -> 0 transition"
);
}
}