feat: in-process crawler daemon with cron and worker pool (0.28.0)
The backend now boots an internal crawler daemon that runs a daily metadata pass (CRAWLER_DAILY_AT in CRAWLER_TZ, advisory-lock guarded for multi-replica safety) and drains SyncChapterContent jobs from crawler_jobs through a worker pool. Chromium launches lazily on first job and is torn down after CRAWLER_IDLE_TIMEOUT_S seconds of inactivity. Modules: - crawler::browser_manager — lazy-launch / idle-teardown wrapper around browser::Handle, with an on_launch hook that re-injects PHPSESSID on every fresh Chromium spawn. - crawler::pipeline — run_metadata_pass (the shared discover/upsert/cover/sync-chapters loop) and the enqueue_bookmarked_pending helper used by the cron tick. - crawler::daemon — cron task + worker pool, behind two trait seams (MetadataPass, ChapterDispatcher) so tests can inject stubs without standing up Chromium or a live source. Behavior: - CRAWLER_DAEMON=false skips daemon spawn entirely (default for tests). - Catch-up tick fires on startup if the last persisted slot was missed. - A SyncOutcome::SessionExpired sets a sticky AtomicBool; workers idle until operator restart with a refreshed PHPSESSID. - Worker dispatch is wrapped in catch_unwind so a panicking handler marks the job failed instead of taking down the worker. - Migration 0015 adds a small crawler_state k-v table for the last_metadata_tick_at watermark. Dep additions: chrono-tz (IANA TZ parsing). CLI (bin/crawler) reuses pipeline::run_metadata_pass and now holds the browser via BrowserManager so the on_launch session injection flow stays in one place. Inline chapter-content sync semantics are unchanged — the queue is for the daemon; force-refetches and manual backfills still bypass it. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
262
backend/src/crawler/browser_manager.rs
Normal file
262
backend/src/crawler/browser_manager.rs
Normal file
@@ -0,0 +1,262 @@
|
||||
//! Lazy-launch / idle-teardown Chromium manager for the daemon.
|
||||
//!
|
||||
//! The first worker that calls [`BrowserManager::acquire`] triggers a real
|
||||
//! Chromium launch (and the `on_launch` hook — used to re-inject the
|
||||
//! PHPSESSID cookie on every fresh process). Each acquire bumps an active
|
||||
//! counter; the returned [`BrowserLease`] decrements it on drop.
|
||||
//!
|
||||
//! When the active counter hits zero, a background reaper task waits
|
||||
//! `idle_timeout`. If still zero on wake, it closes Chromium and clears the
|
||||
//! cached handle. The next acquire re-launches.
|
||||
//!
|
||||
//! `idle_timeout = Duration::ZERO` disables the reaper — Chromium stays alive
|
||||
//! until [`BrowserManager::shutdown`].
|
||||
|
||||
use std::ops::Deref;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use chromiumoxide::browser::Browser;
|
||||
use futures_util::future::BoxFuture;
|
||||
use tokio::sync::{Mutex, Notify};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::crawler::browser::{self, LaunchOptions};
|
||||
|
||||
/// Hook invoked on every fresh launch with the new browser. Typically used
|
||||
/// to re-inject PHPSESSID + run the session probe. Errors abort the
|
||||
/// `acquire` that triggered the launch — the next acquire will re-launch.
|
||||
pub type OnLaunch =
|
||||
Arc<dyn Fn(Arc<Browser>) -> BoxFuture<'static, anyhow::Result<()>> + Send + Sync>;
|
||||
|
||||
/// Returns an `OnLaunch` that does nothing — useful when no session is
|
||||
/// configured (e.g. CLI metadata-only runs).
|
||||
pub fn noop_on_launch() -> OnLaunch {
|
||||
Arc::new(|_| Box::pin(async { Ok(()) }))
|
||||
}
|
||||
|
||||
/// Decoupled active-lease tracker. Owns the atomic counter and the idle
|
||||
/// notifier so the wiring is unit-testable without standing up a real
|
||||
/// `BrowserManager` (which would require launching Chromium).
|
||||
#[derive(Default)]
|
||||
pub(crate) struct ActiveTracker {
|
||||
counter: AtomicUsize,
|
||||
idle_signal: Notify,
|
||||
}
|
||||
|
||||
impl ActiveTracker {
|
||||
pub(crate) fn new() -> Arc<Self> {
|
||||
Arc::new(Self::default())
|
||||
}
|
||||
|
||||
pub(crate) fn acquire(self: &Arc<Self>) {
|
||||
self.counter.fetch_add(1, Ordering::AcqRel);
|
||||
}
|
||||
|
||||
pub(crate) fn release(self: &Arc<Self>) {
|
||||
if self.counter.fetch_sub(1, Ordering::AcqRel) == 1 {
|
||||
self.idle_signal.notify_one();
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn current(&self) -> usize {
|
||||
self.counter.load(Ordering::Acquire)
|
||||
}
|
||||
|
||||
pub(crate) fn idle_signal(&self) -> &Notify {
|
||||
&self.idle_signal
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BrowserManager {
|
||||
inner: Mutex<Inner>,
|
||||
active: Arc<ActiveTracker>,
|
||||
launch_opts: LaunchOptions,
|
||||
idle_timeout: Duration,
|
||||
on_launch: OnLaunch,
|
||||
}
|
||||
|
||||
struct Inner {
|
||||
handle: Option<browser::Handle>,
|
||||
shared: Option<Arc<Browser>>,
|
||||
}
|
||||
|
||||
impl BrowserManager {
|
||||
pub fn new(
|
||||
launch_opts: LaunchOptions,
|
||||
idle_timeout: Duration,
|
||||
on_launch: OnLaunch,
|
||||
) -> Arc<Self> {
|
||||
Arc::new(Self {
|
||||
inner: Mutex::new(Inner {
|
||||
handle: None,
|
||||
shared: None,
|
||||
}),
|
||||
active: ActiveTracker::new(),
|
||||
launch_opts,
|
||||
idle_timeout,
|
||||
on_launch,
|
||||
})
|
||||
}
|
||||
|
||||
/// Acquire a shared browser lease. The first acquire after a teardown
|
||||
/// launches a fresh Chromium (and runs `on_launch`); subsequent acquires
|
||||
/// while a process is alive just bump the counter and clone the `Arc`.
|
||||
pub async fn acquire(&self) -> anyhow::Result<BrowserLease> {
|
||||
let mut guard = self.inner.lock().await;
|
||||
if guard.handle.is_none() {
|
||||
let handle = browser::launch(self.launch_opts.clone())
|
||||
.await
|
||||
.context("BrowserManager: launch chromium")?;
|
||||
let shared = handle.shared();
|
||||
// Run the on-launch hook before publishing the handle so a session
|
||||
// probe failure doesn't leave a half-initialized browser behind.
|
||||
if let Err(e) = (self.on_launch)(Arc::clone(&shared)).await {
|
||||
// Close the just-launched browser since we won't be using it.
|
||||
let _ = handle.close().await;
|
||||
return Err(e.context("BrowserManager: on_launch hook failed"));
|
||||
}
|
||||
guard.handle = Some(handle);
|
||||
guard.shared = Some(shared);
|
||||
}
|
||||
let browser = guard
|
||||
.shared
|
||||
.as_ref()
|
||||
.expect("shared set above")
|
||||
.clone();
|
||||
self.active.acquire();
|
||||
Ok(BrowserLease {
|
||||
browser,
|
||||
active: Arc::clone(&self.active),
|
||||
})
|
||||
}
|
||||
|
||||
/// Forcefully close the cached browser regardless of active count.
|
||||
/// Used on daemon shutdown. After this returns the next acquire will
|
||||
/// re-launch from scratch.
|
||||
pub async fn shutdown(&self) {
|
||||
let mut guard = self.inner.lock().await;
|
||||
guard.shared = None;
|
||||
if let Some(handle) = guard.handle.take() {
|
||||
let _ = handle.close().await;
|
||||
}
|
||||
}
|
||||
|
||||
fn idle_timeout(&self) -> Duration {
|
||||
self.idle_timeout
|
||||
}
|
||||
|
||||
fn active(&self) -> Arc<ActiveTracker> {
|
||||
Arc::clone(&self.active)
|
||||
}
|
||||
}
|
||||
|
||||
/// Background reaper. Returns immediately when `idle_timeout == 0`.
|
||||
/// Otherwise spawns a task that:
|
||||
/// 1. Waits on `idle_signal` (woken when active hits zero).
|
||||
/// 2. Sleeps `idle_timeout`.
|
||||
/// 3. Re-checks the counter under the mutex — if still zero, takes the
|
||||
/// handle and closes it.
|
||||
///
|
||||
/// Repeats forever until `cancel` fires.
|
||||
pub fn spawn_idle_reaper(mgr: Arc<BrowserManager>, cancel: CancellationToken) -> JoinHandle<()> {
|
||||
tokio::spawn(async move {
|
||||
if mgr.idle_timeout().is_zero() {
|
||||
// Block until cancellation, then exit.
|
||||
cancel.cancelled().await;
|
||||
return;
|
||||
}
|
||||
let active = mgr.active();
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => return,
|
||||
_ = active.idle_signal().notified() => {}
|
||||
}
|
||||
if active.current() > 0 {
|
||||
continue;
|
||||
}
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => return,
|
||||
_ = tokio::time::sleep(mgr.idle_timeout()) => {}
|
||||
}
|
||||
let mut guard = mgr.inner.lock().await;
|
||||
if active.current() > 0 {
|
||||
// A worker grabbed a lease during the sleep — abort teardown.
|
||||
continue;
|
||||
}
|
||||
let handle = guard.handle.take();
|
||||
guard.shared = None;
|
||||
drop(guard);
|
||||
if let Some(h) = handle {
|
||||
let _ = h.close().await;
|
||||
tracing::info!("BrowserManager: idle teardown — Chromium closed");
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// A worker-side handle that keeps the browser alive while in scope.
|
||||
/// `Deref<Target = Browser>` so callers can pass `&*lease` to APIs that
|
||||
/// expect `&Browser`.
|
||||
pub struct BrowserLease {
|
||||
browser: Arc<Browser>,
|
||||
active: Arc<ActiveTracker>,
|
||||
}
|
||||
|
||||
impl Deref for BrowserLease {
|
||||
type Target = Browser;
|
||||
fn deref(&self) -> &Browser {
|
||||
&self.browser
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for BrowserLease {
|
||||
fn drop(&mut self) {
|
||||
self.active.release();
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::AtomicBool;

    /// Compile-time check that the no-op hook can be shared across threads.
    #[test]
    fn noop_on_launch_is_send_sync() {
        fn require_send_sync<T: Send + Sync>(_value: &T) {}

        let hook = noop_on_launch();
        require_send_sync(&hook);
    }

    /// The idle signal must fire on the 1 -> 0 transition and not on 2 -> 1.
    #[tokio::test]
    async fn active_tracker_signals_idle_only_on_zero_transition() {
        let tracker = ActiveTracker::new();
        let signaled = Arc::new(AtomicBool::new(false));

        // Background waiter that flips the flag once the idle signal fires.
        let waiter_tracker = Arc::clone(&tracker);
        let waiter_flag = Arc::clone(&signaled);
        tokio::spawn(async move {
            waiter_tracker.idle_signal().notified().await;
            waiter_flag.store(true, Ordering::Release);
        });

        tracker.acquire();
        tracker.acquire();
        assert_eq!(tracker.current(), 2);

        // 2 -> 1: still one lease out, so the waiter must stay parked.
        tracker.release();
        assert_eq!(tracker.current(), 1);
        tokio::time::sleep(Duration::from_millis(20)).await;
        let fired_early = signaled.load(Ordering::Acquire);
        assert!(!fired_early, "no idle signal at count 1");

        // 1 -> 0: the waiter must be woken.
        tracker.release();
        tokio::time::sleep(Duration::from_millis(20)).await;
        assert_eq!(tracker.current(), 0);
        assert!(
            signaled.load(Ordering::Acquire),
            "idle signal fires on 1 -> 0 transition"
        );
    }
}
|
||||
Reference in New Issue
Block a user