feat: in-process crawler daemon with cron and worker pool (0.28.0)

The backend now boots an internal crawler daemon that runs a daily
metadata pass (CRAWLER_DAILY_AT in CRAWLER_TZ, advisory-lock guarded
for multi-replica safety) and drains SyncChapterContent jobs from
crawler_jobs through a worker pool. Chromium launches lazily on first
job and is torn down after CRAWLER_IDLE_TIMEOUT_S seconds of inactivity.

Modules:
- crawler::browser_manager — lazy-launch / idle-teardown wrapper
  around browser::Handle, with an on_launch hook that re-injects
  PHPSESSID on every fresh Chromium spawn.
- crawler::pipeline — run_metadata_pass (the shared discover/upsert
  /cover/sync-chapters loop) and the enqueue_bookmarked_pending helper
  used by the cron tick.
- crawler::daemon — cron task + worker pool, behind two trait seams
  (MetadataPass, ChapterDispatcher) so tests can inject stubs without
  standing up Chromium or a live source.

Behavior:
- CRAWLER_DAEMON=false skips daemon spawn entirely (default for tests).
- Catch-up tick fires on startup if the last persisted slot was missed.
- A SyncOutcome::SessionExpired sets a sticky AtomicBool; workers
  idle until operator restart with a refreshed PHPSESSID.
- Worker dispatch is wrapped in catch_unwind so a panicking handler
  marks the job failed instead of taking down the worker.
- Migration 0015 adds a small crawler_state k-v table for the
  last_metadata_tick_at watermark.

Dep additions: chrono-tz (IANA TZ parsing).

CLI (bin/crawler) reuses pipeline::run_metadata_pass and now holds
the browser via BrowserManager so the on_launch session injection
flow stays in one place. Inline chapter-content sync semantics are
unchanged — the queue is for the daemon; force-refetches and manual
backfills still bypass it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-25 20:32:02 +02:00
parent 93c7fd63fc
commit 9fe0f26d75
14 changed files with 2162 additions and 309 deletions

View File

@@ -1,14 +1,25 @@
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
use anyhow::Context;
use async_trait::async_trait;
use axum::extract::DefaultBodyLimit;
use axum::http::{HeaderName, HeaderValue, Method};
use axum::Router;
use sqlx::postgres::PgPoolOptions;
use sqlx::PgPool;
use tokio_util::sync::CancellationToken;
use tower_http::cors::{AllowOrigin, CorsLayer};
use tower_http::trace::TraceLayer;
use crate::config::{AuthConfig, Config, UploadConfig};
use crate::config::{AuthConfig, Config, CrawlerConfig, UploadConfig};
use crate::crawler::browser_manager::{self, BrowserManager};
use crate::crawler::content::{self, SyncOutcome};
use crate::crawler::daemon::{self, ChapterDispatcher, DaemonConfig, MetadataPass};
use crate::crawler::jobs::JobPayload;
use crate::crawler::pipeline::{self, MetadataStats};
use crate::crawler::rate_limit::HostRateLimiters;
use crate::crawler::session;
use crate::storage::{LocalStorage, Storage};
#[derive(Clone)]
@@ -19,7 +30,23 @@ pub struct AppState {
pub upload: UploadConfig,
}
pub async fn build(config: Config) -> anyhow::Result<Router> {
/// Bundle returned by [`build`]. The router is what `axum::serve` consumes;
/// the daemon (when enabled) outlives the HTTP server and is awaited via
/// [`AppHandle::shutdown`] after the listener has finished gracefully.
pub struct AppHandle {
/// Router handed to the HTTP server.
pub router: Router,
/// Handle to the crawler daemon; `None` when the daemon was not spawned.
pub daemon: Option<daemon::DaemonHandle>,
}
impl AppHandle {
    /// Shuts the crawler daemon down and waits for that shutdown to complete.
    ///
    /// Consumes the handle. Returns immediately when no daemon was spawned.
    pub async fn shutdown(self) {
        let Some(handle) = self.daemon else {
            return;
        };
        handle.shutdown().await;
    }
}
pub async fn build(config: Config) -> anyhow::Result<AppHandle> {
let db = PgPoolOptions::new()
.max_connections(10)
.connect(&config.database_url)
@@ -28,13 +55,235 @@ pub async fn build(config: Config) -> anyhow::Result<Router> {
let storage: Arc<dyn Storage> = Arc::new(LocalStorage::new(config.storage_dir.clone()));
let daemon = if config.crawler.daemon_enabled {
Some(spawn_crawler_daemon(db.clone(), Arc::clone(&storage), &config.crawler).await?)
} else {
tracing::info!("crawler daemon disabled (CRAWLER_DAEMON=false)");
None
};
let state = AppState {
db,
storage,
auth: config.auth.clone(),
upload: config.upload.clone(),
};
Ok(router(state).layer(cors_layer(&config.cors_allowed_origins)))
let router = router(state).layer(cors_layer(&config.cors_allowed_origins));
Ok(AppHandle { router, daemon })
}
/// Wires up and starts the in-process crawler daemon.
///
/// Builds the cookie-seeded reqwest client, the per-host rate limiters and
/// the [`BrowserManager`], then hands the real [`MetadataPass`] and
/// [`ChapterDispatcher`] implementations to [`daemon::spawn`].
///
/// # Errors
///
/// Fails when `CRAWLER_START_URL` cannot be parsed for the cookie seed, when
/// reqwest rejects the proxy setting, or when the reqwest client cannot be
/// built.
async fn spawn_crawler_daemon(
    db: PgPool,
    storage: Arc<dyn Storage>,
    cfg: &CrawlerConfig,
) -> anyhow::Result<daemon::DaemonHandle> {
    use std::time::Duration;

    // Session wiring is all-or-nothing: it only happens when the session id,
    // the cookie domain and the start URL are all configured.
    let session_creds = match (&cfg.phpsessid, &cfg.cookie_domain, &cfg.start_url) {
        (Some(sid), Some(domain), Some(start_url)) => Some((sid, domain, start_url)),
        _ => None,
    };

    // Reqwest client whose cookie jar is pre-seeded so CDN image fetches
    // carry PHPSESSID. Same shape as bin/crawler.rs main().
    let jar = Arc::new(reqwest::cookie::Jar::default());
    if let Some((sid, domain, start_url)) = session_creds {
        let seed_url = reqwest::Url::parse(start_url)
            .context("parse CRAWLER_START_URL for cookie seed")?;
        jar.add_cookie_str(
            &format!("PHPSESSID={sid}; Domain={domain}; Path=/"),
            &seed_url,
        );
    }

    let mut client = reqwest::Client::builder()
        .timeout(Duration::from_secs(30))
        .no_proxy()
        .cookie_provider(jar);
    if let Some(ua) = &cfg.user_agent {
        client = client.user_agent(ua);
    }
    if let Some(proxy) = &cfg.proxy {
        let proxy_rule =
            reqwest::Proxy::all(proxy).with_context(|| format!("parse proxy: {proxy}"))?;
        client = client.proxy(proxy_rule);
    }
    let http = client.build().context("build crawler reqwest")?;

    // Default per-host delay, with an optional override for the CDN host.
    let mut limiters = HostRateLimiters::new(Duration::from_millis(cfg.rate_ms));
    if let Some(cdn_host) = &cfg.cdn_host {
        limiters = limiters.with_override(cdn_host, Duration::from_millis(cfg.cdn_rate_ms));
    }
    let rate = Arc::new(limiters);

    // Chromium goes through the same proxy as the reqwest client.
    let mut launch_opts = cfg.browser.clone();
    launch_opts.extra_args.extend(
        cfg.proxy
            .iter()
            .map(|proxy| format!("--proxy-server={proxy}")),
    );

    // The launch hook re-injects PHPSESSID on every fresh chromium spawn, so
    // an idle teardown followed by a re-launch stays authenticated without
    // operator action.
    let on_launch = match session_creds {
        Some((sid, domain, start_url)) => {
            let (sid, domain, start_url) = (sid.clone(), domain.clone(), start_url.clone());
            let hook: browser_manager::OnLaunch = Arc::new(move |browser| {
                // The hook can run many times; each invocation needs its own
                // owned copies to move into the returned future.
                let (sid, domain, start_url) = (sid.clone(), domain.clone(), start_url.clone());
                Box::pin(async move {
                    session::inject_phpsessid(&browser, &sid, &domain)
                        .await
                        .context("on_launch: inject_phpsessid")?;
                    session::verify_session(&browser, &start_url)
                        .await
                        .context("on_launch: verify_session")?;
                    Ok(())
                })
            });
            hook
        }
        None => browser_manager::noop_on_launch(),
    };
    let manager = BrowserManager::new(launch_opts, cfg.idle_timeout, on_launch);

    // A metadata pass exists only when a start URL is configured.
    let metadata_pass: Option<Arc<dyn MetadataPass>> = cfg.start_url.as_ref().map(|url| {
        Arc::new(RealMetadataPass {
            browser_manager: Arc::clone(&manager),
            db: db.clone(),
            storage: Arc::clone(&storage),
            http: http.clone(),
            rate: Arc::clone(&rate),
            start_url: url.clone(),
        }) as Arc<dyn MetadataPass>
    });
    let dispatcher = Arc::new(RealChapterDispatcher {
        browser_manager: Arc::clone(&manager),
        db: db.clone(),
        storage: Arc::clone(&storage),
        http,
        rate: Arc::clone(&rate),
    }) as Arc<dyn ChapterDispatcher>;

    // One token cancels both the daemon and the BrowserManager's idle
    // reaper. The reaper is passed along as an extra task so that
    // DaemonHandle::shutdown awaits its completion.
    let cancel = CancellationToken::new();
    let reaper_task = browser_manager::spawn_idle_reaper(Arc::clone(&manager), cancel.clone());

    // Close the browser explicitly on shutdown as well, rather than relying
    // on kill-on-drop while other Arc<Browser> holders may still exist.
    let shutdown_task = tokio::spawn({
        let cancel = cancel.clone();
        let manager = Arc::clone(&manager);
        async move {
            cancel.cancelled().await;
            manager.shutdown().await;
        }
    });

    Ok(daemon::spawn(
        db,
        cancel,
        DaemonConfig {
            metadata_pass,
            dispatcher,
            chapter_workers: cfg.chapter_workers,
            daily_at: cfg.daily_at,
            tz: cfg.tz,
            retention_days: cfg.retention_days,
            session_expired: Arc::new(AtomicBool::new(false)),
            extra_tasks: vec![reaper_task, shutdown_task],
        },
    ))
}
// Real impls of the daemon traits, owning the browser manager + I/O. Kept
// in app.rs because they need the same builder-side env wiring that
// AppState gets — the daemon module itself stays free of reqwest / storage
// details so its tests don't pull them in.
/// [`MetadataPass`] implementation that forwards to
/// [`pipeline::run_metadata_pass`] using the resources held here.
struct RealMetadataPass {
// Every field below is handed by reference to `pipeline::run_metadata_pass`.
browser_manager: Arc<BrowserManager>,
db: PgPool,
storage: Arc<dyn Storage>,
http: reqwest::Client,
rate: Arc<HostRateLimiters>,
// URL the metadata pass is run against.
start_url: String,
}
#[async_trait]
impl MetadataPass for RealMetadataPass {
async fn run(&self) -> anyhow::Result<MetadataStats> {
pipeline::run_metadata_pass(
&self.browser_manager,
&self.db,
self.storage.as_ref(),
&self.http,
&self.rate,
&self.start_url,
0,
false,
)
.await
}
}
/// [`ChapterDispatcher`] implementation that handles `SyncChapterContent`
/// jobs by calling `content::sync_chapter_content`.
struct RealChapterDispatcher {
// Source of the browser lease acquired for each chapter sync.
browser_manager: Arc<BrowserManager>,
// Used for the chapter lookup query and also passed to the sync call.
db: PgPool,
// The remaining fields are passed through to `content::sync_chapter_content`.
storage: Arc<dyn Storage>,
http: reqwest::Client,
rate: Arc<HostRateLimiters>,
}
#[async_trait]
impl ChapterDispatcher for RealChapterDispatcher {
    /// Handles one queued job.
    ///
    /// `SyncChapterContent` payloads are resolved to their manga id and
    /// source URL and then synced through `content::sync_chapter_content`.
    /// Every other payload kind, and any chapter whose lookup returns no
    /// row, yields `SyncOutcome::Skipped`.
    ///
    /// # Errors
    ///
    /// Propagates failures from the lookup query, from acquiring the browser
    /// lease, and from the sync itself.
    async fn dispatch(&self, payload: JobPayload) -> anyhow::Result<SyncOutcome> {
        // Other payload kinds aren't dispatched by this daemon yet —
        // metadata-driven jobs (Discover/SyncManga/SyncChapterList) are
        // handled inline by the cron's metadata pass.
        let JobPayload::SyncChapterContent { chapter_id, .. } = payload else {
            return Ok(SyncOutcome::Skipped);
        };

        // Resolve manga_id + source_url for this chapter.
        // NOTE(review): the payload's source_id is not used in this query and
        // LIMIT 1 has no ORDER BY, so a chapter with several chapter_sources
        // rows would get an arbitrary one — confirm this is intended.
        let lookup: Option<(uuid::Uuid, String)> = sqlx::query_as(
            "SELECT c.manga_id, cs.source_url \
             FROM chapters c \
             JOIN chapter_sources cs ON cs.chapter_id = c.id \
             WHERE c.id = $1 \
             LIMIT 1",
        )
        .bind(chapter_id)
        .fetch_optional(&self.db)
        .await
        .context("look up chapter for dispatch")?;

        let (manga_id, source_url) = match lookup {
            Some(found) => found,
            // Chapter (or its source row) is gone — ack done.
            None => return Ok(SyncOutcome::Skipped),
        };

        // The lease lives only for the duration of the sync and is released
        // before the outcome is returned.
        let outcome = {
            let lease = self.browser_manager.acquire().await?;
            content::sync_chapter_content(
                &lease,
                &self.db,
                self.storage.as_ref(),
                &self.http,
                &self.rate,
                chapter_id,
                manga_id,
                &source_url,
                false,
            )
            .await?
        };
        Ok(outcome)
    }
}
/// Build a router from a pre-assembled state. Used by integration tests