Compare commits
8 Commits
b1a3a4e9d3
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b845d88766 | ||
|
|
9fe0f26d75 | ||
|
|
93c7fd63fc | ||
|
|
89b84252a5 | ||
|
|
728d704a66 | ||
|
|
d24e68c78d | ||
|
|
51346227dd | ||
|
|
c51353ead3 |
85
backend/Cargo.lock
generated
85
backend/Cargo.lock
generated
@@ -397,6 +397,28 @@ dependencies = [
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chrono-tz"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "93698b29de5e97ad0ae26447b344c482a7284c737d9ddc5f9e52b74a336671bb"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"chrono-tz-build",
|
||||
"phf 0.11.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chrono-tz-build"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c088aee841df9c3041febbb73934cfc39708749bf96dc827e3359cd39ef11b1"
|
||||
dependencies = [
|
||||
"parse-zoneinfo",
|
||||
"phf 0.11.3",
|
||||
"phf_codegen 0.11.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "concurrent-queue"
|
||||
version = "2.5.0"
|
||||
@@ -423,6 +445,24 @@ dependencies = [
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cookie_store"
|
||||
version = "0.22.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "15b2c103cf610ec6cae3da84a766285b42fd16aad564758459e6ecf128c75206"
|
||||
dependencies = [
|
||||
"cookie",
|
||||
"document-features",
|
||||
"idna",
|
||||
"log",
|
||||
"publicsuffix",
|
||||
"serde",
|
||||
"serde_derive",
|
||||
"serde_json",
|
||||
"time",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation-sys"
|
||||
version = "0.8.7"
|
||||
@@ -601,6 +641,15 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "document-features"
|
||||
version = "0.2.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61"
|
||||
dependencies = [
|
||||
"litrs",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dotenvy"
|
||||
version = "0.15.7"
|
||||
@@ -1386,6 +1435,12 @@ version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0"
|
||||
|
||||
[[package]]
|
||||
name = "litrs"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.14"
|
||||
@@ -1415,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
||||
|
||||
[[package]]
|
||||
name = "mangalord"
|
||||
version = "0.23.0"
|
||||
version = "0.29.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"argon2",
|
||||
@@ -1426,6 +1481,7 @@ dependencies = [
|
||||
"bytes",
|
||||
"chromiumoxide",
|
||||
"chrono",
|
||||
"chrono-tz",
|
||||
"dotenvy",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
@@ -1835,6 +1891,15 @@ dependencies = [
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parse-zoneinfo"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1f2a05b18d44e2957b88f96ba460715e295bc1d7510468a2f3d3b44535d26c24"
|
||||
dependencies = [
|
||||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "password-hash"
|
||||
version = "0.5.0"
|
||||
@@ -2039,6 +2104,22 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "psl-types"
|
||||
version = "2.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "33cb294fe86a74cbcf50d4445b37da762029549ebeea341421c7c70370f86cac"
|
||||
|
||||
[[package]]
|
||||
name = "publicsuffix"
|
||||
version = "2.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f42ea446cab60335f76979ec15e12619a2165b5ae2c12166bef27d283a9fadf"
|
||||
dependencies = [
|
||||
"idna",
|
||||
"psl-types",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quinn"
|
||||
version = "0.11.9"
|
||||
@@ -2240,6 +2321,8 @@ checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147"
|
||||
dependencies = [
|
||||
"base64",
|
||||
"bytes",
|
||||
"cookie",
|
||||
"cookie_store",
|
||||
"futures-core",
|
||||
"http",
|
||||
"http-body",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "mangalord"
|
||||
version = "0.23.0"
|
||||
version = "0.29.0"
|
||||
edition = "2021"
|
||||
default-run = "mangalord"
|
||||
|
||||
@@ -23,6 +23,7 @@ serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
uuid = { version = "1", features = ["v4", "serde"] }
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
chrono-tz = "0.9"
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
tower = { version = "0.5", features = ["util"] }
|
||||
@@ -45,7 +46,7 @@ futures-util = "0.3"
|
||||
bytes = "1"
|
||||
chromiumoxide = { version = "0.7", features = ["tokio-runtime", "_fetcher-rusttls-tokio"], default-features = false }
|
||||
scraper = "0.20"
|
||||
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "socks"] }
|
||||
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "socks", "cookies"] }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3"
|
||||
|
||||
18
backend/migrations/0013_drop_chapters_unique_number.sql
Normal file
18
backend/migrations/0013_drop_chapters_unique_number.sql
Normal file
@@ -0,0 +1,18 @@
|
||||
-- Why the uniqueness goes away
-- ----------------------------
-- A chapter number is not a unique identity in practice. Sources publish
-- several chapters under one number (different uploaders, translator
-- notices/farewells, paid-vs-free re-uploads), and our own users can
-- legitimately keep two versions of "Ch.52" from different scanlations.
-- With the (manga_id, number) UNIQUE from 0001_init in place, ON CONFLICT
-- silently folds all of those into a single row and data is lost. From here
-- on the chapter id (UUID) is the only chapter identity.

ALTER TABLE chapters
    DROP CONSTRAINT chapters_manga_id_number_key;

-- Why an index comes back
-- -----------------------
-- 0007 dropped the redundant explicit index, which left the UNIQUE as the
-- only index on (manga_id, number). Chapter list pages ORDER BY number ASC
-- and the manga page is a hot read path, so recreate the index without the
-- uniqueness. created_at is the secondary sort key so that chapters sharing
-- a number keep a stable order in lists and in prev/next navigation.

CREATE INDEX chapters_manga_id_number_idx
    ON chapters (manga_id, number, created_at);
|
||||
15
backend/migrations/0014_crawler_jobs_dedup_index.sql
Normal file
15
backend/migrations/0014_crawler_jobs_dedup_index.sql
Normal file
@@ -0,0 +1,15 @@
|
||||
-- De-duplicate in-flight SyncChapterContent jobs.
--
-- Two producers enqueue these jobs: the bookmark hook and the daemon's cron.
-- A "check first, then insert" approach is racy under concurrency, so the
-- constraint lives in the database instead. With this partial unique index
-- both producers can issue a plain `INSERT ... ON CONFLICT DO NOTHING`:
--
--   * at most one job in state pending or running exists per chapter_id;
--   * once a job moves to done/failed/dead it no longer matches the index
--     predicate, so the chapter can be enqueued again (after the row is
--     reaped, or when a force-refetch is wanted).
--
-- The predicate is limited to sync_chapter_content payloads. Discover /
-- SyncManga / SyncChapterList jobs carry no chapter_id and stay un-deduped.

CREATE UNIQUE INDEX crawler_jobs_chapter_content_dedup_idx
    ON crawler_jobs ((payload->>'chapter_id'))
    WHERE state IN ('pending', 'running')
      AND payload->>'kind' = 'sync_chapter_content';
|
||||
12
backend/migrations/0015_crawler_state.sql
Normal file
12
backend/migrations/0015_crawler_state.sql
Normal file
@@ -0,0 +1,12 @@
|
||||
-- Persistent key-value store for crawler daemon state.
--
-- Anything the daemon must remember across restarts goes here. The only
-- consumer so far is the cron scheduler, which records
-- `last_metadata_tick_at`. On startup it uses that to notice a missed slot
-- (e.g. the backend was down at midnight) and fires immediately before
-- returning to the regular schedule.
--
-- `value` is JSONB so future keys can carry richer payloads without needing
-- another migration.

CREATE TABLE crawler_state (
    key        text        PRIMARY KEY,
    value      jsonb       NOT NULL,
    updated_at timestamptz NOT NULL DEFAULT now()
);
|
||||
@@ -13,6 +13,7 @@ use uuid::Uuid;
|
||||
use crate::api::pagination::PagedResponse;
|
||||
use crate::app::AppState;
|
||||
use crate::auth::extractor::CurrentUser;
|
||||
use crate::crawler::pipeline;
|
||||
use crate::domain::{Bookmark, BookmarkSummary};
|
||||
use crate::error::{AppError, AppResult};
|
||||
use crate::repo;
|
||||
@@ -86,6 +87,29 @@ async fn create(
|
||||
input.page,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Fire-and-forget: kick off content syncs for any pending chapters of
|
||||
// the newly-bookmarked manga. The dedup index makes this idempotent
|
||||
// across repeated bookmarks of the same manga; failure here must not
|
||||
// surface to the user (the daily cron sweeps anything missed).
|
||||
let pool = state.db.clone();
|
||||
let manga_id = input.manga_id;
|
||||
tokio::spawn(async move {
|
||||
match pipeline::enqueue_pending_for_manga(&pool, manga_id).await {
|
||||
Ok(summary) => tracing::info!(
|
||||
%manga_id,
|
||||
inserted = summary.inserted,
|
||||
skipped = summary.skipped,
|
||||
failed = summary.failed,
|
||||
"bookmark hook: enqueued pending chapters"
|
||||
),
|
||||
Err(e) => tracing::warn!(
|
||||
%manga_id, error = ?e,
|
||||
"bookmark hook: enqueue_pending_for_manga failed"
|
||||
),
|
||||
}
|
||||
});
|
||||
|
||||
Ok((StatusCode::CREATED, Json(bookmark)))
|
||||
}
|
||||
|
||||
|
||||
@@ -26,9 +26,9 @@ use crate::upload::{parse_image, UploadedImage};
|
||||
pub fn routes() -> Router<AppState> {
|
||||
Router::new()
|
||||
.route("/mangas/:manga_id/chapters", get(list).post(create))
|
||||
.route("/mangas/:manga_id/chapters/:number", get(get_one))
|
||||
.route("/mangas/:manga_id/chapters/:chapter_id", get(get_one))
|
||||
.route(
|
||||
"/mangas/:manga_id/chapters/:number/pages",
|
||||
"/mangas/:manga_id/chapters/:chapter_id/pages",
|
||||
get(list_pages),
|
||||
)
|
||||
}
|
||||
@@ -60,10 +60,10 @@ async fn list(
|
||||
|
||||
async fn get_one(
|
||||
State(state): State<AppState>,
|
||||
Path((manga_id, number)): Path<(Uuid, i32)>,
|
||||
Path((manga_id, chapter_id)): Path<(Uuid, Uuid)>,
|
||||
) -> AppResult<Json<Chapter>> {
|
||||
repo::manga::get(&state.db, manga_id).await?;
|
||||
let chapter = repo::chapter::find_by_manga_and_number(&state.db, manga_id, number)
|
||||
let chapter = repo::chapter::find_by_id_in_manga(&state.db, manga_id, chapter_id)
|
||||
.await?
|
||||
.ok_or(AppError::NotFound)?;
|
||||
Ok(Json(chapter))
|
||||
@@ -164,10 +164,10 @@ struct PagesResponse {
|
||||
|
||||
async fn list_pages(
|
||||
State(state): State<AppState>,
|
||||
Path((manga_id, number)): Path<(Uuid, i32)>,
|
||||
Path((manga_id, chapter_id)): Path<(Uuid, Uuid)>,
|
||||
) -> AppResult<Json<PagesResponse>> {
|
||||
repo::manga::get(&state.db, manga_id).await?;
|
||||
let chapter = repo::chapter::find_by_manga_and_number(&state.db, manga_id, number)
|
||||
let chapter = repo::chapter::find_by_id_in_manga(&state.db, manga_id, chapter_id)
|
||||
.await?
|
||||
.ok_or(AppError::NotFound)?;
|
||||
let pages = repo::page::list_for_chapter(&state.db, chapter.id).await?;
|
||||
|
||||
@@ -1,14 +1,25 @@
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
|
||||
use anyhow::Context;
|
||||
use async_trait::async_trait;
|
||||
use axum::extract::DefaultBodyLimit;
|
||||
use axum::http::{HeaderName, HeaderValue, Method};
|
||||
use axum::Router;
|
||||
use sqlx::postgres::PgPoolOptions;
|
||||
use sqlx::PgPool;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tower_http::cors::{AllowOrigin, CorsLayer};
|
||||
use tower_http::trace::TraceLayer;
|
||||
|
||||
use crate::config::{AuthConfig, Config, UploadConfig};
|
||||
use crate::config::{AuthConfig, Config, CrawlerConfig, UploadConfig};
|
||||
use crate::crawler::browser_manager::{self, BrowserManager};
|
||||
use crate::crawler::content::{self, SyncOutcome};
|
||||
use crate::crawler::daemon::{self, ChapterDispatcher, DaemonConfig, MetadataPass};
|
||||
use crate::crawler::jobs::JobPayload;
|
||||
use crate::crawler::pipeline::{self, MetadataStats};
|
||||
use crate::crawler::rate_limit::HostRateLimiters;
|
||||
use crate::crawler::session;
|
||||
use crate::storage::{LocalStorage, Storage};
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -19,7 +30,23 @@ pub struct AppState {
|
||||
pub upload: UploadConfig,
|
||||
}
|
||||
|
||||
pub async fn build(config: Config) -> anyhow::Result<Router> {
|
||||
/// Bundle returned by [`build`]. The router is what `axum::serve` consumes;
|
||||
/// the daemon (when enabled) outlives the HTTP server and is awaited via
|
||||
/// [`AppHandle::shutdown`] after the listener has finished gracefully.
|
||||
pub struct AppHandle {
|
||||
pub router: Router,
|
||||
pub daemon: Option<daemon::DaemonHandle>,
|
||||
}
|
||||
|
||||
impl AppHandle {
|
||||
pub async fn shutdown(self) {
|
||||
if let Some(d) = self.daemon {
|
||||
d.shutdown().await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn build(config: Config) -> anyhow::Result<AppHandle> {
|
||||
let db = PgPoolOptions::new()
|
||||
.max_connections(10)
|
||||
.connect(&config.database_url)
|
||||
@@ -28,13 +55,235 @@ pub async fn build(config: Config) -> anyhow::Result<Router> {
|
||||
|
||||
let storage: Arc<dyn Storage> = Arc::new(LocalStorage::new(config.storage_dir.clone()));
|
||||
|
||||
let daemon = if config.crawler.daemon_enabled {
|
||||
Some(spawn_crawler_daemon(db.clone(), Arc::clone(&storage), &config.crawler).await?)
|
||||
} else {
|
||||
tracing::info!("crawler daemon disabled (CRAWLER_DAEMON=false)");
|
||||
None
|
||||
};
|
||||
|
||||
let state = AppState {
|
||||
db,
|
||||
storage,
|
||||
auth: config.auth.clone(),
|
||||
upload: config.upload.clone(),
|
||||
};
|
||||
Ok(router(state).layer(cors_layer(&config.cors_allowed_origins)))
|
||||
let router = router(state).layer(cors_layer(&config.cors_allowed_origins));
|
||||
Ok(AppHandle { router, daemon })
|
||||
}
|
||||
|
||||
async fn spawn_crawler_daemon(
|
||||
db: PgPool,
|
||||
storage: Arc<dyn Storage>,
|
||||
cfg: &CrawlerConfig,
|
||||
) -> anyhow::Result<daemon::DaemonHandle> {
|
||||
// Reqwest client with cookie jar pre-seeded so CDN image fetches
|
||||
// include PHPSESSID. Same shape as bin/crawler.rs main().
|
||||
let cookie_jar = Arc::new(reqwest::cookie::Jar::default());
|
||||
if let (Some(sid), Some(domain), Some(start_url)) =
|
||||
(&cfg.phpsessid, &cfg.cookie_domain, &cfg.start_url)
|
||||
{
|
||||
let cookie_str = format!("PHPSESSID={sid}; Domain={domain}; Path=/");
|
||||
let seed_url = reqwest::Url::parse(start_url)
|
||||
.context("parse CRAWLER_START_URL for cookie seed")?;
|
||||
cookie_jar.add_cookie_str(&cookie_str, &seed_url);
|
||||
}
|
||||
let mut http_builder = reqwest::Client::builder()
|
||||
.timeout(std::time::Duration::from_secs(30))
|
||||
.no_proxy()
|
||||
.cookie_provider(cookie_jar);
|
||||
if let Some(ua) = &cfg.user_agent {
|
||||
http_builder = http_builder.user_agent(ua);
|
||||
}
|
||||
if let Some(proxy) = &cfg.proxy {
|
||||
http_builder = http_builder
|
||||
.proxy(reqwest::Proxy::all(proxy).with_context(|| format!("parse proxy: {proxy}"))?);
|
||||
}
|
||||
let http = http_builder.build().context("build crawler reqwest")?;
|
||||
|
||||
let mut rate = HostRateLimiters::new(std::time::Duration::from_millis(cfg.rate_ms));
|
||||
if let Some(host) = &cfg.cdn_host {
|
||||
rate = rate.with_override(host, std::time::Duration::from_millis(cfg.cdn_rate_ms));
|
||||
}
|
||||
let rate = Arc::new(rate);
|
||||
|
||||
// Browser manager. on_launch re-injects PHPSESSID on every fresh
|
||||
// chromium spawn so an idle teardown followed by re-launch stays
|
||||
// authenticated without operator action.
|
||||
let mut launch_opts = cfg.browser.clone();
|
||||
if let Some(proxy) = &cfg.proxy {
|
||||
launch_opts.extra_args.push(format!("--proxy-server={proxy}"));
|
||||
}
|
||||
let on_launch = match (&cfg.phpsessid, &cfg.cookie_domain, &cfg.start_url) {
|
||||
(Some(sid), Some(domain), Some(start_url)) => {
|
||||
let sid = sid.clone();
|
||||
let domain = domain.clone();
|
||||
let start_url = start_url.clone();
|
||||
let on_launch: browser_manager::OnLaunch = Arc::new(move |browser| {
|
||||
let sid = sid.clone();
|
||||
let domain = domain.clone();
|
||||
let start_url = start_url.clone();
|
||||
Box::pin(async move {
|
||||
session::inject_phpsessid(&browser, &sid, &domain)
|
||||
.await
|
||||
.context("on_launch: inject_phpsessid")?;
|
||||
session::verify_session(&browser, &start_url)
|
||||
.await
|
||||
.context("on_launch: verify_session")?;
|
||||
Ok(())
|
||||
})
|
||||
});
|
||||
on_launch
|
||||
}
|
||||
_ => browser_manager::noop_on_launch(),
|
||||
};
|
||||
let browser_manager = BrowserManager::new(launch_opts, cfg.idle_timeout, on_launch);
|
||||
|
||||
let session_expired = Arc::new(AtomicBool::new(false));
|
||||
|
||||
let metadata_pass: Option<Arc<dyn MetadataPass>> = cfg.start_url.as_ref().map(|url| {
|
||||
let m: Arc<dyn MetadataPass> = Arc::new(RealMetadataPass {
|
||||
browser_manager: Arc::clone(&browser_manager),
|
||||
db: db.clone(),
|
||||
storage: Arc::clone(&storage),
|
||||
http: http.clone(),
|
||||
rate: Arc::clone(&rate),
|
||||
start_url: url.clone(),
|
||||
});
|
||||
m
|
||||
});
|
||||
|
||||
let dispatcher: Arc<dyn ChapterDispatcher> = Arc::new(RealChapterDispatcher {
|
||||
browser_manager: Arc::clone(&browser_manager),
|
||||
db: db.clone(),
|
||||
storage: Arc::clone(&storage),
|
||||
http,
|
||||
rate: Arc::clone(&rate),
|
||||
});
|
||||
|
||||
// Shared cancellation: daemon shutdown cancels the BrowserManager's
|
||||
// idle reaper too. Reaper itself is added to the daemon's extra_tasks
|
||||
// so DaemonHandle::shutdown awaits its completion.
|
||||
let cancel = CancellationToken::new();
|
||||
let reaper_task = browser_manager::spawn_idle_reaper(
|
||||
Arc::clone(&browser_manager),
|
||||
cancel.clone(),
|
||||
);
|
||||
// Also close the browser explicitly on shutdown so we don't rely on
|
||||
// kill-on-drop when other Arc<Browser> holders may still exist.
|
||||
let shutdown_task = {
|
||||
let cancel = cancel.clone();
|
||||
let mgr = Arc::clone(&browser_manager);
|
||||
tokio::spawn(async move {
|
||||
cancel.cancelled().await;
|
||||
mgr.shutdown().await;
|
||||
})
|
||||
};
|
||||
|
||||
let daemon_handle = daemon::spawn(
|
||||
db,
|
||||
cancel,
|
||||
DaemonConfig {
|
||||
metadata_pass,
|
||||
dispatcher,
|
||||
chapter_workers: cfg.chapter_workers,
|
||||
daily_at: cfg.daily_at,
|
||||
tz: cfg.tz,
|
||||
retention_days: cfg.retention_days,
|
||||
session_expired,
|
||||
extra_tasks: vec![reaper_task, shutdown_task],
|
||||
},
|
||||
);
|
||||
|
||||
Ok(daemon_handle)
|
||||
}
|
||||
|
||||
// Real impls of the daemon traits, owning the browser manager + I/O. Kept
|
||||
// in app.rs because they need the same builder-side env wiring that
|
||||
// AppState gets — the daemon module itself stays free of reqwest / storage
|
||||
// details so its tests don't pull them in.
|
||||
|
||||
struct RealMetadataPass {
|
||||
browser_manager: Arc<BrowserManager>,
|
||||
db: PgPool,
|
||||
storage: Arc<dyn Storage>,
|
||||
http: reqwest::Client,
|
||||
rate: Arc<HostRateLimiters>,
|
||||
start_url: String,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl MetadataPass for RealMetadataPass {
|
||||
async fn run(&self) -> anyhow::Result<MetadataStats> {
|
||||
pipeline::run_metadata_pass(
|
||||
&self.browser_manager,
|
||||
&self.db,
|
||||
self.storage.as_ref(),
|
||||
&self.http,
|
||||
&self.rate,
|
||||
&self.start_url,
|
||||
0,
|
||||
false,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
struct RealChapterDispatcher {
|
||||
browser_manager: Arc<BrowserManager>,
|
||||
db: PgPool,
|
||||
storage: Arc<dyn Storage>,
|
||||
http: reqwest::Client,
|
||||
rate: Arc<HostRateLimiters>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ChapterDispatcher for RealChapterDispatcher {
|
||||
async fn dispatch(&self, payload: JobPayload) -> anyhow::Result<SyncOutcome> {
|
||||
match payload {
|
||||
JobPayload::SyncChapterContent {
|
||||
source_id: _,
|
||||
chapter_id,
|
||||
source_chapter_key: _,
|
||||
} => {
|
||||
// Look up manga_id + source_url for this chapter.
|
||||
let row: Option<(uuid::Uuid, String)> = sqlx::query_as(
|
||||
"SELECT c.manga_id, cs.source_url \
|
||||
FROM chapters c \
|
||||
JOIN chapter_sources cs ON cs.chapter_id = c.id \
|
||||
WHERE c.id = $1 \
|
||||
LIMIT 1",
|
||||
)
|
||||
.bind(chapter_id)
|
||||
.fetch_optional(&self.db)
|
||||
.await
|
||||
.context("look up chapter for dispatch")?;
|
||||
let Some((manga_id, source_url)) = row else {
|
||||
// Chapter (or its source row) is gone — ack done.
|
||||
return Ok(SyncOutcome::Skipped);
|
||||
};
|
||||
let lease = self.browser_manager.acquire().await?;
|
||||
let outcome = content::sync_chapter_content(
|
||||
&lease,
|
||||
&self.db,
|
||||
self.storage.as_ref(),
|
||||
&self.http,
|
||||
&self.rate,
|
||||
chapter_id,
|
||||
manga_id,
|
||||
&source_url,
|
||||
false,
|
||||
)
|
||||
.await?;
|
||||
drop(lease);
|
||||
Ok(outcome)
|
||||
}
|
||||
// Other payload kinds aren't dispatched by this daemon yet —
|
||||
// metadata-driven jobs (Discover/SyncManga/SyncChapterList)
|
||||
// are handled inline by the cron's metadata pass.
|
||||
_ => Ok(SyncOutcome::Skipped),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a router from a pre-assembled state. Used by integration tests
|
||||
|
||||
@@ -1,47 +1,39 @@
|
||||
//! Crawler binary.
|
||||
//!
|
||||
//! Walks the source's manga listing (all pages), fetches each manga's
|
||||
//! metadata + chapter list, downloads the cover into `Storage`, and
|
||||
//! reconciles everything into the DB. Chapter *content* (page images)
|
||||
//! is out of scope for now — only chapter rows + their source links
|
||||
//! are written.
|
||||
//! Now an ops escape hatch sitting alongside the in-process daemon: walks
|
||||
//! the source's manga listing (all pages), fetches each manga's metadata +
|
||||
//! chapter list, downloads covers, reconciles chapters — and then, for any
|
||||
//! chapter belonging to a bookmarked manga whose `page_count` is still 0,
|
||||
//! fetches the chapter pages inline. The daemon does the same work through
|
||||
//! `crawler_jobs`; the CLI is kept around for force-refetches and manual
|
||||
//! backfills.
|
||||
//!
|
||||
//! Configuration:
|
||||
//! - **Start URL** (required): first CLI positional arg, else
|
||||
//! `$CRAWLER_START_URL`. This is the manga *list* page (page 1).
|
||||
//! - **Database** (required): `$DATABASE_URL`.
|
||||
//! - **Storage dir**: `$STORAGE_DIR`, default `./data/storage` —
|
||||
//! matches the API binary so both write to the same local tree.
|
||||
//! - **Browser**: see `LaunchOptions::from_env` —
|
||||
//! `CRAWLER_BROWSER_MODE` (`headed`|`headless`) and
|
||||
//! `CRAWLER_BROWSER_ARGS`.
|
||||
//! - **Rate limit**: `CRAWLER_RATE_MS` (ms between requests, default
|
||||
//! `1000`).
|
||||
//! - **Cap**: `CRAWLER_LIMIT` (max manga detail fetches per run,
|
||||
//! default `0` = no cap).
|
||||
//! - **Skip chapters**: `CRAWLER_SKIP_CHAPTERS=1` — turn off the
|
||||
//! chapter selector in the parser AND skip the per-manga
|
||||
//! `sync_manga_chapters` write. Use this for "metadata only" runs.
|
||||
//! - **Proxy**: `$CRAWLER_PROXY` — single URL applied to both
|
||||
//! Chromium (`--proxy-server`) and `reqwest::Proxy::all`. Supports
|
||||
//! `http://`, `https://`, and `socks5://` (with optional user:pass).
|
||||
//! Example: `socks5://user:pass@host:1080`. Unset → direct.
|
||||
//! Configuration mirrors the daemon's `CRAWLER_*` env vars (see
|
||||
//! `crate::config::CrawlerConfig`) plus the CLI-only:
|
||||
//! - **Start URL**: first CLI positional arg, else `$CRAWLER_START_URL`.
|
||||
//! - **Skip chapters / chapter content / force re-fetch / keep browser**:
|
||||
//! `CRAWLER_SKIP_CHAPTERS`, `CRAWLER_SKIP_CHAPTER_CONTENT`,
|
||||
//! `CRAWLER_FORCE_REFETCH_CHAPTERS`, `CRAWLER_KEEP_BROWSER_OPEN`.
|
||||
//! - **Limit**: `CRAWLER_LIMIT` (max manga detail fetches per run).
|
||||
//!
|
||||
//! See `crawler::pipeline::run_metadata_pass` for the shared metadata
|
||||
//! flow.
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use mangalord::crawler::{
|
||||
browser::{self, LaunchOptions},
|
||||
rate_limit::RateLimiter,
|
||||
source::{target::TargetSource, DiscoverMode, FetchContext, Source},
|
||||
};
|
||||
use mangalord::repo;
|
||||
use futures_util::stream::{self, StreamExt};
|
||||
use mangalord::crawler::browser::{BrowserMode, LaunchOptions};
|
||||
use mangalord::crawler::browser_manager::{self, BrowserManager};
|
||||
use mangalord::crawler::content::{self, SyncOutcome};
|
||||
use mangalord::crawler::pipeline;
|
||||
use mangalord::crawler::rate_limit::HostRateLimiters;
|
||||
use mangalord::crawler::session;
|
||||
use mangalord::storage::{LocalStorage, Storage};
|
||||
use sqlx::postgres::PgPoolOptions;
|
||||
use sqlx::PgPool;
|
||||
use tokio::sync::Mutex;
|
||||
use tracing_subscriber::EnvFilter;
|
||||
use uuid::Uuid;
|
||||
|
||||
@@ -64,11 +56,29 @@ async fn main() -> anyhow::Result<()> {
|
||||
.unwrap_or_else(|_| "./data/storage".to_string())
|
||||
.into();
|
||||
let rate_ms = env_u64("CRAWLER_RATE_MS", 1000);
|
||||
let cdn_host = std::env::var("CRAWLER_CDN_HOST")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty());
|
||||
let cdn_rate_ms = env_u64("CRAWLER_CDN_RATE_MS", rate_ms);
|
||||
let limit = env_u64("CRAWLER_LIMIT", 0) as usize;
|
||||
let skip_chapters = env_bool("CRAWLER_SKIP_CHAPTERS", false);
|
||||
let skip_chapter_content = env_bool("CRAWLER_SKIP_CHAPTER_CONTENT", false);
|
||||
let chapter_workers = env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1) as usize;
|
||||
let force_refetch_chapters = env_bool("CRAWLER_FORCE_REFETCH_CHAPTERS", false);
|
||||
let phpsessid = std::env::var("CRAWLER_PHPSESSID")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty());
|
||||
let cookie_domain = std::env::var("CRAWLER_COOKIE_DOMAIN")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty())
|
||||
.or_else(|| session::registrable_domain(&start_url));
|
||||
let user_agent = std::env::var("CRAWLER_USER_AGENT")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty());
|
||||
let proxy_url = std::env::var("CRAWLER_PROXY")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty());
|
||||
let keep_browser_open = env_bool("CRAWLER_KEEP_BROWSER_OPEN", false);
|
||||
|
||||
let db = PgPoolOptions::new()
|
||||
.max_connections(5)
|
||||
@@ -79,13 +89,21 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let storage: Arc<dyn Storage> = Arc::new(LocalStorage::new(&storage_dir));
|
||||
|
||||
// `no_proxy()` disables reqwest's own env-based detection so the
|
||||
// single `CRAWLER_PROXY` knob is the only thing that influences
|
||||
// routing. Otherwise an unrelated `HTTPS_PROXY` in the shell would
|
||||
// silently route cover downloads while the browser stayed direct.
|
||||
let cookie_jar = Arc::new(reqwest::cookie::Jar::default());
|
||||
if let (Some(sid), Some(domain)) = (&phpsessid, &cookie_domain) {
|
||||
let cookie_str = format!("PHPSESSID={sid}; Domain={domain}; Path=/");
|
||||
let seed_url =
|
||||
reqwest::Url::parse(&start_url).context("parse start URL for cookie seed")?;
|
||||
cookie_jar.add_cookie_str(&cookie_str, &seed_url);
|
||||
tracing::info!(domain, "seeded PHPSESSID into reqwest cookie jar");
|
||||
}
|
||||
let mut http_builder = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(30))
|
||||
.no_proxy();
|
||||
.no_proxy()
|
||||
.cookie_provider(cookie_jar);
|
||||
if let Some(ua) = &user_agent {
|
||||
http_builder = http_builder.user_agent(ua);
|
||||
}
|
||||
if let Some(proxy) = &proxy_url {
|
||||
http_builder = http_builder
|
||||
.proxy(reqwest::Proxy::all(proxy).with_context(|| format!("parse proxy URL: {proxy}"))?);
|
||||
@@ -96,204 +114,269 @@ async fn main() -> anyhow::Result<()> {
|
||||
if let Some(proxy) = &proxy_url {
|
||||
options.extra_args.push(format!("--proxy-server={proxy}"));
|
||||
}
|
||||
let keep_open = match (keep_browser_open, options.mode) {
|
||||
(true, BrowserMode::Headed) => true,
|
||||
(true, BrowserMode::Headless) => {
|
||||
tracing::warn!(
|
||||
"CRAWLER_KEEP_BROWSER_OPEN ignored in headless mode (no window to inspect)"
|
||||
);
|
||||
false
|
||||
}
|
||||
_ => false,
|
||||
};
|
||||
tracing::info!(
|
||||
?options,
|
||||
%start_url,
|
||||
rate_ms,
|
||||
cdn_host = ?cdn_host,
|
||||
cdn_rate_ms,
|
||||
limit,
|
||||
skip_chapters,
|
||||
skip_chapter_content,
|
||||
chapter_workers,
|
||||
force_refetch_chapters,
|
||||
phpsessid_set = phpsessid.is_some(),
|
||||
cookie_domain = ?cookie_domain,
|
||||
user_agent = ?user_agent,
|
||||
proxy = ?proxy_url,
|
||||
keep_open,
|
||||
storage_dir = %storage_dir.display(),
|
||||
"starting crawler"
|
||||
);
|
||||
|
||||
let handle = browser::launch(options).await.context("launch browser")?;
|
||||
// BrowserManager with idle_timeout = ZERO so the CLI keeps Chromium
|
||||
// alive for the entire run — same lifecycle as the old direct
|
||||
// `browser::launch()` flow. on_launch re-injects PHPSESSID + runs the
|
||||
// session probe; bad cookies fail fast before any real work happens.
|
||||
let on_launch: browser_manager::OnLaunch = match (&phpsessid, &cookie_domain) {
|
||||
(Some(sid), Some(domain)) => {
|
||||
let sid = sid.clone();
|
||||
let domain = domain.clone();
|
||||
let start_url_clone = start_url.clone();
|
||||
Arc::new(move |browser| {
|
||||
let sid = sid.clone();
|
||||
let domain = domain.clone();
|
||||
let start_url = start_url_clone.clone();
|
||||
Box::pin(async move {
|
||||
session::inject_phpsessid(&browser, &sid, &domain)
|
||||
.await
|
||||
.context("inject_phpsessid")?;
|
||||
session::verify_session(&browser, &start_url)
|
||||
.await
|
||||
.context("verify_session")?;
|
||||
Ok(())
|
||||
})
|
||||
})
|
||||
}
|
||||
_ => browser_manager::noop_on_launch(),
|
||||
};
|
||||
let session_ready = phpsessid.is_some() && cookie_domain.is_some();
|
||||
let manager = BrowserManager::new(options, Duration::ZERO, on_launch);
|
||||
|
||||
let result = run(
|
||||
handle.browser(),
|
||||
Arc::clone(&manager),
|
||||
&db,
|
||||
storage.as_ref(),
|
||||
Arc::clone(&storage),
|
||||
&http,
|
||||
&start_url,
|
||||
rate_ms,
|
||||
cdn_host.as_deref(),
|
||||
cdn_rate_ms,
|
||||
limit,
|
||||
skip_chapters,
|
||||
skip_chapter_content || !session_ready,
|
||||
chapter_workers,
|
||||
force_refetch_chapters,
|
||||
)
|
||||
.await;
|
||||
handle.close().await.ok();
|
||||
|
||||
if keep_open {
|
||||
tracing::info!(
|
||||
"crawler finished; browser kept open. Press Ctrl+C to close and exit."
|
||||
);
|
||||
let _ = tokio::signal::ctrl_c().await;
|
||||
tracing::info!("Ctrl+C received; closing browser");
|
||||
}
|
||||
manager.shutdown().await;
|
||||
result
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn run(
|
||||
browser: &chromiumoxide::Browser,
|
||||
manager: Arc<BrowserManager>,
|
||||
db: &PgPool,
|
||||
storage: &dyn Storage,
|
||||
storage: Arc<dyn Storage>,
|
||||
http: &reqwest::Client,
|
||||
start_url: &str,
|
||||
rate_ms: u64,
|
||||
cdn_host: Option<&str>,
|
||||
cdn_rate_ms: u64,
|
||||
limit: usize,
|
||||
skip_chapters: bool,
|
||||
skip_chapter_content: bool,
|
||||
chapter_workers: usize,
|
||||
force_refetch_chapters: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let rate = Mutex::new(RateLimiter::new(Duration::from_millis(rate_ms)));
|
||||
let source = {
|
||||
let s = TargetSource::new(start_url.to_string());
|
||||
if skip_chapters {
|
||||
s.without_chapter_parsing()
|
||||
} else {
|
||||
s
|
||||
let mut rate = HostRateLimiters::new(Duration::from_millis(rate_ms));
|
||||
if let Some(host) = cdn_host {
|
||||
rate = rate.with_override(host, Duration::from_millis(cdn_rate_ms));
|
||||
}
|
||||
};
|
||||
let ctx = FetchContext {
|
||||
browser,
|
||||
rate: &rate,
|
||||
};
|
||||
let rate = Arc::new(rate);
|
||||
|
||||
let source_id = source.id();
|
||||
repo::crawler::ensure_source(
|
||||
let stats = pipeline::run_metadata_pass(
|
||||
manager.as_ref(),
|
||||
db,
|
||||
source_id,
|
||||
"Target Site",
|
||||
&origin_of(start_url).unwrap_or_else(|| start_url.to_string()),
|
||||
storage.as_ref(),
|
||||
http,
|
||||
rate.as_ref(),
|
||||
start_url,
|
||||
limit,
|
||||
skip_chapters,
|
||||
)
|
||||
.await?;
|
||||
tracing::info!(?stats, "metadata pass complete");
|
||||
|
||||
if !skip_chapter_content {
|
||||
sync_bookmarked_chapter_content(
|
||||
Arc::clone(&manager),
|
||||
db,
|
||||
Arc::clone(&storage),
|
||||
http,
|
||||
Arc::clone(&rate),
|
||||
"target",
|
||||
chapter_workers,
|
||||
force_refetch_chapters,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Find every chapter whose manga is bookmarked by at least one user and
|
||||
/// that hasn't been content-synced yet, then fan them out across `workers`
|
||||
/// concurrent tasks. Same as before except the browser comes from a
|
||||
/// BrowserManager lease so it interleaves cleanly with the metadata pass.
|
||||
///
|
||||
/// A `SessionExpired` result aborts the phase.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn sync_bookmarked_chapter_content(
|
||||
manager: Arc<BrowserManager>,
|
||||
db: &PgPool,
|
||||
storage: Arc<dyn Storage>,
|
||||
http: &reqwest::Client,
|
||||
rate: Arc<HostRateLimiters>,
|
||||
source_id: &str,
|
||||
workers: usize,
|
||||
force_refetch: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let pending: Vec<(Uuid, Uuid, String)> = sqlx::query_as(
|
||||
r#"
|
||||
SELECT id, manga_id, source_url FROM (
|
||||
SELECT DISTINCT c.id, c.manga_id, c.created_at, cs.source_url
|
||||
FROM chapters c
|
||||
JOIN bookmarks b ON b.manga_id = c.manga_id
|
||||
JOIN chapter_sources cs ON cs.chapter_id = c.id
|
||||
WHERE cs.source_id = $1
|
||||
AND cs.dropped_at IS NULL
|
||||
AND (c.page_count = 0 OR $2)
|
||||
) sub
|
||||
ORDER BY manga_id, created_at ASC
|
||||
"#,
|
||||
)
|
||||
.bind(source_id)
|
||||
.bind(force_refetch)
|
||||
.fetch_all(db)
|
||||
.await
|
||||
.context("ensure_source")?;
|
||||
.context("query pending chapter content")?;
|
||||
|
||||
let run_started_at = chrono::Utc::now();
|
||||
if pending.is_empty() {
|
||||
tracing::info!("chapter content: nothing pending");
|
||||
return Ok(());
|
||||
}
|
||||
tracing::info!(count = pending.len(), workers, "chapter content phase starting");
|
||||
|
||||
let max_refs = (limit > 0).then_some(limit);
|
||||
tracing::info!(?max_refs, "discovering manga list");
|
||||
let refs = source
|
||||
.discover(&ctx, DiscoverMode::Backfill, max_refs)
|
||||
.await
|
||||
.context("discover failed")?;
|
||||
tracing::info!(count = refs.len(), "discovered manga list");
|
||||
let session_expired = Arc::new(std::sync::atomic::AtomicBool::new(false));
|
||||
let stats = std::sync::Mutex::new(WorkerStats::default());
|
||||
|
||||
let to_fetch = refs;
|
||||
let total = to_fetch.len();
|
||||
|
||||
for (i, r) in to_fetch.iter().enumerate() {
|
||||
tracing::info!(idx = i + 1, total, key = %r.source_manga_key, "fetching metadata");
|
||||
let manga = match source.fetch_manga(&ctx, r).await {
|
||||
Ok(m) => m,
|
||||
stream::iter(pending.into_iter())
|
||||
.for_each_concurrent(workers.max(1), |(chapter_id, manga_id, source_url)| {
|
||||
let session_expired = Arc::clone(&session_expired);
|
||||
let storage = Arc::clone(&storage);
|
||||
let rate = Arc::clone(&rate);
|
||||
let manager = Arc::clone(&manager);
|
||||
let stats = &stats;
|
||||
async move {
|
||||
if session_expired.load(std::sync::atomic::Ordering::Relaxed) {
|
||||
return;
|
||||
}
|
||||
let lease = match manager.acquire().await {
|
||||
Ok(l) => l,
|
||||
Err(e) => {
|
||||
tracing::warn!(key = %r.source_manga_key, url = %r.url, error = ?e, "fetch_manga failed");
|
||||
continue;
|
||||
tracing::error!(%chapter_id, error = ?e, "browser acquire failed");
|
||||
let mut s = stats.lock().unwrap();
|
||||
s.failed += 1;
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let upsert = match repo::crawler::upsert_manga_from_source(db, source_id, &r.url, &manga)
|
||||
.await
|
||||
{
|
||||
Ok(u) => u,
|
||||
Err(e) => {
|
||||
tracing::error!(key = %r.source_manga_key, error = ?e, "upsert_manga_from_source failed");
|
||||
continue;
|
||||
let outcome = content::sync_chapter_content(
|
||||
&lease,
|
||||
db,
|
||||
storage.as_ref(),
|
||||
http,
|
||||
rate.as_ref(),
|
||||
chapter_id,
|
||||
manga_id,
|
||||
&source_url,
|
||||
force_refetch,
|
||||
)
|
||||
.await;
|
||||
drop(lease);
|
||||
let mut s = stats.lock().unwrap();
|
||||
match outcome {
|
||||
Ok(SyncOutcome::Fetched { pages }) => {
|
||||
tracing::info!(%chapter_id, pages, "chapter content fetched");
|
||||
s.fetched += 1;
|
||||
}
|
||||
};
|
||||
Ok(SyncOutcome::Skipped) => s.skipped += 1,
|
||||
Ok(SyncOutcome::SessionExpired) => {
|
||||
tracing::error!(
|
||||
%chapter_id,
|
||||
"session expired mid-run — refresh CRAWLER_PHPSESSID and re-run"
|
||||
);
|
||||
session_expired
|
||||
.store(true, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
%chapter_id, error = ?e, "chapter content sync failed"
|
||||
);
|
||||
s.failed += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
.await;
|
||||
|
||||
let total = stats.into_inner().unwrap();
|
||||
tracing::info!(
|
||||
key = %manga.source_manga_key,
|
||||
manga_id = %upsert.manga_id,
|
||||
status = ?upsert.status,
|
||||
title = %manga.title,
|
||||
"manga upserted"
|
||||
fetched = total.fetched,
|
||||
skipped = total.skipped,
|
||||
failed = total.failed,
|
||||
"chapter content phase done"
|
||||
);
|
||||
|
||||
// Cover image: download when missing in storage (backfill for
|
||||
// mangas synced before cover-download support, plus the New
|
||||
// path) or when metadata changed (cover URL is part of
|
||||
// metadata_hash, so an Updated status implies the URL may
|
||||
// have moved). Failures are non-fatal.
|
||||
let needs_cover = upsert.cover_image_path.is_none()
|
||||
|| matches!(upsert.status, repo::crawler::UpsertStatus::Updated);
|
||||
if needs_cover {
|
||||
if let Some(cover_url) = manga.cover_url.as_deref() {
|
||||
if let Err(e) = download_and_store_cover(
|
||||
db,
|
||||
storage,
|
||||
http,
|
||||
&rate,
|
||||
&r.url,
|
||||
upsert.manga_id,
|
||||
cover_url,
|
||||
)
|
||||
.await
|
||||
{
|
||||
tracing::warn!(manga_id = %upsert.manga_id, error = ?e, "cover download failed");
|
||||
if session_expired.load(std::sync::atomic::Ordering::Relaxed) {
|
||||
anyhow::bail!("session expired during chapter content phase");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !skip_chapters {
|
||||
match repo::crawler::sync_manga_chapters(
|
||||
db,
|
||||
source_id,
|
||||
upsert.manga_id,
|
||||
&manga.chapters,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(diff) => tracing::info!(
|
||||
manga_id = %upsert.manga_id,
|
||||
new = diff.new,
|
||||
refreshed = diff.refreshed,
|
||||
dropped = diff.dropped,
|
||||
"chapters synced"
|
||||
),
|
||||
Err(e) => tracing::warn!(manga_id = %upsert.manga_id, error = ?e, "chapter sync failed"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if limit == 0 {
|
||||
match repo::crawler::mark_dropped_mangas(db, source_id, run_started_at).await {
|
||||
Ok(n) => tracing::info!(dropped = n, "marked unseen manga as dropped"),
|
||||
Err(e) => tracing::warn!(error = ?e, "drop-pass failed"),
|
||||
}
|
||||
} else {
|
||||
tracing::info!(limit, "partial sync — skipping drop pass");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download_and_store_cover(
|
||||
db: &PgPool,
|
||||
storage: &dyn Storage,
|
||||
http: &reqwest::Client,
|
||||
rate: &Mutex<RateLimiter>,
|
||||
manga_url: &str,
|
||||
manga_id: Uuid,
|
||||
cover_url: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
let absolute = reqwest::Url::parse(manga_url)
|
||||
.context("parse manga URL")?
|
||||
.join(cover_url)
|
||||
.context("join cover URL onto manga URL")?;
|
||||
|
||||
rate.lock().await.wait().await;
|
||||
let resp = http
|
||||
.get(absolute.clone())
|
||||
.send()
|
||||
.await
|
||||
.with_context(|| format!("GET {absolute}"))?
|
||||
.error_for_status()
|
||||
.with_context(|| format!("non-2xx for {absolute}"))?;
|
||||
let bytes = resp.bytes().await.context("read cover body")?;
|
||||
|
||||
// `infer` sniffs the magic bytes — same crate the upload handler
|
||||
// uses, so we don't trust the URL's extension.
|
||||
let kind = infer::get(&bytes);
|
||||
let ext = kind.map(|k| k.extension()).unwrap_or("bin");
|
||||
let key = format!("mangas/{manga_id}/cover.{ext}");
|
||||
|
||||
storage
|
||||
.put(&key, &bytes)
|
||||
.await
|
||||
.with_context(|| format!("store cover at {key}"))?;
|
||||
repo::manga::set_cover_image_path(db, manga_id, &key)
|
||||
.await
|
||||
.with_context(|| format!("update cover_image_path for {manga_id}"))?;
|
||||
tracing::info!(manga_id = %manga_id, key = %key, bytes = bytes.len(), %absolute, "cover stored");
|
||||
Ok(())
|
||||
/// Running tally of chapter-content outcomes for one sync phase.
/// All counters start at zero via `Default`.
#[derive(Clone, Copy, Default)]
struct WorkerStats {
    /// Chapters whose content was fetched and stored.
    fetched: usize,
    /// Chapters reported as skipped by the sync routine.
    skipped: usize,
    /// Chapters that failed (browser acquire error or sync error).
    failed: usize,
}
|
||||
|
||||
fn resolve_start_url() -> anyhow::Result<String> {
|
||||
@@ -307,12 +390,6 @@ fn resolve_start_url() -> anyhow::Result<String> {
|
||||
})
|
||||
}
|
||||
|
||||
/// Extract the `scheme://host[:port]` origin from an absolute URL string.
///
/// The authority is cut at the first `/`, `?` or `#`, so a URL with a query
/// or fragment but no path (`https://host?x=1`) still yields a clean origin.
///
/// Returns `None` when `url` has no `://` separator, or when the scheme or
/// the host part is empty.
fn origin_of(url: &str) -> Option<String> {
    let (scheme, rest) = url.split_once("://")?;
    // The authority ends at whichever of path, query or fragment comes first.
    let host = rest
        .split(|c: char| matches!(c, '/' | '?' | '#'))
        .next()?;
    if scheme.is_empty() || host.is_empty() {
        return None;
    }
    Some(format!("{scheme}://{host}"))
}
|
||||
|
||||
fn env_u64(name: &str, default: u64) -> u64 {
|
||||
std::env::var(name)
|
||||
.ok()
|
||||
@@ -327,3 +404,4 @@ fn env_bool(name: &str, default: bool) -> bool {
|
||||
_ => default,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,10 @@
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
|
||||
use chrono::NaiveTime;
|
||||
use chrono_tz::Tz;
|
||||
|
||||
use crate::crawler::browser::LaunchOptions;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct AuthConfig {
|
||||
@@ -45,6 +51,54 @@ pub struct Config {
|
||||
pub auth: AuthConfig,
|
||||
pub upload: UploadConfig,
|
||||
pub cors_allowed_origins: Vec<String>,
|
||||
pub crawler: CrawlerConfig,
|
||||
}
|
||||
|
||||
/// All crawler-daemon knobs read from env. Mirrors the env vars the
|
||||
/// `bin/crawler` binary already reads, plus the new daemon-only knobs
|
||||
/// (daily_at, tz, idle_timeout, retention_days, daemon_enabled).
|
||||
///
|
||||
/// `daemon_enabled = false` skips the daemon spawn entirely — used by
|
||||
/// integration tests and dev runs that don't want background activity.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct CrawlerConfig {
|
||||
pub daemon_enabled: bool,
|
||||
pub daily_at: NaiveTime,
|
||||
pub tz: Tz,
|
||||
pub idle_timeout: Duration,
|
||||
pub chapter_workers: usize,
|
||||
pub retention_days: u32,
|
||||
pub start_url: Option<String>,
|
||||
pub rate_ms: u64,
|
||||
pub cdn_host: Option<String>,
|
||||
pub cdn_rate_ms: u64,
|
||||
pub phpsessid: Option<String>,
|
||||
pub cookie_domain: Option<String>,
|
||||
pub user_agent: Option<String>,
|
||||
pub proxy: Option<String>,
|
||||
pub browser: LaunchOptions,
|
||||
}
|
||||
|
||||
impl Default for CrawlerConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
daemon_enabled: false,
|
||||
daily_at: NaiveTime::from_hms_opt(0, 0, 0).unwrap(),
|
||||
tz: Tz::UTC,
|
||||
idle_timeout: Duration::from_secs(600),
|
||||
chapter_workers: 1,
|
||||
retention_days: 7,
|
||||
start_url: None,
|
||||
rate_ms: 1000,
|
||||
cdn_host: None,
|
||||
cdn_rate_ms: 1000,
|
||||
phpsessid: None,
|
||||
cookie_domain: None,
|
||||
user_agent: None,
|
||||
proxy: None,
|
||||
browser: LaunchOptions::headless(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Config {
|
||||
@@ -77,10 +131,65 @@ impl Config {
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default(),
|
||||
crawler: CrawlerConfig::from_env()?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl CrawlerConfig {
|
||||
pub fn from_env() -> anyhow::Result<Self> {
|
||||
// Parse CRAWLER_DAILY_AT (HH:MM, 24h). Invalid → fail fast.
|
||||
let daily_at = match std::env::var("CRAWLER_DAILY_AT").ok().as_deref() {
|
||||
None | Some("") => NaiveTime::from_hms_opt(0, 0, 0).unwrap(),
|
||||
Some(raw) => NaiveTime::parse_from_str(raw, "%H:%M").map_err(|e| {
|
||||
anyhow::anyhow!("CRAWLER_DAILY_AT must be HH:MM (got {raw:?}): {e}")
|
||||
})?,
|
||||
};
|
||||
let tz: Tz = match std::env::var("CRAWLER_TZ").ok().as_deref() {
|
||||
None | Some("") => Tz::UTC,
|
||||
Some(raw) => raw
|
||||
.parse()
|
||||
.map_err(|e| anyhow::anyhow!("CRAWLER_TZ must be a valid IANA TZ (got {raw:?}): {e}"))?,
|
||||
};
|
||||
Ok(Self {
|
||||
daemon_enabled: env_bool("CRAWLER_DAEMON", true),
|
||||
daily_at,
|
||||
tz,
|
||||
idle_timeout: Duration::from_secs(env_u64("CRAWLER_IDLE_TIMEOUT_S", 600)),
|
||||
chapter_workers: env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1) as usize,
|
||||
retention_days: env_u64("CRAWLER_JOB_RETENTION_DAYS", 7) as u32,
|
||||
start_url: std::env::var("CRAWLER_START_URL")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty()),
|
||||
rate_ms: env_u64("CRAWLER_RATE_MS", 1000),
|
||||
cdn_host: std::env::var("CRAWLER_CDN_HOST")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty()),
|
||||
cdn_rate_ms: env_u64("CRAWLER_CDN_RATE_MS", env_u64("CRAWLER_RATE_MS", 1000)),
|
||||
phpsessid: std::env::var("CRAWLER_PHPSESSID")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty()),
|
||||
cookie_domain: std::env::var("CRAWLER_COOKIE_DOMAIN")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty()),
|
||||
user_agent: std::env::var("CRAWLER_USER_AGENT")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty()),
|
||||
proxy: std::env::var("CRAWLER_PROXY")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty()),
|
||||
browser: LaunchOptions::from_env(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Read environment variable `name` as a `u64`.
///
/// Returns `default` when the variable is unset, not valid Unicode, or
/// does not parse as an unsigned 64-bit integer.
fn env_u64(name: &str, default: u64) -> u64 {
    match std::env::var(name) {
        Ok(raw) => raw.parse().unwrap_or(default),
        Err(_) => default,
    }
}
|
||||
|
||||
fn env_bool(name: &str, default: bool) -> bool {
|
||||
match std::env::var(name).ok().as_deref() {
|
||||
Some("1") | Some("true") | Some("TRUE") | Some("yes") => true,
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
//! caller-provided.
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use chromiumoxide::browser::{Browser, BrowserConfig};
|
||||
@@ -95,25 +96,49 @@ pub(crate) fn parse_args(s: &str) -> Vec<String> {
|
||||
/// Owned browser plus the spawned task that drives its CDP event loop.
|
||||
/// Dropping `Handle` without calling `close` leaks the Chromium process
|
||||
/// — always call `close().await` in production paths.
|
||||
///
|
||||
/// The browser is stored behind an `Arc` so it can be shared across
|
||||
/// worker tasks (via [`Handle::shared`]) without copying. `Browser::new_page`
|
||||
/// only needs `&self`, so multiple workers can drive the same browser
|
||||
/// concurrently as long as the manager keeps the `Arc` alive.
|
||||
pub struct Handle {
|
||||
browser: Browser,
|
||||
browser: Arc<Browser>,
|
||||
driver: JoinHandle<()>,
|
||||
}
|
||||
|
||||
impl Handle {
|
||||
/// Borrow the browser. Equivalent to `&*handle.shared()`.
|
||||
pub fn browser(&self) -> &Browser {
|
||||
&self.browser
|
||||
}
|
||||
|
||||
pub fn browser_mut(&mut self) -> &mut Browser {
|
||||
&mut self.browser
|
||||
/// Clone the shared handle. Workers hold these to call `new_page`
|
||||
/// concurrently. The browser only exits when the last `Arc<Browser>`
|
||||
/// is dropped (kill-on-drop), or when `close()` is called on the
|
||||
/// originating `Handle` while it is the sole holder.
|
||||
pub fn shared(&self) -> Arc<Browser> {
|
||||
Arc::clone(&self.browser)
|
||||
}
|
||||
|
||||
/// Closes the browser and awaits the driver task. Safe to call
|
||||
/// multiple times — subsequent calls are no-ops.
|
||||
pub async fn close(mut self) -> anyhow::Result<()> {
|
||||
let _ = self.browser.close().await;
|
||||
let _ = self.browser.wait().await;
|
||||
/// Closes the browser and awaits the driver task. If other Arcs to
|
||||
/// the browser are still alive we fall back to drop-kills-Chromium
|
||||
/// semantics and just join the driver — this is the rare case where
|
||||
/// shutdown raced an outstanding worker; the OS-level kill is the
|
||||
/// safety net.
|
||||
pub async fn close(self) -> anyhow::Result<()> {
|
||||
match Arc::try_unwrap(self.browser) {
|
||||
Ok(mut owned) => {
|
||||
let _ = owned.close().await;
|
||||
let _ = owned.wait().await;
|
||||
}
|
||||
Err(shared) => {
|
||||
tracing::warn!(
|
||||
strong_count = Arc::strong_count(&shared),
|
||||
"Handle::close while Arc<Browser> still shared — relying on kill-on-drop"
|
||||
);
|
||||
drop(shared);
|
||||
}
|
||||
}
|
||||
let _ = self.driver.await;
|
||||
Ok(())
|
||||
}
|
||||
@@ -184,7 +209,10 @@ pub async fn launch(options: LaunchOptions) -> anyhow::Result<Handle> {
|
||||
}
|
||||
});
|
||||
|
||||
Ok(Handle { browser, driver })
|
||||
Ok(Handle {
|
||||
browser: Arc::new(browser),
|
||||
driver,
|
||||
})
|
||||
}
|
||||
|
||||
fn cache_dir() -> anyhow::Result<PathBuf> {
|
||||
|
||||
262
backend/src/crawler/browser_manager.rs
Normal file
262
backend/src/crawler/browser_manager.rs
Normal file
@@ -0,0 +1,262 @@
|
||||
//! Lazy-launch / idle-teardown Chromium manager for the daemon.
|
||||
//!
|
||||
//! The first worker that calls [`BrowserManager::acquire`] triggers a real
|
||||
//! Chromium launch (and the `on_launch` hook — used to re-inject the
|
||||
//! PHPSESSID cookie on every fresh process). Each acquire bumps an active
|
||||
//! counter; the returned [`BrowserLease`] decrements it on drop.
|
||||
//!
|
||||
//! When the active counter hits zero, a background reaper task waits
|
||||
//! `idle_timeout`. If still zero on wake, it closes Chromium and clears the
|
||||
//! cached handle. The next acquire re-launches.
|
||||
//!
|
||||
//! `idle_timeout = Duration::ZERO` disables the reaper — Chromium stays alive
|
||||
//! until [`BrowserManager::shutdown`].
|
||||
|
||||
use std::ops::Deref;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use chromiumoxide::browser::Browser;
|
||||
use futures_util::future::BoxFuture;
|
||||
use tokio::sync::{Mutex, Notify};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::crawler::browser::{self, LaunchOptions};
|
||||
|
||||
/// Hook invoked on every fresh launch with the new browser. Typically used
|
||||
/// to re-inject PHPSESSID + run the session probe. Errors abort the
|
||||
/// `acquire` that triggered the launch — the next acquire will re-launch.
|
||||
pub type OnLaunch =
|
||||
Arc<dyn Fn(Arc<Browser>) -> BoxFuture<'static, anyhow::Result<()>> + Send + Sync>;
|
||||
|
||||
/// Returns an `OnLaunch` that does nothing — useful when no session is
|
||||
/// configured (e.g. CLI metadata-only runs).
|
||||
pub fn noop_on_launch() -> OnLaunch {
|
||||
Arc::new(|_| Box::pin(async { Ok(()) }))
|
||||
}
|
||||
|
||||
/// Decoupled active-lease tracker. Owns the atomic counter and the idle
|
||||
/// notifier so the wiring is unit-testable without standing up a real
|
||||
/// `BrowserManager` (which would require launching Chromium).
|
||||
#[derive(Default)]
|
||||
pub(crate) struct ActiveTracker {
|
||||
counter: AtomicUsize,
|
||||
idle_signal: Notify,
|
||||
}
|
||||
|
||||
impl ActiveTracker {
|
||||
pub(crate) fn new() -> Arc<Self> {
|
||||
Arc::new(Self::default())
|
||||
}
|
||||
|
||||
pub(crate) fn acquire(self: &Arc<Self>) {
|
||||
self.counter.fetch_add(1, Ordering::AcqRel);
|
||||
}
|
||||
|
||||
pub(crate) fn release(self: &Arc<Self>) {
|
||||
if self.counter.fetch_sub(1, Ordering::AcqRel) == 1 {
|
||||
self.idle_signal.notify_one();
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn current(&self) -> usize {
|
||||
self.counter.load(Ordering::Acquire)
|
||||
}
|
||||
|
||||
pub(crate) fn idle_signal(&self) -> &Notify {
|
||||
&self.idle_signal
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BrowserManager {
|
||||
inner: Mutex<Inner>,
|
||||
active: Arc<ActiveTracker>,
|
||||
launch_opts: LaunchOptions,
|
||||
idle_timeout: Duration,
|
||||
on_launch: OnLaunch,
|
||||
}
|
||||
|
||||
struct Inner {
|
||||
handle: Option<browser::Handle>,
|
||||
shared: Option<Arc<Browser>>,
|
||||
}
|
||||
|
||||
impl BrowserManager {
|
||||
pub fn new(
|
||||
launch_opts: LaunchOptions,
|
||||
idle_timeout: Duration,
|
||||
on_launch: OnLaunch,
|
||||
) -> Arc<Self> {
|
||||
Arc::new(Self {
|
||||
inner: Mutex::new(Inner {
|
||||
handle: None,
|
||||
shared: None,
|
||||
}),
|
||||
active: ActiveTracker::new(),
|
||||
launch_opts,
|
||||
idle_timeout,
|
||||
on_launch,
|
||||
})
|
||||
}
|
||||
|
||||
/// Acquire a shared browser lease. The first acquire after a teardown
|
||||
/// launches a fresh Chromium (and runs `on_launch`); subsequent acquires
|
||||
/// while a process is alive just bump the counter and clone the `Arc`.
|
||||
pub async fn acquire(&self) -> anyhow::Result<BrowserLease> {
|
||||
let mut guard = self.inner.lock().await;
|
||||
if guard.handle.is_none() {
|
||||
let handle = browser::launch(self.launch_opts.clone())
|
||||
.await
|
||||
.context("BrowserManager: launch chromium")?;
|
||||
let shared = handle.shared();
|
||||
// Run the on-launch hook before publishing the handle so a session
|
||||
// probe failure doesn't leave a half-initialized browser behind.
|
||||
if let Err(e) = (self.on_launch)(Arc::clone(&shared)).await {
|
||||
// Close the just-launched browser since we won't be using it.
|
||||
let _ = handle.close().await;
|
||||
return Err(e.context("BrowserManager: on_launch hook failed"));
|
||||
}
|
||||
guard.handle = Some(handle);
|
||||
guard.shared = Some(shared);
|
||||
}
|
||||
let browser = guard
|
||||
.shared
|
||||
.as_ref()
|
||||
.expect("shared set above")
|
||||
.clone();
|
||||
self.active.acquire();
|
||||
Ok(BrowserLease {
|
||||
browser,
|
||||
active: Arc::clone(&self.active),
|
||||
})
|
||||
}
|
||||
|
||||
/// Forcefully close the cached browser regardless of active count.
|
||||
/// Used on daemon shutdown. After this returns the next acquire will
|
||||
/// re-launch from scratch.
|
||||
pub async fn shutdown(&self) {
|
||||
let mut guard = self.inner.lock().await;
|
||||
guard.shared = None;
|
||||
if let Some(handle) = guard.handle.take() {
|
||||
let _ = handle.close().await;
|
||||
}
|
||||
}
|
||||
|
||||
fn idle_timeout(&self) -> Duration {
|
||||
self.idle_timeout
|
||||
}
|
||||
|
||||
fn active(&self) -> Arc<ActiveTracker> {
|
||||
Arc::clone(&self.active)
|
||||
}
|
||||
}
|
||||
|
||||
/// Background reaper. Returns immediately when `idle_timeout == 0`.
|
||||
/// Otherwise spawns a task that:
|
||||
/// 1. Waits on `idle_signal` (woken when active hits zero).
|
||||
/// 2. Sleeps `idle_timeout`.
|
||||
/// 3. Re-checks the counter under the mutex — if still zero, takes the
|
||||
/// handle and closes it.
|
||||
///
|
||||
/// Repeats forever until `cancel` fires.
|
||||
pub fn spawn_idle_reaper(mgr: Arc<BrowserManager>, cancel: CancellationToken) -> JoinHandle<()> {
|
||||
tokio::spawn(async move {
|
||||
if mgr.idle_timeout().is_zero() {
|
||||
// Block until cancellation, then exit.
|
||||
cancel.cancelled().await;
|
||||
return;
|
||||
}
|
||||
let active = mgr.active();
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => return,
|
||||
_ = active.idle_signal().notified() => {}
|
||||
}
|
||||
if active.current() > 0 {
|
||||
continue;
|
||||
}
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => return,
|
||||
_ = tokio::time::sleep(mgr.idle_timeout()) => {}
|
||||
}
|
||||
let mut guard = mgr.inner.lock().await;
|
||||
if active.current() > 0 {
|
||||
// A worker grabbed a lease during the sleep — abort teardown.
|
||||
continue;
|
||||
}
|
||||
let handle = guard.handle.take();
|
||||
guard.shared = None;
|
||||
drop(guard);
|
||||
if let Some(h) = handle {
|
||||
let _ = h.close().await;
|
||||
tracing::info!("BrowserManager: idle teardown — Chromium closed");
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// A worker-side handle that keeps the browser alive while in scope.
|
||||
/// `Deref<Target = Browser>` so callers can pass `&*lease` to APIs that
|
||||
/// expect `&Browser`.
|
||||
pub struct BrowserLease {
|
||||
browser: Arc<Browser>,
|
||||
active: Arc<ActiveTracker>,
|
||||
}
|
||||
|
||||
impl Deref for BrowserLease {
|
||||
type Target = Browser;
|
||||
fn deref(&self) -> &Browser {
|
||||
&self.browser
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for BrowserLease {
|
||||
fn drop(&mut self) {
|
||||
self.active.release();
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::AtomicBool;

    /// The no-op hook must be usable from any thread.
    #[test]
    fn noop_on_launch_is_send_sync() {
        fn require_send_sync<T: Send + Sync>(_value: &T) {}
        let hook = noop_on_launch();
        require_send_sync(&hook);
    }

    /// The idle notifier fires only when the count drops from 1 to 0.
    #[tokio::test]
    async fn active_tracker_signals_idle_only_on_zero_transition() {
        let tracker = ActiveTracker::new();
        let signaled = Arc::new(AtomicBool::new(false));

        // Background waiter flips the flag once the idle signal arrives.
        let waiter_flag = Arc::clone(&signaled);
        let waiter_tracker = Arc::clone(&tracker);
        tokio::spawn(async move {
            waiter_tracker.idle_signal().notified().await;
            waiter_flag.store(true, Ordering::Release);
        });

        tracker.acquire();
        tracker.acquire();
        assert_eq!(tracker.current(), 2);

        // 2 -> 1: must not signal.
        tracker.release();
        assert_eq!(tracker.current(), 1);
        tokio::time::sleep(Duration::from_millis(20)).await;
        let fired_early = signaled.load(Ordering::Acquire);
        assert!(!fired_early, "no idle signal at count 1");

        // 1 -> 0: must signal.
        tracker.release();
        tokio::time::sleep(Duration::from_millis(20)).await;
        assert_eq!(tracker.current(), 0);
        assert!(
            signaled.load(Ordering::Acquire),
            "idle signal fires on 1 -> 0 transition"
        );
    }
}
|
||||
244
backend/src/crawler/content.rs
Normal file
244
backend/src/crawler/content.rs
Normal file
@@ -0,0 +1,244 @@
|
||||
//! Chapter content sync — fetch a logged-in chapter page, extract its
|
||||
//! image URLs in `pageN` order, download each to storage, and atomically
|
||||
//! persist a `pages` row per image plus the chapter's `page_count`.
|
||||
//!
|
||||
//! Only chapters belonging to a manga someone has bookmarked are
|
||||
//! candidates. The crawler scans bookmarks at the start of each run and
|
||||
//! enqueues unfetched chapters; the API also enqueues at bookmark-time
|
||||
//! so users get instant feedback. Both feed into the same queue and
|
||||
//! dedup by chapter id.
|
||||
|
||||
// Implementation lands in the next commits in this branch. Module is
|
||||
// declared so other crates can `use crawler::content` without breaking
|
||||
// builds while iteration is in progress.
|
||||
|
||||
use anyhow::Context;
|
||||
use sqlx::PgPool;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::crawler::rate_limit::HostRateLimiters;
|
||||
use crate::crawler::session;
|
||||
use crate::storage::Storage;
|
||||
|
||||
/// Parse the chapter page DOM and return the page images in `pageN`
|
||||
/// order. Filters out the loader `<img class="loading">` and any
|
||||
/// `<img>` without a numeric `id="pageN"`.
|
||||
pub fn parse_chapter_pages(html: &str) -> Vec<ChapterImage> {
|
||||
let doc = scraper::Html::parse_document(html);
|
||||
let sel = scraper::Selector::parse("a#pic_container img:not(.loading)").unwrap();
|
||||
let mut pages: Vec<ChapterImage> = doc
|
||||
.select(&sel)
|
||||
.filter_map(|img| {
|
||||
let id = img.value().id()?;
|
||||
let n: i32 = id.strip_prefix("page")?.parse().ok()?;
|
||||
let src = img.value().attr("src")?.trim().to_string();
|
||||
if src.is_empty() {
|
||||
return None;
|
||||
}
|
||||
Some(ChapterImage { page_number: n, url: src })
|
||||
})
|
||||
.collect();
|
||||
pages.sort_by_key(|p| p.page_number);
|
||||
pages
|
||||
}
|
||||
|
||||
/// One page image discovered on a chapter page.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ChapterImage {
    /// The `N` parsed from the element's `id="pageN"`.
    pub page_number: i32,
    /// The image's trimmed `src` attribute; may be relative to the chapter URL.
    pub url: String,
}
|
||||
|
||||
/// Result of syncing one chapter, reported to callers so they can log it
/// and decide on an exit code.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SyncOutcome {
    /// Every image was downloaded and stored and the chapter row updated;
    /// `pages` is the number of images.
    Fetched { pages: usize },
    /// The chapter already had `page_count > 0`, so nothing was done
    /// (only possible when force_refetch is not set).
    Skipped,
    /// The session probe failed mid-sync (avatar selector missing on the
    /// chapter page). The caller should abort the whole crawler run.
    SessionExpired,
}
|
||||
|
||||
/// Fetch all images for one chapter and persist them atomically. On
|
||||
/// any error after the first storage put, the DB transaction rolls
|
||||
/// back so the chapter stays at `page_count = 0` and is retried on the
|
||||
/// next run. Bytes already written to storage become orphans; a future
|
||||
/// reaper sweeps them.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn sync_chapter_content(
|
||||
browser: &chromiumoxide::Browser,
|
||||
db: &PgPool,
|
||||
storage: &dyn Storage,
|
||||
http: &reqwest::Client,
|
||||
rate: &HostRateLimiters,
|
||||
chapter_id: Uuid,
|
||||
manga_id: Uuid,
|
||||
source_url: &str,
|
||||
force_refetch: bool,
|
||||
) -> anyhow::Result<SyncOutcome> {
|
||||
// Skip if already fetched, unless caller explicitly forces.
|
||||
if !force_refetch {
|
||||
let (page_count,): (i32,) =
|
||||
sqlx::query_as("SELECT page_count FROM chapters WHERE id = $1")
|
||||
.bind(chapter_id)
|
||||
.fetch_one(db)
|
||||
.await
|
||||
.context("read chapter page_count")?;
|
||||
if page_count > 0 {
|
||||
return Ok(SyncOutcome::Skipped);
|
||||
}
|
||||
}
|
||||
|
||||
// Nav to chapter page (rate-limited per host).
|
||||
rate.wait_for(source_url).await?;
|
||||
let page = browser
|
||||
.new_page(source_url)
|
||||
.await
|
||||
.with_context(|| format!("open chapter page {source_url}"))?;
|
||||
page.wait_for_navigation().await.context("wait for chapter nav")?;
|
||||
|
||||
// Session probe: avatar present == still logged in. Missing means
|
||||
// PHPSESSID expired; bail the entire crawler run.
|
||||
if page.find_element("#avatar_menu").await.is_err() {
|
||||
page.close().await.ok();
|
||||
return Ok(SyncOutcome::SessionExpired);
|
||||
}
|
||||
|
||||
let html = page.content().await.context("read chapter html")?;
|
||||
page.close().await.ok();
|
||||
|
||||
let images = parse_chapter_pages(&html);
|
||||
if images.is_empty() {
|
||||
anyhow::bail!("no page images parsed from {source_url}");
|
||||
}
|
||||
|
||||
// Resolve image URLs against the chapter URL (they may be relative).
|
||||
let base = reqwest::Url::parse(source_url).context("parse chapter URL")?;
|
||||
|
||||
// Fetch every image bytes-first into memory before writing
|
||||
// anything. Lets us bail the whole chapter cleanly if any image
|
||||
// fails — DB stays at page_count=0, no partial rows persisted.
|
||||
let mut fetched: Vec<(i32, Vec<u8>, &'static str)> = Vec::with_capacity(images.len());
|
||||
for img in &images {
|
||||
let url = base.join(&img.url).with_context(|| {
|
||||
format!("join image URL {} onto {source_url}", img.url)
|
||||
})?;
|
||||
rate.wait_for(url.as_str()).await?;
|
||||
let resp = http
|
||||
.get(url.clone())
|
||||
// Source CDNs commonly check Referer. Set it to the
|
||||
// chapter page — matches what the browser would send.
|
||||
.header(reqwest::header::REFERER, source_url)
|
||||
.send()
|
||||
.await
|
||||
.with_context(|| format!("GET {url}"))?
|
||||
.error_for_status()
|
||||
.with_context(|| format!("non-2xx for {url}"))?;
|
||||
let bytes = resp.bytes().await.context("read image body")?.to_vec();
|
||||
let ext = infer::get(&bytes).map(|k| k.extension()).unwrap_or("bin");
|
||||
fetched.push((img.page_number, bytes, ext));
|
||||
}
|
||||
|
||||
// Atomic write: storage puts + page row inserts + page_count
|
||||
// update, all in one transaction. If anything fails, rollback +
|
||||
// the chapter is retried next run. Storage orphans the bytes; a
|
||||
// reaper sweeps them later.
|
||||
let mut tx = db.begin().await.context("open chapter sync tx")?;
|
||||
for (page_number, bytes, ext) in &fetched {
|
||||
let key = format!(
|
||||
"mangas/{manga_id}/chapters/{chapter_id}/pages/{:04}.{ext}",
|
||||
page_number
|
||||
);
|
||||
storage
|
||||
.put(&key, bytes)
|
||||
.await
|
||||
.with_context(|| format!("put {key}"))?;
|
||||
// (chapter_id, page_number) is unique — re-runs idempotent.
|
||||
sqlx::query(
|
||||
"INSERT INTO pages (chapter_id, page_number, storage_key, content_type)
|
||||
VALUES ($1, $2, $3, $4)
|
||||
ON CONFLICT (chapter_id, page_number) DO UPDATE
|
||||
SET storage_key = EXCLUDED.storage_key,
|
||||
content_type = EXCLUDED.content_type",
|
||||
)
|
||||
.bind(chapter_id)
|
||||
.bind(page_number)
|
||||
.bind(&key)
|
||||
.bind(format!("image/{ext}"))
|
||||
.execute(&mut *tx)
|
||||
.await
|
||||
.with_context(|| format!("insert page row {page_number}"))?;
|
||||
}
|
||||
sqlx::query("UPDATE chapters SET page_count = $1 WHERE id = $2")
|
||||
.bind(fetched.len() as i32)
|
||||
.bind(chapter_id)
|
||||
.execute(&mut *tx)
|
||||
.await
|
||||
.context("update page_count")?;
|
||||
tx.commit().await.context("commit chapter sync")?;
|
||||
|
||||
Ok(SyncOutcome::Fetched { pages: fetched.len() })
|
||||
}
|
||||
|
||||
// Suppress unused-import warning for `session` until the bin/crawler
|
||||
// wiring lands in this branch and uses it through this module.
|
||||
#[allow(dead_code)]
|
||||
fn _keep_session_in_scope() {
|
||||
let _ = session::registrable_domain;
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// The loader gif, an image without an id and an image whose id is not
    /// a page id are all ignored; the two real pages come back in page
    /// order even though the markup lists them reversed.
    #[test]
    fn parse_chapter_pages_skips_loader_and_sorts_by_id() {
        let html = r#"
            <html><body id="body"><a id="pic_container">
            <img class="loading" src="/images/ajax-loader2.gif">
            <img id="page2" class="page2" src="https://cdn/2.jpg">
            <img id="page1" class="page1" src="https://cdn/1.jpg">
            <img src="https://cdn/orphan.jpg">
            <img id="not-a-page" src="https://cdn/not-a-page.jpg">
            </a></body></html>
        "#;
        let parsed = parse_chapter_pages(html);
        let numbers: Vec<_> = parsed.iter().map(|p| p.page_number).collect();
        assert_eq!(numbers, vec![1, 2]);
        assert_eq!(parsed[0].url, "https://cdn/1.jpg");
        assert_eq!(parsed[1].url, "https://cdn/2.jpg");
    }

    /// A page image with an empty `src` is dropped.
    #[test]
    fn parse_chapter_pages_drops_images_without_src() {
        let html = r#"
            <a id="pic_container">
            <img id="page1" src="">
            <img id="page2" src="https://cdn/2.jpg">
            </a>
        "#;
        let parsed = parse_chapter_pages(html);
        let numbers: Vec<_> = parsed.iter().map(|p| p.page_number).collect();
        assert_eq!(numbers, vec![2]);
    }

    /// Page ids are ordered numerically, not lexicographically.
    #[test]
    fn parse_chapter_pages_handles_three_digit_page_ids() {
        let html = r#"
            <a id="pic_container">
            <img id="page126" src="https://cdn/126.jpg">
            <img id="page9" src="https://cdn/9.jpg">
            <img id="page50" src="https://cdn/50.jpg">
            </a>
        "#;
        let numbers: Vec<_> = parse_chapter_pages(html)
            .iter()
            .map(|p| p.page_number)
            .collect();
        assert_eq!(numbers, vec![9, 50, 126]);
    }
}
|
||||
633
backend/src/crawler/daemon.rs
Normal file
633
backend/src/crawler/daemon.rs
Normal file
@@ -0,0 +1,633 @@
|
||||
//! In-process crawler daemon.
|
||||
//!
|
||||
//! Owns a cron task that fires a daily metadata pass and N worker tasks
|
||||
//! that drain `SyncChapterContent` jobs from `crawler_jobs`. The dispatch
|
||||
//! seams ([`MetadataPass`], [`ChapterDispatcher`]) are traits so tests can
|
||||
//! inject stubs without standing up a real Chromium / `Source` impl.
|
||||
//!
|
||||
//! ## Cron
|
||||
//!
|
||||
//! Each tick:
|
||||
//! 1. Acquire a Postgres advisory lock on a dedicated pool connection
|
||||
//! (multi-replica safety). Skip the tick on contention.
|
||||
//! 2. Call [`MetadataPass::run`] (typically `pipeline::run_metadata_pass`).
|
||||
//! 3. Enqueue `SyncChapterContent` jobs for any bookmarked manga whose
|
||||
//! chapters still have `page_count = 0`.
|
||||
//! 4. Reap `done` jobs older than `retention_days`.
|
||||
//! 5. Persist `last_metadata_tick_at` and release the lock.
|
||||
//!
|
||||
//! If the last persisted tick is older than the most recent scheduled slot
|
||||
//! (e.g. backend was down at midnight), the daemon fires immediately on
|
||||
//! startup before resuming the regular schedule.
|
||||
//!
|
||||
//! ## Workers
|
||||
//!
|
||||
//! Each worker leases one chapter-content job at a time, dispatches via the
|
||||
//! [`ChapterDispatcher`], and acks `done` / `failed` / re-`pending` based on
|
||||
//! the outcome. A `SessionExpired` outcome flips the sticky
|
||||
//! `session_expired` flag — all workers idle while it's set (until operator
|
||||
//! restart with a refreshed PHPSESSID).
|
||||
//!
|
||||
//! Worker dispatch is wrapped in `catch_unwind` so a panicking handler
|
||||
//! marks the job failed instead of taking down the worker task.
|
||||
|
||||
use std::panic::AssertUnwindSafe;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use chrono::{DateTime, Datelike, NaiveTime, TimeZone, Timelike, Utc};
|
||||
use chrono_tz::Tz;
|
||||
use futures_util::FutureExt;
|
||||
use serde_json::json;
|
||||
use sqlx::PgPool;
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::crawler::content::SyncOutcome;
|
||||
use crate::crawler::jobs::{self, JobPayload, Lease, KIND_SYNC_CHAPTER_CONTENT};
|
||||
use crate::crawler::pipeline;
|
||||
|
||||
/// Fixed `pg_try_advisory_lock` key: the ASCII bytes of "MANGALRD" read as
/// a big-endian i64 (`0x4D414E47414C5244`). Hardcoded so every replica
/// agrees on the lock identity without consulting config.
pub const CRON_LOCK_KEY: i64 = i64::from_be_bytes(*b"MANGALRD");

/// `crawler_state` key under which the time of the last cron tick is kept.
const STATE_KEY_LAST_TICK: &str = "last_metadata_tick_at";
|
||||
|
||||
/// Seam for the metadata pass that the cron runs once per tick. A trait so
/// tests can substitute a stub for the real pipeline.
#[async_trait]
pub trait MetadataPass: Send + Sync {
    /// Run one metadata pass. The cron logs the returned stats on success
    /// and logs the error on failure; in both cases the tick continues.
    async fn run(&self) -> anyhow::Result<pipeline::MetadataStats>;
}
|
||||
|
||||
/// Seam through which a worker hands a leased job payload to whatever
/// actually fetches the chapter. A trait so tests can substitute a stub.
#[async_trait]
pub trait ChapterDispatcher: Send + Sync {
    /// Process one job payload. The worker acks the job `done` for
    /// `Fetched` / `Skipped`, releases it and raises the session-expired
    /// flag for `SessionExpired`, and acks it failed on `Err` or panic.
    async fn dispatch(&self, payload: JobPayload) -> anyhow::Result<SyncOutcome>;
}
|
||||
|
||||
/// Configuration for [`spawn`]. Use `None` for `metadata_pass` to disable
/// the cron entirely (worker-pool-only mode — useful when only the
/// bookmark-triggered enqueue path is wanted).
pub struct DaemonConfig {
    /// Metadata pass run by the cron on every tick; `None` disables the cron.
    pub metadata_pass: Option<Arc<dyn MetadataPass>>,
    /// Handler every worker uses to process a leased job.
    pub dispatcher: Arc<dyn ChapterDispatcher>,
    /// Number of worker tasks; [`spawn`] starts at least one.
    pub chapter_workers: usize,
    /// Wall-clock time of the daily cron slot, interpreted in `tz`.
    pub daily_at: NaiveTime,
    /// Time zone in which `daily_at` is interpreted.
    pub tz: Tz,
    /// Passed to the done-job reaper at the end of each tick.
    pub retention_days: u32,
    /// Sticky flag: workers idle while it is set; a worker sets it when a
    /// dispatch reports `SessionExpired`.
    pub session_expired: Arc<AtomicBool>,
    /// Tasks that should run alongside the cron + workers and be cancelled
    /// on shutdown. Used to hand the daemon ownership of the browser
    /// manager's idle reaper.
    pub extra_tasks: Vec<tokio::task::JoinHandle<()>>,
}
|
||||
|
||||
/// Handle returned by [`spawn`]; owns everything needed to stop the daemon.
pub struct DaemonHandle {
    /// Token whose cancellation tells the cron and workers to stop.
    cancel: CancellationToken,
    /// The cron task (if enabled) and all worker tasks.
    join: JoinSet<()>,
    /// Caller-supplied tasks awaited after the cron and workers on shutdown.
    extra: Vec<tokio::task::JoinHandle<()>>,
}
|
||||
|
||||
impl DaemonHandle {
|
||||
/// Trigger shutdown and await all worker / cron / extra tasks.
|
||||
pub async fn shutdown(mut self) {
|
||||
self.cancel.cancel();
|
||||
while self.join.join_next().await.is_some() {}
|
||||
for task in self.extra.drain(..) {
|
||||
let _ = task.await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Cancellation token that drives shutdown — exposed so callers
|
||||
/// (`app::spawn_crawler_daemon`) can hand the same token to auxiliary
|
||||
/// tasks (e.g. the BrowserManager idle reaper) and have them stop on
|
||||
/// the daemon's signal.
|
||||
pub fn cancel_token(&self) -> CancellationToken {
|
||||
self.cancel.clone()
|
||||
}
|
||||
}
|
||||
|
||||
/// Spawn the daemon. Returns immediately; tasks run in the background.
|
||||
/// Pass an external [`CancellationToken`] so auxiliary tasks (e.g. a
|
||||
/// BrowserManager idle reaper) can share the same shutdown signal —
|
||||
/// typically created in the caller, cloned into both spawns.
|
||||
pub fn spawn(pool: PgPool, cancel: CancellationToken, cfg: DaemonConfig) -> DaemonHandle {
|
||||
let mut join = JoinSet::new();
|
||||
|
||||
let DaemonConfig {
|
||||
metadata_pass,
|
||||
dispatcher,
|
||||
chapter_workers,
|
||||
daily_at,
|
||||
tz,
|
||||
retention_days,
|
||||
session_expired,
|
||||
extra_tasks,
|
||||
} = cfg;
|
||||
|
||||
if let Some(metadata) = metadata_pass {
|
||||
let ctx = CronContext {
|
||||
pool: pool.clone(),
|
||||
cancel: cancel.clone(),
|
||||
daily_at,
|
||||
tz,
|
||||
retention_days,
|
||||
metadata,
|
||||
};
|
||||
join.spawn(async move { ctx.run().await });
|
||||
} else {
|
||||
tracing::info!("crawler daemon: no metadata_pass — cron disabled");
|
||||
}
|
||||
|
||||
for worker_id in 0..chapter_workers.max(1) {
|
||||
let ctx = WorkerContext {
|
||||
pool: pool.clone(),
|
||||
cancel: cancel.clone(),
|
||||
dispatcher: Arc::clone(&dispatcher),
|
||||
session_expired: Arc::clone(&session_expired),
|
||||
id: worker_id,
|
||||
};
|
||||
join.spawn(async move { ctx.run().await });
|
||||
}
|
||||
|
||||
DaemonHandle {
|
||||
cancel,
|
||||
join,
|
||||
extra: extra_tasks,
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Cron
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// State owned by the cron task.
struct CronContext {
    pool: PgPool,
    /// Cancelling this token ends the cron loop while it sleeps.
    cancel: CancellationToken,
    /// Wall-clock time of the daily slot, interpreted in `tz`.
    daily_at: NaiveTime,
    tz: Tz,
    /// Passed to the done-job reaper at the end of each tick.
    retention_days: u32,
    /// Metadata pass invoked once per tick.
    metadata: Arc<dyn MetadataPass>,
}
|
||||
|
||||
impl CronContext {
|
||||
async fn run(self) {
|
||||
// On startup, fire immediately if the most recent slot has already
|
||||
// passed and we never recorded a tick for it.
|
||||
let now = Utc::now();
|
||||
let mut catchup = match read_last_tick(&self.pool).await {
|
||||
Ok(Some(last)) => previous_fire(now, self.daily_at, self.tz) > last,
|
||||
Ok(None) => true,
|
||||
Err(e) => {
|
||||
tracing::warn!(?e, "cron: read_last_tick failed; assuming no catch-up");
|
||||
false
|
||||
}
|
||||
};
|
||||
|
||||
loop {
|
||||
if catchup {
|
||||
tracing::info!("cron: catch-up tick (missed scheduled slot)");
|
||||
self.run_tick().await;
|
||||
catchup = false;
|
||||
continue;
|
||||
}
|
||||
// Recompute next-fire from now() each iteration so clock jumps
|
||||
// (NTP step, suspend/resume) don't strand us on a stale instant.
|
||||
let next = next_fire(Utc::now(), self.daily_at, self.tz);
|
||||
let wait = (next - Utc::now()).to_std().unwrap_or(Duration::ZERO);
|
||||
tracing::info!(
|
||||
next_fire_utc = %next.to_rfc3339(),
|
||||
wait_seconds = wait.as_secs(),
|
||||
"cron: sleeping until next slot"
|
||||
);
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(wait) => {}
|
||||
_ = self.cancel.cancelled() => {
|
||||
tracing::info!("cron: shutdown");
|
||||
return;
|
||||
}
|
||||
}
|
||||
self.run_tick().await;
|
||||
}
|
||||
}
|
||||
|
||||
async fn run_tick(&self) {
|
||||
let mut conn = match self.pool.acquire().await {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
tracing::error!(?e, "cron: acquire conn failed; skipping tick");
|
||||
return;
|
||||
}
|
||||
};
|
||||
// pg_try_advisory_lock is session-scoped — we must hold the same
|
||||
// connection for the unlock or the call silently no-ops on a
|
||||
// different connection from the pool.
|
||||
let acquired: bool = sqlx::query_scalar("SELECT pg_try_advisory_lock($1)")
|
||||
.bind(CRON_LOCK_KEY)
|
||||
.fetch_one(&mut *conn)
|
||||
.await
|
||||
.unwrap_or(false);
|
||||
if !acquired {
|
||||
tracing::info!("cron: tick skipped — another replica holds the lock");
|
||||
return;
|
||||
}
|
||||
|
||||
match self.metadata.run().await {
|
||||
Ok(stats) => tracing::info!(?stats, "cron: metadata pass done"),
|
||||
Err(e) => tracing::error!(?e, "cron: metadata pass failed"),
|
||||
}
|
||||
|
||||
match pipeline::enqueue_bookmarked_pending(&self.pool).await {
|
||||
Ok(summary) => tracing::info!(?summary, "cron: enqueued bookmarked-pending"),
|
||||
Err(e) => tracing::error!(?e, "cron: enqueue_bookmarked_pending failed"),
|
||||
}
|
||||
|
||||
match jobs::reap_done(&self.pool, self.retention_days).await {
|
||||
Ok(n) => tracing::info!(reaped = n, "cron: done-job reaper finished"),
|
||||
Err(e) => tracing::error!(?e, "cron: done-job reaper failed"),
|
||||
}
|
||||
|
||||
if let Err(e) = write_last_tick(&self.pool, Utc::now()).await {
|
||||
tracing::warn!(?e, "cron: persist last_metadata_tick_at failed");
|
||||
}
|
||||
|
||||
let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
|
||||
.bind(CRON_LOCK_KEY)
|
||||
.execute(&mut *conn)
|
||||
.await;
|
||||
drop(conn);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Workers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// State owned by one worker task.
struct WorkerContext {
    pool: PgPool,
    /// Cancelling this token ends the worker loop.
    cancel: CancellationToken,
    /// Handler invoked for every leased job.
    dispatcher: Arc<dyn ChapterDispatcher>,
    /// Shared sticky flag; the worker idles while it is set.
    session_expired: Arc<AtomicBool>,
    /// Worker index, used only as a log field.
    id: usize,
}
|
||||
|
||||
impl WorkerContext {
|
||||
async fn run(self) {
|
||||
loop {
|
||||
if self.cancel.is_cancelled() {
|
||||
tracing::info!(worker = self.id, "worker: shutdown");
|
||||
return;
|
||||
}
|
||||
if self.session_expired.load(Ordering::Acquire) {
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(Duration::from_secs(30)) => continue,
|
||||
_ = self.cancel.cancelled() => return,
|
||||
}
|
||||
}
|
||||
let leases = match jobs::lease(
|
||||
&self.pool,
|
||||
Some(KIND_SYNC_CHAPTER_CONTENT),
|
||||
1,
|
||||
Duration::from_secs(60),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
tracing::warn!(worker = self.id, ?e, "worker: lease failed");
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(Duration::from_secs(5)) => continue,
|
||||
_ = self.cancel.cancelled() => return,
|
||||
}
|
||||
}
|
||||
};
|
||||
let Some(lease) = leases.into_iter().next() else {
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(Duration::from_secs(1)) => continue,
|
||||
_ = self.cancel.cancelled() => return,
|
||||
}
|
||||
};
|
||||
self.process_lease(lease).await;
|
||||
}
|
||||
}
|
||||
|
||||
async fn process_lease(&self, lease: Lease) {
|
||||
// Consumer-side dedup safety net: if the chapter already has pages
|
||||
// (because a force-refetch race or a job that was re-enqueued
|
||||
// after a previous one finished), ack done without re-fetching.
|
||||
if let JobPayload::SyncChapterContent { chapter_id, .. } = &lease.payload {
|
||||
let page_count: Option<i32> = sqlx::query_scalar(
|
||||
"SELECT page_count FROM chapters WHERE id = $1",
|
||||
)
|
||||
.bind(chapter_id)
|
||||
.fetch_optional(&self.pool)
|
||||
.await
|
||||
.ok()
|
||||
.flatten();
|
||||
if matches!(page_count, Some(n) if n > 0) {
|
||||
let _ = jobs::ack_done(&self.pool, lease.id).await;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
let outcome = AssertUnwindSafe(self.dispatcher.dispatch(lease.payload.clone()))
|
||||
.catch_unwind()
|
||||
.await;
|
||||
match outcome {
|
||||
Ok(Ok(SyncOutcome::Fetched { .. } | SyncOutcome::Skipped)) => {
|
||||
let _ = jobs::ack_done(&self.pool, lease.id).await;
|
||||
}
|
||||
Ok(Ok(SyncOutcome::SessionExpired)) => {
|
||||
tracing::error!(
|
||||
worker = self.id,
|
||||
lease_id = %lease.id,
|
||||
"session expired — workers will idle until restart"
|
||||
);
|
||||
self.session_expired.store(true, Ordering::Release);
|
||||
let _ = jobs::release(&self.pool, lease.id).await;
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
tracing::warn!(
|
||||
worker = self.id,
|
||||
lease_id = %lease.id,
|
||||
error = ?e,
|
||||
"worker: dispatch error — ack failed"
|
||||
);
|
||||
let _ = jobs::ack_failed(
|
||||
&self.pool,
|
||||
lease.id,
|
||||
&format!("{e:#}"),
|
||||
lease.attempts,
|
||||
lease.max_attempts,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
Err(_panic) => {
|
||||
tracing::error!(
|
||||
worker = self.id,
|
||||
lease_id = %lease.id,
|
||||
"worker: dispatcher panicked — ack failed"
|
||||
);
|
||||
let _ = jobs::ack_failed(
|
||||
&self.pool,
|
||||
lease.id,
|
||||
"worker panicked",
|
||||
lease.attempts,
|
||||
lease.max_attempts,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Cron timing primitives
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Compute the next UTC instant when `daily_at` (interpreted in `tz`) will
|
||||
/// fire, strictly after `now`. Handles DST gaps (spring-forward) by
|
||||
/// advancing past the gap; on DST overlap (fall-back) picks the later
|
||||
/// instant so the job runs once, not twice.
|
||||
pub fn next_fire(now: DateTime<Utc>, daily_at: NaiveTime, tz: Tz) -> DateTime<Utc> {
|
||||
let now_local = now.with_timezone(&tz);
|
||||
// Start with today's slot in the local TZ.
|
||||
let mut candidate = local_at(now_local.date_naive(), daily_at, tz);
|
||||
// If today's slot is in the past (or now), roll forward day-by-day.
|
||||
while candidate <= now {
|
||||
let next_day = candidate
|
||||
.with_timezone(&tz)
|
||||
.date_naive()
|
||||
.succ_opt()
|
||||
.unwrap_or_else(|| {
|
||||
// Defensive: succ_opt only fails at chrono's max date.
|
||||
chrono::NaiveDate::from_ymd_opt(
|
||||
candidate.year(),
|
||||
candidate.month(),
|
||||
candidate.day(),
|
||||
)
|
||||
.expect("valid date")
|
||||
});
|
||||
candidate = local_at(next_day, daily_at, tz);
|
||||
}
|
||||
candidate
|
||||
}
|
||||
|
||||
/// The most recent fire instant at or before `now`. Used to detect missed
|
||||
/// slots after a restart.
|
||||
pub fn previous_fire(now: DateTime<Utc>, daily_at: NaiveTime, tz: Tz) -> DateTime<Utc> {
|
||||
let now_local = now.with_timezone(&tz);
|
||||
let today = local_at(now_local.date_naive(), daily_at, tz);
|
||||
if today <= now {
|
||||
return today;
|
||||
}
|
||||
let yesterday = now_local
|
||||
.date_naive()
|
||||
.pred_opt()
|
||||
.expect("a day before now");
|
||||
local_at(yesterday, daily_at, tz)
|
||||
}
|
||||
|
||||
/// Resolve a local date+time to a UTC instant in `tz`, navigating DST
|
||||
/// edges deterministically:
|
||||
/// - `LocalResult::Single` → that instant.
|
||||
/// - `LocalResult::Ambiguous(_, latest)` → the later instant (fall-back
|
||||
/// hour). Picking latest means a daily job fires once across the
|
||||
/// repeated hour, not twice.
|
||||
/// - `LocalResult::None` → spring-forward gap. Advance the local time
|
||||
/// by 1 minute and try again, repeating up to 120 times (so the worst
|
||||
/// case is still well inside an hour-long gap).
|
||||
fn local_at(date: chrono::NaiveDate, time: NaiveTime, tz: Tz) -> DateTime<Utc> {
|
||||
use chrono::LocalResult;
|
||||
for offset_minutes in 0..120 {
|
||||
let mut t = time;
|
||||
if offset_minutes > 0 {
|
||||
let added = chrono::NaiveTime::from_num_seconds_from_midnight_opt(
|
||||
((time.num_seconds_from_midnight() as i64 + offset_minutes * 60) % 86_400) as u32,
|
||||
0,
|
||||
)
|
||||
.unwrap_or(time);
|
||||
t = added;
|
||||
}
|
||||
let naive = date.and_time(t);
|
||||
match tz.from_local_datetime(&naive) {
|
||||
LocalResult::Single(dt) => return dt.with_timezone(&Utc),
|
||||
LocalResult::Ambiguous(_, latest) => return latest.with_timezone(&Utc),
|
||||
LocalResult::None => continue,
|
||||
}
|
||||
}
|
||||
// Should be unreachable — DST gaps are always less than an hour.
|
||||
Utc.from_utc_datetime(&date.and_time(time))
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// crawler_state I/O
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
async fn read_last_tick(pool: &PgPool) -> sqlx::Result<Option<DateTime<Utc>>> {
|
||||
let row: Option<serde_json::Value> = sqlx::query_scalar(
|
||||
"SELECT value FROM crawler_state WHERE key = $1",
|
||||
)
|
||||
.bind(STATE_KEY_LAST_TICK)
|
||||
.fetch_optional(pool)
|
||||
.await?;
|
||||
Ok(row.and_then(|v| {
|
||||
v.get("at")
|
||||
.and_then(|s| s.as_str())
|
||||
.and_then(|s| DateTime::parse_from_rfc3339(s).ok())
|
||||
.map(|dt| dt.with_timezone(&Utc))
|
||||
}))
|
||||
}
|
||||
|
||||
async fn write_last_tick(pool: &PgPool, at: DateTime<Utc>) -> sqlx::Result<()> {
|
||||
sqlx::query(
|
||||
"INSERT INTO crawler_state (key, value, updated_at) \
|
||||
VALUES ($1, $2, now()) \
|
||||
ON CONFLICT (key) DO UPDATE \
|
||||
SET value = EXCLUDED.value, updated_at = now()",
|
||||
)
|
||||
.bind(STATE_KEY_LAST_TICK)
|
||||
.bind(json!({ "at": at.to_rfc3339() }))
|
||||
.execute(pool)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test helpers (not gated on cfg(test) — integration tests in tests/ dir
|
||||
// need them too).
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
pub mod test_support {
|
||||
//! Lightweight stubs the daemon tests use. Public because integration
|
||||
//! tests live outside this module.
|
||||
use super::*;
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
|
||||
pub struct CountingMetadataPass {
|
||||
pub count: AtomicUsize,
|
||||
}
|
||||
|
||||
impl Default for CountingMetadataPass {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
count: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl MetadataPass for CountingMetadataPass {
|
||||
async fn run(&self) -> anyhow::Result<pipeline::MetadataStats> {
|
||||
self.count.fetch_add(1, Ordering::AcqRel);
|
||||
Ok(pipeline::MetadataStats::default())
|
||||
}
|
||||
}
|
||||
|
||||
pub type DispatchFn = Arc<
|
||||
dyn Fn(JobPayload) -> futures_util::future::BoxFuture<'static, anyhow::Result<SyncOutcome>>
|
||||
+ Send
|
||||
+ Sync,
|
||||
>;
|
||||
|
||||
pub struct StubDispatcher {
|
||||
pub handler: DispatchFn,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ChapterDispatcher for StubDispatcher {
|
||||
async fn dispatch(&self, payload: JobPayload) -> anyhow::Result<SyncOutcome> {
|
||||
(self.handler)(payload).await
|
||||
}
|
||||
}
|
||||
|
||||
pub fn always_done() -> Arc<StubDispatcher> {
|
||||
Arc::new(StubDispatcher {
|
||||
handler: Arc::new(|_| Box::pin(async { Ok(SyncOutcome::Fetched { pages: 1 }) })),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn panicking_dispatcher() -> Arc<StubDispatcher> {
|
||||
Arc::new(StubDispatcher {
|
||||
handler: Arc::new(|_| Box::pin(async { panic!("intentional dispatcher panic") })),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Duration as ChronoDuration;

    /// UTC instant at the given date, hour and minute (seconds = 0).
    fn dt_utc(y: i32, mo: u32, d: u32, h: u32, mi: u32) -> DateTime<Utc> {
        Utc.with_ymd_and_hms(y, mo, d, h, mi, 0).unwrap()
    }

    #[test]
    fn next_fire_in_utc_at_midnight_advances_one_day() {
        let now = dt_utc(2026, 5, 25, 12, 0); // noon UTC
        let at = NaiveTime::from_hms_opt(0, 0, 0).unwrap();
        let next = next_fire(now, at, Tz::UTC);
        // Next midnight is May 26 00:00 UTC.
        assert_eq!(next, dt_utc(2026, 5, 26, 0, 0));
    }

    #[test]
    fn next_fire_before_today_slot_returns_today() {
        let now = dt_utc(2026, 5, 25, 23, 0); // 23:00 UTC
        let at = NaiveTime::from_hms_opt(23, 30, 0).unwrap();
        let next = next_fire(now, at, Tz::UTC);
        assert_eq!(next, dt_utc(2026, 5, 25, 23, 30));
    }

    #[test]
    fn next_fire_skips_spring_forward_gap_in_europe_berlin() {
        // 2024-03-31: clocks jump 02:00 -> 03:00 in Berlin (CET -> CEST),
        // so 02:30 local does not exist that morning. `now` is
        // 2024-03-31 00:30 UTC = 01:30 CET, just before the gap.
        let now = dt_utc(2024, 3, 31, 0, 30);
        let at = NaiveTime::from_hms_opt(2, 30, 0).unwrap();
        let next = next_fire(now, at, Tz::Europe__Berlin);
        // The one-minute search walks 02:30 -> 03:00 local, the first
        // valid local time after the gap. 03:00 CEST (UTC+2) is 01:00 UTC.
        assert!(next > now);
        assert!(next < now + ChronoDuration::hours(1));
        assert_eq!(next, dt_utc(2024, 3, 31, 1, 0));
    }

    #[test]
    fn next_fire_on_fall_back_picks_later_instant() {
        // 2024-10-27: clocks jump 03:00 -> 02:00 (CEST -> CET) in Berlin.
        // 02:30 happens twice on that day. We pick the later one.
        let now = dt_utc(2024, 10, 26, 12, 0); // day before, noon UTC
        let at = NaiveTime::from_hms_opt(2, 30, 0).unwrap();
        let next = next_fire(now, at, Tz::Europe__Berlin);
        // First 02:30 local is 00:30 UTC (CEST = UTC+2).
        // Second 02:30 local is 01:30 UTC (CET = UTC+1).
        // We expect the later instant: 01:30 UTC on 2024-10-27.
        assert_eq!(next, dt_utc(2024, 10, 27, 1, 30));
    }

    #[test]
    fn previous_fire_returns_today_when_now_is_after_slot() {
        let now = dt_utc(2026, 5, 25, 12, 0); // noon UTC
        let at = NaiveTime::from_hms_opt(0, 0, 0).unwrap();
        let prev = previous_fire(now, at, Tz::UTC);
        assert_eq!(prev, dt_utc(2026, 5, 25, 0, 0));
    }

    #[test]
    fn previous_fire_returns_yesterday_when_now_is_before_today_slot() {
        let now = dt_utc(2026, 5, 25, 8, 0); // 08:00 UTC
        let at = NaiveTime::from_hms_opt(23, 30, 0).unwrap();
        let prev = previous_fire(now, at, Tz::UTC);
        assert_eq!(prev, dt_utc(2026, 5, 24, 23, 30));
    }
}
|
||||
@@ -5,11 +5,11 @@
|
||||
//! `leased_until`, and ack by transitioning to `done` (or backoff /
|
||||
//! `dead`). Handlers are idempotent so a crash mid-run is recoverable
|
||||
//! by replay.
|
||||
//!
|
||||
//! Scaffold only — the actual queue wrapper and handler dispatch land
|
||||
//! once we have the first `Source` impl exercising the pipeline.
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sqlx::PgPool;
|
||||
use uuid::Uuid;
|
||||
|
||||
use super::source::DiscoverMode;
|
||||
@@ -53,3 +53,217 @@ pub enum JobState {
|
||||
Failed,
|
||||
Dead,
|
||||
}
|
||||
|
||||
/// Kind discriminator stored in `payload->>'kind'`. Public so callers
/// (daemon worker, bookmark hook) can filter `lease()` to a single kind
/// without re-spelling the literal. `lease()` compares its `kind_filter`
/// argument against that JSON field verbatim.
pub const KIND_SYNC_CHAPTER_CONTENT: &str = "sync_chapter_content";
|
||||
|
||||
/// Result of [`enqueue`].
#[derive(Debug)]
pub enum EnqueueResult {
    /// A new row was inserted; carries the id returned by the insert.
    Inserted(Uuid),
    /// The insert hit a conflict and no row was written.
    Skipped,
}
|
||||
|
||||
/// One job handed out by [`lease`].
#[derive(Debug, Clone)]
pub struct Lease {
    /// Id of the `crawler_jobs` row.
    pub id: Uuid,
    /// Payload decoded from the row's JSON `payload` column.
    pub payload: JobPayload,
    /// Attempt count after this lease's increment (first lease == 1).
    pub attempts: i32,
    /// The row's `max_attempts` value.
    pub max_attempts: i32,
}
|
||||
|
||||
/// Retry delay used by `ack_failed`: 60s doubled once per previous
/// attempt, capped at one hour.
///
/// `attempts` is the post-increment count reported by `lease()`, so the
/// first failure (`attempts == 1`) waits 60s, the second 120s, and so on.
/// Values below 1 are treated like 1.
fn backoff_for(attempts: i32) -> Duration {
    const BASE_SECS: u64 = 60;
    const CAP_SECS: u64 = 3600;
    const MAX_DOUBLINGS: u32 = 20;

    // Negative results fail the conversion and fall back to zero doublings.
    let doublings = u32::try_from(attempts.saturating_sub(1))
        .unwrap_or(0)
        .min(MAX_DOUBLINGS);
    let uncapped = BASE_SECS.saturating_mul(2u64.pow(doublings));
    Duration::from_secs(uncapped.min(CAP_SECS))
}
|
||||
|
||||
/// Insert a new pending job. For `SyncChapterContent` payloads the
|
||||
/// partial unique index `crawler_jobs_chapter_content_dedup_idx` blocks
|
||||
/// a second `(pending|running)` insert per chapter_id, returning
|
||||
/// `Skipped`. The slot frees again once the previous job leaves the
|
||||
/// in-flight states (done/failed/dead), so a re-enqueue after a force
|
||||
/// refetch succeeds.
|
||||
pub async fn enqueue(pool: &PgPool, payload: &JobPayload) -> sqlx::Result<EnqueueResult> {
|
||||
let json = serde_json::to_value(payload).expect("JobPayload is always serializable");
|
||||
let id: Option<Uuid> = sqlx::query_scalar(
|
||||
"INSERT INTO crawler_jobs (payload) VALUES ($1) \
|
||||
ON CONFLICT DO NOTHING RETURNING id",
|
||||
)
|
||||
.bind(json)
|
||||
.fetch_optional(pool)
|
||||
.await?;
|
||||
Ok(match id {
|
||||
Some(id) => EnqueueResult::Inserted(id),
|
||||
None => EnqueueResult::Skipped,
|
||||
})
|
||||
}
|
||||
|
||||
/// Lease up to `max` rows whose `state` is `pending`, or `running` with
|
||||
/// an expired `leased_until` (the crashed-worker recovery path). The
|
||||
/// inner CTE uses `FOR UPDATE SKIP LOCKED` so concurrent leasers don't
|
||||
/// block each other and each row is handed to exactly one worker.
|
||||
///
|
||||
/// `kind_filter` matches against `payload->>'kind'`; `None` means
|
||||
/// any kind.
|
||||
pub async fn lease(
|
||||
pool: &PgPool,
|
||||
kind_filter: Option<&str>,
|
||||
max: i64,
|
||||
lease_duration: Duration,
|
||||
) -> sqlx::Result<Vec<Lease>> {
|
||||
let lease_ms: i64 = lease_duration.as_millis().min(i64::MAX as u128) as i64;
|
||||
let rows: Vec<(Uuid, serde_json::Value, i32, i32)> = sqlx::query_as(
|
||||
r#"
|
||||
WITH leased AS (
|
||||
SELECT id FROM crawler_jobs
|
||||
WHERE (state = 'pending' OR (state = 'running' AND leased_until < now()))
|
||||
AND scheduled_at <= now()
|
||||
AND ($1::text IS NULL OR payload->>'kind' = $1)
|
||||
ORDER BY scheduled_at
|
||||
LIMIT $2
|
||||
FOR UPDATE SKIP LOCKED
|
||||
)
|
||||
UPDATE crawler_jobs j
|
||||
SET state = 'running',
|
||||
attempts = j.attempts + 1,
|
||||
leased_until = now() + ($3::bigint || ' milliseconds')::interval,
|
||||
updated_at = now()
|
||||
FROM leased l
|
||||
WHERE j.id = l.id
|
||||
RETURNING j.id, j.payload, j.attempts, j.max_attempts
|
||||
"#,
|
||||
)
|
||||
.bind(kind_filter)
|
||||
.bind(max)
|
||||
.bind(lease_ms)
|
||||
.fetch_all(pool)
|
||||
.await?;
|
||||
|
||||
let mut leases = Vec::with_capacity(rows.len());
|
||||
for (id, payload_json, attempts, max_attempts) in rows {
|
||||
let payload: JobPayload = serde_json::from_value(payload_json).map_err(|e| {
|
||||
sqlx::Error::Decode(format!("invalid JobPayload JSON for job {id}: {e}").into())
|
||||
})?;
|
||||
leases.push(Lease {
|
||||
id,
|
||||
payload,
|
||||
attempts,
|
||||
max_attempts,
|
||||
});
|
||||
}
|
||||
Ok(leases)
|
||||
}
|
||||
|
||||
/// Mark a leased job as successfully completed.
|
||||
pub async fn ack_done(pool: &PgPool, lease_id: Uuid) -> sqlx::Result<()> {
|
||||
sqlx::query(
|
||||
"UPDATE crawler_jobs \
|
||||
SET state = 'done', leased_until = NULL, updated_at = now() \
|
||||
WHERE id = $1",
|
||||
)
|
||||
.bind(lease_id)
|
||||
.execute(pool)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Mark a leased job as failed. If the current attempt count has reached
|
||||
/// `max_attempts` the job is terminally dead and stops retrying;
|
||||
/// otherwise it goes back to `pending` with `scheduled_at` pushed into
|
||||
/// the future by the exponential backoff.
|
||||
pub async fn ack_failed(
|
||||
pool: &PgPool,
|
||||
lease_id: Uuid,
|
||||
error: &str,
|
||||
attempts: i32,
|
||||
max_attempts: i32,
|
||||
) -> sqlx::Result<()> {
|
||||
if attempts >= max_attempts {
|
||||
sqlx::query(
|
||||
"UPDATE crawler_jobs \
|
||||
SET state = 'dead', last_error = $2, leased_until = NULL, updated_at = now() \
|
||||
WHERE id = $1",
|
||||
)
|
||||
.bind(lease_id)
|
||||
.bind(error)
|
||||
.execute(pool)
|
||||
.await?;
|
||||
} else {
|
||||
let backoff_ms: i64 = backoff_for(attempts).as_millis().min(i64::MAX as u128) as i64;
|
||||
sqlx::query(
|
||||
"UPDATE crawler_jobs \
|
||||
SET state = 'pending', last_error = $2, leased_until = NULL, \
|
||||
scheduled_at = now() + ($3::bigint || ' milliseconds')::interval, \
|
||||
updated_at = now() \
|
||||
WHERE id = $1",
|
||||
)
|
||||
.bind(lease_id)
|
||||
.bind(error)
|
||||
.bind(backoff_ms)
|
||||
.execute(pool)
|
||||
.await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Return a leased job to `pending` without burning a retry attempt.
|
||||
/// Used on graceful shutdown and on session-expired aborts where the
|
||||
/// failure isn't the job's fault.
|
||||
pub async fn release(pool: &PgPool, lease_id: Uuid) -> sqlx::Result<()> {
|
||||
sqlx::query(
|
||||
"UPDATE crawler_jobs \
|
||||
SET state = 'pending', leased_until = NULL, \
|
||||
attempts = GREATEST(0, attempts - 1), updated_at = now() \
|
||||
WHERE id = $1",
|
||||
)
|
||||
.bind(lease_id)
|
||||
.execute(pool)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Delete `done` jobs whose `updated_at` is older than `retention_days`
|
||||
/// days. `0` disables the reaper without touching the table. Returns the
|
||||
/// number of rows removed.
|
||||
pub async fn reap_done(pool: &PgPool, retention_days: u32) -> sqlx::Result<u64> {
|
||||
if retention_days == 0 {
|
||||
return Ok(0);
|
||||
}
|
||||
let result = sqlx::query(
|
||||
"DELETE FROM crawler_jobs \
|
||||
WHERE state = 'done' \
|
||||
AND updated_at < now() - ($1::bigint || ' days')::interval",
|
||||
)
|
||||
.bind(retention_days as i64)
|
||||
.execute(pool)
|
||||
.await?;
|
||||
Ok(result.rows_affected())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn backoff_grows_exponentially_and_caps_at_one_hour() {
        // (attempts, expected backoff in seconds)
        let expectations: [(i32, u64); 10] = [
            // attempts == 1 → 60s, doubling each step.
            (1, 60),
            (2, 120),
            (3, 240),
            (4, 480),
            (5, 960),
            (6, 1920),
            // 7th: 60 * 64 = 3840 → capped to 3600.
            (7, 3600),
            (20, 3600),
            // Garbage / zero / negatives stay sane.
            (0, 60),
            (-5, 60),
        ];

        for &(attempts, secs) in expectations.iter() {
            assert_eq!(
                backoff_for(attempts),
                Duration::from_secs(secs),
                "attempts = {attempts}"
            );
        }
    }
}
|
||||
|
||||
@@ -14,7 +14,12 @@
|
||||
//! - [`diff`]: change detection — new / updated / dropped semantics.
|
||||
|
||||
pub mod browser;
|
||||
pub mod browser_manager;
|
||||
pub mod content;
|
||||
pub mod daemon;
|
||||
pub mod diff;
|
||||
pub mod jobs;
|
||||
pub mod pipeline;
|
||||
pub mod rate_limit;
|
||||
pub mod session;
|
||||
pub mod source;
|
||||
|
||||
347
backend/src/crawler/pipeline.rs
Normal file
347
backend/src/crawler/pipeline.rs
Normal file
@@ -0,0 +1,347 @@
|
||||
//! Crawler pipeline — the reusable metadata pass and the enqueue helpers
|
||||
//! that fan out chapter-content work. Shared between the daemon (cron tick)
|
||||
//! and the CLI (`bin/crawler.rs`) so behavior stays in lockstep.
|
||||
|
||||
use anyhow::Context;
|
||||
use sqlx::PgPool;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::crawler::browser_manager::BrowserManager;
|
||||
use crate::crawler::jobs::{self, EnqueueResult, JobPayload};
|
||||
use crate::crawler::rate_limit::HostRateLimiters;
|
||||
use crate::crawler::source::target::TargetSource;
|
||||
use crate::crawler::source::{DiscoverMode, FetchContext, Source};
|
||||
use crate::repo;
|
||||
use crate::storage::Storage;
|
||||
|
||||
/// Coarse counters surfaced for logging at the end of a metadata pass.
#[derive(Debug, Default, Clone, Copy)]
pub struct MetadataStats {
    /// Number of manga references returned by the discover step.
    pub discovered: usize,
    /// Number of manga whose metadata was fetched and upserted successfully.
    pub upserted: usize,
    /// Number of cover images downloaded and stored during the pass.
    pub covers_fetched: usize,
    /// Number of manga skipped because the fetch or the upsert failed.
    pub mangas_failed: usize,
}
|
||||
|
||||
/// Runs the discover → fetch → upsert → cover → chapter-list-diff pipeline
/// for the target source. Pure metadata; chapter content is enqueued as
/// separate `SyncChapterContent` jobs by the caller after this returns.
///
/// `limit == 0` means no cap (full backfill). `skip_chapters == true` is
/// the "metadata-only" mode (parser doesn't extract chapters, and
/// `sync_manga_chapters` is skipped — otherwise an empty chapter list
/// would soft-drop existing rows).
///
/// Failures while acquiring the browser lease, registering the source, or
/// discovering the manga list abort the pass with an error. Per-manga
/// failures (fetch, upsert, cover download, chapter sync) are logged and
/// the loop moves on; fetch and upsert failures are counted in
/// `MetadataStats::mangas_failed`.
#[allow(clippy::too_many_arguments)]
pub async fn run_metadata_pass(
    browser_manager: &BrowserManager,
    db: &PgPool,
    storage: &dyn Storage,
    http: &reqwest::Client,
    rate: &HostRateLimiters,
    start_url: &str,
    limit: usize,
    skip_chapters: bool,
) -> anyhow::Result<MetadataStats> {
    // The browser lease is held for the whole pass and dropped explicitly
    // at the end.
    let lease = browser_manager
        .acquire()
        .await
        .context("acquire browser lease for metadata pass")?;
    let browser_ref: &chromiumoxide::Browser = &lease;

    let source = {
        let s = TargetSource::new(start_url.to_string());
        if skip_chapters {
            s.without_chapter_parsing()
        } else {
            s
        }
    };
    let ctx = FetchContext {
        browser: browser_ref,
        rate,
    };

    // Register the source row; falls back to the raw start URL when no
    // origin can be derived from it.
    let source_id = source.id();
    repo::crawler::ensure_source(
        db,
        source_id,
        "Target Site",
        &origin_of(start_url).unwrap_or_else(|| start_url.to_string()),
    )
    .await
    .context("ensure_source")?;

    // Captured before discovery and handed to the drop pass below.
    // NOTE(review): presumably the cutoff for "not seen during this run" —
    // confirm against `repo::crawler::mark_dropped_mangas`.
    let run_started_at = chrono::Utc::now();
    // `limit == 0` → `None`, i.e. no cap on discovery.
    let max_refs = (limit > 0).then_some(limit);

    tracing::info!(?max_refs, "discovering manga list");
    let refs = source
        .discover(&ctx, DiscoverMode::Backfill, max_refs)
        .await
        .context("discover failed")?;
    tracing::info!(count = refs.len(), "discovered manga list");

    let mut stats = MetadataStats {
        discovered: refs.len(),
        ..MetadataStats::default()
    };

    for (i, r) in refs.iter().enumerate() {
        tracing::info!(
            idx = i + 1,
            total = stats.discovered,
            key = %r.source_manga_key,
            "fetching metadata"
        );
        // A failed fetch skips this manga only; the pass continues.
        let manga = match source.fetch_manga(&ctx, r).await {
            Ok(m) => m,
            Err(e) => {
                tracing::warn!(
                    key = %r.source_manga_key,
                    url = %r.url,
                    error = ?e,
                    "fetch_manga failed"
                );
                stats.mangas_failed += 1;
                continue;
            }
        };

        let upsert = match repo::crawler::upsert_manga_from_source(db, source_id, &r.url, &manga)
            .await
        {
            Ok(u) => u,
            Err(e) => {
                tracing::error!(
                    key = %r.source_manga_key,
                    error = ?e,
                    "upsert_manga_from_source failed"
                );
                stats.mangas_failed += 1;
                continue;
            }
        };
        stats.upserted += 1;
        tracing::info!(
            key = %manga.source_manga_key,
            manga_id = %upsert.manga_id,
            status = ?upsert.status,
            title = %manga.title,
            "manga upserted"
        );

        // Cover image: download when missing in storage or when metadata
        // signaled an update (cover URL is part of metadata_hash, so
        // Updated implies the URL may have moved). Failures are non-fatal.
        let needs_cover = upsert.cover_image_path.is_none()
            || matches!(upsert.status, repo::crawler::UpsertStatus::Updated);
        if needs_cover {
            if let Some(cover_url) = manga.cover_url.as_deref() {
                match download_and_store_cover(
                    db,
                    storage,
                    http,
                    rate,
                    &r.url,
                    upsert.manga_id,
                    cover_url,
                )
                .await
                {
                    Ok(()) => stats.covers_fetched += 1,
                    Err(e) => tracing::warn!(
                        manga_id = %upsert.manga_id,
                        error = ?e,
                        "cover download failed"
                    ),
                }
            }
        }

        // Chapter-list diff is skipped in metadata-only mode; a sync
        // failure is logged but does not count towards `mangas_failed`.
        if !skip_chapters {
            match repo::crawler::sync_manga_chapters(
                db,
                source_id,
                upsert.manga_id,
                &manga.chapters,
            )
            .await
            {
                Ok(diff) => tracing::info!(
                    manga_id = %upsert.manga_id,
                    new = diff.new,
                    refreshed = diff.refreshed,
                    dropped = diff.dropped,
                    "chapters synced"
                ),
                Err(e) => tracing::warn!(
                    manga_id = %upsert.manga_id,
                    error = ?e,
                    "chapter sync failed"
                ),
            }
        }
    }

    // The drop pass only runs after an uncapped discovery; a capped run
    // has not seen the whole catalog.
    // NOTE(review): it also runs when `stats.mangas_failed > 0`. If the
    // drop pass keys off rows touched during this run, a transient fetch
    // failure could get a live manga marked as dropped — confirm.
    if limit == 0 {
        match repo::crawler::mark_dropped_mangas(db, source_id, run_started_at).await {
            Ok(n) => tracing::info!(dropped = n, "marked unseen manga as dropped"),
            Err(e) => tracing::warn!(error = ?e, "drop-pass failed"),
        }
    } else {
        tracing::info!(limit, "partial sync — skipping drop pass");
    }

    drop(lease);
    Ok(stats)
}
|
||||
|
||||
/// Enqueue a `SyncChapterContent` job for every chapter of *any* bookmarked
|
||||
/// manga that still has `page_count = 0` and a non-dropped source row.
|
||||
/// Returns `(inserted, skipped)` counts. Dedup index handles repeats.
|
||||
pub async fn enqueue_bookmarked_pending(pool: &PgPool) -> anyhow::Result<EnqueueSummary> {
|
||||
let rows: Vec<(String, Uuid, String)> = sqlx::query_as(
|
||||
r#"
|
||||
SELECT cs.source_id, c.id AS chapter_id, cs.source_chapter_key
|
||||
FROM chapters c
|
||||
JOIN bookmarks b ON b.manga_id = c.manga_id
|
||||
JOIN chapter_sources cs ON cs.chapter_id = c.id
|
||||
WHERE c.page_count = 0
|
||||
AND cs.dropped_at IS NULL
|
||||
GROUP BY cs.source_id, c.id, cs.source_chapter_key, c.manga_id, c.created_at
|
||||
ORDER BY c.manga_id, c.created_at ASC
|
||||
"#,
|
||||
)
|
||||
.fetch_all(pool)
|
||||
.await
|
||||
.context("query bookmarked-pending chapters")?;
|
||||
|
||||
let mut summary = EnqueueSummary::default();
|
||||
for (source_id, chapter_id, source_chapter_key) in rows {
|
||||
let payload = JobPayload::SyncChapterContent {
|
||||
source_id,
|
||||
chapter_id,
|
||||
source_chapter_key,
|
||||
};
|
||||
match jobs::enqueue(pool, &payload).await {
|
||||
Ok(EnqueueResult::Inserted(_)) => summary.inserted += 1,
|
||||
Ok(EnqueueResult::Skipped) => summary.skipped += 1,
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
%chapter_id,
|
||||
error = ?e,
|
||||
"enqueue chapter content failed"
|
||||
);
|
||||
summary.failed += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(summary)
|
||||
}
|
||||
|
||||
/// Enqueue chapter-content jobs for a *single* manga (the bookmark-create
|
||||
/// hook). Same dedup semantics as [`enqueue_bookmarked_pending`].
|
||||
pub async fn enqueue_pending_for_manga(
|
||||
pool: &PgPool,
|
||||
manga_id: Uuid,
|
||||
) -> anyhow::Result<EnqueueSummary> {
|
||||
let rows: Vec<(String, Uuid, String)> = sqlx::query_as(
|
||||
r#"
|
||||
SELECT DISTINCT cs.source_id, c.id AS chapter_id, cs.source_chapter_key
|
||||
FROM chapters c
|
||||
JOIN chapter_sources cs ON cs.chapter_id = c.id
|
||||
WHERE c.manga_id = $1
|
||||
AND c.page_count = 0
|
||||
AND cs.dropped_at IS NULL
|
||||
ORDER BY cs.source_id, c.id
|
||||
"#,
|
||||
)
|
||||
.bind(manga_id)
|
||||
.fetch_all(pool)
|
||||
.await
|
||||
.context("query pending chapters for manga")?;
|
||||
|
||||
let mut summary = EnqueueSummary::default();
|
||||
for (source_id, chapter_id, source_chapter_key) in rows {
|
||||
let payload = JobPayload::SyncChapterContent {
|
||||
source_id,
|
||||
chapter_id,
|
||||
source_chapter_key,
|
||||
};
|
||||
match jobs::enqueue(pool, &payload).await {
|
||||
Ok(EnqueueResult::Inserted(_)) => summary.inserted += 1,
|
||||
Ok(EnqueueResult::Skipped) => summary.skipped += 1,
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
%chapter_id,
|
||||
error = ?e,
|
||||
"enqueue chapter content failed"
|
||||
);
|
||||
summary.failed += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(summary)
|
||||
}
|
||||
|
||||
/// Outcome counters for a batch of chapter-content enqueues.
#[derive(Debug, Default, Clone, Copy)]
pub struct EnqueueSummary {
    /// Enqueues that created a new job row.
    pub inserted: usize,
    /// Enqueues that `jobs::enqueue` reported as skipped.
    pub skipped: usize,
    /// Enqueues that returned an error (logged by the caller loop).
    pub failed: usize,
}
|
||||
|
||||
/// Download a cover image and persist its storage path. Local to the
|
||||
/// pipeline because the CLI still calls it from its inline chapter-content
|
||||
/// loop; once the worker pool fully replaces that path we can fold this
|
||||
/// into `pipeline` proper.
|
||||
async fn download_and_store_cover(
|
||||
db: &PgPool,
|
||||
storage: &dyn Storage,
|
||||
http: &reqwest::Client,
|
||||
rate: &HostRateLimiters,
|
||||
manga_url: &str,
|
||||
manga_id: Uuid,
|
||||
cover_url: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
let absolute = reqwest::Url::parse(manga_url)
|
||||
.context("parse manga URL")?
|
||||
.join(cover_url)
|
||||
.context("join cover URL onto manga URL")?;
|
||||
|
||||
rate.wait_for(absolute.as_str()).await?;
|
||||
let resp = http
|
||||
.get(absolute.clone())
|
||||
.header(reqwest::header::REFERER, manga_url)
|
||||
.send()
|
||||
.await
|
||||
.with_context(|| format!("GET {absolute}"))?
|
||||
.error_for_status()
|
||||
.with_context(|| format!("non-2xx for {absolute}"))?;
|
||||
let bytes = resp.bytes().await.context("read cover body")?;
|
||||
let kind = infer::get(&bytes);
|
||||
let ext = kind.map(|k| k.extension()).unwrap_or("bin");
|
||||
let key = format!("mangas/{manga_id}/cover.{ext}");
|
||||
|
||||
storage
|
||||
.put(&key, &bytes)
|
||||
.await
|
||||
.with_context(|| format!("store cover at {key}"))?;
|
||||
repo::manga::set_cover_image_path(db, manga_id, &key)
|
||||
.await
|
||||
.with_context(|| format!("update cover_image_path for {manga_id}"))?;
|
||||
tracing::info!(
|
||||
manga_id = %manga_id,
|
||||
key = %key,
|
||||
bytes = bytes.len(),
|
||||
%absolute,
|
||||
"cover stored"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Extract `scheme://authority` from a URL string.
///
/// The authority ends at the first `/`, `?` or `#`, so a URL with a query
/// or fragment directly after the host (`https://example.com?page=2`) still
/// yields a clean origin. Returns `None` when the input has no `://`
/// separator or when the scheme or authority is empty.
fn origin_of(url: &str) -> Option<String> {
    let (scheme, rest) = url.split_once("://")?;
    let authority = rest
        .split(|c: char| matches!(c, '/' | '?' | '#'))
        .next()?;
    if scheme.is_empty() || authority.is_empty() {
        return None;
    }
    Some(format!("{scheme}://{authority}"))
}
|
||||
@@ -1,11 +1,22 @@
|
||||
//! Per-host request pacing.
|
||||
//!
|
||||
//! Single-token bucket: each `wait().await` either returns immediately
|
||||
//! (if at least `interval` has elapsed since the last call) or sleeps
|
||||
//! just enough to satisfy it. Uses `tokio::time::Instant` so tests can
|
||||
//! run under `start_paused` virtual time without sleeping for real.
|
||||
//! `RateLimiter` is a single-token bucket: each `wait().await` returns
|
||||
//! immediately when at least `interval` has elapsed since the last call,
|
||||
//! otherwise sleeps just enough to satisfy it. Uses
|
||||
//! `tokio::time::Instant` so tests can run under `start_paused` virtual
|
||||
//! time without sleeping for real.
|
||||
//!
|
||||
//! `HostRateLimiters` is the multi-host wrapper actually used by the
|
||||
//! crawler — concurrent workers issuing requests to different origins
|
||||
//! (catalog vs. CDN) don't contend on a shared budget; each host gets
|
||||
//! its own bucket. `wait_for(url)` extracts the host, lazily creates a
|
||||
//! limiter for it, and serializes only against other callers hitting
|
||||
//! the same host.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::Mutex;
|
||||
use tokio::time::Instant;
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -33,6 +44,70 @@ impl RateLimiter {
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-host rate limiter map. The outer `Mutex<HashMap>` is held only
/// during the entry-or-insert + Arc clone; the per-host `Mutex<RateLimiter>`
/// is held during the actual `wait().await`. So N workers calling
/// `wait_for(url)` on N different hosts contend nowhere except the brief
/// HashMap lookup; workers hitting the same host serialize on that
/// host's bucket.
#[derive(Debug)]
pub struct HostRateLimiters {
    /// Interval used for any host that has no entry in `overrides`.
    default_interval: Duration,
    /// Per-host interval overrides, keyed by host name.
    overrides: HashMap<String, Duration>,
    /// Limiters created lazily on the first `wait_for` to each host.
    map: Mutex<HashMap<String, Arc<Mutex<RateLimiter>>>>,
}
|
||||
|
||||
impl HostRateLimiters {
|
||||
pub fn new(default_interval: Duration) -> Self {
|
||||
Self {
|
||||
default_interval,
|
||||
overrides: HashMap::new(),
|
||||
map: Mutex::new(HashMap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Set a per-host interval that overrides `default_interval`. Calls
|
||||
/// after a host's limiter has been instantiated do *not* re-create
|
||||
/// it — set all overrides before the first `wait_for` to that host.
|
||||
pub fn with_override(mut self, host: impl Into<String>, interval: Duration) -> Self {
|
||||
self.overrides.insert(host.into(), interval);
|
||||
self
|
||||
}
|
||||
|
||||
/// Block until the per-host budget allows the next request to
|
||||
/// `url`'s host. Returns an error only when the URL has no host
|
||||
/// (malformed input).
|
||||
pub async fn wait_for(&self, url: &str) -> anyhow::Result<()> {
|
||||
let host = host_of(url)
|
||||
.ok_or_else(|| anyhow::anyhow!("no host in url: {url}"))?;
|
||||
let limiter = {
|
||||
let mut map = self.map.lock().await;
|
||||
map.entry(host.clone())
|
||||
.or_insert_with(|| {
|
||||
let interval = self
|
||||
.overrides
|
||||
.get(&host)
|
||||
.copied()
|
||||
.unwrap_or(self.default_interval);
|
||||
Arc::new(Mutex::new(RateLimiter::new(interval)))
|
||||
})
|
||||
.clone()
|
||||
};
|
||||
limiter.lock().await.wait().await;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract the lowercased host (no port, no userinfo) from a URL string.
///
/// The authority ends at the first `/`, `?` or `#`. Anything up to the
/// last `@` in it is treated as userinfo and dropped. A bracketed IPv6
/// literal is returned with its brackets (`[::1]`) so the colons inside
/// it are not mistaken for a port separator.
///
/// Returns `None` for inputs without a `scheme://host` shape — those
/// would never have reached the network layer anyway.
fn host_of(url: &str) -> Option<String> {
    let after_scheme = url.split_once("://")?.1;
    let authority = after_scheme
        .split(|c: char| matches!(c, '/' | '?' | '#'))
        .next()?;
    let host_with_port = authority.rsplit_once('@').map_or(authority, |(_, h)| h);
    let host = if host_with_port.starts_with('[') {
        // IPv6 literal: keep everything through the closing bracket.
        match host_with_port.find(']') {
            Some(end) => &host_with_port[..=end],
            None => host_with_port,
        }
    } else {
        host_with_port
            .rsplit_once(':')
            .map_or(host_with_port, |(h, _)| h)
    };
    (!host.is_empty()).then(|| host.to_ascii_lowercase())
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -66,4 +141,44 @@ mod tests {
|
||||
// Already 250ms past — no further wait needed.
|
||||
assert_eq!(Instant::now() - t0, Duration::ZERO);
|
||||
}
|
||||
|
||||
// `host_of` lowercases the host, drops the path and the port, and
// rejects input without a `scheme://` prefix.
#[test]
fn host_of_parses_scheme_path_and_port() {
    assert_eq!(host_of("https://Example.com/path").as_deref(), Some("example.com"));
    assert_eq!(host_of("http://cdn.foo.bar/img.jpg").as_deref(), Some("cdn.foo.bar"));
    assert_eq!(host_of("http://localhost:8080/x").as_deref(), Some("localhost"));
    assert!(host_of("not a url").is_none());
}
|
||||
|
||||
// Runs under tokio's paused clock, so the elapsed-time assertions are
// exact and the test never sleeps for real.
#[tokio::test(start_paused = true)]
async fn host_rate_limiters_pace_per_host() {
    // Two hosts at 100ms each. Two consecutive calls to the SAME
    // host wait 100ms total. Two consecutive calls to DIFFERENT
    // hosts both fire immediately.
    let rl = HostRateLimiters::new(Duration::from_millis(100));

    let t0 = Instant::now();
    rl.wait_for("https://a.example/x").await.unwrap();
    rl.wait_for("https://b.example/y").await.unwrap();
    assert_eq!(Instant::now() - t0, Duration::ZERO, "different hosts don't contend");

    // Second request to host `a`: its bucket was used above, so this call
    // has to wait out the interval.
    let t1 = Instant::now();
    rl.wait_for("https://a.example/x").await.unwrap();
    assert_eq!(
        Instant::now() - t1,
        Duration::from_millis(100),
        "second call to same host waits a full interval"
    );
}
|
||||
|
||||
// An override registered for a host replaces the default interval: the
// second request waits 100ms (the override), not the 1000ms default.
#[tokio::test(start_paused = true)]
async fn host_rate_limiters_honor_overrides() {
    let rl = HostRateLimiters::new(Duration::from_millis(1000))
        .with_override("fast.example", Duration::from_millis(100));

    // First call primes the host's bucket; only the second is timed.
    rl.wait_for("https://fast.example/a").await.unwrap();
    let t0 = Instant::now();
    rl.wait_for("https://fast.example/b").await.unwrap();
    assert_eq!(Instant::now() - t0, Duration::from_millis(100));
}
|
||||
}
|
||||
|
||||
161
backend/src/crawler/session.rs
Normal file
161
backend/src/crawler/session.rs
Normal file
@@ -0,0 +1,161 @@
|
||||
//! PHPSESSID injection + login probe.
|
||||
//!
|
||||
//! The catalog site we crawl renders chapter pages as a single multi-
|
||||
//! page list only for logged-in users. We don't try to bypass the
|
||||
//! login (CAPTCHA wall) — instead the operator pastes their browser's
|
||||
//! `PHPSESSID` cookie into `CRAWLER_PHPSESSID` and the crawler injects
|
||||
//! it into Chromium *and* reqwest before the first navigation.
|
||||
//!
|
||||
//! Two things the cookie alone doesn't give us:
|
||||
//! 1. The cookie value is only meaningful to the *server* — we have
|
||||
//! no way to predict from the value alone whether it's still valid.
|
||||
//! `verify_session` does a navigation and checks for `#avatar_menu`,
|
||||
//! which only renders for authenticated visitors. Bail clean at
|
||||
//! startup if it's missing rather than discovering it 30 minutes
|
||||
//! into a backfill.
|
||||
//! 2. The reqwest client (used for cover and chapter-image downloads)
|
||||
//! has its own cookie store; we seed it for the catalog host only.
|
||||
//! CDN hosts are deliberately *not* given the cookie — they serve
|
||||
//! image bytes by signed URLs and don't need it.
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use chromiumoxide::browser::Browser;
|
||||
use chromiumoxide::cdp::browser_protocol::network::CookieParam;
|
||||
|
||||
/// Compute the cookie domain (e.g. `.example.com`) from a start URL.
/// The leading dot makes the cookie cover every subdomain — the source
/// often redirects between `www.` and other prefixes mid-crawl, and a
/// host-only cookie would silently drop on the cross-subdomain hop.
///
/// Hosts that are not dotted DNS names are returned as-is, without a
/// leading dot: bare hostnames (`localhost`), IPv4 addresses
/// (`192.168.1.10` — taking the last two labels would yield the
/// meaningless `.1.10`), and bracketed IPv6 literals.
///
/// The authority ends at the first `/`, `?` or `#`; userinfo and port are
/// dropped. Returns `None` when the input has no `scheme://host` shape.
///
/// Caveat: this takes the last two dot-labels, which is wrong for
/// multi-part TLDs (`foo.co.uk` or `foo.com.br` would resolve to `.co.uk`
/// / `.com.br` and attach the cookie to every site under that suffix).
/// For those, the operator should override via `CRAWLER_COOKIE_DOMAIN`
/// rather than relying on this function — pulling in the Public Suffix
/// List for one knob isn't worth it yet.
pub fn registrable_domain(url: &str) -> Option<String> {
    let after_scheme = url.split_once("://")?.1;
    let authority = after_scheme
        .split(|c: char| matches!(c, '/' | '?' | '#'))
        .next()?;
    let host_with_port = authority.rsplit_once('@').map_or(authority, |(_, h)| h);

    // Bracketed IPv6 literal: the colons inside are not a port separator.
    if host_with_port.starts_with('[') {
        let end = host_with_port.find(']')?;
        return Some(host_with_port[..=end].to_ascii_lowercase());
    }

    let host = host_with_port
        .rsplit_once(':')
        .map_or(host_with_port, |(h, _)| h)
        .to_ascii_lowercase();
    if host.is_empty() {
        return None;
    }
    // An IP address has no registrable domain; the cookie must be host-only.
    if host.parse::<std::net::Ipv4Addr>().is_ok() {
        return Some(host);
    }

    let labels: Vec<&str> = host.split('.').filter(|l| !l.is_empty()).collect();
    if labels.len() < 2 {
        // Bare hostname (e.g. `localhost`) — return as-is, no leading
        // dot. Setting `.localhost` as cookie domain is invalid.
        return Some(host);
    }
    let registrable = &labels[labels.len() - 2..];
    Some(format!(".{}", registrable.join(".")))
}
|
||||
|
||||
/// Inject the PHPSESSID cookie into the browser's cookie store for the
|
||||
/// catalog domain. Must be called before any navigation that depends on
|
||||
/// authentication; subsequent navigations include the cookie
|
||||
/// automatically.
|
||||
pub async fn inject_phpsessid(
|
||||
browser: &Browser,
|
||||
sid: &str,
|
||||
cookie_domain: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
let cookie = CookieParam {
|
||||
name: "PHPSESSID".to_string(),
|
||||
value: sid.to_string(),
|
||||
url: None,
|
||||
domain: Some(cookie_domain.to_string()),
|
||||
path: Some("/".to_string()),
|
||||
secure: None,
|
||||
http_only: Some(true),
|
||||
same_site: None,
|
||||
expires: None,
|
||||
priority: None,
|
||||
same_party: None,
|
||||
source_scheme: None,
|
||||
source_port: None,
|
||||
partition_key: None,
|
||||
};
|
||||
browser
|
||||
.set_cookies(vec![cookie])
|
||||
.await
|
||||
.context("set PHPSESSID in chromium cookie store")?;
|
||||
tracing::info!(domain = cookie_domain, "injected PHPSESSID into browser");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Navigate to `probe_url` and confirm the logged-in `#avatar_menu`
|
||||
/// element is present. The selector only renders for authenticated
|
||||
/// visitors, so its absence is the unambiguous signal that PHPSESSID
|
||||
/// is missing, expired, or revoked.
|
||||
///
|
||||
/// This burns one navigation against the catalog's rate limiter. The
|
||||
/// trade is worth it — failing here costs ~1s; failing 30 minutes into
|
||||
/// a backfill costs 30 minutes.
|
||||
pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Result<()> {
|
||||
let page = browser
|
||||
.new_page(probe_url)
|
||||
.await
|
||||
.with_context(|| format!("open probe page {probe_url}"))?;
|
||||
page.wait_for_navigation().await.context("wait for nav on probe")?;
|
||||
// The avatar menu is rendered server-side as part of the header
|
||||
// when a valid session cookie is present; absent JS is fine.
|
||||
let found = page.find_element("#avatar_menu").await.is_ok();
|
||||
page.close().await.ok();
|
||||
if found {
|
||||
tracing::info!("session probe ok — #avatar_menu present");
|
||||
Ok(())
|
||||
} else {
|
||||
Err(anyhow!(
|
||||
"session probe failed — #avatar_menu not present at {probe_url}; \
|
||||
PHPSESSID is missing, expired, or revoked. Refresh CRAWLER_PHPSESSID \
|
||||
and re-run."
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn registrable_domain_strips_subdomain() {
        let www = registrable_domain("https://www.target-site.com/manga/foo/");
        assert_eq!(www.as_deref(), Some(".target-site.com"));

        let mobile = registrable_domain("https://m.example.org");
        assert_eq!(mobile.as_deref(), Some(".example.org"));
    }

    #[test]
    fn registrable_domain_keeps_two_label_host() {
        let apex = registrable_domain("https://example.com/");
        assert_eq!(apex.as_deref(), Some(".example.com"));
    }

    #[test]
    fn registrable_domain_handles_port() {
        let with_port = registrable_domain("http://www.foo.bar:8080/x");
        assert_eq!(with_port.as_deref(), Some(".foo.bar"));
    }

    #[test]
    fn registrable_domain_bare_hostname_no_leading_dot() {
        // .localhost would be invalid as a cookie Domain.
        let local = registrable_domain("http://localhost:5173");
        assert_eq!(local.as_deref(), Some("localhost"));
    }

    #[test]
    fn registrable_domain_returns_none_for_garbage() {
        let garbage = registrable_domain("not a url");
        assert!(garbage.is_none());
    }
}
|
||||
@@ -74,12 +74,12 @@ pub struct SourceChapter {
|
||||
}
|
||||
|
||||
/// Context passed to every `Source` call. Carries the browser handle
|
||||
/// plus a shared rate limiter so impls that issue multiple requests in
|
||||
/// one call (e.g. pagination walks) honor the same per-host budget as
|
||||
/// the outer job loop.
|
||||
/// plus the per-host rate-limiter map so impls that issue multiple
|
||||
/// requests in one call (pagination walks, multi-page chapter image
|
||||
/// fetches) honor the right budget for each origin.
|
||||
pub struct FetchContext<'a> {
|
||||
pub browser: &'a Browser,
|
||||
pub rate: &'a tokio::sync::Mutex<crate::crawler::rate_limit::RateLimiter>,
|
||||
pub rate: &'a crate::crawler::rate_limit::HostRateLimiters,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
|
||||
@@ -149,10 +149,10 @@ fn truncate_to_cap<T>(mut buf: Vec<T>, max: Option<usize>) -> Vec<T> {
|
||||
}
|
||||
|
||||
/// Single point of rate-limited navigation. Every Source request goes
|
||||
/// through here, so the limiter is the only knob that controls
|
||||
/// per-host RPS.
|
||||
/// through here, so the per-host limiter map is the only knob that
|
||||
/// controls per-origin RPS.
|
||||
async fn navigate(ctx: &FetchContext<'_>, url: &str) -> anyhow::Result<String> {
|
||||
ctx.rate.lock().await.wait().await;
|
||||
ctx.rate.wait_for(url).await?;
|
||||
let page = ctx.browser.new_page(url).await?;
|
||||
page.wait_for_navigation().await?;
|
||||
// Stopgap until we wait on a specific selector per page type —
|
||||
@@ -334,7 +334,7 @@ fn parse_chapter_list(doc: &scraper::Html) -> Vec<SourceChapterRef> {
|
||||
let title_text = collapse_whitespace(&a.text().collect::<String>());
|
||||
let number = parse_chapter_number(&title_text).unwrap_or(0);
|
||||
Some(SourceChapterRef {
|
||||
source_chapter_key: derive_key_from_url(&url),
|
||||
source_chapter_key: derive_chapter_key_from_url(&url),
|
||||
number,
|
||||
title: (!title_text.is_empty()).then_some(title_text),
|
||||
url,
|
||||
@@ -366,6 +366,29 @@ fn derive_key_from_url(url: &str) -> String {
|
||||
.to_string()
|
||||
}
|
||||
|
||||
/// Chapter URLs on this source point at the reader's page 1, e.g.
|
||||
/// `.../uu/br_chapter-379272/pg-1/`. The chapter identity is the
|
||||
/// `br_chapter-N` (or `to_chapter-N`) segment — the `pg-\d+` segment
|
||||
/// identifies a page *within* a chapter, so naively taking the last
|
||||
/// path component returns `"pg-1"` for every chapter and collapses
|
||||
/// them all under one source_chapter_key downstream.
|
||||
fn derive_chapter_key_from_url(url: &str) -> String {
|
||||
let trimmed = url.split('?').next().unwrap_or(url).trim_end_matches('/');
|
||||
let without_reader_page = match trimmed.rsplit_once('/') {
|
||||
Some((prefix, last)) if is_reader_page_segment(last) => prefix,
|
||||
_ => trimmed,
|
||||
};
|
||||
without_reader_page
|
||||
.rsplit('/')
|
||||
.find(|s| !s.is_empty())
|
||||
.unwrap_or(url)
|
||||
.to_string()
|
||||
}
|
||||
|
||||
/// True when `s` is a reader page marker: the literal `pg-` followed by
/// one or more ASCII digits and nothing else.
fn is_reader_page_segment(s: &str) -> bool {
    match s.strip_prefix("pg-") {
        Some(digits) => !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit()),
        None => false,
    }
}
|
||||
|
||||
fn first_text(doc: &scraper::Html, sel: &str) -> Option<String> {
|
||||
let s = scraper::Selector::parse(sel).ok()?;
|
||||
let el = doc.select(&s).next()?;
|
||||
@@ -577,6 +600,61 @@ mod tests {
|
||||
assert_eq!(strip_tag_count("Tag (a) (12)"), "Tag (a)");
|
||||
}
|
||||
|
||||
#[test]
fn parse_chapter_list_keeps_all_chapters_with_unique_keys() {
    // Fixture: a real listing page from the target site with 15 rows —
    // chapters with various Ch.N markup, one hiatus row, three "notice."
    // rows, and duplicates of Ch.1 and Ch.52 from different uploaders.
    // Every row must come out of the parser, each with its own
    // source_chapter_key. All chapter URLs end in `/pg-1/` (the reader's
    // page-1 entry point), so a key taken from the last URL segment would
    // be "pg-1" for every row and the whole list would collapse into one
    // downstream chapter row.
    let doc = scraper::Html::parse_document(include_str!(
        "../../../tests/fixtures/target/chapter_list_uu.html"
    ));
    let chapters = parse_chapter_list(&doc);

    assert_eq!(chapters.len(), 15, "every row kept (notices/hiatus included)");

    // After sorting, equal keys sit next to each other, so any duplicate
    // shows up as an adjacent pair.
    let mut sorted_keys: Vec<&str> = Vec::with_capacity(chapters.len());
    for chapter in &chapters {
        sorted_keys.push(chapter.source_chapter_key.as_str());
    }
    sorted_keys.sort();
    let dupe = sorted_keys
        .windows(2)
        .find(|pair| pair[0] == pair[1])
        .map(|pair| pair[0]);
    assert!(dupe.is_none(), "duplicate chapter key: {dupe:?}");
    for c in &chapters {
        assert_ne!(
            c.source_chapter_key, "pg-1",
            "key must not be the reader-page segment: {:?}", c
        );
    }

    // The source lists newest → oldest, so the latest chapter is row 0.
    let newest = &chapters[0];
    assert_eq!(newest.number, 67);
    assert_eq!(newest.title.as_deref(), Some("Ch.67 : Official"));
    assert_eq!(newest.source_chapter_key, "br_chapter-379272");

    // Same-numbered chapters from different uploaders stay as separate
    // rows here; the (manga_id, number) UNIQUE collapse is a downstream
    // schema concern handled separately.
    let ch52_rows = chapters.iter().filter(|c| c.number == 52).count();
    assert_eq!(ch52_rows, 2, "two Ch.52 uploads must both survive parsing");
    let ch1_rows = chapters.iter().filter(|c| c.number == 1).count();
    assert_eq!(ch1_rows, 2, "Ch.1 Official and Ch.1 Team Hazama are both kept");

    // Notice / hiatus rows carry no leading digit, so they parse to
    // number=0 — and they are kept, not filtered.
    let zero = chapters.iter().filter(|c| c.number == 0).count();
    assert!(zero >= 4, "hiatus + 3 notices kept; got {zero}");
}
|
||||
|
||||
#[test]
|
||||
fn parse_chapter_number_grabs_first_integer_run() {
|
||||
assert_eq!(parse_chapter_number("Ch.1"), Some(1));
|
||||
@@ -630,6 +708,45 @@ mod tests {
|
||||
assert_eq!(derive_key_from_url("/manga/bar"), "bar");
|
||||
}
|
||||
|
||||
#[test]
fn derive_chapter_key_strips_trailing_reader_page_segment() {
    // (input URL, expected chapter key)
    let cases = [
        // Listing links go to page 1 of the reader; strip /pg-\d+/.
        (".../uu/br_chapter-379272/pg-1/", "br_chapter-379272"),
        (".../uu/to_chapter-13/pg-1/", "to_chapter-13"),
        // Defensive: a deep link to a later page still resolves to the
        // same chapter identity.
        (".../uu/br_chapter-379272/pg-25/", "br_chapter-379272"),
        // No reader-page suffix → behaves like derive_key_from_url.
        (".../uu/br_chapter-379272/", "br_chapter-379272"),
        // Query strings are stripped.
        (".../uu/br_chapter-379272/pg-1/?ref=x", "br_chapter-379272"),
        // `pg-foo` is not a valid reader-page segment; treated as identity.
        (".../uu/something/pg-foo/", "pg-foo"),
        // Bare `pg-` (no digits) is likewise left in place.
        (".../uu/something/pg-/", "pg-"),
    ];

    for (url, expected) in cases {
        assert_eq!(derive_chapter_key_from_url(url), expected);
    }
}
|
||||
|
||||
#[test]
|
||||
fn metadata_hash_is_stable_and_field_sensitive() {
|
||||
let base = parse_manga_detail(DETAIL_HTML, "k", true).unwrap();
|
||||
|
||||
@@ -12,10 +12,21 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let config = mangalord::config::Config::from_env()?;
|
||||
let addr: SocketAddr = config.bind_address.parse()?;
|
||||
let app = mangalord::app::build(config).await?;
|
||||
let mangalord::app::AppHandle { router, daemon } = mangalord::app::build(config).await?;
|
||||
|
||||
tracing::info!(%addr, "mangalord listening");
|
||||
let listener = tokio::net::TcpListener::bind(addr).await?;
|
||||
axum::serve(listener, app).await?;
|
||||
axum::serve(listener, router)
|
||||
.with_graceful_shutdown(async {
|
||||
let _ = tokio::signal::ctrl_c().await;
|
||||
tracing::info!("ctrl-c received; shutting down");
|
||||
})
|
||||
.await?;
|
||||
|
||||
// Drain background tasks (crawler daemon) before exiting so Chromium
|
||||
// gets a clean shutdown rather than relying on kill-on-drop.
|
||||
if let Some(d) = daemon {
|
||||
d.shutdown().await;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -12,12 +12,15 @@ pub async fn list_for_manga(
|
||||
limit: i64,
|
||||
offset: i64,
|
||||
) -> AppResult<Vec<Chapter>> {
|
||||
// Secondary sort by created_at gives duplicate-numbered chapters
|
||||
// (multiple uploaders/translations of the same number) a stable
|
||||
// order in lists and prev/next reader navigation.
|
||||
let rows = sqlx::query_as::<_, Chapter>(
|
||||
r#"
|
||||
SELECT id, manga_id, number, title, page_count, created_at
|
||||
FROM chapters
|
||||
WHERE manga_id = $1
|
||||
ORDER BY number ASC
|
||||
ORDER BY number ASC, created_at ASC
|
||||
LIMIT $2 OFFSET $3
|
||||
"#,
|
||||
)
|
||||
@@ -29,33 +32,40 @@ pub async fn list_for_manga(
|
||||
Ok(rows)
|
||||
}
|
||||
|
||||
pub async fn find_by_manga_and_number(
|
||||
/// Look up a chapter by its UUID, scoped to its manga so a UUID guessed
|
||||
/// from a different manga's URL doesn't accidentally resolve.
|
||||
pub async fn find_by_id_in_manga(
|
||||
pool: &PgPool,
|
||||
manga_id: Uuid,
|
||||
number: i32,
|
||||
chapter_id: Uuid,
|
||||
) -> AppResult<Option<Chapter>> {
|
||||
let row = sqlx::query_as::<_, Chapter>(
|
||||
r#"
|
||||
SELECT id, manga_id, number, title, page_count, created_at
|
||||
FROM chapters
|
||||
WHERE manga_id = $1 AND number = $2
|
||||
WHERE manga_id = $1 AND id = $2
|
||||
"#,
|
||||
)
|
||||
.bind(manga_id)
|
||||
.bind(number)
|
||||
.bind(chapter_id)
|
||||
.fetch_optional(pool)
|
||||
.await?;
|
||||
Ok(row)
|
||||
}
|
||||
|
||||
/// Accepts any `PgExecutor` so the upload handler can run this inside a
|
||||
/// transaction with the per-page inserts. Returns `AppError::Conflict`
|
||||
/// on the (manga_id, number) unique violation so handlers can surface a
|
||||
/// clean 409.
|
||||
/// transaction with the per-page inserts.
|
||||
///
|
||||
/// `uploaded_by` records who uploaded the chapter and feeds the
|
||||
/// per-user upload history. `None` means "historical / API token with
|
||||
/// no associated user" — kept nullable to support that case.
|
||||
///
|
||||
/// Chapter identity is the row UUID; the same (manga_id, number)
|
||||
/// combination can repeat (multiple translations, re-uploads). The
|
||||
/// `is_unique_violation` branch below is a defensive holdover from
|
||||
/// 0001's (manga_id, number) UNIQUE — it can no longer fire under
|
||||
/// normal operation, but we surface a clean 409 if a future migration
|
||||
/// re-adds any chapter uniqueness.
|
||||
pub async fn create<'e, E: PgExecutor<'e>>(
|
||||
executor: E,
|
||||
manga_id: Uuid,
|
||||
@@ -80,7 +90,7 @@ pub async fn create<'e, E: PgExecutor<'e>>(
|
||||
match result {
|
||||
Ok(c) => Ok(c),
|
||||
Err(e) if is_unique_violation(&e) => Err(AppError::Conflict(format!(
|
||||
"chapter {number} already exists for this manga"
|
||||
"chapter {number} conflicts with an existing chapter for this manga"
|
||||
))),
|
||||
Err(e) => Err(AppError::Database(e)),
|
||||
}
|
||||
|
||||
@@ -332,15 +332,15 @@ pub async fn sync_manga_chapters(
|
||||
|
||||
match existing {
|
||||
None => {
|
||||
// New chapter row. The (manga_id, number) unique
|
||||
// constraint protects against re-inserts if the same
|
||||
// number arrives via a different source_chapter_key.
|
||||
// New chapter row. As of 0013 there's no (manga_id,
|
||||
// number) UNIQUE, so duplicate-numbered chapters from
|
||||
// the source (different uploaders, notices, alt
|
||||
// translations) each get their own row — chapter
|
||||
// identity is the UUID, not the number.
|
||||
let (chapter_id,): (Uuid,) = sqlx::query_as(
|
||||
r#"
|
||||
INSERT INTO chapters (manga_id, number, title, page_count)
|
||||
VALUES ($1, $2, $3, 0)
|
||||
ON CONFLICT (manga_id, number) DO UPDATE
|
||||
SET title = EXCLUDED.title
|
||||
RETURNING id
|
||||
"#,
|
||||
)
|
||||
|
||||
@@ -438,3 +438,196 @@ async fn list_me_returns_paged_envelope(pool: PgPool) {
|
||||
// without paging through.
|
||||
assert_eq!(body["page"]["total"], 0);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Bookmark create -> SyncChapterContent job enqueue (background task)
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
async fn seed_chapter_with_source(
|
||||
pool: &PgPool,
|
||||
manga_id: Uuid,
|
||||
number: i32,
|
||||
source_id: &str,
|
||||
source_chapter_key: &str,
|
||||
source_url: &str,
|
||||
dropped: bool,
|
||||
) -> Uuid {
|
||||
let chapter_id: Uuid =
|
||||
mangalord::repo::chapter::create(pool, manga_id, number, None, None)
|
||||
.await
|
||||
.unwrap()
|
||||
.id;
|
||||
sqlx::query("INSERT INTO sources (id, name, base_url) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING")
|
||||
.bind(source_id)
|
||||
.bind(source_id)
|
||||
.bind("https://example.com")
|
||||
.execute(pool)
|
||||
.await
|
||||
.unwrap();
|
||||
let dropped_at = if dropped { "now()" } else { "NULL" };
|
||||
sqlx::query(&format!(
|
||||
"INSERT INTO chapter_sources (source_id, source_chapter_key, chapter_id, source_url, dropped_at) \
|
||||
VALUES ($1, $2, $3, $4, {dropped_at})"
|
||||
))
|
||||
.bind(source_id)
|
||||
.bind(source_chapter_key)
|
||||
.bind(chapter_id)
|
||||
.bind(source_url)
|
||||
.execute(pool)
|
||||
.await
|
||||
.unwrap();
|
||||
chapter_id
|
||||
}
|
||||
|
||||
/// Poll `crawler_jobs` for the expected pending count, up to ~1.5s, so the
|
||||
/// detached `tokio::spawn` from the bookmark create handler has time to
|
||||
/// land regardless of CI scheduling jitter.
|
||||
async fn wait_for_pending_count(pool: &PgPool, expected: i64) -> i64 {
|
||||
for _ in 0..30 {
|
||||
let count: i64 = sqlx::query_scalar(
|
||||
"SELECT COUNT(*) FROM crawler_jobs \
|
||||
WHERE state = 'pending' \
|
||||
AND payload->>'kind' = 'sync_chapter_content'",
|
||||
)
|
||||
.fetch_one(pool)
|
||||
.await
|
||||
.unwrap();
|
||||
if count >= expected {
|
||||
return count;
|
||||
}
|
||||
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
|
||||
}
|
||||
sqlx::query_scalar::<_, i64>(
|
||||
"SELECT COUNT(*) FROM crawler_jobs \
|
||||
WHERE state = 'pending' \
|
||||
AND payload->>'kind' = 'sync_chapter_content'",
|
||||
)
|
||||
.fetch_one(pool)
|
||||
.await
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn create_enqueues_sync_chapter_content_jobs_for_pending_chapters(pool: PgPool) {
|
||||
let h = common::harness(pool.clone());
|
||||
let (_, cookie) = common::register_user(&h.app).await;
|
||||
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||
|
||||
// Two zero-page chapters with non-dropped sources.
|
||||
let c1 = seed_chapter_with_source(&pool, manga_id, 1, "target", "ch1", "https://example.com/c1", false).await;
|
||||
let c2 = seed_chapter_with_source(&pool, manga_id, 2, "target", "ch2", "https://example.com/c2", false).await;
|
||||
|
||||
let resp = h
|
||||
.app
|
||||
.clone()
|
||||
.oneshot(common::post_json_with_cookie(
|
||||
"/api/v1/bookmarks",
|
||||
json!({ "manga_id": manga_id.to_string() }),
|
||||
&cookie,
|
||||
))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(resp.status(), StatusCode::CREATED);
|
||||
|
||||
let count = wait_for_pending_count(&pool, 2).await;
|
||||
assert_eq!(count, 2, "both pending chapters should be enqueued");
|
||||
|
||||
let chapter_ids: Vec<String> = sqlx::query_scalar(
|
||||
"SELECT payload->>'chapter_id' FROM crawler_jobs \
|
||||
WHERE payload->>'kind' = 'sync_chapter_content' \
|
||||
ORDER BY payload->>'chapter_id'",
|
||||
)
|
||||
.fetch_all(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
let mut expected = vec![c1.to_string(), c2.to_string()];
|
||||
expected.sort();
|
||||
assert_eq!(chapter_ids, expected);
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn re_bookmark_after_delete_does_not_re_enqueue_pending_jobs(pool: PgPool) {
|
||||
let h = common::harness(pool.clone());
|
||||
let (_, cookie) = common::register_user(&h.app).await;
|
||||
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||
let _ = seed_chapter_with_source(&pool, manga_id, 1, "target", "ch1", "https://example.com/c1", false).await;
|
||||
|
||||
// First bookmark — should enqueue 1.
|
||||
let resp = h
|
||||
.app
|
||||
.clone()
|
||||
.oneshot(common::post_json_with_cookie(
|
||||
"/api/v1/bookmarks",
|
||||
json!({ "manga_id": manga_id.to_string() }),
|
||||
&cookie,
|
||||
))
|
||||
.await
|
||||
.unwrap();
|
||||
let bookmark_id = common::body_json(resp).await["id"].as_str().unwrap().to_string();
|
||||
assert_eq!(wait_for_pending_count(&pool, 1).await, 1);
|
||||
|
||||
// Delete the bookmark, then re-bookmark — the existing pending job
|
||||
// is still there so the dedup index suppresses the second enqueue.
|
||||
let resp = h
|
||||
.app
|
||||
.clone()
|
||||
.oneshot(common::delete_with_cookie(
|
||||
&format!("/api/v1/bookmarks/{bookmark_id}"),
|
||||
&cookie,
|
||||
))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(resp.status(), StatusCode::NO_CONTENT);
|
||||
|
||||
let resp = h
|
||||
.app
|
||||
.clone()
|
||||
.oneshot(common::post_json_with_cookie(
|
||||
"/api/v1/bookmarks",
|
||||
json!({ "manga_id": manga_id.to_string() }),
|
||||
&cookie,
|
||||
))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(resp.status(), StatusCode::CREATED);
|
||||
|
||||
// Give the background task time to attempt re-enqueue (it should be a no-op).
|
||||
tokio::time::sleep(std::time::Duration::from_millis(300)).await;
|
||||
let final_count: i64 = sqlx::query_scalar(
|
||||
"SELECT COUNT(*) FROM crawler_jobs \
|
||||
WHERE state IN ('pending', 'running') \
|
||||
AND payload->>'kind' = 'sync_chapter_content'",
|
||||
)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(final_count, 1, "dedup index keeps the queue at a single in-flight row");
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn create_skips_chapters_with_dropped_sources(pool: PgPool) {
|
||||
let h = common::harness(pool.clone());
|
||||
let (_, cookie) = common::register_user(&h.app).await;
|
||||
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||
|
||||
let _alive = seed_chapter_with_source(&pool, manga_id, 1, "target", "ch1", "https://example.com/c1", false).await;
|
||||
let _dropped = seed_chapter_with_source(&pool, manga_id, 2, "target", "ch2", "https://example.com/c2", true).await;
|
||||
|
||||
let resp = h
|
||||
.app
|
||||
.clone()
|
||||
.oneshot(common::post_json_with_cookie(
|
||||
"/api/v1/bookmarks",
|
||||
json!({ "manga_id": manga_id.to_string() }),
|
||||
&cookie,
|
||||
))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(resp.status(), StatusCode::CREATED);
|
||||
|
||||
assert_eq!(
|
||||
wait_for_pending_count(&pool, 1).await,
|
||||
1,
|
||||
"only the chapter with a non-dropped source row gets enqueued"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -12,12 +12,18 @@ async fn seed_manga(h: &common::Harness, cookie: &str, title: &str) -> Uuid {
|
||||
common::seed_manga_via_api(&h.app, cookie, title).await
|
||||
}
|
||||
|
||||
async fn seed_chapter(pool: &PgPool, manga_id: Uuid, number: i32, title: Option<&str>) {
|
||||
async fn seed_chapter(
|
||||
pool: &PgPool,
|
||||
manga_id: Uuid,
|
||||
number: i32,
|
||||
title: Option<&str>,
|
||||
) -> Uuid {
|
||||
// Historical seed — uploaded_by remains NULL, mirroring the
|
||||
// pre-Phase-5 rows in the production DB.
|
||||
mangalord::repo::chapter::create(pool, manga_id, number, title, None)
|
||||
.await
|
||||
.unwrap();
|
||||
.unwrap()
|
||||
.id
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
@@ -81,16 +87,16 @@ async fn list_chapters_returns_404_for_unknown_manga(pool: PgPool) {
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn get_chapter_by_number(pool: PgPool) {
|
||||
async fn get_chapter_by_id(pool: PgPool) {
|
||||
let h = common::harness(pool.clone());
|
||||
let (_, cookie) = common::register_user(&h.app).await;
|
||||
let manga_id = seed_manga(&h, &cookie, "Berserk").await;
|
||||
seed_chapter(&pool, manga_id, 1, Some("The Brand")).await;
|
||||
let chapter_id = seed_chapter(&pool, manga_id, 1, Some("The Brand")).await;
|
||||
|
||||
let resp = h
|
||||
.app
|
||||
.oneshot(common::get(&format!(
|
||||
"/api/v1/mangas/{manga_id}/chapters/1"
|
||||
"/api/v1/mangas/{manga_id}/chapters/{chapter_id}"
|
||||
)))
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -99,18 +105,20 @@ async fn get_chapter_by_number(pool: PgPool) {
|
||||
assert_eq!(body["number"], 1);
|
||||
assert_eq!(body["title"], "The Brand");
|
||||
assert_eq!(body["page_count"], 0);
|
||||
assert_eq!(body["id"], chapter_id.to_string());
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn get_chapter_unknown_number_is_404(pool: PgPool) {
|
||||
async fn get_chapter_unknown_id_is_404(pool: PgPool) {
|
||||
let h = common::harness(pool);
|
||||
let (_, cookie) = common::register_user(&h.app).await;
|
||||
let manga_id = seed_manga(&h, &cookie, "Berserk").await;
|
||||
let unknown_chapter = Uuid::new_v4();
|
||||
|
||||
let resp = h
|
||||
.app
|
||||
.oneshot(common::get(&format!(
|
||||
"/api/v1/mangas/{manga_id}/chapters/99"
|
||||
"/api/v1/mangas/{manga_id}/chapters/{unknown_chapter}"
|
||||
)))
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -122,10 +130,34 @@ async fn get_chapter_unknown_number_is_404(pool: PgPool) {
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn get_chapter_unknown_manga_is_404(pool: PgPool) {
|
||||
let h = common::harness(pool);
|
||||
let unknown = Uuid::nil();
|
||||
let unknown_manga = Uuid::nil();
|
||||
let unknown_chapter = Uuid::new_v4();
|
||||
let resp = h
|
||||
.app
|
||||
.oneshot(common::get(&format!("/api/v1/mangas/{unknown}/chapters/1")))
|
||||
.oneshot(common::get(&format!(
|
||||
"/api/v1/mangas/{unknown_manga}/chapters/{unknown_chapter}"
|
||||
)))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
||||
}
|
||||
|
||||
/// Cross-manga isolation: a chapter id belonging to manga A must not
|
||||
/// resolve when accessed via manga B's URL. The (manga_id, id) scoping
|
||||
/// in `find_by_id_in_manga` enforces this.
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn get_chapter_from_wrong_manga_is_404(pool: PgPool) {
|
||||
let h = common::harness(pool.clone());
|
||||
let (_, cookie) = common::register_user(&h.app).await;
|
||||
let manga_a = seed_manga(&h, &cookie, "Berserk").await;
|
||||
let manga_b = seed_manga(&h, &cookie, "Vagabond").await;
|
||||
let chapter_id = seed_chapter(&pool, manga_a, 1, Some("Episode 1")).await;
|
||||
|
||||
let resp = h
|
||||
.app
|
||||
.oneshot(common::get(&format!(
|
||||
"/api/v1/mangas/{manga_b}/chapters/{chapter_id}"
|
||||
)))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
||||
@@ -136,12 +168,12 @@ async fn list_pages_empty_for_chapter_without_upload(pool: PgPool) {
|
||||
let h = common::harness(pool.clone());
|
||||
let (_, cookie) = common::register_user(&h.app).await;
|
||||
let manga_id = seed_manga(&h, &cookie, "Berserk").await;
|
||||
seed_chapter(&pool, manga_id, 1, None).await;
|
||||
let chapter_id = seed_chapter(&pool, manga_id, 1, None).await;
|
||||
|
||||
let resp = h
|
||||
.app
|
||||
.oneshot(common::get(&format!(
|
||||
"/api/v1/mangas/{manga_id}/chapters/1/pages"
|
||||
"/api/v1/mangas/{manga_id}/chapters/{chapter_id}/pages"
|
||||
)))
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -155,11 +187,12 @@ async fn list_pages_returns_404_for_unknown_chapter(pool: PgPool) {
|
||||
let h = common::harness(pool);
|
||||
let (_, cookie) = common::register_user(&h.app).await;
|
||||
let manga_id = seed_manga(&h, &cookie, "Berserk").await;
|
||||
let unknown_chapter = Uuid::new_v4();
|
||||
|
||||
let resp = h
|
||||
.app
|
||||
.oneshot(common::get(&format!(
|
||||
"/api/v1/mangas/{manga_id}/chapters/99/pages"
|
||||
"/api/v1/mangas/{manga_id}/chapters/{unknown_chapter}/pages"
|
||||
)))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -139,13 +139,17 @@ async fn files_endpoint_streams_in_multiple_frames(pool: PgPool) {
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(resp.status(), StatusCode::CREATED);
|
||||
let chapter_id = common::body_json(resp).await["id"]
|
||||
.as_str()
|
||||
.unwrap()
|
||||
.to_string();
|
||||
|
||||
// Fetch the page back via the streaming files endpoint.
|
||||
let pages = h
|
||||
.app
|
||||
.clone()
|
||||
.oneshot(common::get(&format!(
|
||||
"/api/v1/mangas/{manga_id}/chapters/1/pages"
|
||||
"/api/v1/mangas/{manga_id}/chapters/{chapter_id}/pages"
|
||||
)))
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -317,8 +321,12 @@ async fn create_chapter_rejects_renamed_non_image_page(pool: PgPool) {
|
||||
assert_eq!(body["error"]["code"], "unsupported_media_type");
|
||||
}
|
||||
|
||||
/// Multiple chapters can share the same number — different
|
||||
/// scanlations, re-uploads, translator notes. As of migration 0013,
|
||||
/// (manga_id, number) is not unique and each upload gets its own
|
||||
/// chapter id.
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn create_chapter_returns_409_on_duplicate_number(pool: PgPool) {
|
||||
async fn create_chapter_allows_duplicate_numbers_as_separate_chapters(pool: PgPool) {
|
||||
let h = common::harness(pool);
|
||||
let (_, cookie) = common::register_user(&h.app).await;
|
||||
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||
@@ -334,10 +342,27 @@ async fn create_chapter_returns_409_on_duplicate_number(pool: PgPool) {
|
||||
};
|
||||
let first = h.app.clone().oneshot(make()).await.unwrap();
|
||||
assert_eq!(first.status(), StatusCode::CREATED);
|
||||
let second = h.app.oneshot(make()).await.unwrap();
|
||||
assert_eq!(second.status(), StatusCode::CONFLICT);
|
||||
let body = common::body_json(second).await;
|
||||
assert_eq!(body["error"]["code"], "conflict");
|
||||
let first_id = common::body_json(first).await["id"].as_str().unwrap().to_string();
|
||||
|
||||
let second = h.app.clone().oneshot(make()).await.unwrap();
|
||||
assert_eq!(second.status(), StatusCode::CREATED);
|
||||
let second_id = common::body_json(second).await["id"].as_str().unwrap().to_string();
|
||||
|
||||
assert_ne!(first_id, second_id, "each upload gets a distinct chapter id");
|
||||
|
||||
// List endpoint surfaces both rows.
|
||||
let resp = h
|
||||
.app
|
||||
.oneshot(common::get(&format!("/api/v1/mangas/{manga_id}/chapters")))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(resp.status(), StatusCode::OK);
|
||||
let body = common::body_json(resp).await;
|
||||
let items = body["items"].as_array().unwrap();
|
||||
assert_eq!(items.len(), 2, "both Ch.1 uploads listed separately");
|
||||
for item in items {
|
||||
assert_eq!(item["number"], 1);
|
||||
}
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
|
||||
372
backend/tests/crawler_daemon.rs
Normal file
372
backend/tests/crawler_daemon.rs
Normal file
@@ -0,0 +1,372 @@
|
||||
//! Integration tests for the crawler daemon's cron + worker pool. The
|
||||
//! daemon's full real path requires Chromium and a live source; here we
|
||||
//! test the seam (MetadataPass / ChapterDispatcher traits) and the
|
||||
//! cron/worker control-flow.
|
||||
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use chrono::NaiveTime;
|
||||
use chrono_tz::Tz;
|
||||
use mangalord::crawler::content::SyncOutcome;
|
||||
use mangalord::crawler::daemon::{
|
||||
self, test_support::CountingMetadataPass, ChapterDispatcher, DaemonConfig, MetadataPass,
|
||||
CRON_LOCK_KEY,
|
||||
};
|
||||
use mangalord::crawler::jobs::{self, JobPayload};
|
||||
use mangalord::crawler::pipeline;
|
||||
use serde_json::json;
|
||||
use sqlx::PgPool;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use uuid::Uuid;
|
||||
|
||||
fn far_future_daily_at() -> NaiveTime {
|
||||
// Some time hours from "now" so the scheduler sleeps for the whole test.
|
||||
NaiveTime::from_hms_opt(23, 59, 0).unwrap()
|
||||
}
|
||||
|
||||
fn make_cfg(
|
||||
metadata_pass: Option<Arc<dyn MetadataPass>>,
|
||||
dispatcher: Arc<dyn ChapterDispatcher>,
|
||||
session_expired: Arc<std::sync::atomic::AtomicBool>,
|
||||
workers: usize,
|
||||
) -> DaemonConfig {
|
||||
DaemonConfig {
|
||||
metadata_pass,
|
||||
dispatcher,
|
||||
chapter_workers: workers,
|
||||
daily_at: far_future_daily_at(),
|
||||
tz: Tz::UTC,
|
||||
retention_days: 7,
|
||||
session_expired,
|
||||
extra_tasks: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn enqueue_chapter_job(pool: &PgPool) -> Uuid {
|
||||
let chapter_id = Uuid::new_v4();
|
||||
let payload = JobPayload::SyncChapterContent {
|
||||
source_id: "target".into(),
|
||||
chapter_id,
|
||||
source_chapter_key: format!("ch-{chapter_id}"),
|
||||
};
|
||||
let res = jobs::enqueue(pool, &payload).await.unwrap();
|
||||
match res {
|
||||
jobs::EnqueueResult::Inserted(_) => chapter_id,
|
||||
jobs::EnqueueResult::Skipped => unreachable!("fresh chapter_id"),
|
||||
}
|
||||
}
|
||||
|
||||
async fn count_state(pool: &PgPool, state: &str) -> i64 {
|
||||
sqlx::query_scalar::<_, i64>("SELECT COUNT(*) FROM crawler_jobs WHERE state = $1")
|
||||
.bind(state)
|
||||
.fetch_one(pool)
|
||||
.await
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
struct AlwaysDoneDispatcher {
|
||||
seen: AtomicUsize,
|
||||
}
|
||||
#[async_trait::async_trait]
|
||||
impl ChapterDispatcher for AlwaysDoneDispatcher {
|
||||
async fn dispatch(&self, _payload: JobPayload) -> anyhow::Result<SyncOutcome> {
|
||||
self.seen.fetch_add(1, Ordering::AcqRel);
|
||||
Ok(SyncOutcome::Fetched { pages: 1 })
|
||||
}
|
||||
}
|
||||
|
||||
struct PanickingDispatcher {
|
||||
seen: AtomicUsize,
|
||||
}
|
||||
#[async_trait::async_trait]
|
||||
impl ChapterDispatcher for PanickingDispatcher {
|
||||
async fn dispatch(&self, _payload: JobPayload) -> anyhow::Result<SyncOutcome> {
|
||||
self.seen.fetch_add(1, Ordering::AcqRel);
|
||||
panic!("intentional dispatcher panic");
|
||||
}
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn workers_drain_jobs_through_dispatcher(pool: PgPool) {
|
||||
enqueue_chapter_job(&pool).await;
|
||||
enqueue_chapter_job(&pool).await;
|
||||
enqueue_chapter_job(&pool).await;
|
||||
|
||||
let dispatcher = Arc::new(AlwaysDoneDispatcher {
|
||||
seen: AtomicUsize::new(0),
|
||||
});
|
||||
let session_expired = Arc::new(std::sync::atomic::AtomicBool::new(false));
|
||||
let cancel = CancellationToken::new();
|
||||
let handle = daemon::spawn(
|
||||
pool.clone(),
|
||||
cancel.clone(),
|
||||
make_cfg(None, dispatcher.clone(), session_expired, 2),
|
||||
);
|
||||
|
||||
// Wait for the workers to drain all three jobs.
|
||||
let dispatcher_seen = || dispatcher.seen.load(Ordering::Acquire);
|
||||
for _ in 0..40 {
|
||||
if dispatcher_seen() >= 3 {
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(50)).await;
|
||||
}
|
||||
assert!(
|
||||
dispatcher_seen() >= 3,
|
||||
"expected at least 3 dispatches, got {}",
|
||||
dispatcher_seen()
|
||||
);
|
||||
|
||||
handle.shutdown().await;
|
||||
assert_eq!(count_state(&pool, "done").await, 3);
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn workers_idle_while_session_expired(pool: PgPool) {
|
||||
let id = enqueue_chapter_job(&pool).await;
|
||||
let dispatcher = Arc::new(AlwaysDoneDispatcher {
|
||||
seen: AtomicUsize::new(0),
|
||||
});
|
||||
let session_expired = Arc::new(std::sync::atomic::AtomicBool::new(true));
|
||||
let cancel = CancellationToken::new();
|
||||
let handle = daemon::spawn(
|
||||
pool.clone(),
|
||||
cancel.clone(),
|
||||
make_cfg(None, dispatcher.clone(), Arc::clone(&session_expired), 1),
|
||||
);
|
||||
|
||||
// Wait long enough that a non-idled worker would have leased and ack'd.
|
||||
tokio::time::sleep(Duration::from_millis(800)).await;
|
||||
assert_eq!(
|
||||
dispatcher.seen.load(Ordering::Acquire),
|
||||
0,
|
||||
"dispatcher must not be invoked while session_expired flag is set"
|
||||
);
|
||||
assert_eq!(count_state(&pool, "pending").await, 1);
|
||||
let _ = id;
|
||||
|
||||
handle.shutdown().await;
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn dispatcher_panic_is_contained_and_job_is_acked_failed(pool: PgPool) {
|
||||
enqueue_chapter_job(&pool).await;
|
||||
enqueue_chapter_job(&pool).await;
|
||||
|
||||
let dispatcher = Arc::new(PanickingDispatcher {
|
||||
seen: AtomicUsize::new(0),
|
||||
});
|
||||
let session_expired = Arc::new(std::sync::atomic::AtomicBool::new(false));
|
||||
let cancel = CancellationToken::new();
|
||||
let handle = daemon::spawn(
|
||||
pool.clone(),
|
||||
cancel.clone(),
|
||||
make_cfg(None, dispatcher.clone(), session_expired, 1),
|
||||
);
|
||||
|
||||
// Wait for the worker to handle both panicking jobs.
|
||||
for _ in 0..40 {
|
||||
if dispatcher.seen.load(Ordering::Acquire) >= 2 {
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(50)).await;
|
||||
}
|
||||
assert!(
|
||||
dispatcher.seen.load(Ordering::Acquire) >= 2,
|
||||
"worker must keep going after a panic — handled at least 2 jobs"
|
||||
);
|
||||
|
||||
handle.shutdown().await;
|
||||
|
||||
// attempts=1 below max=5, so the panicking jobs go back to pending with
|
||||
// backoff and `last_error = "worker panicked"`.
|
||||
let last_errors: Vec<String> = sqlx::query_scalar(
|
||||
"SELECT last_error FROM crawler_jobs WHERE last_error IS NOT NULL",
|
||||
)
|
||||
.fetch_all(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(last_errors.len(), 2);
|
||||
assert!(last_errors.iter().all(|e| e == "worker panicked"));
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn cron_skips_tick_when_advisory_lock_held(pool: PgPool) {
|
||||
// With no last_metadata_tick_at row, the daemon does a catch-up tick
|
||||
// immediately on spawn. We hold the advisory lock on a separate
|
||||
// connection beforehand so the catch-up's pg_try_advisory_lock returns
|
||||
// false and the tick must skip without invoking the metadata pass.
|
||||
let mut lock_conn = pool.acquire().await.unwrap();
|
||||
sqlx::query("SELECT pg_advisory_lock($1)")
|
||||
.bind(CRON_LOCK_KEY)
|
||||
.execute(&mut *lock_conn)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let counter = Arc::new(CountingMetadataPass::default());
|
||||
let dispatcher = Arc::new(AlwaysDoneDispatcher {
|
||||
seen: AtomicUsize::new(0),
|
||||
});
|
||||
let session_expired = Arc::new(std::sync::atomic::AtomicBool::new(false));
|
||||
let cancel = CancellationToken::new();
|
||||
// daily_at far in the future so after the (skipped) catch-up the
|
||||
// cron sleeps for the rest of the test rather than racing for the lock.
|
||||
let cfg = make_cfg(
|
||||
Some(counter.clone() as Arc<dyn MetadataPass>),
|
||||
dispatcher,
|
||||
session_expired,
|
||||
1,
|
||||
);
|
||||
let handle = daemon::spawn(pool.clone(), cancel.clone(), cfg);
|
||||
|
||||
tokio::time::sleep(Duration::from_millis(800)).await;
|
||||
assert_eq!(
|
||||
counter.count.load(Ordering::Acquire),
|
||||
0,
|
||||
"cron must skip the catch-up tick while the advisory lock is held"
|
||||
);
|
||||
|
||||
sqlx::query("SELECT pg_advisory_unlock($1)")
|
||||
.bind(CRON_LOCK_KEY)
|
||||
.execute(&mut *lock_conn)
|
||||
.await
|
||||
.unwrap();
|
||||
drop(lock_conn);
|
||||
|
||||
handle.shutdown().await;
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn cron_catches_up_when_last_tick_is_stale(pool: PgPool) {
|
||||
// Pre-seed last_metadata_tick_at well in the past so previous_fire(now)
|
||||
// > last_tick is trivially true and the daemon catches up immediately.
|
||||
sqlx::query(
|
||||
"INSERT INTO crawler_state (key, value) VALUES ($1, $2)
|
||||
ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value",
|
||||
)
|
||||
.bind("last_metadata_tick_at")
|
||||
.bind(json!({"at": "2020-01-01T00:00:00Z"}))
|
||||
.execute(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let counter = Arc::new(CountingMetadataPass::default());
|
||||
let dispatcher = Arc::new(AlwaysDoneDispatcher {
|
||||
seen: AtomicUsize::new(0),
|
||||
});
|
||||
let session_expired = Arc::new(std::sync::atomic::AtomicBool::new(false));
|
||||
let cancel = CancellationToken::new();
|
||||
let handle = daemon::spawn(
|
||||
pool.clone(),
|
||||
cancel.clone(),
|
||||
make_cfg(
|
||||
Some(counter.clone() as Arc<dyn MetadataPass>),
|
||||
dispatcher,
|
||||
session_expired,
|
||||
1,
|
||||
),
|
||||
);
|
||||
|
||||
for _ in 0..40 {
|
||||
if counter.count.load(Ordering::Acquire) >= 1 {
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(50)).await;
|
||||
}
|
||||
assert!(
|
||||
counter.count.load(Ordering::Acquire) >= 1,
|
||||
"catch-up tick should have fired immediately"
|
||||
);
|
||||
|
||||
handle.shutdown().await;
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn enqueue_bookmarked_pending_skips_dropped_sources(pool: PgPool) {
|
||||
// Setup: one manga with two chapters (page_count = 0). One has a
|
||||
// non-dropped source; the other's source is dropped. A user bookmarks
|
||||
// the manga. Expectation: only the non-dropped chapter is enqueued.
|
||||
let user_id: Uuid = sqlx::query_scalar(
|
||||
"INSERT INTO users (username, password_hash) VALUES ($1, $2) RETURNING id",
|
||||
)
|
||||
.bind("alice")
|
||||
.bind("not-a-real-hash")
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
let manga_id: Uuid = sqlx::query_scalar(
|
||||
"INSERT INTO mangas (title) VALUES ($1) RETURNING id",
|
||||
)
|
||||
.bind("Berserk")
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
sqlx::query("INSERT INTO sources (id, name, base_url) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING")
|
||||
.bind("target")
|
||||
.bind("Target")
|
||||
.bind("https://example.com")
|
||||
.execute(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
let c1: Uuid = sqlx::query_scalar(
|
||||
"INSERT INTO chapters (manga_id, number, page_count) VALUES ($1, 1, 0) RETURNING id",
|
||||
)
|
||||
.bind(manga_id)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
let c2: Uuid = sqlx::query_scalar(
|
||||
"INSERT INTO chapters (manga_id, number, page_count) VALUES ($1, 2, 0) RETURNING id",
|
||||
)
|
||||
.bind(manga_id)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
// c1: alive source. c2: dropped source.
|
||||
sqlx::query(
|
||||
"INSERT INTO chapter_sources (source_id, source_chapter_key, chapter_id, source_url) \
|
||||
VALUES ($1, $2, $3, $4)",
|
||||
)
|
||||
.bind("target")
|
||||
.bind("ch1")
|
||||
.bind(c1)
|
||||
.bind("https://example.com/ch1")
|
||||
.execute(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
sqlx::query(
|
||||
"INSERT INTO chapter_sources (source_id, source_chapter_key, chapter_id, source_url, dropped_at) \
|
||||
VALUES ($1, $2, $3, $4, now())",
|
||||
)
|
||||
.bind("target")
|
||||
.bind("ch2")
|
||||
.bind(c2)
|
||||
.bind("https://example.com/ch2")
|
||||
.execute(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
sqlx::query("INSERT INTO bookmarks (user_id, manga_id) VALUES ($1, $2)")
|
||||
.bind(user_id)
|
||||
.bind(manga_id)
|
||||
.execute(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let summary = pipeline::enqueue_bookmarked_pending(&pool).await.unwrap();
|
||||
assert_eq!(summary.inserted, 1, "only the non-dropped chapter enqueued");
|
||||
assert_eq!(summary.skipped, 0);
|
||||
let payloads: Vec<serde_json::Value> = sqlx::query_scalar(
|
||||
"SELECT payload FROM crawler_jobs WHERE payload->>'kind' = 'sync_chapter_content'",
|
||||
)
|
||||
.fetch_all(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(payloads.len(), 1);
|
||||
assert_eq!(
|
||||
payloads[0]["chapter_id"].as_str().unwrap(),
|
||||
c1.to_string()
|
||||
);
|
||||
}
|
||||
|
||||
441
backend/tests/crawler_jobs.rs
Normal file
441
backend/tests/crawler_jobs.rs
Normal file
@@ -0,0 +1,441 @@
|
||||
//! Integration tests for `crawler::jobs` queue operations.
|
||||
//!
|
||||
//! Uses `#[sqlx::test(migrations = "./migrations")]` which provisions a fresh
|
||||
//! migrated DB per test. No browser, no axum router — these exercise the SQL
|
||||
//! shape and dedup-index semantics directly against Postgres.
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use mangalord::crawler::jobs::{
|
||||
self, EnqueueResult, JobPayload, KIND_SYNC_CHAPTER_CONTENT,
|
||||
};
|
||||
use mangalord::crawler::source::DiscoverMode;
|
||||
use sqlx::PgPool;
|
||||
use uuid::Uuid;
|
||||
|
||||
fn chapter_content_payload(chapter_id: Uuid) -> JobPayload {
|
||||
JobPayload::SyncChapterContent {
|
||||
source_id: "target".into(),
|
||||
chapter_id,
|
||||
source_chapter_key: format!("ch-{chapter_id}"),
|
||||
}
|
||||
}
|
||||
|
||||
fn discover_payload() -> JobPayload {
|
||||
JobPayload::Discover {
|
||||
source_id: "target".into(),
|
||||
mode: DiscoverMode::Backfill,
|
||||
}
|
||||
}
|
||||
|
||||
async fn job_state(pool: &PgPool, id: Uuid) -> String {
|
||||
sqlx::query_scalar::<_, String>("SELECT state FROM crawler_jobs WHERE id = $1")
|
||||
.bind(id)
|
||||
.fetch_one(pool)
|
||||
.await
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
async fn job_attempts(pool: &PgPool, id: Uuid) -> i32 {
|
||||
sqlx::query_scalar::<_, i32>("SELECT attempts FROM crawler_jobs WHERE id = $1")
|
||||
.bind(id)
|
||||
.fetch_one(pool)
|
||||
.await
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
async fn job_count(pool: &PgPool) -> i64 {
|
||||
sqlx::query_scalar::<_, i64>("SELECT COUNT(*) FROM crawler_jobs")
|
||||
.fetch_one(pool)
|
||||
.await
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn enqueue_inserts_pending_row_with_round_trip_payload(pool: PgPool) {
|
||||
let chapter_id = Uuid::new_v4();
|
||||
let payload = chapter_content_payload(chapter_id);
|
||||
|
||||
let result = jobs::enqueue(&pool, &payload).await.unwrap();
|
||||
let id = match result {
|
||||
EnqueueResult::Inserted(id) => id,
|
||||
EnqueueResult::Skipped => panic!("expected Inserted on first enqueue"),
|
||||
};
|
||||
|
||||
assert_eq!(job_state(&pool, id).await, "pending");
|
||||
assert_eq!(job_attempts(&pool, id).await, 0);
|
||||
|
||||
let raw_payload: serde_json::Value =
|
||||
sqlx::query_scalar("SELECT payload FROM crawler_jobs WHERE id = $1")
|
||||
.bind(id)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
let decoded: JobPayload = serde_json::from_value(raw_payload).unwrap();
|
||||
match decoded {
|
||||
JobPayload::SyncChapterContent {
|
||||
source_id,
|
||||
chapter_id: c,
|
||||
source_chapter_key,
|
||||
} => {
|
||||
assert_eq!(source_id, "target");
|
||||
assert_eq!(c, chapter_id);
|
||||
assert_eq!(source_chapter_key, format!("ch-{chapter_id}"));
|
||||
}
|
||||
_ => panic!("payload variant mismatch"),
|
||||
}
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn duplicate_chapter_content_while_pending_is_skipped(pool: PgPool) {
|
||||
let chapter_id = Uuid::new_v4();
|
||||
let p = chapter_content_payload(chapter_id);
|
||||
|
||||
let first = jobs::enqueue(&pool, &p).await.unwrap();
|
||||
assert!(matches!(first, EnqueueResult::Inserted(_)));
|
||||
|
||||
let second = jobs::enqueue(&pool, &p).await.unwrap();
|
||||
assert!(matches!(second, EnqueueResult::Skipped));
|
||||
|
||||
assert_eq!(job_count(&pool).await, 1);
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn duplicate_after_done_releases_dedup_slot(pool: PgPool) {
|
||||
let chapter_id = Uuid::new_v4();
|
||||
let p = chapter_content_payload(chapter_id);
|
||||
|
||||
let first_id = match jobs::enqueue(&pool, &p).await.unwrap() {
|
||||
EnqueueResult::Inserted(id) => id,
|
||||
EnqueueResult::Skipped => panic!("first enqueue should insert"),
|
||||
};
|
||||
// Move the first job out of (pending|running) so the partial index drops it.
|
||||
sqlx::query("UPDATE crawler_jobs SET state = 'done' WHERE id = $1")
|
||||
.bind(first_id)
|
||||
.execute(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let second = jobs::enqueue(&pool, &p).await.unwrap();
|
||||
assert!(
|
||||
matches!(second, EnqueueResult::Inserted(_)),
|
||||
"after done the chapter_id slot is free again"
|
||||
);
|
||||
assert_eq!(job_count(&pool).await, 2);
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn different_chapter_ids_can_coexist(pool: PgPool) {
|
||||
let p1 = chapter_content_payload(Uuid::new_v4());
|
||||
let p2 = chapter_content_payload(Uuid::new_v4());
|
||||
assert!(matches!(
|
||||
jobs::enqueue(&pool, &p1).await.unwrap(),
|
||||
EnqueueResult::Inserted(_)
|
||||
));
|
||||
assert!(matches!(
|
||||
jobs::enqueue(&pool, &p2).await.unwrap(),
|
||||
EnqueueResult::Inserted(_)
|
||||
));
|
||||
assert_eq!(job_count(&pool).await, 2);
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn non_chapter_content_payloads_are_never_deduped(pool: PgPool) {
|
||||
let p = discover_payload();
|
||||
assert!(matches!(
|
||||
jobs::enqueue(&pool, &p).await.unwrap(),
|
||||
EnqueueResult::Inserted(_)
|
||||
));
|
||||
assert!(matches!(
|
||||
jobs::enqueue(&pool, &p).await.unwrap(),
|
||||
EnqueueResult::Inserted(_)
|
||||
));
|
||||
assert_eq!(job_count(&pool).await, 2);
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn lease_marks_running_and_bumps_attempts_and_sets_leased_until(pool: PgPool) {
|
||||
let id = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
EnqueueResult::Inserted(id) => id,
|
||||
EnqueueResult::Skipped => unreachable!(),
|
||||
};
|
||||
|
||||
let leases = jobs::lease(&pool, None, 10, Duration::from_secs(60))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(leases.len(), 1);
|
||||
let lease = &leases[0];
|
||||
assert_eq!(lease.id, id);
|
||||
assert_eq!(lease.attempts, 1);
|
||||
|
||||
assert_eq!(job_state(&pool, id).await, "running");
|
||||
|
||||
let leased_until: Option<chrono::DateTime<chrono::Utc>> =
|
||||
sqlx::query_scalar("SELECT leased_until FROM crawler_jobs WHERE id = $1")
|
||||
.bind(id)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
let leased_until = leased_until.expect("leased_until set");
|
||||
assert!(leased_until > chrono::Utc::now());
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn lease_with_kind_filter_only_matches_that_kind(pool: PgPool) {
|
||||
let discover_id = match jobs::enqueue(&pool, &discover_payload()).await.unwrap() {
|
||||
EnqueueResult::Inserted(id) => id,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
let chapter_id = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
EnqueueResult::Inserted(id) => id,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
let leases = jobs::lease(
|
||||
&pool,
|
||||
Some(KIND_SYNC_CHAPTER_CONTENT),
|
||||
10,
|
||||
Duration::from_secs(60),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(leases.len(), 1, "only chapter content payload leases");
|
||||
assert_eq!(leases[0].id, chapter_id);
|
||||
// discover is still pending
|
||||
assert_eq!(job_state(&pool, discover_id).await, "pending");
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn concurrent_leases_under_skip_locked_return_disjoint_ids(pool: PgPool) {
|
||||
// 4 pending jobs, two concurrent calls each asking for up to 2.
|
||||
let mut ids = Vec::new();
|
||||
for _ in 0..4 {
|
||||
let id = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
EnqueueResult::Inserted(id) => id,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
ids.push(id);
|
||||
}
|
||||
|
||||
let (a, b) = tokio::join!(
|
||||
jobs::lease(&pool, None, 2, Duration::from_secs(60)),
|
||||
jobs::lease(&pool, None, 2, Duration::from_secs(60)),
|
||||
);
|
||||
let a = a.unwrap();
|
||||
let b = b.unwrap();
|
||||
let mut seen: Vec<Uuid> = a.iter().chain(b.iter()).map(|l| l.id).collect();
|
||||
seen.sort();
|
||||
seen.dedup();
|
||||
let count = a.len() + b.len();
|
||||
assert_eq!(
|
||||
seen.len(),
|
||||
count,
|
||||
"no id appears in both lease results (SKIP LOCKED)"
|
||||
);
|
||||
assert!(count >= 2, "at least one lease saw work");
|
||||
assert!(count <= 4);
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn stale_running_lease_can_be_reclaimed(pool: PgPool) {
|
||||
let id = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
EnqueueResult::Inserted(id) => id,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
let first = jobs::lease(&pool, None, 1, Duration::from_secs(60))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(first.len(), 1);
|
||||
// Pretend the worker crashed: rewind leased_until into the past.
|
||||
sqlx::query("UPDATE crawler_jobs SET leased_until = now() - interval '1 minute' WHERE id = $1")
|
||||
.bind(id)
|
||||
.execute(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let second = jobs::lease(&pool, None, 1, Duration::from_secs(60))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(second.len(), 1, "stale running row was re-leased");
|
||||
assert_eq!(second[0].id, id);
|
||||
assert_eq!(second[0].attempts, 2, "attempts bumped again");
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn ack_done_transitions_state_and_clears_lease(pool: PgPool) {
|
||||
let id = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
EnqueueResult::Inserted(id) => id,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
let leases = jobs::lease(&pool, None, 1, Duration::from_secs(60))
|
||||
.await
|
||||
.unwrap();
|
||||
jobs::ack_done(&pool, leases[0].id).await.unwrap();
|
||||
|
||||
assert_eq!(job_state(&pool, id).await, "done");
|
||||
let leased_until: Option<chrono::DateTime<chrono::Utc>> =
|
||||
sqlx::query_scalar("SELECT leased_until FROM crawler_jobs WHERE id = $1")
|
||||
.bind(id)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(leased_until.is_none());
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn ack_failed_under_max_returns_to_pending_with_future_schedule(pool: PgPool) {
|
||||
let id = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
EnqueueResult::Inserted(id) => id,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
let leases = jobs::lease(&pool, None, 1, Duration::from_secs(60))
|
||||
.await
|
||||
.unwrap();
|
||||
let lease = &leases[0];
|
||||
jobs::ack_failed(&pool, lease.id, "boom", lease.attempts, lease.max_attempts)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(job_state(&pool, id).await, "pending");
|
||||
|
||||
let (scheduled_at, last_error): (chrono::DateTime<chrono::Utc>, Option<String>) =
|
||||
sqlx::query_as("SELECT scheduled_at, last_error FROM crawler_jobs WHERE id = $1")
|
||||
.bind(id)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(scheduled_at > chrono::Utc::now());
|
||||
assert_eq!(last_error.as_deref(), Some("boom"));
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn ack_failed_at_max_marks_dead(pool: PgPool) {
|
||||
let id = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
EnqueueResult::Inserted(id) => id,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
// Force a single lease then mark "this was attempt N where N == max_attempts".
|
||||
let leases = jobs::lease(&pool, None, 1, Duration::from_secs(60))
|
||||
.await
|
||||
.unwrap();
|
||||
let lease = &leases[0];
|
||||
jobs::ack_failed(&pool, lease.id, "final boom", lease.max_attempts, lease.max_attempts)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(job_state(&pool, id).await, "dead");
|
||||
let last_error: Option<String> =
|
||||
sqlx::query_scalar("SELECT last_error FROM crawler_jobs WHERE id = $1")
|
||||
.bind(id)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(last_error.as_deref(), Some("final boom"));
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn release_returns_to_pending_and_undoes_attempt_increment(pool: PgPool) {
|
||||
let id = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
EnqueueResult::Inserted(id) => id,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
let leases = jobs::lease(&pool, None, 1, Duration::from_secs(60))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(leases[0].attempts, 1);
|
||||
jobs::release(&pool, leases[0].id).await.unwrap();
|
||||
|
||||
assert_eq!(job_state(&pool, id).await, "pending");
|
||||
assert_eq!(job_attempts(&pool, id).await, 0);
|
||||
let leased_until: Option<chrono::DateTime<chrono::Utc>> =
|
||||
sqlx::query_scalar("SELECT leased_until FROM crawler_jobs WHERE id = $1")
|
||||
.bind(id)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(leased_until.is_none());
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn reap_done_deletes_old_rows_keeps_fresh(pool: PgPool) {
|
||||
// Two done rows: one old (updated_at 10 days ago), one fresh.
|
||||
let old_id = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
EnqueueResult::Inserted(id) => id,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
let fresh_id = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
EnqueueResult::Inserted(id) => id,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
sqlx::query("UPDATE crawler_jobs SET state='done', updated_at = now() - interval '10 days' WHERE id = $1")
|
||||
.bind(old_id)
|
||||
.execute(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
sqlx::query("UPDATE crawler_jobs SET state='done' WHERE id = $1")
|
||||
.bind(fresh_id)
|
||||
.execute(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let deleted = jobs::reap_done(&pool, 7).await.unwrap();
|
||||
assert_eq!(deleted, 1);
|
||||
|
||||
let remaining: Vec<Uuid> = sqlx::query_scalar("SELECT id FROM crawler_jobs ORDER BY id")
|
||||
.fetch_all(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(remaining, vec![fresh_id], "only fresh row remains");
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn reap_done_zero_is_a_no_op(pool: PgPool) {
|
||||
let id = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
EnqueueResult::Inserted(id) => id,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
sqlx::query("UPDATE crawler_jobs SET state='done', updated_at = now() - interval '999 days' WHERE id = $1")
|
||||
.bind(id)
|
||||
.execute(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let deleted = jobs::reap_done(&pool, 0).await.unwrap();
|
||||
assert_eq!(deleted, 0);
|
||||
assert_eq!(job_count(&pool).await, 1);
|
||||
}
|
||||
@@ -232,6 +232,82 @@ async fn sync_chapters_adds_new_refreshes_existing_and_drops_vanished(pool: PgPo
|
||||
assert!(dropped.0.is_some(), "ch2 should be soft-dropped");
|
||||
}
|
||||
|
||||
/// Real-world sources publish multiple chapters at the same number
|
||||
/// (different uploaders, translator notes, re-releases). After the
|
||||
/// (manga_id, number) UNIQUE drop in 0013, each `SourceChapterRef`
|
||||
/// becomes its own `chapters` row even when the parsed number matches
|
||||
/// — chapter identity is now the chapter id, not the number.
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn sync_chapters_keeps_duplicate_numbered_chapters_as_separate_rows(pool: PgPool) {
|
||||
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
||||
.await
|
||||
.unwrap();
|
||||
let m = sample_manga("foo", "Foo Manga", "hash-1");
|
||||
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Two distinct uploads of Ch.52 (different uploaders → different
|
||||
// URLs/keys, same parsed number) plus a notice/hiatus row that
|
||||
// parses to number=0 alongside a real chapter at number 1.
|
||||
let chapters = vec![
|
||||
SourceChapterRef {
|
||||
source_chapter_key: "br_chapter-A".into(),
|
||||
number: 52,
|
||||
title: Some("Ch.52 : Official".into()),
|
||||
url: "https://x.example/foo/A/pg-1/".into(),
|
||||
},
|
||||
SourceChapterRef {
|
||||
source_chapter_key: "br_chapter-B".into(),
|
||||
number: 52,
|
||||
title: Some("Ch.52 : Official (alt)".into()),
|
||||
url: "https://x.example/foo/B/pg-1/".into(),
|
||||
},
|
||||
SourceChapterRef {
|
||||
source_chapter_key: "br_chapter-NOTICE".into(),
|
||||
number: 0,
|
||||
title: Some("hitaus.".into()),
|
||||
url: "https://x.example/foo/notice/pg-1/".into(),
|
||||
},
|
||||
SourceChapterRef {
|
||||
source_chapter_key: "br_chapter-1".into(),
|
||||
number: 1,
|
||||
title: Some("Ch.1 : Official".into()),
|
||||
url: "https://x.example/foo/1/pg-1/".into(),
|
||||
},
|
||||
];
|
||||
|
||||
let diff = crawler::sync_manga_chapters(&pool, "target", up.manga_id, &chapters)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
diff,
|
||||
ChapterDiff {
|
||||
new: 4,
|
||||
refreshed: 0,
|
||||
dropped: 0
|
||||
},
|
||||
"every source ref yields a new chapter row"
|
||||
);
|
||||
|
||||
let rows: (i64,) =
|
||||
sqlx::query_as("SELECT COUNT(*) FROM chapters WHERE manga_id = $1")
|
||||
.bind(up.manga_id)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(rows.0, 4, "4 distinct chapter rows even with duplicate numbers");
|
||||
|
||||
let ch52_count: (i64,) = sqlx::query_as(
|
||||
"SELECT COUNT(*) FROM chapters WHERE manga_id = $1 AND number = 52",
|
||||
)
|
||||
.bind(up.manga_id)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(ch52_count.0, 2, "both Ch.52 uploads survive as separate rows");
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn mark_dropped_mangas_only_drops_unseen(pool: PgPool) {
|
||||
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
||||
|
||||
194
backend/tests/fixtures/target/chapter_list_uu.html
vendored
Normal file
194
backend/tests/fixtures/target/chapter_list_uu.html
vendored
Normal file
@@ -0,0 +1,194 @@
|
||||
<table class="listing" id="chapter_table">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<h4>
|
||||
<a class="chico"
|
||||
href=".../uu/br_chapter-379272/pg-1/"><b>Ch.67</b>
|
||||
: Official </a>
|
||||
<b style="color:#FEFD7F;width;30px;display:inline-block;margin-left:5px">new</b>
|
||||
</h4>
|
||||
</td>
|
||||
<td class="no">
|
||||
<a href=".../2843005/upload/">bloomingdale</a>
|
||||
</td>
|
||||
<td class="no">May 20, 2026</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<h4>
|
||||
<a class="chico"
|
||||
href=".../uu/br_chapter-328248/pg-1/"><b>hitaus.</b>
|
||||
</a>
|
||||
</h4>
|
||||
</td>
|
||||
<td class="no">
|
||||
<a href=".../2843005/upload/">bloomingdale</a>
|
||||
</td>
|
||||
<td class="no">Jan 15, 2026</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<h4>
|
||||
<a class="chico"
|
||||
href=".../uu/br_chapter-326351/pg-1/"><b>Ch.66</b>
|
||||
: Official </a>
|
||||
</h4>
|
||||
</td>
|
||||
<td class="no">
|
||||
<a href=".../2843005/upload/">bloomingdale</a>
|
||||
</td>
|
||||
<td class="no">Jan 10, 2026</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<h4>
|
||||
<a class="chico"
|
||||
href=".../uu/br_chapter-295078/pg-1/"><b>Ch.52</b>
|
||||
: Official </a>
|
||||
</h4>
|
||||
</td>
|
||||
<td class="no">
|
||||
<a href=".../2843005/upload/">bloomingdale</a>
|
||||
</td>
|
||||
<td class="no">Aug 28, 2025</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<h4>
|
||||
<a class="chico"
|
||||
href=".../uu/br_chapter-294815/pg-1/"><b>Ch.52</b>
|
||||
: Official </a>
|
||||
</h4>
|
||||
</td>
|
||||
<td class="no">
|
||||
<a href=".../4300634/upload/">mina</a>
|
||||
</td>
|
||||
<td class="no">Aug 27, 2025</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<h4>
|
||||
<a class="chico"
|
||||
href=".../uu/br_chapter-249964/pg-1/"><b>Ch.10</b>
|
||||
: Official </a>
|
||||
</h4>
|
||||
</td>
|
||||
<td class="no">
|
||||
<a href=".../2843005/upload/">bloomingdale</a>
|
||||
</td>
|
||||
<td class="no">Jan 5, 2025</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<h4>
|
||||
<a class="chico"
|
||||
href=".../uu/to_chapter-13/pg-1/"><b>Ch.13</b>
|
||||
: Thank you, we'll see you in the next one! </a>
|
||||
</h4>
|
||||
</td>
|
||||
<td class="no"></td>
|
||||
<td class="no">Dec 30, 2024</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<h4>
|
||||
<a class="chico"
|
||||
href=".../uu/br_chapter-249095/pg-1/"><b>Ch.9</b>
|
||||
: Official </a>
|
||||
</h4>
|
||||
</td>
|
||||
<td class="no">
|
||||
<a href=".../2843005/upload/">bloomingdale</a>
|
||||
</td>
|
||||
<td class="no">Dec 28, 2024</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<h4>
|
||||
<a class="chico"
|
||||
href=".../uu/br_chapter-248930/pg-1/"><b>Ch.1</b>
|
||||
: Official </a>
|
||||
</h4>
|
||||
</td>
|
||||
<td class="no">
|
||||
<a href=".../2843005/upload/">bloomingdale</a>
|
||||
</td>
|
||||
<td class="no">Dec 26, 2024</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<h4>
|
||||
<a class="chico"
|
||||
href=".../uu/to_chapter-12/pg-1/"><b>Ch.12</b>
|
||||
</a>
|
||||
</h4>
|
||||
</td>
|
||||
<td class="no"></td>
|
||||
<td class="no">Dec 1, 2024</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<h4>
|
||||
<a class="chico"
|
||||
href=".../uu/br_chapter-244844/pg-1/"><b>notice.</b>
|
||||
: Officials </a>
|
||||
</h4>
|
||||
</td>
|
||||
<td class="no">
|
||||
<a href=".../2843005/upload/">bloomingdale</a>
|
||||
</td>
|
||||
<td class="no">Nov 26, 2024</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<h4>
|
||||
<a class="chico"
|
||||
href=".../uu/to_chapter-11/pg-1/"><b>Ch.11</b>
|
||||
</a>
|
||||
</h4>
|
||||
</td>
|
||||
<td class="no"></td>
|
||||
<td class="no">Nov 18, 2024</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<h4>
|
||||
<a class="chico"
|
||||
href=".../uu/br_chapter-221180/pg-1/"><b>notice.</b>
|
||||
</a>
|
||||
</h4>
|
||||
</td>
|
||||
<td class="no">
|
||||
<a href=".../3781074/upload/">Izanami</a>
|
||||
</td>
|
||||
<td class="no">Jun 21, 2024</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<h4>
|
||||
<a class="chico"
|
||||
href=".../uu/br_chapter-234803/pg-1/"><b>notice.</b>
|
||||
</a>
|
||||
</h4>
|
||||
</td>
|
||||
<td class="no">
|
||||
<a href=".../2843005/upload/">bloomingdale</a>
|
||||
</td>
|
||||
<td class="no">Sep 13, 2024</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<h4>
|
||||
<a class="chico"
|
||||
href=".../uu/br_chapter-220299/pg-1/"><b>Ch.1</b>
|
||||
: Team Hazama </a>
|
||||
</h4>
|
||||
</td>
|
||||
<td class="no">
|
||||
<a href=".../1457681/upload/">purplepandabear</a>
|
||||
</td>
|
||||
<td class="no">Jun 16, 2024</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
@@ -1,6 +1,7 @@
|
||||
import { test, expect, type Page } from '@playwright/test';
|
||||
|
||||
const mangaId = '22222222-2222-2222-2222-222222222222';
|
||||
const chapterId = 'c2222222-2222-2222-2222-222222222222';
|
||||
const mangaFixture = {
|
||||
id: mangaId,
|
||||
title: 'Vagabond',
|
||||
@@ -11,7 +12,7 @@ const mangaFixture = {
|
||||
updated_at: '2026-01-01T00:00:00Z'
|
||||
};
|
||||
const chapterFixture = {
|
||||
id: 'c1',
|
||||
id: chapterId,
|
||||
manga_id: mangaId,
|
||||
number: 1,
|
||||
title: null,
|
||||
@@ -20,24 +21,24 @@ const chapterFixture = {
|
||||
};
|
||||
const pagesFixture = [
|
||||
{
|
||||
id: 'p1',
|
||||
chapter_id: 'c1',
|
||||
id: 'p1111111-2222-2222-2222-222222222222',
|
||||
chapter_id: chapterId,
|
||||
page_number: 1,
|
||||
storage_key: 'mangas/m2/chapters/c1/pages/0001.png',
|
||||
storage_key: `mangas/${mangaId}/chapters/${chapterId}/pages/0001.png`,
|
||||
content_type: 'image/png'
|
||||
},
|
||||
{
|
||||
id: 'p2',
|
||||
chapter_id: 'c1',
|
||||
id: 'p2222222-2222-2222-2222-222222222222',
|
||||
chapter_id: chapterId,
|
||||
page_number: 2,
|
||||
storage_key: 'mangas/m2/chapters/c1/pages/0002.png',
|
||||
storage_key: `mangas/${mangaId}/chapters/${chapterId}/pages/0002.png`,
|
||||
content_type: 'image/png'
|
||||
},
|
||||
{
|
||||
id: 'p3',
|
||||
chapter_id: 'c1',
|
||||
id: 'p3333333-2222-2222-2222-222222222222',
|
||||
chapter_id: chapterId,
|
||||
page_number: 3,
|
||||
storage_key: 'mangas/m2/chapters/c1/pages/0003.png',
|
||||
storage_key: `mangas/${mangaId}/chapters/${chapterId}/pages/0003.png`,
|
||||
content_type: 'image/png'
|
||||
}
|
||||
];
|
||||
@@ -92,14 +93,16 @@ async function mockReaderApis(page: Page) {
|
||||
})
|
||||
})
|
||||
);
|
||||
await page.route(`**/api/v1/mangas/${mangaId}/chapters/1`, (route) =>
|
||||
await page.route(`**/api/v1/mangas/${mangaId}/chapters/${chapterId}`, (route) =>
|
||||
route.fulfill({
|
||||
status: 200,
|
||||
contentType: 'application/json',
|
||||
body: JSON.stringify(chapterFixture)
|
||||
})
|
||||
);
|
||||
await page.route(`**/api/v1/mangas/${mangaId}/chapters/1/pages`, (route) =>
|
||||
await page.route(
|
||||
`**/api/v1/mangas/${mangaId}/chapters/${chapterId}/pages`,
|
||||
(route) =>
|
||||
route.fulfill({
|
||||
status: 200,
|
||||
contentType: 'application/json',
|
||||
@@ -131,7 +134,7 @@ test.beforeEach(async ({ context }) => {
|
||||
|
||||
test('switching to continuous mode stacks all pages and hides chevrons', async ({ page }) => {
|
||||
await mockReaderApis(page);
|
||||
await page.goto(`/manga/${mangaId}/chapter/1`);
|
||||
await page.goto(`/manga/${mangaId}/chapter/${chapterId}`);
|
||||
|
||||
// Default single-page mode is active.
|
||||
await expect(page.getByTestId('reader-page')).toBeVisible();
|
||||
@@ -149,7 +152,7 @@ test('switching to continuous mode stacks all pages and hides chevrons', async (
|
||||
|
||||
test('arrow keys do not paginate while in continuous mode', async ({ page }) => {
|
||||
await mockReaderApis(page);
|
||||
await page.goto(`/manga/${mangaId}/chapter/1`);
|
||||
await page.goto(`/manga/${mangaId}/chapter/${chapterId}`);
|
||||
await page.getByTestId('reader-mode-continuous').click();
|
||||
await expect(page.getByTestId('reader-continuous')).toBeVisible();
|
||||
|
||||
@@ -164,7 +167,7 @@ test('arrow keys do not paginate while in continuous mode', async ({ page }) =>
|
||||
|
||||
test('gap select updates the inline gap on the continuous container', async ({ page }) => {
|
||||
await mockReaderApis(page);
|
||||
await page.goto(`/manga/${mangaId}/chapter/1`);
|
||||
await page.goto(`/manga/${mangaId}/chapter/${chapterId}`);
|
||||
await page.getByTestId('reader-mode-continuous').click();
|
||||
|
||||
const container = page.getByTestId('reader-continuous');
|
||||
@@ -192,7 +195,7 @@ test('reader-mode preference set on one page is honored when the reader opens',
|
||||
});
|
||||
await mockReaderApis(page);
|
||||
|
||||
await page.goto(`/manga/${mangaId}/chapter/1`);
|
||||
await page.goto(`/manga/${mangaId}/chapter/${chapterId}`);
|
||||
await expect(page.getByTestId('reader-continuous')).toBeVisible();
|
||||
await expect(page.getByTestId('page-indicator')).toHaveText('3 pages');
|
||||
await expect(page.getByTestId('reader-continuous')).toHaveAttribute(
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { test, expect, type Page } from '@playwright/test';
|
||||
|
||||
const mangaId = '11111111-1111-1111-1111-111111111111';
|
||||
const chapterId = 'c1111111-1111-1111-1111-111111111111';
|
||||
const mangaFixture = {
|
||||
id: mangaId,
|
||||
title: 'Berserk',
|
||||
@@ -12,7 +13,7 @@ const mangaFixture = {
|
||||
};
|
||||
const chaptersFixture = [
|
||||
{
|
||||
id: 'c1',
|
||||
id: chapterId,
|
||||
manga_id: mangaId,
|
||||
number: 1,
|
||||
title: 'The Brand',
|
||||
@@ -22,24 +23,24 @@ const chaptersFixture = [
|
||||
];
|
||||
const pagesFixture = [
|
||||
{
|
||||
id: 'p1',
|
||||
chapter_id: 'c1',
|
||||
id: 'p1111111-1111-1111-1111-111111111111',
|
||||
chapter_id: chapterId,
|
||||
page_number: 1,
|
||||
storage_key: 'mangas/m1/chapters/c1/pages/0001.png',
|
||||
storage_key: `mangas/${mangaId}/chapters/${chapterId}/pages/0001.png`,
|
||||
content_type: 'image/png'
|
||||
},
|
||||
{
|
||||
id: 'p2',
|
||||
chapter_id: 'c1',
|
||||
id: 'p2222222-1111-1111-1111-111111111111',
|
||||
chapter_id: chapterId,
|
||||
page_number: 2,
|
||||
storage_key: 'mangas/m1/chapters/c1/pages/0002.png',
|
||||
storage_key: `mangas/${mangaId}/chapters/${chapterId}/pages/0002.png`,
|
||||
content_type: 'image/png'
|
||||
},
|
||||
{
|
||||
id: 'p3',
|
||||
chapter_id: 'c1',
|
||||
id: 'p3333333-1111-1111-1111-111111111111',
|
||||
chapter_id: chapterId,
|
||||
page_number: 3,
|
||||
storage_key: 'mangas/m1/chapters/c1/pages/0003.png',
|
||||
storage_key: `mangas/${mangaId}/chapters/${chapterId}/pages/0003.png`,
|
||||
content_type: 'image/png'
|
||||
}
|
||||
];
|
||||
@@ -86,14 +87,16 @@ async function mockReaderApis(page: Page) {
|
||||
})
|
||||
})
|
||||
);
|
||||
await page.route(`**/api/v1/mangas/${mangaId}/chapters/1`, (route) =>
|
||||
await page.route(`**/api/v1/mangas/${mangaId}/chapters/${chapterId}`, (route) =>
|
||||
route.fulfill({
|
||||
status: 200,
|
||||
contentType: 'application/json',
|
||||
body: JSON.stringify(chaptersFixture[0])
|
||||
})
|
||||
);
|
||||
await page.route(`**/api/v1/mangas/${mangaId}/chapters/1/pages`, (route) =>
|
||||
await page.route(
|
||||
`**/api/v1/mangas/${mangaId}/chapters/${chapterId}/pages`,
|
||||
(route) =>
|
||||
route.fulfill({
|
||||
status: 200,
|
||||
contentType: 'application/json',
|
||||
@@ -123,7 +126,7 @@ test('manga overview shows title, cover, and a chapter list', async ({ page }) =
|
||||
|
||||
test('reader paginates with arrow keys and j/k, and preloads the next page', async ({ page }) => {
|
||||
await mockReaderApis(page);
|
||||
await page.goto(`/manga/${mangaId}/chapter/1`);
|
||||
await page.goto(`/manga/${mangaId}/chapter/${chapterId}`);
|
||||
|
||||
// Page 1 shown, preload for page 2 in the DOM.
|
||||
await expect(page.getByTestId('page-indicator')).toHaveText('Page 1 / 3');
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "mangalord-frontend",
|
||||
"version": "0.23.0",
|
||||
"version": "0.29.0",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
|
||||
@@ -76,17 +76,17 @@ describe('chapters api client', () => {
|
||||
expect(result.page.total).toBeNull();
|
||||
});
|
||||
|
||||
it('getChapter hits /v1/mangas/{id}/chapters/{n}', async () => {
|
||||
it('getChapter hits /v1/mangas/{id}/chapters/{chapter_id}', async () => {
|
||||
fetchSpy.mockResolvedValueOnce(ok(chapterFixture));
|
||||
const c = await getChapter('m1', 1);
|
||||
const c = await getChapter('m1', 'ch-uuid-1');
|
||||
expect(c).toEqual(chapterFixture);
|
||||
const url = fetchSpy.mock.calls[0][0] as string;
|
||||
expect(url).toMatch(/\/v1\/mangas\/m1\/chapters\/1$/);
|
||||
expect(url).toMatch(/\/v1\/mangas\/m1\/chapters\/ch-uuid-1$/);
|
||||
});
|
||||
|
||||
it('getChapter surfaces 404 via ApiError.code', async () => {
|
||||
fetchSpy.mockResolvedValueOnce(envelope(404, 'not_found', 'not found'));
|
||||
await expect(getChapter('m1', 99)).rejects.toMatchObject({
|
||||
await expect(getChapter('m1', 'unknown-uuid')).rejects.toMatchObject({
|
||||
status: 404,
|
||||
code: 'not_found'
|
||||
});
|
||||
@@ -143,10 +143,10 @@ describe('chapters api client', () => {
|
||||
]
|
||||
})
|
||||
);
|
||||
const pages = await getChapterPages('m1', 1);
|
||||
const pages = await getChapterPages('m1', 'ch-uuid-1');
|
||||
expect(pages).toHaveLength(1);
|
||||
expect(pages[0].storage_key).toContain('0001.png');
|
||||
const url = fetchSpy.mock.calls[0][0] as string;
|
||||
expect(url).toMatch(/\/v1\/mangas\/m1\/chapters\/1\/pages$/);
|
||||
expect(url).toMatch(/\/v1\/mangas\/m1\/chapters\/ch-uuid-1\/pages$/);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -32,9 +32,9 @@ export async function listChapters(
|
||||
);
|
||||
}
|
||||
|
||||
export async function getChapter(mangaId: string, number: number): Promise<Chapter> {
|
||||
export async function getChapter(mangaId: string, chapterId: string): Promise<Chapter> {
|
||||
return request<Chapter>(
|
||||
`/v1/mangas/${encodeURIComponent(mangaId)}/chapters/${number}`
|
||||
`/v1/mangas/${encodeURIComponent(mangaId)}/chapters/${encodeURIComponent(chapterId)}`
|
||||
);
|
||||
}
|
||||
|
||||
@@ -48,10 +48,10 @@ export type ChapterPage = {
|
||||
|
||||
export async function getChapterPages(
|
||||
mangaId: string,
|
||||
number: number
|
||||
chapterId: string
|
||||
): Promise<ChapterPage[]> {
|
||||
const r = await request<{ pages: ChapterPage[] }>(
|
||||
`/v1/mangas/${encodeURIComponent(mangaId)}/chapters/${number}/pages`
|
||||
`/v1/mangas/${encodeURIComponent(mangaId)}/chapters/${encodeURIComponent(chapterId)}/pages`
|
||||
);
|
||||
return r.pages;
|
||||
}
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
</a>
|
||||
{#if b.chapter_id && b.chapter_number != null}
|
||||
<a
|
||||
href="/manga/{b.manga_id}/chapter/{b.chapter_number}"
|
||||
href="/manga/{b.manga_id}/chapter/{b.chapter_id}"
|
||||
class="target"
|
||||
>
|
||||
Chapter {b.chapter_number}{#if b.page != null && b.page > 0} — page {b.page}{/if}
|
||||
|
||||
@@ -29,6 +29,9 @@
|
||||
? chapters.find((c) => c.id === readProgress.chapter_id) ?? null
|
||||
: null
|
||||
);
|
||||
/** Reader link target — always the chapter id when we have one,
|
||||
* even for chapters past the loaded `chapters` list page. */
|
||||
const continueChapterId = $derived(readProgress?.chapter_id ?? null);
|
||||
const continueChapterNumber = $derived(
|
||||
continueChapter?.number ?? readProgress?.chapter_number ?? null
|
||||
);
|
||||
@@ -351,10 +354,10 @@
|
||||
|
||||
<section aria-label="chapters">
|
||||
<h2>Chapters</h2>
|
||||
{#if continueChapterNumber != null}
|
||||
{#if continueChapterId != null && continueChapterNumber != null}
|
||||
<a
|
||||
class="continue"
|
||||
href="/manga/{manga.id}/chapter/{continueChapterNumber}"
|
||||
href="/manga/{manga.id}/chapter/{continueChapterId}"
|
||||
data-testid="continue-reading"
|
||||
>
|
||||
<span class="continue-label">Continue reading</span>
|
||||
@@ -372,7 +375,7 @@
|
||||
<ol class="chapter-list" data-testid="chapter-list">
|
||||
{#each chapters as c (c.id)}
|
||||
<li>
|
||||
<a href="/manga/{manga.id}/chapter/{c.number}">
|
||||
<a href="/manga/{manga.id}/chapter/{c.id}">
|
||||
Chapter {c.number}{#if c.title}: {c.title}{/if}
|
||||
</a>
|
||||
<span class="pages">({c.page_count} pages)</span>
|
||||
|
||||
@@ -135,11 +135,11 @@
|
||||
// navigation feels continuous in single mode. Harmless in
|
||||
// continuous mode (the reader just shows everything).
|
||||
const target = mode === 'single' ? `?page=last` : '';
|
||||
void goto(`/manga/${manga.id}/chapter/${prevChapter.number}${target}`);
|
||||
void goto(`/manga/${manga.id}/chapter/${prevChapter.id}${target}`);
|
||||
}
|
||||
function jumpToNextChapter() {
|
||||
if (!nextChapter) return;
|
||||
void goto(`/manga/${manga.id}/chapter/${nextChapter.number}`);
|
||||
void goto(`/manga/${manga.id}/chapter/${nextChapter.id}`);
|
||||
}
|
||||
|
||||
function next() {
|
||||
@@ -6,11 +6,10 @@ import type { PageLoad } from './$types';
|
||||
export const ssr = false;
|
||||
|
||||
export const load: PageLoad = async ({ params, url }) => {
|
||||
const number = Number(params.n);
|
||||
const [manga, chapter, pages, readProgress, chapterList] = await Promise.all([
|
||||
getManga(params.id),
|
||||
getChapter(params.id, number),
|
||||
getChapterPages(params.id, number),
|
||||
getChapter(params.id, params.chapter_id),
|
||||
getChapterPages(params.id, params.chapter_id),
|
||||
// `null` for guests or first-time openers — the reader uses
|
||||
// this to seed its session-local high-water mark.
|
||||
getMyReadProgressForManga(params.id),
|
||||
@@ -60,8 +60,8 @@
|
||||
{#each progress as p (p.manga_id)}
|
||||
<li class="entry">
|
||||
<a
|
||||
href={p.chapter_number != null
|
||||
? `/manga/${p.manga_id}/chapter/${p.chapter_number}`
|
||||
href={p.chapter_id != null
|
||||
? `/manga/${p.manga_id}/chapter/${p.chapter_id}`
|
||||
: `/manga/${p.manga_id}`}
|
||||
class="cover-link"
|
||||
tabindex="-1"
|
||||
@@ -89,9 +89,9 @@
|
||||
{p.manga_title}
|
||||
</a>
|
||||
<span class="target">
|
||||
{#if p.chapter_number != null}
|
||||
{#if p.chapter_id != null && p.chapter_number != null}
|
||||
<a
|
||||
href="/manga/{p.manga_id}/chapter/{p.chapter_number}"
|
||||
href="/manga/{p.manga_id}/chapter/{p.chapter_id}"
|
||||
>
|
||||
Continue Ch. {p.chapter_number}{#if p.page > 1} — page {p.page}{/if}
|
||||
</a>
|
||||
@@ -185,7 +185,7 @@
|
||||
<div class="meta">
|
||||
<a href="/manga/{u.manga_id}" class="title">{u.manga_title}</a>
|
||||
<span class="target">
|
||||
<a href="/manga/{u.manga_id}/chapter/{u.chapter.number}">
|
||||
<a href="/manga/{u.manga_id}/chapter/{u.chapter.id}">
|
||||
Chapter {u.chapter.number}{#if u.chapter.title}: {u.chapter.title}{/if}
|
||||
</a>
|
||||
<span class="muted">({u.chapter.page_count} pages)</span>
|
||||
|
||||
Reference in New Issue
Block a user