Daemon now auto-detects mode per source: Backfill until the first full walk records `seed_completed:<source>` in `crawler_state`, then Incremental (newest-first, stops after N consecutive Unchanged upserts). `CRAWLER_MODE` overrides to a fixed mode; CLI rejects `auto` since it has no pre-run DB state. `Source::discover` returns a lazy `DiscoverWalk` so Incremental can break out mid-walk without prefetching pages. The drop pass and seed marker are now gated on a true full walk — fixes a latent soft-drop of the index tail under partial sweeps. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
140 lines
4.8 KiB
Rust
140 lines
4.8 KiB
Rust
//! `Source` trait — the per-site abstraction.
//!
//! Job handlers depend on this trait, not on a concrete site. Adding a
//! new site means implementing `Source` and registering it as a row in
//! the `sources` table; the existing job pipeline then picks it up
//! unchanged.
|
|
|
|
pub mod target;
|
|
|
|
use async_trait::async_trait;
|
|
use chromiumoxide::browser::Browser;
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
/// How a `discover` job should walk the source's index.
|
|
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
|
|
pub enum DiscoverMode {
|
|
/// Walk every index page from last back to first. Used for the
|
|
/// initial seed of a source.
|
|
Backfill,
|
|
/// Walk index pages from page 1 forward, stopping after
|
|
/// `stop_after_unchanged` consecutive mangas whose `metadata_hash`
|
|
/// matches storage. Used for the recurring cron tick.
|
|
Incremental { stop_after_unchanged: usize },
|
|
}
|
|
|
|
/// Pointer at a manga in the source's index, before we've fetched the
/// detail page. The `source_manga_key` is whatever stable id the source
/// uses (slug, numeric id, etc).
#[derive(Clone, Debug)]
pub struct SourceMangaRef {
    /// Stable, source-defined identifier for the manga.
    pub source_manga_key: String,
    /// Title of the manga; presumably as shown in the index listing —
    /// confirm against the source impls.
    pub title: String,
    /// URL associated with the manga; presumably its detail page —
    /// confirm against the source impls.
    pub url: String,
}
|
|
|
|
/// Full metadata returned by `fetch_manga`. The hash is computed by the
|
|
/// source impl over the metadata-only field set (title through
|
|
/// cover_url) — chapter changes are tracked separately via
|
|
/// `chapter_sources`, so they intentionally do not affect
|
|
/// `metadata_hash`.
|
|
#[derive(Clone, Debug)]
|
|
pub struct SourceManga {
|
|
pub source_manga_key: String,
|
|
pub title: String,
|
|
pub alternative_titles: Vec<String>,
|
|
pub authors: Vec<String>,
|
|
pub genres: Vec<String>,
|
|
pub tags: Vec<String>,
|
|
pub status: Option<String>,
|
|
pub summary: Option<String>,
|
|
pub cover_url: Option<String>,
|
|
/// Chapters surfaced on the same page as the metadata. Sources
|
|
/// where the chapter list lives elsewhere can leave this empty
|
|
/// and supply it via `fetch_chapter_list` instead.
|
|
pub chapters: Vec<SourceChapterRef>,
|
|
pub metadata_hash: String,
|
|
}
|
|
|
|
/// Pointer at a single chapter, before its page list has been fetched.
///
/// Instances appear in `SourceManga::chapters`, are returned by
/// `Source::fetch_chapter_list`, and are consumed by
/// `Source::fetch_chapter`.
#[derive(Clone, Debug)]
pub struct SourceChapterRef {
    /// Source-defined identifier for the chapter.
    pub source_chapter_key: String,
    /// Chapter number.
    ///
    /// NOTE(review): integer-only, so fractional chapters (e.g. 10.5)
    /// cannot be represented directly — confirm how source impls map them.
    pub number: i32,
    /// Chapter title, if present.
    pub title: Option<String>,
    /// URL associated with the chapter; presumably its reader page —
    /// confirm against the source impls.
    pub url: String,
}
|
|
|
|
/// A fetched chapter, as returned by `Source::fetch_chapter`: the
/// chapter's identity plus the ordered page image URLs.
#[derive(Clone, Debug)]
pub struct SourceChapter {
    /// Source-defined identifier for the chapter.
    pub source_chapter_key: String,
    /// Chapter number.
    pub number: i32,
    /// Chapter title, if present.
    pub title: Option<String>,
    /// Ordered list of page image URLs, ready to be fetched and put
    /// into `Storage`.
    pub page_urls: Vec<String>,
}
|
|
|
|
/// Context passed to every `Source` call. Carries the browser handle
|
|
/// plus the per-host rate-limiter map so impls that issue multiple
|
|
/// requests in one call (pagination walks, multi-page chapter image
|
|
/// fetches) honor the right budget for each origin.
|
|
pub struct FetchContext<'a> {
|
|
pub browser: &'a Browser,
|
|
pub rate: &'a crate::crawler::rate_limit::HostRateLimiters,
|
|
}
|
|
|
|
/// Lazy iterator over discovered manga refs. The caller drives the
|
|
/// walk one batch at a time, so it can break out as soon as a
|
|
/// downstream stop condition is met (e.g. N consecutive Unchanged
|
|
/// upserts in Incremental mode) without paying for pages it won't use.
|
|
///
|
|
/// Batches are typically one source-index page each. Within a batch
|
|
/// refs are already in the right per-page order for the active mode
|
|
/// (Backfill reverses each page to oldest-first; Incremental leaves
|
|
/// the source's natural newest-first ordering).
|
|
#[async_trait]
|
|
pub trait DiscoverWalk: Send {
|
|
/// Return the next batch of refs, or `Ok(None)` when the source has
|
|
/// no more pages. The walker is single-use; calling `next_batch`
|
|
/// after `None` is allowed and continues to return `None`.
|
|
async fn next_batch(
|
|
&mut self,
|
|
ctx: &FetchContext<'_>,
|
|
) -> anyhow::Result<Option<Vec<SourceMangaRef>>>;
|
|
}
|
|
|
|
#[async_trait]
|
|
pub trait Source: Send + Sync {
|
|
/// Stable identifier — also the row key in the `sources` table.
|
|
fn id(&self) -> &'static str;
|
|
|
|
/// Begin discovery in `mode`. Returns a walker the caller drives
|
|
/// page-by-page via `next_batch`. The initial page-1 probe (used
|
|
/// to determine `last_page` and warm the cache for sites that
|
|
/// can't be paged without knowing the bound) happens inside this
|
|
/// call, so a fresh walker is ready to yield its first batch
|
|
/// without further setup.
|
|
async fn discover(
|
|
&self,
|
|
ctx: &FetchContext<'_>,
|
|
mode: DiscoverMode,
|
|
) -> anyhow::Result<Box<dyn DiscoverWalk + Send>>;
|
|
|
|
async fn fetch_manga(
|
|
&self,
|
|
ctx: &FetchContext<'_>,
|
|
r: &SourceMangaRef,
|
|
) -> anyhow::Result<SourceManga>;
|
|
|
|
async fn fetch_chapter_list(
|
|
&self,
|
|
ctx: &FetchContext<'_>,
|
|
manga: &SourceManga,
|
|
) -> anyhow::Result<Vec<SourceChapterRef>>;
|
|
|
|
async fn fetch_chapter(
|
|
&self,
|
|
ctx: &FetchContext<'_>,
|
|
r: &SourceChapterRef,
|
|
) -> anyhow::Result<SourceChapter>;
|
|
}
|