//! `Source` trait — the per-site abstraction. //! //! Job handlers depend on this trait, not on a concrete site. Adding a //! new site is: implement `Source`, register it in a `sources` table //! row, and the existing job pipeline picks it up unchanged. pub mod target; use async_trait::async_trait; use chromiumoxide::browser::Browser; use serde::{Deserialize, Serialize}; /// How a `discover` job should walk the source's index. #[derive(Clone, Copy, Debug, Serialize, Deserialize)] pub enum DiscoverMode { /// Walk every index page from last back to first. Used for the /// initial seed of a source. Backfill, /// Walk index pages from page 1 forward, stopping after /// `stop_after_unchanged` consecutive mangas whose `metadata_hash` /// matches storage. Used for the recurring cron tick. Incremental { stop_after_unchanged: usize }, } /// Pointer at a manga in the source's index, before we've fetched the /// detail page. The `source_manga_key` is whatever stable id the source /// uses (slug, numeric id, etc). #[derive(Clone, Debug)] pub struct SourceMangaRef { pub source_manga_key: String, pub title: String, pub url: String, } /// Full metadata returned by `fetch_manga`. The hash is computed by the /// source impl over the metadata-only field set (title through /// cover_url) — chapter changes are tracked separately via /// `chapter_sources`, so they intentionally do not affect /// `metadata_hash`. #[derive(Clone, Debug)] pub struct SourceManga { pub source_manga_key: String, pub title: String, pub alternative_titles: Vec, pub authors: Vec, pub genres: Vec, pub tags: Vec, pub status: Option, pub summary: Option, pub cover_url: Option, /// Chapters surfaced on the same page as the metadata. Sources /// where the chapter list lives elsewhere can leave this empty /// and supply it via `fetch_chapter_list` instead. pub chapters: Vec, pub metadata_hash: String, } #[derive(Clone, Debug)] pub struct SourceChapterRef { pub source_chapter_key: String, pub number: i32, pub title: Option, pub url: String, } #[derive(Clone, Debug)] pub struct SourceChapter { pub source_chapter_key: String, pub number: i32, pub title: Option, /// Ordered list of page image URLs, ready to be fetched and put /// into `Storage`. pub page_urls: Vec, } /// Context passed to every `Source` call. Carries the browser handle /// plus the per-host rate-limiter map so impls that issue multiple /// requests in one call (pagination walks, multi-page chapter image /// fetches) honor the right budget for each origin. pub struct FetchContext<'a> { pub browser: &'a Browser, pub rate: &'a crate::crawler::rate_limit::HostRateLimiters, } /// Lazy iterator over discovered manga refs. The caller drives the /// walk one batch at a time, so it can break out as soon as a /// downstream stop condition is met (e.g. N consecutive Unchanged /// upserts in Incremental mode) without paying for pages it won't use. /// /// Batches are typically one source-index page each. Within a batch /// refs are already in the right per-page order for the active mode /// (Backfill reverses each page to oldest-first; Incremental leaves /// the source's natural newest-first ordering). #[async_trait] pub trait DiscoverWalk: Send { /// Return the next batch of refs, or `Ok(None)` when the source has /// no more pages. The walker is single-use; calling `next_batch` /// after `None` is allowed and continues to return `None`. async fn next_batch( &mut self, ctx: &FetchContext<'_>, ) -> anyhow::Result>>; } #[async_trait] pub trait Source: Send + Sync { /// Stable identifier — also the row key in the `sources` table. fn id(&self) -> &'static str; /// Begin discovery in `mode`. Returns a walker the caller drives /// page-by-page via `next_batch`. The initial page-1 probe (used /// to determine `last_page` and warm the cache for sites that /// can't be paged without knowing the bound) happens inside this /// call, so a fresh walker is ready to yield its first batch /// without further setup. async fn discover( &self, ctx: &FetchContext<'_>, mode: DiscoverMode, ) -> anyhow::Result>; async fn fetch_manga( &self, ctx: &FetchContext<'_>, r: &SourceMangaRef, ) -> anyhow::Result; async fn fetch_chapter_list( &self, ctx: &FetchContext<'_>, manga: &SourceManga, ) -> anyhow::Result>; async fn fetch_chapter( &self, ctx: &FetchContext<'_>, r: &SourceChapterRef, ) -> anyhow::Result; }