//! `Source` trait — the per-site abstraction. //! //! Job handlers depend on this trait, not on a concrete site. Adding a //! new site is: implement `Source`, register it in a `sources` table //! row, and the existing job pipeline picks it up unchanged. pub mod target; use async_trait::async_trait; use chromiumoxide::browser::Browser; /// Pointer at a manga in the source's index, before we've fetched the /// detail page. The `source_manga_key` is whatever stable id the source /// uses (slug, numeric id, etc). #[derive(Clone, Debug)] pub struct SourceMangaRef { pub source_manga_key: String, pub title: String, pub url: String, } /// Full metadata returned by `fetch_manga`. The hash is computed by the /// source impl over the metadata-only field set (title through /// cover_url) — chapter changes are tracked separately via /// `chapter_sources`, so they intentionally do not affect /// `metadata_hash`. #[derive(Clone, Debug)] pub struct SourceManga { pub source_manga_key: String, pub title: String, pub alternative_titles: Vec, pub authors: Vec, pub genres: Vec, pub tags: Vec, pub status: Option, pub summary: Option, pub cover_url: Option, /// Chapters surfaced on the same page as the metadata. Sources /// where the chapter list lives elsewhere can leave this empty /// and supply it via `fetch_chapter_list` instead. pub chapters: Vec, pub metadata_hash: String, } #[derive(Clone, Debug)] pub struct SourceChapterRef { pub source_chapter_key: String, pub number: i32, pub title: Option, pub url: String, } #[derive(Clone, Debug)] pub struct SourceChapter { pub source_chapter_key: String, pub number: i32, pub title: Option, /// Ordered list of page image URLs, ready to be fetched and put /// into `Storage`. pub page_urls: Vec, } /// Context passed to every `Source` call. Carries the browser handle /// plus the per-host rate-limiter map so impls that issue multiple /// requests in one call (pagination walks, multi-page chapter image /// fetches) honor the right budget for each origin. pub struct FetchContext<'a> { pub browser: &'a Browser, pub rate: &'a crate::crawler::rate_limit::HostRateLimiters, } /// Lazy iterator over discovered manga refs. The caller drives the /// walk one batch at a time, so it can break out as soon as the /// downstream stop condition is met (the first manga where metadata is /// `Unchanged` and chapter sync reports zero new chapters) without /// paying for pages it won't use. /// /// Batches are typically one source-index page each. Within a batch /// refs are in the source's natural newest-first ordering — the same /// `update_date DESC` sort that makes the stop condition meaningful. #[async_trait] pub trait DiscoverWalk: Send { /// Return the next batch of refs, or `Ok(None)` when the source has /// no more pages. The walker is single-use; calling `next_batch` /// after `None` is allowed and continues to return `None`. async fn next_batch( &mut self, ctx: &FetchContext<'_>, ) -> anyhow::Result>>; } #[async_trait] pub trait Source: Send + Sync { /// Stable identifier — also the row key in the `sources` table. fn id(&self) -> &'static str; /// Begin discovery. Returns a walker the caller drives page-by-page /// via `next_batch`. The initial page-1 probe (used to determine /// `last_page` and warm the cache for sites that can't be paged /// without knowing the bound) happens inside this call, so a fresh /// walker is ready to yield its first batch without further setup. async fn discover( &self, ctx: &FetchContext<'_>, ) -> anyhow::Result>; async fn fetch_manga( &self, ctx: &FetchContext<'_>, r: &SourceMangaRef, ) -> anyhow::Result; async fn fetch_chapter_list( &self, ctx: &FetchContext<'_>, manga: &SourceManga, ) -> anyhow::Result>; async fn fetch_chapter( &self, ctx: &FetchContext<'_>, r: &SourceChapterRef, ) -> anyhow::Result; }