// Mangalord/backend/src/crawler/source.rs

//! `Source` trait — the per-site abstraction.
//!
//! Job handlers depend on this trait, not on a concrete site. Adding a
//! new site is: implement `Source`, register it in a `sources` table
//! row, and the existing job pipeline picks it up unchanged.

pub mod target;

use async_trait::async_trait;
use chromiumoxide::browser::Browser;

/// Pointer at a manga in the source's index, before we've fetched the
/// detail page.
///
/// Derives `PartialEq`/`Eq` so two refs can be compared field-by-field
/// (for example with `assert_eq!` in tests).
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct SourceMangaRef {
    /// Whatever stable id the source uses (slug, numeric id, etc).
    pub source_manga_key: String,
    /// Title as reported alongside the index entry.
    pub title: String,
    /// URL for this entry — presumably the detail page that
    /// `fetch_manga` loads; confirm against the source impls.
    pub url: String,
}

/// Full metadata returned by `fetch_manga`. The hash is computed by the
/// source impl over the metadata-only field set (title through
/// cover_url) — chapter changes are tracked separately via
/// `chapter_sources`, so they intentionally do not affect
/// `metadata_hash`.
///
/// Derives `PartialEq`/`Eq` for value comparison. Note that `==`
/// compares every field, including `chapters`; use `metadata_hash`
/// when only a metadata change should count.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct SourceManga {
    /// Whatever stable id the source uses (slug, numeric id, etc).
    pub source_manga_key: String,
    pub title: String,
    pub alternative_titles: Vec<String>,
    pub authors: Vec<String>,
    pub genres: Vec<String>,
    pub tags: Vec<String>,
    pub status: Option<String>,
    pub summary: Option<String>,
    pub cover_url: Option<String>,
    /// Chapters surfaced on the same page as the metadata. Sources
    /// where the chapter list lives elsewhere can leave this empty
    /// and supply it via `fetch_chapter_list` instead.
    pub chapters: Vec<SourceChapterRef>,
    /// Hash over the metadata-only fields (title through cover_url),
    /// computed by the source impl. Chapters do not contribute.
    pub metadata_hash: String,
}

/// Pointer at a single chapter, before its pages have been fetched.
///
/// These appear in `SourceManga::chapters`, are returned by
/// `Source::fetch_chapter_list`, and are the input to
/// `Source::fetch_chapter`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct SourceChapterRef {
    /// Source-side key identifying the chapter.
    pub source_chapter_key: String,
    /// Chapter number.
    pub number: i32,
    /// Chapter title, when there is one.
    pub title: Option<String>,
    /// URL for this chapter.
    pub url: String,
}

/// Chapter content returned by `Source::fetch_chapter`.
///
/// Derives `PartialEq`/`Eq` so chapters can be compared by value;
/// because `page_urls` is a `Vec`, page order is part of equality.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct SourceChapter {
    /// Source-side key identifying the chapter.
    pub source_chapter_key: String,
    /// Chapter number.
    pub number: i32,
    /// Chapter title, when there is one.
    pub title: Option<String>,
    /// Ordered list of page image URLs, ready to be fetched and put
    /// into `Storage`.
    pub page_urls: Vec<String>,
}

/// Context passed to every `Source` call. Carries the browser handle
/// plus the per-host rate-limiter map so impls that issue multiple
/// requests in one call (pagination walks, multi-page chapter image
/// fetches) honor the right budget for each origin.
///
/// Every field is a borrow with lifetime `'a`; the context owns none
/// of the resources it points at.
pub struct FetchContext<'a> {
    /// Borrowed browser handle.
    pub browser: &'a Browser,
    /// Borrowed per-host rate-limiter map, so each origin's budget
    /// can be honored.
    pub rate: &'a crate::crawler::rate_limit::HostRateLimiters,
    /// Optional TOR control-port client. When `Some`, retry helpers
    /// signal `NEWNYM` between transient-page attempts so the next try
    /// draws a fresh exit. `None` keeps pre-TOR behavior.
    pub tor: Option<&'a crate::crawler::tor::TorController>,
}

/// Lazy iterator over discovered manga refs. The caller drives the
/// walk one batch at a time, so it can break out as soon as the
/// downstream stop condition is met (the first manga where metadata is
/// `Unchanged` and chapter sync reports zero new chapters) without
/// paying for pages it won't use.
///
/// Batches are typically one source-index page each. Within a batch
/// refs are in the source's natural newest-first ordering — the same
/// `update_date DESC` sort that makes the stop condition meaningful.
///
/// Implementors must be `Send` (supertrait bound).
#[async_trait]
pub trait DiscoverWalk: Send {
    /// Return the next batch of refs, or `Ok(None)` when the source has
    /// no more pages. The walker is single-use; calling `next_batch`
    /// after `None` is allowed and continues to return `None`.
    ///
    /// `ctx` is the same kind of fetch context handed to `Source`
    /// calls (browser handle, rate limiters, optional TOR client).
    async fn next_batch(
        &mut self,
        ctx: &FetchContext<'_>,
    ) -> anyhow::Result<Option<Vec<SourceMangaRef>>>;
}

/// The per-site abstraction. Job handlers depend on this trait rather
/// than on a concrete site, so supporting a new site means providing
/// an implementation of it.
///
/// Implementors must be `Send + Sync` (supertrait bounds). Every
/// async method receives a `FetchContext` and returns
/// `anyhow::Result`.
#[async_trait]
pub trait Source: Send + Sync {
    /// Stable identifier — also the row key in the `sources` table.
    fn id(&self) -> &'static str;

    /// Begin discovery. Returns a walker the caller drives page-by-page
    /// via `next_batch`. The initial page-1 probe (used to determine
    /// `last_page` and warm the cache for sites that can't be paged
    /// without knowing the bound) happens inside this call, so a fresh
    /// walker is ready to yield its first batch without further setup.
    async fn discover(
        &self,
        ctx: &FetchContext<'_>,
    ) -> anyhow::Result<Box<dyn DiscoverWalk + Send>>;

    /// Fetch the full metadata for the manga that `r` points at.
    ///
    /// The returned `SourceManga` carries the `metadata_hash` computed
    /// by the source impl, plus any chapters surfaced on the same page.
    async fn fetch_manga(
        &self,
        ctx: &FetchContext<'_>,
        r: &SourceMangaRef,
    ) -> anyhow::Result<SourceManga>;

    /// Return the chapter refs for `manga`.
    ///
    /// This is the path for sources whose chapter list lives somewhere
    /// other than the metadata page, i.e. where `SourceManga::chapters`
    /// was left empty.
    async fn fetch_chapter_list(
        &self,
        ctx: &FetchContext<'_>,
        manga: &SourceManga,
    ) -> anyhow::Result<Vec<SourceChapterRef>>;

    /// Fetch the chapter that `r` points at, yielding its ordered list
    /// of page image URLs.
    async fn fetch_chapter(
        &self,
        ctx: &FetchContext<'_>,
        r: &SourceChapterRef,
    ) -> anyhow::Result<SourceChapter>;
}