Mangalord/backend/src/crawler/source.rs

//! `Source` trait — the per-site abstraction.
//!
//! Job handlers depend on this trait, not on a concrete site. Adding a
//! new site is: implement `Source`, register it in a `sources` table
//! row, and the existing job pipeline picks it up unchanged.

pub mod target;

use async_trait::async_trait;
use chromiumoxide::browser::Browser;
use serde::{Deserialize, Serialize};

/// How a `discover` job should walk the source's index.
///
/// The mode is `Copy` and is handed to `Source::discover` by value.
/// It also derives the serde traits, so it can be serialized and
/// deserialized. NOTE(review): presumably this is so the mode can
/// travel inside a queued job's payload — confirm against the job
/// definitions.
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
pub enum DiscoverMode {
    /// Walk every index page from last back to first. Used for the
    /// initial seed of a source.
    Backfill,
    /// Walk index pages from page 1 forward, stopping after
    /// `stop_after_unchanged` consecutive mangas whose `metadata_hash`
    /// matches storage. Used for the recurring cron tick.
    ///
    /// The stop condition is evaluated by the caller driving the
    /// walk (see `DiscoverWalk`), which breaks out once the threshold
    /// is reached.
    Incremental { stop_after_unchanged: usize },
}

/// Pointer at a manga in the source's index, before we've fetched the
/// detail page.
///
/// Yielded in batches by `DiscoverWalk::next_batch` and passed to
/// `Source::fetch_manga` to obtain the full `SourceManga`.
///
/// Implements `PartialEq`/`Eq` (field-wise) so refs can be compared
/// directly, e.g. with `assert_eq!` in tests.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct SourceMangaRef {
    /// Whatever stable id the source uses (slug, numeric id, etc).
    pub source_manga_key: String,
    /// Title of the manga. NOTE(review): presumably as shown in the
    /// index listing, which may differ from the detail page — confirm
    /// against the source impls.
    pub title: String,
    /// URL associated with this entry. NOTE(review): presumably the
    /// detail-page URL that `fetch_manga` loads — confirm.
    pub url: String,
}

/// Full metadata returned by `fetch_manga`. The hash is computed by the
/// source impl over the metadata-only field set (title through
/// cover_url) — chapter changes are tracked separately via
/// `chapter_sources`, so they intentionally do not affect
/// `metadata_hash`.
#[derive(Clone, Debug)]
pub struct SourceManga {
    /// Stable id the source uses for this manga; same notion as
    /// `SourceMangaRef::source_manga_key`.
    pub source_manga_key: String,
    /// Primary title. First field covered by `metadata_hash`.
    pub title: String,
    /// Other titles the source lists for the same manga.
    pub alternative_titles: Vec<String>,
    /// Author names as reported by the source.
    pub authors: Vec<String>,
    /// Genre labels as reported by the source.
    pub genres: Vec<String>,
    /// Tag labels as reported by the source.
    pub tags: Vec<String>,
    /// Publication status, if the source exposes one. Free-form
    /// string; NOTE(review): the set of values is source-defined —
    /// confirm whether callers normalize it.
    pub status: Option<String>,
    /// Description/synopsis text, if present.
    pub summary: Option<String>,
    /// URL of the cover image, if present. Last field covered by
    /// `metadata_hash`.
    pub cover_url: Option<String>,
    /// Chapters surfaced on the same page as the metadata. Sources
    /// where the chapter list lives elsewhere can leave this empty
    /// and supply it via `fetch_chapter_list` instead.
    pub chapters: Vec<SourceChapterRef>,
    /// Hash over the metadata-only fields (title through cover_url),
    /// computed by the source impl. Incremental discovery compares it
    /// against the stored value to detect unchanged mangas.
    pub metadata_hash: String,
}

/// Pointer at a single chapter, before its pages have been resolved.
///
/// Appears in `SourceManga::chapters` and in the list returned by
/// `Source::fetch_chapter_list`; passed to `Source::fetch_chapter` to
/// obtain the full `SourceChapter`.
///
/// Implements `PartialEq`/`Eq` (field-wise) so refs can be compared
/// directly, e.g. with `assert_eq!` in tests.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct SourceChapterRef {
    /// Identifier of the chapter on the source. NOTE(review):
    /// presumably a stable id analogous to `source_manga_key` —
    /// confirm against the source impls.
    pub source_chapter_key: String,
    /// Chapter number. Integer-only, so fractional numbering on the
    /// source side cannot be represented here.
    pub number: i32,
    /// Chapter title, when the source provides one.
    pub title: Option<String>,
    /// URL associated with this chapter. NOTE(review): presumably the
    /// reader page that `fetch_chapter` loads — confirm.
    pub url: String,
}

/// A fully resolved chapter, as returned by `Source::fetch_chapter`.
///
/// Implements `PartialEq`/`Eq` (field-wise, so page order matters) to
/// allow direct comparison, e.g. with `assert_eq!` in tests.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct SourceChapter {
    /// Identifier of the chapter on the source; same notion as
    /// `SourceChapterRef::source_chapter_key`.
    pub source_chapter_key: String,
    /// Chapter number.
    pub number: i32,
    /// Chapter title, when the source provides one.
    pub title: Option<String>,
    /// Ordered list of page image URLs, ready to be fetched and put
    /// into `Storage`.
    pub page_urls: Vec<String>,
}

/// Context passed to every `Source` call. Carries the browser handle
/// plus the per-host rate-limiter map so impls that issue multiple
/// requests in one call (pagination walks, multi-page chapter image
/// fetches) honor the right budget for each origin.
///
/// Both fields are shared borrows tied to `'a`: the context owns
/// nothing and must not outlive the browser or the limiter map it
/// points at.
pub struct FetchContext<'a> {
    /// Browser handle used by source impls to load pages.
    pub browser: &'a Browser,
    /// Per-host rate limiters; impls are expected to consult these
    /// for each origin they hit within a single call.
    pub rate: &'a crate::crawler::rate_limit::HostRateLimiters,
}

/// Lazy iterator over discovered manga refs. The caller drives the
/// walk one batch at a time, so it can break out as soon as a
/// downstream stop condition is met (e.g. N consecutive Unchanged
/// upserts in Incremental mode) without paying for pages it won't use.
///
/// Batches are typically one source-index page each. Within a batch
/// refs are already in the right per-page order for the active mode
/// (Backfill reverses each page to oldest-first; Incremental leaves
/// the source's natural newest-first ordering).
///
/// Walkers are created by `Source::discover`. The `Send` supertrait
/// lets a boxed walker be moved across threads/tasks.
#[async_trait]
pub trait DiscoverWalk: Send {
    /// Return the next batch of refs, or `Ok(None)` when the source has
    /// no more pages. The walker is single-use; calling `next_batch`
    /// after `None` is allowed and continues to return `None`.
    ///
    /// `ctx` supplies the browser and rate limiters for any requests
    /// this step needs. It is passed on every call rather than stored
    /// in the walker, so the walker does not borrow from it between
    /// calls.
    ///
    /// # Errors
    ///
    /// Failures are reported as `anyhow::Error`; the exact failure
    /// conditions are defined by each implementation.
    async fn next_batch(
        &mut self,
        ctx: &FetchContext<'_>,
    ) -> anyhow::Result<Option<Vec<SourceMangaRef>>>;
}

/// Per-site abstraction: one implementation per crawled site.
///
/// Every async method takes a `FetchContext` carrying the browser
/// handle and per-host rate limiters, and reports failures as
/// `anyhow::Error`. The `Send + Sync` bound allows an implementation
/// to be shared across threads/tasks.
#[async_trait]
pub trait Source: Send + Sync {
    /// Stable identifier — also the row key in the `sources` table.
    fn id(&self) -> &'static str;

    /// Begin discovery in `mode`. Returns a walker the caller drives
    /// page-by-page via `next_batch`. The initial page-1 probe (used
    /// to determine `last_page` and warm the cache for sites that
    /// can't be paged without knowing the bound) happens inside this
    /// call, so a fresh walker is ready to yield its first batch
    /// without further setup.
    ///
    /// NOTE(review): `DiscoverWalk` already has `Send` as a
    /// supertrait, so the explicit `+ Send` in the return type adds no
    /// extra constraint. It is kept as written because implementations
    /// must spell the same return type.
    async fn discover(
        &self,
        ctx: &FetchContext<'_>,
        mode: DiscoverMode,
    ) -> anyhow::Result<Box<dyn DiscoverWalk + Send>>;

    /// Fetch the full metadata for the manga pointed at by `r`.
    ///
    /// The returned `SourceManga` carries a `metadata_hash` computed
    /// by the implementation, and may include the chapter list when
    /// the source surfaces it on the same page.
    async fn fetch_manga(
        &self,
        ctx: &FetchContext<'_>,
        r: &SourceMangaRef,
    ) -> anyhow::Result<SourceManga>;

    /// Return the chapter refs for `manga`.
    ///
    /// This is the path for sources whose chapter list does not live
    /// on the metadata page, i.e. where `SourceManga::chapters` is
    /// left empty. NOTE(review): what sources that already populate
    /// `chapters` should return here is not specified in this file —
    /// confirm against the job handlers.
    async fn fetch_chapter_list(
        &self,
        ctx: &FetchContext<'_>,
        manga: &SourceManga,
    ) -> anyhow::Result<Vec<SourceChapterRef>>;

    /// Resolve the chapter pointed at by `r` into a `SourceChapter`,
    /// including its ordered list of page image URLs.
    async fn fetch_chapter(
        &self,
        ctx: &FetchContext<'_>,
        r: &SourceChapterRef,
    ) -> anyhow::Result<SourceChapter>;
}