Files
Mangalord/backend/src/crawler/source.rs
MechaCat02 8557e432a2 feat(crawler): plumb TorController through FetchContext and pipelines
Adds CRAWLER_TOR_CONTROL_URL / _PASSWORD / _COOKIE_PATH /
_RECIRCUIT_MAX_ATTEMPTS to CrawlerConfig and to bin/crawler.rs's
env reads. Constructs an Option<Arc<TorController>> at daemon /
CLI startup and threads it through FetchContext,
pipeline::run_metadata_pass, and content::sync_chapter_content as
Option<&TorController>.

Pure scaffolding — the controller isn't used yet; behavior is
unchanged. Next commit wires the retry hooks and session-probe
recircuit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-31 19:59:47 +02:00

129 lines
4.5 KiB
Rust

//! `Source` trait — the per-site abstraction.
//!
//! Job handlers depend on this trait, not on a concrete site. Adding a
//! new site is: implement `Source`, register it in a `sources` table
//! row, and the existing job pipeline picks it up unchanged.
pub mod target;
use async_trait::async_trait;
use chromiumoxide::browser::Browser;
/// Lightweight handle to a manga as it appears in a source's index,
/// held before the detail page has been fetched.
#[derive(Debug, Clone)]
pub struct SourceMangaRef {
    /// Whatever stable id the source itself uses for this manga
    /// (a slug, a numeric id, etc).
    pub source_manga_key: String,
    /// Title carried alongside the ref.
    pub title: String,
    /// URL for this manga on the source — presumably the detail page;
    /// confirm against the concrete `Source` impls.
    pub url: String,
}
/// Complete metadata for one manga, as produced by `fetch_manga`.
///
/// `metadata_hash` is computed by the source impl over the
/// metadata-only fields (`title` through `cover_url`). Chapter changes
/// are tracked separately via `chapter_sources`, so they deliberately
/// do not influence the hash.
#[derive(Debug, Clone)]
pub struct SourceManga {
    /// Stable id the source uses for this manga.
    pub source_manga_key: String,

    // --- Metadata-only field set covered by `metadata_hash` ---
    pub title: String,
    pub alternative_titles: Vec<String>,
    pub authors: Vec<String>,
    pub genres: Vec<String>,
    pub tags: Vec<String>,
    pub status: Option<String>,
    pub summary: Option<String>,
    pub cover_url: Option<String>,

    /// Chapters that were listed on the same page as the metadata.
    /// A source whose chapter list lives elsewhere may leave this
    /// empty and provide the list through `fetch_chapter_list`.
    pub chapters: Vec<SourceChapterRef>,
    /// Hash over the metadata-only field set; see the struct docs.
    pub metadata_hash: String,
}
/// Pointer at a single chapter: the source's key for it, its number,
/// an optional title, and its URL. This is the element type of
/// `SourceManga::chapters`, the item type returned by
/// `fetch_chapter_list`, and the input to `fetch_chapter`.
#[derive(Debug, Clone)]
pub struct SourceChapterRef {
    /// Key the source uses to identify this chapter.
    pub source_chapter_key: String,
    /// Chapter number as a whole `i32`.
    pub number: i32,
    /// Chapter title, when there is one.
    pub title: Option<String>,
    /// URL for this chapter on the source.
    pub url: String,
}
/// A fetched chapter: its identifying fields plus the URLs of its page
/// images. This is the type `fetch_chapter` returns.
#[derive(Debug, Clone)]
pub struct SourceChapter {
    /// Key the source uses to identify this chapter.
    pub source_chapter_key: String,
    /// Chapter number as a whole `i32`.
    pub number: i32,
    /// Chapter title, when there is one.
    pub title: Option<String>,
    /// Page image URLs in reading order, ready to be fetched and
    /// written into `Storage`.
    pub page_urls: Vec<String>,
}
/// Borrowed context handed to every `Source` call.
///
/// It bundles the browser handle, the per-host rate-limiter map, and an
/// optional TOR controller. The rate-limiter map lets impls that issue
/// several requests within one call (pagination walks, multi-page
/// chapter image fetches) honor the right budget for each origin.
/// All borrows share the lifetime `'a`.
pub struct FetchContext<'a> {
    /// Browser handle used to perform fetches.
    pub browser: &'a Browser,
    /// Per-host rate limiters, one budget per origin.
    pub rate: &'a crate::crawler::rate_limit::HostRateLimiters,
    /// Optional TOR control-port client. With `Some`, retry helpers
    /// signal `NEWNYM` between transient-page attempts so that the next
    /// attempt draws a fresh exit. With `None`, pre-TOR behavior is
    /// kept.
    pub tor: Option<&'a crate::crawler::tor::TorController>,
}
/// Pull-based walker over the manga refs a source discovers.
///
/// The caller asks for one batch at a time, which lets it stop as soon
/// as the downstream stop condition holds (the first manga whose
/// metadata is `Unchanged` and whose chapter sync reports zero new
/// chapters) instead of paying for pages it will never use.
///
/// A batch is typically one page of the source index. Inside a batch,
/// refs follow the source's natural newest-first ordering — the same
/// `update_date DESC` sort that gives the stop condition its meaning.
#[async_trait]
pub trait DiscoverWalk: Send {
    /// Yield the next batch of refs, or `Ok(None)` once the source has
    /// no more pages.
    ///
    /// The walker is single-use. Calling `next_batch` again after it
    /// has returned `None` is permitted and keeps returning `None`.
    async fn next_batch(
        &mut self,
        ctx: &FetchContext<'_>,
    ) -> anyhow::Result<Option<Vec<SourceMangaRef>>>;
}
/// Per-site abstraction: everything the crawler needs from one source.
/// Each async method receives the shared [`FetchContext`].
#[async_trait]
pub trait Source: Send + Sync {
    /// Stable identifier for this source; it doubles as the row key in
    /// the `sources` table.
    fn id(&self) -> &'static str;

    /// Start discovery and hand back a walker that the caller drives
    /// page-by-page through `next_batch`.
    ///
    /// The initial page-1 probe happens inside this call. It is used to
    /// determine `last_page` and to warm the cache for sites that
    /// cannot be paged without knowing the bound. The returned walker
    /// can therefore yield its first batch with no further setup.
    async fn discover(
        &self,
        ctx: &FetchContext<'_>,
    ) -> anyhow::Result<Box<dyn DiscoverWalk + Send>>;

    /// Fetch the full metadata for the manga that `r` points at,
    /// returned as a [`SourceManga`].
    async fn fetch_manga(
        &self,
        ctx: &FetchContext<'_>,
        r: &SourceMangaRef,
    ) -> anyhow::Result<SourceManga>;

    /// Return the chapter refs for `manga`. Sources whose chapter list
    /// does not live on the metadata page (and which therefore leave
    /// `SourceManga::chapters` empty) supply the list here.
    async fn fetch_chapter_list(
        &self,
        ctx: &FetchContext<'_>,
        manga: &SourceManga,
    ) -> anyhow::Result<Vec<SourceChapterRef>>;

    /// Resolve the chapter ref `r` into a [`SourceChapter`], which
    /// carries the ordered page image URLs.
    async fn fetch_chapter(
        &self,
        ctx: &FetchContext<'_>,
        r: &SourceChapterRef,
    ) -> anyhow::Result<SourceChapter>;
}