feat: crawler scaffold with chromium launcher (0.22.0)
- crawler module (browser, source trait, jobs, diff) + binary - chromiumoxide launcher with fetcher feature (auto-downloads Chromium on first run, caches under ~/.cache/mangalord/chromium) - LaunchOptions struct with extra_args, parseable from CRAWLER_BROWSER_MODE and CRAWLER_BROWSER_ARGS - migration 0012 introduces sources, manga_sources, chapter_sources, crawler_jobs - integration tests for headed + headless launch, ipify load+parse, and extra-args propagation (all #[ignore], opt-in)
This commit is contained in:
105
backend/src/crawler/source.rs
Normal file
105
backend/src/crawler/source.rs
Normal file
@@ -0,0 +1,105 @@
|
||||
//! `Source` trait — the per-site abstraction.
//!
//! Job handlers depend on this trait, not on a concrete site. Adding a
//! new site is: implement `Source`, register it in a `sources` table
//! row, and the existing job pipeline picks it up unchanged.
//!
//! Scaffold only — the first concrete impl lands in a follow-up commit
//! once the target site is locked in.
|
||||
|
||||
use async_trait::async_trait;
|
||||
use chromiumoxide::browser::Browser;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// How a `discover` job should walk the source's index.
|
||||
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
|
||||
pub enum DiscoverMode {
|
||||
/// Walk every index page from last back to first. Used for the
|
||||
/// initial seed of a source.
|
||||
Backfill,
|
||||
/// Walk index pages from page 1 forward, stopping after
|
||||
/// `stop_after_unchanged` consecutive mangas whose `metadata_hash`
|
||||
/// matches storage. Used for the recurring cron tick.
|
||||
Incremental { stop_after_unchanged: usize },
|
||||
}
|
||||
|
||||
/// Pointer at a manga in the source's index, before we've fetched the
/// detail page.
#[derive(Clone, Debug)]
pub struct SourceMangaRef {
    /// Whatever stable id the source uses for the manga (slug, numeric
    /// id, etc).
    pub source_manga_key: String,
    /// Title as listed in the index.
    pub title: String,
    /// URL associated with this index entry.
    // NOTE(review): presumably the detail-page URL — confirm once the first
    // concrete `Source` impl lands.
    pub url: String,
}
|
||||
|
||||
/// Full metadata returned by `fetch_manga`.
///
/// The hash is computed by the source impl (typically over the
/// normalized field set) and is the signal `diff` uses to detect
/// metadata updates.
#[derive(Clone, Debug)]
pub struct SourceManga {
    /// Stable id the source uses for this manga.
    pub source_manga_key: String,
    /// Primary title.
    pub title: String,
    /// Other titles the manga is known by; empty when there are none.
    pub alternative_titles: Vec<String>,
    /// Author names.
    pub authors: Vec<String>,
    /// Genre labels.
    pub genres: Vec<String>,
    /// Tag labels.
    pub tags: Vec<String>,
    /// Status string, when available.
    // NOTE(review): value vocabulary is not defined here — presumably
    // whatever the source reports; confirm against the first impl.
    pub status: Option<String>,
    /// Summary text, when available.
    pub summary: Option<String>,
    /// Cover image URL, when available.
    pub cover_url: Option<String>,
    /// Hash computed by the source impl; compared by `diff` to detect
    /// metadata updates.
    pub metadata_hash: String,
}
|
||||
|
||||
/// Reference to a single chapter, as returned by
/// `Source::fetch_chapter_list` and consumed by `Source::fetch_chapter`.
#[derive(Clone, Debug)]
pub struct SourceChapterRef {
    /// Key identifying the chapter on the source side.
    // NOTE(review): presumably a stable per-source id like
    // `source_manga_key` — confirm.
    pub source_chapter_key: String,
    /// Chapter number.
    pub number: i32,
    /// Chapter title, when the source provides one.
    pub title: Option<String>,
    /// URL associated with the chapter.
    pub url: String,
}
|
||||
|
||||
/// A fetched chapter, as returned by `Source::fetch_chapter`.
#[derive(Clone, Debug)]
pub struct SourceChapter {
    /// Key identifying the chapter on the source side.
    pub source_chapter_key: String,
    /// Chapter number.
    pub number: i32,
    /// Chapter title, when the source provides one.
    pub title: Option<String>,
    /// Ordered list of page image URLs, ready to be fetched and put
    /// into `Storage`.
    pub page_urls: Vec<String>,
}
|
||||
|
||||
/// Context passed to every `Source` call. Owns the browser handle, so
|
||||
/// impls can `browser.new_page(...)` without bringing their own.
|
||||
pub struct FetchContext<'a> {
|
||||
pub browser: &'a Browser,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
pub trait Source: Send + Sync {
|
||||
/// Stable identifier — also the row key in the `sources` table.
|
||||
fn id(&self) -> &'static str;
|
||||
|
||||
async fn discover(
|
||||
&self,
|
||||
ctx: &FetchContext<'_>,
|
||||
mode: DiscoverMode,
|
||||
) -> anyhow::Result<Vec<SourceMangaRef>>;
|
||||
|
||||
async fn fetch_manga(
|
||||
&self,
|
||||
ctx: &FetchContext<'_>,
|
||||
r: &SourceMangaRef,
|
||||
) -> anyhow::Result<SourceManga>;
|
||||
|
||||
async fn fetch_chapter_list(
|
||||
&self,
|
||||
ctx: &FetchContext<'_>,
|
||||
manga: &SourceManga,
|
||||
) -> anyhow::Result<Vec<SourceChapterRef>>;
|
||||
|
||||
async fn fetch_chapter(
|
||||
&self,
|
||||
ctx: &FetchContext<'_>,
|
||||
r: &SourceChapterRef,
|
||||
) -> anyhow::Result<SourceChapter>;
|
||||
}
|
||||
Reference in New Issue
Block a user