feat: crawler manga-list & metadata sync with cover download (0.23.0)
- TargetSource: first concrete impl of the Source trait, modeled on
the old Puppeteer crawler's selectors (+ status normalization,
tag-count stripping, chapter list)
- DiscoverMode::Backfill walks pagination last->1, reverse within each
page (oldest-first); Incremental walks forward
- RateLimiter (tokio-time aware) plumbed through FetchContext so the
pagination walk honors the same per-host budget as the outer loop
- repo::crawler: ensure_source, upsert_manga_from_source (returns
New/Updated/Unchanged + current cover_image_path for backfill
decisions), sync_manga_chapters, mark_dropped_mangas — all
transactional, with case-insensitive lookups and source-insertable
genres
- Cover image download via reqwest+infer; stored under
mangas/{id}/cover.{ext} via the Storage trait
- Single CRAWLER_PROXY env wires both Chromium (--proxy-server) and
reqwest::Proxy::all (HTTP/HTTPS/SOCKS5)
- Crawler binary: positional start URL or $CRAWLER_START_URL,
$CRAWLER_LIMIT (cap fetches + skip drop pass on partial runs),
$CRAWLER_SKIP_CHAPTERS (disable selector AND sync), $CRAWLER_RATE_MS
- Silences chromiumoxide 0.7's known CDP deserialize log spam via
default tracing filter + CdpError::Serde downgrade
- 9 sqlx integration tests + 11 selector/rate-limit unit tests
This commit is contained in:
@@ -3,9 +3,8 @@
|
||||
//! Job handlers depend on this trait, not on a concrete site. Adding a
|
||||
//! new site is: implement `Source`, register it in a `sources` table
|
||||
//! row, and the existing job pipeline picks it up unchanged.
|
||||
//!
|
||||
-//! Scaffold only — the first concrete impl lands in a follow-up commit
-//! once the target site is locked in.
|
||||
|
||||
+pub mod target;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use chromiumoxide::browser::Browser;
|
||||
@@ -34,8 +33,10 @@ pub struct SourceMangaRef {
|
||||
}
|
||||
|
||||
/// Full metadata returned by `fetch_manga`. The hash is computed by the
|
||||
-/// source impl (typically over the normalized field set) and is the
-/// signal `diff` uses to detect metadata updates.
+/// source impl over the metadata-only field set (title through
+/// cover_url) — chapter changes are tracked separately via
+/// `chapter_sources`, so they intentionally do not affect
+/// `metadata_hash`.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct SourceManga {
|
||||
pub source_manga_key: String,
|
||||
@@ -47,6 +48,10 @@ pub struct SourceManga {
|
||||
pub status: Option<String>,
|
||||
pub summary: Option<String>,
|
||||
pub cover_url: Option<String>,
|
||||
+/// Chapters surfaced on the same page as the metadata. Sources
+/// where the chapter list lives elsewhere can leave this empty
+/// and supply it via `fetch_chapter_list` instead.
+pub chapters: Vec<SourceChapterRef>,
|
||||
pub metadata_hash: String,
|
||||
}
|
||||
|
||||
@@ -68,10 +73,13 @@ pub struct SourceChapter {
|
||||
pub page_urls: Vec<String>,
|
||||
}
|
||||
|
||||
-/// Context passed to every `Source` call. Owns the browser handle, so
-/// impls can `browser.new_page(...)` without bringing their own.
+/// Context passed to every `Source` call. Carries the browser handle
+/// plus a shared rate limiter so impls that issue multiple requests in
+/// one call (e.g. pagination walks) honor the same per-host budget as
+/// the outer job loop.
|
||||
pub struct FetchContext<'a> {
|
||||
pub browser: &'a Browser,
|
||||
+pub rate: &'a tokio::sync::Mutex<crate::crawler::rate_limit::RateLimiter>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -79,10 +87,15 @@ pub trait Source: Send + Sync {
|
||||
/// Stable identifier — also the row key in the `sources` table.
|
||||
fn id(&self) -> &'static str;
|
||||
|
||||
+/// Returns up to `max_results` manga refs in source order. Pass
+/// `None` for an uncapped walk (full backfill / incremental sweep).
+/// Implementations should stop paginating as soon as the cap is
+/// reached so partial runs don't pay for pages they won't use.
|
||||
async fn discover(
|
||||
&self,
|
||||
ctx: &FetchContext<'_>,
|
||||
mode: DiscoverMode,
|
||||
+max_results: Option<usize>,
|
||||
) -> anyhow::Result<Vec<SourceMangaRef>>;
|
||||
|
||||
async fn fetch_manga(
|
||||
|
||||
Reference in New Issue
Block a user