feat: crawler manga-list & metadata sync with cover download (0.23.0)
- TargetSource: first concrete impl of the Source trait, modeled on
the old Puppeteer crawler's selectors (+ status normalization,
tag-count stripping, chapter list)
- DiscoverMode::Backfill walks pagination last->1, reverse within each
page (oldest-first); Incremental walks forward
- RateLimiter (tokio-time aware) plumbed through FetchContext so the
pagination walk honors the same per-host budget as the outer loop
- repo::crawler: ensure_source, upsert_manga_from_source (returns
New/Updated/Unchanged + current cover_image_path for backfill
decisions), sync_manga_chapters, mark_dropped_mangas — all
transactional, with case-insensitive lookups and source-insertable
genres
- Cover image download via reqwest+infer; stored under
mangas/{id}/cover.{ext} via the Storage trait
- Single CRAWLER_PROXY env wires both Chromium (--proxy-server) and
reqwest::Proxy::all (HTTP/HTTPS/SOCKS5)
- Crawler binary: positional start URL or $CRAWLER_START_URL,
$CRAWLER_LIMIT (cap fetches + skip drop pass on partial runs),
$CRAWLER_SKIP_CHAPTERS (disable selector AND sync), $CRAWLER_RATE_MS
- Silences chromiumoxide 0.7's known CDP deserialize log spam via
default tracing filter + CdpError::Serde downgrade
- 9 sqlx integration tests + 11 selector/rate-limit unit tests
This commit is contained in:
@@ -1,29 +1,329 @@
|
||||
//! Crawler binary.
//!
//! Walks the source's manga listing (all pages), fetches each manga's
//! metadata + chapter list, downloads the cover into `Storage`, and
//! reconciles everything into the DB. Chapter *content* (page images)
//! is out of scope for now — only chapter rows + their source links
//! are written.
//!
//! Configuration:
//! - **Start URL** (required): first CLI positional arg, else
//!   `$CRAWLER_START_URL`. This is the manga *list* page (page 1).
//! - **Database** (required): `$DATABASE_URL`.
//! - **Storage dir**: `$STORAGE_DIR`, default `./data/storage` —
//!   matches the API binary so both write to the same local tree.
//! - **Browser**: see `LaunchOptions::from_env` —
//!   `CRAWLER_BROWSER_MODE` (`headed`|`headless`) and
//!   `CRAWLER_BROWSER_ARGS`.
//! - **Rate limit**: `CRAWLER_RATE_MS` (ms between requests, default
//!   `1000`).
//! - **Cap**: `CRAWLER_LIMIT` (max manga detail fetches per run,
//!   default `0` = no cap).
//! - **Skip chapters**: `CRAWLER_SKIP_CHAPTERS=1` — turn off the
//!   chapter selector in the parser AND skip the per-manga
//!   `sync_manga_chapters` write. Use this for "metadata only" runs.
//! - **Proxy**: `$CRAWLER_PROXY` — single URL applied to both
//!   Chromium (`--proxy-server`) and `reqwest::Proxy::all`. Supports
//!   `http://`, `https://`, and `socks5://` (with optional user:pass).
//!   Example: `socks5://user:pass@host:1080`. Unset → direct.
|
||||
|
||||
use mangalord::crawler::browser::{self, LaunchOptions};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use mangalord::crawler::{
|
||||
browser::{self, LaunchOptions},
|
||||
rate_limit::RateLimiter,
|
||||
source::{target::TargetSource, DiscoverMode, FetchContext, Source},
|
||||
};
|
||||
use mangalord::repo;
|
||||
use mangalord::storage::{LocalStorage, Storage};
|
||||
use sqlx::postgres::PgPoolOptions;
|
||||
use sqlx::PgPool;
|
||||
use tokio::sync::Mutex;
|
||||
use tracing_subscriber::EnvFilter;
|
||||
use uuid::Uuid;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
dotenvy::dotenv().ok();
|
||||
tracing_subscriber::fmt()
|
||||
.with_env_filter(
|
||||
EnvFilter::try_from_default_env()
|
||||
.unwrap_or_else(|_| "info,mangalord=debug".into()),
|
||||
EnvFilter::try_from_default_env().unwrap_or_else(|_| {
|
||||
"info,mangalord=debug,chromiumoxide::conn=off,chromiumoxide::handler=off"
|
||||
.into()
|
||||
}),
|
||||
)
|
||||
.init();
|
||||
|
||||
let options = LaunchOptions::from_env();
|
||||
tracing::info!(?options, "launching browser");
|
||||
let handle = browser::launch(options).await?;
|
||||
tracing::info!("browser launched; closing");
|
||||
handle.close().await?;
|
||||
let start_url = resolve_start_url()?;
|
||||
let database_url = std::env::var("DATABASE_URL")
|
||||
.map_err(|_| anyhow!("DATABASE_URL must be set"))?;
|
||||
let storage_dir: PathBuf = std::env::var("STORAGE_DIR")
|
||||
.unwrap_or_else(|_| "./data/storage".to_string())
|
||||
.into();
|
||||
let rate_ms = env_u64("CRAWLER_RATE_MS", 1000);
|
||||
let limit = env_u64("CRAWLER_LIMIT", 0) as usize;
|
||||
let skip_chapters = env_bool("CRAWLER_SKIP_CHAPTERS", false);
|
||||
let proxy_url = std::env::var("CRAWLER_PROXY")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty());
|
||||
|
||||
let db = PgPoolOptions::new()
|
||||
.max_connections(5)
|
||||
.connect(&database_url)
|
||||
.await
|
||||
.context("connect to database")?;
|
||||
sqlx::migrate!("./migrations").run(&db).await?;
|
||||
|
||||
let storage: Arc<dyn Storage> = Arc::new(LocalStorage::new(&storage_dir));
|
||||
|
||||
// `no_proxy()` disables reqwest's own env-based detection so the
|
||||
// single `CRAWLER_PROXY` knob is the only thing that influences
|
||||
// routing. Otherwise an unrelated `HTTPS_PROXY` in the shell would
|
||||
// silently route cover downloads while the browser stayed direct.
|
||||
let mut http_builder = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(30))
|
||||
.no_proxy();
|
||||
if let Some(proxy) = &proxy_url {
|
||||
http_builder = http_builder
|
||||
.proxy(reqwest::Proxy::all(proxy).with_context(|| format!("parse proxy URL: {proxy}"))?);
|
||||
}
|
||||
let http = http_builder.build().context("build http client")?;
|
||||
|
||||
let mut options = LaunchOptions::from_env();
|
||||
if let Some(proxy) = &proxy_url {
|
||||
options.extra_args.push(format!("--proxy-server={proxy}"));
|
||||
}
|
||||
tracing::info!(
|
||||
?options,
|
||||
%start_url,
|
||||
rate_ms,
|
||||
limit,
|
||||
skip_chapters,
|
||||
proxy = ?proxy_url,
|
||||
storage_dir = %storage_dir.display(),
|
||||
"starting crawler"
|
||||
);
|
||||
|
||||
let handle = browser::launch(options).await.context("launch browser")?;
|
||||
let result = run(
|
||||
handle.browser(),
|
||||
&db,
|
||||
storage.as_ref(),
|
||||
&http,
|
||||
&start_url,
|
||||
rate_ms,
|
||||
limit,
|
||||
skip_chapters,
|
||||
)
|
||||
.await;
|
||||
handle.close().await.ok();
|
||||
result
|
||||
}
|
||||
|
||||
async fn run(
|
||||
browser: &chromiumoxide::Browser,
|
||||
db: &PgPool,
|
||||
storage: &dyn Storage,
|
||||
http: &reqwest::Client,
|
||||
start_url: &str,
|
||||
rate_ms: u64,
|
||||
limit: usize,
|
||||
skip_chapters: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let rate = Mutex::new(RateLimiter::new(Duration::from_millis(rate_ms)));
|
||||
let source = {
|
||||
let s = TargetSource::new(start_url.to_string());
|
||||
if skip_chapters {
|
||||
s.without_chapter_parsing()
|
||||
} else {
|
||||
s
|
||||
}
|
||||
};
|
||||
let ctx = FetchContext {
|
||||
browser,
|
||||
rate: &rate,
|
||||
};
|
||||
|
||||
let source_id = source.id();
|
||||
repo::crawler::ensure_source(
|
||||
db,
|
||||
source_id,
|
||||
"Target Site",
|
||||
&origin_of(start_url).unwrap_or_else(|| start_url.to_string()),
|
||||
)
|
||||
.await
|
||||
.context("ensure_source")?;
|
||||
|
||||
let run_started_at = chrono::Utc::now();
|
||||
|
||||
let max_refs = (limit > 0).then_some(limit);
|
||||
tracing::info!(?max_refs, "discovering manga list");
|
||||
let refs = source
|
||||
.discover(&ctx, DiscoverMode::Backfill, max_refs)
|
||||
.await
|
||||
.context("discover failed")?;
|
||||
tracing::info!(count = refs.len(), "discovered manga list");
|
||||
|
||||
let to_fetch = refs;
|
||||
let total = to_fetch.len();
|
||||
|
||||
for (i, r) in to_fetch.iter().enumerate() {
|
||||
tracing::info!(idx = i + 1, total, key = %r.source_manga_key, "fetching metadata");
|
||||
let manga = match source.fetch_manga(&ctx, r).await {
|
||||
Ok(m) => m,
|
||||
Err(e) => {
|
||||
tracing::warn!(key = %r.source_manga_key, url = %r.url, error = ?e, "fetch_manga failed");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let upsert = match repo::crawler::upsert_manga_from_source(db, source_id, &r.url, &manga)
|
||||
.await
|
||||
{
|
||||
Ok(u) => u,
|
||||
Err(e) => {
|
||||
tracing::error!(key = %r.source_manga_key, error = ?e, "upsert_manga_from_source failed");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
tracing::info!(
|
||||
key = %manga.source_manga_key,
|
||||
manga_id = %upsert.manga_id,
|
||||
status = ?upsert.status,
|
||||
title = %manga.title,
|
||||
"manga upserted"
|
||||
);
|
||||
|
||||
// Cover image: download when missing in storage (backfill for
|
||||
// mangas synced before cover-download support, plus the New
|
||||
// path) or when metadata changed (cover URL is part of
|
||||
// metadata_hash, so an Updated status implies the URL may
|
||||
// have moved). Failures are non-fatal.
|
||||
let needs_cover = upsert.cover_image_path.is_none()
|
||||
|| matches!(upsert.status, repo::crawler::UpsertStatus::Updated);
|
||||
if needs_cover {
|
||||
if let Some(cover_url) = manga.cover_url.as_deref() {
|
||||
if let Err(e) = download_and_store_cover(
|
||||
db,
|
||||
storage,
|
||||
http,
|
||||
&rate,
|
||||
&r.url,
|
||||
upsert.manga_id,
|
||||
cover_url,
|
||||
)
|
||||
.await
|
||||
{
|
||||
tracing::warn!(manga_id = %upsert.manga_id, error = ?e, "cover download failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !skip_chapters {
|
||||
match repo::crawler::sync_manga_chapters(
|
||||
db,
|
||||
source_id,
|
||||
upsert.manga_id,
|
||||
&manga.chapters,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(diff) => tracing::info!(
|
||||
manga_id = %upsert.manga_id,
|
||||
new = diff.new,
|
||||
refreshed = diff.refreshed,
|
||||
dropped = diff.dropped,
|
||||
"chapters synced"
|
||||
),
|
||||
Err(e) => tracing::warn!(manga_id = %upsert.manga_id, error = ?e, "chapter sync failed"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if limit == 0 {
|
||||
match repo::crawler::mark_dropped_mangas(db, source_id, run_started_at).await {
|
||||
Ok(n) => tracing::info!(dropped = n, "marked unseen manga as dropped"),
|
||||
Err(e) => tracing::warn!(error = ?e, "drop-pass failed"),
|
||||
}
|
||||
} else {
|
||||
tracing::info!(limit, "partial sync — skipping drop pass");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download_and_store_cover(
|
||||
db: &PgPool,
|
||||
storage: &dyn Storage,
|
||||
http: &reqwest::Client,
|
||||
rate: &Mutex<RateLimiter>,
|
||||
manga_url: &str,
|
||||
manga_id: Uuid,
|
||||
cover_url: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
let absolute = reqwest::Url::parse(manga_url)
|
||||
.context("parse manga URL")?
|
||||
.join(cover_url)
|
||||
.context("join cover URL onto manga URL")?;
|
||||
|
||||
rate.lock().await.wait().await;
|
||||
let resp = http
|
||||
.get(absolute.clone())
|
||||
.send()
|
||||
.await
|
||||
.with_context(|| format!("GET {absolute}"))?
|
||||
.error_for_status()
|
||||
.with_context(|| format!("non-2xx for {absolute}"))?;
|
||||
let bytes = resp.bytes().await.context("read cover body")?;
|
||||
|
||||
// `infer` sniffs the magic bytes — same crate the upload handler
|
||||
// uses, so we don't trust the URL's extension.
|
||||
let kind = infer::get(&bytes);
|
||||
let ext = kind.map(|k| k.extension()).unwrap_or("bin");
|
||||
let key = format!("mangas/{manga_id}/cover.{ext}");
|
||||
|
||||
storage
|
||||
.put(&key, &bytes)
|
||||
.await
|
||||
.with_context(|| format!("store cover at {key}"))?;
|
||||
repo::manga::set_cover_image_path(db, manga_id, &key)
|
||||
.await
|
||||
.with_context(|| format!("update cover_image_path for {manga_id}"))?;
|
||||
tracing::info!(manga_id = %manga_id, key = %key, bytes = bytes.len(), %absolute, "cover stored");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn resolve_start_url() -> anyhow::Result<String> {
|
||||
if let Some(arg) = std::env::args().nth(1) {
|
||||
return Ok(arg);
|
||||
}
|
||||
std::env::var("CRAWLER_START_URL").map_err(|_| {
|
||||
anyhow!(
|
||||
"start URL is required — pass as first CLI arg or set $CRAWLER_START_URL"
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// Extracts the `scheme://host[:port]` origin from `url`.
///
/// The authority ends at the first `/`, `?` or `#`, and any `user:pass@`
/// prefix is dropped so credentials never become part of the origin.
///
/// Returns `None` when `url` has no `://` separator or when the scheme or
/// host is empty.
fn origin_of(url: &str) -> Option<String> {
    let (scheme, rest) = url.split_once("://")?;
    // A URL may go straight from the authority into a query or fragment
    // without any path, so `/` is not the only terminator.
    let authority = rest
        .split(|c: char| matches!(c, '/' | '?' | '#'))
        .next()?;
    // Strip userinfo, if present.
    let host = match authority.rsplit_once('@') {
        Some((_, h)) => h,
        None => authority,
    };
    if scheme.is_empty() || host.is_empty() {
        return None;
    }
    Some(format!("{scheme}://{host}"))
}
|
||||
|
||||
/// Reads the environment variable `name` as a `u64`.
///
/// Surrounding whitespace is ignored. Returns `default` when the variable
/// is unset, is not valid Unicode, or does not parse as a `u64`.
fn env_u64(name: &str, default: u64) -> u64 {
    std::env::var(name)
        .ok()
        // Trim first: values coming from `.env` files or shell quoting often
        // carry stray spaces, which would otherwise fall back to `default`.
        .and_then(|raw| raw.trim().parse().ok())
        .unwrap_or(default)
}
|
||||
|
||||
/// Reads the environment variable `name` as a boolean flag.
///
/// Matching ignores ASCII case and surrounding whitespace:
/// `1`/`true`/`yes`/`on` give `true`, `0`/`false`/`no`/`off` give `false`.
/// Returns `default` when the variable is unset, is not valid Unicode, or
/// holds any other value.
fn env_bool(name: &str, default: bool) -> bool {
    let raw = match std::env::var(name) {
        Ok(value) => value,
        Err(_) => return default,
    };
    match raw.trim().to_ascii_lowercase().as_str() {
        "1" | "true" | "yes" | "on" => true,
        "0" | "false" | "no" | "off" => false,
        _ => default,
    }
}
|
||||
|
||||
Reference in New Issue
Block a user