feat: crawler scaffold with chromium launcher (0.22.0)
- crawler module (browser, source trait, jobs, diff) + binary - chromiumoxide launcher with fetcher feature (auto-downloads Chromium on first run, caches under ~/.cache/mangalord/chromium) - LaunchOptions struct with extra_args, parseable from CRAWLER_BROWSER_MODE and CRAWLER_BROWSER_ARGS - migration 0012 introduces sources, manga_sources, chapter_sources, crawler_jobs - integration tests for headed + headless launch, ipify load+parse, and extra-args propagation (all #[ignore], opt-in)
This commit is contained in:
1297
backend/Cargo.lock
generated
1297
backend/Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "mangalord"
|
||||
version = "0.21.3"
|
||||
version = "0.22.0"
|
||||
edition = "2021"
|
||||
|
||||
[lib]
|
||||
@@ -10,6 +10,10 @@ path = "src/lib.rs"
|
||||
name = "mangalord"
|
||||
path = "src/main.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "crawler"
|
||||
path = "src/bin/crawler.rs"
|
||||
|
||||
[dependencies]
|
||||
axum = { version = "0.7", features = ["macros", "multipart"] }
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
@@ -36,7 +40,10 @@ time = "0.3"
|
||||
infer = "0.16"
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
futures-core = "0.3"
|
||||
futures-util = "0.3"
|
||||
bytes = "1"
|
||||
chromiumoxide = { version = "0.7", features = ["tokio-runtime", "_fetcher-rusttls-tokio"], default-features = false }
|
||||
scraper = "0.20"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3"
|
||||
|
||||
72
backend/migrations/0012_crawler.sql
Normal file
72
backend/migrations/0012_crawler.sql
Normal file
@@ -0,0 +1,72 @@
|
||||
-- Crawler tables.
|
||||
--
|
||||
-- Same philosophy as 0001_init.sql: new concepts go in new tables
|
||||
-- joined to existing ones, not jammed onto `mangas`/`chapters`. A
|
||||
-- crawled manga IS a manga; the only thing the source-link tables
|
||||
-- carry is "where did this come from and when did we last see it".
|
||||
-- That keeps the API and frontend source-agnostic.
|
||||
|
||||
-- 1. Source registry: one row for every site the crawler knows about.
--    Per-site knobs (base URL, rate limits, custom selectors) live in
--    `config`, so onboarding a source is a row insert plus a `Source`
--    trait impl, with no schema change.
CREATE TABLE sources (
    id          text        PRIMARY KEY,
    name        text        NOT NULL,
    base_url    text        NOT NULL,
    enabled     boolean     NOT NULL DEFAULT true,
    config      jsonb       NOT NULL DEFAULT '{}'::jsonb,
    created_at  timestamptz NOT NULL DEFAULT now()
);
|
||||
|
||||
-- 2. Link tables. `(source_id, source_*_key)` is the natural key exposed
--    by the source itself, while the FK into `mangas`/`chapters` ties
--    each row back into our domain. `crawler::diff` compares
--    `metadata_hash` to detect updates without re-comparing every
--    field; `last_seen_at` together with `dropped_at` forms the
--    soft-drop pair.
CREATE TABLE manga_sources (
    source_id         text        NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
    source_manga_key  text        NOT NULL,
    manga_id          uuid        NOT NULL REFERENCES mangas(id) ON DELETE CASCADE,
    source_url        text        NOT NULL,
    metadata_hash     text,
    first_seen_at     timestamptz NOT NULL DEFAULT now(),
    last_seen_at      timestamptz NOT NULL DEFAULT now(),
    dropped_at        timestamptz,
    PRIMARY KEY (source_id, source_manga_key)
);
CREATE INDEX manga_sources_manga_idx ON manga_sources (manga_id);
CREATE INDEX manga_sources_last_seen_idx ON manga_sources (source_id, last_seen_at);
|
||||
|
||||
-- Chapter-level link table: maps a source's own chapter key onto a row
-- in `chapters`, and records when the chapter was first and last seen.
-- `dropped_at` is a nullable soft-drop marker.
CREATE TABLE chapter_sources (
    source_id           text        NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
    source_chapter_key  text        NOT NULL,
    chapter_id          uuid        NOT NULL REFERENCES chapters(id) ON DELETE CASCADE,
    source_url          text        NOT NULL,
    first_seen_at       timestamptz NOT NULL DEFAULT now(),
    last_seen_at        timestamptz NOT NULL DEFAULT now(),
    dropped_at          timestamptz,
    PRIMARY KEY (source_id, source_chapter_key)
);
CREATE INDEX chapter_sources_chapter_idx ON chapter_sources (chapter_id);
|
||||
|
||||
-- 3. Persistent job queue. Workers lease rows with
--    `FOR UPDATE SKIP LOCKED`, heartbeat through `leased_until`, and
--    acknowledge by moving the row to another state. The partial index
--    keeps the hot path (pick the next ready job) away from the bulk of
--    done/dead rows.
CREATE TABLE crawler_jobs (
    id            uuid        PRIMARY KEY DEFAULT gen_random_uuid(),
    payload       jsonb       NOT NULL,
    state         text        NOT NULL DEFAULT 'pending'
                  CHECK (state IN ('pending','running','done','failed','dead')),
    attempts      integer     NOT NULL DEFAULT 0,
    max_attempts  integer     NOT NULL DEFAULT 5,
    scheduled_at  timestamptz NOT NULL DEFAULT now(),
    leased_until  timestamptz,
    last_error    text,
    created_at    timestamptz NOT NULL DEFAULT now(),
    updated_at    timestamptz NOT NULL DEFAULT now()
);
CREATE INDEX crawler_jobs_ready_idx
    ON crawler_jobs (scheduled_at)
    WHERE state IN ('pending', 'failed');
|
||||
29
backend/src/bin/crawler.rs
Normal file
29
backend/src/bin/crawler.rs
Normal file
@@ -0,0 +1,29 @@
|
||||
//! Crawler binary.
|
||||
//!
|
||||
//! Today: a thin shell that launches Chromium via the shared
|
||||
//! `crawler::browser` module and exits. Useful as an ad-hoc smoke test
|
||||
//! for the launcher in addition to the integration test in
|
||||
//! `tests/crawler_browser_smoke.rs`.
|
||||
//!
|
||||
//! Future: reads config, picks `Source` impls, runs the job loop.
|
||||
|
||||
use mangalord::crawler::browser::{self, LaunchOptions};
|
||||
use tracing_subscriber::EnvFilter;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
dotenvy::dotenv().ok();
|
||||
tracing_subscriber::fmt()
|
||||
.with_env_filter(
|
||||
EnvFilter::try_from_default_env()
|
||||
.unwrap_or_else(|_| "info,mangalord=debug".into()),
|
||||
)
|
||||
.init();
|
||||
|
||||
let options = LaunchOptions::from_env();
|
||||
tracing::info!(?options, "launching browser");
|
||||
let handle = browser::launch(options).await?;
|
||||
tracing::info!("browser launched; closing");
|
||||
handle.close().await?;
|
||||
Ok(())
|
||||
}
|
||||
217
backend/src/crawler/browser.rs
Normal file
217
backend/src/crawler/browser.rs
Normal file
@@ -0,0 +1,217 @@
|
||||
//! Chromium launcher and lifecycle.
|
||||
//!
|
||||
//! Uses `chromiumoxide`'s `fetcher` feature so we don't depend on a
|
||||
//! system Chrome install — first call downloads a known-good revision
|
||||
//! into a cache dir and reuses it forever after. `BrowserMode` toggles
|
||||
//! headed vs headless; the headed path needs a display (real `$DISPLAY`
|
||||
//! or `xvfb-run`).
|
||||
//!
|
||||
//! Extra Chromium command-line flags can be supplied through
|
||||
//! [`LaunchOptions::extra_args`] in code, or via the
|
||||
//! `CRAWLER_BROWSER_ARGS` env var (whitespace-separated) when going
|
||||
//! through [`LaunchOptions::from_env`]. The launcher always also
|
||||
//! injects `--no-sandbox` and `--disable-dev-shm-usage` because they're
|
||||
//! near-mandatory for containerized Chromium; everything else is
|
||||
//! caller-provided.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::Context;
|
||||
use chromiumoxide::browser::{Browser, BrowserConfig};
|
||||
use chromiumoxide::fetcher::{BrowserFetcher, BrowserFetcherOptions};
|
||||
use futures_util::StreamExt;
|
||||
use tokio::task::JoinHandle;
|
||||
|
||||
/// Whether Chromium is started with or without a visible window.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BrowserMode {
    /// Opens a real window, so a `$DISPLAY` is required (wrapping the
    /// binary in `xvfb-run` also works). The old Puppeteer crawler
    /// defaulted to this, and it stays the assumed mode for the target
    /// site until headless is proven to work against it.
    Headed,
    /// Runs without a window. Faster and lighter on resources, but more
    /// likely to trip fingerprinting on hostile sites.
    Headless,
}
|
||||
|
||||
/// Configuration for a single browser launch.
|
||||
///
|
||||
/// Public fields rather than a builder — there are only two of them
|
||||
/// and callers benefit from struct literal syntax for clarity.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct LaunchOptions {
|
||||
pub mode: BrowserMode,
|
||||
/// Extra Chromium flags, appended after the launcher's own
|
||||
/// defaults. Example: `vec!["--lang=de-DE".into(),
|
||||
/// "--window-size=1280,800".into()]`.
|
||||
pub extra_args: Vec<String>,
|
||||
}
|
||||
|
||||
impl LaunchOptions {
|
||||
pub fn headed() -> Self {
|
||||
Self {
|
||||
mode: BrowserMode::Headed,
|
||||
extra_args: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn headless() -> Self {
|
||||
Self {
|
||||
mode: BrowserMode::Headless,
|
||||
extra_args: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads `CRAWLER_BROWSER_MODE` (`headless`|`headed`, default
|
||||
/// `headed`) and `CRAWLER_BROWSER_ARGS` (whitespace-separated
|
||||
/// Chromium flags). Flags containing whitespace aren't supported
|
||||
/// through the env var — use the programmatic API for those.
|
||||
pub fn from_env() -> Self {
|
||||
let mode = match std::env::var("CRAWLER_BROWSER_MODE").as_deref() {
|
||||
Ok("headless") => BrowserMode::Headless,
|
||||
_ => BrowserMode::Headed,
|
||||
};
|
||||
let extra_args = std::env::var("CRAWLER_BROWSER_ARGS")
|
||||
.map(|s| parse_args(&s))
|
||||
.unwrap_or_default();
|
||||
Self { mode, extra_args }
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LaunchOptions {
|
||||
fn default() -> Self {
|
||||
Self::headed()
|
||||
}
|
||||
}
|
||||
|
||||
/// Splits a `CRAWLER_BROWSER_ARGS`-style string into individual flags.
///
/// Any run of whitespace (spaces, tabs, newlines) acts as a single
/// separator, and leading or trailing whitespace is ignored. Kept
/// separate from `from_env` so it can be unit-tested without touching
/// the process environment.
pub(crate) fn parse_args(s: &str) -> Vec<String> {
    let mut flags = Vec::new();
    for token in s.split_whitespace() {
        flags.push(String::from(token));
    }
    flags
}
|
||||
|
||||
/// Owned browser plus the spawned task that drives its CDP event loop.
|
||||
/// Dropping `Handle` without calling `close` leaks the Chromium process
|
||||
/// — always call `close().await` in production paths.
|
||||
pub struct Handle {
|
||||
browser: Browser,
|
||||
driver: JoinHandle<()>,
|
||||
}
|
||||
|
||||
impl Handle {
|
||||
pub fn browser(&self) -> &Browser {
|
||||
&self.browser
|
||||
}
|
||||
|
||||
pub fn browser_mut(&mut self) -> &mut Browser {
|
||||
&mut self.browser
|
||||
}
|
||||
|
||||
/// Closes the browser and awaits the driver task. Safe to call
|
||||
/// multiple times — subsequent calls are no-ops.
|
||||
pub async fn close(mut self) -> anyhow::Result<()> {
|
||||
let _ = self.browser.close().await;
|
||||
let _ = self.browser.wait().await;
|
||||
let _ = self.driver.await;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Launches Chromium. Downloads it on first run via the `fetcher`
|
||||
/// feature; subsequent runs hit the cache. The cache dir is
|
||||
/// `$CRAWLER_CHROMIUM_DIR` if set, else `$HOME/.cache/mangalord/chromium`,
|
||||
/// else `./.chromium-cache` as a last-resort repo-local fallback.
|
||||
pub async fn launch(options: LaunchOptions) -> anyhow::Result<Handle> {
|
||||
let cache = cache_dir()?;
|
||||
tokio::fs::create_dir_all(&cache)
|
||||
.await
|
||||
.with_context(|| format!("create cache dir {}", cache.display()))?;
|
||||
|
||||
let fetcher = BrowserFetcher::new(
|
||||
BrowserFetcherOptions::builder()
|
||||
.with_path(&cache)
|
||||
.build()
|
||||
.map_err(|e| anyhow::anyhow!("fetcher options: {e}"))?,
|
||||
);
|
||||
tracing::info!(path = %cache.display(), "ensuring chromium revision is present");
|
||||
let info = fetcher
|
||||
.fetch()
|
||||
.await
|
||||
.context("download chromium via fetcher")?;
|
||||
tracing::info!(executable = %info.executable_path.display(), "chromium ready");
|
||||
|
||||
let mut builder = BrowserConfig::builder()
|
||||
.chrome_executable(info.executable_path)
|
||||
// Linux containers / CI commonly lack the user namespaces
|
||||
// Chromium's sandbox wants. Disable it; the crawler runs in its
|
||||
// own container anyway.
|
||||
.arg("--no-sandbox")
|
||||
.arg("--disable-dev-shm-usage");
|
||||
for arg in &options.extra_args {
|
||||
builder = builder.arg(arg);
|
||||
}
|
||||
if matches!(options.mode, BrowserMode::Headed) {
|
||||
builder = builder.with_head();
|
||||
}
|
||||
tracing::info!(
|
||||
mode = ?options.mode,
|
||||
extra_args = ?options.extra_args,
|
||||
"building browser config"
|
||||
);
|
||||
let config = builder
|
||||
.build()
|
||||
.map_err(|e| anyhow::anyhow!("browser config: {e}"))?;
|
||||
|
||||
let (browser, mut handler) = Browser::launch(config)
|
||||
.await
|
||||
.context("launch chromium")?;
|
||||
|
||||
let driver = tokio::spawn(async move {
|
||||
while let Some(event) = handler.next().await {
|
||||
if let Err(err) = event {
|
||||
tracing::warn!(?err, "chromium handler event error");
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
Ok(Handle { browser, driver })
|
||||
}
|
||||
|
||||
fn cache_dir() -> anyhow::Result<PathBuf> {
|
||||
if let Ok(dir) = std::env::var("CRAWLER_CHROMIUM_DIR") {
|
||||
return Ok(PathBuf::from(dir));
|
||||
}
|
||||
if let Ok(home) = std::env::var("HOME") {
|
||||
return Ok(PathBuf::from(home).join(".cache/mangalord/chromium"));
|
||||
}
|
||||
Ok(PathBuf::from("./.chromium-cache"))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Two flags separated by a single space come back as two items.
    #[test]
    fn parse_args_splits_on_whitespace() {
        let parsed = parse_args("--lang=de-DE --window-size=1280,800");
        assert_eq!(parsed, ["--lang=de-DE", "--window-size=1280,800"]);
    }

    /// Tabs, repeated spaces, and leading/trailing whitespace all
    /// collapse into plain separators.
    #[test]
    fn parse_args_tolerates_irregular_whitespace() {
        let parsed = parse_args("  --a\t--b   --c=1\n");
        assert_eq!(parsed, ["--a", "--b", "--c=1"]);
    }

    /// Empty and whitespace-only input both produce no flags.
    #[test]
    fn parse_args_empty_string_yields_empty_vec() {
        for input in ["", "   \t\n"] {
            assert!(parse_args(input).is_empty());
        }
    }
}
|
||||
15
backend/src/crawler/diff.rs
Normal file
15
backend/src/crawler/diff.rs
Normal file
@@ -0,0 +1,15 @@
|
||||
//! Change-detection rules between the source and our DB.
|
||||
//!
|
||||
//! | Event | Signal |
|
||||
//! |--------------------|----------------------------------------------------------------------------------------|
|
||||
//! | New manga | `(source_id, source_manga_key)` not in `manga_sources` |
|
||||
//! | Updated metadata | freshly computed `metadata_hash` differs from the stored one |
|
||||
//! | Dropped manga | `last_seen_at < discover_run_started_at` for N consecutive successful discover runs |
|
||||
//! | New chapter | `(source_id, source_chapter_key)` not in `chapter_sources` |
|
||||
//! | Dropped chapter | present in DB but absent from the latest `fetch_chapter_list` for the same manga |
|
||||
//!
|
||||
//! Dropped is always a soft flag (`dropped_at`), never a row delete —
|
||||
//! restoring is a matter of clearing the flag if the source brings the
|
||||
//! item back.
|
||||
//!
|
||||
//! Scaffold only — implementations land once `repo::crawler` exists.
|
||||
55
backend/src/crawler/jobs.rs
Normal file
55
backend/src/crawler/jobs.rs
Normal file
@@ -0,0 +1,55 @@
|
||||
//! Persistent job queue and the four job kinds.
|
||||
//!
|
||||
//! Backed by Postgres (the `crawler_jobs` table). Workers lease rows
|
||||
//! with `SELECT ... FOR UPDATE SKIP LOCKED`, heartbeat via
|
||||
//! `leased_until`, and ack by transitioning to `done` (or backoff /
|
||||
//! `dead`). Handlers are idempotent so a crash mid-run is recoverable
|
||||
//! by replay.
|
||||
//!
|
||||
//! Scaffold only — the actual queue wrapper and handler dispatch land
|
||||
//! once we have the first `Source` impl exercising the pipeline.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use uuid::Uuid;
|
||||
|
||||
use super::source::DiscoverMode;
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind", rename_all = "snake_case")]
|
||||
pub enum JobPayload {
|
||||
/// Walk the source index and enqueue `SyncManga` jobs.
|
||||
Discover {
|
||||
source_id: String,
|
||||
mode: DiscoverMode,
|
||||
},
|
||||
/// Fetch one manga's detail page, upsert metadata, enqueue
|
||||
/// `SyncChapterList`.
|
||||
SyncManga {
|
||||
source_id: String,
|
||||
source_manga_key: String,
|
||||
},
|
||||
/// Diff the chapter list, enqueue `SyncChapterContent` for new
|
||||
/// chapters, soft-drop vanished ones.
|
||||
SyncChapterList {
|
||||
source_id: String,
|
||||
manga_id: Uuid,
|
||||
source_manga_key: String,
|
||||
},
|
||||
/// Download a single chapter's page images into storage.
|
||||
SyncChapterContent {
|
||||
source_id: String,
|
||||
chapter_id: Uuid,
|
||||
source_chapter_key: String,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, sqlx::Type, Serialize, Deserialize)]
|
||||
#[sqlx(type_name = "text", rename_all = "snake_case")]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum JobState {
|
||||
Pending,
|
||||
Running,
|
||||
Done,
|
||||
Failed,
|
||||
Dead,
|
||||
}
|
||||
19
backend/src/crawler/mod.rs
Normal file
19
backend/src/crawler/mod.rs
Normal file
@@ -0,0 +1,19 @@
|
||||
//! Crawler subsystem.
|
||||
//!
|
||||
//! Runs as its own binary (`src/bin/crawler.rs`) and shares `domain`,
|
||||
//! `repo`, and `storage` with the API binary. Layering mirrors the
|
||||
//! `Storage` trait pattern: callers depend on the `source::Source`
|
||||
//! trait, not on a concrete site; new sites plug in as additional
|
||||
//! impls without touching the job runner.
|
||||
//!
|
||||
//! Submodules:
|
||||
//! - [`browser`]: launches Chromium via `chromiumoxide` and manages its lifecycle.
|
||||
//! First run downloads a known-good build via the `fetcher` feature.
|
||||
//! - [`source`]: the `Source` trait. Per-site impls live alongside it.
|
||||
//! - [`jobs`]: job kinds, queue wrapper, handler dispatch.
|
||||
//! - [`diff`]: change detection — new / updated / dropped semantics.
|
||||
|
||||
pub mod browser;
|
||||
pub mod diff;
|
||||
pub mod jobs;
|
||||
pub mod source;
|
||||
105
backend/src/crawler/source.rs
Normal file
105
backend/src/crawler/source.rs
Normal file
@@ -0,0 +1,105 @@
|
||||
//! `Source` trait — the per-site abstraction.
|
||||
//!
|
||||
//! Job handlers depend on this trait, not on a concrete site. Adding a
|
||||
//! new site is: implement `Source`, register it in a `sources` table
|
||||
//! row, and the existing job pipeline picks it up unchanged.
|
||||
//!
|
||||
//! Scaffold only — the first concrete impl lands in a follow-up commit
|
||||
//! once the target site is locked in.
|
||||
|
||||
use async_trait::async_trait;
|
||||
use chromiumoxide::browser::Browser;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// How a `discover` job should walk the source's index.
|
||||
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
|
||||
pub enum DiscoverMode {
|
||||
/// Walk every index page from last back to first. Used for the
|
||||
/// initial seed of a source.
|
||||
Backfill,
|
||||
/// Walk index pages from page 1 forward, stopping after
|
||||
/// `stop_after_unchanged` consecutive mangas whose `metadata_hash`
|
||||
/// matches storage. Used for the recurring cron tick.
|
||||
Incremental { stop_after_unchanged: usize },
|
||||
}
|
||||
|
||||
/// Reference to a manga as listed in the source's index, before its
/// detail page has been fetched.
#[derive(Clone, Debug)]
pub struct SourceMangaRef {
    /// Whatever stable id the source uses (slug, numeric id, etc).
    pub source_manga_key: String,
    pub title: String,
    pub url: String,
}
|
||||
|
||||
/// Complete manga metadata, as produced by `fetch_manga`.
#[derive(Clone, Debug)]
pub struct SourceManga {
    pub source_manga_key: String,
    pub title: String,
    pub alternative_titles: Vec<String>,
    pub authors: Vec<String>,
    pub genres: Vec<String>,
    pub tags: Vec<String>,
    pub status: Option<String>,
    pub summary: Option<String>,
    pub cover_url: Option<String>,
    /// Computed by the source impl (typically over the normalized field
    /// set); this is the signal `diff` uses to detect metadata updates.
    pub metadata_hash: String,
}
|
||||
|
||||
/// Reference to a single chapter: its source-side key, number, optional
/// title, and URL.
#[derive(Clone, Debug)]
pub struct SourceChapterRef {
    pub source_chapter_key: String,
    pub number: i32,
    pub title: Option<String>,
    pub url: String,
}
|
||||
|
||||
/// A chapter together with the URLs of its page images.
#[derive(Clone, Debug)]
pub struct SourceChapter {
    pub source_chapter_key: String,
    pub number: i32,
    pub title: Option<String>,
    /// Page image URLs in reading order, ready to be fetched and put
    /// into `Storage`.
    pub page_urls: Vec<String>,
}
|
||||
|
||||
/// Context passed to every `Source` call. Owns the browser handle, so
|
||||
/// impls can `browser.new_page(...)` without bringing their own.
|
||||
pub struct FetchContext<'a> {
|
||||
pub browser: &'a Browser,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
pub trait Source: Send + Sync {
|
||||
/// Stable identifier — also the row key in the `sources` table.
|
||||
fn id(&self) -> &'static str;
|
||||
|
||||
async fn discover(
|
||||
&self,
|
||||
ctx: &FetchContext<'_>,
|
||||
mode: DiscoverMode,
|
||||
) -> anyhow::Result<Vec<SourceMangaRef>>;
|
||||
|
||||
async fn fetch_manga(
|
||||
&self,
|
||||
ctx: &FetchContext<'_>,
|
||||
r: &SourceMangaRef,
|
||||
) -> anyhow::Result<SourceManga>;
|
||||
|
||||
async fn fetch_chapter_list(
|
||||
&self,
|
||||
ctx: &FetchContext<'_>,
|
||||
manga: &SourceManga,
|
||||
) -> anyhow::Result<Vec<SourceChapterRef>>;
|
||||
|
||||
async fn fetch_chapter(
|
||||
&self,
|
||||
ctx: &FetchContext<'_>,
|
||||
r: &SourceChapterRef,
|
||||
) -> anyhow::Result<SourceChapter>;
|
||||
}
|
||||
@@ -2,6 +2,7 @@ pub mod api;
|
||||
pub mod app;
|
||||
pub mod auth;
|
||||
pub mod config;
|
||||
pub mod crawler;
|
||||
pub mod domain;
|
||||
pub mod error;
|
||||
pub mod repo;
|
||||
|
||||
157
backend/tests/crawler_browser_smoke.rs
Normal file
157
backend/tests/crawler_browser_smoke.rs
Normal file
@@ -0,0 +1,157 @@
|
||||
//! Smoke test for the Chromium launcher.
|
||||
//!
|
||||
//! Marked `#[ignore]` because it (a) downloads ~150 MB of Chromium on
|
||||
//! first run via the `fetcher` feature and (b) requires a real `$DISPLAY`
|
||||
//! for the headed path. Run it explicitly:
|
||||
//!
|
||||
//! ```sh
|
||||
//! cargo test --test crawler_browser_smoke -- --ignored --nocapture
|
||||
//! ```
|
||||
//!
|
||||
//! Override the cache location with `CRAWLER_CHROMIUM_DIR=/some/path` if
|
||||
//! `$HOME/.cache/mangalord/chromium` isn't writable.
|
||||
|
||||
use mangalord::crawler::browser::{self, LaunchOptions};
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore = "downloads Chromium and needs a display; run with --ignored"]
|
||||
async fn headed_browser_can_navigate_and_read_title() {
|
||||
// A data URL avoids any network dependency — we're testing the
|
||||
// browser launcher, not connectivity.
|
||||
const PAGE: &str = "data:text/html,<html><head><title>Mangalord%20Smoke</title></head><body>OK</body></html>";
|
||||
|
||||
let handle = browser::launch(LaunchOptions::headed())
|
||||
.await
|
||||
.expect("launch headed chromium");
|
||||
|
||||
let page = handle
|
||||
.browser()
|
||||
.new_page(PAGE)
|
||||
.await
|
||||
.expect("open new page");
|
||||
page.wait_for_navigation()
|
||||
.await
|
||||
.expect("wait for navigation");
|
||||
|
||||
let title = page.get_title().await.expect("get title");
|
||||
assert_eq!(title.as_deref(), Some("Mangalord Smoke"));
|
||||
|
||||
handle.close().await.expect("close cleanly");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore = "downloads Chromium; run with --ignored"]
|
||||
async fn headless_browser_can_navigate_and_read_title() {
|
||||
const PAGE: &str = "data:text/html,<html><head><title>Headless%20OK</title></head><body></body></html>";
|
||||
|
||||
let handle = browser::launch(LaunchOptions::headless())
|
||||
.await
|
||||
.expect("launch headless chromium");
|
||||
|
||||
let page = handle.browser().new_page(PAGE).await.expect("open new page");
|
||||
page.wait_for_navigation().await.expect("wait for navigation");
|
||||
|
||||
let title = page.get_title().await.expect("get title");
|
||||
assert_eq!(title.as_deref(), Some("Headless OK"));
|
||||
|
||||
handle.close().await.expect("close cleanly");
|
||||
}
|
||||
|
||||
/// Live end-to-end: navigate to a real page, get the rendered HTML, and
|
||||
/// parse it with `scraper`. ipify.org renders the visitor's public IP
|
||||
/// into the page DOM, so a successful run proves browser → render →
|
||||
/// `Html::parse_document` → selector → text extraction all work
|
||||
/// against a real site. This is the same path each future `Source`
|
||||
/// impl will take.
|
||||
#[tokio::test]
|
||||
#[ignore = "needs network; run with --ignored"]
|
||||
async fn fetches_public_ip_from_ipify() {
|
||||
use std::time::Duration;
|
||||
|
||||
let handle = browser::launch(LaunchOptions::headless())
|
||||
.await
|
||||
.expect("launch headless chromium");
|
||||
|
||||
let page = handle
|
||||
.browser()
|
||||
.new_page("https://www.ipify.org")
|
||||
.await
|
||||
.expect("open ipify");
|
||||
page.wait_for_navigation().await.expect("wait for navigation");
|
||||
// ipify injects the IP via JS after load, so the navigation event
|
||||
// alone isn't enough — give the script a beat to run.
|
||||
tokio::time::sleep(Duration::from_secs(2)).await;
|
||||
|
||||
let html = page.content().await.expect("get rendered html");
|
||||
let doc = scraper::Html::parse_document(&html);
|
||||
let body_sel = scraper::Selector::parse("body").unwrap();
|
||||
let body_text: String = doc
|
||||
.select(&body_sel)
|
||||
.next()
|
||||
.map(|n| n.text().collect::<Vec<_>>().join(" "))
|
||||
.unwrap_or_default();
|
||||
|
||||
let ip = extract_ipv4(&body_text)
|
||||
.unwrap_or_else(|| panic!("no IPv4 found in ipify body: {body_text}"));
|
||||
eprintln!("ipify says our public IP is: {ip}");
|
||||
|
||||
handle.close().await.expect("close cleanly");
|
||||
}
|
||||
|
||||
/// Proves that `LaunchOptions::extra_args` actually reach Chromium and
|
||||
/// influence its runtime. `--user-agent=...` overrides `navigator.userAgent`,
|
||||
/// observable from JS — read it back via `page.evaluate`.
|
||||
#[tokio::test]
|
||||
#[ignore = "downloads Chromium; run with --ignored"]
|
||||
async fn extra_args_reach_chromium() {
|
||||
const UA: &str = "MangalordCrawlerTest/1.0";
|
||||
let options = LaunchOptions {
|
||||
mode: browser::BrowserMode::Headless,
|
||||
extra_args: vec![format!("--user-agent={UA}")],
|
||||
};
|
||||
let handle = browser::launch(options).await.expect("launch with extra args");
|
||||
|
||||
let page = handle
|
||||
.browser()
|
||||
.new_page("about:blank")
|
||||
.await
|
||||
.expect("open page");
|
||||
page.wait_for_navigation().await.expect("wait");
|
||||
|
||||
let ua: String = page
|
||||
.evaluate("navigator.userAgent")
|
||||
.await
|
||||
.expect("evaluate navigator.userAgent")
|
||||
.into_value()
|
||||
.expect("string value");
|
||||
assert_eq!(
|
||||
ua, UA,
|
||||
"extra --user-agent flag should override navigator.userAgent"
|
||||
);
|
||||
|
||||
handle.close().await.expect("close cleanly");
|
||||
}
|
||||
|
||||
/// Tiny dotted-quad finder, which avoids pulling `regex` in just for
/// one test.
///
/// Splits `s` into maximal runs of ASCII digits and dots and returns
/// the first run that is exactly four `0..=255` octets separated by
/// single dots. Dots at the edges of a run are ignored, so an address
/// followed by sentence punctuation (`"... is 203.0.113.7."`) is still
/// found. Runs with more or fewer than four parts (`1.2.3`,
/// `1.2.3.4.5`) are skipped. Returns `None` when nothing matches.
fn extract_ipv4(s: &str) -> Option<String> {
    s.split(|c: char| !(c.is_ascii_digit() || c == '.'))
        // A trailing full stop is punctuation, not part of the address.
        .map(|run| run.trim_matches('.'))
        .find(|candidate| {
            let parts: Vec<&str> = candidate.split('.').collect();
            parts.len() == 4 && parts.iter().all(|p| p.parse::<u8>().is_ok())
        })
        .map(String::from)
}
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "mangalord-frontend",
|
||||
"version": "0.21.3",
|
||||
"version": "0.22.0",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
|
||||
Reference in New Issue
Block a user