feat: crawler scaffold with chromium launcher (0.22.0)

- crawler module (browser, source trait, jobs, diff) + binary
- chromiumoxide launcher with fetcher feature (auto-downloads
  Chromium on first run, caches under ~/.cache/mangalord/chromium)
- LaunchOptions struct with extra_args, parseable from
  CRAWLER_BROWSER_MODE and CRAWLER_BROWSER_ARGS
- migration 0012 introduces sources, manga_sources,
  chapter_sources, crawler_jobs
- integration tests for headed + headless launch, ipify load+parse,
  and extra-args propagation (all #[ignore], opt-in)
This commit is contained in:
MechaCat02
2026-05-20 22:07:56 +02:00
parent 89b8785a40
commit 26eccd0abe
12 changed files with 1951 additions and 27 deletions

View File

@@ -0,0 +1,72 @@
-- Crawler tables.
--
-- Same philosophy as 0001_init.sql: new concepts go in new tables
-- joined to existing ones, not jammed onto `mangas`/`chapters`. A
-- crawled manga IS a manga; the only thing the source-link tables
-- carry is "where did this come from and when did we last see it".
-- That keeps the API and frontend source-agnostic.
-- 1. Source registry. One row per site the crawler knows about.
-- `config` carries per-site knobs (base URL, rate limits, custom
-- selectors) so adding a source is a row insert plus a `Source`
-- trait impl — no schema change.
CREATE TABLE sources (
    -- Text key supplied by the writer; there is no default.
    id         text        NOT NULL,
    name       text        NOT NULL,
    base_url   text        NOT NULL,
    -- New rows are enabled unless the insert says otherwise.
    enabled    boolean     NOT NULL DEFAULT true,
    -- Never NULL: falls back to an empty JSON object.
    config     jsonb       NOT NULL DEFAULT '{}'::jsonb,
    created_at timestamptz NOT NULL DEFAULT now(),
    -- Same name PostgreSQL would generate for an inline PRIMARY KEY.
    CONSTRAINT sources_pkey PRIMARY KEY (id)
);
-- 2. Link tables. `(source_id, source_*_key)` is the natural key the
-- source itself exposes; the FK to `mangas`/`chapters` is what
-- threads it back into our domain. `metadata_hash` (on
-- `manga_sources` only; `chapter_sources` has no such column) is
-- the signal used by `crawler::diff` to detect updates without
-- re-comparing every field. `last_seen_at` + `dropped_at` is the
-- soft-drop pair.
CREATE TABLE manga_sources (
    source_id        text        NOT NULL,
    source_manga_key text        NOT NULL,
    manga_id         uuid        NOT NULL,
    source_url       text        NOT NULL,
    -- Nullable: a link row may exist before any hash is recorded.
    metadata_hash    text,
    first_seen_at    timestamptz NOT NULL DEFAULT now(),
    last_seen_at     timestamptz NOT NULL DEFAULT now(),
    -- Nullable; presumably NULL until the link is soft-dropped — confirm
    -- against the crawler's writer.
    dropped_at       timestamptz,
    -- Constraint names below equal the ones PostgreSQL generates for the
    -- inline forms, spelled out so they are greppable.
    CONSTRAINT manga_sources_pkey
        PRIMARY KEY (source_id, source_manga_key),
    -- Deleting a source or a manga removes its link rows as well.
    CONSTRAINT manga_sources_source_id_fkey
        FOREIGN KEY (source_id) REFERENCES sources (id) ON DELETE CASCADE,
    CONSTRAINT manga_sources_manga_id_fkey
        FOREIGN KEY (manga_id) REFERENCES mangas (id) ON DELETE CASCADE
);

-- Lookup of link rows by manga (the primary key leads with source_id).
CREATE INDEX manga_sources_manga_idx
    ON manga_sources (manga_id);

-- Range scans over last_seen_at within a single source.
CREATE INDEX manga_sources_last_seen_idx
    ON manga_sources (source_id, last_seen_at);
CREATE TABLE chapter_sources (
    source_id          text        NOT NULL,
    source_chapter_key text        NOT NULL,
    chapter_id         uuid        NOT NULL,
    source_url         text        NOT NULL,
    first_seen_at      timestamptz NOT NULL DEFAULT now(),
    last_seen_at       timestamptz NOT NULL DEFAULT now(),
    -- Nullable; presumably NULL until the link is soft-dropped — confirm
    -- against the crawler's writer.
    dropped_at         timestamptz,
    -- Constraint names below equal the ones PostgreSQL generates for the
    -- inline forms, spelled out so they are greppable.
    CONSTRAINT chapter_sources_pkey
        PRIMARY KEY (source_id, source_chapter_key),
    -- Deleting a source or a chapter removes its link rows as well.
    CONSTRAINT chapter_sources_source_id_fkey
        FOREIGN KEY (source_id) REFERENCES sources (id) ON DELETE CASCADE,
    CONSTRAINT chapter_sources_chapter_id_fkey
        FOREIGN KEY (chapter_id) REFERENCES chapters (id) ON DELETE CASCADE
);

-- Lookup of link rows by chapter (the primary key leads with source_id).
CREATE INDEX chapter_sources_chapter_idx
    ON chapter_sources (chapter_id);
-- 3. Persistent job queue. Workers lease with
-- `FOR UPDATE SKIP LOCKED`, heartbeat via `leased_until`, and ack
-- by transitioning state. The partial index keeps the hot path
-- (pick the next ready job) off the bulk of done/dead rows.
CREATE TABLE crawler_jobs (
    id           uuid        NOT NULL DEFAULT gen_random_uuid(),
    -- Job description; required, shape is defined by the application.
    payload      jsonb       NOT NULL,
    -- Lifecycle state; the allowed values are pinned by
    -- crawler_jobs_state_check below.
    state        text        NOT NULL DEFAULT 'pending',
    attempts     integer     NOT NULL DEFAULT 0,
    max_attempts integer     NOT NULL DEFAULT 5,
    -- Ordering key of the ready index below.
    scheduled_at timestamptz NOT NULL DEFAULT now(),
    -- Nullable; presumably NULL while no worker holds the job — confirm
    -- against the lease query.
    leased_until timestamptz,
    last_error   text,
    created_at   timestamptz NOT NULL DEFAULT now(),
    -- NOTE(review): no trigger is visible in this migration, so writers
    -- appear to be responsible for bumping this on every update.
    updated_at   timestamptz NOT NULL DEFAULT now(),
    -- The first two names equal what PostgreSQL would generate for the
    -- inline forms, spelled out so they are greppable.
    CONSTRAINT crawler_jobs_pkey PRIMARY KEY (id),
    CONSTRAINT crawler_jobs_state_check
        CHECK (state IN ('pending', 'running', 'done', 'failed', 'dead')),
    -- Guard the retry counters at the schema level: a negative attempt
    -- count or a non-positive budget can only come from a writer bug and
    -- would make any attempts-vs-max_attempts comparison meaningless.
    CONSTRAINT crawler_jobs_attempts_check
        CHECK (attempts >= 0),
    CONSTRAINT crawler_jobs_max_attempts_check
        CHECK (max_attempts > 0)
);

-- Partial index: only 'pending' and 'failed' rows are indexed, so a
-- query must repeat this state predicate for the planner to use it.
CREATE INDEX crawler_jobs_ready_idx
    ON crawler_jobs (scheduled_at)
    WHERE state IN ('pending', 'failed');