feat: crawler scaffold with chromium launcher (0.22.0)

- crawler module (browser, source trait, jobs, diff) + binary
- chromiumoxide launcher with fetcher feature (auto-downloads Chromium on first run, caches under ~/.cache/mangalord/chromium)
- LaunchOptions struct with extra_args, parseable from CRAWLER_BROWSER_MODE and CRAWLER_BROWSER_ARGS
- migration 0012 introduces sources, manga_sources, chapter_sources, crawler_jobs
- integration tests for headed + headless launch, ipify load+parse, and extra-args propagation (all #[ignore], opt-in)
2026-05-20 22:07:56 +02:00
parent 89b8785a40
commit 26eccd0abe
12 changed files with 1951 additions and 27 deletions
--- a/backend/migrations/0012_crawler.sql
+++ b/backend/migrations/0012_crawler.sql
@@ -0,0 +1,72 @@
+-- Crawler tables.
+--
+-- Same philosophy as 0001_init.sql: new concepts go in new tables
+-- joined to existing ones, not jammed onto `mangas`/`chapters`. A
+-- crawled manga IS a manga; the only thing the source-link tables
+-- carry is "where did this come from and when did we last see it".
+-- That keeps the API and frontend source-agnostic.
+
+-- 1. Source registry. One row per site the crawler knows about.
+--    `base_url` is its own column; `config` carries the remaining
+--    per-site knobs (rate limits, custom selectors) so adding a source
+--    is a row insert plus a `Source` trait impl — no schema change.
+CREATE TABLE sources (
+    id          text        PRIMARY KEY,                          -- text key with no default: the inserter supplies it
+    name        text        NOT NULL,
+    base_url    text        NOT NULL,
+    enabled     bool        DEFAULT TRUE NOT NULL,                -- new rows start enabled
+    config      jsonb       DEFAULT CAST('{}' AS jsonb) NOT NULL, -- empty JSON object unless the insert provides one
+    created_at  timestamptz DEFAULT now() NOT NULL
+);
+
+-- 2. Link tables. `(source_id, source_*_key)` is the natural key the
+--    source itself exposes; the FK to `mangas`/`chapters` is what
+--    threads it back into our domain. `metadata_hash` (present on
+--    `manga_sources` only) lets `crawler::diff` detect updates without
+--    re-comparing every field. `last_seen_at` + `dropped_at` is the soft-drop pair.
+CREATE TABLE manga_sources (
+    source_id           text        REFERENCES sources (id) ON DELETE CASCADE NOT NULL,
+    source_manga_key    text        NOT NULL,                 -- the key the source itself exposes for this manga
+    manga_id            uuid        REFERENCES mangas (id) ON DELETE CASCADE NOT NULL,
+    source_url          text        NOT NULL,
+    metadata_hash       text,                                 -- nullable; update-detection signal for crawler::diff
+    first_seen_at       timestamptz DEFAULT now() NOT NULL,
+    last_seen_at        timestamptz DEFAULT now() NOT NULL,
+    dropped_at          timestamptz,                          -- nullable; soft-drop half of the last_seen_at/dropped_at pair
+    PRIMARY KEY (source_id, source_manga_key)
+);
+CREATE INDEX manga_sources_manga_idx ON manga_sources USING btree (manga_id);
+CREATE INDEX manga_sources_last_seen_idx ON manga_sources USING btree (source_id, last_seen_at);
+
+CREATE TABLE chapter_sources (
+    source_id           text        REFERENCES sources (id) ON DELETE CASCADE NOT NULL,
+    source_chapter_key  text        NOT NULL,                 -- the key the source itself exposes for this chapter
+    chapter_id          uuid        REFERENCES chapters (id) ON DELETE CASCADE NOT NULL,
+    source_url          text        NOT NULL,
+    first_seen_at       timestamptz DEFAULT now() NOT NULL,
+    last_seen_at        timestamptz DEFAULT now() NOT NULL,
+    dropped_at          timestamptz,                          -- nullable; soft-drop half of the last_seen_at/dropped_at pair
+    PRIMARY KEY (source_id, source_chapter_key)
+);
+CREATE INDEX chapter_sources_chapter_idx ON chapter_sources USING btree (chapter_id);
+
+-- 3. Persistent job queue. Workers lease with
+--    `FOR UPDATE SKIP LOCKED`, heartbeat via `leased_until`, and ack
+--    by transitioning state. The partial index keeps the hot path
+--    (pick the next ready job) off the bulk of done/dead rows.
+CREATE TABLE crawler_jobs (
+    id              uuid        DEFAULT gen_random_uuid() PRIMARY KEY,
+    payload         jsonb       NOT NULL,
+    state           text        DEFAULT 'pending' NOT NULL
+                       CHECK (state IN ('pending', 'running', 'done', 'failed', 'dead')),
+    attempts        int         DEFAULT 0 NOT NULL,
+    max_attempts    int         DEFAULT 5 NOT NULL,
+    scheduled_at    timestamptz DEFAULT now() NOT NULL,  -- sort key of the partial ready index
+    leased_until    timestamptz,                         -- nullable; the lease column workers heartbeat through
+    last_error      text,
+    created_at      timestamptz DEFAULT now() NOT NULL,
+    updated_at      timestamptz DEFAULT now() NOT NULL   -- only defaulted here; nothing in this migration refreshes it
+);
+CREATE INDEX crawler_jobs_ready_idx
+    ON crawler_jobs USING btree (scheduled_at)
+    WHERE state IN ('pending', 'failed');