- crawler module (browser, source trait, jobs, diff) + binary - chromiumoxide launcher with fetcher feature (auto-downloads Chromium on first run, caches under ~/.cache/mangalord/chromium) - LaunchOptions struct with extra_args, parseable from CRAWLER_BROWSER_MODE and CRAWLER_BROWSER_ARGS - migration 0012 introduces sources, manga_sources, chapter_sources, crawler_jobs - integration tests for headed + headless launch, ipify load+parse, and extra-args propagation (all #[ignore], opt-in)
73 lines
3.3 KiB
SQL
73 lines
3.3 KiB
SQL
-- Crawler tables.
--
-- Same philosophy as 0001_init.sql: new concepts go in new tables
-- joined to existing ones, not jammed onto `mangas`/`chapters`. A
-- crawled manga IS a manga; the only thing the source-link tables
-- carry is "where did this come from and when did we last see it".
-- That keeps the API and frontend source-agnostic.
|
|
|
|
-- 1. Source registry. One row per site the crawler knows about.
-- `config` carries per-site knobs (base URL, rate limits, custom
-- selectors) so adding a source is a row insert plus a `Source`
-- trait impl — no schema change.
|
|
-- Registry of crawlable sites, one row per source, keyed by a text `id`.
-- Only `id`, `name` and `base_url` must be supplied on insert; every
-- other column has a default.
CREATE TABLE sources (
    id         text        NOT NULL,
    name       text        NOT NULL,
    base_url   text        NOT NULL,
    enabled    boolean     NOT NULL DEFAULT TRUE,
    -- Per-site settings blob; starts out as an empty JSON object.
    config     jsonb       NOT NULL DEFAULT CAST('{}' AS jsonb),
    created_at timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- Spelled out with the name PostgreSQL would pick by default.
    CONSTRAINT sources_pkey PRIMARY KEY (id)
);
|
|
|
|
-- 2. Link tables. `(source_id, source_*_key)` is the natural key the
-- source itself exposes; the FK to `mangas`/`chapters` is what
-- threads it back into our domain. `metadata_hash` (present on
-- `manga_sources` only) is the signal used by `crawler::diff` to
-- detect updates without re-comparing every field. `last_seen_at` +
-- `dropped_at` is the soft-drop pair.
|
|
-- Link table tying a source's own manga identifier to a row in `mangas`.
-- The primary key is the pair (source_id, source_manga_key); deleting the
-- referenced source or manga removes the link row as well.
CREATE TABLE manga_sources (
    source_id        text        NOT NULL,
    source_manga_key text        NOT NULL,
    manga_id         uuid        NOT NULL,
    source_url       text        NOT NULL,
    -- Nullable change-detection hash.
    metadata_hash    text,
    first_seen_at    timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP,
    last_seen_at     timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- Nullable soft-drop marker, paired with last_seen_at.
    dropped_at       timestamptz,
    -- Constraint names below are the ones PostgreSQL assigns by default.
    CONSTRAINT manga_sources_pkey
        PRIMARY KEY (source_id, source_manga_key),
    CONSTRAINT manga_sources_source_id_fkey
        FOREIGN KEY (source_id) REFERENCES sources (id) ON DELETE CASCADE,
    CONSTRAINT manga_sources_manga_id_fkey
        FOREIGN KEY (manga_id) REFERENCES mangas (id) ON DELETE CASCADE
);

-- Reverse lookup: link rows for a given manga.
CREATE INDEX manga_sources_manga_idx
    ON manga_sources USING btree (manga_id);

-- Per-source ordering by last sighting.
-- NOTE(review): presumably serves the sweep that finds rows not seen in
-- the latest crawl of a source — confirm against crawler::diff.
CREATE INDEX manga_sources_last_seen_idx
    ON manga_sources USING btree (source_id, last_seen_at);
|
|
|
|
-- Link table tying a source's own chapter identifier to a row in
-- `chapters`. The primary key is the pair (source_id, source_chapter_key);
-- deleting the referenced source or chapter removes the link row as well.
CREATE TABLE chapter_sources (
    source_id          text        NOT NULL,
    source_chapter_key text        NOT NULL,
    chapter_id         uuid        NOT NULL,
    source_url         text        NOT NULL,
    first_seen_at      timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP,
    last_seen_at       timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- Nullable soft-drop marker, paired with last_seen_at.
    dropped_at         timestamptz,
    -- Constraint names below are the ones PostgreSQL assigns by default.
    CONSTRAINT chapter_sources_pkey
        PRIMARY KEY (source_id, source_chapter_key),
    CONSTRAINT chapter_sources_source_id_fkey
        FOREIGN KEY (source_id) REFERENCES sources (id) ON DELETE CASCADE,
    CONSTRAINT chapter_sources_chapter_id_fkey
        FOREIGN KEY (chapter_id) REFERENCES chapters (id) ON DELETE CASCADE
);

-- Reverse lookup: link rows for a given chapter.
CREATE INDEX chapter_sources_chapter_idx
    ON chapter_sources USING btree (chapter_id);
|
|
|
|
-- 3. Persistent job queue. Workers lease with
-- `FOR UPDATE SKIP LOCKED`, heartbeat via `leased_until`, and ack
-- by transitioning state. The partial index covers only
-- `pending`/`failed` rows, which keeps the hot path (pick the next
-- ready job) off the bulk of done/dead rows.
|
|
-- Durable crawler work queue. A row needs only `payload` on insert:
-- the id is generated, the job starts in state 'pending' with zero
-- attempts, and it is scheduled for the current time.
CREATE TABLE crawler_jobs (
    id           uuid        NOT NULL DEFAULT gen_random_uuid(),
    payload      jsonb       NOT NULL,
    state        text        NOT NULL DEFAULT 'pending',
    attempts     integer     NOT NULL DEFAULT 0,
    max_attempts integer     NOT NULL DEFAULT 5,
    scheduled_at timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- Nullable lease expiry; no default, so new jobs start unleased.
    leased_until timestamptz,
    -- Nullable text of the most recent error.
    last_error   text,
    created_at   timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at   timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- Constraint names below are the ones PostgreSQL assigns by default.
    CONSTRAINT crawler_jobs_pkey PRIMARY KEY (id),
    -- Closed set of lifecycle states; anything else is rejected.
    CONSTRAINT crawler_jobs_state_check
        CHECK (state IN ('pending', 'running', 'done', 'failed', 'dead'))
);

-- Partial index over jobs that can still be picked up, ordered by when
-- they become due. Rows in 'running', 'done' or 'dead' are not indexed.
CREATE INDEX crawler_jobs_ready_idx
    ON crawler_jobs USING btree (scheduled_at)
    WHERE state IN ('pending', 'failed');
|