-- Crawler tables.
--
-- Same philosophy as 0001_init.sql: new concepts go in new tables
-- joined to existing ones, not jammed onto `mangas`/`chapters`. A
-- crawled manga IS a manga; the only thing the source-link tables
-- carry is "where did this come from and when did we last see it".
-- That keeps the API and frontend source-agnostic.

-- 1. Source registry. One row per site the crawler knows about.
--    `config` carries per-site knobs (base URL, rate limits, custom
--    selectors) so adding a source is a row insert plus a `Source`
--    trait impl — no schema change.
--
--    NOTE: every `--` comment must end at a newline. If this file is
--    ever collapsed onto one line, the comment swallows the DDL that
--    follows it and the table is silently never created.
CREATE TABLE sources (
    -- Caller-supplied text key (no default is generated for it).
    id         text        PRIMARY KEY,
    name       text        NOT NULL,
    base_url   text        NOT NULL,
    -- New sources are enabled unless the insert says otherwise.
    enabled    boolean     NOT NULL DEFAULT true,
    -- Defaults to an empty JSON object, so readers never see NULL.
    config     jsonb       NOT NULL DEFAULT '{}'::jsonb,
    created_at timestamptz NOT NULL DEFAULT now()
);

-- 2. Link tables. `(source_id, source_*_key)` is the natural key the
--    source itself exposes; the FK to `mangas`/`chapters` is what
--    threads it back into our domain. `metadata_hash` is the signal
--    used by `crawler::diff` to detect updates without re-comparing
--    every field. `last_seen_at` + `dropped_at` is the soft-drop pair.
-- Link row tying one source-side manga key to a row in `mangas`.
-- Deleting either the source or the manga cascades to this row.
CREATE TABLE manga_sources (
    source_id        text        NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
    source_manga_key text        NOT NULL,
    manga_id         uuid        NOT NULL REFERENCES mangas(id) ON DELETE CASCADE,
    source_url       text        NOT NULL,
    -- Nullable: a row may exist before any hash has been recorded.
    metadata_hash    text,
    first_seen_at    timestamptz NOT NULL DEFAULT now(),
    last_seen_at     timestamptz NOT NULL DEFAULT now(),
    -- NULL until the row is soft-dropped.
    dropped_at       timestamptz,
    PRIMARY KEY (source_id, source_manga_key)
);

-- Reverse lookup: all source links for a given manga.
CREATE INDEX manga_sources_manga_idx
    ON manga_sources (manga_id);

-- Per-source scan ordered by when each link was last seen.
CREATE INDEX manga_sources_last_seen_idx
    ON manga_sources (source_id, last_seen_at);

-- Link row tying one source-side chapter key to a row in `chapters`.
-- Same shape as `manga_sources`, minus `metadata_hash`.
CREATE TABLE chapter_sources (
    source_id          text        NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
    source_chapter_key text        NOT NULL,
    chapter_id         uuid        NOT NULL REFERENCES chapters(id) ON DELETE CASCADE,
    source_url         text        NOT NULL,
    first_seen_at      timestamptz NOT NULL DEFAULT now(),
    last_seen_at       timestamptz NOT NULL DEFAULT now(),
    -- NULL until the row is soft-dropped.
    dropped_at         timestamptz,
    PRIMARY KEY (source_id, source_chapter_key)
);

-- Reverse lookup: all source links for a given chapter.
CREATE INDEX chapter_sources_chapter_idx
    ON chapter_sources (chapter_id);

-- 3. Persistent job queue. Workers lease with
--    `FOR UPDATE SKIP LOCKED`, heartbeat via `leased_until`, and ack
--    by transitioning state. The partial index keeps the hot path
--    (pick the next ready job) off the bulk of done/dead rows.
--
--    NOTE: this comment must end at a newline before the DDL below.
--    If the file is collapsed onto one line, the `--` comment swallows
--    both statements and the queue table is silently never created.
CREATE TABLE crawler_jobs (
    id           uuid        PRIMARY KEY DEFAULT gen_random_uuid(),
    payload      jsonb       NOT NULL,
    -- Closed set of states, enforced by the CHECK below.
    state        text        NOT NULL DEFAULT 'pending'
                 CHECK (state IN ('pending','running','done','failed','dead')),
    attempts     integer     NOT NULL DEFAULT 0,
    max_attempts integer     NOT NULL DEFAULT 5,
    scheduled_at timestamptz NOT NULL DEFAULT now(),
    -- NULL when no lease has been recorded.
    leased_until timestamptz,
    last_error   text,
    created_at   timestamptz NOT NULL DEFAULT now(),
    -- NOTE(review): only an insert-time default is declared here; no
    -- trigger is visible in this file, so writers presumably set
    -- `updated_at` themselves on every update — confirm.
    updated_at   timestamptz NOT NULL DEFAULT now()
);

-- Partial index: only 'pending' and 'failed' rows are indexed, so the
-- query predicate must use the same state filter to be able to use it.
CREATE INDEX crawler_jobs_ready_idx
    ON crawler_jobs (scheduled_at)
    WHERE state IN ('pending', 'failed');