-- Mangalord/backend/migrations/0012_crawler.sql

-- Crawler tables.
--
-- Same philosophy as 0001_init.sql: new concepts go in new tables
-- joined to existing ones, not jammed onto `mangas`/`chapters`. A
-- crawled manga IS a manga; the only thing the source-link tables
-- carry is "where did this come from and when did we last see it".
-- That keeps the API and frontend source-agnostic.

-- 1. Source registry: one row for every site the crawler can talk to.
--    The site root lives in its own `base_url` column; `config` is a
--    free-form jsonb bag for per-site knobs (e.g. rate limits, custom
--    selectors). Onboarding a new source is therefore a row insert
--    plus a `Source` trait impl, with no schema change.
CREATE TABLE sources (
    id         text        NOT NULL,
    name       text        NOT NULL,
    base_url   text        NOT NULL,
    -- Rows are enabled unless the writer says otherwise.
    enabled    boolean     NOT NULL DEFAULT true,
    -- Never NULL: an empty object stands in for "no overrides".
    config     jsonb       NOT NULL DEFAULT '{}'::jsonb,
    created_at timestamptz NOT NULL DEFAULT now(),
    -- Same constraint name PostgreSQL would generate for an inline
    -- `PRIMARY KEY`, spelled out so it is greppable.
    CONSTRAINT sources_pkey PRIMARY KEY (id)
);

-- 2. Link tables. Each row is identified by `(source_id, source_*_key)`,
--    i.e. the natural key the source itself exposes, and points back
--    into our domain through the FK to `mangas`/`chapters`.
--    `metadata_hash` is the signal `crawler::diff` uses to detect
--    updates without re-comparing every field. `last_seen_at` and
--    `dropped_at` together form the soft-drop pair.
CREATE TABLE manga_sources (
    source_id        text        NOT NULL,
    source_manga_key text        NOT NULL,
    manga_id         uuid        NOT NULL,
    source_url       text        NOT NULL,
    metadata_hash    text,                              -- nullable
    first_seen_at    timestamptz NOT NULL DEFAULT now(),
    last_seen_at     timestamptz NOT NULL DEFAULT now(),
    dropped_at       timestamptz,                       -- nullable
    -- Constraint names below are the ones PostgreSQL would generate for
    -- the inline forms; they are written out so errors are greppable.
    CONSTRAINT manga_sources_pkey
        PRIMARY KEY (source_id, source_manga_key),
    -- Deleting a source or a manga removes its link rows with it.
    CONSTRAINT manga_sources_source_id_fkey
        FOREIGN KEY (source_id) REFERENCES sources (id) ON DELETE CASCADE,
    CONSTRAINT manga_sources_manga_id_fkey
        FOREIGN KEY (manga_id) REFERENCES mangas (id) ON DELETE CASCADE
);

-- Index on the `mangas` FK column, so lookups by manga (and the cascade
-- fired by a manga delete) do not have to scan the whole table.
CREATE INDEX manga_sources_manga_idx
    ON manga_sources (manga_id);

-- NOTE(review): presumably serves a per-source "not seen since X" sweep
-- behind the soft-drop logic; confirm against the crawler's queries.
CREATE INDEX manga_sources_last_seen_idx
    ON manga_sources (source_id, last_seen_at);

-- Chapter-level counterpart of `manga_sources`: ties a source's own
-- chapter key to a row in `chapters`. It carries the same
-- first-seen / last-seen / dropped timestamps but has no
-- `metadata_hash` column.
CREATE TABLE chapter_sources (
    source_id          text        NOT NULL,
    source_chapter_key text        NOT NULL,
    chapter_id         uuid        NOT NULL,
    source_url         text        NOT NULL,
    first_seen_at      timestamptz NOT NULL DEFAULT now(),
    last_seen_at       timestamptz NOT NULL DEFAULT now(),
    dropped_at         timestamptz,                     -- nullable
    -- Constraint names below are the ones PostgreSQL would generate for
    -- the inline forms; they are written out so errors are greppable.
    CONSTRAINT chapter_sources_pkey
        PRIMARY KEY (source_id, source_chapter_key),
    -- Deleting a source or a chapter removes its link rows with it.
    CONSTRAINT chapter_sources_source_id_fkey
        FOREIGN KEY (source_id) REFERENCES sources (id) ON DELETE CASCADE,
    CONSTRAINT chapter_sources_chapter_id_fkey
        FOREIGN KEY (chapter_id) REFERENCES chapters (id) ON DELETE CASCADE
);

-- Index on the `chapters` FK column, so lookups by chapter (and the
-- cascade fired by a chapter delete) do not have to scan the whole table.
-- NOTE(review): unlike `manga_sources`, there is no
-- `(source_id, last_seen_at)` index here; confirm that is intentional.
CREATE INDEX chapter_sources_chapter_idx
    ON chapter_sources (chapter_id);

-- 3. Persistent job queue. Workers lease with
--    `FOR UPDATE SKIP LOCKED`, heartbeat via `leased_until`, and ack
--    by transitioning state. The partial index keeps the hot path
--    (pick the next ready job) off the bulk of done/dead rows.
--
--    Every CHECK is named explicitly (using the names PostgreSQL would
--    generate anyway) so a violation in the logs points straight back
--    to this file.
CREATE TABLE crawler_jobs (
    id              uuid PRIMARY KEY DEFAULT gen_random_uuid(),
    payload         jsonb NOT NULL,
    -- Closed set of lifecycle states; new jobs start as 'pending'.
    state           text NOT NULL DEFAULT 'pending'
                       CONSTRAINT crawler_jobs_state_check
                       CHECK (state IN ('pending','running','done','failed','dead')),
    -- Retry counters. Negative values are meaningless under any retry
    -- policy, so reject them at the schema level instead of trusting
    -- every writer to get it right.
    attempts        integer NOT NULL DEFAULT 0
                       CONSTRAINT crawler_jobs_attempts_check
                       CHECK (attempts >= 0),
    max_attempts    integer NOT NULL DEFAULT 5
                       CONSTRAINT crawler_jobs_max_attempts_check
                       CHECK (max_attempts >= 0),
    -- NOTE(review): presumably the earliest instant the job may be
    -- picked up; confirm against the worker's lease query.
    scheduled_at    timestamptz NOT NULL DEFAULT now(),
    -- Nullable: no lease timestamp is recorded by default.
    leased_until    timestamptz,
    last_error      text,
    created_at      timestamptz NOT NULL DEFAULT now(),
    -- Defaults to insert time only; nothing in this migration bumps it
    -- on UPDATE, so writers must set it themselves.
    updated_at      timestamptz NOT NULL DEFAULT now()
);

-- Partial index: only 'pending' and 'failed' rows are indexed, ordered
-- by `scheduled_at`, so picking the next ready job never has to wade
-- through 'running', 'done' or 'dead' rows.
CREATE INDEX crawler_jobs_ready_idx
    ON crawler_jobs (scheduled_at)
    WHERE state IN ('pending', 'failed');