Mangalord/backend/src/crawler/source/target.rs

//! First concrete [`Source`] impl, modeled on the selectors of the
//! old Puppeteer crawler. The name "target" is a placeholder — rename
//! once the site is officially identified.
//!
//! `scraper`'s selector parser does not support `:has()` or
//! `:contains()`, so the labelled-`td` lookups from the old script
//! (`td:has(label:contains("Author:"))`) are implemented by walking
//! the parsed tree.

use std::collections::VecDeque;
use std::time::Duration;

use anyhow::Context;
use async_trait::async_trait;
use sha2::{Digest, Sha256};

use super::{
    DiscoverMode, DiscoverWalk, FetchContext, Source, SourceChapter, SourceChapterRef,
    SourceManga, SourceMangaRef,
};
use crate::crawler::detect::{
    has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
};

/// `sources.id` value for this Source impl. Exposed as a const so the
/// daemon can look up per-source state (e.g. `seed_completed_at`)
/// before constructing the Source itself.
///
/// `TargetSource`'s `Source::id` implementation returns this exact
/// value, so the identifier is defined in one place only.
pub const SOURCE_ID: &str = "target";

/// In-loop retry budget for transient pages encountered during a single
/// `discover` walk. Bounded small because the job system itself retries
/// the whole `Discover` job on failure — these inline retries only need
/// to absorb a brief site hiccup mid-walk.
///
/// Passed as the attempt count to `retry_on_transient` by the page-1
/// probe in `discover` and by the walker's per-page list fetches.
const PAGE_TRANSIENT_RETRY_ATTEMPTS: u32 = 3;
/// Delay handed to `retry_on_transient` together with
/// `PAGE_TRANSIENT_RETRY_ATTEMPTS` (2 seconds).
// NOTE(review): presumably the pause between consecutive attempts —
// confirm against `retry_on_transient` in `crawler::detect`.
const PAGE_TRANSIENT_RETRY_DELAY: Duration = Duration::from_secs(2);

/// [`Source`] implementation for the "target" site.
///
/// Built with `TargetSource::new`, which enables chapter parsing;
/// `TargetSource::without_chapter_parsing` turns it off.
pub struct TargetSource {
    /// Listing URL used as-is for page 1. URLs of later pages are
    /// derived from it by `page_url`.
    base_url: String,
    /// Whether `fetch_manga` fills `SourceManga.chapters` from the
    /// detail page's chapter table (`true`) or leaves it empty.
    parse_chapters: bool,
}

impl TargetSource {
    /// Creates a source rooted at `base_url` with chapter parsing
    /// enabled.
    pub fn new(base_url: impl Into<String>) -> Self {
        let base_url = base_url.into();
        Self {
            base_url,
            parse_chapters: true,
        }
    }

    /// The listing URL this source was constructed with.
    pub fn base_url(&self) -> &str {
        self.base_url.as_str()
    }

    /// Builder-style switch that disables the chapter-list selector
    /// when parsing detail pages.
    ///
    /// After this, `SourceManga.chapters` comes back empty even when
    /// the page has a chapter table. Caller must also avoid calling
    /// `repo::crawler::sync_manga_chapters` for these mangas — an
    /// empty list would otherwise soft-drop the manga's existing
    /// chapter rows.
    pub fn without_chapter_parsing(self) -> Self {
        Self {
            parse_chapters: false,
            ..self
        }
    }
}

#[async_trait]
impl Source for TargetSource {
    fn id(&self) -> &'static str {
        SOURCE_ID
    }

    async fn discover(
        &self,
        ctx: &FetchContext<'_>,
        mode: DiscoverMode,
    ) -> anyhow::Result<Box<dyn DiscoverWalk + Send>> {
        // Always visit page 1 first because that's the only way to
        // discover `last_page`. Retry it on transient — a broken first
        // page would otherwise abort the whole walk before we've even
        // started.
        let first_html = retry_on_transient(
            || async { navigate(ctx, self.base_url.as_str()).await },
            PAGE_TRANSIENT_RETRY_ATTEMPTS,
            PAGE_TRANSIENT_RETRY_DELAY,
        )
        .await?;
        let last_page = {
            let doc = scraper::Html::parse_document(&first_html);
            parse_last_page(&doc)
        };

        let backfill = matches!(mode, DiscoverMode::Backfill);
        let order = build_page_order(last_page, backfill);
        tracing::info!(
            ?mode,
            last_page = ?last_page,
            page_count = order.len(),
            "walking pagination"
        );

        Ok(Box::new(TargetSourceWalker {
            base_url: self.base_url.clone(),
            backfill,
            pages_remaining: order,
            first_page_html: Some(first_html),
        }))
    }

    async fn fetch_manga(
        &self,
        ctx: &FetchContext<'_>,
        r: &SourceMangaRef,
    ) -> anyhow::Result<SourceManga> {
        let html = navigate(ctx, r.url.as_str()).await?;
        // Convert PageError → anyhow::Error via `?`. PageError stays
        // downcastable from the wrapped anyhow::Error so the pipeline
        // can still recognize Transient via `error.downcast_ref::<PageError>()`.
        let manga = parse_manga_detail(&html, &r.source_manga_key, self.parse_chapters)
            .with_context(|| format!("parse manga detail at {}", r.url))?;
        Ok(manga)
    }

    async fn fetch_chapter_list(
        &self,
        _ctx: &FetchContext<'_>,
        _manga: &SourceManga,
    ) -> anyhow::Result<Vec<SourceChapterRef>> {
        anyhow::bail!("fetch_chapter_list not implemented yet")
    }

    async fn fetch_chapter(
        &self,
        _ctx: &FetchContext<'_>,
        _r: &SourceChapterRef,
    ) -> anyhow::Result<SourceChapter> {
        anyhow::bail!("fetch_chapter not implemented yet")
    }
}

/// Build the queue of page numbers `TargetSource::discover` will walk.
/// Backfill is oldest-first: pages `last..=1` (within each page the
/// walker reverses entries, since the source orders by update_date
/// DESC). Incremental is newest-first: pages `1..=last` in natural
/// order. If `last_page` is unknown (source surfaces no pagination)
/// only page 1 is visited.
///
/// A `last_page` below 1 is treated like an unknown one: without the
/// clamp `1..=last` would be empty and the walk would skip page 1
/// altogether.
fn build_page_order(last_page: Option<i32>, backfill: bool) -> VecDeque<i32> {
    let last = last_page.map_or(1, |n| n.max(1));
    if backfill {
        (1..=last).rev().collect()
    } else {
        (1..=last).collect()
    }
}

/// Walker returned by [`TargetSource::discover`]. Pops one source-index
/// page per `next_batch` call. Page 1's HTML is cached at construction
/// time (the discover call needed it to read `last_page` anyway) so the
/// batch covering page 1 doesn't re-fetch.
struct TargetSourceWalker {
    /// Listing URL copied from the owning `TargetSource`; URLs for
    /// pages other than 1 are derived from it with `page_url`.
    base_url: String,
    /// When `true`, the entries of each page are reversed before the
    /// batch is returned.
    backfill: bool,
    /// Page numbers still to be visited, consumed from the front.
    pages_remaining: VecDeque<i32>,
    /// HTML of page 1 captured by `discover`. Taken (left as `None`)
    /// the first time page 1 is served.
    first_page_html: Option<String>,
}

#[async_trait]
impl DiscoverWalk for TargetSourceWalker {
    /// Serves the next listing page as a batch of manga refs, or
    /// `Ok(None)` once every queued page has been consumed.
    ///
    /// Page 1 is answered from the HTML cached by `discover` the first
    /// time it comes up; every other request (including a repeated
    /// page 1) is fetched with inline transient retries. In backfill
    /// mode the entries of the page are reversed before returning.
    async fn next_batch(
        &mut self,
        ctx: &FetchContext<'_>,
    ) -> anyhow::Result<Option<Vec<SourceMangaRef>>> {
        let page_num = match self.pages_remaining.pop_front() {
            Some(n) => n,
            None => return Ok(None),
        };
        let is_first_page = page_num == 1;

        // Take the cached probe HTML (rather than clone) so a malformed
        // page-order queue that re-visits page 1 still falls back to a
        // real fetch.
        let cached_html = if is_first_page {
            self.first_page_html.take()
        } else {
            None
        };

        let mut page_refs = match cached_html {
            Some(html) => {
                let doc = scraper::Html::parse_document(&html);
                parse_manga_list_from(&doc)?
            }
            None => {
                // Page 1 lives at the base URL itself; other pages get
                // their number substituted into it.
                let url = if is_first_page {
                    self.base_url.clone()
                } else {
                    page_url(&self.base_url, page_num)
                };
                retry_on_transient(
                    || async {
                        let html = navigate(ctx, &url).await?;
                        let doc = scraper::Html::parse_document(&html);
                        parse_manga_list_from(&doc)
                    },
                    PAGE_TRANSIENT_RETRY_ATTEMPTS,
                    PAGE_TRANSIENT_RETRY_DELAY,
                )
                .await?
            }
        };

        if self.backfill {
            page_refs.reverse();
        }
        tracing::info!(page_num, count = page_refs.len(), "page walked");
        Ok(Some(page_refs))
    }
}

/// Single point of rate-limited navigation. Every Source request goes
/// through here, so the per-host limiter map is the only knob that
/// controls per-origin RPS. Also the choke point for transient-page
/// detection — every fetched body is screened by
/// [`classify_navigate_html`] before being handed to a selector.
async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError> {
    ctx.rate.wait_for(url).await?;
    let page = ctx
        .browser
        .new_page(url)
        .await
        .map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
    page.wait_for_navigation()
        .await
        .map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
    // Stopgap until we wait on a specific selector per page type —
    // gives any post-load JS a beat to finish injecting content.
    tokio::time::sleep(Duration::from_secs(1)).await;
    let html = page
        .content()
        .await
        .map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
    page.close().await.ok();
    classify_navigate_html(html)
}

/// Screens a fetched body for the site-wide broken-page template.
///
/// Every page type (list, detail, chapter list, reader) gets the same
/// `we're sorry, the request file are not found` body when the server
/// is hiccuping, so checking once here spares the individual parsers
/// from repeating it. A matching body becomes a transient
/// [`PageError`]; anything else is handed back untouched.
fn classify_navigate_html(html: String) -> Result<String, PageError> {
    let broken = is_broken_page_body(&html);
    if broken {
        Err(PageError::transient("broken-page body signature"))
    } else {
        Ok(html)
    }
}

/// Highest page number advertised by the listing's pagination links,
/// or `None` when no link carries a numeric label.
fn parse_last_page(doc: &scraper::Html) -> Option<i32> {
    // Pagination links carry their page number as text. Keeping the
    // numeric maximum avoids depending on a specific layout: Prev,
    // Next, ellipses and the like simply fail to parse and are skipped.
    let links = scraper::Selector::parse("#left_side .pagination a").unwrap();
    let mut highest: Option<i32> = None;
    for anchor in doc.select(&links) {
        let label = collapse_whitespace(&anchor.text().collect::<String>());
        if let Ok(number) = label.parse::<i32>() {
            highest = Some(highest.map_or(number, |seen| seen.max(number)));
        }
    }
    highest
}

/// Replaces the first all-digit `/N/` path segment of `template_url`
/// with `page`. When the URL contains no such segment it is returned
/// unchanged — for the modeled site the segment is always present.
fn page_url(template_url: &str, page: i32) -> String {
    let bytes = template_url.as_bytes();

    // For each '/', measure the digit run that follows it; the first
    // run that is non-empty and closed by another '/' is the page slot.
    let slot = bytes
        .iter()
        .enumerate()
        .filter(|&(_, &byte)| byte == b'/')
        .find_map(|(open, _)| {
            let digits = bytes[open + 1..]
                .iter()
                .take_while(|byte| byte.is_ascii_digit())
                .count();
            let close = open + 1 + digits;
            (digits > 0 && bytes.get(close) == Some(&b'/')).then_some((open, close))
        });

    match slot {
        // Both cut points sit on ASCII '/', so the slices are always on
        // char boundaries.
        Some((open, close)) => format!(
            "{}/{page}/{}",
            &template_url[..open],
            &template_url[close + 1..]
        ),
        None => template_url.to_string(),
    }
}

/// Test-only convenience: parses `html` into a document and runs
/// `parse_manga_list_from` on it.
#[cfg(test)]
fn parse_manga_list(html: &str) -> Result<Vec<SourceMangaRef>, PageError> {
    parse_manga_list_from(&scraper::Html::parse_document(html))
}

/// Extracts the manga links from a listing page.
///
/// `#logo` is present on every well-formed listing page on the source,
/// so a document without it is reported as a transient error (a
/// broken-page placeholder) instead of being mistaken for an empty
/// listing. Genuinely empty listings (last-page tail, search with no
/// hits) still produce `Ok(vec![])`. Anchors with a missing or blank
/// `href`, or with no visible title, are skipped.
fn parse_manga_list_from(doc: &scraper::Html) -> Result<Vec<SourceMangaRef>, PageError> {
    if !has_logo_sentinel(doc) {
        return Err(PageError::transient("manga list: #logo sentinel missing"));
    }

    let entries = scraper::Selector::parse("#left_side .pic_list .updatesli span a").unwrap();
    let mut refs = Vec::new();
    for anchor in doc.select(&entries) {
        let href = match anchor.value().attr("href").map(str::trim) {
            Some(h) if !h.is_empty() => h,
            _ => continue,
        };
        let title = collapse_whitespace(&anchor.text().collect::<String>());
        if title.is_empty() {
            continue;
        }
        refs.push(SourceMangaRef {
            source_manga_key: derive_key_from_url(href),
            title,
            url: href.to_string(),
        });
    }
    Ok(refs)
}

/// Parses a manga detail page into a [`SourceManga`] keyed by `key`.
///
/// The title is the only required field; everything else degrades to
/// `None` or an empty list when its selector finds nothing. Chapters
/// are parsed only when `include_chapters` is set. The metadata hash
/// is computed last, over the fully populated struct.
///
/// # Errors
///
/// Returns a transient [`PageError`] when the `#logo` sentinel is
/// missing, and an error for a page without a `.w-title h1` title.
fn parse_manga_detail(
    html: &str,
    key: &str,
    include_chapters: bool,
) -> Result<SourceManga, PageError> {
    let doc = scraper::Html::parse_document(html);

    // Sentinel before anything else, so a broken-page response is
    // classified as transient rather than as "missing title".
    if !has_logo_sentinel(&doc) {
        return Err(PageError::transient("manga detail: #logo sentinel missing"));
    }

    let title = first_text(&doc, ".w-title h1").context("missing .w-title h1")?;

    // Labelled table cells.
    let status = normalize_status(
        labelled_td_child_text(&doc, "Status", "span").as_deref(),
        key,
    );
    let authors = links_in_labelled_td(&doc, "Author");
    let genres = links_in_labelled_td(&doc, "Genre");
    let alternative_titles = match labelled_td_value_after_label(&doc, "Alternative") {
        Some(raw) => raw
            .split([';', ',', '|'])
            .map(str::trim)
            .filter(|alt| !alt.is_empty())
            .map(String::from)
            .collect(),
        None => Default::default(),
    };

    // Sidebar tags, with their "(NN)" usage counts removed.
    let tag_links = scraper::Selector::parse(".aside-body a.tag").unwrap();
    let mut tags: Vec<String> = Vec::new();
    for link in doc.select(&tag_links) {
        let label = strip_tag_count(&collapse_whitespace(&link.text().collect::<String>()));
        if !label.is_empty() {
            tags.push(label);
        }
    }

    let summary = first_text(&doc, ".manga_summary");
    let cover_url = first_attr(&doc, ".cover > img:nth-child(1)", "src");
    let chapters = match include_chapters {
        true => parse_chapter_list(&doc),
        false => Vec::new(),
    };

    let mut parsed = SourceManga {
        source_manga_key: key.to_string(),
        title,
        alternative_titles,
        authors,
        genres,
        tags,
        status,
        summary,
        cover_url,
        chapters,
        metadata_hash: String::new(),
    };
    parsed.metadata_hash = compute_metadata_hash(&parsed);
    Ok(parsed)
}

/// Maps the source's status label onto the lowercase form the
/// `mangas.status` CHECK constraint accepts.
///
/// "Ongoing" and "Completed" are matched case-insensitively after
/// trimming. A missing or blank value yields `None` silently; any
/// other value is a parse miss (selector drift, new value, etc.) that
/// is logged and also yields `None` — the manga sync continues
/// regardless.
fn normalize_status(raw: Option<&str>, key: &str) -> Option<String> {
    let label = match raw.map(str::trim) {
        Some(text) if !text.is_empty() => text,
        _ => return None,
    };
    match label {
        l if l.eq_ignore_ascii_case("ongoing") => Some("ongoing".to_string()),
        l if l.eq_ignore_ascii_case("completed") => Some("completed".to_string()),
        unknown => {
            tracing::error!(
                key,
                raw_status = unknown,
                "unknown manga status (expected 'Ongoing' or 'Completed'); continuing with status=None"
            );
            None
        }
    }
}

/// Removes a trailing `(NN)` usage count from a tag name — the form the
/// source uses to display tag counts. Only a final, digits-only
/// parenthesis is dropped; non-numeric or empty parentheses are kept.
/// The result is always trimmed.
fn strip_tag_count(s: &str) -> String {
    let name = s.trim();
    let without_count = name
        .strip_suffix(')')
        .and_then(|body| body.rsplit_once('('))
        .filter(|(_, count)| !count.is_empty() && count.bytes().all(|b| b.is_ascii_digit()))
        .map(|(label, _)| label.trim());
    without_count.unwrap_or(name).to_string()
}

/// Collects every chapter link of the detail page's chapter table, in
/// document order.
///
/// Links with a missing or blank `href` are skipped. A link whose text
/// contains no parsable number gets `number == 0`; blank link text
/// yields `title == None`.
fn parse_chapter_list(doc: &scraper::Html) -> Vec<SourceChapterRef> {
    let links = scraper::Selector::parse("#chapter_table td h4 a.chico").unwrap();
    let mut chapters = Vec::new();
    for link in doc.select(&links) {
        let href = match link.value().attr("href").map(str::trim) {
            Some(h) if !h.is_empty() => h,
            _ => continue,
        };
        let label = collapse_whitespace(&link.text().collect::<String>());
        let number = parse_chapter_number(&label).unwrap_or(0);
        let title = if label.is_empty() { None } else { Some(label) };
        chapters.push(SourceChapterRef {
            source_chapter_key: derive_chapter_key_from_url(href),
            number,
            title,
            url: href.to_string(),
        });
    }
    chapters
}

/// Parses the first run of ASCII digits in `text` as an `i32`.
///
/// Returns `None` when there is no digit at all or when the run does
/// not fit an `i32`. Anything after the first run (e.g. the fraction
/// of "12.5") is ignored.
fn parse_chapter_number(text: &str) -> Option<i32> {
    let start = text.find(|c: char| c.is_ascii_digit())?;
    let tail = &text[start..];
    let len = tail
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(tail.len());
    tail[..len].parse().ok()
}

/// Derives a manga key from its URL: the last non-empty path segment,
/// ignoring the query string and trailing slashes. Falls back to the
/// whole `url` when no such segment exists.
fn derive_key_from_url(url: &str) -> String {
    let path = match url.find('?') {
        Some(query_start) => &url[..query_start],
        None => url,
    };
    let key = path
        .trim_end_matches('/')
        .split('/')
        .rfind(|segment| !segment.is_empty())
        .unwrap_or(url);
    key.to_owned()
}

/// Derives a chapter key from a chapter URL.
///
/// Chapter URLs on this source point at the reader's page 1, e.g.
/// `.../uu/br_chapter-379272/pg-1/`. The chapter identity is the
/// `br_chapter-N` (or `to_chapter-N`) segment, while the `pg-\d+`
/// segment identifies a page *within* a chapter. Taking the last path
/// component naively would therefore return `"pg-1"` for every chapter
/// and collapse them all under one source_chapter_key downstream, so a
/// trailing reader-page segment is dropped before the last non-empty
/// segment is taken. Query strings and trailing slashes are ignored;
/// the whole `url` is the fallback when no segment is left.
fn derive_chapter_key_from_url(url: &str) -> String {
    let without_query = url.find('?').map_or(url, |query_start| &url[..query_start]);
    let path = without_query.trim_end_matches('/');

    // Cut off the final segment only when it is a `pg-N` reader page.
    let chapter_path = path
        .rfind('/')
        .filter(|&slash| is_reader_page_segment(&path[slash + 1..]))
        .map_or(path, |slash| &path[..slash]);

    chapter_path
        .split('/')
        .rfind(|segment| !segment.is_empty())
        .unwrap_or(url)
        .to_string()
}

/// True when `s` is a reader-page path segment: the literal `pg-`
/// followed by one or more ASCII digits and nothing else.
fn is_reader_page_segment(s: &str) -> bool {
    match s.strip_prefix("pg-") {
        Some(page) => !page.is_empty() && page.bytes().all(|b| b.is_ascii_digit()),
        None => false,
    }
}

/// Whitespace-collapsed text of the first element matching `sel`.
/// `None` when the selector is invalid, matches nothing, or the
/// element's text is blank.
fn first_text(doc: &scraper::Html, sel: &str) -> Option<String> {
    let selector = scraper::Selector::parse(sel).ok()?;
    doc.select(&selector)
        .next()
        .map(|element| collapse_whitespace(&element.text().collect::<String>()))
        .filter(|text| !text.is_empty())
}

/// Value of attribute `attr` on the first element matching `sel`,
/// returned verbatim. `None` when the selector is invalid, matches
/// nothing, or the element lacks the attribute.
fn first_attr(doc: &scraper::Html, sel: &str, attr: &str) -> Option<String> {
    let selector = scraper::Selector::parse(sel).ok()?;
    let element = doc.select(&selector).next()?;
    let value = element.value().attr(attr)?;
    Some(value.to_owned())
}

/// First `td` (in document order) containing a `label` whose trimmed
/// text begins with `label_prefix` — the `scraper`-friendly equivalent
/// of `td:has(label:contains("Foo"))`.
fn td_with_label<'a>(
    doc: &'a scraper::Html,
    label_prefix: &str,
) -> Option<scraper::ElementRef<'a>> {
    let cells = scraper::Selector::parse("td").unwrap();
    let labels = scraper::Selector::parse("label").unwrap();
    doc.select(&cells).find(|cell| {
        cell.select(&labels).any(|label| {
            label
                .text()
                .collect::<String>()
                .trim()
                .starts_with(label_prefix)
        })
    })
}

/// Whitespace-collapsed text of every non-blank link inside the `td`
/// labelled `label_prefix`; empty when there is no such cell.
fn links_in_labelled_td(doc: &scraper::Html, label_prefix: &str) -> Vec<String> {
    let mut names = Vec::new();
    if let Some(cell) = td_with_label(doc, label_prefix) {
        let anchors = scraper::Selector::parse("a").unwrap();
        for anchor in cell.select(&anchors) {
            let name = collapse_whitespace(&anchor.text().collect::<String>());
            if !name.is_empty() {
                names.push(name);
            }
        }
    }
    names
}

/// Whitespace-collapsed text of the first `child_sel` match inside the
/// `td` labelled `label_prefix`. `None` when the cell is absent, the
/// selector is invalid or matches nothing, or the text is blank.
fn labelled_td_child_text(
    doc: &scraper::Html,
    label_prefix: &str,
    child_sel: &str,
) -> Option<String> {
    let cell = td_with_label(doc, label_prefix)?;
    let selector = scraper::Selector::parse(child_sel).ok()?;
    cell.select(&selector)
        .next()
        .map(|element| collapse_whitespace(&element.text().collect::<String>()))
        .filter(|text| !text.is_empty())
}

/// Text of the `td` labelled `label_prefix` with everything up to and
/// including the first `:` removed — used for "Alternative:", which
/// puts its value directly in the cell rather than in a child element.
/// Without a colon the whole cell text is used. `None` when the cell is
/// absent or the remaining text is blank.
fn labelled_td_value_after_label(
    doc: &scraper::Html,
    label_prefix: &str,
) -> Option<String> {
    let cell = td_with_label(doc, label_prefix)?;
    let raw = cell.text().collect::<String>();
    let value = match raw.find(':') {
        Some(colon) => &raw[colon + 1..],
        None => raw.as_str(),
    };
    let value = collapse_whitespace(value);
    if value.is_empty() {
        None
    } else {
        Some(value)
    }
}

/// Trims `s` and squeezes every internal run of whitespace into a
/// single ASCII space.
fn collapse_whitespace(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}

/// Lowercase-hex SHA-256 digest over the manga's descriptive metadata.
///
/// Fields are fed in a fixed order: title, alternative titles,
/// authors, genres, tags, status, summary, cover URL. The source key,
/// the chapter list and `metadata_hash` itself are not part of the
/// digest. A `None` status/summary/cover is hashed exactly like an
/// empty string.
fn compute_metadata_hash(m: &SourceManga) -> String {
    // Field separators are ASCII unit/record separators so a field
    // containing a delimiter character can't be mistaken for two
    // smaller fields.
    let mut h = Sha256::new();
    // Feeds one scalar field followed by a unit separator (0x1F).
    fn feed(h: &mut Sha256, s: &str) {
        h.update(s.as_bytes());
        h.update(b"\x1F");
    }
    // Feeds every element as a scalar, then closes the list with a
    // record separator (0x1E) so adjacent lists stay distinguishable.
    fn feed_list(h: &mut Sha256, xs: &[String]) {
        for s in xs {
            feed(h, s);
        }
        h.update(b"\x1E");
    }
    // The feed order below is part of the hash definition: reordering
    // these calls changes the digest of every manga.
    feed(&mut h, &m.title);
    feed_list(&mut h, &m.alternative_titles);
    feed_list(&mut h, &m.authors);
    feed_list(&mut h, &m.genres);
    feed_list(&mut h, &m.tags);
    feed(&mut h, m.status.as_deref().unwrap_or(""));
    feed(&mut h, m.summary.as_deref().unwrap_or(""));
    feed(&mut h, m.cover_url.as_deref().unwrap_or(""));
    format!("{:x}", h.finalize())
}

#[cfg(test)]
mod tests {
    use super::*;

    const LISTING_HTML: &str = r#"
        <html><body>
        <header><div id="logo">Target</div></header>
        <div id="left_side">
          <div class="pic_list">
            <div class="updatesli">
              <span><a href="https://target.example/manga/foo">Foo Manga</a></span>
            </div>
            <div class="updatesli">
              <span><a href="https://target.example/manga/bar-baz">  Bar Baz  </a></span>
            </div>
            <div class="updatesli">
              <span><a href="">Empty href ignored</a></span>
            </div>
          </div>
        </div>
        </body></html>
    "#;

    const DETAIL_HTML: &str = r#"
        <html><body>
        <header><div id="logo">Target</div></header>
        <div class="w-title"><h1>Test Manga Title</h1></div>
        <div class="cover"><img src="/cover.jpg"><img src="/extra-not-cover.jpg"></div>
        <div class="manga_summary">A summary of the manga.</div>
        <table>
          <tr><td><label>Author:</label><a href="/a/1">Author One</a><a href="/a/2">Author Two</a></td></tr>
          <tr><td><label>Genre(s):</label><a href="/g/1">Action</a><a href="/g/2">Drama</a></td></tr>
          <tr><td><label>Status:</label><span>Ongoing</span></td></tr>
          <tr><td><label>Alternative:</label> Alt Title 1; Alt Title 2 </td></tr>
        </table>
        <aside><div class="aside-body">
          <a class="tag">Fantasy (21)</a>
          <a class="tag">Romance</a>
          <a class="tag">  Action  (5)</a>
          <a class="not-a-tag">should-be-ignored</a>
        </div></aside>
        <table id="chapter_table">
          <tr><td><h4><a class="chico" href="/manga/foo/chapter/1">Ch.1</a></h4></td></tr>
          <tr><td><h4><a class="chico" href="/manga/foo/chapter/2">Ch.2 - The Beginning</a></h4></td></tr>
          <tr><td><h4><a class="chico" href="/manga/foo/chapter/3">Chapter 3: Onward</a></h4></td></tr>
        </table>
        </body></html>
    "#;

    #[test]
    fn parse_manga_list_extracts_title_url_and_derives_key() {
        let refs = parse_manga_list(LISTING_HTML).expect("parse");
        assert_eq!(refs.len(), 2, "third entry has empty href and is skipped");
        assert_eq!(refs[0].title, "Foo Manga");
        assert_eq!(refs[0].url, "https://target.example/manga/foo");
        assert_eq!(refs[0].source_manga_key, "foo");
        assert_eq!(refs[1].title, "Bar Baz");
        assert_eq!(refs[1].source_manga_key, "bar-baz");
    }

    #[test]
    fn parse_manga_list_returns_transient_when_logo_missing() {
        // Broken-page response: no #logo, no listing. Empty Vec would
        // hide this as "page has no mangas"; Transient is the signal
        // upstream code retries on.
        let html = r#"<html><body>\
            <p>we're sorry, the request file are not found.</p>\
            </body></html>"#;
        let err = parse_manga_list(html).expect_err("expected Transient");
        assert!(err.is_transient(), "got non-transient: {err}");
    }

    #[test]
    fn parse_manga_list_ok_empty_when_logo_present_but_no_items() {
        // Last page of pagination, "no results" search, etc. Legitimately
        // empty must stay distinguishable from "page is broken".
        let html = r#"<html><body>\
            <header><div id="logo">Target</div></header>\
            <div id="left_side"><div class="pic_list"></div></div>\
            </body></html>"#;
        let refs = parse_manga_list(html).expect("logo present == not transient");
        assert!(refs.is_empty());
    }

    #[test]
    fn parse_manga_detail_pulls_all_fields() {
        let m = parse_manga_detail(DETAIL_HTML, "test-key", true).expect("parse");
        assert_eq!(m.source_manga_key, "test-key");
        assert_eq!(m.title, "Test Manga Title");
        assert_eq!(m.summary.as_deref(), Some("A summary of the manga."));
        assert_eq!(m.authors, vec!["Author One", "Author Two"]);
        assert_eq!(m.genres, vec!["Action", "Drama"]);
        assert_eq!(m.status.as_deref(), Some("ongoing"));
        assert_eq!(m.alternative_titles, vec!["Alt Title 1", "Alt Title 2"]);
        // Counts in parentheses are stripped — "Fantasy (21)" → "Fantasy".
        assert_eq!(m.tags, vec!["Fantasy", "Romance", "Action"]);
        assert_eq!(m.cover_url.as_deref(), Some("/cover.jpg"));
        assert!(!m.metadata_hash.is_empty());

        assert_eq!(m.chapters.len(), 3);
        assert_eq!(m.chapters[0].number, 1);
        assert_eq!(m.chapters[0].title.as_deref(), Some("Ch.1"));
        assert_eq!(m.chapters[0].url, "/manga/foo/chapter/1");
        assert_eq!(m.chapters[0].source_chapter_key, "1");
        assert_eq!(m.chapters[1].number, 2);
        assert_eq!(m.chapters[1].title.as_deref(), Some("Ch.2 - The Beginning"));
        assert_eq!(m.chapters[2].number, 3);
        assert_eq!(m.chapters[2].title.as_deref(), Some("Chapter 3: Onward"));
    }

    #[test]
    fn status_normalized_case_insensitively() {
        assert_eq!(normalize_status(Some("Ongoing"), "k").as_deref(), Some("ongoing"));
        assert_eq!(normalize_status(Some("ONGOING"), "k").as_deref(), Some("ongoing"));
        assert_eq!(normalize_status(Some("  completed "), "k").as_deref(), Some("completed"));
    }

    #[test]
    fn unknown_status_logs_and_returns_none() {
        // Logging is observable in test output via tracing-test, but
        // here we just assert the contract: unknown becomes None
        // (and the manga is therefore still synced by the caller).
        assert!(normalize_status(Some("Hiatus"), "k").is_none());
        assert!(normalize_status(Some(""), "k").is_none());
        assert!(normalize_status(None, "k").is_none());
    }

    #[test]
    fn strip_tag_count_drops_trailing_digit_parens_only() {
        assert_eq!(strip_tag_count("Fantasy (21)"), "Fantasy");
        assert_eq!(strip_tag_count("  Action  (5) "), "Action");
        assert_eq!(strip_tag_count("Romance"), "Romance");
        // Non-numeric parens stay put.
        assert_eq!(strip_tag_count("Slice of Life (sub)"), "Slice of Life (sub)");
        // Only the trailing paren is considered.
        assert_eq!(strip_tag_count("Tag (a) (12)"), "Tag (a)");
    }

    #[test]
    fn parse_chapter_list_keeps_all_chapters_with_unique_keys() {
        // Real listing fixture from the target site. 15 rows: chapters
        // with various Ch.N markup, one hiatus row, three "notice." rows,
        // and duplicates of Ch.1 and Ch.52 from different uploaders.
        // Every row must survive parsing and every chapter must have a
        // distinct source_chapter_key — chapter URLs all end in `/pg-1/`
        // (the reader's page-1 entry point), and a naive
        // last-segment-of-URL derivation returns "pg-1" for every row,
        // collapsing the whole list into one downstream chapter row.
        let html = include_str!(
            "../../../tests/fixtures/target/chapter_list_uu.html"
        );
        let doc = scraper::Html::parse_document(html);
        let chapters = parse_chapter_list(&doc);

        assert_eq!(chapters.len(), 15, "every row kept (notices/hiatus included)");

        let mut keys: Vec<&str> =
            chapters.iter().map(|c| c.source_chapter_key.as_str()).collect();
        keys.sort();
        let dupe = keys.windows(2).find(|w| w[0] == w[1]).map(|w| w[0]);
        assert!(dupe.is_none(), "duplicate chapter key: {dupe:?}");
        for c in &chapters {
            assert_ne!(
                c.source_chapter_key, "pg-1",
                "key must not be the reader-page segment: {:?}", c
            );
        }

        // Latest chapter is first (source orders newest → oldest).
        assert_eq!(chapters[0].number, 67);
        assert_eq!(chapters[0].title.as_deref(), Some("Ch.67 : Official"));
        assert_eq!(chapters[0].source_chapter_key, "br_chapter-379272");

        // Duplicate-number chapters (different uploaders) survive as
        // two rows. The (manga_id, number) UNIQUE collapse is a
        // downstream schema concern handled separately.
        assert_eq!(
            chapters.iter().filter(|c| c.number == 52).count(),
            2,
            "two Ch.52 uploads must both survive parsing"
        );
        assert_eq!(
            chapters.iter().filter(|c| c.number == 1).count(),
            2,
            "Ch.1 Official and Ch.1 Team Hazama are both kept"
        );

        // Notices / hiatus rows have no leading digit so they parse to
        // number=0. They are not filtered out.
        let zero = chapters.iter().filter(|c| c.number == 0).count();
        assert!(zero >= 4, "hiatus + 3 notices kept; got {zero}");
    }

    #[test]
    fn parse_chapter_number_grabs_first_integer_run() {
        assert_eq!(parse_chapter_number("Ch.1"), Some(1));
        assert_eq!(parse_chapter_number("Chapter 12"), Some(12));
        assert_eq!(parse_chapter_number("Ch.2 - The Beginning"), Some(2));
        // Decimal chapters keep the integer part (i32 storage).
        assert_eq!(parse_chapter_number("Ch.12.5"), Some(12));
        assert_eq!(parse_chapter_number("Special"), None);
    }

    #[test]
    fn parse_last_page_picks_highest_pagination_link() {
        let html = r#"
            <div id="left_side"><div class="pagination">
              <a href="/list/1/">Prev</a>
              <ol>
                <li><a href="/list/1/">1</a></li>
                <li><a href="/list/2/">2</a></li>
                <li><a href="/list/47/">47</a></li>
                <li><a href="/list/2/">Next</a></li>
              </ol>
            </div></div>
        "#;
        let doc = scraper::Html::parse_document(html);
        assert_eq!(parse_last_page(&doc), Some(47));
    }

    #[test]
    fn parse_last_page_none_when_no_pagination() {
        let doc = scraper::Html::parse_document("<html></html>");
        assert!(parse_last_page(&doc).is_none());
    }

    #[test]
    fn page_url_substitutes_numeric_path_segment() {
        assert_eq!(
            page_url("https://site.example/list/1/?f=1&o=1&sortby=update_date&e=", 5),
            "https://site.example/list/5/?f=1&o=1&sortby=update_date&e="
        );
        // No numeric segment → URL returned unchanged.
        assert_eq!(
            page_url("https://site.example/list/?f=1", 5),
            "https://site.example/list/?f=1"
        );
    }

    #[test]
    fn derive_key_strips_trailing_slash_and_query() {
        assert_eq!(derive_key_from_url("https://x.example/manga/foo/"), "foo");
        assert_eq!(derive_key_from_url("https://x.example/manga/foo?p=1"), "foo");
        assert_eq!(derive_key_from_url("/manga/bar"), "bar");
    }

    #[test]
    fn derive_chapter_key_strips_trailing_reader_page_segment() {
        // (input URL, expected chapter key)
        let cases = [
            // Listing links go to page 1 of the reader; strip /pg-\d+/.
            (".../uu/br_chapter-379272/pg-1/", "br_chapter-379272"),
            (".../uu/to_chapter-13/pg-1/", "to_chapter-13"),
            // Defensive: a deep link to a non-first page should still
            // resolve to the same chapter identity.
            (".../uu/br_chapter-379272/pg-25/", "br_chapter-379272"),
            // No reader-page suffix → behaves like derive_key_from_url.
            (".../uu/br_chapter-379272/", "br_chapter-379272"),
            // Query strings are stripped.
            (".../uu/br_chapter-379272/pg-1/?ref=x", "br_chapter-379272"),
            // `pg-foo` is not a valid reader-page segment; treated as identity.
            (".../uu/something/pg-foo/", "pg-foo"),
            // Bare `pg-` (no digits) likewise not stripped.
            (".../uu/something/pg-/", "pg-"),
        ];
        for &(url, expected) in &cases {
            assert_eq!(derive_chapter_key_from_url(url), expected);
        }
    }

    #[test]
    fn metadata_hash_is_stable_and_field_sensitive() {
        // Third fixture: same page with only the status text flipped.
        let status_flipped = DETAIL_HTML.replace("Ongoing", "Completed");

        // Parse the reference page twice plus the altered variant.
        let [first, second, flipped] = [DETAIL_HTML, DETAIL_HTML, status_flipped.as_str()]
            .map(|html| parse_manga_detail(html, "k", true).unwrap());

        // Identical input → identical hash.
        assert_eq!(first.metadata_hash, second.metadata_hash);
        // One field changed → hash must change.
        assert_ne!(first.metadata_hash, flipped.metadata_hash);
    }

    #[test]
    fn missing_optional_fields_parse_to_none() {
        // Minimal detail page: a logo element and a title, with none of the
        // optional metadata present. Every optional field must come back
        // empty / `None` instead of failing the parse.
        //
        // The fixture is a raw string, in which `\` is an ordinary character
        // and NOT a line continuation, so the lines are broken with plain
        // newlines to keep stray backslashes out of the HTML.
        let html = r#"<html><body>
            <header><div id="logo">Target</div></header>
            <div class="w-title"><h1>Minimal</h1></div></body></html>"#;
        let m = parse_manga_detail(html, "min", true).unwrap();
        assert_eq!(m.title, "Minimal");
        assert!(m.summary.is_none());
        assert!(m.status.is_none());
        assert!(m.authors.is_empty());
        assert!(m.genres.is_empty());
        assert!(m.tags.is_empty());
        assert!(m.alternative_titles.is_empty());
        assert!(m.chapters.is_empty());
    }

    #[test]
    fn parse_manga_detail_skips_chapters_when_disabled() {
        // Parse the shared detail fixture with include_chapters=false: the
        // chapter list must come back empty while the remaining metadata
        // is still extracted.
        let parsed = parse_manga_detail(DETAIL_HTML, "k", false).unwrap();

        let chapters_skipped = parsed.chapters.is_empty();
        assert!(chapters_skipped, "chapters should be empty when disabled");

        assert_eq!(parsed.title, "Test Manga Title", "other fields still parse");
        assert_eq!(parsed.authors, vec!["Author One", "Author Two"]);
    }

    #[test]
    fn parse_manga_detail_errors_on_missing_title() {
        // Logo present (page is alive) — failure here is a real parse
        // miss (Other), not Transient.
        //
        // The fixture is a raw string, in which `\` is an ordinary character
        // and NOT a line continuation, so the lines are broken with plain
        // newlines to keep stray backslashes out of the HTML.
        let html = r#"<html><body>
            <header><div id="logo">Target</div></header>
            <p>nothing</p></body></html>"#;
        let err = parse_manga_detail(html, "x", true).unwrap_err();
        assert!(!err.is_transient(), "expected Other, got Transient: {err}");
        assert!(err.to_string().contains("missing .w-title h1"));
    }

    #[test]
    fn classify_navigate_html_passes_normal_body_through() {
        // A page that carries the logo element must be handed back untouched.
        let page = String::from(concat!(
            "<html><body><header><div id='logo'>Target</div></header>",
            "<p>content</p></body></html>",
        ));
        let returned = classify_navigate_html(page.clone()).expect("ok");
        assert_eq!(returned, page);
    }

    #[test]
    fn classify_navigate_html_returns_transient_for_broken_template() {
        // The "request file are not found" template, with no logo element,
        // must be classified as a transient failure.
        let broken_page = String::from(concat!(
            "<html><head></head><body>",
            "<p>we're sorry, the request file are not found.</p>",
            "</body></html>",
        ));
        let err = classify_navigate_html(broken_page).expect_err("expected Transient");
        assert!(err.is_transient(), "got non-transient: {err}");
    }

    #[test]
    fn parse_manga_detail_returns_transient_when_logo_missing() {
        // A detail URL answering with the broken-page template has no logo.
        // That has to surface as Transient so the job is retried, rather
        // than logging "missing .w-title h1" against a manga that would
        // then be skipped permanently.
        let broken_page = concat!(
            "<html><body>",
            "<p>we're sorry, the request file are not found.</p>",
            "</body></html>",
        );
        let err = parse_manga_detail(broken_page, "x", true).expect_err("expected Transient");
        assert!(err.is_transient(), "got non-transient: {err}");
    }

    #[test]
    fn build_page_order_backfill_is_last_to_one() {
        // Backfill walks oldest-first: the queue is laid out as
        // [last, last-1, ..., 1], so its front is the last page.
        let pages: Vec<_> = build_page_order(Some(3), true).into_iter().collect();
        assert_eq!(pages, [3, 2, 1]);
    }

    #[test]
    fn build_page_order_incremental_is_one_to_last() {
        // Incremental mode keeps the natural source order, newest page first.
        let pages: Vec<_> = build_page_order(Some(3), false).into_iter().collect();
        assert_eq!(pages, [1, 2, 3]);
    }

    #[test]
    fn build_page_order_falls_back_to_page_one_only_without_pagination() {
        // With no known last page, both modes (backfill first, then
        // incremental) visit page 1 and nothing else.
        for backfill in [true, false] {
            let pages: Vec<_> = build_page_order(None, backfill).into_iter().collect();
            assert_eq!(pages, [1]);
        }
    }

    #[test]
    fn build_page_order_single_page_index_yields_one_entry() {
        // A one-page index must produce exactly one entry, never a
        // duplicate, in backfill mode and in incremental mode alike.
        for backfill in [true, false] {
            let pages: Vec<_> = build_page_order(Some(1), backfill).into_iter().collect();
            assert_eq!(pages, [1]);
        }
    }
}