feat: crawler manga-list & metadata sync with cover download (0.23.0)

- TargetSource: first concrete impl of the Source trait, modeled on the old Puppeteer crawler's selectors (+ status normalization, tag-count stripping, chapter list)
- DiscoverMode::Backfill walks pagination last->1, reverse within each page (oldest-first); Incremental walks forward
- RateLimiter (tokio-time aware) plumbed through FetchContext so the pagination walk honors the same per-host budget as the outer loop
- repo::crawler: ensure_source, upsert_manga_from_source (returns New/Updated/Unchanged + current cover_image_path for backfill decisions), sync_manga_chapters, mark_dropped_mangas — all transactional, with case-insensitive lookups and source-insertable genres
- Cover image download via reqwest+infer; stored under mangas/{id}/cover.{ext} via the Storage trait
- Single CRAWLER_PROXY env wires both Chromium (--proxy-server) and reqwest::Proxy::all (HTTP/HTTPS/SOCKS5)
- Crawler binary: positional start URL or $CRAWLER_START_URL, $CRAWLER_LIMIT (cap fetches + skip drop pass on partial runs), $CRAWLER_SKIP_CHAPTERS (disable selector AND sync), $CRAWLER_RATE_MS
- Silences chromiumoxide 0.7's known CDP deserialize log spam via default tracing filter + CdpError::Serde downgrade
- 9 sqlx integration tests + 11 selector/rate-limit unit tests
2026-05-21 22:04:23 +02:00
parent 26eccd0abe
commit b1a3a4e9d3
13 changed files with 1930 additions and 39 deletions
--- a/backend/src/crawler/source/target.rs
+++ b/backend/src/crawler/source/target.rs
@@ -0,0 +1,675 @@
+//! First concrete [`Source`] impl, modeled on the selectors of the
+//! old Puppeteer crawler. The name "target" is a placeholder — rename
+//! once the site is officially identified.
+//!
+//! `scraper`'s selector parser does not support `:has()` or
+//! `:contains()`, so the labelled-`td` lookups from the old script
+//! (`td:has(label:contains("Author:"))`) are implemented by walking
+//! the parsed tree.
+
+use std::time::Duration;
+
+use anyhow::Context;
+use async_trait::async_trait;
+use sha2::{Digest, Sha256};
+
+use super::{
+    DiscoverMode, FetchContext, Source, SourceChapter, SourceChapterRef, SourceManga,
+    SourceMangaRef,
+};
+
+/// [`Source`] implementation for the site this crawler is modeled on.
+///
+/// Construct with [`TargetSource::new`]; chapter-list parsing is enabled
+/// by default and can be turned off with
+/// [`TargetSource::without_chapter_parsing`].
+#[derive(Debug, Clone)]
+pub struct TargetSource {
+    /// URL of the listing page that `discover` navigates to first and
+    /// from which the URLs of the other listing pages are derived.
+    base_url: String,
+    /// Whether detail-page parsing also extracts the chapter table.
+    parse_chapters: bool,
+}
+
+impl TargetSource {
+    /// Creates a source rooted at `base_url` with chapter-list parsing
+    /// enabled.
+    #[must_use]
+    pub fn new(base_url: impl Into<String>) -> Self {
+        Self {
+            base_url: base_url.into(),
+            parse_chapters: true,
+        }
+    }
+
+    /// Returns the URL this source was constructed with.
+    #[must_use]
+    pub fn base_url(&self) -> &str {
+        &self.base_url
+    }
+
+    /// Skip the chapter-list selector when parsing detail pages.
+    /// The returned `SourceManga.chapters` will be empty even when the
+    /// page has a chapter table. Caller must also avoid calling
+    /// `repo::crawler::sync_manga_chapters` for these mangas — an
+    /// empty list would otherwise soft-drop the manga's existing
+    /// chapter rows.
+    ///
+    /// This is a consuming builder: the reconfigured source is the
+    /// return value, so it is marked `#[must_use]` to catch call sites
+    /// that discard it.
+    #[must_use = "this consumes the source and returns the reconfigured one"]
+    pub fn without_chapter_parsing(mut self) -> Self {
+        self.parse_chapters = false;
+        self
+    }
+}
+
+#[async_trait]
+impl Source for TargetSource {
+    /// Identifier string for this source.
+    fn id(&self) -> &'static str {
+        "target"
+    }
+
+    /// Walks the paginated listing and returns the manga references
+    /// found on it.
+    ///
+    /// * `mode` — `Backfill` visits pages last → 1 and reverses the
+    ///   entries of each page; any other mode visits pages 1 → last in
+    ///   listing order.
+    /// * `max_results` — when `Some(n)`, pagination stops as soon as
+    ///   `n` references were collected and the result is cut to `n`.
+    ///
+    /// # Errors
+    /// Propagates the first navigation failure; references collected
+    /// before the failure are not returned.
+    async fn discover(
+        &self,
+        ctx: &FetchContext<'_>,
+        mode: DiscoverMode,
+        max_results: Option<usize>,
+    ) -> anyhow::Result<Vec<SourceMangaRef>> {
+        // Always visit page 1 first because that's the only way to
+        // discover `last_page`. We keep the HTML so we don't have to
+        // re-navigate when the iteration reaches page 1 again.
+        let first_html = navigate(ctx, self.base_url.as_str()).await?;
+        let last_page = {
+            let doc = scraper::Html::parse_document(&first_html);
+            // A pagination widget whose largest number is below 1 would
+            // make the page range empty and silently discard the page
+            // we already fetched, so never go below one page.
+            parse_last_page(&doc).map(|last| last.max(1))
+        };
+
+        let backfill = matches!(mode, DiscoverMode::Backfill);
+        let order: Vec<i32> = match (last_page, backfill) {
+            (None, _) => vec![1],
+            // Backfill = oldest-first: walk pages last → 1, then
+            // reverse within each page (the listing is update_date
+            // DESC, so the bottom of the last page is the oldest
+            // entry the source still surfaces).
+            (Some(last), true) => (1..=last).rev().collect(),
+            (Some(last), false) => (1..=last).collect(),
+        };
+        tracing::info!(
+            ?mode,
+            last_page = ?last_page,
+            page_count = order.len(),
+            "walking pagination"
+        );
+
+        // Page 1 appears exactly once in `order`, so its HTML can be
+        // moved out of the cache instead of cloned.
+        let mut first_html = Some(first_html);
+        let mut all = Vec::new();
+        for page_num in order {
+            let html = if page_num == 1 {
+                match first_html.take() {
+                    Some(cached) => cached,
+                    // Safety net only: reached if page 1 were ever
+                    // listed twice.
+                    None => navigate(ctx, self.base_url.as_str()).await?,
+                }
+            } else {
+                navigate(ctx, &page_url(&self.base_url, page_num)).await?
+            };
+            // The parsed document is confined to this block so it is
+            // dropped before the next `.await`.
+            let mut page_refs = {
+                let doc = scraper::Html::parse_document(&html);
+                parse_manga_list_from(&doc)
+            };
+            if backfill {
+                page_refs.reverse();
+            }
+            tracing::info!(page_num, count = page_refs.len(), "page walked");
+            all.extend(page_refs);
+            if cap_reached(&all, max_results) {
+                tracing::info!(cap = ?max_results, "max_results reached; halting pagination");
+                break;
+            }
+        }
+
+        Ok(truncate_to_cap(all, max_results))
+    }
+
+    /// Navigates to the detail page referenced by `r` and parses it.
+    /// Chapters are included only when chapter parsing is enabled on
+    /// this source.
+    ///
+    /// # Errors
+    /// Fails when navigation fails or the page cannot be parsed; parse
+    /// errors carry the page URL as context.
+    async fn fetch_manga(
+        &self,
+        ctx: &FetchContext<'_>,
+        r: &SourceMangaRef,
+    ) -> anyhow::Result<SourceManga> {
+        let html = navigate(ctx, r.url.as_str()).await?;
+        parse_manga_detail(&html, &r.source_manga_key, self.parse_chapters)
+            .with_context(|| format!("parse manga detail at {}", r.url))
+    }
+
+    /// Not implemented yet — always returns an error.
+    async fn fetch_chapter_list(
+        &self,
+        _ctx: &FetchContext<'_>,
+        _manga: &SourceManga,
+    ) -> anyhow::Result<Vec<SourceChapterRef>> {
+        anyhow::bail!("fetch_chapter_list not implemented yet")
+    }
+
+    /// Not implemented yet — always returns an error.
+    async fn fetch_chapter(
+        &self,
+        _ctx: &FetchContext<'_>,
+        _r: &SourceChapterRef,
+    ) -> anyhow::Result<SourceChapter> {
+        anyhow::bail!("fetch_chapter not implemented yet")
+    }
+}
+
+/// Reports whether `buf` already holds at least `max` items. Without a
+/// cap (`None`) the answer is always `false`.
+fn cap_reached<T>(buf: &[T], max: Option<usize>) -> bool {
+    match max {
+        Some(limit) => buf.len() >= limit,
+        None => false,
+    }
+}
+
+/// Shortens `buf` to at most `max` items, keeping the leading ones.
+/// Without a cap (`None`) the vector is returned untouched.
+fn truncate_to_cap<T>(mut buf: Vec<T>, max: Option<usize>) -> Vec<T> {
+    match max {
+        Some(limit) => {
+            buf.truncate(limit);
+            buf
+        }
+        None => buf,
+    }
+}
+
+/// Single point of rate-limited navigation. Every Source request goes
+/// through here, so the limiter is the only knob that controls
+/// per-host RPS.
+///
+/// Waits on the shared rate limiter, opens `url` in a new browser
+/// page, waits for navigation, and returns the page's HTML.
+///
+/// The page is closed on every path once it has been opened; earlier
+/// versions returned early on a navigation/content error and left the
+/// page open in the browser.
+///
+/// # Errors
+/// Returns the navigation/content error if there was one; otherwise
+/// returns the error from closing the page, if any.
+async fn navigate(ctx: &FetchContext<'_>, url: &str) -> anyhow::Result<String> {
+    ctx.rate.lock().await.wait().await;
+    let page = ctx.browser.new_page(url).await?;
+
+    // Drop the borrowed page handle from the Ok value right away so
+    // nothing but the outcome is carried forward.
+    let navigated = page.wait_for_navigation().await.map(|_| ());
+    let html: anyhow::Result<String> = match navigated {
+        Ok(()) => {
+            // Stopgap until we wait on a specific selector per page type —
+            // gives any post-load JS a beat to finish injecting content.
+            tokio::time::sleep(Duration::from_secs(1)).await;
+            page.content().await.map_err(anyhow::Error::from)
+        }
+        Err(err) => Err(anyhow::Error::from(err)),
+    };
+
+    // Close before inspecting the outcome so a failed load cannot leak
+    // the page.
+    let closed = page.close().await;
+
+    // The load error is the primary failure; a close error only
+    // surfaces when the load itself succeeded.
+    let html = html?;
+    closed?;
+    Ok(html)
+}
+
+/// Returns the largest page number advertised by the pagination links
+/// under `#left_side .pagination`, or `None` when no link text parses
+/// as an integer.
+fn parse_last_page(doc: &scraper::Html) -> Option<i32> {
+    // Pagination links carry their page number as text. Keeping the
+    // numeric maximum avoids depending on a specific layout: Prev,
+    // Next, ellipses, etc. simply fail to parse and are skipped.
+    let links = scraper::Selector::parse("#left_side .pagination a").unwrap();
+    let mut highest: Option<i32> = None;
+    for link in doc.select(&links) {
+        let label = collapse_whitespace(&link.text().collect::<String>());
+        if let Ok(number) = label.parse::<i32>() {
+            highest = Some(highest.map_or(number, |seen| seen.max(number)));
+        }
+    }
+    highest
+}
+
+/// Substitutes the first `/N/` path segment with the target page
+/// number. Source impls that paginate via a different URL shape can
+/// override this — for the modeled site the segment is always present.
+///
+/// Only the part of `template_url` before the first `?` or `#` is
+/// searched, so a `/N/` that happens to occur inside the query string
+/// or fragment is never rewritten. When no all-digit segment enclosed
+/// in slashes exists, the URL is returned unchanged.
+fn page_url(template_url: &str, page: i32) -> String {
+    // `?` and `#` are single-byte ASCII, so this index is a char boundary.
+    let path_end = template_url
+        .find(['?', '#'])
+        .unwrap_or(template_url.len());
+    let path = &template_url[..path_end];
+
+    let mut search_from = 0;
+    while let Some(offset) = path[search_from..].find('/') {
+        let slash = search_from + offset;
+        let digits_start = slash + 1;
+        let digits_len = path[digits_start..]
+            .bytes()
+            .take_while(u8::is_ascii_digit)
+            .count();
+        let digits_end = digits_start + digits_len;
+
+        if digits_len > 0 && path[digits_end..].starts_with('/') {
+            // Keep everything before the opening slash and after the
+            // closing slash (including any query string) verbatim.
+            return format!(
+                "{}/{page}/{}",
+                &template_url[..slash],
+                &template_url[digits_end + 1..]
+            );
+        }
+        // Resume at the byte that ended the digit run; if that byte is
+        // a slash it is the next candidate opener.
+        search_from = digits_end;
+    }
+    template_url.to_string()
+}
+
+/// Test-only convenience: parses `html` into a document and extracts
+/// the listing entries from it.
+#[cfg(test)]
+fn parse_manga_list(html: &str) -> Vec<SourceMangaRef> {
+    parse_manga_list_from(&scraper::Html::parse_document(html))
+}
+
+/// Extracts one [`SourceMangaRef`] per listing link in `doc`, in
+/// document order. Links without an `href`, with a blank `href`, or
+/// with no text are skipped; the key is derived from the link URL.
+fn parse_manga_list_from(doc: &scraper::Html) -> Vec<SourceMangaRef> {
+    let entry_links =
+        scraper::Selector::parse("#left_side .pic_list .updatesli span a").unwrap();
+
+    let mut refs = Vec::new();
+    for link in doc.select(&entry_links) {
+        let Some(href) = link.value().attr("href") else {
+            continue;
+        };
+        let url = href.trim().to_string();
+        if url.is_empty() {
+            continue;
+        }
+        let title = collapse_whitespace(&link.text().collect::<String>());
+        if title.is_empty() {
+            continue;
+        }
+        let source_manga_key = derive_key_from_url(&url);
+        refs.push(SourceMangaRef {
+            source_manga_key,
+            title,
+            url,
+        });
+    }
+    refs
+}
+
+/// Parses a manga detail page into a [`SourceManga`].
+///
+/// * `key` — stored as `source_manga_key` and passed to the status
+///   normalizer for its log line.
+/// * `include_chapters` — when `false` the chapter table is not read
+///   and `chapters` is left empty.
+///
+/// # Errors
+/// Fails only when the title (`.w-title h1`) is missing or blank; every
+/// other field falls back to `None` or an empty list.
+fn parse_manga_detail(
+    html: &str,
+    key: &str,
+    include_chapters: bool,
+) -> anyhow::Result<SourceManga> {
+    let doc = scraper::Html::parse_document(html);
+
+    // The title is the one mandatory field, so resolve it before
+    // anything else (in particular before the status normalizer, which
+    // may log).
+    let title = first_text(&doc, ".w-title h1").context("missing .w-title h1")?;
+
+    let status = {
+        let raw = labelled_td_child_text(&doc, "Status", "span");
+        normalize_status(raw.as_deref(), key)
+    };
+
+    // The cell holds every alternative title in one string, separated
+    // by `;`, `,` or `|`.
+    let alternative_titles = match labelled_td_value_after_label(&doc, "Alternative") {
+        Some(cell) => cell
+            .split([';', ',', '|'])
+            .map(str::trim)
+            .filter(|part| !part.is_empty())
+            .map(String::from)
+            .collect(),
+        None => Default::default(),
+    };
+
+    let tag_links = scraper::Selector::parse(".aside-body a.tag").unwrap();
+    let mut tags: Vec<String> = Vec::new();
+    for link in doc.select(&tag_links) {
+        let label = collapse_whitespace(&link.text().collect::<String>());
+        let name = strip_tag_count(&label);
+        if !name.is_empty() {
+            tags.push(name);
+        }
+    }
+
+    let chapters = include_chapters
+        .then(|| parse_chapter_list(&doc))
+        .unwrap_or_default();
+
+    // The hash is computed from the finished struct, so build it with a
+    // placeholder first and fill the hash in afterwards.
+    let mut manga = SourceManga {
+        source_manga_key: key.to_string(),
+        title,
+        alternative_titles,
+        authors: links_in_labelled_td(&doc, "Author"),
+        genres: links_in_labelled_td(&doc, "Genre"),
+        tags,
+        status,
+        summary: first_text(&doc, ".manga_summary"),
+        cover_url: first_attr(&doc, ".cover > img:nth-child(1)", "src"),
+        chapters,
+        metadata_hash: String::new(),
+    };
+    manga.metadata_hash = compute_metadata_hash(&manga);
+    Ok(manga)
+}
+
+/// Maps the source's status text ("Ongoing" / "Completed", any ASCII
+/// case, surrounding whitespace ignored) to the lowercase form the
+/// `mangas.status` CHECK constraint accepts.
+///
+/// Missing or blank input yields `None` silently. Any other value is
+/// treated as a parse miss (selector drift, new value, etc.): it is
+/// logged together with `key` and also yields `None`, so the manga
+/// sync continues regardless.
+fn normalize_status(raw: Option<&str>, key: &str) -> Option<String> {
+    let candidate = raw?.trim();
+    if candidate.is_empty() {
+        return None;
+    }
+
+    for known in ["ongoing", "completed"] {
+        if candidate.eq_ignore_ascii_case(known) {
+            return Some(known.to_string());
+        }
+    }
+
+    tracing::error!(
+        key,
+        raw_status = candidate,
+        "unknown manga status (expected 'Ongoing' or 'Completed'); continuing with status=None"
+    );
+    None
+}
+
+/// Removes a trailing `(NN)` group — the way the source displays tag
+/// counts — from a tag name and trims the result. The group is only
+/// removed when it contains nothing but ASCII digits; any other
+/// parenthesised suffix is kept.
+fn strip_tag_count(s: &str) -> String {
+    let tag = s.trim();
+    let numeric_suffix = tag
+        .strip_suffix(')')
+        .and_then(|body| body.rsplit_once('('))
+        .filter(|(_, count)| !count.is_empty() && count.chars().all(|c| c.is_ascii_digit()));
+
+    match numeric_suffix {
+        Some((name, _)) => name.trim().to_string(),
+        None => tag.to_string(),
+    }
+}
+
+/// Extracts one [`SourceChapterRef`] per chapter link found in
+/// `#chapter_table`, in document order. Links without an `href` or with
+/// a blank one are skipped. The chapter number is read from the link
+/// text and falls back to `0` when the text contains no usable number;
+/// a blank link text produces `title: None`.
+fn parse_chapter_list(doc: &scraper::Html) -> Vec<SourceChapterRef> {
+    let chapter_links = scraper::Selector::parse("#chapter_table td h4 a.chico").unwrap();
+
+    let mut chapters = Vec::new();
+    for link in doc.select(&chapter_links) {
+        let Some(href) = link.value().attr("href") else {
+            continue;
+        };
+        let url = href.trim().to_string();
+        if url.is_empty() {
+            continue;
+        }
+
+        let label = collapse_whitespace(&link.text().collect::<String>());
+        let number = parse_chapter_number(&label).unwrap_or(0);
+        let title = if label.is_empty() { None } else { Some(label) };
+
+        chapters.push(SourceChapterRef {
+            source_chapter_key: derive_key_from_url(&url),
+            number,
+            title,
+            url,
+        });
+    }
+    chapters
+}
+
+/// Parses the first uninterrupted run of ASCII digits in `text` as an
+/// `i32`. Returns `None` when there is no digit at all or the run does
+/// not fit into an `i32`.
+fn parse_chapter_number(text: &str) -> Option<i32> {
+    let start = text.find(|c: char| c.is_ascii_digit())?;
+    let tail = &text[start..];
+    let run_len = tail
+        .find(|c: char| !c.is_ascii_digit())
+        .unwrap_or(tail.len());
+    tail[..run_len].parse().ok()
+}
+
+/// Derives a key from a URL: the last non-empty path segment, after
+/// dropping the query string, the fragment and any trailing slashes.
+///
+/// The fragment is dropped as well as the query so that
+/// `…/manga/foo` and `…/manga/foo#comments` yield the same key. When
+/// nothing usable remains (for example the input is only a query
+/// string), the whole input is returned unchanged.
+fn derive_key_from_url(url: &str) -> String {
+    // `?` and `#` are single-byte ASCII, so this index is a char boundary.
+    let path_end = url.find(['?', '#']).unwrap_or(url.len());
+    url[..path_end]
+        .trim_end_matches('/')
+        .rsplit('/')
+        .find(|segment| !segment.is_empty())
+        .unwrap_or(url)
+        .to_string()
+}
+
+/// Whitespace-collapsed text of the first element matching `sel`.
+/// Returns `None` when the selector does not parse, nothing matches, or
+/// the text is blank.
+fn first_text(doc: &scraper::Html, sel: &str) -> Option<String> {
+    let selector = scraper::Selector::parse(sel).ok()?;
+    doc.select(&selector)
+        .next()
+        .map(|el| collapse_whitespace(&el.text().collect::<String>()))
+        .filter(|text| !text.is_empty())
+}
+
+/// Value of attribute `attr` on the first element matching `sel`, with
+/// surrounding whitespace removed.
+///
+/// Returns `None` when the selector does not parse, nothing matches,
+/// the attribute is absent, or its value is blank. Blank values are
+/// dropped so that an empty `src=""` is reported as "no value" rather
+/// than as an empty URL — the same treatment the listing and chapter
+/// parsers give to `href`.
+fn first_attr(doc: &scraper::Html, sel: &str, attr: &str) -> Option<String> {
+    let selector = scraper::Selector::parse(sel).ok()?;
+    let el = doc.select(&selector).next()?;
+    let value = el.value().attr(attr)?.trim();
+    (!value.is_empty()).then(|| value.to_string())
+}
+
+/// Finds the first `td` (in document order) that contains a `label`
+/// whose trimmed text starts with `label_prefix` — the
+/// `scraper`-friendly stand-in for `td:has(label:contains("Foo"))`.
+fn td_with_label<'a>(
+    doc: &'a scraper::Html,
+    label_prefix: &str,
+) -> Option<scraper::ElementRef<'a>> {
+    let cells = scraper::Selector::parse("td").unwrap();
+    let labels = scraper::Selector::parse("label").unwrap();
+
+    doc.select(&cells).find(|cell| {
+        cell.select(&labels).any(|label| {
+            let text = label.text().collect::<String>();
+            text.trim().starts_with(label_prefix)
+        })
+    })
+}
+
+/// Whitespace-collapsed texts of all links inside the `td` labelled
+/// with `label_prefix`, in document order. Blank link texts are
+/// dropped; a missing cell yields an empty list.
+fn links_in_labelled_td(doc: &scraper::Html, label_prefix: &str) -> Vec<String> {
+    let mut names = Vec::new();
+    if let Some(cell) = td_with_label(doc, label_prefix) {
+        let anchors = scraper::Selector::parse("a").unwrap();
+        for anchor in cell.select(&anchors) {
+            let name = collapse_whitespace(&anchor.text().collect::<String>());
+            if !name.is_empty() {
+                names.push(name);
+            }
+        }
+    }
+    names
+}
+
+/// Whitespace-collapsed text of the first `child_sel` match inside the
+/// `td` labelled with `label_prefix`. Returns `None` when the cell is
+/// missing, the selector does not parse, nothing matches, or the text
+/// is blank.
+fn labelled_td_child_text(
+    doc: &scraper::Html,
+    label_prefix: &str,
+    child_sel: &str,
+) -> Option<String> {
+    let cell = td_with_label(doc, label_prefix)?;
+    let selector = scraper::Selector::parse(child_sel).ok()?;
+    cell.select(&selector)
+        .next()
+        .map(|el| collapse_whitespace(&el.text().collect::<String>()))
+        .filter(|text| !text.is_empty())
+}
+
+/// Text of the labelled `td` with everything up to and including the
+/// first `:` removed, whitespace-collapsed — used for "Alternative:",
+/// where the value sits directly in the cell rather than in a child
+/// element. When the cell text has no `:` the whole text is used.
+/// Returns `None` when the cell is missing or the remainder is blank.
+fn labelled_td_value_after_label(
+    doc: &scraper::Html,
+    label_prefix: &str,
+) -> Option<String> {
+    let cell_text = td_with_label(doc, label_prefix)?
+        .text()
+        .collect::<String>();
+    let raw_value = match cell_text.split_once(':') {
+        Some((_, rest)) => rest,
+        None => cell_text.as_str(),
+    };
+    let value = collapse_whitespace(raw_value);
+    if value.is_empty() {
+        None
+    } else {
+        Some(value)
+    }
+}
+
+/// Trims `s` and replaces every run of whitespace inside it with a
+/// single space.
+fn collapse_whitespace(s: &str) -> String {
+    let mut collapsed = String::with_capacity(s.len());
+    for word in s.split_whitespace() {
+        if !collapsed.is_empty() {
+            collapsed.push(' ');
+        }
+        collapsed.push_str(word);
+    }
+    collapsed
+}
+
+/// Lower-case hex SHA-256 over the manga's title, alternative titles,
+/// authors, genres, tags, status, summary and cover URL, fed in that
+/// order. Missing optional fields are hashed as the empty string.
+fn compute_metadata_hash(m: &SourceManga) -> String {
+    // Every value is terminated by the ASCII unit separator and every
+    // list by the ASCII record separator, so a field containing a
+    // delimiter character can't be mistaken for two smaller fields.
+    const VALUE_END: &[u8] = b"\x1F";
+    const LIST_END: &[u8] = b"\x1E";
+
+    fn scalar(hasher: &mut Sha256, value: &str) {
+        hasher.update(value.as_bytes());
+        hasher.update(VALUE_END);
+    }
+    fn list(hasher: &mut Sha256, values: &[String]) {
+        values.iter().for_each(|value| scalar(hasher, value));
+        hasher.update(LIST_END);
+    }
+
+    let mut hasher = Sha256::new();
+    scalar(&mut hasher, &m.title);
+    list(&mut hasher, &m.alternative_titles);
+    list(&mut hasher, &m.authors);
+    list(&mut hasher, &m.genres);
+    list(&mut hasher, &m.tags);
+    for optional in [&m.status, &m.summary, &m.cover_url] {
+        scalar(&mut hasher, optional.as_deref().unwrap_or(""));
+    }
+
+    let digest = hasher.finalize();
+    format!("{:x}", digest)
+}
+
+/// Unit tests for the pure parsing helpers of this module. They run
+/// against inline HTML fixtures and never touch the browser.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Listing fixture: two well-formed entries (the second with padded
+    // link text) followed by one entry whose `href` is empty.
+    const LISTING_HTML: &str = r#"
+        <html><body>
+        <div id="left_side">
+          <div class="pic_list">
+            <div class="updatesli">
+              <span><a href="https://target.example/manga/foo">Foo Manga</a></span>
+            </div>
+            <div class="updatesli">
+              <span><a href="https://target.example/manga/bar-baz">  Bar Baz  </a></span>
+            </div>
+            <div class="updatesli">
+              <span><a href="">Empty href ignored</a></span>
+            </div>
+          </div>
+        </div>
+        </body></html>
+    "#;
+
+    // Detail fixture exercising every field the detail parser reads:
+    // title, cover (with a second image that must not be picked),
+    // summary, the four labelled table cells, tags with and without a
+    // count suffix, and a three-row chapter table.
+    const DETAIL_HTML: &str = r#"
+        <html><body>
+        <div class="w-title"><h1>Test Manga Title</h1></div>
+        <div class="cover"><img src="/cover.jpg"><img src="/extra-not-cover.jpg"></div>
+        <div class="manga_summary">A summary of the manga.</div>
+        <table>
+          <tr><td><label>Author:</label><a href="/a/1">Author One</a><a href="/a/2">Author Two</a></td></tr>
+          <tr><td><label>Genre(s):</label><a href="/g/1">Action</a><a href="/g/2">Drama</a></td></tr>
+          <tr><td><label>Status:</label><span>Ongoing</span></td></tr>
+          <tr><td><label>Alternative:</label> Alt Title 1; Alt Title 2 </td></tr>
+        </table>
+        <aside><div class="aside-body">
+          <a class="tag">Fantasy (21)</a>
+          <a class="tag">Romance</a>
+          <a class="tag">  Action  (5)</a>
+          <a class="not-a-tag">should-be-ignored</a>
+        </div></aside>
+        <table id="chapter_table">
+          <tr><td><h4><a class="chico" href="/manga/foo/chapter/1">Ch.1</a></h4></td></tr>
+          <tr><td><h4><a class="chico" href="/manga/foo/chapter/2">Ch.2 - The Beginning</a></h4></td></tr>
+          <tr><td><h4><a class="chico" href="/manga/foo/chapter/3">Chapter 3: Onward</a></h4></td></tr>
+        </table>
+        </body></html>
+    "#;
+
+    /// Listing entries keep their URL, get a whitespace-collapsed title
+    /// and a key derived from the URL; the empty-href entry is dropped.
+    #[test]
+    fn parse_manga_list_extracts_title_url_and_derives_key() {
+        let refs = parse_manga_list(LISTING_HTML);
+        assert_eq!(refs.len(), 2, "third entry has empty href and is skipped");
+        assert_eq!(refs[0].title, "Foo Manga");
+        assert_eq!(refs[0].url, "https://target.example/manga/foo");
+        assert_eq!(refs[0].source_manga_key, "foo");
+        assert_eq!(refs[1].title, "Bar Baz");
+        assert_eq!(refs[1].source_manga_key, "bar-baz");
+    }
+
+    /// Full-fixture parse: every metadata field and all three chapters
+    /// come out with the expected values.
+    #[test]
+    fn parse_manga_detail_pulls_all_fields() {
+        let m = parse_manga_detail(DETAIL_HTML, "test-key", true).expect("parse");
+        assert_eq!(m.source_manga_key, "test-key");
+        assert_eq!(m.title, "Test Manga Title");
+        assert_eq!(m.summary.as_deref(), Some("A summary of the manga."));
+        assert_eq!(m.authors, vec!["Author One", "Author Two"]);
+        assert_eq!(m.genres, vec!["Action", "Drama"]);
+        assert_eq!(m.status.as_deref(), Some("ongoing"));
+        assert_eq!(m.alternative_titles, vec!["Alt Title 1", "Alt Title 2"]);
+        // Counts in parentheses are stripped — "Fantasy (21)" → "Fantasy".
+        assert_eq!(m.tags, vec!["Fantasy", "Romance", "Action"]);
+        assert_eq!(m.cover_url.as_deref(), Some("/cover.jpg"));
+        assert!(!m.metadata_hash.is_empty());
+
+        assert_eq!(m.chapters.len(), 3);
+        assert_eq!(m.chapters[0].number, 1);
+        assert_eq!(m.chapters[0].title.as_deref(), Some("Ch.1"));
+        assert_eq!(m.chapters[0].url, "/manga/foo/chapter/1");
+        assert_eq!(m.chapters[0].source_chapter_key, "1");
+        assert_eq!(m.chapters[1].number, 2);
+        assert_eq!(m.chapters[1].title.as_deref(), Some("Ch.2 - The Beginning"));
+        assert_eq!(m.chapters[2].number, 3);
+        assert_eq!(m.chapters[2].title.as_deref(), Some("Chapter 3: Onward"));
+    }
+
+    /// Known statuses are matched regardless of ASCII case and
+    /// surrounding whitespace.
+    #[test]
+    fn status_normalized_case_insensitively() {
+        assert_eq!(normalize_status(Some("Ongoing"), "k").as_deref(), Some("ongoing"));
+        assert_eq!(normalize_status(Some("ONGOING"), "k").as_deref(), Some("ongoing"));
+        assert_eq!(normalize_status(Some("  completed "), "k").as_deref(), Some("completed"));
+    }
+
+    /// Unknown, empty and missing statuses all normalize to `None`.
+    #[test]
+    fn unknown_status_logs_and_returns_none() {
+        // Logging is observable in test output via tracing-test, but
+        // here we just assert the contract: unknown becomes None
+        // (and the manga is therefore still synced by the caller).
+        assert!(normalize_status(Some("Hiatus"), "k").is_none());
+        assert!(normalize_status(Some(""), "k").is_none());
+        assert!(normalize_status(None, "k").is_none());
+    }
+
+    /// Only a trailing, digit-only parenthesised group is removed.
+    #[test]
+    fn strip_tag_count_drops_trailing_digit_parens_only() {
+        assert_eq!(strip_tag_count("Fantasy (21)"), "Fantasy");
+        assert_eq!(strip_tag_count("  Action  (5) "), "Action");
+        assert_eq!(strip_tag_count("Romance"), "Romance");
+        // Non-numeric parens stay put.
+        assert_eq!(strip_tag_count("Slice of Life (sub)"), "Slice of Life (sub)");
+        // Only the trailing paren is considered.
+        assert_eq!(strip_tag_count("Tag (a) (12)"), "Tag (a)");
+    }
+
+    /// The chapter number is the first run of digits in the title.
+    #[test]
+    fn parse_chapter_number_grabs_first_integer_run() {
+        assert_eq!(parse_chapter_number("Ch.1"), Some(1));
+        assert_eq!(parse_chapter_number("Chapter 12"), Some(12));
+        assert_eq!(parse_chapter_number("Ch.2 - The Beginning"), Some(2));
+        // Decimal chapters keep the integer part (i32 storage).
+        assert_eq!(parse_chapter_number("Ch.12.5"), Some(12));
+        assert_eq!(parse_chapter_number("Special"), None);
+    }
+
+    /// Non-numeric links (Prev/Next) are ignored and the largest
+    /// numeric link wins.
+    #[test]
+    fn parse_last_page_picks_highest_pagination_link() {
+        let html = r#"
+            <div id="left_side"><div class="pagination">
+              <a href="/list/1/">Prev</a>
+              <ol>
+                <li><a href="/list/1/">1</a></li>
+                <li><a href="/list/2/">2</a></li>
+                <li><a href="/list/47/">47</a></li>
+                <li><a href="/list/2/">Next</a></li>
+              </ol>
+            </div></div>
+        "#;
+        let doc = scraper::Html::parse_document(html);
+        assert_eq!(parse_last_page(&doc), Some(47));
+    }
+
+    /// A document without pagination links has no last page.
+    #[test]
+    fn parse_last_page_none_when_no_pagination() {
+        let doc = scraper::Html::parse_document("<html></html>");
+        assert!(parse_last_page(&doc).is_none());
+    }
+
+    /// The numeric path segment is replaced and the query string is
+    /// carried over untouched.
+    #[test]
+    fn page_url_substitutes_numeric_path_segment() {
+        assert_eq!(
+            page_url("https://site.example/list/1/?f=1&o=1&sortby=update_date&e=", 5),
+            "https://site.example/list/5/?f=1&o=1&sortby=update_date&e="
+        );
+        // No numeric segment → URL returned unchanged.
+        assert_eq!(
+            page_url("https://site.example/list/?f=1", 5),
+            "https://site.example/list/?f=1"
+        );
+    }
+
+    /// Keys come from the last path segment; trailing slashes and the
+    /// query string do not affect them.
+    #[test]
+    fn derive_key_strips_trailing_slash_and_query() {
+        assert_eq!(derive_key_from_url("https://x.example/manga/foo/"), "foo");
+        assert_eq!(derive_key_from_url("https://x.example/manga/foo?p=1"), "foo");
+        assert_eq!(derive_key_from_url("/manga/bar"), "bar");
+    }
+
+    /// Parsing the same page twice gives the same hash; changing one
+    /// hashed field gives a different one.
+    #[test]
+    fn metadata_hash_is_stable_and_field_sensitive() {
+        let base = parse_manga_detail(DETAIL_HTML, "k", true).unwrap();
+        let again = parse_manga_detail(DETAIL_HTML, "k", true).unwrap();
+        assert_eq!(base.metadata_hash, again.metadata_hash);
+
+        // Same fields except status flipped — hash must change.
+        let altered_html = DETAIL_HTML.replace("Ongoing", "Completed");
+        let altered = parse_manga_detail(&altered_html, "k", true).unwrap();
+        assert_ne!(base.metadata_hash, altered.metadata_hash);
+    }
+
+    /// A page with nothing but a title still parses; every optional
+    /// field is `None` or empty.
+    #[test]
+    fn missing_optional_fields_parse_to_none() {
+        let html = r#"<html><body><div class="w-title"><h1>Minimal</h1></div></body></html>"#;
+        let m = parse_manga_detail(html, "min", true).unwrap();
+        assert_eq!(m.title, "Minimal");
+        assert!(m.summary.is_none());
+        assert!(m.status.is_none());
+        assert!(m.authors.is_empty());
+        assert!(m.genres.is_empty());
+        assert!(m.tags.is_empty());
+        assert!(m.alternative_titles.is_empty());
+        assert!(m.chapters.is_empty());
+    }
+
+    /// Disabling chapter parsing empties `chapters` without affecting
+    /// the other fields.
+    #[test]
+    fn parse_manga_detail_skips_chapters_when_disabled() {
+        // Same fixture that yields 3 chapters above; with include_chapters=false
+        // the chapter table is ignored and the rest of the metadata still parses.
+        let m = parse_manga_detail(DETAIL_HTML, "k", false).unwrap();
+        assert!(m.chapters.is_empty(), "chapters should be empty when disabled");
+        assert_eq!(m.title, "Test Manga Title", "other fields still parse");
+        assert_eq!(m.authors, vec!["Author One", "Author Two"]);
+    }
+
+    /// A page without the title element is a hard error that names the
+    /// missing selector.
+    #[test]
+    fn parse_manga_detail_errors_on_missing_title() {
+        let html = "<html><body><p>nothing</p></body></html>";
+        let err = parse_manga_detail(html, "x", true).unwrap_err();
+        assert!(err.to_string().contains("missing .w-title h1"));
+    }
+}