|
|
|
|
@@ -7,6 +7,7 @@
|
|
|
|
|
//! (`td:has(label:contains("Author:"))`) are implemented by walking
|
|
|
|
|
//! the parsed tree.
|
|
|
|
|
|
|
|
|
|
use std::collections::VecDeque;
|
|
|
|
|
use std::time::Duration;
|
|
|
|
|
|
|
|
|
|
use anyhow::Context;
|
|
|
|
|
@@ -74,11 +75,10 @@ impl Source for TargetSource {
|
|
|
|
|
&self,
|
|
|
|
|
ctx: &FetchContext<'_>,
|
|
|
|
|
) -> anyhow::Result<Box<dyn DiscoverWalk + Send>> {
|
|
|
|
|
// Probe page 1 up front (with transient retry) for two reasons:
|
|
|
|
|
// a broken first page should abort cleanly rather than mid-walk,
|
|
|
|
|
// and the HTML is handed straight to the first `next_batch` call
|
|
|
|
|
// so the walker doesn't re-fetch it. Page count is discovered
|
|
|
|
|
// incrementally — see `TargetSourceWalker::next_batch`.
|
|
|
|
|
// Always visit page 1 first because that's the only way to
|
|
|
|
|
// discover `last_page`. Retry it on transient — a broken first
|
|
|
|
|
// page would otherwise abort the whole walk before we've even
|
|
|
|
|
// started.
|
|
|
|
|
let first_html = retry_on_transient(
|
|
|
|
|
|| async {
|
|
|
|
|
navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await
|
|
|
|
|
@@ -87,10 +87,21 @@ impl Source for TargetSource {
|
|
|
|
|
PAGE_TRANSIENT_RETRY_DELAY,
|
|
|
|
|
)
|
|
|
|
|
.await?;
|
|
|
|
|
let last_page = {
|
|
|
|
|
let doc = scraper::Html::parse_document(&first_html);
|
|
|
|
|
parse_last_page(&doc)
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let order = build_page_order(last_page);
|
|
|
|
|
tracing::info!(
|
|
|
|
|
last_page = ?last_page,
|
|
|
|
|
page_count = order.len(),
|
|
|
|
|
"walking pagination"
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
Ok(Box::new(TargetSourceWalker {
|
|
|
|
|
base_url: self.base_url.clone(),
|
|
|
|
|
next_page: 1,
|
|
|
|
|
pages_remaining: order,
|
|
|
|
|
first_page_html: Some(first_html),
|
|
|
|
|
}))
|
|
|
|
|
}
|
|
|
|
|
@@ -136,19 +147,24 @@ impl Source for TargetSource {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Walker returned by [`TargetSource::discover`]. Walks pages `1..` in
|
|
|
|
|
/// order, terminating as soon as a page renders cleanly with zero entries
|
|
|
|
|
/// — that's the "we ran off the end of the index" signal. Page 1's HTML
|
|
|
|
|
/// is cached at construction time (discover already had to fetch it for
|
|
|
|
|
/// the transient probe) so the first batch doesn't re-fetch.
|
|
|
|
|
///
|
|
|
|
|
/// A genuinely empty `Ok(vec![])` from `parse_manga_list_from` is what
|
|
|
|
|
/// stops us: the parser's `#logo` sentinel converts unrendered pages
|
|
|
|
|
/// into transient errors before they reach this loop, so an empty
|
|
|
|
|
/// parse result reliably means "no more entries."
|
|
|
|
|
/// Build the queue of page numbers `TargetSource::discover` will walk.
///
/// The site orders by `update_date DESC`, so newest-first is just the
/// natural page order: `1..=last`. Page 1 is always part of the queue:
/// if `last_page` is unknown (source surfaces no pagination), or the
/// reported value is below 1 (e.g. a stray `0` or negative number picked
/// up from a pagination link), only page 1 is visited.
fn build_page_order(last_page: Option<i32>) -> VecDeque<i32> {
    // Clamp to 1 so a bogus `Some(0)` / negative value cannot yield an
    // empty queue, which would end the walk before page 1 is consumed.
    let last = last_page.map_or(1, |last| last.max(1));
    (1..=last).collect()
}
|
|
|
|
|
|
|
|
|
|
/// Walker returned by [`TargetSource::discover`]. Pops one source-index
|
|
|
|
|
/// page per `next_batch` call. Page 1's HTML is cached at construction
|
|
|
|
|
/// time (the discover call needed it to read `last_page` anyway) so the
|
|
|
|
|
/// batch covering page 1 doesn't re-fetch.
|
|
|
|
|
struct TargetSourceWalker {
|
|
|
|
|
/// Copy of the source's `base_url`, taken when the walker is built.
base_url: String,
|
|
|
|
|
/// Page counter: starts at 1 and is bumped after each non-empty batch.
next_page: i32,
|
|
|
|
|
/// Page numbers still to visit, as produced by `build_page_order`;
/// `next_batch` pops from the front and the walk ends once it is empty.
pages_remaining: VecDeque<i32>,
|
|
|
|
|
/// HTML of page 1 from the initial probe; `take`n on first use so the
/// cached copy is consumed at most once.
first_page_html: Option<String>,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -158,11 +174,13 @@ impl DiscoverWalk for TargetSourceWalker {
|
|
|
|
|
&mut self,
|
|
|
|
|
ctx: &FetchContext<'_>,
|
|
|
|
|
) -> anyhow::Result<Option<Vec<SourceMangaRef>>> {
|
|
|
|
|
let page_num = self.next_page;
|
|
|
|
|
let Some(page_num) = self.pages_remaining.pop_front() else {
|
|
|
|
|
return Ok(None);
|
|
|
|
|
};
|
|
|
|
|
let page_refs = if page_num == 1 {
|
|
|
|
|
// Reuse the cached page-1 HTML from the initial probe. Take
|
|
|
|
|
// it (rather than clone) so a future re-entry that somehow
|
|
|
|
|
// revisits page 1 still falls back to a real fetch.
|
|
|
|
|
// it (rather than clone) so a malformed page-order queue
|
|
|
|
|
// that re-visits page 1 still falls back to a real fetch.
|
|
|
|
|
match self.first_page_html.take() {
|
|
|
|
|
Some(html) => {
|
|
|
|
|
let doc = scraper::Html::parse_document(&html);
|
|
|
|
|
@@ -200,10 +218,6 @@ impl DiscoverWalk for TargetSourceWalker {
|
|
|
|
|
.await?
|
|
|
|
|
};
|
|
|
|
|
tracing::info!(page_num, count = page_refs.len(), "page walked");
|
|
|
|
|
if page_refs.is_empty() {
|
|
|
|
|
return Ok(None);
|
|
|
|
|
}
|
|
|
|
|
self.next_page += 1;
|
|
|
|
|
Ok(Some(page_refs))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
@@ -274,6 +288,20 @@ fn classify_navigate_html(html: String) -> Result<String, PageError> {
|
|
|
|
|
Ok(html)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn parse_last_page(doc: &scraper::Html) -> Option<i32> {
|
|
|
|
|
// Pagination links carry their page number as text. Take the
|
|
|
|
|
// numeric maximum so we don't depend on a specific layout (Prev,
|
|
|
|
|
// Next, ellipses, etc. all get filtered out by .parse).
|
|
|
|
|
let sel = scraper::Selector::parse("#left_side .pagination a").unwrap();
|
|
|
|
|
doc.select(&sel)
|
|
|
|
|
.filter_map(|a| {
|
|
|
|
|
collapse_whitespace(&a.text().collect::<String>())
|
|
|
|
|
.parse::<i32>()
|
|
|
|
|
.ok()
|
|
|
|
|
})
|
|
|
|
|
.max()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Substitutes the first `/N/` path segment with the target page
|
|
|
|
|
/// number. Source impls that paginate via a different URL shape can
|
|
|
|
|
/// override this — for the modeled site the segment is always present.
|
|
|
|
|
@@ -825,6 +853,29 @@ mod tests {
|
|
|
|
|
assert_eq!(parse_chapter_number("Special"), None);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn parse_last_page_picks_highest_pagination_link() {
    // Pagination mixing non-numeric labels (Prev / Next) with page
    // numbers; the largest number present is 47.
    let markup = r#"
<div id="left_side"><div class="pagination">
<a href="/list/1/">Prev</a>
<ol>
<li><a href="/list/1/">1</a></li>
<li><a href="/list/2/">2</a></li>
<li><a href="/list/47/">47</a></li>
<li><a href="/list/2/">Next</a></li>
</ol>
</div></div>
"#;
    let highest = parse_last_page(&scraper::Html::parse_document(markup));
    assert_eq!(highest, Some(47));
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn parse_last_page_none_when_no_pagination() {
    // A document without any pagination links has no last page to report.
    let empty_doc = scraper::Html::parse_document("<html></html>");
    assert_eq!(parse_last_page(&empty_doc), None);
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn page_url_substitutes_numeric_path_segment() {
|
|
|
|
|
assert_eq!(
|
|
|
|
|
@@ -973,6 +1024,28 @@ mod tests {
|
|
|
|
|
assert!(err.is_transient(), "got non-transient: {err}");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn build_page_order_is_natural_one_to_last() {
    // The source sorts by update_date DESC: the newest entries sit on
    // page 1 and the oldest on the last page, so newest-first is simply
    // ascending page order.
    let pages: Vec<i32> = build_page_order(Some(3)).into_iter().collect();
    assert_eq!(pages, vec![1, 2, 3]);
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn build_page_order_falls_back_to_page_one_only_without_pagination() {
    // With no pagination control on the source, the queue holds page 1
    // alone, so the walk finishes after a single batch.
    let pages: Vec<i32> = build_page_order(None).into_iter().collect();
    assert_eq!(pages, vec![1]);
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn build_page_order_single_page_index_yields_one_entry() {
    // A last page of 1 means the index is exactly one page long.
    let pages: Vec<i32> = build_page_order(Some(1)).into_iter().collect();
    assert_eq!(pages, vec![1]);
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn parse_chapter_list_returns_transient_when_table_missing() {
|
|
|
|
|
// Partial render (post-load JS hadn't injected the table, layout
|
|
|
|
|
|