feat(crawler): recircuit TOR on transient pages and unauthenticated probes
- target.rs swaps retry_on_transient → retry_on_transient_with_hook, signaling NEWNYM via ctx.tor between attempts when configured.
- session.rs gains verify_session_with_recircuit; the bare verify_session is now a one-line wrapper passing tor=None, unauth_max_recircuit=0. The inner run_session_probe_loop is pure-over-IO and unit-tested with closure-based fakes.
- content.rs extracts fetch_chapter_html_once + the closure-driven fetch_chapter_html_with_recircuit, used by sync_chapter_content to retry on Transient or Unauthenticated up to a recircuit_budget. Budget = 0 (no TOR) preserves original behavior bit-for-bit.
- app.rs and bin/crawler.rs construct the controller before on_launch and pass it into verify_session_with_recircuit, so a transient hiccup at startup no longer requires PHPSESSID rotation.

Recircuit budget defaults to CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS (3). Errors from NEWNYM are logged and swallowed — failing to recircuit should not take down the crawl.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -18,7 +18,7 @@ use super::{
|
||||
SourceMangaRef,
|
||||
};
|
||||
use crate::crawler::detect::{
|
||||
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
|
||||
has_logo_sentinel, is_broken_page_body, retry_on_transient_with_hook, PageError,
|
||||
};
|
||||
use crate::crawler::nav::{wait_for_nav, wait_for_selector, NavError, SELECTOR_TIMEOUT};
|
||||
|
||||
@@ -79,12 +79,13 @@ impl Source for TargetSource {
|
||||
// and the HTML is handed straight to the first `next_batch` call
|
||||
// so the walker doesn't re-fetch it. Page count is discovered
|
||||
// incrementally — see `TargetSourceWalker::next_batch`.
|
||||
let first_html = retry_on_transient(
|
||||
let first_html = retry_on_transient_with_hook(
|
||||
|| async {
|
||||
navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await
|
||||
},
|
||||
PAGE_TRANSIENT_RETRY_ATTEMPTS,
|
||||
PAGE_TRANSIENT_RETRY_DELAY,
|
||||
|| async { recircuit_if_configured(ctx.tor).await },
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -169,7 +170,7 @@ impl DiscoverWalk for TargetSourceWalker {
|
||||
parse_manga_list_from(&doc)?
|
||||
}
|
||||
None => {
|
||||
retry_on_transient(
|
||||
retry_on_transient_with_hook(
|
||||
|| async {
|
||||
let html = navigate(
|
||||
ctx,
|
||||
@@ -182,12 +183,13 @@ impl DiscoverWalk for TargetSourceWalker {
|
||||
},
|
||||
PAGE_TRANSIENT_RETRY_ATTEMPTS,
|
||||
PAGE_TRANSIENT_RETRY_DELAY,
|
||||
|| async { recircuit_if_configured(ctx.tor).await },
|
||||
)
|
||||
.await?
|
||||
}
|
||||
}
|
||||
} else {
|
||||
retry_on_transient(
|
||||
retry_on_transient_with_hook(
|
||||
|| async {
|
||||
let url = page_url(&self.base_url, page_num);
|
||||
let html = navigate(ctx, &url, LIST_PAGE_MARKER).await?;
|
||||
@@ -196,6 +198,7 @@ impl DiscoverWalk for TargetSourceWalker {
|
||||
},
|
||||
PAGE_TRANSIENT_RETRY_ATTEMPTS,
|
||||
PAGE_TRANSIENT_RETRY_DELAY,
|
||||
|| async { recircuit_if_configured(ctx.tor).await },
|
||||
)
|
||||
.await?
|
||||
};
|
||||
@@ -274,6 +277,20 @@ fn classify_navigate_html(html: String) -> Result<String, PageError> {
|
||||
Ok(html)
|
||||
}
|
||||
|
||||
/// Hook for [`retry_on_transient_with_hook`]: when TOR is configured,
|
||||
/// signal `NEWNYM` so the next navigation draws a fresh exit. Errors
|
||||
/// from the controller are logged and swallowed — failing to recircuit
|
||||
/// shouldn't take down the crawl, the next attempt just runs on the
|
||||
/// same circuit as before.
|
||||
async fn recircuit_if_configured(tor: Option<&crate::crawler::tor::TorController>) {
|
||||
if let Some(t) = tor {
|
||||
if let Err(e) = t.new_identity().await {
|
||||
tracing::warn!(error = %e, "TOR NEWNYM failed; retrying on same circuit");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Substitutes the first `/N/` path segment with the target page
|
||||
/// number. Source impls that paginate via a different URL shape can
|
||||
/// override this — for the modeled site the segment is always present.
|
||||
|
||||
Reference in New Issue
Block a user