feat: CRAWLER_KEEP_BROWSER_OPEN waits for Ctrl+C in headed mode (0.26.0)

Debug aid: when set in headed mode, the crawler blocks on Ctrl+C at
every shutdown point (early auth bails + normal completion) instead
of closing the browser immediately. Operator can inspect DOM, cookies,
and network state in the visible Chromium window before exit.

Ignored in headless (no window to inspect) — logged as a warning if
set under headless so the operator doesn't sit waiting.

chromiumoxide's `Browser` is `kill_on_drop`, so the close-or-wait
helper must await Ctrl+C *before* the Handle is dropped — otherwise
the Chromium child gets killed out from under the operator.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-24 21:33:18 +02:00
parent d24e68c78d
commit 728d704a66
4 changed files with 43 additions and 6 deletions

View File

@@ -48,6 +48,11 @@
//! Chromium (`--proxy-server`) and `reqwest::Proxy::all`. Supports
//! `http://`, `https://`, and `socks5://` (with optional user:pass).
//! Example: `socks5://user:pass@host:1080`. Unset → direct.
//! - **Keep browser open**: `CRAWLER_KEEP_BROWSER_OPEN=1` — when
//! running headed, block on Ctrl+C at every shutdown point so the
//! operator can inspect DOM state, cookies, or network calls in the
//! visible Chromium window before exit. Ignored in headless mode
//! (no window to inspect).
use std::path::PathBuf;
use std::sync::Arc;
@@ -110,6 +115,7 @@ async fn main() -> anyhow::Result<()> {
let proxy_url = std::env::var("CRAWLER_PROXY")
.ok()
.filter(|s| !s.trim().is_empty());
let keep_browser_open = env_bool("CRAWLER_KEEP_BROWSER_OPEN", false);
let db = PgPoolOptions::new()
.max_connections(5)
@@ -149,6 +155,19 @@ async fn main() -> anyhow::Result<()> {
if let Some(proxy) = &proxy_url {
options.extra_args.push(format!("--proxy-server={proxy}"));
}
// Keep-open is a debug aid; only meaningful when there's a window
// to inspect. Under headless the flag is ignored, and we warn loudly
// so the operator knows why the process exits without pausing.
let keep_open = match (keep_browser_open, options.mode) {
(true, browser::BrowserMode::Headed) => true,
(true, browser::BrowserMode::Headless) => {
tracing::warn!(
"CRAWLER_KEEP_BROWSER_OPEN ignored in headless mode (no window to inspect)"
);
false
}
_ => false,
};
tracing::info!(
?options,
%start_url,
@@ -164,6 +183,7 @@ async fn main() -> anyhow::Result<()> {
cookie_domain = ?cookie_domain,
user_agent = ?user_agent,
proxy = ?proxy_url,
keep_open,
storage_dir = %storage_dir.display(),
"starting crawler"
);
@@ -177,13 +197,13 @@ async fn main() -> anyhow::Result<()> {
// instead of 30 min into a backfill.
let session_ready = if let (Some(sid), Some(domain)) = (&phpsessid, &cookie_domain) {
if let Err(e) = session::inject_phpsessid(handle.browser(), sid, domain).await {
handle.close().await.ok();
close_or_wait(handle, keep_open).await;
return Err(e);
}
match session::verify_session(handle.browser(), &start_url).await {
Ok(()) => true,
Err(e) => {
handle.close().await.ok();
close_or_wait(handle, keep_open).await;
return Err(e);
}
}
@@ -208,10 +228,27 @@ async fn main() -> anyhow::Result<()> {
force_refetch_chapters,
)
.await;
handle.close().await.ok();
close_or_wait(handle, keep_open).await;
result
}
/// Close the browser, optionally waiting for Ctrl+C first.
///
/// When `keep_open` is `true`, logs a prompt and blocks until Ctrl+C
/// is received before closing. When `false`, closes immediately.
/// `keep_open=true` is only ever passed when the browser is headed, so
/// the operator has a real window to poke at.
///
/// `handle` is taken by value and dropped at the end of this fn in
/// both cases — chromiumoxide's `Browser` is `kill_on_drop`, so we must
/// wait for the Ctrl+C *before* the drop or the Chromium child gets
/// killed out from under the operator.
///
/// Shutdown is best-effort: a failure to close is discarded. If the
/// Ctrl+C listener cannot be installed, a warning is logged and the
/// browser is closed right away instead of claiming a Ctrl+C arrived.
async fn close_or_wait(handle: browser::Handle, keep_open: bool) {
    if keep_open {
        tracing::info!(
            "crawler finished; browser kept open. Press Ctrl+C to close and exit."
        );
        // `ctrl_c()` resolves with `Err` when the signal listener could
        // not be registered; in that case no Ctrl+C was actually
        // received, so report that rather than a misleading success.
        match tokio::signal::ctrl_c().await {
            Ok(()) => tracing::info!("Ctrl+C received; closing browser"),
            Err(e) => tracing::warn!(
                error = %e,
                "could not listen for Ctrl+C; closing browser immediately"
            ),
        }
    }
    // Deliberately ignore the close result: callers are already on
    // their way out and have their own error (if any) to return.
    let _ = handle.close().await;
}
#[allow(clippy::too_many_arguments)]
async fn run(
browser: &chromiumoxide::Browser,