Compare commits
5 Commits
fix/ci-dep
...
fix/ci-bui
| Author | SHA1 | Date | |
|---|---|---|---|
| bd61a64c70 | |||
| 3b3d13a0f6 | |||
| 0f90af80cb | |||
| 6b49a47d0a | |||
| e851355f28 |
@@ -74,6 +74,14 @@ CRAWLER_DOWNLOAD_ALLOWLIST=
|
|||||||
CRAWLER_ALLOW_ANY_HOST=false
|
CRAWLER_ALLOW_ANY_HOST=false
|
||||||
# Hard cap on a single image body. Default 32 MiB.
|
# Hard cap on a single image body. Default 32 MiB.
|
||||||
CRAWLER_MAX_IMAGE_BYTES=33554432
|
CRAWLER_MAX_IMAGE_BYTES=33554432
|
||||||
|
# Path to a system Chromium binary. When set, the crawler skips the
|
||||||
|
# bundled-fetcher download. Required on platforms without a usable
|
||||||
|
# upstream Chromium build (notably Linux_arm64 / Raspberry Pi). On
|
||||||
|
# Debian: /usr/bin/chromium-headless-shell or /usr/bin/chromium. On
|
||||||
|
# Ubuntu the package is chromium-browser (different path). Pair with
|
||||||
|
# `docker compose build --build-arg INSTALL_CHROMIUM=true backend` so
|
||||||
|
# the image actually contains the binary.
|
||||||
|
CRAWLER_CHROMIUM_BINARY=
|
||||||
|
|
||||||
# ----- Frontend -----
|
# ----- Frontend -----
|
||||||
# The frontend container runs SvelteKit's Node adapter on :3000 and
|
# The frontend container runs SvelteKit's Node adapter on :3000 and
|
||||||
|
|||||||
@@ -10,8 +10,6 @@ on:
|
|||||||
jobs:
|
jobs:
|
||||||
test-backend:
|
test-backend:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
container:
|
|
||||||
image: rust:1-slim
|
|
||||||
services:
|
services:
|
||||||
postgres:
|
postgres:
|
||||||
image: postgres:16-alpine
|
image: postgres:16-alpine
|
||||||
@@ -28,10 +26,18 @@ jobs:
|
|||||||
DATABASE_URL: postgres://mangalord:mangalord@postgres:5432/mangalord
|
DATABASE_URL: postgres://mangalord:mangalord@postgres:5432/mangalord
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
- name: Install build deps
|
# ubuntu-latest has node (so JS actions like checkout/cache run) but no
|
||||||
|
# Rust. We intentionally avoid `container: rust:1-slim` because act_runner
|
||||||
|
# runs JS actions with node *inside* the job container, and the slim Rust
|
||||||
|
# image ships no node (checkout would fail with exit 127).
|
||||||
|
- name: Install Rust + build deps
|
||||||
run: |
|
run: |
|
||||||
apt-get update
|
set -eu
|
||||||
apt-get install -y --no-install-recommends pkg-config libssl-dev ca-certificates
|
SUDO=""; [ "$(id -u)" = "0" ] || SUDO="sudo"
|
||||||
|
$SUDO apt-get update
|
||||||
|
$SUDO apt-get install -y --no-install-recommends pkg-config libssl-dev ca-certificates curl
|
||||||
|
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal --default-toolchain stable
|
||||||
|
echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
|
||||||
- name: Cache cargo registry and target
|
- name: Cache cargo registry and target
|
||||||
uses: actions/cache@v4
|
uses: actions/cache@v4
|
||||||
with:
|
with:
|
||||||
@@ -66,9 +72,17 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
needs: [test-backend, test-frontend]
|
needs: [test-backend, test-frontend]
|
||||||
# PRs only run the test jobs; build + deploy are reserved for
|
# PRs only run the test jobs; build + deploy are reserved for
|
||||||
# post-merge pushes to main. Without this gate every PR would push
|
# post-merge pushes to main.
|
||||||
# a tagged image to the registry and SSH-deploy to prod.
|
|
||||||
if: github.event_name != 'pull_request'
|
if: github.event_name != 'pull_request'
|
||||||
|
# Build on the host docker daemon directly (docker-outside-of-docker):
|
||||||
|
# the runner shares the deploy host's daemon, so a plain `docker build`
|
||||||
|
# reuses the host's layer cache and avoids buildx's docker-container
|
||||||
|
# driver + the gha cache exporter — neither works against this single-host
|
||||||
|
# act_runner, and there is no in-job daemon socket unless we mount it.
|
||||||
|
container:
|
||||||
|
image: docker.gitea.com/runner-images:ubuntu-latest
|
||||||
|
volumes:
|
||||||
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
outputs:
|
outputs:
|
||||||
image_tag: ${{ steps.meta.outputs.image_tag }}
|
image_tag: ${{ steps.meta.outputs.image_tag }}
|
||||||
version: ${{ steps.meta.outputs.version }}
|
version: ${{ steps.meta.outputs.version }}
|
||||||
@@ -87,48 +101,32 @@ jobs:
|
|||||||
echo "image_tag=${GITHUB_SHA}" >> "$GITHUB_OUTPUT"
|
echo "image_tag=${GITHUB_SHA}" >> "$GITHUB_OUTPUT"
|
||||||
echo "version=${version}" >> "$GITHUB_OUTPUT"
|
echo "version=${version}" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
- uses: docker/setup-buildx-action@v3
|
- name: Build & push backend + frontend
|
||||||
|
env:
|
||||||
- name: docker login
|
REGISTRY_URL: ${{ secrets.REGISTRY_URL }}
|
||||||
uses: docker/login-action@v3
|
REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
|
||||||
with:
|
REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
|
||||||
registry: ${{ secrets.REGISTRY_URL }}
|
IMAGE_TAG: ${{ steps.meta.outputs.image_tag }}
|
||||||
username: ${{ secrets.REGISTRY_USERNAME }}
|
VERSION: ${{ steps.meta.outputs.version }}
|
||||||
password: ${{ secrets.REGISTRY_PASSWORD }}
|
run: |
|
||||||
|
set -eu
|
||||||
- name: Build & push backend
|
echo "$REGISTRY_PASSWORD" | docker login "$REGISTRY_URL" -u "$REGISTRY_USERNAME" --password-stdin
|
||||||
uses: docker/build-push-action@v5
|
for svc in backend frontend; do
|
||||||
with:
|
img="$REGISTRY_URL/mangalord-$svc"
|
||||||
context: ./backend
|
docker build -t "$img:$IMAGE_TAG" -t "$img:latest" -t "$img:$VERSION" "./$svc"
|
||||||
push: true
|
for tag in "$IMAGE_TAG" latest "$VERSION"; do docker push "$img:$tag"; done
|
||||||
tags: |
|
done
|
||||||
${{ secrets.REGISTRY_URL }}/mangalord-backend:latest
|
docker logout "$REGISTRY_URL"
|
||||||
${{ secrets.REGISTRY_URL }}/mangalord-backend:${{ steps.meta.outputs.image_tag }}
|
|
||||||
${{ secrets.REGISTRY_URL }}/mangalord-backend:${{ steps.meta.outputs.version }}
|
|
||||||
cache-from: type=gha,scope=backend
|
|
||||||
cache-to: type=gha,mode=max,scope=backend
|
|
||||||
|
|
||||||
- name: Build & push frontend
|
|
||||||
uses: docker/build-push-action@v5
|
|
||||||
with:
|
|
||||||
context: ./frontend
|
|
||||||
push: true
|
|
||||||
tags: |
|
|
||||||
${{ secrets.REGISTRY_URL }}/mangalord-frontend:latest
|
|
||||||
${{ secrets.REGISTRY_URL }}/mangalord-frontend:${{ steps.meta.outputs.image_tag }}
|
|
||||||
${{ secrets.REGISTRY_URL }}/mangalord-frontend:${{ steps.meta.outputs.version }}
|
|
||||||
cache-from: type=gha,scope=frontend
|
|
||||||
cache-to: type=gha,mode=max,scope=frontend
|
|
||||||
|
|
||||||
deploy:
|
deploy:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
needs: build-and-push
|
needs: build-and-push
|
||||||
if: github.event_name != 'pull_request'
|
if: github.event_name != 'pull_request'
|
||||||
# Single-host deploy: the runner lives on the same box as the stack, so we
|
# Single-host deploy: the runner lives on the same box as the stack, so we
|
||||||
# drive the host docker daemon directly (act_runner shares its socket via
|
# drive the host docker daemon directly (the job mounts the host docker
|
||||||
# `docker_host: "-"`) instead of SSHing out. The compose dir is bind-mounted
|
# socket) instead of SSHing out. The compose dir is bind-mounted at its
|
||||||
# at its REAL host path so compose's relative bind-mounts (./mangalord/...,
|
# REAL host path so compose's relative bind-mounts (./mangalord/...,
|
||||||
# ./Caddyfile) resolve; this requires `/mnt/ssd/docker-data` in the runner's
|
# ./Caddyfile) resolve; both paths must be in the runner's
|
||||||
# container.valid_volumes. The central compose references the images as
|
# container.valid_volumes. The central compose references the images as
|
||||||
# registry.mc02.dev/mangalord-*:${MANGALORD_TAG:-latest}, so we only pull
|
# registry.mc02.dev/mangalord-*:${MANGALORD_TAG:-latest}, so we only pull
|
||||||
# and recreate the two mangalord services at the freshly built SHA.
|
# and recreate the two mangalord services at the freshly built SHA.
|
||||||
@@ -136,6 +134,7 @@ jobs:
|
|||||||
image: docker:cli
|
image: docker:cli
|
||||||
volumes:
|
volumes:
|
||||||
- /mnt/ssd/docker-data:/mnt/ssd/docker-data
|
- /mnt/ssd/docker-data:/mnt/ssd/docker-data
|
||||||
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
steps:
|
steps:
|
||||||
- name: Deploy to the local stack
|
- name: Deploy to the local stack
|
||||||
working-directory: /mnt/ssd/docker-data
|
working-directory: /mnt/ssd/docker-data
|
||||||
|
|||||||
2
backend/Cargo.lock
generated
2
backend/Cargo.lock
generated
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
version = "0.44.0"
|
version = "0.45.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"argon2",
|
"argon2",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
version = "0.44.0"
|
version = "0.45.1"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
default-run = "mangalord"
|
default-run = "mangalord"
|
||||||
|
|
||||||
|
|||||||
@@ -25,8 +25,23 @@ FROM debian:trixie-slim
|
|||||||
# binary ("GLIBC_2.39 not found"). Keep these two in lockstep on bumps.
|
# binary ("GLIBC_2.39 not found"). Keep these two in lockstep on bumps.
|
||||||
# `curl` is for the container HEALTHCHECK; `ca-certificates` is for
|
# `curl` is for the container HEALTHCHECK; `ca-certificates` is for
|
||||||
# outbound HTTPS (crawler covers/pages).
|
# outbound HTTPS (crawler covers/pages).
|
||||||
|
#
|
||||||
|
# INSTALL_CHROMIUM is an opt-in for deployments that can't use the
|
||||||
|
# chromiumoxide fetcher path (notably Linux_arm64 / Raspberry Pi, where
|
||||||
|
# the upstream snapshot bucket has no usable build). When `true`, adds
|
||||||
|
# Debian's apt-packaged headless chromium plus a baseline font set —
|
||||||
|
# pair with `CRAWLER_CHROMIUM_BINARY=/usr/bin/chromium-headless-shell`
|
||||||
|
# at runtime so the launcher uses it. Default `false` keeps cloud/x86
|
||||||
|
# images slim.
|
||||||
|
#
|
||||||
|
# Build the Pi image with:
|
||||||
|
# docker compose build --build-arg INSTALL_CHROMIUM=true backend
|
||||||
|
ARG INSTALL_CHROMIUM=false
|
||||||
RUN apt-get update \
|
RUN apt-get update \
|
||||||
&& apt-get install -y --no-install-recommends ca-certificates curl \
|
&& apt-get install -y --no-install-recommends ca-certificates curl \
|
||||||
|
&& if [ "$INSTALL_CHROMIUM" = "true" ]; then \
|
||||||
|
apt-get install -y --no-install-recommends chromium-headless-shell fonts-liberation; \
|
||||||
|
fi \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Non-root runtime user. The API binary doesn't need any root
|
# Non-root runtime user. The API binary doesn't need any root
|
||||||
|
|||||||
@@ -1,10 +1,17 @@
|
|||||||
//! Chromium launcher and lifecycle.
|
//! Chromium launcher and lifecycle.
|
||||||
//!
|
//!
|
||||||
//! Uses `chromiumoxide`'s `fetcher` feature so we don't depend on a
|
//! By default uses `chromiumoxide`'s `fetcher` feature — first call
|
||||||
//! system Chrome install — first call downloads a known-good revision
|
//! downloads a known-good revision into a cache dir and reuses it
|
||||||
//! into a cache dir and reuses it forever after. `BrowserMode` toggles
|
//! forever after. Set `CRAWLER_CHROMIUM_BINARY` to skip the fetcher
|
||||||
//! headed vs headless; the headed path needs a display (real `$DISPLAY`
|
//! and use a system-installed Chromium instead; required on platforms
|
||||||
//! or `xvfb-run`).
|
//! where the upstream snapshot bucket has no usable build (notably
|
||||||
|
//! `Linux_arm64` / Raspberry Pi). Debian's package is at
|
||||||
|
//! `/usr/bin/chromium` or `/usr/bin/chromium-headless-shell`; Ubuntu
|
||||||
|
//! ships it as `chromium-browser` at a different path — don't paste
|
||||||
|
//! the wrong one.
|
||||||
|
//!
|
||||||
|
//! `BrowserMode` toggles headed vs headless; the headed path needs a
|
||||||
|
//! display (real `$DISPLAY` or `xvfb-run`).
|
||||||
//!
|
//!
|
||||||
//! Extra Chromium command-line flags can be supplied through
|
//! Extra Chromium command-line flags can be supplied through
|
||||||
//! [`LaunchOptions::extra_args`] in code, or via the
|
//! [`LaunchOptions::extra_args`] in code, or via the
|
||||||
@@ -165,11 +172,18 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Launches Chromium. Downloads it on first run via the `fetcher`
|
/// Launches Chromium. If `CRAWLER_CHROMIUM_BINARY` is set, uses that
|
||||||
/// feature; subsequent runs hit the cache. The cache dir is
|
/// path directly. Otherwise downloads via the `fetcher` feature on
|
||||||
|
/// first run and hits the cache after that. The fetcher cache dir is
|
||||||
/// `$CRAWLER_CHROMIUM_DIR` if set, else `$HOME/.cache/mangalord/chromium`,
|
/// `$CRAWLER_CHROMIUM_DIR` if set, else `$HOME/.cache/mangalord/chromium`,
|
||||||
/// else `./.chromium-cache` as a last-resort repo-local fallback.
|
/// else `./.chromium-cache` as a last-resort repo-local fallback.
|
||||||
pub async fn launch(options: LaunchOptions) -> anyhow::Result<Handle> {
|
pub async fn launch(options: LaunchOptions) -> anyhow::Result<Handle> {
|
||||||
|
let executable = match system_chromium_path_from_env() {
|
||||||
|
Some(path) => {
|
||||||
|
tracing::info!(path = %path.display(), "using system chromium (CRAWLER_CHROMIUM_BINARY)");
|
||||||
|
path
|
||||||
|
}
|
||||||
|
None => {
|
||||||
let cache = cache_dir()?;
|
let cache = cache_dir()?;
|
||||||
tokio::fs::create_dir_all(&cache)
|
tokio::fs::create_dir_all(&cache)
|
||||||
.await
|
.await
|
||||||
@@ -187,9 +201,12 @@ pub async fn launch(options: LaunchOptions) -> anyhow::Result<Handle> {
|
|||||||
.await
|
.await
|
||||||
.context("download chromium via fetcher")?;
|
.context("download chromium via fetcher")?;
|
||||||
tracing::info!(executable = %info.executable_path.display(), "chromium ready");
|
tracing::info!(executable = %info.executable_path.display(), "chromium ready");
|
||||||
|
info.executable_path
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
let mut builder = BrowserConfig::builder()
|
let mut builder = BrowserConfig::builder()
|
||||||
.chrome_executable(info.executable_path)
|
.chrome_executable(executable)
|
||||||
// Linux containers / CI commonly lack the user namespaces
|
// Linux containers / CI commonly lack the user namespaces
|
||||||
// Chromium's sandbox wants. Disable it; the crawler runs in its
|
// Chromium's sandbox wants. Disable it; the crawler runs in its
|
||||||
// own container anyway.
|
// own container anyway.
|
||||||
@@ -246,6 +263,24 @@ fn cache_dir() -> anyhow::Result<PathBuf> {
|
|||||||
Ok(PathBuf::from("./.chromium-cache"))
|
Ok(PathBuf::from("./.chromium-cache"))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reads `CRAWLER_CHROMIUM_BINARY` and delegates to the pure helper.
|
||||||
|
/// Thin wrapper kept separate so the decision logic can be unit-tested
|
||||||
|
/// without mutating the process environment.
|
||||||
|
fn system_chromium_path_from_env() -> Option<PathBuf> {
|
||||||
|
system_chromium_path_from_value(std::env::var_os("CRAWLER_CHROMIUM_BINARY").as_deref())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `Some(path)` only when the value is set and non-empty. An
|
||||||
|
/// exported-but-blank var (common in compose `${VAR:-}` patterns when
|
||||||
|
/// the operator didn't fill it in) must behave like "unset" — otherwise
|
||||||
|
/// we'd hand chromiumoxide an empty path and fail launch in a confusing
|
||||||
|
/// way.
|
||||||
|
pub(crate) fn system_chromium_path_from_value(
|
||||||
|
raw: Option<&std::ffi::OsStr>,
|
||||||
|
) -> Option<PathBuf> {
|
||||||
|
raw.filter(|v| !v.is_empty()).map(PathBuf::from)
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
@@ -273,6 +308,33 @@ mod tests {
|
|||||||
assert!(parse_args(" \t\n").is_empty());
|
assert!(parse_args(" \t\n").is_empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn system_chromium_path_returns_some_when_value_set() {
|
||||||
|
let raw = std::ffi::OsString::from("/usr/bin/chromium-headless-shell");
|
||||||
|
assert_eq!(
|
||||||
|
system_chromium_path_from_value(Some(raw.as_os_str())),
|
||||||
|
Some(PathBuf::from("/usr/bin/chromium-headless-shell"))
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn system_chromium_path_returns_none_when_unset() {
|
||||||
|
assert_eq!(system_chromium_path_from_value(None), None);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn system_chromium_path_treats_empty_as_unset() {
|
||||||
|
// Compose's `${VAR:-}` substitution produces an exported-but-empty
|
||||||
|
// env var when the operator left it blank. Treat it as unset so
|
||||||
|
// the launcher falls back to the fetcher path instead of handing
|
||||||
|
// chromiumoxide an empty path.
|
||||||
|
let raw = std::ffi::OsString::from("");
|
||||||
|
assert_eq!(
|
||||||
|
system_chromium_path_from_value(Some(raw.as_os_str())),
|
||||||
|
None
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn default_launch_options_are_headless() {
|
fn default_launch_options_are_headless() {
|
||||||
// Headless is the production-safe default — no display required,
|
// Headless is the production-safe default — no display required,
|
||||||
|
|||||||
@@ -7,7 +7,6 @@
|
|||||||
//! (`td:has(label:contains("Author:"))`) are implemented by walking
|
//! (`td:has(label:contains("Author:"))`) are implemented by walking
|
||||||
//! the parsed tree.
|
//! the parsed tree.
|
||||||
|
|
||||||
use std::collections::VecDeque;
|
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
@@ -75,10 +74,11 @@ impl Source for TargetSource {
|
|||||||
&self,
|
&self,
|
||||||
ctx: &FetchContext<'_>,
|
ctx: &FetchContext<'_>,
|
||||||
) -> anyhow::Result<Box<dyn DiscoverWalk + Send>> {
|
) -> anyhow::Result<Box<dyn DiscoverWalk + Send>> {
|
||||||
// Always visit page 1 first because that's the only way to
|
// Probe page 1 up front (with transient retry) for two reasons:
|
||||||
// discover `last_page`. Retry it on transient — a broken first
|
// a broken first page should abort cleanly rather than mid-walk,
|
||||||
// page would otherwise abort the whole walk before we've even
|
// and the HTML is handed straight to the first `next_batch` call
|
||||||
// started.
|
// so the walker doesn't re-fetch it. Page count is discovered
|
||||||
|
// incrementally — see `TargetSourceWalker::next_batch`.
|
||||||
let first_html = retry_on_transient(
|
let first_html = retry_on_transient(
|
||||||
|| async {
|
|| async {
|
||||||
navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await
|
navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await
|
||||||
@@ -87,21 +87,10 @@ impl Source for TargetSource {
|
|||||||
PAGE_TRANSIENT_RETRY_DELAY,
|
PAGE_TRANSIENT_RETRY_DELAY,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
let last_page = {
|
|
||||||
let doc = scraper::Html::parse_document(&first_html);
|
|
||||||
parse_last_page(&doc)
|
|
||||||
};
|
|
||||||
|
|
||||||
let order = build_page_order(last_page);
|
|
||||||
tracing::info!(
|
|
||||||
last_page = ?last_page,
|
|
||||||
page_count = order.len(),
|
|
||||||
"walking pagination"
|
|
||||||
);
|
|
||||||
|
|
||||||
Ok(Box::new(TargetSourceWalker {
|
Ok(Box::new(TargetSourceWalker {
|
||||||
base_url: self.base_url.clone(),
|
base_url: self.base_url.clone(),
|
||||||
pages_remaining: order,
|
next_page: 1,
|
||||||
first_page_html: Some(first_html),
|
first_page_html: Some(first_html),
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
@@ -147,24 +136,19 @@ impl Source for TargetSource {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build the queue of page numbers `TargetSource::discover` will walk.
|
/// Walker returned by [`TargetSource::discover`]. Walks pages `1..` in
|
||||||
/// The site orders by `update_date DESC`, so newest-first is just the
|
/// order, terminating as soon as a page renders cleanly with zero entries
|
||||||
/// natural page order: `1..=last`. If `last_page` is unknown (source
|
/// — that's the "we ran off the end of the index" signal. Page 1's HTML
|
||||||
/// surfaces no pagination) only page 1 is visited.
|
/// is cached at construction time (discover already had to fetch it for
|
||||||
fn build_page_order(last_page: Option<i32>) -> VecDeque<i32> {
|
/// the transient probe) so the first batch doesn't re-fetch.
|
||||||
match last_page {
|
///
|
||||||
None => VecDeque::from([1]),
|
/// A genuinely empty `Ok(vec![])` from `parse_manga_list_from` is what
|
||||||
Some(last) => (1..=last).collect(),
|
/// stops us: the parser's `#logo` sentinel converts unrendered pages
|
||||||
}
|
/// into transient errors before they reach this loop, so an empty
|
||||||
}
|
/// parse result reliably means "no more entries."
|
||||||
|
|
||||||
/// Walker returned by [`TargetSource::discover`]. Pops one source-index
|
|
||||||
/// page per `next_batch` call. Page 1's HTML is cached at construction
|
|
||||||
/// time (the discover call needed it to read `last_page` anyway) so the
|
|
||||||
/// batch covering page 1 doesn't re-fetch.
|
|
||||||
struct TargetSourceWalker {
|
struct TargetSourceWalker {
|
||||||
base_url: String,
|
base_url: String,
|
||||||
pages_remaining: VecDeque<i32>,
|
next_page: i32,
|
||||||
first_page_html: Option<String>,
|
first_page_html: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -174,13 +158,11 @@ impl DiscoverWalk for TargetSourceWalker {
|
|||||||
&mut self,
|
&mut self,
|
||||||
ctx: &FetchContext<'_>,
|
ctx: &FetchContext<'_>,
|
||||||
) -> anyhow::Result<Option<Vec<SourceMangaRef>>> {
|
) -> anyhow::Result<Option<Vec<SourceMangaRef>>> {
|
||||||
let Some(page_num) = self.pages_remaining.pop_front() else {
|
let page_num = self.next_page;
|
||||||
return Ok(None);
|
|
||||||
};
|
|
||||||
let page_refs = if page_num == 1 {
|
let page_refs = if page_num == 1 {
|
||||||
// Reuse the cached page-1 HTML from the initial probe. Take
|
// Reuse the cached page-1 HTML from the initial probe. Take
|
||||||
// it (rather than clone) so a malformed page-order queue
|
// it (rather than clone) so a future re-entry that somehow
|
||||||
// that re-visits page 1 still falls back to a real fetch.
|
// revisits page 1 still falls back to a real fetch.
|
||||||
match self.first_page_html.take() {
|
match self.first_page_html.take() {
|
||||||
Some(html) => {
|
Some(html) => {
|
||||||
let doc = scraper::Html::parse_document(&html);
|
let doc = scraper::Html::parse_document(&html);
|
||||||
@@ -218,6 +200,10 @@ impl DiscoverWalk for TargetSourceWalker {
|
|||||||
.await?
|
.await?
|
||||||
};
|
};
|
||||||
tracing::info!(page_num, count = page_refs.len(), "page walked");
|
tracing::info!(page_num, count = page_refs.len(), "page walked");
|
||||||
|
if page_refs.is_empty() {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
self.next_page += 1;
|
||||||
Ok(Some(page_refs))
|
Ok(Some(page_refs))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -288,20 +274,6 @@ fn classify_navigate_html(html: String) -> Result<String, PageError> {
|
|||||||
Ok(html)
|
Ok(html)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_last_page(doc: &scraper::Html) -> Option<i32> {
|
|
||||||
// Pagination links carry their page number as text. Take the
|
|
||||||
// numeric maximum so we don't depend on a specific layout (Prev,
|
|
||||||
// Next, ellipses, etc. all get filtered out by .parse).
|
|
||||||
let sel = scraper::Selector::parse("#left_side .pagination a").unwrap();
|
|
||||||
doc.select(&sel)
|
|
||||||
.filter_map(|a| {
|
|
||||||
collapse_whitespace(&a.text().collect::<String>())
|
|
||||||
.parse::<i32>()
|
|
||||||
.ok()
|
|
||||||
})
|
|
||||||
.max()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Substitutes the first `/N/` path segment with the target page
|
/// Substitutes the first `/N/` path segment with the target page
|
||||||
/// number. Source impls that paginate via a different URL shape can
|
/// number. Source impls that paginate via a different URL shape can
|
||||||
/// override this — for the modeled site the segment is always present.
|
/// override this — for the modeled site the segment is always present.
|
||||||
@@ -853,29 +825,6 @@ mod tests {
|
|||||||
assert_eq!(parse_chapter_number("Special"), None);
|
assert_eq!(parse_chapter_number("Special"), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn parse_last_page_picks_highest_pagination_link() {
|
|
||||||
let html = r#"
|
|
||||||
<div id="left_side"><div class="pagination">
|
|
||||||
<a href="/list/1/">Prev</a>
|
|
||||||
<ol>
|
|
||||||
<li><a href="/list/1/">1</a></li>
|
|
||||||
<li><a href="/list/2/">2</a></li>
|
|
||||||
<li><a href="/list/47/">47</a></li>
|
|
||||||
<li><a href="/list/2/">Next</a></li>
|
|
||||||
</ol>
|
|
||||||
</div></div>
|
|
||||||
"#;
|
|
||||||
let doc = scraper::Html::parse_document(html);
|
|
||||||
assert_eq!(parse_last_page(&doc), Some(47));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn parse_last_page_none_when_no_pagination() {
|
|
||||||
let doc = scraper::Html::parse_document("<html></html>");
|
|
||||||
assert!(parse_last_page(&doc).is_none());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn page_url_substitutes_numeric_path_segment() {
|
fn page_url_substitutes_numeric_path_segment() {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
@@ -1024,28 +973,6 @@ mod tests {
|
|||||||
assert!(err.is_transient(), "got non-transient: {err}");
|
assert!(err.is_transient(), "got non-transient: {err}");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn build_page_order_is_natural_one_to_last() {
|
|
||||||
// Newest-first is just the source's natural pagination order:
|
|
||||||
// (update_date DESC) lives at page 1, oldest at the last page.
|
|
||||||
let order = build_page_order(Some(3));
|
|
||||||
assert_eq!(Vec::from(order), vec![1, 2, 3]);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn build_page_order_falls_back_to_page_one_only_without_pagination() {
|
|
||||||
// Source surfaced no pagination control — visit page 1 alone
|
|
||||||
// and let the walk end after one batch.
|
|
||||||
let order = build_page_order(None);
|
|
||||||
assert_eq!(Vec::from(order), vec![1]);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn build_page_order_single_page_index_yields_one_entry() {
|
|
||||||
let order = build_page_order(Some(1));
|
|
||||||
assert_eq!(Vec::from(order), vec![1]);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn parse_chapter_list_returns_transient_when_table_missing() {
|
fn parse_chapter_list_returns_transient_when_table_missing() {
|
||||||
// Partial render (post-load JS hadn't injected the table, layout
|
// Partial render (post-load JS hadn't injected the table, layout
|
||||||
|
|||||||
@@ -10,6 +10,11 @@
|
|||||||
//!
|
//!
|
||||||
//! Override the cache location with `CRAWLER_CHROMIUM_DIR=/some/path` if
|
//! Override the cache location with `CRAWLER_CHROMIUM_DIR=/some/path` if
|
||||||
//! `$HOME/.cache/mangalord/chromium` isn't writable.
|
//! `$HOME/.cache/mangalord/chromium` isn't writable.
|
||||||
|
//!
|
||||||
|
//! Set `CRAWLER_CHROMIUM_BINARY=/usr/bin/chromium-headless-shell` (or
|
||||||
|
//! another system chromium path) to exercise the system-chromium
|
||||||
|
//! launch path instead of the fetcher download — this is the path the
|
||||||
|
//! Raspberry Pi deployment takes.
|
||||||
|
|
||||||
use mangalord::crawler::browser::{self, LaunchOptions};
|
use mangalord::crawler::browser::{self, LaunchOptions};
|
||||||
|
|
||||||
|
|||||||
@@ -39,6 +39,11 @@ services:
|
|||||||
# Upload limits.
|
# Upload limits.
|
||||||
MAX_REQUEST_BYTES: ${MAX_REQUEST_BYTES:-209715200}
|
MAX_REQUEST_BYTES: ${MAX_REQUEST_BYTES:-209715200}
|
||||||
MAX_FILE_BYTES: ${MAX_FILE_BYTES:-20971520}
|
MAX_FILE_BYTES: ${MAX_FILE_BYTES:-20971520}
|
||||||
|
# System-chromium override for the crawler. Leave blank to use the
|
||||||
|
# bundled fetcher; set to e.g. /usr/bin/chromium-headless-shell on
|
||||||
|
# arm64 deployments. Pair with `--build-arg INSTALL_CHROMIUM=true`
|
||||||
|
# so the image actually contains the binary.
|
||||||
|
CRAWLER_CHROMIUM_BINARY: ${CRAWLER_CHROMIUM_BINARY:-}
|
||||||
volumes:
|
volumes:
|
||||||
- storage-data:/var/lib/mangalord/storage
|
- storage-data:/var/lib/mangalord/storage
|
||||||
# No host port mapping in the default setup — the frontend proxies
|
# No host port mapping in the default setup — the frontend proxies
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "mangalord-frontend",
|
"name": "mangalord-frontend",
|
||||||
"version": "0.44.0",
|
"version": "0.45.1",
|
||||||
"private": true,
|
"private": true,
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
|
|||||||
Reference in New Issue
Block a user