PiCloud/crates/orchestrator-core/src/client.rs

use std::sync::Arc;
use std::time::Duration;

use async_trait::async_trait;
use picloud_executor_core::{Engine, ExecError, ExecRequest, ExecResponse};

use crate::gate::{AcquireError, ExecutionGate};

/// Upper bound on how long a single invocation may run, in wall-clock
/// time. A caller-supplied timeout (for example a script's
/// `timeout_seconds`) is clamped to this value, so a misconfigured
/// script cannot hold resources for longer than five minutes.
const HARD_TIMEOUT_CAP: Duration = Duration::from_secs(5 * 60);

/// The seam between the orchestrator and the executor.
///
/// Single-node mode plugs in `LocalExecutorClient`, which calls
/// `executor-core` in-process via `spawn_blocking`. Cluster mode plugs
/// in `RemoteExecutorClient`, which is intended to forward over HTTP to
/// an executor node (currently a stub that always returns an error).
/// Everything else in orchestrator-core depends only on this trait.
///
/// Implementors must be `Send + Sync` so a single client can be shared
/// across tasks.
#[async_trait]
pub trait ExecutorClient: Send + Sync {
    /// Runs the script text `source` for the request `req`, waiting at
    /// most `timeout` for a result.
    ///
    /// Implementations may shorten `timeout`; `LocalExecutorClient`
    /// clamps it to `HARD_TIMEOUT_CAP`.
    ///
    /// # Errors
    ///
    /// Returns an `ExecError` when the invocation cannot be completed.
    /// Which variants appear is up to the implementation: the local
    /// client reports `Overloaded`, `Timeout`, and `Runtime` in addition
    /// to whatever the engine itself returns.
    async fn execute(
        &self,
        source: &str,
        req: ExecRequest,
        timeout: Duration,
    ) -> Result<ExecResponse, ExecError>;
}

/// In-process executor — wraps `executor-core::Engine` directly.
///
/// `executor-core::Engine::execute` is synchronous; we offload it to a
/// blocking thread so it doesn't park a Tokio worker, and apply the
/// wall-clock timeout here.
///
/// Holds an `ExecutionGate` and acquires a permit before `spawn_blocking`
/// so a script storm can't drain the blocking-thread pool. The permit
/// drops with the future, returning the slot. Acquisition uses
/// `try_acquire` without awaiting; a refusal surfaces to the caller as
/// `ExecError::Overloaded`.
pub struct LocalExecutorClient {
    // Shared engine handle; cloned into each blocking task.
    engine: Arc<Engine>,
    // Admission control consulted before any blocking work is spawned.
    gate: Arc<ExecutionGate>,
}

impl LocalExecutorClient {
    #[must_use]
    pub fn new(engine: Arc<Engine>, gate: Arc<ExecutionGate>) -> Self {
        Self { engine, gate }
    }
}

#[async_trait]
impl ExecutorClient for LocalExecutorClient {
    async fn execute(
        &self,
        source: &str,
        req: ExecRequest,
        timeout: Duration,
    ) -> Result<ExecResponse, ExecError> {
        // Acquire before spending any wall-clock budget. The permit is
        // held by this future; on `tokio::time::timeout` firing, the
        // future drops and the permit returns to the pool — but the
        // detached `spawn_blocking` thread keeps running until the
        // Rhai script finishes (or panics). So in-use blocking threads
        // can briefly exceed the gate's permit count after a timeout.
        // That is intentional: a new admission can be served while the
        // already-doomed script winds down, which is preferable to
        // wedging the slot for the worst-case timeout duration.
        let _permit =
            self.gate
                .try_acquire()
                .map_err(
                    |AcquireError::Overloaded { retry_after_secs }| ExecError::Overloaded {
                        retry_after_secs,
                    },
                )?;

        let timeout = timeout.min(HARD_TIMEOUT_CAP);
        let timeout_secs = u32::try_from(timeout.as_secs()).unwrap_or(u32::MAX);

        let engine = self.engine.clone();
        let source = source.to_string();
        let join = tokio::task::spawn_blocking(move || engine.execute(&source, req));

        match tokio::time::timeout(timeout, join).await {
            Err(_) => Err(ExecError::Timeout(timeout_secs)),
            Ok(Err(join_err)) => Err(ExecError::Runtime(format!(
                "execution task panicked: {join_err}"
            ))),
            Ok(Ok(res)) => res,
        }
    }
}

/// Remote executor — forwards to a peer executor node over HTTP.
///
/// Skeleton only; fleshed out when cluster mode lands. Until then
/// `execute` always returns an error and the fields below are only
/// stored, never read.
pub struct RemoteExecutorClient {
    // HTTP client created in `new`; unused by the current stub.
    _client: reqwest::Client,
    // Base URL passed to `new`; unused by the current stub.
    _base_url: String,
}

impl RemoteExecutorClient {
    /// Creates a remote client for the executor node at `base_url`.
    ///
    /// A fresh `reqwest::Client` is constructed and the URL is stored as
    /// an owned `String`. No connection is attempted here.
    #[must_use]
    pub fn new(base_url: impl Into<String>) -> Self {
        let http = reqwest::Client::new();
        let url: String = base_url.into();
        Self {
            _client: http,
            _base_url: url,
        }
    }
}

#[async_trait]
impl ExecutorClient for RemoteExecutorClient {
    /// Placeholder implementation: ignores every argument and fails.
    ///
    /// # Errors
    ///
    /// Always returns `ExecError::Runtime` with a message saying that
    /// the remote client is not implemented yet.
    async fn execute(
        &self,
        _source: &str,
        _req: ExecRequest,
        _timeout: Duration,
    ) -> Result<ExecResponse, ExecError> {
        let reason = String::from("RemoteExecutorClient not implemented (cluster mode is v1.3+)");
        Err(ExecError::Runtime(reason))
    }
}