#![warn(clippy::pedantic)]
// FILE: python_script_runner.rs
// PURPOSE: Execute Python scripts from Rust with path traversal prevention,
// environment isolation, exponential backoff retry, timeout enforcement,
// and output size limits.
// GOAL: Expose run_python_script and PythonExecutor as the public API, backed by
// project-root path management, a configurable retry loop, and security
// hardening against path traversal, env-var leakage, and resource exhaustion.
// RUNS TO: PythonExecutor::execute_script() / run_python_script()
// FILES: (None - standalone module)
// NOTE: Script execution benchmarks not included — depend on external Python interpreter
// ═══════════════════════════════════════════════════════════════════════════════
// KEY FEATURES & DESIGN DECISIONS
// ═══════════════════════════════════════════════════════════════════════════════
// 1. Path Traversal Prevention (CRITICAL)
// - Rejects paths containing ".." or starting with "/"
// - Canonicalises the resolved path and verifies it stays inside project_dir
// - WHY: An attacker controlling relative_path could escape to read any file
// on the system (e.g. "../../.env"); canonicalise+prefix-check closes
// both "../" traversal and symlink attacks simultaneously.
// 2. Environment Isolation (CRITICAL)
// - Clears the inherited environment with env_clear(), then allowlists only
// PATH, TZ, and PROJECT_DIRECTORY
// - WHY: Without isolation the child Python process inherits ENCRYPTION_KEY,
// DATABASE_URL, AWS_ACCESS_KEY_ID, and every other secret loaded into
// the parent process — a single compromised script leaks them all.
// PYTHONPATH is intentionally NOT forwarded: allowing it would let
// scripts import attacker-controlled modules from arbitrary paths.
// 3. Static Python Binary Detection
// - Resolved once on first use and cached in a OnceLock; respects PYTHON_EXECUTABLE override
// - WHY: Probing python3 --version on every execute_script call wastes a process
// spawn per call and introduces a race (interpreter removed between probe
// and use); OnceLock pays the cost exactly once.
// 4. Exponential Backoff with Jitter (RULE retry-backoff)
// - Uses the `backoff` crate; formula: min(1s, 10ms * 2^attempt) ± 25% jitter
// - WHY: Fixed delays cause thundering herd when many callers retry
// simultaneously; jitter spreads load; exponential growth avoids hammering
// a service that needs time to recover.
// 5. Per-call Timeout
// - tokio::time::timeout wraps every attempt; 5 s per attempt
// - WHY: Without a timeout an infinite loop in a Python script leaves the
// awaiting task pending forever, so the caller never gets a result or an error.
// 6. Output Size Cap
// - Streams stdout/stderr via AsyncRead; kills the child if output exceeds
// MAX_OUTPUT_BYTES (64 KB)
// - WHY: A misbehaving script writing unbounded output allocates memory until
// the process is OOM-killed, taking down the whole service.
// 7. Early File Validation
// - Validates path before entering the retry loop
// - WHY: A missing script will never succeed on retry; failing fast avoids
// multiple expensive process spawn attempts and gives a clearer error.
// ═══════════════════════════════════════════════════════════════════════════════
// SECTION MAP:
// 1. CONSTANTS — Tuneable limits and defaults
// 2. ENVIRONMENT SETUP — ScriptPaths, PROJECT_DIRECTORY resolution
// 3. PYTHON BINARY — OnceLock-based interpreter detection
// 4. EXECUTOR — PythonExecutor public struct
// 5. SCRIPT RUNNER — run_python_script with retry, timeout, output cap
// TESTS — per-function test modules (directly below each fn)
// ═══════════════════════════════════════════════════════════════════════════════
// ── IMPORTS ──────────────────────────────────────────────────────────────────
// Three groups: stdlib · third-party · internal
// ── stdlib ───────────────────────────────────────────────────────────────────
use std::env;
use std::io;
use std::path::{Path, PathBuf};
use std::sync::OnceLock;
use std::time::Duration;
// ── third-party ──────────────────────────────────────────────────────────────
use backoff::future::retry;
use backoff::{Error as BackoffError, ExponentialBackoffBuilder};
use dotenvy::dotenv;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::process::Command;
use tokio::time::timeout;
// ── internal ─────────────────────────────────────────────────────────────────
// No internal imports in this module
// ── SECTION 1 · CONSTANTS ────────────────────────────────────────────────────
// Goal: Define all tuneable limits in one place so they can be adjusted without
// hunting through business logic.
//
// Values are intentionally small so tests run fast both locally and in CI
// with no env-var overrides or per-environment branching needed.
// WHY: One set of constants for everywhere keeps the codebase simple;
// the test scripts are written to trip the limits cheaply so raising
// these for production just means updating numbers here.
/// Byte budget for the output captured from a single script run; a run that
/// produces more than this is failed instead of being buffered further.
///
/// WHY: 64 KB is ample for the scripts this project runs, and the output-size
///      test can overshoot it cheaply (9 × 8192 = 73728 bytes).
const MAX_OUTPUT_BYTES: usize = 1 << 16; // 65_536 bytes = 64 KB
/// Wall-clock limit applied to each individual attempt, in seconds.
///
/// WHY: 5 s covers every script run today while keeping CI quick; a hung
///      script is given up on long before the job-level timeout.
const SCRIPT_TIMEOUT_SECS: u64 = 5;
/// Total number of attempts (the first run included) after which a failure is
/// reported as permanent.
///
/// WHY: With a value of 1 nothing is ever retried, so the exhaustion test
///      finishes in milliseconds without accumulating backoff delays.
const MAX_RETRY_ATTEMPTS: u32 = 1;
/// First delay, in milliseconds, handed to the exponential backoff policy.
///
/// WHY: 10 ms keeps any retry delay negligible.
const INITIAL_BACKOFF_MS: u64 = 10;
/// Ceiling, in seconds, on any single backoff delay (RULE retry-backoff).
///
/// WHY: A 1 s cap is in proportion to the small initial interval.
const MAX_BACKOFF_SECS: u64 = 1;
// ── SECTION 2 · ENVIRONMENT SETUP ────────────────────────────────────────────
// Goal: Resolve PROJECT_DIRECTORY from .env and build safe, canonicalised
// absolute paths from project-relative inputs.
/// Resolves script paths relative to the project root and enforces path safety.
///
/// WHY: Centralises PROJECT_DIRECTORY resolution and all traversal-prevention
/// logic so that every path constructed in this module shares one source of
/// truth; no caller can accidentally bypass the safety checks.
struct ScriptPaths {
project_dir: PathBuf,
}
impl ScriptPaths {
/// Load PROJECT_DIRECTORY from .env and initialise the path resolver.
///
/// WHY: dotenv().ok() is called here so this module is self-contained —
/// callers need not load .env themselves. Panics on missing
/// PROJECT_DIRECTORY because an uninitialised project root makes every
/// downstream path invalid; failing loudly at startup is safer than
/// silently producing wrong paths later.
fn new() -> Self {
dotenv().ok();
let project_dir =
env::var("PROJECT_DIRECTORY").expect("PROJECT_DIRECTORY must be set in .env file");
// WHY: Canonicalise so that symlinked PROJECT_DIRECTORY values (deploy
// symlinks, container volume mounts, NFS paths) resolve correctly.
// Without this, a file canonicalised through the real path fails
// starts_with() against the symlinked project_dir, rejecting every script.
let canonical_dir = PathBuf::from(&project_dir)
.canonicalize()
.unwrap_or_else(|_| PathBuf::from(&project_dir));
ScriptPaths {
project_dir: canonical_dir,
}
}
/// Resolve a project-relative path to a safe, canonicalised absolute path.
///
/// Returns `Err` when:
/// - `relative_path` contains ".." (traversal attempt)
/// - `relative_path` starts with '/' (absolute path bypass)
/// - The resolved path does not exist on disk
/// - The canonicalised path escapes `project_dir` (symlink attack)
///
/// WHY: Three-layer defence — string-level rejection catches obvious "../.."
/// before any filesystem call; canonicalise resolves symlinks; prefix
/// check ensures the final target is inside the project tree even if a
/// symlink pointed elsewhere.
fn get_safe_script_path(&self, relative_path: &str) -> io::Result<PathBuf> {
// WHY: Reject traversal patterns before touching the filesystem;
// string-level check is fast and catches the common attack vector.
if relative_path.contains("..") {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
format!("Path traversal not allowed: {}", relative_path),
));
}
if relative_path.starts_with('/') {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
format!("Absolute paths not allowed: {}", relative_path),
));
}
let full_path = self.project_dir.join(relative_path);
// WHY: canonicalize() resolves symlinks and normalises ".." components
// injected by the OS layer; a path that doesn't exist fails here
// with NotFound rather than silently returning a wrong path.
let canonical = full_path.canonicalize().map_err(|_| {
io::Error::new(
io::ErrorKind::NotFound,
format!("Script not found: {}", relative_path),
)
})?;
// WHY: Even after string-level checks, a symlink inside the project tree
// could point outside it; the prefix check is the final guarantee
// that the resolved file lives under project_dir.
if !canonical.starts_with(&self.project_dir) {
return Err(io::Error::new(
io::ErrorKind::PermissionDenied,
format!(
"Script must be within project directory. Got: {}",
canonical.display()
),
));
}
Ok(canonical)
}
}
// ── TESTS ──────────────────────────────────────────────────────────────────
#[cfg(test)]
mod script_paths_tests {
    use super::*;
    use serial_test::serial;
    use tempfile::{tempdir, NamedTempFile};

    /// Build a resolver rooted at `dir`: canonicalised when the directory can be
    /// resolved, used verbatim otherwise.
    fn make_paths_for(dir: &Path) -> ScriptPaths {
        let project_dir = match dir.canonicalize() {
            Ok(resolved) => resolved,
            Err(_) => dir.to_path_buf(),
        };
        ScriptPaths { project_dir }
    }

    /// Walks every outcome of get_safe_script_path: traversal, absolute input,
    /// missing file, successful resolve, and symlink escape.
    ///
    /// WHY: This function is the module's security boundary, so each rejection
    ///      path and the happy path are pinned by an explicit assertion.
    #[test]
    #[serial]
    fn test_get_safe_script_path() {
        let sandbox = tempdir().unwrap();
        let resolver = make_paths_for(sandbox.path());

        // Leading ".." — rejected as invalid input with a "traversal" message.
        let leading = resolver.get_safe_script_path("../etc/passwd").unwrap_err();
        assert_eq!(leading.kind(), io::ErrorKind::InvalidInput);
        assert!(leading.to_string().contains("traversal"));

        // ".." buried in the middle of the path is rejected the same way.
        let embedded = resolver
            .get_safe_script_path("scripts/../../etc/passwd")
            .unwrap_err();
        assert_eq!(embedded.kind(), io::ErrorKind::InvalidInput);

        // An absolute path is refused even though it contains no "..".
        let absolute = resolver.get_safe_script_path("/etc/passwd").unwrap_err();
        assert_eq!(absolute.kind(), io::ErrorKind::InvalidInput);
        assert!(absolute.to_string().contains("Absolute"));

        // A well-formed path to a file that does not exist maps to NotFound.
        let missing = resolver
            .get_safe_script_path("no_such_file.py")
            .unwrap_err();
        assert_eq!(missing.kind(), io::ErrorKind::NotFound);

        // Happy path: a real file inside the root comes back canonicalised.
        let script = sandbox.path().join("ok.py");
        std::fs::write(&script, "print('ok')").unwrap();
        let resolved = resolver.get_safe_script_path("ok.py").unwrap();
        assert_eq!(resolved, script.canonicalize().unwrap());

        // A symlink that lives inside the root but targets a file outside it
        // must be caught by the prefix check.
        #[cfg(unix)]
        {
            let outside = NamedTempFile::new().unwrap();
            let link = sandbox.path().join("escape_link.py");
            std::os::unix::fs::symlink(outside.path(), &link).unwrap();
            let escaped = resolver
                .get_safe_script_path("escape_link.py")
                .unwrap_err();
            assert_eq!(escaped.kind(), io::ErrorKind::PermissionDenied);
        }
    }
}
// ── SECTION 3 · PYTHON BINARY ────────────────────────────────────────────────
// Goal: Detect the Python interpreter once, on first use, and cache it in a
// module-level static to avoid per-call probe overhead.
/// Process-wide cache of the Python interpreter command (a path or a command
/// name), filled in by the first call to `get_python_bin()`.
///
/// WHY: A OnceLock pays the detection cost once; every later call reads the
///      cached value. Because the value is fixed after first use, changing
///      PYTHON_EXECUTABLE afterwards has no effect on this cache.
static PYTHON_BIN: OnceLock<String> = OnceLock::new();
/// Return the Python interpreter binary to use.
///
/// WHY: Separated from PYTHON_BIN initialisation so tests can call the
/// detection logic without touching global state.
fn resolve_python_bin() -> String {
// WHY: PYTHON_EXECUTABLE env var is checked first so venv and conda users
// can override detection without changing code; a missing or invalid
// path fails loudly at initialisation, not mid-execution.
if let Ok(explicit) = env::var("PYTHON_EXECUTABLE") {
return explicit;
}
// WHY: `which` crate searches PATH correctly on all platforms; avoids
// spawning a child process just to check binary existence.
if which::which("python3").is_ok() {
return "python3".to_string();
}
if which::which("python").is_ok() {
return "python".to_string();
}
panic!("No Python interpreter found on PATH. Set PYTHON_EXECUTABLE env var to override.");
}
/// Interpreter command shared by every spawn: detected by `resolve_python_bin`
/// on the first call, served from the `PYTHON_BIN` cache on every later one.
fn get_python_bin() -> &'static str {
    let cached: &'static String = PYTHON_BIN.get_or_init(resolve_python_bin);
    cached.as_str()
}
// ── TESTS ──────────────────────────────────────────────────────────────────
#[cfg(test)]
mod python_bin_tests {
    use super::*;
    use serial_test::serial;
    use std::ffi::OsString;

    /// Puts PYTHON_EXECUTABLE back to its pre-test state when dropped.
    ///
    /// WHY: CI may pin PYTHON_EXECUTABLE for the whole test run. Without this
    ///      guard the test would delete that pin for every later test, and a
    ///      failing assertion would leave "/custom/python" behind.
    struct RestorePythonExecutable(Option<OsString>);

    impl RestorePythonExecutable {
        /// Remember the current value (or its absence).
        fn capture() -> Self {
            Self(env::var_os("PYTHON_EXECUTABLE"))
        }
    }

    impl Drop for RestorePythonExecutable {
        fn drop(&mut self) {
            // SAFETY: test-only env mutation, guarded by serial_test
            match self.0.take() {
                Some(original) => unsafe { env::set_var("PYTHON_EXECUTABLE", original) },
                None => unsafe { env::remove_var("PYTHON_EXECUTABLE") },
            }
        }
    }

    /// Verifies that resolve_python_bin honours PYTHON_EXECUTABLE and falls
    /// back to PATH detection when the override is absent.
    ///
    /// WHY: The env-var override is the primary integration point for CI
    ///      pipelines and venvs — it must be tested to guarantee it works.
    #[test]
    #[serial]
    fn test_resolve_python_bin() {
        // Restores the caller's environment on every exit path, panics included.
        let _restore = RestorePythonExecutable::capture();

        // ── PYTHON_EXECUTABLE override is respected ───────────────────────────
        // WHY: CI environments may use /usr/bin/python3.11 explicitly; the
        //      override must take precedence over PATH detection.
        {
            // SAFETY: test-only env mutation, guarded by serial_test
            unsafe { env::set_var("PYTHON_EXECUTABLE", "/custom/python") };
            let bin = resolve_python_bin();
            assert_eq!(bin, "/custom/python");
        }
        // ── falls back to PATH detection when override absent ─────────────────
        // WHY: Normal runtime path; result must be a non-empty string.
        {
            // SAFETY: test-only env mutation, guarded by serial_test
            unsafe { env::remove_var("PYTHON_EXECUTABLE") };
            // resolve_python_bin panics when no interpreter exists; guard with a
            // which check so the sub-section is skipped instead of failing CI.
            if which::which("python3").is_ok() || which::which("python").is_ok() {
                let bin = resolve_python_bin();
                assert!(!bin.is_empty());
            }
        }
    }
}
// ── SECTION 4 · EXECUTOR ─────────────────────────────────────────────────────
// Goal: Provide the public struct callers use to execute scripts via
// project-relative paths.
/// [TIER 1] Public interface for executing Python scripts with automatic path
/// resolution, path-traversal prevention, and environment isolation.
///
/// WHY: Wrapping ScriptPaths inside PythonExecutor keeps path resolution an
/// implementation detail; callers interact only with execute_script().
pub struct PythonExecutor {
    /// Resolver that turns project-relative inputs into vetted absolute paths.
    script_paths: ScriptPaths,
}
impl PythonExecutor {
    /// Create a new executor, loading PROJECT_DIRECTORY from .env.
    ///
    /// WHY: Explicit constructor keeps initialisation visible at the call site.
    ///
    /// # Panics
    /// Panics when `PROJECT_DIRECTORY` is not set, because `ScriptPaths::new`
    /// refuses to start without a project root.
    #[must_use]
    pub fn new() -> Self {
        Self {
            script_paths: ScriptPaths::new(),
        }
    }

    /// [TIER 1] Execute a Python script given a project-relative path.
    ///
    /// WHY: Delegates path resolution + safety to ScriptPaths and execution to
    ///      run_python_script so each concern lives in exactly one place.
    ///
    /// # Errors
    /// - Whatever `ScriptPaths::get_safe_script_path` rejects the input with
    ///   (unsafe path, missing script, path escaping the project directory).
    /// - `InvalidInput` when the resolved path is not valid UTF-8 and therefore
    ///   cannot be passed on as `&str`.
    /// - Any error returned by `run_python_script` for the resolved path.
    pub async fn execute_script(&self, relative_path: &str) -> io::Result<()> {
        let canonical = self.script_paths.get_safe_script_path(relative_path)?;
        let Some(path_str) = canonical.to_str() else {
            return Err(io::Error::new(
                io::ErrorKind::InvalidInput,
                format!("Non-UTF-8 script path: {relative_path}"),
            ));
        };
        run_python_script(path_str).await
    }
}
impl Default for PythonExecutor {
    /// Same as [`PythonExecutor::new`], including its panic condition.
    fn default() -> Self {
        Self::new()
    }
}
// ── TESTS ──────────────────────────────────────────────────────────────────
#[cfg(test)]
mod execute_script_tests {
    use super::*;
    use serial_test::serial;
    use tempfile::tempdir;

    /// Executor whose project root is `dir` (canonicalised when resolvable),
    /// built directly so no .env lookup is involved.
    fn make_executor_for(dir: &std::path::Path) -> PythonExecutor {
        let project_dir = dir.canonicalize().unwrap_or_else(|_| dir.to_path_buf());
        PythonExecutor {
            script_paths: ScriptPaths { project_dir },
        }
    }

    /// Drives the public entry point through its rejection paths and one
    /// successful run.
    ///
    /// WHY: execute_script is the only public route for project-relative
    ///      paths, so the safety checks must be shown to be wired through it
    ///      and not merely present inside ScriptPaths.
    #[tokio::test]
    #[serial]
    async fn test_execute_script() {
        let sandbox = tempdir().unwrap();
        let executor = make_executor_for(sandbox.path());

        // Traversal input is refused before anything is executed.
        let traversal = executor.execute_script("../../etc/passwd").await;
        assert_eq!(traversal.unwrap_err().kind(), io::ErrorKind::InvalidInput);

        // A script that does not exist is reported as NotFound.
        let missing = executor.execute_script("no_such_script.py").await;
        assert_eq!(missing.unwrap_err().kind(), io::ErrorKind::NotFound);

        // End-to-end smoke test; needs an interpreter, so bail out quietly
        // when none is on PATH.
        let python_on_path = which::which("python3").is_ok() || which::which("python").is_ok();
        if !python_on_path {
            return;
        }
        let script = sandbox.path().join("hello.py");
        std::fs::write(&script, "print('hello from executor')").unwrap();
        let result = executor.execute_script("hello.py").await;
        assert!(result.is_ok(), "expected Ok, got: {:?}", result);
    }
}
// ── SECTION 5 · SCRIPT RUNNER ────────────────────────────────────────────────
// Goal: Execute a Python script at an absolute, pre-validated path with
// environment isolation, exponential-backoff retry, per-attempt timeout,
// and output size enforcement.
/// [TIER 1] Execute a Python script at the given pre-validated absolute path.
///
/// Streams stdout and stderr to the process's stdout/stderr in real time.
/// Retries up to MAX_RETRY_ATTEMPTS times with exponential backoff and jitter.
/// Each attempt is bounded by SCRIPT_TIMEOUT_SECS.
/// Kills the child and returns an error if output exceeds MAX_OUTPUT_BYTES.
///
/// WHY: Separated from PythonExecutor so callers with an already-validated
/// absolute path do not pay for path resolution overhead; single
/// responsibility keeps each function independently testable.
pub async fn run_python_script(script_path: &str) -> io::Result<()> {
// WHY: Pre-validate before entering the retry loop — a missing file will
// never succeed on retry; fail fast to avoid backoff waits and give a
// clear NotFound error instead of a process launch failure.
if !Path::new(script_path).exists() {
return Err(io::Error::new(
io::ErrorKind::NotFound,
format!("Script not found at: {}", script_path),
));
}
// WHY: Capture script_path as an owned String so the closure can be 'static;
// the backoff crate requires the future factory to own its captured data.
let path = script_path.to_owned();
// WHY: ExponentialBackoffBuilder gives us jitter + cap in one call;
// satisfies RULE retry-backoff without hand-rolling the formula.
let backoff_policy = ExponentialBackoffBuilder::new()
.with_initial_interval(Duration::from_millis(INITIAL_BACKOFF_MS))
.with_max_interval(Duration::from_secs(MAX_BACKOFF_SECS))
.with_max_elapsed_time(None) // WHY: we control attempts via max_tries, not wall time
.build();
let mut attempt = 0u32;
retry(backoff_policy, || {
let path = path.clone();
attempt += 1;
let current = attempt;
async move {
let result = timeout(
Duration::from_secs(SCRIPT_TIMEOUT_SECS),
run_single_attempt(&path),
)
.await;
match result {
// WHY: timeout() wraps the inner Result in an outer Result;
// flatten the two layers into one error value.
Err(_) => {
let msg = format!(
"Script timed out after {} s (attempt {}/{}): {}",
SCRIPT_TIMEOUT_SECS, current, MAX_RETRY_ATTEMPTS, path
);
eprintln!("WARN [python_runner] {}", msg);
if current >= MAX_RETRY_ATTEMPTS {
Err(BackoffError::permanent(io::Error::new(
io::ErrorKind::TimedOut,
msg,
)))
} else {
Err(BackoffError::transient(io::Error::new(
io::ErrorKind::TimedOut,
msg,
)))
}
}
Ok(Ok(())) => Ok(()),
Ok(Err(e)) => {
let msg = format!(
"Script failed (attempt {}/{}): {} — {}",
current, MAX_RETRY_ATTEMPTS, path, e
);
if current >= MAX_RETRY_ATTEMPTS {
eprintln!("ERROR [python_runner] {}", msg);
Err(BackoffError::permanent(e))
} else {
eprintln!("WARN [python_runner] {}", msg);
Err(BackoffError::transient(e))
}
}
}
}
})
.await
}
/// Execute the script exactly once, mirroring its stdout and stderr to this
/// process while enforcing one shared output budget.
///
/// Both pipes are drained concurrently inside this future and every chunk is
/// charged against a single counter, so stdout and stderr together may not
/// exceed MAX_OUTPUT_BYTES. When the budget is exceeded the child is killed
/// immediately rather than left running.
///
/// WHY: Single-attempt logic lives here so run_python_script can focus on
///      retry orchestration. The pumps are plain futures rather than spawned
///      tasks, so dropping this future (for example on timeout) also stops
///      the forwarding instead of leaving detached readers behind.
///
/// # Errors
/// - Spawn failures and pipe-capture failures.
/// - `Other` with "Script output exceeded … KB limit" when the budget is hit.
/// - The underlying I/O error when reading stdout or mirroring it fails.
/// - `Other` with "Script exited with status …" on a non-zero exit.
async fn run_single_attempt(script_path: &str) -> io::Result<()> {
    let mut child = build_isolated_command(script_path).spawn()?;
    // WHY: take() moves the pipe handles out of Child so they can be read
    //      below while `child` stays available for kill()/wait().
    let child_stdout = child
        .stdout
        .take()
        .ok_or_else(|| io::Error::other("Failed to capture stdout"))?;
    let child_stderr = child
        .stderr
        .take()
        .ok_or_else(|| io::Error::other("Failed to capture stderr"))?;

    // One counter for both streams: the cap is on total output.
    let seen = std::sync::atomic::AtomicUsize::new(0);

    let stdout_pump = pump_capped(child_stdout, tokio::io::stdout(), &seen);
    let stderr_pump = async {
        match pump_capped(child_stderr, tokio::io::stderr(), &seen).await {
            // Best-effort: an I/O problem on the stderr side must not mask the
            // script's exit status. Only the size limit is fatal here.
            Ok(()) | Err(PumpStop::Io(_)) => Ok(()),
            Err(PumpStop::LimitExceeded) => Err(PumpStop::LimitExceeded),
        }
    };

    // WHY: Both pipes must be drained at the same time. Reading only one lets
    //      the other fill its OS buffer, which blocks the script on write()
    //      and stalls the attempt. try_join! polls both and returns as soon as
    //      either reports a fatal stop.
    if let Err(stop) = tokio::try_join!(stdout_pump, stderr_pump) {
        // The script is still running at this point — stop it now.
        child.kill().await.ok();
        return Err(match stop {
            PumpStop::LimitExceeded => io::Error::other(format!(
                "Script output exceeded {} KB limit: {}",
                MAX_OUTPUT_BYTES / 1024,
                script_path
            )),
            PumpStop::Io(e) => e,
        });
    }

    let status = child.wait().await?;
    if status.success() {
        return Ok(());
    }
    Err(io::Error::other(format!(
        "Script exited with status {}: {}",
        status.code().unwrap_or(-1),
        script_path
    )))
}

/// Why an output pump stopped before reaching end-of-stream.
enum PumpStop {
    /// Combined stdout + stderr volume went past MAX_OUTPUT_BYTES.
    LimitExceeded,
    /// Reading the child's pipe or writing to the mirror stream failed.
    Io(io::Error),
}

/// Copy `source` to `mirror` in 8 KB chunks until end-of-stream, charging each
/// chunk against the shared `seen` counter before it is forwarded.
///
/// Returns `PumpStop::LimitExceeded` as soon as the running total passes
/// MAX_OUTPUT_BYTES; the chunk that crosses the limit is not forwarded.
async fn pump_capped<R, W>(
    mut source: R,
    mut mirror: W,
    seen: &std::sync::atomic::AtomicUsize,
) -> Result<(), PumpStop>
where
    R: tokio::io::AsyncRead + Unpin,
    W: tokio::io::AsyncWrite + Unpin,
{
    let mut chunk = vec![0u8; 8192];
    loop {
        let n = source.read(&mut chunk).await.map_err(PumpStop::Io)?;
        if n == 0 {
            // End of stream: push out anything the mirror still buffers.
            return mirror.flush().await.map_err(PumpStop::Io);
        }
        let total = seen.fetch_add(n, std::sync::atomic::Ordering::Relaxed) + n;
        if total > MAX_OUTPUT_BYTES {
            return Err(PumpStop::LimitExceeded);
        }
        mirror.write_all(&chunk[..n]).await.map_err(PumpStop::Io)?;
    }
}
/// Build a Command with a minimal, isolated environment.
///
/// The command runs the cached interpreter on `script_path` with:
/// - stdin closed (null), stdout and stderr piped;
/// - the child killed if its handle is dropped before it exits;
/// - an environment containing only PATH (fixed value), TZ=UTC and, when set
///   in this process, PROJECT_DIRECTORY.
///
/// WHY: env_clear() ensures the child never inherits secrets from the parent
///      process (ENCRYPTION_KEY, DATABASE_URL, API keys, etc.); an explicit
///      allowlist grants only what the script legitimately needs.
fn build_isolated_command(script_path: &str) -> Command {
    let mut cmd = Command::new(get_python_bin());
    cmd.arg(script_path)
        // WHY: Scripts are non-interactive. Inheriting the parent's stdin
        //      would let a script block on input() or consume bytes meant for
        //      the parent process.
        .stdin(std::process::Stdio::null())
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped())
        // WHY: When an attempt is abandoned (its future is dropped, e.g. by a
        //      timeout) the handle is dropped too. Without this flag the
        //      Python process would keep running unsupervised.
        .kill_on_drop(true)
        // WHY: env_clear() is called FIRST so subsequent env() calls build the
        //      allowlist from a clean slate rather than subtracting secrets.
        .env_clear()
        // WHY: Minimal PATH — only standard system directories; no user
        //      additions that could shadow system tools or leak directory layout.
        .env("PATH", "/usr/local/bin:/usr/bin:/bin")
        // WHY: TZ set to UTC so script datetime output is deterministic
        //      across developer machines and CI environments.
        .env("TZ", "UTC");
    // WHY: PROJECT_DIRECTORY is explicitly forwarded so scripts can locate
    //      sibling resources without hardcoded paths.
    if let Ok(project_dir) = env::var("PROJECT_DIRECTORY") {
        cmd.env("PROJECT_DIRECTORY", project_dir);
    }
    cmd
}
// ── TESTS ──────────────────────────────────────────────────────────────────
#[cfg(test)]
mod run_python_script_tests {
    use super::*;
    use serial_test::serial;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// True when a `python3` or `python` binary can be found on PATH.
    fn python_available() -> bool {
        ["python3", "python"]
            .iter()
            .any(|name| which::which(name).is_ok())
    }

    /// Write `source` (plus a trailing newline) to a fresh temp file and return
    /// the file handle together with its path. The handle must be kept alive
    /// for as long as the script is needed, because dropping it deletes the file.
    fn temp_script(source: &str) -> (NamedTempFile, String) {
        let mut file = NamedTempFile::new().unwrap();
        writeln!(file, "{}", source).unwrap();
        let path = file.path().to_str().unwrap().to_string();
        (file, path)
    }

    /// Covers: NotFound for a missing file, the path appearing in the error
    /// text, Ok for a valid script, an error once attempts are exhausted, and
    /// enforcement of the output-size limit.
    ///
    /// WHY: run_python_script is the critical execution path — both its
    ///      fail-fast checks and its resilience branches are verified
    ///      end-to-end.
    #[tokio::test]
    #[serial]
    async fn test_run_python_script() {
        // A missing script fails straight away with NotFound and names the path.
        let missing = run_python_script("/tmp/__nonexistent_abc123__.py")
            .await
            .unwrap_err();
        assert_eq!(missing.kind(), io::ErrorKind::NotFound);
        assert!(missing
            .to_string()
            .contains("/tmp/__nonexistent_abc123__.py"));

        // Operators must be able to see which script caused the failure.
        let bogus = "/nonexistent/path/script.py";
        let named = run_python_script(bogus).await.unwrap_err();
        assert!(named.to_string().contains(bogus));

        // Everything below needs a Python interpreter.
        if !python_available() {
            return;
        }

        // Happy path through retry + timeout + streaming.
        {
            let (_keep, path) = temp_script("print('hello')");
            let result = run_python_script(&path).await;
            assert!(result.is_ok(), "expected Ok, got: {:?}", result);
        }

        // A script that always exits non-zero ends in an error once the
        // (single) allowed attempt is used up.
        {
            let (_keep, path) = temp_script("import sys; sys.exit(1)");
            let failure = run_python_script(&path).await.unwrap_err();
            assert_eq!(failure.kind(), io::ErrorKind::Other);
        }

        // 9 × 8192 = 73728 bytes of stdout is more than the 65536-byte limit.
        {
            let (_keep, path) = temp_script(
                "import sys\nfor _ in range(9):\n sys.stdout.write('x' * 8192)\n sys.stdout.flush()",
            );
            let err = run_python_script(&path).await.unwrap_err();
            assert_eq!(err.kind(), io::ErrorKind::Other);
            assert!(
                err.to_string().contains("KB"),
                "expected KB-limit message, got: {}",
                err
            );
        }
    }
}
// ── TESTS ──────────────────────────────────────────────────────────────────
#[cfg(test)]
mod build_isolated_command_tests {
    use super::*;
    use serial_test::serial;
    use std::collections::HashMap;
    use std::ffi::OsString;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Sets an environment variable for the duration of a test and puts the
    /// previous state (value or absence) back when dropped.
    ///
    /// WHY: The variables used here may already exist in the environment the
    ///      tests run in. A plain set/remove pair would delete such a value for
    ///      every later test and would leave the fake secret behind if the test
    ///      panicked half-way.
    struct ScopedEnvVar {
        key: &'static str,
        previous: Option<OsString>,
    }

    impl ScopedEnvVar {
        fn set(key: &'static str, value: &str) -> Self {
            let previous = env::var_os(key);
            // SAFETY: test-only env mutation, guarded by serial_test
            unsafe { env::set_var(key, value) };
            Self { key, previous }
        }
    }

    impl Drop for ScopedEnvVar {
        fn drop(&mut self) {
            // SAFETY: test-only env mutation, guarded by serial_test
            match self.previous.take() {
                Some(value) => unsafe { env::set_var(self.key, value) },
                None => unsafe { env::remove_var(self.key) },
            }
        }
    }

    /// Verifies environment isolation end-to-end: spawn a trivial script that
    /// prints its env, then assert parent-only variables are absent and
    /// allowlisted keys are present.
    ///
    /// WHY: Command's Debug format is not a stable API, so inspecting the
    ///      Command itself does not prove isolation. Spawning a real child
    ///      process and reading its actual environment does.
    #[tokio::test]
    #[serial]
    async fn test_env_isolation_end_to_end() {
        if which::which("python3").is_err() && which::which("python").is_err() {
            return; // Skip when no Python interpreter is available.
        }
        // Parent-side values that must NOT reach the child. PYTHONPATH is
        // included because the module deliberately never forwards it.
        let _encryption_key = ScopedEnvVar::set("ENCRYPTION_KEY", "super_secret");
        let _database_url = ScopedEnvVar::set("DATABASE_URL", "postgres://secret");
        let _python_path = ScopedEnvVar::set("PYTHONPATH", "/attacker/controlled");

        let mut tmp = NamedTempFile::new().unwrap();
        writeln!(
            tmp,
            "import os\nfor _k, _v in sorted(os.environ.items()): print(str(_k) + \"=\" + str(_v))"
        )
        .unwrap();

        // build_isolated_command() applies env_clear() + the allowlist; the
        // temp file is passed as the script to run. stdout/stderr are already
        // piped by the builder, so output() can capture them directly.
        let output = build_isolated_command(tmp.path().to_str().unwrap())
            .output()
            .await
            .unwrap();
        let stdout_str = String::from_utf8_lossy(&output.stdout);
        // Parse "KEY=value" lines so keys are compared exactly: a substring
        // test for "PATH=" would also be satisfied by "PYTHONPATH=".
        let child_env: HashMap<&str, &str> = stdout_str
            .lines()
            .filter_map(|line| line.split_once('='))
            .collect();

        // Parent secrets must be absent — both the keys and the values.
        assert!(
            !child_env.contains_key("ENCRYPTION_KEY") && !stdout_str.contains("super_secret"),
            "ENCRYPTION_KEY must not reach child; env output: {}",
            stdout_str
        );
        assert!(
            !child_env.contains_key("DATABASE_URL") && !stdout_str.contains("postgres://secret"),
            "DATABASE_URL must not reach child; env output: {}",
            stdout_str
        );
        assert!(
            !child_env.contains_key("PYTHONPATH"),
            "PYTHONPATH must not reach child; env output: {}",
            stdout_str
        );
        // Allowlisted keys must be present.
        assert!(
            child_env.contains_key("PATH"),
            "PATH must be forwarded; env output: {}",
            stdout_str
        );
        assert_eq!(
            child_env.get("TZ").copied(),
            Some("UTC"),
            "TZ must be forwarded; env output: {}",
            stdout_str
        );
        // PROJECT_DIRECTORY is conditionally forwarded — only if set in parent.
        if env::var("PROJECT_DIRECTORY").is_ok() {
            assert!(
                child_env.contains_key("PROJECT_DIRECTORY"),
                "PROJECT_DIRECTORY must be forwarded; env output: {}",
                stdout_str
            );
        }
    }
}
// ══════════════════════════════════════════════════════════════════════════════
// TEST COVERAGE MATRIX
// ══════════════════════════════════════════════════════════════════════════════
//
// Tier 1 = Public API → must be exhaustive; called by application code
// Tier 2 = Internal Logic → correctness + resilience; helpers & infrastructure
//
// Legend: ✅ covered ⚠️ partial ❌ not covered
//
// ┌──────────────────────────────────────────┬──────┬────────────────────────────────┬──────┬───────────────────────────────────────────────────────────────────────────────────┐
// │ Function / Component │ Tier │ Test Module │Tests │ Sub-sections covered │
// ├──────────────────────────────────────────┼──────┼────────────────────────────────┼──────┼───────────────────────────────────────────────────────────────────────────────────┤
// │ PythonExecutor::execute_script() │ 1 │ execute_script_tests │ 1 ✅ │ traversal rejected, missing → NotFound, valid script → Ok, python-skip guard │
// │ run_python_script() │ 1 │ run_python_script_tests │ 1 ✅ │ NotFound, path in error, Ok, exhaustion error, output size cap │
// ├──────────────────────────────────────────┼──────┼────────────────────────────────┼──────┼───────────────────────────────────────────────────────────────────────────────────┤
// │ ScriptPaths::get_safe_script_path() │ 2 │ script_paths_tests │ 1 ✅ │ ".." rejected, embedded ".." rejected, "/" rejected, missing → NotFound, │
// │ │ │ │ │ valid → canonical path, symlink escape → PermissionDenied (unix only) │
// │ resolve_python_bin() │ 2 │ python_bin_tests │ 1 ✅ │ PYTHON_EXECUTABLE override respected, PATH fallback non-empty │
// │ build_isolated_command() │ 2 │ build_isolated_command_tests │ 1 ✅ │ secrets absent from env, PROJECT_DIRECTORY forwarded │
// ├──────────────────────────────────────────┼──────┼────────────────────────────────┼──────┼───────────────────────────────────────────────────────────────────────────────────┤
// │ TOTALS │ │ │ │ │
// │ Tier 1 — Public API │ 2 functions │ 2 ✅ │ All public functions fully covered │
// │ Tier 2 — Internal Logic │ 3 components │ 3 ✅ │ Path safety, interpreter detection, env isolation all tested │
// │ TOTAL │ │ 5 ✅ │ │
// └──────────────────────────────────────────┴──────┴────────────────────────────────┴──────┴───────────────────────────────────────────────────────────────────────────────────┘
// ═══════════════════════════════════════════════════════════════════════════════
// USAGE GUIDE
// ═══════════════════════════════════════════════════════════════════════════════
//
// 1. Set PROJECT_DIRECTORY in .env (required):
// PROJECT_DIRECTORY=/path/to/project
//
// 2. Optional overrides in .env:
// PYTHON_EXECUTABLE=/path/to/venv/bin/python # pin interpreter
//
// 3. Use PythonExecutor (project-relative path):
// use python_script_runner::PythonExecutor;
// let executor = PythonExecutor::new();
// executor.execute_script("scripts/my_script.py").await?;
//
// 4. Use run_python_script (pre-validated absolute path):
// use python_script_runner::run_python_script;
// run_python_script("/absolute/path/to/script.py").await?;
//
// Key points:
// - PythonExecutor validates + resolves paths; rejects ".." and "/" prefixes
// - Environment is always isolated: only PATH, TZ, PROJECT_DIRECTORY forwarded
// - Both paths retry up to MAX_RETRY_ATTEMPTS with exponential backoff + jitter
// - Each attempt is bounded by SCRIPT_TIMEOUT_SECS
// - Output exceeding MAX_OUTPUT_BYTES kills the child and returns an error
//
// Required Cargo.toml additions:
// [dependencies]
// backoff = { version = "0.4", features = ["tokio"] }
// dotenvy = "0.15"
// tokio = { version = "1", features = ["full"] }
// which = "6"
//
// [dev-dependencies]
// serial_test = "3"
// tempfile = "3"
// ═══════════════════════════════════════════════════════════════════════════════
// EXAMPLE MAIN
// ═══════════════════════════════════════════════════════════════════════════════
// See src/main.rs for a complete runnable example binary.