Skip to main content

ralph_workflow/logging/
run_log_context.rs

1use super::run_id::RunId;
2use crate::workspace::Workspace;
3use anyhow::{Context, Result};
4use serde::{Deserialize, Serialize};
5use std::path::{Path, PathBuf};
6
7/// Context for managing per-run log directories and files.
8///
9/// This struct owns the `run_id` and provides path resolution for all logs
10/// from a single Ralph invocation. All logs are grouped under a per-run
11/// directory (`.agent/logs-<run_id>/`) for easy sharing and diagnosis.
12///
13/// ## Design Rationale
14///
15/// **Why per-run directories?**
16/// - **Shareability**: All logs from a run can be shared as a single tarball
17/// - **Resume continuity**: `--resume` continues logging to the same directory
18/// - **Isolation**: Multiple concurrent runs don't interfere with each other
19/// - **Organization**: Chronological sorting is natural (lexicographic sort)
20///
21/// **Why not scatter logs across `.agent/logs/`, `.agent/tmp/`, etc?**
22/// - Hard to identify which logs belong to which run
23/// - Difficult to share logs for debugging
24/// - Resume would create fragmented log artifacts
25/// - Log rotation and cleanup become complex
26///
27/// ## Integration with Checkpoint/Resume
28///
29/// The `run_id` is stored in the checkpoint (`.agent/checkpoint.json`) so that
30/// `--resume` can continue logging to the same directory. This ensures:
31/// - Logs from the original run and resumed run are in one place
32/// - Event loop sequence numbers continue from where they left off
33/// - Pipeline log is appended (not overwritten)
34///
35/// ## Architecture Compliance
36///
37/// This struct is created once per run in the **impure layer** (effect handlers)
38/// and passed to all effect handlers via `PhaseContext`. It must never be used
39/// in reducers or orchestrators (which are pure).
40///
41/// All filesystem operations go through the `Workspace` trait (never `std::fs`
42/// in pipeline code) to support both `WorkspaceFs` (production) and
43/// `MemoryWorkspace` (tests).
44///
45/// ## Future Extensibility
46///
47/// The per-run directory structure includes reserved subdirectories for future use:
48/// - `provider/`: Provider streaming logs (infrastructure exists, not yet used)
49/// - `debug/`: Future debug artifacts (e.g., memory dumps, profiling data)
50///
51/// ## Examples
52///
53/// ### Fresh run
54/// ```no_run
55/// use ralph_workflow::logging::RunLogContext;
56/// use ralph_workflow::workspace::WorkspaceFs;
57/// use std::path::PathBuf;
58///
59/// let workspace = WorkspaceFs::new(PathBuf::from("."));
60/// let ctx = RunLogContext::new(&workspace)?;
61///
62/// // Get log paths
63/// let pipeline_log = ctx.pipeline_log();  // .agent/logs-2026-02-06_14-03-27.123Z/pipeline.log
64/// let agent_log = ctx.agent_log("planning", 1, None);  // .agent/logs-.../agents/planning_1.log
65/// # Ok::<(), anyhow::Error>(())
66/// ```
67///
68/// ### Resume
69/// ```no_run
70/// use ralph_workflow::logging::RunLogContext;
71/// use ralph_workflow::workspace::WorkspaceFs;
72/// use std::path::PathBuf;
73///
74/// let workspace = WorkspaceFs::new(PathBuf::from("."));
75/// let run_id = "2026-02-06_14-03-27.123Z";  // From checkpoint
76/// let ctx = RunLogContext::from_checkpoint(run_id, &workspace)?;
77///
78/// // Logs will append to existing files in the same run directory
79/// let pipeline_log = ctx.pipeline_log();
80/// # Ok::<(), anyhow::Error>(())
81/// ```
82pub struct RunLogContext {
83    run_id: RunId,
84    run_dir: PathBuf,
85}
86
87impl RunLogContext {
88    /// Create a new `RunLogContext` with collision handling.
89    ///
90    /// Generates a new `run_id` and creates the run directory structure.
91    /// If directory exists, tries collision counter variants (rare case
92    /// of multiple runs starting in the same millisecond).
93    ///
94    /// Creates subdirectories:
95    /// - `.agent/logs-<run_id>/agents/` for per-agent logs
96    /// - `.agent/logs-<run_id>/provider/` for provider streaming logs
97    /// - `.agent/logs-<run_id>/debug/` for future debug artifacts
98    ///
99    /// # Collision Handling
100    ///
101    /// The collision handling loop tries counter values 0-99:
102    /// - Counter 0: Uses the base `run_id` (no suffix)
103    /// - Counter 1-99: Appends `-01` through `-99` suffixes
104    ///
105    /// # TOCTOU Race Condition Handling
106    ///
107    /// To avoid the time-of-check-to-time-of-use race condition, we:
108    /// 1. First check if the directory exists (fast path for common case)
109    /// 2. If it doesn't exist, try to create it
110    /// 3. If creation succeeds but the directory still doesn't exist afterward,
111    ///    another process may have created it, so we try the next collision variant
112    /// 4. We use the presence of the "agents" subdirectory as our "created" marker
113    ///
114    /// Note: If a base directory exists that was actually created as a collision
115    /// directory (e.g., due to a bug), the system will still work correctly by
116    /// creating the next collision variant. This is acceptable because the directory
117    /// naming format is deterministic and we always check for existence before creating.
118    ///
119    /// # Errors
120    ///
121    /// Returns error if the operation fails.
122    pub fn new(workspace: &dyn Workspace) -> Result<Self> {
123        let base_run_id = RunId::new();
124
125        // Try base run_id first, then collision variants 1-99
126        for counter in 0..=99 {
127            let run_id = if counter == 0 {
128                base_run_id.clone()
129            } else {
130                base_run_id.with_collision_counter(counter)
131            };
132
133            let run_dir = PathBuf::from(format!(".agent/logs-{run_id}"));
134            let agents_dir = run_dir.join("agents");
135
136            // Fast path: if agents subdirectory exists, this run_id is taken
137            if workspace.exists(&agents_dir) {
138                continue;
139            }
140
141            // Try to create the run directory and subdirectories
142            // create_dir_all is idempotent (Ok if directory exists)
143            workspace
144                .create_dir_all(&run_dir)
145                .context("Failed to create run log directory")?;
146
147            workspace
148                .create_dir_all(&agents_dir)
149                .context("Failed to create agents log subdirectory")?;
150
151            workspace
152                .create_dir_all(&run_dir.join("provider"))
153                .context("Failed to create provider log subdirectory")?;
154
155            workspace
156                .create_dir_all(&run_dir.join("debug"))
157                .context("Failed to create debug log subdirectory")?;
158
159            // Verify we're the ones who created it (agents_dir should exist now)
160            // If it doesn't, another process might have raced us, try next variant
161            if workspace.exists(&agents_dir) {
162                return Ok(Self { run_id, run_dir });
163            }
164        }
165
166        // If we exhausted all collision counters, bail
167        anyhow::bail!(
168            "Too many collisions creating run log directory (tried base + 99 variants). \
169             This is extremely rare (100+ runs in the same millisecond). \
170             Possible causes: clock skew, or filesystem issues. \
171             Suggestion: Wait 1ms and retry, or check system clock."
172        )
173    }
174
175    /// Create a `RunLogContext` from an existing checkpoint (for resume).
176    ///
177    /// Uses the timestamp-based log run ID from the checkpoint (stored in
178    /// `PipelineCheckpoint.log_run_id`) to continue logging to the same run
179    /// directory. This is distinct from the UUID-based `run_id` field in the
180    /// checkpoint which identifies the execution session.
181    ///
182    /// If the directory doesn't exist (e.g., deleted), it is recreated.
183    ///
184    /// # Errors
185    ///
186    /// Returns error if the operation fails.
187    pub fn from_checkpoint(run_id: &str, workspace: &dyn Workspace) -> Result<Self> {
188        let run_id = RunId::from_checkpoint(run_id);
189        let run_dir = PathBuf::from(format!(".agent/logs-{run_id}"));
190
191        // Ensure directory exists (may have been deleted)
192        if !workspace.exists(&run_dir) {
193            workspace
194                .create_dir_all(&run_dir)
195                .context("Failed to recreate run log directory for resume")?;
196
197            workspace
198                .create_dir_all(&run_dir.join("agents"))
199                .context("Failed to recreate agents log subdirectory for resume")?;
200
201            workspace
202                .create_dir_all(&run_dir.join("provider"))
203                .context("Failed to recreate provider log subdirectory for resume")?;
204
205            workspace
206                .create_dir_all(&run_dir.join("debug"))
207                .context("Failed to recreate debug log subdirectory for resume")?;
208        }
209
210        Ok(Self { run_id, run_dir })
211    }
212
213    /// Test-only helper to create a `RunLogContext` with a fixed `run_id`.
214    ///
215    /// This allows testing the collision handling logic by providing a predictable
216    /// `run_id` that can be pre-created on the filesystem to simulate collisions.
217    ///
218    /// # Warning
219    ///
220    /// This is intended for testing only. Using a fixed `run_id` in production
221    /// could lead to directory collisions. Always use [`RunLogContext::new`]
222    /// or [`RunLogContext::from_checkpoint`] in production code.
223    ///
224    /// # Examples
225    ///
226    /// ```ignore
227    /// use ralph_workflow::logging::{RunId, RunLogContext};
228    ///
229    /// // Create a fixed run_id for testing
230    /// let fixed_id = RunId::for_test("2026-02-06_14-03-27.123Z");
231    /// let ctx = RunLogContext::for_testing(&fixed_id, &workspace)?;
232    /// ```
233    ///
234    /// # Errors
235    ///
236    /// Returns error if the operation fails.
237    pub fn for_testing(base_run_id: &RunId, workspace: &dyn Workspace) -> Result<Self> {
238        // Try base run_id first, then collision variants 1-99
239        for counter in 0..=99 {
240            let run_id = if counter == 0 {
241                base_run_id.clone()
242            } else {
243                base_run_id.with_collision_counter(counter)
244            };
245
246            let run_dir = PathBuf::from(format!(".agent/logs-{run_id}"));
247            let agents_dir = run_dir.join("agents");
248
249            // Fast path: if agents subdirectory exists, this run_id is taken
250            if workspace.exists(&agents_dir) {
251                continue;
252            }
253
254            // Try to create the run directory and subdirectories
255            // create_dir_all is idempotent (Ok if directory exists)
256            workspace
257                .create_dir_all(&run_dir)
258                .context("Failed to create run log directory")?;
259
260            workspace
261                .create_dir_all(&agents_dir)
262                .context("Failed to create agents log subdirectory")?;
263
264            workspace
265                .create_dir_all(&run_dir.join("provider"))
266                .context("Failed to create provider log subdirectory")?;
267
268            workspace
269                .create_dir_all(&run_dir.join("debug"))
270                .context("Failed to create debug log subdirectory")?;
271
272            // Verify we're the ones who created it (agents_dir should exist now)
273            // If it doesn't, another process might have raced us, try next variant
274            if workspace.exists(&agents_dir) {
275                return Ok(Self { run_id, run_dir });
276            }
277        }
278
279        // If we exhausted all collision counters, bail
280        anyhow::bail!(
281            "Too many collisions creating run log directory (tried base + 99 variants). \
282             This is extremely rare (100+ runs in the same millisecond). \
283             Possible causes: clock skew, or filesystem issues. \
284             Suggestion: Wait 1ms and retry, or check system clock."
285        )
286    }
287
288    /// Get a reference to the run ID.
289    ///
290    /// This is the timestamp-based log run ID (format: `YYYY-MM-DD_HH-mm-ss.SSSZ[-NN]`)
291    /// used for naming the per-run log directory. It is distinct from the UUID-based
292    /// `run_id` field stored in `PipelineCheckpoint`, which uniquely identifies the
293    /// execution session.
294    #[must_use]
295    pub const fn run_id(&self) -> &RunId {
296        &self.run_id
297    }
298
299    /// Get the run directory path (relative to workspace root).
300    #[must_use]
301    pub fn run_dir(&self) -> &Path {
302        &self.run_dir
303    }
304
305    /// Get the path to the pipeline log file.
306    #[must_use]
307    pub fn pipeline_log(&self) -> PathBuf {
308        self.run_dir.join("pipeline.log")
309    }
310
311    /// Get the path to the event loop log file.
312    #[must_use]
313    pub fn event_loop_log(&self) -> PathBuf {
314        self.run_dir.join("event_loop.log")
315    }
316
317    /// Get the path to the event loop trace file (crash-only).
318    #[must_use]
319    pub fn event_loop_trace(&self) -> PathBuf {
320        self.run_dir.join("event_loop_trace.jsonl")
321    }
322
323    /// Get the path to an agent log file.
324    ///
325    /// # Arguments
326    /// * `phase` - Phase name (e.g., "planning", "dev", "reviewer", "commit")
327    /// * `index` - Invocation index within the phase (1-based)
328    /// * `attempt` - Optional retry attempt counter (1 for first retry, 2 for second retry, etc.; None for initial attempt with no retries)
329    ///
330    /// # Returns
331    /// Path like `.agent/logs-<run_id>/agents/planning_1.log` or
332    /// `.agent/logs-<run_id>/agents/dev_2_a1.log` for retries.
333    #[must_use]
334    pub fn agent_log(&self, phase: &str, index: u32, attempt: Option<u32>) -> PathBuf {
335        let filename = attempt.map_or_else(
336            || format!("{phase}_{index}.log"),
337            |a| format!("{phase}_{index}_a{a}.log"),
338        );
339        self.run_dir.join("agents").join(filename)
340    }
341
342    /// Get the path to a provider streaming log file.
343    ///
344    /// # Arguments
345    /// * `name` - Provider log filename (e.g., "claude-stream_dev_1.jsonl")
346    ///
347    /// # Returns
348    /// Path like `.agent/logs-<run_id>/provider/claude-stream_dev_1.jsonl`.
349    #[must_use]
350    pub fn provider_log(&self, name: &str) -> PathBuf {
351        self.run_dir.join("provider").join(name)
352    }
353
354    /// Get the path to the run metadata file (run.json).
355    #[must_use]
356    pub fn run_metadata(&self) -> PathBuf {
357        self.run_dir.join("run.json")
358    }
359
360    /// Write run.json metadata file.
361    ///
362    /// This should be called early in pipeline execution to record
363    /// essential metadata for debugging and tooling.
364    ///
365    /// # Errors
366    ///
367    /// Returns error if the operation fails.
368    pub fn write_run_metadata(
369        &self,
370        workspace: &dyn Workspace,
371        metadata: &RunMetadata,
372    ) -> Result<()> {
373        let path = self.run_metadata();
374        let json = serde_json::to_string_pretty(metadata).with_context(|| {
375            format!(
376                "Failed to serialize run metadata for run_id '{}'. \
377                 This usually means a field contains data that cannot be represented as JSON.",
378                self.run_id
379            )
380        })?;
381        workspace.write(&path, &json).with_context(|| {
382            format!(
383                "Failed to write run.json to '{}'. Check filesystem permissions and disk space.",
384                path.display()
385            )
386        })
387    }
388}
389
390/// Metadata recorded in run.json for each pipeline run.
391///
392/// This file is written at the start of each run to provide context
393/// for debugging and tooling. It anchors the run with essential info
394/// like command invocation, timestamps, and environment details.
395#[derive(Debug, Clone, Serialize, Deserialize)]
396pub struct RunMetadata {
397    /// Timestamp-based run identifier (matches directory name)
398    ///
399    /// Format: `YYYY-MM-DD_HH-mm-ss.SSSZ[-NN]` (e.g., `2026-02-06_14-03-27.123Z`)
400    ///
401    /// This is the log run ID used for the per-run log directory and is distinct
402    /// from the UUID-based `run_id` field in `PipelineCheckpoint` which uniquely
403    /// identifies the execution session.
404    pub run_id: String,
405
406    /// Timestamp when run started (UTC, RFC3339)
407    pub started_at_utc: String,
408
409    /// Command as invoked by user (e.g., "ralph" or "ralph --resume")
410    pub command: String,
411
412    /// Whether this is a resumed session
413    pub resume: bool,
414
415    /// Absolute path to repository root
416    pub repo_root: String,
417
418    /// Ralph version (from Cargo.toml)
419    pub ralph_version: String,
420
421    /// Process ID (if available)
422    #[serde(skip_serializing_if = "Option::is_none")]
423    pub pid: Option<u32>,
424
425    /// Configuration summary (non-secret metadata)
426    #[serde(skip_serializing_if = "Option::is_none")]
427    pub config_summary: Option<ConfigSummary>,
428}
429
430/// Non-secret configuration summary for run.json.
431///
432/// Captures high-level config info useful for debugging without
433/// exposing any sensitive data (API keys, tokens, etc.).
434#[derive(Debug, Clone, Serialize, Deserialize)]
435pub struct ConfigSummary {
436    /// Developer agent name (if configured)
437    #[serde(skip_serializing_if = "Option::is_none")]
438    pub developer_agent: Option<String>,
439
440    /// Reviewer agent name (if configured)
441    #[serde(skip_serializing_if = "Option::is_none")]
442    pub reviewer_agent: Option<String>,
443
444    /// Total iterations configured
445    #[serde(skip_serializing_if = "Option::is_none")]
446    pub total_iterations: Option<u32>,
447
448    /// Total reviewer passes configured
449    #[serde(skip_serializing_if = "Option::is_none")]
450    pub total_reviewer_passes: Option<u32>,
451}
452
453#[cfg(test)]
454mod tests;