ralph_workflow/logging/run_log_context.rs
1use super::run_id::RunId;
2use crate::workspace::Workspace;
3use anyhow::{Context, Result};
4use serde::{Deserialize, Serialize};
5use std::path::{Path, PathBuf};
6
7/// Context for managing per-run log directories and files.
8///
9/// This struct owns the `run_id` and provides path resolution for all logs
10/// from a single Ralph invocation. All logs are grouped under a per-run
11/// directory (`.agent/logs-<run_id>/`) for easy sharing and diagnosis.
12///
13/// ## Design Rationale
14///
15/// **Why per-run directories?**
16/// - **Shareability**: All logs from a run can be shared as a single tarball
17/// - **Resume continuity**: `--resume` continues logging to the same directory
18/// - **Isolation**: Multiple concurrent runs don't interfere with each other
19/// - **Organization**: Chronological sorting is natural (lexicographic sort)
20///
21/// **Why not scatter logs across `.agent/logs/`, `.agent/tmp/`, etc?**
22/// - Hard to identify which logs belong to which run
23/// - Difficult to share logs for debugging
24/// - Resume would create fragmented log artifacts
25/// - Log rotation and cleanup become complex
26///
27/// ## Integration with Checkpoint/Resume
28///
29/// The `run_id` is stored in the checkpoint (`.agent/checkpoint.json`) so that
30/// `--resume` can continue logging to the same directory. This ensures:
31/// - Logs from the original run and resumed run are in one place
32/// - Event loop sequence numbers continue from where they left off
33/// - Pipeline log is appended (not overwritten)
34///
35/// ## Architecture Compliance
36///
37/// This struct is created once per run in the **impure layer** (effect handlers)
38/// and passed to all effect handlers via `PhaseContext`. It must never be used
39/// in reducers or orchestrators (which are pure).
40///
41/// All filesystem operations go through the `Workspace` trait (never `std::fs`
42/// in pipeline code) to support both `WorkspaceFs` (production) and
43/// `MemoryWorkspace` (tests).
44///
45/// ## Future Extensibility
46///
47/// The per-run directory structure includes reserved subdirectories for future use:
48/// - `provider/`: Provider streaming logs (infrastructure exists, not yet used)
49/// - `debug/`: Future debug artifacts (e.g., memory dumps, profiling data)
50///
51/// ## Examples
52///
53/// ### Fresh run
54/// ```no_run
55/// use ralph_workflow::logging::RunLogContext;
56/// use ralph_workflow::workspace::WorkspaceFs;
57/// use std::path::PathBuf;
58///
59/// let workspace = WorkspaceFs::new(PathBuf::from("."));
60/// let ctx = RunLogContext::new(&workspace)?;
61///
62/// // Get log paths
63/// let pipeline_log = ctx.pipeline_log(); // .agent/logs-2026-02-06_14-03-27.123Z/pipeline.log
64/// let agent_log = ctx.agent_log("planning", 1, None); // .agent/logs-.../agents/planning_1.log
65/// # Ok::<(), anyhow::Error>(())
66/// ```
67///
68/// ### Resume
69/// ```no_run
70/// use ralph_workflow::logging::RunLogContext;
71/// use ralph_workflow::workspace::WorkspaceFs;
72/// use std::path::PathBuf;
73///
74/// let workspace = WorkspaceFs::new(PathBuf::from("."));
75/// let run_id = "2026-02-06_14-03-27.123Z"; // From checkpoint
76/// let ctx = RunLogContext::from_checkpoint(run_id, &workspace)?;
77///
78/// // Logs will append to existing files in the same run directory
79/// let pipeline_log = ctx.pipeline_log();
80/// # Ok::<(), anyhow::Error>(())
81/// ```
82pub struct RunLogContext {
83 run_id: RunId,
84 run_dir: PathBuf,
85}
86
87impl RunLogContext {
88 /// Create a new `RunLogContext` with collision handling.
89 ///
90 /// Generates a new `run_id` and creates the run directory structure.
91 /// If directory exists, tries collision counter variants (rare case
92 /// of multiple runs starting in the same millisecond).
93 ///
94 /// Creates subdirectories:
95 /// - `.agent/logs-<run_id>/agents/` for per-agent logs
96 /// - `.agent/logs-<run_id>/provider/` for provider streaming logs
97 /// - `.agent/logs-<run_id>/debug/` for future debug artifacts
98 ///
99 /// # Collision Handling
100 ///
101 /// The collision handling loop tries counter values 0-99:
102 /// - Counter 0: Uses the base `run_id` (no suffix)
103 /// - Counter 1-99: Appends `-01` through `-99` suffixes
104 ///
105 /// # TOCTOU Race Condition Handling
106 ///
107 /// To avoid the time-of-check-to-time-of-use race condition, we:
108 /// 1. First check if the directory exists (fast path for common case)
109 /// 2. If it doesn't exist, try to create it
110 /// 3. If creation succeeds but the directory still doesn't exist afterward,
111 /// another process may have created it, so we try the next collision variant
112 /// 4. We use the presence of the "agents" subdirectory as our "created" marker
113 ///
114 /// Note: If a base directory exists that was actually created as a collision
115 /// directory (e.g., due to a bug), the system will still work correctly by
116 /// creating the next collision variant. This is acceptable because the directory
117 /// naming format is deterministic and we always check for existence before creating.
118 ///
119 /// # Errors
120 ///
121 /// Returns error if the operation fails.
122 pub fn new(workspace: &dyn Workspace) -> Result<Self> {
123 let base_run_id = RunId::new();
124
125 let (run_id, run_dir) = crate::logging::collision::create_run_dir_with_collision_handling(
126 workspace,
127 &base_run_id,
128 )?;
129
130 Ok(Self { run_id, run_dir })
131 }
132
133 /// Create a `RunLogContext` from an existing checkpoint (for resume).
134 ///
135 /// Uses the timestamp-based log run ID from the checkpoint (stored in
136 /// `PipelineCheckpoint.log_run_id`) to continue logging to the same run
137 /// directory. This is distinct from the UUID-based `run_id` field in the
138 /// checkpoint which identifies the execution session.
139 ///
140 /// If the directory doesn't exist (e.g., deleted), it is recreated.
141 ///
142 /// # Errors
143 ///
144 /// Returns error if the operation fails.
145 pub fn from_checkpoint(run_id: &str, workspace: &dyn Workspace) -> Result<Self> {
146 let run_id = RunId::from_checkpoint(run_id);
147 let run_dir = PathBuf::from(format!(".agent/logs-{run_id}"));
148
149 // Ensure directory exists (may have been deleted)
150 if !workspace.exists(&run_dir) {
151 workspace
152 .create_dir_all(&run_dir)
153 .context("Failed to recreate run log directory for resume")?;
154
155 workspace
156 .create_dir_all(&run_dir.join("agents"))
157 .context("Failed to recreate agents log subdirectory for resume")?;
158
159 workspace
160 .create_dir_all(&run_dir.join("provider"))
161 .context("Failed to recreate provider log subdirectory for resume")?;
162
163 workspace
164 .create_dir_all(&run_dir.join("debug"))
165 .context("Failed to recreate debug log subdirectory for resume")?;
166 }
167
168 Ok(Self { run_id, run_dir })
169 }
170
171 /// Test-only helper to create a `RunLogContext` with a fixed `run_id`.
172 ///
173 /// This allows testing the collision handling logic by providing a predictable
174 /// `run_id` that can be pre-created on the filesystem to simulate collisions.
175 ///
176 /// # Warning
177 ///
178 /// This is intended for testing only. Using a fixed `run_id` in production
179 /// could lead to directory collisions. Always use [`RunLogContext::new`]
180 /// or [`RunLogContext::from_checkpoint`] in production code.
181 ///
182 /// # Examples
183 ///
184 /// ```ignore
185 /// use ralph_workflow::logging::{RunId, RunLogContext};
186 ///
187 /// // Create a fixed run_id for testing
188 /// let fixed_id = RunId::for_test("2026-02-06_14-03-27.123Z");
189 /// let ctx = RunLogContext::for_testing(&fixed_id, &workspace)?;
190 /// ```
191 ///
192 /// # Errors
193 ///
194 /// Returns error if the operation fails.
195 pub fn for_testing(base_run_id: &RunId, workspace: &dyn Workspace) -> Result<Self> {
196 let (run_id, run_dir) = crate::logging::collision::create_run_dir_with_collision_handling(
197 workspace,
198 base_run_id,
199 )?;
200
201 Ok(Self { run_id, run_dir })
202 }
203
204 /// Get a reference to the run ID.
205 ///
206 /// This is the timestamp-based log run ID (format: `YYYY-MM-DD_HH-mm-ss.SSSZ[-NN]`)
207 /// used for naming the per-run log directory. It is distinct from the UUID-based
208 /// `run_id` field stored in `PipelineCheckpoint`, which uniquely identifies the
209 /// execution session.
210 #[must_use]
211 pub const fn run_id(&self) -> &RunId {
212 &self.run_id
213 }
214
215 /// Get the run directory path (relative to workspace root).
216 #[must_use]
217 pub fn run_dir(&self) -> &Path {
218 &self.run_dir
219 }
220
221 /// Get the path to the pipeline log file.
222 #[must_use]
223 pub fn pipeline_log(&self) -> PathBuf {
224 self.run_dir.join("pipeline.log")
225 }
226
227 /// Get the path to the event loop log file.
228 #[must_use]
229 pub fn event_loop_log(&self) -> PathBuf {
230 self.run_dir.join("event_loop.log")
231 }
232
233 /// Get the path to the event loop trace file (crash-only).
234 #[must_use]
235 pub fn event_loop_trace(&self) -> PathBuf {
236 self.run_dir.join("event_loop_trace.jsonl")
237 }
238
239 /// Get the path to an agent log file.
240 ///
241 /// # Arguments
242 /// * `phase` - Phase name (e.g., "planning", "dev", "reviewer", "commit")
243 /// * `index` - Invocation index within the phase (1-based)
244 /// * `attempt` - Optional retry attempt counter (1 for first retry, 2 for second retry, etc.; None for initial attempt with no retries)
245 ///
246 /// # Returns
247 /// Path like `.agent/logs-<run_id>/agents/planning_1.log` or
248 /// `.agent/logs-<run_id>/agents/dev_2_a1.log` for retries.
249 #[must_use]
250 pub fn agent_log(&self, phase: &str, index: u32, attempt: Option<u32>) -> PathBuf {
251 let filename = attempt.map_or_else(
252 || format!("{phase}_{index}.log"),
253 |a| format!("{phase}_{index}_a{a}.log"),
254 );
255 self.run_dir.join("agents").join(filename)
256 }
257
258 /// Get the path to a provider streaming log file.
259 ///
260 /// # Arguments
261 /// * `name` - Provider log filename (e.g., "claude-stream_dev_1.jsonl")
262 ///
263 /// # Returns
264 /// Path like `.agent/logs-<run_id>/provider/claude-stream_dev_1.jsonl`.
265 #[must_use]
266 pub fn provider_log(&self, name: &str) -> PathBuf {
267 self.run_dir.join("provider").join(name)
268 }
269
270 /// Get the path to the run metadata file (run.json).
271 #[must_use]
272 pub fn run_metadata(&self) -> PathBuf {
273 self.run_dir.join("run.json")
274 }
275
276 /// Write run.json metadata file.
277 ///
278 /// This should be called early in pipeline execution to record
279 /// essential metadata for debugging and tooling.
280 ///
281 /// # Errors
282 ///
283 /// Returns error if the operation fails.
284 pub fn write_run_metadata(
285 &self,
286 workspace: &dyn Workspace,
287 metadata: &RunMetadata,
288 ) -> Result<()> {
289 let path = self.run_metadata();
290 let json = serde_json::to_string_pretty(metadata).with_context(|| {
291 format!(
292 "Failed to serialize run metadata for run_id '{}'. \
293 This usually means a field contains data that cannot be represented as JSON.",
294 self.run_id
295 )
296 })?;
297 workspace.write(&path, &json).with_context(|| {
298 format!(
299 "Failed to write run.json to '{}'. Check filesystem permissions and disk space.",
300 path.display()
301 )
302 })
303 }
304}
305
306/// Metadata recorded in run.json for each pipeline run.
307///
308/// This file is written at the start of each run to provide context
309/// for debugging and tooling. It anchors the run with essential info
310/// like command invocation, timestamps, and environment details.
311#[derive(Debug, Clone, Serialize, Deserialize)]
312pub struct RunMetadata {
313 /// Timestamp-based run identifier (matches directory name)
314 ///
315 /// Format: `YYYY-MM-DD_HH-mm-ss.SSSZ[-NN]` (e.g., `2026-02-06_14-03-27.123Z`)
316 ///
317 /// This is the log run ID used for the per-run log directory and is distinct
318 /// from the UUID-based `run_id` field in `PipelineCheckpoint` which uniquely
319 /// identifies the execution session.
320 pub run_id: String,
321
322 /// Timestamp when run started (UTC, RFC3339)
323 pub started_at_utc: String,
324
325 /// Command as invoked by user (e.g., "ralph" or "ralph --resume")
326 pub command: String,
327
328 /// Whether this is a resumed session
329 pub resume: bool,
330
331 /// Absolute path to repository root
332 pub repo_root: String,
333
334 /// Ralph version (from Cargo.toml)
335 pub ralph_version: String,
336
337 /// Process ID (if available)
338 #[serde(skip_serializing_if = "Option::is_none")]
339 pub pid: Option<u32>,
340
341 /// Configuration summary (non-secret metadata)
342 #[serde(skip_serializing_if = "Option::is_none")]
343 pub config_summary: Option<ConfigSummary>,
344}
345
346/// Non-secret configuration summary for run.json.
347///
348/// Captures high-level config info useful for debugging without
349/// exposing any sensitive data (API keys, tokens, etc.).
350#[derive(Debug, Clone, Serialize, Deserialize)]
351pub struct ConfigSummary {
352 /// Developer agent name (if configured)
353 #[serde(skip_serializing_if = "Option::is_none")]
354 pub developer_agent: Option<String>,
355
356 /// Reviewer agent name (if configured)
357 #[serde(skip_serializing_if = "Option::is_none")]
358 pub reviewer_agent: Option<String>,
359
360 /// Total iterations configured
361 #[serde(skip_serializing_if = "Option::is_none")]
362 pub total_iterations: Option<u32>,
363
364 /// Total reviewer passes configured
365 #[serde(skip_serializing_if = "Option::is_none")]
366 pub total_reviewer_passes: Option<u32>,
367}
368
369#[cfg(test)]
370mod tests;