ralph_workflow/logging/run_log_context.rs

use super::run_id::RunId;
use crate::workspace::Workspace;
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};

/// Context for managing per-run log directories and files.
///
/// This struct owns the run_id and provides path resolution for all logs
/// from a single Ralph invocation. All logs are grouped under a per-run
/// directory (`.agent/logs-<run_id>/`) for easy sharing and diagnosis.
///
/// ## Design Rationale
///
/// **Why per-run directories?**
/// - **Shareability**: All logs from a run can be shared as a single tarball
/// - **Resume continuity**: `--resume` continues logging to the same directory
/// - **Isolation**: Multiple concurrent runs don't interfere with each other
/// - **Organization**: Chronological sorting is natural (lexicographic sort)
///
/// **Why not scatter logs across `.agent/logs/`, `.agent/tmp/`, etc.?**
/// - Hard to identify which logs belong to which run
/// - Difficult to share logs for debugging
/// - Resume would create fragmented log artifacts
/// - Log rotation and cleanup become complex
///
/// ## Integration with Checkpoint/Resume
///
/// The `run_id` is stored in the checkpoint (`.agent/checkpoint.json`) so that
/// `--resume` can continue logging to the same directory. This ensures:
/// - Logs from the original run and resumed run are in one place
/// - Event loop sequence numbers continue from where they left off
/// - Pipeline log is appended (not overwritten)
///
/// ## Architecture Compliance
///
/// This struct is created once per run in the **impure layer** (effect handlers)
/// and passed to all effect handlers via `PhaseContext`. It must never be used
/// in reducers or orchestrators (which are pure).
///
/// All filesystem operations go through the `Workspace` trait (never `std::fs`
/// in pipeline code) to support both `WorkspaceFs` (production) and
/// `MemoryWorkspace` (tests).
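///
/// A minimal sketch of that flow (hypothetical effect-handler code; it assumes
/// a `workspace` value implementing `Workspace`, and that `Workspace::write`
/// takes a path plus string contents, as used by
/// [`RunLogContext::write_run_metadata`]):
///
/// ```ignore
/// // Impure layer: create the context once, then route all writes through the trait.
/// let ctx = RunLogContext::new(&workspace)?;
/// workspace.write(&ctx.pipeline_log(), "pipeline started\n")?;
/// ```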
///
/// ## Future Extensibility
///
/// The per-run directory structure includes reserved subdirectories for future use:
/// - `provider/`: Provider streaming logs (infrastructure exists, not yet used)
/// - `debug/`: Future debug artifacts (e.g., memory dumps, profiling data)
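///
/// A typical per-run layout (illustrative; the exact files depend on the run):
///
/// ```text
/// .agent/logs-<run_id>/
/// ├── run.json
/// ├── pipeline.log
/// ├── event_loop.log
/// ├── event_loop_trace.jsonl
/// ├── agents/
/// │   └── planning_1.log
/// ├── provider/
/// └── debug/
/// ```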
///
/// ## Examples
///
/// ### Fresh run
/// ```no_run
/// use ralph_workflow::logging::RunLogContext;
/// use ralph_workflow::workspace::WorkspaceFs;
/// use std::path::PathBuf;
///
/// let workspace = WorkspaceFs::new(PathBuf::from("."));
/// let ctx = RunLogContext::new(&workspace)?;
///
/// // Get log paths
/// let pipeline_log = ctx.pipeline_log();  // .agent/logs-2026-02-06_14-03-27.123Z/pipeline.log
/// let agent_log = ctx.agent_log("planning", 1, None);  // .agent/logs-.../agents/planning_1.log
/// # Ok::<(), anyhow::Error>(())
/// ```
///
/// ### Resume
/// ```no_run
/// use ralph_workflow::logging::RunLogContext;
/// use ralph_workflow::workspace::WorkspaceFs;
/// use std::path::PathBuf;
///
/// let workspace = WorkspaceFs::new(PathBuf::from("."));
/// let run_id = "2026-02-06_14-03-27.123Z";  // From checkpoint
/// let ctx = RunLogContext::from_checkpoint(run_id, &workspace)?;
///
/// // Logs will append to existing files in the same run directory
/// let pipeline_log = ctx.pipeline_log();
/// # Ok::<(), anyhow::Error>(())
/// ```
pub struct RunLogContext {
    run_id: RunId,
    run_dir: PathBuf,
}

impl RunLogContext {
    /// Create a new RunLogContext with collision handling.
    ///
    /// Generates a new run_id and creates the run directory structure.
    /// If the directory already exists, collision counter variants are tried
    /// (a rare case of multiple runs starting in the same millisecond).
    ///
    /// Creates subdirectories:
    /// - `.agent/logs-<run_id>/agents/` for per-agent logs
    /// - `.agent/logs-<run_id>/provider/` for provider streaming logs
    /// - `.agent/logs-<run_id>/debug/` for future debug artifacts
    ///
    /// # Collision Handling
    ///
    /// The collision handling loop tries counter values 0-99:
    /// - Counter 0: Uses the base run_id (no suffix)
    /// - Counter 1-99: Appends `-01` through `-99` suffixes
    ///
    /// # TOCTOU Race Condition Handling
    ///
    /// To avoid the time-of-check to time-of-use (TOCTOU) race condition, we:
    /// 1. First check if the directory exists (fast path for the common case)
    /// 2. If it doesn't exist, try to create it
    /// 3. If creation reports success but the directory still doesn't exist afterward,
    ///    another process may have raced us, so we try the next collision variant
    /// 4. We use the presence of the "agents" subdirectory as our "created" marker
    ///
    /// Note: If a base directory already exists that was actually created as a collision
    /// directory (e.g., due to a bug), the system will still work correctly by moving on
    /// to the next collision variant. This is acceptable because the directory naming
    /// format is deterministic and we always check for existence before creating.
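    ///
    /// The resulting directory names look like this (timestamps are illustrative):
    ///
    /// ```text
    /// .agent/logs-2026-02-06_14-03-27.123Z/      <- counter 0 (no suffix)
    /// .agent/logs-2026-02-06_14-03-27.123Z-01/   <- counter 1
    /// .agent/logs-2026-02-06_14-03-27.123Z-99/   <- counter 99
    /// ```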
    pub fn new(workspace: &dyn Workspace) -> Result<Self> {
        let base_run_id = RunId::new();

        // Try base run_id first, then collision variants 1-99
        for counter in 0..=99 {
            let run_id = if counter == 0 {
                base_run_id.clone()
            } else {
                base_run_id.with_collision_counter(counter)
            };

            let run_dir = PathBuf::from(format!(".agent/logs-{}", run_id));
            let agents_dir = run_dir.join("agents");

            // Fast path: if agents subdirectory exists, this run_id is taken
            if workspace.exists(&agents_dir) {
                continue;
            }

            // Try to create the run directory and subdirectories
            // create_dir_all is idempotent (Ok if directory exists)
            workspace
                .create_dir_all(&run_dir)
                .context("Failed to create run log directory")?;

            workspace
                .create_dir_all(&agents_dir)
                .context("Failed to create agents log subdirectory")?;

            workspace
                .create_dir_all(&run_dir.join("provider"))
                .context("Failed to create provider log subdirectory")?;

            workspace
                .create_dir_all(&run_dir.join("debug"))
                .context("Failed to create debug log subdirectory")?;

            // Verify we're the ones who created it (agents_dir should exist now)
            // If it doesn't, another process might have raced us, try next variant
            if workspace.exists(&agents_dir) {
                return Ok(Self { run_id, run_dir });
            }
        }

        // If we exhausted all collision counters, bail
        anyhow::bail!(
            "Too many collisions creating run log directory (tried base + 99 variants). \
             This is extremely rare (100+ runs in the same millisecond). \
             Possible causes: clock skew, or filesystem issues. \
             Suggestion: Wait 1ms and retry, or check system clock."
        )
    }

    /// Create a RunLogContext from an existing checkpoint (for resume).
    ///
    /// Uses the timestamp-based log run ID from the checkpoint (stored in
    /// `PipelineCheckpoint.log_run_id`) to continue logging to the same run
    /// directory. This is distinct from the UUID-based `run_id` field in the
    /// checkpoint which identifies the execution session.
    ///
    /// If the directory doesn't exist (e.g., deleted), it is recreated.
    pub fn from_checkpoint(run_id: &str, workspace: &dyn Workspace) -> Result<Self> {
        let run_id = RunId::from_checkpoint(run_id);
        let run_dir = PathBuf::from(format!(".agent/logs-{}", run_id));

        // Ensure directory exists (may have been deleted)
        if !workspace.exists(&run_dir) {
            workspace
                .create_dir_all(&run_dir)
                .context("Failed to recreate run log directory for resume")?;

            workspace
                .create_dir_all(&run_dir.join("agents"))
                .context("Failed to recreate agents log subdirectory for resume")?;

            workspace
                .create_dir_all(&run_dir.join("provider"))
                .context("Failed to recreate provider log subdirectory for resume")?;

            workspace
                .create_dir_all(&run_dir.join("debug"))
                .context("Failed to recreate debug log subdirectory for resume")?;
        }

        Ok(Self { run_id, run_dir })
    }

    /// Test-only helper to create a RunLogContext with a fixed run_id.
    ///
    /// This allows testing the collision handling logic by providing a predictable
    /// run_id that can be pre-created on the filesystem to simulate collisions.
    ///
    /// # Warning
    ///
    /// This is intended for testing only. Using a fixed run_id in production
    /// could lead to directory collisions. Always use [`RunLogContext::new`]
    /// or [`RunLogContext::from_checkpoint`] in production code.
    ///
    /// # Examples
    ///
    /// ```ignore
    /// use ralph_workflow::logging::{RunId, RunLogContext};
    ///
    /// // Create a fixed run_id for testing
    /// let fixed_id = RunId::for_test("2026-02-06_14-03-27.123Z");
    /// let ctx = RunLogContext::for_testing(fixed_id, &workspace)?;
    /// ```
    pub fn for_testing(base_run_id: RunId, workspace: &dyn Workspace) -> Result<Self> {
        // Try base run_id first, then collision variants 1-99
        for counter in 0..=99 {
            let run_id = if counter == 0 {
                base_run_id.clone()
            } else {
                base_run_id.with_collision_counter(counter)
            };

            let run_dir = PathBuf::from(format!(".agent/logs-{}", run_id));
            let agents_dir = run_dir.join("agents");

            // Fast path: if agents subdirectory exists, this run_id is taken
            if workspace.exists(&agents_dir) {
                continue;
            }

            // Try to create the run directory and subdirectories
            // create_dir_all is idempotent (Ok if directory exists)
            workspace
                .create_dir_all(&run_dir)
                .context("Failed to create run log directory")?;

            workspace
                .create_dir_all(&agents_dir)
                .context("Failed to create agents log subdirectory")?;

            workspace
                .create_dir_all(&run_dir.join("provider"))
                .context("Failed to create provider log subdirectory")?;

            workspace
                .create_dir_all(&run_dir.join("debug"))
                .context("Failed to create debug log subdirectory")?;

            // Verify we're the ones who created it (agents_dir should exist now)
            // If it doesn't, another process might have raced us, try next variant
            if workspace.exists(&agents_dir) {
                return Ok(Self { run_id, run_dir });
            }
        }

        // If we exhausted all collision counters, bail
        anyhow::bail!(
            "Too many collisions creating run log directory (tried base + 99 variants). \
             This is extremely rare (100+ runs in the same millisecond). \
             Possible causes: clock skew, or filesystem issues. \
             Suggestion: Wait 1ms and retry, or check system clock."
        )
    }

    /// Get a reference to the run ID.
    ///
    /// This is the timestamp-based log run ID (format: `YYYY-MM-DD_HH-mm-ss.SSSZ[-NN]`)
    /// used for naming the per-run log directory. It is distinct from the UUID-based
    /// `run_id` field stored in `PipelineCheckpoint`, which uniquely identifies the
    /// execution session.
    pub fn run_id(&self) -> &RunId {
        &self.run_id
    }

    /// Get the run directory path (relative to workspace root).
    pub fn run_dir(&self) -> &Path {
        &self.run_dir
    }

    /// Get the path to the pipeline log file.
    pub fn pipeline_log(&self) -> PathBuf {
        self.run_dir.join("pipeline.log")
    }

    /// Get the path to the event loop log file.
    pub fn event_loop_log(&self) -> PathBuf {
        self.run_dir.join("event_loop.log")
    }

    /// Get the path to the event loop trace file (crash-only).
    pub fn event_loop_trace(&self) -> PathBuf {
        self.run_dir.join("event_loop_trace.jsonl")
    }

    /// Get the path to an agent log file.
    ///
    /// # Arguments
    /// * `phase` - Phase name (e.g., "planning", "dev", "reviewer", "commit")
    /// * `index` - Invocation index within the phase (1-based)
    /// * `attempt` - Optional retry attempt counter (1 for the first retry, 2 for the second retry, etc.; `None` for the initial attempt, which gets no retry suffix)
    ///
    /// # Returns
    /// Path like `.agent/logs-<run_id>/agents/planning_1.log` or
    /// `.agent/logs-<run_id>/agents/dev_2_a1.log` for retries.
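    ///
    /// # Examples
    ///
    /// A quick sketch (assumes `ctx` is a `RunLogContext` created earlier):
    ///
    /// ```ignore
    /// let first = ctx.agent_log("planning", 1, None);   // .../agents/planning_1.log
    /// let retry = ctx.agent_log("dev", 2, Some(1));     // .../agents/dev_2_a1.log
    /// ```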
    pub fn agent_log(&self, phase: &str, index: u32, attempt: Option<u32>) -> PathBuf {
        let filename = if let Some(a) = attempt {
            format!("{}_{}_a{}.log", phase, index, a)
        } else {
            format!("{}_{}.log", phase, index)
        };
        self.run_dir.join("agents").join(filename)
    }

    /// Get the path to a provider streaming log file.
    ///
    /// # Arguments
    /// * `name` - Provider log filename (e.g., "claude-stream_dev_1.jsonl")
    ///
    /// # Returns
    /// Path like `.agent/logs-<run_id>/provider/claude-stream_dev_1.jsonl`.
    pub fn provider_log(&self, name: &str) -> PathBuf {
        self.run_dir.join("provider").join(name)
    }

    /// Get the path to the run metadata file (run.json).
    pub fn run_metadata(&self) -> PathBuf {
        self.run_dir.join("run.json")
    }

    /// Write run.json metadata file.
    ///
    /// This should be called early in pipeline execution to record
    /// essential metadata for debugging and tooling.
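    ///
    /// # Examples
    ///
    /// A minimal sketch with placeholder values (assumes `ctx` and `workspace`
    /// were created earlier in the impure layer):
    ///
    /// ```ignore
    /// let metadata = RunMetadata {
    ///     run_id: ctx.run_id().to_string(),
    ///     started_at_utc: "2026-02-06T14:03:27.123Z".to_string(),
    ///     command: "ralph".to_string(),
    ///     resume: false,
    ///     repo_root: "/path/to/repo".to_string(),
    ///     ralph_version: env!("CARGO_PKG_VERSION").to_string(),
    ///     pid: Some(std::process::id()),
    ///     config_summary: None,
    /// };
    /// ctx.write_run_metadata(&workspace, &metadata)?;
    /// ```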
    pub fn write_run_metadata(
        &self,
        workspace: &dyn Workspace,
        metadata: &RunMetadata,
    ) -> Result<()> {
        let path = self.run_metadata();
        let json = serde_json::to_string_pretty(metadata).with_context(|| {
            format!(
                "Failed to serialize run metadata for run_id '{}'. \
                 This usually means a field contains data that cannot be represented as JSON.",
                self.run_id
            )
        })?;
        workspace.write(&path, &json).with_context(|| {
            format!(
                "Failed to write run.json to '{}'. Check filesystem permissions and disk space.",
                path.display()
            )
        })
    }
}

/// Metadata recorded in run.json for each pipeline run.
///
/// This file is written at the start of each run to provide context
/// for debugging and tooling. It anchors the run with essential info
/// like command invocation, timestamps, and environment details.
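///
/// An illustrative `run.json` (hypothetical values; `pid` and `config_summary`
/// are omitted when `None`):
///
/// ```json
/// {
///   "run_id": "2026-02-06_14-03-27.123Z",
///   "started_at_utc": "2026-02-06T14:03:27.123Z",
///   "command": "ralph",
///   "resume": false,
///   "repo_root": "/path/to/repo",
///   "ralph_version": "0.6.3",
///   "pid": 12345
/// }
/// ```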
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RunMetadata {
    /// Timestamp-based run identifier (matches directory name)
    ///
    /// Format: `YYYY-MM-DD_HH-mm-ss.SSSZ[-NN]` (e.g., `2026-02-06_14-03-27.123Z`)
    ///
    /// This is the log run ID used for the per-run log directory and is distinct
    /// from the UUID-based `run_id` field in `PipelineCheckpoint` which uniquely
    /// identifies the execution session.
    pub run_id: String,

    /// Timestamp when run started (UTC, RFC3339)
    pub started_at_utc: String,

    /// Command as invoked by user (e.g., "ralph" or "ralph --resume")
    pub command: String,

    /// Whether this is a resumed session
    pub resume: bool,

    /// Absolute path to repository root
    pub repo_root: String,

    /// Ralph version (from Cargo.toml)
    pub ralph_version: String,

    /// Process ID (if available)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub pid: Option<u32>,

    /// Configuration summary (non-secret metadata)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub config_summary: Option<ConfigSummary>,
}

/// Non-secret configuration summary for run.json.
///
/// Captures high-level config info useful for debugging without
/// exposing any sensitive data (API keys, tokens, etc.).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConfigSummary {
    /// Developer agent name (if configured)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub developer_agent: Option<String>,

    /// Reviewer agent name (if configured)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reviewer_agent: Option<String>,

    /// Total iterations configured
    #[serde(skip_serializing_if = "Option::is_none")]
    pub total_iterations: Option<u32>,

    /// Total reviewer passes configured
    #[serde(skip_serializing_if = "Option::is_none")]
    pub total_reviewer_passes: Option<u32>,
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::workspace::WorkspaceFs;

    #[test]
    fn test_run_log_context_creation() {
        let tempdir = tempfile::tempdir().unwrap();
        let workspace = WorkspaceFs::new(tempdir.path().to_path_buf());

        let ctx = RunLogContext::new(&workspace).unwrap();

        // Verify run directory exists
        assert!(workspace.exists(ctx.run_dir()));

        // Verify subdirectories exist
        assert!(workspace.exists(&ctx.run_dir().join("agents")));
        assert!(workspace.exists(&ctx.run_dir().join("provider")));
        assert!(workspace.exists(&ctx.run_dir().join("debug")));
    }

    #[test]
    fn test_run_log_context_path_resolution() {
        let tempdir = tempfile::tempdir().unwrap();
        let workspace = WorkspaceFs::new(tempdir.path().to_path_buf());

        let ctx = RunLogContext::new(&workspace).unwrap();

        // Test pipeline log path
        let pipeline_log = ctx.pipeline_log();
        assert!(pipeline_log.ends_with("pipeline.log"));

        // Test event loop log path
        let event_loop_log = ctx.event_loop_log();
        assert!(event_loop_log.ends_with("event_loop.log"));

        // Test agent log path (no attempt)
        let agent_log = ctx.agent_log("planning", 1, None);
        assert!(agent_log.ends_with("agents/planning_1.log"));

        // Test agent log path (with attempt)
        let agent_log_retry = ctx.agent_log("dev", 2, Some(3));
        assert!(agent_log_retry.ends_with("agents/dev_2_a3.log"));

        // Test provider log path
        let provider_log = ctx.provider_log("claude-stream.jsonl");
        assert!(provider_log.ends_with("provider/claude-stream.jsonl"));
    }

    #[test]
    fn test_run_log_context_from_checkpoint() {
        let tempdir = tempfile::tempdir().unwrap();
        let workspace = WorkspaceFs::new(tempdir.path().to_path_buf());

        let original_id = "2026-02-06_14-03-27.123Z";
        let ctx = RunLogContext::from_checkpoint(original_id, &workspace).unwrap();

        assert_eq!(ctx.run_id().as_str(), original_id);
        assert!(workspace.exists(ctx.run_dir()));
    }

    #[test]
    fn test_run_metadata_serialization() {
        let metadata = RunMetadata {
            run_id: "2026-02-06_14-03-27.123Z".to_string(),
            started_at_utc: "2026-02-06T14:03:27.123Z".to_string(),
            command: "ralph".to_string(),
            resume: false,
            repo_root: "/tmp/test".to_string(),
            ralph_version: "0.6.3".to_string(),
            pid: Some(12345),
            config_summary: Some(ConfigSummary {
                developer_agent: Some("claude".to_string()),
                reviewer_agent: Some("claude".to_string()),
                total_iterations: Some(3),
                total_reviewer_passes: Some(1),
            }),
        };

        let json = serde_json::to_string_pretty(&metadata).unwrap();
        assert!(json.contains("run_id"));
        assert!(json.contains("2026-02-06_14-03-27.123Z"));
        assert!(json.contains("ralph"));
    }

    #[test]
    fn test_write_run_metadata() {
        let tempdir = tempfile::tempdir().unwrap();
        let workspace = WorkspaceFs::new(tempdir.path().to_path_buf());

        let ctx = RunLogContext::new(&workspace).unwrap();

        let metadata = RunMetadata {
            run_id: ctx.run_id().to_string(),
            started_at_utc: "2026-02-06T14:03:27.123Z".to_string(),
            command: "ralph".to_string(),
            resume: false,
            repo_root: tempdir.path().display().to_string(),
            ralph_version: "0.6.3".to_string(),
            pid: Some(12345),
            config_summary: None,
        };

        ctx.write_run_metadata(&workspace, &metadata).unwrap();

        // Verify file was written
        let json_path = ctx.run_metadata();
        assert!(workspace.exists(&json_path));

        // Verify content
        let content = workspace.read(&json_path).unwrap();
        assert!(content.contains(&ctx.run_id().to_string()));
    }

    #[test]
    fn test_collision_handling() {
        let tempdir = tempfile::tempdir().unwrap();
        let workspace = WorkspaceFs::new(tempdir.path().to_path_buf());

        // Create a fixed run_id that we can use to simulate collision
        let fixed_id = RunId::for_test("2026-02-06_14-03-27.123Z");

        // Create the base directory with agents subdirectory to simulate a complete collision
        let base_dir = PathBuf::from(format!(".agent/logs-{}", fixed_id));
        workspace
            .create_dir_all(&base_dir.join("agents"))
            .expect("Failed to create base directory for collision test");

        // Also create collision variants 1-5 with agents subdirectory
        for i in 1..=5 {
            let collision_dir = PathBuf::from(format!(".agent/logs-{}-{:02}", fixed_id, i));
            workspace
                .create_dir_all(&collision_dir.join("agents"))
                .expect("Failed to create collision directory");
        }

        // Now create a RunLogContext with the fixed base run_id
        // It should skip base and collisions 1-5 and create collision variant 06
        let ctx = RunLogContext::for_testing(fixed_id, &workspace).unwrap();

        // Verify the run_id has a collision suffix -06
        let run_id_str = ctx.run_id().as_str();
        assert!(
            run_id_str.ends_with("-06"),
            "Run ID should have collision suffix -06, got: {}",
            run_id_str
        );

        // Verify the directory exists
        assert!(workspace.exists(ctx.run_dir()));

        // Verify the directory name matches
        let expected_dir = PathBuf::from(format!(".agent/logs-{}", run_id_str));
        assert_eq!(
            ctx.run_dir(),
            &expected_dir,
            "Run directory should match the collision suffix path"
        );
    }

    #[test]
    fn test_collision_exhaustion() {
        let tempdir = tempfile::tempdir().unwrap();
        let workspace = WorkspaceFs::new(tempdir.path().to_path_buf());

        // Create a fixed run_id
        let fixed_id = RunId::for_test("2026-02-06_14-03-27.123Z");

        // Create the base directory and all 99 collision variants with agents subdirectory
        workspace
            .create_dir_all(&PathBuf::from(format!(".agent/logs-{}", fixed_id)).join("agents"))
            .unwrap();
        for i in 1..=99 {
            workspace
                .create_dir_all(
                    &PathBuf::from(format!(".agent/logs-{}-{:02}", fixed_id, i)).join("agents"),
                )
                .unwrap();
        }

        // Now try to create a RunLogContext with the fixed base run_id - it should fail
        let result = RunLogContext::for_testing(fixed_id, &workspace);
        assert!(
            result.is_err(),
            "Should fail when all collision variants are exhausted"
        );

        let err_msg = match result {
            Err(e) => e.to_string(),
            Ok(_) => panic!("Expected error but got success"),
        };
        assert!(
            err_msg.contains("Too many collisions") || err_msg.contains("collisions"),
            "Error message should mention collisions: {}",
            err_msg
        );
    }
}