Skip to main content

pounce_common/
diagnostics.rs

//! Diagnostic-dump infrastructure shared by the solver and the CLI.
//!
//! # Why this exists
//!
//! Debugging a stalled solve or a perf regression usually means
//! capturing the inner state of the IPM at specific iterations:
//! the augmented-system KKT matrix, the iterate, the search step,
//! the line-search trace. Historically this lived as a scatter of
//! `POUNCE_DBG_*` env-vars across the codebase, each with bespoke
//! semantics. This module centralizes the surface so the CLI
//! (`--dump kkt:5-10`) and the dump sites (deep in the linear
//! solver) speak the same vocabulary.
//!
//! # Lifecycle
//!
//! 1. The CLI parses `--dump <cat>[:<spec>]` flags into a
//!    [`DiagnosticsConfig`] and constructs a [`DiagnosticsState`].
//! 2. The application installs the state via
//!    `IpoptApplication::set_diagnostics`, then propagates an
//!    `Rc<DiagnosticsState>` through the solve in the same way
//!    [`crate::timing::TimingStatistics`] is propagated.
//! 3. At the top of each outer iteration, the IPM calls
//!    [`DiagnosticsState::bump_iter`] to advance the current-iter
//!    counter and reset the per-iter solve index.
//! 4. Every dump site (KKT solver, line search, μ oracle, ...) calls
//!    [`DiagnosticsState::want`] to gate the dump, then
//!    [`DiagnosticsState::open_writer`] to obtain a file handle in
//!    the right `iter_NNN/` sub-directory.
//!
//! # File layout
//!
//! ```text
//! <dump_dir>/
//!   manifest.json
//!   iter_005/
//!     kkt_solve_001.jsonl
//!     iterate.json
//!   iter_006/...
//!   resto/
//!     parent_iter_007/
//!       iter_000/kkt_solve_001.jsonl
//!   timing.json
//! ```
//!
//! The `solve_NNN` suffix disambiguates the multi-solve-per-iter case
//! (second-order corrections and perturbation re-solves issue extra
//! factorizations inside one outer iteration). The
//! `resto/parent_iter_NNN/` hierarchy keeps the restoration sub-IPM
//! trace separate from the main solve trace.
50
51use std::collections::HashMap;
52use std::fs;
53use std::io::{BufWriter, Write};
54use std::path::{Path, PathBuf};
55use std::sync::atomic::{AtomicBool, AtomicI32, Ordering};
56
/// One diagnostic category that can be requested with `--dump`.
///
/// Categories map roughly one-to-one to dump sites in the solver.
/// `Kkt` is the only one actually wired in PR-A; the rest are
/// declared up front so `--dump iterate:all` parses today and the
/// follow-up PRs only need to flip the dump-site switch.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum DiagCategory {
    /// Spelled `kkt` on the command line.
    Kkt,
    /// Spelled `iterate` on the command line.
    Iterate,
    /// Spelled `step` on the command line.
    Step,
    /// Spelled `mu` on the command line.
    Mu,
    /// Spelled `ls` on the command line.
    Ls,
    /// Spelled `resto` on the command line.
    Resto,
    /// Spelled `convergence` on the command line.
    Convergence,
    /// Spelled `timing` on the command line.
    Timing,
}

impl DiagCategory {
    /// Every category, in declaration order. `parse` searches this
    /// table, so a new variant must be listed here to become parseable.
    const ALL: [DiagCategory; 8] = [
        DiagCategory::Kkt,
        DiagCategory::Iterate,
        DiagCategory::Step,
        DiagCategory::Mu,
        DiagCategory::Ls,
        DiagCategory::Resto,
        DiagCategory::Convergence,
        DiagCategory::Timing,
    ];

    /// Lower-case name of the category, exactly as accepted by
    /// [`DiagCategory::parse`].
    pub fn as_str(self) -> &'static str {
        use DiagCategory::*;
        match self {
            Kkt => "kkt",
            Iterate => "iterate",
            Step => "step",
            Mu => "mu",
            Ls => "ls",
            Resto => "resto",
            Convergence => "convergence",
            Timing => "timing",
        }
    }

    /// Look up a category by its exact (case-sensitive) name.
    ///
    /// # Errors
    ///
    /// Returns a human-readable message listing the valid names when
    /// `s` matches none of them.
    pub fn parse(s: &str) -> Result<Self, String> {
        Self::ALL
            .iter()
            .copied()
            .find(|candidate| candidate.as_str() == s)
            .ok_or_else(|| {
                format!(
                    "unknown dump category '{s}' (expected one of: kkt, iterate, step, mu, ls, resto, convergence, timing)"
                )
            })
    }
}
105
/// Iteration filter attached to a category. `None` endpoints denote
/// open-ended ranges (`N-` is `Range(Some(N), None)`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IterSpec {
    /// Every iteration matches.
    All,
    /// Exactly one iteration matches.
    Single(i32),
    /// Inclusive `(lo, hi)` range; a `None` endpoint is unbounded.
    Range(Option<i32>, Option<i32>),
}

impl IterSpec {
    /// True when `iter` is selected by this spec. Range endpoints are
    /// inclusive on both sides.
    pub fn includes(&self, iter: i32) -> bool {
        match *self {
            IterSpec::All => true,
            IterSpec::Single(n) => iter == n,
            IterSpec::Range(lo, hi) => {
                // A missing endpoint is treated as the widest possible bound.
                let lo = lo.unwrap_or(i32::MIN);
                let hi = hi.unwrap_or(i32::MAX);
                (lo..=hi).contains(&iter)
            }
        }
    }

    /// Parse the grammar `all | N | N-M | N- | -M`. Negative ints
    /// aren't accepted — iter counts are non-negative by definition.
    ///
    /// Surrounding whitespace is trimmed and the empty string is
    /// treated as `all`. Each bound must consist solely of ASCII
    /// digits, so an explicit sign (`+5`, `5-+10`) is rejected.
    ///
    /// # Errors
    ///
    /// Returns a human-readable message when a bound is not a
    /// non-negative integer that fits in `i32`, or when the end of an
    /// `N-M` range is below its start.
    pub fn parse(s: &str) -> Result<Self, String> {
        let s = s.trim();
        if s.is_empty() || s == "all" {
            return Ok(IterSpec::All);
        }

        // "-M": open start. Checked first so the leading '-' is not
        // mistaken for the separator of an "N-M" range.
        if let Some(rest) = s.strip_prefix('-') {
            let hi = Self::parse_bound(s, rest, || {
                format!("invalid iter-spec '{s}': expected '-M' with non-negative integer M")
            })?;
            return Ok(IterSpec::Range(None, Some(hi)));
        }

        match s.split_once('-') {
            Some((a, b)) => {
                let lo = Self::parse_bound(s, a, || {
                    format!("invalid iter-spec '{s}': '{a}' is not an integer")
                })?;
                if b.is_empty() {
                    // "N-": open end.
                    return Ok(IterSpec::Range(Some(lo), None));
                }
                // "N-M"
                let hi = Self::parse_bound(s, b, || {
                    format!("invalid iter-spec '{s}': '{b}' is not an integer")
                })?;
                if hi < lo {
                    return Err(format!(
                        "invalid iter-spec '{s}': end ({hi}) is below start ({lo})"
                    ));
                }
                Ok(IterSpec::Range(Some(lo), Some(hi)))
            }
            // Bare "N"
            None => Self::parse_bound(s, s, || {
                format!("invalid iter-spec '{s}': expected 'all', 'N', 'N-M', 'N-', or '-M'")
            })
            .map(IterSpec::Single),
        }
    }

    /// Parse one bound of the spec `spec` from `token`.
    ///
    /// A `-` followed by digits is reported as a negative iteration;
    /// any other non-digit input, or a value too large for `i32`, is
    /// reported with the message produced by `malformed`.
    fn parse_bound(
        spec: &str,
        token: &str,
        malformed: impl Fn() -> String,
    ) -> Result<i32, String> {
        let is_digits = |t: &str| !t.is_empty() && t.bytes().all(|b| b.is_ascii_digit());

        if token.strip_prefix('-').is_some_and(is_digits) {
            return Err(format!(
                "invalid iter-spec '{spec}': iter must be non-negative"
            ));
        }
        // `i32::from_str` would accept a leading '+', which is not part
        // of the grammar, so validate the characters up front.
        if !is_digits(token) {
            return Err(malformed());
        }
        token.parse::<i32>().map_err(|_| malformed())
    }
}
184
/// Format in which dump files are written.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DumpFormat {
    /// Newline-delimited JSON records. One record per dump call.
    /// Hackable from a shell one-liner; the default.
    Jsonl,
}

impl DumpFormat {
    /// Look up a format by its exact (case-sensitive) name.
    ///
    /// # Errors
    ///
    /// Returns a human-readable message naming the accepted value
    /// when `s` is anything other than `jsonl`.
    pub fn parse(s: &str) -> Result<Self, String> {
        if s == "jsonl" {
            Ok(DumpFormat::Jsonl)
        } else {
            Err(format!("unknown dump format '{s}' (expected: jsonl)"))
        }
    }
}
200
201/// Static configuration: where to dump, in what format, with what
202/// per-category iter filters. Constructed by the CLI, held by the
203/// application, frozen for the duration of a solve.
204#[derive(Debug, Clone)]
205pub struct DiagnosticsConfig {
206    pub dump_dir: PathBuf,
207    pub format: DumpFormat,
208    pub categories: HashMap<DiagCategory, IterSpec>,
209}
210
211impl DiagnosticsConfig {
212    pub fn new(dump_dir: PathBuf) -> Self {
213        Self {
214            dump_dir,
215            format: DumpFormat::Jsonl,
216            categories: HashMap::new(),
217        }
218    }
219
220    pub fn with_category(mut self, cat: DiagCategory, spec: IterSpec) -> Self {
221        self.categories.insert(cat, spec);
222        self
223    }
224
225    pub fn is_empty(&self) -> bool {
226        self.categories.is_empty()
227    }
228}
229
230/// Live state threaded through the solve via `Rc`. The IPM mutates
231/// `current_iter` and `solves_this_iter`; the dump sites read them.
232/// All fields use atomics so the type is `Send + Sync` even though
233/// the solver itself is single-threaded — keeps the door open for
234/// future parallel inner solvers without an ABI rewrite.
235pub struct DiagnosticsState {
236    pub config: DiagnosticsConfig,
237    current_iter: AtomicI32,
238    solves_this_iter: AtomicI32,
239    in_restoration: AtomicBool,
240    resto_parent_iter: AtomicI32,
241    resto_inner_iter: AtomicI32,
242    resto_solves_this_iter: AtomicI32,
243}
244
245impl DiagnosticsState {
246    /// Create a state and `mkdir -p` the dump directory. Failure to
247    /// create the directory bubbles up so the CLI can exit with a
248    /// clear error before the solve even starts.
249    pub fn new(config: DiagnosticsConfig) -> std::io::Result<Self> {
250        fs::create_dir_all(&config.dump_dir)?;
251        Ok(Self {
252            config,
253            current_iter: AtomicI32::new(-1),
254            solves_this_iter: AtomicI32::new(0),
255            in_restoration: AtomicBool::new(false),
256            resto_parent_iter: AtomicI32::new(-1),
257            resto_inner_iter: AtomicI32::new(-1),
258            resto_solves_this_iter: AtomicI32::new(0),
259        })
260    }
261
262    /// True if the caller should dump `cat` at the current iter.
263    pub fn want(&self, cat: DiagCategory) -> bool {
264        let iter = self.effective_iter();
265        if iter < 0 {
266            return false;
267        }
268        self.config
269            .categories
270            .get(&cat)
271            .map(|spec| spec.includes(iter))
272            .unwrap_or(false)
273    }
274
275    /// Advance the outer-iteration counter and reset the per-iter
276    /// solve index. Called by `IpoptAlgorithm::optimize` at the top
277    /// of each outer iteration.
278    pub fn bump_iter(&self) {
279        if self.in_restoration.load(Ordering::SeqCst) {
280            self.resto_inner_iter.fetch_add(1, Ordering::SeqCst);
281            self.resto_solves_this_iter.store(0, Ordering::SeqCst);
282        } else {
283            self.current_iter.fetch_add(1, Ordering::SeqCst);
284            self.solves_this_iter.store(0, Ordering::SeqCst);
285        }
286    }
287
288    /// Reserve the next per-iter solve index. Returned value is
289    /// 1-based to match the filenames (`kkt_solve_001.jsonl`).
290    pub fn next_solve_index(&self) -> i32 {
291        let counter = if self.in_restoration.load(Ordering::SeqCst) {
292            &self.resto_solves_this_iter
293        } else {
294            &self.solves_this_iter
295        };
296        counter.fetch_add(1, Ordering::SeqCst) + 1
297    }
298
299    /// Mark the start of a restoration sub-IPM run. `parent_iter` is
300    /// the outer iter that triggered restoration; dumps from the
301    /// resto sub-solve land under `resto/parent_iter_NNN/iter_MMM/`.
302    pub fn enter_restoration(&self) {
303        let parent = self.current_iter.load(Ordering::SeqCst);
304        self.resto_parent_iter.store(parent, Ordering::SeqCst);
305        self.resto_inner_iter.store(-1, Ordering::SeqCst);
306        self.resto_solves_this_iter.store(0, Ordering::SeqCst);
307        self.in_restoration.store(true, Ordering::SeqCst);
308    }
309
310    pub fn exit_restoration(&self) {
311        self.in_restoration.store(false, Ordering::SeqCst);
312    }
313
314    pub fn current_iter(&self) -> i32 {
315        self.effective_iter()
316    }
317
318    /// The iter counter that gates current dumps — resto inner iter
319    /// when in restoration, main outer iter otherwise.
320    fn effective_iter(&self) -> i32 {
321        if self.in_restoration.load(Ordering::SeqCst) {
322            self.resto_inner_iter.load(Ordering::SeqCst)
323        } else {
324            self.current_iter.load(Ordering::SeqCst)
325        }
326    }
327
328    /// Resolve the directory a category's dump file should live in,
329    /// creating it if necessary. Returns `None` if the directory
330    /// cannot be created (e.g., filesystem full) — callers should
331    /// silently skip the dump in that case rather than fail the
332    /// solve.
333    pub fn iter_dir(&self) -> Option<PathBuf> {
334        let dir = if self.in_restoration.load(Ordering::SeqCst) {
335            let parent = self.resto_parent_iter.load(Ordering::SeqCst);
336            let inner = self.resto_inner_iter.load(Ordering::SeqCst).max(0);
337            self.config
338                .dump_dir
339                .join(format!("resto/parent_iter_{parent:03}/iter_{inner:03}"))
340        } else {
341            let iter = self.current_iter.load(Ordering::SeqCst).max(0);
342            self.config.dump_dir.join(format!("iter_{iter:03}"))
343        };
344        fs::create_dir_all(&dir).ok()?;
345        Some(dir)
346    }
347
348    /// Open a writer for `<iter_dir>/<filename>`. Caller picks the
349    /// filename so callers that produce multi-solve traces can use
350    /// `next_solve_index` to disambiguate.
351    pub fn open_writer(&self, filename: &str) -> Option<BufWriter<fs::File>> {
352        let dir = self.iter_dir()?;
353        let path = dir.join(filename);
354        fs::File::create(path).ok().map(BufWriter::new)
355    }
356
357    /// Write a one-shot top-level file (manifest, timing summary).
358    /// Always lands directly under `dump_dir`, never under an iter
359    /// sub-directory.
360    pub fn write_top_level(&self, filename: &str, contents: &str) -> std::io::Result<()> {
361        let path = self.config.dump_dir.join(filename);
362        let mut f = fs::File::create(path)?;
363        f.write_all(contents.as_bytes())?;
364        f.flush()
365    }
366
367    pub fn dump_dir(&self) -> &Path {
368        &self.config.dump_dir
369    }
370}
371
#[cfg(test)]
mod tests {
    use super::*;

    /// Every form of the `all | N | N-M | N- | -M` grammar parses to
    /// the expected spec; the empty string is a synonym for `all`.
    #[test]
    fn iter_spec_parses_all_grammar_forms() {
        assert_eq!(IterSpec::parse("").unwrap(), IterSpec::All);
        assert_eq!(IterSpec::parse("all").unwrap(), IterSpec::All);
        assert_eq!(IterSpec::parse("5").unwrap(), IterSpec::Single(5));
        assert_eq!(
            IterSpec::parse("5-10").unwrap(),
            IterSpec::Range(Some(5), Some(10))
        );
        assert_eq!(
            IterSpec::parse("5-").unwrap(),
            IterSpec::Range(Some(5), None)
        );
        assert_eq!(
            IterSpec::parse("-10").unwrap(),
            IterSpec::Range(None, Some(10))
        );
    }

    #[test]
    fn iter_spec_rejects_malformed_input() {
        assert!(IterSpec::parse("abc").is_err());
        assert!(IterSpec::parse("5-3").is_err()); // end below start
        assert!(IterSpec::parse("-x").is_err());
        // Doubled separator: the tail "-10" is a negative bound and is rejected.
        assert!(IterSpec::parse("5--10").is_err());
    }

    #[test]
    fn iter_spec_includes_matches_grammar() {
        assert!(IterSpec::All.includes(0));
        assert!(IterSpec::All.includes(1000));
        assert!(IterSpec::Single(5).includes(5));
        assert!(!IterSpec::Single(5).includes(4));
        let r = IterSpec::Range(Some(5), Some(10));
        assert!(!r.includes(4));
        assert!(r.includes(5));
        assert!(r.includes(7));
        assert!(r.includes(10));
        assert!(!r.includes(11));
        assert!(IterSpec::Range(Some(5), None).includes(1_000_000));
        assert!(IterSpec::Range(None, Some(5)).includes(0));
    }

    #[test]
    fn category_parses_known_names() {
        assert_eq!(DiagCategory::parse("kkt").unwrap(), DiagCategory::Kkt);
        assert_eq!(
            DiagCategory::parse("iterate").unwrap(),
            DiagCategory::Iterate
        );
        assert!(DiagCategory::parse("bogus").is_err());
    }

    #[test]
    fn dump_format_parses_jsonl_only() {
        assert_eq!(DumpFormat::parse("jsonl").unwrap(), DumpFormat::Jsonl);
        assert!(DumpFormat::parse("csv").is_err());
    }

    #[test]
    fn state_gates_on_iter_spec() {
        let tmp = tempdir();
        let cfg = DiagnosticsConfig::new(tmp.clone())
            .with_category(DiagCategory::Kkt, IterSpec::Range(Some(2), Some(4)));
        let state = DiagnosticsState::new(cfg).unwrap();

        // Before bump_iter, current_iter == -1 → no dumps.
        assert!(!state.want(DiagCategory::Kkt));

        state.bump_iter(); // iter 0
        assert!(!state.want(DiagCategory::Kkt));
        state.bump_iter(); // 1
        assert!(!state.want(DiagCategory::Kkt));
        state.bump_iter(); // 2
        assert!(state.want(DiagCategory::Kkt));
        state.bump_iter(); // 3
        assert!(state.want(DiagCategory::Kkt));
        state.bump_iter(); // 4
        assert!(state.want(DiagCategory::Kkt));
        state.bump_iter(); // 5
        assert!(!state.want(DiagCategory::Kkt));

        // Other categories silently skipped (not configured).
        assert!(!state.want(DiagCategory::Iterate));

        fs::remove_dir_all(tmp).ok();
    }

    #[test]
    fn state_emits_solve_indices_and_iter_dirs() {
        let tmp = tempdir();
        let cfg =
            DiagnosticsConfig::new(tmp.clone()).with_category(DiagCategory::Kkt, IterSpec::All);
        let state = DiagnosticsState::new(cfg).unwrap();
        state.bump_iter(); // iter 0
        assert_eq!(state.next_solve_index(), 1);
        assert_eq!(state.next_solve_index(), 2);
        state.bump_iter(); // iter 1
        assert_eq!(state.next_solve_index(), 1);

        let dir = state.iter_dir().unwrap();
        assert!(dir.ends_with("iter_001"));
        fs::remove_dir_all(tmp).ok();
    }

    #[test]
    fn restoration_dumps_live_under_resto_subtree() {
        let tmp = tempdir();
        let cfg =
            DiagnosticsConfig::new(tmp.clone()).with_category(DiagCategory::Kkt, IterSpec::All);
        let state = DiagnosticsState::new(cfg).unwrap();
        state.bump_iter(); // main iter 0
        state.bump_iter(); // main iter 1
        state.enter_restoration();
        state.bump_iter(); // resto inner 0
        let dir = state.iter_dir().unwrap();
        assert!(
            dir.ends_with("resto/parent_iter_001/iter_000"),
            "got {dir:?}"
        );
        assert_eq!(state.next_solve_index(), 1);
        state.exit_restoration();
        let dir = state.iter_dir().unwrap();
        assert!(dir.ends_with("iter_001"), "got {dir:?}");
        fs::remove_dir_all(tmp).ok();
    }

    #[test]
    fn writers_land_in_expected_locations() {
        let tmp = tempdir();
        let cfg =
            DiagnosticsConfig::new(tmp.clone()).with_category(DiagCategory::Kkt, IterSpec::All);
        let state = DiagnosticsState::new(cfg).unwrap();
        state.bump_iter(); // iter 0

        {
            let mut w = state.open_writer("kkt_solve_001.jsonl").unwrap();
            w.write_all(b"{}\n").unwrap();
            w.flush().unwrap();
        }
        assert!(tmp.join("iter_000").join("kkt_solve_001.jsonl").is_file());

        state.write_top_level("manifest.json", "{}").unwrap();
        assert_eq!(
            fs::read_to_string(tmp.join("manifest.json")).unwrap(),
            "{}"
        );
        assert_eq!(state.dump_dir(), tmp.as_path());
        fs::remove_dir_all(tmp).ok();
    }

    /// Create a fresh scratch directory under the system temp dir.
    ///
    /// The name combines the process id, the wall clock and a
    /// process-wide counter. The counter is what guarantees uniqueness
    /// when parallel tests read the same clock value on a platform
    /// with a coarse timer.
    fn tempdir() -> PathBuf {
        static NEXT_ID: AtomicI32 = AtomicI32::new(0);
        let id = NEXT_ID.fetch_add(1, Ordering::SeqCst);
        let p = std::env::temp_dir().join(format!(
            "pounce-diag-test-{}-{}-{}",
            std::process::id(),
            std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .unwrap()
                .as_nanos(),
            id
        ));
        fs::create_dir_all(&p).unwrap();
        p
    }
}