Skip to main content

matrixcode_core/prompt/
dump.rs

1//! Prompt Export and Observability
2//!
3//! Provides:
4//! - JSONL dump for prompt inspection
5//! - Debug output for development
6//! - Prompt analysis tools
7
8use chrono::{DateTime, Utc};
9use regex::Regex;
10use serde::{Deserialize, Serialize};
11use std::path::{Path, PathBuf};
12use std::sync::OnceLock;
13
14use crate::prompt::AssembledPrompt;
15
// Process-wide regex caches used by `analyze_prompt`.
// Each pattern is compiled lazily, on the first call to its accessor below,
// and the compiled `Regex` is then reused for every later call.
static SECTION_PATTERN: OnceLock<Regex> = OnceLock::new();
static TAG_PATTERN: OnceLock<Regex> = OnceLock::new();
19
20fn get_section_pattern() -> &'static Regex {
21    SECTION_PATTERN.get_or_init(|| Regex::new(r"\[([^\]]+)\]").unwrap())
22}
23
24fn get_tag_pattern() -> &'static Regex {
25    TAG_PATTERN.get_or_init(|| Regex::new(r"<([a-zA-Z_][a-zA-Z0-9_]*)>").unwrap())
26}
27
/// A single dump entry for JSONL export.
///
/// One entry describes one assembled prompt. `PromptDumper::flush` writes
/// each entry as one JSON object per line, and `read_dump_file` parses the
/// same shape back.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DumpEntry {
    /// Time at which the entry was created (`Utc::now()` in `from_prompt`).
    pub timestamp: DateTime<Utc>,
    /// Profile used, stored as the string form of the prompt's profile.
    pub profile: String,
    /// Full prompt content.
    pub prompt: String,
    /// Cached sections count, copied from the assembled prompt.
    pub cached_sections: usize,
    /// Dynamic sections count, copied from the assembled prompt.
    pub dynamic_sections: usize,
    /// Cached tokens estimate, copied from the assembled prompt.
    pub cached_tokens: usize,
    /// Dynamic tokens estimate, copied from the assembled prompt.
    pub dynamic_tokens: usize,
    /// Total tokens estimate, copied from the assembled prompt.
    pub total_tokens: usize,
    /// Cache efficiency percentage, as returned by the prompt's
    /// `cache_efficiency()` at creation time.
    pub cache_efficiency: f64,
    /// Session ID (optional).
    pub session_id: Option<String>,
    /// Conversation ID (optional); `None` unless set via `with_conversation`.
    pub conversation_id: Option<String>,
}
54
55impl DumpEntry {
56    /// Create from assembled prompt
57    pub fn from_prompt(prompt: &AssembledPrompt, session_id: Option<String>) -> Self {
58        Self {
59            timestamp: Utc::now(),
60            profile: prompt.profile.to_string(),
61            prompt: prompt.prompt.clone(),
62            cached_sections: prompt.cached_sections,
63            dynamic_sections: prompt.dynamic_sections,
64            cached_tokens: prompt.cached_tokens,
65            dynamic_tokens: prompt.dynamic_tokens,
66            total_tokens: prompt.total_tokens,
67            cache_efficiency: prompt.cache_efficiency(),
68            session_id,
69            conversation_id: None,
70        }
71    }
72
73    /// Create with additional conversation context
74    pub fn with_conversation(mut self, conversation_id: String) -> Self {
75        self.conversation_id = Some(conversation_id);
76        self
77    }
78}
79
/// Prompt dumper for observability.
///
/// Collects `DumpEntry` values in an in-memory buffer and can echo them to
/// stdout and/or append them to a JSONL file. Configure it with the
/// builder-style methods (`enable_file_dump`, `enable_print`, ...).
pub struct PromptDumper {
    /// Dump file path; `None` until `enable_file_dump` is called.
    dump_path: Option<PathBuf>,
    /// Whether to dump to file.
    dump_enabled: bool,
    /// Whether to print each entry to stdout as it is dumped.
    print_enabled: bool,
    /// Session ID attached to every entry this dumper creates.
    session_id: Option<String>,
    /// Entries buffered since the last successful flush/clear.
    entries: Vec<DumpEntry>,
    /// Buffer length at which `dump` triggers a flush (100 by default).
    buffer_size: usize,
}
95
96impl PromptDumper {
97    /// Create a new dumper
98    pub fn new() -> Self {
99        Self {
100            dump_path: None,
101            dump_enabled: false,
102            print_enabled: false,
103            session_id: None,
104            entries: Vec::new(),
105            buffer_size: 100,
106        }
107    }
108
109    /// Enable file dumping
110    pub fn enable_file_dump<P: Into<PathBuf>>(mut self, path: P) -> Self {
111        self.dump_path = Some(path.into());
112        self.dump_enabled = true;
113        self
114    }
115
116    /// Enable stdout printing
117    pub fn enable_print(mut self) -> Self {
118        self.print_enabled = true;
119        self
120    }
121
122    /// Set session ID
123    pub fn with_session(mut self, session_id: String) -> Self {
124        self.session_id = Some(session_id);
125        self
126    }
127
128    /// Set buffer size
129    pub fn with_buffer_size(mut self, size: usize) -> Self {
130        self.buffer_size = size;
131        self
132    }
133
134    /// Dump an assembled prompt
135    pub fn dump(&mut self, prompt: &AssembledPrompt) {
136        let entry = DumpEntry::from_prompt(prompt, self.session_id.clone());
137
138        // Print if enabled
139        if self.print_enabled {
140            self.print_entry(&entry);
141        }
142
143        // Buffer entry
144        self.entries.push(entry);
145
146        // Flush if buffer is full
147        if self.entries.len() >= self.buffer_size {
148            self.flush();
149        }
150    }
151
152    /// Dump with conversation ID
153    pub fn dump_with_conversation(&mut self, prompt: &AssembledPrompt, conversation_id: String) {
154        let entry = DumpEntry::from_prompt(prompt, self.session_id.clone())
155            .with_conversation(conversation_id);
156
157        if self.print_enabled {
158            self.print_entry(&entry);
159        }
160
161        self.entries.push(entry);
162
163        if self.entries.len() >= self.buffer_size {
164            self.flush();
165        }
166    }
167
168    /// Print entry to stdout
169    fn print_entry(&self, entry: &DumpEntry) {
170        println!("=== Prompt Dump ===");
171        println!("Timestamp: {}", entry.timestamp);
172        println!("Profile: {}", entry.profile);
173        println!(
174            "Sections: {} cached, {} dynamic",
175            entry.cached_sections, entry.dynamic_sections
176        );
177        println!(
178            "Tokens: {} cached, {} dynamic, {} total",
179            entry.cached_tokens, entry.dynamic_tokens, entry.total_tokens
180        );
181        println!("Cache efficiency: {:.1}%", entry.cache_efficiency);
182        println!("--- Prompt Content ---");
183
184        // Limit output for readability
185        if entry.prompt.len() > 2000 {
186            println!(
187                "{}... (truncated, {} chars total)",
188                &entry.prompt[..2000],
189                entry.prompt.len()
190            );
191        } else {
192            println!("{}", entry.prompt);
193        }
194
195        println!("=== End Dump ===");
196    }
197
198    /// Flush buffer to file
199    pub fn flush(&mut self) {
200        if !self.dump_enabled || self.dump_path.is_none() || self.entries.is_empty() {
201            return;
202        }
203
204        let path = self.dump_path.as_ref().unwrap();
205
206        // Create parent directories if needed
207        if let Some(parent) = path.parent() {
208            if !parent.exists() {
209                if let Err(e) = std::fs::create_dir_all(parent) {
210                    log::warn!("Failed to create dump directory: {}", e);
211                    return;
212                }
213            }
214        }
215
216        // Open file for append
217        match std::fs::OpenOptions::new()
218            .create(true)
219            .append(true)
220            .open(path)
221        {
222            Ok(mut file) => {
223                use std::io::Write;
224                for entry in &self.entries {
225                    match serde_json::to_string(entry) {
226                        Ok(json) => {
227                            if let Err(e) = writeln!(file, "{}", json) {
228                                log::warn!("Failed to write dump entry: {}", e);
229                            }
230                        }
231                        Err(e) => log::warn!("Failed to serialize dump entry: {}", e),
232                    }
233                }
234            }
235            Err(e) => log::warn!("Failed to open dump file {}: {}", path.display(), e),
236        }
237
238        self.entries.clear();
239    }
240
241    /// Get all entries
242    pub fn entries(&self) -> &[DumpEntry] {
243        &self.entries
244    }
245
246    /// Clear entries buffer
247    pub fn clear(&mut self) {
248        self.entries.clear();
249    }
250
251    /// Analyze prompt for debugging
252    pub fn analyze_prompt(prompt: &str) -> PromptAnalysis {
253        let mut analysis = PromptAnalysis::default();
254
255        // Count sections (using pre-compiled pattern)
256        let section_pattern = get_section_pattern();
257        for cap in section_pattern.captures_iter(prompt) {
258            analysis.sections.push(cap[1].to_string());
259        }
260
261        // Count XML tags (using pre-compiled pattern)
262        let tag_pattern = get_tag_pattern();
263        for cap in tag_pattern.captures_iter(prompt) {
264            let tag = cap[1].to_string();
265            analysis.xml_tags.push(tag.clone());
266            analysis
267                .xml_tag_counts
268                .entry(tag)
269                .and_modify(|c| *c += 1)
270                .or_insert(1);
271        }
272
273        // Find cache boundary
274        analysis.has_cache_boundary = prompt.contains(crate::prompt::CACHE_BOUNDARY);
275
276        // Estimate tokens
277        analysis.estimated_tokens = crate::prompt::cache::estimate_tokens(prompt);
278
279        // Character count
280        analysis.char_count = prompt.len();
281
282        // Line count
283        analysis.line_count = prompt.lines().count();
284
285        analysis
286    }
287}
288
289impl Default for PromptDumper {
290    fn default() -> Self {
291        Self::new()
292    }
293}
294
/// Analysis result for a prompt, as produced by `PromptDumper::analyze_prompt`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct PromptAnalysis {
    /// Detected sections: the text inside each `[...]` marker, in order of appearance.
    pub sections: Vec<String>,
    /// Detected XML tags: one element per matched opening tag, so a tag that
    /// appears several times is listed several times.
    pub xml_tags: Vec<String>,
    /// XML tag counts: number of occurrences per tag name.
    pub xml_tag_counts: std::collections::HashMap<String, usize>,
    /// Whether the prompt contains the cache boundary marker.
    pub has_cache_boundary: bool,
    /// Estimated tokens.
    pub estimated_tokens: usize,
    /// Character count.
    pub char_count: usize,
    /// Line count.
    pub line_count: usize,
}
313
314impl PromptAnalysis {
315    /// Print analysis summary
316    pub fn print_summary(&self) {
317        println!("Prompt Analysis Summary:");
318        println!("  Sections: {:?}", self.sections);
319        println!(
320            "  XML tags: {} unique, {:?} counts",
321            self.xml_tags.len(),
322            self.xml_tag_counts
323        );
324        println!("  Cache boundary: {}", self.has_cache_boundary);
325        println!("  Tokens estimate: {}", self.estimated_tokens);
326        println!("  Characters: {}", self.char_count);
327        println!("  Lines: {}", self.line_count);
328    }
329}
330
331/// Read dump entries from a JSONL file
332pub fn read_dump_file<P: AsRef<Path>>(path: P) -> Vec<DumpEntry> {
333    let path = path.as_ref();
334    if !path.exists() {
335        return Vec::new();
336    }
337
338    let content = std::fs::read_to_string(path).unwrap_or_default();
339    content
340        .lines()
341        .filter_map(|line| serde_json::from_str::<DumpEntry>(line).ok())
342        .collect()
343}
344
345/// Analyze a dump file for patterns
346pub fn analyze_dump_file<P: AsRef<Path>>(path: P) -> DumpFileAnalysis {
347    let entries = read_dump_file(path);
348
349    let mut analysis = DumpFileAnalysis::default();
350    analysis.total_entries = entries.len();
351
352    for entry in &entries {
353        analysis.total_tokens += entry.total_tokens;
354        analysis.avg_tokens += entry.total_tokens;
355        analysis
356            .profile_counts
357            .entry(entry.profile.clone())
358            .and_modify(|c| *c += 1)
359            .or_insert(1);
360
361        if entry.cache_efficiency > analysis.max_cache_efficiency {
362            analysis.max_cache_efficiency = entry.cache_efficiency;
363        }
364        if entry.cache_efficiency < analysis.min_cache_efficiency
365            || analysis.min_cache_efficiency == 0.0
366        {
367            analysis.min_cache_efficiency = entry.cache_efficiency;
368        }
369    }
370
371    if analysis.total_entries > 0 {
372        analysis.avg_tokens /= analysis.total_entries;
373        analysis.avg_cache_efficiency =
374            entries.iter().map(|e| e.cache_efficiency).sum::<f64>() / analysis.total_entries as f64;
375    }
376
377    analysis
378}
379
/// Analysis of a dump file, as produced by `analyze_dump_file`.
///
/// All fields are zero/empty (the `Default` values) when the file contained
/// no readable entries.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DumpFileAnalysis {
    /// Total entries read from the file.
    pub total_entries: usize,
    /// Total tokens across all entries (sum of each entry's `total_tokens`).
    pub total_tokens: usize,
    /// Average tokens per entry, computed with integer division.
    pub avg_tokens: usize,
    /// Profile counts: number of entries per profile name.
    pub profile_counts: std::collections::HashMap<String, usize>,
    /// Max cache efficiency.
    pub max_cache_efficiency: f64,
    /// Min cache efficiency.
    pub min_cache_efficiency: f64,
    /// Average cache efficiency (arithmetic mean over all entries).
    pub avg_cache_efficiency: f64,
}
398
399impl DumpFileAnalysis {
400    pub fn print_summary(&self) {
401        println!("Dump File Analysis:");
402        println!("  Total entries: {}", self.total_entries);
403        println!("  Total tokens: {}", self.total_tokens);
404        println!("  Average tokens: {}", self.avg_tokens);
405        println!("  Profile distribution: {:?}", self.profile_counts);
406        println!(
407            "  Cache efficiency: min {:.1}%, max {:.1}%, avg {:.1}%",
408            self.min_cache_efficiency, self.max_cache_efficiency, self.avg_cache_efficiency
409        );
410    }
411}
412
#[cfg(test)]
mod tests {
    use super::*;
    use crate::prompt::{PromptOrchestrator, PromptSection};

    /// An entry built from an assembled prompt carries its profile, text and session ID.
    #[test]
    fn test_dump_entry_creation() {
        let mut orch = PromptOrchestrator::new(std::env::current_dir().unwrap());
        orch.add_section(PromptSection::static_section("test", "test content"));

        let assembled = orch.assemble();
        let entry = DumpEntry::from_prompt(&assembled, Some("session-1".to_string()));

        assert_eq!(entry.profile, "default");
        assert!(entry.prompt.contains("test"));
        assert_eq!(entry.session_id.as_deref(), Some("session-1"));
    }

    /// Dumping without a file target keeps the entry in the in-memory buffer.
    #[test]
    fn test_dumper_basic() {
        let mut orch = PromptOrchestrator::new(std::env::current_dir().unwrap());
        orch.add_section(PromptSection::static_section("identity", "You are AI"));
        let assembled = orch.assemble();

        let mut dumper = PromptDumper::new().enable_print();
        dumper.dump(&assembled);

        assert_eq!(dumper.entries().len(), 1);
    }

    /// Section markers and XML tags are detected; no cache boundary is reported.
    #[test]
    fn test_analyze_prompt() {
        let text = "[identity]\nYou are AI\n\n<context>\nSome context\n</context>";
        let analysis = PromptDumper::analyze_prompt(text);

        assert!(analysis.sections.iter().any(|s| s == "identity"));
        assert!(analysis.xml_tags.iter().any(|t| t == "context"));
        assert!(!analysis.has_cache_boundary);
        assert!(analysis.estimated_tokens > 0);
    }

    /// Printing a summary must not panic.
    #[test]
    fn test_prompt_analysis_summary() {
        PromptDumper::analyze_prompt("[test]\nContent").print_summary();
    }

    /// Entries flushed to a JSONL file can be read back and aggregated.
    #[test]
    fn test_dump_file_analysis() {
        let temp_file = tempfile::NamedTempFile::new().unwrap();
        let dump_path = temp_file.path();

        let mut orch = PromptOrchestrator::new(std::env::current_dir().unwrap());
        orch.add_section(PromptSection::static_section("test", "content"));

        // Write some entries
        let mut dumper = PromptDumper::new()
            .enable_file_dump(dump_path)
            .with_session("test-session".to_string());
        for _ in 0..5 {
            let assembled = orch.assemble();
            dumper.dump(&assembled);
        }
        dumper.flush();

        // Analyze
        let report = analyze_dump_file(dump_path);
        assert_eq!(report.total_entries, 5);
        assert!(report.avg_tokens > 0);
        report.print_summary();
    }
}