Skip to main content

matrixcode_core/prompt/
dump.rs

//! Prompt Export and Observability
//!
//! Provides:
//! - JSONL dump for prompt inspection
//! - Debug output for development
//! - Prompt analysis tools
7
8use std::path::{Path, PathBuf};
9use std::sync::OnceLock;
10use serde::{Deserialize, Serialize};
11use chrono::{DateTime, Utc};
12use regex::Regex;
13
14use crate::prompt::AssembledPrompt;
15
16// Pre-compiled regex patterns for analyze_prompt
17static SECTION_PATTERN: OnceLock<Regex> = OnceLock::new();
18static TAG_PATTERN: OnceLock<Regex> = OnceLock::new();
19
20fn get_section_pattern() -> &'static Regex {
21    SECTION_PATTERN.get_or_init(|| Regex::new(r"\[([^\]]+)\]").unwrap())
22}
23
24fn get_tag_pattern() -> &'static Regex {
25    TAG_PATTERN.get_or_init(|| Regex::new(r"<([a-zA-Z_][a-zA-Z0-9_]*)>").unwrap())
26}
27
28/// A single dump entry for JSONL export
29#[derive(Debug, Clone, Serialize, Deserialize)]
30pub struct DumpEntry {
31    /// Timestamp
32    pub timestamp: DateTime<Utc>,
33    /// Profile used
34    pub profile: String,
35    /// Full prompt content
36    pub prompt: String,
37    /// Cached sections count
38    pub cached_sections: usize,
39    /// Dynamic sections count
40    pub dynamic_sections: usize,
41    /// Cached tokens estimate
42    pub cached_tokens: usize,
43    /// Dynamic tokens estimate
44    pub dynamic_tokens: usize,
45    /// Total tokens estimate
46    pub total_tokens: usize,
47    /// Cache efficiency percentage
48    pub cache_efficiency: f64,
49    /// Session ID (optional)
50    pub session_id: Option<String>,
51    /// Conversation ID (optional)
52    pub conversation_id: Option<String>,
53}
54
55impl DumpEntry {
56    /// Create from assembled prompt
57    pub fn from_prompt(prompt: &AssembledPrompt, session_id: Option<String>) -> Self {
58        Self {
59            timestamp: Utc::now(),
60            profile: prompt.profile.to_string(),
61            prompt: prompt.prompt.clone(),
62            cached_sections: prompt.cached_sections,
63            dynamic_sections: prompt.dynamic_sections,
64            cached_tokens: prompt.cached_tokens,
65            dynamic_tokens: prompt.dynamic_tokens,
66            total_tokens: prompt.total_tokens,
67            cache_efficiency: prompt.cache_efficiency(),
68            session_id,
69            conversation_id: None,
70        }
71    }
72
73    /// Create with additional conversation context
74    pub fn with_conversation(mut self, conversation_id: String) -> Self {
75        self.conversation_id = Some(conversation_id);
76        self
77    }
78}
79
80/// Prompt dumper for observability
81pub struct PromptDumper {
82    /// Dump file path
83    dump_path: Option<PathBuf>,
84    /// Whether to dump to file
85    dump_enabled: bool,
86    /// Whether to print to stdout
87    print_enabled: bool,
88    /// Session ID
89    session_id: Option<String>,
90    /// Entries buffer
91    entries: Vec<DumpEntry>,
92    /// Maximum entries to buffer before flush
93    buffer_size: usize,
94}
95
96impl PromptDumper {
97    /// Create a new dumper
98    pub fn new() -> Self {
99        Self {
100            dump_path: None,
101            dump_enabled: false,
102            print_enabled: false,
103            session_id: None,
104            entries: Vec::new(),
105            buffer_size: 100,
106        }
107    }
108
109    /// Enable file dumping
110    pub fn enable_file_dump<P: Into<PathBuf>>(mut self, path: P) -> Self {
111        self.dump_path = Some(path.into());
112        self.dump_enabled = true;
113        self
114    }
115
116    /// Enable stdout printing
117    pub fn enable_print(mut self) -> Self {
118        self.print_enabled = true;
119        self
120    }
121
122    /// Set session ID
123    pub fn with_session(mut self, session_id: String) -> Self {
124        self.session_id = Some(session_id);
125        self
126    }
127
128    /// Set buffer size
129    pub fn with_buffer_size(mut self, size: usize) -> Self {
130        self.buffer_size = size;
131        self
132    }
133
134    /// Dump an assembled prompt
135    pub fn dump(&mut self, prompt: &AssembledPrompt) {
136        let entry = DumpEntry::from_prompt(prompt, self.session_id.clone());
137        
138        // Print if enabled
139        if self.print_enabled {
140            self.print_entry(&entry);
141        }
142        
143        // Buffer entry
144        self.entries.push(entry);
145        
146        // Flush if buffer is full
147        if self.entries.len() >= self.buffer_size {
148            self.flush();
149        }
150    }
151
152    /// Dump with conversation ID
153    pub fn dump_with_conversation(&mut self, prompt: &AssembledPrompt, conversation_id: String) {
154        let entry = DumpEntry::from_prompt(prompt, self.session_id.clone())
155            .with_conversation(conversation_id);
156        
157        if self.print_enabled {
158            self.print_entry(&entry);
159        }
160        
161        self.entries.push(entry);
162        
163        if self.entries.len() >= self.buffer_size {
164            self.flush();
165        }
166    }
167
168    /// Print entry to stdout
169    fn print_entry(&self, entry: &DumpEntry) {
170        println!("=== Prompt Dump ===");
171        println!("Timestamp: {}", entry.timestamp);
172        println!("Profile: {}", entry.profile);
173        println!("Sections: {} cached, {} dynamic", entry.cached_sections, entry.dynamic_sections);
174        println!("Tokens: {} cached, {} dynamic, {} total", 
175                 entry.cached_tokens, entry.dynamic_tokens, entry.total_tokens);
176        println!("Cache efficiency: {:.1}%", entry.cache_efficiency);
177        println!("--- Prompt Content ---");
178        
179        // Limit output for readability
180        if entry.prompt.len() > 2000 {
181            println!("{}... (truncated, {} chars total)", 
182                     &entry.prompt[..2000], entry.prompt.len());
183        } else {
184            println!("{}", entry.prompt);
185        }
186        
187        println!("=== End Dump ===");
188    }
189
190    /// Flush buffer to file
191    pub fn flush(&mut self) {
192        if !self.dump_enabled || self.dump_path.is_none() || self.entries.is_empty() {
193            return;
194        }
195
196        let path = self.dump_path.as_ref().unwrap();
197
198        // Create parent directories if needed
199        if let Some(parent) = path.parent() {
200            if !parent.exists() {
201                if let Err(e) = std::fs::create_dir_all(parent) {
202                    log::warn!("Failed to create dump directory: {}", e);
203                    return;
204                }
205            }
206        }
207
208        // Open file for append
209        match std::fs::OpenOptions::new()
210            .create(true)
211            .append(true)
212            .open(path)
213        {
214            Ok(mut file) => {
215                use std::io::Write;
216                for entry in &self.entries {
217                    match serde_json::to_string(entry) {
218                        Ok(json) => {
219                            if let Err(e) = writeln!(file, "{}", json) {
220                                log::warn!("Failed to write dump entry: {}", e);
221                            }
222                        }
223                        Err(e) => log::warn!("Failed to serialize dump entry: {}", e),
224                    }
225                }
226            }
227            Err(e) => log::warn!("Failed to open dump file {}: {}", path.display(), e),
228        }
229
230        self.entries.clear();
231    }
232
233    /// Get all entries
234    pub fn entries(&self) -> &[DumpEntry] {
235        &self.entries
236    }
237
238    /// Clear entries buffer
239    pub fn clear(&mut self) {
240        self.entries.clear();
241    }
242
243    /// Analyze prompt for debugging
244    pub fn analyze_prompt(prompt: &str) -> PromptAnalysis {
245        let mut analysis = PromptAnalysis::default();
246
247        // Count sections (using pre-compiled pattern)
248        let section_pattern = get_section_pattern();
249        for cap in section_pattern.captures_iter(prompt) {
250            analysis.sections.push(cap[1].to_string());
251        }
252
253        // Count XML tags (using pre-compiled pattern)
254        let tag_pattern = get_tag_pattern();
255        for cap in tag_pattern.captures_iter(prompt) {
256            let tag = cap[1].to_string();
257            analysis.xml_tags.push(tag.clone());
258            analysis.xml_tag_counts.entry(tag).and_modify(|c| *c += 1).or_insert(1);
259        }
260        
261        // Find cache boundary
262        analysis.has_cache_boundary = prompt.contains(crate::prompt::CACHE_BOUNDARY);
263        
264        // Estimate tokens
265        analysis.estimated_tokens = crate::prompt::cache::estimate_tokens(prompt);
266        
267        // Character count
268        analysis.char_count = prompt.len();
269        
270        // Line count
271        analysis.line_count = prompt.lines().count();
272        
273        analysis
274    }
275}
276
277impl Default for PromptDumper {
278    fn default() -> Self {
279        Self::new()
280    }
281}
282
283/// Analysis result for a prompt
284#[derive(Debug, Clone, Default, Serialize, Deserialize)]
285pub struct PromptAnalysis {
286    /// Detected sections
287    pub sections: Vec<String>,
288    /// Detected XML tags
289    pub xml_tags: Vec<String>,
290    /// XML tag counts
291    pub xml_tag_counts: std::collections::HashMap<String, usize>,
292    /// Has cache boundary
293    pub has_cache_boundary: bool,
294    /// Estimated tokens
295    pub estimated_tokens: usize,
296    /// Character count
297    pub char_count: usize,
298    /// Line count
299    pub line_count: usize,
300}
301
302impl PromptAnalysis {
303    /// Print analysis summary
304    pub fn print_summary(&self) {
305        println!("Prompt Analysis Summary:");
306        println!("  Sections: {:?}", self.sections);
307        println!("  XML tags: {} unique, {:?} counts", self.xml_tags.len(), self.xml_tag_counts);
308        println!("  Cache boundary: {}", self.has_cache_boundary);
309        println!("  Tokens estimate: {}", self.estimated_tokens);
310        println!("  Characters: {}", self.char_count);
311        println!("  Lines: {}", self.line_count);
312    }
313}
314
315/// Read dump entries from a JSONL file
316pub fn read_dump_file<P: AsRef<Path>>(path: P) -> Vec<DumpEntry> {
317    let path = path.as_ref();
318    if !path.exists() {
319        return Vec::new();
320    }
321    
322    let content = std::fs::read_to_string(path).unwrap_or_default();
323    content.lines()
324        .filter_map(|line| serde_json::from_str::<DumpEntry>(line).ok())
325        .collect()
326}
327
328/// Analyze a dump file for patterns
329pub fn analyze_dump_file<P: AsRef<Path>>(path: P) -> DumpFileAnalysis {
330    let entries = read_dump_file(path);
331    
332    let mut analysis = DumpFileAnalysis::default();
333    analysis.total_entries = entries.len();
334    
335    for entry in &entries {
336        analysis.total_tokens += entry.total_tokens;
337        analysis.avg_tokens += entry.total_tokens;
338        analysis.profile_counts.entry(entry.profile.clone()).and_modify(|c| *c += 1).or_insert(1);
339        
340        if entry.cache_efficiency > analysis.max_cache_efficiency {
341            analysis.max_cache_efficiency = entry.cache_efficiency;
342        }
343        if entry.cache_efficiency < analysis.min_cache_efficiency || analysis.min_cache_efficiency == 0.0 {
344            analysis.min_cache_efficiency = entry.cache_efficiency;
345        }
346    }
347    
348    if analysis.total_entries > 0 {
349        analysis.avg_tokens /= analysis.total_entries;
350        analysis.avg_cache_efficiency = entries.iter().map(|e| e.cache_efficiency).sum::<f64>() / analysis.total_entries as f64;
351    }
352    
353    analysis
354}
355
356/// Analysis of a dump file
357#[derive(Debug, Clone, Default, Serialize, Deserialize)]
358pub struct DumpFileAnalysis {
359    /// Total entries
360    pub total_entries: usize,
361    /// Total tokens across all entries
362    pub total_tokens: usize,
363    /// Average tokens per entry
364    pub avg_tokens: usize,
365    /// Profile counts
366    pub profile_counts: std::collections::HashMap<String, usize>,
367    /// Max cache efficiency
368    pub max_cache_efficiency: f64,
369    /// Min cache efficiency
370    pub min_cache_efficiency: f64,
371    /// Average cache efficiency
372    pub avg_cache_efficiency: f64,
373}
374
375impl DumpFileAnalysis {
376    pub fn print_summary(&self) {
377        println!("Dump File Analysis:");
378        println!("  Total entries: {}", self.total_entries);
379        println!("  Total tokens: {}", self.total_tokens);
380        println!("  Average tokens: {}", self.avg_tokens);
381        println!("  Profile distribution: {:?}", self.profile_counts);
382        println!("  Cache efficiency: min {:.1}%, max {:.1}%, avg {:.1}%",
383                 self.min_cache_efficiency, self.max_cache_efficiency, self.avg_cache_efficiency);
384    }
385}
386
#[cfg(test)]
mod tests {
    use super::*;
    use crate::prompt::{PromptOrchestrator, PromptSection};

    /// An entry built from an assembled prompt carries its profile, text, and session ID.
    #[test]
    fn test_dump_entry_creation() {
        let mut builder = PromptOrchestrator::new(std::env::current_dir().unwrap());
        builder.add_section(PromptSection::static_section("test", "test content"));

        let assembled = builder.assemble();
        let entry = DumpEntry::from_prompt(&assembled, Some("session-1".to_string()));

        assert_eq!(entry.profile, "default");
        assert!(entry.prompt.contains("test"));
        assert_eq!(entry.session_id.as_deref(), Some("session-1"));
    }

    /// Dumping with printing enabled still buffers the entry.
    #[test]
    fn test_dumper_basic() {
        let mut builder = PromptOrchestrator::new(std::env::current_dir().unwrap());
        builder.add_section(PromptSection::static_section("identity", "You are AI"));
        let assembled = builder.assemble();

        let mut dumper = PromptDumper::new().enable_print();
        dumper.dump(&assembled);

        assert_eq!(dumper.entries().len(), 1);
    }

    /// Section markers and XML tags are detected in raw prompt text.
    #[test]
    fn test_analyze_prompt() {
        let text = "[identity]\nYou are AI\n\n<context>\nSome context\n</context>";
        let report = PromptDumper::analyze_prompt(text);

        assert!(report.sections.iter().any(|name| name == "identity"));
        assert!(report.xml_tags.iter().any(|name| name == "context"));
        assert!(!report.has_cache_boundary);
        assert!(report.estimated_tokens > 0);
    }

    /// Printing a summary of an analysis does not panic.
    #[test]
    fn test_prompt_analysis_summary() {
        let report = PromptDumper::analyze_prompt("[test]\nContent");
        report.print_summary();
    }

    /// Entries flushed to a file can be read back and aggregated.
    #[test]
    fn test_dump_file_analysis() {
        let temp_file = tempfile::NamedTempFile::new().unwrap();
        let dump_path = temp_file.path();

        let mut builder = PromptOrchestrator::new(std::env::current_dir().unwrap());
        builder.add_section(PromptSection::static_section("test", "content"));

        // Write some entries
        let mut dumper = PromptDumper::new()
            .enable_file_dump(dump_path)
            .with_session("test-session".to_string());
        for _ in 0..5 {
            let assembled = builder.assemble();
            dumper.dump(&assembled);
        }
        dumper.flush();

        // Analyze
        let summary = analyze_dump_file(dump_path);
        assert_eq!(summary.total_entries, 5);
        assert!(summary.avg_tokens > 0);
        summary.print_summary();
    }
}