// syncable_cli/agent/tools/compression.rs

1//! Smart Context Compression for Tool Outputs
2//!
3//! Implements multi-layer semantic compression with RAG retrieval pattern:
4//! 1. Semantic Deduplication - Group identical patterns
5//! 2. Importance-Weighted Output - Critical=full, Low=counts
6//! 3. Hierarchical Summaries - Multi-level detail
7//! 4. RAG Pattern - Store full data, return summary with retrieval reference
8
9use serde::{Deserialize, Serialize};
10use serde_json::{Value, json};
11use std::collections::HashMap;
12
13use super::output_store;
14
/// Severity levels for importance-weighted filtering
///
/// Variants are declared least-severe first so the derived `Ord` makes
/// `Critical` compare greatest (relied on by the pattern sort in
/// `deduplicate_to_patterns` and checked by `test_severity_ordering`).
/// Serialized in lowercase (e.g. `"critical"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum Severity {
    Info,
    Low,
    Medium,
    High,
    Critical,
}
25
26impl Severity {
27    pub fn from_str(s: &str) -> Self {
28        match s.to_lowercase().as_str() {
29            "critical" | "error" => Severity::Critical,
30            "high" | "warning" => Severity::High,
31            "medium" => Severity::Medium,
32            "low" | "hint" => Severity::Low,
33            _ => Severity::Info,
34        }
35    }
36}
37
/// A deduplicated pattern representing multiple similar issues
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DeduplicatedPattern {
    /// The issue code/type (e.g., "no-resource-limits", "DL3008")
    pub code: String,
    /// Number of occurrences
    pub count: usize,
    /// Severity level
    pub severity: Severity,
    /// Brief description of the issue (taken from the first issue in the group)
    pub message: String,
    /// List of affected files (truncated if too many; a trailing
    /// "...+N more" entry marks a truncated list)
    pub affected_files: Vec<String>,
    /// One full example for context (populated only when count > 1)
    pub example: Option<Value>,
    /// Suggested fix template
    pub fix_template: Option<String>,
}
56
/// Compressed output ready for LLM context
///
/// Built by `compress_tool_output` when the raw tool output exceeds the
/// configured size target; the complete payload remains retrievable through
/// `full_data_ref`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompressedOutput {
    /// Tool that generated this output
    pub tool: String,
    /// Overall status (e.g. "CRITICAL_ISSUES_FOUND", "ISSUES_FOUND", "CLEAN")
    pub status: String,
    /// Summary counts by severity
    pub summary: SeveritySummary,
    /// Critical issues - always shown in full
    pub critical_issues: Vec<Value>,
    /// High severity issues - shown in full if few, otherwise patterns
    pub high_issues: Vec<Value>,
    /// Deduplicated patterns for medium/low issues
    pub patterns: Vec<DeduplicatedPattern>,
    /// Reference ID for retrieving full data
    pub full_data_ref: String,
    /// Hint for agent on how to retrieve more details
    pub retrieval_hint: String,
}
77
/// Summary counts by severity level
///
/// `total` is the size of the extracted issue list; the per-severity fields
/// partition it (every issue lands in exactly one bucket).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SeveritySummary {
    pub total: usize,
    pub critical: usize,
    pub high: usize,
    pub medium: usize,
    pub low: usize,
    pub info: usize,
}
88
/// Configuration for compression behavior
#[derive(Debug, Clone)]
pub struct CompressionConfig {
    /// Maximum high-severity issues to show in full (default: 10)
    pub max_high_full: usize,
    /// Maximum files to list per pattern (default: 5)
    pub max_files_per_pattern: usize,
    /// Target output size in bytes (default: 15KB); outputs at or below this
    /// size are returned uncompressed
    pub target_size_bytes: usize,
}
99
100impl Default for CompressionConfig {
101    fn default() -> Self {
102        Self {
103            max_high_full: 10,
104            max_files_per_pattern: 5,
105            target_size_bytes: 15_000,
106        }
107    }
108}
109
110/// Main compression function - compresses tool output and stores full data for retrieval
111///
112/// # Arguments
113/// * `output` - The raw JSON output from a tool
114/// * `tool_name` - Name of the tool (e.g., "kubelint", "k8s_optimize")
115/// * `config` - Compression configuration
116///
117/// # Returns
118/// JSON string of compressed output, or original if compression not applicable
119pub fn compress_tool_output(output: &Value, tool_name: &str, config: &CompressionConfig) -> String {
120    // Check if output is small enough - no compression needed
121    let raw_str = serde_json::to_string(output).unwrap_or_default();
122    if raw_str.len() <= config.target_size_bytes {
123        return raw_str;
124    }
125
126    // Store full output for later retrieval
127    let ref_id = output_store::store_output(output, tool_name);
128
129    // Extract issues/findings array from the output
130    let issues = extract_issues(output);
131
132    if issues.is_empty() {
133        // Register in session with description
134        let contains = format!("{} analysis data (no issues)", tool_name);
135        output_store::register_session_ref(
136            &ref_id,
137            tool_name,
138            &contains,
139            "0 issues",
140            raw_str.len(),
141        );
142
143        // No issues to compress, just store and return summary
144        let mut result = serde_json::to_string_pretty(&json!({
145            "tool": tool_name,
146            "status": "NO_ISSUES",
147            "summary": { "total": 0 },
148            "full_data_ref": ref_id,
149            "retrieval_hint": format!("Use retrieve_output('{}') for full analysis data", ref_id)
150        }))
151        .unwrap_or(raw_str.clone());
152
153        // Append ALL session refs so agent always knows what's available
154        result.push_str(&output_store::format_session_refs_for_agent());
155        return result;
156    }
157
158    // Classify issues by severity
159    let (critical, high, medium, low, info) = classify_by_severity(&issues);
160
161    // Build summary
162    let summary = SeveritySummary {
163        total: issues.len(),
164        critical: critical.len(),
165        high: high.len(),
166        medium: medium.len(),
167        low: low.len(),
168        info: info.len(),
169    };
170
171    // Critical issues: always full detail
172    let critical_issues: Vec<Value> = critical.clone();
173
174    // High issues: full detail if few, otherwise deduplicate
175    let high_issues: Vec<Value> = if high.len() <= config.max_high_full {
176        high.clone()
177    } else {
178        // Show first few + pattern for rest
179        high.iter().take(config.max_high_full).cloned().collect()
180    };
181
182    // Deduplicate medium/low/info issues into patterns
183    let mut all_lower: Vec<Value> = Vec::new();
184    all_lower.extend(medium.clone());
185    all_lower.extend(low.clone());
186    all_lower.extend(info.clone());
187
188    // Also add remaining high issues if there were too many
189    if high.len() > config.max_high_full {
190        all_lower.extend(high.iter().skip(config.max_high_full).cloned());
191    }
192
193    let patterns = deduplicate_to_patterns(&all_lower, config);
194
195    // Determine status
196    let status = if summary.critical > 0 {
197        "CRITICAL_ISSUES_FOUND"
198    } else if summary.high > 0 {
199        "HIGH_ISSUES_FOUND"
200    } else if summary.total > 0 {
201        "ISSUES_FOUND"
202    } else {
203        "CLEAN"
204    };
205
206    // Register in session registry with meaningful description
207    let contains = match tool_name {
208        "kubelint" => "Kubernetes manifest lint issues (security, best practices)",
209        "k8s_optimize" => "K8s resource optimization recommendations",
210        "analyze" => "Project analysis (languages, frameworks, dependencies)",
211        _ => "Tool analysis results",
212    };
213    let summary_str = format!(
214        "{} issues: {} critical, {} high, {} medium",
215        summary.total, summary.critical, summary.high, summary.medium
216    );
217    output_store::register_session_ref(&ref_id, tool_name, contains, &summary_str, raw_str.len());
218
219    let compressed = CompressedOutput {
220        tool: tool_name.to_string(),
221        status: status.to_string(),
222        summary,
223        critical_issues,
224        high_issues,
225        patterns,
226        full_data_ref: ref_id.clone(),
227        retrieval_hint: format!(
228            "Use retrieve_output('{}', query) to get full details. Query options: 'severity:critical', 'file:path', 'code:DL3008'",
229            ref_id
230        ),
231    };
232
233    let mut result = serde_json::to_string_pretty(&compressed).unwrap_or(raw_str);
234
235    // Append ALL session refs so agent always knows what's available
236    result.push_str(&output_store::format_session_refs_for_agent());
237    result
238}
239
240/// Extract issues/findings array from various output formats
241fn extract_issues(output: &Value) -> Vec<Value> {
242    // Try common field names for issues/findings
243    let issue_fields = [
244        "issues",
245        "findings",
246        "violations",
247        "warnings",
248        "errors",
249        "recommendations",
250        "results",
251        "diagnostics",
252        "failures", // LintResult from kubelint, hadolint, dclint, helmlint
253    ];
254
255    for field in &issue_fields {
256        if let Some(arr) = output.get(field).and_then(|v| v.as_array()) {
257            return arr.clone();
258        }
259    }
260
261    // Check if output itself is an array
262    if let Some(arr) = output.as_array() {
263        return arr.clone();
264    }
265
266    // Try nested structures
267    if let Some(obj) = output.as_object() {
268        for (_, v) in obj {
269            if let Some(arr) = v.as_array() {
270                if !arr.is_empty() && is_issue_like(&arr[0]) {
271                    return arr.clone();
272                }
273            }
274        }
275    }
276
277    Vec::new()
278}
279
280/// Check if a value looks like an issue/finding
281fn is_issue_like(value: &Value) -> bool {
282    if let Some(obj) = value.as_object() {
283        // Issues typically have severity, code, message, or file fields
284        obj.contains_key("severity")
285            || obj.contains_key("code")
286            || obj.contains_key("message")
287            || obj.contains_key("rule")
288            || obj.contains_key("level")
289    } else {
290        false
291    }
292}
293
294/// Classify issues by severity level
295fn classify_by_severity(
296    issues: &[Value],
297) -> (Vec<Value>, Vec<Value>, Vec<Value>, Vec<Value>, Vec<Value>) {
298    let mut critical = Vec::new();
299    let mut high = Vec::new();
300    let mut medium = Vec::new();
301    let mut low = Vec::new();
302    let mut info = Vec::new();
303
304    for issue in issues {
305        let severity = get_severity(issue);
306        match severity {
307            Severity::Critical => critical.push(issue.clone()),
308            Severity::High => high.push(issue.clone()),
309            Severity::Medium => medium.push(issue.clone()),
310            Severity::Low => low.push(issue.clone()),
311            Severity::Info => info.push(issue.clone()),
312        }
313    }
314
315    (critical, high, medium, low, info)
316}
317
318/// Extract severity from an issue value
319fn get_severity(issue: &Value) -> Severity {
320    // Try common severity field names
321    let severity_fields = ["severity", "level", "priority", "type"];
322
323    for field in &severity_fields {
324        if let Some(s) = issue.get(field).and_then(|v| v.as_str()) {
325            return Severity::from_str(s);
326        }
327    }
328
329    // Check for error/warning in code field
330    if let Some(code) = issue.get("code").and_then(|v| v.as_str()) {
331        if code.to_lowercase().contains("error") {
332            return Severity::Critical;
333        }
334        if code.to_lowercase().contains("warn") {
335            return Severity::High;
336        }
337    }
338
339    Severity::Medium // Default
340}
341
342/// Get issue code/type for deduplication grouping
343fn get_issue_code(issue: &Value) -> String {
344    // Try common code field names
345    let code_fields = ["code", "rule", "rule_id", "type", "check", "id"];
346
347    for field in &code_fields {
348        if let Some(s) = issue.get(field).and_then(|v| v.as_str()) {
349            return s.to_string();
350        }
351    }
352
353    // Fall back to message hash
354    if let Some(msg) = issue.get("message").and_then(|v| v.as_str()) {
355        return format!("msg:{}", &msg[..msg.len().min(30)]);
356    }
357
358    "unknown".to_string()
359}
360
361/// Get file path from an issue
362fn get_issue_file(issue: &Value) -> Option<String> {
363    let file_fields = ["file", "path", "filename", "location", "source"];
364
365    for field in &file_fields {
366        if let Some(s) = issue.get(field).and_then(|v| v.as_str()) {
367            return Some(s.to_string());
368        }
369        // Handle nested location objects
370        if let Some(loc) = issue.get(field).and_then(|v| v.as_object()) {
371            if let Some(f) = loc.get("file").and_then(|v| v.as_str()) {
372                return Some(f.to_string());
373            }
374        }
375    }
376
377    None
378}
379
380/// Get message from an issue
381fn get_issue_message(issue: &Value) -> String {
382    let msg_fields = ["message", "msg", "description", "text", "detail"];
383
384    for field in &msg_fields {
385        if let Some(s) = issue.get(field).and_then(|v| v.as_str()) {
386            return s.to_string();
387        }
388    }
389
390    "No message".to_string()
391}
392
393/// Deduplicate issues into patterns
394fn deduplicate_to_patterns(
395    issues: &[Value],
396    config: &CompressionConfig,
397) -> Vec<DeduplicatedPattern> {
398    // Group by issue code
399    let mut groups: HashMap<String, Vec<&Value>> = HashMap::new();
400
401    for issue in issues {
402        let code = get_issue_code(issue);
403        groups.entry(code).or_default().push(issue);
404    }
405
406    // Convert groups to patterns
407    let mut patterns: Vec<DeduplicatedPattern> = groups
408        .into_iter()
409        .map(|(code, group)| {
410            let first = group[0];
411            let severity = get_severity(first);
412            let message = get_issue_message(first);
413
414            // Collect affected files
415            let mut files: Vec<String> = group.iter().filter_map(|i| get_issue_file(i)).collect();
416            files.dedup();
417
418            let total_files = files.len();
419            let truncated_files: Vec<String> = if files.len() > config.max_files_per_pattern {
420                let mut truncated: Vec<String> = files
421                    .iter()
422                    .take(config.max_files_per_pattern)
423                    .cloned()
424                    .collect();
425                truncated.push(format!(
426                    "...+{} more",
427                    total_files - config.max_files_per_pattern
428                ));
429                truncated
430            } else {
431                files
432            };
433
434            // Extract fix template if available
435            let fix_template = first
436                .get("fix")
437                .or_else(|| first.get("suggestion"))
438                .or_else(|| first.get("recommendation"))
439                .and_then(|v| v.as_str())
440                .map(|s| s.to_string());
441
442            DeduplicatedPattern {
443                code,
444                count: group.len(),
445                severity,
446                message,
447                affected_files: truncated_files,
448                example: if group.len() > 1 {
449                    Some(first.clone())
450                } else {
451                    None
452                },
453                fix_template,
454            }
455        })
456        .collect();
457
458    // Sort by severity (critical first) then by count
459    patterns.sort_by(|a, b| {
460        b.severity
461            .cmp(&a.severity)
462            .then_with(|| b.count.cmp(&a.count))
463    });
464
465    patterns
466}
467
468/// Compress analyze_project output specifically
469///
470/// Handles both:
471/// - MonorepoAnalysis: has "projects" array, "is_monorepo", "root_path"
472/// - ProjectAnalysis: flat structure with "languages", "technologies" at top level
473///
474/// For large analysis, returns a minimal summary and stores full data for retrieval.
475pub fn compress_analysis_output(output: &Value, config: &CompressionConfig) -> String {
476    let raw_str = serde_json::to_string(output).unwrap_or_default();
477    if raw_str.len() <= config.target_size_bytes {
478        return raw_str;
479    }
480
481    // Store full output for later retrieval
482    let ref_id = output_store::store_output(output, "analyze_project");
483
484    // Build a MINIMAL summary - just enough to understand the project
485    let mut summary = json!({
486        "tool": "analyze_project",
487        "status": "ANALYSIS_COMPLETE",
488        "full_data_ref": ref_id.clone()
489    });
490
491    let summary_obj = summary.as_object_mut().unwrap();
492
493    // Detect output type and extract accordingly
494    let is_monorepo = output.get("projects").is_some() || output.get("is_monorepo").is_some();
495    let is_project_analysis = output.get("languages").is_some() && output.get("analysis_metadata").is_some();
496
497    if is_monorepo {
498        // MonorepoAnalysis structure
499        if let Some(mono) = output.get("is_monorepo").and_then(|v| v.as_bool()) {
500            summary_obj.insert("is_monorepo".to_string(), json!(mono));
501        }
502        if let Some(root) = output.get("root_path").and_then(|v| v.as_str()) {
503            summary_obj.insert("root_path".to_string(), json!(root));
504        }
505
506        if let Some(projects) = output.get("projects").and_then(|v| v.as_array()) {
507            summary_obj.insert("project_count".to_string(), json!(projects.len()));
508
509            let mut all_languages: Vec<String> = Vec::new();
510            let mut all_frameworks: Vec<String> = Vec::new();
511            let mut project_names: Vec<String> = Vec::new();
512
513            for project in projects.iter().take(20) {
514                if let Some(name) = project.get("name").and_then(|v| v.as_str()) {
515                    project_names.push(name.to_string());
516                }
517                if let Some(analysis) = project.get("analysis") {
518                    if let Some(langs) = analysis.get("languages").and_then(|v| v.as_array()) {
519                        for lang in langs {
520                            if let Some(name) = lang.get("name").and_then(|v| v.as_str()) {
521                                if !all_languages.contains(&name.to_string()) {
522                                    all_languages.push(name.to_string());
523                                }
524                            }
525                        }
526                    }
527                    if let Some(fws) = analysis.get("frameworks").and_then(|v| v.as_array()) {
528                        for fw in fws {
529                            if let Some(name) = fw.get("name").and_then(|v| v.as_str()) {
530                                if !all_frameworks.contains(&name.to_string()) {
531                                    all_frameworks.push(name.to_string());
532                                }
533                            }
534                        }
535                    }
536                }
537            }
538
539            summary_obj.insert("project_names".to_string(), json!(project_names));
540            summary_obj.insert("languages_detected".to_string(), json!(all_languages));
541            summary_obj.insert("frameworks_detected".to_string(), json!(all_frameworks));
542        }
543    } else if is_project_analysis {
544        // ProjectAnalysis flat structure - languages/technologies at top level
545        if let Some(root) = output.get("project_root").and_then(|v| v.as_str()) {
546            summary_obj.insert("project_root".to_string(), json!(root));
547        }
548        if let Some(arch) = output.get("architecture_type").and_then(|v| v.as_str()) {
549            summary_obj.insert("architecture_type".to_string(), json!(arch));
550        }
551        if let Some(proj_type) = output.get("project_type").and_then(|v| v.as_str()) {
552            summary_obj.insert("project_type".to_string(), json!(proj_type));
553        }
554
555        // Extract languages (at top level)
556        if let Some(langs) = output.get("languages").and_then(|v| v.as_array()) {
557            let names: Vec<&str> = langs
558                .iter()
559                .filter_map(|l| l.get("name").and_then(|n| n.as_str()))
560                .collect();
561            summary_obj.insert("languages_detected".to_string(), json!(names));
562        }
563
564        // Extract technologies (at top level)
565        if let Some(techs) = output.get("technologies").and_then(|v| v.as_array()) {
566            let names: Vec<&str> = techs
567                .iter()
568                .filter_map(|t| t.get("name").and_then(|n| n.as_str()))
569                .collect();
570            summary_obj.insert("technologies_detected".to_string(), json!(names));
571        }
572
573        // Extract services (include names, not just count)
574        if let Some(services) = output.get("services").and_then(|v| v.as_array()) {
575            summary_obj.insert("services_count".to_string(), json!(services.len()));
576            // Include service names so agent knows what microservices exist
577            let service_names: Vec<&str> = services
578                .iter()
579                .filter_map(|s| s.get("name").and_then(|n| n.as_str()))
580                .collect();
581            if !service_names.is_empty() {
582                summary_obj.insert("services_detected".to_string(), json!(service_names));
583            }
584        }
585    }
586
587    // CRITICAL: Include retrieval instructions prominently
588    summary_obj.insert(
589        "retrieval_instructions".to_string(),
590        json!({
591            "message": "Full analysis stored. Use retrieve_output with queries to get specific sections.",
592            "ref_id": ref_id,
593            "available_queries": [
594                "section:summary - Project overview",
595                "section:languages - All detected languages",
596                "section:frameworks - All detected frameworks/technologies",
597                "section:services - All detected services",
598                "language:<name> - Details for specific language (e.g., language:Rust)",
599                "framework:<name> - Details for specific framework"
600            ],
601            "example": format!("retrieve_output('{}', 'section:summary')", ref_id)
602        }),
603    );
604
605    // Build session summary
606    let project_count = output
607        .get("projects")
608        .and_then(|v| v.as_array())
609        .map(|a| a.len())
610        .unwrap_or(1);
611    let summary_str = format!(
612        "{} project(s), {} bytes stored",
613        project_count,
614        raw_str.len()
615    );
616
617    // Register in session registry
618    output_store::register_session_ref(
619        &ref_id,
620        "analyze_project",
621        "Full project analysis (use section queries to retrieve specific data)",
622        &summary_str,
623        raw_str.len(),
624    );
625
626    // Return minimal JSON
627    serde_json::to_string_pretty(&summary).unwrap_or_else(|_| {
628        format!(
629            r#"{{"tool":"analyze_project","status":"STORED","full_data_ref":"{}","message":"Analysis complete. Use retrieve_output('{}', 'section:summary') to view."}}"#,
630            ref_id, ref_id
631        )
632    })
633}
634
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_severity_ordering() {
        // Derived Ord follows declaration order, so each level must outrank
        // the one declared before it.
        let ascending = [
            Severity::Info,
            Severity::Low,
            Severity::Medium,
            Severity::High,
            Severity::Critical,
        ];
        for pair in ascending.windows(2) {
            assert!(pair[1] > pair[0]);
        }
    }

    #[test]
    fn test_extract_issues_from_array_field() {
        let payload = json!({
            "issues": [
                { "code": "DL3008", "severity": "warning", "message": "Pin versions" },
                { "code": "DL3009", "severity": "info", "message": "Delete apt lists" }
            ]
        });

        // Both entries under the well-known "issues" key are extracted.
        assert_eq!(extract_issues(&payload).len(), 2);
    }

    #[test]
    fn test_deduplication() {
        let raw_issues = vec![
            json!({ "code": "DL3008", "severity": "warning", "file": "Dockerfile1" }),
            json!({ "code": "DL3008", "severity": "warning", "file": "Dockerfile2" }),
            json!({ "code": "DL3008", "severity": "warning", "file": "Dockerfile3" }),
            json!({ "code": "DL3009", "severity": "info", "file": "Dockerfile1" }),
        ];

        let patterns = deduplicate_to_patterns(&raw_issues, &CompressionConfig::default());

        // Two distinct codes collapse to exactly two patterns.
        assert_eq!(patterns.len(), 2);

        let repeated = patterns
            .iter()
            .find(|p| p.code == "DL3008")
            .expect("DL3008 pattern should exist");
        assert_eq!(repeated.count, 3);
        assert_eq!(repeated.affected_files.len(), 3);
    }

    #[test]
    fn test_small_output_not_compressed() {
        let tiny = json!({
            "issues": [
                { "code": "test", "severity": "low" }
            ]
        });

        let config = CompressionConfig {
            target_size_bytes: 10000,
            ..Default::default()
        };

        // Below the size threshold the raw JSON is returned untouched, so no
        // retrieval reference is attached.
        let result = compress_tool_output(&tiny, "test", &config);
        assert!(!result.contains("full_data_ref"));
    }
}