syncable_cli/agent/tools/
compression.rs

1//! Smart Context Compression for Tool Outputs
2//!
3//! Implements multi-layer semantic compression with RAG retrieval pattern:
4//! 1. Semantic Deduplication - Group identical patterns
5//! 2. Importance-Weighted Output - Critical=full, Low=counts
6//! 3. Hierarchical Summaries - Multi-level detail
7//! 4. RAG Pattern - Store full data, return summary with retrieval reference
8
9use serde::{Deserialize, Serialize};
10use serde_json::{Value, json};
11use std::collections::HashMap;
12
13use super::output_store;
14
15/// Severity levels for importance-weighted filtering
16#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
17#[serde(rename_all = "lowercase")]
18pub enum Severity {
19    Info,
20    Low,
21    Medium,
22    High,
23    Critical,
24}
25
26impl Severity {
27    pub fn from_str(s: &str) -> Self {
28        match s.to_lowercase().as_str() {
29            "critical" | "error" => Severity::Critical,
30            "high" | "warning" => Severity::High,
31            "medium" => Severity::Medium,
32            "low" | "hint" => Severity::Low,
33            _ => Severity::Info,
34        }
35    }
36}
37
/// A deduplicated pattern representing multiple similar issues
///
/// Produced by `deduplicate_to_patterns`, which groups issues by their
/// code/rule identifier and keeps one representative example per group.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DeduplicatedPattern {
    /// The issue code/type (e.g., "no-resource-limits", "DL3008")
    pub code: String,
    /// Number of occurrences across all issues grouped under this code
    pub count: usize,
    /// Severity level (taken from the first issue in the group)
    pub severity: Severity,
    /// Brief description of the issue (message of the first issue in the group)
    pub message: String,
    /// List of affected files (truncated with a "...+N more" marker if too many)
    pub affected_files: Vec<String>,
    /// One full example issue for context (only populated when the group has more than one member)
    pub example: Option<Value>,
    /// Suggested fix template (taken from the issue's "fix"/"suggestion"/"recommendation" field)
    pub fix_template: Option<String>,
}
56
/// Compressed output ready for LLM context
///
/// Built by `compress_tool_output` and serialized to pretty JSON for the agent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompressedOutput {
    /// Tool that generated this output
    pub tool: String,
    /// Overall status (e.g. "CRITICAL_ISSUES_FOUND", "HIGH_ISSUES_FOUND", "ISSUES_FOUND", "CLEAN")
    pub status: String,
    /// Summary counts by severity
    pub summary: SeveritySummary,
    /// Critical issues - always shown in full
    pub critical_issues: Vec<Value>,
    /// High severity issues - shown in full if few, otherwise patterns
    pub high_issues: Vec<Value>,
    /// Deduplicated patterns for medium/low issues
    pub patterns: Vec<DeduplicatedPattern>,
    /// Reference ID for retrieving full data (issued by the output store)
    pub full_data_ref: String,
    /// Hint for agent on how to retrieve more details
    pub retrieval_hint: String,
}
77
/// Summary counts by severity level
///
/// `total` is the overall issue count; the other fields are per-severity
/// bucket sizes, so they sum to `total`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SeveritySummary {
    /// Total number of issues across all severities
    pub total: usize,
    pub critical: usize,
    pub high: usize,
    pub medium: usize,
    pub low: usize,
    pub info: usize,
}
88
/// Configuration for compression behavior
#[derive(Debug, Clone)]
pub struct CompressionConfig {
    /// Maximum high-severity issues to show in full (default: 10)
    pub max_high_full: usize,
    /// Maximum files to list per pattern (default: 5)
    pub max_files_per_pattern: usize,
    /// Target output size in bytes (default: 15KB)
    pub target_size_bytes: usize,
}

impl Default for CompressionConfig {
    /// Defaults tuned for LLM context: 10 full high-severity issues,
    /// 5 files listed per pattern, ~15KB target output size.
    fn default() -> Self {
        CompressionConfig {
            max_high_full: 10,
            max_files_per_pattern: 5,
            target_size_bytes: 15_000,
        }
    }
}
109
110/// Main compression function - compresses tool output and stores full data for retrieval
111///
112/// # Arguments
113/// * `output` - The raw JSON output from a tool
114/// * `tool_name` - Name of the tool (e.g., "kubelint", "k8s_optimize")
115/// * `config` - Compression configuration
116///
117/// # Returns
118/// JSON string of compressed output, or original if compression not applicable
119pub fn compress_tool_output(output: &Value, tool_name: &str, config: &CompressionConfig) -> String {
120    // Check if output is small enough - no compression needed
121    let raw_str = serde_json::to_string(output).unwrap_or_default();
122    if raw_str.len() <= config.target_size_bytes {
123        return raw_str;
124    }
125
126    // Store full output for later retrieval
127    let ref_id = output_store::store_output(output, tool_name);
128
129    // Extract issues/findings array from the output
130    let issues = extract_issues(output);
131
132    if issues.is_empty() {
133        // Register in session with description
134        let contains = format!("{} analysis data (no issues)", tool_name);
135        output_store::register_session_ref(
136            &ref_id,
137            tool_name,
138            &contains,
139            "0 issues",
140            raw_str.len(),
141        );
142
143        // No issues to compress, just store and return summary
144        let mut result = serde_json::to_string_pretty(&json!({
145            "tool": tool_name,
146            "status": "NO_ISSUES",
147            "summary": { "total": 0 },
148            "full_data_ref": ref_id,
149            "retrieval_hint": format!("Use retrieve_output('{}') for full analysis data", ref_id)
150        }))
151        .unwrap_or(raw_str.clone());
152
153        // Append ALL session refs so agent always knows what's available
154        result.push_str(&output_store::format_session_refs_for_agent());
155        return result;
156    }
157
158    // Classify issues by severity
159    let (critical, high, medium, low, info) = classify_by_severity(&issues);
160
161    // Build summary
162    let summary = SeveritySummary {
163        total: issues.len(),
164        critical: critical.len(),
165        high: high.len(),
166        medium: medium.len(),
167        low: low.len(),
168        info: info.len(),
169    };
170
171    // Critical issues: always full detail
172    let critical_issues: Vec<Value> = critical.clone();
173
174    // High issues: full detail if few, otherwise deduplicate
175    let high_issues: Vec<Value> = if high.len() <= config.max_high_full {
176        high.clone()
177    } else {
178        // Show first few + pattern for rest
179        high.iter().take(config.max_high_full).cloned().collect()
180    };
181
182    // Deduplicate medium/low/info issues into patterns
183    let mut all_lower: Vec<Value> = Vec::new();
184    all_lower.extend(medium.clone());
185    all_lower.extend(low.clone());
186    all_lower.extend(info.clone());
187
188    // Also add remaining high issues if there were too many
189    if high.len() > config.max_high_full {
190        all_lower.extend(high.iter().skip(config.max_high_full).cloned());
191    }
192
193    let patterns = deduplicate_to_patterns(&all_lower, config);
194
195    // Determine status
196    let status = if summary.critical > 0 {
197        "CRITICAL_ISSUES_FOUND"
198    } else if summary.high > 0 {
199        "HIGH_ISSUES_FOUND"
200    } else if summary.total > 0 {
201        "ISSUES_FOUND"
202    } else {
203        "CLEAN"
204    };
205
206    // Register in session registry with meaningful description
207    let contains = match tool_name {
208        "kubelint" => "Kubernetes manifest lint issues (security, best practices)",
209        "k8s_optimize" => "K8s resource optimization recommendations",
210        "analyze" => "Project analysis (languages, frameworks, dependencies)",
211        _ => "Tool analysis results",
212    };
213    let summary_str = format!(
214        "{} issues: {} critical, {} high, {} medium",
215        summary.total, summary.critical, summary.high, summary.medium
216    );
217    output_store::register_session_ref(&ref_id, tool_name, contains, &summary_str, raw_str.len());
218
219    let compressed = CompressedOutput {
220        tool: tool_name.to_string(),
221        status: status.to_string(),
222        summary,
223        critical_issues,
224        high_issues,
225        patterns,
226        full_data_ref: ref_id.clone(),
227        retrieval_hint: format!(
228            "Use retrieve_output('{}', query) to get full details. Query options: 'severity:critical', 'file:path', 'code:DL3008'",
229            ref_id
230        ),
231    };
232
233    let mut result = serde_json::to_string_pretty(&compressed).unwrap_or(raw_str);
234
235    // Append ALL session refs so agent always knows what's available
236    result.push_str(&output_store::format_session_refs_for_agent());
237    result
238}
239
240/// Extract issues/findings array from various output formats
241fn extract_issues(output: &Value) -> Vec<Value> {
242    // Try common field names for issues/findings
243    let issue_fields = [
244        "issues",
245        "findings",
246        "violations",
247        "warnings",
248        "errors",
249        "recommendations",
250        "results",
251        "diagnostics",
252        "failures", // LintResult from kubelint, hadolint, dclint, helmlint
253    ];
254
255    for field in &issue_fields {
256        if let Some(arr) = output.get(field).and_then(|v| v.as_array()) {
257            return arr.clone();
258        }
259    }
260
261    // Check if output itself is an array
262    if let Some(arr) = output.as_array() {
263        return arr.clone();
264    }
265
266    // Try nested structures
267    if let Some(obj) = output.as_object() {
268        for (_, v) in obj {
269            if let Some(arr) = v.as_array()
270                && !arr.is_empty()
271                && is_issue_like(&arr[0])
272            {
273                return arr.clone();
274            }
275        }
276    }
277
278    Vec::new()
279}
280
281/// Check if a value looks like an issue/finding
282fn is_issue_like(value: &Value) -> bool {
283    if let Some(obj) = value.as_object() {
284        // Issues typically have severity, code, message, or file fields
285        obj.contains_key("severity")
286            || obj.contains_key("code")
287            || obj.contains_key("message")
288            || obj.contains_key("rule")
289            || obj.contains_key("level")
290    } else {
291        false
292    }
293}
294
295/// Classify issues by severity level
296fn classify_by_severity(
297    issues: &[Value],
298) -> (Vec<Value>, Vec<Value>, Vec<Value>, Vec<Value>, Vec<Value>) {
299    let mut critical = Vec::new();
300    let mut high = Vec::new();
301    let mut medium = Vec::new();
302    let mut low = Vec::new();
303    let mut info = Vec::new();
304
305    for issue in issues {
306        let severity = get_severity(issue);
307        match severity {
308            Severity::Critical => critical.push(issue.clone()),
309            Severity::High => high.push(issue.clone()),
310            Severity::Medium => medium.push(issue.clone()),
311            Severity::Low => low.push(issue.clone()),
312            Severity::Info => info.push(issue.clone()),
313        }
314    }
315
316    (critical, high, medium, low, info)
317}
318
319/// Extract severity from an issue value
320fn get_severity(issue: &Value) -> Severity {
321    // Try common severity field names
322    let severity_fields = ["severity", "level", "priority", "type"];
323
324    for field in &severity_fields {
325        if let Some(s) = issue.get(field).and_then(|v| v.as_str()) {
326            return Severity::from_str(s);
327        }
328    }
329
330    // Check for error/warning in code field
331    if let Some(code) = issue.get("code").and_then(|v| v.as_str()) {
332        if code.to_lowercase().contains("error") {
333            return Severity::Critical;
334        }
335        if code.to_lowercase().contains("warn") {
336            return Severity::High;
337        }
338    }
339
340    Severity::Medium // Default
341}
342
343/// Get issue code/type for deduplication grouping
344fn get_issue_code(issue: &Value) -> String {
345    // Try common code field names
346    let code_fields = ["code", "rule", "rule_id", "type", "check", "id"];
347
348    for field in &code_fields {
349        if let Some(s) = issue.get(field).and_then(|v| v.as_str()) {
350            return s.to_string();
351        }
352    }
353
354    // Fall back to message hash
355    if let Some(msg) = issue.get("message").and_then(|v| v.as_str()) {
356        return format!("msg:{}", &msg[..msg.len().min(30)]);
357    }
358
359    "unknown".to_string()
360}
361
362/// Get file path from an issue
363fn get_issue_file(issue: &Value) -> Option<String> {
364    let file_fields = ["file", "path", "filename", "location", "source"];
365
366    for field in &file_fields {
367        if let Some(s) = issue.get(field).and_then(|v| v.as_str()) {
368            return Some(s.to_string());
369        }
370        // Handle nested location objects
371        if let Some(loc) = issue.get(field).and_then(|v| v.as_object())
372            && let Some(f) = loc.get("file").and_then(|v| v.as_str())
373        {
374            return Some(f.to_string());
375        }
376    }
377
378    None
379}
380
381/// Get message from an issue
382fn get_issue_message(issue: &Value) -> String {
383    let msg_fields = ["message", "msg", "description", "text", "detail"];
384
385    for field in &msg_fields {
386        if let Some(s) = issue.get(field).and_then(|v| v.as_str()) {
387            return s.to_string();
388        }
389    }
390
391    "No message".to_string()
392}
393
394/// Deduplicate issues into patterns
395fn deduplicate_to_patterns(
396    issues: &[Value],
397    config: &CompressionConfig,
398) -> Vec<DeduplicatedPattern> {
399    // Group by issue code
400    let mut groups: HashMap<String, Vec<&Value>> = HashMap::new();
401
402    for issue in issues {
403        let code = get_issue_code(issue);
404        groups.entry(code).or_default().push(issue);
405    }
406
407    // Convert groups to patterns
408    let mut patterns: Vec<DeduplicatedPattern> = groups
409        .into_iter()
410        .map(|(code, group)| {
411            let first = group[0];
412            let severity = get_severity(first);
413            let message = get_issue_message(first);
414
415            // Collect affected files
416            let mut files: Vec<String> = group.iter().filter_map(|i| get_issue_file(i)).collect();
417            files.dedup();
418
419            let total_files = files.len();
420            let truncated_files: Vec<String> = if files.len() > config.max_files_per_pattern {
421                let mut truncated: Vec<String> = files
422                    .iter()
423                    .take(config.max_files_per_pattern)
424                    .cloned()
425                    .collect();
426                truncated.push(format!(
427                    "...+{} more",
428                    total_files - config.max_files_per_pattern
429                ));
430                truncated
431            } else {
432                files
433            };
434
435            // Extract fix template if available
436            let fix_template = first
437                .get("fix")
438                .or_else(|| first.get("suggestion"))
439                .or_else(|| first.get("recommendation"))
440                .and_then(|v| v.as_str())
441                .map(|s| s.to_string());
442
443            DeduplicatedPattern {
444                code,
445                count: group.len(),
446                severity,
447                message,
448                affected_files: truncated_files,
449                example: if group.len() > 1 {
450                    Some(first.clone())
451                } else {
452                    None
453                },
454                fix_template,
455            }
456        })
457        .collect();
458
459    // Sort by severity (critical first) then by count
460    patterns.sort_by(|a, b| {
461        b.severity
462            .cmp(&a.severity)
463            .then_with(|| b.count.cmp(&a.count))
464    });
465
466    patterns
467}
468
469/// Compress analyze_project output specifically
470///
471/// Handles both:
472/// - MonorepoAnalysis: has "projects" array, "is_monorepo", "root_path"
473/// - ProjectAnalysis: flat structure with "languages", "technologies" at top level
474///
475/// For large analysis, returns a minimal summary and stores full data for retrieval.
476pub fn compress_analysis_output(output: &Value, config: &CompressionConfig) -> String {
477    let raw_str = serde_json::to_string(output).unwrap_or_default();
478    if raw_str.len() <= config.target_size_bytes {
479        return raw_str;
480    }
481
482    // Store full output for later retrieval
483    let ref_id = output_store::store_output(output, "analyze_project");
484
485    // Build a MINIMAL summary - just enough to understand the project
486    let mut summary = json!({
487        "tool": "analyze_project",
488        "status": "ANALYSIS_COMPLETE",
489        "full_data_ref": ref_id.clone()
490    });
491
492    let summary_obj = summary.as_object_mut().unwrap();
493
494    // Detect output type and extract accordingly
495    let is_monorepo = output.get("projects").is_some() || output.get("is_monorepo").is_some();
496    let is_project_analysis =
497        output.get("languages").is_some() && output.get("analysis_metadata").is_some();
498
499    if is_monorepo {
500        // MonorepoAnalysis structure
501        if let Some(mono) = output.get("is_monorepo").and_then(|v| v.as_bool()) {
502            summary_obj.insert("is_monorepo".to_string(), json!(mono));
503        }
504        if let Some(root) = output.get("root_path").and_then(|v| v.as_str()) {
505            summary_obj.insert("root_path".to_string(), json!(root));
506        }
507
508        if let Some(projects) = output.get("projects").and_then(|v| v.as_array()) {
509            summary_obj.insert("project_count".to_string(), json!(projects.len()));
510
511            let mut all_languages: Vec<String> = Vec::new();
512            let mut all_frameworks: Vec<String> = Vec::new();
513            let mut project_names: Vec<String> = Vec::new();
514
515            for project in projects.iter().take(20) {
516                if let Some(name) = project.get("name").and_then(|v| v.as_str()) {
517                    project_names.push(name.to_string());
518                }
519                if let Some(analysis) = project.get("analysis") {
520                    if let Some(langs) = analysis.get("languages").and_then(|v| v.as_array()) {
521                        for lang in langs {
522                            if let Some(name) = lang.get("name").and_then(|v| v.as_str())
523                                && !all_languages.contains(&name.to_string())
524                            {
525                                all_languages.push(name.to_string());
526                            }
527                        }
528                    }
529                    if let Some(fws) = analysis.get("frameworks").and_then(|v| v.as_array()) {
530                        for fw in fws {
531                            if let Some(name) = fw.get("name").and_then(|v| v.as_str())
532                                && !all_frameworks.contains(&name.to_string())
533                            {
534                                all_frameworks.push(name.to_string());
535                            }
536                        }
537                    }
538                }
539            }
540
541            summary_obj.insert("project_names".to_string(), json!(project_names));
542            summary_obj.insert("languages_detected".to_string(), json!(all_languages));
543            summary_obj.insert("frameworks_detected".to_string(), json!(all_frameworks));
544        }
545    } else if is_project_analysis {
546        // ProjectAnalysis flat structure - languages/technologies at top level
547        if let Some(root) = output.get("project_root").and_then(|v| v.as_str()) {
548            summary_obj.insert("project_root".to_string(), json!(root));
549        }
550        if let Some(arch) = output.get("architecture_type").and_then(|v| v.as_str()) {
551            summary_obj.insert("architecture_type".to_string(), json!(arch));
552        }
553        if let Some(proj_type) = output.get("project_type").and_then(|v| v.as_str()) {
554            summary_obj.insert("project_type".to_string(), json!(proj_type));
555        }
556
557        // Extract languages (at top level)
558        if let Some(langs) = output.get("languages").and_then(|v| v.as_array()) {
559            let names: Vec<&str> = langs
560                .iter()
561                .filter_map(|l| l.get("name").and_then(|n| n.as_str()))
562                .collect();
563            summary_obj.insert("languages_detected".to_string(), json!(names));
564        }
565
566        // Extract technologies (at top level)
567        if let Some(techs) = output.get("technologies").and_then(|v| v.as_array()) {
568            let names: Vec<&str> = techs
569                .iter()
570                .filter_map(|t| t.get("name").and_then(|n| n.as_str()))
571                .collect();
572            summary_obj.insert("technologies_detected".to_string(), json!(names));
573        }
574
575        // Extract services (include names, not just count)
576        if let Some(services) = output.get("services").and_then(|v| v.as_array()) {
577            summary_obj.insert("services_count".to_string(), json!(services.len()));
578            // Include service names so agent knows what microservices exist
579            let service_names: Vec<&str> = services
580                .iter()
581                .filter_map(|s| s.get("name").and_then(|n| n.as_str()))
582                .collect();
583            if !service_names.is_empty() {
584                summary_obj.insert("services_detected".to_string(), json!(service_names));
585            }
586        }
587    }
588
589    // CRITICAL: Include retrieval instructions prominently
590    summary_obj.insert(
591        "retrieval_instructions".to_string(),
592        json!({
593            "message": "Full analysis stored. Use retrieve_output with queries to get specific sections.",
594            "ref_id": ref_id,
595            "available_queries": [
596                "section:summary - Project overview",
597                "section:languages - All detected languages",
598                "section:frameworks - All detected frameworks/technologies",
599                "section:services - All detected services",
600                "language:<name> - Details for specific language (e.g., language:Rust)",
601                "framework:<name> - Details for specific framework"
602            ],
603            "example": format!("retrieve_output('{}', 'section:summary')", ref_id)
604        }),
605    );
606
607    // Build session summary
608    let project_count = output
609        .get("projects")
610        .and_then(|v| v.as_array())
611        .map(|a| a.len())
612        .unwrap_or(1);
613    let summary_str = format!(
614        "{} project(s), {} bytes stored",
615        project_count,
616        raw_str.len()
617    );
618
619    // Register in session registry
620    output_store::register_session_ref(
621        &ref_id,
622        "analyze_project",
623        "Full project analysis (use section queries to retrieve specific data)",
624        &summary_str,
625        raw_str.len(),
626    );
627
628    // Return minimal JSON
629    serde_json::to_string_pretty(&summary).unwrap_or_else(|_| {
630        format!(
631            r#"{{"tool":"analyze_project","status":"STORED","full_data_ref":"{}","message":"Analysis complete. Use retrieve_output('{}', 'section:summary') to view."}}"#,
632            ref_id, ref_id
633        )
634    })
635}
636
#[cfg(test)]
mod tests {
    use super::*;

    /// Severity derives Ord from variant declaration order:
    /// Info < Low < Medium < High < Critical.
    #[test]
    fn test_severity_ordering() {
        assert!(Severity::Critical > Severity::High);
        assert!(Severity::High > Severity::Medium);
        assert!(Severity::Medium > Severity::Low);
        assert!(Severity::Low > Severity::Info);
    }

    /// extract_issues should find the array under the well-known "issues" key.
    #[test]
    fn test_extract_issues_from_array_field() {
        let output = json!({
            "issues": [
                { "code": "DL3008", "severity": "warning", "message": "Pin versions" },
                { "code": "DL3009", "severity": "info", "message": "Delete apt lists" }
            ]
        });

        let issues = extract_issues(&output);
        assert_eq!(issues.len(), 2);
    }

    /// Issues sharing a code collapse into one pattern; count and the
    /// per-pattern affected-file list reflect the whole group.
    #[test]
    fn test_deduplication() {
        let issues = vec![
            json!({ "code": "DL3008", "severity": "warning", "file": "Dockerfile1" }),
            json!({ "code": "DL3008", "severity": "warning", "file": "Dockerfile2" }),
            json!({ "code": "DL3008", "severity": "warning", "file": "Dockerfile3" }),
            json!({ "code": "DL3009", "severity": "info", "file": "Dockerfile1" }),
        ];

        let config = CompressionConfig::default();
        let patterns = deduplicate_to_patterns(&issues, &config);

        assert_eq!(patterns.len(), 2);

        let dl3008 = patterns.iter().find(|p| p.code == "DL3008").unwrap();
        assert_eq!(dl3008.count, 3);
        assert_eq!(dl3008.affected_files.len(), 3);
    }

    /// Outputs at or below target_size_bytes are passed through verbatim,
    /// with no storage reference injected.
    #[test]
    fn test_small_output_not_compressed() {
        let small_output = json!({
            "issues": [
                { "code": "test", "severity": "low" }
            ]
        });

        let config = CompressionConfig {
            target_size_bytes: 10000,
            ..Default::default()
        };

        let result = compress_tool_output(&small_output, "test", &config);
        // Should return original (not compressed) since it's small
        assert!(!result.contains("full_data_ref"));
    }
}