// syncable_cli/agent/tools/compression.rs
1//! Smart Context Compression for Tool Outputs
2//!
3//! Implements multi-layer semantic compression with RAG retrieval pattern:
4//! 1. Semantic Deduplication - Group identical patterns
5//! 2. Importance-Weighted Output - Critical=full, Low=counts
6//! 3. Hierarchical Summaries - Multi-level detail
7//! 4. RAG Pattern - Store full data, return summary with retrieval reference
8
9use serde::{Deserialize, Serialize};
10use serde_json::{Value, json};
11use std::collections::HashMap;
12
13use super::output_store;
14
/// Severity levels for importance-weighted filtering
///
/// NOTE: variant order is load-bearing. The derived `Ord`/`PartialOrd` rank
/// variants by declaration order (`Info` lowest … `Critical` highest), and
/// pattern sorting in `deduplicate_to_patterns` relies on that ranking.
/// Do not reorder variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum Severity {
    Info,
    Low,
    Medium,
    High,
    Critical,
}
25
26impl Severity {
27    pub fn from_str(s: &str) -> Self {
28        match s.to_lowercase().as_str() {
29            "critical" | "error" => Severity::Critical,
30            "high" | "warning" => Severity::High,
31            "medium" => Severity::Medium,
32            "low" | "hint" => Severity::Low,
33            _ => Severity::Info,
34        }
35    }
36}
37
/// A deduplicated pattern representing multiple similar issues
///
/// Produced by `deduplicate_to_patterns`: one entry per issue code, carrying
/// the occurrence count, affected files (possibly truncated with a
/// "...+N more" marker), and at most one full example issue for context.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DeduplicatedPattern {
    /// The issue code/type (e.g., "no-resource-limits", "DL3008")
    pub code: String,
    /// Number of occurrences
    pub count: usize,
    /// Severity level (taken from the first issue in the group)
    pub severity: Severity,
    /// Brief description of the issue (taken from the first issue in the group)
    pub message: String,
    /// List of affected files (truncated if too many)
    pub affected_files: Vec<String>,
    /// One full example for context (only set when the group has >1 issue)
    pub example: Option<Value>,
    /// Suggested fix template (from "fix"/"suggestion"/"recommendation" fields)
    pub fix_template: Option<String>,
}
56
/// Compressed output ready for LLM context
///
/// NOTE: field declaration order determines the order of keys in the
/// serialized JSON, so the summary appears before the detail sections.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompressedOutput {
    /// Tool that generated this output
    pub tool: String,
    /// Overall status (e.g. "CRITICAL_ISSUES_FOUND", "ISSUES_FOUND", "CLEAN")
    pub status: String,
    /// Summary counts by severity
    pub summary: SeveritySummary,
    /// Critical issues - always shown in full
    pub critical_issues: Vec<Value>,
    /// High severity issues - shown in full if few, otherwise patterns
    pub high_issues: Vec<Value>,
    /// Deduplicated patterns for medium/low issues
    pub patterns: Vec<DeduplicatedPattern>,
    /// Reference ID for retrieving full data
    pub full_data_ref: String,
    /// Hint for agent on how to retrieve more details
    pub retrieval_hint: String,
}
77
/// Summary counts by severity level
///
/// Populated from the buckets returned by `classify_by_severity`;
/// `total` is the overall issue count (sum of the per-severity counts).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SeveritySummary {
    pub total: usize,
    pub critical: usize,
    pub high: usize,
    pub medium: usize,
    pub low: usize,
    pub info: usize,
}
88
/// Configuration for compression behavior
///
/// `target_size_bytes` also acts as the gate: outputs at or below this size
/// are returned unmodified without any compression.
#[derive(Debug, Clone)]
pub struct CompressionConfig {
    /// Maximum high-severity issues to show in full (default: 10)
    pub max_high_full: usize,
    /// Maximum files to list per pattern (default: 5)
    pub max_files_per_pattern: usize,
    /// Target output size in bytes (default: 15KB)
    pub target_size_bytes: usize,
}
99
100impl Default for CompressionConfig {
101    fn default() -> Self {
102        Self {
103            max_high_full: 10,
104            max_files_per_pattern: 5,
105            target_size_bytes: 15_000,
106        }
107    }
108}
109
110/// Main compression function - compresses tool output and stores full data for retrieval
111///
112/// # Arguments
113/// * `output` - The raw JSON output from a tool
114/// * `tool_name` - Name of the tool (e.g., "kubelint", "k8s_optimize")
115/// * `config` - Compression configuration
116///
117/// # Returns
118/// JSON string of compressed output, or original if compression not applicable
119pub fn compress_tool_output(output: &Value, tool_name: &str, config: &CompressionConfig) -> String {
120    // Check if output is small enough - no compression needed
121    let raw_str = serde_json::to_string(output).unwrap_or_default();
122    if raw_str.len() <= config.target_size_bytes {
123        return raw_str;
124    }
125
126    // Store full output for later retrieval
127    let ref_id = output_store::store_output(output, tool_name);
128
129    // Extract issues/findings array from the output
130    let issues = extract_issues(output);
131
132    if issues.is_empty() {
133        // Register in session with description
134        let contains = format!("{} analysis data (no issues)", tool_name);
135        output_store::register_session_ref(
136            &ref_id,
137            tool_name,
138            &contains,
139            "0 issues",
140            raw_str.len(),
141        );
142
143        // No issues to compress, just store and return summary
144        let mut result = serde_json::to_string_pretty(&json!({
145            "tool": tool_name,
146            "status": "NO_ISSUES",
147            "summary": { "total": 0 },
148            "full_data_ref": ref_id,
149            "retrieval_hint": format!("Use retrieve_output('{}') for full analysis data", ref_id)
150        }))
151        .unwrap_or(raw_str.clone());
152
153        // Append ALL session refs so agent always knows what's available
154        result.push_str(&output_store::format_session_refs_for_agent());
155        return result;
156    }
157
158    // Classify issues by severity
159    let (critical, high, medium, low, info) = classify_by_severity(&issues);
160
161    // Build summary
162    let summary = SeveritySummary {
163        total: issues.len(),
164        critical: critical.len(),
165        high: high.len(),
166        medium: medium.len(),
167        low: low.len(),
168        info: info.len(),
169    };
170
171    // Critical issues: always full detail
172    let critical_issues: Vec<Value> = critical.clone();
173
174    // High issues: full detail if few, otherwise deduplicate
175    let high_issues: Vec<Value> = if high.len() <= config.max_high_full {
176        high.clone()
177    } else {
178        // Show first few + pattern for rest
179        high.iter().take(config.max_high_full).cloned().collect()
180    };
181
182    // Deduplicate medium/low/info issues into patterns
183    let mut all_lower: Vec<Value> = Vec::new();
184    all_lower.extend(medium.clone());
185    all_lower.extend(low.clone());
186    all_lower.extend(info.clone());
187
188    // Also add remaining high issues if there were too many
189    if high.len() > config.max_high_full {
190        all_lower.extend(high.iter().skip(config.max_high_full).cloned());
191    }
192
193    let patterns = deduplicate_to_patterns(&all_lower, config);
194
195    // Determine status
196    let status = if summary.critical > 0 {
197        "CRITICAL_ISSUES_FOUND"
198    } else if summary.high > 0 {
199        "HIGH_ISSUES_FOUND"
200    } else if summary.total > 0 {
201        "ISSUES_FOUND"
202    } else {
203        "CLEAN"
204    };
205
206    // Register in session registry with meaningful description
207    let contains = match tool_name {
208        "kubelint" => "Kubernetes manifest lint issues (security, best practices)",
209        "k8s_optimize" => "K8s resource optimization recommendations",
210        "analyze" => "Project analysis (languages, frameworks, dependencies)",
211        _ => "Tool analysis results",
212    };
213    let summary_str = format!(
214        "{} issues: {} critical, {} high, {} medium",
215        summary.total, summary.critical, summary.high, summary.medium
216    );
217    output_store::register_session_ref(&ref_id, tool_name, contains, &summary_str, raw_str.len());
218
219    let compressed = CompressedOutput {
220        tool: tool_name.to_string(),
221        status: status.to_string(),
222        summary,
223        critical_issues,
224        high_issues,
225        patterns,
226        full_data_ref: ref_id.clone(),
227        retrieval_hint: format!(
228            "Use retrieve_output('{}', query) to get full details. Query options: 'severity:critical', 'file:path', 'code:DL3008'",
229            ref_id
230        ),
231    };
232
233    let mut result = serde_json::to_string_pretty(&compressed).unwrap_or(raw_str);
234
235    // Append ALL session refs so agent always knows what's available
236    result.push_str(&output_store::format_session_refs_for_agent());
237    result
238}
239
/// Extract issues/findings array from various output formats
///
/// Tries a fixed list of well-known field names in priority order and returns
/// the FIRST matching array (even if empty), so field order below matters.
/// `vulnerable_dependencies` gets special handling: the nested per-dependency
/// vulnerability lists are flattened into one array, with package metadata
/// copied onto each entry so the compressor can classify them by severity.
/// Falls back to: the output itself if it is an array, then the first
/// object field whose array elements look issue-like, then empty.
fn extract_issues(output: &Value) -> Vec<Value> {
    // Try common field names for issues/findings (priority order - first hit wins)
    let issue_fields = [
        "issues",
        "findings",
        "violations",
        "warnings",
        "errors",
        "recommendations",
        "results",
        "diagnostics",
        "failures", // LintResult from kubelint, hadolint, dclint, helmlint
        "vulnerable_dependencies",
    ];

    for field in &issue_fields {
        if let Some(arr) = output.get(field).and_then(|v| v.as_array()) {
            // For vulnerable_dependencies, flatten inner vulnerabilities
            // so each vuln has a severity field the compressor can classify
            if field == &"vulnerable_dependencies" && !arr.is_empty() {
                let mut flat = Vec::new();
                for dep in arr {
                    let dep_name = dep
                        .get("name")
                        .and_then(|v| v.as_str())
                        .unwrap_or("unknown");
                    let dep_version = dep.get("version").and_then(|v| v.as_str()).unwrap_or("?");
                    let language = dep
                        .get("language")
                        .cloned()
                        .unwrap_or(serde_json::Value::Null);
                    if let Some(vulns) = dep.get("vulnerabilities").and_then(|v| v.as_array()) {
                        for vuln in vulns {
                            // Copy the parent dependency's identity onto each
                            // vulnerability entry so context survives flattening.
                            let mut entry = vuln.clone();
                            if let Some(obj) = entry.as_object_mut() {
                                obj.insert("package".to_string(), serde_json::json!(dep_name));
                                obj.insert(
                                    "package_version".to_string(),
                                    serde_json::json!(dep_version),
                                );
                                obj.insert("language".to_string(), language.clone());
                            }
                            flat.push(entry);
                        }
                    }
                }
                return flat;
            }
            return arr.clone();
        }
    }

    // Check if output itself is an array
    if let Some(arr) = output.as_array() {
        return arr.clone();
    }

    // Try nested structures: first object field whose (non-empty) array
    // starts with something that looks like an issue. NOTE: JSON object
    // iteration order here depends on serde_json's map implementation.
    if let Some(obj) = output.as_object() {
        for (_, v) in obj {
            if let Some(arr) = v.as_array()
                && !arr.is_empty()
                && is_issue_like(&arr[0])
            {
                return arr.clone();
            }
        }
    }

    Vec::new()
}
312
313/// Check if a value looks like an issue/finding
314fn is_issue_like(value: &Value) -> bool {
315    if let Some(obj) = value.as_object() {
316        // Issues typically have severity, code, message, or file fields
317        obj.contains_key("severity")
318            || obj.contains_key("code")
319            || obj.contains_key("message")
320            || obj.contains_key("rule")
321            || obj.contains_key("level")
322    } else {
323        false
324    }
325}
326
327/// Classify issues by severity level
328fn classify_by_severity(
329    issues: &[Value],
330) -> (Vec<Value>, Vec<Value>, Vec<Value>, Vec<Value>, Vec<Value>) {
331    let mut critical = Vec::new();
332    let mut high = Vec::new();
333    let mut medium = Vec::new();
334    let mut low = Vec::new();
335    let mut info = Vec::new();
336
337    for issue in issues {
338        let severity = get_severity(issue);
339        match severity {
340            Severity::Critical => critical.push(issue.clone()),
341            Severity::High => high.push(issue.clone()),
342            Severity::Medium => medium.push(issue.clone()),
343            Severity::Low => low.push(issue.clone()),
344            Severity::Info => info.push(issue.clone()),
345        }
346    }
347
348    (critical, high, medium, low, info)
349}
350
351/// Extract severity from an issue value
352fn get_severity(issue: &Value) -> Severity {
353    // Try common severity field names
354    let severity_fields = ["severity", "level", "priority", "type"];
355
356    for field in &severity_fields {
357        if let Some(s) = issue.get(field).and_then(|v| v.as_str()) {
358            return Severity::from_str(s);
359        }
360    }
361
362    // Check for error/warning in code field
363    if let Some(code) = issue.get("code").and_then(|v| v.as_str()) {
364        if code.to_lowercase().contains("error") {
365            return Severity::Critical;
366        }
367        if code.to_lowercase().contains("warn") {
368            return Severity::High;
369        }
370    }
371
372    Severity::Medium // Default
373}
374
375/// Get issue code/type for deduplication grouping
376fn get_issue_code(issue: &Value) -> String {
377    // Try common code field names
378    let code_fields = ["code", "rule", "rule_id", "type", "check", "id"];
379
380    for field in &code_fields {
381        if let Some(s) = issue.get(field).and_then(|v| v.as_str()) {
382            return s.to_string();
383        }
384    }
385
386    // Fall back to message hash
387    if let Some(msg) = issue.get("message").and_then(|v| v.as_str()) {
388        return format!("msg:{}", &msg[..msg.len().min(30)]);
389    }
390
391    "unknown".to_string()
392}
393
394/// Get file path from an issue
395fn get_issue_file(issue: &Value) -> Option<String> {
396    let file_fields = ["file", "path", "filename", "location", "source"];
397
398    for field in &file_fields {
399        if let Some(s) = issue.get(field).and_then(|v| v.as_str()) {
400            return Some(s.to_string());
401        }
402        // Handle nested location objects
403        if let Some(loc) = issue.get(field).and_then(|v| v.as_object())
404            && let Some(f) = loc.get("file").and_then(|v| v.as_str())
405        {
406            return Some(f.to_string());
407        }
408    }
409
410    None
411}
412
413/// Get message from an issue
414fn get_issue_message(issue: &Value) -> String {
415    let msg_fields = ["message", "msg", "description", "text", "detail"];
416
417    for field in &msg_fields {
418        if let Some(s) = issue.get(field).and_then(|v| v.as_str()) {
419            return s.to_string();
420        }
421    }
422
423    "No message".to_string()
424}
425
426/// Deduplicate issues into patterns
427fn deduplicate_to_patterns(
428    issues: &[Value],
429    config: &CompressionConfig,
430) -> Vec<DeduplicatedPattern> {
431    // Group by issue code
432    let mut groups: HashMap<String, Vec<&Value>> = HashMap::new();
433
434    for issue in issues {
435        let code = get_issue_code(issue);
436        groups.entry(code).or_default().push(issue);
437    }
438
439    // Convert groups to patterns
440    let mut patterns: Vec<DeduplicatedPattern> = groups
441        .into_iter()
442        .map(|(code, group)| {
443            let first = group[0];
444            let severity = get_severity(first);
445            let message = get_issue_message(first);
446
447            // Collect affected files
448            let mut files: Vec<String> = group.iter().filter_map(|i| get_issue_file(i)).collect();
449            files.dedup();
450
451            let total_files = files.len();
452            let truncated_files: Vec<String> = if files.len() > config.max_files_per_pattern {
453                let mut truncated: Vec<String> = files
454                    .iter()
455                    .take(config.max_files_per_pattern)
456                    .cloned()
457                    .collect();
458                truncated.push(format!(
459                    "...+{} more",
460                    total_files - config.max_files_per_pattern
461                ));
462                truncated
463            } else {
464                files
465            };
466
467            // Extract fix template if available
468            let fix_template = first
469                .get("fix")
470                .or_else(|| first.get("suggestion"))
471                .or_else(|| first.get("recommendation"))
472                .and_then(|v| v.as_str())
473                .map(|s| s.to_string());
474
475            DeduplicatedPattern {
476                code,
477                count: group.len(),
478                severity,
479                message,
480                affected_files: truncated_files,
481                example: if group.len() > 1 {
482                    Some(first.clone())
483                } else {
484                    None
485                },
486                fix_template,
487            }
488        })
489        .collect();
490
491    // Sort by severity (critical first) then by count
492    patterns.sort_by(|a, b| {
493        b.severity
494            .cmp(&a.severity)
495            .then_with(|| b.count.cmp(&a.count))
496    });
497
498    patterns
499}
500
501/// Compress analyze_project output specifically
502///
503/// Handles both:
504/// - MonorepoAnalysis: has "projects" array, "is_monorepo", "root_path"
505/// - ProjectAnalysis: flat structure with "languages", "technologies" at top level
506///
507/// For large analysis, returns a minimal summary and stores full data for retrieval.
508pub fn compress_analysis_output(output: &Value, config: &CompressionConfig) -> String {
509    let raw_str = serde_json::to_string(output).unwrap_or_default();
510    if raw_str.len() <= config.target_size_bytes {
511        return raw_str;
512    }
513
514    // Store full output for later retrieval
515    let ref_id = output_store::store_output(output, "analyze_project");
516
517    // Build a MINIMAL summary - just enough to understand the project
518    let mut summary = json!({
519        "tool": "analyze_project",
520        "status": "ANALYSIS_COMPLETE",
521        "full_data_ref": ref_id.clone()
522    });
523
524    let summary_obj = summary.as_object_mut().unwrap();
525
526    // Detect output type and extract accordingly
527    let is_monorepo = output.get("projects").is_some() || output.get("is_monorepo").is_some();
528    let is_project_analysis =
529        output.get("languages").is_some() && output.get("analysis_metadata").is_some();
530
531    if is_monorepo {
532        // MonorepoAnalysis structure
533        if let Some(mono) = output.get("is_monorepo").and_then(|v| v.as_bool()) {
534            summary_obj.insert("is_monorepo".to_string(), json!(mono));
535        }
536        if let Some(root) = output.get("root_path").and_then(|v| v.as_str()) {
537            summary_obj.insert("root_path".to_string(), json!(root));
538        }
539
540        if let Some(projects) = output.get("projects").and_then(|v| v.as_array()) {
541            summary_obj.insert("project_count".to_string(), json!(projects.len()));
542
543            let mut all_languages: Vec<String> = Vec::new();
544            let mut all_frameworks: Vec<String> = Vec::new();
545            let mut project_names: Vec<String> = Vec::new();
546
547            for project in projects.iter().take(20) {
548                if let Some(name) = project.get("name").and_then(|v| v.as_str()) {
549                    project_names.push(name.to_string());
550                }
551                if let Some(analysis) = project.get("analysis") {
552                    if let Some(langs) = analysis.get("languages").and_then(|v| v.as_array()) {
553                        for lang in langs {
554                            if let Some(name) = lang.get("name").and_then(|v| v.as_str())
555                                && !all_languages.contains(&name.to_string())
556                            {
557                                all_languages.push(name.to_string());
558                            }
559                        }
560                    }
561                    if let Some(fws) = analysis.get("frameworks").and_then(|v| v.as_array()) {
562                        for fw in fws {
563                            if let Some(name) = fw.get("name").and_then(|v| v.as_str())
564                                && !all_frameworks.contains(&name.to_string())
565                            {
566                                all_frameworks.push(name.to_string());
567                            }
568                        }
569                    }
570                }
571            }
572
573            summary_obj.insert("project_names".to_string(), json!(project_names));
574            summary_obj.insert("languages_detected".to_string(), json!(all_languages));
575            summary_obj.insert("frameworks_detected".to_string(), json!(all_frameworks));
576        }
577    } else if is_project_analysis {
578        // ProjectAnalysis flat structure - languages/technologies at top level
579        if let Some(root) = output.get("project_root").and_then(|v| v.as_str()) {
580            summary_obj.insert("project_root".to_string(), json!(root));
581        }
582        if let Some(arch) = output.get("architecture_type").and_then(|v| v.as_str()) {
583            summary_obj.insert("architecture_type".to_string(), json!(arch));
584        }
585        if let Some(proj_type) = output.get("project_type").and_then(|v| v.as_str()) {
586            summary_obj.insert("project_type".to_string(), json!(proj_type));
587        }
588
589        // Extract languages (at top level)
590        if let Some(langs) = output.get("languages").and_then(|v| v.as_array()) {
591            let names: Vec<&str> = langs
592                .iter()
593                .filter_map(|l| l.get("name").and_then(|n| n.as_str()))
594                .collect();
595            summary_obj.insert("languages_detected".to_string(), json!(names));
596        }
597
598        // Extract technologies (at top level)
599        if let Some(techs) = output.get("technologies").and_then(|v| v.as_array()) {
600            let names: Vec<&str> = techs
601                .iter()
602                .filter_map(|t| t.get("name").and_then(|n| n.as_str()))
603                .collect();
604            summary_obj.insert("technologies_detected".to_string(), json!(names));
605        }
606
607        // Extract services (include names, not just count)
608        if let Some(services) = output.get("services").and_then(|v| v.as_array()) {
609            summary_obj.insert("services_count".to_string(), json!(services.len()));
610            // Include service names so agent knows what microservices exist
611            let service_names: Vec<&str> = services
612                .iter()
613                .filter_map(|s| s.get("name").and_then(|n| n.as_str()))
614                .collect();
615            if !service_names.is_empty() {
616                summary_obj.insert("services_detected".to_string(), json!(service_names));
617            }
618        }
619    }
620
621    // CRITICAL: Include retrieval instructions prominently
622    summary_obj.insert(
623        "retrieval_instructions".to_string(),
624        json!({
625            "message": "Full analysis stored. Use retrieve_output with queries to get specific sections.",
626            "ref_id": ref_id,
627            "available_queries": [
628                "section:summary - Project overview",
629                "section:languages - All detected languages",
630                "section:frameworks - All detected frameworks/technologies",
631                "section:services - All detected services",
632                "language:<name> - Details for specific language (e.g., language:Rust)",
633                "framework:<name> - Details for specific framework"
634            ],
635            "example": format!("retrieve_output('{}', 'section:summary')", ref_id)
636        }),
637    );
638
639    // Build session summary
640    let project_count = output
641        .get("projects")
642        .and_then(|v| v.as_array())
643        .map(|a| a.len())
644        .unwrap_or(1);
645    let summary_str = format!(
646        "{} project(s), {} bytes stored",
647        project_count,
648        raw_str.len()
649    );
650
651    // Register in session registry
652    output_store::register_session_ref(
653        &ref_id,
654        "analyze_project",
655        "Full project analysis (use section queries to retrieve specific data)",
656        &summary_str,
657        raw_str.len(),
658    );
659
660    // Return minimal JSON
661    serde_json::to_string_pretty(&summary).unwrap_or_else(|_| {
662        format!(
663            r#"{{"tool":"analyze_project","status":"STORED","full_data_ref":"{}","message":"Analysis complete. Use retrieve_output('{}', 'section:summary') to view."}}"#,
664            ref_id, ref_id
665        )
666    })
667}
668
669/// CLI variant of compress_tool_output - produces strict valid JSON with CLI-syntax retrieval hints.
670///
671/// Differences from compress_tool_output():
672/// - retrieval_hint uses CLI syntax (`sync-ctl retrieve '<ref_id>' --query '...'`)
673/// - Does NOT append format_session_refs_for_agent() plaintext footer
674/// - Output is guaranteed valid JSON
675pub fn compress_tool_output_cli(
676    output: &Value,
677    tool_name: &str,
678    config: &CompressionConfig,
679) -> String {
680    // Check if output is small enough - no compression needed
681    let raw_str = serde_json::to_string(output).unwrap_or_default();
682    if raw_str.len() <= config.target_size_bytes {
683        // Still store and add retrieval fields for consistency
684        let ref_id = output_store::store_output(output, tool_name);
685        let mut obj = match output.clone() {
686            Value::Object(m) => m,
687            other => {
688                let mut m = serde_json::Map::new();
689                m.insert("data".to_string(), other);
690                m
691            }
692        };
693        obj.insert("full_data_ref".to_string(), json!(ref_id));
694        obj.insert(
695            "retrieval_hint".to_string(),
696            json!(format!(
697                "Use `sync-ctl retrieve '{}' --query 'severity:critical'` for details. Paginate with --limit N --offset M. Other queries: 'file:<path>', 'code:<id>'",
698                ref_id
699            )),
700        );
701        return serde_json::to_string_pretty(&Value::Object(obj)).unwrap_or(raw_str);
702    }
703
704    // Store full output for later retrieval
705    let ref_id = output_store::store_output(output, tool_name);
706
707    // Handle dependency-map outputs (e.g. {"dependencies": {...}, "total": N})
708    // These aren't issues/findings — compress by summarizing the dep map
709    if let Some(deps_obj) = output.get("dependencies").and_then(|v| v.as_object()) {
710        let total = output
711            .get("total")
712            .and_then(|v| v.as_u64())
713            .unwrap_or(deps_obj.len() as u64);
714
715        // Build a compact summary: counts by source, license distribution
716        let mut by_source: std::collections::HashMap<String, usize> =
717            std::collections::HashMap::new();
718        let mut by_license: std::collections::HashMap<String, usize> =
719            std::collections::HashMap::new();
720        let mut dev_count = 0usize;
721        let mut prod_count = 0usize;
722
723        for dep in deps_obj.values() {
724            let source = dep
725                .get("source")
726                .and_then(|v| v.as_str())
727                .unwrap_or("unknown");
728            *by_source.entry(source.to_string()).or_default() += 1;
729            let license = dep
730                .get("license")
731                .and_then(|v| v.as_str())
732                .unwrap_or("Unknown");
733            *by_license.entry(license.to_string()).or_default() += 1;
734            if dep.get("is_dev").and_then(|v| v.as_bool()).unwrap_or(false) {
735                dev_count += 1;
736            } else {
737                prod_count += 1;
738            }
739        }
740
741        return serde_json::to_string_pretty(&json!({
742            "tool": tool_name,
743            "total": total,
744            "production": prod_count,
745            "development": dev_count,
746            "by_source": by_source,
747            "by_license": by_license,
748            "full_data_ref": ref_id,
749            "retrieval_hint": format!(
750                "Use `sync-ctl retrieve '{}' --query 'file:<path>'` for details. Paginate with --limit N --offset M.",
751                ref_id
752            )
753        }))
754        .unwrap_or(raw_str);
755    }
756
757    // Extract issues/findings array from the output
758    let issues = extract_issues(output);
759
760    if issues.is_empty() {
761        // No issues to compress, just store and return summary as strict JSON
762        return serde_json::to_string_pretty(&json!({
763            "tool": tool_name,
764            "status": "NO_ISSUES",
765            "summary": { "total": 0 },
766            "full_data_ref": ref_id,
767            "retrieval_hint": format!(
768                "Use `sync-ctl retrieve '{}' --query 'severity:critical'` for details. Paginate with --limit N --offset M.",
769                ref_id
770            )
771        }))
772        .unwrap_or(raw_str);
773    }
774
775    // Classify issues by severity
776    let (critical, high, medium, low, info) = classify_by_severity(&issues);
777
778    // Build summary
779    let summary = SeveritySummary {
780        total: issues.len(),
781        critical: critical.len(),
782        high: high.len(),
783        medium: medium.len(),
784        low: low.len(),
785        info: info.len(),
786    };
787
788    // Critical issues: always full detail
789    let critical_issues: Vec<Value> = critical.clone();
790
791    // High issues: full detail if few, otherwise show first max_high_full
792    let high_issues: Vec<Value> = if high.len() <= config.max_high_full {
793        high.clone()
794    } else {
795        high.iter().take(config.max_high_full).cloned().collect()
796    };
797
798    // Deduplicate medium/low/info issues into patterns
799    let mut all_lower: Vec<Value> = Vec::new();
800    all_lower.extend(medium.clone());
801    all_lower.extend(low.clone());
802    all_lower.extend(info.clone());
803
804    // Also add remaining high issues if there were too many
805    if high.len() > config.max_high_full {
806        all_lower.extend(high.iter().skip(config.max_high_full).cloned());
807    }
808
809    let patterns = deduplicate_to_patterns(&all_lower, config);
810
811    // Determine status
812    let status = if summary.critical > 0 {
813        "CRITICAL_ISSUES_FOUND"
814    } else if summary.high > 0 {
815        "HIGH_ISSUES_FOUND"
816    } else if summary.total > 0 {
817        "ISSUES_FOUND"
818    } else {
819        "CLEAN"
820    };
821
822    let compressed = CompressedOutput {
823        tool: tool_name.to_string(),
824        status: status.to_string(),
825        summary,
826        critical_issues,
827        high_issues,
828        patterns,
829        full_data_ref: ref_id.clone(),
830        retrieval_hint: format!(
831            "Use `sync-ctl retrieve '{}' --query 'severity:critical'` for details. Paginate with --limit N --offset M. Other queries: 'file:<path>', 'code:<id>'",
832            ref_id
833        ),
834    };
835
836    // Return strict JSON - no plaintext footer appended
837    serde_json::to_string_pretty(&compressed).unwrap_or(raw_str)
838}
839
840/// CLI variant of compress_analysis_output - produces strict valid JSON with CLI-syntax retrieval hints.
841///
842/// Differences from compress_analysis_output():
843/// - retrieval_hint uses CLI syntax (`sync-ctl retrieve '<ref_id>' --query '...'`)
844/// - Does NOT append format_session_refs_for_agent() plaintext footer
845/// - Output is guaranteed valid JSON
846pub fn compress_analysis_output_cli(output: &Value, _config: &CompressionConfig) -> String {
847    // Store full output for later retrieval
848    let ref_id = output_store::store_output(output, "analyze_project");
849
850    // Build a MINIMAL summary - just enough to understand the project
851    let mut summary = json!({
852        "tool": "analyze_project",
853        "status": "ANALYSIS_COMPLETE",
854        "full_data_ref": ref_id.clone()
855    });
856
857    let summary_obj = summary.as_object_mut().unwrap();
858
859    // Detect output type and extract accordingly
860    let is_monorepo = output.get("projects").is_some() || output.get("is_monorepo").is_some();
861    let is_project_analysis =
862        output.get("languages").is_some() && output.get("analysis_metadata").is_some();
863
864    if is_monorepo {
865        // MonorepoAnalysis structure
866        if let Some(mono) = output.get("is_monorepo").and_then(|v| v.as_bool()) {
867            summary_obj.insert("is_monorepo".to_string(), json!(mono));
868        }
869        if let Some(root) = output.get("root_path").and_then(|v| v.as_str()) {
870            summary_obj.insert("root_path".to_string(), json!(root));
871        }
872
873        if let Some(projects) = output.get("projects").and_then(|v| v.as_array()) {
874            summary_obj.insert("project_count".to_string(), json!(projects.len()));
875
876            let mut all_languages: Vec<String> = Vec::new();
877            let mut all_frameworks: Vec<String> = Vec::new();
878            let mut project_names: Vec<String> = Vec::new();
879
880            for project in projects.iter().take(20) {
881                if let Some(name) = project.get("name").and_then(|v| v.as_str()) {
882                    project_names.push(name.to_string());
883                }
884                if let Some(analysis) = project.get("analysis") {
885                    if let Some(langs) = analysis.get("languages").and_then(|v| v.as_array()) {
886                        for lang in langs {
887                            if let Some(name) = lang.get("name").and_then(|v| v.as_str())
888                                && !all_languages.contains(&name.to_string())
889                            {
890                                all_languages.push(name.to_string());
891                            }
892                        }
893                    }
894                    if let Some(fws) = analysis.get("frameworks").and_then(|v| v.as_array()) {
895                        for fw in fws {
896                            if let Some(name) = fw.get("name").and_then(|v| v.as_str())
897                                && !all_frameworks.contains(&name.to_string())
898                            {
899                                all_frameworks.push(name.to_string());
900                            }
901                        }
902                    }
903                }
904            }
905
906            summary_obj.insert("project_names".to_string(), json!(project_names));
907            summary_obj.insert("languages_detected".to_string(), json!(all_languages));
908            summary_obj.insert("frameworks_detected".to_string(), json!(all_frameworks));
909        }
910    } else if is_project_analysis {
911        // ProjectAnalysis flat structure - languages/technologies at top level
912        if let Some(root) = output.get("project_root").and_then(|v| v.as_str()) {
913            summary_obj.insert("project_root".to_string(), json!(root));
914        }
915        if let Some(arch) = output.get("architecture_type").and_then(|v| v.as_str()) {
916            summary_obj.insert("architecture_type".to_string(), json!(arch));
917        }
918        if let Some(proj_type) = output.get("project_type").and_then(|v| v.as_str()) {
919            summary_obj.insert("project_type".to_string(), json!(proj_type));
920        }
921
922        // Extract languages (at top level)
923        if let Some(langs) = output.get("languages").and_then(|v| v.as_array()) {
924            let names: Vec<&str> = langs
925                .iter()
926                .filter_map(|l| l.get("name").and_then(|n| n.as_str()))
927                .collect();
928            summary_obj.insert("languages_detected".to_string(), json!(names));
929        }
930
931        // Extract technologies (at top level)
932        if let Some(techs) = output.get("technologies").and_then(|v| v.as_array()) {
933            let names: Vec<&str> = techs
934                .iter()
935                .filter_map(|t| t.get("name").and_then(|n| n.as_str()))
936                .collect();
937            summary_obj.insert("technologies_detected".to_string(), json!(names));
938        }
939
940        // Extract services (include names, not just count)
941        if let Some(services) = output.get("services").and_then(|v| v.as_array()) {
942            summary_obj.insert("services_count".to_string(), json!(services.len()));
943            let service_names: Vec<&str> = services
944                .iter()
945                .filter_map(|s| s.get("name").and_then(|n| n.as_str()))
946                .collect();
947            if !service_names.is_empty() {
948                summary_obj.insert("services_detected".to_string(), json!(service_names));
949            }
950        }
951    }
952
953    // CLI-syntax retrieval hint
954    summary_obj.insert(
955        "retrieval_hint".to_string(),
956        json!(format!(
957            "Use `sync-ctl retrieve '{}' --query 'section:summary'` for full details. Other queries: 'section:languages', 'section:frameworks', 'section:services'",
958            ref_id
959        )),
960    );
961
962    // Return strict JSON - no plaintext footer appended
963    serde_json::to_string_pretty(&summary).unwrap_or_else(|_| {
964        format!(
965            r#"{{"tool":"analyze_project","status":"STORED","full_data_ref":"{}","retrieval_hint":"Use `sync-ctl retrieve '{}' --query 'section:summary'` for full details."}}"#,
966            ref_id, ref_id
967        )
968    })
969}
970
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_severity_ordering() {
        // Ord is derived from variant declaration order: Info < Low < ... < Critical,
        // so each adjacent pair in this descending list must compare strictly greater.
        let descending = [
            Severity::Critical,
            Severity::High,
            Severity::Medium,
            Severity::Low,
            Severity::Info,
        ];
        for pair in descending.windows(2) {
            assert!(pair[0] > pair[1]);
        }
    }

    #[test]
    fn test_extract_issues_from_array_field() {
        let payload = json!({
            "issues": [
                { "code": "DL3008", "severity": "warning", "message": "Pin versions" },
                { "code": "DL3009", "severity": "info", "message": "Delete apt lists" }
            ]
        });

        // Both entries of the "issues" array are picked up.
        assert_eq!(extract_issues(&payload).len(), 2);
    }

    #[test]
    fn test_deduplication() {
        let findings = vec![
            json!({ "code": "DL3008", "severity": "warning", "file": "Dockerfile1" }),
            json!({ "code": "DL3008", "severity": "warning", "file": "Dockerfile2" }),
            json!({ "code": "DL3008", "severity": "warning", "file": "Dockerfile3" }),
            json!({ "code": "DL3009", "severity": "info", "file": "Dockerfile1" }),
        ];

        let patterns = deduplicate_to_patterns(&findings, &CompressionConfig::default());

        // Two distinct codes collapse into exactly two patterns.
        assert_eq!(patterns.len(), 2);

        // The triplicate DL3008 merges into one pattern that retains all three files.
        let merged = patterns
            .iter()
            .find(|p| p.code == "DL3008")
            .expect("DL3008 pattern present");
        assert_eq!(merged.count, 3);
        assert_eq!(merged.affected_files.len(), 3);
    }

    #[test]
    fn test_small_output_not_compressed() {
        let tiny = json!({
            "issues": [
                { "code": "test", "severity": "low" }
            ]
        });

        let config = CompressionConfig {
            target_size_bytes: 10000,
            ..Default::default()
        };

        // Below the size threshold the original payload passes through untouched,
        // so no retrieval reference should be present.
        let rendered = compress_tool_output(&tiny, "test", &config);
        assert!(!rendered.contains("full_data_ref"));
    }

    #[test]
    fn test_compress_tool_output_cli_produces_valid_json() {
        let findings: Vec<serde_json::Value> = (0..100)
            .map(|i| {
                let severity = match i {
                    0..=2 => "critical",
                    3..=14 => "high",
                    _ => "medium",
                };
                serde_json::json!({
                    "code": format!("SEC{:03}", i),
                    "severity": severity,
                    "message": format!("Finding {} with enough text to exceed compression threshold when multiplied", i),
                    "file": format!("src/file_{}.rs", i),
                })
            })
            .collect();
        let payload = serde_json::json!({ "findings": findings });

        let result = compress_tool_output_cli(&payload, "security", &CompressionConfig::default());

        // Must be valid JSON
        let parsed: serde_json::Value = serde_json::from_str(&result).unwrap_or_else(|_| {
            panic!(
                "CLI output must be valid JSON, got: {}",
                &result[..200.min(result.len())]
            )
        });

        // Must contain CLI-syntax retrieval hint
        let hint = parsed
            .get("retrieval_hint")
            .and_then(|v| v.as_str())
            .expect("retrieval_hint field present");
        assert!(
            hint.contains("sync-ctl retrieve"),
            "Hint should use CLI syntax, got: {}",
            hint
        );
        assert!(
            !hint.contains("retrieve_output("),
            "Hint should NOT use internal tool call syntax"
        );
    }
}