ruvllm/reflection/
confidence.rs

//! Confidence-Based Revision (If-or-Else Pattern)
//!
//! Implements the If-or-Else (IoE) pattern where revision is only triggered
//! when confidence is LOW. This is more efficient than always reflecting,
//! as high-confidence outputs are accepted immediately.
//!
//! ## Key Insight
//!
//! The IoE pattern recognizes that:
//! - Most outputs are acceptable and don't need revision
//! - Only LOW confidence outputs benefit from reflection
//! - Targeted revision based on weak points is more effective than generic retry
//!
//! ## Architecture
//!
//! ```text
//! +-------------------+     +----------------------+
//! | ConfidenceChecker |---->| should_revise()      |
//! | - threshold       |     | - Check confidence   |
//! | - budget          |     | - Compare threshold  |
//! +-------------------+     +----------------------+
//!           |
//!           v (if LOW)
//! +-------------------+     +----------------------+
//! | identify_weak_pts |---->| generate_targeted_   |
//! | - Parse output    |     | revision()           |
//! | - Find issues     |     | - Focus on weak pts  |
//! +-------------------+     +----------------------+
//! ```
30
31use super::reflective_agent::ExecutionContext;
32use serde::{Deserialize, Serialize};
33use std::collections::HashMap;
34
35/// Configuration for confidence checking
36#[derive(Debug, Clone, Serialize, Deserialize)]
37pub struct ConfidenceConfig {
38    /// Threshold below which revision is triggered
39    pub threshold: f32,
40    /// Maximum revision attempts (budget)
41    pub revision_budget: u32,
42    /// Minimum improvement required to continue revising
43    pub min_improvement: f32,
44    /// Weights for different confidence factors
45    pub factor_weights: ConfidenceFactorWeights,
46    /// Whether to use structural analysis
47    pub use_structural_analysis: bool,
48    /// Patterns that indicate low confidence
49    pub low_confidence_patterns: Vec<String>,
50}
51
52impl Default for ConfidenceConfig {
53    fn default() -> Self {
54        Self {
55            threshold: 0.7,
56            revision_budget: 3,
57            min_improvement: 0.05,
58            factor_weights: ConfidenceFactorWeights::default(),
59            use_structural_analysis: true,
60            low_confidence_patterns: vec![
61                "I'm not sure".to_string(),
62                "might be".to_string(),
63                "possibly".to_string(),
64                "could be wrong".to_string(),
65                "uncertain".to_string(),
66                "TODO".to_string(),
67                "FIXME".to_string(),
68                "not implemented".to_string(),
69            ],
70        }
71    }
72}
73
74/// Weights for confidence factors
75#[derive(Debug, Clone, Serialize, Deserialize)]
76pub struct ConfidenceFactorWeights {
77    /// Weight for output completeness
78    pub completeness: f32,
79    /// Weight for output structure
80    pub structure: f32,
81    /// Weight for absence of uncertainty markers
82    pub certainty: f32,
83    /// Weight for task relevance
84    pub relevance: f32,
85    /// Weight for code validity (if applicable)
86    pub code_validity: f32,
87}
88
89impl Default for ConfidenceFactorWeights {
90    fn default() -> Self {
91        Self {
92            completeness: 0.25,
93            structure: 0.20,
94            certainty: 0.20,
95            relevance: 0.20,
96            code_validity: 0.15,
97        }
98    }
99}
100
101/// Confidence level classification
102#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
103pub enum ConfidenceLevel {
104    /// Very high confidence (>0.9)
105    VeryHigh,
106    /// High confidence (0.7-0.9)
107    High,
108    /// Medium confidence (0.5-0.7)
109    Medium,
110    /// Low confidence (0.3-0.5)
111    Low,
112    /// Very low confidence (<0.3)
113    VeryLow,
114}
115
116impl ConfidenceLevel {
117    /// Create from score
118    pub fn from_score(score: f32) -> Self {
119        match score {
120            s if s > 0.9 => Self::VeryHigh,
121            s if s > 0.7 => Self::High,
122            s if s > 0.5 => Self::Medium,
123            s if s > 0.3 => Self::Low,
124            _ => Self::VeryLow,
125        }
126    }
127
128    /// Get string representation
129    pub fn as_str(&self) -> &'static str {
130        match self {
131            Self::VeryHigh => "very_high",
132            Self::High => "high",
133            Self::Medium => "medium",
134            Self::Low => "low",
135            Self::VeryLow => "very_low",
136        }
137    }
138
139    /// Check if revision is recommended
140    pub fn should_revise(&self) -> bool {
141        matches!(self, Self::Low | Self::VeryLow)
142    }
143}
144
145/// A weak point identified in the output
146#[derive(Debug, Clone, Serialize, Deserialize)]
147pub struct WeakPoint {
148    /// Location in output (line number or description)
149    pub location: String,
150    /// Description of the weakness
151    pub description: String,
152    /// Severity (0.0-1.0)
153    pub severity: f32,
154    /// Type of weakness
155    pub weakness_type: WeaknessType,
156    /// Suggested fix
157    pub suggestion: String,
158    /// Confidence in this identification
159    pub confidence: f32,
160}
161
162impl WeakPoint {
163    /// Create a new weak point
164    pub fn new(
165        location: impl Into<String>,
166        description: impl Into<String>,
167        severity: f32,
168        weakness_type: WeaknessType,
169    ) -> Self {
170        Self {
171            location: location.into(),
172            description: description.into(),
173            severity: severity.clamp(0.0, 1.0),
174            weakness_type,
175            suggestion: String::new(),
176            confidence: 0.8,
177        }
178    }
179
180    /// Add suggestion
181    pub fn with_suggestion(mut self, suggestion: impl Into<String>) -> Self {
182        self.suggestion = suggestion.into();
183        self
184    }
185}
186
187/// Types of weaknesses
188#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
189pub enum WeaknessType {
190    /// Incomplete implementation
191    Incomplete,
192    /// Uncertain/hedge words
193    Uncertainty,
194    /// Missing error handling
195    MissingErrorHandling,
196    /// Missing validation
197    MissingValidation,
198    /// Code smell or anti-pattern
199    CodeSmell,
200    /// Missing tests
201    MissingTests,
202    /// Documentation gap
203    DocumentationGap,
204    /// Security concern
205    SecurityConcern,
206    /// Performance issue
207    PerformanceIssue,
208    /// Logic error
209    LogicError,
210    /// Other
211    Other,
212}
213
214/// Result of revision attempt
215#[derive(Debug, Clone, Serialize, Deserialize)]
216pub struct RevisionResult {
217    /// Original confidence
218    pub original_confidence: f32,
219    /// New confidence after revision
220    pub new_confidence: f32,
221    /// Improvement achieved
222    pub improvement: f32,
223    /// Weak points addressed
224    pub addressed_weak_points: Vec<WeakPoint>,
225    /// Remaining weak points
226    pub remaining_weak_points: Vec<WeakPoint>,
227    /// Revision count
228    pub revision_count: u32,
229    /// Whether revision was successful
230    pub successful: bool,
231}
232
233/// Confidence checker for IoE pattern
234#[derive(Debug)]
235pub struct ConfidenceChecker {
236    /// Configuration
237    config: ConfidenceConfig,
238    /// History of confidence checks
239    check_history: Vec<ConfidenceCheckRecord>,
240    /// Learned patterns that indicate low confidence
241    learned_patterns: HashMap<String, f32>,
242}
243
244/// Record of a confidence check
245#[derive(Debug, Clone, Serialize, Deserialize)]
246pub struct ConfidenceCheckRecord {
247    /// Computed confidence score
248    pub score: f32,
249    /// Confidence level
250    pub level: ConfidenceLevel,
251    /// Weak points found
252    pub weak_points: Vec<WeakPoint>,
253    /// Factors contributing to score
254    pub factors: HashMap<String, f32>,
255    /// Task context
256    pub task_summary: String,
257    /// Timestamp
258    pub timestamp: u64,
259}
260
261impl ConfidenceChecker {
262    /// Create a new confidence checker
263    pub fn new(config: ConfidenceConfig) -> Self {
264        Self {
265            config,
266            check_history: Vec::new(),
267            learned_patterns: HashMap::new(),
268        }
269    }
270
271    /// Check if revision is needed based on confidence
272    pub fn should_revise(&self, output: &str, context: &ExecutionContext) -> bool {
273        let confidence = self.compute_confidence(output, context);
274        let attempts = context.previous_attempts.len() as u32;
275
276        // Only revise when:
277        // 1. Confidence is below threshold
278        // 2. We haven't exceeded the revision budget
279        confidence < self.config.threshold && attempts < self.config.revision_budget
280    }
281
282    /// Compute confidence score for an output
283    pub fn compute_confidence(&self, output: &str, context: &ExecutionContext) -> f32 {
284        let weights = &self.config.factor_weights;
285        let mut score = 0.0f32;
286
287        // Factor 1: Completeness
288        let completeness = self.assess_completeness(output, context);
289        score += completeness * weights.completeness;
290
291        // Factor 2: Structure
292        let structure = self.assess_structure(output);
293        score += structure * weights.structure;
294
295        // Factor 3: Certainty (absence of uncertainty markers)
296        let certainty = self.assess_certainty(output);
297        score += certainty * weights.certainty;
298
299        // Factor 4: Relevance to task
300        let relevance = self.assess_relevance(output, context);
301        score += relevance * weights.relevance;
302
303        // Factor 5: Code validity (if applicable)
304        let code_validity = self.assess_code_validity(output);
305        score += code_validity * weights.code_validity;
306
307        // Apply learned pattern adjustments
308        for (pattern, weight) in &self.learned_patterns {
309            if output.to_lowercase().contains(&pattern.to_lowercase()) {
310                score *= 1.0 - weight; // Reduce confidence for negative patterns
311            }
312        }
313
314        score.clamp(0.0, 1.0)
315    }
316
317    /// Assess output completeness
318    fn assess_completeness(&self, output: &str, context: &ExecutionContext) -> f32 {
319        if output.is_empty() {
320            return 0.0;
321        }
322
323        let mut score = 0.5f32; // Base score
324
325        // Check if output addresses the task
326        let task_words: Vec<&str> = context.task.split_whitespace().collect();
327        let output_lower = output.to_lowercase();
328        let addressed_count = task_words
329            .iter()
330            .filter(|w| output_lower.contains(&w.to_lowercase()))
331            .count();
332        let addressed_ratio = addressed_count as f32 / task_words.len().max(1) as f32;
333        score += addressed_ratio * 0.3;
334
335        // Check for incomplete markers
336        let incomplete_markers = ["TODO", "FIXME", "...", "to be continued", "incomplete"];
337        let has_incomplete = incomplete_markers
338            .iter()
339            .any(|m| output.contains(m));
340        if has_incomplete {
341            score -= 0.2;
342        }
343
344        // Bonus for substantial output
345        if output.len() > 500 {
346            score += 0.1;
347        }
348        if output.len() > 1000 {
349            score += 0.1;
350        }
351
352        score.clamp(0.0, 1.0)
353    }
354
355    /// Assess output structure
356    fn assess_structure(&self, output: &str) -> f32 {
357        if !self.config.use_structural_analysis {
358            return 0.8; // Default to high if disabled
359        }
360
361        let mut score = 0.5f32;
362
363        // Check for code blocks
364        let has_code_blocks = output.contains("```");
365        if has_code_blocks {
366            score += 0.2;
367        }
368
369        // Check for sections/headers
370        let has_headers = output.contains("##") || output.contains("**");
371        if has_headers {
372            score += 0.1;
373        }
374
375        // Check for lists
376        let has_lists = output.contains("\n- ") || output.contains("\n* ") || output.contains("\n1.");
377        if has_lists {
378            score += 0.1;
379        }
380
381        // Penalize very short outputs
382        if output.len() < 50 {
383            score -= 0.2;
384        }
385
386        // Check line count for multi-line responses
387        let line_count = output.lines().count();
388        if line_count > 5 {
389            score += 0.1;
390        }
391
392        score.clamp(0.0, 1.0)
393    }
394
395    /// Assess certainty (absence of uncertainty markers)
396    fn assess_certainty(&self, output: &str) -> f32 {
397        let output_lower = output.to_lowercase();
398        let mut uncertainty_count = 0;
399
400        for pattern in &self.config.low_confidence_patterns {
401            if output_lower.contains(&pattern.to_lowercase()) {
402                uncertainty_count += 1;
403            }
404        }
405
406        // More uncertainty markers = lower confidence
407        match uncertainty_count {
408            0 => 1.0,
409            1 => 0.8,
410            2 => 0.6,
411            3 => 0.4,
412            _ => 0.2,
413        }
414    }
415
416    /// Assess relevance to task
417    fn assess_relevance(&self, output: &str, context: &ExecutionContext) -> f32 {
418        let task_lower = context.task.to_lowercase();
419        let output_lower = output.to_lowercase();
420
421        // Extract key terms from task
422        let key_terms: Vec<&str> = task_lower
423            .split_whitespace()
424            .filter(|w| w.len() > 3) // Skip short words
425            .collect();
426
427        if key_terms.is_empty() {
428            return 0.5;
429        }
430
431        let matched = key_terms
432            .iter()
433            .filter(|term| output_lower.contains(*term))
434            .count();
435
436        let ratio = matched as f32 / key_terms.len() as f32;
437        (ratio * 0.5 + 0.5).clamp(0.0, 1.0) // Scale to 0.5-1.0 range
438    }
439
440    /// Assess code validity (basic heuristics)
441    fn assess_code_validity(&self, output: &str) -> f32 {
442        // Check if output contains code
443        let has_code = output.contains("```") || output.contains("fn ") || output.contains("def ")
444            || output.contains("function ") || output.contains("class ");
445
446        if !has_code {
447            return 0.8; // Not code-related, give neutral score
448        }
449
450        let mut score = 0.7f32;
451
452        // Check for balanced brackets
453        let open_parens = output.matches('(').count();
454        let close_parens = output.matches(')').count();
455        let open_braces = output.matches('{').count();
456        let close_braces = output.matches('}').count();
457        let open_brackets = output.matches('[').count();
458        let close_brackets = output.matches(']').count();
459
460        if open_parens == close_parens {
461            score += 0.1;
462        } else {
463            score -= 0.2;
464        }
465
466        if open_braces == close_braces {
467            score += 0.1;
468        } else {
469            score -= 0.2;
470        }
471
472        if open_brackets == close_brackets {
473            score += 0.1;
474        } else {
475            score -= 0.1;
476        }
477
478        // Check for common error patterns
479        if output.contains("error[") || output.contains("Error:") {
480            score -= 0.3;
481        }
482
483        score.clamp(0.0, 1.0)
484    }
485
486    /// Identify weak points in the output
487    pub fn identify_weak_points(&self, output: &str, context: &ExecutionContext) -> Vec<WeakPoint> {
488        let mut weak_points = Vec::new();
489
490        // Check for uncertainty markers
491        for pattern in &self.config.low_confidence_patterns {
492            if let Some(pos) = output.to_lowercase().find(&pattern.to_lowercase()) {
493                let line_num = output[..pos].matches('\n').count() + 1;
494                weak_points.push(
495                    WeakPoint::new(
496                        format!("line {}", line_num),
497                        format!("Uncertainty marker: '{}'", pattern),
498                        0.6,
499                        WeaknessType::Uncertainty,
500                    )
501                    .with_suggestion(format!("Remove or clarify the uncertain statement at '{}'", pattern)),
502                );
503            }
504        }
505
506        // Check for TODO/FIXME
507        for marker in ["TODO", "FIXME", "XXX", "HACK"] {
508            if output.contains(marker) {
509                let count = output.matches(marker).count();
510                weak_points.push(
511                    WeakPoint::new(
512                        "multiple locations",
513                        format!("Found {} {} markers", count, marker),
514                        0.7,
515                        WeaknessType::Incomplete,
516                    )
517                    .with_suggestion(format!("Address all {} items", marker)),
518                );
519            }
520        }
521
522        // Check for missing error handling in code
523        if output.contains("fn ") || output.contains("async fn ") {
524            if !output.contains("Result<") && !output.contains("Option<") && !output.contains("?") {
525                weak_points.push(
526                    WeakPoint::new(
527                        "function definitions",
528                        "Functions may lack proper error handling",
529                        0.5,
530                        WeaknessType::MissingErrorHandling,
531                    )
532                    .with_suggestion("Add Result/Option return types and error propagation"),
533                );
534            }
535        }
536
537        // Check for missing validation
538        if context.task.to_lowercase().contains("input")
539            || context.task.to_lowercase().contains("parameter")
540        {
541            if !output.to_lowercase().contains("valid")
542                && !output.to_lowercase().contains("check")
543                && !output.to_lowercase().contains("assert")
544            {
545                weak_points.push(
546                    WeakPoint::new(
547                        "input handling",
548                        "May be missing input validation",
549                        0.4,
550                        WeaknessType::MissingValidation,
551                    )
552                    .with_suggestion("Add input validation and bounds checking"),
553                );
554            }
555        }
556
557        // Check for missing tests if task mentions testing
558        if context.task.to_lowercase().contains("test") {
559            if !output.contains("#[test]") && !output.contains("fn test_") {
560                weak_points.push(
561                    WeakPoint::new(
562                        "test coverage",
563                        "No test functions found",
564                        0.6,
565                        WeaknessType::MissingTests,
566                    )
567                    .with_suggestion("Add unit tests with #[test] attribute"),
568                );
569            }
570        }
571
572        weak_points
573    }
574
575    /// Generate a targeted revision based on weak points
576    pub fn generate_targeted_revision(
577        &self,
578        output: &str,
579        weak_points: &[WeakPoint],
580    ) -> String {
581        if weak_points.is_empty() {
582            return output.to_string();
583        }
584
585        let mut revision_prompt = String::from("Please revise the following output to address these specific issues:\n\n");
586
587        for (i, wp) in weak_points.iter().enumerate() {
588            revision_prompt.push_str(&format!(
589                "{}. [{:?}] At {}: {}\n   Suggestion: {}\n\n",
590                i + 1,
591                wp.weakness_type,
592                wp.location,
593                wp.description,
594                wp.suggestion
595            ));
596        }
597
598        revision_prompt.push_str("\nOriginal output:\n");
599        revision_prompt.push_str(output);
600
601        revision_prompt
602    }
603
604    /// Record a confidence check for learning
605    pub fn record_check(&mut self, output: &str, context: &ExecutionContext) -> ConfidenceCheckRecord {
606        let score = self.compute_confidence(output, context);
607        let level = ConfidenceLevel::from_score(score);
608        let weak_points = self.identify_weak_points(output, context);
609
610        let mut factors = HashMap::new();
611        factors.insert("completeness".to_string(), self.assess_completeness(output, context));
612        factors.insert("structure".to_string(), self.assess_structure(output));
613        factors.insert("certainty".to_string(), self.assess_certainty(output));
614        factors.insert("relevance".to_string(), self.assess_relevance(output, context));
615        factors.insert("code_validity".to_string(), self.assess_code_validity(output));
616
617        let record = ConfidenceCheckRecord {
618            score,
619            level,
620            weak_points,
621            factors,
622            task_summary: context.task.chars().take(100).collect(),
623            timestamp: std::time::SystemTime::now()
624                .duration_since(std::time::UNIX_EPOCH)
625                .map(|d| d.as_secs())
626                .unwrap_or(0),
627        };
628
629        self.check_history.push(record.clone());
630        record
631    }
632
633    /// Learn from a pattern that indicated low quality
634    pub fn learn_pattern(&mut self, pattern: String, weight: f32) {
635        self.learned_patterns.insert(pattern, weight.clamp(0.0, 1.0));
636    }
637
638    /// Get check history
639    pub fn history(&self) -> &[ConfidenceCheckRecord] {
640        &self.check_history
641    }
642
643    /// Clear history
644    pub fn clear_history(&mut self) {
645        self.check_history.clear();
646    }
647
648    /// Get configuration
649    pub fn config(&self) -> &ConfidenceConfig {
650        &self.config
651    }
652}
653
#[cfg(test)]
mod tests {
    use super::*;
    use crate::claude_flow::AgentType;

    /// One representative score per band maps to the expected level.
    #[test]
    fn test_confidence_level_from_score() {
        let expectations = [
            (0.95_f32, ConfidenceLevel::VeryHigh),
            (0.8, ConfidenceLevel::High),
            (0.6, ConfidenceLevel::Medium),
            (0.4, ConfidenceLevel::Low),
            (0.2, ConfidenceLevel::VeryLow),
        ];

        for (score, expected) in expectations.iter() {
            assert_eq!(ConfidenceLevel::from_score(*score), *expected);
        }
    }

    /// Only the two lowest bands recommend a revision.
    #[test]
    fn test_should_revise_low_levels() {
        for level in [ConfidenceLevel::Low, ConfidenceLevel::VeryLow].iter() {
            assert!(level.should_revise());
        }
        for level in [ConfidenceLevel::Medium, ConfidenceLevel::High].iter() {
            assert!(!level.should_revise());
        }
    }

    /// A checker built from the default config exposes that config.
    #[test]
    fn test_confidence_checker_creation() {
        let checker = ConfidenceChecker::new(ConfidenceConfig::default());
        assert_eq!(checker.config().threshold, 0.7);
    }

    /// Empty output scores below 0.5.
    #[test]
    fn test_compute_confidence_empty() {
        let checker = ConfidenceChecker::new(ConfidenceConfig::default());
        let ctx = ExecutionContext::new("test task", AgentType::Coder, "input");

        assert!(checker.compute_confidence("", &ctx) < 0.5);
    }

    /// Hedged prose scores lower than a short, confident code answer.
    #[test]
    fn test_compute_confidence_with_uncertainty() {
        let checker = ConfidenceChecker::new(ConfidenceConfig::default());
        let ctx = ExecutionContext::new("implement function", AgentType::Coder, "input");

        let confident_score = checker.compute_confidence(
            "Here is the implementation:\n```rust\nfn example() { }\n```",
            &ctx,
        );
        let hedged_score =
            checker.compute_confidence("I'm not sure but possibly this might work...", &ctx);

        assert!(confident_score > hedged_score);
    }

    /// A TODO marker is reported as an `Incomplete` weak point.
    #[test]
    fn test_identify_weak_points_todo() {
        let checker = ConfidenceChecker::new(ConfidenceConfig::default());
        let ctx = ExecutionContext::new("implement function", AgentType::Coder, "input");

        let found =
            checker.identify_weak_points("fn example() {\n    // TODO: implement this\n}", &ctx);

        assert!(!found.is_empty());
        assert!(found
            .iter()
            .any(|point| matches!(point.weakness_type, WeaknessType::Incomplete)));
    }

    /// Low confidence triggers a revision until the budget is used up.
    #[test]
    fn test_should_revise() {
        let config = ConfidenceConfig {
            threshold: 0.7,
            revision_budget: 3,
            ..Default::default()
        };
        let checker = ConfidenceChecker::new(config);
        let mut ctx = ExecutionContext::new("test", AgentType::Coder, "input");
        let hedged = "I'm not sure, maybe...";

        // Low confidence output should trigger revision
        assert!(checker.should_revise(hedged, &ctx));

        // After exceeding budget, should not revise
        for _ in 0..3 {
            let attempt = crate::reflection::reflective_agent::PreviousAttempt {
                attempt_number: 1,
                output: String::new(),
                error: None,
                quality_score: None,
                duration_ms: 0,
                reflection: None,
            };
            ctx.previous_attempts.push(attempt);
        }
        assert!(!checker.should_revise(hedged, &ctx));
    }

    /// The builder keeps the location and stores the suggestion.
    #[test]
    fn test_weak_point_builder() {
        let point = WeakPoint::new(
            "line 5",
            "Missing error handling",
            0.7,
            WeaknessType::MissingErrorHandling,
        )
        .with_suggestion("Add Result return type");

        assert_eq!(point.location, "line 5");
        assert!(!point.suggestion.is_empty());
    }

    /// The revision prompt mentions the issue, the suggestion and the
    /// original output.
    #[test]
    fn test_generate_targeted_revision() {
        let checker = ConfidenceChecker::new(ConfidenceConfig::default());
        let issues = vec![
            WeakPoint::new("line 1", "Issue 1", 0.5, WeaknessType::Incomplete)
                .with_suggestion("Fix it"),
        ];

        let prompt = checker.generate_targeted_revision("original output", &issues);

        for expected in ["Issue 1", "Fix it", "original output"].iter() {
            assert!(prompt.contains(expected));
        }
    }

    /// An output containing a learned pattern scores lower than one without.
    #[test]
    fn test_learn_pattern() {
        let mut checker = ConfidenceChecker::new(ConfidenceConfig::default());
        checker.learn_pattern("problematic pattern".to_string(), 0.3);
        let ctx = ExecutionContext::new("test", AgentType::Coder, "input");

        let penalised = checker.compute_confidence("This has a problematic pattern in it", &ctx);
        let clean = checker.compute_confidence("This is clean code", &ctx);

        assert!(penalised < clean);
    }

    /// Recording a check stores it in the history and fills in the factors.
    #[test]
    fn test_record_check() {
        let mut checker = ConfidenceChecker::new(ConfidenceConfig::default());
        let ctx = ExecutionContext::new("test task", AgentType::Coder, "input");

        let record = checker.record_check("test output", &ctx);

        assert!(!checker.history().is_empty());
        assert!(record.factors.contains_key("completeness"));
    }
}