Skip to main content

aigent/
tester.rs

//! Skill tester and previewer for evaluation-driven development.
//!
//! Simulates how Claude would discover and activate a skill given a sample
//! user query. Shows what metadata would be injected into the system prompt
//! and identifies potential issues (description mismatch, broken references,
//! token budget).
7
8use std::path::Path;
9
10use crate::diagnostics::Diagnostic;
11use crate::models::SkillProperties;
12use crate::parser::read_properties;
13use crate::prompt::estimate_tokens;
14use crate::structure::validate_structure;
15use crate::validator::validate;
16use crate::Result;
17
18/// Result of testing a skill against a sample query.
19#[derive(Debug)]
20pub struct TestResult {
21    /// Skill name from frontmatter.
22    pub name: String,
23    /// Skill description from frontmatter.
24    pub description: String,
25    /// The test query provided by the user.
26    pub query: String,
27    /// Whether the description appears relevant to the query.
28    pub query_match: QueryMatch,
29    /// Numeric match score (0.0–1.0) from the weighted formula.
30    pub score: f64,
31    /// Estimated token cost of the skill's prompt footprint.
32    pub estimated_tokens: usize,
33    /// Validation diagnostics (errors + warnings).
34    pub diagnostics: Vec<Diagnostic>,
35    /// Structure diagnostics (missing references, etc.).
36    pub structure_diagnostics: Vec<Diagnostic>,
37    /// Parsed properties for display purposes.
38    pub properties: SkillProperties,
39}
40
/// Qualitative verdict on how well a skill matches a test query.
///
/// Each variant corresponds to a band of the weighted match score
/// (0.0–1.0). The enum carries no data, so it is `Copy` and `Hash` in
/// addition to the comparison traits: values can be passed around freely
/// and used as map or set keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum QueryMatch {
    /// Strong match: weighted score ≥ 0.4.
    Strong,
    /// Weak match: weighted score ≥ 0.15 (but below the strong threshold).
    Weak,
    /// No match: weighted score < 0.15.
    None,
}
51
52/// Test a skill against a sample user query.
53///
54/// Simulates skill discovery by checking:
55/// 1. Whether the description is relevant to the query (word overlap)
56/// 2. Whether the skill passes validation (metadata + structure)
57/// 3. The estimated token cost
58///
59/// # Arguments
60///
61/// * `dir` - Path to the skill directory
62/// * `query` - A sample user query to test activation against
63///
64/// # Errors
65///
66/// Returns an error if the SKILL.md cannot be read or parsed.
67pub fn test_skill(dir: &Path, query: &str) -> Result<TestResult> {
68    let properties = read_properties(dir)?;
69
70    // Compute weighted match score and category.
71    let (query_match, score) =
72        compute_query_match(query, &properties.name, &properties.description);
73
74    // Estimate token footprint: name + description (what goes into system prompt).
75    let estimated_tokens =
76        estimate_tokens(&properties.name) + estimate_tokens(&properties.description);
77
78    // Run standard validation.
79    let diagnostics = validate(dir);
80
81    // Run structure validation.
82    let structure_diagnostics = validate_structure(dir);
83
84    Ok(TestResult {
85        name: properties.name.clone(),
86        description: properties.description.clone(),
87        query: query.to_string(),
88        query_match,
89        score,
90        estimated_tokens,
91        diagnostics,
92        structure_diagnostics,
93        properties,
94    })
95}
96
/// Terminal width, in columns, used to wrap probe output when the caller
/// does not supply one.
const DEFAULT_WIDTH: usize = 80;
99
/// Append one `label value` line to `out`, word-wrapping long values.
///
/// The label is left-aligned and padded to `col` characters, followed by a
/// single space; continuation lines are indented by `col + 1` spaces so the
/// wrapped text lines up under the start of the value.
///
/// * `out` - buffer the formatted line(s) are appended to
/// * `label` - field label, e.g. `"Skill:"`
/// * `value` - field value; wrapped at single spaces where possible
/// * `col` - width the label is padded to
/// * `width` - total line width, in characters, that output should fit in
///
/// All measurements are in characters, never bytes, and every slice is cut
/// at an index taken from `char_indices`, so multibyte UTF-8 content
/// (e.g., `✓`, `⚠`, `—`) cannot cause a slicing panic.
///
/// Wrapping rules:
/// * A value that fits (or a `width` too small to hold any value text) is
///   emitted on one line, unwrapped.
/// * A line may be exactly `width` characters wide: a word that ends
///   precisely at the limit stays on the current line.
/// * Lines break at the last space that keeps them within `width`; a run
///   without any space is hard-broken at the limit.
/// * Spaces around a break are dropped, so wrapped lines have neither
///   leading nor trailing spaces and no blank lines are produced.
fn fmt_field(out: &mut String, label: &str, value: &str, col: usize, width: usize) {
    let prefix = format!("{:<col$} ", label);
    let indent = col + 1; // spaces for continuation lines
    let max_val = width.saturating_sub(indent);

    // Single-line case: the value fits, or there is no room to wrap into.
    if max_val == 0 || value.chars().count() + indent <= width {
        out.push_str(&prefix);
        out.push_str(value);
        out.push('\n');
        return;
    }

    // (byte offset, char) pairs: positions are counted in characters while
    // slices are taken at the recorded byte offsets.
    let chars: Vec<(usize, char)> = value.char_indices().collect();
    let total = chars.len();
    let mut pos = 0; // index into `chars` of the next character to emit
    let mut first = true;

    while pos < total {
        if first {
            out.push_str(&prefix);
        } else {
            // Drop the spaces the previous line was broken at.
            while pos < total && chars[pos].1 == ' ' {
                pos += 1;
            }
            if pos == total {
                break;
            }
            out.extend(std::iter::repeat(' ').take(indent));
        }

        let line_start = chars[pos].0;
        if total - pos <= max_val {
            // Everything that is left fits on this line.
            out.push_str(&value[line_start..]);
            out.push('\n');
            break;
        }

        // `limit` is the first character that no longer fits; it exists
        // because more than `max_val` characters remain. A space exactly at
        // `limit` is a valid break (the line is then `max_val` wide).
        // Starting the search at `pos + 1` guarantees the line is non-empty
        // and that `pos` advances on every iteration.
        let limit = pos + max_val;
        let cut = (pos + 1..=limit)
            .rev()
            .find(|&i| chars[i].1 == ' ')
            .unwrap_or(limit);
        out.push_str(value[line_start..chars[cut].0].trim_end_matches(' '));
        out.push('\n');
        pos = cut;
        first = false;
    }
}
155
156/// Format a test result as human-readable text.
157#[must_use]
158pub fn format_test_result(result: &TestResult) -> String {
159    format_test_result_width(result, DEFAULT_WIDTH)
160}
161
162/// Format a test result with a specific terminal width (for testing).
163#[must_use]
164pub(crate) fn format_test_result_width(result: &TestResult, width: usize) -> String {
165    let mut out = String::new();
166
167    // Aligned label width (widest label is "Description:" at 12 chars + 1 padding).
168    const W: usize = 13;
169
170    fmt_field(&mut out, "Skill:", &result.name, W, width);
171    fmt_field(
172        &mut out,
173        "Query:",
174        &format!("\"{}\"", result.query),
175        W,
176        width,
177    );
178    fmt_field(&mut out, "Description:", &result.description, W, width);
179    out.push('\n');
180
181    // Query match assessment.
182    let match_label = match &result.query_match {
183        QueryMatch::Strong => "STRONG ✓ — description aligns well with query",
184        QueryMatch::Weak => "WEAK ⚠ — some overlap, but description may not trigger reliably",
185        QueryMatch::None => "NONE ✗ — description does not match the test query",
186    };
187    fmt_field(
188        &mut out,
189        "Activation:",
190        &format!("{match_label} (score: {:.2})", result.score),
191        W,
192        width,
193    );
194
195    // Token budget.
196    fmt_field(
197        &mut out,
198        "Tokens:",
199        &format!("~{} tokens", result.estimated_tokens),
200        W,
201        width,
202    );
203    out.push('\n');
204
205    // Validation results.
206    let errors: Vec<_> = result.diagnostics.iter().filter(|d| d.is_error()).collect();
207    let warnings: Vec<_> = result
208        .diagnostics
209        .iter()
210        .filter(|d| d.is_warning())
211        .collect();
212
213    if errors.is_empty() && warnings.is_empty() && result.structure_diagnostics.is_empty() {
214        out.push_str("Validation: PASS — no issues found\n");
215    } else {
216        if !errors.is_empty() {
217            out.push_str(&format!("Validation errors ({}):\n", errors.len()));
218            for d in &errors {
219                out.push_str(&format!("  {d}\n"));
220            }
221        }
222        if !warnings.is_empty() {
223            out.push_str(&format!("Validation warnings ({}):\n", warnings.len()));
224            for d in &warnings {
225                out.push_str(&format!("  {d}\n"));
226            }
227        }
228        if !result.structure_diagnostics.is_empty() {
229            out.push_str(&format!(
230                "Structure issues ({}):\n",
231                result.structure_diagnostics.len()
232            ));
233            for d in &result.structure_diagnostics {
234                out.push_str(&format!("  {d}\n"));
235            }
236        }
237    }
238
239    out
240}
241
/// Common English stopwords excluded from token matching.
///
/// Entries are lowercase surface forms; they are compared against words
/// *before* stemming.
const STOPWORDS: &[&str] = &[
    "a", "an", "the", "is", "are", "was", "were", "of", "to", "in", "for", "on", "with", "and",
    "or", "but", "not", "it", "this", "that",
];

/// Normalize a word by lowercasing it and stripping one common English suffix.
///
/// This is a minimal stemmer, not a full Porter/Snowball implementation.
/// It handles the most common cases to improve token overlap. At most one
/// suffix is removed, and only when more than two bytes of the word would
/// remain in front of it, so very short words are returned unchanged
/// (apart from lowercasing).
fn stem(word: &str) -> String {
    // Order matters: longer suffixes are tried before their shorter tails.
    const SUFFIXES: [&str; 11] = [
        "ting", "sing", "zing", "ning", "ring", "ses", "ies", "ing", "ed", "es", "s",
    ];

    let lowered = word.to_lowercase();
    let stripped = SUFFIXES
        .iter()
        .find_map(|suffix| {
            // Byte-length guard: keep at least three bytes of root.
            if lowered.len() > suffix.len() + 2 {
                lowered.strip_suffix(*suffix)
            } else {
                None
            }
        })
        .map(str::to_owned);
    stripped.unwrap_or(lowered)
}

/// Tokenize a string into lowercase, stemmed words with punctuation stripped
/// and stopwords removed.
///
/// Words are split on whitespace and trimmed of leading and trailing
/// non-alphanumeric characters. Stopwords are filtered on the cleaned
/// surface form, before stemming: filtering afterwards would let a stopword
/// whose stem differs from it slip through (`"this"` stems to `"thi"`) and
/// would discard content words whose stem happens to equal a stopword
/// (`"notes"` stems to `"not"`).
fn tokenize(text: &str) -> Vec<String> {
    text.split_whitespace()
        .map(|raw| {
            raw.trim_matches(|c: char| !c.is_alphanumeric())
                .to_lowercase()
        })
        .filter(|word| !word.is_empty() && !STOPWORDS.contains(&word.as_str()))
        .map(|word| stem(&word))
        .collect()
}
280
/// Extract a trigger phrase from a description.
///
/// Looks for a sentence that begins with "Use when" or "Use this when"
/// (ASCII case-insensitive). A sentence may begin at the start of a line or,
/// within a line, after `.`, `!`, `?`, `;` or `:` followed by whitespace, so
/// single-line descriptions such as `"Does X. Use when Y."` are recognised
/// as well as descriptions that put the trigger on its own line.
///
/// Returns the text from the start of the first such sentence to the end of
/// its line (surrounding whitespace trimmed), or `None` if there is none.
/// The phrase occurring mid-sentence ("Do not use when ...") does not count.
fn extract_trigger(description: &str) -> Option<String> {
    const PHRASES: [&str; 2] = ["use when", "use this when"];

    // `str::get` yields `None` when the text is too short or the cut would
    // fall inside a multibyte character, so this never panics.
    let starts_with_phrase = |text: &str| {
        PHRASES.iter().any(|phrase| {
            text.get(..phrase.len())
                .map_or(false, |head| head.eq_ignore_ascii_case(phrase))
        })
    };

    for line in description.lines() {
        let trimmed = line.trim();
        let mut sentence_start = true; // the line start opens a sentence
        let mut after_terminator = false;

        for (idx, ch) in trimmed.char_indices() {
            if ch.is_whitespace() {
                // Whitespace after a terminator opens the next sentence.
                if after_terminator {
                    sentence_start = true;
                }
                continue;
            }
            if sentence_start && starts_with_phrase(&trimmed[idx..]) {
                return Some(trimmed[idx..].to_string());
            }
            sentence_start = false;
            after_terminator = matches!(ch, '.' | '!' | '?' | ';' | ':');
        }
    }
    None
}
295
296/// Compute a weighted match score between a query and a skill.
297///
298/// Uses a three-component weighted formula:
299/// - **0.5 × description overlap** (fraction of query tokens found in description)
300/// - **0.3 × trigger match**: 1.0 if any query token appears in the
301///   skill's trigger phrase ("Use when..."), 0.0 otherwise
302/// - **0.2 × name match**: 1.0 if any query token is a substring of the
303///   skill name, 0.0 otherwise
304///
305/// Returns the [`QueryMatch`] category and the numeric score (0.0–1.0).
306/// Strong ≥ 0.4, Weak ≥ 0.15, None < 0.15.
307fn compute_query_match(query: &str, name: &str, description: &str) -> (QueryMatch, f64) {
308    let query_tokens = tokenize(query);
309
310    if query_tokens.is_empty() {
311        return (QueryMatch::None, 0.0);
312    }
313
314    let desc_tokens = tokenize(description);
315
316    // Description overlap: fraction of query tokens found in description tokens.
317    // This measures recall (how many query terms are covered) rather than
318    // Jaccard (which penalizes for extra description tokens).
319    let query_set: std::collections::HashSet<&str> =
320        query_tokens.iter().map(|s| s.as_str()).collect();
321    let desc_set: std::collections::HashSet<&str> =
322        desc_tokens.iter().map(|s| s.as_str()).collect();
323    let intersection = query_set.intersection(&desc_set).count();
324    let desc_overlap = if query_set.is_empty() {
325        0.0
326    } else {
327        intersection as f64 / query_set.len() as f64
328    };
329
330    // Trigger match: 1.0 if any query token appears in the trigger phrase.
331    let trigger_score = if let Some(trigger) = extract_trigger(description) {
332        let trigger_lower = trigger.to_lowercase();
333        if query_tokens
334            .iter()
335            .any(|t| trigger_lower.contains(t.as_str()))
336        {
337            1.0
338        } else {
339            0.0
340        }
341    } else {
342        0.0
343    };
344
345    // Name match: 1.0 if any query token is a substring of the skill name.
346    let name_lower = name.to_lowercase();
347    let name_score = if query_tokens.iter().any(|t| name_lower.contains(t.as_str())) {
348        1.0
349    } else {
350        0.0
351    };
352
353    // Weighted formula.
354    let score = 0.5 * desc_overlap + 0.3 * trigger_score + 0.2 * name_score;
355
356    let category = if score >= 0.4 {
357        QueryMatch::Strong
358    } else if score >= 0.15 {
359        QueryMatch::Weak
360    } else {
361        QueryMatch::None
362    };
363
364    (category, score)
365}
366
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    /// Create a skill dir with given frontmatter + body.
    ///
    /// Returns the temporary parent directory (keep it alive for the
    /// duration of the test) and the path of the skill directory inside it.
    fn make_skill(
        name: &str,
        description: &str,
        body: &str,
    ) -> (tempfile::TempDir, std::path::PathBuf) {
        let parent = tempdir().unwrap();
        let dir = parent.path().join(name);
        fs::create_dir(&dir).unwrap();
        fs::write(
            dir.join("SKILL.md"),
            format!("---\nname: {name}\ndescription: {description}\n---\n{body}\n"),
        )
        .unwrap();
        (parent, dir)
    }

    // ── Query matching ───────────────────────────────────────────────

    #[test]
    fn strong_match_when_query_words_in_description() {
        let (m, score) = compute_query_match(
            "process PDF files",
            "pdf-processor",
            "Processes PDF files and generates detailed reports",
        );
        assert_eq!(m, QueryMatch::Strong);
        assert!(score >= 0.4, "score {score} should be ≥ 0.4");
    }

    #[test]
    fn weak_match_with_partial_overlap() {
        let (m, score) = compute_query_match(
            "generate database migration scripts quickly",
            "pdf-processor",
            "Processes PDF files and generates detailed reports",
        );
        assert!(
            matches!(m, QueryMatch::Weak | QueryMatch::None),
            "expected Weak or None for partial overlap, got {m:?} (score: {score})"
        );
    }

    #[test]
    fn no_match_with_unrelated_query() {
        let (m, score) = compute_query_match(
            "deploy kubernetes cluster",
            "pdf-processor",
            "Processes PDF files and generates detailed reports",
        );
        assert_eq!(m, QueryMatch::None);
        assert!(score < 0.15, "score {score} should be < 0.15");
    }

    #[test]
    fn empty_query_is_no_match() {
        let (m, score) = compute_query_match("", "some-skill", "Some description");
        assert_eq!(m, QueryMatch::None);
        assert_eq!(score, 0.0);
    }

    #[test]
    fn case_insensitive_matching() {
        let (m, _score) = compute_query_match(
            "PDF PROCESSING",
            "pdf-processor",
            "Processes pdf files and generates reports",
        );
        assert!(
            matches!(m, QueryMatch::Strong | QueryMatch::Weak),
            "expected Strong or Weak for case-insensitive match, got {m:?}"
        );
    }

    // ── Weighted scoring specific tests ──────────────────────────────

    #[test]
    fn trigger_phrase_boosts_score() {
        // Both descriptions mention every query word, so the description
        // overlap component is identical, and the name matches neither.
        // The only difference is that one second line is a "Use when"
        // trigger, which isolates the trigger component (weight 0.3).
        let (_, score_with_trigger) = compute_query_match(
            "lint javascript",
            "unrelated-name",
            "Analyzes syntax patterns.\nUse when you want to lint javascript files.",
        );
        let (_, score_without_trigger) = compute_query_match(
            "lint javascript",
            "unrelated-name",
            "Analyzes syntax patterns.\nHelps you lint javascript files.",
        );
        assert!(
            score_with_trigger > score_without_trigger,
            "trigger phrase should boost score: {score_with_trigger} vs {score_without_trigger}"
        );
        assert!(
            (score_with_trigger - score_without_trigger - 0.3).abs() < 1e-9,
            "trigger should contribute exactly its 0.3 weight: \
             {score_with_trigger} vs {score_without_trigger}"
        );
    }

    #[test]
    fn name_match_boosts_score() {
        let (_, score_name_match) = compute_query_match(
            "process pdf",
            "pdf-processor",
            "Handles document transformation tasks.",
        );
        let (_, score_no_name) = compute_query_match(
            "process pdf",
            "document-handler",
            "Handles document transformation tasks.",
        );
        assert!(
            score_name_match > score_no_name,
            "name match should boost score: {score_name_match} vs {score_no_name}"
        );
    }

    #[test]
    fn all_zero_inputs_produce_zero_score() {
        let (m, score) = compute_query_match(
            "xylophone zephyr",
            "unrelated-name",
            "Completely unrelated description about cooking pasta.",
        );
        assert_eq!(m, QueryMatch::None);
        assert_eq!(score, 0.0, "totally unrelated query should score 0.0");
    }

    // ── test_skill integration ───────────────────────────────────────

    #[test]
    fn test_skill_returns_result_for_valid_skill() {
        let (_parent, dir) = make_skill(
            "pdf-tool",
            "Processes PDF files and extracts text content",
            "Body content here.",
        );
        let result = test_skill(&dir, "process some PDF files").unwrap();
        assert_eq!(result.name, "pdf-tool");
        assert_eq!(result.query_match, QueryMatch::Strong);
        assert!(result.estimated_tokens > 0);
    }

    #[test]
    fn test_skill_reports_validation_issues() {
        let parent = tempdir().unwrap();
        let dir = parent.path().join("bad-skill");
        fs::create_dir(&dir).unwrap();
        // Missing description → validation error.
        fs::write(dir.join("SKILL.md"), "---\nname: bad-skill\n---\nBody.\n").unwrap();
        let result = test_skill(&dir, "anything");
        // Should fail because description is required.
        assert!(result.is_err());
    }

    #[test]
    fn test_skill_detects_structure_issues() {
        let (_parent, dir) = make_skill(
            "ref-skill",
            "Skill with broken reference",
            "See [guide](nonexistent.md) for details.",
        );
        let result = test_skill(&dir, "guide reference").unwrap();
        assert!(
            !result.structure_diagnostics.is_empty(),
            "expected structure diagnostics for broken reference",
        );
    }

    // ── format_test_result ───────────────────────────────────────────

    #[test]
    fn format_includes_skill_name_and_query() {
        let (_parent, dir) =
            make_skill("format-test", "A test skill for formatting output", "Body.");
        let result = test_skill(&dir, "test formatting").unwrap();
        let text = format_test_result(&result);
        assert!(text.contains("format-test"));
        assert!(text.contains("test formatting"));
    }

    #[test]
    fn format_shows_activation_status() {
        let (_parent, dir) = make_skill("activation-test", "Processes PDF files quickly", "Body.");
        let result = test_skill(&dir, "deploy kubernetes cluster").unwrap();
        let text = format_test_result(&result);
        assert!(text.contains("NONE"));
    }

    #[test]
    fn format_shows_pass_for_clean_skill() {
        let (_parent, dir) = make_skill(
            "clean-skill",
            "A clean skill that passes validation",
            "Body content.",
        );
        let result = test_skill(&dir, "clean skill").unwrap();
        let text = format_test_result(&result);
        assert!(text.contains("PASS"));
    }

    // ── fmt_field wrapping ──────────────────────────────────────────

    #[test]
    fn fmt_field_short_value_no_wrap() {
        let mut out = String::new();
        fmt_field(&mut out, "Label:", "short", 13, 80);
        assert_eq!(out, "Label:        short\n");
    }

    #[test]
    fn fmt_field_long_value_wraps_aligned() {
        let mut out = String::new();
        // Width 40, col 13 → 14 chars for indent, 26 chars for value per line.
        fmt_field(
            &mut out,
            "Description:",
            "Validates AI agent skill definitions against the spec",
            13,
            40,
        );
        let lines: Vec<&str> = out.lines().collect();
        assert!(lines.len() > 1, "expected wrapping, got: {out:?}");
        // All continuation lines must start with 14 spaces.
        for line in &lines[1..] {
            assert!(
                line.starts_with("              "),
                "continuation not aligned: {line:?}",
            );
        }
    }

    #[test]
    fn fmt_field_multibyte_utf8_no_panic() {
        let mut out = String::new();
        // Value contains multibyte chars (✓=3 bytes, ⚠=3, —=3).
        // At width 50, indent 14, max_val=36 chars — slicing must use
        // char boundaries, not byte offsets, to avoid a panic.
        fmt_field(
            &mut out,
            "Activation:",
            "WEAK ⚠ — some overlap, but description may not trigger reliably (score: 0.33)",
            13,
            50,
        );
        let lines: Vec<&str> = out.lines().collect();
        assert!(lines.len() > 1, "expected wrapping, got: {out:?}");
        for line in &lines[1..] {
            assert!(
                line.starts_with("              "),
                "continuation not aligned: {line:?}",
            );
        }
    }

    #[test]
    fn fmt_field_char_count_not_byte_len() {
        let mut out = String::new();
        // "café" is 4 chars but 5 bytes (é = 2 bytes).
        // With width=20, indent=7, max_val=13: "café latte warm drink" is
        // 21 chars, which triggers wrapping. Byte-based slicing would panic
        // or break incorrectly.
        fmt_field(&mut out, "Item:", "café latte warm drink", 6, 20);
        assert!(!out.is_empty(), "should produce output without panic",);
        // Verify no line exceeds the width in characters.
        for line in out.lines() {
            assert!(
                line.chars().count() <= 20,
                "line exceeds width: {line:?} ({} chars)",
                line.chars().count(),
            );
        }
    }

    #[test]
    fn fmt_field_consecutive_spaces_no_blank_lines() {
        let mut out = String::new();
        fmt_field(&mut out, "Label:", "word   word   word   end", 6, 18);
        for line in out.lines() {
            let trimmed = line.trim();
            assert!(!trimmed.is_empty(), "blank continuation line: {out:?}");
        }
    }

    #[test]
    fn format_test_result_wraps_description() {
        let long_desc = "Validates AI agent skill definitions (SKILL.md files) against \
            the Anthropic agent skill specification and checks all fields";
        let (_parent, dir) = make_skill("wrap-test", long_desc, "Body content.");
        let result = test_skill(&dir, "validate skill").unwrap();
        let text = format_test_result_width(&result, 60);
        let desc_lines: Vec<&str> = text
            .lines()
            .skip_while(|l| !l.starts_with("Description:"))
            .take_while(|l| !l.is_empty())
            .collect();
        assert!(
            desc_lines.len() > 1,
            "description should wrap at width 60: {desc_lines:?}",
        );
        for line in &desc_lines[1..] {
            assert!(
                line.starts_with("              "),
                "continuation not aligned: {line:?}",
            );
        }
    }
}
676}