kardo_core/analysis/
config_quality.rs

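//! Quality assessment for AI-assistant configuration files
//! (`CLAUDE.md`, `.claude/instructions`, `README.*`).
//!
//! Checks existence, length, structure, specificity, actionable rules,
//! file references, shell commands, and recency. Also resolves short
//! `AGENTS.md` pointer files to the content they refer to.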
5
6use regex::Regex;
7use serde::Serialize;
8use std::collections::HashSet;
9
10/// Result of configuration quality analysis.
11#[derive(Debug, Clone, Serialize)]
12pub struct ConfigQualityResult {
13    /// Blended config quality score (40% existence + 60% content quality).
14    pub score: f64,
15    /// Whether a CLAUDE.md file exists in the project.
16    pub has_claude_md: bool,
17    /// Whether a .claude/instructions file exists.
18    pub has_claude_instructions: bool,
19    /// Whether a README.md (or README.*) file exists.
20    pub has_readme: bool,
21    /// Per-file quality breakdowns.
22    pub details: Vec<ConfigDetail>,
23    /// Whether LLM quality assessment was used to adjust the score.
24    #[serde(default)]
25    pub llm_adjusted: bool,
26}
27
28#[derive(Debug, Clone, Serialize)]
29pub struct ConfigDetail {
30    /// Relative path of the analyzed config file.
31    pub file: String,
32    /// Score based on content length (chars).
33    pub length_score: f64,
34    /// Score based on structural elements (headings, code blocks, lists).
35    pub structure_score: f64,
36    /// Score based on project-specific terms, paths, and identifiers.
37    pub specificity_score: f64,
38    /// Score based on actionable keywords (MUST, NEVER, ALWAYS, etc.).
39    pub actionable_score: f64,
40    /// Score based on whether referenced file paths actually exist.
41    pub file_refs_score: f64,
42    /// Score based on presence of shell command code blocks.
43    pub shell_commands_score: f64,
44    /// Score based on days since last modification.
45    pub recency_score: f64,
46    /// LLM quality assessment score (0.0-1.0), None if LLM was not used.
47    #[serde(skip_serializing_if = "Option::is_none")]
48    pub llm_quality_score: Option<f64>,
49}
50
/// Input for config quality analysis: a config file with its metadata.
#[derive(Debug, Clone)]
pub struct ConfigFile {
    /// Path of the file relative to the project (e.g. `CLAUDE.md`).
    pub relative_path: String,
    /// Full text content of the file.
    pub content: String,
    /// Days elapsed since the file was last modified, or `None` when unknown.
    pub days_since_modified: Option<i64>,
}
57
/// Stateless namespace for the config-quality scoring functions.
///
/// Carries no data; all functionality is exposed as associated functions.
#[derive(Debug, Clone, Copy, Default)]
pub struct ConfigQualityAnalyzer;
59
/// Decide whether config file content is real material rather than a stub.
///
/// All of the following must hold:
/// - the text is at least 100 bytes long,
/// - it contains at least 20 whitespace-separated words,
/// - at least one line begins with `#` (treated as a Markdown heading),
/// - the Shannon entropy of its byte distribution is at least 2.5 bits,
///   which rejects highly repetitive filler.
pub fn is_meaningful_content(content: &str) -> bool {
    const MIN_BYTES: usize = 100;
    const MIN_WORDS: usize = 20;
    const MIN_ENTROPY_BITS: f64 = 2.5;

    let too_short = content.len() < MIN_BYTES;
    if too_short || content.split_whitespace().count() < MIN_WORDS {
        return false;
    }

    if !content.lines().any(|line| line.starts_with('#')) {
        return false;
    }

    // Histogram of byte values; entropy is computed over bytes, not chars.
    let mut histogram = [0u32; 256];
    for byte in content.bytes() {
        histogram[usize::from(byte)] += 1;
    }

    let byte_total = content.len() as f64;
    let mut entropy_bits = 0.0_f64;
    for &occurrences in histogram.iter() {
        if occurrences == 0 {
            continue;
        }
        let p = occurrences as f64 / byte_total;
        entropy_bits += -p * p.log2();
    }

    entropy_bits >= MIN_ENTROPY_BITS
}
99
/// Expand a config file that is merely a pointer to AGENTS.md.
///
/// Some projects keep a tiny CLAUDE.md whose whole content is a reference such
/// as `@AGENTS.md`. Scoring that pointer directly would rate a ~10-byte string
/// instead of the real configuration, so the pointer is replaced by the
/// AGENTS.md text, prefixed with an HTML comment recording the original pointer.
///
/// The input is returned unchanged (as an owned `String`) when any of these hold:
/// - the trimmed content is 50 bytes or longer (treated as real content),
/// - the trimmed content mentions neither `AGENTS.md` nor `agents.md`,
/// - `agents_content` is `None`.
pub fn resolve_agents_reference(content: &str, agents_content: Option<&str>) -> String {
    let pointer = content.trim();
    let mentions_agents = ["AGENTS.md", "agents.md"]
        .iter()
        .any(|name| pointer.contains(*name));

    match agents_content {
        Some(agents) if pointer.len() < 50 && mentions_agents => {
            format!("<!-- Resolved from: {} -->\n{}", pointer, agents)
        }
        _ => content.to_string(),
    }
}
124
125impl ConfigQualityAnalyzer {
126    /// Analyze the quality of configuration files (CLAUDE.md, .claude/instructions, README.md).
127    ///
128    /// `config_files`: the config files found in the project
129    /// `known_files`: set of all known file paths (for validating file references)
130    pub fn analyze(
131        config_files: &[ConfigFile],
132        known_files: &HashSet<String>,
133    ) -> ConfigQualityResult {
134        let has_claude_md = config_files.iter().any(|f| {
135            f.relative_path == "CLAUDE.md" || f.relative_path.ends_with("/CLAUDE.md")
136        });
137        let has_claude_instructions = config_files.iter().any(|f| {
138            f.relative_path == ".claude/instructions"
139                || f.relative_path.contains(".claude/instructions")
140        });
141        let has_readme = config_files.iter().any(|f| {
142            let name = f.relative_path.to_uppercase();
143            name == "README.MD" || name.starts_with("README.")
144        });
145
146        // Existence bonus — only count a file if its content is meaningful (not a placeholder).
147        let claude_md_meaningful = config_files.iter().any(|f| {
148            (f.relative_path == "CLAUDE.md" || f.relative_path.ends_with("/CLAUDE.md"))
149                && is_meaningful_content(&f.content)
150        });
151        let claude_instructions_meaningful = config_files.iter().any(|f| {
152            (f.relative_path == ".claude/instructions"
153                || f.relative_path.contains(".claude/instructions"))
154                && is_meaningful_content(&f.content)
155        });
156        let readme_meaningful = config_files.iter().any(|f| {
157            let name = f.relative_path.to_uppercase();
158            (name == "README.MD" || name.starts_with("README."))
159                && is_meaningful_content(&f.content)
160        });
161
162        let existence_score = {
163            let mut s = 0.0;
164            if claude_md_meaningful { s += 0.4; }
165            if claude_instructions_meaningful { s += 0.3; }
166            if readme_meaningful { s += 0.3; }
167            s
168        };
169
170        if config_files.is_empty() {
171            return ConfigQualityResult {
172                score: 0.0,
173                has_claude_md,
174                has_claude_instructions,
175                has_readme,
176                details: vec![],
177                llm_adjusted: false,
178            };
179        }
180
181        let mut details = Vec::new();
182        let mut content_scores = Vec::new();
183
184        // Calibrated sub-dimension weights (2026-02-23):
185        // Based on 47-repo benchmark analysis. Specificity and Actionable rules
186        // are the strongest predictors of AI effectiveness (r=0.81 vs expert).
187        // Previous: equal 1/7 weighting for all 7 dimensions.
188        const W_LENGTH: f64 = 0.12;
189        const W_STRUCTURE: f64 = 0.13;
190        const W_SPECIFICITY: f64 = 0.20;
191        const W_ACTIONABLE: f64 = 0.18;
192        const W_FILE_REFS: f64 = 0.13;
193        const W_SHELL_COMMANDS: f64 = 0.14;
194        const W_RECENCY: f64 = 0.10;
195
196        for file in config_files {
197            let detail = Self::analyze_file(file, known_files);
198            let file_score = W_LENGTH * detail.length_score
199                + W_STRUCTURE * detail.structure_score
200                + W_SPECIFICITY * detail.specificity_score
201                + W_ACTIONABLE * detail.actionable_score
202                + W_FILE_REFS * detail.file_refs_score
203                + W_SHELL_COMMANDS * detail.shell_commands_score
204                + W_RECENCY * detail.recency_score;
205            content_scores.push(file_score);
206            details.push(detail);
207        }
208
209        let avg_content = content_scores.iter().sum::<f64>() / content_scores.len() as f64;
210
211        // Weighted: 40% existence, 60% content quality
212        let score = 0.4 * existence_score + 0.6 * avg_content;
213
214        ConfigQualityResult {
215            score,
216            has_claude_md,
217            has_claude_instructions,
218            has_readme,
219            details,
220            llm_adjusted: false,
221        }
222    }
223
224    fn analyze_file(file: &ConfigFile, known_files: &HashSet<String>) -> ConfigDetail {
225        let content = &file.content;
226        let chars = content.len();
227
228        // Length scoring: <500 = poor, 500-2000 = basic, 2000-5000 = good, >5000 = excellent
229        let length_score = if chars < 500 {
230            chars as f64 / 500.0 * 0.4
231        } else if chars < 2000 {
232            0.4 + (chars - 500) as f64 / 1500.0 * 0.3
233        } else if chars < 5000 {
234            0.7 + (chars - 2000) as f64 / 3000.0 * 0.2
235        } else {
236            0.9 + (0.1_f64).min((chars - 5000) as f64 / 5000.0 * 0.1)
237        };
238
239        // Structure: count headings, code blocks, lists
240        let heading_count = content.lines().filter(|l| l.starts_with('#')).count();
241        let code_block_count = content.matches("```").count() / 2;
242        let list_count = content.lines().filter(|l| {
243            let trimmed = l.trim();
244            trimmed.starts_with("- ") || trimmed.starts_with("* ") || trimmed.starts_with("1.")
245        }).count();
246        let structure_elements = heading_count + code_block_count + list_count;
247        let structure_score = (structure_elements as f64 / 15.0).min(1.0);
248
249        // Specificity: project-specific terms (paths, tech names, non-generic)
250        let specificity_score = Self::score_specificity(content);
251
252        // Actionable rules: MUST, NEVER, ALWAYS, REQUIRED, IMPORTANT
253        let actionable_keywords = ["MUST", "NEVER", "ALWAYS", "REQUIRED", "IMPORTANT", "SHALL", "DO NOT"];
254        let actionable_count: usize = actionable_keywords
255            .iter()
256            .map(|kw| content.matches(kw).count())
257            .sum();
258        let actionable_score = (actionable_count as f64 / 10.0).min(1.0);
259
260        // File references: check if paths mentioned in content exist
261        let file_refs_score = Self::score_file_refs(content, known_files);
262
263        // Shell commands: presence of executable commands in code blocks
264        let shell_pattern = Regex::new(r"```(?:bash|sh|shell|zsh)\n").unwrap();
265        let shell_blocks = shell_pattern.find_iter(content).count();
266        let shell_commands_score = (shell_blocks as f64 / 3.0).min(1.0);
267
268        // Recency: days since last modified
269        let recency_score = match file.days_since_modified {
270            Some(days) if days <= 7 => 1.0,
271            Some(days) if days <= 30 => 1.0 - 0.4 * ((days - 7) as f64 / 23.0),
272            Some(days) if days <= 90 => 0.6 - 0.3 * ((days - 30) as f64 / 60.0),
273            Some(_) => 0.2,
274            None => 0.5,
275        };
276
277        ConfigDetail {
278            file: file.relative_path.clone(),
279            length_score,
280            structure_score,
281            specificity_score,
282            actionable_score,
283            file_refs_score,
284            shell_commands_score,
285            recency_score,
286            llm_quality_score: None,
287        }
288    }
289
290    /// Score specificity using a multi-signal approach.
291    ///
292    /// Four signals weighted:
293    /// A) Concrete path density (0.35) — real file paths with extensions
294    /// B) Unique identifier density (0.25) — backtick-wrapped identifiers
295    /// C) Word variety (0.25) — ratio of unique to total words
296    /// D) Generic phrase penalty (0.15) — penalize vague instructions
297    fn score_specificity(content: &str) -> f64 {
298        // Signal A: Concrete path density (0.35)
299        let path_re = Regex::new(
300            r"(?:src/|lib/|docs/|\.claude/|components/|app/|tests/|crates/)?[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\.[a-zA-Z]{1,6}"
301        ).unwrap();
302        let real_paths = path_re.find_iter(content).count();
303        let signal_a = (real_paths as f64 / 15.0).min(1.0);
304
305        // Signal B: Unique identifier density (0.25)
306        let ident_re = Regex::new(r"`([a-z][a-zA-Z0-9_]{2,})`").unwrap();
307        let common_words: HashSet<&str> = [
308            "the", "and", "for", "not", "you", "all", "can", "has", "use",
309            "this", "that", "with", "from", "will", "your", "code", "file",
310            "true", "false", "null", "none", "some", "each", "when", "then",
311        ].iter().copied().collect();
312        let identifiers: HashSet<String> = ident_re
313            .captures_iter(content)
314            .filter_map(|cap| {
315                let id = cap.get(1)?.as_str().to_lowercase();
316                if common_words.contains(id.as_str()) { None } else { Some(id) }
317            })
318            .collect();
319        let signal_b = (identifiers.len() as f64 / 10.0).min(1.0);
320
321        // Signal C: Word variety (0.25)
322        let words: Vec<&str> = content
323            .split_whitespace()
324            .filter(|w| w.len() > 3)
325            .collect();
326        let signal_c = if words.is_empty() {
327            0.0
328        } else {
329            let unique: HashSet<&str> = words.iter().copied().collect();
330            (unique.len() as f64 / words.len() as f64).min(1.0)
331        };
332
333        // Signal D: Generic phrase penalty (0.15)
334        let generic_phrases = [
335            "write clean code", "follow best practices", "be helpful",
336            "use typescript", "write good code", "keep it simple",
337            "be concise", "write tests", "follow conventions",
338        ];
339        let content_lower = content.to_lowercase();
340        let generic_count = generic_phrases
341            .iter()
342            .filter(|phrase| content_lower.contains(*phrase))
343            .count();
344        let signal_d = 1.0 - (generic_count as f64 * 0.1).min(0.3);
345
346        0.35 * signal_a + 0.25 * signal_b + 0.25 * signal_c + 0.15 * signal_d
347    }
348
349    /// Blend AGENTS.md score into an existing config quality result.
350    ///
351    /// Call this after `analyze()` to incorporate AGENTS.md quality.
352    /// Uses a 70/30 blend: 70% original config score + 30% AGENTS.md score.
353    /// Has no effect if `agents_md_score` is 0.0 (file not found or empty).
354    pub fn blend_agents_md_score(config_result: &mut ConfigQualityResult, agents_md_score: f64) {
355        if agents_md_score > 0.0 {
356            config_result.score = 0.70 * config_result.score + 0.30 * agents_md_score;
357        }
358    }
359
360    /// Score file references by checking how many referenced paths actually exist.
361    fn score_file_refs(content: &str, known_files: &HashSet<String>) -> f64 {
362        // Find potential file references (paths ending with extensions)
363        let path_re = Regex::new(r"`([a-zA-Z0-9_./-]+\.[a-zA-Z]{1,6})`").unwrap();
364        let refs: Vec<String> = path_re
365            .captures_iter(content)
366            .filter_map(|cap| cap.get(1).map(|m| m.as_str().to_string()))
367            .collect();
368
369        if refs.is_empty() {
370            return 0.5; // No file refs — neutral
371        }
372
373        let valid = refs.iter().filter(|r| known_files.contains(r.as_str())).count();
374        valid as f64 / refs.len() as f64
375    }
376}
377
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an owned path set from string literals.
    fn make_known_files(files: &[&str]) -> HashSet<String> {
        files.iter().map(|path| (*path).to_owned()).collect()
    }

    /// Shorthand for a `ConfigFile` fixture with a known modification age.
    fn config(path: &str, content: &str, days: i64) -> ConfigFile {
        ConfigFile {
            relative_path: path.to_owned(),
            content: content.to_owned(),
            days_since_modified: Some(days),
        }
    }

    #[test]
    fn test_no_config_files() {
        let no_known_files = HashSet::new();
        let outcome = ConfigQualityAnalyzer::analyze(&[], &no_known_files);
        assert!(outcome.score.abs() < 0.01);
        assert!(!outcome.has_claude_md);
        assert!(!outcome.has_readme);
    }

    #[test]
    fn test_minimal_claude_md() {
        let inputs = [config("CLAUDE.md", "# Project\nShort description.", 0)];
        let outcome = ConfigQualityAnalyzer::analyze(&inputs, &HashSet::new());
        assert!(outcome.has_claude_md);
        // Tiny content scores low, yet the per-file content score keeps it above zero.
        assert!(outcome.score > 0.0);
        assert!(outcome.score < 0.7);
    }

    #[test]
    fn test_rich_claude_md() {
        let rich = r#"# Kardo Project

## Stack
- Next.js 16, React 19, TypeScript
- Tailwind CSS, shadcn/ui

## Rules

You MUST always run `npm run build` before committing.
You MUST NEVER edit `components/ui/*` files.
ALWAYS check types with `npx tsc --noEmit`.

```bash
npm run dev
npm run build
npm run lint
```

```bash
cargo test
```

## Files
- `src/lib/types.ts` — type definitions
- `docs/api.md` — API documentation
- `README.md` — project readme

## IMPORTANT

This is a REQUIRED reading for all contributors.
ALWAYS follow the coding standards.
NEVER push directly to main without review.
"#;

        let inputs = [config("CLAUDE.md", rich, 2)];
        let known = make_known_files(&["src/lib/types.ts", "docs/api.md", "README.md"]);
        let outcome = ConfigQualityAnalyzer::analyze(&inputs, &known);
        assert!(outcome.has_claude_md);
        assert!(outcome.score > 0.55, "Rich CLAUDE.md should score high, got {}", outcome.score);
    }

    #[test]
    fn test_existence_scoring() {
        let inputs = [
            config("CLAUDE.md", "# Project\nBasic.", 0),
            config(".claude/instructions", "Instructions here.", 0),
            config("README.md", "# README\nProject info.", 0),
        ];
        let outcome = ConfigQualityAnalyzer::analyze(&inputs, &HashSet::new());
        assert!(outcome.has_claude_md);
        assert!(outcome.has_claude_instructions);
        assert!(outcome.has_readme);
    }

    #[test]
    fn test_specificity_generic_low_score() {
        // Only vague advice: no paths, no identifiers.
        let vague = "Write clean code. Follow best practices. Be helpful. Use TypeScript. Keep it simple.";
        let score = ConfigQualityAnalyzer::score_specificity(vague);
        assert!(score < 0.50, "Generic content should score low, got {}", score);
    }

    #[test]
    fn test_specificity_specific_high_score() {
        // Concrete paths, identifiers, and commands throughout.
        let concrete = r#"
Edit `src/lib/types.ts` for type definitions.
Never modify `components/ui/button.tsx` directly.
Run `cargo test` in `crates/kardo-core/`.
The `ScoringEngine` reads from `analysis/config_quality.rs`.
Use `ConfigQualityAnalyzer` for scoring.
Check `docs/UI_DECISIONS.md` before changes.
The `parseMarkdown` function handles `src/parser/mod.rs`.
Always run `npx tsc --noEmit` before committing.
"#;
        let score = ConfigQualityAnalyzer::score_specificity(concrete);
        assert!(score > 0.50, "Specific content should score high, got {}", score);
    }

    #[test]
    fn test_specificity_path_density() {
        let many_paths = "src/lib.rs, src/main.rs, docs/api.md, tests/test.rs, crates/core/mod.rs, components/ui/button.tsx, app/page.tsx, lib/types/index.ts";
        let no_paths = "This is a project. It does things. Write good code always.";
        let score_paths = ConfigQualityAnalyzer::score_specificity(many_paths);
        let score_none = ConfigQualityAnalyzer::score_specificity(no_paths);
        assert!(score_paths > score_none, "Path-rich should score higher ({} vs {})", score_paths, score_none);
    }

    #[test]
    fn test_specificity_generic_penalty() {
        let with_generic = "Write clean code. Follow best practices. Be helpful.";
        let without_generic = "Configure `rustfmt` in `crates/core/.rustfmt.toml`. Run `cargo clippy`.";
        let score_generic = ConfigQualityAnalyzer::score_specificity(with_generic);
        let score_specific = ConfigQualityAnalyzer::score_specificity(without_generic);
        assert!(score_specific > score_generic, "Non-generic should score higher ({} vs {})", score_specific, score_generic);
    }

    #[test]
    fn test_agents_resolution_replaces_short_reference() {
        let pointer = "@AGENTS.md";
        let agents = "# My Agents\n\n## Agent 1\nDoes stuff.\n\n## Agent 2\nDoes more stuff.\n";
        let resolved = resolve_agents_reference(pointer, Some(agents));
        assert!(resolved.contains("# My Agents"));
        assert!(resolved.contains("Resolved from:"));
        assert!(resolved.len() > 50);
    }

    #[test]
    fn test_agents_resolution_preserves_real_content() {
        let real = "# CLAUDE.md\n\nThis is a real CLAUDE.md with actual content that references AGENTS.md somewhere but is long enough to not be a reference.\n\n## Rules\nDo stuff.\n";
        let resolved = resolve_agents_reference(real, Some("# Agents content"));
        assert_eq!(resolved, real);
    }

    #[test]
    fn test_agents_resolution_no_agents_file() {
        let pointer = "@AGENTS.md";
        assert_eq!(resolve_agents_reference(pointer, None), pointer);
    }

    #[test]
    fn test_stale_config() {
        let inputs = [config(
            "CLAUDE.md",
            "# Project\nSome rules and instructions here that are fairly long to test length scoring properly with enough content.",
            120,
        )];
        let outcome = ConfigQualityAnalyzer::analyze(&inputs, &HashSet::new());
        // A file untouched for 120 days must land in the lowest recency band.
        let first = &outcome.details[0];
        assert!(first.recency_score < 0.5);
    }

    #[test]
    fn test_is_meaningful_content_trivial() {
        // Placeholders and empty input are never meaningful.
        for stub in ["@AGENTS.md", "# Project\nShort description.", ""] {
            assert!(!is_meaningful_content(stub));
        }
    }

    #[test]
    fn test_is_meaningful_content_real() {
        // Realistic CLAUDE.md content that satisfies every criterion.
        let realistic = r#"# Kardo Project

## Stack
- Next.js 16, React 19, TypeScript
- Tailwind CSS, shadcn/ui

## Rules

You MUST always run `npm run build` before committing.
You MUST NEVER edit `components/ui/*` files.
ALWAYS check types with `npx tsc --noEmit`.

```bash
npm run dev
npm run build
```

## Files
- `src/lib/types.ts` — type definitions
- `docs/api.md` — API documentation
"#;
        assert!(is_meaningful_content(realistic));
    }

    #[test]
    fn test_empty_claude_md_does_not_inflate_existence_score() {
        // A pointer-only CLAUDE.md (e.g. "@AGENTS.md") must not earn the existence bonus.
        let inputs = [config("CLAUDE.md", "@AGENTS.md", 0)];
        let outcome = ConfigQualityAnalyzer::analyze(&inputs, &HashSet::new());
        // The file is still reported as present...
        assert!(outcome.has_claude_md);
        // ...but the overall score stays very low.
        assert!(outcome.score < 0.2, "Trivial CLAUDE.md should not inflate score, got {}", outcome.score);
    }
}
kardo_core/analysis/config_quality.rs

kardo_core/analysis/
config_quality.rs