kardo_core/llm/
engine.rs

1//! Three-tier document classification: rules -> embeddings -> LLM.
2//!
3//! Hierarchical taxonomy: 7 categories + subcategories.
4//! Uses Longest-Prefix-Match (LPM) for path rules with word-boundary matching.
5//! For now, implements rules tier + LLM tier (embeddings is future work).
6
7use super::ollama::OllamaClient;
8use super::{GenerateRequest, LlmBackend, LlmError};
9use serde::{Deserialize, Serialize};
10
/// Returns the longest prefix of `s` that occupies at most `max_bytes` bytes
/// and ends on a UTF-8 character boundary.
///
/// A string that already fits is returned unchanged.
fn safe_truncate(s: &str, max_bytes: usize) -> &str {
    if max_bytes >= s.len() {
        return s;
    }
    // Search backwards from the byte limit for the nearest boundary. Index 0
    // is always a boundary, so the search always succeeds.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    &s[..cut]
}
23
/// Top-level categories a document may be assigned to.
const VALID_CATEGORIES: &[&str] = &[
    "product",
    "research",
    "technical",
    "design",
    "decisions",
    "guides",
    "unknown",
];

/// Returns `true` when `cat` is exactly one of [`VALID_CATEGORIES`].
///
/// The comparison is case-sensitive.
fn is_valid_category(cat: &str) -> bool {
    VALID_CATEGORIES.iter().any(|&known| known == cat)
}
33
/// Returns `true` when `subcategory` is one of the subcategories defined for
/// `category`.
///
/// Categories without subcategories (including `"unknown"` and any
/// unrecognised name) never validate.
fn is_valid_subcategory(category: &str, subcategory: &str) -> bool {
    let allowed: &[&str] = match category {
        "product" => &["prd", "roadmap", "brief"],
        "research" => &[
            "customer-interviews",
            "competitive-analysis",
            "user-research",
            "market-research",
        ],
        "technical" => &["api-docs", "architecture", "spec", "code", "config"],
        "design" => &["design-system", "ui-specs", "mockups"],
        "decisions" => &["adr", "rfc", "meetings"],
        "guides" => &["readme", "onboarding", "how-to", "code-instructions"],
        // No subcategories exist for anything else.
        _ => &[],
    };
    allowed.iter().any(|&name| name == subcategory)
}
47
48/// Document classification result.
49#[derive(Debug, Clone, Serialize, Deserialize)]
50pub struct Classification {
51    pub doc_type: String,
52    pub subcategory: Option<String>,
53    pub confidence: f64,
54    pub source: ClassificationSource,
55}
56
57/// How the classification was determined.
58#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
59pub enum ClassificationSource {
60    /// Matched a filename/path rule
61    Rule,
62    /// Classified by LLM
63    Llm,
64    /// Default fallback
65    Fallback,
66}
67
68impl std::fmt::Display for ClassificationSource {
69    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
70        match self {
71            ClassificationSource::Rule => write!(f, "rule"),
72            ClassificationSource::Llm => write!(f, "llm"),
73            ClassificationSource::Fallback => write!(f, "fallback"),
74        }
75    }
76}
77
78// ── LPM Path Rules ──
79
/// A path-prefix rule for Longest-Prefix-Match classification.
///
/// Prefixes are compared with `starts_with` against the lowercased relative
/// path, so they must be written in lowercase.
struct PathRule {
    /// Lowercase path prefix this rule applies to.
    prefix: &'static str,
    /// Top-level category assigned on a match.
    doc_type: &'static str,
    /// Subcategory assigned on a match.
    subcategory: &'static str,
    /// Confidence of the match. `ClassificationEngine::classify` consults the
    /// LLM when the rule result is 0.70 or lower.
    confidence: f64,
}

/// Path rules, scanned top to bottom; the first rule whose prefix matches wins.
///
/// INVARIANT: whenever one prefix is itself a prefix of another, the longer
/// (more specific) one MUST come first — otherwise it is shadowed by the
/// shorter rule and can never match. Unrelated groups (`.claude/`, `docs/`,
/// ...) may appear in any order relative to each other; the table is NOT
/// globally sorted by length. The `const` assertion directly below this table
/// checks the invariant for every pair of rules at compile time.
const PATH_RULES: &[PathRule] = &[
    // .claude/ subtree — most specific first
    PathRule { prefix: ".claude/research/calibration/dataset/", doc_type: "technical", subcategory: "config", confidence: 0.70 },
    PathRule { prefix: ".claude/research/competitors/", doc_type: "research", subcategory: "competitive-analysis", confidence: 0.80 },
    PathRule { prefix: ".claude/research/kardo-pmf/", doc_type: "research", subcategory: "market-research", confidence: 0.80 },
    PathRule { prefix: ".claude/research/experiments/", doc_type: "research", subcategory: "user-research", confidence: 0.75 },
    PathRule { prefix: ".claude/research/", doc_type: "research", subcategory: "market-research", confidence: 0.75 },
    PathRule { prefix: ".claude/plans/", doc_type: "product", subcategory: "roadmap", confidence: 0.75 },
    PathRule { prefix: ".claude/analysis/", doc_type: "research", subcategory: "market-research", confidence: 0.75 },
    PathRule { prefix: ".claude/agents/", doc_type: "guides", subcategory: "code-instructions", confidence: 0.80 },
    PathRule { prefix: ".claude/skills/", doc_type: "guides", subcategory: "code-instructions", confidence: 0.80 },
    PathRule { prefix: ".claude/commands/", doc_type: "guides", subcategory: "code-instructions", confidence: 0.80 },
    PathRule { prefix: ".claude/hooks/", doc_type: "technical", subcategory: "config", confidence: 0.80 },
    PathRule { prefix: ".claude/memory/", doc_type: "guides", subcategory: "how-to", confidence: 0.70 },
    PathRule { prefix: ".claude/discovery/", doc_type: "research", subcategory: "market-research", confidence: 0.75 },
    PathRule { prefix: ".claude/summaries/", doc_type: "guides", subcategory: "how-to", confidence: 0.70 },
    PathRule { prefix: ".claude/knowledge/", doc_type: "guides", subcategory: "how-to", confidence: 0.70 },
    PathRule { prefix: ".claude/components-registry/", doc_type: "design", subcategory: "design-system", confidence: 0.75 },
    PathRule { prefix: ".claude/training-data/", doc_type: "technical", subcategory: "config", confidence: 0.75 },
    // .claude/ catch-all — LOW confidence triggers LLM
    PathRule { prefix: ".claude/", doc_type: "guides", subcategory: "code-instructions", confidence: 0.50 },

    // docs/ subtree
    PathRule { prefix: "docs/plans/", doc_type: "product", subcategory: "roadmap", confidence: 0.75 },
    PathRule { prefix: "docs/reviews/", doc_type: "decisions", subcategory: "meetings", confidence: 0.70 },
    PathRule { prefix: "docs/drafts/", doc_type: "guides", subcategory: "how-to", confidence: 0.50 },
    PathRule { prefix: "docs/layers/", doc_type: "product", subcategory: "brief", confidence: 0.60 },
    // docs/ catch-all — LOW confidence triggers LLM
    PathRule { prefix: "docs/", doc_type: "guides", subcategory: "how-to", confidence: 0.50 },
    PathRule { prefix: "doc/", doc_type: "guides", subcategory: "how-to", confidence: 0.50 },

    // AI_First_Idea/ subtree
    PathRule { prefix: "ai_first_idea/research/", doc_type: "research", subcategory: "market-research", confidence: 0.75 },
    PathRule { prefix: "ai_first_idea/articles/", doc_type: "guides", subcategory: "how-to", confidence: 0.70 },
    PathRule { prefix: "ai_first_idea/", doc_type: "research", subcategory: "market-research", confidence: 0.55 },

    // Habr_plan/ subtree
    PathRule { prefix: "habr_plan/pipeline/research/", doc_type: "research", subcategory: "market-research", confidence: 0.75 },
    PathRule { prefix: "habr_plan/pipeline/prompts/", doc_type: "guides", subcategory: "code-instructions", confidence: 0.75 },
    PathRule { prefix: "habr_plan/pipeline/articles/", doc_type: "guides", subcategory: "how-to", confidence: 0.70 },
    PathRule { prefix: "habr_plan/articles/", doc_type: "guides", subcategory: "how-to", confidence: 0.70 },
    PathRule { prefix: "habr_plan/pipeline/", doc_type: "guides", subcategory: "how-to", confidence: 0.60 },
    PathRule { prefix: "habr_plan/", doc_type: "guides", subcategory: "how-to", confidence: 0.55 },

    // .github/ subtree
    PathRule { prefix: ".github/workflows/", doc_type: "technical", subcategory: "config", confidence: 0.85 },
    PathRule { prefix: ".github/", doc_type: "technical", subcategory: "config", confidence: 0.75 },

    // Generic path patterns (work for any project)
    PathRule { prefix: "research/", doc_type: "research", subcategory: "market-research", confidence: 0.70 },
    PathRule { prefix: "articles/", doc_type: "guides", subcategory: "how-to", confidence: 0.65 },
    PathRule { prefix: "plans/", doc_type: "product", subcategory: "roadmap", confidence: 0.70 },

    // kardo/ monorepo catch-all — catch internal files (build notes, specs)
    PathRule { prefix: "kardo/", doc_type: "technical", subcategory: "code", confidence: 0.50 },
];

/// Returns `true` when `candidate` begins with `prefix` (byte-wise).
///
/// `str::starts_with` cannot be called in a `const` context, hence the
/// manual loop.
const fn const_starts_with(candidate: &str, prefix: &str) -> bool {
    let candidate = candidate.as_bytes();
    let prefix = prefix.as_bytes();
    if prefix.len() > candidate.len() {
        return false;
    }
    let mut i = 0;
    while i < prefix.len() {
        if candidate[i] != prefix[i] {
            return false;
        }
        i += 1;
    }
    true
}

// Compile-time enforcement of the PATH_RULES invariant: no rule may start
// with the prefix of a rule listed before it, because the earlier rule would
// always match first and the later one would be dead. Duplicates are caught
// as well. Every pair is checked, not only adjacent entries.
const _: () = {
    let mut earlier = 0;
    while earlier < PATH_RULES.len() {
        let mut later = earlier + 1;
        while later < PATH_RULES.len() {
            if const_starts_with(PATH_RULES[later].prefix, PATH_RULES[earlier].prefix) {
                panic!("PATH_RULES: a rule is shadowed by an earlier rule with a shorter or equal prefix");
            }
            later += 1;
        }
        earlier += 1;
    }
};
149
/// Reports whether `word` occurs in `path` as a whole word rather than as
/// part of a longer token.
///
/// Matches "spec" but NOT "aspect", "inspect", "specification".
/// An occurrence counts when each side is either the start/end of the string
/// or a byte that is not an ASCII letter or digit (so `/`, `-`, `_`, `.` all
/// act as boundaries).
fn path_contains_word(path: &str, word: &str) -> bool {
    let bytes = path.as_bytes();
    // A missing neighbour (string edge) is a boundary; otherwise the byte
    // must not be ASCII alphanumeric.
    let is_boundary =
        |neighbour: Option<&u8>| neighbour.map_or(true, |b| !b.is_ascii_alphanumeric());

    path.match_indices(word).any(|(start, hit)| {
        let left = start.checked_sub(1).and_then(|pos| bytes.get(pos));
        let right = bytes.get(start + hit.len());
        is_boundary(left) && is_boundary(right)
    })
}
164
165/// Synchronous rule-based classification. No LLM, no async.
166/// Use this from CLI or anywhere you don't have a tokio runtime.
167///
168/// Pipeline: extension → filename → filename-prefix (path-aware) → LPM path → contains → fallback
169pub fn classify_by_rules(relative_path: &str) -> Classification {
170    let path_lower = relative_path.to_lowercase();
171    let filename = relative_path
172        .rsplit('/')
173        .next()
174        .unwrap_or(relative_path)
175        .to_lowercase();
176
177    // Tier 1: File extension (code/config) — highest confidence
178    if let Some(ext_result) = classify_by_extension(&filename) {
179        return ext_result;
180    }
181
182    // Tier 2: Exact filename match
183    if let Some(c) = classify_filename(&filename) {
184        return c;
185    }
186
187    // Tier 3: Filename prefix (path-aware — skips inside .claude/agents/ etc.)
188    if let Some(c) = classify_filename_prefix(&filename, &path_lower) {
189        return c;
190    }
191
192    // Tier 4: Longest-Prefix-Match path rules
193    if let Some(c) = classify_path_lpm(&path_lower) {
194        return c;
195    }
196
197    // Tier 5: Word-boundary contains patterns (low confidence)
198    if let Some(c) = classify_path_contains(&path_lower) {
199        return c;
200    }
201
202    // Fallback
203    Classification {
204        doc_type: "unknown".to_string(),
205        subcategory: None,
206        confidence: 0.10,
207        source: ClassificationSource::Fallback,
208    }
209}
210
211/// Classify by file extension (code and config files).
212fn classify_by_extension(filename: &str) -> Option<Classification> {
213    let ext = filename.rsplit('.').next()?;
214    let (doc_type, subcategory) = match ext {
215        "rs" | "py" | "ts" | "tsx" | "js" | "jsx" | "go" | "java" | "c" | "cpp" | "h" | "rb"
216        | "swift" | "kt" | "cs" | "php" | "sh" | "bash" | "zsh" => ("technical", "code"),
217        "json" | "yaml" | "yml" | "toml" | "ini" | "cfg" | "conf" => ("technical", "config"),
218        _ => return None,
219    };
220
221    Some(Classification {
222        doc_type: doc_type.to_string(),
223        subcategory: Some(subcategory.to_string()),
224        confidence: 0.95,
225        source: ClassificationSource::Rule,
226    })
227}
228
229/// Classify by exact filename match.
230fn classify_filename(filename: &str) -> Option<Classification> {
231    let (doc_type, subcategory, confidence) = match filename {
232        "readme.md" | "readme.txt" | "readme" | "readme.rst" => ("guides", "readme", 0.95),
233        "claude.md" | ".cursorrules" | ".clinerules" | ".windsurfrules" => ("guides", "code-instructions", 0.95),
234        "agents.md" => ("guides", "code-instructions", 0.95),
235        "changelog.md" | "changes.md" | "history.md" => ("decisions", "meetings", 0.90),
236        "license" | "license.md" | "license.txt" | "copying" => ("guides", "readme", 0.95),
237        "contributing.md" | "contribute.md" => ("guides", "onboarding", 0.90),
238        "code_of_conduct.md" => ("guides", "onboarding", 0.90),
239        "security.md" | "security.txt" => ("technical", "spec", 0.90),
240        "ui_decisions.md" | "ui-decisions.md" => ("design", "ui-specs", 0.90),
241        "product_strategy.md" | "product-strategy.md" => ("product", "brief", 0.85),
242        "todo.md" | "todo.txt" => ("product", "roadmap", 0.85),
243        "makefile" | "justfile" | "taskfile.yml" => ("technical", "config", 0.90),
244        "dockerfile" | "docker-compose.yml" | "docker-compose.yaml" => ("technical", "config", 0.90),
245        ".gitignore" | ".gitattributes" => ("technical", "config", 0.90),
246        ".env.example" | ".env.sample" => ("technical", "config", 0.85),
247        _ => return None,
248    };
249
250    Some(Classification {
251        doc_type: doc_type.to_string(),
252        subcategory: Some(subcategory.to_string()),
253        confidence,
254        source: ClassificationSource::Rule,
255    })
256}
257
258/// Classify by filename prefix patterns.
259/// Path-aware: skips prefix matching inside .claude/agents/, .claude/skills/, .claude/commands/
260/// because path rules are more authoritative for those directories.
261fn classify_filename_prefix(filename: &str, path_lower: &str) -> Option<Classification> {
262    // Skip prefix matching inside known agent/skill/command directories
263    if path_lower.starts_with(".claude/agents/")
264        || path_lower.starts_with(".claude/skills/")
265        || path_lower.starts_with(".claude/commands/")
266    {
267        return None;
268    }
269
270    // AI_First_Idea/ timestamp files: "YYYY-MM-DD HH.MM.SS.md" are voice transcripts
271    if path_lower.starts_with("ai_first_idea/") {
272        // filename starts with 4 digits (year) → voice transcript
273        let starts_with_year = filename
274            .chars()
275            .take(4)
276            .all(|c| c.is_ascii_digit());
277        if starts_with_year {
278            return Some(Classification {
279                doc_type: "research".to_string(),
280                subcategory: Some("customer-interviews".to_string()),
281                confidence: 0.80,
282                source: ClassificationSource::Rule,
283            });
284        }
285    }
286
287    let (doc_type, subcategory, confidence) = if filename.starts_with("prd") {
288        ("product", "prd", 0.95)
289    } else if filename.starts_with("adr-") || filename.starts_with("adr_") {
290        ("decisions", "adr", 0.95)
291    } else if filename.starts_with("rfc-") || filename.starts_with("rfc_") {
292        ("decisions", "rfc", 0.95)
293    } else if filename.starts_with("roadmap") {
294        ("product", "roadmap", 0.90)
295    } else if filename.starts_with("interview") || filename.starts_with("transcript") {
296        ("research", "customer-interviews", 0.90)
297    } else if filename.starts_with("competitive") {
298        ("research", "competitive-analysis", 0.90)
299    } else if filename.starts_with("architecture") {
300        ("technical", "architecture", 0.90)
301    } else if filename.starts_with("api-") || filename.starts_with("api_") {
302        ("technical", "api-docs", 0.90)
303    } else if filename.starts_with("meeting-notes") || filename.starts_with("meeting_notes") {
304        ("decisions", "meetings", 0.85)
305    } else if filename.starts_with("design-system") || filename.starts_with("design_system") {
306        ("design", "design-system", 0.90)
307    } else {
308        return None;
309    };
310
311    Some(Classification {
312        doc_type: doc_type.to_string(),
313        subcategory: Some(subcategory.to_string()),
314        confidence,
315        source: ClassificationSource::Rule,
316    })
317}
318
319/// Classify by Longest-Prefix-Match path rules.
320/// Rules are sorted by prefix length (longest first) — first match wins.
321/// After matching .claude/research/ or .claude/research/kardo-pmf/, refines subcategory
322/// by checking filename word patterns.
323fn classify_path_lpm(path_lower: &str) -> Option<Classification> {
324    for rule in PATH_RULES {
325        if path_lower.starts_with(rule.prefix) {
326            let mut result = Classification {
327                doc_type: rule.doc_type.to_string(),
328                subcategory: Some(rule.subcategory.to_string()),
329                confidence: rule.confidence,
330                source: ClassificationSource::Rule,
331            };
332
333            // Refine subcategory for .claude/research/ files (but not in /competitors/ — already correct)
334            if rule.prefix == ".claude/research/"
335                || rule.prefix == ".claude/research/kardo-pmf/"
336            {
337                if let Some(refined) = refine_research_subcategory(path_lower) {
338                    result.doc_type = refined.0.to_string();
339                    result.subcategory = Some(refined.1.to_string());
340                    result.confidence = refined.2;
341                }
342            }
343
344            // Refine docs/ catch-all: check filename for product document keywords
345            if rule.prefix == "docs/" || rule.prefix == "doc/" {
346                if let Some(refined) = refine_docs_subcategory(path_lower) {
347                    result.doc_type = refined.0.to_string();
348                    result.subcategory = Some(refined.1.to_string());
349                    result.confidence = refined.2;
350                }
351            }
352
353            return Some(result);
354        }
355    }
356    None
357}
358
359/// Refine subcategory for research files by checking filename words.
360/// Returns (doc_type, subcategory, confidence) override if filename contains known keywords.
361fn refine_research_subcategory(path_lower: &str) -> Option<(&'static str, &'static str, f64)> {
362    // Check for "prd" word in filename → product/prd
363    if path_contains_word(path_lower, "prd") {
364        return Some(("product", "prd", 0.80));
365    }
366    // Check for interview/custdev → customer-interviews
367    if path_contains_word(path_lower, "interview") || path_contains_word(path_lower, "custdev") {
368        return Some(("research", "customer-interviews", 0.80));
369    }
370    // Check for competitive/competitor(s) → competitive-analysis
371    if path_contains_word(path_lower, "competitive")
372        || path_contains_word(path_lower, "competitor")
373        || path_contains_word(path_lower, "competitors")
374    {
375        return Some(("research", "competitive-analysis", 0.80));
376    }
377    // Check for customer/persona/jtbd → user-research
378    if path_contains_word(path_lower, "customer")
379        || path_contains_word(path_lower, "persona")
380        || path_contains_word(path_lower, "jtbd")
381    {
382        return Some(("research", "user-research", 0.75));
383    }
384    // Check for ux → user-research
385    if path_contains_word(path_lower, "ux") {
386        return Some(("research", "user-research", 0.70));
387    }
388    // Check for architecture/blueprint → technical/architecture
389    if path_contains_word(path_lower, "architecture")
390        || path_contains_word(path_lower, "blueprint")
391    {
392        return Some(("technical", "architecture", 0.75));
393    }
394    // Check for "implementation" + "plan" together → product/roadmap
395    if path_contains_word(path_lower, "implementation")
396        && path_contains_word(path_lower, "plan")
397    {
398        return Some(("product", "roadmap", 0.75));
399    }
400    None
401}
402
403/// Refine subcategory for docs/ catch-all files by checking filename keywords.
404/// Returns (doc_type, subcategory, confidence) override if filename contains known patterns.
405fn refine_docs_subcategory(path_lower: &str) -> Option<(&'static str, &'static str, f64)> {
406    // "prd" or "requirements" word in filename → product/prd
407    if path_contains_word(path_lower, "prd") || path_contains_word(path_lower, "requirements") {
408        return Some(("product", "prd", 0.65));
409    }
410    None
411}
412
413/// Classify by word-boundary-aware substring patterns.
414/// Lower confidence — these are hints, not definitive matches.
415fn classify_path_contains(path_lower: &str) -> Option<Classification> {
416    let (doc_type, subcategory, confidence) = if path_contains_word(path_lower, "spec") || path_contains_word(path_lower, "requirement") {
417        ("product", "prd", 0.55)
418    } else if path_contains_word(path_lower, "api") && path_lower.ends_with(".md") {
419        ("technical", "api-docs", 0.55)
420    } else if path_contains_word(path_lower, "design") {
421        ("design", "design-system", 0.55)
422    } else if path_contains_word(path_lower, "architecture") {
423        ("technical", "architecture", 0.55)
424    } else if path_contains_word(path_lower, "test") {
425        ("technical", "spec", 0.50)
426    } else {
427        return None;
428    };
429
430    Some(Classification {
431        doc_type: doc_type.to_string(),
432        subcategory: Some(subcategory.to_string()),
433        confidence,
434        source: ClassificationSource::Rule,
435    })
436}
437
438/// The classification engine.
439pub struct ClassificationEngine {
440    #[allow(dead_code)]
441    backend: LlmBackend,
442    ollama: Option<OllamaClient>,
443}
444
445impl ClassificationEngine {
446    pub fn new(backend: LlmBackend) -> Self {
447        let ollama = match &backend {
448            LlmBackend::Ollama => Some(OllamaClient::new()),
449            LlmBackend::Disabled => None,
450        };
451
452        Self { backend, ollama }
453    }
454
455    /// Classify a document based on its path and content.
456    /// Uses three-tier approach: rules first, then LLM if confidence <= 0.70.
457    /// LLM "unknown" never overrides a non-unknown rule result.
458    pub async fn classify(&self, relative_path: &str, content: &str) -> Classification {
459        // Tier 1: Rule-based classification
460        let rule_result = classify_by_rules(relative_path);
461
462        // If confidence > 0.70, rules are sufficient
463        if rule_result.confidence > 0.70 {
464            return rule_result;
465        }
466
467        // Tier 2: LLM-based classification (if available and file is markdown)
468        if let Some(ollama) = &self.ollama {
469            if relative_path.to_lowercase().ends_with(".md")
470                || relative_path.to_lowercase().ends_with(".txt")
471                || relative_path.to_lowercase().ends_with(".rst")
472            {
473                if let Ok(llm_result) =
474                    self.classify_by_llm(ollama, relative_path, content).await
475                {
476                    // LLM "unknown" should NOT override a non-unknown rule result.
477                    // Rules found something (even low confidence) — keep it unless
478                    // LLM provides a better (non-unknown) answer.
479                    if llm_result.doc_type == "unknown" && rule_result.doc_type != "unknown" {
480                        return rule_result;
481                    }
482                    return llm_result;
483                }
484            }
485        }
486
487        // Return rule result as-is (even low confidence) or fallback
488        rule_result
489    }
490
491    /// LLM-based classification using structured JSON output.
492    async fn classify_by_llm(
493        &self,
494        ollama: &OllamaClient,
495        relative_path: &str,
496        content: &str,
497    ) -> Result<Classification, LlmError> {
498        // Truncate content to ~500 chars for classification
499        let truncated = safe_truncate(content, 500);
500
501        let prompt = format!(
502            r#"Classify this document. Reply with ONLY a JSON object, no other text.
503
504Categories (pick one):
505- product (subcategories: prd, roadmap, brief)
506- research (subcategories: customer-interviews, competitive-analysis, user-research, market-research)
507- technical (subcategories: api-docs, architecture, spec, code, config)
508- design (subcategories: design-system, ui-specs, mockups)
509- decisions (subcategories: adr, rfc, meetings)
510- guides (subcategories: readme, onboarding, how-to, code-instructions)
511- unknown (no subcategory)
512
513File: {path}
514Content: "{content}"
515
516JSON:"#,
517            path = relative_path,
518            content = truncated.replace('"', "'"),
519        );
520
521        let response = ollama
522            .generate(&GenerateRequest {
523                prompt,
524                max_tokens: 80,
525                temperature: 0.1,
526            })
527            .await?;
528
529        // Parse structured JSON from response
530        parse_llm_response(&response.text)
531    }
532}
533
534/// Parse LLM response into a Classification. Expects JSON with category, subcategory, confidence.
535fn parse_llm_response(text: &str) -> Result<Classification, LlmError> {
536    // Try to extract JSON object from response
537    let json_str = extract_json(text)
538        .ok_or_else(|| LlmError::Parse("No JSON found in response".to_string()))?;
539
540    // Parse the JSON
541    let parsed: serde_json::Value = serde_json::from_str(&json_str)
542        .map_err(|e| LlmError::Parse(format!("JSON parse error: {}", e)))?;
543
544    let category = parsed
545        .get("category")
546        .and_then(|v| v.as_str())
547        .unwrap_or("unknown")
548        .to_lowercase();
549
550    let subcategory = parsed
551        .get("subcategory")
552        .and_then(|v| v.as_str())
553        .map(|s| s.to_lowercase());
554
555    let raw_confidence = parsed
556        .get("confidence")
557        .and_then(|v| v.as_f64())
558        .unwrap_or(0.70);
559
560    // Validate category
561    let category = if is_valid_category(&category) {
562        category
563    } else {
564        "unknown".to_string()
565    };
566
567    // Validate subcategory
568    let subcategory = match &subcategory {
569        Some(sub) if category != "unknown" && is_valid_subcategory(&category, sub) => {
570            Some(sub.clone())
571        }
572        _ if category == "unknown" => None,
573        _ => None,
574    };
575
576    // Clamp confidence: 0.60-0.90 for LLM results
577    let confidence = if category == "unknown" {
578        0.30
579    } else {
580        raw_confidence.clamp(0.60, 0.90)
581    };
582
583    Ok(Classification {
584        doc_type: category,
585        subcategory,
586        confidence,
587        source: ClassificationSource::Llm,
588    })
589}
590
/// Extract the first JSON object `{...}` from a string that may contain
/// surrounding text.
///
/// Scanning starts at the first `{` and stops at its matching `}`. Braces
/// inside JSON string literals are ignored, and backslash escapes inside
/// strings (such as `\"`) are honoured, so a value like `"}"` does not end
/// the object early.
///
/// Returns `None` when `text` has no `{` or the object is never closed. The
/// result is not validated as JSON; that is left to the caller.
fn extract_json(text: &str) -> Option<String> {
    let start = text.find('{')?;
    let mut depth = 0usize;
    let mut in_string = false;
    let mut escaped = false;

    for (offset, ch) in text[start..].char_indices() {
        if in_string {
            // Inside a string only an unescaped quote matters.
            if escaped {
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == '"' {
                in_string = false;
            }
            continue;
        }

        match ch {
            '"' => in_string = true,
            '{' => depth += 1,
            '}' => {
                // The scan starts on '{', so depth is at least 1 here.
                depth -= 1;
                if depth == 0 {
                    let end = start + offset + ch.len_utf8();
                    return Some(text[start..end].to_string());
                }
            }
            _ => {}
        }
    }

    // Ran out of input before the opening brace was closed.
    None
}
617
618#[cfg(test)]
619mod tests {
620    use super::*;
621
622    // ── PATH_RULES invariant: must be sorted by prefix length (longest first) ──
623
624    #[test]
625    fn test_path_rules_sorted_by_prefix_length() {
626        for window in PATH_RULES.windows(2) {
627            let a = &window[0];
628            let b = &window[1];
629            // Within a shared prefix group, longer prefixes must come first.
630            // Cross-group ordering is fine (e.g., .claude/ group before docs/ group).
631            if b.prefix.starts_with(a.prefix) || a.prefix.starts_with(b.prefix) {
632                assert!(
633                    a.prefix.len() >= b.prefix.len(),
634                    "LPM invariant violated: '{}' (len {}) should come after '{}' (len {})",
635                    a.prefix, a.prefix.len(), b.prefix, b.prefix.len()
636                );
637            }
638        }
639    }
640
641    // ── Word boundary tests ──
642
643    #[test]
644    fn test_path_contains_word_basic() {
645        assert!(path_contains_word("some/spec/file.md", "spec"));
646        assert!(path_contains_word("spec/file.md", "spec"));
647        assert!(path_contains_word("dir/spec.md", "spec"));
648        assert!(path_contains_word("spec", "spec"));
649    }
650
651    #[test]
652    fn test_path_contains_word_rejects_substring() {
653        assert!(!path_contains_word("aspect-1/file.md", "spec"));
654        assert!(!path_contains_word("inspect/file.md", "spec"));
655        assert!(!path_contains_word("specification.md", "spec"));
656        assert!(!path_contains_word("retrospective.md", "spec"));
657    }
658
659    #[test]
660    fn test_path_contains_word_boundaries() {
661        assert!(path_contains_word("path/spec-v2.md", "spec"));
662        assert!(path_contains_word("path/spec_v2.md", "spec"));
663        assert!(path_contains_word("path/spec.md", "spec"));
664        assert!(path_contains_word("api/v2/docs.md", "api"));
665        assert!(!path_contains_word("rapid/file.md", "api"));
666    }
667
668    // ── Rule-based tests ──
669
670    #[test]
671    fn test_rule_readme() {
672        let result = classify_by_rules("README.md");
673        assert_eq!(result.doc_type, "guides");
674        assert_eq!(result.subcategory.as_deref(), Some("readme"));
675        assert_eq!(result.source, ClassificationSource::Rule);
676        assert!(result.confidence >= 0.95);
677    }
678
679    #[test]
680    fn test_rule_claude_md() {
681        let result = classify_by_rules("CLAUDE.md");
682        assert_eq!(result.doc_type, "guides");
683        assert_eq!(result.subcategory.as_deref(), Some("code-instructions"));
684    }
685
686    #[test]
687    fn test_rule_docs_directory() {
688        let result = classify_by_rules("docs/setup.md");
689        assert_eq!(result.doc_type, "guides");
690        assert_eq!(result.subcategory.as_deref(), Some("how-to"));
691        // Now 0.50 (catch-all) — will trigger LLM
692        assert!((result.confidence - 0.50).abs() < 0.01);
693    }
694
695    #[test]
696    fn test_rule_github_workflow() {
697        let result = classify_by_rules(".github/workflows/ci.yml");
698        assert_eq!(result.doc_type, "technical");
699        assert_eq!(result.subcategory.as_deref(), Some("config"));
700        assert!(result.confidence >= 0.85);
701    }
702
703    #[test]
704    fn test_rule_license() {
705        let result = classify_by_rules("LICENSE");
706        assert_eq!(result.doc_type, "guides");
707        assert_eq!(result.subcategory.as_deref(), Some("readme"));
708    }
709
710    #[test]
711    fn test_fallback_unknown_file() {
712        let result = classify_by_rules("random-notes.md");
713        assert_eq!(result.doc_type, "unknown");
714        assert_eq!(result.subcategory, None);
715        assert_eq!(result.source, ClassificationSource::Fallback);
716        assert!(result.confidence <= 0.10);
717    }
718
719    #[test]
720    fn test_rule_changelog() {
721        let result = classify_by_rules("CHANGELOG.md");
722        assert_eq!(result.doc_type, "decisions");
723        assert_eq!(result.subcategory.as_deref(), Some("meetings"));
724    }
725
726    #[test]
727    fn test_rule_dockerfile() {
728        let result = classify_by_rules("Dockerfile");
729        assert_eq!(result.doc_type, "technical");
730        assert_eq!(result.subcategory.as_deref(), Some("config"));
731    }
732
733    #[test]
734    fn test_rule_architecture_in_docs() {
735        // filename prefix "architecture" matches (docs path doesn't block prefix)
736        let result = classify_by_rules("docs/architecture-overview.md");
737        assert_eq!(result.doc_type, "technical");
738        assert_eq!(result.subcategory.as_deref(), Some("architecture"));
739        assert_eq!(result.source, ClassificationSource::Rule);
740    }
741
742    #[test]
743    fn test_rule_cursorrules() {
744        let result = classify_by_rules(".cursorrules");
745        assert_eq!(result.doc_type, "guides");
746        assert_eq!(result.subcategory.as_deref(), Some("code-instructions"));
747    }
748
749    // ── New taxonomy tests ──
750
751    #[test]
752    fn test_rule_prd() {
753        let result = classify_by_rules("prd-v2.md");
754        assert_eq!(result.doc_type, "product");
755        assert_eq!(result.subcategory.as_deref(), Some("prd"));
756        assert!(result.confidence >= 0.95);
757    }
758
759    #[test]
760    fn test_rule_adr() {
761        let result = classify_by_rules("adr-001-use-sqlite.md");
762        assert_eq!(result.doc_type, "decisions");
763        assert_eq!(result.subcategory.as_deref(), Some("adr"));
764    }
765
766    #[test]
767    fn test_rule_rfc() {
768        let result = classify_by_rules("rfc-classification-v2.md");
769        assert_eq!(result.doc_type, "decisions");
770        assert_eq!(result.subcategory.as_deref(), Some("rfc"));
771    }
772
773    #[test]
774    fn test_rule_interview() {
775        let result = classify_by_rules("interview-user-01.md");
776        assert_eq!(result.doc_type, "research");
777        assert_eq!(result.subcategory.as_deref(), Some("customer-interviews"));
778    }
779
780    #[test]
781    fn test_rule_rust_code() {
782        let result = classify_by_rules("src/main.rs");
783        assert_eq!(result.doc_type, "technical");
784        assert_eq!(result.subcategory.as_deref(), Some("code"));
785        assert!(result.confidence >= 0.95);
786    }
787
788    #[test]
789    fn test_rule_config_file() {
790        let result = classify_by_rules("config/settings.json");
791        assert_eq!(result.doc_type, "technical");
792        assert_eq!(result.subcategory.as_deref(), Some("config"));
793    }
794
795    #[test]
796    fn test_rule_claude_directory() {
797        // .claude/instructions — no file extension, so falls through to LPM
798        let result = classify_by_rules(".claude/instructions");
799        assert_eq!(result.doc_type, "guides");
800        assert_eq!(result.subcategory.as_deref(), Some("code-instructions"));
801    }
802
803    #[test]
804    fn test_rule_roadmap() {
805        let result = classify_by_rules("roadmap-2026.md");
806        assert_eq!(result.doc_type, "product");
807        assert_eq!(result.subcategory.as_deref(), Some("roadmap"));
808    }
809
810    #[test]
811    fn test_rule_competitive_analysis() {
812        let result = classify_by_rules("competitive-analysis.md");
813        assert_eq!(result.doc_type, "research");
814        assert_eq!(result.subcategory.as_deref(), Some("competitive-analysis"));
815    }
816
817    #[test]
818    fn test_rule_api_docs() {
819        let result = classify_by_rules("api-reference.md");
820        assert_eq!(result.doc_type, "technical");
821        assert_eq!(result.subcategory.as_deref(), Some("api-docs"));
822    }
823
824    // ── LPM-specific tests ──
825
826    #[test]
827    fn test_lpm_claude_research() {
828        let result = classify_by_rules(".claude/research/2026-01-19-kardo-market-research.md");
829        assert_eq!(result.doc_type, "research");
830        assert_eq!(result.subcategory.as_deref(), Some("market-research"));
831        assert!((result.confidence - 0.75).abs() < 0.01);
832    }
833
834    #[test]
835    fn test_lpm_claude_research_competitors() {
836        let result = classify_by_rules(".claude/research/competitors/kardo-competitors.md");
837        assert_eq!(result.doc_type, "research");
838        assert_eq!(result.subcategory.as_deref(), Some("competitive-analysis"));
839        assert!((result.confidence - 0.80).abs() < 0.01);
840    }
841
842    #[test]
843    fn test_lpm_claude_plans() {
844        let result = classify_by_rules(".claude/plans/phase-6-fine-tuning-qwen3-4b.md");
845        assert_eq!(result.doc_type, "product");
846        assert_eq!(result.subcategory.as_deref(), Some("roadmap"));
847        assert!((result.confidence - 0.75).abs() < 0.01);
848    }
849
850    #[test]
851    fn test_lpm_claude_analysis() {
852        let result = classify_by_rules(".claude/analysis/recommendations-final.md");
853        assert_eq!(result.doc_type, "research");
854        assert_eq!(result.subcategory.as_deref(), Some("market-research"));
855    }
856
857    #[test]
858    fn test_lpm_claude_agents_no_prefix_collision() {
859        // api-validator.md inside .claude/agents/ should NOT match "api-" prefix
860        let result = classify_by_rules(".claude/agents/api-validator.md");
861        assert_eq!(result.doc_type, "guides");
862        assert_eq!(result.subcategory.as_deref(), Some("code-instructions"));
863        assert!((result.confidence - 0.80).abs() < 0.01);
864    }
865
866    #[test]
867    fn test_lpm_claude_agents_competitive_no_prefix_collision() {
868        // competitive-research.md inside .claude/agents/ should NOT match "competitive" prefix
869        let result = classify_by_rules(".claude/agents/competitive-research.md");
870        assert_eq!(result.doc_type, "guides");
871        assert_eq!(result.subcategory.as_deref(), Some("code-instructions"));
872    }
873
874    #[test]
875    fn test_lpm_claude_catch_all_low_confidence() {
876        // Generic .claude/ file — should get LOW confidence (0.50)
877        let result = classify_by_rules(".claude/OPEN_QUESTIONS.md");
878        assert_eq!(result.doc_type, "guides");
879        assert_eq!(result.subcategory.as_deref(), Some("code-instructions"));
880        assert!((result.confidence - 0.50).abs() < 0.01);
881    }
882
883    #[test]
884    fn test_lpm_docs_plans() {
885        let result = classify_by_rules("docs/plans/IDEAS.md");
886        assert_eq!(result.doc_type, "product");
887        assert_eq!(result.subcategory.as_deref(), Some("roadmap"));
888        assert!((result.confidence - 0.75).abs() < 0.01);
889    }
890
891    #[test]
892    fn test_lpm_docs_catch_all_low_confidence() {
893        // docs/MVP_PRD.md now correctly classified as product/prd via "prd" word check
894        let result = classify_by_rules("docs/MVP_PRD.md");
895        assert_eq!(result.doc_type, "product");
896        assert_eq!(result.subcategory.as_deref(), Some("prd"));
897    }
898
899    #[test]
900    fn test_lpm_ai_first_idea_research() {
901        let result = classify_by_rules("AI_First_Idea/research/FINAL-RESEARCH-REPORT.md");
902        assert_eq!(result.doc_type, "research");
903        assert_eq!(result.subcategory.as_deref(), Some("market-research"));
904        assert!((result.confidence - 0.75).abs() < 0.01);
905    }
906
907    #[test]
908    fn test_lpm_ai_first_idea_articles() {
909        let result = classify_by_rules("AI_First_Idea/articles/ai-first-manifesto/article-en.md");
910        assert_eq!(result.doc_type, "guides");
911        assert_eq!(result.subcategory.as_deref(), Some("how-to"));
912        assert!((result.confidence - 0.70).abs() < 0.01);
913    }
914
915    #[test]
916    fn test_lpm_ai_first_idea_root() {
917        let result = classify_by_rules("AI_First_Idea/VISION_DOCUMENT.md");
918        assert_eq!(result.doc_type, "research");
919        assert_eq!(result.subcategory.as_deref(), Some("market-research"));
920        assert!((result.confidence - 0.55).abs() < 0.01);
921    }
922
923    #[test]
924    fn test_lpm_habr_plan_pipeline_research() {
925        let result = classify_by_rules("Habr_plan/pipeline/research/habr-content-strategy.md");
926        assert_eq!(result.doc_type, "research");
927        assert_eq!(result.subcategory.as_deref(), Some("market-research"));
928    }
929
930    #[test]
931    fn test_lpm_habr_plan_articles() {
932        let result = classify_by_rules("Habr_plan/articles/ai-killing-b2b-saas/habr-article.md");
933        assert_eq!(result.doc_type, "guides");
934        assert_eq!(result.subcategory.as_deref(), Some("how-to"));
935    }
936
937    #[test]
938    fn test_lpm_habr_plan_root() {
939        let result = classify_by_rules("Habr_plan/habr-editor.skill");
940        // No extension match, no filename match, no prefix match
941        // LPM: "habr_plan/" matches with 0.55
942        assert_eq!(result.doc_type, "guides");
943        assert_eq!(result.subcategory.as_deref(), Some("how-to"));
944    }
945
946    #[test]
947    fn test_spec_word_boundary_no_false_positive() {
948        // "aspect" should NOT match "spec" word-boundary check
949        let result = classify_by_rules("AI_First_Idea/research/aspect-1-saas-decline/raw-findings.md");
950        // Should match ai_first_idea/research/ path rule, NOT product/prd via "spec"
951        assert_eq!(result.doc_type, "research");
952        assert_eq!(result.subcategory.as_deref(), Some("market-research"));
953    }
954
955    #[test]
956    fn test_spec_word_boundary_true_positive() {
957        let result = classify_by_rules("project/spec/requirements.md");
958        assert_eq!(result.doc_type, "product");
959        assert_eq!(result.subcategory.as_deref(), Some("prd"));
960    }
961
962    #[test]
963    fn test_lpm_claude_discovery() {
964        let result = classify_by_rules(".claude/discovery/2026-02-03-kardo-core-problem.md");
965        assert_eq!(result.doc_type, "research");
966        assert_eq!(result.subcategory.as_deref(), Some("market-research"));
967    }
968
969    #[test]
970    fn test_lpm_claude_components_registry() {
971        let result = classify_by_rules(".claude/components-registry/docs/shared/animated-tabs.md");
972        assert_eq!(result.doc_type, "design");
973        assert_eq!(result.subcategory.as_deref(), Some("design-system"));
974    }
975
976    #[test]
977    fn test_lpm_claude_memory() {
978        let result = classify_by_rules(".claude/memory/context.md");
979        assert_eq!(result.doc_type, "guides");
980        assert_eq!(result.subcategory.as_deref(), Some("how-to"));
981    }
982
983    #[test]
984    fn test_lpm_github_without_workflow() {
985        let result = classify_by_rules(".github/CODEOWNERS");
986        assert_eq!(result.doc_type, "technical");
987        assert_eq!(result.subcategory.as_deref(), Some("config"));
988    }
989
990    // ── New AI format classification tests ──
991
992    #[test]
993    fn test_rule_windsurfrules() {
994        let result = classify_by_rules(".windsurfrules");
995        assert_eq!(result.doc_type, "guides");
996        assert_eq!(result.subcategory.as_deref(), Some("code-instructions"));
997        assert!(result.confidence >= 0.95);
998    }
999
1000    #[test]
1001    fn test_rule_agents_md() {
1002        let result = classify_by_rules("AGENTS.md");
1003        assert_eq!(result.doc_type, "guides");
1004        assert_eq!(result.subcategory.as_deref(), Some("code-instructions"));
1005        assert!(result.confidence >= 0.95);
1006    }
1007
1008    // ── LLM response parsing tests ──
1009
1010    #[test]
1011    fn test_parse_llm_valid_json() {
1012        let response = r#"{"category": "product", "subcategory": "prd", "confidence": 0.85}"#;
1013        let result = parse_llm_response(response).unwrap();
1014        assert_eq!(result.doc_type, "product");
1015        assert_eq!(result.subcategory.as_deref(), Some("prd"));
1016        assert!((result.confidence - 0.85).abs() < 0.01);
1017        assert_eq!(result.source, ClassificationSource::Llm);
1018    }
1019
1020    #[test]
1021    fn test_parse_llm_json_with_surrounding_text() {
1022        let response = r#"Here is the classification:
1023{"category": "research", "subcategory": "customer-interviews", "confidence": 0.90}
1024That's my answer."#;
1025        let result = parse_llm_response(response).unwrap();
1026        assert_eq!(result.doc_type, "research");
1027        assert_eq!(
1028            result.subcategory.as_deref(),
1029            Some("customer-interviews")
1030        );
1031    }
1032
1033    #[test]
1034    fn test_parse_llm_unknown_with_low_confidence() {
1035        let response = r#"{"category": "unknown", "confidence": 0.50}"#;
1036        let result = parse_llm_response(response).unwrap();
1037        assert_eq!(result.doc_type, "unknown");
1038        assert_eq!(result.subcategory, None);
1039        assert!((result.confidence - 0.30).abs() < 0.01);
1040    }
1041
1042    #[test]
1043    fn test_parse_llm_confidence_clamping() {
1044        let response = r#"{"category": "product", "subcategory": "prd", "confidence": 0.99}"#;
1045        let result = parse_llm_response(response).unwrap();
1046        assert!((result.confidence - 0.90).abs() < 0.01); // clamped to 0.90
1047
1048        let response2 = r#"{"category": "product", "subcategory": "prd", "confidence": 0.20}"#;
1049        let result2 = parse_llm_response(response2).unwrap();
1050        assert!((result2.confidence - 0.60).abs() < 0.01); // clamped to 0.60
1051    }
1052
1053    #[test]
1054    fn test_parse_llm_invalid_category() {
1055        let response = r#"{"category": "banana", "confidence": 0.80}"#;
1056        let result = parse_llm_response(response).unwrap();
1057        assert_eq!(result.doc_type, "unknown");
1058        assert!((result.confidence - 0.30).abs() < 0.01);
1059    }
1060
1061    #[test]
1062    fn test_parse_llm_invalid_subcategory() {
1063        let response = r#"{"category": "product", "subcategory": "nonexistent", "confidence": 0.80}"#;
1064        let result = parse_llm_response(response).unwrap();
1065        assert_eq!(result.doc_type, "product");
1066        assert_eq!(result.subcategory, None); // invalid subcategory stripped
1067    }
1068
1069    #[test]
1070    fn test_parse_llm_no_json() {
1071        let response = "This is just plain text with no JSON";
1072        let result = parse_llm_response(response);
1073        assert!(result.is_err());
1074    }
1075
1076    #[test]
1077    fn test_extract_json() {
1078        assert_eq!(
1079            extract_json(r#"blah {"a": 1} blah"#),
1080            Some(r#"{"a": 1}"#.to_string())
1081        );
1082        assert_eq!(extract_json("no json here"), None);
1083        assert_eq!(
1084            extract_json(r#"{"nested": {"b": 2}}"#),
1085            Some(r#"{"nested": {"b": 2}}"#.to_string())
1086        );
1087    }
1088
1089    // ── Engine async tests ──
1090
1091    #[test]
1092    fn test_engine_high_confidence_skips_llm() {
1093        let engine = ClassificationEngine::new(LlmBackend::Disabled);
1094        let rt = tokio::runtime::Runtime::new().unwrap();
1095        let result = rt.block_on(engine.classify("README.md", "# My Project"));
1096        assert_eq!(result.doc_type, "guides");
1097        assert_eq!(result.subcategory.as_deref(), Some("readme"));
1098        assert_eq!(result.source, ClassificationSource::Rule);
1099    }
1100
1101    #[test]
1102    fn test_engine_low_confidence_returns_rule_when_no_llm() {
1103        let engine = ClassificationEngine::new(LlmBackend::Disabled);
1104        let rt = tokio::runtime::Runtime::new().unwrap();
1105        let result = rt.block_on(engine.classify("random-notes.md", "Some random content"));
1106        assert_eq!(result.doc_type, "unknown");
1107        assert_eq!(result.source, ClassificationSource::Fallback);
1108    }
1109
1110    // ── Group 1: docs/ catch-all fixes ──
1111
1112    #[test]
1113    fn test_docs_mvp_prd_classified_as_product_prd() {
1114        // docs/MVP_PRD.md — "prd" word in filename → product/prd
1115        let result = classify_by_rules("docs/MVP_PRD.md");
1116        assert_eq!(result.doc_type, "product");
1117        assert_eq!(result.subcategory.as_deref(), Some("prd"));
1118    }
1119
1120    #[test]
1121    fn test_docs_requirements_system_classified_as_product_prd() {
1122        // docs/REQUIREMENTS_SYSTEM.md — "requirements" word in filename → product/prd
1123        let result = classify_by_rules("docs/REQUIREMENTS_SYSTEM.md");
1124        assert_eq!(result.doc_type, "product");
1125        assert_eq!(result.subcategory.as_deref(), Some("prd"));
1126    }
1127
1128    #[test]
1129    fn test_docs_ui_decisions_classified_as_design_ui_specs() {
1130        // docs/UI_DECISIONS.md — exact filename match → design/ui-specs
1131        let result = classify_by_rules("docs/UI_DECISIONS.md");
1132        assert_eq!(result.doc_type, "design");
1133        assert_eq!(result.subcategory.as_deref(), Some("ui-specs"));
1134    }
1135
1136    #[test]
1137    fn test_docs_product_strategy_classified_as_product_brief() {
1138        // docs/PRODUCT_STRATEGY.md — exact filename match → product/brief
1139        let result = classify_by_rules("docs/PRODUCT_STRATEGY.md");
1140        assert_eq!(result.doc_type, "product");
1141        assert_eq!(result.subcategory.as_deref(), Some("brief"));
1142    }
1143
1144    // ── Group 2: .claude/research/ date-prefixed files refinement ──
1145
1146    #[test]
1147    fn test_research_custdev_interview_classified_correctly() {
1148        // Date-prefixed custdev interview → research/customer-interviews
1149        let result = classify_by_rules(".claude/research/2026-01-19-custdev-interview-1.md");
1150        assert_eq!(result.doc_type, "research");
1151        assert_eq!(result.subcategory.as_deref(), Some("customer-interviews"));
1152    }
1153
1154    #[test]
1155    fn test_research_competitive_analysis_date_prefix() {
1156        // Date-prefixed competitive analysis → research/competitive-analysis
1157        let result = classify_by_rules(
1158            ".claude/research/2026-01-28-competitive-analysis-docs-visualization.md",
1159        );
1160        assert_eq!(result.doc_type, "research");
1161        assert_eq!(result.subcategory.as_deref(), Some("competitive-analysis"));
1162    }
1163
1164    #[test]
1165    fn test_research_competitive_positioning_date_prefix() {
1166        let result = classify_by_rules(
1167            ".claude/research/2026-01-28-competitive-positioning-claude-code-companion.md",
1168        );
1169        assert_eq!(result.doc_type, "research");
1170        assert_eq!(result.subcategory.as_deref(), Some("competitive-analysis"));
1171    }
1172
1173    #[test]
1174    fn test_research_cycle_competitive_moat() {
1175        let result = classify_by_rules(".claude/research/cycle8-competitive-moat.md");
1176        assert_eq!(result.doc_type, "research");
1177        assert_eq!(result.subcategory.as_deref(), Some("competitive-analysis"));
1178    }
1179
1180    #[test]
1181    fn test_research_kardo_pmf_prd_file() {
1182        // kardo-pmf/ PRD file → product/prd via prd word refinement
1183        let result = classify_by_rules(
1184            ".claude/research/kardo-pmf/Feature_Requirements_PRD.md",
1185        );
1186        assert_eq!(result.doc_type, "product");
1187        assert_eq!(result.subcategory.as_deref(), Some("prd"));
1188    }
1189
1190    #[test]
1191    fn test_research_competitors_dir_still_correct() {
1192        // .claude/research/competitors/ still classified correctly (not refined)
1193        let result = classify_by_rules(
1194            ".claude/research/competitors/kardo-competitors.md",
1195        );
1196        assert_eq!(result.doc_type, "research");
1197        assert_eq!(result.subcategory.as_deref(), Some("competitive-analysis"));
1198        assert!((result.confidence - 0.80).abs() < 0.01);
1199    }
1200
1201    // ── Group 3: AI_First_Idea/ timestamp transcripts ──
1202
1203    #[test]
1204    fn test_ai_first_idea_timestamp_file_is_customer_interview() {
1205        // Voice transcript with timestamp filename → research/customer-interviews
1206        let result = classify_by_rules("AI_First_Idea/2026-02-14 22.19.41.md");
1207        assert_eq!(result.doc_type, "research");
1208        assert_eq!(result.subcategory.as_deref(), Some("customer-interviews"));
1209        assert!((result.confidence - 0.80).abs() < 0.01);
1210    }
1211
1212    #[test]
1213    fn test_ai_first_idea_non_timestamp_not_affected() {
1214        // Non-timestamp files in AI_First_Idea/ should NOT be affected
1215        let result = classify_by_rules("AI_First_Idea/VISION_DOCUMENT.md");
1216        assert_eq!(result.doc_type, "research");
1217        assert_eq!(result.subcategory.as_deref(), Some("market-research"));
1218    }
1219
1220    // ── Group 4: calibration/dataset/ path rule ──
1221
1222    #[test]
1223    fn test_calibration_dataset_classified_as_technical_config() {
1224        let result = classify_by_rules(
1225            ".claude/research/calibration/dataset/Alive24--CKBoost/claude-md.md",
1226        );
1227        assert_eq!(result.doc_type, "technical");
1228        assert_eq!(result.subcategory.as_deref(), Some("config"));
1229        assert!((result.confidence - 0.70).abs() < 0.01);
1230    }
1231
1232    // ── Group 6: kardo/ catch-all ──
1233
1234    #[test]
1235    fn test_kardo_build_notes_classified_as_technical() {
1236        // kardo/ catch-all — technical/code (0.50) triggers LLM for spec classification
1237        let result = classify_by_rules(
1238            "kardo/crates/kardo-desktop/build-notes.md",
1239        );
1240        assert_eq!(result.doc_type, "technical");
1241        assert_eq!(result.subcategory.as_deref(), Some("code"));
1242        assert!((result.confidence - 0.50).abs() < 0.01);
1243    }
1244
1245    // ── Iteration 2: subcategory refinement fixes ──
1246
1247    #[test]
1248    fn test_research_competitors_word_refined_to_competitive_analysis() {
1249        // "competitors" (not "competitive") should also trigger competitive-analysis
1250        let result = classify_by_rules(".claude/research/2026-01-22-kardo-competitors.md");
1251        assert_eq!(result.doc_type, "research");
1252        assert_eq!(result.subcategory.as_deref(), Some("competitive-analysis"));
1253    }
1254
1255    #[test]
1256    fn test_research_customer_voice_refined_to_user_research() {
1257        // "customer" in filename → user-research
1258        let result = classify_by_rules(".claude/research/cycle1-customer-voice.md");
1259        assert_eq!(result.doc_type, "research");
1260        assert_eq!(result.subcategory.as_deref(), Some("user-research"));
1261    }
1262
1263    #[test]
1264    fn test_research_ux_patterns_refined_to_user_research() {
1265        // "ux" in filename → user-research
1266        let result = classify_by_rules(
1267            ".claude/research/2026-02-03-R5-health-dashboard-ux-patterns.md",
1268        );
1269        assert_eq!(result.doc_type, "research");
1270        assert_eq!(result.subcategory.as_deref(), Some("user-research"));
1271    }
1272
1273    #[test]
1274    fn test_research_persona_refined_to_user_research() {
1275        // "persona" in filename → user-research
1276        let result = classify_by_rules(".claude/research/persona-analysis.md");
1277        assert_eq!(result.doc_type, "research");
1278        assert_eq!(result.subcategory.as_deref(), Some("user-research"));
1279    }
1280
1281    #[test]
1282    fn test_research_market_still_default() {
1283        // Files without special keywords remain market-research
1284        let result = classify_by_rules(
1285            ".claude/research/2026-01-19-kardo-market-research.md",
1286        );
1287        assert_eq!(result.doc_type, "research");
1288        assert_eq!(result.subcategory.as_deref(), Some("market-research"));
1289    }
1290
1291    // ── Iteration 3: architecture and implementation plan refinement ──
1292
1293    #[test]
1294    fn test_research_architecture_blueprint_refined_to_technical() {
1295        let result = classify_by_rules(
1296            ".claude/research/2026-02-11-architecture-blueprint.md",
1297        );
1298        assert_eq!(result.doc_type, "technical");
1299        assert_eq!(result.subcategory.as_deref(), Some("architecture"));
1300    }
1301
1302    #[test]
1303    fn test_research_classification_architecture_refined_to_technical() {
1304        let result = classify_by_rules(
1305            ".claude/research/2026-01-28-context-aware-classification-architecture.md",
1306        );
1307        assert_eq!(result.doc_type, "technical");
1308        assert_eq!(result.subcategory.as_deref(), Some("architecture"));
1309    }
1310
1311    #[test]
1312    fn test_research_implementation_plan_refined_to_product_roadmap() {
1313        let result = classify_by_rules(
1314            ".claude/research/2026-02-11-phase1-implementation-plan.md",
1315        );
1316        assert_eq!(result.doc_type, "product");
1317        assert_eq!(result.subcategory.as_deref(), Some("roadmap"));
1318    }
1319
1320    #[test]
1321    fn test_research_implementation_without_plan_stays_research() {
1322        // "implementation" alone (without "plan") should NOT become roadmap
1323        let result = classify_by_rules(
1324            ".claude/research/2026-01-22-virtual-structure-implementation.md",
1325        );
1326        assert_eq!(result.doc_type, "research");
1327        assert_eq!(result.subcategory.as_deref(), Some("market-research"));
1328    }
1329}
kardo_core/llm/engine.rs

kardo_core/llm/
engine.rs