1use super::ollama::OllamaClient;
8use super::{GenerateRequest, LlmBackend, LlmError};
9use serde::{Deserialize, Serialize};
10
/// Returns the longest prefix of `s` that is at most `max_bytes` bytes long
/// and ends on a UTF-8 character boundary.
///
/// The input is returned unchanged when it already fits. Otherwise the cut
/// point is moved backwards from `max_bytes` until it no longer falls inside
/// a multi-byte character, so the slice below can never panic.
fn safe_truncate(s: &str, max_bytes: usize) -> &str {
    if max_bytes >= s.len() {
        return s;
    }

    // Offset 0 is always a character boundary, so the search always succeeds.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);

    &s[..cut]
}
23
/// Top-level category names accepted by `is_valid_category`.
///
/// All entries are lowercase; `"unknown"` is itself a member of the list.
const VALID_CATEGORIES: &[&str] = &[
    "product",
    "research",
    "technical",
    "design",
    "decisions",
    "guides",
    "unknown",
];
28
29fn is_valid_category(cat: &str) -> bool {
31 VALID_CATEGORIES.contains(&cat)
32}
33
/// Reports whether `subcategory` belongs to the fixed subcategory list of
/// `category`.
///
/// Any category without a list here (including `"unknown"`) has no valid
/// subcategories, so the result is `false` for it. Both arguments are compared
/// exactly, without case folding.
fn is_valid_subcategory(category: &str, subcategory: &str) -> bool {
    let allowed: &[&str] = match category {
        "product" => &["prd", "roadmap", "brief"],
        "research" => &[
            "customer-interviews",
            "competitive-analysis",
            "user-research",
            "market-research",
        ],
        "technical" => &["api-docs", "architecture", "spec", "code", "config"],
        "design" => &["design-system", "ui-specs", "mockups"],
        "decisions" => &["adr", "rfc", "meetings"],
        "guides" => &["readme", "onboarding", "how-to", "code-instructions"],
        _ => &[],
    };

    allowed.iter().any(|known| *known == subcategory)
}
47
/// Outcome of classifying a single document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Classification {
    /// Top-level category name, e.g. `"guides"`, or `"unknown"` when nothing matched.
    pub doc_type: String,
    /// Subcategory within `doc_type`; `None` when no subcategory was assigned.
    pub subcategory: Option<String>,
    /// Confidence score assigned by whichever classifier produced this result.
    pub confidence: f64,
    /// Which classifier produced this result.
    pub source: ClassificationSource,
}
56
/// Identifies which classifier produced a `Classification`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum ClassificationSource {
    /// Produced by one of the path/filename rules.
    Rule,
    /// Produced by parsing an LLM response.
    Llm,
    /// No rule matched; the default "unknown" result was used.
    Fallback,
}
67
68impl std::fmt::Display for ClassificationSource {
69 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
70 match self {
71 ClassificationSource::Rule => write!(f, "rule"),
72 ClassificationSource::Llm => write!(f, "llm"),
73 ClassificationSource::Fallback => write!(f, "fallback"),
74 }
75 }
76}
77
/// One entry of the `PATH_RULES` table: a path prefix and the classification
/// assigned to paths that start with it.
struct PathRule {
    /// Prefix compared with `starts_with` against the lowercased relative path.
    prefix: &'static str,
    /// Category assigned when the prefix matches.
    doc_type: &'static str,
    /// Subcategory assigned when the prefix matches.
    subcategory: &'static str,
    /// Confidence reported for a match on this prefix.
    confidence: f64,
}
87
/// Directory-prefix rules consulted by `classify_path_lpm`.
///
/// ORDER MATTERS: `classify_path_lpm` returns the first rule whose prefix
/// matches, so a more specific prefix must appear before any shorter prefix
/// that also matches the same paths (e.g. `.claude/research/` before
/// `.claude/`). Prefixes are compared against the lowercased path, so every
/// prefix here is written in lowercase.
const PATH_RULES: &[PathRule] = &[
    // `.claude/` tree, specific sub-directories first, catch-all last.
    PathRule { prefix: ".claude/research/calibration/dataset/", doc_type: "technical", subcategory: "config", confidence: 0.70 },
    PathRule { prefix: ".claude/research/competitors/", doc_type: "research", subcategory: "competitive-analysis", confidence: 0.80 },
    PathRule { prefix: ".claude/research/kardo-pmf/", doc_type: "research", subcategory: "market-research", confidence: 0.80 },
    PathRule { prefix: ".claude/research/experiments/", doc_type: "research", subcategory: "user-research", confidence: 0.75 },
    PathRule { prefix: ".claude/research/", doc_type: "research", subcategory: "market-research", confidence: 0.75 },
    PathRule { prefix: ".claude/plans/", doc_type: "product", subcategory: "roadmap", confidence: 0.75 },
    PathRule { prefix: ".claude/analysis/", doc_type: "research", subcategory: "market-research", confidence: 0.75 },
    PathRule { prefix: ".claude/agents/", doc_type: "guides", subcategory: "code-instructions", confidence: 0.80 },
    PathRule { prefix: ".claude/skills/", doc_type: "guides", subcategory: "code-instructions", confidence: 0.80 },
    PathRule { prefix: ".claude/commands/", doc_type: "guides", subcategory: "code-instructions", confidence: 0.80 },
    PathRule { prefix: ".claude/hooks/", doc_type: "technical", subcategory: "config", confidence: 0.80 },
    PathRule { prefix: ".claude/memory/", doc_type: "guides", subcategory: "how-to", confidence: 0.70 },
    PathRule { prefix: ".claude/discovery/", doc_type: "research", subcategory: "market-research", confidence: 0.75 },
    PathRule { prefix: ".claude/summaries/", doc_type: "guides", subcategory: "how-to", confidence: 0.70 },
    PathRule { prefix: ".claude/knowledge/", doc_type: "guides", subcategory: "how-to", confidence: 0.70 },
    PathRule { prefix: ".claude/components-registry/", doc_type: "design", subcategory: "design-system", confidence: 0.75 },
    PathRule { prefix: ".claude/training-data/", doc_type: "technical", subcategory: "config", confidence: 0.75 },
    PathRule { prefix: ".claude/", doc_type: "guides", subcategory: "code-instructions", confidence: 0.50 },

    // `docs/` and `doc/` trees.
    PathRule { prefix: "docs/plans/", doc_type: "product", subcategory: "roadmap", confidence: 0.75 },
    PathRule { prefix: "docs/reviews/", doc_type: "decisions", subcategory: "meetings", confidence: 0.70 },
    PathRule { prefix: "docs/drafts/", doc_type: "guides", subcategory: "how-to", confidence: 0.50 },
    PathRule { prefix: "docs/layers/", doc_type: "product", subcategory: "brief", confidence: 0.60 },
    PathRule { prefix: "docs/", doc_type: "guides", subcategory: "how-to", confidence: 0.50 },
    PathRule { prefix: "doc/", doc_type: "guides", subcategory: "how-to", confidence: 0.50 },

    // `ai_first_idea/` tree.
    PathRule { prefix: "ai_first_idea/research/", doc_type: "research", subcategory: "market-research", confidence: 0.75 },
    PathRule { prefix: "ai_first_idea/articles/", doc_type: "guides", subcategory: "how-to", confidence: 0.70 },
    PathRule { prefix: "ai_first_idea/", doc_type: "research", subcategory: "market-research", confidence: 0.55 },

    // `habr_plan/` tree.
    PathRule { prefix: "habr_plan/pipeline/research/", doc_type: "research", subcategory: "market-research", confidence: 0.75 },
    PathRule { prefix: "habr_plan/pipeline/prompts/", doc_type: "guides", subcategory: "code-instructions", confidence: 0.75 },
    PathRule { prefix: "habr_plan/pipeline/articles/", doc_type: "guides", subcategory: "how-to", confidence: 0.70 },
    PathRule { prefix: "habr_plan/articles/", doc_type: "guides", subcategory: "how-to", confidence: 0.70 },
    PathRule { prefix: "habr_plan/pipeline/", doc_type: "guides", subcategory: "how-to", confidence: 0.60 },
    PathRule { prefix: "habr_plan/", doc_type: "guides", subcategory: "how-to", confidence: 0.55 },

    // `.github/` tree.
    PathRule { prefix: ".github/workflows/", doc_type: "technical", subcategory: "config", confidence: 0.85 },
    PathRule { prefix: ".github/", doc_type: "technical", subcategory: "config", confidence: 0.75 },

    // Generic top-level directories.
    PathRule { prefix: "research/", doc_type: "research", subcategory: "market-research", confidence: 0.70 },
    PathRule { prefix: "articles/", doc_type: "guides", subcategory: "how-to", confidence: 0.65 },
    PathRule { prefix: "plans/", doc_type: "product", subcategory: "roadmap", confidence: 0.70 },

    // `kardo/` tree.
    PathRule { prefix: "kardo/", doc_type: "technical", subcategory: "code", confidence: 0.50 },
];
149
/// Reports whether `word` occurs in `path` as a standalone token.
///
/// An occurrence counts only when the byte immediately before it and the byte
/// immediately after it are not ASCII alphanumerics (the start and end of the
/// path also count as boundaries). So `spec` matches `dir/spec-v2.md` but not
/// `inspect/` or `specification.md`.
fn path_contains_word(path: &str, word: &str) -> bool {
    let bytes = path.as_bytes();
    // True when `pos` is inside the path and holds an ASCII letter or digit.
    let is_word_byte =
        |pos: usize| bytes.get(pos).map_or(false, |byte| byte.is_ascii_alphanumeric());

    path.match_indices(word).any(|(start, _)| {
        let left_clear = start == 0 || !is_word_byte(start - 1);
        let right_clear = !is_word_byte(start + word.len());
        left_clear && right_clear
    })
}
164
165pub fn classify_by_rules(relative_path: &str) -> Classification {
170 let path_lower = relative_path.to_lowercase();
171 let filename = relative_path
172 .rsplit('/')
173 .next()
174 .unwrap_or(relative_path)
175 .to_lowercase();
176
177 if let Some(ext_result) = classify_by_extension(&filename) {
179 return ext_result;
180 }
181
182 if let Some(c) = classify_filename(&filename) {
184 return c;
185 }
186
187 if let Some(c) = classify_filename_prefix(&filename, &path_lower) {
189 return c;
190 }
191
192 if let Some(c) = classify_path_lpm(&path_lower) {
194 return c;
195 }
196
197 if let Some(c) = classify_path_contains(&path_lower) {
199 return c;
200 }
201
202 Classification {
204 doc_type: "unknown".to_string(),
205 subcategory: None,
206 confidence: 0.10,
207 source: ClassificationSource::Fallback,
208 }
209}
210
211fn classify_by_extension(filename: &str) -> Option<Classification> {
213 let ext = filename.rsplit('.').next()?;
214 let (doc_type, subcategory) = match ext {
215 "rs" | "py" | "ts" | "tsx" | "js" | "jsx" | "go" | "java" | "c" | "cpp" | "h" | "rb"
216 | "swift" | "kt" | "cs" | "php" | "sh" | "bash" | "zsh" => ("technical", "code"),
217 "json" | "yaml" | "yml" | "toml" | "ini" | "cfg" | "conf" => ("technical", "config"),
218 _ => return None,
219 };
220
221 Some(Classification {
222 doc_type: doc_type.to_string(),
223 subcategory: Some(subcategory.to_string()),
224 confidence: 0.95,
225 source: ClassificationSource::Rule,
226 })
227}
228
229fn classify_filename(filename: &str) -> Option<Classification> {
231 let (doc_type, subcategory, confidence) = match filename {
232 "readme.md" | "readme.txt" | "readme" | "readme.rst" => ("guides", "readme", 0.95),
233 "claude.md" | ".cursorrules" | ".clinerules" | ".windsurfrules" => ("guides", "code-instructions", 0.95),
234 "agents.md" => ("guides", "code-instructions", 0.95),
235 "changelog.md" | "changes.md" | "history.md" => ("decisions", "meetings", 0.90),
236 "license" | "license.md" | "license.txt" | "copying" => ("guides", "readme", 0.95),
237 "contributing.md" | "contribute.md" => ("guides", "onboarding", 0.90),
238 "code_of_conduct.md" => ("guides", "onboarding", 0.90),
239 "security.md" | "security.txt" => ("technical", "spec", 0.90),
240 "ui_decisions.md" | "ui-decisions.md" => ("design", "ui-specs", 0.90),
241 "product_strategy.md" | "product-strategy.md" => ("product", "brief", 0.85),
242 "todo.md" | "todo.txt" => ("product", "roadmap", 0.85),
243 "makefile" | "justfile" | "taskfile.yml" => ("technical", "config", 0.90),
244 "dockerfile" | "docker-compose.yml" | "docker-compose.yaml" => ("technical", "config", 0.90),
245 ".gitignore" | ".gitattributes" => ("technical", "config", 0.90),
246 ".env.example" | ".env.sample" => ("technical", "config", 0.85),
247 _ => return None,
248 };
249
250 Some(Classification {
251 doc_type: doc_type.to_string(),
252 subcategory: Some(subcategory.to_string()),
253 confidence,
254 source: ClassificationSource::Rule,
255 })
256}
257
258fn classify_filename_prefix(filename: &str, path_lower: &str) -> Option<Classification> {
262 if path_lower.starts_with(".claude/agents/")
264 || path_lower.starts_with(".claude/skills/")
265 || path_lower.starts_with(".claude/commands/")
266 {
267 return None;
268 }
269
270 if path_lower.starts_with("ai_first_idea/") {
272 let starts_with_year = filename
274 .chars()
275 .take(4)
276 .all(|c| c.is_ascii_digit());
277 if starts_with_year {
278 return Some(Classification {
279 doc_type: "research".to_string(),
280 subcategory: Some("customer-interviews".to_string()),
281 confidence: 0.80,
282 source: ClassificationSource::Rule,
283 });
284 }
285 }
286
287 let (doc_type, subcategory, confidence) = if filename.starts_with("prd") {
288 ("product", "prd", 0.95)
289 } else if filename.starts_with("adr-") || filename.starts_with("adr_") {
290 ("decisions", "adr", 0.95)
291 } else if filename.starts_with("rfc-") || filename.starts_with("rfc_") {
292 ("decisions", "rfc", 0.95)
293 } else if filename.starts_with("roadmap") {
294 ("product", "roadmap", 0.90)
295 } else if filename.starts_with("interview") || filename.starts_with("transcript") {
296 ("research", "customer-interviews", 0.90)
297 } else if filename.starts_with("competitive") {
298 ("research", "competitive-analysis", 0.90)
299 } else if filename.starts_with("architecture") {
300 ("technical", "architecture", 0.90)
301 } else if filename.starts_with("api-") || filename.starts_with("api_") {
302 ("technical", "api-docs", 0.90)
303 } else if filename.starts_with("meeting-notes") || filename.starts_with("meeting_notes") {
304 ("decisions", "meetings", 0.85)
305 } else if filename.starts_with("design-system") || filename.starts_with("design_system") {
306 ("design", "design-system", 0.90)
307 } else {
308 return None;
309 };
310
311 Some(Classification {
312 doc_type: doc_type.to_string(),
313 subcategory: Some(subcategory.to_string()),
314 confidence,
315 source: ClassificationSource::Rule,
316 })
317}
318
319fn classify_path_lpm(path_lower: &str) -> Option<Classification> {
324 for rule in PATH_RULES {
325 if path_lower.starts_with(rule.prefix) {
326 let mut result = Classification {
327 doc_type: rule.doc_type.to_string(),
328 subcategory: Some(rule.subcategory.to_string()),
329 confidence: rule.confidence,
330 source: ClassificationSource::Rule,
331 };
332
333 if rule.prefix == ".claude/research/"
335 || rule.prefix == ".claude/research/kardo-pmf/"
336 {
337 if let Some(refined) = refine_research_subcategory(path_lower) {
338 result.doc_type = refined.0.to_string();
339 result.subcategory = Some(refined.1.to_string());
340 result.confidence = refined.2;
341 }
342 }
343
344 if rule.prefix == "docs/" || rule.prefix == "doc/" {
346 if let Some(refined) = refine_docs_subcategory(path_lower) {
347 result.doc_type = refined.0.to_string();
348 result.subcategory = Some(refined.1.to_string());
349 result.confidence = refined.2;
350 }
351 }
352
353 return Some(result);
354 }
355 }
356 None
357}
358
359fn refine_research_subcategory(path_lower: &str) -> Option<(&'static str, &'static str, f64)> {
362 if path_contains_word(path_lower, "prd") {
364 return Some(("product", "prd", 0.80));
365 }
366 if path_contains_word(path_lower, "interview") || path_contains_word(path_lower, "custdev") {
368 return Some(("research", "customer-interviews", 0.80));
369 }
370 if path_contains_word(path_lower, "competitive")
372 || path_contains_word(path_lower, "competitor")
373 || path_contains_word(path_lower, "competitors")
374 {
375 return Some(("research", "competitive-analysis", 0.80));
376 }
377 if path_contains_word(path_lower, "customer")
379 || path_contains_word(path_lower, "persona")
380 || path_contains_word(path_lower, "jtbd")
381 {
382 return Some(("research", "user-research", 0.75));
383 }
384 if path_contains_word(path_lower, "ux") {
386 return Some(("research", "user-research", 0.70));
387 }
388 if path_contains_word(path_lower, "architecture")
390 || path_contains_word(path_lower, "blueprint")
391 {
392 return Some(("technical", "architecture", 0.75));
393 }
394 if path_contains_word(path_lower, "implementation")
396 && path_contains_word(path_lower, "plan")
397 {
398 return Some(("product", "roadmap", 0.75));
399 }
400 None
401}
402
403fn refine_docs_subcategory(path_lower: &str) -> Option<(&'static str, &'static str, f64)> {
406 if path_contains_word(path_lower, "prd") || path_contains_word(path_lower, "requirements") {
408 return Some(("product", "prd", 0.65));
409 }
410 None
411}
412
413fn classify_path_contains(path_lower: &str) -> Option<Classification> {
416 let (doc_type, subcategory, confidence) = if path_contains_word(path_lower, "spec") || path_contains_word(path_lower, "requirement") {
417 ("product", "prd", 0.55)
418 } else if path_contains_word(path_lower, "api") && path_lower.ends_with(".md") {
419 ("technical", "api-docs", 0.55)
420 } else if path_contains_word(path_lower, "design") {
421 ("design", "design-system", 0.55)
422 } else if path_contains_word(path_lower, "architecture") {
423 ("technical", "architecture", 0.55)
424 } else if path_contains_word(path_lower, "test") {
425 ("technical", "spec", 0.50)
426 } else {
427 return None;
428 };
429
430 Some(Classification {
431 doc_type: doc_type.to_string(),
432 subcategory: Some(subcategory.to_string()),
433 confidence,
434 source: ClassificationSource::Rule,
435 })
436}
437
/// Document classifier that applies the path rules first and can fall back to
/// an Ollama-backed LLM for low-confidence results.
pub struct ClassificationEngine {
    // Kept alongside the client it was used to build; nothing in the visible
    // code reads it back, hence the lint exemption.
    #[allow(dead_code)]
    backend: LlmBackend,
    /// Client used for LLM classification; `None` when the backend is `Disabled`.
    ollama: Option<OllamaClient>,
}
444
445impl ClassificationEngine {
446 pub fn new(backend: LlmBackend) -> Self {
447 let ollama = match &backend {
448 LlmBackend::Ollama => Some(OllamaClient::new()),
449 LlmBackend::Disabled => None,
450 };
451
452 Self { backend, ollama }
453 }
454
455 pub async fn classify(&self, relative_path: &str, content: &str) -> Classification {
459 let rule_result = classify_by_rules(relative_path);
461
462 if rule_result.confidence > 0.70 {
464 return rule_result;
465 }
466
467 if let Some(ollama) = &self.ollama {
469 if relative_path.to_lowercase().ends_with(".md")
470 || relative_path.to_lowercase().ends_with(".txt")
471 || relative_path.to_lowercase().ends_with(".rst")
472 {
473 if let Ok(llm_result) =
474 self.classify_by_llm(ollama, relative_path, content).await
475 {
476 if llm_result.doc_type == "unknown" && rule_result.doc_type != "unknown" {
480 return rule_result;
481 }
482 return llm_result;
483 }
484 }
485 }
486
487 rule_result
489 }
490
491 async fn classify_by_llm(
493 &self,
494 ollama: &OllamaClient,
495 relative_path: &str,
496 content: &str,
497 ) -> Result<Classification, LlmError> {
498 let truncated = safe_truncate(content, 500);
500
501 let prompt = format!(
502 r#"Classify this document. Reply with ONLY a JSON object, no other text.
503
504Categories (pick one):
505- product (subcategories: prd, roadmap, brief)
506- research (subcategories: customer-interviews, competitive-analysis, user-research, market-research)
507- technical (subcategories: api-docs, architecture, spec, code, config)
508- design (subcategories: design-system, ui-specs, mockups)
509- decisions (subcategories: adr, rfc, meetings)
510- guides (subcategories: readme, onboarding, how-to, code-instructions)
511- unknown (no subcategory)
512
513File: {path}
514Content: "{content}"
515
516JSON:"#,
517 path = relative_path,
518 content = truncated.replace('"', "'"),
519 );
520
521 let response = ollama
522 .generate(&GenerateRequest {
523 prompt,
524 max_tokens: 80,
525 temperature: 0.1,
526 })
527 .await?;
528
529 parse_llm_response(&response.text)
531 }
532}
533
534fn parse_llm_response(text: &str) -> Result<Classification, LlmError> {
536 let json_str = extract_json(text)
538 .ok_or_else(|| LlmError::Parse("No JSON found in response".to_string()))?;
539
540 let parsed: serde_json::Value = serde_json::from_str(&json_str)
542 .map_err(|e| LlmError::Parse(format!("JSON parse error: {}", e)))?;
543
544 let category = parsed
545 .get("category")
546 .and_then(|v| v.as_str())
547 .unwrap_or("unknown")
548 .to_lowercase();
549
550 let subcategory = parsed
551 .get("subcategory")
552 .and_then(|v| v.as_str())
553 .map(|s| s.to_lowercase());
554
555 let raw_confidence = parsed
556 .get("confidence")
557 .and_then(|v| v.as_f64())
558 .unwrap_or(0.70);
559
560 let category = if is_valid_category(&category) {
562 category
563 } else {
564 "unknown".to_string()
565 };
566
567 let subcategory = match &subcategory {
569 Some(sub) if category != "unknown" && is_valid_subcategory(&category, sub) => {
570 Some(sub.clone())
571 }
572 _ if category == "unknown" => None,
573 _ => None,
574 };
575
576 let confidence = if category == "unknown" {
578 0.30
579 } else {
580 raw_confidence.clamp(0.60, 0.90)
581 };
582
583 Ok(Classification {
584 doc_type: category,
585 subcategory,
586 confidence,
587 source: ClassificationSource::Llm,
588 })
589}
590
/// Extracts the first balanced `{ ... }` object from `text`.
///
/// Scanning starts at the first `{` and stops at the `}` that closes it.
/// Braces inside JSON string literals are ignored (including strings that
/// contain escaped quotes), so a value such as `"a}b"` does not end the object
/// early. Returns `None` when there is no `{` or the object is never closed.
fn extract_json(text: &str) -> Option<String> {
    let start = text.find('{')?;
    let mut depth = 0usize;
    let mut in_string = false;
    let mut escaped = false;

    for (offset, ch) in text[start..].char_indices() {
        if in_string {
            // Inside a string only the closing quote matters; a backslash
            // protects whatever character follows it.
            if escaped {
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == '"' {
                in_string = false;
            }
            continue;
        }

        match ch {
            '"' => in_string = true,
            '{' => depth += 1,
            '}' => {
                // The scan starts on a `{`, so depth is at least 1 here.
                depth -= 1;
                if depth == 0 {
                    let end = start + offset + ch.len_utf8();
                    return Some(text[start..end].to_string());
                }
            }
            _ => {}
        }
    }

    None
}
617
618#[cfg(test)]
619mod tests {
620 use super::*;
621
622 #[test]
625 fn test_path_rules_sorted_by_prefix_length() {
626 for window in PATH_RULES.windows(2) {
627 let a = &window[0];
628 let b = &window[1];
629 if b.prefix.starts_with(a.prefix) || a.prefix.starts_with(b.prefix) {
632 assert!(
633 a.prefix.len() >= b.prefix.len(),
634 "LPM invariant violated: '{}' (len {}) should come after '{}' (len {})",
635 a.prefix, a.prefix.len(), b.prefix, b.prefix.len()
636 );
637 }
638 }
639 }
640
641 #[test]
644 fn test_path_contains_word_basic() {
645 assert!(path_contains_word("some/spec/file.md", "spec"));
646 assert!(path_contains_word("spec/file.md", "spec"));
647 assert!(path_contains_word("dir/spec.md", "spec"));
648 assert!(path_contains_word("spec", "spec"));
649 }
650
651 #[test]
652 fn test_path_contains_word_rejects_substring() {
653 assert!(!path_contains_word("aspect-1/file.md", "spec"));
654 assert!(!path_contains_word("inspect/file.md", "spec"));
655 assert!(!path_contains_word("specification.md", "spec"));
656 assert!(!path_contains_word("retrospective.md", "spec"));
657 }
658
659 #[test]
660 fn test_path_contains_word_boundaries() {
661 assert!(path_contains_word("path/spec-v2.md", "spec"));
662 assert!(path_contains_word("path/spec_v2.md", "spec"));
663 assert!(path_contains_word("path/spec.md", "spec"));
664 assert!(path_contains_word("api/v2/docs.md", "api"));
665 assert!(!path_contains_word("rapid/file.md", "api"));
666 }
667
668 #[test]
671 fn test_rule_readme() {
672 let result = classify_by_rules("README.md");
673 assert_eq!(result.doc_type, "guides");
674 assert_eq!(result.subcategory.as_deref(), Some("readme"));
675 assert_eq!(result.source, ClassificationSource::Rule);
676 assert!(result.confidence >= 0.95);
677 }
678
679 #[test]
680 fn test_rule_claude_md() {
681 let result = classify_by_rules("CLAUDE.md");
682 assert_eq!(result.doc_type, "guides");
683 assert_eq!(result.subcategory.as_deref(), Some("code-instructions"));
684 }
685
686 #[test]
687 fn test_rule_docs_directory() {
688 let result = classify_by_rules("docs/setup.md");
689 assert_eq!(result.doc_type, "guides");
690 assert_eq!(result.subcategory.as_deref(), Some("how-to"));
691 assert!((result.confidence - 0.50).abs() < 0.01);
693 }
694
695 #[test]
696 fn test_rule_github_workflow() {
697 let result = classify_by_rules(".github/workflows/ci.yml");
698 assert_eq!(result.doc_type, "technical");
699 assert_eq!(result.subcategory.as_deref(), Some("config"));
700 assert!(result.confidence >= 0.85);
701 }
702
703 #[test]
704 fn test_rule_license() {
705 let result = classify_by_rules("LICENSE");
706 assert_eq!(result.doc_type, "guides");
707 assert_eq!(result.subcategory.as_deref(), Some("readme"));
708 }
709
710 #[test]
711 fn test_fallback_unknown_file() {
712 let result = classify_by_rules("random-notes.md");
713 assert_eq!(result.doc_type, "unknown");
714 assert_eq!(result.subcategory, None);
715 assert_eq!(result.source, ClassificationSource::Fallback);
716 assert!(result.confidence <= 0.10);
717 }
718
719 #[test]
720 fn test_rule_changelog() {
721 let result = classify_by_rules("CHANGELOG.md");
722 assert_eq!(result.doc_type, "decisions");
723 assert_eq!(result.subcategory.as_deref(), Some("meetings"));
724 }
725
726 #[test]
727 fn test_rule_dockerfile() {
728 let result = classify_by_rules("Dockerfile");
729 assert_eq!(result.doc_type, "technical");
730 assert_eq!(result.subcategory.as_deref(), Some("config"));
731 }
732
733 #[test]
734 fn test_rule_architecture_in_docs() {
735 let result = classify_by_rules("docs/architecture-overview.md");
737 assert_eq!(result.doc_type, "technical");
738 assert_eq!(result.subcategory.as_deref(), Some("architecture"));
739 assert_eq!(result.source, ClassificationSource::Rule);
740 }
741
742 #[test]
743 fn test_rule_cursorrules() {
744 let result = classify_by_rules(".cursorrules");
745 assert_eq!(result.doc_type, "guides");
746 assert_eq!(result.subcategory.as_deref(), Some("code-instructions"));
747 }
748
749 #[test]
752 fn test_rule_prd() {
753 let result = classify_by_rules("prd-v2.md");
754 assert_eq!(result.doc_type, "product");
755 assert_eq!(result.subcategory.as_deref(), Some("prd"));
756 assert!(result.confidence >= 0.95);
757 }
758
759 #[test]
760 fn test_rule_adr() {
761 let result = classify_by_rules("adr-001-use-sqlite.md");
762 assert_eq!(result.doc_type, "decisions");
763 assert_eq!(result.subcategory.as_deref(), Some("adr"));
764 }
765
766 #[test]
767 fn test_rule_rfc() {
768 let result = classify_by_rules("rfc-classification-v2.md");
769 assert_eq!(result.doc_type, "decisions");
770 assert_eq!(result.subcategory.as_deref(), Some("rfc"));
771 }
772
773 #[test]
774 fn test_rule_interview() {
775 let result = classify_by_rules("interview-user-01.md");
776 assert_eq!(result.doc_type, "research");
777 assert_eq!(result.subcategory.as_deref(), Some("customer-interviews"));
778 }
779
780 #[test]
781 fn test_rule_rust_code() {
782 let result = classify_by_rules("src/main.rs");
783 assert_eq!(result.doc_type, "technical");
784 assert_eq!(result.subcategory.as_deref(), Some("code"));
785 assert!(result.confidence >= 0.95);
786 }
787
788 #[test]
789 fn test_rule_config_file() {
790 let result = classify_by_rules("config/settings.json");
791 assert_eq!(result.doc_type, "technical");
792 assert_eq!(result.subcategory.as_deref(), Some("config"));
793 }
794
795 #[test]
796 fn test_rule_claude_directory() {
797 let result = classify_by_rules(".claude/instructions");
799 assert_eq!(result.doc_type, "guides");
800 assert_eq!(result.subcategory.as_deref(), Some("code-instructions"));
801 }
802
803 #[test]
804 fn test_rule_roadmap() {
805 let result = classify_by_rules("roadmap-2026.md");
806 assert_eq!(result.doc_type, "product");
807 assert_eq!(result.subcategory.as_deref(), Some("roadmap"));
808 }
809
810 #[test]
811 fn test_rule_competitive_analysis() {
812 let result = classify_by_rules("competitive-analysis.md");
813 assert_eq!(result.doc_type, "research");
814 assert_eq!(result.subcategory.as_deref(), Some("competitive-analysis"));
815 }
816
817 #[test]
818 fn test_rule_api_docs() {
819 let result = classify_by_rules("api-reference.md");
820 assert_eq!(result.doc_type, "technical");
821 assert_eq!(result.subcategory.as_deref(), Some("api-docs"));
822 }
823
824 #[test]
827 fn test_lpm_claude_research() {
828 let result = classify_by_rules(".claude/research/2026-01-19-kardo-market-research.md");
829 assert_eq!(result.doc_type, "research");
830 assert_eq!(result.subcategory.as_deref(), Some("market-research"));
831 assert!((result.confidence - 0.75).abs() < 0.01);
832 }
833
834 #[test]
835 fn test_lpm_claude_research_competitors() {
836 let result = classify_by_rules(".claude/research/competitors/kardo-competitors.md");
837 assert_eq!(result.doc_type, "research");
838 assert_eq!(result.subcategory.as_deref(), Some("competitive-analysis"));
839 assert!((result.confidence - 0.80).abs() < 0.01);
840 }
841
842 #[test]
843 fn test_lpm_claude_plans() {
844 let result = classify_by_rules(".claude/plans/phase-6-fine-tuning-qwen3-4b.md");
845 assert_eq!(result.doc_type, "product");
846 assert_eq!(result.subcategory.as_deref(), Some("roadmap"));
847 assert!((result.confidence - 0.75).abs() < 0.01);
848 }
849
850 #[test]
851 fn test_lpm_claude_analysis() {
852 let result = classify_by_rules(".claude/analysis/recommendations-final.md");
853 assert_eq!(result.doc_type, "research");
854 assert_eq!(result.subcategory.as_deref(), Some("market-research"));
855 }
856
857 #[test]
858 fn test_lpm_claude_agents_no_prefix_collision() {
859 let result = classify_by_rules(".claude/agents/api-validator.md");
861 assert_eq!(result.doc_type, "guides");
862 assert_eq!(result.subcategory.as_deref(), Some("code-instructions"));
863 assert!((result.confidence - 0.80).abs() < 0.01);
864 }
865
866 #[test]
867 fn test_lpm_claude_agents_competitive_no_prefix_collision() {
868 let result = classify_by_rules(".claude/agents/competitive-research.md");
870 assert_eq!(result.doc_type, "guides");
871 assert_eq!(result.subcategory.as_deref(), Some("code-instructions"));
872 }
873
874 #[test]
875 fn test_lpm_claude_catch_all_low_confidence() {
876 let result = classify_by_rules(".claude/OPEN_QUESTIONS.md");
878 assert_eq!(result.doc_type, "guides");
879 assert_eq!(result.subcategory.as_deref(), Some("code-instructions"));
880 assert!((result.confidence - 0.50).abs() < 0.01);
881 }
882
883 #[test]
884 fn test_lpm_docs_plans() {
885 let result = classify_by_rules("docs/plans/IDEAS.md");
886 assert_eq!(result.doc_type, "product");
887 assert_eq!(result.subcategory.as_deref(), Some("roadmap"));
888 assert!((result.confidence - 0.75).abs() < 0.01);
889 }
890
891 #[test]
892 fn test_lpm_docs_catch_all_low_confidence() {
893 let result = classify_by_rules("docs/MVP_PRD.md");
895 assert_eq!(result.doc_type, "product");
896 assert_eq!(result.subcategory.as_deref(), Some("prd"));
897 }
898
899 #[test]
900 fn test_lpm_ai_first_idea_research() {
901 let result = classify_by_rules("AI_First_Idea/research/FINAL-RESEARCH-REPORT.md");
902 assert_eq!(result.doc_type, "research");
903 assert_eq!(result.subcategory.as_deref(), Some("market-research"));
904 assert!((result.confidence - 0.75).abs() < 0.01);
905 }
906
907 #[test]
908 fn test_lpm_ai_first_idea_articles() {
909 let result = classify_by_rules("AI_First_Idea/articles/ai-first-manifesto/article-en.md");
910 assert_eq!(result.doc_type, "guides");
911 assert_eq!(result.subcategory.as_deref(), Some("how-to"));
912 assert!((result.confidence - 0.70).abs() < 0.01);
913 }
914
915 #[test]
916 fn test_lpm_ai_first_idea_root() {
917 let result = classify_by_rules("AI_First_Idea/VISION_DOCUMENT.md");
918 assert_eq!(result.doc_type, "research");
919 assert_eq!(result.subcategory.as_deref(), Some("market-research"));
920 assert!((result.confidence - 0.55).abs() < 0.01);
921 }
922
923 #[test]
924 fn test_lpm_habr_plan_pipeline_research() {
925 let result = classify_by_rules("Habr_plan/pipeline/research/habr-content-strategy.md");
926 assert_eq!(result.doc_type, "research");
927 assert_eq!(result.subcategory.as_deref(), Some("market-research"));
928 }
929
930 #[test]
931 fn test_lpm_habr_plan_articles() {
932 let result = classify_by_rules("Habr_plan/articles/ai-killing-b2b-saas/habr-article.md");
933 assert_eq!(result.doc_type, "guides");
934 assert_eq!(result.subcategory.as_deref(), Some("how-to"));
935 }
936
937 #[test]
938 fn test_lpm_habr_plan_root() {
939 let result = classify_by_rules("Habr_plan/habr-editor.skill");
940 assert_eq!(result.doc_type, "guides");
943 assert_eq!(result.subcategory.as_deref(), Some("how-to"));
944 }
945
946 #[test]
947 fn test_spec_word_boundary_no_false_positive() {
948 let result = classify_by_rules("AI_First_Idea/research/aspect-1-saas-decline/raw-findings.md");
950 assert_eq!(result.doc_type, "research");
952 assert_eq!(result.subcategory.as_deref(), Some("market-research"));
953 }
954
955 #[test]
956 fn test_spec_word_boundary_true_positive() {
957 let result = classify_by_rules("project/spec/requirements.md");
958 assert_eq!(result.doc_type, "product");
959 assert_eq!(result.subcategory.as_deref(), Some("prd"));
960 }
961
962 #[test]
963 fn test_lpm_claude_discovery() {
964 let result = classify_by_rules(".claude/discovery/2026-02-03-kardo-core-problem.md");
965 assert_eq!(result.doc_type, "research");
966 assert_eq!(result.subcategory.as_deref(), Some("market-research"));
967 }
968
969 #[test]
970 fn test_lpm_claude_components_registry() {
971 let result = classify_by_rules(".claude/components-registry/docs/shared/animated-tabs.md");
972 assert_eq!(result.doc_type, "design");
973 assert_eq!(result.subcategory.as_deref(), Some("design-system"));
974 }
975
976 #[test]
977 fn test_lpm_claude_memory() {
978 let result = classify_by_rules(".claude/memory/context.md");
979 assert_eq!(result.doc_type, "guides");
980 assert_eq!(result.subcategory.as_deref(), Some("how-to"));
981 }
982
983 #[test]
984 fn test_lpm_github_without_workflow() {
985 let result = classify_by_rules(".github/CODEOWNERS");
986 assert_eq!(result.doc_type, "technical");
987 assert_eq!(result.subcategory.as_deref(), Some("config"));
988 }
989
990 #[test]
993 fn test_rule_windsurfrules() {
994 let result = classify_by_rules(".windsurfrules");
995 assert_eq!(result.doc_type, "guides");
996 assert_eq!(result.subcategory.as_deref(), Some("code-instructions"));
997 assert!(result.confidence >= 0.95);
998 }
999
1000 #[test]
1001 fn test_rule_agents_md() {
1002 let result = classify_by_rules("AGENTS.md");
1003 assert_eq!(result.doc_type, "guides");
1004 assert_eq!(result.subcategory.as_deref(), Some("code-instructions"));
1005 assert!(result.confidence >= 0.95);
1006 }
1007
/// A well-formed JSON reply is expected to parse into the stated category,
/// subcategory and (approximately) the stated confidence, tagged with the
/// `Llm` source.
#[test]
fn test_parse_llm_valid_json() {
    let raw = r#"{"category": "product", "subcategory": "prd", "confidence": 0.85}"#;
    let parsed = parse_llm_response(raw).unwrap();
    let confidence_gap = (parsed.confidence - 0.85).abs();
    assert_eq!(parsed.doc_type, "product");
    assert_eq!(parsed.subcategory.as_deref(), Some("prd"));
    assert!(confidence_gap < 0.01);
    assert_eq!(parsed.source, ClassificationSource::Llm);
}
1019
/// A JSON object embedded between lines of free text is expected to be
/// parsed as if it were the whole reply.
#[test]
fn test_parse_llm_json_with_surrounding_text() {
    // The raw string spans three lines; its continuation lines must stay
    // unindented so the payload is unchanged.
    let raw = r#"Here is the classification:
{"category": "research", "subcategory": "customer-interviews", "confidence": 0.90}
That's my answer."#;
    let parsed = parse_llm_response(raw).unwrap();
    let subcategory = parsed.subcategory.as_deref();
    assert_eq!(parsed.doc_type, "research");
    assert_eq!(subcategory, Some("customer-interviews"));
}
1032
/// An `unknown` reply without a subcategory and with a reported confidence
/// of 0.50 is expected to come back as `unknown`, no subcategory, and a
/// confidence of roughly 0.30.
#[test]
fn test_parse_llm_unknown_with_low_confidence() {
    let raw = r#"{"category": "unknown", "confidence": 0.50}"#;
    let parsed = parse_llm_response(raw).unwrap();
    let confidence_gap = (parsed.confidence - 0.30).abs();
    assert_eq!(parsed.doc_type, "unknown");
    assert_eq!(parsed.subcategory, None);
    assert!(confidence_gap < 0.01);
}
1041
/// Reported confidences outside the accepted band are expected to be pulled
/// back into it: 0.99 comes out as roughly 0.90 and 0.20 as roughly 0.60.
#[test]
fn test_parse_llm_confidence_clamping() {
    // (raw reply, confidence expected after parsing), checked in order.
    let cases: [(&str, f64); 2] = [
        (
            r#"{"category": "product", "subcategory": "prd", "confidence": 0.99}"#,
            0.90,
        ),
        (
            r#"{"category": "product", "subcategory": "prd", "confidence": 0.20}"#,
            0.60,
        ),
    ];
    for &(raw, expected) in &cases {
        let parsed = parse_llm_response(raw).unwrap();
        assert!((parsed.confidence - expected).abs() < 0.01);
    }
}
1052
/// A category that is not recognised (`banana`) is expected to come back as
/// `unknown` with a confidence of roughly 0.30.
#[test]
fn test_parse_llm_invalid_category() {
    let raw = r#"{"category": "banana", "confidence": 0.80}"#;
    let parsed = parse_llm_response(raw).unwrap();
    let confidence_gap = (parsed.confidence - 0.30).abs();
    assert_eq!(parsed.doc_type, "unknown");
    assert!(confidence_gap < 0.01);
}
1060
/// A valid category paired with an unrecognised subcategory (`nonexistent`)
/// is expected to keep the category and drop the subcategory.
#[test]
fn test_parse_llm_invalid_subcategory() {
    let raw = r#"{"category": "product", "subcategory": "nonexistent", "confidence": 0.80}"#;
    let parsed = parse_llm_response(raw).unwrap();
    assert_eq!(parsed.doc_type, "product");
    assert_eq!(parsed.subcategory, None);
}
1068
/// A reply that contains no JSON object at all is expected to be rejected
/// with an error.
#[test]
fn test_parse_llm_no_json() {
    let outcome = parse_llm_response("This is just plain text with no JSON");
    assert!(outcome.is_err());
}
1075
/// Exercises `extract_json` on three inputs: an object surrounded by text,
/// text without any object, and an object containing a nested object.
#[test]
fn test_extract_json() {
    // (input, expected extraction), checked in order.
    let cases: [(&str, Option<&str>); 3] = [
        (r#"blah {"a": 1} blah"#, Some(r#"{"a": 1}"#)),
        ("no json here", None),
        (r#"{"nested": {"b": 2}}"#, Some(r#"{"nested": {"b": 2}}"#)),
    ];
    for &(input, expected) in &cases {
        assert_eq!(extract_json(input), expected.map(str::to_string));
    }
}
1088
/// With the LLM backend disabled, `README.md` is expected to be classified
/// as guides / readme and attributed to the `Rule` source.
#[test]
fn test_engine_high_confidence_skips_llm() {
    let engine = ClassificationEngine::new(LlmBackend::Disabled);
    let runtime = tokio::runtime::Runtime::new().unwrap();
    // Drive the async classification to completion on the local runtime.
    let outcome = runtime.block_on(engine.classify("README.md", "# My Project"));
    let subcategory = outcome.subcategory.as_deref();
    assert_eq!(outcome.doc_type, "guides");
    assert_eq!(subcategory, Some("readme"));
    assert_eq!(outcome.source, ClassificationSource::Rule);
}
1100
/// With the LLM backend disabled, a path that matches nothing specific
/// (`random-notes.md`) is expected to come back as `unknown`.
///
/// NOTE(review): the test name says "returns rule", but the asserted source
/// is `Fallback` — confirm which wording is intended.
#[test]
fn test_engine_low_confidence_returns_rule_when_no_llm() {
    let engine = ClassificationEngine::new(LlmBackend::Disabled);
    let runtime = tokio::runtime::Runtime::new().unwrap();
    // Drive the async classification to completion on the local runtime.
    let outcome = runtime.block_on(engine.classify("random-notes.md", "Some random content"));
    assert_eq!(outcome.doc_type, "unknown");
    assert_eq!(outcome.source, ClassificationSource::Fallback);
}
1109
/// Expects `docs/MVP_PRD.md` to be classified as product / prd.
#[test]
fn test_docs_mvp_prd_classified_as_product_prd() {
    let path = "docs/MVP_PRD.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "product");
    assert_eq!(subcategory, Some("prd"));
}
1119
/// Expects `docs/REQUIREMENTS_SYSTEM.md` to be classified as product / prd.
#[test]
fn test_docs_requirements_system_classified_as_product_prd() {
    let path = "docs/REQUIREMENTS_SYSTEM.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "product");
    assert_eq!(subcategory, Some("prd"));
}
1127
/// Expects `docs/UI_DECISIONS.md` to be classified as design / ui-specs.
#[test]
fn test_docs_ui_decisions_classified_as_design_ui_specs() {
    let path = "docs/UI_DECISIONS.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "design");
    assert_eq!(subcategory, Some("ui-specs"));
}
1135
/// Expects `docs/PRODUCT_STRATEGY.md` to be classified as product / brief.
#[test]
fn test_docs_product_strategy_classified_as_product_brief() {
    let path = "docs/PRODUCT_STRATEGY.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "product");
    assert_eq!(subcategory, Some("brief"));
}
1143
/// Expects a dated custdev interview file under `.claude/research/` to be
/// classified as research / customer-interviews.
#[test]
fn test_research_custdev_interview_classified_correctly() {
    let path = ".claude/research/2026-01-19-custdev-interview-1.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "research");
    assert_eq!(subcategory, Some("customer-interviews"));
}
1153
/// Expects a date-prefixed `competitive-analysis` file under
/// `.claude/research/` to be classified as research / competitive-analysis.
#[test]
fn test_research_competitive_analysis_date_prefix() {
    let path = ".claude/research/2026-01-28-competitive-analysis-docs-visualization.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "research");
    assert_eq!(subcategory, Some("competitive-analysis"));
}
1163
/// Expects a date-prefixed `competitive-positioning` file under
/// `.claude/research/` to be classified as research / competitive-analysis.
#[test]
fn test_research_competitive_positioning_date_prefix() {
    let path = ".claude/research/2026-01-28-competitive-positioning-claude-code-companion.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "research");
    assert_eq!(subcategory, Some("competitive-analysis"));
}
1172
/// Expects `.claude/research/cycle8-competitive-moat.md` to be classified as
/// research / competitive-analysis.
#[test]
fn test_research_cycle_competitive_moat() {
    let path = ".claude/research/cycle8-competitive-moat.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "research");
    assert_eq!(subcategory, Some("competitive-analysis"));
}
1179
/// Expects a PRD file that lives under `.claude/research/kardo-pmf/` to be
/// classified as product / prd rather than as research.
#[test]
fn test_research_kardo_pmf_prd_file() {
    let path = ".claude/research/kardo-pmf/Feature_Requirements_PRD.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "product");
    assert_eq!(subcategory, Some("prd"));
}
1189
/// Expects a file inside `.claude/research/competitors/` to be classified as
/// research / competitive-analysis with a confidence of roughly 0.80.
#[test]
fn test_research_competitors_dir_still_correct() {
    let path = ".claude/research/competitors/kardo-competitors.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    let confidence_gap = (classification.confidence - 0.80).abs();
    assert_eq!(classification.doc_type, "research");
    assert_eq!(subcategory, Some("competitive-analysis"));
    assert!(confidence_gap < 0.01);
}
1200
/// Expects a timestamp-named file under `AI_First_Idea/` to be classified as
/// research / customer-interviews with a confidence of roughly 0.80.
#[test]
fn test_ai_first_idea_timestamp_file_is_customer_interview() {
    let path = "AI_First_Idea/2026-02-14 22.19.41.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    let confidence_gap = (classification.confidence - 0.80).abs();
    assert_eq!(classification.doc_type, "research");
    assert_eq!(subcategory, Some("customer-interviews"));
    assert!(confidence_gap < 0.01);
}
1211
/// Expects a file under `AI_First_Idea/` whose name is not a timestamp to be
/// classified as research / market-research.
#[test]
fn test_ai_first_idea_non_timestamp_not_affected() {
    let path = "AI_First_Idea/VISION_DOCUMENT.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "research");
    assert_eq!(subcategory, Some("market-research"));
}
1219
/// Expects a file under `.claude/research/calibration/dataset/` to be
/// classified as technical / config with a confidence of roughly 0.70.
#[test]
fn test_calibration_dataset_classified_as_technical_config() {
    let path = ".claude/research/calibration/dataset/Alive24--CKBoost/claude-md.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    let confidence_gap = (classification.confidence - 0.70).abs();
    assert_eq!(classification.doc_type, "technical");
    assert_eq!(subcategory, Some("config"));
    assert!(confidence_gap < 0.01);
}
1231
/// Expects `kardo/crates/kardo-desktop/build-notes.md` to be classified as
/// technical / code with a confidence of roughly 0.50.
#[test]
fn test_kardo_build_notes_classified_as_technical() {
    let path = "kardo/crates/kardo-desktop/build-notes.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    let confidence_gap = (classification.confidence - 0.50).abs();
    assert_eq!(classification.doc_type, "technical");
    assert_eq!(subcategory, Some("code"));
    assert!(confidence_gap < 0.01);
}
1244
/// Expects a research file whose name contains `competitors` to be
/// classified as research / competitive-analysis.
#[test]
fn test_research_competitors_word_refined_to_competitive_analysis() {
    let path = ".claude/research/2026-01-22-kardo-competitors.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "research");
    assert_eq!(subcategory, Some("competitive-analysis"));
}
1254
/// Expects `.claude/research/cycle1-customer-voice.md` to be classified as
/// research / user-research.
#[test]
fn test_research_customer_voice_refined_to_user_research() {
    let path = ".claude/research/cycle1-customer-voice.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "research");
    assert_eq!(subcategory, Some("user-research"));
}
1262
/// Expects a research file whose name ends in `ux-patterns` to be classified
/// as research / user-research.
#[test]
fn test_research_ux_patterns_refined_to_user_research() {
    let path = ".claude/research/2026-02-03-R5-health-dashboard-ux-patterns.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "research");
    assert_eq!(subcategory, Some("user-research"));
}
1272
/// Expects `.claude/research/persona-analysis.md` to be classified as
/// research / user-research.
#[test]
fn test_research_persona_refined_to_user_research() {
    let path = ".claude/research/persona-analysis.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "research");
    assert_eq!(subcategory, Some("user-research"));
}
1280
/// Expects a `market-research` file under `.claude/research/` to be
/// classified as research / market-research.
#[test]
fn test_research_market_still_default() {
    let path = ".claude/research/2026-01-19-kardo-market-research.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "research");
    assert_eq!(subcategory, Some("market-research"));
}
1290
/// Expects an `architecture-blueprint` file under `.claude/research/` to be
/// classified as technical / architecture rather than as research.
#[test]
fn test_research_architecture_blueprint_refined_to_technical() {
    let path = ".claude/research/2026-02-11-architecture-blueprint.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "technical");
    assert_eq!(subcategory, Some("architecture"));
}
1301
/// Expects a research file whose name ends in `architecture` to be
/// classified as technical / architecture rather than as research.
#[test]
fn test_research_classification_architecture_refined_to_technical() {
    let path = ".claude/research/2026-01-28-context-aware-classification-architecture.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "technical");
    assert_eq!(subcategory, Some("architecture"));
}
1310
/// Expects an `implementation-plan` file under `.claude/research/` to be
/// classified as product / roadmap rather than as research.
#[test]
fn test_research_implementation_plan_refined_to_product_roadmap() {
    let path = ".claude/research/2026-02-11-phase1-implementation-plan.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "product");
    assert_eq!(subcategory, Some("roadmap"));
}
1319
/// Expects a research file whose name mentions `implementation` but not
/// `plan` to stay classified as research / market-research.
#[test]
fn test_research_implementation_without_plan_stays_research() {
    let path = ".claude/research/2026-01-22-virtual-structure-implementation.md";
    let classification = classify_by_rules(path);
    let subcategory = classification.subcategory.as_deref();
    assert_eq!(classification.doc_type, "research");
    assert_eq!(subcategory, Some("market-research"));
}
1329}