Skip to main content

sqlite_graphrag/commands/
ingest_heuristics.rs

//! Deterministic heuristic for generating descriptions of ingested memories.
//!
//! GAP-E2E-011 (FALTA-6): every ingested memory used to receive the hardcoded
//! description `"ingested from <path>"`, which made the listing useless
//! and degraded search results. This pure-Rust heuristic instead extracts
//! the first meaningful line of the body, ignoring markdown headers.
//!
//! Rules:
//! - Picks the first non-empty line longer than 20 characters
//! - Ignores lines starting with `#` (markdown headers)
//! - Truncates at 100 characters via `chars().take(100)`
//! - Fallback: the path stem (file name without extension) when a usable one
//!   is available, otherwise `"ingested document"`
//!
//! Determinism: nothing here depends on hash iteration order, on an LLM,
//! or on filesystem order. The output is byte-for-byte reproducible.
16
17/// Extracts a heuristic description from the body of an ingested document.
18///
19/// Returns the first meaningful line (non-empty, >20 chars, not a markdown
20/// header) truncated at 100 characters. Contextual deterministic fallback:
21/// when no line meets the criteria, uses the path stem (name without extension),
22/// or `"ingested document"` if the stem is empty or invalid.
23///
24/// FALTA-6 (v1.0.89): the edge case of a body with only Markdown headers now
25/// generates a description useful to the operator instead of the generic placeholder.
26pub fn extract_heuristic_description(body: &str, path_hint: Option<&str>) -> String {
27    let from_body = body
28        .lines()
29        .map(str::trim)
30        .find(|line| line.len() > 20 && !line.starts_with('#'))
31        .map(|line| line.chars().take(100).collect::<String>());
32    if let Some(desc) = from_body {
33        return desc;
34    }
35    // Fallback contextual: usar stem do path quando heurística do body falhar.
36    if let Some(stem) = path_hint.and_then(derive_stem) {
37        return stem;
38    }
39    "ingested document".to_string()
40}
41
/// Returns the trimmed file stem of `path` (file name without its final
/// extension), capped at 100 characters.
///
/// Yields `None` when the path has no file stem, when the stem is not valid
/// UTF-8, or when the trimmed stem is shorter than 2 bytes.
fn derive_stem(path: &str) -> Option<String> {
    let stem = std::path::Path::new(path).file_stem()?.to_str()?.trim();
    // `len()` counts bytes, so this rejects the empty stem and any
    // single-byte (one ASCII character) stem.
    if stem.len() < 2 {
        return None;
    }
    Some(stem.chars().take(100).collect())
}
54
#[cfg(test)]
mod tests {
    use super::*;

    /// The header and the blank line are skipped; the first long line wins.
    #[test]
    fn extracts_first_meaningful_line() {
        let body = concat!(
            "# Title\n",
            "\n",
            "This is the actual first sentence of the document that has more than twenty characters.\n",
            "Second line should be ignored.\n",
        );
        let desc = extract_heuristic_description(body, Some("/tmp/spec.md"));
        assert!(
            desc.starts_with("This is the actual"),
            "desc deve começar com a primeira linha útil, got: {desc}"
        );
    }

    /// FALTA-6: a Markdown document made only of headers (no text line longer
    /// than 20 characters) falls back to the stem of the path hint.
    #[test]
    fn falls_back_to_stem_when_only_headers() {
        let body = concat!("# Header 1\n", "## Header 2\n", "### Header 3\n");
        let desc = extract_heuristic_description(body, Some("/tmp/headers-only.md"));
        assert_eq!(desc, "headers-only");
    }

    /// With neither a usable body line nor a path hint, the generic
    /// placeholder is returned.
    #[test]
    fn falls_back_to_ingested_document_when_no_path() {
        let desc = extract_heuristic_description("# Only Header", None);
        assert_eq!(desc, "ingested document");
    }

    /// A single 200-character line must come back with at most 100 characters.
    #[test]
    fn truncates_at_100_chars() {
        let long: String = std::iter::repeat('a').take(200).collect();
        let desc = extract_heuristic_description(&long, None);
        let char_count = desc.chars().count();
        assert!(
            char_count <= 100,
            "desc deve ter no máximo 100 chars, got: {}",
            char_count
        );
    }

    /// Confirms that the simplified path (no `path_hint`) still works.
    #[test]
    fn back_compat_single_arg_returns_body_only() {
        let body = concat!(
            "# H\n",
            "\n",
            "First sentence that has more than twenty characters of useful text.\n",
        );
        let desc = extract_heuristic_description(body, None);
        assert!(desc.starts_with("First sentence"));
    }
}