Skip to main content

lean_ctx/core/
knowledge_provider_extract.rs

1//! Knowledge auto-extraction from provider data (issues, PRs, DB schemas).
2//!
3//! Converts external `ContentChunk`s into knowledge facts that flow into the
4//! `ProjectKnowledge` system. This implements the "Sleep Replay" pattern from
5//! hippocampal consolidation: raw episodic data is transformed into semantic
6//! long-term knowledge.
7//!
8//! Extraction rules:
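//!   - Issues / tickets → `known_bugs`, `known_features` or `known_issues` (chosen from
//!     labels), plus one `file_mentions` fact per referenced file
//!   - PRs → `recent_changes`, plus one `changed_files` fact per referenced file
//!   - DB schemas → `data_model` facts
//!   - Wiki pages → `documentation`, plus one `documented_files` fact per referenced file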
13
14use crate::core::content_chunk::ContentChunk;
15
/// A knowledge fact extracted from provider data, ready for `ProjectKnowledge.remember()`.
///
/// `PartialEq` is derived so facts can be compared directly (e.g. in assertions or when
/// de-duplicating). `Eq` is intentionally absent because `confidence` is an `f32`.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedFact {
    /// Knowledge category the fact is filed under, e.g. `known_bugs` or `recent_changes`.
    pub category: String,
    /// Identifier of the fact within its category: a provider-qualified id such as
    /// `{provider}#{id}` for item-level facts, or a referenced file path for per-file facts.
    pub key: String,
    /// Human-readable payload, e.g. the item title optionally followed by its state.
    pub value: String,
    /// Confidence score assigned by the extractor (this module uses values in `0.7..=0.95`).
    pub confidence: f32,
}
24
25/// Extract knowledge facts from a set of ContentChunks.
26pub fn extract_facts(chunks: &[ContentChunk]) -> Vec<ExtractedFact> {
27    let mut facts = Vec::new();
28
29    for chunk in chunks {
30        if !chunk.is_external() {
31            continue;
32        }
33
34        let provider = chunk.provider_id().unwrap_or("unknown");
35        match chunk.kind {
36            crate::core::bm25_index::ChunkKind::Issue
37            | crate::core::bm25_index::ChunkKind::Ticket => {
38                extract_issue_facts(chunk, provider, &mut facts);
39            }
40            crate::core::bm25_index::ChunkKind::PullRequest => {
41                extract_pr_facts(chunk, provider, &mut facts);
42            }
43            crate::core::bm25_index::ChunkKind::WikiPage => {
44                extract_wiki_facts(chunk, provider, &mut facts);
45            }
46            crate::core::bm25_index::ChunkKind::DbSchema => {
47                extract_db_facts(chunk, provider, &mut facts);
48            }
49            _ => {}
50        }
51    }
52
53    facts
54}
55
56fn extract_issue_facts(chunk: &ContentChunk, provider: &str, facts: &mut Vec<ExtractedFact>) {
57    let state = chunk
58        .metadata
59        .as_ref()
60        .and_then(|m| m["state"].as_str())
61        .unwrap_or("unknown");
62
63    let labels: Vec<&str> = chunk
64        .metadata
65        .as_ref()
66        .and_then(|m| m["labels"].as_array())
67        .map(|arr| arr.iter().filter_map(|v| v.as_str()).collect())
68        .unwrap_or_default();
69
70    let category = if labels.iter().any(|l| {
71        let lower = l.to_lowercase();
72        lower.contains("bug") || lower.contains("defect")
73    }) {
74        "known_bugs"
75    } else if labels.iter().any(|l| {
76        let lower = l.to_lowercase();
77        lower.contains("feature") || lower.contains("enhancement")
78    }) {
79        "known_features"
80    } else {
81        "known_issues"
82    };
83
84    let issue_id = chunk
85        .file_path
86        .rsplit('/')
87        .next()
88        .unwrap_or(&chunk.file_path);
89
90    facts.push(ExtractedFact {
91        category: category.to_string(),
92        key: format!("{provider}#{issue_id}"),
93        value: format!("{} [{}]", chunk.symbol_name, state),
94        confidence: if state == "open" { 0.9 } else { 0.7 },
95    });
96
97    for ref_path in &chunk.references {
98        facts.push(ExtractedFact {
99            category: "file_mentions".to_string(),
100            key: ref_path.clone(),
101            value: format!(
102                "Referenced in {} {provider}#{issue_id}: {}",
103                category, chunk.symbol_name
104            ),
105            confidence: 0.85,
106        });
107    }
108}
109
110fn extract_pr_facts(chunk: &ContentChunk, provider: &str, facts: &mut Vec<ExtractedFact>) {
111    let state = chunk
112        .metadata
113        .as_ref()
114        .and_then(|m| m["state"].as_str())
115        .unwrap_or("unknown");
116
117    let pr_id = chunk
118        .file_path
119        .rsplit('/')
120        .next()
121        .unwrap_or(&chunk.file_path);
122
123    facts.push(ExtractedFact {
124        category: "recent_changes".to_string(),
125        key: format!("{provider}#PR{pr_id}"),
126        value: format!("{} [{}]", chunk.symbol_name, state),
127        confidence: if state == "open" { 0.95 } else { 0.8 },
128    });
129
130    for ref_path in &chunk.references {
131        facts.push(ExtractedFact {
132            category: "changed_files".to_string(),
133            key: ref_path.clone(),
134            value: format!("Changed in PR {provider}#{pr_id}: {}", chunk.symbol_name),
135            confidence: 0.9,
136        });
137    }
138}
139
140fn extract_wiki_facts(chunk: &ContentChunk, provider: &str, facts: &mut Vec<ExtractedFact>) {
141    let page_id = chunk
142        .file_path
143        .rsplit('/')
144        .next()
145        .unwrap_or(&chunk.file_path);
146
147    facts.push(ExtractedFact {
148        category: "documentation".to_string(),
149        key: format!("{provider}#{page_id}"),
150        value: chunk.symbol_name.clone(),
151        confidence: 0.85,
152    });
153
154    for ref_path in &chunk.references {
155        facts.push(ExtractedFact {
156            category: "documented_files".to_string(),
157            key: ref_path.clone(),
158            value: format!("Documented in {provider}#{page_id}: {}", chunk.symbol_name),
159            confidence: 0.8,
160        });
161    }
162}
163
164fn extract_db_facts(chunk: &ContentChunk, provider: &str, facts: &mut Vec<ExtractedFact>) {
165    let table_id = chunk
166        .file_path
167        .rsplit('/')
168        .next()
169        .unwrap_or(&chunk.file_path);
170
171    facts.push(ExtractedFact {
172        category: "data_model".to_string(),
173        key: format!("{provider}#{table_id}"),
174        value: chunk.symbol_name.clone(),
175        confidence: 0.95,
176    });
177}
178
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::bm25_index::ChunkKind;
    use crate::core::content_chunk::ContentChunk;

    /// Builds an open GitHub issue chunk carrying the given labels and file references.
    fn issue_with_labels(id: &str, title: &str, labels: &[&str], refs: Vec<&str>) -> ContentChunk {
        let body = format!("Body of {title}");
        let references: Vec<String> = refs.into_iter().map(str::to_owned).collect();
        let metadata = serde_json::json!({
            "state": "open",
            "labels": labels,
        });

        ContentChunk::from_provider(
            "github",
            "issues",
            id,
            title,
            ChunkKind::Issue,
            body,
            references,
            Some(metadata),
        )
    }

    /// Number of extracted facts filed under `category`.
    fn count_in(facts: &[ExtractedFact], category: &str) -> usize {
        facts.iter().filter(|f| f.category == category).count()
    }

    #[test]
    fn bug_label_creates_known_bugs_fact() {
        let issue = issue_with_labels("42", "Auth crash", &["bug", "p1"], vec!["src/auth.rs"]);
        let facts = extract_facts(&[issue]);

        let bug = facts
            .iter()
            .find(|f| f.category == "known_bugs")
            .expect("a bug-labelled issue should yield a known_bugs fact");
        assert!(bug.key.contains("42"));
        assert!(bug.value.contains("Auth crash"));
        assert!(bug.value.contains("[open]"));
    }

    #[test]
    fn feature_label_creates_known_features_fact() {
        let issue = issue_with_labels("10", "Dark mode", &["enhancement"], vec![]);
        let facts = extract_facts(&[issue]);

        assert!(count_in(&facts, "known_features") > 0);
    }

    #[test]
    fn generic_issue_creates_known_issues_fact() {
        let issue = issue_with_labels("5", "Question about API", &["question"], vec![]);
        let facts = extract_facts(&[issue]);

        assert!(count_in(&facts, "known_issues") > 0);
    }

    #[test]
    fn issue_with_refs_creates_file_mentions() {
        let referenced = vec!["src/auth.rs", "src/db.rs"];
        let issue = issue_with_labels("42", "Auth crash", &["bug"], referenced);
        let facts = extract_facts(&[issue]);

        let mentioned_keys: Vec<&str> = facts
            .iter()
            .filter(|f| f.category == "file_mentions")
            .map(|f| f.key.as_str())
            .collect();
        assert_eq!(mentioned_keys.len(), 2);
        assert!(mentioned_keys.contains(&"src/auth.rs"));
        assert!(mentioned_keys.contains(&"src/db.rs"));
    }

    #[test]
    fn pr_creates_recent_changes_and_changed_files() {
        let pull_request = ContentChunk::from_provider(
            "github",
            "pull_requests",
            "100",
            "Fix auth token expiry",
            ChunkKind::PullRequest,
            "Fixes token expiry".into(),
            vec!["src/auth.rs".into()],
            Some(serde_json::json!({"state": "open"})),
        );

        let facts = extract_facts(&[pull_request]);
        assert!(count_in(&facts, "recent_changes") > 0);

        let touches_auth = facts
            .iter()
            .any(|f| f.category == "changed_files" && f.key == "src/auth.rs");
        assert!(touches_auth);
    }

    #[test]
    fn wiki_creates_documentation_facts() {
        let page = ContentChunk::from_provider(
            "confluence",
            "wikis",
            "auth-guide",
            "Authentication Guide",
            ChunkKind::WikiPage,
            "How auth works".into(),
            vec!["src/auth/mod.rs".into()],
            None,
        );

        let facts = extract_facts(&[page]);
        assert!(count_in(&facts, "documentation") > 0);
        assert!(count_in(&facts, "documented_files") > 0);
    }

    #[test]
    fn db_creates_data_model_facts() {
        let table = ContentChunk::from_provider(
            "postgres",
            "schemas",
            "users",
            "public.users",
            ChunkKind::DbSchema,
            "CREATE TABLE users (id serial, email varchar)".into(),
            vec![],
            None,
        );

        let facts = extract_facts(&[table]);
        assert_eq!(facts.len(), 1);

        let only = &facts[0];
        assert_eq!(only.category, "data_model");
        assert_eq!(only.confidence, 0.95);
    }

    #[test]
    fn code_chunks_are_skipped() {
        let code = crate::core::bm25_index::CodeChunk {
            file_path: "src/main.rs".into(),
            symbol_name: "main".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 5,
            content: "fn main() {}".into(),
            tokens: vec![],
            token_count: 0,
        };
        let chunk = ContentChunk::from(code);

        let facts = extract_facts(&[chunk]);
        assert!(facts.is_empty());
    }

    #[test]
    fn closed_issues_have_lower_confidence() {
        let closed = ContentChunk::from_provider(
            "github",
            "issues",
            "99",
            "Old bug",
            ChunkKind::Issue,
            "Fixed".into(),
            vec![],
            Some(serde_json::json!({"state": "closed", "labels": ["bug"]})),
        );

        let facts = extract_facts(&[closed]);
        let bug = facts
            .iter()
            .find(|f| f.category == "known_bugs")
            .expect("a bug-labelled issue should yield a known_bugs fact");
        assert!(bug.confidence < 0.9);
    }
}