1use crate::core::content_chunk::ContentChunk;
15
/// A single piece of knowledge derived from an external content chunk.
///
/// Facts are produced by [`extract_facts`]; each one files a `key`/`value`
/// pair under a `category` and carries a `confidence` score. Deriving
/// `PartialEq` lets callers and tests compare facts directly.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedFact {
    /// Bucket the fact is filed under, e.g. `"known_bugs"` or `"data_model"`.
    pub category: String,
    /// Identifier of the fact's subject: either a `provider#id` style handle
    /// or a referenced file path.
    pub key: String,
    /// Human-readable description of the fact.
    pub value: String,
    /// Score assigned by the extractor; the extractors in this file use
    /// values between 0.7 and 0.95.
    pub confidence: f32,
}
24
25pub fn extract_facts(chunks: &[ContentChunk]) -> Vec<ExtractedFact> {
27 let mut facts = Vec::new();
28
29 for chunk in chunks {
30 if !chunk.is_external() {
31 continue;
32 }
33
34 let provider = chunk.provider_id().unwrap_or("unknown");
35 match chunk.kind {
36 crate::core::bm25_index::ChunkKind::Issue
37 | crate::core::bm25_index::ChunkKind::Ticket => {
38 extract_issue_facts(chunk, provider, &mut facts);
39 }
40 crate::core::bm25_index::ChunkKind::PullRequest => {
41 extract_pr_facts(chunk, provider, &mut facts);
42 }
43 crate::core::bm25_index::ChunkKind::WikiPage => {
44 extract_wiki_facts(chunk, provider, &mut facts);
45 }
46 crate::core::bm25_index::ChunkKind::DbSchema => {
47 extract_db_facts(chunk, provider, &mut facts);
48 }
49 _ => {}
50 }
51 }
52
53 facts
54}
55
56fn extract_issue_facts(chunk: &ContentChunk, provider: &str, facts: &mut Vec<ExtractedFact>) {
57 let state = chunk
58 .metadata
59 .as_ref()
60 .and_then(|m| m["state"].as_str())
61 .unwrap_or("unknown");
62
63 let labels: Vec<&str> = chunk
64 .metadata
65 .as_ref()
66 .and_then(|m| m["labels"].as_array())
67 .map(|arr| arr.iter().filter_map(|v| v.as_str()).collect())
68 .unwrap_or_default();
69
70 let category = if labels.iter().any(|l| {
71 let lower = l.to_lowercase();
72 lower.contains("bug") || lower.contains("defect")
73 }) {
74 "known_bugs"
75 } else if labels.iter().any(|l| {
76 let lower = l.to_lowercase();
77 lower.contains("feature") || lower.contains("enhancement")
78 }) {
79 "known_features"
80 } else {
81 "known_issues"
82 };
83
84 let issue_id = chunk
85 .file_path
86 .rsplit('/')
87 .next()
88 .unwrap_or(&chunk.file_path);
89
90 facts.push(ExtractedFact {
91 category: category.to_string(),
92 key: format!("{provider}#{issue_id}"),
93 value: format!("{} [{}]", chunk.symbol_name, state),
94 confidence: if state == "open" { 0.9 } else { 0.7 },
95 });
96
97 for ref_path in &chunk.references {
98 facts.push(ExtractedFact {
99 category: "file_mentions".to_string(),
100 key: ref_path.clone(),
101 value: format!(
102 "Referenced in {} {provider}#{issue_id}: {}",
103 category, chunk.symbol_name
104 ),
105 confidence: 0.85,
106 });
107 }
108}
109
110fn extract_pr_facts(chunk: &ContentChunk, provider: &str, facts: &mut Vec<ExtractedFact>) {
111 let state = chunk
112 .metadata
113 .as_ref()
114 .and_then(|m| m["state"].as_str())
115 .unwrap_or("unknown");
116
117 let pr_id = chunk
118 .file_path
119 .rsplit('/')
120 .next()
121 .unwrap_or(&chunk.file_path);
122
123 facts.push(ExtractedFact {
124 category: "recent_changes".to_string(),
125 key: format!("{provider}#PR{pr_id}"),
126 value: format!("{} [{}]", chunk.symbol_name, state),
127 confidence: if state == "open" { 0.95 } else { 0.8 },
128 });
129
130 for ref_path in &chunk.references {
131 facts.push(ExtractedFact {
132 category: "changed_files".to_string(),
133 key: ref_path.clone(),
134 value: format!("Changed in PR {provider}#{pr_id}: {}", chunk.symbol_name),
135 confidence: 0.9,
136 });
137 }
138}
139
140fn extract_wiki_facts(chunk: &ContentChunk, provider: &str, facts: &mut Vec<ExtractedFact>) {
141 let page_id = chunk
142 .file_path
143 .rsplit('/')
144 .next()
145 .unwrap_or(&chunk.file_path);
146
147 facts.push(ExtractedFact {
148 category: "documentation".to_string(),
149 key: format!("{provider}#{page_id}"),
150 value: chunk.symbol_name.clone(),
151 confidence: 0.85,
152 });
153
154 for ref_path in &chunk.references {
155 facts.push(ExtractedFact {
156 category: "documented_files".to_string(),
157 key: ref_path.clone(),
158 value: format!("Documented in {provider}#{page_id}: {}", chunk.symbol_name),
159 confidence: 0.8,
160 });
161 }
162}
163
164fn extract_db_facts(chunk: &ContentChunk, provider: &str, facts: &mut Vec<ExtractedFact>) {
165 let table_id = chunk
166 .file_path
167 .rsplit('/')
168 .next()
169 .unwrap_or(&chunk.file_path);
170
171 facts.push(ExtractedFact {
172 category: "data_model".to_string(),
173 key: format!("{provider}#{table_id}"),
174 value: chunk.symbol_name.clone(),
175 confidence: 0.95,
176 });
177}
178
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::bm25_index::ChunkKind;
    use crate::core::content_chunk::ContentChunk;

    /// Builds an open GitHub issue chunk carrying the given labels and file references.
    fn issue_with_labels(id: &str, title: &str, labels: &[&str], refs: Vec<&str>) -> ContentChunk {
        let metadata = serde_json::json!({
            "state": "open",
            "labels": labels,
        });
        let references: Vec<String> = refs.into_iter().map(str::to_owned).collect();

        ContentChunk::from_provider(
            "github",
            "issues",
            id,
            title,
            ChunkKind::Issue,
            format!("Body of {title}"),
            references,
            Some(metadata),
        )
    }

    /// Lists the category of every fact, in order.
    fn categories(facts: &[ExtractedFact]) -> Vec<&str> {
        facts.iter().map(|fact| fact.category.as_str()).collect()
    }

    #[test]
    fn bug_label_creates_known_bugs_fact() {
        let issue = issue_with_labels("42", "Auth crash", &["bug", "p1"], vec!["src/auth.rs"]);
        let facts = extract_facts(&[issue]);

        let bug = facts
            .iter()
            .find(|fact| fact.category == "known_bugs")
            .expect("a bug-labelled issue should yield a known_bugs fact");
        assert!(bug.key.contains("42"));
        assert!(bug.value.contains("Auth crash"));
        assert!(bug.value.contains("[open]"));
    }

    #[test]
    fn feature_label_creates_known_features_fact() {
        let issue = issue_with_labels("10", "Dark mode", &["enhancement"], vec![]);
        let facts = extract_facts(&[issue]);

        assert!(categories(&facts).contains(&"known_features"));
    }

    #[test]
    fn generic_issue_creates_known_issues_fact() {
        let issue = issue_with_labels("5", "Question about API", &["question"], vec![]);
        let facts = extract_facts(&[issue]);

        assert!(categories(&facts).contains(&"known_issues"));
    }

    #[test]
    fn issue_with_refs_creates_file_mentions() {
        let issue = issue_with_labels(
            "42",
            "Auth crash",
            &["bug"],
            vec!["src/auth.rs", "src/db.rs"],
        );
        let facts = extract_facts(&[issue]);

        let mentioned: Vec<&str> = facts
            .iter()
            .filter(|fact| fact.category == "file_mentions")
            .map(|fact| fact.key.as_str())
            .collect();
        assert_eq!(mentioned.len(), 2);
        assert!(mentioned.contains(&"src/auth.rs"));
        assert!(mentioned.contains(&"src/db.rs"));
    }

    #[test]
    fn pr_creates_recent_changes_and_changed_files() {
        let pull_request = ContentChunk::from_provider(
            "github",
            "pull_requests",
            "100",
            "Fix auth token expiry",
            ChunkKind::PullRequest,
            String::from("Fixes token expiry"),
            vec![String::from("src/auth.rs")],
            Some(serde_json::json!({"state": "open"})),
        );

        let facts = extract_facts(&[pull_request]);

        let has_recent_change = facts.iter().any(|fact| fact.category == "recent_changes");
        let has_changed_file = facts
            .iter()
            .any(|fact| fact.category == "changed_files" && fact.key == "src/auth.rs");
        assert!(has_recent_change);
        assert!(has_changed_file);
    }

    #[test]
    fn wiki_creates_documentation_facts() {
        let page = ContentChunk::from_provider(
            "confluence",
            "wikis",
            "auth-guide",
            "Authentication Guide",
            ChunkKind::WikiPage,
            String::from("How auth works"),
            vec![String::from("src/auth/mod.rs")],
            None,
        );

        let facts = extract_facts(&[page]);
        let seen = categories(&facts);

        assert!(seen.contains(&"documentation"));
        assert!(seen.contains(&"documented_files"));
    }

    #[test]
    fn db_creates_data_model_facts() {
        let table = ContentChunk::from_provider(
            "postgres",
            "schemas",
            "users",
            "public.users",
            ChunkKind::DbSchema,
            String::from("CREATE TABLE users (id serial, email varchar)"),
            Vec::new(),
            None,
        );

        let facts = extract_facts(&[table]);
        assert_eq!(facts.len(), 1);

        let only = &facts[0];
        assert_eq!(only.category, "data_model");
        assert_eq!(only.confidence, 0.95);
    }

    #[test]
    fn code_chunks_are_skipped() {
        let code = crate::core::bm25_index::CodeChunk {
            file_path: "src/main.rs".into(),
            symbol_name: "main".into(),
            kind: ChunkKind::Function,
            start_line: 1,
            end_line: 5,
            content: "fn main() {}".into(),
            tokens: Vec::new(),
            token_count: 0,
        };
        let chunk = ContentChunk::from(code);

        assert!(extract_facts(&[chunk]).is_empty());
    }

    #[test]
    fn closed_issues_have_lower_confidence() {
        let closed = ContentChunk::from_provider(
            "github",
            "issues",
            "99",
            "Old bug",
            ChunkKind::Issue,
            String::from("Fixed"),
            Vec::new(),
            Some(serde_json::json!({"state": "closed", "labels": ["bug"]})),
        );

        let facts = extract_facts(&[closed]);
        let bug = facts
            .iter()
            .find(|fact| fact.category == "known_bugs")
            .expect("a bug-labelled issue should yield a known_bugs fact");
        assert!(bug.confidence < 0.9);
    }
}