1#![cfg_attr(not(feature = "async"), allow(unused_imports))]
2
3use crate::{
18 core::{Entity, EntityId, GraphRAGError, Relationship, Result, TextChunk},
19 ollama::OllamaClient,
20};
21use serde::{Deserialize, Serialize};
22use std::collections::HashMap;
23
/// A single subject–predicate–object statement extracted from text.
///
/// Besides the triple itself, a fact carries an optional free-text time expression and a
/// confidence score. The struct does not validate its fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AtomicFact {
    /// Entity or concept the fact is about.
    pub subject: String,
    /// Relationship or property linking `subject` to `object`.
    pub predicate: String,
    /// Entity, value, or concept the predicate points at.
    pub object: String,
    /// Raw time expression (e.g. "in 1876", "380 BC"), or `None` when the fact is not dated.
    pub temporal_marker: Option<String>,
    /// Confidence score for the fact; the extraction prompt requests a value in 0.0–1.0.
    pub confidence: f32,
}
41
42impl AtomicFact {
43 pub fn is_temporal(&self) -> bool {
45 self.temporal_marker.is_some()
46 }
47
48 pub fn extract_timestamp(&self) -> Option<i64> {
53 let marker = self.temporal_marker.as_ref()?;
54
55 if marker.contains("BC") || marker.contains("BCE") {
60 let num_str: String = marker.chars().filter(|c| c.is_ascii_digit()).collect();
62
63 if let Ok(year) = num_str.parse::<i64>() {
64 return Some(-year * 365 * 24 * 3600);
67 }
68 }
69
70 let num_str: String = marker.chars().filter(|c| c.is_ascii_digit()).collect();
72
73 if let Ok(year) = num_str.parse::<i64>() {
74 if year > 1000 && year < 3000 {
75 return Some((year - 1970) * 365 * 24 * 3600);
78 }
79 }
80
81 None
82 }
83}
84
/// Extracts [`AtomicFact`]s from text chunks by prompting an Ollama model, and converts
/// extracted facts into graph entities and relationships.
pub struct AtomicFactExtractor {
    /// Client used to run the extraction prompt. It is only read by the `async`-gated
    /// extraction method, hence the conditional `dead_code` allowance.
    #[cfg_attr(not(feature = "async"), allow(dead_code))]
    ollama_client: OllamaClient,
    /// Per-fact token limit that is interpolated into the extraction prompt.
    max_fact_tokens: usize,
}
95
96impl AtomicFactExtractor {
97 pub fn new(ollama_client: OllamaClient) -> Self {
103 Self {
104 ollama_client,
105 max_fact_tokens: 400,
106 }
107 }
108
109 pub fn with_max_tokens(mut self, max_tokens: usize) -> Self {
111 self.max_fact_tokens = max_tokens;
112 self
113 }
114
115 #[cfg(feature = "async")]
125 pub async fn extract_atomic_facts(&self, chunk: &TextChunk) -> Result<Vec<AtomicFact>> {
126 let prompt = format!(
127 r#"Extract atomic facts from the following text. Each fact should be:
128- Self-contained and verifiable (< {} tokens)
129- In the format: (Subject, Predicate, Object, TemporalMarker, Confidence)
130- TemporalMarker should capture time expressions like "in 1876", "during summer", "380 BC" (or null if none)
131- Confidence should be 0.0-1.0
132
133Respond ONLY with valid JSON array:
134[
135 {{
136 "subject": "entity or concept",
137 "predicate": "relationship or property",
138 "object": "entity, value, or concept",
139 "temporal_marker": "time expression or null",
140 "confidence": 0.0-1.0
141 }}
142]
143
144Text: "{}"
145
146JSON:"#,
147 self.max_fact_tokens, chunk.content
148 );
149
150 #[cfg(feature = "tracing")]
151 tracing::debug!(
152 chunk_id = %chunk.id,
153 "Extracting atomic facts from chunk"
154 );
155
156 match self.ollama_client.generate(&prompt).await {
157 Ok(response) => {
158 let json_str = response.trim();
160 let json_str = if let Some(start) = json_str.find('[') {
161 if let Some(end) = json_str.rfind(']') {
162 &json_str[start..=end]
163 } else {
164 json_str
165 }
166 } else {
167 json_str
168 };
169
170 #[derive(Deserialize)]
171 struct AtomicFactJson {
172 subject: String,
173 predicate: String,
174 object: String,
175 temporal_marker: Option<String>,
176 confidence: f32,
177 }
178
179 match serde_json::from_str::<Vec<AtomicFactJson>>(json_str) {
180 Ok(facts_json) => {
181 let facts: Vec<AtomicFact> = facts_json
182 .into_iter()
183 .map(|f| AtomicFact {
184 subject: f.subject,
185 predicate: f.predicate,
186 object: f.object,
187 temporal_marker: f
188 .temporal_marker
189 .filter(|s| !s.is_empty() && s != "null"),
190 confidence: f.confidence.clamp(0.0, 1.0),
191 })
192 .collect();
193
194 #[cfg(feature = "tracing")]
195 tracing::info!(
196 chunk_id = %chunk.id,
197 fact_count = facts.len(),
198 "Extracted atomic facts"
199 );
200
201 Ok(facts)
202 },
203 Err(e) => {
204 #[cfg(feature = "tracing")]
205 tracing::warn!(
206 chunk_id = %chunk.id,
207 error = %e,
208 response = %json_str,
209 "Failed to parse atomic facts JSON"
210 );
211
212 Ok(Vec::new())
214 },
215 }
216 },
217 Err(e) => {
218 #[cfg(feature = "tracing")]
219 tracing::error!(
220 chunk_id = %chunk.id,
221 error = %e,
222 "Atomic fact extraction failed"
223 );
224
225 Err(GraphRAGError::EntityExtraction {
226 message: format!("Atomic fact extraction failed: {}", e),
227 })
228 },
229 }
230 }
231
232 pub fn atomics_to_graph_elements(
243 &self,
244 facts: Vec<AtomicFact>,
245 chunk_id: &crate::core::ChunkId,
246 ) -> (Vec<Entity>, Vec<Relationship>) {
247 let mut entities: HashMap<String, Entity> = HashMap::new();
248 let mut relationships = Vec::new();
249
250 for fact in facts {
251 let subject_id = EntityId::new(Self::normalize_entity_name(&fact.subject));
253 entities.entry(subject_id.0.clone()).or_insert_with(|| {
254 let mut entity = Entity::new(
255 subject_id.clone(),
256 fact.subject.clone(),
257 Self::infer_entity_type(&fact.subject),
258 fact.confidence,
259 );
260
261 if let Some(timestamp) = fact.extract_timestamp() {
263 entity.first_mentioned = Some(timestamp);
264 entity.last_mentioned = Some(timestamp);
265 }
266
267 entity
268 });
269
270 let object_id = EntityId::new(Self::normalize_entity_name(&fact.object));
272 entities.entry(object_id.0.clone()).or_insert_with(|| {
273 let mut entity = Entity::new(
274 object_id.clone(),
275 fact.object.clone(),
276 Self::infer_entity_type(&fact.object),
277 fact.confidence,
278 );
279
280 if let Some(timestamp) = fact.extract_timestamp() {
282 entity.first_mentioned = Some(timestamp);
283 entity.last_mentioned = Some(timestamp);
284 }
285
286 entity
287 });
288
289 let mut relationship = Relationship::new(
291 subject_id,
292 object_id,
293 fact.predicate.to_uppercase(),
294 fact.confidence,
295 )
296 .with_context(vec![chunk_id.clone()]);
297
298 if let Some(timestamp) = fact.extract_timestamp() {
300 relationship.temporal_range = Some(crate::graph::temporal::TemporalRange::new(
301 timestamp, timestamp,
302 ));
303
304 if fact.predicate.to_lowercase().contains("caused")
306 || fact.predicate.to_lowercase().contains("led to")
307 {
308 relationship.temporal_type =
309 Some(crate::graph::temporal::TemporalRelationType::Caused);
310 relationship.causal_strength = Some(fact.confidence);
311 } else if fact.predicate.to_lowercase().contains("enabled")
312 || fact.predicate.to_lowercase().contains("allowed")
313 {
314 relationship.temporal_type =
315 Some(crate::graph::temporal::TemporalRelationType::Enabled);
316 relationship.causal_strength = Some(fact.confidence * 0.6);
317 }
318 }
319
320 relationships.push(relationship);
321 }
322
323 (entities.into_values().collect(), relationships)
324 }
325
326 fn normalize_entity_name(name: &str) -> String {
328 name.trim()
329 .to_lowercase()
330 .replace(' ', "_")
331 .chars()
332 .filter(|c| c.is_alphanumeric() || *c == '_')
333 .collect()
334 }
335
336 fn infer_entity_type(name: &str) -> String {
338 let lower = name.to_lowercase();
339
340 if name.chars().next().is_some_and(|c| c.is_uppercase()) {
342 if lower.ends_with("ia") || lower.ends_with("land") || lower.ends_with("istan") {
343 return "LOCATION".to_string();
344 }
345 return "PERSON".to_string();
346 }
347
348 if name.chars().any(|c| c.is_ascii_digit()) {
350 return "DATE".to_string();
351 }
352
353 "CONCEPT".to_string()
355 }
356}
357
#[cfg(test)]
mod tests {
    use super::*;

    /// Seconds in the simplified 365-day year used by `extract_timestamp`.
    const SECONDS_PER_YEAR: i64 = 365 * 24 * 3600;

    /// Builds a fact whose only varying field is the temporal marker.
    fn fact_with_marker(marker: Option<&str>) -> AtomicFact {
        AtomicFact {
            subject: "Event".to_string(),
            predicate: "occurred".to_string(),
            object: "Athens".to_string(),
            temporal_marker: marker.map(str::to_string),
            confidence: 0.9,
        }
    }

    #[test]
    fn test_atomic_fact_creation() {
        let fact = AtomicFact {
            subject: "Socrates".to_string(),
            predicate: "taught".to_string(),
            object: "Plato".to_string(),
            temporal_marker: Some("in 380 BC".to_string()),
            confidence: 0.9,
        };

        assert_eq!(fact.subject, "Socrates");
        assert!(fact.is_temporal());
    }

    #[test]
    fn test_fact_without_marker_is_not_temporal() {
        let fact = fact_with_marker(None);

        assert!(!fact.is_temporal());
        assert_eq!(fact.extract_timestamp(), None);
    }

    #[test]
    fn test_timestamp_extraction_bc() {
        let timestamp = fact_with_marker(Some("380 BC")).extract_timestamp();

        assert!(timestamp.is_some());
        // BC dates must land before the epoch.
        assert!(timestamp.unwrap() < 0);
    }

    #[test]
    fn test_timestamp_extraction_ad() {
        let timestamp = fact_with_marker(Some("in 1876")).extract_timestamp();

        // Pin the exact value, not just its presence: whole years relative to 1970.
        assert_eq!(timestamp, Some((1876 - 1970) * SECONDS_PER_YEAR));
    }

    #[test]
    fn test_timestamp_extraction_without_year() {
        assert_eq!(
            fact_with_marker(Some("during summer")).extract_timestamp(),
            None
        );
    }

    #[test]
    fn test_normalize_entity_name() {
        assert_eq!(
            AtomicFactExtractor::normalize_entity_name("Socrates the Philosopher"),
            "socrates_the_philosopher"
        );
        assert_eq!(
            AtomicFactExtractor::normalize_entity_name("New York"),
            "new_york"
        );
        // Surrounding whitespace is trimmed and punctuation is dropped.
        assert_eq!(
            AtomicFactExtractor::normalize_entity_name("  Jean-Paul Sartre!  "),
            "jeanpaul_sartre"
        );
    }

    #[test]
    fn test_infer_entity_type() {
        assert_eq!(AtomicFactExtractor::infer_entity_type("Socrates"), "PERSON");
        // Known limitation of the suffix heuristic: capitalised names without a
        // recognised place suffix are classified as PERSON.
        assert_eq!(AtomicFactExtractor::infer_entity_type("Athens"), "PERSON");
        assert_eq!(
            AtomicFactExtractor::infer_entity_type("Finland"),
            "LOCATION"
        );
        assert_eq!(AtomicFactExtractor::infer_entity_type("love"), "CONCEPT");
        assert_eq!(AtomicFactExtractor::infer_entity_type("1876"), "DATE");
    }
}