1use crate::{
16 core::{Entity, EntityId, GraphRAGError, Relationship, Result, TextChunk},
17 ollama::OllamaClient,
18};
19use serde::{Deserialize, Serialize};
20use std::collections::HashMap;
21
22#[derive(Debug, Clone, Serialize, Deserialize)]
27pub struct AtomicFact {
28 pub subject: String,
30 pub predicate: String,
32 pub object: String,
34 pub temporal_marker: Option<String>,
36 pub confidence: f32,
38}
39
40impl AtomicFact {
41 pub fn is_temporal(&self) -> bool {
43 self.temporal_marker.is_some()
44 }
45
46 pub fn extract_timestamp(&self) -> Option<i64> {
51 let marker = self.temporal_marker.as_ref()?;
52
53 if marker.contains("BC") || marker.contains("BCE") {
58 let num_str: String = marker.chars().filter(|c| c.is_ascii_digit()).collect();
60
61 if let Ok(year) = num_str.parse::<i64>() {
62 return Some(-year * 365 * 24 * 3600);
65 }
66 }
67
68 let num_str: String = marker.chars().filter(|c| c.is_ascii_digit()).collect();
70
71 if let Ok(year) = num_str.parse::<i64>() {
72 if year > 1000 && year < 3000 {
73 return Some((year - 1970) * 365 * 24 * 3600);
76 }
77 }
78
79 None
80 }
81}
82
83pub struct AtomicFactExtractor {
87 ollama_client: OllamaClient,
89 max_fact_tokens: usize,
91}
92
93impl AtomicFactExtractor {
94 pub fn new(ollama_client: OllamaClient) -> Self {
100 Self {
101 ollama_client,
102 max_fact_tokens: 400,
103 }
104 }
105
106 pub fn with_max_tokens(mut self, max_tokens: usize) -> Self {
108 self.max_fact_tokens = max_tokens;
109 self
110 }
111
112 #[cfg(feature = "async")]
122 pub async fn extract_atomic_facts(&self, chunk: &TextChunk) -> Result<Vec<AtomicFact>> {
123 let prompt = format!(
124 r#"Extract atomic facts from the following text. Each fact should be:
125- Self-contained and verifiable (< {} tokens)
126- In the format: (Subject, Predicate, Object, TemporalMarker, Confidence)
127- TemporalMarker should capture time expressions like "in 1876", "during summer", "380 BC" (or null if none)
128- Confidence should be 0.0-1.0
129
130Respond ONLY with valid JSON array:
131[
132 {{
133 "subject": "entity or concept",
134 "predicate": "relationship or property",
135 "object": "entity, value, or concept",
136 "temporal_marker": "time expression or null",
137 "confidence": 0.0-1.0
138 }}
139]
140
141Text: "{}"
142
143JSON:"#,
144 self.max_fact_tokens, chunk.content
145 );
146
147 #[cfg(feature = "tracing")]
148 tracing::debug!(
149 chunk_id = %chunk.id,
150 "Extracting atomic facts from chunk"
151 );
152
153 match self.ollama_client.generate(&prompt).await {
154 Ok(response) => {
155 let json_str = response.trim();
157 let json_str = if let Some(start) = json_str.find('[') {
158 if let Some(end) = json_str.rfind(']') {
159 &json_str[start..=end]
160 } else {
161 json_str
162 }
163 } else {
164 json_str
165 };
166
167 #[derive(Deserialize)]
168 struct AtomicFactJson {
169 subject: String,
170 predicate: String,
171 object: String,
172 temporal_marker: Option<String>,
173 confidence: f32,
174 }
175
176 match serde_json::from_str::<Vec<AtomicFactJson>>(json_str) {
177 Ok(facts_json) => {
178 let facts: Vec<AtomicFact> = facts_json
179 .into_iter()
180 .map(|f| AtomicFact {
181 subject: f.subject,
182 predicate: f.predicate,
183 object: f.object,
184 temporal_marker: f
185 .temporal_marker
186 .filter(|s| !s.is_empty() && s != "null"),
187 confidence: f.confidence.clamp(0.0, 1.0),
188 })
189 .collect();
190
191 #[cfg(feature = "tracing")]
192 tracing::info!(
193 chunk_id = %chunk.id,
194 fact_count = facts.len(),
195 "Extracted atomic facts"
196 );
197
198 Ok(facts)
199 },
200 Err(e) => {
201 #[cfg(feature = "tracing")]
202 tracing::warn!(
203 chunk_id = %chunk.id,
204 error = %e,
205 response = %json_str,
206 "Failed to parse atomic facts JSON"
207 );
208
209 Ok(Vec::new())
211 },
212 }
213 },
214 Err(e) => {
215 #[cfg(feature = "tracing")]
216 tracing::error!(
217 chunk_id = %chunk.id,
218 error = %e,
219 "Atomic fact extraction failed"
220 );
221
222 Err(GraphRAGError::EntityExtraction {
223 message: format!("Atomic fact extraction failed: {}", e),
224 })
225 },
226 }
227 }
228
229 pub fn atomics_to_graph_elements(
240 &self,
241 facts: Vec<AtomicFact>,
242 chunk_id: &crate::core::ChunkId,
243 ) -> (Vec<Entity>, Vec<Relationship>) {
244 let mut entities: HashMap<String, Entity> = HashMap::new();
245 let mut relationships = Vec::new();
246
247 for fact in facts {
248 let subject_id = EntityId::new(Self::normalize_entity_name(&fact.subject));
250 entities.entry(subject_id.0.clone()).or_insert_with(|| {
251 let mut entity = Entity::new(
252 subject_id.clone(),
253 fact.subject.clone(),
254 Self::infer_entity_type(&fact.subject),
255 fact.confidence,
256 );
257
258 if let Some(timestamp) = fact.extract_timestamp() {
260 entity.first_mentioned = Some(timestamp);
261 entity.last_mentioned = Some(timestamp);
262 }
263
264 entity
265 });
266
267 let object_id = EntityId::new(Self::normalize_entity_name(&fact.object));
269 entities.entry(object_id.0.clone()).or_insert_with(|| {
270 let mut entity = Entity::new(
271 object_id.clone(),
272 fact.object.clone(),
273 Self::infer_entity_type(&fact.object),
274 fact.confidence,
275 );
276
277 if let Some(timestamp) = fact.extract_timestamp() {
279 entity.first_mentioned = Some(timestamp);
280 entity.last_mentioned = Some(timestamp);
281 }
282
283 entity
284 });
285
286 let mut relationship = Relationship::new(
288 subject_id,
289 object_id,
290 fact.predicate.to_uppercase(),
291 fact.confidence,
292 )
293 .with_context(vec![chunk_id.clone()]);
294
295 if let Some(timestamp) = fact.extract_timestamp() {
297 relationship.temporal_range = Some(crate::graph::temporal::TemporalRange::new(
298 timestamp, timestamp,
299 ));
300
301 if fact.predicate.to_lowercase().contains("caused")
303 || fact.predicate.to_lowercase().contains("led to")
304 {
305 relationship.temporal_type =
306 Some(crate::graph::temporal::TemporalRelationType::Caused);
307 relationship.causal_strength = Some(fact.confidence);
308 } else if fact.predicate.to_lowercase().contains("enabled")
309 || fact.predicate.to_lowercase().contains("allowed")
310 {
311 relationship.temporal_type =
312 Some(crate::graph::temporal::TemporalRelationType::Enabled);
313 relationship.causal_strength = Some(fact.confidence * 0.6);
314 }
315 }
316
317 relationships.push(relationship);
318 }
319
320 (entities.into_values().collect(), relationships)
321 }
322
323 fn normalize_entity_name(name: &str) -> String {
325 name.trim()
326 .to_lowercase()
327 .replace(' ', "_")
328 .chars()
329 .filter(|c| c.is_alphanumeric() || *c == '_')
330 .collect()
331 }
332
333 fn infer_entity_type(name: &str) -> String {
335 let lower = name.to_lowercase();
336
337 if name.chars().next().map_or(false, |c| c.is_uppercase()) {
339 if lower.ends_with("ia") || lower.ends_with("land") || lower.ends_with("istan") {
340 return "LOCATION".to_string();
341 }
342 return "PERSON".to_string();
343 }
344
345 if name.chars().any(|c| c.is_ascii_digit()) {
347 return "DATE".to_string();
348 }
349
350 "CONCEPT".to_string()
352 }
353}
354
#[cfg(test)]
mod tests {
    use super::*;

    /// Length of the 365-day year that `extract_timestamp` uses for its approximation.
    const SECONDS_PER_YEAR: i64 = 365 * 24 * 3600;

    /// Builds a fact with a fixed confidence so each test only spells out what it
    /// actually exercises.
    fn make_fact(
        subject: &str,
        predicate: &str,
        object: &str,
        temporal_marker: Option<&str>,
    ) -> AtomicFact {
        AtomicFact {
            subject: subject.to_string(),
            predicate: predicate.to_string(),
            object: object.to_string(),
            temporal_marker: temporal_marker.map(str::to_string),
            confidence: 0.9,
        }
    }

    #[test]
    fn test_atomic_fact_creation() {
        let fact = make_fact("Socrates", "taught", "Plato", Some("in 380 BC"));

        assert_eq!(fact.subject, "Socrates");
        assert!(fact.is_temporal());
    }

    #[test]
    fn test_fact_without_temporal_marker() {
        let fact = make_fact("Socrates", "taught", "Plato", None);

        assert!(!fact.is_temporal());
        assert_eq!(fact.extract_timestamp(), None);
    }

    #[test]
    fn test_timestamp_extraction_bc() {
        let fact = make_fact("Event", "occurred", "Athens", Some("380 BC"));

        let timestamp = fact.extract_timestamp();
        assert!(timestamp.is_some());
        // BC dates lie before the Unix epoch.
        assert!(timestamp.unwrap() < 0);
    }

    #[test]
    fn test_timestamp_extraction_ad() {
        let fact = make_fact("Event", "occurred", "Rome", Some("in 1876"));

        assert_eq!(
            fact.extract_timestamp(),
            Some((1876 - 1970) * SECONDS_PER_YEAR)
        );
    }

    #[test]
    fn test_timestamp_extraction_without_year() {
        let fact = make_fact("Event", "occurred", "Rome", Some("during summer"));

        assert!(fact.is_temporal());
        assert_eq!(fact.extract_timestamp(), None);
    }

    #[test]
    fn test_normalize_entity_name() {
        assert_eq!(
            AtomicFactExtractor::normalize_entity_name("Socrates the Philosopher"),
            "socrates_the_philosopher"
        );
        assert_eq!(
            AtomicFactExtractor::normalize_entity_name("New York"),
            "new_york"
        );
        // Surrounding whitespace is trimmed and punctuation is dropped.
        assert_eq!(
            AtomicFactExtractor::normalize_entity_name("  Jean-Paul Sartre!  "),
            "jeanpaul_sartre"
        );
    }

    #[test]
    fn test_infer_entity_type() {
        assert_eq!(AtomicFactExtractor::infer_entity_type("Socrates"), "PERSON");
        // Known limitation of the suffix heuristic: "Athens" is a place, but it is
        // capitalised and has no location suffix, so it is labelled PERSON.
        assert_eq!(AtomicFactExtractor::infer_entity_type("Athens"), "PERSON");
        assert_eq!(
            AtomicFactExtractor::infer_entity_type("Finland"),
            "LOCATION"
        );
        assert_eq!(AtomicFactExtractor::infer_entity_type("love"), "CONCEPT");
        assert_eq!(AtomicFactExtractor::infer_entity_type("1876"), "DATE");
    }
}