Skip to main content

graphrag_core/entity/
atomic_fact_extractor.rs

1//! ATOM Atomic Fact Extraction
2//!
3//! This module implements atomic fact extraction following the ATOM methodology
4//! (itext2kg - https://github.com/AuvaLab/itext2kg)
5//!
6//! ATOM extracts self-contained facts as 5-tuples:
7//! (Subject, Predicate, Object, TemporalMarker, Confidence)
8//!
9//! Benefits:
10//! - More granular than entity-relationship pairs
11//! - Better temporal grounding
12//! - Easier to validate and verify
13//! - Natural fit for knowledge graphs
14
15use crate::{
16    core::{Entity, EntityId, GraphRAGError, Relationship, Result, TextChunk},
17    ollama::OllamaClient,
18};
19use serde::{Deserialize, Serialize};
20use std::collections::HashMap;
21
22/// Atomic fact extracted from text
23///
24/// Represents a single, self-contained factual statement.
25/// Each fact should be verifiable and stand alone without context.
26#[derive(Debug, Clone, Serialize, Deserialize)]
27pub struct AtomicFact {
28    /// Subject of the fact (entity performing action or being described)
29    pub subject: String,
30    /// Predicate describing the relationship or property
31    pub predicate: String,
32    /// Object of the fact (entity being acted upon or value)
33    pub object: String,
34    /// Optional temporal marker (e.g., "in 1876", "during summer", "380 BC")
35    pub temporal_marker: Option<String>,
36    /// Confidence score for this fact (0.0-1.0)
37    pub confidence: f32,
38}
39
40impl AtomicFact {
41    /// Check if this fact has temporal information
42    pub fn is_temporal(&self) -> bool {
43        self.temporal_marker.is_some()
44    }
45
46    /// Extract approximate Unix timestamp from temporal marker if possible
47    ///
48    /// This is a best-effort extraction and may not work for all formats.
49    /// Returns None if temporal marker is missing or cannot be parsed.
50    pub fn extract_timestamp(&self) -> Option<i64> {
51        let marker = self.temporal_marker.as_ref()?;
52
53        // Try to extract year from common formats
54        // "in 1876", "during 1876", "1876", "380 BC", etc.
55
56        // Check for BC/BCE dates
57        if marker.contains("BC") || marker.contains("BCE") {
58            // Extract the number before BC/BCE
59            let num_str: String = marker.chars().filter(|c| c.is_ascii_digit()).collect();
60
61            if let Ok(year) = num_str.parse::<i64>() {
62                // Negative for BC years
63                // Approximate: 365.25 days per year * 24 hours * 3600 seconds
64                return Some(-year * 365 * 24 * 3600);
65            }
66        }
67
68        // Check for AD dates (positive years)
69        let num_str: String = marker.chars().filter(|c| c.is_ascii_digit()).collect();
70
71        if let Ok(year) = num_str.parse::<i64>() {
72            if year > 1000 && year < 3000 {
73                // Approximate Unix timestamp for year
74                // Unix epoch is 1970, so subtract that
75                return Some((year - 1970) * 365 * 24 * 3600);
76            }
77        }
78
79        None
80    }
81}
82
83/// Extractor for atomic facts from text
84///
85/// Uses LLM to decompose text into self-contained factual statements.
86pub struct AtomicFactExtractor {
87    /// Ollama client for LLM-based extraction
88    ollama_client: OllamaClient,
89    /// Maximum tokens per fact (default: 400)
90    max_fact_tokens: usize,
91}
92
93impl AtomicFactExtractor {
94    /// Create a new atomic fact extractor
95    ///
96    /// # Arguments
97    ///
98    /// * `ollama_client` - Ollama client for LLM calls
99    pub fn new(ollama_client: OllamaClient) -> Self {
100        Self {
101            ollama_client,
102            max_fact_tokens: 400,
103        }
104    }
105
106    /// Set maximum tokens per fact
107    pub fn with_max_tokens(mut self, max_tokens: usize) -> Self {
108        self.max_fact_tokens = max_tokens;
109        self
110    }
111
112    /// Extract atomic facts from a text chunk
113    ///
114    /// # Arguments
115    ///
116    /// * `chunk` - Text chunk to extract facts from
117    ///
118    /// # Returns
119    ///
120    /// Vector of atomic facts extracted from the text
121    #[cfg(feature = "async")]
122    pub async fn extract_atomic_facts(&self, chunk: &TextChunk) -> Result<Vec<AtomicFact>> {
123        let prompt = format!(
124            r#"Extract atomic facts from the following text. Each fact should be:
125- Self-contained and verifiable (< {} tokens)
126- In the format: (Subject, Predicate, Object, TemporalMarker, Confidence)
127- TemporalMarker should capture time expressions like "in 1876", "during summer", "380 BC" (or null if none)
128- Confidence should be 0.0-1.0
129
130Respond ONLY with valid JSON array:
131[
132  {{
133    "subject": "entity or concept",
134    "predicate": "relationship or property",
135    "object": "entity, value, or concept",
136    "temporal_marker": "time expression or null",
137    "confidence": 0.0-1.0
138  }}
139]
140
141Text: "{}"
142
143JSON:"#,
144            self.max_fact_tokens, chunk.content
145        );
146
147        #[cfg(feature = "tracing")]
148        tracing::debug!(
149            chunk_id = %chunk.id,
150            "Extracting atomic facts from chunk"
151        );
152
153        match self.ollama_client.generate(&prompt).await {
154            Ok(response) => {
155                // Extract JSON from response
156                let json_str = response.trim();
157                let json_str = if let Some(start) = json_str.find('[') {
158                    if let Some(end) = json_str.rfind(']') {
159                        &json_str[start..=end]
160                    } else {
161                        json_str
162                    }
163                } else {
164                    json_str
165                };
166
167                #[derive(Deserialize)]
168                struct AtomicFactJson {
169                    subject: String,
170                    predicate: String,
171                    object: String,
172                    temporal_marker: Option<String>,
173                    confidence: f32,
174                }
175
176                match serde_json::from_str::<Vec<AtomicFactJson>>(json_str) {
177                    Ok(facts_json) => {
178                        let facts: Vec<AtomicFact> = facts_json
179                            .into_iter()
180                            .map(|f| AtomicFact {
181                                subject: f.subject,
182                                predicate: f.predicate,
183                                object: f.object,
184                                temporal_marker: f
185                                    .temporal_marker
186                                    .filter(|s| !s.is_empty() && s != "null"),
187                                confidence: f.confidence.clamp(0.0, 1.0),
188                            })
189                            .collect();
190
191                        #[cfg(feature = "tracing")]
192                        tracing::info!(
193                            chunk_id = %chunk.id,
194                            fact_count = facts.len(),
195                            "Extracted atomic facts"
196                        );
197
198                        Ok(facts)
199                    },
200                    Err(e) => {
201                        #[cfg(feature = "tracing")]
202                        tracing::warn!(
203                            chunk_id = %chunk.id,
204                            error = %e,
205                            response = %json_str,
206                            "Failed to parse atomic facts JSON"
207                        );
208
209                        // Return empty vector on parse failure
210                        Ok(Vec::new())
211                    },
212                }
213            },
214            Err(e) => {
215                #[cfg(feature = "tracing")]
216                tracing::error!(
217                    chunk_id = %chunk.id,
218                    error = %e,
219                    "Atomic fact extraction failed"
220                );
221
222                Err(GraphRAGError::EntityExtraction {
223                    message: format!("Atomic fact extraction failed: {}", e),
224                })
225            },
226        }
227    }
228
229    /// Convert atomic facts to graph elements (entities and relationships)
230    ///
231    /// # Arguments
232    ///
233    /// * `facts` - Vector of atomic facts to convert
234    /// * `chunk_id` - ID of the source chunk for context
235    ///
236    /// # Returns
237    ///
238    /// Tuple of (entities, relationships) extracted from facts
239    pub fn atomics_to_graph_elements(
240        &self,
241        facts: Vec<AtomicFact>,
242        chunk_id: &crate::core::ChunkId,
243    ) -> (Vec<Entity>, Vec<Relationship>) {
244        let mut entities: HashMap<String, Entity> = HashMap::new();
245        let mut relationships = Vec::new();
246
247        for fact in facts {
248            // Create or update subject entity
249            let subject_id = EntityId::new(Self::normalize_entity_name(&fact.subject));
250            entities.entry(subject_id.0.clone()).or_insert_with(|| {
251                let mut entity = Entity::new(
252                    subject_id.clone(),
253                    fact.subject.clone(),
254                    Self::infer_entity_type(&fact.subject),
255                    fact.confidence,
256                );
257
258                // Add temporal information if available
259                if let Some(timestamp) = fact.extract_timestamp() {
260                    entity.first_mentioned = Some(timestamp);
261                    entity.last_mentioned = Some(timestamp);
262                }
263
264                entity
265            });
266
267            // Create or update object entity
268            let object_id = EntityId::new(Self::normalize_entity_name(&fact.object));
269            entities.entry(object_id.0.clone()).or_insert_with(|| {
270                let mut entity = Entity::new(
271                    object_id.clone(),
272                    fact.object.clone(),
273                    Self::infer_entity_type(&fact.object),
274                    fact.confidence,
275                );
276
277                // Add temporal information if available
278                if let Some(timestamp) = fact.extract_timestamp() {
279                    entity.first_mentioned = Some(timestamp);
280                    entity.last_mentioned = Some(timestamp);
281                }
282
283                entity
284            });
285
286            // Create relationship
287            let mut relationship = Relationship::new(
288                subject_id,
289                object_id,
290                fact.predicate.to_uppercase(),
291                fact.confidence,
292            )
293            .with_context(vec![chunk_id.clone()]);
294
295            // Add temporal information if available
296            if let Some(timestamp) = fact.extract_timestamp() {
297                relationship.temporal_range = Some(crate::graph::temporal::TemporalRange::new(
298                    timestamp, timestamp,
299                ));
300
301                // Infer temporal relationship type based on predicate
302                if fact.predicate.to_lowercase().contains("caused")
303                    || fact.predicate.to_lowercase().contains("led to")
304                {
305                    relationship.temporal_type =
306                        Some(crate::graph::temporal::TemporalRelationType::Caused);
307                    relationship.causal_strength = Some(fact.confidence);
308                } else if fact.predicate.to_lowercase().contains("enabled")
309                    || fact.predicate.to_lowercase().contains("allowed")
310                {
311                    relationship.temporal_type =
312                        Some(crate::graph::temporal::TemporalRelationType::Enabled);
313                    relationship.causal_strength = Some(fact.confidence * 0.6);
314                }
315            }
316
317            relationships.push(relationship);
318        }
319
320        (entities.into_values().collect(), relationships)
321    }
322
323    /// Normalize entity name for consistent ID generation
324    fn normalize_entity_name(name: &str) -> String {
325        name.trim()
326            .to_lowercase()
327            .replace(' ', "_")
328            .chars()
329            .filter(|c| c.is_alphanumeric() || *c == '_')
330            .collect()
331    }
332
333    /// Infer entity type from name (simple heuristic)
334    fn infer_entity_type(name: &str) -> String {
335        let lower = name.to_lowercase();
336
337        // Check for proper nouns (capitalized)
338        if name.chars().next().map_or(false, |c| c.is_uppercase()) {
339            if lower.ends_with("ia") || lower.ends_with("land") || lower.ends_with("istan") {
340                return "LOCATION".to_string();
341            }
342            return "PERSON".to_string();
343        }
344
345        // Check for numbers/dates
346        if name.chars().any(|c| c.is_ascii_digit()) {
347            return "DATE".to_string();
348        }
349
350        // Default to concept
351        "CONCEPT".to_string()
352    }
353}
354
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a fact with a temporal marker and a fixed confidence of 0.9.
    fn fact_with_marker(subject: &str, predicate: &str, object: &str, marker: &str) -> AtomicFact {
        AtomicFact {
            subject: subject.to_string(),
            predicate: predicate.to_string(),
            object: object.to_string(),
            temporal_marker: Some(marker.to_string()),
            confidence: 0.9,
        }
    }

    #[test]
    fn test_atomic_fact_creation() {
        let fact = fact_with_marker("Socrates", "taught", "Plato", "in 380 BC");

        assert_eq!(fact.subject, "Socrates");
        assert!(fact.is_temporal());
    }

    #[test]
    fn test_timestamp_extraction_bc() {
        let fact = fact_with_marker("Event", "occurred", "Athens", "380 BC");

        // A BC marker must parse, and must map to a negative timestamp.
        let timestamp = fact.extract_timestamp();
        assert!(timestamp.is_some());
        assert!(timestamp.unwrap() < 0);
    }

    #[test]
    fn test_timestamp_extraction_ad() {
        let fact = fact_with_marker("Event", "occurred", "Rome", "in 1876");

        assert!(fact.extract_timestamp().is_some());
    }

    #[test]
    fn test_normalize_entity_name() {
        let cases = [
            ("Socrates the Philosopher", "socrates_the_philosopher"),
            ("New York", "new_york"),
        ];

        for (raw, expected) in cases {
            assert_eq!(AtomicFactExtractor::normalize_entity_name(raw), expected);
        }
    }

    #[test]
    fn test_infer_entity_type() {
        let cases = [
            ("Socrates", "PERSON"),
            // Would be LOCATION if we had more sophisticated logic
            ("Athens", "PERSON"),
            ("love", "CONCEPT"),
            ("1876", "DATE"),
        ];

        for (name, expected) in cases {
            assert_eq!(AtomicFactExtractor::infer_entity_type(name), expected);
        }
    }
}