Skip to main content

graphrag_core/entity/
atomic_fact_extractor.rs

1#![cfg_attr(not(feature = "async"), allow(unused_imports))]
2
3//! ATOM Atomic Fact Extraction
4//!
5//! This module implements atomic fact extraction following the ATOM methodology
6//! (itext2kg - https://github.com/AuvaLab/itext2kg)
7//!
8//! ATOM extracts self-contained facts as 5-tuples:
9//! (Subject, Predicate, Object, TemporalMarker, Confidence)
10//!
11//! Benefits:
12//! - More granular than entity-relationship pairs
13//! - Better temporal grounding
14//! - Easier to validate and verify
15//! - Natural fit for knowledge graphs
16
17use crate::{
18    core::{Entity, EntityId, GraphRAGError, Relationship, Result, TextChunk},
19    ollama::OllamaClient,
20};
21use serde::{Deserialize, Serialize};
22use std::collections::HashMap;
23
/// Atomic fact extracted from text
///
/// Represents a single, self-contained factual statement in the ATOM
/// 5-tuple form `(subject, predicate, object, temporal_marker, confidence)`.
/// Each fact should be verifiable and stand alone without context.
///
/// The type derives `Serialize`/`Deserialize`, so it can be round-tripped
/// through any serde format using the field names below as keys.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AtomicFact {
    /// Subject of the fact (entity performing action or being described)
    pub subject: String,
    /// Predicate describing the relationship or property
    pub predicate: String,
    /// Object of the fact (entity being acted upon or value)
    pub object: String,
    /// Optional free-form temporal marker (e.g., "in 1876", "during summer", "380 BC")
    ///
    /// `None` means the fact carries no time expression. The text is kept
    /// verbatim; `extract_timestamp` makes a best-effort attempt to turn it
    /// into a number.
    pub temporal_marker: Option<String>,
    /// Confidence score for this fact (intended range 0.0-1.0)
    ///
    /// The struct itself does not enforce the range; values parsed by
    /// `AtomicFactExtractor::extract_atomic_facts` are clamped into it.
    pub confidence: f32,
}
41
42impl AtomicFact {
43    /// Check if this fact has temporal information
44    pub fn is_temporal(&self) -> bool {
45        self.temporal_marker.is_some()
46    }
47
48    /// Extract approximate Unix timestamp from temporal marker if possible
49    ///
50    /// This is a best-effort extraction and may not work for all formats.
51    /// Returns None if temporal marker is missing or cannot be parsed.
52    pub fn extract_timestamp(&self) -> Option<i64> {
53        let marker = self.temporal_marker.as_ref()?;
54
55        // Try to extract year from common formats
56        // "in 1876", "during 1876", "1876", "380 BC", etc.
57
58        // Check for BC/BCE dates
59        if marker.contains("BC") || marker.contains("BCE") {
60            // Extract the number before BC/BCE
61            let num_str: String = marker.chars().filter(|c| c.is_ascii_digit()).collect();
62
63            if let Ok(year) = num_str.parse::<i64>() {
64                // Negative for BC years
65                // Approximate: 365.25 days per year * 24 hours * 3600 seconds
66                return Some(-year * 365 * 24 * 3600);
67            }
68        }
69
70        // Check for AD dates (positive years)
71        let num_str: String = marker.chars().filter(|c| c.is_ascii_digit()).collect();
72
73        if let Ok(year) = num_str.parse::<i64>() {
74            if year > 1000 && year < 3000 {
75                // Approximate Unix timestamp for year
76                // Unix epoch is 1970, so subtract that
77                return Some((year - 1970) * 365 * 24 * 3600);
78            }
79        }
80
81        None
82    }
83}
84
/// Extractor for atomic facts from text
///
/// Uses LLM to decompose text into self-contained factual statements, and
/// can convert the resulting facts into graph entities and relationships.
pub struct AtomicFactExtractor {
    /// Ollama client for LLM-based extraction
    ///
    /// Only read by the `async`-gated extraction method, hence the
    /// conditional `dead_code` allowance when that feature is disabled.
    #[cfg_attr(not(feature = "async"), allow(dead_code))]
    ollama_client: OllamaClient,
    /// Maximum tokens per fact (default: 400)
    ///
    /// Interpolated into the extraction prompt as a size hint for the model;
    /// nothing in this file enforces it on the returned facts.
    max_fact_tokens: usize,
}
95
96impl AtomicFactExtractor {
97    /// Create a new atomic fact extractor
98    ///
99    /// # Arguments
100    ///
101    /// * `ollama_client` - Ollama client for LLM calls
102    pub fn new(ollama_client: OllamaClient) -> Self {
103        Self {
104            ollama_client,
105            max_fact_tokens: 400,
106        }
107    }
108
109    /// Set maximum tokens per fact
110    pub fn with_max_tokens(mut self, max_tokens: usize) -> Self {
111        self.max_fact_tokens = max_tokens;
112        self
113    }
114
115    /// Extract atomic facts from a text chunk
116    ///
117    /// # Arguments
118    ///
119    /// * `chunk` - Text chunk to extract facts from
120    ///
121    /// # Returns
122    ///
123    /// Vector of atomic facts extracted from the text
124    #[cfg(feature = "async")]
125    pub async fn extract_atomic_facts(&self, chunk: &TextChunk) -> Result<Vec<AtomicFact>> {
126        let prompt = format!(
127            r#"Extract atomic facts from the following text. Each fact should be:
128- Self-contained and verifiable (< {} tokens)
129- In the format: (Subject, Predicate, Object, TemporalMarker, Confidence)
130- TemporalMarker should capture time expressions like "in 1876", "during summer", "380 BC" (or null if none)
131- Confidence should be 0.0-1.0
132
133Respond ONLY with valid JSON array:
134[
135  {{
136    "subject": "entity or concept",
137    "predicate": "relationship or property",
138    "object": "entity, value, or concept",
139    "temporal_marker": "time expression or null",
140    "confidence": 0.0-1.0
141  }}
142]
143
144Text: "{}"
145
146JSON:"#,
147            self.max_fact_tokens, chunk.content
148        );
149
150        #[cfg(feature = "tracing")]
151        tracing::debug!(
152            chunk_id = %chunk.id,
153            "Extracting atomic facts from chunk"
154        );
155
156        match self.ollama_client.generate(&prompt).await {
157            Ok(response) => {
158                // Extract JSON from response
159                let json_str = response.trim();
160                let json_str = if let Some(start) = json_str.find('[') {
161                    if let Some(end) = json_str.rfind(']') {
162                        &json_str[start..=end]
163                    } else {
164                        json_str
165                    }
166                } else {
167                    json_str
168                };
169
170                #[derive(Deserialize)]
171                struct AtomicFactJson {
172                    subject: String,
173                    predicate: String,
174                    object: String,
175                    temporal_marker: Option<String>,
176                    confidence: f32,
177                }
178
179                match serde_json::from_str::<Vec<AtomicFactJson>>(json_str) {
180                    Ok(facts_json) => {
181                        let facts: Vec<AtomicFact> = facts_json
182                            .into_iter()
183                            .map(|f| AtomicFact {
184                                subject: f.subject,
185                                predicate: f.predicate,
186                                object: f.object,
187                                temporal_marker: f
188                                    .temporal_marker
189                                    .filter(|s| !s.is_empty() && s != "null"),
190                                confidence: f.confidence.clamp(0.0, 1.0),
191                            })
192                            .collect();
193
194                        #[cfg(feature = "tracing")]
195                        tracing::info!(
196                            chunk_id = %chunk.id,
197                            fact_count = facts.len(),
198                            "Extracted atomic facts"
199                        );
200
201                        Ok(facts)
202                    },
203                    Err(e) => {
204                        #[cfg(feature = "tracing")]
205                        tracing::warn!(
206                            chunk_id = %chunk.id,
207                            error = %e,
208                            response = %json_str,
209                            "Failed to parse atomic facts JSON"
210                        );
211
212                        // Return empty vector on parse failure
213                        Ok(Vec::new())
214                    },
215                }
216            },
217            Err(e) => {
218                #[cfg(feature = "tracing")]
219                tracing::error!(
220                    chunk_id = %chunk.id,
221                    error = %e,
222                    "Atomic fact extraction failed"
223                );
224
225                Err(GraphRAGError::EntityExtraction {
226                    message: format!("Atomic fact extraction failed: {}", e),
227                })
228            },
229        }
230    }
231
232    /// Convert atomic facts to graph elements (entities and relationships)
233    ///
234    /// # Arguments
235    ///
236    /// * `facts` - Vector of atomic facts to convert
237    /// * `chunk_id` - ID of the source chunk for context
238    ///
239    /// # Returns
240    ///
241    /// Tuple of (entities, relationships) extracted from facts
242    pub fn atomics_to_graph_elements(
243        &self,
244        facts: Vec<AtomicFact>,
245        chunk_id: &crate::core::ChunkId,
246    ) -> (Vec<Entity>, Vec<Relationship>) {
247        let mut entities: HashMap<String, Entity> = HashMap::new();
248        let mut relationships = Vec::new();
249
250        for fact in facts {
251            // Create or update subject entity
252            let subject_id = EntityId::new(Self::normalize_entity_name(&fact.subject));
253            entities.entry(subject_id.0.clone()).or_insert_with(|| {
254                let mut entity = Entity::new(
255                    subject_id.clone(),
256                    fact.subject.clone(),
257                    Self::infer_entity_type(&fact.subject),
258                    fact.confidence,
259                );
260
261                // Add temporal information if available
262                if let Some(timestamp) = fact.extract_timestamp() {
263                    entity.first_mentioned = Some(timestamp);
264                    entity.last_mentioned = Some(timestamp);
265                }
266
267                entity
268            });
269
270            // Create or update object entity
271            let object_id = EntityId::new(Self::normalize_entity_name(&fact.object));
272            entities.entry(object_id.0.clone()).or_insert_with(|| {
273                let mut entity = Entity::new(
274                    object_id.clone(),
275                    fact.object.clone(),
276                    Self::infer_entity_type(&fact.object),
277                    fact.confidence,
278                );
279
280                // Add temporal information if available
281                if let Some(timestamp) = fact.extract_timestamp() {
282                    entity.first_mentioned = Some(timestamp);
283                    entity.last_mentioned = Some(timestamp);
284                }
285
286                entity
287            });
288
289            // Create relationship
290            let mut relationship = Relationship::new(
291                subject_id,
292                object_id,
293                fact.predicate.to_uppercase(),
294                fact.confidence,
295            )
296            .with_context(vec![chunk_id.clone()]);
297
298            // Add temporal information if available
299            if let Some(timestamp) = fact.extract_timestamp() {
300                relationship.temporal_range = Some(crate::graph::temporal::TemporalRange::new(
301                    timestamp, timestamp,
302                ));
303
304                // Infer temporal relationship type based on predicate
305                if fact.predicate.to_lowercase().contains("caused")
306                    || fact.predicate.to_lowercase().contains("led to")
307                {
308                    relationship.temporal_type =
309                        Some(crate::graph::temporal::TemporalRelationType::Caused);
310                    relationship.causal_strength = Some(fact.confidence);
311                } else if fact.predicate.to_lowercase().contains("enabled")
312                    || fact.predicate.to_lowercase().contains("allowed")
313                {
314                    relationship.temporal_type =
315                        Some(crate::graph::temporal::TemporalRelationType::Enabled);
316                    relationship.causal_strength = Some(fact.confidence * 0.6);
317                }
318            }
319
320            relationships.push(relationship);
321        }
322
323        (entities.into_values().collect(), relationships)
324    }
325
326    /// Normalize entity name for consistent ID generation
327    fn normalize_entity_name(name: &str) -> String {
328        name.trim()
329            .to_lowercase()
330            .replace(' ', "_")
331            .chars()
332            .filter(|c| c.is_alphanumeric() || *c == '_')
333            .collect()
334    }
335
336    /// Infer entity type from name (simple heuristic)
337    fn infer_entity_type(name: &str) -> String {
338        let lower = name.to_lowercase();
339
340        // Check for proper nouns (capitalized)
341        if name.chars().next().is_some_and(|c| c.is_uppercase()) {
342            if lower.ends_with("ia") || lower.ends_with("land") || lower.ends_with("istan") {
343                return "LOCATION".to_string();
344            }
345            return "PERSON".to_string();
346        }
347
348        // Check for numbers/dates
349        if name.chars().any(|c| c.is_ascii_digit()) {
350            return "DATE".to_string();
351        }
352
353        // Default to concept
354        "CONCEPT".to_string()
355    }
356}
357
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a fact whose only varying field is its temporal marker.
    fn fact_with_marker(marker: Option<&str>) -> AtomicFact {
        AtomicFact {
            subject: "Event".to_string(),
            predicate: "occurred".to_string(),
            object: "Athens".to_string(),
            temporal_marker: marker.map(str::to_string),
            confidence: 0.9,
        }
    }

    #[test]
    fn test_atomic_fact_creation() {
        let fact = AtomicFact {
            subject: "Socrates".to_string(),
            predicate: "taught".to_string(),
            object: "Plato".to_string(),
            temporal_marker: Some("in 380 BC".to_string()),
            confidence: 0.9,
        };

        assert_eq!(fact.subject, "Socrates");
        assert!(fact.is_temporal());
    }

    #[test]
    fn test_fact_without_marker_is_not_temporal() {
        let fact = fact_with_marker(None);

        assert!(!fact.is_temporal());
        assert!(fact.extract_timestamp().is_none());
    }

    #[test]
    fn test_timestamp_extraction_bc() {
        let timestamp = fact_with_marker(Some("380 BC")).extract_timestamp();

        assert!(timestamp.is_some());
        assert!(timestamp.unwrap() < 0); // BC should be negative
    }

    #[test]
    fn test_timestamp_extraction_ad() {
        // Years before the Unix epoch must come out negative...
        let before_epoch = fact_with_marker(Some("in 1876")).extract_timestamp();
        assert!(before_epoch.is_some());
        assert!(before_epoch.unwrap() < 0);

        // ...and years after it positive.
        let after_epoch = fact_with_marker(Some("in 2024")).extract_timestamp();
        assert!(after_epoch.is_some());
        assert!(after_epoch.unwrap() > 0);
    }

    #[test]
    fn test_timestamp_extraction_without_year() {
        // A marker that carries no number cannot be placed on the timeline.
        let fact = fact_with_marker(Some("during summer"));

        assert!(fact.is_temporal());
        assert!(fact.extract_timestamp().is_none());
    }

    #[test]
    fn test_normalize_entity_name() {
        assert_eq!(
            AtomicFactExtractor::normalize_entity_name("Socrates the Philosopher"),
            "socrates_the_philosopher"
        );
        assert_eq!(
            AtomicFactExtractor::normalize_entity_name("New York"),
            "new_york"
        );
    }

    #[test]
    fn test_infer_entity_type() {
        assert_eq!(AtomicFactExtractor::infer_entity_type("Socrates"), "PERSON");
        assert_eq!(
            AtomicFactExtractor::infer_entity_type("Athens"),
            "PERSON" // Would be LOCATION if we had more sophisticated logic
        );
        assert_eq!(
            AtomicFactExtractor::infer_entity_type("Finland"),
            "LOCATION"
        );
        assert_eq!(AtomicFactExtractor::infer_entity_type("love"), "CONCEPT");
        assert_eq!(AtomicFactExtractor::infer_entity_type("1876"), "DATE");
    }
}