graphrag-core 0.2.0

Core portable library for GraphRAG - works on native and WASM
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
#![cfg_attr(not(feature = "async"), allow(unused_imports))]

//! ATOM Atomic Fact Extraction
//!
//! This module implements atomic fact extraction following the ATOM methodology
//! (itext2kg - https://github.com/AuvaLab/itext2kg)
//!
//! ATOM extracts self-contained facts as 5-tuples:
//! (Subject, Predicate, Object, TemporalMarker, Confidence)
//!
//! Benefits:
//! - More granular than entity-relationship pairs
//! - Better temporal grounding
//! - Easier to validate and verify
//! - Natural fit for knowledge graphs

use crate::{
    core::{Entity, EntityId, GraphRAGError, Relationship, Result, TextChunk},
    ollama::OllamaClient,
};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

/// Atomic fact extracted from text
///
/// Represents a single, self-contained factual statement in the form
/// (subject, predicate, object, temporal marker, confidence).
/// Each fact should be verifiable and stand alone without context.
///
/// The struct itself performs no validation; every field is stored exactly
/// as provided by whoever constructs the value.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AtomicFact {
    /// Subject of the fact (entity performing action or being described)
    pub subject: String,
    /// Predicate describing the relationship or property
    pub predicate: String,
    /// Object of the fact (entity being acted upon or value)
    pub object: String,
    /// Optional temporal marker as free text (e.g., "in 1876", "during summer", "380 BC");
    /// `None` when the fact carries no time expression
    pub temporal_marker: Option<String>,
    /// Confidence score for this fact (0.0-1.0); the range is a convention
    /// and is not enforced by this type
    pub confidence: f32,
}

impl AtomicFact {
    /// Length of the 365-day year used for approximate timestamps, in seconds.
    /// Leap days are deliberately ignored.
    const SECONDS_PER_YEAR: i64 = 365 * 24 * 3600;

    /// Check if this fact has temporal information
    pub fn is_temporal(&self) -> bool {
        self.temporal_marker.is_some()
    }

    /// Extract approximate Unix timestamp from temporal marker if possible
    ///
    /// This is a best-effort extraction and may not work for all formats.
    /// Only the year is used; the result is the start of that year measured
    /// in 365-day years from 1970, so it drifts by the number of leap days
    /// in between.
    ///
    /// * Markers containing `BC` (which also covers `BCE`, case-sensitive)
    ///   use the last number written before the era marker, e.g. `44` in
    ///   "March 15, 44 BC". The year is placed on the same 1970-based axis
    ///   as AD years, so BC dates always sort before AD dates.
    /// * Otherwise the first number strictly between 1000 and 3000 is taken
    ///   as the year, e.g. `1876` in "June 5, 1876".
    ///
    /// Returns `None` if the temporal marker is missing, contains no usable
    /// year, or the result would not fit in an `i64`.
    pub fn extract_timestamp(&self) -> Option<i64> {
        let marker = self.temporal_marker.as_deref()?;

        if let Some(era_at) = marker.find("BC") {
            // Prefer the number directly preceding the era ("44 BC"); fall
            // back to one written after it ("BC 44").
            let bc_year = Self::numbers_in(&marker[..era_at])
                .last()
                .or_else(|| Self::numbers_in(&marker[era_at..]).next());

            if let Some(bc_year) = bc_year {
                // There is no year zero: 1 BC is astronomical year 0, so
                // N BC is year 1 - N. `bc_year` is non-negative, hence the
                // subtraction cannot overflow.
                return Self::year_start_timestamp(1 - bc_year);
            }
        }

        Self::numbers_in(marker)
            .find(|year| (1001..3000).contains(year))
            .and_then(Self::year_start_timestamp)
    }

    /// Yield every run of ASCII digits in `text` as a number, in order of
    /// appearance. Runs too long to fit in an `i64` are skipped.
    fn numbers_in(text: &str) -> impl Iterator<Item = i64> + '_ {
        text.split(|c: char| !c.is_ascii_digit())
            .filter(|run| !run.is_empty())
            .filter_map(|run| run.parse::<i64>().ok())
    }

    /// Approximate Unix timestamp for the start of `year` (astronomical
    /// numbering), or `None` on arithmetic overflow.
    fn year_start_timestamp(year: i64) -> Option<i64> {
        year.checked_sub(1970)?.checked_mul(Self::SECONDS_PER_YEAR)
    }
}

/// Extractor for atomic facts from text
///
/// Uses LLM to decompose text into self-contained factual statements,
/// and converts those statements into graph entities and relationships.
pub struct AtomicFactExtractor {
    /// Ollama client for LLM-based extraction
    // Only read by the extraction method gated behind the `async` feature,
    // so it would otherwise be reported as dead code when that feature is off.
    #[cfg_attr(not(feature = "async"), allow(dead_code))]
    ollama_client: OllamaClient,
    /// Maximum tokens per fact (default: 400)
    // Interpolated into the extraction prompt as guidance for the model;
    // nothing in this file enforces the limit on the returned facts.
    max_fact_tokens: usize,
}

impl AtomicFactExtractor {
    /// Build an extractor around an existing Ollama client
    ///
    /// The per-fact token budget starts at 400.
    ///
    /// # Arguments
    ///
    /// * `ollama_client` - Client that will serve the LLM calls
    pub fn new(ollama_client: OllamaClient) -> Self {
        let max_fact_tokens = 400;

        Self {
            ollama_client,
            max_fact_tokens,
        }
    }

    /// Builder-style setter for the per-fact token budget
    ///
    /// Consumes the extractor and hands it back with `max_tokens` stored.
    pub fn with_max_tokens(self, max_tokens: usize) -> Self {
        let mut extractor = self;
        extractor.max_fact_tokens = max_tokens;
        extractor
    }

    /// Extract atomic facts from a text chunk
    ///
    /// Sends the chunk content to the LLM with a prompt asking for a JSON
    /// array of facts, then parses the reply. Text surrounding the array in
    /// the reply (anything before the first `[` or after the last `]`) is
    /// ignored.
    ///
    /// # Arguments
    ///
    /// * `chunk` - Text chunk to extract facts from
    ///
    /// # Returns
    ///
    /// Vector of atomic facts extracted from the text. Confidence values are
    /// clamped to `0.0..=1.0`, and a temporal marker that is empty or the
    /// literal string `"null"` is stored as `None`. A reply that cannot be
    /// parsed as the expected JSON yields an empty vector rather than an
    /// error (best-effort extraction).
    ///
    /// # Errors
    ///
    /// Returns `GraphRAGError::EntityExtraction` when the LLM call itself
    /// fails.
    #[cfg(feature = "async")]
    pub async fn extract_atomic_facts(&self, chunk: &TextChunk) -> Result<Vec<AtomicFact>> {
        /// Shape of one fact object in the model's JSON reply.
        #[derive(Deserialize)]
        struct AtomicFactJson {
            subject: String,
            predicate: String,
            object: String,
            temporal_marker: Option<String>,
            confidence: f32,
        }

        let prompt = format!(
            r#"Extract atomic facts from the following text. Each fact should be:
- Self-contained and verifiable (< {} tokens)
- In the format: (Subject, Predicate, Object, TemporalMarker, Confidence)
- TemporalMarker should capture time expressions like "in 1876", "during summer", "380 BC" (or null if none)
- Confidence should be 0.0-1.0

Respond ONLY with valid JSON array:
[
  {{
    "subject": "entity or concept",
    "predicate": "relationship or property",
    "object": "entity, value, or concept",
    "temporal_marker": "time expression or null",
    "confidence": 0.0-1.0
  }}
]

Text: "{}"

JSON:"#,
            self.max_fact_tokens, chunk.content
        );

        #[cfg(feature = "tracing")]
        tracing::debug!(
            chunk_id = %chunk.id,
            "Extracting atomic facts from chunk"
        );

        let response = match self.ollama_client.generate(&prompt).await {
            Ok(response) => response,
            Err(e) => {
                #[cfg(feature = "tracing")]
                tracing::error!(
                    chunk_id = %chunk.id,
                    error = %e,
                    "Atomic fact extraction failed"
                );

                return Err(GraphRAGError::EntityExtraction {
                    message: format!("Atomic fact extraction failed: {}", e),
                });
            },
        };

        // Extract JSON from response: keep only the outermost `[ ... ]`.
        // The bracket order must be checked because the reply is free-form
        // model output; slicing with the closing bracket ahead of the opening
        // one would panic. In that case the whole reply is handed to the
        // parser, which rejects it and triggers the empty-result fallback.
        let reply = response.trim();
        let json_str = match (reply.find('['), reply.rfind(']')) {
            (Some(start), Some(end)) if start < end => &reply[start..=end],
            _ => reply,
        };

        let facts_json = match serde_json::from_str::<Vec<AtomicFactJson>>(json_str) {
            Ok(parsed) => parsed,
            Err(e) => {
                #[cfg(feature = "tracing")]
                tracing::warn!(
                    chunk_id = %chunk.id,
                    error = %e,
                    response = %json_str,
                    "Failed to parse atomic facts JSON"
                );

                // Return empty vector on parse failure
                return Ok(Vec::new());
            },
        };

        let facts: Vec<AtomicFact> = facts_json
            .into_iter()
            .map(|f| AtomicFact {
                subject: f.subject,
                predicate: f.predicate,
                object: f.object,
                // The prompt allows "null" for a missing marker; models may
                // send it as a string instead of JSON null.
                temporal_marker: f
                    .temporal_marker
                    .filter(|s| !s.is_empty() && s != "null"),
                confidence: f.confidence.clamp(0.0, 1.0),
            })
            .collect();

        #[cfg(feature = "tracing")]
        tracing::info!(
            chunk_id = %chunk.id,
            fact_count = facts.len(),
            "Extracted atomic facts"
        );

        Ok(facts)
    }

    /// Convert atomic facts to graph elements (entities and relationships)
    ///
    /// # Arguments
    ///
    /// * `facts` - Vector of atomic facts to convert
    /// * `chunk_id` - ID of the source chunk for context
    ///
    /// # Returns
    ///
    /// Tuple of (entities, relationships) extracted from facts
    pub fn atomics_to_graph_elements(
        &self,
        facts: Vec<AtomicFact>,
        chunk_id: &crate::core::ChunkId,
    ) -> (Vec<Entity>, Vec<Relationship>) {
        let mut entities: HashMap<String, Entity> = HashMap::new();
        let mut relationships = Vec::new();

        for fact in facts {
            // Create or update subject entity
            let subject_id = EntityId::new(Self::normalize_entity_name(&fact.subject));
            entities.entry(subject_id.0.clone()).or_insert_with(|| {
                let mut entity = Entity::new(
                    subject_id.clone(),
                    fact.subject.clone(),
                    Self::infer_entity_type(&fact.subject),
                    fact.confidence,
                );

                // Add temporal information if available
                if let Some(timestamp) = fact.extract_timestamp() {
                    entity.first_mentioned = Some(timestamp);
                    entity.last_mentioned = Some(timestamp);
                }

                entity
            });

            // Create or update object entity
            let object_id = EntityId::new(Self::normalize_entity_name(&fact.object));
            entities.entry(object_id.0.clone()).or_insert_with(|| {
                let mut entity = Entity::new(
                    object_id.clone(),
                    fact.object.clone(),
                    Self::infer_entity_type(&fact.object),
                    fact.confidence,
                );

                // Add temporal information if available
                if let Some(timestamp) = fact.extract_timestamp() {
                    entity.first_mentioned = Some(timestamp);
                    entity.last_mentioned = Some(timestamp);
                }

                entity
            });

            // Create relationship
            let mut relationship = Relationship::new(
                subject_id,
                object_id,
                fact.predicate.to_uppercase(),
                fact.confidence,
            )
            .with_context(vec![chunk_id.clone()]);

            // Add temporal information if available
            if let Some(timestamp) = fact.extract_timestamp() {
                relationship.temporal_range = Some(crate::graph::temporal::TemporalRange::new(
                    timestamp, timestamp,
                ));

                // Infer temporal relationship type based on predicate
                if fact.predicate.to_lowercase().contains("caused")
                    || fact.predicate.to_lowercase().contains("led to")
                {
                    relationship.temporal_type =
                        Some(crate::graph::temporal::TemporalRelationType::Caused);
                    relationship.causal_strength = Some(fact.confidence);
                } else if fact.predicate.to_lowercase().contains("enabled")
                    || fact.predicate.to_lowercase().contains("allowed")
                {
                    relationship.temporal_type =
                        Some(crate::graph::temporal::TemporalRelationType::Enabled);
                    relationship.causal_strength = Some(fact.confidence * 0.6);
                }
            }

            relationships.push(relationship);
        }

        (entities.into_values().collect(), relationships)
    }

    /// Turn an entity name into the canonical string used as its ID
    ///
    /// Surrounding whitespace is trimmed and the name is lower-cased; each
    /// space then becomes an underscore, and every character that is neither
    /// alphanumeric nor an underscore is dropped.
    fn normalize_entity_name(name: &str) -> String {
        let lowered = name.trim().to_lowercase();

        lowered
            .chars()
            .filter_map(|ch| match ch {
                ' ' => Some('_'),
                kept if kept.is_alphanumeric() || kept == '_' => Some(kept),
                _ => None,
            })
            .collect()
    }

    /// Guess an entity type label from the name alone (simple heuristic)
    ///
    /// * Names starting with an uppercase letter are `LOCATION` when they
    ///   end in "ia", "land" or "istan" (compared case-insensitively), and
    ///   `PERSON` otherwise.
    /// * Any other name containing an ASCII digit is a `DATE`.
    /// * Everything else is a `CONCEPT`.
    fn infer_entity_type(name: &str) -> String {
        const LOCATION_SUFFIXES: [&str; 3] = ["ia", "land", "istan"];

        let starts_uppercase = matches!(name.chars().next(), Some(first) if first.is_uppercase());

        let label = if starts_uppercase {
            // Treated as a proper noun; only the suffix decides place vs. person.
            let lowered = name.to_lowercase();
            let looks_like_place = LOCATION_SUFFIXES
                .iter()
                .any(|suffix| lowered.ends_with(*suffix));

            if looks_like_place {
                "LOCATION"
            } else {
                "PERSON"
            }
        } else if name.chars().any(|ch| ch.is_ascii_digit()) {
            "DATE"
        } else {
            "CONCEPT"
        };

        label.to_string()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A fact built with a temporal marker keeps its fields and reports
    /// itself as temporal.
    #[test]
    fn test_atomic_fact_creation() {
        let socrates_taught_plato = AtomicFact {
            subject: String::from("Socrates"),
            predicate: String::from("taught"),
            object: String::from("Plato"),
            temporal_marker: Some(String::from("in 380 BC")),
            confidence: 0.9,
        };

        assert!(socrates_taught_plato.is_temporal());
        assert_eq!(socrates_taught_plato.subject, "Socrates");
    }

    /// BC markers must map to a timestamp before the Unix epoch.
    #[test]
    fn test_timestamp_extraction_bc() {
        let ancient_event = AtomicFact {
            subject: String::from("Event"),
            predicate: String::from("occurred"),
            object: String::from("Athens"),
            temporal_marker: Some(String::from("380 BC")),
            confidence: 0.9,
        };

        let extracted = ancient_event.extract_timestamp();
        assert!(extracted.is_some());
        // BC should be negative
        assert!(matches!(extracted, Some(seconds) if seconds < 0));
    }

    /// A plain four-digit year inside a phrase is recognised.
    #[test]
    fn test_timestamp_extraction_ad() {
        let modern_event = AtomicFact {
            subject: String::from("Event"),
            predicate: String::from("occurred"),
            object: String::from("Rome"),
            temporal_marker: Some(String::from("in 1876")),
            confidence: 0.9,
        };

        assert!(modern_event.extract_timestamp().is_some());
    }

    /// Names are lower-cased and spaces become underscores.
    #[test]
    fn test_normalize_entity_name() {
        let long_name = AtomicFactExtractor::normalize_entity_name("Socrates the Philosopher");
        assert_eq!(long_name, "socrates_the_philosopher");

        let two_words = AtomicFactExtractor::normalize_entity_name("New York");
        assert_eq!(two_words, "new_york");
    }

    /// Pins the current output of the type heuristic for each label.
    #[test]
    fn test_infer_entity_type() {
        let infer = AtomicFactExtractor::infer_entity_type;

        assert_eq!(infer("Socrates"), "PERSON");
        // Would be LOCATION if we had more sophisticated logic
        assert_eq!(infer("Athens"), "PERSON");
        assert_eq!(infer("love"), "CONCEPT");
        assert_eq!(infer("1876"), "DATE");
    }
}