#![cfg_attr(not(feature = "async"), allow(unused_imports))]
use crate::{
core::{Entity, EntityId, GraphRAGError, Relationship, Result, TextChunk},
ollama::OllamaClient,
};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// A single subject / predicate / object statement extracted from text,
/// optionally qualified by a time expression.
///
/// Values of this type are produced by `AtomicFactExtractor` from the JSON
/// objects returned by the language model.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AtomicFact {
/// Entity or concept the statement is about.
pub subject: String,
/// Relationship or property linking `subject` to `object`.
pub predicate: String,
/// Entity, value, or concept the predicate points at.
pub object: String,
/// Free-form time expression (e.g. "in 1876", "380 BC"); `None` when the
/// fact carries no temporal information.
pub temporal_marker: Option<String>,
/// Extraction confidence; the extractor clamps it to `0.0..=1.0`.
pub confidence: f32,
}
impl AtomicFact {
    /// Approximate length of one year in seconds (365 days; leap days are ignored).
    const SECONDS_PER_YEAR: i64 = 365 * 24 * 3600;

    /// Returns `true` when the fact carries a temporal marker.
    pub fn is_temporal(&self) -> bool {
        self.temporal_marker.is_some()
    }

    /// Converts the temporal marker into an approximate Unix timestamp (seconds).
    ///
    /// Only the year is interpreted, and every year is treated as 365 days long,
    /// so the result is suitable for ordering and coarse ranges, not for exact
    /// dates.
    ///
    /// * Markers tagged `BC` / `BCE` / `B.C.` use the number that precedes the
    ///   tag (or, failing that, the first number in the marker).
    /// * All other markers use the first number strictly between 1000 and 3000.
    ///
    /// Both eras are measured from 1970, so BC dates always sort before AD dates.
    ///
    /// Returns `None` when there is no marker, when no usable year is found, or
    /// when the year is too large to be represented as a timestamp.
    pub fn extract_timestamp(&self) -> Option<i64> {
        let marker = self.temporal_marker.as_deref()?;

        let years_from_epoch = if let Some(era_pos) = Self::bc_marker_position(marker) {
            let year = Self::year_candidates(&marker[..era_pos])
                .last()
                .or_else(|| Self::year_candidates(marker).next())?;
            // There is no year 0: 1 BC is astronomical year 0, so `n` BC lies
            // `n + 1969` years before 1970.
            year.checked_add(1969)?.checked_neg()?
        } else {
            let year = Self::year_candidates(marker).find(|year| (1001..3000).contains(year))?;
            year - 1970
        };

        // Checked so that absurdly large model-supplied years cannot overflow.
        years_from_epoch.checked_mul(Self::SECONDS_PER_YEAR)
    }

    /// Byte offset of the first `BC` / `B.C.` era tag that is not the tail of a
    /// longer word (so "ABC 1995" is not mistaken for a BC date).
    fn bc_marker_position(marker: &str) -> Option<usize> {
        ["BC", "B.C."]
            .iter()
            .flat_map(|tag| marker.match_indices(*tag))
            .map(|(pos, _)| pos)
            .filter(|&pos| {
                marker[..pos]
                    .chars()
                    .next_back()
                    .map_or(true, |prev| !prev.is_alphabetic())
            })
            .min()
    }

    /// Yields each separate number found in `text`, left to right.
    ///
    /// Commas directly inside a number are treated as thousands separators
    /// ("1,200" is 1200); numbers too large for `i64` are skipped.
    fn year_candidates(text: &str) -> impl Iterator<Item = i64> + '_ {
        text.split(|c: char| !(c.is_ascii_digit() || c == ','))
            .filter_map(|token| {
                let digits: String = token.chars().filter(char::is_ascii_digit).collect();
                digits.parse::<i64>().ok()
            })
    }
}
/// Extracts `AtomicFact`s from text chunks via an Ollama model and converts
/// them into graph entities and relationships.
pub struct AtomicFactExtractor {
// Only read by the async extraction path, hence the conditional dead_code allowance.
#[cfg_attr(not(feature = "async"), allow(dead_code))]
ollama_client: OllamaClient,
// Per-fact token limit quoted in the extraction prompt (400 unless overridden).
max_fact_tokens: usize,
}
impl AtomicFactExtractor {
pub fn new(ollama_client: OllamaClient) -> Self {
Self {
ollama_client,
max_fact_tokens: 400,
}
}
pub fn with_max_tokens(mut self, max_tokens: usize) -> Self {
self.max_fact_tokens = max_tokens;
self
}
#[cfg(feature = "async")]
pub async fn extract_atomic_facts(&self, chunk: &TextChunk) -> Result<Vec<AtomicFact>> {
let prompt = format!(
r#"Extract atomic facts from the following text. Each fact should be:
- Self-contained and verifiable (< {} tokens)
- In the format: (Subject, Predicate, Object, TemporalMarker, Confidence)
- TemporalMarker should capture time expressions like "in 1876", "during summer", "380 BC" (or null if none)
- Confidence should be 0.0-1.0
Respond ONLY with valid JSON array:
[
{{
"subject": "entity or concept",
"predicate": "relationship or property",
"object": "entity, value, or concept",
"temporal_marker": "time expression or null",
"confidence": 0.0-1.0
}}
]
Text: "{}"
JSON:"#,
self.max_fact_tokens, chunk.content
);
#[cfg(feature = "tracing")]
tracing::debug!(
chunk_id = %chunk.id,
"Extracting atomic facts from chunk"
);
match self.ollama_client.generate(&prompt).await {
Ok(response) => {
let json_str = response.trim();
let json_str = if let Some(start) = json_str.find('[') {
if let Some(end) = json_str.rfind(']') {
&json_str[start..=end]
} else {
json_str
}
} else {
json_str
};
#[derive(Deserialize)]
struct AtomicFactJson {
subject: String,
predicate: String,
object: String,
temporal_marker: Option<String>,
confidence: f32,
}
match serde_json::from_str::<Vec<AtomicFactJson>>(json_str) {
Ok(facts_json) => {
let facts: Vec<AtomicFact> = facts_json
.into_iter()
.map(|f| AtomicFact {
subject: f.subject,
predicate: f.predicate,
object: f.object,
temporal_marker: f
.temporal_marker
.filter(|s| !s.is_empty() && s != "null"),
confidence: f.confidence.clamp(0.0, 1.0),
})
.collect();
#[cfg(feature = "tracing")]
tracing::info!(
chunk_id = %chunk.id,
fact_count = facts.len(),
"Extracted atomic facts"
);
Ok(facts)
},
Err(e) => {
#[cfg(feature = "tracing")]
tracing::warn!(
chunk_id = %chunk.id,
error = %e,
response = %json_str,
"Failed to parse atomic facts JSON"
);
Ok(Vec::new())
},
}
},
Err(e) => {
#[cfg(feature = "tracing")]
tracing::error!(
chunk_id = %chunk.id,
error = %e,
"Atomic fact extraction failed"
);
Err(GraphRAGError::EntityExtraction {
message: format!("Atomic fact extraction failed: {}", e),
})
},
}
}
pub fn atomics_to_graph_elements(
&self,
facts: Vec<AtomicFact>,
chunk_id: &crate::core::ChunkId,
) -> (Vec<Entity>, Vec<Relationship>) {
let mut entities: HashMap<String, Entity> = HashMap::new();
let mut relationships = Vec::new();
for fact in facts {
let subject_id = EntityId::new(Self::normalize_entity_name(&fact.subject));
entities.entry(subject_id.0.clone()).or_insert_with(|| {
let mut entity = Entity::new(
subject_id.clone(),
fact.subject.clone(),
Self::infer_entity_type(&fact.subject),
fact.confidence,
);
if let Some(timestamp) = fact.extract_timestamp() {
entity.first_mentioned = Some(timestamp);
entity.last_mentioned = Some(timestamp);
}
entity
});
let object_id = EntityId::new(Self::normalize_entity_name(&fact.object));
entities.entry(object_id.0.clone()).or_insert_with(|| {
let mut entity = Entity::new(
object_id.clone(),
fact.object.clone(),
Self::infer_entity_type(&fact.object),
fact.confidence,
);
if let Some(timestamp) = fact.extract_timestamp() {
entity.first_mentioned = Some(timestamp);
entity.last_mentioned = Some(timestamp);
}
entity
});
let mut relationship = Relationship::new(
subject_id,
object_id,
fact.predicate.to_uppercase(),
fact.confidence,
)
.with_context(vec![chunk_id.clone()]);
if let Some(timestamp) = fact.extract_timestamp() {
relationship.temporal_range = Some(crate::graph::temporal::TemporalRange::new(
timestamp, timestamp,
));
if fact.predicate.to_lowercase().contains("caused")
|| fact.predicate.to_lowercase().contains("led to")
{
relationship.temporal_type =
Some(crate::graph::temporal::TemporalRelationType::Caused);
relationship.causal_strength = Some(fact.confidence);
} else if fact.predicate.to_lowercase().contains("enabled")
|| fact.predicate.to_lowercase().contains("allowed")
{
relationship.temporal_type =
Some(crate::graph::temporal::TemporalRelationType::Enabled);
relationship.causal_strength = Some(fact.confidence * 0.6);
}
}
relationships.push(relationship);
}
(entities.into_values().collect(), relationships)
}
fn normalize_entity_name(name: &str) -> String {
name.trim()
.to_lowercase()
.replace(' ', "_")
.chars()
.filter(|c| c.is_alphanumeric() || *c == '_')
.collect()
}
fn infer_entity_type(name: &str) -> String {
let lower = name.to_lowercase();
if name.chars().next().is_some_and(|c| c.is_uppercase()) {
if lower.ends_with("ia") || lower.ends_with("land") || lower.ends_with("istan") {
return "LOCATION".to_string();
}
return "PERSON".to_string();
}
if name.chars().any(|c| c.is_ascii_digit()) {
return "DATE".to_string();
}
"CONCEPT".to_string()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fact with the given parts, a temporal marker, and a fixed
    /// confidence of 0.9.
    fn sample_fact(subject: &str, predicate: &str, object: &str, marker: &str) -> AtomicFact {
        AtomicFact {
            subject: subject.to_owned(),
            predicate: predicate.to_owned(),
            object: object.to_owned(),
            temporal_marker: Some(marker.to_owned()),
            confidence: 0.9,
        }
    }

    #[test]
    fn test_atomic_fact_creation() {
        let socrates = sample_fact("Socrates", "taught", "Plato", "in 380 BC");

        assert_eq!(socrates.subject, "Socrates");
        assert!(socrates.is_temporal());
    }

    #[test]
    fn test_timestamp_extraction_bc() {
        let ancient = sample_fact("Event", "occurred", "Athens", "380 BC");

        // A BC date must resolve to a timestamp, and that timestamp must be negative.
        assert!(matches!(ancient.extract_timestamp(), Some(ts) if ts < 0));
    }

    #[test]
    fn test_timestamp_extraction_ad() {
        let modern = sample_fact("Event", "occurred", "Rome", "in 1876");

        assert!(modern.extract_timestamp().is_some());
    }

    #[test]
    fn test_normalize_entity_name() {
        let cases = [
            ("Socrates the Philosopher", "socrates_the_philosopher"),
            ("New York", "new_york"),
        ];

        for (raw, normalized) in cases {
            assert_eq!(AtomicFactExtractor::normalize_entity_name(raw), normalized);
        }
    }

    #[test]
    fn test_infer_entity_type() {
        let cases = [
            ("Socrates", "PERSON"),
            // Capitalized and without a location suffix, so the heuristic says PERSON.
            ("Athens", "PERSON"),
            ("love", "CONCEPT"),
            ("1876", "DATE"),
        ];

        for (name, expected) in cases {
            assert_eq!(AtomicFactExtractor::infer_entity_type(name), expected);
        }
    }
}