use crate::error::Result;
use crate::traits::LlmCallback;
/// One structured fact recovered from free-form text.
///
/// The parser in this module fills these fields from a single
/// `CATEGORY|IMPORTANCE|FACT_TEXT|ENTITIES|RELATIONSHIPS` line of LLM output.
#[derive(Clone, Debug)]
pub struct ExtractedFact {
    /// The fact itself, written as a standalone statement.
    pub text: String,
    /// Kind of statement. The parser in this module only emits `fact`,
    /// `decision`, `preference`, `note` or `lesson`.
    pub category: String,
    /// Relative weight of the fact. The parser in this module keeps it within
    /// `1..=10`, where 10 is the most important.
    pub importance: u8,
    /// Names of the entities mentioned by the fact.
    pub entities: Vec<String>,
    /// `(subject, relation, object)` triples linking entities.
    pub relationships: Vec<(String, String, String)>,
}
/// Outcome of one extraction run over a piece of text.
#[derive(Clone, Debug)]
pub struct ExtractionResult {
    /// Facts recovered from the text, in the order they were parsed.
    pub facts: Vec<ExtractedFact>,
    /// Token usage figure for the run. `extract_facts` fills this with a rough
    /// estimate (combined prompt and response bytes divided by four), or 0 when
    /// the LLM was not called.
    pub tokens_used: usize,
}
/// Extracts structured facts from `text` by prompting `llm`.
///
/// Input that is empty or only whitespace yields an empty result with
/// `tokens_used == 0` and never reaches the LLM. Otherwise the extraction
/// prompt is sent with a 4096-token generation limit and the reply is parsed
/// leniently into facts.
///
/// `tokens_used` is an estimate, not a measured count: the combined byte
/// length of prompt and response divided by four.
///
/// # Errors
///
/// Returns whatever error `llm.generate` reports.
pub fn extract_facts(text: &str, llm: &dyn LlmCallback) -> Result<ExtractionResult> {
    let (facts, tokens_used) = if text.trim().is_empty() {
        // Nothing to extract, so skip the LLM round-trip entirely.
        (Vec::new(), 0)
    } else {
        let prompt = build_extraction_prompt(text);
        let response = llm.generate(&prompt, 4096)?;
        let estimated_tokens = (prompt.len() + response.len()) / 4;
        (parse_extraction_response(&response), estimated_tokens)
    };

    Ok(ExtractionResult { facts, tokens_used })
}
/// Builds the LLM prompt that asks for `text` to be broken down into facts.
///
/// The prompt consists of fixed instructions describing the pipe-delimited
/// output format, followed by `text` inserted verbatim after the `TEXT:`
/// marker, and ends with the `EXTRACTED FACTS:` cue on its own line.
fn build_extraction_prompt(text: &str) -> String {
    // Fixed instructions; the raw string ends right after the "TEXT:" line break.
    const INSTRUCTIONS: &str = r#"Extract structured facts from the following text. For each fact, output one line in this exact format:
CATEGORY|IMPORTANCE|FACT_TEXT|ENTITIES|RELATIONSHIPS
Where:
- CATEGORY: one of: fact, decision, preference, note, lesson
- IMPORTANCE: 1-10 (10 = critical, 1 = trivial)
- FACT_TEXT: the extracted fact as a clear statement
- ENTITIES: comma-separated entity names (people, places, things)
- RELATIONSHIPS: semicolon-separated triples as "subject>relation>object"
Rules:
- Extract EVERY concrete fact, decision, preference, and noteworthy statement
- Each fact should be independently understandable
- Include entity names exactly as mentioned in the text
- For relationships, use clear relation types: married_to, works_at, citizen_of, author_of, likes, dislikes, lives_in, born_in, created_by, member_of, supersedes, etc.
- If a fact contradicts or updates a previous fact, note the relationship as "new_value>supersedes>old_value"
- If no entities or relationships, leave those fields empty
- Output ONLY the formatted lines, no explanations or headers
TEXT:
"#;
    // Cue that follows the caller's text and invites the model to answer.
    const ANSWER_CUE: &str = "\nEXTRACTED FACTS:";

    let mut prompt = String::with_capacity(INSTRUCTIONS.len() + text.len() + ANSWER_CUE.len());
    prompt.push_str(INSTRUCTIONS);
    prompt.push_str(text);
    prompt.push_str(ANSWER_CUE);
    prompt
}
/// Parses the raw LLM reply to the extraction prompt into [`ExtractedFact`]s.
///
/// Every non-empty line is expected to look like
/// `CATEGORY|IMPORTANCE|FACT_TEXT|ENTITIES|RELATIONSHIPS`. Parsing is lenient,
/// because model output is not guaranteed to follow the format:
///
/// - Blank lines and lines starting with `#` or `EXTRACTED` are skipped.
/// - A line with fewer than three `|`-separated fields is kept verbatim as a
///   `"fact"` of importance 5 if it is longer than 10 bytes, and dropped otherwise.
/// - CATEGORY is lower-cased; anything outside the known set becomes `"fact"`.
/// - IMPORTANCE is clamped to `1..=10`; a value that is not an integer becomes 5.
/// - ENTITIES are comma-separated; empty names are dropped.
/// - RELATIONSHIPS are semicolon-separated `subject>relation>object` triples; a
///   triple is kept only if it has exactly three non-empty parts.
/// - Lines whose FACT_TEXT is empty are dropped.
///
/// Facts are returned in the order their lines appear in `response`.
fn parse_extraction_response(response: &str) -> Vec<ExtractedFact> {
    const KNOWN_CATEGORIES: [&str; 5] = ["fact", "decision", "preference", "note", "lesson"];
    const DEFAULT_CATEGORY: &str = "fact";
    const DEFAULT_IMPORTANCE: u8 = 5;
    // Unformatted lines must be longer than this many bytes to be kept as a fact.
    const MIN_FALLBACK_LEN: usize = 10;

    let mut facts = Vec::new();
    for line in response.lines() {
        let line = line.trim();
        if line.is_empty() || line.starts_with('#') || line.starts_with("EXTRACTED") {
            continue;
        }

        // At most five fields: any extra `|` stays inside the RELATIONSHIPS field.
        let fields: Vec<&str> = line.splitn(5, '|').collect();
        if fields.len() < 3 {
            // Not in the requested format; keep it as a plain fact when it is
            // long enough to plausibly carry information.
            if line.len() > MIN_FALLBACK_LEN {
                facts.push(ExtractedFact {
                    text: line.to_string(),
                    category: DEFAULT_CATEGORY.to_string(),
                    importance: DEFAULT_IMPORTANCE,
                    entities: Vec::new(),
                    relationships: Vec::new(),
                });
            }
            continue;
        }

        let text = fields[2].trim();
        if text.is_empty() {
            continue;
        }

        let category = fields[0].trim().to_lowercase();
        let category = if KNOWN_CATEGORIES.iter().any(|known| *known == category) {
            category
        } else {
            DEFAULT_CATEGORY.to_string()
        };

        // Parse into a wide signed integer *before* clamping. Parsing straight
        // into `u8` makes out-of-range values such as "300" or "-4" fail to
        // parse, so they would get the default instead of the nearest bound.
        let importance = fields[1]
            .trim()
            .parse::<i64>()
            // The cast is lossless: the clamp bounds the value to 1..=10.
            .map(|n| n.clamp(1, 10) as u8)
            .unwrap_or(DEFAULT_IMPORTANCE);

        let entities: Vec<String> = fields.get(3).map_or_else(Vec::new, |raw| {
            raw.split(',')
                .map(str::trim)
                .filter(|name| !name.is_empty())
                .map(String::from)
                .collect()
        });

        let relationships: Vec<(String, String, String)> =
            fields.get(4).map_or_else(Vec::new, |raw| {
                raw.split(';')
                    .filter_map(|triple| {
                        let mut parts = triple.split('>').map(str::trim);
                        // Exactly three parts, none of them empty; the fourth
                        // `next()` rejects over-long triples such as `a>b>c>d`.
                        match (parts.next(), parts.next(), parts.next(), parts.next()) {
                            (Some(subject), Some(relation), Some(object), None)
                                if !subject.is_empty()
                                    && !relation.is_empty()
                                    && !object.is_empty() =>
                            {
                                Some((
                                    subject.to_string(),
                                    relation.to_string(),
                                    object.to_string(),
                                ))
                            }
                            _ => None,
                        }
                    })
                    .collect()
            });

        facts.push(ExtractedFact {
            text: text.to_string(),
            category,
            importance,
            entities,
            relationships,
        });
    }
    facts
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Test double that answers every request with the same fixed reply.
    struct CannedLlm(&'static str);

    impl LlmCallback for CannedLlm {
        fn generate(&self, _prompt: &str, _max_tokens: usize) -> Result<String> {
            Ok(self.0.to_string())
        }
    }

    /// Test double that fails the test as soon as it is invoked.
    struct ForbiddenLlm;

    impl LlmCallback for ForbiddenLlm {
        fn generate(&self, _: &str, _: usize) -> Result<String> {
            panic!("should not be called for empty text");
        }
    }

    #[test]
    fn build_prompt_includes_text() {
        let rendered = build_extraction_prompt("Hello world");
        for needle in ["Hello world", "CATEGORY|IMPORTANCE|FACT_TEXT"].iter() {
            assert!(rendered.contains(*needle));
        }
    }

    #[test]
    fn parse_well_formed_response() {
        let response = [
            "fact|7|User likes sushi|User,sushi|User>likes>sushi",
            "decision|8|Chose Rust over Python|Rust,Python|Rust>chosen_over>Python",
            "preference|6|Prefers dark mode||",
        ]
        .join("\n");

        let parsed = parse_extraction_response(&response);
        assert_eq!(parsed.len(), 3);

        let first = &parsed[0];
        assert_eq!(first.category, "fact");
        assert_eq!(first.importance, 7);
        assert_eq!(first.text, "User likes sushi");
        assert_eq!(first.entities, vec!["User", "sushi"]);
        assert_eq!(first.relationships.len(), 1);
        assert_eq!(
            first.relationships[0],
            ("User".into(), "likes".into(), "sushi".into())
        );

        let second = &parsed[1];
        assert_eq!(second.category, "decision");
        assert_eq!(second.importance, 8);

        assert_eq!(parsed[2].category, "preference");
    }

    #[test]
    fn parse_malformed_response() {
        // Two formatted lines, one free-form line and one blank line.
        let response = [
            "fact|5|Valid fact||",
            "this is not formatted correctly but is long enough",
            "",
            "fact|3|Another fact||",
        ]
        .join("\n");

        let parsed = parse_extraction_response(&response);
        assert_eq!(parsed.len(), 3);
    }

    #[test]
    fn parse_empty_response() {
        let parsed = parse_extraction_response("");
        assert!(parsed.is_empty());
    }

    #[test]
    fn parse_clamps_importance() {
        let parsed = parse_extraction_response("fact|15|Too high||\nfact|0|Too low||");
        // 15 is pulled down to the upper bound, 0 is raised to the lower bound.
        assert_eq!(parsed[0].importance, 10);
        assert_eq!(parsed[1].importance, 1);
    }

    #[test]
    fn extract_with_mock_llm() {
        let llm = CannedLlm("fact|7|User's favorite color is blue|User|User>favorite_color>blue");
        let outcome = extract_facts("I really like blue!", &llm).unwrap();
        assert_eq!(outcome.facts.len(), 1);
        assert_eq!(outcome.facts[0].text, "User's favorite color is blue");
        assert_eq!(outcome.facts[0].relationships[0].2, "blue");
    }

    #[test]
    fn extract_empty_text() {
        // The mock panics when called, so success proves the LLM was skipped.
        let outcome = extract_facts("", &ForbiddenLlm).unwrap();
        assert!(outcome.facts.is_empty());
    }
}