use std::collections::HashMap;
/// A single (subject, relation, object) fact parsed from one numbered line.
///
/// Equality compares every field, so two triples are equal only when they
/// came from identical lines carrying the same index.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FactTriple {
    /// The number that precedes the first `.` on the source line.
    pub index: usize,
    /// The entity the fact is about, with its original casing.
    pub subject: String,
    /// Machine-readable relation tag such as `"born_in"` or `"capital_of"`.
    pub relation: String,
    /// The value asserted for the subject, with trailing periods removed.
    pub object: String,
    /// The full line (number prefix included) the triple was parsed from.
    pub original: String,
}
/// A pair of facts that contradict each other.
///
/// As built by `extract_facts`, both facts share the same relation and the
/// same subject (compared case-insensitively) but name different objects.
#[derive(Debug, Clone)]
pub struct FactConflict {
/// The fact that sorts first by `index` within the pair.
pub old_fact: FactTriple,
/// The fact that sorts second by `index` and disagrees with `old_fact`.
pub new_fact: FactTriple,
}
pub fn extract_facts(text: &str) -> (Vec<FactTriple>, Vec<FactConflict>) {
let lines: Vec<&str> = text
.lines()
.map(|l| l.trim())
.filter(|l| {
!l.is_empty()
&& l.chars()
.next()
.map(|c| c.is_ascii_digit())
.unwrap_or(false)
})
.collect();
let mut triples = Vec::new();
for line in &lines {
if let Some(triple) = parse_fact_line(line) {
triples.push(triple);
}
}
let mut by_key: HashMap<(String, String), Vec<&FactTriple>> = HashMap::new();
for triple in &triples {
let key = (triple.subject.to_lowercase(), triple.relation.clone());
by_key.entry(key).or_default().push(triple);
}
let mut conflicts = Vec::new();
for facts in by_key.values() {
if facts.len() > 1 {
let mut sorted: Vec<&&FactTriple> = facts.iter().collect();
sorted.sort_by_key(|f| f.index);
for pair in sorted.windows(2) {
if pair[0].object.to_lowercase() != pair[1].object.to_lowercase() {
conflicts.push(FactConflict {
old_fact: (*pair[0]).clone(),
new_fact: (*pair[1]).clone(),
});
}
}
}
}
(triples, conflicts)
}
/// Parses one line of the form `<index>. <statement>` into a [`FactTriple`].
///
/// The text before the first `.` must parse as a `usize`; otherwise `None` is
/// returned. The statement is then matched, in this order, against:
///
/// 1. infix relation phrases (`"X was born in the city of Y"`),
/// 2. prefixed templates (`"The capital of X is Y"`),
/// 3. the possessive form `"X's child is Y"`,
/// 4. a generic `"X is Y"` / `"X was Y"` fallback tagged `"is_related"`.
///
/// A candidate match is accepted only when both the subject and the object
/// are non-empty; otherwise matching continues with the next candidate.
/// Trailing periods are stripped from the object.
///
/// Returns `None` when the line has no numeric prefix or no pattern yields a
/// non-empty subject and object.
fn parse_fact_line(line: &str) -> Option<FactTriple> {
    let dot_pos = line.find('.')?;
    let index: usize = line[..dot_pos].trim().parse().ok()?;
    let statement = line[dot_pos + 1..].trim();

    // Builds the triple, rejecting matches that leave either side empty.
    let build = |subject: &str, relation: &str, object: &str| -> Option<FactTriple> {
        if subject.is_empty() || object.is_empty() {
            return None;
        }
        Some(FactTriple {
            index,
            subject: subject.to_string(),
            relation: relation.to_string(),
            object: object.to_string(),
            original: line.to_string(),
        })
    };

    // "<subject><phrase><object>" patterns, tried in list order.
    let relation_phrases = [
        (" is married to ", "married_to"),
        (" is a citizen of ", "citizen_of"),
        (" was born in the city of ", "born_in"),
        (" died in the city of ", "died_in"),
        (" plays the position of ", "plays_position"),
        (" was founded by ", "founded_by"),
        (" is famous for ", "famous_for"),
        (" was developed by ", "developed_by"),
        (" is located in the continent of ", "located_in_continent"),
        (" is located in the country of ", "located_in_country"),
        (" is associated with the sport of ", "sport_of"),
        (" was created by ", "created_by"),
        (" was created in the country of ", "created_in_country"),
        (" worked in the city of ", "worked_in"),
        (" was performed by ", "performed_by"),
        (" is employed by ", "employed_by"),
        (" speaks the language of ", "speaks_language"),
        (" is affiliated with the religion of ", "religion_of"),
        (" is a member of ", "member_of"),
        (" was written in the language of ", "written_in_language"),
        (" was founded in the city of ", "founded_in"),
    ];
    for &(phrase, rel_type) in &relation_phrases {
        if let Some(pos) = statement.find(phrase) {
            let subject = statement[..pos].trim().trim_end_matches('.');
            let object = statement[pos + phrase.len()..]
                .trim()
                .trim_end_matches('.');
            if let Some(triple) = build(subject, rel_type, object) {
                return Some(triple);
            }
        }
    }

    // "<prefix><subject><middle><object>" patterns, tried in list order.
    let the_patterns = [
        ("The author of ", " is ", "author_of"),
        ("The capital of ", " is ", "capital_of"),
        ("The chairperson of ", " is ", "chairperson_of"),
        ("The chief executive officer of ", " is ", "ceo_of"),
        ("The director of ", " is ", "director_of"),
        (
            "The headquarters of ",
            " is located in the city of ",
            "headquarters_in",
        ),
        ("The official language of ", " is ", "official_language"),
        ("The type of music that ", " plays is ", "music_genre"),
        (
            "The name of the current head of the ",
            " government is ",
            "government_head",
        ),
        // NOTE(review): the misspelling "univeristy" is kept on purpose; it
        // presumably mirrors the templates used by the upstream data. Confirm
        // against the input source before removing it.
        ("The univeristy where ", " was educated is ", "educated_at"),
        // The correctly spelled form is accepted too, so such lines are not
        // mislabelled as "is_related" by the generic fallback.
        ("The university where ", " was educated is ", "educated_at"),
        ("The company that produced ", " is ", "produced_by"),
        ("The genre of ", " is ", "genre_of"),
        ("The country of origin of ", " is ", "country_of_origin"),
    ];
    for &(prefix, middle, rel_type) in &the_patterns {
        if let Some(rest) = statement.strip_prefix(prefix) {
            if let Some(mid_pos) = rest.find(middle) {
                let subject = rest[..mid_pos].trim();
                let object = rest[mid_pos + middle.len()..]
                    .trim()
                    .trim_end_matches('.');
                if let Some(triple) = build(subject, rel_type, object) {
                    return Some(triple);
                }
            }
        }
    }

    // Possessive form. The subject is the text before "'s", and the relation
    // tag stays "child_of" as callers may already depend on it.
    const CHILD_PHRASE: &str = "'s child is ";
    if let Some(pos) = statement.find(CHILD_PHRASE) {
        let subject = statement[..pos].trim();
        let object = statement[pos + CHILD_PHRASE.len()..]
            .trim()
            .trim_end_matches('.');
        // Same emptiness rule as every other branch: a dangling
        // "'s child is X" must not produce a triple with an empty subject.
        if let Some(triple) = build(subject, "child_of", object) {
            return Some(triple);
        }
    }

    // Generic fallback. Subjects of two bytes or fewer are rejected.
    for sep in [" is ", " was "] {
        if let Some(pos) = statement.find(sep) {
            let subject = statement[..pos].trim();
            let object = statement[pos + sep.len()..].trim().trim_end_matches('.');
            if subject.len() > 2 {
                if let Some(triple) = build(subject, "is_related", object) {
                    return Some(triple);
                }
            }
        }
    }

    None
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Joins individual fact lines into the newline-separated text that
    /// `extract_facts` consumes.
    fn numbered(lines: &[&str]) -> String {
        lines.join("\n")
    }

    /// Three well-formed lines each yield a triple with the expected
    /// subject, relation and object.
    #[test]
    fn parse_basic_facts() {
        let input = numbered(&[
            "0. Thomas Kyd was born in the city of London.",
            "1. The chairperson of Fatah is Mahmoud Abbas.",
            "2. Amy Winehouse died in the city of Camden Town.",
        ]);
        let (facts, _conflicts) = extract_facts(&input);
        assert_eq!(facts.len(), 3);

        let born = &facts[0];
        assert_eq!(born.subject, "Thomas Kyd");
        assert_eq!(born.relation, "born_in");
        assert_eq!(born.object, "London");

        let chair = &facts[1];
        assert_eq!(chair.subject, "Fatah");
        assert_eq!(chair.relation, "chairperson_of");
        assert_eq!(chair.object, "Mahmoud Abbas");
    }

    /// Two facts with the same subject and relation but different objects
    /// are reported as one conflict, older fact first.
    #[test]
    fn detect_conflicts() {
        let input = numbered(&[
            "0. The chairperson of Fatah is Mahmoud Abbas.",
            "1. Some other fact.",
            "2. The chairperson of Fatah is Moshe Kahlon.",
        ]);
        let (_facts, found) = extract_facts(&input);
        assert_eq!(found.len(), 1);

        let conflict = &found[0];
        assert_eq!(conflict.old_fact.object, "Mahmoud Abbas");
        assert_eq!(conflict.new_fact.object, "Moshe Kahlon");
        assert!(conflict.new_fact.index > conflict.old_fact.index);
    }

    /// Repeating the same fact must not be flagged as a contradiction.
    #[test]
    fn no_conflict_same_value() {
        let input = numbered(&[
            "0. Amy Winehouse died in the city of Camden Town.",
            "1. Amy Winehouse died in the city of Camden Town.",
        ]);
        let (_facts, found) = extract_facts(&input);
        assert_eq!(found.len(), 0, "same value should not be a conflict");
    }
}