use serde::{Deserialize, Serialize};
/// A sentence-level claim together with the heuristic signals computed for it.
///
/// The field descriptions below reflect how `extract_claims` populates the struct.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Claim {
/// The sentence text with leading and trailing whitespace removed.
pub text: String,
/// Position of the sentence in the list returned by `split_sentences`,
/// assigned before blank entries are filtered out.
pub sentence_idx: usize,
/// Result of `contains_verifiable_content` for this sentence.
pub is_verifiable: bool,
/// Result of `compute_specificity` for this sentence (that function returns a value in `0.0..=1.0`).
pub specificity: f64,
/// Result of `contains_hedging` for this sentence.
pub is_hedged: bool,
}
/// Breaks `text` into sentences and builds one [`Claim`] per non-blank sentence.
///
/// Each claim records the trimmed sentence, its index in the `split_sentences`
/// output (blank sentences are skipped but still consume an index), and the
/// results of the verifiability, specificity and hedging heuristics.
pub fn extract_claims(text: &str) -> Vec<Claim> {
    let mut claims = Vec::new();

    for (position, raw_sentence) in split_sentences(text).into_iter().enumerate() {
        let cleaned = raw_sentence.trim();
        if cleaned.is_empty() {
            continue;
        }

        // Evaluate the heuristics in a fixed order before assembling the claim.
        let verifiable = contains_verifiable_content(cleaned);
        let specificity_score = compute_specificity(cleaned);
        let hedged = contains_hedging(cleaned);

        claims.push(Claim {
            text: cleaned.to_string(),
            sentence_idx: position,
            is_verifiable: verifiable,
            specificity: specificity_score,
            is_hedged: hedged,
        });
    }

    claims
}
/// Splits `text` into trimmed sentences using simple punctuation heuristics.
///
/// A sentence ends at `.`, `!` or `?` when all of the following hold:
/// - the text accumulated since the last split is longer than 10 bytes
///   (this keeps very short fragments attached to what follows),
/// - the accumulated text does not end with one of a few known abbreviations,
/// - the `.` is not a decimal point, i.e. it is not directly between two ASCII digits.
///
/// Any non-blank remainder after the last terminator is returned as a final sentence.
fn split_sentences(text: &str) -> Vec<String> {
    /// Abbreviations whose trailing period must not end a sentence.
    const ABBREVIATIONS: &[&str] = &["Dr.", "Mr.", "Mrs.", "vs.", "etc.", "e.g.", "i.e."];
    /// A candidate sentence must be longer than this many bytes (untrimmed) to be split off.
    const MIN_SENTENCE_BYTES: usize = 10;

    let mut sentences = Vec::new();
    let mut current = String::new();
    let mut previous: Option<char> = None;
    let mut chars = text.chars().peekable();

    while let Some(ch) = chars.next() {
        current.push(ch);

        // "3.14" must stay in one piece: a period flanked by digits is a decimal point.
        let is_decimal_point = ch == '.'
            && previous.is_some_and(|p| p.is_ascii_digit())
            && chars.peek().is_some_and(|next| next.is_ascii_digit());
        previous = Some(ch);

        let is_terminator = matches!(ch, '.' | '!' | '?');
        if !is_terminator || is_decimal_point || current.len() <= MIN_SENTENCE_BYTES {
            continue;
        }

        let trimmed = current.trim();
        if ABBREVIATIONS.iter().any(|abbr| trimmed.ends_with(abbr)) {
            continue;
        }

        sentences.push(trimmed.to_string());
        current.clear();
    }

    let remainder = current.trim();
    if !remainder.is_empty() {
        sentences.push(remainder.to_string());
    }
    sentences
}
/// Heuristically decides whether `text` mentions something that could be checked.
///
/// Returns `true` when at least one of these signals is present:
/// - an ASCII digit,
/// - a word after the first one that is longer than one byte, starts with an
///   uppercase character and is not in the `is_common_sentence_starter` list,
/// - a standalone four-digit number or an English month name,
/// - a `%` sign or one of a few quantity/unit words (case-sensitive substring match).
fn contains_verifiable_content(text: &str) -> bool {
    // The pattern is constant, so compile it once per process instead of on
    // every call. A failed compilation is cached as `None` and treated as
    // "no date found", which is what the per-call version did as well.
    static DATE_PATTERN: std::sync::OnceLock<Option<regex::Regex>> = std::sync::OnceLock::new();

    /// Substrings that indicate a measurable quantity.
    const QUANTITY_MARKERS: &[&str] = &[
        "million",
        "billion",
        "thousand",
        "kilometers",
        "miles",
        "years",
    ];

    let has_numbers = text.chars().any(|c| c.is_ascii_digit());

    // The first word is skipped because it is capitalized in any ordinary sentence.
    let has_proper_nouns = text.split_whitespace().skip(1).any(|w| {
        w.len() > 1
            && w.chars().next().is_some_and(|c| c.is_uppercase())
            && !is_common_sentence_starter(w)
    });

    let has_dates = DATE_PATTERN
        .get_or_init(|| {
            regex::Regex::new(r"\b\d{4}\b|\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\b")
                .ok()
        })
        .as_ref()
        .is_some_and(|re| re.is_match(text));

    let has_quantities =
        text.contains('%') || QUANTITY_MARKERS.iter().any(|&marker| text.contains(marker));

    has_numbers || has_proper_nouns || has_dates || has_quantities
}
/// Returns `true` when `word` is exactly (case-sensitively) one of a fixed set
/// of capitalized function words such as "The", "However" or "In".
///
/// Callers use this to avoid counting such words as proper nouns.
fn is_common_sentence_starter(word: &str) -> bool {
    const STARTERS: &[&str] = &[
        // Determiners and pronouns.
        "The", "This", "That", "These", "Those", "It", "They", "He", "She", "We",
        // Discourse connectives.
        "However", "Although", "Moreover", "Furthermore", "Additionally",
        // Prepositions.
        "In", "On", "At", "For", "With",
        // Conjunctions.
        "But", "And", "Or", "As", "If",
    ];

    STARTERS.contains(&word)
}
/// Scores how specific `text` is, returning a value in `0.0..=1.0`.
///
/// Contributions:
/// - `+0.3` if the text contains any ASCII digit,
/// - up to `+0.2` for capitalized words after the first one that are not in the
///   `is_common_sentence_starter` list (saturating at three such words),
/// - `-0.15` for every vague word ("some", "often", "roughly", ...),
/// - `+0.15` if any definitive word ("exactly", "never", ...) is present,
/// - `+0.1` if the text has between 10 and 29 whitespace-separated words.
///
/// When none of the first four signals fires, a neutral `0.3` is returned
/// regardless of length. Otherwise the sum is clamped to `0.0..=1.0`.
fn compute_specificity(text: &str) -> f64 {
    /// Words that make a statement less precise.
    const VAGUE_WORDS: &[&str] = &[
        "some",
        "many",
        "several",
        "few",
        "various",
        "numerous",
        "often",
        "sometimes",
        "generally",
        "typically",
        "usually",
        "about",
        "approximately",
        "roughly",
        "around",
        "nearly",
    ];
    /// Words that make a statement more categorical.
    const DEFINITIVE_WORDS: &[&str] = &[
        "exactly",
        "precisely",
        "specifically",
        "always",
        "never",
        "every",
    ];
    /// Score returned when no signal is detected at all.
    const NEUTRAL_SCORE: f64 = 0.3;

    let words: Vec<&str> = text.split_whitespace().collect();

    // Whitespace tokens keep adjacent punctuation ("often.", "exactly,"), which
    // would never equal a dictionary entry. Strip it before the lookups below.
    let normalized: Vec<String> = words
        .iter()
        .map(|w| w.trim_matches(|c: char| !c.is_alphanumeric()).to_lowercase())
        .collect();

    let mut score: f64 = 0.0;
    let mut any_signal = false;

    if text.chars().any(|c| c.is_ascii_digit()) {
        score += 0.3;
        any_signal = true;
    }

    // The first word is skipped because it is capitalized in any ordinary sentence.
    let proper_noun_count = words
        .iter()
        .skip(1)
        .filter(|w| {
            w.len() > 1
                && w.chars().next().is_some_and(|c| c.is_uppercase())
                && !is_common_sentence_starter(w)
        })
        .count();
    if proper_noun_count > 0 {
        score += 0.2 * (proper_noun_count as f64).min(3.0) / 3.0;
        any_signal = true;
    }

    let vague_count = normalized
        .iter()
        .filter(|w| VAGUE_WORDS.contains(&w.as_str()))
        .count();
    if vague_count > 0 {
        score -= 0.15 * vague_count as f64;
        any_signal = true;
    }

    // Definitive words give a flat bonus no matter how many there are.
    if normalized
        .iter()
        .any(|w| DEFINITIVE_WORDS.contains(&w.as_str()))
    {
        score += 0.15;
        any_signal = true;
    }

    // Length contributes to the score but does not count as a signal on its own.
    if (10..30).contains(&words.len()) {
        score += 0.1;
    }

    if !any_signal {
        return NEUTRAL_SCORE;
    }
    score.clamp(0.0, 1.0)
}
/// Reports whether `text` contains any known hedging phrase.
///
/// The check is a case-insensitive substring search: `text` is lowercased and
/// compared against a fixed list of lowercase phrases.
pub fn contains_hedging(text: &str) -> bool {
    const HEDGE_PHRASES: &[&str] = &[
        "might be",
        "could be",
        "may be",
        "possibly",
        "perhaps",
        "likely",
        "probably",
        "it seems",
        "it appears",
        "reportedly",
        "allegedly",
        "it is believed",
        "it is thought",
        "some suggest",
        "arguably",
        "it is possible",
        "there is evidence",
        "to some extent",
        "not entirely clear",
        "remains uncertain",
        "is debated",
        "as far as we know",
        "to the best of",
        "i think",
        "i believe",
    ];

    let normalized = text.to_lowercase();
    for &phrase in HEDGE_PHRASES {
        if normalized.contains(phrase) {
            return true;
        }
    }
    false
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Three plain sentences yield three claims; the first one is verifiable
    /// and scores above the neutral specificity.
    #[test]
    fn extract_basic_claims() {
        let input = "Albert Einstein was born in 1879 in Ulm, Germany. He developed the theory of relativity. The sky is blue.";
        let extracted = extract_claims(input);
        assert_eq!(extracted.len(), 3);

        let first = &extracted[0];
        assert!(first.is_verifiable);
        assert!(first.specificity > 0.3);
    }

    /// Hedging phrases are detected; a plain factual statement is not flagged.
    #[test]
    fn detect_hedging() {
        let hedged_a = "This might be related to climate change.";
        let hedged_b = "It is believed that the universe is expanding.";
        let plain = "The Earth orbits the Sun.";

        assert!(contains_hedging(hedged_a));
        assert!(contains_hedging(hedged_b));
        assert!(!contains_hedging(plain));
    }

    /// A sentence with numbers and names outranks one built from vague words.
    #[test]
    fn specificity_scoring() {
        let specific = compute_specificity(
            "Einstein published 300 papers in 1905 at the University of Zurich.",
        );
        let vague = compute_specificity("Some researchers have found various interesting results.");

        assert!(
            specific > vague,
            "specific={specific} should be > vague={vague}"
        );
    }

    /// Empty input produces no claims.
    #[test]
    fn empty_text() {
        assert!(extract_claims("").is_empty());
    }
}