use std::collections::HashMap;
use crate::candidate::LlmCandidate;
use crate::ollama::LlmError;
/// Checks the structural rules every LLM-produced candidate must satisfy.
///
/// Claims are inspected in order, and checking stops at the first violation.
///
/// # Errors
///
/// * [`LlmError::ForbiddenClaimType`] when a claim's type reports that it is
///   not allowed to come from the LLM (`allowed_from_llm()` is `false`).
/// * [`LlmError::SourceIdMissing`] when a claim cites a source id that is not
///   listed in `valid_source_ids`; the error carries the offending id.
pub fn validate(candidate: &LlmCandidate, valid_source_ids: &[String]) -> Result<(), LlmError> {
    candidate.claims.iter().try_for_each(|claim| {
        if !claim.claim_type.allowed_from_llm() {
            return Err(LlmError::ForbiddenClaimType(claim.claim_type));
        }

        // Report the first cited id that the caller did not supply.
        let unknown_id = claim
            .source_ids
            .iter()
            .find(|cited| !valid_source_ids.contains(*cited));

        match unknown_id {
            Some(missing) => Err(LlmError::SourceIdMissing(missing.clone())),
            None => Ok(()),
        }
    })
}
/// Returns every run of four consecutive whitespace-separated words in `text`.
///
/// Each n-gram is the four words joined by single spaces and lowercased.
/// Texts with fewer than four words yield an empty vector.
fn four_word_ngrams(text: &str) -> Vec<String> {
    const NGRAM_LEN: usize = 4;

    let words: Vec<&str> = text.split_whitespace().collect();

    // `windows` yields nothing when the slice is shorter than the window,
    // so short inputs naturally produce an empty result.
    words
        .windows(NGRAM_LEN)
        .map(|window| window.join(" ").to_lowercase())
        .collect()
}
#[allow(clippy::missing_errors_doc)]
pub fn validate_with_sources<S: std::hash::BuildHasher>(
candidate: &LlmCandidate,
valid_source_ids: &[String],
source_content_map: &HashMap<String, String, S>,
) -> Result<(), LlmError> {
validate(candidate, valid_source_ids)?;
if source_content_map.is_empty() {
return Ok(());
}
for claim in &candidate.claims {
let ngrams = four_word_ngrams(&claim.text);
if ngrams.is_empty() {
continue;
}
let mut grounded = false;
'outer: for cited_id in &claim.source_ids {
if let Some(content) = source_content_map.get(cited_id) {
let content_lower = content.to_lowercase();
for ngram in &ngrams {
if content_lower.contains(ngram.as_str()) {
grounded = true;
break 'outer;
}
}
}
}
if !grounded {
return Err(LlmError::SourceNotGrounded {
claim_text: claim.text.clone(),
source_ids: claim.source_ids.clone(),
});
}
}
Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::candidate::{ClaimConfidence, ClaimType, LlmClaim, SCHEMA};

    /// Builds a candidate holding exactly one claim with the given text,
    /// type and cited source ids.
    fn make_candidate(
        claim_text: &str,
        claim_type: ClaimType,
        source_ids: Vec<String>,
    ) -> LlmCandidate {
        let only_claim = LlmClaim {
            text: claim_text.into(),
            claim_type,
            source_ids: source_ids.clone(),
            confidence: ClaimConfidence::Candidate,
        };
        LlmCandidate {
            schema: SCHEMA.into(),
            candidate_id: "test".into(),
            input_source_ids: source_ids,
            claims: vec![only_claim],
        }
    }

    /// Single-claim candidate with placeholder text, for tests that only
    /// care about the claim type and cited ids.
    fn candidate_with_claim(claim_type: ClaimType, source_ids: Vec<String>) -> LlmCandidate {
        make_candidate("some claim text", claim_type, source_ids)
    }

    /// The list of ids the validator is told exist: just `src_1`.
    fn known_ids() -> Vec<String> {
        vec!["src_1".to_string()]
    }

    /// Source map containing only `src_1` with the given content.
    fn src_1_with(content: &str) -> HashMap<String, String> {
        let mut sources = HashMap::new();
        sources.insert("src_1".to_string(), content.to_string());
        sources
    }

    #[test]
    fn hard_rule_rejected() {
        let candidate = candidate_with_claim(ClaimType::HardRule, known_ids());
        assert!(validate(&candidate, &known_ids()).is_err());
    }

    #[test]
    fn missing_source_id_rejected() {
        let candidate =
            candidate_with_claim(ClaimType::CandidateObservation, vec!["ghost".into()]);
        assert!(validate(&candidate, &known_ids()).is_err());
    }

    #[test]
    fn valid_candidate_passes() {
        let candidate = candidate_with_claim(ClaimType::WorkflowInstruction, known_ids());
        assert!(validate(&candidate, &known_ids()).is_ok());
    }

    #[test]
    fn grounded_claim_passes() {
        let sources = src_1_with(
            "The design principle is that contracts first define data schemas before implementation.",
        );
        let candidate = make_candidate(
            "contracts first define data schemas for all services",
            ClaimType::CandidateObservation,
            known_ids(),
        );
        assert!(validate_with_sources(&candidate, &known_ids(), &sources).is_ok());
    }

    #[test]
    fn ungrounded_claim_rejected() {
        let sources = src_1_with("always use lowercase identifiers in all configuration files");
        let candidate = make_candidate(
            "never write docs for internal only components",
            ClaimType::CandidateObservation,
            known_ids(),
        );
        let result = validate_with_sources(&candidate, &known_ids(), &sources);
        assert!(
            matches!(result, Err(LlmError::SourceNotGrounded { .. })),
            "expected SourceNotGrounded, got {result:?}"
        );
    }

    #[test]
    fn empty_source_map_skips_grounding_check() {
        let candidate = make_candidate(
            "never write docs for internal only components",
            ClaimType::CandidateObservation,
            known_ids(),
        );
        let result = validate_with_sources(&candidate, &known_ids(), &HashMap::new());
        assert!(result.is_ok(), "expected Ok with empty map, got {result:?}");
    }

    #[test]
    fn claim_shorter_than_four_words_skips_grounding() {
        let sources = src_1_with("totally unrelated source content");
        let candidate = make_candidate("use strict types", ClaimType::WeakPreference, known_ids());
        assert!(validate_with_sources(&candidate, &known_ids(), &sources).is_ok());
    }

    #[test]
    fn four_word_ngrams_produces_correct_windows() {
        let ngrams = four_word_ngrams("a b c d e");
        assert_eq!(ngrams, vec!["a b c d", "b c d e"]);
    }

    #[test]
    fn four_word_ngrams_empty_for_short_input() {
        assert!(four_word_ngrams("one two three").is_empty());
    }
}