// cordance-llm 0.1.1

//! Candidate validator. ADR 0002: hard-rule types, unknown source IDs,
//! and ungrounded claims are all rejected before an `LlmCandidate` is returned.

use std::collections::HashMap;

use crate::candidate::LlmCandidate;
use crate::ollama::LlmError;

/// Run the basic ADR 0002 checks on an LLM candidate.
///
/// Claims are examined in order; for each claim the claim type is checked
/// first, then its cited source IDs in order. The first violation found is
/// returned and nothing after it is inspected.
///
/// This function does not perform grounding checks (4-gram substring); use
/// [`validate_with_sources`] for that.
///
/// # Errors
///
/// - [`LlmError::ForbiddenClaimType`] when a claim's type reports
///   `allowed_from_llm() == false`.
/// - [`LlmError::SourceIdMissing`] when a claim cites a source ID that is not
///   present in `valid_source_ids`.
#[allow(clippy::missing_errors_doc)]
pub fn validate(candidate: &LlmCandidate, valid_source_ids: &[String]) -> Result<(), LlmError> {
    candidate.claims.iter().try_for_each(|claim| {
        if !claim.claim_type.allowed_from_llm() {
            return Err(LlmError::ForbiddenClaimType(claim.claim_type));
        }

        // Report the first cited ID that the caller did not declare as valid.
        let unknown = claim
            .source_ids
            .iter()
            .find(|id| !valid_source_ids.contains(*id));

        match unknown {
            Some(id) => Err(LlmError::SourceIdMissing(id.clone())),
            None => Ok(()),
        }
    })
}

/// Build every run of four consecutive words in `text` as a lowercase string.
///
/// Words are split on Unicode whitespace and re-joined with single spaces, so
/// the original spacing (tabs, newlines, repeated spaces) does not survive in
/// the output. Ngrams are returned in the order they occur in `text`.
///
/// The result is empty when `text` contains fewer than four words.
fn four_word_ngrams(text: &str) -> Vec<String> {
    const NGRAM_LEN: usize = 4;

    let words: Vec<&str> = text.split_whitespace().collect();

    // `windows` yields nothing when there are fewer than `NGRAM_LEN` words,
    // so short inputs fall through to an empty result without a special case.
    let mut ngrams = Vec::with_capacity(words.len().saturating_sub(NGRAM_LEN - 1));
    for window in words.windows(NGRAM_LEN) {
        ngrams.push(window.join(" ").to_lowercase());
    }
    ngrams
}

/// Validate an LLM candidate against ADR 0002 rules including substring grounding.
///
/// In addition to the checks in [`validate`], this function verifies that at
/// least one 4-word ngram from every claim appears verbatim (case-insensitive)
/// in the cited source content.
///
/// When `source_content_map` is empty the grounding check is skipped — the
/// caller did not supply source content.
#[allow(clippy::missing_errors_doc)]
pub fn validate_with_sources<S: std::hash::BuildHasher>(
    candidate: &LlmCandidate,
    valid_source_ids: &[String],
    source_content_map: &HashMap<String, String, S>,
) -> Result<(), LlmError> {
    // Run the basic checks first.
    validate(candidate, valid_source_ids)?;

    // Grounding check: skip entirely when no source content was provided.
    if source_content_map.is_empty() {
        return Ok(());
    }

    for claim in &candidate.claims {
        let ngrams = four_word_ngrams(&claim.text);

        // Claims shorter than 4 words cannot be checked; treat as passing.
        if ngrams.is_empty() {
            continue;
        }

        let mut grounded = false;

        'outer: for cited_id in &claim.source_ids {
            if let Some(content) = source_content_map.get(cited_id) {
                let content_lower = content.to_lowercase();
                for ngram in &ngrams {
                    if content_lower.contains(ngram.as_str()) {
                        grounded = true;
                        break 'outer;
                    }
                }
            }
        }

        if !grounded {
            return Err(LlmError::SourceNotGrounded {
                claim_text: claim.text.clone(),
                source_ids: claim.source_ids.clone(),
            });
        }
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::candidate::{ClaimConfidence, ClaimType, LlmClaim, SCHEMA};

    /// The single source ID every fixture treats as known.
    const KNOWN_ID: &str = "src_1";

    /// The list of valid source IDs handed to the validators.
    fn known_ids() -> Vec<String> {
        vec![KNOWN_ID.to_string()]
    }

    /// A source-content map holding `content` under [`KNOWN_ID`].
    fn source_map(content: &str) -> HashMap<String, String> {
        HashMap::from([(KNOWN_ID.to_string(), content.to_string())])
    }

    /// A candidate with exactly one claim; the candidate's input source IDs
    /// are the same as the claim's cited source IDs.
    fn make_candidate(
        claim_text: &str,
        claim_type: ClaimType,
        source_ids: Vec<String>,
    ) -> LlmCandidate {
        let claim = LlmClaim {
            text: claim_text.into(),
            claim_type,
            source_ids: source_ids.clone(),
            confidence: ClaimConfidence::Candidate,
        };

        LlmCandidate {
            schema: SCHEMA.into(),
            candidate_id: "test".into(),
            input_source_ids: source_ids,
            claims: vec![claim],
        }
    }

    /// A candidate with a fixed three-word claim text, for tests that only
    /// care about the claim type and cited IDs.
    fn candidate_with_claim(claim_type: ClaimType, source_ids: Vec<String>) -> LlmCandidate {
        make_candidate("some claim text", claim_type, source_ids)
    }

    // ── basic rule checks ────────────────────────────────────────────────────

    #[test]
    fn hard_rule_rejected() {
        let candidate = candidate_with_claim(ClaimType::HardRule, known_ids());
        let outcome = validate(&candidate, &known_ids());
        assert!(outcome.is_err());
    }

    #[test]
    fn missing_source_id_rejected() {
        let candidate =
            candidate_with_claim(ClaimType::CandidateObservation, vec!["ghost".into()]);
        let outcome = validate(&candidate, &known_ids());
        assert!(outcome.is_err());
    }

    #[test]
    fn valid_candidate_passes() {
        let candidate = candidate_with_claim(ClaimType::WorkflowInstruction, known_ids());
        let outcome = validate(&candidate, &known_ids());
        assert!(outcome.is_ok());
    }

    // ── grounding checks ─────────────────────────────────────────────────────

    #[test]
    fn grounded_claim_passes() {
        // The source contains "contracts first define data schemas" verbatim,
        // and the claim starts with the 4-word ngram "contracts first define data".
        let sources = source_map(
            "The design principle is that contracts first define data schemas before implementation.",
        );
        let candidate = make_candidate(
            "contracts first define data schemas for all services",
            ClaimType::CandidateObservation,
            known_ids(),
        );

        assert!(validate_with_sources(&candidate, &known_ids(), &sources).is_ok());
    }

    #[test]
    fn ungrounded_claim_rejected() {
        // No 4-word ngram of the claim occurs anywhere in the source.
        let sources = source_map("always use lowercase identifiers in all configuration files");
        let candidate = make_candidate(
            "never write docs for internal only components",
            ClaimType::CandidateObservation,
            known_ids(),
        );

        let result = validate_with_sources(&candidate, &known_ids(), &sources);
        assert!(
            matches!(result, Err(LlmError::SourceNotGrounded { .. })),
            "expected SourceNotGrounded, got {result:?}"
        );
    }

    #[test]
    fn empty_source_map_skips_grounding_check() {
        // An empty map bypasses grounding, so a claim that could not possibly
        // match anything still passes.
        let candidate = make_candidate(
            "never write docs for internal only components",
            ClaimType::CandidateObservation,
            known_ids(),
        );

        let result = validate_with_sources(&candidate, &known_ids(), &HashMap::new());
        assert!(result.is_ok(), "expected Ok with empty map, got {result:?}");
    }

    #[test]
    fn claim_shorter_than_four_words_skips_grounding() {
        // Three words cannot form a 4-gram, so grounding is not required even
        // though the source content is unrelated.
        let sources = source_map("totally unrelated source content");
        let candidate = make_candidate("use strict types", ClaimType::WeakPreference, known_ids());

        assert!(validate_with_sources(&candidate, &known_ids(), &sources).is_ok());
    }

    // ── ngram helper ─────────────────────────────────────────────────────────

    #[test]
    fn four_word_ngrams_produces_correct_windows() {
        let expected = vec!["a b c d", "b c d e"];
        assert_eq!(four_word_ngrams("a b c d e"), expected);
    }

    #[test]
    fn four_word_ngrams_empty_for_short_input() {
        let ngrams = four_word_ngrams("one two three");
        assert!(ngrams.is_empty());
    }
}