lint-ai 0.1.5

Semantic wiki and docs linting for contradictions, stale claims, orphan pages, and missing cross-references
Documentation
use crate::index::{Claim, DocRecord, SectionChunk};

/// Output of a [`ClaimExtractor`] run over a single document record.
#[derive(Debug, Clone, Default)]
pub struct ExtractedClaims {
    /// The claims produced by the extractor.
    pub claims: Vec<Claim>,
}

/// Strategy interface for deriving [`Claim`]s from a [`DocRecord`].
pub trait ClaimExtractor {
    /// Produces the claims this extractor derives from `record`.
    fn extract(&self, record: &DocRecord) -> ExtractedClaims;
    /// Returns a static name for this extractor.
    ///
    /// NOTE(review): presumably used to label results or select an extractor;
    /// confirm against callers.
    fn name(&self) -> &'static str;
}

/// Extractor that reuses the record's existing `top_claims` and adds
/// fixed-confidence claims derived from each section chunk's heading,
/// timestamp, entities and terms.
#[derive(Debug, Clone, Default)]
pub struct ConservativeClaimExtractor;

impl ConservativeClaimExtractor {
    /// Appends a clone of every entry in `record.top_claims` that passes
    /// `is_valid_claim` to `out`, preserving their original order.
    fn from_existing_claims(record: &DocRecord, out: &mut Vec<Claim>) {
        for existing in &record.top_claims {
            if is_valid_claim(existing) {
                out.push(existing.clone());
            }
        }
    }

    /// Appends the claims derived by `chunk_claims` for each section chunk of
    /// `record` to `out`, in chunk order.
    fn from_chunks(record: &DocRecord, out: &mut Vec<Claim>) {
        out.extend(
            record
                .section_chunks
                .iter()
                .flat_map(|section| chunk_claims(record, section)),
        );
    }
}

impl ClaimExtractor for ConservativeClaimExtractor {
    fn extract(&self, record: &DocRecord) -> ExtractedClaims {
        let mut claims = Vec::new();
        Self::from_existing_claims(record, &mut claims);
        Self::from_chunks(record, &mut claims);
        dedupe_claims(&mut claims);
        ExtractedClaims { claims }
    }

    fn name(&self) -> &'static str {
        "conservative"
    }
}

/// Derives fixed-confidence claims from the signals attached to one section chunk.
///
/// Claims are emitted in this order:
///
/// 1. `mentions` (confidence 0.55): subject is the record's probable topic and
///    object is the chunk heading. Emitted when the topic is non-blank and
///    occurs, case-insensitively, in the chunk heading or content.
/// 2. `timestamp` (confidence 0.5): subject is the chunk heading and object is
///    the chunk's own timestamp. A missing *or blank* chunk timestamp falls
///    back to the record-level timestamp; no claim is emitted when both are
///    missing or blank.
/// 3. `mentions_entity` (confidence 0.45): one per entry in `chunk.key_entities`.
/// 4. `important_term` (confidence 0.4): one per entry in `chunk.important_terms`.
///
/// Claims with blank fields (for example when the heading is empty) are not
/// filtered here.
fn chunk_claims(record: &DocRecord, chunk: &SectionChunk) -> Vec<Claim> {
    let mut out = Vec::new();

    if let Some(topic) = record
        .probable_topic
        .as_ref()
        .filter(|topic| !topic.trim().is_empty())
    {
        // Lower-case the topic once; it is matched against both heading and content.
        let needle = topic.to_lowercase();
        let mentioned = chunk.heading.to_lowercase().contains(needle.as_str())
            || chunk.content.to_lowercase().contains(needle.as_str());
        if mentioned {
            out.push(Claim {
                subject: topic.clone(),
                predicate: "mentions".to_string(),
                object: chunk.heading.clone(),
                confidence: 0.55,
            });
        }
    }

    // Blank values are discarded *before* the fallback so that a whitespace-only
    // chunk timestamp does not mask a usable record-level timestamp.
    let timestamp = chunk
        .timestamp
        .as_ref()
        .filter(|ts| !ts.trim().is_empty())
        .or_else(|| {
            record
                .timestamp
                .as_ref()
                .filter(|ts| !ts.trim().is_empty())
        });
    if let Some(ts) = timestamp {
        out.push(Claim {
            subject: chunk.heading.clone(),
            predicate: "timestamp".to_string(),
            object: ts.clone(),
            confidence: 0.5,
        });
    }

    out.extend(chunk.key_entities.iter().map(|entity| Claim {
        subject: chunk.heading.clone(),
        predicate: "mentions_entity".to_string(),
        object: entity.clone(),
        confidence: 0.45,
    }));

    out.extend(chunk.important_terms.iter().map(|term| Claim {
        subject: chunk.heading.clone(),
        predicate: "important_term".to_string(),
        object: term.clone(),
        confidence: 0.4,
    }));

    out
}

/// Returns `true` when none of the claim's subject, predicate or object is
/// blank (empty or whitespace-only) and its confidence is a finite number.
fn is_valid_claim(claim: &Claim) -> bool {
    let has_blank_field = [&claim.subject, &claim.predicate, &claim.object]
        .iter()
        .any(|field| field.trim().is_empty());

    !has_blank_field && claim.confidence.is_finite()
}

/// Normalizes `claims` in place: drops entries rejected by `is_valid_claim`,
/// sorts the rest by subject, predicate, object and then ascending confidence,
/// and collapses entries sharing the same (subject, predicate, object) triple.
///
/// Deduplication keeps the first entry of each run, so after the ascending
/// sort the survivor is the duplicate with the lowest confidence.
/// NOTE(review): confirm that keeping the lowest (rather than the highest)
/// confidence is the intended policy.
fn dedupe_claims(claims: &mut Vec<Claim>) {
    /// The triple that identifies a claim regardless of its confidence.
    fn identity(claim: &Claim) -> (&str, &str, &str) {
        (
            claim.subject.as_str(),
            claim.predicate.as_str(),
            claim.object.as_str(),
        )
    }

    claims.retain(is_valid_claim);

    claims.sort_by(|left, right| {
        identity(left).cmp(&identity(right)).then_with(|| {
            left.confidence
                .partial_cmp(&right.confidence)
                .unwrap_or(std::cmp::Ordering::Equal)
        })
    });

    // `dedup_by` hands over the later element first and removes it on `true`.
    claims.dedup_by(|later, earlier| identity(later) == identity(earlier));
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::index::{Claim, Provenance, SectionChunk};
    use crate::tier1::{RankedTerm, Tier1Entity};

    /// Builds a fixture document with one "Overview" chunk, one pre-existing
    /// `works_at` claim, and matching timestamps on the record, the chunk and
    /// the provenance.
    fn sample_record() -> DocRecord {
        let source = String::from("source://doc-1");
        let stamp = Some(String::from("2024-05-01"));
        let body = String::from("Alice works at Acme");

        let overview = SectionChunk {
            chunk_id: String::from("doc-1::chunk-0"),
            heading: String::from("Overview"),
            content: body.clone(),
            start_line: 1,
            end_line: 1,
            timestamp: stamp.clone(),
            key_entities: vec![String::from("Alice")],
            important_terms: vec![String::from("acme")],
        };

        let alice = Tier1Entity {
            text: String::from("Alice"),
            label: String::from("PROPN"),
            start: 0,
            end: 5,
            score: Some(1.0),
            source: String::from("heuristic"),
        };

        let acme_term = RankedTerm {
            term: String::from("acme"),
            score: 1.0,
            source: String::from("yake"),
        };

        let employment = Claim {
            subject: String::from("Alice"),
            predicate: String::from("works_at"),
            object: String::from("Acme"),
            confidence: 0.9,
        };

        let provenance = Provenance {
            source: source.clone(),
            timestamp: stamp.clone(),
            ner_provider: String::from("heuristic"),
            term_ranker: String::from("yake"),
            index_version: String::from("v1"),
        };

        DocRecord {
            doc_id: String::from("doc-1"),
            source,
            content: body,
            timestamp: stamp,
            doc_length: 20,
            author_agent: None,
            group_id: Some(String::from("group-a")),
            probable_topic: Some(String::from("Alice")),
            doc_type_guess: Some(String::from("note")),
            headings: vec![String::from("Overview")],
            doc_links: vec![],
            temporal_terms: vec![],
            key_entities: vec![alice],
            important_terms: vec![acme_term],
            section_chunks: vec![overview],
            embedding: None,
            top_claims: vec![employment],
            provenance,
        }
    }

    /// The extractor must surface both the pre-existing claim and the claims
    /// derived from the chunk's entity, term and timestamp signals.
    #[test]
    fn conservative_extractor_uses_existing_claims_and_chunk_signals() {
        let extracted = ConservativeClaimExtractor.extract(&sample_record());

        // (predicate, object) pairs that must each appear at least once.
        let expected_pairs = [
            ("works_at", "Acme"),
            ("mentions_entity", "Alice"),
            ("important_term", "acme"),
            ("timestamp", "2024-05-01"),
        ];

        for &(predicate, object) in expected_pairs.iter() {
            let found = extracted
                .claims
                .iter()
                .any(|c| c.predicate == predicate && c.object == object);
            assert!(found, "missing claim: {} -> {}", predicate, object);
        }
    }
}