use rand::seq::SliceRandom;
use tracing::{debug, info};
use crate::config::LlmConfig;
use crate::error::Result;
use crate::index::parse::pdf::PdfPage;
use super::types::{ErrorType, TocEntry, VerificationError, VerificationReport};
use crate::llm::LlmClient;
/// Settings that control how [`IndexVerifier`] samples and checks TOC entries.
#[derive(Debug, Clone)]
pub struct VerifierConfig {
    /// How many entries to verify.
    ///
    /// When this is `Some(n)` and more than `n` entries carry a physical
    /// page, a random sample of `n` of them is checked. When it is `None`,
    /// or `n` is not smaller than the number of such entries, all of them
    /// are checked.
    pub sample_size: Option<usize>,

    /// Configuration converted into the `LlmClient` that answers the
    /// "does this title appear on this page" questions.
    pub llm_config: LlmConfig,

    /// Accuracy threshold.
    ///
    /// NOTE(review): this field is not read anywhere in this file;
    /// presumably callers compare it against the report's accuracy — confirm.
    pub accuracy_threshold: f32,
}
impl Default for VerifierConfig {
fn default() -> Self {
Self {
sample_size: Some(10),
llm_config: LlmConfig::default(),
accuracy_threshold: 0.6,
}
}
}
/// Checks table-of-contents entries against extracted page text by asking
/// an LLM whether each entry's title appears on the page it points to.
pub struct IndexVerifier {
    /// Sampling and LLM settings this verifier was built with.
    config: VerifierConfig,

    /// Client used for the JSON completion requests.
    client: LlmClient,
}
impl IndexVerifier {
    /// Maximum number of bytes of page text included in the prompt when
    /// checking whether a title appears anywhere on a page.
    const TITLE_ON_PAGE_MAX_BYTES: usize = 1000;

    /// Maximum number of bytes of page text included in the prompt when
    /// checking whether a title appears at the start of a page.
    const TITLE_AT_START_MAX_BYTES: usize = 500;

    /// Creates a verifier from `config`, building the LLM client from a
    /// clone of `config.llm_config`.
    pub fn new(config: VerifierConfig) -> Self {
        let client = LlmClient::new(config.llm_config.clone().into());
        Self { config, client }
    }

    /// Creates a verifier using [`VerifierConfig::default`].
    pub fn with_defaults() -> Self {
        Self::new(VerifierConfig::default())
    }

    /// Verifies a sample of `entries` against `pages` and returns a report.
    ///
    /// An empty `entries` slice yields `VerificationReport::all_correct(0)`
    /// without contacting the LLM. Otherwise the entries chosen by
    /// `select_sample` are checked one after another; each entry either
    /// counts as correct or contributes a [`VerificationError`].
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the underlying LLM request.
    pub async fn verify(
        &self,
        entries: &[TocEntry],
        pages: &[PdfPage],
    ) -> Result<VerificationReport> {
        if entries.is_empty() {
            return Ok(VerificationReport::all_correct(0));
        }

        // NOTE(review): if no entry has a physical page the sample is empty
        // and the report below is built with a total of 0 — confirm that
        // `VerificationReport::new` handles that case as intended.
        let sample = self.select_sample(entries);
        let mut errors = Vec::new();
        let mut correct = 0;

        for &(index, entry) in &sample {
            match entry.physical_page {
                Some(physical_page) => {
                    match self.verify_entry(entry, physical_page, pages).await? {
                        Ok(()) => correct += 1,
                        Err(error_type) => errors.push(VerificationError::new(
                            index,
                            entry.title.clone(),
                            physical_page,
                            error_type,
                        )),
                    }
                }
                // `select_sample` only yields entries that have a physical
                // page, so this arm is not reached through the current
                // sampler; it is kept as a safeguard should that change.
                None => errors.push(VerificationError::new(
                    index,
                    entry.title.clone(),
                    0,
                    ErrorType::PageOutOfRange,
                )),
            }
        }

        let report = VerificationReport::new(sample.len(), correct, errors);
        info!(
            "Verification complete: {}/{} correct ({:.1}% accuracy)",
            report.correct,
            report.total,
            report.accuracy * 100.0
        );
        Ok(report)
    }

    /// Picks the entries to verify, paired with their index in `entries`.
    ///
    /// Only entries with a physical page are considered. If the configured
    /// `sample_size` is smaller than the number of such entries, a random
    /// subset of that size is returned (in shuffled order); otherwise all of
    /// them are returned in their original order.
    fn select_sample<'a>(&self, entries: &'a [TocEntry]) -> Vec<(usize, &'a TocEntry)> {
        let mut candidates: Vec<_> = entries
            .iter()
            .enumerate()
            .filter(|(_, e)| e.physical_page.is_some())
            .collect();

        if let Some(size) = self.config.sample_size {
            if size < candidates.len() {
                let mut rng = rand::thread_rng();
                // Shuffle in place and keep the first `size` items; this
                // avoids collecting into a second vector.
                candidates.shuffle(&mut rng);
                candidates.truncate(size);
            }
        }

        candidates
    }

    /// Checks a single entry against the page it claims to start on.
    ///
    /// `physical_page` is 1-based. The outer `Result` carries LLM request
    /// failures; the inner one is `Ok(())` when the title was found,
    /// `Err(ErrorType::PageOutOfRange)` when `physical_page` is 0 or past
    /// the end of `pages`, and `Err(ErrorType::TitleNotFound)` when the LLM
    /// reports that the title is not on the page.
    async fn verify_entry(
        &self,
        entry: &TocEntry,
        physical_page: usize,
        pages: &[PdfPage],
    ) -> Result<std::result::Result<(), ErrorType>> {
        // `checked_sub` rejects page 0 and `get` rejects pages past the end,
        // so no unchecked indexing is needed.
        let page = match physical_page.checked_sub(1).and_then(|i| pages.get(i)) {
            Some(page) => page,
            None => return Ok(Err(ErrorType::PageOutOfRange)),
        };

        let found = self.check_title_on_page(&entry.title, &page.text).await?;
        if !found {
            debug!(
                "Title '{}' not found on page {}",
                entry.title, physical_page
            );
            return Ok(Err(ErrorType::TitleNotFound));
        }

        Ok(Ok(()))
    }

    /// Asks the LLM whether `title` appears in `page_text`.
    ///
    /// Only the leading part of `page_text` (at most
    /// `TITLE_ON_PAGE_MAX_BYTES` bytes, cut on a character boundary) is
    /// sent in the prompt.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the LLM client's JSON completion.
    async fn check_title_on_page(&self, title: &str, page_text: &str) -> Result<bool> {
        let system = "You are a document analysis assistant. Determine if a section title appears in the given text.";

        // Slicing at a fixed byte offset would panic inside a multi-byte
        // UTF-8 character, so the cut is moved back to a char boundary.
        let text = Self::truncate_at_char_boundary(page_text, Self::TITLE_ON_PAGE_MAX_BYTES);

        let user = format!(
            r#"Does the section title "{}" appear in this page text?
Page text:
{}
Reply in JSON format:
{{"found": true/false}}"#,
            title, text
        );

        #[derive(serde::Deserialize)]
        struct CheckResult {
            found: bool,
        }

        let result: CheckResult = self.client.complete_json(system, &user).await?;
        Ok(result.found)
    }

    /// Asks the LLM whether `title` appears at the beginning of `page_text`.
    ///
    /// Only the leading part of `page_text` (at most
    /// `TITLE_AT_START_MAX_BYTES` bytes, cut on a character boundary) is
    /// sent in the prompt.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the LLM client's JSON completion.
    pub async fn check_title_at_start(&self, title: &str, page_text: &str) -> Result<bool> {
        let system = "You are a document analysis assistant. Determine if a section title appears at the START of the given page text.";

        // Same char-boundary-safe truncation as in `check_title_on_page`.
        let text = Self::truncate_at_char_boundary(page_text, Self::TITLE_AT_START_MAX_BYTES);

        let user = format!(
            r#"Does the section title "{}" appear at the BEGINNING of this page text?
Note: It should be near the start, not in the middle or end.
Page text:
{}
Reply in JSON format:
{{"at_start": true/false}}"#,
            title, text
        );

        #[derive(serde::Deserialize)]
        struct StartCheck {
            at_start: bool,
        }

        let result: StartCheck = self.client.complete_json(system, &user).await?;
        Ok(result.at_start)
    }

    /// Returns the longest prefix of `text` that is at most `max_bytes`
    /// bytes long and ends on a UTF-8 character boundary.
    ///
    /// Text that already fits is returned unchanged.
    fn truncate_at_char_boundary(text: &str, max_bytes: usize) -> &str {
        if text.len() <= max_bytes {
            return text;
        }

        // Offset 0 is always a char boundary, so this loop terminates.
        let mut end = max_bytes;
        while !text.is_char_boundary(end) {
            end -= 1;
        }
        &text[..end]
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// With the default `sample_size` of 10, sampling 20 entries that all
    /// have a physical page yields exactly 10 of them.
    #[test]
    fn test_select_sample() {
        let verifier = IndexVerifier::with_defaults();

        let mut entries: Vec<TocEntry> = Vec::new();
        for i in 1..=20 {
            entries.push(TocEntry::new(format!("Entry {}", i), 1).with_physical_page(i));
        }

        let picked = verifier.select_sample(&entries);
        assert_eq!(picked.len(), 10);
    }

    /// With `sample_size` set to `None`, every entry that has a physical
    /// page is selected.
    #[test]
    fn test_select_sample_all() {
        let verifier = IndexVerifier::new(VerifierConfig {
            sample_size: None,
            ..VerifierConfig::default()
        });

        let mut entries: Vec<TocEntry> = Vec::new();
        for i in 1..=5 {
            entries.push(TocEntry::new(format!("Entry {}", i), 1).with_physical_page(i));
        }

        let picked = verifier.select_sample(&entries);
        assert_eq!(picked.len(), 5);
    }
}