use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
use chrono::Utc;
use serde::{Deserialize, Serialize};
use vela_protocol::bundle::FindingBundle;
use vela_protocol::ingest::extract_pdf_text;
use vela_protocol::project::Project;
use vela_protocol::proposals::{AgentRun, StateProposal};
use vela_protocol::repo;
use crate::AGENT_LITERATURE_SCOUT;
use crate::agent::{AgentContext, agent_run_meta, build_finding_add_proposal, discover_files};
use crate::extract::extract_via_claude_cli;
/// Parameters for a single scout run (see `run`).
#[derive(Debug, Clone)]
pub struct ScoutInput {
/// Folder that is scanned for PDF files.
pub folder: PathBuf,
/// Path the frontier project is loaded from and, when `apply` is set, saved back to.
pub frontier_path: PathBuf,
/// Optional model name forwarded to the LLM extraction step; `None` passes no override.
pub model: Option<String>,
/// CLI command forwarded to the LLM extraction step.
pub cli_command: String,
/// When `true`, new proposals are appended to the frontier and the frontier is saved.
pub apply: bool,
}
impl Default for ScoutInput {
fn default() -> Self {
Self {
folder: PathBuf::new(),
frontier_path: PathBuf::new(),
model: None,
cli_command: "claude".to_string(),
apply: true,
}
}
}
/// A candidate finding paired with provenance and review metadata.
///
/// NOTE(review): this type is not referenced elsewhere in the visible part of this
/// file; the field descriptions below are inferred from the names — confirm against
/// the consumers of this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScoutCandidate {
/// Presumably the path of the file the finding was extracted from.
pub source_file: String,
/// The extracted finding.
pub finding: FindingBundle,
/// Presumably the extractor's justification for the finding.
pub rationale: String,
/// Free-form flags attached to the candidate.
pub flags: Vec<String>,
}
/// A page/paragraph/snippet triple.
///
/// NOTE(review): not referenced elsewhere in the visible part of this file; it looks
/// like a pointer to a location inside a source document — confirm whether `page` and
/// `paragraph` are 0- or 1-based with the code that produces these values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SourceSpan {
/// Page number.
pub page: u32,
/// Paragraph number.
pub paragraph: u32,
/// Text snippet.
pub snippet: String,
}
/// Summary of one scout run, as returned by `run`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ScoutReport {
/// Metadata for this agent run; `finished_at` is set just before `run` returns.
pub run: AgentRun,
/// Number of PDF files discovered in the input folder.
pub pdfs_seen: usize,
/// Number of PDFs for which both text extraction and LLM extraction succeeded.
pub pdfs_processed: usize,
/// Number of candidates returned by LLM extraction, including ones later skipped.
pub candidates_emitted: usize,
/// Number of new proposals produced; also counted when `apply` is `false` and
/// nothing was persisted.
pub proposals_written: usize,
/// Files and individual candidates that were skipped, each with a reason.
pub skipped: Vec<SkippedFile>,
/// Display form of the frontier path the run operated on.
pub frontier_path: String,
/// Copy of the `apply` setting from the input.
pub apply: bool,
}
/// One skipped item in a scout run, with the reason it was skipped.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SkippedFile {
/// Path of the skipped PDF; for a skipped candidate, `run` appends `#<id>` to the path.
pub path: String,
/// Human-readable explanation of why the item was skipped.
pub reason: String,
}
/// Scans `input.folder` for PDFs, extracts candidate findings from each one via the
/// LLM CLI, and turns every novel candidate into a finding-add proposal.
///
/// Per-file problems (text extraction error, empty text, LLM extraction error) are not
/// fatal: the file is recorded in `report.skipped` and the scan moves on. A candidate is
/// skipped when its finding id, or the id of the proposal built for it, already exists
/// in the frontier or was already produced earlier in this same run.
///
/// When `input.apply` is `true` and at least one new proposal exists, the proposals are
/// appended to the frontier, which is then saved back to `input.frontier_path`.
/// Otherwise nothing is written; `proposals_written` still reports how many proposals
/// were produced.
///
/// NOTE(review): `extract_pdf_text` and `extract_via_claude_cli` are called
/// synchronously (no `.await`) from this `async fn`. If they block for long, callers on
/// an async runtime may want to drive this from a blocking-friendly context — confirm.
///
/// # Errors
/// Returns `Err` when PDF discovery fails, the frontier cannot be loaded, or saving the
/// updated frontier fails.
pub async fn run(input: ScoutInput) -> Result<ScoutReport, String> {
    let pdfs = discover_pdfs(&input.folder)?;
    let pdfs_seen = pdfs.len();
    let mut frontier: Project = repo::load_from_path(&input.frontier_path)
        .map_err(|e| format!("load frontier {}: {e}", input.frontier_path.display()))?;
    let ctx = AgentContext::new(
        AGENT_LITERATURE_SCOUT,
        input.frontier_path.clone(),
        input.folder.clone(),
        input.model.clone(),
        input.cli_command.clone(),
    );
    let extra = BTreeMap::from([("pdf_count".to_string(), pdfs_seen.to_string())]);
    let mut report = ScoutReport {
        run: agent_run_meta(&ctx, extra),
        pdfs_seen,
        pdfs_processed: 0,
        candidates_emitted: 0,
        proposals_written: 0,
        skipped: Vec::new(),
        frontier_path: input.frontier_path.display().to_string(),
        apply: input.apply,
    };

    // Ids already present in the frontier as loaded.
    let existing_finding_ids: std::collections::HashSet<String> =
        frontier.findings.iter().map(|f| f.id.clone()).collect();
    let existing_proposal_ids: std::collections::HashSet<String> =
        frontier.proposals.iter().map(|p| p.id.clone()).collect();
    // Ids produced during this run. Without these, the same finding extracted twice
    // (e.g. two copies of one paper in the folder) would yield duplicate proposals.
    let mut run_finding_ids: std::collections::HashSet<String> =
        std::collections::HashSet::new();
    let mut run_proposal_ids: std::collections::HashSet<String> =
        std::collections::HashSet::new();
    // Duplicates are skipped rather than flagged, so every proposal carries no flags.
    let no_flags: Vec<String> = Vec::new();
    let mut new_proposals: Vec<StateProposal> = Vec::new();

    for pdf in &pdfs {
        let label = pdf.display().to_string();
        let text = match extract_pdf_text(pdf) {
            Ok(t) if !t.trim().is_empty() => t,
            Ok(_) => {
                report.skipped.push(SkippedFile {
                    path: label,
                    reason: "empty PDF text after extraction".to_string(),
                });
                continue;
            }
            Err(e) => {
                report.skipped.push(SkippedFile {
                    path: label,
                    reason: format!("extract failed: {e}"),
                });
                continue;
            }
        };
        let candidates =
            match extract_via_claude_cli(&text, pdf, input.model.as_deref(), &input.cli_command) {
                Ok(found) => found,
                Err(e) => {
                    report.skipped.push(SkippedFile {
                        path: label,
                        reason: format!("LLM extract failed: {e}"),
                    });
                    continue;
                }
            };
        report.pdfs_processed += 1;

        for (rationale, finding) in candidates {
            report.candidates_emitted += 1;
            if existing_finding_ids.contains(&finding.id) {
                report.skipped.push(SkippedFile {
                    path: format!("{label}#{}", finding.id),
                    reason: "finding id already in frontier".to_string(),
                });
                continue;
            }
            if run_finding_ids.contains(&finding.id) {
                report.skipped.push(SkippedFile {
                    path: format!("{label}#{}", finding.id),
                    reason: "finding id already proposed in this run".to_string(),
                });
                continue;
            }
            let proposal = build_finding_add_proposal(
                &finding,
                &ctx,
                &label,
                &rationale,
                &no_flags,
                &report.run,
            );
            if existing_proposal_ids.contains(&proposal.id) {
                report.skipped.push(SkippedFile {
                    path: format!("{label}#{}", proposal.id),
                    reason: "proposal id already in frontier".to_string(),
                });
                continue;
            }
            // `insert` returns false when the id was already recorded during this run.
            if !run_proposal_ids.insert(proposal.id.clone()) {
                report.skipped.push(SkippedFile {
                    path: format!("{label}#{}", proposal.id),
                    reason: "proposal id already proposed in this run".to_string(),
                });
                continue;
            }
            run_finding_ids.insert(finding.id.clone());
            new_proposals.push(proposal);
        }
    }

    // Reported in both modes; in dry-run mode this is the number that would be written.
    report.proposals_written = new_proposals.len();
    if input.apply && !new_proposals.is_empty() {
        frontier.proposals.extend(new_proposals);
        repo::save_to_path(&input.frontier_path, &frontier)
            .map_err(|e| format!("save frontier: {e}"))?;
    }
    report.run.finished_at = Some(Utc::now().to_rfc3339());
    Ok(report)
}
/// Lists the PDF files under `folder` by delegating to `discover_files` with the
/// single extension `pdf`.
///
/// # Errors
/// Propagates whatever error string `discover_files` returns.
pub fn discover_pdfs(folder: &Path) -> Result<Vec<PathBuf>, String> {
    let pdf_extensions = ["pdf"];
    discover_files(folder, &pdf_extensions)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `discover_pdfs` must return the two visible `.pdf` files and leave out both the
    /// `.txt` file and the dot-prefixed PDF.
    #[test]
    fn discover_pdfs_filters_correctly() {
        let dir = tempfile::tempdir().unwrap();
        let fixtures = [
            ("a.pdf", "%PDF-1.4"),
            ("b.txt", "not a pdf"),
            (".hidden.pdf", "%PDF-1.4"),
            ("c.pdf", "%PDF-1.4"),
        ];
        for (file_name, contents) in fixtures {
            std::fs::write(dir.path().join(file_name), contents).unwrap();
        }

        let found = discover_pdfs(dir.path()).unwrap();
        assert_eq!(found.len(), 2);

        let mut names: Vec<String> = Vec::with_capacity(found.len());
        for path in &found {
            names.push(path.file_name().unwrap().to_string_lossy().into_owned());
        }
        for expected in ["a.pdf", "c.pdf"] {
            assert!(names.iter().any(|n| n.as_str() == expected));
        }
        assert!(names.iter().all(|n| n.as_str() != ".hidden.pdf"));
    }
}