use std::collections::HashMap;
use std::fmt::Write as _;
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tracing::{debug, info, warn};
use super::asr_backend::{TranscriptSegment, TranscriptionResult};
/// Category of an entity mentioned in a transcript that may be worth looking up.
///
/// Serialized in `snake_case` (e.g. `"paper"`). Together with the lowercased
/// query it forms the lookup-cache key, hence the `Eq + Hash` derives.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
#[serde(rename_all = "snake_case")]
pub enum ReferenceKind {
    Paper,
    Person,
    Tool,
    Claim,
    Number,
    Other,
}
/// A single reference the LLM identified inside a transcript chunk.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Reference {
    /// Kind of entity; selects which search URL is used for the lookup.
    pub kind: ReferenceKind,
    /// Free-text query describing the entity (e.g. a paper title or a name).
    pub query: String,
    /// Sampler-reported confidence; compared against
    /// `ActiveReadingConfig::confidence_threshold` before following.
    pub confidence: f32,
    /// Index of the transcript segment where the mention occurred; used to
    /// append the footnote marker to that segment's text.
    pub segment_idx: usize,
}
/// Cached outcome of following one reference: where we looked and what we learned.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LookupResult {
    /// Search URL that was fetched.
    pub url: String,
    /// LLM-produced summary of the fetched content.
    pub summary: String,
    /// Fetch timestamp; entries older than `cache_ttl_days` are refetched.
    pub fetched_at: chrono::DateTime<chrono::Utc>,
}
/// Tunables for the active-reading pass.
#[derive(Debug, Clone)]
pub struct ActiveReadingConfig {
    /// Rough cap on estimated tokens spent (chunk text + lookup summaries).
    pub token_budget: u32,
    /// Maximum follow depth.
    /// NOTE(review): not read anywhere in this module — confirm intended use.
    pub max_depth: u32,
    /// Minimum `Reference::confidence` required to follow a reference.
    pub confidence_threshold: f32,
    /// Cap on followed references per transcript segment.
    pub max_refs_per_segment: usize,
    /// Timeout applied to each URL fetch (summarization is not bounded by it).
    pub lookup_timeout_secs: u64,
    /// Cache entries older than this many days are considered stale.
    pub cache_ttl_days: u64,
    /// Only references of these kinds are followed.
    pub allowed_kinds: Vec<ReferenceKind>,
}
impl Default for ActiveReadingConfig {
fn default() -> Self {
Self {
token_budget: 10_000,
max_depth: 1,
confidence_threshold: 0.7,
max_refs_per_segment: 3,
lookup_timeout_secs: 10,
cache_ttl_days: 7,
allowed_kinds: vec![
ReferenceKind::Paper,
ReferenceKind::Person,
ReferenceKind::Tool,
ReferenceKind::Claim,
],
}
}
}
/// Counters and timing describing one active-reading run.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ActiveReadingMetadata {
    /// Total references the sampler reported across all chunks.
    pub references_identified: usize,
    /// References that passed `should_follow` and were looked up successfully.
    pub references_followed: usize,
    /// Estimated tokens consumed (chunk text sent to the sampler + summaries).
    pub tokens_spent: u32,
    /// NOTE(review): never incremented anywhere in this module —
    /// `lookup_reference` logs cache hits but has no access to this struct.
    /// Wire it through or drop the field.
    pub cache_hits: usize,
    /// Wall-clock duration of `process`, in milliseconds.
    pub elapsed_ms: u64,
}
/// Result of an active-reading run: formatted footnotes plus run metadata.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ActiveReadingOutput {
    /// Footnote lines in the form `[n] summary — url`, numbered from 1.
    pub footnotes: Vec<String>,
    /// Counters and timing for this run.
    pub metadata: ActiveReadingMetadata,
}
/// Errors surfaced by the active-reading components.
///
/// `process` treats per-chunk and per-reference failures as non-fatal (logged
/// and skipped), so most variants propagate only from `lookup_reference` and
/// the `LlmSampler` / `UrlFetcher` trait implementations.
#[derive(Error, Debug)]
pub enum ActiveReadingError {
    /// NOTE(review): not constructed in this module — presumably raised by
    /// sampler implementations; confirm.
    #[error("LLM sampling not supported by client")]
    SamplingNotSupported,
    /// NOTE(review): not constructed in this module — `process` stops quietly
    /// when the budget runs out instead of returning this.
    #[error("token budget exhausted")]
    BudgetExhausted,
    /// A fetch exceeded `lookup_timeout_secs`; payload is the URL.
    #[error("lookup timeout: {0}")]
    Timeout(String),
    /// Sampler-side failure.
    #[error("sampling error: {0}")]
    SamplingFailed(String),
    /// Fetch failure (also used for kinds that have no lookup target).
    #[error("fetch error: {0}")]
    FetchFailed(String),
    /// The LLM returned something unparsable.
    #[error("invalid response from LLM: {0}")]
    InvalidResponse(String),
}
/// Module-local result alias over [`ActiveReadingError`].
pub type Result<T> = std::result::Result<T, ActiveReadingError>;
/// LLM operations the active reader depends on.
#[async_trait]
pub trait LlmSampler: Send + Sync {
    /// Identify references mentioned in `chunk`.
    ///
    /// `segment_offset` is the index of the first transcript segment covered
    /// by `chunk`; implementations are expected to report absolute
    /// `Reference::segment_idx` values (the caller indexes the full segment
    /// list with them).
    async fn identify_references(
        &self,
        chunk: &str,
        segment_offset: usize,
    ) -> Result<Vec<Reference>>;
    /// Summarize `content` with respect to `query`, within ~`max_tokens` tokens.
    async fn summarize(&self, content: &str, query: &str, max_tokens: u32) -> Result<String>;
}
/// Abstraction over fetching the textual content of a URL.
#[async_trait]
pub trait UrlFetcher: Send + Sync {
    /// Fetch `url` and return its content as plain text.
    async fn fetch_text(&self, url: &str) -> Result<String>;
}
/// Target size of each transcript chunk sent for reference identification.
/// NOTE(review): despite the `_CHARS` name, this is compared against byte
/// lengths (`str::len`) in `chunk_segments`.
const CHUNK_SIZE_CHARS: usize = 4_000;
/// Amount of trailing context carried into the next chunk (same byte-length
/// caveat as above).
const CHUNK_OVERLAP_CHARS: usize = 200;
/// Drives the active-reading pass: chunk the transcript, identify references,
/// fetch and summarize them, and attach footnotes to the transcript.
pub struct ActiveReader<'a> {
    /// LLM used for reference identification and summarization.
    sampler: &'a dyn LlmSampler,
    /// Fetcher used to retrieve reference URLs.
    fetcher: &'a dyn UrlFetcher,
    /// Tunables for this run.
    config: ActiveReadingConfig,
    /// Lookup cache keyed by `(kind, lowercased query)`.
    cache: HashMap<(ReferenceKind, String), LookupResult>,
}
impl<'a> ActiveReader<'a> {
/// Create a reader over the given sampler and fetcher, starting with an
/// empty lookup cache.
pub fn new(
    sampler: &'a dyn LlmSampler,
    fetcher: &'a dyn UrlFetcher,
    config: ActiveReadingConfig,
) -> Self {
    let cache = HashMap::new();
    Self {
        cache,
        config,
        sampler,
        fetcher,
    }
}
#[must_use]
pub fn with_cache(mut self, cache: HashMap<(ReferenceKind, String), LookupResult>) -> Self {
self.cache = cache;
self
}
/// Run the full active-reading pass over `transcript`.
///
/// Chunks the segments, asks the sampler for references per chunk, follows
/// the ones accepted by `should_follow`, appends `[n]` footnote markers to
/// the matching segment text in place, and returns the collected footnote
/// lines plus run metadata.
///
/// Per-chunk identification failures and per-reference lookup failures are
/// logged and skipped rather than propagated; this method only errors if
/// constructing the output itself fails (currently never).
pub async fn process(
    &mut self,
    transcript: &mut TranscriptionResult,
) -> Result<ActiveReadingOutput> {
    let started = std::time::Instant::now();
    let mut metadata = ActiveReadingMetadata::default();
    let mut footnotes: Vec<String> = Vec::new();
    // Tracks how many footnotes each segment already received, for the
    // per-segment cap enforced by `should_follow`.
    let mut refs_per_segment: HashMap<usize, usize> = HashMap::new();
    if transcript.segments.is_empty() {
        return Ok(ActiveReadingOutput {
            footnotes,
            metadata,
        });
    }
    let chunks = self.chunk_segments(&transcript.segments);
    debug!(chunks = chunks.len(), "active reading: chunked transcript");
    for (offset, chunk_text) in &chunks {
        // Budget is checked between chunks only; a chunk already in flight
        // may overshoot it.
        if metadata.tokens_spent >= self.config.token_budget {
            info!("active reading: token budget exhausted, stopping");
            break;
        }
        let refs = match self.sampler.identify_references(chunk_text, *offset).await {
            Ok(r) => r,
            Err(e) => {
                // Non-fatal: skip this chunk and keep going.
                warn!("active reading: identify_references failed: {e}");
                continue;
            }
        };
        // Charge the chunk text we just sent to the sampler.
        metadata.tokens_spent = metadata
            .tokens_spent
            .saturating_add(estimate_tokens(chunk_text));
        metadata.references_identified += refs.len();
        debug!(
            count = refs.len(),
            offset, "active reading: references identified in chunk"
        );
        for reference in refs {
            if !self.should_follow(&reference, &refs_per_segment) {
                continue;
            }
            let lookup = match self.lookup_reference(&reference).await {
                Ok(l) => l,
                Err(e) => {
                    // Non-fatal: log and move to the next reference.
                    warn!(query = %reference.query, "active reading: lookup failed: {e}");
                    continue;
                }
            };
            // Charge the summary text as well.
            metadata.tokens_spent = metadata
                .tokens_spent
                .saturating_add(estimate_tokens(&lookup.summary));
            metadata.references_followed += 1;
            // Footnotes are numbered 1-based in insertion order.
            let fn_num = footnotes.len() + 1;
            if let Some(seg) = transcript.segments.get_mut(reference.segment_idx) {
                // Append the `[n]` marker directly to the segment text;
                // writing to a String cannot fail, so the result is ignored.
                let _ = write!(seg.text, "[{fn_num}]");
            }
            footnotes.push(format!("[{fn_num}] {} — {}", lookup.summary, lookup.url));
            *refs_per_segment.entry(reference.segment_idx).or_insert(0) += 1;
        }
    }
    metadata.elapsed_ms = u64::try_from(started.elapsed().as_millis()).unwrap_or(u64::MAX);
    info!(
        identified = metadata.references_identified,
        followed = metadata.references_followed,
        tokens = metadata.tokens_spent,
        elapsed_ms = metadata.elapsed_ms,
        "active reading complete"
    );
    Ok(ActiveReadingOutput {
        footnotes,
        metadata,
    })
}
/// Split `segments` into roughly `CHUNK_SIZE_CHARS`-byte text chunks.
///
/// Returns `(first_segment_idx, text)` pairs, where `first_segment_idx` is
/// the index of the first segment whose text begins in that chunk. Each new
/// chunk starts with up to `CHUNK_OVERLAP_CHARS` bytes of trailing context
/// from the previous chunk, so references spanning a chunk boundary are not
/// lost. Segment texts within a chunk are joined with single spaces.
#[allow(clippy::unused_self)]
pub(crate) fn chunk_segments(&self, segments: &[TranscriptSegment]) -> Vec<(usize, String)> {
    let mut chunks: Vec<(usize, String)> = Vec::new();
    let mut current = String::new();
    let mut chunk_start_idx: usize = 0;
    for (idx, seg) in segments.iter().enumerate() {
        if current.len() + seg.text.len() > CHUNK_SIZE_CHARS && !current.is_empty() {
            let tail: String = if current.len() > CHUNK_OVERLAP_CHARS {
                // BUGFIX: the naive `len - CHUNK_OVERLAP_CHARS` byte offset
                // can land inside a multi-byte UTF-8 character, and slicing
                // there panics. Snap forward to the nearest char boundary
                // (a boundary always exists at or before `len`).
                let mut cut = current.len() - CHUNK_OVERLAP_CHARS;
                while !current.is_char_boundary(cut) {
                    cut += 1;
                }
                current[cut..].to_string()
            } else {
                current.clone()
            };
            // Move the finished chunk out and seed the next one with the
            // overlap tail, avoiding an extra full-chunk clone.
            chunks.push((chunk_start_idx, std::mem::replace(&mut current, tail)));
            chunk_start_idx = idx;
        }
        if !current.is_empty() {
            current.push(' ');
        }
        current.push_str(&seg.text);
    }
    if !current.is_empty() {
        chunks.push((chunk_start_idx, current));
    }
    chunks
}
/// Build the search URL used to look up `reference`.
///
/// # Errors
/// Returns `ActiveReadingError::FetchFailed` for `ReferenceKind::Number`,
/// which has no useful search target.
pub(crate) fn url_for_reference(reference: &Reference) -> Result<String> {
    // Reject un-lookup-able kinds up front.
    if matches!(reference.kind, ReferenceKind::Number) {
        return Err(ActiveReadingError::FetchFailed(
            "numbers do not look up well".to_string(),
        ));
    }
    let q = urlencoding::encode(&reference.query);
    let url = match reference.kind {
        ReferenceKind::Paper => format!("https://scholar.google.com/scholar?q={q}"),
        ReferenceKind::Person => {
            format!("https://en.wikipedia.org/wiki/Special:Search?search={q}")
        }
        ReferenceKind::Tool => format!("https://github.com/search?q={q}&type=repositories"),
        ReferenceKind::Claim | ReferenceKind::Other => {
            format!("https://www.google.com/search?q={q}")
        }
        // Rejected by the early return above.
        ReferenceKind::Number => unreachable!("Number is rejected before URL construction"),
    };
    Ok(url)
}
/// Resolve a reference to a summarized lookup result, consulting the cache
/// first.
///
/// Cache entries are keyed by `(kind, lowercased query)` and served while
/// younger than `cache_ttl_days`; stale entries are refetched and
/// overwritten. The fetch is bounded by `lookup_timeout_secs`, but the
/// summarization step is not.
///
/// # Errors
/// Propagates URL construction failures, fetch timeouts/failures, and
/// summarization failures.
async fn lookup_reference(&mut self, reference: &Reference) -> Result<LookupResult> {
    // Lowercase the query so trivially different mentions share an entry.
    let cache_key = (reference.kind, reference.query.to_lowercase());
    if let Some(cached) = self.cache.get(&cache_key) {
        let age_days = chrono::Utc::now()
            .signed_duration_since(cached.fetched_at)
            .num_days();
        if age_days < i64::try_from(self.config.cache_ttl_days).unwrap_or(i64::MAX) {
            debug!(query = %reference.query, "active reading: cache hit");
            return Ok(cached.clone());
        }
        // Stale: fall through to refetch; the insert below replaces it.
    }
    let url = Self::url_for_reference(reference)?;
    debug!(url = %url, "active reading: fetching reference");
    // Outer `?` maps a timeout elapse; `fetch_result` still holds the
    // fetcher's own Result.
    let fetch_result = tokio::time::timeout(
        std::time::Duration::from_secs(self.config.lookup_timeout_secs),
        self.fetcher.fetch_text(&url),
    )
    .await
    .map_err(|_| ActiveReadingError::Timeout(url.clone()))?;
    let content = fetch_result?;
    let summary = self
        .sampler
        .summarize(&content, &reference.query, 200)
        .await?;
    let result = LookupResult {
        url,
        summary,
        fetched_at: chrono::Utc::now(),
    };
    self.cache.insert(cache_key, result.clone());
    Ok(result)
}
/// Decide whether `reference` is worth the cost of a lookup: confident
/// enough, of an allowed kind, and under the per-segment footnote cap.
fn should_follow(
    &self,
    reference: &Reference,
    refs_per_segment: &HashMap<usize, usize>,
) -> bool {
    let already_followed = refs_per_segment
        .get(&reference.segment_idx)
        .copied()
        .unwrap_or(0);
    // Written as `!(a < b)` rather than `a >= b` so a NaN confidence is not
    // rejected by this check (every comparison involving NaN is false).
    !(reference.confidence < self.config.confidence_threshold)
        && self.config.allowed_kinds.contains(&reference.kind)
        && already_followed < self.config.max_refs_per_segment
}
}
/// Rough token-count estimate using the common ~4 bytes/token heuristic.
///
/// Saturates at `u32::MAX` instead of panicking for pathologically large
/// input. The previous `.min(u32::MAX as usize)` pre-clamp was redundant:
/// `u32::try_from` fails exactly when the value exceeds `u32::MAX`, which
/// `unwrap_or(u32::MAX)` already handles.
fn estimate_tokens(text: &str) -> u32 {
    u32::try_from(text.len() / 4).unwrap_or(u32::MAX)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Mutex;

    /// Scripted `LlmSampler`: returns a fixed set of references (with
    /// `segment_idx` rebased to the chunk offset) and a fixed summary,
    /// counting calls for assertions.
    struct MockSampler {
        refs: Vec<Reference>,
        summary: String,
        identify_calls: Mutex<usize>,
        summarize_calls: Mutex<usize>,
    }

    impl MockSampler {
        fn new(refs: Vec<Reference>, summary: impl Into<String>) -> Self {
            Self {
                refs,
                summary: summary.into(),
                identify_calls: Mutex::new(0),
                summarize_calls: Mutex::new(0),
            }
        }

        fn identify_call_count(&self) -> usize {
            *self.identify_calls.lock().unwrap()
        }
    }

    #[async_trait]
    impl LlmSampler for MockSampler {
        async fn identify_references(
            &self,
            _chunk: &str,
            segment_offset: usize,
        ) -> Result<Vec<Reference>> {
            *self.identify_calls.lock().unwrap() += 1;
            Ok(self
                .refs
                .iter()
                .cloned()
                .map(|mut r| {
                    r.segment_idx = segment_offset;
                    r
                })
                .collect())
        }

        async fn summarize(
            &self,
            _content: &str,
            _query: &str,
            _max_tokens: u32,
        ) -> Result<String> {
            *self.summarize_calls.lock().unwrap() += 1;
            Ok(self.summary.clone())
        }
    }

    /// `UrlFetcher` stub: returns a fixed body, or an error when configured
    /// to fail, counting calls for assertions.
    struct MockFetcher {
        body: String,
        call_count: Mutex<usize>,
        should_fail: bool,
    }

    impl MockFetcher {
        fn new(body: impl Into<String>) -> Self {
            Self {
                body: body.into(),
                call_count: Mutex::new(0),
                should_fail: false,
            }
        }

        fn failing() -> Self {
            Self {
                body: String::new(),
                call_count: Mutex::new(0),
                should_fail: true,
            }
        }

        fn call_count(&self) -> usize {
            *self.call_count.lock().unwrap()
        }
    }

    #[async_trait]
    impl UrlFetcher for MockFetcher {
        async fn fetch_text(&self, _url: &str) -> Result<String> {
            *self.call_count.lock().unwrap() += 1;
            if self.should_fail {
                return Err(ActiveReadingError::FetchFailed("mock fail".into()));
            }
            Ok(self.body.clone())
        }
    }

    /// Minimal transcript segment with the given text.
    fn make_segment(text: &str) -> TranscriptSegment {
        TranscriptSegment {
            text: text.to_string(),
            start: 0.0,
            end: 1.0,
            confidence: 0.95,
            language: None,
            speaker: None,
            words: None,
        }
    }

    /// Minimal transcription result wrapping one segment per input text.
    fn make_transcript(texts: &[&str]) -> TranscriptionResult {
        TranscriptionResult {
            segments: texts.iter().map(|t| make_segment(t)).collect(),
            language: "en".to_string(),
            duration_seconds: 10.0,
            model: "test".to_string(),
            backend: "test".to_string(),
            rtfx: 1.0,
            processing_time_seconds: 1.0,
            speakers: None,
            footnotes: None,
            active_reading: None,
        }
    }

    fn high_confidence_paper_ref() -> Reference {
        Reference {
            kind: ReferenceKind::Paper,
            query: "Dijkstra 1968 GOTO".to_string(),
            confidence: 0.95,
            segment_idx: 0,
        }
    }

    #[test]
    fn chunk_segments_small_transcript_produces_single_chunk() {
        let segs: Vec<TranscriptSegment> = vec![make_segment("Hello world.")];
        let config = ActiveReadingConfig::default();
        let sampler = MockSampler::new(vec![], "");
        let fetcher = MockFetcher::new("");
        let reader = ActiveReader::new(&sampler, &fetcher, config);
        let chunks = reader.chunk_segments(&segs);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].0, 0);
        assert!(chunks[0].1.contains("Hello world."));
    }

    // Renamed from `chunk_segments_respects_word_count`: chunking is driven
    // by text length, not word count.
    #[test]
    fn chunk_segments_splits_long_transcript() {
        let long_word = "word ".repeat(200);
        let segs: Vec<TranscriptSegment> =
            (0..10).map(|_| make_segment(long_word.trim())).collect();
        let config = ActiveReadingConfig::default();
        let sampler = MockSampler::new(vec![], "");
        let fetcher = MockFetcher::new("");
        let reader = ActiveReader::new(&sampler, &fetcher, config);
        let chunks = reader.chunk_segments(&segs);
        assert!(
            chunks.len() >= 2,
            "expected ≥2 chunks, got {}",
            chunks.len()
        );
        for (_, text) in &chunks {
            assert!(!text.is_empty());
        }
    }

    #[test]
    fn url_for_reference_paper_uses_scholar() {
        let r = high_confidence_paper_ref();
        let url = ActiveReader::url_for_reference(&r).unwrap();
        assert!(
            url.starts_with("https://scholar.google.com/scholar?q="),
            "got {url}"
        );
        assert!(url.contains("Dijkstra"));
    }

    #[test]
    fn url_for_reference_person_uses_wikipedia() {
        let r = Reference {
            kind: ReferenceKind::Person,
            query: "Geoffrey Hinton".to_string(),
            confidence: 0.9,
            segment_idx: 0,
        };
        let url = ActiveReader::url_for_reference(&r).unwrap();
        assert!(
            url.starts_with("https://en.wikipedia.org/wiki/Special:Search?search="),
            "got {url}"
        );
    }

    #[test]
    fn url_for_reference_number_returns_error() {
        let r = Reference {
            kind: ReferenceKind::Number,
            query: "42".to_string(),
            confidence: 0.8,
            segment_idx: 0,
        };
        let result = ActiveReader::url_for_reference(&r);
        assert!(result.is_err());
    }

    #[tokio::test]
    async fn process_skips_below_threshold() {
        let low_conf_ref = Reference {
            kind: ReferenceKind::Paper,
            query: "obscure thing".to_string(),
            confidence: 0.3,
            segment_idx: 0,
        };
        let sampler = MockSampler::new(vec![low_conf_ref], "summary");
        let fetcher = MockFetcher::new("content");
        let mut reader = ActiveReader::new(&sampler, &fetcher, ActiveReadingConfig::default());
        let mut transcript = make_transcript(&["Some text."]);
        let output = reader.process(&mut transcript).await.unwrap();
        assert!(output.footnotes.is_empty());
        assert_eq!(fetcher.call_count(), 0);
    }

    #[tokio::test]
    async fn process_caps_refs_per_segment() {
        let refs: Vec<Reference> = (0..5)
            .map(|i| Reference {
                kind: ReferenceKind::Paper,
                query: format!("paper {i}"),
                confidence: 0.95,
                segment_idx: 0,
            })
            .collect();
        let sampler = MockSampler::new(refs, "summary");
        let fetcher = MockFetcher::new("content");
        let config = ActiveReadingConfig {
            max_refs_per_segment: 2,
            ..ActiveReadingConfig::default()
        };
        let mut reader = ActiveReader::new(&sampler, &fetcher, config);
        let mut transcript = make_transcript(&["Segment zero."]);
        let output = reader.process(&mut transcript).await.unwrap();
        assert!(
            output.footnotes.len() <= 2,
            "expected ≤2 footnotes, got {}",
            output.footnotes.len()
        );
    }

    #[tokio::test]
    async fn process_uses_cache_on_repeat() {
        let refs = vec![Reference {
            kind: ReferenceKind::Paper,
            query: "same paper".to_string(),
            confidence: 0.9,
            segment_idx: 0,
        }];
        let sampler = MockSampler::new(refs, "cached summary");
        let fetcher = MockFetcher::new("content");
        let mut reader = ActiveReader::new(&sampler, &fetcher, ActiveReadingConfig::default());
        let paper_ref = Reference {
            kind: ReferenceKind::Paper,
            query: "same paper".to_string(),
            confidence: 0.9,
            segment_idx: 0,
        };
        let _first = reader.lookup_reference(&paper_ref).await.unwrap();
        let second = reader.lookup_reference(&paper_ref).await.unwrap();
        assert_eq!(
            fetcher.call_count(),
            1,
            "fetcher should be called once; cache should serve second"
        );
        assert_eq!(second.summary, "cached summary");
    }

    #[tokio::test]
    async fn process_continues_on_lookup_failure() {
        let refs = vec![
            Reference {
                kind: ReferenceKind::Paper,
                query: "good paper".to_string(),
                confidence: 0.9,
                segment_idx: 0,
            },
            Reference {
                kind: ReferenceKind::Tool,
                query: "bad tool".to_string(),
                confidence: 0.9,
                segment_idx: 0,
            },
        ];
        let sampler = MockSampler::new(refs, "summary");
        let fetcher = MockFetcher::failing();
        let mut reader = ActiveReader::new(&sampler, &fetcher, ActiveReadingConfig::default());
        let mut transcript = make_transcript(&["Some text mentioning a paper and a tool."]);
        let output = reader.process(&mut transcript).await;
        assert!(output.is_ok(), "process should not propagate lookup errors");
        assert!(output.unwrap().footnotes.is_empty());
    }

    #[tokio::test]
    async fn process_respects_token_budget() {
        let refs = vec![high_confidence_paper_ref()];
        let sampler = MockSampler::new(refs, "summary");
        let fetcher = MockFetcher::new("content");
        let config = ActiveReadingConfig {
            token_budget: 1,
            ..ActiveReadingConfig::default()
        };
        let mut reader = ActiveReader::new(&sampler, &fetcher, config);
        let texts: Vec<&str> = (0..20)
            .map(|_| "This is a sentence that mentions Dijkstra.")
            .collect();
        let mut transcript = make_transcript(&texts);
        let output = reader.process(&mut transcript).await.unwrap();
        assert_eq!(
            sampler.identify_call_count(),
            1,
            "expected 1 sampling call before budget was exhausted"
        );
        // The one processed chunk must have been charged against the budget.
        assert!(output.metadata.tokens_spent > 0);
    }

    #[tokio::test]
    async fn process_skips_disallowed_kinds() {
        let refs = vec![Reference {
            kind: ReferenceKind::Number,
            query: "3.14".to_string(),
            confidence: 0.9,
            segment_idx: 0,
        }];
        let sampler = MockSampler::new(refs, "summary");
        let fetcher = MockFetcher::new("content");
        let config = ActiveReadingConfig::default();
        let mut reader = ActiveReader::new(&sampler, &fetcher, config);
        let mut transcript = make_transcript(&["The value is 3.14."]);
        let output = reader.process(&mut transcript).await.unwrap();
        assert!(output.footnotes.is_empty());
        assert_eq!(fetcher.call_count(), 0);
    }

    #[tokio::test]
    async fn process_empty_transcript_returns_empty_output() {
        let sampler = MockSampler::new(vec![], "");
        let fetcher = MockFetcher::new("");
        let mut reader = ActiveReader::new(&sampler, &fetcher, ActiveReadingConfig::default());
        let mut transcript = make_transcript(&[]);
        let output = reader.process(&mut transcript).await.unwrap();
        assert!(output.footnotes.is_empty());
        assert_eq!(output.metadata.references_identified, 0);
    }

    #[tokio::test]
    async fn process_appends_footnote_marker_to_segment_text() {
        let refs = vec![Reference {
            kind: ReferenceKind::Paper,
            query: "Dijkstra GOTO".to_string(),
            confidence: 0.95,
            segment_idx: 0,
        }];
        let sampler = MockSampler::new(refs, "Dijkstra, 1968 paper summary");
        let fetcher = MockFetcher::new("full content");
        let mut reader = ActiveReader::new(&sampler, &fetcher, ActiveReadingConfig::default());
        let mut transcript = make_transcript(&["Dijkstra's famous paper."]);
        let output = reader.process(&mut transcript).await.unwrap();
        assert_eq!(output.footnotes.len(), 1);
        assert!(
            transcript.segments[0].text.contains("[1]"),
            "segment text should contain footnote marker, got: {}",
            transcript.segments[0].text
        );
        assert!(output.footnotes[0].starts_with("[1]"));
    }
}