pub mod citation;
pub mod distill;
pub mod fetch;
pub mod html_to_text;
pub mod pdf;
pub mod url_guard;
pub mod youtube;
use crate::core::evidence::Claim;
use citation::Citation;
/// Token budget that `ReadOptions::new` assigns to `max_tokens`.
pub const DEFAULT_MAX_TOKENS: usize = 6000;
/// Item count that `ReadOptions::new` assigns to `max_items`
/// (forwarded to the fact/quote extractors).
pub const DEFAULT_MAX_ITEMS: usize = 12;
/// Upper bound on the number of distinct links emitted by `render_links`.
const MAX_LINKS: usize = 100;
/// Output shape requested from the reader.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ReadMode {
    /// Let the reader choose: markdown for web documents, transcript for YouTube.
    Auto,
    /// Extracted markdown, returned as-is.
    Markdown,
    /// Markdown converted to plain text.
    Text,
    /// Bullet list of the links found in the document.
    Links,
    /// Scored fact statements distilled from the text.
    Facts,
    /// Scored quotations distilled from the text.
    Quotes,
    /// Condensed transcript-style summary of the text.
    Transcript,
}

impl ReadMode {
    /// Parses a user-supplied mode name.
    ///
    /// Surrounding whitespace is ignored and matching is ASCII
    /// case-insensitive. Accepted aliases: `md` for markdown, `plain` for
    /// text, and `summary` for transcript. Returns `None` for any other input.
    pub fn parse(s: &str) -> Option<Self> {
        // Every accepted spelling, paired with the mode it selects.
        const ALIASES: &[(&str, ReadMode)] = &[
            ("auto", ReadMode::Auto),
            ("markdown", ReadMode::Markdown),
            ("md", ReadMode::Markdown),
            ("text", ReadMode::Text),
            ("plain", ReadMode::Text),
            ("links", ReadMode::Links),
            ("facts", ReadMode::Facts),
            ("quotes", ReadMode::Quotes),
            ("transcript", ReadMode::Transcript),
            ("summary", ReadMode::Transcript),
        ];

        let wanted = s.trim();
        ALIASES
            .iter()
            .find(|(alias, _)| alias.eq_ignore_ascii_case(wanted))
            .map(|&(_, mode)| mode)
    }

    /// Returns the canonical lowercase name of this mode.
    pub fn label(self) -> &'static str {
        match self {
            Self::Auto => "auto",
            Self::Markdown => "markdown",
            Self::Text => "text",
            Self::Links => "links",
            Self::Facts => "facts",
            Self::Quotes => "quotes",
            Self::Transcript => "transcript",
        }
    }
}
/// Parameters for a single `read_url` call.
///
/// Derives `Debug` and `Clone` so callers can log and reuse option sets;
/// every field is itself `Debug + Clone`.
#[derive(Debug, Clone)]
pub struct ReadOptions<'a> {
    /// Address of the document to read.
    pub url: &'a str,
    /// Requested output shape; `ReadMode::Auto` is resolved per source kind.
    pub mode: ReadMode,
    /// Optional query forwarded to the fact/quote extractors.
    pub query: Option<&'a str>,
    /// Token budget applied to the rendered body (the citation footer is
    /// appended afterwards).
    pub max_tokens: usize,
    /// Item count forwarded to the fact/quote extractors.
    pub max_items: usize,
    /// Timeout forwarded to the fetchers (seconds, per the field name).
    pub timeout_secs: u64,
}
impl<'a> ReadOptions<'a> {
pub fn new(url: &'a str) -> Self {
Self {
url,
mode: ReadMode::Auto,
query: None,
max_tokens: DEFAULT_MAX_TOKENS,
max_items: DEFAULT_MAX_ITEMS,
timeout_secs: fetch::DEFAULT_TIMEOUT_SECS,
}
}
}
/// Outcome of a successful `read_url` call.
///
/// Derives `Debug` so that `Result<ReadResult, _>` supports `unwrap_err` /
/// `expect_err` and the value can be logged; `Clone` is derived alongside it.
#[derive(Debug, Clone)]
pub struct ReadResult {
    /// Rendered (and possibly truncated) body followed by the citation footer.
    pub content: String,
    /// Mode actually used after resolving `ReadMode::Auto`.
    pub mode: ReadMode,
    /// Token count of the source text before rendering and truncation.
    pub original_tokens: usize,
    /// URL reported by the fetcher (or the transcript's source URL).
    pub final_url: String,
}
/// Reads the document at `opts.url` and renders it according to `opts`.
///
/// URLs for which `youtube::video_id` yields an id take the transcript path;
/// everything else goes through the generic web reader.
///
/// # Errors
///
/// Returns the error message produced by whichever path handled the URL.
pub fn read_url(opts: &ReadOptions) -> Result<ReadResult, String> {
    match youtube::video_id(opts.url) {
        Some(id) => read_youtube(&id, opts),
        None => read_web(opts),
    }
}
/// Fetches `opts.url`, extracts its text, and renders it in the requested mode.
///
/// The payload is classified as PDF, HTML, or plain text from its declared
/// content type; when no content type is declared, the payload itself is
/// sniffed (PDF signature check, or a leading `<` for HTML). `ReadMode::Auto`
/// resolves to `ReadMode::Markdown`. The rendered body is trimmed to
/// `opts.max_tokens` and the citation footer is appended afterwards.
///
/// # Errors
///
/// Returns an error when the fetch fails, the HTTP status is 400 or above,
/// PDF extraction fails, or the content type is not one of the supported
/// kinds.
fn read_web(opts: &ReadOptions) -> Result<ReadResult, String> {
    let doc = fetch::fetch(opts.url, fetch::DEFAULT_MAX_BYTES, opts.timeout_secs)?;
    if doc.status >= 400 {
        return Err(format!("HTTP {} from {}", doc.status, doc.final_url));
    }

    // Sniffing is only trusted when the server declared no content type.
    let undeclared = doc.content_type.is_empty();
    let treat_as_pdf =
        doc.content_type.contains("pdf") || (undeclared && pdf::looks_like_pdf(&doc.bytes));

    let (title, markdown, links, original_tokens) = if treat_as_pdf {
        let extracted = pdf::extract_text(&doc.bytes)?;
        let token_count = crate::core::tokens::count_tokens(&extracted);
        (None, extracted, Vec::new(), token_count)
    } else {
        let raw = doc.body_text();
        let token_count = crate::core::tokens::count_tokens(&raw);
        let treat_as_html =
            is_html(&doc.content_type) || (undeclared && raw.trim_start().starts_with('<'));
        if treat_as_html {
            let page = html_to_text::parse(&raw);
            (page.title, page.markdown, page.links, token_count)
        } else if is_textual(&doc.content_type) {
            (None, raw, Vec::new(), token_count)
        } else {
            return Err(format!(
                "unsupported content type '{}' for {} (extractable: HTML, PDF, plain text)",
                doc.content_type, doc.final_url
            ));
        }
    };

    let effective = if matches!(opts.mode, ReadMode::Auto) {
        ReadMode::Markdown
    } else {
        opts.mode
    };

    let rendered = render_mode(effective, &markdown, &links, &doc.final_url, opts);
    let trimmed = enforce_budget(&rendered, opts.max_tokens);
    let citation = Citation::new(&doc.final_url, title);
    // Build the content first so the citation is done with `final_url`
    // before that field is moved into the result.
    let content = format!("{trimmed}{}", citation.footer());

    Ok(ReadResult {
        content,
        mode: effective,
        original_tokens,
        final_url: doc.final_url,
    })
}
/// Fetches the transcript for `video_id` and renders it in the requested mode.
///
/// `ReadMode::Auto` resolves to `ReadMode::Transcript`. Facts and quotes are
/// distilled from the full transcript text; link mode yields a fixed notice;
/// every remaining mode produces a transcript summary. The rendered body is
/// trimmed to `opts.max_tokens` and the citation footer is appended
/// afterwards.
///
/// # Errors
///
/// Returns the error from `youtube::fetch_transcript` when the transcript
/// cannot be retrieved.
fn read_youtube(video_id: &str, opts: &ReadOptions) -> Result<ReadResult, String> {
    let transcript = youtube::fetch_transcript(video_id, opts.timeout_secs)?;
    let original_tokens = crate::core::tokens::count_tokens(&transcript.full_text);

    let effective = if matches!(opts.mode, ReadMode::Auto) {
        ReadMode::Transcript
    } else {
        opts.mode
    };

    let rendered = match effective {
        ReadMode::Links => String::from("Links are not available for video transcripts."),
        ReadMode::Facts => {
            let scored =
                distill::facts_scored(&transcript.full_text, opts.query, opts.max_items);
            render_facts(&claims_from(scored, &transcript.source_url))
        }
        ReadMode::Quotes => {
            let scored =
                distill::quotes_scored(&transcript.full_text, opts.query, opts.max_items);
            render_quotes(&claims_from(scored, &transcript.source_url))
        }
        // Markdown and text requests also get the summary for transcripts.
        ReadMode::Auto | ReadMode::Markdown | ReadMode::Text | ReadMode::Transcript => {
            distill::transcript_summary(&transcript.full_text, opts.max_tokens.saturating_mul(4))
        }
    };

    let trimmed = enforce_budget(&rendered, opts.max_tokens);
    let citation = Citation::new(&transcript.source_url, transcript.title);
    let content = format!("{trimmed}{}", citation.footer());

    Ok(ReadResult {
        content,
        mode: effective,
        original_tokens,
        final_url: transcript.source_url,
    })
}
/// Renders extracted web content for `mode`.
///
/// * `markdown` – extracted document text in markdown form.
/// * `links` – links collected during extraction (used by link mode only).
/// * `base_url` – document URL; used to absolutize links and as claim source.
/// * `opts` – supplies the query, item limit, and token budget.
///
/// Markdown (and `Auto`) return the input unchanged. All other modes except
/// link mode first convert the markdown to plain text.
fn render_mode(
    mode: ReadMode,
    markdown: &str,
    links: &[html_to_text::Link],
    base_url: &str,
    opts: &ReadOptions,
) -> String {
    // Plain-text conversion is only needed by some modes, so defer it.
    let as_plain = || html_to_text::markdown_to_text(markdown);

    match mode {
        ReadMode::Auto | ReadMode::Markdown => markdown.to_owned(),
        ReadMode::Text => as_plain(),
        ReadMode::Links => render_links(links, base_url),
        ReadMode::Facts => {
            let scored = distill::facts_scored(&as_plain(), opts.query, opts.max_items);
            render_facts(&claims_from(scored, base_url))
        }
        ReadMode::Quotes => {
            let scored = distill::quotes_scored(&as_plain(), opts.query, opts.max_items);
            render_quotes(&claims_from(scored, base_url))
        }
        ReadMode::Transcript => {
            distill::transcript_summary(&as_plain(), opts.max_tokens.saturating_mul(4))
        }
    }
}
/// Formats `links` as a markdown bullet list of absolute URLs.
///
/// Relative hrefs are resolved against `base_url` when it passes
/// `url_guard::validate`; otherwise they are kept as written. Entries whose
/// resolved URL was already emitted are skipped, and at most `MAX_LINKS`
/// entries are produced. An empty input yields a fixed "no links" message.
fn render_links(links: &[html_to_text::Link], base_url: &str) -> String {
    if links.is_empty() {
        return String::from("No links found.");
    }

    let base = url_guard::validate(base_url).ok();
    let mut emitted = std::collections::HashSet::new();

    links
        .iter()
        .filter_map(|link| {
            let target = absolutize(&link.href, base.as_ref());
            // `insert` reports whether this URL is new; duplicates are dropped.
            emitted
                .insert(target.clone())
                .then(|| format!("- [{}]({target})", link.text))
        })
        .take(MAX_LINKS)
        .collect::<Vec<_>>()
        .join("\n")
}
/// Turns `href` into an absolute URL where possible.
///
/// Hrefs that already start with `http://` or `https://` are returned
/// unchanged. Anything else is resolved against `base` through
/// `fetch::resolve_redirect`, or returned as-is when no base is available.
fn absolutize(href: &str, base: Option<&url_guard::SafeUrl>) -> String {
    let already_absolute = ["http://", "https://"]
        .iter()
        .any(|scheme| href.starts_with(*scheme));

    match base {
        Some(origin) if !already_absolute => fetch::resolve_redirect(origin, href),
        _ => href.to_owned(),
    }
}
/// Wraps each `(text, confidence)` pair in a `Claim` attributed to `source`,
/// preserving the input order.
fn claims_from(scored: Vec<(String, f32)>, source: &str) -> Vec<Claim> {
    let mut claims = Vec::with_capacity(scored.len());
    for (text, confidence) in scored {
        claims.push(Claim::new(text, confidence).with_source(source));
    }
    claims
}
/// Renders claims as a bullet list, one `- (confidence) text` line per claim
/// with the confidence printed to two decimals.
///
/// Returns a fixed "no matching content" message when `claims` is empty.
fn render_facts(claims: &[Claim]) -> String {
    if claims.is_empty() {
        return String::from("No matching content found.");
    }

    let mut lines = Vec::with_capacity(claims.len());
    for claim in claims {
        lines.push(format!("- ({:.2}) {}", claim.confidence, claim.text));
    }
    lines.join("\n")
}
/// Renders claims as markdown block quotes separated by blank lines, each
/// prefixed with its confidence printed to two decimals.
///
/// Returns a fixed "no quotable content" message when `claims` is empty.
fn render_quotes(claims: &[Claim]) -> String {
    if claims.is_empty() {
        return String::from("No quotable content found.");
    }

    let mut out = String::new();
    for (idx, claim) in claims.iter().enumerate() {
        // Blank line between quotes, but not before the first one.
        if idx > 0 {
            out.push_str("\n\n");
        }
        out.push_str(&format!("> ({:.2}) {}", claim.confidence, claim.text));
    }
    out
}
/// Shortens `content` so that it approximately fits `max_tokens`.
///
/// Content already within budget is returned unchanged. Otherwise the number
/// of characters to keep is estimated by scaling the character count by
/// `max_tokens / actual_tokens` (at least one character is kept), and a
/// truncation marker is appended. The marker is added after the cut, so it
/// is not counted against the budget.
fn enforce_budget(content: &str, max_tokens: usize) -> String {
    let used = crate::core::tokens::count_tokens(content);
    if used > max_tokens {
        let char_total = content.chars().count();
        let fraction = max_tokens as f64 / used as f64;
        let keep = std::cmp::max((char_total as f64 * fraction) as usize, 1);
        // Byte offset just past the `keep`-th character; the whole string
        // when it has no more than `keep` characters.
        let cut = content
            .char_indices()
            .nth(keep)
            .map_or(content.len(), |(offset, _)| offset);
        let head = &content[..cut];
        return format!("{head}\n\n…[truncated to fit ~{max_tokens} token budget]");
    }
    content.to_owned()
}
/// Reports whether `content_type` names an HTML or XML-family media type.
///
/// Matching is ASCII case-insensitive because media type and subtype tokens
/// are case-insensitive in HTTP (e.g. `Text/HTML; charset=UTF-8`).
/// Any value containing `html` or `xml` qualifies, so parameters after the
/// media type do not interfere with the check.
fn is_html(content_type: &str) -> bool {
    let normalized = content_type.to_ascii_lowercase();
    normalized.contains("html") || normalized.contains("xml")
}
/// Reports whether `content_type` names a format that can be used as text
/// directly: any `text/*` type, or a type mentioning `json`, `markdown`, or
/// `plain`.
///
/// Matching is ASCII case-insensitive because media type and subtype tokens
/// are case-insensitive in HTTP (e.g. `Application/JSON`).
fn is_textual(content_type: &str) -> bool {
    let normalized = content_type.to_ascii_lowercase();
    normalized.starts_with("text/")
        || ["json", "markdown", "plain"]
            .iter()
            .any(|marker| normalized.contains(*marker))
}
// Unit tests for the pure helpers in this module; nothing here touches the
// network.
#[cfg(test)]
mod tests {
use super::*;
// Mode names are matched case-insensitively, ignoring surrounding
// whitespace, and "summary" is accepted as an alias for transcript.
#[test]
fn read_mode_parsing_is_lenient() {
assert_eq!(ReadMode::parse("MD"), Some(ReadMode::Markdown));
assert_eq!(ReadMode::parse(" transcript "), Some(ReadMode::Transcript));
assert_eq!(ReadMode::parse("summary"), Some(ReadMode::Transcript));
assert_eq!(ReadMode::parse("bogus"), None);
}
// HTML/XML types and textual types are recognised; PDF is neither.
#[test]
fn content_type_classification() {
assert!(is_html("text/html"));
assert!(is_html("application/xhtml+xml"));
assert!(is_textual("text/plain"));
assert!(is_textual("application/json"));
assert!(!is_html("application/pdf"));
assert!(!is_textual("application/pdf"));
}
// Empty claim lists produce the fixed placeholder messages; non-empty lists
// render confidences with two decimals and carry the source URL.
#[test]
fn claim_renderers_handle_empty_and_confidence() {
assert_eq!(render_facts(&[]), "No matching content found.");
assert_eq!(render_quotes(&[]), "No quotable content found.");
let claims = claims_from(
vec![("Alpha".to_string(), 0.9), ("Beta".to_string(), 0.5)],
"https://src.example/page",
);
assert_eq!(render_facts(&claims), "- (0.90) Alpha\n- (0.50) Beta");
assert_eq!(
claims[0].source_url.as_deref(),
Some("https://src.example/page")
);
}
// Relative hrefs are resolved against the page URL, and a URL that appears
// twice is listed only once.
#[test]
fn render_links_absolutizes_and_dedupes() {
let links = vec![
html_to_text::Link {
text: "rel".into(),
href: "/about".into(),
},
html_to_text::Link {
text: "abs".into(),
href: "https://y.com/z".into(),
},
html_to_text::Link {
text: "dup".into(),
href: "https://y.com/z".into(),
},
];
let out = render_links(&links, "https://x.com/dir/page");
assert!(out.contains("[rel](https://x.com/about)"));
assert!(out.contains("[abs](https://y.com/z)"));
assert_eq!(out.matches("https://y.com/z").count(), 1);
}
// Over-budget content is shortened and tagged with the truncation marker.
#[test]
fn enforce_budget_truncates_when_over() {
let big = "word ".repeat(5000);
let out = enforce_budget(&big, 50);
assert!(out.contains("[truncated"));
assert!(crate::core::tokens::count_tokens(&out) < crate::core::tokens::count_tokens(&big));
}
// Content within budget is returned unchanged, with no marker.
#[test]
fn enforce_budget_keeps_small_content() {
let small = "short content";
assert_eq!(enforce_budget(small, 1000), small);
}
}