use serde::{Deserialize, Serialize};
use serde_json::Value;
/// Settings that bound how much page content goes into a synthesis context.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SynthesisConfig {
/// Ceiling on a single page's token share; `tokens_per_page` never returns more than this.
pub max_tokens_per_page: usize,
/// Maximum number of pages `MultiPageContext::fit_to_budget` keeps after sorting by relevance.
pub max_pages: usize,
/// NOTE(review): presumably toggles summarizing pages before synthesis — confirm against the code that consumes it.
pub pre_summarize: bool,
/// NOTE(review): presumably the target summary length in tokens — confirm against the code that consumes it.
pub summary_tokens: usize,
/// NOTE(review): presumably whether relevance scores are included in generated output — confirm against consumers.
pub include_relevance: bool,
/// Relevance floor used by `MultiPageContext::relevant_pages`; `with_min_relevance` clamps it to `0.0..=1.0`.
pub min_relevance: f64,
}
impl Default for SynthesisConfig {
fn default() -> Self {
Self {
max_tokens_per_page: 4000,
max_pages: 10,
pre_summarize: true,
summary_tokens: 500,
include_relevance: true,
min_relevance: 0.3,
}
}
}
impl SynthesisConfig {
    /// Creates a configuration populated with the `Default` values.
    pub fn new() -> Self {
        Default::default()
    }

    /// Returns the configuration with `max_tokens_per_page` replaced by `tokens`.
    pub fn with_max_tokens_per_page(self, tokens: usize) -> Self {
        Self {
            max_tokens_per_page: tokens,
            ..self
        }
    }

    /// Returns the configuration with `max_pages` replaced by `max`.
    pub fn with_max_pages(self, max: usize) -> Self {
        Self {
            max_pages: max,
            ..self
        }
    }

    /// Returns the configuration with `pre_summarize` set to `enabled`.
    pub fn with_pre_summarize(self, enabled: bool) -> Self {
        Self {
            pre_summarize: enabled,
            ..self
        }
    }

    /// Returns the configuration with `min_relevance` set to `min`, clamped to `0.0..=1.0`.
    pub fn with_min_relevance(self, min: f64) -> Self {
        let bounded = min.clamp(0.0, 1.0);
        Self {
            min_relevance: bounded,
            ..self
        }
    }

    /// Splits `total_budget` evenly across `page_count` pages (integer division)
    /// and caps the share at `max_tokens_per_page`.
    ///
    /// Returns `0` when `page_count` is zero.
    pub fn tokens_per_page(&self, total_budget: usize, page_count: usize) -> usize {
        // `checked_div` yields `None` only for a zero divisor.
        match total_budget.checked_div(page_count) {
            Some(even_share) => even_share.min(self.max_tokens_per_page),
            None => 0,
        }
    }
}
/// Everything collected for a single page that may be fed into a synthesis prompt.
///
/// Optional fields that are `None` are left out when the value is serialized.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageContext {
/// URL of the page; printed on the `URL:` line of the prompt.
pub url: String,
/// Page title, when one is known.
#[serde(skip_serializing_if = "Option::is_none")]
pub title: Option<String>,
/// Structured data for the page as arbitrary JSON.
#[serde(skip_serializing_if = "Option::is_none")]
pub extracted: Option<Value>,
/// Summary text for the page; `MultiPageContext::fit_to_budget` may shorten it.
#[serde(skip_serializing_if = "Option::is_none")]
pub summary: Option<String>,
/// HTML text for the page; the first field `MultiPageContext::fit_to_budget` shortens.
#[serde(skip_serializing_if = "Option::is_none")]
pub html: Option<String>,
/// Relevance score; `new` starts it at `0.5` and `with_relevance` clamps it to `0.0..=1.0`.
pub relevance: f64,
/// Caller-supplied index handed to `new`.
pub index: usize,
/// Error message recorded for the page, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub error: Option<String>,
}
impl PageContext {
    /// Starts a context for the page at `url`, tagged with the caller-supplied `index`.
    ///
    /// Every optional field begins as `None` and `relevance` begins at `0.5`.
    pub fn new(url: impl Into<String>, index: usize) -> Self {
        let url = url.into();
        Self {
            index,
            url,
            relevance: 0.5,
            title: None,
            extracted: None,
            summary: None,
            html: None,
            error: None,
        }
    }

    /// Returns the context with `title` set.
    pub fn with_title(self, title: impl Into<String>) -> Self {
        Self {
            title: Some(title.into()),
            ..self
        }
    }

    /// Returns the context with the structured `data` attached.
    pub fn with_extracted(self, data: Value) -> Self {
        Self {
            extracted: Some(data),
            ..self
        }
    }

    /// Returns the context with `summary` set.
    pub fn with_summary(self, summary: impl Into<String>) -> Self {
        Self {
            summary: Some(summary.into()),
            ..self
        }
    }

    /// Returns the context with `html` set.
    pub fn with_html(self, html: impl Into<String>) -> Self {
        Self {
            html: Some(html.into()),
            ..self
        }
    }

    /// Returns the context with `relevance` stored after clamping to `0.0..=1.0`.
    pub fn with_relevance(self, relevance: f64) -> Self {
        let bounded = relevance.clamp(0.0, 1.0);
        Self {
            relevance: bounded,
            ..self
        }
    }

    /// Returns the context with an `error` message recorded.
    pub fn with_error(self, error: impl Into<String>) -> Self {
        Self {
            error: Some(error.into()),
            ..self
        }
    }

    /// Reports whether at least one of `extracted`, `summary` or `html` is present.
    pub fn has_content(&self) -> bool {
        let nothing_present =
            self.extracted.is_none() && self.summary.is_none() && self.html.is_none();
        !nothing_present
    }

    /// Reports whether an error message has been recorded.
    pub fn has_error(&self) -> bool {
        self.error.is_some()
    }

    /// Estimates the page's token count as one token per four bytes of text.
    ///
    /// Each of `url`, `title`, `extracted` (rendered with `to_string`),
    /// `summary` and `html` is divided by four on its own, rounding down,
    /// and the results are added together. `error` is not counted.
    pub fn estimated_tokens(&self) -> usize {
        let per_field = |text: &str| text.len() / 4;

        let url_tokens = per_field(&self.url);
        let title_tokens = self.title.as_deref().map_or(0, per_field);
        let extracted_tokens = self
            .extracted
            .as_ref()
            .map_or(0, |value| per_field(&value.to_string()));
        let summary_tokens = self.summary.as_deref().map_or(0, per_field);
        let html_tokens = self.html.as_deref().map_or(0, per_field);

        url_tokens + title_tokens + extracted_tokens + summary_tokens + html_tokens
    }
}
/// A set of pages plus the token budget and goal used to build one synthesis prompt.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MultiPageContext {
/// Pages in the context; `fit_to_budget` reorders and may drop entries.
pub pages: Vec<PageContext>,
/// Total token budget that `fit_to_budget` divides among the pages.
pub total_token_budget: usize,
/// Optional goal text, printed on the `Goal:` line of the prompt; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub goal: Option<String>,
/// Synthesis settings. Marked `#[serde(skip)]`, so it is never serialized and
/// deserialization fills it with `SynthesisConfig::default()`.
#[serde(skip)]
pub config: SynthesisConfig,
}
impl MultiPageContext {
    /// Byte length of the `"...[truncated]"` marker that `truncate_to_char_boundary`
    /// appends; reserved up front so a shortened field still fits its allowance.
    const TRUNCATION_MARKER_BYTES: usize = "...[truncated]".len();

    /// Creates an empty context with the given total token budget, no goal,
    /// and the default `SynthesisConfig`.
    pub fn new(total_token_budget: usize) -> Self {
        Self {
            pages: Vec::new(),
            total_token_budget,
            goal: None,
            config: SynthesisConfig::default(),
        }
    }

    /// Returns the context with its configuration replaced by `config`.
    pub fn with_config(mut self, config: SynthesisConfig) -> Self {
        self.config = config;
        self
    }

    /// Returns the context with `goal` set.
    pub fn with_goal(mut self, goal: impl Into<String>) -> Self {
        self.goal = Some(goal.into());
        self
    }

    /// Appends `page` to the end of the page list.
    pub fn add_page(&mut self, page: PageContext) {
        self.pages.push(page);
    }

    /// Number of pages currently held.
    pub fn page_count(&self) -> usize {
        self.pages.len()
    }

    /// Orders two pages by descending relevance, placing NaN relevance last.
    ///
    /// Handling NaN explicitly keeps the comparator a total order; a comparator
    /// that is not one may make `sort_by` panic or leave the order unspecified.
    /// For NaN-free input this matches `b.relevance.partial_cmp(&a.relevance)`.
    fn cmp_relevance_desc(a: &PageContext, b: &PageContext) -> std::cmp::Ordering {
        match (a.relevance.is_nan(), b.relevance.is_nan()) {
            (true, true) => std::cmp::Ordering::Equal,
            (true, false) => std::cmp::Ordering::Greater,
            (false, true) => std::cmp::Ordering::Less,
            (false, false) => b
                .relevance
                .partial_cmp(&a.relevance)
                .unwrap_or(std::cmp::Ordering::Equal),
        }
    }

    /// Shortens `text` so that, marker included, it fits in `allowance_tokens`
    /// (four bytes per token). Text that already fits, or that would not get
    /// shorter once the marker is appended, is left untouched.
    fn shrink_to_allowance(text: &mut String, allowance_tokens: usize) {
        let keep_bytes = allowance_tokens
            .saturating_mul(4)
            .saturating_sub(Self::TRUNCATION_MARKER_BYTES);
        if text.len() > keep_bytes.saturating_add(Self::TRUNCATION_MARKER_BYTES) {
            *text = truncate_to_char_boundary(text, keep_bytes);
        }
    }

    /// Returns references to all pages sorted by descending relevance.
    ///
    /// The sort is stable, so pages with equal relevance keep insertion order;
    /// pages whose relevance is NaN come last. `self.pages` is not modified.
    pub fn pages_by_relevance(&self) -> Vec<&PageContext> {
        let mut pages: Vec<_> = self.pages.iter().collect();
        pages.sort_by(|a, b| Self::cmp_relevance_desc(a, b));
        pages
    }

    /// Returns the pages whose relevance is at least `config.min_relevance`,
    /// in their current order.
    pub fn relevant_pages(&self) -> Vec<&PageContext> {
        self.pages
            .iter()
            .filter(|p| p.relevance >= self.config.min_relevance)
            .collect()
    }

    /// Trims the context in place so it fits the token budget.
    ///
    /// Steps:
    /// 1. Sort pages by descending relevance (NaN last).
    /// 2. Keep at most `config.max_pages` pages.
    /// 3. Compute the per-page allowance with `SynthesisConfig::tokens_per_page`.
    /// 4. For every page whose `estimated_tokens` exceeds the allowance, shorten
    ///    `html` first and then, if still over, `summary`. Each is cut to what
    ///    remains of the allowance after the page's other fields.
    ///
    /// `url`, `title` and `extracted` are never shortened, so a page whose
    /// fixed fields alone exceed the allowance can remain over budget.
    pub fn fit_to_budget(&mut self) {
        self.pages.sort_by(Self::cmp_relevance_desc);
        // `truncate` is a no-op when the list is already short enough.
        self.pages.truncate(self.config.max_pages);

        let tokens_per_page = self
            .config
            .tokens_per_page(self.total_token_budget, self.pages.len());

        for page in &mut self.pages {
            let page_tokens = page.estimated_tokens();
            if page_tokens <= tokens_per_page {
                continue;
            }
            if let Some(html) = page.html.as_mut() {
                // Tokens consumed by everything except the HTML itself.
                let other_tokens = page_tokens.saturating_sub(html.len() / 4);
                Self::shrink_to_allowance(html, tokens_per_page.saturating_sub(other_tokens));
            }

            let page_tokens = page.estimated_tokens();
            if page_tokens <= tokens_per_page {
                continue;
            }
            if let Some(summary) = page.summary.as_mut() {
                let other_tokens = page_tokens.saturating_sub(summary.len() / 4);
                Self::shrink_to_allowance(summary, tokens_per_page.saturating_sub(other_tokens));
            }
        }
    }

    /// Renders the context as the text prompt sent for synthesis.
    ///
    /// The prompt contains a header, the optional goal, the page count, one
    /// section per page in current order, and the task instructions. Page
    /// headers are numbered by position starting at 1, not by
    /// `PageContext::index`. The `Relevance:` line is emitted only when
    /// `config.include_relevance` is set (the default).
    pub fn to_prompt(&self) -> String {
        // Rough sizes of the fixed text around and inside each page section;
        // only used as a capacity hint.
        const PROMPT_FRAME_BYTES: usize = 512;
        const PAGE_FRAME_BYTES: usize = 128;

        // Size the buffer from the text actually present rather than from the
        // budget, which may be far larger than the content (or overflow when
        // multiplied). Extracted JSON is not counted; the string grows as needed.
        let content_bytes: usize = self
            .pages
            .iter()
            .map(|p| {
                p.url.len()
                    + p.title.as_ref().map_or(0, |t| t.len())
                    + p.summary.as_ref().map_or(0, |s| s.len())
                    + p.html.as_ref().map_or(0, |h| h.len())
                    + p.error.as_ref().map_or(0, |e| e.len())
                    + PAGE_FRAME_BYTES
            })
            .sum();
        let goal_bytes = self.goal.as_ref().map_or(0, |g| g.len());
        let mut prompt = String::with_capacity(PROMPT_FRAME_BYTES + goal_bytes + content_bytes);

        prompt.push_str("MULTI-PAGE SYNTHESIS REQUEST\n\n");
        if let Some(goal) = &self.goal {
            prompt.push_str("Goal: ");
            prompt.push_str(goal);
            prompt.push_str("\n\n");
        }
        prompt.push_str(&format!("Pages to analyze: {}\n\n", self.pages.len()));

        for (i, page) in self.pages.iter().enumerate() {
            prompt.push_str(&format!("=== PAGE {} ===\n", i + 1));
            prompt.push_str("URL: ");
            prompt.push_str(&page.url);
            prompt.push('\n');
            if let Some(title) = &page.title {
                prompt.push_str("Title: ");
                prompt.push_str(title);
                prompt.push('\n');
            }
            if self.config.include_relevance {
                prompt.push_str(&format!("Relevance: {:.2}\n", page.relevance));
            }
            if let Some(extracted) = &page.extracted {
                prompt.push_str("Extracted Data:\n");
                // A serialization failure yields an empty section instead of aborting the prompt.
                prompt.push_str(&serde_json::to_string_pretty(extracted).unwrap_or_default());
                prompt.push_str("\n\n");
            }
            if let Some(summary) = &page.summary {
                prompt.push_str("Summary:\n");
                prompt.push_str(summary);
                prompt.push_str("\n\n");
            }
            if let Some(html) = &page.html {
                prompt.push_str("HTML Content:\n");
                prompt.push_str(html);
                prompt.push_str("\n\n");
            }
            if let Some(error) = &page.error {
                prompt.push_str("Error: ");
                prompt.push_str(error);
                prompt.push_str("\n\n");
            }
            prompt.push('\n');
        }

        prompt.push_str("TASK:\n");
        prompt.push_str(
            "Synthesize the information from all pages above. Return a JSON object with:\n",
        );
        prompt.push_str("- synthesis: the combined analysis/answer\n");
        prompt.push_str(
            "- page_contributions: array of { page_index, contribution } for each page\n",
        );
        prompt.push_str("- confidence: overall confidence in the synthesis (0.0-1.0)\n");
        prompt
    }

    /// Sum of `PageContext::estimated_tokens` over all pages.
    pub fn total_estimated_tokens(&self) -> usize {
        self.pages.iter().map(|p| p.estimated_tokens()).sum()
    }
}
/// Outcome of a synthesis together with bookkeeping about how it was produced.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SynthesisResult {
/// Synthesized payload as JSON; `from_json` copies it from the `"synthesis"` key.
pub synthesis: Value,
/// Per-page contribution entries.
pub page_contributions: Vec<PageContribution>,
/// Confidence score; `new` and `from_json` clamp it to `0.0..=1.0`.
pub confidence: f64,
/// Page count; `with_contributions` sets it to the number of contributions supplied.
pub pages_used: usize,
/// Token count stored by `with_tokens`; `0` until set.
pub tokens_used: usize,
/// Duration stored by `with_duration`, in milliseconds per the field name; `0` until set.
pub duration_ms: u64,
}
impl SynthesisResult {
    /// Creates a result holding `synthesis` with `confidence` clamped to
    /// `0.0..=1.0`, no contributions, and all counters at zero.
    pub fn new(synthesis: Value, confidence: f64) -> Self {
        Self {
            synthesis,
            page_contributions: Vec::new(),
            confidence: confidence.clamp(0.0, 1.0),
            pages_used: 0,
            tokens_used: 0,
            duration_ms: 0,
        }
    }

    /// Returns the result with `contributions` attached and `pages_used`
    /// set to their count.
    pub fn with_contributions(mut self, contributions: Vec<PageContribution>) -> Self {
        self.pages_used = contributions.len();
        self.page_contributions = contributions;
        self
    }

    /// Returns the result with `tokens_used` set to `tokens`.
    pub fn with_tokens(mut self, tokens: usize) -> Self {
        self.tokens_used = tokens;
        self
    }

    /// Returns the result with `duration_ms` set to `ms`.
    pub fn with_duration(mut self, ms: u64) -> Self {
        self.duration_ms = ms;
        self
    }

    /// Builds a result from a JSON object.
    ///
    /// Returns `None` when the `"synthesis"` key is missing. `"confidence"`
    /// falls back to `0.5` when absent or not a number and is clamped to
    /// `0.0..=1.0`. Entries of `"page_contributions"` that
    /// `PageContribution::from_json` rejects are skipped; a missing or
    /// non-array value yields no contributions.
    ///
    /// `pages_used` is set to the number of contributions parsed, the same
    /// bookkeeping `with_contributions` performs. `tokens_used` and
    /// `duration_ms` start at zero.
    pub fn from_json(value: &Value) -> Option<Self> {
        let synthesis = value.get("synthesis")?.clone();
        let confidence = value
            .get("confidence")
            .and_then(|v| v.as_f64())
            .unwrap_or(0.5);
        let page_contributions: Vec<PageContribution> = value
            .get("page_contributions")
            .and_then(|v| v.as_array())
            .map(|arr| arr.iter().filter_map(PageContribution::from_json).collect())
            .unwrap_or_default();
        // Reuse the builders so `confidence` clamping and `pages_used`
        // bookkeeping stay in one place.
        Some(Self::new(synthesis, confidence).with_contributions(page_contributions))
    }

    /// Returns the contributions whose `weight` is at least `min_contribution`,
    /// in stored order.
    pub fn significant_contributors(&self, min_contribution: f64) -> Vec<&PageContribution> {
        self.page_contributions
            .iter()
            .filter(|c| c.weight >= min_contribution)
            .collect()
    }
}
/// What a single page contributed to a synthesis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageContribution {
/// Index of the contributing page; `from_json` reads it from the `"page_index"` key.
pub page_index: usize,
/// URL of the contributing page, when known; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub url: Option<String>,
/// Text describing the page's contribution.
pub contribution: String,
/// Contribution weight; `new` and `from_json` clamp it to `0.0..=1.0`.
pub weight: f64,
/// Key points attributed to the page; defaults to empty when deserializing and is omitted from output when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub key_points: Vec<String>,
}
impl PageContribution {
    /// Creates a contribution for the page at `page_index` with `weight`
    /// clamped to `0.0..=1.0`, no URL and no key points.
    pub fn new(page_index: usize, contribution: impl Into<String>, weight: f64) -> Self {
        Self {
            page_index,
            url: None,
            contribution: contribution.into(),
            weight: weight.clamp(0.0, 1.0),
            key_points: Vec::new(),
        }
    }

    /// Returns the contribution with `url` set.
    pub fn with_url(mut self, url: impl Into<String>) -> Self {
        self.url = Some(url.into());
        self
    }

    /// Returns the contribution with its key points replaced by `points`.
    pub fn with_key_points(mut self, points: Vec<String>) -> Self {
        self.key_points = points;
        self
    }

    /// Builds a contribution from a JSON object.
    ///
    /// Returns `None` when `"page_index"` is missing, is not an unsigned
    /// integer, or does not fit in `usize`, or when `"contribution"` is
    /// missing or not a string. `"weight"` falls back to `0.5` and is clamped
    /// to `0.0..=1.0`; `"url"` is optional; non-string entries of
    /// `"key_points"` are skipped.
    pub fn from_json(value: &Value) -> Option<Self> {
        let raw_index = value.get("page_index").and_then(|v| v.as_u64())?;
        // Checked conversion: a plain `as usize` cast would silently wrap on
        // targets where `usize` is narrower than 64 bits.
        let page_index = <usize as std::convert::TryFrom<u64>>::try_from(raw_index).ok()?;
        let contribution = value
            .get("contribution")
            .and_then(|v| v.as_str())?
            .to_string();
        let weight = value.get("weight").and_then(|v| v.as_f64()).unwrap_or(0.5);
        let url = value.get("url").and_then(|v| v.as_str()).map(String::from);
        let key_points = value
            .get("key_points")
            .and_then(|v| v.as_array())
            .map(|arr| {
                arr.iter()
                    .filter_map(|v| v.as_str().map(String::from))
                    .collect()
            })
            .unwrap_or_default();
        Some(Self {
            page_index,
            url,
            contribution,
            weight: weight.clamp(0.0, 1.0),
            key_points,
        })
    }
}
/// Cuts `s` down to at most `max_len` bytes of its original text and appends
/// `"...[truncated]"`.
///
/// The cut backs up to the nearest UTF-8 character boundary at or below
/// `max_len`. Strings of `max_len` bytes or fewer are returned unchanged,
/// without the marker. The marker is not counted against `max_len`.
fn truncate_to_char_boundary(s: &str, max_len: usize) -> String {
    if max_len >= s.len() {
        return s.to_owned();
    }

    // Index 0 is always a boundary, so the search cannot come up empty.
    let cut = (0..=max_len)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);

    let mut shortened = String::with_capacity(cut + "...[truncated]".len());
    shortened.push_str(&s[..cut]);
    shortened.push_str("...[truncated]");
    shortened
}
// Unit tests for the configuration builders, page contexts, result parsing
// and the truncation helper defined in this file.
#[cfg(test)]
mod tests {
use super::*;
// Builder setters store the values they are given.
#[test]
fn test_synthesis_config() {
let config = SynthesisConfig::new()
.with_max_pages(5)
.with_max_tokens_per_page(2000)
.with_min_relevance(0.5);
assert_eq!(config.max_pages, 5);
assert_eq!(config.max_tokens_per_page, 2000);
assert_eq!(config.min_relevance, 0.5);
}
// The per-page share is the even split of the budget, capped at max_tokens_per_page.
#[test]
fn test_tokens_per_page() {
let config = SynthesisConfig::new().with_max_tokens_per_page(1000);
assert_eq!(config.tokens_per_page(5000, 5), 1000);
assert_eq!(config.tokens_per_page(10000, 5), 1000);
assert_eq!(config.tokens_per_page(2000, 5), 400);
}
// A page with a summary counts as having content, has no error, and estimates to a non-zero token count.
#[test]
fn test_page_context() {
let page = PageContext::new("https://example.com", 0)
.with_title("Example")
.with_relevance(0.8)
.with_summary("A test page");
assert!(page.has_content());
assert!(!page.has_error());
assert!(page.estimated_tokens() > 0);
}
// Pages are counted as added and pages_by_relevance puts the highest relevance first.
#[test]
fn test_multi_page_context() {
let mut ctx = MultiPageContext::new(10000)
.with_goal("Compare products")
.with_config(SynthesisConfig::new().with_max_pages(3));
ctx.add_page(PageContext::new("https://a.com", 0).with_relevance(0.9));
ctx.add_page(PageContext::new("https://b.com", 1).with_relevance(0.7));
ctx.add_page(PageContext::new("https://c.com", 2).with_relevance(0.5));
assert_eq!(ctx.page_count(), 3);
let by_relevance = ctx.pages_by_relevance();
assert_eq!(by_relevance[0].url, "https://a.com");
}
// With max_pages = 2, fit_to_budget drops the least relevant of three pages.
#[test]
fn test_fit_to_budget() {
let mut ctx =
MultiPageContext::new(1000).with_config(SynthesisConfig::new().with_max_pages(2));
ctx.add_page(PageContext::new("https://a.com", 0).with_relevance(0.9));
ctx.add_page(PageContext::new("https://b.com", 1).with_relevance(0.8));
ctx.add_page(PageContext::new("https://c.com", 2).with_relevance(0.7));
ctx.fit_to_budget();
assert_eq!(ctx.page_count(), 2);
assert_eq!(ctx.pages[0].url, "https://a.com");
}
// with_contributions records the contribution count; only the 0.7-weight entry meets the 0.5 threshold.
#[test]
fn test_synthesis_result() {
let result = SynthesisResult::new(serde_json::json!({"answer": "Combined data"}), 0.85)
.with_contributions(vec![
PageContribution::new(0, "Provided main data", 0.7),
PageContribution::new(1, "Supplementary info", 0.3),
])
.with_tokens(500);
assert_eq!(result.pages_used, 2);
assert_eq!(result.confidence, 0.85);
let significant = result.significant_contributors(0.5);
assert_eq!(significant.len(), 1);
}
// A fully populated JSON object parses into a PageContribution with matching fields.
#[test]
fn test_page_contribution_parsing() {
let json = serde_json::json!({
"page_index": 0,
"url": "https://example.com",
"contribution": "Main source of data",
"weight": 0.8,
"key_points": ["Point 1", "Point 2"]
});
let contrib = PageContribution::from_json(&json).unwrap();
assert_eq!(contrib.page_index, 0);
assert_eq!(contrib.weight, 0.8);
assert_eq!(contrib.key_points.len(), 2);
}
// The rendered prompt contains the header, the goal, and the page's URL and title.
#[test]
fn test_to_prompt() {
let mut ctx = MultiPageContext::new(10000).with_goal("Find the best product");
ctx.add_page(
PageContext::new("https://a.com", 0)
.with_title("Product A")
.with_summary("A great product"),
);
let prompt = ctx.to_prompt();
assert!(prompt.contains("MULTI-PAGE SYNTHESIS"));
assert!(prompt.contains("Find the best product"));
assert!(prompt.contains("https://a.com"));
assert!(prompt.contains("Product A"));
}
// Byte 10 is a char boundary here (7 ASCII bytes plus one 3-byte character),
// so the result is 10 bytes of text plus the 14-byte marker.
#[test]
fn test_truncate_to_char_boundary() {
let s = "Hello, 世界!";
let truncated = truncate_to_char_boundary(s, 10);
assert!(truncated.len() <= 25); assert!(truncated.ends_with("...[truncated]"));
}
}