use crate::Result;
#[cfg(feature = "semantic")]
use serde_json::{json, Value};
use super::chunking::DocumentChunk;
/// Descriptive metadata that the exporters in this module attach to a document.
#[derive(Debug, Clone)]
pub struct DocumentMetadata {
/// Document title; used for front matter, headings and the prose preamble.
pub title: String,
/// Number of pages reported for the document (emitted as-is, not recomputed).
pub page_count: usize,
/// Optional creation timestamp; stored and emitted as free-form text.
pub created_at: Option<String>,
/// Optional author name.
pub author: Option<String>,
}
impl Default for DocumentMetadata {
fn default() -> Self {
Self {
title: "Untitled Document".to_string(),
page_count: 0,
created_at: None,
author: None,
}
}
}
/// Options controlling [`MarkdownExporter`] output.
#[derive(Debug, Clone)]
pub struct MarkdownOptions {
/// When `true`, `MarkdownExporter::export` wraps the text via `export_text`;
/// when `false`, the text is returned unchanged.
pub include_metadata: bool,
/// Page-number toggle. NOTE(review): not read by any exporter method visible
/// in this part of the file — confirm whether it is consumed elsewhere.
pub include_page_numbers: bool,
}
impl Default for MarkdownOptions {
fn default() -> Self {
Self {
include_metadata: true,
include_page_numbers: true,
}
}
}
/// Exports document text as Markdown, optionally with YAML front matter and
/// per-page sections (see the associated functions on this type).
#[derive(Debug, Clone)]
pub struct MarkdownExporter {
// Options consulted by the instance method `export`.
options: MarkdownOptions,
}
impl MarkdownExporter {
    /// Creates an exporter that uses the given options.
    pub fn new(options: MarkdownOptions) -> Self {
        Self { options }
    }

    /// Creates an exporter configured with `MarkdownOptions::default()`.
    pub fn default() -> Self {
        Self::new(MarkdownOptions::default())
    }

    /// Exports `text` according to this exporter's options.
    ///
    /// With `include_metadata` set, the text is wrapped by
    /// [`Self::export_text`]; otherwise it is returned unchanged.
    pub fn export(&self, text: &str) -> Result<String> {
        if self.options.include_metadata {
            Self::export_text(text)
        } else {
            Ok(text.to_string())
        }
    }

    /// Prefixes `text` with a generic `# Document` heading and a blank line.
    pub fn export_text(text: &str) -> Result<String> {
        const HEADING: &str = "# Document\n\n";
        let mut output = String::with_capacity(HEADING.len() + text.len());
        output.push_str(HEADING);
        output.push_str(text);
        Ok(output)
    }

    /// Renders YAML front matter built from `metadata`, a `# <title>`
    /// heading, and then `text`.
    ///
    /// Front matter values are emitted as plain YAML scalars when that is
    /// safe and as escaped double-quoted scalars otherwise.
    pub fn export_with_metadata(text: &str, metadata: &DocumentMetadata) -> Result<String> {
        let mut output = String::new();
        Self::push_front_matter_and_heading(&mut output, metadata);
        output.push_str(text);
        Ok(output)
    }

    /// Renders a `# Document` heading followed by one `**Page N**` section
    /// per entry, with `---` rules between consecutive pages.
    pub fn export_with_pages(page_texts: &[(usize, String)]) -> Result<String> {
        let mut output = String::from("# Document\n\n");
        Self::push_pages(&mut output, page_texts);
        Ok(output)
    }

    /// Combines [`Self::export_with_metadata`]'s front matter and heading
    /// with the per-page sections of [`Self::export_with_pages`].
    pub fn export_with_metadata_and_pages(
        page_texts: &[(usize, String)],
        metadata: &DocumentMetadata,
    ) -> Result<String> {
        let mut output = String::new();
        Self::push_front_matter_and_heading(&mut output, metadata);
        Self::push_pages(&mut output, page_texts);
        Ok(output)
    }

    /// Appends the `---`-delimited YAML front matter and the `# <title>`
    /// heading shared by both metadata-aware exports.
    fn push_front_matter_and_heading(output: &mut String, metadata: &DocumentMetadata) {
        output.push_str("---\n");
        output.push_str(&format!("title: {}\n", Self::yaml_scalar(&metadata.title)));
        output.push_str(&format!("pages: {}\n", metadata.page_count));
        if let Some(created) = metadata.created_at.as_deref() {
            output.push_str(&format!("created: {}\n", Self::yaml_scalar(created)));
        }
        if let Some(author) = metadata.author.as_deref() {
            output.push_str(&format!("author: {}\n", Self::yaml_scalar(author)));
        }
        output.push_str("---\n\n");
        // The Markdown heading uses the raw title: YAML escaping does not apply here.
        output.push_str(&format!("# {}\n\n", metadata.title));
    }

    /// Appends one `**Page N**` section per entry, separated by `---` rules.
    fn push_pages(output: &mut String, page_texts: &[(usize, String)]) {
        for (index, (page_num, text)) in page_texts.iter().enumerate() {
            if index > 0 {
                output.push_str("\n\n---\n\n");
            }
            output.push_str(&format!("**Page {}**\n\n", page_num));
            output.push_str(text);
        }
    }

    /// Formats `value` as a single-line YAML scalar.
    ///
    /// The value is returned unchanged (plain scalar) unless it could be
    /// misread structurally, in which case it is wrapped in double quotes
    /// with `\`, `"` and control characters escaped. Quoting is triggered by:
    /// - `:` or `#` anywhere (a conservative superset of YAML's `": "` and
    ///   `" #"` rules, kept because existing output already quotes on them);
    /// - any control character (a raw newline would end the scalar);
    /// - a leading YAML indicator character such as `"`, `[`, `*` or `-`.
    fn yaml_scalar(value: &str) -> String {
        let starts_with_indicator = value.starts_with(|c: char| {
            matches!(
                c,
                '"' | '\''
                    | '['
                    | ']'
                    | '{'
                    | '}'
                    | '&'
                    | '*'
                    | '!'
                    | '|'
                    | '>'
                    | '%'
                    | '@'
                    | '`'
                    | ','
                    | '?'
                    | '-'
            )
        });
        let needs_quotes = starts_with_indicator
            || value
                .chars()
                .any(|c| c == ':' || c == '#' || c.is_control());
        if !needs_quotes {
            return value.to_string();
        }

        let mut quoted = String::with_capacity(value.len() + 2);
        quoted.push('"');
        for c in value.chars() {
            match c {
                // Backslash must be escaped first-class: inside double quotes
                // YAML treats it as the escape introducer.
                '\\' => quoted.push_str("\\\\"),
                '"' => quoted.push_str("\\\""),
                '\n' => quoted.push_str("\\n"),
                '\r' => quoted.push_str("\\r"),
                '\t' => quoted.push_str("\\t"),
                // Remaining control characters are all below U+0100, so the
                // two-digit `\xNN` escape is sufficient.
                c if c.is_control() => quoted.push_str(&format!("\\x{:02X}", c as u32)),
                c => quoted.push(c),
            }
        }
        quoted.push('"');
        quoted
    }
}
/// Options controlling [`JsonExporter`] output (only built with the
/// `semantic` feature).
#[cfg(feature = "semantic")]
#[derive(Debug, Clone)]
pub struct JsonOptions {
/// When `true`, `JsonExporter::export` emits indented JSON; otherwise compact JSON.
pub pretty_print: bool,
/// Chunk toggle. NOTE(review): not read by any exporter method visible in
/// this part of the file — confirm whether it is consumed elsewhere.
pub include_chunks: bool,
}
#[cfg(feature = "semantic")]
impl Default for JsonOptions {
    /// Returns options with pretty-printing on and chunk inclusion off.
    fn default() -> Self {
        JsonOptions {
            include_chunks: false,
            pretty_print: true,
        }
    }
}
/// Exports document text, pages or chunks as JSON (only built with the
/// `semantic` feature).
#[cfg(feature = "semantic")]
#[derive(Debug, Clone)]
pub struct JsonExporter {
// Options consulted by the instance method `export`.
options: JsonOptions,
}
#[cfg(feature = "semantic")]
impl JsonExporter {
    /// Builds an exporter configured by `options`.
    pub fn new(options: JsonOptions) -> Self {
        JsonExporter { options }
    }

    /// Builds an exporter configured with `JsonOptions::default()`.
    pub fn default() -> Self {
        Self::new(JsonOptions::default())
    }

    /// Serialises `text` as a `{"type": "document", "content": ...}` object,
    /// indented or compact depending on `pretty_print`.
    ///
    /// # Errors
    /// Returns `PdfError::SerializationError` if serialisation fails.
    pub fn export(&self, text: &str) -> Result<String> {
        let document = json!({
            "type": "document",
            "content": text
        });
        let rendered = if self.options.pretty_print {
            serde_json::to_string_pretty(&document)
        } else {
            serde_json::to_string(&document)
        };
        rendered.map_err(|err| crate::error::PdfError::SerializationError(err.to_string()))
    }

    /// Serialises `text` as an indented document object, independent of any
    /// exporter options.
    pub fn export_simple(text: &str) -> Result<String> {
        let document = json!({
            "type": "document",
            "content": text
        });
        Self::render_pretty(&document)
    }

    /// Serialises `text` together with a `metadata` object holding the title
    /// and page count, plus `created_at` / `author` when they are present.
    pub fn export_with_metadata(text: &str, metadata: &DocumentMetadata) -> Result<String> {
        let mut meta = json!({
            "title": metadata.title,
            "page_count": metadata.page_count
        });
        // Optional keys are added only when set, in the same order as before.
        if let Some(created) = &metadata.created_at {
            meta["created_at"] = json!(created);
        }
        if let Some(author) = &metadata.author {
            meta["author"] = json!(author);
        }
        let document = json!({
            "type": "document",
            "metadata": meta,
            "content": text
        });
        Self::render_pretty(&document)
    }

    /// Serialises each `(page_number, text)` pair as an entry of a `pages`
    /// array; `page_count` is the number of entries supplied.
    pub fn export_pages(page_texts: &[(usize, String)]) -> Result<String> {
        let mut pages: Vec<Value> = Vec::with_capacity(page_texts.len());
        for (page_num, text) in page_texts {
            pages.push(json!({
                "page_number": page_num,
                "content": text
            }));
        }
        let document = json!({
            "type": "document",
            "page_count": page_texts.len(),
            "pages": pages
        });
        Self::render_pretty(&document)
    }

    /// Serialises `chunks` as a `chunked_document` object with one entry per
    /// chunk, including its position and quality metadata.
    pub fn export_with_chunks(chunks: &[DocumentChunk]) -> Result<String> {
        let mut entries: Vec<Value> = Vec::with_capacity(chunks.len());
        for chunk in chunks {
            let meta = &chunk.metadata;
            entries.push(json!({
                "id": chunk.id,
                "content": chunk.content,
                "tokens": chunk.tokens,
                "page_numbers": chunk.page_numbers,
                "chunk_index": chunk.chunk_index,
                "metadata": {
                    "position": {
                        "start_char": meta.position.start_char,
                        "end_char": meta.position.end_char,
                        "first_page": meta.position.first_page,
                        "last_page": meta.position.last_page
                    },
                    "confidence": meta.confidence,
                    "sentence_boundary_respected": meta.sentence_boundary_respected
                }
            }));
        }
        let document = json!({
            "type": "chunked_document",
            "chunk_count": chunks.len(),
            "chunks": entries
        });
        Self::render_pretty(&document)
    }

    /// Pretty-prints `document`, mapping serde failures to
    /// `PdfError::SerializationError`.
    fn render_pretty(document: &Value) -> Result<String> {
        serde_json::to_string_pretty(document)
            .map_err(|err| crate::error::PdfError::SerializationError(err.to_string()))
    }
}
#[cfg(feature = "semantic")]
impl ChunkExporter for JsonExporter {
    /// Delegates to [`JsonExporter::export_with_chunks`]; the exporter's own
    /// options are not consulted.
    fn export_chunks(&self, chunks: &[DocumentChunk]) -> Result<String> {
        Self::export_with_chunks(chunks)
    }
}
/// Namespace type for the plain-prose export format: every export is an
/// associated function, so the type carries no state.
#[derive(Debug, Clone)]
pub struct ContextualFormat;
impl ContextualFormat {
    /// Prefixes `text` with a `Document content:` lead-in.
    pub fn export_simple(text: &str) -> Result<String> {
        let mut rendered = String::from("Document content:\n\n");
        rendered.push_str(text);
        Ok(rendered)
    }

    /// Renders a one-sentence description of `metadata`, a `Content:`
    /// lead-in, and then `text`.
    pub fn export_with_metadata(text: &str, metadata: &DocumentMetadata) -> Result<String> {
        let mut rendered = Self::preamble(metadata);
        rendered.push_str(text);
        Ok(rendered)
    }

    /// Renders every page as an `On page N:` section after a
    /// `Document content:` lead-in.
    pub fn export_with_pages(page_texts: &[(usize, String)]) -> Result<String> {
        let mut rendered = String::from("Document content:\n\n");
        Self::append_pages(&mut rendered, page_texts);
        Ok(rendered)
    }

    /// Renders the metadata sentence of [`Self::export_with_metadata`]
    /// followed by the per-page sections of [`Self::export_with_pages`].
    pub fn export_with_metadata_and_pages(
        page_texts: &[(usize, String)],
        metadata: &DocumentMetadata,
    ) -> Result<String> {
        let mut rendered = Self::preamble(metadata);
        Self::append_pages(&mut rendered, page_texts);
        Ok(rendered)
    }

    /// Builds the descriptive sentence plus the `Content:` lead-in.
    fn preamble(metadata: &DocumentMetadata) -> String {
        let mut intro = format!("This is a document titled \"{}\"", metadata.title);
        // The page clause is omitted entirely for a zero page count and
        // pluralised for anything other than exactly one page.
        match metadata.page_count {
            0 => {}
            1 => intro.push_str(" with 1 page"),
            count => intro.push_str(&format!(" with {} pages", count)),
        }
        if let Some(author) = metadata.author.as_deref() {
            intro.push_str(", written by ");
            intro.push_str(author);
        }
        if let Some(created) = metadata.created_at.as_deref() {
            intro.push_str(", created on ");
            intro.push_str(created);
        }
        intro.push_str(".\n\nContent:\n\n");
        intro
    }

    /// Appends one `On page N:` section per entry, each followed by a blank line.
    fn append_pages(rendered: &mut String, page_texts: &[(usize, String)]) {
        for (page_num, text) in page_texts {
            rendered.push_str("On page ");
            rendered.push_str(&page_num.to_string());
            rendered.push_str(":\n");
            rendered.push_str(text);
            rendered.push_str("\n\n");
        }
    }
}
/// Common interface for exporters that serialise a slice of document chunks
/// into a single string.
pub trait ChunkExporter {
/// Serialises `chunks` in the implementor's format.
fn export_chunks(&self, chunks: &[DocumentChunk]) -> Result<String>;
}
/// Stateless exporter for the tab-separated chunk format whose first line is
/// the `#oxct/1` marker (see the associated `export_chunks` / `parse_chunks`).
#[derive(Debug, Clone, Default)]
pub struct TokenEfficientExporter;
impl TokenEfficientExporter {
    /// Format/version marker emitted as the first line of every export.
    const MAGIC: &'static str = "#oxct/1";
    /// Tab-separated column names emitted as the second line.
    const HEADER: &'static str = "id\ttokens\tchunk_index\tstart_char\tend_char\tfirst_page\tlast_page\tconfidence\tsentence_boundary\tpage_numbers\tcontent";
    /// Number of tab-separated columns in a data row (matches `HEADER`).
    const COLUMN_COUNT: usize = 11;

    /// Creates the (stateless) exporter.
    pub fn new() -> Self {
        Self
    }

    /// Serialises `chunks` as the marker line, the header line and one
    /// tab-separated row per chunk, joined by `\n` with no trailing newline.
    ///
    /// Content containing `"`, `\n` or `\r` is wrapped in double quotes with
    /// inner quotes doubled; `confidence` is written with four decimals.
    pub fn export_chunks(&self, chunks: &[DocumentChunk]) -> Result<String> {
        let mut out = String::new();
        out.push_str(Self::MAGIC);
        out.push('\n');
        out.push_str(Self::HEADER);
        for chunk in chunks {
            out.push('\n');
            out.push_str(&Self::encode_row(chunk));
        }
        Ok(out)
    }

    /// Parses text produced by [`Self::export_chunks`] back into chunks.
    ///
    /// Lines may end in `\n` or `\r\n`: trailing `\r` characters are removed
    /// from every logical line (marker, header and data rows alike). This is
    /// lossless because the encoder quotes any content containing `\r`, so a
    /// well-formed row never ends in a bare `\r`. Empty lines are skipped.
    ///
    /// # Errors
    /// Returns `PdfError::InvalidStructure` when the marker or header is
    /// missing or different, a quoted field is unterminated, or any row is
    /// malformed (see [`Self::parse_row`]).
    pub fn parse_chunks(input: &str) -> Result<Vec<DocumentChunk>> {
        let logical = Self::rejoin_quoted_lines(input)?;
        let mut lines = logical.iter().map(|line| line.trim_end_matches('\r'));
        match lines.next() {
            Some(Self::MAGIC) => {}
            other => {
                return Err(crate::error::PdfError::InvalidStructure(format!(
                    "token-efficient: unexpected version marker {other:?}, expected {:?}",
                    Self::MAGIC
                )))
            }
        }
        match lines.next() {
            Some(Self::HEADER) => {}
            other => {
                return Err(crate::error::PdfError::InvalidStructure(format!(
                    "token-efficient: unexpected column header {other:?}"
                )))
            }
        }
        // Collecting into `Result` stops at the first malformed row.
        lines
            .filter(|line| !line.is_empty())
            .map(Self::parse_row)
            .collect()
    }

    /// Parses one logical data row (already stripped of its line ending).
    ///
    /// # Errors
    /// Returns `PdfError::InvalidStructure` when the row does not have
    /// exactly eleven columns, a numeric column does not parse, `confidence`
    /// is not finite, `sentence_boundary` is neither `true` nor `false`, or
    /// the content field has unbalanced quoting.
    fn parse_row(line: &str) -> Result<DocumentChunk> {
        use crate::ai::chunking::{ChunkMetadata, ChunkPosition};
        // `splitn` keeps any tabs inside the final (content) column intact.
        let fields: Vec<&str> = line.splitn(Self::COLUMN_COUNT, '\t').collect();
        if fields.len() != Self::COLUMN_COUNT {
            return Err(crate::error::PdfError::InvalidStructure(format!(
                "token-efficient: row has {} columns, expected {}",
                fields.len(),
                Self::COLUMN_COUNT
            )));
        }
        let parse_usize = |name: &str, s: &str| -> Result<usize> {
            s.parse::<usize>().map_err(|e| {
                crate::error::PdfError::InvalidStructure(format!(
                    "token-efficient: invalid {name} {s:?}: {e}"
                ))
            })
        };
        let confidence = fields[7].parse::<f32>().map_err(|e| {
            crate::error::PdfError::InvalidStructure(format!(
                "token-efficient: invalid confidence {:?}: {e}",
                fields[7]
            ))
        })?;
        if !confidence.is_finite() {
            return Err(crate::error::PdfError::InvalidStructure(format!(
                "token-efficient: confidence must be finite, got {confidence:?}"
            )));
        }
        // Validate the boolean like every other column instead of silently
        // treating unknown text as `false`.
        let sentence_boundary_respected = match fields[8] {
            "true" => true,
            "false" => false,
            other => {
                return Err(crate::error::PdfError::InvalidStructure(format!(
                    "token-efficient: invalid sentence_boundary {other:?}, expected \"true\" or \"false\""
                )))
            }
        };
        Ok(DocumentChunk {
            id: fields[0].to_string(),
            tokens: parse_usize("tokens", fields[1])?,
            chunk_index: parse_usize("chunk_index", fields[2])?,
            page_numbers: Self::parse_page_numbers_field(fields[9])?,
            content: Self::parse_content_field(fields[10])?,
            metadata: ChunkMetadata {
                position: ChunkPosition {
                    start_char: parse_usize("start_char", fields[3])?,
                    end_char: parse_usize("end_char", fields[4])?,
                    first_page: parse_usize("first_page", fields[5])?,
                    last_page: parse_usize("last_page", fields[6])?,
                },
                confidence,
                sentence_boundary_respected,
                // The format has no language column, so this is never restored.
                language: None,
            },
        })
    }

    /// Splits `input` into logical rows on `\n`, except for newlines that
    /// occur inside a double-quoted region (which belong to the content).
    ///
    /// Every `"` toggles the quoted state, so a doubled quote (`""`) leaves
    /// it unchanged. The text after the final newline is always pushed as a
    /// last row, which may be empty.
    ///
    /// # Errors
    /// Returns `PdfError::InvalidStructure` if the input ends while still
    /// inside a quoted region.
    fn rejoin_quoted_lines(input: &str) -> Result<Vec<String>> {
        let mut rows = Vec::new();
        let mut current = String::new();
        let mut in_quote = false;
        for ch in input.chars() {
            match ch {
                '"' => {
                    in_quote = !in_quote;
                    current.push(ch);
                }
                '\n' if !in_quote => {
                    rows.push(std::mem::take(&mut current));
                }
                _ => current.push(ch),
            }
        }
        if in_quote {
            return Err(crate::error::PdfError::InvalidStructure(
                "token-efficient: unterminated quoted field".to_string(),
            ));
        }
        rows.push(current);
        Ok(rows)
    }

    /// Parses the `;`-separated page list; an empty field is an empty list.
    ///
    /// # Errors
    /// Returns `PdfError::InvalidStructure` if any entry is not a `usize`.
    fn parse_page_numbers_field(s: &str) -> Result<Vec<usize>> {
        if s.is_empty() {
            return Ok(Vec::new());
        }
        s.split(';')
            .map(|p| {
                p.parse::<usize>().map_err(|e| {
                    crate::error::PdfError::InvalidStructure(format!(
                        "invalid page number {p:?} in token-efficient chunk row: {e}"
                    ))
                })
            })
            .collect()
    }

    /// Encodes chunk content for the final column: content containing `"`,
    /// `\n` or `\r` is wrapped in double quotes with inner quotes doubled;
    /// anything else is emitted verbatim.
    fn quote_content(s: &str) -> String {
        let needs_quote = s.contains('"') || s.contains('\n') || s.contains('\r');
        if needs_quote {
            format!("\"{}\"", s.replace('"', "\"\""))
        } else {
            s.to_string()
        }
    }

    /// Decodes the content column (inverse of [`Self::quote_content`]).
    ///
    /// # Errors
    /// Returns `PdfError::InvalidStructure` if a quoted field contains an
    /// undoubled quote, or an unquoted field contains any quote at all.
    fn parse_content_field(s: &str) -> Result<String> {
        if s.len() >= 2 && s.starts_with('"') && s.ends_with('"') {
            let inner = &s[1..s.len() - 1];
            // After removing every doubled quote, any quote left over is unpaired.
            if inner.replace("\"\"", "").contains('"') {
                return Err(crate::error::PdfError::InvalidStructure(
                    "token-efficient: malformed quoted content field (unbalanced quotes)"
                        .to_string(),
                ));
            }
            Ok(inner.replace("\"\"", "\""))
        } else if s.contains('"') {
            Err(crate::error::PdfError::InvalidStructure(
                "token-efficient: unquoted content field contains a stray quote".to_string(),
            ))
        } else {
            Ok(s.to_string())
        }
    }

    /// Encodes one chunk as a tab-separated row in `HEADER` column order.
    fn encode_row(chunk: &DocumentChunk) -> String {
        let pages = chunk
            .page_numbers
            .iter()
            .map(|p| p.to_string())
            .collect::<Vec<_>>()
            .join(";");
        format!(
            "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.4}\t{}\t{}\t{}",
            chunk.id,
            chunk.tokens,
            chunk.chunk_index,
            chunk.metadata.position.start_char,
            chunk.metadata.position.end_char,
            chunk.metadata.position.first_page,
            chunk.metadata.position.last_page,
            chunk.metadata.confidence,
            chunk.metadata.sentence_boundary_respected,
            pages,
            Self::quote_content(&chunk.content),
        )
    }
}
/// Exposes the token-efficient format through the [`ChunkExporter`] trait.
impl ChunkExporter for TokenEfficientExporter {
/// Delegates to the inherent `TokenEfficientExporter::export_chunks`.
fn export_chunks(&self, chunks: &[DocumentChunk]) -> Result<String> {
// Not recursive: in a `Type::name` path, Rust resolves inherent associated
// functions before trait methods, so this calls the inherent method.
TokenEfficientExporter::export_chunks(self, chunks)
}
}
#[cfg(test)]
mod tests {
use super::*;
const TE_HEADER: &str = "id\ttokens\tchunk_index\tstart_char\tend_char\tfirst_page\tlast_page\tconfidence\tsentence_boundary\tpage_numbers\tcontent";
#[allow(clippy::too_many_arguments)]
fn te_chunk(
id: &str,
content: &str,
tokens: usize,
page_numbers: Vec<usize>,
chunk_index: usize,
start_char: usize,
end_char: usize,
first_page: usize,
last_page: usize,
confidence: f32,
sentence_boundary_respected: bool,
) -> crate::ai::chunking::DocumentChunk {
use crate::ai::chunking::{ChunkMetadata, ChunkPosition, DocumentChunk};
DocumentChunk {
id: id.to_string(),
content: content.to_string(),
tokens,
page_numbers,
chunk_index,
metadata: ChunkMetadata {
position: ChunkPosition {
start_char,
end_char,
first_page,
last_page,
},
confidence,
sentence_boundary_respected,
language: None,
},
}
}
#[test]
fn test_encode_scalar_fields_no_special_chars() {
let chunks = vec![te_chunk(
"chunk_0",
"Hello world",
10,
vec![1],
0,
0,
100,
1,
1,
1.0,
true,
)];
let out = TokenEfficientExporter::new()
.export_chunks(&chunks)
.unwrap();
let lines: Vec<&str> = out.lines().collect();
assert_eq!(
lines.len(),
3,
"expected magic + header + 1 row, got: {out:?}"
);
assert_eq!(
lines[0], "#oxct/1",
"first line must be the format version marker"
);
assert_eq!(lines[1], TE_HEADER, "second line must be the column header");
assert_eq!(
lines[2], "chunk_0\t10\t0\t0\t100\t1\t1\t1.0000\ttrue\t1\tHello world",
"data row must be tab-separated scalar fields followed by content"
);
}
#[test]
fn test_page_numbers_encoding() {
let out = TokenEfficientExporter::new()
.export_chunks(&[te_chunk(
"c",
"x",
1,
vec![2, 3, 4],
0,
0,
1,
2,
4,
0.5,
false,
)])
.unwrap();
let row = out.lines().nth(2).unwrap();
assert_eq!(row.split('\t').nth(9).unwrap(), "2;3;4");
let out = TokenEfficientExporter::new()
.export_chunks(&[te_chunk("c", "x", 1, vec![1], 0, 0, 1, 1, 1, 0.5, false)])
.unwrap();
let row = out.lines().nth(2).unwrap();
assert_eq!(row.split('\t').nth(9).unwrap(), "1");
let out = TokenEfficientExporter::new()
.export_chunks(&[te_chunk("c", "x", 1, vec![], 0, 0, 1, 0, 0, 0.5, false)])
.unwrap();
let row = out.lines().nth(2).unwrap();
assert_eq!(row.split('\t').nth(9).unwrap(), "");
}
#[test]
fn test_parse_page_numbers_field() {
assert_eq!(
TokenEfficientExporter::parse_page_numbers_field("2;3;4").unwrap(),
vec![2usize, 3, 4]
);
assert_eq!(
TokenEfficientExporter::parse_page_numbers_field("1").unwrap(),
vec![1usize]
);
assert_eq!(
TokenEfficientExporter::parse_page_numbers_field("").unwrap(),
Vec::<usize>::new()
);
assert!(TokenEfficientExporter::parse_page_numbers_field("1;x;3").is_err());
}
fn te_content_field(out: &str) -> String {
let row = out
.strip_prefix("#oxct/1\n")
.and_then(|s| s.strip_prefix(TE_HEADER))
.and_then(|s| s.strip_prefix('\n'))
.expect("well-formed export");
row.splitn(11, '\t').nth(10).unwrap().to_string()
}
fn te_export_one(content: &str) -> String {
TokenEfficientExporter::new()
.export_chunks(&[te_chunk(
"c",
content,
1,
vec![1],
0,
0,
1,
1,
1,
0.5,
false,
)])
.unwrap()
}
#[test]
fn test_content_quoting() {
assert_eq!(
te_content_field(&te_export_one("hello, world")),
"hello, world"
);
assert_eq!(
te_content_field(&te_export_one("say \"hi\"")),
"\"say \"\"hi\"\"\""
);
assert_eq!(te_content_field(&te_export_one("\"hi\"")), "\"\"\"hi\"\"\"");
assert_eq!(
te_content_field(&te_export_one("say \"hello")),
"\"say \"\"hello\""
);
assert_eq!(
te_content_field(&te_export_one("line1\nline2")),
"\"line1\nline2\""
);
assert_eq!(te_content_field(&te_export_one("こんにちは")), "こんにちは");
assert_eq!(te_content_field(&te_export_one("")), "");
}
#[test]
fn test_parse_content_field() {
assert_eq!(
TokenEfficientExporter::parse_content_field("hello, world").unwrap(),
"hello, world"
);
assert_eq!(
TokenEfficientExporter::parse_content_field("\"\"\"hi\"\"\"").unwrap(),
"\"hi\""
);
assert_eq!(TokenEfficientExporter::parse_content_field("").unwrap(), "");
assert_eq!(
TokenEfficientExporter::parse_content_field("\"line1\nline2\"").unwrap(),
"line1\nline2"
);
assert!(TokenEfficientExporter::parse_content_field("say \"hi\"").is_err());
assert!(TokenEfficientExporter::parse_content_field("\"ab\"cd\"").is_err());
}
#[test]
fn test_multi_chunk_document() {
let chunks = vec![
te_chunk(
"chunk_0",
"First chunk",
10,
vec![1],
0,
0,
100,
1,
1,
1.0,
true,
),
te_chunk(
"chunk_1",
"Second, chunk",
12,
vec![1, 2],
1,
90,
200,
1,
2,
0.95,
false,
),
];
let out = TokenEfficientExporter::new()
.export_chunks(&chunks)
.unwrap();
let lines: Vec<&str> = out.lines().collect();
assert_eq!(lines.len(), 4, "magic + header + 2 rows");
assert_eq!(lines[0], "#oxct/1");
assert_eq!(lines[1], TE_HEADER);
assert!(lines[2].ends_with("\tFirst chunk"));
assert!(lines[3].ends_with("\tSecond, chunk"));
assert_eq!(lines[3].split('\t').nth(9).unwrap(), "1;2");
}
fn assert_chunk_eq(
a: &crate::ai::chunking::DocumentChunk,
b: &crate::ai::chunking::DocumentChunk,
) {
assert_eq!(a.id, b.id, "id");
assert_eq!(a.content, b.content, "content");
assert_eq!(a.tokens, b.tokens, "tokens");
assert_eq!(a.page_numbers, b.page_numbers, "page_numbers");
assert_eq!(a.chunk_index, b.chunk_index, "chunk_index");
assert_eq!(
a.metadata.position.start_char, b.metadata.position.start_char,
"start_char"
);
assert_eq!(
a.metadata.position.end_char, b.metadata.position.end_char,
"end_char"
);
assert_eq!(
a.metadata.position.first_page, b.metadata.position.first_page,
"first_page"
);
assert_eq!(
a.metadata.position.last_page, b.metadata.position.last_page,
"last_page"
);
assert!(
(a.metadata.confidence - b.metadata.confidence).abs() < 1e-4,
"confidence: {} vs {}",
a.metadata.confidence,
b.metadata.confidence
);
assert_eq!(
a.metadata.sentence_boundary_respected, b.metadata.sentence_boundary_respected,
"sentence_boundary"
);
}
fn roundtrip(chunk: crate::ai::chunking::DocumentChunk) {
let exporter = TokenEfficientExporter::new();
let serialized = exporter
.export_chunks(std::slice::from_ref(&chunk))
.unwrap();
let parsed = TokenEfficientExporter::parse_chunks(&serialized).unwrap();
assert_eq!(
parsed.len(),
1,
"single chunk should round-trip to one chunk"
);
assert_chunk_eq(&parsed[0], &chunk);
}
#[test]
fn test_rejoin_quoted_lines_with_embedded_newline() {
let raw = "#oxct/1\nHDR\nrow_a\t\"line1\nline2\"\nrow_b";
let logical = TokenEfficientExporter::rejoin_quoted_lines(raw).unwrap();
assert_eq!(logical.len(), 4); assert_eq!(logical[0], "#oxct/1");
assert_eq!(logical[1], "HDR");
assert_eq!(logical[2], "row_a\t\"line1\nline2\"");
assert_eq!(logical[3], "row_b");
}
#[test]
fn test_roundtrip_single_chunk() {
roundtrip(te_chunk(
"chunk_0",
"Plain content",
7,
vec![3],
0,
10,
210,
3,
3,
0.95,
true,
));
}
#[test]
fn test_roundtrip_zero_chunks() {
let exporter = TokenEfficientExporter::new();
let serialized = exporter.export_chunks(&[]).unwrap();
assert_eq!(serialized, format!("#oxct/1\n{TE_HEADER}"));
let parsed = TokenEfficientExporter::parse_chunks(&serialized).unwrap();
assert!(parsed.is_empty());
}
#[test]
fn test_roundtrip_content_with_comma() {
roundtrip(te_chunk(
"c",
"price: $1,200",
4,
vec![1],
0,
0,
13,
1,
1,
0.5,
false,
));
}
#[test]
fn test_roundtrip_content_with_embedded_newline() {
roundtrip(te_chunk(
"c",
"line1\nline2\nline3",
5,
vec![1, 2],
0,
0,
17,
1,
2,
0.5,
false,
));
}
#[test]
fn test_roundtrip_content_with_embedded_quote() {
roundtrip(te_chunk(
"c",
"He said \"hello\"",
5,
vec![1],
0,
0,
15,
1,
1,
0.5,
false,
));
}
#[test]
fn test_roundtrip_content_starting_with_quote() {
roundtrip(te_chunk(
"c",
"\"quoted start",
3,
vec![1],
0,
0,
13,
1,
1,
0.5,
false,
));
}
#[test]
fn test_roundtrip_unicode() {
roundtrip(te_chunk(
"c",
"Ñoño: αβγ 中文 🦀",
6,
vec![1],
0,
0,
20,
1,
1,
0.5,
false,
));
}
#[test]
fn test_roundtrip_multi_page() {
roundtrip(te_chunk(
"c",
"x",
1,
vec![3, 7, 12],
0,
0,
1,
3,
12,
0.5,
false,
));
}
#[test]
fn test_roundtrip_multiple_chunks_preserves_order() {
let chunks = vec![
te_chunk("a", "first", 1, vec![1], 0, 0, 5, 1, 1, 1.0, true),
te_chunk(
"b",
"with, comma",
2,
vec![1, 2],
1,
5,
16,
1,
2,
0.8,
false,
),
te_chunk("c", "line\nbreak", 3, vec![2], 2, 16, 26, 2, 2, 0.6, true),
];
let serialized = TokenEfficientExporter::new()
.export_chunks(&chunks)
.unwrap();
let parsed = TokenEfficientExporter::parse_chunks(&serialized).unwrap();
assert_eq!(parsed.len(), 3);
for (orig, got) in chunks.iter().zip(parsed.iter()) {
assert_chunk_eq(got, orig);
}
}
#[test]
fn test_parse_rejects_wrong_column_count() {
let bad =
format!("#oxct/1\n{TE_HEADER}\nonly\tnine\tfields\there\tare\tsome\tnot\televen\tk");
assert!(TokenEfficientExporter::parse_chunks(&bad).is_err());
}
#[test]
fn test_parse_rejects_bad_header() {
let bad = "#oxct/1\nnot_the_header\nrow";
assert!(TokenEfficientExporter::parse_chunks(bad).is_err());
}
#[test]
fn test_parse_rejects_bad_magic() {
let bad = format!("#oxct/2\n{TE_HEADER}");
assert!(TokenEfficientExporter::parse_chunks(&bad).is_err());
}
#[test]
fn test_roundtrip_content_with_odd_interior_quote() {
let chunks = vec![
te_chunk("a", "say \"hello", 2, vec![1], 0, 0, 10, 1, 1, 0.5, false),
te_chunk("b", "second chunk", 1, vec![1], 1, 10, 22, 1, 1, 0.5, false),
];
let serialized = TokenEfficientExporter::new()
.export_chunks(&chunks)
.unwrap();
let parsed = TokenEfficientExporter::parse_chunks(&serialized).unwrap();
assert_eq!(parsed.len(), 2, "the second row must not be swallowed");
assert_chunk_eq(&parsed[0], &chunks[0]);
assert_chunk_eq(&parsed[1], &chunks[1]);
}
#[test]
fn test_parse_rejects_unterminated_quote() {
let bad =
format!("#oxct/1\n{TE_HEADER}\nc\t1\t0\t0\t1\t1\t1\t0.5000\ttrue\t1\t\"unterminated");
assert!(TokenEfficientExporter::parse_chunks(&bad).is_err());
}
#[test]
fn test_parse_rejects_non_finite_confidence() {
let bad = format!("#oxct/1\n{TE_HEADER}\nc\t1\t0\t0\t1\t1\t1\tNaN\ttrue\t1\tx");
assert!(TokenEfficientExporter::parse_chunks(&bad).is_err());
let bad_inf = format!("#oxct/1\n{TE_HEADER}\nc\t1\t0\t0\t1\t1\t1\tinf\ttrue\t1\tx");
assert!(TokenEfficientExporter::parse_chunks(&bad_inf).is_err());
}
#[test]
fn test_chunk_exporter_trait_object() {
let chunks = vec![te_chunk("a", "x", 1, vec![1], 0, 0, 1, 1, 1, 1.0, true)];
let exporters: Vec<Box<dyn ChunkExporter>> = vec![Box::new(TokenEfficientExporter::new())];
for e in &exporters {
let out = e.export_chunks(&chunks).unwrap();
assert!(out.starts_with("#oxct/1\n"));
}
}
#[test]
fn test_chunk_exporter_trait_matches_inherent() {
let chunks = vec![te_chunk("a", "x", 1, vec![1], 0, 0, 1, 1, 1, 1.0, true)];
let exporter = TokenEfficientExporter::new();
let via_trait = ChunkExporter::export_chunks(&exporter, &chunks).unwrap();
let via_inherent = exporter.export_chunks(&chunks).unwrap();
assert_eq!(via_trait, via_inherent);
}
#[cfg(feature = "semantic")]
#[test]
fn test_json_exporter_implements_chunk_exporter() {
let chunks = vec![te_chunk("a", "x", 1, vec![1], 0, 0, 1, 1, 1, 1.0, true)];
let json_exporter = JsonExporter::default();
let via_trait = ChunkExporter::export_chunks(&json_exporter, &chunks).unwrap();
let via_inherent = JsonExporter::export_with_chunks(&chunks).unwrap();
assert_eq!(via_trait, via_inherent);
}
#[cfg(feature = "token-bench")]
#[test]
fn test_token_efficient_uses_fewer_tokens_than_json() {
let paragraphs = [
"The quarterly report shows revenue of $1,200,000, up 12% year over year.",
"Section 3.2 describes the authentication flow, including token refresh and rotation.",
"Climate models project a temperature increase between 1.5 and 4.0 degrees Celsius.",
"The defendant, having waived counsel, proceeded to represent themselves at trial.",
"Mitochondria are the membrane-bound organelles responsible for cellular respiration.",
];
let mut chunks = Vec::new();
for i in 0..50usize {
let body = paragraphs[i % paragraphs.len()];
let content = if i % 3 == 0 {
format!("{body} {body}")
} else {
body.to_string()
};
let first_page = 1 + i / 3;
let last_page = first_page + (i % 2);
let page_numbers: Vec<usize> = (first_page..=last_page).collect();
chunks.push(te_chunk(
&format!("chunk_{i}"),
&content,
crate::ai::chunking::DocumentChunker::estimate_tokens(&content),
page_numbers,
i,
i * 100,
i * 100 + content.len(),
first_page,
last_page,
0.5 + (i % 5) as f32 / 10.0,
i % 2 == 0,
));
}
let te_out = TokenEfficientExporter::new()
.export_chunks(&chunks)
.unwrap();
let json_out = JsonExporter::export_with_chunks(&chunks).unwrap();
let bpe = tiktoken_rs::cl100k_base().expect("cl100k_base tokenizer");
let te_tokens = bpe.encode_ordinary(&te_out).len();
let json_tokens = bpe.encode_ordinary(&json_out).len();
let reduction = 100.0 * (1.0 - te_tokens as f64 / json_tokens as f64);
eprintln!(
"token-efficient: {te_tokens} tokens | json: {json_tokens} tokens | reduction: {reduction:.1}%"
);
assert!(
te_tokens < json_tokens,
"token-efficient ({te_tokens}) must use fewer tokens than JSON ({json_tokens})"
);
}
#[test]
fn test_basic_text_to_markdown() {
let text = "hello world";
let result = MarkdownExporter::export_text(text).unwrap();
assert!(result.contains("# Document"), "Should have document header");
assert!(
result.contains("hello world"),
"Should contain original text"
);
let lines: Vec<&str> = result.lines().collect();
assert_eq!(lines[0], "# Document");
assert_eq!(lines[1], "");
assert_eq!(lines[2], "hello world");
}
#[test]
fn test_empty_text() {
let result = MarkdownExporter::export_text("").unwrap();
assert!(
result.contains("# Document"),
"Should still have header for empty text"
);
assert_eq!(result, "# Document\n\n");
}
#[test]
fn test_multiline_text() {
let text = "First line\nSecond line\nThird line";
let result = MarkdownExporter::export_text(text).unwrap();
assert!(result.contains("First line"));
assert!(result.contains("Second line"));
assert!(result.contains("Third line"));
}
#[test]
fn test_text_with_special_characters() {
let text = "Text with # hash and * asterisk";
let result = MarkdownExporter::export_text(text).unwrap();
assert!(result.contains("# hash"));
assert!(result.contains("* asterisk"));
}
#[test]
fn test_markdown_exporter_creation() {
let exporter = MarkdownExporter::new(MarkdownOptions {
include_metadata: true,
include_page_numbers: false,
});
assert!(exporter.options.include_metadata);
assert!(!exporter.options.include_page_numbers);
}
#[test]
fn test_markdown_exporter_default() {
let exporter = MarkdownExporter::default();
assert!(exporter.options.include_metadata);
assert!(exporter.options.include_page_numbers);
}
#[test]
fn test_markdown_with_metadata() {
let metadata = DocumentMetadata {
title: "Test Document".to_string(),
page_count: 10,
created_at: Some("2025-10-13".to_string()),
author: Some("John Doe".to_string()),
};
let result = MarkdownExporter::export_with_metadata("Sample content", &metadata).unwrap();
assert!(result.starts_with("---\n"), "Should start with YAML marker");
assert!(result.contains("title: Test Document"));
assert!(result.contains("pages: 10"));
assert!(result.contains("created: 2025-10-13"));
assert!(result.contains("author: John Doe"));
assert!(result.contains("# Test Document"));
assert!(result.contains("Sample content"));
}
#[test]
fn test_metadata_with_special_characters() {
let metadata = DocumentMetadata {
title: "Test: Document #1".to_string(),
page_count: 5,
created_at: None,
author: None,
};
let result = MarkdownExporter::export_with_metadata("Content", &metadata).unwrap();
assert!(result.contains("title: \"Test: Document #1\""));
}
#[test]
fn test_metadata_minimal() {
let metadata = DocumentMetadata {
title: "Simple".to_string(),
page_count: 1,
created_at: None,
author: None,
};
let result = MarkdownExporter::export_with_metadata("Text", &metadata).unwrap();
assert!(result.contains("title: Simple"));
assert!(result.contains("pages: 1"));
assert!(!result.contains("created:"));
assert!(!result.contains("author:"));
}
#[test]
fn test_document_metadata_default() {
let metadata = DocumentMetadata::default();
assert_eq!(metadata.title, "Untitled Document");
assert_eq!(metadata.page_count, 0);
assert!(metadata.created_at.is_none());
assert!(metadata.author.is_none());
}
#[test]
fn test_multipage_markdown() {
let pages = vec![
(1, "Content of page 1".to_string()),
(2, "Content of page 2".to_string()),
(3, "Content of page 3".to_string()),
];
let result = MarkdownExporter::export_with_pages(&pages).unwrap();
assert!(result.starts_with("# Document\n\n"));
assert!(result.contains("**Page 1**"));
assert!(result.contains("**Page 2**"));
assert!(result.contains("**Page 3**"));
assert!(result.contains("Content of page 1"));
assert!(result.contains("Content of page 2"));
assert!(result.contains("Content of page 3"));
let separator_count = result.matches("\n---\n").count();
assert_eq!(separator_count, 2, "Should have 2 separators for 3 pages");
}
#[test]
fn test_page_numbers_correct() {
let pages = vec![(1, "First".to_string()), (2, "Second".to_string())];
let result = MarkdownExporter::export_with_pages(&pages).unwrap();
let page1_pos = result.find("**Page 1**").unwrap();
let page2_pos = result.find("**Page 2**").unwrap();
assert!(page1_pos < page2_pos, "Page 1 should appear before Page 2");
}
#[test]
fn test_single_page_no_separator() {
let pages = vec![(1, "Single page content".to_string())];
let result = MarkdownExporter::export_with_pages(&pages).unwrap();
assert!(
!result.contains("---"),
"Single page should not have separator"
);
assert!(result.contains("**Page 1**"));
assert!(result.contains("Single page content"));
}
#[test]
fn test_empty_pages_list() {
let pages: Vec<(usize, String)> = vec![];
let result = MarkdownExporter::export_with_pages(&pages).unwrap();
assert_eq!(result, "# Document\n\n");
}
#[test]
fn test_metadata_and_pages_combined() {
let metadata = DocumentMetadata {
title: "Test Document".to_string(),
page_count: 2,
created_at: Some("2025-10-13".to_string()),
author: Some("John Doe".to_string()),
};
let pages = vec![
(1, "Page one text".to_string()),
(2, "Page two text".to_string()),
];
let result = MarkdownExporter::export_with_metadata_and_pages(&pages, &metadata).unwrap();
assert!(result.starts_with("---\n"));
assert!(result.contains("title: Test Document"));
assert!(result.contains("pages: 2"));
assert!(result.contains("created: 2025-10-13"));
assert!(result.contains("author: John Doe"));
assert!(result.contains("# Test Document"));
assert!(result.contains("**Page 1**"));
assert!(result.contains("**Page 2**"));
assert!(result.contains("Page one text"));
assert!(result.contains("Page two text"));
}
/// A two-page export must contain a `---` rule with a blank line on each side.
#[test]
fn test_page_separator_format() {
    let input = vec![(1, String::from("A")), (2, String::from("B"))];
    let markdown = MarkdownExporter::export_with_pages(&input).unwrap();

    let separator = "\n\n---\n\n";
    assert!(markdown.contains(separator));
}
/// `export_simple` output must parse as JSON whose `type` is `"document"` and
/// whose `content` equals the input text.
#[cfg(feature = "semantic")]
#[test]
fn test_basic_json_export() {
    let input = "Hello, world!";
    let raw = JsonExporter::export_simple(input).unwrap();
    let doc = serde_json::from_str::<serde_json::Value>(&raw).unwrap();

    assert_eq!(doc["type"], "document");
    assert_eq!(doc["content"], "Hello, world!");
}
/// The simple JSON export must round-trip through `serde_json` as an object
/// with the expected `type` and `content` fields.
#[cfg(feature = "semantic")]
#[test]
fn test_json_parsing() {
    let raw = JsonExporter::export_simple("Sample content").unwrap();
    let doc = serde_json::from_str::<serde_json::Value>(&raw).unwrap();

    assert!(doc.is_object());
    assert_eq!(doc["type"], "document");
    assert_eq!(doc["content"], "Sample content");
}
/// Every `DocumentMetadata` field must appear under the `metadata` key of the
/// JSON export, next to the top-level `content`.
#[cfg(feature = "semantic")]
#[test]
fn test_json_with_metadata() {
    let metadata = DocumentMetadata {
        title: String::from("Test Doc"),
        page_count: 10,
        created_at: Some(String::from("2025-10-13")),
        author: Some(String::from("Jane Doe")),
    };

    let raw = JsonExporter::export_with_metadata("Content", &metadata).unwrap();
    let doc = serde_json::from_str::<serde_json::Value>(&raw).unwrap();

    let meta = &doc["metadata"];
    assert_eq!(meta["title"], "Test Doc");
    assert_eq!(meta["page_count"], 10);
    assert_eq!(meta["created_at"], "2025-10-13");
    assert_eq!(meta["author"], "Jane Doe");
    assert_eq!(doc["content"], "Content");
}
/// A three-page export must report `page_count` 3 and list each page's number
/// and content, in input order, under `pages`.
#[cfg(feature = "semantic")]
#[test]
fn test_json_pages_export() {
    let input = vec![
        (1, String::from("Page 1 text")),
        (2, String::from("Page 2 text")),
        (3, String::from("Page 3 text")),
    ];

    let raw = JsonExporter::export_pages(&input).unwrap();
    let doc = serde_json::from_str::<serde_json::Value>(&raw).unwrap();

    assert_eq!(doc["type"], "document");
    assert_eq!(doc["page_count"], 3);

    let entries = doc["pages"].as_array().unwrap();
    assert_eq!(entries.len(), 3);

    // Length was checked above, so the indexing below cannot go out of bounds.
    let (first, second, third) = (&entries[0], &entries[1], &entries[2]);
    assert_eq!(first["page_number"], 1);
    assert_eq!(first["content"], "Page 1 text");
    assert_eq!(second["page_number"], 2);
    assert_eq!(second["content"], "Page 2 text");
    assert_eq!(third["page_number"], 3);
    assert_eq!(third["content"], "Page 3 text");
}
/// With `pretty_print` disabled the exported JSON must contain no newline.
#[cfg(feature = "semantic")]
#[test]
fn test_json_exporter_options() {
    let compact = JsonOptions {
        pretty_print: false,
        include_chunks: false,
    };
    let exporter = JsonExporter::new(compact);

    let output = exporter.export("test").unwrap();
    let has_newline = output.contains('\n');
    assert!(!has_newline);
}
/// With `pretty_print` enabled the exported JSON must contain at least one
/// newline and at least one space.
#[cfg(feature = "semantic")]
#[test]
fn test_json_pretty_print() {
    let pretty = JsonOptions {
        pretty_print: true,
        include_chunks: false,
    };
    let exporter = JsonExporter::new(pretty);

    let output = exporter.export("test").unwrap();
    assert!(output.contains('\n'));
    assert!(output.contains(" "));
}
/// Exporting zero pages must give `page_count` 0 and an empty `pages` array.
#[cfg(feature = "semantic")]
#[test]
fn test_json_empty_pages() {
    let no_pages: Vec<(usize, String)> = Vec::new();
    let raw = JsonExporter::export_pages(&no_pages).unwrap();
    let doc = serde_json::from_str::<serde_json::Value>(&raw).unwrap();

    assert_eq!(doc["page_count"], 0);
    let entries = doc["pages"].as_array().unwrap();
    assert_eq!(entries.len(), 0);
}
/// Exports two chunks and checks the top-level envelope (`type`,
/// `chunk_count`, `chunks`), every serialized field of the first chunk, and
/// the distinguishing fields of the second chunk.
#[cfg(feature = "semantic")]
#[test]
fn test_export_with_chunks_basic() {
    use crate::ai::chunking::{ChunkMetadata, ChunkPosition};

    let first_chunk = DocumentChunk {
        id: String::from("chunk_0"),
        content: String::from("First chunk content"),
        tokens: 10,
        page_numbers: vec![1],
        chunk_index: 0,
        metadata: ChunkMetadata {
            position: ChunkPosition {
                start_char: 0,
                end_char: 100,
                first_page: 1,
                last_page: 1,
            },
            confidence: 1.0,
            sentence_boundary_respected: true,
            language: None,
        },
    };
    let second_chunk = DocumentChunk {
        id: String::from("chunk_1"),
        content: String::from("Second chunk content"),
        tokens: 12,
        page_numbers: vec![1, 2],
        chunk_index: 1,
        metadata: ChunkMetadata {
            position: ChunkPosition {
                start_char: 90,
                end_char: 200,
                first_page: 1,
                last_page: 2,
            },
            confidence: 0.95,
            sentence_boundary_respected: false,
            language: None,
        },
    };
    let chunks = vec![first_chunk, second_chunk];

    let raw = JsonExporter::export_with_chunks(&chunks).unwrap();
    let doc = serde_json::from_str::<serde_json::Value>(&raw).unwrap();

    // Envelope.
    assert_eq!(doc["type"], "chunked_document");
    assert_eq!(doc["chunk_count"], 2);
    let exported = doc["chunks"].as_array().unwrap();
    assert_eq!(exported.len(), 2);

    // First chunk: every field.
    let first = &exported[0];
    assert_eq!(first["id"], "chunk_0");
    assert_eq!(first["tokens"], 10);
    assert_eq!(first["content"], "First chunk content");
    assert_eq!(first["page_numbers"][0], 1);
    assert_eq!(first["chunk_index"], 0);

    let first_meta = &first["metadata"];
    assert_eq!(first_meta["confidence"], 1.0);
    assert_eq!(first_meta["sentence_boundary_respected"], true);

    let first_position = &first_meta["position"];
    assert_eq!(first_position["start_char"], 0);
    assert_eq!(first_position["end_char"], 100);
    assert_eq!(first_position["first_page"], 1);
    assert_eq!(first_position["last_page"], 1);

    // Second chunk: identifying fields and metadata flags.
    let second = &exported[1];
    assert_eq!(second["id"], "chunk_1");
    assert_eq!(second["chunk_index"], 1);
    assert_eq!(second["tokens"], 12);
    assert_eq!(second["page_numbers"].as_array().unwrap().len(), 2);

    let second_meta = &second["metadata"];
    // Compared with a tolerance rather than exact equality.
    let confidence = second_meta["confidence"].as_f64().unwrap();
    let deviation = (confidence - 0.95).abs();
    assert!(
        deviation < 0.01,
        "Confidence should be approximately 0.95, got {}",
        confidence
    );
    assert_eq!(second_meta["sentence_boundary_respected"], false);
}
/// Exporting zero chunks must still produce a `chunked_document` envelope with
/// `chunk_count` 0 and an empty `chunks` array.
#[cfg(feature = "semantic")]
#[test]
fn test_export_with_chunks_empty() {
    let no_chunks: Vec<DocumentChunk> = Vec::new();
    let raw = JsonExporter::export_with_chunks(&no_chunks).unwrap();
    let doc = serde_json::from_str::<serde_json::Value>(&raw).unwrap();

    assert_eq!(doc["type"], "chunked_document");
    assert_eq!(doc["chunk_count"], 0);
    let exported = doc["chunks"].as_array().unwrap();
    assert_eq!(exported.len(), 0);
}
/// Exports a single chunk spanning pages 5-7 and checks that its identity
/// fields, page numbers, position offsets, confidence and sentence-boundary
/// flag all appear in the JSON.
#[cfg(feature = "semantic")]
#[test]
fn test_export_with_chunks_position_metadata() {
    use crate::ai::chunking::{ChunkMetadata, ChunkPosition};

    let position = ChunkPosition {
        start_char: 1000,
        end_char: 2000,
        first_page: 5,
        last_page: 7,
    };
    let chunk = DocumentChunk {
        id: String::from("test_chunk"),
        content: String::from("Test content for position tracking"),
        tokens: 5,
        page_numbers: vec![5, 6, 7],
        chunk_index: 10,
        metadata: ChunkMetadata {
            position,
            confidence: 0.85,
            sentence_boundary_respected: false,
            language: None,
        },
    };

    let raw = JsonExporter::export_with_chunks(&[chunk]).unwrap();
    let doc = serde_json::from_str::<serde_json::Value>(&raw).unwrap();
    assert_eq!(doc["chunk_count"], 1);

    let exported = &doc["chunks"][0];
    assert_eq!(exported["id"], "test_chunk");
    assert_eq!(exported["tokens"], 5);
    assert_eq!(exported["chunk_index"], 10);
    assert_eq!(exported["content"], "Test content for position tracking");

    let page_numbers = exported["page_numbers"].as_array().unwrap();
    assert_eq!(page_numbers.len(), 3);
    assert_eq!(page_numbers[0], 5);
    assert_eq!(page_numbers[1], 6);
    assert_eq!(page_numbers[2], 7);

    let meta = &exported["metadata"];
    let exported_position = &meta["position"];
    assert_eq!(exported_position["start_char"], 1000);
    assert_eq!(exported_position["end_char"], 2000);
    assert_eq!(exported_position["first_page"], 5);
    assert_eq!(exported_position["last_page"], 7);

    // Compared with a tolerance rather than exact equality.
    let confidence = meta["confidence"].as_f64().unwrap();
    let deviation = (confidence - 0.85).abs();
    assert!(
        deviation < 0.01,
        "Confidence should be approximately 0.85, got {}",
        confidence
    );
    assert_eq!(meta["sentence_boundary_respected"], false);
}
/// A chunk spanning pages 2-4 must export three page numbers and the matching
/// `first_page` / `last_page` in its position metadata.
#[cfg(feature = "semantic")]
#[test]
fn test_export_with_chunks_multiple_pages() {
    use crate::ai::chunking::{ChunkMetadata, ChunkPosition};

    let position = ChunkPosition {
        start_char: 500,
        end_char: 1500,
        first_page: 2,
        last_page: 4,
    };
    let chunk = DocumentChunk {
        id: String::from("multipage"),
        content: String::from("Content spanning pages"),
        tokens: 20,
        page_numbers: vec![2, 3, 4],
        chunk_index: 0,
        metadata: ChunkMetadata {
            position,
            confidence: 1.0,
            sentence_boundary_respected: true,
            language: None,
        },
    };

    let raw = JsonExporter::export_with_chunks(&[chunk]).unwrap();
    let doc = serde_json::from_str::<serde_json::Value>(&raw).unwrap();

    let exported = &doc["chunks"][0];
    let page_numbers = exported["page_numbers"].as_array().unwrap();
    assert_eq!(page_numbers.len(), 3);

    let exported_position = &exported["metadata"]["position"];
    assert_eq!(exported_position["first_page"], 2);
    assert_eq!(exported_position["last_page"], 4);
}
/// The simple contextual export must be the `Document content:` prefix, a
/// blank line, and then the input text.
#[test]
fn test_contextual_simple() {
    let input = "This is sample content.";
    let output = ContextualFormat::export_simple(input).unwrap();
    let has = |needle: &str| output.contains(needle);

    assert!(has("Document content:"));
    assert!(has("This is sample content."));
    assert_eq!(output, "Document content:\n\nThis is sample content.");
}
/// With all metadata present, the contextual export must mention title, page
/// count (plural form), author and creation date, followed by the content.
#[test]
fn test_contextual_with_metadata_full() {
    let metadata = DocumentMetadata {
        title: String::from("Annual Report"),
        page_count: 25,
        created_at: Some(String::from("2025-01-15")),
        author: Some(String::from("Jane Smith")),
    };

    let output = ContextualFormat::export_with_metadata("Report content here.", &metadata).unwrap();
    let has = |needle: &str| output.contains(needle);

    assert!(has("This is a document titled \"Annual Report\""));
    assert!(has("with 25 pages"));
    assert!(has("written by Jane Smith"));
    assert!(has("created on 2025-01-15"));
    assert!(has("Content:"));
    assert!(has("Report content here."));

    // Plural form only: "pages," must appear, the singular "page," must not.
    assert!(!has("page,"));
    assert!(has("pages,"));
}
/// With one page and no author or date, the contextual export must use the
/// singular "page" and omit the author and creation-date phrases.
#[test]
fn test_contextual_with_metadata_minimal() {
    let metadata = DocumentMetadata {
        title: String::from("Simple Doc"),
        page_count: 1,
        created_at: None,
        author: None,
    };

    let output = ContextualFormat::export_with_metadata("Content", &metadata).unwrap();
    let has = |needle: &str| output.contains(needle);

    assert!(has("titled \"Simple Doc\""));
    assert!(has("with 1 page"));
    assert!(!has("pages"));
    assert!(!has("written by"));
    assert!(!has("created on"));
}
/// With a page count of zero, the contextual export must not contain "with"
/// or "page", and the title sentence must end directly after the title.
#[test]
fn test_contextual_with_metadata_no_page_count() {
    let metadata = DocumentMetadata {
        title: String::from("Test"),
        page_count: 0,
        created_at: None,
        author: None,
    };

    let output = ContextualFormat::export_with_metadata("Text", &metadata).unwrap();
    let has = |needle: &str| output.contains(needle);

    assert!(!has("with"));
    assert!(!has("page"));
    assert!(has("This is a document titled \"Test\"."));
}
/// A three-page contextual export must start with the `Document content:`
/// header and introduce each page's text with an `On page N:` line.
#[test]
fn test_contextual_with_pages() {
    let input = vec![
        (1, String::from("First page text")),
        (2, String::from("Second page text")),
        (3, String::from("Third page text")),
    ];

    let output = ContextualFormat::export_with_pages(&input).unwrap();
    let has = |needle: &str| output.contains(needle);

    assert!(output.starts_with("Document content:\n\n"));
    assert!(has("On page 1:\nFirst page text"));
    assert!(has("On page 2:\nSecond page text"));
    assert!(has("On page 3:\nThird page text"));
}
/// An empty page list must export as exactly `Document content:\n\n`.
#[test]
fn test_contextual_with_pages_empty() {
    let no_pages: Vec<(usize, String)> = Vec::new();
    let output = ContextualFormat::export_with_pages(&no_pages).unwrap();
    assert_eq!(output, "Document content:\n\n");
}
/// A single page must be introduced by `On page 1:` followed by its text.
#[test]
fn test_contextual_with_pages_single() {
    let input = vec![(1, String::from("Only page"))];
    let output = ContextualFormat::export_with_pages(&input).unwrap();

    let expected_section = "On page 1:\nOnly page";
    assert!(output.contains(expected_section));
}
/// The combined contextual export must contain the metadata phrases, the
/// `Content:` label, and an `On page N:` section for each page.
#[test]
fn test_contextual_with_metadata_and_pages() {
    let pages = vec![
        (1, String::from("Introduction text")),
        (2, String::from("Main content")),
    ];
    let metadata = DocumentMetadata {
        title: String::from("Technical Guide"),
        page_count: 2,
        created_at: Some(String::from("2025-10-13")),
        author: Some(String::from("John Doe")),
    };

    let output = ContextualFormat::export_with_metadata_and_pages(&pages, &metadata).unwrap();
    let has = |needle: &str| output.contains(needle);

    // Metadata sentence.
    assert!(has("titled \"Technical Guide\""));
    assert!(has("with 2 pages"));
    assert!(has("written by John Doe"));
    assert!(has("created on 2025-10-13"));

    // Body.
    assert!(has("Content:"));
    assert!(has("On page 1:\nIntroduction text"));
    assert!(has("On page 2:\nMain content"));
}
/// With full metadata, the export must open with one sentence that lists
/// title, page count, author and creation date in that order.
#[test]
fn test_contextual_natural_language_flow() {
    let metadata = DocumentMetadata {
        title: String::from("Report"),
        page_count: 5,
        created_at: Some(String::from("2025-01-01")),
        author: Some(String::from("Alice")),
    };

    let output = ContextualFormat::export_with_metadata("Text", &metadata).unwrap();

    let expected_opening =
        "This is a document titled \"Report\" with 5 pages, written by Alice, created on 2025-01-01.";
    assert!(output.starts_with(expected_opening));
}
/// Empty input text must still produce the `Document content:` header
/// followed by a blank line.
#[test]
fn test_contextual_empty_text() {
    let output = ContextualFormat::export_simple("").unwrap();
    let expected = "Document content:\n\n";
    assert_eq!(output, expected);
}
}