pub mod chunking;
pub mod formats;
pub use chunking::{ChunkMetadata, ChunkPosition, DocumentChunk, DocumentChunker};
pub use formats::{ContextualFormat, DocumentMetadata, MarkdownExporter, MarkdownOptions};
#[cfg(feature = "semantic")]
pub use formats::{JsonExporter, JsonOptions};
use crate::error::Result;
use std::io::{Read, Seek};
/// Exports the full text of `document` as Markdown with a metadata header.
///
/// Every page is extracted and numbered starting at 1; the document title
/// falls back to `"Untitled Document"` when the PDF declares none.
///
/// # Errors
///
/// Returns an error if text extraction or metadata parsing fails.
#[deprecated(since = "2.1.0", note = "Use PdfDocument::to_markdown() instead")]
pub fn export_to_markdown<R: Read + Seek>(
    document: &crate::parser::PdfDocument<R>,
) -> Result<String> {
    let extracted = document.extract_text()?;
    // Pages are 1-indexed for human-readable output. `extracted` is not used
    // again, so move each page's text instead of cloning it.
    let pages: Vec<(usize, String)> = extracted
        .into_iter()
        .enumerate()
        .map(|(i, page_text)| (i + 1, page_text.text))
        .collect();
    let parsed_metadata = document.metadata()?;
    let ai_metadata = DocumentMetadata {
        title: parsed_metadata
            .title
            .unwrap_or_else(|| "Untitled Document".to_string()),
        page_count: pages.len(),
        // `parsed_metadata` is consumed here; moving the field avoids a
        // redundant clone (clippy::redundant_clone).
        created_at: parsed_metadata.creation_date,
        author: parsed_metadata.author,
    };
    MarkdownExporter::export_with_metadata_and_pages(&pages, &ai_metadata)
}
/// Exports the full text of `document` in the contextual (LLM-friendly)
/// format with a metadata header.
///
/// Every page is extracted and numbered starting at 1; the document title
/// falls back to `"Untitled Document"` when the PDF declares none.
///
/// # Errors
///
/// Returns an error if text extraction or metadata parsing fails.
#[deprecated(since = "2.1.0", note = "Use PdfDocument::to_contextual() instead")]
pub fn export_to_contextual<R: Read + Seek>(
    document: &crate::parser::PdfDocument<R>,
) -> Result<String> {
    let extracted = document.extract_text()?;
    // Pages are 1-indexed for human-readable output. `extracted` is not used
    // again, so move each page's text instead of cloning it.
    let pages: Vec<(usize, String)> = extracted
        .into_iter()
        .enumerate()
        .map(|(i, page_text)| (i + 1, page_text.text))
        .collect();
    let parsed_metadata = document.metadata()?;
    let ai_metadata = DocumentMetadata {
        title: parsed_metadata
            .title
            .unwrap_or_else(|| "Untitled Document".to_string()),
        page_count: pages.len(),
        // `parsed_metadata` is consumed here; moving the field avoids a
        // redundant clone (clippy::redundant_clone).
        created_at: parsed_metadata.creation_date,
        author: parsed_metadata.author,
    };
    ContextualFormat::export_with_metadata_and_pages(&pages, &ai_metadata)
}
/// Exports the extracted text of `document` as JSON, one entry per page.
///
/// Pages are numbered from 1 in extraction order.
///
/// # Errors
///
/// Returns an error if text extraction or JSON serialization fails.
#[cfg(feature = "semantic")]
#[deprecated(since = "2.1.0", note = "Use PdfDocument::to_json() instead")]
pub fn export_to_json<R: Read + Seek>(document: &crate::parser::PdfDocument<R>) -> Result<String> {
    let extracted = document.extract_text()?;
    // Pair each page with its 1-based page number.
    let pages: Vec<(usize, String)> = (1usize..)
        .zip(extracted.iter())
        .map(|(number, page)| (number, page.text.clone()))
        .collect();
    JsonExporter::export_pages(&pages)
}
/// Splits the document's full text into overlapping chunks and serializes
/// them as JSON.
///
/// All pages are concatenated (separated by blank lines) before chunking,
/// so chunks may span page boundaries.
///
/// # Errors
///
/// Returns an error if text extraction, chunking, or serialization fails.
#[cfg(feature = "semantic")]
#[deprecated(since = "2.1.0", note = "Use PdfDocument::chunk() instead")]
pub fn export_to_chunks<R: Read + Seek>(
    document: &crate::parser::PdfDocument<R>,
    chunk_size: usize,
    overlap: usize,
) -> Result<String> {
    let extracted = document.extract_text()?;
    // Concatenate page texts with a blank line between pages (same result as
    // joining with "\n\n"), reusing one buffer instead of an intermediate Vec.
    let mut full_text = String::new();
    for (index, page) in extracted.iter().enumerate() {
        if index > 0 {
            full_text.push_str("\n\n");
        }
        full_text.push_str(&page.text);
    }
    let chunker = DocumentChunker::new(chunk_size, overlap);
    let chunks = chunker.chunk_text(&full_text)?;
    JsonExporter::export_with_chunks(&chunks)
}