use std::time::{Duration, Instant};
use lopdf::Document as LopdfDocument;
use serde::{Deserialize, Serialize};
use crate::error::{MemvidError, Result};
#[cfg(feature = "symspell_cleanup")]
use crate::symspell_cleanup::fix_pdf_text as fix_pdf_spacing;
#[cfg(not(feature = "symspell_cleanup"))]
use crate::text::fix_pdf_spacing;
/// Default wall-clock allowance for one extraction, in milliseconds.
pub const DEFAULT_EXTRACTION_BUDGET_MS: u64 = 350;

/// Limits that bound how much work a budgeted extraction may do.
#[derive(Debug, Clone, Copy)]
pub struct ExtractionBudget {
    /// Wall-clock allowance, counted from the moment extraction starts.
    pub budget: Duration,
    /// Cap on the amount of extracted text. The extractors in this file
    /// compare it against `str::len()`, so it is effectively a byte count.
    pub max_chars: usize,
    /// Stride used when sampling PDF pages; the page sampler clamps it to at
    /// least 1 before use.
    pub sample_interval: usize,
}

impl Default for ExtractionBudget {
    /// 350 ms, 100 000 characters, every 20th page.
    fn default() -> Self {
        let budget = Duration::from_millis(DEFAULT_EXTRACTION_BUDGET_MS);
        Self {
            sample_interval: 20,
            max_chars: 100_000,
            budget,
        }
    }
}

impl ExtractionBudget {
    /// Default limits with the time allowance replaced by `ms` milliseconds.
    #[must_use]
    pub fn with_ms(ms: u64) -> Self {
        let Self {
            max_chars,
            sample_interval,
            ..
        } = Self::default();
        Self {
            budget: Duration::from_millis(ms),
            max_chars,
            sample_interval,
        }
    }

    /// Practically unbounded limits: a one-hour time allowance, no text cap,
    /// and every page sampled.
    #[must_use]
    pub fn unlimited() -> Self {
        const ONE_HOUR_SECS: u64 = 3600;
        Self {
            sample_interval: 1,
            max_chars: usize::MAX,
            budget: Duration::from_secs(ONE_HOUR_SECS),
        }
    }
}
/// Outcome of a budgeted extraction run.
///
/// What counts as a "section" depends on the extractor that produced the
/// result: PDF pages for the lopdf page sampler, an estimate of one page per
/// 3000 bytes of text for the whole-document PDF backends, and
/// blank-line-separated chunks for plain text and office documents.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BudgetedExtractionResult {
/// Extracted text; empty when nothing could be extracted (for example for
/// binary content or when no reader matches).
pub text: String,
/// Number of sections that contributed text to `text`.
pub sections_extracted: usize,
/// Number of sections in the source. Only the lopdf page sampler can report
/// a value larger than `sections_extracted`; the other extractors in this
/// file set both fields to the same number.
pub sections_total: usize,
/// `false` when a PDF extraction stopped early or had its text truncated.
/// The plain-text, office and binary paths in this file always report `true`.
pub completed: bool,
/// Wall-clock time spent, in milliseconds, saturating at `u64::MAX`.
/// The binary-content shortcut reports 0.
pub elapsed_ms: u64,
/// `sections_extracted / sections_total` for the lopdf page sampler;
/// every other path in this file reports 1.0.
pub coverage: f32,
}
impl BudgetedExtractionResult {
    /// Returns `true` when `text` holds at least one non-whitespace character.
    #[must_use]
    pub fn has_content(&self) -> bool {
        self.text.chars().any(|ch| !ch.is_whitespace())
    }

    /// Returns `true` for a partial ("skim") result: the run did not complete
    /// and fewer sections were extracted than the source contains.
    #[must_use]
    pub fn is_skim(&self) -> bool {
        let every_section_present = self.sections_extracted >= self.sections_total;
        !(self.completed || every_section_present)
    }
}
/// Extracts text from an in-memory PDF, trying the available backends in order.
///
/// 1. `extractous` (only with the `extractous` feature): the bytes are written
///    to a temporary file and extracted as a whole document.
/// 2. `pdf-extract` (only with the `pdf_extract` feature): extracted from
///    memory, with panics caught so a misbehaving parser cannot take the
///    caller down.
/// 3. `lopdf` page sampling via `extract_pdf_budgeted_lopdf`.
///
/// The first backend that yields non-empty text wins; any failure in the first
/// two falls through to the next one. The whole-document backends only honour
/// `budget.max_chars` (the text is truncated and `completed` is `false` when
/// the limit is hit); the time allowance is enforced by the lopdf fallback
/// only. Because they have no page structure, these backends report an
/// estimated page count of one page per 3000 bytes of text.
///
/// # Errors
///
/// Returns the error produced by the lopdf fallback when every earlier backend
/// is unavailable or yields nothing.
pub fn extract_pdf_budgeted(
    bytes: &[u8],
    budget: ExtractionBudget,
) -> Result<BudgetedExtractionResult> {
    let start = Instant::now();

    #[cfg(feature = "extractous")]
    {
        use extractous::Extractor;
        // extractous is handed a path here, so the buffer is spilled to a
        // temporary file first.
        if let Ok(mut temp_file) = tempfile::NamedTempFile::new() {
            use std::io::Write;
            if temp_file.write_all(bytes).is_ok() {
                let extractor = Extractor::new();
                let path_str = temp_file.path().to_string_lossy();
                if let Ok((text, _metadata)) = extractor.extract_file_to_string(&path_str) {
                    let trimmed = text.trim();
                    if !trimmed.is_empty() {
                        tracing::debug!("extractous successfully extracted PDF text");
                        let estimated_pages = (trimmed.len() / 3000).max(1);
                        let text_len = trimmed.len();
                        let final_text = if text_len > budget.max_chars {
                            truncate_at_boundary(trimmed, budget.max_chars)
                        } else {
                            trimmed.to_string()
                        };
                        // Any shortening means the document was not fully captured.
                        let completed = final_text.len() == text_len;
                        return Ok(BudgetedExtractionResult {
                            text: final_text,
                            sections_extracted: estimated_pages,
                            sections_total: estimated_pages,
                            completed,
                            elapsed_ms: start.elapsed().as_millis().try_into().unwrap_or(u64::MAX),
                            coverage: 1.0,
                        });
                    }
                    tracing::debug!("extractous returned empty text, trying pdf-extract");
                }
            }
        }
    }

    #[cfg(feature = "pdf_extract")]
    {
        // `catch_unwind` has no `'static` bound and `AssertUnwindSafe` waives
        // the unwind-safety check, so the closure can borrow `bytes` directly
        // instead of working on a full copy of the document.
        let extract_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            pdf_extract::extract_text_from_mem(bytes)
        }));
        match extract_result {
            Ok(Ok(text)) => {
                let cleaned = fix_pdf_spacing(text.trim());
                if !cleaned.is_empty() {
                    let estimated_pages = (cleaned.len() / 3000).max(1);
                    let cleaned_len = cleaned.len();
                    let final_text = if cleaned_len > budget.max_chars {
                        truncate_at_boundary(&cleaned, budget.max_chars)
                    } else {
                        cleaned
                    };
                    // Any shortening means the document was not fully captured.
                    let completed = final_text.len() == cleaned_len;
                    return Ok(BudgetedExtractionResult {
                        text: final_text,
                        sections_extracted: estimated_pages,
                        sections_total: estimated_pages,
                        completed,
                        elapsed_ms: start.elapsed().as_millis().try_into().unwrap_or(u64::MAX),
                        coverage: 1.0,
                    });
                }
                tracing::debug!("pdf-extract returned empty text, trying lopdf");
            }
            Ok(Err(e)) => {
                tracing::debug!(?e, "pdf-extract failed, trying lopdf");
            }
            Err(_) => {
                tracing::warn!(
                    "pdf-extract panicked (likely font parsing issue), falling back to lopdf"
                );
            }
        }
    }

    extract_pdf_budgeted_lopdf(bytes, budget, start)
}
/// Shortens `text` to at most `max_chars` bytes, preferring to cut at the last
/// whitespace character inside that window so a word is not split.
///
/// Text that already fits is returned unchanged. When the window contains no
/// whitespace the cut is made at the window end. The window end is first moved
/// back to a UTF-8 character boundary, so a limit that lands inside a
/// multi-byte character never causes a slicing panic.
fn truncate_at_boundary(text: &str, max_chars: usize) -> String {
    if text.len() <= max_chars {
        return text.to_string();
    }

    // `max_chars < text.len()` here. Index 0 is always a boundary, so the
    // loop terminates without underflow.
    let mut limit = max_chars;
    while !text.is_char_boundary(limit) {
        limit -= 1;
    }

    let window = &text[..limit];
    // `rfind` yields the byte offset where the whitespace character starts,
    // which is itself a valid boundary.
    let cut = window.rfind(char::is_whitespace).unwrap_or(limit);
    window[..cut].to_string()
}
/// Page-sampling PDF extraction on top of `lopdf`, bounded by `budget`.
///
/// The first and last pages are extracted first. The remaining pages are then
/// visited at a stride of `budget.sample_interval` (clamped to at least 1)
/// over the sorted page list, until the deadline passes or the collected text
/// reaches `budget.max_chars`. Both limits are checked *before* each sampled
/// page, so the final text can exceed `max_chars` by up to one page and the
/// run can overshoot the deadline by the time one page takes. Pages whose
/// extraction fails or yields only whitespace are skipped.
///
/// `completed` is `true` only when every page of the document contributed
/// text. The deadline is measured from `start`, which the caller passes in so
/// that time spent in earlier backends counts against the same budget.
///
/// # Errors
///
/// Returns `MemvidError::ExtractionFailed` when the PDF cannot be loaded, or
/// when it is encrypted and cannot be decrypted with an empty password.
fn extract_pdf_budgeted_lopdf(
    bytes: &[u8],
    budget: ExtractionBudget,
    start: Instant,
) -> Result<BudgetedExtractionResult> {
    // `Instant + Duration` panics on overflow and `budget.budget` is a public,
    // caller-settable field. `None` means the deadline is not representable
    // and is therefore treated as never reached.
    let deadline = start.checked_add(budget.budget);
    let out_of_time = || deadline.is_some_and(|limit| Instant::now() >= limit);

    let mut document =
        LopdfDocument::load_mem(bytes).map_err(|err| MemvidError::ExtractionFailed {
            reason: format!("failed to load PDF: {err}").into(),
        })?;
    // Encrypted documents are only readable here if the empty password works.
    if document.is_encrypted() && document.decrypt("").is_err() {
        return Err(MemvidError::ExtractionFailed {
            reason: "cannot decrypt password-protected PDF".into(),
        });
    }
    let () = document.decompress();

    let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
    if page_numbers.is_empty() {
        return Ok(BudgetedExtractionResult {
            text: String::new(),
            sections_extracted: 0,
            sections_total: 0,
            completed: true,
            elapsed_ms: start.elapsed().as_millis().try_into().unwrap_or(u64::MAX),
            coverage: 1.0,
        });
    }
    page_numbers.sort_unstable();
    let page_count = page_numbers.len();

    let mut extracted_pages: Vec<(u32, String)> = Vec::new();
    let mut total_chars = 0usize;

    // First and last page are extracted before anything else.
    let priority_pages: Vec<u32> = {
        let mut pages = vec![page_numbers[0]];
        if page_count > 1 {
            pages.push(page_numbers[page_count - 1]);
        }
        pages
    };
    for &page_num in &priority_pages {
        if total_chars >= budget.max_chars {
            break;
        }
        if let Ok(text) = document.extract_text(&[page_num]) {
            let trimmed = text.trim();
            if !trimmed.is_empty() {
                total_chars += trimmed.len();
                extracted_pages.push((page_num, trimmed.to_string()));
            }
        }
    }
    if out_of_time() || total_chars >= budget.max_chars {
        return finish_extraction(extracted_pages, page_count, start, false);
    }

    // Sample the remaining pages by index stride, skipping the priority pages
    // that were already handled.
    let sample_interval = budget.sample_interval.max(1);
    let middle_pages: Vec<u32> = page_numbers
        .iter()
        .enumerate()
        .filter(|(i, page)| !priority_pages.contains(page) && (*i % sample_interval == 0))
        .map(|(_, page)| *page)
        .collect();
    for &page_num in &middle_pages {
        if out_of_time() || total_chars >= budget.max_chars {
            return finish_extraction(extracted_pages, page_count, start, false);
        }
        if let Ok(text) = document.extract_text(&[page_num]) {
            let trimmed = text.trim();
            if !trimmed.is_empty() {
                total_chars += trimmed.len();
                extracted_pages.push((page_num, trimmed.to_string()));
            }
        }
    }

    // Complete only if every page of the document produced text.
    let completed = extracted_pages.len() >= page_count;
    finish_extraction(extracted_pages, page_count, start, completed)
}
/// Treats `bytes` as text and returns at most `budget.max_chars` bytes of it.
///
/// Invalid UTF-8 sequences are replaced with U+FFFD. When the decoded text is
/// longer than `budget.max_chars`, it is cut at the nearest character boundary
/// at or below that limit. Sections are the chunks obtained by splitting the
/// (possibly shortened) text on blank lines (`"\n\n"`), so empty input counts
/// as one section. The result always reports `completed: true` and a coverage
/// of 1.0; the time allowance in `budget` is not consulted.
///
/// # Errors
///
/// This function always returns `Ok`.
pub fn extract_text_budgeted(
    bytes: &[u8],
    budget: ExtractionBudget,
) -> Result<BudgetedExtractionResult> {
    let started = Instant::now();

    // Borrows when the input is valid UTF-8, substitutes U+FFFD otherwise.
    let mut content = String::from_utf8_lossy(bytes).into_owned();
    if content.len() > budget.max_chars {
        // Index 0 is always a boundary, so the search cannot come up empty.
        let cut = (0..=budget.max_chars)
            .rev()
            .find(|&idx| content.is_char_boundary(idx))
            .unwrap_or(0);
        content.truncate(cut);
    }

    let paragraph_count = content.split("\n\n").count();
    let elapsed_ms = u64::try_from(started.elapsed().as_millis()).unwrap_or(u64::MAX);
    Ok(BudgetedExtractionResult {
        text: content,
        sections_extracted: paragraph_count,
        sections_total: paragraph_count,
        completed: true,
        elapsed_ms,
        coverage: 1.0,
    })
}
/// Returns `true` when `mime` names an office document type.
///
/// The comparison is case-insensitive. OOXML types are recognised by the
/// `spreadsheetml` / `wordprocessingml` / `presentationml` marker anywhere in
/// the string; the legacy Excel, Word and PowerPoint types must match exactly.
fn is_ooxml_mime(mime: Option<&str>) -> bool {
    const OOXML_MARKERS: [&str; 3] = ["spreadsheetml", "wordprocessingml", "presentationml"];
    const LEGACY_OFFICE_TYPES: [&str; 3] = [
        "application/vnd.ms-excel",
        "application/msword",
        "application/vnd.ms-powerpoint",
    ];

    mime.map(str::to_lowercase).is_some_and(|lowered| {
        OOXML_MARKERS.iter().any(|&marker| lowered.contains(marker))
            || LEGACY_OFFICE_TYPES.iter().any(|&known| lowered == known)
    })
}
/// Extracts text from an office document through the reader registry.
///
/// The document format is taken from the MIME type when it identifies one
/// (case-insensitively) and otherwise from the URI's file extension; it may
/// remain unknown. When no reader matches the resulting hint, an empty,
/// completed result is returned. Sections are the non-blank chunks between
/// blank lines, with a minimum of one.
///
/// No `ExtractionBudget` is taken, so neither a time nor a size limit is
/// applied here.
///
/// # Errors
///
/// Returns the error reported by the selected reader's `extract` call.
fn extract_ooxml_budgeted(
    bytes: &[u8],
    mime: Option<&str>,
    uri: Option<&str>,
) -> Result<BudgetedExtractionResult> {
    use crate::reader::{DocumentFormat, ReaderHint, ReaderRegistry};

    let started = Instant::now();

    // The MIME type wins when it pins down a format.
    let lowered_mime = mime.map(str::to_lowercase);
    let format_from_mime = lowered_mime.as_deref().and_then(|declared| {
        if declared.contains("spreadsheetml") {
            Some(DocumentFormat::Xlsx)
        } else if declared.contains("wordprocessingml") {
            Some(DocumentFormat::Docx)
        } else if declared.contains("presentationml") {
            Some(DocumentFormat::Pptx)
        } else if declared == "application/vnd.ms-excel" {
            Some(DocumentFormat::Xls)
        } else {
            None
        }
    });

    // Otherwise fall back to the file extension found in the URI.
    let format = format_from_mime.or_else(|| {
        let path = uri?.to_lowercase();
        let has_extension = |extension: &str| path.ends_with(extension);
        if has_extension(".xlsx") {
            Some(DocumentFormat::Xlsx)
        } else if has_extension(".docx") {
            Some(DocumentFormat::Docx)
        } else if has_extension(".pptx") {
            Some(DocumentFormat::Pptx)
        } else if has_extension(".xls") {
            Some(DocumentFormat::Xls)
        } else {
            None
        }
    });

    let hint = ReaderHint::new(mime, format).with_uri(uri);
    let registry = ReaderRegistry::default();

    let Some(reader) = registry.find_reader(&hint) else {
        // Nothing can handle this document: report an empty, finished result.
        return Ok(BudgetedExtractionResult {
            text: String::new(),
            sections_extracted: 0,
            sections_total: 0,
            completed: true,
            elapsed_ms: started.elapsed().as_millis().try_into().unwrap_or(u64::MAX),
            coverage: 1.0,
        });
    };

    let output = reader.extract(bytes, &hint)?;
    let text = output.document.text.unwrap_or_default();
    let non_blank_chunks = text
        .split("\n\n")
        .filter(|chunk| !chunk.trim().is_empty())
        .count();
    let sections = non_blank_chunks.max(1);

    Ok(BudgetedExtractionResult {
        text,
        sections_extracted: sections,
        sections_total: sections,
        completed: true,
        elapsed_ms: started.elapsed().as_millis().try_into().unwrap_or(u64::MAX),
        coverage: 1.0,
    })
}
/// Picks an extractor for `bytes` from the MIME type, URI and content, and
/// runs it.
///
/// Dispatch order:
/// 1. PDF, when the MIME type contains `pdf` (case-insensitive) or the
///    content starts with a PDF signature.
/// 2. Office documents, detected by MIME type or file extension.
/// 3. Binary content, detected by MIME type or a byte-level heuristic; this
///    yields an empty, completed result without attempting extraction.
/// 4. Everything else is treated as plain text.
///
/// `budget` is forwarded to the PDF and plain-text extractors only.
///
/// # Errors
///
/// Propagates the error of the extractor that was selected.
pub fn extract_with_budget(
    bytes: &[u8],
    mime: Option<&str>,
    uri: Option<&str>,
    budget: ExtractionBudget,
) -> Result<BudgetedExtractionResult> {
    // MIME types are case-insensitive; lower-case before matching, as the
    // other MIME checks in this file do.
    let is_pdf = mime.is_some_and(|m| m.to_lowercase().contains("pdf")) || is_pdf_magic(bytes);
    if is_pdf {
        extract_pdf_budgeted(bytes, budget)
    } else if is_ooxml_mime(mime) || is_ooxml_by_extension(uri) || is_ooxml_by_magic(bytes, uri) {
        extract_ooxml_budgeted(bytes, mime, uri)
    } else if is_binary_mime(mime) || is_binary_content(bytes) {
        // Nothing textual to extract; no work was done, hence zero elapsed time.
        Ok(BudgetedExtractionResult {
            text: String::new(),
            sections_extracted: 0,
            sections_total: 0,
            completed: true,
            elapsed_ms: 0,
            coverage: 1.0,
        })
    } else {
        extract_text_budgeted(bytes, budget)
    }
}
/// Returns `true` when `uri` ends, case-insensitively, with one of the office
/// file extensions (`.docx`, `.xlsx`, `.pptx`, `.doc`, `.xls`, `.ppt`).
///
/// Only the very end of the string is examined, so a trailing query string or
/// fragment prevents a match.
fn is_ooxml_by_extension(uri: Option<&str>) -> bool {
    const OFFICE_SUFFIXES: [&str; 6] = [".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt"];

    match uri {
        Some(raw) => {
            let lowered = raw.to_lowercase();
            OFFICE_SUFFIXES
                .iter()
                .any(|&suffix| lowered.ends_with(suffix))
        }
        None => false,
    }
}
/// Returns `true` when `bytes` start with the ZIP local-file-header signature
/// (`PK\x03\x04`) and `uri` also carries an office file extension.
///
/// The signature alone is not enough, since any ZIP archive starts that way.
fn is_ooxml_by_magic(bytes: &[u8], uri: Option<&str>) -> bool {
    const ZIP_LOCAL_HEADER: &[u8] = b"PK\x03\x04";
    bytes.starts_with(ZIP_LOCAL_HEADER) && is_ooxml_by_extension(uri)
}
/// Returns `true` when `mime` denotes content with no extractable text:
/// video, audio or image types, `application/octet-stream`, or any type whose
/// name contains an archive marker (`zip`, `gzip`, `tar`, `rar`, `7z`).
///
/// The comparison is case-insensitive. The archive markers are plain substring
/// matches, so they also hit unrelated types that merely contain those letters.
fn is_binary_mime(mime: Option<&str>) -> bool {
    const MEDIA_PREFIXES: [&str; 3] = ["video/", "audio/", "image/"];
    const ARCHIVE_MARKERS: [&str; 5] = ["zip", "gzip", "tar", "rar", "7z"];

    let Some(raw) = mime else { return false };
    let lowered = raw.to_lowercase();

    lowered == "application/octet-stream"
        || MEDIA_PREFIXES
            .iter()
            .any(|&prefix| lowered.starts_with(prefix))
        || ARCHIVE_MARKERS
            .iter()
            .any(|&marker| lowered.contains(marker))
}
/// Heuristically decides whether `bytes` are binary rather than text.
///
/// Looks at the first 8192 bytes at most and counts control bytes (below 0x20)
/// other than tab, line feed and carriage return. The content is considered
/// binary when those make up more than 30 % of the sample, using integer
/// arithmetic. Empty input is never binary.
fn is_binary_content(bytes: &[u8]) -> bool {
    const SAMPLE_LIMIT: usize = 8192;
    const THRESHOLD_PERCENT: usize = 30;

    let window = &bytes[..bytes.len().min(SAMPLE_LIMIT)];
    if window.is_empty() {
        return false;
    }

    let mut suspicious = 0usize;
    for &byte in window {
        // NUL is included here since it is below 0x20 and not a text control.
        if byte < 0x20 && !matches!(byte, b'\t' | b'\n' | b'\r') {
            suspicious += 1;
        }
    }

    suspicious * 100 / window.len() > THRESHOLD_PERCENT
}
/// Returns `true` when `bytes` begin with the `%PDF` signature.
///
/// A UTF-8 byte-order mark at the very start is ignored, and so are any NUL or
/// ASCII-whitespace bytes that precede the signature.
fn is_pdf_magic(bytes: &[u8]) -> bool {
    const UTF8_BOM: &[u8] = b"\xEF\xBB\xBF";
    const PDF_SIGNATURE: &[u8] = b"%PDF";

    let after_bom = if bytes.starts_with(UTF8_BOM) {
        &bytes[UTF8_BOM.len()..]
    } else {
        bytes
    };

    // Index of the first byte that is neither NUL nor ASCII whitespace; when
    // there is none, the remaining slice is empty and cannot match.
    let payload_start = after_bom
        .iter()
        .position(|byte| *byte != 0 && !byte.is_ascii_whitespace())
        .unwrap_or(after_bom.len());

    after_bom[payload_start..].starts_with(PDF_SIGNATURE)
}
/// Assembles the final result from the pages collected by the page sampler.
///
/// Pages are put in ascending page-number order, each page's text is passed
/// through `fix_pdf_spacing`, and the pages are joined with a blank line.
/// Coverage is the share of `total_pages` that contributed text, or 1.0 when
/// the document has no pages. Elapsed time is measured from `start`.
///
/// # Errors
///
/// This function always returns `Ok`.
fn finish_extraction(
    mut pages: Vec<(u32, String)>,
    total_pages: usize,
    start: Instant,
    completed: bool,
) -> Result<BudgetedExtractionResult> {
    // Stable sort: pages with equal numbers keep their collection order.
    pages.sort_by(|left, right| left.0.cmp(&right.0));

    let sections_extracted = pages.len();
    let coverage = match total_pages {
        0 => 1.0,
        total => sections_extracted as f32 / total as f32,
    };

    let mut text = String::new();
    for (position, (_, page_text)) in pages.iter().enumerate() {
        if position > 0 {
            text.push_str("\n\n");
        }
        text.push_str(&fix_pdf_spacing(page_text));
    }

    let elapsed_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
    Ok(BudgetedExtractionResult {
        text,
        sections_extracted,
        sections_total: total_pages,
        completed,
        elapsed_ms,
        coverage,
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain text comes back whole, one section per blank-line-separated paragraph.
    #[test]
    fn test_text_extraction_budget() {
        let input: &[u8] = b"Hello world.\n\nThis is a test.\n\nAnother paragraph.";
        let outcome = extract_text_budgeted(input, ExtractionBudget::default()).unwrap();

        assert!(outcome.completed);
        assert!(outcome.has_content());
        assert_eq!(outcome.sections_extracted, 3);
        assert_eq!(outcome.coverage, 1.0);
    }

    /// Text longer than `max_chars` is cut down to exactly that many bytes.
    #[test]
    fn test_text_truncation() {
        let limits = ExtractionBudget {
            max_chars: 1000,
            ..Default::default()
        };
        let oversized = "x".repeat(200_000);

        let outcome = extract_text_budgeted(oversized.as_bytes(), limits).unwrap();

        assert_eq!(outcome.text.len(), 1000);
    }

    /// The PDF signature is found behind leading whitespace and rejected otherwise.
    #[test]
    fn test_pdf_magic_detection() {
        let pdf_like: [&[u8]; 2] = [b"%PDF-1.7", b" \n%PDF-1.4"];
        for candidate in pdf_like {
            assert!(is_pdf_magic(candidate));
        }

        let not_pdf: [&[u8]; 2] = [b"Hello world", b""];
        for candidate in not_pdf {
            assert!(!is_pdf_magic(candidate));
        }
    }

    /// The budget constructors produce the expected time and sampling settings.
    #[test]
    fn test_budget_config() {
        let standard = ExtractionBudget::default();
        assert_eq!(standard.budget.as_millis(), 350);

        let half_second = ExtractionBudget::with_ms(500);
        assert_eq!(half_second.budget.as_millis(), 500);

        let open_ended = ExtractionBudget::unlimited();
        assert_eq!(open_ended.sample_interval, 1);
    }
}