use kreuzberg::core::config::ExtractionConfig;
#[cfg(feature = "pdf")]
use kreuzberg::core::extractor::batch_extract_file_sync;
use kreuzberg::core::extractor::{batch_extract_bytes, batch_extract_bytes_sync, batch_extract_file};
use std::path::PathBuf;
mod helpers;
use helpers::{get_test_documents_dir, get_test_file_path, skip_if_missing, test_documents_available};
/// Returns `value` with every trailing `'\n'` and `'\r'` character removed.
///
/// Only the end of the string is touched; line breaks at the start or in the
/// middle, and any other trailing whitespace, are left as they are.
fn trim_trailing_newlines(value: &str) -> &str {
    let is_line_break = |ch: char| matches!(ch, '\n' | '\r');
    value.trim_end_matches(is_line_break)
}
/// Asserts that `actual`, once its trailing `'\n'` / `'\r'` characters are
/// stripped, equals `expected` exactly.
///
/// `expected` is compared as given; it is not trimmed.
///
/// # Panics
///
/// Panics with "Content mismatch after trimming trailing newlines" when the
/// two strings differ.
fn assert_text_content(actual: &str, expected: &str) {
    let normalized = trim_trailing_newlines(actual);
    assert_eq!(
        normalized, expected,
        "Content mismatch after trimming trailing newlines"
    );
}
/// Feeds a PDF, a DOCX and a plain-text fixture through the async
/// `batch_extract_file` call and checks that each result has non-empty
/// content, the expected MIME type and no recorded error.
///
/// Returns without asserting anything when `test_documents_available()` is
/// false or `skip_if_missing` returns true for any of the fixtures.
#[tokio::test]
#[cfg(all(feature = "pdf", feature = "office", feature = "tokio-runtime"))]
async fn test_batch_extract_file_multiple_formats() {
    if !test_documents_available() {
        println!("Skipping test: test_documents/ directory not found");
        return;
    }

    // Order matters: results are checked by position below.
    let fixtures = ["pdfs/fake_memo.pdf", "documents/fake.docx", "text/fake_text.txt"];
    // `any` stops at the first fixture reported missing, like a `||` chain.
    if fixtures.iter().any(|&fixture| skip_if_missing(fixture)) {
        return;
    }

    let config = ExtractionConfig::default();
    let paths: Vec<_> = fixtures.iter().map(|&fixture| get_test_file_path(fixture)).collect();

    let outcome = batch_extract_file(paths, &config).await;
    assert!(outcome.is_ok(), "Batch extraction should succeed");
    let extracted = outcome.expect("Operation failed");
    assert_eq!(extracted.len(), 3);

    let (pdf, docx, txt) = (&extracted[0], &extracted[1], &extracted[2]);

    assert!(!pdf.content.is_empty(), "PDF content should not be empty");
    assert_eq!(pdf.mime_type, "application/pdf");

    assert!(!docx.content.is_empty(), "DOCX content should not be empty");
    assert_eq!(
        docx.mime_type,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    );

    assert!(!txt.content.is_empty(), "TXT content should not be empty");
    assert_eq!(txt.mime_type, "text/plain");

    // None of the three extractions may have recorded an error.
    for item in &extracted {
        assert!(item.metadata.error.is_none());
    }
}
/// Exercises the blocking `batch_extract_file_sync` call with one PDF and one
/// plain-text fixture, checking content, MIME type and the absence of a
/// recorded error for both results.
///
/// Returns without asserting anything when `test_documents_available()` is
/// false or `skip_if_missing` returns true for either fixture.
#[test]
#[cfg(feature = "pdf")]
fn test_batch_extract_file_sync_variant() {
    if !test_documents_available() {
        println!("Skipping test: test_documents/ directory not found");
        return;
    }

    let pdf_fixture = "pdfs/fake_memo.pdf";
    let text_fixture = "text/fake_text.txt";
    if skip_if_missing(pdf_fixture) || skip_if_missing(text_fixture) {
        return;
    }

    let config = ExtractionConfig::default();
    let paths = vec![get_test_file_path(pdf_fixture), get_test_file_path(text_fixture)];

    let outcome = batch_extract_file_sync(paths, &config);
    assert!(outcome.is_ok(), "Sync batch extraction should succeed");
    let extracted = outcome.expect("Operation failed");
    assert_eq!(extracted.len(), 2);

    // Results are checked by position: PDF first, text second.
    let pdf = &extracted[0];
    assert!(!pdf.content.is_empty(), "PDF content should not be empty");
    assert_eq!(pdf.mime_type, "application/pdf", "PDF MIME type should be correct");
    assert!(pdf.metadata.error.is_none(), "PDF should extract without errors");

    let text = &extracted[1];
    assert!(!text.content.is_empty(), "Text content should not be empty");
    assert_eq!(text.mime_type, "text/plain", "Text MIME type should be correct");
    assert!(text.metadata.error.is_none(), "Text should extract without errors");
}
/// Passes three in-memory payloads (plain text, Markdown, JSON) to the async
/// `batch_extract_bytes` call and checks the content and MIME type of each
/// result by position.
#[tokio::test]
async fn test_batch_extract_bytes_multiple() {
    let config = ExtractionConfig::default();

    // Owned (bytes, MIME type) pairs in the order they are checked below.
    let inputs: Vec<(Vec<u8>, String)> = vec![
        (b"This is plain text content".to_vec(), "text/plain".to_string()),
        (
            b"# Markdown Header\n\nThis is markdown content".to_vec(),
            "text/markdown".to_string(),
        ),
        (
            b"{\"key\": \"value\", \"number\": 42}".to_vec(),
            "application/json".to_string(),
        ),
    ];

    let outcome = batch_extract_bytes(inputs, &config).await;
    assert!(outcome.is_ok(), "Batch bytes extraction should succeed");
    let extracted = outcome.expect("Operation failed");
    assert_eq!(extracted.len(), 3);

    let (plain, markdown, json) = (&extracted[0], &extracted[1], &extracted[2]);

    // Plain text is compared exactly, apart from trailing line breaks.
    assert_text_content(&plain.content, "This is plain text content");
    assert_eq!(plain.mime_type, "text/plain");

    assert!(markdown.content.contains("Markdown Header"));
    assert_eq!(markdown.mime_type, "text/markdown");

    assert!(json.content.contains("key"));
    assert!(json.content.contains("value"));
    assert_eq!(json.mime_type, "application/json");
}
/// Checks that `batch_extract_file` given an empty list of paths returns
/// `Ok` with an empty result collection.
#[tokio::test]
async fn test_batch_extract_empty_list() {
    let config = ExtractionConfig::default();
    let no_paths = Vec::<PathBuf>::new();

    let outcome = batch_extract_file(no_paths, &config).await;
    assert!(outcome.is_ok(), "Empty batch should succeed");

    let extracted = outcome.expect("Operation failed");
    assert_eq!(extracted.len(), 0, "Should return empty vector");
}
/// Checks that one unreadable path in a batch does not fail the whole batch.
///
/// The batch holds two existing text fixtures with a nonexistent path between
/// them. The call must return `Ok` with three results: the first and last
/// with content and no error, the middle one with `metadata.error` set and
/// content containing `"Error:"`.
///
/// Returns without asserting anything when `test_documents_available()` is
/// false or `skip_if_missing` returns true for either fixture the test reads.
#[tokio::test]
async fn test_batch_extract_one_file_fails() {
    if !test_documents_available() {
        println!("Skipping test: test_documents/ directory not found");
        return;
    }
    // Guard BOTH fixtures that must extract cleanly. `text/contract.txt` was
    // previously unguarded, so a checkout without it failed the assertions on
    // the third result instead of skipping like the other fixture-based tests.
    if skip_if_missing("text/fake_text.txt") || skip_if_missing("text/contract.txt") {
        return;
    }

    let config = ExtractionConfig::default();
    let paths = vec![
        get_test_file_path("text/fake_text.txt"),
        // Deliberately missing file: this entry is the one expected to fail.
        get_test_documents_dir().join("nonexistent_file.txt"),
        get_test_file_path("text/contract.txt"),
    ];

    let results = batch_extract_file(paths, &config).await;
    assert!(results.is_ok(), "Batch should succeed even with one failure");
    let results = results.expect("Operation failed");
    assert_eq!(results.len(), 3);

    // First entry: existing fixture, extracted normally.
    assert!(!results[0].content.is_empty());
    assert!(results[0].metadata.error.is_none());

    // Second entry: the failure is reported on the result, not as `Err`.
    assert!(results[1].metadata.error.is_some());
    assert!(results[1].content.contains("Error:"));

    // Third entry: extraction after the failed entry is unaffected.
    assert!(!results[2].content.is_empty());
    assert!(results[2].metadata.error.is_none());
}
/// Checks that a batch made up only of nonexistent paths still returns `Ok`,
/// with each of the three results carrying `metadata.error` and content that
/// contains `"Error:"`.
#[tokio::test]
async fn test_batch_extract_all_fail() {
    let config = ExtractionConfig::default();
    let documents_dir = get_test_documents_dir();

    // Three file names that are not expected to exist in the documents directory.
    let paths: Vec<_> = ["nonexistent1.txt", "nonexistent2.pdf", "nonexistent3.docx"]
        .iter()
        .map(|&file_name| documents_dir.join(file_name))
        .collect();

    let outcome = batch_extract_file(paths, &config).await;
    assert!(outcome.is_ok(), "Batch should succeed (errors in metadata)");
    let extracted = outcome.expect("Operation failed");
    assert_eq!(extracted.len(), 3);

    for item in &extracted {
        assert!(item.metadata.error.is_some());
    }
    for item in &extracted {
        assert!(item.content.contains("Error:"));
    }
}
/// Submits the same text fixture 20 times in one `batch_extract_file` call
/// and checks that every result is error-free, non-empty and `text/plain`,
/// and that the whole call took under five seconds of wall-clock time.
///
/// Returns without asserting anything when `test_documents_available()` is
/// false or `skip_if_missing` returns true for the fixture.
#[tokio::test]
async fn test_batch_extract_concurrent() {
    if !test_documents_available() {
        println!("Skipping test: test_documents/ directory not found");
        return;
    }
    if skip_if_missing("text/fake_text.txt") {
        return;
    }

    let config = ExtractionConfig::default();
    let fixture_path = get_test_file_path("text/fake_text.txt");
    let paths: Vec<PathBuf> = vec![fixture_path; 20];

    // Time only the batch call itself.
    let started_at = std::time::Instant::now();
    let outcome = batch_extract_file(paths, &config).await;
    let elapsed = started_at.elapsed();

    assert!(outcome.is_ok(), "Concurrent batch should succeed");
    let extracted = outcome.expect("Operation failed");
    assert_eq!(extracted.len(), 20);

    for item in extracted.iter() {
        assert!(item.metadata.error.is_none(), "Result should not have errors");
        assert!(!item.content.is_empty(), "Result content should not be empty");
        assert_eq!(item.mime_type, "text/plain", "MIME type should be text/plain");
    }

    assert!(
        !extracted[0].content.is_empty(),
        "Should have extracted actual text content"
    );
    // Same bound as comparing whole elapsed seconds against 5.
    assert!(
        elapsed < std::time::Duration::from_secs(5),
        "Batch processing took too long: {:?}",
        elapsed
    );
}
/// Submits the same text fixture 50 times in one `batch_extract_file` call
/// and checks that all 50 results are error-free, non-empty and `text/plain`.
///
/// Returns without asserting anything when `test_documents_available()` is
/// false or `skip_if_missing` returns true for the fixture.
#[tokio::test]
async fn test_batch_extract_large_batch() {
    if !test_documents_available() {
        println!("Skipping test: test_documents/ directory not found");
        return;
    }
    if skip_if_missing("text/fake_text.txt") {
        return;
    }

    let config = ExtractionConfig::default();
    let fixture_path = get_test_file_path("text/fake_text.txt");
    let paths: Vec<PathBuf> = std::iter::repeat(fixture_path).take(50).collect();

    let outcome = batch_extract_file(paths, &config).await;
    assert!(outcome.is_ok(), "Large batch should succeed");
    let extracted = outcome.expect("Operation failed");
    assert_eq!(extracted.len(), 50);

    for item in extracted.iter() {
        assert!(item.metadata.error.is_none());
        assert!(!item.content.is_empty());
        assert_eq!(item.mime_type, "text/plain");
    }
}
/// Exercises the blocking `batch_extract_bytes_sync` call with two plain-text
/// payloads and one Markdown payload, checking the extracted content of each
/// result by position.
#[test]
fn test_batch_extract_bytes_sync_variant() {
    let config = ExtractionConfig::default();

    // Owned (bytes, MIME type) pairs in the order they are checked below.
    let inputs: Vec<(Vec<u8>, String)> = vec![
        (b"content 1".to_vec(), "text/plain".to_string()),
        (b"content 2".to_vec(), "text/plain".to_string()),
        (b"# content 3".to_vec(), "text/markdown".to_string()),
    ];

    let outcome = batch_extract_bytes_sync(inputs, &config);
    assert!(outcome.is_ok(), "Sync batch bytes extraction should succeed");
    let extracted = outcome.expect("Operation failed");
    assert_eq!(extracted.len(), 3);

    // Plain text is compared exactly, apart from trailing line breaks.
    assert_text_content(&extracted[0].content, "content 1");
    assert_text_content(&extracted[1].content, "content 2");
    // The Markdown result only needs to contain the heading text.
    assert!(extracted[2].content.contains("content 3"));
}