#[cfg(feature = "ocr-tesseract")]
mod ocr_integration_tests {
use oxidize_pdf::operations::pdf_ocr_converter::{ConversionOptions, PdfOcrConverter};
use oxidize_pdf::text::{
FragmentType, OcrEngine, OcrOptions, OcrProcessingResult, OcrProvider, OcrTextFragment,
RustyTesseractProvider,
};
use oxidize_pdf::{Color, Document, Font, ImageFormat, Page};
use std::fs;
use std::path::Path;
use tempfile::TempDir;
/// Test double that reports canned text and confidence instead of running a real OCR engine.
struct MockOcrProvider {
    /// Text handed back as the recognition result.
    mock_text: String,
    /// Confidence value reported together with the text.
    mock_confidence: f64,
}

impl MockOcrProvider {
    /// Builds a provider that will report an owned copy of `text` with the given `confidence`.
    fn new(text: &str, confidence: f64) -> Self {
        let mock_text = String::from(text);
        let mock_confidence = confidence;
        Self {
            mock_text,
            mock_confidence,
        }
    }
}
/// `OcrProvider` implementation that ignores its inputs and reports the stored
/// text and confidence.
impl OcrProvider for MockOcrProvider {
    /// Returns a fixed, always-successful result built from the stored text and confidence.
    ///
    /// The image bytes and options are never inspected. The result carries exactly
    /// one word fragment at (100, 700) with a 200x20 box and font size 12, and
    /// reports an 800x600 image, language `"eng"` and a 10 ms processing time.
    fn process_image(
        &self,
        _image_data: &[u8],
        _options: &OcrOptions,
    ) -> oxidize_pdf::text::OcrResult<OcrProcessingResult> {
        let confidence = self.mock_confidence;

        // The single fragment mirrors the overall text and confidence.
        let word = OcrTextFragment {
            text: self.mock_text.clone(),
            x: 100.0,
            y: 700.0,
            width: 200.0,
            height: 20.0,
            confidence,
            word_confidences: None,
            font_size: 12.0,
            fragment_type: FragmentType::Word,
        };

        let outcome = OcrProcessingResult {
            text: self.mock_text.clone(),
            confidence,
            processing_time_ms: 10,
            fragments: vec![word],
            engine_name: String::from("Mock OCR"),
            language: String::from("eng"),
            processed_region: None,
            image_dimensions: (800, 600),
        };

        Ok(outcome)
    }

    /// Lists the image formats the mock claims to accept: JPEG, PNG and TIFF.
    fn supported_formats(&self) -> Vec<ImageFormat> {
        Vec::from([ImageFormat::Jpeg, ImageFormat::Png, ImageFormat::Tiff])
    }

    /// Name reported for this engine.
    fn engine_name(&self) -> &str {
        "Mock OCR"
    }

    /// Engine kind reported for this provider.
    fn engine_type(&self) -> OcrEngine {
        OcrEngine::Mock
    }
}
/// Writes a single-page A4 PDF containing a light-gray panel and two lines of
/// Helvetica text to `output_path`.
///
/// # Errors
///
/// Propagates any error raised while writing the text, serializing the
/// document, or writing the file to disk.
fn create_test_pdf(output_path: &Path) -> Result<(), Box<dyn std::error::Error>> {
    // Intended 8-bit gray level of the background panel.
    const PANEL_GRAY_LEVEL: f64 = 245.0;

    let mut doc = Document::new();
    let mut page = Page::a4();

    // `Color::rgb` is expected to take components on a 0.0-1.0 scale, so the
    // 8-bit level is scaled down; passing 245.0 directly would be out of range
    // and would not produce the intended light gray.
    let panel_gray = PANEL_GRAY_LEVEL / 255.0;
    page.graphics()
        .set_fill_color(Color::rgb(panel_gray, panel_gray, panel_gray))
        .rect(50.0, 50.0, 500.0, 700.0)
        .fill();

    // Title line.
    page.text()
        .set_font(Font::Helvetica, 14.0)
        .at(100.0, 700.0)
        .write("Test OCR Document")?;

    // Body line, placed 30 units below the title.
    page.text()
        .set_font(Font::Helvetica, 12.0)
        .at(100.0, 670.0)
        .write("This is a test document for OCR processing.")?;

    doc.add_page(page);

    let pdf_bytes = doc.to_bytes()?;
    fs::write(output_path, pdf_bytes)?;
    Ok(())
}
/// Pins the values produced by `ConversionOptions::default()`.
#[test]
fn test_conversion_options_default() {
    let defaults = ConversionOptions::default();

    // Numeric settings.
    assert_eq!(defaults.min_confidence, 0.7);
    // Boolean switch checked between the numeric ones to keep the original order.
    assert!(defaults.skip_text_pages);
    assert_eq!(defaults.text_layer_font_size, 12.0);
    assert_eq!(defaults.dpi, 300);

    assert!(defaults.preserve_structure);
}
/// Checks that explicitly supplied fields override the defaults in `ConversionOptions`.
#[test]
fn test_conversion_options_custom() {
    // Every field listed here differs from the value asserted for the defaults.
    let custom = ConversionOptions {
        min_confidence: 0.8,
        skip_text_pages: false,
        text_layer_font_size: 14.0,
        dpi: 600,
        preserve_structure: false,
        ..Default::default()
    };

    assert_eq!(custom.min_confidence, 0.8);
    assert!(!custom.skip_text_pages);
    assert_eq!(custom.text_layer_font_size, 14.0);
    assert_eq!(custom.dpi, 600);
    assert!(!custom.preserve_structure);
}
/// Checks that `PdfOcrConverter::new()` succeeds.
#[test]
fn test_pdf_ocr_converter_creation() {
    assert!(
        PdfOcrConverter::new().is_ok(),
        "Failed to create PDF OCR converter"
    );
}
/// Verifies that the mock provider echoes the configured text and confidence,
/// both in the overall result and in its single word fragment.
#[test]
fn test_mock_ocr_provider() {
    let options = OcrOptions::default();
    // Content is irrelevant: the mock never looks at the image bytes.
    let dummy_image = vec![0u8; 100];
    let provider = MockOcrProvider::new("Test text", 0.95);

    let result = provider.process_image(&dummy_image, &options);
    assert!(result.is_ok());

    let ocr_result = result.unwrap();
    assert_eq!(ocr_result.text, "Test text");
    assert_eq!(ocr_result.confidence, 0.95);

    // Length is checked before indexing so a wrong count fails with a clear message.
    assert_eq!(ocr_result.fragments.len(), 1);
    let only_fragment = &ocr_result.fragments[0];
    assert_eq!(only_fragment.text, "Test text");
    assert_eq!(only_fragment.fragment_type, FragmentType::Word);
}
/// Runs a full conversion of a generated PDF through the mock OCR provider.
///
/// NOTE(review): a conversion error is only printed, not treated as a failure,
/// so this test fails only if setup fails or if a successful conversion
/// produces no output file or reports zero pages.
#[test]
fn test_pdf_conversion_with_mock_ocr() -> Result<(), Box<dyn std::error::Error>> {
    // Scratch directory; removed when `workspace` is dropped.
    let workspace = TempDir::new()?;
    let input_path = workspace.path().join("test_input.pdf");
    let output_path = workspace.path().join("test_output.pdf");

    create_test_pdf(&input_path)?;
    assert!(input_path.exists(), "Test PDF was not created");

    let converter = PdfOcrConverter::new()?;
    let ocr_provider = MockOcrProvider::new("Extracted text from OCR", 0.85);
    let options = ConversionOptions {
        min_confidence: 0.7,
        // Process pages even though the generated PDF already contains text.
        skip_text_pages: false,
        ..Default::default()
    };

    match converter.convert_to_searchable_pdf(&input_path, &output_path, &ocr_provider, &options) {
        Err(e) => {
            println!(
                "⚠️ OCR conversion failed (expected due to page analysis limitations): {}",
                e
            );
        }
        Ok(conversion_result) => {
            assert!(output_path.exists(), "Output PDF was not created");
            assert!(conversion_result.pages_processed > 0);
            println!(
                "✅ OCR conversion successful: {} pages processed",
                conversion_result.pages_processed
            );
        }
    }

    Ok(())
}
/// Checks that a mock result with confidence 0.3 falls below the default
/// `ConversionOptions::min_confidence` threshold.
#[test]
fn test_low_confidence_handling() {
    let dummy_image = vec![0u8; 100];
    let ocr_options = OcrOptions::default();
    let low_confidence_provider = MockOcrProvider::new("Low confidence text", 0.3);

    let outcome = low_confidence_provider.process_image(&dummy_image, &ocr_options);
    assert!(outcome.is_ok());

    let ocr_result = outcome.unwrap();
    assert_eq!(ocr_result.confidence, 0.3);

    // The converter's default threshold must reject this result.
    let threshold = ConversionOptions::default().min_confidence;
    assert!(ocr_result.confidence < threshold);
}
/// Pins the default language and confidence of `OcrOptions`, then checks that
/// both fields can be overwritten.
#[test]
fn test_ocr_options_configuration() {
    let mut ocr_options = OcrOptions::default();

    // Defaults as shipped.
    assert_eq!(ocr_options.language, "en");
    assert_eq!(ocr_options.min_confidence, 0.6);

    // Overwrite both fields and read them back.
    ocr_options.language = String::from("spa");
    ocr_options.min_confidence = 0.8;

    assert_eq!(ocr_options.language, "spa");
    assert_eq!(ocr_options.min_confidence, 0.8);
}
/// Exercises `PdfOcrConverter::batch_convert` on two generated PDFs.
///
/// NOTE(review): both outcomes of the batch call are only printed, so this test
/// fails only when the setup steps (directories, input PDFs, converter) fail.
#[test]
fn test_batch_conversion_interface() -> Result<(), Box<dyn std::error::Error>> {
    // Scratch directory; removed when `workspace` is dropped.
    let workspace = TempDir::new()?;
    let input_dir = workspace.path().join("input");
    let output_dir = workspace.path().join("output");
    fs::create_dir_all(&input_dir)?;
    fs::create_dir_all(&output_dir)?;

    // Generate the input documents in the same order they are passed to the converter.
    let input_paths: Vec<_> = ["test1.pdf", "test2.pdf"]
        .iter()
        .map(|file_name| input_dir.join(file_name))
        .collect();
    for pdf_path in &input_paths {
        create_test_pdf(pdf_path)?;
    }

    let converter = PdfOcrConverter::new()?;
    let ocr_provider = MockOcrProvider::new("Batch test text", 0.9);
    let options = ConversionOptions::default();

    match converter.batch_convert(&input_paths, &output_dir, &ocr_provider, &options) {
        Err(e) => {
            println!(
                "⚠️ Batch conversion failed (expected due to limitations): {}",
                e
            );
        }
        Ok(results) => {
            println!(
                "✅ Batch conversion successful: {} files processed",
                results.len()
            );
        }
    }

    Ok(())
}
/// Creates a `RustyTesseractProvider` and checks that it reports the Tesseract engine type.
#[cfg(feature = "ocr-tesseract")]
#[test]
fn test_tesseract_provider_creation() {
    let tesseract = RustyTesseractProvider::new();
    println!("✅ Tesseract provider created successfully");

    assert_eq!(tesseract.engine_type(), OcrEngine::Tesseract);
}
/// Builds a `ConversionResult` by hand and checks that every field reads back
/// as written and that skipped plus OCR-processed pages add up to five.
#[test]
fn test_conversion_result_statistics() {
    use oxidize_pdf::operations::pdf_ocr_converter::ConversionResult;
    use std::time::Duration;

    let stats = ConversionResult {
        pages_processed: 5,
        pages_ocr_processed: 3,
        pages_skipped: 2,
        processing_time: Duration::from_secs(10),
        average_confidence: 0.85,
        total_characters_extracted: 1250,
    };

    // Page counters.
    assert_eq!(stats.pages_processed, 5);
    assert_eq!(stats.pages_ocr_processed, 3);
    assert_eq!(stats.pages_skipped, 2);

    // Timing and quality figures.
    assert_eq!(stats.processing_time, Duration::from_secs(10));
    assert_eq!(stats.average_confidence, 0.85);
    assert_eq!(stats.total_characters_extracted, 1250);

    // Every page is either OCR-processed or skipped.
    assert_eq!(stats.pages_ocr_processed + stats.pages_skipped, 5);
}
}
/// Fallback test module compiled only when the `ocr-tesseract` feature is off.
#[cfg(not(feature = "ocr-tesseract"))]
mod ocr_disabled_tests {
    /// Placeholder test for builds without OCR support.
    ///
    /// There is nothing to assert at runtime: the test passing means the crate
    /// compiled successfully without OCR features. The former `assert!(true, ..)`
    /// could never fail and was dropped.
    #[test]
    fn test_ocr_feature_disabled() {
        println!("OCR features are disabled - this is expected when 'ocr-tesseract' feature is not enabled");
    }
}