use crate::errors::ExtractResult;
use crate::tika;
use crate::tika::JReaderInputStream;
use crate::{OfficeParserConfig, PdfParserConfig, TesseractOcrConfig};
use std::collections::HashMap;
use strum_macros::{Display, EnumString};
/// Metadata returned alongside extracted content by the `extract_*` methods:
/// each string key maps to a list of string values.
pub type Metadata = HashMap<String, Vec<String>>;
/// Character set selector stored on [`Extractor`] and forwarded to the parser by the
/// streaming `extract_file` / `extract_bytes` / `extract_url` methods.
///
/// The default variant is [`CharSet::UTF_8`]. `Display` and `EnumString` are derived
/// through `strum_macros`, giving string formatting and parsing of the variants.
#[derive(Debug, Clone, Default, Copy, PartialEq, Eq, Hash, Display, EnumString)]
// Variant names contain underscores, which the `non_camel_case_types` lint would flag.
// NOTE(review): presumably the names mirror the backend's charset names — confirm.
#[allow(non_camel_case_types)]
pub enum CharSet {
    #[default]
    UTF_8,
    US_ASCII,
    UTF_16BE,
}
/// Reader handed back by the streaming `extract_*` methods of [`Extractor`].
///
/// Wraps a `JReaderInputStream`; its [`std::io::Read`] implementation delegates every
/// read to that inner stream.
pub struct StreamReader {
    /// Underlying stream that reads are delegated to.
    pub(crate) inner: JReaderInputStream,
}
impl std::io::Read for StreamReader {
    /// Fills `buf` from the wrapped stream.
    ///
    /// The call is handed straight to the inner `JReaderInputStream`; its result
    /// (byte count or I/O error) is returned unchanged.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        let Self { inner } = self;
        inner.read(buf)
    }
}
/// Configurable entry point for content extraction.
///
/// Holds the settings that the `extract_*` methods forward to the `tika` parsing
/// functions. Values are changed through the consuming `set_*` builder methods.
#[derive(Debug, Clone)]
pub struct Extractor {
    /// Length limit forwarded to the parser by the `*_to_string` methods only.
    extract_string_max_length: i32,
    /// Character set forwarded to the parser by the streaming methods only
    /// (`extract_file`, `extract_bytes`, `extract_url`).
    encoding: CharSet,
    /// PDF parser settings, forwarded on every extraction call.
    pdf_config: PdfParserConfig,
    /// Office parser settings, forwarded on every extraction call.
    office_config: OfficeParserConfig,
    /// Tesseract OCR settings, forwarded on every extraction call.
    ocr_config: TesseractOcrConfig,
    /// Flag forwarded on every extraction call.
    /// NOTE(review): presumably selects XML instead of plain-text output — confirm in `tika`.
    xml_output: bool,
}
impl Default for Extractor {
fn default() -> Self {
Self {
extract_string_max_length: 500_000, encoding: CharSet::UTF_8,
pdf_config: PdfParserConfig::default(),
office_config: OfficeParserConfig::default(),
ocr_config: TesseractOcrConfig::default(),
xml_output: false,
}
}
}
impl Extractor {
    /// Creates an extractor with the default configuration (see the `Default` impl).
    pub fn new() -> Self {
        Default::default()
    }

    /// Returns the extractor with its string length limit replaced by `max_length`.
    ///
    /// The limit is only forwarded by the `*_to_string` methods.
    pub fn set_extract_string_max_length(self, max_length: i32) -> Self {
        Self {
            extract_string_max_length: max_length,
            ..self
        }
    }

    /// Returns the extractor with its character set replaced by `encoding`.
    ///
    /// The character set is only forwarded by the streaming methods
    /// (`extract_file`, `extract_bytes`, `extract_url`).
    pub fn set_encoding(self, encoding: CharSet) -> Self {
        Self { encoding, ..self }
    }

    /// Returns the extractor with its PDF parser settings replaced by `config`.
    pub fn set_pdf_config(self, config: PdfParserConfig) -> Self {
        Self {
            pdf_config: config,
            ..self
        }
    }

    /// Returns the extractor with its Office parser settings replaced by `config`.
    pub fn set_office_config(self, config: OfficeParserConfig) -> Self {
        Self {
            office_config: config,
            ..self
        }
    }

    /// Returns the extractor with its Tesseract OCR settings replaced by `config`.
    pub fn set_ocr_config(self, config: TesseractOcrConfig) -> Self {
        Self {
            ocr_config: config,
            ..self
        }
    }

    /// Returns the extractor with its `xml_output` flag replaced.
    pub fn set_xml_output(self, xml_output: bool) -> Self {
        Self { xml_output, ..self }
    }

    /// Extracts the file at `file_path`, returning a [`StreamReader`] over the content
    /// together with its [`Metadata`].
    ///
    /// # Errors
    ///
    /// Returns whatever error `tika::parse_file` reports.
    pub fn extract_file(&self, file_path: &str) -> ExtractResult<(StreamReader, Metadata)> {
        let Self {
            encoding,
            pdf_config,
            office_config,
            ocr_config,
            xml_output,
            ..
        } = self;
        tika::parse_file(
            file_path,
            encoding,
            pdf_config,
            office_config,
            ocr_config,
            *xml_output,
        )
    }

    /// Extracts the in-memory document `buffer`, returning a [`StreamReader`] over the
    /// content together with its [`Metadata`].
    ///
    /// # Errors
    ///
    /// Returns whatever error `tika::parse_bytes` reports.
    pub fn extract_bytes(&self, buffer: &[u8]) -> ExtractResult<(StreamReader, Metadata)> {
        let Self {
            encoding,
            pdf_config,
            office_config,
            ocr_config,
            xml_output,
            ..
        } = self;
        tika::parse_bytes(
            buffer,
            encoding,
            pdf_config,
            office_config,
            ocr_config,
            *xml_output,
        )
    }

    /// Extracts the resource at `url`, returning a [`StreamReader`] over the content
    /// together with its [`Metadata`].
    ///
    /// # Errors
    ///
    /// Returns whatever error `tika::parse_url` reports.
    pub fn extract_url(&self, url: &str) -> ExtractResult<(StreamReader, Metadata)> {
        let Self {
            encoding,
            pdf_config,
            office_config,
            ocr_config,
            xml_output,
            ..
        } = self;
        tika::parse_url(
            url,
            encoding,
            pdf_config,
            office_config,
            ocr_config,
            *xml_output,
        )
    }

    /// Extracts the file at `file_path` into a `String`, returned with its [`Metadata`].
    ///
    /// The configured string length limit is forwarded to the parser; the configured
    /// encoding is not.
    ///
    /// # Errors
    ///
    /// Returns whatever error `tika::parse_file_to_string` reports.
    pub fn extract_file_to_string(&self, file_path: &str) -> ExtractResult<(String, Metadata)> {
        let Self {
            extract_string_max_length,
            pdf_config,
            office_config,
            ocr_config,
            xml_output,
            ..
        } = self;
        tika::parse_file_to_string(
            file_path,
            *extract_string_max_length,
            pdf_config,
            office_config,
            ocr_config,
            *xml_output,
        )
    }

    /// Extracts the in-memory document `buffer` into a `String`, returned with its
    /// [`Metadata`].
    ///
    /// The configured string length limit is forwarded to the parser; the configured
    /// encoding is not.
    ///
    /// # Errors
    ///
    /// Returns whatever error `tika::parse_bytes_to_string` reports.
    pub fn extract_bytes_to_string(&self, buffer: &[u8]) -> ExtractResult<(String, Metadata)> {
        let Self {
            extract_string_max_length,
            pdf_config,
            office_config,
            ocr_config,
            xml_output,
            ..
        } = self;
        tika::parse_bytes_to_string(
            buffer,
            *extract_string_max_length,
            pdf_config,
            office_config,
            ocr_config,
            *xml_output,
        )
    }

    /// Extracts the resource at `url` into a `String`, returned with its [`Metadata`].
    ///
    /// The configured string length limit is forwarded to the parser; the configured
    /// encoding is not.
    ///
    /// # Errors
    ///
    /// Returns whatever error `tika::parse_url_to_string` reports.
    pub fn extract_url_to_string(&self, url: &str) -> ExtractResult<(String, Metadata)> {
        let Self {
            extract_string_max_length,
            pdf_config,
            office_config,
            ocr_config,
            xml_output,
            ..
        } = self;
        tika::parse_url_to_string(
            url,
            *extract_string_max_length,
            pdf_config,
            office_config,
            ocr_config,
            *xml_output,
        )
    }
}
#[cfg(test)]
mod tests {
    use super::StreamReader;
    use crate::Extractor;
    use std::fs;
    use std::io::{self, BufReader, Read};

    /// File extracted by the file- and bytes-based tests; its raw contents are the
    /// expected extraction result.
    const TEST_FILE: &str = "README.md";
    /// URL fetched by the URL test (this test needs network access).
    const TEST_URL: &str = "https://www.google.com/";

    /// Reads `TEST_FILE` directly from disk as the reference text.
    fn expected_content() -> String {
        fs::read_to_string(TEST_FILE).unwrap()
    }

    /// Drains `stream` to its end and decodes the bytes as UTF-8.
    fn read_content_from_stream(stream: StreamReader) -> String {
        let mut reader = BufReader::new(stream);
        let mut buffer = Vec::new();
        reader.read_to_end(&mut buffer).unwrap();
        String::from_utf8(buffer).unwrap()
    }

    /// Loads the whole file at `path` into memory.
    fn read_file_as_bytes(path: &str) -> io::Result<Vec<u8>> {
        fs::read(path)
    }

    #[test]
    fn extract_file_to_string_test() {
        let expected_content = expected_content();

        let extractor = Extractor::new();
        let (content, metadata) = extractor.extract_file_to_string(TEST_FILE).unwrap();

        assert_eq!(content.trim(), expected_content.trim());
        assert!(
            !metadata.is_empty(),
            "Metadata should contain at least one entry"
        );
    }

    #[test]
    fn extract_file_test() {
        let expected_content = expected_content();

        let extractor = Extractor::new();
        let (reader, metadata) = extractor.extract_file(TEST_FILE).unwrap();
        let content = read_content_from_stream(reader);

        assert_eq!(content.trim(), expected_content.trim());
        assert!(
            !metadata.is_empty(),
            "Metadata should contain at least one entry"
        );
    }

    #[test]
    fn extract_bytes_test() {
        let expected_content = expected_content();
        let file_bytes = read_file_as_bytes(TEST_FILE).unwrap();

        let extractor = Extractor::new();
        let (reader, metadata) = extractor.extract_bytes(&file_bytes).unwrap();
        let content = read_content_from_stream(reader);

        assert_eq!(content.trim(), expected_content.trim());
        assert!(
            !metadata.is_empty(),
            "Metadata should contain at least one entry"
        );
    }

    #[test]
    fn extract_url_test() {
        let extractor = Extractor::new();
        let (reader, metadata) = extractor.extract_url(TEST_URL).unwrap();
        let content = read_content_from_stream(reader);

        assert!(content.contains("Google"));
        assert!(
            !metadata.is_empty(),
            "Metadata should contain at least one entry"
        );
    }

    #[test]
    fn extract_file_to_xml_test() {
        let extractor = Extractor::new().set_xml_output(true);
        let (content, metadata) = extractor.extract_file_to_string(TEST_FILE).unwrap();

        // The message previously duplicated the metadata one, which made a failure here
        // point at the wrong value.
        assert!(
            !content.is_empty(),
            "Extracted content should not be empty"
        );
        assert!(
            !metadata.is_empty(),
            "Metadata should contain at least one entry"
        );
    }
}