use crate::errors::{Error, ExtractResult};
use crate::tika::jni_utils::{
jni_call_method, jni_jobject_to_string, jni_new_string_as_jvalue,
jni_tika_metadata_to_rust_metadata,
};
use crate::tika::vm;
use crate::{Metadata, OfficeParserConfig, PdfParserConfig, TesseractOcrConfig, DEFAULT_BUF_SIZE};
use bytemuck::cast_slice_mut;
use jni::objects::{GlobalRef, JByteArray, JObject, JValue};
use jni::sys::jsize;
use jni::JNIEnv;
/// Rust-side handle to a Java stream object that is read through JNI.
///
/// NOTE(review): this type derives `Clone` and also has a `Drop` impl that
/// calls `close` on the wrapped Java object. Clones presumably share the same
/// underlying Java objects (confirm against `GlobalRef` semantics), so dropping
/// one clone may close the stream for the others.
#[derive(Clone)]
pub struct JReaderInputStream {
    /// Global reference to the Java object whose `read`/`close` methods are invoked.
    internal: GlobalRef,
    /// Global reference to the Java byte array passed to the Java `read` call.
    buffer: GlobalRef,
    /// Length, in bytes, that `buffer` was allocated with.
    capacity: jsize,
}
impl JReaderInputStream {
pub(crate) fn new<'local>(
env: &mut JNIEnv<'local>,
obj: JObject<'local>,
) -> ExtractResult<Self> {
let capacity = DEFAULT_BUF_SIZE as jsize;
let jbyte_array = env.new_byte_array(capacity)?;
Ok(Self {
internal: env.new_global_ref(obj)?,
buffer: env.new_global_ref(jbyte_array)?,
capacity,
})
}
pub(crate) fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
let mut env = vm().attach_current_thread().map_err(Error::JniError)?;
let length = buf.len() as jsize;
if length > self.capacity {
let jbyte_array = env
.new_byte_array(length as jsize)
.map_err(|_e| Error::JniEnvCall("Failed to create byte array"))?;
self.buffer = env
.new_global_ref(jbyte_array)
.map_err(|_e| Error::JniEnvCall("Failed to create global reference"))?;
self.capacity = length;
}
let call_result = jni_call_method(
&mut env,
&self.internal,
"read",
"([BII)I",
&[
JValue::Object(&self.buffer),
JValue::Int(0),
JValue::Int(length),
],
);
let num_read_bytes = call_result?.i().map_err(Error::JniError)?;
let obj_local = env
.new_local_ref(&self.buffer)
.map_err(|_e| Error::JniEnvCall("Failed to create local ref"))?;
let buf_of_i8: &mut [i8] = cast_slice_mut(buf);
env.get_byte_array_region(JByteArray::from(obj_local), 0, buf_of_i8)
.map_err(|_e| Error::JniEnvCall("Failed to get byte array region"))?;
if num_read_bytes == -1 {
Ok(0)
} else {
Ok(num_read_bytes as usize)
}
}
}
impl Drop for JReaderInputStream {
    /// Best-effort cleanup: invokes the Java `close()` method on the wrapped
    /// object. A failure to attach to the JVM or an error from the call
    /// itself is silently ignored, since `drop` cannot report errors.
    fn drop(&mut self) {
        match vm().attach_current_thread() {
            Ok(mut attached) => {
                // The outcome of `close` is intentionally discarded.
                let _ = jni_call_method(&mut attached, &self.internal, "close", "()V", &[]);
            }
            Err(_) => {}
        }
    }
}
/// Successful outcome of a Java call that returns extracted text together
/// with its metadata.
pub struct JStringResult {
    /// Text obtained from the Java result's `getContent()`.
    pub content: String,
    /// Metadata converted from the Java result's `getMetadata()`.
    pub metadata: Metadata,
}
impl<'local> JStringResult {
    /// Converts the Java result object `obj` into a [`JStringResult`].
    ///
    /// The Java object is first asked whether it represents an error
    /// (`isError()`); if so, its status byte and error message are turned
    /// into the matching [`Error`] variant. Otherwise the content string and
    /// the Tika metadata are fetched and converted.
    ///
    /// Every Java method is invoked through `jni_call_method`, so all calls
    /// in this constructor share one error-handling path.
    ///
    /// # Errors
    /// * `Error::IoError` when the Java status is `1`.
    /// * `Error::ParseError` when the Java status is `2`.
    /// * `Error::Unknown` for any other error status.
    /// * Any error raised by the underlying JNI calls or conversions.
    pub(crate) fn new(env: &mut JNIEnv<'local>, obj: JObject<'local>) -> ExtractResult<Self> {
        let is_error = jni_call_method(env, &obj, "isError", "()Z", &[])?.z()?;

        if is_error {
            let status = jni_call_method(env, &obj, "getStatus", "()B", &[])?.b()?;
            let msg_obj =
                jni_call_method(env, &obj, "getErrorMessage", "()Ljava/lang/String;", &[])?.l()?;
            let msg = jni_jobject_to_string(env, msg_obj)?;

            // Map the Java-side status byte onto the Rust error variants.
            return match status {
                1 => Err(Error::IoError(msg)),
                2 => Err(Error::ParseError(msg)),
                _ => Err(Error::Unknown(msg)),
            };
        }

        let content_obj =
            jni_call_method(env, &obj, "getContent", "()Ljava/lang/String;", &[])?.l()?;
        let content = jni_jobject_to_string(env, content_obj)?;

        let tika_metadata_obj: JObject = jni_call_method(
            env,
            &obj,
            "getMetadata",
            "()Lorg/apache/tika/metadata/Metadata;",
            &[],
        )?
        .l()?;
        let metadata = jni_tika_metadata_to_rust_metadata(env, tika_metadata_obj)?;

        Ok(Self { content, metadata })
    }
}
/// Successful outcome of a Java call that returns a reader object together
/// with its metadata.
pub struct JReaderResult<'local> {
    /// Local reference to the Java object returned by the result's `getReader()`.
    pub java_reader: JObject<'local>,
    /// Metadata converted from the Java result's `getMetadata()`.
    pub metadata: Metadata,
}
impl<'local> JReaderResult<'local> {
    /// Converts the Java result object `obj` into a [`JReaderResult`].
    ///
    /// The Java object is first asked whether it represents an error
    /// (`isError()`); if so, its status byte and error message are turned
    /// into the matching [`Error`] variant. Otherwise the reader object and
    /// the Tika metadata are fetched, and the metadata is converted.
    ///
    /// Every Java method is invoked through `jni_call_method`, so all calls
    /// in this constructor share one error-handling path.
    ///
    /// # Errors
    /// * `Error::IoError` when the Java status is `1`.
    /// * `Error::ParseError` when the Java status is `2`.
    /// * `Error::Unknown` for any other error status.
    /// * Any error raised by the underlying JNI calls or conversions.
    pub(crate) fn new(env: &mut JNIEnv<'local>, obj: JObject<'local>) -> ExtractResult<Self> {
        let is_error = jni_call_method(env, &obj, "isError", "()Z", &[])?.z()?;

        if is_error {
            let status = jni_call_method(env, &obj, "getStatus", "()B", &[])?.b()?;
            let msg_obj =
                jni_call_method(env, &obj, "getErrorMessage", "()Ljava/lang/String;", &[])?.l()?;
            let msg = jni_jobject_to_string(env, msg_obj)?;

            // Map the Java-side status byte onto the Rust error variants.
            return match status {
                1 => Err(Error::IoError(msg)),
                2 => Err(Error::ParseError(msg)),
                _ => Err(Error::Unknown(msg)),
            };
        }

        let reader_obj = jni_call_method(
            env,
            &obj,
            "getReader",
            "()Lorg/apache/commons/io/input/ReaderInputStream;",
            &[],
        )?
        .l()?;

        let tika_metadata_obj: JObject = jni_call_method(
            env,
            &obj,
            "getMetadata",
            "()Lorg/apache/tika/metadata/Metadata;",
            &[],
        )?
        .l()?;
        let metadata = jni_tika_metadata_to_rust_metadata(env, tika_metadata_obj)?;

        Ok(Self {
            java_reader: reader_obj,
            metadata,
        })
    }
}
/// Wrapper around a Java `org.apache.tika.parser.pdf.PDFParserConfig` instance.
pub(crate) struct JPDFParserConfig<'local> {
    /// Local reference to the configured Java object.
    pub(crate) internal: JObject<'local>,
}
impl<'local> JPDFParserConfig<'local> {
    /// Instantiates a Java `PDFParserConfig` through its no-argument
    /// constructor and copies every option from `config` onto it by calling
    /// the corresponding Java setters.
    ///
    /// # Errors
    /// Fails if the class cannot be found, the object cannot be created, or
    /// any setter call fails.
    pub(crate) fn new(env: &mut JNIEnv<'local>, config: &PdfParserConfig) -> ExtractResult<Self> {
        let class = env.find_class("org/apache/tika/parser/pdf/PDFParserConfig")?;
        let obj = env.new_object(&class, "()V", &[])?;

        // Boolean options, applied in declaration order via `(Z)V` setters.
        let flag_setters = [
            ("setExtractInlineImages", config.extract_inline_images),
            (
                "setExtractUniqueInlineImagesOnly",
                config.extract_unique_inline_images_only,
            ),
            ("setExtractMarkedContent", config.extract_marked_content),
            ("setExtractAnnotationText", config.extract_annotation_text),
        ];
        for &(setter, enabled) in flag_setters.iter() {
            jni_call_method(env, &obj, setter, "(Z)V", &[JValue::from(enabled)])?;
        }

        // The OCR strategy is handed to Java as its string representation.
        let strategy_name = config.ocr_strategy.to_string();
        let ocr_str_val = jni_new_string_as_jvalue(env, &strategy_name)?;
        jni_call_method(
            env,
            &obj,
            "setOcrStrategy",
            "(Ljava/lang/String;)V",
            &[(&ocr_str_val).into()],
        )?;

        Ok(Self { internal: obj })
    }
}
/// Wrapper around a Java `org.apache.tika.parser.microsoft.OfficeParserConfig` instance.
pub(crate) struct JOfficeParserConfig<'local> {
    /// Local reference to the configured Java object.
    pub(crate) internal: JObject<'local>,
}
impl<'local> JOfficeParserConfig<'local> {
    /// Instantiates a Java `OfficeParserConfig` through its no-argument
    /// constructor and copies every option from `config` onto it by calling
    /// the corresponding Java setters.
    ///
    /// # Errors
    /// Fails if the class cannot be found, the object cannot be created, or
    /// any setter call fails.
    pub(crate) fn new(
        env: &mut JNIEnv<'local>,
        config: &OfficeParserConfig,
    ) -> ExtractResult<Self> {
        let class = env.find_class("org/apache/tika/parser/microsoft/OfficeParserConfig")?;
        let obj = env.new_object(&class, "()V", &[])?;

        // Every option is a boolean forwarded through a `(Z)V` setter; the
        // table keeps the same order in which the setters are invoked.
        let flag_setters = [
            ("setExtractMacros", config.extract_macros),
            ("setIncludeDeletedContent", config.include_deleted_content),
            (
                "setIncludeMoveFromContent",
                config.include_move_from_content,
            ),
            (
                "setIncludeShapeBasedContent",
                config.include_shape_based_content,
            ),
            (
                "setIncludeHeadersAndFooters",
                config.include_headers_and_footers,
            ),
            ("setIncludeMissingRows", config.include_missing_rows),
            ("setIncludeSlideNotes", config.include_slide_notes),
            (
                "setIncludeSlideMasterContent",
                config.include_slide_master_content,
            ),
            (
                "setConcatenatePhoneticRuns",
                config.concatenate_phonetic_runs,
            ),
            (
                "setExtractAllAlternativesFromMSG",
                config.extract_all_alternatives_from_msg,
            ),
        ];
        for &(setter, enabled) in flag_setters.iter() {
            jni_call_method(env, &obj, setter, "(Z)V", &[JValue::from(enabled)])?;
        }

        Ok(Self { internal: obj })
    }
}
/// Wrapper around a Java `org.apache.tika.parser.ocr.TesseractOCRConfig` instance.
pub(crate) struct JTesseractOcrConfig<'local> {
    /// Local reference to the configured Java object.
    pub(crate) internal: JObject<'local>,
}
impl<'local> JTesseractOcrConfig<'local> {
    /// Instantiates a Java `TesseractOCRConfig` through its no-argument
    /// constructor and copies every option from `config` onto it by calling
    /// the corresponding Java setters.
    ///
    /// # Errors
    /// Fails if the class cannot be found, the object cannot be created, or
    /// any setter call fails.
    pub(crate) fn new(
        env: &mut JNIEnv<'local>,
        config: &TesseractOcrConfig,
    ) -> ExtractResult<Self> {
        let class = env.find_class("org/apache/tika/parser/ocr/TesseractOCRConfig")?;
        let obj = env.new_object(&class, "()V", &[])?;

        // Integer options, forwarded through `(I)V` setters.
        let int_setters = [
            ("setDensity", config.density),
            ("setDepth", config.depth),
            ("setTimeoutSeconds", config.timeout_seconds),
        ];
        for &(setter, value) in int_setters.iter() {
            jni_call_method(env, &obj, setter, "(I)V", &[JValue::from(value)])?;
        }

        // Boolean options, forwarded through `(Z)V` setters.
        let flag_setters = [
            (
                "setEnableImagePreprocessing",
                config.enable_image_preprocessing,
            ),
            ("setApplyRotation", config.apply_rotation),
        ];
        for &(setter, enabled) in flag_setters.iter() {
            jni_call_method(env, &obj, setter, "(Z)V", &[JValue::from(enabled)])?;
        }

        // The language is passed as a Java string.
        let lang_string_val = jni_new_string_as_jvalue(env, &config.language)?;
        jni_call_method(
            env,
            &obj,
            "setLanguage",
            "(Ljava/lang/String;)V",
            &[(&lang_string_val).into()],
        )?;

        Ok(Self { internal: obj })
    }
}