use std::sync::OnceLock;
use crate::errors::ExtractResult;
use crate::tika::jni_utils::*;
use crate::tika::wrappers::*;
use crate::{
CharSet, Metadata, OfficeParserConfig, PdfParserConfig, StreamReader, TesseractOcrConfig,
};
use jni::objects::JValue;
use jni::{AttachGuard, JavaVM};
/// Returns the process-wide [`JavaVM`] used by every call in this module.
///
/// The VM is produced by `create_vm_isolate` the first time this function is
/// called and cached in a function-local [`OnceLock`]; every later call hands
/// back a reference to that same instance.
pub(crate) fn vm() -> &'static JavaVM {
    // NOTE(review): presumably a GraalVM isolate, going by the helper's name —
    // confirm against `create_vm_isolate`.
    static SHARED_VM: OnceLock<JavaVM> = OnceLock::new();
    SHARED_VM.get_or_init(create_vm_isolate)
}
/// Attaches the calling thread to the shared VM returned by [`vm`] and hands
/// back the resulting attach guard.
///
/// # Errors
/// Propagates a failure of `attach_current_thread`, converted to the crate's
/// error type by `?`.
fn get_vm_attach_current_thread<'local>() -> ExtractResult<AttachGuard<'local>> {
    Ok(vm().attach_current_thread()?)
}
/// Shared driver behind the streaming `parse_*` entry points.
///
/// Converts the charset name and the three parser configs into their Java
/// counterparts, invokes the static method `method_name` (JNI descriptor
/// `signature`) on `ai/yobix/TikaNativeMain`, and wraps the returned object
/// into a [`StreamReader`] together with its [`Metadata`].
///
/// `data_source_val` is forwarded unchanged as the first Java argument and
/// `as_xml` as the trailing boolean argument.
///
/// # Errors
/// Returns as soon as any conversion, the Java call itself, or the unwrapping
/// of its result fails.
fn parse_to_stream(
    mut env: AttachGuard,
    data_source_val: JValue,
    char_set: &CharSet,
    pdf_conf: &PdfParserConfig,
    office_conf: &OfficeParserConfig,
    ocr_conf: &TesseractOcrConfig,
    as_xml: bool,
    method_name: &str,
    signature: &str,
) -> ExtractResult<(StreamReader, Metadata)> {
    // Java-side objects are created in the same order they are passed below.
    let java_charset = jni_new_string_as_jvalue(&mut env, &char_set.to_string())?;
    let java_pdf = JPDFParserConfig::new(&mut env, pdf_conf)?;
    let java_office = JOfficeParserConfig::new(&mut env, office_conf)?;
    let java_ocr = JTesseractOcrConfig::new(&mut env, ocr_conf)?;

    let java_args = [
        data_source_val,
        JValue::from(&java_charset),
        JValue::from(&java_pdf.internal),
        JValue::from(&java_office.internal),
        JValue::from(&java_ocr.internal),
        JValue::Bool(as_xml.into()),
    ];

    // The call yields a generic JNI value; `.l()` extracts it as an object.
    let returned_obj = jni_call_static_method(
        &mut env,
        "ai/yobix/TikaNativeMain",
        method_name,
        signature,
        &java_args,
    )?
    .l()?;

    let reader_result = JReaderResult::new(&mut env, returned_obj)?;
    let input_stream = JReaderInputStream::new(&mut env, reader_result.java_reader)?;
    Ok((
        StreamReader {
            inner: input_stream,
        },
        reader_result.metadata,
    ))
}
pub fn parse_file(
file_path: &str,
char_set: &CharSet,
pdf_conf: &PdfParserConfig,
office_conf: &OfficeParserConfig,
ocr_conf: &TesseractOcrConfig,
as_xml: bool
) -> ExtractResult<(StreamReader, Metadata)> {
let mut env = get_vm_attach_current_thread()?;
let file_path_val = jni_new_string_as_jvalue(&mut env, file_path)?;
parse_to_stream(
env,
(&file_path_val).into(),
char_set,
pdf_conf,
office_conf,
ocr_conf,
as_xml,
"parseFile",
"(Ljava/lang/String;\
Ljava/lang/String;\
Lorg/apache/tika/parser/pdf/PDFParserConfig;\
Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\
Z\
)Lai/yobix/ReaderResult;",
)
}
pub fn parse_bytes(
buffer: &[u8],
char_set: &CharSet,
pdf_conf: &PdfParserConfig,
office_conf: &OfficeParserConfig,
ocr_conf: &TesseractOcrConfig,
as_xml: bool,
) -> ExtractResult<(StreamReader, Metadata)> {
let mut env = get_vm_attach_current_thread()?;
let mut_ptr: *mut u8 = buffer.as_ptr() as *mut u8;
let byte_buffer = jni_new_direct_buffer(&mut env, mut_ptr, buffer.len())?;
parse_to_stream(
env,
(&byte_buffer).into(),
char_set,
pdf_conf,
office_conf,
ocr_conf,
as_xml,
"parseBytes",
"(Ljava/nio/ByteBuffer;\
Ljava/lang/String;\
Lorg/apache/tika/parser/pdf/PDFParserConfig;\
Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\
Z\
)Lai/yobix/ReaderResult;",
)
}
pub fn parse_url(
url: &str,
char_set: &CharSet,
pdf_conf: &PdfParserConfig,
office_conf: &OfficeParserConfig,
ocr_conf: &TesseractOcrConfig,
as_xml: bool,
) -> ExtractResult<(StreamReader, Metadata)> {
let mut env = get_vm_attach_current_thread()?;
let url_val = jni_new_string_as_jvalue(&mut env, url)?;
parse_to_stream(
env,
(&url_val).into(),
char_set,
pdf_conf,
office_conf,
ocr_conf,
as_xml,
"parseUrl",
"(Ljava/lang/String;\
Ljava/lang/String;\
Lorg/apache/tika/parser/pdf/PDFParserConfig;\
Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\
Z\
)Lai/yobix/ReaderResult;",
)
}
/// Shared driver behind the `parse_*_to_string` entry points.
///
/// Converts the three parser configs into their Java counterparts, invokes
/// the static method `method_name` (JNI descriptor `signature`) on
/// `ai/yobix/TikaNativeMain`, and unpacks the returned object into its text
/// content and [`Metadata`].
///
/// `data_source_val` is forwarded unchanged as the first Java argument,
/// `max_length` as a Java `int` in second position, and `as_xml` as the
/// trailing boolean argument.
/// NOTE(review): `max_length` presumably caps the size of the returned text —
/// confirm on the Java side.
///
/// # Errors
/// Returns as soon as any conversion, the Java call itself, or the unwrapping
/// of its result fails.
pub fn parse_to_string(
    mut env: AttachGuard,
    data_source_val: JValue,
    max_length: i32,
    pdf_conf: &PdfParserConfig,
    office_conf: &OfficeParserConfig,
    ocr_conf: &TesseractOcrConfig,
    as_xml: bool,
    method_name: &str,
    signature: &str,
) -> ExtractResult<(String, Metadata)> {
    // Java-side config objects are created in the same order they are passed.
    let java_pdf = JPDFParserConfig::new(&mut env, pdf_conf)?;
    let java_office = JOfficeParserConfig::new(&mut env, office_conf)?;
    let java_ocr = JTesseractOcrConfig::new(&mut env, ocr_conf)?;

    let java_args = [
        data_source_val,
        JValue::Int(max_length),
        JValue::from(&java_pdf.internal),
        JValue::from(&java_office.internal),
        JValue::from(&java_ocr.internal),
        JValue::Bool(as_xml.into()),
    ];

    // The call yields a generic JNI value; `.l()` extracts it as an object.
    let returned_obj = jni_call_static_method(
        &mut env,
        "ai/yobix/TikaNativeMain",
        method_name,
        signature,
        &java_args,
    )?
    .l()?;

    let string_result = JStringResult::new(&mut env, returned_obj)?;
    Ok((string_result.content, string_result.metadata))
}
pub fn parse_file_to_string(
file_path: &str,
max_length: i32,
pdf_conf: &PdfParserConfig,
office_conf: &OfficeParserConfig,
ocr_conf: &TesseractOcrConfig,
as_xml: bool,
) -> ExtractResult<(String, Metadata)> {
let mut env = get_vm_attach_current_thread()?;
let file_path_val = jni_new_string_as_jvalue(&mut env, file_path)?;
parse_to_string(
env,
(&file_path_val).into(),
max_length,
pdf_conf,
office_conf,
ocr_conf,
as_xml,
"parseFileToString",
"(Ljava/lang/String;\
I\
Lorg/apache/tika/parser/pdf/PDFParserConfig;\
Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\
Z\
)Lai/yobix/StringResult;",
)
}
pub fn parse_bytes_to_string(
buffer: &[u8],
max_length: i32,
pdf_conf: &PdfParserConfig,
office_conf: &OfficeParserConfig,
ocr_conf: &TesseractOcrConfig,
as_xml: bool,
) -> ExtractResult<(String, Metadata)> {
let mut env = get_vm_attach_current_thread()?;
let mut_ptr: *mut u8 = buffer.as_ptr() as *mut u8;
let byte_buffer = jni_new_direct_buffer(&mut env, mut_ptr, buffer.len())?;
parse_to_string(
env,
(&byte_buffer).into(),
max_length,
pdf_conf,
office_conf,
ocr_conf,
as_xml,
"parseBytesToString",
"(Ljava/nio/ByteBuffer;\
I\
Lorg/apache/tika/parser/pdf/PDFParserConfig;\
Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\
Z\
)Lai/yobix/StringResult;",
)
}
pub fn parse_url_to_string(
url: &str,
max_length: i32,
pdf_conf: &PdfParserConfig,
office_conf: &OfficeParserConfig,
ocr_conf: &TesseractOcrConfig,
as_xml: bool,
) -> ExtractResult<(String, Metadata)> {
let mut env = get_vm_attach_current_thread()?;
let url_val = jni_new_string_as_jvalue(&mut env, url)?;
parse_to_string(
env,
(&url_val).into(),
max_length,
pdf_conf,
office_conf,
ocr_conf,
as_xml,
"parseUrlToString",
"(Ljava/lang/String;\
I\
Lorg/apache/tika/parser/pdf/PDFParserConfig;\
Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\
Z\
)Lai/yobix/StringResult;",
)
}