#[cfg(not(feature = "office"))]
use crate::KreuzbergError;
use crate::Result;
use crate::core::config::ExtractionConfig;
use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
use crate::types::ExtractionResult;
use super::file::extract_bytes_with_extractor;
/// Extracts content from an in-memory document.
///
/// The MIME type is resolved first: when `mime_type` is
/// `"application/octet-stream"` it is detected from `content` via
/// `mime::detect_mime_type_from_bytes`; any other value is checked with
/// `mime::validate_mime_type`. The bytes, the resolved MIME type and `config`
/// are then handed to `extract_bytes_with_extractor`.
///
/// # Feature-dependent behaviour
///
/// * Without `office`: legacy Word and legacy PowerPoint MIME types are
///   rejected up front instead of being passed to an extractor.
/// * With `tokio-runtime`: if `config.extraction_timeout_secs` is set, the
///   whole extraction is wrapped in `tokio::time::timeout`. On expiry,
///   `config.cancel_token` (if any) is cancelled before the error is returned.
/// * Without `tokio-runtime`: `config.extraction_timeout_secs` is read but
///   has no effect.
/// * With `otel`: a tracing span is opened for the call and any returned error
///   is recorded on the current span.
///
/// # Errors
///
/// * `KreuzbergError::Validation` when `config.force_ocr` is set while
///   `config.effective_disable_ocr()` is also true.
/// * `KreuzbergError::UnsupportedFormat` for legacy Word / PowerPoint input
///   when the `office` feature is disabled.
/// * `KreuzbergError::Timeout` when the configured timeout elapses
///   (`tokio-runtime` only).
/// * Any error propagated from MIME detection/validation or from
///   `extract_bytes_with_extractor`.
#[cfg_attr(feature = "otel", tracing::instrument(
    skip(config, content),
    fields(
        { crate::telemetry::conventions::OPERATION } = crate::telemetry::conventions::operations::EXTRACT_BYTES,
        { crate::telemetry::conventions::DOCUMENT_MIME_TYPE } = mime_type,
        { crate::telemetry::conventions::DOCUMENT_SIZE_BYTES } = content.len(),
        { crate::telemetry::conventions::OTEL_STATUS_CODE } = tracing::field::Empty,
        { crate::telemetry::conventions::ERROR_TYPE } = tracing::field::Empty,
        { crate::telemetry::conventions::ERROR_MESSAGE } = tracing::field::Empty,
    )
))]
pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
    use crate::core::mime;

    // Built lazily so that the validation, MIME resolution and extraction all
    // run under the optional timeout applied below.
    let extraction_future = async {
        // Contradictory OCR settings are rejected before any work is done.
        if config.force_ocr && config.effective_disable_ocr() {
            return Err(crate::KreuzbergError::Validation {
                message: "force_ocr and disable_ocr cannot both be true".to_string(),
                source: None,
            });
        }

        // The generic binary MIME type carries no information, so sniff the
        // bytes instead; anything else only needs validating.
        let validated_mime = if mime_type == "application/octet-stream" {
            mime::detect_mime_type_from_bytes(content)?
        } else {
            mime::validate_mime_type(mime_type)?
        };

        // Without the `office` feature, fail with an actionable message for
        // the legacy Office formats rather than dispatching them further.
        #[cfg(not(feature = "office"))]
        match validated_mime.as_str() {
            LEGACY_WORD_MIME_TYPE => {
                return Err(KreuzbergError::UnsupportedFormat(
                    "Legacy Word extraction requires the `office` feature".to_string(),
                ));
            }
            LEGACY_POWERPOINT_MIME_TYPE => {
                return Err(KreuzbergError::UnsupportedFormat(
                    "Legacy PowerPoint extraction requires the `office` feature".to_string(),
                ));
            }
            _ => {}
        }

        // Touch the constants so the unconditional import does not trigger an
        // unused-import warning when the `office` feature is enabled.
        #[cfg(feature = "office")]
        {
            let _ = LEGACY_WORD_MIME_TYPE;
            let _ = LEGACY_POWERPOINT_MIME_TYPE;
        }

        extract_bytes_with_extractor(content, &validated_mime, config).await
    };

    #[cfg(feature = "tokio-runtime")]
    let result = if let Some(secs) = config.extraction_timeout_secs {
        let start = std::time::Instant::now();
        match tokio::time::timeout(std::time::Duration::from_secs(secs), extraction_future).await {
            Ok(inner) => inner,
            Err(_elapsed) => {
                // Signal cancellation to whoever holds the token before
                // reporting the timeout.
                if let Some(ref token) = config.cancel_token {
                    token.cancel();
                }

                // Whole milliseconds, assembled from the seconds and
                // sub-second parts so no narrowing `as` cast is needed;
                // saturates instead of overflowing.
                let elapsed = start.elapsed();
                let elapsed_ms = elapsed
                    .as_secs()
                    .saturating_mul(1000)
                    .saturating_add(u64::from(elapsed.subsec_millis()));

                Err(crate::KreuzbergError::Timeout {
                    elapsed_ms,
                    // Saturate rather than panic (debug) or wrap (release)
                    // for very large configured limits.
                    limit_ms: secs.saturating_mul(1000),
                })
            }
        }
    } else {
        extraction_future.await
    };

    // No timer is available without the tokio runtime: the timeout setting is
    // read only to keep the field "used" and the extraction runs unbounded.
    #[cfg(not(feature = "tokio-runtime"))]
    let result = {
        let _ = config.extraction_timeout_secs;
        extraction_future.await
    };

    #[cfg(feature = "otel")]
    if let Err(ref e) = result {
        crate::telemetry::spans::record_error_on_current_span(e);
    }

    result
}