#[cfg(any(feature = "otel", not(feature = "office")))]
use crate::KreuzbergError;
use crate::Result;
use crate::core::config::ExtractionConfig;
use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
use crate::types::ExtractionResult;
use std::path::Path;
use super::helpers::get_extractor;
/// Reduces `path` to its final component, returned as an owned `String`.
///
/// Falls back to the literal `"unknown"` when the path has no file name
/// (`Path::file_name` returns `None`) or when that name is not valid UTF-8
/// (`OsStr::to_str` returns `None`).
///
/// `extract_file` uses the result as the value of the `extraction.filename`
/// span field.
#[cfg(feature = "otel")]
pub(super) fn sanitize_path(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(file_name) => file_name.to_owned(),
        None => String::from("unknown"),
    }
}
/// Writes details of `error` onto the current tracing span.
///
/// Three fields are recorded, in this order:
/// - `otel.status_code`: always the literal `"ERROR"`
/// - `error.type`: the `Debug` rendering of `error`
/// - `error.message`: the `Display` rendering of `error`
///
/// NOTE(review): `tracing::Span::record` only stores values for fields the
/// span declared when it was created, so each caller's span needs to declare
/// these three names — confirm at every call site.
#[cfg(feature = "otel")]
pub(in crate::core::extractor) fn record_error(error: &KreuzbergError) {
    // `Span::record` hands back `&Span`, which lets the three writes be chained.
    tracing::Span::current()
        .record("otel.status_code", "ERROR")
        .record("error.type", format!("{:?}", error))
        .record("error.message", error.to_string());
}
/// Extracts content from the file at `path`.
///
/// The function proceeds in this order:
/// 1. `io::validate_file_exists` checks the path.
/// 2. `mime::detect_or_validate` resolves the MIME type from the path and the
///    optional `mime_type` argument.
/// 3. When the crate is built *without* the `office` feature, the legacy Word
///    and legacy PowerPoint MIME types are rejected up front.
/// 4. `extract_file_with_extractor` runs the extraction for the resolved type.
///
/// # Errors
///
/// Propagates any error returned by the steps above. Without the `office`
/// feature, returns `KreuzbergError::UnsupportedFormat` for the legacy Word and
/// legacy PowerPoint MIME types.
///
/// # Telemetry
///
/// With the `otel` feature the call is wrapped in a tracing span. `config` and
/// `path` are skipped as span arguments; the file name alone is recorded as
/// `extraction.filename`. On failure `record_error` fills in
/// `otel.status_code`, `error.type` and `error.message`. Those three fields are
/// declared (empty) in the span below because `Span::record` ignores fields
/// that were not declared when the span was created.
#[cfg_attr(feature = "otel", tracing::instrument(
    skip(config, path),
    fields(
        extraction.filename = tracing::field::Empty,
        otel.status_code = tracing::field::Empty,
        error.type = tracing::field::Empty,
        error.message = tracing::field::Empty,
    )
))]
pub async fn extract_file(
    path: impl AsRef<Path>,
    mime_type: Option<&str>,
    config: &ExtractionConfig,
) -> Result<ExtractionResult> {
    use crate::core::{io, mime};

    let path = path.as_ref();

    #[cfg(feature = "otel")]
    {
        let span = tracing::Span::current();
        span.record("extraction.filename", sanitize_path(path));
    }

    // The work runs inside an inner async block so that every early exit
    // (`?` or `return`) lands in `result` and can be reported on the span
    // before being handed back to the caller.
    let result = async {
        io::validate_file_exists(path)?;
        let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;

        #[cfg(not(feature = "office"))]
        match detected_mime.as_str() {
            LEGACY_WORD_MIME_TYPE => {
                return Err(KreuzbergError::UnsupportedFormat(
                    "Legacy Word extraction requires the `office` feature".to_string(),
                ));
            }
            LEGACY_POWERPOINT_MIME_TYPE => {
                return Err(KreuzbergError::UnsupportedFormat(
                    "Legacy PowerPoint extraction requires the `office` feature".to_string(),
                ));
            }
            _ => {}
        }

        // With `office` enabled the constants are otherwise unused in this
        // function; touch them so the imports are still referenced.
        #[cfg(feature = "office")]
        {
            let _ = LEGACY_WORD_MIME_TYPE;
            let _ = LEGACY_POWERPOINT_MIME_TYPE;
        }

        extract_file_with_extractor(path, &detected_mime, config).await
    }
    .await;

    #[cfg(feature = "otel")]
    if let Err(ref e) = result {
        record_error(e);
    }

    result
}
/// Extracts the file at `path` with the extractor selected for `mime_type`,
/// then passes the extraction result through the pipeline.
///
/// Calls, in order: `crate::extractors::ensure_initialized`, `get_extractor`,
/// the extractor's `extract_file`, and `crate::core::pipeline::run_pipeline`.
///
/// # Errors
///
/// Returns the first error produced by any of those calls.
pub(in crate::core::extractor) async fn extract_file_with_extractor(
    path: &Path,
    mime_type: &str,
    config: &ExtractionConfig,
) -> Result<ExtractionResult> {
    crate::extractors::ensure_initialized()?;

    let extractor = get_extractor(mime_type)?;
    let extracted = extractor.extract_file(path, mime_type, config).await?;
    let processed = crate::core::pipeline::run_pipeline(extracted, config).await?;

    Ok(processed)
}
/// Extracts the in-memory `content` with the extractor selected for
/// `mime_type`, then passes the extraction result through the pipeline.
///
/// Calls, in order: `crate::extractors::ensure_initialized`, `get_extractor`,
/// the extractor's `extract_bytes`, and `crate::core::pipeline::run_pipeline`.
///
/// # Errors
///
/// Returns the first error produced by any of those calls.
pub(in crate::core::extractor) async fn extract_bytes_with_extractor(
    content: &[u8],
    mime_type: &str,
    config: &ExtractionConfig,
) -> Result<ExtractionResult> {
    crate::extractors::ensure_initialized()?;

    let extractor = get_extractor(mime_type)?;
    let extracted = extractor.extract_bytes(content, mime_type, config).await?;
    let processed = crate::core::pipeline::run_pipeline(extracted, config).await?;

    Ok(processed)
}