use std::path::{Path, PathBuf};
use std::sync::Arc;
use bytes::Bytes;
use modkit_macros::domain_model;
use tracing::{debug, info, instrument, warn};
use crate::domain::error::DomainError;
use crate::domain::ir::ParsedDocument;
use crate::domain::parser::FileParserBackend;
/// Known file types as `(extension, mime_type)` pairs.
///
/// `FileParserService::extension_from_content_type` scans this table front to back and
/// returns the extension of the first entry whose MIME type matches. Entry order is
/// therefore significant for MIME types that appear more than once (`text/html`,
/// `image/jpeg`): the earlier entry is the one reported.
const EXTENSION_MIME_MAPPINGS: &[(&str, &str)] = &[
("pdf", "application/pdf"),
// Listed before "htm" so that `text/html` resolves to "html".
("html", "text/html"),
("htm", "text/html"),
(
"docx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
),
("png", "image/png"),
// Listed before "jpeg" so that `image/jpeg` resolves to "jpg".
("jpg", "image/jpeg"),
("jpeg", "image/jpeg"),
("webp", "image/webp"),
("gif", "image/gif"),
];
/// Front door for document parsing: picks one of the registered backends by file
/// extension and delegates the actual parsing to it.
#[domain_model]
#[derive(Clone)]
pub struct FileParserService {
/// Registered backends. Lookup walks this list in order and uses the first backend
/// that lists the requested extension.
parsers: Vec<Arc<dyn FileParserBackend>>,
/// Limits applied before a backend is invoked (upload size, local base directory).
config: ServiceConfig,
}
/// Limits enforced by [`FileParserService`].
#[domain_model]
#[derive(Debug, Clone)]
pub struct ServiceConfig {
/// Largest payload, in bytes, that `parse_bytes` accepts; anything larger is rejected
/// as an invalid request.
pub max_file_size_bytes: usize,
/// Directory that `parse_local` confines reads to: the canonicalized request path must
/// start with this path.
// NOTE(review): the comparison uses this value as-is, so it presumably needs to be
// an absolute, canonical path itself — confirm where the config is built.
pub allowed_local_base_dir: PathBuf,
}
/// Summary of what the registered parser backends can handle.
#[domain_model]
#[derive(Debug, Clone)]
pub struct FileParserInfo {
/// Backend id mapped to the list of file extensions that backend reports as supported.
pub supported_extensions: std::collections::HashMap<String, Vec<String>>,
}
impl FileParserService {
/// Builds a service from the given backends and limits.
///
/// `parsers` is stored in the order given; that order decides which backend wins when
/// more than one supports the same extension.
#[must_use]
pub fn new(parsers: Vec<Arc<dyn FileParserBackend>>, config: ServiceConfig) -> Self {
    Self { config, parsers }
}
/// Reports, per registered backend, the file extensions it supports.
///
/// The result is keyed by each backend's `id()`. If two backends report the same id,
/// the one registered later replaces the earlier entry.
#[instrument(skip(self))]
pub fn info(&self) -> FileParserInfo {
    debug!("Getting parser info");

    let supported_extensions: std::collections::HashMap<String, Vec<String>> = self
        .parsers
        .iter()
        .map(|backend| {
            let backend_id = backend.id();
            let handled: Vec<String> = backend
                .supported_extensions()
                .iter()
                .map(ToString::to_string)
                .collect();
            (backend_id.to_owned(), handled)
        })
        .collect();

    FileParserInfo {
        supported_extensions,
    }
}
/// Parses the file at `path` using the backend registered for its extension.
///
/// The path is vetted in two stages before a backend sees it:
/// 1. any `..` component is rejected outright (see `validate_local_path`);
/// 2. the path is canonicalized (which resolves symlinks) and must then lie under
///    `config.allowed_local_base_dir`.
///
/// The extension is taken from the canonical path, so a symlink is dispatched by the
/// type of its target rather than by the link's own name.
///
/// # Errors
/// - path-traversal error if `path` contains `..` or resolves outside the base directory;
/// - file-not-found if canonicalization reports `NotFound`, I/O error for any other
///   canonicalization failure;
/// - unsupported-file-type if the canonical path has no usable extension (missing,
///   empty as in `report.`, or not valid UTF-8);
/// - no-parser-available if no backend lists the extension;
/// - whatever error the selected backend returns.
#[instrument(skip(self), fields(path = %path.display()))]
pub async fn parse_local(&self, path: &Path) -> Result<ParsedDocument, DomainError> {
    info!("Parsing file from local path");

    Self::validate_local_path(path)?;

    // NOTE(review): `Path::canonicalize` is a blocking filesystem call made from an
    // async fn; consider an async equivalent or `spawn_blocking` if this is hot.
    let canonical = path.canonicalize().map_err(|e| {
        if e.kind() == std::io::ErrorKind::NotFound {
            DomainError::file_not_found(path.display().to_string())
        } else {
            DomainError::io_error(format!(
                "Cannot canonicalize path '{}': {e}",
                path.display()
            ))
        }
    })?;

    // Component-wise prefix check against the configured base, used verbatim.
    // NOTE(review): this presumably relies on the base directory already being
    // canonical; a relative or symlinked base would reject every request — confirm.
    if !canonical.starts_with(&self.config.allowed_local_base_dir) {
        warn!(
            requested = %path.display(),
            canonical = %canonical.display(),
            base_dir = %self.config.allowed_local_base_dir.display(),
            "Path traversal blocked: canonical path outside allowed base directory"
        );
        return Err(DomainError::path_traversal_blocked(format!(
            "Access denied: '{}' is outside the allowed base directory",
            path.display()
        )));
    }

    let extension = canonical
        .extension()
        .and_then(|s| s.to_str())
        // `Path::extension` yields `Some("")` for a trailing dot ("report."); that is
        // "no extension", not a request for a parser registered under "".
        .filter(|ext| !ext.is_empty())
        .ok_or_else(|| DomainError::unsupported_file_type("no extension"))?;

    let parser = self
        .find_parser_by_extension(extension)
        .ok_or_else(|| DomainError::no_parser_available(extension))?;

    let document = parser.parse_local_path(&canonical).await.map_err(|e| {
        tracing::error!(?e, "FileParserService: parse_local failed");
        e
    })?;

    debug!("Successfully parsed file from local path");
    Ok(document)
}
/// Rejects any path that contains a `..` component.
///
/// This is a purely lexical check on the path as given; it does not touch the
/// filesystem. A rejection is logged at `warn` level.
///
/// # Errors
/// Returns a path-traversal error if at least one component is `..`.
fn validate_local_path(path: &Path) -> Result<(), DomainError> {
    let climbs_upward = path
        .components()
        .any(|part| part == std::path::Component::ParentDir);

    if !climbs_upward {
        return Ok(());
    }

    warn!(
        path = %path.display(),
        "Path traversal blocked: '..' component detected"
    );
    Err(DomainError::path_traversal_blocked(format!(
        "Access denied: path '{}' contains '..' traversal component",
        path.display()
    )))
}
/// Parses an in-memory document, choosing the backend from the filename or content type.
///
/// The file type is resolved in this order:
/// 1. the extension of `filename_hint`, if it has a non-empty, UTF-8 one;
/// 2. otherwise the extension mapped from `content_type` by
///    `extension_from_content_type`.
///
/// `filename_hint` and `content_type` are forwarded unchanged to the selected backend.
///
/// # Errors
/// - invalid-request if `bytes` is larger than `config.max_file_size_bytes`;
/// - unsupported-file-type if neither the filename nor the content type yields an
///   extension;
/// - no-parser-available if no backend lists the resolved extension;
/// - whatever error the selected backend returns.
#[instrument(
    skip(self, bytes),
    fields(filename_hint = ?filename_hint, content_type = ?content_type, size = bytes.len())
)]
pub async fn parse_bytes(
    &self,
    filename_hint: Option<&str>,
    content_type: Option<&str>,
    bytes: Bytes,
) -> Result<ParsedDocument, DomainError> {
    info!("Parsing uploaded file");

    if bytes.len() > self.config.max_file_size_bytes {
        return Err(DomainError::invalid_request(format!(
            "File size {} exceeds maximum of {} bytes",
            bytes.len(),
            self.config.max_file_size_bytes
        )));
    }

    let extension_from_name = filename_hint
        .and_then(|name| Path::new(name).extension())
        .and_then(|s| s.to_str())
        // `Path::extension` yields `Some("")` for a trailing dot ("upload."). Treat
        // that as "no extension" so the content type still gets a chance, instead of
        // failing the lookup for a parser registered under "".
        .filter(|ext| !ext.is_empty())
        .map(str::to_owned);

    let extension = match (extension_from_name, content_type) {
        // A usable filename extension always takes precedence over the content type.
        (Some(ext), _) => ext,
        (None, Some(ct)) => Self::extension_from_content_type(ct).ok_or_else(|| {
            DomainError::unsupported_file_type("no extension and unknown content-type")
        })?,
        (None, None) => {
            return Err(DomainError::unsupported_file_type(
                "no extension and no content-type",
            ));
        }
    };

    let parser = self
        .find_parser_by_extension(&extension)
        .ok_or_else(|| DomainError::no_parser_available(&extension))?;

    let document = parser
        .parse_bytes(filename_hint, content_type, bytes)
        .await
        .map_err(|e| {
            tracing::error!(?e, "FileParserService: parse_bytes failed");
            e
        })?;

    debug!("Successfully parsed uploaded file");
    Ok(document)
}
/// Maps a `Content-Type` value to a file extension.
///
/// `ct` is parsed as a MIME type and matched on its essence (`type/subtype`).
/// `application/xhtml+xml` is reported as `"html"`; every other type is looked up in
/// `EXTENSION_MIME_MAPPINGS`, where the first matching entry wins.
///
/// Returns `None` if `ct` does not parse as a MIME type or is not a known type.
#[must_use]
pub fn extension_from_content_type(ct: &str) -> Option<String> {
    let parsed = ct.parse::<mime::Mime>().ok()?;

    let extension = match parsed.essence_str() {
        "application/xhtml+xml" => "html",
        essence => EXTENSION_MIME_MAPPINGS
            .iter()
            .find(|&&(_, known_mime)| known_mime == essence)
            .map(|&(ext, _)| ext)?,
    };

    Some(extension.to_owned())
}
/// Returns the first registered backend that lists `ext` among its supported
/// extensions, or `None` if no backend does.
///
/// Both sides are lowercased before comparison, so the match is case-insensitive.
/// Backends are tried in registration order.
fn find_parser_by_extension(&self, ext: &str) -> Option<Arc<dyn FileParserBackend>> {
    let wanted = ext.to_lowercase();

    for backend in &self.parsers {
        let handles_it = backend
            .supported_extensions()
            .iter()
            .any(|candidate| candidate.to_lowercase() == wanted);
        if handles_it {
            return Some(Arc::clone(backend));
        }
    }

    None
}
}