use std::fmt;
use std::path::Path;
pub mod docx;
pub mod html;
pub mod pdf;
pub mod text;
/// A loaded document: its text, descriptive metadata, and a list of elements.
///
/// This is the value produced by [`DocumentLoader`] implementations.
/// NOTE(review): how `content` relates to the texts held in `elements`
/// (duplicate, concatenation, or independent) is decided by the individual
/// loaders and is not visible from this definition — confirm per loader.
#[derive(Debug, Clone)]
pub struct Document {
    /// Text content of the document.
    pub content: String,
    /// Descriptive information about the document (name, size, type, pages).
    pub metadata: DocumentMetadata,
    /// Individual pieces of the document; see [`DocumentElement`].
    pub elements: Vec<DocumentElement>,
}
/// Descriptive information attached to a [`Document`].
///
/// Every field except `file_type` is optional, so a loader can leave out
/// whatever it cannot determine.
#[derive(Debug, Clone)]
pub struct DocumentMetadata {
    /// File name of the source, if one is known.
    pub filename: Option<String>,
    /// Size of the source, if known (presumably in bytes — TODO confirm with the loaders).
    pub file_size: Option<u64>,
    /// Format of the source document.
    pub file_type: FileType,
    /// Number of pages, if the loader provides one.
    pub page_count: Option<u32>,
}
/// Document formats distinguished by this module.
///
/// Values are produced by the helpers in [`utils`], either from a path's
/// extension or from the leading bytes of the data.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum FileType {
    /// PDF document (`.pdf`).
    Pdf,
    /// Word document (`.docx`).
    Docx,
    /// Plain text (`.txt`, `.text`).
    Text,
    /// HTML document (`.html`, `.htm`).
    Html,
    /// Rich Text Format (`.rtf`).
    ///
    /// NOTE(review): unlike the other formats, no `rtf` submodule is declared
    /// next to `docx`, `html`, `pdf` and `text` in this file — confirm whether
    /// a loader exists for this variant.
    Rtf,
    /// Anything that could not be recognised.
    Unknown,
}
/// One piece of a [`Document`], as stored in `Document::elements`.
#[derive(Debug, Clone)]
pub struct DocumentElement {
    /// Kind of element, as a free-form string.
    ///
    /// NOTE(review): the set of values in use is chosen by the loaders and is
    /// not defined here.
    pub element_type: String,
    /// Text carried by this element.
    pub text: String,
    /// Optional extra information about the element; its format is not
    /// specified by this definition.
    pub metadata: Option<String>,
}
/// Errors reported by document loaders; the error type of this module's [`Result`].
///
/// Each variant is rendered by the `Display` impl with a fixed prefix followed
/// by its payload.
#[derive(Debug)]
pub enum LoaderError {
    /// An underlying I/O failure. Created automatically from
    /// [`std::io::Error`] via `From`, and exposed through `Error::source`.
    IoError(std::io::Error),
    /// A format problem, described by the message (displayed as `Format error: …`).
    FormatError(String),
    /// A format that is not supported; the payload names it
    /// (displayed as `Unsupported format: …`).
    UnsupportedFormat(String),
    /// A parsing failure, described by the message (displayed as `Parse error: …`).
    ParseError(String),
}
impl fmt::Display for LoaderError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
LoaderError::IoError(err) => write!(f, "IO error: {err}"),
LoaderError::FormatError(msg) => write!(f, "Format error: {msg}"),
LoaderError::UnsupportedFormat(format) => write!(f, "Unsupported format: {format}"),
LoaderError::ParseError(msg) => write!(f, "Parse error: {msg}"),
}
}
}
impl std::error::Error for LoaderError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
LoaderError::IoError(err) => Some(err),
_ => None,
}
}
}
impl From<std::io::Error> for LoaderError {
fn from(error: std::io::Error) -> Self {
LoaderError::IoError(error)
}
}
/// Result alias used throughout the loaders: the error type is always [`LoaderError`].
///
/// Within this module the alias takes the place of the prelude's two-parameter
/// `Result`, which is why the definition spells out `std::result::Result`.
pub type Result<T> = std::result::Result<T, LoaderError>;
/// Common interface for format-specific loaders that produce a [`Document`].
///
/// Both methods are required; there are no default implementations.
pub trait DocumentLoader {
    /// Builds a [`Document`] from the file at `path`.
    ///
    /// # Errors
    ///
    /// Returns a [`LoaderError`] when loading fails; which variant is used is
    /// up to the implementation.
    fn load_from_path(&self, path: &Path) -> Result<Document>;
    /// Builds a [`Document`] from in-memory `data`.
    ///
    /// `filename` is an optional name for the data (presumably recorded in
    /// `DocumentMetadata::filename` — TODO confirm with the implementations).
    ///
    /// # Errors
    ///
    /// Returns a [`LoaderError`] when loading fails; which variant is used is
    /// up to the implementation.
    fn load_from_bytes(&self, data: &[u8], filename: Option<&str>) -> Result<Document>;
}
/// Helpers for guessing a document's [`FileType`] from its name or its contents.
pub mod utils {
    use super::FileType;
    use std::path::Path;

    /// Number of leading bytes examined by the plain-text heuristic.
    const TEXT_SAMPLE_LEN: usize = 1000;

    /// UTF-8 byte-order mark that some editors write at the start of a file.
    const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];

    /// Signature of a ZIP local file header (`PK\x03\x04`).
    const ZIP_MAGIC: &[u8] = &[0x50, 0x4B, 0x03, 0x04];

    /// Guesses the file type from the extension of `path`.
    ///
    /// The comparison ignores ASCII case (`report.PDF` is a PDF). Paths without
    /// an extension, or with an unrecognised one, yield [`FileType::Unknown`].
    /// The file itself is never opened.
    pub fn detect_file_type<P: AsRef<Path>>(path: P) -> FileType {
        // A non-UTF-8 extension cannot equal any of the ASCII names below, so it
        // is treated the same as a missing one.
        let extension = path
            .as_ref()
            .extension()
            .and_then(|ext| ext.to_str())
            .map(str::to_ascii_lowercase);

        match extension.as_deref() {
            Some("pdf") => FileType::Pdf,
            Some("docx") => FileType::Docx,
            Some("txt" | "text") => FileType::Text,
            Some("html" | "htm") => FileType::Html,
            Some("rtf") => FileType::Rtf,
            _ => FileType::Unknown,
        }
    }

    /// Guesses the file type from the leading bytes of `data`.
    ///
    /// Checks, in order:
    /// 1. fewer than four bytes: [`FileType::Unknown`];
    /// 2. `%PDF` prefix: [`FileType::Pdf`];
    /// 3. ZIP signature: [`FileType::Docx`];
    /// 4. `<!DOCTYPE html` or `<html`, in any ASCII case, optionally preceded by
    ///    a UTF-8 BOM and ASCII whitespace: [`FileType::Html`];
    /// 5. `{\rtf` prefix: [`FileType::Rtf`];
    /// 6. the first 1000 bytes are valid UTF-8 (which includes plain ASCII):
    ///    [`FileType::Text`];
    /// 7. otherwise [`FileType::Unknown`].
    ///
    /// NOTE(review): step 3 matches every ZIP container (xlsx, pptx, jar, plain
    /// zip archives), not only Word documents; telling them apart would require
    /// reading the archive's entries.
    pub fn detect_from_bytes(data: &[u8]) -> FileType {
        if data.len() < 4 {
            return FileType::Unknown;
        }
        if data.starts_with(b"%PDF") {
            return FileType::Pdf;
        }
        if data.starts_with(ZIP_MAGIC) {
            return FileType::Docx;
        }

        // HTML tag and doctype names are case-insensitive, and real-world pages
        // often start with a BOM or blank lines before the first tag.
        let markup = skip_bom_and_whitespace(data);
        if starts_with_ignore_ascii_case(markup, b"<!DOCTYPE html")
            || starts_with_ignore_ascii_case(markup, b"<html")
        {
            return FileType::Html;
        }

        if data.starts_with(b"{\\rtf") {
            return FileType::Rtf;
        }
        if looks_like_text(data) {
            return FileType::Text;
        }
        FileType::Unknown
    }

    /// Returns `data` without a leading UTF-8 BOM and leading ASCII whitespace.
    fn skip_bom_and_whitespace(data: &[u8]) -> &[u8] {
        let body = data.strip_prefix(UTF8_BOM).unwrap_or(data);
        let start = body
            .iter()
            .position(|byte| !byte.is_ascii_whitespace())
            .unwrap_or(body.len());
        &body[start..]
    }

    /// Reports whether `data` begins with `prefix`, ignoring ASCII case.
    fn starts_with_ignore_ascii_case(data: &[u8], prefix: &[u8]) -> bool {
        data.get(..prefix.len())
            .map_or(false, |head| head.eq_ignore_ascii_case(prefix))
    }

    /// Reports whether the first `TEXT_SAMPLE_LEN` bytes of `data` are valid UTF-8.
    fn looks_like_text(data: &[u8]) -> bool {
        let sample = &data[..data.len().min(TEXT_SAMPLE_LEN)];
        match std::str::from_utf8(sample) {
            Ok(_) => true,
            // `error_len() == None` means the sample merely ends in the middle of
            // a multi-byte character. That is harmless when the sampling window
            // cut the input short, but marks truncated data otherwise.
            Err(err) => err.error_len().is_none() && sample.len() < data.len(),
        }
    }
}