dongler-core 0.3.9

Rust-native PDF and document extraction core for Markdown, LaTeX, and JSON output.
Documentation
use std::fs;
use std::io::Read;
use std::path::Path;

use flate2::read::GzDecoder;

use crate::error::Result;
use crate::format::InputFormat;

/// An input document held in memory: its text, and (when it was read from
/// disk) the bytes and path it came from.
///
/// The constructors in this file fill the fields as follows: `from_text`
/// leaves `bytes` and `path` as `None`; every path-based constructor sets
/// both to `Some`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Source {
    /// Text view of the document. For binary loads this is a lossy UTF-8
    /// rendering of the file bytes; for `.gz` loads it is the decompressed text.
    pub content: String,
    /// Bytes associated with the source, or `None` for in-memory text.
    /// For `.gz` loads these are the compressed bytes as stored on disk.
    pub bytes: Option<Vec<u8>>,
    /// Format label, either supplied by the caller or taken from
    /// `InputFormat::as_str()`.
    pub format: String,
    /// `Path::display()` rendering of the file the source was read from, if any.
    pub path: Option<String>,
}

impl Source {
    /// Wraps an in-memory string as a text source.
    ///
    /// The result has no backing bytes and no path, and its format label is
    /// the one reported by `InputFormat::Text`.
    pub fn from_text(text: impl Into<String>) -> Self {
        let content: String = text.into();
        Self {
            content,
            bytes: None,
            format: InputFormat::Text.as_str().to_owned(),
            path: None,
        }
    }

    /// Reads the file at `path` as UTF-8 text and labels it with `format`.
    ///
    /// `bytes` is populated with a copy of the text's UTF-8 encoding.
    ///
    /// # Errors
    ///
    /// Propagates the error from `fs::read_to_string` (missing file, I/O
    /// failure, or contents that are not valid UTF-8).
    pub fn from_path(path: impl AsRef<Path>, format: impl Into<String>) -> Result<Self> {
        let file_path = path.as_ref();
        let text = fs::read_to_string(file_path)?;
        let raw = Vec::from(text.as_bytes());

        Ok(Self {
            content: text,
            bytes: Some(raw),
            format: format.into(),
            path: Some(file_path.display().to_string()),
        })
    }

    /// Reads a text file, transparently decompressing it when the path has a
    /// `gz` extension (see `is_gzip_path`).
    ///
    /// For gzip input, `content` holds the decompressed text while `bytes`
    /// keeps the compressed bytes exactly as read from disk. Any other path
    /// is handed to [`Source::from_path`] unchanged.
    ///
    /// # Errors
    ///
    /// Propagates errors from reading the file and from decoding the gzip
    /// stream into a UTF-8 string.
    pub fn from_text_or_gzip_path(
        path: impl AsRef<Path>,
        format: impl Into<String>,
    ) -> Result<Self> {
        let file_path = path.as_ref();

        if is_gzip_path(file_path) {
            let compressed = fs::read(file_path)?;
            let mut text = String::new();
            // NOTE(review): per the flate2 docs `GzDecoder` stops after the
            // first gzip member; confirm multi-member archives are not expected.
            GzDecoder::new(&compressed[..]).read_to_string(&mut text)?;

            Ok(Self {
                content: text,
                bytes: Some(compressed),
                format: format.into(),
                path: Some(file_path.display().to_string()),
            })
        } else {
            Self::from_path(file_path, format)
        }
    }

    /// Loads `path` as a binary source labelled with the `InputFormat::Pdf`
    /// format string.
    ///
    /// # Errors
    ///
    /// Same as [`Source::from_binary_path`].
    pub fn from_pdf_path(path: impl AsRef<Path>) -> Result<Self> {
        Self::from_binary_path(path, InputFormat::Pdf.as_str())
    }

    /// Reads the raw bytes of `path` and labels them with `format`.
    ///
    /// `content` is a lossy UTF-8 rendering of those bytes (invalid sequences
    /// become U+FFFD), so it is always populated even for non-text files.
    ///
    /// # Errors
    ///
    /// Propagates the error from `fs::read`.
    pub fn from_binary_path(path: impl AsRef<Path>, format: impl Into<String>) -> Result<Self> {
        let file_path = path.as_ref();
        let raw = fs::read(file_path)?;
        let text = String::from_utf8_lossy(&raw).into_owned();

        Ok(Self {
            content: text,
            bytes: Some(raw),
            format: format.into(),
            path: Some(file_path.display().to_string()),
        })
    }
}

/// Strategy for turning a filesystem path into a [`Source`].
pub trait SourceLoader {
    /// Loads the file at `path`, returning the resulting [`Source`] or the
    /// error produced while reading it.
    fn load(&self, path: &Path) -> Result<Source>;
}

/// Loader for plain-text files, including gzip-compressed ones.
#[derive(Debug, Default, Clone, Copy)]
pub struct TextSourceLoader;

impl SourceLoader for TextSourceLoader {
    /// Reads `path` as text (decompressing `.gz` paths) and labels the result
    /// with the `InputFormat::Text` format string.
    fn load(&self, path: &Path) -> Result<Source> {
        let format = InputFormat::Text;
        Source::from_text_or_gzip_path(path, format.as_str())
    }
}

/// Loader for PDF files.
#[derive(Debug, Default, Clone, Copy)]
pub struct PdfSourceLoader;

impl SourceLoader for PdfSourceLoader {
    /// Delegates to [`Source::from_pdf_path`], which reads the file as raw
    /// bytes and labels it with the `InputFormat::Pdf` format string.
    fn load(&self, path: &Path) -> Result<Source> {
        Source::from_pdf_path(path)
    }
}

/// Loader for image files.
#[derive(Debug, Default, Clone, Copy)]
pub struct ImageSourceLoader;

impl SourceLoader for ImageSourceLoader {
    /// Reads `path` as raw bytes and labels the result with the
    /// `InputFormat::Image` format string.
    fn load(&self, path: &Path) -> Result<Source> {
        let format = InputFormat::Image;
        Source::from_binary_path(path, format.as_str())
    }
}

/// Loader whose reading strategy is chosen at runtime from an [`InputFormat`].
#[derive(Debug, Clone, Copy)]
pub struct FormatSourceLoader {
    /// Format that decides how `load` reads the file and how the resulting
    /// source is labelled.
    format: InputFormat,
}

impl FormatSourceLoader {
    /// Creates a loader bound to `format`.
    pub fn new(format: InputFormat) -> Self {
        Self { format }
    }
}

impl SourceLoader for FormatSourceLoader {
    /// Loads `path` using the strategy that matches the configured format.
    ///
    /// Textual formats go through [`Source::from_text_or_gzip_path`], so
    /// `.gz` paths are decompressed; every other format is read as raw bytes
    /// via [`Source::from_binary_path`]. In both cases the source is labelled
    /// with the configured format's string form.
    fn load(&self, path: &Path) -> Result<Source> {
        let label = self.format.as_str();

        // The match stays exhaustive (no wildcard) so that adding a new
        // `InputFormat` variant forces a decision here.
        match self.format {
            InputFormat::Text
            | InputFormat::Html
            | InputFormat::Email
            | InputFormat::Json
            | InputFormat::Csv
            | InputFormat::Xml => Source::from_text_or_gzip_path(path, label),

            // Modern binary containers followed by the legacy formats.
            InputFormat::Pdf
            | InputFormat::Image
            | InputFormat::Word
            | InputFormat::Excel
            | InputFormat::Presentation
            | InputFormat::OpenDocument
            | InputFormat::Archive
            | InputFormat::LegacyWord
            | InputFormat::LegacyExcel
            | InputFormat::LegacyPresentation
            | InputFormat::LegacyEmail => Source::from_binary_path(path, label),
        }
    }
}

/// Reports whether `path` has a `gz` extension, compared ASCII
/// case-insensitively.
///
/// Paths with no extension, or whose extension is not valid UTF-8, are not
/// treated as gzip. Only the name is inspected; the file is never opened.
fn is_gzip_path(path: &Path) -> bool {
    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext.eq_ignore_ascii_case("gz"),
        None => false,
    }
}