Skip to main content

gaze_document/extract/
mod.rs

1//! Input extraction backends.
2//!
3//! Each submodule turns a specific input format (PDF, future: Word, HTML)
4//! into a PNG image ready for OCR.
5
6use crate::DocumentError;
7
8#[cfg(feature = "pdf-input")]
9#[cfg_attr(docsrs, doc(cfg(feature = "pdf-input")))]
10pub mod pdf;
11
/// The input format recognised for a given file path.
///
/// Image variants are handed to the OCR backend unchanged, whereas PDF
/// input is rasterized to PNG first.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum InputKind {
    /// A PNG image; forwarded straight to the OCR backend.
    Png,
    /// A JPEG image; forwarded straight to the OCR backend.
    Jpeg,
    /// A PDF document; rasterized to PNG before OCR (v0.0.x handles a single page only).
    Pdf,
}
23
24impl InputKind {
25    /// Detects [`InputKind`] from a file path's extension.
26    ///
27    /// Returns [`DocumentError::UnsupportedInput`] when the extension is
28    /// missing or not in the supported set.
29    pub fn detect(path: &std::path::Path) -> Result<Self, DocumentError> {
30        let ext = path
31            .extension()
32            .and_then(|s| s.to_str())
33            .map(|s| s.to_ascii_lowercase());
34        match ext.as_deref() {
35            Some("png") => Ok(Self::Png),
36            Some("jpg" | "jpeg") => Ok(Self::Jpeg),
37            Some("pdf") => Ok(Self::Pdf),
38            _ => Err(DocumentError::UnsupportedInput {
39                path: path.to_path_buf(),
40                reason: "extension must be one of: png, jpg, jpeg, pdf",
41            }),
42        }
43    }
44
45    /// File extension used when writing a temp copy of this input kind.
46    pub fn extension(&self) -> &'static str {
47        match self {
48            Self::Png => "png",
49            Self::Jpeg => "jpg",
50            Self::Pdf => "pdf",
51        }
52    }
53}