gaze_document/extract/mod.rs
1//! Input extraction backends.
2//!
3//! Each submodule turns a specific input format (PDF, future: Word, HTML)
4//! into a PNG image ready for OCR.
5
6use crate::DocumentError;
7
8#[cfg(feature = "pdf-input")]
9#[cfg_attr(docsrs, doc(cfg(feature = "pdf-input")))]
10pub mod pdf;
11
/// The category of input detected for a given file path.
///
/// Image variants are consumed by the OCR backend directly, whereas PDF
/// input is rasterized to PNG first.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum InputKind {
    /// A PNG image; handed to the OCR backend as-is.
    Png,
    /// A JPEG image; handed to the OCR backend as-is.
    Jpeg,
    /// A PDF document; rasterized to PNG ahead of OCR (v0.0.x handles a
    /// single page only).
    Pdf,
}
23
24impl InputKind {
25 /// Detects [`InputKind`] from a file path's extension.
26 ///
27 /// Returns [`DocumentError::UnsupportedInput`] when the extension is
28 /// missing or not in the supported set.
29 pub fn detect(path: &std::path::Path) -> Result<Self, DocumentError> {
30 let ext = path
31 .extension()
32 .and_then(|s| s.to_str())
33 .map(|s| s.to_ascii_lowercase());
34 match ext.as_deref() {
35 Some("png") => Ok(Self::Png),
36 Some("jpg" | "jpeg") => Ok(Self::Jpeg),
37 Some("pdf") => Ok(Self::Pdf),
38 _ => Err(DocumentError::UnsupportedInput {
39 path: path.to_path_buf(),
40 reason: "extension must be one of: png, jpg, jpeg, pdf",
41 }),
42 }
43 }
44
45 /// File extension used when writing a temp copy of this input kind.
46 pub fn extension(&self) -> &'static str {
47 match self {
48 Self::Png => "png",
49 Self::Jpeg => "jpg",
50 Self::Pdf => "pdf",
51 }
52 }
53}