// eth_id/parser/mod.rs

1mod pdf;
2mod image;
3mod json;
4mod text;
5
6use crate::error::{Result, EthIdError};
7use std::path::Path;
8use serde::{Deserialize, Serialize};
9use zeroize::Zeroize;
10
11#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct ParsedDocument {
13    fields: std::collections::HashMap<String, String>,
14    raw_text: String,
15    doc_type: DocumentType,
16    #[serde(skip)]
17    sensitive_data: Vec<String>,
18}
19
20impl Drop for ParsedDocument {
21    fn drop(&mut self) {
22        for value in self.fields.values_mut() {
23            value.zeroize();
24        }
25        self.raw_text.zeroize();
26        for data in &mut self.sensitive_data {
27            data.zeroize();
28        }
29    }
30}
31
32impl ParsedDocument {
33    pub fn new(doc_type: DocumentType) -> Self {
34        Self {
35            fields: std::collections::HashMap::new(),
36            raw_text: String::new(),
37            doc_type,
38            sensitive_data: Vec::new(),
39        }
40    }
41    
42    pub fn add_field(&mut self, key: String, value: String) {
43        self.fields.insert(key, value);
44    }
45    
46    pub fn set_raw_text(&mut self, text: String) {
47        self.raw_text = text;
48    }
49    
50    pub fn get_field(&self, key: &str) -> Option<&String> {
51        self.fields.get(key)
52    }
53    
54    pub fn raw_text(&self) -> &str {
55        &self.raw_text
56    }
57    
58    pub fn doc_type(&self) -> &DocumentType {
59        &self.doc_type
60    }
61    
62    pub fn size(&self) -> usize {
63        self.raw_text.len()
64    }
65    
66    pub fn fields(&self) -> &std::collections::HashMap<String, String> {
67        &self.fields
68    }
69}
70
71#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
72pub enum DocumentType {
73    Pdf,
74    Image,
75    Json,
76    Text,
77}
78
79pub struct DocumentParser;
80
81impl DocumentParser {
82    pub fn new() -> Self {
83        Self
84    }
85    
86    pub async fn parse_file(&self, path: &Path) -> Result<ParsedDocument> {
87        if !path.exists() {
88            return Err(EthIdError::DocumentParsing(
89                format!("File not found: {:?}", path)
90            ));
91        }
92        
93        let extension = path.extension()
94            .and_then(|e| e.to_str())
95            .ok_or_else(|| EthIdError::InvalidFormat("No file extension".to_string()))?;
96        
97        match extension.to_lowercase().as_str() {
98            "pdf" => pdf::parse_pdf(path).await,
99            "jpg" | "jpeg" | "png" | "webp" | "bmp" => image::parse_image(path).await,
100            "json" => json::parse_json(path).await,
101            "txt" => text::parse_text(path).await,
102            _ => Err(EthIdError::InvalidFormat(
103                format!("Unsupported file type: {}", extension)
104            )),
105        }
106    }
107}