1mod pdf;
2mod image;
3mod json;
4mod text;
5
6use crate::error::{Result, EthIdError};
7use std::path::Path;
8use serde::{Deserialize, Serialize};
9use zeroize::Zeroize;
10
11#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct ParsedDocument {
13 fields: std::collections::HashMap<String, String>,
14 raw_text: String,
15 doc_type: DocumentType,
16 #[serde(skip)]
17 sensitive_data: Vec<String>,
18}
19
20impl Drop for ParsedDocument {
21 fn drop(&mut self) {
22 for value in self.fields.values_mut() {
23 value.zeroize();
24 }
25 self.raw_text.zeroize();
26 for data in &mut self.sensitive_data {
27 data.zeroize();
28 }
29 }
30}
31
32impl ParsedDocument {
33 pub fn new(doc_type: DocumentType) -> Self {
34 Self {
35 fields: std::collections::HashMap::new(),
36 raw_text: String::new(),
37 doc_type,
38 sensitive_data: Vec::new(),
39 }
40 }
41
42 pub fn add_field(&mut self, key: String, value: String) {
43 self.fields.insert(key, value);
44 }
45
46 pub fn set_raw_text(&mut self, text: String) {
47 self.raw_text = text;
48 }
49
50 pub fn get_field(&self, key: &str) -> Option<&String> {
51 self.fields.get(key)
52 }
53
54 pub fn raw_text(&self) -> &str {
55 &self.raw_text
56 }
57
58 pub fn doc_type(&self) -> &DocumentType {
59 &self.doc_type
60 }
61
62 pub fn size(&self) -> usize {
63 self.raw_text.len()
64 }
65
66 pub fn fields(&self) -> &std::collections::HashMap<String, String> {
67 &self.fields
68 }
69}
70
71#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
72pub enum DocumentType {
73 Pdf,
74 Image,
75 Json,
76 Text,
77}
78
79pub struct DocumentParser;
80
81impl DocumentParser {
82 pub fn new() -> Self {
83 Self
84 }
85
86 pub async fn parse_file(&self, path: &Path) -> Result<ParsedDocument> {
87 if !path.exists() {
88 return Err(EthIdError::DocumentParsing(
89 format!("File not found: {:?}", path)
90 ));
91 }
92
93 let extension = path.extension()
94 .and_then(|e| e.to_str())
95 .ok_or_else(|| EthIdError::InvalidFormat("No file extension".to_string()))?;
96
97 match extension.to_lowercase().as_str() {
98 "pdf" => pdf::parse_pdf(path).await,
99 "jpg" | "jpeg" | "png" | "webp" | "bmp" => image::parse_image(path).await,
100 "json" => json::parse_json(path).await,
101 "txt" => text::parse_text(path).await,
102 _ => Err(EthIdError::InvalidFormat(
103 format!("Unsupported file type: {}", extension)
104 )),
105 }
106 }
107}