Skip to main content

three_dcf_core/
document.rs

1use std::collections::HashSet;
2use std::convert::TryFrom;
3use std::fs::File;
4use std::io::{Read, Write};
5use std::path::Path;
6
7use indexmap::IndexMap;
8use prost::Message;
9use serde::{Deserialize, Serialize};
10
11use crate::error::Result;
12use crate::proto;
13
/// 32-byte hash that keys a payload in [`Document::dict`]; [`hash_payload`] produces one.
pub type CodeHash = [u8; 32];
15
/// File-level metadata carried by every [`Document`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Header {
    /// Format version number (the `Default` impl uses `1`).
    pub version: u32,
    /// Grid identifier string (the `Default` impl uses `"coarse"`).
    pub grid: String,
    /// Code-set identifier string (the `Default` impl uses `"HASH256"`).
    pub codeset: String,
}
22
23impl Default for Header {
24    fn default() -> Self {
25        Self {
26            version: 1,
27            grid: "coarse".to_string(),
28            codeset: "HASH256".to_string(),
29        }
30    }
31}
32
/// Kind of content a cell holds.
///
/// Serialized through serde with SCREAMING_SNAKE_CASE variant names (e.g. `"TEXT"`).
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
pub enum CellType {
    /// Text cell; also the fallback `Document::from_proto` uses for unknown type codes.
    Text,
    /// Table cell.
    Table,
    /// Figure cell.
    Figure,
    /// Footer cell.
    Footer,
    /// Header cell.
    Header,
}
42
43impl From<CellType> for proto::CellType {
44    fn from(value: CellType) -> Self {
45        match value {
46            CellType::Text => proto::CellType::Text,
47            CellType::Table => proto::CellType::Table,
48            CellType::Figure => proto::CellType::Figure,
49            CellType::Footer => proto::CellType::Footer,
50            CellType::Header => proto::CellType::Header,
51        }
52    }
53}
54
55impl From<proto::CellType> for CellType {
56    fn from(value: proto::CellType) -> Self {
57        match value {
58            proto::CellType::Text => CellType::Text,
59            proto::CellType::Table => CellType::Table,
60            proto::CellType::Figure => CellType::Figure,
61            proto::CellType::Footer => CellType::Footer,
62            proto::CellType::Header => CellType::Header,
63        }
64    }
65}
66
/// Dimensions of a single page.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct PageInfo {
    /// Page identifier; cells and numguards refer to a page through the same `z` value.
    pub z: u32,
    /// Page width (pixels, going by the field name).
    pub width_px: u32,
    /// Page height (pixels, going by the field name).
    pub height_px: u32,
}
73
/// One positioned cell of a document; its text lives in the document dictionary.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct CellRecord {
    /// Page the cell belongs to (matched against `PageInfo::z`).
    pub z: u32,
    /// Horizontal position of the cell origin (signed).
    pub x: i32,
    /// Vertical position of the cell origin (signed).
    pub y: i32,
    /// Cell width; added to `x` to obtain the far edge in `Document::cells_in_bbox`.
    pub w: u32,
    /// Cell height; added to `y` to obtain the far edge in `Document::cells_in_bbox`.
    pub h: u32,
    /// Dictionary key of the payload; written as a hex string by serde.
    #[serde(with = "codehash_serde")]
    pub code_id: CodeHash,
    /// Stored and round-tripped verbatim by this file.
    /// NOTE(review): presumably a run-length count — confirm against the encoder.
    pub rle: u32,
    /// Kind of content held by the cell.
    pub cell_type: CellType,
    /// Importance score; widened to `u32` (`importance_q`) in the protobuf form.
    pub importance: u8,
}
87
88impl CellRecord {
89    pub fn key(&self) -> (u32, i32, i32) {
90        (self.z, self.y, self.x)
91    }
92}
93
/// Expected digest for the cell at a given position, checked by
/// `Document::numguard_mismatches_with_units`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct NumGuard {
    /// Page of the guarded cell.
    pub z: u32,
    /// Horizontal position of the guarded cell (compared with the cell's `x` clamped at 0).
    pub x: u32,
    /// Vertical position of the guarded cell (compared with the cell's `y` clamped at 0).
    pub y: u32,
    /// Unit label; compared case-insensitively against an optional whitelist.
    pub units: String,
    /// Expected 20-byte digest, compared with the result of
    /// `crate::numguard::hash_digits_from_payload`; written as a hex string by serde.
    #[serde(with = "numhash_serde")]
    pub sha1: [u8; 20],
}
103
/// In-memory document: header, page list, positioned cells, the payload dictionary the cells
/// point into, and numeric guards.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct Document {
    /// File-level metadata.
    pub header: Header,
    /// Known pages, in insertion order.
    pub pages: Vec<PageInfo>,
    /// Cells in insertion order (not necessarily sorted; see `ordered_cells`).
    pub cells: Vec<CellRecord>,
    /// Payload text keyed by code hash; serialized as a list of `(hex, payload)` pairs.
    #[serde(with = "dict_serde")]
    pub dict: IndexMap<CodeHash, String>,
    /// Numeric guards attached to the document.
    pub numguards: Vec<NumGuard>,
}
113
114impl Document {
115    pub fn new(header: Header) -> Self {
116        Self {
117            header,
118            pages: Vec::new(),
119            cells: Vec::new(),
120            dict: IndexMap::new(),
121            numguards: Vec::new(),
122        }
123    }
124
    /// Appends `info` to the page list; no check is made for duplicate `z` values.
    pub fn add_page(&mut self, info: PageInfo) {
        self.pages.push(info);
    }
128
129    pub fn push_cell(&mut self, cell: CellRecord, payload: String) {
130        let code = cell.code_id;
131        self.cells.push(cell);
132        self.dict.entry(code).or_insert(payload);
133    }
134
    /// Appends `guard` to the list of numeric guards.
    pub fn add_numguard(&mut self, guard: NumGuard) {
        self.numguards.push(guard);
    }
138
139    pub fn payload_for(&self, code_id: &CodeHash) -> Option<&str> {
140        self.dict.get(code_id).map(|s| s.as_str())
141    }
142
143    pub fn ordered_cells(&self) -> Vec<CellRecord> {
144        let mut cells = self.cells.clone();
145        cells.sort_by_key(|c| (c.z, c.y, c.x));
146        cells
147    }
148
149    pub fn to_proto(&self) -> proto::Document {
150        let mut prev = (0i64, 0i64, 0i64);
151        let cells = self
152            .ordered_cells()
153            .into_iter()
154            .map(|cell| {
155                let dz = cell.z as i64 - prev.0;
156                let dx = cell.x as i64 - prev.1;
157                let dy = cell.y as i64 - prev.2;
158                prev = (cell.z as i64, cell.x as i64, cell.y as i64);
159                proto::Cell {
160                    dz: dz as i32,
161                    dx: dx as i32,
162                    dy: dy as i32,
163                    w: cell.w,
164                    h: cell.h,
165                    code_id: cell.code_id.to_vec().into(),
166                    rle: cell.rle,
167                    r#type: proto::CellType::from(cell.cell_type) as i32,
168                    importance_q: cell.importance as u32,
169                }
170            })
171            .collect();
172
173        let dict = self
174            .dict
175            .iter()
176            .map(|(code_id, payload)| proto::DictEntry {
177                code_id: code_id.to_vec().into(),
178                payload_utf8: payload.clone(),
179            })
180            .collect();
181
182        let numguards = self
183            .numguards
184            .iter()
185            .map(|guard| proto::NumGuard {
186                z: guard.z,
187                x: guard.x,
188                y: guard.y,
189                units: guard.units.clone(),
190                sha1: guard.sha1.to_vec().into(),
191            })
192            .collect();
193
194        proto::Document {
195            header: Some(proto::Header {
196                version: self.header.version,
197                grid: self.header.grid.clone(),
198                codeset: self.header.codeset.clone(),
199            }),
200            pages: self
201                .pages
202                .iter()
203                .map(|p| proto::PageInfo {
204                    z: p.z,
205                    width_px: p.width_px,
206                    height_px: p.height_px,
207                })
208                .collect(),
209            cells,
210            dict,
211            numguards,
212        }
213    }
214
215    pub fn from_proto(doc: proto::Document) -> Result<Self> {
216        let header = doc
217            .header
218            .map(|h| Header {
219                version: h.version,
220                grid: h.grid,
221                codeset: h.codeset,
222            })
223            .unwrap_or_default();
224
225        let pages = doc
226            .pages
227            .into_iter()
228            .map(|p| PageInfo {
229                z: p.z,
230                width_px: p.width_px,
231                height_px: p.height_px,
232            })
233            .collect();
234
235        let mut cells = Vec::new();
236        let mut prev = (0i64, 0i64, 0i64);
237        for cell in doc.cells {
238            prev.0 += cell.dz as i64;
239            prev.1 += cell.dx as i64;
240            prev.2 += cell.dy as i64;
241            let mut code_id = [0u8; 32];
242            code_id.copy_from_slice(&cell.code_id);
243            cells.push(CellRecord {
244                z: prev.0 as u32,
245                x: prev.1 as i32,
246                y: prev.2 as i32,
247                w: cell.w,
248                h: cell.h,
249                code_id,
250                rle: cell.rle,
251                cell_type: proto::CellType::try_from(cell.r#type)
252                    .map(CellType::from)
253                    .unwrap_or(CellType::Text),
254                importance: cell.importance_q as u8,
255            });
256        }
257
258        let mut dict = IndexMap::new();
259        for entry in doc.dict {
260            let mut code_id = [0u8; 32];
261            code_id.copy_from_slice(&entry.code_id);
262            dict.insert(code_id, entry.payload_utf8);
263        }
264
265        let numguards = doc
266            .numguards
267            .into_iter()
268            .map(|guard| {
269                let mut sha = [0u8; 20];
270                sha.copy_from_slice(&guard.sha1);
271                NumGuard {
272                    z: guard.z,
273                    x: guard.x,
274                    y: guard.y,
275                    units: guard.units,
276                    sha1: sha,
277                }
278            })
279            .collect();
280
281        Ok(Self {
282            header,
283            pages,
284            cells,
285            dict,
286            numguards,
287        })
288    }
289
290    pub fn to_bytes(&self) -> Result<Vec<u8>> {
291        let proto = self.to_proto();
292        let mut buf = Vec::with_capacity(proto.encoded_len());
293        proto.encode(&mut buf)?;
294        let mut encoder = zstd::stream::Encoder::new(Vec::new(), 3)?;
295        encoder.write_all(&buf)?;
296        let data = encoder.finish()?;
297        Ok(data)
298    }
299
300    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
301        let mut decoder = zstd::stream::Decoder::new(bytes)?;
302        let mut buf = Vec::new();
303        decoder.read_to_end(&mut buf)?;
304        let proto = proto::Document::decode(&*buf)?;
305        Self::from_proto(proto)
306    }
307
308    pub fn save_bin<P: AsRef<Path>>(&self, path: P) -> Result<()> {
309        let bytes = self.to_bytes()?;
310        let mut file = File::create(path)?;
311        file.write_all(&bytes)?;
312        Ok(())
313    }
314
315    pub fn load_bin<P: AsRef<Path>>(path: P) -> Result<Self> {
316        let mut file = File::open(path)?;
317        let mut buf = Vec::new();
318        file.read_to_end(&mut buf)?;
319        Self::from_bytes(&buf)
320    }
321
322    pub fn save_json<P: AsRef<Path>>(&self, path: P) -> Result<()> {
323        let mut file = File::create(path)?;
324        serde_json::to_writer_pretty(&mut file, self)?;
325        Ok(())
326    }
327
328    pub fn load_json<P: AsRef<Path>>(path: P) -> Result<Self> {
329        let file = File::open(path)?;
330        let doc: Document = serde_json::from_reader(file)?;
331        Ok(doc)
332    }
333
    /// Number of cells stored in the document.
    pub fn total_cells(&self) -> usize {
        self.cells.len()
    }
337
    /// Number of page entries stored in the document.
    pub fn total_pages(&self) -> usize {
        self.pages.len()
    }
341
342    pub fn ensure_dict_entry(&mut self, payload: &str) -> CodeHash {
343        let hash = hash_payload(payload);
344        self.dict.entry(hash).or_insert_with(|| payload.to_string());
345        hash
346    }
347
348    pub fn page_dims(&self, z: u32) -> Option<(u32, u32)> {
349        self.pages
350            .iter()
351            .find(|p| p.z == z)
352            .map(|p| (p.width_px, p.height_px))
353    }
354
    /// Iterates over the cells in storage (insertion) order, without sorting.
    pub fn iter_cells(&self) -> impl Iterator<Item = &CellRecord> {
        self.cells.iter()
    }
358
359    pub fn decode_to_text(&self) -> String {
360        let ordered = self.ordered_cells();
361        self.decode_cells_to_text(&ordered)
362    }
363
364    pub fn decode_page_to_text(&self, z: u32) -> String {
365        let mut page_cells: Vec<_> = self.cells.iter().filter(|c| c.z == z).cloned().collect();
366        page_cells.sort_by_key(|c| (c.y, c.x));
367        self.decode_cells_to_text(&page_cells)
368    }
369
370    pub fn decode_cells_to_text(&self, cells: &[CellRecord]) -> String {
371        let mut lines = Vec::with_capacity(cells.len());
372        for cell in cells {
373            if let Some(payload) = self.payload_for(&cell.code_id) {
374                lines.push(payload.to_string());
375            }
376        }
377        lines.join("\n")
378    }
379
380    pub fn cells_in_bbox(&self, z: u32, x0: i32, y0: i32, x1: i32, y1: i32) -> Vec<CellRecord> {
381        let (min_x, max_x) = if x0 <= x1 { (x0, x1) } else { (x1, x0) };
382        let (min_y, max_y) = if y0 <= y1 { (y0, y1) } else { (y1, y0) };
383        let mut matches: Vec<_> = self
384            .cells
385            .iter()
386            .filter(|cell| {
387                if cell.z != z {
388                    return false;
389                }
390                let cell_x1 = cell.x + cell.w as i32;
391                let cell_y1 = cell.y + cell.h as i32;
392                cell.x <= max_x && cell_x1 >= min_x && cell.y <= max_y && cell_y1 >= min_y
393            })
394            .cloned()
395            .collect();
396        matches.sort_by_key(|c| (c.y, c.x));
397        matches
398    }
399}
400
401pub fn hash_payload(payload: &str) -> CodeHash {
402    let mut hasher = blake3::Hasher::new();
403    hasher.update(payload.as_bytes());
404    let hash = hasher.finalize();
405    let mut bytes = [0u8; 32];
406    bytes.copy_from_slice(hash.as_bytes());
407    bytes
408}
409
410impl Document {
411    pub fn retain_dict_for_cells(&mut self) {
412        let used: HashSet<_> = self.cells.iter().map(|c| c.code_id).collect();
413        self.dict.retain(|code, _| used.contains(code));
414    }
415
    /// Checks every numguard without any unit whitelist; shorthand for
    /// `numguard_mismatches_with_units(None)`.
    pub fn numguard_mismatches(&self) -> Vec<NumGuardAlert> {
        self.numguard_mismatches_with_units(None)
    }
419
420    pub fn numguard_mismatches_with_units(
421        &self,
422        whitelist: Option<&HashSet<String>>,
423    ) -> Vec<NumGuardAlert> {
424        let whitelist =
425            whitelist.map(|set| set.iter().map(|s| s.to_lowercase()).collect::<HashSet<_>>());
426        let mut alerts = Vec::new();
427        for guard in &self.numguards {
428            if let Some(ref allowed) = whitelist {
429                if !guard.units.is_empty() && !allowed.contains(&guard.units.to_lowercase()) {
430                    alerts.push(NumGuardAlert {
431                        guard: guard.clone(),
432                        observed: None,
433                        issue: NumGuardIssue::UnitNotAllowed,
434                    });
435                    continue;
436                }
437            }
438            let cell = self.cells.iter().find(|c| {
439                c.z == guard.z && c.x.max(0) as u32 == guard.x && c.y.max(0) as u32 == guard.y
440            });
441            if let Some(cell) = cell {
442                if let Some(payload) = self.payload_for(&cell.code_id) {
443                    if let Some(actual) = crate::numguard::hash_digits_from_payload(payload) {
444                        if actual != guard.sha1 {
445                            alerts.push(NumGuardAlert {
446                                guard: guard.clone(),
447                                observed: Some(actual),
448                                issue: NumGuardIssue::HashMismatch,
449                            });
450                        }
451                        continue;
452                    }
453                    alerts.push(NumGuardAlert {
454                        guard: guard.clone(),
455                        observed: None,
456                        issue: NumGuardIssue::MissingPayload,
457                    });
458                    continue;
459                }
460            }
461            alerts.push(NumGuardAlert {
462                guard: guard.clone(),
463                observed: None,
464                issue: NumGuardIssue::MissingCell,
465            });
466        }
467        alerts
468    }
469}
470
/// One failed numguard check, as reported by `Document::numguard_mismatches_with_units`.
#[derive(Debug, Clone)]
pub struct NumGuardAlert {
    /// Copy of the guard that failed.
    pub guard: NumGuard,
    /// Digest computed from the payload; set only for `HashMismatch` alerts.
    pub observed: Option<[u8; 20]>,
    /// Reason the guard failed.
    pub issue: NumGuardIssue,
}
477
/// Reason a numguard check failed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NumGuardIssue {
    /// No cell was found at the guard's position, or that cell has no dictionary payload.
    MissingCell,
    /// A payload was found but `crate::numguard::hash_digits_from_payload` returned `None`.
    MissingPayload,
    /// The digest computed from the payload differs from the guard's `sha1`.
    HashMismatch,
    /// A whitelist was supplied and the guard's non-empty unit is not in it.
    UnitNotAllowed,
}
485
486mod dict_serde {
487    use super::CodeHash;
488    use indexmap::IndexMap;
489    use serde::ser::Serialize;
490    use serde::{de::Error, Deserialize, Deserializer, Serializer};
491
492    pub fn serialize<S>(map: &IndexMap<CodeHash, String>, serializer: S) -> Result<S::Ok, S::Error>
493    where
494        S: Serializer,
495    {
496        let as_vec: Vec<_> = map
497            .iter()
498            .map(|(code, payload)| (hex::encode(code), payload))
499            .collect();
500        as_vec.serialize(serializer)
501    }
502
503    pub fn deserialize<'de, D>(deserializer: D) -> Result<IndexMap<CodeHash, String>, D::Error>
504    where
505        D: Deserializer<'de>,
506    {
507        let raw: Vec<(String, String)> = Vec::deserialize(deserializer)?;
508        let mut map = IndexMap::new();
509        for (hex_code, payload) in raw {
510            let bytes = hex::decode(&hex_code).map_err(D::Error::custom)?;
511            if bytes.len() != 32 {
512                return Err(D::Error::custom("invalid code hash length"));
513            }
514            let mut code = [0u8; 32];
515            code.copy_from_slice(&bytes);
516            map.insert(code, payload);
517        }
518        Ok(map)
519    }
520}
521
522mod codehash_serde {
523    use serde::{de::Error, Deserialize, Deserializer, Serializer};
524
525    pub fn serialize<S>(hash: &[u8; 32], serializer: S) -> Result<S::Ok, S::Error>
526    where
527        S: Serializer,
528    {
529        serializer.serialize_str(&hex::encode(hash))
530    }
531
532    pub fn deserialize<'de, D>(deserializer: D) -> Result<[u8; 32], D::Error>
533    where
534        D: Deserializer<'de>,
535    {
536        let s = String::deserialize(deserializer)?;
537        let bytes = hex::decode(&s).map_err(D::Error::custom)?;
538        if bytes.len() != 32 {
539            return Err(D::Error::custom("invalid hash length"));
540        }
541        let mut hash = [0u8; 32];
542        hash.copy_from_slice(&bytes);
543        Ok(hash)
544    }
545}
546
547mod numhash_serde {
548    use serde::{de::Error, Deserialize, Deserializer, Serializer};
549
550    pub fn serialize<S>(hash: &[u8; 20], serializer: S) -> Result<S::Ok, S::Error>
551    where
552        S: Serializer,
553    {
554        serializer.serialize_str(&hex::encode(hash))
555    }
556
557    pub fn deserialize<'de, D>(deserializer: D) -> Result<[u8; 20], D::Error>
558    where
559        D: Deserializer<'de>,
560    {
561        let s = String::deserialize(deserializer)?;
562        let bytes = hex::decode(&s).map_err(D::Error::custom)?;
563        if bytes.len() != 20 {
564            return Err(D::Error::custom("invalid sha1 length"));
565        }
566        let mut hash = [0u8; 20];
567        hash.copy_from_slice(&bytes);
568        Ok(hash)
569    }
570}