1use std::collections::HashSet;
2use std::convert::TryFrom;
3use std::fs::File;
4use std::io::{Read, Write};
5use std::path::Path;
6
7use indexmap::IndexMap;
8use prost::Message;
9use serde::{Deserialize, Serialize};
10
11use crate::error::Result;
12use crate::proto;
13
14pub type CodeHash = [u8; 32];
15
16#[derive(Debug, Clone, Serialize, Deserialize)]
17pub struct Header {
18 pub version: u32,
19 pub grid: String,
20 pub codeset: String,
21}
22
23impl Default for Header {
24 fn default() -> Self {
25 Self {
26 version: 1,
27 grid: "coarse".to_string(),
28 codeset: "HASH256".to_string(),
29 }
30 }
31}
32
33#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
34#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
35pub enum CellType {
36 Text,
37 Table,
38 Figure,
39 Footer,
40 Header,
41}
42
43impl From<CellType> for proto::CellType {
44 fn from(value: CellType) -> Self {
45 match value {
46 CellType::Text => proto::CellType::Text,
47 CellType::Table => proto::CellType::Table,
48 CellType::Figure => proto::CellType::Figure,
49 CellType::Footer => proto::CellType::Footer,
50 CellType::Header => proto::CellType::Header,
51 }
52 }
53}
54
55impl From<proto::CellType> for CellType {
56 fn from(value: proto::CellType) -> Self {
57 match value {
58 proto::CellType::Text => CellType::Text,
59 proto::CellType::Table => CellType::Table,
60 proto::CellType::Figure => CellType::Figure,
61 proto::CellType::Footer => CellType::Footer,
62 proto::CellType::Header => CellType::Header,
63 }
64 }
65}
66
67#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
68pub struct PageInfo {
69 pub z: u32,
70 pub width_px: u32,
71 pub height_px: u32,
72}
73
74#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
75pub struct CellRecord {
76 pub z: u32,
77 pub x: i32,
78 pub y: i32,
79 pub w: u32,
80 pub h: u32,
81 #[serde(with = "codehash_serde")]
82 pub code_id: CodeHash,
83 pub rle: u32,
84 pub cell_type: CellType,
85 pub importance: u8,
86}
87
88impl CellRecord {
89 pub fn key(&self) -> (u32, i32, i32) {
90 (self.z, self.y, self.x)
91 }
92}
93
94#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
95pub struct NumGuard {
96 pub z: u32,
97 pub x: u32,
98 pub y: u32,
99 pub units: String,
100 #[serde(with = "numhash_serde")]
101 pub sha1: [u8; 20],
102}
103
104#[derive(Debug, Clone, Serialize, Deserialize, Default)]
105pub struct Document {
106 pub header: Header,
107 pub pages: Vec<PageInfo>,
108 pub cells: Vec<CellRecord>,
109 #[serde(with = "dict_serde")]
110 pub dict: IndexMap<CodeHash, String>,
111 pub numguards: Vec<NumGuard>,
112}
113
114impl Document {
115 pub fn new(header: Header) -> Self {
116 Self {
117 header,
118 pages: Vec::new(),
119 cells: Vec::new(),
120 dict: IndexMap::new(),
121 numguards: Vec::new(),
122 }
123 }
124
125 pub fn add_page(&mut self, info: PageInfo) {
126 self.pages.push(info);
127 }
128
129 pub fn push_cell(&mut self, cell: CellRecord, payload: String) {
130 let code = cell.code_id;
131 self.cells.push(cell);
132 self.dict.entry(code).or_insert(payload);
133 }
134
135 pub fn add_numguard(&mut self, guard: NumGuard) {
136 self.numguards.push(guard);
137 }
138
139 pub fn payload_for(&self, code_id: &CodeHash) -> Option<&str> {
140 self.dict.get(code_id).map(|s| s.as_str())
141 }
142
143 pub fn ordered_cells(&self) -> Vec<CellRecord> {
144 let mut cells = self.cells.clone();
145 cells.sort_by_key(|c| (c.z, c.y, c.x));
146 cells
147 }
148
149 pub fn to_proto(&self) -> proto::Document {
150 let mut prev = (0i64, 0i64, 0i64);
151 let cells = self
152 .ordered_cells()
153 .into_iter()
154 .map(|cell| {
155 let dz = cell.z as i64 - prev.0;
156 let dx = cell.x as i64 - prev.1;
157 let dy = cell.y as i64 - prev.2;
158 prev = (cell.z as i64, cell.x as i64, cell.y as i64);
159 proto::Cell {
160 dz: dz as i32,
161 dx: dx as i32,
162 dy: dy as i32,
163 w: cell.w,
164 h: cell.h,
165 code_id: cell.code_id.to_vec().into(),
166 rle: cell.rle,
167 r#type: proto::CellType::from(cell.cell_type) as i32,
168 importance_q: cell.importance as u32,
169 }
170 })
171 .collect();
172
173 let dict = self
174 .dict
175 .iter()
176 .map(|(code_id, payload)| proto::DictEntry {
177 code_id: code_id.to_vec().into(),
178 payload_utf8: payload.clone(),
179 })
180 .collect();
181
182 let numguards = self
183 .numguards
184 .iter()
185 .map(|guard| proto::NumGuard {
186 z: guard.z,
187 x: guard.x,
188 y: guard.y,
189 units: guard.units.clone(),
190 sha1: guard.sha1.to_vec().into(),
191 })
192 .collect();
193
194 proto::Document {
195 header: Some(proto::Header {
196 version: self.header.version,
197 grid: self.header.grid.clone(),
198 codeset: self.header.codeset.clone(),
199 }),
200 pages: self
201 .pages
202 .iter()
203 .map(|p| proto::PageInfo {
204 z: p.z,
205 width_px: p.width_px,
206 height_px: p.height_px,
207 })
208 .collect(),
209 cells,
210 dict,
211 numguards,
212 }
213 }
214
215 pub fn from_proto(doc: proto::Document) -> Result<Self> {
216 let header = doc
217 .header
218 .map(|h| Header {
219 version: h.version,
220 grid: h.grid,
221 codeset: h.codeset,
222 })
223 .unwrap_or_default();
224
225 let pages = doc
226 .pages
227 .into_iter()
228 .map(|p| PageInfo {
229 z: p.z,
230 width_px: p.width_px,
231 height_px: p.height_px,
232 })
233 .collect();
234
235 let mut cells = Vec::new();
236 let mut prev = (0i64, 0i64, 0i64);
237 for cell in doc.cells {
238 prev.0 += cell.dz as i64;
239 prev.1 += cell.dx as i64;
240 prev.2 += cell.dy as i64;
241 let mut code_id = [0u8; 32];
242 code_id.copy_from_slice(&cell.code_id);
243 cells.push(CellRecord {
244 z: prev.0 as u32,
245 x: prev.1 as i32,
246 y: prev.2 as i32,
247 w: cell.w,
248 h: cell.h,
249 code_id,
250 rle: cell.rle,
251 cell_type: proto::CellType::try_from(cell.r#type)
252 .map(CellType::from)
253 .unwrap_or(CellType::Text),
254 importance: cell.importance_q as u8,
255 });
256 }
257
258 let mut dict = IndexMap::new();
259 for entry in doc.dict {
260 let mut code_id = [0u8; 32];
261 code_id.copy_from_slice(&entry.code_id);
262 dict.insert(code_id, entry.payload_utf8);
263 }
264
265 let numguards = doc
266 .numguards
267 .into_iter()
268 .map(|guard| {
269 let mut sha = [0u8; 20];
270 sha.copy_from_slice(&guard.sha1);
271 NumGuard {
272 z: guard.z,
273 x: guard.x,
274 y: guard.y,
275 units: guard.units,
276 sha1: sha,
277 }
278 })
279 .collect();
280
281 Ok(Self {
282 header,
283 pages,
284 cells,
285 dict,
286 numguards,
287 })
288 }
289
290 pub fn to_bytes(&self) -> Result<Vec<u8>> {
291 let proto = self.to_proto();
292 let mut buf = Vec::with_capacity(proto.encoded_len());
293 proto.encode(&mut buf)?;
294 let mut encoder = zstd::stream::Encoder::new(Vec::new(), 3)?;
295 encoder.write_all(&buf)?;
296 let data = encoder.finish()?;
297 Ok(data)
298 }
299
300 pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
301 let mut decoder = zstd::stream::Decoder::new(bytes)?;
302 let mut buf = Vec::new();
303 decoder.read_to_end(&mut buf)?;
304 let proto = proto::Document::decode(&*buf)?;
305 Self::from_proto(proto)
306 }
307
308 pub fn save_bin<P: AsRef<Path>>(&self, path: P) -> Result<()> {
309 let bytes = self.to_bytes()?;
310 let mut file = File::create(path)?;
311 file.write_all(&bytes)?;
312 Ok(())
313 }
314
315 pub fn load_bin<P: AsRef<Path>>(path: P) -> Result<Self> {
316 let mut file = File::open(path)?;
317 let mut buf = Vec::new();
318 file.read_to_end(&mut buf)?;
319 Self::from_bytes(&buf)
320 }
321
322 pub fn save_json<P: AsRef<Path>>(&self, path: P) -> Result<()> {
323 let mut file = File::create(path)?;
324 serde_json::to_writer_pretty(&mut file, self)?;
325 Ok(())
326 }
327
328 pub fn load_json<P: AsRef<Path>>(path: P) -> Result<Self> {
329 let file = File::open(path)?;
330 let doc: Document = serde_json::from_reader(file)?;
331 Ok(doc)
332 }
333
334 pub fn total_cells(&self) -> usize {
335 self.cells.len()
336 }
337
338 pub fn total_pages(&self) -> usize {
339 self.pages.len()
340 }
341
342 pub fn ensure_dict_entry(&mut self, payload: &str) -> CodeHash {
343 let hash = hash_payload(payload);
344 self.dict.entry(hash).or_insert_with(|| payload.to_string());
345 hash
346 }
347
348 pub fn page_dims(&self, z: u32) -> Option<(u32, u32)> {
349 self.pages
350 .iter()
351 .find(|p| p.z == z)
352 .map(|p| (p.width_px, p.height_px))
353 }
354
355 pub fn iter_cells(&self) -> impl Iterator<Item = &CellRecord> {
356 self.cells.iter()
357 }
358
359 pub fn decode_to_text(&self) -> String {
360 let ordered = self.ordered_cells();
361 self.decode_cells_to_text(&ordered)
362 }
363
364 pub fn decode_page_to_text(&self, z: u32) -> String {
365 let mut page_cells: Vec<_> = self.cells.iter().filter(|c| c.z == z).cloned().collect();
366 page_cells.sort_by_key(|c| (c.y, c.x));
367 self.decode_cells_to_text(&page_cells)
368 }
369
370 pub fn decode_cells_to_text(&self, cells: &[CellRecord]) -> String {
371 let mut lines = Vec::with_capacity(cells.len());
372 for cell in cells {
373 if let Some(payload) = self.payload_for(&cell.code_id) {
374 lines.push(payload.to_string());
375 }
376 }
377 lines.join("\n")
378 }
379
380 pub fn cells_in_bbox(&self, z: u32, x0: i32, y0: i32, x1: i32, y1: i32) -> Vec<CellRecord> {
381 let (min_x, max_x) = if x0 <= x1 { (x0, x1) } else { (x1, x0) };
382 let (min_y, max_y) = if y0 <= y1 { (y0, y1) } else { (y1, y0) };
383 let mut matches: Vec<_> = self
384 .cells
385 .iter()
386 .filter(|cell| {
387 if cell.z != z {
388 return false;
389 }
390 let cell_x1 = cell.x + cell.w as i32;
391 let cell_y1 = cell.y + cell.h as i32;
392 cell.x <= max_x && cell_x1 >= min_x && cell.y <= max_y && cell_y1 >= min_y
393 })
394 .cloned()
395 .collect();
396 matches.sort_by_key(|c| (c.y, c.x));
397 matches
398 }
399}
400
401pub fn hash_payload(payload: &str) -> CodeHash {
402 let mut hasher = blake3::Hasher::new();
403 hasher.update(payload.as_bytes());
404 let hash = hasher.finalize();
405 let mut bytes = [0u8; 32];
406 bytes.copy_from_slice(hash.as_bytes());
407 bytes
408}
409
410impl Document {
411 pub fn retain_dict_for_cells(&mut self) {
412 let used: HashSet<_> = self.cells.iter().map(|c| c.code_id).collect();
413 self.dict.retain(|code, _| used.contains(code));
414 }
415
416 pub fn numguard_mismatches(&self) -> Vec<NumGuardAlert> {
417 self.numguard_mismatches_with_units(None)
418 }
419
420 pub fn numguard_mismatches_with_units(
421 &self,
422 whitelist: Option<&HashSet<String>>,
423 ) -> Vec<NumGuardAlert> {
424 let whitelist =
425 whitelist.map(|set| set.iter().map(|s| s.to_lowercase()).collect::<HashSet<_>>());
426 let mut alerts = Vec::new();
427 for guard in &self.numguards {
428 if let Some(ref allowed) = whitelist {
429 if !guard.units.is_empty() && !allowed.contains(&guard.units.to_lowercase()) {
430 alerts.push(NumGuardAlert {
431 guard: guard.clone(),
432 observed: None,
433 issue: NumGuardIssue::UnitNotAllowed,
434 });
435 continue;
436 }
437 }
438 let cell = self.cells.iter().find(|c| {
439 c.z == guard.z && c.x.max(0) as u32 == guard.x && c.y.max(0) as u32 == guard.y
440 });
441 if let Some(cell) = cell {
442 if let Some(payload) = self.payload_for(&cell.code_id) {
443 if let Some(actual) = crate::numguard::hash_digits_from_payload(payload) {
444 if actual != guard.sha1 {
445 alerts.push(NumGuardAlert {
446 guard: guard.clone(),
447 observed: Some(actual),
448 issue: NumGuardIssue::HashMismatch,
449 });
450 }
451 continue;
452 }
453 alerts.push(NumGuardAlert {
454 guard: guard.clone(),
455 observed: None,
456 issue: NumGuardIssue::MissingPayload,
457 });
458 continue;
459 }
460 }
461 alerts.push(NumGuardAlert {
462 guard: guard.clone(),
463 observed: None,
464 issue: NumGuardIssue::MissingCell,
465 });
466 }
467 alerts
468 }
469}
470
471#[derive(Debug, Clone)]
472pub struct NumGuardAlert {
473 pub guard: NumGuard,
474 pub observed: Option<[u8; 20]>,
475 pub issue: NumGuardIssue,
476}
477
/// Reason attached to a [`NumGuardAlert`].
///
/// Variants are assigned by `Document::numguard_mismatches_with_units`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NumGuardIssue {
    /// The guard's position did not resolve to a cell that could be checked.
    MissingCell,
    /// A cell was found but no digest could be obtained for its payload.
    MissingPayload,
    /// The digest computed from the payload differs from the guard's `sha1`.
    HashMismatch,
    /// The guard's `units` is not in the caller-supplied whitelist.
    UnitNotAllowed,
}
485
486mod dict_serde {
487 use super::CodeHash;
488 use indexmap::IndexMap;
489 use serde::ser::Serialize;
490 use serde::{de::Error, Deserialize, Deserializer, Serializer};
491
492 pub fn serialize<S>(map: &IndexMap<CodeHash, String>, serializer: S) -> Result<S::Ok, S::Error>
493 where
494 S: Serializer,
495 {
496 let as_vec: Vec<_> = map
497 .iter()
498 .map(|(code, payload)| (hex::encode(code), payload))
499 .collect();
500 as_vec.serialize(serializer)
501 }
502
503 pub fn deserialize<'de, D>(deserializer: D) -> Result<IndexMap<CodeHash, String>, D::Error>
504 where
505 D: Deserializer<'de>,
506 {
507 let raw: Vec<(String, String)> = Vec::deserialize(deserializer)?;
508 let mut map = IndexMap::new();
509 for (hex_code, payload) in raw {
510 let bytes = hex::decode(&hex_code).map_err(D::Error::custom)?;
511 if bytes.len() != 32 {
512 return Err(D::Error::custom("invalid code hash length"));
513 }
514 let mut code = [0u8; 32];
515 code.copy_from_slice(&bytes);
516 map.insert(code, payload);
517 }
518 Ok(map)
519 }
520}
521
522mod codehash_serde {
523 use serde::{de::Error, Deserialize, Deserializer, Serializer};
524
525 pub fn serialize<S>(hash: &[u8; 32], serializer: S) -> Result<S::Ok, S::Error>
526 where
527 S: Serializer,
528 {
529 serializer.serialize_str(&hex::encode(hash))
530 }
531
532 pub fn deserialize<'de, D>(deserializer: D) -> Result<[u8; 32], D::Error>
533 where
534 D: Deserializer<'de>,
535 {
536 let s = String::deserialize(deserializer)?;
537 let bytes = hex::decode(&s).map_err(D::Error::custom)?;
538 if bytes.len() != 32 {
539 return Err(D::Error::custom("invalid hash length"));
540 }
541 let mut hash = [0u8; 32];
542 hash.copy_from_slice(&bytes);
543 Ok(hash)
544 }
545}
546
547mod numhash_serde {
548 use serde::{de::Error, Deserialize, Deserializer, Serializer};
549
550 pub fn serialize<S>(hash: &[u8; 20], serializer: S) -> Result<S::Ok, S::Error>
551 where
552 S: Serializer,
553 {
554 serializer.serialize_str(&hex::encode(hash))
555 }
556
557 pub fn deserialize<'de, D>(deserializer: D) -> Result<[u8; 20], D::Error>
558 where
559 D: Deserializer<'de>,
560 {
561 let s = String::deserialize(deserializer)?;
562 let bytes = hex::decode(&s).map_err(D::Error::custom)?;
563 if bytes.len() != 20 {
564 return Err(D::Error::custom("invalid sha1 length"));
565 }
566 let mut hash = [0u8; 20];
567 hash.copy_from_slice(&bytes);
568 Ok(hash)
569 }
570}