1use crate::jbig2arith::{IntProc, Jbig2ArithCoder};
3use crate::jbig2classify::{
4 FamilyBucketKey, SymbolSignature, compute_symbol_signature as compute_symbol_signature_shared,
5 family_bucket_key_for_symbol, family_bucket_neighbors, family_match_details,
6 family_signatures_are_compatible, refine_compare_score,
7};
8use crate::jbig2comparator::{Comparator, MAX_DIMENSION_DELTA};
9use crate::jbig2context::build_symbol_context_model;
10use crate::jbig2cost::{symbol_dictionary_entries_bytes, symbol_dictionary_entry_bytes};
11use crate::jbig2unify::{SymbolUnifyInputs, UnifiedClass};
12#[cfg(feature = "cc-analysis")]
14use crate::jbig2cc::analyze_page;
15use crate::jbig2structs::{
16 FileHeader, GenericRegionConfig, GenericRegionParams, Jbig2Config, LossySymbolMode, PageInfo,
17 Segment, SegmentType, SymbolDictParams, TextRegionParams,
18};
19
20use crate::jbig2sym::{BitImage, Rect};
21use anyhow::{Result, anyhow};
22
/// Debug-level logging hook for the encoder.
///
/// With the `trace_encoder` feature enabled the arguments are forwarded to `log::debug!`.
/// Without it they are only passed through `format_args!` and the result is discarded,
/// so the arguments remain type-checked and count as used.
#[macro_export]
macro_rules! debug {
    ($($tokens:tt)*) => {
        #[cfg(feature = "trace_encoder")]
        log::debug!($($tokens)*);

        #[cfg(not(feature = "trace_encoder"))]
        let _ = format_args!($($tokens)*);
    };
}
34
/// Trace-level logging hook for the encoder.
///
/// With the `trace_encoder` feature enabled the arguments are forwarded to `log::trace!`.
/// Without it they are only passed through `format_args!` and the result is discarded,
/// so the arguments remain type-checked and count as used.
#[macro_export]
macro_rules! trace {
    ($($tokens:tt)*) => {
        #[cfg(feature = "trace_encoder")]
        log::trace!($($tokens)*);

        #[cfg(not(feature = "trace_encoder"))]
        let _ = format_args!($($tokens)*);
    };
}
45
46#[allow(unused_imports)]
48use crate::{debug, trace};
49
50use ndarray::Array2;
51use rustc_hash::{FxHashMap, FxHashSet};
52use std::collections::{HashMap, HashSet, VecDeque};
53use std::hash::{Hash, Hasher};
54use std::time::{Duration, Instant};
55
56#[cfg(feature = "parallel")]
57use rayon::prelude::*;
58
/// Newtype around a raw 64-bit hash value.
///
/// Used as the key type of the encoder's `hash_map` (hash key -> symbol indices).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct HashKey(u64);
62
/// Entry limit for the recent-symbol cache.
/// NOTE(review): presumably passed to `RecentSymbolCache::new`; confirm at the call site.
const RECENT_SYMBOL_CACHE_CAP: usize = 64;
/// NOTE(review): presumably the number of anchors tried from the exact family bucket;
/// the use site is outside this part of the file.
const SYM_UNIFY_EXACT_ANCHOR_BUDGET: usize = 32;
/// NOTE(review): presumably the number of anchors tried from neighbouring family buckets;
/// the use site is outside this part of the file.
const SYM_UNIFY_NEIGHBOR_ANCHOR_BUDGET: usize = 16;
/// An anchor is treated as "strong" once its usage count reaches this value.
const SYM_UNIFY_STRONG_ANCHOR_MIN_USAGE: usize = 8;
/// An anchor is also treated as "strong" once its page count reaches this value.
const SYM_UNIFY_STRONG_ANCHOR_MIN_PAGE_SPAN: usize = 4;
68
/// Reports whether encoder diagnostics were requested through the environment.
///
/// Returns `true` only when `JBIG2_DIAGNOSTICS` is set to a valid Unicode value that is
/// neither empty nor exactly `"0"`. An unset or non-Unicode variable yields `false`.
fn encoder_diagnostics_enabled() -> bool {
    match std::env::var("JBIG2_DIAGNOSTICS") {
        Ok(value) => !(value.is_empty() || value == "0"),
        Err(_) => false,
    }
}
72
73#[inline]
74fn indexed_symbol_dictionary_bytes(symbols: &[BitImage], indices: &[usize]) -> usize {
75 indices
76 .iter()
77 .copied()
78 .map(|index| symbol_dictionary_entry_bytes(&symbols[index]))
79 .sum()
80}
81
82#[inline]
83fn anchor_map_dictionary_bytes(
84 symbols: &[BitImage],
85 anchor_map: &FxHashMap<FamilyBucketKey, Vec<usize>>,
86) -> usize {
87 anchor_map
88 .values()
89 .flat_map(|bucket| bucket.iter().copied())
90 .map(|index| symbol_dictionary_entry_bytes(&symbols[index]))
91 .sum()
92}
93
/// Outcome of testing one residual symbol against one anchor symbol.
///
/// `Accept` carries the assignment score and the shift reported by the comparator. Every
/// other variant names the gate that rejected the pair; `RejectScore` additionally records
/// the score, the limit it exceeded and the shift.
#[derive(Debug, Clone, Copy)]
enum SymUnifyAnchorDecision {
    Accept {
        score: u32,
        dx: i32,
        dy: i32,
    },
    RejectDim,
    RejectPixelDelta,
    RejectSignature,
    RejectOverlap,
    RejectCompare,
    RejectScore {
        score: u32,
        limit: u32,
        dx: i32,
        dy: i32,
    },
    RejectOutsideInk,
}

impl SymUnifyAnchorDecision {
    /// Short lowercase tag identifying the variant.
    fn label(self) -> &'static str {
        match self {
            Self::Accept { .. } => "accept",
            Self::RejectDim => "dim",
            Self::RejectPixelDelta => "pixel_delta",
            Self::RejectSignature => "signature",
            Self::RejectOverlap => "overlap",
            Self::RejectCompare => "compare",
            Self::RejectScore { .. } => "score",
            Self::RejectOutsideInk => "outside_ink",
        }
    }

    /// Ordering key used to pick the most informative decision for diagnostics.
    ///
    /// Rejections are ranked 1 (`RejectDim`) through 7 (`RejectScore`); `Accept` ranks 255.
    fn diagnostic_rank(self) -> u8 {
        match self {
            Self::RejectDim => 1,
            Self::RejectPixelDelta => 2,
            Self::RejectSignature => 3,
            Self::RejectOverlap => 4,
            Self::RejectCompare => 5,
            Self::RejectOutsideInk => 6,
            Self::RejectScore { .. } => 7,
            Self::Accept { .. } => 255,
        }
    }
}
142
143#[inline]
144fn update_best_reject(best: &mut Option<SymUnifyAnchorDecision>, decision: SymUnifyAnchorDecision) {
145 if !matches!(decision, SymUnifyAnchorDecision::Accept { .. })
146 && best.is_none_or(|current| decision.diagnostic_rank() > current.diagnostic_rank())
147 {
148 *best = Some(decision);
149 }
150}
151
152#[inline]
153fn bitmap_proxy_bytes(symbol: &BitImage) -> usize {
154 (symbol.width.saturating_mul(symbol.height).saturating_add(7)) / 8
155}
156
157#[inline]
158fn classify_residual_shape(symbol: &BitImage) -> ResidualShapeKind {
159 let area = symbol.width.saturating_mul(symbol.height);
160 let black = symbol.count_ones();
161 if area <= 16 || black <= 2 {
162 ResidualShapeKind::Tiny
163 } else if crate::jbig2shared::symbol_likely_punctuation_or_mark(symbol) {
164 ResidualShapeKind::PunctuationLike
165 } else {
166 ResidualShapeKind::GlyphLike
167 }
168}
169
170#[inline]
171fn record_counterfactual_probe(
172 stats: &mut CounterfactualProbeStats,
173 page_num: usize,
174 symbol_index: usize,
175 symbol: &BitImage,
176 black_pixels: usize,
177) {
178 stats.symbol_count += 1;
179 stats.black_pixels += black_pixels;
180 stats.bitmap_proxy_bytes += bitmap_proxy_bytes(symbol);
181 stats.pages.insert(page_num);
182 if stats.samples.len() < 8 {
183 stats
184 .samples
185 .push((page_num + 1, symbol_index, symbol.width, symbol.height));
186 }
187}
188
189#[inline]
190fn record_labeled_counterfactual_probe(
191 stats_map: &mut FxHashMap<&'static str, CounterfactualProbeStats>,
192 label: &'static str,
193 page_num: usize,
194 symbol_index: usize,
195 symbol: &BitImage,
196 black_pixels: usize,
197) {
198 let stats = stats_map.entry(label).or_default();
199 record_counterfactual_probe(stats, page_num, symbol_index, symbol, black_pixels);
200}
201
202#[inline]
203fn relaxed_compare_probe_max_err(candidate: &BitImage, proto: &BitImage) -> u32 {
204 candidate
205 .width
206 .max(proto.width)
207 .saturating_mul(candidate.height.max(proto.height)) as u32
208}
209
210#[inline]
211fn record_detailed_compare_probe(
212 stats: &mut DetailedCompareProbeStats,
213 page_num: usize,
214 symbol_index: usize,
215 symbol: &BitImage,
216 result: crate::jbig2comparator::CompareResult,
217 compare_max_err: u32,
218 exact_dims: bool,
219 strong_anchor: bool,
220) {
221 stats.symbol_count += 1;
222 stats.bitmap_proxy_bytes += bitmap_proxy_bytes(symbol);
223 stats.pages.insert(page_num);
224 stats.exact_dims_count += usize::from(exact_dims);
225 stats.strong_anchor_count += usize::from(strong_anchor);
226 stats.shift_le1_count += usize::from(result.dx.abs() <= 1 && result.dy.abs() <= 1);
227
228 let over_by = result.total_err.saturating_sub(compare_max_err);
229 if over_by <= 2 {
230 stats.over_by_le2_count += 1;
231 } else if over_by <= 4 {
232 stats.over_by_le4_count += 1;
233 } else if over_by <= 8 {
234 stats.over_by_le8_count += 1;
235 } else {
236 stats.over_by_gt8_count += 1;
237 }
238
239 if stats.samples.len() < 8 {
240 stats.samples.push((
241 page_num + 1,
242 symbol_index,
243 symbol.width,
244 symbol.height,
245 result.total_err,
246 compare_max_err,
247 result.overlap_err,
248 result.outside_ink_err,
249 result.dx,
250 result.dy,
251 ));
252 }
253}
254
255impl ResidualSymbolTrace {
256 fn reason_code(self) -> ResidualReasonCode {
257 if self.local_use_count != 1 {
258 return ResidualReasonCode::NonSingletonResidual;
259 }
260
261 if self.had_global_candidates {
262 return match self
263 .global_best_reject
264 .unwrap_or(SymUnifyAnchorDecision::RejectDim)
265 {
266 SymUnifyAnchorDecision::RejectDim => ResidualReasonCode::UseCountOneGlobalRejectDim,
267 SymUnifyAnchorDecision::RejectPixelDelta => {
268 ResidualReasonCode::UseCountOneGlobalRejectPixelDelta
269 }
270 SymUnifyAnchorDecision::RejectSignature => {
271 ResidualReasonCode::UseCountOneGlobalRejectSignature
272 }
273 SymUnifyAnchorDecision::RejectOverlap => {
274 ResidualReasonCode::UseCountOneGlobalRejectOverlap
275 }
276 SymUnifyAnchorDecision::RejectCompare => {
277 ResidualReasonCode::UseCountOneGlobalRejectCompare
278 }
279 SymUnifyAnchorDecision::RejectOutsideInk => {
280 ResidualReasonCode::UseCountOneGlobalRejectOutsideInk
281 }
282 SymUnifyAnchorDecision::RejectScore { .. } => {
283 ResidualReasonCode::UseCountOneGlobalRejectScore
284 }
285 SymUnifyAnchorDecision::Accept { .. } => {
286 ResidualReasonCode::UseCountOneNoCandidates
287 }
288 };
289 }
290
291 if self.had_local_candidates {
292 return match self
293 .local_best_reject
294 .unwrap_or(SymUnifyAnchorDecision::RejectDim)
295 {
296 SymUnifyAnchorDecision::RejectDim => ResidualReasonCode::UseCountOneLocalRejectDim,
297 SymUnifyAnchorDecision::RejectPixelDelta => {
298 ResidualReasonCode::UseCountOneLocalRejectPixelDelta
299 }
300 SymUnifyAnchorDecision::RejectSignature => {
301 ResidualReasonCode::UseCountOneLocalRejectSignature
302 }
303 SymUnifyAnchorDecision::RejectOverlap => {
304 ResidualReasonCode::UseCountOneLocalRejectOverlap
305 }
306 SymUnifyAnchorDecision::RejectCompare => {
307 ResidualReasonCode::UseCountOneLocalRejectCompare
308 }
309 SymUnifyAnchorDecision::RejectOutsideInk => {
310 ResidualReasonCode::UseCountOneLocalRejectOutsideInk
311 }
312 SymUnifyAnchorDecision::RejectScore { .. } => {
313 ResidualReasonCode::UseCountOneLocalRejectScore
314 }
315 SymUnifyAnchorDecision::Accept { .. } => {
316 ResidualReasonCode::UseCountOneNoCandidates
317 }
318 };
319 }
320
321 ResidualReasonCode::UseCountOneNoCandidates
322 }
323}
324
/// One accepted anchor proposal for a symbol being unified.
#[derive(Debug, Clone, Copy)]
struct SymUnifyAnchorCandidate {
    /// Index of the anchor in the encoder's global symbol table.
    anchor_index: usize,
    /// Assignment score for the pair.
    score: u32,
    /// Horizontal shift for the match.
    dx: i32,
    /// Vertical shift for the match.
    dy: i32,
    /// Cost from `sym_unify_context_rerank_cost`; lower is preferred when ranking.
    rerank_cost: u32,
    /// Proposals with this flag cleared are ranked ahead of those with it set.
    rescued_on_score: bool,
}
334
335#[derive(Debug, Clone, Copy)]
336struct ResidualSymbolTrace {
337 page_num: usize,
338 local_use_count: usize,
339 had_local_candidates: bool,
340 had_global_candidates: bool,
341 local_best_reject: Option<SymUnifyAnchorDecision>,
342 global_best_reject: Option<SymUnifyAnchorDecision>,
343}
344
/// Diagnostic classification of why a symbol stayed residual.
///
/// The `UseCountOne*` codes apply to symbols used exactly once, split by whether the
/// deciding rejection came from a local or a global candidate; everything else is
/// `NonSingletonResidual`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum ResidualReasonCode {
    UseCountOneNoCandidates,
    UseCountOneLocalRejectDim,
    UseCountOneLocalRejectPixelDelta,
    UseCountOneLocalRejectSignature,
    UseCountOneLocalRejectOverlap,
    UseCountOneLocalRejectCompare,
    UseCountOneLocalRejectOutsideInk,
    UseCountOneLocalRejectScore,
    UseCountOneGlobalRejectDim,
    UseCountOneGlobalRejectPixelDelta,
    UseCountOneGlobalRejectSignature,
    UseCountOneGlobalRejectOverlap,
    UseCountOneGlobalRejectCompare,
    UseCountOneGlobalRejectOutsideInk,
    UseCountOneGlobalRejectScore,
    NonSingletonResidual,
}

impl ResidualReasonCode {
    /// Returns the variant name as a static string.
    fn label(self) -> &'static str {
        match self {
            Self::UseCountOneNoCandidates => "UseCountOneNoCandidates",
            Self::UseCountOneLocalRejectDim => "UseCountOneLocalRejectDim",
            Self::UseCountOneLocalRejectPixelDelta => "UseCountOneLocalRejectPixelDelta",
            Self::UseCountOneLocalRejectSignature => "UseCountOneLocalRejectSignature",
            Self::UseCountOneLocalRejectOverlap => "UseCountOneLocalRejectOverlap",
            Self::UseCountOneLocalRejectCompare => "UseCountOneLocalRejectCompare",
            Self::UseCountOneLocalRejectOutsideInk => "UseCountOneLocalRejectOutsideInk",
            Self::UseCountOneLocalRejectScore => "UseCountOneLocalRejectScore",
            Self::UseCountOneGlobalRejectDim => "UseCountOneGlobalRejectDim",
            Self::UseCountOneGlobalRejectPixelDelta => "UseCountOneGlobalRejectPixelDelta",
            Self::UseCountOneGlobalRejectSignature => "UseCountOneGlobalRejectSignature",
            Self::UseCountOneGlobalRejectOverlap => "UseCountOneGlobalRejectOverlap",
            Self::UseCountOneGlobalRejectCompare => "UseCountOneGlobalRejectCompare",
            Self::UseCountOneGlobalRejectOutsideInk => "UseCountOneGlobalRejectOutsideInk",
            Self::UseCountOneGlobalRejectScore => "UseCountOneGlobalRejectScore",
            Self::NonSingletonResidual => "NonSingletonResidual",
        }
    }
}
399
/// Coarse shape bucket assigned to a residual symbol by `classify_residual_shape`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ResidualShapeKind {
    /// Bounding area of at most 16 pixels, or at most two set pixels.
    Tiny,
    /// Flagged by `jbig2shared::symbol_likely_punctuation_or_mark`.
    PunctuationLike,
    /// Everything else.
    GlyphLike,
}
406
407#[derive(Debug, Clone, Default)]
408struct ResidualReasonStats {
409 symbol_count: usize,
410 instance_count: usize,
411 black_pixels: usize,
412 bitmap_proxy_bytes: usize,
413 pages: FxHashSet<usize>,
414 tiny_count: usize,
415 punctuation_like_count: usize,
416 glyph_like_count: usize,
417 samples: Vec<(usize, usize, usize, usize, usize)>,
418}
419
420#[derive(Debug, Clone, Default)]
421struct CounterfactualProbeStats {
422 symbol_count: usize,
423 black_pixels: usize,
424 bitmap_proxy_bytes: usize,
425 pages: FxHashSet<usize>,
426 samples: Vec<(usize, usize, usize, usize)>,
427}
428
429#[derive(Debug, Clone, Default)]
430struct DetailedCompareProbeStats {
431 symbol_count: usize,
432 bitmap_proxy_bytes: usize,
433 pages: FxHashSet<usize>,
434 exact_dims_count: usize,
435 strong_anchor_count: usize,
436 shift_le1_count: usize,
437 over_by_le2_count: usize,
438 over_by_le4_count: usize,
439 over_by_le8_count: usize,
440 over_by_gt8_count: usize,
441 samples: Vec<(usize, usize, usize, usize, u32, u32, u32, u32, i32, i32)>,
442}
443
/// Bounded most-recently-used list of symbol indices.
///
/// The front of `recent` is the most recently touched index; at most `cap` entries are kept.
#[derive(Debug)]
struct RecentSymbolCache {
    recent: VecDeque<usize>,
    cap: usize,
}

impl RecentSymbolCache {
    /// Creates an empty cache that keeps at most `cap` indices.
    fn new(cap: usize) -> Self {
        let recent = VecDeque::with_capacity(cap);
        Self { recent, cap }
    }

    /// Forgets every entry.
    fn clear(&mut self) {
        self.recent.clear();
    }

    /// Marks `idx` as most recently used, evicting the oldest entries beyond `cap`.
    fn touch(&mut self, idx: usize) {
        let existing = self.recent.iter().position(|&entry| entry == idx);
        if let Some(pos) = existing {
            self.recent.remove(pos);
        }
        self.recent.push_front(idx);
        // Keeps the first `cap` entries, i.e. drops from the least-recent end.
        self.recent.truncate(self.cap);
    }

    /// Iterates the cached indices from most to least recently used.
    fn iter(&self) -> impl Iterator<Item = usize> + '_ {
        self.recent.iter().copied()
    }

    /// Copies the cached indices, most recent first, into the start of `out`.
    ///
    /// Returns how many entries were written, which is the smaller of `out.len()` and the
    /// number of cached indices. The rest of `out` is left untouched.
    fn copy_into(&self, out: &mut [usize]) -> usize {
        let mut written = 0usize;
        for (slot, &idx) in out.iter_mut().zip(self.recent.iter()) {
            *slot = idx;
            written += 1;
        }
        written
    }
}
488
489impl Hash for HashKey {
490 fn hash<H: Hasher>(&self, state: &mut H) {
491 self.0.hash(state);
492 }
493}
494
495impl std::fmt::Display for HashKey {
496 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
497 write!(f, "HashKey({:x})", self.0)
498 }
499}
500
501#[derive(Debug, Clone)]
503pub struct SymbolCandidate {
504 pub bitmap: BitImage,
506 pub bbox: Rect,
508}
509
510pub fn segment_symbols(image: &BitImage, dpi: i32, losslevel: i32) -> Result<Vec<SymbolCandidate>> {
520 #[cfg(feature = "cc-analysis")]
521 {
522 let cc_image = analyze_page(image, dpi, losslevel);
524 let shapes = cc_image.extract_shapes();
525
526 let mut candidates = Vec::with_capacity(shapes.len());
527 for (bitmap, bbox) in shapes {
528 let rect = Rect {
529 x: bbox.xmin as u32,
530 y: bbox.ymin as u32,
531 width: bbox.width() as u32,
532 height: bbox.height() as u32,
533 };
534 candidates.push(SymbolCandidate { bitmap, bbox: rect });
535 }
536 Ok(candidates)
537 }
538 #[cfg(not(feature = "cc-analysis"))]
539 {
540 Err(anyhow!("Symbol segmentation requires cc-analysis feature"))
541 }
542}
543
544#[derive(Clone)]
547pub struct SymbolInstance {
548 pub symbol_index: usize,
549 pub position: Rect,
550 pub instance_bitmap: BitImage,
551 pub needs_refinement: bool,
553 pub refinement_dx: i32,
555 pub refinement_dy: i32,
557}
558
559impl SymbolInstance {
560 pub fn symbol_index(&self) -> usize {
561 self.symbol_index
562 }
563
564 pub fn position(&self) -> Rect {
565 self.position
566 }
567
568 pub fn instance_bitmap(&self) -> &BitImage {
569 &self.instance_bitmap
570 }
571}
572
573#[derive(Clone)]
574pub struct PageData {
575 pub image: BitImage,
576 pub symbol_instances: Vec<SymbolInstance>,
577}
578
/// Time spent in each stage of symbol-mode encoding; every field defaults to zero.
#[derive(Debug, Clone, Default)]
pub struct SymbolModeStageMetrics {
    /// Connected-component extraction.
    pub cc_extraction: Duration,
    /// Symbol matching and de-duplication.
    pub matching_dedup: Duration,
    /// Clustering.
    pub clustering: Duration,
    /// Planning.
    pub planning: Duration,
    /// Symbol dictionary encoding.
    pub symbol_dict_encoding: Duration,
    /// Text region encoding.
    pub text_region_encoding: Duration,
    /// Generic region encoding.
    pub generic_region_encoding: Duration,
}
589
/// Counters describing a symbol-mode encoding run; every field defaults to zero.
#[derive(Debug, Clone, Default)]
pub struct SymbolModeStats {
    /// Symbols discovered.
    pub symbols_discovered: usize,
    /// Symbols exported.
    pub symbols_exported: usize,
    /// Average symbol reuse.
    pub avg_symbol_reuse: f64,
    /// Symbols counted as global.
    pub global_symbol_count: usize,
    /// Symbols counted as local.
    pub local_symbol_count: usize,
    /// Comparator invocations.
    pub comparator_calls: usize,
    /// Comparator invocations counted as hits.
    pub comparator_hits: usize,
    /// Hits counted as exact.
    pub exact_hits: usize,
    /// Hits counted as refined.
    pub refined_hits: usize,
    /// Candidates rejected on signature.
    pub signature_rejects: usize,
}
603
604#[derive(Debug, Clone, Default)]
605pub struct EncoderMetrics {
606 pub symbol_mode: SymbolModeStageMetrics,
607 pub symbol_stats: SymbolModeStats,
608}
609
/// Encoder output split into an optional global part and per-page parts.
///
/// NOTE(review): the `*_per_page` vectors are presumably parallel to `page_streams`;
/// confirm against the code that fills them.
#[derive(Debug, Clone)]
pub struct PdfSplitOutput {
    /// Bytes of the global segments, if any.
    pub global_segments: Option<Vec<u8>>,
    /// One byte stream per page.
    pub page_streams: Vec<Vec<u8>>,
    /// Local dictionary byte count per page.
    pub local_dict_bytes_per_page: Vec<usize>,
    /// Text region byte count per page.
    pub text_region_bytes_per_page: Vec<usize>,
    /// Generic region byte count per page.
    pub generic_region_bytes_per_page: Vec<usize>,
}
618
619#[derive(Debug)]
620struct PlannedPage {
621 page_number: u32,
622 segments: Vec<Segment>,
623}
624
625#[derive(Debug)]
626struct PlannedDocument {
627 file_header: Option<FileHeader>,
628 global_segments: Vec<Segment>,
629 pages: Vec<PlannedPage>,
630 eof_segment: Option<Segment>,
631 next_segment_number: u32,
632}
633
634#[derive(Debug, Clone)]
635struct PlannedPageLayout {
636 page_index: usize,
637 page_number: u32,
638 page_info_segment_number: u32,
639 local_dict_segment_numbers: Vec<u32>,
640 local_dict_layout: Option<SymbolDictLayout>,
641 region_segment_number: u32,
642 residual_region_segment_number: Option<u32>,
643 end_of_page_segment_number: u32,
644 local_symbols: Vec<usize>,
645 residual_symbols: Vec<usize>,
646 residual_anchor_remaps: FxHashMap<usize, usize>,
647 use_generic_region: bool,
648}
649
650#[derive(Debug)]
651struct BuiltPage {
652 page: PlannedPage,
653 symbol_dict_time: Duration,
654 text_region_time: Duration,
655 generic_region_time: Duration,
656}
657
658#[derive(Debug, Clone, Default)]
659struct SymbolDictLayout {
660 export_input_indices: Vec<usize>,
661 refinements: Vec<Option<RefinementPlan>>,
662 diagnostics: SymbolDictDiagnostics,
663}
664
665impl SymbolDictLayout {
666 fn segment_count(&self) -> usize {
667 if self.export_input_indices.is_empty() {
668 0
669 } else {
670 1
671 }
672 }
673}
674
/// Diagnostic counters for a symbol dictionary; everything defaults to zero or empty.
#[derive(Debug, Clone, Default)]
struct SymbolDictDiagnostics {
    /// Number of families.
    family_count: usize,
    /// Number of families counted as singletons.
    singleton_family_count: usize,
    /// Number of members counted as refined.
    refined_member_count: usize,
    /// Number of members counted as exported.
    exported_member_count: usize,
    /// Human-readable sample lines.
    sample_lines: Vec<String>,
}
683
/// Plan for coding one symbol as a refinement of a prototype.
#[derive(Debug, Clone, Copy)]
struct RefinementPlan {
    /// Input index of the prototype symbol.
    prototype_input_index: usize,
    /// Horizontal refinement offset.
    refinement_dx: i32,
    /// Vertical refinement offset.
    refinement_dy: i32,
}
690
/// Result of encoding a symbol dictionary.
#[derive(Debug, Clone, Default)]
struct EncodedSymbolDictionary {
    /// Encoded dictionary bytes.
    payload: Vec<u8>,
    /// Exported position for input symbols.
    /// NOTE(review): presumably indexed by input position; confirm at the encoder site.
    input_to_exported_pos: Vec<u32>,
    /// Number of exported symbols.
    exported_symbol_count: u32,
}
697
/// Mutable mode flags and debug logs owned by the encoder.
#[derive(Debug, Default)]
struct EncoderState {
    /// Set by `Jbig2Encoder::dict_only`.
    pdf_mode: bool,
    /// Initialised from `config.want_full_headers`; cleared by `Jbig2Encoder::dict_only`.
    full_headers_remaining: bool,
    /// Initialised to `true` by `Jbig2Encoder::new`.
    segment: bool,
    /// Initialised from `config.refine`.
    use_refinement: bool,
    /// Initialised to `true` by `Jbig2Encoder::new`.
    use_delta_encoding: bool,
    /// Initialised to `false` by `Jbig2Encoder::new`.
    lossy_symbol_mode_applied: bool,
    /// Debug lines emitted first by `Jbig2Encoder::decision_debug_log`.
    ingest_debug_lines: Vec<String>,
    /// Debug lines emitted second by `Jbig2Encoder::decision_debug_log`.
    decision_debug_lines: Vec<String>,
}
710
711pub struct Jbig2Encoder<'a> {
716 config: &'a Jbig2Config,
718
719 state: EncoderState,
721
722 global_symbols: Vec<BitImage>,
724
725 symbol_usage: Vec<usize>,
727
728 symbol_pixel_counts: Vec<usize>,
730
731 symbol_signatures: Vec<SymbolSignature>,
733
734 symbol_page_count: Vec<usize>,
736
737 symbol_last_page_seen: Vec<Option<usize>>,
739
740 hash_map: FxHashMap<HashKey, Vec<usize>>,
742
743 pages: Vec<PageData>,
745
746 page_symbol_indices: Vec<Vec<usize>>,
748
749 next_segment_number: u32,
751
752 global_dict_segment_numbers: Vec<u32>,
754
755 metrics: EncoderMetrics,
757}
758
759impl<'a> Jbig2Encoder<'a> {
760 pub fn new(config: &'a Jbig2Config) -> Self {
765 if config.refine && !config.symbol_mode {
766 panic!("Refinement requires symbol mode to be enabled.");
767 }
768
769 Self {
770 config,
771 state: EncoderState {
772 pdf_mode: false, full_headers_remaining: config.want_full_headers,
774 segment: true, use_refinement: config.refine, use_delta_encoding: true, lossy_symbol_mode_applied: false,
778 ingest_debug_lines: Vec::new(),
779 decision_debug_lines: Vec::new(),
780 },
781 global_symbols: Vec::new(),
782 symbol_usage: Vec::new(),
783 symbol_pixel_counts: Vec::new(),
784 symbol_signatures: Vec::new(),
785 symbol_page_count: Vec::new(),
786 symbol_last_page_seen: Vec::new(),
787 hash_map: FxHashMap::default(),
788 pages: Vec::new(),
789 page_symbol_indices: Vec::new(),
790 next_segment_number: 0,
791 global_dict_segment_numbers: Vec::new(),
792 metrics: EncoderMetrics::default(),
793 }
794 }
795
796 pub fn dict_only(mut self) -> Self {
797 self.state.full_headers_remaining = false;
798 self.state.pdf_mode = true;
799 self
800 }
801
802 pub fn get_page_count(&self) -> usize {
804 self.pages.len()
805 }
806
807 pub fn metrics_snapshot(&self) -> EncoderMetrics {
808 self.metrics.clone()
809 }
810
811 pub fn decision_debug_log(&self) -> String {
812 if self.state.ingest_debug_lines.is_empty() {
813 return self.state.decision_debug_lines.join("\n");
814 }
815 if self.state.decision_debug_lines.is_empty() {
816 return self.state.ingest_debug_lines.join("\n");
817 }
818
819 let mut out = String::new();
820 out.push_str(&self.state.ingest_debug_lines.join("\n"));
821 out.push('\n');
822 out.push_str(&self.state.decision_debug_lines.join("\n"));
823 out
824 }
825
826 pub fn get_symbol_stats(&self) -> String {
828 let total_symbols = self.global_symbols.len();
829 let avg_usage = if total_symbols > 0 {
830 self.symbol_usage.iter().sum::<usize>() as f32 / total_symbols as f32
831 } else {
832 0.0
833 };
834 let low_usage_count = self.symbol_usage.iter().filter(|&&u| u < 2).count();
835
836 format!(
837 "Total symbols: {}, Average usage: {:.1}, Low usage (<2): {}",
838 total_symbols, avg_usage, low_usage_count
839 )
840 }
841
842 fn compute_symbol_signature(img: &BitImage) -> SymbolSignature {
843 compute_symbol_signature_shared(img)
844 }
845
846 fn signatures_are_compatible(
847 &self,
848 candidate: SymbolSignature,
849 symbol_index: usize,
850 refine: bool,
851 ) -> bool {
852 let stored = self.symbol_signatures[symbol_index];
853 let black_tol = if refine { 12 } else { 8 };
854 let pos_tol = if refine { 2 } else { 2 };
855 let centroid_tol = if refine { 96 } else { 64 };
856
857 candidate.black.abs_diff(stored.black) <= black_tol
858 && candidate.left_col.abs_diff(stored.left_col) <= pos_tol
859 && candidate.right_col.abs_diff(stored.right_col) <= pos_tol
860 && candidate.top_row.abs_diff(stored.top_row) <= pos_tol
861 && candidate.bottom_row.abs_diff(stored.bottom_row) <= pos_tol
862 && candidate.cx_times_256.abs_diff(stored.cx_times_256) <= centroid_tol
863 && candidate.cy_times_256.abs_diff(stored.cy_times_256) <= centroid_tol
864 }
865
866 fn should_skip_symbol_candidate(width: usize, height: usize, black_pixels: usize) -> bool {
867 if width == 0 || height == 0 || black_pixels <= 1 {
868 return true;
869 }
870 if (width >= 64 && height <= 2) || (height >= 64 && width <= 2) {
871 return true;
872 }
873 if width > 256 || height > 256 {
874 return true;
875 }
876
877 let area = width.saturating_mul(height).max(1);
878 let density = black_pixels as f32 / area as f32;
879 let dense_tiny_mark = width <= 6 && height <= 10 && black_pixels <= 24;
880 if dense_tiny_mark {
881 return density < 0.01;
882 }
883 !(0.01..=0.90).contains(&density)
884 }
885
886 #[inline(always)]
887 fn should_accept_match(
888 &self,
889 err: u32,
890 dx: i32,
891 dy: i32,
892 exact_dims: bool,
893 max_err: u32,
894 ) -> (bool, bool) {
895 if err == 0 && dx == 0 && dy == 0 && exact_dims {
896 return (true, false);
897 }
898
899 if self.config.text_refine {
900 if dx.abs() <= 1 && dy.abs() <= 1 && err <= (max_err / 2).max(2) {
901 return (true, true);
902 }
903 return (false, false);
904 }
905
906 if dx.abs() <= 1 && dy == 0 {
907 return (true, false);
908 }
909
910 (false, false)
911 }
912
913 #[inline]
914 fn symbol_unify_assignment_score(result: &crate::jbig2comparator::CompareResult) -> u32 {
915 result
916 .total_err
917 .saturating_add(result.black_delta.saturating_mul(2))
918 .saturating_add(result.outside_ink_err.saturating_mul(3))
919 .saturating_add(((result.dx.abs() + result.dy.abs()) as u32).saturating_mul(3))
920 .saturating_add((result.row_profile_err + result.col_profile_err) / 24)
921 }
922
923 fn sym_unify_context_rerank_cost(candidate: &BitImage, proto: &BitImage) -> u32 {
924 let width = candidate.width.max(proto.width);
925 let height = candidate.height.max(proto.height);
926 let mut cost = 0u32;
927
928 for y in 0..height {
929 for x in 0..width {
930 let cand = candidate.get_usize(x, y);
931 let proto_bit = proto.get_usize(x, y);
932 if !cand && !proto_bit {
933 continue;
934 }
935
936 let proto_support = (-1i32..=1)
937 .flat_map(|dy| (-1i32..=1).map(move |dx| (dx, dy)))
938 .filter(|&(dx, dy)| dx != 0 || dy != 0)
939 .filter(|&(dx, dy)| {
940 let nx = x as i32 + dx;
941 let ny = y as i32 + dy;
942 nx >= 0 && ny >= 0 && proto.get_usize(nx as usize, ny as usize)
943 })
944 .count() as u32;
945 let causal_support = [(-1i32, 0i32), (-1, -1), (0, -1), (1, -1)]
946 .into_iter()
947 .filter(|&(dx, dy)| {
948 let nx = x as i32 + dx;
949 let ny = y as i32 + dy;
950 nx >= 0 && ny >= 0 && candidate.get_usize(nx as usize, ny as usize)
951 })
952 .count() as u32;
953
954 if cand == proto_bit {
955 cost = cost.saturating_add(1 + u32::from(cand && proto_support == 0));
956 } else {
957 cost = cost.saturating_add(4 + proto_support + causal_support);
958 }
959 }
960 }
961
962 cost
963 }
964
965 fn sym_unify_anchor_candidate_is_better(
966 &self,
967 candidate: SymUnifyAnchorCandidate,
968 current: SymUnifyAnchorCandidate,
969 ) -> bool {
970 (
971 !candidate.rescued_on_score,
972 std::cmp::Reverse(candidate.rerank_cost),
973 std::cmp::Reverse(candidate.score),
974 self.symbol_page_count[candidate.anchor_index],
975 self.symbol_usage[candidate.anchor_index],
976 std::cmp::Reverse(candidate.anchor_index),
977 ) > (
978 !current.rescued_on_score,
979 std::cmp::Reverse(current.rerank_cost),
980 std::cmp::Reverse(current.score),
981 self.symbol_page_count[current.anchor_index],
982 self.symbol_usage[current.anchor_index],
983 std::cmp::Reverse(current.anchor_index),
984 )
985 }
986
987 fn maybe_update_best_sym_unify_anchor_candidate(
988 &self,
989 best: &mut Option<SymUnifyAnchorCandidate>,
990 candidate_bitmap: &BitImage,
991 anchor_index: usize,
992 score: u32,
993 dx: i32,
994 dy: i32,
995 rescued_on_score: bool,
996 ) {
997 let rerank_cost = Self::sym_unify_context_rerank_cost(
998 candidate_bitmap,
999 &self.global_symbols[anchor_index],
1000 );
1001 let proposal = SymUnifyAnchorCandidate {
1002 anchor_index,
1003 score,
1004 dx,
1005 dy,
1006 rerank_cost,
1007 rescued_on_score,
1008 };
1009 if best.is_none_or(|current| self.sym_unify_anchor_candidate_is_better(proposal, current)) {
1010 *best = Some(proposal);
1011 }
1012 }
1013
1014 #[inline]
1015 fn sym_unify_anchor_ready(&self, symbol_index: usize, page_num: usize) -> bool {
1016 if self.symbol_usage[symbol_index] < 2 || self.symbol_pixel_counts[symbol_index] <= 1 {
1017 return false;
1018 }
1019
1020 let usage_ready =
1021 self.symbol_usage[symbol_index] >= self.config.sym_unify_min_class_usage.max(2);
1022 let page_span_ready =
1023 self.symbol_page_count[symbol_index] >= self.config.sym_unify_min_page_span.max(2);
1024 let recent_ready = self.symbol_last_page_seen[symbol_index]
1025 .map(|last| page_num.saturating_sub(last) <= 1)
1026 .unwrap_or(false)
1027 && self.symbol_usage[symbol_index] >= 3;
1028
1029 usage_ready || page_span_ready || recent_ready
1030 }
1031
1032 fn build_sym_unify_anchor_map(
1033 &self,
1034 page_num: usize,
1035 ) -> FxHashMap<FamilyBucketKey, Vec<usize>> {
1036 let mut anchors: FxHashMap<FamilyBucketKey, Vec<usize>> = FxHashMap::default();
1037 for symbol_index in 0..self.global_symbols.len() {
1038 if !self.sym_unify_anchor_ready(symbol_index, page_num) {
1039 continue;
1040 }
1041 let key = family_bucket_key_for_symbol(
1042 &self.global_symbols[symbol_index],
1043 &self.symbol_signatures[symbol_index],
1044 );
1045 anchors.entry(key).or_default().push(symbol_index);
1046 }
1047 for bucket in anchors.values_mut() {
1048 bucket.sort_unstable_by(|&lhs, &rhs| {
1049 self.symbol_page_count[rhs]
1050 .cmp(&self.symbol_page_count[lhs])
1051 .then_with(|| self.symbol_usage[rhs].cmp(&self.symbol_usage[lhs]))
1052 .then_with(|| self.symbol_pixel_counts[rhs].cmp(&self.symbol_pixel_counts[lhs]))
1053 .then_with(|| lhs.cmp(&rhs))
1054 });
1055 }
1056 anchors
1057 }
1058
1059 fn maybe_add_sym_unify_anchor(
1060 &self,
1061 anchors: &mut FxHashMap<FamilyBucketKey, Vec<usize>>,
1062 symbol_index: usize,
1063 page_num: usize,
1064 ) {
1065 if !self.sym_unify_anchor_ready(symbol_index, page_num) {
1066 return;
1067 }
1068 let key = family_bucket_key_for_symbol(
1069 &self.global_symbols[symbol_index],
1070 &self.symbol_signatures[symbol_index],
1071 );
1072 let bucket = anchors.entry(key).or_default();
1073 if !bucket.contains(&symbol_index) {
1074 bucket.push(symbol_index);
1075 bucket.sort_unstable_by(|&lhs, &rhs| {
1076 self.symbol_page_count[rhs]
1077 .cmp(&self.symbol_page_count[lhs])
1078 .then_with(|| self.symbol_usage[rhs].cmp(&self.symbol_usage[lhs]))
1079 .then_with(|| self.symbol_pixel_counts[rhs].cmp(&self.symbol_pixel_counts[lhs]))
1080 .then_with(|| lhs.cmp(&rhs))
1081 });
1082 }
1083 }
1084
1085 fn residual_symbol_matches_anchor(
1086 &self,
1087 residual_index: usize,
1088 anchor_index: usize,
1089 comparator: &mut Comparator,
1090 ) -> bool {
1091 matches!(
1092 self.residual_symbol_anchor_decision(residual_index, anchor_index, comparator),
1093 SymUnifyAnchorDecision::Accept { .. }
1094 )
1095 }
1096
1097 fn residual_symbol_anchor_decision(
1098 &self,
1099 residual_index: usize,
1100 anchor_index: usize,
1101 comparator: &mut Comparator,
1102 ) -> SymUnifyAnchorDecision {
1103 let candidate = &self.global_symbols[residual_index];
1104 let proto = &self.global_symbols[anchor_index];
1105 if candidate.width.abs_diff(proto.width) > 1 || candidate.height.abs_diff(proto.height) > 1
1106 {
1107 return SymUnifyAnchorDecision::RejectDim;
1108 }
1109
1110 let strong_anchor = self.symbol_usage[anchor_index] >= SYM_UNIFY_STRONG_ANCHOR_MIN_USAGE
1111 || self.symbol_page_count[anchor_index] >= SYM_UNIFY_STRONG_ANCHOR_MIN_PAGE_SPAN;
1112 let exact_dims = candidate.width == proto.width && candidate.height == proto.height;
1113 let area = candidate
1114 .width
1115 .max(proto.width)
1116 .saturating_mul(candidate.height.max(proto.height));
1117 let pixel_delta_limit = (area / 10).clamp(4, 16) + usize::from(strong_anchor);
1118 let black_delta = self.symbol_pixel_counts[anchor_index]
1119 .abs_diff(self.symbol_pixel_counts[residual_index]);
1120 if black_delta > pixel_delta_limit {
1121 return SymUnifyAnchorDecision::RejectPixelDelta;
1122 }
1123 let signature_compatible = family_signatures_are_compatible(
1124 self.symbol_signatures[residual_index],
1125 self.symbol_signatures[anchor_index],
1126 self.symbol_pixel_counts[residual_index],
1127 self.symbol_pixel_counts[anchor_index],
1128 );
1129 if !signature_compatible {
1130 let soft_signature_black_delta_limit = 4 + usize::from(strong_anchor);
1131 if !exact_dims || black_delta > soft_signature_black_delta_limit {
1132 return SymUnifyAnchorDecision::RejectSignature;
1133 }
1134 }
1135
1136 let overlap_limit = self
1137 .config
1138 .sym_unify_max_err
1139 .max(4)
1140 .saturating_add(2)
1141 .saturating_add(u32::from(strong_anchor))
1142 .min(15);
1143 let Some(overlap) = comparator.compare_overlap_only(candidate, proto, overlap_limit) else {
1144 return SymUnifyAnchorDecision::RejectOverlap;
1145 };
1146 if overlap.dx.abs() > self.config.sym_unify_max_dx.max(0)
1147 || overlap.dy.abs() > self.config.sym_unify_max_dy.max(0)
1148 || overlap.overlap_err > overlap_limit
1149 || overlap.black_delta > pixel_delta_limit as u32
1150 {
1151 return SymUnifyAnchorDecision::RejectOverlap;
1152 }
1153
1154 let compare_max_err = self
1155 .config
1156 .sym_unify_max_err
1157 .max(4)
1158 .saturating_add(u32::from(strong_anchor));
1159 let Some(result) = comparator.compare_for_symbol_unify(
1160 candidate,
1161 proto,
1162 compare_max_err,
1163 self.config.sym_unify_max_dx.max(0),
1164 self.config.sym_unify_max_dy.max(0),
1165 ) else {
1166 return SymUnifyAnchorDecision::RejectCompare;
1167 };
1168
1169 let outside_limit =
1170 self.config.sym_unify_max_border_outside_ink.min(1) + u32::from(strong_anchor);
1171 if result.outside_ink_err > outside_limit {
1172 return SymUnifyAnchorDecision::RejectOutsideInk;
1173 }
1174
1175 let score = Self::symbol_unify_assignment_score(&result);
1176 let score_limit = self.config.sym_unify_class_accept_limit + u32::from(strong_anchor);
1177 if score > score_limit {
1178 return SymUnifyAnchorDecision::RejectScore {
1179 score,
1180 limit: score_limit,
1181 dx: result.dx,
1182 dy: result.dy,
1183 };
1184 }
1185
1186 SymUnifyAnchorDecision::Accept {
1187 score,
1188 dx: result.dx,
1189 dy: result.dy,
1190 }
1191 }
1192
1193 fn residual_symbol_accept_with_dim_limit(
1194 &self,
1195 residual_index: usize,
1196 anchor_index: usize,
1197 comparator: &mut Comparator,
1198 dim_limit: usize,
1199 ) -> bool {
1200 let candidate = &self.global_symbols[residual_index];
1201 let proto = &self.global_symbols[anchor_index];
1202 if candidate.width.abs_diff(proto.width) > dim_limit
1203 || candidate.height.abs_diff(proto.height) > dim_limit
1204 {
1205 return false;
1206 }
1207
1208 let strong_anchor = self.symbol_usage[anchor_index] >= SYM_UNIFY_STRONG_ANCHOR_MIN_USAGE
1209 || self.symbol_page_count[anchor_index] >= SYM_UNIFY_STRONG_ANCHOR_MIN_PAGE_SPAN;
1210 let exact_dims = candidate.width == proto.width && candidate.height == proto.height;
1211 let area = candidate
1212 .width
1213 .max(proto.width)
1214 .saturating_mul(candidate.height.max(proto.height));
1215 let pixel_delta_limit = (area / 10).clamp(4, 16) + usize::from(strong_anchor);
1216 let black_delta = self.symbol_pixel_counts[anchor_index]
1217 .abs_diff(self.symbol_pixel_counts[residual_index]);
1218 if black_delta > pixel_delta_limit {
1219 return false;
1220 }
1221
1222 let signature_compatible = family_signatures_are_compatible(
1223 self.symbol_signatures[residual_index],
1224 self.symbol_signatures[anchor_index],
1225 self.symbol_pixel_counts[residual_index],
1226 self.symbol_pixel_counts[anchor_index],
1227 );
1228 if !signature_compatible {
1229 let soft_signature_black_delta_limit = 4 + usize::from(strong_anchor);
1230 if !exact_dims || black_delta > soft_signature_black_delta_limit {
1231 return false;
1232 }
1233 }
1234
1235 let overlap_limit = self
1236 .config
1237 .sym_unify_max_err
1238 .max(4)
1239 .saturating_add(2)
1240 .saturating_add(u32::from(strong_anchor))
1241 .min(15);
1242 let Some(overlap) = comparator.compare_overlap_only(candidate, proto, overlap_limit) else {
1243 return false;
1244 };
1245 if overlap.dx.abs() > self.config.sym_unify_max_dx.max(0)
1246 || overlap.dy.abs() > self.config.sym_unify_max_dy.max(0)
1247 || overlap.overlap_err > overlap_limit
1248 || overlap.black_delta > pixel_delta_limit as u32
1249 {
1250 return false;
1251 }
1252
1253 let compare_max_err = self
1254 .config
1255 .sym_unify_max_err
1256 .max(4)
1257 .saturating_add(u32::from(strong_anchor));
1258 let Some(result) = comparator.compare_for_symbol_unify(
1259 candidate,
1260 proto,
1261 compare_max_err,
1262 self.config.sym_unify_max_dx.max(0),
1263 self.config.sym_unify_max_dy.max(0),
1264 ) else {
1265 return false;
1266 };
1267
1268 let outside_limit =
1269 self.config.sym_unify_max_border_outside_ink.min(1) + u32::from(strong_anchor);
1270 if result.outside_ink_err > outside_limit {
1271 return false;
1272 }
1273
1274 let score = Self::symbol_unify_assignment_score(&result);
1275 let score_limit = self.config.sym_unify_class_accept_limit + u32::from(strong_anchor);
1276 score <= score_limit
1277 }
1278
1279 fn residual_symbol_accept_without_overlap_prescreen(
1280 &self,
1281 residual_index: usize,
1282 anchor_index: usize,
1283 comparator: &mut Comparator,
1284 ) -> bool {
1285 matches!(
1286 self.residual_symbol_anchor_decision_without_overlap_prescreen(
1287 residual_index,
1288 anchor_index,
1289 comparator,
1290 ),
1291 SymUnifyAnchorDecision::Accept { .. }
1292 )
1293 }
1294
1295 fn residual_symbol_anchor_decision_without_overlap_prescreen(
1296 &self,
1297 residual_index: usize,
1298 anchor_index: usize,
1299 comparator: &mut Comparator,
1300 ) -> SymUnifyAnchorDecision {
1301 let candidate = &self.global_symbols[residual_index];
1302 let proto = &self.global_symbols[anchor_index];
1303 if candidate.width.abs_diff(proto.width) > 1 || candidate.height.abs_diff(proto.height) > 1
1304 {
1305 return SymUnifyAnchorDecision::RejectDim;
1306 }
1307
1308 let strong_anchor = self.symbol_usage[anchor_index] >= SYM_UNIFY_STRONG_ANCHOR_MIN_USAGE
1309 || self.symbol_page_count[anchor_index] >= SYM_UNIFY_STRONG_ANCHOR_MIN_PAGE_SPAN;
1310 let exact_dims = candidate.width == proto.width && candidate.height == proto.height;
1311 let area = candidate
1312 .width
1313 .max(proto.width)
1314 .saturating_mul(candidate.height.max(proto.height));
1315 let pixel_delta_limit = (area / 10).clamp(4, 16) + usize::from(strong_anchor);
1316 let black_delta = self.symbol_pixel_counts[anchor_index]
1317 .abs_diff(self.symbol_pixel_counts[residual_index]);
1318 if black_delta > pixel_delta_limit {
1319 return SymUnifyAnchorDecision::RejectPixelDelta;
1320 }
1321
1322 let signature_compatible = family_signatures_are_compatible(
1323 self.symbol_signatures[residual_index],
1324 self.symbol_signatures[anchor_index],
1325 self.symbol_pixel_counts[residual_index],
1326 self.symbol_pixel_counts[anchor_index],
1327 );
1328 if !signature_compatible {
1329 let soft_signature_black_delta_limit = 4 + usize::from(strong_anchor);
1330 if !exact_dims || black_delta > soft_signature_black_delta_limit {
1331 return SymUnifyAnchorDecision::RejectSignature;
1332 }
1333 }
1334
1335 let compare_max_err = self
1336 .config
1337 .sym_unify_max_err
1338 .max(4)
1339 .saturating_add(u32::from(strong_anchor));
1340 let Some(result) = comparator.compare_for_symbol_unify(
1341 candidate,
1342 proto,
1343 compare_max_err,
1344 self.config.sym_unify_max_dx.max(0),
1345 self.config.sym_unify_max_dy.max(0),
1346 ) else {
1347 return SymUnifyAnchorDecision::RejectCompare;
1348 };
1349
1350 let outside_limit =
1351 self.config.sym_unify_max_border_outside_ink.min(1) + u32::from(strong_anchor);
1352 if result.outside_ink_err > outside_limit {
1353 return SymUnifyAnchorDecision::RejectOutsideInk;
1354 }
1355
1356 let score = Self::symbol_unify_assignment_score(&result);
1357 let score_limit = self.config.sym_unify_class_accept_limit + u32::from(strong_anchor);
1358 if score > score_limit {
1359 return SymUnifyAnchorDecision::RejectScore {
1360 score,
1361 limit: score_limit,
1362 dx: result.dx,
1363 dy: result.dy,
1364 };
1365 }
1366
1367 SymUnifyAnchorDecision::Accept {
1368 score,
1369 dx: result.dx,
1370 dy: result.dy,
1371 }
1372 }
1373
1374 #[inline(always)]
1375 fn evaluate_symbol_match(
1376 &mut self,
1377 candidate: &BitImage,
1378 candidate_sig: SymbolSignature,
1379 candidate_pixels: usize,
1380 symbol_index: usize,
1381 comparator: &mut Comparator,
1382 max_err: u32,
1383 ) -> Option<(u32, i32, i32, bool)> {
1384 let proto = &self.global_symbols[symbol_index];
1385 let dim_limit = if self.config.text_refine { 2 } else { 0 };
1386 if (candidate.width as i32 - proto.width as i32).unsigned_abs() > dim_limit
1387 || (candidate.height as i32 - proto.height as i32).unsigned_abs() > dim_limit
1388 {
1389 return None;
1390 }
1391 if self.symbol_pixel_counts[symbol_index].abs_diff(candidate_pixels)
1392 > max_err as usize + if self.config.text_refine { 8 } else { 6 }
1393 {
1394 return None;
1395 }
1396 if !self.signatures_are_compatible(candidate_sig, symbol_index, self.config.text_refine) {
1397 self.metrics.symbol_stats.signature_rejects += 1;
1398 return None;
1399 }
1400
1401 self.metrics.symbol_stats.comparator_calls += 1;
1402 let (err, dx, dy) = if self.config.text_refine {
1403 comparator
1404 .compare_for_refine_family(candidate, proto, max_err, 1, 1)
1405 .map(|r| (r.total_err, r.dx, r.dy))?
1406 } else {
1407 comparator
1408 .compare_for_refine_family(candidate, proto, max_err, 1, 0)
1409 .map(|r| (r.total_err, r.dx, r.dy))?
1410 };
1411 self.metrics.symbol_stats.comparator_hits += 1;
1412
1413 let exact_dims = candidate.width == proto.width && candidate.height == proto.height;
1414 let (accept, needs_refinement) = self.should_accept_match(err, dx, dy, exact_dims, max_err);
1415 if !accept {
1416 return None;
1417 }
1418
1419 if needs_refinement {
1420 self.metrics.symbol_stats.refined_hits += 1;
1421 } else if err == 0 && dx == 0 && dy == 0 && exact_dims {
1422 self.metrics.symbol_stats.exact_hits += 1;
1423 }
1424
1425 Some((err, dx, dy, needs_refinement))
1426 }
1427
1428 #[inline(always)]
1429 fn evaluate_symbol_unify_anchor_match(
1430 &mut self,
1431 candidate: &BitImage,
1432 candidate_sig: SymbolSignature,
1433 candidate_pixels: usize,
1434 symbol_index: usize,
1435 comparator: &mut Comparator,
1436 ) -> SymUnifyAnchorDecision {
1437 let proto = &self.global_symbols[symbol_index];
1438 if candidate.width.abs_diff(proto.width) > 1 || candidate.height.abs_diff(proto.height) > 1
1439 {
1440 return SymUnifyAnchorDecision::RejectDim;
1441 }
1442
1443 let area = candidate
1444 .width
1445 .max(proto.width)
1446 .saturating_mul(candidate.height.max(proto.height));
1447 let strong_anchor = self.symbol_usage[symbol_index] >= SYM_UNIFY_STRONG_ANCHOR_MIN_USAGE
1448 || self.symbol_page_count[symbol_index] >= SYM_UNIFY_STRONG_ANCHOR_MIN_PAGE_SPAN;
1449 let pixel_delta_limit = (area / 10).clamp(4, 16) + usize::from(strong_anchor);
1450 if self.symbol_pixel_counts[symbol_index].abs_diff(candidate_pixels) > pixel_delta_limit {
1451 return SymUnifyAnchorDecision::RejectPixelDelta;
1452 }
1453
1454 if !family_signatures_are_compatible(
1455 candidate_sig,
1456 self.symbol_signatures[symbol_index],
1457 candidate_pixels,
1458 self.symbol_pixel_counts[symbol_index],
1459 ) {
1460 self.metrics.symbol_stats.signature_rejects += 1;
1461 return SymUnifyAnchorDecision::RejectSignature;
1462 }
1463
1464 let overlap_limit = self
1465 .config
1466 .sym_unify_max_err
1467 .max(4)
1468 .saturating_add(2)
1469 .saturating_add(u32::from(strong_anchor))
1470 .min(15);
1471 let Some(overlap) = comparator.compare_overlap_only(candidate, proto, overlap_limit) else {
1472 return SymUnifyAnchorDecision::RejectOverlap;
1473 };
1474 if overlap.dx.abs() > self.config.sym_unify_max_dx.max(0)
1475 || overlap.dy.abs() > self.config.sym_unify_max_dy.max(0)
1476 || overlap.overlap_err > overlap_limit
1477 || overlap.black_delta > pixel_delta_limit as u32
1478 {
1479 return SymUnifyAnchorDecision::RejectOverlap;
1480 }
1481
1482 self.metrics.symbol_stats.comparator_calls += 1;
1483 let compare_max_err = self
1484 .config
1485 .sym_unify_max_err
1486 .max(4)
1487 .saturating_add(u32::from(strong_anchor));
1488 let Some(result) = comparator.compare_for_symbol_unify(
1489 candidate,
1490 proto,
1491 compare_max_err,
1492 self.config.sym_unify_max_dx.max(0),
1493 self.config.sym_unify_max_dy.max(0),
1494 ) else {
1495 return SymUnifyAnchorDecision::RejectCompare;
1496 };
1497 self.metrics.symbol_stats.comparator_hits += 1;
1498
1499 let score = Self::symbol_unify_assignment_score(&result);
1500 let outside_limit =
1501 self.config.sym_unify_max_border_outside_ink.min(1) + u32::from(strong_anchor);
1502 let score_limit = self.config.sym_unify_class_accept_limit + u32::from(strong_anchor);
1503 if result.outside_ink_err > outside_limit {
1504 return SymUnifyAnchorDecision::RejectOutsideInk;
1505 }
1506 if score > score_limit {
1507 return SymUnifyAnchorDecision::RejectScore {
1508 score,
1509 limit: score_limit,
1510 dx: result.dx,
1511 dy: result.dy,
1512 };
1513 }
1514
1515 SymUnifyAnchorDecision::Accept {
1516 score,
1517 dx: result.dx,
1518 dy: result.dy,
1519 }
1520 }
1521
1522 fn estimate_local_symbol_gain(&self, page: &PageData, symbol_index: usize) -> i64 {
1523 let uses = page
1524 .symbol_instances
1525 .iter()
1526 .filter(|instance| instance.symbol_index == symbol_index)
1527 .count() as i64;
1528 let symbol = &self.global_symbols[symbol_index];
1529 let area = (symbol.width * symbol.height) as i64;
1530 let dict_cost = 24 + (area / 8);
1531 let saved_per_use = (area / 10).max(2);
1532 (uses * saved_per_use) - dict_cost
1533 }
1534
1535 fn estimate_global_symbol_gain(&self, symbol_index: usize) -> i64 {
1536 let uses = self.symbol_usage[symbol_index] as i64;
1537 let page_span = self.symbol_page_count[symbol_index] as i64;
1538 let symbol = &self.global_symbols[symbol_index];
1539 let area = (symbol.width * symbol.height) as i64;
1540 let dict_cost = 24 + (area / 8);
1541 let id_savings = ((uses - page_span).max(0)) * 2;
1542 let reuse_value = (uses * (area / 12).max(2)) + (page_span * 3);
1543 reuse_value + id_savings - dict_cost
1544 }
1545
1546 fn should_keep_text_local_symbol(&self, page: &PageData, symbol_index: usize) -> bool {
1547 let _ = (page, symbol_index);
1548 false
1549 }
1550
1551 fn choose_cluster_prototype(&self, members: &[usize]) -> usize {
1552 if members.len() <= 1 || !self.config.text_refine {
1553 return *members
1554 .iter()
1555 .max_by(|&&lhs, &&rhs| {
1556 self.symbol_usage[lhs]
1557 .cmp(&self.symbol_usage[rhs])
1558 .then_with(|| {
1559 self.symbol_pixel_counts[lhs].cmp(&self.symbol_pixel_counts[rhs])
1560 })
1561 .then_with(|| rhs.cmp(&lhs))
1562 })
1563 .unwrap();
1564 }
1565
1566 let mut comparator = Comparator::default();
1567 let mut best_idx = members[0];
1568 let mut best_cost = u64::MAX;
1569
1570 for &candidate in members {
1571 let candidate_symbol = &self.global_symbols[candidate];
1572 let mut total_cost = 0u64;
1573 for &other in members {
1574 if candidate == other {
1575 continue;
1576 }
1577 let other_symbol = &self.global_symbols[other];
1578 let area = candidate_symbol.width.max(other_symbol.width)
1579 * candidate_symbol.height.max(other_symbol.height);
1580 let max_err = ((self.symbol_pixel_counts[candidate]
1581 .max(self.symbol_pixel_counts[other]) as f32
1582 * 0.10) as u32)
1583 .max((area / self.config.match_tolerance.max(1) as usize) as u32)
1584 .clamp(3, 20);
1585
1586 match comparator.compare_for_refine_family(
1587 other_symbol,
1588 candidate_symbol,
1589 max_err,
1590 2,
1591 1,
1592 ) {
1593 Some(result) => {
1594 let err = result.total_err;
1595 let dx = result.dx;
1596 let dy = result.dy;
1597 let refinement_penalty = err as u64 + ((dx.abs() + dy.abs()) as u64 * 2);
1598 total_cost += refinement_penalty * self.symbol_usage[other] as u64;
1599 }
1600 None => total_cost += 1_000_000,
1601 }
1602 }
1603
1604 if total_cost < best_cost
1605 || (total_cost == best_cost
1606 && (
1607 self.symbol_usage[candidate],
1608 self.symbol_pixel_counts[candidate],
1609 ) > (
1610 self.symbol_usage[best_idx],
1611 self.symbol_pixel_counts[best_idx],
1612 ))
1613 {
1614 best_cost = total_cost;
1615 best_idx = candidate;
1616 }
1617 }
1618
1619 best_idx
1620 }
1621
1622 fn note_symbol_page(&mut self, symbol_index: usize, page_num: usize) {
1623 if self.symbol_last_page_seen[symbol_index] != Some(page_num) {
1624 self.symbol_last_page_seen[symbol_index] = Some(page_num);
1625 self.symbol_page_count[symbol_index] += 1;
1626 self.page_symbol_indices[page_num].push(symbol_index);
1627 }
1628 }
1629
1630 fn push_symbol(&mut self, symbol: BitImage, pixel_count: usize, page_num: usize) -> usize {
1631 let idx = self.global_symbols.len();
1632 self.symbol_signatures
1633 .push(Self::compute_symbol_signature(&symbol));
1634 self.symbol_pixel_counts.push(pixel_count);
1635 self.global_symbols.push(symbol);
1636 self.symbol_usage.push(1);
1637 self.symbol_page_count.push(0);
1638 self.symbol_last_page_seen.push(None);
1639 self.note_symbol_page(idx, page_num);
1640 idx
1641 }
1642
1643 fn rebuild_symbol_metadata(&mut self) {
1644 self.symbol_usage = vec![0; self.global_symbols.len()];
1645 self.symbol_page_count = vec![0; self.global_symbols.len()];
1646 self.symbol_last_page_seen = vec![None; self.global_symbols.len()];
1647 self.page_symbol_indices = vec![Vec::new(); self.pages.len()];
1648 self.symbol_pixel_counts = self
1649 .global_symbols
1650 .iter()
1651 .map(BitImage::count_ones)
1652 .collect();
1653 self.symbol_signatures = self
1654 .global_symbols
1655 .iter()
1656 .map(Self::compute_symbol_signature)
1657 .collect();
1658
1659 for page_num in 0..self.pages.len() {
1660 let instance_indices: Vec<usize> = self.pages[page_num]
1661 .symbol_instances
1662 .iter()
1663 .map(|inst| inst.symbol_index)
1664 .collect();
1665 for symbol_index in instance_indices {
1666 self.symbol_usage[symbol_index] += 1;
1667 self.note_symbol_page(symbol_index, page_num);
1668 }
1669 }
1670 }
1671
1672 fn rebuild_hash_map(&mut self) {
1673 self.hash_map.clear();
1674 self.hash_map.reserve(self.global_symbols.len());
1675 for (idx, symbol) in self.global_symbols.iter().enumerate() {
1676 let key = hash_key(symbol);
1677 self.hash_map.entry(key).or_default().push(idx);
1678 }
1679 }
1680
1681 fn build_symbol_unify_classes(&mut self) -> Vec<UnifiedClass> {
1682 let diagnostics_enabled = encoder_diagnostics_enabled();
1683 let context_model =
1684 build_symbol_context_model(&self.pages, &self.global_symbols, &self.symbol_signatures);
1685 let (classes, diagnostics) =
1686 crate::jbig2unify::build_symbol_unify_classes(SymbolUnifyInputs {
1687 config: self.config,
1688 global_symbols: &self.global_symbols,
1689 symbol_usage: &self.symbol_usage,
1690 symbol_page_count: &self.symbol_page_count,
1691 symbol_signatures: &self.symbol_signatures,
1692 symbol_pixel_counts: &self.symbol_pixel_counts,
1693 context_model: Some(&context_model),
1694 collect_diagnostics: diagnostics_enabled,
1695 });
1696 if diagnostics_enabled {
1697 self.state.decision_debug_lines.extend(diagnostics.lines);
1698 }
1699 classes
1700 }
1701
1702 fn compact_symbol_table_after_remap(&mut self) {
1703 let mut used = vec![false; self.global_symbols.len()];
1704 for page in &self.pages {
1705 for instance in &page.symbol_instances {
1706 if instance.symbol_index < used.len() {
1707 used[instance.symbol_index] = true;
1708 }
1709 }
1710 }
1711
1712 let old_symbols = self.global_symbols.clone();
1713 let mut new_index = vec![usize::MAX; old_symbols.len()];
1714 let mut new_symbols = Vec::new();
1715
1716 for (old_index, symbol) in old_symbols.into_iter().enumerate() {
1717 if used[old_index] {
1718 new_index[old_index] = new_symbols.len();
1719 new_symbols.push(symbol);
1720 }
1721 }
1722
1723 for page in &mut self.pages {
1724 for instance in &mut page.symbol_instances {
1725 instance.symbol_index = new_index[instance.symbol_index];
1726 }
1727 }
1728
1729 self.global_symbols = new_symbols;
1730 self.rebuild_symbol_metadata();
1731 self.rebuild_hash_map();
1732 }
1733
1734 fn alias_local_symbols_to_globals(&mut self) -> Result<()> {
1735 if self.pages.len() <= 1 || self.global_symbols.is_empty() {
1736 return Ok(());
1737 }
1738 let text_refine = self.config.text_refine;
1739 let refine_enabled = self.config.refine;
1740 let global_indices: Vec<usize> = self
1741 .global_symbols
1742 .iter()
1743 .enumerate()
1744 .filter(|(i, _)| self.symbol_page_count[*i] > 1)
1745 .map(|(i, _)| i)
1746 .collect();
1747 if global_indices.is_empty() {
1748 return Ok(());
1749 }
1750
1751 let mut global_bucket_map: FxHashMap<HashKey, Vec<usize>> =
1752 FxHashMap::with_capacity_and_hasher(global_indices.len(), Default::default());
1753 for &symbol_index in &global_indices {
1754 global_bucket_map
1755 .entry(hash_key(&self.global_symbols[symbol_index]))
1756 .or_default()
1757 .push(symbol_index);
1758 }
1759
1760 let mut comparator = Comparator::default();
1761 let mut changed = false;
1762 let mut aliased_symbols = 0usize;
1763 let mut aliased_instances = 0usize;
1764 let mut alias_samples = Vec::new();
1765 for page in &mut self.pages {
1766 let mut page_local_symbols: FxHashSet<usize> =
1767 FxHashSet::with_capacity_and_hasher(256, Default::default());
1768 for instance in &page.symbol_instances {
1769 if self.symbol_page_count[instance.symbol_index] <= 1 {
1770 page_local_symbols.insert(instance.symbol_index);
1771 }
1772 }
1773
1774 for local_symbol_index in page_local_symbols {
1775 let local_symbol = &self.global_symbols[local_symbol_index];
1776 let local_sig = self.symbol_signatures[local_symbol_index];
1777 let pixel_count = self.symbol_pixel_counts[local_symbol_index];
1778 let area = (local_symbol.width * local_symbol.height) as u32;
1779 let max_err = if self.config.text_refine {
1780 (area / self.config.match_tolerance.max(1)).max(3)
1781 } else {
1782 ((area as f32 * 0.05) as u32).max(2)
1783 };
1784 let dim_range: u64 = if self.config.text_refine || self.config.refine {
1785 2
1786 } else {
1787 0
1788 };
1789
1790 let mut best_match: Option<(usize, u32, i32, i32, bool)> = None;
1791 let h = local_symbol.height as u64;
1792 let w = local_symbol.width as u64;
1793 'bucket_search: for dh_off in 0..=(dim_range * 2) {
1794 let dh = h.wrapping_add(dh_off).wrapping_sub(dim_range);
1795 if dh >= 10_000 {
1796 continue;
1797 }
1798 for dw_off in 0..=(dim_range * 2) {
1799 let dw = w.wrapping_add(dw_off).wrapping_sub(dim_range);
1800 if dw >= 10_000 {
1801 continue;
1802 }
1803 let bucket_key = HashKey(dh * 10_000 + dw);
1804 let Some(bucket) = global_bucket_map.get(&bucket_key) else {
1805 continue;
1806 };
1807 for &global_symbol_index in bucket {
1808 if self.symbol_pixel_counts[global_symbol_index].abs_diff(pixel_count)
1809 > max_err as usize + if self.config.text_refine { 8 } else { 6 }
1810 {
1811 continue;
1812 }
1813 let stored = self.symbol_signatures[global_symbol_index];
1814 let black_tol = if text_refine { 12 } else { 8 };
1815 let pos_tol = 2;
1816 let centroid_tol = if text_refine { 96 } else { 64 };
1817 if local_sig.black.abs_diff(stored.black) > black_tol
1818 || local_sig.left_col.abs_diff(stored.left_col) > pos_tol
1819 || local_sig.right_col.abs_diff(stored.right_col) > pos_tol
1820 || local_sig.top_row.abs_diff(stored.top_row) > pos_tol
1821 || local_sig.bottom_row.abs_diff(stored.bottom_row) > pos_tol
1822 || local_sig.cx_times_256.abs_diff(stored.cx_times_256)
1823 > centroid_tol
1824 || local_sig.cy_times_256.abs_diff(stored.cy_times_256)
1825 > centroid_tol
1826 {
1827 continue;
1828 }
1829 let max_dx = if text_refine { 1 } else { 1 };
1830 let max_dy = if text_refine { 1 } else { 0 };
1831 let Some(result) = comparator.compare_for_refine_family(
1832 local_symbol,
1833 &self.global_symbols[global_symbol_index],
1834 max_err,
1835 max_dx,
1836 max_dy,
1837 ) else {
1838 continue;
1839 };
1840 let err = result.total_err;
1841 let dx = result.dx;
1842 let dy = result.dy;
1843 let exact_dims = local_symbol.width
1844 == self.global_symbols[global_symbol_index].width
1845 && local_symbol.height
1846 == self.global_symbols[global_symbol_index].height;
1847 let (accept, needs_refinement) =
1848 if err == 0 && dx == 0 && dy == 0 && exact_dims {
1849 (true, false)
1850 } else if text_refine {
1851 (
1852 dx.abs() <= 1
1853 && dy.abs() <= 1
1854 && err <= (max_err / 2).max(2),
1855 true,
1856 )
1857 } else if dx.abs() <= 1 && dy == 0 {
1858 (true, false)
1859 } else {
1860 (false, false)
1861 };
1862 if !accept {
1863 continue;
1864 }
1865 best_match = Some((
1866 global_symbol_index,
1867 err,
1868 dx,
1869 dy,
1870 needs_refinement && (text_refine || refine_enabled),
1871 ));
1872 if err == 0 && dx == 0 && dy == 0 {
1873 break 'bucket_search;
1874 }
1875 }
1876 }
1877 }
1878
1879 let Some((global_symbol_index, _err, dx, dy, needs_refinement)) = best_match else {
1880 continue;
1881 };
1882 aliased_symbols += 1;
1883 for instance in &mut page.symbol_instances {
1884 if instance.symbol_index == local_symbol_index {
1885 instance.symbol_index = global_symbol_index;
1886 instance.needs_refinement = needs_refinement;
1887 instance.refinement_dx = if needs_refinement { dx } else { 0 };
1888 instance.refinement_dy = if needs_refinement { dy } else { 0 };
1889 changed = true;
1890 aliased_instances += 1;
1891 }
1892 }
1893 if alias_samples.len() < 64 {
1894 alias_samples.push(format!(
1895 "alias local->global: local={} global={} dx={} dy={} refine={}",
1896 local_symbol_index, global_symbol_index, dx, dy, needs_refinement
1897 ));
1898 }
1899 }
1900 }
1901
1902 if encoder_diagnostics_enabled() {
1903 if changed {
1904 self.state.decision_debug_lines.push(format!(
1905 "alias pass: {} local symbols / {} instances remapped onto globals",
1906 aliased_symbols, aliased_instances
1907 ));
1908 self.state.decision_debug_lines.extend(alias_samples);
1909 } else {
1910 self.state
1911 .decision_debug_lines
1912 .push("alias pass: no local symbols remapped onto globals".to_string());
1913 }
1914 }
1915 if changed {
1916 self.compact_symbol_table_after_remap();
1917 }
1918
1919 Ok(())
1920 }
1921
1922 fn apply_symbol_unify(&mut self) -> Result<()> {
1923 if !self.config.uses_symbol_unify() || self.state.lossy_symbol_mode_applied {
1924 return Ok(());
1925 }
1926
1927 let diagnostics_enabled = encoder_diagnostics_enabled();
1928 let before_exported = self.global_symbols.len();
1929 let before_estimated_dict_bytes =
1930 symbol_dictionary_entries_bytes(self.global_symbols.iter());
1931 let classes = self.build_symbol_unify_classes();
1932 if classes.is_empty() {
1933 if diagnostics_enabled {
1934 self.state
1935 .decision_debug_lines
1936 .push("sym_unify: no eligible classes".to_string());
1937 }
1938 self.state.lossy_symbol_mode_applied = true;
1939 return Ok(());
1940 }
1941
1942 let mut remap: Vec<usize> = (0..self.global_symbols.len()).collect();
1943 let mut refinement_remap: Vec<Option<RefinementPlan>> =
1944 vec![None; self.global_symbols.len()];
1945 let mut unified_members = 0usize;
1946 let mut border_unified_members = 0usize;
1947 let mut refined_members = 0usize;
1948 let mut refinement_subclusters = 0usize;
1949 let mut retained_border_members = 0usize;
1950 let mut retained_outlier_members = 0usize;
1951
1952 if diagnostics_enabled {
1953 self.state.decision_debug_lines.push(format!(
1954 "sym_unify: {} classes eligible across {} symbols",
1955 classes.len(),
1956 self.global_symbols.len()
1957 ));
1958
1959 for class in classes.iter().take(64) {
1960 self.state.decision_debug_lines.push(format!(
1961 "sym_unify class: representative={} class_size={} core_size={} unified={} border_unified={} refined_subclusters={} refined_members={} retained_border={} retained_outliers={} total_usage={} page_span={} representative_score={} estimated_gain={} subclusters={}",
1962 class.representative_index,
1963 class.class_size,
1964 class.dense_core_size,
1965 class.core_members.len(),
1966 class.border_members.len(),
1967 class.refinement_subclusters.len(),
1968 class.refinement_subclusters
1969 .iter()
1970 .map(|subcluster| subcluster.refined_members.len())
1971 .sum::<usize>(),
1972 class.retained_border_members,
1973 class.retained_outlier_members,
1974 class.total_usage,
1975 class.page_span,
1976 class.representative_score,
1977 class.estimated_gain,
1978 class.candidate_subclusters
1979 ));
1980 }
1981 }
1982
1983 for class in &classes {
1984 retained_border_members += class.retained_border_members;
1985 retained_outlier_members += class.retained_outlier_members;
1986 for member in &class.core_members {
1987 remap[member.member_index] = class.representative_index;
1988 unified_members += 1;
1989 }
1990 for member in &class.border_members {
1991 remap[member.member_index] = class.representative_index;
1992 border_unified_members += 1;
1993 }
1994 refinement_subclusters += class.refinement_subclusters.len();
1995 for subcluster in &class.refinement_subclusters {
1996 for member in &subcluster.refined_members {
1997 refinement_remap[member.member_index] = Some(RefinementPlan {
1998 prototype_input_index: subcluster.prototype_index,
1999 refinement_dx: member.dx,
2000 refinement_dy: member.dy,
2001 });
2002 refined_members += 1;
2003 }
2004 }
2005 }
2006
2007 for page in &mut self.pages {
2008 for instance in &mut page.symbol_instances {
2009 let original_index = instance.symbol_index;
2010 if let Some(refinement) = refinement_remap[original_index] {
2011 instance.symbol_index = refinement.prototype_input_index;
2012 instance.needs_refinement = true;
2013 instance.refinement_dx = refinement.refinement_dx;
2014 instance.refinement_dy = refinement.refinement_dy;
2015 } else {
2016 instance.symbol_index = remap[original_index];
2017 instance.needs_refinement = false;
2018 instance.refinement_dx = 0;
2019 instance.refinement_dy = 0;
2020 }
2021 }
2022 }
2023
2024 self.compact_symbol_table_after_remap();
2025 if diagnostics_enabled {
2026 let after_estimated_dict_bytes =
2027 symbol_dictionary_entries_bytes(self.global_symbols.iter());
2028 self.state.decision_debug_lines.push(format!(
2029 "sym_unify export summary: before={} after={} removed={} dict_bytes_before={} dict_bytes_after={} dict_bytes_saved={} unified_members={} border_unified_members={} refined_members={} refinement_subclusters={} retained_border_members={} retained_outlier_members={}",
2030 before_exported,
2031 self.global_symbols.len(),
2032 before_exported.saturating_sub(self.global_symbols.len()),
2033 before_estimated_dict_bytes,
2034 after_estimated_dict_bytes,
2035 before_estimated_dict_bytes.saturating_sub(after_estimated_dict_bytes),
2036 unified_members,
2037 border_unified_members,
2038 refined_members,
2039 refinement_subclusters,
2040 retained_border_members,
2041 retained_outlier_members
2042 ));
2043 }
2044 self.state.lossy_symbol_mode_applied = true;
2045 Ok(())
2046 }
2047
2048 pub fn add_page(&mut self, image: &Array2<u8>) -> Result<()> {
2049 let bitimage = crate::jbig2sym::array_to_bitimage(image);
2050 self.add_page_bitimage(bitimage)
2051 }
2052
/// Ingests one page supplied as a `BitImage` and appends it to `self.pages`.
///
/// An empty entry is pushed onto `self.page_symbol_indices` for the page. When
/// `self.config.symbol_mode` and `self.state.segment` are both set and the
/// `cc-analysis` feature is compiled in, connected components are extracted with
/// `analyze_page` and every usable component is matched against the symbols
/// already in `self.global_symbols`, trying in order:
///
/// 1. the per-page recent-symbol cache,
/// 2. (SymbolUnify mode only) the anchor map: exact family bucket, then
///    neighbouring buckets,
/// 3. the `self.hash_map` buckets keyed by `HashKey(height * 10_000 + width)`.
///
/// A component that matches nothing is registered as a new symbol through
/// `push_symbol`. Every component that survives the filters produces exactly one
/// `SymbolInstance`. Without the `cc-analysis` feature the page is stored with
/// no symbol instances.
///
/// Environment variables read on every call:
/// * `JBIG2_DEBUG=1` - for the first page only (`page_num == 0`), collects a
///   matching log and writes it to `jbig2_debug_page0.log` (errors ignored).
/// * `JBIG2_NO_REUSE=1` - disables all three matching passes, so every kept
///   component becomes a new symbol.
///
/// Returns `Ok(())`; this body contains no `?` and no explicit error return.
2053 pub fn add_page_bitimage(&mut self, bitimage: BitImage) -> Result<()> {
     // Index of the page being added (pages are appended at the end of this function).
2054 let page_num = self.pages.len();
2055 self.page_symbol_indices.push(Vec::new());
2056 let mut symbol_instances = Vec::new();
2057 let mut comparator = Comparator::default();
2058 let debug_matching =
2059 page_num == 0 && std::env::var("JBIG2_DEBUG").map_or(false, |v| v == "1");
2060 let no_reuse = std::env::var("JBIG2_NO_REUSE").map_or(false, |v| v == "1");
2061
2062 let mut debug_lines: Vec<String> = Vec::new();
2063 if debug_matching {
2064 debug_lines.push("=== PAGE 0 MATCHING LOG ===".to_string());
2065 debug_lines.push(format!("Image: {}x{}", bitimage.width, bitimage.height));
2066 }
     // Counts only components that pass the skip filters below (skipped ones `continue` first).
2067 let mut cc_index = 0usize;
     // The anchor map exists only in SymbolUnify mode and only once some global symbol exists;
     // otherwise it stays `None` for the whole page.
2068 let mut sym_unify_anchor_map = (self.config.lossy_symbol_mode
2069 == LossySymbolMode::SymbolUnify
2070 && !self.global_symbols.is_empty())
2071 .then(|| self.build_sym_unify_anchor_map(page_num));
     // Snapshot of anchor count / estimated bytes before matching, reported in the diagnostics line.
2072 let sym_unify_initial_anchor_count = sym_unify_anchor_map
2073 .as_ref()
2074 .map(|anchors| anchors.values().map(Vec::len).sum::<usize>())
2075 .unwrap_or(0);
2076 let sym_unify_initial_anchor_bytes = sym_unify_anchor_map
2077 .as_ref()
2078 .map(|anchors| anchor_map_dictionary_bytes(&self.global_symbols, anchors))
2079 .unwrap_or(0);
     // Per-page SymbolUnify counters; only used for the diagnostics line at the end.
2080 let mut sym_unify_recent_hits = 0usize;
2081 let mut sym_unify_anchor_hits = 0usize;
2082 let mut sym_unify_bucket_hits = 0usize;
2083 let mut sym_unify_new_symbols = 0usize;
2084 let mut sym_unify_anchor_score_rejects = 0usize;
2085 let mut sym_unify_anchor_outside_rejects = 0usize;
2086 let mut sym_unify_anchor_compare_rejects = 0usize;
2087 let mut sym_unify_anchor_overlap_rejects = 0usize;
2088
2089 if self.config.symbol_mode && self.state.segment {
2091 #[cfg(feature = "cc-analysis")]
2092 {
     // `dpi` is hard-coded. `symbol_mode` is already known to be true inside this branch,
     // so the first arm is always taken and `losslevel` is always 0 here.
2093 let dpi = 300; let losslevel =
2095 if self.config.symbol_mode || self.config.uses_lossy_symbol_dictionary() {
2096 0
2097 } else if self.config.is_lossless {
2098 0
2099 } else {
2100 1
2101 };
2102 let cc_start = Instant::now();
2103 let cc_image = analyze_page(&bitimage, dpi, losslevel);
2104 let extracted = cc_image.extract_shape_refs();
2105 self.metrics.symbol_mode.cc_extraction += cc_start.elapsed();
2106
     // A lone component anchored at (0,0) whose bbox is within 2 px of the full page size
     // disables symbol matching for this page; an empty extraction does too.
2107 let should_use_symbols = if extracted.len() == 1 {
2111 let bbox = extracted[0].bbox;
2112 !(bbox.xmin == 0
2113 && bbox.ymin == 0
2114 && bbox.width() as usize >= bitimage.width.saturating_sub(2)
2115 && bbox.height() as usize >= bitimage.height.saturating_sub(2))
2116 } else {
2117 !extracted.is_empty()
2118 };
2119
2120 if should_use_symbols {
2121 let matching_start = Instant::now();
2122 let mut recent_cache = RecentSymbolCache::new(RECENT_SYMBOL_CACHE_CAP);
     // Scratch buffer the cache is copied into, so the cache can be mutated while iterating.
2123 let mut recent_candidates = [0usize; RECENT_SYMBOL_CACHE_CAP];
2124 let mut last_y = 0u32;
2125
2126 for shape in extracted {
     // First filter: uses the untrimmed bbox size and black-pixel count; also drops shapes with no runs.
2127 if Self::should_skip_symbol_candidate(
2128 shape.bbox.width().max(0) as usize,
2129 shape.bbox.height().max(0) as usize,
2130 shape.black_pixels,
2131 ) || shape.run_count == 0
2132 {
2133 continue;
2134 }
2135 let Some(symbol) = cc_image.get_bitmap_for_cc(shape.ccid) else {
2136 continue;
2137 };
     // Second filter: same predicate, re-applied to the trimmed bitmap.
2138 let (trim_offset, trimmed) = symbol.trim();
2139 let pixel_count = trimmed.count_ones();
2140 if Self::should_skip_symbol_candidate(
2141 trimmed.width,
2142 trimmed.height,
2143 pixel_count,
2144 ) {
2145 continue;
2146 }
2147
     // Page position of the trimmed bitmap: bbox origin plus the trim offset.
2148 let rect = Rect {
2153 x: shape.bbox.xmin as u32 + trim_offset.x,
2154 y: shape.bbox.ymin as u32 + trim_offset.y,
2155 width: trimmed.width as u32,
2156 height: trimmed.height as u32,
2157 };
     // Drop the recent cache when this component starts more than 24 px below the previous
     // one (presumably a new text line - TODO confirm intent).
2158 if rect.y > last_y.saturating_add(24) {
2159 recent_cache.clear();
2160 }
2161 last_y = rect.y;
2162
2163 let trimmed_sig = Self::compute_symbol_signature(&trimmed);
2164 let mut matched = false;
     // The untrimmed bitmap is moved into whichever `SymbolInstance` is created; each path
     // `take()`s it once and then sets `matched`, so the `unwrap()`s below see `Some`.
2165 let mut instance_bitmap = Some(symbol);
2166
     // Error budget for `evaluate_symbol_match`: with `text_refine`, area / match_tolerance
     // (at least 3); otherwise 5% of the area (at least 2).
     // NOTE(review): the division panics if `match_tolerance` is 0 - confirm it is validated elsewhere.
2167 let area = (trimmed.width * trimmed.height) as u32;
2169 let max_err = if self.config.text_refine {
2170 (area / self.config.match_tolerance).max(3)
2171 } else {
2172 ((area as f32 * 0.05) as u32).max(2)
2173 };
2174
     // Pass 1: recently used symbols. (`matched` is always false at this point.)
2175 if !matched && !no_reuse {
2176 let recent_len = recent_cache.copy_into(&mut recent_candidates);
2177 'recent_search: for &idx in &recent_candidates[..recent_len] {
2178 if let Some((err, dx, dy, needs_refinement)) = self
2179 .evaluate_symbol_match(
2180 &trimmed,
2181 trimmed_sig,
2182 pixel_count,
2183 idx,
2184 &mut comparator,
2185 max_err,
2186 )
2187 {
2188 if debug_matching {
2189 let mode = if needs_refinement {
2190 "REFINE"
2191 } else if err == 0 && dx == 0 && dy == 0 {
2192 "EXACT "
2193 } else {
2194 "LOSSY "
2195 };
2196 let proto = &self.global_symbols[idx];
2197 debug_lines.push(format!(
2198 "CC#{:04} {} pos=({},{}) {}x{} → proto#{} {}x{} err={} dx={} dy={} [recent]",
2199 cc_index,
2200 mode,
2201 rect.x,
2202 rect.y,
2203 rect.width,
2204 rect.height,
2205 idx,
2206 proto.width,
2207 proto.height,
2208 err,
2209 dx,
2210 dy
2211 ));
2212 }
2213
2214 self.symbol_usage[idx] += 1;
2215 self.note_symbol_page(idx, page_num);
     // The offsets are recorded only when the match needs refinement; otherwise they are zeroed.
2216 symbol_instances.push(SymbolInstance {
2217 symbol_index: idx,
2218 position: rect,
2219 instance_bitmap: instance_bitmap.take().unwrap(),
2220 needs_refinement,
2221 refinement_dx: if needs_refinement { dx } else { 0 },
2222 refinement_dy: if needs_refinement { dy } else { 0 },
2223 });
2224 recent_cache.touch(idx);
2225 if self.config.lossy_symbol_mode == LossySymbolMode::SymbolUnify
2226 {
2227 sym_unify_recent_hits += 1;
2228 }
2229 matched = true;
2230 break 'recent_search;
2231 }
2232 }
2233 }
2234
     // Pass 2 (SymbolUnify only): anchor symbols bucketed by family key.
2235 if !matched
2236 && !no_reuse
2237 && self.config.lossy_symbol_mode == LossySymbolMode::SymbolUnify
2238 {
2239 if let Some(anchor_map) = sym_unify_anchor_map.as_mut() {
2240 let anchor_key =
2241 family_bucket_key_for_symbol(&trimmed, &trimmed_sig);
     // Shared between the exact and neighbour searches so no anchor is evaluated twice.
2242 let mut visited = FxHashSet::default();
2243 let mut exact_examined = 0usize;
2244 if let Some(bucket) = anchor_map.get(&anchor_key) {
2245 'anchor_search_exact: for &idx in bucket {
     // The budget is charged before the `visited` check, so repeated entries also consume it.
2246 if exact_examined >= SYM_UNIFY_EXACT_ANCHOR_BUDGET {
2247 break 'anchor_search_exact;
2248 }
2249 exact_examined += 1;
2250 if !visited.insert(idx) {
2251 continue;
2252 }
2253 let decision = self.evaluate_symbol_unify_anchor_match(
2254 &trimmed,
2255 trimmed_sig,
2256 pixel_count,
2257 idx,
2258 &mut comparator,
2259 );
     // Only `Accept` falls through; four reject kinds are tallied, any other variant is skipped uncounted.
2260 let (score, dx, dy) = match decision {
2261 SymUnifyAnchorDecision::Accept { score, dx, dy } => {
2262 (score, dx, dy)
2263 }
2264 SymUnifyAnchorDecision::RejectScore { .. } => {
2265 sym_unify_anchor_score_rejects += 1;
2266 continue;
2267 }
2268 SymUnifyAnchorDecision::RejectOutsideInk => {
2269 sym_unify_anchor_outside_rejects += 1;
2270 continue;
2271 }
2272 SymUnifyAnchorDecision::RejectCompare => {
2273 sym_unify_anchor_compare_rejects += 1;
2274 continue;
2275 }
2276 SymUnifyAnchorDecision::RejectOverlap => {
2277 sym_unify_anchor_overlap_rejects += 1;
2278 continue;
2279 }
2280 _ => continue,
2281 };
2282
2283 if debug_matching {
2284 let proto = &self.global_symbols[idx];
2285 debug_lines.push(format!(
2286 "CC#{:04} UNIFY pos=({},{}) {}x{} → proto#{} {}x{} score={} dx={} dy={} [anchor]",
2287 cc_index,
2288 rect.x,
2289 rect.y,
2290 rect.width,
2291 rect.height,
2292 idx,
2293 proto.width,
2294 proto.height,
2295 score,
2296 dx,
2297 dy
2298 ));
2299 }
2300
2301 self.symbol_usage[idx] += 1;
2302 self.note_symbol_page(idx, page_num);
2303 self.maybe_add_sym_unify_anchor(anchor_map, idx, page_num);
     // Anchor matches are always recorded without refinement and with zero offsets.
2304 symbol_instances.push(SymbolInstance {
2305 symbol_index: idx,
2306 position: rect,
2307 instance_bitmap: instance_bitmap.take().unwrap(),
2308 needs_refinement: false,
2309 refinement_dx: 0,
2310 refinement_dy: 0,
2311 });
2312 recent_cache.touch(idx);
2313 sym_unify_anchor_hits += 1;
2314 matched = true;
2315 break;
2316 }
2317 }
2318
     // No hit in the exact bucket: try neighbouring buckets. One budget is shared across
     // all neighbour buckets, and the exact key itself is skipped.
2319 if !matched {
2320 let mut neighbor_examined = 0usize;
2321 'anchor_search_neighbors: for neighbor in
2322 family_bucket_neighbors(anchor_key)
2323 {
2324 if neighbor == anchor_key {
2325 continue;
2326 }
2327 let Some(bucket) = anchor_map.get(&neighbor) else {
2328 continue;
2329 };
2330 for &idx in bucket {
2331 if neighbor_examined >= SYM_UNIFY_NEIGHBOR_ANCHOR_BUDGET
2332 {
2333 break 'anchor_search_neighbors;
2334 }
2335 neighbor_examined += 1;
2336 if !visited.insert(idx) {
2337 continue;
2338 }
2339 let decision = self.evaluate_symbol_unify_anchor_match(
2340 &trimmed,
2341 trimmed_sig,
2342 pixel_count,
2343 idx,
2344 &mut comparator,
2345 );
     // Same accept/reject handling as the exact-bucket search above.
2346 let (score, dx, dy) = match decision {
2347 SymUnifyAnchorDecision::Accept {
2348 score,
2349 dx,
2350 dy,
2351 } => (score, dx, dy),
2352 SymUnifyAnchorDecision::RejectScore { .. } => {
2353 sym_unify_anchor_score_rejects += 1;
2354 continue;
2355 }
2356 SymUnifyAnchorDecision::RejectOutsideInk => {
2357 sym_unify_anchor_outside_rejects += 1;
2358 continue;
2359 }
2360 SymUnifyAnchorDecision::RejectCompare => {
2361 sym_unify_anchor_compare_rejects += 1;
2362 continue;
2363 }
2364 SymUnifyAnchorDecision::RejectOverlap => {
2365 sym_unify_anchor_overlap_rejects += 1;
2366 continue;
2367 }
2368 _ => continue,
2369 };
2370
2371 if debug_matching {
2372 let proto = &self.global_symbols[idx];
2373 debug_lines.push(format!(
2374 "CC#{:04} UNIFY pos=({},{}) {}x{} → proto#{} {}x{} score={} dx={} dy={} [anchor]",
2375 cc_index,
2376 rect.x,
2377 rect.y,
2378 rect.width,
2379 rect.height,
2380 idx,
2381 proto.width,
2382 proto.height,
2383 score,
2384 dx,
2385 dy
2386 ));
2387 }
2388
2389 self.symbol_usage[idx] += 1;
2390 self.note_symbol_page(idx, page_num);
2391 self.maybe_add_sym_unify_anchor(
2392 anchor_map, idx, page_num,
2393 );
2394 symbol_instances.push(SymbolInstance {
2395 symbol_index: idx,
2396 position: rect,
2397 instance_bitmap: instance_bitmap.take().unwrap(),
2398 needs_refinement: false,
2399 refinement_dx: 0,
2400 refinement_dy: 0,
2401 });
2402 recent_cache.touch(idx);
2403 sym_unify_anchor_hits += 1;
2404 matched = true;
2405 break 'anchor_search_neighbors;
2406 }
2407 }
2408 }
2409 }
2410 }
2411
     // Pass 3: `self.hash_map` buckets keyed by dimensions. With `text_refine` the search
     // covers height and width each within +/-2; otherwise only the exact size.
2412 if !matched && !no_reuse {
2413 let h = trimmed.height as u64;
2414 let w = trimmed.width as u64;
2415 let dim_range: u64 = if self.config.text_refine { 2 } else { 0 };
2416
2417 'bucket_search: for dh_off in 0..=(dim_range * 2) {
     // Wrapping arithmetic: an underflow (e.g. h = 1 with offset -2) wraps to a huge value
     // that the `>= 10_000` check below rejects.
2418 let dh = h.wrapping_add(dh_off).wrapping_sub(dim_range);
2419 if dh >= 10_000 {
2420 continue;
2421 }
2422 for dw_off in 0..=(dim_range * 2) {
2423 let dw = w.wrapping_add(dw_off).wrapping_sub(dim_range);
2424 if dw >= 10_000 {
2425 continue;
2426 }
2427
2428 let nk = HashKey(dh * 10_000 + dw);
2429 if let Some(bucket) = self.hash_map.get(&nk) {
     // The bucket is read through a raw pointer rather than a borrow. In this function the
     // only insertion into `self.hash_map` is in the new-symbol branch after the search, and
     // every hit leaves the loops via `break 'bucket_search` without reading the bucket again.
     // NOTE(review): assumes `evaluate_symbol_match` does not modify or reallocate this
     // bucket - TODO confirm, and consider a safe iteration.
2430 let bucket_len = bucket.len();
2431 let bucket_ptr = bucket.as_ptr();
2432 for bucket_pos in 0..bucket_len {
2433 let idx = unsafe { *bucket_ptr.add(bucket_pos) };
2434 let Some((err, dx, dy, needs_refinement)) = self
2435 .evaluate_symbol_match(
2436 &trimmed,
2437 trimmed_sig,
2438 pixel_count,
2439 idx,
2440 &mut comparator,
2441 max_err,
2442 )
2443 else {
2444 continue;
2445 };
2446
2447 if debug_matching {
2448 let mode = if needs_refinement {
2449 "REFINE"
2450 } else if err == 0 && dx == 0 && dy == 0 {
2451 "EXACT "
2452 } else {
2453 "LOSSY "
2454 };
2455 let proto = &self.global_symbols[idx];
2456 debug_lines.push(format!(
2457 "CC#{:04} {} pos=({},{}) {}x{} → proto#{} {}x{} err={} dx={} dy={}",
2458 cc_index,
2459 mode,
2460 rect.x,
2461 rect.y,
2462 rect.width,
2463 rect.height,
2464 idx,
2465 proto.width,
2466 proto.height,
2467 err,
2468 dx,
2469 dy
2470 ));
2471 }
2472
2473 self.symbol_usage[idx] += 1;
2474 self.note_symbol_page(idx, page_num);
2475 if let Some(anchor_map) = sym_unify_anchor_map.as_mut()
2476 {
2477 self.maybe_add_sym_unify_anchor(
2478 anchor_map, idx, page_num,
2479 );
2480 }
2481 symbol_instances.push(SymbolInstance {
2482 symbol_index: idx,
2483 position: rect,
2484 instance_bitmap: instance_bitmap.take().unwrap(),
2485 needs_refinement,
2486 refinement_dx: if needs_refinement {
2487 dx
2488 } else {
2489 0
2490 },
2491 refinement_dy: if needs_refinement {
2492 dy
2493 } else {
2494 0
2495 },
2496 });
2497 recent_cache.touch(idx);
2498 if self.config.lossy_symbol_mode
2499 == LossySymbolMode::SymbolUnify
2500 {
2501 sym_unify_bucket_hits += 1;
2502 }
2503 matched = true;
2504 break 'bucket_search;
2505 }
2506 }
2507 }
2508 }
2509 }
2510
     // Nothing matched (or reuse is disabled): register the trimmed bitmap as a new symbol,
     // index it in `self.hash_map`, and offer it to the anchor map when one exists.
2511 if !matched {
2512 let idx = self.push_symbol(trimmed, pixel_count, page_num);
2513 self.metrics.symbol_stats.symbols_discovered += 1;
2514 if debug_matching {
2515 debug_lines.push(format!(
2516 "CC#{:04} NEW pos=({},{}) {}x{} trim_off=({},{}) → new proto#{} {}x{}",
2517 cc_index, rect.x, rect.y, rect.width, rect.height,
2518 trim_offset.x, trim_offset.y,
2519 idx, self.global_symbols[idx].width, self.global_symbols[idx].height
2520 ));
2521 }
2522 let key = hash_key(&self.global_symbols[idx]);
2523 self.hash_map.entry(key).or_default().push(idx);
2524 if let Some(anchor_map) = sym_unify_anchor_map.as_mut() {
2525 self.maybe_add_sym_unify_anchor(anchor_map, idx, page_num);
2526 }
2527 symbol_instances.push(SymbolInstance {
2528 symbol_index: idx,
2529 position: rect,
2530 instance_bitmap: instance_bitmap.take().unwrap(),
2531 needs_refinement: false,
2532 refinement_dx: 0,
2533 refinement_dy: 0,
2534 });
2535 recent_cache.touch(idx);
2536 if self.config.lossy_symbol_mode == LossySymbolMode::SymbolUnify {
2537 sym_unify_new_symbols += 1;
2538 }
2539 }
2540 cc_index += 1;
2541 }
2542 self.metrics.symbol_mode.matching_dedup += matching_start.elapsed();
2543 }
2544 }
2545 }
2546
     // Best-effort debug dump: failures to create or write the log file are deliberately ignored.
2547 if debug_matching && !debug_lines.is_empty() {
2549 debug_lines.push(format!(
2550 "\nTotal CCs: {}, Instances: {}",
2551 cc_index,
2552 symbol_instances.len()
2553 ));
2554 let log_path = std::path::Path::new("jbig2_debug_page0.log");
2555 if let Ok(mut f) = std::fs::File::create(log_path) {
2556 use std::io::Write;
2557 for line in &debug_lines {
2558 let _ = writeln!(f, "{}", line);
2559 }
2560 }
2561 }
2562
     // One summary line per page for SymbolUnify diagnostics (the page number is reported 1-based).
2563 if self.config.lossy_symbol_mode == LossySymbolMode::SymbolUnify
2564 && encoder_diagnostics_enabled()
2565 {
2566 let final_anchor_count = sym_unify_anchor_map
2567 .as_ref()
2568 .map(|anchors| anchors.values().map(Vec::len).sum::<usize>())
2569 .unwrap_or(0);
2570 let final_anchor_bytes = sym_unify_anchor_map
2571 .as_ref()
2572 .map(|anchors| anchor_map_dictionary_bytes(&self.global_symbols, anchors))
2573 .unwrap_or(0);
2574 self.state.ingest_debug_lines.push(format!(
2575 "sym_unify ingest page={}: cc={} recent_hits={} anchor_hits={} bucket_hits={} new_symbols={} initial_anchors={} final_anchors={} initial_anchor_bytes={} final_anchor_bytes={} anchor_score_rejects={} anchor_outside_rejects={} anchor_compare_rejects={} anchor_overlap_rejects={}",
2576 page_num + 1,
2577 cc_index,
2578 sym_unify_recent_hits,
2579 sym_unify_anchor_hits,
2580 sym_unify_bucket_hits,
2581 sym_unify_new_symbols,
2582 sym_unify_initial_anchor_count,
2583 final_anchor_count,
2584 sym_unify_initial_anchor_bytes,
2585 final_anchor_bytes,
2586 sym_unify_anchor_score_rejects,
2587 sym_unify_anchor_outside_rejects,
2588 sym_unify_anchor_compare_rejects,
2589 sym_unify_anchor_overlap_rejects,
2590 ));
2591 }
2592
     // The page is stored last; `self.pages.len()` therefore still equals `page_num` above.
2593 self.pages.push(PageData {
2594 image: bitimage,
2595 symbol_instances,
2596 });
2597 Ok(())
2598 }
2599
2600 pub fn collect_symbols(&mut self, roi: &Array2<u8>) -> Result<()> {
2601 let bitimage = crate::jbig2sym::array_to_bitimage(roi);
2602 let (_, trimmed) = bitimage.trim();
2603 let key = hash_key(&trimmed);
2604 let page_num = self.pages.len();
2605 if self.page_symbol_indices.len() <= page_num {
2606 self.page_symbol_indices.resize_with(page_num + 1, Vec::new);
2607 }
2608
2609 if !self.hash_map.contains_key(&key) {
2610 let pixel_count = trimmed.count_ones();
2611 let idx = self.push_symbol(trimmed, pixel_count, page_num);
2612 self.metrics.symbol_stats.symbols_discovered += 1;
2613 self.hash_map.insert(key, vec![idx]);
2614 }
2615 Ok(())
2616 }
2617
2618 pub fn flush(&mut self) -> Result<Vec<u8>> {
2619 let include_header = self.state.full_headers_remaining;
2620 self.state.decision_debug_lines.clear();
2621 match self.config.lossy_symbol_mode {
2622 LossySymbolMode::SymbolUnify => self.apply_symbol_unify()?,
2623 LossySymbolMode::Off => {}
2624 }
2625 let plan = self.plan_document(include_header)?;
2626 self.validate_plan(&plan)?;
2627 let output = self.serialize_full_document(&plan)?;
2628 self.state.full_headers_remaining = false;
2629 self.next_segment_number = plan.next_segment_number;
2630 Ok(output)
2631 }
2632
2633 pub fn flush_pdf_split(&mut self) -> Result<PdfSplitOutput> {
2634 self.state.pdf_mode = true;
2635 self.state.decision_debug_lines.clear();
2636 match self.config.lossy_symbol_mode {
2637 LossySymbolMode::SymbolUnify => self.apply_symbol_unify()?,
2638 LossySymbolMode::Off => {}
2639 }
2640 let plan = self.plan_document(false)?;
2641 self.validate_plan(&plan)?;
2642 let (
2643 global_segments,
2644 page_streams,
2645 local_dict_bytes_per_page,
2646 text_region_bytes_per_page,
2647 generic_region_bytes_per_page,
2648 ) = self.serialize_pdf_split(&plan)?;
2649 self.next_segment_number = plan.next_segment_number;
2650 Ok(PdfSplitOutput {
2651 global_segments,
2652 page_streams,
2653 local_dict_bytes_per_page,
2654 text_region_bytes_per_page,
2655 generic_region_bytes_per_page,
2656 })
2657 }
2658
2659 fn plan_document(&mut self, include_header: bool) -> Result<PlannedDocument> {
2660 debug!("Symbol stats before encoding: {}", self.get_symbol_stats());
2661 let diagnostics_enabled = encoder_diagnostics_enabled();
2662 let planning_start = Instant::now();
2663
2664 if self.config.auto_thresh {
2665 let clustering_start = Instant::now();
2666 self.cluster_symbols()?;
2667 self.metrics.symbol_mode.clustering += clustering_start.elapsed();
2668 }
2669
2670 self.prune_symbols_if_needed();
2671 self.alias_local_symbols_to_globals()?;
2672 self.validate_symbol_instance_indices()?;
2673
2674 let multi_page_candidates: Vec<usize> = self
2675 .global_symbols
2676 .iter()
2677 .enumerate()
2678 .filter(|(i, _)| self.symbol_page_count[*i] > 1 || self.pages.len() == 1)
2679 .map(|(i, _)| i)
2680 .collect();
2681 let global_symbol_indices: Vec<usize> = multi_page_candidates.clone();
2682 let global_set: HashSet<usize> = global_symbol_indices.iter().copied().collect();
2683 let estimated_global_dict_bytes =
2684 indexed_symbol_dictionary_bytes(&self.global_symbols, &global_symbol_indices);
2685 let low_value_global_candidates: Vec<(usize, usize, usize, i64)> = multi_page_candidates
2686 .iter()
2687 .copied()
2688 .filter(|symbol_index| !global_set.contains(symbol_index))
2689 .map(|symbol_index| {
2690 (
2691 symbol_index,
2692 self.symbol_usage[symbol_index],
2693 self.symbol_page_count[symbol_index],
2694 self.estimate_global_symbol_gain(symbol_index),
2695 )
2696 })
2697 .take(16)
2698 .collect();
2699 let multi_page_non_global = multi_page_candidates
2700 .len()
2701 .saturating_sub(global_symbol_indices.len());
2702 if diagnostics_enabled {
2703 self.state.decision_debug_lines.push(format!(
2704 "planning globals: selected={} multi_page_non_global={} estimated_dict_bytes={} low_value_candidates={}",
2705 global_symbol_indices.len(),
2706 multi_page_non_global,
2707 estimated_global_dict_bytes,
2708 low_value_global_candidates.len()
2709 ));
2710 for (symbol_index, usage, page_span, gain) in low_value_global_candidates {
2711 self.state.decision_debug_lines.push(format!(
2712 "planning global candidate: symbol={} usage={} page_span={} estimated_global_gain={}",
2713 symbol_index,
2714 usage,
2715 page_span,
2716 gain
2717 ));
2718 }
2719 }
2720
2721 let mut page_local_symbols: Vec<Vec<usize>> = self
2722 .page_symbol_indices
2723 .iter()
2724 .map(|symbols| {
2725 symbols
2726 .iter()
2727 .copied()
2728 .filter(|i| !global_set.contains(i))
2729 .collect()
2730 })
2731 .collect();
2732 let mut page_residual_symbols = vec![Vec::new(); self.pages.len()];
2733 let mut page_residual_anchor_remaps: Vec<FxHashMap<usize, usize>> = (0..self.pages.len())
2734 .map(|_| FxHashMap::default())
2735 .collect();
2736 let sym_unify_global_anchor_map =
2737 if self.config.lossy_symbol_mode == LossySymbolMode::SymbolUnify {
2738 let mut anchors: FxHashMap<FamilyBucketKey, Vec<usize>> = FxHashMap::default();
2739 for &symbol_index in &global_symbol_indices {
2740 if !self.sym_unify_anchor_ready(symbol_index, self.pages.len()) {
2741 continue;
2742 }
2743 let key = family_bucket_key_for_symbol(
2744 &self.global_symbols[symbol_index],
2745 &self.symbol_signatures[symbol_index],
2746 );
2747 anchors.entry(key).or_default().push(symbol_index);
2748 }
2749 Some(anchors)
2750 } else {
2751 None
2752 };
2753 let sym_unify_global_anchor_bytes = sym_unify_global_anchor_map
2754 .as_ref()
2755 .map(|anchors| anchor_map_dictionary_bytes(&self.global_symbols, anchors))
2756 .unwrap_or(0);
2757 let mut planning_anchor_comparator = Comparator::default();
2758 let mut planning_local_anchor_attach_count = 0usize;
2759 let mut planning_anchor_attach_count = 0usize;
2760 let mut planning_local_score_rescue_count = 0usize;
2761 let mut planning_anchor_score_rescue_count = 0usize;
2762 let mut planning_local_anchor_attach_sample = Vec::new();
2763 let mut planning_anchor_attach_sample = Vec::new();
2764 let mut planning_local_rescue_count = 0usize;
2765 let mut planning_local_rescue_sample = Vec::new();
2766 let mut residual_symbol_traces: FxHashMap<usize, ResidualSymbolTrace> =
2767 FxHashMap::default();
2768 let mut counterfactual_local_dim_relax2 = CounterfactualProbeStats::default();
2769 let mut counterfactual_global_overlap_skip = CounterfactualProbeStats::default();
2770 let mut page_uses_generic_region = vec![false; self.pages.len()];
2771 for (page_num, page) in self.pages.iter().enumerate() {
2772 if self.config.uses_lossy_symbol_dictionary()
2773 || self.config.refine
2774 || self.config.text_refine
2775 {
2776 let mut local_use_counts = HashMap::new();
2777 for instance in &page.symbol_instances {
2778 *local_use_counts
2779 .entry(instance.symbol_index)
2780 .or_insert(0usize) += 1;
2781 }
2782 let local_anchor_candidates: Vec<usize> = page_local_symbols[page_num]
2783 .iter()
2784 .copied()
2785 .filter(|&symbol_index| {
2786 local_use_counts.get(&symbol_index).copied().unwrap_or(0) > 1
2787 || self.should_keep_text_local_symbol(page, symbol_index)
2788 })
2789 .collect();
2790 let mut kept_local_symbols = Vec::with_capacity(page_local_symbols[page_num].len());
2791 for &symbol_index in &page_local_symbols[page_num] {
2792 if local_use_counts.get(&symbol_index).copied().unwrap_or(0) <= 1 {
2793 if self.should_keep_text_local_symbol(page, symbol_index) {
2794 kept_local_symbols.push(symbol_index);
2795 planning_local_rescue_count += 1;
2796 if planning_local_rescue_sample.len() < 16 {
2797 planning_local_rescue_sample.push((
2798 page_num + 1,
2799 symbol_index,
2800 self.global_symbols[symbol_index].width,
2801 self.global_symbols[symbol_index].height,
2802 ));
2803 }
2804 continue;
2805 }
2806 let mut local_best_anchor = None;
2807 let mut local_best_reject = None;
2808 let mut had_local_candidates = false;
2809 for &anchor_index in &local_anchor_candidates {
2810 if anchor_index == symbol_index {
2811 continue;
2812 }
2813 had_local_candidates = true;
2814 match self.residual_symbol_anchor_decision(
2815 symbol_index,
2816 anchor_index,
2817 &mut planning_anchor_comparator,
2818 ) {
2819 SymUnifyAnchorDecision::Accept { score, dx, dy } => {
2820 self.maybe_update_best_sym_unify_anchor_candidate(
2821 &mut local_best_anchor,
2822 &self.global_symbols[symbol_index],
2823 anchor_index,
2824 score,
2825 dx,
2826 dy,
2827 false,
2828 );
2829 }
2830 SymUnifyAnchorDecision::RejectScore {
2831 score,
2832 limit,
2833 dx,
2834 dy,
2835 } if score
2836 <= limit.saturating_add(
2837 self.config.sym_unify_score_rescue_slack,
2838 ) =>
2839 {
2840 self.maybe_update_best_sym_unify_anchor_candidate(
2841 &mut local_best_anchor,
2842 &self.global_symbols[symbol_index],
2843 anchor_index,
2844 score,
2845 dx,
2846 dy,
2847 true,
2848 );
2849 }
2850 other => update_best_reject(&mut local_best_reject, other),
2851 }
2852 }
2853 if let Some(anchor_choice) = local_best_anchor {
2854 page_residual_anchor_remaps[page_num]
2855 .insert(symbol_index, anchor_choice.anchor_index);
2856 planning_local_anchor_attach_count += 1;
2857 if anchor_choice.rescued_on_score {
2858 planning_local_score_rescue_count += 1;
2859 }
2860 if planning_local_anchor_attach_sample.len() < 16 {
2861 planning_local_anchor_attach_sample.push((
2862 page_num + 1,
2863 symbol_index,
2864 anchor_choice.anchor_index,
2865 ));
2866 }
2867 continue;
2868 }
2869 if diagnostics_enabled
2870 && matches!(local_best_reject, Some(SymUnifyAnchorDecision::RejectDim))
2871 && local_anchor_candidates.iter().copied().any(|anchor_index| {
2872 anchor_index != symbol_index
2873 && self.residual_symbol_accept_with_dim_limit(
2874 symbol_index,
2875 anchor_index,
2876 &mut planning_anchor_comparator,
2877 2,
2878 )
2879 })
2880 {
2881 record_counterfactual_probe(
2882 &mut counterfactual_local_dim_relax2,
2883 page_num,
2884 symbol_index,
2885 &self.global_symbols[symbol_index],
2886 self.symbol_pixel_counts[symbol_index],
2887 );
2888 }
2889 let mut attached_anchor = None;
2890 let mut global_best_reject = None;
2891 let mut had_global_candidates = false;
2892 if let Some(anchor_map) = &sym_unify_global_anchor_map {
2893 let bucket = family_bucket_key_for_symbol(
2894 &self.global_symbols[symbol_index],
2895 &self.symbol_signatures[symbol_index],
2896 );
2897 let mut visited = FxHashSet::default();
2898 let mut best_anchor = None;
2899 for neighbor in family_bucket_neighbors(bucket) {
2900 let Some(candidates) = anchor_map.get(&neighbor) else {
2901 continue;
2902 };
2903 for &anchor_index in candidates {
2904 if anchor_index == symbol_index || !visited.insert(anchor_index)
2905 {
2906 continue;
2907 }
2908 had_global_candidates = true;
2909 match self.residual_symbol_anchor_decision(
2910 symbol_index,
2911 anchor_index,
2912 &mut planning_anchor_comparator,
2913 ) {
2914 SymUnifyAnchorDecision::Accept { score, dx, dy } => {
2915 self.maybe_update_best_sym_unify_anchor_candidate(
2916 &mut best_anchor,
2917 &self.global_symbols[symbol_index],
2918 anchor_index,
2919 score,
2920 dx,
2921 dy,
2922 false,
2923 );
2924 }
2925 SymUnifyAnchorDecision::RejectScore {
2926 score,
2927 limit,
2928 dx,
2929 dy,
2930 } if score
2931 <= limit.saturating_add(
2932 self.config.sym_unify_score_rescue_slack,
2933 ) =>
2934 {
2935 self.maybe_update_best_sym_unify_anchor_candidate(
2936 &mut best_anchor,
2937 &self.global_symbols[symbol_index],
2938 anchor_index,
2939 score,
2940 dx,
2941 dy,
2942 true,
2943 );
2944 }
2945 other => update_best_reject(&mut global_best_reject, other),
2946 }
2947 }
2948 }
2949 attached_anchor = best_anchor;
2950 }
2951 if let Some(anchor_choice) = attached_anchor {
2952 page_residual_anchor_remaps[page_num]
2953 .insert(symbol_index, anchor_choice.anchor_index);
2954 planning_anchor_attach_count += 1;
2955 if anchor_choice.rescued_on_score {
2956 planning_anchor_score_rescue_count += 1;
2957 }
2958 if planning_anchor_attach_sample.len() < 16 {
2959 planning_anchor_attach_sample.push((
2960 page_num + 1,
2961 symbol_index,
2962 anchor_choice.anchor_index,
2963 ));
2964 }
2965 } else {
2966 if diagnostics_enabled
2967 && matches!(
2968 global_best_reject,
2969 Some(SymUnifyAnchorDecision::RejectOverlap)
2970 )
2971 {
2972 let bucket = family_bucket_key_for_symbol(
2973 &self.global_symbols[symbol_index],
2974 &self.symbol_signatures[symbol_index],
2975 );
2976 let mut visited = FxHashSet::default();
2977 let recovered_without_overlap_prescreen = sym_unify_global_anchor_map
2978 .as_ref()
2979 .is_some_and(|anchor_map| {
2980 family_bucket_neighbors(bucket).into_iter().any(|neighbor| {
2981 anchor_map.get(&neighbor).is_some_and(|candidates| {
2982 candidates.iter().copied().any(|anchor_index| {
2983 anchor_index != symbol_index
2984 && visited.insert(anchor_index)
2985 && self.residual_symbol_accept_without_overlap_prescreen(
2986 symbol_index,
2987 anchor_index,
2988 &mut planning_anchor_comparator,
2989 )
2990 })
2991 })
2992 })
2993 });
2994 if recovered_without_overlap_prescreen {
2995 record_counterfactual_probe(
2996 &mut counterfactual_global_overlap_skip,
2997 page_num,
2998 symbol_index,
2999 &self.global_symbols[symbol_index],
3000 self.symbol_pixel_counts[symbol_index],
3001 );
3002 }
3003 }
3004 page_residual_symbols[page_num].push(symbol_index);
3005 residual_symbol_traces.insert(
3006 symbol_index,
3007 ResidualSymbolTrace {
3008 page_num,
3009 local_use_count: local_use_counts
3010 .get(&symbol_index)
3011 .copied()
3012 .unwrap_or(0),
3013 had_local_candidates,
3014 had_global_candidates,
3015 local_best_reject,
3016 global_best_reject,
3017 },
3018 );
3019 }
3020 } else {
3021 kept_local_symbols.push(symbol_index);
3022 }
3023 }
3024 page_local_symbols[page_num] = kept_local_symbols;
3025 }
3026
3027 let local_symbols = &page_local_symbols[page_num];
3028 let page_local_gain: i64 = local_symbols
3029 .iter()
3030 .map(|&symbol_index| self.estimate_local_symbol_gain(page, symbol_index))
3031 .sum();
3032 let uses_only_locals = page.symbol_instances.iter().all(|inst| {
3033 !global_set.contains(&inst.symbol_index)
3034 && !page_residual_anchor_remaps[page_num].contains_key(&inst.symbol_index)
3035 && !page_residual_symbols[page_num].contains(&inst.symbol_index)
3036 });
3037 if uses_only_locals
3038 && local_symbols.len() <= 2
3039 && page.symbol_instances.len() <= 2
3040 && page_local_gain <= 0
3041 {
3042 page_local_symbols[page_num].clear();
3043 page_uses_generic_region[page_num] = true;
3044 }
3045
3046 let has_kept_symbol_instances = page.symbol_instances.iter().any(|inst| {
3047 global_set.contains(&inst.symbol_index)
3048 || page_residual_anchor_remaps[page_num].contains_key(&inst.symbol_index)
3049 || page_local_symbols[page_num].contains(&inst.symbol_index)
3050 });
3051 if !has_kept_symbol_instances {
3052 page_uses_generic_region[page_num] = true;
3053 }
3054 }
3055
3056 let total_residual_symbols: usize = page_residual_symbols.iter().map(Vec::len).sum();
3057 let full_generic_pages = page_uses_generic_region.iter().filter(|&&v| v).count();
3058 if diagnostics_enabled {
3059 self.state.decision_debug_lines.push(format!(
3060 "planning residuals: {} page-local one-off symbols moved to generic residuals",
3061 total_residual_symbols
3062 ));
3063 self.state.decision_debug_lines.push(format!(
3064 "planning page modes: full_generic_pages={} text_pages={}",
3065 full_generic_pages,
3066 self.pages.len().saturating_sub(full_generic_pages)
3067 ));
3068 if self.config.lossy_symbol_mode == LossySymbolMode::SymbolUnify {
3069 self.state.decision_debug_lines.push(format!(
3070 "sym_unify planning symbol rescues: local_kept={} local_anchor_remaps={} global_anchor_remaps={} local_score_rescues={} global_score_rescues={} anchor_ready_bytes={}",
3071 planning_local_rescue_count,
3072 planning_local_anchor_attach_count,
3073 planning_anchor_attach_count,
3074 planning_local_score_rescue_count,
3075 planning_anchor_score_rescue_count,
3076 sym_unify_global_anchor_bytes,
3077 ));
3078 if !planning_local_rescue_sample.is_empty() {
3079 self.state.decision_debug_lines.push(format!(
3080 "sym_unify planning local rescue sample: {:?}",
3081 planning_local_rescue_sample
3082 ));
3083 }
3084 if !planning_local_anchor_attach_sample.is_empty() {
3085 self.state.decision_debug_lines.push(format!(
3086 "sym_unify planning local-anchor sample: {:?}",
3087 planning_local_anchor_attach_sample
3088 ));
3089 }
3090 if !planning_anchor_attach_sample.is_empty() {
3091 self.state.decision_debug_lines.push(format!(
3092 "sym_unify planning anchor sample: {:?}",
3093 planning_anchor_attach_sample
3094 ));
3095 }
3096 self.state.decision_debug_lines.push(format!(
3097 "sym_unify residual counterfactuals: local_dim_relax2_symbols={} local_dim_relax2_bitmap_proxy_bytes={} local_dim_relax2_pages={} global_overlap_skip_symbols={} global_overlap_skip_bitmap_proxy_bytes={} global_overlap_skip_pages={}",
3098 counterfactual_local_dim_relax2.symbol_count,
3099 counterfactual_local_dim_relax2.bitmap_proxy_bytes,
3100 counterfactual_local_dim_relax2.pages.len(),
3101 counterfactual_global_overlap_skip.symbol_count,
3102 counterfactual_global_overlap_skip.bitmap_proxy_bytes,
3103 counterfactual_global_overlap_skip.pages.len(),
3104 ));
3105 if !counterfactual_local_dim_relax2.samples.is_empty() {
3106 self.state.decision_debug_lines.push(format!(
3107 "sym_unify counterfactual local_dim_relax2 sample: {:?}",
3108 counterfactual_local_dim_relax2.samples
3109 ));
3110 }
3111 if !counterfactual_global_overlap_skip.samples.is_empty() {
3112 self.state.decision_debug_lines.push(format!(
3113 "sym_unify counterfactual global_overlap_skip sample: {:?}",
3114 counterfactual_global_overlap_skip.samples
3115 ));
3116 }
3117 }
3118 }
3119 if diagnostics_enabled
3120 && self.config.lossy_symbol_mode == LossySymbolMode::SymbolUnify
3121 && total_residual_symbols > 0
3122 {
3123 let mut comparator = Comparator::default();
3124 let mut residual_unique = FxHashSet::default();
3125 for residuals in &page_residual_symbols {
3126 residual_unique.extend(residuals.iter().copied());
3127 }
3128 let anchor_map = self.build_sym_unify_anchor_map(self.pages.len());
3129 let mut any_global_map: FxHashMap<FamilyBucketKey, Vec<usize>> = FxHashMap::default();
3130 for &symbol_index in &global_symbol_indices {
3131 let key = family_bucket_key_for_symbol(
3132 &self.global_symbols[symbol_index],
3133 &self.symbol_signatures[symbol_index],
3134 );
3135 any_global_map.entry(key).or_default().push(symbol_index);
3136 }
3137 let mut attachable = 0usize;
3138 let mut attachable_with_score_rescue = 0usize;
3139 let mut attachable_to_any_global = 0usize;
3140 let mut sampled = Vec::new();
3141 let mut sampled_score_rescue = Vec::new();
3142 let mut sampled_any_global = Vec::new();
3143 let mut visited = FxHashSet::default();
3144 let mut reject_counts: FxHashMap<&'static str, usize> = FxHashMap::default();
3145 let mut area_buckets = [0usize; 4];
3146 for residual_index in residual_unique.iter().copied() {
3147 let symbol = &self.global_symbols[residual_index];
3148 let area = symbol.width.saturating_mul(symbol.height);
3149 let bucket_index = if area <= 16 {
3150 0
3151 } else if area <= 32 {
3152 1
3153 } else if area <= 64 {
3154 2
3155 } else {
3156 3
3157 };
3158 area_buckets[bucket_index] += 1;
3159 let bucket =
3160 family_bucket_key_for_symbol(symbol, &self.symbol_signatures[residual_index]);
3161 visited.clear();
3162 let mut matched_anchor = None;
3163 let mut best_reject = SymUnifyAnchorDecision::RejectDim;
3164 'anchor_search: for neighbor in family_bucket_neighbors(bucket) {
3165 let Some(candidates) = anchor_map.get(&neighbor) else {
3166 continue;
3167 };
3168 for &anchor_index in candidates {
3169 if anchor_index == residual_index || !visited.insert(anchor_index) {
3170 continue;
3171 }
3172 let decision = self.residual_symbol_anchor_decision(
3173 residual_index,
3174 anchor_index,
3175 &mut comparator,
3176 );
3177 match decision {
3178 SymUnifyAnchorDecision::Accept { .. } => {
3179 matched_anchor = Some(anchor_index);
3180 break 'anchor_search;
3181 }
3182 _ => {
3183 if decision.diagnostic_rank() > best_reject.diagnostic_rank() {
3184 best_reject = decision;
3185 }
3186 }
3187 }
3188 }
3189 }
3190 if let Some(anchor_index) = matched_anchor {
3191 attachable += 1;
3192 attachable_with_score_rescue += 1;
3193 if sampled.len() < 16 {
3194 sampled.push((residual_index, anchor_index));
3195 }
3196 } else {
3197 *reject_counts.entry(best_reject.label()).or_insert(0) += 1;
3198
3199 visited.clear();
3200 let mut rescued_anchor = None;
3201 'score_rescue_search: for neighbor in family_bucket_neighbors(bucket) {
3202 let Some(candidates) = anchor_map.get(&neighbor) else {
3203 continue;
3204 };
3205 for &anchor_index in candidates {
3206 if anchor_index == residual_index || !visited.insert(anchor_index) {
3207 continue;
3208 }
3209 match self.residual_symbol_anchor_decision(
3210 residual_index,
3211 anchor_index,
3212 &mut comparator,
3213 ) {
3214 SymUnifyAnchorDecision::Accept { .. } => {
3215 rescued_anchor = Some(anchor_index);
3216 break 'score_rescue_search;
3217 }
3218 SymUnifyAnchorDecision::RejectScore { score, limit, .. }
3219 if score
3220 <= limit.saturating_add(
3221 self.config.sym_unify_score_rescue_slack,
3222 ) =>
3223 {
3224 rescued_anchor = Some(anchor_index);
3225 break 'score_rescue_search;
3226 }
3227 _ => {}
3228 }
3229 }
3230 }
3231 if let Some(anchor_index) = rescued_anchor {
3232 attachable_with_score_rescue += 1;
3233 if sampled_score_rescue.len() < 16 {
3234 sampled_score_rescue.push((residual_index, anchor_index));
3235 }
3236 }
3237 }
3238
3239 visited.clear();
3240 'any_global_search: for neighbor in family_bucket_neighbors(bucket) {
3241 let Some(candidates) = any_global_map.get(&neighbor) else {
3242 continue;
3243 };
3244 for &anchor_index in candidates {
3245 if anchor_index == residual_index || !visited.insert(anchor_index) {
3246 continue;
3247 }
3248 if matches!(
3249 self.residual_symbol_anchor_decision(
3250 residual_index,
3251 anchor_index,
3252 &mut comparator,
3253 ),
3254 SymUnifyAnchorDecision::Accept { .. }
3255 ) {
3256 attachable_to_any_global += 1;
3257 if sampled_any_global.len() < 16 {
3258 sampled_any_global.push((residual_index, anchor_index));
3259 }
3260 break 'any_global_search;
3261 }
3262 }
3263 }
3264 }
3265 self.state.decision_debug_lines.push(format!(
3266 "sym_unify residual anchor scan: residual_unique={} attachable_to_current_anchors={} attachable_with_score_rescue={} score_rescue_extra={} unattached={}",
3267 residual_unique.len(),
3268 attachable,
3269 attachable_with_score_rescue,
3270 attachable_with_score_rescue.saturating_sub(attachable),
3271 residual_unique.len().saturating_sub(attachable)
3272 ));
3273 self.state.decision_debug_lines.push(format!(
3274 "sym_unify residual reject breakdown: dim={} pixel_delta={} signature={} overlap={} compare={} outside_ink={} score={} area_le16={} area_le32={} area_le64={} area_gt64={}",
3275 reject_counts.get("dim").copied().unwrap_or(0),
3276 reject_counts.get("pixel_delta").copied().unwrap_or(0),
3277 reject_counts.get("signature").copied().unwrap_or(0),
3278 reject_counts.get("overlap").copied().unwrap_or(0),
3279 reject_counts.get("compare").copied().unwrap_or(0),
3280 reject_counts.get("outside_ink").copied().unwrap_or(0),
3281 reject_counts.get("score").copied().unwrap_or(0),
3282 area_buckets[0],
3283 area_buckets[1],
3284 area_buckets[2],
3285 area_buckets[3],
3286 ));
3287 self.state.decision_debug_lines.push(format!(
3288 "sym_unify residual any-global scan: residual_unique={} attachable_to_any_global={} extra_beyond_anchor_ready={}",
3289 residual_unique.len(),
3290 attachable_to_any_global,
3291 attachable_to_any_global.saturating_sub(attachable),
3292 ));
3293 if !sampled.is_empty() {
3294 self.state
3295 .decision_debug_lines
3296 .push(format!("sym_unify residual anchor sample: {:?}", sampled));
3297 }
3298 if !sampled_score_rescue.is_empty() {
3299 self.state.decision_debug_lines.push(format!(
3300 "sym_unify residual score-rescue sample: {:?}",
3301 sampled_score_rescue
3302 ));
3303 }
3304 if !sampled_any_global.is_empty() {
3305 self.state.decision_debug_lines.push(format!(
3306 "sym_unify residual any-global sample: {:?}",
3307 sampled_any_global
3308 ));
3309 }
3310
3311 let mut reason_stats: FxHashMap<ResidualReasonCode, ResidualReasonStats> =
3312 FxHashMap::default();
3313 for (&symbol_index, trace) in &residual_symbol_traces {
3314 let reason = trace.reason_code();
3315 let stats = reason_stats.entry(reason).or_default();
3316 let symbol = &self.global_symbols[symbol_index];
3317 let instance_count = trace.local_use_count.max(1);
3318 stats.symbol_count += 1;
3319 stats.instance_count += instance_count;
3320 stats.black_pixels += self.symbol_pixel_counts[symbol_index] * instance_count;
3321 stats.bitmap_proxy_bytes += bitmap_proxy_bytes(symbol) * instance_count;
3322 stats.pages.insert(trace.page_num);
3323 match classify_residual_shape(symbol) {
3324 ResidualShapeKind::Tiny => stats.tiny_count += 1,
3325 ResidualShapeKind::PunctuationLike => stats.punctuation_like_count += 1,
3326 ResidualShapeKind::GlyphLike => stats.glyph_like_count += 1,
3327 }
3328 if stats.samples.len() < 8 {
3329 stats.samples.push((
3330 trace.page_num + 1,
3331 symbol_index,
3332 symbol.width,
3333 symbol.height,
3334 trace.local_use_count,
3335 ));
3336 }
3337 }
3338
3339 let mut sorted_reason_stats: Vec<_> = reason_stats.into_iter().collect();
3340 sorted_reason_stats.sort_by(|lhs, rhs| {
3341 rhs.1
3342 .bitmap_proxy_bytes
3343 .cmp(&lhs.1.bitmap_proxy_bytes)
3344 .then_with(|| rhs.1.symbol_count.cmp(&lhs.1.symbol_count))
3345 .then_with(|| lhs.0.label().cmp(rhs.0.label()))
3346 });
3347 let total_reason_proxy_bytes: usize = sorted_reason_stats
3348 .iter()
3349 .map(|(_, stats)| stats.bitmap_proxy_bytes)
3350 .sum();
3351 self.state.decision_debug_lines.push(format!(
3352 "sym_unify residual reason summary: reasons={} residual_symbols={} bitmap_proxy_bytes={}",
3353 sorted_reason_stats.len(),
3354 residual_symbol_traces.len(),
3355 total_reason_proxy_bytes,
3356 ));
3357 for (reason, stats) in sorted_reason_stats {
3358 self.state.decision_debug_lines.push(format!(
3359 " residual reason {}: symbols={} instances={} pages={} black_pixels={} bitmap_proxy_bytes={} tiny={} punct_like={} glyph_like={} sample={:?}",
3360 reason.label(),
3361 stats.symbol_count,
3362 stats.instance_count,
3363 stats.pages.len(),
3364 stats.black_pixels,
3365 stats.bitmap_proxy_bytes,
3366 stats.tiny_count,
3367 stats.punctuation_like_count,
3368 stats.glyph_like_count,
3369 stats.samples
3370 ));
3371 }
3372
3373 let mut symbol_home_page = vec![usize::MAX; self.global_symbols.len()];
3374 for (page_idx, symbols) in self.page_symbol_indices.iter().enumerate() {
3375 for &symbol_index in symbols {
3376 if symbol_home_page[symbol_index] == usize::MAX {
3377 symbol_home_page[symbol_index] = page_idx;
3378 }
3379 }
3380 }
3381 let mut all_symbol_bucket_map: FxHashMap<FamilyBucketKey, Vec<usize>> =
3382 FxHashMap::default();
3383 for (symbol_index, symbol) in self.global_symbols.iter().enumerate() {
3384 let key =
3385 family_bucket_key_for_symbol(symbol, &self.symbol_signatures[symbol_index]);
3386 all_symbol_bucket_map
3387 .entry(key)
3388 .or_default()
3389 .push(symbol_index);
3390 }
3391
3392 let mut local_dim_cross_page_current = CounterfactualProbeStats::default();
3393 let mut local_dim_cross_page_dim2 = CounterfactualProbeStats::default();
3394 let mut cross_page_comparator = Comparator::default();
3395 let mut overlap_bypass_outcomes: FxHashMap<&'static str, CounterfactualProbeStats> =
3396 FxHashMap::default();
3397 let mut overlap_bypass_comparator = Comparator::default();
3398 let mut overlap_compare_probe_outcomes: FxHashMap<
3399 &'static str,
3400 CounterfactualProbeStats,
3401 > = FxHashMap::default();
3402 let mut overlap_compare_probe_comparator = Comparator::default();
3403 let mut overlap_bypass_compare_total_err_details = DetailedCompareProbeStats::default();
3404 let mut global_compare_total_err_details = DetailedCompareProbeStats::default();
3405 let mut compare_slack2_from_global_compare = CounterfactualProbeStats::default();
3406 let mut compare_slack4_from_global_compare = CounterfactualProbeStats::default();
3407 let mut compare_slack2_from_overlap_compare = CounterfactualProbeStats::default();
3408 let mut compare_slack4_from_overlap_compare = CounterfactualProbeStats::default();
3409 for (&symbol_index, trace) in &residual_symbol_traces {
3410 if trace.reason_code() != ResidualReasonCode::UseCountOneLocalRejectDim {
3411 if trace.reason_code() == ResidualReasonCode::UseCountOneGlobalRejectOverlap {
3412 let bucket = family_bucket_key_for_symbol(
3413 &self.global_symbols[symbol_index],
3414 &self.symbol_signatures[symbol_index],
3415 );
3416 let mut visited = FxHashSet::default();
3417 let mut best_bypass_reject = None;
3418 let mut recovered = false;
3419 let mut best_compare_total_err: Option<(
3420 crate::jbig2comparator::CompareResult,
3421 u32,
3422 bool,
3423 bool,
3424 )> = None;
3425 if let Some(anchor_map) = &sym_unify_global_anchor_map {
3426 'overlap_bypass_search: for neighbor in family_bucket_neighbors(bucket)
3427 {
3428 let Some(candidates) = anchor_map.get(&neighbor) else {
3429 continue;
3430 };
3431 for &anchor_index in candidates {
3432 if anchor_index == symbol_index || !visited.insert(anchor_index)
3433 {
3434 continue;
3435 }
3436 let strong_anchor = self.symbol_usage[anchor_index]
3437 >= SYM_UNIFY_STRONG_ANCHOR_MIN_USAGE
3438 || self.symbol_page_count[anchor_index]
3439 >= SYM_UNIFY_STRONG_ANCHOR_MIN_PAGE_SPAN;
3440 match self
3441 .residual_symbol_anchor_decision_without_overlap_prescreen(
3442 symbol_index,
3443 anchor_index,
3444 &mut overlap_bypass_comparator,
3445 ) {
3446 SymUnifyAnchorDecision::Accept { .. } => {
3447 recovered = true;
3448 break 'overlap_bypass_search;
3449 }
3450 SymUnifyAnchorDecision::RejectCompare => {
3451 update_best_reject(
3452 &mut best_bypass_reject,
3453 SymUnifyAnchorDecision::RejectCompare,
3454 );
3455 let candidate = &self.global_symbols[symbol_index];
3456 let proto = &self.global_symbols[anchor_index];
3457 let compare_max_err = self
3458 .config
3459 .sym_unify_max_err
3460 .max(4)
3461 .saturating_add(u32::from(strong_anchor));
3462 if let Some(result) = overlap_compare_probe_comparator
3463 .compare_for_symbol_unify(
3464 candidate,
3465 proto,
3466 relaxed_compare_probe_max_err(candidate, proto),
3467 self.config.sym_unify_max_dx.max(0),
3468 self.config.sym_unify_max_dy.max(0),
3469 )
3470 {
3471 let exact_dims = candidate.width == proto.width
3472 && candidate.height == proto.height;
3473 if best_compare_total_err.is_none_or(
3474 |(current, _, _, _)| {
3475 result.total_err < current.total_err
3476 },
3477 ) {
3478 best_compare_total_err = Some((
3479 result,
3480 compare_max_err,
3481 exact_dims,
3482 strong_anchor,
3483 ));
3484 }
3485 }
3486 }
3487 other => update_best_reject(&mut best_bypass_reject, other),
3488 }
3489 }
3490 }
3491 }
3492 let label = if recovered {
3493 "accept"
3494 } else {
3495 best_bypass_reject
3496 .map(SymUnifyAnchorDecision::label)
3497 .unwrap_or("no_candidates")
3498 };
3499 record_labeled_counterfactual_probe(
3500 &mut overlap_bypass_outcomes,
3501 label,
3502 trace.page_num,
3503 symbol_index,
3504 &self.global_symbols[symbol_index],
3505 self.symbol_pixel_counts[symbol_index],
3506 );
3507 if label == "compare"
3508 && let Some((result, compare_max_err, exact_dims, strong_anchor)) =
3509 best_compare_total_err
3510 {
3511 let outside_limit = self
3512 .config
3513 .sym_unify_max_border_outside_ink
3514 .min(1)
3515 .saturating_add(u32::from(strong_anchor));
3516 let score_limit =
3517 self.config.sym_unify_class_accept_limit + u32::from(strong_anchor);
3518 let score = Self::symbol_unify_assignment_score(&result);
3519 record_detailed_compare_probe(
3520 &mut overlap_bypass_compare_total_err_details,
3521 trace.page_num,
3522 symbol_index,
3523 &self.global_symbols[symbol_index],
3524 result,
3525 compare_max_err,
3526 exact_dims,
3527 strong_anchor,
3528 );
3529 if result.total_err <= compare_max_err.saturating_add(2)
3530 && result.outside_ink_err <= outside_limit
3531 && score <= score_limit
3532 {
3533 record_counterfactual_probe(
3534 &mut compare_slack2_from_overlap_compare,
3535 trace.page_num,
3536 symbol_index,
3537 &self.global_symbols[symbol_index],
3538 self.symbol_pixel_counts[symbol_index],
3539 );
3540 }
3541 if result.total_err <= compare_max_err.saturating_add(4)
3542 && result.outside_ink_err <= outside_limit
3543 && score <= score_limit
3544 {
3545 record_counterfactual_probe(
3546 &mut compare_slack4_from_overlap_compare,
3547 trace.page_num,
3548 symbol_index,
3549 &self.global_symbols[symbol_index],
3550 self.symbol_pixel_counts[symbol_index],
3551 );
3552 }
3553 }
3554 }
3555 if trace.reason_code() == ResidualReasonCode::UseCountOneGlobalRejectCompare {
3556 let bucket = family_bucket_key_for_symbol(
3557 &self.global_symbols[symbol_index],
3558 &self.symbol_signatures[symbol_index],
3559 );
3560 let mut visited = FxHashSet::default();
3561 let mut best_probe_label = "no_candidates";
3562 let mut best_total_err = u32::MAX;
3563 let mut best_total_err_detail: Option<(
3564 crate::jbig2comparator::CompareResult,
3565 u32,
3566 bool,
3567 bool,
3568 )> = None;
3569 if let Some(anchor_map) = &sym_unify_global_anchor_map {
3570 for neighbor in family_bucket_neighbors(bucket) {
3571 let Some(candidates) = anchor_map.get(&neighbor) else {
3572 continue;
3573 };
3574 for &anchor_index in candidates {
3575 if anchor_index == symbol_index || !visited.insert(anchor_index)
3576 {
3577 continue;
3578 }
3579
3580 let candidate = &self.global_symbols[symbol_index];
3581 let proto = &self.global_symbols[anchor_index];
3582 let strong_anchor = self.symbol_usage[anchor_index]
3583 >= SYM_UNIFY_STRONG_ANCHOR_MIN_USAGE
3584 || self.symbol_page_count[anchor_index]
3585 >= SYM_UNIFY_STRONG_ANCHOR_MIN_PAGE_SPAN;
3586 let compare_max_err = self
3587 .config
3588 .sym_unify_max_err
3589 .max(4)
3590 .saturating_add(u32::from(strong_anchor));
3591 let outside_limit = self
3592 .config
3593 .sym_unify_max_border_outside_ink
3594 .min(1)
3595 .saturating_add(u32::from(strong_anchor));
3596 let relaxed = overlap_compare_probe_comparator
3597 .compare_for_symbol_unify(
3598 candidate,
3599 proto,
3600 relaxed_compare_probe_max_err(candidate, proto),
3601 self.config.sym_unify_max_dx.max(0),
3602 self.config.sym_unify_max_dy.max(0),
3603 );
3604 let (label, total_err) = if let Some(result) = relaxed {
3605 let score = Self::symbol_unify_assignment_score(&result);
3606 let score_limit = self.config.sym_unify_class_accept_limit
3607 + u32::from(strong_anchor);
3608 let label = if result.total_err <= compare_max_err {
3609 if result.outside_ink_err > outside_limit {
3610 "outside_ink"
3611 } else if score > score_limit {
3612 "score"
3613 } else {
3614 "accept"
3615 }
3616 } else if result.outside_ink_err > outside_limit {
3617 "total_err+outside_ink"
3618 } else {
3619 "total_err"
3620 };
3621 (label, result.total_err)
3622 } else {
3623 ("relaxed_none", u32::MAX)
3624 };
3625
3626 if total_err < best_total_err {
3627 best_total_err = total_err;
3628 best_probe_label = label;
3629 if let Some(result) = relaxed {
3630 let exact_dims = candidate.width == proto.width
3631 && candidate.height == proto.height;
3632 best_total_err_detail = Some((
3633 result,
3634 compare_max_err,
3635 exact_dims,
3636 strong_anchor,
3637 ));
3638 } else {
3639 best_total_err_detail = None;
3640 }
3641 } else if best_total_err == u32::MAX
3642 && best_probe_label == "no_candidates"
3643 {
3644 best_probe_label = label;
3645 }
3646 }
3647 }
3648 }
3649
3650 record_labeled_counterfactual_probe(
3651 &mut overlap_compare_probe_outcomes,
3652 best_probe_label,
3653 trace.page_num,
3654 symbol_index,
3655 &self.global_symbols[symbol_index],
3656 self.symbol_pixel_counts[symbol_index],
3657 );
3658 if best_probe_label == "total_err"
3659 && let Some((result, compare_max_err, exact_dims, strong_anchor)) =
3660 best_total_err_detail
3661 {
3662 let outside_limit = self
3663 .config
3664 .sym_unify_max_border_outside_ink
3665 .min(1)
3666 .saturating_add(u32::from(strong_anchor));
3667 let score_limit =
3668 self.config.sym_unify_class_accept_limit + u32::from(strong_anchor);
3669 let score = Self::symbol_unify_assignment_score(&result);
3670 record_detailed_compare_probe(
3671 &mut global_compare_total_err_details,
3672 trace.page_num,
3673 symbol_index,
3674 &self.global_symbols[symbol_index],
3675 result,
3676 compare_max_err,
3677 exact_dims,
3678 strong_anchor,
3679 );
3680 if result.total_err <= compare_max_err.saturating_add(2)
3681 && result.outside_ink_err <= outside_limit
3682 && score <= score_limit
3683 {
3684 record_counterfactual_probe(
3685 &mut compare_slack2_from_global_compare,
3686 trace.page_num,
3687 symbol_index,
3688 &self.global_symbols[symbol_index],
3689 self.symbol_pixel_counts[symbol_index],
3690 );
3691 }
3692 if result.total_err <= compare_max_err.saturating_add(4)
3693 && result.outside_ink_err <= outside_limit
3694 && score <= score_limit
3695 {
3696 record_counterfactual_probe(
3697 &mut compare_slack4_from_global_compare,
3698 trace.page_num,
3699 symbol_index,
3700 &self.global_symbols[symbol_index],
3701 self.symbol_pixel_counts[symbol_index],
3702 );
3703 }
3704 }
3705 }
3706 continue;
3707 }
3708
3709 let bucket = family_bucket_key_for_symbol(
3710 &self.global_symbols[symbol_index],
3711 &self.symbol_signatures[symbol_index],
3712 );
3713 let mut visited = FxHashSet::default();
3714 let mut found_current = false;
3715 let mut found_dim2 = false;
3716 'cross_page_search: for neighbor in family_bucket_neighbors(bucket) {
3717 let Some(candidates) = all_symbol_bucket_map.get(&neighbor) else {
3718 continue;
3719 };
3720 for &candidate_index in candidates {
3721 if candidate_index == symbol_index
3722 || !visited.insert(candidate_index)
3723 || symbol_home_page[candidate_index] == trace.page_num
3724 {
3725 continue;
3726 }
3727 if self.residual_symbol_matches_anchor(
3728 symbol_index,
3729 candidate_index,
3730 &mut cross_page_comparator,
3731 ) {
3732 found_current = true;
3733 break 'cross_page_search;
3734 }
3735 if self.residual_symbol_accept_with_dim_limit(
3736 symbol_index,
3737 candidate_index,
3738 &mut cross_page_comparator,
3739 2,
3740 ) {
3741 found_dim2 = true;
3742 }
3743 }
3744 }
3745
3746 if found_current {
3747 record_counterfactual_probe(
3748 &mut local_dim_cross_page_current,
3749 trace.page_num,
3750 symbol_index,
3751 &self.global_symbols[symbol_index],
3752 self.symbol_pixel_counts[symbol_index],
3753 );
3754 } else if found_dim2 {
3755 record_counterfactual_probe(
3756 &mut local_dim_cross_page_dim2,
3757 trace.page_num,
3758 symbol_index,
3759 &self.global_symbols[symbol_index],
3760 self.symbol_pixel_counts[symbol_index],
3761 );
3762 }
3763 }
3764 self.state.decision_debug_lines.push(format!(
3765 "sym_unify cross-page local-dim probes: current_symbols={} current_bitmap_proxy_bytes={} current_pages={} dim2_only_symbols={} dim2_only_bitmap_proxy_bytes={} dim2_only_pages={}",
3766 local_dim_cross_page_current.symbol_count,
3767 local_dim_cross_page_current.bitmap_proxy_bytes,
3768 local_dim_cross_page_current.pages.len(),
3769 local_dim_cross_page_dim2.symbol_count,
3770 local_dim_cross_page_dim2.bitmap_proxy_bytes,
3771 local_dim_cross_page_dim2.pages.len(),
3772 ));
3773 if !local_dim_cross_page_current.samples.is_empty() {
3774 self.state.decision_debug_lines.push(format!(
3775 "sym_unify cross-page local-dim current sample: {:?}",
3776 local_dim_cross_page_current.samples
3777 ));
3778 }
3779 if !local_dim_cross_page_dim2.samples.is_empty() {
3780 self.state.decision_debug_lines.push(format!(
3781 "sym_unify cross-page local-dim dim2 sample: {:?}",
3782 local_dim_cross_page_dim2.samples
3783 ));
3784 }
3785
3786 let mut sorted_overlap_bypass_outcomes: Vec<_> =
3787 overlap_bypass_outcomes.into_iter().collect();
3788 sorted_overlap_bypass_outcomes.sort_by(|lhs, rhs| {
3789 rhs.1
3790 .bitmap_proxy_bytes
3791 .cmp(&lhs.1.bitmap_proxy_bytes)
3792 .then_with(|| rhs.1.symbol_count.cmp(&lhs.1.symbol_count))
3793 .then_with(|| lhs.0.cmp(rhs.0))
3794 });
3795 let overlap_bypass_total_symbols: usize = sorted_overlap_bypass_outcomes
3796 .iter()
3797 .map(|(_, stats)| stats.symbol_count)
3798 .sum();
3799 let overlap_bypass_total_bitmap_proxy_bytes: usize = sorted_overlap_bypass_outcomes
3800 .iter()
3801 .map(|(_, stats)| stats.bitmap_proxy_bytes)
3802 .sum();
3803 self.state.decision_debug_lines.push(format!(
3804 "sym_unify overlap-bypass outcomes: outcomes={} symbols={} bitmap_proxy_bytes={}",
3805 sorted_overlap_bypass_outcomes.len(),
3806 overlap_bypass_total_symbols,
3807 overlap_bypass_total_bitmap_proxy_bytes,
3808 ));
3809 for (label, stats) in sorted_overlap_bypass_outcomes {
3810 self.state.decision_debug_lines.push(format!(
3811 " overlap-bypass {}: symbols={} pages={} black_pixels={} bitmap_proxy_bytes={} sample={:?}",
3812 label,
3813 stats.symbol_count,
3814 stats.pages.len(),
3815 stats.black_pixels,
3816 stats.bitmap_proxy_bytes,
3817 stats.samples
3818 ));
3819 }
3820
3821 let mut sorted_overlap_compare_probe_outcomes: Vec<_> =
3822 overlap_compare_probe_outcomes.into_iter().collect();
3823 sorted_overlap_compare_probe_outcomes.sort_by(|lhs, rhs| {
3824 rhs.1
3825 .bitmap_proxy_bytes
3826 .cmp(&lhs.1.bitmap_proxy_bytes)
3827 .then_with(|| rhs.1.symbol_count.cmp(&lhs.1.symbol_count))
3828 .then_with(|| lhs.0.cmp(rhs.0))
3829 });
3830 let overlap_compare_probe_total_symbols: usize = sorted_overlap_compare_probe_outcomes
3831 .iter()
3832 .map(|(_, stats)| stats.symbol_count)
3833 .sum();
3834 let overlap_compare_probe_total_bitmap_proxy_bytes: usize =
3835 sorted_overlap_compare_probe_outcomes
3836 .iter()
3837 .map(|(_, stats)| stats.bitmap_proxy_bytes)
3838 .sum();
3839 self.state.decision_debug_lines.push(format!(
3840 "sym_unify global-compare relaxed probe: outcomes={} symbols={} bitmap_proxy_bytes={}",
3841 sorted_overlap_compare_probe_outcomes.len(),
3842 overlap_compare_probe_total_symbols,
3843 overlap_compare_probe_total_bitmap_proxy_bytes,
3844 ));
3845 for (label, stats) in sorted_overlap_compare_probe_outcomes {
3846 self.state.decision_debug_lines.push(format!(
3847 " global-compare relaxed {}: symbols={} pages={} black_pixels={} bitmap_proxy_bytes={} sample={:?}",
3848 label,
3849 stats.symbol_count,
3850 stats.pages.len(),
3851 stats.black_pixels,
3852 stats.bitmap_proxy_bytes,
3853 stats.samples
3854 ));
3855 }
3856 self.state.decision_debug_lines.push(format!(
3857 "sym_unify overlap-bypass compare total_err detail: symbols={} bitmap_proxy_bytes={} exact_dims={} strong_anchor={} shift_le1={} over_by_le2={} over_by_le4={} over_by_le8={} over_by_gt8={} sample={:?}",
3858 overlap_bypass_compare_total_err_details.symbol_count,
3859 overlap_bypass_compare_total_err_details.bitmap_proxy_bytes,
3860 overlap_bypass_compare_total_err_details.exact_dims_count,
3861 overlap_bypass_compare_total_err_details.strong_anchor_count,
3862 overlap_bypass_compare_total_err_details.shift_le1_count,
3863 overlap_bypass_compare_total_err_details.over_by_le2_count,
3864 overlap_bypass_compare_total_err_details.over_by_le4_count,
3865 overlap_bypass_compare_total_err_details.over_by_le8_count,
3866 overlap_bypass_compare_total_err_details.over_by_gt8_count,
3867 overlap_bypass_compare_total_err_details.samples
3868 ));
3869 self.state.decision_debug_lines.push(format!(
3870 "sym_unify global-compare total_err detail: symbols={} bitmap_proxy_bytes={} exact_dims={} strong_anchor={} shift_le1={} over_by_le2={} over_by_le4={} over_by_le8={} over_by_gt8={} sample={:?}",
3871 global_compare_total_err_details.symbol_count,
3872 global_compare_total_err_details.bitmap_proxy_bytes,
3873 global_compare_total_err_details.exact_dims_count,
3874 global_compare_total_err_details.strong_anchor_count,
3875 global_compare_total_err_details.shift_le1_count,
3876 global_compare_total_err_details.over_by_le2_count,
3877 global_compare_total_err_details.over_by_le4_count,
3878 global_compare_total_err_details.over_by_le8_count,
3879 global_compare_total_err_details.over_by_gt8_count,
3880 global_compare_total_err_details.samples
3881 ));
3882 self.state.decision_debug_lines.push(format!(
3883 "sym_unify compare-slack probes: global_total_err_slack2_symbols={} global_total_err_slack2_bitmap_proxy_bytes={} global_total_err_slack4_symbols={} global_total_err_slack4_bitmap_proxy_bytes={} overlap_compare_slack2_symbols={} overlap_compare_slack2_bitmap_proxy_bytes={} overlap_compare_slack4_symbols={} overlap_compare_slack4_bitmap_proxy_bytes={}",
3884 compare_slack2_from_global_compare.symbol_count,
3885 compare_slack2_from_global_compare.bitmap_proxy_bytes,
3886 compare_slack4_from_global_compare.symbol_count,
3887 compare_slack4_from_global_compare.bitmap_proxy_bytes,
3888 compare_slack2_from_overlap_compare.symbol_count,
3889 compare_slack2_from_overlap_compare.bitmap_proxy_bytes,
3890 compare_slack4_from_overlap_compare.symbol_count,
3891 compare_slack4_from_overlap_compare.bitmap_proxy_bytes
3892 ));
3893 }
3894 if diagnostics_enabled {
3895 for (page_num, residuals) in page_residual_symbols.iter().enumerate().take(32) {
3896 if !residuals.is_empty() {
3897 self.state.decision_debug_lines.push(format!(
3898 "page {} residual symbols: count={} sample={:?}",
3899 page_num + 1,
3900 residuals.len(),
3901 &residuals[..residuals.len().min(8)]
3902 ));
3903 }
3904 }
3905 }
3906
3907 self.validate_symbol_partition(
3908 &global_symbol_indices,
3909 &page_local_symbols,
3910 &page_residual_symbols,
3911 &page_residual_anchor_remaps,
3912 &page_uses_generic_region,
3913 )?;
3914
3915 let mut current_segment_number = self.next_segment_number;
3916 let mut global_segments = Vec::new();
3917
3918 self.global_dict_segment_numbers.clear();
3919 let mut encoded_global_dict = EncodedSymbolDictionary::default();
3920 let mut global_refinement_map = vec![None; self.global_symbols.len()];
3921 if !global_symbol_indices.is_empty() {
3922 let refs: Vec<&BitImage> = global_symbol_indices
3923 .iter()
3924 .map(|&i| &self.global_symbols[i])
3925 .collect();
3926 let dict_usage: Vec<usize> = global_symbol_indices
3927 .iter()
3928 .map(|&i| self.symbol_usage[i])
3929 .collect();
3930 let dict_layout =
3931 plan_symbol_dictionary_layout(&refs, &self.config, Some(&dict_usage))?;
3932 if diagnostics_enabled {
3933 self.state.decision_debug_lines.push(format!(
3934 "global dict layout: families={} singletons={} refined_members={} exported_members={}",
3935 dict_layout.diagnostics.family_count,
3936 dict_layout.diagnostics.singleton_family_count,
3937 dict_layout.diagnostics.refined_member_count,
3938 dict_layout.diagnostics.exported_member_count
3939 ));
3940 self.state.decision_debug_lines.extend(
3941 dict_layout
3942 .diagnostics
3943 .sample_lines
3944 .iter()
3945 .take(64)
3946 .cloned(),
3947 );
3948 }
3949 let dict_start = Instant::now();
3950 encoded_global_dict =
3951 encode_symbol_dictionary_segments(&refs, &self.config, &dict_layout)?;
3952 self.metrics.symbol_mode.symbol_dict_encoding += dict_start.elapsed();
3953 for (subset_index, refinement) in dict_layout.refinements.iter().enumerate() {
3954 if let Some(refinement) = refinement {
3955 let gs_idx = global_symbol_indices[subset_index];
3956 global_refinement_map[gs_idx] = Some(RefinementPlan {
3957 prototype_input_index: global_symbol_indices
3958 [refinement.prototype_input_index],
3959 refinement_dx: refinement.refinement_dx,
3960 refinement_dy: refinement.refinement_dy,
3961 });
3962 }
3963 }
3964 let segment_number = current_segment_number;
3965 current_segment_number += 1;
3966 self.global_dict_segment_numbers.push(segment_number);
3967 global_segments.push(Segment {
3968 number: segment_number,
3969 seg_type: SegmentType::SymbolDictionary,
3970 deferred_non_retain: false,
3971 retain_flags: 0,
3972 page_association_type: 2,
3973 referred_to: Vec::new(),
3974 page: None,
3975 payload: encoded_global_dict.payload.clone(),
3976 });
3977 }
3978
3979 let mut global_sym_to_dict_pos = vec![u32::MAX; self.global_symbols.len()];
3980 for (refs_idx, &dict_pos) in encoded_global_dict.input_to_exported_pos.iter().enumerate() {
3981 if dict_pos != u32::MAX {
3982 let gs_idx = global_symbol_indices[refs_idx];
3983 global_sym_to_dict_pos[gs_idx] = dict_pos;
3984 }
3985 }
3986 let num_global_dict_symbols = encoded_global_dict.exported_symbol_count;
3987
3988 let mut planned_local_export_count = 0usize;
3989 self.metrics.symbol_stats.global_symbol_count = num_global_dict_symbols as usize;
3990
3991 let page_segment_start = current_segment_number;
3992 let mut page_layouts = Vec::with_capacity(self.pages.len());
3993 for (page_num, page) in self.pages.iter().enumerate() {
3994 let page_number = if self.state.pdf_mode {
3995 1u32
3996 } else {
3997 page_num as u32 + 1
3998 };
3999 if self.state.pdf_mode {
4000 current_segment_number = page_segment_start;
4001 }
4002 let page_info_segment_number = current_segment_number;
4003 current_segment_number += 1;
4004 let local_dict_layout = if self.config.symbol_mode
4005 && !page.symbol_instances.is_empty()
4006 && !page_local_symbols[page_num].is_empty()
4007 {
4008 let refs: Vec<&BitImage> = page_local_symbols[page_num]
4009 .iter()
4010 .map(|&i| &self.global_symbols[i])
4011 .collect();
4012 let mut local_usage = vec![0usize; page_local_symbols[page_num].len()];
4013 let local_index_by_symbol: HashMap<usize, usize> = page_local_symbols[page_num]
4014 .iter()
4015 .enumerate()
4016 .map(|(idx, &symbol_index)| (symbol_index, idx))
4017 .collect();
4018 for instance in &page.symbol_instances {
4019 if let Some(&local_idx) = local_index_by_symbol.get(&instance.symbol_index) {
4020 local_usage[local_idx] += 1;
4021 }
4022 }
4023 Some(plan_symbol_dictionary_layout(
4024 &refs,
4025 &self.config,
4026 Some(&local_usage),
4027 )?)
4028 } else {
4029 None
4030 };
4031 let mut local_dict_segment_numbers = Vec::new();
4032 if let Some(local_dict_layout) = &local_dict_layout {
4033 if diagnostics_enabled {
4034 self.state.decision_debug_lines.push(format!(
4035 "page {} local dict layout: families={} singletons={} refined_members={} exported_members={}",
4036 page_num + 1,
4037 local_dict_layout.diagnostics.family_count,
4038 local_dict_layout.diagnostics.singleton_family_count,
4039 local_dict_layout.diagnostics.refined_member_count,
4040 local_dict_layout.diagnostics.exported_member_count
4041 ));
4042 self.state.decision_debug_lines.extend(
4043 local_dict_layout
4044 .diagnostics
4045 .sample_lines
4046 .iter()
4047 .take(16)
4048 .cloned(),
4049 );
4050 }
4051 for _ in 0..local_dict_layout.segment_count() {
4052 local_dict_segment_numbers.push(current_segment_number);
4053 current_segment_number += 1;
4054 }
4055 planned_local_export_count += local_dict_layout.export_input_indices.len();
4056 }
4057 let region_segment_number = current_segment_number;
4058 current_segment_number += 1;
4059 let has_residual_region = !page_residual_symbols[page_num].is_empty()
4060 && !page_uses_generic_region[page_num]
4061 && page.symbol_instances.iter().any(|inst| {
4062 global_set.contains(&inst.symbol_index)
4063 || page_local_symbols[page_num].contains(&inst.symbol_index)
4064 });
4065 let residual_region_segment_number = if has_residual_region {
4066 let number = current_segment_number;
4067 current_segment_number += 1;
4068 Some(number)
4069 } else {
4070 None
4071 };
4072 let end_of_page_segment_number = current_segment_number;
4073 current_segment_number += 1;
4074 let use_generic_region = page_uses_generic_region[page_num];
4075 if diagnostics_enabled {
4076 self.state.decision_debug_lines.push(format!(
4077 "page {} plan: full_generic={} residual_region={} local_symbols={} residual_symbols={} anchor_remaps={} instances={}",
4078 page_num + 1,
4079 use_generic_region,
4080 has_residual_region,
4081 page_local_symbols[page_num].len(),
4082 page_residual_symbols[page_num].len(),
4083 page_residual_anchor_remaps[page_num].len(),
4084 page.symbol_instances.len()
4085 ));
4086 }
4087
4088 page_layouts.push(PlannedPageLayout {
4089 page_index: page_num,
4090 page_number,
4091 page_info_segment_number,
4092 local_dict_segment_numbers,
4093 local_dict_layout,
4094 region_segment_number,
4095 residual_region_segment_number,
4096 end_of_page_segment_number,
4097 local_symbols: page_local_symbols[page_num].clone(),
4098 residual_symbols: page_residual_symbols[page_num].clone(),
4099 residual_anchor_remaps: page_residual_anchor_remaps[page_num].clone(),
4100 use_generic_region,
4101 });
4102 }
4103
4104 self.metrics.symbol_stats.local_symbol_count = planned_local_export_count;
4105 self.metrics.symbol_stats.symbols_exported =
4106 self.metrics.symbol_stats.global_symbol_count + planned_local_export_count;
4107 self.metrics.symbol_stats.avg_symbol_reuse =
4108 if self.metrics.symbol_stats.symbols_exported > 0 {
4109 self.symbol_usage.iter().sum::<usize>() as f64
4110 / self.metrics.symbol_stats.symbols_exported as f64
4111 } else {
4112 0.0
4113 };
4114
4115 self.metrics.symbol_mode.planning += planning_start.elapsed();
4116
4117 #[cfg(feature = "parallel")]
4118 let built_pages = if self.state.pdf_mode || self.pages.len() > 1 {
4119 page_layouts
4120 .par_iter()
4121 .map(|layout| {
4122 self.build_planned_page(
4123 layout,
4124 &global_sym_to_dict_pos,
4125 num_global_dict_symbols,
4126 &global_refinement_map,
4127 )
4128 })
4129 .collect::<Vec<_>>()
4130 } else {
4131 page_layouts
4132 .iter()
4133 .map(|layout| {
4134 self.build_planned_page(
4135 layout,
4136 &global_sym_to_dict_pos,
4137 num_global_dict_symbols,
4138 &global_refinement_map,
4139 )
4140 })
4141 .collect::<Vec<_>>()
4142 };
4143
4144 #[cfg(not(feature = "parallel"))]
4145 let built_pages = page_layouts
4146 .iter()
4147 .map(|layout| {
4148 self.build_planned_page(
4149 layout,
4150 &global_sym_to_dict_pos,
4151 num_global_dict_symbols,
4152 &global_refinement_map,
4153 )
4154 })
4155 .collect::<Vec<_>>();
4156
4157 let mut pages = Vec::with_capacity(built_pages.len());
4158 for built_page in built_pages {
4159 let built_page = built_page?;
4160 self.metrics.symbol_mode.symbol_dict_encoding += built_page.symbol_dict_time;
4161 self.metrics.symbol_mode.text_region_encoding += built_page.text_region_time;
4162 self.metrics.symbol_mode.generic_region_encoding += built_page.generic_region_time;
4163 pages.push(built_page.page);
4164 }
4165
4166 let eof_segment = Some(Segment {
4167 number: current_segment_number,
4168 seg_type: SegmentType::EndOfFile,
4169 deferred_non_retain: false,
4170 retain_flags: 0,
4171 page_association_type: 2,
4172 referred_to: vec![],
4173 page: None,
4174 payload: vec![],
4175 });
4176 current_segment_number += 1;
4177
4178 Ok(PlannedDocument {
4179 file_header: if include_header {
4180 Some(FileHeader {
4181 organisation_type: true,
4182 unknown_n_pages: false,
4183 n_pages: self.pages.len() as u32,
4184 })
4185 } else {
4186 None
4187 },
4188 global_segments,
4189 pages,
4190 eof_segment,
4191 next_segment_number: current_segment_number,
4192 })
4193 }
4194
4195 fn build_planned_page(
4196 &self,
4197 layout: &PlannedPageLayout,
4198 global_sym_to_dict_pos: &[u32],
4199 num_global_dict_symbols: u32,
4200 global_refinement_map: &[Option<RefinementPlan>],
4201 ) -> Result<BuiltPage> {
4202 let page = &self.pages[layout.page_index];
4203 let mut page_segments = Vec::new();
4204 let mut symbol_dict_time = Duration::default();
4205 let mut text_region_time = Duration::default();
4206 let mut generic_region_time = Duration::default();
4207
4208 let page_info_payload = PageInfo {
4209 width: page.image.width as u32,
4210 height: page.image.height as u32,
4211 default_pixel: false,
4212 xres: self.config.generic.dpi,
4213 yres: self.config.generic.dpi,
4214 ..Default::default()
4215 }
4216 .to_bytes();
4217
4218 page_segments.push(Segment {
4219 number: layout.page_info_segment_number,
4220 seg_type: SegmentType::PageInformation,
4221 deferred_non_retain: false,
4222 retain_flags: 0,
4223 page_association_type: 0,
4224 referred_to: vec![],
4225 page: Some(layout.page_number),
4226 payload: page_info_payload,
4227 });
4228
4229 if self.config.symbol_mode
4230 && !page.symbol_instances.is_empty()
4231 && !layout.use_generic_region
4232 {
4233 let mut referred_to_for_text_region = self.global_dict_segment_numbers.clone();
4234 let residual_set: HashSet<usize> = layout.residual_symbols.iter().copied().collect();
4235 let residual_anchor_remaps = &layout.residual_anchor_remaps;
4236
4237 let mut local_sym_to_dict_pos = vec![u32::MAX; self.global_symbols.len()];
4238 let mut local_refinement_map = vec![None; self.global_symbols.len()];
4239 let num_local_dict_symbols = if let Some(local_dict_layout) = &layout.local_dict_layout
4240 {
4241 let refs: Vec<&BitImage> = layout
4242 .local_symbols
4243 .iter()
4244 .map(|&i| &self.global_symbols[i])
4245 .collect();
4246 let dict_start = Instant::now();
4247 let encoded_local_dict =
4248 encode_symbol_dictionary_segments(&refs, self.config, local_dict_layout)?;
4249 symbol_dict_time += dict_start.elapsed();
4250
4251 for (refs_idx, &dict_pos) in
4252 encoded_local_dict.input_to_exported_pos.iter().enumerate()
4253 {
4254 if dict_pos != u32::MAX {
4255 let gs_idx = layout.local_symbols[refs_idx];
4256 local_sym_to_dict_pos[gs_idx] = dict_pos;
4257 }
4258 }
4259 for (subset_index, refinement) in local_dict_layout.refinements.iter().enumerate() {
4260 if let Some(refinement) = refinement {
4261 let gs_idx = layout.local_symbols[subset_index];
4262 local_refinement_map[gs_idx] = Some(RefinementPlan {
4263 prototype_input_index: layout.local_symbols
4264 [refinement.prototype_input_index],
4265 refinement_dx: refinement.refinement_dx,
4266 refinement_dy: refinement.refinement_dy,
4267 });
4268 }
4269 }
4270
4271 for segment_number in layout.local_dict_segment_numbers.iter().copied() {
4272 page_segments.push(Segment {
4273 number: segment_number,
4274 seg_type: SegmentType::SymbolDictionary,
4275 deferred_non_retain: false,
4276 retain_flags: 0,
4277 page_association_type: 0,
4278 referred_to: Vec::new(),
4279 page: Some(layout.page_number),
4280 payload: encoded_local_dict.payload.clone(),
4281 });
4282 }
4283 referred_to_for_text_region
4284 .extend(layout.local_dict_segment_numbers.iter().copied());
4285 encoded_local_dict.exported_symbol_count
4286 } else {
4287 0
4288 };
4289
4290 let mut planned_instances = Vec::with_capacity(page.symbol_instances.len());
4291 let mut residual_instances = Vec::new();
4292 for instance in &page.symbol_instances {
4293 if let Some(&anchor_index) = residual_anchor_remaps.get(&instance.symbol_index) {
4294 let mut remapped = instance.clone();
4295 remapped.symbol_index = anchor_index;
4296 remapped.needs_refinement = false;
4297 remapped.refinement_dx = 0;
4298 remapped.refinement_dy = 0;
4299 planned_instances.push(remapped);
4300 } else if residual_set.contains(&instance.symbol_index) {
4301 residual_instances.push(instance.clone());
4302 } else {
4303 planned_instances.push(instance.clone());
4304 }
4305 }
4306 let mut needs_family_refinement = false;
4307 for instance in &mut planned_instances {
4308 if let Some(refinement) = local_refinement_map[instance.symbol_index] {
4309 instance.symbol_index = refinement.prototype_input_index;
4310 instance.needs_refinement = true;
4311 instance.refinement_dx = refinement.refinement_dx;
4312 instance.refinement_dy = refinement.refinement_dy;
4313 needs_family_refinement = true;
4314 } else if let Some(refinement) = global_refinement_map[instance.symbol_index] {
4315 instance.symbol_index = refinement.prototype_input_index;
4316 instance.needs_refinement = true;
4317 instance.refinement_dx = refinement.refinement_dx;
4318 instance.refinement_dy = refinement.refinement_dy;
4319 needs_family_refinement = true;
4320 }
4321 }
4322
4323 if !planned_instances.is_empty() {
4324 let text_start = Instant::now();
4325 let has_instance_refinement = planned_instances
4326 .iter()
4327 .any(|instance| instance.needs_refinement);
4328 let use_refinement_text_region = has_instance_refinement
4329 || (!self.config.uses_lossy_symbol_dictionary()
4330 && (self.config.text_refine
4331 || self.config.refine
4332 || needs_family_refinement));
4333 let region_payload = if use_refinement_text_region {
4334 encode_text_region_with_refinement(
4335 &planned_instances,
4336 self.config,
4337 &self.global_symbols,
4338 global_sym_to_dict_pos,
4339 num_global_dict_symbols,
4340 &local_sym_to_dict_pos,
4341 num_local_dict_symbols,
4342 )?
4343 } else {
4344 encode_text_region_mapped(
4345 &planned_instances,
4346 self.config,
4347 &self.global_symbols,
4348 global_sym_to_dict_pos,
4349 num_global_dict_symbols,
4350 &local_sym_to_dict_pos,
4351 layout.page_index,
4352 num_local_dict_symbols,
4353 )?
4354 };
4355 text_region_time += text_start.elapsed();
4356
4357 page_segments.push(Segment {
4358 number: layout.region_segment_number,
4359 seg_type: SegmentType::ImmediateTextRegion,
4360 deferred_non_retain: false,
4361 retain_flags: 0,
4362 page_association_type: 0,
4363 referred_to: referred_to_for_text_region,
4364 page: Some(layout.page_number),
4365 payload: region_payload,
4366 });
4367 }
4368
4369 if let Some((residual_bitmap, residual_x, residual_y)) =
4370 self.build_instance_residual_bitmap(&residual_instances)?
4371 {
4372 let generic_start = Instant::now();
4373 let residual_payload = self.encode_generic_region_payload_at(
4374 &residual_bitmap,
4375 residual_x,
4376 residual_y,
4377 )?;
4378 generic_region_time += generic_start.elapsed();
4379 page_segments.push(Segment {
4380 number: layout
4381 .residual_region_segment_number
4382 .unwrap_or(layout.region_segment_number),
4383 seg_type: SegmentType::ImmediateGenericRegion,
4384 deferred_non_retain: false,
4385 retain_flags: 0,
4386 page_association_type: 0,
4387 referred_to: Vec::new(),
4388 page: Some(layout.page_number),
4389 payload: residual_payload,
4390 });
4391 }
4392 } else {
4393 let generic_start = Instant::now();
4394 let generic_region_payload =
4395 self.encode_generic_region_payload_at(&page.image, 0, 0)?;
4396 generic_region_time += generic_start.elapsed();
4397
4398 page_segments.push(Segment {
4399 number: layout.region_segment_number,
4400 seg_type: SegmentType::ImmediateGenericRegion,
4401 deferred_non_retain: false,
4402 retain_flags: 0,
4403 page_association_type: 0,
4404 referred_to: Vec::new(),
4405 page: Some(layout.page_number),
4406 payload: generic_region_payload,
4407 });
4408 }
4409
4410 page_segments.push(Segment {
4411 number: layout.end_of_page_segment_number,
4412 seg_type: SegmentType::EndOfPage,
4413 deferred_non_retain: false,
4414 retain_flags: 0,
4415 page_association_type: 0,
4416 referred_to: Vec::new(),
4417 page: Some(layout.page_number),
4418 payload: Vec::new(),
4419 });
4420
4421 Ok(BuiltPage {
4422 page: PlannedPage {
4423 page_number: layout.page_number,
4424 segments: page_segments,
4425 },
4426 symbol_dict_time,
4427 text_region_time,
4428 generic_region_time,
4429 })
4430 }
4431
4432 fn validate_plan(&self, plan: &PlannedDocument) -> Result<()> {
4433 let mut global_numbers = HashSet::new();
4434
4435 for seg in &plan.global_segments {
4436 if !global_numbers.insert(seg.number) {
4437 anyhow::bail!("Duplicate segment number in globals: {}", seg.number);
4438 }
4439 }
4440
4441 for (page_idx, page) in plan.pages.iter().enumerate() {
4442 let mut page_numbers = global_numbers.clone();
4445 for seg in &page.segments {
4446 if !page_numbers.insert(seg.number) {
4447 anyhow::bail!(
4448 "Duplicate segment number {} on page {}",
4449 seg.number,
4450 page_idx
4451 );
4452 }
4453 }
4454
4455 for seg in &page.segments {
4456 for referred in &seg.referred_to {
4457 if !page_numbers.contains(referred) {
4458 anyhow::bail!(
4459 "Page {} segment {} refers to missing segment {}",
4460 page.page_number,
4461 seg.number,
4462 referred
4463 );
4464 }
4465 if global_numbers.contains(referred) && plan.global_segments.is_empty() {
4466 anyhow::bail!(
4467 "Page {} segment {} refers to global {} but no globals stream exists",
4468 page.page_number,
4469 seg.number,
4470 referred
4471 );
4472 }
4473 }
4474 }
4475 }
4476
4477 if let Some(eof) = &plan.eof_segment {
4478 if global_numbers.contains(&eof.number) {
4479 anyhow::bail!("EOF segment number {} conflicts with globals", eof.number);
4480 }
4481 }
4482
4483 for seg in &plan.global_segments {
4484 for referred in &seg.referred_to {
4485 if !global_numbers.contains(referred) {
4486 anyhow::bail!(
4487 "Global segment {} refers to missing segment {}",
4488 seg.number,
4489 referred
4490 );
4491 }
4492 }
4493 }
4494
4495 Ok(())
4496 }
4497
4498 fn serialize_full_document(&self, plan: &PlannedDocument) -> Result<Vec<u8>> {
4499 let mut output = Vec::new();
4500 if let Some(header) = &plan.file_header {
4501 output.extend(header.to_bytes());
4502 }
4503 for seg in &plan.global_segments {
4504 seg.write_into(&mut output)?;
4505 }
4506 for page in &plan.pages {
4507 for seg in &page.segments {
4508 seg.write_into(&mut output)?;
4509 }
4510 }
4511 if let Some(eof) = &plan.eof_segment {
4512 eof.write_into(&mut output)?;
4513 }
4514 Ok(output)
4515 }
4516
4517 fn serialize_pdf_split(
4518 &self,
4519 plan: &PlannedDocument,
4520 ) -> Result<(
4521 Option<Vec<u8>>,
4522 Vec<Vec<u8>>,
4523 Vec<usize>,
4524 Vec<usize>,
4525 Vec<usize>,
4526 )> {
4527 let global_segments = if plan.global_segments.is_empty() {
4528 None
4529 } else {
4530 let mut out = Vec::new();
4531 for seg in &plan.global_segments {
4532 seg.write_into(&mut out)?;
4533 }
4534 Some(out)
4535 };
4536
4537 #[cfg(feature = "parallel")]
4538 let page_streams = plan
4539 .pages
4540 .par_iter()
4541 .map(|page| {
4542 let mut page_out = Vec::new();
4543 let mut local_dict_bytes = 0usize;
4544 let mut text_region_bytes = 0usize;
4545 let mut generic_region_bytes = 0usize;
4546 for seg in &page.segments {
4547 let start_len = page_out.len();
4548 seg.write_into(&mut page_out)?;
4549 let seg_len = page_out.len().saturating_sub(start_len);
4550 match seg.seg_type {
4551 SegmentType::SymbolDictionary => local_dict_bytes += seg_len,
4552 SegmentType::ImmediateTextRegion => text_region_bytes += seg_len,
4553 SegmentType::ImmediateGenericRegion => generic_region_bytes += seg_len,
4554 _ => {}
4555 }
4556 }
4557 Ok((
4558 page_out,
4559 local_dict_bytes,
4560 text_region_bytes,
4561 generic_region_bytes,
4562 ))
4563 })
4564 .collect::<Vec<Result<(Vec<u8>, usize, usize, usize)>>>()
4565 .into_iter()
4566 .collect::<Result<Vec<_>>>()?;
4567
4568 #[cfg(not(feature = "parallel"))]
4569 let page_streams = {
4570 let mut page_streams = Vec::with_capacity(plan.pages.len());
4571 for page in &plan.pages {
4572 let mut page_out = Vec::new();
4573 let mut local_dict_bytes = 0usize;
4574 let mut text_region_bytes = 0usize;
4575 let mut generic_region_bytes = 0usize;
4576 for seg in &page.segments {
4577 let start_len = page_out.len();
4578 seg.write_into(&mut page_out)?;
4579 let seg_len = page_out.len().saturating_sub(start_len);
4580 match seg.seg_type {
4581 SegmentType::SymbolDictionary => local_dict_bytes += seg_len,
4582 SegmentType::ImmediateTextRegion => text_region_bytes += seg_len,
4583 SegmentType::ImmediateGenericRegion => generic_region_bytes += seg_len,
4584 _ => {}
4585 }
4586 }
4587 page_streams.push((
4588 page_out,
4589 local_dict_bytes,
4590 text_region_bytes,
4591 generic_region_bytes,
4592 ));
4593 }
4594 page_streams
4595 };
4596
4597 let mut raw_pages = Vec::with_capacity(page_streams.len());
4598 let mut local_dict_bytes_per_page = Vec::with_capacity(page_streams.len());
4599 let mut text_region_bytes_per_page = Vec::with_capacity(page_streams.len());
4600 let mut generic_region_bytes_per_page = Vec::with_capacity(page_streams.len());
4601 for (page_out, local_dict_bytes, text_region_bytes, generic_region_bytes) in page_streams {
4602 raw_pages.push(page_out);
4603 local_dict_bytes_per_page.push(local_dict_bytes);
4604 text_region_bytes_per_page.push(text_region_bytes);
4605 generic_region_bytes_per_page.push(generic_region_bytes);
4606 }
4607
4608 Ok((
4609 global_segments,
4610 raw_pages,
4611 local_dict_bytes_per_page,
4612 text_region_bytes_per_page,
4613 generic_region_bytes_per_page,
4614 ))
4615 }
4616
4617 fn prune_symbols_if_needed(&mut self) {
4618 }
4621
4622 fn cluster_symbols(&mut self) -> Result<()> {
4633 let n = self.global_symbols.len();
4634 if n < 2 {
4635 return Ok(());
4636 }
4637
4638 let mut parent: Vec<usize> = (0..n).collect();
4640 let mut uf_rank: Vec<u32> = vec![0; n];
4641 let mut comparator = Comparator::default();
4642
4643 let mut buckets: HashMap<(usize, usize), Vec<usize>> = HashMap::new();
4645 for (i, sym) in self.global_symbols.iter().enumerate() {
4646 buckets.entry((sym.height, sym.width)).or_default().push(i);
4647 }
4648
4649 let mut bucket_keys: Vec<(usize, usize)> = buckets.keys().copied().collect();
4651 bucket_keys.sort_unstable();
4652
4653 let mut compare_pair = |a_idx: usize, b_idx: usize| {
4654 if uf_find(&mut parent, a_idx) == uf_find(&mut parent, b_idx) {
4655 return;
4656 }
4657
4658 let a_sym = &self.global_symbols[a_idx];
4659 let b_sym = &self.global_symbols[b_idx];
4660 let dim_limit = if self.config.text_refine { 2 } else { 1 };
4661 if (a_sym.width as i32 - b_sym.width as i32).abs() > dim_limit
4662 || (a_sym.height as i32 - b_sym.height as i32).abs() > dim_limit
4663 {
4664 return;
4665 }
4666
4667 let area = a_sym.width.max(b_sym.width) * a_sym.height.max(b_sym.height);
4668 let max_err = if self.config.text_refine {
4669 ((self.symbol_pixel_counts[a_idx].max(self.symbol_pixel_counts[b_idx]) as f32
4670 * 0.10) as u32)
4671 .max(((area as f32) * 0.05) as u32)
4672 .clamp(3, 20)
4673 } else {
4674 ((area as f32 * 0.04) as u32).clamp(2, 12)
4675 };
4676 if self.symbol_pixel_counts[a_idx].abs_diff(self.symbol_pixel_counts[b_idx])
4677 > max_err as usize
4678 {
4679 return;
4680 }
4681
4682 let dy_limit = if self.config.text_refine { 1 } else { 0 };
4683 if let Some(result) =
4684 comparator.compare_for_refine_family(a_sym, b_sym, max_err, dim_limit, dy_limit)
4685 {
4686 let dx = result.dx;
4687 let dy = result.dy;
4688 if dx.abs() <= dim_limit && dy.abs() <= dy_limit {
4689 uf_union(&mut parent, &mut uf_rank, a_idx, b_idx);
4690 }
4691 }
4692 };
4693
4694 for &(bh, bw) in &bucket_keys {
4695 let current_bucket = &buckets[&(bh, bw)];
4696 for ci in 0..current_bucket.len() {
4697 for cj in (ci + 1)..current_bucket.len() {
4698 compare_pair(current_bucket[ci], current_bucket[cj]);
4699 }
4700 }
4701
4702 for dh in -1i32..=1 {
4703 for dw in -1i32..=1 {
4704 let nh = bh as i32 + dh;
4705 let nw = bw as i32 + dw;
4706 if nh < 0 || nw < 0 {
4707 continue;
4708 }
4709 let neighbor_key = (nh as usize, nw as usize);
4710 if neighbor_key <= (bh, bw) {
4711 continue;
4712 }
4713 if let Some(neighbor_bucket) = buckets.get(&neighbor_key) {
4714 for &a_idx in current_bucket {
4715 for &b_idx in neighbor_bucket {
4716 compare_pair(a_idx, b_idx);
4717 }
4718 }
4719 }
4720 }
4721 }
4722 }
4723
4724 let mut clusters: HashMap<usize, Vec<usize>> = HashMap::new();
4726 for i in 0..n {
4727 let root = uf_find(&mut parent, i);
4728 clusters.entry(root).or_default().push(i);
4729 }
4730
4731 let mut old_to_prototype: Vec<usize> = (0..n).collect();
4733 for (_, members) in &clusters {
4734 if members.len() <= 1 {
4735 continue;
4736 }
4737 let prototype = self.choose_cluster_prototype(members);
4738 for &m in members {
4739 old_to_prototype[m] = prototype;
4740 }
4741 }
4742
4743 let mut seen_prototypes: HashMap<usize, usize> = HashMap::new();
4745 let mut new_symbols: Vec<BitImage> = Vec::new();
4746 let mut old_to_new: Vec<usize> = vec![0; n];
4747
4748 for i in 0..n {
4750 let proto = old_to_prototype[i];
4751 if let Some(&new_idx) = seen_prototypes.get(&proto) {
4752 old_to_new[i] = new_idx;
4753 } else {
4754 let new_idx = new_symbols.len();
4755 new_symbols.push(self.global_symbols[proto].clone());
4756 seen_prototypes.insert(proto, new_idx);
4757 old_to_new[i] = new_idx;
4758 }
4759 }
4760
4761 let old_count = n;
4762 let new_count = new_symbols.len();
4763
4764 for page in &mut self.pages {
4766 for inst in &mut page.symbol_instances {
4767 let old_idx = inst.symbol_index;
4768 let new_idx = old_to_new[old_idx];
4769 let proto = old_to_prototype[old_idx];
4770
4771 if old_idx != proto {
4774 inst.needs_refinement = true;
4775 let (_, trimmed_inst) = inst.instance_bitmap.trim();
4778 let max_ref_err = (trimmed_inst.width * trimmed_inst.height) as u32;
4779 if let Some((_, dx, dy)) =
4780 comparator.distance(&trimmed_inst, &new_symbols[new_idx], max_ref_err)
4781 {
4782 inst.refinement_dx = dx;
4783 inst.refinement_dy = dy;
4784 }
4785 }
4786
4787 inst.symbol_index = new_idx;
4788 }
4789 }
4790
4791 self.global_symbols = new_symbols;
4793 self.symbol_pixel_counts = self
4794 .global_symbols
4795 .iter()
4796 .map(BitImage::count_ones)
4797 .collect();
4798 self.rebuild_symbol_metadata();
4799 self.rebuild_hash_map();
4800
4801 debug!(
4802 "Clustering: {} -> {} prototype symbols ({:.1}% reduction)",
4803 old_count,
4804 new_count,
4805 (1.0 - new_count as f64 / old_count.max(1) as f64) * 100.0
4806 );
4807
4808 Ok(())
4809 }
4810
4811 fn validate_symbol_instance_indices(&self) -> Result<()> {
4812 for (page_num, page) in self.pages.iter().enumerate() {
4813 for instance in &page.symbol_instances {
4814 if instance.symbol_index >= self.global_symbols.len() {
4815 anyhow::bail!(
4816 "Page {} has symbol instance {} out of range after pruning (max {})",
4817 page_num + 1,
4818 instance.symbol_index,
4819 self.global_symbols.len().saturating_sub(1)
4820 );
4821 }
4822 }
4823 }
4824 Ok(())
4825 }
4826
4827 fn validate_symbol_partition(
4828 &self,
4829 global_symbol_indices: &[usize],
4830 page_local_symbols: &[Vec<usize>],
4831 page_residual_symbols: &[Vec<usize>],
4832 page_residual_anchor_remaps: &[FxHashMap<usize, usize>],
4833 page_uses_generic_region: &[bool],
4834 ) -> Result<()> {
4835 let global_set: HashSet<usize> = global_symbol_indices.iter().copied().collect();
4836 for (page_num, page) in self.pages.iter().enumerate() {
4837 if page_uses_generic_region[page_num] {
4838 continue;
4839 }
4840 let local_set: HashSet<usize> = page_local_symbols[page_num].iter().copied().collect();
4841 let residual_set: HashSet<usize> =
4842 page_residual_symbols[page_num].iter().copied().collect();
4843 for inst in &page.symbol_instances {
4844 let idx = inst.symbol_index;
4845 if !global_set.contains(&idx)
4846 && !page_residual_anchor_remaps[page_num].contains_key(&idx)
4847 && !local_set.contains(&idx)
4848 && !residual_set.contains(&idx)
4849 {
4850 anyhow::bail!(
4851 "Page {} symbol {} was not resolved to global, local, or residual output",
4852 page_num + 1,
4853 idx
4854 );
4855 }
4856 }
4857 }
4858 Ok(())
4859 }
4860
4861 fn auto_threshold(&mut self) -> Result<()> {
4862 let mut i = 0;
4863 let mut comparator = Comparator::default();
4864 while i < self.global_symbols.len() {
4865 let mut j = i + 1;
4866 while j < self.global_symbols.len() {
4867 if comparator
4868 .distance(&self.global_symbols[i], &self.global_symbols[j], 0)
4869 .is_some()
4870 {
4871 self.unite_templates(i, j)?;
4872 } else {
4873 j += 1;
4874 }
4875 }
4876 i += 1;
4877 }
4878 Ok(())
4879 }
4880
4881 fn auto_threshold_using_hash(&mut self) -> Result<()> {
4882 loop {
4886 let mut hashed_templates: HashMap<u32, Vec<usize>> = HashMap::new();
4887 for (i, symbol) in self.global_symbols.iter().enumerate() {
4888 let hash = compute_symbol_hash(symbol);
4889 hashed_templates.entry(hash).or_default().push(i);
4890 }
4891
4892 let mut comparator = Comparator::default();
4893 let mut merged = false;
4894
4895 for (_, bucket) in &hashed_templates {
4896 if bucket.len() < 2 {
4897 continue;
4898 }
4899 'outer: for bi in 0..bucket.len() {
4901 for bj in (bi + 1)..bucket.len() {
4902 if comparator
4903 .distance(
4904 &self.global_symbols[bucket[bi]],
4905 &self.global_symbols[bucket[bj]],
4906 0,
4907 )
4908 .is_some()
4909 {
4910 self.unite_templates(bucket[bi], bucket[bj])?;
4911 merged = true;
4912 break 'outer;
4913 }
4914 }
4915 }
4916 if merged {
4917 break; }
4919 }
4920
4921 if !merged {
4922 break;
4923 }
4924 }
4925 Ok(())
4926 }
4927
4928 fn unite_templates(&mut self, target_idx: usize, source_idx: usize) -> Result<()> {
4929 if source_idx >= self.global_symbols.len() {
4930 anyhow::bail!("Source index out of range");
4931 }
4932
4933 for page in &mut self.pages {
4934 for instance in &mut page.symbol_instances {
4935 if instance.symbol_index == source_idx {
4936 instance.symbol_index = target_idx;
4937 } else if instance.symbol_index > source_idx {
4938 instance.symbol_index -= 1;
4939 }
4940 }
4941 }
4942
4943 self.global_symbols.remove(source_idx);
4944 self.symbol_pixel_counts.remove(source_idx);
4945 self.rebuild_symbol_metadata();
4946 self.rebuild_hash_map();
4947
4948 Ok(())
4949 }
4950
4951 pub fn next_segment_number(&mut self) -> u32 {
4952 let num = self.next_segment_number;
4953 self.next_segment_number += 1;
4954 num
4955 }
4956
4957 pub fn flush_dict(&mut self) -> Result<Vec<u8>> {
4958 if self.global_symbols.is_empty() {
4959 return Ok(Vec::new());
4960 }
4961
4962 let symbol_refs: Vec<&BitImage> = self.global_symbols.iter().collect();
4963 let dict_data = encode_symbol_dict(&symbol_refs, &self.config, 0)?;
4964
4965 let dict_segment = Segment {
4966 number: self.next_segment_number,
4967 seg_type: SegmentType::SymbolDictionary,
4968 deferred_non_retain: false,
4969 retain_flags: 0,
4970 page_association_type: if self.state.pdf_mode { 2 } else { 0 },
4971 referred_to: Vec::new(),
4972 page: if self.state.pdf_mode { None } else { Some(1) },
4973 payload: dict_data,
4974 };
4975 self.next_segment_number += 1;
4976
4977 let mut output = Vec::new();
4978 if self.state.pdf_mode {
4979 dict_segment.write_into(&mut output)?;
4980 return Ok(output);
4981 }
4982
4983 let header = FileHeader {
4984 organisation_type: true,
4985 unknown_n_pages: false,
4986 n_pages: 1,
4987 };
4988 output.extend(header.to_bytes());
4989 dict_segment.write_into(&mut output)?;
4990
4991 Ok(output)
4992 }
4993
4994 fn build_instance_residual_bitmap(
4995 &self,
4996 instances: &[SymbolInstance],
4997 ) -> Result<Option<(BitImage, u32, u32)>> {
4998 if instances.is_empty() {
4999 return Ok(None);
5000 }
5001
5002 let mut min_x = u32::MAX;
5003 let mut min_y = u32::MAX;
5004 let mut max_x = 0u32;
5005 let mut max_y = 0u32;
5006 let mut has_pixels = false;
5007
5008 for instance in instances {
5009 if instance.instance_bitmap.count_ones() == 0 {
5010 continue;
5011 }
5012 has_pixels = true;
5013 min_x = min_x.min(instance.position.x);
5014 min_y = min_y.min(instance.position.y);
5015 max_x = max_x.max(instance.position.x + instance.instance_bitmap.width as u32);
5016 max_y = max_y.max(instance.position.y + instance.instance_bitmap.height as u32);
5017 }
5018
5019 if !has_pixels || max_x <= min_x || max_y <= min_y {
5020 return Ok(None);
5021 }
5022
5023 let width = max_x - min_x;
5024 let height = max_y - min_y;
5025 let mut residual = BitImage::new(width, height).map_err(|e| anyhow!(e))?;
5026 for instance in instances {
5027 let offset_x = (instance.position.x - min_x) as usize;
5028 let offset_y = (instance.position.y - min_y) as usize;
5029 for y in 0..instance.instance_bitmap.height {
5030 for x in 0..instance.instance_bitmap.width {
5031 if instance.instance_bitmap.get_usize(x, y) {
5032 residual.set_usize(offset_x + x, offset_y + y, true);
5033 }
5034 }
5035 }
5036 }
5037
5038 if residual.count_ones() == 0 {
5039 return Ok(None);
5040 }
5041
5042 Ok(Some((residual, min_x, min_y)))
5043 }
5044
5045 fn encode_generic_region_payload_at(
5046 &self,
5047 image: &BitImage,
5048 x: u32,
5049 y: u32,
5050 ) -> Result<Vec<u8>> {
5051 let mut gr_cfg = GenericRegionConfig::new(
5052 image.width as u32,
5053 image.height as u32,
5054 self.config.generic.dpi,
5055 );
5056 gr_cfg.x = x;
5057 gr_cfg.y = y;
5058 gr_cfg.comb_operator = self.config.generic.comb_operator;
5059 gr_cfg.mmr = self.config.generic.mmr;
5060 gr_cfg.tpgdon = self.config.generic.tpgdon;
5061 gr_cfg.validate().map_err(|e: &'static str| anyhow!(e))?;
5062
5063 let coder_data = Jbig2ArithCoder::encode_generic_payload_cfg(image, &gr_cfg)?;
5064 let params: GenericRegionParams = gr_cfg.clone().into();
5065 let mut payload = params.to_bytes();
5066 payload.extend_from_slice(&coder_data);
5067 Ok(payload)
5068 }
5069}
5070
5071pub fn encode_generic_region(img: &BitImage, cfg: &Jbig2Config) -> Result<Vec<u8>> {
5074 let mut gr_cfg = GenericRegionParams::new(img.width as u32, img.height as u32, cfg.generic.dpi);
5076 gr_cfg.comb_operator = cfg.generic.comb_operator;
5077 gr_cfg.mmr = cfg.generic.mmr;
5078 gr_cfg.tpgdon = cfg.generic.tpgdon;
5079 gr_cfg.validate().map_err(|e: &'static str| anyhow!(e))?;
5080
5081 let coder_data =
5082 Jbig2ArithCoder::encode_generic_payload(img, gr_cfg.template, &gr_cfg.at_pixels)?;
5083
5084 let params: GenericRegionParams = gr_cfg.clone();
5085
5086 let mut generic_region_payload = params.to_bytes();
5087 generic_region_payload.extend_from_slice(&coder_data);
5088
5089 let generic_region_segment = Segment {
5091 number: 1, seg_type: SegmentType::ImmediateGenericRegion,
5093 deferred_non_retain: false,
5094 retain_flags: 0,
5095 page_association_type: 0, referred_to: Vec::new(),
5097 page: Some(1), payload: generic_region_payload.clone(), };
5100
5101 if !cfg.want_full_headers {
5103 let mut seg_bytes = Vec::new();
5104 generic_region_segment.write_into(&mut seg_bytes)?;
5105 return Ok(seg_bytes);
5106 }
5107
5108 let mut out = Vec::with_capacity(generic_region_payload.len() + 64);
5110
5111 out.extend_from_slice(
5113 &FileHeader {
5114 organisation_type: true,
5115 unknown_n_pages: false,
5116 n_pages: 1,
5117 }
5118 .to_bytes(),
5119 );
5120
5121 Segment {
5123 number: 0,
5124 seg_type: SegmentType::PageInformation,
5125 deferred_non_retain: false,
5126 retain_flags: 0,
5127 page_association_type: 0,
5128 referred_to: vec![],
5129 page: Some(1),
5130 payload: PageInfo {
5131 width: img.width as u32,
5132 height: img.height as u32,
5133 xres: cfg.generic.dpi,
5134 yres: cfg.generic.dpi,
5135 is_lossless: cfg.is_lossless,
5136 default_pixel: cfg.default_pixel,
5137 default_operator: cfg.generic.comb_operator,
5138 ..Default::default()
5139 }
5140 .to_bytes(),
5141 }
5142 .write_into(&mut out)?;
5143
5144 generic_region_segment.write_into(&mut out)?;
5146
5147 Segment {
5149 number: 2,
5150 seg_type: SegmentType::EndOfFile,
5151 deferred_non_retain: false,
5152 retain_flags: 0,
5153 page_association_type: 2,
5154 referred_to: vec![],
5155 page: None,
5156 payload: vec![],
5157 }
5158 .write_into(&mut out)?;
5159
5160 Ok(out)
5161}
5162
5163pub fn encode_symbol_dict(
5164 symbols: &[&BitImage],
5165 _config: &Jbig2Config,
5166 num_imported_symbols: u32,
5167) -> Result<Vec<u8>> {
5168 let (payload, _order) = encode_symbol_dict_with_order(symbols, _config, num_imported_symbols)?;
5169 Ok(payload)
5170}
5171
5172pub fn canonicalize_dict_symbols(symbols: &[&BitImage]) -> Vec<usize> {
5180 let mut valid: Vec<(usize, &BitImage)> = symbols
5182 .iter()
5183 .enumerate()
5184 .filter(|(_, sym)| sym.width > 0 && sym.height > 0)
5185 .map(|(i, sym)| (i, *sym))
5186 .collect();
5187
5188 valid.sort_by(|a, b| (a.1.height, a.1.width).cmp(&(b.1.height, b.1.width)));
5193
5194 valid.into_iter().map(|(orig_idx, _)| orig_idx).collect()
5196}
5197
5198fn plan_symbol_dictionary_layout(
5199 symbols: &[&BitImage],
5200 config: &Jbig2Config,
5201 usage_weights: Option<&[usize]>,
5202) -> Result<SymbolDictLayout> {
5203 let canonical_order = canonicalize_dict_symbols(symbols);
5204 if canonical_order.is_empty() {
5205 return Err(anyhow!(
5206 "encode_symbol_dict: no valid symbols supplied (all symbols had zero width or height)"
5207 ));
5208 }
5209
5210 let _ = (config, usage_weights);
5211 Ok(SymbolDictLayout {
5212 export_input_indices: canonical_order,
5213 refinements: vec![None; symbols.len()],
5214 diagnostics: SymbolDictDiagnostics {
5215 singleton_family_count: symbols.len(),
5216 exported_member_count: symbols.len(),
5217 ..Default::default()
5218 },
5219 })
5220}
5221
5222fn build_refinement_family_layout(
5223 symbols: &[&BitImage],
5224 canonical_order: &[usize],
5225 usage_weights: Option<&[usize]>,
5226) -> SymbolDictLayout {
5227 let mut comparator = Comparator::default();
5228 let signatures: Vec<SymbolSignature> = symbols
5229 .iter()
5230 .map(|sym| compute_symbol_signature_shared(sym))
5231 .collect();
5232 let black_counts: Vec<usize> = symbols.iter().map(|sym| sym.count_ones()).collect();
5233
5234 let mut canonical_pos = vec![usize::MAX; symbols.len()];
5235 for (pos, &input_index) in canonical_order.iter().enumerate() {
5236 canonical_pos[input_index] = pos;
5237 }
5238
5239 let mut bucket_map: HashMap<FamilyBucketKey, Vec<usize>> = HashMap::new();
5240 for &input_index in canonical_order {
5241 let key = family_bucket_key_for_symbol(symbols[input_index], &signatures[input_index]);
5242 bucket_map.entry(key).or_default().push(input_index);
5243 }
5244
5245 let mut parent: Vec<usize> = (0..symbols.len()).collect();
5246 let mut rank = vec![0u32; symbols.len()];
5247
5248 for &input_index in canonical_order {
5249 let symbol = symbols[input_index];
5250 let key = family_bucket_key_for_symbol(symbol, &signatures[input_index]);
5251
5252 for neighbor in family_bucket_neighbors(key) {
5253 let Some(bucket) = bucket_map.get(&neighbor) else {
5254 continue;
5255 };
5256 for &other_input_index in bucket {
5257 if canonical_pos[other_input_index] >= canonical_pos[input_index] {
5258 continue;
5259 }
5260 if family_match_details(
5261 &mut comparator,
5262 symbol,
5263 input_index,
5264 symbols[other_input_index],
5265 other_input_index,
5266 &signatures,
5267 &black_counts,
5268 )
5269 .is_some()
5270 {
5271 uf_union(&mut parent, &mut rank, input_index, other_input_index);
5272 }
5273 }
5274 }
5275 }
5276
5277 let mut families: HashMap<usize, Vec<usize>> = HashMap::new();
5278 for &input_index in canonical_order {
5279 let root = uf_find(&mut parent, input_index);
5280 families.entry(root).or_default().push(input_index);
5281 }
5282
5283 let mut export_input_indices = Vec::new();
5284 let mut refinements = vec![None; symbols.len()];
5285 let mut diagnostics = SymbolDictDiagnostics::default();
5286
5287 let mut family_members: Vec<Vec<usize>> = families.into_values().collect();
5288 family_members.sort_by_key(|members| canonical_pos[members[0]]);
5289 diagnostics.family_count = family_members.len();
5290
5291 for mut members in family_members {
5292 members.sort_by_key(|&input_index| canonical_pos[input_index]);
5293 if members.len() == 1 {
5294 diagnostics.singleton_family_count += 1;
5295 diagnostics.exported_member_count += 1;
5296 export_input_indices.push(members[0]);
5297 continue;
5298 }
5299
5300 let prototype_input_index = choose_family_prototype(
5301 &members,
5302 symbols,
5303 usage_weights,
5304 &canonical_pos,
5305 &signatures,
5306 &black_counts,
5307 );
5308 if diagnostics.sample_lines.len() < 128 {
5309 diagnostics.sample_lines.push(format!(
5310 "refine family: prototype={} members={} prototype_usage={}",
5311 prototype_input_index,
5312 members.len(),
5313 usage_weights
5314 .and_then(|weights| weights.get(prototype_input_index).copied())
5315 .unwrap_or(1)
5316 ));
5317 }
5318 export_input_indices.push(prototype_input_index);
5319 diagnostics.exported_member_count += 1;
5320
5321 for &member_input_index in &members {
5322 if member_input_index == prototype_input_index {
5323 continue;
5324 }
5325
5326 let maybe_match = family_match_details(
5327 &mut comparator,
5328 symbols[member_input_index],
5329 member_input_index,
5330 symbols[prototype_input_index],
5331 prototype_input_index,
5332 &signatures,
5333 &black_counts,
5334 );
5335
5336 match maybe_match {
5337 Some((err, dx, dy))
5338 if family_should_refine(
5339 symbols[member_input_index],
5340 symbols[prototype_input_index],
5341 err,
5342 dx,
5343 dy,
5344 usage_weights
5345 .and_then(|weights| weights.get(member_input_index).copied())
5346 .unwrap_or(1),
5347 ) =>
5348 {
5349 refinements[member_input_index] = Some(RefinementPlan {
5350 prototype_input_index,
5351 refinement_dx: dx,
5352 refinement_dy: dy,
5353 });
5354 diagnostics.refined_member_count += 1;
5355 if diagnostics.sample_lines.len() < 128 {
5356 diagnostics.sample_lines.push(format!(
5357 " refine member={} -> prototype={} dx={} dy={} err={} usage={}",
5358 member_input_index,
5359 prototype_input_index,
5360 dx,
5361 dy,
5362 err,
5363 usage_weights
5364 .and_then(|weights| weights.get(member_input_index).copied())
5365 .unwrap_or(1)
5366 ));
5367 }
5368 }
5369 _ => {
5370 export_input_indices.push(member_input_index);
5371 diagnostics.exported_member_count += 1;
5372 if diagnostics.sample_lines.len() < 128 {
5373 diagnostics.sample_lines.push(format!(
5374 " export member={} as standalone usage={}",
5375 member_input_index,
5376 usage_weights
5377 .and_then(|weights| weights.get(member_input_index).copied())
5378 .unwrap_or(1)
5379 ));
5380 }
5381 }
5382 }
5383 }
5384 }
5385
5386 export_input_indices.sort_by_key(|&input_index| canonical_pos[input_index]);
5387
5388 SymbolDictLayout {
5389 export_input_indices,
5390 refinements,
5391 diagnostics,
5392 }
5393}
5394
5395fn family_refinement_gain(
5396 target: &BitImage,
5397 reference: &BitImage,
5398 err: u32,
5399 dx: i32,
5400 dy: i32,
5401) -> i64 {
5402 let plain_cost = symbol_dictionary_entry_bytes(target) as i64 + 10;
5403 let refine_cost = 10
5404 + err as i64
5405 + ((dx.abs() + dy.abs()) as i64 * 3)
5406 + (target.width.abs_diff(reference.width) + target.height.abs_diff(reference.height))
5407 as i64
5408 * 2;
5409 plain_cost - refine_cost
5410}
5411
5412fn family_should_refine(
5413 target: &BitImage,
5414 reference: &BitImage,
5415 err: u32,
5416 dx: i32,
5417 dy: i32,
5418 usage_count: usize,
5419) -> bool {
5420 if usage_count > 1 {
5421 return false;
5422 }
5423 let export_gain = family_refinement_gain(target, reference, err, dx, dy);
5424 export_gain > 12
5425}
5426
5427fn choose_family_prototype(
5428 members: &[usize],
5429 symbols: &[&BitImage],
5430 usage_weights: Option<&[usize]>,
5431 canonical_pos: &[usize],
5432 signatures: &[SymbolSignature],
5433 black_counts: &[usize],
5434) -> usize {
5435 if members.len() == 1 {
5436 return members[0];
5437 }
5438
5439 let mut comparator = Comparator::default();
5440 let mut best_idx = members[0];
5441 let mut best_cost = u64::MAX;
5442 let mut best_support = 0u64;
5443
5444 for &candidate in members {
5445 let mut total_cost = 0u64;
5446 for &other in members {
5447 if candidate == other {
5448 continue;
5449 }
5450 let weight = usage_weights
5451 .and_then(|weights| weights.get(other).copied())
5452 .unwrap_or(1) as u64;
5453 match family_match_details(
5454 &mut comparator,
5455 symbols[other],
5456 other,
5457 symbols[candidate],
5458 candidate,
5459 signatures,
5460 black_counts,
5461 ) {
5462 Some((err, dx, dy)) => {
5463 total_cost += (refine_compare_score(err, dx, dy) as u64 + 4) * weight;
5464 }
5465 None => total_cost += 1_000_000 * weight,
5466 }
5467 }
5468
5469 let candidate_support = usage_weights
5470 .and_then(|weights| weights.get(candidate).copied())
5471 .unwrap_or(1) as u64;
5472 let score_close = if best_cost == u64::MAX {
5473 false
5474 } else {
5475 total_cost <= best_cost + best_cost / 50
5476 };
5477
5478 if total_cost < best_cost
5479 || (score_close && candidate_support > best_support)
5480 || (total_cost == best_cost
5481 && candidate_support == best_support
5482 && canonical_pos[candidate] < canonical_pos[best_idx])
5483 {
5484 best_cost = total_cost;
5485 best_idx = candidate;
5486 best_support = candidate_support;
5487 }
5488 }
5489
5490 best_idx
5491}
5492
5493fn encode_symbol_dictionary_segments(
5494 symbols: &[&BitImage],
5495 config: &Jbig2Config,
5496 layout: &SymbolDictLayout,
5497) -> Result<EncodedSymbolDictionary> {
5498 let mut encoded = EncodedSymbolDictionary {
5499 payload: Vec::new(),
5500 input_to_exported_pos: vec![u32::MAX; symbols.len()],
5501 exported_symbol_count: 0,
5502 };
5503
5504 let (dict_payload, base_order) =
5505 encode_symbol_dict_subset_with_order(symbols, config, &layout.export_input_indices, 0)?;
5506 for (dict_pos, &input_index) in base_order.iter().enumerate() {
5507 encoded.input_to_exported_pos[input_index] = dict_pos as u32;
5508 }
5509 encoded.exported_symbol_count = base_order.len() as u32;
5510 encoded.payload = dict_payload;
5511
5512 for (input_index, refinement) in layout.refinements.iter().enumerate() {
5513 if let Some(refinement) = refinement {
5514 let prototype_pos = encoded.input_to_exported_pos[refinement.prototype_input_index];
5515 if prototype_pos != u32::MAX {
5516 encoded.input_to_exported_pos[input_index] = prototype_pos;
5517 }
5518 }
5519 }
5520
5521 Ok(encoded)
5522}
5523
5524fn encode_symbol_dict_subset_with_order(
5525 symbols: &[&BitImage],
5526 config: &Jbig2Config,
5527 subset_indices: &[usize],
5528 num_imported_symbols: u32,
5529) -> Result<(Vec<u8>, Vec<usize>)> {
5530 let subset_symbols: Vec<&BitImage> = subset_indices.iter().map(|&i| symbols[i]).collect();
5531 let (payload, subset_order) =
5532 encode_symbol_dict_with_order(&subset_symbols, config, num_imported_symbols)?;
5533 let input_order = subset_order
5534 .into_iter()
5535 .map(|subset_index| subset_indices[subset_index])
5536 .collect();
5537 Ok((payload, input_order))
5538}
5539
5540pub fn encode_symbol_dict_with_order(
5543 symbols: &[&BitImage],
5544 _config: &Jbig2Config,
5545 num_imported_symbols: u32,
5546) -> Result<(Vec<u8>, Vec<usize>)> {
5547 let canonical_order = canonicalize_dict_symbols(symbols);
5549
5550 if canonical_order.is_empty() {
5551 return Err(anyhow!(
5552 "encode_symbol_dict: no valid symbols supplied (all symbols had zero width or height)"
5553 ));
5554 }
5555
5556 let ordered_symbols: Vec<&BitImage> = canonical_order.iter().map(|&i| symbols[i]).collect();
5558
5559 for (i, sym) in ordered_symbols.iter().enumerate() {
5561 if sym.width > (1 << 24) || sym.height > (1 << 24) {
5562 return Err(anyhow!(
5563 "Symbol at index {} exceeds maximum dimensions ({}x{})",
5564 i,
5565 sym.width,
5566 sym.height
5567 ));
5568 }
5569 }
5570
5571 let mut payload = Vec::new();
5572 let mut coder = Jbig2ArithCoder::new();
5573
5574 let num_export_syms = ordered_symbols.len() as u32;
5575
5576 let params = SymbolDictParams {
5578 sd_template: 0, at: [(3, -1), (-3, -1), (2, -2), (-2, -2)],
5581 refine_aggregate: false,
5582 refine_template: 0,
5583 refine_at: [(0, 0), (0, 0)],
5584 exsyms: num_export_syms,
5585 newsyms: ordered_symbols.len() as u32,
5586 };
5587
5588 if cfg!(debug_assertions) {
5589 debug!("encode_symbol_dict: Exporting {} symbols", num_export_syms);
5590 trace!("encode_symbol_dict: SymbolDictParams details: {:?}", params);
5591 }
5592
5593 payload.extend(params.to_bytes());
5595
5596 let mut height_classes: Vec<Vec<&BitImage>> = Vec::new();
5600 let mut current_height: Option<usize> = None;
5601 let mut current_class: Vec<&BitImage> = Vec::new();
5602
5603 for &sym in &ordered_symbols {
5604 match current_height {
5605 None => {
5606 current_height = Some(sym.height);
5608 current_class.push(sym);
5609 }
5610 Some(h) if sym.height == h => {
5611 current_class.push(sym);
5613 }
5614 Some(_) => {
5615 height_classes.push(current_class);
5617 current_height = Some(sym.height);
5618 current_class = vec![sym];
5619 }
5620 }
5621 }
5622 if !current_class.is_empty() {
5623 height_classes.push(current_class);
5624 }
5625
5626 #[cfg(debug_assertions)]
5628 {
5629 debug!(
5630 "Symbol dictionary encoding order ({} symbols):",
5631 ordered_symbols.len()
5632 );
5633 let mut dict_pos = 0u32;
5634 for (hc_idx, symbols_in_class) in height_classes.iter().enumerate() {
5635 debug!(
5636 " Height class {}: {} symbols",
5637 hc_idx,
5638 symbols_in_class.len()
5639 );
5640 for (sym_idx, sym) in symbols_in_class.iter().enumerate() {
5641 let first_pixel = first_black_pixel(sym);
5643 if sym_idx < 5 || sym_idx >= symbols_in_class.len() - 2 {
5644 debug!(
5645 " dict_pos={} -> {}x{} first_pixel={:?}",
5646 dict_pos, sym.width, sym.height, first_pixel
5647 );
5648 } else if sym_idx == 5 {
5649 debug!(
5650 " ... ({} symbols omitted) ...",
5651 symbols_in_class.len() - 7
5652 );
5653 }
5654 dict_pos += 1;
5655 }
5656 }
5657 }
5658
5659 let mut last_height = 0;
5660
5661 for symbols_in_class in &height_classes {
5663 let h = symbols_in_class[0].height; let delta_h = h as i32 - last_height as i32;
5666 let _ = coder.encode_integer(crate::jbig2arith::IntProc::Iadh, delta_h);
5667 last_height = h;
5668
5669 let mut last_width = 0;
5670 #[cfg(debug_assertions)]
5671 let mut dict_pos = 0u32;
5672
5673 #[cfg(debug_assertions)]
5675 {
5676 debug!("Height class {} has {} symbols:", h, symbols_in_class.len());
5677 for (i, symbol) in symbols_in_class.iter().enumerate() {
5678 debug!(" Symbol {}: {}x{}", i, symbol.width, symbol.height);
5679 }
5680 }
5681
5682 for (i, symbol) in symbols_in_class.iter().enumerate() {
5685 let delta_w = symbol.width as i32 - last_width;
5687
5688 #[cfg(debug_assertions)]
5690 debug!(
5691 "Height class {}, Symbol {}: width={}, last_width={}, delta_w={}",
5692 h, i, symbol.width, last_width, delta_w
5693 );
5694
5695 let _ = coder.encode_integer(crate::jbig2arith::IntProc::Iadw, delta_w);
5696 last_width = symbol.width as i32; let packed = symbol.packed_words();
5700
5701 #[cfg(debug_assertions)]
5703 {
5704 debug!(
5705 " dict_pos={} {}x{} first_word={:08x}",
5706 dict_pos,
5707 symbol.width,
5708 symbol.height,
5709 packed.get(0).unwrap_or(&0)
5710 );
5711 }
5712
5713 if let Some(expected_first_pixel) = first_black_pixel(symbol) {
5715 let actual_first_pixel = crate::jbig2sym::first_black_pixel_in_packed(
5716 packed,
5717 symbol.width,
5718 symbol.height,
5719 );
5720 assert_eq!(
5721 actual_first_pixel,
5722 Some(expected_first_pixel),
5723 "bit-order / row-order mismatch in symbol dict packer! Expected first black pixel at {:?}, got {:?}",
5724 expected_first_pixel,
5725 actual_first_pixel
5726 );
5727 }
5728
5729 coder.encode_generic_region(
5730 packed,
5731 symbol.width,
5732 symbol.height,
5733 params.sd_template,
5734 &[(3, -1), (-3, -1), (2, -2), (-2, -2)],
5735 )?;
5736
5737 #[cfg(debug_assertions)]
5738 {
5739 dict_pos += 1;
5740 }
5741 }
5742
5743 let _ = coder.encode_oob(IntProc::Iadw);
5745 }
5746
5747 let _ = coder.encode_integer(IntProc::Iaex, 0);
5749 let _ = coder.encode_integer(IntProc::Iaex, num_export_syms as i32);
5750
5751 coder.flush(true);
5753
5754 payload.extend(coder.as_bytes());
5756
5757 Ok((payload, canonical_order))
5758}
5759
5760fn compute_region_bounds(
5769 instances: &[TextRegionSymbolInstance],
5770 all_known_symbols: &[&BitImage],
5771) -> (u32, u32, u32, u32) {
5772 if instances.is_empty() {
5773 return (0, 0, 0, 0);
5774 }
5775 let mut min_x = u32::MAX;
5776 let mut min_y = u32::MAX;
5777 let mut max_x_coord = 0u32;
5778 let mut max_y_coord = 0u32;
5779
5780 for instance in instances {
5781 let sym_idx = instance.symbol_id as usize;
5782 if sym_idx >= all_known_symbols.len() {
5783 continue; }
5785
5786 let pos = Rect {
5787 x: instance.x as u32, y: instance.y as u32, width: crate::jbig2shared::usize_to_u32(all_known_symbols[sym_idx].width),
5790 height: crate::jbig2shared::usize_to_u32(all_known_symbols[sym_idx].height),
5791 };
5792
5793 min_x = min_x.min(pos.x);
5794 min_y = min_y.min(pos.y);
5795 max_x_coord = max_x_coord.max(pos.x + pos.width);
5796 max_y_coord = max_y_coord.max(pos.y + pos.height);
5797 }
5798
5799 let region_width = if max_x_coord > min_x {
5801 max_x_coord - min_x
5802 } else {
5803 0
5804 };
5805
5806 let region_height = if max_y_coord > min_y {
5807 max_y_coord - min_y
5808 } else {
5809 0
5810 };
5811
5812 (min_x, min_y, region_width, region_height)
5813}
5814
5815pub fn encode_refine(
5816 instances: &[TextRegionSymbolInstance],
5817 all_known_symbols: &[&BitImage],
5818 data: &mut Vec<u8>,
5819 coder: &mut Jbig2ArithCoder,
5820) -> Result<()> {
5821 let (min_x, min_y, region_w, region_h) = compute_region_bounds(instances, all_known_symbols);
5823 let width = region_w.max(1);
5824 let height = region_h.max(1);
5825
5826 let mut flags: u8 = 0;
5829 flags |= 0x40; data.push(flags);
5831
5832 let params = TextRegionParams {
5833 width,
5834 height,
5835 x: min_x,
5836 y: min_y,
5837 ds_offset: 0,
5838 refine: true,
5839 log_strips: 0,
5840 ref_corner: 0,
5841 transposed: false,
5842 comb_op: 0,
5843 refine_template: 0,
5844 };
5845 data.extend(params.to_bytes());
5846
5847 let num_inst = instances.len() as u32;
5849 let _ = coder.encode_int_with_ctx(num_inst as i32, 16, IntProc::Iaai);
5850
5851 let mut region_buf = BitImage::new(width, height).expect("region bitmap too large");
5853
5854 for inst in instances {
5856 let sym_id = inst.symbol_id;
5858 let _ = coder.encode_iaid(sym_id, 16);
5859
5860 let _ = coder.encode_integer(IntProc::Iardx, inst.dx);
5862 let _ = coder.encode_integer(IntProc::Iardy, inst.dy);
5863
5864 if inst.is_refinement {
5866 if let Some(&sym) = all_known_symbols.get(sym_id as usize) {
5868 let ox = inst.x as u32 - min_x;
5870 let oy = inst.y as u32 - min_y;
5871
5872 for y in 0..sym.height as u32 {
5874 for x in 0..sym.width as u32 {
5875 let rx = ox + x;
5877 let ry = oy + y;
5878
5879 if rx >= width || ry >= height {
5881 continue;
5882 }
5883
5884 let ref_bit = sym.get_pixel_unchecked(x as usize, y as usize) as u8;
5887 let pred_bit =
5888 region_buf.get_pixel_unchecked(rx as usize, ry as usize) as u8;
5889
5890 let ctx = ((ref_bit << 1) | pred_bit) as usize;
5892
5893 let bit = ref_bit;
5895 coder.encode_bit(ctx, bit != 0);
5896
5897 if bit != 0 {
5899 region_buf.set(rx, ry, true);
5900 }
5901 }
5902 }
5903 }
5904 }
5905 }
5906
5907 coder.flush(true);
5909 data.extend(coder.as_bytes());
5910
5911 Ok(())
5912}
5913
/// Resolves a symbol index to its text-region symbol id.
///
/// Both maps are indexed by symbol index and use `u32::MAX` to mean "not in
/// this dictionary"; an index past the end of a map is treated the same way.
/// The global map takes precedence. A local position is offset by
/// `num_global_dict_symbols`. Returns `None` when neither map has an entry.
#[inline]
fn symbol_id_from_dense_maps(
    symbol_index: usize,
    global_sym_to_dict_pos: &[u32],
    num_global_dict_symbols: u32,
    local_sym_to_dict_pos: &[u32],
) -> Option<u32> {
    // Position of `symbol_index` in `map`, if present and not the sentinel.
    let position_in = |map: &[u32]| -> Option<u32> {
        map.get(symbol_index)
            .copied()
            .filter(|&pos| pos != u32::MAX)
    };

    match position_in(global_sym_to_dict_pos) {
        Some(global) => Some(global),
        None => position_in(local_sym_to_dict_pos).map(|local| num_global_dict_symbols + local),
    }
}
5946
5947pub fn encode_text_region_mapped(
5948 instances: &[SymbolInstance],
5949 config: &Jbig2Config,
5950 all_symbols: &[BitImage],
5951 global_sym_to_dict_pos: &[u32],
5952 num_global_dict_symbols: u32,
5953 local_sym_to_dict_pos: &[u32],
5954 page_num: usize,
5955 num_local_dict_symbols: u32,
5956) -> Result<Vec<u8>> {
5957 if instances.is_empty() {
5958 return Err(anyhow!("No symbol instances provided for text region"));
5959 }
5960
5961 let debug_encoding = page_num == 0 && std::env::var("JBIG2_DEBUG").map_or(false, |v| v == "1");
5962 let mut enc_debug_lines: Vec<String> = Vec::new();
5963
5964 let num_total_dict_symbols = num_global_dict_symbols + num_local_dict_symbols;
5965
5966 let mut payload = Vec::new();
5967 let mut coder = Jbig2ArithCoder::new();
5968
5969 let mut min_x = u32::MAX;
5970 let mut min_y = u32::MAX;
5971 let mut max_x_coord = 0u32;
5972 let mut max_y_coord = 0u32;
5973
5974 for instance in instances {
5975 let sym = &all_symbols[instance.symbol_index];
5976 min_x = min_x.min(instance.position.x);
5977 min_y = min_y.min(instance.position.y);
5978 max_x_coord = max_x_coord.max(instance.position.x + sym.width as u32);
5979 max_y_coord = max_y_coord.max(instance.position.y + sym.height as u32);
5980 }
5981
5982 let region_width = max_x_coord.saturating_sub(min_x);
5983 let region_height = max_y_coord.saturating_sub(min_y);
5984
5985 let params = TextRegionParams {
5986 width: region_width,
5987 height: region_height,
5988 x: min_x,
5989 y: min_y,
5990 ds_offset: config.text_ds_offset,
5991 refine: config.text_refine,
5992 log_strips: config.text_log_strips,
5993 ref_corner: config.text_ref_corner,
5994 transposed: config.text_transposed,
5995 comb_op: config.text_comb_op,
5996 refine_template: config.text_refine_template,
5997 };
5998
5999 payload.extend(params.to_bytes());
6000 payload.extend_from_slice(&(instances.len() as u32).to_be_bytes());
6001
6002 let symbol_id_bits = log2up(num_total_dict_symbols.max(1)).max(1);
6003
6004 #[derive(Clone, Copy)]
6005 struct EncodedInstance {
6006 strip_base: i32,
6007 x: i32,
6008 t_offset: i32,
6009 symbol_id: u32,
6010 symbol_width: i32,
6011 }
6012
6013 let strip_width = 1i32 << params.log_strips.min(3);
6014 let mut encoded_instances = Vec::with_capacity(instances.len());
6015
6016 for instance in instances {
6017 let gs_idx = instance.symbol_index;
6018 let sym = &all_symbols[gs_idx];
6019
6020 let symbol_id = if let Some(symbol_id) = symbol_id_from_dense_maps(
6023 gs_idx,
6024 global_sym_to_dict_pos,
6025 num_global_dict_symbols,
6026 local_sym_to_dict_pos,
6027 ) {
6028 symbol_id
6029 } else {
6030 anyhow::bail!(
6031 "Symbol instance (global_symbols index {}) not found in any dictionary!",
6032 gs_idx
6033 );
6034 };
6035
6036 let abs = instance.position;
6037 let rel_x = abs.x as i32 - min_x as i32;
6038 let rel_y = abs.y as i32 - min_y as i32;
6040 let strip_base = (rel_y / strip_width) * strip_width;
6041 let t_offset = rel_y - strip_base;
6042
6043 encoded_instances.push(EncodedInstance {
6044 strip_base,
6045 x: rel_x,
6046 t_offset,
6047 symbol_id,
6048 symbol_width: sym.width as i32,
6049 });
6050 }
6051
6052 encoded_instances.sort_by_key(|e| (e.strip_base, e.x));
6053
6054 if debug_encoding {
6055 enc_debug_lines.push(format!("=== PAGE 0 ENCODING LOG ==="));
6056 enc_debug_lines.push(format!(
6057 "Region: {}x{} at ({},{})",
6058 params.width, params.height, params.x, params.y
6059 ));
6060 enc_debug_lines.push(format!(
6061 "min_x={} min_y={} strip_width={}",
6062 min_x, min_y, strip_width
6063 ));
6064 enc_debug_lines.push(format!(
6065 "Total instances: {}, dict symbols: {}",
6066 encoded_instances.len(),
6067 num_total_dict_symbols
6068 ));
6069 enc_debug_lines.push(String::new());
6070
6071 enc_debug_lines.push("Symbol ID -> dimensions lookup (first 30):".to_string());
6073 for (dict_id, sym) in all_symbols.iter().enumerate().take(30) {
6074 let dict_pos = symbol_id_from_dense_maps(
6075 dict_id,
6076 global_sym_to_dict_pos,
6077 num_global_dict_symbols,
6078 local_sym_to_dict_pos,
6079 )
6080 .unwrap_or(u32::MAX);
6081 enc_debug_lines.push(format!(
6082 " gs_idx={} -> dict_pos={} ({}x{})",
6083 dict_id, dict_pos, sym.width, sym.height
6084 ));
6085 }
6086 enc_debug_lines.push(String::new());
6087
6088 enc_debug_lines.push(format!(
6089 "{:<6} {:<8} {:<8} {:<10} {:<8} {:<10} {:<10} {:<10}",
6090 "Idx", "SymID", "SymW", "StripBase", "TOffset", "RelX", "DeltaT", "DeltaS"
6091 ));
6092 }
6093
6094 let mut strip_t = 0i32;
6095 let mut first_s = 0i32;
6096 let mut idx = 0usize;
6097
6098 let _ = coder.encode_integer(IntProc::Iadt, 0);
6100
6101 while idx < encoded_instances.len() {
6102 let current_strip = encoded_instances[idx].strip_base;
6103 let delta_t = current_strip - strip_t;
6104 let _ = coder.encode_integer(IntProc::Iadt, delta_t / strip_width);
6105
6106 if debug_encoding && delta_t != 0 {
6107 enc_debug_lines.push(format!(
6108 "--- strip break: IADT delta_t={} (strip_t {} → {})",
6109 delta_t, strip_t, current_strip
6110 ));
6111 }
6112 strip_t = current_strip;
6113
6114 let mut first_symbol_in_strip = true;
6115 let mut current_s = 0i32;
6116 while idx < encoded_instances.len() && encoded_instances[idx].strip_base == current_strip {
6117 let item = encoded_instances[idx];
6118 let delta_s;
6119 if first_symbol_in_strip {
6120 delta_s = item.x - first_s;
6121 let _ = coder.encode_integer(IntProc::Iafs, delta_s);
6122 first_s += delta_s;
6123 current_s = first_s;
6124 first_symbol_in_strip = false;
6125 } else {
6126 delta_s = item.x - current_s;
6127 let _ = coder.encode_integer(IntProc::Iads, delta_s);
6128 current_s += delta_s;
6129 }
6130
6131 if debug_encoding {
6132 enc_debug_lines.push(format!(
6133 "{:<6} {:<8} {:<8} {:<10} {:<8} {:<10} {:<10} {:<10}",
6134 idx,
6135 item.symbol_id,
6136 item.symbol_width,
6137 item.strip_base,
6138 item.t_offset,
6139 item.x,
6140 delta_t,
6141 delta_s
6142 ));
6143 }
6144
6145 if strip_width > 1 {
6146 let _ = coder.encode_integer(IntProc::Iait, item.t_offset);
6147 }
6148 let _ = coder.encode_iaid(item.symbol_id, symbol_id_bits as u8);
6149 current_s += item.symbol_width - 1;
6150 idx += 1;
6151 }
6152 let _ = coder.encode_oob(IntProc::Iads);
6153 }
6154
6155 if debug_encoding {
6158 enc_debug_lines.push(String::new());
6159 enc_debug_lines.push(format!("=== DECODE SIMULATION ==="));
6160 enc_debug_lines.push(format!(
6161 "{:<6} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<8}",
6162 "Idx", "ExpX", "ExpY", "DecS", "DecT", "AbsX", "AbsY", "Match?"
6163 ));
6164
6165 let sbstrips = strip_width;
6167 let sbdsoffset = params.ds_offset as i32;
6168 let mut dec_stript = 0i32;
6169 let mut dec_firsts = 0i32;
6170 let mut sim_idx = 0usize;
6171 let mut strip_start = 0usize;
6172
6173 while sim_idx < encoded_instances.len() {
6175 let current_strip = encoded_instances[sim_idx].strip_base;
6176 let delta_t = if sim_idx == 0 && current_strip == 0 {
6178 0 } else if sim_idx == strip_start {
6180 current_strip - dec_stript
6183 } else {
6184 0 };
6186
6187 if sim_idx == strip_start || sim_idx == 0 {
6189 let iadt_value = (current_strip - dec_stript) / sbstrips;
6190 dec_stript += iadt_value * sbstrips;
6191 }
6192
6193 let mut first_in_strip = true;
6194 let mut dec_curs = 0i32;
6195 let strip_base = current_strip;
6196
6197 while sim_idx < encoded_instances.len()
6198 && encoded_instances[sim_idx].strip_base == strip_base
6199 {
6200 let item = encoded_instances[sim_idx];
6201
6202 if first_in_strip {
6203 let iafs = item.x - dec_firsts;
6205 dec_firsts += iafs;
6206 dec_curs = dec_firsts;
6207 first_in_strip = false;
6208 } else {
6209 let iads = item.x - dec_curs;
6211 dec_curs += iads + sbdsoffset;
6212 }
6213
6214 let dec_ti = dec_stript;
6216 let dec_si = dec_curs;
6217
6218 let abs_x = dec_si + min_x as i32;
6220 let abs_y = dec_ti + min_y as i32;
6221
6222 let exp_x = item.x + min_x as i32;
6224 let exp_y = item.strip_base + min_y as i32;
6225
6226 let ok = abs_x == exp_x && abs_y == exp_y;
6227 let tag = if ok { "OK" } else { "MISMATCH!" };
6228
6229 if !ok || sim_idx < 60 {
6230 enc_debug_lines.push(format!(
6231 "{:<6} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<8}",
6232 sim_idx, exp_x, exp_y, dec_si, dec_ti, abs_x, abs_y, tag
6233 ));
6234 }
6235
6236 dec_curs += item.symbol_width - 1;
6238 sim_idx += 1;
6239 }
6240 strip_start = sim_idx;
6241 }
6242 }
6243
6244 if debug_encoding && !enc_debug_lines.is_empty() {
6246 let log_path = std::path::Path::new("jbig2_debug_page0.log");
6247 if let Ok(mut f) = std::fs::OpenOptions::new()
6249 .create(true)
6250 .append(true)
6251 .open(log_path)
6252 {
6253 use std::io::Write;
6254 let _ = writeln!(f, "");
6255 for line in &enc_debug_lines {
6256 let _ = writeln!(f, "{}", line);
6257 }
6258 }
6259 }
6260
6261 coder.flush(true);
6262 payload.extend(coder.as_bytes());
6263
6264 Ok(payload)
6265}
6266
6267pub fn encode_text_region_with_refinement(
6279 instances: &[SymbolInstance],
6280 config: &Jbig2Config,
6281 all_symbols: &[BitImage],
6282 global_sym_to_dict_pos: &[u32],
6283 num_global_dict_symbols: u32,
6284 local_sym_to_dict_pos: &[u32],
6285 num_local_dict_symbols: u32,
6286) -> Result<Vec<u8>> {
6287 if instances.is_empty() {
6288 return Err(anyhow!("No symbol instances provided for text region"));
6289 }
6290
6291 let num_total_dict_symbols = num_global_dict_symbols + num_local_dict_symbols;
6292
6293 let mut payload = Vec::new();
6294 let mut coder = Jbig2ArithCoder::new();
6295
6296 let mut min_x = u32::MAX;
6300 let mut min_y = u32::MAX;
6301 let mut max_x_coord = 0u32;
6302 let mut max_y_coord = 0u32;
6303
6304 for instance in instances {
6305 let (w, h) = if instance.needs_refinement {
6306 let (_, trimmed) = instance.instance_bitmap.trim();
6307 (trimmed.width as u32, trimmed.height as u32)
6308 } else {
6309 let sym = &all_symbols[instance.symbol_index];
6310 (sym.width as u32, sym.height as u32)
6311 };
6312
6313 min_x = min_x.min(instance.position.x);
6314 min_y = min_y.min(instance.position.y);
6315 max_x_coord = max_x_coord.max(instance.position.x + w);
6316 max_y_coord = max_y_coord.max(instance.position.y + h);
6317 }
6318
6319 let region_width = max_x_coord.saturating_sub(min_x);
6320 let region_height = max_y_coord.saturating_sub(min_y);
6321
6322 let params = TextRegionParams {
6324 width: region_width,
6325 height: region_height,
6326 x: min_x,
6327 y: min_y,
6328 ds_offset: config.text_ds_offset,
6329 refine: true, log_strips: config.text_log_strips,
6331 ref_corner: config.text_ref_corner,
6332 transposed: config.text_transposed,
6333 comb_op: config.text_comb_op,
6334 refine_template: config.text_refine_template,
6335 };
6336
6337 payload.extend(params.to_bytes());
6338 payload.extend_from_slice(&(instances.len() as u32).to_be_bytes());
6339
6340 let symbol_id_bits = log2up(num_total_dict_symbols.max(1)).max(1);
6341
6342 #[derive(Clone)]
6344 struct RefinedInstance {
6345 strip_base: i32,
6346 x: i32,
6347 t_offset: i32,
6348 symbol_id: u32,
6349 symbol_width: i32,
6350 needs_refinement: bool,
6352 orig_idx: usize,
6354 }
6355
6356 let strip_width = 1i32 << params.log_strips.min(3);
6357 let mut encoded_instances = Vec::with_capacity(instances.len());
6358
6359 for (orig_idx, instance) in instances.iter().enumerate() {
6360 let gs_idx = instance.symbol_index;
6361 let sym = &all_symbols[gs_idx];
6362
6363 let symbol_id = if let Some(symbol_id) = symbol_id_from_dense_maps(
6364 gs_idx,
6365 global_sym_to_dict_pos,
6366 num_global_dict_symbols,
6367 local_sym_to_dict_pos,
6368 ) {
6369 symbol_id
6370 } else {
6371 anyhow::bail!(
6372 "Symbol instance (global_symbols index {}) not found in any dictionary!",
6373 gs_idx
6374 );
6375 };
6376
6377 let abs = instance.position;
6378 let rel_x = abs.x as i32 - min_x as i32;
6379 let rel_y = abs.y as i32 - min_y as i32;
6381 let strip_base = (rel_y / strip_width) * strip_width;
6382 let t_offset = rel_y - strip_base;
6383
6384 encoded_instances.push(RefinedInstance {
6385 strip_base,
6386 x: rel_x,
6387 t_offset,
6388 symbol_id,
6389 symbol_width: sym.width as i32,
6390 needs_refinement: instance.needs_refinement,
6391 orig_idx,
6392 });
6393 }
6394
6395 encoded_instances.sort_by_key(|e| (e.strip_base, e.x));
6396
6397 let mut strip_t = 0i32;
6399 let mut first_s = 0i32;
6400 let mut idx = 0usize;
6401
6402 let grat: [(i8, i8); 1] = [(-1, -1)];
6404
6405 let _ = coder.encode_integer(IntProc::Iadt, 0);
6407
6408 while idx < encoded_instances.len() {
6409 let current_strip = encoded_instances[idx].strip_base;
6410 let delta_t = current_strip - strip_t;
6411 let _ = coder.encode_integer(IntProc::Iadt, delta_t / strip_width);
6412 strip_t = current_strip;
6413
6414 let mut first_symbol_in_strip = true;
6415 let mut current_s = 0i32;
6416
6417 while idx < encoded_instances.len() && encoded_instances[idx].strip_base == current_strip {
6418 let item = &encoded_instances[idx];
6419 if first_symbol_in_strip {
6420 let delta_fs = item.x - first_s;
6421 let _ = coder.encode_integer(IntProc::Iafs, delta_fs);
6422 first_s += delta_fs;
6423 current_s = first_s;
6424 first_symbol_in_strip = false;
6425 } else {
6426 let delta_s = item.x - current_s;
6427 let _ = coder.encode_integer(IntProc::Iads, delta_s);
6428 current_s += delta_s;
6429 }
6430
6431 if strip_width > 1 {
6432 let _ = coder.encode_integer(IntProc::Iait, item.t_offset);
6433 }
6434
6435 let _ = coder.encode_iaid(item.symbol_id, symbol_id_bits as u8);
6437
6438 let ri = if item.needs_refinement { 1i32 } else { 0i32 };
6440 let _ = coder.encode_integer(IntProc::Iari, ri);
6441
6442 if item.needs_refinement {
6443 let orig_instance = &instances[item.orig_idx];
6445 let prototype = &all_symbols[orig_instance.symbol_index];
6446
6447 let (_, trimmed_instance) = orig_instance.instance_bitmap.trim();
6449
6450 let rdwi = trimmed_instance.width as i32 - prototype.width as i32;
6452 let rdhi = trimmed_instance.height as i32 - prototype.height as i32;
6453
6454 let _ = coder.encode_integer(IntProc::Iardw, rdwi);
6455 let _ = coder.encode_integer(IntProc::Iardh, rdhi);
6456
6457 let rdxi = orig_instance.refinement_dx;
6461 let rdyi = orig_instance.refinement_dy;
6462
6463 let _ = coder.encode_integer(IntProc::Iardx, rdxi);
6464 let _ = coder.encode_integer(IntProc::Iardy, rdyi);
6465
6466 let grdx = (rdwi / 2) + rdxi;
6468 let grdy = (rdhi / 2) + rdyi;
6469
6470 coder.encode_refinement_region(
6473 &trimmed_instance,
6474 prototype,
6475 grdx,
6476 grdy,
6477 config.text_refine_template,
6478 &grat,
6479 )?;
6480
6481 coder.reset_refinement_contexts();
6483 }
6484
6485 current_s += item.symbol_width - 1;
6486 idx += 1;
6487 }
6488 let _ = coder.encode_oob(IntProc::Iads);
6489 }
6490
6491 coder.flush(true);
6492 payload.extend(coder.as_bytes());
6493
6494 Ok(payload)
6495}
6496
6497pub fn encode_text_region(
6503 instances: &[SymbolInstance],
6504 config: &Jbig2Config,
6505 all_known_symbols: &[&BitImage],
6506 global_dict_indices: &[usize],
6507 local_dict_indices: &[usize],
6508) -> Result<Vec<u8>> {
6509 if instances.is_empty() {
6511 return Err(anyhow!("No symbol instances provided for text region"));
6512 }
6513
6514 if global_dict_indices
6516 .iter()
6517 .any(|&idx| idx >= all_known_symbols.len())
6518 {
6519 return Err(anyhow!("Invalid global dictionary index in text region"));
6520 }
6521
6522 if !local_dict_indices.is_empty() {
6524 if local_dict_indices
6525 .iter()
6526 .any(|&idx| idx >= all_known_symbols.len())
6527 {
6528 return Err(anyhow!("Invalid local dictionary index in text region"));
6529 }
6530 }
6531
6532 for (i, instance) in instances.iter().enumerate() {
6534 if instance.symbol_index >= all_known_symbols.len() {
6535 return Err(anyhow!(
6536 "Symbol instance {} references invalid symbol index {} (max {})",
6537 i,
6538 instance.symbol_index,
6539 all_known_symbols.len() - 1
6540 ));
6541 }
6542
6543 let symbol = &all_known_symbols[instance.symbol_index];
6544 if instance.position.x as u64 + symbol.width as u64 > u32::MAX as u64
6545 || instance.position.y as u64 + symbol.height as u64 > u32::MAX as u64
6546 {
6547 return Err(anyhow!(
6548 "Symbol instance {} at position ({}, {}) would overflow 32-bit coordinates",
6549 i,
6550 instance.position.x,
6551 instance.position.y
6552 ));
6553 }
6554 }
6555 let mut payload = Vec::new();
6556 let mut coder = Jbig2ArithCoder::new();
6557
6558 let mut min_x = u32::MAX;
6559 let mut min_y = u32::MAX;
6560 let mut max_x_coord = 0;
6561 let mut max_y_coord = 0;
6562
6563 if instances.is_empty() {
6564 min_x = 0;
6565 min_y = 0;
6566 } else {
6567 for instance in instances {
6568 let pos = instance.position();
6569 let sym_idx_in_all_known_list = instance.symbol_index();
6570 let symbol_width = all_known_symbols[sym_idx_in_all_known_list].width as i32;
6571 let symbol_height = all_known_symbols[sym_idx_in_all_known_list].height as i32;
6572
6573 min_x = min_x.min(pos.x as u32);
6574 min_y = min_y.min(pos.y as u32);
6575 max_x_coord = max_x_coord.max((pos.x as i32 + symbol_width) as u32);
6576 max_y_coord = max_y_coord.max((pos.y as i32 + symbol_height) as u32);
6577 }
6578 }
6579
6580 let region_width = if max_x_coord > min_x {
6581 max_x_coord - min_x
6582 } else {
6583 0
6584 };
6585 let region_height = if max_y_coord > min_y {
6586 max_y_coord - min_y
6587 } else {
6588 0
6589 };
6590
6591 let params = TextRegionParams {
6592 width: region_width,
6593 height: region_height,
6594 x: min_x,
6595 y: min_y,
6596 ds_offset: config.text_ds_offset,
6597 refine: config.text_refine,
6598 log_strips: config.text_log_strips,
6599 ref_corner: config.text_ref_corner,
6600 transposed: config.text_transposed,
6601 comb_op: config.text_comb_op,
6602 refine_template: config.text_refine_template,
6603 };
6604 if cfg!(debug_assertions) {
6605 trace!("encode_text_region: TextRegionParams details: {:?}", params);
6606 }
6607 payload.extend(params.to_bytes());
6609 payload.extend_from_slice(&(instances.len() as u32).to_be_bytes());
6610
6611 let num_total_dict_symbols = (global_dict_indices.len() + local_dict_indices.len()) as u32;
6613 let symbol_id_bits = log2up(num_total_dict_symbols.max(1)).max(1);
6614
6615 #[derive(Clone, Copy)]
6616 struct EncodedInstance {
6617 strip_base: i32,
6618 x: i32,
6619 t_offset: i32,
6620 symbol_id: u32,
6621 symbol_width: i32,
6622 }
6623
6624 let strip_width = 1i32 << params.log_strips.min(3);
6625 let mut encoded_instances = Vec::with_capacity(instances.len());
6626
6627 for instance in instances {
6628 let sym_idx_in_all_known_list = instance.symbol_index();
6629 let symbol_props = &all_known_symbols[sym_idx_in_all_known_list];
6630 let symbol_id_to_encode = if let Some(pos_global) = global_dict_indices
6631 .iter()
6632 .position(|&idx| idx == sym_idx_in_all_known_list)
6633 {
6634 pos_global as u32
6635 } else if let Some(pos_local) = local_dict_indices
6636 .iter()
6637 .position(|&idx| idx == sym_idx_in_all_known_list)
6638 {
6639 (global_dict_indices.len() + pos_local) as u32
6640 } else {
6641 anyhow::bail!(
6642 "Symbol instance (index {}) not found in referred dictionaries!",
6643 sym_idx_in_all_known_list
6644 );
6645 };
6646
6647 let abs = instance.position();
6649 let rel_x = abs.x as i32 - min_x as i32;
6650 let rel_y = abs.y as i32 - min_y as i32;
6651 let strip_base = (rel_y / strip_width) * strip_width;
6652 let t_offset = rel_y - strip_base;
6653
6654 encoded_instances.push(EncodedInstance {
6655 strip_base,
6656 x: rel_x,
6657 t_offset,
6658 symbol_id: symbol_id_to_encode,
6659 symbol_width: symbol_props.width as i32,
6660 });
6661 }
6662
6663 encoded_instances.sort_by_key(|e| (e.strip_base, e.x));
6665
6666 let mut strip_t = 0i32;
6667 let mut first_s = 0i32;
6668 let mut idx = 0usize;
6669
6670 let _ = coder.encode_integer(IntProc::Iadt, 0);
6672
6673 while idx < encoded_instances.len() {
6674 let current_strip = encoded_instances[idx].strip_base;
6675 let delta_t = current_strip - strip_t;
6676 let _ = coder.encode_integer(IntProc::Iadt, delta_t / strip_width);
6677 strip_t = current_strip;
6678
6679 let mut first_symbol_in_strip = true;
6680 let mut current_s = 0i32;
6681 while idx < encoded_instances.len() && encoded_instances[idx].strip_base == current_strip {
6682 let item = encoded_instances[idx];
6683 if first_symbol_in_strip {
6684 let delta_fs = item.x - first_s;
6685 let _ = coder.encode_integer(IntProc::Iafs, delta_fs);
6686 first_s += delta_fs;
6687 current_s = first_s;
6688 first_symbol_in_strip = false;
6689 } else {
6690 let delta_s = item.x - current_s;
6691 let _ = coder.encode_integer(IntProc::Iads, delta_s);
6692 current_s += delta_s;
6693 }
6694
6695 if strip_width > 1 {
6696 let _ = coder.encode_integer(IntProc::Iait, item.t_offset);
6697 }
6698 let _ = coder.encode_iaid(item.symbol_id, symbol_id_bits as u8);
6699 current_s += item.symbol_width - 1;
6700 idx += 1;
6701 }
6702 let _ = coder.encode_oob(IntProc::Iads);
6703 }
6704
6705 coder.flush(true);
6706 payload.extend(coder.as_bytes());
6707
6708 Ok(payload)
6709}
6710
/// Returns the representative (root) of the set containing `i` in the
/// union-find forest stored in `parent`.
///
/// While walking towards the root, every visited node is re-pointed at its
/// grandparent, which shortens the path for later lookups.
fn uf_find(parent: &mut [usize], mut i: usize) -> usize {
    loop {
        let up = parent[i];
        if up == i {
            return i;
        }
        // Skip one level: hang `i` under its grandparent and continue there.
        let grand = parent[up];
        parent[i] = grand;
        i = grand;
    }
}
6720
6721fn uf_union(parent: &mut [usize], rank: &mut [u32], a: usize, b: usize) {
6722 let ra = uf_find(parent, a);
6723 let rb = uf_find(parent, b);
6724 if ra == rb {
6725 return;
6726 }
6727 if rank[ra] < rank[rb] {
6728 parent[ra] = rb;
6729 } else if rank[ra] > rank[rb] {
6730 parent[rb] = ra;
6731 } else {
6732 parent[rb] = ra;
6733 rank[ra] += 1;
6734 }
6735}
6736
6737fn compute_symbol_hash(symbol: &BitImage) -> u32 {
6738 let w = symbol.width as u32;
6739 let h = symbol.height as u32;
6740 (10 * h + 10000 * w) % 10000000
6741}
6742
/// Returns the ceiling of the base-2 logarithm of `v`, i.e. the smallest `r`
/// with `2^r >= v`. Both `0` and `1` yield `0`.
fn log2up(v: u32) -> u32 {
    match v {
        0 | 1 => 0,
        // The bit width of `v - 1` is exactly ceil(log2(v)) for v >= 2.
        _ => u32::BITS - (v - 1).leading_zeros(),
    }
}
6756
6757pub fn encode_document(images: &[Array2<u8>], config: &Jbig2Config) -> Result<Vec<u8>> {
6767 let mut encoder = Jbig2Encoder::new(config);
6768 for image in images {
6769 encoder.add_page(image)?;
6770 }
6771 encoder.flush()
6772}
6773
6774#[derive(Debug, Clone)]
6776pub struct TextRegionSymbolInstance {
6777 pub symbol_id: u32,
6779 pub x: i32,
6781 pub y: i32,
6783 pub dx: i32,
6785 pub dy: i32,
6787 pub is_refinement: bool,
6789}
6790
6791impl TextRegionSymbolInstance {
6792 pub fn position(&self) -> crate::jbig2sym::Rect {
6794 crate::jbig2sym::Rect {
6795 x: self.x as u32,
6796 y: self.y as u32,
6797 width: 0, height: 0, }
6800 }
6801
6802 pub fn symbol_index(&self) -> usize {
6804 self.symbol_id as usize
6805 }
6806
6807 pub fn to_symbol_instance(&self, symbol_bitmap: &BitImage) -> SymbolInstance {
6809 SymbolInstance {
6810 symbol_index: self.symbol_id as usize,
6811 position: self.position(),
6812 instance_bitmap: symbol_bitmap.clone(),
6813 needs_refinement: self.is_refinement,
6814 refinement_dx: self.dx,
6815 refinement_dy: self.dy,
6816 }
6817 }
6818}
6819
6820pub fn build_dictionary_and_get_instances(
6821 symbols: &[(Rect, BitImage)],
6822 comparator: &mut Comparator,
6823) -> (Vec<BitImage>, Vec<TextRegionSymbolInstance>) {
6824 let mut dictionary_symbols: Vec<BitImage> = Vec::with_capacity(symbols.len());
6825 let mut dictionary_black_pixels = Vec::with_capacity(symbols.len());
6826 let mut instances = Vec::with_capacity(symbols.len());
6827
6828 for (rect, symbol_image) in symbols.iter() {
6829 let mut found_match = false;
6830 let max_err = ((symbol_image.width * symbol_image.height) / 10).max(3) as u32;
6832 let symbol_black_pixels = symbol_image.count_ones();
6833
6834 for (dict_idx, dict_symbol) in dictionary_symbols.iter().enumerate() {
6835 if symbol_image.width.abs_diff(dict_symbol.width) > MAX_DIMENSION_DELTA
6836 || symbol_image.height.abs_diff(dict_symbol.height) > MAX_DIMENSION_DELTA
6837 {
6838 continue;
6839 }
6840
6841 if symbol_black_pixels.abs_diff(dictionary_black_pixels[dict_idx]) > max_err as usize {
6842 continue;
6843 }
6844
6845 if let Some((err, dx, dy)) = comparator.distance(symbol_image, dict_symbol, max_err) {
6847 instances.push(TextRegionSymbolInstance {
6848 symbol_id: dict_idx as u32,
6849 x: rect.x as i32,
6850 y: rect.y as i32,
6851 dx,
6852 dy,
6853 is_refinement: err > 0,
6854 });
6855 found_match = true;
6856 break;
6857 }
6858 }
6859
6860 if !found_match {
6861 let new_idx = dictionary_symbols.len();
6862 dictionary_symbols.push(symbol_image.clone());
6863 dictionary_black_pixels.push(symbol_black_pixels);
6864 instances.push(TextRegionSymbolInstance {
6865 symbol_id: new_idx as u32,
6866 x: rect.x as i32,
6867 y: rect.y as i32,
6868 dx: 0,
6869 dy: 0,
6870 is_refinement: false,
6871 });
6872 }
6873 }
6874
6875 (dictionary_symbols, instances)
6876}
6877
6878pub fn encode_page_with_symbol_dictionary(
6881 image: &BitImage,
6882 config: &Jbig2Config,
6883 next_segment_num: u32,
6884) -> Result<(Vec<u8>, u32)> {
6885 #[cfg(feature = "cc-analysis")]
6887 let extracted_symbols = {
6888 let dpi = 300; let losslevel = if config.is_lossless { 0 } else { 1 };
6890 let cc_image = analyze_page(image, dpi, losslevel);
6891 let shapes = cc_image.extract_shapes();
6892 shapes
6894 .into_iter()
6895 .map(|(bitmap, bbox)| {
6896 let rect = Rect {
6897 x: bbox.xmin as u32,
6898 y: bbox.ymin as u32,
6899 width: bbox.width() as u32,
6900 height: bbox.height() as u32,
6901 };
6902 (rect, bitmap)
6903 })
6904 .collect::<Vec<_>>()
6905 };
6906 #[cfg(not(feature = "cc-analysis"))]
6907 let extracted_symbols: Vec<(Rect, BitImage)> = Vec::new();
6908
6909 if extracted_symbols.is_empty() {
6910 return Ok((Vec::new(), next_segment_num));
6911 }
6912
6913 let mut comparator = Comparator::default();
6915 let (dictionary_symbols, text_region_instances) =
6916 build_dictionary_and_get_instances(&extracted_symbols, &mut comparator);
6917 debug!(
6918 "Built dictionary with {} symbols and {} instances",
6919 dictionary_symbols.len(),
6920 text_region_instances.len()
6921 );
6922
6923 let mut output = Vec::new();
6924 let mut current_segment_number = next_segment_num;
6925
6926 let dict_refs: Vec<&BitImage> = dictionary_symbols.iter().collect();
6928 let dict_layout = plan_symbol_dictionary_layout(&dict_refs, config, None)?;
6929 let encoded_dict = encode_symbol_dictionary_segments(&dict_refs, config, &dict_layout)?;
6930 let dict_segment_number = current_segment_number;
6931 current_segment_number += 1;
6932 Segment {
6933 number: dict_segment_number,
6934 seg_type: SegmentType::SymbolDictionary,
6935 referred_to: Vec::new(),
6936 page: Some(1),
6937 payload: encoded_dict.payload.clone(),
6938 ..Default::default()
6939 }
6940 .write_into(&mut output)?;
6941
6942 let mut symbol_instances: Vec<SymbolInstance> = text_region_instances
6944 .iter()
6945 .map(|instance| {
6946 let orig_id = instance.symbol_id as usize;
6947 let symbol_bitmap = if orig_id < dictionary_symbols.len() {
6948 &dictionary_symbols[orig_id]
6949 } else {
6950 &dictionary_symbols[0]
6951 };
6952 SymbolInstance {
6953 symbol_index: orig_id,
6954 position: instance.position(),
6955 instance_bitmap: symbol_bitmap.clone(),
6956 needs_refinement: instance.is_refinement,
6957 refinement_dx: instance.dx,
6958 refinement_dy: instance.dy,
6959 }
6960 })
6961 .collect();
6962
6963 for (orig_idx, refinement) in dict_layout.refinements.iter().enumerate() {
6964 if let Some(refinement) = refinement {
6965 for instance in &mut symbol_instances {
6966 if instance.symbol_index == orig_idx {
6967 instance.symbol_index = refinement.prototype_input_index;
6968 instance.needs_refinement = true;
6969 instance.refinement_dx = refinement.refinement_dx;
6970 instance.refinement_dy = refinement.refinement_dy;
6971 }
6972 }
6973 }
6974 }
6975
6976 let region_payload = if !config.uses_lossy_symbol_dictionary()
6977 && (config.refine || symbol_instances.iter().any(|inst| inst.needs_refinement))
6978 {
6979 encode_text_region_with_refinement(
6980 &symbol_instances,
6981 config,
6982 &dictionary_symbols,
6983 &encoded_dict.input_to_exported_pos,
6984 encoded_dict.exported_symbol_count,
6985 &[],
6986 0,
6987 )?
6988 } else {
6989 encode_text_region_mapped(
6990 &symbol_instances,
6991 config,
6992 &dictionary_symbols,
6993 &encoded_dict.input_to_exported_pos,
6994 encoded_dict.exported_symbol_count,
6995 &[],
6996 0,
6997 0,
6998 )?
6999 };
7000
7001 let region_segment = Segment {
7002 number: current_segment_number,
7003 seg_type: SegmentType::ImmediateTextRegion,
7004 retain_flags: 0,
7005 referred_to: vec![dict_segment_number],
7006 page: Some(1), payload: region_payload,
7008 ..Default::default()
7009 };
7010
7011 region_segment.write_into(&mut output)?;
7013 current_segment_number += 1;
7014
7015 Ok((output, current_segment_number))
7016}
7017
/// Returns the version string reported by this encoder.
pub fn get_version() -> &'static str {
    const VERSION: &str = "0.2.0";
    VERSION
}
7021
7022#[inline]
7023pub fn hash_key(img: &BitImage) -> HashKey {
7024 let h = img.height as u64;
7029 let w = img.width as u64;
7030 HashKey(h * 10_000 + w)
7031}
7032
7033pub fn first_black_pixel(image: &BitImage) -> Option<(usize, usize)> {
7036 for y in 0..image.height {
7037 for x in 0..image.width {
7038 if image.get_usize(x, y) {
7039 return Some((x, y));
7040 }
7041 }
7042 }
7043 None
7044}
7045
#[cfg(all(test, feature = "refine"))]
mod refine_tests {
    use super::*;

    /// Builds a bitmap from text rows in which the byte `'1'` marks a set
    /// pixel; the width is taken from the first row.
    fn symbol_from_rows(rows: &[&str]) -> BitImage {
        let width = rows.first().map_or(0, |row| row.len()) as u32;
        let height = rows.len() as u32;
        let mut image = BitImage::new(width, height).expect("test bitmap");

        let set_pixels = rows.iter().enumerate().flat_map(|(y, row)| {
            row.bytes()
                .enumerate()
                .filter(|&(_, ch)| ch == b'1')
                .map(move |(x, _)| (x as u32, y as u32))
        });
        for (x, y) in set_pixels {
            image.set(x, y, true);
        }
        image
    }

    /// Two identical symbols must collapse to a single exported prototype,
    /// with the second one planned as a refinement and every input still
    /// mapped to an exported position.
    #[test]
    fn refinement_layout_collapses_to_prototypes() {
        let base = symbol_from_rows(&["0110", "1001", "1111", "1001", "1001"]);
        let variant = base.clone();
        let symbols = vec![&base, &variant];

        let mut config = Jbig2Config::text();
        config.refine = true;
        config.text_refine = false;

        let layout = plan_symbol_dictionary_layout(&symbols, &config, None).expect("layout");
        assert_eq!(layout.segment_count(), 1);
        assert_eq!(layout.export_input_indices.len(), 1);
        assert!(layout.refinements[1].is_some());

        let encoded =
            encode_symbol_dictionary_segments(&symbols, &config, &layout).expect("encode");
        assert_eq!(encoded.exported_symbol_count, 1);

        let has_unmapped_input = encoded
            .input_to_exported_pos
            .iter()
            .any(|&pos| pos == u32::MAX);
        assert!(!has_unmapped_input);
    }
}