Skip to main content

jbig2enc_rust/
jbig2enc.rs

1//! This module contains the main JBIG2 encoder logic.
2use crate::jbig2arith::{IntProc, Jbig2ArithCoder};
3use crate::jbig2classify::{
4    FamilyBucketKey, SymbolSignature, compute_symbol_signature as compute_symbol_signature_shared,
5    family_bucket_key_for_symbol, family_bucket_neighbors, family_match_details,
6    family_signatures_are_compatible, refine_compare_score,
7};
8use crate::jbig2comparator::{Comparator, MAX_DIMENSION_DELTA};
9use crate::jbig2context::build_symbol_context_model;
10use crate::jbig2cost::{symbol_dictionary_entries_bytes, symbol_dictionary_entry_bytes};
11use crate::jbig2unify::{SymbolUnifyInputs, UnifiedClass};
12// Symbol extraction using CC analysis
13#[cfg(feature = "cc-analysis")]
14use crate::jbig2cc::analyze_page;
15use crate::jbig2structs::{
16    FileHeader, GenericRegionConfig, GenericRegionParams, Jbig2Config, LossySymbolMode, PageInfo,
17    Segment, SegmentType, SymbolDictParams, TextRegionParams,
18};
19
20use crate::jbig2sym::{BitImage, Rect};
21use anyhow::{Result, anyhow};
22
// Define debug and trace macros at the crate root
/// Emits a debug-level record through `log::debug!` when the `trace_encoder`
/// feature is enabled.
///
/// Without that feature the arguments are only handed to `format_args!` and the
/// result is discarded, so nothing is logged.
#[macro_export]
macro_rules! debug {
    ($($arg:tt)*) => {
        // Tracing build: forward the arguments unchanged to the `log` crate.
        #[cfg(feature = "trace_encoder")]
        log::debug!($($arg)*);

        // Non-tracing build: build (and immediately drop) the format arguments so
        // the call site still references its arguments without producing output.
        #[cfg(not(feature = "trace_encoder"))]
        let _ = format_args!($($arg)*);
    };
}
34
/// Emits a trace-level record through `log::trace!` when the `trace_encoder`
/// feature is enabled.
///
/// Without that feature the arguments are only handed to `format_args!` and the
/// result is discarded, so nothing is logged.
#[macro_export]
macro_rules! trace {
    ($($arg:tt)*) => {
        // Tracing build: forward the arguments unchanged to the `log` crate.
        #[cfg(feature = "trace_encoder")]
        log::trace!($($arg)*);

        // Non-tracing build: build (and immediately drop) the format arguments so
        // the call site still references its arguments without producing output.
        #[cfg(not(feature = "trace_encoder"))]
        let _ = format_args!($($arg)*);
    };
}
45
46// Import the macros for use in this module
47#[allow(unused_imports)]
48use crate::{debug, trace};
49
50use ndarray::Array2;
51use rustc_hash::{FxHashMap, FxHashSet};
52use std::collections::{HashMap, HashSet, VecDeque};
53use std::hash::{Hash, Hasher};
54use std::time::{Duration, Instant};
55
56#[cfg(feature = "parallel")]
57use rayon::prelude::*;
58
/// A key type for hashing bitmaps efficiently
///
/// Newtype around a single `u64`; the wrapped value is private to this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct HashKey(u64);
62
// Tuning constants. Their consumers are not visible in this part of the file, so
// the notes below are inferred from the names only — TODO confirm at the call sites.

// Presumably the capacity handed to `RecentSymbolCache::new`.
const RECENT_SYMBOL_CACHE_CAP: usize = 64;
// Presumably the number of anchors examined in the exact family bucket.
const SYM_UNIFY_EXACT_ANCHOR_BUDGET: usize = 32;
// Presumably the number of anchors examined in neighbouring family buckets.
const SYM_UNIFY_NEIGHBOR_ANCHOR_BUDGET: usize = 16;
// Presumably the minimum usage count for an anchor to count as "strong".
const SYM_UNIFY_STRONG_ANCHOR_MIN_USAGE: usize = 8;
// Presumably the minimum page span for an anchor to count as "strong".
const SYM_UNIFY_STRONG_ANCHOR_MIN_PAGE_SPAN: usize = 4;
68
/// Reports whether encoder diagnostics were requested through the environment.
///
/// Returns `true` only when `JBIG2_DIAGNOSTICS` can be read as a string that is
/// neither empty nor exactly `"0"`; an unset or unreadable variable yields `false`.
fn encoder_diagnostics_enabled() -> bool {
    match std::env::var("JBIG2_DIAGNOSTICS") {
        Ok(setting) => !(setting.is_empty() || setting == "0"),
        Err(_) => false,
    }
}
72
73#[inline]
74fn indexed_symbol_dictionary_bytes(symbols: &[BitImage], indices: &[usize]) -> usize {
75    indices
76        .iter()
77        .copied()
78        .map(|index| symbol_dictionary_entry_bytes(&symbols[index]))
79        .sum()
80}
81
82#[inline]
83fn anchor_map_dictionary_bytes(
84    symbols: &[BitImage],
85    anchor_map: &FxHashMap<FamilyBucketKey, Vec<usize>>,
86) -> usize {
87    anchor_map
88        .values()
89        .flat_map(|bucket| bucket.iter().copied())
90        .map(|index| symbol_dictionary_entry_bytes(&symbols[index]))
91        .sum()
92}
93
/// Outcome of testing one symbol against one anchor during symbol unification.
///
/// `Accept` carries the assignment score and alignment offsets; every other
/// variant names the check that rejected the pairing.
#[derive(Debug, Clone, Copy)]
enum SymUnifyAnchorDecision {
    Accept { score: u32, dx: i32, dy: i32 },
    RejectDim,
    RejectPixelDelta,
    RejectSignature,
    RejectOverlap,
    RejectCompare,
    RejectScore { score: u32, limit: u32, dx: i32, dy: i32 },
    RejectOutsideInk,
}

impl SymUnifyAnchorDecision {
    /// Short lowercase tag identifying the decision.
    fn label(self) -> &'static str {
        match self {
            Self::RejectDim => "dim",
            Self::RejectPixelDelta => "pixel_delta",
            Self::RejectSignature => "signature",
            Self::RejectOverlap => "overlap",
            Self::RejectCompare => "compare",
            Self::RejectOutsideInk => "outside_ink",
            Self::RejectScore { .. } => "score",
            Self::Accept { .. } => "accept",
        }
    }

    /// Ordering key used to pick the most informative decision to report:
    /// rejections rank from 1 (`RejectDim`) to 7 (`RejectScore`), `Accept` is 255.
    fn diagnostic_rank(self) -> u8 {
        match self {
            Self::RejectDim => 1,
            Self::RejectPixelDelta => 2,
            Self::RejectSignature => 3,
            Self::RejectOverlap => 4,
            Self::RejectCompare => 5,
            Self::RejectOutsideInk => 6,
            Self::RejectScore { .. } => 7,
            Self::Accept { .. } => 255,
        }
    }
}
142
143#[inline]
144fn update_best_reject(best: &mut Option<SymUnifyAnchorDecision>, decision: SymUnifyAnchorDecision) {
145    if !matches!(decision, SymUnifyAnchorDecision::Accept { .. })
146        && best.is_none_or(|current| decision.diagnostic_rank() > current.diagnostic_rank())
147    {
148        *best = Some(decision);
149    }
150}
151
152#[inline]
153fn bitmap_proxy_bytes(symbol: &BitImage) -> usize {
154    (symbol.width.saturating_mul(symbol.height).saturating_add(7)) / 8
155}
156
157#[inline]
158fn classify_residual_shape(symbol: &BitImage) -> ResidualShapeKind {
159    let area = symbol.width.saturating_mul(symbol.height);
160    let black = symbol.count_ones();
161    if area <= 16 || black <= 2 {
162        ResidualShapeKind::Tiny
163    } else if crate::jbig2shared::symbol_likely_punctuation_or_mark(symbol) {
164        ResidualShapeKind::PunctuationLike
165    } else {
166        ResidualShapeKind::GlyphLike
167    }
168}
169
170#[inline]
171fn record_counterfactual_probe(
172    stats: &mut CounterfactualProbeStats,
173    page_num: usize,
174    symbol_index: usize,
175    symbol: &BitImage,
176    black_pixels: usize,
177) {
178    stats.symbol_count += 1;
179    stats.black_pixels += black_pixels;
180    stats.bitmap_proxy_bytes += bitmap_proxy_bytes(symbol);
181    stats.pages.insert(page_num);
182    if stats.samples.len() < 8 {
183        stats
184            .samples
185            .push((page_num + 1, symbol_index, symbol.width, symbol.height));
186    }
187}
188
189#[inline]
190fn record_labeled_counterfactual_probe(
191    stats_map: &mut FxHashMap<&'static str, CounterfactualProbeStats>,
192    label: &'static str,
193    page_num: usize,
194    symbol_index: usize,
195    symbol: &BitImage,
196    black_pixels: usize,
197) {
198    let stats = stats_map.entry(label).or_default();
199    record_counterfactual_probe(stats, page_num, symbol_index, symbol, black_pixels);
200}
201
202#[inline]
203fn relaxed_compare_probe_max_err(candidate: &BitImage, proto: &BitImage) -> u32 {
204    candidate
205        .width
206        .max(proto.width)
207        .saturating_mul(candidate.height.max(proto.height)) as u32
208}
209
210#[inline]
211fn record_detailed_compare_probe(
212    stats: &mut DetailedCompareProbeStats,
213    page_num: usize,
214    symbol_index: usize,
215    symbol: &BitImage,
216    result: crate::jbig2comparator::CompareResult,
217    compare_max_err: u32,
218    exact_dims: bool,
219    strong_anchor: bool,
220) {
221    stats.symbol_count += 1;
222    stats.bitmap_proxy_bytes += bitmap_proxy_bytes(symbol);
223    stats.pages.insert(page_num);
224    stats.exact_dims_count += usize::from(exact_dims);
225    stats.strong_anchor_count += usize::from(strong_anchor);
226    stats.shift_le1_count += usize::from(result.dx.abs() <= 1 && result.dy.abs() <= 1);
227
228    let over_by = result.total_err.saturating_sub(compare_max_err);
229    if over_by <= 2 {
230        stats.over_by_le2_count += 1;
231    } else if over_by <= 4 {
232        stats.over_by_le4_count += 1;
233    } else if over_by <= 8 {
234        stats.over_by_le8_count += 1;
235    } else {
236        stats.over_by_gt8_count += 1;
237    }
238
239    if stats.samples.len() < 8 {
240        stats.samples.push((
241            page_num + 1,
242            symbol_index,
243            symbol.width,
244            symbol.height,
245            result.total_err,
246            compare_max_err,
247            result.overlap_err,
248            result.outside_ink_err,
249            result.dx,
250            result.dy,
251        ));
252    }
253}
254
255impl ResidualSymbolTrace {
256    fn reason_code(self) -> ResidualReasonCode {
257        if self.local_use_count != 1 {
258            return ResidualReasonCode::NonSingletonResidual;
259        }
260
261        if self.had_global_candidates {
262            return match self
263                .global_best_reject
264                .unwrap_or(SymUnifyAnchorDecision::RejectDim)
265            {
266                SymUnifyAnchorDecision::RejectDim => ResidualReasonCode::UseCountOneGlobalRejectDim,
267                SymUnifyAnchorDecision::RejectPixelDelta => {
268                    ResidualReasonCode::UseCountOneGlobalRejectPixelDelta
269                }
270                SymUnifyAnchorDecision::RejectSignature => {
271                    ResidualReasonCode::UseCountOneGlobalRejectSignature
272                }
273                SymUnifyAnchorDecision::RejectOverlap => {
274                    ResidualReasonCode::UseCountOneGlobalRejectOverlap
275                }
276                SymUnifyAnchorDecision::RejectCompare => {
277                    ResidualReasonCode::UseCountOneGlobalRejectCompare
278                }
279                SymUnifyAnchorDecision::RejectOutsideInk => {
280                    ResidualReasonCode::UseCountOneGlobalRejectOutsideInk
281                }
282                SymUnifyAnchorDecision::RejectScore { .. } => {
283                    ResidualReasonCode::UseCountOneGlobalRejectScore
284                }
285                SymUnifyAnchorDecision::Accept { .. } => {
286                    ResidualReasonCode::UseCountOneNoCandidates
287                }
288            };
289        }
290
291        if self.had_local_candidates {
292            return match self
293                .local_best_reject
294                .unwrap_or(SymUnifyAnchorDecision::RejectDim)
295            {
296                SymUnifyAnchorDecision::RejectDim => ResidualReasonCode::UseCountOneLocalRejectDim,
297                SymUnifyAnchorDecision::RejectPixelDelta => {
298                    ResidualReasonCode::UseCountOneLocalRejectPixelDelta
299                }
300                SymUnifyAnchorDecision::RejectSignature => {
301                    ResidualReasonCode::UseCountOneLocalRejectSignature
302                }
303                SymUnifyAnchorDecision::RejectOverlap => {
304                    ResidualReasonCode::UseCountOneLocalRejectOverlap
305                }
306                SymUnifyAnchorDecision::RejectCompare => {
307                    ResidualReasonCode::UseCountOneLocalRejectCompare
308                }
309                SymUnifyAnchorDecision::RejectOutsideInk => {
310                    ResidualReasonCode::UseCountOneLocalRejectOutsideInk
311                }
312                SymUnifyAnchorDecision::RejectScore { .. } => {
313                    ResidualReasonCode::UseCountOneLocalRejectScore
314                }
315                SymUnifyAnchorDecision::Accept { .. } => {
316                    ResidualReasonCode::UseCountOneNoCandidates
317                }
318            };
319        }
320
321        ResidualReasonCode::UseCountOneNoCandidates
322    }
323}
324
/// One anchor being considered as the unification target for a symbol.
#[derive(Debug, Clone, Copy)]
struct SymUnifyAnchorCandidate {
    // Index of the anchor; used to index the encoder's per-symbol tables
    // (`symbol_page_count`, `symbol_usage`) when candidates are compared.
    anchor_index: usize,
    // Match score and alignment offsets — presumably those of the accepting
    // `SymUnifyAnchorDecision`; TODO confirm at the construction site.
    score: u32,
    dx: i32,
    dy: i32,
    // Secondary cost used when ranking candidates against each other.
    rerank_cost: u32,
    // NOTE(review): looks like this marks candidates admitted through a score
    // rescue path — confirm where the flag is set.
    rescued_on_score: bool,
}
334
/// Diagnostic record of what happened to a residual symbol; consumed by
/// `ResidualSymbolTrace::reason_code`.
#[derive(Debug, Clone, Copy)]
struct ResidualSymbolTrace {
    // Page the symbol belongs to (not read by `reason_code`).
    page_num: usize,
    // Use count; any value other than 1 classifies as `NonSingletonResidual`.
    local_use_count: usize,
    // Whether any local / global candidates were examined for this symbol.
    had_local_candidates: bool,
    had_global_candidates: bool,
    // Highest-ranked rejection seen per scope; `None` is treated as `RejectDim`
    // by `reason_code`.
    local_best_reject: Option<SymUnifyAnchorDecision>,
    global_best_reject: Option<SymUnifyAnchorDecision>,
}
344
/// Reason a symbol ended up as a residual, as derived by
/// `ResidualSymbolTrace::reason_code`.
///
/// The `UseCountOne*` variants split by scope (local or global candidates) and by
/// the rejecting check; `NonSingletonResidual` covers use counts other than one.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum ResidualReasonCode {
    UseCountOneNoCandidates,
    UseCountOneLocalRejectDim,
    UseCountOneLocalRejectPixelDelta,
    UseCountOneLocalRejectSignature,
    UseCountOneLocalRejectOverlap,
    UseCountOneLocalRejectCompare,
    UseCountOneLocalRejectOutsideInk,
    UseCountOneLocalRejectScore,
    UseCountOneGlobalRejectDim,
    UseCountOneGlobalRejectPixelDelta,
    UseCountOneGlobalRejectSignature,
    UseCountOneGlobalRejectOverlap,
    UseCountOneGlobalRejectCompare,
    UseCountOneGlobalRejectOutsideInk,
    UseCountOneGlobalRejectScore,
    NonSingletonResidual,
}

impl ResidualReasonCode {
    /// Returns the variant's own name as a static string.
    fn label(self) -> &'static str {
        match self {
            Self::NonSingletonResidual => "NonSingletonResidual",
            Self::UseCountOneNoCandidates => "UseCountOneNoCandidates",

            Self::UseCountOneLocalRejectDim => "UseCountOneLocalRejectDim",
            Self::UseCountOneLocalRejectPixelDelta => "UseCountOneLocalRejectPixelDelta",
            Self::UseCountOneLocalRejectSignature => "UseCountOneLocalRejectSignature",
            Self::UseCountOneLocalRejectOverlap => "UseCountOneLocalRejectOverlap",
            Self::UseCountOneLocalRejectCompare => "UseCountOneLocalRejectCompare",
            Self::UseCountOneLocalRejectOutsideInk => "UseCountOneLocalRejectOutsideInk",
            Self::UseCountOneLocalRejectScore => "UseCountOneLocalRejectScore",

            Self::UseCountOneGlobalRejectDim => "UseCountOneGlobalRejectDim",
            Self::UseCountOneGlobalRejectPixelDelta => "UseCountOneGlobalRejectPixelDelta",
            Self::UseCountOneGlobalRejectSignature => "UseCountOneGlobalRejectSignature",
            Self::UseCountOneGlobalRejectOverlap => "UseCountOneGlobalRejectOverlap",
            Self::UseCountOneGlobalRejectCompare => "UseCountOneGlobalRejectCompare",
            Self::UseCountOneGlobalRejectOutsideInk => "UseCountOneGlobalRejectOutsideInk",
            Self::UseCountOneGlobalRejectScore => "UseCountOneGlobalRejectScore",
        }
    }
}
399
/// Coarse shape category assigned to a residual symbol by
/// `classify_residual_shape`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ResidualShapeKind {
    // Bounding-box area of at most 16 pixels, or at most 2 black pixels.
    Tiny,
    // Accepted by `symbol_likely_punctuation_or_mark`.
    PunctuationLike,
    // Anything that is neither tiny nor punctuation-like.
    GlyphLike,
}
406
/// Aggregated diagnostics for residual symbols — presumably one instance per
/// `ResidualReasonCode`; the code that fills it is not visible here, TODO confirm.
#[derive(Debug, Clone, Default)]
struct ResidualReasonStats {
    symbol_count: usize,
    instance_count: usize,
    black_pixels: usize,
    bitmap_proxy_bytes: usize,
    // Set of page numbers on which contributing symbols were seen.
    pages: FxHashSet<usize>,
    // Presumably per-`ResidualShapeKind` tallies — verify against the writer.
    tiny_count: usize,
    punctuation_like_count: usize,
    glyph_like_count: usize,
    // Retained example tuples; the field order is defined by the writer, which is
    // outside this view.
    samples: Vec<(usize, usize, usize, usize, usize)>,
}
419
/// Accumulator filled by `record_counterfactual_probe`.
#[derive(Debug, Clone, Default)]
struct CounterfactualProbeStats {
    // Number of symbols recorded.
    symbol_count: usize,
    // Sum of the `black_pixels` values passed in.
    black_pixels: usize,
    // Sum of `bitmap_proxy_bytes` over the recorded symbols.
    bitmap_proxy_bytes: usize,
    // Page numbers as passed in (the samples below store `page_num + 1` instead).
    pages: FxHashSet<usize>,
    // At most eight `(page_num + 1, symbol_index, width, height)` examples.
    samples: Vec<(usize, usize, usize, usize)>,
}
428
/// Accumulator filled by `record_detailed_compare_probe`.
#[derive(Debug, Clone, Default)]
struct DetailedCompareProbeStats {
    // Number of comparisons recorded.
    symbol_count: usize,
    // Sum of `bitmap_proxy_bytes` over the recorded symbols.
    bitmap_proxy_bytes: usize,
    // Page numbers as passed in (the samples below store `page_num + 1` instead).
    pages: FxHashSet<usize>,
    // How many records had `exact_dims` / `strong_anchor` set.
    exact_dims_count: usize,
    strong_anchor_count: usize,
    // How many records had both |dx| <= 1 and |dy| <= 1.
    shift_le1_count: usize,
    // Buckets of `total_err - compare_max_err` (saturating at 0):
    // <= 2, 3..=4, 5..=8 and > 8 respectively.
    over_by_le2_count: usize,
    over_by_le4_count: usize,
    over_by_le8_count: usize,
    over_by_gt8_count: usize,
    // At most eight examples of (page_num + 1, symbol_index, width, height,
    // total_err, compare_max_err, overlap_err, outside_ink_err, dx, dy).
    samples: Vec<(usize, usize, usize, usize, u32, u32, u32, u32, i32, i32)>,
}
443
/// Bounded most-recently-used list of symbol indices.
///
/// The front of the deque is the most recently touched index; the list never
/// holds more than `cap` entries.
#[derive(Debug)]
struct RecentSymbolCache {
    recent: VecDeque<usize>,
    cap: usize,
}

impl RecentSymbolCache {
    /// Creates an empty cache that retains at most `cap` indices.
    fn new(cap: usize) -> Self {
        let recent = VecDeque::with_capacity(cap);
        Self { recent, cap }
    }

    /// Forgets every remembered index.
    fn clear(&mut self) {
        self.recent.clear();
    }

    /// Moves `idx` to the front, inserting it if absent, then drops the oldest
    /// entries that no longer fit within `cap`.
    fn touch(&mut self, idx: usize) {
        let existing = self.recent.iter().position(|&entry| entry == idx);
        if let Some(at) = existing {
            self.recent.remove(at);
        }
        self.recent.push_front(idx);
        // Keeps the first `cap` entries, i.e. evicts from the back.
        self.recent.truncate(self.cap);
    }

    /// Iterates the remembered indices from most to least recently touched.
    fn iter(&self) -> impl Iterator<Item = usize> + '_ {
        self.recent.iter().copied()
    }

    /// Copies as many indices as fit into `out` (most recent first) and returns
    /// how many slots were written.
    fn copy_into(&self, out: &mut [usize]) -> usize {
        let written = out.len().min(self.recent.len());
        for (slot, &idx) in out.iter_mut().zip(self.recent.iter()) {
            *slot = idx;
        }
        written
    }
}
488
489impl Hash for HashKey {
490    fn hash<H: Hasher>(&self, state: &mut H) {
491        self.0.hash(state);
492    }
493}
494
495impl std::fmt::Display for HashKey {
496    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
497        write!(f, "HashKey({:x})", self.0)
498    }
499}
500
/// A candidate symbol extracted from a document image.
///
/// Produced by [`segment_symbols`], one per extracted shape.
#[derive(Debug, Clone)]
pub struct SymbolCandidate {
    /// The bitmap image of the symbol.
    pub bitmap: BitImage,
    /// The bounding box of the symbol in the original image.
    pub bbox: Rect,
}
509
510/// Segment a document image into symbol candidates.
511///
512/// This function finds connected components in the input image and returns
513/// them as symbol candidates. Each candidate has a bitmap and a bounding box.
514///
515/// # Arguments
516/// * `image` - The input binary image to segment
517/// * `dpi` - Resolution in dots per inch (typically 300 for scanned documents)
518/// * `losslevel` - 0 for lossless, >0 to enable noise removal
519pub fn segment_symbols(image: &BitImage, dpi: i32, losslevel: i32) -> Result<Vec<SymbolCandidate>> {
520    #[cfg(feature = "cc-analysis")]
521    {
522        // Use the new CC analysis pipeline from jbig2cc
523        let cc_image = analyze_page(image, dpi, losslevel);
524        let shapes = cc_image.extract_shapes();
525
526        let mut candidates = Vec::with_capacity(shapes.len());
527        for (bitmap, bbox) in shapes {
528            let rect = Rect {
529                x: bbox.xmin as u32,
530                y: bbox.ymin as u32,
531                width: bbox.width() as u32,
532                height: bbox.height() as u32,
533            };
534            candidates.push(SymbolCandidate { bitmap, bbox: rect });
535        }
536        Ok(candidates)
537    }
538    #[cfg(not(feature = "cc-analysis"))]
539    {
540        Err(anyhow!("Symbol segmentation requires cc-analysis feature"))
541    }
542}
543
544// Jbig2EncConfig has been removed. Use jbig2structs::Jbig2Config directly.
545
/// One placed occurrence of a dictionary symbol on a page.
#[derive(Clone)]
pub struct SymbolInstance {
    /// Index of the symbol this instance refers to.
    pub symbol_index: usize,
    /// Rectangle describing where the instance sits.
    pub position: Rect,
    /// The instance's own bitmap.
    pub instance_bitmap: BitImage,
    /// Whether this instance needs refinement coding (bitmap differs from prototype)
    pub needs_refinement: bool,
    /// Horizontal alignment offset for refinement (from Comparator)
    pub refinement_dx: i32,
    /// Vertical alignment offset for refinement (from Comparator)
    pub refinement_dy: i32,
}

impl SymbolInstance {
    /// Returns the `symbol_index` field.
    pub fn symbol_index(&self) -> usize {
        self.symbol_index
    }

    /// Returns a copy of the `position` rectangle.
    pub fn position(&self) -> Rect {
        self.position
    }

    /// Borrows the instance bitmap.
    pub fn instance_bitmap(&self) -> &BitImage {
        &self.instance_bitmap
    }
}
572
/// A page image together with the symbol instances recorded for it.
#[derive(Clone)]
pub struct PageData {
    /// The page bitmap.
    pub image: BitImage,
    /// Symbol instances belonging to this page.
    pub symbol_instances: Vec<SymbolInstance>,
}
578
/// Per-stage durations for symbol-mode encoding.
///
/// NOTE(review): the stage meanings are taken from the field names; the code
/// that fills them is outside this view.
#[derive(Debug, Clone, Default)]
pub struct SymbolModeStageMetrics {
    pub cc_extraction: Duration,
    pub matching_dedup: Duration,
    pub clustering: Duration,
    pub planning: Duration,
    pub symbol_dict_encoding: Duration,
    pub text_region_encoding: Duration,
    pub generic_region_encoding: Duration,
}
589
/// Counters describing symbol-mode encoding.
///
/// NOTE(review): the counter meanings are taken from the field names; the code
/// that updates them is outside this view.
#[derive(Debug, Clone, Default)]
pub struct SymbolModeStats {
    pub symbols_discovered: usize,
    pub symbols_exported: usize,
    pub avg_symbol_reuse: f64,
    pub global_symbol_count: usize,
    pub local_symbol_count: usize,
    pub comparator_calls: usize,
    pub comparator_hits: usize,
    pub exact_hits: usize,
    pub refined_hits: usize,
    pub signature_rejects: usize,
}
603
/// Metrics bundle held by the encoder; `Jbig2Encoder::metrics_snapshot` returns
/// a clone of it.
#[derive(Debug, Clone, Default)]
pub struct EncoderMetrics {
    /// Per-stage durations.
    pub symbol_mode: SymbolModeStageMetrics,
    /// Symbol-mode counters.
    pub symbol_stats: SymbolModeStats,
}
609
/// Encoder output split into an optional global-segment blob and per-page
/// streams, plus per-page byte counts.
///
/// NOTE(review): presumably the `*_per_page` vectors are parallel to
/// `page_streams` — confirm at the producer.
#[derive(Debug, Clone)]
pub struct PdfSplitOutput {
    pub global_segments: Option<Vec<u8>>,
    pub page_streams: Vec<Vec<u8>>,
    pub local_dict_bytes_per_page: Vec<usize>,
    pub text_region_bytes_per_page: Vec<usize>,
    pub generic_region_bytes_per_page: Vec<usize>,
}
618
/// The segments planned for one page, tagged with the page number.
#[derive(Debug)]
struct PlannedPage {
    page_number: u32,
    segments: Vec<Segment>,
}
624
/// A planned document: optional file header, global segments, per-page plans
/// and an optional end-of-file segment.
#[derive(Debug)]
struct PlannedDocument {
    file_header: Option<FileHeader>,
    global_segments: Vec<Segment>,
    pages: Vec<PlannedPage>,
    eof_segment: Option<Segment>,
    // Presumably the first segment number not consumed by this plan — TODO confirm.
    next_segment_number: u32,
}
633
/// Layout decisions for one page: the segment numbers reserved for it and the
/// symbol index lists it uses.
///
/// NOTE(review): field semantics are inferred from names only; the planner that
/// builds this struct is outside this view.
#[derive(Debug, Clone)]
struct PlannedPageLayout {
    page_index: usize,
    page_number: u32,
    page_info_segment_number: u32,
    local_dict_segment_numbers: Vec<u32>,
    local_dict_layout: Option<SymbolDictLayout>,
    region_segment_number: u32,
    residual_region_segment_number: Option<u32>,
    end_of_page_segment_number: u32,
    local_symbols: Vec<usize>,
    residual_symbols: Vec<usize>,
    residual_anchor_remaps: FxHashMap<usize, usize>,
    use_generic_region: bool,
}
649
/// A built page plan bundled with three per-stage durations.
#[derive(Debug)]
struct BuiltPage {
    page: PlannedPage,
    // Presumably the time spent producing each kind of segment for this page —
    // TODO confirm at the builder.
    symbol_dict_time: Duration,
    text_region_time: Duration,
    generic_region_time: Duration,
}
657
658#[derive(Debug, Clone, Default)]
659struct SymbolDictLayout {
660    export_input_indices: Vec<usize>,
661    refinements: Vec<Option<RefinementPlan>>,
662    diagnostics: SymbolDictDiagnostics,
663}
664
665impl SymbolDictLayout {
666    fn segment_count(&self) -> usize {
667        if self.export_input_indices.is_empty() {
668            0
669        } else {
670            1
671        }
672    }
673}
674
/// Diagnostic counters and sample text lines attached to a `SymbolDictLayout`.
///
/// NOTE(review): counter meanings are inferred from the names; the code that
/// fills them is outside this view.
#[derive(Debug, Clone, Default)]
struct SymbolDictDiagnostics {
    family_count: usize,
    singleton_family_count: usize,
    refined_member_count: usize,
    exported_member_count: usize,
    sample_lines: Vec<String>,
}
683
/// Refinement plan for one dictionary entry: a prototype's input index plus a
/// (dx, dy) offset.
#[derive(Debug, Clone, Copy)]
struct RefinementPlan {
    // Presumably an index into the same input list as
    // `SymbolDictLayout::export_input_indices` — TODO confirm.
    prototype_input_index: usize,
    refinement_dx: i32,
    refinement_dy: i32,
}
690
/// An encoded symbol dictionary: the payload bytes plus position bookkeeping.
#[derive(Debug, Clone, Default)]
struct EncodedSymbolDictionary {
    payload: Vec<u8>,
    // Presumably maps each input symbol to its position among the exported
    // symbols — verify against the encoder that fills it.
    input_to_exported_pos: Vec<u32>,
    exported_symbol_count: u32,
}
697
/// Mutable state for the encoder that can change during encoding.
#[derive(Debug, Default)]
struct EncoderState {
    // `false` after `Jbig2Encoder::new` ("raw mode"); `dict_only` sets it to `true`.
    pdf_mode: bool,
    // Seeded from `config.want_full_headers`; cleared by `dict_only`.
    full_headers_remaining: bool,
    // Set to `true` by `Jbig2Encoder::new` (use segments).
    segment: bool,
    // Seeded from `config.refine`.
    use_refinement: bool,
    // Set to `true` by `Jbig2Encoder::new`.
    use_delta_encoding: bool,
    // Starts `false`; presumably flipped once the lossy symbol pass has run.
    lossy_symbol_mode_applied: bool,
    // Debug lines; `decision_debug_log` emits the ingest lines before the
    // decision lines.
    ingest_debug_lines: Vec<String>,
    decision_debug_lines: Vec<String>,
}
710
/// Main JBIG2 encoder that handles document encoding
///
/// This struct manages the encoding state and configuration for JBIG2 documents.
/// It supports both symbol-based and generic region encoding strategies.
///
/// The encoder borrows its configuration for the lifetime `'a`; it does not own
/// or clone it.
pub struct Jbig2Encoder<'a> {
    /// Configuration for the encoder
    config: &'a Jbig2Config,

    /// Internal encoder state
    state: EncoderState,

    /// Global symbols (shared across pages)
    global_symbols: Vec<BitImage>,

    /// Usage count for each global symbol
    symbol_usage: Vec<usize>,

    /// Black pixel count cache for each global symbol (for fast pre-filtering)
    symbol_pixel_counts: Vec<usize>,

    /// Cheap structural signatures used to reject bad matches before full comparison
    symbol_signatures: Vec<SymbolSignature>,

    /// Number of distinct pages where each symbol appears
    symbol_page_count: Vec<usize>,

    /// Last page where the symbol was seen, used to deduplicate per-page membership updates
    symbol_last_page_seen: Vec<Option<usize>>,

    /// Hash map for quick symbol lookup
    hash_map: FxHashMap<HashKey, Vec<usize>>,

    /// Page data for each page in the document
    pages: Vec<PageData>,

    /// Per-page unique symbol indices, built incrementally during extraction
    page_symbol_indices: Vec<Vec<usize>>,

    /// Next available segment number
    next_segment_number: u32,

    /// Segment numbers of the global dictionary segments, in text-region symbol-ID order.
    global_dict_segment_numbers: Vec<u32>,

    /// Encoder metrics used by the benchmark harness
    metrics: EncoderMetrics,
}
758
759impl<'a> Jbig2Encoder<'a> {
760    /// Creates a new JBIG2 encoder with the specified configuration
761    ///
762    /// # Arguments
763    /// * `config` - Configuration for the encoder
764    pub fn new(config: &'a Jbig2Config) -> Self {
765        if config.refine && !config.symbol_mode {
766            panic!("Refinement requires symbol mode to be enabled.");
767        }
768
769        Self {
770            config,
771            state: EncoderState {
772                pdf_mode: false, // start in raw mode
773                full_headers_remaining: config.want_full_headers,
774                segment: true,                 // Default to using segments
775                use_refinement: config.refine, // Enable refinement based on config
776                use_delta_encoding: true,      // Default to using delta encoding
777                lossy_symbol_mode_applied: false,
778                ingest_debug_lines: Vec::new(),
779                decision_debug_lines: Vec::new(),
780            },
781            global_symbols: Vec::new(),
782            symbol_usage: Vec::new(),
783            symbol_pixel_counts: Vec::new(),
784            symbol_signatures: Vec::new(),
785            symbol_page_count: Vec::new(),
786            symbol_last_page_seen: Vec::new(),
787            hash_map: FxHashMap::default(),
788            pages: Vec::new(),
789            page_symbol_indices: Vec::new(),
790            next_segment_number: 0,
791            global_dict_segment_numbers: Vec::new(),
792            metrics: EncoderMetrics::default(),
793        }
794    }
795
    /// Switches the encoder into PDF mode with full headers disabled.
    ///
    /// Takes the encoder by value and returns it, so the call can be chained
    /// directly after construction.
    pub fn dict_only(mut self) -> Self {
        self.state.full_headers_remaining = false;
        self.state.pdf_mode = true;
        self
    }
801
    /// Returns the number of pages currently added to the encoder
    ///
    /// This is the length of the internal page list.
    pub fn get_page_count(&self) -> usize {
        self.pages.len()
    }
806
    /// Returns a clone of the encoder's current metrics; later encoder activity
    /// does not affect the returned value.
    pub fn metrics_snapshot(&self) -> EncoderMetrics {
        self.metrics.clone()
    }
810
811    pub fn decision_debug_log(&self) -> String {
812        if self.state.ingest_debug_lines.is_empty() {
813            return self.state.decision_debug_lines.join("\n");
814        }
815        if self.state.decision_debug_lines.is_empty() {
816            return self.state.ingest_debug_lines.join("\n");
817        }
818
819        let mut out = String::new();
820        out.push_str(&self.state.ingest_debug_lines.join("\n"));
821        out.push('\n');
822        out.push_str(&self.state.decision_debug_lines.join("\n"));
823        out
824    }
825
826    /// Returns debug information about symbol usage
827    pub fn get_symbol_stats(&self) -> String {
828        let total_symbols = self.global_symbols.len();
829        let avg_usage = if total_symbols > 0 {
830            self.symbol_usage.iter().sum::<usize>() as f32 / total_symbols as f32
831        } else {
832            0.0
833        };
834        let low_usage_count = self.symbol_usage.iter().filter(|&&u| u < 2).count();
835
836        format!(
837            "Total symbols: {}, Average usage: {:.1}, Low usage (<2): {}",
838            total_symbols, avg_usage, low_usage_count
839        )
840    }
841
    /// Computes the signature of `img` by delegating to the shared
    /// `jbig2classify::compute_symbol_signature` (imported here as
    /// `compute_symbol_signature_shared`).
    fn compute_symbol_signature(img: &BitImage) -> SymbolSignature {
        compute_symbol_signature_shared(img)
    }
845
846    fn signatures_are_compatible(
847        &self,
848        candidate: SymbolSignature,
849        symbol_index: usize,
850        refine: bool,
851    ) -> bool {
852        let stored = self.symbol_signatures[symbol_index];
853        let black_tol = if refine { 12 } else { 8 };
854        let pos_tol = if refine { 2 } else { 2 };
855        let centroid_tol = if refine { 96 } else { 64 };
856
857        candidate.black.abs_diff(stored.black) <= black_tol
858            && candidate.left_col.abs_diff(stored.left_col) <= pos_tol
859            && candidate.right_col.abs_diff(stored.right_col) <= pos_tol
860            && candidate.top_row.abs_diff(stored.top_row) <= pos_tol
861            && candidate.bottom_row.abs_diff(stored.bottom_row) <= pos_tol
862            && candidate.cx_times_256.abs_diff(stored.cx_times_256) <= centroid_tol
863            && candidate.cy_times_256.abs_diff(stored.cy_times_256) <= centroid_tol
864    }
865
866    fn should_skip_symbol_candidate(width: usize, height: usize, black_pixels: usize) -> bool {
867        if width == 0 || height == 0 || black_pixels <= 1 {
868            return true;
869        }
870        if (width >= 64 && height <= 2) || (height >= 64 && width <= 2) {
871            return true;
872        }
873        if width > 256 || height > 256 {
874            return true;
875        }
876
877        let area = width.saturating_mul(height).max(1);
878        let density = black_pixels as f32 / area as f32;
879        let dense_tiny_mark = width <= 6 && height <= 10 && black_pixels <= 24;
880        if dense_tiny_mark {
881            return density < 0.01;
882        }
883        !(0.01..=0.90).contains(&density)
884    }
885
886    #[inline(always)]
887    fn should_accept_match(
888        &self,
889        err: u32,
890        dx: i32,
891        dy: i32,
892        exact_dims: bool,
893        max_err: u32,
894    ) -> (bool, bool) {
895        if err == 0 && dx == 0 && dy == 0 && exact_dims {
896            return (true, false);
897        }
898
899        if self.config.text_refine {
900            if dx.abs() <= 1 && dy.abs() <= 1 && err <= (max_err / 2).max(2) {
901                return (true, true);
902            }
903            return (false, false);
904        }
905
906        if dx.abs() <= 1 && dy == 0 {
907            return (true, false);
908        }
909
910        (false, false)
911    }
912
913    #[inline]
914    fn symbol_unify_assignment_score(result: &crate::jbig2comparator::CompareResult) -> u32 {
915        result
916            .total_err
917            .saturating_add(result.black_delta.saturating_mul(2))
918            .saturating_add(result.outside_ink_err.saturating_mul(3))
919            .saturating_add(((result.dx.abs() + result.dy.abs()) as u32).saturating_mul(3))
920            .saturating_add((result.row_profile_err + result.col_profile_err) / 24)
921    }
922
923    fn sym_unify_context_rerank_cost(candidate: &BitImage, proto: &BitImage) -> u32 {
924        let width = candidate.width.max(proto.width);
925        let height = candidate.height.max(proto.height);
926        let mut cost = 0u32;
927
928        for y in 0..height {
929            for x in 0..width {
930                let cand = candidate.get_usize(x, y);
931                let proto_bit = proto.get_usize(x, y);
932                if !cand && !proto_bit {
933                    continue;
934                }
935
936                let proto_support = (-1i32..=1)
937                    .flat_map(|dy| (-1i32..=1).map(move |dx| (dx, dy)))
938                    .filter(|&(dx, dy)| dx != 0 || dy != 0)
939                    .filter(|&(dx, dy)| {
940                        let nx = x as i32 + dx;
941                        let ny = y as i32 + dy;
942                        nx >= 0 && ny >= 0 && proto.get_usize(nx as usize, ny as usize)
943                    })
944                    .count() as u32;
945                let causal_support = [(-1i32, 0i32), (-1, -1), (0, -1), (1, -1)]
946                    .into_iter()
947                    .filter(|&(dx, dy)| {
948                        let nx = x as i32 + dx;
949                        let ny = y as i32 + dy;
950                        nx >= 0 && ny >= 0 && candidate.get_usize(nx as usize, ny as usize)
951                    })
952                    .count() as u32;
953
954                if cand == proto_bit {
955                    cost = cost.saturating_add(1 + u32::from(cand && proto_support == 0));
956                } else {
957                    cost = cost.saturating_add(4 + proto_support + causal_support);
958                }
959            }
960        }
961
962        cost
963    }
964
965    fn sym_unify_anchor_candidate_is_better(
966        &self,
967        candidate: SymUnifyAnchorCandidate,
968        current: SymUnifyAnchorCandidate,
969    ) -> bool {
970        (
971            !candidate.rescued_on_score,
972            std::cmp::Reverse(candidate.rerank_cost),
973            std::cmp::Reverse(candidate.score),
974            self.symbol_page_count[candidate.anchor_index],
975            self.symbol_usage[candidate.anchor_index],
976            std::cmp::Reverse(candidate.anchor_index),
977        ) > (
978            !current.rescued_on_score,
979            std::cmp::Reverse(current.rerank_cost),
980            std::cmp::Reverse(current.score),
981            self.symbol_page_count[current.anchor_index],
982            self.symbol_usage[current.anchor_index],
983            std::cmp::Reverse(current.anchor_index),
984        )
985    }
986
987    fn maybe_update_best_sym_unify_anchor_candidate(
988        &self,
989        best: &mut Option<SymUnifyAnchorCandidate>,
990        candidate_bitmap: &BitImage,
991        anchor_index: usize,
992        score: u32,
993        dx: i32,
994        dy: i32,
995        rescued_on_score: bool,
996    ) {
997        let rerank_cost = Self::sym_unify_context_rerank_cost(
998            candidate_bitmap,
999            &self.global_symbols[anchor_index],
1000        );
1001        let proposal = SymUnifyAnchorCandidate {
1002            anchor_index,
1003            score,
1004            dx,
1005            dy,
1006            rerank_cost,
1007            rescued_on_score,
1008        };
1009        if best.is_none_or(|current| self.sym_unify_anchor_candidate_is_better(proposal, current)) {
1010            *best = Some(proposal);
1011        }
1012    }
1013
1014    #[inline]
1015    fn sym_unify_anchor_ready(&self, symbol_index: usize, page_num: usize) -> bool {
1016        if self.symbol_usage[symbol_index] < 2 || self.symbol_pixel_counts[symbol_index] <= 1 {
1017            return false;
1018        }
1019
1020        let usage_ready =
1021            self.symbol_usage[symbol_index] >= self.config.sym_unify_min_class_usage.max(2);
1022        let page_span_ready =
1023            self.symbol_page_count[symbol_index] >= self.config.sym_unify_min_page_span.max(2);
1024        let recent_ready = self.symbol_last_page_seen[symbol_index]
1025            .map(|last| page_num.saturating_sub(last) <= 1)
1026            .unwrap_or(false)
1027            && self.symbol_usage[symbol_index] >= 3;
1028
1029        usage_ready || page_span_ready || recent_ready
1030    }
1031
1032    fn build_sym_unify_anchor_map(
1033        &self,
1034        page_num: usize,
1035    ) -> FxHashMap<FamilyBucketKey, Vec<usize>> {
1036        let mut anchors: FxHashMap<FamilyBucketKey, Vec<usize>> = FxHashMap::default();
1037        for symbol_index in 0..self.global_symbols.len() {
1038            if !self.sym_unify_anchor_ready(symbol_index, page_num) {
1039                continue;
1040            }
1041            let key = family_bucket_key_for_symbol(
1042                &self.global_symbols[symbol_index],
1043                &self.symbol_signatures[symbol_index],
1044            );
1045            anchors.entry(key).or_default().push(symbol_index);
1046        }
1047        for bucket in anchors.values_mut() {
1048            bucket.sort_unstable_by(|&lhs, &rhs| {
1049                self.symbol_page_count[rhs]
1050                    .cmp(&self.symbol_page_count[lhs])
1051                    .then_with(|| self.symbol_usage[rhs].cmp(&self.symbol_usage[lhs]))
1052                    .then_with(|| self.symbol_pixel_counts[rhs].cmp(&self.symbol_pixel_counts[lhs]))
1053                    .then_with(|| lhs.cmp(&rhs))
1054            });
1055        }
1056        anchors
1057    }
1058
1059    fn maybe_add_sym_unify_anchor(
1060        &self,
1061        anchors: &mut FxHashMap<FamilyBucketKey, Vec<usize>>,
1062        symbol_index: usize,
1063        page_num: usize,
1064    ) {
1065        if !self.sym_unify_anchor_ready(symbol_index, page_num) {
1066            return;
1067        }
1068        let key = family_bucket_key_for_symbol(
1069            &self.global_symbols[symbol_index],
1070            &self.symbol_signatures[symbol_index],
1071        );
1072        let bucket = anchors.entry(key).or_default();
1073        if !bucket.contains(&symbol_index) {
1074            bucket.push(symbol_index);
1075            bucket.sort_unstable_by(|&lhs, &rhs| {
1076                self.symbol_page_count[rhs]
1077                    .cmp(&self.symbol_page_count[lhs])
1078                    .then_with(|| self.symbol_usage[rhs].cmp(&self.symbol_usage[lhs]))
1079                    .then_with(|| self.symbol_pixel_counts[rhs].cmp(&self.symbol_pixel_counts[lhs]))
1080                    .then_with(|| lhs.cmp(&rhs))
1081            });
1082        }
1083    }
1084
1085    fn residual_symbol_matches_anchor(
1086        &self,
1087        residual_index: usize,
1088        anchor_index: usize,
1089        comparator: &mut Comparator,
1090    ) -> bool {
1091        matches!(
1092            self.residual_symbol_anchor_decision(residual_index, anchor_index, comparator),
1093            SymUnifyAnchorDecision::Accept { .. }
1094        )
1095    }
1096
1097    fn residual_symbol_anchor_decision(
1098        &self,
1099        residual_index: usize,
1100        anchor_index: usize,
1101        comparator: &mut Comparator,
1102    ) -> SymUnifyAnchorDecision {
1103        let candidate = &self.global_symbols[residual_index];
1104        let proto = &self.global_symbols[anchor_index];
1105        if candidate.width.abs_diff(proto.width) > 1 || candidate.height.abs_diff(proto.height) > 1
1106        {
1107            return SymUnifyAnchorDecision::RejectDim;
1108        }
1109
1110        let strong_anchor = self.symbol_usage[anchor_index] >= SYM_UNIFY_STRONG_ANCHOR_MIN_USAGE
1111            || self.symbol_page_count[anchor_index] >= SYM_UNIFY_STRONG_ANCHOR_MIN_PAGE_SPAN;
1112        let exact_dims = candidate.width == proto.width && candidate.height == proto.height;
1113        let area = candidate
1114            .width
1115            .max(proto.width)
1116            .saturating_mul(candidate.height.max(proto.height));
1117        let pixel_delta_limit = (area / 10).clamp(4, 16) + usize::from(strong_anchor);
1118        let black_delta = self.symbol_pixel_counts[anchor_index]
1119            .abs_diff(self.symbol_pixel_counts[residual_index]);
1120        if black_delta > pixel_delta_limit {
1121            return SymUnifyAnchorDecision::RejectPixelDelta;
1122        }
1123        let signature_compatible = family_signatures_are_compatible(
1124            self.symbol_signatures[residual_index],
1125            self.symbol_signatures[anchor_index],
1126            self.symbol_pixel_counts[residual_index],
1127            self.symbol_pixel_counts[anchor_index],
1128        );
1129        if !signature_compatible {
1130            let soft_signature_black_delta_limit = 4 + usize::from(strong_anchor);
1131            if !exact_dims || black_delta > soft_signature_black_delta_limit {
1132                return SymUnifyAnchorDecision::RejectSignature;
1133            }
1134        }
1135
1136        let overlap_limit = self
1137            .config
1138            .sym_unify_max_err
1139            .max(4)
1140            .saturating_add(2)
1141            .saturating_add(u32::from(strong_anchor))
1142            .min(15);
1143        let Some(overlap) = comparator.compare_overlap_only(candidate, proto, overlap_limit) else {
1144            return SymUnifyAnchorDecision::RejectOverlap;
1145        };
1146        if overlap.dx.abs() > self.config.sym_unify_max_dx.max(0)
1147            || overlap.dy.abs() > self.config.sym_unify_max_dy.max(0)
1148            || overlap.overlap_err > overlap_limit
1149            || overlap.black_delta > pixel_delta_limit as u32
1150        {
1151            return SymUnifyAnchorDecision::RejectOverlap;
1152        }
1153
1154        let compare_max_err = self
1155            .config
1156            .sym_unify_max_err
1157            .max(4)
1158            .saturating_add(u32::from(strong_anchor));
1159        let Some(result) = comparator.compare_for_symbol_unify(
1160            candidate,
1161            proto,
1162            compare_max_err,
1163            self.config.sym_unify_max_dx.max(0),
1164            self.config.sym_unify_max_dy.max(0),
1165        ) else {
1166            return SymUnifyAnchorDecision::RejectCompare;
1167        };
1168
1169        let outside_limit =
1170            self.config.sym_unify_max_border_outside_ink.min(1) + u32::from(strong_anchor);
1171        if result.outside_ink_err > outside_limit {
1172            return SymUnifyAnchorDecision::RejectOutsideInk;
1173        }
1174
1175        let score = Self::symbol_unify_assignment_score(&result);
1176        let score_limit = self.config.sym_unify_class_accept_limit + u32::from(strong_anchor);
1177        if score > score_limit {
1178            return SymUnifyAnchorDecision::RejectScore {
1179                score,
1180                limit: score_limit,
1181                dx: result.dx,
1182                dy: result.dy,
1183            };
1184        }
1185
1186        SymUnifyAnchorDecision::Accept {
1187            score,
1188            dx: result.dx,
1189            dy: result.dy,
1190        }
1191    }
1192
1193    fn residual_symbol_accept_with_dim_limit(
1194        &self,
1195        residual_index: usize,
1196        anchor_index: usize,
1197        comparator: &mut Comparator,
1198        dim_limit: usize,
1199    ) -> bool {
1200        let candidate = &self.global_symbols[residual_index];
1201        let proto = &self.global_symbols[anchor_index];
1202        if candidate.width.abs_diff(proto.width) > dim_limit
1203            || candidate.height.abs_diff(proto.height) > dim_limit
1204        {
1205            return false;
1206        }
1207
1208        let strong_anchor = self.symbol_usage[anchor_index] >= SYM_UNIFY_STRONG_ANCHOR_MIN_USAGE
1209            || self.symbol_page_count[anchor_index] >= SYM_UNIFY_STRONG_ANCHOR_MIN_PAGE_SPAN;
1210        let exact_dims = candidate.width == proto.width && candidate.height == proto.height;
1211        let area = candidate
1212            .width
1213            .max(proto.width)
1214            .saturating_mul(candidate.height.max(proto.height));
1215        let pixel_delta_limit = (area / 10).clamp(4, 16) + usize::from(strong_anchor);
1216        let black_delta = self.symbol_pixel_counts[anchor_index]
1217            .abs_diff(self.symbol_pixel_counts[residual_index]);
1218        if black_delta > pixel_delta_limit {
1219            return false;
1220        }
1221
1222        let signature_compatible = family_signatures_are_compatible(
1223            self.symbol_signatures[residual_index],
1224            self.symbol_signatures[anchor_index],
1225            self.symbol_pixel_counts[residual_index],
1226            self.symbol_pixel_counts[anchor_index],
1227        );
1228        if !signature_compatible {
1229            let soft_signature_black_delta_limit = 4 + usize::from(strong_anchor);
1230            if !exact_dims || black_delta > soft_signature_black_delta_limit {
1231                return false;
1232            }
1233        }
1234
1235        let overlap_limit = self
1236            .config
1237            .sym_unify_max_err
1238            .max(4)
1239            .saturating_add(2)
1240            .saturating_add(u32::from(strong_anchor))
1241            .min(15);
1242        let Some(overlap) = comparator.compare_overlap_only(candidate, proto, overlap_limit) else {
1243            return false;
1244        };
1245        if overlap.dx.abs() > self.config.sym_unify_max_dx.max(0)
1246            || overlap.dy.abs() > self.config.sym_unify_max_dy.max(0)
1247            || overlap.overlap_err > overlap_limit
1248            || overlap.black_delta > pixel_delta_limit as u32
1249        {
1250            return false;
1251        }
1252
1253        let compare_max_err = self
1254            .config
1255            .sym_unify_max_err
1256            .max(4)
1257            .saturating_add(u32::from(strong_anchor));
1258        let Some(result) = comparator.compare_for_symbol_unify(
1259            candidate,
1260            proto,
1261            compare_max_err,
1262            self.config.sym_unify_max_dx.max(0),
1263            self.config.sym_unify_max_dy.max(0),
1264        ) else {
1265            return false;
1266        };
1267
1268        let outside_limit =
1269            self.config.sym_unify_max_border_outside_ink.min(1) + u32::from(strong_anchor);
1270        if result.outside_ink_err > outside_limit {
1271            return false;
1272        }
1273
1274        let score = Self::symbol_unify_assignment_score(&result);
1275        let score_limit = self.config.sym_unify_class_accept_limit + u32::from(strong_anchor);
1276        score <= score_limit
1277    }
1278
1279    fn residual_symbol_accept_without_overlap_prescreen(
1280        &self,
1281        residual_index: usize,
1282        anchor_index: usize,
1283        comparator: &mut Comparator,
1284    ) -> bool {
1285        matches!(
1286            self.residual_symbol_anchor_decision_without_overlap_prescreen(
1287                residual_index,
1288                anchor_index,
1289                comparator,
1290            ),
1291            SymUnifyAnchorDecision::Accept { .. }
1292        )
1293    }
1294
1295    fn residual_symbol_anchor_decision_without_overlap_prescreen(
1296        &self,
1297        residual_index: usize,
1298        anchor_index: usize,
1299        comparator: &mut Comparator,
1300    ) -> SymUnifyAnchorDecision {
1301        let candidate = &self.global_symbols[residual_index];
1302        let proto = &self.global_symbols[anchor_index];
1303        if candidate.width.abs_diff(proto.width) > 1 || candidate.height.abs_diff(proto.height) > 1
1304        {
1305            return SymUnifyAnchorDecision::RejectDim;
1306        }
1307
1308        let strong_anchor = self.symbol_usage[anchor_index] >= SYM_UNIFY_STRONG_ANCHOR_MIN_USAGE
1309            || self.symbol_page_count[anchor_index] >= SYM_UNIFY_STRONG_ANCHOR_MIN_PAGE_SPAN;
1310        let exact_dims = candidate.width == proto.width && candidate.height == proto.height;
1311        let area = candidate
1312            .width
1313            .max(proto.width)
1314            .saturating_mul(candidate.height.max(proto.height));
1315        let pixel_delta_limit = (area / 10).clamp(4, 16) + usize::from(strong_anchor);
1316        let black_delta = self.symbol_pixel_counts[anchor_index]
1317            .abs_diff(self.symbol_pixel_counts[residual_index]);
1318        if black_delta > pixel_delta_limit {
1319            return SymUnifyAnchorDecision::RejectPixelDelta;
1320        }
1321
1322        let signature_compatible = family_signatures_are_compatible(
1323            self.symbol_signatures[residual_index],
1324            self.symbol_signatures[anchor_index],
1325            self.symbol_pixel_counts[residual_index],
1326            self.symbol_pixel_counts[anchor_index],
1327        );
1328        if !signature_compatible {
1329            let soft_signature_black_delta_limit = 4 + usize::from(strong_anchor);
1330            if !exact_dims || black_delta > soft_signature_black_delta_limit {
1331                return SymUnifyAnchorDecision::RejectSignature;
1332            }
1333        }
1334
1335        let compare_max_err = self
1336            .config
1337            .sym_unify_max_err
1338            .max(4)
1339            .saturating_add(u32::from(strong_anchor));
1340        let Some(result) = comparator.compare_for_symbol_unify(
1341            candidate,
1342            proto,
1343            compare_max_err,
1344            self.config.sym_unify_max_dx.max(0),
1345            self.config.sym_unify_max_dy.max(0),
1346        ) else {
1347            return SymUnifyAnchorDecision::RejectCompare;
1348        };
1349
1350        let outside_limit =
1351            self.config.sym_unify_max_border_outside_ink.min(1) + u32::from(strong_anchor);
1352        if result.outside_ink_err > outside_limit {
1353            return SymUnifyAnchorDecision::RejectOutsideInk;
1354        }
1355
1356        let score = Self::symbol_unify_assignment_score(&result);
1357        let score_limit = self.config.sym_unify_class_accept_limit + u32::from(strong_anchor);
1358        if score > score_limit {
1359            return SymUnifyAnchorDecision::RejectScore {
1360                score,
1361                limit: score_limit,
1362                dx: result.dx,
1363                dy: result.dy,
1364            };
1365        }
1366
1367        SymUnifyAnchorDecision::Accept {
1368            score,
1369            dx: result.dx,
1370            dy: result.dy,
1371        }
1372    }
1373
1374    #[inline(always)]
1375    fn evaluate_symbol_match(
1376        &mut self,
1377        candidate: &BitImage,
1378        candidate_sig: SymbolSignature,
1379        candidate_pixels: usize,
1380        symbol_index: usize,
1381        comparator: &mut Comparator,
1382        max_err: u32,
1383    ) -> Option<(u32, i32, i32, bool)> {
1384        let proto = &self.global_symbols[symbol_index];
1385        let dim_limit = if self.config.text_refine { 2 } else { 0 };
1386        if (candidate.width as i32 - proto.width as i32).unsigned_abs() > dim_limit
1387            || (candidate.height as i32 - proto.height as i32).unsigned_abs() > dim_limit
1388        {
1389            return None;
1390        }
1391        if self.symbol_pixel_counts[symbol_index].abs_diff(candidate_pixels)
1392            > max_err as usize + if self.config.text_refine { 8 } else { 6 }
1393        {
1394            return None;
1395        }
1396        if !self.signatures_are_compatible(candidate_sig, symbol_index, self.config.text_refine) {
1397            self.metrics.symbol_stats.signature_rejects += 1;
1398            return None;
1399        }
1400
1401        self.metrics.symbol_stats.comparator_calls += 1;
1402        let (err, dx, dy) = if self.config.text_refine {
1403            comparator
1404                .compare_for_refine_family(candidate, proto, max_err, 1, 1)
1405                .map(|r| (r.total_err, r.dx, r.dy))?
1406        } else {
1407            comparator
1408                .compare_for_refine_family(candidate, proto, max_err, 1, 0)
1409                .map(|r| (r.total_err, r.dx, r.dy))?
1410        };
1411        self.metrics.symbol_stats.comparator_hits += 1;
1412
1413        let exact_dims = candidate.width == proto.width && candidate.height == proto.height;
1414        let (accept, needs_refinement) = self.should_accept_match(err, dx, dy, exact_dims, max_err);
1415        if !accept {
1416            return None;
1417        }
1418
1419        if needs_refinement {
1420            self.metrics.symbol_stats.refined_hits += 1;
1421        } else if err == 0 && dx == 0 && dy == 0 && exact_dims {
1422            self.metrics.symbol_stats.exact_hits += 1;
1423        }
1424
1425        Some((err, dx, dy, needs_refinement))
1426    }
1427
1428    #[inline(always)]
1429    fn evaluate_symbol_unify_anchor_match(
1430        &mut self,
1431        candidate: &BitImage,
1432        candidate_sig: SymbolSignature,
1433        candidate_pixels: usize,
1434        symbol_index: usize,
1435        comparator: &mut Comparator,
1436    ) -> SymUnifyAnchorDecision {
1437        let proto = &self.global_symbols[symbol_index];
1438        if candidate.width.abs_diff(proto.width) > 1 || candidate.height.abs_diff(proto.height) > 1
1439        {
1440            return SymUnifyAnchorDecision::RejectDim;
1441        }
1442
1443        let area = candidate
1444            .width
1445            .max(proto.width)
1446            .saturating_mul(candidate.height.max(proto.height));
1447        let strong_anchor = self.symbol_usage[symbol_index] >= SYM_UNIFY_STRONG_ANCHOR_MIN_USAGE
1448            || self.symbol_page_count[symbol_index] >= SYM_UNIFY_STRONG_ANCHOR_MIN_PAGE_SPAN;
1449        let pixel_delta_limit = (area / 10).clamp(4, 16) + usize::from(strong_anchor);
1450        if self.symbol_pixel_counts[symbol_index].abs_diff(candidate_pixels) > pixel_delta_limit {
1451            return SymUnifyAnchorDecision::RejectPixelDelta;
1452        }
1453
1454        if !family_signatures_are_compatible(
1455            candidate_sig,
1456            self.symbol_signatures[symbol_index],
1457            candidate_pixels,
1458            self.symbol_pixel_counts[symbol_index],
1459        ) {
1460            self.metrics.symbol_stats.signature_rejects += 1;
1461            return SymUnifyAnchorDecision::RejectSignature;
1462        }
1463
1464        let overlap_limit = self
1465            .config
1466            .sym_unify_max_err
1467            .max(4)
1468            .saturating_add(2)
1469            .saturating_add(u32::from(strong_anchor))
1470            .min(15);
1471        let Some(overlap) = comparator.compare_overlap_only(candidate, proto, overlap_limit) else {
1472            return SymUnifyAnchorDecision::RejectOverlap;
1473        };
1474        if overlap.dx.abs() > self.config.sym_unify_max_dx.max(0)
1475            || overlap.dy.abs() > self.config.sym_unify_max_dy.max(0)
1476            || overlap.overlap_err > overlap_limit
1477            || overlap.black_delta > pixel_delta_limit as u32
1478        {
1479            return SymUnifyAnchorDecision::RejectOverlap;
1480        }
1481
1482        self.metrics.symbol_stats.comparator_calls += 1;
1483        let compare_max_err = self
1484            .config
1485            .sym_unify_max_err
1486            .max(4)
1487            .saturating_add(u32::from(strong_anchor));
1488        let Some(result) = comparator.compare_for_symbol_unify(
1489            candidate,
1490            proto,
1491            compare_max_err,
1492            self.config.sym_unify_max_dx.max(0),
1493            self.config.sym_unify_max_dy.max(0),
1494        ) else {
1495            return SymUnifyAnchorDecision::RejectCompare;
1496        };
1497        self.metrics.symbol_stats.comparator_hits += 1;
1498
1499        let score = Self::symbol_unify_assignment_score(&result);
1500        let outside_limit =
1501            self.config.sym_unify_max_border_outside_ink.min(1) + u32::from(strong_anchor);
1502        let score_limit = self.config.sym_unify_class_accept_limit + u32::from(strong_anchor);
1503        if result.outside_ink_err > outside_limit {
1504            return SymUnifyAnchorDecision::RejectOutsideInk;
1505        }
1506        if score > score_limit {
1507            return SymUnifyAnchorDecision::RejectScore {
1508                score,
1509                limit: score_limit,
1510                dx: result.dx,
1511                dy: result.dy,
1512            };
1513        }
1514
1515        SymUnifyAnchorDecision::Accept {
1516            score,
1517            dx: result.dx,
1518            dy: result.dy,
1519        }
1520    }
1521
1522    fn estimate_local_symbol_gain(&self, page: &PageData, symbol_index: usize) -> i64 {
1523        let uses = page
1524            .symbol_instances
1525            .iter()
1526            .filter(|instance| instance.symbol_index == symbol_index)
1527            .count() as i64;
1528        let symbol = &self.global_symbols[symbol_index];
1529        let area = (symbol.width * symbol.height) as i64;
1530        let dict_cost = 24 + (area / 8);
1531        let saved_per_use = (area / 10).max(2);
1532        (uses * saved_per_use) - dict_cost
1533    }
1534
1535    fn estimate_global_symbol_gain(&self, symbol_index: usize) -> i64 {
1536        let uses = self.symbol_usage[symbol_index] as i64;
1537        let page_span = self.symbol_page_count[symbol_index] as i64;
1538        let symbol = &self.global_symbols[symbol_index];
1539        let area = (symbol.width * symbol.height) as i64;
1540        let dict_cost = 24 + (area / 8);
1541        let id_savings = ((uses - page_span).max(0)) * 2;
1542        let reuse_value = (uses * (area / 12).max(2)) + (page_span * 3);
1543        reuse_value + id_savings - dict_cost
1544    }
1545
1546    fn should_keep_text_local_symbol(&self, page: &PageData, symbol_index: usize) -> bool {
1547        let _ = (page, symbol_index);
1548        false
1549    }
1550
1551    fn choose_cluster_prototype(&self, members: &[usize]) -> usize {
1552        if members.len() <= 1 || !self.config.text_refine {
1553            return *members
1554                .iter()
1555                .max_by(|&&lhs, &&rhs| {
1556                    self.symbol_usage[lhs]
1557                        .cmp(&self.symbol_usage[rhs])
1558                        .then_with(|| {
1559                            self.symbol_pixel_counts[lhs].cmp(&self.symbol_pixel_counts[rhs])
1560                        })
1561                        .then_with(|| rhs.cmp(&lhs))
1562                })
1563                .unwrap();
1564        }
1565
1566        let mut comparator = Comparator::default();
1567        let mut best_idx = members[0];
1568        let mut best_cost = u64::MAX;
1569
1570        for &candidate in members {
1571            let candidate_symbol = &self.global_symbols[candidate];
1572            let mut total_cost = 0u64;
1573            for &other in members {
1574                if candidate == other {
1575                    continue;
1576                }
1577                let other_symbol = &self.global_symbols[other];
1578                let area = candidate_symbol.width.max(other_symbol.width)
1579                    * candidate_symbol.height.max(other_symbol.height);
1580                let max_err = ((self.symbol_pixel_counts[candidate]
1581                    .max(self.symbol_pixel_counts[other]) as f32
1582                    * 0.10) as u32)
1583                    .max((area / self.config.match_tolerance.max(1) as usize) as u32)
1584                    .clamp(3, 20);
1585
1586                match comparator.compare_for_refine_family(
1587                    other_symbol,
1588                    candidate_symbol,
1589                    max_err,
1590                    2,
1591                    1,
1592                ) {
1593                    Some(result) => {
1594                        let err = result.total_err;
1595                        let dx = result.dx;
1596                        let dy = result.dy;
1597                        let refinement_penalty = err as u64 + ((dx.abs() + dy.abs()) as u64 * 2);
1598                        total_cost += refinement_penalty * self.symbol_usage[other] as u64;
1599                    }
1600                    None => total_cost += 1_000_000,
1601                }
1602            }
1603
1604            if total_cost < best_cost
1605                || (total_cost == best_cost
1606                    && (
1607                        self.symbol_usage[candidate],
1608                        self.symbol_pixel_counts[candidate],
1609                    ) > (
1610                        self.symbol_usage[best_idx],
1611                        self.symbol_pixel_counts[best_idx],
1612                    ))
1613            {
1614                best_cost = total_cost;
1615                best_idx = candidate;
1616            }
1617        }
1618
1619        best_idx
1620    }
1621
1622    fn note_symbol_page(&mut self, symbol_index: usize, page_num: usize) {
1623        if self.symbol_last_page_seen[symbol_index] != Some(page_num) {
1624            self.symbol_last_page_seen[symbol_index] = Some(page_num);
1625            self.symbol_page_count[symbol_index] += 1;
1626            self.page_symbol_indices[page_num].push(symbol_index);
1627        }
1628    }
1629
1630    fn push_symbol(&mut self, symbol: BitImage, pixel_count: usize, page_num: usize) -> usize {
1631        let idx = self.global_symbols.len();
1632        self.symbol_signatures
1633            .push(Self::compute_symbol_signature(&symbol));
1634        self.symbol_pixel_counts.push(pixel_count);
1635        self.global_symbols.push(symbol);
1636        self.symbol_usage.push(1);
1637        self.symbol_page_count.push(0);
1638        self.symbol_last_page_seen.push(None);
1639        self.note_symbol_page(idx, page_num);
1640        idx
1641    }
1642
1643    fn rebuild_symbol_metadata(&mut self) {
1644        self.symbol_usage = vec![0; self.global_symbols.len()];
1645        self.symbol_page_count = vec![0; self.global_symbols.len()];
1646        self.symbol_last_page_seen = vec![None; self.global_symbols.len()];
1647        self.page_symbol_indices = vec![Vec::new(); self.pages.len()];
1648        self.symbol_pixel_counts = self
1649            .global_symbols
1650            .iter()
1651            .map(BitImage::count_ones)
1652            .collect();
1653        self.symbol_signatures = self
1654            .global_symbols
1655            .iter()
1656            .map(Self::compute_symbol_signature)
1657            .collect();
1658
1659        for page_num in 0..self.pages.len() {
1660            let instance_indices: Vec<usize> = self.pages[page_num]
1661                .symbol_instances
1662                .iter()
1663                .map(|inst| inst.symbol_index)
1664                .collect();
1665            for symbol_index in instance_indices {
1666                self.symbol_usage[symbol_index] += 1;
1667                self.note_symbol_page(symbol_index, page_num);
1668            }
1669        }
1670    }
1671
1672    fn rebuild_hash_map(&mut self) {
1673        self.hash_map.clear();
1674        self.hash_map.reserve(self.global_symbols.len());
1675        for (idx, symbol) in self.global_symbols.iter().enumerate() {
1676            let key = hash_key(symbol);
1677            self.hash_map.entry(key).or_default().push(idx);
1678        }
1679    }
1680
1681    fn build_symbol_unify_classes(&mut self) -> Vec<UnifiedClass> {
1682        let diagnostics_enabled = encoder_diagnostics_enabled();
1683        let context_model =
1684            build_symbol_context_model(&self.pages, &self.global_symbols, &self.symbol_signatures);
1685        let (classes, diagnostics) =
1686            crate::jbig2unify::build_symbol_unify_classes(SymbolUnifyInputs {
1687                config: self.config,
1688                global_symbols: &self.global_symbols,
1689                symbol_usage: &self.symbol_usage,
1690                symbol_page_count: &self.symbol_page_count,
1691                symbol_signatures: &self.symbol_signatures,
1692                symbol_pixel_counts: &self.symbol_pixel_counts,
1693                context_model: Some(&context_model),
1694                collect_diagnostics: diagnostics_enabled,
1695            });
1696        if diagnostics_enabled {
1697            self.state.decision_debug_lines.extend(diagnostics.lines);
1698        }
1699        classes
1700    }
1701
1702    fn compact_symbol_table_after_remap(&mut self) {
1703        let mut used = vec![false; self.global_symbols.len()];
1704        for page in &self.pages {
1705            for instance in &page.symbol_instances {
1706                if instance.symbol_index < used.len() {
1707                    used[instance.symbol_index] = true;
1708                }
1709            }
1710        }
1711
1712        let old_symbols = self.global_symbols.clone();
1713        let mut new_index = vec![usize::MAX; old_symbols.len()];
1714        let mut new_symbols = Vec::new();
1715
1716        for (old_index, symbol) in old_symbols.into_iter().enumerate() {
1717            if used[old_index] {
1718                new_index[old_index] = new_symbols.len();
1719                new_symbols.push(symbol);
1720            }
1721        }
1722
1723        for page in &mut self.pages {
1724            for instance in &mut page.symbol_instances {
1725                instance.symbol_index = new_index[instance.symbol_index];
1726            }
1727        }
1728
1729        self.global_symbols = new_symbols;
1730        self.rebuild_symbol_metadata();
1731        self.rebuild_hash_map();
1732    }
1733
1734    fn alias_local_symbols_to_globals(&mut self) -> Result<()> {
1735        if self.pages.len() <= 1 || self.global_symbols.is_empty() {
1736            return Ok(());
1737        }
1738        let text_refine = self.config.text_refine;
1739        let refine_enabled = self.config.refine;
1740        let global_indices: Vec<usize> = self
1741            .global_symbols
1742            .iter()
1743            .enumerate()
1744            .filter(|(i, _)| self.symbol_page_count[*i] > 1)
1745            .map(|(i, _)| i)
1746            .collect();
1747        if global_indices.is_empty() {
1748            return Ok(());
1749        }
1750
1751        let mut global_bucket_map: FxHashMap<HashKey, Vec<usize>> =
1752            FxHashMap::with_capacity_and_hasher(global_indices.len(), Default::default());
1753        for &symbol_index in &global_indices {
1754            global_bucket_map
1755                .entry(hash_key(&self.global_symbols[symbol_index]))
1756                .or_default()
1757                .push(symbol_index);
1758        }
1759
1760        let mut comparator = Comparator::default();
1761        let mut changed = false;
1762        let mut aliased_symbols = 0usize;
1763        let mut aliased_instances = 0usize;
1764        let mut alias_samples = Vec::new();
1765        for page in &mut self.pages {
1766            let mut page_local_symbols: FxHashSet<usize> =
1767                FxHashSet::with_capacity_and_hasher(256, Default::default());
1768            for instance in &page.symbol_instances {
1769                if self.symbol_page_count[instance.symbol_index] <= 1 {
1770                    page_local_symbols.insert(instance.symbol_index);
1771                }
1772            }
1773
1774            for local_symbol_index in page_local_symbols {
1775                let local_symbol = &self.global_symbols[local_symbol_index];
1776                let local_sig = self.symbol_signatures[local_symbol_index];
1777                let pixel_count = self.symbol_pixel_counts[local_symbol_index];
1778                let area = (local_symbol.width * local_symbol.height) as u32;
1779                let max_err = if self.config.text_refine {
1780                    (area / self.config.match_tolerance.max(1)).max(3)
1781                } else {
1782                    ((area as f32 * 0.05) as u32).max(2)
1783                };
1784                let dim_range: u64 = if self.config.text_refine || self.config.refine {
1785                    2
1786                } else {
1787                    0
1788                };
1789
1790                let mut best_match: Option<(usize, u32, i32, i32, bool)> = None;
1791                let h = local_symbol.height as u64;
1792                let w = local_symbol.width as u64;
1793                'bucket_search: for dh_off in 0..=(dim_range * 2) {
1794                    let dh = h.wrapping_add(dh_off).wrapping_sub(dim_range);
1795                    if dh >= 10_000 {
1796                        continue;
1797                    }
1798                    for dw_off in 0..=(dim_range * 2) {
1799                        let dw = w.wrapping_add(dw_off).wrapping_sub(dim_range);
1800                        if dw >= 10_000 {
1801                            continue;
1802                        }
1803                        let bucket_key = HashKey(dh * 10_000 + dw);
1804                        let Some(bucket) = global_bucket_map.get(&bucket_key) else {
1805                            continue;
1806                        };
1807                        for &global_symbol_index in bucket {
1808                            if self.symbol_pixel_counts[global_symbol_index].abs_diff(pixel_count)
1809                                > max_err as usize + if self.config.text_refine { 8 } else { 6 }
1810                            {
1811                                continue;
1812                            }
1813                            let stored = self.symbol_signatures[global_symbol_index];
1814                            let black_tol = if text_refine { 12 } else { 8 };
1815                            let pos_tol = 2;
1816                            let centroid_tol = if text_refine { 96 } else { 64 };
1817                            if local_sig.black.abs_diff(stored.black) > black_tol
1818                                || local_sig.left_col.abs_diff(stored.left_col) > pos_tol
1819                                || local_sig.right_col.abs_diff(stored.right_col) > pos_tol
1820                                || local_sig.top_row.abs_diff(stored.top_row) > pos_tol
1821                                || local_sig.bottom_row.abs_diff(stored.bottom_row) > pos_tol
1822                                || local_sig.cx_times_256.abs_diff(stored.cx_times_256)
1823                                    > centroid_tol
1824                                || local_sig.cy_times_256.abs_diff(stored.cy_times_256)
1825                                    > centroid_tol
1826                            {
1827                                continue;
1828                            }
1829                            let max_dx = if text_refine { 1 } else { 1 };
1830                            let max_dy = if text_refine { 1 } else { 0 };
1831                            let Some(result) = comparator.compare_for_refine_family(
1832                                local_symbol,
1833                                &self.global_symbols[global_symbol_index],
1834                                max_err,
1835                                max_dx,
1836                                max_dy,
1837                            ) else {
1838                                continue;
1839                            };
1840                            let err = result.total_err;
1841                            let dx = result.dx;
1842                            let dy = result.dy;
1843                            let exact_dims = local_symbol.width
1844                                == self.global_symbols[global_symbol_index].width
1845                                && local_symbol.height
1846                                    == self.global_symbols[global_symbol_index].height;
1847                            let (accept, needs_refinement) =
1848                                if err == 0 && dx == 0 && dy == 0 && exact_dims {
1849                                    (true, false)
1850                                } else if text_refine {
1851                                    (
1852                                        dx.abs() <= 1
1853                                            && dy.abs() <= 1
1854                                            && err <= (max_err / 2).max(2),
1855                                        true,
1856                                    )
1857                                } else if dx.abs() <= 1 && dy == 0 {
1858                                    (true, false)
1859                                } else {
1860                                    (false, false)
1861                                };
1862                            if !accept {
1863                                continue;
1864                            }
1865                            best_match = Some((
1866                                global_symbol_index,
1867                                err,
1868                                dx,
1869                                dy,
1870                                needs_refinement && (text_refine || refine_enabled),
1871                            ));
1872                            if err == 0 && dx == 0 && dy == 0 {
1873                                break 'bucket_search;
1874                            }
1875                        }
1876                    }
1877                }
1878
1879                let Some((global_symbol_index, _err, dx, dy, needs_refinement)) = best_match else {
1880                    continue;
1881                };
1882                aliased_symbols += 1;
1883                for instance in &mut page.symbol_instances {
1884                    if instance.symbol_index == local_symbol_index {
1885                        instance.symbol_index = global_symbol_index;
1886                        instance.needs_refinement = needs_refinement;
1887                        instance.refinement_dx = if needs_refinement { dx } else { 0 };
1888                        instance.refinement_dy = if needs_refinement { dy } else { 0 };
1889                        changed = true;
1890                        aliased_instances += 1;
1891                    }
1892                }
1893                if alias_samples.len() < 64 {
1894                    alias_samples.push(format!(
1895                        "alias local->global: local={} global={} dx={} dy={} refine={}",
1896                        local_symbol_index, global_symbol_index, dx, dy, needs_refinement
1897                    ));
1898                }
1899            }
1900        }
1901
1902        if encoder_diagnostics_enabled() {
1903            if changed {
1904                self.state.decision_debug_lines.push(format!(
1905                    "alias pass: {} local symbols / {} instances remapped onto globals",
1906                    aliased_symbols, aliased_instances
1907                ));
1908                self.state.decision_debug_lines.extend(alias_samples);
1909            } else {
1910                self.state
1911                    .decision_debug_lines
1912                    .push("alias pass: no local symbols remapped onto globals".to_string());
1913            }
1914        }
1915        if changed {
1916            self.compact_symbol_table_after_remap();
1917        }
1918
1919        Ok(())
1920    }
1921
1922    fn apply_symbol_unify(&mut self) -> Result<()> {
1923        if !self.config.uses_symbol_unify() || self.state.lossy_symbol_mode_applied {
1924            return Ok(());
1925        }
1926
1927        let diagnostics_enabled = encoder_diagnostics_enabled();
1928        let before_exported = self.global_symbols.len();
1929        let before_estimated_dict_bytes =
1930            symbol_dictionary_entries_bytes(self.global_symbols.iter());
1931        let classes = self.build_symbol_unify_classes();
1932        if classes.is_empty() {
1933            if diagnostics_enabled {
1934                self.state
1935                    .decision_debug_lines
1936                    .push("sym_unify: no eligible classes".to_string());
1937            }
1938            self.state.lossy_symbol_mode_applied = true;
1939            return Ok(());
1940        }
1941
1942        let mut remap: Vec<usize> = (0..self.global_symbols.len()).collect();
1943        let mut refinement_remap: Vec<Option<RefinementPlan>> =
1944            vec![None; self.global_symbols.len()];
1945        let mut unified_members = 0usize;
1946        let mut border_unified_members = 0usize;
1947        let mut refined_members = 0usize;
1948        let mut refinement_subclusters = 0usize;
1949        let mut retained_border_members = 0usize;
1950        let mut retained_outlier_members = 0usize;
1951
1952        if diagnostics_enabled {
1953            self.state.decision_debug_lines.push(format!(
1954                "sym_unify: {} classes eligible across {} symbols",
1955                classes.len(),
1956                self.global_symbols.len()
1957            ));
1958
1959            for class in classes.iter().take(64) {
1960                self.state.decision_debug_lines.push(format!(
1961                    "sym_unify class: representative={} class_size={} core_size={} unified={} border_unified={} refined_subclusters={} refined_members={} retained_border={} retained_outliers={} total_usage={} page_span={} representative_score={} estimated_gain={} subclusters={}",
1962                    class.representative_index,
1963                    class.class_size,
1964                    class.dense_core_size,
1965                    class.core_members.len(),
1966                    class.border_members.len(),
1967                    class.refinement_subclusters.len(),
1968                    class.refinement_subclusters
1969                        .iter()
1970                        .map(|subcluster| subcluster.refined_members.len())
1971                        .sum::<usize>(),
1972                    class.retained_border_members,
1973                    class.retained_outlier_members,
1974                    class.total_usage,
1975                    class.page_span,
1976                    class.representative_score,
1977                    class.estimated_gain,
1978                    class.candidate_subclusters
1979                ));
1980            }
1981        }
1982
1983        for class in &classes {
1984            retained_border_members += class.retained_border_members;
1985            retained_outlier_members += class.retained_outlier_members;
1986            for member in &class.core_members {
1987                remap[member.member_index] = class.representative_index;
1988                unified_members += 1;
1989            }
1990            for member in &class.border_members {
1991                remap[member.member_index] = class.representative_index;
1992                border_unified_members += 1;
1993            }
1994            refinement_subclusters += class.refinement_subclusters.len();
1995            for subcluster in &class.refinement_subclusters {
1996                for member in &subcluster.refined_members {
1997                    refinement_remap[member.member_index] = Some(RefinementPlan {
1998                        prototype_input_index: subcluster.prototype_index,
1999                        refinement_dx: member.dx,
2000                        refinement_dy: member.dy,
2001                    });
2002                    refined_members += 1;
2003                }
2004            }
2005        }
2006
2007        for page in &mut self.pages {
2008            for instance in &mut page.symbol_instances {
2009                let original_index = instance.symbol_index;
2010                if let Some(refinement) = refinement_remap[original_index] {
2011                    instance.symbol_index = refinement.prototype_input_index;
2012                    instance.needs_refinement = true;
2013                    instance.refinement_dx = refinement.refinement_dx;
2014                    instance.refinement_dy = refinement.refinement_dy;
2015                } else {
2016                    instance.symbol_index = remap[original_index];
2017                    instance.needs_refinement = false;
2018                    instance.refinement_dx = 0;
2019                    instance.refinement_dy = 0;
2020                }
2021            }
2022        }
2023
2024        self.compact_symbol_table_after_remap();
2025        if diagnostics_enabled {
2026            let after_estimated_dict_bytes =
2027                symbol_dictionary_entries_bytes(self.global_symbols.iter());
2028            self.state.decision_debug_lines.push(format!(
2029                "sym_unify export summary: before={} after={} removed={} dict_bytes_before={} dict_bytes_after={} dict_bytes_saved={} unified_members={} border_unified_members={} refined_members={} refinement_subclusters={} retained_border_members={} retained_outlier_members={}",
2030                before_exported,
2031                self.global_symbols.len(),
2032                before_exported.saturating_sub(self.global_symbols.len()),
2033                before_estimated_dict_bytes,
2034                after_estimated_dict_bytes,
2035                before_estimated_dict_bytes.saturating_sub(after_estimated_dict_bytes),
2036                unified_members,
2037                border_unified_members,
2038                refined_members,
2039                refinement_subclusters,
2040                retained_border_members,
2041                retained_outlier_members
2042            ));
2043        }
2044        self.state.lossy_symbol_mode_applied = true;
2045        Ok(())
2046    }
2047
2048    pub fn add_page(&mut self, image: &Array2<u8>) -> Result<()> {
2049        let bitimage = crate::jbig2sym::array_to_bitimage(image);
2050        self.add_page_bitimage(bitimage)
2051    }
2052
2053    pub fn add_page_bitimage(&mut self, bitimage: BitImage) -> Result<()> {
2054        let page_num = self.pages.len();
2055        self.page_symbol_indices.push(Vec::new());
2056        let mut symbol_instances = Vec::new();
2057        let mut comparator = Comparator::default();
2058        let debug_matching =
2059            page_num == 0 && std::env::var("JBIG2_DEBUG").map_or(false, |v| v == "1");
2060        let no_reuse = std::env::var("JBIG2_NO_REUSE").map_or(false, |v| v == "1");
2061
2062        let mut debug_lines: Vec<String> = Vec::new();
2063        if debug_matching {
2064            debug_lines.push("=== PAGE 0 MATCHING LOG ===".to_string());
2065            debug_lines.push(format!("Image: {}x{}", bitimage.width, bitimage.height));
2066        }
2067        let mut cc_index = 0usize;
2068        let mut sym_unify_anchor_map = (self.config.lossy_symbol_mode
2069            == LossySymbolMode::SymbolUnify
2070            && !self.global_symbols.is_empty())
2071        .then(|| self.build_sym_unify_anchor_map(page_num));
2072        let sym_unify_initial_anchor_count = sym_unify_anchor_map
2073            .as_ref()
2074            .map(|anchors| anchors.values().map(Vec::len).sum::<usize>())
2075            .unwrap_or(0);
2076        let sym_unify_initial_anchor_bytes = sym_unify_anchor_map
2077            .as_ref()
2078            .map(|anchors| anchor_map_dictionary_bytes(&self.global_symbols, anchors))
2079            .unwrap_or(0);
2080        let mut sym_unify_recent_hits = 0usize;
2081        let mut sym_unify_anchor_hits = 0usize;
2082        let mut sym_unify_bucket_hits = 0usize;
2083        let mut sym_unify_new_symbols = 0usize;
2084        let mut sym_unify_anchor_score_rejects = 0usize;
2085        let mut sym_unify_anchor_outside_rejects = 0usize;
2086        let mut sym_unify_anchor_compare_rejects = 0usize;
2087        let mut sym_unify_anchor_overlap_rejects = 0usize;
2088
2089        // Extract symbols if symbol mode is enabled
2090        if self.config.symbol_mode && self.state.segment {
2091            #[cfg(feature = "cc-analysis")]
2092            {
2093                let dpi = 300; // Default DPI
2094                let losslevel =
2095                    if self.config.symbol_mode || self.config.uses_lossy_symbol_dictionary() {
2096                        0
2097                    } else if self.config.is_lossless {
2098                        0
2099                    } else {
2100                        1
2101                    };
2102                let cc_start = Instant::now();
2103                let cc_image = analyze_page(&bitimage, dpi, losslevel);
2104                let extracted = cc_image.extract_shape_refs();
2105                self.metrics.symbol_mode.cc_extraction += cc_start.elapsed();
2106
2107                // Check if symbol extraction makes sense for this image
2108                // If we only get one symbol that covers the entire image,
2109                // it's better to use generic region encoding
2110                let should_use_symbols = if extracted.len() == 1 {
2111                    let bbox = extracted[0].bbox;
2112                    !(bbox.xmin == 0
2113                        && bbox.ymin == 0
2114                        && bbox.width() as usize >= bitimage.width.saturating_sub(2)
2115                        && bbox.height() as usize >= bitimage.height.saturating_sub(2))
2116                } else {
2117                    !extracted.is_empty()
2118                };
2119
2120                if should_use_symbols {
2121                    let matching_start = Instant::now();
2122                    let mut recent_cache = RecentSymbolCache::new(RECENT_SYMBOL_CACHE_CAP);
2123                    let mut recent_candidates = [0usize; RECENT_SYMBOL_CACHE_CAP];
2124                    let mut last_y = 0u32;
2125
2126                    for shape in extracted {
2127                        if Self::should_skip_symbol_candidate(
2128                            shape.bbox.width().max(0) as usize,
2129                            shape.bbox.height().max(0) as usize,
2130                            shape.black_pixels,
2131                        ) || shape.run_count == 0
2132                        {
2133                            continue;
2134                        }
2135                        let Some(symbol) = cc_image.get_bitmap_for_cc(shape.ccid) else {
2136                            continue;
2137                        };
2138                        let (trim_offset, trimmed) = symbol.trim();
2139                        let pixel_count = trimmed.count_ones();
2140                        if Self::should_skip_symbol_candidate(
2141                            trimmed.width,
2142                            trimmed.height,
2143                            pixel_count,
2144                        ) {
2145                            continue;
2146                        }
2147
2148                        // The CC bbox is the bounding box from CC analysis.
2149                        // trim() may remove whitespace rows/cols from the symbol.
2150                        // Adjust position by trim offset so the dictionary bitmap
2151                        // renders at the correct location on the page.
2152                        let rect = Rect {
2153                            x: shape.bbox.xmin as u32 + trim_offset.x,
2154                            y: shape.bbox.ymin as u32 + trim_offset.y,
2155                            width: trimmed.width as u32,
2156                            height: trimmed.height as u32,
2157                        };
2158                        if rect.y > last_y.saturating_add(24) {
2159                            recent_cache.clear();
2160                        }
2161                        last_y = rect.y;
2162
2163                        let trimmed_sig = Self::compute_symbol_signature(&trimmed);
2164                        let mut matched = false;
2165                        let mut instance_bitmap = Some(symbol);
2166
2167                        // Error tolerance for matching.
2168                        let area = (trimmed.width * trimmed.height) as u32;
2169                        let max_err = if self.config.text_refine {
2170                            (area / self.config.match_tolerance).max(3)
2171                        } else {
2172                            ((area as f32 * 0.05) as u32).max(2)
2173                        };
2174
2175                        if !matched && !no_reuse {
2176                            let recent_len = recent_cache.copy_into(&mut recent_candidates);
2177                            'recent_search: for &idx in &recent_candidates[..recent_len] {
2178                                if let Some((err, dx, dy, needs_refinement)) = self
2179                                    .evaluate_symbol_match(
2180                                        &trimmed,
2181                                        trimmed_sig,
2182                                        pixel_count,
2183                                        idx,
2184                                        &mut comparator,
2185                                        max_err,
2186                                    )
2187                                {
2188                                    if debug_matching {
2189                                        let mode = if needs_refinement {
2190                                            "REFINE"
2191                                        } else if err == 0 && dx == 0 && dy == 0 {
2192                                            "EXACT "
2193                                        } else {
2194                                            "LOSSY "
2195                                        };
2196                                        let proto = &self.global_symbols[idx];
2197                                        debug_lines.push(format!(
2198                                            "CC#{:04} {} pos=({},{}) {}x{} → proto#{} {}x{} err={} dx={} dy={} [recent]",
2199                                            cc_index,
2200                                            mode,
2201                                            rect.x,
2202                                            rect.y,
2203                                            rect.width,
2204                                            rect.height,
2205                                            idx,
2206                                            proto.width,
2207                                            proto.height,
2208                                            err,
2209                                            dx,
2210                                            dy
2211                                        ));
2212                                    }
2213
2214                                    self.symbol_usage[idx] += 1;
2215                                    self.note_symbol_page(idx, page_num);
2216                                    symbol_instances.push(SymbolInstance {
2217                                        symbol_index: idx,
2218                                        position: rect,
2219                                        instance_bitmap: instance_bitmap.take().unwrap(),
2220                                        needs_refinement,
2221                                        refinement_dx: if needs_refinement { dx } else { 0 },
2222                                        refinement_dy: if needs_refinement { dy } else { 0 },
2223                                    });
2224                                    recent_cache.touch(idx);
2225                                    if self.config.lossy_symbol_mode == LossySymbolMode::SymbolUnify
2226                                    {
2227                                        sym_unify_recent_hits += 1;
2228                                    }
2229                                    matched = true;
2230                                    break 'recent_search;
2231                                }
2232                            }
2233                        }
2234
2235                        if !matched
2236                            && !no_reuse
2237                            && self.config.lossy_symbol_mode == LossySymbolMode::SymbolUnify
2238                        {
2239                            if let Some(anchor_map) = sym_unify_anchor_map.as_mut() {
2240                                let anchor_key =
2241                                    family_bucket_key_for_symbol(&trimmed, &trimmed_sig);
2242                                let mut visited = FxHashSet::default();
2243                                let mut exact_examined = 0usize;
2244                                if let Some(bucket) = anchor_map.get(&anchor_key) {
2245                                    'anchor_search_exact: for &idx in bucket {
2246                                        if exact_examined >= SYM_UNIFY_EXACT_ANCHOR_BUDGET {
2247                                            break 'anchor_search_exact;
2248                                        }
2249                                        exact_examined += 1;
2250                                        if !visited.insert(idx) {
2251                                            continue;
2252                                        }
2253                                        let decision = self.evaluate_symbol_unify_anchor_match(
2254                                            &trimmed,
2255                                            trimmed_sig,
2256                                            pixel_count,
2257                                            idx,
2258                                            &mut comparator,
2259                                        );
2260                                        let (score, dx, dy) = match decision {
2261                                            SymUnifyAnchorDecision::Accept { score, dx, dy } => {
2262                                                (score, dx, dy)
2263                                            }
2264                                            SymUnifyAnchorDecision::RejectScore { .. } => {
2265                                                sym_unify_anchor_score_rejects += 1;
2266                                                continue;
2267                                            }
2268                                            SymUnifyAnchorDecision::RejectOutsideInk => {
2269                                                sym_unify_anchor_outside_rejects += 1;
2270                                                continue;
2271                                            }
2272                                            SymUnifyAnchorDecision::RejectCompare => {
2273                                                sym_unify_anchor_compare_rejects += 1;
2274                                                continue;
2275                                            }
2276                                            SymUnifyAnchorDecision::RejectOverlap => {
2277                                                sym_unify_anchor_overlap_rejects += 1;
2278                                                continue;
2279                                            }
2280                                            _ => continue,
2281                                        };
2282
2283                                        if debug_matching {
2284                                            let proto = &self.global_symbols[idx];
2285                                            debug_lines.push(format!(
2286                                                "CC#{:04} UNIFY  pos=({},{}) {}x{} → proto#{} {}x{} score={} dx={} dy={} [anchor]",
2287                                                cc_index,
2288                                                rect.x,
2289                                                rect.y,
2290                                                rect.width,
2291                                                rect.height,
2292                                                idx,
2293                                                proto.width,
2294                                                proto.height,
2295                                                score,
2296                                                dx,
2297                                                dy
2298                                            ));
2299                                        }
2300
2301                                        self.symbol_usage[idx] += 1;
2302                                        self.note_symbol_page(idx, page_num);
2303                                        self.maybe_add_sym_unify_anchor(anchor_map, idx, page_num);
2304                                        symbol_instances.push(SymbolInstance {
2305                                            symbol_index: idx,
2306                                            position: rect,
2307                                            instance_bitmap: instance_bitmap.take().unwrap(),
2308                                            needs_refinement: false,
2309                                            refinement_dx: 0,
2310                                            refinement_dy: 0,
2311                                        });
2312                                        recent_cache.touch(idx);
2313                                        sym_unify_anchor_hits += 1;
2314                                        matched = true;
2315                                        break;
2316                                    }
2317                                }
2318
2319                                if !matched {
2320                                    let mut neighbor_examined = 0usize;
2321                                    'anchor_search_neighbors: for neighbor in
2322                                        family_bucket_neighbors(anchor_key)
2323                                    {
2324                                        if neighbor == anchor_key {
2325                                            continue;
2326                                        }
2327                                        let Some(bucket) = anchor_map.get(&neighbor) else {
2328                                            continue;
2329                                        };
2330                                        for &idx in bucket {
2331                                            if neighbor_examined >= SYM_UNIFY_NEIGHBOR_ANCHOR_BUDGET
2332                                            {
2333                                                break 'anchor_search_neighbors;
2334                                            }
2335                                            neighbor_examined += 1;
2336                                            if !visited.insert(idx) {
2337                                                continue;
2338                                            }
2339                                            let decision = self.evaluate_symbol_unify_anchor_match(
2340                                                &trimmed,
2341                                                trimmed_sig,
2342                                                pixel_count,
2343                                                idx,
2344                                                &mut comparator,
2345                                            );
2346                                            let (score, dx, dy) = match decision {
2347                                                SymUnifyAnchorDecision::Accept {
2348                                                    score,
2349                                                    dx,
2350                                                    dy,
2351                                                } => (score, dx, dy),
2352                                                SymUnifyAnchorDecision::RejectScore { .. } => {
2353                                                    sym_unify_anchor_score_rejects += 1;
2354                                                    continue;
2355                                                }
2356                                                SymUnifyAnchorDecision::RejectOutsideInk => {
2357                                                    sym_unify_anchor_outside_rejects += 1;
2358                                                    continue;
2359                                                }
2360                                                SymUnifyAnchorDecision::RejectCompare => {
2361                                                    sym_unify_anchor_compare_rejects += 1;
2362                                                    continue;
2363                                                }
2364                                                SymUnifyAnchorDecision::RejectOverlap => {
2365                                                    sym_unify_anchor_overlap_rejects += 1;
2366                                                    continue;
2367                                                }
2368                                                _ => continue,
2369                                            };
2370
2371                                            if debug_matching {
2372                                                let proto = &self.global_symbols[idx];
2373                                                debug_lines.push(format!(
2374                                                    "CC#{:04} UNIFY  pos=({},{}) {}x{} → proto#{} {}x{} score={} dx={} dy={} [anchor]",
2375                                                    cc_index,
2376                                                    rect.x,
2377                                                    rect.y,
2378                                                    rect.width,
2379                                                    rect.height,
2380                                                    idx,
2381                                                    proto.width,
2382                                                    proto.height,
2383                                                    score,
2384                                                    dx,
2385                                                    dy
2386                                                ));
2387                                            }
2388
2389                                            self.symbol_usage[idx] += 1;
2390                                            self.note_symbol_page(idx, page_num);
2391                                            self.maybe_add_sym_unify_anchor(
2392                                                anchor_map, idx, page_num,
2393                                            );
2394                                            symbol_instances.push(SymbolInstance {
2395                                                symbol_index: idx,
2396                                                position: rect,
2397                                                instance_bitmap: instance_bitmap.take().unwrap(),
2398                                                needs_refinement: false,
2399                                                refinement_dx: 0,
2400                                                refinement_dy: 0,
2401                                            });
2402                                            recent_cache.touch(idx);
2403                                            sym_unify_anchor_hits += 1;
2404                                            matched = true;
2405                                            break 'anchor_search_neighbors;
2406                                        }
2407                                    }
2408                                }
2409                            }
2410                        }
2411
2412                        if !matched && !no_reuse {
2413                            let h = trimmed.height as u64;
2414                            let w = trimmed.width as u64;
2415                            let dim_range: u64 = if self.config.text_refine { 2 } else { 0 };
2416
2417                            'bucket_search: for dh_off in 0..=(dim_range * 2) {
2418                                let dh = h.wrapping_add(dh_off).wrapping_sub(dim_range);
2419                                if dh >= 10_000 {
2420                                    continue;
2421                                }
2422                                for dw_off in 0..=(dim_range * 2) {
2423                                    let dw = w.wrapping_add(dw_off).wrapping_sub(dim_range);
2424                                    if dw >= 10_000 {
2425                                        continue;
2426                                    }
2427
2428                                    let nk = HashKey(dh * 10_000 + dw);
2429                                    if let Some(bucket) = self.hash_map.get(&nk) {
2430                                        let bucket_len = bucket.len();
2431                                        let bucket_ptr = bucket.as_ptr();
2432                                        for bucket_pos in 0..bucket_len {
2433                                            let idx = unsafe { *bucket_ptr.add(bucket_pos) };
2434                                            let Some((err, dx, dy, needs_refinement)) = self
2435                                                .evaluate_symbol_match(
2436                                                    &trimmed,
2437                                                    trimmed_sig,
2438                                                    pixel_count,
2439                                                    idx,
2440                                                    &mut comparator,
2441                                                    max_err,
2442                                                )
2443                                            else {
2444                                                continue;
2445                                            };
2446
2447                                            if debug_matching {
2448                                                let mode = if needs_refinement {
2449                                                    "REFINE"
2450                                                } else if err == 0 && dx == 0 && dy == 0 {
2451                                                    "EXACT "
2452                                                } else {
2453                                                    "LOSSY "
2454                                                };
2455                                                let proto = &self.global_symbols[idx];
2456                                                debug_lines.push(format!(
2457                                                    "CC#{:04} {} pos=({},{}) {}x{} → proto#{} {}x{} err={} dx={} dy={}",
2458                                                    cc_index,
2459                                                    mode,
2460                                                    rect.x,
2461                                                    rect.y,
2462                                                    rect.width,
2463                                                    rect.height,
2464                                                    idx,
2465                                                    proto.width,
2466                                                    proto.height,
2467                                                    err,
2468                                                    dx,
2469                                                    dy
2470                                                ));
2471                                            }
2472
2473                                            self.symbol_usage[idx] += 1;
2474                                            self.note_symbol_page(idx, page_num);
2475                                            if let Some(anchor_map) = sym_unify_anchor_map.as_mut()
2476                                            {
2477                                                self.maybe_add_sym_unify_anchor(
2478                                                    anchor_map, idx, page_num,
2479                                                );
2480                                            }
2481                                            symbol_instances.push(SymbolInstance {
2482                                                symbol_index: idx,
2483                                                position: rect,
2484                                                instance_bitmap: instance_bitmap.take().unwrap(),
2485                                                needs_refinement,
2486                                                refinement_dx: if needs_refinement {
2487                                                    dx
2488                                                } else {
2489                                                    0
2490                                                },
2491                                                refinement_dy: if needs_refinement {
2492                                                    dy
2493                                                } else {
2494                                                    0
2495                                                },
2496                                            });
2497                                            recent_cache.touch(idx);
2498                                            if self.config.lossy_symbol_mode
2499                                                == LossySymbolMode::SymbolUnify
2500                                            {
2501                                                sym_unify_bucket_hits += 1;
2502                                            }
2503                                            matched = true;
2504                                            break 'bucket_search;
2505                                        }
2506                                    }
2507                                }
2508                            }
2509                        }
2510
2511                        if !matched {
2512                            let idx = self.push_symbol(trimmed, pixel_count, page_num);
2513                            self.metrics.symbol_stats.symbols_discovered += 1;
2514                            if debug_matching {
2515                                debug_lines.push(format!(
2516                                    "CC#{:04} NEW    pos=({},{}) {}x{} trim_off=({},{}) → new proto#{} {}x{}",
2517                                    cc_index, rect.x, rect.y, rect.width, rect.height,
2518                                    trim_offset.x, trim_offset.y,
2519                                    idx, self.global_symbols[idx].width, self.global_symbols[idx].height
2520                                ));
2521                            }
2522                            let key = hash_key(&self.global_symbols[idx]);
2523                            self.hash_map.entry(key).or_default().push(idx);
2524                            if let Some(anchor_map) = sym_unify_anchor_map.as_mut() {
2525                                self.maybe_add_sym_unify_anchor(anchor_map, idx, page_num);
2526                            }
2527                            symbol_instances.push(SymbolInstance {
2528                                symbol_index: idx,
2529                                position: rect,
2530                                instance_bitmap: instance_bitmap.take().unwrap(),
2531                                needs_refinement: false,
2532                                refinement_dx: 0,
2533                                refinement_dy: 0,
2534                            });
2535                            recent_cache.touch(idx);
2536                            if self.config.lossy_symbol_mode == LossySymbolMode::SymbolUnify {
2537                                sym_unify_new_symbols += 1;
2538                            }
2539                        }
2540                        cc_index += 1;
2541                    }
2542                    self.metrics.symbol_mode.matching_dedup += matching_start.elapsed();
2543                }
2544            }
2545        }
2546
2547        // Write page 0 matching debug log
2548        if debug_matching && !debug_lines.is_empty() {
2549            debug_lines.push(format!(
2550                "\nTotal CCs: {}, Instances: {}",
2551                cc_index,
2552                symbol_instances.len()
2553            ));
2554            let log_path = std::path::Path::new("jbig2_debug_page0.log");
2555            if let Ok(mut f) = std::fs::File::create(log_path) {
2556                use std::io::Write;
2557                for line in &debug_lines {
2558                    let _ = writeln!(f, "{}", line);
2559                }
2560            }
2561        }
2562
2563        if self.config.lossy_symbol_mode == LossySymbolMode::SymbolUnify
2564            && encoder_diagnostics_enabled()
2565        {
2566            let final_anchor_count = sym_unify_anchor_map
2567                .as_ref()
2568                .map(|anchors| anchors.values().map(Vec::len).sum::<usize>())
2569                .unwrap_or(0);
2570            let final_anchor_bytes = sym_unify_anchor_map
2571                .as_ref()
2572                .map(|anchors| anchor_map_dictionary_bytes(&self.global_symbols, anchors))
2573                .unwrap_or(0);
2574            self.state.ingest_debug_lines.push(format!(
2575                "sym_unify ingest page={}: cc={} recent_hits={} anchor_hits={} bucket_hits={} new_symbols={} initial_anchors={} final_anchors={} initial_anchor_bytes={} final_anchor_bytes={} anchor_score_rejects={} anchor_outside_rejects={} anchor_compare_rejects={} anchor_overlap_rejects={}",
2576                page_num + 1,
2577                cc_index,
2578                sym_unify_recent_hits,
2579                sym_unify_anchor_hits,
2580                sym_unify_bucket_hits,
2581                sym_unify_new_symbols,
2582                sym_unify_initial_anchor_count,
2583                final_anchor_count,
2584                sym_unify_initial_anchor_bytes,
2585                final_anchor_bytes,
2586                sym_unify_anchor_score_rejects,
2587                sym_unify_anchor_outside_rejects,
2588                sym_unify_anchor_compare_rejects,
2589                sym_unify_anchor_overlap_rejects,
2590            ));
2591        }
2592
2593        self.pages.push(PageData {
2594            image: bitimage,
2595            symbol_instances,
2596        });
2597        Ok(())
2598    }
2599
2600    pub fn collect_symbols(&mut self, roi: &Array2<u8>) -> Result<()> {
2601        let bitimage = crate::jbig2sym::array_to_bitimage(roi);
2602        let (_, trimmed) = bitimage.trim();
2603        let key = hash_key(&trimmed);
2604        let page_num = self.pages.len();
2605        if self.page_symbol_indices.len() <= page_num {
2606            self.page_symbol_indices.resize_with(page_num + 1, Vec::new);
2607        }
2608
2609        if !self.hash_map.contains_key(&key) {
2610            let pixel_count = trimmed.count_ones();
2611            let idx = self.push_symbol(trimmed, pixel_count, page_num);
2612            self.metrics.symbol_stats.symbols_discovered += 1;
2613            self.hash_map.insert(key, vec![idx]);
2614        }
2615        Ok(())
2616    }
2617
2618    pub fn flush(&mut self) -> Result<Vec<u8>> {
2619        let include_header = self.state.full_headers_remaining;
2620        self.state.decision_debug_lines.clear();
2621        match self.config.lossy_symbol_mode {
2622            LossySymbolMode::SymbolUnify => self.apply_symbol_unify()?,
2623            LossySymbolMode::Off => {}
2624        }
2625        let plan = self.plan_document(include_header)?;
2626        self.validate_plan(&plan)?;
2627        let output = self.serialize_full_document(&plan)?;
2628        self.state.full_headers_remaining = false;
2629        self.next_segment_number = plan.next_segment_number;
2630        Ok(output)
2631    }
2632
2633    pub fn flush_pdf_split(&mut self) -> Result<PdfSplitOutput> {
2634        self.state.pdf_mode = true;
2635        self.state.decision_debug_lines.clear();
2636        match self.config.lossy_symbol_mode {
2637            LossySymbolMode::SymbolUnify => self.apply_symbol_unify()?,
2638            LossySymbolMode::Off => {}
2639        }
2640        let plan = self.plan_document(false)?;
2641        self.validate_plan(&plan)?;
2642        let (
2643            global_segments,
2644            page_streams,
2645            local_dict_bytes_per_page,
2646            text_region_bytes_per_page,
2647            generic_region_bytes_per_page,
2648        ) = self.serialize_pdf_split(&plan)?;
2649        self.next_segment_number = plan.next_segment_number;
2650        Ok(PdfSplitOutput {
2651            global_segments,
2652            page_streams,
2653            local_dict_bytes_per_page,
2654            text_region_bytes_per_page,
2655            generic_region_bytes_per_page,
2656        })
2657    }
2658
2659    fn plan_document(&mut self, include_header: bool) -> Result<PlannedDocument> {
2660        debug!("Symbol stats before encoding: {}", self.get_symbol_stats());
2661        let diagnostics_enabled = encoder_diagnostics_enabled();
2662        let planning_start = Instant::now();
2663
2664        if self.config.auto_thresh {
2665            let clustering_start = Instant::now();
2666            self.cluster_symbols()?;
2667            self.metrics.symbol_mode.clustering += clustering_start.elapsed();
2668        }
2669
2670        self.prune_symbols_if_needed();
2671        self.alias_local_symbols_to_globals()?;
2672        self.validate_symbol_instance_indices()?;
2673
2674        let multi_page_candidates: Vec<usize> = self
2675            .global_symbols
2676            .iter()
2677            .enumerate()
2678            .filter(|(i, _)| self.symbol_page_count[*i] > 1 || self.pages.len() == 1)
2679            .map(|(i, _)| i)
2680            .collect();
2681        let global_symbol_indices: Vec<usize> = multi_page_candidates.clone();
2682        let global_set: HashSet<usize> = global_symbol_indices.iter().copied().collect();
2683        let estimated_global_dict_bytes =
2684            indexed_symbol_dictionary_bytes(&self.global_symbols, &global_symbol_indices);
2685        let low_value_global_candidates: Vec<(usize, usize, usize, i64)> = multi_page_candidates
2686            .iter()
2687            .copied()
2688            .filter(|symbol_index| !global_set.contains(symbol_index))
2689            .map(|symbol_index| {
2690                (
2691                    symbol_index,
2692                    self.symbol_usage[symbol_index],
2693                    self.symbol_page_count[symbol_index],
2694                    self.estimate_global_symbol_gain(symbol_index),
2695                )
2696            })
2697            .take(16)
2698            .collect();
2699        let multi_page_non_global = multi_page_candidates
2700            .len()
2701            .saturating_sub(global_symbol_indices.len());
2702        if diagnostics_enabled {
2703            self.state.decision_debug_lines.push(format!(
2704                "planning globals: selected={} multi_page_non_global={} estimated_dict_bytes={} low_value_candidates={}",
2705                global_symbol_indices.len(),
2706                multi_page_non_global,
2707                estimated_global_dict_bytes,
2708                low_value_global_candidates.len()
2709            ));
2710            for (symbol_index, usage, page_span, gain) in low_value_global_candidates {
2711                self.state.decision_debug_lines.push(format!(
2712                    "planning global candidate: symbol={} usage={} page_span={} estimated_global_gain={}",
2713                    symbol_index,
2714                    usage,
2715                    page_span,
2716                    gain
2717                ));
2718            }
2719        }
2720
2721        let mut page_local_symbols: Vec<Vec<usize>> = self
2722            .page_symbol_indices
2723            .iter()
2724            .map(|symbols| {
2725                symbols
2726                    .iter()
2727                    .copied()
2728                    .filter(|i| !global_set.contains(i))
2729                    .collect()
2730            })
2731            .collect();
2732        let mut page_residual_symbols = vec![Vec::new(); self.pages.len()];
2733        let mut page_residual_anchor_remaps: Vec<FxHashMap<usize, usize>> = (0..self.pages.len())
2734            .map(|_| FxHashMap::default())
2735            .collect();
2736        let sym_unify_global_anchor_map =
2737            if self.config.lossy_symbol_mode == LossySymbolMode::SymbolUnify {
2738                let mut anchors: FxHashMap<FamilyBucketKey, Vec<usize>> = FxHashMap::default();
2739                for &symbol_index in &global_symbol_indices {
2740                    if !self.sym_unify_anchor_ready(symbol_index, self.pages.len()) {
2741                        continue;
2742                    }
2743                    let key = family_bucket_key_for_symbol(
2744                        &self.global_symbols[symbol_index],
2745                        &self.symbol_signatures[symbol_index],
2746                    );
2747                    anchors.entry(key).or_default().push(symbol_index);
2748                }
2749                Some(anchors)
2750            } else {
2751                None
2752            };
2753        let sym_unify_global_anchor_bytes = sym_unify_global_anchor_map
2754            .as_ref()
2755            .map(|anchors| anchor_map_dictionary_bytes(&self.global_symbols, anchors))
2756            .unwrap_or(0);
2757        let mut planning_anchor_comparator = Comparator::default();
2758        let mut planning_local_anchor_attach_count = 0usize;
2759        let mut planning_anchor_attach_count = 0usize;
2760        let mut planning_local_score_rescue_count = 0usize;
2761        let mut planning_anchor_score_rescue_count = 0usize;
2762        let mut planning_local_anchor_attach_sample = Vec::new();
2763        let mut planning_anchor_attach_sample = Vec::new();
2764        let mut planning_local_rescue_count = 0usize;
2765        let mut planning_local_rescue_sample = Vec::new();
2766        let mut residual_symbol_traces: FxHashMap<usize, ResidualSymbolTrace> =
2767            FxHashMap::default();
2768        let mut counterfactual_local_dim_relax2 = CounterfactualProbeStats::default();
2769        let mut counterfactual_global_overlap_skip = CounterfactualProbeStats::default();
2770        let mut page_uses_generic_region = vec![false; self.pages.len()];
2771        for (page_num, page) in self.pages.iter().enumerate() {
2772            if self.config.uses_lossy_symbol_dictionary()
2773                || self.config.refine
2774                || self.config.text_refine
2775            {
2776                let mut local_use_counts = HashMap::new();
2777                for instance in &page.symbol_instances {
2778                    *local_use_counts
2779                        .entry(instance.symbol_index)
2780                        .or_insert(0usize) += 1;
2781                }
2782                let local_anchor_candidates: Vec<usize> = page_local_symbols[page_num]
2783                    .iter()
2784                    .copied()
2785                    .filter(|&symbol_index| {
2786                        local_use_counts.get(&symbol_index).copied().unwrap_or(0) > 1
2787                            || self.should_keep_text_local_symbol(page, symbol_index)
2788                    })
2789                    .collect();
2790                let mut kept_local_symbols = Vec::with_capacity(page_local_symbols[page_num].len());
2791                for &symbol_index in &page_local_symbols[page_num] {
2792                    if local_use_counts.get(&symbol_index).copied().unwrap_or(0) <= 1 {
2793                        if self.should_keep_text_local_symbol(page, symbol_index) {
2794                            kept_local_symbols.push(symbol_index);
2795                            planning_local_rescue_count += 1;
2796                            if planning_local_rescue_sample.len() < 16 {
2797                                planning_local_rescue_sample.push((
2798                                    page_num + 1,
2799                                    symbol_index,
2800                                    self.global_symbols[symbol_index].width,
2801                                    self.global_symbols[symbol_index].height,
2802                                ));
2803                            }
2804                            continue;
2805                        }
2806                        let mut local_best_anchor = None;
2807                        let mut local_best_reject = None;
2808                        let mut had_local_candidates = false;
2809                        for &anchor_index in &local_anchor_candidates {
2810                            if anchor_index == symbol_index {
2811                                continue;
2812                            }
2813                            had_local_candidates = true;
2814                            match self.residual_symbol_anchor_decision(
2815                                symbol_index,
2816                                anchor_index,
2817                                &mut planning_anchor_comparator,
2818                            ) {
2819                                SymUnifyAnchorDecision::Accept { score, dx, dy } => {
2820                                    self.maybe_update_best_sym_unify_anchor_candidate(
2821                                        &mut local_best_anchor,
2822                                        &self.global_symbols[symbol_index],
2823                                        anchor_index,
2824                                        score,
2825                                        dx,
2826                                        dy,
2827                                        false,
2828                                    );
2829                                }
2830                                SymUnifyAnchorDecision::RejectScore {
2831                                    score,
2832                                    limit,
2833                                    dx,
2834                                    dy,
2835                                } if score
2836                                    <= limit.saturating_add(
2837                                        self.config.sym_unify_score_rescue_slack,
2838                                    ) =>
2839                                {
2840                                    self.maybe_update_best_sym_unify_anchor_candidate(
2841                                        &mut local_best_anchor,
2842                                        &self.global_symbols[symbol_index],
2843                                        anchor_index,
2844                                        score,
2845                                        dx,
2846                                        dy,
2847                                        true,
2848                                    );
2849                                }
2850                                other => update_best_reject(&mut local_best_reject, other),
2851                            }
2852                        }
2853                        if let Some(anchor_choice) = local_best_anchor {
2854                            page_residual_anchor_remaps[page_num]
2855                                .insert(symbol_index, anchor_choice.anchor_index);
2856                            planning_local_anchor_attach_count += 1;
2857                            if anchor_choice.rescued_on_score {
2858                                planning_local_score_rescue_count += 1;
2859                            }
2860                            if planning_local_anchor_attach_sample.len() < 16 {
2861                                planning_local_anchor_attach_sample.push((
2862                                    page_num + 1,
2863                                    symbol_index,
2864                                    anchor_choice.anchor_index,
2865                                ));
2866                            }
2867                            continue;
2868                        }
2869                        if diagnostics_enabled
2870                            && matches!(local_best_reject, Some(SymUnifyAnchorDecision::RejectDim))
2871                            && local_anchor_candidates.iter().copied().any(|anchor_index| {
2872                                anchor_index != symbol_index
2873                                    && self.residual_symbol_accept_with_dim_limit(
2874                                        symbol_index,
2875                                        anchor_index,
2876                                        &mut planning_anchor_comparator,
2877                                        2,
2878                                    )
2879                            })
2880                        {
2881                            record_counterfactual_probe(
2882                                &mut counterfactual_local_dim_relax2,
2883                                page_num,
2884                                symbol_index,
2885                                &self.global_symbols[symbol_index],
2886                                self.symbol_pixel_counts[symbol_index],
2887                            );
2888                        }
2889                        let mut attached_anchor = None;
2890                        let mut global_best_reject = None;
2891                        let mut had_global_candidates = false;
2892                        if let Some(anchor_map) = &sym_unify_global_anchor_map {
2893                            let bucket = family_bucket_key_for_symbol(
2894                                &self.global_symbols[symbol_index],
2895                                &self.symbol_signatures[symbol_index],
2896                            );
2897                            let mut visited = FxHashSet::default();
2898                            let mut best_anchor = None;
2899                            for neighbor in family_bucket_neighbors(bucket) {
2900                                let Some(candidates) = anchor_map.get(&neighbor) else {
2901                                    continue;
2902                                };
2903                                for &anchor_index in candidates {
2904                                    if anchor_index == symbol_index || !visited.insert(anchor_index)
2905                                    {
2906                                        continue;
2907                                    }
2908                                    had_global_candidates = true;
2909                                    match self.residual_symbol_anchor_decision(
2910                                        symbol_index,
2911                                        anchor_index,
2912                                        &mut planning_anchor_comparator,
2913                                    ) {
2914                                        SymUnifyAnchorDecision::Accept { score, dx, dy } => {
2915                                            self.maybe_update_best_sym_unify_anchor_candidate(
2916                                                &mut best_anchor,
2917                                                &self.global_symbols[symbol_index],
2918                                                anchor_index,
2919                                                score,
2920                                                dx,
2921                                                dy,
2922                                                false,
2923                                            );
2924                                        }
2925                                        SymUnifyAnchorDecision::RejectScore {
2926                                            score,
2927                                            limit,
2928                                            dx,
2929                                            dy,
2930                                        } if score
2931                                            <= limit.saturating_add(
2932                                                self.config.sym_unify_score_rescue_slack,
2933                                            ) =>
2934                                        {
2935                                            self.maybe_update_best_sym_unify_anchor_candidate(
2936                                                &mut best_anchor,
2937                                                &self.global_symbols[symbol_index],
2938                                                anchor_index,
2939                                                score,
2940                                                dx,
2941                                                dy,
2942                                                true,
2943                                            );
2944                                        }
2945                                        other => update_best_reject(&mut global_best_reject, other),
2946                                    }
2947                                }
2948                            }
2949                            attached_anchor = best_anchor;
2950                        }
2951                        if let Some(anchor_choice) = attached_anchor {
2952                            page_residual_anchor_remaps[page_num]
2953                                .insert(symbol_index, anchor_choice.anchor_index);
2954                            planning_anchor_attach_count += 1;
2955                            if anchor_choice.rescued_on_score {
2956                                planning_anchor_score_rescue_count += 1;
2957                            }
2958                            if planning_anchor_attach_sample.len() < 16 {
2959                                planning_anchor_attach_sample.push((
2960                                    page_num + 1,
2961                                    symbol_index,
2962                                    anchor_choice.anchor_index,
2963                                ));
2964                            }
2965                        } else {
2966                            if diagnostics_enabled
2967                                && matches!(
2968                                    global_best_reject,
2969                                    Some(SymUnifyAnchorDecision::RejectOverlap)
2970                                )
2971                            {
2972                                let bucket = family_bucket_key_for_symbol(
2973                                    &self.global_symbols[symbol_index],
2974                                    &self.symbol_signatures[symbol_index],
2975                                );
2976                                let mut visited = FxHashSet::default();
2977                                let recovered_without_overlap_prescreen = sym_unify_global_anchor_map
2978                                    .as_ref()
2979                                    .is_some_and(|anchor_map| {
2980                                        family_bucket_neighbors(bucket).into_iter().any(|neighbor| {
2981                                            anchor_map.get(&neighbor).is_some_and(|candidates| {
2982                                                candidates.iter().copied().any(|anchor_index| {
2983                                                    anchor_index != symbol_index
2984                                                        && visited.insert(anchor_index)
2985                                                        && self.residual_symbol_accept_without_overlap_prescreen(
2986                                                            symbol_index,
2987                                                            anchor_index,
2988                                                            &mut planning_anchor_comparator,
2989                                                        )
2990                                                })
2991                                            })
2992                                        })
2993                                    });
2994                                if recovered_without_overlap_prescreen {
2995                                    record_counterfactual_probe(
2996                                        &mut counterfactual_global_overlap_skip,
2997                                        page_num,
2998                                        symbol_index,
2999                                        &self.global_symbols[symbol_index],
3000                                        self.symbol_pixel_counts[symbol_index],
3001                                    );
3002                                }
3003                            }
3004                            page_residual_symbols[page_num].push(symbol_index);
3005                            residual_symbol_traces.insert(
3006                                symbol_index,
3007                                ResidualSymbolTrace {
3008                                    page_num,
3009                                    local_use_count: local_use_counts
3010                                        .get(&symbol_index)
3011                                        .copied()
3012                                        .unwrap_or(0),
3013                                    had_local_candidates,
3014                                    had_global_candidates,
3015                                    local_best_reject,
3016                                    global_best_reject,
3017                                },
3018                            );
3019                        }
3020                    } else {
3021                        kept_local_symbols.push(symbol_index);
3022                    }
3023                }
3024                page_local_symbols[page_num] = kept_local_symbols;
3025            }
3026
3027            let local_symbols = &page_local_symbols[page_num];
3028            let page_local_gain: i64 = local_symbols
3029                .iter()
3030                .map(|&symbol_index| self.estimate_local_symbol_gain(page, symbol_index))
3031                .sum();
3032            let uses_only_locals = page.symbol_instances.iter().all(|inst| {
3033                !global_set.contains(&inst.symbol_index)
3034                    && !page_residual_anchor_remaps[page_num].contains_key(&inst.symbol_index)
3035                    && !page_residual_symbols[page_num].contains(&inst.symbol_index)
3036            });
3037            if uses_only_locals
3038                && local_symbols.len() <= 2
3039                && page.symbol_instances.len() <= 2
3040                && page_local_gain <= 0
3041            {
3042                page_local_symbols[page_num].clear();
3043                page_uses_generic_region[page_num] = true;
3044            }
3045
3046            let has_kept_symbol_instances = page.symbol_instances.iter().any(|inst| {
3047                global_set.contains(&inst.symbol_index)
3048                    || page_residual_anchor_remaps[page_num].contains_key(&inst.symbol_index)
3049                    || page_local_symbols[page_num].contains(&inst.symbol_index)
3050            });
3051            if !has_kept_symbol_instances {
3052                page_uses_generic_region[page_num] = true;
3053            }
3054        }
3055
3056        let total_residual_symbols: usize = page_residual_symbols.iter().map(Vec::len).sum();
3057        let full_generic_pages = page_uses_generic_region.iter().filter(|&&v| v).count();
3058        if diagnostics_enabled {
3059            self.state.decision_debug_lines.push(format!(
3060                "planning residuals: {} page-local one-off symbols moved to generic residuals",
3061                total_residual_symbols
3062            ));
3063            self.state.decision_debug_lines.push(format!(
3064                "planning page modes: full_generic_pages={} text_pages={}",
3065                full_generic_pages,
3066                self.pages.len().saturating_sub(full_generic_pages)
3067            ));
3068            if self.config.lossy_symbol_mode == LossySymbolMode::SymbolUnify {
3069                self.state.decision_debug_lines.push(format!(
3070                    "sym_unify planning symbol rescues: local_kept={} local_anchor_remaps={} global_anchor_remaps={} local_score_rescues={} global_score_rescues={} anchor_ready_bytes={}",
3071                    planning_local_rescue_count,
3072                    planning_local_anchor_attach_count,
3073                    planning_anchor_attach_count,
3074                    planning_local_score_rescue_count,
3075                    planning_anchor_score_rescue_count,
3076                    sym_unify_global_anchor_bytes,
3077                ));
3078                if !planning_local_rescue_sample.is_empty() {
3079                    self.state.decision_debug_lines.push(format!(
3080                        "sym_unify planning local rescue sample: {:?}",
3081                        planning_local_rescue_sample
3082                    ));
3083                }
3084                if !planning_local_anchor_attach_sample.is_empty() {
3085                    self.state.decision_debug_lines.push(format!(
3086                        "sym_unify planning local-anchor sample: {:?}",
3087                        planning_local_anchor_attach_sample
3088                    ));
3089                }
3090                if !planning_anchor_attach_sample.is_empty() {
3091                    self.state.decision_debug_lines.push(format!(
3092                        "sym_unify planning anchor sample: {:?}",
3093                        planning_anchor_attach_sample
3094                    ));
3095                }
3096                self.state.decision_debug_lines.push(format!(
3097                    "sym_unify residual counterfactuals: local_dim_relax2_symbols={} local_dim_relax2_bitmap_proxy_bytes={} local_dim_relax2_pages={} global_overlap_skip_symbols={} global_overlap_skip_bitmap_proxy_bytes={} global_overlap_skip_pages={}",
3098                    counterfactual_local_dim_relax2.symbol_count,
3099                    counterfactual_local_dim_relax2.bitmap_proxy_bytes,
3100                    counterfactual_local_dim_relax2.pages.len(),
3101                    counterfactual_global_overlap_skip.symbol_count,
3102                    counterfactual_global_overlap_skip.bitmap_proxy_bytes,
3103                    counterfactual_global_overlap_skip.pages.len(),
3104                ));
3105                if !counterfactual_local_dim_relax2.samples.is_empty() {
3106                    self.state.decision_debug_lines.push(format!(
3107                        "sym_unify counterfactual local_dim_relax2 sample: {:?}",
3108                        counterfactual_local_dim_relax2.samples
3109                    ));
3110                }
3111                if !counterfactual_global_overlap_skip.samples.is_empty() {
3112                    self.state.decision_debug_lines.push(format!(
3113                        "sym_unify counterfactual global_overlap_skip sample: {:?}",
3114                        counterfactual_global_overlap_skip.samples
3115                    ));
3116                }
3117            }
3118        }
3119        if diagnostics_enabled
3120            && self.config.lossy_symbol_mode == LossySymbolMode::SymbolUnify
3121            && total_residual_symbols > 0
3122        {
3123            let mut comparator = Comparator::default();
3124            let mut residual_unique = FxHashSet::default();
3125            for residuals in &page_residual_symbols {
3126                residual_unique.extend(residuals.iter().copied());
3127            }
3128            let anchor_map = self.build_sym_unify_anchor_map(self.pages.len());
3129            let mut any_global_map: FxHashMap<FamilyBucketKey, Vec<usize>> = FxHashMap::default();
3130            for &symbol_index in &global_symbol_indices {
3131                let key = family_bucket_key_for_symbol(
3132                    &self.global_symbols[symbol_index],
3133                    &self.symbol_signatures[symbol_index],
3134                );
3135                any_global_map.entry(key).or_default().push(symbol_index);
3136            }
3137            let mut attachable = 0usize;
3138            let mut attachable_with_score_rescue = 0usize;
3139            let mut attachable_to_any_global = 0usize;
3140            let mut sampled = Vec::new();
3141            let mut sampled_score_rescue = Vec::new();
3142            let mut sampled_any_global = Vec::new();
3143            let mut visited = FxHashSet::default();
3144            let mut reject_counts: FxHashMap<&'static str, usize> = FxHashMap::default();
3145            let mut area_buckets = [0usize; 4];
3146            for residual_index in residual_unique.iter().copied() {
3147                let symbol = &self.global_symbols[residual_index];
3148                let area = symbol.width.saturating_mul(symbol.height);
3149                let bucket_index = if area <= 16 {
3150                    0
3151                } else if area <= 32 {
3152                    1
3153                } else if area <= 64 {
3154                    2
3155                } else {
3156                    3
3157                };
3158                area_buckets[bucket_index] += 1;
3159                let bucket =
3160                    family_bucket_key_for_symbol(symbol, &self.symbol_signatures[residual_index]);
3161                visited.clear();
3162                let mut matched_anchor = None;
3163                let mut best_reject = SymUnifyAnchorDecision::RejectDim;
3164                'anchor_search: for neighbor in family_bucket_neighbors(bucket) {
3165                    let Some(candidates) = anchor_map.get(&neighbor) else {
3166                        continue;
3167                    };
3168                    for &anchor_index in candidates {
3169                        if anchor_index == residual_index || !visited.insert(anchor_index) {
3170                            continue;
3171                        }
3172                        let decision = self.residual_symbol_anchor_decision(
3173                            residual_index,
3174                            anchor_index,
3175                            &mut comparator,
3176                        );
3177                        match decision {
3178                            SymUnifyAnchorDecision::Accept { .. } => {
3179                                matched_anchor = Some(anchor_index);
3180                                break 'anchor_search;
3181                            }
3182                            _ => {
3183                                if decision.diagnostic_rank() > best_reject.diagnostic_rank() {
3184                                    best_reject = decision;
3185                                }
3186                            }
3187                        }
3188                    }
3189                }
3190                if let Some(anchor_index) = matched_anchor {
3191                    attachable += 1;
3192                    attachable_with_score_rescue += 1;
3193                    if sampled.len() < 16 {
3194                        sampled.push((residual_index, anchor_index));
3195                    }
3196                } else {
3197                    *reject_counts.entry(best_reject.label()).or_insert(0) += 1;
3198
3199                    visited.clear();
3200                    let mut rescued_anchor = None;
3201                    'score_rescue_search: for neighbor in family_bucket_neighbors(bucket) {
3202                        let Some(candidates) = anchor_map.get(&neighbor) else {
3203                            continue;
3204                        };
3205                        for &anchor_index in candidates {
3206                            if anchor_index == residual_index || !visited.insert(anchor_index) {
3207                                continue;
3208                            }
3209                            match self.residual_symbol_anchor_decision(
3210                                residual_index,
3211                                anchor_index,
3212                                &mut comparator,
3213                            ) {
3214                                SymUnifyAnchorDecision::Accept { .. } => {
3215                                    rescued_anchor = Some(anchor_index);
3216                                    break 'score_rescue_search;
3217                                }
3218                                SymUnifyAnchorDecision::RejectScore { score, limit, .. }
3219                                    if score
3220                                        <= limit.saturating_add(
3221                                            self.config.sym_unify_score_rescue_slack,
3222                                        ) =>
3223                                {
3224                                    rescued_anchor = Some(anchor_index);
3225                                    break 'score_rescue_search;
3226                                }
3227                                _ => {}
3228                            }
3229                        }
3230                    }
3231                    if let Some(anchor_index) = rescued_anchor {
3232                        attachable_with_score_rescue += 1;
3233                        if sampled_score_rescue.len() < 16 {
3234                            sampled_score_rescue.push((residual_index, anchor_index));
3235                        }
3236                    }
3237                }
3238
3239                visited.clear();
3240                'any_global_search: for neighbor in family_bucket_neighbors(bucket) {
3241                    let Some(candidates) = any_global_map.get(&neighbor) else {
3242                        continue;
3243                    };
3244                    for &anchor_index in candidates {
3245                        if anchor_index == residual_index || !visited.insert(anchor_index) {
3246                            continue;
3247                        }
3248                        if matches!(
3249                            self.residual_symbol_anchor_decision(
3250                                residual_index,
3251                                anchor_index,
3252                                &mut comparator,
3253                            ),
3254                            SymUnifyAnchorDecision::Accept { .. }
3255                        ) {
3256                            attachable_to_any_global += 1;
3257                            if sampled_any_global.len() < 16 {
3258                                sampled_any_global.push((residual_index, anchor_index));
3259                            }
3260                            break 'any_global_search;
3261                        }
3262                    }
3263                }
3264            }
3265            self.state.decision_debug_lines.push(format!(
3266                "sym_unify residual anchor scan: residual_unique={} attachable_to_current_anchors={} attachable_with_score_rescue={} score_rescue_extra={} unattached={}",
3267                residual_unique.len(),
3268                attachable,
3269                attachable_with_score_rescue,
3270                attachable_with_score_rescue.saturating_sub(attachable),
3271                residual_unique.len().saturating_sub(attachable)
3272            ));
3273            self.state.decision_debug_lines.push(format!(
3274                "sym_unify residual reject breakdown: dim={} pixel_delta={} signature={} overlap={} compare={} outside_ink={} score={} area_le16={} area_le32={} area_le64={} area_gt64={}",
3275                reject_counts.get("dim").copied().unwrap_or(0),
3276                reject_counts.get("pixel_delta").copied().unwrap_or(0),
3277                reject_counts.get("signature").copied().unwrap_or(0),
3278                reject_counts.get("overlap").copied().unwrap_or(0),
3279                reject_counts.get("compare").copied().unwrap_or(0),
3280                reject_counts.get("outside_ink").copied().unwrap_or(0),
3281                reject_counts.get("score").copied().unwrap_or(0),
3282                area_buckets[0],
3283                area_buckets[1],
3284                area_buckets[2],
3285                area_buckets[3],
3286            ));
3287            self.state.decision_debug_lines.push(format!(
3288                "sym_unify residual any-global scan: residual_unique={} attachable_to_any_global={} extra_beyond_anchor_ready={}",
3289                residual_unique.len(),
3290                attachable_to_any_global,
3291                attachable_to_any_global.saturating_sub(attachable),
3292            ));
3293            if !sampled.is_empty() {
3294                self.state
3295                    .decision_debug_lines
3296                    .push(format!("sym_unify residual anchor sample: {:?}", sampled));
3297            }
3298            if !sampled_score_rescue.is_empty() {
3299                self.state.decision_debug_lines.push(format!(
3300                    "sym_unify residual score-rescue sample: {:?}",
3301                    sampled_score_rescue
3302                ));
3303            }
3304            if !sampled_any_global.is_empty() {
3305                self.state.decision_debug_lines.push(format!(
3306                    "sym_unify residual any-global sample: {:?}",
3307                    sampled_any_global
3308                ));
3309            }
3310
3311            let mut reason_stats: FxHashMap<ResidualReasonCode, ResidualReasonStats> =
3312                FxHashMap::default();
3313            for (&symbol_index, trace) in &residual_symbol_traces {
3314                let reason = trace.reason_code();
3315                let stats = reason_stats.entry(reason).or_default();
3316                let symbol = &self.global_symbols[symbol_index];
3317                let instance_count = trace.local_use_count.max(1);
3318                stats.symbol_count += 1;
3319                stats.instance_count += instance_count;
3320                stats.black_pixels += self.symbol_pixel_counts[symbol_index] * instance_count;
3321                stats.bitmap_proxy_bytes += bitmap_proxy_bytes(symbol) * instance_count;
3322                stats.pages.insert(trace.page_num);
3323                match classify_residual_shape(symbol) {
3324                    ResidualShapeKind::Tiny => stats.tiny_count += 1,
3325                    ResidualShapeKind::PunctuationLike => stats.punctuation_like_count += 1,
3326                    ResidualShapeKind::GlyphLike => stats.glyph_like_count += 1,
3327                }
3328                if stats.samples.len() < 8 {
3329                    stats.samples.push((
3330                        trace.page_num + 1,
3331                        symbol_index,
3332                        symbol.width,
3333                        symbol.height,
3334                        trace.local_use_count,
3335                    ));
3336                }
3337            }
3338
3339            let mut sorted_reason_stats: Vec<_> = reason_stats.into_iter().collect();
3340            sorted_reason_stats.sort_by(|lhs, rhs| {
3341                rhs.1
3342                    .bitmap_proxy_bytes
3343                    .cmp(&lhs.1.bitmap_proxy_bytes)
3344                    .then_with(|| rhs.1.symbol_count.cmp(&lhs.1.symbol_count))
3345                    .then_with(|| lhs.0.label().cmp(rhs.0.label()))
3346            });
3347            let total_reason_proxy_bytes: usize = sorted_reason_stats
3348                .iter()
3349                .map(|(_, stats)| stats.bitmap_proxy_bytes)
3350                .sum();
3351            self.state.decision_debug_lines.push(format!(
3352                "sym_unify residual reason summary: reasons={} residual_symbols={} bitmap_proxy_bytes={}",
3353                sorted_reason_stats.len(),
3354                residual_symbol_traces.len(),
3355                total_reason_proxy_bytes,
3356            ));
3357            for (reason, stats) in sorted_reason_stats {
3358                self.state.decision_debug_lines.push(format!(
3359                    "  residual reason {}: symbols={} instances={} pages={} black_pixels={} bitmap_proxy_bytes={} tiny={} punct_like={} glyph_like={} sample={:?}",
3360                    reason.label(),
3361                    stats.symbol_count,
3362                    stats.instance_count,
3363                    stats.pages.len(),
3364                    stats.black_pixels,
3365                    stats.bitmap_proxy_bytes,
3366                    stats.tiny_count,
3367                    stats.punctuation_like_count,
3368                    stats.glyph_like_count,
3369                    stats.samples
3370                ));
3371            }
3372
3373            let mut symbol_home_page = vec![usize::MAX; self.global_symbols.len()];
3374            for (page_idx, symbols) in self.page_symbol_indices.iter().enumerate() {
3375                for &symbol_index in symbols {
3376                    if symbol_home_page[symbol_index] == usize::MAX {
3377                        symbol_home_page[symbol_index] = page_idx;
3378                    }
3379                }
3380            }
3381            let mut all_symbol_bucket_map: FxHashMap<FamilyBucketKey, Vec<usize>> =
3382                FxHashMap::default();
3383            for (symbol_index, symbol) in self.global_symbols.iter().enumerate() {
3384                let key =
3385                    family_bucket_key_for_symbol(symbol, &self.symbol_signatures[symbol_index]);
3386                all_symbol_bucket_map
3387                    .entry(key)
3388                    .or_default()
3389                    .push(symbol_index);
3390            }
3391
3392            let mut local_dim_cross_page_current = CounterfactualProbeStats::default();
3393            let mut local_dim_cross_page_dim2 = CounterfactualProbeStats::default();
3394            let mut cross_page_comparator = Comparator::default();
3395            let mut overlap_bypass_outcomes: FxHashMap<&'static str, CounterfactualProbeStats> =
3396                FxHashMap::default();
3397            let mut overlap_bypass_comparator = Comparator::default();
3398            let mut overlap_compare_probe_outcomes: FxHashMap<
3399                &'static str,
3400                CounterfactualProbeStats,
3401            > = FxHashMap::default();
3402            let mut overlap_compare_probe_comparator = Comparator::default();
3403            let mut overlap_bypass_compare_total_err_details = DetailedCompareProbeStats::default();
3404            let mut global_compare_total_err_details = DetailedCompareProbeStats::default();
3405            let mut compare_slack2_from_global_compare = CounterfactualProbeStats::default();
3406            let mut compare_slack4_from_global_compare = CounterfactualProbeStats::default();
3407            let mut compare_slack2_from_overlap_compare = CounterfactualProbeStats::default();
3408            let mut compare_slack4_from_overlap_compare = CounterfactualProbeStats::default();
3409            for (&symbol_index, trace) in &residual_symbol_traces {
3410                if trace.reason_code() != ResidualReasonCode::UseCountOneLocalRejectDim {
3411                    if trace.reason_code() == ResidualReasonCode::UseCountOneGlobalRejectOverlap {
3412                        let bucket = family_bucket_key_for_symbol(
3413                            &self.global_symbols[symbol_index],
3414                            &self.symbol_signatures[symbol_index],
3415                        );
3416                        let mut visited = FxHashSet::default();
3417                        let mut best_bypass_reject = None;
3418                        let mut recovered = false;
3419                        let mut best_compare_total_err: Option<(
3420                            crate::jbig2comparator::CompareResult,
3421                            u32,
3422                            bool,
3423                            bool,
3424                        )> = None;
3425                        if let Some(anchor_map) = &sym_unify_global_anchor_map {
3426                            'overlap_bypass_search: for neighbor in family_bucket_neighbors(bucket)
3427                            {
3428                                let Some(candidates) = anchor_map.get(&neighbor) else {
3429                                    continue;
3430                                };
3431                                for &anchor_index in candidates {
3432                                    if anchor_index == symbol_index || !visited.insert(anchor_index)
3433                                    {
3434                                        continue;
3435                                    }
3436                                    let strong_anchor = self.symbol_usage[anchor_index]
3437                                        >= SYM_UNIFY_STRONG_ANCHOR_MIN_USAGE
3438                                        || self.symbol_page_count[anchor_index]
3439                                            >= SYM_UNIFY_STRONG_ANCHOR_MIN_PAGE_SPAN;
3440                                    match self
3441                                        .residual_symbol_anchor_decision_without_overlap_prescreen(
3442                                            symbol_index,
3443                                            anchor_index,
3444                                            &mut overlap_bypass_comparator,
3445                                        ) {
3446                                        SymUnifyAnchorDecision::Accept { .. } => {
3447                                            recovered = true;
3448                                            break 'overlap_bypass_search;
3449                                        }
3450                                        SymUnifyAnchorDecision::RejectCompare => {
3451                                            update_best_reject(
3452                                                &mut best_bypass_reject,
3453                                                SymUnifyAnchorDecision::RejectCompare,
3454                                            );
3455                                            let candidate = &self.global_symbols[symbol_index];
3456                                            let proto = &self.global_symbols[anchor_index];
3457                                            let compare_max_err = self
3458                                                .config
3459                                                .sym_unify_max_err
3460                                                .max(4)
3461                                                .saturating_add(u32::from(strong_anchor));
3462                                            if let Some(result) = overlap_compare_probe_comparator
3463                                                .compare_for_symbol_unify(
3464                                                    candidate,
3465                                                    proto,
3466                                                    relaxed_compare_probe_max_err(candidate, proto),
3467                                                    self.config.sym_unify_max_dx.max(0),
3468                                                    self.config.sym_unify_max_dy.max(0),
3469                                                )
3470                                            {
3471                                                let exact_dims = candidate.width == proto.width
3472                                                    && candidate.height == proto.height;
3473                                                if best_compare_total_err.is_none_or(
3474                                                    |(current, _, _, _)| {
3475                                                        result.total_err < current.total_err
3476                                                    },
3477                                                ) {
3478                                                    best_compare_total_err = Some((
3479                                                        result,
3480                                                        compare_max_err,
3481                                                        exact_dims,
3482                                                        strong_anchor,
3483                                                    ));
3484                                                }
3485                                            }
3486                                        }
3487                                        other => update_best_reject(&mut best_bypass_reject, other),
3488                                    }
3489                                }
3490                            }
3491                        }
3492                        let label = if recovered {
3493                            "accept"
3494                        } else {
3495                            best_bypass_reject
3496                                .map(SymUnifyAnchorDecision::label)
3497                                .unwrap_or("no_candidates")
3498                        };
3499                        record_labeled_counterfactual_probe(
3500                            &mut overlap_bypass_outcomes,
3501                            label,
3502                            trace.page_num,
3503                            symbol_index,
3504                            &self.global_symbols[symbol_index],
3505                            self.symbol_pixel_counts[symbol_index],
3506                        );
3507                        if label == "compare"
3508                            && let Some((result, compare_max_err, exact_dims, strong_anchor)) =
3509                                best_compare_total_err
3510                        {
3511                            let outside_limit = self
3512                                .config
3513                                .sym_unify_max_border_outside_ink
3514                                .min(1)
3515                                .saturating_add(u32::from(strong_anchor));
3516                            let score_limit =
3517                                self.config.sym_unify_class_accept_limit + u32::from(strong_anchor);
3518                            let score = Self::symbol_unify_assignment_score(&result);
3519                            record_detailed_compare_probe(
3520                                &mut overlap_bypass_compare_total_err_details,
3521                                trace.page_num,
3522                                symbol_index,
3523                                &self.global_symbols[symbol_index],
3524                                result,
3525                                compare_max_err,
3526                                exact_dims,
3527                                strong_anchor,
3528                            );
3529                            if result.total_err <= compare_max_err.saturating_add(2)
3530                                && result.outside_ink_err <= outside_limit
3531                                && score <= score_limit
3532                            {
3533                                record_counterfactual_probe(
3534                                    &mut compare_slack2_from_overlap_compare,
3535                                    trace.page_num,
3536                                    symbol_index,
3537                                    &self.global_symbols[symbol_index],
3538                                    self.symbol_pixel_counts[symbol_index],
3539                                );
3540                            }
3541                            if result.total_err <= compare_max_err.saturating_add(4)
3542                                && result.outside_ink_err <= outside_limit
3543                                && score <= score_limit
3544                            {
3545                                record_counterfactual_probe(
3546                                    &mut compare_slack4_from_overlap_compare,
3547                                    trace.page_num,
3548                                    symbol_index,
3549                                    &self.global_symbols[symbol_index],
3550                                    self.symbol_pixel_counts[symbol_index],
3551                                );
3552                            }
3553                        }
3554                    }
3555                    if trace.reason_code() == ResidualReasonCode::UseCountOneGlobalRejectCompare {
3556                        let bucket = family_bucket_key_for_symbol(
3557                            &self.global_symbols[symbol_index],
3558                            &self.symbol_signatures[symbol_index],
3559                        );
3560                        let mut visited = FxHashSet::default();
3561                        let mut best_probe_label = "no_candidates";
3562                        let mut best_total_err = u32::MAX;
3563                        let mut best_total_err_detail: Option<(
3564                            crate::jbig2comparator::CompareResult,
3565                            u32,
3566                            bool,
3567                            bool,
3568                        )> = None;
3569                        if let Some(anchor_map) = &sym_unify_global_anchor_map {
3570                            for neighbor in family_bucket_neighbors(bucket) {
3571                                let Some(candidates) = anchor_map.get(&neighbor) else {
3572                                    continue;
3573                                };
3574                                for &anchor_index in candidates {
3575                                    if anchor_index == symbol_index || !visited.insert(anchor_index)
3576                                    {
3577                                        continue;
3578                                    }
3579
3580                                    let candidate = &self.global_symbols[symbol_index];
3581                                    let proto = &self.global_symbols[anchor_index];
3582                                    let strong_anchor = self.symbol_usage[anchor_index]
3583                                        >= SYM_UNIFY_STRONG_ANCHOR_MIN_USAGE
3584                                        || self.symbol_page_count[anchor_index]
3585                                            >= SYM_UNIFY_STRONG_ANCHOR_MIN_PAGE_SPAN;
3586                                    let compare_max_err = self
3587                                        .config
3588                                        .sym_unify_max_err
3589                                        .max(4)
3590                                        .saturating_add(u32::from(strong_anchor));
3591                                    let outside_limit = self
3592                                        .config
3593                                        .sym_unify_max_border_outside_ink
3594                                        .min(1)
3595                                        .saturating_add(u32::from(strong_anchor));
3596                                    let relaxed = overlap_compare_probe_comparator
3597                                        .compare_for_symbol_unify(
3598                                            candidate,
3599                                            proto,
3600                                            relaxed_compare_probe_max_err(candidate, proto),
3601                                            self.config.sym_unify_max_dx.max(0),
3602                                            self.config.sym_unify_max_dy.max(0),
3603                                        );
3604                                    let (label, total_err) = if let Some(result) = relaxed {
3605                                        let score = Self::symbol_unify_assignment_score(&result);
3606                                        let score_limit = self.config.sym_unify_class_accept_limit
3607                                            + u32::from(strong_anchor);
3608                                        let label = if result.total_err <= compare_max_err {
3609                                            if result.outside_ink_err > outside_limit {
3610                                                "outside_ink"
3611                                            } else if score > score_limit {
3612                                                "score"
3613                                            } else {
3614                                                "accept"
3615                                            }
3616                                        } else if result.outside_ink_err > outside_limit {
3617                                            "total_err+outside_ink"
3618                                        } else {
3619                                            "total_err"
3620                                        };
3621                                        (label, result.total_err)
3622                                    } else {
3623                                        ("relaxed_none", u32::MAX)
3624                                    };
3625
3626                                    if total_err < best_total_err {
3627                                        best_total_err = total_err;
3628                                        best_probe_label = label;
3629                                        if let Some(result) = relaxed {
3630                                            let exact_dims = candidate.width == proto.width
3631                                                && candidate.height == proto.height;
3632                                            best_total_err_detail = Some((
3633                                                result,
3634                                                compare_max_err,
3635                                                exact_dims,
3636                                                strong_anchor,
3637                                            ));
3638                                        } else {
3639                                            best_total_err_detail = None;
3640                                        }
3641                                    } else if best_total_err == u32::MAX
3642                                        && best_probe_label == "no_candidates"
3643                                    {
3644                                        best_probe_label = label;
3645                                    }
3646                                }
3647                            }
3648                        }
3649
3650                        record_labeled_counterfactual_probe(
3651                            &mut overlap_compare_probe_outcomes,
3652                            best_probe_label,
3653                            trace.page_num,
3654                            symbol_index,
3655                            &self.global_symbols[symbol_index],
3656                            self.symbol_pixel_counts[symbol_index],
3657                        );
3658                        if best_probe_label == "total_err"
3659                            && let Some((result, compare_max_err, exact_dims, strong_anchor)) =
3660                                best_total_err_detail
3661                        {
3662                            let outside_limit = self
3663                                .config
3664                                .sym_unify_max_border_outside_ink
3665                                .min(1)
3666                                .saturating_add(u32::from(strong_anchor));
3667                            let score_limit =
3668                                self.config.sym_unify_class_accept_limit + u32::from(strong_anchor);
3669                            let score = Self::symbol_unify_assignment_score(&result);
3670                            record_detailed_compare_probe(
3671                                &mut global_compare_total_err_details,
3672                                trace.page_num,
3673                                symbol_index,
3674                                &self.global_symbols[symbol_index],
3675                                result,
3676                                compare_max_err,
3677                                exact_dims,
3678                                strong_anchor,
3679                            );
3680                            if result.total_err <= compare_max_err.saturating_add(2)
3681                                && result.outside_ink_err <= outside_limit
3682                                && score <= score_limit
3683                            {
3684                                record_counterfactual_probe(
3685                                    &mut compare_slack2_from_global_compare,
3686                                    trace.page_num,
3687                                    symbol_index,
3688                                    &self.global_symbols[symbol_index],
3689                                    self.symbol_pixel_counts[symbol_index],
3690                                );
3691                            }
3692                            if result.total_err <= compare_max_err.saturating_add(4)
3693                                && result.outside_ink_err <= outside_limit
3694                                && score <= score_limit
3695                            {
3696                                record_counterfactual_probe(
3697                                    &mut compare_slack4_from_global_compare,
3698                                    trace.page_num,
3699                                    symbol_index,
3700                                    &self.global_symbols[symbol_index],
3701                                    self.symbol_pixel_counts[symbol_index],
3702                                );
3703                            }
3704                        }
3705                    }
3706                    continue;
3707                }
3708
3709                let bucket = family_bucket_key_for_symbol(
3710                    &self.global_symbols[symbol_index],
3711                    &self.symbol_signatures[symbol_index],
3712                );
3713                let mut visited = FxHashSet::default();
3714                let mut found_current = false;
3715                let mut found_dim2 = false;
3716                'cross_page_search: for neighbor in family_bucket_neighbors(bucket) {
3717                    let Some(candidates) = all_symbol_bucket_map.get(&neighbor) else {
3718                        continue;
3719                    };
3720                    for &candidate_index in candidates {
3721                        if candidate_index == symbol_index
3722                            || !visited.insert(candidate_index)
3723                            || symbol_home_page[candidate_index] == trace.page_num
3724                        {
3725                            continue;
3726                        }
3727                        if self.residual_symbol_matches_anchor(
3728                            symbol_index,
3729                            candidate_index,
3730                            &mut cross_page_comparator,
3731                        ) {
3732                            found_current = true;
3733                            break 'cross_page_search;
3734                        }
3735                        if self.residual_symbol_accept_with_dim_limit(
3736                            symbol_index,
3737                            candidate_index,
3738                            &mut cross_page_comparator,
3739                            2,
3740                        ) {
3741                            found_dim2 = true;
3742                        }
3743                    }
3744                }
3745
3746                if found_current {
3747                    record_counterfactual_probe(
3748                        &mut local_dim_cross_page_current,
3749                        trace.page_num,
3750                        symbol_index,
3751                        &self.global_symbols[symbol_index],
3752                        self.symbol_pixel_counts[symbol_index],
3753                    );
3754                } else if found_dim2 {
3755                    record_counterfactual_probe(
3756                        &mut local_dim_cross_page_dim2,
3757                        trace.page_num,
3758                        symbol_index,
3759                        &self.global_symbols[symbol_index],
3760                        self.symbol_pixel_counts[symbol_index],
3761                    );
3762                }
3763            }
3764            self.state.decision_debug_lines.push(format!(
3765                "sym_unify cross-page local-dim probes: current_symbols={} current_bitmap_proxy_bytes={} current_pages={} dim2_only_symbols={} dim2_only_bitmap_proxy_bytes={} dim2_only_pages={}",
3766                local_dim_cross_page_current.symbol_count,
3767                local_dim_cross_page_current.bitmap_proxy_bytes,
3768                local_dim_cross_page_current.pages.len(),
3769                local_dim_cross_page_dim2.symbol_count,
3770                local_dim_cross_page_dim2.bitmap_proxy_bytes,
3771                local_dim_cross_page_dim2.pages.len(),
3772            ));
3773            if !local_dim_cross_page_current.samples.is_empty() {
3774                self.state.decision_debug_lines.push(format!(
3775                    "sym_unify cross-page local-dim current sample: {:?}",
3776                    local_dim_cross_page_current.samples
3777                ));
3778            }
3779            if !local_dim_cross_page_dim2.samples.is_empty() {
3780                self.state.decision_debug_lines.push(format!(
3781                    "sym_unify cross-page local-dim dim2 sample: {:?}",
3782                    local_dim_cross_page_dim2.samples
3783                ));
3784            }
3785
3786            let mut sorted_overlap_bypass_outcomes: Vec<_> =
3787                overlap_bypass_outcomes.into_iter().collect();
3788            sorted_overlap_bypass_outcomes.sort_by(|lhs, rhs| {
3789                rhs.1
3790                    .bitmap_proxy_bytes
3791                    .cmp(&lhs.1.bitmap_proxy_bytes)
3792                    .then_with(|| rhs.1.symbol_count.cmp(&lhs.1.symbol_count))
3793                    .then_with(|| lhs.0.cmp(rhs.0))
3794            });
3795            let overlap_bypass_total_symbols: usize = sorted_overlap_bypass_outcomes
3796                .iter()
3797                .map(|(_, stats)| stats.symbol_count)
3798                .sum();
3799            let overlap_bypass_total_bitmap_proxy_bytes: usize = sorted_overlap_bypass_outcomes
3800                .iter()
3801                .map(|(_, stats)| stats.bitmap_proxy_bytes)
3802                .sum();
3803            self.state.decision_debug_lines.push(format!(
3804                "sym_unify overlap-bypass outcomes: outcomes={} symbols={} bitmap_proxy_bytes={}",
3805                sorted_overlap_bypass_outcomes.len(),
3806                overlap_bypass_total_symbols,
3807                overlap_bypass_total_bitmap_proxy_bytes,
3808            ));
3809            for (label, stats) in sorted_overlap_bypass_outcomes {
3810                self.state.decision_debug_lines.push(format!(
3811                    "  overlap-bypass {}: symbols={} pages={} black_pixels={} bitmap_proxy_bytes={} sample={:?}",
3812                    label,
3813                    stats.symbol_count,
3814                    stats.pages.len(),
3815                    stats.black_pixels,
3816                    stats.bitmap_proxy_bytes,
3817                    stats.samples
3818                ));
3819            }
3820
3821            let mut sorted_overlap_compare_probe_outcomes: Vec<_> =
3822                overlap_compare_probe_outcomes.into_iter().collect();
3823            sorted_overlap_compare_probe_outcomes.sort_by(|lhs, rhs| {
3824                rhs.1
3825                    .bitmap_proxy_bytes
3826                    .cmp(&lhs.1.bitmap_proxy_bytes)
3827                    .then_with(|| rhs.1.symbol_count.cmp(&lhs.1.symbol_count))
3828                    .then_with(|| lhs.0.cmp(rhs.0))
3829            });
3830            let overlap_compare_probe_total_symbols: usize = sorted_overlap_compare_probe_outcomes
3831                .iter()
3832                .map(|(_, stats)| stats.symbol_count)
3833                .sum();
3834            let overlap_compare_probe_total_bitmap_proxy_bytes: usize =
3835                sorted_overlap_compare_probe_outcomes
3836                    .iter()
3837                    .map(|(_, stats)| stats.bitmap_proxy_bytes)
3838                    .sum();
3839            self.state.decision_debug_lines.push(format!(
3840                "sym_unify global-compare relaxed probe: outcomes={} symbols={} bitmap_proxy_bytes={}",
3841                sorted_overlap_compare_probe_outcomes.len(),
3842                overlap_compare_probe_total_symbols,
3843                overlap_compare_probe_total_bitmap_proxy_bytes,
3844            ));
3845            for (label, stats) in sorted_overlap_compare_probe_outcomes {
3846                self.state.decision_debug_lines.push(format!(
3847                    "  global-compare relaxed {}: symbols={} pages={} black_pixels={} bitmap_proxy_bytes={} sample={:?}",
3848                    label,
3849                    stats.symbol_count,
3850                    stats.pages.len(),
3851                    stats.black_pixels,
3852                    stats.bitmap_proxy_bytes,
3853                    stats.samples
3854                ));
3855            }
3856            self.state.decision_debug_lines.push(format!(
3857                "sym_unify overlap-bypass compare total_err detail: symbols={} bitmap_proxy_bytes={} exact_dims={} strong_anchor={} shift_le1={} over_by_le2={} over_by_le4={} over_by_le8={} over_by_gt8={} sample={:?}",
3858                overlap_bypass_compare_total_err_details.symbol_count,
3859                overlap_bypass_compare_total_err_details.bitmap_proxy_bytes,
3860                overlap_bypass_compare_total_err_details.exact_dims_count,
3861                overlap_bypass_compare_total_err_details.strong_anchor_count,
3862                overlap_bypass_compare_total_err_details.shift_le1_count,
3863                overlap_bypass_compare_total_err_details.over_by_le2_count,
3864                overlap_bypass_compare_total_err_details.over_by_le4_count,
3865                overlap_bypass_compare_total_err_details.over_by_le8_count,
3866                overlap_bypass_compare_total_err_details.over_by_gt8_count,
3867                overlap_bypass_compare_total_err_details.samples
3868            ));
3869            self.state.decision_debug_lines.push(format!(
3870                "sym_unify global-compare total_err detail: symbols={} bitmap_proxy_bytes={} exact_dims={} strong_anchor={} shift_le1={} over_by_le2={} over_by_le4={} over_by_le8={} over_by_gt8={} sample={:?}",
3871                global_compare_total_err_details.symbol_count,
3872                global_compare_total_err_details.bitmap_proxy_bytes,
3873                global_compare_total_err_details.exact_dims_count,
3874                global_compare_total_err_details.strong_anchor_count,
3875                global_compare_total_err_details.shift_le1_count,
3876                global_compare_total_err_details.over_by_le2_count,
3877                global_compare_total_err_details.over_by_le4_count,
3878                global_compare_total_err_details.over_by_le8_count,
3879                global_compare_total_err_details.over_by_gt8_count,
3880                global_compare_total_err_details.samples
3881            ));
3882            self.state.decision_debug_lines.push(format!(
3883                "sym_unify compare-slack probes: global_total_err_slack2_symbols={} global_total_err_slack2_bitmap_proxy_bytes={} global_total_err_slack4_symbols={} global_total_err_slack4_bitmap_proxy_bytes={} overlap_compare_slack2_symbols={} overlap_compare_slack2_bitmap_proxy_bytes={} overlap_compare_slack4_symbols={} overlap_compare_slack4_bitmap_proxy_bytes={}",
3884                compare_slack2_from_global_compare.symbol_count,
3885                compare_slack2_from_global_compare.bitmap_proxy_bytes,
3886                compare_slack4_from_global_compare.symbol_count,
3887                compare_slack4_from_global_compare.bitmap_proxy_bytes,
3888                compare_slack2_from_overlap_compare.symbol_count,
3889                compare_slack2_from_overlap_compare.bitmap_proxy_bytes,
3890                compare_slack4_from_overlap_compare.symbol_count,
3891                compare_slack4_from_overlap_compare.bitmap_proxy_bytes
3892            ));
3893        }
3894        if diagnostics_enabled {
3895            for (page_num, residuals) in page_residual_symbols.iter().enumerate().take(32) {
3896                if !residuals.is_empty() {
3897                    self.state.decision_debug_lines.push(format!(
3898                        "page {} residual symbols: count={} sample={:?}",
3899                        page_num + 1,
3900                        residuals.len(),
3901                        &residuals[..residuals.len().min(8)]
3902                    ));
3903                }
3904            }
3905        }
3906
3907        self.validate_symbol_partition(
3908            &global_symbol_indices,
3909            &page_local_symbols,
3910            &page_residual_symbols,
3911            &page_residual_anchor_remaps,
3912            &page_uses_generic_region,
3913        )?;
3914
3915        let mut current_segment_number = self.next_segment_number;
3916        let mut global_segments = Vec::new();
3917
3918        self.global_dict_segment_numbers.clear();
3919        let mut encoded_global_dict = EncodedSymbolDictionary::default();
3920        let mut global_refinement_map = vec![None; self.global_symbols.len()];
3921        if !global_symbol_indices.is_empty() {
3922            let refs: Vec<&BitImage> = global_symbol_indices
3923                .iter()
3924                .map(|&i| &self.global_symbols[i])
3925                .collect();
3926            let dict_usage: Vec<usize> = global_symbol_indices
3927                .iter()
3928                .map(|&i| self.symbol_usage[i])
3929                .collect();
3930            let dict_layout =
3931                plan_symbol_dictionary_layout(&refs, &self.config, Some(&dict_usage))?;
3932            if diagnostics_enabled {
3933                self.state.decision_debug_lines.push(format!(
3934                    "global dict layout: families={} singletons={} refined_members={} exported_members={}",
3935                    dict_layout.diagnostics.family_count,
3936                    dict_layout.diagnostics.singleton_family_count,
3937                    dict_layout.diagnostics.refined_member_count,
3938                    dict_layout.diagnostics.exported_member_count
3939                ));
3940                self.state.decision_debug_lines.extend(
3941                    dict_layout
3942                        .diagnostics
3943                        .sample_lines
3944                        .iter()
3945                        .take(64)
3946                        .cloned(),
3947                );
3948            }
3949            let dict_start = Instant::now();
3950            encoded_global_dict =
3951                encode_symbol_dictionary_segments(&refs, &self.config, &dict_layout)?;
3952            self.metrics.symbol_mode.symbol_dict_encoding += dict_start.elapsed();
3953            for (subset_index, refinement) in dict_layout.refinements.iter().enumerate() {
3954                if let Some(refinement) = refinement {
3955                    let gs_idx = global_symbol_indices[subset_index];
3956                    global_refinement_map[gs_idx] = Some(RefinementPlan {
3957                        prototype_input_index: global_symbol_indices
3958                            [refinement.prototype_input_index],
3959                        refinement_dx: refinement.refinement_dx,
3960                        refinement_dy: refinement.refinement_dy,
3961                    });
3962                }
3963            }
3964            let segment_number = current_segment_number;
3965            current_segment_number += 1;
3966            self.global_dict_segment_numbers.push(segment_number);
3967            global_segments.push(Segment {
3968                number: segment_number,
3969                seg_type: SegmentType::SymbolDictionary,
3970                deferred_non_retain: false,
3971                retain_flags: 0,
3972                page_association_type: 2,
3973                referred_to: Vec::new(),
3974                page: None,
3975                payload: encoded_global_dict.payload.clone(),
3976            });
3977        }
3978
3979        let mut global_sym_to_dict_pos = vec![u32::MAX; self.global_symbols.len()];
3980        for (refs_idx, &dict_pos) in encoded_global_dict.input_to_exported_pos.iter().enumerate() {
3981            if dict_pos != u32::MAX {
3982                let gs_idx = global_symbol_indices[refs_idx];
3983                global_sym_to_dict_pos[gs_idx] = dict_pos;
3984            }
3985        }
3986        let num_global_dict_symbols = encoded_global_dict.exported_symbol_count;
3987
3988        let mut planned_local_export_count = 0usize;
3989        self.metrics.symbol_stats.global_symbol_count = num_global_dict_symbols as usize;
3990
3991        let page_segment_start = current_segment_number;
3992        let mut page_layouts = Vec::with_capacity(self.pages.len());
3993        for (page_num, page) in self.pages.iter().enumerate() {
3994            let page_number = if self.state.pdf_mode {
3995                1u32
3996            } else {
3997                page_num as u32 + 1
3998            };
3999            if self.state.pdf_mode {
4000                current_segment_number = page_segment_start;
4001            }
4002            let page_info_segment_number = current_segment_number;
4003            current_segment_number += 1;
4004            let local_dict_layout = if self.config.symbol_mode
4005                && !page.symbol_instances.is_empty()
4006                && !page_local_symbols[page_num].is_empty()
4007            {
4008                let refs: Vec<&BitImage> = page_local_symbols[page_num]
4009                    .iter()
4010                    .map(|&i| &self.global_symbols[i])
4011                    .collect();
4012                let mut local_usage = vec![0usize; page_local_symbols[page_num].len()];
4013                let local_index_by_symbol: HashMap<usize, usize> = page_local_symbols[page_num]
4014                    .iter()
4015                    .enumerate()
4016                    .map(|(idx, &symbol_index)| (symbol_index, idx))
4017                    .collect();
4018                for instance in &page.symbol_instances {
4019                    if let Some(&local_idx) = local_index_by_symbol.get(&instance.symbol_index) {
4020                        local_usage[local_idx] += 1;
4021                    }
4022                }
4023                Some(plan_symbol_dictionary_layout(
4024                    &refs,
4025                    &self.config,
4026                    Some(&local_usage),
4027                )?)
4028            } else {
4029                None
4030            };
4031            let mut local_dict_segment_numbers = Vec::new();
4032            if let Some(local_dict_layout) = &local_dict_layout {
4033                if diagnostics_enabled {
4034                    self.state.decision_debug_lines.push(format!(
4035                        "page {} local dict layout: families={} singletons={} refined_members={} exported_members={}",
4036                        page_num + 1,
4037                        local_dict_layout.diagnostics.family_count,
4038                        local_dict_layout.diagnostics.singleton_family_count,
4039                        local_dict_layout.diagnostics.refined_member_count,
4040                        local_dict_layout.diagnostics.exported_member_count
4041                    ));
4042                    self.state.decision_debug_lines.extend(
4043                        local_dict_layout
4044                            .diagnostics
4045                            .sample_lines
4046                            .iter()
4047                            .take(16)
4048                            .cloned(),
4049                    );
4050                }
4051                for _ in 0..local_dict_layout.segment_count() {
4052                    local_dict_segment_numbers.push(current_segment_number);
4053                    current_segment_number += 1;
4054                }
4055                planned_local_export_count += local_dict_layout.export_input_indices.len();
4056            }
4057            let region_segment_number = current_segment_number;
4058            current_segment_number += 1;
4059            let has_residual_region = !page_residual_symbols[page_num].is_empty()
4060                && !page_uses_generic_region[page_num]
4061                && page.symbol_instances.iter().any(|inst| {
4062                    global_set.contains(&inst.symbol_index)
4063                        || page_local_symbols[page_num].contains(&inst.symbol_index)
4064                });
4065            let residual_region_segment_number = if has_residual_region {
4066                let number = current_segment_number;
4067                current_segment_number += 1;
4068                Some(number)
4069            } else {
4070                None
4071            };
4072            let end_of_page_segment_number = current_segment_number;
4073            current_segment_number += 1;
4074            let use_generic_region = page_uses_generic_region[page_num];
4075            if diagnostics_enabled {
4076                self.state.decision_debug_lines.push(format!(
4077                    "page {} plan: full_generic={} residual_region={} local_symbols={} residual_symbols={} anchor_remaps={} instances={}",
4078                    page_num + 1,
4079                    use_generic_region,
4080                    has_residual_region,
4081                    page_local_symbols[page_num].len(),
4082                    page_residual_symbols[page_num].len(),
4083                    page_residual_anchor_remaps[page_num].len(),
4084                    page.symbol_instances.len()
4085                ));
4086            }
4087
4088            page_layouts.push(PlannedPageLayout {
4089                page_index: page_num,
4090                page_number,
4091                page_info_segment_number,
4092                local_dict_segment_numbers,
4093                local_dict_layout,
4094                region_segment_number,
4095                residual_region_segment_number,
4096                end_of_page_segment_number,
4097                local_symbols: page_local_symbols[page_num].clone(),
4098                residual_symbols: page_residual_symbols[page_num].clone(),
4099                residual_anchor_remaps: page_residual_anchor_remaps[page_num].clone(),
4100                use_generic_region,
4101            });
4102        }
4103
4104        self.metrics.symbol_stats.local_symbol_count = planned_local_export_count;
4105        self.metrics.symbol_stats.symbols_exported =
4106            self.metrics.symbol_stats.global_symbol_count + planned_local_export_count;
4107        self.metrics.symbol_stats.avg_symbol_reuse =
4108            if self.metrics.symbol_stats.symbols_exported > 0 {
4109                self.symbol_usage.iter().sum::<usize>() as f64
4110                    / self.metrics.symbol_stats.symbols_exported as f64
4111            } else {
4112                0.0
4113            };
4114
4115        self.metrics.symbol_mode.planning += planning_start.elapsed();
4116
4117        #[cfg(feature = "parallel")]
4118        let built_pages = if self.state.pdf_mode || self.pages.len() > 1 {
4119            page_layouts
4120                .par_iter()
4121                .map(|layout| {
4122                    self.build_planned_page(
4123                        layout,
4124                        &global_sym_to_dict_pos,
4125                        num_global_dict_symbols,
4126                        &global_refinement_map,
4127                    )
4128                })
4129                .collect::<Vec<_>>()
4130        } else {
4131            page_layouts
4132                .iter()
4133                .map(|layout| {
4134                    self.build_planned_page(
4135                        layout,
4136                        &global_sym_to_dict_pos,
4137                        num_global_dict_symbols,
4138                        &global_refinement_map,
4139                    )
4140                })
4141                .collect::<Vec<_>>()
4142        };
4143
4144        #[cfg(not(feature = "parallel"))]
4145        let built_pages = page_layouts
4146            .iter()
4147            .map(|layout| {
4148                self.build_planned_page(
4149                    layout,
4150                    &global_sym_to_dict_pos,
4151                    num_global_dict_symbols,
4152                    &global_refinement_map,
4153                )
4154            })
4155            .collect::<Vec<_>>();
4156
4157        let mut pages = Vec::with_capacity(built_pages.len());
4158        for built_page in built_pages {
4159            let built_page = built_page?;
4160            self.metrics.symbol_mode.symbol_dict_encoding += built_page.symbol_dict_time;
4161            self.metrics.symbol_mode.text_region_encoding += built_page.text_region_time;
4162            self.metrics.symbol_mode.generic_region_encoding += built_page.generic_region_time;
4163            pages.push(built_page.page);
4164        }
4165
4166        let eof_segment = Some(Segment {
4167            number: current_segment_number,
4168            seg_type: SegmentType::EndOfFile,
4169            deferred_non_retain: false,
4170            retain_flags: 0,
4171            page_association_type: 2,
4172            referred_to: vec![],
4173            page: None,
4174            payload: vec![],
4175        });
4176        current_segment_number += 1;
4177
4178        Ok(PlannedDocument {
4179            file_header: if include_header {
4180                Some(FileHeader {
4181                    organisation_type: true,
4182                    unknown_n_pages: false,
4183                    n_pages: self.pages.len() as u32,
4184                })
4185            } else {
4186                None
4187            },
4188            global_segments,
4189            pages,
4190            eof_segment,
4191            next_segment_number: current_segment_number,
4192        })
4193    }
4194
4195    fn build_planned_page(
4196        &self,
4197        layout: &PlannedPageLayout,
4198        global_sym_to_dict_pos: &[u32],
4199        num_global_dict_symbols: u32,
4200        global_refinement_map: &[Option<RefinementPlan>],
4201    ) -> Result<BuiltPage> {
4202        let page = &self.pages[layout.page_index];
4203        let mut page_segments = Vec::new();
4204        let mut symbol_dict_time = Duration::default();
4205        let mut text_region_time = Duration::default();
4206        let mut generic_region_time = Duration::default();
4207
4208        let page_info_payload = PageInfo {
4209            width: page.image.width as u32,
4210            height: page.image.height as u32,
4211            default_pixel: false,
4212            xres: self.config.generic.dpi,
4213            yres: self.config.generic.dpi,
4214            ..Default::default()
4215        }
4216        .to_bytes();
4217
4218        page_segments.push(Segment {
4219            number: layout.page_info_segment_number,
4220            seg_type: SegmentType::PageInformation,
4221            deferred_non_retain: false,
4222            retain_flags: 0,
4223            page_association_type: 0,
4224            referred_to: vec![],
4225            page: Some(layout.page_number),
4226            payload: page_info_payload,
4227        });
4228
4229        if self.config.symbol_mode
4230            && !page.symbol_instances.is_empty()
4231            && !layout.use_generic_region
4232        {
4233            let mut referred_to_for_text_region = self.global_dict_segment_numbers.clone();
4234            let residual_set: HashSet<usize> = layout.residual_symbols.iter().copied().collect();
4235            let residual_anchor_remaps = &layout.residual_anchor_remaps;
4236
4237            let mut local_sym_to_dict_pos = vec![u32::MAX; self.global_symbols.len()];
4238            let mut local_refinement_map = vec![None; self.global_symbols.len()];
4239            let num_local_dict_symbols = if let Some(local_dict_layout) = &layout.local_dict_layout
4240            {
4241                let refs: Vec<&BitImage> = layout
4242                    .local_symbols
4243                    .iter()
4244                    .map(|&i| &self.global_symbols[i])
4245                    .collect();
4246                let dict_start = Instant::now();
4247                let encoded_local_dict =
4248                    encode_symbol_dictionary_segments(&refs, self.config, local_dict_layout)?;
4249                symbol_dict_time += dict_start.elapsed();
4250
4251                for (refs_idx, &dict_pos) in
4252                    encoded_local_dict.input_to_exported_pos.iter().enumerate()
4253                {
4254                    if dict_pos != u32::MAX {
4255                        let gs_idx = layout.local_symbols[refs_idx];
4256                        local_sym_to_dict_pos[gs_idx] = dict_pos;
4257                    }
4258                }
4259                for (subset_index, refinement) in local_dict_layout.refinements.iter().enumerate() {
4260                    if let Some(refinement) = refinement {
4261                        let gs_idx = layout.local_symbols[subset_index];
4262                        local_refinement_map[gs_idx] = Some(RefinementPlan {
4263                            prototype_input_index: layout.local_symbols
4264                                [refinement.prototype_input_index],
4265                            refinement_dx: refinement.refinement_dx,
4266                            refinement_dy: refinement.refinement_dy,
4267                        });
4268                    }
4269                }
4270
4271                for segment_number in layout.local_dict_segment_numbers.iter().copied() {
4272                    page_segments.push(Segment {
4273                        number: segment_number,
4274                        seg_type: SegmentType::SymbolDictionary,
4275                        deferred_non_retain: false,
4276                        retain_flags: 0,
4277                        page_association_type: 0,
4278                        referred_to: Vec::new(),
4279                        page: Some(layout.page_number),
4280                        payload: encoded_local_dict.payload.clone(),
4281                    });
4282                }
4283                referred_to_for_text_region
4284                    .extend(layout.local_dict_segment_numbers.iter().copied());
4285                encoded_local_dict.exported_symbol_count
4286            } else {
4287                0
4288            };
4289
4290            let mut planned_instances = Vec::with_capacity(page.symbol_instances.len());
4291            let mut residual_instances = Vec::new();
4292            for instance in &page.symbol_instances {
4293                if let Some(&anchor_index) = residual_anchor_remaps.get(&instance.symbol_index) {
4294                    let mut remapped = instance.clone();
4295                    remapped.symbol_index = anchor_index;
4296                    remapped.needs_refinement = false;
4297                    remapped.refinement_dx = 0;
4298                    remapped.refinement_dy = 0;
4299                    planned_instances.push(remapped);
4300                } else if residual_set.contains(&instance.symbol_index) {
4301                    residual_instances.push(instance.clone());
4302                } else {
4303                    planned_instances.push(instance.clone());
4304                }
4305            }
4306            let mut needs_family_refinement = false;
4307            for instance in &mut planned_instances {
4308                if let Some(refinement) = local_refinement_map[instance.symbol_index] {
4309                    instance.symbol_index = refinement.prototype_input_index;
4310                    instance.needs_refinement = true;
4311                    instance.refinement_dx = refinement.refinement_dx;
4312                    instance.refinement_dy = refinement.refinement_dy;
4313                    needs_family_refinement = true;
4314                } else if let Some(refinement) = global_refinement_map[instance.symbol_index] {
4315                    instance.symbol_index = refinement.prototype_input_index;
4316                    instance.needs_refinement = true;
4317                    instance.refinement_dx = refinement.refinement_dx;
4318                    instance.refinement_dy = refinement.refinement_dy;
4319                    needs_family_refinement = true;
4320                }
4321            }
4322
4323            if !planned_instances.is_empty() {
4324                let text_start = Instant::now();
4325                let has_instance_refinement = planned_instances
4326                    .iter()
4327                    .any(|instance| instance.needs_refinement);
4328                let use_refinement_text_region = has_instance_refinement
4329                    || (!self.config.uses_lossy_symbol_dictionary()
4330                        && (self.config.text_refine
4331                            || self.config.refine
4332                            || needs_family_refinement));
4333                let region_payload = if use_refinement_text_region {
4334                    encode_text_region_with_refinement(
4335                        &planned_instances,
4336                        self.config,
4337                        &self.global_symbols,
4338                        global_sym_to_dict_pos,
4339                        num_global_dict_symbols,
4340                        &local_sym_to_dict_pos,
4341                        num_local_dict_symbols,
4342                    )?
4343                } else {
4344                    encode_text_region_mapped(
4345                        &planned_instances,
4346                        self.config,
4347                        &self.global_symbols,
4348                        global_sym_to_dict_pos,
4349                        num_global_dict_symbols,
4350                        &local_sym_to_dict_pos,
4351                        layout.page_index,
4352                        num_local_dict_symbols,
4353                    )?
4354                };
4355                text_region_time += text_start.elapsed();
4356
4357                page_segments.push(Segment {
4358                    number: layout.region_segment_number,
4359                    seg_type: SegmentType::ImmediateTextRegion,
4360                    deferred_non_retain: false,
4361                    retain_flags: 0,
4362                    page_association_type: 0,
4363                    referred_to: referred_to_for_text_region,
4364                    page: Some(layout.page_number),
4365                    payload: region_payload,
4366                });
4367            }
4368
4369            if let Some((residual_bitmap, residual_x, residual_y)) =
4370                self.build_instance_residual_bitmap(&residual_instances)?
4371            {
4372                let generic_start = Instant::now();
4373                let residual_payload = self.encode_generic_region_payload_at(
4374                    &residual_bitmap,
4375                    residual_x,
4376                    residual_y,
4377                )?;
4378                generic_region_time += generic_start.elapsed();
4379                page_segments.push(Segment {
4380                    number: layout
4381                        .residual_region_segment_number
4382                        .unwrap_or(layout.region_segment_number),
4383                    seg_type: SegmentType::ImmediateGenericRegion,
4384                    deferred_non_retain: false,
4385                    retain_flags: 0,
4386                    page_association_type: 0,
4387                    referred_to: Vec::new(),
4388                    page: Some(layout.page_number),
4389                    payload: residual_payload,
4390                });
4391            }
4392        } else {
4393            let generic_start = Instant::now();
4394            let generic_region_payload =
4395                self.encode_generic_region_payload_at(&page.image, 0, 0)?;
4396            generic_region_time += generic_start.elapsed();
4397
4398            page_segments.push(Segment {
4399                number: layout.region_segment_number,
4400                seg_type: SegmentType::ImmediateGenericRegion,
4401                deferred_non_retain: false,
4402                retain_flags: 0,
4403                page_association_type: 0,
4404                referred_to: Vec::new(),
4405                page: Some(layout.page_number),
4406                payload: generic_region_payload,
4407            });
4408        }
4409
4410        page_segments.push(Segment {
4411            number: layout.end_of_page_segment_number,
4412            seg_type: SegmentType::EndOfPage,
4413            deferred_non_retain: false,
4414            retain_flags: 0,
4415            page_association_type: 0,
4416            referred_to: Vec::new(),
4417            page: Some(layout.page_number),
4418            payload: Vec::new(),
4419        });
4420
4421        Ok(BuiltPage {
4422            page: PlannedPage {
4423                page_number: layout.page_number,
4424                segments: page_segments,
4425            },
4426            symbol_dict_time,
4427            text_region_time,
4428            generic_region_time,
4429        })
4430    }
4431
4432    fn validate_plan(&self, plan: &PlannedDocument) -> Result<()> {
4433        let mut global_numbers = HashSet::new();
4434
4435        for seg in &plan.global_segments {
4436            if !global_numbers.insert(seg.number) {
4437                anyhow::bail!("Duplicate segment number in globals: {}", seg.number);
4438            }
4439        }
4440
4441        for (page_idx, page) in plan.pages.iter().enumerate() {
4442            // In PDF mode, each page is an independent stream, so segment
4443            // numbers only need to be unique within globals + that page.
4444            let mut page_numbers = global_numbers.clone();
4445            for seg in &page.segments {
4446                if !page_numbers.insert(seg.number) {
4447                    anyhow::bail!(
4448                        "Duplicate segment number {} on page {}",
4449                        seg.number,
4450                        page_idx
4451                    );
4452                }
4453            }
4454
4455            for seg in &page.segments {
4456                for referred in &seg.referred_to {
4457                    if !page_numbers.contains(referred) {
4458                        anyhow::bail!(
4459                            "Page {} segment {} refers to missing segment {}",
4460                            page.page_number,
4461                            seg.number,
4462                            referred
4463                        );
4464                    }
4465                    if global_numbers.contains(referred) && plan.global_segments.is_empty() {
4466                        anyhow::bail!(
4467                            "Page {} segment {} refers to global {} but no globals stream exists",
4468                            page.page_number,
4469                            seg.number,
4470                            referred
4471                        );
4472                    }
4473                }
4474            }
4475        }
4476
4477        if let Some(eof) = &plan.eof_segment {
4478            if global_numbers.contains(&eof.number) {
4479                anyhow::bail!("EOF segment number {} conflicts with globals", eof.number);
4480            }
4481        }
4482
4483        for seg in &plan.global_segments {
4484            for referred in &seg.referred_to {
4485                if !global_numbers.contains(referred) {
4486                    anyhow::bail!(
4487                        "Global segment {} refers to missing segment {}",
4488                        seg.number,
4489                        referred
4490                    );
4491                }
4492            }
4493        }
4494
4495        Ok(())
4496    }
4497
4498    fn serialize_full_document(&self, plan: &PlannedDocument) -> Result<Vec<u8>> {
4499        let mut output = Vec::new();
4500        if let Some(header) = &plan.file_header {
4501            output.extend(header.to_bytes());
4502        }
4503        for seg in &plan.global_segments {
4504            seg.write_into(&mut output)?;
4505        }
4506        for page in &plan.pages {
4507            for seg in &page.segments {
4508                seg.write_into(&mut output)?;
4509            }
4510        }
4511        if let Some(eof) = &plan.eof_segment {
4512            eof.write_into(&mut output)?;
4513        }
4514        Ok(output)
4515    }
4516
4517    fn serialize_pdf_split(
4518        &self,
4519        plan: &PlannedDocument,
4520    ) -> Result<(
4521        Option<Vec<u8>>,
4522        Vec<Vec<u8>>,
4523        Vec<usize>,
4524        Vec<usize>,
4525        Vec<usize>,
4526    )> {
4527        let global_segments = if plan.global_segments.is_empty() {
4528            None
4529        } else {
4530            let mut out = Vec::new();
4531            for seg in &plan.global_segments {
4532                seg.write_into(&mut out)?;
4533            }
4534            Some(out)
4535        };
4536
4537        #[cfg(feature = "parallel")]
4538        let page_streams = plan
4539            .pages
4540            .par_iter()
4541            .map(|page| {
4542                let mut page_out = Vec::new();
4543                let mut local_dict_bytes = 0usize;
4544                let mut text_region_bytes = 0usize;
4545                let mut generic_region_bytes = 0usize;
4546                for seg in &page.segments {
4547                    let start_len = page_out.len();
4548                    seg.write_into(&mut page_out)?;
4549                    let seg_len = page_out.len().saturating_sub(start_len);
4550                    match seg.seg_type {
4551                        SegmentType::SymbolDictionary => local_dict_bytes += seg_len,
4552                        SegmentType::ImmediateTextRegion => text_region_bytes += seg_len,
4553                        SegmentType::ImmediateGenericRegion => generic_region_bytes += seg_len,
4554                        _ => {}
4555                    }
4556                }
4557                Ok((
4558                    page_out,
4559                    local_dict_bytes,
4560                    text_region_bytes,
4561                    generic_region_bytes,
4562                ))
4563            })
4564            .collect::<Vec<Result<(Vec<u8>, usize, usize, usize)>>>()
4565            .into_iter()
4566            .collect::<Result<Vec<_>>>()?;
4567
4568        #[cfg(not(feature = "parallel"))]
4569        let page_streams = {
4570            let mut page_streams = Vec::with_capacity(plan.pages.len());
4571            for page in &plan.pages {
4572                let mut page_out = Vec::new();
4573                let mut local_dict_bytes = 0usize;
4574                let mut text_region_bytes = 0usize;
4575                let mut generic_region_bytes = 0usize;
4576                for seg in &page.segments {
4577                    let start_len = page_out.len();
4578                    seg.write_into(&mut page_out)?;
4579                    let seg_len = page_out.len().saturating_sub(start_len);
4580                    match seg.seg_type {
4581                        SegmentType::SymbolDictionary => local_dict_bytes += seg_len,
4582                        SegmentType::ImmediateTextRegion => text_region_bytes += seg_len,
4583                        SegmentType::ImmediateGenericRegion => generic_region_bytes += seg_len,
4584                        _ => {}
4585                    }
4586                }
4587                page_streams.push((
4588                    page_out,
4589                    local_dict_bytes,
4590                    text_region_bytes,
4591                    generic_region_bytes,
4592                ));
4593            }
4594            page_streams
4595        };
4596
4597        let mut raw_pages = Vec::with_capacity(page_streams.len());
4598        let mut local_dict_bytes_per_page = Vec::with_capacity(page_streams.len());
4599        let mut text_region_bytes_per_page = Vec::with_capacity(page_streams.len());
4600        let mut generic_region_bytes_per_page = Vec::with_capacity(page_streams.len());
4601        for (page_out, local_dict_bytes, text_region_bytes, generic_region_bytes) in page_streams {
4602            raw_pages.push(page_out);
4603            local_dict_bytes_per_page.push(local_dict_bytes);
4604            text_region_bytes_per_page.push(text_region_bytes);
4605            generic_region_bytes_per_page.push(generic_region_bytes);
4606        }
4607
4608        Ok((
4609            global_segments,
4610            raw_pages,
4611            local_dict_bytes_per_page,
4612            text_region_bytes_per_page,
4613            generic_region_bytes_per_page,
4614        ))
4615    }
4616
4617    fn prune_symbols_if_needed(&mut self) {
4618        // No pruning — JBIG2 supports large dictionaries and pruning drops
4619        // symbol instances, leaving holes in the rendered output.
4620    }
4621
    /// Cluster similar symbols into groups and select prototypes.
    ///
    /// This is the key optimization for symbol-mode compression. After all pages
    /// have been extracted, we group symbols that look similar (e.g., different
    /// renderings of the letter "e") into clusters. Only one prototype per cluster
    /// is stored in the dictionary. Instances that don't exactly match their
    /// prototype are marked for refinement coding (SPM).
    ///
    /// This replaces the naive O(n²) auto_threshold with a dimension-bucketed
    /// approach that's much faster for large symbol sets.
    ///
    /// Side effects: `self.global_symbols` and `self.symbol_pixel_counts` are
    /// replaced by the prototype-only lists, every page instance has its
    /// `symbol_index` rewritten, and the symbol metadata / hash map are rebuilt.
    ///
    /// The visible body contains no fallible call, so this currently always
    /// returns `Ok(())`.
    fn cluster_symbols(&mut self) -> Result<()> {
        let n = self.global_symbols.len();
        // Fewer than two symbols: nothing can be merged.
        if n < 2 {
            return Ok(());
        }

        // Union-find with path compression and union by rank
        let mut parent: Vec<usize> = (0..n).collect();
        let mut uf_rank: Vec<u32> = vec![0; n];
        let mut comparator = Comparator::default();

        // Group by exact dimensions and compare only neighboring sizes.
        let mut buckets: HashMap<(usize, usize), Vec<usize>> = HashMap::new();
        for (i, sym) in self.global_symbols.iter().enumerate() {
            buckets.entry((sym.height, sym.width)).or_default().push(i);
        }

        // Compare within each bucket and adjacent buckets
        let mut bucket_keys: Vec<(usize, usize)> = buckets.keys().copied().collect();
        bucket_keys.sort_unstable();

        // Tries to merge two symbols into one cluster. The checks run from
        // cheapest to most expensive: same-cluster test, dimension delta,
        // black-pixel-count delta, and finally the bitmap comparator.
        let mut compare_pair = |a_idx: usize, b_idx: usize| {
            // Already in the same cluster: nothing to do.
            if uf_find(&mut parent, a_idx) == uf_find(&mut parent, b_idx) {
                return;
            }

            let a_sym = &self.global_symbols[a_idx];
            let b_sym = &self.global_symbols[b_idx];
            // Maximum tolerated width/height difference (also reused below as
            // the horizontal alignment limit).
            let dim_limit = if self.config.text_refine { 2 } else { 1 };
            if (a_sym.width as i32 - b_sym.width as i32).abs() > dim_limit
                || (a_sym.height as i32 - b_sym.height as i32).abs() > dim_limit
            {
                return;
            }

            // Error budget. With text_refine: the larger of 10% of the heavier
            // symbol's black pixels and 5% of the joint bounding area, clamped
            // to 3..=20. Otherwise: 4% of the area, clamped to 2..=12.
            let area = a_sym.width.max(b_sym.width) * a_sym.height.max(b_sym.height);
            let max_err = if self.config.text_refine {
                ((self.symbol_pixel_counts[a_idx].max(self.symbol_pixel_counts[b_idx]) as f32
                    * 0.10) as u32)
                    .max(((area as f32) * 0.05) as u32)
                    .clamp(3, 20)
            } else {
                ((area as f32 * 0.04) as u32).clamp(2, 12)
            };
            // Cheap reject: the black-pixel counts alone already differ by
            // more than the error budget.
            if self.symbol_pixel_counts[a_idx].abs_diff(self.symbol_pixel_counts[b_idx])
                > max_err as usize
            {
                return;
            }

            // Vertical alignment slack is only allowed when refinement is on.
            let dy_limit = if self.config.text_refine { 1 } else { 0 };
            if let Some(result) =
                comparator.compare_for_refine_family(a_sym, b_sym, max_err, dim_limit, dy_limit)
            {
                let dx = result.dx;
                let dy = result.dy;
                // Only merge when the reported alignment offset stays within
                // the same limits that were handed to the comparator.
                if dx.abs() <= dim_limit && dy.abs() <= dy_limit {
                    uf_union(&mut parent, &mut uf_rank, a_idx, b_idx);
                }
            }
        };

        for &(bh, bw) in &bucket_keys {
            // All unordered pairs inside the bucket (identical dimensions).
            let current_bucket = &buckets[&(bh, bw)];
            for ci in 0..current_bucket.len() {
                for cj in (ci + 1)..current_bucket.len() {
                    compare_pair(current_bucket[ci], current_bucket[cj]);
                }
            }

            // Buckets whose height and width each differ by at most one pixel.
            // NOTE(review): only +/-1 neighbors are visited, so the dim_limit
            // of 2 used under text_refine is never actually reached from
            // here — confirm whether that is intended.
            for dh in -1i32..=1 {
                for dw in -1i32..=1 {
                    let nh = bh as i32 + dh;
                    let nw = bw as i32 + dw;
                    if nh < 0 || nw < 0 {
                        continue;
                    }
                    let neighbor_key = (nh as usize, nw as usize);
                    // Visit each unordered bucket pair once: skip the bucket
                    // itself (handled above) and neighbors that sort earlier.
                    if neighbor_key <= (bh, bw) {
                        continue;
                    }
                    if let Some(neighbor_bucket) = buckets.get(&neighbor_key) {
                        for &a_idx in current_bucket {
                            for &b_idx in neighbor_bucket {
                                compare_pair(a_idx, b_idx);
                            }
                        }
                    }
                }
            }
        }

        // Build cluster groups
        let mut clusters: HashMap<usize, Vec<usize>> = HashMap::new();
        for i in 0..n {
            let root = uf_find(&mut parent, i);
            clusters.entry(root).or_default().push(i);
        }

        // Select prototype deterministically by usage, then black pixels, then original index.
        // Singleton clusters keep the identity mapping set up here.
        let mut old_to_prototype: Vec<usize> = (0..n).collect();
        for (_, members) in &clusters {
            if members.len() <= 1 {
                continue;
            }
            let prototype = self.choose_cluster_prototype(members);
            for &m in members {
                old_to_prototype[m] = prototype;
            }
        }

        // Build new compact symbol list (prototypes only) and index mapping
        // `seen_prototypes` maps an old prototype index to its slot in `new_symbols`.
        let mut seen_prototypes: HashMap<usize, usize> = HashMap::new();
        let mut new_symbols: Vec<BitImage> = Vec::new();
        let mut old_to_new: Vec<usize> = vec![0; n];

        // Process in order so prototype positions are deterministic
        for i in 0..n {
            let proto = old_to_prototype[i];
            if let Some(&new_idx) = seen_prototypes.get(&proto) {
                old_to_new[i] = new_idx;
            } else {
                let new_idx = new_symbols.len();
                new_symbols.push(self.global_symbols[proto].clone());
                seen_prototypes.insert(proto, new_idx);
                old_to_new[i] = new_idx;
            }
        }

        let old_count = n;
        let new_count = new_symbols.len();

        // Remap all instances and mark which ones need refinement
        for page in &mut self.pages {
            for inst in &mut page.symbol_instances {
                // Indexing panics if an instance refers to a symbol >= n.
                let old_idx = inst.symbol_index;
                let new_idx = old_to_new[old_idx];
                let proto = old_to_prototype[old_idx];

                // If this instance's original symbol was NOT the prototype,
                // it needs refinement encoding to preserve quality
                if old_idx != proto {
                    inst.needs_refinement = true;
                    // Compute alignment offset between instance and prototype.
                    // Use a generous error limit (not u32::MAX which overflows in Comparator).
                    let (_, trimmed_inst) = inst.instance_bitmap.trim();
                    let max_ref_err = (trimmed_inst.width * trimmed_inst.height) as u32;
                    // When the comparator reports no match, the instance's
                    // previous refinement_dx / refinement_dy stay untouched.
                    if let Some((_, dx, dy)) =
                        comparator.distance(&trimmed_inst, &new_symbols[new_idx], max_ref_err)
                    {
                        inst.refinement_dx = dx;
                        inst.refinement_dy = dy;
                    }
                }

                inst.symbol_index = new_idx;
            }
        }

        // Replace internal state
        self.global_symbols = new_symbols;
        // Pixel counts are recomputed so they stay parallel to the new list.
        self.symbol_pixel_counts = self
            .global_symbols
            .iter()
            .map(BitImage::count_ones)
            .collect();
        self.rebuild_symbol_metadata();
        self.rebuild_hash_map();

        debug!(
            "Clustering: {} -> {} prototype symbols ({:.1}% reduction)",
            old_count,
            new_count,
            (1.0 - new_count as f64 / old_count.max(1) as f64) * 100.0
        );

        Ok(())
    }
4810
4811    fn validate_symbol_instance_indices(&self) -> Result<()> {
4812        for (page_num, page) in self.pages.iter().enumerate() {
4813            for instance in &page.symbol_instances {
4814                if instance.symbol_index >= self.global_symbols.len() {
4815                    anyhow::bail!(
4816                        "Page {} has symbol instance {} out of range after pruning (max {})",
4817                        page_num + 1,
4818                        instance.symbol_index,
4819                        self.global_symbols.len().saturating_sub(1)
4820                    );
4821                }
4822            }
4823        }
4824        Ok(())
4825    }
4826
4827    fn validate_symbol_partition(
4828        &self,
4829        global_symbol_indices: &[usize],
4830        page_local_symbols: &[Vec<usize>],
4831        page_residual_symbols: &[Vec<usize>],
4832        page_residual_anchor_remaps: &[FxHashMap<usize, usize>],
4833        page_uses_generic_region: &[bool],
4834    ) -> Result<()> {
4835        let global_set: HashSet<usize> = global_symbol_indices.iter().copied().collect();
4836        for (page_num, page) in self.pages.iter().enumerate() {
4837            if page_uses_generic_region[page_num] {
4838                continue;
4839            }
4840            let local_set: HashSet<usize> = page_local_symbols[page_num].iter().copied().collect();
4841            let residual_set: HashSet<usize> =
4842                page_residual_symbols[page_num].iter().copied().collect();
4843            for inst in &page.symbol_instances {
4844                let idx = inst.symbol_index;
4845                if !global_set.contains(&idx)
4846                    && !page_residual_anchor_remaps[page_num].contains_key(&idx)
4847                    && !local_set.contains(&idx)
4848                    && !residual_set.contains(&idx)
4849                {
4850                    anyhow::bail!(
4851                        "Page {} symbol {} was not resolved to global, local, or residual output",
4852                        page_num + 1,
4853                        idx
4854                    );
4855                }
4856            }
4857        }
4858        Ok(())
4859    }
4860
4861    fn auto_threshold(&mut self) -> Result<()> {
4862        let mut i = 0;
4863        let mut comparator = Comparator::default();
4864        while i < self.global_symbols.len() {
4865            let mut j = i + 1;
4866            while j < self.global_symbols.len() {
4867                if comparator
4868                    .distance(&self.global_symbols[i], &self.global_symbols[j], 0)
4869                    .is_some()
4870                {
4871                    self.unite_templates(i, j)?;
4872                } else {
4873                    j += 1;
4874                }
4875            }
4876            i += 1;
4877        }
4878        Ok(())
4879    }
4880
4881    fn auto_threshold_using_hash(&mut self) -> Result<()> {
4882        // Repeatedly scan for exact-match duplicates until no more merges occur.
4883        // Each call to unite_templates invalidates indices, so we rebuild the
4884        // hash buckets from scratch after every merge.
4885        loop {
4886            let mut hashed_templates: HashMap<u32, Vec<usize>> = HashMap::new();
4887            for (i, symbol) in self.global_symbols.iter().enumerate() {
4888                let hash = compute_symbol_hash(symbol);
4889                hashed_templates.entry(hash).or_default().push(i);
4890            }
4891
4892            let mut comparator = Comparator::default();
4893            let mut merged = false;
4894
4895            for (_, bucket) in &hashed_templates {
4896                if bucket.len() < 2 {
4897                    continue;
4898                }
4899                // Find first mergeable pair in this bucket
4900                'outer: for bi in 0..bucket.len() {
4901                    for bj in (bi + 1)..bucket.len() {
4902                        if comparator
4903                            .distance(
4904                                &self.global_symbols[bucket[bi]],
4905                                &self.global_symbols[bucket[bj]],
4906                                0,
4907                            )
4908                            .is_some()
4909                        {
4910                            self.unite_templates(bucket[bi], bucket[bj])?;
4911                            merged = true;
4912                            break 'outer;
4913                        }
4914                    }
4915                }
4916                if merged {
4917                    break; // Indices are stale, restart the scan
4918                }
4919            }
4920
4921            if !merged {
4922                break;
4923            }
4924        }
4925        Ok(())
4926    }
4927
4928    fn unite_templates(&mut self, target_idx: usize, source_idx: usize) -> Result<()> {
4929        if source_idx >= self.global_symbols.len() {
4930            anyhow::bail!("Source index out of range");
4931        }
4932
4933        for page in &mut self.pages {
4934            for instance in &mut page.symbol_instances {
4935                if instance.symbol_index == source_idx {
4936                    instance.symbol_index = target_idx;
4937                } else if instance.symbol_index > source_idx {
4938                    instance.symbol_index -= 1;
4939                }
4940            }
4941        }
4942
4943        self.global_symbols.remove(source_idx);
4944        self.symbol_pixel_counts.remove(source_idx);
4945        self.rebuild_symbol_metadata();
4946        self.rebuild_hash_map();
4947
4948        Ok(())
4949    }
4950
4951    pub fn next_segment_number(&mut self) -> u32 {
4952        let num = self.next_segment_number;
4953        self.next_segment_number += 1;
4954        num
4955    }
4956
4957    pub fn flush_dict(&mut self) -> Result<Vec<u8>> {
4958        if self.global_symbols.is_empty() {
4959            return Ok(Vec::new());
4960        }
4961
4962        let symbol_refs: Vec<&BitImage> = self.global_symbols.iter().collect();
4963        let dict_data = encode_symbol_dict(&symbol_refs, &self.config, 0)?;
4964
4965        let dict_segment = Segment {
4966            number: self.next_segment_number,
4967            seg_type: SegmentType::SymbolDictionary,
4968            deferred_non_retain: false,
4969            retain_flags: 0,
4970            page_association_type: if self.state.pdf_mode { 2 } else { 0 },
4971            referred_to: Vec::new(),
4972            page: if self.state.pdf_mode { None } else { Some(1) },
4973            payload: dict_data,
4974        };
4975        self.next_segment_number += 1;
4976
4977        let mut output = Vec::new();
4978        if self.state.pdf_mode {
4979            dict_segment.write_into(&mut output)?;
4980            return Ok(output);
4981        }
4982
4983        let header = FileHeader {
4984            organisation_type: true,
4985            unknown_n_pages: false,
4986            n_pages: 1,
4987        };
4988        output.extend(header.to_bytes());
4989        dict_segment.write_into(&mut output)?;
4990
4991        Ok(output)
4992    }
4993
4994    fn build_instance_residual_bitmap(
4995        &self,
4996        instances: &[SymbolInstance],
4997    ) -> Result<Option<(BitImage, u32, u32)>> {
4998        if instances.is_empty() {
4999            return Ok(None);
5000        }
5001
5002        let mut min_x = u32::MAX;
5003        let mut min_y = u32::MAX;
5004        let mut max_x = 0u32;
5005        let mut max_y = 0u32;
5006        let mut has_pixels = false;
5007
5008        for instance in instances {
5009            if instance.instance_bitmap.count_ones() == 0 {
5010                continue;
5011            }
5012            has_pixels = true;
5013            min_x = min_x.min(instance.position.x);
5014            min_y = min_y.min(instance.position.y);
5015            max_x = max_x.max(instance.position.x + instance.instance_bitmap.width as u32);
5016            max_y = max_y.max(instance.position.y + instance.instance_bitmap.height as u32);
5017        }
5018
5019        if !has_pixels || max_x <= min_x || max_y <= min_y {
5020            return Ok(None);
5021        }
5022
5023        let width = max_x - min_x;
5024        let height = max_y - min_y;
5025        let mut residual = BitImage::new(width, height).map_err(|e| anyhow!(e))?;
5026        for instance in instances {
5027            let offset_x = (instance.position.x - min_x) as usize;
5028            let offset_y = (instance.position.y - min_y) as usize;
5029            for y in 0..instance.instance_bitmap.height {
5030                for x in 0..instance.instance_bitmap.width {
5031                    if instance.instance_bitmap.get_usize(x, y) {
5032                        residual.set_usize(offset_x + x, offset_y + y, true);
5033                    }
5034                }
5035            }
5036        }
5037
5038        if residual.count_ones() == 0 {
5039            return Ok(None);
5040        }
5041
5042        Ok(Some((residual, min_x, min_y)))
5043    }
5044
5045    fn encode_generic_region_payload_at(
5046        &self,
5047        image: &BitImage,
5048        x: u32,
5049        y: u32,
5050    ) -> Result<Vec<u8>> {
5051        let mut gr_cfg = GenericRegionConfig::new(
5052            image.width as u32,
5053            image.height as u32,
5054            self.config.generic.dpi,
5055        );
5056        gr_cfg.x = x;
5057        gr_cfg.y = y;
5058        gr_cfg.comb_operator = self.config.generic.comb_operator;
5059        gr_cfg.mmr = self.config.generic.mmr;
5060        gr_cfg.tpgdon = self.config.generic.tpgdon;
5061        gr_cfg.validate().map_err(|e: &'static str| anyhow!(e))?;
5062
5063        let coder_data = Jbig2ArithCoder::encode_generic_payload_cfg(image, &gr_cfg)?;
5064        let params: GenericRegionParams = gr_cfg.clone().into();
5065        let mut payload = params.to_bytes();
5066        payload.extend_from_slice(&coder_data);
5067        Ok(payload)
5068    }
5069}
5070
5071/// Encodes a generic region, optionally wrapping it in a complete JBIG2 file.
5072/// This function is intended to be the top-level entry point for encoding a single generic region.
5073pub fn encode_generic_region(img: &BitImage, cfg: &Jbig2Config) -> Result<Vec<u8>> {
5074    // Build generic region config from high-level parameters
5075    let mut gr_cfg = GenericRegionParams::new(img.width as u32, img.height as u32, cfg.generic.dpi);
5076    gr_cfg.comb_operator = cfg.generic.comb_operator;
5077    gr_cfg.mmr = cfg.generic.mmr;
5078    gr_cfg.tpgdon = cfg.generic.tpgdon;
5079    gr_cfg.validate().map_err(|e: &'static str| anyhow!(e))?;
5080
5081    let coder_data =
5082        Jbig2ArithCoder::encode_generic_payload(img, gr_cfg.template, &gr_cfg.at_pixels)?;
5083
5084    let params: GenericRegionParams = gr_cfg.clone();
5085
5086    let mut generic_region_payload = params.to_bytes();
5087    generic_region_payload.extend_from_slice(&coder_data);
5088
5089    // Create the generic region segment (segment number 1)
5090    let generic_region_segment = Segment {
5091        number: 1, // Segment number 1
5092        seg_type: SegmentType::ImmediateGenericRegion,
5093        deferred_non_retain: false,
5094        retain_flags: 0,
5095        page_association_type: 0, // Explicit page association
5096        referred_to: Vec::new(),
5097        page: Some(1),                           // Page 1
5098        payload: generic_region_payload.clone(), // Clone to avoid move
5099    };
5100
5101    // If caller wants only the segment, we're done
5102    if !cfg.want_full_headers {
5103        let mut seg_bytes = Vec::new();
5104        generic_region_segment.write_into(&mut seg_bytes)?;
5105        return Ok(seg_bytes);
5106    }
5107
5108    // Otherwise wrap it in a complete one-page JBIG2 file
5109    let mut out = Vec::with_capacity(generic_region_payload.len() + 64);
5110
5111    // File header
5112    out.extend_from_slice(
5113        &FileHeader {
5114            organisation_type: true,
5115            unknown_n_pages: false,
5116            n_pages: 1,
5117        }
5118        .to_bytes(),
5119    );
5120
5121    // Page Information segment (segment number 0)
5122    Segment {
5123        number: 0,
5124        seg_type: SegmentType::PageInformation,
5125        deferred_non_retain: false,
5126        retain_flags: 0,
5127        page_association_type: 0,
5128        referred_to: vec![],
5129        page: Some(1),
5130        payload: PageInfo {
5131            width: img.width as u32,
5132            height: img.height as u32,
5133            xres: cfg.generic.dpi,
5134            yres: cfg.generic.dpi,
5135            is_lossless: cfg.is_lossless,
5136            default_pixel: cfg.default_pixel,
5137            default_operator: cfg.generic.comb_operator,
5138            ..Default::default()
5139        }
5140        .to_bytes(),
5141    }
5142    .write_into(&mut out)?;
5143
5144    // Generic region segment (segment number 1)
5145    generic_region_segment.write_into(&mut out)?;
5146
5147    // EOF segment (segment number 2)
5148    Segment {
5149        number: 2,
5150        seg_type: SegmentType::EndOfFile,
5151        deferred_non_retain: false,
5152        retain_flags: 0,
5153        page_association_type: 2,
5154        referred_to: vec![],
5155        page: None,
5156        payload: vec![],
5157    }
5158    .write_into(&mut out)?;
5159
5160    Ok(out)
5161}
5162
5163pub fn encode_symbol_dict(
5164    symbols: &[&BitImage],
5165    _config: &Jbig2Config,
5166    num_imported_symbols: u32,
5167) -> Result<Vec<u8>> {
5168    let (payload, _order) = encode_symbol_dict_with_order(symbols, _config, num_imported_symbols)?;
5169    Ok(payload)
5170}
5171
5172/// Computes the canonical encoding order for a list of symbols.
5173///
5174/// Returns a `Vec<usize>` where each element is an index into the input `symbols` slice,
5175/// giving the order symbols will appear in the encoded dictionary (after filtering out
5176/// zero-size symbols, deduplication, and sorting by height class then width).
5177///
5178/// This order must be used when mapping symbol instance IDs in text regions.
5179pub fn canonicalize_dict_symbols(symbols: &[&BitImage]) -> Vec<usize> {
5180    // Step 1: Filter zero-size, tracking original indices
5181    let mut valid: Vec<(usize, &BitImage)> = symbols
5182        .iter()
5183        .enumerate()
5184        .filter(|(_, sym)| sym.width > 0 && sym.height > 0)
5185        .map(|(i, sym)| (i, *sym))
5186        .collect();
5187
5188    // Step 2: Sort by (height ASC, width ASC) — same order as sort_symbols_for_dictionary
5189    // Use stable sort to preserve input order for identical dimensions.
5190    // No dedup here: the encoder already deduplicates during extraction + auto_threshold.
5191    // Removing a symbol would leave text region instances without a valid dictionary mapping.
5192    valid.sort_by(|a, b| (a.1.height, a.1.width).cmp(&(b.1.height, b.1.width)));
5193
5194    // Return original indices in canonical order
5195    valid.into_iter().map(|(orig_idx, _)| orig_idx).collect()
5196}
5197
5198fn plan_symbol_dictionary_layout(
5199    symbols: &[&BitImage],
5200    config: &Jbig2Config,
5201    usage_weights: Option<&[usize]>,
5202) -> Result<SymbolDictLayout> {
5203    let canonical_order = canonicalize_dict_symbols(symbols);
5204    if canonical_order.is_empty() {
5205        return Err(anyhow!(
5206            "encode_symbol_dict: no valid symbols supplied (all symbols had zero width or height)"
5207        ));
5208    }
5209
5210    let _ = (config, usage_weights);
5211    Ok(SymbolDictLayout {
5212        export_input_indices: canonical_order,
5213        refinements: vec![None; symbols.len()],
5214        diagnostics: SymbolDictDiagnostics {
5215            singleton_family_count: symbols.len(),
5216            exported_member_count: symbols.len(),
5217            ..Default::default()
5218        },
5219    })
5220}
5221
5222fn build_refinement_family_layout(
5223    symbols: &[&BitImage],
5224    canonical_order: &[usize],
5225    usage_weights: Option<&[usize]>,
5226) -> SymbolDictLayout {
5227    let mut comparator = Comparator::default();
5228    let signatures: Vec<SymbolSignature> = symbols
5229        .iter()
5230        .map(|sym| compute_symbol_signature_shared(sym))
5231        .collect();
5232    let black_counts: Vec<usize> = symbols.iter().map(|sym| sym.count_ones()).collect();
5233
5234    let mut canonical_pos = vec![usize::MAX; symbols.len()];
5235    for (pos, &input_index) in canonical_order.iter().enumerate() {
5236        canonical_pos[input_index] = pos;
5237    }
5238
5239    let mut bucket_map: HashMap<FamilyBucketKey, Vec<usize>> = HashMap::new();
5240    for &input_index in canonical_order {
5241        let key = family_bucket_key_for_symbol(symbols[input_index], &signatures[input_index]);
5242        bucket_map.entry(key).or_default().push(input_index);
5243    }
5244
5245    let mut parent: Vec<usize> = (0..symbols.len()).collect();
5246    let mut rank = vec![0u32; symbols.len()];
5247
5248    for &input_index in canonical_order {
5249        let symbol = symbols[input_index];
5250        let key = family_bucket_key_for_symbol(symbol, &signatures[input_index]);
5251
5252        for neighbor in family_bucket_neighbors(key) {
5253            let Some(bucket) = bucket_map.get(&neighbor) else {
5254                continue;
5255            };
5256            for &other_input_index in bucket {
5257                if canonical_pos[other_input_index] >= canonical_pos[input_index] {
5258                    continue;
5259                }
5260                if family_match_details(
5261                    &mut comparator,
5262                    symbol,
5263                    input_index,
5264                    symbols[other_input_index],
5265                    other_input_index,
5266                    &signatures,
5267                    &black_counts,
5268                )
5269                .is_some()
5270                {
5271                    uf_union(&mut parent, &mut rank, input_index, other_input_index);
5272                }
5273            }
5274        }
5275    }
5276
5277    let mut families: HashMap<usize, Vec<usize>> = HashMap::new();
5278    for &input_index in canonical_order {
5279        let root = uf_find(&mut parent, input_index);
5280        families.entry(root).or_default().push(input_index);
5281    }
5282
5283    let mut export_input_indices = Vec::new();
5284    let mut refinements = vec![None; symbols.len()];
5285    let mut diagnostics = SymbolDictDiagnostics::default();
5286
5287    let mut family_members: Vec<Vec<usize>> = families.into_values().collect();
5288    family_members.sort_by_key(|members| canonical_pos[members[0]]);
5289    diagnostics.family_count = family_members.len();
5290
5291    for mut members in family_members {
5292        members.sort_by_key(|&input_index| canonical_pos[input_index]);
5293        if members.len() == 1 {
5294            diagnostics.singleton_family_count += 1;
5295            diagnostics.exported_member_count += 1;
5296            export_input_indices.push(members[0]);
5297            continue;
5298        }
5299
5300        let prototype_input_index = choose_family_prototype(
5301            &members,
5302            symbols,
5303            usage_weights,
5304            &canonical_pos,
5305            &signatures,
5306            &black_counts,
5307        );
5308        if diagnostics.sample_lines.len() < 128 {
5309            diagnostics.sample_lines.push(format!(
5310                "refine family: prototype={} members={} prototype_usage={}",
5311                prototype_input_index,
5312                members.len(),
5313                usage_weights
5314                    .and_then(|weights| weights.get(prototype_input_index).copied())
5315                    .unwrap_or(1)
5316            ));
5317        }
5318        export_input_indices.push(prototype_input_index);
5319        diagnostics.exported_member_count += 1;
5320
5321        for &member_input_index in &members {
5322            if member_input_index == prototype_input_index {
5323                continue;
5324            }
5325
5326            let maybe_match = family_match_details(
5327                &mut comparator,
5328                symbols[member_input_index],
5329                member_input_index,
5330                symbols[prototype_input_index],
5331                prototype_input_index,
5332                &signatures,
5333                &black_counts,
5334            );
5335
5336            match maybe_match {
5337                Some((err, dx, dy))
5338                    if family_should_refine(
5339                        symbols[member_input_index],
5340                        symbols[prototype_input_index],
5341                        err,
5342                        dx,
5343                        dy,
5344                        usage_weights
5345                            .and_then(|weights| weights.get(member_input_index).copied())
5346                            .unwrap_or(1),
5347                    ) =>
5348                {
5349                    refinements[member_input_index] = Some(RefinementPlan {
5350                        prototype_input_index,
5351                        refinement_dx: dx,
5352                        refinement_dy: dy,
5353                    });
5354                    diagnostics.refined_member_count += 1;
5355                    if diagnostics.sample_lines.len() < 128 {
5356                        diagnostics.sample_lines.push(format!(
5357                            "  refine member={} -> prototype={} dx={} dy={} err={} usage={}",
5358                            member_input_index,
5359                            prototype_input_index,
5360                            dx,
5361                            dy,
5362                            err,
5363                            usage_weights
5364                                .and_then(|weights| weights.get(member_input_index).copied())
5365                                .unwrap_or(1)
5366                        ));
5367                    }
5368                }
5369                _ => {
5370                    export_input_indices.push(member_input_index);
5371                    diagnostics.exported_member_count += 1;
5372                    if diagnostics.sample_lines.len() < 128 {
5373                        diagnostics.sample_lines.push(format!(
5374                            "  export member={} as standalone usage={}",
5375                            member_input_index,
5376                            usage_weights
5377                                .and_then(|weights| weights.get(member_input_index).copied())
5378                                .unwrap_or(1)
5379                        ));
5380                    }
5381                }
5382            }
5383        }
5384    }
5385
5386    export_input_indices.sort_by_key(|&input_index| canonical_pos[input_index]);
5387
5388    SymbolDictLayout {
5389        export_input_indices,
5390        refinements,
5391        diagnostics,
5392    }
5393}
5394
5395fn family_refinement_gain(
5396    target: &BitImage,
5397    reference: &BitImage,
5398    err: u32,
5399    dx: i32,
5400    dy: i32,
5401) -> i64 {
5402    let plain_cost = symbol_dictionary_entry_bytes(target) as i64 + 10;
5403    let refine_cost = 10
5404        + err as i64
5405        + ((dx.abs() + dy.abs()) as i64 * 3)
5406        + (target.width.abs_diff(reference.width) + target.height.abs_diff(reference.height))
5407            as i64
5408            * 2;
5409    plain_cost - refine_cost
5410}
5411
5412fn family_should_refine(
5413    target: &BitImage,
5414    reference: &BitImage,
5415    err: u32,
5416    dx: i32,
5417    dy: i32,
5418    usage_count: usize,
5419) -> bool {
5420    if usage_count > 1 {
5421        return false;
5422    }
5423    let export_gain = family_refinement_gain(target, reference, err, dx, dy);
5424    export_gain > 12
5425}
5426
5427fn choose_family_prototype(
5428    members: &[usize],
5429    symbols: &[&BitImage],
5430    usage_weights: Option<&[usize]>,
5431    canonical_pos: &[usize],
5432    signatures: &[SymbolSignature],
5433    black_counts: &[usize],
5434) -> usize {
5435    if members.len() == 1 {
5436        return members[0];
5437    }
5438
5439    let mut comparator = Comparator::default();
5440    let mut best_idx = members[0];
5441    let mut best_cost = u64::MAX;
5442    let mut best_support = 0u64;
5443
5444    for &candidate in members {
5445        let mut total_cost = 0u64;
5446        for &other in members {
5447            if candidate == other {
5448                continue;
5449            }
5450            let weight = usage_weights
5451                .and_then(|weights| weights.get(other).copied())
5452                .unwrap_or(1) as u64;
5453            match family_match_details(
5454                &mut comparator,
5455                symbols[other],
5456                other,
5457                symbols[candidate],
5458                candidate,
5459                signatures,
5460                black_counts,
5461            ) {
5462                Some((err, dx, dy)) => {
5463                    total_cost += (refine_compare_score(err, dx, dy) as u64 + 4) * weight;
5464                }
5465                None => total_cost += 1_000_000 * weight,
5466            }
5467        }
5468
5469        let candidate_support = usage_weights
5470            .and_then(|weights| weights.get(candidate).copied())
5471            .unwrap_or(1) as u64;
5472        let score_close = if best_cost == u64::MAX {
5473            false
5474        } else {
5475            total_cost <= best_cost + best_cost / 50
5476        };
5477
5478        if total_cost < best_cost
5479            || (score_close && candidate_support > best_support)
5480            || (total_cost == best_cost
5481                && candidate_support == best_support
5482                && canonical_pos[candidate] < canonical_pos[best_idx])
5483        {
5484            best_cost = total_cost;
5485            best_idx = candidate;
5486            best_support = candidate_support;
5487        }
5488    }
5489
5490    best_idx
5491}
5492
5493fn encode_symbol_dictionary_segments(
5494    symbols: &[&BitImage],
5495    config: &Jbig2Config,
5496    layout: &SymbolDictLayout,
5497) -> Result<EncodedSymbolDictionary> {
5498    let mut encoded = EncodedSymbolDictionary {
5499        payload: Vec::new(),
5500        input_to_exported_pos: vec![u32::MAX; symbols.len()],
5501        exported_symbol_count: 0,
5502    };
5503
5504    let (dict_payload, base_order) =
5505        encode_symbol_dict_subset_with_order(symbols, config, &layout.export_input_indices, 0)?;
5506    for (dict_pos, &input_index) in base_order.iter().enumerate() {
5507        encoded.input_to_exported_pos[input_index] = dict_pos as u32;
5508    }
5509    encoded.exported_symbol_count = base_order.len() as u32;
5510    encoded.payload = dict_payload;
5511
5512    for (input_index, refinement) in layout.refinements.iter().enumerate() {
5513        if let Some(refinement) = refinement {
5514            let prototype_pos = encoded.input_to_exported_pos[refinement.prototype_input_index];
5515            if prototype_pos != u32::MAX {
5516                encoded.input_to_exported_pos[input_index] = prototype_pos;
5517            }
5518        }
5519    }
5520
5521    Ok(encoded)
5522}
5523
5524fn encode_symbol_dict_subset_with_order(
5525    symbols: &[&BitImage],
5526    config: &Jbig2Config,
5527    subset_indices: &[usize],
5528    num_imported_symbols: u32,
5529) -> Result<(Vec<u8>, Vec<usize>)> {
5530    let subset_symbols: Vec<&BitImage> = subset_indices.iter().map(|&i| symbols[i]).collect();
5531    let (payload, subset_order) =
5532        encode_symbol_dict_with_order(&subset_symbols, config, num_imported_symbols)?;
5533    let input_order = subset_order
5534        .into_iter()
5535        .map(|subset_index| subset_indices[subset_index])
5536        .collect();
5537    Ok((payload, input_order))
5538}
5539
/// Encodes a symbol dictionary, returning both the payload and the mapping from
/// encoded dictionary position → input index.
///
/// The payload is the serialized `SymbolDictParams` header followed by one
/// arithmetic-coded stream containing, per height class: a delta height, then
/// for each symbol a delta width and its bitmap, then an OOB marker. The
/// stream ends with the export-flag run lengths.
///
/// # Arguments
/// * `symbols` - Candidate symbol bitmaps; they are reordered by
///   `canonicalize_dict_symbols` before encoding.
/// * `_config` - Not read by this function.
/// * `num_imported_symbols` - NOTE(review): not referenced anywhere in this
///   body; the export flags are written as if there were no imported symbols.
///   Confirm whether callers rely on a non-zero value.
///
/// # Returns
/// `(payload, canonical_order)`, where `canonical_order[dict_pos]` is the index
/// into `symbols` of the symbol stored at dictionary position `dict_pos`.
///
/// # Errors
/// * `canonicalize_dict_symbols` yields no symbols.
/// * A symbol's width or height exceeds `1 << 24`.
/// * `encode_generic_region` fails for a symbol bitmap.
///
/// # Panics
/// Panics if the first black pixel found in a symbol differs from the first
/// black pixel found in its packed words (bit-order / row-order sanity check).
pub fn encode_symbol_dict_with_order(
    symbols: &[&BitImage],
    _config: &Jbig2Config,
    num_imported_symbols: u32,
) -> Result<(Vec<u8>, Vec<usize>)> {
    // Compute canonical order (filter + dedup + sort)
    let canonical_order = canonicalize_dict_symbols(symbols);

    if canonical_order.is_empty() {
        return Err(anyhow!(
            "encode_symbol_dict: no valid symbols supplied (all symbols had zero width or height)"
        ));
    }

    // Build the ordered symbol list
    let ordered_symbols: Vec<&BitImage> = canonical_order.iter().map(|&i| symbols[i]).collect();

    // Verify symbol dimensions are within JBIG2 limits.
    // Note: the index reported in the error is the position in canonical order,
    // not the index into the caller's `symbols` slice.
    for (i, sym) in ordered_symbols.iter().enumerate() {
        if sym.width > (1 << 24) || sym.height > (1 << 24) {
            return Err(anyhow!(
                "Symbol at index {} exceeds maximum dimensions ({}x{})",
                i,
                sym.width,
                sym.height
            ));
        }
    }

    let mut payload = Vec::new();
    let mut coder = Jbig2ArithCoder::new();

    // Every encoded symbol is exported, so the export count equals the new-symbol count.
    let num_export_syms = ordered_symbols.len() as u32;

    // Create symbol dictionary parameters
    let params = SymbolDictParams {
        sd_template: 0, // Use standard template 0
        // Match jbig2enc's template-0 adaptive pixels for symbol dictionaries.
        at: [(3, -1), (-3, -1), (2, -2), (-2, -2)],
        refine_aggregate: false,
        refine_template: 0,
        refine_at: [(0, 0), (0, 0)],
        exsyms: num_export_syms,
        newsyms: ordered_symbols.len() as u32,
    };

    if cfg!(debug_assertions) {
        debug!("encode_symbol_dict: Exporting {} symbols", num_export_syms);
        trace!("encode_symbol_dict: SymbolDictParams details: {:?}", params);
    }

    // Write the symbol dictionary parameters
    payload.extend(params.to_bytes());

    // Symbols are already in canonical (height, width) order from canonicalize_dict_symbols.
    // We need to encode them in this exact order, grouped by height class for delta encoding.
    // Build height classes from the already-sorted ordered_symbols to preserve the canonical order.
    let mut height_classes: Vec<Vec<&BitImage>> = Vec::new();
    let mut current_height: Option<usize> = None;
    let mut current_class: Vec<&BitImage> = Vec::new();

    for &sym in &ordered_symbols {
        match current_height {
            None => {
                // First symbol
                current_height = Some(sym.height);
                current_class.push(sym);
            }
            Some(h) if sym.height == h => {
                // Same height class
                current_class.push(sym);
            }
            Some(_) => {
                // New height class - push previous and start new
                height_classes.push(current_class);
                current_height = Some(sym.height);
                current_class = vec![sym];
            }
        }
    }
    // Flush the final (still open) height class.
    if !current_class.is_empty() {
        height_classes.push(current_class);
    }

    // Debug: log the encoding order and first few pixels of each symbol for verification
    #[cfg(debug_assertions)]
    {
        debug!(
            "Symbol dictionary encoding order ({} symbols):",
            ordered_symbols.len()
        );
        // Running dictionary position across all height classes.
        let mut dict_pos = 0u32;
        for (hc_idx, symbols_in_class) in height_classes.iter().enumerate() {
            debug!(
                "  Height class {}: {} symbols",
                hc_idx,
                symbols_in_class.len()
            );
            for (sym_idx, sym) in symbols_in_class.iter().enumerate() {
                // Log first pixel position for each symbol.
                // Only the first five and last two symbols of a class are listed;
                // the `len() - 2` is never evaluated for short classes because the
                // `sym_idx < 5` test short-circuits first.
                let first_pixel = first_black_pixel(sym);
                if sym_idx < 5 || sym_idx >= symbols_in_class.len() - 2 {
                    debug!(
                        "    dict_pos={} -> {}x{} first_pixel={:?}",
                        dict_pos, sym.width, sym.height, first_pixel
                    );
                } else if sym_idx == 5 {
                    debug!(
                        "    ... ({} symbols omitted) ...",
                        symbols_in_class.len() - 7
                    );
                }
                dict_pos += 1;
            }
        }
    }

    // Height of the previous class; the first delta is relative to 0.
    let mut last_height = 0;

    // 4. Encode the height classes
    // NOTE: the results of `encode_integer` / `encode_oob` are discarded with
    // `let _ =` throughout; only `encode_generic_region` errors are propagated.
    for symbols_in_class in &height_classes {
        let h = symbols_in_class[0].height; // All symbols in class have same height
        // A. Encode Delta Height
        let delta_h = h as i32 - last_height as i32;
        let _ = coder.encode_integer(crate::jbig2arith::IntProc::Iadh, delta_h);
        last_height = h;

        // Width of the previous symbol in this class; restarts at 0 for every class.
        let mut last_width = 0;
        // Debug-only counter. It is re-initialised for each height class, so the
        // `dict_pos` logged below is the position within the class, not the
        // global dictionary position.
        #[cfg(debug_assertions)]
        let mut dict_pos = 0u32;

        // Debug: check symbols in this height class (disabled in release)
        #[cfg(debug_assertions)]
        {
            debug!("Height class {} has {} symbols:", h, symbols_in_class.len());
            for (i, symbol) in symbols_in_class.iter().enumerate() {
                debug!("  Symbol {}: {}x{}", i, symbol.width, symbol.height);
            }
        }

        // B. Encode symbols within this height class
        // Symbols within each height class are already sorted by width from canonicalize_dict_symbols.
        for (i, symbol) in symbols_in_class.iter().enumerate() {
            // I. Encode Delta Width
            let delta_w = symbol.width as i32 - last_width;

            // Debug output to help diagnose the issue (disabled in release)
            #[cfg(debug_assertions)]
            debug!(
                "Height class {}, Symbol {}: width={}, last_width={}, delta_w={}",
                h, i, symbol.width, last_width, delta_w
            );

            let _ = coder.encode_integer(crate::jbig2arith::IntProc::Iadw, delta_w);
            last_width = symbol.width as i32; // last_width becomes current width

            // II. Encode Symbol Bitmap using Generic Region Procedure
            let packed = symbol.packed_words();

            // Debug: dump first few symbols' bitmap data for verification
            #[cfg(debug_assertions)]
            {
                debug!(
                    "  dict_pos={} {}x{} first_word={:08x}",
                    dict_pos,
                    symbol.width,
                    symbol.height,
                    packed.get(0).unwrap_or(&0)
                );
            }

            // Verify bit-order correctness: first black pixel should match between symbol and packed data.
            // This check runs in all build profiles and panics on mismatch.
            if let Some(expected_first_pixel) = first_black_pixel(symbol) {
                let actual_first_pixel = crate::jbig2sym::first_black_pixel_in_packed(
                    packed,
                    symbol.width,
                    symbol.height,
                );
                assert_eq!(
                    actual_first_pixel,
                    Some(expected_first_pixel),
                    "bit-order / row-order mismatch in symbol dict packer! Expected first black pixel at {:?}, got {:?}",
                    expected_first_pixel,
                    actual_first_pixel
                );
            }

            // The adaptive-template pixels passed here repeat the literal stored in
            // `params.at` above; the two must be kept in sync.
            coder.encode_generic_region(
                packed,
                symbol.width,
                symbol.height,
                params.sd_template,
                &[(3, -1), (-3, -1), (2, -2), (-2, -2)],
            )?;

            #[cfg(debug_assertions)]
            {
                dict_pos += 1;
            }
        }

        // OOB marks the end of this height class.
        let _ = coder.encode_oob(IntProc::Iadw);
    }

    // Export flags come after the symbol bitmap data (run-length form):
    // a run of length 0 followed by a run of length `num_export_syms`.
    let _ = coder.encode_integer(IntProc::Iaex, 0);
    let _ = coder.encode_integer(IntProc::Iaex, num_export_syms as i32);

    // 5. flush the coder ONCE
    coder.flush(true);

    // 6. Append the single, complete arithmetic payload
    payload.extend(coder.as_bytes());

    Ok((payload, canonical_order))
}
5759
5760/// Computes the bounding box that contains all symbol instances.
5761///
5762/// # Arguments
5763/// * `instances` - Slice of symbol instances to compute bounds for
5764/// * `all_known_symbols` - All available symbol bitmaps
5765///
5766/// # Returns
5767/// A tuple of (min_x, min_y, width, height) representing the bounding box
5768fn compute_region_bounds(
5769    instances: &[TextRegionSymbolInstance],
5770    all_known_symbols: &[&BitImage],
5771) -> (u32, u32, u32, u32) {
5772    if instances.is_empty() {
5773        return (0, 0, 0, 0);
5774    }
5775    let mut min_x = u32::MAX;
5776    let mut min_y = u32::MAX;
5777    let mut max_x_coord = 0u32;
5778    let mut max_y_coord = 0u32;
5779
5780    for instance in instances {
5781        let sym_idx = instance.symbol_id as usize;
5782        if sym_idx >= all_known_symbols.len() {
5783            continue; // Skip invalid symbol indices
5784        }
5785
5786        let pos = Rect {
5787            x: instance.x as u32, // Convert i32 to u32
5788            y: instance.y as u32, // Convert i32 to u32
5789            width: crate::jbig2shared::usize_to_u32(all_known_symbols[sym_idx].width),
5790            height: crate::jbig2shared::usize_to_u32(all_known_symbols[sym_idx].height),
5791        };
5792
5793        min_x = min_x.min(pos.x);
5794        min_y = min_y.min(pos.y);
5795        max_x_coord = max_x_coord.max(pos.x + pos.width);
5796        max_y_coord = max_y_coord.max(pos.y + pos.height);
5797    }
5798
5799    // Handle potential underflow if max < min (shouldn't happen with valid coordinates)
5800    let region_width = if max_x_coord > min_x {
5801        max_x_coord - min_x
5802    } else {
5803        0
5804    };
5805
5806    let region_height = if max_y_coord > min_y {
5807        max_y_coord - min_y
5808    } else {
5809        0
5810    };
5811
5812    (min_x, min_y, region_width, region_height)
5813}
5814
5815pub fn encode_refine(
5816    instances: &[TextRegionSymbolInstance],
5817    all_known_symbols: &[&BitImage],
5818    data: &mut Vec<u8>,
5819    coder: &mut Jbig2ArithCoder,
5820) -> Result<()> {
5821    // 1. Compute region bounds
5822    let (min_x, min_y, region_w, region_h) = compute_region_bounds(instances, all_known_symbols);
5823    let width = region_w.max(1);
5824    let height = region_h.max(1);
5825
5826    // 2. Write TextRegion header (flags + params)
5827    // flags: TRREF=1, others zero (arithmetic coding)
5828    let mut flags: u8 = 0;
5829    flags |= 0x40; // TRREF bit
5830    data.push(flags);
5831
5832    let params = TextRegionParams {
5833        width,
5834        height,
5835        x: min_x,
5836        y: min_y,
5837        ds_offset: 0,
5838        refine: true,
5839        log_strips: 0,
5840        ref_corner: 0,
5841        transposed: false,
5842        comb_op: 0,
5843        refine_template: 0,
5844    };
5845    data.extend(params.to_bytes());
5846
5847    // 3. Encode number of instances
5848    let num_inst = instances.len() as u32;
5849    let _ = coder.encode_int_with_ctx(num_inst as i32, 16, IntProc::Iaai);
5850
5851    // 4. Initialize an empty region buffer to track already emitted pixels
5852    let mut region_buf = BitImage::new(width, height).expect("region bitmap too large");
5853
5854    // 5. Emit each instance
5855    for inst in instances {
5856        // IAID symbol ID
5857        let sym_id = inst.symbol_id;
5858        let _ = coder.encode_iaid(sym_id, 16);
5859
5860        // Refinement deltas
5861        let _ = coder.encode_integer(IntProc::Iardx, inst.dx);
5862        let _ = coder.encode_integer(IntProc::Iardy, inst.dy);
5863
5864        // If this is a refinement instance, encode pixel-by-pixel
5865        if inst.is_refinement {
5866            // locate the symbol bitmap
5867            if let Some(&sym) = all_known_symbols.get(sym_id as usize) {
5868                // offset of this instance in region coords
5869                let ox = inst.x as u32 - min_x;
5870                let oy = inst.y as u32 - min_y;
5871
5872                // for each pixel in the symbol region
5873                for y in 0..sym.height as u32 {
5874                    for x in 0..sym.width as u32 {
5875                        // compute region coord
5876                        let rx = ox + x;
5877                        let ry = oy + y;
5878
5879                        // skip out-of-bounds
5880                        if rx >= width || ry >= height {
5881                            continue;
5882                        }
5883
5884                        // Bounds already verified above (rx < width, ry < height);
5885                        // use direct indexing to bypass redundant bounds checks.
5886                        let ref_bit = sym.get_pixel_unchecked(x as usize, y as usize) as u8;
5887                        let pred_bit =
5888                            region_buf.get_pixel_unchecked(rx as usize, ry as usize) as u8;
5889
5890                        // Context = combine ref_bit, pred_bit, template (here simple sum)
5891                        let ctx = ((ref_bit << 1) | pred_bit) as usize;
5892
5893                        // Encode the actual pixel: 1 if sym has pixel, 0 otherwise
5894                        let bit = ref_bit;
5895                        coder.encode_bit(ctx, bit != 0);
5896
5897                        // Update region buffer so subsequent instances see it
5898                        if bit != 0 {
5899                            region_buf.set(rx, ry, true);
5900                        }
5901                    }
5902                }
5903            }
5904        }
5905    }
5906
5907    // 6. flush and append coder payload
5908    coder.flush(true);
5909    data.extend(coder.as_bytes());
5910
5911    Ok(())
5912}
5913
/// Resolves a symbol index to its text-region symbol ID using two dense
/// index → dictionary-position maps.
///
/// In both maps, an entry of `u32::MAX` (or a missing entry, when
/// `symbol_index` is past the end of the slice) means "not in this
/// dictionary". The global map is consulted first.
///
/// Dictionary exports are numbered global-first: IDs `0..N` refer to the
/// global dictionary and IDs `N..N+M` to the local one, so a local position is
/// offset by `num_global_dict_symbols`.
///
/// Returns `None` when the symbol is in neither dictionary.
#[inline]
fn symbol_id_from_dense_maps(
    symbol_index: usize,
    global_sym_to_dict_pos: &[u32],
    num_global_dict_symbols: u32,
    local_sym_to_dict_pos: &[u32],
) -> Option<u32> {
    let position_in = |map: &[u32]| {
        map.get(symbol_index)
            .copied()
            .filter(|&pos| pos != u32::MAX)
    };

    position_in(global_sym_to_dict_pos).or_else(|| {
        position_in(local_sym_to_dict_pos).map(|local| num_global_dict_symbols + local)
    })
}
5946
5947pub fn encode_text_region_mapped(
5948    instances: &[SymbolInstance],
5949    config: &Jbig2Config,
5950    all_symbols: &[BitImage],
5951    global_sym_to_dict_pos: &[u32],
5952    num_global_dict_symbols: u32,
5953    local_sym_to_dict_pos: &[u32],
5954    page_num: usize,
5955    num_local_dict_symbols: u32,
5956) -> Result<Vec<u8>> {
5957    if instances.is_empty() {
5958        return Err(anyhow!("No symbol instances provided for text region"));
5959    }
5960
5961    let debug_encoding = page_num == 0 && std::env::var("JBIG2_DEBUG").map_or(false, |v| v == "1");
5962    let mut enc_debug_lines: Vec<String> = Vec::new();
5963
5964    let num_total_dict_symbols = num_global_dict_symbols + num_local_dict_symbols;
5965
5966    let mut payload = Vec::new();
5967    let mut coder = Jbig2ArithCoder::new();
5968
5969    let mut min_x = u32::MAX;
5970    let mut min_y = u32::MAX;
5971    let mut max_x_coord = 0u32;
5972    let mut max_y_coord = 0u32;
5973
5974    for instance in instances {
5975        let sym = &all_symbols[instance.symbol_index];
5976        min_x = min_x.min(instance.position.x);
5977        min_y = min_y.min(instance.position.y);
5978        max_x_coord = max_x_coord.max(instance.position.x + sym.width as u32);
5979        max_y_coord = max_y_coord.max(instance.position.y + sym.height as u32);
5980    }
5981
5982    let region_width = max_x_coord.saturating_sub(min_x);
5983    let region_height = max_y_coord.saturating_sub(min_y);
5984
5985    let params = TextRegionParams {
5986        width: region_width,
5987        height: region_height,
5988        x: min_x,
5989        y: min_y,
5990        ds_offset: config.text_ds_offset,
5991        refine: config.text_refine,
5992        log_strips: config.text_log_strips,
5993        ref_corner: config.text_ref_corner,
5994        transposed: config.text_transposed,
5995        comb_op: config.text_comb_op,
5996        refine_template: config.text_refine_template,
5997    };
5998
5999    payload.extend(params.to_bytes());
6000    payload.extend_from_slice(&(instances.len() as u32).to_be_bytes());
6001
6002    let symbol_id_bits = log2up(num_total_dict_symbols.max(1)).max(1);
6003
6004    #[derive(Clone, Copy)]
6005    struct EncodedInstance {
6006        strip_base: i32,
6007        x: i32,
6008        t_offset: i32,
6009        symbol_id: u32,
6010        symbol_width: i32,
6011    }
6012
6013    let strip_width = 1i32 << params.log_strips.min(3);
6014    let mut encoded_instances = Vec::with_capacity(instances.len());
6015
6016    for instance in instances {
6017        let gs_idx = instance.symbol_index;
6018        let sym = &all_symbols[gs_idx];
6019
6020        // Map to dictionary position using the canonical order maps.
6021        // First check global dict, then local dict (offset by num_global_dict_symbols).
6022        let symbol_id = if let Some(symbol_id) = symbol_id_from_dense_maps(
6023            gs_idx,
6024            global_sym_to_dict_pos,
6025            num_global_dict_symbols,
6026            local_sym_to_dict_pos,
6027        ) {
6028            symbol_id
6029        } else {
6030            anyhow::bail!(
6031                "Symbol instance (global_symbols index {}) not found in any dictionary!",
6032                gs_idx
6033            );
6034        };
6035
6036        let abs = instance.position;
6037        let rel_x = abs.x as i32 - min_x as i32;
6038        // REFCORNER=TOPLEFT (value 1): T is the top of the original bounding box.
6039        let rel_y = abs.y as i32 - min_y as i32;
6040        let strip_base = (rel_y / strip_width) * strip_width;
6041        let t_offset = rel_y - strip_base;
6042
6043        encoded_instances.push(EncodedInstance {
6044            strip_base,
6045            x: rel_x,
6046            t_offset,
6047            symbol_id,
6048            symbol_width: sym.width as i32,
6049        });
6050    }
6051
6052    encoded_instances.sort_by_key(|e| (e.strip_base, e.x));
6053
6054    if debug_encoding {
6055        enc_debug_lines.push(format!("=== PAGE 0 ENCODING LOG ==="));
6056        enc_debug_lines.push(format!(
6057            "Region: {}x{} at ({},{})",
6058            params.width, params.height, params.x, params.y
6059        ));
6060        enc_debug_lines.push(format!(
6061            "min_x={} min_y={} strip_width={}",
6062            min_x, min_y, strip_width
6063        ));
6064        enc_debug_lines.push(format!(
6065            "Total instances: {}, dict symbols: {}",
6066            encoded_instances.len(),
6067            num_total_dict_symbols
6068        ));
6069        enc_debug_lines.push(String::new());
6070
6071        // Show mapping from symbol_id to dimensions for reference
6072        enc_debug_lines.push("Symbol ID -> dimensions lookup (first 30):".to_string());
6073        for (dict_id, sym) in all_symbols.iter().enumerate().take(30) {
6074            let dict_pos = symbol_id_from_dense_maps(
6075                dict_id,
6076                global_sym_to_dict_pos,
6077                num_global_dict_symbols,
6078                local_sym_to_dict_pos,
6079            )
6080            .unwrap_or(u32::MAX);
6081            enc_debug_lines.push(format!(
6082                "  gs_idx={} -> dict_pos={} ({}x{})",
6083                dict_id, dict_pos, sym.width, sym.height
6084            ));
6085        }
6086        enc_debug_lines.push(String::new());
6087
6088        enc_debug_lines.push(format!(
6089            "{:<6} {:<8} {:<8} {:<10} {:<8} {:<10} {:<10} {:<10}",
6090            "Idx", "SymID", "SymW", "StripBase", "TOffset", "RelX", "DeltaT", "DeltaS"
6091        ));
6092    }
6093
6094    let mut strip_t = 0i32;
6095    let mut first_s = 0i32;
6096    let mut idx = 0usize;
6097
6098    // §6.4.5 step 1: initial STRIPT value (decoder reads one IADT before the loop)
6099    let _ = coder.encode_integer(IntProc::Iadt, 0);
6100
6101    while idx < encoded_instances.len() {
6102        let current_strip = encoded_instances[idx].strip_base;
6103        let delta_t = current_strip - strip_t;
6104        let _ = coder.encode_integer(IntProc::Iadt, delta_t / strip_width);
6105
6106        if debug_encoding && delta_t != 0 {
6107            enc_debug_lines.push(format!(
6108                "--- strip break: IADT delta_t={} (strip_t {} → {})",
6109                delta_t, strip_t, current_strip
6110            ));
6111        }
6112        strip_t = current_strip;
6113
6114        let mut first_symbol_in_strip = true;
6115        let mut current_s = 0i32;
6116        while idx < encoded_instances.len() && encoded_instances[idx].strip_base == current_strip {
6117            let item = encoded_instances[idx];
6118            let delta_s;
6119            if first_symbol_in_strip {
6120                delta_s = item.x - first_s;
6121                let _ = coder.encode_integer(IntProc::Iafs, delta_s);
6122                first_s += delta_s;
6123                current_s = first_s;
6124                first_symbol_in_strip = false;
6125            } else {
6126                delta_s = item.x - current_s;
6127                let _ = coder.encode_integer(IntProc::Iads, delta_s);
6128                current_s += delta_s;
6129            }
6130
6131            if debug_encoding {
6132                enc_debug_lines.push(format!(
6133                    "{:<6} {:<8} {:<8} {:<10} {:<8} {:<10} {:<10} {:<10}",
6134                    idx,
6135                    item.symbol_id,
6136                    item.symbol_width,
6137                    item.strip_base,
6138                    item.t_offset,
6139                    item.x,
6140                    delta_t,
6141                    delta_s
6142                ));
6143            }
6144
6145            if strip_width > 1 {
6146                let _ = coder.encode_integer(IntProc::Iait, item.t_offset);
6147            }
6148            let _ = coder.encode_iaid(item.symbol_id, symbol_id_bits as u8);
6149            current_s += item.symbol_width - 1;
6150            idx += 1;
6151        }
6152        let _ = coder.encode_oob(IntProc::Iads);
6153    }
6154
6155    // Decode simulation: replay §6.4.5 from the encoder's perspective
6156    // to verify positions match what the decoder will compute.
6157    if debug_encoding {
6158        enc_debug_lines.push(String::new());
6159        enc_debug_lines.push(format!("=== DECODE SIMULATION ==="));
6160        enc_debug_lines.push(format!(
6161            "{:<6} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<8}",
6162            "Idx", "ExpX", "ExpY", "DecS", "DecT", "AbsX", "AbsY", "Match?"
6163        ));
6164
6165        // Collect the IADT, IAFS, IADS, IAIT values and symbol_ids in encoding order for replay
6166        let sbstrips = strip_width;
6167        let sbdsoffset = params.ds_offset as i32;
6168        let mut dec_stript = 0i32;
6169        let mut dec_firsts = 0i32;
6170        let mut sim_idx = 0usize;
6171        let mut strip_start = 0usize;
6172
6173        // Group instances by strip_base (they're already sorted)
6174        while sim_idx < encoded_instances.len() {
6175            let current_strip = encoded_instances[sim_idx].strip_base;
6176            // Compute delta_t the same way the encoder did
6177            let delta_t = if sim_idx == 0 && current_strip == 0 {
6178                0 // first IADT is always 0 (the initial STRIPT)
6179            } else if sim_idx == strip_start {
6180                // Changed strip: the encoder emits IADT = (current_strip - prev_strip_t) / strip_width
6181                // But we need to replay the exact values. Let's just recompute.
6182                current_strip - dec_stript
6183            } else {
6184                0 // same strip, no IADT
6185            };
6186
6187            // §6.4.5 step 2: STRIPT = STRIPT + IADT × SBSTRIPS
6188            if sim_idx == strip_start || sim_idx == 0 {
6189                let iadt_value = (current_strip - dec_stript) / sbstrips;
6190                dec_stript += iadt_value * sbstrips;
6191            }
6192
6193            let mut first_in_strip = true;
6194            let mut dec_curs = 0i32;
6195            let strip_base = current_strip;
6196
6197            while sim_idx < encoded_instances.len()
6198                && encoded_instances[sim_idx].strip_base == strip_base
6199            {
6200                let item = encoded_instances[sim_idx];
6201
6202                if first_in_strip {
6203                    // §6.4.5: FIRSTS = FIRSTS + IAFS; CURS = FIRSTS
6204                    let iafs = item.x - dec_firsts;
6205                    dec_firsts += iafs;
6206                    dec_curs = dec_firsts;
6207                    first_in_strip = false;
6208                } else {
6209                    // §6.4.5: CURS = CURS + IADS + SBDSOFFSET
6210                    let iads = item.x - dec_curs;
6211                    dec_curs += iads + sbdsoffset;
6212                }
6213
6214                // §6.4.5: TI = STRIPT * SBSTRIPS + IAIT (IAIT=0 when SBSTRIPS=1)
6215                let dec_ti = dec_stript;
6216                let dec_si = dec_curs;
6217
6218                // Absolute page coords the decoder would compute
6219                let abs_x = dec_si + min_x as i32;
6220                let abs_y = dec_ti + min_y as i32;
6221
6222                // Expected absolute coords (what we intended)
6223                let exp_x = item.x + min_x as i32;
6224                let exp_y = item.strip_base + min_y as i32;
6225
6226                let ok = abs_x == exp_x && abs_y == exp_y;
6227                let tag = if ok { "OK" } else { "MISMATCH!" };
6228
6229                if !ok || sim_idx < 60 {
6230                    enc_debug_lines.push(format!(
6231                        "{:<6} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<8}",
6232                        sim_idx, exp_x, exp_y, dec_si, dec_ti, abs_x, abs_y, tag
6233                    ));
6234                }
6235
6236                // §6.4.5 step 4g: CURS = CURS + WI - 1
6237                dec_curs += item.symbol_width - 1;
6238                sim_idx += 1;
6239            }
6240            strip_start = sim_idx;
6241        }
6242    }
6243
6244    // Write encoding debug log for page 0
6245    if debug_encoding && !enc_debug_lines.is_empty() {
6246        let log_path = std::path::Path::new("jbig2_debug_page0.log");
6247        // Append to same file as matching log
6248        if let Ok(mut f) = std::fs::OpenOptions::new()
6249            .create(true)
6250            .append(true)
6251            .open(log_path)
6252        {
6253            use std::io::Write;
6254            let _ = writeln!(f, "");
6255            for line in &enc_debug_lines {
6256                let _ = writeln!(f, "{}", line);
6257            }
6258        }
6259    }
6260
6261    coder.flush(true);
6262    payload.extend(coder.as_bytes());
6263
6264    Ok(payload)
6265}
6266
6267/// Encodes a text region with Soft Pattern Matching (SPM / refinement coding).
6268///
6269/// This is the SBREFINE=1 variant of text region encoding. For each symbol instance:
6270/// - Encode the symbol ID and position (same as non-refinement)
6271/// - Encode RI (refinement indicator) via IARI
6272///   - RI=0: direct substitution of the dictionary symbol (no refinement)
6273///   - RI=1: encode size deltas (IARDW, IARDH), position offsets (IARDX, IARDY),
6274///     then a pixel-by-pixel refinement region using the dictionary symbol as reference
6275///
6276/// This allows lossy symbol clustering (small dictionary) while preserving
6277/// per-instance fidelity through the refinement residual.
6278pub fn encode_text_region_with_refinement(
6279    instances: &[SymbolInstance],
6280    config: &Jbig2Config,
6281    all_symbols: &[BitImage],
6282    global_sym_to_dict_pos: &[u32],
6283    num_global_dict_symbols: u32,
6284    local_sym_to_dict_pos: &[u32],
6285    num_local_dict_symbols: u32,
6286) -> Result<Vec<u8>> {
6287    if instances.is_empty() {
6288        return Err(anyhow!("No symbol instances provided for text region"));
6289    }
6290
6291    let num_total_dict_symbols = num_global_dict_symbols + num_local_dict_symbols;
6292
6293    let mut payload = Vec::new();
6294    let mut coder = Jbig2ArithCoder::new();
6295
6296    // Compute region bounds. For refined instances, use the actual instance
6297    // bitmap size (which may be larger than the prototype) so the region is
6298    // large enough to hold the refined glyphs.
6299    let mut min_x = u32::MAX;
6300    let mut min_y = u32::MAX;
6301    let mut max_x_coord = 0u32;
6302    let mut max_y_coord = 0u32;
6303
6304    for instance in instances {
6305        let (w, h) = if instance.needs_refinement {
6306            let (_, trimmed) = instance.instance_bitmap.trim();
6307            (trimmed.width as u32, trimmed.height as u32)
6308        } else {
6309            let sym = &all_symbols[instance.symbol_index];
6310            (sym.width as u32, sym.height as u32)
6311        };
6312
6313        min_x = min_x.min(instance.position.x);
6314        min_y = min_y.min(instance.position.y);
6315        max_x_coord = max_x_coord.max(instance.position.x + w);
6316        max_y_coord = max_y_coord.max(instance.position.y + h);
6317    }
6318
6319    let region_width = max_x_coord.saturating_sub(min_x);
6320    let region_height = max_y_coord.saturating_sub(min_y);
6321
6322    // SBREFINE=1 in the text region params
6323    let params = TextRegionParams {
6324        width: region_width,
6325        height: region_height,
6326        x: min_x,
6327        y: min_y,
6328        ds_offset: config.text_ds_offset,
6329        refine: true, // SBREFINE = 1
6330        log_strips: config.text_log_strips,
6331        ref_corner: config.text_ref_corner,
6332        transposed: config.text_transposed,
6333        comb_op: config.text_comb_op,
6334        refine_template: config.text_refine_template,
6335    };
6336
6337    payload.extend(params.to_bytes());
6338    payload.extend_from_slice(&(instances.len() as u32).to_be_bytes());
6339
6340    let symbol_id_bits = log2up(num_total_dict_symbols.max(1)).max(1);
6341
6342    // Prepare instances with dictionary mapping (same structure as non-refinement)
6343    #[derive(Clone)]
6344    struct RefinedInstance {
6345        strip_base: i32,
6346        x: i32,
6347        t_offset: i32,
6348        symbol_id: u32,
6349        symbol_width: i32,
6350        // Refinement data
6351        needs_refinement: bool,
6352        /// Index into original instances array (for accessing instance_bitmap)
6353        orig_idx: usize,
6354    }
6355
6356    let strip_width = 1i32 << params.log_strips.min(3);
6357    let mut encoded_instances = Vec::with_capacity(instances.len());
6358
6359    for (orig_idx, instance) in instances.iter().enumerate() {
6360        let gs_idx = instance.symbol_index;
6361        let sym = &all_symbols[gs_idx];
6362
6363        let symbol_id = if let Some(symbol_id) = symbol_id_from_dense_maps(
6364            gs_idx,
6365            global_sym_to_dict_pos,
6366            num_global_dict_symbols,
6367            local_sym_to_dict_pos,
6368        ) {
6369            symbol_id
6370        } else {
6371            anyhow::bail!(
6372                "Symbol instance (global_symbols index {}) not found in any dictionary!",
6373                gs_idx
6374            );
6375        };
6376
6377        let abs = instance.position;
6378        let rel_x = abs.x as i32 - min_x as i32;
6379        // REFCORNER=TOPLEFT (value 1): T is the top of the original bounding box.
6380        let rel_y = abs.y as i32 - min_y as i32;
6381        let strip_base = (rel_y / strip_width) * strip_width;
6382        let t_offset = rel_y - strip_base;
6383
6384        encoded_instances.push(RefinedInstance {
6385            strip_base,
6386            x: rel_x,
6387            t_offset,
6388            symbol_id,
6389            symbol_width: sym.width as i32,
6390            needs_refinement: instance.needs_refinement,
6391            orig_idx,
6392        });
6393    }
6394
6395    encoded_instances.sort_by_key(|e| (e.strip_base, e.x));
6396
6397    // Encode strip-by-strip, symbol-by-symbol (same loop structure as non-refinement)
6398    let mut strip_t = 0i32;
6399    let mut first_s = 0i32;
6400    let mut idx = 0usize;
6401
6402    // Default refinement AT pixel: (-1, -1), matching jbig2enc convention
6403    let grat: [(i8, i8); 1] = [(-1, -1)];
6404
6405    // §6.4.5 step 1: initial STRIPT value (decoder reads one IADT before the loop)
6406    let _ = coder.encode_integer(IntProc::Iadt, 0);
6407
6408    while idx < encoded_instances.len() {
6409        let current_strip = encoded_instances[idx].strip_base;
6410        let delta_t = current_strip - strip_t;
6411        let _ = coder.encode_integer(IntProc::Iadt, delta_t / strip_width);
6412        strip_t = current_strip;
6413
6414        let mut first_symbol_in_strip = true;
6415        let mut current_s = 0i32;
6416
6417        while idx < encoded_instances.len() && encoded_instances[idx].strip_base == current_strip {
6418            let item = &encoded_instances[idx];
6419            if first_symbol_in_strip {
6420                let delta_fs = item.x - first_s;
6421                let _ = coder.encode_integer(IntProc::Iafs, delta_fs);
6422                first_s += delta_fs;
6423                current_s = first_s;
6424                first_symbol_in_strip = false;
6425            } else {
6426                let delta_s = item.x - current_s;
6427                let _ = coder.encode_integer(IntProc::Iads, delta_s);
6428                current_s += delta_s;
6429            }
6430
6431            if strip_width > 1 {
6432                let _ = coder.encode_integer(IntProc::Iait, item.t_offset);
6433            }
6434
6435            // Symbol ID
6436            let _ = coder.encode_iaid(item.symbol_id, symbol_id_bits as u8);
6437
6438            // ── SPM: Refinement indicator (RI) ──
6439            let ri = if item.needs_refinement { 1i32 } else { 0i32 };
6440            let _ = coder.encode_integer(IntProc::Iari, ri);
6441
6442            if item.needs_refinement {
6443                // Get the original instance data and the prototype
6444                let orig_instance = &instances[item.orig_idx];
6445                let prototype = &all_symbols[orig_instance.symbol_index];
6446
6447                // Trim the instance bitmap to get the actual glyph
6448                let (_, trimmed_instance) = orig_instance.instance_bitmap.trim();
6449
6450                // Size deltas: how much wider/taller is the instance vs prototype
6451                let rdwi = trimmed_instance.width as i32 - prototype.width as i32;
6452                let rdhi = trimmed_instance.height as i32 - prototype.height as i32;
6453
6454                let _ = coder.encode_integer(IntProc::Iardw, rdwi);
6455                let _ = coder.encode_integer(IntProc::Iardh, rdhi);
6456
6457                // Position offsets for aligning the reference within the target.
6458                // Per §6.4.11.3.2: GRDX = (RDWI/2) + RDXI, GRDY = (RDHI/2) + RDYI
6459                // Use the pre-computed alignment offsets from clustering
6460                let rdxi = orig_instance.refinement_dx;
6461                let rdyi = orig_instance.refinement_dy;
6462
6463                let _ = coder.encode_integer(IntProc::Iardx, rdxi);
6464                let _ = coder.encode_integer(IntProc::Iardy, rdyi);
6465
6466                // Compute GRDX/GRDY for the refinement region
6467                let grdx = (rdwi / 2) + rdxi;
6468                let grdy = (rdhi / 2) + rdyi;
6469
6470                // Encode the refinement region: pixel-by-pixel difference
6471                // between the trimmed instance and the prototype
6472                coder.encode_refinement_region(
6473                    &trimmed_instance,
6474                    prototype,
6475                    grdx,
6476                    grdy,
6477                    config.text_refine_template,
6478                    &grat,
6479                )?;
6480
6481                // Reset refinement contexts between instances (per JBIG2 spec)
6482                coder.reset_refinement_contexts();
6483            }
6484
6485            current_s += item.symbol_width - 1;
6486            idx += 1;
6487        }
6488        let _ = coder.encode_oob(IntProc::Iads);
6489    }
6490
6491    coder.flush(true);
6492    payload.extend(coder.as_bytes());
6493
6494    Ok(payload)
6495}
6496
6497/// Encodes a text region segment to the output.
6498///
6499/// This function takes a list of symbols and their instances in the text region,
6500/// and encodes them according to JBIG2 spec §6.4.10. It supports both absolute coordinates
6501/// and IADW/IADH delta encoding for more efficient compression.
6502pub fn encode_text_region(
6503    instances: &[SymbolInstance],
6504    config: &Jbig2Config,
6505    all_known_symbols: &[&BitImage],
6506    global_dict_indices: &[usize],
6507    local_dict_indices: &[usize],
6508) -> Result<Vec<u8>> {
6509    // Validate instances
6510    if instances.is_empty() {
6511        return Err(anyhow!("No symbol instances provided for text region"));
6512    }
6513
6514    // Validate global dictionary indices
6515    if global_dict_indices
6516        .iter()
6517        .any(|&idx| idx >= all_known_symbols.len())
6518    {
6519        return Err(anyhow!("Invalid global dictionary index in text region"));
6520    }
6521
6522    // Validate local dictionary indices if provided
6523    if !local_dict_indices.is_empty() {
6524        if local_dict_indices
6525            .iter()
6526            .any(|&idx| idx >= all_known_symbols.len())
6527        {
6528            return Err(anyhow!("Invalid local dictionary index in text region"));
6529        }
6530    }
6531
6532    // Validate each instance
6533    for (i, instance) in instances.iter().enumerate() {
6534        if instance.symbol_index >= all_known_symbols.len() {
6535            return Err(anyhow!(
6536                "Symbol instance {} references invalid symbol index {} (max {})",
6537                i,
6538                instance.symbol_index,
6539                all_known_symbols.len() - 1
6540            ));
6541        }
6542
6543        let symbol = &all_known_symbols[instance.symbol_index];
6544        if instance.position.x as u64 + symbol.width as u64 > u32::MAX as u64
6545            || instance.position.y as u64 + symbol.height as u64 > u32::MAX as u64
6546        {
6547            return Err(anyhow!(
6548                "Symbol instance {} at position ({}, {}) would overflow 32-bit coordinates",
6549                i,
6550                instance.position.x,
6551                instance.position.y
6552            ));
6553        }
6554    }
6555    let mut payload = Vec::new();
6556    let mut coder = Jbig2ArithCoder::new();
6557
6558    let mut min_x = u32::MAX;
6559    let mut min_y = u32::MAX;
6560    let mut max_x_coord = 0;
6561    let mut max_y_coord = 0;
6562
6563    if instances.is_empty() {
6564        min_x = 0;
6565        min_y = 0;
6566    } else {
6567        for instance in instances {
6568            let pos = instance.position();
6569            let sym_idx_in_all_known_list = instance.symbol_index();
6570            let symbol_width = all_known_symbols[sym_idx_in_all_known_list].width as i32;
6571            let symbol_height = all_known_symbols[sym_idx_in_all_known_list].height as i32;
6572
6573            min_x = min_x.min(pos.x as u32);
6574            min_y = min_y.min(pos.y as u32);
6575            max_x_coord = max_x_coord.max((pos.x as i32 + symbol_width) as u32);
6576            max_y_coord = max_y_coord.max((pos.y as i32 + symbol_height) as u32);
6577        }
6578    }
6579
6580    let region_width = if max_x_coord > min_x {
6581        max_x_coord - min_x
6582    } else {
6583        0
6584    };
6585    let region_height = if max_y_coord > min_y {
6586        max_y_coord - min_y
6587    } else {
6588        0
6589    };
6590
6591    let params = TextRegionParams {
6592        width: region_width,
6593        height: region_height,
6594        x: min_x,
6595        y: min_y,
6596        ds_offset: config.text_ds_offset,
6597        refine: config.text_refine,
6598        log_strips: config.text_log_strips,
6599        ref_corner: config.text_ref_corner,
6600        transposed: config.text_transposed,
6601        comb_op: config.text_comb_op,
6602        refine_template: config.text_refine_template,
6603    };
6604    if cfg!(debug_assertions) {
6605        trace!("encode_text_region: TextRegionParams details: {:?}", params);
6606    }
6607    // Write text-region header and number of instances (SBNUMINSTANCES).
6608    payload.extend(params.to_bytes());
6609    payload.extend_from_slice(&(instances.len() as u32).to_be_bytes());
6610
6611    // Number of bits used by IAID symbol coding.
6612    let num_total_dict_symbols = (global_dict_indices.len() + local_dict_indices.len()) as u32;
6613    let symbol_id_bits = log2up(num_total_dict_symbols.max(1)).max(1);
6614
6615    #[derive(Clone, Copy)]
6616    struct EncodedInstance {
6617        strip_base: i32,
6618        x: i32,
6619        t_offset: i32,
6620        symbol_id: u32,
6621        symbol_width: i32,
6622    }
6623
6624    let strip_width = 1i32 << params.log_strips.min(3);
6625    let mut encoded_instances = Vec::with_capacity(instances.len());
6626
6627    for instance in instances {
6628        let sym_idx_in_all_known_list = instance.symbol_index();
6629        let symbol_props = &all_known_symbols[sym_idx_in_all_known_list];
6630        let symbol_id_to_encode = if let Some(pos_global) = global_dict_indices
6631            .iter()
6632            .position(|&idx| idx == sym_idx_in_all_known_list)
6633        {
6634            pos_global as u32
6635        } else if let Some(pos_local) = local_dict_indices
6636            .iter()
6637            .position(|&idx| idx == sym_idx_in_all_known_list)
6638        {
6639            (global_dict_indices.len() + pos_local) as u32
6640        } else {
6641            anyhow::bail!(
6642                "Symbol instance (index {}) not found in referred dictionaries!",
6643                sym_idx_in_all_known_list
6644            );
6645        };
6646
6647        // REFCORNER=TOPLEFT (value 1): T is the top of the original bounding box.
6648        let abs = instance.position();
6649        let rel_x = abs.x as i32 - min_x as i32;
6650        let rel_y = abs.y as i32 - min_y as i32;
6651        let strip_base = (rel_y / strip_width) * strip_width;
6652        let t_offset = rel_y - strip_base;
6653
6654        encoded_instances.push(EncodedInstance {
6655            strip_base,
6656            x: rel_x,
6657            t_offset,
6658            symbol_id: symbol_id_to_encode,
6659            symbol_width: symbol_props.width as i32,
6660        });
6661    }
6662
6663    // Sort strip-wise (top to bottom), then left to right inside each strip.
6664    encoded_instances.sort_by_key(|e| (e.strip_base, e.x));
6665
6666    let mut strip_t = 0i32;
6667    let mut first_s = 0i32;
6668    let mut idx = 0usize;
6669
6670    // §6.4.5 step 1: initial STRIPT value (decoder reads one IADT before the loop)
6671    let _ = coder.encode_integer(IntProc::Iadt, 0);
6672
6673    while idx < encoded_instances.len() {
6674        let current_strip = encoded_instances[idx].strip_base;
6675        let delta_t = current_strip - strip_t;
6676        let _ = coder.encode_integer(IntProc::Iadt, delta_t / strip_width);
6677        strip_t = current_strip;
6678
6679        let mut first_symbol_in_strip = true;
6680        let mut current_s = 0i32;
6681        while idx < encoded_instances.len() && encoded_instances[idx].strip_base == current_strip {
6682            let item = encoded_instances[idx];
6683            if first_symbol_in_strip {
6684                let delta_fs = item.x - first_s;
6685                let _ = coder.encode_integer(IntProc::Iafs, delta_fs);
6686                first_s += delta_fs;
6687                current_s = first_s;
6688                first_symbol_in_strip = false;
6689            } else {
6690                let delta_s = item.x - current_s;
6691                let _ = coder.encode_integer(IntProc::Iads, delta_s);
6692                current_s += delta_s;
6693            }
6694
6695            if strip_width > 1 {
6696                let _ = coder.encode_integer(IntProc::Iait, item.t_offset);
6697            }
6698            let _ = coder.encode_iaid(item.symbol_id, symbol_id_bits as u8);
6699            current_s += item.symbol_width - 1;
6700            idx += 1;
6701        }
6702        let _ = coder.encode_oob(IntProc::Iads);
6703    }
6704
6705    coder.flush(true);
6706    payload.extend(coder.as_bytes());
6707
6708    Ok(payload)
6709}
6710
6711// ── Union-Find helpers for symbol clustering ──────────────────────────
6712
/// Returns the root of the set containing `i`, shortening the walked path by
/// re-pointing every visited node at its grandparent (path halving).
fn uf_find(parent: &mut [usize], mut i: usize) -> usize {
    loop {
        let up = parent[i];
        if up == i {
            // A node that is its own parent is the set representative.
            return i;
        }
        let grand = parent[up];
        parent[i] = grand;
        i = grand;
    }
}
6720
6721fn uf_union(parent: &mut [usize], rank: &mut [u32], a: usize, b: usize) {
6722    let ra = uf_find(parent, a);
6723    let rb = uf_find(parent, b);
6724    if ra == rb {
6725        return;
6726    }
6727    if rank[ra] < rank[rb] {
6728        parent[ra] = rb;
6729    } else if rank[ra] > rank[rb] {
6730        parent[rb] = ra;
6731    } else {
6732        parent[rb] = ra;
6733        rank[ra] += 1;
6734    }
6735}
6736
6737fn compute_symbol_hash(symbol: &BitImage) -> u32 {
6738    let w = symbol.width as u32;
6739    let h = symbol.height as u32;
6740    (10 * h + 10000 * w) % 10000000
6741}
6742
/// Returns `ceil(log2(v))`, i.e. the smallest `r` with `2^r >= v`.
///
/// `log2up(0)` is defined as 0, and `log2up(1)` is 0 as well.
fn log2up(v: u32) -> u32 {
    match v {
        0 => 0,
        // The bit length of `v - 1` is exactly the rounded-up base-2 logarithm of `v`.
        _ => u32::BITS - (v - 1).leading_zeros(),
    }
}
6756
6757/// Encodes a sequence of images as a JBIG2 document.
6758///
6759/// # Arguments
6760/// * `images` - A slice of 2D arrays containing the input images
6761/// * `config` - Configuration for the encoder
6762///
6763/// # Returns
6764/// A `Result` containing the encoded JBIG2 document as a byte vector if successful,
6765/// or an error if encoding fails.
6766pub fn encode_document(images: &[Array2<u8>], config: &Jbig2Config) -> Result<Vec<u8>> {
6767    let mut encoder = Jbig2Encoder::new(config);
6768    for image in images {
6769        encoder.add_page(image)?;
6770    }
6771    encoder.flush()
6772}
6773
/// Represents a single symbol instance in a text region, with refinement info.
///
/// Produced by `build_dictionary_and_get_instances`, which emits one value per
/// input symbol, and convertible into a `SymbolInstance` for the encoders.
#[derive(Debug, Clone)]
pub struct TextRegionSymbolInstance {
    /// The ID of the symbol in the dictionary.
    /// `build_dictionary_and_get_instances` stores the index of the matched
    /// entry in its returned dictionary vector here.
    pub symbol_id: u32,
    /// The x-coordinate of the instance's top-left corner.
    /// Signed here, but `position()` casts it to `u32`.
    pub x: i32,
    /// The y-coordinate of the instance's top-left corner.
    /// Signed here, but `position()` casts it to `u32`.
    pub y: i32,
    /// The horizontal refinement offset (0 for a freshly added dictionary symbol).
    pub dx: i32,
    /// The vertical refinement offset (0 for a freshly added dictionary symbol).
    pub dy: i32,
    /// Whether this instance is a refinement of a dictionary symbol.
    /// `build_dictionary_and_get_instances` sets this when the comparator
    /// reports a non-zero error against the matched entry.
    pub is_refinement: bool,
}
6790
6791impl TextRegionSymbolInstance {
6792    /// Returns the position of this symbol instance as a Rect.
6793    pub fn position(&self) -> crate::jbig2sym::Rect {
6794        crate::jbig2sym::Rect {
6795            x: self.x as u32,
6796            y: self.y as u32,
6797            width: 0,  // These will be set by the caller
6798            height: 0, // These will be set by the caller
6799        }
6800    }
6801
6802    /// Returns the symbol index for this instance.
6803    pub fn symbol_index(&self) -> usize {
6804        self.symbol_id as usize
6805    }
6806
6807    /// Converts to a SymbolInstance
6808    pub fn to_symbol_instance(&self, symbol_bitmap: &BitImage) -> SymbolInstance {
6809        SymbolInstance {
6810            symbol_index: self.symbol_id as usize,
6811            position: self.position(),
6812            instance_bitmap: symbol_bitmap.clone(),
6813            needs_refinement: self.is_refinement,
6814            refinement_dx: self.dx,
6815            refinement_dy: self.dy,
6816        }
6817    }
6818}
6819
6820pub fn build_dictionary_and_get_instances(
6821    symbols: &[(Rect, BitImage)],
6822    comparator: &mut Comparator,
6823) -> (Vec<BitImage>, Vec<TextRegionSymbolInstance>) {
6824    let mut dictionary_symbols: Vec<BitImage> = Vec::with_capacity(symbols.len());
6825    let mut dictionary_black_pixels = Vec::with_capacity(symbols.len());
6826    let mut instances = Vec::with_capacity(symbols.len());
6827
6828    for (rect, symbol_image) in symbols.iter() {
6829        let mut found_match = false;
6830        // Use a 10% error threshold for matching, as recommended.
6831        let max_err = ((symbol_image.width * symbol_image.height) / 10).max(3) as u32;
6832        let symbol_black_pixels = symbol_image.count_ones();
6833
6834        for (dict_idx, dict_symbol) in dictionary_symbols.iter().enumerate() {
6835            if symbol_image.width.abs_diff(dict_symbol.width) > MAX_DIMENSION_DELTA
6836                || symbol_image.height.abs_diff(dict_symbol.height) > MAX_DIMENSION_DELTA
6837            {
6838                continue;
6839            }
6840
6841            if symbol_black_pixels.abs_diff(dictionary_black_pixels[dict_idx]) > max_err as usize {
6842                continue;
6843            }
6844
6845            // Use a low max_err for finding near-duplicates
6846            if let Some((err, dx, dy)) = comparator.distance(symbol_image, dict_symbol, max_err) {
6847                instances.push(TextRegionSymbolInstance {
6848                    symbol_id: dict_idx as u32,
6849                    x: rect.x as i32,
6850                    y: rect.y as i32,
6851                    dx,
6852                    dy,
6853                    is_refinement: err > 0,
6854                });
6855                found_match = true;
6856                break;
6857            }
6858        }
6859
6860        if !found_match {
6861            let new_idx = dictionary_symbols.len();
6862            dictionary_symbols.push(symbol_image.clone());
6863            dictionary_black_pixels.push(symbol_black_pixels);
6864            instances.push(TextRegionSymbolInstance {
6865                symbol_id: new_idx as u32,
6866                x: rect.x as i32,
6867                y: rect.y as i32,
6868                dx: 0,
6869                dy: 0,
6870                is_refinement: false,
6871            });
6872        }
6873    }
6874
6875    (dictionary_symbols, instances)
6876}
6877
/// Encodes a single page image using a symbol dictionary.
/// This is a high-level function that demonstrates the new encoding pipeline.
///
/// Pipeline: extract symbols (only with the `cc-analysis` feature), build a
/// dictionary, write one symbol dictionary segment, then write one immediate
/// text region segment that refers to it.
///
/// # Arguments
/// * `image` - The page bitmap to analyse.
/// * `config` - Encoder configuration; also selects lossless vs. lossy extraction.
/// * `next_segment_num` - Segment number to assign to the first emitted segment.
///
/// # Returns
/// The serialized segments and the next unused segment number. When no symbols
/// are extracted (always the case without the `cc-analysis` feature) the output
/// is empty and `next_segment_num` is returned unchanged.
///
/// # Errors
/// Propagates failures from dictionary layout planning, dictionary encoding,
/// text region encoding and segment serialization.
pub fn encode_page_with_symbol_dictionary(
    image: &BitImage,
    config: &Jbig2Config,
    next_segment_num: u32,
) -> Result<(Vec<u8>, u32)> {
    // 1. Extract symbols from the page image using CC analysis
    #[cfg(feature = "cc-analysis")]
    let extracted_symbols = {
        let dpi = 300; // Default DPI
        // Loss level 0 for lossless configs, 1 otherwise.
        let losslevel = if config.is_lossless { 0 } else { 1 };
        let cc_image = analyze_page(image, dpi, losslevel);
        let shapes = cc_image.extract_shapes();
        // Convert to (Rect, BitImage) format
        shapes
            .into_iter()
            .map(|(bitmap, bbox)| {
                let rect = Rect {
                    x: bbox.xmin as u32,
                    y: bbox.ymin as u32,
                    width: bbox.width() as u32,
                    height: bbox.height() as u32,
                };
                (rect, bitmap)
            })
            .collect::<Vec<_>>()
    };
    // Without CC analysis there is no symbol source, so the list stays empty.
    #[cfg(not(feature = "cc-analysis"))]
    let extracted_symbols: Vec<(Rect, BitImage)> = Vec::new();

    // Nothing to encode: emit no segments and do not consume a segment number.
    if extracted_symbols.is_empty() {
        return Ok((Vec::new(), next_segment_num));
    }

    // 2. Build the symbol dictionary and get symbol instances
    let mut comparator = Comparator::default();
    let (dictionary_symbols, text_region_instances) =
        build_dictionary_and_get_instances(&extracted_symbols, &mut comparator);
    debug!(
        "Built dictionary with {} symbols and {} instances",
        dictionary_symbols.len(),
        text_region_instances.len()
    );

    let mut output = Vec::new();
    let mut current_segment_number = next_segment_num;

    // 3. Encode the symbol dictionary segment, getting the final symbol-ID mapping.
    let dict_refs: Vec<&BitImage> = dictionary_symbols.iter().collect();
    let dict_layout = plan_symbol_dictionary_layout(&dict_refs, config, None)?;
    let encoded_dict = encode_symbol_dictionary_segments(&dict_refs, config, &dict_layout)?;
    let dict_segment_number = current_segment_number;
    current_segment_number += 1;
    // NOTE(review): the page association is hard-coded to page 1 for both segments.
    Segment {
        number: dict_segment_number,
        seg_type: SegmentType::SymbolDictionary,
        referred_to: Vec::new(),
        page: Some(1),
        payload: encoded_dict.payload.clone(),
        ..Default::default()
    }
    .write_into(&mut output)?;

    // 4. Encode the text region segment using canonical symbol IDs.
    // NOTE(review): instance_bitmap is a clone of the dictionary entry, not the
    // pixels extracted from the page — confirm this is intended for refinement.
    let mut symbol_instances: Vec<SymbolInstance> = text_region_instances
        .iter()
        .map(|instance| {
            let orig_id = instance.symbol_id as usize;
            // Out-of-range IDs fall back to the first dictionary entry's bitmap.
            let symbol_bitmap = if orig_id < dictionary_symbols.len() {
                &dictionary_symbols[orig_id]
            } else {
                &dictionary_symbols[0]
            };
            SymbolInstance {
                symbol_index: orig_id,
                position: instance.position(),
                instance_bitmap: symbol_bitmap.clone(),
                needs_refinement: instance.is_refinement,
                refinement_dx: instance.dx,
                refinement_dy: instance.dy,
            }
        })
        .collect();

    // Re-point instances of symbols that the layout expresses as refinements of
    // another input symbol: they now reference that prototype and are flagged
    // for refinement with the layout's offsets.
    for (orig_idx, refinement) in dict_layout.refinements.iter().enumerate() {
        if let Some(refinement) = refinement {
            for instance in &mut symbol_instances {
                if instance.symbol_index == orig_idx {
                    instance.symbol_index = refinement.prototype_input_index;
                    instance.needs_refinement = true;
                    instance.refinement_dx = refinement.refinement_dx;
                    instance.refinement_dy = refinement.refinement_dy;
                }
            }
        }
    }

    // The refinement-coded text region is used only when the dictionary is not
    // lossy and either refinement is enabled in the config or at least one
    // instance needs it; otherwise the plain mapped encoder is used.
    let region_payload = if !config.uses_lossy_symbol_dictionary()
        && (config.refine || symbol_instances.iter().any(|inst| inst.needs_refinement))
    {
        encode_text_region_with_refinement(
            &symbol_instances,
            config,
            &dictionary_symbols,
            &encoded_dict.input_to_exported_pos,
            encoded_dict.exported_symbol_count,
            &[],
            0,
        )?
    } else {
        encode_text_region_mapped(
            &symbol_instances,
            config,
            &dictionary_symbols,
            &encoded_dict.input_to_exported_pos,
            encoded_dict.exported_symbol_count,
            &[],
            0,
            0,
        )?
    };

    // The text region refers to the dictionary segment written above.
    let region_segment = Segment {
        number: current_segment_number,
        seg_type: SegmentType::ImmediateTextRegion,
        retain_flags: 0,
        referred_to: vec![dict_segment_number],
        page: Some(1), // Assuming page 1
        payload: region_payload,
        ..Default::default()
    };

    // You might want to log text_region_params here too if they are accessible
    region_segment.write_into(&mut output)?;
    current_segment_number += 1;

    Ok((output, current_segment_number))
}
7017
/// Returns the encoder's version string.
pub fn get_version() -> &'static str {
    const VERSION: &str = "0.2.0";
    VERSION
}
7021
7022#[inline]
7023pub fn hash_key(img: &BitImage) -> HashKey {
7024    // Dimension-based bucketing: symbols with similar dimensions land in the same
7025    // bucket, enabling fuzzy matching via the Comparator during extraction.
7026    // The Comparator handles size differences up to MAX_DIMENSION_DELTA (10px),
7027    // so we bucket by (height, width) to keep buckets tight.
7028    let h = img.height as u64;
7029    let w = img.width as u64;
7030    HashKey(h * 10_000 + w)
7031}
7032
7033/// Helper function to find the first black pixel in the BitImage
7034/// Returns (x, y) coordinates of the first black pixel, or None if no black pixels
7035pub fn first_black_pixel(image: &BitImage) -> Option<(usize, usize)> {
7036    for y in 0..image.height {
7037        for x in 0..image.width {
7038            if image.get_usize(x, y) {
7039                return Some((x, y));
7040            }
7041        }
7042    }
7043    None
7044}
7045
// Tests for the refinement-aware dictionary layout; compiled only for test
// builds with the `refine` feature enabled.
#[cfg(all(test, feature = "refine"))]
mod refine_tests {
    use super::*;

    /// Builds a bitmap from ASCII rows: a `'1'` byte sets the pixel, any other
    /// byte leaves it clear. The width is taken from the first row.
    fn symbol_from_rows(rows: &[&str]) -> BitImage {
        let height = rows.len() as u32;
        let width = rows.first().map_or(0, |row| row.len()) as u32;
        let mut image = BitImage::new(width, height).expect("test bitmap");
        for (y, row) in rows.iter().enumerate() {
            for (x, ch) in row.bytes().enumerate() {
                if ch == b'1' {
                    image.set(x as u32, y as u32, true);
                }
            }
        }
        image
    }

    /// Two input symbols must collapse into a single exported prototype, with
    /// the second input recorded as a refinement and every input still mapped
    /// to an exported position.
    #[test]
    fn refinement_layout_collapses_to_prototypes() {
        // NOTE(review): `variant` is built from exactly the same rows as `base`,
        // so this exercises the identical-bitmap case only — confirm intended.
        let base = symbol_from_rows(&["0110", "1001", "1111", "1001", "1001"]);
        let variant = symbol_from_rows(&["0110", "1001", "1111", "1001", "1001"]);
        let symbols = vec![&base, &variant];

        // Dictionary-level refinement on, text-region refinement off.
        let mut config = Jbig2Config::text();
        config.refine = true;
        config.text_refine = false;

        let layout = plan_symbol_dictionary_layout(&symbols, &config, None).expect("layout");
        assert_eq!(layout.segment_count(), 1);
        assert_eq!(layout.export_input_indices.len(), 1);
        assert!(layout.refinements[1].is_some());

        let encoded =
            encode_symbol_dictionary_segments(&symbols, &config, &layout).expect("encode");
        assert_eq!(encoded.exported_symbol_count, 1);
        // No input may be left at the u32::MAX position value.
        assert!(
            encoded
                .input_to_exported_pos
                .iter()
                .all(|&pos| pos != u32::MAX)
        );
    }
}
7085                .iter()
7086                .all(|&pos| pos != u32::MAX)
7087        );
7088    }
7089}