Skip to main content

three_dcf_core/
encoder.rs

1use std::cmp::Reverse;
2use std::collections::HashMap;
3use std::fs;
4use std::path::Path;
5
6use html2text::from_read;
7use image::{self, DynamicImage};
8use pulldown_cmark::{html, Parser};
9use rayon::prelude::*;
10
11use crate::document::{
12    hash_payload, CellRecord, CellType, CodeHash, Document, Header, NumGuard, PageInfo,
13};
14use crate::error::{DcfError, Result};
15use crate::metrics::Metrics;
16use crate::normalization::{
17    classify_cell_type, importance_score, looks_like_table_with_tolerance, normalize_lines,
18    HyphenationMode, ImportanceTuning,
19};
20use crate::numguard;
21
22#[cfg(feature = "pdfium")]
23use pdfium_render::prelude::*;
24
/// Settings that control how an [`Encoder`] lays out, filters and trims a document.
#[derive(Debug, Clone)]
pub struct EncoderConfig {
    /// Preset the layout defaults were derived from.
    pub preset: EncoderPreset,
    /// Grid label copied into the document header.
    pub grid: String,
    /// Codeset label copied into the document header.
    pub codeset: String,
    /// Page width in pixels assigned to text-derived pages; also determines the wrap width
    /// (one tenth of this value, at least 40) and the render width used for PDF OCR.
    pub page_width_px: u32,
    /// Page height in pixels assigned to text-derived pages and used as the PDF OCR render height.
    pub page_height_px: u32,
    /// Horizontal position of every cell; the cell width is the page width minus twice this margin.
    pub margin_left_px: i32,
    /// Vertical position of the first cell on each page.
    pub margin_top_px: i32,
    /// Height assigned to every cell.
    pub line_height_px: u32,
    /// Extra vertical space added to `line_height_px` when advancing to the next cell.
    pub line_gap_px: u32,
    /// Maximum number of cells to keep (most important first); `None` keeps everything.
    pub budget: Option<usize>,
    /// When set, cells classified as footers are removed after encoding.
    pub drop_footers: bool,
    /// Cells repeating an already kept payload within this many pages are dropped; `0` disables.
    pub dedup_window_pages: u32,
    /// Hyphenation handling passed to line normalization.
    pub hyphenation: HyphenationMode,
    /// Tolerance passed to the table detection heuristic.
    pub table_column_tolerance: u32,
    /// Allows OCR to be used for images and for PDF pages with little extractable text.
    pub enable_ocr: bool,
    /// Requests OCR for every PDF page on the pdfium path; fails unless `enable_ocr` is also set.
    pub force_ocr: bool,
    /// Language codes handed to the OCR backend.
    pub ocr_languages: Vec<String>,
    /// Tuning parameters passed to the importance scoring function.
    pub importance: ImportanceTuning,
}
46
47impl EncoderConfig {
48    fn new(preset: EncoderPreset) -> Self {
49        let (page_width_px, page_height_px, line_height_px, line_gap_px) = match preset {
50            EncoderPreset::Slides => (1920, 1080, 42, 12),
51            EncoderPreset::News => (1100, 1600, 28, 8),
52            EncoderPreset::Scans => (1400, 2000, 30, 8),
53            _ => (1024, 1400, 24, 6),
54        };
55        Self {
56            preset,
57            grid: "coarse".to_string(),
58            codeset: "HASH256".to_string(),
59            page_width_px,
60            page_height_px,
61            margin_left_px: 64,
62            margin_top_px: 64,
63            line_height_px,
64            line_gap_px,
65            budget: None,
66            drop_footers: false,
67            dedup_window_pages: 0,
68            hyphenation: HyphenationMode::Merge,
69            table_column_tolerance: 24,
70            enable_ocr: false,
71            force_ocr: false,
72            ocr_languages: vec!["eng".to_string()],
73            importance: ImportanceTuning::default(),
74        }
75    }
76}
77
/// Fluent builder that accumulates an [`EncoderConfig`] and turns it into an [`Encoder`].
#[derive(Debug, Clone)]
pub struct EncoderBuilder {
    /// Configuration being assembled; handed to the encoder by `build`.
    config: EncoderConfig,
}
82
83impl EncoderBuilder {
84    pub fn new<S: AsRef<str>>(preset: S) -> Result<Self> {
85        Ok(Self {
86            config: EncoderConfig::new(EncoderPreset::from_str(preset.as_ref())?),
87        })
88    }
89
90    pub fn budget(mut self, budget: Option<usize>) -> Self {
91        self.config.budget = budget;
92        self
93    }
94
95    pub fn drop_footers(mut self, drop: bool) -> Self {
96        self.config.drop_footers = drop;
97        self
98    }
99
100    pub fn dedup_window(mut self, window: u32) -> Self {
101        self.config.dedup_window_pages = window;
102        self
103    }
104
105    pub fn hyphenation(mut self, mode: HyphenationMode) -> Self {
106        self.config.hyphenation = mode;
107        self
108    }
109
110    pub fn table_tolerance(mut self, tolerance: u32) -> Self {
111        self.config.table_column_tolerance = tolerance;
112        self
113    }
114
115    pub fn enable_ocr(mut self, enable: bool) -> Self {
116        self.config.enable_ocr = enable;
117        self
118    }
119
120    pub fn force_ocr(mut self, force: bool) -> Self {
121        self.config.force_ocr = force;
122        self
123    }
124
125    pub fn ocr_languages(mut self, langs: Vec<String>) -> Self {
126        if !langs.is_empty() {
127            self.config.ocr_languages = langs;
128        }
129        self
130    }
131
132    pub fn importance_tuning(mut self, tuning: ImportanceTuning) -> Self {
133        self.config.importance = tuning;
134        self
135    }
136
137    pub fn build(self) -> Encoder {
138        Encoder {
139            config: self.config,
140        }
141    }
142}
143
/// Turns input pages into a `Document` plus encoding `Metrics`, driven by an [`EncoderConfig`].
#[derive(Debug, Clone)]
pub struct Encoder {
    /// Settings applied to every encode call.
    config: EncoderConfig,
}
148
149impl Encoder {
150    pub fn builder<S: AsRef<str>>(preset: S) -> Result<EncoderBuilder> {
151        EncoderBuilder::new(preset)
152    }
153
154    pub fn from_preset<S: AsRef<str>>(preset: S) -> Result<Self> {
155        Ok(Self::builder(preset)?.build())
156    }
157
158    pub fn with_budget(mut self, budget: usize) -> Self {
159        self.config.budget = Some(budget);
160        self
161    }
162
163    pub fn config(&self) -> &EncoderConfig {
164        &self.config
165    }
166
167    pub fn encode_path<P: AsRef<Path>>(&self, path: P) -> Result<(Document, Metrics)> {
168        let input = EncodeInput::from_path(path.as_ref(), &self.config)?;
169        self.encode(input)
170    }
171
172    pub fn encode_path_with_plaintext<P: AsRef<Path>>(
173        &self,
174        path: P,
175    ) -> Result<(Document, Metrics, String)> {
176        let input = EncodeInput::from_path(path.as_ref(), &self.config)?;
177        self.encode_with_plaintext(input)
178    }
179
180    pub fn encode_with_plaintext(&self, input: EncodeInput) -> Result<(Document, Metrics, String)> {
181        let raw_text = input.to_plaintext();
182        let (document, metrics) = self.encode(input)?;
183        Ok((document, metrics, raw_text))
184    }
185
186    pub fn encode(&self, input: EncodeInput) -> Result<(Document, Metrics)> {
187        let mut document = Document::new(Header {
188            version: 1,
189            grid: self.config.grid.clone(),
190            codeset: self.config.codeset.clone(),
191        });
192        let mut metrics = Metrics::default();
193        metrics.pages = clamp_usize_to_u32(input.pages.len());
194
195        for page in &input.pages {
196            document.add_page(PageInfo {
197                z: page.index,
198                width_px: page.width_px,
199                height_px: page.height_px,
200            });
201        }
202
203        let processed_pages = input
204            .pages
205            .into_par_iter()
206            .map(|page| self.encode_page(page))
207            .collect::<Vec<_>>();
208
209        let mut cells_total = 0usize;
210        let mut lines_total = 0usize;
211        for page_output in processed_pages {
212            let page_output = page_output?;
213            cells_total += page_output.cells.len();
214            lines_total += page_output.line_count;
215            document.cells.extend(page_output.cells);
216            for guard in page_output.numguards {
217                document.add_numguard(guard);
218            }
219            for (code, payload) in page_output.dict_entries {
220                document.dict.entry(code).or_insert(payload);
221            }
222        }
223        metrics.cells_total = clamp_usize_to_u32(cells_total);
224        metrics.lines_total = clamp_usize_to_u32(lines_total);
225
226        let unique_payloads = document.dict.len();
227
228        self.apply_budget(&mut document);
229        self.post_filters(&mut document);
230        self.annotate_rle(&mut document.cells);
231        metrics.cells_kept = clamp_usize_to_u32(document.cells.len());
232        metrics.numguard_count = clamp_usize_to_u32(document.numguards.len());
233        metrics.dedup_ratio = if unique_payloads == 0 {
234            0.0
235        } else {
236            metrics.cells_total as f32 / unique_payloads as f32
237        };
238
239        Ok((document, metrics))
240    }
241
242    fn encode_page(&self, page: PageBuffer) -> Result<PageResult> {
243        let normalized = normalize_lines(&page.lines, self.config.hyphenation);
244        let mut y = self.config.margin_top_px;
245        let mut cells = Vec::with_capacity(normalized.len());
246        let mut dict_entries = Vec::new();
247        let mut numguards_acc = Vec::new();
248        for (line_index, line) in normalized.iter().enumerate() {
249            let mut cell_type: CellType = classify_cell_type(line);
250            if cell_type == CellType::Text
251                && looks_like_table_with_tolerance(line, self.config.table_column_tolerance)
252            {
253                cell_type = CellType::Table;
254            }
255            let importance = importance_score(line, cell_type, line_index, &self.config.importance);
256            let code_id = hash_payload(line);
257            let w = (page.width_px as i32 - self.config.margin_left_px * 2).max(0) as u32;
258            let cell = CellRecord {
259                z: page.index,
260                x: self.config.margin_left_px,
261                y,
262                w,
263                h: self.config.line_height_px,
264                code_id,
265                rle: 0,
266                cell_type,
267                importance,
268            };
269            cells.push(cell);
270            dict_entries.push((code_id, line.clone()));
271            let guards = numguard::extract_guards(
272                line,
273                page.index,
274                self.config.margin_left_px as u32,
275                y as u32,
276            );
277            numguards_acc.extend(guards);
278            y += (self.config.line_height_px + self.config.line_gap_px) as i32;
279        }
280        Ok(PageResult {
281            cells,
282            dict_entries,
283            numguards: numguards_acc,
284            line_count: normalized.len(),
285        })
286    }
287
288    fn apply_budget(&self, doc: &mut Document) {
289        if let Some(limit) = self.config.budget {
290            if doc.cells.len() <= limit {
291                return;
292            }
293            doc.cells.sort_by_key(|c| (Reverse(c.importance), c.key()));
294            doc.cells.truncate(limit);
295            doc.cells.sort_by_key(|c| c.key());
296            doc.retain_dict_for_cells();
297        }
298    }
299
300    fn post_filters(&self, doc: &mut Document) {
301        if self.config.drop_footers {
302            doc.cells.retain(|c| c.cell_type != CellType::Footer);
303        }
304        if self.config.dedup_window_pages > 0 {
305            let mut seen: HashMap<CodeHash, Vec<u32>> = HashMap::new();
306            doc.cells.retain(|cell| {
307                let entry = seen.entry(cell.code_id).or_insert_with(Vec::new);
308                if entry
309                    .iter()
310                    .any(|z| cell.z.abs_diff(*z) <= self.config.dedup_window_pages)
311                {
312                    false
313                } else {
314                    entry.push(cell.z);
315                    true
316                }
317            });
318        }
319        doc.cells.sort_by_key(|c| c.key());
320        doc.retain_dict_for_cells();
321    }
322
323    fn annotate_rle(&self, cells: &mut [CellRecord]) {
324        if cells.is_empty() {
325            return;
326        }
327        let mut i = 0;
328        while i < cells.len() {
329            let mut run = 1;
330            while i + run < cells.len() && cells[i + run].code_id == cells[i].code_id {
331                run += 1;
332            }
333            cells[i].rle = (run - 1) as u32;
334            for j in 1..run {
335                cells[i + j].rle = 0;
336            }
337            i += run;
338        }
339    }
340}
341
/// Input handed to the encoder: an ordered list of pages, each holding its text lines.
#[derive(Debug, Clone)]
pub struct EncodeInput {
    /// Pages in document order.
    pub pages: Vec<PageBuffer>,
}
346
347impl EncodeInput {
348    pub fn from_path(path: &Path, config: &EncoderConfig) -> Result<Self> {
349        let ext = path
350            .extension()
351            .and_then(|ext| ext.to_str())
352            .map(|s| s.to_lowercase());
353
354        match ext.as_deref() {
355            Some("txt") | Some("text") => {
356                let content = read_text_lossy(path)?;
357                Ok(Self {
358                    pages: text_to_pages(&content, config),
359                })
360            }
361            Some("md") | Some("markdown") => {
362                let content = read_text_lossy(path)?;
363                let markdown = markdown_to_text(&content);
364                Ok(Self {
365                    pages: text_to_pages(&markdown, config),
366                })
367            }
368            Some("html") | Some("htm") => {
369                let content = read_text_lossy(path)?;
370                let flattened = html_to_plaintext(&content);
371                Ok(Self {
372                    pages: text_to_pages(&flattened, config),
373                })
374            }
375            Some("tex") | Some("json") | Some("bib") => {
376                let content = read_text_lossy(path)?;
377                Ok(Self {
378                    pages: text_to_pages(&content, config),
379                })
380            }
381            Some("pdf") => Self::from_pdf(path, config),
382            Some(ext) if is_image_ext(ext) => Self::from_image(path, config),
383            None => {
384                let content = read_text_lossy(path)?;
385                Ok(Self {
386                    pages: text_to_pages(&content, config),
387                })
388            }
389            _ => Err(DcfError::UnsupportedInput(path.to_path_buf())),
390        }
391    }
392
393    fn from_pdf(path: &Path, config: &EncoderConfig) -> Result<Self> {
394        #[cfg(feature = "pdfium")]
395        {
396            match pdfium_pdf_to_pages(path, config) {
397                Ok(pages) => return Ok(Self { pages }),
398                Err(err) => {
399                    tracing::warn!("pdfium read failed: {err}");
400                }
401            }
402        }
403        let pages = fallback_pdf_to_pages(path, config)?;
404        Ok(Self { pages })
405    }
406
407    fn from_image(path: &Path, config: &EncoderConfig) -> Result<Self> {
408        let image = image::open(path).map_err(|e| {
409            DcfError::Other(format!("failed to open image {}: {e}", path.display()))
410        })?;
411        let pages = ocr_image_to_pages(image, config)?;
412        Ok(Self { pages })
413    }
414
415    pub fn to_plaintext(&self) -> String {
416        let mut buffer = String::new();
417        for (idx, page) in self.pages.iter().enumerate() {
418            if idx > 0 {
419                buffer.push_str("\n\n");
420            }
421            for line in &page.lines {
422                buffer.push_str(line);
423                buffer.push('\n');
424            }
425        }
426        buffer
427    }
428}
429
/// One input page: its position in the document, its pixel size and its text lines.
#[derive(Debug, Clone)]
pub struct PageBuffer {
    /// Page index; the loaders in this file number pages from zero.
    pub index: u32,
    /// Page width in pixels.
    pub width_px: u32,
    /// Page height in pixels.
    pub height_px: u32,
    /// Text lines of the page, top to bottom; blank lines are kept as empty strings.
    pub lines: Vec<String>,
}
437
438impl PageBuffer {
439    fn from_text(index: u32, text: &str, config: &EncoderConfig) -> Self {
440        let wrap_width = (config.page_width_px / 10).max(40) as usize;
441        let mut lines = Vec::new();
442        for raw_line in text.lines() {
443            if raw_line.trim().is_empty() {
444                lines.push(String::new());
445                continue;
446            }
447            for chunk in wrap_line(raw_line, wrap_width) {
448                lines.push(chunk);
449            }
450        }
451        if lines.is_empty() {
452            lines.push(String::new());
453        }
454        Self {
455            index,
456            width_px: config.page_width_px,
457            height_px: config.page_height_px,
458            lines,
459        }
460    }
461}
462
/// Output of encoding a single page, merged into the document by `Encoder::encode`.
#[derive(Debug, Clone)]
struct PageResult {
    /// One cell per normalized line of the page.
    cells: Vec<CellRecord>,
    /// Payload hash and line text for each cell, in cell order.
    dict_entries: Vec<(CodeHash, String)>,
    /// Number guards extracted from the page's lines.
    numguards: Vec<NumGuard>,
    /// Number of normalized lines on the page.
    line_count: usize,
}
470
/// Named starting points for an encoder configuration.
///
/// The preset only selects the default page size, line height and line gap;
/// see `EncoderConfig::new` for the concrete values.
///
/// Presets are plain tags, so they can be compared with `==` and used as keys
/// in hash-based collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EncoderPreset {
    /// Report-style documents; uses the general-purpose layout defaults.
    Reports,
    /// Slide decks; uses a landscape page with larger lines.
    Slides,
    /// News-style documents.
    News,
    /// Scanned documents.
    Scans,
    /// Caller-customised configuration; starts from the same defaults as `Reports`.
    Custom,
}
479
480impl EncoderPreset {
481    pub fn from_str(name: &str) -> Result<Self> {
482        match name.to_lowercase().as_str() {
483            "reports" => Ok(Self::Reports),
484            "slides" => Ok(Self::Slides),
485            "news" => Ok(Self::News),
486            "scans" => Ok(Self::Scans),
487            "custom" => Ok(Self::Custom),
488            other => Err(DcfError::UnknownPreset(other.to_string())),
489        }
490    }
491}
492
493fn text_to_pages(text: &str, config: &EncoderConfig) -> Vec<PageBuffer> {
494    text.split('\u{c}')
495        .enumerate()
496        .map(|(idx, chunk)| PageBuffer::from_text(idx as u32, chunk, config))
497        .collect()
498}
499
500fn fallback_pdf_to_pages(path: &Path, config: &EncoderConfig) -> Result<Vec<PageBuffer>> {
501    let pages = pdf_extract::extract_text_by_pages(path)
502        .map_err(|e| DcfError::Other(format!("pdf extract failed: {e}")))?;
503    Ok(pages
504        .into_iter()
505        .enumerate()
506        .map(|(idx, txt)| PageBuffer::from_text(idx as u32, &txt, config))
507        .collect())
508}
509
/// Reads a PDF through pdfium, using each page's text layer and falling back to OCR
/// of a rendered page image where configured.
///
/// A page is sent to OCR when `force_ocr` is set, or when OCR is enabled and the trimmed
/// text layer is shorter than 16 bytes. A page whose text layer cannot be read is treated
/// as having no text.
///
/// # Errors
/// Fails when pdfium cannot be bound or the file cannot be loaded, when OCR is forced but
/// not enabled, when the configured page size does not fit in `i32`, or when rendering or
/// OCR fails.
#[cfg(feature = "pdfium")]
fn pdfium_pdf_to_pages(path: &Path, config: &EncoderConfig) -> Result<Vec<PageBuffer>> {
    // Text layers shorter than this (after trimming) are considered too sparse to trust.
    const MIN_TEXT_LAYER_LEN: usize = 16;

    let bindings = Pdfium::bind_to_system_library()
        .map_err(|e| DcfError::Other(format!("pdfium binding failed: {e}")))?;
    let pdfium = Pdfium::new(bindings);
    let document = pdfium
        .load_pdf_from_file(path, None)
        .map_err(|e| DcfError::Other(format!("pdfium load failed: {e}")))?;

    let mut buffers = Vec::new();
    for (idx, page) in document.pages().iter().enumerate() {
        let extracted = match page.text() {
            Ok(text) => text.all(),
            Err(_) => String::new(),
        };

        let wants_ocr = config.force_ocr
            || (config.enable_ocr && extracted.trim().len() < MIN_TEXT_LAYER_LEN);

        let page_text = if wants_ocr {
            // Only reachable with OCR disabled when it was forced explicitly.
            if !config.enable_ocr {
                return Err(DcfError::OcrSupportDisabled);
            }
            let target_width: i32 = config
                .page_width_px
                .try_into()
                .map_err(|_| DcfError::Other("page width exceeds i32::MAX".to_string()))?;
            let target_height: i32 = config
                .page_height_px
                .try_into()
                .map_err(|_| DcfError::Other("page height exceeds i32::MAX".to_string()))?;
            let render_config = PdfRenderConfig::new()
                .set_target_width(target_width)
                .set_target_height(target_height);
            let render = page
                .render_with_config(&render_config)
                .map_err(|e| DcfError::Other(format!("pdf render failed: {e}")))?;
            let image = render.as_image();
            crate::ocr::image_to_text(&image, &config.ocr_languages)?
        } else {
            extracted
        };

        buffers.push(PageBuffer::from_text(idx as u32, &page_text, config));
    }
    Ok(buffers)
}
553
554fn markdown_to_text(md: &str) -> String {
555    let mut html_buf = String::new();
556    html::push_html(&mut html_buf, Parser::new(md));
557    html_to_plaintext(&html_buf)
558}
559
560fn html_to_plaintext(html_src: &str) -> String {
561    from_read(html_src.as_bytes(), 80)
562}
563
/// Reports whether `ext` is one of the image extensions handled by the OCR path.
///
/// The comparison is exact, so callers are expected to pass a lower-cased extension.
fn is_image_ext(ext: &str) -> bool {
    const IMAGE_EXTENSIONS: [&str; 8] = [
        "png", "jpg", "jpeg", "tif", "tiff", "bmp", "webp", "gif",
    ];
    IMAGE_EXTENSIONS.contains(&ext)
}
570
/// Converts a `usize` to `u32`, saturating at `u32::MAX` instead of truncating.
fn clamp_usize_to_u32(value: usize) -> u32 {
    if value > u32::MAX as usize {
        u32::MAX
    } else {
        value as u32
    }
}
574
/// Greedily wraps `line` at word boundaries so that each piece holds at most `width`
/// characters where possible.
///
/// Width is measured in Unicode scalar values (`char`s), not bytes, so non-ASCII text
/// wraps at the same column as ASCII text.
///
/// * A line that already fits is returned as a single trimmed piece, with its interior
///   whitespace untouched.
/// * Otherwise words are re-joined with single spaces; a word longer than `width` is
///   never split and ends up on a piece of its own.
/// * The result always contains at least one (possibly empty) string.
fn wrap_line(line: &str, width: usize) -> Vec<String> {
    if line.chars().count() <= width {
        return vec![line.trim().to_string()];
    }

    let mut out = Vec::new();
    let mut current = String::new();
    // Length of `current` in chars, tracked incrementally to avoid rescanning it.
    let mut current_chars = 0usize;

    for word in line.split_whitespace() {
        let word_chars = word.chars().count();
        // The `+ 1` accounts for the separating space.
        if !current.is_empty() && current_chars + word_chars + 1 > width {
            out.push(std::mem::take(&mut current));
            current_chars = 0;
        }
        if !current.is_empty() {
            current.push(' ');
            current_chars += 1;
        }
        current.push_str(word);
        current_chars += word_chars;
    }
    if !current.is_empty() {
        out.push(current);
    }
    // Only reachable for whitespace-only input longer than `width`.
    if out.is_empty() {
        out.push(line.trim().to_string());
    }
    out
}
599
600fn read_text_lossy(path: &Path) -> Result<String> {
601    let bytes = fs::read(path)?;
602    Ok(String::from_utf8_lossy(&bytes).to_string())
603}
604
/// Runs OCR on `image` with the configured languages and splits the recognised text into pages.
///
/// # Errors
/// Returns `DcfError::OcrSupportDisabled` when OCR is not enabled in `config`, and
/// propagates failures from the OCR backend.
#[cfg(feature = "ocr")]
fn ocr_image_to_pages(image: DynamicImage, config: &EncoderConfig) -> Result<Vec<PageBuffer>> {
    if config.enable_ocr {
        let recognised = crate::ocr::image_to_text(&image, &config.ocr_languages)?;
        Ok(text_to_pages(&recognised, config))
    } else {
        Err(DcfError::OcrSupportDisabled)
    }
}
613
/// Stand-in compiled when the `ocr` feature is off: image input cannot be converted to
/// text, so every call fails with `DcfError::OcrSupportDisabled`.
#[cfg(not(feature = "ocr"))]
fn ocr_image_to_pages(_image: DynamicImage, _config: &EncoderConfig) -> Result<Vec<PageBuffer>> {
    Err(DcfError::OcrSupportDisabled)
}
617}