1use std::cmp::Reverse;
2use std::collections::HashMap;
3use std::fs;
4use std::path::Path;
5
6use html2text::from_read;
7use image::{self, DynamicImage};
8use pulldown_cmark::{html, Parser};
9use rayon::prelude::*;
10
11use crate::document::{
12 hash_payload, CellRecord, CellType, CodeHash, Document, Header, NumGuard, PageInfo,
13};
14use crate::error::{DcfError, Result};
15use crate::metrics::Metrics;
16use crate::normalization::{
17 classify_cell_type, importance_score, looks_like_table_with_tolerance, normalize_lines,
18 HyphenationMode, ImportanceTuning,
19};
20use crate::numguard;
21
22#[cfg(feature = "pdfium")]
23use pdfium_render::prelude::*;
24
/// Tunable settings that control how a document is encoded.
///
/// Constructed from a preset via [`EncoderConfig::new`] (page geometry is
/// derived from the preset); the remaining knobs are overridden through
/// [`EncoderBuilder`] methods.
#[derive(Debug, Clone)]
pub struct EncoderConfig {
    /// Preset the geometry defaults were derived from.
    pub preset: EncoderPreset,
    /// Grid layout identifier written into the document header.
    pub grid: String,
    /// Payload-hash codeset identifier written into the document header.
    pub codeset: String,
    /// Page canvas width, in pixels.
    pub page_width_px: u32,
    /// Page canvas height, in pixels.
    pub page_height_px: u32,
    /// Left margin where cells start, in pixels.
    pub margin_left_px: i32,
    /// Top margin where the first line is placed, in pixels.
    pub margin_top_px: i32,
    /// Height of a single laid-out line cell, in pixels.
    pub line_height_px: u32,
    /// Vertical gap between consecutive line cells, in pixels.
    pub line_gap_px: u32,
    /// Optional cap on the number of cells kept (highest importance wins).
    pub budget: Option<usize>,
    /// When true, cells classified as footers are dropped after encoding.
    pub drop_footers: bool,
    /// When > 0, identical payloads reappearing within this many pages are dropped.
    pub dedup_window_pages: u32,
    /// How hyphenated line breaks are normalized.
    pub hyphenation: HyphenationMode,
    /// Pixel tolerance used when detecting table-like column alignment.
    pub table_column_tolerance: u32,
    /// Allow OCR for pages/images without extractable text.
    pub enable_ocr: bool,
    /// Force OCR even when text extraction succeeds (requires `enable_ocr`).
    pub force_ocr: bool,
    /// Language codes passed to the OCR backend (default: `"eng"`).
    pub ocr_languages: Vec<String>,
    /// Weights used when scoring cell importance.
    pub importance: ImportanceTuning,
}
46
47impl EncoderConfig {
48 fn new(preset: EncoderPreset) -> Self {
49 let (page_width_px, page_height_px, line_height_px, line_gap_px) = match preset {
50 EncoderPreset::Slides => (1920, 1080, 42, 12),
51 EncoderPreset::News => (1100, 1600, 28, 8),
52 EncoderPreset::Scans => (1400, 2000, 30, 8),
53 _ => (1024, 1400, 24, 6),
54 };
55 Self {
56 preset,
57 grid: "coarse".to_string(),
58 codeset: "HASH256".to_string(),
59 page_width_px,
60 page_height_px,
61 margin_left_px: 64,
62 margin_top_px: 64,
63 line_height_px,
64 line_gap_px,
65 budget: None,
66 drop_footers: false,
67 dedup_window_pages: 0,
68 hyphenation: HyphenationMode::Merge,
69 table_column_tolerance: 24,
70 enable_ocr: false,
71 force_ocr: false,
72 ocr_languages: vec!["eng".to_string()],
73 importance: ImportanceTuning::default(),
74 }
75 }
76}
77
/// Fluent, consuming builder for [`Encoder`], seeded from a preset's defaults.
#[derive(Debug, Clone)]
pub struct EncoderBuilder {
    // Configuration being assembled; handed to the Encoder by build().
    config: EncoderConfig,
}
82
impl EncoderBuilder {
    /// Creates a builder seeded with the named preset's defaults.
    ///
    /// # Errors
    /// Fails if `preset` is not a known preset name
    /// (see [`EncoderPreset::from_str`]).
    pub fn new<S: AsRef<str>>(preset: S) -> Result<Self> {
        Ok(Self {
            config: EncoderConfig::new(EncoderPreset::from_str(preset.as_ref())?),
        })
    }

    /// Sets (or clears, with `None`) the maximum number of cells to keep.
    pub fn budget(mut self, budget: Option<usize>) -> Self {
        self.config.budget = budget;
        self
    }

    /// Enables or disables dropping footer cells after encoding.
    pub fn drop_footers(mut self, drop: bool) -> Self {
        self.config.drop_footers = drop;
        self
    }

    /// Sets the page window for cross-page payload dedup (0 disables it).
    pub fn dedup_window(mut self, window: u32) -> Self {
        self.config.dedup_window_pages = window;
        self
    }

    /// Chooses how hyphenated line breaks are normalized.
    pub fn hyphenation(mut self, mode: HyphenationMode) -> Self {
        self.config.hyphenation = mode;
        self
    }

    /// Sets the pixel tolerance used for table-column detection.
    pub fn table_tolerance(mut self, tolerance: u32) -> Self {
        self.config.table_column_tolerance = tolerance;
        self
    }

    /// Allows OCR for inputs without extractable text.
    pub fn enable_ocr(mut self, enable: bool) -> Self {
        self.config.enable_ocr = enable;
        self
    }

    /// Forces OCR even when text extraction succeeds.
    pub fn force_ocr(mut self, force: bool) -> Self {
        self.config.force_ocr = force;
        self
    }

    /// Overrides the OCR language list; an empty list keeps the default.
    pub fn ocr_languages(mut self, langs: Vec<String>) -> Self {
        if !langs.is_empty() {
            self.config.ocr_languages = langs;
        }
        self
    }

    /// Overrides the importance-scoring weights.
    pub fn importance_tuning(mut self, tuning: ImportanceTuning) -> Self {
        self.config.importance = tuning;
        self
    }

    /// Finalizes the configuration into an [`Encoder`].
    pub fn build(self) -> Encoder {
        Encoder {
            config: self.config,
        }
    }
}
143
/// Document encoder: turns page buffers into a [`Document`] plus [`Metrics`].
#[derive(Debug, Clone)]
pub struct Encoder {
    // Effective configuration, fixed at build time.
    config: EncoderConfig,
}
148
impl Encoder {
    /// Starts an [`EncoderBuilder`] for the named preset.
    pub fn builder<S: AsRef<str>>(preset: S) -> Result<EncoderBuilder> {
        EncoderBuilder::new(preset)
    }

    /// Builds an encoder directly from a preset name with all defaults.
    pub fn from_preset<S: AsRef<str>>(preset: S) -> Result<Self> {
        Ok(Self::builder(preset)?.build())
    }

    /// Sets a cell budget on an already-built encoder (consuming style).
    pub fn with_budget(mut self, budget: usize) -> Self {
        self.config.budget = Some(budget);
        self
    }

    /// Read-only view of the effective configuration.
    pub fn config(&self) -> &EncoderConfig {
        &self.config
    }

    /// Reads `path`, converts it into pages, and encodes the result.
    pub fn encode_path<P: AsRef<Path>>(&self, path: P) -> Result<(Document, Metrics)> {
        let input = EncodeInput::from_path(path.as_ref(), &self.config)?;
        self.encode(input)
    }

    /// Like [`Self::encode_path`], but also returns the input's flattened
    /// plaintext rendering.
    pub fn encode_path_with_plaintext<P: AsRef<Path>>(
        &self,
        path: P,
    ) -> Result<(Document, Metrics, String)> {
        let input = EncodeInput::from_path(path.as_ref(), &self.config)?;
        self.encode_with_plaintext(input)
    }

    /// Encodes `input`, additionally returning its plaintext rendering.
    pub fn encode_with_plaintext(&self, input: EncodeInput) -> Result<(Document, Metrics, String)> {
        // Snapshot the text first: encode() takes ownership of the input.
        let raw_text = input.to_plaintext();
        let (document, metrics) = self.encode(input)?;
        Ok((document, metrics, raw_text))
    }

    /// Encodes a whole document: pages are laid out in parallel, merged in
    /// page order, then trimmed by the optional budget and post filters.
    ///
    /// The returned metrics mix pre-filter totals (`cells_total`,
    /// `lines_total`) with post-filter counts (`cells_kept`, `numguard_count`).
    pub fn encode(&self, input: EncodeInput) -> Result<(Document, Metrics)> {
        let mut document = Document::new(Header {
            version: 1,
            grid: self.config.grid.clone(),
            codeset: self.config.codeset.clone(),
        });
        let mut metrics = Metrics::default();
        metrics.pages = clamp_usize_to_u32(input.pages.len());

        // Record page geometry before the buffers are consumed below.
        for page in &input.pages {
            document.add_page(PageInfo {
                z: page.index,
                width_px: page.width_px,
                height_px: page.height_px,
            });
        }

        // Pages are independent, so lay them out in parallel; rayon's
        // collect preserves input order, keeping the merge deterministic.
        let processed_pages = input
            .pages
            .into_par_iter()
            .map(|page| self.encode_page(page))
            .collect::<Vec<_>>();

        let mut cells_total = 0usize;
        let mut lines_total = 0usize;
        for page_output in processed_pages {
            // Propagate the first per-page failure, if any.
            let page_output = page_output?;
            cells_total += page_output.cells.len();
            lines_total += page_output.line_count;
            document.cells.extend(page_output.cells);
            for guard in page_output.numguards {
                document.add_numguard(guard);
            }
            // First page to supply a payload for a hash wins; later
            // duplicates are ignored.
            for (code, payload) in page_output.dict_entries {
                document.dict.entry(code).or_insert(payload);
            }
        }
        metrics.cells_total = clamp_usize_to_u32(cells_total);
        metrics.lines_total = clamp_usize_to_u32(lines_total);

        // Captured before budget/filters prune the dictionary, so the
        // ratio reflects the full input.
        let unique_payloads = document.dict.len();

        self.apply_budget(&mut document);
        self.post_filters(&mut document);
        self.annotate_rle(&mut document.cells);
        metrics.cells_kept = clamp_usize_to_u32(document.cells.len());
        metrics.numguard_count = clamp_usize_to_u32(document.numguards.len());
        // Average number of cells sharing one unique payload; 0.0 when the
        // dictionary is empty.
        metrics.dedup_ratio = if unique_payloads == 0 {
            0.0
        } else {
            metrics.cells_total as f32 / unique_payloads as f32
        };

        Ok((document, metrics))
    }

    /// Lays out one page: normalizes its lines, classifies each line into a
    /// cell type, scores importance, and stacks cells top-to-bottom.
    fn encode_page(&self, page: PageBuffer) -> Result<PageResult> {
        let normalized = normalize_lines(&page.lines, self.config.hyphenation);
        // Running baseline (pixels from the page top) for the next cell.
        let mut y = self.config.margin_top_px;
        let mut cells = Vec::with_capacity(normalized.len());
        let mut dict_entries = Vec::new();
        let mut numguards_acc = Vec::new();
        for (line_index, line) in normalized.iter().enumerate() {
            let mut cell_type: CellType = classify_cell_type(line);
            // Plain text with column-aligned content is promoted to a table.
            if cell_type == CellType::Text
                && looks_like_table_with_tolerance(line, self.config.table_column_tolerance)
            {
                cell_type = CellType::Table;
            }
            let importance = importance_score(line, cell_type, line_index, &self.config.importance);
            let code_id = hash_payload(line);
            // Usable width: page width minus symmetric left/right margins,
            // clamped at zero for pathological configurations.
            let w = (page.width_px as i32 - self.config.margin_left_px * 2).max(0) as u32;
            let cell = CellRecord {
                z: page.index,
                x: self.config.margin_left_px,
                y,
                w,
                h: self.config.line_height_px,
                code_id,
                rle: 0,
                cell_type,
                importance,
            };
            cells.push(cell);
            dict_entries.push((code_id, line.clone()));
            // NOTE(review): the `as u32` casts below wrap if the margin or
            // `y` ever goes negative (both are i32-configurable) — confirm
            // callers keep margins >= 0.
            let guards = numguard::extract_guards(
                line,
                page.index,
                self.config.margin_left_px as u32,
                y as u32,
            );
            numguards_acc.extend(guards);
            y += (self.config.line_height_px + self.config.line_gap_px) as i32;
        }
        Ok(PageResult {
            cells,
            dict_entries,
            numguards: numguards_acc,
            line_count: normalized.len(),
        })
    }

    /// Keeps only the `budget` most important cells (ties broken by cell
    /// key), then restores document order and prunes the dictionary.
    fn apply_budget(&self, doc: &mut Document) {
        if let Some(limit) = self.config.budget {
            if doc.cells.len() <= limit {
                return;
            }
            doc.cells.sort_by_key(|c| (Reverse(c.importance), c.key()));
            doc.cells.truncate(limit);
            doc.cells.sort_by_key(|c| c.key());
            doc.retain_dict_for_cells();
        }
    }

    /// Applies optional footer dropping and cross-page dedup, then re-sorts
    /// the cells and prunes dictionary entries no surviving cell references.
    fn post_filters(&self, doc: &mut Document) {
        if self.config.drop_footers {
            doc.cells.retain(|c| c.cell_type != CellType::Footer);
        }
        if self.config.dedup_window_pages > 0 {
            // For each payload hash, remember the pages where it was kept;
            // later occurrences within the page window are dropped.
            let mut seen: HashMap<CodeHash, Vec<u32>> = HashMap::new();
            doc.cells.retain(|cell| {
                let entry = seen.entry(cell.code_id).or_insert_with(Vec::new);
                if entry
                    .iter()
                    .any(|z| cell.z.abs_diff(*z) <= self.config.dedup_window_pages)
                {
                    false
                } else {
                    entry.push(cell.z);
                    true
                }
            });
        }
        doc.cells.sort_by_key(|c| c.key());
        doc.retain_dict_for_cells();
    }

    /// Marks runs of adjacent cells sharing a payload hash: the first cell
    /// of a run stores the number of *extra* repeats in `rle`, the rest 0.
    fn annotate_rle(&self, cells: &mut [CellRecord]) {
        if cells.is_empty() {
            return;
        }
        let mut i = 0;
        while i < cells.len() {
            let mut run = 1;
            while i + run < cells.len() && cells[i + run].code_id == cells[i].code_id {
                run += 1;
            }
            cells[i].rle = (run - 1) as u32;
            for j in 1..run {
                cells[i + j].rle = 0;
            }
            i += run;
        }
    }
}
341
/// In-memory document input: an ordered list of per-page line buffers.
#[derive(Debug, Clone)]
pub struct EncodeInput {
    /// Pages in reading order; each buffer's `index` is its z position.
    pub pages: Vec<PageBuffer>,
}
346
347impl EncodeInput {
348 pub fn from_path(path: &Path, config: &EncoderConfig) -> Result<Self> {
349 let ext = path
350 .extension()
351 .and_then(|ext| ext.to_str())
352 .map(|s| s.to_lowercase());
353
354 match ext.as_deref() {
355 Some("txt") | Some("text") => {
356 let content = read_text_lossy(path)?;
357 Ok(Self {
358 pages: text_to_pages(&content, config),
359 })
360 }
361 Some("md") | Some("markdown") => {
362 let content = read_text_lossy(path)?;
363 let markdown = markdown_to_text(&content);
364 Ok(Self {
365 pages: text_to_pages(&markdown, config),
366 })
367 }
368 Some("html") | Some("htm") => {
369 let content = read_text_lossy(path)?;
370 let flattened = html_to_plaintext(&content);
371 Ok(Self {
372 pages: text_to_pages(&flattened, config),
373 })
374 }
375 Some("tex") | Some("json") | Some("bib") => {
376 let content = read_text_lossy(path)?;
377 Ok(Self {
378 pages: text_to_pages(&content, config),
379 })
380 }
381 Some("pdf") => Self::from_pdf(path, config),
382 Some(ext) if is_image_ext(ext) => Self::from_image(path, config),
383 None => {
384 let content = read_text_lossy(path)?;
385 Ok(Self {
386 pages: text_to_pages(&content, config),
387 })
388 }
389 _ => Err(DcfError::UnsupportedInput(path.to_path_buf())),
390 }
391 }
392
393 fn from_pdf(path: &Path, config: &EncoderConfig) -> Result<Self> {
394 #[cfg(feature = "pdfium")]
395 {
396 match pdfium_pdf_to_pages(path, config) {
397 Ok(pages) => return Ok(Self { pages }),
398 Err(err) => {
399 tracing::warn!("pdfium read failed: {err}");
400 }
401 }
402 }
403 let pages = fallback_pdf_to_pages(path, config)?;
404 Ok(Self { pages })
405 }
406
407 fn from_image(path: &Path, config: &EncoderConfig) -> Result<Self> {
408 let image = image::open(path).map_err(|e| {
409 DcfError::Other(format!("failed to open image {}: {e}", path.display()))
410 })?;
411 let pages = ocr_image_to_pages(image, config)?;
412 Ok(Self { pages })
413 }
414
415 pub fn to_plaintext(&self) -> String {
416 let mut buffer = String::new();
417 for (idx, page) in self.pages.iter().enumerate() {
418 if idx > 0 {
419 buffer.push_str("\n\n");
420 }
421 for line in &page.lines {
422 buffer.push_str(line);
423 buffer.push('\n');
424 }
425 }
426 buffer
427 }
428}
429
/// A single page's text, already split into display lines.
#[derive(Debug, Clone)]
pub struct PageBuffer {
    /// Zero-based page number (used as the cell `z` coordinate).
    pub index: u32,
    /// Page width in pixels.
    pub width_px: u32,
    /// Page height in pixels.
    pub height_px: u32,
    /// Display lines, top to bottom; blanks appear as empty strings.
    pub lines: Vec<String>,
}
437
438impl PageBuffer {
439 fn from_text(index: u32, text: &str, config: &EncoderConfig) -> Self {
440 let wrap_width = (config.page_width_px / 10).max(40) as usize;
441 let mut lines = Vec::new();
442 for raw_line in text.lines() {
443 if raw_line.trim().is_empty() {
444 lines.push(String::new());
445 continue;
446 }
447 for chunk in wrap_line(raw_line, wrap_width) {
448 lines.push(chunk);
449 }
450 }
451 if lines.is_empty() {
452 lines.push(String::new());
453 }
454 Self {
455 index,
456 width_px: config.page_width_px,
457 height_px: config.page_height_px,
458 lines,
459 }
460 }
461}
462
/// Per-page output of `Encoder::encode_page`, merged by `Encoder::encode`.
#[derive(Debug, Clone)]
struct PageResult {
    // Cells laid out for this page.
    cells: Vec<CellRecord>,
    // (hash, payload) pairs destined for the document dictionary.
    dict_entries: Vec<(CodeHash, String)>,
    // Numeric guards extracted from the page's lines.
    numguards: Vec<NumGuard>,
    // Number of normalized lines on the page (feeds metrics.lines_total).
    line_count: usize,
}
470
/// Built-in layout presets that seed [`EncoderConfig`]'s page geometry.
#[derive(Debug, Clone, Copy)]
pub enum EncoderPreset {
    /// Portrait documents (baseline geometry).
    Reports,
    /// Widescreen 1920x1080 slide decks.
    Slides,
    /// Tall, narrow news-style pages.
    News,
    /// Scanned-document pages.
    Scans,
    /// Caller-tuned configuration; starts from the same geometry as Reports.
    Custom,
}
479
480impl EncoderPreset {
481 pub fn from_str(name: &str) -> Result<Self> {
482 match name.to_lowercase().as_str() {
483 "reports" => Ok(Self::Reports),
484 "slides" => Ok(Self::Slides),
485 "news" => Ok(Self::News),
486 "scans" => Ok(Self::Scans),
487 "custom" => Ok(Self::Custom),
488 other => Err(DcfError::UnknownPreset(other.to_string())),
489 }
490 }
491}
492
493fn text_to_pages(text: &str, config: &EncoderConfig) -> Vec<PageBuffer> {
494 text.split('\u{c}')
495 .enumerate()
496 .map(|(idx, chunk)| PageBuffer::from_text(idx as u32, chunk, config))
497 .collect()
498}
499
500fn fallback_pdf_to_pages(path: &Path, config: &EncoderConfig) -> Result<Vec<PageBuffer>> {
501 let pages = pdf_extract::extract_text_by_pages(path)
502 .map_err(|e| DcfError::Other(format!("pdf extract failed: {e}")))?;
503 Ok(pages
504 .into_iter()
505 .enumerate()
506 .map(|(idx, txt)| PageBuffer::from_text(idx as u32, &txt, config))
507 .collect())
508}
509
#[cfg(feature = "pdfium")]
/// Extracts text per page via pdfium, rasterizing and OCRing pages that
/// have little or no embedded text (or every page, when `force_ocr` is set).
fn pdfium_pdf_to_pages(path: &Path, config: &EncoderConfig) -> Result<Vec<PageBuffer>> {
    let bindings = Pdfium::bind_to_system_library()
        .map_err(|e| DcfError::Other(format!("pdfium binding failed: {e}")))?;
    let pdfium = Pdfium::new(bindings);
    let document = pdfium
        .load_pdf_from_file(path, None)
        .map_err(|e| DcfError::Other(format!("pdfium load failed: {e}")))?;
    let mut buffers = Vec::new();
    for (idx, page) in document.pages().iter().enumerate() {
        let mut page_text = page.text().ok().map(|t| t.all()).unwrap_or_default();
        let mut should_ocr = config.force_ocr;
        if !should_ocr {
            // Heuristic: a page with fewer than 16 bytes of trimmed text is
            // treated as a scan worth OCRing — but only if OCR is enabled.
            let trimmed = page_text.trim();
            if trimmed.is_empty() || trimmed.len() < 16 {
                should_ocr = config.enable_ocr;
            }
        }
        if should_ocr {
            // `force_ocr` without `enable_ocr` is a configuration error.
            if !config.enable_ocr {
                return Err(DcfError::OcrSupportDisabled);
            }
            // Render at the configured page size; dimensions must fit i32
            // because that is what the pdfium render config accepts.
            let target_width: i32 = config
                .page_width_px
                .try_into()
                .map_err(|_| DcfError::Other("page width exceeds i32::MAX".to_string()))?;
            let target_height: i32 = config
                .page_height_px
                .try_into()
                .map_err(|_| DcfError::Other("page height exceeds i32::MAX".to_string()))?;
            let render_config = PdfRenderConfig::new()
                .set_target_width(target_width)
                .set_target_height(target_height);
            let render = page
                .render_with_config(&render_config)
                .map_err(|e| DcfError::Other(format!("pdf render failed: {e}")))?;
            let image = render.as_image();
            // OCR output replaces whatever (sparse) text extraction found.
            page_text = crate::ocr::image_to_text(&image, &config.ocr_languages)?;
        }
        buffers.push(PageBuffer::from_text(idx as u32, &page_text, config));
    }
    Ok(buffers)
}
553
554fn markdown_to_text(md: &str) -> String {
555 let mut html_buf = String::new();
556 html::push_html(&mut html_buf, Parser::new(md));
557 html_to_plaintext(&html_buf)
558}
559
/// Flattens HTML to plain text, wrapping the rendered output at 80 columns.
fn html_to_plaintext(html_src: &str) -> String {
    from_read(html_src.as_bytes(), 80)
}
563
/// Returns true when `ext` (already lowercased by the caller) is a
/// supported raster-image extension.
fn is_image_ext(ext: &str) -> bool {
    const IMAGE_EXTS: &[&str] = &["png", "jpg", "jpeg", "tif", "tiff", "bmp", "webp", "gif"];
    IMAGE_EXTS.contains(&ext)
}
570
/// Saturating narrowing of a `usize` count into a `u32` metrics field.
fn clamp_usize_to_u32(value: usize) -> u32 {
    u32::try_from(value).unwrap_or(u32::MAX)
}
574
/// Greedy word-wrap by byte length.
///
/// Lines already within `width` are returned trimmed and unchanged.
/// Longer lines are rebuilt word by word (collapsing internal whitespace);
/// a single word longer than `width` gets a chunk of its own. The result
/// is never empty.
fn wrap_line(line: &str, width: usize) -> Vec<String> {
    if line.len() <= width {
        return vec![line.trim().to_string()];
    }
    let mut chunks: Vec<String> = Vec::new();
    let mut buf = String::new();
    for word in line.split_whitespace() {
        // Flush when adding the word (plus a joining space) would overflow.
        let must_flush = !buf.is_empty() && buf.len() + word.len() + 1 > width;
        if must_flush {
            chunks.push(buf.trim().to_string());
            buf.clear();
        }
        if !buf.is_empty() {
            buf.push(' ');
        }
        buf.push_str(word);
    }
    if !buf.is_empty() {
        chunks.push(buf.trim().to_string());
    }
    if chunks.is_empty() {
        chunks.push(line.trim().to_string());
    }
    chunks
}
599
600fn read_text_lossy(path: &Path) -> Result<String> {
601 let bytes = fs::read(path)?;
602 Ok(String::from_utf8_lossy(&bytes).to_string())
603}
604
#[cfg(feature = "ocr")]
/// OCRs a standalone image into page buffers; errors unless OCR was
/// explicitly enabled in the configuration.
fn ocr_image_to_pages(image: DynamicImage, config: &EncoderConfig) -> Result<Vec<PageBuffer>> {
    if !config.enable_ocr {
        return Err(DcfError::OcrSupportDisabled);
    }
    let text = crate::ocr::image_to_text(&image, &config.ocr_languages)?;
    Ok(text_to_pages(&text, config))
}
613
#[cfg(not(feature = "ocr"))]
/// Stub used when OCR support is compiled out: always reports it disabled.
fn ocr_image_to_pages(_image: DynamicImage, _config: &EncoderConfig) -> Result<Vec<PageBuffer>> {
    Err(DcfError::OcrSupportDisabled)
}