Skip to main content

pdfplumber_core/
html.rs

1//! HTML rendering for PDF page content.
2//!
3//! Converts extracted text, tables, and structural elements into
4//! semantic HTML. Useful for document conversion and web display.
5
6use crate::layout::{
7    TextBlock, TextLine, cluster_lines_into_blocks, cluster_words_into_lines,
8    sort_blocks_reading_order, split_lines_at_columns,
9};
10use crate::table::Table;
11use crate::text::Char;
12use crate::words::{Word, WordExtractor, WordOptions};
13
/// Tunable parameters for [`HtmlRenderer`].
///
/// All distances are expressed in points.
#[derive(Debug, Clone)]
pub struct HtmlOptions {
    /// Vertical tolerance used both for word extraction and for
    /// clustering words into lines.
    pub y_tolerance: f64,
    /// Vertical gap threshold handed to the line-to-block clustering step.
    pub y_density: f64,
    /// Horizontal gap threshold handed to column splitting and to
    /// reading-order sorting of blocks.
    pub x_density: f64,
    /// A block is a heading candidate only when its dominant font size
    /// divided by the page's median font size reaches this ratio.
    pub heading_min_ratio: f64,
    /// When `true`, blocks whose every line starts with a bullet or a
    /// number marker are emitted as list items.
    pub detect_lists: bool,
    /// When `true`, paragraph words are wrapped in `<strong>`/`<em>`
    /// according to their font name.
    pub detect_emphasis: bool,
}
30
31impl Default for HtmlOptions {
32    fn default() -> Self {
33        Self {
34            y_tolerance: 3.0,
35            y_density: 10.0,
36            x_density: 10.0,
37            heading_min_ratio: 1.2,
38            detect_lists: true,
39            detect_emphasis: true,
40        }
41    }
42}
43
/// Intermediate representation of one piece of page content, produced
/// by classification and consumed by the final HTML serialisation.
///
/// All text payloads are stored already HTML-escaped (and, for
/// paragraphs, possibly already containing emphasis tags).
#[derive(Debug, Clone, PartialEq)]
enum HtmlElement {
    /// Heading text together with the `<hN>` level to emit.
    Heading { level: u8, text: String },
    /// Body of a `<p>` element.
    Paragraph(String),
    /// A fully rendered `<table>` element.
    Table(String),
    /// One entry of a list; consecutive entries of the same kind are
    /// grouped into a single list when rendered.
    ListItem {
        /// `true` for a numbered (`<ol>`) entry, `false` for a bullet
        /// (`<ul>`) entry.
        ordered: bool,
        /// Entry text with the list marker removed.
        text: String,
    },
}
61
62/// Renders PDF page content as semantic HTML.
63pub struct HtmlRenderer;
64
65impl HtmlRenderer {
66    /// Render characters and tables as HTML.
67    ///
68    /// This is the main entry point. It:
69    /// 1. Extracts words from characters
70    /// 2. Groups words into text blocks
71    /// 3. Classifies blocks as headings, paragraphs, or lists
72    /// 4. Converts tables to HTML table elements
73    /// 5. Interleaves text and tables in reading order
74    pub fn render(chars: &[Char], tables: &[Table], options: &HtmlOptions) -> String {
75        if chars.is_empty() && tables.is_empty() {
76            return String::new();
77        }
78
79        let words = WordExtractor::extract(
80            chars,
81            &WordOptions {
82                y_tolerance: options.y_tolerance,
83                ..WordOptions::default()
84            },
85        );
86
87        let lines = cluster_words_into_lines(&words, options.y_tolerance);
88        let split = split_lines_at_columns(lines, options.x_density);
89        let mut blocks = cluster_lines_into_blocks(split, options.y_density);
90        sort_blocks_reading_order(&mut blocks, options.x_density);
91
92        let median_size = compute_median_font_size(chars);
93
94        let mut elements = classify_blocks(&blocks, median_size, options);
95
96        // Insert tables
97        for table in tables {
98            let table_html = table_to_html(table);
99            elements.push(HtmlElement::Table(table_html));
100        }
101
102        render_elements(&elements)
103    }
104
105    /// Render characters as HTML (no tables).
106    pub fn render_text(chars: &[Char], options: &HtmlOptions) -> String {
107        Self::render(chars, &[], options)
108    }
109
110    /// Convert a table to HTML table element.
111    pub fn table_to_html(table: &Table) -> String {
112        table_to_html(table)
113    }
114
115    /// Detect heading level from font size relative to median.
116    ///
117    /// Returns `Some(level)` (1-6) if the text qualifies as a heading,
118    /// or `None` if it's normal text.
119    pub fn detect_heading_level(font_size: f64, median_size: f64, min_ratio: f64) -> Option<u8> {
120        detect_heading_level(font_size, median_size, min_ratio)
121    }
122}
123
124/// Compute the median font size from characters.
125fn compute_median_font_size(chars: &[Char]) -> f64 {
126    if chars.is_empty() {
127        return 12.0;
128    }
129
130    let mut sizes: Vec<f64> = chars
131        .iter()
132        .filter(|c| c.size > 0.0 && !c.text.trim().is_empty())
133        .map(|c| c.size)
134        .collect();
135
136    if sizes.is_empty() {
137        return 12.0;
138    }
139
140    sizes.sort_by(|a, b| a.partial_cmp(b).unwrap());
141    let mid = sizes.len() / 2;
142    if sizes.len() % 2 == 0 {
143        (sizes[mid - 1] + sizes[mid]) / 2.0
144    } else {
145        sizes[mid]
146    }
147}
148
/// Detect heading level from the ratio of `font_size` to `median_size`.
///
/// Returns `None` when either size is non-positive or NaN, when the
/// ratio or `min_ratio` is NaN, or when the ratio is below `min_ratio`.
/// Otherwise the level is chosen by ratio:
///
/// | ratio     | level |
/// |-----------|-------|
/// | >= 2.0    | 1     |
/// | >= 1.6    | 2     |
/// | >= 1.3    | 3     |
/// | otherwise | 4     |
///
/// Only levels 1-4 are ever produced.
fn detect_heading_level(font_size: f64, median_size: f64, min_ratio: f64) -> Option<u8> {
    // NaN fails every ordered comparison, so it has to be rejected
    // explicitly; otherwise it would fall through to the level-4 branch.
    if font_size.is_nan() || median_size.is_nan() || median_size <= 0.0 || font_size <= 0.0 {
        return None;
    }

    let ratio = font_size / median_size;
    // `ratio` can still be NaN (inf / inf); a NaN threshold can never be met.
    if ratio.is_nan() || min_ratio.is_nan() || ratio < min_ratio {
        return None;
    }

    if ratio >= 2.0 {
        Some(1)
    } else if ratio >= 1.6 {
        Some(2)
    } else if ratio >= 1.3 {
        Some(3)
    } else {
        Some(4)
    }
}
170
/// Recognise a list marker at the start of `text` (leading whitespace
/// is ignored).
///
/// Returns `Some((ordered, rest))` where `ordered` is `false` for a
/// bullet marker and `true` for a numeric marker, and `rest` is the text
/// following the marker and its single separating space. Returns `None`
/// when no marker is present.
///
/// Bullet markers: `-`, `*`, U+2022, U+2013, U+2014, each followed by a
/// space. Numeric markers: one or more ASCII digits followed by `. ` or
/// `) `.
fn detect_list_item(text: &str) -> Option<(bool, String)> {
    const BULLET_MARKERS: [&str; 5] = ["- ", "* ", "\u{2022} ", "\u{2013} ", "\u{2014} "];

    let candidate = text.trim_start();

    let bullet_rest = BULLET_MARKERS
        .iter()
        .find_map(|marker| candidate.strip_prefix(*marker));
    if let Some(rest) = bullet_rest {
        return Some((false, rest.to_string()));
    }

    // ASCII digits are one byte each, so the count is a valid char boundary.
    let digit_count = candidate
        .bytes()
        .take_while(|b| b.is_ascii_digit())
        .count();
    if digit_count == 0 {
        return None;
    }

    let after_digits = &candidate[digit_count..];
    let rest = after_digits
        .strip_prefix(". ")
        .or_else(|| after_digits.strip_prefix(") "))?;
    Some((true, rest.to_string()))
}
201
202/// Get the dominant font size in a text block.
203fn block_dominant_size(block: &TextBlock) -> f64 {
204    let mut sizes: Vec<f64> = Vec::new();
205    for line in &block.lines {
206        for word in &line.words {
207            for ch in &word.chars {
208                if ch.size > 0.0 && !ch.text.trim().is_empty() {
209                    sizes.push(ch.size);
210                }
211            }
212        }
213    }
214    if sizes.is_empty() {
215        return 0.0;
216    }
217
218    sizes.sort_by(|a, b| a.partial_cmp(b).unwrap());
219    let mut best_size = sizes[0];
220    let mut best_count = 1;
221    let mut current_count = 1;
222    for i in 1..sizes.len() {
223        if (sizes[i] - sizes[i - 1]).abs() < 0.1 {
224            current_count += 1;
225        } else {
226            if current_count > best_count {
227                best_count = current_count;
228                best_size = sizes[i - 1];
229            }
230            current_count = 1;
231        }
232    }
233    if current_count > best_count {
234        best_size = *sizes.last().unwrap();
235    }
236    best_size
237}
238
/// Whether a font name suggests a bold face: it contains "bold",
/// "heavy" or "black", compared case-insensitively.
fn is_bold_font(fontname: &str) -> bool {
    let normalized = fontname.to_lowercase();
    ["bold", "heavy", "black"]
        .iter()
        .any(|keyword| normalized.contains(*keyword))
}
244
/// Whether a font name suggests an italic face: it contains "italic"
/// or "oblique", compared case-insensitively.
fn is_italic_font(fontname: &str) -> bool {
    let normalized = fontname.to_lowercase();
    ["italic", "oblique"]
        .iter()
        .any(|keyword| normalized.contains(*keyword))
}
250
251/// Get the dominant font name in a word.
252fn word_dominant_font(word: &Word) -> &str {
253    word.chars
254        .iter()
255        .find(|c| !c.text.trim().is_empty())
256        .map(|c| c.fontname.as_str())
257        .unwrap_or("")
258}
259
/// Escape `&`, `<`, `>` and `"` as HTML entities; every other character
/// is copied through unchanged.
fn escape_html(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
267
268/// Classify text blocks into HTML content elements.
269fn classify_blocks(
270    blocks: &[TextBlock],
271    median_size: f64,
272    options: &HtmlOptions,
273) -> Vec<HtmlElement> {
274    let mut elements = Vec::new();
275
276    for block in blocks {
277        let block_text = block_to_text(block);
278        if block_text.trim().is_empty() {
279            continue;
280        }
281
282        let dominant_size = block_dominant_size(block);
283
284        // Check for heading
285        if let Some(level) =
286            detect_heading_level(dominant_size, median_size, options.heading_min_ratio)
287        {
288            let is_short =
289                block.lines.len() <= 2 && block.lines.iter().all(|l| l.words.len() <= 15);
290            if is_short {
291                let text = escape_html(block_text.trim());
292                elements.push(HtmlElement::Heading { level, text });
293                continue;
294            }
295        }
296
297        // Check for list items
298        if options.detect_lists {
299            let line_texts: Vec<String> = block.lines.iter().map(line_to_text).collect();
300            let all_list_items = line_texts.iter().all(|t| detect_list_item(t).is_some());
301            if all_list_items && !line_texts.is_empty() {
302                for text in &line_texts {
303                    if let Some((ordered, rest)) = detect_list_item(text) {
304                        elements.push(HtmlElement::ListItem {
305                            ordered,
306                            text: escape_html(&rest),
307                        });
308                    }
309                }
310                continue;
311            }
312        }
313
314        // Apply emphasis if enabled
315        let rendered_text = if options.detect_emphasis {
316            render_block_with_emphasis(block)
317        } else {
318            escape_html(&block_text)
319        };
320
321        elements.push(HtmlElement::Paragraph(rendered_text.trim().to_string()));
322    }
323
324    elements
325}
326
327/// Convert a text block to plain text.
328fn block_to_text(block: &TextBlock) -> String {
329    block
330        .lines
331        .iter()
332        .map(line_to_text)
333        .collect::<Vec<_>>()
334        .join("\n")
335}
336
337/// Convert a text line to plain text.
338fn line_to_text(line: &TextLine) -> String {
339    line.words
340        .iter()
341        .map(|w| w.text.as_str())
342        .collect::<Vec<_>>()
343        .join(" ")
344}
345
346/// Render a block with bold/italic emphasis as HTML.
347fn render_block_with_emphasis(block: &TextBlock) -> String {
348    block
349        .lines
350        .iter()
351        .map(render_line_with_emphasis)
352        .collect::<Vec<_>>()
353        .join("\n")
354}
355
356/// Render a line with HTML emphasis tags.
357fn render_line_with_emphasis(line: &TextLine) -> String {
358    let mut parts: Vec<String> = Vec::new();
359
360    for word in &line.words {
361        let font = word_dominant_font(word);
362        let bold = is_bold_font(font);
363        let italic = is_italic_font(font);
364        let text = escape_html(&word.text);
365
366        if bold && italic {
367            parts.push(format!("<strong><em>{text}</em></strong>"));
368        } else if bold {
369            parts.push(format!("<strong>{text}</strong>"));
370        } else if italic {
371            parts.push(format!("<em>{text}</em>"));
372        } else {
373            parts.push(text);
374        }
375    }
376
377    parts.join(" ")
378}
379
380/// Convert a Table to an HTML table element.
381fn table_to_html(table: &Table) -> String {
382    if table.rows.is_empty() {
383        return String::new();
384    }
385
386    let mut html = String::from("<table>\n");
387
388    for (i, row) in table.rows.iter().enumerate() {
389        if i == 0 {
390            html.push_str("<thead>\n<tr>");
391            for cell in row {
392                let text = escape_html(cell.text.as_deref().unwrap_or(""));
393                html.push_str(&format!("<th>{text}</th>"));
394            }
395            html.push_str("</tr>\n</thead>\n<tbody>\n");
396        } else {
397            html.push_str("<tr>");
398            for cell in row {
399                let text = escape_html(cell.text.as_deref().unwrap_or(""));
400                html.push_str(&format!("<td>{text}</td>"));
401            }
402            html.push_str("</tr>\n");
403        }
404    }
405
406    html.push_str("</tbody>\n</table>");
407    html
408}
409
410/// Render HTML elements into a complete HTML string.
411fn render_elements(elements: &[HtmlElement]) -> String {
412    let mut parts: Vec<String> = Vec::new();
413    let mut i = 0;
414
415    while i < elements.len() {
416        match &elements[i] {
417            HtmlElement::Heading { level, text } => {
418                parts.push(format!("<h{level}>{text}</h{level}>"));
419                i += 1;
420            }
421            HtmlElement::Paragraph(text) => {
422                parts.push(format!("<p>{text}</p>"));
423                i += 1;
424            }
425            HtmlElement::Table(html) => {
426                parts.push(html.clone());
427                i += 1;
428            }
429            HtmlElement::ListItem { ordered, .. } => {
430                // Collect consecutive list items of the same type
431                let is_ordered = *ordered;
432                let tag = if is_ordered { "ol" } else { "ul" };
433                let mut items = Vec::new();
434                while i < elements.len() {
435                    if let HtmlElement::ListItem { ordered, text } = &elements[i] {
436                        if *ordered == is_ordered {
437                            items.push(format!("<li>{text}</li>"));
438                            i += 1;
439                        } else {
440                            break;
441                        }
442                    } else {
443                        break;
444                    }
445                }
446                parts.push(format!("<{tag}>\n{}\n</{tag}>", items.join("\n")));
447            }
448        }
449    }
450
451    parts.join("\n")
452}
453
#[cfg(test)]
mod tests {
    use super::*;
    use crate::geometry::BBox;
    use crate::table::Cell;
    use crate::text::TextDirection;

    // --- Fixtures ---

    /// Build one upright, left-to-right Helvetica `Char` with the given
    /// text, bounding box and font size.
    fn make_char(text: &str, x0: f64, top: f64, x1: f64, bottom: f64, size: f64) -> Char {
        Char {
            text: text.to_string(),
            bbox: BBox::new(x0, top, x1, bottom),
            fontname: "Helvetica".to_string(),
            size,
            doctop: top,
            upright: true,
            direction: TextDirection::Ltr,
            stroking_color: None,
            non_stroking_color: None,
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            char_code: 0,
            mcid: None,
            tag: None,
        }
    }

    /// Build one `Char` per character of `text`, laid out left to right
    /// from x = 0 with a fixed advance of `char_width`.
    fn make_text_chars(text: &str, char_width: f64, top: f64, bottom: f64, size: f64) -> Vec<Char> {
        text.chars()
            .enumerate()
            .map(|(i, c)| {
                let x0 = i as f64 * char_width;
                make_char(&c.to_string(), x0, top, x0 + char_width, bottom, size)
            })
            .collect()
    }

    /// Build a `Word` whose characters evenly divide the word's bounding
    /// box and all use `fontname`.
    #[allow(clippy::too_many_arguments)] // fixture mirrors the geometry fields one-to-one
    fn make_word_from_text(
        text: &str,
        x0: f64,
        top: f64,
        x1: f64,
        bottom: f64,
        size: f64,
        fontname: &str,
    ) -> Word {
        // Divide by the number of characters, not bytes, so that
        // non-ASCII fixtures still tile the word's bbox exactly.
        let char_width = (x1 - x0) / text.chars().count() as f64;
        let chars: Vec<Char> = text
            .chars()
            .enumerate()
            .map(|(i, c)| {
                let cx0 = x0 + i as f64 * char_width;
                Char {
                    fontname: fontname.to_string(),
                    ..make_char(&c.to_string(), cx0, top, cx0 + char_width, bottom, size)
                }
            })
            .collect();
        Word {
            text: text.to_string(),
            bbox: BBox::new(x0, top, x1, bottom),
            doctop: top,
            direction: TextDirection::Ltr,
            chars,
        }
    }

    /// Build a table cell with optional text and the given bounding box.
    fn make_cell(text: Option<&str>, x0: f64, top: f64, x1: f64, bottom: f64) -> Cell {
        Cell {
            bbox: BBox::new(x0, top, x1, bottom),
            text: text.map(str::to_string),
        }
    }

    /// Build a table from rows only; `cells` and `columns` stay empty
    /// because the HTML conversion reads `rows` exclusively.
    fn make_table(bbox: BBox, rows: Vec<Vec<Cell>>) -> Table {
        Table {
            bbox,
            cells: vec![],
            rows,
            columns: vec![],
        }
    }

    /// Build a 2x2 table covering (0, 0)-(100, 50) with 50x25 cells.
    fn make_2x2_table(texts: [[Option<&str>; 2]; 2]) -> Table {
        let [[a, b], [c, d]] = texts;
        make_table(
            BBox::new(0.0, 0.0, 100.0, 50.0),
            vec![
                vec![
                    make_cell(a, 0.0, 0.0, 50.0, 25.0),
                    make_cell(b, 50.0, 0.0, 100.0, 25.0),
                ],
                vec![
                    make_cell(c, 0.0, 25.0, 50.0, 50.0),
                    make_cell(d, 50.0, 25.0, 100.0, 50.0),
                ],
            ],
        )
    }

    // --- Heading detection tests ---

    #[test]
    fn test_heading_h1() {
        assert_eq!(detect_heading_level(24.0, 12.0, 1.2), Some(1));
    }

    #[test]
    fn test_heading_h2() {
        assert_eq!(detect_heading_level(20.0, 12.0, 1.2), Some(2));
    }

    #[test]
    fn test_heading_h3() {
        assert_eq!(detect_heading_level(16.0, 12.0, 1.2), Some(3));
    }

    #[test]
    fn test_heading_h4() {
        assert_eq!(detect_heading_level(14.5, 12.0, 1.2), Some(4));
    }

    #[test]
    fn test_no_heading_normal_size() {
        assert_eq!(detect_heading_level(12.0, 12.0, 1.2), None);
    }

    #[test]
    fn test_heading_zero_median() {
        assert_eq!(detect_heading_level(12.0, 0.0, 1.2), None);
    }

    // --- HTML escape tests ---

    #[test]
    fn test_escape_html_ampersand() {
        assert_eq!(escape_html("A & B"), "A &amp; B");
    }

    #[test]
    fn test_escape_html_angle_brackets() {
        assert_eq!(escape_html("<div>"), "&lt;div&gt;");
    }

    #[test]
    fn test_escape_html_quotes() {
        assert_eq!(escape_html("say \"hello\""), "say &quot;hello&quot;");
    }

    #[test]
    fn test_escape_html_combined() {
        assert_eq!(escape_html("a < b & c > d"), "a &lt; b &amp; c &gt; d");
    }

    // --- Table to HTML tests ---

    #[test]
    fn test_table_to_html_simple() {
        let table = make_2x2_table([[Some("Name"), Some("Age")], [Some("Alice"), Some("30")]]);
        let html = table_to_html(&table);
        assert!(html.contains("<table>"));
        assert!(html.contains("<thead>"));
        assert!(html.contains("<th>Name</th>"));
        assert!(html.contains("<th>Age</th>"));
        assert!(html.contains("</thead>"));
        assert!(html.contains("<tbody>"));
        assert!(html.contains("<td>Alice</td>"));
        assert!(html.contains("<td>30</td>"));
        assert!(html.contains("</tbody>"));
        assert!(html.contains("</table>"));
    }

    #[test]
    fn test_table_to_html_with_none_cells() {
        let table = make_2x2_table([[Some("Header"), None], [None, Some("Data")]]);
        let html = table_to_html(&table);
        assert!(html.contains("<th>Header</th>"));
        assert!(html.contains("<th></th>"));
        assert!(html.contains("<td></td>"));
        assert!(html.contains("<td>Data</td>"));
    }

    #[test]
    fn test_table_to_html_empty() {
        let table = make_table(BBox::new(0.0, 0.0, 100.0, 50.0), vec![]);
        assert_eq!(table_to_html(&table), "");
    }

    #[test]
    fn test_table_to_html_escapes_html() {
        let table = make_table(
            BBox::new(0.0, 0.0, 100.0, 50.0),
            vec![
                vec![make_cell(Some("A<B>"), 0.0, 0.0, 100.0, 25.0)],
                vec![make_cell(Some("C&D"), 0.0, 25.0, 100.0, 50.0)],
            ],
        );
        let html = table_to_html(&table);
        assert!(html.contains("A&lt;B&gt;"));
        assert!(html.contains("C&amp;D"));
    }

    // --- Paragraph wrapping tests ---

    #[test]
    fn test_render_simple_paragraph() {
        // Laid out by hand because the space is narrower than the letters.
        let chars = vec![
            make_char("H", 0.0, 0.0, 8.0, 12.0, 12.0),
            make_char("e", 8.0, 0.0, 16.0, 12.0, 12.0),
            make_char("l", 16.0, 0.0, 24.0, 12.0, 12.0),
            make_char("l", 24.0, 0.0, 32.0, 12.0, 12.0),
            make_char("o", 32.0, 0.0, 40.0, 12.0, 12.0),
            make_char(" ", 40.0, 0.0, 44.0, 12.0, 12.0),
            make_char("W", 44.0, 0.0, 52.0, 12.0, 12.0),
            make_char("o", 52.0, 0.0, 60.0, 12.0, 12.0),
            make_char("r", 60.0, 0.0, 68.0, 12.0, 12.0),
            make_char("l", 68.0, 0.0, 76.0, 12.0, 12.0),
            make_char("d", 76.0, 0.0, 84.0, 12.0, 12.0),
        ];
        let result = HtmlRenderer::render_text(&chars, &HtmlOptions::default());
        assert!(
            result.contains("<p>Hello World</p>"),
            "Expected paragraph wrapping, got: {result}"
        );
    }

    #[test]
    fn test_render_heading_detection() {
        // Large heading at 24pt
        let mut chars = make_text_chars("Title", 16.0, 0.0, 24.0, 24.0);
        // Normal body text (gap > y_density)
        chars.extend(make_text_chars("Body text here", 8.0, 40.0, 52.0, 12.0));
        let result = HtmlRenderer::render_text(&chars, &HtmlOptions::default());
        assert!(
            result.contains("<h1>Title</h1>"),
            "Expected H1 heading, got: {result}"
        );
        assert!(
            result.contains("Body text here"),
            "Expected body text, got: {result}"
        );
    }

    #[test]
    fn test_render_empty_input() {
        let result = HtmlRenderer::render(&[], &[], &HtmlOptions::default());
        assert_eq!(result, "");
    }

    // --- Bold/italic emphasis tests ---

    #[test]
    fn test_bold_font_detection() {
        assert!(is_bold_font("Helvetica-Bold"));
        assert!(is_bold_font("TimesNewRoman-BoldItalic"));
        assert!(!is_bold_font("Helvetica"));
        assert!(!is_bold_font("Times-Roman"));
    }

    #[test]
    fn test_italic_font_detection() {
        assert!(is_italic_font("Helvetica-Oblique"));
        assert!(is_italic_font("Times-Italic"));
        assert!(!is_italic_font("Helvetica"));
        assert!(!is_italic_font("Helvetica-Bold"));
    }

    #[test]
    fn test_render_line_with_emphasis() {
        let line = TextLine {
            words: vec![
                make_word_from_text("normal", 0.0, 0.0, 48.0, 12.0, 12.0, "Helvetica"),
                make_word_from_text("bold", 52.0, 0.0, 88.0, 12.0, 12.0, "Helvetica-Bold"),
                make_word_from_text("italic", 92.0, 0.0, 140.0, 12.0, 12.0, "Helvetica-Oblique"),
            ],
            bbox: BBox::new(0.0, 0.0, 140.0, 12.0),
        };
        let result = render_line_with_emphasis(&line);
        assert_eq!(result, "normal <strong>bold</strong> <em>italic</em>");
    }

    #[test]
    fn test_render_bold_italic_combined() {
        let line = TextLine {
            words: vec![make_word_from_text(
                "emphasis",
                0.0,
                0.0,
                64.0,
                12.0,
                12.0,
                "Helvetica-BoldOblique",
            )],
            bbox: BBox::new(0.0, 0.0, 64.0, 12.0),
        };
        let result = render_line_with_emphasis(&line);
        assert_eq!(result, "<strong><em>emphasis</em></strong>");
    }

    // --- HtmlOptions default tests ---

    #[test]
    fn test_html_options_default() {
        let opts = HtmlOptions::default();
        assert_eq!(opts.y_tolerance, 3.0);
        assert_eq!(opts.y_density, 10.0);
        assert_eq!(opts.x_density, 10.0);
        assert_eq!(opts.heading_min_ratio, 1.2);
        assert!(opts.detect_lists);
        assert!(opts.detect_emphasis);
    }

    // --- List detection tests ---

    #[test]
    fn test_detect_bullet_list() {
        let result = detect_list_item("- item text");
        assert_eq!(result, Some((false, "item text".to_string())));
    }

    #[test]
    fn test_detect_numbered_list() {
        let result = detect_list_item("1. first item");
        assert_eq!(result, Some((true, "first item".to_string())));
    }

    #[test]
    fn test_detect_no_list() {
        assert_eq!(detect_list_item("Just normal text"), None);
    }

    // --- Element rendering tests ---

    #[test]
    fn test_render_heading_and_paragraph() {
        let elements = vec![
            HtmlElement::Heading {
                level: 1,
                text: "My Title".to_string(),
            },
            HtmlElement::Paragraph("Some body text.".to_string()),
        ];
        let result = render_elements(&elements);
        assert_eq!(result, "<h1>My Title</h1>\n<p>Some body text.</p>");
    }

    #[test]
    fn test_render_unordered_list() {
        let elements = vec![
            HtmlElement::ListItem {
                ordered: false,
                text: "first".to_string(),
            },
            HtmlElement::ListItem {
                ordered: false,
                text: "second".to_string(),
            },
        ];
        let result = render_elements(&elements);
        assert_eq!(result, "<ul>\n<li>first</li>\n<li>second</li>\n</ul>");
    }

    #[test]
    fn test_render_ordered_list() {
        let elements = vec![
            HtmlElement::ListItem {
                ordered: true,
                text: "first".to_string(),
            },
            HtmlElement::ListItem {
                ordered: true,
                text: "second".to_string(),
            },
        ];
        let result = render_elements(&elements);
        assert_eq!(result, "<ol>\n<li>first</li>\n<li>second</li>\n</ol>");
    }

    #[test]
    fn test_render_with_table() {
        let table = make_2x2_table([[Some("Col1"), Some("Col2")], [Some("A"), Some("B")]]);
        let result = HtmlRenderer::render(&[], &[table], &HtmlOptions::default());
        assert!(result.contains("<table>"));
        assert!(result.contains("<th>Col1</th>"));
        assert!(result.contains("<td>A</td>"));
        assert!(result.contains("</table>"));
    }

    #[test]
    fn test_table_single_row() {
        let table = make_table(
            BBox::new(0.0, 0.0, 100.0, 25.0),
            vec![vec![
                make_cell(Some("Only"), 0.0, 0.0, 50.0, 25.0),
                make_cell(Some("Row"), 50.0, 0.0, 100.0, 25.0),
            ]],
        );
        let html = table_to_html(&table);
        assert!(html.contains("<th>Only</th>"));
        assert!(html.contains("<th>Row</th>"));
        // Single row: thead only, empty tbody
        assert!(html.contains("<tbody>"));
    }

    #[test]
    fn test_median_font_size_empty() {
        assert_eq!(compute_median_font_size(&[]), 12.0);
    }

    #[test]
    fn test_median_font_size_single() {
        let chars = vec![make_char("A", 0.0, 0.0, 10.0, 12.0, 14.0)];
        assert_eq!(compute_median_font_size(&chars), 14.0);
    }

    #[test]
    fn test_block_dominant_size() {
        let block = TextBlock {
            lines: vec![TextLine {
                words: vec![make_word_from_text(
                    "Hello",
                    0.0,
                    0.0,
                    40.0,
                    12.0,
                    14.0,
                    "Helvetica",
                )],
                bbox: BBox::new(0.0, 0.0, 40.0, 12.0),
            }],
            bbox: BBox::new(0.0, 0.0, 40.0, 12.0),
        };
        assert_eq!(block_dominant_size(&block), 14.0);
    }

    // --- End-to-end rendering tests ---

    #[test]
    fn test_render_list_items_as_html() {
        // Two lines 15pt apart so they end up as separate lines.
        let mut chars = make_text_chars("- first item", 8.0, 0.0, 12.0, 12.0);
        chars.extend(make_text_chars("- second item", 8.0, 15.0, 27.0, 12.0));
        let result = HtmlRenderer::render_text(&chars, &HtmlOptions::default());
        assert!(
            result.contains("<ul>"),
            "Expected unordered list, got: {result}"
        );
        assert!(
            result.contains("<li>first item</li>"),
            "Expected first list item, got: {result}"
        );
        assert!(
            result.contains("<li>second item</li>"),
            "Expected second list item, got: {result}"
        );
        assert!(
            result.contains("</ul>"),
            "Expected closing ul tag, got: {result}"
        );
    }

    #[test]
    fn test_heading_html_escapes_content() {
        // Element text is stored pre-escaped; rendering must not
        // escape it a second time.
        let elements = vec![HtmlElement::Heading {
            level: 2,
            text: "A &amp; B".to_string(),
        }];
        let result = render_elements(&elements);
        assert_eq!(result, "<h2>A &amp; B</h2>");
    }

    #[test]
    fn test_paragraph_html_wrapping() {
        let elements = vec![HtmlElement::Paragraph("Hello world".to_string())];
        let result = render_elements(&elements);
        assert_eq!(result, "<p>Hello world</p>");
    }
}
1016}