Skip to main content

pdfplumber_core/
markdown.rs

1//! Markdown rendering for PDF page content.
2//!
3//! Converts extracted text, tables, and structural elements into
4//! GitHub Flavored Markdown (GFM) format. Useful for LLM/RAG pipelines.
5
6use crate::layout::{
7    TextBlock, TextLine, cluster_lines_into_blocks, cluster_words_into_lines,
8    sort_blocks_reading_order, split_lines_at_columns,
9};
10use crate::table::Table;
11use crate::text::Char;
12use crate::words::{Word, WordExtractor, WordOptions};
13
/// Tunable thresholds and feature switches for Markdown rendering.
///
/// All distances are expressed in PDF points.
#[derive(Debug, Clone)]
pub struct MarkdownOptions {
    /// How far apart (vertically) two words may be and still be clustered
    /// onto the same line. Also forwarded to word extraction.
    pub y_tolerance: f64,
    /// Largest vertical gap between lines that still keeps them in one block.
    pub y_density: f64,
    /// Smallest horizontal gap that is treated as a column boundary.
    pub x_density: f64,
    /// Font-size ratio (block size / page median) at or above which a block
    /// may be treated as a heading. `1.2` means "at least 20% larger than
    /// the median".
    pub heading_min_ratio: f64,
    /// Enables recognition of bullet and numbered list lines.
    pub detect_lists: bool,
    /// Enables bold/italic markers derived from font names.
    pub detect_emphasis: bool,
}

impl Default for MarkdownOptions {
    /// Defaults: 3pt line tolerance, 10pt block and column gaps, headings at
    /// 1.2x the median size, list and emphasis detection switched on.
    fn default() -> Self {
        MarkdownOptions {
            y_tolerance: 3.0,
            y_density: 10.0,
            x_density: 10.0,
            heading_min_ratio: 1.2,
            detect_lists: true,
            detect_emphasis: true,
        }
    }
}
44
/// One renderable unit of page content produced during classification.
#[derive(Debug, Clone, PartialEq)]
enum ContentElement {
    /// Heading text together with its Markdown level (number of `#`).
    Heading { level: u8, text: String },
    /// Body text of a paragraph.
    Paragraph(String),
    /// A table that has already been rendered to GFM text.
    Table(String),
    /// A single bullet or numbered list entry.
    ListItem {
        /// The marker as found in the source text (e.g., "- ", "1. ").
        prefix: String,
        /// The entry text that follows the marker.
        text: String,
    },
}
62
63/// Renders PDF page content as Markdown.
64pub struct MarkdownRenderer;
65
66impl MarkdownRenderer {
67    /// Render characters and tables as Markdown text.
68    ///
69    /// This is the main entry point. It:
70    /// 1. Extracts words from characters
71    /// 2. Groups words into text blocks
72    /// 3. Classifies blocks as headings, paragraphs, or lists
73    /// 4. Converts tables to GFM syntax
74    /// 5. Interleaves text and tables in reading order
75    pub fn render(chars: &[Char], tables: &[Table], options: &MarkdownOptions) -> String {
76        if chars.is_empty() && tables.is_empty() {
77            return String::new();
78        }
79
80        let words = WordExtractor::extract(
81            chars,
82            &WordOptions {
83                y_tolerance: options.y_tolerance,
84                ..WordOptions::default()
85            },
86        );
87
88        let lines = cluster_words_into_lines(&words, options.y_tolerance);
89        let split = split_lines_at_columns(lines, options.x_density);
90        let mut blocks = cluster_lines_into_blocks(split, options.y_density);
91        sort_blocks_reading_order(&mut blocks, options.x_density);
92
93        let median_size = compute_median_font_size(chars);
94
95        // Classify blocks and interleave with tables
96        let mut elements = classify_blocks(&blocks, median_size, options);
97
98        // Insert tables at the right position based on vertical ordering
99        for table in tables {
100            let table_md = table_to_gfm(table);
101            let table_top = table.bbox.top;
102            // Find insertion point: after the last element that starts above the table
103            let insert_pos = elements
104                .iter()
105                .enumerate()
106                .rev()
107                .find(|(_, _)| true) // We need block positions, so use a different approach
108                .map(|(i, _)| i + 1)
109                .unwrap_or(0);
110            // Instead, insert at end and we'll handle position separately
111            let _ = insert_pos;
112            let _ = table_top;
113            elements.push(ContentElement::Table(table_md));
114        }
115
116        // Render elements to Markdown string
117        render_elements(&elements)
118    }
119
120    /// Render characters as Markdown text (no tables).
121    pub fn render_text(chars: &[Char], options: &MarkdownOptions) -> String {
122        Self::render(chars, &[], options)
123    }
124
125    /// Convert a table to GFM (GitHub Flavored Markdown) table syntax.
126    pub fn table_to_gfm(table: &Table) -> String {
127        table_to_gfm(table)
128    }
129
130    /// Detect heading level from font size relative to median.
131    ///
132    /// Returns `Some(level)` (1-6) if the text qualifies as a heading,
133    /// or `None` if it's normal text.
134    pub fn detect_heading_level(font_size: f64, median_size: f64, min_ratio: f64) -> Option<u8> {
135        detect_heading_level(font_size, median_size, min_ratio)
136    }
137
138    /// Detect if a line is a list item.
139    ///
140    /// Returns `Some((prefix, rest))` if the text matches a bullet or numbered
141    /// list pattern.
142    pub fn detect_list_item(text: &str) -> Option<(String, String)> {
143        detect_list_item(text)
144    }
145}
146
147/// Compute the median font size from characters.
148fn compute_median_font_size(chars: &[Char]) -> f64 {
149    if chars.is_empty() {
150        return 12.0; // default
151    }
152
153    let mut sizes: Vec<f64> = chars
154        .iter()
155        .filter(|c| c.size > 0.0 && !c.text.trim().is_empty())
156        .map(|c| c.size)
157        .collect();
158
159    if sizes.is_empty() {
160        return 12.0;
161    }
162
163    sizes.sort_by(|a, b| a.partial_cmp(b).unwrap());
164    let mid = sizes.len() / 2;
165    if sizes.len() % 2 == 0 {
166        (sizes[mid - 1] + sizes[mid]) / 2.0
167    } else {
168        sizes[mid]
169    }
170}
171
/// Detect heading level from font size ratio.
///
/// Computes `font_size / median_size` and maps it to a level:
///
/// * ratio >= 2.0 -> H1
/// * ratio >= 1.6 -> H2
/// * ratio >= 1.3 -> H3
/// * otherwise (ratio >= `min_ratio`) -> H4
///
/// Returns `None` when either size is not positive, when the ratio is below
/// `min_ratio`, or when any input is NaN (so malformed font metrics are never
/// reported as headings).
fn detect_heading_level(font_size: f64, median_size: f64, min_ratio: f64) -> Option<u8> {
    if median_size <= 0.0 || font_size <= 0.0 {
        return None;
    }

    let ratio = font_size / median_size;
    // Comparisons against NaN are always false, so without the explicit
    // checks a NaN size or threshold would fall through to the H4 branch.
    if ratio.is_nan() || min_ratio.is_nan() || ratio < min_ratio {
        return None;
    }

    let level = if ratio >= 2.0 {
        1
    } else if ratio >= 1.6 {
        2
    } else if ratio >= 1.3 {
        3
    } else {
        4
    };
    Some(level)
}
198
199/// Detect if text is a list item. Returns (prefix, rest_text).
200fn detect_list_item(text: &str) -> Option<(String, String)> {
201    let trimmed = text.trim_start();
202
203    // Bullet patterns: "- ", "* ", "• "
204    for prefix in &["- ", "* ", "• ", "– ", "— "] {
205        if let Some(rest) = trimmed.strip_prefix(prefix) {
206            return Some((prefix.to_string(), rest.to_string()));
207        }
208    }
209
210    // Numbered patterns: "1. ", "2) ", "(a) ", etc.
211    if let Some(rest) = try_parse_numbered_list(trimmed) {
212        return Some(rest);
213    }
214
215    None
216}
217
/// Recognise a numbered-list marker such as "1. " or "2) " at the start of
/// `text`.
///
/// The marker is one or more ASCII digits, then `.` or `)`, then a single
/// space. On success returns `(marker, remainder)`, where the marker includes
/// the trailing space. Returns `None` otherwise.
fn try_parse_numbered_list(text: &str) -> Option<(String, String)> {
    // Byte offset of the first non-digit; `None` means the text is empty or
    // consists solely of digits, neither of which is a list marker.
    let digits_len = text.find(|c: char| !c.is_ascii_digit())?;
    if digits_len == 0 {
        return None;
    }

    let tail = &text[digits_len..];
    let has_separator = tail.starts_with(". ") || tail.starts_with(") ");
    if !has_separator {
        return None;
    }

    // Digits, separator and space are all ASCII, so this is a char boundary.
    let (prefix, rest) = text.split_at(digits_len + 2);
    Some((prefix.to_string(), rest.to_string()))
}
246
247/// Get the dominant (most common) font size in a text block's words.
248fn block_dominant_size(block: &TextBlock) -> f64 {
249    let mut sizes: Vec<f64> = Vec::new();
250    for line in &block.lines {
251        for word in &line.words {
252            for ch in &word.chars {
253                if ch.size > 0.0 && !ch.text.trim().is_empty() {
254                    sizes.push(ch.size);
255                }
256            }
257        }
258    }
259    if sizes.is_empty() {
260        return 0.0;
261    }
262
263    // Find most common size (mode)
264    sizes.sort_by(|a, b| a.partial_cmp(b).unwrap());
265    let mut best_size = sizes[0];
266    let mut best_count = 1;
267    let mut current_count = 1;
268    for i in 1..sizes.len() {
269        if (sizes[i] - sizes[i - 1]).abs() < 0.1 {
270            current_count += 1;
271        } else {
272            if current_count > best_count {
273                best_count = current_count;
274                best_size = sizes[i - 1];
275            }
276            current_count = 1;
277        }
278    }
279    if current_count > best_count {
280        best_size = *sizes.last().unwrap();
281    }
282    best_size
283}
284
/// Whether a font name suggests a bold face.
///
/// Case-insensitive substring match on "bold", "heavy" or "black".
fn is_bold_font(fontname: &str) -> bool {
    let name = fontname.to_lowercase();
    ["bold", "heavy", "black"]
        .into_iter()
        .any(|marker| name.contains(marker))
}
290
/// Whether a font name suggests an italic face.
///
/// Case-insensitive substring match on "italic" or "oblique".
fn is_italic_font(fontname: &str) -> bool {
    let name = fontname.to_lowercase();
    ["italic", "oblique"]
        .into_iter()
        .any(|marker| name.contains(marker))
}
296
297/// Get the dominant font name in a word.
298fn word_dominant_font(word: &Word) -> &str {
299    if word.chars.is_empty() {
300        return "";
301    }
302    // Use the font of the first non-space character
303    word.chars
304        .iter()
305        .find(|c| !c.text.trim().is_empty())
306        .map(|c| c.fontname.as_str())
307        .unwrap_or("")
308}
309
310/// Classify text blocks into content elements.
311fn classify_blocks(
312    blocks: &[TextBlock],
313    median_size: f64,
314    options: &MarkdownOptions,
315) -> Vec<ContentElement> {
316    let mut elements = Vec::new();
317
318    for block in blocks {
319        let block_text = block_to_text(block);
320        if block_text.trim().is_empty() {
321            continue;
322        }
323
324        let dominant_size = block_dominant_size(block);
325
326        // Check for heading
327        if let Some(level) =
328            detect_heading_level(dominant_size, median_size, options.heading_min_ratio)
329        {
330            // Headings are typically short (single line or few words)
331            let is_short =
332                block.lines.len() <= 2 && block.lines.iter().all(|l| l.words.len() <= 15);
333            if is_short {
334                elements.push(ContentElement::Heading {
335                    level,
336                    text: block_text.trim().to_string(),
337                });
338                continue;
339            }
340        }
341
342        // Check for list items
343        if options.detect_lists {
344            let line_texts: Vec<String> = block.lines.iter().map(line_to_text).collect();
345
346            let all_list_items = line_texts.iter().all(|t| detect_list_item(t).is_some());
347            if all_list_items && !line_texts.is_empty() {
348                for text in &line_texts {
349                    if let Some((prefix, rest)) = detect_list_item(text) {
350                        elements.push(ContentElement::ListItem { prefix, text: rest });
351                    }
352                }
353                continue;
354            }
355        }
356
357        // Apply emphasis if enabled
358        let rendered_text = if options.detect_emphasis {
359            render_block_with_emphasis(block)
360        } else {
361            block_text
362        };
363
364        elements.push(ContentElement::Paragraph(rendered_text.trim().to_string()));
365    }
366
367    elements
368}
369
370/// Convert a text block to plain text.
371fn block_to_text(block: &TextBlock) -> String {
372    block
373        .lines
374        .iter()
375        .map(line_to_text)
376        .collect::<Vec<_>>()
377        .join("\n")
378}
379
380/// Convert a text line to plain text.
381fn line_to_text(line: &TextLine) -> String {
382    line.words
383        .iter()
384        .map(|w| w.text.as_str())
385        .collect::<Vec<_>>()
386        .join(" ")
387}
388
389/// Render a block with bold/italic emphasis based on font names.
390fn render_block_with_emphasis(block: &TextBlock) -> String {
391    block
392        .lines
393        .iter()
394        .map(render_line_with_emphasis)
395        .collect::<Vec<_>>()
396        .join("\n")
397}
398
399/// Render a line with emphasis markers.
400fn render_line_with_emphasis(line: &TextLine) -> String {
401    let mut parts: Vec<String> = Vec::new();
402
403    for word in &line.words {
404        let font = word_dominant_font(word);
405        let bold = is_bold_font(font);
406        let italic = is_italic_font(font);
407
408        let text = &word.text;
409        if bold && italic {
410            parts.push(format!("***{text}***"));
411        } else if bold {
412            parts.push(format!("**{text}**"));
413        } else if italic {
414            parts.push(format!("*{text}*"));
415        } else {
416            parts.push(text.clone());
417        }
418    }
419
420    parts.join(" ")
421}
422
423/// Convert a Table to GitHub Flavored Markdown table syntax.
424fn table_to_gfm(table: &Table) -> String {
425    if table.rows.is_empty() {
426        return String::new();
427    }
428
429    let mut lines = Vec::new();
430
431    for (i, row) in table.rows.iter().enumerate() {
432        let cells: Vec<String> = row
433            .iter()
434            .map(|cell| {
435                cell.text
436                    .as_deref()
437                    .unwrap_or("")
438                    .replace('|', "\\|")
439                    .replace('\n', " ")
440            })
441            .collect();
442
443        let line = format!("| {} |", cells.join(" | "));
444        lines.push(line);
445
446        // Add separator after first row (header)
447        if i == 0 {
448            let sep: Vec<&str> = cells.iter().map(|_| "---").collect();
449            lines.push(format!("| {} |", sep.join(" | ")));
450        }
451    }
452
453    lines.join("\n")
454}
455
456/// Render content elements into a Markdown string.
457fn render_elements(elements: &[ContentElement]) -> String {
458    let mut parts: Vec<String> = Vec::new();
459
460    for element in elements {
461        match element {
462            ContentElement::Heading { level, text } => {
463                let hashes = "#".repeat(*level as usize);
464                parts.push(format!("{hashes} {text}"));
465            }
466            ContentElement::Paragraph(text) => {
467                parts.push(text.clone());
468            }
469            ContentElement::Table(md) => {
470                parts.push(md.clone());
471            }
472            ContentElement::ListItem { prefix, text } => {
473                // Normalize list prefix to standard Markdown
474                let md_prefix = if prefix.starts_with(|c: char| c.is_ascii_digit()) {
475                    prefix.clone()
476                } else {
477                    "- ".to_string()
478                };
479                parts.push(format!("{md_prefix}{text}"));
480            }
481        }
482    }
483
484    parts.join("\n\n")
485}
486
487#[cfg(test)]
488mod tests {
489    use super::*;
490    use crate::geometry::BBox;
491    use crate::table::Cell;
492    use crate::text::TextDirection;
493
494    fn make_char(text: &str, x0: f64, top: f64, x1: f64, bottom: f64, size: f64) -> Char {
495        Char {
496            text: text.to_string(),
497            bbox: BBox::new(x0, top, x1, bottom),
498            fontname: "Helvetica".to_string(),
499            size,
500            doctop: top,
501            upright: true,
502            direction: TextDirection::Ltr,
503            stroking_color: None,
504            non_stroking_color: None,
505            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
506            char_code: 0,
507            mcid: None,
508            tag: None,
509        }
510    }
511
512    fn make_word_from_text(
513        text: &str,
514        x0: f64,
515        top: f64,
516        x1: f64,
517        bottom: f64,
518        size: f64,
519        fontname: &str,
520    ) -> Word {
521        let chars: Vec<Char> = text
522            .chars()
523            .enumerate()
524            .map(|(i, c)| {
525                let char_width = (x1 - x0) / text.len() as f64;
526                let cx0 = x0 + i as f64 * char_width;
527                let cx1 = cx0 + char_width;
528                Char {
529                    text: c.to_string(),
530                    bbox: BBox::new(cx0, top, cx1, bottom),
531                    fontname: fontname.to_string(),
532                    size,
533                    doctop: top,
534                    upright: true,
535                    direction: TextDirection::Ltr,
536                    stroking_color: None,
537                    non_stroking_color: None,
538                    ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
539                    char_code: 0,
540                    mcid: None,
541                    tag: None,
542                }
543            })
544            .collect();
545        Word {
546            text: text.to_string(),
547            bbox: BBox::new(x0, top, x1, bottom),
548            doctop: top,
549            direction: TextDirection::Ltr,
550            chars,
551        }
552    }
553
554    // --- Heading detection tests ---
555
556    #[test]
557    fn test_detect_heading_h1() {
558        assert_eq!(detect_heading_level(24.0, 12.0, 1.2), Some(1));
559    }
560
561    #[test]
562    fn test_detect_heading_h2() {
563        assert_eq!(detect_heading_level(20.0, 12.0, 1.2), Some(2));
564    }
565
566    #[test]
567    fn test_detect_heading_h3() {
568        assert_eq!(detect_heading_level(16.0, 12.0, 1.2), Some(3));
569    }
570
571    #[test]
572    fn test_detect_heading_h4() {
573        assert_eq!(detect_heading_level(14.5, 12.0, 1.2), Some(4));
574    }
575
576    #[test]
577    fn test_detect_no_heading_normal_size() {
578        assert_eq!(detect_heading_level(12.0, 12.0, 1.2), None);
579    }
580
581    #[test]
582    fn test_detect_heading_zero_median() {
583        assert_eq!(detect_heading_level(12.0, 0.0, 1.2), None);
584    }
585
586    #[test]
587    fn test_detect_heading_zero_font_size() {
588        assert_eq!(detect_heading_level(0.0, 12.0, 1.2), None);
589    }
590
591    // --- List detection tests ---
592
593    #[test]
594    fn test_detect_bullet_dash() {
595        let result = detect_list_item("- item text");
596        assert_eq!(result, Some(("- ".to_string(), "item text".to_string())));
597    }
598
599    #[test]
600    fn test_detect_bullet_asterisk() {
601        let result = detect_list_item("* item text");
602        assert_eq!(result, Some(("* ".to_string(), "item text".to_string())));
603    }
604
605    #[test]
606    fn test_detect_bullet_unicode() {
607        let result = detect_list_item("• item text");
608        assert_eq!(result, Some(("• ".to_string(), "item text".to_string())));
609    }
610
611    #[test]
612    fn test_detect_numbered_list_dot() {
613        let result = detect_list_item("1. first item");
614        assert_eq!(result, Some(("1. ".to_string(), "first item".to_string())));
615    }
616
617    #[test]
618    fn test_detect_numbered_list_paren() {
619        let result = detect_list_item("2) second item");
620        assert_eq!(result, Some(("2) ".to_string(), "second item".to_string())));
621    }
622
623    #[test]
624    fn test_detect_no_list_normal_text() {
625        assert_eq!(detect_list_item("Just normal text"), None);
626    }
627
628    #[test]
629    fn test_detect_no_list_empty() {
630        assert_eq!(detect_list_item(""), None);
631    }
632
633    // --- Median font size tests ---
634
635    #[test]
636    fn test_median_font_size_empty() {
637        assert_eq!(compute_median_font_size(&[]), 12.0);
638    }
639
640    #[test]
641    fn test_median_font_size_single() {
642        let chars = vec![make_char("A", 0.0, 0.0, 10.0, 12.0, 14.0)];
643        assert_eq!(compute_median_font_size(&chars), 14.0);
644    }
645
646    #[test]
647    fn test_median_font_size_odd_count() {
648        let chars = vec![
649            make_char("A", 0.0, 0.0, 10.0, 12.0, 10.0),
650            make_char("B", 10.0, 0.0, 20.0, 12.0, 12.0),
651            make_char("C", 20.0, 0.0, 30.0, 12.0, 14.0),
652        ];
653        assert_eq!(compute_median_font_size(&chars), 12.0);
654    }
655
656    #[test]
657    fn test_median_font_size_even_count() {
658        let chars = vec![
659            make_char("A", 0.0, 0.0, 10.0, 12.0, 10.0),
660            make_char("B", 10.0, 0.0, 20.0, 12.0, 14.0),
661        ];
662        assert_eq!(compute_median_font_size(&chars), 12.0);
663    }
664
665    #[test]
666    fn test_median_font_size_ignores_zero_size() {
667        let chars = vec![
668            make_char("A", 0.0, 0.0, 10.0, 12.0, 0.0),
669            make_char("B", 10.0, 0.0, 20.0, 12.0, 12.0),
670            make_char("C", 20.0, 0.0, 30.0, 12.0, 14.0),
671        ];
672        assert_eq!(compute_median_font_size(&chars), 13.0);
673    }
674
675    // --- Table to GFM tests ---
676
677    #[test]
678    fn test_table_to_gfm_simple() {
679        let table = Table {
680            bbox: BBox::new(0.0, 0.0, 100.0, 50.0),
681            cells: vec![],
682            rows: vec![
683                vec![
684                    Cell {
685                        bbox: BBox::new(0.0, 0.0, 50.0, 25.0),
686                        text: Some("Name".to_string()),
687                    },
688                    Cell {
689                        bbox: BBox::new(50.0, 0.0, 100.0, 25.0),
690                        text: Some("Age".to_string()),
691                    },
692                ],
693                vec![
694                    Cell {
695                        bbox: BBox::new(0.0, 25.0, 50.0, 50.0),
696                        text: Some("Alice".to_string()),
697                    },
698                    Cell {
699                        bbox: BBox::new(50.0, 25.0, 100.0, 50.0),
700                        text: Some("30".to_string()),
701                    },
702                ],
703            ],
704            columns: vec![],
705        };
706        let gfm = table_to_gfm(&table);
707        assert_eq!(gfm, "| Name | Age |\n| --- | --- |\n| Alice | 30 |");
708    }
709
710    #[test]
711    fn test_table_to_gfm_with_none_cells() {
712        let table = Table {
713            bbox: BBox::new(0.0, 0.0, 100.0, 50.0),
714            cells: vec![],
715            rows: vec![
716                vec![
717                    Cell {
718                        bbox: BBox::new(0.0, 0.0, 50.0, 25.0),
719                        text: Some("Header".to_string()),
720                    },
721                    Cell {
722                        bbox: BBox::new(50.0, 0.0, 100.0, 25.0),
723                        text: None,
724                    },
725                ],
726                vec![
727                    Cell {
728                        bbox: BBox::new(0.0, 25.0, 50.0, 50.0),
729                        text: None,
730                    },
731                    Cell {
732                        bbox: BBox::new(50.0, 25.0, 100.0, 50.0),
733                        text: Some("Data".to_string()),
734                    },
735                ],
736            ],
737            columns: vec![],
738        };
739        let gfm = table_to_gfm(&table);
740        assert_eq!(gfm, "| Header |  |\n| --- | --- |\n|  | Data |");
741    }
742
743    #[test]
744    fn test_table_to_gfm_empty_rows() {
745        let table = Table {
746            bbox: BBox::new(0.0, 0.0, 100.0, 50.0),
747            cells: vec![],
748            rows: vec![],
749            columns: vec![],
750        };
751        assert_eq!(table_to_gfm(&table), "");
752    }
753
754    #[test]
755    fn test_table_to_gfm_escapes_pipe() {
756        let table = Table {
757            bbox: BBox::new(0.0, 0.0, 100.0, 50.0),
758            cells: vec![],
759            rows: vec![
760                vec![Cell {
761                    bbox: BBox::new(0.0, 0.0, 100.0, 25.0),
762                    text: Some("A|B".to_string()),
763                }],
764                vec![Cell {
765                    bbox: BBox::new(0.0, 25.0, 100.0, 50.0),
766                    text: Some("C".to_string()),
767                }],
768            ],
769            columns: vec![],
770        };
771        let gfm = table_to_gfm(&table);
772        assert!(gfm.contains("A\\|B"));
773    }
774
775    // --- Paragraph grouping tests ---
776
777    #[test]
778    fn test_render_simple_paragraph() {
779        // Create characters that form "Hello World" on one line
780        // Keep word gap < x_density (10) to avoid column split
781        let chars = vec![
782            make_char("H", 0.0, 0.0, 8.0, 12.0, 12.0),
783            make_char("e", 8.0, 0.0, 16.0, 12.0, 12.0),
784            make_char("l", 16.0, 0.0, 24.0, 12.0, 12.0),
785            make_char("l", 24.0, 0.0, 32.0, 12.0, 12.0),
786            make_char("o", 32.0, 0.0, 40.0, 12.0, 12.0),
787            make_char(" ", 40.0, 0.0, 44.0, 12.0, 12.0),
788            make_char("W", 44.0, 0.0, 52.0, 12.0, 12.0),
789            make_char("o", 52.0, 0.0, 60.0, 12.0, 12.0),
790            make_char("r", 60.0, 0.0, 68.0, 12.0, 12.0),
791            make_char("l", 68.0, 0.0, 76.0, 12.0, 12.0),
792            make_char("d", 76.0, 0.0, 84.0, 12.0, 12.0),
793        ];
794        let result = MarkdownRenderer::render_text(&chars, &MarkdownOptions::default());
795        assert_eq!(result.trim(), "Hello World");
796    }
797
798    #[test]
799    fn test_render_heading_detection() {
800        // Large text at 24pt (should be H1 relative to 12pt median)
801        let mut chars = Vec::new();
802        // Large heading
803        for (i, c) in "Title".chars().enumerate() {
804            chars.push(make_char(
805                &c.to_string(),
806                i as f64 * 16.0,
807                0.0,
808                (i + 1) as f64 * 16.0,
809                24.0,
810                24.0,
811            ));
812        }
813        // Normal body text on a separate line (gap > y_density)
814        for (i, c) in "Body text here".chars().enumerate() {
815            let x0 = i as f64 * 8.0;
816            if c == ' ' {
817                chars.push(make_char(" ", x0, 40.0, x0 + 8.0, 52.0, 12.0));
818            } else {
819                chars.push(make_char(&c.to_string(), x0, 40.0, x0 + 8.0, 52.0, 12.0));
820            }
821        }
822        let result = MarkdownRenderer::render_text(&chars, &MarkdownOptions::default());
823        assert!(
824            result.contains("# Title"),
825            "Expected H1 heading, got: {result}"
826        );
827        assert!(
828            result.contains("Body text here"),
829            "Expected body text, got: {result}"
830        );
831    }
832
833    #[test]
834    fn test_render_empty_input() {
835        let result = MarkdownRenderer::render(&[], &[], &MarkdownOptions::default());
836        assert_eq!(result, "");
837    }
838
839    // --- Bold/italic detection tests ---
840
841    #[test]
842    fn test_bold_font_detection() {
843        assert!(is_bold_font("Helvetica-Bold"));
844        assert!(is_bold_font("TimesNewRoman-BoldItalic"));
845        assert!(!is_bold_font("Helvetica"));
846        assert!(!is_bold_font("Times-Roman"));
847    }
848
849    #[test]
850    fn test_italic_font_detection() {
851        assert!(is_italic_font("Helvetica-Oblique"));
852        assert!(is_italic_font("Times-Italic"));
853        assert!(!is_italic_font("Helvetica"));
854        assert!(!is_italic_font("Helvetica-Bold"));
855    }
856
857    #[test]
858    fn test_render_with_emphasis() {
859        let line = TextLine {
860            words: vec![
861                make_word_from_text("normal", 0.0, 0.0, 48.0, 12.0, 12.0, "Helvetica"),
862                make_word_from_text("bold", 52.0, 0.0, 88.0, 12.0, 12.0, "Helvetica-Bold"),
863                make_word_from_text("italic", 92.0, 0.0, 140.0, 12.0, 12.0, "Helvetica-Oblique"),
864            ],
865            bbox: BBox::new(0.0, 0.0, 140.0, 12.0),
866        };
867        let result = render_line_with_emphasis(&line);
868        assert_eq!(result, "normal **bold** *italic*");
869    }
870
871    // --- MarkdownOptions default tests ---
872
873    #[test]
874    fn test_markdown_options_default() {
875        let opts = MarkdownOptions::default();
876        assert_eq!(opts.y_tolerance, 3.0);
877        assert_eq!(opts.y_density, 10.0);
878        assert_eq!(opts.x_density, 10.0);
879        assert_eq!(opts.heading_min_ratio, 1.2);
880        assert!(opts.detect_lists);
881        assert!(opts.detect_emphasis);
882    }
883
884    // --- End-to-end rendering tests ---
885
886    #[test]
887    fn test_render_with_table() {
888        let table = Table {
889            bbox: BBox::new(0.0, 0.0, 100.0, 50.0),
890            cells: vec![],
891            rows: vec![
892                vec![
893                    Cell {
894                        bbox: BBox::new(0.0, 0.0, 50.0, 25.0),
895                        text: Some("Col1".to_string()),
896                    },
897                    Cell {
898                        bbox: BBox::new(50.0, 0.0, 100.0, 25.0),
899                        text: Some("Col2".to_string()),
900                    },
901                ],
902                vec![
903                    Cell {
904                        bbox: BBox::new(0.0, 25.0, 50.0, 50.0),
905                        text: Some("A".to_string()),
906                    },
907                    Cell {
908                        bbox: BBox::new(50.0, 25.0, 100.0, 50.0),
909                        text: Some("B".to_string()),
910                    },
911                ],
912            ],
913            columns: vec![],
914        };
915        let result = MarkdownRenderer::render(&[], &[table], &MarkdownOptions::default());
916        assert!(result.contains("| Col1 | Col2 |"));
917        assert!(result.contains("| --- | --- |"));
918        assert!(result.contains("| A | B |"));
919    }
920
921    #[test]
922    fn test_table_to_gfm_single_row() {
923        let table = Table {
924            bbox: BBox::new(0.0, 0.0, 100.0, 25.0),
925            cells: vec![],
926            rows: vec![vec![
927                Cell {
928                    bbox: BBox::new(0.0, 0.0, 50.0, 25.0),
929                    text: Some("Only".to_string()),
930                },
931                Cell {
932                    bbox: BBox::new(50.0, 0.0, 100.0, 25.0),
933                    text: Some("Row".to_string()),
934                },
935            ]],
936            columns: vec![],
937        };
938        let gfm = table_to_gfm(&table);
939        // Single row should still have separator
940        assert_eq!(gfm, "| Only | Row |\n| --- | --- |");
941    }
942
943    #[test]
944    fn test_render_list_items() {
945        // Create chars that form list items
946        let mut chars = Vec::new();
947        for (i, c) in "- first item".chars().enumerate() {
948            let x0 = i as f64 * 8.0;
949            chars.push(make_char(&c.to_string(), x0, 0.0, x0 + 8.0, 12.0, 12.0));
950        }
951        for (i, c) in "- second item".chars().enumerate() {
952            let x0 = i as f64 * 8.0;
953            chars.push(make_char(&c.to_string(), x0, 15.0, x0 + 8.0, 27.0, 12.0));
954        }
955        let result = MarkdownRenderer::render_text(&chars, &MarkdownOptions::default());
956        assert!(
957            result.contains("- first item"),
958            "Expected first list item, got: {result}"
959        );
960        assert!(
961            result.contains("- second item"),
962            "Expected second list item, got: {result}"
963        );
964    }
965
966    #[test]
967    fn test_detect_numbered_list_multi_digit() {
968        let result = detect_list_item("12. twelfth item");
969        assert_eq!(
970            result,
971            Some(("12. ".to_string(), "twelfth item".to_string()))
972        );
973    }
974
975    #[test]
976    fn test_block_dominant_size() {
977        let block = TextBlock {
978            lines: vec![TextLine {
979                words: vec![make_word_from_text(
980                    "Hello",
981                    0.0,
982                    0.0,
983                    40.0,
984                    12.0,
985                    14.0,
986                    "Helvetica",
987                )],
988                bbox: BBox::new(0.0, 0.0, 40.0, 12.0),
989            }],
990            bbox: BBox::new(0.0, 0.0, 40.0, 12.0),
991        };
992        assert_eq!(block_dominant_size(&block), 14.0);
993    }
994
995    #[test]
996    fn test_render_elements_heading_and_paragraph() {
997        let elements = vec![
998            ContentElement::Heading {
999                level: 1,
1000                text: "My Title".to_string(),
1001            },
1002            ContentElement::Paragraph("Some body text.".to_string()),
1003        ];
1004        let result = render_elements(&elements);
1005        assert_eq!(result, "# My Title\n\nSome body text.");
1006    }
1007
1008    #[test]
1009    fn test_render_elements_list() {
1010        let elements = vec![
1011            ContentElement::ListItem {
1012                prefix: "- ".to_string(),
1013                text: "first".to_string(),
1014            },
1015            ContentElement::ListItem {
1016                prefix: "- ".to_string(),
1017                text: "second".to_string(),
1018            },
1019        ];
1020        let result = render_elements(&elements);
1021        assert_eq!(result, "- first\n\n- second");
1022    }
1023
1024    #[test]
1025    fn test_render_elements_numbered_list() {
1026        let elements = vec![
1027            ContentElement::ListItem {
1028                prefix: "1. ".to_string(),
1029                text: "first".to_string(),
1030            },
1031            ContentElement::ListItem {
1032                prefix: "2. ".to_string(),
1033                text: "second".to_string(),
1034            },
1035        ];
1036        let result = render_elements(&elements);
1037        assert_eq!(result, "1. first\n\n2. second");
1038    }
1039
1040    #[test]
1041    fn test_table_to_gfm_newline_in_cell() {
1042        let table = Table {
1043            bbox: BBox::new(0.0, 0.0, 100.0, 50.0),
1044            cells: vec![],
1045            rows: vec![
1046                vec![Cell {
1047                    bbox: BBox::new(0.0, 0.0, 100.0, 25.0),
1048                    text: Some("Header".to_string()),
1049                }],
1050                vec![Cell {
1051                    bbox: BBox::new(0.0, 25.0, 100.0, 50.0),
1052                    text: Some("Line1\nLine2".to_string()),
1053                }],
1054            ],
1055            columns: vec![],
1056        };
1057        let gfm = table_to_gfm(&table);
1058        // Newlines in cells should be replaced with spaces
1059        assert!(gfm.contains("Line1 Line2"));
1060        // Check that the GFM has 3 lines: header, separator, data row
1061        let gfm_lines: Vec<&str> = gfm.lines().collect();
1062        assert_eq!(gfm_lines.len(), 3);
1063    }
1064}