Skip to main content

pivot_pdf/
textflow.rs

1use std::collections::BTreeSet;
2
3use crate::document::format_coord;
4use crate::fonts::{BuiltinFont, FontMetrics, FontRef};
5use crate::truetype::TrueTypeFont;
6use crate::writer::escape_pdf_string;
7
/// Strategy applied to a word that is wider than the box it is laid out in.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum WordBreak {
    /// Split the word at a character boundary. This is the default.
    #[default]
    BreakAll,
    /// Split the word at a character boundary and append a hyphen to the
    /// pieces that are continued.
    Hyphenate,
    /// Never split a word; a word that is too wide overflows the box.
    Normal,
}
19
/// Outcome of one attempt to place text inside a bounding box.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum FitResult {
    /// Every piece of text has been placed; nothing remains.
    Stop,
    /// The box has been filled, and more text is still waiting.
    BoxFull,
    /// The box is too small for any text to be placed in it.
    BoxEmpty,
}
30
/// Rectangle that text is placed into.
///
/// The point `(x, y)` is the rectangle's upper-left corner and text flows
/// from the top downwards. Every field is measured in PDF points.
#[derive(Clone, Copy, Debug)]
pub struct Rect {
    /// Left edge, in PDF points (origin at top-left of the layout area).
    pub x: f64,
    /// Top edge, in PDF points (origin at top-left of the layout area).
    pub y: f64,
    /// Horizontal extent, in PDF points.
    pub width: f64,
    /// Vertical extent, in PDF points.
    pub height: f64,
}
46
47/// Tracks which fonts were actually used during content generation.
48#[derive(Debug, Default)]
49pub struct UsedFonts {
50    /// Set of built-in fonts used.
51    pub builtin: BTreeSet<BuiltinFont>,
52    /// Indices into the document's TrueType font list for fonts used.
53    pub truetype: BTreeSet<usize>,
54}
55
56/// Text styling options.
57#[derive(Debug, Clone)]
58pub struct TextStyle {
59    /// Font to use for this style.
60    pub font: FontRef,
61    /// Font size in PDF points.
62    pub font_size: f64,
63}
64
65impl Default for TextStyle {
66    fn default() -> Self {
67        TextStyle {
68            font: FontRef::Builtin(BuiltinFont::Helvetica),
69            font_size: 12.0,
70        }
71    }
72}
73
74impl TextStyle {
75    /// Convenience constructor for builtin fonts.
76    pub fn builtin(font: BuiltinFont, font_size: f64) -> Self {
77        TextStyle {
78            font: FontRef::Builtin(font),
79            font_size,
80        }
81    }
82}
83
84/// A span of text with associated style.
85#[derive(Debug, Clone)]
86struct TextSpan {
87    text: String,
88    style: TextStyle,
89}
90
91/// A word extracted from spans, carrying its style and whether
92/// it is preceded by a space.
93#[derive(Debug, Clone)]
94struct Word {
95    text: String,
96    style: TextStyle,
97    leading_space: bool,
98}
99
100/// A TextFlow manages styled text and flows it into bounding boxes
101/// across one or more pages.
102#[derive(Debug)]
103pub struct TextFlow {
104    spans: Vec<TextSpan>,
105    /// Current position into the word list (for multi-page flow).
106    cursor: usize,
107    /// How to handle words wider than the bounding box.
108    pub word_break: WordBreak,
109}
110
111impl TextFlow {
112    /// Create an empty text flow with default word-break settings.
113    pub fn new() -> Self {
114        TextFlow {
115            spans: Vec::new(),
116            cursor: 0,
117            word_break: WordBreak::BreakAll,
118        }
119    }
120
121    /// Add styled text to the flow.
122    pub fn add_text(&mut self, text: &str, style: &TextStyle) {
123        self.spans.push(TextSpan {
124            text: text.to_string(),
125            style: style.clone(),
126        });
127    }
128
129    /// Returns true if all text has been consumed.
130    pub fn is_finished(&self) -> bool {
131        let words = self.extract_words();
132        self.cursor >= words.len()
133    }
134
135    /// Extract all words from spans, splitting on whitespace and
136    /// preserving newlines as separate entries.
137    fn extract_words(&self) -> Vec<Word> {
138        let mut words = Vec::new();
139        let mut had_space = false;
140        for span in &self.spans {
141            let mut chars = span.text.chars().peekable();
142
143            while chars.peek().is_some() {
144                // Consume leading spaces
145                while chars.peek() == Some(&' ') {
146                    had_space = true;
147                    chars.next();
148                }
149
150                if chars.peek() == Some(&'\n') {
151                    chars.next();
152                    words.push(Word {
153                        text: "\n".to_string(),
154                        style: span.style.clone(),
155                        leading_space: false,
156                    });
157                    had_space = false;
158                    continue;
159                }
160
161                // Collect word characters
162                let mut word = String::new();
163                while let Some(&ch) = chars.peek() {
164                    if ch == ' ' || ch == '\n' {
165                        break;
166                    }
167                    word.push(ch);
168                    chars.next();
169                }
170
171                if !word.is_empty() {
172                    words.push(Word {
173                        text: word,
174                        style: span.style.clone(),
175                        leading_space: had_space && !words.is_empty(),
176                    });
177                    had_space = false;
178                }
179            }
180        }
181        words
182    }
183
184    /// Generate PDF content stream operations that fit within
185    /// the given rectangle. Returns the content bytes, a
186    /// FitResult, and the fonts actually used.
187    ///
188    /// **Multi-page stability:** when `word_break` is not `Normal`, the word
189    /// list is pre-processed by `break_wide_words` before layout. That
190    /// function is deterministic for a given `rect.width`, so the internal
191    /// cursor index remains valid across successive calls — provided the
192    /// caller supplies the same `rect.width` every time for a given flow.
193    pub fn generate_content_ops(
194        &mut self,
195        rect: &Rect,
196        tt_fonts: &mut [TrueTypeFont],
197    ) -> (Vec<u8>, FitResult, UsedFonts) {
198        let empty = UsedFonts::default();
199        let raw_words = self.extract_words();
200        let words = if self.word_break != WordBreak::Normal {
201            break_wide_words(raw_words, rect.width, self.word_break, tt_fonts)
202        } else {
203            raw_words
204        };
205        if self.cursor >= words.len() {
206            return (Vec::new(), FitResult::Stop, empty);
207        }
208
209        let mut output = Vec::new();
210        let mut used = UsedFonts::default();
211        let first_word = &words[self.cursor];
212        let first_line_height = line_height_for(&first_word.style, tt_fonts);
213
214        // Check if even one line fits vertically
215        if first_line_height > rect.height {
216            return (Vec::new(), FitResult::BoxEmpty, empty);
217        }
218
219        output.extend_from_slice(b"BT\n");
220
221        // First baseline: top of rect minus ascent (approximated
222        // as font_size since line_height ~ font_size * 1.2).
223        let first_baseline_y = rect.y - first_word.style.font_size;
224        let mut current_y = first_baseline_y;
225        let mut is_first_line = true;
226        let mut any_text_placed = false;
227
228        // Track current font state in the content stream
229        let mut active_font: Option<FontRef> = None;
230        let mut active_size: Option<f64> = None;
231
232        while self.cursor < words.len() {
233            let line_height = line_height_for(&words[self.cursor].style, tt_fonts);
234
235            if !is_first_line {
236                let next_y = current_y - line_height;
237                let bottom = rect.y - rect.height;
238                if next_y < bottom {
239                    output.extend_from_slice(b"ET\n");
240                    return (output, FitResult::BoxFull, used);
241                }
242            }
243
244            // Collect words that fit on this line
245            let line_start = self.cursor;
246            let mut line_width: f64 = 0.0;
247            let mut line_end = self.cursor;
248
249            while line_end < words.len() {
250                let word = &words[line_end];
251
252                if word.text == "\n" {
253                    line_end += 1;
254                    break;
255                }
256
257                let word_width = measure_word(&word.text, &word.style, tt_fonts);
258                let space_width = if word.leading_space {
259                    measure_word(" ", &word.style, tt_fonts)
260                } else {
261                    0.0
262                };
263
264                let total = line_width + space_width + word_width;
265                if total > rect.width && line_end > line_start {
266                    break;
267                }
268                if total > rect.width && line_end == line_start {
269                    if !any_text_placed {
270                        output.extend_from_slice(b"ET\n");
271                        return (Vec::new(), FitResult::BoxEmpty, UsedFonts::default());
272                    }
273                    line_end += 1;
274                    break;
275                }
276
277                line_width = total;
278                line_end += 1;
279            }
280
281            if line_end == line_start {
282                break;
283            }
284
285            // Emit line positioning
286            if is_first_line {
287                output.extend_from_slice(
288                    format!(
289                        "{} {} Td\n",
290                        format_coord(rect.x),
291                        format_coord(first_baseline_y),
292                    )
293                    .as_bytes(),
294                );
295                is_first_line = false;
296            } else {
297                output.extend_from_slice(
298                    format!("0 {} Td\n", format_coord(-line_height),).as_bytes(),
299                );
300                current_y -= line_height;
301            }
302
303            // Emit words for this line
304            for i in line_start..line_end {
305                let word = &words[i];
306                if word.text == "\n" {
307                    continue;
308                }
309                let font_ref = word.style.font;
310                let font_size = word.style.font_size;
311
312                // Set font if changed
313                if active_font != Some(font_ref) || active_size != Some(font_size) {
314                    let name = pdf_font_name(font_ref, tt_fonts);
315                    output.extend_from_slice(
316                        format!("/{} {} Tf\n", name, format_coord(font_size),).as_bytes(),
317                    );
318                    active_font = Some(font_ref);
319                    active_size = Some(font_size);
320                    record_font(&font_ref, &mut used);
321                }
322
323                let is_first_on_line = i == line_start;
324                let display_text = if word.leading_space && !is_first_on_line {
325                    format!(" {}", word.text)
326                } else {
327                    word.text.clone()
328                };
329
330                emit_text(&display_text, font_ref, tt_fonts, &mut output);
331            }
332
333            any_text_placed = true;
334            self.cursor = line_end;
335        }
336
337        output.extend_from_slice(b"ET\n");
338
339        let result = if self.cursor >= words.len() {
340            FitResult::Stop
341        } else {
342            FitResult::BoxFull
343        };
344        (output, result, used)
345    }
346}
347
348/// Split any word wider than `max_width` into character-boundary pieces.
349///
350/// Words that fit are left unchanged. Words that exceed `max_width` are split
351/// via `break_word` and re-assembled as `Word` structs that carry the
352/// original style and leading-space flag.
353///
354/// Because `extract_words` always produces the same vector for the same spans,
355/// this function is also deterministic — the cursor index stays valid across
356/// multiple `generate_content_ops` calls (i.e. across page breaks).
357fn break_wide_words(
358    words: Vec<Word>,
359    max_width: f64,
360    mode: WordBreak,
361    tt_fonts: &[TrueTypeFont],
362) -> Vec<Word> {
363    let mut result: Vec<Word> = Vec::with_capacity(words.len());
364
365    for word in words {
366        if word.text == "\n" {
367            result.push(word);
368            continue;
369        }
370
371        let word_width = measure_word(&word.text, &word.style, tt_fonts);
372        if word_width <= max_width {
373            result.push(word);
374            continue;
375        }
376
377        let ts = TextStyle {
378            font: word.style.font,
379            font_size: word.style.font_size,
380        };
381        let pieces = break_word(&word.text, max_width, &ts, mode, tt_fonts);
382        let leading_space = word.leading_space;
383
384        for (i, piece) in pieces.into_iter().enumerate() {
385            result.push(Word {
386                text: piece,
387                style: word.style.clone(),
388                leading_space: i == 0 && leading_space,
389            });
390        }
391    }
392
393    result
394}
395
396/// Break a single word into pieces that each fit within `avail_width`.
397///
398/// Returns at least one piece. In `Hyphenate` mode a `-` is appended to
399/// every piece except the last. Forward progress is always guaranteed: a
400/// single character is always emitted even if it exceeds the budget, so
401/// the loop cannot run forever on a pathologically narrow box.
402pub(crate) fn break_word(
403    word: &str,
404    avail_width: f64,
405    style: &TextStyle,
406    mode: WordBreak,
407    tt_fonts: &[TrueTypeFont],
408) -> Vec<String> {
409    let hyphen_w = if mode == WordBreak::Hyphenate {
410        measure_word("-", style, tt_fonts)
411    } else {
412        0.0
413    };
414    let mut pieces: Vec<String> = Vec::new();
415    let mut remaining = word;
416
417    while !remaining.is_empty() {
418        let budget = avail_width - hyphen_w;
419        let mut prefix_end = 0;
420        let mut prefix_width = 0.0;
421
422        for ch in remaining.chars() {
423            let next_end = prefix_end + ch.len_utf8();
424            let ch_w = measure_word(&remaining[..next_end], style, tt_fonts) - prefix_width;
425            if prefix_width + ch_w > budget && prefix_end > 0 {
426                break;
427            }
428            prefix_width += ch_w;
429            prefix_end = next_end;
430            // A single char already fills the budget — emit it and move on.
431            if prefix_width >= budget {
432                break;
433            }
434        }
435
436        // Degenerate: budget so tiny even one char didn't fit — take one char.
437        if prefix_end == 0 {
438            prefix_end = remaining.chars().next().map_or(0, |c| c.len_utf8());
439        }
440
441        let is_last = prefix_end >= remaining.len();
442        let piece = if !is_last && mode == WordBreak::Hyphenate {
443            format!("{}-", &remaining[..prefix_end])
444        } else {
445            remaining[..prefix_end].to_string()
446        };
447        pieces.push(piece);
448        remaining = &remaining[prefix_end..];
449    }
450    pieces
451}
452
453/// Compute line height based on font type.
454pub(crate) fn line_height_for(style: &TextStyle, tt_fonts: &[TrueTypeFont]) -> f64 {
455    match style.font {
456        FontRef::Builtin(b) => FontMetrics::line_height(b, style.font_size),
457        FontRef::TrueType(id) => tt_fonts[id.0].line_height(style.font_size),
458    }
459}
460
461/// Measure a word's width based on font type.
462pub(crate) fn measure_word(text: &str, style: &TextStyle, tt_fonts: &[TrueTypeFont]) -> f64 {
463    match style.font {
464        FontRef::Builtin(b) => FontMetrics::measure_text(text, b, style.font_size),
465        FontRef::TrueType(id) => tt_fonts[id.0].measure_text(text, style.font_size),
466    }
467}
468
469/// Get the PDF resource name for a font.
470fn pdf_font_name(font: FontRef, tt_fonts: &[TrueTypeFont]) -> String {
471    match font {
472        FontRef::Builtin(b) => b.pdf_name().to_string(),
473        FontRef::TrueType(id) => tt_fonts[id.0].pdf_name.clone(),
474    }
475}
476
477/// Record a font as used.
478fn record_font(font: &FontRef, used: &mut UsedFonts) {
479    match font {
480        FontRef::Builtin(b) => {
481            used.builtin.insert(*b);
482        }
483        FontRef::TrueType(id) => {
484            used.truetype.insert(id.0);
485        }
486    }
487}
488
489/// Emit text as either literal `(text) Tj` for builtin fonts
490/// or hex `<glyph_ids> Tj` for TrueType fonts.
491fn emit_text(text: &str, font: FontRef, tt_fonts: &mut [TrueTypeFont], output: &mut Vec<u8>) {
492    match font {
493        FontRef::Builtin(_) => {
494            let escaped = escape_pdf_string(text);
495            output.extend_from_slice(format!("({}) Tj\n", escaped).as_bytes());
496        }
497        FontRef::TrueType(id) => {
498            let hex = tt_fonts[id.0].encode_text_hex(text);
499            output.extend_from_slice(format!("{} Tj\n", hex).as_bytes());
500        }
501    }
502}
503
/// Unit tests for `break_word`.
///
/// Every test measures with the built-in Helvetica font at 12pt and passes
/// an empty TrueType font slice, so widths come from `measure_word` alone.
#[cfg(test)]
mod break_word_tests {
    use super::*;
    use crate::fonts::BuiltinFont;

    /// Helvetica 12pt TextStyle — the font we use throughout the tests.
    fn hv12() -> TextStyle {
        TextStyle::builtin(BuiltinFont::Helvetica, 12.0)
    }

    /// Measure a string with Helvetica 12pt, no TrueType fonts.
    fn w(text: &str) -> f64 {
        measure_word(text, &hv12(), &[])
    }

    // -------------------------------------------------------
    // Basic correctness
    // -------------------------------------------------------

    #[test]
    fn empty_word_returns_empty_vec() {
        // The outer while-loop exits immediately for an empty string.
        let pieces = break_word("", 100.0, &hv12(), WordBreak::BreakAll, &[]);
        assert!(pieces.is_empty());
    }

    #[test]
    fn word_that_fits_returns_single_unchanged_piece() {
        let style = hv12();
        let avail = w("hello") + 1.0; // generous budget
        let pieces = break_word("hello", avail, &style, WordBreak::BreakAll, &[]);
        assert_eq!(pieces, vec!["hello"]);
    }

    #[test]
    fn word_exactly_at_boundary_is_not_broken() {
        // When the word fills the budget exactly the loop exits on the
        // `prefix_width >= budget` break, then `prefix_end == remaining.len()`,
        // so it's treated as the last piece — no split.
        let style = hv12();
        let avail = w("www"); // exactly 3 w's wide
        let pieces = break_word("www", avail, &style, WordBreak::BreakAll, &[]);
        assert_eq!(pieces, vec!["www"]);
    }

    // -------------------------------------------------------
    // BreakAll mode
    // -------------------------------------------------------

    #[test]
    fn break_all_splits_evenly_on_char_boundary() {
        // "wwwwww" at budget of exactly 3 w's → ["www", "www"].
        // Helvetica 'w' = 722/1000 em → at 12pt = 8.664 pt.
        let style = hv12();
        let avail = w("www"); // ~25.992 pt; "wwww" = ~34.656 pt won't fit
        let pieces = break_word("wwwwww", avail, &style, WordBreak::BreakAll, &[]);
        assert_eq!(pieces, vec!["www", "www"]);
    }

    #[test]
    fn break_all_produces_no_hyphens() {
        let style = hv12();
        let avail = w("ww"); // force a split
        let pieces = break_word("wwww", avail, &style, WordBreak::BreakAll, &[]);
        for piece in &pieces {
            assert!(
                !piece.ends_with('-'),
                "BreakAll should not add hyphens, got: {:?}",
                pieces
            );
        }
    }

    #[test]
    fn break_all_three_pieces() {
        // "iiiiiiiii" (9 i's) at width of 3 i's → ["iii", "iii", "iii"].
        // Helvetica 'i' = 222/1000 em → at 12pt = 2.664 pt.
        let style = hv12();
        let avail = w("iii");
        let pieces = break_word("iiiiiiiii", avail, &style, WordBreak::BreakAll, &[]);
        assert_eq!(pieces, vec!["iii", "iii", "iii"]);
    }

    // -------------------------------------------------------
    // Hyphenate mode
    // -------------------------------------------------------

    #[test]
    fn hyphenate_adds_hyphen_to_non_last_pieces() {
        // Budget = 3w - hyphen_width.  'w' = 8.664pt, '-' = 3.996pt.
        // Budget ≈ 25.992 - 3.996 = 21.996pt → "ww" (17.328) fits, "www" doesn't.
        // So each non-last piece holds 2 w's plus a hyphen.
        let style = hv12();
        let avail = w("www"); // ~25.992 pt
        let pieces = break_word("wwwwww", avail, &style, WordBreak::Hyphenate, &[]);
        // Every piece except the last must end with '-'.
        let (last, rest) = pieces.split_last().unwrap();
        for piece in rest {
            assert!(
                piece.ends_with('-'),
                "non-last piece should end with '-', got: {:?}",
                piece
            );
        }
        assert!(
            !last.ends_with('-'),
            "last piece must not end with '-', got: {:?}",
            last
        );
    }

    #[test]
    fn hyphenate_last_piece_never_ends_with_hyphen() {
        // The final piece of any split must not carry a hyphen.
        // Use a word that requires 3 pieces so the invariant is non-trivial.
        let style = hv12();
        let avail = w("www"); // ~25.992 pt → forces multi-piece split
        let pieces = break_word("wwwwwwww", avail, &style, WordBreak::Hyphenate, &[]);
        assert!(pieces.len() > 1, "expected a split");
        assert!(!pieces.last().unwrap().ends_with('-'));
    }

    #[test]
    fn hyphenate_word_fitting_budget_produces_one_piece_without_hyphen() {
        // When avail is large enough that the word fits even after reserving
        // room for a hyphen, break_word returns a single unhyphenated piece.
        // avail = word_width + hyphen_width + 1pt leaves the budget ≥ word_width.
        let style = hv12();
        let avail = w("hello") + w("-") + 1.0;
        let pieces = break_word("hello", avail, &style, WordBreak::Hyphenate, &[]);
        assert_eq!(pieces, vec!["hello"]);
    }

    #[test]
    fn hyphenate_pieces_respect_hyphen_width_budget() {
        // Each piece (including the hyphen on non-last pieces) must fit
        // within avail.
        let style = hv12();
        let avail = w("www"); // ~25.992 pt
        let pieces = break_word("wwwwwwwwww", avail, &style, WordBreak::Hyphenate, &[]);
        for piece in &pieces {
            let piece_w = measure_word(piece, &style, &[]);
            assert!(
                piece_w <= avail + f64::EPSILON,
                "piece {:?} ({:.3}pt) exceeds avail ({:.3}pt)",
                piece,
                piece_w,
                avail
            );
        }
    }

    // -------------------------------------------------------
    // Forward-progress guarantee (degenerate narrow box)
    // -------------------------------------------------------

    #[test]
    fn single_char_wider_than_budget_still_emitted() {
        // When even one character is wider than the budget, one character
        // is still taken per piece, so the loop always terminates.
        let style = hv12();
        let tiny = 1.0; // far smaller than any glyph
        let pieces = break_word("iii", tiny, &style, WordBreak::BreakAll, &[]);
        // One char per piece — forward progress guaranteed.
        assert_eq!(pieces, vec!["i", "i", "i"]);
    }

    #[test]
    fn single_char_word_with_tiny_budget_returns_that_char() {
        let style = hv12();
        let pieces = break_word("w", 1.0, &style, WordBreak::BreakAll, &[]);
        assert_eq!(pieces, vec!["w"]);
    }

    // -------------------------------------------------------
    // Unicode safety
    // -------------------------------------------------------

    #[test]
    fn multibyte_chars_split_on_codepoint_boundary() {
        // "é" is U+00E9, encoded as 2 bytes in UTF-8.
        // Ensure break_word never slices inside a multi-byte character
        // (that would panic when indexing the `&str`).
        // NOTE(review): presumably the built-in metrics fall back to some
        // default width for non-ASCII characters — confirm in `FontMetrics`.
        let style = hv12();
        let pieces = break_word("éàü", 1.0, &style, WordBreak::BreakAll, &[]);
        // No piece may be empty.
        for piece in &pieces {
            assert!(!piece.is_empty());
        }
        // All characters must be accounted for.
        let rejoined: String = pieces.join("");
        assert_eq!(rejoined, "éàü");
    }
}
696}