Skip to main content

tess/
render.rs

1use unicode_segmentation::UnicodeSegmentation;
2use unicode_width::UnicodeWidthStr;
3
/// One cell of a rendered display row.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Cell {
    /// The leading cell of a glyph. `width` is the number of cells the glyph
    /// spans in total; the cells after the first are `Continuation`.
    Char {
        ch: char,
        width: u8,
    },
    /// A trailing cell of a glyph that spans more than one cell.
    Continuation,
    /// A cell with no content, used to pad a row out to its full width.
    Empty,
}
10
/// Layout settings used when turning a line of bytes into display rows.
#[derive(Clone, Debug)]
pub struct RenderOpts {
    /// Distance between tab stops, in columns.
    pub tab_width: u8,
    /// When true, content past the last column continues on a new row;
    /// when false, it is cut off.
    pub wrap: bool,
    /// Number of columns in a display row.
    pub cols: u16,
}
17
18impl Default for RenderOpts {
19    fn default() -> Self {
20        Self { tab_width: 8, wrap: true, cols: 80 }
21    }
22}
23
24/// Try to decode one grapheme cluster starting at `bytes[i]`.
25/// Returns the cluster as &str and number of bytes consumed.
26/// Returns None if `bytes[i..]` does not begin with a valid UTF-8 sequence.
27fn decode_cluster(bytes: &[u8], i: usize) -> Option<(&str, usize)> {
28    // Find the longest valid UTF-8 prefix starting at i (capped at 4 bytes
29    // for the first codepoint, then continue while next codepoint is a
30    // zero-width continuation of the same cluster).
31    // Strategy: try to validate up to 4 bytes for the leading codepoint,
32    // then extend as long as additional codepoints belong to the same cluster.
33
34    // First, validate one codepoint.
35    let max = (i + 4).min(bytes.len());
36    let mut end = i;
37    for try_end in (i + 1)..=max {
38        if std::str::from_utf8(&bytes[i..try_end]).is_ok() {
39            end = try_end;
40            break;
41        }
42    }
43    if end == i {
44        return None;
45    }
46
47    // Now extend by additional valid codepoints that the segmenter groups
48    // into the first cluster. Use unicode-segmentation for cluster boundaries.
49    // We keep adding bytes (validated as UTF-8) until the cluster boundary
50    // changes or we run out of bytes.
51    let mut probe_end = end;
52    loop {
53        // Try extending by up to 4 more bytes.
54        let probe_max = (probe_end + 4).min(bytes.len());
55        let mut next_end = probe_end;
56        for try_end in (probe_end + 1)..=probe_max {
57            if std::str::from_utf8(&bytes[i..try_end]).is_ok() {
58                next_end = try_end;
59                break;
60            }
61        }
62        if next_end == probe_end {
63            break;
64        }
65        let candidate = std::str::from_utf8(&bytes[i..next_end]).unwrap();
66        let cluster_count = candidate.graphemes(true).count();
67        if cluster_count > 1 {
68            // Adding broke into a new cluster; stop at probe_end.
69            break;
70        }
71        probe_end = next_end;
72    }
73
74    Some((std::str::from_utf8(&bytes[i..probe_end]).unwrap(), probe_end - i))
75}
76
77pub fn render_line(bytes: &[u8], opts: &RenderOpts) -> Vec<Vec<Cell>> {
78    let cols = opts.cols as usize;
79    let mut rows: Vec<Vec<Cell>> = Vec::new();
80    let mut current: Vec<Cell> = Vec::with_capacity(cols);
81
82    fn push(current: &mut Vec<Cell>, rows: &mut Vec<Vec<Cell>>, cell: Cell, opts: &RenderOpts) {
83        if current.len() >= opts.cols as usize {
84            if opts.wrap {
85                let mut full = std::mem::replace(current, Vec::with_capacity(opts.cols as usize));
86                while full.len() < opts.cols as usize { full.push(Cell::Empty); }
87                rows.push(full);
88            } else {
89                return;
90            }
91        }
92        current.push(cell);
93    }
94
95    fn push_str(current: &mut Vec<Cell>, rows: &mut Vec<Vec<Cell>>, s: &str, opts: &RenderOpts) {
96        for c in s.chars() {
97            push(current, rows, Cell::Char { ch: c, width: 1 }, opts);
98        }
99    }
100
101    fn push_wide(
102        current: &mut Vec<Cell>,
103        rows: &mut Vec<Vec<Cell>>,
104        ch: char,
105        width: u8,
106        opts: &RenderOpts,
107    ) {
108        let cols = opts.cols as usize;
109        // If the wide char wouldn't fit in the remainder of this row, wrap first.
110        if current.len() + width as usize > cols {
111            if opts.wrap {
112                let mut full = std::mem::replace(current, Vec::with_capacity(cols));
113                while full.len() < cols { full.push(Cell::Empty); }
114                rows.push(full);
115            } else {
116                return; // chop
117            }
118        }
119        current.push(Cell::Char { ch, width });
120        for _ in 1..width {
121            current.push(Cell::Continuation);
122        }
123    }
124
125    let mut i = 0;
126    while i < bytes.len() {
127        let b = bytes[i];
128        if b == b'\t' {
129            let stop = opts.tab_width.max(1) as usize;
130            let cur_col = current.len();
131            let next_stop = ((cur_col / stop) + 1) * stop;
132            for _ in cur_col..next_stop {
133                push(&mut current, &mut rows, Cell::Char { ch: ' ', width: 1 }, opts);
134            }
135            i += 1;
136        } else if b == b'\n' {
137            i += 1;
138        } else if b < 0x20 || b == 0x7F {
139            let printable = if b == 0x7F { '?' } else { (b ^ 0x40) as char };
140            push(&mut current, &mut rows, Cell::Char { ch: '^', width: 1 }, opts);
141            push(&mut current, &mut rows, Cell::Char { ch: printable, width: 1 }, opts);
142            i += 1;
143        } else {
144            // Try to decode a UTF-8 grapheme cluster starting at i.
145            match decode_cluster(bytes, i) {
146                Some((cluster, consumed)) => {
147                    let w = UnicodeWidthStr::width(cluster) as u8;
148                    let base_char = cluster.chars().next().unwrap_or('\u{FFFD}');
149                    if w == 0 {
150                        // Lone combining mark with no base — emit replacement.
151                        push(&mut current, &mut rows, Cell::Char { ch: '\u{FFFD}', width: 1 }, opts);
152                    } else {
153                        push_wide(&mut current, &mut rows, base_char, w, opts);
154                    }
155                    i += consumed;
156                }
157                None => {
158                    // Invalid byte: emit <HH>, advance one byte.
159                    let s = format!("<{:02X}>", b);
160                    push_str(&mut current, &mut rows, &s, opts);
161                    i += 1;
162                }
163            }
164        }
165    }
166
167    while current.len() < cols {
168        current.push(Cell::Empty);
169    }
170    rows.push(current);
171    rows
172}
173
174pub fn count_rows(bytes: &[u8], opts: &RenderOpts) -> usize {
175    if !opts.wrap {
176        return 1;
177    }
178    let cols = opts.cols.max(1) as usize;
179    let mut col = 0usize;
180    let mut rows = 1usize;
181
182    let bump = |w: usize, col: &mut usize, rows: &mut usize| {
183        if *col + w > cols {
184            *rows += 1;
185            *col = 0;
186        }
187        *col += w;
188    };
189
190    let mut i = 0;
191    while i < bytes.len() {
192        let b = bytes[i];
193        if b == b'\t' {
194            let stop = opts.tab_width.max(1) as usize;
195            let next_stop = ((col / stop) + 1) * stop;
196            let advance = next_stop - col;
197            // Tabs may overflow into multiple wraps if cols < tab_width.
198            for _ in 0..advance {
199                bump(1, &mut col, &mut rows);
200            }
201            i += 1;
202        } else if b == b'\n' {
203            i += 1;
204        } else if b < 0x20 || b == 0x7F {
205            bump(1, &mut col, &mut rows); // ^
206            bump(1, &mut col, &mut rows); // X
207            i += 1;
208        } else {
209            match decode_cluster(bytes, i) {
210                Some((cluster, consumed)) => {
211                    let w = UnicodeWidthStr::width(cluster);
212                    let w = if w == 0 { 1 } else { w };
213                    bump(w, &mut col, &mut rows);
214                    i += consumed;
215                }
216                None => {
217                    // <HH> = 4 cells
218                    for _ in 0..4 { bump(1, &mut col, &mut rows); }
219                    i += 1;
220                }
221            }
222        }
223    }
224    rows
225}
226
/// Unit tests for `render_line` and `count_rows`.
#[cfg(test)]
mod tests {
    use super::*;

    /// Options with the default tab width of 8.
    fn opts(cols: u16, wrap: bool) -> RenderOpts {
        RenderOpts { tab_width: 8, wrap, cols }
    }

    /// A single-width character cell.
    fn ch(c: char) -> Cell {
        Cell::Char { ch: c, width: 1 }
    }

    /// Asserts that `count_rows` reports as many rows as `render_line` builds.
    fn assert_row_parity(bytes: &[u8], o: &RenderOpts) {
        assert_eq!(count_rows(bytes, o), render_line(bytes, o).len());
    }

    #[test]
    fn ascii_short_line_pads_to_cols() {
        let rows = render_line(b"hi", &opts(5, true));
        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0], vec![ch('h'), ch('i'), Cell::Empty, Cell::Empty, Cell::Empty]);
    }

    #[test]
    fn ascii_exact_width() {
        let rows = render_line(b"hello", &opts(5, true));
        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0], vec![ch('h'), ch('e'), ch('l'), ch('l'), ch('o')]);
    }

    #[test]
    fn empty_input_yields_one_empty_row() {
        let rows = render_line(b"", &opts(3, true));
        assert_eq!(rows, vec![vec![Cell::Empty, Cell::Empty, Cell::Empty]]);
    }

    #[test]
    fn newline_is_skipped() {
        let rows = render_line(b"a\nb", &opts(3, true));
        assert_eq!(rows, vec![vec![ch('a'), ch('b'), Cell::Empty]]);
    }

    #[test]
    fn tab_at_col_zero_expands_to_eight() {
        let rows = render_line(b"\tx", &opts(20, true));
        // Eight spaces, then 'x', then padding.
        for (i, cell) in rows[0].iter().take(8).enumerate() {
            assert_eq!(*cell, ch(' '), "col {i} should be space");
        }
        assert_eq!(rows[0][8], ch('x'));
    }

    #[test]
    fn tab_at_col_three_advances_to_next_stop() {
        // "abc\tx" → cols 0,1,2 = a,b,c; tab fills to col 8 with spaces; col 8 = x
        let rows = render_line(b"abc\tx", &opts(20, true));
        assert_eq!(rows[0][0], ch('a'));
        assert_eq!(rows[0][2], ch('c'));
        for cell in rows[0].iter().skip(3).take(5) {
            assert_eq!(*cell, ch(' '));
        }
        assert_eq!(rows[0][8], ch('x'));
    }

    #[test]
    fn tab_at_col_eight_advances_to_sixteen() {
        let mut input = vec![b'a'; 8];
        input.push(b'\t');
        input.push(b'x');
        let rows = render_line(&input, &opts(20, true));
        for cell in rows[0].iter().skip(8).take(8) {
            assert_eq!(*cell, ch(' '));
        }
        assert_eq!(rows[0][16], ch('x'));
    }

    #[test]
    fn null_renders_as_caret_at() {
        let rows = render_line(b"\0", &opts(5, true));
        assert_eq!(rows[0][0], ch('^'));
        assert_eq!(rows[0][1], ch('@'));
    }

    #[test]
    fn esc_renders_as_caret_lbracket() {
        let rows = render_line(b"\x1b", &opts(5, true));
        assert_eq!(rows[0][0], ch('^'));
        assert_eq!(rows[0][1], ch('['));
    }

    #[test]
    fn del_renders_as_caret_question() {
        let rows = render_line(b"\x7f", &opts(5, true));
        assert_eq!(rows[0][0], ch('^'));
        assert_eq!(rows[0][1], ch('?'));
    }

    #[test]
    fn invalid_utf8_byte_renders_as_angle_hex() {
        let rows = render_line(&[0xFF], &opts(8, true));
        assert_eq!(rows[0][0], ch('<'));
        assert_eq!(rows[0][1], ch('F'));
        assert_eq!(rows[0][2], ch('F'));
        assert_eq!(rows[0][3], ch('>'));
    }

    #[test]
    fn partial_multibyte_each_byte_renders_separately() {
        // 0xC3 starts a 2-byte sequence; alone it's invalid → <C3>
        let rows = render_line(&[0xC3], &opts(8, true));
        assert_eq!(rows[0][0], ch('<'));
        assert_eq!(rows[0][1], ch('C'));
        assert_eq!(rows[0][2], ch('3'));
        assert_eq!(rows[0][3], ch('>'));

        // 0xE6 0x97 is 日 (E6 97 A5) with its last byte missing; neither
        // byte decodes, so each one gets its own <HH>.
        let rows = render_line(&[0xE6, 0x97], &opts(8, true));
        let expected: Vec<Cell> = "<E6><97>".chars().map(ch).collect();
        assert_eq!(rows, vec![expected]);
    }

    #[test]
    fn two_byte_utf8_e_acute() {
        // é is encoded as two bytes but occupies a single column.
        let rows = render_line("é".as_bytes(), &opts(5, true));
        assert_eq!(rows[0][0], Cell::Char { ch: 'é', width: 1 });
    }

    #[test]
    fn cjk_char_takes_two_columns() {
        // 日 is width 2.
        let rows = render_line("日".as_bytes(), &opts(5, true));
        assert_eq!(rows[0][0], Cell::Char { ch: '日', width: 2 });
        assert_eq!(rows[0][1], Cell::Continuation);
        assert_eq!(rows[0][2], Cell::Empty);
    }

    #[test]
    fn emoji_takes_two_columns() {
        let rows = render_line("🦀".as_bytes(), &opts(5, true));
        // Width depends on unicode-width; crab emoji is width 2.
        assert!(matches!(rows[0][0], Cell::Char { width: 2, .. }));
        assert_eq!(rows[0][1], Cell::Continuation);
    }

    #[test]
    fn combining_mark_folds_into_prior_cell() {
        // "e\u{0301}" is one grapheme cluster (e with combining acute).
        let rows = render_line("e\u{0301}".as_bytes(), &opts(5, true));
        // Cluster renders as a single cell carrying base char.
        assert!(matches!(rows[0][0], Cell::Char { width: 1, .. }));
        assert_eq!(rows[0][1], Cell::Empty);
    }

    #[test]
    fn wrap_long_line_into_multiple_rows() {
        let rows = render_line(b"abcdefghij", &opts(4, true));
        assert_eq!(rows.len(), 3);
        assert_eq!(rows[0], vec![ch('a'), ch('b'), ch('c'), ch('d')]);
        assert_eq!(rows[1], vec![ch('e'), ch('f'), ch('g'), ch('h')]);
        assert_eq!(rows[2], vec![ch('i'), ch('j'), Cell::Empty, Cell::Empty]);
    }

    #[test]
    fn chop_long_line_truncates() {
        let rows = render_line(b"abcdefghij", &opts(4, false));
        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0], vec![ch('a'), ch('b'), ch('c'), ch('d')]);
    }

    #[test]
    fn wide_char_at_boundary_pushed_to_next_row() {
        // cols=3, content "ab日" — 日 is width 2, doesn't fit at col 2,
        // so row 0 = a, b, Empty; row 1 = 日, Continuation, Empty.
        let rows = render_line("ab日".as_bytes(), &opts(3, true));
        assert_eq!(rows.len(), 2);
        assert_eq!(rows[0], vec![ch('a'), ch('b'), Cell::Empty]);
        assert_eq!(rows[1][0], Cell::Char { ch: '日', width: 2 });
        assert_eq!(rows[1][1], Cell::Continuation);
        assert_eq!(rows[1][2], Cell::Empty);
    }

    #[test]
    fn count_rows_matches_render_line_for_short() {
        assert_row_parity(b"hello world", &opts(80, true));
    }

    #[test]
    fn count_rows_matches_render_line_for_long_wrap() {
        assert_row_parity(b"abcdefghij", &opts(4, true));
    }

    #[test]
    fn count_rows_chop_is_one() {
        let o = opts(4, false);
        let bytes = b"abcdefghij";
        assert_eq!(count_rows(bytes, &o), 1);
    }

    #[test]
    fn count_rows_handles_wide_char() {
        assert_row_parity("ab日".as_bytes(), &opts(3, true));
    }

    #[test]
    fn count_rows_matches_render_line_for_tab_wider_than_row() {
        // Eight spaces across 4 columns fill two rows; 'x' starts a third.
        let o = opts(4, true);
        assert_eq!(count_rows(b"\tx", &o), 3);
        assert_row_parity(b"\tx", &o);
    }

    #[test]
    fn count_rows_matches_render_line_for_control_bytes() {
        // a ^ A | ^ B → two rows of three columns.
        let o = opts(3, true);
        assert_eq!(count_rows(b"a\x01\x02", &o), 2);
        assert_row_parity(b"a\x01\x02", &o);
    }

    #[test]
    fn count_rows_matches_render_line_for_invalid_bytes() {
        // <FF><FE> is eight cells → three rows of three columns.
        let o = opts(3, true);
        assert_eq!(count_rows(&[0xFF, 0xFE], &o), 3);
        assert_row_parity(&[0xFF, 0xFE], &o);
    }
}