runefix_core/atom.rs
1//! Atom segmentation (width-aware layout units).
2//!
3//! This module defines [`atoms()`], a runefix-specific alternative to graphemes.
4//! It segments a string into visual display units for terminal and TUI rendering.
5//!
6//! This helps solve alignment bugs in monospaced environments caused by emoji and CJK widths.
7
8use crate::RuneDisplayWidth;
9
10/// Splits the input string into **layout atoms** — visual units used for width-aware layout.
11///
12/// This is a **runefix-specific segmentation**, based on actual display width, not linguistic boundaries.
13/// It differs from [`graphemes()`] (which follows Unicode UAX #29) by focusing purely on units that affect layout:
14///
15/// - Characters with width = 0 (e.g., combining marks, control codes) are grouped with their leading base
16/// - Emoji sequences (e.g. ZWJ, variation selectors) are preserved as atomic units
17/// - Output is suitable for TUI rendering, Markdown table layout, and CLI alignment
18///
19/// # Example
20/// ```
21/// use runefix_core::atoms;
22/// assert_eq!(atoms("👩❤️💋👨"), vec!["👩", "\u{200d}", "❤", "\u{fe0f}", "\u{200d}", "💋", "\u{200d}", "👨"]);
23/// ```
24///
25/// # Note
26/// This function is **not** Unicode-compliant segmentation. For that, see [`graphemes()`].
27pub fn atoms(s: &str) -> Vec<&str> {
28 let mut atoms = Vec::new(); // Store resulting display atoms
29 let mut start = 0; // Current segment start position
30
31 for (i, c) in s.char_indices() {
32 // Determine if this char has visual width
33 let w = c.width();
34
35 if w > 0 {
36 if start < i {
37 // Push preceding zero-width chars (e.g. ZWJ, marks)
38 atoms.push(&s[start..i]);
39 }
40
41 // Push current width-bearing char as an atom
42 atoms.push(&s[i..i + c.len_utf8()]);
43 start = i + c.len_utf8();
44 }
45 }
46
47 if start < s.len() {
48 // Push trailing zero-width sequence if any
49 atoms.push(&s[start..]);
50 }
51
52 atoms
53}