lb_rs/model/text/units.rs
1//! Type-safe text addressing — companion to [`offset_types`][parent].
2//!
3//! `offset_types` provides the position/count newtypes ([`Byte`],
4//! [`Bytes`], [`Grapheme`], [`Graphemes`]) and their algebra. This module
5//! adds the *unit-aware constructors* that make those types load-bearing
6//! against the bug class we kept hitting:
7//!
8//! - [`Graphemes::measure_replace`] — actual graphemes contributed by a
9//! Replace, accounting for seam fusion (Devanagari spacing marks, ZWJ
10//! sequences). The OT-correct number; the only constructor for an
11//! OT-suitable `Graphemes`.
12//! - [`Graphemes::from_isolated_str`] — in-isolation count (what
13//! `text.graphemes(true).count()` returns). Named `_isolated_` so misuse
14//! stands out in review; legitimate for display widths but **wrong** for
15//! OT or cursor placement.
16//! - [`UnicodeSegs::byte_to_grapheme_strict`] / `_floor` / `_ceil` —
17//! conversions from a non-grapheme-aware byte source (cosmic-text glyphs,
//!   comrak sourcepos). Strict returns a `Result`; the snapping variants
//!   round down (`_floor`) or up (`_ceil`) to a cluster boundary.
20//!
21//! [parent]: super::offset_types
22//!
23//! ## Codepoint type
24//!
25//! [`Codepoint`] / [`Codepoints`] live here rather than in `offset_types`
26//! because the legacy code never had a codepoint unit — codepoints are
27//! introduced as part of the type-safety pass to make conversions through
28//! cosmic-text glyph positions and comrak sourcepos explicit.
29
30use super::offset_types::{Byte, Grapheme, Graphemes, RangeExt};
31use super::unicode_segs::UnicodeSegs;
32use unicode_segmentation::UnicodeSegmentation;
33
34// ─── Codepoint unit ──────────────────────────────────────────────────────
35
/// Position measured in Unicode scalar values: each unit is one Rust
/// `char` (U+0000 through U+10FFFF, surrogates excluded).
#[derive(Clone, Copy, Debug, Default, Hash, PartialEq, Eq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct Codepoint(pub usize);
41
/// Length measured in Unicode scalar values (Rust `char`s).
#[derive(Clone, Copy, Debug, Default, Hash, PartialEq, Eq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct Codepoints(pub usize);
46
47impl Codepoints {
48 /// Codepoint count of `s`. Always correct — codepoints are local to
49 /// the string, no fusion concerns.
50 pub fn measure(s: &str) -> Self {
51 Self(s.chars().count())
52 }
53}
54
55// ─── `Graphemes` measurement (the load-bearing OT constructor) ──────────
56
57impl Graphemes {
58 /// Actual graphemes contributed by replacing `replaced` (in `old_segs`)
59 /// with new text whose effect is captured by `new_segs`. Accounts for
60 /// seam fusion at the boundaries of the replaced range — this is the
61 /// number that OT position math requires.
62 ///
63 /// The math: new buffer's grapheme count = old count − replaced + actual,
64 /// so actual = (new_total + replaced) − old_total.
65 pub fn measure_replace(
66 old_segs: &UnicodeSegs, new_segs: &UnicodeSegs, replaced: (Grapheme, Grapheme),
67 ) -> Self {
68 let old_total = old_segs.last_grapheme();
69 let new_total = new_segs.last_grapheme();
70 let replaced_len = replaced.len();
71 Self((new_total.0 + replaced_len.0).saturating_sub(old_total.0))
72 }
73
74 /// Grapheme count of `s` *in isolation* — what `unicode_segmentation`
75 /// reports without context. **Under-counts** when `s` is later spliced
76 /// into a buffer where its boundary characters fuse with neighbors (a
77 /// Devanagari spacing mark joining the preceding consonant; a ZWJ
78 /// joining adjacent emoji into one cluster).
79 ///
80 /// Use only for purposes that genuinely want the in-isolation count
81 /// (display widths, soft constraints). For OT or cursor placement, use
82 /// [`Graphemes::measure_replace`] instead.
83 pub fn from_isolated_str(s: &str) -> Self {
84 Self(s.graphemes(true).count())
85 }
86}
87
88// ─── Conversions on `UnicodeSegs` ────────────────────────────────────────
89
90/// Returned when a strict byte→grapheme conversion is asked for a byte that
91/// doesn't lie on a grapheme cluster boundary.
92#[derive(Debug)]
93pub enum BoundaryError {
94 NotGraphemeAligned(Byte),
95}
96
97impl UnicodeSegs {
98 /// Last valid grapheme position — i.e. the position one past the last
99 /// grapheme cluster, where the cursor sits at end-of-buffer.
100 pub fn last_grapheme(&self) -> Grapheme {
101 Grapheme(self.grapheme_indexes.len().saturating_sub(1))
102 }
103
104 /// Strict: byte must be on a grapheme boundary.
105 pub fn byte_to_grapheme_strict(&self, b: Byte) -> Result<Grapheme, BoundaryError> {
106 match self.grapheme_indexes.binary_search(&b) {
107 Ok(i) => Ok(Grapheme(i)),
108 Err(_) => Err(BoundaryError::NotGraphemeAligned(b)),
109 }
110 }
111
112 /// Snap down to the start of the cluster containing `b`. Use for
113 /// inclusive boundaries from a non-grapheme-aware source.
114 pub fn byte_to_grapheme_floor(&self, b: Byte) -> Grapheme {
115 match self.grapheme_indexes.binary_search(&b) {
116 Ok(i) => Grapheme(i),
117 Err(i) => Grapheme(i.saturating_sub(1)),
118 }
119 }
120
121 /// Snap up to the start of the next cluster. Use for exclusive
122 /// boundaries from a non-grapheme-aware source — the cluster containing
123 /// `b` ends up included in whatever range this byte terminates.
124 pub fn byte_to_grapheme_ceil(&self, b: Byte) -> Grapheme {
125 match self.grapheme_indexes.binary_search(&b) {
126 Ok(i) => Grapheme(i),
127 Err(i) => Grapheme(i.min(self.grapheme_indexes.len().saturating_sub(1))),
128 }
129 }
130
131 /// Always-safe direction.
132 pub fn grapheme_to_byte(&self, g: Grapheme) -> Byte {
133 self.grapheme_indexes[g.0]
134 }
135}