Skip to main content

lb_rs/model/text/
units.rs

//! Type-safe text addressing — companion to [`offset_types`][parent].
//!
//! `offset_types` provides the position/count newtypes ([`Byte`],
//! [`Bytes`], [`Grapheme`], [`Graphemes`]) and their algebra. This module
//! adds the *unit-aware constructors* that make those types load-bearing
//! against the bug class we kept hitting:
//!
//! - [`Graphemes::measure_replace`] — actual graphemes contributed by a
//!   Replace, accounting for seam fusion (Devanagari spacing marks, ZWJ
//!   sequences). The OT-correct number; the only constructor for an
//!   OT-suitable `Graphemes`.
//! - [`Graphemes::from_isolated_str`] — in-isolation count (what
//!   `text.graphemes(true).count()` returns). Named `_isolated_` so misuse
//!   stands out in review; legitimate for display widths but **wrong** for
//!   OT or cursor placement.
//! - [`UnicodeSegs::byte_to_grapheme_strict`] / `_floor` / `_ceil` —
//!   conversions from a non-grapheme-aware byte source (cosmic-text glyphs,
//!   comrak sourcepos). Strict returns a `Result`; the snapping variants
//!   round down (`_floor`) or up (`_ceil`) to a cluster boundary.
//!
//! [parent]: super::offset_types
//!
//! ## Codepoint type
//!
//! [`Codepoint`] / [`Codepoints`] live here rather than in `offset_types`
//! because the legacy code never had a codepoint unit — codepoints are
//! introduced as part of the type-safety pass to make conversions through
//! cosmic-text glyph positions and comrak sourcepos explicit.
29
30use super::offset_types::{Byte, Grapheme, Graphemes, RangeExt};
31use super::unicode_segs::UnicodeSegs;
32use unicode_segmentation::UnicodeSegmentation;
33
34// ─── Codepoint unit ──────────────────────────────────────────────────────
35
/// Position of a Unicode scalar value within a text, counted in Rust
/// `char`s (any code point in U+0000..=U+10FFFF except the surrogates).
#[derive(Clone, Copy, Debug, Default, Hash, PartialEq, Eq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct Codepoint(pub usize);
41
/// How many Unicode scalar values (Rust `char`s) a piece of text holds.
#[derive(Clone, Copy, Debug, Default, Hash, PartialEq, Eq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct Codepoints(pub usize);

impl Codepoints {
    /// Counts the scalar values in `s`.
    ///
    /// The result depends only on `s` itself: unlike grapheme clusters,
    /// codepoints never fuse with neighboring text, so the count stays
    /// valid wherever `s` is later spliced.
    pub fn measure(s: &str) -> Self {
        let scalar_count = s.chars().count();
        Self(scalar_count)
    }
}
54
55// ─── `Graphemes` measurement (the load-bearing OT constructor) ──────────
56
57impl Graphemes {
58    /// Actual graphemes contributed by replacing `replaced` (in `old_segs`)
59    /// with new text whose effect is captured by `new_segs`. Accounts for
60    /// seam fusion at the boundaries of the replaced range — this is the
61    /// number that OT position math requires.
62    ///
63    /// The math: new buffer's grapheme count = old count − replaced + actual,
64    /// so actual = (new_total + replaced) − old_total.
65    pub fn measure_replace(
66        old_segs: &UnicodeSegs, new_segs: &UnicodeSegs, replaced: (Grapheme, Grapheme),
67    ) -> Self {
68        let old_total = old_segs.last_grapheme();
69        let new_total = new_segs.last_grapheme();
70        let replaced_len = replaced.len();
71        Self((new_total.0 + replaced_len.0).saturating_sub(old_total.0))
72    }
73
74    /// Grapheme count of `s` *in isolation* — what `unicode_segmentation`
75    /// reports without context. **Under-counts** when `s` is later spliced
76    /// into a buffer where its boundary characters fuse with neighbors (a
77    /// Devanagari spacing mark joining the preceding consonant; a ZWJ
78    /// joining adjacent emoji into one cluster).
79    ///
80    /// Use only for purposes that genuinely want the in-isolation count
81    /// (display widths, soft constraints). For OT or cursor placement, use
82    /// [`Graphemes::measure_replace`] instead.
83    pub fn from_isolated_str(s: &str) -> Self {
84        Self(s.graphemes(true).count())
85    }
86}
87
88// ─── Conversions on `UnicodeSegs` ────────────────────────────────────────
89
90/// Returned when a strict byte→grapheme conversion is asked for a byte that
91/// doesn't lie on a grapheme cluster boundary.
92#[derive(Debug)]
93pub enum BoundaryError {
94    NotGraphemeAligned(Byte),
95}
96
97impl UnicodeSegs {
98    /// Last valid grapheme position — i.e. the position one past the last
99    /// grapheme cluster, where the cursor sits at end-of-buffer.
100    pub fn last_grapheme(&self) -> Grapheme {
101        Grapheme(self.grapheme_indexes.len().saturating_sub(1))
102    }
103
104    /// Strict: byte must be on a grapheme boundary.
105    pub fn byte_to_grapheme_strict(&self, b: Byte) -> Result<Grapheme, BoundaryError> {
106        match self.grapheme_indexes.binary_search(&b) {
107            Ok(i) => Ok(Grapheme(i)),
108            Err(_) => Err(BoundaryError::NotGraphemeAligned(b)),
109        }
110    }
111
112    /// Snap down to the start of the cluster containing `b`. Use for
113    /// inclusive boundaries from a non-grapheme-aware source.
114    pub fn byte_to_grapheme_floor(&self, b: Byte) -> Grapheme {
115        match self.grapheme_indexes.binary_search(&b) {
116            Ok(i) => Grapheme(i),
117            Err(i) => Grapheme(i.saturating_sub(1)),
118        }
119    }
120
121    /// Snap up to the start of the next cluster. Use for exclusive
122    /// boundaries from a non-grapheme-aware source — the cluster containing
123    /// `b` ends up included in whatever range this byte terminates.
124    pub fn byte_to_grapheme_ceil(&self, b: Byte) -> Grapheme {
125        match self.grapheme_indexes.binary_search(&b) {
126            Ok(i) => Grapheme(i),
127            Err(i) => Grapheme(i.min(self.grapheme_indexes.len().saturating_sub(1))),
128        }
129    }
130
131    /// Always-safe direction.
132    pub fn grapheme_to_byte(&self, g: Grapheme) -> Byte {
133        self.grapheme_indexes[g.0]
134    }
135}