print_positions/lib.rs
1//! The [print_positions] and [print_position_data] functions
2//! provide iterators which return "print positions".
3//!
4//! A print position is a generalization of the
5//! [UAX#29 extended grapheme cluster](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) to include rendering color and emphasis of the user-visible
6//! character using
7//! [ANSI escape codes](https://en.wikipedia.org/wiki/ANSI_escape_code#Description).
8//! So a "print position" is an even longer multi-byte sequence that still represents a single user visible character on the screen.
9//!
10//! ## Example:
11//! ```rust
12//! use print_positions::{print_positions, print_position_data};
13//!
14//! // content is e with dieresis, displayed in green with a color reset at the end.
15//! // Looks like 1 character on the screen. See example "padding" to print one out.
16//! let content = &["\u{1b}[30;42m", "\u{0065}", "\u{0308}", "\u{1b}[0m"].join("");
17//!
18//! // access number of print positions without examining the content
19//! assert_eq!(print_positions(content).count(), 1);
20//!
21//! let segmented:Vec<_> = print_position_data(content).collect();
//! assert_eq!(content.len(), 15); // content is 15 bytes long
23//! assert_eq!(segmented.len(), 1); // but only 1 print position
24//!
25//! ```
26//! ## Rationale:
27//! In the good old days, a "character" was a simple entity. It would always fit into one octet
28//! (or perhaps only a [sestet](https://retrocomputing.stackexchange.com/questions/7937/last-computer-not-to-use-octets-8-bit-bytes)).
29//! You could access the i'th character in a string by accessing the i'th element of its array.
30//! And you could process characters in any human language you wanted, as long as it was (transliterated into) English.
31//!
32//! Modern applications must support multiple natural languages and some are rendered on an ANSI-compatible
33//! screen (or, less often, print device). So it's a given that what a user would consider a simple "character", visible as a single
34//! glyph on the screen, is represented in memory by multiple and variable numbers of bytes.
35//!
36//! This crate provides a tool to make it once again easy to access the i'th "character" of a word on the screen
37//! by indexing to the i'th element of an array, but the array now consists of "print positions" rather than bytes or primitive type `char`s.
38//! See iterator [PrintPositionData].
39//!
40//! Sometimes you don't even need to access the character data itself, you just want to know how many visible
41//! columns it will consume on the screen, in order to align it with other text or within a fixed area on the screen. See iterator [PrintPositions].
42//!
43
44#[cfg(test)]
45mod tests;
46
47use unicode_segmentation::{GraphemeIndices, UnicodeSegmentation};
48
49/// This iterator identifies print positions in the source string and returns start and end offsets of
50/// the data rather than the data itself.
51/// See [PrintPositionData] if you want to iterate through the data instead.
52///
53/// A print position is an immutable slice of the source string. It contains 1 grapheme cluster (by definition)
54/// and any ANSI escape codes found between graphemes in the source. The ANSI escape codes will generally *preceed*
55/// the grapheme (since these codes change the rendering of characters that follow), but sometimes will *follow* the
56/// grapheme (for the few codes that reset special graphic rendering).
57///
58/// ```rust
59/// use print_positions::print_positions;
60///
61/// let content = "\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}abc";
62/// let segments: Vec<(usize, usize)> = print_positions(content).collect();
63/// assert_eq!(vec!((0, 18), (18, 19), (19, 20), (20, 21)), segments);
64///
65/// // access print position data after segmenting source.
66/// assert_eq!( &content[segments[1].0..segments[1].1], "a");
67///
68/// // Count print positions in content.
69/// assert_eq!( print_positions(content).count(), 4);
70/// ```
71#[derive(Clone)]
72pub struct PrintPositions<'a> {
73 // the victim string -- all outputs are slices of this.
74 string: &'a str,
75 // offset of beginning of slice currently being assembled or last returned.
76 cur_offset: usize,
77 // offset of the first unexamined char
78 next_offset: usize,
79 // wrapped grapheme (== extended grapheme cluster) iterator
80 gi_iterator: GraphemeIndices<'a>,
81}
82/// Factory method to create a new [PrintPositions] iterator
83///
84#[inline]
85pub fn print_positions<'a>(s: &'a str) -> PrintPositions<'a> {
86 let iter = UnicodeSegmentation::grapheme_indices(s, true);
87 PrintPositions {
88 string: s,
89 cur_offset: 0,
90 next_offset: 0,
91 gi_iterator: iter,
92 }
93}
94
95impl<'a> PrintPositions<'a> {
96 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
97 ///
98 /// ```rust
99 /// # use print_positions::print_positions;
100 /// let mut iter = print_positions("abc");
101 /// assert_eq!(iter.as_str(), "abc");
102 /// iter.next();
103 /// assert_eq!(iter.as_str(), "bc");
104 /// iter.next();
105 /// iter.next();
106 /// assert_eq!(iter.as_str(), "");
107 /// ```
108 #[inline]
109 pub fn as_str(&self) -> &'a str {
110 &self.string[self.cur_offset..self.string.len()]
111 }
112}
113
114impl<'a> Iterator for PrintPositions<'a> {
115 /// Iterator returns tuple of start offset and end + 1 offset
116 /// in source string of current print position.
117 type Item = (usize, usize);
118
119 fn next(&mut self) -> Option<Self::Item> {
120 if self.next_offset > self.string.len() {
121 return None;
122 };
123
124 enum EscapeState {
125 Normal,
126 EscapeSeen, // just saw an escape, start accumulating
127 CSISeen, // 2nd char not terminal, continue accumulating
128 OSCSeen, // operating system commmand, accumulate through ESC\.
129 OSCSeen1, // in OSC, saw ESC, look for \
130 }
131
132 let mut escape_state = EscapeState::Normal;
133
134 while self.next_offset < self.string.len() {
135 let grap = self.gi_iterator.next().expect("already checked not at EOS");
136 debug_assert_eq!(
137 grap.0, self.next_offset,
138 "offset of retrieved grap (left) not at start of rest of string (right)",
139 );
140 self.next_offset += grap.1.len();
141
142 let ascii_byte = grap.1.as_bytes()[0];
143
144 match escape_state {
145 EscapeState::Normal => {
146 if ascii_byte == 0x1b {
147 escape_state = EscapeState::EscapeSeen;
148 } else {
149 break; // terminate the grapheme
150 }
151 }
152
153 EscapeState::EscapeSeen => match ascii_byte {
154 b'[' => {
155 escape_state = EscapeState::CSISeen;
156 }
157 b']' => {
158 escape_state = EscapeState::OSCSeen;
159 }
160 0x40..=0x5F => {
161 // terminate escape, but continue accumulating rest of print position
162 escape_state = EscapeState::Normal;
163 }
164 _ => {
165 debug_assert!(
166 true, // don't actually fail fuzz testing, but document behavior for malformed escapes.
167 "unexpected char {ascii_byte} following ESC, terminating escape"
168 );
169 escape_state = EscapeState::Normal;
170 }
171 },
172
173 EscapeState::CSISeen => {
174 if (0x40..=0x7e).contains(&ascii_byte) {
175 // end of CSI, but continue accumulating
176 escape_state = EscapeState::Normal;
177 } else if (0x20..=0x3f).contains(&ascii_byte) { // accumulate CSI
178 } else {
179 debug_assert!(
180 true, // don't actually fail fuzz testing, but document behavior for malformed escapes.
181 "unexpected char {ascii_byte} in CSI sequence, terminating escape"
182 );
183 escape_state = EscapeState::Normal;
184 }
185 }
186
187 EscapeState::OSCSeen => {
188 if ascii_byte == 0x07 {
189 // spec says BEL terminates seq (on some emulators)
190 escape_state = EscapeState::Normal;
191 } else if ascii_byte == 0x1b {
192 escape_state = EscapeState::OSCSeen1;
193 } // anything else stays in OSC accumulation
194 }
195
196 EscapeState::OSCSeen1 => {
197 match ascii_byte {
198 0x5c => {
199 // backslash
200 escape_state = EscapeState::Normal;
201 }
202 0x1b => {
203 escape_state = EscapeState::OSCSeen1;
204 }
205 _ => {
206 escape_state = EscapeState::OSCSeen;
207 }
208 }
209 }
210 }
211 }
212
213 // before returning, peek ahead and see whether there's a reset escape sequence we can append.
214 // There are 3 ANSI reset sequences.
215 // if, perversely, there is more than one sequence following the grapheme, take them all.
216 // If, even more perversely, the last char of the esc sequence plus some following
217 // characters in the string happen to form a multi-character grapheme, take all of that.
218 // This means that the reset escape sequence is not always the end of the print position slice.
219
220 while self.next_offset < self.string.len()
221 && self.string.as_bytes()[self.next_offset] == 0x1b
222 {
223 if self.next_offset + 2 <= self.string.len()
224 && self.string[self.next_offset..].starts_with("\x1bc")
225 {
226 self.gi_iterator.next();
227 let last = self.gi_iterator.next().expect("must be >=2");
228 self.next_offset += 1 + last.1.len();
229 } else if self.next_offset + 3 <= self.string.len()
230 && self.string[self.next_offset..].starts_with("\x1b[m")
231 {
232 self.gi_iterator.next();
233 self.gi_iterator.next();
234 let last = self.gi_iterator.next().expect("must be >=3");
235 self.next_offset += 2 + last.1.len();
236 } else if self.next_offset + 4 <= self.string.len()
237 && self.string[self.next_offset..].starts_with("\x1b[0m")
238 {
239 self.gi_iterator.next();
240 self.gi_iterator.next();
241 self.gi_iterator.next();
242 let last = self.gi_iterator.next().expect("must be >=4");
243 self.next_offset += 3 + last.1.len();
244 } else {
245 break; // ESC then something else. Take it at the beginning of the next call.
246 }
247 }
248 // return everything between start and end offsets
249 if self.next_offset <= self.cur_offset {
250 return None;
251 } else {
252 let retval = (self.cur_offset, self.next_offset);
253 // advance start to one beyond end of what we're returning
254 self.cur_offset = self.next_offset;
255 return Some(retval);
256 }
257 }
258}
259
260
261/// This iterator returns "print position" data found in a string, as an immutable slice within the source string.
262///
263/// All the source bytes are passed through the iterator in order and without modification, except they are grouped or "segmented" into print position slices.
264///
265/// ```rust
266/// use print_positions::print_position_data;
267///
268/// let segs: Vec<_> = print_position_data("abc\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}").collect();
269/// assert_eq!(vec!("a","b","c",
270/// "\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}" // unicode family emoji -- 1 print position
271/// ), segs);
272///
273/// // Control chars and ANSI escapes returned within the print position slice.
274/// let content = "abc\u{1b}[37;46mdef\u{1b}[0mg";
275/// let segs: Vec<_> = print_position_data(content).collect();
276/// assert_eq!(vec!("a","b","c", "\u{1b}[37;46md","e","f\u{1b}[0m", "g"), segs);
277/// assert_eq!(content, segs.join(""), "all characters passed through iterator transparently");
278/// ```
279///
280/// Run `cargo run --example padding`
281/// for an example of fixed-width formatting based on counting print positions
282/// rather than characters in the data.
283///
284pub struct PrintPositionData<'a>(PrintPositions<'a>);
285
286#[inline]
287/// Factory method to provide a new [PrintPositionData] iterator.
288///
289pub fn print_position_data<'a>(s: &'a str) -> PrintPositionData<'a> {
290 PrintPositionData(print_positions(s))
291}
292
293impl<'a> PrintPositionData<'a> {
294 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
295 ///
296 /// ```rust
297 /// # use print_positions::print_position_data;
298 /// let mut iter = print_position_data("abc");
299 /// assert_eq!(iter.as_str(), "abc");
300 /// iter.next();
301 /// assert_eq!(iter.as_str(), "bc");
302 /// iter.next();
303 /// iter.next();
304 /// assert_eq!(iter.as_str(), "");
305 /// ```
306 #[inline]
307 pub fn as_str(&self) -> &'a str {
308 &self.0.string[self.0.cur_offset..self.0.string.len()]
309 }
310}
311
312impl<'a> Iterator for PrintPositionData<'a> {
313 type Item = &'a str;
314
315 #[inline]
316 fn next(&mut self) -> Option<Self::Item> {
317 if let Some((start, end)) = self.0.next() {
318 Some(&self.0.string[start..end])
319 } else {
320 None
321 }
322 }
323}
324