print_positions/
lib.rs

1//! The [print_positions] and [print_position_data] functions
2//! provide iterators which return "print positions".
3//!
4//! A print position is a generalization of the
5//! [UAX#29 extended grapheme cluster](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) to include rendering color and emphasis of the user-visible
6//! character using 
7//! [ANSI escape codes](https://en.wikipedia.org/wiki/ANSI_escape_code#Description).
8//! So a "print position" is an even longer multi-byte sequence that still represents a single user visible character on the screen.
9//!
10//! ## Example:
11//! ```rust
12//! use print_positions::{print_positions, print_position_data};
13//!
14//! // content is e with dieresis, displayed in green with a color reset at the end.  
15//! // Looks like 1 character on the screen.  See example "padding" to print one out.
16//! let content = &["\u{1b}[30;42m", "\u{0065}", "\u{0308}", "\u{1b}[0m"].join("");
17//!
18//! // access number of print positions without examining the content
19//! assert_eq!(print_positions(content).count(), 1);
20//! 
21//! let segmented:Vec<_> = print_position_data(content).collect();
//! assert_eq!(content.len(), 15);          // content is 15 bytes long
23//! assert_eq!(segmented.len(), 1);   // but only 1 print position
24//! 
25//! ```
26//! ## Rationale:
27//! In the good old days, a "character" was a simple entity.  It would always fit into one octet 
28//! (or perhaps only a [sestet](https://retrocomputing.stackexchange.com/questions/7937/last-computer-not-to-use-octets-8-bit-bytes)).
29//! You could access the i'th character in a string by accessing the i'th element of its array.  
30//! And you could process characters in any human language you wanted, as long as it was (transliterated into) English.
31//! 
32//! Modern applications must support multiple natural languages and some are rendered on an ANSI-compatible
33//! screen (or, less often, print device). So it's a given that what a user would consider a simple "character", visible as a single
34//! glyph on the screen, is represented in memory by multiple and variable numbers of bytes.
35//! 
36//! This crate provides a tool to make it once again easy to access the i'th "character" of a word on the screen
37//! by indexing to the i'th element of an array, but the array now consists of "print positions" rather than bytes or primitive type `char`s. 
38//! See iterator [PrintPositionData].
39//! 
40//! Sometimes you don't even need to access the character data itself, you just want to know how many visible
41//! columns it will consume on the screen, in order to align it with other text or within a fixed area on the screen.  See iterator [PrintPositions].
42//!
43
44#[cfg(test)]
45mod tests;
46
47use unicode_segmentation::{GraphemeIndices, UnicodeSegmentation};
48
/// This iterator identifies print positions in the source string and returns start and end offsets of 
/// the data rather than the data itself.
/// See [PrintPositionData] if you want to iterate through the data instead.
/// 
/// A print position is an immutable slice of the source string.  It contains 1 grapheme cluster (by definition)
/// and any ANSI escape codes found between graphemes in the source.  The ANSI escape codes will generally *precede*
/// the grapheme (since these codes change the rendering of characters that follow), but sometimes will *follow* the
/// grapheme (for the few codes that reset special graphic rendering).
/// 
/// ```rust
/// use print_positions::print_positions;
///
/// let content = "\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}abc";
/// let segments: Vec<(usize, usize)> = print_positions(content).collect();
/// assert_eq!(vec!((0, 18), (18, 19), (19, 20), (20, 21)), segments);
/// 
/// // access print position data after segmenting source.
/// assert_eq!( &content[segments[1].0..segments[1].1], "a"); 
/// 
/// // Count print positions in content.
/// assert_eq!( print_positions(content).count(), 4);
/// ```
#[derive(Clone)]
pub struct PrintPositions<'a> {
    // The source string being segmented -- every output refers to a slice of this.
    string: &'a str,
    // Byte offset where the print position currently being assembled begins.
    // After an item is returned this is advanced to the end of that item,
    // i.e. to the start of the not-yet-iterated remainder.
    cur_offset: usize,
    // Byte offset of the first byte not yet examined by the scanner.
    next_offset: usize,
    // Wrapped grapheme (== extended grapheme cluster) iterator over `string`;
    // it is advanced in lock-step with `next_offset`.
    gi_iterator: GraphemeIndices<'a>,
}
82/// Factory method to create a new [PrintPositions] iterator
83///
84#[inline]
85pub fn print_positions<'a>(s: &'a str) -> PrintPositions<'a> {
86    let iter = UnicodeSegmentation::grapheme_indices(s, true);
87    PrintPositions {
88        string: s,
89        cur_offset: 0,
90        next_offset: 0,
91        gi_iterator: iter,
92    }
93}
94
95impl<'a> PrintPositions<'a> {
96    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
97    ///
98    /// ```rust
99    /// # use print_positions::print_positions;
100    /// let mut iter = print_positions("abc");
101    /// assert_eq!(iter.as_str(), "abc");
102    /// iter.next();
103    /// assert_eq!(iter.as_str(), "bc");
104    /// iter.next();
105    /// iter.next();
106    /// assert_eq!(iter.as_str(), "");
107    /// ```
108    #[inline]
109    pub fn as_str(&self) -> &'a str {
110        &self.string[self.cur_offset..self.string.len()]
111    }
112}
113
114impl<'a> Iterator for PrintPositions<'a> {
115    /// Iterator returns tuple of start offset and end + 1 offset
116    /// in source string of current print position.
117    type Item = (usize, usize);
118
119    fn next(&mut self) -> Option<Self::Item> {
120        if self.next_offset > self.string.len() {
121            return None;
122        };
123
124        enum EscapeState {
125            Normal,
126            EscapeSeen, // just saw an escape, start accumulating
127            CSISeen,    // 2nd char not terminal, continue accumulating
128            OSCSeen,    // operating system commmand, accumulate through ESC\.
129            OSCSeen1,   // in OSC, saw ESC, look for \
130        }
131
132        let mut escape_state = EscapeState::Normal;
133
134        while self.next_offset < self.string.len() {
135            let grap = self.gi_iterator.next().expect("already checked not at EOS");
136            debug_assert_eq!(
137                grap.0, self.next_offset,
138                "offset of retrieved grap (left) not at start of rest of string (right)",
139            );
140            self.next_offset += grap.1.len();
141
142            let ascii_byte = grap.1.as_bytes()[0];
143
144            match escape_state {
145                EscapeState::Normal => {
146                    if ascii_byte == 0x1b {
147                        escape_state = EscapeState::EscapeSeen;
148                    } else {
149                        break; // terminate the grapheme
150                    }
151                }
152
153                EscapeState::EscapeSeen => match ascii_byte {
154                    b'[' => {
155                        escape_state = EscapeState::CSISeen;
156                    }
157                    b']' => {
158                        escape_state = EscapeState::OSCSeen;
159                    }
160                    0x40..=0x5F => {
161                        // terminate escape, but continue accumulating rest of print position
162                        escape_state = EscapeState::Normal;
163                    }
164                    _ => {
165                        debug_assert!(
166                            true, // don't actually fail fuzz testing, but document behavior for malformed escapes.
167                            "unexpected char {ascii_byte} following ESC, terminating escape"
168                        );
169                        escape_state = EscapeState::Normal;
170                    }
171                },
172
173                EscapeState::CSISeen => {
174                    if (0x40..=0x7e).contains(&ascii_byte) {
175                        // end of CSI, but continue accumulating
176                        escape_state = EscapeState::Normal;
177                    } else if (0x20..=0x3f).contains(&ascii_byte) { // accumulate CSI
178                    } else {
179                        debug_assert!(
180                            true, // don't actually fail fuzz testing, but document behavior for malformed escapes.
181                            "unexpected char {ascii_byte} in CSI sequence, terminating escape"
182                        );
183                        escape_state = EscapeState::Normal;
184                    }
185                }
186
187                EscapeState::OSCSeen => {
188                    if ascii_byte == 0x07 {
189                        // spec says BEL terminates seq (on some emulators)
190                        escape_state = EscapeState::Normal;
191                    } else if ascii_byte == 0x1b {
192                        escape_state = EscapeState::OSCSeen1;
193                    } // anything else stays in OSC accumulation
194                }
195
196                EscapeState::OSCSeen1 => {
197                    match ascii_byte {
198                        0x5c => {
199                            // backslash
200                            escape_state = EscapeState::Normal;
201                        }
202                        0x1b => {
203                            escape_state = EscapeState::OSCSeen1;
204                        }
205                        _ => {
206                            escape_state = EscapeState::OSCSeen;
207                        }
208                    }
209                }
210            }
211        }
212
213        // before returning, peek ahead and see whether there's a reset escape sequence we can append.
214        // There are 3 ANSI reset sequences.
215        // if, perversely, there is more than one sequence following the grapheme, take them all.
216        // If, even more perversely, the last char of the esc sequence plus some following
217        // characters in the string happen to form a multi-character grapheme, take all of that.
218        // This means that the reset escape sequence is not always the end of the print position slice.
219
220        while self.next_offset < self.string.len()
221            && self.string.as_bytes()[self.next_offset] == 0x1b
222        {
223            if self.next_offset + 2 <= self.string.len()
224                && self.string[self.next_offset..].starts_with("\x1bc")
225            {
226                self.gi_iterator.next();
227                let last = self.gi_iterator.next().expect("must be >=2");
228                self.next_offset += 1 + last.1.len();
229            } else if self.next_offset + 3 <= self.string.len()
230                && self.string[self.next_offset..].starts_with("\x1b[m")
231            {
232                self.gi_iterator.next();
233                self.gi_iterator.next();
234                let last = self.gi_iterator.next().expect("must be >=3");
235                self.next_offset += 2 + last.1.len();
236            } else if self.next_offset + 4 <= self.string.len()
237                && self.string[self.next_offset..].starts_with("\x1b[0m")
238            {
239                self.gi_iterator.next();
240                self.gi_iterator.next();
241                self.gi_iterator.next();
242                let last = self.gi_iterator.next().expect("must be >=4");
243                self.next_offset += 3 + last.1.len();
244            } else {
245                break; // ESC then something else.  Take it at the beginning of the next call.
246            }
247        }
248        // return everything between start and end offsets
249        if self.next_offset <= self.cur_offset {
250            return None;
251        } else {
252            let retval = (self.cur_offset, self.next_offset);
253            // advance start to one beyond end of what we're returning
254            self.cur_offset = self.next_offset;
255            return Some(retval);
256        }
257    }
258}
259
260
261/// This iterator returns "print position" data found in a string, as an immutable slice within the source string.  
262/// 
263/// All the source bytes are passed through the iterator in order and without modification, except they are grouped or "segmented" into print position slices.
264/// 
265/// ```rust
266/// use print_positions::print_position_data;
267///
268/// let segs: Vec<_> = print_position_data("abc\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}").collect();
269/// assert_eq!(vec!("a","b","c",
270///     "\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}"   // unicode family emoji -- 1 print position
271///     ), segs);
272///
273/// // Control chars and ANSI escapes returned within the print position slice.
274/// let content = "abc\u{1b}[37;46mdef\u{1b}[0mg";
275/// let segs: Vec<_> = print_position_data(content).collect();
276/// assert_eq!(vec!("a","b","c", "\u{1b}[37;46md","e","f\u{1b}[0m", "g"), segs);
277/// assert_eq!(content, segs.join(""), "all characters passed through iterator transparently");
278/// ```
279///
280/// Run `cargo run --example padding`
281/// for an example of fixed-width formatting based on counting print positions
282/// rather than characters in the data.
283///
284pub struct PrintPositionData<'a>(PrintPositions<'a>);
285
286#[inline]
287/// Factory method to provide a new [PrintPositionData] iterator.
288///
289pub fn print_position_data<'a>(s: &'a str) -> PrintPositionData<'a> {
290    PrintPositionData(print_positions(s))
291}
292
293impl<'a> PrintPositionData<'a> {
294    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
295    ///
296    /// ```rust
297    /// # use print_positions::print_position_data;
298    /// let mut iter = print_position_data("abc");
299    /// assert_eq!(iter.as_str(), "abc");
300    /// iter.next();
301    /// assert_eq!(iter.as_str(), "bc");
302    /// iter.next();
303    /// iter.next();
304    /// assert_eq!(iter.as_str(), "");
305    /// ```
306    #[inline]
307    pub fn as_str(&self) -> &'a str {
308        &self.0.string[self.0.cur_offset..self.0.string.len()]
309    }
310}
311
312impl<'a> Iterator for PrintPositionData<'a> {
313    type Item = &'a str;
314
315    #[inline]
316    fn next(&mut self) -> Option<Self::Item> {
317        if let Some((start, end)) = self.0.next() {
318            Some(&self.0.string[start..end])
319        } else {
320            None
321        }
322    }
323}
324