strcursor/
lib.rs

1/*
2Copyright ⓒ 2015-2017 Daniel Keep.
3
4Licensed under the MIT license (see LICENSE or <http://opensource.org
5/licenses/MIT>) or the Apache License, Version 2.0 (see LICENSE of
6<http://www.apache.org/licenses/LICENSE-2.0>), at your option. All
7files in the project carrying such notice may not be copied, modified,
8or distributed except according to those terms.
9*/
10/*!
11This crate provides a "cursor" type for string slices.  It provides the ability to safely seek back and forth through a string without worrying about producing invalid UTF-8 sequences, or splitting grapheme clusters.
12
13In addition, it provides types to represent single grapheme clusters ([`Gc`](struct.Gc.html) and [`GcBuf`](struct.GcBuf.html)) as distinct from arbitrary string slices.
14
15See the [`StrCursor`](struct.StrCursor.html) type for details.
16
17<style type="text/css">
18.link-block { font-family: "Fira Sans"; }
19.link-block > p { display: inline-block; }
20.link-block > p > strong { font-weight: 500; margin-right: 1em; }
21.link-block > ul { display: inline-block; padding: 0; list-style: none; }
22.link-block > ul > li {
23  font-size: 0.8em;
24  background-color: #eee;
25  border: 1px solid #ccc;
26  padding: 0.3em;
27  display: inline-block;
28}
29</style>
30<span></span><div class="link-block">
31
32**Links**
33
34* [Latest Release](https://crates.io/crates/strcursor/)
35* [Latest Docs](https://docs.rs/strcursor/%2A/strcursor/index.html)
36* [Repository](https://github.com/DanielKeep/strcursor)
37
38<span></span></div>
39
40## Compatibility
41
42`strcursor` is currently supported on `rustc` version 1.1.0 and higher.
43
44* `rustc` < 1.4 will use a larger, less space-efficient implementation of `GcBuf`; rather than being the same size as `Box<str>`, it will be the same size as `String`.
45
46* `rustc` < 1.1 is not supported, due to a mysterious compiler crash.
47
48*/
49extern crate unicode_segmentation as uniseg;
50
/**
Inserts a panic in debug builds, an optimisation hint in release builds.

"Debug build" means a build with `debug_assertions` enabled, which is the flag `rustc` and Cargo actually set.  Testing the legacy `ndebug` flag (which is never set by default) meant the optimisation-hint branch could never be selected.

**Do not replace this with the `debug_unreachable` crate.**  Recent versions of that crate do not build under Rust < 1.6, and old versions that used to no longer will, as they have sufficiently vague dependency version specifiers.
*/
#[doc(hidden)]
macro_rules! debug_unreachable {
    () => {
        if cfg!(debug_assertions) {
            // Debug builds: fail loudly so the broken invariant is noticed.
            panic!("entered unreachable code")
        } else {
            // Release builds: hand the invariant to the optimiser.
            ::util::unreachable()
        }
    };
}
66
67pub use grapheme::{Gc, GcBuf};
68
69pub mod grapheme;
70mod util;
71
72use uniseg::UnicodeSegmentation as UniSeg;
73
74/**
75This type represents a cursor into a string slice; that is, in addition to having a beginning and end, it also has a current position between those two.  This position can be seeked left and right within those bounds.
76
77> **Note**: the cursor may validly be positioned *at* the end of the string.  That is, in a position where there are no code points or grapheme clusters to the right of the cursor, and the entire contents of the string is to the left of the cursor.
78
79The main reason for this is that *sometimes*, you want the ability to do things like "advance a character", and the existing APIs for this can be somewhat verbose.
80
81In addition, *unstable* support for grapheme clusters is exposed by the standard library, which conflicts with the *stable* support provided by the `unicode-segmentation` crate, which makes doing "the right thing" painful.  `StrCursor` exposes grapheme clusters by default, and makes them cleaner to work with.
82
83The cursor guarantees the following at all times:
84
85* The cursor position *cannot* be outside of the original string slice it was constructed with.
86* The cursor position *cannot* lie between Unicode code points, meaning that you *cannot* generate an invalid string slice from a cursor.
87* If the code point-specific methods are *not* used, the cursor will always lie between grapheme clusters.
88
89This last point is somewhat important: the cursor is designed to favour operating on grapheme clusters, rather than code points.  If you misalign the cursor with respect to grapheme clusters, the behaviour of methods that deal with grapheme clusters is officially *undefined*, but is generally well-behaved.
90
91The methods that operate on the cursor will either return a fresh `Option<StrCursor>` (depending on whether the seek operation is valid or not), or mutate the existing cursor (in which case, they will *panic* if the seek operation is not valid).
92*/
pub struct StrCursor<'a> {
    // The complete string slice the cursor ranges over.
    s: &'a str,
    // Pointer to the cursor's current position; `byte_pos` reports it as an offset from `s.as_ptr()`.
    at: *const u8,
}
97
98impl<'a> StrCursor<'a> {
99    /**
100    Create a new cursor at the start of `s`.
101    */
102    #[inline]
103    pub fn new_at_start(s: &'a str) -> StrCursor<'a> {
104        StrCursor {
105            s: s,
106            at: s.as_ptr(),
107        }
108    }
109
110    /**
111    Create a new cursor past at the end of `s`.
112    */
113    #[inline]
114    pub fn new_at_end(s: &'a str) -> StrCursor<'a> {
115        StrCursor {
116            s: s,
117            at: byte_pos_to_ptr(s, s.len()),
118        }
119    }
120
121    /**
122    Create a new cursor at the first grapheme cluster which begins at or to the left of the given byte position.
123    */
124    #[inline]
125    pub fn new_at_left_of_byte_pos(s: &'a str, byte_pos: usize) -> StrCursor<'a> {
126        // Start at a codepoint.
127        let cur = StrCursor::new_at_cp_left_of_byte_pos(s, byte_pos);
128
129        // Seek back to the previous grapheme.
130        let prev = cur.at_prev();
131
132        let prev = match prev {
133            None => return cur, // We were already at the start.
134            Some(c) => c
135        };
136
137        // unwrap should be OK here.
138        if prev.byte_pos() + prev.after().unwrap().len() > byte_pos {
139            prev
140        } else {
141            cur
142        }
143    }
144
145    /**
146    Create a new cursor at the first grapheme cluster which begins at or to the right of the given byte position.
147    */
148    #[inline]
149    pub fn new_at_right_of_byte_pos(s: &'a str, byte_pos: usize) -> StrCursor<'a> {
150        // I don't know how robust the grapheme iteration rules are when trying to step forward from a (potentially) invalid position.  As such, I'm *instead* going to start from a known-good position.
151        let cur = StrCursor::new_at_left_of_byte_pos(s, byte_pos);
152        if cur.byte_pos() == byte_pos {
153            return cur;
154        }
155
156        // This unwrap shouldn't be able to fail.
157        cur.at_next().unwrap()
158    }
159
160    /**
161    Create a new cursor at the first code point which begins at or to the left of the given byte position.
162
163    # Note
164
165    Where possible, you should prefer `new_at_left_of_byte_pos`.
166    */
167    #[inline]
168    pub fn new_at_cp_left_of_byte_pos(s: &'a str, byte_pos: usize) -> StrCursor<'a> {
169        StrCursor {
170            s: s,
171            at: unsafe { seek_utf8_cp_start_left(s, byte_pos_to_ptr(s, byte_pos)) },
172        }
173    }
174
175    /**
176    Create a new cursor at the first code point which begins at or to the right of the given byte position.
177
178    # Note
179
180    Where possible, you should prefer `new_at_right_of_byte_pos`.
181    */
182    #[inline]
183    pub fn new_at_cp_right_of_byte_pos(s: &'a str, byte_pos: usize) -> StrCursor<'a> {
184        StrCursor {
185            s: s,
186            at: unsafe { seek_utf8_cp_start_right(s, byte_pos_to_ptr(s, byte_pos)) },
187        }
188    }
189
190    /**
191    Returns a new cursor at the beginning of the previous grapheme cluster, or `None` if the cursor is currently positioned at the beginning of the string.
192    */
193    #[inline]
194    pub fn at_prev(mut self) -> Option<StrCursor<'a>> {
195        match self.try_seek_left_gr() {
196            true => Some(self),
197            false => None
198        }
199    }
200
201    /**
202    Returns a new cursor at the beginning of the next grapheme cluster, or `None` if the cursor is currently positioned at the end of the string.
203    */
204    #[inline]
205    pub fn at_next(mut self) -> Option<StrCursor<'a>> {
206        match self.try_seek_right_gr() {
207            true => Some(self),
208            false => None
209        }
210    }
211
212    /**
213    Returns a new cursor at the beginning of the previous code point, or `None` if the cursor is currently positioned at the beginning of the string.
214
215    # Note
216
217    Where possible, you should prefer `at_prev`.
218    */
219    #[inline]
220    pub fn at_prev_cp(mut self) -> Option<StrCursor<'a>> {
221        match self.try_seek_left_cp() {
222            true => Some(self),
223            false => None
224        }
225    }
226
227    /**
228    Returns a new cursor at the beginning of the next code point, or `None` if the cursor is currently positioned at the end of the string.
229
230    # Note
231
232    Where possible, you should prefer `at_next`.
233    */
234    #[inline]
235    pub fn at_next_cp(mut self) -> Option<StrCursor<'a>> {
236        match self.try_seek_right_cp() {
237            true => Some(self),
238            false => None
239        }
240    }
241
242    /**
243    Seeks the cursor to the beginning of the previous grapheme cluster.
244
245    # Panics
246
247    If the cursor is currently at the start of the string, then this function will panic.
248    */
249    #[inline]
250    pub fn seek_prev(&mut self) {
251        if !self.try_seek_left_gr() {
252            panic!("cannot seek past the beginning of a string");
253        }
254    }
255
256    /**
257    Seeks the cursor to the beginning of the next grapheme cluster.
258
259    # Panics
260
261    If the cursor is currently at the end of the string, then this function will panic.
262    */
263    #[inline]
264    pub fn seek_next(&mut self) {
265        if !self.try_seek_right_gr() {
266            panic!("cannot seek past the end of a string");
267        }
268    }
269
270    /**
271    Seeks the cursor to the beginning of the previous code point.
272
273    # Panics
274
275    If the cursor is currently at the start of the string, then this function will panic.
276
277    # Note
278
279    Where possible, you should prefer `seek_prev`.
280    */
281    #[inline]
282    pub fn seek_prev_cp(&mut self) {
283        if !self.try_seek_left_cp() {
284            panic!("cannot seek past the beginning of a string");
285        }
286    }
287
288    /**
289    Seeks the cursor to the beginning of the next code point.
290
291    # Panics
292
293    If the cursor is currently at the end of the string, then this function will panic.
294
295    # Note
296
297    Where possible, you should prefer `seek_next`.
298    */
299    #[inline]
300    pub fn seek_next_cp(&mut self) {
301        if !self.try_seek_right_cp() {
302            panic!("cannot seek past the end of a string");
303        }
304    }
305
306    /**
307    Returns both the previous grapheme cluster and the cursor having seeked before it.
308
309    This may be more efficient than doing both operations individually.
310    */
311    #[inline]
312    pub fn prev(mut self) -> Option<(&'a Gc, StrCursor<'a>)> {
313        unsafe {
314            let g = match self.before() {
315                Some(g) => g,
316                None => return None,
317            };
318            self.unsafe_set_at(g.as_str());
319            Some((g, self))
320        }
321    }
322
323    /**
324    Returns both the previous code point and the cursor having seeked before it.
325
326    This may be more efficient than doing both operations individually.
327
328    # Note
329
330    Where possible, you should prefer `prev`.
331    */
332    #[inline]
333    pub fn prev_cp(mut self) -> Option<(char, StrCursor<'a>)> {
334        unsafe {
335            let cp = match self.cp_before() {
336                Some(cp) => cp,
337                None => return None,
338            };
339            self.unsafe_seek_left(cp.len_utf8());
340            Some((cp, self))
341        }
342    }
343
344    /**
345    Returns both the next grapheme cluster and the cursor having seeked past it.
346
347    This may be more efficient than doing both operations individually.
348    */
349    #[inline]
350    pub fn next(mut self) -> Option<(&'a Gc, StrCursor<'a>)> {
351        unsafe {
352            let g = match self.after() {
353                Some(g) => g,
354                None => return None,
355            };
356            self.unsafe_seek_right(g.len());
357            Some((g, self))
358        }
359    }
360
361    /**
362    Returns both the next code point and the cursor having seeked past it.
363
364    This may be more efficient than doing both operations individually.
365
366    # Note
367
368    Where possible, you should prefer `next`.
369    */
370    #[inline]
371    pub fn next_cp(mut self) -> Option<(char, StrCursor<'a>)> {
372        unsafe {
373            let cp = match self.cp_after() {
374                Some(cp) => cp,
375                None => return None,
376            };
377            self.unsafe_seek_right(cp.len_utf8());
378            Some((cp, self))
379        }
380    }
381
382    /**
383    Returns the grapheme cluster immediately to the left of the cursor, or `None` is the cursor is at the start of the string.
384    */
385    #[inline]
386    pub fn before(&self) -> Option<&'a Gc> {
387        self.at_prev().and_then(|cur| cur.after())
388    }
389
390    /**
391    Returns the grapheme cluster immediately to the right of the cursor, or `None` is the cursor is at the end of the string.
392    */
393    #[inline]
394    pub fn after(&self) -> Option<&'a Gc> {
395        Gc::split_from(self.slice_after()).map(|(gc, _)| gc)
396    }
397
398    /**
399    Returns the contents of the string to the left of the cursor.
400    */
401    #[inline]
402    pub fn slice_before(&self) -> &'a str {
403        unsafe {
404            self.s.slice_unchecked(0, self.byte_pos())
405        }
406    }
407
408    /**
409    Returns the contents of the string to the right of the cursor.
410    */
411    #[inline]
412    pub fn slice_after(&self) -> &'a str {
413        unsafe {
414            self.s.slice_unchecked(self.byte_pos(), self.s.len())
415        }
416    }
417
418    /**
419    Returns the contents of the string *between* this cursor and another cursor.
420
421    Returns `None` if the cursors are from different strings (even different subsets of the same string).
422    */
423    #[inline]
424    pub fn slice_between(&self, until: StrCursor<'a>) -> Option<&'a str> {
425        if !str_eq_literal(self.s, until.s) {
426            None
427        } else {
428            use std::cmp::{max, min};
429            unsafe {
430                let beg = min(self.at, until.at);
431                let end = max(self.at, until.at);
432                let len = end as usize - beg as usize;
433                let bytes = ::std::slice::from_raw_parts(beg, len);
434                Some(::std::str::from_utf8_unchecked(bytes))
435            }
436        }
437    }
438
439    /**
440    Returns the code point immediately to the left of the cursor, or `None` is the cursor is at the start of the string.
441    */
442    #[inline]
443    pub fn cp_before(&self) -> Option<char> {
444        self.at_prev_cp().and_then(|cur| cur.cp_after())
445    }
446
447    /**
448    Returns the code point immediately to the right of the cursor, or `None` is the cursor is at the end of the string.
449    */
450    #[inline]
451    pub fn cp_after(&self) -> Option<char> {
452        self.slice_after().chars().next()
453    }
454
455    /**
456    Returns the entire string slice behind the cursor.
457    */
458    #[inline]
459    pub fn slice_all(&self) -> &'a str {
460        self.s
461    }
462
463    /**
464    Returns the cursor's current position within the string as the number of UTF-8 code units from the beginning of the string.
465    */
466    #[inline]
467    pub fn byte_pos(&self) -> usize {
468        self.at as usize - self.s.as_ptr() as usize
469    }
470
471    #[inline]
472    fn try_seek_left_cp(&mut self) -> bool {
473        unsafe {
474            // We just have to ensure that offsetting the `at` pointer *at all* is safe.
475            if self.byte_pos() == 0 {
476                return false;
477            }
478            self.at = seek_utf8_cp_start_left(self.s, self.at.offset(-1));
479            true
480        }
481    }
482
483    #[inline]
484    fn try_seek_right_cp(&mut self) -> bool {
485        unsafe {
486            // We just have to ensure that offsetting the `at` pointer *at all* is safe.
487            if self.byte_pos() == self.s.len() {
488                return false;
489            }
490            self.at = seek_utf8_cp_start_right(self.s, self.at.offset(1));
491            true
492        }
493    }
494
495    #[inline]
496    fn try_seek_left_gr(&mut self) -> bool {
497        let len = {
498            let gr = UniSeg::graphemes(self.slice_before(), /*is_extended:*/true).next_back();
499            gr.map(|gr| gr.len())
500        };
501        match len {
502            Some(len) => {
503                unsafe {
504                    self.at = self.at.offset(-(len as isize));
505                }
506                true
507            },
508            None => false
509        }
510    }
511
512    #[inline]
513    fn try_seek_right_gr(&mut self) -> bool {
514        let len = {
515            let gr = UniSeg::graphemes(self.slice_after(), /*is_extended:*/true).next();
516            gr.map(|gr| gr.len())
517        };
518        match len {
519            Some(len) => {
520                unsafe {
521                    self.at = self.at.offset(len as isize);
522                }
523                true
524            },
525            None => false
526        }
527    }
528
529    /**
530    Seeks exactly `bytes` left, without performing any bounds or validity checks.
531    */
532    #[inline]
533    pub unsafe fn unsafe_seek_left(&mut self, bytes: usize) {
534        self.at = self.at.offset(-(bytes as isize));
535    }
536
537    /**
538    Seeks exactly `bytes` right, without performing any bounds or validity checks.
539    */
540    #[inline]
541    pub unsafe fn unsafe_seek_right(&mut self, bytes: usize) {
542        self.at = self.at.offset(bytes as isize);
543    }
544
545    /**
546    Seeks to the start of `s`, without performing any bounds or validity checks.
547    */
548    #[inline]
549    pub unsafe fn unsafe_set_at(&mut self, s: &'a str) {
550        self.at = s.as_bytes().as_ptr();
551    }
552}
553
// A cursor holds only a string slice reference and a raw pointer, so it is freely copyable.
impl<'a> Copy for StrCursor<'a> {}
555
556impl<'a> Clone for StrCursor<'a> {
557    fn clone(&self) -> StrCursor<'a> {
558        *self
559    }
560}
561
562impl<'a> std::fmt::Debug for StrCursor<'a> {
563	fn fmt(&self, fmt: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
564        write!(fmt, "StrCursor({:?} | {:?})", self.slice_before(), self.slice_after())
565    }
566}
567
// Marker impl: cursor equality compares only pointers and lengths.
impl<'a> Eq for StrCursor<'a> {}
569
570impl<'a> PartialEq for StrCursor<'a> {
571    fn eq(&self, other: &StrCursor<'a>) -> bool {
572        (self.at == other.at)
573        && (self.s.as_ptr() == other.s.as_ptr())
574        && (self.s.len() == other.s.len())
575    }
576
577    fn ne(&self, other: &StrCursor<'a>) -> bool {
578        (self.at != other.at)
579        || (self.s.as_ptr() != other.s.as_ptr())
580        || (self.s.len() != other.s.len())
581    }
582}
583
584impl<'a> PartialOrd for StrCursor<'a> {
585    fn partial_cmp(&self, other: &StrCursor<'a>) -> Option<std::cmp::Ordering> {
586        // If the cursors are from different strings, they are unordered.
587        if (self.s.as_ptr() != other.s.as_ptr()) || (self.s.len() != other.s.len()) {
588            None
589        } else {
590            self.at.partial_cmp(&other.at)
591        }
592    }
593}
594
595impl<'a> std::hash::Hash for StrCursor<'a> {
596    fn hash<H>(&self, state: &mut H)
597    where H: std::hash::Hasher {
598        self.s.as_ptr().hash(state);
599        self.s.len().hash(state);
600        self.at.hash(state);
601    }
602}
603
// A cursor created at the start has nothing to its left and the whole string to its right.
#[cfg(test)]
#[test]
fn test_new_at_start() {
    let at_start = StrCursor::new_at_start("abcdef");
    assert_eq!(
        (at_start.slice_before(), at_start.slice_after()),
        ("", "abcdef")
    );
}
611
// A cursor created at the end has the whole string to its left and nothing to its right.
#[cfg(test)]
#[test]
fn test_new_at_end() {
    let at_end = StrCursor::new_at_end("abcdef");
    assert_eq!(
        (at_end.slice_before(), at_end.slice_after()),
        ("abcdef", "")
    );
}
619
// Byte 11 falls inside a multi-byte code point; the cursor must snap left to that code point's start.
#[cfg(test)]
#[test]
fn test_new_at_cp_left_of_byte_pos() {
    let text = "This is a 本当 test.";
    let snapped = StrCursor::new_at_cp_left_of_byte_pos(text, 11);
    assert_eq!(
        (snapped.slice_before(), snapped.slice_after()),
        ("This is a ", "本当 test.")
    );
}
628
// Byte 11 falls inside a multi-byte code point; the cursor must snap right to the next code point's start.
#[cfg(test)]
#[test]
fn test_new_at_cp_right_of_byte_pos() {
    let text = "This is a 本当 test.";
    let snapped = StrCursor::new_at_cp_right_of_byte_pos(text, 11);
    assert_eq!(
        (snapped.slice_before(), snapped.slice_after()),
        ("This is a 本", "当 test.")
    );
}
637
// For every byte offset (including one-past-the-end), records where the left-snapping constructor lands and which cluster follows.
#[cfg(test)]
#[test]
fn test_new_at_left_of_byte_pos() {
    let s = "Jäger,Jäger,大嫌い,💪❤!";
    let observed: Vec<_> = (0..s.len() + 1)
        .map(|pos| {
            let cur = StrCursor::new_at_left_of_byte_pos(s, pos);
            (pos, cur.byte_pos(), cur.after().map(Gc::as_str))
        })
        .collect();
    assert_eq!(observed, vec![
        (0, 0, Some("J")),
        (1, 1, Some("ä")),
        (2, 1, Some("ä")),
        (3, 3, Some("g")),
        (4, 4, Some("e")),
        (5, 5, Some("r")),
        (6, 6, Some(",")),
        (7, 7, Some("J")),
        (8, 8, Some("ä")),
        (9, 8, Some("ä")),
        (10, 8, Some("ä")),
        (11, 11, Some("g")),
        (12, 12, Some("e")),
        (13, 13, Some("r")),
        (14, 14, Some(",")),
        (15, 15, Some("大")),
        (16, 15, Some("大")),
        (17, 15, Some("大")),
        (18, 18, Some("嫌")),
        (19, 18, Some("嫌")),
        (20, 18, Some("嫌")),
        (21, 21, Some("い")),
        (22, 21, Some("い")),
        (23, 21, Some("い")),
        (24, 24, Some(",")),
        (25, 25, Some("💪")),
        (26, 25, Some("💪")),
        (27, 25, Some("💪")),
        (28, 25, Some("💪")),
        (29, 29, Some("❤")),
        (30, 29, Some("❤")),
        (31, 29, Some("❤")),
        (32, 32, Some("!")),
        (33, 33, None),
    ]);
}
682
// For every byte offset (including one-past-the-end), records where the right-snapping constructor lands and which cluster follows.
#[cfg(test)]
#[test]
fn test_new_at_right_of_byte_pos() {
    let s = "Jäger,Jäger,大嫌い,💪❤!";
    let observed: Vec<_> = (0..s.len() + 1)
        .map(|pos| {
            let cur = StrCursor::new_at_right_of_byte_pos(s, pos);
            (pos, cur.byte_pos(), cur.after().map(Gc::as_str))
        })
        .collect();
    assert_eq!(observed, vec![
        (0, 0, Some("J")),
        (1, 1, Some("ä")),
        (2, 3, Some("g")),
        (3, 3, Some("g")),
        (4, 4, Some("e")),
        (5, 5, Some("r")),
        (6, 6, Some(",")),
        (7, 7, Some("J")),
        (8, 8, Some("ä")),
        (9, 11, Some("g")),
        (10, 11, Some("g")),
        (11, 11, Some("g")),
        (12, 12, Some("e")),
        (13, 13, Some("r")),
        (14, 14, Some(",")),
        (15, 15, Some("大")),
        (16, 18, Some("嫌")),
        (17, 18, Some("嫌")),
        (18, 18, Some("嫌")),
        (19, 21, Some("い")),
        (20, 21, Some("い")),
        (21, 21, Some("い")),
        (22, 24, Some(",")),
        (23, 24, Some(",")),
        (24, 24, Some(",")),
        (25, 25, Some("💪")),
        (26, 29, Some("❤")),
        (27, 29, Some("❤")),
        (28, 29, Some("❤")),
        (29, 29, Some("❤")),
        (30, 32, Some("!")),
        (31, 32, Some("!")),
        (32, 32, Some("!")),
        (33, 33, None),
    ]);
}
727
// Walks left one code point at a time from the end and checks the byte positions visited.
#[cfg(test)]
#[test]
fn test_at_prev_cp() {
    let text = "大嫌い,💪❤";
    let start = StrCursor::new_at_end(text);
    let positions: Vec<_> = test_util::finite_iterate(start, StrCursor::at_prev_cp)
        .map(|c| c.byte_pos())
        .collect();
    assert_eq!(positions, vec![14, 10, 9, 6, 3, 0]);
}
738
// Walks right one code point at a time from the start and checks the byte positions visited.
#[cfg(test)]
#[test]
fn test_at_next_cp() {
    let text = "大嫌い,💪❤";
    let start = StrCursor::new_at_start(text);
    let positions: Vec<_> = test_util::finite_iterate(start, StrCursor::at_next_cp)
        .map(|c| c.byte_pos())
        .collect();
    assert_eq!(positions, vec![3, 6, 9, 10, 14, 17]);
}
749
// Walks left by grapheme cluster from the end; the combining sequence must be treated as one cluster.
#[cfg(test)]
#[test]
fn test_at_prev_and_before() {
    let text = "noe\u{0308}l";
    let start = StrCursor::new_at_end(text);
    let visited: Vec<_> = test_util::finite_iterate_lead(start, StrCursor::at_prev)
        .map(|c| (c.byte_pos(), c.after().map(Gc::as_str)))
        .collect();
    assert_eq!(visited, vec![
        (6, None),
        (5, Some("l")),
        (2, Some("e\u{0308}")),
        (1, Some("o")),
        (0, Some("n")),
    ]);
}
766
// Walks right by grapheme cluster from the start; the combining sequence must be treated as one cluster.
#[cfg(test)]
#[test]
fn test_at_next_and_after() {
    let text = "noe\u{0308}l";
    let start = StrCursor::new_at_start(text);
    let visited: Vec<_> = test_util::finite_iterate_lead(start, StrCursor::at_next)
        .map(|c| (c.byte_pos(), c.after().map(Gc::as_str)))
        .collect();
    assert_eq!(visited, vec![
        (0, Some("n")),
        (1, Some("o")),
        (2, Some("e\u{0308}")),
        (5, Some("l")),
        (6, None),
    ]);
}
783
// At every cluster boundary (walking left from the end), `prev` must yield the cluster to the left and the position before it.
#[cfg(test)]
#[test]
fn test_prev() {
    let s = "Jäger,Jäger,大嫌い,💪❤!";
    let start = StrCursor::new_at_end(s);
    let observed: Vec<_> = test_util::finite_iterate_lead(start, StrCursor::at_prev)
        .map(|c| c.prev().map(|(gr, moved)| (gr.as_str(), moved.byte_pos())))
        .collect();
    assert_eq!(observed, vec![
        Some(("!", 32)),
        Some(("❤", 29)),
        Some(("💪", 25)),
        Some((",", 24)),
        Some(("い", 21)),
        Some(("嫌", 18)),
        Some(("大", 15)),
        Some((",", 14)),
        Some(("r", 13)),
        Some(("e", 12)),
        Some(("g", 11)),
        Some(("ä", 8)),
        Some(("J", 7)),
        Some((",", 6)),
        Some(("r", 5)),
        Some(("e", 4)),
        Some(("g", 3)),
        Some(("ä", 1)),
        Some(("J", 0)),
        None,
    ]);
}
815
// At every code point boundary (walking left from the end), `prev_cp` must yield the code point to the left and the position before it.
#[cfg(test)]
#[test]
fn test_prev_cp() {
    let s = "Jäger,Jäger,大嫌い,💪❤!";
    let start = StrCursor::new_at_end(s);
    let observed: Vec<_> = test_util::finite_iterate_lead(start, StrCursor::at_prev_cp)
        .map(|c| c.prev_cp().map(|(cp, moved)| (cp, moved.byte_pos())))
        .collect();
    assert_eq!(observed, vec![
        Some(('!', 32)),
        Some(('❤', 29)),
        Some(('💪', 25)),
        Some((',', 24)),
        Some(('い', 21)),
        Some(('嫌', 18)),
        Some(('大', 15)),
        Some((',', 14)),
        Some(('r', 13)),
        Some(('e', 12)),
        Some(('g', 11)),
        Some(('̈', 9)),
        Some(('a', 8)),
        Some(('J', 7)),
        Some((',', 6)),
        Some(('r', 5)),
        Some(('e', 4)),
        Some(('g', 3)),
        Some(('ä', 1)),
        Some(('J', 0)),
        None,
    ]);
}
848
// At every cluster boundary (walking right from the start), `next` must yield the cluster to the right and the position after it.
#[cfg(test)]
#[test]
fn test_next() {
    let s = "Jäger,Jäger,大嫌い,💪❤!";
    let start = StrCursor::new_at_start(s);
    let observed: Vec<_> = test_util::finite_iterate_lead(start, StrCursor::at_next)
        .map(|c| c.next().map(|(gr, moved)| (gr.as_str(), moved.byte_pos())))
        .collect();
    assert_eq!(observed, vec![
        Some(("J", 1)),
        Some(("ä", 3)),
        Some(("g", 4)),
        Some(("e", 5)),
        Some(("r", 6)),
        Some((",", 7)),
        Some(("J", 8)),
        Some(("ä", 11)),
        Some(("g", 12)),
        Some(("e", 13)),
        Some(("r", 14)),
        Some((",", 15)),
        Some(("大", 18)),
        Some(("嫌", 21)),
        Some(("い", 24)),
        Some((",", 25)),
        Some(("💪", 29)),
        Some(("❤", 32)),
        Some(("!", 33)),
        None,
    ]);
}
880
// At every code point boundary (walking right from the start), `next_cp` must yield the code point to the right and the position after it.
#[cfg(test)]
#[test]
fn test_next_cp() {
    let s = "Jäger,Jäger,大嫌い,💪❤!";
    let start = StrCursor::new_at_start(s);
    let observed: Vec<_> = test_util::finite_iterate_lead(start, StrCursor::at_next_cp)
        .map(|c| c.next_cp().map(|(cp, moved)| (cp, moved.byte_pos())))
        .collect();
    assert_eq!(observed, vec![
        Some(('J', 1)),
        Some(('ä', 3)),
        Some(('g', 4)),
        Some(('e', 5)),
        Some(('r', 6)),
        Some((',', 7)),
        Some(('J', 8)),
        Some(('a', 9)),
        Some(('̈', 11)),
        Some(('g', 12)),
        Some(('e', 13)),
        Some(('r', 14)),
        Some((',', 15)),
        Some(('大', 18)),
        Some(('嫌', 21)),
        Some(('い', 24)),
        Some((',', 25)),
        Some(('💪', 29)),
        Some(('❤', 32)),
        Some(('!', 33)),
        None,
    ]);
}
913
// Seeks left one grapheme cluster at a time from the end, recording the cluster to the right of the cursor and the byte position after each step.
#[cfg(test)]
#[test]
fn test_seek_prev() {
    let s = "Jäger,Jäger,大嫌い,💪❤!";
    let mut cur = StrCursor::new_at_end(s);
    let mut r = vec![];
    // The string contains 19 grapheme clusters; no debug output is printed, matching the other seek tests.
    for _ in 0..19 {
        cur.seek_prev();
        r.push((cur.after().unwrap().as_str(), cur.byte_pos()));
    }
    assert_eq!(r, vec![
        ("!", 32),
        ("❤", 29),
        ("💪", 25),
        (",", 24),
        ("い", 21),
        ("嫌", 18),
        ("大", 15),
        (",", 14),
        ("r", 13),
        ("e", 12),
        ("g", 11),
        ("ä", 8),
        ("J", 7),
        (",", 6),
        ("r", 5),
        ("e", 4),
        ("g", 3),
        ("ä", 1),
        ("J", 0),
    ]);
}
948
// Seeking left by grapheme cluster from the start of the string must panic.
#[cfg(test)]
#[test]
#[should_panic]
fn test_seek_prev_panic() {
    let text = "Jäger,Jäger,大嫌い,💪❤!";
    let mut at_start = StrCursor::new_at_start(text);
    at_start.seek_prev();
}
957
// Seeks left one code point at a time from the end, recording the code point to the right of the cursor and the byte position after each step.
#[cfg(test)]
#[test]
fn test_seek_prev_cp() {
    let s = "Jäger,Jäger,大嫌い,💪❤!";
    let mut cur = StrCursor::new_at_end(s);
    let visited: Vec<_> = (0..20)
        .map(|_| {
            cur.seek_prev_cp();
            (cur.cp_after().unwrap(), cur.byte_pos())
        })
        .collect();
    assert_eq!(visited, vec![
        ('!', 32),
        ('❤', 29),
        ('💪', 25),
        (',', 24),
        ('い', 21),
        ('嫌', 18),
        ('大', 15),
        (',', 14),
        ('r', 13),
        ('e', 12),
        ('g', 11),
        ('̈', 9),
        ('a', 8),
        ('J', 7),
        (',', 6),
        ('r', 5),
        ('e', 4),
        ('g', 3),
        ('ä', 1),
        ('J', 0),
    ]);
}
991
// Seeking left by code point from the start of the string must panic.
#[cfg(test)]
#[test]
#[should_panic]
fn test_seek_prev_cp_panic() {
    let text = "Jäger,Jäger,大嫌い,💪❤!";
    let mut at_start = StrCursor::new_at_start(text);
    at_start.seek_prev_cp();
}
1000
// Seeks right one grapheme cluster at a time from the start, recording the cluster to the left of the cursor and the byte position after each step.
#[cfg(test)]
#[test]
fn test_seek_next() {
    let s = "Jäger,Jäger,大嫌い,💪❤!";
    let mut cur = StrCursor::new_at_start(s);
    let visited: Vec<_> = (0..19)
        .map(|_| {
            cur.seek_next();
            (cur.before().unwrap().as_str(), cur.byte_pos())
        })
        .collect();
    assert_eq!(visited, vec![
        ("J", 1),
        ("ä", 3),
        ("g", 4),
        ("e", 5),
        ("r", 6),
        (",", 7),
        ("J", 8),
        ("ä", 11),
        ("g", 12),
        ("e", 13),
        ("r", 14),
        (",", 15),
        ("大", 18),
        ("嫌", 21),
        ("い", 24),
        (",", 25),
        ("💪", 29),
        ("❤", 32),
        ("!", 33),
    ]);
}
1033
#[cfg(test)]
#[test]
#[should_panic]
fn test_seek_next_panic() {
    // The cursor starts at the very end of the string, so the forwards seek
    // below is the call that `#[should_panic]` expects to panic.
    // "\u{e4}" is the precomposed "ä"; "a\u{308}" is "a" plus a combining diaeresis.
    let text = "J\u{e4}ger,Ja\u{308}ger,大嫌い,💪❤!";
    StrCursor::new_at_end(text).seek_next();
}
1042
#[cfg(test)]
#[test]
fn test_seek_next_cp() {
    // "\u{e4}" is the precomposed "ä" (2 bytes); "a\u{308}" is "a" followed by a
    // combining diaeresis (1 + 2 bytes). Walking by code point must visit the
    // two halves of the decomposed form separately.
    let text = "J\u{e4}ger,Ja\u{308}ger,大嫌い,💪❤!";
    let mut cur = StrCursor::new_at_start(text);

    // Step right 20 times, recording the code point that now precedes the
    // cursor together with the cursor's byte position.
    let seen: Vec<_> = (0..20)
        .map(|_| {
            cur.seek_next_cp();
            (cur.cp_before().unwrap(), cur.byte_pos())
        })
        .collect();

    let expected = vec![
        ('J', 1),
        ('\u{e4}', 3),
        ('g', 4),
        ('e', 5),
        ('r', 6),
        (',', 7),
        ('J', 8),
        ('a', 9),
        ('\u{308}', 11),
        ('g', 12),
        ('e', 13),
        ('r', 14),
        (',', 15),
        ('大', 18),
        ('嫌', 21),
        ('い', 24),
        (',', 25),
        ('💪', 29),
        ('❤', 32),
        ('!', 33),
    ];
    assert_eq!(seen, expected);
}
1076
#[cfg(test)]
#[test]
#[should_panic]
fn test_seek_next_cp_panic() {
    // The cursor starts at the very end of the string, so the forwards code
    // point seek below is the call that `#[should_panic]` expects to panic.
    // "\u{e4}" is the precomposed "ä"; "a\u{308}" is "a" plus a combining diaeresis.
    let text = "J\u{e4}ger,Ja\u{308}ger,大嫌い,💪❤!";
    StrCursor::new_at_end(text).seek_next_cp();
}
1085
#[cfg(test)]
#[test]
fn test_char_before_and_after() {
    let start = StrCursor::new_at_start("大嫌い,💪❤");

    // Yields the starting cursor itself, then each cursor produced by
    // `at_next_cp` until it returns `None`.
    let cursors = test_util::finite_iterate_lead(start, StrCursor::at_next_cp);

    // For every cursor position, capture the code points on either side of it.
    let observed: Vec<_> = cursors
        .map(|c| (c.byte_pos(), c.cp_before(), c.cp_after()))
        .collect();

    let expected = vec![
        (0, None, Some('大')),
        (3, Some('大'), Some('嫌')),
        (6, Some('嫌'), Some('い')),
        (9, Some('い'), Some(',')),
        (10, Some(','), Some('💪')),
        (14, Some('💪'), Some('❤')),
        (17, Some('❤'), None),
    ];
    assert_eq!(observed, expected);
}
1104
#[cfg(test)]
#[test]
fn test_slice_between() {
    let text = "they hit, fight, kick, wreak havoc, and rejoice";
    let at_start = StrCursor::new_at_start(text);
    let at_end = StrCursor::new_at_end(text);
    // A cursor over a completely different string.
    let unrelated = StrCursor::new_at_end("nobody knows what they're lookin' for");
    // A cursor over a sub-slice of `text` that starts one byte in.
    let shifted = StrCursor::new_at_end(&text[1..]);

    // Two cursors over the same string give the text between them, in either order.
    assert_eq!(at_start.slice_between(at_end), Some(text));
    assert_eq!(at_end.slice_between(at_start), Some(text));

    // Cursors over a different string, or over a different slice of the same
    // string, give nothing.
    assert_eq!(at_start.slice_between(unrelated), None);
    assert_eq!(at_start.slice_between(shifted), None);
}
1118
/**
Converts a byte offset into `s` to a raw pointer into the string's buffer.

`byte_pos` may be anywhere from `0` to `s.len()` inclusive; the offset is *not* required to lie on a code point boundary.

# Panics

Panics if `byte_pos` is greater than `s.len()`.
*/
#[inline]
fn byte_pos_to_ptr(s: &str, byte_pos: usize) -> *const u8 {
    assert!(byte_pos <= s.len(),
        "byte position out of bounds: the len is {} but the position is {}",
        s.len(), byte_pos);
    // The assertion above guarantees the result is inside the string's buffer
    // or exactly one byte past its end, which is what `offset` requires.
    let start = s.as_ptr();
    unsafe { start.offset(byte_pos as isize) }
}
1127
/**
Moves `from` left until it points at the first byte of a UTF-8 code point in `s`.

If `from` already points at the start of a code point, at the start of `s`, or at the end of `s` (one byte past the last byte), it is returned unchanged.  The end of a string is always a valid position, and the byte stored there does not belong to `s`, so it is never read.

# Safety

`from` must point into the buffer of `s`, or exactly one byte past its end.
*/
#[inline]
unsafe fn seek_utf8_cp_start_left(s: &str, mut from: *const u8) -> *const u8 {
    let beg = s.as_ptr();
    let end = beg.offset(s.len() as isize);
    // `from < end` must be checked before the dereference: without it, a
    // pointer at the end of a non-empty string would read a byte outside `s`
    // and, if that byte happened to look like a continuation byte, would be
    // moved left by mistake.
    while from > beg && from < end && (*from & 0b11_00_0000 == 0b10_00_0000) {
        from = from.offset(-1);
    }
    from
}
1136
#[cfg(test)]
#[test]
fn test_seek_utf8_cp_start_left() {
    let s = "カブム!";
    // Every pointer is derived from the string's own base pointer.  A raw
    // pointer made from a reference to a single byte (`&b[i]`) is only valid
    // for that one byte under Rust's aliasing model (as checked by Miri), yet
    // the function under test walks across neighbouring bytes.
    let base = s.as_ptr();
    let at = |i: isize| unsafe { base.offset(i) };
    assert_eq!(unsafe { seek_utf8_cp_start_left(s, at(0)) }, at(0));
    assert_eq!(unsafe { seek_utf8_cp_start_left(s, at(1)) }, at(0));
    assert_eq!(unsafe { seek_utf8_cp_start_left(s, at(2)) }, at(0));
    assert_eq!(unsafe { seek_utf8_cp_start_left(s, at(3)) }, at(3));
    assert_eq!(unsafe { seek_utf8_cp_start_left(s, at(4)) }, at(3));
    assert_eq!(unsafe { seek_utf8_cp_start_left(s, at(5)) }, at(3));
}
1149
/**
Moves `from` right until it points at the first byte of a UTF-8 code point in `s`, or at the end of `s`.

If `from` already points at the start of a code point, or is at (or beyond) the end of `s`, it is returned unchanged.

# Safety

`from` must point into the buffer of `s`, or exactly one byte past its end.
*/
#[inline]
unsafe fn seek_utf8_cp_start_right(s: &str, mut from: *const u8) -> *const u8 {
    let end = s.as_ptr().offset(s.len() as isize);
    loop {
        // Never read at or past the end of the string.
        if from >= end {
            return from;
        }
        // Continuation bytes look like `10xxxxxx`; anything else begins a code point.
        if *from & 0b11_00_0000 != 0b10_00_0000 {
            return from;
        }
        from = from.offset(1);
    }
}
1158
#[cfg(test)]
#[test]
fn test_seek_utf8_cp_start_right() {
    let s = "カブム!";
    // Every pointer is derived from the string's own base pointer.  A raw
    // pointer made from a reference to a single byte (`&b[i]`) is only valid
    // for that one byte under Rust's aliasing model (as checked by Miri), yet
    // the function under test walks across neighbouring bytes.
    let base = s.as_ptr();
    let at = |i: isize| unsafe { base.offset(i) };
    assert_eq!(unsafe { seek_utf8_cp_start_right(s, at(0)) }, at(0));
    assert_eq!(unsafe { seek_utf8_cp_start_right(s, at(1)) }, at(3));
    assert_eq!(unsafe { seek_utf8_cp_start_right(s, at(2)) }, at(3));
    assert_eq!(unsafe { seek_utf8_cp_start_right(s, at(3)) }, at(3));
    assert_eq!(unsafe { seek_utf8_cp_start_right(s, at(4)) }, at(6));
    assert_eq!(unsafe { seek_utf8_cp_start_right(s, at(5)) }, at(6));
}
1171
/**
Tests whether `a` and `b` are literally the same slice: they start at the same address and have the same length.

The contents are never compared, so two separate strings holding equal text are *not* considered equal by this function.
*/
#[inline]
fn str_eq_literal(a: &str, b: &str) -> bool {
    let same_start = a.as_ptr() == b.as_ptr();
    let same_len = a.len() == b.len();
    same_start && same_len
}
1177
#[cfg(test)]
#[test]
fn test_str_eq_literal() {
    let text = "hare hare yukai";
    let first_word = &text[0..4];
    let second_word = &text[5..9];
    let shorter = &text[0..3];

    // Same start and same length: equal.
    assert!(str_eq_literal(text, text));
    assert!(str_eq_literal(first_word, &text[0..4]));
    // Same contents ("hare") at a different address: not equal.
    assert!(!str_eq_literal(first_word, second_word));
    // Same start but a different length: not equal.
    assert!(!str_eq_literal(first_word, shorter));
}
1187
#[cfg(test)]
mod test_util {
    /**
    Iterator that repeatedly feeds the previous value to a step function, yielding each result until the step function returns `None`.

    The seed itself is *not* yielded.
    */
    pub struct FiniteIter<T, F>(Option<T>, F);

    impl<T: Clone, F: FnMut(T) -> Option<T>> Iterator for FiniteIter<T, F> {
        type Item = T;

        fn next(&mut self) -> Option<T> {
            // Once the step function has returned `None` there is no previous
            // value left, and the iterator stays exhausted.
            let prev = match self.0.take() {
                Some(value) => value,
                None => return None,
            };
            self.0 = (self.1)(prev);
            self.0.clone()
        }
    }

    /**
    Creates an iterator yielding `f(seed)`, `f(f(seed))`, ... until `f` returns `None`.
    */
    pub fn finite_iterate<T, F>(seed: T, f: F) -> FiniteIter<T, F>
    where
        F: FnMut(T) -> Option<T>,
        T: Clone,
    {
        FiniteIter(Some(seed), f)
    }

    /**
    Like `FiniteIter`, except that the seed is yielded first.

    The `bool` records whether the seed has already been handed out.
    */
    pub struct FiniteIterLead<T, F>(Option<T>, F, bool);

    impl<T: Clone, F: FnMut(T) -> Option<T>> Iterator for FiniteIterLead<T, F> {
        type Item = T;

        fn next(&mut self) -> Option<T> {
            if self.2 {
                // The seed has been yielded; advance from the previous value.
                let prev = match self.0.take() {
                    Some(value) => value,
                    None => return None,
                };
                self.0 = (self.1)(prev);
            } else {
                // First call: hand out the seed without stepping.
                self.2 = true;
            }
            self.0.clone()
        }
    }

    /**
    Creates an iterator yielding `seed`, `f(seed)`, `f(f(seed))`, ... until `f` returns `None`.
    */
    pub fn finite_iterate_lead<T, F>(seed: T, f: F) -> FiniteIterLead<T, F>
    where
        F: FnMut(T) -> Option<T>,
        T: Clone,
    {
        FiniteIterLead(Some(seed), f, false)
    }
}