sp_ropey/
crlf.rs

/// Reports whether `byte_idx` is a position at which `text` may be split.
///
/// A position qualifies when it lies on a utf8 code point boundary and
/// does not fall between the `\r` and the `\n` of a CRLF pair.  The start
/// and the end of `text` always qualify.
#[inline]
pub fn is_break(byte_idx: usize, text: &[u8]) -> bool {
    debug_assert!(byte_idx <= text.len());

    // Both edges of the text are always acceptable split points.
    if byte_idx == 0 || byte_idx == text.len() {
        return true;
    }

    let here = text[byte_idx];
    let before = text[byte_idx - 1];

    // A byte of the form `10xxxxxx` continues a multi-byte code point.
    let mid_code_point = (here & 0b1100_0000) == 0b1000_0000;
    // A CR directly followed by an LF must be kept together.
    let mid_crlf = before == b'\r' && here == b'\n';

    !mid_code_point && !mid_crlf
}
15
/// Reports whether two adjacent chunks may be separated at their seam.
///
/// The seam is acceptable when the first byte of `right` starts a utf8
/// code point and the seam does not separate the `\r` ending `left` from
/// a `\n` starting `right`.  Both slices are expected to be non-empty.
#[inline]
pub fn seam_is_break(left: &[u8], right: &[u8]) -> bool {
    debug_assert!(!left.is_empty() && !right.is_empty());

    // A `10xxxxxx` byte continues a code point begun in `left`.
    let head = right[0];
    if (head & 0b1100_0000) == 0b1000_0000 {
        return false;
    }

    // `left` is only inspected once the code point check has passed.
    let tail = left[left.len() - 1];
    !(tail == b'\r' && head == b'\n')
}
25
26/// Returns the segment break before (but not including) the given byte
27/// boundary.
28///
29/// This will return back the passed byte boundary if it is at the start
30/// of the string.
31#[inline]
32pub fn prev_break(byte_idx: usize, text: &[u8]) -> usize {
33    // Bounds check
34    debug_assert!(byte_idx <= text.len());
35
36    if byte_idx == 0 {
37        0
38    } else {
39        let mut boundary_idx = byte_idx - 1;
40        while !is_break(boundary_idx, text) {
41            boundary_idx -= 1;
42        }
43        boundary_idx
44    }
45}
46
47/// Returns the segment break after (but not including) the given byte
48/// boundary.
49///
50/// This will return back the passed byte boundary if it is at the end of
51/// the string.
52#[inline]
53pub fn next_break(byte_idx: usize, text: &[u8]) -> usize {
54    // Bounds check
55    debug_assert!(byte_idx <= text.len());
56
57    if byte_idx == text.len() {
58        text.len()
59    } else {
60        let mut boundary_idx = byte_idx + 1;
61        while !is_break(boundary_idx, text) {
62            boundary_idx += 1;
63        }
64        boundary_idx
65    }
66}
67
68/// Finds the segment break nearest to the given byte that is not the
69/// left or right edge of the text.
70///
71/// There is only one circumstance where the left or right edge will be
72/// returned: if the entire text is a single unbroken segment, then the
73/// right edge of the text is returned.
74#[inline]
75pub fn nearest_internal_break(byte_idx: usize, text: &[u8]) -> usize {
76    // Bounds check
77    debug_assert!(byte_idx <= text.len());
78
79    // Find the two nearest segment boundaries
80    let left = if is_break(byte_idx, text) && byte_idx != text.len() {
81        byte_idx
82    } else {
83        prev_break(byte_idx, text)
84    };
85    let right = next_break(byte_idx, text);
86
87    // Otherwise, return the closest of left and right that isn't the
88    // start or end of the string
89    if left == 0 || (right != text.len() && (byte_idx - left) >= (right - byte_idx)) {
90        return right;
91    } else {
92        return left;
93    }
94}
95
96#[inline]
97pub fn find_good_split(byte_idx: usize, text: &[u8], bias_left: bool) -> usize {
98    // Bounds check
99    debug_assert!(byte_idx <= text.len());
100
101    if is_break(byte_idx, text) {
102        byte_idx
103    } else {
104        let prev = prev_break(byte_idx, text);
105        let next = next_break(byte_idx, text);
106        if bias_left {
107            if prev > 0 {
108                prev
109            } else {
110                next
111            }
112        } else {
113            if next < text.len() {
114                next
115            } else {
116                prev
117            }
118        }
119    }
120}
121
122//===========================================================================
123
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `is_break` against a table of `(byte_idx, expected)` pairs.
    fn check_is_break(text: &[u8], cases: &[(usize, bool)]) {
        for &(byte_idx, expected) in cases {
            assert_eq!(
                expected,
                is_break(byte_idx, text),
                "byte_idx = {}",
                byte_idx
            );
        }
    }

    /// Checks `nearest_internal_break` against a table of
    /// `(byte_idx, expected)` pairs.
    fn check_nearest(text: &[u8], cases: &[(usize, usize)]) {
        for &(byte_idx, expected) in cases {
            assert_eq!(
                expected,
                nearest_internal_break(byte_idx, text),
                "byte_idx = {}",
                byte_idx
            );
        }
    }

    #[test]
    fn crlf_segmenter_01() {
        // The empty text has a single position, and it is a break.
        assert!(is_break(0, b""));

        check_is_break(
            b"Hello world!\r\nHow's it going?",
            &[(0, true), (12, true), (13, false), (14, true), (19, true)],
        );
    }

    #[test]
    fn crlf_segmenter_02() {
        let left: &[u8] = b"Hello world!\r";
        let right: &[u8] = b"\nHow's it going?";
        let cr: &[u8] = b"\r";
        let lf: &[u8] = b"\n";

        // A CR on the left meeting an LF on the right must not be separated.
        for &(l, r) in [(left, right), (left, lf), (cr, right), (cr, lf)].iter() {
            assert!(!seam_is_break(l, r));
        }

        // LF followed by CR is not a pair, so the seam is fine.
        for &(l, r) in [(right, left), (lf, cr)].iter() {
            assert!(seam_is_break(l, r));
        }
    }

    #[test]
    fn nearest_internal_break_01() {
        check_nearest(b"Hello world!", &[(0, 1), (6, 6), (12, 11)]);
    }

    #[test]
    fn nearest_internal_break_02() {
        check_nearest(b"Hello\r\n world!", &[(5, 5), (6, 7), (7, 7)]);
    }

    #[test]
    fn nearest_internal_break_03() {
        check_nearest(
            b"\r\nHello world!\r\n",
            &[(0, 2), (1, 2), (2, 2), (14, 14), (15, 14), (16, 14)],
        );
    }

    #[test]
    fn nearest_internal_break_04() {
        check_nearest(b"\r\n", &[(0, 2), (1, 2), (2, 2)]);
    }

    #[test]
    fn is_break_01() {
        check_is_break(
            b"\n\r\n\r\n\r\n\r\n\r\n\r",
            &[(0, true), (12, true), (3, true), (6, false)],
        );
    }

    #[test]
    fn seam_is_break_01() {
        let first = b"\r\n\r\n\r\n";
        let second = b"\r\n\r\n";

        assert!(seam_is_break(first, second));
    }

    #[test]
    fn seam_is_break_02() {
        let first = b"\r\n\r\n\r";
        let second = b"\n\r\n\r\n";

        assert!(!seam_is_break(first, second));
    }
}
213}