const_str/__ctfe/
split.rs

1use crate::slice::advance;
2use crate::slice::subslice;
3use crate::utf8::CharEncodeUtf8;
4
5use core::str;
6
7struct SplitImpl<'input, 'pat> {
8    input: &'input str,
9    pattern: &'pat str,
10    inclusive: bool,
11}
12
13impl<'input> SplitImpl<'input, '_> {
14    const fn output_len(&self) -> usize {
15        let mut input = self.input;
16        let pat = self.pattern;
17
18        if pat.is_empty() {
19            crate::utf8::str_count_chars(input) + 2
20        } else {
21            let mut ans = 0;
22            while let Some((_, remain)) = crate::str::next_match(input, pat) {
23                ans += 1;
24                input = remain
25            }
26            if self.inclusive {
27                if !input.is_empty() {
28                    ans += 1;
29                }
30            } else {
31                ans += 1;
32            }
33            ans
34        }
35    }
36
37    #[allow(unsafe_code)]
38    const fn const_eval<const N: usize>(&self) -> [&'input str; N] {
39        let mut input = self.input;
40        let pat = self.pattern;
41
42        let mut buf: [&str; N] = [""; N];
43        let mut pos = 0;
44
45        if pat.is_empty() {
46            let mut input = input.as_bytes();
47
48            {
49                buf[pos] = unsafe { str::from_utf8_unchecked(subslice(input, 0..0)) };
50                pos += 1;
51            }
52
53            while let Some((_, count)) = crate::utf8::next_char(input) {
54                buf[pos] = unsafe { str::from_utf8_unchecked(subslice(input, 0..count)) };
55                pos += 1;
56                input = advance(input, count);
57            }
58
59            {
60                buf[pos] = unsafe { str::from_utf8_unchecked(subslice(input, 0..0)) };
61                pos += 1;
62            }
63        } else {
64            while let Some((m, remain)) = crate::str::next_match(input, pat) {
65                let substr = if self.inclusive {
66                    subslice(input.as_bytes(), 0..m + pat.len())
67                } else {
68                    subslice(input.as_bytes(), 0..m)
69                };
70                buf[pos] = unsafe { str::from_utf8_unchecked(substr) };
71                pos += 1;
72                input = remain;
73            }
74            if self.inclusive {
75                if !input.is_empty() {
76                    buf[pos] = input;
77                    pos += 1;
78                }
79            } else {
80                buf[pos] = input;
81                pos += 1;
82            }
83        }
84        assert!(pos == N);
85        buf
86    }
87}
88
89pub struct Split<T, P>(pub T, pub P);
90
91impl<'input, 'pat> Split<&'input str, &'pat str> {
92    const fn to_impl(&self) -> SplitImpl<'input, 'pat> {
93        SplitImpl {
94            input: self.0,
95            pattern: self.1,
96            inclusive: false,
97        }
98    }
99
100    pub const fn output_len(&self) -> usize {
101        self.to_impl().output_len()
102    }
103
104    pub const fn const_eval<const N: usize>(&self) -> [&'input str; N] {
105        self.to_impl().const_eval()
106    }
107}
108
109impl<'input> Split<&'input str, char> {
110    const fn to_impl<'pat>(&self, ch: &'pat CharEncodeUtf8) -> SplitImpl<'input, 'pat> {
111        SplitImpl {
112            input: self.0,
113            pattern: ch.as_str(),
114            inclusive: false,
115        }
116    }
117
118    pub const fn output_len(&self) -> usize {
119        let ch = CharEncodeUtf8::new(self.1);
120        self.to_impl(&ch).output_len()
121    }
122
123    pub const fn const_eval<const N: usize>(&self) -> [&'input str; N] {
124        let ch = CharEncodeUtf8::new(self.1);
125        self.to_impl(&ch).const_eval()
126    }
127}
128
/// Returns an array of substrings of a string slice, separated by characters matched by a pattern.
///
/// The pattern type must be one of
///
/// + [`&str`](prim@str)
/// + [`char`]
///
/// Both arguments must be constant expressions; the pattern expression is
/// expanded twice (once to compute the array length, once to fill the array).
///
/// This macro is [const-context only](./index.html#const-context-only).
///
/// See also [`str::split`](https://doc.rust-lang.org/std/primitive.str.html#method.split).
///
/// # Examples
///
/// ```
/// const SEPARATOR: &str = ", ";
/// const TEXT: &str = "lion, tiger, leopard";
///
/// const ANIMALS_ARRAY: [&str;3] = const_str::split!(TEXT, SEPARATOR);
/// const ANIMALS_SLICE: &[&str] = &const_str::split!(TEXT, SEPARATOR);
///
/// assert_eq!(ANIMALS_ARRAY, ANIMALS_SLICE);
/// assert_eq!(ANIMALS_SLICE, &["lion", "tiger", "leopard"]);
/// ```
#[macro_export]
macro_rules! split {
    ($s: expr, $pat: expr) => {{
        // Items declared by a `macro_rules!` expansion are not hygienic: a caller
        // expression that mentions a constant with the same name as one of these
        // helpers would resolve to the helper and produce a cycle error. The
        // crate-prefixed names make such a clash practically impossible.
        const __CONST_STR_SPLIT_INPUT: &str = $s;
        const __CONST_STR_SPLIT_OUTPUT_LEN: usize =
            $crate::__ctfe::Split(__CONST_STR_SPLIT_INPUT, $pat).output_len();
        const __CONST_STR_SPLIT_OUTPUT_BUF: [&str; __CONST_STR_SPLIT_OUTPUT_LEN] =
            $crate::__ctfe::Split(__CONST_STR_SPLIT_INPUT, $pat).const_eval();
        __CONST_STR_SPLIT_OUTPUT_BUF
    }};
}
161
162pub struct SplitInclusive<T, P>(pub T, pub P);
163
164impl<'input, 'pat> SplitInclusive<&'input str, &'pat str> {
165    const fn to_impl(&self) -> SplitImpl<'input, 'pat> {
166        SplitImpl {
167            input: self.0,
168            pattern: self.1,
169            inclusive: true,
170        }
171    }
172
173    pub const fn output_len(&self) -> usize {
174        self.to_impl().output_len()
175    }
176
177    pub const fn const_eval<const N: usize>(&self) -> [&'input str; N] {
178        self.to_impl().const_eval()
179    }
180}
181
182impl<'input> SplitInclusive<&'input str, char> {
183    const fn to_impl<'pat>(&self, ch: &'pat CharEncodeUtf8) -> SplitImpl<'input, 'pat> {
184        SplitImpl {
185            input: self.0,
186            pattern: ch.as_str(),
187            inclusive: true,
188        }
189    }
190
191    pub const fn output_len(&self) -> usize {
192        let ch = CharEncodeUtf8::new(self.1);
193        self.to_impl(&ch).output_len()
194    }
195
196    pub const fn const_eval<const N: usize>(&self) -> [&'input str; N] {
197        let ch = CharEncodeUtf8::new(self.1);
198        self.to_impl(&ch).const_eval()
199    }
200}
201
/// Returns an array of substrings of a string slice, separated by characters matched by a pattern.
///
/// Differs from the array produced by [`split!`] in that
/// [`split_inclusive!`](crate::split_inclusive) leaves the matched part as the terminator of the substring.
///
/// If the last element of the string is matched,
/// that element will be considered the terminator of the preceding substring.
/// That substring will be the last item of the returned array.
///
/// The pattern type must be one of
///
/// + [`&str`](prim@str)
/// + [`char`]
///
/// Both arguments must be constant expressions; the pattern expression is
/// expanded twice (once to compute the array length, once to fill the array).
///
/// This macro is [const-context only](./index.html#const-context-only).
///
/// See also [`str::split_inclusive`](https://doc.rust-lang.org/std/primitive.str.html#method.split_inclusive).
///
/// # Examples
/// ```
/// const TEXT: &str = "Mary had a little lamb\nlittle lamb\nlittle lamb.";
/// const ANSWER:&[&str] = &const_str::split_inclusive!(TEXT, "\n");
/// assert_eq!(ANSWER, &["Mary had a little lamb\n", "little lamb\n", "little lamb."]);
/// ```
/// ```
/// const TEXT: &str = "\nA\nB\nC\n";
/// const ANSWER:&[&str] = &const_str::split_inclusive!(TEXT, "\n");
/// assert_eq!(ANSWER, &["\n", "A\n", "B\n", "C\n"]);
/// ```
#[macro_export]
macro_rules! split_inclusive {
    ($s: expr, $pat: expr) => {{
        // Items declared by a `macro_rules!` expansion are not hygienic: a caller
        // expression that mentions a constant with the same name as one of these
        // helpers would resolve to the helper and produce a cycle error. The
        // crate-prefixed names make such a clash practically impossible.
        const __CONST_STR_SPLIT_INCLUSIVE_INPUT: &str = $s;
        const __CONST_STR_SPLIT_INCLUSIVE_OUTPUT_LEN: usize =
            $crate::__ctfe::SplitInclusive(__CONST_STR_SPLIT_INCLUSIVE_INPUT, $pat).output_len();
        const __CONST_STR_SPLIT_INCLUSIVE_OUTPUT_BUF: [&str;
            __CONST_STR_SPLIT_INCLUSIVE_OUTPUT_LEN] =
            $crate::__ctfe::SplitInclusive(__CONST_STR_SPLIT_INCLUSIVE_INPUT, $pat).const_eval();
        __CONST_STR_SPLIT_INCLUSIVE_OUTPUT_BUF
    }};
}
241
242pub struct SplitAsciiWhitespace<T>(pub T);
243
244impl SplitAsciiWhitespace<&'_ str> {
245    pub const fn output_len(&self) -> usize {
246        let bytes = self.0.as_bytes();
247        let mut count = 0;
248        let mut i = 0;
249        let mut in_word = false;
250
251        while i < bytes.len() {
252            if bytes[i].is_ascii_whitespace() {
253                if in_word {
254                    count += 1;
255                    in_word = false;
256                }
257            } else {
258                in_word = true;
259            }
260            i += 1;
261        }
262
263        if in_word {
264            count += 1;
265        }
266
267        count
268    }
269
270    #[allow(unsafe_code)]
271    pub const fn const_eval<const N: usize>(&self) -> [&'_ str; N] {
272        let bytes = self.0.as_bytes();
273        let mut buf: [&str; N] = [""; N];
274        let mut pos = 0;
275        let mut i = 0;
276
277        while i < bytes.len() {
278            // Skip leading whitespace
279            while i < bytes.len() && bytes[i].is_ascii_whitespace() {
280                i += 1;
281            }
282
283            if i >= bytes.len() {
284                break;
285            }
286
287            // Mark start of word
288            let start = i;
289
290            // Find end of word
291            while i < bytes.len() && !bytes[i].is_ascii_whitespace() {
292                i += 1;
293            }
294
295            // Extract word
296            let word_bytes = subslice(bytes, start..i);
297            buf[pos] = unsafe { core::str::from_utf8_unchecked(word_bytes) };
298            pos += 1;
299        }
300
301        assert!(pos == N);
302        buf
303    }
304}
305
306pub const fn map_lines<const N: usize>(mut lines: [&str; N]) -> [&str; N] {
307    let mut i = 0;
308    while i < N {
309        let s = lines[i];
310        match crate::str::strip_suffix(s, "\r\n") {
311            Some(s) => lines[i] = s,
312            None => match crate::str::strip_suffix(s, "\n") {
313                Some(s) => lines[i] = s,
314                None => lines[i] = s,
315            },
316        }
317        i += 1;
318    }
319    lines
320}
321
/// Returns an array containing the lines of a string.
///
/// A line ends with either LF (`\n`) or CRLF (`\r\n`).
///
/// The line terminators are stripped from the returned substrings.
///
/// The final line ending is optional:
/// a string that ends with a line ending yields the same lines
/// as the otherwise identical string without that final line ending.
///
/// This macro is [const-context only](./index.html#const-context-only).
///
/// See also [`str::lines`](https://doc.rust-lang.org/std/primitive.str.html#method.lines).
///
/// # Examples
/// ```rust
/// const TEXT: &str = "foo\r\nbar\n\nbaz\r";
/// const LINES_ARRAY: [&str;4] = const_str::split_lines!(TEXT);
/// const LINES_SLICE: &[&str] = &const_str::split_lines!(TEXT);
///
/// assert_eq!(LINES_ARRAY, LINES_SLICE);
/// assert_eq!(LINES_SLICE, &["foo", "bar", "", "baz\r"]);
/// ```
/// ```rust
/// const TEXT1: &str = "1\r\n2\r\n3\r\n";
/// const TEXT2: &str = "1\n2\n3\n";
/// const TEXT3: &str = "1\n2\n3";
/// const LINES1: &[&str] = &const_str::split_lines!(TEXT1);
/// const LINES2: &[&str] = &const_str::split_lines!(TEXT2);
/// const LINES3: &[&str] = &const_str::split_lines!(TEXT3);
/// assert_eq!(LINES1, LINES2);
/// assert_eq!(LINES2, LINES3);
/// ```
#[macro_export]
macro_rules! split_lines {
    // Split after every `\n`, then strip the terminator from each piece.
    ($s: expr) => {
        $crate::__ctfe::map_lines($crate::split_inclusive!($s, "\n"))
    };
}
361
/// Returns an array of substrings of a string slice, separated by ASCII whitespace.
///
/// ASCII whitespace characters are: space (` `), tab (`\t`), newline (`\n`),
/// carriage return (`\r`), and form feed (`\f`).
///
/// Consecutive whitespace characters are treated as a single separator.
/// Leading and trailing whitespace is ignored.
///
/// The argument must be a constant expression.
///
/// This macro is [const-context only](./index.html#const-context-only).
///
/// See also [`str::split_ascii_whitespace`](https://doc.rust-lang.org/std/primitive.str.html#method.split_ascii_whitespace).
///
/// # Examples
///
/// ```
/// const TEXT: &str = "  hello   world  ";
/// const WORDS_ARRAY: [&str; 2] = const_str::split_ascii_whitespace!(TEXT);
/// const WORDS_SLICE: &[&str] = &const_str::split_ascii_whitespace!(TEXT);
///
/// assert_eq!(WORDS_ARRAY, WORDS_SLICE);
/// assert_eq!(WORDS_SLICE, &["hello", "world"]);
/// ```
///
/// ```
/// const TEXT: &str = "word1\t\tword2\n\nword3";
/// const WORDS: &[&str] = &const_str::split_ascii_whitespace!(TEXT);
/// assert_eq!(WORDS, &["word1", "word2", "word3"]);
/// ```
#[macro_export]
macro_rules! split_ascii_whitespace {
    ($s: expr) => {{
        // Items declared by a `macro_rules!` expansion are not hygienic: a caller
        // expression that mentions a constant with the same name as one of these
        // helpers would resolve to the helper and produce a cycle error. The
        // crate-prefixed names make such a clash practically impossible.
        const __CONST_STR_SPLIT_ASCII_WS_INPUT: &str = $s;
        const __CONST_STR_SPLIT_ASCII_WS_OUTPUT_LEN: usize =
            $crate::__ctfe::SplitAsciiWhitespace(__CONST_STR_SPLIT_ASCII_WS_INPUT).output_len();
        const __CONST_STR_SPLIT_ASCII_WS_OUTPUT_BUF: [&str;
            __CONST_STR_SPLIT_ASCII_WS_OUTPUT_LEN] =
            $crate::__ctfe::SplitAsciiWhitespace(__CONST_STR_SPLIT_ASCII_WS_INPUT).const_eval();
        __CONST_STR_SPLIT_ASCII_WS_OUTPUT_BUF
    }};
}
400
#[cfg(test)]
mod tests {
    use super::*;

    /// Compares `split!` with `str::split` for `&str` and `char` patterns.
    #[test]
    fn test_split() {
        macro_rules! testcase {
            ($input: expr, $pat: expr) => {{
                const OUTPUT: &[&str] = &$crate::split!($input, $pat);

                let ans = $input.split($pat).collect::<Vec<_>>();
                assert_eq!(OUTPUT.len(), ans.len());
                assert_eq!(OUTPUT, &*ans, "ans = {:?}", ans);
            }};
        }

        testcase!("", "");
        testcase!("a中1😂1!", "");
        testcase!("a中1😂1!", "a");
        testcase!("a中1😂1!", "中");
        testcase!("a中1😂1!", "1");
        testcase!("a中1😂1!", "😂");
        testcase!("a中1😂1!", "!");
        testcase!("11111", "1");
        testcase!("222", "22");
        testcase!("啊哈哈哈", "哈哈");
        testcase!("some string:another string", ":");

        testcase!("11111", '1');
        testcase!("a中1😂1!", 'a');
        testcase!("a中1😂1!", '中');
        testcase!("a中1😂1!", '1');
        testcase!("a中1😂1!", '😂');
        testcase!("a中1😂1!", '!');
    }

    /// Compares `split_inclusive!` with `str::split_inclusive`.
    ///
    /// Only non-empty patterns are exercised here.
    #[test]
    fn test_split_inclusive() {
        macro_rules! testcase {
            ($input: expr, $pat: expr) => {{
                const OUTPUT: &[&str] = &$crate::split_inclusive!($input, $pat);

                let ans = $input.split_inclusive($pat).collect::<Vec<_>>();
                assert_eq!(OUTPUT.len(), ans.len());
                assert_eq!(OUTPUT, &*ans, "ans = {:?}", ans);
            }};
        }

        testcase!("", "a");
        testcase!("a中1😂1!", "a");
        testcase!("a中1😂1!", "中");
        testcase!("a中1😂1!", "1");
        testcase!("a中1😂1!", "😂");
        testcase!("a中1😂1!", "!");
        testcase!("11111", "1");
        testcase!("222", "22");
        testcase!("啊哈哈哈", "哈哈");
        testcase!("some string:another string", ":");
        testcase!("\nA\nB\nC\n", "\n");
        testcase!("Mary had a little lamb\nlittle lamb\nlittle lamb.", "\n");

        testcase!("", 'a');
        testcase!("11111", '1');
        testcase!("a中1😂1!", 'a');
        testcase!("a中1😂1!", '中');
        testcase!("a中1😂1!", '1');
        testcase!("a中1😂1!", '😂');
        testcase!("a中1😂1!", '!');
    }

    /// Compares `split_lines!` with `str::lines`.
    ///
    /// Inputs whose last line ends in a bare `\r` are left out on purpose: the
    /// std result for that case has not been the same across Rust releases.
    #[test]
    fn test_split_lines() {
        macro_rules! testcase {
            ($input: expr) => {{
                const OUTPUT: &[&str] = &$crate::split_lines!($input);

                let ans = $input.lines().collect::<Vec<_>>();
                assert_eq!(OUTPUT.len(), ans.len());
                assert_eq!(OUTPUT, &*ans, "ans = {:?}", ans);
            }};
        }

        testcase!("");
        testcase!("\n");
        testcase!("\r\n");
        testcase!("\n\n");
        testcase!("foo");
        testcase!("foo\n");
        testcase!("foo\r\n");
        testcase!("foo\r\nbar\n\nbaz");
        testcase!("1\r\n2\r\n3\r\n");
        testcase!("1\n2\n3\n");
        testcase!("1\n2\n3");
        testcase!("a\rb\nc");
        testcase!("a中\n😂\r\n!");
    }

    /// Compares `split_ascii_whitespace!` with `str::split_ascii_whitespace`.
    #[test]
    fn test_split_ascii_whitespace() {
        macro_rules! testcase {
            ($input: expr) => {{
                const OUTPUT: &[&str] = &$crate::split_ascii_whitespace!($input);

                let ans = $input.split_ascii_whitespace().collect::<Vec<_>>();
                assert_eq!(
                    OUTPUT.len(),
                    ans.len(),
                    "Length mismatch for input: {:?}",
                    $input
                );
                assert_eq!(
                    OUTPUT, &*ans,
                    "Content mismatch for input: {:?}, expected: {:?}",
                    $input, ans
                );
            }};
        }

        // Basic cases
        testcase!("");
        testcase!(" ");
        testcase!("  ");
        testcase!("hello");
        testcase!(" hello ");
        testcase!("  hello  ");
        testcase!("hello world");
        testcase!(" hello world ");
        testcase!("  hello   world  ");

        // Different whitespace types
        testcase!("a\tb\nc\rd\x0Cf");
        testcase!(" \t\n\r\x0C ");
        testcase!("word1\t\t\tword2\n\n\nword3");

        // Mixed content
        testcase!("foo bar baz");
        testcase!("\tfoo\nbar\rbaz\x0C");
        testcase!("   a   b   c   ");
        testcase!("\t\n\r\x0C");

        // Edge cases
        testcase!("single");
        testcase!("a");
        testcase!("a b");
        testcase!("  a  b  ");
    }
}
484        testcase!("  a  b  ");
485    }
486}