/// A delimiter pattern that reports both where a match starts and how long it is.
///
/// The splitting code in this file uses the reported length to skip over the
/// delimiter, so the two values are used as byte offsets into `haystack`.
pub trait PatternWithLen {
/// Searches `haystack` for this pattern.
///
/// Returns `Some((start, len))` where `start` is the byte index at which the
/// match begins and `len` is the match length in bytes, or `None` when the
/// pattern does not occur.
fn find_with_len(&self, haystack: &str) -> Option<(usize, usize)>;
/// When `true`, the splitting iterator in this file ends iteration as soon as
/// a delimiter leaves an empty remainder, instead of yielding that empty
/// remainder as a final item. Defaults to `false`.
const IGNORE_FINAL_EMPTY: bool = false;
}
impl<F: Fn(char) -> bool> PatternWithLen for F {
fn find_with_len(&self, haystack: &str) -> Option<(usize, usize)> {
match haystack.find(self) {
Some(i) => Some((i, next_char_len(haystack, i)?)),
None => None,
}
}
}
/// Returns the UTF-8 byte length of the character that starts at byte offset
/// `i` of `haystack`, or `None` when nothing follows offset `i`.
///
/// Slicing panics if `i` is past the end of `haystack` or not on a character
/// boundary.
fn next_char_len(haystack: &str, i: usize) -> Option<usize> {
    let tail = &haystack[i..];
    tail.chars().next().map(char::len_utf8)
}
/// A single `char` is a pattern whose match length is its own UTF-8 width.
impl PatternWithLen for char {
    /// Returns the byte index of the first occurrence of this character and
    /// the character's UTF-8 length, or `None` if it does not occur.
    fn find_with_len(&self, haystack: &str) -> Option<(usize, usize)> {
        let needle = *self;
        let start = haystack.find(needle)?;
        Some((start, needle.len_utf8()))
    }
}
/// A string slice is a pattern that matches its first literal occurrence.
impl PatternWithLen for &str {
    /// Returns the byte index of the first occurrence of this substring and
    /// the substring's byte length, or `None` if it does not occur.
    ///
    /// # Panics
    ///
    /// Panics when the pattern is the empty string.
    fn find_with_len(&self, haystack: &str) -> Option<(usize, usize)> {
        let needle: &str = self;
        assert!(!needle.is_empty(), "Empty pattern is not allowed. Discussions see <https://github.com/rust-lang/rust/issues/33882>");
        let start = haystack.find(needle)?;
        Some((start, needle.len()))
    }
}
/// Line-break delimiter: matches `"\n"`, or `"\r\n"` when the `'\n'` is
/// directly preceded by `'\r'`. A `'\r'` that is not followed by `'\n'` is
/// never treated as a line break.
#[derive(Debug, Clone, Copy)]
pub struct NewLine;

impl PatternWithLen for NewLine {
    /// A line break that leaves nothing after it does not start another item.
    const IGNORE_FINAL_EMPTY: bool = true;

    /// Finds the first `'\n'` in `haystack` and reports either the `"\r\n"`
    /// pair ending at it (length 2) or the bare `'\n'` (length 1).
    fn find_with_len(&self, haystack: &str) -> Option<(usize, usize)> {
        let lf_at = haystack.find('\n')?;
        // Only the text in front of the line feed decides between LF and CRLF.
        let found = if haystack[..lf_at].ends_with('\r') {
            (lf_at - "\r".len(), "\r\n".len())
        } else {
            (lf_at, "\n".len())
        };
        Some(found)
    }
}
/// Iterator that splits an owned `String` on a [`PatternWithLen`] delimiter and
/// yields each piece as an owned `String`.
#[derive(Debug, Clone)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct IterSplitCharOwned<Pattern: PatternWithLen> {
    /// Text that has not been yielded yet; `None` once iteration has finished.
    residual: Option<String>,
    /// Delimiter searched for in `residual` on every call to `next`.
    pattern: Pattern,
}

impl<Pattern: PatternWithLen> Iterator for IterSplitCharOwned<Pattern> {
    type Item = String;

    /// Yields the text in front of the next delimiter and keeps what follows
    /// the delimiter for later calls. When no delimiter is left, the remaining
    /// text is yielded as the last item.
    ///
    /// If `Pattern::IGNORE_FINAL_EMPTY` is `true`, an empty trailing piece is
    /// never yielded. This also covers an empty input string, which therefore
    /// produces no items at all (matching `"".lines()`).
    fn next(&mut self) -> Option<Self::Item> {
        let residual = self.residual.as_mut()?;
        let Some((delim_start, delim_len)) = self.pattern.find_with_len(residual) else {
            // No delimiter left: whatever remains is the final piece. Without
            // this emptiness check an empty input would yield one spurious "".
            let last = self.residual.take()?;
            return if Pattern::IGNORE_FINAL_EMPTY && last.is_empty() {
                None
            } else {
                Some(last)
            };
        };
        // Cut the text after the delimiter into its own string, then trim the
        // delimiter off the front part so that only the yielded piece remains.
        let tail = residual.split_off(delim_start + delim_len);
        residual.truncate(delim_start);
        let head = std::mem::replace(residual, tail);
        if Pattern::IGNORE_FINAL_EMPTY && residual.is_empty() {
            // The delimiter terminated the input; do not emit an empty item next.
            self.residual = None;
        }
        Some(head)
    }
}
/// Splitting operations that consume the value they split and hand back owned
/// pieces instead of borrowed slices.
pub trait SplitOwned: Sized {
/// Consumes `self` and returns an iterator over the owned `String` pieces
/// separated by `pat`.
fn split_owned<Pattern: PatternWithLen>(self, pat: Pattern) -> impl Iterator<Item = String>;
/// Consumes `self` and iterates over its lines; shorthand for
/// `split_owned(NewLine)`.
fn lines_owned(self) -> impl Iterator<Item = String> {
self.split_owned(NewLine)
}
/// Consumes `self` and splits it at one occurrence of `pat`.
///
/// In the `String` implementation in this file, `Ok((left, right))` holds the
/// text before and after the first match (the delimiter itself is dropped),
/// and `Err(self)` returns the untouched input when `pat` does not match.
fn split_owned_once<Pattern: PatternWithLen>(self, pat: Pattern) -> Result<(Self, Self), Self>;
/// Splits at a line break; shorthand for `split_owned_once(NewLine)`.
fn split_line_owned_once(self) -> Result<(Self, Self), Self> {
self.split_owned_once(NewLine)
}
}
impl SplitOwned for String {
    /// Wraps `self` in an [`IterSplitCharOwned`] that lazily cuts it into owned
    /// pieces separated by `pattern`.
    fn split_owned<Pattern: PatternWithLen>(
        self,
        pattern: Pattern,
    ) -> impl Iterator<Item = String> {
        let residual = Some(self);
        IterSplitCharOwned { residual, pattern }
    }

    /// Splits `self` at the first match of `pattern`.
    ///
    /// Returns `Ok((left, right))` with the text before and after the match
    /// (the match itself is discarded), or `Err(self)` with the unchanged
    /// input when `pattern` does not match.
    fn split_owned_once<Pattern: PatternWithLen>(
        mut self,
        pattern: Pattern,
    ) -> Result<(Self, Self), Self> {
        let Some((index_begin_of_delim, delim_len)) = pattern.find_with_len(&self) else {
            return Err(self);
        };
        let right_start = index_begin_of_delim + delim_len;
        debug_assert!(
            self.is_char_boundary(right_start),
            "不会发生:find_delim在{self:?}中找到的索引{index_begin_of_delim}应该在合法UTF-8位置"
        );
        // Detach the text after the delimiter first, then cut the delimiter
        // off what is left so `self` becomes the left-hand piece.
        let right = self.split_off(right_start);
        self.truncate(index_begin_of_delim);
        Ok((self, right))
    }
}
// Unit tests. Each test runs the owned splitting API and the borrowed `str`
// equivalent (`split_once` / `split` / `lines`) on the same input and asserts
// that both produce the same pieces.
#[cfg(test)]
mod tests {
use super::*;
use crate::{f_tensor, macro_once};
// `char` delimiters: `split_owned_once` vs `str::split_once`, and
// `split_owned` vs `str::split`.
#[test]
fn split_owned_char() {
fn test(c: char, s: impl ToString) {
let s = s.to_string();
// Reference result built from borrowed slices; `Err` carries the whole
// input when the delimiter is absent, mirroring `split_owned_once`.
let cloned_split = s
.split_once(c)
.map(|(a, b)| (a.to_owned(), b.to_owned()))
.ok_or_else(|| s.to_owned());
let owned_split = s.clone().split_owned_once(c);
assert_eq!(
cloned_split, owned_split,
"两种方式拆分不等:\ncloned = {cloned_split:?}\n!=\nowned = {owned_split:?}\nc = {c:?}"
);
let cloned_split = s.split(c).map(ToString::to_string).collect::<Vec<_>>();
// `take(0xff)` caps the number of items collected (presumably a guard
// against a non-terminating iterator — confirm intent).
let owned_split = s.clone().split_owned(c).take(0xff).collect::<Vec<_>>();
assert_eq!(
cloned_split, owned_split,
"两种方式拆分不等:\ncloned = {cloned_split:?}\n!=\nowned = {owned_split:?}\nc = {c:?}"
);
}
// NOTE(review): `f_tensor!` is defined elsewhere in the crate; presumably it
// calls `test` for every (delimiter, input) combination — confirm.
f_tensor! {
test;
'\r' '\n' '\t';
"中文123🤣👉⇑🤡↑\nEnglish😆\nあ💭this\nYou!\r\n\t \x121\n"
"r \r n \n rn \r\n换行最后有内容"
"换行最后无内容\r"
"换行最后无内容\n"
"换行最后无内容\r\n"
};
}
// `&str` delimiters, including multi-byte and multi-character ones.
#[test]
fn split_owned_ref_str() {
fn test(c: &str, s: impl ToString) {
let s = s.to_string();
let cloned_split = s
.split_once(c)
.map(|(a, b)| (a.to_owned(), b.to_owned()))
.ok_or_else(|| s.to_owned());
let owned_split = s.clone().split_owned_once(c);
assert_eq!(
cloned_split, owned_split,
"两种方式拆分不等:\ncloned = {cloned_split:?}\n!=\nowned = {owned_split:?}\nc = {c:?}"
);
let cloned_split = s.split(c).map(ToString::to_string).collect::<Vec<_>>();
let owned_split = s.clone().split_owned(c).take(0xff).collect::<Vec<_>>();
assert_eq!(
cloned_split, owned_split,
"两种方式拆分不等:\ncloned = {cloned_split:?}\n!=\nowned = {owned_split:?}\nc = {c:?}"
);
}
f_tensor! {
test;
"\r" "\n" "\r\n" "\t" "🤣" "n";
"中文123🤣👉⇑🤡↑\nEnglish😆\nあ💭this\nYou!\r\n\t \x121\n"
"r \r n \n rn \r\n换行最后有内容"
"换行最后无内容\r"
"换行最后无内容\n"
"换行最后无内容\r\n"
};
}
// The `&str` pattern asserts that it is non-empty, so iterating with "" must panic.
#[test]
#[should_panic]
fn empty_str_pattern_is_forbidden() {
for _ in "abc".to_string().split_owned("") {}
}
// Predicate (`Fn(char) -> bool`) delimiters; the predicate is passed by reference.
#[test]
fn split_owned_fn() {
fn test(pat: impl Fn(char) -> bool, s: impl ToString) {
let s = s.to_string();
let cloned_split = s
.split_once(&pat)
.map(|(a, b)| (a.to_owned(), b.to_owned()))
.ok_or_else(|| s.to_owned());
let owned_split = s.clone().split_owned_once(&pat);
assert_eq!(
cloned_split, owned_split,
"两种方式拆分不等:\ncloned = {cloned_split:?}\n!=\nowned = {owned_split:?}"
);
let cloned_split = s.split(&pat).map(ToString::to_string).collect::<Vec<_>>();
let owned_split = s.clone().split_owned(&pat).take(0xff).collect::<Vec<_>>();
assert_eq!(
cloned_split, owned_split,
"两种方式拆分不等:\ncloned = {cloned_split:?}\n!=\nowned = {owned_split:?}"
);
}
f_tensor! {
test;
char::is_whitespace
char::is_alphabetic
char::is_alphanumeric,
{ |c:char| c.is_ascii() }
;
"中文123🤣👉⇑🤡↑\nEnglish😆\nあ💭this\nYou!\r\n\t \x121\n"
"r \r n \n rn \r\n换行最后有内容"
"换行最后无内容\r"
"换行最后无内容\n"
"换行最后无内容\r\n"
};
}
// Line splitting: `split_line_owned_once` vs a hand-computed reference, and
// `lines_owned` vs `str::lines`.
#[test]
fn lines_owned() {
fn test(s: impl ToString) {
let s = s.to_string();
// Reference for the single split: cut at the first '\n', and start the cut
// one byte earlier when that '\n' is the tail of the first "\r\n".
let cloned_split = 'cloned: {
let Some(i_lf) = s.find('\n') else {
break 'cloned Err(s.to_owned());
};
const LEN_LF: usize = "\n".len();
// NOTE(review): the guard subtracts LEN_LF where the length of "\r" is
// meant (both are 1 byte). The guard only runs when a "\r\n" exists, and
// `i_lf - LEN_LF` would underflow if the first '\n' sat at index 0 while
// a later "\r\n" exists; none of the inputs below have that shape.
let left_i = match s.find("\r\n") {
Some(i_crlf) if i_crlf == i_lf - LEN_LF => i_crlf,
_ => i_lf,
};
Ok((s[..left_i].to_owned(), s[i_lf + 1..].to_owned()))
};
let owned_split = s.clone().split_line_owned_once();
assert_eq!(
cloned_split, owned_split,
"两种方式拆分不等:\ns = {s:?}\ncloned = {cloned_split:?}\n!=\nowned = {owned_split:?}"
);
let cloned_lines = s.lines().map(ToString::to_string).collect::<Vec<_>>();
let owned_lines = s.clone().lines_owned().take(0xffff).collect::<Vec<_>>();
assert_eq!(
cloned_lines, owned_lines,
"两种方式拆分不等:\ns = {s:?}\ncloned = {cloned_lines:?}\n!=\nowned = {owned_lines:?}"
);
}
// NOTE(review): `macro_once!` is defined elsewhere in the crate; presumably it
// defines the inline `test` macro and applies it once to the expressions that
// follow, so `test(..)` runs on each input — confirm.
macro_once! {
macro test( $($input:expr)* ) {
$(test($input);)*
}
"中文123🤣👉⇑🤡↑\nEnglish😆\nあ💭this\nYou!\r\n\t \x121\n"
"r \r n \n rn \r\n换行最后有内容"
"俩\\n \n\n 后边"
"俩\\r \r\r 后边"
"\\r\\n \r\n 后边"
"仨\\n \n\n\n 后边"
"仨\\r \r\r\r 后边"
"\\r\\n\\r \r\n\r 后边"
"后边没有:俩\\n \n\n"
"后边没有:俩\\r \r\r"
"后边没有:\\r\\n \r\n"
"后边没有:仨\\n \n\n\n"
"后边没有:仨\\r \r\r\r"
"后边没有:\\r\\n\\r \r\n\r"
"换行最后无内容\r"
"换行最后无内容\n"
"换行最后无内容\r\n"
"\r".repeat(0xff)
"\n".repeat(0xff)
"\r\n".repeat(0xff)
" \r".repeat(0xff)
" \n".repeat(0xff)
" \r\n".repeat(0xff)
" \r ".repeat(0xff)
" \n ".repeat(0xff)
" \r\n ".repeat(0xff)
"\r ".repeat(0xff)
"\n ".repeat(0xff)
"\r\n ".repeat(0xff)
}
}
}