use std::borrow::Cow;
use crate::string::zero_copy::{ZeroCopySegment, SegmentType};
#[ cfg( feature = "simd" ) ]
use memchr;
#[ derive( Debug, Clone, Copy, PartialEq, Eq ) ]
pub enum SplitAlgorithm {
SingleChar,
BoyerMoore,
CSV,
StateMachine,
AhoCorasick,
Generic,
}
#[ derive( Debug, Clone, PartialEq, Eq ) ]
pub enum SplitResult<'a> {
Borrowed( &'a str ),
Owned( String ),
}
impl<'a> SplitResult<'a> {
pub fn as_str( &self ) -> &str {
match self {
SplitResult::Borrowed( s ) => s,
SplitResult::Owned( s ) => s.as_str(),
}
}
pub fn to_zero_copy_segment( &self, start_pos: usize, end_pos: usize ) -> ZeroCopySegment<'_> {
match self {
SplitResult::Borrowed( s ) => ZeroCopySegment {
content: Cow::Borrowed( s ),
segment_type: SegmentType::Content,
start_pos,
end_pos,
was_quoted: false,
},
SplitResult::Owned( s ) => ZeroCopySegment {
content: Cow::Borrowed( s.as_str() ),
segment_type: SegmentType::Content,
start_pos,
end_pos,
was_quoted: true, },
}
}
}
impl<'a> AsRef<str> for SplitResult<'a> {
fn as_ref( &self ) -> &str {
self.as_str()
}
}
#[ derive( Debug, Clone ) ]
pub struct SingleCharSplitIterator<'a> {
input: &'a str,
delimiter: u8,
position: usize,
preserve_delimiter: bool,
finished: bool,
pending_delimiter: Option<( usize, usize )>, }
impl<'a> SingleCharSplitIterator<'a> {
pub fn new( input: &'a str, delimiter: char, preserve_delimiter: bool ) -> Self {
assert!( delimiter.is_ascii(), "SingleChar optimization requires ASCII delimiter, got: {:?}", delimiter );
Self {
input,
delimiter: delimiter as u8,
position: 0,
preserve_delimiter,
finished: false,
pending_delimiter: None,
}
}
#[ cfg( feature = "simd" ) ]
fn find_next_delimiter( &self ) -> Option<usize> {
if self.position >= self.input.len() {
return None;
}
let remaining_bytes = &self.input.as_bytes()[ self.position.. ];
memchr::memchr( self.delimiter, remaining_bytes )
.map( |pos| self.position + pos )
}
#[ cfg( not( feature = "simd" ) ) ]
fn find_next_delimiter( &self ) -> Option<usize> {
if self.position >= self.input.len() {
return None;
}
let remaining_bytes = &self.input.as_bytes()[ self.position.. ];
for ( i, &byte ) in remaining_bytes.iter().enumerate() {
if byte == self.delimiter {
return Some( self.position + i );
}
}
None
}
}
impl<'a> Iterator for SingleCharSplitIterator<'a> {
type Item = SplitResult<'a>;
fn next( &mut self ) -> Option<Self::Item> {
if let Some(( delim_start, delim_end )) = self.pending_delimiter.take() {
let delimiter_str = &self.input[ delim_start..delim_end ];
return Some( SplitResult::Borrowed( delimiter_str ) );
}
if self.finished || self.position > self.input.len() {
return None;
}
if self.position == self.input.len() {
self.finished = true;
return None;
}
match self.find_next_delimiter() {
Some( delim_pos ) => {
let content = &self.input[ self.position..delim_pos ];
let new_position = delim_pos + 1;
if self.preserve_delimiter && delim_pos < self.input.len() {
self.pending_delimiter = Some(( delim_pos, delim_pos + 1 ));
}
self.position = new_position;
Some( SplitResult::Borrowed( content ) )
},
None => {
let remaining = &self.input[ self.position.. ];
self.position = self.input.len();
self.finished = true;
if remaining.is_empty() {
None
} else {
Some( SplitResult::Borrowed( remaining ) )
}
}
}
}
}
#[ derive( Debug ) ]
pub struct AlgorithmSelector;
impl AlgorithmSelector {
pub fn select_split_algorithm( delimiters: &[ &str ] ) -> SplitAlgorithm {
if delimiters.is_empty() {
return SplitAlgorithm::Generic;
}
if delimiters.len() == 1 {
let delim = delimiters[0];
if delim.len() == 1 {
let ch = delim.chars().next().unwrap();
if ch.is_ascii() {
return SplitAlgorithm::SingleChar;
}
}
if Self::is_csv_delimiter( delim ) {
return SplitAlgorithm::CSV;
}
if delim.len() >= 2 && delim.len() <= 8 && delim.is_ascii() {
return SplitAlgorithm::BoyerMoore;
}
}
if Self::is_url_pattern( delimiters ) {
return SplitAlgorithm::StateMachine;
}
if delimiters.len() <= 8 && delimiters.iter().all( |d| d.len() <= 4 ) {
return SplitAlgorithm::AhoCorasick;
}
SplitAlgorithm::Generic
}
fn is_csv_delimiter( delim: &str ) -> bool {
matches!( delim, "," | "\t" | ";" )
}
fn is_url_pattern( delimiters: &[ &str ] ) -> bool {
let url_delims = [ "://", "/", "?", "#" ];
delimiters.iter().all( |d| url_delims.contains( d ) )
}
pub fn select_with_size_hint( delimiters: &[ &str ], input_size: usize ) -> SplitAlgorithm {
let base_algorithm = Self::select_split_algorithm( delimiters );
match ( base_algorithm, input_size ) {
( SplitAlgorithm::BoyerMoore, 0..=1024 ) => SplitAlgorithm::Generic,
( SplitAlgorithm::Generic, 100_000.. ) if delimiters.len() <= 4 => SplitAlgorithm::AhoCorasick,
( algo, _ ) => algo,
}
}
}
pub fn smart_split<'a>( input: &'a str, delimiters: &'a [ &'a str ] ) -> Box<dyn Iterator<Item = SplitResult<'a>> + 'a> {
let algorithm = AlgorithmSelector::select_with_size_hint( delimiters, input.len() );
match algorithm {
SplitAlgorithm::SingleChar => {
let delim_char = delimiters[0].chars().next().unwrap();
Box::new( SingleCharSplitIterator::new( input, delim_char, false ) )
},
SplitAlgorithm::BoyerMoore => {
Box::new( BoyerMooreSplitIterator::new( input, delimiters[0] ) )
},
SplitAlgorithm::CSV => {
let delim_char = delimiters[0].chars().next().unwrap();
Box::new( SingleCharSplitIterator::new( input, delim_char, false ) )
},
SplitAlgorithm::StateMachine => {
let delim_char = delimiters[0].chars().next().unwrap();
Box::new( SingleCharSplitIterator::new( input, delim_char, false ) )
},
SplitAlgorithm::AhoCorasick => {
#[ cfg( feature = "simd" ) ]
{
match crate::simd::simd_split_cached( input, delimiters ) {
Ok( simd_iter ) => {
Box::new( simd_iter.map( |split| {
match split.string {
std::borrow::Cow::Borrowed( s ) => SplitResult::Borrowed( s ),
std::borrow::Cow::Owned( s ) => SplitResult::Owned( s ),
}
} ) )
},
Err( _ ) => {
Box::new( fallback_generic_split( input, delimiters ) )
}
}
}
#[ cfg( not( feature = "simd" ) ) ]
{
Box::new( fallback_generic_split( input, delimiters ) )
}
},
SplitAlgorithm::Generic => {
Box::new( fallback_generic_split( input, delimiters ) )
},
}
}
#[ derive( Debug, Clone ) ]
pub struct BoyerMooreSplitIterator<'a> {
input: &'a str,
pattern: &'a str,
#[allow(dead_code)]
bad_char_table: [ usize; 256 ],
position: usize,
finished: bool,
}
impl<'a> BoyerMooreSplitIterator<'a> {
pub fn new( input: &'a str, pattern: &'a str ) -> Self {
assert!( !pattern.is_empty(), "Boyer-Moore requires non-empty pattern" );
assert!( pattern.len() >= 2, "Boyer-Moore optimization requires pattern length >= 2" );
assert!( pattern.len() <= 8, "Boyer-Moore optimization works best with pattern length <= 8" );
let mut bad_char_table = [ pattern.len(); 256 ];
let pattern_bytes = pattern.as_bytes();
for ( i, &byte ) in pattern_bytes.iter().enumerate() {
if i < pattern_bytes.len() - 1 { bad_char_table[ byte as usize ] = pattern_bytes.len() - i - 1;
}
}
Self {
input,
pattern,
bad_char_table,
position: 0,
finished: false,
}
}
fn find_next_pattern( &self ) -> Option<usize> {
if self.finished || self.position >= self.input.len() {
return None;
}
let text_bytes = self.input.as_bytes();
let pattern_bytes = self.pattern.as_bytes();
let text_len = text_bytes.len();
let pattern_len = pattern_bytes.len();
if self.position + pattern_len > text_len {
return None;
}
let remaining_text = &text_bytes[ self.position.. ];
for i in 0..=( remaining_text.len().saturating_sub( pattern_len ) ) {
let mut matches = true;
for j in 0..pattern_len {
if remaining_text[ i + j ] != pattern_bytes[ j ] {
matches = false;
break;
}
}
if matches {
return Some( self.position + i );
}
}
None
}
}
impl<'a> Iterator for BoyerMooreSplitIterator<'a> {
type Item = SplitResult<'a>;
fn next( &mut self ) -> Option<Self::Item> {
if self.finished || self.position > self.input.len() {
return None;
}
if self.position == self.input.len() {
self.finished = true;
return None;
}
match self.find_next_pattern() {
Some( match_pos ) => {
let content = &self.input[ self.position..match_pos ];
self.position = match_pos + self.pattern.len();
Some( SplitResult::Borrowed( content ) )
},
None => {
let remaining = &self.input[ self.position.. ];
self.position = self.input.len();
self.finished = true;
if remaining.is_empty() {
None
} else {
Some( SplitResult::Borrowed( remaining ) )
}
}
}
}
}
fn fallback_generic_split<'a>( input: &'a str, delimiters: &'a [ &'a str ] ) -> impl Iterator<Item = SplitResult<'a>> + 'a {
crate::string::zero_copy::zero_copy_split( input, delimiters )
.map( |segment| {
match segment.content {
std::borrow::Cow::Borrowed( s ) => SplitResult::Borrowed( s ),
std::borrow::Cow::Owned( s ) => {
SplitResult::Owned( s )
}
}
} )
}
#[ cfg( test ) ]
mod tests {
use super::*;
#[ test ]
fn test_single_char_split_basic() {
let input = "apple,banana,cherry";
let results: Vec<_> = SingleCharSplitIterator::new( input, ',', false )
.collect();
assert_eq!( results.len(), 3 );
assert_eq!( results[0].as_str(), "apple" );
assert_eq!( results[1].as_str(), "banana" );
assert_eq!( results[2].as_str(), "cherry" );
}
#[ test ]
fn test_single_char_split_with_empty_segments() {
let input = "a,,b,c";
let results: Vec<_> = SingleCharSplitIterator::new( input, ',', false )
.collect();
assert_eq!( results.len(), 4 );
assert_eq!( results[0].as_str(), "a" );
assert_eq!( results[1].as_str(), "" );
assert_eq!( results[2].as_str(), "b" );
assert_eq!( results[3].as_str(), "c" );
}
#[ test ]
fn test_single_char_split_preserve_delimiter() {
let input = "a,b,c";
let results: Vec<_> = SingleCharSplitIterator::new( input, ',', true )
.collect();
assert_eq!( results.len(), 5 ); assert_eq!( results[0].as_str(), "a" );
assert_eq!( results[1].as_str(), "," );
assert_eq!( results[2].as_str(), "b" );
assert_eq!( results[3].as_str(), "," );
assert_eq!( results[4].as_str(), "c" );
}
#[ test ]
fn test_algorithm_selection_single_char() {
assert_eq!( AlgorithmSelector::select_split_algorithm( &[","] ), SplitAlgorithm::SingleChar );
assert_eq!( AlgorithmSelector::select_split_algorithm( &[" "] ), SplitAlgorithm::SingleChar );
assert_eq!( AlgorithmSelector::select_split_algorithm( &["\t"] ), SplitAlgorithm::SingleChar ); }
#[ test ]
fn test_algorithm_selection_boyer_moore() {
assert_eq!( AlgorithmSelector::select_split_algorithm( &["::"] ), SplitAlgorithm::BoyerMoore );
assert_eq!( AlgorithmSelector::select_split_algorithm( &["->"] ), SplitAlgorithm::BoyerMoore );
}
#[ test ]
fn test_algorithm_selection_csv() {
assert_eq!( AlgorithmSelector::select_split_algorithm( &[","] ), SplitAlgorithm::SingleChar ); assert_eq!( AlgorithmSelector::select_split_algorithm( &["\t"] ), SplitAlgorithm::SingleChar ); assert_eq!( AlgorithmSelector::select_split_algorithm( &[";"] ), SplitAlgorithm::SingleChar ); }
#[ test ]
fn test_smart_split_integration() {
let input = "field1,field2,field3,field4";
let results: Vec<_> = smart_split( input, &[","] ).collect();
assert_eq!( results.len(), 4 );
assert_eq!( results[0].as_str(), "field1" );
assert_eq!( results[1].as_str(), "field2" );
assert_eq!( results[2].as_str(), "field3" );
assert_eq!( results[3].as_str(), "field4" );
}
#[ test ]
fn test_split_result_conversions() {
let borrowed = SplitResult::Borrowed( "test" );
let owned = SplitResult::Owned( "test".to_string() );
assert_eq!( borrowed.as_str(), "test" );
assert_eq!( owned.as_str(), "test" );
assert_eq!( borrowed.as_ref(), "test" );
assert_eq!( owned.as_ref(), "test" );
}
#[ test ]
#[ should_panic( expected = "SingleChar optimization requires ASCII delimiter" ) ]
fn test_single_char_non_ascii_panic() {
SingleCharSplitIterator::new( "test", 'â„¢', false );
}
#[ test ]
fn test_boyer_moore_split_basic() {
let input = "field1::field2::field3::field4";
let results: Vec<_> = BoyerMooreSplitIterator::new( input, "::" )
.collect();
assert_eq!( results.len(), 4 );
assert_eq!( results[0].as_str(), "field1" );
assert_eq!( results[1].as_str(), "field2" );
assert_eq!( results[2].as_str(), "field3" );
assert_eq!( results[3].as_str(), "field4" );
}
#[ test ]
fn test_boyer_moore_split_with_empty_segments() {
let input = "a::::b::c";
let results: Vec<_> = BoyerMooreSplitIterator::new( input, "::" )
.collect();
assert_eq!( results.len(), 4 );
assert_eq!( results[0].as_str(), "a" );
assert_eq!( results[1].as_str(), "" );
assert_eq!( results[2].as_str(), "b" );
assert_eq!( results[3].as_str(), "c" );
}
#[ test ]
fn test_boyer_moore_no_pattern() {
let input = "no delimiters here";
let results: Vec<_> = BoyerMooreSplitIterator::new( input, "::" )
.collect();
assert_eq!( results.len(), 1 );
assert_eq!( results[0].as_str(), "no delimiters here" );
}
#[ test ]
fn test_boyer_moore_different_patterns() {
let input = "a->b->c->d";
let results: Vec<_> = BoyerMooreSplitIterator::new( input, "->" )
.collect();
assert_eq!( results.len(), 4 );
assert_eq!( results[0].as_str(), "a" );
assert_eq!( results[1].as_str(), "b" );
assert_eq!( results[2].as_str(), "c" );
assert_eq!( results[3].as_str(), "d" );
}
#[ test ]
#[ should_panic( expected = "Boyer-Moore requires non-empty pattern" ) ]
fn test_boyer_moore_empty_pattern_panic() {
BoyerMooreSplitIterator::new( "test", "" );
}
#[ test ]
#[ should_panic( expected = "Boyer-Moore optimization requires pattern length >= 2" ) ]
fn test_boyer_moore_single_char_pattern_panic() {
BoyerMooreSplitIterator::new( "test", "a" );
}
#[ test ]
#[ should_panic( expected = "Boyer-Moore optimization works best with pattern length <= 8" ) ]
fn test_boyer_moore_long_pattern_panic() {
BoyerMooreSplitIterator::new( "test", "verylongpattern" );
}
#[ test ]
fn test_boyer_moore_vs_smart_split_integration() {
let input = "namespace::class::method::args";
let smart_results: Vec<_> = smart_split( input, &["::"] ).collect();
let bm_results: Vec<_> = BoyerMooreSplitIterator::new( input, "::" ).collect();
assert_eq!( smart_results.len(), bm_results.len() );
for ( smart, bm ) in smart_results.iter().zip( bm_results.iter() ) {
assert_eq!( smart.as_str(), bm.as_str() );
}
}
}