use std::collections::VecDeque;
use crate::chunker::{Chunker, ChunkingError, StringBuffer};
/// Location of a single character inside the text buffer, measured in bytes.
#[derive(Clone, Copy, Debug)]
struct CharPosition {
    /// Byte offset at which the character begins.
    start: usize,
    /// Width of the character in bytes (its UTF-8 encoded length).
    len: usize,
}
/// Describes one chunk to cut out of the buffer and where the chunker should
/// resume afterwards.
struct CharactersChunkIndices {
    /// Byte offset of the first byte of the chunk (inclusive).
    start: usize,
    /// Byte offset one past the last byte of the chunk (exclusive).
    end: usize,
    /// Byte offset at which the following chunk begins.
    new_byte_position: usize,
    /// Character index at which the following chunk begins.
    new_char_position: usize,
}
/// Splits text into chunks measured in characters (not bytes), with a fixed
/// number of characters shared between consecutive chunks.
#[derive(Debug, Clone)]
pub struct CharactersChunker {
    /// Maximum number of characters in one chunk.
    chunk_size: usize,
    /// Number of characters a chunk shares with the chunk that follows it.
    overlap: usize,
    /// Byte position and width of every character currently tracked, in
    /// buffer order.
    char_positions: VecDeque<CharPosition>,
    /// Index into `char_positions` of the first character of the next chunk.
    current_char_position: usize,
}
impl CharactersChunker {
    /// Creates a chunker that emits chunks of at most `chunk_size` characters,
    /// advancing by `chunk_size - overlap` characters between chunks.
    ///
    /// # Errors
    ///
    /// Returns [`ChunkingError::InvalidArguments`] when `overlap >= chunk_size`.
    /// This also rejects `chunk_size == 0`, so a valid chunker always advances
    /// by at least one character.
    pub fn new(chunk_size: usize, overlap: usize) -> Result<Self, ChunkingError> {
        if overlap >= chunk_size {
            return Err(ChunkingError::InvalidArguments {
                chunk_size,
                overlap,
            });
        }
        Ok(Self {
            chunk_size,
            overlap,
            char_positions: VecDeque::new(),
            current_char_position: 0,
        })
    }

    /// Appends the position of every character in `text` to `char_positions`.
    ///
    /// `offset` is added to each character's byte index within `text`, so a
    /// caller can pass a tail slice of a larger buffer together with the byte
    /// position at which that slice begins.
    fn build_char_positions(&mut self, text: &str, offset: usize) {
        self.char_positions
            .extend(text.char_indices().map(|(pos, ch)| CharPosition {
                start: pos + offset,
                len: ch.len_utf8(),
            }));
    }

    /// Discards already-consumed characters once more than half of the tracked
    /// positions lie before `current_char_position`.
    ///
    /// The consumed entries are removed from `char_positions`, the string
    /// buffer is told to compact to the byte offset of the first kept
    /// character, and the remaining positions are rebased so that they are
    /// relative to that offset. Does nothing when the threshold is not met.
    fn compact(&mut self, string_buffer: &mut StringBuffer<impl Iterator<Item = String>>) {
        let consumed = self.current_char_position;
        if consumed <= self.char_positions.len() / 2 {
            return;
        }

        // Byte offset of the first character that must survive. When every
        // tracked character has been consumed there is no such character, so
        // fall back to the end of the last one instead of indexing out of
        // bounds.
        let keep_from_bytes = match self.char_positions.get(consumed) {
            Some(first_kept) => first_kept.start,
            None => match self.char_positions.back() {
                Some(last) => last.start + last.len,
                None => return,
            },
        };

        let drop_count = consumed.min(self.char_positions.len());
        self.char_positions.drain(..drop_count);
        string_buffer.compact_to(keep_from_bytes);
        self.current_char_position = 0;

        // Positions are appended in increasing byte order, so every remaining
        // entry starts at or after `keep_from_bytes`.
        for cp in self.char_positions.iter_mut() {
            debug_assert!(
                cp.start >= keep_from_bytes,
                "char position {} precedes compaction point {}",
                cp.start,
                keep_from_bytes
            );
            cp.start -= keep_from_bytes;
        }
    }

    /// Computes the byte range of the next chunk and the position to resume
    /// from, without mutating the chunker.
    ///
    /// Returns `None` when `current_char_position` is at or past the end of
    /// the tracked characters. When fewer than `chunk_size` characters remain
    /// beyond the chunk start (or exactly `chunk_size`), the chunk extends to
    /// `buffer.len()` and the resume position is the end of the tracked
    /// characters; otherwise the resume position is `chunk_size - overlap`
    /// characters after the chunk start.
    fn next_chunk_indices(&self, buffer: &str) -> Option<CharactersChunkIndices> {
        let chars_len = self.char_positions.len();
        let start_idx = self.current_char_position;
        let start = self.char_positions.get(start_idx)?.start;

        let end_idx = start_idx.saturating_add(self.chunk_size);
        if end_idx >= chars_len {
            // Final chunk of the currently tracked text: take everything left.
            let buffer_len = buffer.len();
            return Some(CharactersChunkIndices {
                start,
                end: buffer_len,
                new_byte_position: buffer_len,
                new_char_position: chars_len,
            });
        }

        // `end_idx < chars_len` here, and the step never exceeds `chunk_size`,
        // so both lookups below are in bounds.
        let last = self.char_positions[end_idx - 1];
        let step = self.chunk_size.saturating_sub(self.overlap);
        let next_char_position = start_idx + step;
        Some(CharactersChunkIndices {
            start,
            end: last.start + last.len,
            new_byte_position: self.char_positions[next_char_position].start,
            new_char_position: next_char_position,
        })
    }
}
impl Chunker for CharactersChunker {
    /// Chunks a fully materialised string.
    ///
    /// Character positions for the whole input are computed up front; each
    /// call to the returned iterator slices out the next chunk as an owned
    /// `String` and advances the chunker.
    fn chunk_string(mut self, input: String) -> impl Iterator<Item = String> {
        self.build_char_positions(&input, 0);
        std::iter::from_fn(move || {
            self.next_chunk_indices(&input).map(|indices| {
                self.current_char_position = indices.new_char_position;
                input[indices.start..indices.end].to_string()
            })
        })
    }

    /// Chunks text arriving piecewise from `input`.
    ///
    /// Pieces are accumulated in a `StringBuffer`. A chunk is only emitted
    /// when it ends before the end of the buffered text, or when the buffer
    /// reports that the input is exhausted; otherwise more input is pulled in
    /// first so that a chunk is not cut short at a piece boundary.
    fn chunk_stream(mut self, input: impl Iterator<Item = String>) -> impl Iterator<Item = String> {
        // NOTE(review): the second argument is presumably a buffer size hint;
        // confirm against `StringBuffer::new`.
        let mut string_buffer = StringBuffer::new(input, self.chunk_size * 5);
        std::iter::from_fn(move || loop {
            let buffer = string_buffer.buffer();
            let candidate = self.next_chunk_indices(buffer);

            // Either nothing is available yet, or the candidate chunk touches
            // the end of the buffered text and might grow with more input.
            let reaches_buffer_end = match &candidate {
                Some(indices) => indices.end == buffer.len(),
                None => true,
            };

            if reaches_buffer_end && !string_buffer.done {
                let filled_from = buffer.len();
                string_buffer.fill_no_compact();
                // Index only the newly appended tail, offset by where it starts.
                let appended = &string_buffer.buffer()[filled_from..];
                self.build_char_positions(appended, filled_from);
                self.compact(&mut string_buffer);
                continue;
            }

            // Input is exhausted here if there is no candidate: stop iterating.
            let indices = candidate?;
            let chunk = buffer[indices.start..indices.end].to_string();
            string_buffer.set_position(indices.new_byte_position);
            self.current_char_position = indices.new_char_position;
            return Some(chunk);
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: feeds two five-character pieces through `chunk_stream` and
    /// prints the byte length and content of every chunk produced.
    #[test]
    fn test_characters_chunker_stream() {
        let (chunk_size, overlap) = (6, 2);
        let pieces = vec!["01234".to_string(), "56789".to_string()];
        let chunker = CharactersChunker::new(chunk_size, overlap).unwrap();
        chunker.chunk_stream(pieces.into_iter()).for_each(|chunk| {
            println!("Chunk length: {}", chunk.len());
            println!("Chunk content: {}", &chunk);
        });
    }
}