use crate::chunker::{Chunker, ChunkingError, StringBuffer};
/// Byte offsets describing one chunk inside a buffer, plus where the next
/// chunk search should begin.
struct BytesChunkIndices {
/// Byte offset of the first byte of the chunk (inclusive slice bound).
start: usize,
/// Byte offset one past the last byte of the chunk (exclusive slice bound).
end: usize,
/// Byte offset the caller stores as its position for the following call.
new_position: usize,
}
/// Splits text into chunks measured in bytes, with consecutive chunks
/// sharing a configurable number of overlapping bytes.
///
/// Construct it with `BytesChunker::new`, which rejects an `overlap` that is
/// not strictly smaller than `chunk_size`.
#[derive(Clone)]
pub struct BytesChunker {
/// Target upper bound, in bytes, added to a chunk's start to find its end.
chunk_size: usize,
/// Number of bytes subtracted from a chunk's length to get the step to the
/// next chunk's start, i.e. how much neighbouring chunks overlap.
overlap: usize,
}
impl BytesChunker {
    /// Creates a chunker producing chunks of about `chunk_size` bytes where
    /// consecutive chunks share about `overlap` bytes.
    ///
    /// # Errors
    ///
    /// Returns [`ChunkingError::InvalidArguments`] when `overlap >= chunk_size`,
    /// because such a configuration could never advance through the input.
    pub fn new(chunk_size: usize, overlap: usize) -> Result<Self, ChunkingError> {
        if overlap >= chunk_size {
            return Err(ChunkingError::InvalidArguments {
                chunk_size,
                overlap,
            });
        }
        Ok(Self {
            chunk_size,
            overlap,
        })
    }

    /// Returns the largest char boundary of `text` that is `<= index`.
    ///
    /// `index` must not exceed `text.len()`. A UTF-8 scalar is at most four
    /// bytes long, so a boundary always exists within three bytes below any
    /// in-range index.
    fn floor_char_boundary(text: &str, index: usize) -> usize {
        (index.saturating_sub(3)..=index)
            .rev()
            .find(|&i| text.is_char_boundary(i))
            .expect("Bug: no char boundary found")
    }

    /// Computes the byte range of the chunk starting at `current_position`
    /// and the position from which the following chunk should start.
    ///
    /// Returns `None` once `current_position` is at or past the end of
    /// `buffer`. Otherwise:
    ///
    /// * `end` is `start + chunk_size`, clamped to the buffer length and
    ///   rounded down to a char boundary;
    /// * `new_position` is `end - overlap` (relative to the actual chunk
    ///   length), rounded down to a char boundary, or the buffer length when
    ///   the chunk reaches the end of the buffer.
    ///
    /// Both values are always at least one whole character past `start`.
    /// Without that guarantee, multi-byte characters could round either value
    /// back to `start`, and callers would yield the same (possibly empty)
    /// chunk forever. As a consequence, a chunk may exceed `chunk_size` bytes
    /// when a single character is wider than `chunk_size`.
    ///
    /// # Panics
    ///
    /// Panics if `current_position` is inside the buffer but not on a char
    /// boundary, which indicates a bug in the caller's position tracking.
    fn next_chunk_indices(
        &self,
        buffer: &str,
        current_position: usize,
    ) -> Option<BytesChunkIndices> {
        let buffer_len = buffer.len();
        if current_position >= buffer_len {
            return None;
        }

        let start = current_position;
        assert!(
            buffer.is_char_boundary(start),
            "Bug: start position {} is not at char boundary",
            start
        );

        // First char boundary after `start`: the minimum value both `end` and
        // the next position must reach for the iteration to make progress.
        let first_char_len = buffer[start..].chars().next().map_or(1, char::len_utf8);
        let min_advance = start + first_char_len;

        let target_end = start.saturating_add(self.chunk_size).min(buffer_len);
        let end = Self::floor_char_boundary(buffer, target_end).max(min_advance);

        // The chunk reaches the end of the buffer: nothing left to overlap.
        if end >= buffer_len {
            return Some(BytesChunkIndices {
                start,
                end,
                new_position: buffer_len,
            });
        }

        // Rounding `end` down can make the chunk no longer than `overlap`,
        // and rounding the next position down can land back on `start`;
        // `max(min_advance)` covers both cases.
        let step = (end - start).saturating_sub(self.overlap);
        let next_pos = Self::floor_char_boundary(buffer, start + step).max(min_advance);

        Some(BytesChunkIndices {
            start,
            end,
            new_position: next_pos,
        })
    }
}
impl Chunker for BytesChunker {
    /// Lazily yields the chunks of `input` as owned strings.
    ///
    /// Each call to the iterator asks `next_chunk_indices` for the next byte
    /// range, remembers the returned position, and copies that range out of
    /// `input`. Iteration ends when no further range is reported.
    fn chunk_string(self, input: String) -> impl Iterator<Item = String> {
        let mut cursor = 0;
        std::iter::from_fn(move || {
            let BytesChunkIndices {
                start,
                end,
                new_position,
            } = self.next_chunk_indices(&input, cursor)?;
            cursor = new_position;
            Some(input[start..end].to_string())
        })
    }

    /// Lazily yields chunks from a stream of string pieces.
    ///
    /// The pieces are fed through a `StringBuffer` constructed with
    /// `chunk_size * 5` as its second argument (presumably a buffer size;
    /// confirm against `StringBuffer`). While the buffer is not marked done,
    /// the buffer is refilled and the search retried whenever no chunk is
    /// found or the chunk found ends exactly at the end of the buffered text.
    /// In every other case the chunk is emitted and the buffer position is
    /// moved to the reported next position.
    fn chunk_stream(self, input: impl Iterator<Item = String>) -> impl Iterator<Item = String> {
        let mut pending = StringBuffer::new(input, self.chunk_size * 5);
        std::iter::from_fn(move || loop {
            let window = pending.buffer();
            let exhausted = pending.done;
            match self.next_chunk_indices(window, pending.position) {
                // Emit unless more input may still arrive and the chunk ends
                // exactly at the end of the buffered text.
                Some(found) if exhausted || found.end != window.len() => {
                    let chunk = window[found.start..found.end].to_string();
                    pending.set_position(found.new_position);
                    return Some(chunk);
                }
                // Nothing left in the buffer and nothing left to read.
                None if exhausted => return None,
                // Otherwise pull more input and try again.
                _ => {
                    pending.fill();
                }
            }
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: streams two five-byte pieces through a chunker with
    /// `chunk_size = 6` and `overlap = 2`, printing each chunk. It makes no
    /// assertions and only fails if construction or chunking panics.
    #[test]
    fn test_bytes_chunker_stream() {
        let (chunk_size, overlap) = (6, 2);
        let pieces = vec!["01234".to_string(), "56789".to_string()];
        let chunker = BytesChunker::new(chunk_size, overlap).unwrap();
        chunker.chunk_stream(pieces.into_iter()).for_each(|chunk| {
            println!("Chunk length: {}", chunk.len());
            println!("Chunk content: {}", &chunk);
        });
    }
}