use std::collections::VecDeque;
use std::io::{self, BufRead, BufReader, Read};
use crate::tokenizer::{Token, Tokenizer};
use crate::Result;
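/// Incremental tokenizer for text that arrives in chunks.
///
/// Chunks are accumulated in an internal buffer and tokenized only up to the
/// last complete sentence boundary; the partial tail is kept for the next
/// chunk. Token positions are offset by the number of characters already
/// processed, so they refer to the whole stream rather than the current chunk.
/// If no boundary appears before `max_buffer_size` bytes, part of the buffer
/// is force-flushed to bound memory use.
///
/// Illustrative usage (a minimal sketch; `my_crate` and the `streaming` module
/// path are placeholders for this crate's actual paths):
///
/// ```ignore
/// use my_crate::streaming::StreamingTokenizer;
/// use my_crate::tokenizer::Tokenizer;
///
/// let mut stream = StreamingTokenizer::new(Tokenizer::new()?);
/// let mut tokens = stream.process_chunk("첫 번째 문장입니다. 두 번째 ");
/// tokens.extend(stream.process_chunk("문장입니다.\n"));
/// tokens.extend(stream.flush()); // emit whatever is still buffered
/// ```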
pub struct StreamingTokenizer {
tokenizer: Tokenizer,
buffer: String,
chunk_size: usize,
sentence_delimiters: Vec<char>,
total_chars_processed: usize,
max_buffer_size: usize,
}
impl StreamingTokenizer {
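    /// Default read/buffer capacity in bytes (8 KiB).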
pub const DEFAULT_CHUNK_SIZE: usize = 8192;
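    /// Maximum number of buffered bytes before a partial flush is forced (16 MiB).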
pub const DEFAULT_MAX_BUFFER_SIZE: usize = 16 * 1024 * 1024;
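    /// Creates a streaming tokenizer with the default chunk size, buffer limit,
    /// and sentence delimiters.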
#[must_use]
pub fn new(tokenizer: Tokenizer) -> Self {
Self {
tokenizer,
buffer: String::with_capacity(Self::DEFAULT_CHUNK_SIZE),
chunk_size: Self::DEFAULT_CHUNK_SIZE,
            sentence_delimiters: vec!['.', '!', '?', '。', '．', '\n'],
total_chars_processed: 0,
max_buffer_size: Self::DEFAULT_MAX_BUFFER_SIZE,
}
}
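    /// Sets the chunk size in bytes; this is used as the initial buffer
    /// capacity and as the `BufReader` capacity in [`Self::process_reader`].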
#[must_use]
pub fn with_chunk_size(mut self, size: usize) -> Self {
self.chunk_size = size;
self.buffer = String::with_capacity(size);
self
}
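    /// Replaces the set of characters treated as sentence boundaries.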
#[must_use]
pub fn with_sentence_delimiters(mut self, delimiters: Vec<char>) -> Self {
self.sentence_delimiters = delimiters;
self
}
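    /// Appends `chunk` to the buffer and tokenizes everything up to and
    /// including the last sentence boundary found.
    ///
    /// Returns an empty `Vec` when no boundary is present yet, unless the
    /// buffer has grown past `max_buffer_size`, in which case part of it is
    /// force-flushed. Token positions are offset to stream-wide coordinates.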
pub fn process_chunk(&mut self, chunk: &str) -> Vec<Token> {
self.buffer.push_str(chunk);
let split_pos = self.find_last_sentence_boundary();
if let Some(pos) = split_pos {
let to_process = self.buffer[..=pos].to_string();
let remaining = self.buffer[pos + 1..].to_string();
let mut tokens = self.tokenizer.tokenize(&to_process);
for token in &mut tokens {
token.start_pos += self.total_chars_processed;
token.end_pos += self.total_chars_processed;
}
self.total_chars_processed += to_process.chars().count();
self.buffer = remaining;
tokens
} else if self.buffer.len() > self.max_buffer_size {
self.force_flush_partial()
} else {
Vec::new()
}
}
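    /// Returns the byte index of the final byte of the right-most sentence
    /// delimiter in the buffer (so `..=pos` keeps the delimiter), skipping a
    /// `.` that sits between two ASCII digits. Returns `None` when the buffer
    /// contains no delimiter.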
fn find_last_sentence_boundary(&self) -> Option<usize> {
let bytes = self.buffer.as_bytes();
for (i, ch) in self.buffer.char_indices().rev() {
if self.sentence_delimiters.contains(&ch) {
if ch == '.' && i > 0 && i + ch.len_utf8() < bytes.len() {
let prev_byte = bytes[i - 1];
let next_byte = bytes[i + ch.len_utf8()];
if prev_byte.is_ascii_digit() && next_byte.is_ascii_digit() {
continue;
}
}
return Some(i + ch.len_utf8() - 1);
}
}
None
}
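    /// Finds a byte position at or before `target_pos` that lies on a char
    /// boundary, preferring the position just after the last whitespace or
    /// delimiter so a forced split does not cut through a word.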
fn find_safe_split_point(&self, target_pos: usize) -> usize {
let mut pos = target_pos.min(self.buffer.len());
while pos > 0 && !self.buffer.is_char_boundary(pos) {
pos -= 1;
}
for (byte_idx, ch) in self.buffer[..pos].char_indices().rev() {
if ch.is_whitespace() || self.sentence_delimiters.contains(&ch) {
return byte_idx + ch.len_utf8();
}
}
pos
}
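    /// Handles a buffer that exceeded `max_buffer_size` without a sentence
    /// boundary: splits near the middle at a safe point and tokenizes the
    /// first part, falling back to a full flush when no safe split exists.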
fn force_flush_partial(&mut self) -> Vec<Token> {
let target_pos = self.buffer.len() / 2;
let split_pos = self.find_safe_split_point(target_pos);
if split_pos == 0 {
return self.flush();
}
let to_process = self.buffer[..split_pos].to_string();
let remaining = self.buffer[split_pos..].to_string();
let mut tokens = self.tokenizer.tokenize(&to_process);
for token in &mut tokens {
token.start_pos += self.total_chars_processed;
token.end_pos += self.total_chars_processed;
}
self.total_chars_processed += to_process.chars().count();
self.buffer = remaining;
tokens
}
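    /// Tokenizes whatever remains in the buffer and clears it. Call this after
    /// the final chunk to emit the trailing partial sentence.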
pub fn flush(&mut self) -> Vec<Token> {
if self.buffer.is_empty() {
return Vec::new();
}
let to_process = std::mem::take(&mut self.buffer);
let mut tokens = self.tokenizer.tokenize(&to_process);
for token in &mut tokens {
token.start_pos += self.total_chars_processed;
token.end_pos += self.total_chars_processed;
}
self.total_chars_processed += to_process.chars().count();
tokens
}
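    /// Reads the source line by line, feeding each line through
    /// [`Self::process_chunk`], then flushes the trailing buffer.
    ///
    /// # Errors
    ///
    /// Returns an error if reading from the source fails.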
pub fn process_reader<R: Read>(&mut self, reader: R) -> Result<Vec<Token>> {
let mut buf_reader = BufReader::with_capacity(self.chunk_size, reader);
let mut all_tokens = Vec::new();
loop {
let mut line = String::new();
let bytes_read = buf_reader
.read_line(&mut line)
.map_err(|e| crate::Error::Analysis(format!("Failed to read line: {e}")))?;
if bytes_read == 0 {
                break;
            }
let tokens = self.process_chunk(&line);
all_tokens.extend(tokens);
}
let remaining = self.flush();
all_tokens.extend(remaining);
Ok(all_tokens)
}
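    /// Opens the file at `path` and tokenizes its contents via
    /// [`Self::process_reader`].
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be opened or read.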
pub fn process_file<P: AsRef<std::path::Path>>(&mut self, path: P) -> Result<Vec<Token>> {
let file = std::fs::File::open(path)
.map_err(|e| crate::Error::Analysis(format!("Failed to open file: {e}")))?;
self.process_reader(file)
}
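    /// Number of bytes currently held in the internal buffer.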
#[must_use]
pub fn buffer_len(&self) -> usize {
self.buffer.len()
}
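    /// Total number of characters tokenized so far; used to offset token
    /// positions to stream-wide coordinates.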
#[must_use]
pub const fn total_chars_processed(&self) -> usize {
self.total_chars_processed
}
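    /// Clears the buffer and resets the processed-character counter.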
pub fn reset(&mut self) {
self.buffer.clear();
self.total_chars_processed = 0;
}
}
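/// Lazily tokenizes an iterator of text chunks, yielding one [`Token`] at a
/// time. Partial sentences are carried across chunks by an inner
/// [`StreamingTokenizer`]; any remaining text is flushed once the chunk
/// iterator is exhausted.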
pub struct TokenStream<I>
where
I: Iterator<Item = String>,
{
chunks: I,
streaming: StreamingTokenizer,
token_buffer: VecDeque<Token>,
finished: bool,
tokens_yielded: usize,
}
impl<I> TokenStream<I>
where
I: Iterator<Item = String>,
{
#[must_use]
pub fn new(chunks: I, tokenizer: Tokenizer) -> Self {
Self {
chunks,
streaming: StreamingTokenizer::new(tokenizer),
token_buffer: VecDeque::new(),
finished: false,
tokens_yielded: 0,
}
}
#[must_use]
pub fn with_chunk_size(mut self, size: usize) -> Self {
self.streaming = self.streaming.with_chunk_size(size);
self
}
#[must_use]
pub const fn tokens_yielded(&self) -> usize {
self.tokens_yielded
}
}
impl<I> Iterator for TokenStream<I>
where
I: Iterator<Item = String>,
{
type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
if let Some(token) = self.token_buffer.pop_front() {
self.tokens_yielded += 1;
return Some(token);
}
if self.finished {
return None;
}
for chunk in self.chunks.by_ref() {
let tokens = self.streaming.process_chunk(&chunk);
if !tokens.is_empty() {
self.token_buffer.extend(tokens);
if let Some(token) = self.token_buffer.pop_front() {
self.tokens_yielded += 1;
return Some(token);
}
}
}
self.finished = true;
let remaining = self.streaming.flush();
if !remaining.is_empty() {
self.token_buffer.extend(remaining);
if let Some(token) = self.token_buffer.pop_front() {
self.tokens_yielded += 1;
return Some(token);
}
}
None
}
fn size_hint(&self) -> (usize, Option<usize>) {
(self.token_buffer.len(), None)
}
}
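/// Callback invoked with a [`StreamingProgress`] snapshot while input is processed.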
pub type ProgressCallback = Box<dyn Fn(StreamingProgress) + Send>;
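/// Snapshot of how much input has been consumed and how many tokens have been
/// produced so far.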
#[derive(Debug, Clone)]
pub struct StreamingProgress {
pub bytes_processed: usize,
pub total_bytes: Option<usize>,
pub tokens_generated: usize,
pub chunks_processed: usize,
}
impl StreamingProgress {
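    /// Completion percentage (0–100), or `None` when the total input size is unknown.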
#[must_use]
#[allow(clippy::cast_precision_loss)]
pub fn percent(&self) -> Option<f64> {
self.total_bytes
.map(|total| (self.bytes_processed as f64 / total as f64) * 100.0)
}
}
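/// [`StreamingTokenizer`] wrapper that tracks bytes, chunks, and tokens
/// processed and reports progress through an optional callback at a
/// configurable byte interval.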
pub struct ProgressStreamingTokenizer {
inner: StreamingTokenizer,
callback: Option<ProgressCallback>,
bytes_processed: usize,
total_bytes: Option<usize>,
tokens_generated: usize,
chunks_processed: usize,
callback_interval: usize,
last_callback_bytes: usize,
}
impl ProgressStreamingTokenizer {
pub const DEFAULT_CALLBACK_INTERVAL: usize = 65536;
#[must_use]
pub fn new(tokenizer: Tokenizer) -> Self {
Self {
inner: StreamingTokenizer::new(tokenizer),
callback: None,
bytes_processed: 0,
total_bytes: None,
tokens_generated: 0,
chunks_processed: 0,
callback_interval: Self::DEFAULT_CALLBACK_INTERVAL,
last_callback_bytes: 0,
}
}
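    /// Registers a callback that receives progress snapshots as input is processed.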
#[must_use]
pub fn with_progress_callback<F>(mut self, callback: F) -> Self
where
F: Fn(StreamingProgress) + Send + 'static,
{
self.callback = Some(Box::new(callback));
self
}
#[must_use]
pub const fn with_total_bytes(mut self, total: usize) -> Self {
self.total_bytes = Some(total);
self
}
#[must_use]
pub const fn with_callback_interval(mut self, interval: usize) -> Self {
self.callback_interval = interval;
self
}
#[must_use]
pub fn with_chunk_size(mut self, size: usize) -> Self {
self.inner = self.inner.with_chunk_size(size);
self
}
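    /// Processes a chunk through the inner tokenizer, updating the counters and
    /// reporting progress once at least `callback_interval` new bytes have been
    /// consumed since the previous report.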
pub fn process_chunk(&mut self, chunk: &str) -> Vec<Token> {
self.bytes_processed += chunk.len();
self.chunks_processed += 1;
let tokens = self.inner.process_chunk(chunk);
self.tokens_generated += tokens.len();
if self.bytes_processed - self.last_callback_bytes >= self.callback_interval {
self.report_progress();
self.last_callback_bytes = self.bytes_processed;
}
tokens
}
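    /// Flushes the inner tokenizer and emits a final progress report.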
pub fn flush(&mut self) -> Vec<Token> {
let tokens = self.inner.flush();
self.tokens_generated += tokens.len();
self.report_progress();
tokens
}
fn report_progress(&self) {
if let Some(ref callback) = self.callback {
callback(StreamingProgress {
bytes_processed: self.bytes_processed,
total_bytes: self.total_bytes,
tokens_generated: self.tokens_generated,
chunks_processed: self.chunks_processed,
});
}
}
pub fn process_reader<R: Read>(&mut self, reader: R) -> Result<Vec<Token>> {
let mut buf_reader = BufReader::with_capacity(self.inner.chunk_size, reader);
let mut all_tokens = Vec::new();
loop {
let mut line = String::new();
let bytes_read = buf_reader
.read_line(&mut line)
.map_err(|e| crate::Error::Analysis(format!("Failed to read line: {e}")))?;
if bytes_read == 0 {
break;
}
let tokens = self.process_chunk(&line);
all_tokens.extend(tokens);
}
let remaining = self.flush();
all_tokens.extend(remaining);
Ok(all_tokens)
}
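    /// Reads the file's size (for percentage reporting), then tokenizes its
    /// contents via `process_reader`.
    ///
    /// # Errors
    ///
    /// Returns an error if the file metadata cannot be read or the file cannot
    /// be opened or read.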
#[allow(clippy::cast_possible_truncation)]
pub fn process_file<P: AsRef<std::path::Path>>(&mut self, path: P) -> Result<Vec<Token>> {
let metadata = std::fs::metadata(path.as_ref())
.map_err(|e| crate::Error::Analysis(format!("Failed to read metadata: {e}")))?;
self.total_bytes = Some(metadata.len() as usize);
let file = std::fs::File::open(path)
.map_err(|e| crate::Error::Analysis(format!("Failed to open file: {e}")))?;
self.process_reader(file)
}
#[must_use]
pub const fn progress(&self) -> StreamingProgress {
StreamingProgress {
bytes_processed: self.bytes_processed,
total_bytes: self.total_bytes,
tokens_generated: self.tokens_generated,
chunks_processed: self.chunks_processed,
}
}
pub fn reset(&mut self) {
self.inner.reset();
self.bytes_processed = 0;
self.tokens_generated = 0;
self.chunks_processed = 0;
self.last_callback_bytes = 0;
}
}
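/// Like [`TokenStream`], but yields tokens in batches: each `next` call pulls
/// chunks until one produces tokens and returns that `Vec<Token>`, with a final
/// batch from flushing the buffered remainder.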
pub struct ChunkedTokenIterator<I>
where
I: Iterator<Item = String>,
{
chunks: I,
streaming: StreamingTokenizer,
finished: bool,
}
impl<I> ChunkedTokenIterator<I>
where
I: Iterator<Item = String>,
{
#[must_use]
pub fn new(chunks: I, tokenizer: Tokenizer) -> Self {
Self {
chunks,
streaming: StreamingTokenizer::new(tokenizer),
finished: false,
}
}
#[must_use]
pub fn with_chunk_size(mut self, size: usize) -> Self {
self.streaming = self.streaming.with_chunk_size(size);
self
}
}
impl<I> Iterator for ChunkedTokenIterator<I>
where
I: Iterator<Item = String>,
{
type Item = Vec<Token>;
fn next(&mut self) -> Option<Self::Item> {
if self.finished {
return None;
}
for chunk in self.chunks.by_ref() {
let tokens = self.streaming.process_chunk(&chunk);
if !tokens.is_empty() {
return Some(tokens);
}
}
self.finished = true;
let remaining = self.streaming.flush();
if remaining.is_empty() {
None
} else {
Some(remaining)
}
}
}
#[cfg(test)]
#[allow(clippy::expect_used)]
mod tests {
use super::*;
fn create_test_tokenizer() -> Tokenizer {
Tokenizer::new().expect("should create tokenizer")
}
#[test]
fn test_streaming_tokenizer_creation() {
let tokenizer = create_test_tokenizer();
let stream = StreamingTokenizer::new(tokenizer);
assert_eq!(stream.buffer_len(), 0);
assert_eq!(stream.total_chars_processed(), 0);
}
#[test]
fn test_process_chunk_with_delimiter() {
let tokenizer = create_test_tokenizer();
let mut stream = StreamingTokenizer::new(tokenizer);
let tokens = stream.process_chunk("안녕\n");
assert!(!tokens.is_empty() || stream.buffer_len() > 0);
let remaining = stream.flush();
let total_tokens = tokens.len() + remaining.len();
assert!(total_tokens > 0);
}
#[test]
fn test_process_chunk_without_delimiter() {
let tokenizer = create_test_tokenizer();
let mut stream = StreamingTokenizer::new(tokenizer);
let tokens = stream.process_chunk("안녕하세요");
        assert!(tokens.is_empty(), "no delimiter yet, so no tokens should be emitted");
        assert!(stream.buffer_len() > 0);
}
#[test]
fn test_flush() {
let tokenizer = create_test_tokenizer();
let mut stream = StreamingTokenizer::new(tokenizer);
stream.process_chunk("안녕하세요");
let tokens = stream.flush();
assert!(!tokens.is_empty());
assert_eq!(stream.buffer_len(), 0);
}
#[test]
fn test_multiple_chunks() {
let tokenizer = create_test_tokenizer();
let mut stream = StreamingTokenizer::new(tokenizer);
let _tokens1 = stream.process_chunk("안녕하세요.\n");
let _tokens2 = stream.process_chunk("감사합니다.\n");
let _remaining = stream.flush();
assert!(stream.total_chars_processed() > 0);
}
#[test]
fn test_reset() {
let tokenizer = create_test_tokenizer();
let mut stream = StreamingTokenizer::new(tokenizer);
stream.process_chunk("안녕하세요");
stream.reset();
assert_eq!(stream.buffer_len(), 0);
assert_eq!(stream.total_chars_processed(), 0);
}
#[test]
fn test_custom_chunk_size() {
let tokenizer = create_test_tokenizer();
let stream = StreamingTokenizer::new(tokenizer).with_chunk_size(1024);
assert_eq!(stream.chunk_size, 1024);
}
#[test]
fn test_custom_delimiters() {
let tokenizer = create_test_tokenizer();
let stream =
StreamingTokenizer::new(tokenizer).with_sentence_delimiters(vec!['.', '!', '?']);
assert_eq!(stream.sentence_delimiters.len(), 3);
}
#[test]
fn test_token_stream_creation() {
let tokenizer = create_test_tokenizer();
let chunks = vec!["안녕하세요.\n".to_string(), "감사합니다.\n".to_string()];
let stream = TokenStream::new(chunks.into_iter(), tokenizer);
assert!(!stream.finished);
}
#[test]
fn test_token_stream_iteration() {
let tokenizer = create_test_tokenizer();
let chunks = vec!["안녕\n".to_string(), "감사\n".to_string()];
let stream = TokenStream::new(chunks.into_iter(), tokenizer);
let tokens: Vec<_> = stream.collect();
assert!(!tokens.is_empty());
}
#[test]
fn test_token_stream_tokens_yielded() {
let tokenizer = create_test_tokenizer();
let chunks = vec!["안녕하세요.\n".to_string()];
let mut stream = TokenStream::new(chunks.into_iter(), tokenizer);
let mut count = 0;
while stream.next().is_some() {
count += 1;
}
assert_eq!(stream.tokens_yielded(), count);
}
#[test]
fn test_token_stream_size_hint() {
let tokenizer = create_test_tokenizer();
let chunks = vec!["안녕하세요.\n".to_string()];
let stream = TokenStream::new(chunks.into_iter(), tokenizer);
let (lower, _upper) = stream.size_hint();
assert_eq!(lower, 0);
}
#[test]
fn test_progress_streaming_tokenizer() {
let tokenizer = create_test_tokenizer();
let mut stream = ProgressStreamingTokenizer::new(tokenizer);
let _tokens = stream.process_chunk("안녕하세요.\n");
let progress = stream.progress();
assert!(progress.bytes_processed > 0);
assert!(progress.chunks_processed > 0);
}
#[test]
fn test_progress_callback() {
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
let tokenizer = create_test_tokenizer();
let callback_count = Arc::new(AtomicUsize::new(0));
let callback_count_clone = Arc::clone(&callback_count);
let mut stream = ProgressStreamingTokenizer::new(tokenizer)
            .with_callback_interval(1)
            .with_progress_callback(move |_progress| {
callback_count_clone.fetch_add(1, Ordering::SeqCst);
});
stream.process_chunk("안녕하세요. 오늘 날씨가 좋네요.\n");
let _remaining = stream.flush();
assert!(callback_count.load(Ordering::SeqCst) > 0);
}
#[test]
fn test_progress_percent() {
let progress = StreamingProgress {
bytes_processed: 50,
total_bytes: Some(100),
tokens_generated: 10,
chunks_processed: 2,
};
assert_eq!(progress.percent(), Some(50.0));
}
#[test]
fn test_chunked_token_iterator() {
let tokenizer = create_test_tokenizer();
let chunks = vec!["안녕하세요.\n".to_string(), "감사합니다.\n".to_string()];
let iter = ChunkedTokenIterator::new(chunks.into_iter(), tokenizer);
let token_chunks: Vec<_> = iter.collect();
        let total_tokens: usize = token_chunks.iter().map(std::vec::Vec::len).sum();
        assert!(total_tokens > 0, "chunked iteration should produce tokens");
    }
#[test]
fn test_multibyte_delimiter_no_panic() {
let tokenizer = create_test_tokenizer();
let mut stream = StreamingTokenizer::new(tokenizer)
            .with_sentence_delimiters(vec!['.', '!', '?', '。', '．', '\n']);
let tokens = stream.process_chunk("テスト。次の文。\n");
let remaining = stream.flush();
let total = tokens.len() + remaining.len();
assert!(total > 0 || stream.buffer_len() == 0);
}
#[test]
fn test_decimal_number_not_split() {
let tokenizer = create_test_tokenizer();
let mut stream = StreamingTokenizer::new(tokenizer);
let tokens = stream.process_chunk("값은 3.14입니다.\n");
let remaining = stream.flush();
let all: Vec<_> = tokens.into_iter().chain(remaining).collect();
let surfaces: Vec<_> = all.iter().map(|t| t.surface.as_str()).collect();
let joined = surfaces.join(" ");
assert!(
!joined.contains("3 .") && !joined.contains(". 14"),
"Decimal was incorrectly split: {joined}"
);
}
#[test]
fn test_buffer_limit_forces_flush() {
let tokenizer = create_test_tokenizer();
let mut stream = StreamingTokenizer::new(tokenizer);
stream.max_buffer_size = 32;
let tokens = stream.process_chunk(&"가".repeat(100));
assert!(!tokens.is_empty(), "Buffer limit should force a flush");
}
#[test]
fn test_safe_split_point() {
let tokenizer = create_test_tokenizer();
        let mut stream = StreamingTokenizer::new(tokenizer)
            .with_sentence_delimiters(vec!['.', '!', '?', '\n', ' ']);
let _tokens = stream.process_chunk("안녕하세요 감사합니다");
assert!(stream.buffer_len() > 0);
}
}
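/// Iterator adapter that reads from any [`BufRead`] source and yields one
/// sentence at a time.
///
/// A sentence ends at a newline, or at `.`, `?`, or `!` (optionally followed by
/// closing brackets/quotes) when the next character is whitespace or the input
/// has ended; a `.` between two digits is treated as a decimal point rather
/// than a boundary.
///
/// Illustrative usage (a minimal sketch; the `my_crate::streaming` path is a
/// placeholder for this crate's actual path):
///
/// ```ignore
/// use std::io::Cursor;
/// use my_crate::streaming::SentenceReader;
///
/// let reader = SentenceReader::new(Cursor::new("첫 문장입니다. 둘째 문장입니다.\n"));
/// let sentences: Vec<String> = reader.collect::<std::io::Result<_>>()?;
/// assert_eq!(sentences.len(), 2);
/// ```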
pub struct SentenceReader<R: BufRead> {
reader: R,
buffer: String,
    queue: VecDeque<String>,
eof: bool,
max_buffer_size: usize,
}
impl<R: BufRead> SentenceReader<R> {
pub const DEFAULT_MAX_BUFFER_SIZE: usize = 16 * 1024 * 1024;
#[must_use]
pub const fn new(reader: R) -> Self {
Self {
reader,
buffer: String::new(),
            queue: VecDeque::new(),
eof: false,
max_buffer_size: Self::DEFAULT_MAX_BUFFER_SIZE,
}
}
#[must_use]
pub const fn with_max_buffer_size(mut self, size: usize) -> Self {
self.max_buffer_size = size;
self
}
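    /// Scans the buffer and moves every complete sentence into the queue.
    ///
    /// A sentence ends at a newline, or at `.`, `?`, or `!` (plus any trailing
    /// closing brackets/quotes) followed by whitespace or the end of input; a
    /// `.` between two digits is kept as a decimal point. Consumed text is
    /// drained from the buffer, and at EOF any remaining text is emitted as a
    /// final sentence.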
fn drain_sentences(&mut self) {
let buf = self.buffer.as_str();
let indices: Vec<(usize, char)> = buf.char_indices().collect();
let len = indices.len();
let mut start_char = 0;
let mut i = 0;
while i < len {
let (_, ch) = indices[i];
            // A newline always terminates the current sentence.
            if ch == '\n' {
let start_byte = indices[start_char].0;
let end_byte = indices[i].0;
let trimmed = buf[start_byte..end_byte].trim();
if !trimmed.is_empty() {
self.queue.push_back(trimmed.to_string());
}
start_char = i + 1;
i += 1;
continue;
}
if matches!(ch, '.' | '?' | '!') {
                // A '.' flanked by ASCII digits is a decimal point, not a boundary.
                if ch == '.' {
let prev_is_digit = i > 0 && indices[i - 1].1.is_ascii_digit();
let next_is_digit = i + 1 < len && indices[i + 1].1.is_ascii_digit();
if prev_is_digit && next_is_digit {
i += 1;
continue;
}
}
let punct_byte_end = indices[i].0 + ch.len_utf8();
                // Let the sentence keep any closing brackets/quotes after the terminator.
                let mut j = i + 1;
while j < len && matches!(indices[j].1, ')' | ']' | '"' | '\'') {
j += 1;
}
                // Split only when the terminator (and any trailing closers) is
                // followed by whitespace or sits at the end of the input.
                let followed_by_whitespace = j < len && indices[j].1.is_whitespace();
let followed_by_eof = j >= len && self.eof;
if followed_by_whitespace || followed_by_eof {
let start_byte = indices[start_char].0;
let trimmed = buf[start_byte..punct_byte_end].trim();
if !trimmed.is_empty() {
self.queue.push_back(trimmed.to_string());
}
start_char = j;
if j < len && indices[j].1.is_whitespace() && indices[j].1 != '\n' {
start_char = j + 1;
i = j + 1;
} else {
i = j;
}
continue;
}
}
i += 1;
}
if self.eof && start_char < len {
let start_byte = indices[start_char].0;
let trimmed = buf[start_byte..].trim();
if !trimmed.is_empty() {
self.queue.push_back(trimmed.to_string());
}
self.buffer.clear();
} else if start_char > 0 && start_char < len {
let byte_offset = indices[start_char].0;
self.buffer.drain(..byte_offset);
        } else if start_char >= len {
            // Everything in the buffer has been emitted as sentences; drop it
            // so stale text can never be re-scanned.
            self.buffer.clear();
        }
}
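    /// Reads one more line into the buffer, first emitting the whole buffer as
    /// a single sentence if it has already reached `max_buffer_size` (so input
    /// without sentence boundaries cannot grow the buffer unboundedly).
    /// Returns `Ok(false)` once the reader is exhausted.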
fn fill_buffer(&mut self) -> io::Result<bool> {
if self.buffer.len() >= self.max_buffer_size {
let trimmed = self.buffer.trim().to_string();
if !trimmed.is_empty() {
self.queue.push_back(trimmed);
}
self.buffer.clear();
}
let mut line = String::new();
let n = self.reader.read_line(&mut line)?;
if n == 0 {
self.eof = true;
Ok(false)
} else {
self.buffer.push_str(&line);
Ok(true)
}
}
}
impl<R: BufRead> Iterator for SentenceReader<R> {
type Item = io::Result<String>;
fn next(&mut self) -> Option<Self::Item> {
loop {
if let Some(sentence) = self.queue.pop_front() {
return Some(Ok(sentence));
}
if self.eof {
return None;
}
if let Err(e) = self.fill_buffer() {
return Some(Err(e));
}
self.drain_sentences();
}
}
}
#[cfg(test)]
#[allow(clippy::expect_used)]
mod sentence_reader_tests {
use super::*;
use std::io::Cursor;
#[test]
fn test_single_sentence() {
let input = "안녕하세요.\n";
let reader = SentenceReader::new(Cursor::new(input));
let sentences: Vec<_> = reader.map(|r| r.unwrap()).collect();
assert_eq!(sentences, vec!["안녕하세요."]);
}
#[test]
fn test_multiple_sentences() {
let input = "첫 번째 문장입니다. 두 번째 문장입니다.\n";
let reader = SentenceReader::new(Cursor::new(input));
let sentences: Vec<_> = reader.map(|r| r.unwrap()).collect();
assert_eq!(sentences.len(), 2);
assert_eq!(sentences[0], "첫 번째 문장입니다.");
assert_eq!(sentences[1], "두 번째 문장입니다.");
}
#[test]
fn test_newline_boundary() {
let input = "줄 하나\n줄 둘\n";
let reader = SentenceReader::new(Cursor::new(input));
let sentences: Vec<_> = reader.map(|r| r.unwrap()).collect();
assert_eq!(sentences, vec!["줄 하나", "줄 둘"]);
}
#[test]
fn test_decimal_not_boundary() {
let input = "값은 3.14입니다.\n";
let reader = SentenceReader::new(Cursor::new(input));
let sentences: Vec<_> = reader.map(|r| r.unwrap()).collect();
assert_eq!(sentences, vec!["값은 3.14입니다."]);
}
#[test]
fn test_question_mark() {
let input = "이것은 무엇인가요? 네, 맞습니다.\n";
let reader = SentenceReader::new(Cursor::new(input));
let sentences: Vec<_> = reader.collect::<std::result::Result<_, _>>().unwrap();
assert_eq!(sentences.len(), 2);
}
#[test]
fn test_empty_input() {
let input = "";
let reader = SentenceReader::new(Cursor::new(input));
let sentences: Vec<_> = reader.collect::<std::result::Result<_, _>>().unwrap();
assert!(sentences.is_empty());
}
#[test]
fn test_no_trailing_newline() {
let input = "마지막 문장";
let reader = SentenceReader::new(Cursor::new(input));
let sentences: Vec<_> = reader.map(|r| r.unwrap()).collect();
assert_eq!(sentences, vec!["마지막 문장"]);
}
#[test]
fn test_multiple_newlines() {
let input = "첫째\n\n둘째\n";
let reader = SentenceReader::new(Cursor::new(input));
let sentences: Vec<_> = reader.map(|r| r.unwrap()).collect();
assert_eq!(sentences, vec!["첫째", "둘째"]);
}
#[test]
fn test_exclamation() {
let input = "대단합니다! 정말요?\n";
let reader = SentenceReader::new(Cursor::new(input));
let sentences: Vec<_> = reader.collect::<std::result::Result<_, _>>().unwrap();
assert_eq!(sentences.len(), 2);
}
#[test]
fn test_sentence_reader_is_send() {
fn assert_send<T: Send>() {}
assert_send::<SentenceReader<std::io::Cursor<&[u8]>>>();
}
#[test]
fn test_closing_paren_before_whitespace() {
let input = "문장입니다.) 다음 문장.\n";
let reader = SentenceReader::new(Cursor::new(input));
let sentences: Vec<_> = reader.collect::<std::result::Result<_, _>>().unwrap();
assert_eq!(sentences.len(), 2);
}
#[test]
fn test_no_trailing_newline_punctuation() {
let input = "첫째. 둘째.";
let reader = SentenceReader::new(Cursor::new(input));
let sentences: Vec<_> = reader.map(|r| r.unwrap()).collect();
assert_eq!(sentences.len(), 2);
assert_eq!(sentences[0], "첫째.");
assert_eq!(sentences[1], "둘째.");
}
#[test]
fn test_buffer_limit_prevents_oom() {
let long_line = "가".repeat(200);
let reader = SentenceReader::new(Cursor::new(long_line.as_str())).with_max_buffer_size(64);
let sentences: Vec<_> = reader.map(|r| r.unwrap()).collect();
assert!(!sentences.is_empty());
}
}