mecab-ko-core 0.7.2

Korean morphological analysis core engine - Lattice, Viterbi, tokenizer
//! # Async Tokenizer Module
//!
//! Asynchronous morphological analysis API (built on tokio).
//!
//! ## Key features
//!
//! - Asynchronous file processing
//! - Asynchronous stream processing
//! - Concurrency control (parallel processing)
//!
//! ## Example
//!
//! ```rust,no_run
//! use mecab_ko_core::async_tokenizer::AsyncTokenizer;
//!
//! #[tokio::main]
//! async fn main() {
//!     let tokenizer = AsyncTokenizer::new().await.unwrap();
//!     let tokens = tokenizer.tokenize_async("안녕하세요").await;
//!
//!     for token in tokens {
//!         println!("{}: {}", token.surface, token.pos);
//!     }
//! }
//! ```

use std::path::Path;
use std::sync::Arc;

use tokio::fs::File;
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
use tokio::sync::{Mutex, Semaphore};

use crate::tokenizer::{Token, Tokenizer};
use crate::Result;

/// Asynchronous tokenizer.
///
/// Performs asynchronous morphological analysis on the tokio runtime.
/// Internally wraps the synchronous Tokenizer in a Mutex for shared access.
pub struct AsyncTokenizer {
    /// Synchronous tokenizer (shared via Arc<Mutex>)
    tokenizer: Arc<Mutex<Tokenizer>>,

    /// Concurrency limiter (semaphore)
    semaphore: Arc<Semaphore>,

    /// Maximum number of concurrent tasks
    max_concurrent: usize,
}

impl AsyncTokenizer {
    /// Default maximum number of concurrent tasks
    pub const DEFAULT_MAX_CONCURRENT: usize = 4;

    /// Creates a new asynchronous tokenizer.
    ///
    /// # Errors
    ///
    /// Returns an error if tokenizer initialization fails.
    pub async fn new() -> Result<Self> {
        let tokenizer = tokio::task::spawn_blocking(Tokenizer::new)
            .await
            .map_err(|e| crate::Error::Init(format!("Failed to spawn task: {e}")))?
            .map_err(|e| crate::Error::Init(format!("Failed to create tokenizer: {e}")))?;

        Ok(Self {
            tokenizer: Arc::new(Mutex::new(tokenizer)),
            semaphore: Arc::new(Semaphore::new(Self::DEFAULT_MAX_CONCURRENT)),
            max_concurrent: Self::DEFAULT_MAX_CONCURRENT,
        })
    }

    /// Creates a tokenizer with an explicit dictionary path.
    ///
    /// # Arguments
    ///
    /// * `dict_path` - Path to the dictionary directory
    ///
    /// # Errors
    ///
    /// Returns an error if tokenizer initialization fails.
    pub async fn with_dict<P: AsRef<Path> + Send + 'static>(dict_path: P) -> Result<Self> {
        let tokenizer = tokio::task::spawn_blocking(move || Tokenizer::with_dict(dict_path))
            .await
            .map_err(|e| crate::Error::Init(format!("Failed to spawn task: {e}")))?
            .map_err(|e| crate::Error::Init(format!("Failed to create tokenizer: {e}")))?;

        Ok(Self {
            tokenizer: Arc::new(Mutex::new(tokenizer)),
            semaphore: Arc::new(Semaphore::new(Self::DEFAULT_MAX_CONCURRENT)),
            max_concurrent: Self::DEFAULT_MAX_CONCURRENT,
        })
    }

    /// Sets the maximum number of concurrent tasks.
    ///
    /// # Arguments
    ///
    /// * `max` - Maximum number of concurrent tasks
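    ///
    /// # Example
    ///
    /// A minimal sketch of the builder-style setter (illustrative only):
    ///
    /// ```rust,no_run
    /// # use mecab_ko_core::async_tokenizer::AsyncTokenizer;
    /// # async fn example() {
    /// let tokenizer = AsyncTokenizer::new().await.unwrap().with_max_concurrent(8);
    /// assert_eq!(tokenizer.max_concurrent(), 8);
    /// # }
    /// ```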
    #[must_use]
    pub fn with_max_concurrent(mut self, max: usize) -> Self {
        self.max_concurrent = max;
        self.semaphore = Arc::new(Semaphore::new(max));
        self
    }

    /// Tokenizes text asynchronously.
    ///
    /// # Arguments
    ///
    /// * `text` - Text to analyze
    ///
    /// # Returns
    ///
    /// List of tokens
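    ///
    /// # Example
    ///
    /// A minimal sketch; the actual tokens depend on the loaded dictionary:
    ///
    /// ```rust,no_run
    /// # use mecab_ko_core::async_tokenizer::AsyncTokenizer;
    /// # async fn example() {
    /// # let tokenizer = AsyncTokenizer::new().await.unwrap();
    /// let tokens = tokenizer.tokenize_async("안녕하세요").await;
    /// for token in &tokens {
    ///     println!("{}: {}", token.surface, token.pos);
    /// }
    /// # }
    /// ```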
    pub async fn tokenize_async(&self, text: &str) -> Vec<Token> {
        // Limit concurrency with the semaphore.
        let _permit = match self.semaphore.acquire().await {
            Ok(permit) => permit,
            Err(_) => return Vec::new(), // Semaphore closed, return empty result
        };

        let text_owned = text.to_string();
        let tokenizer = Arc::clone(&self.tokenizer);

        // Run the blocking tokenization on a dedicated blocking thread.
        tokio::task::spawn_blocking(move || {
            let mut tok = tokenizer.blocking_lock();
            tok.tokenize(&text_owned)
        })
        .await
        .unwrap_or_default()
    }

    /// Tokenizes a file asynchronously.
    ///
    /// # Arguments
    ///
    /// * `path` - File path
    ///
    /// # Returns
    ///
    /// All tokens from the file
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be read.
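    ///
    /// # Example
    ///
    /// A minimal sketch; `input.txt` is a hypothetical path:
    ///
    /// ```rust,no_run
    /// # use mecab_ko_core::async_tokenizer::AsyncTokenizer;
    /// # async fn example() {
    /// # let tokenizer = AsyncTokenizer::new().await.unwrap();
    /// let tokens = tokenizer.tokenize_file("input.txt").await.unwrap();
    /// println!("{} tokens", tokens.len());
    /// # }
    /// ```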
    pub async fn tokenize_file<P: AsRef<Path>>(&self, path: P) -> Result<Vec<Token>> {
        let file = File::open(path)
            .await
            .map_err(|e| crate::Error::Analysis(format!("Failed to open file: {e}")))?;

        self.tokenize_reader(file).await
    }

    /// Tokenizes input from an asynchronous reader, line by line.
    ///
    /// # Arguments
    ///
    /// * `reader` - Asynchronous reader
    ///
    /// # Returns
    ///
    /// All tokens read from the reader
    ///
    /// # Errors
    ///
    /// Returns an error if reading fails.
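    ///
    /// # Example
    ///
    /// A minimal sketch using an in-memory reader (tokio implements `AsyncRead`
    /// for `std::io::Cursor`):
    ///
    /// ```rust,no_run
    /// # use mecab_ko_core::async_tokenizer::AsyncTokenizer;
    /// # async fn example() {
    /// # let tokenizer = AsyncTokenizer::new().await.unwrap();
    /// let reader = std::io::Cursor::new("첫 번째 줄.\n두 번째 줄.\n".as_bytes());
    /// let tokens = tokenizer.tokenize_reader(reader).await.unwrap();
    /// println!("{} tokens", tokens.len());
    /// # }
    /// ```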
    pub async fn tokenize_reader<R: AsyncRead + Unpin>(&self, reader: R) -> Result<Vec<Token>> {
        let mut buf_reader = BufReader::new(reader);
        let mut all_tokens = Vec::new();

        loop {
            let mut line = String::new();
            let bytes_read = buf_reader
                .read_line(&mut line)
                .await
                .map_err(|e| crate::Error::Analysis(format!("Failed to read line: {e}")))?;

            if bytes_read == 0 {
                break; // EOF
            }

            let tokens = self.tokenize_async(&line).await;
            all_tokens.extend(tokens);
        }

        Ok(all_tokens)
    }

    /// Tokenizes a batch of texts asynchronously.
    ///
    /// Processes multiple texts concurrently, bounded by the semaphore limit.
    ///
    /// # Arguments
    ///
    /// * `texts` - List of texts
    ///
    /// # Returns
    ///
    /// One token list per input text, in input order
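    ///
    /// # Example
    ///
    /// A minimal sketch; each input text yields one (possibly empty) token list:
    ///
    /// ```rust,no_run
    /// # use mecab_ko_core::async_tokenizer::AsyncTokenizer;
    /// # async fn example() {
    /// # let tokenizer = AsyncTokenizer::new().await.unwrap();
    /// let texts = vec!["안녕하세요".to_string(), "감사합니다".to_string()];
    /// let results = tokenizer.tokenize_batch(texts).await;
    /// assert_eq!(results.len(), 2);
    /// # }
    /// ```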
    pub async fn tokenize_batch(&self, texts: Vec<String>) -> Vec<Vec<Token>> {
        let mut handles = Vec::new();

        for text in texts {
            let tokenizer = Arc::clone(&self.tokenizer);
            let semaphore = Arc::clone(&self.semaphore);

            let handle = tokio::spawn(async move {
                let _permit = match semaphore.acquire().await {
                    Ok(permit) => permit,
                    Err(_) => return Vec::new(), // Semaphore closed, return empty result
                };

                tokio::task::spawn_blocking(move || {
                    let mut tok = tokenizer.blocking_lock();
                    tok.tokenize(&text)
                })
                .await
                .unwrap_or_default()
            });

            handles.push(handle);
        }

        let mut results = Vec::new();
        for handle in handles {
            if let Ok(tokens) = handle.await {
                results.push(tokens);
            } else {
                results.push(Vec::new());
            }
        }

        results
    }

    /// Tokenizes a stream of texts.
    ///
    /// # Arguments
    ///
    /// * `texts` - Iterator of texts
    ///
    /// # Returns
    ///
    /// One token list per input text
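    ///
    /// # Example
    ///
    /// A minimal sketch; any `IntoIterator<Item = String>` is accepted:
    ///
    /// ```rust,no_run
    /// # use mecab_ko_core::async_tokenizer::AsyncTokenizer;
    /// # async fn example() {
    /// # let tokenizer = AsyncTokenizer::new().await.unwrap();
    /// let lines = std::iter::repeat("안녕하세요".to_string()).take(3);
    /// let results = tokenizer.tokenize_stream(lines).await;
    /// assert_eq!(results.len(), 3);
    /// # }
    /// ```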
    pub async fn tokenize_stream<I>(&self, texts: I) -> Vec<Vec<Token>>
    where
        I: IntoIterator<Item = String>,
    {
        let texts_vec: Vec<_> = texts.into_iter().collect();
        self.tokenize_batch(texts_vec).await
    }

    /// Locks and returns the underlying synchronous tokenizer (for use in an async context).
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// # use mecab_ko_core::async_tokenizer::AsyncTokenizer;
    /// # async fn example() {
    /// #     let async_tokenizer = AsyncTokenizer::new().await.unwrap();
    /// let tokenizer = async_tokenizer.get_tokenizer().await;
    /// // ... perform synchronous work here
    /// # }
    /// ```
    pub async fn get_tokenizer(&self) -> tokio::sync::MutexGuard<'_, Tokenizer> {
        self.tokenizer.lock().await
    }

    /// Returns the maximum number of concurrent tasks.
    #[must_use]
    pub fn max_concurrent(&self) -> usize {
        self.max_concurrent
    }
}

/// Asynchronous streaming tokenizer.
///
/// Accepts AsyncRead input and produces tokens asynchronously, buffering
/// partial sentences between chunks.
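///
/// # Example
///
/// A minimal sketch of incremental use (the token output depends on the dictionary):
///
/// ```rust,no_run
/// # use mecab_ko_core::async_tokenizer::{AsyncTokenizer, AsyncStreamingTokenizer};
/// # async fn example() {
/// # let tokenizer = AsyncTokenizer::new().await.unwrap();
/// let mut stream = AsyncStreamingTokenizer::new(tokenizer);
/// // Complete sentences are tokenized; the trailing fragment stays buffered.
/// let mut tokens = stream.process_chunk("안녕하세요. 다음 문장").await;
/// tokens.extend(stream.flush().await);
/// # }
/// ```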
pub struct AsyncStreamingTokenizer {
    /// Asynchronous tokenizer
    tokenizer: AsyncTokenizer,

    /// Buffer for text not yet tokenized
    buffer: String,

    /// Sentence delimiters
    sentence_delimiters: Vec<char>,
}

impl AsyncStreamingTokenizer {
    /// Creates a new asynchronous streaming tokenizer.
    ///
    /// # Arguments
    ///
    /// * `tokenizer` - Asynchronous tokenizer
    #[must_use]
    pub fn new(tokenizer: AsyncTokenizer) -> Self {
        Self {
            tokenizer,
            buffer: String::new(),
            // NOTE: two delimiter characters were garbled in the extracted source;
            // the CJK full stop '。' and fullwidth '？' are assumed here.
            sentence_delimiters: vec!['.', '!', '?', '。', '？', '\n'],
        }
    }

    /// Processes a chunk of text (asynchronously).
    ///
    /// # Arguments
    ///
    /// * `chunk` - Input chunk
    ///
    /// # Returns
    ///
    /// Tokens for any complete sentences; incomplete text stays buffered
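    ///
    /// # Example
    ///
    /// A minimal sketch: text without a delimiter is buffered until one arrives.
    ///
    /// ```rust,no_run
    /// # use mecab_ko_core::async_tokenizer::{AsyncTokenizer, AsyncStreamingTokenizer};
    /// # async fn example() {
    /// # let tokenizer = AsyncTokenizer::new().await.unwrap();
    /// let mut stream = AsyncStreamingTokenizer::new(tokenizer);
    /// // No delimiter yet, so nothing is emitted and the text stays buffered.
    /// assert!(stream.process_chunk("아직 문장이 끝나지").await.is_empty());
    /// // The newline closes the sentence, so the buffered text is tokenized.
    /// let tokens = stream.process_chunk(" 않았습니다.\n").await;
    /// println!("{} tokens", tokens.len());
    /// # }
    /// ```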
    pub async fn process_chunk(&mut self, chunk: &str) -> Vec<Token> {
        self.buffer.push_str(chunk);

        // Find the last sentence delimiter in the buffer.
        let split_pos = self.find_last_sentence_boundary();

        if let Some(pos) = split_pos {
            let to_process = self.buffer[..=pos].to_string();
            let remaining = self.buffer[pos + 1..].to_string();

            let tokens = self.tokenizer.tokenize_async(&to_process).await;

            self.buffer = remaining;
            tokens
        } else {
            Vec::new()
        }
    }

    /// Finds the byte index of the last sentence boundary in the buffer.
    fn find_last_sentence_boundary(&self) -> Option<usize> {
        let mut last_pos = None;

        for (i, ch) in self.buffer.char_indices() {
            if self.sentence_delimiters.contains(&ch) {
                last_pos = Some(i);
            }
        }

        last_pos
    }

    /// Tokenizes whatever remains in the buffer (asynchronously).
    pub async fn flush(&mut self) -> Vec<Token> {
        if self.buffer.is_empty() {
            return Vec::new();
        }

        let to_process = std::mem::take(&mut self.buffer);
        self.tokenizer.tokenize_async(&to_process).await
    }

    /// Streams input from an asynchronous reader, line by line.
    ///
    /// # Arguments
    ///
    /// * `reader` - Asynchronous reader
    ///
    /// # Returns
    ///
    /// All tokens, including those flushed from the buffer at EOF
    ///
    /// # Errors
    ///
    /// Returns an error if reading fails.
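    ///
    /// # Example
    ///
    /// A minimal sketch using an in-memory reader:
    ///
    /// ```rust,no_run
    /// # use mecab_ko_core::async_tokenizer::{AsyncTokenizer, AsyncStreamingTokenizer};
    /// # async fn example() {
    /// # let tokenizer = AsyncTokenizer::new().await.unwrap();
    /// let mut stream = AsyncStreamingTokenizer::new(tokenizer);
    /// let reader = std::io::Cursor::new("첫째 줄.\n둘째 줄.\n".as_bytes());
    /// let tokens = stream.process_reader(reader).await.unwrap();
    /// println!("{} tokens", tokens.len());
    /// # }
    /// ```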
    pub async fn process_reader<R: AsyncRead + Unpin>(&mut self, reader: R) -> Result<Vec<Token>> {
        let mut buf_reader = BufReader::new(reader);
        let mut all_tokens = Vec::new();

        loop {
            let mut line = String::new();
            let bytes_read = buf_reader
                .read_line(&mut line)
                .await
                .map_err(|e| crate::Error::Analysis(format!("Failed to read line: {e}")))?;

            if bytes_read == 0 {
                break; // EOF
            }

            let tokens = self.process_chunk(&line).await;
            all_tokens.extend(tokens);
        }

        // Flush
        let remaining = self.flush().await;
        all_tokens.extend(remaining);

        Ok(all_tokens)
    }
}

#[cfg(test)]
#[allow(clippy::expect_used)]
mod tests {
    use super::*;

    // ---------------------------------------------------------------------------
    // AsyncTokenizer — construction
    // ---------------------------------------------------------------------------

    #[tokio::test]
    async fn test_async_tokenizer_creation() {
        let result = AsyncTokenizer::new().await;
        assert!(result.is_ok());
    }

    /// DEFAULT_MAX_CONCURRENT is 4 immediately after construction.
    #[tokio::test]
    async fn test_default_max_concurrent_value() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        assert_eq!(tokenizer.max_concurrent(), AsyncTokenizer::DEFAULT_MAX_CONCURRENT);
    }

    /// `with_max_concurrent` returns a new value reflected by `max_concurrent()`.
    #[tokio::test]
    async fn test_max_concurrent() {
        let tokenizer = AsyncTokenizer::new()
            .await
            .expect("should create")
            .with_max_concurrent(8);

        assert_eq!(tokenizer.max_concurrent(), 8);
    }

    /// `with_max_concurrent(1)` is a legal edge value (serialise all work).
    #[tokio::test]
    async fn test_max_concurrent_one() {
        let tokenizer = AsyncTokenizer::new()
            .await
            .expect("should create")
            .with_max_concurrent(1);

        assert_eq!(tokenizer.max_concurrent(), 1);
    }

    // ---------------------------------------------------------------------------
    // AsyncTokenizer — tokenize_async
    // ---------------------------------------------------------------------------

    /// Empty string input must return an empty Vec without panicking.
    #[tokio::test]
    async fn test_tokenize_async_empty_string() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let tokens = tokenizer.tokenize_async("").await;
        // Empty input always produces zero tokens regardless of the dictionary.
        assert!(tokens.is_empty(), "expected no tokens for empty input, got {}", tokens.len());
    }

    /// Single ASCII character: must not panic; every returned token has a non-empty surface.
    #[tokio::test]
    async fn test_tokenize_async_single_ascii_char() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let tokens = tokenizer.tokenize_async("a").await;
        assert!(tokens.iter().all(|t| !t.surface.is_empty()));
    }

    /// Korean text tokenisation — may produce 0 tokens with the mini-dict, but
    /// must not panic and must return a Vec.
    #[tokio::test]
    async fn test_tokenize_async_korean_text() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let tokens = tokenizer.tokenize_async("안녕하세요").await;
        assert!(tokens.iter().all(|t| !t.surface.is_empty()));
    }

    /// Multi-byte Korean input with punctuation must not panic.
    #[tokio::test]
    async fn test_tokenize_async_multibyte_korean() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        // 오늘 날씨가 좋네요 — contains multi-byte UTF-8 characters
        let tokens = tokenizer.tokenize_async("오늘 날씨가 좋네요.").await;
        assert!(tokens.iter().all(|t| !t.surface.is_empty()));
    }

    /// Calling tokenize_async twice on the same AsyncTokenizer must work (Mutex
    /// released between calls).
    #[tokio::test]
    async fn test_tokenize_async_reuse() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let t1 = tokenizer.tokenize_async("안녕").await;
        let t2 = tokenizer.tokenize_async("안녕").await;
        // Both calls must produce the same number of tokens (determinism).
        assert_eq!(t1.len(), t2.len(), "repeated calls should return same token count");
    }

    // ---------------------------------------------------------------------------
    // AsyncTokenizer — tokenize_batch
    // ---------------------------------------------------------------------------

    /// Batch with two texts — result length must equal input length.
    #[tokio::test]
    async fn test_tokenize_batch_length() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let texts = vec!["안녕하세요".to_string(), "감사합니다".to_string()];
        let results = tokenizer.tokenize_batch(texts).await;
        assert_eq!(results.len(), 2, "batch result count must match input count");
    }

    /// Batch with an empty list — must return an empty Vec.
    #[tokio::test]
    async fn test_tokenize_batch_empty_input() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let results = tokenizer.tokenize_batch(Vec::new()).await;
        assert!(results.is_empty(), "empty batch must produce empty results");
    }

    /// Batch with a single-item list — result length is 1.
    #[tokio::test]
    async fn test_tokenize_batch_single_item() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let results = tokenizer.tokenize_batch(vec!["안녕".to_string()]).await;
        assert_eq!(results.len(), 1);
    }

    /// Batch with empty string entries — must return a result per entry.
    #[tokio::test]
    async fn test_tokenize_batch_with_empty_strings() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let texts = vec!["".to_string(), "".to_string(), "".to_string()];
        let results = tokenizer.tokenize_batch(texts).await;
        assert_eq!(results.len(), 3);
        // Empty strings always produce empty token lists.
        for result in &results {
            assert!(result.is_empty());
        }
    }

    // ---------------------------------------------------------------------------
    // AsyncTokenizer — tokenize_stream
    // ---------------------------------------------------------------------------

    /// `tokenize_stream` is defined as a thin wrapper around `tokenize_batch`;
    /// its result length must equal the number of items in the iterator.
    #[tokio::test]
    async fn test_tokenize_stream_length() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let texts = vec!["안녕하세요".to_string(), "감사합니다".to_string()];
        let results = tokenizer.tokenize_stream(texts).await;
        assert_eq!(results.len(), 2);
    }

    /// `tokenize_stream` on an empty iterator must return an empty Vec.
    #[tokio::test]
    async fn test_tokenize_stream_empty() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let results = tokenizer.tokenize_stream(std::iter::empty::<String>()).await;
        assert!(results.is_empty());
    }

    // ---------------------------------------------------------------------------
    // AsyncTokenizer — tokenize_reader
    // ---------------------------------------------------------------------------

    /// Reader over an empty byte slice must return Ok with an empty token Vec.
    #[tokio::test]
    async fn test_tokenize_reader_empty() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let cursor = std::io::Cursor::new(b"" as &[u8]);
        let result = tokenizer.tokenize_reader(cursor).await;
        assert!(result.is_ok(), "tokenize_reader should succeed on empty input");
        assert!(result.unwrap().is_empty());
    }

    /// Reader over a single newline-terminated line must not panic.
    #[tokio::test]
    async fn test_tokenize_reader_single_line() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let data = "안녕하세요.\n";
        let cursor = std::io::Cursor::new(data.as_bytes());
        let result = tokenizer.tokenize_reader(cursor).await;
        assert!(result.is_ok(), "tokenize_reader should succeed");
    }

    /// Reader over multiple lines must process all lines without error.
    #[tokio::test]
    async fn test_tokenize_reader_multiple_lines() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let data = "첫 번째 줄.\n두 번째 줄.\n";
        let cursor = std::io::Cursor::new(data.as_bytes());
        let result = tokenizer.tokenize_reader(cursor).await;
        assert!(result.is_ok(), "tokenize_reader should succeed on multiple lines");
    }

    // ---------------------------------------------------------------------------
    // AsyncTokenizer — tokenize_file (error path)
    // ---------------------------------------------------------------------------

    /// Attempting to open a non-existent file must return an Err, not panic.
    #[tokio::test]
    async fn test_tokenize_file_nonexistent() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let result = tokenizer.tokenize_file("/nonexistent/path/that/does/not/exist.txt").await;
        assert!(result.is_err(), "tokenize_file on missing path must return Err");
    }

    // ---------------------------------------------------------------------------
    // AsyncTokenizer — get_tokenizer
    // ---------------------------------------------------------------------------

    /// `get_tokenizer` must return a guard that can call `tokenize` synchronously.
    #[tokio::test]
    async fn test_get_tokenizer_sync_call() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let mut guard = tokenizer.get_tokenizer().await;
        // Call the synchronous tokenizer through the guard — must not panic.
        let tokens = guard.tokenize("안녕");
        assert!(tokens.iter().all(|t| !t.surface.is_empty()));
    }

    // ---------------------------------------------------------------------------
    // AsyncStreamingTokenizer — construction
    // ---------------------------------------------------------------------------

    /// Default sentence delimiters must include '.' '\n' '?' '!'.
    #[tokio::test]
    async fn test_async_streaming_tokenizer_default_delimiters() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let stream = AsyncStreamingTokenizer::new(tokenizer);
        assert!(stream.sentence_delimiters.contains(&'.'));
        assert!(stream.sentence_delimiters.contains(&'\n'));
        assert!(stream.sentence_delimiters.contains(&'?'));
        assert!(stream.sentence_delimiters.contains(&'!'));
    }

    /// New stream must start with an empty buffer.
    #[tokio::test]
    async fn test_async_streaming_tokenizer_initial_buffer_empty() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let stream = AsyncStreamingTokenizer::new(tokenizer);
        assert!(stream.buffer.is_empty(), "buffer must be empty on construction");
    }

    // ---------------------------------------------------------------------------
    // AsyncStreamingTokenizer — process_chunk / flush
    // ---------------------------------------------------------------------------

    /// `flush` on an empty buffer must return an empty Vec.
    #[tokio::test]
    async fn test_async_streaming_flush_empty_buffer() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let mut stream = AsyncStreamingTokenizer::new(tokenizer);
        let tokens = stream.flush().await;
        assert!(tokens.is_empty(), "flush on empty buffer must produce no tokens");
        assert!(stream.buffer.is_empty(), "buffer must remain empty after flushing empty buffer");
    }

    /// After `flush`, the buffer must be empty regardless of prior state.
    #[tokio::test]
    async fn test_async_streaming_flush_clears_buffer() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let mut stream = AsyncStreamingTokenizer::new(tokenizer);

        // Push text without a sentence delimiter so it stays in the buffer.
        let tokens = stream.process_chunk("버퍼에 남을 텍스트").await;
        assert!(tokens.iter().all(|t| !t.surface.is_empty()));
        assert!(!stream.buffer.is_empty(), "buffer should hold unprocessed text");

        let flushed = stream.flush().await;
        assert!(flushed.iter().all(|t| !t.surface.is_empty()));
        assert!(stream.buffer.is_empty(), "flush must clear the buffer");
    }

    /// Text with a newline delimiter — process_chunk triggers tokenisation and
    /// leaves whatever follows the delimiter in the buffer.
    #[tokio::test]
    async fn test_async_streaming_chunk_with_newline_delimiter() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let mut stream = AsyncStreamingTokenizer::new(tokenizer);

        // The '\n' triggers sentence boundary detection.  Regardless of the
        // token count (which depends on the dictionary), the call must not panic
        // and the buffer must not still contain the '\n'-terminated prefix.
        let tokens = stream.process_chunk("안녕하세요.\n").await;
        let remaining = stream.flush().await;
        // Total token count may be zero with mini-dict, but the pipeline must complete.
        let total = tokens.len() + remaining.len();
        assert!(
            tokens.iter().chain(remaining.iter()).all(|t| !t.surface.is_empty()),
            "all tokens must have non-empty surface (total: {total})"
        );
    }

    /// Text with no delimiter must be buffered, not tokenised immediately.
    #[tokio::test]
    async fn test_async_streaming_chunk_without_delimiter_stays_buffered() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let mut stream = AsyncStreamingTokenizer::new(tokenizer);

        let tokens = stream.process_chunk("구분자없음").await;
        // No delimiter ⇒ no output from process_chunk.
        assert!(tokens.is_empty(), "text without delimiter must not produce tokens immediately");
        // The text must have been buffered.
        assert!(!stream.buffer.is_empty(), "text without delimiter must be held in the buffer");
    }

    /// process_reader on empty bytes — must return Ok(empty).
    #[tokio::test]
    async fn test_async_streaming_process_reader_empty() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let mut stream = AsyncStreamingTokenizer::new(tokenizer);

        let cursor = std::io::Cursor::new(b"" as &[u8]);
        let result = stream.process_reader(cursor).await;
        assert!(result.is_ok(), "process_reader on empty input must succeed");
        assert!(result.unwrap().is_empty());
    }

    /// process_reader on multi-line input — must succeed and flush everything.
    #[tokio::test]
    async fn test_async_streaming_process_reader_multiline() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let mut stream = AsyncStreamingTokenizer::new(tokenizer);

        let data = "첫째 줄.\n둘째 줄.\n";
        let cursor = std::io::Cursor::new(data.as_bytes());
        let result = stream.process_reader(cursor).await;
        assert!(result.is_ok(), "process_reader must succeed on multi-line input");
        // After process_reader the buffer should be empty (flush was called internally).
        assert!(stream.buffer.is_empty(), "process_reader must flush the buffer at the end");
    }

    // ---------------------------------------------------------------------------
    // AsyncStreamingTokenizer — find_last_sentence_boundary (via process_chunk)
    // ---------------------------------------------------------------------------

    /// Multiple delimiter characters ('.' '!' '?') — the last one is used as the
    /// split point.  Each chunk must be processed without panic.
    #[tokio::test]
    async fn test_async_streaming_multiple_delimiters_in_chunk() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let mut stream = AsyncStreamingTokenizer::new(tokenizer);

        // Both '.' and '?' are delimiters; the last one ('?') should be the split.
        let tokens = stream.process_chunk("안녕하세요. 괜찮으세요?").await;
        assert!(tokens.iter().all(|t| !t.surface.is_empty()));
        let flushed = stream.flush().await;
        assert!(flushed.iter().all(|t| !t.surface.is_empty()));
    }

    /// Japanese full-stop '。' is a multi-byte delimiter — must not panic.
    #[tokio::test]
    async fn test_async_streaming_multibyte_delimiter_no_panic() {
        let tokenizer = AsyncTokenizer::new().await.expect("should create");
        let mut stream = AsyncStreamingTokenizer::new(tokenizer);

        // '。' is U+3002, encoded as 3 bytes in UTF-8.
        let tokens = stream.process_chunk("テスト。次の文。\n").await;
        assert!(tokens.iter().all(|t| !t.surface.is_empty()));
        let flushed = stream.flush().await;
        assert!(flushed.iter().all(|t| !t.surface.is_empty()));
    }
}