Skip to main content

llm_tokenizer/
stop.rs

1use std::{collections::HashSet, sync::Arc};
2
3use aho_corasick::AhoCorasick;
4use anyhow::Result;
5
6use crate::{
7    sequence::Sequence,
8    traits::{self, TokenIdType},
9};
10
/// Output produced by the stop-sequence decoder for a single processed token
/// (or for an explicit flush).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SequenceDecoderOutput {
    /// Newly decoded text that is safe to emit.
    Text(String),
    /// Nothing to emit for this step: either the decoded text is being held
    /// back because it could be the beginning of a stop sequence, or the token
    /// produced no new text.
    Held,
    /// Decoding has stopped and there is no text to emit. Also returned for
    /// every token processed after a stop has been reached.
    Stopped,
    /// Decoding has stopped and the contained text should still be emitted.
    /// For a visible stop this includes the stop token/sequence itself; for a
    /// hidden stop it is only the text that preceded the stop.
    StoppedWithText(String),
}
23
/// Configuration for stop sequences.
///
/// "Hidden" stops (`stop_tokens`, `stop_sequences`) end decoding without the
/// stop itself appearing in the output; "visible" stops end decoding and are
/// included in the output. The default value has no stops configured.
#[derive(Debug, Clone, Default)]
pub struct StopSequenceConfig {
    /// Token IDs that trigger a stop (the token is not included in output)
    pub stop_tokens: HashSet<TokenIdType>,
    /// String sequences that trigger a stop (the sequence is not included in output).
    /// Empty strings are skipped when the decoder builds its matcher.
    pub stop_sequences: Vec<String>,
    /// Token IDs for visible stops (included in output)
    pub visible_stop_tokens: HashSet<TokenIdType>,
    /// String sequences for visible stops (included in output).
    /// Empty strings are skipped when the decoder builds its matcher.
    pub visible_stop_sequences: Vec<String>,
}
36
37impl StopSequenceConfig {
38    /// Builder pattern - add a stop token
39    pub fn with_stop_token(mut self, token_id: TokenIdType) -> Self {
40        self.stop_tokens.insert(token_id);
41        self
42    }
43
44    /// Builder pattern - add a stop sequence
45    pub fn with_stop_sequence(mut self, sequence: impl Into<String>) -> Self {
46        self.stop_sequences.push(sequence.into());
47        self
48    }
49
50    /// Builder pattern - add a visible stop token
51    pub fn with_visible_stop_token(mut self, token_id: TokenIdType) -> Self {
52        self.visible_stop_tokens.insert(token_id);
53        self
54    }
55
56    /// Builder pattern - add a visible stop sequence
57    pub fn with_visible_stop_sequence(mut self, sequence: impl Into<String>) -> Self {
58        self.visible_stop_sequences.push(sequence.into());
59        self
60    }
61}
62
/// Decoder that handles stop sequences.
///
/// Tokens are decoded incrementally; decoded text passes through a "jail"
/// buffer so that text which might be the start of a stop sequence is held
/// back until it either completes a stop sequence or is proven harmless.
pub struct StopSequenceDecoder {
    /// Sequence for incremental decoding (replaces token_buffer + offsets)
    sequence: Sequence,
    /// Stop tokens and stop sequences (hidden and visible) this decoder reacts to
    config: StopSequenceConfig,
    /// Aho-Corasick automaton for O(N) stop sequence matching.
    /// `None` when no non-empty stop sequence is configured.
    aho_corasick: Option<AhoCorasick>,
    /// Index boundary: patterns [0..visible_boundary_idx) are hidden,
    /// patterns [visible_boundary_idx..) are visible
    visible_boundary_idx: usize,
    /// Buffer for partial matches (the "jail")
    jail_buffer: String,
    /// Whether we've stopped
    stopped: bool,
}
78
79impl StopSequenceDecoder {
80    /// Create a new stop sequence decoder
81    pub fn new(
82        tokenizer: Arc<dyn traits::Tokenizer>,
83        config: StopSequenceConfig,
84        skip_special_tokens: bool,
85    ) -> Self {
86        // Build Aho-Corasick automaton from all stop sequences
87        // Hidden sequences come first, then visible sequences
88        let mut patterns: Vec<&str> = config
89            .stop_sequences
90            .iter()
91            .filter(|s| !s.is_empty())
92            .map(|s| s.as_str())
93            .collect();
94        let visible_boundary_idx = patterns.len();
95        patterns.extend(
96            config
97                .visible_stop_sequences
98                .iter()
99                .filter(|s| !s.is_empty())
100                .map(|s| s.as_str()),
101        );
102
103        let aho_corasick = if patterns.is_empty() {
104            None
105        } else {
106            // AhoCorasick::new is infallible for non-empty, pre-filtered string patterns.
107            // Failure would indicate a bug in the Aho-Corasick library itself.
108            #[expect(
109                clippy::expect_used,
110                reason = "AhoCorasick::new with pre-filtered non-empty &str patterns is practically infallible"
111            )]
112            Some(AhoCorasick::new(patterns).expect("Failed to build Aho-Corasick automaton"))
113        };
114
115        StopSequenceDecoder {
116            sequence: Sequence::new_with_options(tokenizer, skip_special_tokens),
117            config,
118            aho_corasick,
119            visible_boundary_idx,
120            jail_buffer: String::new(),
121            stopped: false,
122        }
123    }
124
125    /// Process a single token
126    pub fn process_token(&mut self, token_id: TokenIdType) -> Result<SequenceDecoderOutput> {
127        if self.stopped {
128            return Ok(SequenceDecoderOutput::Stopped);
129        }
130
131        // Check for token-level stops first
132        if self.config.stop_tokens.contains(&token_id) {
133            self.stopped = true;
134
135            // Flush any jailed text before stopping - use mem::take to avoid clone
136            if !self.jail_buffer.is_empty() {
137                return Ok(SequenceDecoderOutput::StoppedWithText(std::mem::take(
138                    &mut self.jail_buffer,
139                )));
140            }
141            return Ok(SequenceDecoderOutput::Stopped);
142        }
143
144        if self.config.visible_stop_tokens.contains(&token_id) {
145            self.stopped = true;
146
147            // Include jailed text plus the stop token
148            let stop_text = self
149                .sequence
150                .tokenizer()
151                .decode(&[token_id], self.sequence.skip_special_tokens())?;
152            let output = format!("{}{}", self.jail_buffer, stop_text);
153            self.jail_buffer.clear();
154            return Ok(SequenceDecoderOutput::StoppedWithText(output));
155        }
156
157        // Use Sequence for incremental decoding
158        let new_text = self.sequence.append_token(token_id)?;
159
160        self.jail_buffer.push_str(&new_text);
161
162        // Check for stop sequences using Aho-Corasick (O(N) single-pass)
163        if let Some(ac) = &self.aho_corasick {
164            if let Some(mat) = ac.find(&self.jail_buffer) {
165                self.stopped = true;
166                let is_visible = mat.pattern().as_usize() >= self.visible_boundary_idx;
167
168                if is_visible {
169                    // Visible stop sequence: include it in output
170                    let output = self.jail_buffer[..mat.end()].to_string();
171                    self.jail_buffer.clear();
172                    return Ok(SequenceDecoderOutput::StoppedWithText(output));
173                } else {
174                    // Hidden stop sequence: exclude it from output
175                    let output = self.jail_buffer[..mat.start()].to_string();
176                    self.jail_buffer.clear();
177                    return Ok(if output.is_empty() {
178                        SequenceDecoderOutput::Stopped
179                    } else {
180                        SequenceDecoderOutput::StoppedWithText(output)
181                    });
182                }
183            }
184        }
185
186        // Check for partial matches: is the end of jail_buffer the start of any stop_seq?
187        // This handles stop sequences split across tokens
188        let buffer_len = self.jail_buffer.len();
189        let mut best_split_pos: Option<usize> = None;
190
191        for stop_seq in self
192            .config
193            .stop_sequences
194            .iter()
195            .chain(&self.config.visible_stop_sequences)
196        {
197            let stop_len = stop_seq.len();
198
199            if stop_len <= 1 || buffer_len == 0 {
200                continue;
201            }
202
203            let max_len = buffer_len.min(stop_len - 1);
204
205            for len in (1..=max_len).rev() {
206                let suffix_start = buffer_len - len;
207
208                if !self.jail_buffer.is_char_boundary(suffix_start) {
209                    continue;
210                }
211
212                let suffix = &self.jail_buffer[suffix_start..];
213
214                if stop_seq.starts_with(suffix)
215                    && best_split_pos.is_none_or(|current| suffix_start < current)
216                {
217                    best_split_pos = Some(suffix_start);
218                    break;
219                }
220            }
221        }
222
223        if let Some(split_pos) = best_split_pos {
224            // Hold the partial match, flush the rest
225            // Use split_off for zero-copy: keeps [0..split_pos] in place, returns [split_pos..]
226            // Then swap so we output the prefix and keep the suffix
227            let suffix = self.jail_buffer.split_off(split_pos);
228            let to_output = std::mem::replace(&mut self.jail_buffer, suffix);
229
230            if to_output.is_empty() {
231                Ok(SequenceDecoderOutput::Held)
232            } else {
233                Ok(SequenceDecoderOutput::Text(to_output))
234            }
235        } else {
236            // No partial matches - flush everything
237            let output = std::mem::take(&mut self.jail_buffer);
238            if output.is_empty() {
239                Ok(SequenceDecoderOutput::Held)
240            } else {
241                Ok(SequenceDecoderOutput::Text(output))
242            }
243        }
244    }
245
246    /// Process multiple tokens
247    pub fn process_tokens(
248        &mut self,
249        token_ids: &[TokenIdType],
250    ) -> Result<Vec<SequenceDecoderOutput>> {
251        // Pre-allocate with exact capacity to avoid reallocations
252        let mut outputs = Vec::with_capacity(token_ids.len());
253        for &token_id in token_ids {
254            outputs.push(self.process_token(token_id)?);
255        }
256        Ok(outputs)
257    }
258
259    /// Flush any held text
260    pub fn flush(&mut self) -> SequenceDecoderOutput {
261        if self.jail_buffer.is_empty() {
262            SequenceDecoderOutput::Text(String::new())
263        } else {
264            // Use mem::take to avoid clone - transfers ownership and leaves empty string
265            SequenceDecoderOutput::Text(std::mem::take(&mut self.jail_buffer))
266        }
267    }
268
    /// Check if decoding has stopped.
    ///
    /// Becomes `true` once a stop token or stop sequence has been processed
    /// and stays `true` until `reset` is called.
    pub fn is_stopped(&self) -> bool {
        self.stopped
    }
273
274    /// Reset the decoder state
275    pub fn reset(&mut self) {
276        self.jail_buffer.clear();
277        self.sequence.clear();
278        self.stopped = false;
279    }
280}
281
/// Builder for StopSequenceDecoder.
///
/// Collects a tokenizer, stop configuration, and the `skip_special_tokens`
/// flag, then hands them to `StopSequenceDecoder::new` in `build`.
pub struct StopSequenceDecoderBuilder {
    /// Tokenizer passed through to the decoder
    tokenizer: Arc<dyn traits::Tokenizer>,
    /// Stop tokens / sequences accumulated so far
    config: StopSequenceConfig,
    /// Forwarded to the decoder; the builder starts with `true`
    skip_special_tokens: bool,
}
288
289impl StopSequenceDecoderBuilder {
290    pub fn new(tokenizer: Arc<dyn traits::Tokenizer>) -> Self {
291        StopSequenceDecoderBuilder {
292            tokenizer,
293            config: StopSequenceConfig::default(),
294            skip_special_tokens: true,
295        }
296    }
297
298    pub fn stop_token(mut self, token_id: TokenIdType) -> Self {
299        self.config.stop_tokens.insert(token_id);
300        self
301    }
302
303    pub fn stop_sequence(mut self, sequence: impl Into<String>) -> Self {
304        self.config.stop_sequences.push(sequence.into());
305        self
306    }
307
308    pub fn visible_stop_token(mut self, token_id: TokenIdType) -> Self {
309        self.config.visible_stop_tokens.insert(token_id);
310        self
311    }
312
313    pub fn visible_stop_sequence(mut self, sequence: impl Into<String>) -> Self {
314        self.config.visible_stop_sequences.push(sequence.into());
315        self
316    }
317
318    pub fn skip_special_tokens(mut self, skip: bool) -> Self {
319        self.skip_special_tokens = skip;
320        self
321    }
322
323    pub fn build(self) -> StopSequenceDecoder {
324        StopSequenceDecoder::new(self.tokenizer, self.config, self.skip_special_tokens)
325    }
326}
327
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use super::StopSequenceDecoderBuilder;
    use crate::{
        mock::MockTokenizer, traits::TokenIdType, SequenceDecoderOutput, StopSequenceConfig,
        StopSequenceDecoder,
    };

    /// Builds a decoder over a fresh `MockTokenizer` with
    /// `skip_special_tokens` disabled — the setup shared by most tests here.
    fn decoder_with(config: StopSequenceConfig) -> StopSequenceDecoder {
        StopSequenceDecoder::new(Arc::new(MockTokenizer::new()), config, false)
    }

    /// Configures `stop_sequence` as a hidden stop sequence, feeds `token_ids`
    /// one at a time, and asserts that every call returns `Ok`.
    ///
    /// Used by the UTF-8 tests: slicing the jail buffer inside a multi-byte
    /// character would panic, so completing without a panic is the real check.
    fn assert_tokens_process_ok(stop_sequence: &str, token_ids: &[TokenIdType]) {
        let mut decoder =
            decoder_with(StopSequenceConfig::default().with_stop_sequence(stop_sequence));

        for &token_id in token_ids {
            let result = decoder.process_token(token_id);
            assert!(
                result.is_ok(),
                "Failed on stop sequence {stop_sequence:?} with token {token_id}"
            );
        }
    }

    #[test]
    fn test_stop_token_detection() {
        let config = StopSequenceConfig::default().with_stop_token(999); // <eos> token
        let mut decoder = decoder_with(config);

        // Process tokens before stop
        let result = decoder.process_token(1).unwrap(); // "Hello"
        assert!(matches!(result, SequenceDecoderOutput::Text(_)));

        // Process stop token
        let result = decoder.process_token(999).unwrap(); // <eos>
        assert_eq!(result, SequenceDecoderOutput::Stopped);

        // Further tokens should also return Stopped
        let result = decoder.process_token(2).unwrap();
        assert_eq!(result, SequenceDecoderOutput::Stopped);
    }

    #[test]
    fn test_visible_stop_token() {
        let config = StopSequenceConfig::default().with_visible_stop_token(999);
        let mut decoder = decoder_with(config);

        let result = decoder.process_token(999).unwrap();
        assert!(matches!(result, SequenceDecoderOutput::StoppedWithText(_)));
    }

    #[test]
    fn test_builder_pattern() {
        let tokenizer = Arc::new(MockTokenizer::new());

        let decoder = StopSequenceDecoderBuilder::new(tokenizer)
            .stop_token(999)
            .stop_sequence("STOP")
            .visible_stop_token(1000)
            .skip_special_tokens(true)
            .build();

        assert!(!decoder.is_stopped());
    }

    #[test]
    fn test_incremental_decoding_no_repetition() {
        // This test verifies the critical fix: no repeated output
        let mut decoder = decoder_with(StopSequenceConfig::default());

        // Process tokens 1 ("Hello"), 2 ("world") and 3 ("test") one by one,
        // keeping only the text that was actually emitted for each.
        let outputs: Vec<String> = [1, 2, 3]
            .into_iter()
            .filter_map(|token_id| match decoder.process_token(token_id).unwrap() {
                SequenceDecoderOutput::Text(text) => Some(text),
                _ => None,
            })
            .collect();

        // CRITICAL: Each output should be unique (no accumulation)
        // The fix ensures we only output NEW text, not accumulated text
        assert_eq!(outputs.len(), 3);

        for (i, earlier) in outputs.iter().enumerate() {
            for later in &outputs[i + 1..] {
                // No later output should contain an earlier one (no accumulation)
                assert!(!later.contains(earlier.as_str()));
            }
        }
    }

    #[test]
    fn test_stop_sequence_detection() {
        let config = StopSequenceConfig::default().with_stop_sequence("test");
        let mut decoder = decoder_with(config);

        // Process "Hello world"
        decoder.process_token(1).unwrap(); // "Hello"
        decoder.process_token(2).unwrap(); // "world"

        // Process "test" which should trigger stop
        let result = decoder.process_token(3).unwrap(); // "test"

        // Should stop when we hit "test"
        assert!(matches!(
            result,
            SequenceDecoderOutput::Stopped | SequenceDecoderOutput::StoppedWithText(_)
        ));
    }

    #[test]
    fn test_flush_after_partial() {
        let config = StopSequenceConfig::default().with_stop_sequence("NEVER_MATCH");
        let mut decoder = decoder_with(config);

        // Process a token
        decoder.process_token(1).unwrap(); // "Hello"

        // Flush should return any remaining text in jail
        let result = decoder.flush();

        // After processing, flush should work
        assert!(matches!(result, SequenceDecoderOutput::Text(_)));
    }

    #[test]
    fn test_reset_functionality() {
        let config = StopSequenceConfig::default().with_stop_token(999);
        let mut decoder = decoder_with(config);

        // Process and stop
        decoder.process_token(1).unwrap();
        decoder.process_token(999).unwrap();
        assert!(decoder.is_stopped());

        // Reset should clear everything
        decoder.reset();
        assert!(!decoder.is_stopped());

        // Should be able to process again
        let result = decoder.process_token(2).unwrap();
        assert!(matches!(result, SequenceDecoderOutput::Text(_)));
    }

    #[test]
    fn test_visible_stop_sequence() {
        let config = StopSequenceConfig::default().with_visible_stop_sequence("world");
        let mut decoder = decoder_with(config);

        // Process "Hello"
        decoder.process_token(1).unwrap();

        // Process "world" - should include it in output
        let result = decoder.process_token(2).unwrap();

        if let SequenceDecoderOutput::StoppedWithText(text) = result {
            // Should include "world" in the output
            assert!(text.contains("world"));
        } else {
            panic!("Expected StoppedWithText with visible stop sequence");
        }
    }

    #[test]
    fn test_multiple_tokens_processing() {
        let mut decoder = decoder_with(StopSequenceConfig::default());

        // Process multiple tokens at once
        let results = decoder.process_tokens(&[1, 2, 3]).unwrap();

        // Should get results for each token
        assert_eq!(results.len(), 3);

        // Each result should be Text or Held (no stops configured)
        for result in results {
            assert!(matches!(
                result,
                SequenceDecoderOutput::Text(_) | SequenceDecoderOutput::Held
            ));
        }
    }

    /// Test that the jail buffer correctly handles a stop sequence that arrives
    /// across 2+ tokens.  The MockTokenizer decodes token 1 as "Hello" and
    /// token 2's incremental contribution as " world", so the jail buffer
    /// progressively becomes "Hello" then "Hello world".
    ///
    /// With stop sequence "Hello world":
    ///   - Token 1: jail = "Hello" — partial prefix match → Held (or Text of
    ///     the portion before the potential match, which is empty here)
    ///   - Token 2: jail = "Hello world" — full match → Stopped
    #[test]
    fn test_stop_sequence_spanning_multiple_tokens() {
        // "Hello world" spans token 1 ("Hello") and token 2 (" world")
        let config = StopSequenceConfig::default().with_stop_sequence("Hello world");
        let mut decoder = decoder_with(config);

        // Token 1 ("Hello"): The jail buffer now contains "Hello".
        // "Hello" is a prefix of stop sequence "Hello world", so the text
        // must be held — we should NOT see it emitted as Text yet.
        let result1 = decoder.process_token(1).unwrap();
        assert!(
            matches!(result1, SequenceDecoderOutput::Held),
            "Expected Held while jail buffer is a prefix of the stop sequence, got {result1:?}"
        );
        assert!(
            !decoder.is_stopped(),
            "Decoder should not be stopped after a partial match"
        );

        // Token 2 (" world"): The jail buffer now contains "Hello world",
        // which fully matches the stop sequence. The decoder should stop.
        let result2 = decoder.process_token(2).unwrap();
        assert_eq!(
            result2,
            SequenceDecoderOutput::Stopped,
            "Expected Stopped when jail buffer matches the hidden stop sequence"
        );
        assert!(
            decoder.is_stopped(),
            "Decoder should be stopped after the full stop sequence match"
        );

        // Any further tokens should also return Stopped
        let result3 = decoder.process_token(3).unwrap();
        assert_eq!(result3, SequenceDecoderOutput::Stopped);
    }

    /// Same as above but with a *visible* stop sequence.  When the stop
    /// sequence "Hello world" is visible, the matched text should be included
    /// in the output via StoppedWithText.
    #[test]
    fn test_visible_stop_sequence_spanning_multiple_tokens() {
        let config = StopSequenceConfig::default().with_visible_stop_sequence("Hello world");
        let mut decoder = decoder_with(config);

        // Token 1 ("Hello"): partial match, should be held
        let result1 = decoder.process_token(1).unwrap();
        assert!(
            matches!(result1, SequenceDecoderOutput::Held),
            "Expected Held for partial visible stop sequence match, got {result1:?}"
        );

        // Token 2 (" world"): completes "Hello world" — visible stop
        let result2 = decoder.process_token(2).unwrap();
        match &result2 {
            SequenceDecoderOutput::StoppedWithText(text) => {
                assert!(
                    text.contains("Hello world"),
                    "Visible stop output should contain the full stop sequence, got: {text:?}"
                );
            }
            other => panic!("Expected StoppedWithText for visible stop sequence, got {other:?}"),
        }
        assert!(decoder.is_stopped());
    }

    /// Test a three-token stream in which the stop sequence spans the last two
    /// tokens, with preceding text that should be emitted before the jailed
    /// portion.
    ///
    /// Tokens: 3 ("test"), 1 ("Hello"), 2 ("world")
    /// Stop sequence: "Hello world"
    ///
    /// - Token 3: produces "test" — no overlap with "Hello world" → Text("test")
    /// - Token 1: produces " Hello" — "Hello" is a prefix of "Hello world"
    ///   so " " is flushed as Text and "Hello" is held
    /// - Token 2: produces " world" — jail now "Hello world" → Stopped
    #[test]
    fn test_stop_sequence_spanning_tokens_with_preceding_text() {
        let config = StopSequenceConfig::default().with_stop_sequence("Hello world");
        let mut decoder = decoder_with(config);

        // Token 3 ("test"): no overlap with "Hello world" at all
        let result1 = decoder.process_token(3).unwrap();
        assert!(
            matches!(result1, SequenceDecoderOutput::Text(_)),
            "Expected Text for token with no stop sequence overlap, got {result1:?}"
        );

        // Token 1 ("Hello"): the incremental text is " Hello" (because mock
        // tokenizer joins with spaces). The tail "Hello" is a prefix of the
        // stop sequence, so the decoder should split: emit the non-matching
        // prefix as Text (the space " ") and hold "Hello".
        let result2 = decoder.process_token(1).unwrap();
        match &result2 {
            SequenceDecoderOutput::Text(text) => {
                // The emitted text should be the portion before the partial match
                assert!(
                    !text.contains("Hello"),
                    "Partially-matched 'Hello' should be jailed, not emitted. Got: {text:?}"
                );
            }
            SequenceDecoderOutput::Held => {
                // Also acceptable if the entire chunk is held (implementation detail)
            }
            other => panic!("Expected Text (prefix before partial match) or Held, got {other:?}"),
        }

        // Token 2 ("world"): completes the stop sequence
        let result3 = decoder.process_token(2).unwrap();
        assert!(
            matches!(
                result3,
                SequenceDecoderOutput::Stopped | SequenceDecoderOutput::StoppedWithText(_)
            ),
            "Expected Stopped or StoppedWithText when stop sequence completes, got {result3:?}"
        );
        assert!(decoder.is_stopped());
    }

    #[test]
    fn test_utf8_multibyte_character_boundaries() {
        // This test verifies the fix for the UTF-8 boundary panic
        // The panic occurred when trying to slice jail_buffer at a byte index
        // that was in the middle of a multi-byte UTF-8 character (e.g., '×')
        //
        // The stop sequence is " ×" (space + multiplication sign); '×' is UTF-8
        // encoded as bytes [0xC3, 0x97] (2 bytes). When checking for partial
        // matches, we must not slice in the middle of these bytes.
        assert_tokens_process_ok(" ×", &[1, 2]);
    }

    #[test]
    fn test_utf8_multibyte_delta_character() {
        // Test for: byte index 1 is not a char boundary; it is inside 'Δ' (bytes 0..2) of `Δ`
        // 'Δ' (U+0394 GREEK CAPITAL LETTER DELTA) is encoded as [0xCE, 0x94] (2 bytes)
        assert_tokens_process_ok("Δ", &[1, 2]);
    }

    #[test]
    fn test_utf8_multibyte_degree_character() {
        // Test for: byte index 1 is not a char boundary; it is inside '°' (bytes 0..2) of `°`
        // '°' (U+00B0 DEGREE SIGN) is encoded as [0xC2, 0xB0] (2 bytes)
        assert_tokens_process_ok("°", &[1, 2]);
    }

    #[test]
    fn test_utf8_multibyte_triangle_character() {
        // Test for: byte index 4 is not a char boundary; it is inside '∆' (bytes 2..5) of ` (∆`
        // '∆' (U+2206 INCREMENT) is encoded as [0xE2, 0x88, 0x86] (3 bytes)
        assert_tokens_process_ok(" (∆", &[1, 2, 3]);
    }

    #[test]
    fn test_utf8_multibyte_en_dash_character() {
        // Test for: byte index 3 is not a char boundary; it is inside '–' (bytes 1..4) of ` –`
        // '–' (U+2013 EN DASH) is encoded as [0xE2, 0x80, 0x93] (3 bytes)
        assert_tokens_process_ok(" –", &[1, 2, 3]);
    }

    #[test]
    fn test_utf8_multibyte_various_characters() {
        // Comprehensive test with multiple multi-byte UTF-8 characters
        // Tests 2-byte, 3-byte, and 4-byte UTF-8 sequences
        let test_cases = [
            ("×", "multiplication sign - 2 bytes"),
            ("Δ", "Greek Delta - 2 bytes"),
            ("°", "degree sign - 2 bytes"),
            ("∆", "increment - 3 bytes"),
            ("–", "en dash - 3 bytes"),
            ("€", "euro sign - 3 bytes"),
            ("中", "Chinese character - 3 bytes"),
            ("🚀", "rocket emoji - 4 bytes"),
            ("💡", "lightbulb emoji - 4 bytes"),
        ];

        for (stop_char, description) in test_cases {
            let mut decoder =
                decoder_with(StopSequenceConfig::default().with_stop_sequence(stop_char));

            // Process multiple tokens - should not panic
            for token_id in 1..=5 {
                let result = decoder.process_token(token_id);
                assert!(
                    result.is_ok(),
                    "Failed on {description} with token {token_id}"
                );
            }
        }
    }
}