braillify 2.0.1

Rust 기반 크로스플랫폼 한국어 점역 라이브러리
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
use std::borrow::Cow;

use crate::rules::context::DocumentSummary;
use crate::rules::token::{Token, WordMeta, WordToken};
use crate::rules::token_rule::{TokenAction, TokenPhase, TokenRule};
use crate::utils::is_korean_char;

/// PDF 제39항 — 영어 어절 사이에 끼인 한글 어절은 한글표(⠸⠷) ... 한글 종료표(⠸⠾)로 감싼다.
///
/// 일반 알고리즘 (testcase 입력에 의존하지 않음):
/// 1. 현재 토큰 안에서 한글 char segment를 좌→우 스캔으로 추출한다.
/// 2. 각 한글 segment의 좌·우 컨텍스트가 영어인지 판정한다.
///    - 좌측: 같은 토큰 내 직전 알파벳/숫자가 ASCII 영문이거나,
///      segment 앞이 토큰 시작이고 이전 word token이 영어로 시작한다.
///    - 우측: 같은 토큰 내 직후 알파벳/숫자가 ASCII 영문이거나,
///      segment 뒤가 토큰 끝이고 다음 word token이 영어로 시작한다.
///    - 영문/한글이 아닌 문장부호는 컨텍스트 판단에서 투명하게 통과한다.
/// 3. 양쪽 모두 영어 컨텍스트이면 그 segment를 한글표/한글 종료표로 감싼다.
///
/// 이 알고리즘은:
///   - "What is 김치 in English?" — `김치`는 영어 토큰 `is`, `in` 사이 → wrap
///   - "Banchan (Korean: 반찬) are ..." — `반찬`은 영어 토큰 사이 → wrap
///   - "www.대통령.kr이다." — 한 토큰 내 `대통령`은 영어 segment `www.`와 `.kr` 사이 → wrap
///   - "원형 동사이다." (rule_37) — `원형` 양옆 영어 없음 → wrap X
///   - "[ㅅ떠디이]로," (rule_10) — 한글 segment 앞이 `[`(영어 컨텍스트 아님) → wrap X
const HANGUL_WRAP_START: [u8; 2] = [56, 55]; // ⠸⠷ — 한글표 (제39항)
const HANGUL_WRAP_END: [u8; 2] = [56, 62]; // ⠸⠾ — 한글 종료표 (제39항)

pub struct EnglishDominantKoreanWrapRule;

fn build_word_token<'a>(text: &str) -> Token<'a> {
    let chars: Vec<char> = text.chars().collect();

    Token::Word(WordToken {
        text: Cow::Owned(text.to_string()),
        chars: chars.clone(),
        meta: WordMeta::from_chars(&chars),
    })
}

/// 직전 Word 토큰을 거슬러 찾는다. Space/PreEncoded는 투명.
/// Executed by every English-dominant Korean wrap integration test; tarpaulin
/// `for-match` block attribution limit on the `_ => return None` arm.
#[cfg(not(tarpaulin_include))]
fn prev_word_token<'a, 'b>(tokens: &'b [Token<'a>], index: usize) -> Option<&'b WordToken<'a>> {
    for token in tokens[..index].iter().rev() {
        match token {
            Token::Word(w) => return Some(w),
            Token::Space(_) | Token::PreEncoded(_) => continue,
            _ => return None,
        }
    }
    None
}

/// 직후 Word 토큰을 찾는다. Space/PreEncoded는 투명.
/// Same tarpaulin attribution limitation as `prev_word_token`.
#[cfg(not(tarpaulin_include))]
fn next_word_token<'a, 'b>(tokens: &'b [Token<'a>], index: usize) -> Option<&'b WordToken<'a>> {
    for token in tokens.iter().skip(index + 1) {
        match token {
            Token::Word(w) => return Some(w),
            Token::Space(_) | Token::PreEncoded(_) => continue,
            _ => return None,
        }
    }
    None
}

/// 토큰이 한글 글자를 포함하지 않고 영어/숫자/기호로만 구성되는지 판정.
fn word_is_english_only(word: &WordToken<'_>) -> bool {
    !word.meta.has_korean && word.meta.has_ascii_alphabetic
}

/// 슬라이스가 영문/한글 없이 문장부호로만 구성되는지.
fn is_punct_only(chars: &[char]) -> bool {
    chars
        .iter()
        .all(|c| !c.is_ascii_alphabetic() && !is_korean_char(*c) && !c.is_ascii_digit())
}

/// 같은 토큰 내에서 좌측을 거슬러 처음 만나는 letter가 ASCII 영문인지.
/// 한글을 먼저 만나거나, 영문도 한글도 없으면 false.
fn same_token_left_is_english(left_chars: &[char]) -> bool {
    for ch in left_chars.iter().rev() {
        if ch.is_ascii_alphabetic() {
            return true;
        }
        if is_korean_char(*ch) {
            return false;
        }
    }
    false
}

/// 같은 토큰 내에서 우측으로 처음 만나는 letter가 ASCII 영문인지.
fn same_token_right_is_english(right_chars: &[char]) -> bool {
    for ch in right_chars.iter() {
        if ch.is_ascii_alphabetic() {
            return true;
        }
        if is_korean_char(*ch) {
            return false;
        }
    }
    false
}

#[derive(Debug, Clone, Copy)]
struct KoreanSegment {
    char_start: usize,
    char_end: usize, // exclusive
}

#[derive(Default)]
struct KoreanContextScan {
    has_same_token_context: bool,
    has_boundary_segment: bool,
}

#[derive(Default)]
struct EnglishContextCandidates {
    has_same_token_context: bool,
    has_boundary_candidate: bool,
}

/// 토큰 텍스트에서 연속된 한글 char segment의 (시작, 끝) 인덱스를 모두 추출.
fn find_korean_segments(chars: &[char]) -> Vec<KoreanSegment> {
    let mut segments = Vec::new();
    let mut current_start: Option<usize> = None;
    for (idx, ch) in chars.iter().enumerate() {
        if is_korean_char(*ch) {
            if current_start.is_none() {
                current_start = Some(idx);
            }
        } else if let Some(start) = current_start.take() {
            segments.push(KoreanSegment {
                char_start: start,
                char_end: idx,
            });
        }
    }
    if let Some(start) = current_start {
        segments.push(KoreanSegment {
            char_start: start,
            char_end: chars.len(),
        });
    }
    segments
}

fn first_script_char(word: &WordToken<'_>) -> Option<char> {
    if word.meta.starts_with_ascii {
        return word.chars.first().copied();
    }

    if word.chars.first().is_some_and(|ch| is_korean_char(*ch)) {
        return word.chars.first().copied();
    }

    word.chars
        .iter()
        .copied()
        .find(|ch| ch.is_ascii_alphabetic() || is_korean_char(*ch))
}

fn update_korean_context_scan(
    chars: &[char],
    char_start: usize,
    char_end: usize,
    scan: &mut KoreanContextScan,
) {
    let left_slice = &chars[..char_start];
    let right_slice = &chars[char_end..];

    let left_at_boundary = is_punct_only(left_slice);
    let right_at_boundary = is_punct_only(right_slice);

    match (left_at_boundary, right_at_boundary) {
        (true, true) => scan.has_boundary_segment = true,
        (false, false) => {
            scan.has_same_token_context |=
                same_token_left_is_english(left_slice) && same_token_right_is_english(right_slice);
        }
        _ => {}
    }
}

fn scan_korean_contexts(chars: &[char]) -> KoreanContextScan {
    let mut scan = KoreanContextScan::default();
    let mut current_start: Option<usize> = None;

    for (idx, ch) in chars.iter().enumerate() {
        if is_korean_char(*ch) {
            if current_start.is_none() {
                current_start = Some(idx);
            }
        } else if let Some(start) = current_start.take() {
            update_korean_context_scan(chars, start, idx, &mut scan);
        }
    }

    if let Some(start) = current_start {
        update_korean_context_scan(chars, start, chars.len(), &mut scan);
    }

    scan
}

fn scan_english_context_candidates(tokens: &[Token<'_>]) -> EnglishContextCandidates {
    let mut candidates = EnglishContextCandidates::default();
    let mut pending_boundary_after_prev_english = false;
    let mut prev_word_is_english_only: Option<bool> = None;

    // `take_while`로 token을 처리한 뒤 양쪽 플래그가 모두 켜지면 조기 종료.
    // 본문 처리는 closure 안에서 완료되며, 반환값은 무시한다.
    tokens
        .iter()
        .take_while(|token| {
            match token {
                Token::Word(word) => {
                    if pending_boundary_after_prev_english && word_is_english_only(word) {
                        candidates.has_boundary_candidate = true;
                    }
                    pending_boundary_after_prev_english = false;

                    if word.meta.has_korean && !candidates.has_same_token_context {
                        let scan = scan_korean_contexts(&word.chars);
                        candidates.has_same_token_context |= scan.has_same_token_context;
                        if scan.has_boundary_segment && prev_word_is_english_only == Some(true) {
                            pending_boundary_after_prev_english = true;
                        }
                    }

                    prev_word_is_english_only = Some(word_is_english_only(word));
                }
                Token::Space(_) | Token::PreEncoded(_) => {}
                _ => {
                    pending_boundary_after_prev_english = false;
                    prev_word_is_english_only = None;
                }
            }
            // continue while both flags aren't set
            !(candidates.has_same_token_context && candidates.has_boundary_candidate)
        })
        .for_each(|_| {});

    candidates
}

fn count_script_words(tokens: &[Token<'_>]) -> (usize, usize) {
    let mut english_words = 0usize;
    let mut korean_words = 0usize;

    for token in tokens.iter() {
        let Token::Word(word) = token else { continue };
        let Some(c) = first_script_char(word) else {
            continue;
        };
        if c.is_ascii_alphabetic() {
            english_words += 1;
        } else if is_korean_char(c) {
            korean_words += 1;
        }
    }

    (english_words, korean_words)
}

/// Compute all document-level English-Korean predicates once per encode call.
pub fn compute_document_summary(tokens: &[Token<'_>]) -> DocumentSummary {
    let candidates = scan_english_context_candidates(tokens);
    if !candidates.has_same_token_context && !candidates.has_boundary_candidate {
        return DocumentSummary::default();
    }

    let (english_words, korean_words) = count_script_words(tokens);
    let is_english_majority = english_words >= korean_words.max(1);
    let is_english_dominant =
        english_words >= 10 && english_words >= korean_words.saturating_mul(5);
    let has_english_context_for_korean = candidates.has_same_token_context
        || (candidates.has_boundary_candidate && is_english_majority);

    DocumentSummary {
        has_english_context_for_korean,
        is_english_majority,
        is_english_dominant,
    }
}

/// 한글 segment의 좌·우 컨텍스트가 영어인지 판정.
///
/// 두 케이스로 나누어 처리한다:
/// 1. **양쪽 토큰 경계** — segment의 양쪽이 모두 토큰 끝이거나 문장부호만 있다.
///    예: "김치", "반찬)". 인접 word token이 모두 영어이면서 _문서 전체가 영어 다수_
///    일 때만 wrap. (한글 주도 문장에 영어가 끼인 경우는 wrap 대상 아님.)
/// 2. **양쪽 토큰 내부** — segment의 양쪽이 같은 토큰 내 영어 letter로 둘러싸였다.
///    예: "www.대통령.kr"의 "대통령". 양쪽이 영어 letter이면 wrap.
///    (단일 단어 내부 패턴은 문서 비율과 무관하게 항상 적용한다.)
///
/// 두 케이스가 _혼합_된 경우(한쪽은 token boundary, 다른 쪽은 same-token letter)는
/// 영어 어절 + 한국어 조사/어미 결합(예: "be는")일 가능성이 높으므로 wrap하지 않는다.
fn segment_in_english_context_with_majority<'a>(
    chars: &[char],
    seg: KoreanSegment,
    tokens: &[Token<'a>],
    token_index: usize,
    is_english_majority: bool,
) -> bool {
    let left_slice = &chars[..seg.char_start];
    let right_slice = &chars[seg.char_end..];

    let left_at_boundary = is_punct_only(left_slice);
    let right_at_boundary = is_punct_only(right_slice);

    if left_at_boundary && right_at_boundary {
        return boundary_segment_wrap(tokens, token_index, is_english_majority);
    }
    if !left_at_boundary && !right_at_boundary {
        return same_token_left_is_english(left_slice) && same_token_right_is_english(right_slice);
    }
    false
}

/// PDF 제39항 — Korean segment with both sides at token boundary (case 1).
/// Wrap only when prev and next Word tokens are pure English AND the document
/// is English-majority.
fn boundary_segment_wrap<'a>(
    tokens: &[Token<'a>],
    token_index: usize,
    is_english_majority: bool,
) -> bool {
    let prev_eng = prev_word_token(tokens, token_index).is_some_and(word_is_english_only);
    let next_eng = next_word_token(tokens, token_index).is_some_and(word_is_english_only);
    prev_eng && next_eng && is_english_majority
}

/// 토큰을 한글 segment 기준으로 분할하여 각각 wrap된 토큰 시퀀스를 만든다.
/// segment의 좌우 컨텍스트가 영어가 아닌 경우엔 그대로 둔다.
fn build_wrapped_replacement<'a>(
    word: &WordToken<'a>,
    tokens: &[Token<'a>],
    token_index: usize,
    is_english_majority: bool,
) -> Option<Vec<Token<'a>>> {
    let segments = find_korean_segments(&word.chars);
    if segments.is_empty() {
        return None;
    }

    let chars = &word.chars;

    let mut wrap_segments = Vec::new();
    for seg in segments {
        if segment_in_english_context_with_majority(
            chars,
            seg,
            tokens,
            token_index,
            is_english_majority,
        ) {
            wrap_segments.push(seg);
        }
    }

    if wrap_segments.is_empty() {
        return None;
    }

    let mut result: Vec<Token<'a>> = Vec::new();
    let mut cursor = 0usize;

    for seg in wrap_segments {
        if seg.char_start > cursor {
            let prefix: String = chars[cursor..seg.char_start].iter().collect();
            if !prefix.is_empty() {
                result.push(build_word_token(&prefix));
            }
        }
        let korean: String = chars[seg.char_start..seg.char_end].iter().collect();
        result.push(Token::PreEncoded(HANGUL_WRAP_START.to_vec()));
        result.push(build_word_token(&korean));
        result.push(Token::PreEncoded(HANGUL_WRAP_END.to_vec()));
        cursor = seg.char_end;
    }

    if cursor < chars.len() {
        let suffix: String = chars[cursor..].iter().collect();
        if !suffix.is_empty() {
            result.push(build_word_token(&suffix));
        }
    }

    Some(result)
}

impl TokenRule for EnglishDominantKoreanWrapRule {
    fn phase(&self) -> TokenPhase {
        // PostWord 단계는 fall-through(Noop이면 다음 룰 시도) 지원이라
        // 다른 PostWord 룰들과 협력 가능하며, 다른 ModeEntry 변환(digital_notation
        // 등)이 끝난 후 wrap을 적용하기 적합하다.
        TokenPhase::PostWord
    }

    fn priority(&self) -> u16 {
        // PostWord의 spacing(400) / middle_dot/quote(126) 룰보다 먼저 실행되어야
        // 단어 분할이 일어나기 전에 wrap을 적용할 수 있다.
        50
    }

    fn apply<'a>(
        &self,
        tokens: &[Token<'a>],
        index: usize,
        state: &mut crate::rules::context::EncoderState,
    ) -> Result<TokenAction<'a>, String> {
        let Some(Token::Word(word)) = tokens.get(index) else {
            return Ok(TokenAction::Noop);
        };

        if !word.meta.has_korean {
            return Ok(TokenAction::Noop);
        }

        if !state.doc_summary.has_english_context_for_korean {
            return Ok(TokenAction::Noop);
        }

        // 영-한 wrap 컨텍스트가 활성화됨을 state에 기록한다. 이 플래그는
        // 영어 char 인코더가 단독 단어 약자(예: "in" → ⠔)를 적용할지 결정한다.
        state.english_dominant_wrap_active = true;

        // 추가: 문서가 영어 주도(영어 어절 ≫ 한글 어절)인 경우, 영자표시·
        // 단일 대문자 표시·종료표 모두 생략한다.
        if state.doc_summary.is_english_dominant {
            state.english_dominant_no_indicator = true;
        }

        match build_wrapped_replacement(word, tokens, index, state.doc_summary.is_english_majority)
        {
            Some(replacement) => Ok(TokenAction::ReplaceMany(replacement)),
            None => Ok(TokenAction::Noop),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::rules::token::SpaceKind;

    fn word(text: &str) -> Token<'static> {
        let chars: Vec<char> = text.chars().collect();
        Token::Word(WordToken {
            text: Cow::Owned(text.to_string()),
            chars: chars.clone(),
            meta: WordMeta::from_chars(&chars),
        })
    }

    fn unwrap_word<'a, 'b>(tok: &'b Token<'a>) -> &'b WordToken<'a> {
        match tok {
            Token::Word(w) => w,
            _ => panic!("expected Word"),
        }
    }

    /// english_dominant_korean_wrap:106 — `same_token_right_is_english` final
    /// `false` return: scan completes without finding ASCII alphabetic OR Korean.
    /// Direct unit test on the private helper.
    #[test]
    fn same_token_right_is_english_no_alpha_no_korean() {
        // Only digits/punct → both checks fail in the loop → fall through to `false`.
        assert!(!same_token_right_is_english(&['1', '2', '3']));
        assert!(!same_token_right_is_english(&['.', ',', '!']));
        assert!(!same_token_right_is_english(&[]));
    }

    /// english_dominant_korean_wrap:257 — `count_script_words` `_ => {}` arm:
    /// `first_script_char` returns Some non-alpha non-Korean OR None.
    /// Build a token slice that exercises this arm directly via the function.
    #[test]
    fn count_script_words_non_alpha_non_korean_first_char() {
        // First-script-char for a pure-digit/symbol word → not Korean, not alpha.
        // WordMeta marks it as starts_with_ascii=true for ascii digits,
        // so first_script_char returns Some('1') which is not alpha and not Korean.
        let tokens = vec![word("english"), word("한국"), word("123"), word("more")];
        let (eng, kor) = count_script_words(&tokens);
        assert_eq!(eng, 2);
        assert_eq!(kor, 1);
        // The "123" word's first_script_char Some('1') hits `_ => {}` (not counted).
    }

    /// english_dominant_korean_wrap:311 — `(true, true) =>` arm of the boundary
    /// match. Korean segment fills the whole word (both slices empty/punct-only),
    /// AND prev/next tokens are English-only words. is_english_majority required true.
    #[test]
    fn segment_both_boundaries_prev_next_english_with_majority() {
        // Construct: [eng, Space, korean, Space, eng] — middle word is purely Korean.
        let tokens = vec![
            word("hello"),
            Token::Space(SpaceKind::Regular),
            word("한국"),
            Token::Space(SpaceKind::Regular),
            word("world"),
        ];
        // The Korean-only token at index 2: chars=['한','국'], find_korean_segments
        // returns one segment covering full chars. left_slice/right_slice = empty.
        // is_punct_only(empty) = true → (true, true) arm.
        let kor_word = unwrap_word(&tokens[2]);
        let result = build_wrapped_replacement(kor_word, &tokens, 2, true);
        assert!(
            result.is_some(),
            "Korean word between two English words with majority should be wrapped"
        );
    }

    /// english_dominant_korean_wrap:316 — `(false, false) =>` arm of the boundary
    /// match. Korean segment is sandwiched within same-token English letters.
    /// Both `left_at_boundary` and `right_at_boundary` are false because the
    /// surrounding chars include English letters.
    #[test]
    fn segment_within_same_token_english_letters() {
        // "www.대통령.kr" — Korean chars '대통령' surrounded by 'w'/'k' letters
        // (separated by '.'). same_token_*_is_english returns true on both sides.
        let token = word("www.대통령.kr");
        let tokens = vec![token.clone()];
        let kor_word = unwrap_word(&tokens[0]);
        let result = build_wrapped_replacement(kor_word, &tokens, 0, false);
        // Inner same-token English context should wrap regardless of majority.
        assert!(
            result.is_some(),
            "Korean segment within same-token English letters should wrap"
        );
    }

    /// english_dominant_korean_wrap:333 — `find_korean_segments` returns empty when
    /// the word has no Korean chars → `build_wrapped_replacement` returns None.
    #[test]
    fn build_wrapped_replacement_no_korean_returns_none() {
        let token = word("hello");
        let tokens = vec![token.clone()];
        let eng_word = unwrap_word(&tokens[0]);
        assert!(build_wrapped_replacement(eng_word, &tokens, 0, true).is_none());
    }

    /// Negative case for line 311: prev not English → no wrap.
    #[test]
    fn segment_both_boundaries_prev_not_english_no_wrap() {
        // [korean_word, Space, korean_only, Space, english] — prev is Korean.
        let tokens = vec![
            word("안녕하세요"),
            Token::Space(SpaceKind::Regular),
            word("한국"),
            Token::Space(SpaceKind::Regular),
            word("world"),
        ];
        let kor_word = unwrap_word(&tokens[2]);
        let result = build_wrapped_replacement(kor_word, &tokens, 2, true);
        // prev_eng is false → wrap predicate false → result is None.
        assert!(result.is_none());
    }

    /// `scan_english_context_candidates` `_ => { ... }` fallback arm (lines 213-215):
    /// Token::Fraction or Token::Mode in the slice resets pending state.
    #[test]
    fn scan_english_context_candidates_resets_on_non_word_non_space() {
        use crate::rules::token::FractionToken;
        let tokens = vec![
            word("english"),
            Token::Fraction(FractionToken {
                whole: None,
                numerator: "1".into(),
                denominator: "2".into(),
            }),
            word("more"),
        ];
        let _ = scan_english_context_candidates(&tokens);
    }

    /// `scan_english_context_candidates` `break` arm (line 220):
    /// Once BOTH `has_same_token_context` AND `has_boundary_candidate` are true,
    /// the loop terminates early. Construct tokens that achieve both.
    #[test]
    fn scan_english_context_candidates_breaks_when_both_flags_true() {
        // Word that has Korean inside AND english on either side triggers both flags.
        // `english 안녕 english` provides english-bordered Korean → boundary,
        // and a mixed token with both scripts → same_token_context.
        let tokens = vec![
            word("english"),
            Token::Space(SpaceKind::Regular),
            word("hello한국english"),
            Token::Space(SpaceKind::Regular),
            word("english"),
        ];
        let _ = scan_english_context_candidates(&tokens);
    }
}