//! Text analysis for line breaking: whitespace normalization, grapheme
//! classification, and the segmentation pipeline that merges graphemes into
//! unbreakable units.

use std::cell::RefCell;
use std::collections::HashMap;
use std::ops::Range;
use std::rc::Rc;
use icu_locale::Locale;
use icu_segmenter::{
options::{WordBreakInvariantOptions, WordBreakOptions},
WordSegmenter,
};
use unicode_script::UnicodeScript;
use unicode_segmentation::UnicodeSegmentation;
use crate::engine::PrepareOptions;
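/// Whitespace handling, mirroring the CSS `white-space` values `normal`
/// (collapse) and `pre-wrap` (preserve).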
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
pub enum WhiteSpaceMode {
Normal,
PreWrap,
}
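/// Word-breaking behaviour, mirroring the CSS `word-break` values `normal`
/// and `keep-all`.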
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
pub enum WordBreakMode {
Normal,
KeepAll,
}
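/// Classification of one extended grapheme cluster in the normalized text.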
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum GraphemeKind {
Text,
Space,
Tab,
Newline,
SoftHyphen,
ZeroWidthBreak,
WordJoiner,
}
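/// A single extended grapheme cluster: its byte range in the normalized text
/// and its classification.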
#[derive(Clone, Debug)]
pub(crate) struct AnalyzedGrapheme {
pub byte_range: Range<usize>,
pub kind: GraphemeKind,
}
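/// Classification of a merged segment. `Glue` covers non-breaking characters
/// such as NBSP and the word joiner.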
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum AnalysisSegmentKind {
Text,
Space,
Tab,
Newline,
Glue,
ZeroWidthBreak,
SoftHyphen,
}
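/// A merged run of graphemes treated as a single unbreakable unit, addressed
/// by both byte range and grapheme-index range into the normalized text.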
#[derive(Clone, Debug)]
pub(crate) struct AnalyzedSegment {
pub byte_range: Range<usize>,
pub grapheme_range: Range<usize>,
#[allow(dead_code)]
pub kind: AnalysisSegmentKind,
}
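/// Output of [`analyze_text`]: the normalized text plus grapheme, segment, and
/// URL-span indexes, together with the modes used to produce them.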
#[derive(Clone, Debug)]
pub(crate) struct TextAnalysis {
pub normalized: String,
pub graphemes: Vec<AnalyzedGrapheme>,
pub segments: Vec<AnalyzedSegment>,
pub urls: Vec<Range<usize>>,
pub white_space: WhiteSpaceMode,
pub word_break: WordBreakMode,
}
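/// Per-thread cache of locale-specific ICU [`WordSegmenter`]s, keyed by locale
/// string. Locales that fail to parse or build are cached as `None` so they
/// are not retried.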
type LocaleWordSegmenterCache = RefCell<HashMap<String, Option<Rc<WordSegmenter>>>>;
thread_local! {
static LOCALE_WORD_SEGMENTER_CACHE: LocaleWordSegmenterCache = RefCell::new(HashMap::new());
}
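/// Clears the per-thread cache of locale-specific word segmenters.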
pub(crate) fn clear_runtime_caches() {
LOCALE_WORD_SEGMENTER_CACHE.with(|cache| cache.borrow_mut().clear());
}
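/// Runs the full analysis pipeline: normalizes whitespace according to
/// `opts.white_space`, splits the result into extended grapheme clusters,
/// classifies each cluster, detects URL-like spans, and merges graphemes into
/// the segments that line breaking operates on.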
pub(crate) fn analyze_text(
text: &str,
opts: &PrepareOptions,
locale: Option<&str>,
) -> TextAnalysis {
let white_space = opts.white_space;
let word_break = opts.word_break;
let normalized = match white_space {
WhiteSpaceMode::Normal => normalize_whitespace_normal(text),
WhiteSpaceMode::PreWrap => normalize_whitespace_pre_wrap(text),
};
let graphemes: Vec<AnalyzedGrapheme> =
UnicodeSegmentation::grapheme_indices(normalized.as_str(), true)
.map(|(start, grapheme)| {
let end = start + grapheme.len();
AnalyzedGrapheme {
byte_range: start..end,
kind: classify_grapheme(grapheme, white_space),
}
})
.collect();
let urls = find_url_spans(&normalized);
let segments = build_segments(
&normalized,
&graphemes,
white_space,
word_break,
&urls,
locale,
);
TextAnalysis {
normalized,
graphemes,
segments,
urls,
white_space,
word_break,
}
}
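/// Collapses runs of space, tab, CR, LF, and form feed into a single space and
/// drops leading and trailing collapsible whitespace, mirroring CSS
/// `white-space: normal`.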
pub fn normalize_whitespace_normal(text: &str) -> String {
if text.is_empty() {
return String::new();
}
let mut normalized = String::with_capacity(text.len());
let mut saw_collapsible_space = false;
for ch in text.chars() {
match ch {
' ' | '\t' | '\n' | '\r' | '\u{000C}' => {
saw_collapsible_space = true;
}
_ => {
if saw_collapsible_space && !normalized.is_empty() {
normalized.push(' ');
}
saw_collapsible_space = false;
normalized.push(ch);
}
}
}
if normalized.ends_with(' ') {
normalized.pop();
}
normalized
}
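/// Preserves all whitespace but normalizes line endings: CRLF, lone CR, and
/// form feed all become LF.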
pub fn normalize_whitespace_pre_wrap(text: &str) -> String {
text.replace("\r\n", "\n")
.replace('\r', "\n")
.replace('\u{000C}', "\n")
}
pub(crate) fn slice_text<'a>(text: &'a str, range: &Range<usize>) -> &'a str {
&text[range.clone()]
}
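/// Returns `true` when every character is recognized by [`is_cjk_char`]
/// (vacuously `true` for the empty string).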
pub(crate) fn is_cjk(text: &str) -> bool {
text.chars().all(is_cjk_char)
}
pub(crate) fn contains_cjk_text(text: &str) -> bool {
text.chars().any(is_cjk_char)
}
pub(crate) fn can_continue_keep_all_text_run(text: &str) -> bool {
text.chars().last().is_some_and(|ch| {
!is_keep_all_glue_char(ch)
&& !is_left_sticky_punctuation_char(ch)
&& !is_keep_all_dash_break_char(ch)
})
}
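/// Checks membership in the CJK Unified Ideograph blocks (base plus
/// Extensions A–J), the compatibility ideograph blocks, CJK Symbols and
/// Punctuation, Hiragana, Katakana, Hangul Compatibility Jamo, Hangul
/// Syllables, and the Halfwidth and Fullwidth Forms block.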
pub(crate) fn is_cjk_char(ch: char) -> bool {
let c = ch as u32;
matches!(
c,
0x4E00..=0x9FFF
| 0x3400..=0x4DBF
| 0x20000..=0x2A6DF
| 0x2A700..=0x2B73F
| 0x2B740..=0x2B81F
| 0x2B820..=0x2CEAF
| 0x2CEB0..=0x2EBEF
| 0x2EBF0..=0x2EE5D
| 0x30000..=0x3134F
| 0x31350..=0x323AF
| 0x323B0..=0x33479
| 0xF900..=0xFAFF
| 0x2F800..=0x2FA1F
| 0x3000..=0x303F
| 0x3040..=0x309F
| 0x30A0..=0x30FF
| 0x3130..=0x318F
| 0xAC00..=0xD7AF
| 0xFF00..=0xFFEF
)
}
fn is_keep_all_glue_char(ch: char) -> bool {
matches!(ch, '\u{00A0}' | '\u{202F}' | '\u{2060}' | '\u{FEFF}')
}
fn is_keep_all_dash_break_char(ch: char) -> bool {
matches!(ch, '-' | '\u{2010}' | '\u{2013}' | '\u{2014}')
}
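/// Returns `true` when every character is punctuation that must not begin a
/// line (closing brackets and quotes, sentence-ending marks, prolonged-sound
/// and iteration marks), in the spirit of CJK kinsoku rules.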
pub(crate) fn is_cjk_line_start_prohibited(text: &str) -> bool {
text.chars().all(|ch| {
matches!(
ch,
'\u{FF0C}'
| '\u{FF0E}'
| '\u{FF01}'
| '\u{FF1A}'
| '\u{FF1B}'
| '\u{FF1F}'
| '\u{3001}'
| '\u{3002}'
| '\u{30FB}'
| '\u{FF09}'
| '\u{3015}'
| '\u{3009}'
| '\u{300B}'
| '\u{300D}'
| '\u{300F}'
| '\u{3011}'
| '\u{3017}'
| '\u{3019}'
| '\u{301B}'
| '\u{30FC}'
| '\u{3005}'
| '\u{303B}'
| '\u{309D}'
| '\u{309E}'
| '\u{30FD}'
| '\u{30FE}'
| '.'
| ','
| '!'
| '?'
| ':'
| ';'
| ')'
| ']'
| '}'
| '%'
| '"'
| '”'
| '’'
| '»'
| '›'
| '…'
)
})
}
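/// Returns `true` when every character is opening punctuation that must not
/// end a line (opening brackets and quotes).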
pub(crate) fn is_cjk_line_end_prohibited(text: &str) -> bool {
text.chars().all(|ch| {
matches!(
ch,
'"' | '('
| '['
| '{'
| '“'
| '‘'
| '«'
| '‹'
| '\u{FF08}'
| '\u{3014}'
| '\u{3008}'
| '\u{300A}'
| '\u{300C}'
| '\u{300E}'
| '\u{3010}'
| '\u{3016}'
| '\u{3018}'
| '\u{301A}'
)
})
}
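/// Maps a grapheme cluster to its [`GraphemeKind`]. Tabs are only significant
/// under `pre-wrap`; `normal` normalization has already collapsed them.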
fn classify_grapheme(grapheme: &str, white_space: WhiteSpaceMode) -> GraphemeKind {
match grapheme {
"\n" => GraphemeKind::Newline,
"\t" if white_space == WhiteSpaceMode::PreWrap => GraphemeKind::Tab,
"\u{00AD}" => GraphemeKind::SoftHyphen,
"\u{200B}" => GraphemeKind::ZeroWidthBreak,
"\u{2060}" => GraphemeKind::WordJoiner,
" " => GraphemeKind::Space,
_ => GraphemeKind::Text,
}
}
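/// Finds URL-like spans by scanning for `http://`, `https://`, or `www.` and
/// extending each match to the next whitespace character.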
fn find_url_spans(text: &str) -> Vec<Range<usize>> {
let mut spans = Vec::new();
let mut start = 0usize;
while start < text.len() {
let remaining = &text[start..];
        // Pick the earliest match among the recognized prefixes rather than the
        // first prefix that happens to match anywhere in the remaining text.
        let Some(offset) = ["http://", "https://", "www."]
            .iter()
            .filter_map(|prefix| remaining.find(prefix))
            .min()
        else {
            break;
        };
let absolute_start = start + offset;
let mut absolute_end = text.len();
for (idx, ch) in text[absolute_start..].char_indices() {
if ch.is_whitespace() {
absolute_end = absolute_start + idx;
break;
}
}
if absolute_end > absolute_start {
spans.push(absolute_start..absolute_end);
}
        // Advance past the terminating whitespace by its UTF-8 length so the
        // next slice starts on a character boundary; a bare `+ 1` could land
        // inside a multi-byte whitespace character such as U+00A0.
        start = match text[absolute_end..].chars().next() {
            Some(ch) => absolute_end + ch.len_utf8(),
            None => text.len(),
        };
}
spans
}
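/// Intermediate piece used during segment merging. `word_like` marks pieces
/// made of word characters (optionally with combining marks).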
#[derive(Clone, Debug)]
struct SegmentationPiece {
byte_range: Range<usize>,
grapheme_range: Range<usize>,
kind: AnalysisSegmentKind,
word_like: bool,
}
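/// Builds the final segments from classified graphemes by running a sequence
/// of merge passes over ICU word pieces: text-piece merging heuristics,
/// escaped quotes, forward-sticky (opening) punctuation, glue characters, URL
/// spans, numeric runs (merged, then re-split at hyphens), ASCII punctuation
/// chains, trailing opening punctuation at CJK boundaries, leading combining
/// marks, and finally `keep-all` grouping when requested.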
fn build_segments(
normalized: &str,
graphemes: &[AnalyzedGrapheme],
white_space: WhiteSpaceMode,
word_break: WordBreakMode,
urls: &[Range<usize>],
locale: Option<&str>,
) -> Vec<AnalyzedSegment> {
if graphemes.is_empty() {
return Vec::new();
}
let grapheme_starts: HashMap<usize, usize> = graphemes
.iter()
.enumerate()
.map(|(index, grapheme)| (grapheme.byte_range.start, index))
.collect();
let mut pieces = build_initial_pieces(normalized, graphemes, white_space, locale);
pieces = merge_initial_text_pieces(normalized, pieces);
merge_escaped_quote_clusters_into_previous(normalized, &mut pieces);
merge_forward_sticky_into_next(normalized, &mut pieces);
pieces = merge_glue_connected_text_runs(normalized, word_break, pieces);
pieces = merge_url_like_runs(normalized, urls, &grapheme_starts, graphemes, pieces);
pieces = merge_numeric_runs(normalized, pieces);
pieces = split_hyphenated_numeric_runs(normalized, &grapheme_starts, graphemes, pieces);
pieces = merge_ascii_punctuation_chains(normalized, pieces);
pieces = carry_trailing_forward_sticky_across_cjk_boundary(
normalized,
&grapheme_starts,
graphemes,
pieces,
);
merge_leading_marks_after_space(normalized, &mut pieces);
if word_break == WordBreakMode::KeepAll {
pieces = merge_keep_all_text_runs(normalized, pieces);
}
pieces
.into_iter()
.map(|piece| AnalyzedSegment {
byte_range: piece.byte_range,
grapheme_range: piece.grapheme_range,
kind: piece.kind,
})
.collect()
}
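/// Splits each ICU word piece into runs of uniform [`AnalysisSegmentKind`],
/// recording both byte ranges and grapheme-index ranges.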
fn build_initial_pieces(
normalized: &str,
graphemes: &[AnalyzedGrapheme],
white_space: WhiteSpaceMode,
locale: Option<&str>,
) -> Vec<SegmentationPiece> {
let mut pieces = Vec::new();
let mut grapheme_index = 0usize;
for piece_range in word_piece_ranges(normalized, graphemes, locale) {
let piece_start = piece_range.start;
let piece_end = piece_range.end;
let mut current_kind: Option<AnalysisSegmentKind> = None;
let mut current_grapheme_start = grapheme_index;
let mut current_piece_start = piece_start;
while grapheme_index < graphemes.len()
&& graphemes[grapheme_index].byte_range.start < piece_end
{
let grapheme = &graphemes[grapheme_index];
let grapheme_text = slice_text(normalized, &grapheme.byte_range);
let kind = classify_segment_kind(grapheme_text, grapheme.kind, white_space);
if current_kind.is_some_and(|current| current != kind) {
let byte_range = current_piece_start..grapheme.byte_range.start;
pieces.push(SegmentationPiece {
word_like: is_word_like_text(slice_text(normalized, &byte_range)),
byte_range,
grapheme_range: current_grapheme_start..grapheme_index,
kind: current_kind.expect("current kind must exist"),
});
current_piece_start = grapheme.byte_range.start;
current_grapheme_start = grapheme_index;
}
current_kind = Some(kind);
grapheme_index += 1;
}
if let Some(kind) = current_kind {
let byte_range = current_piece_start..piece_end;
pieces.push(SegmentationPiece {
word_like: is_word_like_text(slice_text(normalized, &byte_range)),
byte_range,
grapheme_range: current_grapheme_start..grapheme_index,
kind,
});
}
}
pieces
}
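/// Returns a cached [`WordSegmenter`] for `locale`, building it on first use.
/// Yields `None` for missing or empty locales and for locales that fail to
/// parse or build.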
fn locale_word_segmenter(locale: Option<&str>) -> Option<Rc<WordSegmenter>> {
let locale = locale?.trim();
if locale.is_empty() {
return None;
}
LOCALE_WORD_SEGMENTER_CACHE.with(|cache| {
if let Some(cached) = cache.borrow().get(locale).cloned() {
return cached;
}
let segmenter = build_locale_word_segmenter(locale);
cache
.borrow_mut()
.insert(locale.to_owned(), segmenter.clone());
segmenter
})
}
fn build_locale_word_segmenter(locale: &str) -> Option<Rc<WordSegmenter>> {
let locale = locale.parse::<Locale>().ok()?;
let mut options = WordBreakOptions::default();
options.content_locale = Some(&locale.id);
WordSegmenter::try_new_auto(options).ok().map(Rc::new)
}
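/// Computes ICU word-boundary ranges over `normalized`; ranges consisting
/// entirely of CJK characters are split further into one range per grapheme
/// cluster so each ideograph can break independently.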
fn word_piece_ranges(
normalized: &str,
graphemes: &[AnalyzedGrapheme],
locale: Option<&str>,
) -> Vec<Range<usize>> {
let boundaries = if let Some(segmenter) = locale_word_segmenter(locale) {
segmenter
.as_borrowed()
.segment_str(normalized)
.collect::<Vec<_>>()
} else {
WordSegmenter::new_auto(WordBreakInvariantOptions::default())
.segment_str(normalized)
.collect::<Vec<_>>()
};
boundaries
.windows(2)
.filter_map(|window| {
let start = window[0];
let end = window[1];
(start < end).then_some(start..end)
})
.flat_map(|piece_range| {
let piece_text = slice_text(normalized, &piece_range);
if !is_cjk(piece_text) {
return vec![piece_range];
}
graphemes
.iter()
.filter(|grapheme| {
grapheme.byte_range.start >= piece_range.start
&& grapheme.byte_range.start < piece_range.end
})
.map(|grapheme| grapheme.byte_range.clone())
.collect::<Vec<_>>()
})
.collect()
}
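/// Joins adjacent text pieces whenever [`should_merge_text_piece`] decides the
/// boundary between them must not become a break opportunity.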
fn merge_initial_text_pieces(
normalized: &str,
pieces: Vec<SegmentationPiece>,
) -> Vec<SegmentationPiece> {
let mut merged: Vec<SegmentationPiece> = Vec::with_capacity(pieces.len());
for piece in pieces {
if let Some(previous) = merged.last_mut() {
if previous.kind == AnalysisSegmentKind::Text
&& piece.kind == AnalysisSegmentKind::Text
&& should_merge_text_piece(normalized, previous, &piece)
{
previous.byte_range.end = piece.byte_range.end;
previous.grapheme_range.end = piece.grapheme_range.end;
previous.word_like = previous.word_like || piece.word_like;
continue;
}
}
merged.push(piece);
}
merged
}
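/// Heuristics for keeping adjacent text pieces together: CJK punctuation that
/// cannot start a line, Myanmar medial glue, Arabic no-space punctuation,
/// repeated single-character runs, and sticky punctuation clusters.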
fn should_merge_text_piece(
normalized: &str,
previous: &SegmentationPiece,
next: &SegmentationPiece,
) -> bool {
let previous_text = slice_text(normalized, &previous.byte_range);
let next_text = slice_text(normalized, &next.byte_range);
if is_cjk(next_text) && is_cjk(previous_text) && is_cjk_line_start_prohibited(next_text) {
return true;
}
if ends_with_myanmar_medial_glue(previous_text) {
return true;
}
if next.word_like
&& contains_arabic_script(next_text)
&& ends_with_arabic_no_space_punctuation(previous_text)
{
return true;
}
if !next.word_like
&& next_text != "-"
&& next_text != "—"
&& is_repeated_single_char_run(previous_text, next_text)
{
return true;
}
if !previous.word_like && is_forward_sticky_cluster_segment(previous_text) {
return true;
}
!next.word_like
&& !contains_cjk_text(previous_text)
&& (is_left_sticky_punctuation_segment(next_text)
|| (next_text == "-" && previous.word_like))
}
fn merge_escaped_quote_clusters_into_previous(
normalized: &str,
pieces: &mut Vec<SegmentationPiece>,
) {
let mut index = 1usize;
while index < pieces.len() {
let merge = pieces[index].kind == AnalysisSegmentKind::Text
&& !pieces[index].word_like
&& pieces[index - 1].kind == AnalysisSegmentKind::Text
&& !contains_cjk_text(slice_text(normalized, &pieces[index - 1].byte_range))
&& is_escaped_quote_cluster_segment(slice_text(normalized, &pieces[index].byte_range));
if merge {
let current = pieces.remove(index);
pieces[index - 1].byte_range.end = current.byte_range.end;
pieces[index - 1].grapheme_range.end = current.grapheme_range.end;
pieces[index - 1].word_like = pieces[index - 1].word_like || current.word_like;
} else {
index += 1;
}
}
}
fn merge_forward_sticky_into_next(normalized: &str, pieces: &mut Vec<SegmentationPiece>) {
let mut index = 0usize;
while index + 1 < pieces.len() {
let merge = pieces[index].kind == AnalysisSegmentKind::Text
&& !pieces[index].word_like
&& pieces[index + 1].kind == AnalysisSegmentKind::Text
&& is_forward_sticky_cluster_segment(slice_text(normalized, &pieces[index].byte_range));
if merge {
let current = pieces.remove(index);
pieces[index].byte_range.start = current.byte_range.start;
pieces[index].grapheme_range.start = current.grapheme_range.start;
pieces[index].word_like = pieces[index].word_like || current.word_like;
index = index.saturating_sub(1);
} else {
index += 1;
}
}
}
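/// Folds glue pieces (NBSP, narrow NBSP, word joiner, U+FEFF) into the
/// surrounding text run. Under `keep-all`, glue between non-CJK and CJK text
/// stays attached to the non-CJK side and the following CJK run starts a new
/// piece.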
fn merge_glue_connected_text_runs(
normalized: &str,
word_break: WordBreakMode,
pieces: Vec<SegmentationPiece>,
) -> Vec<SegmentationPiece> {
let mut merged = Vec::with_capacity(pieces.len());
let mut read = 0usize;
while read < pieces.len() {
let mut piece = pieces[read].clone();
if piece.kind == AnalysisSegmentKind::Glue {
let glue_start = piece.byte_range.start;
let glue_grapheme_start = piece.grapheme_range.start;
read += 1;
while read < pieces.len() && pieces[read].kind == AnalysisSegmentKind::Glue {
piece.byte_range.end = pieces[read].byte_range.end;
piece.grapheme_range.end = pieces[read].grapheme_range.end;
read += 1;
}
if read < pieces.len() && pieces[read].kind == AnalysisSegmentKind::Text {
piece.kind = AnalysisSegmentKind::Text;
piece.byte_range.end = pieces[read].byte_range.end;
piece.grapheme_range.end = pieces[read].grapheme_range.end;
piece.byte_range.start = glue_start;
piece.grapheme_range.start = glue_grapheme_start;
piece.word_like = pieces[read].word_like;
read += 1;
} else {
merged.push(piece);
continue;
}
} else {
read += 1;
}
if piece.kind == AnalysisSegmentKind::Text {
while read < pieces.len() && pieces[read].kind == AnalysisSegmentKind::Glue {
let glue = pieces[read].clone();
read += 1;
let mut trailing_glue = glue.clone();
while read < pieces.len() && pieces[read].kind == AnalysisSegmentKind::Glue {
trailing_glue.byte_range.end = pieces[read].byte_range.end;
trailing_glue.grapheme_range.end = pieces[read].grapheme_range.end;
read += 1;
}
if read < pieces.len() && pieces[read].kind == AnalysisSegmentKind::Text {
let current_text = slice_text(normalized, &piece.byte_range);
let next_text = slice_text(normalized, &pieces[read].byte_range);
if word_break == WordBreakMode::KeepAll
&& !contains_cjk_text(current_text)
&& contains_cjk_text(next_text)
{
piece.byte_range.end = trailing_glue.byte_range.end;
piece.grapheme_range.end = trailing_glue.grapheme_range.end;
break;
}
piece.byte_range.end = pieces[read].byte_range.end;
piece.grapheme_range.end = pieces[read].grapheme_range.end;
piece.word_like = piece.word_like || pieces[read].word_like;
read += 1;
} else {
piece.byte_range.end = trailing_glue.byte_range.end;
piece.grapheme_range.end = trailing_glue.grapheme_range.end;
}
}
}
merged.push(piece);
}
merged
}
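/// Collapses all pieces covered by a detected URL span into a single text
/// piece, split once just after the first `?` so long query strings keep one
/// break opportunity.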
fn merge_url_like_runs(
normalized: &str,
urls: &[Range<usize>],
grapheme_starts: &HashMap<usize, usize>,
graphemes: &[AnalyzedGrapheme],
pieces: Vec<SegmentationPiece>,
) -> Vec<SegmentationPiece> {
if urls.is_empty() {
return pieces;
}
let mut merged = Vec::new();
let mut piece_index = 0usize;
let mut url_index = 0usize;
while piece_index < pieces.len() {
while url_index < urls.len() && pieces[piece_index].byte_range.start >= urls[url_index].end
{
url_index += 1;
}
if url_index < urls.len() {
let span = &urls[url_index];
if pieces[piece_index].byte_range.start >= span.start
&& pieces[piece_index].byte_range.end <= span.end
{
while piece_index < pieces.len() && pieces[piece_index].byte_range.end <= span.end {
piece_index += 1;
}
let url_text = &normalized[span.clone()];
if let Some(query_offset) = url_text.find('?') {
let query_split = span.start + query_offset + '?'.len_utf8();
if query_split < span.end {
merged.push(make_text_piece(
span.start,
query_split,
grapheme_starts,
graphemes,
));
merged.push(make_text_piece(
query_split,
span.end,
grapheme_starts,
graphemes,
));
url_index += 1;
continue;
}
}
merged.push(make_text_piece(
span.start,
span.end,
grapheme_starts,
graphemes,
));
url_index += 1;
continue;
}
}
merged.push(pieces[piece_index].clone());
piece_index += 1;
}
merged
}
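/// Joins adjacent text pieces that are digit-and-joiner runs (times, ranges,
/// fractions such as `7:00-9:00`) into one numeric piece.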
fn merge_numeric_runs(normalized: &str, pieces: Vec<SegmentationPiece>) -> Vec<SegmentationPiece> {
let mut merged = Vec::new();
let mut index = 0usize;
while index < pieces.len() {
let piece = &pieces[index];
let piece_text = slice_text(normalized, &piece.byte_range);
if piece.kind == AnalysisSegmentKind::Text
&& is_numeric_run_segment(piece_text)
&& contains_decimal_digit(piece_text)
{
let mut merged_piece = piece.clone();
index += 1;
while index < pieces.len()
&& pieces[index].kind == AnalysisSegmentKind::Text
&& is_numeric_run_segment(slice_text(normalized, &pieces[index].byte_range))
{
merged_piece.byte_range.end = pieces[index].byte_range.end;
merged_piece.grapheme_range.end = pieces[index].grapheme_range.end;
merged_piece.word_like = true;
index += 1;
}
merged.push(merged_piece);
continue;
}
merged.push(piece.clone());
index += 1;
}
merged
}
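/// Re-splits a numeric run at each `-` when every part is itself numeric,
/// keeping the hyphen at the end of the left part (`7:00-9:00` becomes
/// `7:00-` and `9:00`).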
fn split_hyphenated_numeric_runs(
normalized: &str,
grapheme_starts: &HashMap<usize, usize>,
graphemes: &[AnalyzedGrapheme],
pieces: Vec<SegmentationPiece>,
) -> Vec<SegmentationPiece> {
let mut split = Vec::new();
for piece in pieces {
let text = slice_text(normalized, &piece.byte_range);
if piece.kind == AnalysisSegmentKind::Text && text.contains('-') {
let parts: Vec<&str> = text.split('-').collect();
let should_split = parts.len() > 1
&& parts.iter().all(|part| {
!part.is_empty() && contains_decimal_digit(part) && is_numeric_run_segment(part)
});
if should_split {
let mut offset = piece.byte_range.start;
for (index, part) in parts.iter().enumerate() {
let mut part_end = offset + part.len();
if index + 1 < parts.len() {
part_end += 1;
}
split.push(make_text_piece(
offset,
part_end,
grapheme_starts,
graphemes,
));
offset = part_end;
}
continue;
}
}
split.push(piece);
}
split
}
fn merge_ascii_punctuation_chains(
normalized: &str,
pieces: Vec<SegmentationPiece>,
) -> Vec<SegmentationPiece> {
let mut merged = Vec::new();
let mut index = 0usize;
while index < pieces.len() {
let piece = &pieces[index];
let text = slice_text(normalized, &piece.byte_range);
if piece.kind == AnalysisSegmentKind::Text
&& piece.word_like
&& is_ascii_punctuation_chain_segment(text)
{
let mut merged_piece = piece.clone();
let mut merged_text = text.to_owned();
index += 1;
while has_ascii_chain_trailing_joiners(&merged_text)
&& index < pieces.len()
&& pieces[index].kind == AnalysisSegmentKind::Text
&& pieces[index].word_like
&& is_ascii_punctuation_chain_segment(slice_text(
normalized,
&pieces[index].byte_range,
))
{
merged_piece.byte_range.end = pieces[index].byte_range.end;
merged_piece.grapheme_range.end = pieces[index].grapheme_range.end;
merged_text.push_str(slice_text(normalized, &pieces[index].byte_range));
index += 1;
}
merged.push(merged_piece);
continue;
}
merged.push(piece.clone());
index += 1;
}
merged
}
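/// When two adjacent CJK text pieces meet and the left one ends with opening
/// punctuation (plus any combining marks), moves that trailing cluster to the
/// start of the right piece so it cannot end a line.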
fn carry_trailing_forward_sticky_across_cjk_boundary(
normalized: &str,
grapheme_starts: &HashMap<usize, usize>,
graphemes: &[AnalyzedGrapheme],
mut pieces: Vec<SegmentationPiece>,
) -> Vec<SegmentationPiece> {
if pieces.len() < 2 {
return pieces;
}
for index in 0..pieces.len() - 1 {
if pieces[index].kind != AnalysisSegmentKind::Text
|| pieces[index + 1].kind != AnalysisSegmentKind::Text
{
continue;
}
let left = slice_text(normalized, &pieces[index].byte_range);
let right = slice_text(normalized, &pieces[index + 1].byte_range);
if !is_cjk(left) || !is_cjk(right) {
continue;
}
let Some(split_offset) = split_trailing_forward_sticky_cluster(left) else {
continue;
};
let split_byte = pieces[index].byte_range.start + split_offset;
pieces[index].byte_range.end = split_byte;
pieces[index].grapheme_range.end =
grapheme_index_for_byte(grapheme_starts, graphemes, split_byte);
pieces[index + 1].byte_range.start = split_byte;
pieces[index + 1].grapheme_range.start =
grapheme_index_for_byte(grapheme_starts, graphemes, split_byte);
}
pieces
}
fn merge_leading_marks_after_space(normalized: &str, pieces: &mut Vec<SegmentationPiece>) {
let mut index = 1usize;
while index + 1 < pieces.len() {
let merge = matches!(
pieces[index - 1].kind,
AnalysisSegmentKind::Space | AnalysisSegmentKind::Glue
) && pieces[index].kind == AnalysisSegmentKind::Text
&& !pieces[index].word_like
&& is_marks_only(slice_text(normalized, &pieces[index].byte_range))
&& pieces[index + 1].kind == AnalysisSegmentKind::Text
&& contains_arabic_script(slice_text(normalized, &pieces[index + 1].byte_range));
if merge {
let current = pieces.remove(index);
pieces[index].byte_range.start = current.byte_range.start;
pieces[index].grapheme_range.start = current.grapheme_range.start;
} else {
index += 1;
}
}
}
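/// Under `word-break: keep-all`, merges consecutive text pieces into a single
/// piece whenever the group contains CJK text; groups are ended at glue
/// characters, left-sticky punctuation, and dashes (see
/// [`can_continue_keep_all_text_run`]).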
fn merge_keep_all_text_runs(
normalized: &str,
pieces: Vec<SegmentationPiece>,
) -> Vec<SegmentationPiece> {
let mut merged: Vec<SegmentationPiece> = Vec::with_capacity(pieces.len());
let mut group_start: Option<usize> = None;
let mut group_contains_cjk = false;
fn push_merged_group(
normalized: &str,
pieces: &[SegmentationPiece],
merged: &mut Vec<SegmentationPiece>,
start: usize,
end: usize,
contains_cjk: bool,
) {
if start >= end {
return;
}
if !contains_cjk {
merged.extend(pieces[start..end].iter().cloned());
return;
}
let mut piece = pieces[start].clone();
for next in &pieces[start + 1..end] {
piece.byte_range.end = next.byte_range.end;
piece.grapheme_range.end = next.grapheme_range.end;
piece.word_like = piece.word_like || next.word_like;
}
piece.kind = AnalysisSegmentKind::Text;
debug_assert!(contains_cjk_text(slice_text(normalized, &piece.byte_range)));
merged.push(piece);
}
for (index, piece) in pieces.iter().enumerate() {
if piece.kind == AnalysisSegmentKind::Text {
if let Some(start) = group_start {
let previous_text = slice_text(normalized, &pieces[index - 1].byte_range);
if !can_continue_keep_all_text_run(previous_text) {
push_merged_group(
normalized,
&pieces,
&mut merged,
start,
index,
group_contains_cjk,
);
group_start = Some(index);
group_contains_cjk = false;
}
} else {
group_start = Some(index);
}
group_contains_cjk =
group_contains_cjk || contains_cjk_text(slice_text(normalized, &piece.byte_range));
continue;
}
if let Some(start) = group_start.take() {
push_merged_group(
normalized,
&pieces,
&mut merged,
start,
index,
group_contains_cjk,
);
group_contains_cjk = false;
}
merged.push(piece.clone());
}
if let Some(start) = group_start {
push_merged_group(
normalized,
&pieces,
&mut merged,
start,
pieces.len(),
group_contains_cjk,
);
}
merged
}
fn make_text_piece(
start: usize,
end: usize,
grapheme_starts: &HashMap<usize, usize>,
graphemes: &[AnalyzedGrapheme],
) -> SegmentationPiece {
SegmentationPiece {
byte_range: start..end,
grapheme_range: grapheme_index_for_byte(grapheme_starts, graphemes, start)
..grapheme_index_for_byte(grapheme_starts, graphemes, end),
kind: AnalysisSegmentKind::Text,
word_like: true,
}
}
fn grapheme_index_for_byte(
grapheme_starts: &HashMap<usize, usize>,
graphemes: &[AnalyzedGrapheme],
byte: usize,
) -> usize {
grapheme_starts
.get(&byte)
.copied()
.unwrap_or_else(|| graphemes.len())
}
fn classify_segment_kind(
grapheme: &str,
grapheme_kind: GraphemeKind,
white_space: WhiteSpaceMode,
) -> AnalysisSegmentKind {
match grapheme_kind {
GraphemeKind::Space => AnalysisSegmentKind::Space,
GraphemeKind::Tab if white_space == WhiteSpaceMode::PreWrap => AnalysisSegmentKind::Tab,
GraphemeKind::Newline if white_space == WhiteSpaceMode::PreWrap => {
AnalysisSegmentKind::Newline
}
GraphemeKind::ZeroWidthBreak => AnalysisSegmentKind::ZeroWidthBreak,
GraphemeKind::SoftHyphen => AnalysisSegmentKind::SoftHyphen,
GraphemeKind::WordJoiner => AnalysisSegmentKind::Glue,
_ if matches!(grapheme, "\u{00A0}" | "\u{202F}" | "\u{FEFF}") => AnalysisSegmentKind::Glue,
_ => AnalysisSegmentKind::Text,
}
}
fn is_word_like_text(text: &str) -> bool {
let mut saw_word = false;
for ch in text.chars() {
if is_word_char(ch) {
saw_word = true;
continue;
}
if is_combining_mark(ch) {
continue;
}
return false;
}
saw_word
}
fn is_word_char(ch: char) -> bool {
ch.is_alphanumeric()
|| ch == '_'
|| (is_cjk_char(ch)
&& !ch.is_whitespace()
&& !is_left_sticky_punctuation_char(ch)
&& !is_forward_sticky_char(ch))
}
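/// Hand-rolled subset of Unicode combining marks (general diacritics plus
/// Cyrillic, Hebrew, Arabic, Syriac, Thaana, NKo, Samaritan, and Mandaic
/// marks, and the extended/supplement/symbols/half-mark blocks); not a
/// complete mark-category test.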
fn is_combining_mark(ch: char) -> bool {
matches!(
ch as u32,
0x0300..=0x036F
| 0x0483..=0x0489
| 0x0591..=0x05BD
| 0x05BF
| 0x05C1..=0x05C2
| 0x05C4..=0x05C5
| 0x05C7
| 0x0610..=0x061A
| 0x064B..=0x065F
| 0x0670
| 0x06D6..=0x06DC
| 0x06DF..=0x06E4
| 0x06E7..=0x06E8
| 0x06EA..=0x06ED
| 0x0711
| 0x0730..=0x074A
| 0x07A6..=0x07B0
| 0x07EB..=0x07F3
| 0x0816..=0x0819
| 0x081B..=0x0823
| 0x0825..=0x0827
| 0x0829..=0x082D
| 0x0859..=0x085B
| 0x08D3..=0x08FF
| 0x1AB0..=0x1AFF
| 0x1DC0..=0x1DFF
| 0x20D0..=0x20FF
| 0xFE20..=0xFE2F
)
}
fn contains_arabic_script(text: &str) -> bool {
text.chars()
.any(|ch| ch.script() == unicode_script::Script::Arabic)
}
fn ends_with_arabic_no_space_punctuation(text: &str) -> bool {
text.chars()
.last()
.is_some_and(|ch| matches!(ch, ':' | '.' | '\u{060C}' | '\u{061B}'))
}
fn ends_with_myanmar_medial_glue(text: &str) -> bool {
text.chars().last().is_some_and(|ch| ch == '\u{104F}')
}
fn is_left_sticky_punctuation_segment(segment: &str) -> bool {
let mut saw_punctuation = false;
for ch in segment.chars() {
if is_left_sticky_punctuation_char(ch) {
saw_punctuation = true;
continue;
}
if saw_punctuation && is_combining_mark(ch) {
continue;
}
return false;
}
saw_punctuation
}
fn is_forward_sticky_cluster_segment(segment: &str) -> bool {
if is_escaped_quote_cluster_segment(segment) {
return true;
}
!segment.is_empty()
&& segment
.chars()
.all(|ch| is_forward_sticky_char(ch) || is_combining_mark(ch))
}
fn is_escaped_quote_cluster_segment(segment: &str) -> bool {
let mut saw_quote = false;
for ch in segment.chars() {
if ch == '\\' || is_combining_mark(ch) {
continue;
}
if is_forward_sticky_char(ch) || is_left_sticky_punctuation_char(ch) {
saw_quote = true;
continue;
}
return false;
}
saw_quote
}
fn is_left_sticky_punctuation_char(ch: char) -> bool {
matches!(
ch,
'.' | ','
| '!'
| '?'
| ':'
| ';'
| ')'
| ']'
| '}'
| '%'
| '"'
| '”'
| '’'
| '»'
| '›'
| '…'
| '\u{060C}'
| '\u{061B}'
| '\u{061F}'
| '\u{0964}'
| '\u{0965}'
| '\u{104A}'
| '\u{104B}'
| '\u{104C}'
| '\u{104D}'
| '\u{104F}'
| '\u{FF0C}'
| '\u{FF0E}'
| '\u{FF01}'
| '\u{FF1A}'
| '\u{FF1B}'
| '\u{FF1F}'
| '\u{3001}'
| '\u{3002}'
| '\u{30FB}'
| '\u{FF09}'
| '\u{3015}'
| '\u{3009}'
| '\u{300B}'
| '\u{300D}'
| '\u{300F}'
| '\u{3011}'
| '\u{3017}'
| '\u{3019}'
| '\u{301B}'
| '\u{30FC}'
| '\u{3005}'
| '\u{303B}'
| '\u{309D}'
| '\u{309E}'
| '\u{30FD}'
| '\u{30FE}'
)
}
fn is_forward_sticky_char(ch: char) -> bool {
matches!(
ch,
'"' | '('
| '['
| '{'
| '“'
| '‘'
| '«'
| '‹'
| '\''
| '’'
| '\u{FF08}'
| '\u{3014}'
| '\u{3008}'
| '\u{300A}'
| '\u{300C}'
| '\u{300E}'
| '\u{3010}'
| '\u{3016}'
| '\u{3018}'
| '\u{301A}'
)
}
fn split_trailing_forward_sticky_cluster(text: &str) -> Option<usize> {
let chars: Vec<(usize, char)> = text.char_indices().collect();
let mut split_index = chars.len();
while split_index > 0 {
let ch = chars[split_index - 1].1;
if is_combining_mark(ch) || is_forward_sticky_char(ch) {
split_index -= 1;
continue;
}
break;
}
if split_index == 0 || split_index == chars.len() {
return None;
}
Some(chars[split_index].0)
}
fn is_repeated_single_char_run(segment: &str, ch_segment: &str) -> bool {
let mut chars = ch_segment.chars();
let Some(ch) = chars.next() else {
return false;
};
if chars.next().is_some() {
return false;
}
!segment.is_empty() && segment.chars().all(|part| part == ch)
}
fn contains_decimal_digit(text: &str) -> bool {
text.chars().any(|ch| ch.to_digit(10).is_some())
}
fn is_numeric_run_segment(text: &str) -> bool {
!text.is_empty()
&& text
.chars()
.all(|ch| ch.to_digit(10).is_some() || is_numeric_joiner_char(ch))
}
fn is_numeric_joiner_char(ch: char) -> bool {
matches!(
ch,
':' | '-' | '/' | '×' | ',' | '.' | '+' | '\u{2013}' | '\u{2014}'
)
}
fn is_ascii_punctuation_chain_segment(text: &str) -> bool {
let mut saw_word = false;
let mut saw_trailing_joiner = false;
for ch in text.chars() {
if ch.is_ascii_alphanumeric() || ch == '_' {
if saw_trailing_joiner {
return false;
}
saw_word = true;
continue;
}
if saw_word && matches!(ch, ',' | ':' | ';') {
saw_trailing_joiner = true;
continue;
}
return false;
}
saw_word
}
fn has_ascii_chain_trailing_joiners(text: &str) -> bool {
let mut saw_joiner = false;
for ch in text.chars().rev() {
if matches!(ch, ',' | ':' | ';') {
saw_joiner = true;
continue;
}
break;
}
saw_joiner
}
fn is_marks_only(text: &str) -> bool {
!text.is_empty() && text.chars().all(is_combining_mark)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::engine::PrepareOptions;
fn segment_texts(text: &str, white_space: WhiteSpaceMode) -> Vec<String> {
segment_texts_with_locale(text, white_space, None)
}
fn segment_texts_with_word_break(
text: &str,
white_space: WhiteSpaceMode,
word_break: WordBreakMode,
) -> Vec<String> {
let analysis = analyze_text(
text,
&PrepareOptions {
white_space,
word_break,
paragraph_direction: crate::bidi::ParagraphDirection::Auto,
letter_spacing: 0.0,
},
None,
);
analysis
.segments
.iter()
.map(|segment| slice_text(&analysis.normalized, &segment.byte_range).to_owned())
.collect()
}
fn segment_texts_with_locale(
text: &str,
white_space: WhiteSpaceMode,
locale: Option<&str>,
) -> Vec<String> {
let analysis = analyze_text(
text,
&PrepareOptions {
white_space,
word_break: WordBreakMode::Normal,
paragraph_direction: crate::bidi::ParagraphDirection::Auto,
letter_spacing: 0.0,
},
locale,
);
analysis
.segments
.iter()
.map(|segment| slice_text(&analysis.normalized, &segment.byte_range).to_owned())
.collect()
}
#[test]
fn normal_mode_builds_expected_segments() {
assert_eq!(
segment_texts(" Hello\t \n World ", WhiteSpaceMode::Normal),
vec!["Hello".to_owned(), " ".to_owned(), "World".to_owned()]
);
}
#[test]
fn pre_wrap_keeps_spaces_tabs_and_breaks_visible() {
assert_eq!(
segment_texts("Hello\tWorld\n Tail", WhiteSpaceMode::PreWrap),
vec![
"Hello".to_owned(),
"\t".to_owned(),
"World".to_owned(),
"\n".to_owned(),
" ".to_owned(),
"Tail".to_owned()
]
);
}
#[test]
fn segmentation_keeps_urls_numeric_runs_and_punctuation_attached() {
assert_eq!(
segment_texts(
"see https://example.com/reports/q3?lang=ar&mode=full now",
WhiteSpaceMode::Normal
),
vec![
"see".to_owned(),
" ".to_owned(),
"https://example.com/reports/q3?".to_owned(),
"lang=ar&mode=full".to_owned(),
" ".to_owned(),
"now".to_owned()
]
);
assert_eq!(
segment_texts("window 7:00-9:00 only", WhiteSpaceMode::Normal),
vec![
"window".to_owned(),
" ".to_owned(),
"7:00-".to_owned(),
"9:00".to_owned(),
" ".to_owned(),
"only".to_owned()
]
);
assert_eq!(
segment_texts("foo;bar === heading ===", WhiteSpaceMode::Normal),
vec![
"foo;bar".to_owned(),
" ".to_owned(),
"===".to_owned(),
" ".to_owned(),
"heading".to_owned(),
" ".to_owned(),
"===".to_owned()
]
);
}
#[test]
fn segmentation_applies_cjk_and_arabic_attachment_rules() {
assert_eq!(
segment_texts("中文,测试。", WhiteSpaceMode::Normal),
vec![
"中".to_owned(),
"文,".to_owned(),
"测".to_owned(),
"试。".to_owned()
]
);
assert_eq!(
segment_texts("作者はさつき、「下人", WhiteSpaceMode::Normal),
vec![
"作".to_owned(),
"者".to_owned(),
"は".to_owned(),
"さ".to_owned(),
"つ".to_owned(),
"き、".to_owned(),
"「下".to_owned(),
"人".to_owned()
]
);
assert_eq!(
segment_texts("مرحبا، عالم؟", WhiteSpaceMode::Normal),
vec!["مرحبا،".to_owned(), " ".to_owned(), "عالم؟".to_owned()]
);
assert_eq!(
segment_texts("فيقول:وعليك السلام", WhiteSpaceMode::Normal),
vec!["فيقول:وعليك".to_owned(), " ".to_owned(), "السلام".to_owned()]
);
}
#[test]
fn segmentation_treats_hangul_compatibility_jamo_as_cjk_units() {
assert_eq!(
segment_texts("ㅋㅋㅋ 진짜", WhiteSpaceMode::Normal),
vec![
"ㅋ".to_owned(),
"ㅋ".to_owned(),
"ㅋ".to_owned(),
" ".to_owned(),
"진".to_owned(),
"짜".to_owned()
]
);
}
#[test]
fn segmentation_keeps_cjk_opening_brackets_with_following_annotations() {
assert_eq!(
segment_texts("서울(Seoul)과", WhiteSpaceMode::Normal),
vec![
"서".to_owned(),
"울".to_owned(),
"(Seoul)".to_owned(),
"과".to_owned()
]
);
assert_eq!(
segment_texts("東京(Tokyo)と", WhiteSpaceMode::Normal),
vec![
"東".to_owned(),
"京".to_owned(),
"(Tokyo)".to_owned(),
"と".to_owned()
]
);
assert_eq!(
segment_texts("참조[1]와", WhiteSpaceMode::Normal),
vec![
"참".to_owned(),
"조".to_owned(),
"[1]".to_owned(),
"와".to_owned()
]
);
}
#[test]
fn locale_can_change_word_boundaries() {
assert_eq!(
segment_texts_with_locale("EU:ssä", WhiteSpaceMode::Normal, None),
vec!["EU:".to_owned(), "ssä".to_owned()]
);
assert_eq!(
segment_texts_with_locale("EU:ssä", WhiteSpaceMode::Normal, Some("fi")),
vec!["EU:ssä".to_owned()]
);
}
#[test]
fn keep_all_keeps_cjk_containing_no_space_runs_cohesive_with_boundaries() {
assert_eq!(
segment_texts_with_word_break(
"中文,测试。",
WhiteSpaceMode::Normal,
WordBreakMode::KeepAll
),
vec!["中文,".to_owned(), "测试。".to_owned()]
);
assert_eq!(
segment_texts_with_word_break(
"한국어테스트",
WhiteSpaceMode::Normal,
WordBreakMode::KeepAll
),
vec!["한국어테스트".to_owned()]
);
assert_eq!(
segment_texts_with_word_break(
&"漢".repeat(256),
WhiteSpaceMode::Normal,
WordBreakMode::KeepAll
),
vec!["漢".repeat(256)]
);
for text in [
"abc日本語",
"123日本語",
"abc123日本語",
"foo_bar日本語",
"foo.bar日本語",
"500円テスト",
"日本語foo.bar",
] {
assert_eq!(
segment_texts_with_word_break(text, WhiteSpaceMode::Normal, WordBreakMode::KeepAll),
vec![text.to_owned()]
);
}
assert_eq!(
segment_texts_with_word_break(
"日本語foo-bar",
WhiteSpaceMode::Normal,
WordBreakMode::KeepAll
),
vec!["日本語foo-".to_owned(), "bar".to_owned()]
);
assert_eq!(
segment_texts_with_word_break(
"日本語foo—bar",
WhiteSpaceMode::Normal,
WordBreakMode::KeepAll
),
vec!["日本語foo—".to_owned(), "bar".to_owned()]
);
assert_eq!(
segment_texts_with_word_break(
"foo-bar日本語",
WhiteSpaceMode::Normal,
WordBreakMode::KeepAll
),
vec!["foo-".to_owned(), "bar日本語".to_owned()]
);
assert_eq!(
segment_texts_with_word_break(
"foo—bar日本語",
WhiteSpaceMode::Normal,
WordBreakMode::KeepAll
),
vec!["foo".to_owned(), "—".to_owned(), "bar日本語".to_owned()]
);
assert_eq!(
segment_texts_with_word_break(
"foo?bar日本語",
WhiteSpaceMode::Normal,
WordBreakMode::KeepAll
),
vec!["foo?".to_owned(), "bar日本語".to_owned()]
);
assert_eq!(
segment_texts_with_word_break(
"foo\u{00A0}世界",
WhiteSpaceMode::Normal,
WordBreakMode::KeepAll
),
vec!["foo\u{00A0}".to_owned(), "世界".to_owned()]
);
}
#[test]
fn is_cjk_covers_hangul_compatibility_jamo_and_newer_extension_blocks() {
assert!(is_cjk("ㅋ"));
assert!(is_cjk("\u{2EBF0}"));
assert!(is_cjk("\u{31350}"));
assert!(is_cjk("\u{323B0}"));
assert!(!is_cjk("hello"));
}
}