use unicode_segmentation::UnicodeSegmentation;
// Code point of ESC (U+001B), the 7-bit introducer for escape sequences.
const ESCAPE_CODE_POINT: u32 = 0x1B;
// Code points of the 8-bit C1 controls this tokenizer recognizes.
const C1_DCS_CODE_POINT: u32 = 0x90;
const C1_SOS_CODE_POINT: u32 = 0x98;
const C1_CSI_CODE_POINT: u32 = 0x9B;
const C1_ST_CODE_POINT: u32 = 0x9C;
const C1_OSC_CODE_POINT: u32 = 0x9D;
const C1_PM_CODE_POINT: u32 = 0x9E;
const C1_APC_CODE_POINT: u32 = 0x9F;

/// Returns `true` when `cp` is ESC or one of the C1 control code points
/// declared above (DCS, SOS, CSI, ST, OSC, PM, APC).
fn is_escape_code_point(cp: u32) -> bool {
    [
        ESCAPE_CODE_POINT,
        C1_DCS_CODE_POINT,
        C1_SOS_CODE_POINT,
        C1_CSI_CODE_POINT,
        C1_ST_CODE_POINT,
        C1_OSC_CODE_POINT,
        C1_PM_CODE_POINT,
        C1_APC_CODE_POINT,
    ]
    .contains(&cp)
}
// Characters that introduce or terminate 7-bit escape sequences.
const ESCAPE: char = '\u{1B}';
const ANSI_BELL: char = '\u{7}';
// Character following ESC that selects the kind of sequence.
const ANSI_CSI: char = '[';
const ANSI_OSC: char = ']';
const ANSI_DCS: char = 'P';
const ANSI_SOS: char = 'X';
const ANSI_PM: char = '^';
const ANSI_APC: char = '_';
// Final character of an SGR sequence.
const ANSI_SGR_TERMINATOR: char = 'm';
// Character that, after ESC, forms the two-character string terminator.
const ANSI_OSC_TERMINATOR: char = '\\';
const ANSI_STRING_TERMINATOR: &str = "\u{1B}\\";
const C1_STRING_TERMINATOR: char = '\u{9C}';
// OSC 8 hyperlink prefixes (sequence start) and close sequences (empty parameters).
const ANSI_HYPERLINK_ESC_PREFIX: &str = "\u{1B}]8;";
const ANSI_HYPERLINK_C1_PREFIX: &str = "\u{9D}8;";
const ANSI_HYPERLINK_ESC_CLOSE: &str = "\u{1B}]8;;";
const ANSI_HYPERLINK_C1_CLOSE: &str = "\u{9D}8;;";
// Code points consulted when deciding whether a grapheme is rendered emoji-style.
const VARIATION_SELECTOR_16: u32 = 0xFE0F;
const COMBINING_ENCLOSING_KEYCAP: u32 = 0x20E3;
const REGIONAL_INDICATOR_A: u32 = 0x1F1E6;
const REGIONAL_INDICATOR_Z: u32 = 0x1F1FF;
// SGR parameter numbers with special handling in the fragment parser.
const SGR_RESET_CODE: u32 = 0;
const SGR_EXTENDED_FOREGROUND_CODE: u32 = 38;
const SGR_DEFAULT_FOREGROUND_CODE: u32 = 39;
const SGR_EXTENDED_BACKGROUND_CODE: u32 = 48;
const SGR_DEFAULT_BACKGROUND_CODE: u32 = 49;
// Second parameter of an extended color: 5 selects a 256-color index, 2 selects RGB.
const SGR_COLOR_TYPE_ANSI_256: u32 = 5;
const SGR_COLOR_TYPE_TRUECOLOR: u32 = 2;
// Number of `;`-separated parameters in each extended color form
// (`38;5;n` is 3 long, `38;2;r;g;b` is 5 long) and the offset of the last one.
const SGR_ANSI_256_FRAGMENT_LENGTH: usize = 3;
const SGR_TRUECOLOR_FRAGMENT_LENGTH: usize = 5;
const SGR_ANSI_256_LAST_PARAMETER_OFFSET: usize = 2;
const SGR_TRUECOLOR_LAST_PARAMETER_OFFSET: usize = 4;
/// One style change extracted from the parameters of a single SGR sequence.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum SgrFragment {
/// Emitted for SGR parameter `0`, and when no parameter could be parsed.
Reset,
/// A style being turned on: `code` is the opening sequence rebuilt from the
/// raw parameter(s), `end_code` is the sequence that turns it back off.
Start { code: String, end_code: String },
/// A parameter that is itself a closing code (e.g. 22, 39, 49).
End { end_code: String },
}
/// Whether a hyperlink sequence starts a link or ends one.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum HyperlinkAction {
/// The sequence carries a non-empty URI.
Open,
/// The sequence's URI part is empty (the terminator immediately follows the
/// second `;`).
Close,
}
/// A unit produced by the tokenizer: either an escape sequence of some kind or
/// a piece of visible text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum Token {
/// A CSI sequence ending in `m` whose parameters consist only of digits,
/// `;` and `:`.
Sgr {
// Exact text of the sequence as it appeared in the input.
code: String,
// Style changes decoded from the sequence's parameters.
fragments: Vec<SgrFragment>,
},
/// A terminated OSC 8 hyperlink sequence.
Hyperlink {
// Exact text of the sequence, terminator included.
code: String,
action: HyperlinkAction,
// Text that begins a closing sequence in the same (ESC or C1) encoding.
close_prefix: String,
// The terminator that ended this sequence (BEL, ESC `\`, or C1 ST).
terminator: String,
},
/// Any other recognized escape/control sequence, including unterminated ones.
Control { code: String },
/// Visible text: a whole grapheme cluster, or a single scalar value when the
/// visible-segmentation fallback is used.
Character {
value: String,
// Terminal columns attributed to this token (0 for continuation scalars).
visible_width: usize,
// True when this token continues the grapheme started by an earlier token.
is_grapheme_continuation: bool,
},
}
/// Outcome of parsing one token starting at some byte index.
struct ParseResult {
// The token that was recognized.
token: Token,
// Byte index in the input at which parsing should resume.
end_index: usize,
}
/// Inclusive `(low, high)` code point ranges for which `get_grapheme_width`
/// reports a width of 2 columns.
///
/// The table is looked up with a binary search (`in_ranges`), so entries must
/// stay sorted in ascending order and must not overlap.
// NOTE(review): the ranges look like the Unicode East Asian Wide/Fullwidth set;
// confirm the source and Unicode version before editing by hand.
const FULLWIDTH_RANGES: &[(u32, u32)] = &[
(0x1100, 0x115F),
(0x231A, 0x231B),
(0x2329, 0x232A),
(0x23E9, 0x23EC),
(0x23F0, 0x23F0),
(0x23F3, 0x23F3),
(0x25FD, 0x25FE),
(0x2614, 0x2615),
(0x2630, 0x2637),
(0x2648, 0x2653),
(0x267F, 0x267F),
(0x268A, 0x268F),
(0x2693, 0x2693),
(0x26A1, 0x26A1),
(0x26AA, 0x26AB),
(0x26BD, 0x26BE),
(0x26C4, 0x26C5),
(0x26CE, 0x26CE),
(0x26D4, 0x26D4),
(0x26EA, 0x26EA),
(0x26F2, 0x26F3),
(0x26F5, 0x26F5),
(0x26FA, 0x26FA),
(0x26FD, 0x26FD),
(0x2705, 0x2705),
(0x270A, 0x270B),
(0x2728, 0x2728),
(0x274C, 0x274C),
(0x274E, 0x274E),
(0x2753, 0x2755),
(0x2757, 0x2757),
(0x2795, 0x2797),
(0x27B0, 0x27B0),
(0x27BF, 0x27BF),
(0x2B1B, 0x2B1C),
(0x2B50, 0x2B50),
(0x2B55, 0x2B55),
(0x2E80, 0x2E99),
(0x2E9B, 0x2EF3),
(0x2F00, 0x2FD5),
(0x2FF0, 0x303E),
(0x3041, 0x3096),
(0x3099, 0x30FF),
(0x3105, 0x312F),
(0x3131, 0x318E),
(0x3190, 0x31E5),
(0x31EF, 0x321E),
(0x3220, 0x3247),
(0x3250, 0xA48C),
(0xA490, 0xA4C6),
(0xA960, 0xA97C),
(0xAC00, 0xD7A3),
(0xF900, 0xFAFF),
(0xFE10, 0xFE19),
(0xFE30, 0xFE52),
(0xFE54, 0xFE66),
(0xFE68, 0xFE6B),
(0xFF01, 0xFF60),
(0xFFE0, 0xFFE6),
(0x16FE0, 0x16FE4),
(0x16FF0, 0x16FF6),
(0x17000, 0x18CD5),
(0x18CFF, 0x18D1E),
(0x18D80, 0x18DF2),
(0x1AFF0, 0x1AFF3),
(0x1AFF5, 0x1AFFB),
(0x1AFFD, 0x1AFFE),
(0x1B000, 0x1B122),
(0x1B132, 0x1B132),
(0x1B150, 0x1B152),
(0x1B155, 0x1B155),
(0x1B164, 0x1B167),
(0x1B170, 0x1B2FB),
(0x1D300, 0x1D356),
(0x1D360, 0x1D376),
(0x1F004, 0x1F004),
(0x1F0CF, 0x1F0CF),
(0x1F18E, 0x1F18E),
(0x1F191, 0x1F19A),
(0x1F200, 0x1F202),
(0x1F210, 0x1F23B),
(0x1F240, 0x1F248),
(0x1F250, 0x1F251),
(0x1F260, 0x1F265),
(0x1F300, 0x1F320),
(0x1F32D, 0x1F335),
(0x1F337, 0x1F37C),
(0x1F37E, 0x1F393),
(0x1F3A0, 0x1F3CA),
(0x1F3CF, 0x1F3D3),
(0x1F3E0, 0x1F3F0),
(0x1F3F4, 0x1F3F4),
(0x1F3F8, 0x1F43E),
(0x1F440, 0x1F440),
(0x1F442, 0x1F4FC),
(0x1F4FF, 0x1F53D),
(0x1F54B, 0x1F54E),
(0x1F550, 0x1F567),
(0x1F57A, 0x1F57A),
(0x1F595, 0x1F596),
(0x1F5A4, 0x1F5A4),
(0x1F5FB, 0x1F64F),
(0x1F680, 0x1F6C5),
(0x1F6CC, 0x1F6CC),
(0x1F6D0, 0x1F6D2),
(0x1F6D5, 0x1F6D8),
(0x1F6DC, 0x1F6DF),
(0x1F6EB, 0x1F6EC),
(0x1F6F4, 0x1F6FC),
(0x1F7E0, 0x1F7EB),
(0x1F7F0, 0x1F7F0),
(0x1F90C, 0x1F93A),
(0x1F93C, 0x1F945),
(0x1F947, 0x1F9FF),
(0x1FA70, 0x1FA7C),
(0x1FA80, 0x1FA8A),
(0x1FA8E, 0x1FAC6),
(0x1FAC8, 0x1FAC8),
(0x1FACD, 0x1FADC),
(0x1FADF, 0x1FAEA),
(0x1FAEF, 0x1FAF8),
(0x20000, 0x2FFFD),
(0x30000, 0x3FFFD),
];
/// Inclusive `(low, high)` code point ranges consulted by
/// `has_emoji_presentation`; a grapheme containing any of these code points is
/// treated as emoji-style.
///
/// The table is looked up with a binary search (`in_ranges`), so entries must
/// stay sorted in ascending order and must not overlap.
// NOTE(review): presumably derived from the Unicode `Emoji_Presentation`
// property; confirm the source and Unicode version before editing by hand.
const EMOJI_PRESENTATION_RANGES: &[(u32, u32)] = &[
(0x231A, 0x231B),
(0x23E9, 0x23EC),
(0x23F0, 0x23F0),
(0x23F3, 0x23F3),
(0x25FD, 0x25FE),
(0x2614, 0x2615),
(0x2648, 0x2653),
(0x267F, 0x267F),
(0x2693, 0x2693),
(0x26A1, 0x26A1),
(0x26AA, 0x26AB),
(0x26BD, 0x26BE),
(0x26C4, 0x26C5),
(0x26CE, 0x26CE),
(0x26D4, 0x26D4),
(0x26EA, 0x26EA),
(0x26F2, 0x26F3),
(0x26F5, 0x26F5),
(0x26FA, 0x26FA),
(0x26FD, 0x26FD),
(0x2705, 0x2705),
(0x270A, 0x270B),
(0x2728, 0x2728),
(0x274C, 0x274C),
(0x274E, 0x274E),
(0x2753, 0x2755),
(0x2757, 0x2757),
(0x2795, 0x2797),
(0x27B0, 0x27B0),
(0x27BF, 0x27BF),
(0x2B1B, 0x2B1C),
(0x2B50, 0x2B50),
(0x2B55, 0x2B55),
(0x1F004, 0x1F004),
(0x1F0CF, 0x1F0CF),
(0x1F18E, 0x1F18E),
(0x1F191, 0x1F19A),
(0x1F1E6, 0x1F1FF),
(0x1F201, 0x1F201),
(0x1F21A, 0x1F21A),
(0x1F22F, 0x1F22F),
(0x1F232, 0x1F236),
(0x1F238, 0x1F23A),
(0x1F250, 0x1F251),
(0x1F300, 0x1F320),
(0x1F32D, 0x1F335),
(0x1F337, 0x1F37C),
(0x1F37E, 0x1F393),
(0x1F3A0, 0x1F3CA),
(0x1F3CF, 0x1F3D3),
(0x1F3E0, 0x1F3F0),
(0x1F3F4, 0x1F3F4),
(0x1F3F8, 0x1F43E),
(0x1F440, 0x1F440),
(0x1F442, 0x1F4FC),
(0x1F4FF, 0x1F53D),
(0x1F54B, 0x1F54E),
(0x1F550, 0x1F567),
(0x1F57A, 0x1F57A),
(0x1F595, 0x1F596),
(0x1F5A4, 0x1F5A4),
(0x1F5FB, 0x1F64F),
(0x1F680, 0x1F6C5),
(0x1F6CC, 0x1F6CC),
(0x1F6D0, 0x1F6D2),
(0x1F6D5, 0x1F6D8),
(0x1F6DC, 0x1F6DF),
(0x1F6EB, 0x1F6EC),
(0x1F6F4, 0x1F6FC),
(0x1F7E0, 0x1F7EB),
(0x1F7F0, 0x1F7F0),
(0x1F90C, 0x1F93A),
(0x1F93C, 0x1F945),
(0x1F947, 0x1F9FF),
(0x1FA70, 0x1FA7C),
(0x1FA80, 0x1FA8A),
(0x1FA8E, 0x1FAC6),
(0x1FAC8, 0x1FAC8),
(0x1FACD, 0x1FADC),
(0x1FADF, 0x1FAEA),
(0x1FAEF, 0x1FAF8),
];
/// Reports whether `cp` lies inside any inclusive `(low, high)` range of `table`.
///
/// The lookup is a binary search, so `table` is expected to be sorted in
/// ascending order with non-overlapping ranges.
fn in_ranges(table: &[(u32, u32)], cp: u32) -> bool {
    use std::cmp::Ordering;

    // Orders each range relative to `cp`: a range entirely above `cp` compares
    // as Greater, one entirely below as Less, and a containing range as Equal.
    let locate = |&(low, high): &(u32, u32)| match () {
        () if cp < low => Ordering::Greater,
        () if cp > high => Ordering::Less,
        () => Ordering::Equal,
    };
    table.binary_search_by(locate).is_ok()
}
/// Returns `true` when `cp` falls inside one of the inclusive ranges listed in
/// `FULLWIDTH_RANGES`.
fn is_fullwidth_code_point(cp: u32) -> bool {
in_ranges(FULLWIDTH_RANGES, cp)
}
/// Returns `true` when at least one scalar value of `grapheme` is listed in
/// `EMOJI_PRESENTATION_RANGES`.
fn has_emoji_presentation(grapheme: &str) -> bool {
    for scalar in grapheme.chars() {
        if in_ranges(EMOJI_PRESENTATION_RANGES, u32::from(scalar)) {
            return true;
        }
    }
    false
}
/// Returns `true` when `cp` is between `REGIONAL_INDICATOR_A` and
/// `REGIONAL_INDICATOR_Z`, both inclusive.
fn is_regional_indicator(cp: u32) -> bool {
    matches!(cp, REGIONAL_INDICATOR_A..=REGIONAL_INDICATOR_Z)
}
fn is_emoji_style_grapheme(grapheme: &str) -> bool {
if has_emoji_presentation(grapheme) {
return true;
}
grapheme
.chars()
.any(|c| matches!(c as u32, VARIATION_SELECTOR_16 | COMBINING_ENCLOSING_KEYCAP))
}
fn get_grapheme_width(grapheme: &str) -> usize {
let mut regional_count = 0u32;
for c in grapheme.chars() {
let cp = c as u32;
if is_fullwidth_code_point(cp) {
return 2;
}
if is_regional_indicator(cp) {
regional_count += 1;
}
}
if regional_count >= 1 {
return 2;
}
if is_emoji_style_grapheme(grapheme) {
return 2;
}
1
}
/// Returns `true` for the characters allowed in canonical SGR parameters:
/// ASCII digits, `;` and `:`.
fn is_sgr_parameter_character(c: char) -> bool {
    matches!(c, '0'..='9' | ';' | ':')
}
/// Returns `true` for CSI parameter bytes, U+0030 (`0`) through U+003F (`?`).
fn is_csi_parameter_character(c: char) -> bool {
    matches!(c, '0'..='?')
}
/// Returns `true` for CSI intermediate bytes, U+0020 (space) through U+002F (`/`).
fn is_csi_intermediate_character(c: char) -> bool {
    matches!(c, ' '..='/')
}
/// Returns `true` for CSI final bytes, U+0040 (`@`) through U+007E (`~`).
fn is_csi_final_character(c: char) -> bool {
    matches!(c, '@'..='~')
}
/// Builds the 7-bit SGR sequence `ESC [ <code> m` for a single numeric code.
fn ansi_styles_ansi(code: u32) -> String {
    let mut sequence = String::from("\u{1B}[");
    sequence.push_str(&code.to_string());
    sequence.push('m');
    sequence
}
/// The SGR reset sequence `ESC [ 0 m`; used as the closing code for SGR
/// parameters that have no dedicated end code.
const ANSI_STYLES_RESET_OPEN: &str = "\u{1B}[0m";
/// Returns `true` when `code` is one of the SGR parameters that closes a style
/// rather than opening one.
fn is_end_code_number(code: u32) -> bool {
    const END_CODES: [u32; 10] = [0, 22, 23, 24, 27, 28, 29, 39, 49, 55];
    END_CODES.contains(&code)
}
/// Maps an opening SGR parameter to the parameter that closes it, or `None`
/// when the parameter has no known closing code.
fn ansi_styles_codes_get(code: u32) -> Option<u32> {
    match code {
        0 => Some(0),
        // Bold and dim share one closing code.
        1 | 2 => Some(22),
        // Italic, underline, inverse, hidden and strikethrough each close with
        // their own number plus 20.
        3 | 4 | 7 | 8 | 9 => Some(code + 20),
        // Overline.
        53 => Some(55),
        // Standard and bright foreground colors.
        30..=37 | 90..=97 => Some(39),
        // Standard and bright background colors.
        40..=47 | 100..=107 => Some(49),
        _ => None,
    }
}
/// Picks the CSI introducer to use when rebuilding SGR codes: the C1 form when
/// `code` begins with U+009B, the 7-bit `ESC [` form otherwise.
fn get_sgr_prefix(code: &str) -> &'static str {
    match code.chars().next() {
        Some('\u{9B}') => "\u{9B}",
        _ => "\u{1B}[",
    }
}
/// Assembles an SGR sequence: `prefix`, the `values` joined with `;`, then `m`.
fn create_sgr_code(prefix: &str, values: &[&str]) -> String {
    let parameters = values.join(";");
    let mut sequence = String::with_capacity(prefix.len() + parameters.len() + 1);
    sequence.push_str(prefix);
    sequence.push_str(&parameters);
    sequence.push('m');
    sequence
}
fn get_sgr_fragments(code: &str) -> Vec<SgrFragment> {
let mut fragments = Vec::new();
let sgr_prefix = get_sgr_prefix(code);
let parameter_string: &str = if let Some(rest) = code.strip_prefix("\u{1B}[") {
strip_last_char(rest)
} else if let Some(rest) = code.strip_prefix('\u{9B}') {
strip_last_char(rest)
} else {
return fragments;
};
let raw_codes: Vec<&str> = if parameter_string.is_empty() {
vec!["0"]
} else {
parameter_string.split(';').collect()
};
let mut index = 0;
while index < raw_codes.len() {
let Some(code_number) = parse_int(raw_codes[index]) else {
index += 1;
continue;
};
if code_number == SGR_RESET_CODE {
fragments.push(SgrFragment::Reset);
index += 1;
continue;
}
if code_number == SGR_EXTENDED_FOREGROUND_CODE
|| code_number == SGR_EXTENDED_BACKGROUND_CODE
{
index = push_extended_color_fragment(
&mut fragments,
sgr_prefix,
&raw_codes,
index,
code_number,
);
continue;
}
if is_end_code_number(code_number) {
fragments.push(SgrFragment::End {
end_code: ansi_styles_ansi(code_number),
});
index += 1;
continue;
}
if let Some(mapped) = ansi_styles_codes_get(code_number) {
fragments.push(SgrFragment::Start {
code: create_sgr_code(sgr_prefix, &[raw_codes[index]]),
end_code: ansi_styles_ansi(mapped),
});
index += 1;
continue;
}
fragments.push(SgrFragment::Start {
code: create_sgr_code(sgr_prefix, &[raw_codes[index]]),
end_code: ANSI_STYLES_RESET_OPEN.to_owned(),
});
index += 1;
}
if fragments.is_empty() {
fragments.push(SgrFragment::Reset);
}
fragments
}
/// Appends a `Start` fragment for an extended color parameter (38 or 48) found
/// at `raw_codes[index]` and returns the index of the next unprocessed
/// parameter.
///
/// A complete `…;5;n` form consumes three parameters and a complete
/// `…;2;r;g;b` form consumes five; anything else consumes only the parameter
/// at `index`. The fragment's end code is the default-foreground sequence for
/// 38 and the default-background sequence otherwise.
fn push_extended_color_fragment(
    fragments: &mut Vec<SgrFragment>,
    sgr_prefix: &str,
    raw_codes: &[&str],
    index: usize,
    code_number: u32,
) -> usize {
    let closing_number = match code_number {
        SGR_EXTENDED_FOREGROUND_CODE => SGR_DEFAULT_FOREGROUND_CODE,
        _ => SGR_DEFAULT_BACKGROUND_CODE,
    };
    let available = raw_codes.len();
    // How many parameters belong to this color, judged by the color-type
    // parameter and by whether enough parameters follow it.
    let consumed = match raw_codes.get(index + 1).and_then(|raw| parse_int(raw)) {
        Some(SGR_COLOR_TYPE_ANSI_256) if index + SGR_ANSI_256_LAST_PARAMETER_OFFSET < available => {
            SGR_ANSI_256_FRAGMENT_LENGTH
        }
        Some(SGR_COLOR_TYPE_TRUECOLOR)
            if index + SGR_TRUECOLOR_LAST_PARAMETER_OFFSET < available =>
        {
            SGR_TRUECOLOR_FRAGMENT_LENGTH
        }
        _ => 1,
    };
    let next_index = index + consumed;
    fragments.push(SgrFragment::Start {
        code: create_sgr_code(sgr_prefix, &raw_codes[index..next_index]),
        end_code: ansi_styles_ansi(closing_number),
    });
    next_index
}
/// Parses the leading integer of `s`, ignoring whatever follows the digits.
///
/// An optional single `+` or `-` may precede the digits. Returns `None` when
/// no digit follows the optional sign. Any negative input yields
/// `Some(u32::MAX)`, and values above `u32::MAX` are clamped to `u32::MAX`.
fn parse_int(s: &str) -> Option<u32> {
    let (is_negative, unsigned) = match s.as_bytes().split_first() {
        Some((b'-', rest)) => (true, rest),
        Some((b'+', rest)) => (false, rest),
        _ => (false, s.as_bytes()),
    };

    let mut digit_count = 0usize;
    let mut magnitude = 0u64;
    for &byte in unsigned.iter().take_while(|byte| byte.is_ascii_digit()) {
        magnitude = magnitude
            .saturating_mul(10)
            .saturating_add(u64::from(byte - b'0'));
        digit_count += 1;
    }

    if digit_count == 0 {
        None
    } else if is_negative {
        Some(u32::MAX)
    } else {
        Some(u32::try_from(magnitude).unwrap_or(u32::MAX))
    }
}
/// Returns `s` without its last character; an empty string is returned as is.
fn strip_last_char(s: &str) -> &str {
    let mut remaining = s.chars();
    // Discard the final character (if any); what is left is the prefix.
    remaining.next_back();
    remaining.as_str()
}
/// Wraps `code` in a `Token::Control` and pairs it with the index at which
/// parsing should resume.
fn control_result(code: &str, end_index: usize) -> ParseResult {
    let token = Token::Control {
        code: String::from(code),
    };
    ParseResult { token, end_index }
}
/// Returns the code point of the character that starts at byte `index` of `s`.
///
/// Yields `None` when `index` is at or past the end of `s`, or when it does
/// not fall on a character boundary. The lookup uses checked slicing so a bad
/// index is reported through the `Option` instead of panicking, matching
/// `char_at`.
fn code_point_at(s: &str, index: usize) -> Option<u32> {
    s.get(index..)?.chars().next().map(u32::from)
}
/// Returns the character that starts at byte `index` of `s`, or `None` when
/// `index` is at or past the end or is not on a character boundary.
fn char_at(s: &str, index: usize) -> Option<char> {
    let tail = s.get(index..)?;
    tail.chars().next()
}
/// Parses a CSI sequence (`ESC [` … or U+009B …) that starts at byte `index`.
///
/// Returns `None` when no CSI introducer is present at `index`. Otherwise the
/// result is:
/// - `Token::Sgr` when the final character is `m` and every parameter
///   character is a digit, `;` or `:`;
/// - `Token::Control` for any other complete sequence, for a sequence cut
///   short by a character that is not valid inside CSI (that character is not
///   consumed), and for a sequence that runs to the end of the input.
fn parse_csi_code(string: &str, index: usize) -> Option<ParseResult> {
    let parameters_start = match code_point_at(string, index)? {
        ESCAPE_CODE_POINT if char_at(string, index + 1) == Some(ANSI_CSI) => index + 2,
        C1_CSI_CODE_POINT => index + ESCAPE_CSI_C1_LEN,
        _ => return None,
    };

    // Stays true only while every parameter character is SGR-compatible and no
    // intermediate character has been seen.
    let mut canonical_sgr = true;
    let mut cursor = parameters_start;
    while let Some(c) = char_at(string, cursor) {
        let after = cursor + c.len_utf8();
        if is_csi_final_character(c) {
            let code = &string[index..after];
            if c == ANSI_SGR_TERMINATOR && canonical_sgr {
                let token = Token::Sgr {
                    code: code.to_owned(),
                    fragments: get_sgr_fragments(code),
                };
                return Some(ParseResult {
                    token,
                    end_index: after,
                });
            }
            return Some(control_result(code, after));
        }
        if is_csi_parameter_character(c) {
            canonical_sgr = canonical_sgr && is_sgr_parameter_character(c);
        } else if is_csi_intermediate_character(c) {
            canonical_sgr = false;
        } else {
            // Not a CSI character: stop before it and report what was read.
            return Some(control_result(&string[index..cursor], cursor));
        }
        cursor = after;
    }
    Some(control_result(&string[index..], string.len()))
}
/// Number of bytes skipped past the C1 CSI introducer (U+009B, two bytes in
/// UTF-8) to reach the sequence's parameters.
const ESCAPE_CSI_C1_LEN: usize = 2;
/// Number of bytes skipped past the C1 OSC introducer (U+009D, two bytes in
/// UTF-8) to reach the sequence's payload.
const C1_OSC_LEN: usize = 2;
/// Parses an OSC 8 hyperlink sequence that starts at byte `index`.
///
/// Returns `None` when the text at `index` does not begin with the ESC or C1
/// hyperlink prefix. A sequence terminated by BEL, `ESC \` or C1 ST becomes a
/// `Token::Hyperlink`; it is a `Close` when nothing lies between the `;` that
/// ends the parameters and the terminator, and an `Open` otherwise. A sequence
/// with no such `;`, or with no terminator, is returned as a `Token::Control`
/// covering the rest of the input.
fn parse_hyperlink_code(string: &str, index: usize) -> Option<ParseResult> {
    let introducer = code_point_at(string, index)?;
    let tail = &string[index..];
    let (prefix_len, hyperlink_close) = match introducer {
        ESCAPE_CODE_POINT if tail.starts_with(ANSI_HYPERLINK_ESC_PREFIX) => {
            (ANSI_HYPERLINK_ESC_PREFIX.len(), ANSI_HYPERLINK_ESC_CLOSE)
        }
        C1_OSC_CODE_POINT if tail.starts_with(ANSI_HYPERLINK_C1_PREFIX) => {
            (ANSI_HYPERLINK_C1_PREFIX.len(), ANSI_HYPERLINK_C1_CLOSE)
        }
        _ => return None,
    };

    // The `;` that separates the link parameters from the URI.
    let parameters_start = index + prefix_len;
    let Some(separator_offset) = string[parameters_start..].find(';') else {
        return Some(control_result(tail, string.len()));
    };
    let uri_begin = parameters_start + separator_offset + 1;

    let mut cursor = uri_begin;
    while let Some(c) = char_at(string, cursor) {
        // Terminator text and its byte length, if one starts at `cursor`.
        let terminator = if c == ANSI_BELL {
            Some((ANSI_BELL.to_string(), c.len_utf8()))
        } else if c == ESCAPE && char_at(string, cursor + 1) == Some(ANSI_OSC_TERMINATOR) {
            Some((ANSI_STRING_TERMINATOR.to_owned(), 2))
        } else if c == C1_STRING_TERMINATOR {
            Some((C1_STRING_TERMINATOR.to_string(), c.len_utf8()))
        } else {
            None
        };
        if let Some((terminator, terminator_len)) = terminator {
            let end = cursor + terminator_len;
            return Some(make_hyperlink(
                &string[index..end],
                cursor == uri_begin,
                hyperlink_close,
                terminator,
                end,
            ));
        }
        cursor += c.len_utf8();
    }
    Some(control_result(tail, string.len()))
}
/// Builds the parse result for a terminated hyperlink sequence.
///
/// `code` is the full sequence text, `is_close` selects between
/// `HyperlinkAction::Close` and `HyperlinkAction::Open`, and `close_prefix` and
/// `terminator` are stored on the token unchanged.
fn make_hyperlink(
    code: &str,
    is_close: bool,
    close_prefix: &str,
    terminator: String,
    end_index: usize,
) -> ParseResult {
    let action = if is_close {
        HyperlinkAction::Close
    } else {
        HyperlinkAction::Open
    };
    let token = Token::Hyperlink {
        code: String::from(code),
        action,
        close_prefix: String::from(close_prefix),
        terminator,
    };
    ParseResult { token, end_index }
}
/// Parses a control string (OSC, DCS, SOS, PM or APC) or a lone string
/// terminator that starts at byte `index`, in either the `ESC x` or the C1 form.
///
/// Returns `None` when the text at `index` is none of those. A lone `ESC \` or
/// C1 ST yields a `Token::Control` holding just the terminator. Otherwise the
/// payload is scanned up to the first `ESC \` or C1 ST (or BEL, for OSC only)
/// and the whole sequence is returned as a `Token::Control`; an unterminated
/// sequence extends to the end of the input.
fn parse_control_string_code(string: &str, index: usize) -> Option<ParseResult> {
    let (payload_start, bell_terminates) = match code_point_at(string, index)? {
        ESCAPE_CODE_POINT => match char_at(string, index + 1)? {
            ANSI_OSC => (index + 2, true),
            ANSI_DCS | ANSI_SOS | ANSI_PM | ANSI_APC => (index + 2, false),
            ANSI_OSC_TERMINATOR => {
                return Some(control_result(ANSI_STRING_TERMINATOR, index + 2));
            }
            _ => return None,
        },
        C1_OSC_CODE_POINT => (index + C1_OSC_LEN, true),
        C1_DCS_CODE_POINT | C1_SOS_CODE_POINT | C1_PM_CODE_POINT | C1_APC_CODE_POINT => {
            (index + 2, false)
        }
        C1_ST_CODE_POINT => {
            return Some(control_result(&C1_STRING_TERMINATOR.to_string(), index + 2));
        }
        _ => return None,
    };

    let mut cursor = payload_start;
    while let Some(c) = char_at(string, cursor) {
        // Byte length of the terminator starting at `cursor`, if there is one.
        let terminator_len = if (bell_terminates && c == ANSI_BELL) || c == C1_STRING_TERMINATOR {
            Some(c.len_utf8())
        } else if c == ESCAPE && char_at(string, cursor + 1) == Some(ANSI_OSC_TERMINATOR) {
            Some(2)
        } else {
            None
        };
        if let Some(len) = terminator_len {
            let end = cursor + len;
            return Some(control_result(&string[index..end], end));
        }
        cursor += c.len_utf8();
    }
    Some(control_result(&string[index..], string.len()))
}
fn parse_ansi_code(string: &str, index: usize) -> Option<ParseResult> {
let cp = code_point_at(string, index)?;
let is_hyperlink_opener = cp == ESCAPE_CODE_POINT || cp == C1_OSC_CODE_POINT;
if let Some(hyperlink) = is_hyperlink_opener
.then(|| parse_hyperlink_code(string, index))
.flatten()
{
return Some(hyperlink);
}
if let Some(control_string) = parse_control_string_code(string, index) {
return Some(control_string);
}
parse_csi_code(string, index)
}
/// Parses an escape sequence at byte `index`, provided the character there is
/// ESC or one of the recognized C1 controls; returns `None` otherwise.
fn parse_escape_at(string: &str, index: usize) -> Option<ParseResult> {
    code_point_at(string, index)
        .filter(|&cp| is_escape_code_point(cp))
        .and_then(|_| parse_ansi_code(string, index))
}
fn append_trailing_ansi_tokens(string: &str, mut index: usize, tokens: &mut Vec<Token>) -> usize {
while index < string.len() {
let Some(escape_code) = parse_escape_at(string, index) else {
break;
};
tokens.push(escape_code.token);
index = escape_code.end_index;
}
index
}
/// Builds a `Token::Character` for the grapheme cluster that starts exactly at
/// byte `index`.
///
/// `grapheme_starts` holds `(start_byte, text)` for every grapheme cluster of
/// the input and must be sorted by `start_byte` (the order in which
/// `grapheme_indices` yields them). Returns `None` when no cluster starts at
/// `index`, i.e. when `index` falls inside a cluster or outside the input.
///
/// The lookup is a binary search on the start offset: this function is called
/// once per visible cluster, so a linear scan here would make tokenizing
/// quadratic in the number of clusters.
fn parse_character_token_raw(
    index: usize,
    grapheme_starts: &[(usize, &str)],
) -> Option<ParseResult> {
    let position = grapheme_starts
        .binary_search_by_key(&index, |&(start, _)| start)
        .ok()?;
    let (_, value) = grapheme_starts[position];
    // An empty segment contains no byte, so it cannot contain `index`.
    if value.is_empty() {
        return None;
    }
    Some(ParseResult {
        token: Token::Character {
            value: value.to_owned(),
            visible_width: get_grapheme_width(value),
            is_grapheme_continuation: false,
        },
        end_index: index + value.len(),
    })
}
/// Reports whether `right_value` would merge into the same grapheme cluster as
/// the end of `left_value` if the two were adjacent.
///
/// The two strings are concatenated and re-segmented; the answer is `false`
/// exactly when a grapheme cluster begins at the join position.
fn are_values_in_same_grapheme(left_value: &str, right_value: &str) -> bool {
    let join_position = left_value.len();
    let joined = [left_value, right_value].concat();
    let boundary_at_join = joined
        .grapheme_indices(true)
        .map(|(start, _)| start)
        // Starts are ascending, so nothing past the join can match.
        .take_while(|&start| start <= join_position)
        .any(|start| start == join_position);
    !boundary_at_join
}
/// One visible (non-escape) scalar value of the input together with the
/// grapheme metadata assigned to it.
struct VisibleCharacter {
// The scalar value itself.
value: char,
// Columns attributed to this scalar; continuation scalars get 0.
visible_width: usize,
// True when this scalar is not the first scalar of its grapheme cluster.
is_grapheme_continuation: bool,
}
/// Collects every scalar value of `string` that is not part of an escape
/// sequence, in order. Each entry starts with width 1 and is not marked as a
/// continuation.
fn collect_visible_characters(string: &str) -> Vec<VisibleCharacter> {
    let mut collected = Vec::new();
    let mut cursor = 0;
    while cursor < string.len() {
        cursor = match parse_escape_at(string, cursor) {
            // Escape sequences are skipped entirely.
            Some(escape) => escape.end_index,
            None => {
                let c = char_at(string, cursor).expect("index is a char boundary");
                collected.push(VisibleCharacter {
                    value: c,
                    visible_width: 1,
                    is_grapheme_continuation: false,
                });
                cursor + c.len_utf8()
            }
        };
    }
    collected
}
/// Fills in width and continuation flags for `visible` by segmenting the
/// concatenation of its scalar values into grapheme clusters.
///
/// The first scalar of each cluster receives the cluster's width and is not a
/// continuation; every further scalar of the cluster receives width 0 and is
/// marked as a continuation.
fn apply_grapheme_metadata(visible: &mut [VisibleCharacter]) {
    if visible.is_empty() {
        return;
    }
    let text: String = visible.iter().map(|entry| entry.value).collect();
    // Clusters partition `text` in order, so walking the entries in step with
    // each cluster's scalar count pairs every entry with its cluster.
    let mut remaining = visible.iter_mut();
    for cluster in text.graphemes(true) {
        let cluster_width = get_grapheme_width(cluster);
        let members = remaining.by_ref().take(cluster.chars().count());
        for (position, entry) in members.enumerate() {
            let leads_cluster = position == 0;
            entry.visible_width = if leads_cluster { cluster_width } else { 0 };
            entry.is_grapheme_continuation = !leads_cluster;
        }
    }
}
/// Tokenizes `string` one scalar value at a time, with grapheme clusters
/// computed over the visible text only (escape sequences removed).
///
/// Each visible scalar becomes its own `Token::Character` carrying the width
/// and continuation flag from that segmentation. When `end_character` is
/// given, tokenizing stops once the accumulated width reaches it and the next
/// visible scalar does not continue the current cluster; escape sequences
/// directly after the stop point are still appended.
fn tokenize_ansi_with_visible_segmentation(
    string: &str,
    end_character: Option<usize>,
) -> Vec<Token> {
    let mut visible = collect_visible_characters(string);
    apply_grapheme_metadata(&mut visible);

    let mut tokens = Vec::new();
    let mut cursor = 0usize;
    let mut visible_seen = 0usize;
    let mut width_so_far = 0usize;
    while cursor < string.len() {
        if let Some(escape) = parse_escape_at(string, cursor) {
            tokens.push(escape.token);
            cursor = escape.end_index;
            continue;
        }
        let c = char_at(string, cursor).expect("index is a char boundary");
        let (visible_width, is_grapheme_continuation) = match visible.get(visible_seen) {
            Some(entry) => (entry.visible_width, entry.is_grapheme_continuation),
            // No metadata for this scalar: fall back to a per-scalar estimate.
            None if is_fullwidth_code_point(c as u32) => (2, false),
            None => (utf16_len(c), false),
        };
        tokens.push(Token::Character {
            value: c.to_string(),
            visible_width,
            is_grapheme_continuation,
        });
        cursor += c.len_utf8();
        visible_seen += 1;
        width_so_far += visible_width;

        let limit_reached = matches!(end_character, Some(end) if width_so_far >= end);
        if limit_reached {
            // Never stop in the middle of a grapheme cluster.
            let cluster_continues = visible
                .get(visible_seen)
                .is_some_and(|entry| entry.is_grapheme_continuation);
            if !cluster_continues {
                let _ = append_trailing_ansi_tokens(string, cursor, &mut tokens);
                break;
            }
        }
    }
    tokens
}
/// Returns how many UTF-16 code units are needed to encode `c` (1 or 2).
fn utf16_len(c: char) -> usize {
    char::len_utf16(c)
}
fn has_ansi_split_continuation_ahead(
string: &str,
start_index: usize,
previous_visible_value: Option<&str>,
grapheme_starts: &[(usize, &str)],
) -> bool {
let Some(previous) = previous_visible_value else {
return false;
};
let mut index = start_index;
let mut has_ansi_code = false;
while index < string.len() {
if let Some(code) = parse_escape_at(string, index) {
has_ansi_code = true;
index = code.end_index;
continue;
}
if !has_ansi_code {
return false;
}
let Some(character_token) = parse_character_token_raw(index, grapheme_starts) else {
return true;
};
let Token::Character { value, .. } = &character_token.token else {
return true;
};
return are_values_in_same_grapheme(previous, value);
}
false
}
pub(crate) fn tokenize_ansi(string: &str, end_character: Option<usize>) -> Vec<Token> {
let mut tokens = Vec::new();
let grapheme_starts: Vec<(usize, &str)> = string.grapheme_indices(true).collect();
let mut index = 0usize;
let mut visible_count = 0usize;
let mut previous_visible_value: Option<String> = None;
let mut has_ansi_since_last_visible = false;
while index < string.len() {
if let Some(code) = parse_escape_at(string, index) {
tokens.push(code.token);
index = code.end_index;
has_ansi_since_last_visible = true;
continue;
}
let Some(character_token) = parse_character_token_raw(index, &grapheme_starts) else {
return tokenize_ansi_with_visible_segmentation(string, end_character);
};
let Token::Character {
value,
visible_width,
..
} = &character_token.token
else {
unreachable!("parse_character_token_raw returns a character token");
};
let value = value.clone();
let width = *visible_width;
let ansi_split_grapheme = has_ansi_since_last_visible
&& previous_visible_value
.as_deref()
.is_some_and(|prev| are_values_in_same_grapheme(prev, &value));
if ansi_split_grapheme {
return tokenize_ansi_with_visible_segmentation(string, end_character);
}
tokens.push(character_token.token);
index = character_token.end_index;
visible_count += width;
has_ansi_since_last_visible = false;
previous_visible_value = Some(value);
if end_character.is_some_and(|end| visible_count >= end) {
if has_ansi_split_continuation_ahead(
string,
index,
previous_visible_value.as_deref(),
&grapheme_starts,
) {
return tokenize_ansi_with_visible_segmentation(string, end_character);
}
let _ = append_trailing_ansi_tokens(string, index, &mut tokens);
break;
}
}
tokens
}