use compact_str::{CompactString, format_compact};
use unicode_segmentation::UnicodeSegmentation;
use unicode_width::UnicodeWidthChar;
use crate::text::ansi_tokenize::ansi_codes::get_end_code;
use crate::text::ansi_tokenize::consts::{CSI, ESC, LINK_CODE_PREFIX, OSC, SGR_FINAL};
use crate::text::ansi_tokenize::types::{AnsiToken, CharToken, ControlToken, Token};
/// U+009B, a C1 control character that `tokenize` accepts as an alternative
/// to ESC at the start of an escape sequence. It encodes to two bytes in
/// UTF-8, which is why callers carry an explicit opener length around.
const C1_OPENER: char = '\u{9B}';
/// Reports whether a grapheme cluster should be counted as two columns wide.
///
/// `grapheme` is the full cluster and `base_code_point` is the scalar value of
/// its first character. The cluster is considered full-width when any of the
/// following holds:
///
/// * the base code point is a valid `char` whose `unicode-width` width is 2;
/// * the cluster contains U+FE0F (variation selector 16);
/// * the base code point lies in `U+1F1E6..=U+1F1FF` (regional indicators,
///   the building blocks of flag sequences).
pub(crate) fn is_fullwidth_grapheme(grapheme: &str, base_code_point: u32) -> bool {
    const VARIATION_SELECTOR_16: char = '\u{FE0F}';

    // An invalid scalar value has no width and therefore is never "wide".
    let wide_base = matches!(
        char::from_u32(base_code_point).and_then(|c| UnicodeWidthChar::width(c)),
        Some(2)
    );

    wide_base
        || grapheme.contains(VARIATION_SELECTOR_16)
        || matches!(base_code_point, 0x1F1E6..=0x1F1FF)
}
/// Locates the final byte of an SGR sequence.
///
/// Scanning starts at index `scan_start` of `s`. Digits and `;` are accepted
/// as parameter bytes; the first `SGR_FINAL` byte ends the sequence and its
/// index (relative to `s`) is returned. Any other byte, or running out of
/// input before a final byte is seen, yields `None`.
fn find_sgr_end(s: &[u8], scan_start: usize) -> Option<usize> {
    let mut cursor = scan_start;
    // `get` also covers a `scan_start` at or beyond the end of the slice.
    while let Some(&current) = s.get(cursor) {
        match current {
            SGR_FINAL => return Some(cursor),
            b'0'..=b'9' | b';' => cursor += 1,
            _ => return None,
        }
    }
    None
}
/// Extracts a complete SGR sequence that begins at byte `offset` of `input`.
///
/// `opener_len` is the UTF-8 length of the opener character; the byte that
/// follows it (the CSI introducer) is skipped as well before parameter
/// scanning begins. Returns the slice from the opener through the final byte
/// inclusive, or `None` when `find_sgr_end` rejects the candidate.
fn parse_sgr_sequence(input: &str, offset: usize, opener_len: usize) -> Option<&str> {
    // Scan on raw bytes first; slicing the `str` happens only on success.
    let last = find_sgr_end(&input.as_bytes()[offset..], opener_len + 1)?;
    let from_opener = &input[offset..];
    Some(&from_opener[..=last])
}
/// Splits a compound SGR sequence into one sequence per attribute.
///
/// A code without `;` is returned unchanged as the only element. Otherwise
/// the parameters between the two-character introducer and the final byte are
/// split on `;` and each is re-emitted as `ESC [ <param> m`. Extended colour
/// selectors are kept together:
///
/// * `38`/`48` followed by `5` and one more parameter (3 parameters total);
/// * `38`/`48` followed by `2` and three more parameters (5 parameters total).
///
/// A `38`/`48` that is not followed by enough parameters is emitted on its
/// own, like any other parameter. Every emitted part uses the `ESC [` opener
/// regardless of the opener found in `code`.
fn split_compound_sgr(code: &str) -> Vec<CompactString> {
    if !code.contains(';') {
        return vec![code.into()];
    }

    // Byte offset of the third character, i.e. the first parameter character
    // after the opener and the introducer.
    let params_start = code
        .char_indices()
        .nth(2)
        .map_or(code.len(), |(byte_off, _)| byte_off);
    let params: Vec<&str> = code[params_start..code.len() - 1].split(';').collect();

    let mut pieces = Vec::new();
    let mut pending = params.as_slice();
    while let Some((&head, tail)) = pending.split_first() {
        // How many parameters belong to the attribute starting at `head`.
        let group_len = match (head, tail.first().copied()) {
            ("38" | "48", Some("5")) if pending.len() >= 3 => 3,
            ("38" | "48", Some("2")) if pending.len() >= 5 => 5,
            _ => 1,
        };
        let (group, remainder) = pending.split_at(group_len);
        pieces.push(format_compact!("\x1B[{}m", group.join(";")));
        pending = remainder;
    }
    pieces
}
/// Finds the end of an OSC sequence, searching `input` from byte `start`.
///
/// Three terminators are recognised: BEL (U+0007), the single character
/// U+009C, and the two-character pair ESC `\`. The return value is the
/// absolute byte index of the terminator's *last* byte, so `index + 1` is
/// always the first byte after the sequence. `None` means no terminator was
/// found. An ESC that is not followed by `\` is skipped.
fn find_osc_terminator(input: &str, start: usize) -> Option<usize> {
    let tail = &input[start..];
    tail.char_indices().find_map(|(rel, ch)| {
        let last_byte = match ch {
            '\u{07}' => rel,
            // Multi-byte in UTF-8: point at its final byte.
            '\u{9C}' => rel + ch.len_utf8() - 1,
            // ESC is one byte, so `rel + 1` is the next character's start.
            '\u{1B}' if tail[rel + 1..].starts_with('\\') => rel + 1,
            _ => return None,
        };
        Some(start + last_byte)
    })
}
/// Extracts a hyperlink (OSC 8 style) sequence that begins at byte `offset`.
///
/// `opener_len` is the UTF-8 length of the opener character found at
/// `offset`. The text after the opener must start with `LINK_CODE_PREFIX`
/// minus its first byte (presumably that prefix's own opener, dropped so the
/// same check works for either opener). After the prefix the sequence must
/// contain a `;` that closes the parameter field, followed by an OSC
/// terminator.
///
/// Returns the slice from the opener through the last terminator byte, or
/// `None` when the prefix does not match, the sequence is unterminated, or no
/// `;` occurs before the terminator.
fn parse_link_code(input: &str, offset: usize, opener_len: usize) -> Option<&str> {
    let s = &input[offset..];
    let prefix_rest = &LINK_CODE_PREFIX[1..];
    if !s[opener_len..].starts_with(prefix_rest) {
        return None;
    }
    let after_prefix = opener_len + prefix_rest.len();

    // Find the terminator first so the separator search cannot run past the
    // end of this sequence. Previously a terminator that came before any `;`
    // let the search continue into the following text, and everything up to
    // some later terminator was swallowed into the link code.
    let term_last = find_osc_terminator(s, after_prefix)?;

    // `term_last + 1` is always a char boundary (the terminator helper points
    // at the terminator's last byte) and no terminator byte is `;`, so an
    // inclusive upper bound is both safe and exact.
    s[after_prefix..=term_last].find(';')?;

    Some(&s[..=term_last])
}
/// Splits `input` into escape-sequence tokens and visible grapheme clusters.
///
/// The input is consumed left to right:
///
/// * An opener (ESC or `C1_OPENER`) followed by the OSC introducer is first
///   tried as a link code (`Token::Ansi`); failing that, any terminated OSC
///   sequence becomes a `Token::Control`.
/// * An opener followed by the CSI introducer is tried as an SGR sequence.
///   A sequence containing `;` is split into one `Token::Ansi` per attribute.
/// * Anything else, including an opener that did not start a recognised
///   sequence, is emitted as a `Token::Char` holding one extended grapheme
///   cluster borrowed from `input`.
///
/// `end_char` limits the visible width: each cluster adds 1, or 2 when it is
/// full-width, and scanning stops once the running total reaches the limit.
/// The check happens after a cluster is pushed, so the cluster that reaches
/// the limit is included (and with `Some(0)` the first cluster is still
/// emitted). Escape sequences do not count towards the limit. `None` means
/// no limit.
pub fn tokenize(input: &str, end_char: Option<usize>) -> Vec<Token<'_>> {
    let width_limit = end_char.unwrap_or(usize::MAX);
    let osc = OSC as char;
    let csi = CSI as char;

    let mut out = Vec::new();
    let mut width_seen = 0usize;
    let mut pos = 0usize;

    while pos < input.len() {
        let tail = &input[pos..];
        let mut lookahead = tail.chars();
        let first = lookahead.next().expect("non-empty slice has a char");

        if first == ESC as char || first == C1_OPENER {
            let opener_len = first.len_utf8();
            match lookahead.next() {
                Some(c) if c == osc => {
                    // Link codes are a special case of OSC; try them first.
                    if let Some(link) = parse_link_code(input, pos, opener_len) {
                        let consumed = link.len();
                        let end_code = get_end_code(link);
                        out.push(Token::Ansi(AnsiToken {
                            code: link.into(),
                            end_code,
                        }));
                        pos += consumed;
                        continue;
                    }
                    // Any other terminated OSC sequence is kept verbatim.
                    let body_start = pos + opener_len + osc.len_utf8();
                    if let Some(last) = find_osc_terminator(input, body_start) {
                        out.push(Token::Control(ControlToken {
                            code: input[pos..=last].to_owned(),
                        }));
                        pos = last + 1;
                        continue;
                    }
                }
                Some(c) if c == csi => {
                    if let Some(seq) = parse_sgr_sequence(input, pos, opener_len) {
                        let consumed = seq.len();
                        if seq.contains(';') {
                            out.extend(split_compound_sgr(seq).into_iter().map(|part| {
                                let end_code = get_end_code(&part);
                                Token::Ansi(AnsiToken {
                                    code: part,
                                    end_code,
                                })
                            }));
                        } else {
                            let end_code = get_end_code(seq);
                            out.push(Token::Ansi(AnsiToken {
                                code: seq.into(),
                                end_code,
                            }));
                        }
                        pos += consumed;
                        continue;
                    }
                }
                _ => {}
            }
            // Not a recognised sequence: fall through and emit the opener as
            // an ordinary cluster.
        }

        let cluster = tail
            .graphemes(true)
            .next()
            .expect("non-empty slice has at least one grapheme");
        let base_cp = cluster.chars().next().map_or(0, u32::from);
        let full_width = is_fullwidth_grapheme(cluster, base_cp);
        out.push(Token::Char(CharToken {
            value: cluster,
            full_width,
        }));

        width_seen += if full_width { 2 } else { 1 };
        if width_seen >= width_limit {
            break;
        }
        pos += cluster.len();
    }

    out
}
// Unit tests for `tokenize` and, through it, the private parsing helpers.
#[cfg(test)]
mod tests {
use super::*;
use crate::text::ansi_tokenize::types::Token;
// Collects the text of every `Token::Char`, in order.
fn chars<'a>(tokens: &[Token<'a>]) -> Vec<&'a str> {
tokens
.iter()
.filter_map(|t| match t {
Token::Char(c) => Some(c.value),
_ => None,
})
.collect()
}
// Collects `(code, end_code)` for every `Token::Ansi`, in order.
fn ansi_tokens<'a>(tokens: &'a [Token<'a>]) -> Vec<(&'a str, &'a str)> {
tokens
.iter()
.filter_map(|t| match t {
Token::Ansi(a) => Some((a.code.as_str(), a.end_code.as_str())),
_ => None,
})
.collect()
}
// Plain ASCII: one narrow Char token per character.
#[test]
fn plain_text_chars() {
let tokens = tokenize("abc", None);
assert_eq!(tokens.len(), 3);
assert_eq!(chars(&tokens), vec!["a", "b", "c"]);
for t in &tokens {
if let Token::Char(c) = t {
assert!(!c.full_width);
}
}
}
// A CJK ideograph is reported as full-width.
#[test]
fn cjk_fullwidth() {
let tokens = tokenize("中", None);
assert_eq!(tokens.len(), 1);
match &tokens[0] {
Token::Char(c) => assert!(c.full_width),
_ => panic!("expected Char"),
}
}
// A base character followed by U+FE0F stays one cluster and is full-width.
#[test]
fn vs16_fullwidth() {
let tokens = tokenize("✏️", None);
assert_eq!(tokens.len(), 1);
match &tokens[0] {
Token::Char(c) => {
assert_eq!(c.value, "✏️");
assert!(c.full_width);
}
_ => panic!("expected Char"),
}
}
// A pair of regional indicators forms one full-width cluster.
#[test]
fn flag_fullwidth() {
let tokens = tokenize("🇩🇪", None);
assert_eq!(tokens.len(), 1, "flag is one grapheme cluster");
match &tokens[0] {
Token::Char(c) => assert!(c.full_width),
_ => panic!("expected Char"),
}
}
// Simple SGR codes become Ansi tokens carrying the end code from `get_end_code`.
#[test]
fn red_then_reset_fg() {
let tokens = tokenize("\x1B[31mred\x1B[39m", None);
assert_eq!(
ansi_tokens(&tokens),
vec![("\x1B[31m", "\x1B[39m"), ("\x1B[39m", "\x1B[39m")]
);
assert_eq!(chars(&tokens), vec!["r", "e", "d"]);
}
// The full reset code is its own end code.
#[test]
fn reset_code() {
let tokens = tokenize("\x1B[0m", None);
assert_eq!(ansi_tokens(&tokens), vec![("\x1B[0m", "\x1B[0m")]);
}
// A `;`-separated SGR sequence is split into one token per attribute.
#[test]
fn compound_sgr_split() {
let tokens = tokenize("\x1B[1;3;31m", None);
assert_eq!(
ansi_tokens(&tokens),
vec![
("\x1B[1m", "\x1B[22m"),
("\x1B[3m", "\x1B[23m"),
("\x1B[31m", "\x1B[39m"),
]
);
}
// `38;5;N` is kept together as a single token.
#[test]
fn eight_bit_color() {
let tokens = tokenize("\x1B[38;5;200m", None);
assert_eq!(ansi_tokens(&tokens), vec![("\x1B[38;5;200m", "\x1B[39m")]);
}
// `38;2;R;G;B` is kept together as a single token.
#[test]
fn twenty_four_bit_color() {
let tokens = tokenize("\x1B[38;2;255;0;128m", None);
assert_eq!(
ansi_tokens(&tokens),
vec![("\x1B[38;2;255;0;128m", "\x1B[39m")]
);
}
// An extended colour embedded between other attributes is still grouped.
#[test]
fn compound_with_embedded_24bit() {
let tokens = tokenize("\x1B[1;38;2;10;20;30;4m", None);
assert_eq!(
ansi_tokens(&tokens),
vec![
("\x1B[1m", "\x1B[22m"),
("\x1B[38;2;10;20;30m", "\x1B[39m"),
("\x1B[4m", "\x1B[24m"),
]
);
}
// Link open/close terminated by BEL; the end code keeps the BEL terminator.
#[test]
fn osc8_link_bel() {
let s = "\x1B]8;;https://example.com\x07text\x1B]8;;\x07";
let tokens = tokenize(s, None);
assert_eq!(
ansi_tokens(&tokens),
vec![
("\x1B]8;;https://example.com\x07", "\x1B]8;;\x07"),
("\x1B]8;;\x07", "\x1B]8;;\x07"),
]
);
assert_eq!(chars(&tokens), vec!["t", "e", "x", "t"]);
}
// Link open/close terminated by ESC `\`; the end code keeps that terminator.
#[test]
fn osc8_link_st() {
let s = "\x1B]8;;https://e.com\x1B\\hi\x1B]8;;\x1B\\";
let tokens = tokenize(s, None);
assert_eq!(
ansi_tokens(&tokens),
vec![
("\x1B]8;;https://e.com\x1B\\", "\x1B]8;;\x1B\\"),
("\x1B]8;;\x1B\\", "\x1B]8;;\x1B\\"),
]
);
assert_eq!(chars(&tokens), vec!["h", "i"]);
}
// A non-link OSC sequence (BEL-terminated) becomes a single Control token.
#[test]
fn osc_window_title() {
let tokens = tokenize("\x1B]0;title\x07X", None);
assert_eq!(tokens.len(), 2);
match &tokens[0] {
Token::Control(c) => assert_eq!(c.code, "\x1B]0;title\x07"),
_ => panic!("expected Control"),
}
assert_eq!(chars(&tokens), vec!["X"]);
}
// Same as above with the ESC `\` terminator.
#[test]
fn osc_window_title_st() {
let tokens = tokenize("\x1B]0;title\x1B\\X", None);
match &tokens[0] {
Token::Control(c) => assert_eq!(c.code, "\x1B]0;title\x1B\\"),
_ => panic!("expected Control"),
}
assert_eq!(chars(&tokens), vec!["X"]);
}
// A CSI sequence with a non-SGR byte is not consumed; every character,
// including the ESC, comes out as a Char token.
#[test]
fn invalid_sgr_falls_through() {
let tokens = tokenize("\x1B[31xred", None);
assert!(ansi_tokens(&tokens).is_empty());
assert_eq!(
chars(&tokens),
vec!["\x1B", "[", "3", "1", "x", "r", "e", "d"]
);
}
// Tokenizing stops once the visible width reaches the limit.
#[test]
fn end_char_limit() {
let tokens = tokenize("abcdef", Some(3));
assert_eq!(chars(&tokens), vec!["a", "b", "c"]);
}
// Full-width clusters count as two towards the limit.
#[test]
fn end_char_fullwidth_limit() {
let tokens = tokenize("中文X", Some(4));
assert_eq!(chars(&tokens), vec!["中", "文"]);
}
// U+009B is accepted as an opener; the original opener is kept in `code`.
#[test]
fn c1_csi_opener_parses_sgr() {
let tokens = tokenize("\u{9B}[31mX", None);
assert_eq!(ansi_tokens(&tokens), vec![("\u{9B}[31m", "\x1B[39m")]);
assert_eq!(chars(&tokens), vec!["X"]);
}
// U+009B-opened link codes are recognised as Ansi tokens.
// NOTE(review): the pinned end code here is "\x1B[28m", unlike the
// ESC-opened link tests above where the end code is a link-close sequence.
// This records what `get_end_code` currently returns — confirm it is intended.
#[test]
fn c1_opener_parses_osc8_link() {
let tokens = tokenize("\u{9B}]8;;https://x\u{07}T\u{9B}]8;;\u{07}", None);
assert_eq!(
ansi_tokens(&tokens),
vec![
("\u{9B}]8;;https://x\u{07}", "\x1B[28m"),
("\u{9B}]8;;\u{07}", "\x1B[28m"),
]
);
assert_eq!(chars(&tokens), vec!["T"]);
}
// Runs tokenize -> styled_chars_from_tokens -> styled_chars_to_string.
// Only the first input has its output asserted; the NUL-byte and
// back-to-back link inputs are checked solely for not panicking.
#[test]
fn ansi_tokenize_pipeline_raw_c1_and_null_bytes_no_panic() {
use crate::text::ansi_tokenize::{styled_chars_from_tokens, styled_chars_to_string};
let pipe = |s: &str| styled_chars_to_string(&styled_chars_from_tokens(&tokenize(s, None)));
assert_eq!(pipe("\u{9b}[31mhi\u{9b}[39m"), "\u{9b}[31mhi\x1b[39m");
let _ = pipe("\x00\x00\x00"); let _ = pipe("\x1b]8;;a\x07x\x1b]8;;b\x07y\x1b]8;;\x07"); }
}