use std::sync::LazyLock;
use regex::Regex;
use unicode_segmentation::UnicodeSegmentation;
/// Pattern for the ANSI escape sequences that `string_width_with` removes
/// before measuring. Compiled on first use.
///
/// The pattern is an alternation of two branches:
/// * OSC: `ESC ]`, then any characters (matched non-greedily) up to the first
///   terminator.
/// * CSI-like: an `ESC` or U+009B introducer, optional intermediate
///   punctuation, optional numeric parameters, and one final character.
static ANSI_RE: LazyLock<Regex> = LazyLock::new(|| {
// Terminator of an OSC sequence: BEL (0x07), `ESC \` (0x1B 0x5C), or U+009C.
const ST: &str = r"(?:\x07|\x1b\x5c|\x9c)";
// `{{` / `}}` are `format!` escapes, so the class below reaches the regex as
// `[\x00-\x{10FFFF}]` (every character, newlines included). `*?` is lazy so
// the match stops at the first terminator rather than the last.
let osc = format!(r"(?:\x1b\][\x00-\x{{10FFFF}}]*?{ST})");
// Introducer, then any run of `[ ] ( ) # ; ?`, then optionally 1-4 digits
// followed by `;`/`:`-separated groups of 0-4 digits, then one final character.
let csi = r"[\x1b\x9b][\[\]()#;?]*(?:[0-9]{1,4}(?:[;:][0-9]{0,4})*)?[0-9A-PR-TZcf-nq-uy=><~]";
// The pattern is a fixed literal, so a compile failure is a programming error.
Regex::new(&format!("{osc}|{csi}")).expect("ANSI_RE is a valid regex")
});
/// Settings accepted by `string_width_with`.
#[derive(Debug, Clone, Copy)]
pub struct Options {
/// When `true`, code points listed in the East Asian "ambiguous" table are
/// measured as one column; when `false` they are measured as two.
pub ambiguous_is_narrow: bool,
/// When `false`, ANSI escape sequences are stripped before measuring; when
/// `true`, the input is measured without stripping.
pub count_ansi_escape_codes: bool,
}
impl Default for Options {
fn default() -> Self {
Self {
ambiguous_is_narrow: true,
count_ansi_escape_codes: false,
}
}
}
/// Measures the display width of `input` using the default [`Options`].
///
/// Equivalent to calling `string_width_with(input, Options::default())`.
#[inline]
pub fn string_width(input: &str) -> usize {
    let defaults = Options::default();
    string_width_with(input, defaults)
}
/// Computes how many terminal columns `input` occupies.
///
/// * `input` - text to measure.
/// * `opts` - controls whether ANSI escape sequences are stripped first and
///   whether ambiguous-width characters count as one or two columns.
///
/// Processing order:
/// 1. Unless `opts.count_ansi_escape_codes` is set, sequences matched by
///    `ANSI_RE` are removed.
/// 2. Text made up solely of printable ASCII is measured by its byte length.
/// 3. Otherwise the text is split into extended grapheme clusters and the
///    width of each cluster is added up.
///
/// Returns `0` for empty input and for input that is empty after stripping.
pub fn string_width_with(input: &str, opts: Options) -> usize {
    if input.is_empty() {
        return 0;
    }

    // `replace_all` returns `Cow::Borrowed` when nothing matched, so input
    // that merely contains a stray ESC is measured without being copied.
    let stripped: std::borrow::Cow<'_, str> =
        if !opts.count_ansi_escape_codes && ansi_present(input) {
            ANSI_RE.replace_all(input, "")
        } else {
            std::borrow::Cow::Borrowed(input)
        };
    let s: &str = &stripped;
    if s.is_empty() {
        return 0;
    }

    // Fast path: one column per byte when every byte is printable ASCII.
    if is_all_printable_ascii(s) {
        return s.len();
    }

    let ambiguous_as_wide = !opts.ambiguous_is_narrow;
    let mut width = 0usize;
    for segment in s.graphemes(true) {
        // A lone printable ASCII byte is always one column.
        if segment.len() == 1 && (0x20..=0x7E).contains(&segment.as_bytes()[0]) {
            width += 1;
            continue;
        }
        // Clusters made only of control/ignorable/format/mark scalars add nothing.
        if is_zero_width_cluster(segment) {
            continue;
        }
        // Recognised emoji sequences are two columns regardless of their parts.
        if is_double_width_emoji(segment) {
            width += 2;
            continue;
        }
        // Drop leading zero-width scalars so the first remaining scalar is the
        // one that decides the width.
        let visible = base_visible(segment);
        if let Some(hangul) = hangul_cluster_width(visible, ambiguous_as_wide) {
            width += hangul;
            continue;
        }
        let Some(first) = visible.chars().next() else {
            continue;
        };
        width += east_asian_width(first as u32, ambiguous_as_wide);
        // Later scalars from U+FF00..=U+FFEF contribute their own width too.
        width += trailing_halfwidth_width(visible, ambiguous_as_wide);
    }
    width
}
/// Removes every ANSI escape sequence matched by `ANSI_RE` from `input`.
///
/// Returns `Cow::Borrowed(input)` whenever nothing was removed, including the
/// case where an ESC/CSI introducer is present but no complete sequence
/// matched, so no copy is made unless the text actually changes.
#[cfg(test)]
pub(crate) fn strip_ansi(input: &str) -> std::borrow::Cow<'_, str> {
    if !ansi_present(input) {
        // Both branches of the pattern start with ESC or U+009B, so without
        // one of them the regex cannot match and need not run.
        return std::borrow::Cow::Borrowed(input);
    }
    // `replace_all` already borrows the haystack when there is no match, so
    // its result is returned as-is instead of being forced into an owned copy.
    ANSI_RE.replace_all(input, "")
}
/// Cheap pre-check for escape sequences: reports whether `s` contains ESC
/// (U+001B) or U+009B anywhere.
#[inline]
fn ansi_present(s: &str) -> bool {
    s.contains(['\u{1B}', '\u{9B}'])
}
/// Reports whether every byte of `s` lies in the printable ASCII range
/// `0x20..=0x7E`. An empty string yields `true`.
///
/// Every byte is visited and the results are combined with `&`, so the loop
/// has no early exit.
#[inline(never)]
fn is_all_printable_ascii(s: &str) -> bool {
    let mut all_printable = true;
    for &byte in s.as_bytes() {
        all_printable &= matches!(byte, 0x20..=0x7E);
    }
    all_printable
}
/// Reports whether `cp` falls inside any inclusive `(lo, hi)` range of `table`.
///
/// The lookup is a binary search, so `table` is expected to be sorted in
/// ascending order with non-overlapping ranges.
fn in_ranges(table: &[(u32, u32)], cp: u32) -> bool {
    use std::cmp::Ordering::{Equal, Greater, Less};

    let found = table.binary_search_by(|range| {
        let (lo, hi) = *range;
        if lo > cp {
            // The whole range sits above `cp`.
            Greater
        } else if hi < cp {
            // The whole range sits below `cp`.
            Less
        } else {
            Equal
        }
    });
    found.is_ok()
}
/// Reports whether a single scalar takes up no columns: it is a control
/// character, or it appears in the default-ignorable, format, or mark tables.
#[inline]
fn is_zero_width_scalar(c: char) -> bool {
    if c.is_control() {
        return true;
    }
    let cp = u32::from(c);
    if in_ranges(DEFAULT_IGNORABLE_RANGES, cp) {
        return true;
    }
    if in_ranges(FORMAT_RANGES, cp) {
        return true;
    }
    in_ranges(MARK_RANGES, cp)
}
#[inline]
fn is_zero_width_cluster(segment: &str) -> bool {
segment.chars().all(is_zero_width_scalar)
}
#[inline]
fn base_visible(segment: &str) -> &str {
let mut end = 0;
for c in segment.chars() {
if is_zero_width_scalar(c) {
end += c.len_utf8();
} else {
break;
}
}
&segment[end..]
}
/// Column count of a single code point: `2` when it is listed as fullwidth or
/// wide, or when it is listed as ambiguous and `ambiguous_as_wide` is set;
/// `1` otherwise.
#[inline]
fn east_asian_width(cp: u32, ambiguous_as_wide: bool) -> usize {
    let always_wide = in_ranges(EAW_FULLWIDTH_RANGES, cp) || in_ranges(EAW_WIDE_RANGES, cp);
    let wide = always_wide || (ambiguous_as_wide && in_ranges(EAW_AMBIGUOUS_RANGES, cp));
    1 + usize::from(wide)
}
/// Extra columns contributed by scalars after the first one in `visible`.
///
/// Only scalars in `U+FF00..=U+FFEF` are counted; each adds its own
/// `east_asian_width`. The first scalar is never counted here.
fn trailing_halfwidth_width(visible: &str, ambiguous_as_wide: bool) -> usize {
    let mut extra = 0usize;
    for follower in visible.chars().skip(1) {
        if matches!(follower, '\u{FF00}'..='\u{FFEF}') {
            extra += east_asian_width(follower as u32, ambiguous_as_wide);
        }
    }
    extra
}
/// Reports whether `cp` is in one of the leading-jamo ranges:
/// `U+1100..=U+115F` or `U+A960..=U+A97C`.
#[inline]
fn is_hangul_leading_jamo(cp: u32) -> bool {
    matches!(cp, 0x1100..=0x115F | 0xA960..=0xA97C)
}
/// Reports whether `cp` is in one of the vowel-jamo ranges:
/// `U+1160..=U+11A7` or `U+D7B0..=U+D7C6`.
#[inline]
fn is_hangul_vowel_jamo(cp: u32) -> bool {
    matches!(cp, 0x1160..=0x11A7 | 0xD7B0..=0xD7C6)
}
/// Reports whether `cp` is in one of the trailing-jamo ranges:
/// `U+11A8..=U+11FF` or `U+D7CB..=U+D7FB`.
#[inline]
fn is_hangul_trailing_jamo(cp: u32) -> bool {
    matches!(cp, 0x11A8..=0x11FF | 0xD7CB..=0xD7FB)
}
#[inline]
fn is_hangul_jamo(cp: u32) -> bool {
is_hangul_leading_jamo(cp) || is_hangul_vowel_jamo(cp) || is_hangul_trailing_jamo(cp)
}
#[inline]
fn opt_is_vowel(cp: Option<u32>) -> bool {
cp.is_some_and(is_hangul_vowel_jamo)
}
#[inline]
fn opt_is_trailing(cp: Option<u32>) -> bool {
cp.is_some_and(is_hangul_trailing_jamo)
}
/// Width of a cluster that begins with a Hangul jamo, or `None` when this
/// path does not apply.
///
/// Returns `None` when `visible` is empty, when its first scalar is not a
/// jamo, or when no jamo has been measured before the first non-jamo scalar.
/// Zero-width scalars are ignored. A leading jamo followed by a vowel jamo
/// (and optionally a trailing jamo) counts as one two-column syllable; every
/// other scalar contributes its own `east_asian_width`.
#[inline(never)]
fn hangul_cluster_width(visible: &str, ambiguous_as_wide: bool) -> Option<usize> {
    let starts_with_jamo = visible
        .chars()
        .next()
        .is_some_and(|lead| is_hangul_jamo(lead as u32));
    if !starts_with_jamo {
        return None;
    }

    let scalars: Vec<u32> = visible
        .chars()
        .filter(|scalar| !is_zero_width_scalar(*scalar))
        .map(u32::from)
        .collect();
    if scalars.is_empty() {
        return None;
    }

    let mut total = 0usize;
    let mut rest: &[u32] = &scalars;
    while let Some((&cp, tail)) = rest.split_first() {
        if !is_hangul_jamo(cp) {
            // A non-jamo scalar ends the jamo walk; everything from here on is
            // measured individually, provided some jamo width was collected.
            if total == 0 {
                return None;
            }
            let remainder: usize = rest
                .iter()
                .map(|&other| east_asian_width(other, ambiguous_as_wide))
                .sum();
            return Some(total + remainder);
        }

        let starts_syllable = is_hangul_leading_jamo(cp) && opt_is_vowel(tail.first().copied());
        if starts_syllable {
            total += 2;
            // Consume leading + vowel, plus the trailing jamo when present.
            let consumed = if opt_is_trailing(tail.get(1).copied()) {
                3
            } else {
                2
            };
            rest = &rest[consumed..];
        } else {
            total += east_asian_width(cp, ambiguous_as_wide);
            rest = tail;
        }
    }
    Some(total)
}
/// U+200D ZERO WIDTH JOINER; its presence is checked when detecting joined
/// emoji sequences.
const ZWJ: char = '\u{200D}';
/// U+FE0F VARIATION SELECTOR-16, matched as the second scalar of a VS16
/// sequence and as the optional middle scalar of a keycap sequence.
const VS16: char = '\u{FE0F}';
/// U+20E3 COMBINING ENCLOSING KEYCAP, the final scalar of a keycap sequence.
const COMBINING_ENCLOSING_KEYCAP: char = '\u{20E3}';
/// First code point of the regional indicator range (letter A).
const REGIONAL_INDICATOR_A: u32 = 0x1F1E6;
/// Last code point of the regional indicator range (letter Z).
const REGIONAL_INDICATOR_Z: u32 = 0x1F1FF;
/// Reports whether `cp` is listed in `EXTENDED_PICTOGRAPHIC_RANGES`.
#[inline]
fn is_extended_pictographic(cp: u32) -> bool {
    let table = EXTENDED_PICTOGRAPHIC_RANGES;
    in_ranges(table, cp)
}
/// Reports whether `cp` is listed in `EMOJI_MODIFIER_BASE_RANGES`.
#[inline]
fn is_emoji_modifier_base(cp: u32) -> bool {
    let table = EMOJI_MODIFIER_BASE_RANGES;
    in_ranges(table, cp)
}
/// Reports whether `cp` lies in `U+1F3FB..=U+1F3FF`, the range this module
/// treats as emoji modifiers.
#[inline]
fn is_emoji_modifier(cp: u32) -> bool {
    matches!(cp, 0x1F3FB..=0x1F3FF)
}
/// Reports whether `cp` lies between `REGIONAL_INDICATOR_A` and
/// `REGIONAL_INDICATOR_Z`, inclusive.
#[inline]
fn is_regional_indicator(cp: u32) -> bool {
    matches!(cp, REGIONAL_INDICATOR_A..=REGIONAL_INDICATOR_Z)
}
fn is_rgi_flag_pair(first: u32, second: u32) -> bool {
let a = (b'A' as u32 + first - REGIONAL_INDICATOR_A) as u8;
let b = (b'A' as u32 + second - REGIONAL_INDICATOR_A) as u8;
RGI_FLAG_PAIRS.binary_search(&[a, b]).is_ok()
}
/// Reports whether `segment` is an emoji sequence measured as two columns.
///
/// Segments longer than 50 UTF-16 code units are rejected outright. Otherwise
/// the segment qualifies when it is a keycap sequence, a recognised flag pair,
/// a ZWJ-containing cluster with at least two extended pictographic scalars,
/// or, after leading zero-width scalars are dropped, a VS16 sequence or an
/// emoji modifier sequence.
fn is_double_width_emoji(segment: &str) -> bool {
    if utf16_len(segment) > 50 {
        return false;
    }

    let has_two_pictographs = || {
        segment
            .chars()
            .filter(|scalar| is_extended_pictographic(u32::from(*scalar)))
            .nth(1)
            .is_some()
    };
    if is_keycap_sequence(segment)
        || is_rgi_flag_sequence(segment)
        || (segment.contains(ZWJ) && has_two_pictographs())
    {
        return true;
    }

    let visible = base_visible(segment);
    is_vs16_sequence(visible) || is_modifier_sequence(visible)
}
/// Reports whether `visible` is exactly two scalars: an extended pictographic
/// scalar followed by `VS16`.
fn is_vs16_sequence(visible: &str) -> bool {
    let mut scalars = visible.chars();
    match (scalars.next(), scalars.next(), scalars.next()) {
        (Some(base), Some(VS16), None) => is_extended_pictographic(base as u32),
        _ => false,
    }
}
fn is_keycap_sequence(segment: &str) -> bool {
let mut chars = segment.chars();
let Some(base) = chars.next() else {
return false;
};
if !matches!(base, '0'..='9' | '#' | '*') {
return false;
}
let next = chars.next();
let after = match next {
Some(VS16) => chars.next(),
other => other,
};
after == Some(COMBINING_ENCLOSING_KEYCAP) && chars.next().is_none()
}
/// Reports whether `segment` is exactly two regional indicators that form a
/// pair accepted by `is_rgi_flag_pair`.
fn is_rgi_flag_sequence(segment: &str) -> bool {
    let mut scalars = segment.chars().map(u32::from);
    match (scalars.next(), scalars.next(), scalars.next()) {
        // The pair lookup runs only after both scalars passed the range check.
        (Some(first), Some(second), None) => {
            is_regional_indicator(first)
                && is_regional_indicator(second)
                && is_rgi_flag_pair(first, second)
        }
        _ => false,
    }
}
/// Reports whether `visible` is exactly two scalars: an emoji modifier base
/// followed by an emoji modifier.
fn is_modifier_sequence(visible: &str) -> bool {
    let mut scalars = visible.chars();
    match (scalars.next(), scalars.next(), scalars.next()) {
        (Some(base), Some(modifier), None) => {
            is_emoji_modifier_base(base as u32) && is_emoji_modifier(modifier as u32)
        }
        _ => false,
    }
}
/// Number of UTF-16 code units needed to encode `s`.
#[inline]
fn utf16_len(s: &str) -> usize {
    s.encode_utf16().count()
}
include!("string_width_tables.rs");
/// Behavioural tests for `string_width` and `string_width_with`.
///
/// Width-sensitive inputs (fullwidth and halfwidth forms) are written as
/// `\u{...}` escapes so the literals cannot be silently changed into
/// look-alike characters of a different width.
#[cfg(test)]
mod tests {
    use super::*;

    /// Measures `input` with ambiguous-width characters counted as wide.
    fn narrow_false(input: &str) -> usize {
        string_width_with(
            input,
            Options {
                ambiguous_is_narrow: false,
                ..Default::default()
            },
        )
    }

    #[test]
    fn empty_string_is_zero() {
        assert_eq!(string_width(""), 0);
    }

    #[test]
    fn printable_ascii() {
        assert_eq!(string_width("hello"), 5);
    }

    #[test]
    fn cjk_ideographs() {
        assert_eq!(string_width("中文"), 4);
    }

    #[test]
    fn ansi_colored_string_stripped() {
        assert_eq!(string_width("\x1b[31mred\x1b[0m"), 3);
    }

    #[test]
    fn single_emoji_is_double_width() {
        assert_eq!(string_width("😀"), 2);
    }

    #[test]
    fn keycap_one_is_double_width() {
        assert_eq!(string_width("1\u{20E3}"), 2);
    }

    #[test]
    fn tab_is_zero_width() {
        assert_eq!(string_width("\t"), 0);
    }

    #[test]
    fn fullwidth_latin_is_double_width() {
        // U+FF41 FULLWIDTH LATIN SMALL LETTER A (an ASCII "a" would be width 1).
        assert_eq!(string_width("\u{FF41}"), 2);
    }

    #[test]
    fn zwj_family_emoji_is_double_width() {
        assert_eq!(string_width("👨\u{200D}👩\u{200D}👧"), 2);
    }

    #[test]
    fn combining_acute_on_base_char() {
        assert_eq!(string_width("e\u{0301}"), 1);
    }

    #[test]
    fn lone_combining_mark_is_zero_width() {
        assert_eq!(string_width("\u{0301}"), 0);
    }

    #[test]
    fn ambiguous_narrow_by_default() {
        assert_eq!(string_width("¡"), 1);
    }

    #[test]
    fn ambiguous_wide_in_cjk_mode() {
        assert_eq!(narrow_false("¡"), 2);
    }

    #[test]
    fn ellipsis_ambiguous_narrow() {
        assert_eq!(string_width("…"), 1);
    }

    #[test]
    fn ellipsis_ambiguous_wide() {
        assert_eq!(narrow_false("…"), 2);
    }

    #[test]
    fn keycap_variants() {
        assert_eq!(string_width("#\u{20E3}"), 2);
        assert_eq!(string_width("*\u{20E3}"), 2);
        assert_eq!(string_width("0\u{20E3}"), 2);
        assert_eq!(string_width("9\u{20E3}"), 2);
    }

    #[test]
    fn emoji_modifier_sequence() {
        assert_eq!(string_width("👍\u{1F3FB}"), 2);
    }

    #[test]
    fn control_chars_zero_width() {
        assert_eq!(string_width("\n"), 0);
        assert_eq!(string_width("\r"), 0);
        assert_eq!(string_width("\x00"), 0);
    }

    #[test]
    fn mixed_ascii_and_wide() {
        assert_eq!(string_width("hi中"), 4);
    }

    #[test]
    fn default_ignorable_zero_width() {
        assert_eq!(string_width("\u{200B}"), 0);
        assert_eq!(string_width("\u{FEFF}"), 0);
    }

    #[test]
    fn complex_ansi_sequences() {
        assert_eq!(string_width("\x1b[38;5;200mcolored\x1b[0m"), 7);
        assert_eq!(string_width("\x1b[1mbold\x1b[0m"), 4);
    }

    #[test]
    fn ansi_sgr_31m() {
        assert_eq!(string_width("\x1b[31mX\x1b[0m"), 1);
    }

    #[test]
    fn ansi_sgr_1_31m() {
        assert_eq!(string_width("\x1b[1;31mX\x1b[0m"), 1);
    }

    #[test]
    fn ansi_csi_hide_cursor() {
        assert_eq!(string_width("\x1b[?25lX\x1b[?25h"), 1);
    }

    #[test]
    fn ansi_osc8_hyperlink() {
        let s = "\x1b]8;;https://example.com\x07link\x1b]8;;\x07";
        assert_eq!(string_width(s), 4);
    }

    #[test]
    fn keycap_fully_qualified() {
        assert_eq!(string_width("1\u{FE0F}\u{20E3}"), 2);
        assert_eq!(string_width("#\u{FE0F}\u{20E3}"), 2);
        assert_eq!(string_width("*\u{FE0F}\u{20E3}"), 2);
    }

    #[test]
    fn halfwidth_katakana_with_voiced_mark() {
        assert_eq!(string_width("\u{FF76}\u{FF9E}"), 2);
        assert_eq!(string_width("\u{FF76}\u{FF9F}"), 2);
    }

    #[test]
    fn soft_hyphen_format_category() {
        assert_eq!(string_width("\u{00AD}"), 0);
        assert_eq!(string_width("a\u{00AD}b"), 2);
    }

    #[test]
    fn zero_width_joiners_and_ignorables() {
        assert_eq!(string_width("\u{200C}"), 0);
        assert_eq!(string_width("\u{200D}"), 0);
        assert_eq!(string_width("\u{FEFF}"), 0);
        assert_eq!(string_width("\u{200B}"), 0);
    }

    #[test]
    fn ansi_count_mode_includes_escapes() {
        let opts = Options {
            count_ansi_escape_codes: true,
            ..Default::default()
        };
        assert_eq!(string_width_with("\x1b[31mred\x1b[0m", opts), 10);
    }

    #[test]
    fn ansi_count_mode_bare_sgr() {
        let opts = Options {
            count_ansi_escape_codes: true,
            ..Default::default()
        };
        assert_eq!(string_width_with("\x1b[31m", opts), 4);
    }

    #[test]
    fn ansi_osc_generic_strip() {
        assert_eq!(string_width("\x1b]0;My Title\x07hello"), 5);
        assert_eq!(string_width("\x1b]0;title\x07hello"), 5);
        assert_eq!(string_width("\x1b]0;title with spaces\x07x"), 1);
    }

    #[test]
    fn ansi_colon_sgr_strip() {
        assert_eq!(string_width("\x1b[38:2:1:2:3m "), 1);
    }

    #[test]
    fn indic_tamil_clusters() {
        assert_eq!(string_width("நி"), 1);
        assert_eq!(string_width("நிநி"), 2);
        assert_eq!(string_width("க்\u{200D}ஷ"), 2);
        assert_eq!(string_width("ி"), 0);
    }

    #[test]
    fn arabic_prepend_mark() {
        assert_eq!(string_width("\u{0600}A"), 1);
    }

    #[test]
    fn zwj_minimally_qualified() {
        assert_eq!(string_width("❤\u{200D}🔥"), 2);
        assert_eq!(string_width("🏳\u{200D}🌈"), 2);
        assert_eq!(string_width("👁\u{200D}🗨"), 2);
    }

    #[test]
    fn flag_sequences() {
        assert_eq!(string_width("🇺🇸"), 2);
        assert_eq!(string_width("🇦🇦"), 1);
        assert_eq!(string_width("🇦"), 1);
        assert_eq!(string_width("🇦🇺🇸"), 3);
    }

    #[test]
    fn zwj_non_emoji_prefix() {
        assert_eq!(string_width("a\u{200D}🔥"), 3);
    }

    #[test]
    fn modifier_with_trailing_zwj_not_double() {
        assert_eq!(string_width("\u{270C}\u{1F3FB}\u{200D}"), 1);
    }

    #[test]
    fn emoji_with_trailing_halfwidth_keeps_eaw_extra() {
        assert_eq!(string_width("\u{1F600}\u{1F3FB}\u{FE0F}\u{FF9E}"), 3);
    }

    #[test]
    fn vs16_presentation() {
        assert_eq!(string_width("✌"), 1);
        assert_eq!(string_width("✌\u{FE0F}"), 2);
        assert_eq!(string_width("✌🏽"), 2);
        assert_eq!(string_width("1\u{FE0F}"), 1);
        assert_eq!(string_width("#\u{FE0F}"), 1);
        assert_eq!(string_width("1\u{FE0F}\u{20E3}"), 2);
        assert_eq!(string_width("🔥\u{FE0F}"), 2);
        assert_eq!(string_width("❤"), 1);
        assert_eq!(string_width("❤\u{FE0F}"), 2);
    }

    #[test]
    fn tag_flag_sequences() {
        let scotland = "🏴\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}";
        assert_eq!(string_width(scotland), 2);
        assert_eq!(string_width("🏴\u{E0041}\u{E007F}"), 2);
    }

    #[test]
    fn modifier_on_wide_base() {
        assert_eq!(string_width("👍🏽"), 2);
    }

    #[test]
    fn hangul_jamo_clusters() {
        assert_eq!(string_width("\u{1100}\u{1161}"), 2);
        assert_eq!(string_width("\u{1100}\u{1100}\u{1161}"), 4);
        assert_eq!(string_width("\u{1161}"), 1);
        assert_eq!(string_width("\u{11A8}"), 1);
        assert_eq!(string_width("\u{1100}\u{1161}\u{11A8}"), 2);
    }

    #[test]
    fn halfwidth_forms() {
        // U+FF76 HALFWIDTH KATAKANA KA + U+FF9E HALFWIDTH VOICED SOUND MARK.
        assert_eq!(string_width("\u{FF76}\u{FF9E}"), 2);
        // U+FF71 HALFWIDTH KATAKANA A + U+FF70 HALFWIDTH PROLONGED SOUND MARK.
        assert_eq!(string_width("\u{FF71}\u{FF70}"), 2);
    }

    #[test]
    fn cjk_with_combining() {
        assert_eq!(string_width("中\u{0300}"), 2);
    }

    #[test]
    fn thai_sara_am() {
        assert_eq!(string_width("กำ"), 1);
    }

    #[test]
    fn tabs_are_zero_width() {
        assert_eq!(string_width("a\tb"), 2);
        assert_eq!(string_width("a\t\tb"), 2);
        assert_eq!(string_width("\ta"), 1);
        assert_eq!(string_width("a\t"), 1);
        assert_eq!(string_width("\t\t"), 0);
    }

    #[test]
    fn hangul_leading_plus_trailing_without_vowel_is_additive() {
        assert_eq!(string_width("\u{1100}\u{11A8}"), 3);
    }

    #[test]
    fn hangul_filler_leading_jamo_cluster() {
        assert_eq!(string_width("\u{115F}\u{1161}"), 1);
    }

    #[test]
    fn hangul_jamo_then_precomposed_cluster_is_additive() {
        assert_eq!(string_width("\u{1100}\u{AC00}"), 4);
    }

    #[test]
    fn precomposed_then_jamo_cluster_takes_eaw_path() {
        assert_eq!(string_width("\u{AC00}\u{1161}"), 2);
    }

    #[test]
    fn del_is_zero_width() {
        assert_eq!(string_width("\u{7F}"), 0);
    }

    #[test]
    fn del_plus_ascii_counts_only_the_ascii() {
        assert_eq!(string_width("\u{7F}a"), 1);
    }

    #[test]
    fn lone_emoji_modifier_is_double_width() {
        assert_eq!(string_width("\u{1F3FB}"), 2);
    }

    #[test]
    fn regional_indicator_plus_modifier_not_flag() {
        assert_eq!(string_width("\u{1F1E6}\u{1F3FB}"), 1);
    }

    #[test]
    fn rtl_override_zwj_flag_modifier_garbage_chain() {
        assert_eq!(
            string_width("\u{202E}\u{200D}\u{1F1E6}\u{1F3FB}\u{200D}\u{1F1FF}"),
            2
        );
    }

    #[test]
    fn count_ansi_mode_unterminated_osc_counts_bytes() {
        let opts = Options {
            count_ansi_escape_codes: true,
            ..Default::default()
        };
        assert_eq!(string_width_with("\x1b]8;;http://x", opts), 12);
        assert_eq!(string_width("\x1b]8;;http://x"), 7);
    }

    #[test]
    fn esc_mid_emoji_count_vs_strip() {
        let count = Options {
            count_ansi_escape_codes: true,
            ..Default::default()
        };
        assert_eq!(string_width("\u{1F600}\x1b[0m"), 2);
        assert_eq!(string_width_with("\u{1F600}\x1b[0m", count), 5);
    }

    #[test]
    fn no_panic_on_adversarial_battery() {
        let count = Options {
            count_ansi_escape_codes: true,
            ..Default::default()
        };
        let small = [
            "abc\x1b",
            "\x1b]8;;http://x",
            "\x1b[",
            "\x1b[38;5;",
            "a\x00b",
            "\x00\u{0301}",
            "\u{202E}\u{200D}\u{1F1E6}\u{1F3FB}\u{200D}\u{1F1FF}",
            "\u{1100}\u{11A8}",
        ];
        let big = [
            "a".repeat(10_000_000),
            "中".repeat(3_000_000),
            "\x1b".repeat(10_000_000),
        ];
        // Only absence of panics matters here; the widths are discarded.
        for s in small.iter().map(|s| s.to_string()).chain(big) {
            let _: usize = string_width(&s);
            let _: usize = string_width_with(&s, count);
        }
    }
}