use unicode_segmentation::UnicodeSegmentation;
/// Line terminators located in a piece of text.
///
/// `positions` and `lengths` are parallel vectors: entry `i` of each one
/// describes the same terminator.
#[derive(Debug, Default, Clone)]
pub struct LineBreakResult {
    /// Byte offset at which each terminator starts.
    pub positions: Vec<usize>,
    /// Length in bytes of each terminator, as filled in by `find_line_breaks`
    /// (1 for `\n` or a lone `\r`, 2 for `\r\n`).
    pub lengths: Vec<u8>,
}
/// Tab characters located in a piece of text.
#[derive(Debug, Default, Clone)]
pub struct TabStopResult {
    /// Byte offset of each tab character, in the order it was encountered.
    pub positions: Vec<usize>,
}
/// Wrap opportunities located in a piece of text.
///
/// `positions` and `break_types` are parallel vectors: entry `i` of each one
/// describes the same opportunity.
#[derive(Debug, Default, Clone)]
pub struct WrapBreakResult {
    /// Byte offset of each wrap opportunity.
    pub positions: Vec<usize>,
    /// Category of the character that produced each opportunity.
    pub break_types: Vec<BreakType>,
}
/// Category of a character that creates a wrap opportunity.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum BreakType {
    /// A whitespace character such as a space or a tab.
    Whitespace,
    /// Sentence or clause punctuation such as `.`, `,` or `;`.
    Punctuation,
    /// An opening bracket such as `(`, `[`, `{` or `<`.
    OpenBracket,
    /// A closing bracket such as `)`, `]`, `}` or `>`.
    CloseBracket,
    /// A hyphen or dash.
    Hyphen,
}
/// Returns `true` when every byte of `s` is ASCII.
///
/// The empty string is considered ASCII-only.
#[inline]
#[must_use]
pub fn is_ascii_only_fast(s: &str) -> bool {
    s.as_bytes().is_ascii()
}
/// Returns `true` when every byte of `s` is a printable ASCII character,
/// i.e. lies between space (0x20) and tilde (0x7E) inclusive.
///
/// Control characters (tab, newline, DEL, ...) and any non-ASCII text make
/// the result `false`. The empty string yields `true`.
#[must_use]
pub fn is_printable_ascii_only(s: &str) -> bool {
    s.bytes().all(|byte| matches!(byte, b' '..=b'~'))
}
/// Locates every line terminator in `text`.
///
/// Recognised terminators are `\n`, `\r\n` (reported once, with length 2) and
/// a lone `\r`. For each terminator the byte offset of its first byte is
/// pushed to `positions` and its byte length to `lengths`.
#[must_use]
pub fn find_line_breaks(text: &str) -> LineBreakResult {
    let raw = text.as_bytes();
    let mut found = LineBreakResult::default();
    for (offset, &byte) in raw.iter().enumerate() {
        let span: u8 = match byte {
            // A carriage return absorbs an immediately following line feed.
            b'\r' => {
                if raw.get(offset + 1) == Some(&b'\n') {
                    2
                } else {
                    1
                }
            }
            // This line feed was already reported as the tail of a CRLF pair.
            b'\n' if offset > 0 && raw[offset - 1] == b'\r' => continue,
            b'\n' => 1,
            _ => continue,
        };
        found.positions.push(offset);
        found.lengths.push(span);
    }
    found
}
/// Collects the byte offset of every tab character (`\t`) in `text`, in
/// ascending order.
#[must_use]
pub fn find_tab_stops(text: &str) -> TabStopResult {
    let mut stops = TabStopResult::default();
    stops.positions.extend(
        text.bytes()
            .enumerate()
            .filter(|&(_, byte)| byte == b'\t')
            .map(|(offset, _)| offset),
    );
    stops
}
/// Locates every position in `text` after which a line may be wrapped.
///
/// Each break character contributes one entry: the byte offset immediately
/// *after* the character goes into `positions`, and its category into
/// `break_types`. Recognised characters are:
///
/// * space and tab ([`BreakType::Whitespace`]),
/// * `.` `,` `;` `:` `!` `?` ([`BreakType::Punctuation`]),
/// * `(` `[` `{` `<` ([`BreakType::OpenBracket`]),
/// * `)` `]` `}` `>` ([`BreakType::CloseBracket`]),
/// * `-`, U+2010 and U+2012..=U+2014 ([`BreakType::Hyphen`]).
///
/// U+2011 NON-BREAKING HYPHEN is deliberately not a wrap opportunity.
#[must_use]
pub fn find_wrap_breaks(text: &str) -> WrapBreakResult {
    let mut result = WrapBreakResult::default();
    for (start, ch) in text.char_indices() {
        let kind = match ch {
            ' ' | '\t' => BreakType::Whitespace,
            '.' | ',' | ';' | ':' | '!' | '?' => BreakType::Punctuation,
            '(' | '[' | '{' | '<' => BreakType::OpenBracket,
            ')' | ']' | '}' | '>' => BreakType::CloseBracket,
            // U+2011 is skipped on purpose: it exists precisely to forbid a
            // line break after it.
            '-' | '\u{2010}' | '\u{2012}'..='\u{2014}' => BreakType::Hyphen,
            _ => continue,
        };
        // The break sits after the character, so the character stays on the
        // line that is being ended.
        result.positions.push(start + ch.len_utf8());
        result.break_types.push(kind);
    }
    result
}
/// Finds the byte offset at which `text` should be wrapped so that the first
/// line fits in `max_columns` display columns.
///
/// Graphemes are measured with `unicode_width`; a tab advances to the next
/// multiple of `tab_width` (a `tab_width` of 0 is treated as 1).
///
/// Returns `None` when `text` is empty, `max_columns` is 0, or the whole text
/// fits. Otherwise returns:
///
/// * the offset just past the most recent break character (whitespace or one
///   accepted by `is_break_char`, judged by the grapheme's first scalar),
///   provided that break lies at or beyond half of `max_columns`. The break
///   character stays on the first line, even when it is the grapheme that
///   crosses the limit;
/// * otherwise a hard break at the start of the grapheme that crossed the
///   limit.
///
/// The returned offset is never 0: if the very first grapheme is already too
/// wide, the break is placed after it so that callers that repeatedly split
/// at the returned offset always make progress.
#[must_use]
pub fn find_wrap_position(text: &str, max_columns: u32, tab_width: u8) -> Option<usize> {
    if text.is_empty() || max_columns == 0 {
        return None;
    }
    let tab_width = u32::from(tab_width).max(1);
    let mut col = 0u32;
    // (byte offset just past the break character, column just past it)
    let mut last_break: Option<(usize, u32)> = None;
    for (byte_idx, grapheme) in text.grapheme_indices(true) {
        let width = if grapheme == "\t" {
            tab_width - (col % tab_width)
        } else {
            unicode_width::UnicodeWidthStr::width(grapheme) as u32
        };
        let grapheme_end = byte_idx + grapheme.len();
        if let Some(ch) = grapheme.chars().next() {
            if ch.is_whitespace() || is_break_char(ch) {
                last_break = Some((grapheme_end, col + width));
            }
        }
        col += width;
        if col > max_columns {
            if let Some((break_pos, break_col)) = last_break {
                // Only honour a soft break that leaves the line at least half
                // full; earlier breaks would waste too much of the line.
                if break_col >= max_columns / 2 {
                    return Some(break_pos);
                }
            }
            // Hard break. A break at offset 0 would yield an empty line and
            // no progress, so an oversized first grapheme is kept on the line.
            return Some(if byte_idx == 0 { grapheme_end } else { byte_idx });
        }
    }
    None
}
/// Returns `true` for the ASCII punctuation, hyphen and bracket characters
/// after which a line may be wrapped.
fn is_break_char(ch: char) -> bool {
    const BREAK_CHARS: &str = ".,;:!?-()[]{}<>";
    BREAK_CHARS.contains(ch)
}
/// Converts a display column into a byte offset within `text`.
///
/// Graphemes are measured with `unicode_width`; a tab advances to the next
/// multiple of `tab_width` (a `tab_width` of 0 is treated as 1).
///
/// Returns:
///
/// * `0` when `text` is empty or `target_column` is 0;
/// * the start of a grapheme when that grapheme would extend past
///   `target_column`;
/// * the end of a grapheme when it finishes exactly on `target_column`;
/// * `text.len()` when the text is narrower than `target_column`.
#[must_use]
pub fn find_position_by_width(text: &str, target_column: u32, tab_width: u8) -> usize {
    if text.is_empty() || target_column == 0 {
        return 0;
    }
    let tab = u32::from(tab_width).max(1);
    let mut column = 0u32;
    for (offset, cluster) in text.grapheme_indices(true) {
        let advance = if cluster == "\t" {
            tab - (column % tab)
        } else {
            unicode_width::UnicodeWidthStr::width(cluster) as u32
        };
        let reached = column + advance;
        match reached.cmp(&target_column) {
            // The cluster straddles the target column: stop in front of it.
            std::cmp::Ordering::Greater => return offset,
            // The cluster ends exactly on the target column: stop behind it.
            std::cmp::Ordering::Equal => return offset + cluster.len(),
            std::cmp::Ordering::Less => column = reached,
        }
    }
    text.len()
}
#[must_use]
pub fn get_prev_grapheme_start(
text: &str,
byte_offset: usize,
tab_width: u8,
) -> Option<(usize, u32)> {
if byte_offset == 0 || text.is_empty() {
return None;
}
let tab_width = u32::from(tab_width).max(1);
let prefix = &text[..byte_offset.min(text.len())];
let graphemes: Vec<_> = prefix.grapheme_indices(true).collect();
if graphemes.is_empty() {
return None;
}
let (start, grapheme) = graphemes.last()?;
let width = if *grapheme == "\t" {
let mut col = 0u32;
for (_, g) in &graphemes[..graphemes.len() - 1] {
if *g == "\t" {
col += tab_width - (col % tab_width);
} else {
col += unicode_width::UnicodeWidthStr::width(*g) as u32;
}
}
tab_width - (col % tab_width)
} else {
unicode_width::UnicodeWidthStr::width(*grapheme) as u32
};
Some((*start, width))
}
/// Computes the display width of `text` in columns, starting at column 0.
///
/// Each grapheme contributes its `unicode_width` width, except a tab, which
/// advances to the next multiple of `tab_width` (a `tab_width` of 0 is
/// treated as 1). An empty string has width 0.
#[must_use]
pub fn calculate_text_width(text: &str, tab_width: u8) -> u32 {
    let tab = u32::from(tab_width).max(1);
    text.graphemes(true).fold(0u32, |column, cluster| {
        let advance = if cluster == "\t" {
            tab - (column % tab)
        } else {
            unicode_width::UnicodeWidthStr::width(cluster) as u32
        };
        column + advance
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// ASCII detection accepts the empty string and rejects non-ASCII text.
    #[test]
    fn test_is_ascii_only_fast() {
        assert!(is_ascii_only_fast("hello world"));
        assert!(is_ascii_only_fast(""));
        assert!(!is_ascii_only_fast("héllo"));
        assert!(!is_ascii_only_fast("hello 🌍"));
    }

    /// Control characters such as tab and newline are not printable.
    #[test]
    fn test_is_printable_ascii_only() {
        assert!(is_printable_ascii_only("hello world"));
        assert!(is_printable_ascii_only(""));
        assert!(!is_printable_ascii_only("hello\tworld"));
        assert!(!is_printable_ascii_only("hello\nworld"));
    }

    /// `\n`, `\r\n` and a lone `\r` are each reported once.
    #[test]
    fn test_find_line_breaks() {
        let found = find_line_breaks("a\nb\r\nc\rd");
        assert_eq!(found.positions, vec![1, 3, 6]);
        assert_eq!(found.lengths, vec![1, 2, 1]);
    }

    #[test]
    fn test_find_line_breaks_empty() {
        let found = find_line_breaks("");
        assert!(found.positions.is_empty());
    }

    #[test]
    fn test_find_tab_stops() {
        let found = find_tab_stops("a\tb\tc");
        assert_eq!(found.positions, vec![1, 3]);
    }

    /// Both the comma/exclamation mark and the space must be detected.
    #[test]
    fn test_find_wrap_breaks() {
        let found = find_wrap_breaks("hello, world!");
        assert!(!found.positions.is_empty());
        assert!(found.break_types.contains(&BreakType::Whitespace));
        assert!(found.break_types.contains(&BreakType::Punctuation));
    }

    /// "hello " fills the six available columns, so the wrap lands after the space.
    #[test]
    fn test_find_wrap_position() {
        let wrap_at = find_wrap_position("hello world", 6, 4);
        assert_eq!(wrap_at, Some(6));
    }

    #[test]
    fn test_find_wrap_position_no_wrap_needed() {
        let wrap_at = find_wrap_position("hello", 10, 4);
        assert!(wrap_at.is_none());
    }

    #[test]
    fn test_find_position_by_width() {
        assert_eq!(find_position_by_width("hello", 0, 4), 0);
        assert_eq!(find_position_by_width("hello", 3, 4), 3);
        // A target beyond the end of the text clamps to the text length.
        assert_eq!(find_position_by_width("hello", 10, 4), 5);
    }

    #[test]
    fn test_find_position_by_width_with_tab() {
        // The tab spans columns 1..4, so column 3 falls inside it: stop before it.
        let inside_tab = find_position_by_width("a\tb", 3, 4);
        assert_eq!(inside_tab, 1);

        // The tab ends exactly on column 4: stop right after it.
        let after_tab = find_position_by_width("a\tb", 4, 4);
        assert_eq!(after_tab, 2);
    }

    #[test]
    fn test_get_prev_grapheme_start() {
        let previous = get_prev_grapheme_start("hello", 3, 4);
        assert_eq!(previous, Some((2, 1)));

        // Nothing precedes offset 0.
        let at_start = get_prev_grapheme_start("hello", 0, 4);
        assert_eq!(at_start, None);
    }

    #[test]
    fn test_calculate_text_width() {
        assert_eq!(calculate_text_width("hello", 4), 5);
        assert_eq!(calculate_text_width("", 4), 0);
        // A tab at column 0 advances to column 4.
        assert_eq!(calculate_text_width("\t", 4), 4);
        // A tab at column 1 also advances to column 4.
        assert_eq!(calculate_text_width("a\t", 4), 4);
    }

    /// Each of the two CJK ideographs occupies two columns.
    #[test]
    fn test_calculate_text_width_wide_chars() {
        assert_eq!(calculate_text_width("漢字", 4), 4);
    }

    /// A tab width of zero is treated as one, so no modulo-by-zero panic occurs.
    #[test]
    fn test_tab_width_zero_does_not_panic() {
        assert_eq!(calculate_text_width("a\tb", 0), 3);
        assert_eq!(find_position_by_width("a\tb", 2, 0), 2);
        assert!(find_wrap_position("a\tb", 10, 0).is_none());
        assert!(get_prev_grapheme_start("a\tb", 2, 0).is_some());
    }
}