use regex::Regex;
/// Lowercase English function words (articles, conjunctions, prepositions,
/// auxiliary and modal verbs, and a few negations/pronouns).
///
/// Consulted by `is_stop_word`, which compares entries exactly, so callers
/// must pass already-lowercased words to get a match.
static STOP_WORDS: &[&str] = &[
"a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
"from", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does",
"did", "will", "would", "shall", "should", "may", "might", "must", "can", "could", "it", "its",
"not", "no",
];
pub fn sanitize(text: &str) -> String {
let mut out = String::with_capacity(text.len());
let bytes = text.as_bytes();
let mut i = 0;
while i < bytes.len() {
let ch = bytes[i];
if ch == b'\r' {
if i + 1 >= bytes.len() || bytes[i + 1] != b'\n' {
out.push('\n');
}
i += 1;
continue;
}
if ch == 0x1B
&& i + 1 < bytes.len()
&& let Some(seq_len) = check_ansi_seq(bytes, i + 1)
{
i += 1 + seq_len;
continue;
}
if !is_control(ch) {
out.push(ch as char);
}
i += 1;
}
out
}
/// Breaks `text` into lowercase word tokens.
///
/// A token is a maximal run of alphanumeric characters (Unicode-aware) plus
/// `_`, `-` and `.`; every other character acts as a separator. Only ASCII
/// letters are lowercased; non-ASCII letters are kept as they are.
pub fn split_words(text: &str) -> Vec<String> {
    text.split(|c: char| !(c.is_alphanumeric() || matches!(c, '_' | '-' | '.')))
        // Adjacent separators produce empty pieces; those are not words.
        .filter(|token| !token.is_empty())
        .map(str::to_ascii_lowercase)
        .collect()
}
/// Splits a comma-separated string into its trimmed, non-empty fields.
///
/// Surrounding whitespace is removed from each field, and fields that are
/// empty after trimming are skipped.
pub fn split_csv(csv: &str) -> Vec<String> {
    csv.split(',')
        .map(str::trim)
        .filter(|field| !field.is_empty())
        .map(str::to_owned)
        .collect()
}
/// Concatenates `lines`, placing `separator` between consecutive entries.
///
/// No separator is added before the first or after the last entry; an empty
/// slice yields an empty string.
pub fn join_lines(lines: &[String], separator: &str) -> String {
    let mut joined = String::new();
    for (idx, line) in lines.iter().enumerate() {
        if idx > 0 {
            joined.push_str(separator);
        }
        joined.push_str(line);
    }
    joined
}
pub fn wrap_text(text: &str, width: usize, indent: &str) -> String {
let indent_len = indent.chars().count();
let available = width.saturating_sub(indent_len);
if available == 0 {
return String::new();
}
let mut out = String::new();
for line in text.lines() {
if line.is_empty() {
out.push('\n');
continue;
}
let wrapped = wrap_line_words(line, available);
for w in &wrapped {
out.push_str(indent);
out.push_str(w);
out.push('\n');
}
}
out.trim_end_matches('\n').to_string()
}
/// Greedily packs the space-separated words of `line` into rows that are at
/// most `available` characters wide.
///
/// Width is counted in `char`s, the same unit `wrap_text` uses for its
/// indent, so multi-byte characters do not make rows wrap early. A single
/// word longer than `available` is never split; it gets a row of its own.
/// Returns the rows in order, without trailing newlines; an empty `line`
/// yields no rows.
fn wrap_line_words(line: &str, available: usize) -> Vec<String> {
    let mut rows = Vec::new();
    let mut current = String::new();
    // Character width of `current`, tracked incrementally so the row is not
    // re-scanned for every word.
    let mut current_width = 0usize;

    for word in line.split(' ') {
        let word_width = word.chars().count();
        if current.is_empty() {
            current.push_str(word);
            current_width = word_width;
        } else if current_width + 1 + word_width <= available {
            current.push(' ');
            current.push_str(word);
            current_width += 1 + word_width;
        } else {
            // The word does not fit: finish this row and start the next
            // one with the word.
            rows.push(std::mem::replace(&mut current, word.to_string()));
            current_width = word_width;
        }
    }

    if !current.is_empty() {
        rows.push(current);
    }
    rows
}
/// Returns the largest byte index `<= pos` (clamped to `s.len()`) that lies
/// on a UTF-8 character boundary of `s`.
fn floor_char_boundary(s: &str, pos: usize) -> usize {
    let clamped = pos.min(s.len());
    // Index 0 is always a boundary, so the search cannot come up empty; the
    // fallback only satisfies the type checker.
    (0..=clamped)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0)
}
/// Shortens `text` so the result is at most `max_chars` bytes long, ending
/// with `...` when something was cut.
///
/// NOTE: despite the parameter name, the limit is compared against
/// `text.len()`, i.e. UTF-8 bytes, not characters. Cuts are moved back to a
/// character boundary so the result is always valid UTF-8.
///
/// * Text that already fits is returned unchanged.
/// * For `max_chars <= 3` there is no room for an ellipsis, so the text is
///   simply truncated.
/// * Otherwise the cut prefers the last space inside the budget, provided
///   that space is at or beyond `max_chars / 2`; if not, the text is cut
///   hard at the budget.
pub fn clip(text: &str, max_chars: usize) -> String {
    if text.len() <= max_chars {
        return text.to_owned();
    }
    if max_chars <= 3 {
        let cut = floor_char_boundary(text, max_chars);
        return text[..cut].to_owned();
    }

    // Reserve three bytes for the ellipsis.
    let budget = floor_char_boundary(text, max_chars - 3);
    // Both candidates (a space position and `budget`) are already char
    // boundaries, so the chosen cut needs no further adjustment.
    let cut = match text[..budget].rfind(' ') {
        Some(space) if space >= max_chars / 2 => space,
        _ => budget,
    };

    let mut clipped = String::with_capacity(cut + 3);
    clipped.push_str(&text[..cut]);
    clipped.push_str("...");
    clipped
}
/// Reports whether `needle` occurs in `haystack`, ignoring ASCII case.
///
/// Only ASCII letters are folded; non-ASCII characters must match exactly.
/// An empty `needle` is contained in every haystack.
pub fn contains_casefold(haystack: &str, needle: &str) -> bool {
    let folded_needle = needle.to_ascii_lowercase();
    let folded_haystack = haystack.to_ascii_lowercase();
    folded_haystack.contains(folded_needle.as_str())
}
/// Compiles `pattern` as a case-insensitive regular expression.
///
/// The pattern is prefixed with the inline flag `(?i)` before compilation.
/// Returns `None` if the resulting expression fails to compile.
pub fn make_regex(pattern: &str) -> Option<Regex> {
    let case_insensitive = ["(?i)", pattern].concat();
    Regex::new(&case_insensitive).ok()
}
/// Heuristically reports whether `text` contains any of the characters
/// `| * + ? { } ( ) [ ] \ ^ $`.
///
/// `.` is not in the set, so dotted text such as a file name does not count.
pub fn looks_like_regex(text: &str) -> bool {
    text.chars().any(|c| {
        matches!(
            c,
            '|' | '*' | '+' | '?' | '{' | '}' | '(' | ')' | '[' | ']' | '\\' | '^' | '$'
        )
    })
}
/// Reports whether `word` is one of the entries in `STOP_WORDS`.
///
/// The comparison is exact (case-sensitive) and the table is lowercase, so
/// `word` is expected to be lowercase already.
pub fn is_stop_word(word: &str) -> bool {
    STOP_WORDS.iter().any(|&stop| stop == word)
}
/// Returns a snippet around the first line of `text` that `regex` matches.
///
/// `text` is split on `\n`. The snippet is produced by `line_snippet_at`
/// with `context_lines` requested on each side of the match. If no line
/// matches, an empty string is returned.
pub fn line_snippet_regex(text: &str, regex: &Regex, context_lines: usize) -> String {
    let lines: Vec<&str> = text.split('\n').collect();
    match lines.iter().position(|line| regex.is_match(line)) {
        Some(hit) => line_snippet_at(&lines, hit, context_lines),
        None => String::new(),
    }
}
/// Returns a snippet around the first line of `text` that contains any of
/// `terms`, ignoring ASCII case.
///
/// `text` is split on `\n` and each line is tested with `contains_casefold`.
/// The snippet is produced by `line_snippet_at` with `context_lines`
/// requested on each side of the match. If no line contains any term (or
/// `terms` is empty), an empty string is returned.
pub fn line_snippet_terms(text: &str, terms: &[String], context_lines: usize) -> String {
    let lines: Vec<&str> = text.split('\n').collect();
    lines
        .iter()
        .position(|line| terms.iter().any(|term| contains_casefold(line, term)))
        .map(|hit| line_snippet_at(&lines, hit, context_lines))
        .unwrap_or_default()
}
/// Renders the line at `match_idx` together with up to `context_lines`
/// lines on either side, joined with `\n`.
///
/// When lines are omitted before or after the window, a marker of the form
/// `...(N lines above)` / `...(N lines below)` is added at the respective
/// end, where `N` is the number of omitted lines.
///
/// # Panics
///
/// Panics if `match_idx` is so far past the end of `lines` that the window
/// start exceeds its end (the slice range becomes invalid). Callers are
/// expected to pass an index of an existing line.
fn line_snippet_at(lines: &[&str], match_idx: usize, context_lines: usize) -> String {
    let start = match_idx.saturating_sub(context_lines);
    // Saturating arithmetic: a huge `context_lines` (e.g. `usize::MAX` for
    // "everything") must clamp to the end of the input. Plain `+` would
    // panic on overflow in debug builds and wrap in release builds.
    let end = match_idx
        .saturating_add(context_lines)
        .saturating_add(1)
        .min(lines.len());

    // Window lines plus at most two omission markers.
    let mut parts: Vec<String> = Vec::with_capacity(end.saturating_sub(start) + 2);
    if start > 0 {
        parts.push(format!("...({start} lines above)"));
    }
    parts.extend(lines[start..end].iter().map(|line| line.to_string()));
    if end < lines.len() {
        parts.push(format!("...({} lines below)", lines.len() - end));
    }
    parts.join("\n")
}
/// Measures the escape sequence whose first byte after ESC is at
/// `bytes[pos]`.
///
/// Returns the number of bytes the sequence occupies starting at `pos`
/// (the ESC byte itself is not counted), or `None` if the bytes there do
/// not form a recognized sequence:
///
/// * `[` followed by any number of bytes in `0x20..=0x3F` and terminated by
///   a byte in `0x40..=0x7E` — the length runs through that final byte.
///   A missing terminator or any other byte in between yields `None`.
/// * Any other single byte in `0x40..=0x7E` — length 1.
fn check_ansi_seq(bytes: &[u8], pos: usize) -> Option<usize> {
    match *bytes.get(pos)? {
        b'[' => {
            for (offset, &byte) in bytes[pos + 1..].iter().enumerate() {
                match byte {
                    // Parameter / intermediate bytes: keep scanning.
                    0x20..=0x3F => {}
                    // Final byte. `offset` is relative to `pos + 1`, so the
                    // span from `[` through this byte is `offset + 2` long.
                    0x40..=0x7E => return Some(offset + 2),
                    _ => return None,
                }
            }
            // Input ended before a final byte was seen.
            None
        }
        0x40..=0x7E => Some(1),
        _ => None,
    }
}
/// Reports whether `ch` is a control byte: anything below `0x20` or the
/// DEL byte `0x7F`, except newline and tab, which are not treated as
/// control bytes.
fn is_control(ch: u8) -> bool {
    match ch {
        b'\n' | b'\t' => false,
        0x00..=0x1F | 0x7F => true,
        _ => false,
    }
}