/// Lowercase English function words (articles, conjunctions, prepositions,
/// auxiliary verbs, negations) that `is_stop_word` tests membership against.
///
/// Entries are compared exactly, so lookups are case-sensitive.
static STOP_WORDS: &[&str] = &[
"a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
"from", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does",
"did", "will", "would", "shall", "should", "may", "might", "must", "can", "could", "it", "its",
"not", "no",
];
/// Strips terminal noise from `text` and normalizes line endings.
///
/// * `\r\n` collapses to `\n`, and a lone `\r` is turned into `\n`.
/// * An ESC byte followed by a sequence accepted by `check_ansi_seq` is removed
///   together with that sequence.
/// * Every other byte flagged by `is_control` is dropped (`\n` and `\t` survive).
///
/// All remaining text is copied through unchanged. Multi-byte UTF-8 characters
/// are preserved intact: ordinary text is copied as string slices rather than
/// byte-by-byte, so non-ASCII input is no longer mangled.
pub fn sanitize(text: &str) -> String {
    let bytes = text.as_bytes();
    let mut out = String::with_capacity(text.len());
    // Start of the pending run of ordinary text that has not been copied yet.
    let mut run_start = 0;
    let mut i = 0;
    while i < bytes.len() {
        let byte = bytes[i];
        // (number of bytes to discard at `i`, optional char emitted in their place)
        let (drop_len, replacement) = if byte == b'\r' {
            let followed_by_lf = bytes.get(i + 1) == Some(&b'\n');
            (1, if followed_by_lf { None } else { Some('\n') })
        } else if byte == 0x1B {
            // A bare or malformed ESC is itself a control byte, so at least the
            // ESC is always discarded.
            (1 + check_ansi_seq(bytes, i + 1).unwrap_or(0), None)
        } else if is_control(byte) {
            (1, None)
        } else {
            // Ordinary byte (including every byte of a multi-byte character):
            // leave it in the pending run.
            i += 1;
            continue;
        };
        // Every byte discarded here is ASCII, so both ends of the pending run
        // fall on character boundaries and the slice below cannot split a char.
        out.push_str(&text[run_start..i]);
        if let Some(ch) = replacement {
            out.push(ch);
        }
        i += drop_len;
        run_start = i;
    }
    out.push_str(&text[run_start..]);
    out
}
/// Breaks `text` into word tokens.
///
/// A token is a maximal run of alphanumeric characters, `_`, `-` or `.`; any
/// other character acts as a separator and is discarded. ASCII letters in each
/// token are lowercased; non-ASCII characters are kept as they are.
pub fn split_words(text: &str) -> Vec<String> {
    let is_word_char = |c: char| c.is_alphanumeric() || matches!(c, '_' | '-' | '.');
    text.split(|c: char| !is_word_char(c))
        // Consecutive separators produce empty pieces; those are not words.
        .filter(|token| !token.is_empty())
        .map(str::to_ascii_lowercase)
        .collect()
}
/// Splits a comma-separated list into its trimmed, non-empty fields.
///
/// Surrounding whitespace is removed from every field, and fields that are
/// empty after trimming are skipped.
pub fn split_csv(csv: &str) -> Vec<String> {
    let mut fields = Vec::new();
    for raw in csv.split(',') {
        let field = raw.trim();
        if !field.is_empty() {
            fields.push(field.to_owned());
        }
    }
    fields
}
/// Concatenates `lines`, inserting `separator` between consecutive entries.
///
/// No separator is added before the first or after the last entry; an empty
/// slice yields an empty string.
pub fn join_lines(lines: &[String], separator: &str) -> String {
    let mut joined = String::new();
    for (idx, line) in lines.iter().enumerate() {
        if idx > 0 {
            joined.push_str(separator);
        }
        joined.push_str(line);
    }
    joined
}
/// Returns the largest byte index that is `<= pos`, `<= s.len()` and lies on a
/// UTF-8 character boundary of `s`.
fn floor_char_boundary(s: &str, pos: usize) -> usize {
    let upper = pos.min(s.len());
    // Index 0 is always a boundary, so the search cannot come up empty; the
    // fallback only exists to keep the expression total.
    (0..=upper)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0)
}
/// Shortens `text` so that the result occupies at most `max_chars` bytes.
///
/// Note that, despite the parameter name, the budget is compared against
/// `text.len()` and is therefore measured in bytes; cut points are moved back
/// to the nearest character boundary so a multi-byte character is never split.
///
/// * Text that already fits is returned unchanged.
/// * With a budget of three or fewer there is no room for an ellipsis, so the
///   text is simply cut.
/// * Otherwise three bytes are reserved for a trailing `...`. The cut is placed
///   at the last space inside the kept prefix when that space sits at or beyond
///   half the budget, and at the hard limit otherwise.
pub fn clip(text: &str, max_chars: usize) -> String {
    if text.len() <= max_chars {
        return text.to_owned();
    }
    if max_chars <= 3 {
        let cut = floor_char_boundary(text, max_chars);
        return text[..cut].to_owned();
    }

    let limit = floor_char_boundary(text, max_chars - 3);
    let head = &text[..limit];
    // Both candidates (a space position or `limit`) are already character
    // boundaries, so no further adjustment is required before slicing.
    let cut = match head.rfind(' ') {
        Some(space) if space >= max_chars / 2 => space,
        _ => limit,
    };

    let mut clipped = String::with_capacity(cut + 3);
    clipped.push_str(&text[..cut]);
    clipped.push_str("...");
    clipped
}
/// Reports whether `text` contains at least one of the pattern metacharacters
/// `| * + ? { } ( ) [ ] \ ^ $`.
pub fn looks_like_query(text: &str) -> bool {
    text.chars().any(|c| {
        matches!(
            c,
            '|' | '*' | '+' | '?' | '{' | '}' | '(' | ')' | '[' | ']' | '\\' | '^' | '$'
        )
    })
}
/// Reports whether `word` is one of the entries in `STOP_WORDS`.
///
/// The comparison is exact, so it is case-sensitive and no trimming is applied.
pub fn is_stop_word(word: &str) -> bool {
    STOP_WORDS.iter().any(|&stop| stop == word)
}
/// Tests whether the whole of `subject` matches the glob `pattern`.
///
/// * `*` matches any run of characters, including an empty one.
/// * `?` matches exactly one character.
/// * Every other character matches only itself.
///
/// Matching works on `char`s, not bytes. There is no escape syntax: `*` and `?`
/// in the pattern are always wildcards, so they also match a literal `*` or `?`
/// in the subject.
#[must_use]
pub fn glob_match(pattern: &str, subject: &str) -> bool {
    let pat: Vec<char> = pattern.chars().collect();
    let txt: Vec<char> = subject.chars().collect();
    let (mut pi, mut ti) = (0, 0);
    // Most recent `*`: (its index in `pat`, the subject index it currently
    // extends to). Used to retry with the star swallowing one more character.
    let mut backtrack: Option<(usize, usize)> = None;

    while ti < txt.len() {
        match pat.get(pi) {
            // The wildcard must be tested before the literal comparison;
            // otherwise a `*` facing a literal `*` in the subject is consumed
            // as one ordinary character and no backtrack point is recorded.
            Some('*') => {
                backtrack = Some((pi, ti));
                pi += 1;
            }
            Some(&p) if p == '?' || p == txt[ti] => {
                pi += 1;
                ti += 1;
            }
            _ => match backtrack.as_mut() {
                Some((star_pi, star_ti)) => {
                    // Let the last star absorb one more character and resume
                    // matching right after it.
                    *star_ti += 1;
                    pi = *star_pi + 1;
                    ti = *star_ti;
                }
                None => return false,
            },
        }
    }

    // The subject is exhausted; only trailing stars may remain in the pattern.
    pat[pi..].iter().all(|&c| c == '*')
}
/// Tests whether `pattern` matches anywhere inside `subject`.
///
/// The pattern is wrapped in a leading and a trailing `*` and handed to
/// `glob_match`, so it no longer has to cover the whole subject.
#[must_use]
pub fn glob_search(pattern: &str, subject: &str) -> bool {
    let mut wrapped = String::with_capacity(pattern.len() + 2);
    wrapped.push('*');
    wrapped.push_str(pattern);
    wrapped.push('*');
    glob_match(&wrapped, subject)
}
/// Builds a snippet around the first line of `text` that contains any of `terms`.
///
/// `text` is split on `\n`. Each line is ASCII-lowercased before the substring
/// test while the terms are used as given, so a term containing uppercase ASCII
/// letters can never match. The snippet itself is produced by `line_snippet_at`
/// from the original (not lowercased) lines. Returns an empty string when no
/// line matches.
pub fn line_snippet_terms(text: &str, terms: &[String], context_lines: usize) -> String {
    let lines: Vec<&str> = text.split('\n').collect();
    let first_hit = lines.iter().position(|raw| {
        let lowered = raw.to_ascii_lowercase();
        terms.iter().any(|term| lowered.contains(term.as_str()))
    });
    match first_hit {
        Some(idx) => line_snippet_at(&lines, idx, context_lines),
        None => String::new(),
    }
}
/// Renders `lines[match_idx]` together with up to `context_lines` lines on
/// either side, joined with `\n`.
///
/// When lines are omitted before the window a `...(N lines above)` marker is
/// prepended, and when lines are omitted after it a `...(N lines below)` marker
/// is appended.
///
/// `context_lines` may be arbitrarily large (for example `usize::MAX` to
/// request everything); the window is clamped to the available lines.
///
/// # Panics
///
/// Panics if `match_idx` is so far past the end of `lines` that the window
/// start exceeds `lines.len()`.
pub(crate) fn line_snippet_at(lines: &[&str], match_idx: usize, context_lines: usize) -> String {
    let start = match_idx.saturating_sub(context_lines);
    // Saturating arithmetic: a huge `context_lines` must clamp to the end of
    // the slice instead of overflowing.
    let end = match_idx
        .saturating_add(context_lines)
        .saturating_add(1)
        .min(lines.len());

    let mut out = Vec::new();
    if start > 0 {
        out.push(format!("...({start} lines above)"));
    }
    out.extend(lines[start..end].iter().map(|line| (*line).to_string()));
    if end < lines.len() {
        out.push(format!("...({} lines below)", lines.len() - end));
    }
    out.join("\n")
}
/// Measures the escape sequence body that starts at `bytes[pos]`, i.e. the
/// bytes immediately following an ESC.
///
/// Returns the number of bytes the body occupies (not counting the ESC), or
/// `None` when no complete sequence is recognized:
///
/// * `[` opens a sequence whose following bytes may be any number of bytes in
///   `0x20..=0x3F` and which ends at the first byte in `0x40..=0x7E`. Any other
///   byte, or running out of input first, rejects the sequence.
/// * Any other single byte in `0x40..=0x7E` is a complete one-byte sequence.
fn check_ansi_seq(bytes: &[u8], pos: usize) -> Option<usize> {
    match *bytes.get(pos)? {
        b'[' => {
            for (offset, &byte) in bytes[pos + 1..].iter().enumerate() {
                match byte {
                    0x20..=0x3F => continue,
                    // `offset` counts from the byte after `[`, so the body is
                    // `[` + `offset` parameter bytes + this final byte.
                    0x40..=0x7E => return Some(offset + 2),
                    _ => return None,
                }
            }
            // Input ended before a final byte was seen.
            None
        }
        0x40..=0x7E => Some(1),
        _ => None,
    }
}
/// Reports whether `ch` is an ASCII control byte that should be stripped.
///
/// Bytes below `0x20` and the DEL byte `0x7F` count as control bytes, except
/// for newline and tab, which are treated as ordinary text.
fn is_control(ch: u8) -> bool {
    match ch {
        b'\n' | b'\t' => false,
        0x00..=0x1F | 0x7F => true,
        _ => false,
    }
}