goosedump 0.5.2

Coding agent context data browser
// SPDX-License-Identifier: LGPL-2.1-or-later
// Copyright (C) Jarkko Sakkinen 2026

/// Lowercase English function words (articles, conjunctions, prepositions,
/// auxiliaries, and a few pronouns/negations) consulted by `is_stop_word`.
/// Lookups are exact string comparisons, so entries must stay lowercase to
/// match lowercased input.
static STOP_WORDS: &[&str] = &[
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
    "from", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does",
    "did", "will", "would", "shall", "should", "may", "might", "must", "can", "could", "it", "its",
    "not", "no",
];

/// Strip terminal noise from `text` while preserving its UTF-8 content.
///
/// * `\r\n` collapses to `\n`, and a lone `\r` is turned into `\n`.
/// * An ESC byte followed by a sequence recognised by `check_ansi_seq` is
///   removed together with that sequence.
/// * Any other byte for which `is_control` is true is dropped (`\n` and `\t`
///   are kept).
///
/// Everything else, including multi-byte characters, is copied verbatim.
pub fn sanitize(text: &str) -> String {
    let bytes = text.as_bytes();
    let mut out = String::with_capacity(text.len());
    // Start of the pending run of bytes that will be copied unchanged.
    let mut keep_from = 0;
    let mut i = 0;

    while i < bytes.len() {
        let ch = bytes[i];
        let escape_len = if ch == 0x1B {
            check_ansi_seq(bytes, i + 1)
        } else {
            None
        };

        // How many bytes to drop at `i`, and what (if anything) replaces them.
        let (drop_len, replacement) = if ch == b'\r' {
            let lone = bytes.get(i + 1) != Some(&b'\n');
            (1, if lone { Some('\n') } else { None })
        } else if let Some(seq_len) = escape_len {
            (1 + seq_len, None)
        } else if is_control(ch) {
            (1, None)
        } else {
            i += 1;
            continue;
        };

        // Every dropped byte is ASCII (CR, ESC, escape-sequence bytes, control
        // bytes), so both ends of the kept run fall on char boundaries and the
        // run can be copied as a `&str` slice. Pushing individual bytes as
        // `char` would split multi-byte characters into Latin-1 mojibake.
        out.push_str(&text[keep_from..i]);
        if let Some(c) = replacement {
            out.push(c);
        }
        i += drop_len;
        keep_from = i;
    }

    out.push_str(&text[keep_from..]);
    out
}

/// Break `text` into lowercase word tokens.
///
/// A token is a maximal run of alphanumeric characters (Unicode-aware) plus
/// `_`, `-` and `.`; any other character acts as a separator. Only ASCII
/// letters are lowercased, other characters are kept as they are.
pub fn split_words(text: &str) -> Vec<String> {
    let is_separator = |c: char| !(c.is_alphanumeric() || matches!(c, '_' | '-' | '.'));

    text.split(is_separator)
        .filter(|token| !token.is_empty())
        .map(str::to_ascii_lowercase)
        .collect()
}

/// Split a comma-separated list into its fields.
///
/// Each field is trimmed of surrounding whitespace; fields that are empty
/// after trimming are discarded. No quoting or escaping is interpreted.
pub fn split_csv(csv: &str) -> Vec<String> {
    let mut fields = Vec::new();
    for raw in csv.split(',') {
        let field = raw.trim();
        if !field.is_empty() {
            fields.push(field.to_string());
        }
    }
    fields
}

/// Concatenate `lines`, placing `separator` between consecutive entries
/// (never before the first or after the last).
pub fn join_lines(lines: &[String], separator: &str) -> String {
    let mut joined = String::new();
    for (idx, line) in lines.iter().enumerate() {
        if idx > 0 {
            joined.push_str(separator);
        }
        joined.push_str(line);
    }
    joined
}

/// Return the largest byte index `<= pos` (clamped to `s.len()`) that lies on
/// a UTF-8 character boundary of `s`.
fn floor_char_boundary(s: &str, pos: usize) -> usize {
    let upper = pos.min(s.len());
    // Index 0 is always a boundary, so the search cannot come up empty.
    (0..=upper)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0)
}

/// Shorten `text` so the result is at most `max_chars` bytes long.
///
/// Note that, despite the parameter name, the limit is measured in bytes of
/// UTF-8, not in characters; cuts are always moved back to a character
/// boundary.
///
/// * Text that already fits is returned unchanged.
/// * With a limit of three or fewer there is no room for an ellipsis, so the
///   text is simply truncated.
/// * Otherwise the text is cut to leave room for a trailing `...`, preferring
///   the last space before the cut as long as that space is not in the first
///   half of the limit.
pub fn clip(text: &str, max_chars: usize) -> String {
    const ELLIPSIS_LEN: usize = 3;

    if text.len() <= max_chars {
        return text.to_string();
    }
    if max_chars <= ELLIPSIS_LEN {
        return text[..floor_char_boundary(text, max_chars)].to_string();
    }

    let limit = floor_char_boundary(text, max_chars - ELLIPSIS_LEN);
    let head = &text[..limit];
    // A space is ASCII, so its index is already a valid boundary.
    let cut = match head.rfind(' ') {
        Some(space) if space >= max_chars / 2 => space,
        _ => limit,
    };

    let mut clipped = String::with_capacity(cut + ELLIPSIS_LEN);
    clipped.push_str(&head[..cut]);
    clipped.push_str("...");
    clipped
}

/// Report whether `word` appears in `STOP_WORDS`.
///
/// The comparison is exact, so callers are expected to pass an already
/// lowercased word.
pub fn is_stop_word(word: &str) -> bool {
    STOP_WORDS.iter().any(|&candidate| candidate == word)
}

/// Match `subject` against a shell-style glob `pattern`, anchored at both ends.
///
/// `*` matches any run of characters (including none) and `?` matches exactly
/// one; every other character is a literal. There is no `/` specialness, so
/// the whole string is treated uniformly. There is also no escape syntax:
/// `*` and `?` in the pattern are always wildcards, while the same characters
/// in `subject` are ordinary text.
///
/// Matching is per `char`, using single-star backtracking: on a mismatch the
/// most recent `*` is extended by one more character and matching resumes
/// right after it.
#[must_use]
pub fn glob_match(pattern: &str, subject: &str) -> bool {
    let pat: Vec<char> = pattern.chars().collect();
    let txt: Vec<char> = subject.chars().collect();
    let (mut pi, mut ti) = (0, 0);
    // Pattern index of the most recent `*`, and the subject index it has
    // currently been stretched to.
    let mut star: Option<usize> = None;
    let mut star_ti = 0;

    while ti < txt.len() {
        match pat.get(pi).copied() {
            // The wildcard must be tested before literal equality: otherwise
            // a `*` in the subject would consume the pattern's `*` as a plain
            // character and no backtrack point would be recorded.
            Some('*') => {
                star = Some(pi);
                star_ti = ti;
                pi += 1;
            }
            Some(p) if p == '?' || p == txt[ti] => {
                pi += 1;
                ti += 1;
            }
            _ => match star {
                // Mismatch: let the last `*` swallow one more character.
                Some(star_pi) => {
                    pi = star_pi + 1;
                    star_ti += 1;
                    ti = star_ti;
                }
                None => return false,
            },
        }
    }

    // Subject exhausted: only trailing `*`s may remain in the pattern.
    pat[pi..].iter().all(|&c| c == '*')
}

/// Report whether the glob `pattern` matches some substring of `subject`.
///
/// This is `glob_match` with a `*` added on each side of the pattern, so the
/// match may start and end anywhere in `subject`.
#[must_use]
pub fn glob_search(pattern: &str, subject: &str) -> bool {
    let mut unanchored = String::with_capacity(pattern.len() + 2);
    unanchored.push('*');
    unanchored.push_str(pattern);
    unanchored.push('*');
    glob_match(&unanchored, subject)
}

/// Produce a snippet centred on the first line of `text` that contains a
/// match for the glob `pattern`.
///
/// Each line is ASCII-lowercased before matching, so `pattern` is expected to
/// be lowercase already. Up to `context_lines` lines are included on each
/// side of the hit. When no individual line matches (for instance because the
/// match would have to span a line break) the result falls back to `text`
/// clipped to a limit of 200.
pub fn line_snippet_glob(text: &str, pattern: &str, context_lines: usize) -> String {
    let lines: Vec<&str> = text.split('\n').collect();
    let hit = lines
        .iter()
        .position(|line| glob_search(pattern, &line.to_ascii_lowercase()));

    match hit {
        Some(idx) => line_snippet_at(&lines, idx, context_lines),
        None => clip(text, 200),
    }
}

/// Produce a snippet centred on the first line of `text` that contains any of
/// `terms` as a substring.
///
/// Each line is ASCII-lowercased before the comparison while the terms are
/// used as given, so they are presumably expected to be lowercase already.
/// Up to `context_lines` lines are included on each side of the hit. Returns
/// an empty string when no line contains any term.
pub fn line_snippet_terms(text: &str, terms: &[String], context_lines: usize) -> String {
    let lines: Vec<&str> = text.split('\n').collect();
    let hit = lines.iter().position(|line| {
        let lowered = line.to_ascii_lowercase();
        terms.iter().any(|term| lowered.contains(term.as_str()))
    });

    match hit {
        Some(idx) => line_snippet_at(&lines, idx, context_lines),
        None => String::new(),
    }
}

/// Render the lines surrounding `lines[match_idx]` as a newline-joined snippet.
///
/// Up to `context_lines` lines are taken on each side of the match. When lines
/// are omitted, a `...(N lines above)` and/or `...(N lines below)` marker
/// line states how many.
///
/// The window arithmetic saturates, so an arbitrarily large `context_lines`
/// (e.g. `usize::MAX` for "everything") selects the whole input. A
/// `match_idx` past the end selects no lines and yields only the "above"
/// marker instead of panicking.
pub(crate) fn line_snippet_at(lines: &[&str], match_idx: usize, context_lines: usize) -> String {
    // Exclusive end of the window, clamped to the input length.
    let end = match_idx
        .saturating_add(context_lines)
        .saturating_add(1)
        .min(lines.len());
    // Clamp to `end` so the slice below stays valid even if `match_idx` is
    // out of range.
    let start = match_idx.saturating_sub(context_lines).min(end);

    let mut out = Vec::with_capacity(end - start + 2);
    if start > 0 {
        out.push(format!("...({start} lines above)"));
    }
    out.extend(lines[start..end].iter().map(|line| (*line).to_string()));
    if end < lines.len() {
        out.push(format!("...({} lines below)", lines.len() - end));
    }
    out.join("\n")
}

/// Measure the escape sequence whose first byte after ESC is `bytes[pos]`.
///
/// Returns the number of bytes the sequence occupies starting at `pos` (the
/// ESC byte itself is not counted), or `None` when the bytes there do not
/// form a recognised sequence.
///
/// * `[` starts a CSI sequence: any number of bytes in `0x20..=0x3F`
///   followed by one final byte in `0x40..=0x7E`. An unterminated or
///   malformed CSI yields `None`.
/// * Any other byte in `0x40..=0x7E` is treated as a complete two-byte
///   escape, giving `Some(1)`.
///
/// NOTE(review): string-type sequences such as OSC (`ESC ]`) fall under the
/// two-byte rule, so only their introducer is recognised here, not their
/// payload or terminator.
fn check_ansi_seq(bytes: &[u8], pos: usize) -> Option<usize> {
    match *bytes.get(pos)? {
        b'[' => {
            let body = &bytes[pos + 1..];
            let params = body
                .iter()
                .take_while(|&&b| (0x20..=0x3F).contains(&b))
                .count();
            match body.get(params) {
                // `[` + parameter/intermediate bytes + final byte.
                Some(fin) if (0x40..=0x7E).contains(fin) => Some(params + 2),
                _ => None,
            }
        }
        0x40..=0x7E => Some(1),
        _ => None,
    }
}

/// Report whether `ch` is an ASCII control byte that should be stripped:
/// anything below `0x20` or DEL (`0x7f`), except newline and tab.
fn is_control(ch: u8) -> bool {
    ch.is_ascii_control() && !matches!(ch, b'\n' | b'\t')
}