goosedump 0.1.3

Coding agent context data browser
// SPDX-License-Identifier: LGPL-2.1-or-later
// Copyright (C) Jarkko Sakkinen 2026

use regex::Regex;

/// Lowercase English function words (articles, conjunctions, prepositions,
/// auxiliary and modal verbs, and a few others) consulted by `is_stop_word`.
///
/// Lookups compare entries exactly, so only lowercase input matches.
static STOP_WORDS: &[&str] = &[
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
    "from", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does",
    "did", "will", "would", "shall", "should", "may", "might", "must", "can", "could", "it", "its",
    "not", "no",
];

pub fn sanitize(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let bytes = text.as_bytes();
    let mut i = 0;

    while i < bytes.len() {
        let ch = bytes[i];

        if ch == b'\r' {
            if i + 1 >= bytes.len() || bytes[i + 1] != b'\n' {
                out.push('\n');
            }
            i += 1;
            continue;
        }

        if ch == 0x1B
            && i + 1 < bytes.len()
            && let Some(seq_len) = check_ansi_seq(bytes, i + 1)
        {
            i += 1 + seq_len;
            continue;
        }

        if !is_control(ch) {
            out.push(ch as char);
        }
        i += 1;
    }

    out
}

/// Breaks `text` into ASCII-lowercased word tokens.
///
/// A token is a maximal run of alphanumeric characters plus `_`, `-` and `.`;
/// every other character acts as a separator and empty tokens are discarded.
/// Only ASCII letters are lowercased; other characters are kept as they are.
pub fn split_words(text: &str) -> Vec<String> {
    text.split(|c: char| !(c.is_alphanumeric() || matches!(c, '_' | '-' | '.')))
        .filter(|token| !token.is_empty())
        .map(str::to_ascii_lowercase)
        .collect()
}

/// Splits a comma-separated list into its trimmed, non-empty fields.
///
/// Fields that are empty or contain only whitespace are skipped.
pub fn split_csv(csv: &str) -> Vec<String> {
    let mut fields = Vec::new();
    for raw in csv.split(',') {
        let field = raw.trim();
        if !field.is_empty() {
            fields.push(field.to_owned());
        }
    }
    fields
}

/// Concatenates `lines`, placing `separator` between consecutive entries.
///
/// No separator is added before the first or after the last entry.
pub fn join_lines(lines: &[String], separator: &str) -> String {
    let mut joined = String::new();
    for (idx, line) in lines.iter().enumerate() {
        if idx > 0 {
            joined.push_str(separator);
        }
        joined.push_str(line);
    }
    joined
}

pub fn wrap_text(text: &str, width: usize, indent: &str) -> String {
    let indent_len = indent.chars().count();
    let available = width.saturating_sub(indent_len);
    if available == 0 {
        return String::new();
    }

    let mut out = String::new();
    for line in text.lines() {
        if line.is_empty() {
            out.push('\n');
            continue;
        }
        let wrapped = wrap_line_words(line, available);
        for w in &wrapped {
            out.push_str(indent);
            out.push_str(w);
            out.push('\n');
        }
    }
    out.trim_end_matches('\n').to_string()
}

/// Greedily packs the space-separated words of `line` into rows of at most
/// `available` characters.
///
/// Widths are counted in characters (not bytes), matching how `wrap_text`
/// computes the budget from its indent. A single word longer than
/// `available` is never split and ends up on a row of its own. Empty rows
/// are not emitted.
fn wrap_line_words(line: &str, available: usize) -> Vec<String> {
    let mut lines = Vec::new();
    let mut current = String::new();
    // Character count of `current`, tracked incrementally to avoid
    // re-scanning the row for every word.
    let mut current_width = 0usize;

    for word in line.split(' ') {
        let word_width = word.chars().count();
        if current.is_empty() {
            current.push_str(word);
            current_width = word_width;
        } else if current_width + 1 + word_width <= available {
            current.push(' ');
            current.push_str(word);
            current_width += 1 + word_width;
        } else {
            // Row is full: emit it and start the next one with this word.
            lines.push(std::mem::take(&mut current));
            current.push_str(word);
            current_width = word_width;
        }
    }

    if !current.is_empty() {
        lines.push(current);
    }

    lines
}

/// Returns the largest byte index `<= pos` (capped at `s.len()`) that lies on
/// a UTF-8 character boundary of `s`.
fn floor_char_boundary(s: &str, pos: usize) -> usize {
    let upper = pos.min(s.len());
    // Index 0 is always a boundary, so the search cannot come up empty.
    (0..=upper)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0)
}

/// Shortens `text` to fit within `max_chars`, appending `...` when it is cut.
///
/// Lengths are measured in bytes (`str::len`), so despite the parameter name
/// the budget is a byte budget; cuts are always moved back to a character
/// boundary. Text that already fits is returned unchanged. With a budget of
/// three or less there is no room for the ellipsis and the text is simply
/// truncated. Otherwise the cut prefers the last space before the limit, as
/// long as that space sits at or beyond half of `max_chars`.
pub fn clip(text: &str, max_chars: usize) -> String {
    if text.len() <= max_chars {
        return text.to_owned();
    }
    if max_chars <= 3 {
        return text[..floor_char_boundary(text, max_chars)].to_owned();
    }

    // Reserve three bytes for the ellipsis.
    let limit = floor_char_boundary(text, max_chars - 3);
    let cut = match text[..limit].rfind(' ') {
        Some(space) if space >= max_chars / 2 => space,
        _ => limit,
    };
    let cut = floor_char_boundary(text, cut);

    let mut clipped = String::with_capacity(cut + 3);
    clipped.push_str(&text[..cut]);
    clipped.push_str("...");
    clipped
}

/// Reports whether `needle` occurs in `haystack`, ignoring ASCII case.
///
/// Only ASCII letters are folded; non-ASCII characters must match exactly.
/// An empty `needle` always matches.
pub fn contains_casefold(haystack: &str, needle: &str) -> bool {
    let folded_haystack = haystack.to_ascii_lowercase();
    let folded_needle = needle.to_ascii_lowercase();
    folded_haystack.contains(folded_needle.as_str())
}

/// Compiles `pattern` as a case-insensitive regular expression.
///
/// The pattern is prefixed with the `(?i)` flag before compilation. Returns
/// `None` when the resulting expression fails to compile.
pub fn make_regex(pattern: &str) -> Option<Regex> {
    let mut source = String::with_capacity(4 + pattern.len());
    source.push_str("(?i)");
    source.push_str(pattern);
    match Regex::new(&source) {
        Ok(compiled) => Some(compiled),
        Err(_) => None,
    }
}

/// Heuristically decides whether `text` is meant as a regular expression.
///
/// Returns `true` if it contains any of the metacharacters
/// `| * + ? { } ( ) [ ] \ ^ $`. A plain `.` does not count.
pub fn looks_like_regex(text: &str) -> bool {
    text.chars().any(|c| {
        matches!(
            c,
            '|' | '*' | '+' | '?' | '{' | '}' | '(' | ')' | '[' | ']' | '\\' | '^' | '$'
        )
    })
}

/// Reports whether `word` is one of the entries in `STOP_WORDS`.
///
/// The comparison is exact, so callers are expected to pass lowercase words.
pub fn is_stop_word(word: &str) -> bool {
    STOP_WORDS.iter().any(|&candidate| candidate == word)
}

/// Returns a snippet around the first line of `text` that matches `regex`.
///
/// The text is split on `\n`; the snippet contains up to `context_lines`
/// lines on each side of the match, as rendered by `line_snippet_at`. An
/// empty string is returned when no line matches.
pub fn line_snippet_regex(text: &str, regex: &Regex, context_lines: usize) -> String {
    let lines: Vec<&str> = text.split('\n').collect();
    match lines.iter().position(|line| regex.is_match(line)) {
        Some(hit) => line_snippet_at(&lines, hit, context_lines),
        None => String::new(),
    }
}

/// Returns a snippet around the first line of `text` that contains any of
/// `terms`, compared with ASCII case folding.
///
/// The text is split on `\n`; the snippet contains up to `context_lines`
/// lines on each side of the match, as rendered by `line_snippet_at`. An
/// empty string is returned when no line contains any term.
pub fn line_snippet_terms(text: &str, terms: &[String], context_lines: usize) -> String {
    let lines: Vec<&str> = text.split('\n').collect();
    let hit = lines
        .iter()
        .position(|line| terms.iter().any(|term| contains_casefold(line, term)));
    match hit {
        Some(idx) => line_snippet_at(&lines, idx, context_lines),
        None => String::new(),
    }
}

/// Renders the lines surrounding `lines[match_idx]` as one newline-joined
/// string.
///
/// Up to `context_lines` lines before and after the match are included. When
/// lines are omitted at either end, a `...(N lines above)` or
/// `...(N lines below)` marker is added stating how many were left out.
fn line_snippet_at(lines: &[&str], match_idx: usize, context_lines: usize) -> String {
    let start = match_idx.saturating_sub(context_lines);
    // Saturating arithmetic mirrors `start` and keeps a very large
    // `context_lines` (e.g. `usize::MAX` for "everything") from overflowing.
    let end = match_idx
        .saturating_add(context_lines)
        .saturating_add(1)
        .min(lines.len());
    let mut out = Vec::new();

    if start > 0 {
        out.push(format!("...({start} lines above)"));
    }
    out.extend(lines[start..end].iter().map(|line| line.to_string()));
    if end < lines.len() {
        out.push(format!("...({} lines below)", lines.len() - end));
    }
    out.join("\n")
}

/// Measures the escape sequence whose first byte after ESC is `bytes[pos]`.
///
/// Returns the number of bytes, starting at `pos`, that belong to the
/// sequence, or `None` if no sequence is recognised:
///
/// * `[` starts a control sequence: any number of bytes in `0x20..=0x3F`
///   followed by one final byte in `0x40..=0x7E`. An unterminated or
///   malformed sequence yields `None`.
/// * Any other byte in `0x40..=0x7E` is treated as a one-byte escape.
fn check_ansi_seq(bytes: &[u8], pos: usize) -> Option<usize> {
    match *bytes.get(pos)? {
        b'[' => {
            let tail = &bytes[pos + 1..];
            // Parameter and intermediate bytes share one contiguous range.
            let body_len = tail
                .iter()
                .take_while(|b| matches!(**b, 0x20..=0x3F))
                .count();
            match tail.get(body_len).copied() {
                // `[` + body + final byte.
                Some(0x40..=0x7E) => Some(body_len + 2),
                _ => None,
            }
        }
        0x40..=0x7E => Some(1),
        _ => None,
    }
}

/// Reports whether `ch` is an ASCII control byte that should be stripped.
///
/// Covers bytes below `0x20` and DEL (`0x7f`), except for newline and tab,
/// which are kept.
fn is_control(ch: u8) -> bool {
    !matches!(ch, b'\n' | b'\t') && matches!(ch, 0x00..=0x1F | 0x7F)
}