npcrs 0.1.15 - Docs.rs

/// Splits `text` into overlapping chunks measured in Unicode scalar values (`char`s).
///
/// Each chunk holds at most `chunk_size` characters. Consecutive chunks start
/// `chunk_size - overlap` characters apart; when `overlap >= chunk_size` the
/// stride is clamped to one character so the scan always advances.
///
/// Returns an empty vector when `text` is empty or `chunk_size` is zero. The
/// last chunk may be shorter than `chunk_size`, and no further chunks are
/// produced once a chunk reaches the end of the text.
pub fn chunk_text(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    // Work on chars (not bytes) so multi-byte characters are never split.
    let symbols: Vec<char> = text.chars().collect();
    let total = symbols.len();
    if total == 0 || chunk_size == 0 {
        return Vec::new();
    }

    let stride = std::cmp::max(chunk_size.saturating_sub(overlap), 1);
    let mut pieces: Vec<String> = Vec::new();
    let mut offset = 0usize;
    loop {
        let stop = std::cmp::min(offset + chunk_size, total);
        pieces.push(symbols[offset..stop].iter().collect::<String>());
        // Once a chunk touches the end of the input, later windows would only
        // repeat its tail, so stop here.
        if stop == total {
            break;
        }
        offset += stride;
    }
    pieces
}

pub fn extract_urls(text: &str) -> Vec<String> {
    regex::Regex::new(r#"https?://[^\s<>"')\]]+"#)
        .unwrap()
        .find_iter(text)
        .map(|m| m.as_str().to_string())
        .collect()
}

pub fn strip_html(html: &str) -> String {
    regex::Regex::new(r"<[^>]+>")
        .unwrap()
        .replace_all(html, "")
        .to_string()
}

/// Keyword-overlap search over a map of file name -> file contents.
///
/// The query and every line are lowercased and split on whitespace. A line's
/// score is the number of distinct query words it contains divided by the
/// number of distinct query words (an empty query scores 0). Blank lines are
/// skipped. For each line whose score is at least `similarity_threshold`, one
/// result is emitted: the file name paired with a context window of up to 10
/// lines before and 10 lines after the hit, joined with `\n`.
///
/// Hits within a file are reported in line order; the order across files
/// follows the map's iteration order, which is unspecified.
pub fn rag_search(
    query: &str,
    text_data: &std::collections::HashMap<String, String>,
    similarity_threshold: f64,
) -> Vec<(String, String)> {
    let query_lower = query.to_lowercase();
    let query_terms: std::collections::HashSet<&str> = query_lower.split_whitespace().collect();
    // Guard against division by zero for an empty query.
    let denominator = query_terms.len().max(1) as f64;

    let mut hits: Vec<(String, String)> = Vec::new();
    for (name, body) in text_data.iter() {
        let all_lines: Vec<&str> = body.lines().collect();
        let line_count = all_lines.len();
        for (pos, raw_line) in all_lines.iter().enumerate() {
            let lowered = raw_line.to_lowercase();
            let line_terms: std::collections::HashSet<&str> =
                lowered.split_whitespace().collect();
            if line_terms.is_empty() {
                continue;
            }
            let shared = query_terms.intersection(&line_terms).count();
            let score = shared as f64 / denominator;
            if score >= similarity_threshold {
                // Context window: 10 lines on either side, clamped to the file.
                let from = pos.saturating_sub(10);
                let to = std::cmp::min(pos + 11, line_count);
                let context = all_lines[from..to].join("\n");
                hits.push((name.clone(), context));
            }
        }
    }
    hits
}

/// Keyword-overlap search over the sentences of a single text.
///
/// `text` is split on `.`; the query and each piece are lowercased and split
/// on whitespace. A piece's score is the number of distinct query words it
/// contains divided by the number of distinct query words (an empty query
/// scores 0). Pieces with no words are skipped. For each piece scoring at
/// least `similarity_threshold`, the result holds a window of up to 10 pieces
/// before and 10 after it, joined with `". "`. Pieces are not trimmed, so any
/// whitespace that followed a `.` in the input is kept in the output.
pub fn rag_search_text(query: &str, text: &str, similarity_threshold: f64) -> Vec<String> {
    let query_lower = query.to_lowercase();
    let query_terms: std::collections::HashSet<&str> = query_lower.split_whitespace().collect();
    // Guard against division by zero for an empty query.
    let denominator = query_terms.len().max(1) as f64;

    let pieces: Vec<&str> = text.split('.').collect();
    let piece_count = pieces.len();

    pieces
        .iter()
        .enumerate()
        .filter_map(|(pos, piece)| {
            let lowered = piece.to_lowercase();
            let piece_terms: std::collections::HashSet<&str> =
                lowered.split_whitespace().collect();
            if piece_terms.is_empty() {
                return None;
            }
            let shared = query_terms.intersection(&piece_terms).count();
            let score = shared as f64 / denominator;
            if score >= similarity_threshold {
                // Context window: 10 pieces on either side, clamped to the text.
                let from = pos.saturating_sub(10);
                let to = std::cmp::min(pos + 11, piece_count);
                Some(pieces[from..to].join(". "))
            } else {
                None
            }
        })
        .collect()
}

/// Recursively reads text files under `directory` into a map of path -> contents.
///
/// * `directory` - root directory to scan.
/// * `extensions` - path suffixes to accept (e.g. `".md"`); when `None`, a
///   built-in list of common text/source extensions is used. Matching is a
///   case-sensitive suffix test on the file's full path string.
/// * `depth` - number of directory levels to scan: `0` scans nothing, `1` scans
///   only `directory` itself, `2` also scans its immediate subdirectories, etc.
///
/// Keys are the file paths rendered with `to_string_lossy`. The scan is
/// best-effort: directories that cannot be listed, entries that cannot be
/// read, and files that are not valid UTF-8 are silently skipped.
pub fn load_all_files(
    directory: &str,
    extensions: Option<&[&str]>,
    depth: usize,
) -> std::collections::HashMap<String, String> {
    /// Scans `dir` (and subdirectories while `depth` allows), adding matches to `out`.
    ///
    /// Recursing on `&Path` rather than a lossy string keeps subdirectories
    /// whose names are not valid UTF-8 reachable, and filling one shared map
    /// avoids building and merging a map per directory.
    fn collect_into(
        dir: &std::path::Path,
        exts: &[&str],
        depth: usize,
        out: &mut std::collections::HashMap<String, String>,
    ) {
        if depth < 1 {
            return;
        }
        let entries = match std::fs::read_dir(dir) {
            Ok(entries) => entries,
            Err(_) => return,
        };
        for entry in entries.flatten() {
            let path = entry.path();
            if path.is_file() {
                let key = path.to_string_lossy();
                if exts.iter().any(|ext| key.ends_with(ext)) {
                    if let Ok(content) = std::fs::read_to_string(&path) {
                        out.insert(key.into_owned(), content);
                    }
                }
            } else if path.is_dir() {
                collect_into(&path, exts, depth - 1, out);
            }
        }
    }

    let default_exts = [
        ".txt", ".md", ".py", ".java", ".c", ".cpp", ".html", ".css", ".js", ".ts", ".tsx", ".npc",
    ];
    let exts = extensions.unwrap_or(&default_exts);
    let mut text_data = std::collections::HashMap::new();
    collect_into(std::path::Path::new(directory), exts, depth, &mut text_data);
    text_data
}