pub fn chunk_text(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
let chars: Vec<char> = text.chars().collect();
if chars.is_empty() || chunk_size == 0 {
return vec![];
}
let step = chunk_size.saturating_sub(overlap).max(1);
let mut chunks = Vec::new();
let mut start = 0;
while start < chars.len() {
let end = (start + chunk_size).min(chars.len());
chunks.push(chars[start..end].iter().collect());
start += step;
if end == chars.len() {
break;
}
}
chunks
}
pub fn extract_urls(text: &str) -> Vec<String> {
regex::Regex::new(r#"https?://[^\s<>"')\]]+"#)
.unwrap()
.find_iter(text)
.map(|m| m.as_str().to_string())
.collect()
}
pub fn strip_html(html: &str) -> String {
regex::Regex::new(r"<[^>]+>")
.unwrap()
.replace_all(html, "")
.to_string()
}
pub fn rag_search(
query: &str,
text_data: &std::collections::HashMap<String, String>,
similarity_threshold: f64,
) -> Vec<(String, String)> {
let ql = query.to_lowercase();
let qw: std::collections::HashSet<&str> = ql.split_whitespace().collect();
let mut results = Vec::new();
for (filename, content) in text_data {
let lines: Vec<&str> = content.lines().collect();
for (idx, line) in lines.iter().enumerate() {
let ll = line.to_lowercase();
let lw: std::collections::HashSet<&str> = ll.split_whitespace().collect();
if lw.is_empty() {
continue;
}
let inter = qw.intersection(&lw).count();
let score = inter as f64 / qw.len().max(1) as f64;
if score >= similarity_threshold {
let s = idx.saturating_sub(10);
let e = (idx + 11).min(lines.len());
results.push((filename.clone(), lines[s..e].join("\n")));
}
}
}
results
}
pub fn rag_search_text(query: &str, text: &str, similarity_threshold: f64) -> Vec<String> {
let ql = query.to_lowercase();
let qw: std::collections::HashSet<&str> = ql.split_whitespace().collect();
let sentences: Vec<&str> = text.split('.').collect();
let mut results = Vec::new();
for (idx, sentence) in sentences.iter().enumerate() {
let sl = sentence.to_lowercase();
let sw: std::collections::HashSet<&str> = sl.split_whitespace().collect();
if sw.is_empty() {
continue;
}
let inter = qw.intersection(&sw).count();
let score = inter as f64 / qw.len().max(1) as f64;
if score >= similarity_threshold {
let s = idx.saturating_sub(10);
let e = (idx + 11).min(sentences.len());
results.push(sentences[s..e].join(". "));
}
}
results
}
pub fn load_all_files(
directory: &str,
extensions: Option<&[&str]>,
depth: usize,
) -> std::collections::HashMap<String, String> {
let default_exts = [
".txt", ".md", ".py", ".java", ".c", ".cpp", ".html", ".css", ".js", ".ts", ".tsx", ".npc",
];
let exts = extensions.unwrap_or(&default_exts);
let mut text_data = std::collections::HashMap::new();
if depth < 1 {
return text_data;
}
let entries = match std::fs::read_dir(directory) {
Ok(e) => e,
Err(_) => return text_data,
};
for entry in entries.flatten() {
let path = entry.path();
if path.is_file() {
let ps = path.to_string_lossy().to_string();
if exts.iter().any(|ext| ps.ends_with(ext)) {
if let Ok(content) = std::fs::read_to_string(&path) {
text_data.insert(ps, content);
}
}
} else if path.is_dir() {
text_data.extend(load_all_files(
&path.to_string_lossy(),
extensions,
depth - 1,
));
}
}
text_data
}