use std::collections::HashMap;
use std::path::Path;
use crate::brain::rsi_proposals::ProposedBrainDedup;
/// File names, resolved relative to the brain directory, that
/// `scan_brain_files` reads when looking for duplicated lines.
///
/// Files are visited in this order. Not every entry has an explicit rank in
/// `canonical_file_rank`; unranked files sort after all ranked ones.
const BRAIN_FILES_TO_SCAN: &[&str] = &[
"SOUL.md",
"USER.md",
"AGENTS.md",
"CODE.md",
"TOOLS.md",
"SECURITY.md",
"MEMORY.md",
"BOOT.md",
"BOOTSTRAP.md",
"HEARTBEAT.md",
];
/// Minimum length, in bytes after trimming, a line must have to take part in
/// duplicate detection. Shorter lines are treated as structural and skipped.
const MIN_LINE_LEN: usize = 10;
/// Minimum number of occurrences (across all scanned files) before a line is
/// reported as a duplicate cluster.
const MIN_DUPLICATE_COUNT: usize = 2;
/// Returns the priority of `filename` when choosing which file keeps the
/// canonical copy of a duplicated line. Lower values win.
///
/// File names that are not in the priority table (the comparison is exact and
/// case-sensitive) get `u8::MAX`, so they sort after every ranked file.
pub(crate) fn canonical_file_rank(filename: &str) -> u8 {
    // Ordered from most to least canonical; the position is the rank.
    const PRIORITY: [&str; 7] = [
        "SOUL.md",
        "AGENTS.md",
        "TOOLS.md",
        "CODE.md",
        "SECURITY.md",
        "MEMORY.md",
        "USER.md",
    ];

    PRIORITY
        .iter()
        .zip(0u8..)
        .find_map(|(&name, rank)| (name == filename).then_some(rank))
        .unwrap_or(u8::MAX)
}
/// Reports whether `line` should be ignored by duplicate detection.
///
/// After trimming surrounding whitespace, a line is considered structural when
/// any of the following holds:
/// - it is empty or shorter than `MIN_LINE_LEN` bytes;
/// - it starts with `#` (heading);
/// - it consists only of `-`, `=`, `*` and `_` characters (rule / underline);
/// - it starts and ends with `|` and contains `---` (table separator row);
/// - it starts with `>` and is shorter than 20 bytes (short quote).
pub(crate) fn is_structural_line(line: &str) -> bool {
    let text = line.trim();

    text.is_empty()
        || text.len() < MIN_LINE_LEN
        || text.starts_with('#')
        || text.chars().all(|ch| matches!(ch, '-' | '=' | '*' | '_'))
        || (text.starts_with('|') && text.ends_with('|') && text.contains("---"))
        || (text.starts_with('>') && text.len() < 20)
}
/// A line of text that occurs more than once across the scanned brain files.
#[derive(Debug, Clone)]
pub struct DuplicateCluster {
/// The duplicated line, with surrounding whitespace trimmed.
pub text: String,
/// `(file name, line numbers)` pairs. As produced by `scan_brain_files`,
/// line numbers are 1-based and the pairs are ordered by
/// `canonical_file_rank`, then by file name, so the first entry is the
/// canonical location.
pub locations: Vec<(String, Vec<usize>)>,
/// Total number of occurrences across all files.
pub total_count: usize,
}
pub fn scan_brain_files(brain_path: &Path) -> Vec<DuplicateCluster> {
let mut line_occurrences: HashMap<String, Vec<(String, usize)>> = HashMap::new();
for filename in BRAIN_FILES_TO_SCAN {
let file_path = brain_path.join(filename);
if !file_path.exists() {
continue;
}
let Ok(content) = std::fs::read_to_string(&file_path) else {
continue;
};
for (line_idx, line) in content.lines().enumerate() {
if is_structural_line(line) {
continue;
}
let normalized = line.trim().to_string();
if normalized.len() < MIN_LINE_LEN {
continue;
}
line_occurrences
.entry(normalized)
.or_default()
.push((filename.to_string(), line_idx + 1));
}
}
let mut clusters: Vec<DuplicateCluster> = Vec::new();
for (text, locations) in line_occurrences {
let total_count: usize = locations.len();
if total_count < MIN_DUPLICATE_COUNT {
continue;
}
let mut by_file: HashMap<String, Vec<usize>> = HashMap::new();
for (file, line) in &locations {
by_file.entry(file.clone()).or_default().push(*line);
}
let mut loc_vec: Vec<(String, Vec<usize>)> = by_file.into_iter().collect();
loc_vec.sort_by(|a, b| {
canonical_file_rank(&a.0)
.cmp(&canonical_file_rank(&b.0))
.then(a.0.cmp(&b.0))
});
clusters.push(DuplicateCluster {
text,
locations: loc_vec,
total_count,
});
}
clusters.sort_by(|a, b| b.total_count.cmp(&a.total_count).then(a.text.cmp(&b.text)));
clusters
}
/// Converts one duplicate cluster into removal proposals.
///
/// The first entry of `cluster.locations` is treated as the canonical file and
/// its first line as the canonical copy (`duplicate_of`).
///
/// - If the cluster lives in a single file, one proposal is produced that
///   removes every occurrence after the first; nothing is produced when there
///   is only one occurrence.
/// - If the cluster spans several files, one proposal is produced per
///   non-canonical file, covering all of that file's occurrences. Additional
///   copies inside the canonical file itself are not proposed for removal.
///
/// Returns an empty vector when the cluster has no locations or when the
/// canonical location carries no line numbers (previously the latter could
/// panic on an out-of-bounds index).
pub fn cluster_to_proposals(cluster: &DuplicateCluster) -> Vec<ProposedBrainDedup> {
    let Some((canonical_file, canonical_lines)) = cluster.locations.first() else {
        return Vec::new();
    };
    // Without a canonical line there is nothing to point `duplicate_of` at.
    let Some((&canonical_first, canonical_rest)) = canonical_lines.split_first() else {
        return Vec::new();
    };

    if cluster.locations.len() == 1 {
        // Single file: keep the first occurrence, remove the remaining ones.
        if canonical_rest.is_empty() {
            return Vec::new();
        }
        return build_proposal(
            cluster,
            canonical_file,
            canonical_first,
            canonical_file,
            canonical_rest,
        )
        .into_iter()
        .collect();
    }

    // Several files: every non-canonical file gets its own proposal.
    cluster
        .locations
        .iter()
        .skip(1)
        .filter_map(|(other_file, other_lines)| {
            build_proposal(
                cluster,
                canonical_file,
                canonical_first,
                other_file,
                other_lines,
            )
        })
        .collect()
}
/// Legacy single-proposal variant: returns only the first proposal that
/// `cluster_to_proposals` would produce, or `None` when there is none.
#[deprecated(note = "use cluster_to_proposals for N-1 per-file proposals")]
pub fn cluster_to_proposal(cluster: &DuplicateCluster) -> Option<ProposedBrainDedup> {
    let mut proposals = cluster_to_proposals(cluster);
    // Keep just the head of the list and hand it out.
    proposals.truncate(1);
    proposals.pop()
}
/// Builds a single dedup proposal that removes `lines_to_remove` from
/// `target_file`, pointing at `canonical_file:canonical_first_line` as the
/// copy that is kept.
///
/// Returns `None` when `lines_to_remove` is empty. `line_range` is the single
/// line number, or `"min-max"` when several lines are given; `count` is the
/// number of lines passed in.
///
/// NOTE(review): for non-contiguous lines the `"min-max"` form also spans the
/// lines in between — confirm consumers of `line_range` expect that.
fn build_proposal(
    cluster: &DuplicateCluster,
    canonical_file: &str,
    canonical_first_line: usize,
    target_file: &str,
    lines_to_remove: &[usize],
) -> Option<ProposedBrainDedup> {
    let line_range = match lines_to_remove {
        [] => return None,
        [only] => only.to_string(),
        several => {
            // `several` is non-empty here, so both extremes exist.
            let lowest = several.iter().min()?;
            let highest = several.iter().max()?;
            format!("{lowest}-{highest}")
        }
    };

    Some(ProposedBrainDedup {
        target_file: target_file.to_owned(),
        duplicate_text: cluster.text.clone(),
        line_range,
        duplicate_of: format!("{canonical_file}:{canonical_first_line}"),
        count: lines_to_remove.len(),
        warnings: Vec::new(),
    })
}
/// Scans the brain directory and produces a `(proposal, rationale)` pair for
/// every duplicate that should be removed.
///
/// The rationale quotes at most the first 80 bytes of the duplicated text,
/// cut back to a UTF-8 character boundary so that non-ASCII text cannot cause
/// a slicing panic.
///
/// After all proposals are staged, the planned removals are accumulated per
/// target file and passed to `compute_stub_risk`; any headers it reports are
/// attached as `warnings` to every proposal targeting that file.
pub fn generate_dedup_proposals(brain_path: &Path) -> Vec<(ProposedBrainDedup, String)> {
    /// Longest prefix of `text` that is at most `max_bytes` long and ends on a
    /// character boundary.
    fn preview_prefix(text: &str, max_bytes: usize) -> &str {
        let mut end = text.len().min(max_bytes);
        // Offset 0 is always a boundary, so this terminates.
        while !text.is_char_boundary(end) {
            end -= 1;
        }
        &text[..end]
    }

    const PREVIEW_MAX_BYTES: usize = 80;

    let clusters = scan_brain_files(brain_path);
    let mut planned_removals: HashMap<String, Vec<usize>> = HashMap::new();
    let mut staged: Vec<(ProposedBrainDedup, String)> = Vec::new();

    for cluster in &clusters {
        let preview = preview_prefix(&cluster.text, PREVIEW_MAX_BYTES);
        for proposal in cluster_to_proposals(cluster) {
            let rationale = format!(
                "Found '{}' appearing {} times across brain files. \
                 Keeping canonical copy at {}, removing duplicate(s).",
                preview, cluster.total_count, proposal.duplicate_of,
            );
            // NOTE(review): a "min-max" range expands to every line in
            // between, which may include lines that are not duplicates —
            // confirm this matches how proposals are applied.
            planned_removals
                .entry(proposal.target_file.clone())
                .or_default()
                .extend(parse_line_range(&proposal.line_range));
            staged.push((proposal, rationale));
        }
    }

    // Evaluate stub risk once per file, using all removals planned for it.
    let stub_risk_by_file: HashMap<String, Vec<String>> = planned_removals
        .iter()
        .map(|(filename, removed)| {
            (
                filename.clone(),
                compute_stub_risk(brain_path, filename, removed),
            )
        })
        .collect();

    staged
        .into_iter()
        .map(|(mut proposal, rationale)| {
            if let Some(warnings) = stub_risk_by_file
                .get(&proposal.target_file)
                .filter(|warnings| !warnings.is_empty())
            {
                proposal.warnings = warnings.clone();
            }
            (proposal, rationale)
        })
        .collect()
}
/// Expands a line-range string into the individual line numbers it covers.
///
/// Accepts either a single number (`"7"`) or an inclusive `start-end` pair
/// (`"3-6"`); whitespace around the numbers is ignored. Anything that does not
/// parse, or a pair whose end is smaller than its start, yields an empty
/// vector.
fn parse_line_range(range: &str) -> Vec<usize> {
    let number = |piece: &str| piece.trim().parse::<usize>().ok();

    match range.split_once('-') {
        Some((lo, hi)) => match (number(lo), number(hi)) {
            (Some(first), Some(last)) if first <= last => (first..=last).collect(),
            _ => Vec::new(),
        },
        // No dash: either a lone line number or garbage.
        None => number(range).into_iter().collect(),
    }
}
/// Determines which headers `headers_with_empty_body` reports for `filename`
/// only after the given 1-based lines are removed.
///
/// The file is read from `brain_path`, the lines listed in `removed` are
/// dropped, and the headers reported for the shortened text are compared with
/// those reported for the original. The result contains the headers that
/// appear only after the removal, sorted. An unreadable file yields an empty
/// vector.
fn compute_stub_risk(brain_path: &Path, filename: &str, removed: &[usize]) -> Vec<String> {
    let content = match std::fs::read_to_string(brain_path.join(filename)) {
        Ok(text) => text,
        Err(_) => return Vec::new(),
    };

    let doomed: std::collections::HashSet<usize> = removed.iter().copied().collect();
    let remaining: Vec<&str> = content
        .lines()
        .zip(1usize..)
        .filter(|(_, line_no)| !doomed.contains(line_no))
        .map(|(line, _)| line)
        .collect();

    let before = headers_with_empty_body(&content);
    let after = headers_with_empty_body(&remaining.join("\n"));

    // Set elements are already unique, so sorting alone gives a stable result.
    let mut newly_reported: Vec<String> = after.difference(&before).cloned().collect();
    newly_reported.sort();
    newly_reported
}
/// Runs `strip_empty_sections` over `content` and returns the
/// `stripped_headers` it reports, as a set.
///
/// NOTE(review): presumably these are the headers whose sections have no
/// body — confirm against `crate::brain::filter::strip_empty_sections`.
fn headers_with_empty_body(content: &str) -> std::collections::HashSet<String> {
    crate::brain::filter::strip_empty_sections(content)
        .stripped_headers
        .into_iter()
        .collect()
}
/// Generates dedup proposals for `brain_path` and files each one in `store`
/// under the source tag `"rsi-dedup-scan"`.
///
/// A proposal the store rejects is logged as a warning and skipped; the
/// remaining proposals are still filed. Returns the number of proposals that
/// were stored successfully.
pub fn file_dedup_proposals(
    brain_path: &Path,
    store: &crate::brain::rsi_proposals::ProposalsStore,
) -> usize {
    generate_dedup_proposals(brain_path)
        .into_iter()
        .filter_map(|(dedup, rationale)| {
            match store.add_brain_dedup_proposal("rsi-dedup-scan", rationale, dedup) {
                Ok(id) => Some(id),
                Err(e) => {
                    tracing::warn!("Failed to file brain dedup proposal: {e}");
                    None
                }
            }
        })
        .count()
}