use regex::Regex;
use std::collections::HashSet;
use std::sync::OnceLock;
use crate::regex_util::static_regex;
static GENERIC_PATTERNS: OnceLock<Vec<Regex>> = OnceLock::new();
static_regex!(fn negative_pattern, r"(?i)\b(don't|do\s+not|never|avoid|shouldn't|should\s+not)\b");
static_regex!(fn positive_pattern, r"(?i)\b(instead|rather|prefer|better\s+to|alternative|always|use\s+\w|ensure|verify|check|open\s+an?\b|run\s+\w|apply|create|add|set|enable)\b");
static_regex!(fn weak_language_pattern, r"(?i)\b(should|try\s+to|consider|maybe|might\s+want\s+to|could|possibly)\b");
static_regex!(fn critical_section_pattern, r"(?i)^#+\s*.*(critical|important|required|mandatory|rules|must|essential)");
static_regex!(fn critical_keyword_pattern, r"(?i)\b(critical|important|must|required|essential|mandatory|crucial)\b");
static_regex!(fn npm_run_pattern, r"npm\s+run\s+([a-zA-Z0-9_:-]+)");
pub fn generic_patterns() -> &'static Vec<Regex> {
GENERIC_PATTERNS.get_or_init(|| {
const PATTERNS: &[&str] = &[
r"(?i)\bbe\s+helpful",
r"(?i)\bbe\s+accurate",
r"(?i)\bthink\s+step\s+by\s+step",
r"(?i)\bbe\s+concise",
r"(?i)\bformat.*properly",
r"(?i)\bprovide.*clear.*explanations",
r"(?i)\bmake\s+sure\s+to",
r"(?i)\balways\s+be",
r"(?i)^#+\s*(?:you\s+are|your\s+role)\b",
r"(?i)^\s*-?\s*you(?:'re|\s+are)\s+a\s+(?:helpful|expert|senior|skilled|experienced)\b",
r"(?i)\bfollow\s+(?:best\s+practices|coding\s+standards|clean\s+code)\b",
r"(?i)\bwrite\s+clean\s+(?:and\s+)?(?:maintainable|readable)\s+code\b",
];
PATTERNS
.iter()
.map(|&p| Regex::new(p).unwrap_or_else(|e| panic!("BUG: invalid regex '{}': {}", p, e)))
.collect()
})
}
pub fn find_generic_instructions(content: &str) -> Vec<GenericInstruction> {
let mut results = Vec::new();
let patterns = generic_patterns();
let mut byte_offset: usize = 0;
for (line_num, line) in content.lines().enumerate() {
let line_start = byte_offset;
let line_bytes = line.len();
let remaining = &content.as_bytes()[byte_offset + line_bytes..];
let newline_len = if remaining.starts_with(b"\r\n") {
2
} else if remaining.starts_with(b"\n") {
1
} else {
0 };
let line_end = byte_offset + line_bytes + newline_len;
for pattern in patterns {
if let Some(mat) = pattern.find(line) {
results.push(GenericInstruction {
line: line_num + 1,
column: mat.start() + 1,
text: mat.as_str().to_string(),
pattern: pattern.as_str().to_string(),
start_byte: line_start,
end_byte: line_end,
});
}
}
byte_offset = line_end;
}
results
}
#[derive(Debug, Clone)]
pub struct GenericInstruction {
pub line: usize,
pub column: usize,
pub text: String,
#[allow(dead_code)] pub pattern: String,
pub start_byte: usize,
pub end_byte: usize,
}
#[derive(Debug, Clone)]
pub struct TokenCountExceeded {
#[allow(dead_code)] pub char_count: usize,
pub estimated_tokens: usize,
pub limit: usize,
}
pub fn check_token_count(content: &str) -> Option<TokenCountExceeded> {
let char_count = content.chars().count();
let estimated_tokens = char_count / 4; let limit = 1500;
if estimated_tokens > limit {
Some(TokenCountExceeded {
char_count,
estimated_tokens,
limit,
})
} else {
None
}
}
#[derive(Debug, Clone)]
pub struct NegativeInstruction {
pub line: usize,
pub column: usize,
pub text: String,
}
pub fn find_negative_without_positive(content: &str) -> Vec<NegativeInstruction> {
let mut results = Vec::new();
let neg_pattern = negative_pattern();
let pos_pattern = positive_pattern();
let lines: Vec<&str> = content.lines().collect();
for (line_num, line) in lines.iter().enumerate() {
if let Some(mat) = neg_pattern.find(line) {
let has_positive_after = if mat.end() < line.len() {
let after_negative = &line[mat.end()..];
after_negative.contains(" - ") && pos_pattern.is_match(after_negative)
|| after_negative.contains(", ") && pos_pattern.is_match(after_negative)
|| after_negative.contains("; ") && pos_pattern.is_match(after_negative)
} else {
false
};
let has_positive_transition = pos_pattern.is_match(line);
let has_positive_before = if mat.start() > 0 {
let before_raw = &line[..mat.start()];
let before_trimmed = before_raw.trim();
before_trimmed.len() > 5
&& (before_raw.contains(',')
|| before_raw.contains(';')
|| before_raw.contains(" - "))
&& !before_trimmed.starts_with("//")
&& !before_trimmed.starts_with('#')
} else {
false
};
let has_positive_next_line = lines
.get(line_num + 1)
.is_some_and(|next| pos_pattern.is_match(next));
if !has_positive_transition
&& !has_positive_before
&& !has_positive_after
&& !has_positive_next_line
{
results.push(NegativeInstruction {
line: line_num + 1,
column: mat.start() + 1,
text: mat.as_str().to_string(),
});
}
}
}
results
}
#[derive(Debug, Clone)]
pub struct WeakConstraint {
pub line: usize,
pub column: usize,
pub text: String,
pub section: String,
pub start_byte: usize,
pub end_byte: usize,
}
pub fn find_weak_constraints(content: &str) -> Vec<WeakConstraint> {
let mut results = Vec::new();
let weak_pattern = weak_language_pattern();
let section_pattern = critical_section_pattern();
let mut byte_offset: usize = 0;
let mut current_section: Option<String> = None;
for (line_num, line) in content.lines().enumerate() {
let line_start = byte_offset;
let line_bytes = line.len();
let remaining = &content.as_bytes()[byte_offset + line_bytes..];
let newline_len = if remaining.starts_with(b"\r\n") {
2
} else if remaining.starts_with(b"\n") {
1
} else {
0 };
if line.starts_with('#') {
if section_pattern.is_match(line) {
current_section = Some(line.trim_start_matches('#').trim().to_string());
} else {
current_section = None;
}
}
if let Some(section_name) = ¤t_section {
if let Some(mat) = weak_pattern.find(line) {
results.push(WeakConstraint {
line: line_num + 1,
column: mat.start() + 1,
text: mat.as_str().to_string(),
section: section_name.clone(),
start_byte: line_start + mat.start(),
end_byte: line_start + mat.end(),
});
}
}
byte_offset += line_bytes + newline_len;
}
results
}
#[derive(Debug, Clone)]
pub struct CriticalInMiddle {
pub line: usize,
pub column: usize,
pub keyword: String,
pub position_percent: f64,
}
pub fn find_critical_in_middle(content: &str) -> Vec<CriticalInMiddle> {
let mut results = Vec::new();
let pattern = critical_keyword_pattern();
let lines: Vec<&str> = content.lines().collect();
let total_lines = lines.len();
if total_lines < 10 {
return results;
}
for (line_num, line) in lines.iter().enumerate() {
if let Some(mat) = pattern.find(line) {
let position_percent = (line_num as f64 / total_lines as f64) * 100.0;
if position_percent > 40.0 && position_percent < 60.0 {
results.push(CriticalInMiddle {
line: line_num + 1,
column: mat.start() + 1,
keyword: mat.as_str().to_string(),
position_percent,
});
}
}
}
results
}
#[derive(Debug, Clone)]
pub struct NpmScriptReference {
pub line: usize,
pub column: usize,
pub script_name: String,
}
pub fn extract_npm_scripts(content: &str) -> Vec<NpmScriptReference> {
let mut results = Vec::new();
let pattern = npm_run_pattern();
for (line_num, line) in content.lines().enumerate() {
for cap in pattern.captures_iter(line) {
if let Some(script_match) = cap.get(1) {
results.push(NpmScriptReference {
line: line_num + 1,
column: cap.get(0).map(|m| m.start() + 1).unwrap_or(1),
script_name: script_match.as_str().to_string(),
});
}
}
}
results
}
pub fn calculate_text_overlap(text1: &str, text2: &str) -> f64 {
let text1_lower = text1.to_lowercase();
let text2_lower = text2.to_lowercase();
let words1: HashSet<&str> = text1_lower
.split_whitespace()
.filter(|w| w.len() > 3) .collect();
let words2: HashSet<&str> = text2_lower
.split_whitespace()
.filter(|w| w.len() > 3)
.collect();
if words1.is_empty() || words2.is_empty() {
return 0.0;
}
let intersection = words1.intersection(&words2).count();
let union = words1.union(&words2).count();
if union == 0 {
0.0
} else {
intersection as f64 / union as f64
}
}
#[derive(Debug, Clone)]
pub struct ReadmeDuplication {
pub overlap_percent: f64,
pub threshold: f64,
}
pub fn check_readme_duplication(claude_md: &str, readme: &str) -> Option<ReadmeDuplication> {
let overlap = calculate_text_overlap(claude_md, readme);
let threshold = 0.40;
if overlap > threshold {
Some(ReadmeDuplication {
overlap_percent: overlap * 100.0,
threshold: threshold * 100.0,
})
} else {
None
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_regex_patterns_compile() {
let _ = generic_patterns();
let _ = negative_pattern();
let _ = positive_pattern();
let _ = weak_language_pattern();
let _ = critical_section_pattern();
let _ = critical_keyword_pattern();
let _ = npm_run_pattern();
}
#[test]
fn test_find_generic_instructions() {
let content = "Be helpful and accurate when responding.\nUse project-specific guidelines.";
let results = find_generic_instructions(content);
assert!(!results.is_empty());
assert!(results[0].text.to_lowercase().contains("helpful"));
}
#[test]
fn test_no_generic_instructions() {
let content = "Use the coding style defined in .editorconfig\nFollow team conventions";
let results = find_generic_instructions(content);
assert!(results.is_empty());
}
#[test]
fn test_detect_role_play_header() {
let content = "# You Are an Expert\nDo this specific task.";
let results = find_generic_instructions(content);
assert!(
results.iter().any(|r| r.pattern.contains("you\\s+are")),
"Should detect role-play header '# You Are'"
);
}
#[test]
fn test_detect_role_play_preamble() {
let content = "You are a helpful assistant for this project.";
let results = find_generic_instructions(content);
assert!(
!results.is_empty(),
"Should detect 'You are a helpful assistant'"
);
}
#[test]
fn test_no_false_positive_role_play() {
let content = "You are a developer who writes Rust code.";
let results = find_generic_instructions(content);
let role_play: Vec<_> = results
.iter()
.filter(|r| r.pattern.contains("you\\s+are\\s+a"))
.collect();
assert!(
role_play.is_empty(),
"Should not match 'You are a developer' (not a generic role-play)"
);
}
#[test]
fn test_detect_generic_programming_principles() {
let content = "Follow best practices when writing code.";
let results = find_generic_instructions(content);
assert!(
results
.iter()
.any(|r| r.text.to_lowercase().contains("follow best practices")),
"Should detect 'follow best practices'"
);
let content2 = "Write clean and maintainable code.";
let results2 = find_generic_instructions(content2);
assert!(
!results2.is_empty(),
"Should detect 'write clean and maintainable code'"
);
}
#[test]
fn test_no_false_positive_specific_standards() {
let content = "Follow the airbnb style guide for JavaScript.";
let results = find_generic_instructions(content);
assert!(
results.is_empty(),
"Project-specific style references should not trigger"
);
}
#[test]
fn test_check_token_count_under_limit() {
let content = "Short content that is well under the limit.";
assert!(check_token_count(content).is_none());
}
#[test]
fn test_check_token_count_over_limit() {
let content = "x".repeat(6100);
let result = check_token_count(&content);
assert!(result.is_some());
let exceeded = result.unwrap();
assert!(exceeded.estimated_tokens > 1500);
assert_eq!(exceeded.limit, 1500);
}
#[test]
fn test_find_negative_without_positive() {
let content = "Don't use var in JavaScript.\nNever use global variables.";
let results = find_negative_without_positive(content);
assert_eq!(results.len(), 2);
assert!(results[0].text.to_lowercase().contains("don"));
}
#[test]
fn test_negative_with_positive_same_line() {
let content = "Don't use var, instead prefer const or let.";
let results = find_negative_without_positive(content);
assert!(results.is_empty());
}
#[test]
fn test_negative_with_positive_next_line() {
let content = "Don't use var.\nUse const instead of var.";
let results = find_negative_without_positive(content);
assert!(results.is_empty());
}
#[test]
fn test_negative_with_positive_before_comma() {
let content = "Fetch web resources fresh, don't rely on cached data";
let results = find_negative_without_positive(content);
assert!(results.is_empty(), "Should recognize positive before comma");
let content2 = "Use const or let, never use var";
let results2 = find_negative_without_positive(content2);
assert!(
results2.is_empty(),
"Should recognize positive before comma"
);
let content3 = "Don't use var";
let results3 = find_negative_without_positive(content3);
assert_eq!(results3.len(), 1, "Standalone negative should trigger");
}
#[test]
fn test_negative_with_positive_after_dash() {
let content = "NEVER assume - always verify with tests and benchmarks";
let results = find_negative_without_positive(content);
assert!(
results.is_empty(),
"NEVER with dash-separated positive should not trigger"
);
let content2 = "NEVER ignore bugs, even out of scope - open an issue";
let results2 = find_negative_without_positive(content2);
assert!(
results2.is_empty(),
"NEVER with dash-separated action should not trigger"
);
}
#[test]
fn test_negative_with_positive_before_dash() {
let content = "Disagree? Raise it, but don't change without approval";
let results = find_negative_without_positive(content);
assert!(
results.is_empty(),
"don't with preceding positive context should not trigger"
);
}
#[test]
fn test_negative_with_bold_positive_before_dash() {
let content = "7. **Report script failures before manual fallback** - Never silently bypass broken tooling.";
let results = find_negative_without_positive(content);
assert!(
results.is_empty(),
"Bold positive before dash-separated Never should not trigger"
);
let content2 = "3. **Always commit before rebasing** - Don't rebase uncommitted work.";
let results2 = find_negative_without_positive(content2);
assert!(
results2.is_empty(),
"Bold positive before dash-separated Don't should not trigger"
);
}
#[test]
fn test_standalone_negative_still_triggers() {
let content = "Never use global variables";
let results = find_negative_without_positive(content);
assert_eq!(
results.len(),
1,
"Standalone NEVER without alternative should trigger"
);
let content2 = "Don't hardcode values";
let results2 = find_negative_without_positive(content2);
assert_eq!(
results2.len(),
1,
"Standalone don't without alternative should trigger"
);
}
#[test]
fn test_find_weak_constraints_in_critical() {
let content = "# Critical Rules\n\nYou should follow the coding style.";
let results = find_weak_constraints(content);
assert_eq!(results.len(), 1);
assert_eq!(results[0].text.to_lowercase(), "should");
}
#[test]
fn test_find_weak_constraints_outside_critical() {
let content = "# General Guidelines\n\nYou should follow the coding style.";
let results = find_weak_constraints(content);
assert!(results.is_empty());
}
#[test]
fn test_weak_constraints_section_ends() {
let content =
"# Critical Rules\n\nMust follow style.\n\n# Other\n\nYou should do this too.";
let results = find_weak_constraints(content);
assert!(results.is_empty());
}
#[test]
fn test_find_critical_in_middle() {
let mut lines: Vec<String> = (0..20).map(|i| format!("Line {}", i)).collect();
lines[10] = "This is critical information.".to_string();
let content = lines.join("\n");
let results = find_critical_in_middle(&content);
assert_eq!(results.len(), 1);
assert!(results[0].position_percent > 40.0);
assert!(results[0].position_percent < 60.0);
}
#[test]
fn test_critical_at_top() {
let mut lines: Vec<String> = (0..20).map(|i| format!("Line {}", i)).collect();
lines[1] = "This is critical information.".to_string();
let content = lines.join("\n");
let results = find_critical_in_middle(&content);
assert!(results.is_empty());
}
#[test]
fn test_critical_at_bottom() {
let mut lines: Vec<String> = (0..20).map(|i| format!("Line {}", i)).collect();
lines[18] = "This is critical information.".to_string();
let content = lines.join("\n");
let results = find_critical_in_middle(&content);
assert!(results.is_empty());
}
#[test]
fn test_short_document_no_critical_middle() {
let content = "Critical info here.\nAnother line.";
let results = find_critical_in_middle(content);
assert!(results.is_empty());
}
#[test]
fn test_extract_npm_scripts() {
let content = "Run tests with npm run test\nBuild with npm run build";
let results = extract_npm_scripts(content);
assert_eq!(results.len(), 2);
assert_eq!(results[0].script_name, "test");
assert_eq!(results[1].script_name, "build");
}
#[test]
fn test_extract_npm_scripts_with_colon() {
let content = "Run npm run test:unit for unit tests";
let results = extract_npm_scripts(content);
assert_eq!(results.len(), 1);
assert_eq!(results[0].script_name, "test:unit");
}
#[test]
fn test_no_npm_scripts() {
let content = "Use cargo test for testing.";
let results = extract_npm_scripts(content);
assert!(results.is_empty());
}
#[test]
fn test_calculate_text_overlap_identical() {
let text = "This is some sample text with enough words to test overlap calculation.";
let overlap = calculate_text_overlap(text, text);
assert!((overlap - 1.0).abs() < 0.01);
}
#[test]
fn test_calculate_text_overlap_different() {
let text1 = "This project uses Rust for performance.";
let text2 = "Python is great for machine learning.";
let overlap = calculate_text_overlap(text1, text2);
assert!(overlap < 0.3);
}
#[test]
fn test_check_readme_duplication_detected() {
let claude_md =
"This is a project about Rust validation. It validates agent configurations.";
let readme = "This is a project about Rust validation. It validates agent configurations.";
let result = check_readme_duplication(claude_md, readme);
assert!(result.is_some());
}
#[test]
fn test_check_readme_duplication_not_detected() {
let claude_md = "Project-specific instructions for Claude. Focus on these guidelines.";
let readme = "Welcome to the project. Installation: npm install. Usage: npm start.";
let result = check_readme_duplication(claude_md, readme);
assert!(result.is_none());
}
#[test]
fn test_generic_instruction_byte_offsets_single_line() {
let content = "Be helpful and accurate.";
let results = find_generic_instructions(content);
assert!(!results.is_empty());
let inst = &results[0];
assert_eq!(inst.start_byte, 0);
assert_eq!(inst.end_byte, 24); }
#[test]
fn test_generic_instruction_byte_offsets_multiline() {
let content = "Line one.\nBe helpful and accurate.\nLine three.";
let results = find_generic_instructions(content);
assert!(!results.is_empty());
let inst = &results[0];
assert_eq!(inst.start_byte, 10);
assert_eq!(inst.end_byte, 35);
}
#[test]
fn test_generic_instruction_byte_offsets_last_line_no_newline() {
let content = "Line one.\nBe helpful and accurate.";
let results = find_generic_instructions(content);
assert!(!results.is_empty());
let inst = &results[0];
assert_eq!(inst.start_byte, 10);
assert_eq!(inst.end_byte, 34);
}
#[test]
fn test_generic_instruction_delete_produces_expected() {
let content = "Line one.\nBe helpful and accurate.\nLine three.";
let results = find_generic_instructions(content);
assert!(!results.is_empty());
let inst = &results[0];
let mut modified = content.to_string();
modified.replace_range(inst.start_byte..inst.end_byte, "");
assert_eq!(modified, "Line one.\nLine three.");
}
#[test]
fn test_weak_constraint_byte_offsets() {
let content = "# Critical Rules\n\nYou should follow the coding style.";
let results = find_weak_constraints(content);
assert_eq!(results.len(), 1);
let wc = &results[0];
assert_eq!(wc.text.to_lowercase(), "should");
assert_eq!(wc.start_byte, 22);
assert_eq!(wc.end_byte, 28);
}
#[test]
fn test_weak_constraint_replace_produces_expected() {
let content = "# Critical Rules\n\nYou should follow the coding style.";
let results = find_weak_constraints(content);
assert_eq!(results.len(), 1);
let wc = &results[0];
let mut modified = content.to_string();
modified.replace_range(wc.start_byte..wc.end_byte, "must");
assert_eq!(
modified,
"# Critical Rules\n\nYou must follow the coding style."
);
}
#[test]
fn test_weak_constraint_try_to() {
let content = "# Critical Rules\n\nTry to follow the coding style.";
let results = find_weak_constraints(content);
assert_eq!(results.len(), 1);
let wc = &results[0];
assert_eq!(wc.text.to_lowercase(), "try to");
let mut modified = content.to_string();
modified.replace_range(wc.start_byte..wc.end_byte, "must");
assert_eq!(
modified,
"# Critical Rules\n\nmust follow the coding style."
);
}
#[test]
fn test_weak_constraint_multiple_on_same_line() {
let content = "# Critical Rules\n\nYou should consider doing this.";
let results = find_weak_constraints(content);
assert!(!results.is_empty());
assert_eq!(results[0].text.to_lowercase(), "should");
}
#[test]
fn test_weak_constraint_multiline() {
let content = "# Critical Rules\n\nYou should do this.\nYou could do that.";
let results = find_weak_constraints(content);
assert_eq!(results.len(), 2);
assert_eq!(results[0].text.to_lowercase(), "should");
assert_eq!(results[1].text.to_lowercase(), "could");
let mut modified = content.to_string();
modified.replace_range(results[1].start_byte..results[1].end_byte, "must");
modified.replace_range(results[0].start_byte..results[0].end_byte, "must");
assert_eq!(
modified,
"# Critical Rules\n\nYou must do this.\nYou must do that."
);
}
#[test]
fn test_generic_instruction_byte_offsets_crlf() {
let content = "Line one.\r\nBe helpful and accurate.\r\nLine three.";
let results = find_generic_instructions(content);
assert!(!results.is_empty());
let inst = &results[0];
assert_eq!(inst.start_byte, 11);
assert_eq!(inst.end_byte, 37);
}
#[test]
fn test_generic_instruction_delete_produces_expected_crlf() {
let content = "Line one.\r\nBe helpful and accurate.\r\nLine three.";
let results = find_generic_instructions(content);
assert!(!results.is_empty());
let inst = &results[0];
let mut modified = content.to_string();
modified.replace_range(inst.start_byte..inst.end_byte, "");
assert_eq!(modified, "Line one.\r\nLine three.");
}
#[test]
fn test_generic_instruction_byte_offsets_crlf_last_line() {
let content = "Line one.\r\nBe helpful and accurate.";
let results = find_generic_instructions(content);
assert!(!results.is_empty());
let inst = &results[0];
assert_eq!(inst.start_byte, 11);
assert_eq!(inst.end_byte, 35);
}
#[test]
fn test_weak_constraint_byte_offsets_crlf() {
let content = "# Critical Rules\r\n\r\nYou should follow the coding style.";
let results = find_weak_constraints(content);
assert_eq!(results.len(), 1);
let wc = &results[0];
assert_eq!(wc.text.to_lowercase(), "should");
assert_eq!(wc.start_byte, 24);
assert_eq!(wc.end_byte, 30);
}
#[test]
fn test_weak_constraint_replace_produces_expected_crlf() {
let content = "# Critical Rules\r\n\r\nYou should follow the coding style.";
let results = find_weak_constraints(content);
assert_eq!(results.len(), 1);
let wc = &results[0];
let mut modified = content.to_string();
modified.replace_range(wc.start_byte..wc.end_byte, "must");
assert_eq!(
modified,
"# Critical Rules\r\n\r\nYou must follow the coding style."
);
}
#[test]
fn test_weak_constraint_multiline_crlf() {
let content = "# Critical Rules\r\n\r\nYou should do this.\r\nYou could do that.";
let results = find_weak_constraints(content);
assert_eq!(results.len(), 2);
assert_eq!(results[0].text.to_lowercase(), "should");
assert_eq!(results[1].text.to_lowercase(), "could");
let mut modified = content.to_string();
modified.replace_range(results[1].start_byte..results[1].end_byte, "must");
modified.replace_range(results[0].start_byte..results[0].end_byte, "must");
assert_eq!(
modified,
"# Critical Rules\r\n\r\nYou must do this.\r\nYou must do that."
);
}
}