use regex::Regex;
use crate::parsers::markdown::MAX_REGEX_INPUT_SIZE;
use crate::regex_util::static_regex;
// Pattern accessors used by the detectors in this file. Each `static_regex!` invocation
// names a function (for example `critical_keyword_pattern()`) that the detectors call to
// obtain the regex. The macro is defined in `crate::regex_util`; presumably it compiles
// each pattern once and caches it -- confirm there.

// Emphasis keywords ("critical", "must", "never", ...), case-insensitive, whole words only.
static_regex!(fn critical_keyword_pattern, r"(?i)\b(critical|important|must|required|essential|mandatory|crucial|never|always)\b");
// Chain-of-thought prompts such as "think step by step" or "let's think".
static_regex!(fn cot_phrase_pattern, r"(?i)\b(think\s+step\s+by\s+step|let'?s\s+think|reason\s+through|break\s+(?:it\s+)?down\s+into\s+steps|work\s+through\s+this\s+(?:step\s+by\s+step|systematically))\b");
// Phrases describing simple file/command operations ("read the file", "list files", ...).
static_regex!(fn simple_task_indicator_pattern, r"(?i)\b(read\s+(?:the\s+)?file|write\s+(?:the\s+)?file|copy\s+(?:the\s+)?file|move\s+(?:the\s+)?file|delete\s+(?:the\s+)?file|list\s+files|run\s+(?:the\s+)?(?:command|script)|execute\s+(?:the\s+)?(?:command|script)|format\s+(?:the\s+)?(?:code|output)|rename\s+(?:the\s+)?file|create\s+(?:a\s+)?(?:file|directory|folder)|check\s+(?:if|whether)\s+(?:file|directory)\s+exists)\b");
// Hedging words ("should", "try to", "maybe", ...) that weaken an instruction.
static_regex!(fn weak_language_pattern, r"(?i)\b(should|try\s+to|consider|maybe|might|could|possibly|preferably|ideally|optionally)\b");
// A line starting with one or more `#` whose text contains a word marking the section as
// critical ("critical", "rules", "security", ...).
static_regex!(fn critical_section_pattern, r"(?i)^#+\s*.*\b(critical|important|required|mandatory|rules|must|essential|security|danger)\b");
// Vague frequency/condition qualifiers ("usually", "if possible", "as needed", ...).
static_regex!(fn ambiguous_term_pattern, r"(?i)\b(usually|sometimes|if\s+possible|when\s+appropriate|as\s+needed|often|occasionally|generally|typically|normally|frequently|regularly|commonly)\b");
/// An emphasis keyword located in the middle portion of a document, as reported by
/// `find_critical_in_middle_pe`.
#[derive(Debug, Clone)]
pub struct CriticalInMiddle {
/// 1-based line number of the line containing the keyword.
pub line: usize,
/// 1-based byte offset of the keyword within its line.
pub column: usize,
/// The matched keyword exactly as it appears in the text.
pub keyword: String,
/// Zero-based line index divided by the total line count, multiplied by 100.
pub position_percent: f64,
}
/// Reports emphasis keywords that appear in the middle of `content`.
///
/// A line's position is its zero-based index divided by the total number of lines, as a
/// percentage; lines whose position falls in `[40, 60)` are considered "in the middle".
/// Only the first keyword on each line is reported.
///
/// Returns an empty vector when `content` is larger than `MAX_REGEX_INPUT_SIZE` bytes or
/// has fewer than 10 lines.
pub fn find_critical_in_middle_pe(content: &str) -> Vec<CriticalInMiddle> {
    if content.len() > MAX_REGEX_INPUT_SIZE {
        return Vec::new();
    }

    let keyword_re = critical_keyword_pattern();
    let total_lines = content.lines().count();
    if total_lines < 10 {
        return Vec::new();
    }

    content
        .lines()
        .enumerate()
        .filter_map(|(idx, text)| {
            // Only the first keyword on a line is considered.
            let hit = keyword_re.find(text)?;
            let position_percent = (idx as f64 / total_lines as f64) * 100.0;
            if !(40.0..60.0).contains(&position_percent) {
                return None;
            }
            Some(CriticalInMiddle {
                line: idx + 1,
                column: hit.start() + 1,
                keyword: hit.as_str().to_string(),
                position_percent,
            })
        })
        .collect()
}
/// A chain-of-thought phrase found close to a simple-task phrase, as reported by
/// `find_cot_on_simple_tasks`.
#[derive(Debug, Clone)]
pub struct CotOnSimpleTask {
/// 1-based line number of the chain-of-thought phrase.
pub line: usize,
/// 1-based byte offset of the phrase within its line.
pub column: usize,
/// The matched chain-of-thought phrase as written.
pub phrase: String,
/// The simple-task phrase that was found nearby.
pub task_indicator: String,
}
pub fn find_cot_on_simple_tasks(content: &str) -> Vec<CotOnSimpleTask> {
if content.len() > MAX_REGEX_INPUT_SIZE {
return Vec::new();
}
let mut results = Vec::new();
let cot_pattern = cot_phrase_pattern();
let simple_pattern = simple_task_indicator_pattern();
let simple_tasks: Vec<_> = content
.lines()
.enumerate()
.filter_map(|(line_num, line)| {
simple_pattern
.find(line)
.map(|mat| (line_num, mat.as_str().to_string()))
})
.collect();
if simple_tasks.is_empty() {
return results;
}
for (line_num, line) in content.lines().enumerate() {
if let Some(mat) = cot_pattern.find(line) {
for (task_line, task) in &simple_tasks {
let distance = if line_num > *task_line {
line_num - task_line
} else {
task_line - line_num
};
if distance <= 5 {
results.push(CotOnSimpleTask {
line: line_num + 1,
column: mat.start() + 1,
phrase: mat.as_str().to_string(),
task_indicator: task.clone(),
});
break; }
}
}
}
results
}
/// A hedging term found inside a section whose heading marks it as critical, as reported
/// by `find_weak_imperative_language`.
#[derive(Debug, Clone)]
pub struct WeakLanguageInCritical {
/// 1-based line number of the line containing the term.
pub line: usize,
/// 1-based byte offset of the term within its line.
pub column: usize,
/// The matched hedging term as written.
pub weak_term: String,
/// Heading text of the enclosing section, with leading `#` characters and surrounding
/// whitespace removed.
pub section_name: String,
/// Byte offset of the term measured from the start of the whole content.
pub byte_offset: usize,
}
/// Moves `byte_pos` past the line terminator located at that position, if any.
///
/// Skips `\r\n` (two bytes), a lone `\n`, or a lone `\r` (one byte each). The position is
/// left unchanged when it points at any other byte or lies at/after the end of `content`.
fn advance_past_line_ending(content: &[u8], byte_pos: &mut usize) {
    let skip = match content.get(*byte_pos..) {
        Some([b'\r', b'\n', ..]) => 2,
        Some([b'\r', ..]) | Some([b'\n', ..]) => 1,
        _ => 0,
    };
    *byte_pos += skip;
}
/// Reports hedging terms ("should", "might", ...) that appear inside critical sections.
///
/// A section starts at any line beginning with `#`. It is treated as critical when that
/// line matches `critical_section_pattern()`, and it lasts until the next line beginning
/// with `#`. Because the section is set before the line is scanned, the heading line of a
/// critical section is itself checked for hedging terms. Only the first hedging term on
/// each line is reported.
///
/// Each finding carries `byte_offset`, the position of the term measured from the start of
/// `content`; it stays correct for both `\n` and `\r\n` line endings.
///
/// Returns an empty vector when `content` is larger than `MAX_REGEX_INPUT_SIZE` bytes.
pub fn find_weak_imperative_language(content: &str) -> Vec<WeakLanguageInCritical> {
    if content.len() > MAX_REGEX_INPUT_SIZE {
        return Vec::new();
    }

    let weak_pattern = weak_language_pattern();
    let section_pattern = critical_section_pattern();
    let bytes = content.as_bytes();

    let mut results = Vec::new();
    let mut current_section: Option<String> = None;
    // Byte offset of the current line's first byte within `content`.
    let mut byte_pos = 0usize;

    for (line_num, line) in content.lines().enumerate() {
        if line.starts_with('#') {
            // Every heading either opens a critical section or closes the previous one.
            current_section = section_pattern
                .is_match(line)
                .then(|| line.trim_start_matches('#').trim().to_string());
        }

        if let Some(section_name) = &current_section {
            if let Some(mat) = weak_pattern.find(line) {
                results.push(WeakLanguageInCritical {
                    line: line_num + 1,
                    column: mat.start() + 1,
                    weak_term: mat.as_str().to_string(),
                    section_name: section_name.clone(),
                    byte_offset: byte_pos + mat.start(),
                });
            }
        }

        // `lines()` strips the terminator, so step over it explicitly.
        byte_pos += line.len();
        advance_past_line_ending(bytes, &mut byte_pos);
    }
    results
}
/// A vague qualifier ("usually", "if possible", ...) found in instruction text, as
/// reported by `find_ambiguous_instructions`.
#[derive(Debug, Clone)]
pub struct AmbiguousInstruction {
/// 1-based line number of the line containing the term.
pub line: usize,
/// 1-based byte offset of the term within its line.
pub column: usize,
/// The matched term as written.
pub term: String,
/// Excerpt of the line around the term (roughly 20 bytes on each side, widened to
/// character boundaries).
#[allow(dead_code)] pub context: String,
}
/// Reports vague qualifiers ("usually", "if possible", ...) in instruction text.
///
/// Lines are ignored when they are a ``` fence line, lie between fence lines, or (after
/// leading whitespace) start with `//` or `#!`. A match is also ignored when it sits
/// inside parentheses on its line: the last `(` before it has no `)` between the two, and
/// a `)` follows the match.
///
/// Every remaining match on a line is reported together with an excerpt of about 20 bytes
/// on either side, widened outward to the nearest character boundaries.
///
/// Returns an empty vector when `content` is larger than `MAX_REGEX_INPUT_SIZE` bytes.
pub fn find_ambiguous_instructions(content: &str) -> Vec<AmbiguousInstruction> {
    if content.len() > MAX_REGEX_INPUT_SIZE {
        return Vec::new();
    }

    let term_re = ambiguous_term_pattern();
    let mut findings = Vec::new();
    let mut inside_fence = false;

    for (idx, text) in content.lines().enumerate() {
        let stripped = text.trim_start();
        if stripped.starts_with("```") {
            inside_fence = !inside_fence;
            continue;
        }
        if inside_fence || stripped.starts_with("//") || stripped.starts_with("#!") {
            continue;
        }

        for hit in term_re.find_iter(text) {
            let prefix = &text[..hit.start()];
            let suffix = &text[hit.end()..];

            // True when the nearest '(' to the left is still open at the match.
            let open_paren_before = match prefix.rfind('(') {
                Some(open) => !prefix[open..].contains(')'),
                None => false,
            };
            if open_paren_before && suffix.contains(')') {
                continue;
            }

            let want_start = hit.start().saturating_sub(20);
            let want_end = (hit.end() + 20).min(text.len());
            // Walk left to the closest character boundary at or before `want_start`.
            let ctx_start = (0..=want_start)
                .rev()
                .find(|&i| text.is_char_boundary(i))
                .unwrap_or(0);
            // Walk right to the closest character boundary at or after `want_end`.
            let ctx_end = (want_end..text.len())
                .find(|&i| text.is_char_boundary(i))
                .unwrap_or(text.len());

            findings.push(AmbiguousInstruction {
                line: idx + 1,
                column: hit.start() + 1,
                term: hit.as_str().to_string(),
                context: text[ctx_start..ctx_end].to_string(),
            });
        }
    }
    findings
}
// Generic exhortations ("be helpful", "do your best", ...), case-insensitive, whole phrases.
static_regex!(fn redundant_instruction_pattern, r"(?i)\b(be helpful|be accurate|be concise|follow instructions|do your best|try your hardest|respond accurately|answer correctly|be thorough|be detailed|be precise|be clear|be professional|be consistent|be efficient)\b");
/// A generic exhortation ("be helpful", "be concise", ...) found in instruction text, as
/// reported by `find_redundant_instructions`.
#[derive(Debug, Clone)]
pub struct RedundantInstruction {
/// 1-based line number of the line containing the phrase.
pub line: usize,
/// 1-based byte offset of the phrase within its line.
pub column: usize,
/// The matched phrase as written.
pub phrase: String,
/// Byte offset of the phrase measured from the start of the whole content.
#[allow(dead_code)] pub byte_offset: usize,
/// Length of the matched phrase in bytes.
#[allow(dead_code)] pub byte_len: usize,
}
/// Reports generic exhortations ("be helpful", "be concise", ...) outside code fences.
///
/// ``` fence lines and everything between them are ignored. Every match on a remaining
/// line is reported, with `byte_offset` measured from the start of `content` (valid for
/// both `\n` and `\r\n` line endings) and `byte_len` giving the match length in bytes.
///
/// Returns an empty vector when `content` is larger than `MAX_REGEX_INPUT_SIZE` bytes.
pub fn find_redundant_instructions(content: &str) -> Vec<RedundantInstruction> {
    if content.len() > MAX_REGEX_INPUT_SIZE {
        return Vec::new();
    }

    let phrase_re = redundant_instruction_pattern();
    let raw = content.as_bytes();
    let mut findings = Vec::new();
    let mut inside_fence = false;
    let mut next_line_start = 0usize;

    for (idx, text) in content.lines().enumerate() {
        // Settle the offsets first so that every early `continue` leaves them consistent.
        let line_start = next_line_start;
        next_line_start = line_start + text.len();
        advance_past_line_ending(raw, &mut next_line_start);

        if text.trim_start().starts_with("```") {
            inside_fence = !inside_fence;
            continue;
        }
        if inside_fence {
            continue;
        }

        findings.extend(phrase_re.find_iter(text).map(|hit| RedundantInstruction {
            line: idx + 1,
            column: hit.start() + 1,
            phrase: hit.as_str().to_string(),
            byte_offset: line_start + hit.start(),
            byte_len: hit.as_str().len(),
        }));
    }
    findings
}
// A prohibition at the very start of the text it is applied to, optionally preceded by a
// single `*` or `-` marker and whitespace ("don't", "do not", "never", "avoid", "refrain from").
static_regex!(fn negative_only_pattern, r"(?i)^[*\-]?\s*(don't|do not|never|avoid|refrain from)\b");
// Wording that signals a positive alternative ("instead", "prefer", "better to", ...).
static_regex!(fn positive_alternative_pattern, r"(?i)\b(instead|rather|prefer|use\s+\S+\s+instead|better to|should\s+\S+\s+instead)\b");
/// A prohibition with no positive alternative nearby, as reported by
/// `find_negative_only_instructions`.
#[derive(Debug, Clone)]
pub struct NegativeOnlyInstruction {
/// 1-based line number of the prohibition.
pub line: usize,
/// 1-based byte offset of the pattern match within its line.
pub column: usize,
/// The whole line, trimmed of surrounding whitespace.
pub text: String,
}
/// Reports prohibitions ("don't ...", "never ...") that offer no positive alternative.
///
/// ``` fence lines and everything between them are ignored. For each remaining line that
/// begins with a prohibition, a window made of that line and the next two lines is
/// searched for positive-alternative wording ("instead", "prefer", ...). Fence tracking
/// inside the window starts fresh at the prohibition line, so text between fence lines
/// encountered within the window does not count. The line is reported only when the
/// window contains no such wording.
///
/// Returns an empty vector when `content` is larger than `MAX_REGEX_INPUT_SIZE` bytes.
pub fn find_negative_only_instructions(content: &str) -> Vec<NegativeOnlyInstruction> {
    if content.len() > MAX_REGEX_INPUT_SIZE {
        return Vec::new();
    }

    let neg_re = negative_only_pattern();
    let pos_re = positive_alternative_pattern();
    let all_lines: Vec<&str> = content.lines().collect();

    let mut findings = Vec::new();
    let mut inside_fence = false;

    for (idx, text) in all_lines.iter().enumerate() {
        if text.trim_start().starts_with("```") {
            inside_fence = !inside_fence;
            continue;
        }
        if inside_fence {
            continue;
        }
        let Some(hit) = neg_re.find(text) else {
            continue;
        };

        // Look at this line plus the two that follow it.
        let window_end = (idx + 3).min(all_lines.len());
        let mut window_fenced = false;
        let mut has_positive = false;
        for candidate in &all_lines[idx..window_end] {
            if candidate.trim_start().starts_with("```") {
                window_fenced = !window_fenced;
                continue;
            }
            if !window_fenced && pos_re.is_match(candidate) {
                has_positive = true;
                break;
            }
        }

        if !has_positive {
            findings.push(NegativeOnlyInstruction {
                line: idx + 1,
                column: hit.start() + 1,
                text: text.trim().to_string(),
            });
        }
    }
    findings
}
#[cfg(test)]
mod tests {
    use super::*;

    // ---- Pattern accessors ----

    /// Smoke test: touches every pattern accessor defined in this file.
    #[test]
    fn test_regex_patterns_compile() {
        let _ = critical_keyword_pattern();
        let _ = cot_phrase_pattern();
        let _ = simple_task_indicator_pattern();
        let _ = weak_language_pattern();
        let _ = critical_section_pattern();
        let _ = ambiguous_term_pattern();
        let _ = redundant_instruction_pattern();
        let _ = negative_only_pattern();
        let _ = positive_alternative_pattern();
    }

    // ---- find_critical_in_middle_pe ----

    #[test]
    fn test_find_critical_in_middle() {
        let mut lines: Vec<String> = (0..20).map(|i| format!("Line {}", i)).collect();
        lines[10] = "This is critical information.".to_string();
        let content = lines.join("\n");
        let results = find_critical_in_middle_pe(&content);
        assert_eq!(results.len(), 1);
        assert!(results[0].position_percent > 40.0);
        assert!(results[0].position_percent < 60.0);
        assert_eq!(results[0].keyword.to_lowercase(), "critical");
    }

    #[test]
    fn test_critical_at_top_no_issue() {
        let mut lines: Vec<String> = (0..20).map(|i| format!("Line {}", i)).collect();
        lines[1] = "This is critical information.".to_string();
        let content = lines.join("\n");
        let results = find_critical_in_middle_pe(&content);
        assert!(results.is_empty());
    }

    #[test]
    fn test_critical_at_bottom_no_issue() {
        let mut lines: Vec<String> = (0..20).map(|i| format!("Line {}", i)).collect();
        lines[18] = "This is critical information.".to_string();
        let content = lines.join("\n");
        let results = find_critical_in_middle_pe(&content);
        assert!(results.is_empty());
    }

    #[test]
    fn test_short_document_skipped() {
        let content = "Critical info here.\nAnother line.";
        let results = find_critical_in_middle_pe(content);
        assert!(results.is_empty());
    }

    #[test]
    fn test_multiple_keywords_in_middle() {
        let mut lines: Vec<String> = (0..20).map(|i| format!("Line {}", i)).collect();
        lines[9] = "This is important and essential.".to_string();
        lines[10] = "This is critical and mandatory.".to_string();
        let content = lines.join("\n");
        let results = find_critical_in_middle_pe(&content);
        assert!(results.len() >= 2);
    }

    // ---- find_cot_on_simple_tasks ----

    #[test]
    fn test_cot_on_simple_read_file() {
        let content = r#"# Read File Skill
When the user asks to read the file, think step by step:
1. First check if file exists
2. Then read contents
"#;
        let results = find_cot_on_simple_tasks(content);
        assert_eq!(results.len(), 1);
        assert!(
            results[0]
                .phrase
                .to_lowercase()
                .contains("think step by step")
        );
    }

    #[test]
    fn test_cot_on_simple_copy_file() {
        let content = r#"# Copy File Utility
Let's think through copying the file:
- Source path
- Destination path
"#;
        let results = find_cot_on_simple_tasks(content);
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_no_cot_on_complex_task() {
        let content = r#"# Code Review Skill
When reviewing code, think step by step:
1. Check for security issues
2. Verify logic correctness
3. Assess performance
"#;
        let results = find_cot_on_simple_tasks(content);
        assert!(results.is_empty());
    }

    #[test]
    fn test_simple_task_without_cot() {
        let content = r#"# Read File Skill
Read the file and return its contents.
"#;
        let results = find_cot_on_simple_tasks(content);
        assert!(results.is_empty());
    }

    // ---- find_weak_imperative_language ----

    #[test]
    fn test_weak_language_in_critical_section() {
        let content = r#"# Critical Rules
You should follow the coding style.
Code could be formatted better.
"#;
        let results = find_weak_imperative_language(content);
        assert_eq!(results.len(), 2);
        assert!(
            results
                .iter()
                .any(|r| r.weak_term.to_lowercase() == "should")
        );
        assert!(
            results
                .iter()
                .any(|r| r.weak_term.to_lowercase() == "could")
        );
    }

    #[test]
    fn test_weak_language_outside_critical_section() {
        let content = r#"# General Guidelines
You should follow the coding style.
"#;
        let results = find_weak_imperative_language(content);
        assert!(results.is_empty());
    }

    #[test]
    fn test_weak_language_section_boundary() {
        let content = r#"# Important Security Rules
You should sanitize inputs.
# Other Info
You could do this too.
"#;
        let results = find_weak_imperative_language(content);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].weak_term.to_lowercase(), "should");
    }

    #[test]
    fn test_multiple_critical_sections() {
        let content = r#"# Critical Rules
You should do A.
# General Section
Normal content.
# Mandatory Requirements
You might want to consider B.
"#;
        let results = find_weak_imperative_language(content);
        assert_eq!(results.len(), 2);
    }

    // ---- find_ambiguous_instructions ----

    #[test]
    fn test_find_ambiguous_usually() {
        let content = "Usually format the output as JSON.";
        let results = find_ambiguous_instructions(content);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].term.to_lowercase(), "usually");
    }

    #[test]
    fn test_find_ambiguous_if_possible() {
        let content = "Include tests if possible.";
        let results = find_ambiguous_instructions(content);
        assert_eq!(results.len(), 1);
        assert!(results[0].term.to_lowercase().contains("if possible"));
    }

    #[test]
    fn test_find_multiple_ambiguous() {
        let content = r#"Usually do X.
Sometimes do Y.
When appropriate, do Z.
"#;
        let results = find_ambiguous_instructions(content);
        assert_eq!(results.len(), 3);
    }

    #[test]
    fn test_skip_code_blocks() {
        let content = r#"```rust
// Usually this is fine in comments
fn usually_called() {}
```"#;
        let results = find_ambiguous_instructions(content);
        assert!(results.is_empty());
    }

    #[test]
    fn test_skip_multiline_code_blocks() {
        let content = r#"Some text here.
```
function usually_runs() {
// usually in code
}
```
More text after."#;
        let results = find_ambiguous_instructions(content);
        assert!(results.is_empty());
    }

    #[test]
    fn test_no_ambiguous_in_clear_instructions() {
        let content = r#"# Rules
Always format output as JSON.
Never include sensitive data.
"#;
        let results = find_ambiguous_instructions(content);
        assert!(results.is_empty());
    }

    #[test]
    fn test_ambiguous_context_captured() {
        let content = "This rule is generally applicable to all files.";
        let results = find_ambiguous_instructions(content);
        assert_eq!(results.len(), 1);
        assert!(results[0].context.contains("generally"));
    }

    // ---- Minimum-length boundary of find_critical_in_middle_pe ----

    /// Ten lines is the smallest document that is analysed (only fewer than ten lines are
    /// skipped), so a keyword at index 5 (50%) must be reported.
    #[test]
    fn test_pe_001_exactly_ten_lines_boundary() {
        let mut lines: Vec<String> = (0..10).map(|i| format!("Line {}", i)).collect();
        lines[5] = "This is critical information.".to_string();
        let content = lines.join("\n");
        let results = find_critical_in_middle_pe(&content);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].line, 6);
    }

    /// Nine lines is below the minimum, so even a keyword in the middle (index 4, about
    /// 44%) must not be reported.
    #[test]
    fn test_pe_001_nine_lines_under_minimum() {
        let mut lines: Vec<String> = (0..9).map(|i| format!("Line {}", i)).collect();
        lines[4] = "This is critical information.".to_string();
        let content = lines.join("\n");
        let results = find_critical_in_middle_pe(&content);
        assert!(results.is_empty());
    }

    #[test]
    fn test_pe_001_eleven_lines_just_above_minimum() {
        let mut lines: Vec<String> = (0..11).map(|i| format!("Line {}", i)).collect();
        lines[5] = "This is critical information at 45%.".to_string();
        let content = lines.join("\n");
        let results = find_critical_in_middle_pe(&content);
        assert_eq!(results.len(), 1);
        assert!(results[0].position_percent >= 40.0 && results[0].position_percent <= 60.0);
    }

    // ---- Heading detection of find_weak_imperative_language ----

    #[test]
    fn test_pe_003_word_boundary_hypercritical() {
        let content = r#"# Hypercritical Information
You should do X.
"#;
        let results = find_weak_imperative_language(content);
        assert!(
            results.is_empty(),
            "Hypercritical should not match critical with word boundaries"
        );
    }

    #[test]
    fn test_pe_003_critical_case_insensitive() {
        let content = r#"# CRITICAL INFORMATION
You should do X.
"#;
        let results = find_weak_imperative_language(content);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].weak_term.to_lowercase(), "should");
    }

    #[test]
    fn test_pe_003_important_header_detected() {
        let content = r#"# Important Configuration
You should enable this.
"#;
        let results = find_weak_imperative_language(content);
        assert_eq!(results.len(), 1);
        assert!(results[0].section_name.to_lowercase().contains("important"));
    }

    #[test]
    fn test_pe_003_required_header_detected() {
        let content = r#"# Required Fields
Code could be cleaner.
"#;
        let results = find_weak_imperative_language(content);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].weak_term.to_lowercase(), "could");
    }

    // ---- Line filters of find_ambiguous_instructions ----

    #[test]
    fn test_pe_004_inline_code_backticks_still_flagged() {
        let content = "Format with `usually` for clarity.";
        let results = find_ambiguous_instructions(content);
        assert!(!results.is_empty());
    }

    #[test]
    fn test_pe_004_comment_line_skipped() {
        let content = "// Usually this is in a comment";
        let results = find_ambiguous_instructions(content);
        assert!(results.is_empty());
    }

    #[test]
    fn test_pe_004_shebang_skipped() {
        let content = "#!/usr/bin/env usually";
        let results = find_ambiguous_instructions(content);
        assert!(results.is_empty());
    }

    /// A term enclosed in parentheses on its line is treated as an aside and not reported.
    #[test]
    fn test_pe_004_parenthetical_term_skipped() {
        let content = "Use caching (usually faster) here.";
        let results = find_ambiguous_instructions(content);
        assert!(results.is_empty());
    }

    // ---- Degenerate inputs ----

    #[test]
    fn test_empty_string_all_validators() {
        let empty = "";
        let critical = find_critical_in_middle_pe(empty);
        let cot = find_cot_on_simple_tasks(empty);
        let weak = find_weak_imperative_language(empty);
        let ambiguous = find_ambiguous_instructions(empty);
        assert!(
            critical.is_empty(),
            "Empty content should have no critical in middle"
        );
        assert!(cot.is_empty(), "Empty content should have no CoT issues");
        assert!(
            weak.is_empty(),
            "Empty content should have no weak language"
        );
        assert!(
            ambiguous.is_empty(),
            "Empty content should have no ambiguous terms"
        );
    }

    #[test]
    fn test_single_line_all_validators() {
        let single = "This is critical.";
        let critical = find_critical_in_middle_pe(single);
        let cot = find_cot_on_simple_tasks(single);
        let weak = find_weak_imperative_language(single);
        let ambiguous = find_ambiguous_instructions(single);
        assert!(critical.is_empty());
        assert!(cot.is_empty());
        assert!(weak.is_empty());
        assert!(ambiguous.is_empty());
    }

    // ---- Byte offsets with CRLF line endings ----

    #[test]
    fn test_pe_003_crlf_byte_offsets() {
        let content = "# Critical Rules\r\n\r\nYou should follow the style.\r\nAnother line.\r\n";
        let results = find_weak_imperative_language(content);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].weak_term.to_lowercase(), "should");
        let found =
            &content[results[0].byte_offset..results[0].byte_offset + results[0].weak_term.len()];
        assert_eq!(
            found.to_lowercase(),
            results[0].weak_term.to_lowercase(),
            "Byte offset should correctly point to the weak term in CRLF content"
        );
    }

    #[test]
    fn test_pe_005_crlf_byte_offsets() {
        let content = "# Rules\r\n\r\nBe helpful in all interactions.\r\nAnother line.\r\n";
        let results = find_redundant_instructions(content);
        assert_eq!(results.len(), 1);
        let found = &content[results[0].byte_offset..results[0].byte_offset + results[0].byte_len];
        assert_eq!(
            found.to_lowercase(),
            results[0].phrase.to_lowercase(),
            "Byte offset should correctly point to the redundant phrase in CRLF content"
        );
    }

    // ---- Oversized input is skipped ----

    #[test]
    fn test_find_critical_in_middle_oversized_input() {
        let large_content = "a".repeat(MAX_REGEX_INPUT_SIZE + 1000);
        let results = find_critical_in_middle_pe(&large_content);
        assert!(
            results.is_empty(),
            "Oversized content should be skipped for ReDoS protection"
        );
    }

    #[test]
    fn test_find_cot_on_simple_tasks_oversized_input() {
        let large_content = "a".repeat(MAX_REGEX_INPUT_SIZE + 1000);
        let results = find_cot_on_simple_tasks(&large_content);
        assert!(
            results.is_empty(),
            "Oversized content should be skipped for ReDoS protection"
        );
    }

    #[test]
    fn test_find_weak_imperative_language_oversized_input() {
        let large_content = "a".repeat(MAX_REGEX_INPUT_SIZE + 1000);
        let results = find_weak_imperative_language(&large_content);
        assert!(
            results.is_empty(),
            "Oversized content should be skipped for ReDoS protection"
        );
    }

    #[test]
    fn test_find_ambiguous_instructions_oversized_input() {
        let large_content = "a".repeat(MAX_REGEX_INPUT_SIZE + 1000);
        let results = find_ambiguous_instructions(&large_content);
        assert!(
            results.is_empty(),
            "Oversized content should be skipped for ReDoS protection"
        );
    }

    // ---- find_redundant_instructions ----

    #[test]
    fn test_find_redundant_be_helpful() {
        let content = "Be helpful and accurate when responding.";
        let results = find_redundant_instructions(content);
        assert_eq!(results.len(), 1);
        assert!(results[0].phrase.to_lowercase().contains("be helpful"));
    }

    #[test]
    fn test_find_redundant_be_concise() {
        let content = "# Rules\n\nBe concise in your responses.";
        let results = find_redundant_instructions(content);
        assert_eq!(results.len(), 1);
        assert!(results[0].phrase.to_lowercase().contains("be concise"));
    }

    #[test]
    fn test_find_multiple_redundant() {
        let content = "Be helpful.\nBe accurate.\nBe concise.";
        let results = find_redundant_instructions(content);
        assert_eq!(results.len(), 3);
    }

    #[test]
    fn test_no_redundant_in_specific_instructions() {
        let content =
            "Format all output as JSON with 2-space indentation.\nAlways include error codes.";
        let results = find_redundant_instructions(content);
        assert!(results.is_empty());
    }

    #[test]
    fn test_redundant_skips_code_blocks() {
        let content = "```\nBe helpful and accurate.\n```";
        let results = find_redundant_instructions(content);
        assert!(results.is_empty());
    }

    #[test]
    fn test_redundant_case_insensitive() {
        let content = "BE HELPFUL in all interactions.";
        let results = find_redundant_instructions(content);
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_find_redundant_instructions_oversized_input() {
        let large_content = "a".repeat(MAX_REGEX_INPUT_SIZE + 1000);
        let results = find_redundant_instructions(&large_content);
        assert!(
            results.is_empty(),
            "Oversized content should be skipped for ReDoS protection"
        );
    }

    // ---- find_negative_only_instructions ----

    #[test]
    fn test_find_negative_only_dont() {
        let content = "Don't use global variables.";
        let results = find_negative_only_instructions(content);
        assert_eq!(results.len(), 1);
        assert!(results[0].text.contains("Don't"));
    }

    #[test]
    fn test_find_negative_only_never() {
        let content = "Never use console.log in production.";
        let results = find_negative_only_instructions(content);
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_find_negative_only_avoid() {
        let content = "Avoid inline styles.";
        let results = find_negative_only_instructions(content);
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_negative_with_positive_alternative() {
        let content = "Don't use global variables. Instead, pass values as function parameters.";
        let results = find_negative_only_instructions(content);
        assert!(
            results.is_empty(),
            "Should not flag when positive alternative is on same line"
        );
    }

    #[test]
    fn test_negative_with_positive_on_next_line() {
        let content = "Don't use global variables.\nInstead, pass values as function parameters.";
        let results = find_negative_only_instructions(content);
        assert!(
            results.is_empty(),
            "Should not flag when positive alternative is within 2 lines"
        );
    }

    #[test]
    fn test_negative_with_positive_two_lines_away() {
        let content =
            "Don't use global variables.\nSome explanation.\nUse function parameters instead.";
        let results = find_negative_only_instructions(content);
        assert!(
            results.is_empty(),
            "Should not flag when positive alternative is within window"
        );
    }

    #[test]
    fn test_negative_without_positive_too_far() {
        let content =
            "Don't use global variables.\nLine 2.\nLine 3.\nLine 4.\nUse functions instead.";
        let results = find_negative_only_instructions(content);
        assert_eq!(
            results.len(),
            1,
            "Should flag when positive alternative is beyond 2-line window"
        );
    }

    #[test]
    fn test_negative_skips_code_blocks() {
        let content = "```\nDon't use global variables.\n```";
        let results = find_negative_only_instructions(content);
        assert!(results.is_empty());
    }

    /// Positive wording that only appears inside a code fence within the look-ahead window
    /// does not count as an alternative.
    #[test]
    fn test_negative_with_positive_only_inside_code_block() {
        let content = "Don't use globals.\n```\nuse locals instead\n```";
        let results = find_negative_only_instructions(content);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].line, 1);
    }

    #[test]
    fn test_multiple_negative_only() {
        let content = "Don't use globals.\nNever use eval.\nAvoid inline styles.";
        let results = find_negative_only_instructions(content);
        assert_eq!(results.len(), 3);
    }

    #[test]
    fn test_negative_with_list_marker() {
        let content = "- Don't use global variables.";
        let results = find_negative_only_instructions(content);
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_find_negative_only_instructions_oversized_input() {
        let large_content = "a".repeat(MAX_REGEX_INPUT_SIZE + 1000);
        let results = find_negative_only_instructions(&large_content);
        assert!(
            results.is_empty(),
            "Oversized content should be skipped for ReDoS protection"
        );
    }

    #[test]
    fn test_empty_content_pe_005_pe_006() {
        let empty = "";
        let redundant = find_redundant_instructions(empty);
        let negative = find_negative_only_instructions(empty);
        assert!(redundant.is_empty());
        assert!(negative.is_empty());
    }

    // ---- Size-limit boundary: exactly at the limit is processed, one byte over is skipped ----

    #[test]
    fn test_find_critical_in_middle_exactly_at_64kb_limit() {
        let mut lines: Vec<String> = (0..20).map(|i| format!("Line {}", i)).collect();
        lines[10] = "This is critical information.".to_string();
        let base = lines.join("\n");
        let needed = MAX_REGEX_INPUT_SIZE - base.len();
        let content = format!("{}{}", base, "a".repeat(needed));
        assert_eq!(
            content.len(),
            MAX_REGEX_INPUT_SIZE,
            "Content must be exactly at the limit"
        );
        let results = find_critical_in_middle_pe(&content);
        assert!(
            !results.is_empty(),
            "Content at exactly the limit should be processed"
        );
    }

    #[test]
    fn test_find_critical_in_middle_one_byte_over_limit() {
        let mut lines: Vec<String> = (0..20).map(|i| format!("Line {}", i)).collect();
        lines[10] = "This is critical information.".to_string();
        let base = lines.join("\n");
        let needed = MAX_REGEX_INPUT_SIZE - base.len() + 1;
        let content = format!("{}{}", base, "a".repeat(needed));
        assert_eq!(
            content.len(),
            MAX_REGEX_INPUT_SIZE + 1,
            "Content must be one byte over the limit"
        );
        let results = find_critical_in_middle_pe(&content);
        assert!(
            results.is_empty(),
            "Content one byte over the limit should be skipped"
        );
    }

    #[test]
    fn test_find_cot_on_simple_tasks_exactly_at_64kb_limit() {
        let base = "When you read the file, think step by step.\n";
        let needed = MAX_REGEX_INPUT_SIZE - base.len();
        let content = format!("{}{}", base, "a".repeat(needed));
        assert_eq!(
            content.len(),
            MAX_REGEX_INPUT_SIZE,
            "Content must be exactly at the limit"
        );
        let results = find_cot_on_simple_tasks(&content);
        assert!(
            !results.is_empty(),
            "Content at exactly the limit should be processed"
        );
    }

    #[test]
    fn test_find_cot_on_simple_tasks_one_byte_over_limit() {
        let base = "When you read the file, think step by step.\n";
        let needed = MAX_REGEX_INPUT_SIZE - base.len() + 1;
        let content = format!("{}{}", base, "a".repeat(needed));
        assert_eq!(
            content.len(),
            MAX_REGEX_INPUT_SIZE + 1,
            "Content must be one byte over the limit"
        );
        let results = find_cot_on_simple_tasks(&content);
        assert!(
            results.is_empty(),
            "Content one byte over the limit should be skipped"
        );
    }

    #[test]
    fn test_find_weak_imperative_language_exactly_at_64kb_limit() {
        let base = "# Critical Rules\nYou should follow the style.\n";
        let needed = MAX_REGEX_INPUT_SIZE - base.len();
        let content = format!("{}{}", base, "a".repeat(needed));
        assert_eq!(
            content.len(),
            MAX_REGEX_INPUT_SIZE,
            "Content must be exactly at the limit"
        );
        let results = find_weak_imperative_language(&content);
        assert!(
            !results.is_empty(),
            "Content at exactly the limit should be processed"
        );
    }

    #[test]
    fn test_find_weak_imperative_language_one_byte_over_limit() {
        let base = "# Critical Rules\nYou should follow the style.\n";
        let needed = MAX_REGEX_INPUT_SIZE - base.len() + 1;
        let content = format!("{}{}", base, "a".repeat(needed));
        assert_eq!(
            content.len(),
            MAX_REGEX_INPUT_SIZE + 1,
            "Content must be one byte over the limit"
        );
        let results = find_weak_imperative_language(&content);
        assert!(
            results.is_empty(),
            "Content one byte over the limit should be skipped"
        );
    }

    #[test]
    fn test_find_ambiguous_instructions_exactly_at_64kb_limit() {
        let base = "Usually format the output as JSON.\n";
        let needed = MAX_REGEX_INPUT_SIZE - base.len();
        let content = format!("{}{}", base, "a".repeat(needed));
        assert_eq!(
            content.len(),
            MAX_REGEX_INPUT_SIZE,
            "Content must be exactly at the limit"
        );
        let results = find_ambiguous_instructions(&content);
        assert!(
            !results.is_empty(),
            "Content at exactly the limit should be processed"
        );
    }

    #[test]
    fn test_find_ambiguous_instructions_one_byte_over_limit() {
        let base = "Usually format the output as JSON.\n";
        let needed = MAX_REGEX_INPUT_SIZE - base.len() + 1;
        let content = format!("{}{}", base, "a".repeat(needed));
        assert_eq!(
            content.len(),
            MAX_REGEX_INPUT_SIZE + 1,
            "Content must be one byte over the limit"
        );
        let results = find_ambiguous_instructions(&content);
        assert!(
            results.is_empty(),
            "Content one byte over the limit should be skipped"
        );
    }

    #[test]
    fn test_find_redundant_instructions_exactly_at_64kb_limit() {
        let base = "Be helpful in all interactions.\n";
        let needed = MAX_REGEX_INPUT_SIZE - base.len();
        let content = format!("{}{}", base, "a".repeat(needed));
        assert_eq!(
            content.len(),
            MAX_REGEX_INPUT_SIZE,
            "Content must be exactly at the limit"
        );
        let results = find_redundant_instructions(&content);
        assert!(
            !results.is_empty(),
            "Content at exactly the limit should be processed"
        );
    }

    #[test]
    fn test_find_redundant_instructions_one_byte_over_limit() {
        let base = "Be helpful in all interactions.\n";
        let needed = MAX_REGEX_INPUT_SIZE - base.len() + 1;
        let content = format!("{}{}", base, "a".repeat(needed));
        assert_eq!(
            content.len(),
            MAX_REGEX_INPUT_SIZE + 1,
            "Content must be one byte over the limit"
        );
        let results = find_redundant_instructions(&content);
        assert!(
            results.is_empty(),
            "Content one byte over the limit should be skipped"
        );
    }

    #[test]
    fn test_find_negative_only_instructions_exactly_at_64kb_limit() {
        let base = "Don't use globals.\n";
        let needed = MAX_REGEX_INPUT_SIZE - base.len();
        let content = format!("{}{}", base, "a".repeat(needed));
        assert_eq!(
            content.len(),
            MAX_REGEX_INPUT_SIZE,
            "Content must be exactly at the limit"
        );
        let results = find_negative_only_instructions(&content);
        assert!(
            !results.is_empty(),
            "Content at exactly the limit should be processed"
        );
    }

    #[test]
    fn test_find_negative_only_instructions_one_byte_over_limit() {
        let base = "Don't use globals.\n";
        let needed = MAX_REGEX_INPUT_SIZE - base.len() + 1;
        let content = format!("{}{}", base, "a".repeat(needed));
        assert_eq!(
            content.len(),
            MAX_REGEX_INPUT_SIZE + 1,
            "Content must be one byte over the limit"
        );
        let results = find_negative_only_instructions(&content);
        assert!(
            results.is_empty(),
            "Content one byte over the limit should be skipped"
        );
    }
}