/// Default size limit for validated input, in bytes (100 KiB); longer input
/// is reported as an error.
const MAX_INPUT_BYTES: usize = 100 * 1024;

/// Share of whitespace characters (0.0–1.0) that must be exceeded before the
/// whitespace warning is issued.
const WHITESPACE_RATIO_THRESHOLD: f64 = 0.9;

/// Longest run of one identical character that is tolerated without a
/// repetition warning.
const MAX_CONSECUTIVE_REPEATS: usize = 20;
/// Outcome of validating one input: an overall verdict plus the messages
/// collected along the way.
#[derive(Debug, Clone)]
pub struct ValidationResult {
    /// `true` until the first error is recorded; warnings never change it.
    pub valid: bool,
    /// Non-fatal findings, in the order they were recorded.
    pub warnings: Vec<String>,
    /// Fatal findings, in the order they were recorded.
    pub errors: Vec<String>,
}

impl ValidationResult {
    /// Builds a passing result that carries no warnings and no errors.
    fn ok() -> Self {
        let (warnings, errors) = (Vec::new(), Vec::new());
        Self {
            valid: true,
            warnings,
            errors,
        }
    }

    /// Marks the result invalid and appends `msg` to the error list.
    fn add_error(&mut self, msg: impl Into<String>) {
        self.valid = false;
        let message: String = msg.into();
        self.errors.push(message);
    }

    /// Appends `msg` to the warning list; the verdict is left untouched.
    fn add_warning(&mut self, msg: impl Into<String>) {
        let message: String = msg.into();
        self.warnings.push(message);
    }
}
/// Runs a fixed battery of sanity checks over text input.
///
/// Oversized input and embedded NUL characters are reported as errors and
/// make the result invalid. A high whitespace share, long runs of one
/// character, and unusual control characters are reported as warnings only.
pub struct ContentValidator {
    /// Maximum accepted input size, in bytes.
    max_bytes: usize,
}

impl ContentValidator {
    /// Creates a validator whose size limit is `MAX_INPUT_BYTES`.
    pub fn new() -> Self {
        Self {
            max_bytes: MAX_INPUT_BYTES,
        }
    }

    /// Runs every check against `input` and returns the combined findings.
    ///
    /// The checks do not short-circuit: all of them run even after an error
    /// has been recorded, so the result may carry errors and warnings at the
    /// same time.
    pub fn validate(&self, input: &str) -> ValidationResult {
        let mut result = ValidationResult::ok();
        self.check_length(input, &mut result);
        self.check_null_bytes(input, &mut result);
        self.check_whitespace_ratio(input, &mut result);
        self.check_repetition(input, &mut result);
        self.check_control_characters(input, &mut result);
        result
    }

    /// Records an error when `input` is longer than `max_bytes`.
    ///
    /// The comparison uses the UTF-8 byte length, not the character count;
    /// input of exactly `max_bytes` bytes is accepted.
    fn check_length(&self, input: &str, result: &mut ValidationResult) {
        if input.len() > self.max_bytes {
            result.add_error(format!(
                "Input exceeds maximum length: {} bytes (limit: {} bytes)",
                input.len(),
                self.max_bytes,
            ));
        }
    }

    /// Records an error when `input` contains at least one NUL character.
    fn check_null_bytes(&self, input: &str, result: &mut ValidationResult) {
        if input.contains('\0') {
            result.add_error("Input contains null byte(s)");
        }
    }

    /// Records a warning when the share of whitespace characters (as defined
    /// by `char::is_whitespace`) is strictly above
    /// `WHITESPACE_RATIO_THRESHOLD`.
    fn check_whitespace_ratio(&self, input: &str, result: &mut ValidationResult) {
        // Empty input has no ratio; bail out before dividing by zero.
        if input.is_empty() {
            return;
        }

        // Count both totals in a single pass so the input is decoded once.
        let (total, whitespace) = input
            .chars()
            .fold((0usize, 0usize), |(total, whitespace), c| {
                (total + 1, whitespace + usize::from(c.is_whitespace()))
            });

        let ratio = whitespace as f64 / total as f64;
        if ratio > WHITESPACE_RATIO_THRESHOLD {
            result.add_warning(format!(
                "Input is {:.0}% whitespace ({} of {} characters)",
                ratio * 100.0,
                whitespace,
                total,
            ));
        }
    }

    /// Records a warning for the first run of one identical character that is
    /// longer than `MAX_CONSECUTIVE_REPEATS`.
    ///
    /// The run is measured to its end before it is reported, so the count in
    /// the message is the real length of the run rather than the point at
    /// which the threshold was crossed. At most one warning is added.
    fn check_repetition(&self, input: &str, result: &mut ValidationResult) {
        let mut chars = input.chars().peekable();
        while let Some(current) = chars.next() {
            // Consume the remainder of the run that starts at `current`.
            let mut run: usize = 1;
            while chars.next_if_eq(&current).is_some() {
                run += 1;
            }

            if run > MAX_CONSECUTIVE_REPEATS {
                result.add_warning(format!(
                    "Character {:?} repeats {} consecutive times (threshold: {})",
                    current, run, MAX_CONSECUTIVE_REPEATS,
                ));
                // Only the first offending run is reported.
                return;
            }
        }
    }

    /// Records a warning listing each distinct byte of `input` that
    /// `is_unusual_control` flags.
    ///
    /// Scanning bytes instead of characters is sound here: the flagged values
    /// are all below 0x80, and in UTF-8 such bytes only ever encode the ASCII
    /// character with the same value.
    fn check_control_characters(&self, input: &str, result: &mut ValidationResult) {
        // A BTreeSet deduplicates and yields the bytes in ascending order, so
        // the warning text is identical from run to run.
        let found: Vec<u8> = input
            .bytes()
            .filter(|&b| is_unusual_control(b))
            .collect::<std::collections::BTreeSet<u8>>()
            .into_iter()
            .collect();

        if !found.is_empty() {
            result.add_warning(format!(
                "Input contains unusual control character(s): {:?}",
                found,
            ));
        }
    }
}
impl Default for ContentValidator {
fn default() -> Self {
Self::new()
}
}
/// Reports whether `b` is an ASCII control byte other than the common
/// whitespace controls.
///
/// Flags `0x00..=0x08` and `0x0E..=0x1F`. The range `0x09..=0x0D` (tab, line
/// feed, vertical tab, form feed, carriage return) and every byte from `0x20`
/// upwards are not flagged.
// NOTE(review): DEL (0x7F) is not flagged here — confirm that is intended.
fn is_unusual_control(b: u8) -> bool {
    match b {
        0x09..=0x0D => false,
        _ => b < 0x20,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Validates `text` with a validator built by `ContentValidator::new`.
    fn check(text: &str) -> ValidationResult {
        ContentValidator::new().validate(text)
    }

    /// Whether any recorded warning contains `needle`.
    fn warns_about(outcome: &ValidationResult, needle: &str) -> bool {
        outcome.warnings.iter().any(|w| w.contains(needle))
    }

    /// Whether any recorded error contains `needle`.
    fn fails_with(outcome: &ValidationResult, needle: &str) -> bool {
        outcome.errors.iter().any(|e| e.contains(needle))
    }

    // --- length -----------------------------------------------------------

    #[test]
    fn test_length_under_limit() {
        let outcome = check(&"a".repeat(1_000));
        assert!(outcome.valid);
        assert!(outcome.errors.is_empty());
    }

    #[test]
    fn test_length_exactly_at_limit() {
        let outcome = check(&"x".repeat(MAX_INPUT_BYTES));
        assert!(outcome.valid, "exactly at limit should be valid");
        assert!(outcome.errors.is_empty());
    }

    #[test]
    fn test_length_over_limit() {
        let outcome = check(&"y".repeat(MAX_INPUT_BYTES + 1));
        assert!(!outcome.valid);
        assert!(fails_with(&outcome, "exceeds maximum"));
    }

    // --- null bytes -------------------------------------------------------

    #[test]
    fn test_null_byte_detected() {
        let outcome = check("hello\0world");
        assert!(!outcome.valid);
        assert!(fails_with(&outcome, "null byte"));
    }

    #[test]
    fn test_no_null_bytes() {
        let outcome = check("hello world");
        assert!(outcome.valid);
    }

    // --- whitespace ratio -------------------------------------------------

    #[test]
    fn test_high_whitespace_ratio() {
        // 95 spaces followed by 5 letters: 95% whitespace.
        let mut text = " ".repeat(95);
        text.push_str("abcde");

        let outcome = check(&text);
        assert!(outcome.valid, "whitespace is a warning, not an error");
        assert!(warns_about(&outcome, "whitespace"));
    }

    #[test]
    fn test_normal_whitespace_ratio() {
        let outcome = check("The quick brown fox jumps over the lazy dog");
        assert!(outcome.valid);
        assert!(
            !warns_about(&outcome, "whitespace"),
            "normal text should not trigger whitespace warning"
        );
    }

    // --- repetition -------------------------------------------------------

    #[test]
    fn test_excessive_repetition() {
        let outcome = check(&"a".repeat(25));
        assert!(outcome.valid, "repetition is a warning, not an error");
        assert!(warns_about(&outcome, "repeats"));
    }

    #[test]
    fn test_acceptable_repetition() {
        let outcome = check(&"a".repeat(20));
        assert!(!warns_about(&outcome, "repeats"));
    }

    // --- control characters -----------------------------------------------

    #[test]
    fn test_unusual_control_char() {
        // U+0001 sits between the two words.
        let outcome = check("hello\u{1}world");
        assert!(outcome.valid, "control chars produce warnings, not errors");
        assert!(warns_about(&outcome, "control character"));
    }

    #[test]
    fn test_normal_control_chars_allowed() {
        let outcome = check("line1\n\tindented\r\nline2");
        assert!(
            !warns_about(&outcome, "control character"),
            "common whitespace controls should not trigger warning"
        );
    }

    // --- whole-input scenarios --------------------------------------------

    #[test]
    fn test_clean_input_passes() {
        let outcome = check("Hello, how are you today?");
        assert!(outcome.valid);
        assert!(outcome.warnings.is_empty());
        assert!(outcome.errors.is_empty());
    }

    #[test]
    fn test_empty_input_passes() {
        let outcome = check("");
        assert!(outcome.valid);
        assert!(outcome.warnings.is_empty());
        assert!(outcome.errors.is_empty());
    }

    #[test]
    fn test_multiple_issues() {
        // Mostly whitespace AND containing a NUL: one warning source, one error source.
        let mut text = " ".repeat(95);
        text.push('\0');
        text.push_str("abcde");

        let outcome = check(&text);
        assert!(!outcome.valid, "null byte should make it invalid");
        assert!(
            !outcome.warnings.is_empty(),
            "should also have whitespace warning"
        );
        assert!(!outcome.errors.is_empty(), "should have null byte error");
    }
}