#![allow(dead_code)]
/// Switches selecting which clean-up passes `sanitize` applies to a text.
///
/// Every boolean enables exactly one pass; `max_line_breaks` configures the
/// final newline-limiting pass. Values can be cloned, debug-printed and
/// compared for equality.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct SanitizeConfig {
    /// Remove `<...>` markup (see `strip_html_tags`).
    pub strip_html_tags: bool,
    /// Remove SSA-style override blocks such as `{\b1}` (see `strip_ssa_tags`).
    pub strip_ssa_tags: bool,
    /// Remove bracketed / parenthesised descriptions such as `[music]` or
    /// `(laughing)` (see `remove_hi_descriptions`).
    pub remove_hi_descriptions: bool,
    /// Collapse runs of spaces and tabs into one space (see `normalize_whitespace`).
    pub normalize_whitespace: bool,
    /// Trim leading and trailing whitespace of every line (see `trim_lines`).
    pub trim_lines: bool,
    /// Drop lines that are empty or whitespace-only (see `remove_empty_lines`).
    pub remove_empty_lines: bool,
    /// Maximum number of consecutive `\n` characters kept by `sanitize`;
    /// `0` disables the limit.
    pub max_line_breaks: usize,
}
impl Default for SanitizeConfig {
    /// Standard profile: strips HTML and SSA markup, tidies whitespace, drops
    /// empty lines and allows at most two consecutive line breaks, but keeps
    /// bracketed / parenthesised descriptions.
    fn default() -> Self {
        Self {
            // Descriptions such as "[music]" are kept by default.
            remove_hi_descriptions: false,
            // Markup removal.
            strip_html_tags: true,
            strip_ssa_tags: true,
            // Whitespace and line clean-up.
            normalize_whitespace: true,
            trim_lines: true,
            remove_empty_lines: true,
            max_line_breaks: 2,
        }
    }
}
impl SanitizeConfig {
    /// Most aggressive profile: every pass is enabled and at most one
    /// consecutive line break is kept.
    #[must_use]
    pub fn all() -> Self {
        Self {
            // Content removal.
            strip_html_tags: true,
            strip_ssa_tags: true,
            remove_hi_descriptions: true,
            // Whitespace and line clean-up.
            normalize_whitespace: true,
            trim_lines: true,
            remove_empty_lines: true,
            max_line_breaks: 1,
        }
    }

    /// Lightest profile: only whitespace normalisation and per-line trimming
    /// are enabled; markup, descriptions and empty lines are left in place and
    /// line breaks are not limited (`max_line_breaks` is `0`).
    #[must_use]
    pub fn minimal() -> Self {
        Self {
            // The only two passes that stay on.
            normalize_whitespace: true,
            trim_lines: true,
            // Everything else is switched off.
            strip_html_tags: false,
            strip_ssa_tags: false,
            remove_hi_descriptions: false,
            remove_empty_lines: false,
            max_line_breaks: 0,
        }
    }
}
/// Removes `<...>` markup from `text`, keeping everything outside the tags.
///
/// A tag is the span from a `<` up to and including the next `>`. Characters
/// that merely look like tag delimiters are preserved:
///
/// * a `>` that does not close a tag (e.g. `">> Hello"`) is kept as-is;
/// * a `<` with no `>` anywhere after it (e.g. `"I <3 you"`) is treated as
///   literal text instead of swallowing the rest of the input.
///
/// No HTML parsing is attempted: attributes, comments and entities are not
/// interpreted, and entities such as `&amp;` are left untouched.
#[must_use]
pub fn strip_html_tags(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(open) = rest.find('<') {
        let (before, candidate) = rest.split_at(open);
        result.push_str(before);

        match candidate.find('>') {
            // Skip the whole `<...>` span; '>' is a single byte in UTF-8.
            Some(close) => rest = &candidate[close + 1..],
            None => {
                // Unterminated '<': not a tag, keep the remainder verbatim.
                rest = candidate;
                break;
            }
        }
    }

    result.push_str(rest);
    result
}
/// Removes SSA-style override blocks (`{\...}`) from `text`.
///
/// A block starts at the two-character sequence `{\` and runs up to and
/// including the next `}`. If no closing `}` follows, everything from the
/// opening `{\` to the end of the input is dropped. Braces that are not
/// immediately followed by a backslash are ordinary text and are kept.
#[must_use]
pub fn strip_ssa_tags(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    let mut remaining = text;

    while let Some(start) = remaining.find("{\\") {
        // Keep the text in front of the override block.
        cleaned.push_str(&remaining[..start]);

        let block = &remaining[start..];
        remaining = match block.find('}') {
            // Resume right after the closing brace.
            Some(end) => &block[end + 1..],
            // Unterminated block: the rest of the input is discarded.
            None => "",
        };
    }

    cleaned.push_str(remaining);
    cleaned
}
/// Removes bracketed (`[...]`) and parenthesised (`(...)`) descriptions,
/// including nested ones, from `text`.
///
/// Everything between an opening bracket and its matching closer is dropped
/// together with the brackets themselves. Surrounding whitespace is not
/// touched, so `"[music] Hello"` becomes `" Hello"`.
///
/// A closing bracket with no matching opener is kept when it appears in
/// ordinary text, but dropped when it appears inside a description of the
/// other kind (e.g. the `)` in `"[sighs :)]"`), so fragments of a removed
/// description never leak into the output. An opener that is never closed
/// removes the rest of the input.
#[must_use]
pub fn remove_hi_descriptions(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    let mut depth_square = 0usize;
    let mut depth_paren = 0usize;

    for ch in text.chars() {
        // Evaluated before this character changes the depth.
        let inside_description = depth_square > 0 || depth_paren > 0;
        match ch {
            '[' => depth_square += 1,
            '(' => depth_paren += 1,
            ']' if depth_square > 0 => depth_square -= 1,
            ')' if depth_paren > 0 => depth_paren -= 1,
            // Any other character inside a description, including an unmatched
            // closer of the other bracket kind, is part of the removed content.
            _ if inside_description => {}
            _ => result.push(ch),
        }
    }

    result
}
/// Collapses every run of spaces and tabs into a single space.
///
/// Only `' '` and `'\t'` count as blanks; newlines and all other characters
/// are copied unchanged and end the current run.
#[must_use]
pub fn normalize_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    let mut in_blank_run = false;

    for ch in text.chars() {
        let is_blank = matches!(ch, ' ' | '\t');
        // Only the first blank of a run is emitted, always as a space.
        if !(is_blank && in_blank_run) {
            collapsed.push(if is_blank { ' ' } else { ch });
        }
        in_blank_run = is_blank;
    }

    collapsed
}
/// Trims leading and trailing whitespace from every line of `text`.
///
/// Lines are split with [`str::lines`] and re-joined with `\n`, so `\r\n`
/// line endings become `\n` and a trailing line terminator is not preserved.
#[must_use]
pub fn trim_lines(text: &str) -> String {
    let mut trimmed = String::with_capacity(text.len());

    for (index, line) in text.lines().enumerate() {
        // Separator goes between lines, never before the first one.
        if index > 0 {
            trimmed.push('\n');
        }
        trimmed.push_str(line.trim());
    }

    trimmed
}
/// Drops every line of `text` that is empty or contains only whitespace.
///
/// Kept lines are copied unmodified (they are not trimmed) and joined with
/// `\n`; a trailing line terminator is not preserved.
#[must_use]
pub fn remove_empty_lines(text: &str) -> String {
    let mut kept = String::with_capacity(text.len());

    for line in text.lines() {
        if line.trim().is_empty() {
            continue;
        }
        // Every kept line is non-empty, so a non-empty buffer means at least
        // one line has already been written and a separator is needed.
        if !kept.is_empty() {
            kept.push('\n');
        }
        kept.push_str(line);
    }

    kept
}
/// Caps every run of consecutive `\n` characters in `text` at `max`.
///
/// Newlines beyond the `max`-th in a run are dropped; any other character
/// (including `\r`) ends the run and is copied unchanged. A `max` of `0`
/// disables the limit and returns `text` unchanged.
///
/// No memory proportional to `max` is allocated, so arbitrarily large limits
/// (up to `usize::MAX`) are safe.
#[must_use]
pub fn limit_line_breaks(text: &str, max: usize) -> String {
    if max == 0 {
        return text.to_string();
    }

    let mut result = String::with_capacity(text.len());
    let mut consecutive_newlines = 0usize;

    for ch in text.chars() {
        if ch == '\n' {
            consecutive_newlines += 1;
            if consecutive_newlines > max {
                // Surplus newline in the current run.
                continue;
            }
        } else {
            consecutive_newlines = 0;
        }
        result.push(ch);
    }

    result
}
#[must_use]
pub fn sanitize(text: &str, config: &SanitizeConfig) -> String {
let mut result = text.to_string();
if config.strip_html_tags {
result = strip_html_tags(&result);
}
if config.strip_ssa_tags {
result = strip_ssa_tags(&result);
}
if config.remove_hi_descriptions {
result = remove_hi_descriptions(&result);
}
if config.normalize_whitespace {
result = normalize_whitespace(&result);
}
if config.trim_lines {
result = trim_lines(&result);
}
if config.remove_empty_lines {
result = remove_empty_lines(&result);
}
if config.max_line_breaks > 0 {
result = limit_line_breaks(&result, config.max_line_breaks);
}
result
}
/// Reports whether `text` shows tell-tale signs of a text-encoding mix-up.
///
/// The check is a plain substring search for:
///
/// * the Unicode replacement character U+FFFD, and
/// * sequences that appear when UTF-8 bytes are decoded as
///   Latin-1 / Windows-1252: U+00C3 followed by U+00A9, U+00A8, U+00BC,
///   U+00B6 or U+00A4 (the garbled forms of `é`, `è`, `ü`, `ö`, `ä`), and a
///   stray U+00C2.
///
/// Correctly encoded accented letters such as `é` or `ü` are NOT reported.
///
/// NOTE(review): U+00C2 is also a legitimate letter in several languages, so
/// a match on it alone is only a hint; it is kept because the original
/// marker list contained it.
#[must_use]
pub fn has_encoding_issues(text: &str) -> bool {
    // Written as escapes so the markers cannot be silently "repaired" or
    // re-garbled when this source file itself is re-encoded.
    const MOJIBAKE_MARKERS: [&str; 7] = [
        "\u{FFFD}",         // replacement character
        "\u{00C3}\u{00A9}", // UTF-8 bytes of 'é' read as Latin-1
        "\u{00C3}\u{00A8}", // UTF-8 bytes of 'è' read as Latin-1
        "\u{00C3}\u{00BC}", // UTF-8 bytes of 'ü' read as Latin-1
        "\u{00C3}\u{00B6}", // UTF-8 bytes of 'ö' read as Latin-1
        "\u{00C3}\u{00A4}", // UTF-8 bytes of 'ä' read as Latin-1
        "\u{00C2}",         // stray lead byte of a two-byte UTF-8 sequence
    ];

    MOJIBAKE_MARKERS.iter().any(|&marker| text.contains(marker))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_strip_html_tags_basic() {
        assert_eq!(strip_html_tags("<i>italic</i>"), "italic");
        assert_eq!(strip_html_tags("<b>bold</b> text"), "bold text");
    }

    #[test]
    fn test_strip_html_tags_no_tags() {
        assert_eq!(strip_html_tags("plain text"), "plain text");
    }

    #[test]
    fn test_strip_html_tags_nested() {
        assert_eq!(strip_html_tags("<b><i>nested</i></b>"), "nested");
    }

    #[test]
    fn test_strip_ssa_tags() {
        assert_eq!(strip_ssa_tags("{\\b1}bold{\\b0}"), "bold");
        assert_eq!(strip_ssa_tags("{\\an8}top text"), "top text");
    }

    #[test]
    fn test_strip_ssa_tags_no_tags() {
        assert_eq!(strip_ssa_tags("no tags here"), "no tags here");
    }

    #[test]
    fn test_remove_hi_descriptions() {
        assert_eq!(remove_hi_descriptions("[music] Hello"), " Hello");
        assert_eq!(remove_hi_descriptions("(laughing) Ha!"), " Ha!");
    }

    #[test]
    fn test_remove_hi_nested() {
        assert_eq!(remove_hi_descriptions("[outer [inner]]"), "");
    }

    #[test]
    fn test_normalize_whitespace() {
        assert_eq!(normalize_whitespace("hello world"), "hello world");
        assert_eq!(normalize_whitespace("a\t\tb"), "a b");
    }

    #[test]
    fn test_trim_lines() {
        assert_eq!(trim_lines(" hello \n world "), "hello\nworld");
    }

    #[test]
    fn test_remove_empty_lines() {
        assert_eq!(remove_empty_lines("a\n\nb\n\nc"), "a\nb\nc");
    }

    #[test]
    fn test_limit_line_breaks() {
        assert_eq!(limit_line_breaks("a\n\n\nb", 1), "a\nb");
        assert_eq!(limit_line_breaks("a\n\n\nb", 2), "a\n\nb");
    }

    #[test]
    fn test_sanitize_full() {
        let config = SanitizeConfig::all();
        let input = "<i>Hello</i> [music] world ";
        let result = sanitize(input, &config);
        assert_eq!(result, "Hello world");
    }

    #[test]
    fn test_has_encoding_issues_clean() {
        assert!(!has_encoding_issues("Hello world"));
    }

    #[test]
    fn test_has_encoding_issues_mojibake() {
        // "Café" whose UTF-8 bytes were decoded as Latin-1.
        assert!(has_encoding_issues("Caf\u{00C3}\u{00A9} latte"));
        // "£5" whose UTF-8 bytes were decoded as Latin-1.
        assert!(has_encoding_issues("\u{00C2}\u{00A3}5"));
        assert!(has_encoding_issues("replacement \u{FFFD} char"));
    }

    #[test]
    fn test_has_encoding_issues_valid_accents() {
        // Correctly encoded accented text must not be reported.
        assert!(!has_encoding_issues("Caf\u{00E9} latte"));
        assert!(!has_encoding_issues("\u{00DC}ber gr\u{00F6}\u{00DF}e, \u{00E4}hnlich"));
    }
}