ass_core/utils/utf8/
mod.rs1mod bom;
37mod encoding;
38mod normalization;
39mod validation;
40
41pub use bom::{detect_bom, strip_bom, BomType};
43pub use encoding::{detect_encoding, is_likely_ass_content, EncodingInfo};
44pub use normalization::{
45 normalize_line_endings, normalize_whitespace, remove_control_chars, trim_lines,
46};
47pub use validation::{
48 count_replacement_chars, is_valid_ass_text, recover_utf8, truncate_at_char_boundary,
49 validate_utf8,
50};
51
#[cfg(test)]
mod tests {
    //! Integration-style tests that drive the re-exported helpers through the
    //! module's public surface, individually and chained together.

    use super::*;

    /// A leading BOM is stripped from string input and detected in raw bytes.
    #[test]
    fn integration_bom_detection() {
        let text_with_bom = "\u{FEFF}Hello World";
        let (stripped, had_bom) = strip_bom(text_with_bom);
        assert_eq!(stripped, "Hello World");
        assert!(had_bom);

        // EF BB BF is the UTF-8 encoding of U+FEFF, hence the expected skip of 3 bytes.
        let bytes = &[0xEF, 0xBB, 0xBF, b'H', b'i'];
        let (bom_type, skip) = detect_bom(bytes).expect("UTF-8 BOM prefix should be detected");
        assert_eq!(bom_type, BomType::Utf8);
        assert_eq!(skip, 3);
    }

    /// Plain script text without a BOM is reported as high-confidence UTF-8, and
    /// the content heuristic tells a script header apart from ordinary prose.
    #[test]
    fn integration_encoding_detection() {
        let text = "[Script Info]\nTitle: Test Script";
        let encoding = detect_encoding(text.as_bytes());
        assert_eq!(encoding.encoding, "UTF-8");
        assert!(encoding.confidence > 0.9);
        assert!(!encoding.has_bom);

        assert!(is_likely_ass_content(text));
        assert!(!is_likely_ass_content("Just regular text"));
    }

    /// Valid multi-byte text passes validation; an invalid byte is recovered as
    /// exactly one replacement character.
    #[test]
    fn integration_validation_and_recovery() {
        let valid_text = "Hello, 世界! 🎵";
        assert!(validate_utf8(valid_text.as_bytes()).is_ok());

        // 0xFF can never occur in well-formed UTF-8.
        let invalid_bytes = &[b'H', b'i', 0xFF, b'!'];
        let (recovered, replacements) = recover_utf8(invalid_bytes);
        // U+FFFD is written as an escape so the expectation survives re-encoding of this file.
        assert_eq!(recovered, "Hi\u{FFFD}!");
        assert_eq!(replacements, 1);

        assert_eq!(count_replacement_chars(&recovered), 1);
    }

    /// Line endings are unified to `\n`, whitespace is normalized, and control
    /// characters other than the newline are removed.
    #[test]
    fn integration_normalization() {
        let input = "Line 1\r\nLine 2\rLine 3\n";
        let normalized = normalize_line_endings(input);
        assert_eq!(normalized, "Line 1\nLine 2\nLine 3\n");

        // NOTE(review): this input contains only single spaces, so the assertion
        // merely confirms already-normalized text is left unchanged. Consider
        // adding a case with repeated or mixed whitespace.
        let whitespace_text = "Hello World Test";
        let normalized_ws = normalize_whitespace(whitespace_text, true);
        assert_eq!(normalized_ws, "Hello World Test");

        let input_with_control = "Hello\x00World\x1FTest\nValid";
        let cleaned = remove_control_chars(input_with_control);
        assert_eq!(cleaned, "HelloWorldTest\nValid");
    }

    /// Printable text, tab/newline and non-ASCII characters are accepted;
    /// NUL and U+001F are rejected.
    #[test]
    fn integration_text_validation() {
        assert!(is_valid_ass_text("Hello World"));
        assert!(is_valid_ass_text("Hello\tWorld\n"));
        assert!(is_valid_ass_text("Hello 世界"));
        assert!(!is_valid_ass_text("Hello\x00World"));
        assert!(!is_valid_ass_text("Hello\x1FWorld"));
    }

    /// Truncation never splits a multi-byte character and reports whether any
    /// text was cut off.
    #[test]
    fn integration_truncation() {
        let text = "Hello World";
        let (truncated, was_truncated) = truncate_at_char_boundary(text, 5);
        assert_eq!(truncated, "Hello");
        assert!(was_truncated);

        // '世' occupies bytes 6..9, so a limit of 8 lands inside it; the expected
        // result backs up to the preceding boundary at byte 6.
        let unicode_text = "Hello 世界";
        let (truncated, was_truncated) = truncate_at_char_boundary(unicode_text, 8);
        assert_eq!(truncated, "Hello ");
        assert!(was_truncated);

        let short_text = "Hi";
        let (truncated, was_truncated) = truncate_at_char_boundary(short_text, 10);
        assert_eq!(truncated, "Hi");
        assert!(!was_truncated);
    }

    /// Chains the helpers in order: strip BOM, unify line endings, drop control
    /// characters, normalize whitespace.
    #[test]
    fn integration_full_workflow() {
        let input =
            "\u{FEFF}[Script Info]\r\nTitle: Test\x00Script\r\n\r\n[Events]\nDialogue: Hello World";

        let (without_bom, had_bom) = strip_bom(input);
        assert!(had_bom);
        assert!(!without_bom.starts_with('\u{FEFF}'));

        let normalized = normalize_line_endings(without_bom);

        let cleaned = remove_control_chars(&normalized);

        let final_text = normalize_whitespace(&cleaned, true);

        assert!(final_text.contains("[Script Info]"));
        // The embedded NUL must be gone, joining "Test" and "Script".
        assert!(final_text.contains("Title: TestScript"));
        assert!(!final_text.contains('\r'));
        assert!(final_text.contains("Dialogue: Hello World"));
    }
}