1use crate::content::{Cell, Code, Column, Content, Math, Media, MediaSource, Row, Table, Text};
8use unicode_normalization::UnicodeNormalization;
9
10#[derive(Debug, Clone, Copy, Default)]
12pub struct NormalizationConfig {
13 pub unicode_form: UnicodeForm,
15 pub whitespace: WhitespaceNorm,
17 pub line_endings: LineEndingNorm,
19}
20
/// Unicode normalization form selectable through [`NormalizationConfig`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum UnicodeForm {
    /// Canonical composition (the default).
    #[default]
    NFC,
    /// Canonical decomposition.
    NFD,
    /// Compatibility composition.
    NFKC,
    /// Compatibility decomposition.
    NFKD,
}
34
/// Whitespace handling selectable through [`NormalizationConfig`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum WhitespaceNorm {
    /// Split on whitespace and re-join the pieces with single spaces (the default).
    #[default]
    Collapse,
    /// Leave whitespace exactly as it is.
    Preserve,
    /// Strip leading and trailing whitespace only.
    Trim,
}
46
/// Line-ending convention selectable through [`NormalizationConfig`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum LineEndingNorm {
    /// Rewrite every line break as `\n` (the default).
    #[default]
    LF,
    /// Rewrite every line break as `\r\n`.
    CRLF,
    /// Leave line breaks untouched.
    Preserve,
}
58
59pub fn normalize_content(content: &Content) -> String {
78 match content {
79 Content::Text(text) => normalize_text_content(text),
80 Content::Code(code) => normalize_code_content(code),
81 Content::Table(table) => normalize_table_content(table),
82 Content::Math(math) => normalize_math_content(math),
83 Content::Media(media) => normalize_media_content(media),
84 Content::Json { value, .. } => canonical_json(value),
85 Content::Binary {
86 data, mime_type, ..
87 } => {
88 format!("{}:{}", mime_type, hex::encode(sha256_hash(data)))
89 }
90 Content::Composite { layout, children } => {
91 let children_str: Vec<String> = children.iter().map(|id| id.to_string()).collect();
92 format!("{:?}:[{}]", layout, children_str.join(","))
93 }
94 }
95}
96
97fn normalize_text_content(text: &Text) -> String {
98 normalize_text(
99 &text.text,
100 NormalizationConfig {
101 whitespace: WhitespaceNorm::Collapse,
102 ..Default::default()
103 },
104 )
105}
106
107fn normalize_code_content(code: &Code) -> String {
108 let config = NormalizationConfig {
110 whitespace: WhitespaceNorm::Preserve,
111 line_endings: LineEndingNorm::LF,
112 ..Default::default()
113 };
114 format!(
115 "{}:{}",
116 code.language.to_lowercase(),
117 normalize_text(&code.source, config)
118 )
119}
120
121fn normalize_table_content(table: &Table) -> String {
122 let columns: Vec<String> = table.columns.iter().map(normalize_column).collect();
123
124 let rows: Vec<String> = table.rows.iter().map(normalize_row).collect();
125
126 format!("columns:[{}],rows:[{}]", columns.join(","), rows.join(","))
127}
128
129fn normalize_column(column: &Column) -> String {
130 let name = normalize_text(&column.name, NormalizationConfig::default());
131 match &column.data_type {
132 Some(dt) => format!("{}:{:?}", name, dt),
133 None => name,
134 }
135}
136
137fn normalize_row(row: &Row) -> String {
138 let cells: Vec<String> = row.cells.iter().map(normalize_cell).collect();
139 format!("[{}]", cells.join(","))
140}
141
142fn normalize_cell(cell: &Cell) -> String {
143 match cell {
144 Cell::Null => "null".to_string(),
145 Cell::Text(s) => format!("\"{}\"", normalize_text(s, NormalizationConfig::default())),
146 Cell::Number(n) => {
147 if n.fract() == 0.0 {
149 format!("{:.0}", n)
150 } else {
151 format!("{}", n)
152 }
153 }
154 Cell::Boolean(b) => b.to_string(),
155 Cell::Date(s) => format!("d:{}", s),
156 Cell::DateTime(s) => format!("dt:{}", s),
157 Cell::Json(v) => canonical_json(v),
158 }
159}
160
161fn normalize_math_content(math: &Math) -> String {
162 let normalized_expr = normalize_text(
163 &math.expression,
164 NormalizationConfig {
165 whitespace: WhitespaceNorm::Collapse,
166 ..Default::default()
167 },
168 );
169 format!("{:?}:{}", math.format, normalized_expr)
170}
171
172fn normalize_media_content(media: &Media) -> String {
173 let source = match &media.source {
174 MediaSource::Url(url) => format!("url:{}", url),
175 MediaSource::Base64(data) => format!("b64:{}", &data[..data.len().min(32)]),
176 MediaSource::Reference(id) => format!("ref:{}", id),
177 MediaSource::External(ext) => {
178 format!("ext:{}:{}:{}", ext.provider, ext.bucket, ext.key)
179 }
180 };
181
182 match &media.content_hash {
183 Some(hash) => format!(
184 "{:?}:{}:hash:{}",
185 media.media_type,
186 source,
187 hex::encode(hash)
188 ),
189 None => format!("{:?}:{}", media.media_type, source),
190 }
191}
192
193pub fn normalize_text(text: &str, config: NormalizationConfig) -> String {
195 let unicode_normalized = match config.unicode_form {
197 UnicodeForm::NFC => text.nfc().collect::<String>(),
198 UnicodeForm::NFD => text.nfd().collect::<String>(),
199 UnicodeForm::NFKC => text.nfkc().collect::<String>(),
200 UnicodeForm::NFKD => text.nfkd().collect::<String>(),
201 };
202
203 let line_normalized = match config.line_endings {
205 LineEndingNorm::LF => unicode_normalized.replace("\r\n", "\n").replace('\r', "\n"),
206 LineEndingNorm::CRLF => unicode_normalized
207 .replace("\r\n", "\n")
208 .replace('\r', "\n")
209 .replace('\n', "\r\n"),
210 LineEndingNorm::Preserve => unicode_normalized,
211 };
212
213 match config.whitespace {
215 WhitespaceNorm::Collapse => line_normalized
216 .split_whitespace()
217 .collect::<Vec<_>>()
218 .join(" "),
219 WhitespaceNorm::Trim => line_normalized.trim().to_string(),
220 WhitespaceNorm::Preserve => line_normalized,
221 }
222}
223
224pub fn canonical_json(value: &serde_json::Value) -> String {
230 match value {
231 serde_json::Value::Object(map) => {
232 let mut pairs: Vec<_> = map.iter().collect();
233 pairs.sort_by(|a, b| a.0.cmp(b.0));
234 let inner: Vec<String> = pairs
235 .iter()
236 .map(|(k, v)| format!("\"{}\":{}", escape_json_string(k), canonical_json(v)))
237 .collect();
238 format!("{{{}}}", inner.join(","))
239 }
240 serde_json::Value::Array(arr) => {
241 let inner: Vec<String> = arr.iter().map(canonical_json).collect();
242 format!("[{}]", inner.join(","))
243 }
244 serde_json::Value::String(s) => format!("\"{}\"", escape_json_string(s)),
245 serde_json::Value::Number(n) => {
246 if let Some(i) = n.as_i64() {
248 i.to_string()
249 } else if let Some(f) = n.as_f64() {
250 if f.fract() == 0.0 && f.abs() < 1e15 {
251 format!("{:.0}", f)
252 } else {
253 format!("{}", f)
254 }
255 } else {
256 n.to_string()
257 }
258 }
259 serde_json::Value::Bool(b) => b.to_string(),
260 serde_json::Value::Null => "null".to_string(),
261 }
262}
263
/// Escapes `s` for embedding between double quotes in JSON output.
///
/// Quote, backslash, newline, carriage return and tab get their two-character
/// escapes. Any other control character is written as a `\u` escape with at
/// least four lowercase hex digits. Everything else is copied through unchanged.
fn escape_json_string(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        let short_form = match ch {
            '"' => Some("\\\""),
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            _ => None,
        };

        if let Some(sequence) = short_form {
            escaped.push_str(sequence);
        } else if ch.is_control() {
            escaped.push_str(&format!("\\u{:04x}", ch as u32));
        } else {
            escaped.push(ch);
        }
    }
    escaped
}
282
283fn sha256_hash(data: &[u8]) -> [u8; 32] {
285 use sha2::{Digest, Sha256};
286 let mut hasher = Sha256::new();
287 hasher.update(data);
288 let result = hasher.finalize();
289 let mut hash = [0u8; 32];
290 hash.copy_from_slice(&result);
291 hash
292}
293
/// Returns `true` when `c` lies in one of the CJK code-point ranges listed below.
///
/// Only these Basic Multilingual Plane ranges are checked.
pub fn is_cjk_character(c: char) -> bool {
    // Inclusive (first, last) code-point pairs.
    const CJK_RANGES: [(char, char); 6] = [
        ('\u{3040}', '\u{309F}'), // Hiragana
        ('\u{30A0}', '\u{30FF}'), // Katakana
        ('\u{3400}', '\u{4DBF}'), // CJK Unified Ideographs Extension A
        ('\u{4E00}', '\u{9FFF}'), // CJK Unified Ideographs
        ('\u{AC00}', '\u{D7AF}'), // Hangul Syllables
        ('\u{F900}', '\u{FAFF}'), // CJK Compatibility Ideographs
    ];

    CJK_RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&c))
}
305
#[cfg(test)]
mod tests {
    use super::*;
    use crate::content::TextFormat;

    /// Default configuration with whitespace left untouched.
    fn preserving_whitespace() -> NormalizationConfig {
        NormalizationConfig {
            whitespace: WhitespaceNorm::Preserve,
            ..Default::default()
        }
    }

    /// The default configuration collapses whitespace.
    #[test]
    fn test_normalize_text_whitespace() {
        let normalized = normalize_text(" hello world ", NormalizationConfig::default());
        assert_eq!(normalized, "hello world");
    }

    /// `Preserve` leaves surrounding whitespace in place.
    #[test]
    fn test_normalize_text_preserve() {
        let normalized = normalize_text(" hello world ", preserving_whitespace());
        assert_eq!(normalized, " hello world ");
    }

    /// Both `\r\n` and a lone `\r` become `\n` under LF normalization.
    #[test]
    fn test_normalize_line_endings() {
        let config = NormalizationConfig {
            line_endings: LineEndingNorm::LF,
            ..preserving_whitespace()
        };
        let normalized = normalize_text("line1\r\nline2\rline3", config);
        assert_eq!(normalized, "line1\nline2\nline3");
    }

    /// Top-level object keys come out sorted.
    #[test]
    fn test_canonical_json_sorted_keys() {
        let input = serde_json::json!({"b": 1, "a": 2});
        assert_eq!(canonical_json(&input), "{\"a\":2,\"b\":1}");
    }

    /// Nested object keys come out sorted as well.
    #[test]
    fn test_canonical_json_nested() {
        let input = serde_json::json!({"outer": {"b": 1, "a": 2}});
        assert_eq!(canonical_json(&input), "{\"outer\":{\"a\":2,\"b\":1}}");
    }

    /// Text content has its whitespace collapsed.
    #[test]
    fn test_normalize_content_text() {
        let text = Text {
            text: " Hello World ".to_string(),
            format: TextFormat::Plain,
        };
        assert_eq!(normalize_content(&Content::Text(text)), "Hello World");
    }

    /// Code content is prefixed with the lowercased language name.
    #[test]
    fn test_normalize_content_code() {
        let code = Code {
            language: "Rust".to_string(),
            source: "fn main() {\n println!(\"hello\");\n}".to_string(),
            highlights: vec![],
        };
        let normalized = normalize_content(&Content::Code(code));
        assert!(normalized.starts_with("rust:"));
    }

    /// Han, kana and Hangul characters are CJK; ASCII is not.
    #[test]
    fn test_is_cjk() {
        for cjk in ['中', 'あ', '한'] {
            assert!(is_cjk_character(cjk));
        }
        for other in ['a', '1'] {
            assert!(!is_cjk_character(other));
        }
    }
}