Skip to main content

crw_diff/
snapshot.rs

1//! Markdown normalization + content hashing. Single source of truth for the
2//! `content_hash` so cosmetic churn (trailing whitespace, blank-line runs,
3//! CRLF) never flips a page from `same` to `changed`.
4
5use serde_json::Value;
6use sha2::{Digest, Sha256};
7
/// Normalize markdown before hashing/diffing.
///
/// The transformation is purely line-based:
/// - CRLF and lone CR line endings become LF
/// - trailing whitespace is stripped from every line (a line containing only
///   whitespace therefore counts as blank)
/// - any run of consecutive blank lines (two or more) collapses to a single
///   blank line
/// - leading and trailing blank lines are dropped, so the result never starts
///   or ends with a newline
///
/// Leading indentation and all other content are preserved verbatim.
///
/// Diffing operates on the normalized form so the unified diff and AST never
/// report whitespace-only noise.
pub fn normalize_markdown(input: &str) -> String {
    // Replace CRLF first so a CRLF pair is not turned into two line breaks
    // by the lone-CR replacement that follows.
    let unified = input.replace("\r\n", "\n").replace('\r', "\n");

    // The normalized text is never longer than the unified text.
    let mut out = String::with_capacity(unified.len());
    // True when at least one blank line was seen since the last emitted
    // content line. It is only flushed when more content follows, which is
    // what drops trailing blank lines.
    let mut pending_blank = false;

    for line in unified.split('\n').map(str::trim_end) {
        if line.is_empty() {
            // Blank lines seen before any content are leading blanks: ignore.
            pending_blank = !out.is_empty();
            continue;
        }
        if !out.is_empty() {
            out.push('\n');
            if pending_blank {
                // A whole run of blank lines is represented by exactly one.
                out.push('\n');
            }
        }
        pending_blank = false;
        out.push_str(line);
    }
    out
}
42
43/// Hex SHA-256 of a string.
44pub fn hash_str(s: &str) -> String {
45    let mut hasher = Sha256::new();
46    hasher.update(s.as_bytes());
47    hex::encode(hasher.finalize())
48}
49
50/// Hex SHA-256 of the normalized markdown.
51pub fn hash_markdown(markdown: &str) -> String {
52    hash_str(&normalize_markdown(markdown))
53}
54
55/// Hex SHA-256 of a canonicalized JSON value (object keys sorted recursively),
56/// so logically-equal extractions with different key ordering hash equal.
57pub fn hash_json(value: &Value) -> String {
58    hash_str(&canonical_json_string(value))
59}
60
61/// Serialize a JSON value with object keys sorted recursively. Deterministic
62/// regardless of input key order.
63pub fn canonical_json_string(value: &Value) -> String {
64    let canonical = canonicalize(value);
65    serde_json::to_string(&canonical).unwrap_or_default()
66}
67
68fn canonicalize(value: &Value) -> Value {
69    match value {
70        Value::Object(map) => {
71            let mut keys: Vec<&String> = map.keys().collect();
72            keys.sort();
73            let mut out = serde_json::Map::with_capacity(map.len());
74            for k in keys {
75                out.insert(k.clone(), canonicalize(&map[k]));
76            }
77            Value::Object(out)
78        }
79        Value::Array(items) => Value::Array(items.iter().map(canonicalize).collect()),
80        other => other.clone(),
81    }
82}
83
#[cfg(test)]
mod tests {
    use super::*;

    /// Trailing spaces are stripped and a multi-line blank run becomes one
    /// blank line; trailing blank lines are dropped.
    #[test]
    fn normalize_collapses_blank_runs_and_trailing_ws() {
        let normalized = normalize_markdown("# Title  \n\n\n\nbody   \n\n");
        assert_eq!(normalized, "# Title\n\nbody");
    }

    /// CRLF line endings are converted to LF.
    #[test]
    fn normalize_handles_crlf() {
        let normalized = normalize_markdown("a\r\nb\r\n");
        assert_eq!(normalized, "a\nb");
    }

    /// Two documents differing only in whitespace hash to the same value.
    #[test]
    fn whitespace_only_change_hashes_equal() {
        let clean_hash = hash_markdown("# Hello\n\nworld");
        let noisy_hash = hash_markdown("# Hello   \n\n\n\nworld  \n");
        assert_eq!(clean_hash, noisy_hash);
    }

    /// Object key order does not influence the JSON hash.
    #[test]
    fn json_key_order_hashes_equal() {
        let first: Value = serde_json::json!({"a": 1, "b": [1, 2]});
        let second: Value = serde_json::json!({"b": [1, 2], "a": 1});
        assert_eq!(hash_json(&first), hash_json(&second));
    }
}