Skip to main content

cognee_ingestion/
content_hasher.rs

1use md5::Md5;
2use sha2::Sha256;
3use tokio::io::{AsyncRead, AsyncReadExt};
4
/// Digest algorithm applied when hashing content.
///
/// [`HashAlgorithm::Md5`] is the default variant.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum HashAlgorithm {
    /// MD5 — matches Python cognee's `hashlib.md5(content).hexdigest()`.
    /// Pick this when a database has to be shared across SDKs.
    #[default]
    Md5,
    /// SHA-256 — more secure, but not compatible with hash values stored by Python.
    Sha256,
}
16
17pub struct ContentHasher;
18
19impl ContentHasher {
20    /// Hash raw bytes using the given algorithm.
21    /// Hash is content-only (no owner_id), matching Python's behaviour.
22    pub fn hash_content(content: &[u8], algorithm: HashAlgorithm) -> String {
23        match algorithm {
24            HashAlgorithm::Md5 => {
25                use md5::Digest;
26                let result = Md5::digest(content);
27                format!("{result:x}")
28            }
29            HashAlgorithm::Sha256 => {
30                use sha2::Digest;
31                let result = Sha256::digest(content);
32                format!("{result:x}")
33            }
34        }
35    }
36
37    /// Stream-hash an async reader, returning the hex digest.
38    pub async fn hash_content_stream<R: AsyncRead + Unpin>(
39        reader: &mut R,
40        algorithm: HashAlgorithm,
41    ) -> Result<String, std::io::Error> {
42        let mut buffer = [0u8; 8192];
43
44        match algorithm {
45            HashAlgorithm::Md5 => {
46                use md5::Digest;
47                let mut hasher = Md5::new();
48                loop {
49                    let n = reader.read(&mut buffer).await?;
50                    if n == 0 {
51                        break;
52                    }
53                    hasher.update(&buffer[..n]);
54                }
55                Ok(format!("{:x}", hasher.finalize()))
56            }
57            HashAlgorithm::Sha256 => {
58                use sha2::Digest;
59                let mut hasher = Sha256::new();
60                loop {
61                    let n = reader.read(&mut buffer).await?;
62                    if n == 0 {
63                        break;
64                    }
65                    hasher.update(&buffer[..n]);
66                }
67                Ok(format!("{:x}", hasher.finalize()))
68            }
69        }
70    }
71}
72
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    reason = "test code — panics are acceptable failures"
)]
mod tests {
    use super::*;

    // Reference digests, cross-checked against Python's hashlib.
    const HELLO_WORLD_MD5: &str = "5eb63bbbe01eeed093cb22bb8f5acdc3";
    const EMPTY_MD5: &str = "d41d8cd98f00b204e9800998ecf8427e";
    const HELLO_WORLD_SHA256: &str =
        "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9";

    #[test]
    fn test_md5_known_values() {
        // Same order as before: the non-empty input first, then the empty one.
        let cases: [(&[u8], &str); 2] = [
            (&b"hello world"[..], HELLO_WORLD_MD5),
            (&b""[..], EMPTY_MD5),
        ];
        for (input, expected) in cases {
            assert_eq!(
                ContentHasher::hash_content(input, HashAlgorithm::Md5),
                expected
            );
        }
    }

    #[test]
    fn test_sha256_known_value() {
        // Reference value from Python: hashlib.sha256(b"hello world").hexdigest()
        assert_eq!(
            ContentHasher::hash_content(b"hello world", HashAlgorithm::Sha256),
            HELLO_WORLD_SHA256
        );
    }

    #[test]
    fn test_default_algorithm_is_md5() {
        assert_eq!(HashAlgorithm::default(), HashAlgorithm::Md5);
    }

    #[test]
    fn test_md5_deterministic() {
        let first = ContentHasher::hash_content(b"test content", HashAlgorithm::Md5);
        let second = ContentHasher::hash_content(b"test content", HashAlgorithm::Md5);
        assert_eq!(first, second);
    }

    #[test]
    fn test_different_content_different_hash() {
        let digest_a = ContentHasher::hash_content(b"Content A", HashAlgorithm::Md5);
        let digest_b = ContentHasher::hash_content(b"Content B", HashAlgorithm::Md5);
        assert_ne!(digest_a, digest_b);
    }

    #[test]
    fn test_same_content_same_hash_across_owners() {
        // The hash must NOT depend on owner_id (required for Python compatibility),
        // so hashing the same bytes twice has to agree.
        let payload = b"Same content";
        let first = ContentHasher::hash_content(payload, HashAlgorithm::Md5);
        let second = ContentHasher::hash_content(payload, HashAlgorithm::Md5);
        assert_eq!(first, second);
    }

    #[tokio::test]
    async fn test_stream_hash_matches_in_memory() {
        let payload = b"Stream test content";
        let in_memory = ContentHasher::hash_content(payload, HashAlgorithm::Md5);

        let mut source = std::io::Cursor::new(payload);
        let streamed = ContentHasher::hash_content_stream(&mut source, HashAlgorithm::Md5).await;
        assert!(streamed.is_ok());
        assert_eq!(streamed.unwrap(), in_memory);
    }

    #[tokio::test]
    async fn test_stream_sha256_matches_in_memory() {
        let payload = b"SHA256 stream test";
        let in_memory = ContentHasher::hash_content(payload, HashAlgorithm::Sha256);

        let mut source = std::io::Cursor::new(payload);
        let streamed =
            ContentHasher::hash_content_stream(&mut source, HashAlgorithm::Sha256).await;
        assert!(streamed.is_ok());
        assert_eq!(streamed.unwrap(), in_memory);
    }
}
159}