cognee_ingestion/
content_hasher.rs1use md5::Md5;
2use sha2::Sha256;
3use tokio::io::{AsyncRead, AsyncReadExt};
4
/// Selects which digest algorithm is used when hashing content.
///
/// [`Default::default`] yields [`HashAlgorithm::Md5`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum HashAlgorithm {
    /// MD5 digest; this is the default variant.
    #[default]
    Md5,
    /// SHA-256 digest.
    Sha256,
}
16
17pub struct ContentHasher;
18
19impl ContentHasher {
20 pub fn hash_content(content: &[u8], algorithm: HashAlgorithm) -> String {
23 match algorithm {
24 HashAlgorithm::Md5 => {
25 use md5::Digest;
26 let result = Md5::digest(content);
27 format!("{result:x}")
28 }
29 HashAlgorithm::Sha256 => {
30 use sha2::Digest;
31 let result = Sha256::digest(content);
32 format!("{result:x}")
33 }
34 }
35 }
36
37 pub async fn hash_content_stream<R: AsyncRead + Unpin>(
39 reader: &mut R,
40 algorithm: HashAlgorithm,
41 ) -> Result<String, std::io::Error> {
42 let mut buffer = [0u8; 8192];
43
44 match algorithm {
45 HashAlgorithm::Md5 => {
46 use md5::Digest;
47 let mut hasher = Md5::new();
48 loop {
49 let n = reader.read(&mut buffer).await?;
50 if n == 0 {
51 break;
52 }
53 hasher.update(&buffer[..n]);
54 }
55 Ok(format!("{:x}", hasher.finalize()))
56 }
57 HashAlgorithm::Sha256 => {
58 use sha2::Digest;
59 let mut hasher = Sha256::new();
60 loop {
61 let n = reader.read(&mut buffer).await?;
62 if n == 0 {
63 break;
64 }
65 hasher.update(&buffer[..n]);
66 }
67 Ok(format!("{:x}", hasher.finalize()))
68 }
69 }
70 }
71}
72
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    reason = "test code — panics are acceptable failures"
)]
mod tests {
    use super::*;

    // Reference digests (lowercase hex) for well-known inputs.
    const HELLO_WORLD_MD5: &str = "5eb63bbbe01eeed093cb22bb8f5acdc3";
    const EMPTY_MD5: &str = "d41d8cd98f00b204e9800998ecf8427e";
    const HELLO_WORLD_SHA256: &str =
        "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9";

    /// Runs `bytes` through the streaming API via an in-memory cursor and
    /// returns the resulting hex digest.
    async fn stream_digest(bytes: &[u8], algorithm: HashAlgorithm) -> String {
        let mut cursor = std::io::Cursor::new(bytes);
        ContentHasher::hash_content_stream(&mut cursor, algorithm)
            .await
            .expect("hashing an in-memory cursor should not fail")
    }

    #[test]
    fn test_md5_known_values() {
        assert_eq!(
            ContentHasher::hash_content(b"hello world", HashAlgorithm::Md5),
            HELLO_WORLD_MD5
        );
        assert_eq!(
            ContentHasher::hash_content(b"", HashAlgorithm::Md5),
            EMPTY_MD5
        );
    }

    #[test]
    fn test_sha256_known_value() {
        assert_eq!(
            ContentHasher::hash_content(b"hello world", HashAlgorithm::Sha256),
            HELLO_WORLD_SHA256
        );
    }

    #[test]
    fn test_default_algorithm_is_md5() {
        assert_eq!(HashAlgorithm::default(), HashAlgorithm::Md5);
    }

    #[test]
    fn test_md5_deterministic() {
        let first = ContentHasher::hash_content(b"test content", HashAlgorithm::Md5);
        let second = ContentHasher::hash_content(b"test content", HashAlgorithm::Md5);
        assert_eq!(first, second);
    }

    #[test]
    fn test_different_content_different_hash() {
        let hash_a = ContentHasher::hash_content(b"Content A", HashAlgorithm::Md5);
        let hash_b = ContentHasher::hash_content(b"Content B", HashAlgorithm::Md5);
        assert_ne!(hash_a, hash_b);
    }

    #[test]
    fn test_same_content_same_hash_across_owners() {
        // The hash takes only the content bytes as input, so identical
        // content always produces an identical digest.
        let content = b"Same content";
        let first = ContentHasher::hash_content(content, HashAlgorithm::Md5);
        let second = ContentHasher::hash_content(content, HashAlgorithm::Md5);
        assert_eq!(first, second);
    }

    #[tokio::test]
    async fn test_stream_hash_matches_in_memory() {
        let content = b"Stream test content";
        let expected = ContentHasher::hash_content(content, HashAlgorithm::Md5);
        assert_eq!(stream_digest(content, HashAlgorithm::Md5).await, expected);
    }

    #[tokio::test]
    async fn test_stream_sha256_matches_in_memory() {
        let content = b"SHA256 stream test";
        let expected = ContentHasher::hash_content(content, HashAlgorithm::Sha256);
        assert_eq!(stream_digest(content, HashAlgorithm::Sha256).await, expected);
    }

    #[tokio::test]
    async fn test_stream_empty_input() {
        // An immediately exhausted reader must hash like the empty slice.
        assert_eq!(stream_digest(b"", HashAlgorithm::Md5).await, EMPTY_MD5);
        assert_eq!(
            stream_digest(b"", HashAlgorithm::Sha256).await,
            ContentHasher::hash_content(b"", HashAlgorithm::Sha256)
        );
    }

    #[tokio::test]
    async fn test_stream_multi_chunk_matches_in_memory() {
        // 20,000 bytes exceeds the 8192-byte read buffer used by
        // `hash_content_stream`, so the read loop runs more than once.
        let content = b"0123456789".repeat(2_000);
        for algorithm in [HashAlgorithm::Md5, HashAlgorithm::Sha256] {
            let expected = ContentHasher::hash_content(&content, algorithm);
            assert_eq!(stream_digest(&content, algorithm).await, expected);
        }
    }
}