datafold/schema/
hasher.rs1use crate::schema::SchemaError;
2use log::info;
3use serde_json::{Map, Value};
4use sha2::{Digest, Sha256};
5
/// Stateless namespace for schema hashing utilities.
///
/// The type carries no data; every operation is exposed as an associated
/// function (for example `SchemaHasher::calculate_hash`). The derives make the
/// marker type printable, copyable and default-constructible like any other
/// zero-sized value.
#[derive(Debug, Clone, Copy, Default)]
pub struct SchemaHasher;
8
9impl SchemaHasher {
10 pub fn calculate_hash(schema_json: &Value) -> Result<String, SchemaError> {
12 let mut schema_for_hash = schema_json.clone();
14 Self::remove_excluded_fields(&mut schema_for_hash);
15
16 let canonical_json = Self::to_canonical_json(&schema_for_hash)?;
18
19 let mut hasher = Sha256::new();
21 hasher.update(canonical_json.as_bytes());
22 let hash_bytes = hasher.finalize();
23
24 Ok(format!("{:x}", hash_bytes))
26 }
27
28 fn remove_excluded_fields(value: &mut Value) {
30 match value {
31 Value::Object(obj) => {
32 obj.remove("hash");
34 obj.remove("payment_config");
35 obj.remove("permission_policy");
36 obj.remove("name"); for (_, nested_value) in obj.iter_mut() {
40 Self::remove_excluded_fields(nested_value);
41 }
42 }
43 Value::Array(arr) => {
44 for item in arr.iter_mut() {
46 Self::remove_excluded_fields(item);
47 }
48 }
49 _ => {
50 }
52 }
53 }
54
55 pub fn add_hash_to_schema(schema_json: &mut Value) -> Result<String, SchemaError> {
57 let hash = Self::calculate_hash(schema_json)?;
58
59 if let Value::Object(ref mut obj) = schema_json {
60 obj.insert("hash".to_string(), Value::String(hash.clone()));
61 } else {
62 return Err(SchemaError::InvalidData(
63 "Schema must be a JSON object".to_string(),
64 ));
65 }
66
67 Ok(hash)
68 }
69
70 pub fn verify_schema_hash(schema_json: &Value) -> Result<bool, SchemaError> {
72 if let Value::Object(obj) = schema_json {
73 if let Some(Value::String(stored_hash)) = obj.get("hash") {
74 let calculated_hash = Self::calculate_hash(schema_json)?;
75 Ok(stored_hash == &calculated_hash)
76 } else {
77 Ok(false)
79 }
80 } else {
81 Err(SchemaError::InvalidData(
82 "Schema must be a JSON object".to_string(),
83 ))
84 }
85 }
86
87 pub fn hash_schema_file(file_path: &std::path::Path) -> Result<String, SchemaError> {
89 info!("Processing schema file: {}", file_path.display());
90
91 let content = std::fs::read_to_string(file_path)
93 .map_err(|e| SchemaError::InvalidData(format!("Failed to read file: {}", e)))?;
94
95 let mut schema_json: Value = serde_json::from_str(&content)
97 .map_err(|e| SchemaError::InvalidData(format!("Invalid JSON: {}", e)))?;
98
99 let hash = Self::add_hash_to_schema(&mut schema_json)?;
101
102 let formatted_json = serde_json::to_string_pretty(&schema_json)
104 .map_err(|e| SchemaError::InvalidData(format!("Failed to format JSON: {}", e)))?;
105
106 std::fs::write(file_path, formatted_json)
107 .map_err(|e| SchemaError::InvalidData(format!("Failed to write file: {}", e)))?;
108
109 info!(
110 "Updated schema file {} with hash: {}",
111 file_path.display(),
112 hash
113 );
114 Ok(hash)
115 }
116
117 pub fn hash_schemas_directory<P: AsRef<std::path::Path>>(
119 directory_path: P,
120 ) -> Result<Vec<(String, String)>, SchemaError> {
121 let available_schemas_dir = directory_path.as_ref();
122 let mut results = Vec::new();
123
124 info!(
125 "Processing schemas in directory: {}",
126 available_schemas_dir.display()
127 );
128
129 if !available_schemas_dir.exists() {
130 return Err(SchemaError::InvalidData(format!(
131 "Directory does not exist: {}",
132 available_schemas_dir.display()
133 )));
134 }
135
136 let entries = std::fs::read_dir(available_schemas_dir)
137 .map_err(|e| SchemaError::InvalidData(format!("Failed to read directory: {}", e)))?;
138
139 for entry in entries {
140 let entry = entry
141 .map_err(|e| SchemaError::InvalidData(format!("Failed to read entry: {}", e)))?;
142 let path = entry.path();
143
144 if path.extension().map(|e| e == "json").unwrap_or(false) {
146 match Self::hash_schema_file(&path) {
147 Ok(hash) => {
148 let filename = path
149 .file_name()
150 .and_then(|n| n.to_str())
151 .unwrap_or("unknown")
152 .to_string();
153 info!("✅ Processed: {} -> {}", filename, hash);
154 results.push((filename.clone(), hash.clone()));
155 }
156 Err(e) => {
157 let filename = path
158 .file_name()
159 .and_then(|n| n.to_str())
160 .unwrap_or("unknown");
161 info!("❌ Failed to process {}: {}", filename, e);
162 return Err(e);
163 }
164 }
165 }
166 }
167
168 info!("Successfully processed {} schema files", results.len());
169 Ok(results)
170 }
171
172 pub fn hash_available_schemas_directory() -> Result<Vec<(String, String)>, SchemaError> {
174 SchemaHasher::hash_schemas_directory("available_schemas")
175 }
176
177 pub fn verify_schemas_directory<P: AsRef<std::path::Path>>(
179 directory_path: P,
180 ) -> Result<Vec<(String, bool)>, SchemaError> {
181 let available_schemas_dir = directory_path.as_ref();
182 let mut results = Vec::new();
183
184 info!(
185 "Verifying schemas in directory: {}",
186 available_schemas_dir.display()
187 );
188
189 let entries = std::fs::read_dir(available_schemas_dir)
190 .map_err(|e| SchemaError::InvalidData(format!("Failed to read directory: {}", e)))?;
191
192 for entry in entries {
193 let entry = entry
194 .map_err(|e| SchemaError::InvalidData(format!("Failed to read entry: {}", e)))?;
195 let path = entry.path();
196
197 if path.extension().map(|e| e == "json").unwrap_or(false) {
198 let content = std::fs::read_to_string(&path)
199 .map_err(|e| SchemaError::InvalidData(format!("Failed to read file: {}", e)))?;
200
201 let schema_json: Value = serde_json::from_str(&content)
202 .map_err(|e| SchemaError::InvalidData(format!("Invalid JSON: {}", e)))?;
203
204 let is_valid = Self::verify_schema_hash(&schema_json)?;
205 let filename = path
206 .file_name()
207 .and_then(|n| n.to_str())
208 .unwrap_or("unknown")
209 .to_string();
210
211 results.push((filename.clone(), is_valid));
212
213 if is_valid {
214 info!("✅ Valid hash: {}", filename);
215 } else {
216 info!("❌ Invalid/missing hash: {}", filename);
217 }
218 }
219 }
220
221 Ok(results)
222 }
223
224 pub fn verify_available_schemas_directory() -> Result<Vec<(String, bool)>, SchemaError> {
226 SchemaHasher::verify_schemas_directory("available_schemas")
227 }
228
229 fn to_canonical_json(value: &Value) -> Result<String, SchemaError> {
231 match value {
232 Value::Object(obj) => {
233 let mut sorted_obj = Map::new();
234 let mut keys: Vec<_> = obj.keys().collect();
235 keys.sort();
236
237 for key in keys {
238 if let Some(val) = obj.get(key) {
239 let canonical_val = Self::to_canonical_json(val)?;
240 let parsed_val: Value =
241 serde_json::from_str(&canonical_val).map_err(|e| {
242 SchemaError::InvalidData(format!(
243 "Failed to parse canonical JSON: {}",
244 e
245 ))
246 })?;
247 sorted_obj.insert(key.clone(), parsed_val);
248 }
249 }
250
251 serde_json::to_string(&Value::Object(sorted_obj)).map_err(|e| {
252 SchemaError::InvalidData(format!("Failed to serialize canonical JSON: {}", e))
253 })
254 }
255 Value::Array(arr) => {
256 let mut canonical_items = Vec::new();
257 for item in arr {
258 let canonical_item = Self::to_canonical_json(item)?;
259 let parsed_item: Value =
260 serde_json::from_str(&canonical_item).map_err(|e| {
261 SchemaError::InvalidData(format!(
262 "Failed to parse canonical JSON: {}",
263 e
264 ))
265 })?;
266 canonical_items.push(parsed_item);
267 }
268
269 serde_json::to_string(&Value::Array(canonical_items)).map_err(|e| {
270 SchemaError::InvalidData(format!("Failed to serialize canonical JSON: {}", e))
271 })
272 }
273 _ => serde_json::to_string(value)
274 .map_err(|e| SchemaError::InvalidData(format!("Failed to serialize JSON: {}", e))),
275 }
276 }
277}
278
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Builds the one-field schema shared by most tests.
    fn single_field_schema() -> Value {
        json!({
            "name": "TestSchema",
            "fields": {
                "field1": {"field_type": "Single"}
            }
        })
    }

    /// A hash is produced and has the length of a hex-encoded SHA-256 digest.
    #[test]
    fn test_calculate_hash() {
        let digest = SchemaHasher::calculate_hash(&single_field_schema()).unwrap();

        assert!(!digest.is_empty());
        assert_eq!(digest.len(), 64);
    }

    /// Key order inside objects must not influence the hash.
    #[test]
    fn test_hash_consistency() {
        let declared_order = json!({
            "name": "TestSchema",
            "fields": {
                "field1": {"field_type": "Single"},
                "field2": {"field_type": "Collection"}
            }
        });
        let shuffled_order = json!({
            "fields": {
                "field2": {"field_type": "Collection"},
                "field1": {"field_type": "Single"}
            },
            "name": "TestSchema"
        });

        assert_eq!(
            SchemaHasher::calculate_hash(&declared_order).unwrap(),
            SchemaHasher::calculate_hash(&shuffled_order).unwrap()
        );
    }

    /// Embedding the hash in the schema must not change the computed hash.
    #[test]
    fn test_hash_excludes_existing_hash() {
        let mut schema = single_field_schema();
        let before = SchemaHasher::calculate_hash(&schema).unwrap();

        SchemaHasher::add_hash_to_schema(&mut schema).unwrap();
        let after = SchemaHasher::calculate_hash(&schema).unwrap();

        assert_eq!(before, after);
    }

    /// A schema that just received its hash verifies successfully.
    #[test]
    fn test_add_and_verify_hash() {
        let mut schema = single_field_schema();

        let digest = SchemaHasher::add_hash_to_schema(&mut schema).unwrap();
        assert!(!digest.is_empty());

        assert!(SchemaHasher::verify_schema_hash(&schema).unwrap());
    }
}