datafold/schema/
hasher.rs

1use crate::schema::SchemaError;
2use log::info;
3use serde_json::{Map, Value};
4use sha2::{Digest, Sha256};
5
/// Schema hasher for adding integrity verification to schemas.
///
/// This is a unit struct used purely as a namespace: every operation is an
/// associated function (none take `self`), so no instance is ever needed.
/// Hashes are SHA-256 digests, hex-encoded, computed over a canonical
/// (key-sorted) JSON rendering of the schema.
pub struct SchemaHasher;
8
9impl SchemaHasher {
10    /// Calculate hash for a schema JSON, excluding hash, payment_config, and permission_policy fields
11    pub fn calculate_hash(schema_json: &Value) -> Result<String, SchemaError> {
12        // Clone the schema and recursively remove fields that should not affect the hash
13        let mut schema_for_hash = schema_json.clone();
14        Self::remove_excluded_fields(&mut schema_for_hash);
15
16        // Serialize to canonical JSON (sorted keys)
17        let canonical_json = Self::to_canonical_json(&schema_for_hash)?;
18
19        // Calculate SHA256 hash
20        let mut hasher = Sha256::new();
21        hasher.update(canonical_json.as_bytes());
22        let hash_bytes = hasher.finalize();
23
24        // Convert to hex string
25        Ok(format!("{:x}", hash_bytes))
26    }
27
28    /// Recursively remove excluded fields from JSON value
29    fn remove_excluded_fields(value: &mut Value) {
30        match value {
31            Value::Object(obj) => {
32                // Remove excluded fields at this level
33                obj.remove("hash");
34                obj.remove("payment_config");
35                obj.remove("permission_policy");
36                obj.remove("name"); // Exclude name to detect content duplicates regardless of schema name
37
38                // Recursively process all nested objects
39                for (_, nested_value) in obj.iter_mut() {
40                    Self::remove_excluded_fields(nested_value);
41                }
42            }
43            Value::Array(arr) => {
44                // Recursively process array elements
45                for item in arr.iter_mut() {
46                    Self::remove_excluded_fields(item);
47                }
48            }
49            _ => {
50                // Primitive values don't need processing
51            }
52        }
53    }
54
55    /// Add or update hash field in a schema JSON
56    pub fn add_hash_to_schema(schema_json: &mut Value) -> Result<String, SchemaError> {
57        let hash = Self::calculate_hash(schema_json)?;
58
59        if let Value::Object(ref mut obj) = schema_json {
60            obj.insert("hash".to_string(), Value::String(hash.clone()));
61        } else {
62            return Err(SchemaError::InvalidData(
63                "Schema must be a JSON object".to_string(),
64            ));
65        }
66
67        Ok(hash)
68    }
69
70    /// Verify that a schema's hash matches its content
71    pub fn verify_schema_hash(schema_json: &Value) -> Result<bool, SchemaError> {
72        if let Value::Object(obj) = schema_json {
73            if let Some(Value::String(stored_hash)) = obj.get("hash") {
74                let calculated_hash = Self::calculate_hash(schema_json)?;
75                Ok(stored_hash == &calculated_hash)
76            } else {
77                // No hash field present
78                Ok(false)
79            }
80        } else {
81            Err(SchemaError::InvalidData(
82                "Schema must be a JSON object".to_string(),
83            ))
84        }
85    }
86
87    /// Process a schema file: read, add/update hash, and write back
88    pub fn hash_schema_file(file_path: &std::path::Path) -> Result<String, SchemaError> {
89        info!("Processing schema file: {}", file_path.display());
90
91        // Read the file
92        let content = std::fs::read_to_string(file_path)
93            .map_err(|e| SchemaError::InvalidData(format!("Failed to read file: {}", e)))?;
94
95        // Parse JSON
96        let mut schema_json: Value = serde_json::from_str(&content)
97            .map_err(|e| SchemaError::InvalidData(format!("Invalid JSON: {}", e)))?;
98
99        // Add/update hash
100        let hash = Self::add_hash_to_schema(&mut schema_json)?;
101
102        // Write back to file with pretty formatting
103        let formatted_json = serde_json::to_string_pretty(&schema_json)
104            .map_err(|e| SchemaError::InvalidData(format!("Failed to format JSON: {}", e)))?;
105
106        std::fs::write(file_path, formatted_json)
107            .map_err(|e| SchemaError::InvalidData(format!("Failed to write file: {}", e)))?;
108
109        info!(
110            "Updated schema file {} with hash: {}",
111            file_path.display(),
112            hash
113        );
114        Ok(hash)
115    }
116
117    /// Process all schema files in the specified directory
118    pub fn hash_schemas_directory<P: AsRef<std::path::Path>>(
119        directory_path: P,
120    ) -> Result<Vec<(String, String)>, SchemaError> {
121        let available_schemas_dir = directory_path.as_ref();
122        let mut results = Vec::new();
123
124        info!(
125            "Processing schemas in directory: {}",
126            available_schemas_dir.display()
127        );
128
129        if !available_schemas_dir.exists() {
130            return Err(SchemaError::InvalidData(format!(
131                "Directory does not exist: {}",
132                available_schemas_dir.display()
133            )));
134        }
135
136        let entries = std::fs::read_dir(available_schemas_dir)
137            .map_err(|e| SchemaError::InvalidData(format!("Failed to read directory: {}", e)))?;
138
139        for entry in entries {
140            let entry = entry
141                .map_err(|e| SchemaError::InvalidData(format!("Failed to read entry: {}", e)))?;
142            let path = entry.path();
143
144            // Only process .json files, skip README.md and other files
145            if path.extension().map(|e| e == "json").unwrap_or(false) {
146                match Self::hash_schema_file(&path) {
147                    Ok(hash) => {
148                        let filename = path
149                            .file_name()
150                            .and_then(|n| n.to_str())
151                            .unwrap_or("unknown")
152                            .to_string();
153                        info!("✅ Processed: {} -> {}", filename, hash);
154                        results.push((filename.clone(), hash.clone()));
155                    }
156                    Err(e) => {
157                        let filename = path
158                            .file_name()
159                            .and_then(|n| n.to_str())
160                            .unwrap_or("unknown");
161                        info!("❌ Failed to process {}: {}", filename, e);
162                        return Err(e);
163                    }
164                }
165            }
166        }
167
168        info!("Successfully processed {} schema files", results.len());
169        Ok(results)
170    }
171
172    /// Process all schema files in the available_schemas directory (convenience method)
173    pub fn hash_available_schemas_directory() -> Result<Vec<(String, String)>, SchemaError> {
174        SchemaHasher::hash_schemas_directory("available_schemas")
175    }
176
177    /// Verify all schemas in the specified directory
178    pub fn verify_schemas_directory<P: AsRef<std::path::Path>>(
179        directory_path: P,
180    ) -> Result<Vec<(String, bool)>, SchemaError> {
181        let available_schemas_dir = directory_path.as_ref();
182        let mut results = Vec::new();
183
184        info!(
185            "Verifying schemas in directory: {}",
186            available_schemas_dir.display()
187        );
188
189        let entries = std::fs::read_dir(available_schemas_dir)
190            .map_err(|e| SchemaError::InvalidData(format!("Failed to read directory: {}", e)))?;
191
192        for entry in entries {
193            let entry = entry
194                .map_err(|e| SchemaError::InvalidData(format!("Failed to read entry: {}", e)))?;
195            let path = entry.path();
196
197            if path.extension().map(|e| e == "json").unwrap_or(false) {
198                let content = std::fs::read_to_string(&path)
199                    .map_err(|e| SchemaError::InvalidData(format!("Failed to read file: {}", e)))?;
200
201                let schema_json: Value = serde_json::from_str(&content)
202                    .map_err(|e| SchemaError::InvalidData(format!("Invalid JSON: {}", e)))?;
203
204                let is_valid = Self::verify_schema_hash(&schema_json)?;
205                let filename = path
206                    .file_name()
207                    .and_then(|n| n.to_str())
208                    .unwrap_or("unknown")
209                    .to_string();
210
211                results.push((filename.clone(), is_valid));
212
213                if is_valid {
214                    info!("✅ Valid hash: {}", filename);
215                } else {
216                    info!("❌ Invalid/missing hash: {}", filename);
217                }
218            }
219        }
220
221        Ok(results)
222    }
223
224    /// Verify all schemas in the available_schemas directory (convenience method)
225    pub fn verify_available_schemas_directory() -> Result<Vec<(String, bool)>, SchemaError> {
226        SchemaHasher::verify_schemas_directory("available_schemas")
227    }
228
229    /// Convert JSON to canonical form (sorted keys) for consistent hashing
230    fn to_canonical_json(value: &Value) -> Result<String, SchemaError> {
231        match value {
232            Value::Object(obj) => {
233                let mut sorted_obj = Map::new();
234                let mut keys: Vec<_> = obj.keys().collect();
235                keys.sort();
236
237                for key in keys {
238                    if let Some(val) = obj.get(key) {
239                        let canonical_val = Self::to_canonical_json(val)?;
240                        let parsed_val: Value =
241                            serde_json::from_str(&canonical_val).map_err(|e| {
242                                SchemaError::InvalidData(format!(
243                                    "Failed to parse canonical JSON: {}",
244                                    e
245                                ))
246                            })?;
247                        sorted_obj.insert(key.clone(), parsed_val);
248                    }
249                }
250
251                serde_json::to_string(&Value::Object(sorted_obj)).map_err(|e| {
252                    SchemaError::InvalidData(format!("Failed to serialize canonical JSON: {}", e))
253                })
254            }
255            Value::Array(arr) => {
256                let mut canonical_items = Vec::new();
257                for item in arr {
258                    let canonical_item = Self::to_canonical_json(item)?;
259                    let parsed_item: Value =
260                        serde_json::from_str(&canonical_item).map_err(|e| {
261                            SchemaError::InvalidData(format!(
262                                "Failed to parse canonical JSON: {}",
263                                e
264                            ))
265                        })?;
266                    canonical_items.push(parsed_item);
267                }
268
269                serde_json::to_string(&Value::Array(canonical_items)).map_err(|e| {
270                    SchemaError::InvalidData(format!("Failed to serialize canonical JSON: {}", e))
271                })
272            }
273            _ => serde_json::to_string(value)
274                .map_err(|e| SchemaError::InvalidData(format!("Failed to serialize JSON: {}", e))),
275        }
276    }
277}
278
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Minimal schema with a single field, shared by several tests.
    fn single_field_schema() -> Value {
        json!({
            "name": "TestSchema",
            "fields": {
                "field1": {"field_type": "Single"}
            }
        })
    }

    /// A hash is produced and has the length of a hex-encoded SHA-256 digest.
    #[test]
    fn test_calculate_hash() {
        let digest = SchemaHasher::calculate_hash(&single_field_schema()).unwrap();

        assert!(!digest.is_empty());
        assert_eq!(digest.len(), 64); // SHA256 hex string length
    }

    /// Key order in the input must not influence the hash.
    #[test]
    fn test_hash_consistency() {
        let name_first = json!({
            "name": "TestSchema",
            "fields": {
                "field1": {"field_type": "Single"},
                "field2": {"field_type": "Collection"}
            }
        });
        let fields_first = json!({
            "fields": {
                "field2": {"field_type": "Collection"},
                "field1": {"field_type": "Single"}
            },
            "name": "TestSchema"
        });

        assert_eq!(
            SchemaHasher::calculate_hash(&name_first).unwrap(),
            SchemaHasher::calculate_hash(&fields_first).unwrap()
        );
    }

    /// The stored `hash` field is ignored when the hash is recalculated.
    #[test]
    fn test_hash_excludes_existing_hash() {
        let mut schema = single_field_schema();
        let before = SchemaHasher::calculate_hash(&schema).unwrap();

        // Add hash to schema
        SchemaHasher::add_hash_to_schema(&mut schema).unwrap();

        // Hash should be the same even with hash field present
        let after = SchemaHasher::calculate_hash(&schema).unwrap();
        assert_eq!(before, after);
    }

    /// A schema that was just hashed verifies successfully.
    #[test]
    fn test_add_and_verify_hash() {
        let mut schema = single_field_schema();

        let stored = SchemaHasher::add_hash_to_schema(&mut schema).unwrap();
        assert!(!stored.is_empty());

        assert!(SchemaHasher::verify_schema_hash(&schema).unwrap());
    }
}
357}