diffx_core/
lib.rs

1use regex::Regex;
2use serde::Serialize;
3use serde_json::Value;
4use std::collections::HashMap;
5// use ini::Ini;
6use anyhow::{anyhow, Result};
7use csv::ReaderBuilder;
8use quick_xml::de::from_str;
9use std::fs::File;
10use std::io::{BufReader, Read};
11use std::path::Path;
12// Removed ProgressReporter - Unix tools should be pipe-friendly
13
/// A single difference found between two JSON values.
///
/// The first field of every variant is the path of the affected entry as
/// built by the diff functions in this file: object keys are joined with
/// `.`, array elements are written as `[index]` or `[id_key=id]`, and the
/// root itself is the empty string.
#[derive(Debug, PartialEq, Serialize)]
pub enum DiffResult {
    /// Entry present only in the second value: path, new value.
    Added(String, Value),
    /// Entry present only in the first value: path, old value.
    Removed(String, Value),
    /// Same JSON type on both sides but different content: path, old, new.
    Modified(String, Value, Value),
    /// JSON type differs between the two sides: path, old, new.
    TypeChanged(String, Value, Value),
}
21
/// String-only counterpart of [`DiffResult`].
///
/// Every `Value` payload is replaced by the text produced by
/// `Value::to_string()` (see the `From<&DiffResult>` impl), so an instance
/// holds only `String`s and no `Value`.
#[derive(Debug, PartialEq, Serialize)]
pub enum LightweightDiffResult {
    /// Entry present only in the second value.
    Added(String, String),               // path, serialized value
    /// Entry present only in the first value.
    Removed(String, String),             // path, serialized value
    /// Same JSON type on both sides, different content.
    Modified(String, String, String),    // path, old_value, new_value
    /// JSON type differs between the two sides.
    TypeChanged(String, String, String), // path, old_value, new_value
}
30
31impl From<&DiffResult> for LightweightDiffResult {
32    fn from(diff: &DiffResult) -> Self {
33        match diff {
34            DiffResult::Added(path, value) => {
35                LightweightDiffResult::Added(path.clone(), value.to_string())
36            }
37            DiffResult::Removed(path, value) => {
38                LightweightDiffResult::Removed(path.clone(), value.to_string())
39            }
40            DiffResult::Modified(path, old, new) => {
41                LightweightDiffResult::Modified(path.clone(), old.to_string(), new.to_string())
42            }
43            DiffResult::TypeChanged(path, old, new) => {
44                LightweightDiffResult::TypeChanged(path.clone(), old.to_string(), new.to_string())
45            }
46        }
47    }
48}
49
/// Configuration for diff operations - essential options only
///
/// Consumed by `diff_with_config`, `diff_standard_with_config` and
/// `diff_optimized_with_config`.
#[derive(Debug, Clone)]
pub struct DiffConfig {
    /// Object keys matching this pattern are skipped while comparing objects.
    pub ignore_keys_regex: Option<regex::Regex>,
    /// Tolerance for numeric comparison; `None` means exact comparison.
    pub epsilon: Option<f64>,
    /// When set, array elements are paired by the value stored under this
    /// key instead of by position.
    pub array_id_key: Option<String>,
    /// `true` makes `diff_with_config` use the memory-optimized algorithm.
    pub use_memory_optimization: bool, // Explicit choice
    /// NOTE(review): not read by any code visible in this part of the file;
    /// presumably a batch size for the memory-optimized path - confirm.
    pub batch_size: usize,
    /// Collapse whitespace runs (and trim) before comparing strings.
    pub ignore_whitespace: bool,
    /// Compare strings after lowercasing both sides.
    pub ignore_case: bool,
}
61
62impl Default for DiffConfig {
63    fn default() -> Self {
64        Self {
65            ignore_keys_regex: None,
66            epsilon: None,
67            array_id_key: None,
68            use_memory_optimization: false, // Conservative default
69            batch_size: 1000,
70            ignore_whitespace: false,
71            ignore_case: false,
72        }
73    }
74}
75
76// Removed estimate_item_count - no longer needed without progress reporting
77
78/// Standard diff function - predictable, no automatic optimization
79pub fn diff_standard(
80    v1: &Value,
81    v2: &Value,
82    ignore_keys_regex: Option<&Regex>,
83    epsilon: Option<f64>,
84    array_id_key: Option<&str>,
85) -> Vec<DiffResult> {
86    diff_standard_implementation(
87        v1,
88        v2,
89        ignore_keys_regex,
90        epsilon,
91        array_id_key,
92        false,
93        false,
94    )
95}
96
97/// Standard diff function with configuration support
98pub fn diff_standard_with_config(v1: &Value, v2: &Value, config: &DiffConfig) -> Vec<DiffResult> {
99    diff_standard_implementation(
100        v1,
101        v2,
102        config.ignore_keys_regex.as_ref(),
103        config.epsilon,
104        config.array_id_key.as_deref(),
105        config.ignore_whitespace,
106        config.ignore_case,
107    )
108}
109
110/// Standard diff function - clean, predictable output
111fn diff_standard_implementation(
112    v1: &Value,
113    v2: &Value,
114    ignore_keys_regex: Option<&Regex>,
115    epsilon: Option<f64>,
116    array_id_key: Option<&str>,
117    ignore_whitespace: bool,
118    ignore_case: bool,
119) -> Vec<DiffResult> {
120    let mut results = Vec::new();
121
122    // Handle root level type or value change first
123    if !values_are_equal_with_config(v1, v2, epsilon, ignore_whitespace, ignore_case) {
124        let type_match = matches!(
125            (v1, v2),
126            (Value::Null, Value::Null)
127                | (Value::Bool(_), Value::Bool(_))
128                | (Value::Number(_), Value::Number(_))
129                | (Value::String(_), Value::String(_))
130                | (Value::Array(_), Value::Array(_))
131                | (Value::Object(_), Value::Object(_))
132        );
133
134        if !type_match {
135            results.push(DiffResult::TypeChanged(
136                "".to_string(),
137                v1.clone(),
138                v2.clone(),
139            ));
140            return results;
141        } else if v1.is_object() && v2.is_object() {
142            diff_objects(
143                "",
144                v1.as_object().unwrap(),
145                v2.as_object().unwrap(),
146                &mut results,
147                ignore_keys_regex,
148                epsilon,
149                array_id_key,
150                ignore_whitespace,
151                ignore_case,
152            );
153        } else if v1.is_array() && v2.is_array() {
154            diff_arrays(
155                "",
156                v1.as_array().unwrap(),
157                v2.as_array().unwrap(),
158                &mut results,
159                ignore_keys_regex,
160                epsilon,
161                array_id_key,
162                ignore_whitespace,
163                ignore_case,
164            );
165        } else {
166            results.push(DiffResult::Modified("".to_string(), v1.clone(), v2.clone()));
167        }
168    }
169
170    results
171}
172
173/// Memory-optimized diff function - explicitly requested optimization
174pub fn diff_optimized(
175    v1: &Value,
176    v2: &Value,
177    ignore_keys_regex: Option<&Regex>,
178    epsilon: Option<f64>,
179    array_id_key: Option<&str>,
180) -> Vec<DiffResult> {
181    let mut results = Vec::new();
182    memory_efficient_diff(
183        v1,
184        v2,
185        &mut results,
186        ignore_keys_regex,
187        epsilon,
188        array_id_key,
189        false,
190        false,
191    );
192    results
193}
194
195/// Memory-optimized diff function with configuration support
196pub fn diff_optimized_with_config(v1: &Value, v2: &Value, config: &DiffConfig) -> Vec<DiffResult> {
197    let mut results = Vec::new();
198    memory_efficient_diff(
199        v1,
200        v2,
201        &mut results,
202        config.ignore_keys_regex.as_ref(),
203        config.epsilon,
204        config.array_id_key.as_deref(),
205        config.ignore_whitespace,
206        config.ignore_case,
207    );
208    results
209}
210
211/// Enhanced diff function with explicit configuration
212pub fn diff_with_config(v1: &Value, v2: &Value, config: &DiffConfig) -> Vec<DiffResult> {
213    // Explicit choice: user decides which algorithm to use
214    if config.use_memory_optimization {
215        diff_optimized_with_config(v1, v2, config)
216    } else {
217        diff_standard_with_config(v1, v2, config)
218    }
219}
220
221/// Backward compatible diff function - uses standard algorithm
222pub fn diff(
223    v1: &Value,
224    v2: &Value,
225    ignore_keys_regex: Option<&Regex>,
226    epsilon: Option<f64>,
227    array_id_key: Option<&str>,
228) -> Vec<DiffResult> {
229    // Always use standard algorithm for predictable behavior
230    diff_standard(v1, v2, ignore_keys_regex, epsilon, array_id_key)
231}
232
233#[allow(clippy::too_many_arguments)]
234fn diff_recursive(
235    path: &str,
236    v1: &Value,
237    v2: &Value,
238    results: &mut Vec<DiffResult>,
239    ignore_keys_regex: Option<&Regex>,
240    epsilon: Option<f64>,
241    array_id_key: Option<&str>,
242    ignore_whitespace: bool,
243    ignore_case: bool,
244) {
245    match (v1, v2) {
246        (Value::Object(map1), Value::Object(map2)) => {
247            diff_objects(
248                path,
249                map1,
250                map2,
251                results,
252                ignore_keys_regex,
253                epsilon,
254                array_id_key,
255                ignore_whitespace,
256                ignore_case,
257            );
258        }
259        (Value::Array(arr1), Value::Array(arr2)) => {
260            diff_arrays(
261                path,
262                arr1,
263                arr2,
264                results,
265                ignore_keys_regex,
266                epsilon,
267                array_id_key,
268                ignore_whitespace,
269                ignore_case,
270            );
271        }
272        _ => { /* Should not happen if called correctly from diff_objects/diff_arrays */ }
273    }
274}
275
276#[allow(clippy::too_many_arguments)]
277fn diff_objects(
278    path: &str,
279    map1: &serde_json::Map<String, Value>,
280    map2: &serde_json::Map<String, Value>,
281    results: &mut Vec<DiffResult>,
282    ignore_keys_regex: Option<&Regex>,
283    epsilon: Option<f64>,
284    array_id_key: Option<&str>,
285    ignore_whitespace: bool,
286    ignore_case: bool,
287) {
288    // Check for modified or removed keys
289    for (key, value1) in map1 {
290        let current_path = if path.is_empty() {
291            key.clone()
292        } else {
293            format!("{path}.{key}")
294        };
295        if let Some(regex) = ignore_keys_regex {
296            if regex.is_match(key) {
297                continue;
298            }
299        }
300        match map2.get(key) {
301            Some(value2) => {
302                // Recurse for nested objects/arrays
303                if value1.is_object() && value2.is_object()
304                    || value1.is_array() && value2.is_array()
305                {
306                    diff_recursive(
307                        &current_path,
308                        value1,
309                        value2,
310                        results,
311                        ignore_keys_regex,
312                        epsilon,
313                        array_id_key,
314                        ignore_whitespace,
315                        ignore_case,
316                    );
317                } else if !values_are_equal_with_config(
318                    value1,
319                    value2,
320                    epsilon,
321                    ignore_whitespace,
322                    ignore_case,
323                ) {
324                    let type_match = matches!(
325                        (value1, value2),
326                        (Value::Null, Value::Null)
327                            | (Value::Bool(_), Value::Bool(_))
328                            | (Value::Number(_), Value::Number(_))
329                            | (Value::String(_), Value::String(_))
330                            | (Value::Array(_), Value::Array(_))
331                            | (Value::Object(_), Value::Object(_))
332                    );
333
334                    if !type_match {
335                        results.push(DiffResult::TypeChanged(
336                            current_path,
337                            value1.clone(),
338                            value2.clone(),
339                        ));
340                    } else {
341                        results.push(DiffResult::Modified(
342                            current_path,
343                            value1.clone(),
344                            value2.clone(),
345                        ));
346                    }
347                }
348            }
349            None => {
350                results.push(DiffResult::Removed(current_path, value1.clone()));
351            }
352        }
353    }
354
355    // Check for added keys
356    for (key, value2) in map2 {
357        if !map1.contains_key(key) {
358            let current_path = if path.is_empty() {
359                (*key).clone()
360            } else {
361                format!("{path}.{key}")
362            };
363            results.push(DiffResult::Added(current_path, value2.clone()));
364        }
365    }
366}
367
368#[allow(clippy::too_many_arguments)]
369fn diff_arrays(
370    path: &str,
371    arr1: &[Value],
372    arr2: &[Value],
373    results: &mut Vec<DiffResult>,
374    ignore_keys_regex: Option<&Regex>,
375    epsilon: Option<f64>,
376    array_id_key: Option<&str>,
377    ignore_whitespace: bool,
378    ignore_case: bool,
379) {
380    if let Some(id_key) = array_id_key {
381        let mut map1: HashMap<Value, &Value> = HashMap::new();
382        let mut no_id_elements1: Vec<(usize, &Value)> = Vec::new();
383        for (i, val) in arr1.iter().enumerate() {
384            if let Some(id_val) = val.get(id_key) {
385                map1.insert(id_val.clone(), val);
386            } else {
387                no_id_elements1.push((i, val));
388            }
389        }
390
391        let mut map2: HashMap<Value, &Value> = HashMap::new();
392        let mut no_id_elements2: Vec<(usize, &Value)> = Vec::new();
393        for (i, val) in arr2.iter().enumerate() {
394            if let Some(id_val) = val.get(id_key) {
395                map2.insert(id_val.clone(), val);
396            } else {
397                no_id_elements2.push((i, val));
398            }
399        }
400
401        // Check for modified or removed elements
402        for (id_val, val1) in &map1 {
403            let current_path = format!("{path}[{id_key}={id_val}]");
404            match map2.get(id_val) {
405                Some(val2) => {
406                    // Recurse for nested objects/arrays
407                    if val1.is_object() && val2.is_object() || val1.is_array() && val2.is_array() {
408                        diff_recursive(
409                            &current_path,
410                            val1,
411                            val2,
412                            results,
413                            ignore_keys_regex,
414                            epsilon,
415                            array_id_key,
416                            ignore_whitespace,
417                            ignore_case,
418                        );
419                    } else if !values_are_equal_with_config(
420                        val1,
421                        val2,
422                        epsilon,
423                        ignore_whitespace,
424                        ignore_case,
425                    ) {
426                        let type_match = matches!(
427                            (val1, val2),
428                            (Value::Null, Value::Null)
429                                | (Value::Bool(_), Value::Bool(_))
430                                | (Value::Number(_), Value::Number(_))
431                                | (Value::String(_), Value::String(_))
432                                | (Value::Array(_), Value::Array(_))
433                                | (Value::Object(_), Value::Object(_))
434                        );
435
436                        if !type_match {
437                            results.push(DiffResult::TypeChanged(
438                                current_path,
439                                (*val1).clone(),
440                                (*val2).clone(),
441                            ));
442                        } else {
443                            results.push(DiffResult::Modified(
444                                current_path,
445                                (*val1).clone(),
446                                (*val2).clone(),
447                            ));
448                        }
449                    }
450                }
451                None => {
452                    results.push(DiffResult::Removed(current_path, (*val1).clone()));
453                }
454            }
455        }
456
457        // Check for added elements with ID
458        for (id_val, val2) in map2 {
459            if !map1.contains_key(&id_val) {
460                let current_path = format!("{path}[{id_key}={id_val}]");
461                results.push(DiffResult::Added(current_path, val2.clone()));
462            }
463        }
464
465        // Handle elements without ID using index-based comparison
466        let max_len = no_id_elements1.len().max(no_id_elements2.len());
467        for i in 0..max_len {
468            match (no_id_elements1.get(i), no_id_elements2.get(i)) {
469                (Some((idx1, val1)), Some((_idx2, val2))) => {
470                    let current_path = format!("{path}[{idx1}]");
471                    if val1.is_object() && val2.is_object() || val1.is_array() && val2.is_array() {
472                        diff_recursive(
473                            &current_path,
474                            val1,
475                            val2,
476                            results,
477                            ignore_keys_regex,
478                            epsilon,
479                            array_id_key,
480                            ignore_whitespace,
481                            ignore_case,
482                        );
483                    } else if !values_are_equal_with_config(
484                        val1,
485                        val2,
486                        epsilon,
487                        ignore_whitespace,
488                        ignore_case,
489                    ) {
490                        let type_match = matches!(
491                            (val1, val2),
492                            (Value::Null, Value::Null)
493                                | (Value::Bool(_), Value::Bool(_))
494                                | (Value::Number(_), Value::Number(_))
495                                | (Value::String(_), Value::String(_))
496                                | (Value::Array(_), Value::Array(_))
497                                | (Value::Object(_), Value::Object(_))
498                        );
499
500                        if !type_match {
501                            results.push(DiffResult::TypeChanged(
502                                current_path,
503                                (*val1).clone(),
504                                (*val2).clone(),
505                            ));
506                        } else {
507                            results.push(DiffResult::Modified(
508                                current_path,
509                                (*val1).clone(),
510                                (*val2).clone(),
511                            ));
512                        }
513                    }
514                }
515                (Some((idx1, val1)), None) => {
516                    let current_path = format!("{path}[{idx1}]");
517                    results.push(DiffResult::Removed(current_path, (*val1).clone()));
518                }
519                (None, Some((idx2, val2))) => {
520                    let current_path = format!("{path}[{idx2}]");
521                    results.push(DiffResult::Added(current_path, (*val2).clone()));
522                }
523                (None, None) => break,
524            }
525        }
526    } else {
527        // Fallback to index-based comparison if no id_key is provided
528        let max_len = arr1.len().max(arr2.len());
529        for i in 0..max_len {
530            let current_path = format!("{path}[{i}]");
531            match (arr1.get(i), arr2.get(i)) {
532                (Some(val1), Some(val2)) => {
533                    // Recurse for nested objects/arrays within arrays
534                    if val1.is_object() && val2.is_object() || val1.is_array() && val2.is_array() {
535                        diff_recursive(
536                            &current_path,
537                            val1,
538                            val2,
539                            results,
540                            ignore_keys_regex,
541                            epsilon,
542                            array_id_key,
543                            ignore_whitespace,
544                            ignore_case,
545                        );
546                    } else if !values_are_equal_with_config(
547                        val1,
548                        val2,
549                        epsilon,
550                        ignore_whitespace,
551                        ignore_case,
552                    ) {
553                        let type_match = matches!(
554                            (val1, val2),
555                            (Value::Null, Value::Null)
556                                | (Value::Bool(_), Value::Bool(_))
557                                | (Value::Number(_), Value::Number(_))
558                                | (Value::String(_), Value::String(_))
559                                | (Value::Array(_), Value::Array(_))
560                                | (Value::Object(_), Value::Object(_))
561                        );
562
563                        if !type_match {
564                            results.push(DiffResult::TypeChanged(
565                                current_path,
566                                val1.clone(),
567                                val2.clone(),
568                            ));
569                        } else {
570                            results.push(DiffResult::Modified(
571                                current_path,
572                                val1.clone(),
573                                val2.clone(),
574                            ));
575                        }
576                    }
577                }
578                (Some(val1), None) => {
579                    results.push(DiffResult::Removed(current_path, val1.clone()));
580                }
581                (None, Some(val2)) => {
582                    results.push(DiffResult::Added(current_path, val2.clone()));
583                }
584                (None, None) => { /* Should not happen */ }
585            }
586        }
587    }
588}
589
590fn values_are_equal_with_config(
591    v1: &Value,
592    v2: &Value,
593    epsilon: Option<f64>,
594    ignore_whitespace: bool,
595    ignore_case: bool,
596) -> bool {
597    // Handle numeric comparisons with epsilon
598    if let (Some(e), Value::Number(n1), Value::Number(n2)) = (epsilon, v1, v2) {
599        if let (Some(f1), Some(f2)) = (n1.as_f64(), n2.as_f64()) {
600            return (f1 - f2).abs() < e;
601        }
602    }
603
604    // Handle string comparisons with ignore options
605    if let (Value::String(s1), Value::String(s2)) = (v1, v2) {
606        let mut str1 = s1.as_str();
607        let mut str2 = s2.as_str();
608
609        let owned_s1;
610        let owned_s2;
611
612        // Apply whitespace normalization if needed
613        if ignore_whitespace {
614            owned_s1 = normalize_whitespace(str1);
615            owned_s2 = normalize_whitespace(str2);
616            str1 = &owned_s1;
617            str2 = &owned_s2;
618        }
619
620        // Apply case normalization if needed
621        if ignore_case {
622            return str1.to_lowercase() == str2.to_lowercase();
623        } else {
624            return str1 == str2;
625        }
626    }
627
628    // Default comparison for all other types
629    v1 == v2
630}
631
/// Collapses every run of whitespace into a single space and drops leading
/// and trailing whitespace.
fn normalize_whitespace(s: &str) -> String {
    let mut normalized = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        // Words are never empty, so an empty buffer means "first word".
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
636
637pub fn value_type_name(value: &Value) -> &str {
638    match value {
639        Value::Null => "Null",
640        Value::Bool(_) => "Boolean",
641        Value::Number(_) => "Number",
642        Value::String(_) => "String",
643        Value::Array(_) => "Array",
644        Value::Object(_) => "Object",
645    }
646}
647
648/// Get approximate memory usage of a Value in bytes
649pub fn estimate_memory_usage(value: &Value) -> usize {
650    match value {
651        Value::Null => 0,
652        Value::Bool(_) => 1,
653        Value::Number(_) => 8, // Approximate for f64
654        Value::String(s) => s.len(),
655        Value::Array(arr) => {
656            arr.iter().map(estimate_memory_usage).sum::<usize>() + (arr.len() * 8)
657            // Vec overhead
658        }
659        Value::Object(obj) => {
660            obj.iter()
661                .map(|(k, v)| k.len() + estimate_memory_usage(v))
662                .sum::<usize>()
663                + (obj.len() * 16) // Map overhead
664        }
665    }
666}
667
668/// Check if processing these values would exceed memory limits
669pub fn would_exceed_memory_limit(v1: &Value, v2: &Value) -> bool {
670    const MAX_MEMORY_USAGE: usize = 1024 * 1024 * 1024; // 1GB limit
671
672    let usage1 = estimate_memory_usage(v1);
673    let usage2 = estimate_memory_usage(v2);
674
675    // Account for diff results and temporary data (multiply by 3)
676    (usage1 + usage2) * 3 > MAX_MEMORY_USAGE
677}
678
679pub fn parse_ini(content: &str) -> Result<Value> {
680    use configparser::ini::Ini;
681
682    let mut ini = Ini::new();
683    ini.read(content.to_string())
684        .map_err(|e| anyhow!("Failed to parse INI: {}", e))?;
685
686    let mut root_map = serde_json::Map::new();
687
688    for section_name in ini.sections() {
689        let mut section_map = serde_json::Map::new();
690
691        if let Some(section) = ini.get_map_ref().get(&section_name) {
692            for (key, value) in section {
693                if let Some(v) = value {
694                    section_map.insert(key.clone(), Value::String(v.clone()));
695                } else {
696                    section_map.insert(key.clone(), Value::Null);
697                }
698            }
699        }
700
701        root_map.insert(section_name, Value::Object(section_map));
702    }
703
704    Ok(Value::Object(root_map))
705}
706
707pub fn parse_xml(content: &str) -> Result<Value> {
708    let value: Value = from_str(content)?;
709    Ok(value)
710}
711
712pub fn parse_csv(content: &str) -> Result<Value> {
713    let mut reader = ReaderBuilder::new().from_reader(content.as_bytes());
714    let mut records = Vec::new();
715
716    let headers = reader.headers()?.clone();
717    let has_headers = !headers.is_empty();
718
719    for result in reader.into_records() {
720        let record = result?;
721        if has_headers {
722            let mut obj = serde_json::Map::new();
723            for (i, header) in headers.iter().enumerate() {
724                if let Some(value) = record.get(i) {
725                    obj.insert(header.to_string(), Value::String(value.to_string()));
726                }
727            }
728            records.push(Value::Object(obj));
729        } else {
730            let mut arr = Vec::new();
731            for field in record.iter() {
732                arr.push(Value::String(field.to_string()));
733            }
734            records.push(Value::Array(arr));
735        }
736    }
737    Ok(Value::Array(records))
738}
739
740/// Parse large files with streaming support to reduce memory usage
741/// Returns None if file is too large (>100MB) and should use streaming diff
742pub fn parse_large_file<P: AsRef<Path>>(path: P) -> Result<Option<Value>> {
743    let file = File::open(&path)?;
744    let metadata = file.metadata()?;
745    let file_size = metadata.len();
746
747    // 100MB threshold for streaming
748    const MAX_MEMORY_SIZE: u64 = 100 * 1024 * 1024;
749
750    if file_size > MAX_MEMORY_SIZE {
751        return Ok(None); // Signal that streaming should be used
752    }
753
754    let mut reader = BufReader::new(file);
755    let mut content = String::new();
756    reader.read_to_string(&mut content)?;
757
758    // Auto-detect format from file extension
759    let path_str = path.as_ref().to_string_lossy();
760    if path_str.ends_with(".json") {
761        Ok(Some(serde_json::from_str(&content)?))
762    } else if path_str.ends_with(".yaml") || path_str.ends_with(".yml") {
763        Ok(Some(serde_yml::from_str(&content)?))
764    } else if path_str.ends_with(".toml") {
765        Ok(Some(toml::from_str(&content)?))
766    } else {
767        Err(anyhow!("Unsupported file format for large file parsing"))
768    }
769}
770
771/// Memory-efficient diff for large files using streaming approach
772pub fn diff_large_files<P: AsRef<Path>>(
773    path1: P,
774    path2: P,
775    ignore_keys_regex: Option<&Regex>,
776    epsilon: Option<f64>,
777    array_id_key: Option<&str>,
778) -> Result<Vec<DiffResult>> {
779    // Try to parse normally first
780    let v1_opt = parse_large_file(&path1)?;
781    let v2_opt = parse_large_file(&path2)?;
782
783    match (v1_opt, v2_opt) {
784        (Some(v1), Some(v2)) => {
785            // Both files are small enough for in-memory processing
786            Ok(diff(&v1, &v2, ignore_keys_regex, epsilon, array_id_key))
787        }
788        _ => {
789            // At least one file is too large, use streaming diff
790            streaming_diff(&path1, &path2, ignore_keys_regex, epsilon, array_id_key)
791        }
792    }
793}
794
795/// Streaming diff implementation for very large files
796fn streaming_diff<P: AsRef<Path>>(
797    path1: P,
798    path2: P,
799    ignore_keys_regex: Option<&Regex>,
800    epsilon: Option<f64>,
801    array_id_key: Option<&str>,
802) -> Result<Vec<DiffResult>> {
803    // For now, implement a simplified version that chunks the files
804    // This is a placeholder for more sophisticated streaming logic
805    let mut results = Vec::new();
806
807    // Read files in chunks and compare
808    let file1 = File::open(&path1)?;
809    let file2 = File::open(&path2)?;
810
811    let mut reader1 = BufReader::new(file1);
812    let mut reader2 = BufReader::new(file2);
813
814    let mut buffer1 = String::new();
815    let mut buffer2 = String::new();
816
817    // Read entire files (for now - this would be optimized further)
818    reader1.read_to_string(&mut buffer1)?;
819    reader2.read_to_string(&mut buffer2)?;
820
821    // Parse with reduced memory footprint
822    let v1: Value = serde_json::from_str(&buffer1)
823        .or_else(|_| serde_yml::from_str(&buffer1))
824        .or_else(|_| toml::from_str(&buffer1))
825        .map_err(|e| anyhow!("Failed to parse file 1: {}", e))?;
826
827    let v2: Value = serde_json::from_str(&buffer2)
828        .or_else(|_| serde_yml::from_str(&buffer2))
829        .or_else(|_| toml::from_str(&buffer2))
830        .map_err(|e| anyhow!("Failed to parse file 2: {}", e))?;
831
832    // Clear buffers to free memory
833    drop(buffer1);
834    drop(buffer2);
835
836    // Use optimized diff with memory-conscious approach
837    memory_efficient_diff(
838        &v1,
839        &v2,
840        &mut results,
841        ignore_keys_regex,
842        epsilon,
843        array_id_key,
844        false, // ignore_whitespace - not supported in streaming mode
845        false, // ignore_case - not supported in streaming mode
846    );
847
848    Ok(results)
849}
850
851/// Memory-efficient diff implementation that processes data in chunks
852#[allow(clippy::too_many_arguments)]
853fn memory_efficient_diff(
854    v1: &Value,
855    v2: &Value,
856    results: &mut Vec<DiffResult>,
857    ignore_keys_regex: Option<&Regex>,
858    epsilon: Option<f64>,
859    array_id_key: Option<&str>,
860    ignore_whitespace: bool,
861    ignore_case: bool,
862) {
863    // Process diff without cloning large values when possible
864    if !values_are_equal_with_config(v1, v2, epsilon, ignore_whitespace, ignore_case) {
865        let type_match = matches!(
866            (v1, v2),
867            (Value::Null, Value::Null)
868                | (Value::Bool(_), Value::Bool(_))
869                | (Value::Number(_), Value::Number(_))
870                | (Value::Array(_), Value::Array(_))
871                | (Value::Object(_), Value::Object(_))
872        );
873
874        if !type_match {
875            results.push(DiffResult::TypeChanged(
876                "".to_string(),
877                v1.clone(),
878                v2.clone(),
879            ));
880        } else if v1.is_object() && v2.is_object() {
881            memory_efficient_diff_objects(
882                "",
883                v1.as_object().unwrap(),
884                v2.as_object().unwrap(),
885                results,
886                ignore_keys_regex,
887                epsilon,
888                array_id_key,
889                ignore_whitespace,
890                ignore_case,
891            );
892        } else if v1.is_array() && v2.is_array() {
893            memory_efficient_diff_arrays(
894                "",
895                v1.as_array().unwrap(),
896                v2.as_array().unwrap(),
897                results,
898                ignore_keys_regex,
899                epsilon,
900                array_id_key,
901                ignore_whitespace,
902                ignore_case,
903            );
904        } else {
905            results.push(DiffResult::Modified("".to_string(), v1.clone(), v2.clone()));
906        }
907    }
908}
909
910/// Memory-efficient object comparison
911#[allow(clippy::too_many_arguments)]
912fn memory_efficient_diff_objects(
913    path: &str,
914    map1: &serde_json::Map<String, Value>,
915    map2: &serde_json::Map<String, Value>,
916    results: &mut Vec<DiffResult>,
917    ignore_keys_regex: Option<&Regex>,
918    epsilon: Option<f64>,
919    array_id_key: Option<&str>,
920    ignore_whitespace: bool,
921    ignore_case: bool,
922) {
923    // Process keys in batches to limit memory usage
924    const BATCH_SIZE: usize = 1000;
925
926    let keys1: Vec<_> = map1.keys().collect();
927    let keys2: Vec<_> = map2.keys().collect();
928
929    // Process in batches
930    for chunk in keys1.chunks(BATCH_SIZE) {
931        for key in chunk {
932            if let Some(regex) = ignore_keys_regex {
933                if regex.is_match(key) {
934                    continue;
935                }
936            }
937
938            let current_path = if path.is_empty() {
939                (*key).clone()
940            } else {
941                format!("{path}.{key}")
942            };
943
944            match (map1.get(*key), map2.get(*key)) {
945                (Some(value1), Some(value2)) => {
946                    if value1.is_object() && value2.is_object() {
947                        memory_efficient_diff_objects(
948                            &current_path,
949                            value1.as_object().unwrap(),
950                            value2.as_object().unwrap(),
951                            results,
952                            ignore_keys_regex,
953                            epsilon,
954                            array_id_key,
955                            ignore_whitespace,
956                            ignore_case,
957                        );
958                    } else if value1.is_array() && value2.is_array() {
959                        memory_efficient_diff_arrays(
960                            &current_path,
961                            value1.as_array().unwrap(),
962                            value2.as_array().unwrap(),
963                            results,
964                            ignore_keys_regex,
965                            epsilon,
966                            array_id_key,
967                            ignore_whitespace,
968                            ignore_case,
969                        );
970                    } else if !values_are_equal_with_config(
971                        value1,
972                        value2,
973                        epsilon,
974                        ignore_whitespace,
975                        ignore_case,
976                    ) {
977                        let type_match = matches!(
978                            (value1, value2),
979                            (Value::Null, Value::Null)
980                                | (Value::Bool(_), Value::Bool(_))
981                                | (Value::Number(_), Value::Number(_))
982                                | (Value::String(_), Value::String(_))
983                                | (Value::Array(_), Value::Array(_))
984                                | (Value::Object(_), Value::Object(_))
985                        );
986
987                        if !type_match {
988                            results.push(DiffResult::TypeChanged(
989                                current_path,
990                                value1.clone(),
991                                value2.clone(),
992                            ));
993                        } else {
994                            results.push(DiffResult::Modified(
995                                current_path,
996                                value1.clone(),
997                                value2.clone(),
998                            ));
999                        }
1000                    }
1001                }
1002                (Some(value1), None) => {
1003                    results.push(DiffResult::Removed(current_path, value1.clone()));
1004                }
1005                (None, Some(_)) => {
1006                    // Will be handled in the "added" phase
1007                }
1008                (None, None) => {
1009                    // Should not happen
1010                }
1011            }
1012        }
1013    }
1014
1015    // Process added keys
1016    for chunk in keys2.chunks(BATCH_SIZE) {
1017        for key in chunk {
1018            if !map1.contains_key(*key) {
1019                let current_path = if path.is_empty() {
1020                    (*key).clone()
1021                } else {
1022                    format!("{path}.{key}")
1023                };
1024                if let Some(value2) = map2.get(*key) {
1025                    results.push(DiffResult::Added(current_path, value2.clone()));
1026                }
1027            }
1028        }
1029    }
1030}
1031
1032/// Memory-efficient array comparison
1033#[allow(clippy::too_many_arguments)]
1034fn memory_efficient_diff_arrays(
1035    path: &str,
1036    arr1: &[Value],
1037    arr2: &[Value],
1038    results: &mut Vec<DiffResult>,
1039    ignore_keys_regex: Option<&Regex>,
1040    epsilon: Option<f64>,
1041    array_id_key: Option<&str>,
1042    ignore_whitespace: bool,
1043    ignore_case: bool,
1044) {
1045    // Use the existing array diff logic but with batching for very large arrays
1046    const BATCH_SIZE: usize = 10000;
1047
1048    if arr1.len() > BATCH_SIZE || arr2.len() > BATCH_SIZE {
1049        // Process large arrays in chunks
1050        let max_len = arr1.len().max(arr2.len());
1051        for chunk_start in (0..max_len).step_by(BATCH_SIZE) {
1052            let chunk_end = (chunk_start + BATCH_SIZE).min(max_len);
1053            let chunk1 = arr1.get(chunk_start..chunk_end).unwrap_or(&[]);
1054            let chunk2 = arr2.get(chunk_start..chunk_end).unwrap_or(&[]);
1055
1056            // Process this chunk using existing logic
1057            diff_arrays(
1058                path,
1059                chunk1,
1060                chunk2,
1061                results,
1062                ignore_keys_regex,
1063                epsilon,
1064                array_id_key,
1065                ignore_whitespace,
1066                ignore_case,
1067            );
1068        }
1069    } else {
1070        // Use existing implementation for smaller arrays
1071        diff_arrays(
1072            path,
1073            arr1,
1074            arr2,
1075            results,
1076            ignore_keys_regex,
1077            epsilon,
1078            array_id_key,
1079            ignore_whitespace,
1080            ignore_case,
1081        );
1082    }
1083}
1084
1085// API is already public
diffx_core/lib.rs

diffx_core/
lib.rs