// void_core/diff/content.rs

1//! Content-level diff generation.
2//!
3//! Builds on the hash-only `TreeDiff` to produce line-by-line unified diffs.
4//! Content is only loaded when unified output is needed — the fast hash-only
5//! diff phase is unchanged.
6
7use std::fs;
8use std::path::Path;
9
10use crate::crypto::SecretKey;
11use crate::diff::types::{DiffKind, TreeDiff};
12use crate::index::WorkspaceIndex;
13
14use crate::staged::read_staged_blob;
15use crate::{cid, refs, ContentHash, Result, VoidContext};
16
17use crate::cid::VoidCid;
18
/// Upper bound, in bytes, on the size of a file that will be diffed line by
/// line. Anything bigger is reported with a placeholder instead of hunks.
const MAX_DIFF_SIZE: usize = 1 << 20; // 1 MiB

/// How many leading bytes are inspected when sniffing for binary content.
const BINARY_PROBE_SIZE: usize = 8 * 1024;
24
25/// A content-level diff for a single file.
26#[derive(Debug, Clone)]
27pub struct ContentDiff {
28    /// File path (new path for renames).
29    pub path: String,
30    /// Type of change.
31    pub kind: DiffKind,
32    /// True if binary content detected.
33    pub binary: bool,
34    /// True if file exceeds size limit.
35    pub too_large: bool,
36    /// Diff hunks (empty if binary or too large).
37    pub hunks: Vec<Hunk>,
38    /// Original path for renames.
39    pub rename_from: Option<String>,
40}
41
42/// A contiguous range of changed lines with surrounding context.
43#[derive(Debug, Clone)]
44pub struct Hunk {
45    /// 1-based start line in old file.
46    pub old_start: usize,
47    /// Number of lines from old file.
48    pub old_count: usize,
49    /// 1-based start line in new file.
50    pub new_start: usize,
51    /// Number of lines from new file.
52    pub new_count: usize,
53    /// Lines in this hunk (context, additions, deletions).
54    pub lines: Vec<DiffLine>,
55}
56
/// One rendered line inside a diff hunk.
#[derive(Clone, Debug)]
pub struct DiffLine {
    /// Marker character: `'+'` for an addition, `'-'` for a deletion and
    /// `' '` for unchanged context.
    pub tag: char,
    /// Text of the line with its trailing newline removed.
    pub content: String,
}
65
66// ---------------------------------------------------------------------------
67// Binary / size detection
68// ---------------------------------------------------------------------------
69
70/// Checks if content appears to be binary (null bytes in first 8KB).
71fn is_binary(content: &[u8]) -> bool {
72    let probe = &content[..content.len().min(BINARY_PROBE_SIZE)];
73    probe.contains(&0)
74}
75
76// ---------------------------------------------------------------------------
77// Hunk generation via `similar`
78// ---------------------------------------------------------------------------
79
80/// Generates unified diff hunks from old and new text.
81fn generate_hunks(old: &str, new: &str) -> Vec<Hunk> {
82    let diff = similar::TextDiff::from_lines(old, new);
83    let mut hunks = Vec::new();
84
85    for group in diff.grouped_ops(3) {
86        let mut lines = Vec::new();
87        let mut old_start = 0;
88        let mut old_count = 0;
89        let mut new_start = 0;
90        let mut new_count = 0;
91
92        for op in &group {
93            // Track range boundaries from first op
94            if lines.is_empty() {
95                old_start = op.old_range().start;
96                new_start = op.new_range().start;
97            }
98
99            match op.tag() {
100                similar::DiffTag::Equal => {
101                    for value in diff.iter_changes(op) {
102                        lines.push(DiffLine {
103                            tag: ' ',
104                            content: value.value().trim_end_matches('\n').to_string(),
105                        });
106                        old_count += 1;
107                        new_count += 1;
108                    }
109                }
110                similar::DiffTag::Delete => {
111                    for value in diff.iter_changes(op) {
112                        lines.push(DiffLine {
113                            tag: '-',
114                            content: value.value().trim_end_matches('\n').to_string(),
115                        });
116                        old_count += 1;
117                    }
118                }
119                similar::DiffTag::Insert => {
120                    for value in diff.iter_changes(op) {
121                        lines.push(DiffLine {
122                            tag: '+',
123                            content: value.value().trim_end_matches('\n').to_string(),
124                        });
125                        new_count += 1;
126                    }
127                }
128                similar::DiffTag::Replace => {
129                    for value in diff.iter_changes(op) {
130                        match value.tag() {
131                            similar::ChangeTag::Delete => {
132                                lines.push(DiffLine {
133                                    tag: '-',
134                                    content: value.value().trim_end_matches('\n').to_string(),
135                                });
136                                old_count += 1;
137                            }
138                            similar::ChangeTag::Insert => {
139                                lines.push(DiffLine {
140                                    tag: '+',
141                                    content: value.value().trim_end_matches('\n').to_string(),
142                                });
143                                new_count += 1;
144                            }
145                            similar::ChangeTag::Equal => {
146                                lines.push(DiffLine {
147                                    tag: ' ',
148                                    content: value.value().trim_end_matches('\n').to_string(),
149                                });
150                                old_count += 1;
151                                new_count += 1;
152                            }
153                        }
154                    }
155                }
156            }
157        }
158
159        hunks.push(Hunk {
160            old_start: old_start + 1, // 1-based
161            old_count,
162            new_start: new_start + 1, // 1-based
163            new_count,
164            lines,
165        });
166    }
167
168    hunks
169}
170
171// ---------------------------------------------------------------------------
172// Single-file diff builder
173// ---------------------------------------------------------------------------
174
175/// Builds a `ContentDiff` from raw old/new content bytes.
176fn diff_single_file(
177    path: &str,
178    kind: DiffKind,
179    old_content: Option<&[u8]>,
180    new_content: Option<&[u8]>,
181) -> ContentDiff {
182    let rename_from = match &kind {
183        DiffKind::Renamed { from, .. } => Some(from.clone()),
184        _ => None,
185    };
186
187    let old_bytes = old_content.unwrap_or(&[]);
188    let new_bytes = new_content.unwrap_or(&[]);
189
190    // Size check
191    if old_bytes.len() > MAX_DIFF_SIZE || new_bytes.len() > MAX_DIFF_SIZE {
192        return ContentDiff {
193            path: path.to_string(),
194            kind,
195            binary: false,
196            too_large: true,
197            hunks: Vec::new(),
198            rename_from,
199        };
200    }
201
202    // Binary check
203    if (!old_bytes.is_empty() && is_binary(old_bytes))
204        || (!new_bytes.is_empty() && is_binary(new_bytes))
205    {
206        return ContentDiff {
207            path: path.to_string(),
208            kind,
209            binary: true,
210            too_large: false,
211            hunks: Vec::new(),
212            rename_from,
213        };
214    }
215
216    // UTF-8 validation — treat non-UTF-8 as binary
217    let old_str = match std::str::from_utf8(old_bytes) {
218        Ok(s) => s,
219        Err(_) => {
220            return ContentDiff {
221                path: path.to_string(),
222                kind,
223                binary: true,
224                too_large: false,
225                hunks: Vec::new(),
226                rename_from,
227            };
228        }
229    };
230    let new_str = match std::str::from_utf8(new_bytes) {
231        Ok(s) => s,
232        Err(_) => {
233            return ContentDiff {
234                path: path.to_string(),
235                kind,
236                binary: true,
237                too_large: false,
238                hunks: Vec::new(),
239                rename_from,
240            };
241        }
242    };
243
244    let hunks = generate_hunks(old_str, new_str);
245
246    ContentDiff {
247        path: path.to_string(),
248        kind,
249        binary: false,
250        too_large: false,
251        hunks,
252        rename_from,
253    }
254}
255
256// ---------------------------------------------------------------------------
257// Content retrieval helpers
258// ---------------------------------------------------------------------------
259
260/// Loaded commit state needed for manifest-driven file reads.
261struct CommitFiles {
262    store: crate::store::FsStore,
263    commit: crate::metadata::Commit,
264    reader: crate::crypto::CommitReader,
265    ancestor_keys: Vec<void_crypto::ContentKey>,
266}
267
268impl CommitFiles {
269    fn load(ctx: &VoidContext, commit_cid: &VoidCid) -> Result<Self> {
270        let store = ctx.open_store()?;
271        let (commit, reader) = ctx.load_commit(&store, commit_cid)?;
272        let ancestor_keys =
273            crate::crypto::collect_ancestor_content_keys_vault(&ctx.crypto.vault, &store, &commit);
274        Ok(Self { store, commit, reader, ancestor_keys })
275    }
276
277    fn read_file(&self, ctx: &VoidContext, path: &str) -> Option<Vec<u8>> {
278        ctx.read_file_from_commit(&self.store, &self.commit, &self.reader, &self.ancestor_keys, path)
279            .ok()
280            .map(|fc| fc.into())
281    }
282}
283
/// Loads the bytes of `path`, resolved relative to the `workspace` directory.
///
/// Returns `None` when the file cannot be read; the I/O error is discarded.
fn read_file_from_workspace(workspace: &Path, path: &str) -> Option<Vec<u8>> {
    fs::read(workspace.join(path)).ok()
}
289
290/// Reads a file from the staged blob store.
291fn read_file_from_staged(
292    void_dir: &Path,
293    staged_key: &SecretKey,
294    content_hash: &ContentHash,
295) -> Option<Vec<u8>> {
296    read_staged_blob(void_dir, staged_key, content_hash).ok()
297}
298
299// ---------------------------------------------------------------------------
300// Public entry points — one per diff mode
301// ---------------------------------------------------------------------------
302
303/// Content diff for index vs workspace (unstaged changes).
304///
305/// Old content: staged blob (by content hash from index).
306/// New content: workspace file on disk.
307pub fn content_diff_index(
308    tree_diff: &TreeDiff,
309    index: &WorkspaceIndex,
310    workspace: &Path,
311    void_dir: &Path,
312    staged_key: &SecretKey,
313) -> Result<Vec<ContentDiff>> {
314    // Build index lookup: path -> content_hash
315    let index_map: std::collections::HashMap<&str, ContentHash> = index
316        .iter()
317        .map(|e| (e.path.as_str(), e.content_hash))
318        .collect();
319
320    let diffs = tree_diff
321        .files
322        .iter()
323        .map(|file| {
324            let old_path = match &file.kind {
325                DiffKind::Renamed { from, .. } => from.as_str(),
326                _ => file.path.as_str(),
327            };
328
329            let old_content = index_map
330                .get(old_path)
331                .and_then(|hash| read_file_from_staged(void_dir, staged_key, hash));
332
333            let new_content = match &file.kind {
334                DiffKind::Deleted => None,
335                _ => read_file_from_workspace(workspace, &file.path),
336            };
337
338            diff_single_file(
339                &file.path,
340                file.kind.clone(),
341                old_content.as_deref(),
342                new_content.as_deref(),
343            )
344        })
345        .collect();
346
347    Ok(diffs)
348}
349
350/// Content diff for commit vs workspace (working tree changes).
351///
352/// Old content: committed file via manifest.
353/// New content: workspace file on disk.
354pub fn content_diff_working(
355    tree_diff: &TreeDiff,
356    ctx: &VoidContext,
357    commit_cid: &VoidCid,
358    workspace: &Path,
359) -> Result<Vec<ContentDiff>> {
360    let files = CommitFiles::load(ctx, commit_cid)?;
361
362    let diffs = tree_diff
363        .files
364        .iter()
365        .map(|file| {
366            let old_path = match &file.kind {
367                DiffKind::Renamed { from, .. } => from.as_str(),
368                _ => file.path.as_str(),
369            };
370
371            let old_content = files.read_file(ctx, old_path);
372
373            let new_content = match &file.kind {
374                DiffKind::Deleted => None,
375                _ => read_file_from_workspace(workspace, &file.path),
376            };
377
378            diff_single_file(
379                &file.path,
380                file.kind.clone(),
381                old_content.as_deref(),
382                new_content.as_deref(),
383            )
384        })
385        .collect();
386
387    Ok(diffs)
388}
389
390/// Content diff between two commits.
391///
392/// Old content: old commit's files via manifest.
393/// New content: new commit's files via manifest.
394pub fn content_diff_commits(
395    tree_diff: &TreeDiff,
396    ctx: &VoidContext,
397    old_cid: &VoidCid,
398    new_cid: &VoidCid,
399) -> Result<Vec<ContentDiff>> {
400    let old_files = CommitFiles::load(ctx, old_cid)?;
401    let new_files = CommitFiles::load(ctx, new_cid)?;
402
403    let diffs = tree_diff
404        .files
405        .iter()
406        .map(|file| {
407            let old_path = match &file.kind {
408                DiffKind::Renamed { from, .. } => from.as_str(),
409                _ => file.path.as_str(),
410            };
411
412            let old_content = old_files.read_file(ctx, old_path);
413
414            let new_content = match &file.kind {
415                DiffKind::Deleted => None,
416                _ => new_files.read_file(ctx, &file.path),
417            };
418
419            diff_single_file(
420                &file.path,
421                file.kind.clone(),
422                old_content.as_deref(),
423                new_content.as_deref(),
424            )
425        })
426        .collect();
427
428    Ok(diffs)
429}
430
431/// Content diff for staged changes (HEAD vs index).
432///
433/// Old content: HEAD commit's files via manifest.
434/// New content: staged blobs.
435pub fn content_diff_staged(
436    tree_diff: &TreeDiff,
437    ctx: &VoidContext,
438) -> Result<Vec<ContentDiff>> {
439    // Load HEAD commit for old content
440    let head_commit_cid = refs::resolve_head(&ctx.paths.void_dir)?;
441    let head_files = match head_commit_cid {
442        Some(commit_cid) => {
443            let head_cid = cid::from_bytes(commit_cid.as_bytes())?;
444            Some(CommitFiles::load(ctx, &head_cid)?)
445        }
446        None => None,
447    };
448
449    let staged_key = ctx.crypto.vault.staged_key()?;
450    let void_dir = ctx.paths.void_dir.as_std_path();
451
452    let diffs = tree_diff
453        .files
454        .iter()
455        .map(|file| {
456            let old_path = match &file.kind {
457                DiffKind::Renamed { from, .. } => from.as_str(),
458                _ => file.path.as_str(),
459            };
460
461            let old_content = head_files.as_ref().and_then(|f| f.read_file(ctx, old_path));
462
463            let new_content = match &file.kind {
464                DiffKind::Deleted => None,
465                _ => file
466                    .new_hash
467                    .as_ref()
468                    .and_then(|hash| read_file_from_staged(void_dir, staged_key, hash)),
469            };
470
471            diff_single_file(
472                &file.path,
473                file.kind.clone(),
474                old_content.as_deref(),
475                new_content.as_deref(),
476            )
477        })
478        .collect();
479
480    Ok(diffs)
481}
482
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `hunk` holds a line with exactly this tag and content.
    fn has_line(hunk: &Hunk, tag: char, content: &str) -> bool {
        hunk.lines
            .iter()
            .any(|line| line.tag == tag && line.content == content)
    }

    /// True when every line of `hunk` carries `tag`.
    fn all_tagged(hunk: &Hunk, tag: char) -> bool {
        hunk.lines.iter().all(|line| line.tag == tag)
    }

    #[test]
    fn binary_detection_null_byte() {
        assert_eq!(is_binary(b"hello\x00world"), true);
        assert_eq!(is_binary(b"hello world"), false);
        assert_eq!(is_binary(b""), false);
    }

    #[test]
    fn hunk_generation_simple_add() {
        let hunks = generate_hunks(
            "line1\nline2\nline3\n",
            "line1\nline2\nnew line\nline3\n",
        );

        assert_eq!(hunks.len(), 1);
        assert!(has_line(&hunks[0], '+', "new line"));
    }

    #[test]
    fn hunk_generation_simple_delete() {
        let hunks = generate_hunks("line1\nline2\nline3\n", "line1\nline3\n");

        assert_eq!(hunks.len(), 1);
        assert!(has_line(&hunks[0], '-', "line2"));
    }

    #[test]
    fn hunk_generation_modify() {
        let hunks = generate_hunks("aaa\nbbb\nccc\n", "aaa\nBBB\nccc\n");

        assert_eq!(hunks.len(), 1);
        assert!(has_line(&hunks[0], '-', "bbb"));
        assert!(has_line(&hunks[0], '+', "BBB"));
    }

    #[test]
    fn diff_single_file_binary_detected() {
        let before: &[u8] = b"hello\x00binary";
        let after: &[u8] = b"changed\x00binary";

        let diff = diff_single_file("test.bin", DiffKind::Modified, Some(before), Some(after));

        assert!(diff.binary);
        assert!(diff.hunks.is_empty());
    }

    #[test]
    fn diff_single_file_too_large() {
        let oversized = vec![b'x'; MAX_DIFF_SIZE + 1];
        let small: &[u8] = b"small";

        let diff = diff_single_file(
            "big.txt",
            DiffKind::Modified,
            Some(oversized.as_slice()),
            Some(small),
        );

        assert!(diff.too_large);
        assert!(diff.hunks.is_empty());
    }

    #[test]
    fn diff_single_file_added() {
        let after: &[u8] = b"line1\nline2\n";

        let diff = diff_single_file("new.txt", DiffKind::Added, None, Some(after));

        assert!(!diff.binary);
        assert!(!diff.too_large);
        assert_eq!(diff.hunks.len(), 1);
        assert!(all_tagged(&diff.hunks[0], '+'));
    }

    #[test]
    fn diff_single_file_deleted() {
        let before: &[u8] = b"line1\nline2\n";

        let diff = diff_single_file("old.txt", DiffKind::Deleted, Some(before), None);

        assert_eq!(diff.hunks.len(), 1);
        assert!(all_tagged(&diff.hunks[0], '-'));
    }

    #[test]
    fn empty_file_diff() {
        assert!(generate_hunks("", "").is_empty());
    }

    #[test]
    fn non_utf8_treated_as_binary() {
        // Invalid UTF-8 that contains no NUL byte, so only the UTF-8 check fires.
        let before: &[u8] = &[0xFF, 0xFE, 0x41, 0x42];
        let after: &[u8] = b"hello";

        let diff = diff_single_file("test.dat", DiffKind::Modified, Some(before), Some(after));

        assert!(diff.binary);
    }
}