delta-compression 0.5.0

Binary delta compression — diff and apply for byte sequences, with rolling-hash matching, splay-tree indexing, and in-place patching.
Documentation
use crate::types::{Command, DeltaError, PlacedCommand};

/// Compute the total output size of algorithm commands.
pub fn output_size(commands: &[Command]) -> usize {
    commands
        .iter()
        .map(|cmd| match cmd {
            Command::Copy { length, .. } => *length,
            Command::Add { data } => data.len(),
        })
        .sum()
}

/// Convert algorithm output to placed commands with sequential destinations.
///
/// Consumes the command list so that `Add` payloads are moved, not copied.
pub fn place_commands(commands: Vec<Command>) -> Vec<PlacedCommand> {
    let mut placed = Vec::with_capacity(commands.len());
    let mut dst = 0;
    for cmd in commands {
        match cmd {
            Command::Copy { offset, length } => {
                placed.push(PlacedCommand::Copy { src: offset, dst, length });
                dst += length;
            }
            Command::Add { data } => {
                let len = data.len();
                placed.push(PlacedCommand::Add { dst, data }); // move, no clone
                dst += len;
            }
        }
    }
    placed
}

/// Convert placed commands back to algorithm commands (strip destinations).
///
/// Commands are sorted by destination offset to recover original sequential
/// order, then each PlacedCopy/PlacedAdd is converted to Copy/Add.
///
/// Consumes the placed list so that `Add` payloads are moved, not copied.
///
/// # Panics
///
/// Panics if any `PlacedCommand::Move` is present.  Move commands are DLT\x04-only
/// and have no algorithm-level equivalent; they must not be unplaced.
pub fn unplace_commands(mut placed: Vec<PlacedCommand>) -> Vec<Command> {
    placed.sort_by_key(|c| match c {
        PlacedCommand::Copy { dst, .. } => *dst,
        PlacedCommand::Add  { dst, .. } => *dst,
        PlacedCommand::Move { dst, .. } => *dst,
    });
    placed
        .into_iter()
        .map(|c| match c {
            PlacedCommand::Copy { src, length, .. } => Command::Copy { offset: src, length },
            PlacedCommand::Add  { data, .. }        => Command::Add { data },
            PlacedCommand::Move { .. } => panic!(
                "unplace_commands: Move has no algorithm-level equivalent; \
                 Move commands are DLT\\x04-only and cannot be unplaced"
            ),
        })
        .collect()
}

/// Apply placed commands in standard mode: read from R, write to out.
///
/// Returns the number of bytes written.
pub fn apply_placed_to(r: &[u8], commands: &[PlacedCommand], out: &mut [u8]) -> usize {
    let mut max_written = 0;
    for cmd in commands {
        match cmd {
            PlacedCommand::Copy { src, dst, length } => {
                out[*dst..*dst + *length].copy_from_slice(&r[*src..*src + *length]);
                let end = dst + length;
                if end > max_written { max_written = end; }
            }
            PlacedCommand::Add { dst, data } => {
                out[*dst..*dst + data.len()].copy_from_slice(data);
                let end = dst + data.len();
                if end > max_written { max_written = end; }
            }
            PlacedCommand::Move { src, dst, length } => {
                // LZ77-style self-referential copy: reads from already-written output.
                // Encoder invariant: this command must appear after all commands that
                // write [src, src+length); validate_placed_commands checks src+length<=dst
                // (necessary) but not execution ordering (encoder's responsibility).
                out.copy_within(*src..*src + *length, *dst);
                let end = dst + length;
                if end > max_written { max_written = end; }
            }
        }
    }
    max_written
}

/// Apply placed commands in-place within a single buffer.
///
/// Uses `copy_within` (maps to libc `memmove`) so overlapping src/dst is safe.
pub fn apply_placed_inplace_to(commands: &[PlacedCommand], buf: &mut [u8]) {
    for cmd in commands {
        match cmd {
            PlacedCommand::Copy { src, dst, length } => {
                buf.copy_within(*src..*src + *length, *dst);
            }
            PlacedCommand::Add { dst, data } => {
                buf[*dst..*dst + data.len()].copy_from_slice(data);
            }
            PlacedCommand::Move { src, dst, length } => {
                // Same as Copy in the inplace buffer; copy_within handles overlaps.
                buf.copy_within(*src..*src + *length, *dst);
            }
        }
    }
}

/// Validate placed commands before apply so malformed deltas fail cleanly.
pub fn validate_placed_commands(
    commands: &[PlacedCommand],
    reference_size: usize,
    version_size: usize,
    inplace: bool,
) -> Result<(), DeltaError> {
    let source_limit = if inplace {
        reference_size.max(version_size)
    } else {
        reference_size
    };
    for cmd in commands {
        match cmd {
            PlacedCommand::Copy { src, dst, length } => {
                validate_apply_range(*dst, *length, version_size, "copy destination")?;
                validate_apply_range(*src, *length, source_limit, "copy source")?;
            }
            PlacedCommand::Add { dst, data } => {
                validate_apply_range(*dst, data.len(), version_size, "add destination")?;
            }
            PlacedCommand::Move { src, dst, length } => {
                validate_apply_range(*dst, *length, version_size, "move destination")?;
                validate_apply_range(*src, *length, version_size, "move source")?;
                // Necessary geometric constraint: src + length <= dst.
                // This prevents self-referential loops but does NOT prove the src
                // region has been written; that is the encoder's responsibility
                // (Move commands must follow all commands writing their src region).
                if src + length > *dst {
                    return Err(DeltaError::InvalidFormat(format!(
                        "move src+len ({}+{}) > dst ({}): source not yet written",
                        src, length, dst
                    )));
                }
            }
        }
    }
    Ok(())
}

// ── convenience wrappers (Command → output) ─────────────────────────────

/// Apply algorithm commands, writing into a pre-allocated buffer.
///
/// Returns the number of bytes written.
pub fn apply_delta_to(r: &[u8], commands: &[Command], out: &mut [u8]) -> usize {
    let mut pos = 0;
    for cmd in commands {
        match cmd {
            Command::Add { data } => {
                out[pos..pos + data.len()].copy_from_slice(data);
                pos += data.len();
            }
            Command::Copy { offset, length } => {
                out[pos..pos + *length].copy_from_slice(&r[*offset..*offset + *length]);
                pos += *length;
            }
        }
    }
    pos
}

/// Reconstruct the version from reference + algorithm commands.
pub fn apply_delta(r: &[u8], commands: &[Command]) -> Vec<u8> {
    let mut out = vec![0u8; output_size(commands)];
    apply_delta_to(r, commands, &mut out);
    out
}

/// Apply placed in-place commands to a buffer initialized with R.
pub fn apply_delta_inplace(
    r: &[u8],
    commands: &[PlacedCommand],
    version_size: usize,
) -> Vec<u8> {
    let buf_size = r.len().max(version_size);
    let mut buf = vec![0u8; buf_size];
    buf[..r.len()].copy_from_slice(r);
    apply_placed_inplace_to(commands, &mut buf);
    buf.truncate(version_size);
    buf
}

fn validate_apply_range(
    start: usize,
    length: usize,
    limit: usize,
    name: &str,
) -> Result<(), DeltaError> {
    if start > limit || length > limit.saturating_sub(start) {
        return Err(DeltaError::InvalidFormat(format!("{} out of range", name)));
    }
    Ok(())
}