// sequoia-git 0.5.0
//
// A tool for managing and enforcing a commit signing policy.
// (Documentation header.)
//! A set of uniformly distributed 32-byte values that can be
//! persisted.
//!
//! The version 0 file format is:
//!
//! | Offset | Description |
//! | -----: | ----------- |
//! |        | Header
//! |      0 | 15-byte magic value `b"StoredSortedSet"`
//! |     15 | Version (`0`)
//! |     16 | Context, 12 bytes of application-specific, opaque data
//! |        | Content
//! |     28 | Entry count - big endian 32-bit unsigned integer
//! |     32 | Entry 0 - big endian 32-byte unsigned integer
//! |     48 | Entry 1
//! |    ... | ...
//!
//! The entries are sorted, which allows doing an in-place binary
//! search.  They are interpreted as big endian 32-byte unsigned
//! integers.

use std::{
    collections::BTreeSet,
    io::{
        Seek,
        SeekFrom,
        Write,
    },
    path::Path,
};
use buffered_reader::{BufferedReader};

/// The size in bytes of one stored value.
const VALUE_BYTES: usize = 32;
/// A value stored in the set: `VALUE_BYTES` uniformly distributed bytes.
pub type Value = [u8; VALUE_BYTES];

/// The backing store reader; the file is buffered or `mmap`ed by
/// the `buffered_reader` crate.
type File = buffered_reader::File<'static, ()>;

/// A persisted sorted set of 32-byte values with an in-memory
/// overlay of pending insertions.
pub struct Set {
    // The file header (version, context, entry count).
    header: Header,
    // The on-disk, sorted entries.
    store: File,
    // Values inserted since the last `write`, not yet persisted.
    scratch: BTreeSet<Value>,
}

/// A set data type with 32-byte keys, which can be easily persisted.
///
/// The entire data structure is `mmap`ed or read into memory.
///
/// Currently, there is no way to remove entries.
impl Set {
    /// Returns the number of entries.
    ///
    /// Note: this may overestimate, because `insert` adds values to
    /// the overlay without checking whether they are already present
    /// in the stored set.
    #[allow(dead_code)]
    fn len(&self) -> usize {
        usize::try_from(self.header.entries).expect("representable")
            + self.scratch.len()
    }

    /// Returns `true` if the set contains an element equal to the value.
    pub fn contains(&mut self, value: &Value) -> Result<bool> {
        Ok(self.stored_values()?.binary_search(value).is_ok()
           || self.scratch.contains(value))
    }

    /// Adds a value to the set.
    ///
    /// The change only becomes persistent once `write` is called.
    pub fn insert(&mut self, value: Value) {
        // We insert it into our overlay without checking whether it
        // exists in the stored set to avoid the lookup overhead.  We
        // sort this out when persisting any changes to disk.
        self.scratch.insert(value);
    }

    /// Returns the sorted values stored on disk.
    fn stored_values(&mut self) -> Result<&[Value]> {
        let entries = self.header.entries as usize;
        let bytes = self.store.data_hard(entries * VALUE_BYTES)?;
        // SAFETY: `data_hard` returned at least `entries *
        // VALUE_BYTES` bytes.  `Value` is `[u8; VALUE_BYTES]`, which
        // has alignment 1 and no padding (asserted in `read`), so the
        // cast is valid for any byte pointer.
        unsafe {
            Ok(std::slice::from_raw_parts(bytes.as_ptr() as *const Value,
                                          entries))
        }
    }

    /// Reinterprets a slice of values as the underlying bytes.
    fn value_bytes(values: &[Value]) -> &[u8] {
        // SAFETY: `Value` is `[u8; VALUE_BYTES]` with alignment 1 and
        // no padding, so the slice's backing memory is exactly
        // `values.len() * VALUE_BYTES` initialized bytes.
        unsafe {
            std::slice::from_raw_parts(values.as_ptr() as *const u8,
                                       values.len() * VALUE_BYTES)
        }
    }

    /// Reads a stored set from `path`.
    ///
    /// If the file does not exist, an empty set is returned.
    /// `context` must be exactly `CONTEXT_BYTES` bytes long and must
    /// match the context recorded in the file; otherwise
    /// `Error::BadContext` is returned.
    pub fn read<P: AsRef<Path>>(path: P, context: &str) -> Result<Self> {
        // We are going to read an array of values into memory, and
        // use them as is.  Check that layout matches (i.e., that Rust
        // doesn't expect any padding).
        assert_eq!(VALUE_BYTES, std::mem::size_of::<Value>());
        assert_eq!(std::mem::size_of::<[Value; 2]>(),
                   2 * VALUE_BYTES,
                   "values are unpadded");

        let context: [u8; CONTEXT_BYTES] = context.as_bytes()
            .try_into()
            .map_err(|_| Error::BadContext)?;

        let (header, reader) = match File::open(path) {
            Ok(mut f) => {
                let header = Header::read(&mut f, context)?;
                (header, f)
            },
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                // No stored set yet: back the reader with an empty
                // temporary file and start with a fresh header.
                let t = tempfile::NamedTempFile::new()?;
                // XXX: Rather, we should be using t.reopen() here and
                // constructing a File from that:
                let f = File::open(t.path())?;
                (Header::new(context), f)
            },
            Err(e) => return Err(e.into()),
        };

        // XXX: check here if the number of entries is plausible by
        // looking at the file metadata.  This is currently not
        // possible to do in a race free manner.
        Ok(Set {
            header,
            store: reader,
            scratch: Default::default(),
        })
    }

    /// Persists the set to `path`.
    ///
    /// The merged set is written to a temporary file in the same
    /// directory, which is then atomically renamed into place.
    pub fn write<P: AsRef<Path>>(&mut self, path: P) -> Result<()> {
        // If we didn't change anything, we're done.
        if self.scratch.is_empty() {
            return Ok(());
        }

        let mut sink = tempfile::NamedTempFile::new_in(
            path.as_ref().parent().ok_or(Error::BadPath)?)?;

        // First update and write the header.
        let mut h = self.header.clone();
        h.entries = 0; // Will be fixed later.
        h.write(&mut sink)?;

        // Then, merge the two sets while writing them out.  Take the
        // overlay so that `stored_values` can mutably borrow `self`;
        // restore it before propagating any error, so that no pending
        // insertions are lost on failure.
        let scratch = std::mem::take(&mut self.scratch);
        let result = self.stored_values()
            .and_then(|stored| Self::merge(&mut sink, stored, &scratch));
        self.scratch = scratch;
        let entries = result?;

        // Finally, write the header again, this time with the correct
        // number of values.
        sink.as_file_mut().seek(SeekFrom::Start(0))?;
        h.entries = entries.try_into().map_err(|_| Error::TooManyEntries)?;
        h.write(&mut sink)?;
        sink.flush()?;

        sink.persist(path).map_err(|pe| pe.error)?;
        Ok(())
    }

    /// Writes the sorted union of `stored` and `scratch` to `sink`,
    /// returning the number of entries written.
    ///
    /// Both inputs are sorted; a value present in both is written
    /// only once.
    fn merge(sink: &mut dyn Write, mut stored: &[Value],
             scratch: &BTreeSet<Value>)
             -> Result<usize>
    {
        let mut entries = 0;

        for new in scratch.iter() {
            // All stored values strictly smaller than `new` go first.
            let p = stored.partition_point(|v| v < new);
            sink.write_all(Self::value_bytes(&stored[..p]))?;
            entries += p;

            // `new` is a duplicate iff the first stored value that is
            // not smaller than it is equal to it.  Only write it if
            // it is actually new; a stored duplicate is emitted with
            // a later chunk instead.
            if stored.get(p) != Some(new) {
                sink.write_all(new)?;
                entries += 1;
            }

            // Now advance the stored "iterator".
            stored = &stored[p..];
        }

        // Now write out the final chunk of stored values.
        sink.write_all(Self::value_bytes(stored))?;
        entries += stored.len();

        Ok(entries)
    }
}

/// The size in bytes of the application-specific context stored in
/// the header.
const CONTEXT_BYTES: usize = 12;

/// The on-disk header of a stored set.
#[derive(Debug, Clone)]
struct Header {
    // The file format version.
    version: u8,
    // Application-specific, opaque context data.
    context: [u8; CONTEXT_BYTES],
    // The number of entries stored in the file.
    entries: u32,
}

impl Header {
    /// The magic value identifying the file format.
    const MAGIC: &'static [u8; 15] = b"StoredSortedSet";

    /// The file format version we produce and accept.
    ///
    /// The module documentation describes the version 0 format; it is
    /// the only version defined so far.
    const VERSION: u8 = 0;

    /// Creates a header for an empty set with the given context.
    fn new(context: [u8; CONTEXT_BYTES]) -> Self {
        Header {
            version: Self::VERSION,
            context,
            entries: 0,
        }
    }

    /// Reads and validates a header.
    ///
    /// Checks the magic value, the version, and that the stored
    /// context matches `context`.
    fn read(reader: &mut File, context: [u8; CONTEXT_BYTES]) -> Result<Self> {
        let m = reader.data_consume_hard(Self::MAGIC.len())?;
        if &m[..Self::MAGIC.len()] != &Self::MAGIC[..] {
            return Err(Error::BadMagic);
        }
        let v = reader.data_consume_hard(1)?;
        let version = v[0];
        if version != Self::VERSION {
            return Err(Error::UnsupportedVersion(version));
        }

        let c = &reader.data_consume_hard(context.len())?[..context.len()];
        if &c[..] != &context[..] {
            return Err(Error::BadContext);
        }

        let e = &reader.data_consume_hard(4)?[..4];
        let entries =
            u32::from_be_bytes(e.try_into().expect("we read 4 bytes"));

        Ok(Header {
            version,
            context,
            entries,
        })
    }

    /// Serializes the header to `sink`.
    fn write(&self, sink: &mut dyn Write) -> Result<()> {
        sink.write_all(Self::MAGIC)?;
        sink.write_all(&[self.version])?;
        sink.write_all(&self.context)?;
        sink.write_all(&self.entries.to_be_bytes())?;
        Ok(())
    }
}

/// Errors for this crate.
#[derive(thiserror::Error, Debug)]
pub enum Error {
    #[error("Bad magic read from file")]
    BadMagic,
    #[error("Unsupported version: {0}")]
    UnsupportedVersion(u8),
    #[error("Bad context read from file")]
    BadContext,
    #[error("Too many entries")]
    TooManyEntries,
    #[error("Bad path")]
    BadPath,
    #[error("Io error")]
    Io(#[from] std::io::Error),
}

/// Result specialization.
pub type Result<T> = ::std::result::Result<T, Error>;