cordoba 0.3.1

Zero-copy and mmapable CDB access library with a Python interface
Documentation
use std::mem;
use std::collections::BTreeSet;
use std::io::{Seek, SeekFrom, Write};

use super::*;

#[derive(Copy, Clone, Debug)]
struct HashPos(CDBHash, u32);

impl HashPos {
    #[inline]
    fn distance(&self, tlen: usize, pos: usize) -> usize {
        let startslot = self.0.slot(tlen);
        pos.checked_sub(startslot).unwrap_or_else(|| pos + tlen - startslot)
    }
}

const FILLFACTOR: usize = 2;

pub struct CDBWriter<T> {
    file: T,
    pos: u64,
    tables: Vec<Vec<HashPos>>,
    header: [PosLen; ENTRIES],
}

impl<T> CDBWriter<T>
where
    T: Write + Seek,
{
    pub fn new(mut file: T) -> Result<Self, std::io::Error> {
        let pos = (ENTRIES * PAIR_SIZE) as u64;
        let mut tables = Vec::with_capacity(ENTRIES);
        file.seek(SeekFrom::Start(pos))?;

        for _ in 0..ENTRIES {
            tables.push(Vec::new());
        }
        Ok(CDBWriter {
            file,
            pos,
            tables,
            header: [PosLen { pos: 0, len: 0 }; ENTRIES],
        })
    }

    fn write_kv(&mut self, k: &[u8], v: &[u8]) -> Result<(), std::io::Error> {
        self.file.write_all(&(k.len() as u32).to_le_bytes())?;
        self.file.write_all(&(v.len() as u32).to_le_bytes())?;
        self.file.write_all(k)?;
        self.file.write_all(v)?;

        self.pos += (PAIR_SIZE + k.len() + v.len()) as u64;

        Ok(())
    }

    pub fn write(&mut self, k: &[u8], v: &[u8]) -> Result<(), std::io::Error> {
        let hash = CDBHash::new(k);
        let tableidx = hash.table();

        self.tables[tableidx].push(HashPos(hash, self.pos as u32));

        self.write_kv(k, v)?;

        Ok(())
    }

    fn write_header(&mut self) -> Result<(), std::io::Error> {
        self.file.seek(SeekFrom::Start(0))?;

        for header in self.header.iter() {
            self.file.write_all(&(header.pos as u32).to_le_bytes())?;
            self.file.write_all(&(header.len as u32).to_le_bytes())?;
        }

        Ok(())
    }

    fn finish_generic<F>(mut self, fill: F) -> Result<(), std::io::Error>
    where
        F: Fn(&[HashPos], &mut Vec<HashPos>),
    {
        let mut tout = Vec::new();

        for (i, table) in self.tables.iter().enumerate() {
            fill(&table, &mut tout);
            self.header[i] = PosLen {
                pos: self.pos as usize,
                len: tout.len(),
            };
            for row in &tout {
                let hash : u32 = row.0.into();
                self.file.write_all(&hash.to_le_bytes())?;
                self.file.write_all(&row.1.to_le_bytes())?;
            }
            self.pos += (PAIR_SIZE * tout.len()) as u64;
        }
        self.write_header()?;
        self.file.flush()?;

        Ok(())
    }

    pub fn finish(self) -> Result<(), std::io::Error> {
        self.finish_robinhood()
    }

    pub fn finish_naive(self) -> Result<(), std::io::Error> {
        self.finish_generic(fill_table_naive)
    }

    pub fn finish_btree(self) -> Result<(), std::io::Error> {
        self.finish_generic(fill_table_btree)
    }

    pub fn finish_robinhood(self) -> Result<(), std::io::Error> {
        self.finish_generic(fill_table_robinhood)
    }

    pub fn into_file(self) -> T {
        self.file
    }

    pub fn get_file(&self) -> &T {
        &self.file
    }
}

fn fill_table_naive(input: &[HashPos], output: &mut Vec<HashPos>) {
    let tlen = input.len() * FILLFACTOR;
    output.clear();
    output.resize(tlen, HashPos(CDBHash(0), 0));

    for hp in input {
        let (left, right) = output.split_at_mut(hp.0.slot(tlen));

        for slot in right.iter_mut().chain(left.iter_mut()) {
            if slot.1 == 0 {
                *slot = *hp;
                break;
            }
        }
    }
}

fn fill_table_btree(input: &[HashPos], output: &mut Vec<HashPos>) {
    let mut cache = BTreeSet::new();
    let tlen = input.len() * FILLFACTOR;
    output.clear();
    output.resize(tlen, HashPos(CDBHash(0), 0));

    cache.extend(0..tlen);

    for hp in input {
        let startpos = hp.0.slot(tlen);
        let idx = *cache
            .range(startpos..)
            .chain(cache.range(0..startpos))
            .nth(0)
            .unwrap();
        cache.take(&idx);

        debug_assert_eq!(output[idx].1, 0);
        output[idx] = *hp;
    }
}

fn fill_table_robinhood(input: &[HashPos], output: &mut Vec<HashPos>) {
    let tlen = input.len() * FILLFACTOR;
    output.clear();
    output.resize(tlen, HashPos(CDBHash(0), 0));

    for mut hp in input.iter().cloned() {
        let startslot = hp.0.slot(tlen);
        let (left, right) = output.split_at_mut(startslot);
        let mut slotnum = startslot;
        let mut distance = 0;

        for slot in right.iter_mut().chain(left.iter_mut()) {
            if slot.1 == 0 {
                *slot = hp;
                break;
            } else {
                if slot.distance(tlen, slotnum) < distance {
                    mem::swap(slot, &mut hp);
                    distance = hp.distance(tlen, slotnum);
                }
            }
            distance += 1;
            slotnum = (slotnum + 1) % tlen;
        }
    }
}