dson 0.1.1

A delta-state CRDT implementation
Documentation
// (c) Copyright 2025 Helsing GmbH. All rights reserved.
use rand::Rng as _;
use rand_distr::Distribution;
use std::{
    collections::BTreeSet,
    fs::File,
    io::{BufRead, BufReader, Write},
    ops::Bound,
    path::Path,
};

/// Increment to indicate that random_dots.rs generated by an older build.rs must be re-generated.
const GEN: usize = 1;

fn main() {
    println!("cargo:rustc-link-arg-benches=-rdynamic");
    println!("cargo:rerun-if-changed=build.rs");

    let gencheck = format!("// gen-version={GEN}");

    // generate sets of random Dots to use in benchmarks.
    // we generate this file rather than committing it so that the logic for generating it is
    // checked in somewhere and documented, which also means we can change it if we wish. the
    // variability of generated dots themselves should (hopefully) not influence the measurements
    // too much. and even if they do, that's also a valuable signal.
    let out_dir = std::env::var_os("OUT_DIR").unwrap();
    let dest_path = Path::new(&out_dir).join("random_dots.rs");

    // don't re-generate if it already exists to ensure benchmarks
    // of re-builds (e.g., tango) use the same data sets.
    // but _do_ generate if build.rs has changed its code version.
    if dest_path.exists() {
        if BufReader::new(File::open(&dest_path).unwrap())
            .lines()
            .next()
            .unwrap()
            .unwrap()
            == gencheck
        {
            eprintln!("existing random_dots.rs is new enough");
            return;
        } else {
            eprintln!("existing random_dots.rs is outdated");
        }
    } else {
        eprintln!("no existing random_dots.rs, so generating");
    }

    let mut out = File::create(dest_path).unwrap();
    writeln!(out, "{gencheck}").unwrap();

    let mut rng = rand::rng();
    // NOTE: 2.2 is arbitrarily chosen here as a "quite skewed" distribution, meaning
    // we'll get nearly all dots near the beginning of the sequence (i.e., consecutive dots),
    // and only a few sparse dots in the tail (i.e., in the dot cloud).
    let zipf = rand_distr::Zipf::new(TOP as f64, 2.2).unwrap();
    const TOP: usize = 8192;
    const IDS: usize = 8;
    for i in [1, 2] {
        let mut all = BTreeSet::new();
        for id in 1..=IDS {
            // higher ids have fewer holes because we sample more
            for _ in 0..(id * 4 * TOP) {
                let sample = zipf.sample(&mut rng) as usize;
                all.insert((id, sample));
            }
        }
        writeln!(out, "#[allow(dead_code)]").unwrap();
        writeln!(out, "pub(crate) const BIG{i}: &[Dot] = &[").unwrap();
        for &(id, seq) in &all {
            writeln!(out, "Dot::mint(Identifier::new({id}, 0), {seq}),").unwrap();
        }
        writeln!(out, "];").unwrap();
        let mut in_small = BTreeSet::new();
        // small only has new dots from a subset of the IDs, as that's more common
        // (this goes to 3, not IDS)
        for id in 1..=3 {
            // one sender has a fair number of new dots, one has some, one has few
            let adj = match id {
                1 => 1.0,
                2 => 0.5,
                3 => 0.1,
                _ => unreachable!(),
            };
            let max = all
                .range((Bound::Included((id, 1)), Bound::Included((id, usize::MAX))))
                .next_back()
                .map(|(_, seq)| seq)
                .copied()
                .unwrap_or(1);
            for seq in 1..=TOP {
                if all.contains(&(id, seq)) {
                    // small should contain a few dots (~5) that are in BIG
                    if rng.random_bool(adj * 10.0 / TOP as f64) {
                        in_small.insert((id, seq));
                    }
                } else {
                    // some that fill holes (preferring early ones)
                    // NOTE: the * 2.0 in there is because we're iterating through all the
                    // seqs, so the last value will average to 0.5, which would otherwise
                    // effectively halve the n/TOP value.
                    if seq < max {
                        if rng.random_bool(
                            adj * (20.0 / TOP as f64) * 2.0 * (1.0 - seq as f64 / max as f64),
                        ) {
                            in_small.insert((id, seq));
                        }
                    } else {
                        // and a small amount that extend the max
                        // biased towards those close to max
                        if rng.random_bool(
                            adj * (5.0 / TOP as f64)
                                * 2.0
                                * (1.0 - (seq - max) as f64 / (TOP - max) as f64),
                        ) {
                            in_small.insert((id, seq));
                        }
                    }
                }
            }
        }
        writeln!(out, "#[allow(dead_code)]").unwrap();
        writeln!(out, "pub(crate) const SMALL{i}: &[Dot] = &[").unwrap();
        for &(id, seq) in &in_small {
            writeln!(out, "Dot::mint(Identifier::new({id}, 0), {seq}),").unwrap();
        }
        writeln!(out, "];").unwrap();
    }
    out.flush().unwrap();
}