#![doc = include_str!("../README.md")]
#[cfg(feature = "cmph-sys")] mod cmph;
use builder::TypeToQuery;
#[cfg(feature = "cmph-sys")] use cmph::chd_benchmark;
mod builder;
pub use builder::MPHFBuilder;
mod stats;
use ph::phast::compressed_array::{CompactFast, LeastSquares, LinearRegressionArray, Simple};
use ph::phast::{bits_per_seed_to_100_bucket_size, DefaultCompressedArray, SeedOnly, ShiftOnly, ShiftOnlyWrapped, ShiftSeedWrapped};
pub use stats::{SearchStats, BuildStats, BenchmarkResult, file, print_input_stats};
mod inout;
use inout::{gen_data, RandomStrings, RawLines};
#[cfg(feature = "fmph")] mod fmph;
#[cfg(feature = "fmph")] use fmph::{fmph_benchmark, fmphgo_benchmark_all, fmphgo_run, FMPHGOBuildParams, FMPHGO_HEADER};
mod phast;
use phast::phast_benchmark;
#[cfg(feature = "ptr_hash")] mod ptrhash;
#[cfg(feature = "ptr_hash")] use ptrhash::ptrhash_benchmark;
use butils::{XorShift32, XorShift64};
use clap::{Parser, ValueEnum, Subcommand, Args};
use std::hash::Hash;
use std::fmt::Debug;
use rayon::current_num_threads;
/// Hasher that `run` uses for the integer key sources (`xs32`, `xs64`); backed by `fxhash` when that feature is enabled.
#[cfg(feature = "fxhash")] type IntHasher = ph::Seedable<fxhash::FxBuildHasher>;
/// Hasher that `run` uses for the integer key sources (`xs32`, `xs64`) when the `fxhash` feature is disabled.
#[cfg(not(feature = "fxhash"))] type IntHasher = ph::BuildDefaultSeededHasher;
/// Hasher that `run` uses for every key source other than `xs32`/`xs64`.
type StrHasher = ph::BuildDefaultSeededHasher;
// Value of the `key_access` option of the FMPH and FMPHGO methods.
// NOTE: plain `//` comments are used on purpose; clap's derive macros would turn
// `///` doc comments into command-line help text.
// NOTE(review): the exact meaning of each variant is defined by the `fmph` module — confirm there.
#[derive(ValueEnum, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum KeyAccess {
    // Always available; the default of both `FMPHConf` and `FMPHGOConf`.
    Indices8,

    // Only compiled in with the `fmph-key-access` feature.
    #[cfg(feature = "fmph-key-access")]
    Indices16,

    // Only compiled in with the `fmph-key-access` feature.
    #[cfg(feature = "fmph-key-access")]
    Copy,
}
// Threading modes selectable with the `threads` option.
// The discriminants form a two-bit mask, so `Both` is the union of the other two.
// NOTE: plain `//` comments are used on purpose; clap's derive macros would turn
// `///` doc comments into command-line help text.
#[derive(ValueEnum, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Threads {
    Single = 0b01,
    Multi = 0b10,
    Both = 0b11,
}
// Command-line options of the `FMPH` method (`Method::FMPH`); `run` hands the whole struct to `fmph_benchmark`.
// NOTE: plain `//` comments are used in this struct on purpose; clap's derive macros would turn
// `///` doc comments into command-line help text.
#[cfg(feature = "fmph")]
#[allow(non_camel_case_types)]
#[derive(Args)]
pub struct FMPHConf {
// Optional level size (`-l`); `run` additionally passes it to `fmph_benchmark` as a separate argument.
// NOTE(review): the unit/scale of the value is not visible in this file — confirm in the `fmph` module.
#[arg(short='l', long)]
pub level_size: Option<u16>,
// Cache threshold (`-c`), `usize::MAX` by default.
// NOTE(review): presumably the same "hash caching threshold" that `run` prints for FMPHGO — confirm.
#[arg(short='c', long, default_value_t = usize::MAX)]
pub cache_threshold: usize,
// Key access mode (`-a`), `KeyAccess::Indices8` by default.
#[arg(value_enum, short='a', long, default_value_t = KeyAccess::Indices8)]
pub key_access: KeyAccess,
}
// Command-line options of the `FMPHGO` method (`Method::FMPHGO`).
// Gated on the `fmph` feature exactly like `FMPHConf` and like the only variant that uses it.
// NOTE: plain `//` comments are used in this struct on purpose; clap's derive macros would turn
// `///` doc comments into command-line help text.
#[cfg(feature = "fmph")]
#[allow(non_camel_case_types)]
#[derive(Args)]
pub struct FMPHGOConf {
    // Bits per group seed (`-s`), accepted range 1..16 (16 excluded).
    // When only one of `bits_per_group_seed`/`group_size` is given, `run` tries to deduce the other.
    #[arg(short='s', long, value_parser = clap::value_parser!(u8).range(1..16))]
    pub bits_per_group_seed: Option<u8>,

    // Group size (`-b`), accepted range 1..63 (63 excluded).
    #[arg(short='b', long, value_parser = clap::value_parser!(u8).range(1..63))]
    pub group_size: Option<u8>,

    // Optional level size (`-l`); `run` stores it (0 when absent) in
    // `FMPHGOBuildParams::relative_level_size`.
    #[arg(short='l', long)]
    pub level_size: Option<u16>,

    // Hash caching threshold (`-c`), `usize::MAX` by default; printed by `run` and copied to
    // `FMPHGOBuildParams::cache_threshold`.
    #[arg(short='c', long, default_value_t = usize::MAX)]
    pub cache_threshold: usize,

    // Key access mode (`-a`), `KeyAccess::Indices8` by default.
    #[arg(value_enum, short='a', long, default_value_t = KeyAccess::Indices8)]
    pub key_access: KeyAccess,
}
// Command-line options shared by all PHast-based methods (`phast`, `plus`, `plus?wrap`, `plusshift?`).
// The four boolean flags select which array encoders `run` benchmarks; if none is set,
// `PHastConf::elias_fano` makes Elias-Fano the default.
// NOTE: plain `//` comments are used in this struct on purpose; clap's derive macros would turn
// `///` doc comments into command-line help text.
#[derive(Args)]
pub struct PHastConf {
// First positional argument: bits per seed, 8 by default, accepted range 1..16 (16 excluded).
#[arg(default_value_t = 8, value_parser = clap::value_parser!(u8).range(1..16))]
pub bits_per_seed: u8,
// Second, optional positional argument: bucket size; when absent `PHastConf::bucket_size`
// derives it from `bits_per_seed` via `bits_per_seed_to_100_bucket_size`.
// NOTE(review): the CSV header and helper name suggest the value is the bucket size times 100 — confirm.
#[arg()]
pub bucket_size: Option<u16>,
// `-e`/`--ef`: benchmark the encoder `run` labels "EF" (`DefaultCompressedArray`).
#[arg(short='e', long="ef", default_value_t = false)]
pub elias_fano: bool,
// `-c`: benchmark the encoder `run` labels "C" (`CompactFast`).
#[arg(short='c', long, default_value_t = false)]
pub compact: bool,
// `-l`/`--ls`: benchmark the encoder `run` labels "LSimp" (`LinearRegressionArray<Simple>`).
#[arg(short='l', long="ls", default_value_t = false)]
pub linear_simple: bool,
// `-s` (short form only): benchmark the encoder `run` labels "LSqr" (`LinearRegressionArray<LeastSquares>`).
#[arg(short='s', default_value_t = false)]
pub least_squares: bool
}
impl PHastConf {
    /// Returns the bucket size to use: the explicitly given one, or otherwise the value that
    /// `bits_per_seed_to_100_bucket_size` yields for the configured `bits_per_seed`.
    fn bucket_size(&self) -> u16 {
        match self.bucket_size {
            Some(explicit) => explicit,
            None => bits_per_seed_to_100_bucket_size(self.bits_per_seed),
        }
    }

    /// Tells whether the Elias-Fano encoder should be benchmarked: either it was requested
    /// explicitly, or no encoder at all was requested, in which case it serves as the default.
    fn elias_fano(&self) -> bool {
        if self.elias_fano {
            return true;
        }
        !self.compact && !self.linear_simple && !self.least_squares
    }
}
// Benchmarked method, chosen as the command-line subcommand (see `Conf::method`) and dispatched by `run`.
// Several variants are deliberately lower-case, hence the `non_camel_case_types` allowance.
// NOTE: plain `//` comments are used in this enum on purpose; clap's derive macros would turn
// `///` doc comments into command-line help text.
#[allow(non_camel_case_types)]
#[derive(Subcommand)]
pub enum Method {
// Handled by `fmphgo_benchmark_all`, always with `KeyAccess::Indices8`.
#[cfg(feature = "fmph")]
FMPHGO_all,
// Handled by `fmphgo_run` for one or several (bits per group seed, group size) pairs.
#[cfg(feature = "fmph")]
FMPHGO(FMPHGOConf),
// Handled by `fmph_benchmark` with the given configuration.
#[cfg(feature = "fmph")]
FMPH(FMPHConf),
// PHast variants; `run` benchmarks them with `SeedOnly` (`phast`), `ShiftOnly` (`plus`)
// and `ShiftOnlyWrapped` (`plus?wrap`) respectively.
phast(PHastConf),
plus(PHastConf),
plus1wrap(PHastConf),
plus2wrap(PHastConf),
plus3wrap(PHastConf),
// PHast variants benchmarked with `ShiftSeedWrapped::<1|2|3>(bits_per_shift)`.
// `bits_per_shift` is a positional argument placed before the flattened `PHastConf` arguments;
// `run` reports `bits_per_seed - bits_per_shift` next to it.
plusshift1 {
bits_per_shift: u8,
#[command(flatten)] phast_conf: PHastConf
},
plusshift2 {
bits_per_shift: u8,
#[command(flatten)] phast_conf: PHastConf
},
plusshift3 {
bits_per_shift: u8,
#[command(flatten)] phast_conf: PHastConf
},
// Handled by `fmph_benchmark` called without an FMPH configuration (`None`).
#[cfg(feature = "boomphf")]
Boomphf {
#[arg(short='l', long)]
level_size: Option<u16>
},
// Handled by `chd_benchmark`; without `-l`, `run` tries all lambdas from 1 to 6.
// The accepted range of an explicit lambda is 1..32 (32 excluded).
#[cfg(feature = "cmph-sys")] CHD {
#[arg(short='l', long, value_parser = clap::value_parser!(u8).range(1..32))]
lambda: Option<u8>
},
// Handled by `ptrhash_benchmark`; `speed` is a positional argument in 0..=2, 1 by default.
#[cfg(feature = "ptr_hash")]
PtrHash {
#[arg(default_value_t = 1, value_parser = clap::value_parser!(u8).range(0..=2))]
speed: u8
},
// No method: `run` does nothing (the input is still read/generated by `main`).
None
}
// Where the keys come from (the `key_source` option); interpreted by `main`.
// The variants are lower-case on purpose, hence the `non_camel_case_types` allowance.
// NOTE: plain `//` comments are used on purpose; clap's derive macros would turn
// `///` doc comments into command-line help text.
#[allow(non_camel_case_types)]
#[derive(ValueEnum, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum KeySource {
    // Keys produced by a `XorShift32` generator seeded with the `seed` option.
    xs32,
    // Keys produced by a `XorShift64` generator seeded with the `seed` option.
    xs64,
    // Keys read from the standard input, separated by newlines.
    stdin,
    // Keys read from the standard input, separated by zeros.
    stdinz,
    // Keys produced by `RandomStrings::new(10..50, seed)`.
    randstr,
}
// Top-level command-line configuration, parsed in `main` and passed to `run` and on to the benchmark routines.
// NOTE: plain `//` comments are used in this struct on purpose; clap's derive macros would turn
// `///` doc comments into command-line help text.
#[derive(Parser)]
#[command(author="Piotr Beling", version, about, long_about = None)]
pub struct Conf {
// Method to benchmark, given as the subcommand.
#[command(subcommand)]
pub method: Method,
// `-l`: number of runs over which lookup times are averaged (as reported by `main`); 1 by default.
#[arg(short='l', long, default_value_t = 1)]
pub lookup_runs: u32,
// `-b`: number of runs over which build times are averaged (as reported by `main`); at least 1, 1 by default.
#[arg(short='b', long, default_value_t = 1, value_parser = clap::value_parser!(u32).range(1..))]
pub build_runs: u32,
// `-v`: not read in this file.
// NOTE(review): presumably enables verification of the constructed functions — confirm in the benchmark modules.
#[arg(short='v', long, default_value_t = false)]
pub verify: bool,
// `-s`: source of the keys, the standard input by default.
#[arg(short='s', long, value_enum, default_value_t = KeySource::stdin)]
pub key_source: KeySource,
// `-n`: number of keys passed to `gen_data`. For the stdin sources, omitting it makes `main`
// use all input records as keys, with an empty foreign key set.
#[arg(short='n', long)]
pub keys_num: Option<usize>,
// `-f`: number of foreign keys passed to `gen_data`; 0 by default.
#[arg(short='f', long, default_value_t = 0)]
pub foreign_keys_num: usize,
// `-t`: threading mode(s), `Threads::Both` by default.
#[arg(short='t', long, value_enum, default_value_t = Threads::Both)]
pub threads: Threads,
// `-d`: not read in this file.
// NOTE(review): presumably makes `file` save detailed results — confirm in the `stats` module.
#[arg(short='d', long, default_value_t = false)]
pub save_details: bool,
// Seed of the key generators (`xs32`, `xs64`, `randstr`); must be at least 1, 1234 by default.
#[arg(long, default_value_t = 1234, value_parser = clap::value_parser!(u64).range(1..))]
pub seed: u64,
// `-c`: not read in this file; 200 by default.
// NOTE(review): meaning and unit are defined elsewhere — confirm in the benchmark modules.
#[arg(short='c', long, default_value_t = 200)]
pub cooling: u16,
}
/// Bundle of all the bounds a key type must satisfy to be benchmarked by `run`.
/// This variant (built with the `cmph-sys` feature) additionally requires `cmph::CMPHSource`.
#[cfg(feature = "cmph-sys")]
trait CanBeKey: Hash + Sync + Send + Clone + Debug + Default + cmph::CMPHSource + TypeToQuery {}

/// Every type that fulfils the bounds is automatically a `CanBeKey`.
#[cfg(feature = "cmph-sys")]
impl<T> CanBeKey for T
where
    T: Hash + Sync + Send + Clone + Debug + Default + cmph::CMPHSource + TypeToQuery,
{
}

/// Bundle of all the bounds a key type must satisfy to be benchmarked by `run`
/// (variant built without the `cmph-sys` feature).
#[cfg(not(feature = "cmph-sys"))]
trait CanBeKey: Hash + Sync + Send + Clone + Debug + Default + TypeToQuery {}

/// Every type that fulfils the bounds is automatically a `CanBeKey`.
#[cfg(not(feature = "cmph-sys"))]
impl<T> CanBeKey for T
where
    T: Hash + Sync + Send + Clone + Debug + Default + TypeToQuery,
{
}
fn run<K: CanBeKey>(conf: &Conf, i: &(Vec<K>, Vec<K>)) {
match conf.method {
#[cfg(feature = "fmph")] Method::FMPHGO_all =>
fmphgo_benchmark_all(file("FMPHGO_all", &conf, i.0.len(), i.1.len(), FMPHGO_HEADER),
&i, &conf, KeyAccess::Indices8),
#[cfg(feature = "fmph")] Method::FMPHGO(ref fmphgo_conf) => {
let mut file = file("FMPHGO", &conf, i.0.len(), i.1.len(), FMPHGO_HEADER);
println!("FMPHGO hash caching threshold={}: s b gamma results...", fmphgo_conf.cache_threshold);
let mut p = FMPHGOBuildParams {
relative_level_size: fmphgo_conf.level_size.unwrap_or(0),
cache_threshold: fmphgo_conf.cache_threshold,
key_access: fmphgo_conf.key_access,
};
match (fmphgo_conf.bits_per_group_seed, fmphgo_conf.group_size) {
(None, None) => {
for (bits_per_group_seed, bits_per_group) in [(1, 8), (2, 16), (4, 16), (8, 32)] {
fmphgo_run(&mut file, i, conf, bits_per_group_seed, bits_per_group, &mut p);
}
},
(Some(bits_per_group_seed), Some(bits_per_group)) => fmphgo_run(&mut file, i, conf, bits_per_group_seed, bits_per_group, &mut p),
(Some(1), None) | (None, Some(8)) => fmphgo_run(&mut file, i, conf, 1, 8, &mut p),
(Some(2), None) => fmphgo_run(&mut file, i, conf, 2, 16, &mut p),
(Some(4), None) => fmphgo_run(&mut file, i, conf, 4, 16, &mut p),
(None, Some(16)) => {
fmphgo_run(&mut file, i, conf, 2, 16, &mut p);
fmphgo_run(&mut file, i, conf, 4, 16, &mut p);
}
(Some(8), None) | (None, Some(32)) => fmphgo_run(&mut file, i, conf, 8, 32, &mut p),
_ => eprintln!("Cannot deduce for which pairs of (bits per group seed, group size) calculate.")
}
}
#[cfg(feature = "fmph")] Method::FMPH(ref fmph_conf) => {
match conf.key_source {
KeySource::xs32 | KeySource::xs64 => fmph_benchmark(i, conf, fmph_conf.level_size, Some((IntHasher::default(), fmph_conf))),
_ => fmph_benchmark(i, conf, fmph_conf.level_size, Some((StrHasher::default(), fmph_conf)))
}
},
Method::phast(ref phast_conf) => {
println!("PHast {} {}: encoder results...", phast_conf.bits_per_seed, phast_conf.bucket_size());
let mut csv_file = file("phast", &conf, i.0.len(), i.1.len(), "bits_per_seed bucket_size100 encoder");
if phast_conf.elias_fano() {
phast_benchmark::<DefaultCompressedArray, _, _>(&mut csv_file, i, conf, SeedOnly, phast_conf, "EF");
}
if phast_conf.compact {
phast_benchmark::<CompactFast, _, _>(&mut csv_file, i, conf, SeedOnly, phast_conf, "C");
}
if phast_conf.linear_simple {
phast_benchmark::<LinearRegressionArray<Simple>, _, _>(&mut csv_file, i, conf, SeedOnly, phast_conf, "LSimp");
}
if phast_conf.least_squares {
phast_benchmark::<LinearRegressionArray<LeastSquares>, _, _>(&mut csv_file, i, conf,SeedOnly, phast_conf, "LSqr");
}
},
Method::plus(ref phast_conf) => {
println!("PHast+ {} {}: encoder results...", phast_conf.bits_per_seed, phast_conf.bucket_size());
let mut csv_file = file("PHastPlus", &conf, i.0.len(), i.1.len(), "bits_per_seed bucket_size100 encoder");
if phast_conf.elias_fano() {
phast_benchmark::<DefaultCompressedArray, _, _>(&mut csv_file, i, conf, ShiftOnly::default(), phast_conf, "EF");
}
if phast_conf.compact {
phast_benchmark::<CompactFast, _, _>(&mut csv_file, i, conf, ShiftOnly::default(), phast_conf, "C");
}
if phast_conf.linear_simple {
phast_benchmark::<LinearRegressionArray<Simple>, _, _>(&mut csv_file, i, conf, ShiftOnly::default(), phast_conf, "LSimp");
}
if phast_conf.least_squares {
phast_benchmark::<LinearRegressionArray<LeastSquares>, _, _>(&mut csv_file, i, conf, ShiftOnly::default(), phast_conf, "LSqr");
}
},
Method::plus1wrap(ref phast_conf) => {
println!("PHast+2wrap {} {}: encoder results...", phast_conf.bits_per_seed, phast_conf.bucket_size());
let mut csv_file = file("PHastPlus1wrap", &conf, i.0.len(), i.1.len(), "bits_per_seed bucket_size100 encoder");
if phast_conf.elias_fano() {
phast_benchmark::<DefaultCompressedArray, _, _>(&mut csv_file, i, conf, ShiftOnlyWrapped::<2>, phast_conf, "EF");
}
if phast_conf.compact {
phast_benchmark::<CompactFast, _, _>(&mut csv_file, i, conf, ShiftOnlyWrapped::<2>, phast_conf, "C");
}
if phast_conf.linear_simple {
phast_benchmark::<LinearRegressionArray<Simple>, _, _>(&mut csv_file, i, conf, ShiftOnlyWrapped::<2>, phast_conf, "LSimp");
}
if phast_conf.least_squares {
phast_benchmark::<LinearRegressionArray<LeastSquares>, _, _>(&mut csv_file, i, conf, ShiftOnlyWrapped::<2>, phast_conf, "LSqr");
}
},
Method::plus2wrap(ref phast_conf) => {
println!("PHast+2wrap {} {}: encoder results...", phast_conf.bits_per_seed, phast_conf.bucket_size());
let mut csv_file = file("PHastPlus2wrap", &conf, i.0.len(), i.1.len(), "bits_per_seed bucket_size100 encoder");
if phast_conf.elias_fano() {
phast_benchmark::<DefaultCompressedArray, _, _>(&mut csv_file, i, conf, ShiftOnlyWrapped::<2>, phast_conf, "EF");
}
if phast_conf.compact {
phast_benchmark::<CompactFast, _, _>(&mut csv_file, i, conf, ShiftOnlyWrapped::<2>, phast_conf, "C");
}
if phast_conf.linear_simple {
phast_benchmark::<LinearRegressionArray<Simple>, _, _>(&mut csv_file, i, conf, ShiftOnlyWrapped::<2>, phast_conf, "LSimp");
}
if phast_conf.least_squares {
phast_benchmark::<LinearRegressionArray<LeastSquares>, _, _>(&mut csv_file, i, conf, ShiftOnlyWrapped::<2>, phast_conf, "LSqr");
}
},
Method::plus3wrap(ref phast_conf) => {
println!("PHast+3wrap {} {}: encoder results...", phast_conf.bits_per_seed, phast_conf.bucket_size());
let mut csv_file = file("PHastPlus3wrap", &conf, i.0.len(), i.1.len(), "bits_per_seed bucket_size100 encoder");
if phast_conf.elias_fano() {
phast_benchmark::<DefaultCompressedArray, _, _>(&mut csv_file, i, conf, ShiftOnlyWrapped::<3>, phast_conf, "EF");
}
if phast_conf.compact {
phast_benchmark::<CompactFast, _, _>(&mut csv_file, i, conf, ShiftOnlyWrapped::<3>, phast_conf, "C");
}
if phast_conf.linear_simple {
phast_benchmark::<LinearRegressionArray<Simple>, _, _>(&mut csv_file, i, conf, ShiftOnlyWrapped::<3>, phast_conf, "LSimp");
}
if phast_conf.least_squares {
phast_benchmark::<LinearRegressionArray<LeastSquares>, _, _>(&mut csv_file, i, conf, ShiftOnlyWrapped::<3>, phast_conf, "LSqr");
}
},
#[cfg(feature = "boomphf")]
Method::Boomphf{level_size} => {
match conf.key_source {
KeySource::xs32 | KeySource::xs64 => fmph_benchmark::<IntHasher, _>(i, conf, level_size, None),
_ => fmph_benchmark::<StrHasher, _>(i, conf, level_size, None)
}
}
#[cfg(feature = "cmph-sys")] Method::CHD{lambda} => {
println!("CHD: lambda results...");
let mut csv = file("CHD", &conf, i.0.len(), i.1.len(), "lambda");
if let Some(lambda) = lambda {
chd_benchmark(&mut csv, i, conf, lambda);
} else {
for lambda in 1..=6 { chd_benchmark(&mut csv, i, conf, lambda); }
}
}
#[cfg(feature = "ptr_hash")] Method::PtrHash{ speed } => {
println!("PtrHash: results...");
let mut csv_file = file("PtrHash", &conf, i.0.len(), i.1.len(), "speed");
match conf.key_source {
KeySource::xs32 | KeySource::xs64 => ptrhash_benchmark::<ptr_hash::hash::FxHash, _>(&mut csv_file, i, conf, speed),
_ => ptrhash_benchmark::<ptrhash::StrHasherForPtr, _>(&mut csv_file, i, conf, speed),
}
},
Method::plusshift1 { bits_per_shift, ref phast_conf } => {
println!("PHastPlusShift1 {}+{bits_per_shift} {}: encoder results...", phast_conf.bits_per_seed-bits_per_shift, phast_conf.bucket_size());
let mut csv_file = file(&format!("plus{bits_per_shift}shift1"), &conf, i.0.len(), i.1.len(), "bits_per_seed bucket_size100 encoder");
phast_benchmark::<DefaultCompressedArray, _, _>(&mut csv_file, i, conf, ShiftSeedWrapped::<1>(bits_per_shift), phast_conf, "EF");
},
Method::plusshift2 { bits_per_shift, ref phast_conf } => {
println!("PHastPlusShift2 {}+{bits_per_shift} {}: encoder results...", phast_conf.bits_per_seed-bits_per_shift, phast_conf.bucket_size());
let mut csv_file = file(&format!("plus{bits_per_shift}shift2"), &conf, i.0.len(), i.1.len(), "bits_per_seed bucket_size100 encoder");
phast_benchmark::<DefaultCompressedArray, _, _>(&mut csv_file, i, conf, ShiftSeedWrapped::<2>(bits_per_shift), phast_conf, "EF");
},
Method::plusshift3 { bits_per_shift, ref phast_conf } => {
println!("PHastPlusShift3 {}+{bits_per_shift} {}: encoder results...", phast_conf.bits_per_seed-bits_per_shift, phast_conf.bucket_size());
let mut csv_file = file(&format!("plus{bits_per_shift}shift3"), &conf, i.0.len(), i.1.len(), "bits_per_seed bucket_size100 encoder");
phast_benchmark::<DefaultCompressedArray, _, _>(&mut csv_file, i, conf, ShiftSeedWrapped::<3>(bits_per_shift), phast_conf, "EF");
},
Method::None => {},
}
}
/// Entry point: parses the command line, prints the run settings, obtains the keys from the
/// selected source and passes them to `run`.
fn main() {
    // Number of keys generated when `-n` is not given.
    const DEFAULT_KEYS_NUM: usize = 1_000_000;

    let conf: Conf = Conf::parse();
    println!("multi-threaded calculations use {} threads (to set by the RAYON_NUM_THREADS environment variable)", current_num_threads());
    println!("build and lookup times are averaged over {} and {} runs, respectively", conf.build_runs, conf.lookup_runs);
    println!("hasher: integer {} string {}", std::any::type_name::<IntHasher>(), std::any::type_name::<StrHasher>());
    match conf.key_source {
        KeySource::xs32 => {
            // The command line rejects a zero seed, but plain truncation to 32 bits could
            // reintroduce one (e.g. for a seed of 2^32); presumably a zero state is invalid for
            // the xorshift generator. Seeds with non-zero low 32 bits behave exactly as before;
            // otherwise the high half is used, which is non-zero whenever the seed is.
            let low_bits = conf.seed as u32;
            let seed = if low_bits != 0 { low_bits } else { (conf.seed >> 32) as u32 };
            run(&conf, &gen_data(conf.keys_num.unwrap_or(DEFAULT_KEYS_NUM), conf.foreign_keys_num, XorShift32(seed)))
        },
        KeySource::xs64 => run(&conf, &gen_data(conf.keys_num.unwrap_or(DEFAULT_KEYS_NUM), conf.foreign_keys_num, XorShift64(conf.seed))),
        KeySource::stdin|KeySource::stdinz => {
            let raw_lines = if conf.key_source == KeySource::stdin {
                RawLines::separated_by_newlines(std::io::stdin().lock())
            } else {
                RawLines::separated_by_zeros(std::io::stdin().lock())
            };
            let lines = raw_lines.map(|l| l.expect("cannot read a key from the standard input"));
            let i = if let Some(keys_num) = conf.keys_num {
                gen_data(keys_num, conf.foreign_keys_num, lines)
            } else {
                // Without an explicit number of keys, the whole input becomes the key set
                // and there are no foreign keys.
                (lines.collect(), Vec::new())
            };
            print_input_stats("key set", &i.0);
            print_input_stats("foreign key set", &i.1);
            run(&conf, &i);
        },
        // Uses the same default number of keys as `xs32`/`xs64` instead of panicking
        // on a missing `-n`.
        KeySource::randstr => run(&conf, &gen_data(conf.keys_num.unwrap_or(DEFAULT_KEYS_NUM), conf.foreign_keys_num, RandomStrings::new(10..50, conf.seed)))
    };
}