use anyhow::Result;
use diskann_benchmark_runner::{
dispatcher::{FailureScore, MatchScore},
output::Output,
registry::Benchmarks,
utils::{percentiles, MicroSeconds},
Benchmark, Checkpoint,
};
use diskann_label_filter::{
kv_index::GenericIndex,
read_and_parse_queries, read_baselabels,
stores::bftree_store::BfTreeStore,
traits::{
inverted_index_trait::InvertedIndexProvider, key_codec::DefaultKeyCodec,
posting_list_trait::RoaringPostingList,
},
};
use serde::Serialize;
use std::{io::Write, path::Path, sync::Arc};
use crate::{
inputs::filters::{InvertedIndexKind, MetadataIndexBuild},
utils::filters::QueryBitmapEvaluator,
};
pub(crate) fn register_benchmarks(benchmarks: &mut Benchmarks) {
benchmarks.register("metadata-index-build", MetadataIndexJob);
}
#[derive(Debug)]
struct MetadataIndexJob;
impl Benchmark for MetadataIndexJob {
type Input = MetadataIndexBuild;
type Output = MetadataIndexBuildStats;
fn try_match(&self, _input: &MetadataIndexBuild) -> Result<MatchScore, FailureScore> {
Ok(MatchScore(1))
}
fn description(
&self,
_f: &mut std::fmt::Formatter<'_>,
_input: Option<&MetadataIndexBuild>,
) -> std::fmt::Result {
Ok(())
}
fn run(
&self,
input: &MetadataIndexBuild,
checkpoint: Checkpoint<'_>,
output: &mut dyn Output,
) -> anyhow::Result<MetadataIndexBuildStats> {
run(input, checkpoint, output)
}
}
fn run(
input: &crate::inputs::filters::MetadataIndexBuild,
checkpoint: Checkpoint<'_>,
mut output: &mut dyn Output,
) -> Result<MetadataIndexBuildStats, anyhow::Error> {
writeln!(output, "{}", input)?;
let filter_params = &input.filter_params;
let (bitmap_filters_vec, filter_search_results, _label_count) =
prepare_bitmap_filters_from_paths_with_kind(
filter_params.data_labels.as_ref(),
filter_params.query_predicates.as_ref(),
input.inverted_index_type,
checkpoint,
)?;
let counts: Vec<usize> = bitmap_filters_vec.iter().map(|bf| bf.count()).collect();
let query_count = counts.len();
let total_matching: usize = counts.iter().cloned().sum();
let mut sorted = counts.clone();
let (
counts_p1,
counts_p5,
counts_p10,
counts_p50,
counts_p90,
counts_p95,
counts_p99,
counts_avg,
) = if sorted.is_empty() {
(
0usize, 0usize, 0usize, 0usize, 0usize, 0usize, 0usize, 0.0f64,
)
} else {
sorted.sort_unstable();
let p = percentiles::compute_percentiles(&mut sorted)?;
let p50 = p.median.round() as usize;
let p90 = p.p90;
let p99 = p.p99;
let n = sorted.len();
let p1 = sorted[(n / 100).min(n - 1)];
let p5 = sorted[((5 * n) / 100).min(n - 1)];
let p10 = sorted[((10 * n) / 100).min(n - 1)];
let p95 = sorted[((95 * n) / 100).min(n - 1)];
(p1, p5, p10, p50, p90, p95, p99, p.mean)
};
let stats = MetadataIndexBuildStats {
label_count: _label_count,
query_count,
total_matching,
counts_avg,
counts_p1,
counts_p5,
counts_p10,
counts_p50,
counts_p90,
counts_p95,
counts_p99,
filter: filter_search_results,
};
writeln!(output, "\n\n{}", stats)?;
Ok(stats)
}
#[derive(Debug, Serialize)]
pub struct FilterSearchResults {
pub base_label_latencies: MicroSeconds,
pub inverted_index_build_latencies: MicroSeconds,
pub query_parsing_latencies: MicroSeconds,
pub search_strategy_latencies: MicroSeconds,
}
impl Default for FilterSearchResults {
fn default() -> Self {
Self {
base_label_latencies: MicroSeconds::new(0),
inverted_index_build_latencies: MicroSeconds::new(0),
query_parsing_latencies: MicroSeconds::new(0),
search_strategy_latencies: MicroSeconds::new(0),
}
}
}
impl std::fmt::Display for FilterSearchResults {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
writeln!(f, "Filter Search Metrics:")?;
writeln!(
f,
" Base Label Latencies: {} s",
self.base_label_latencies.as_seconds()
)?;
writeln!(
f,
" Inverted Index Build Latencies: {} s",
self.inverted_index_build_latencies.as_seconds()
)?;
writeln!(
f,
" Query Parsing Latencies: {} s",
self.query_parsing_latencies.as_seconds()
)?;
writeln!(
f,
" Search Strategy Latencies: {} s",
self.search_strategy_latencies.as_seconds()
)?;
Ok(())
}
}
#[derive(serde::Serialize, Debug)]
pub(super) struct MetadataIndexBuildStats {
pub(super) label_count: usize,
pub(super) query_count: usize,
pub(super) total_matching: usize,
pub(super) counts_avg: f64,
pub(super) counts_p1: usize,
pub(super) counts_p5: usize,
pub(super) counts_p10: usize,
pub(super) counts_p50: usize,
pub(super) counts_p90: usize,
pub(super) counts_p95: usize,
pub(super) counts_p99: usize,
pub(super) filter: FilterSearchResults,
}
impl std::fmt::Display for MetadataIndexBuildStats {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
writeln!(f, "Metadata Index Build Stats:")?;
writeln!(f, " Label Count: {}", self.label_count)?;
writeln!(f, " Query Count: {}", self.query_count)?;
writeln!(f, " Total Matching: {}", self.total_matching)?;
writeln!(f, " Match Count Avg: {:.2}", self.counts_avg)?;
writeln!(f, " Match Count P1: {}", self.counts_p1)?;
writeln!(f, " Match Count P5: {}", self.counts_p5)?;
writeln!(f, " Match Count P10: {}", self.counts_p10)?;
writeln!(f, " Match Count P50: {}", self.counts_p50)?;
writeln!(f, " Match Count P90: {}", self.counts_p90)?;
writeln!(f, " Match Count P95: {}", self.counts_p95)?;
writeln!(f, " Match Count P99: {}", self.counts_p99)?;
writeln!(f, "{}", self.filter)?;
Ok(())
}
}
pub fn prepare_bitmap_filters_from_paths_with_kind(
data_labels: &Path,
query_predicates: &Path,
kind: InvertedIndexKind,
_checkpoint: Checkpoint<'_>,
) -> anyhow::Result<(Vec<QueryBitmapEvaluator>, FilterSearchResults, usize)> {
let timer = std::time::Instant::now();
let base_labels = read_baselabels(data_labels)?;
let label_count = base_labels.len();
let base_label_latency = timer.elapsed();
let timer = std::time::Instant::now();
let inverted_index = match kind {
InvertedIndexKind::BfTree => {
let store = Arc::new(BfTreeStore::memory().unwrap());
let mut idx =
GenericIndex::<BfTreeStore, RoaringPostingList, DefaultKeyCodec>::new(store)
.with_field_normalizer(|field| format!("/{}", field.replace(".", "/")));
for (doc_id, doc) in base_labels.iter().enumerate() {
idx.insert(doc_id, &doc.flatten_metadata())?;
}
Arc::new(idx)
}
};
let inverted_index_build_latency = timer.elapsed();
let timer = std::time::Instant::now();
let parsed_queries = read_and_parse_queries(query_predicates)?;
let query_parsing_latency = timer.elapsed();
let timer = std::time::Instant::now();
let bitmap_filters: Vec<QueryBitmapEvaluator> = parsed_queries
.into_iter()
.map(|(_, query_predicate)| {
QueryBitmapEvaluator::new(query_predicate, inverted_index.as_ref())
})
.collect();
let search_strategy_latency = timer.elapsed();
let filter_search_results = FilterSearchResults {
base_label_latencies: base_label_latency.into(),
inverted_index_build_latencies: inverted_index_build_latency.into(),
query_parsing_latencies: query_parsing_latency.into(),
search_strategy_latencies: search_strategy_latency.into(),
};
Ok((bitmap_filters, filter_search_results, label_count))
}