Skip to main content

diskann_tools/utils/
build_pq.rs

/*
 * Copyright (c) Microsoft Corporation.
 * Licensed under the MIT license.
 */
5
6use diskann::ANNResult;
7use diskann_providers::storage::StorageReadProvider;
8use diskann_providers::{
9    model::{
10        graph::traits::GraphDataType, GeneratePivotArguments, MAX_PQ_TRAINING_SET_SIZE,
11        NUM_KMEANS_REPS_PQ, NUM_PQ_CENTROIDS,
12    },
13    storage::{
14        get_disk_index_compressed_pq_file, get_disk_index_pq_pivot_file, FileStorageProvider,
15        PQStorage,
16    },
17    utils::{load_metadata_from_file, Timer},
18};
19use diskann_vector::distance::Metric;
20use tracing::info;
21
22pub struct BuildPQParameters<'a> {
23    pub metric: Metric,
24    pub data_path: &'a str,
25    pub index_path_prefix: &'a str,
26    pub num_threads: usize,
27    pub p_val: f64,
28    pub pq_bytes: f64,
29}
30
31pub fn build_pq<Data: GraphDataType>(
32    storage_provider: &impl StorageReadProvider,
33    parameters: BuildPQParameters,
34) -> ANNResult<()> {
35    let num_pq_chunks = parameters.pq_bytes as usize;
36
37    let data_path = parameters.data_path;
38    let disk_pq_pivot_path = get_disk_index_pq_pivot_file(parameters.index_path_prefix);
39    let disk_pq_compressed_data_path =
40        get_disk_index_compressed_pq_file(parameters.index_path_prefix);
41
42    let mut pq_storage = PQStorage::new(
43        &disk_pq_pivot_path,
44        &disk_pq_compressed_data_path,
45        Some(data_path),
46    );
47
48    let metadata = load_metadata_from_file(storage_provider, parameters.data_path)?;
49    info!(
50        "Compressing dim-{} data into {} chunks(bytes) for PQ",
51        metadata.ndims(),
52        num_pq_chunks
53    );
54
55    let p_val = MAX_PQ_TRAINING_SET_SIZE / (metadata.npoints() as f64);
56
57    let timer = Timer::new();
58    let storage_provider = FileStorageProvider;
59    let random_provider = diskann_providers::utils::create_rnd_provider_from_seed(42);
60
61    let (mut train_data_vector, num_train, train_dim) = pq_storage
62        .get_random_train_data_slice::<Data::VectorDataType, _>(
63            p_val,
64            &storage_provider,
65            &mut random_provider.create_rnd(),
66        )?;
67
68    diskann_providers::model::pq::generate_pq_pivots(
69        GeneratePivotArguments::new(
70            num_train,
71            train_dim,
72            NUM_PQ_CENTROIDS,
73            num_pq_chunks,
74            NUM_KMEANS_REPS_PQ,
75            false,
76        )?,
77        &mut train_data_vector,
78        &pq_storage,
79        &storage_provider,
80        random_provider,
81        parameters.num_threads,
82    )?;
83
84    diskann_providers::model::pq::generate_pq_data_from_pivots::<f32, _, _>(
85        NUM_PQ_CENTROIDS,
86        num_pq_chunks,
87        &mut pq_storage,
88        &storage_provider,
89        false,
90        0,
91        parameters.num_threads,
92    )?;
93
94    info!(
95         "PQ build completed in {:.3} seconds, {:.3}B cycles, {:.3}% CPU time, peak memory {:.3} GBs for {} chunks, using {} threads",
96         timer.elapsed_seconds(),
97         timer.elapsed_gcycles(),
98         timer.get_average_cpu_time_in_percents(),
99         timer.get_peak_memory_usage(),
100         num_pq_chunks,
101         parameters.num_threads
102     );
103
104    Ok(())
105}