Skip to main content

diskann_tools/utils/
build_pq.rs

1/*
2 * Copyright (c) Microsoft Corporation.
3 * Licensed under the MIT license.
4 */
5
6use diskann::ANNResult;
7use diskann_disk::data_model::GraphDataType;
8use diskann_providers::storage::StorageReadProvider;
9use diskann_providers::{
10    model::{
11        GeneratePivotArguments, MAX_PQ_TRAINING_SET_SIZE, NUM_KMEANS_REPS_PQ, NUM_PQ_CENTROIDS,
12    },
13    storage::{
14        get_disk_index_compressed_pq_file, get_disk_index_pq_pivot_file, FileStorageProvider,
15        PQStorage,
16    },
17    utils::{load_metadata_from_file, Timer},
18};
19use diskann_vector::distance::Metric;
20use tracing::info;
21
22pub struct BuildPQParameters<'a> {
23    pub metric: Metric,
24    pub data_path: &'a str,
25    pub index_path_prefix: &'a str,
26    pub num_threads: usize,
27    pub p_val: f64,
28    pub pq_bytes: f64,
29}
30
31pub fn build_pq<Data: GraphDataType>(
32    storage_provider: &impl StorageReadProvider,
33    parameters: BuildPQParameters,
34) -> ANNResult<()> {
35    let num_pq_chunks = parameters.pq_bytes as usize;
36
37    let data_path = parameters.data_path;
38    let disk_pq_pivot_path = get_disk_index_pq_pivot_file(parameters.index_path_prefix);
39    let disk_pq_compressed_data_path =
40        get_disk_index_compressed_pq_file(parameters.index_path_prefix);
41
42    let mut pq_storage = PQStorage::new(
43        &disk_pq_pivot_path,
44        &disk_pq_compressed_data_path,
45        Some(data_path),
46    );
47
48    let metadata = load_metadata_from_file(storage_provider, parameters.data_path)?;
49    info!(
50        "Compressing dim-{} data into {} chunks(bytes) for PQ",
51        metadata.ndims(),
52        num_pq_chunks
53    );
54
55    let p_val = MAX_PQ_TRAINING_SET_SIZE / (metadata.npoints() as f64);
56
57    let timer = Timer::new();
58    let storage_provider = FileStorageProvider;
59    let random_provider = diskann_providers::utils::create_rnd_provider_from_seed(42);
60
61    let (mut train_data_vector, num_train, train_dim) = pq_storage
62        .get_random_train_data_slice::<Data::VectorDataType, _>(
63            p_val,
64            &storage_provider,
65            &mut random_provider.create_rnd(),
66        )?;
67
68    diskann_providers::model::pq::generate_pq_pivots(
69        GeneratePivotArguments::new(
70            num_train,
71            train_dim,
72            NUM_PQ_CENTROIDS,
73            num_pq_chunks,
74            NUM_KMEANS_REPS_PQ,
75            false,
76        )?,
77        &mut train_data_vector,
78        &pq_storage,
79        &storage_provider,
80        random_provider,
81        parameters.num_threads,
82    )?;
83
84    diskann_providers::model::pq::generate_pq_data_from_pivots::<f32, _, _>(
85        NUM_PQ_CENTROIDS,
86        num_pq_chunks,
87        &mut pq_storage,
88        &storage_provider,
89        0,
90        parameters.num_threads,
91    )?;
92
93    info!(
94         "PQ build completed in {:.3} seconds, {:.3}B cycles, {:.3}% CPU time, peak memory {:.3} GBs for {} chunks, using {} threads",
95         timer.elapsed_seconds(),
96         timer.elapsed_gcycles(),
97         timer.get_average_cpu_time_in_percents(),
98         timer.get_peak_memory_usage(),
99         num_pq_chunks,
100         parameters.num_threads
101     );
102
103    Ok(())
104}