diskann_tools/utils/
build_pq.rs1use diskann::ANNResult;
7use diskann_providers::storage::StorageReadProvider;
8use diskann_providers::{
9 model::{
10 graph::traits::GraphDataType, GeneratePivotArguments, MAX_PQ_TRAINING_SET_SIZE,
11 NUM_KMEANS_REPS_PQ, NUM_PQ_CENTROIDS,
12 },
13 storage::{
14 get_disk_index_compressed_pq_file, get_disk_index_pq_pivot_file, FileStorageProvider,
15 PQStorage,
16 },
17 utils::{load_metadata_from_file, Timer},
18};
19use diskann_vector::distance::Metric;
20use tracing::info;
21
22pub struct BuildPQParameters<'a> {
23 pub metric: Metric,
24 pub data_path: &'a str,
25 pub index_path_prefix: &'a str,
26 pub num_threads: usize,
27 pub p_val: f64,
28 pub pq_bytes: f64,
29}
30
31pub fn build_pq<Data: GraphDataType>(
32 storage_provider: &impl StorageReadProvider,
33 parameters: BuildPQParameters,
34) -> ANNResult<()> {
35 let num_pq_chunks = parameters.pq_bytes as usize;
36
37 let data_path = parameters.data_path;
38 let disk_pq_pivot_path = get_disk_index_pq_pivot_file(parameters.index_path_prefix);
39 let disk_pq_compressed_data_path =
40 get_disk_index_compressed_pq_file(parameters.index_path_prefix);
41
42 let mut pq_storage = PQStorage::new(
43 &disk_pq_pivot_path,
44 &disk_pq_compressed_data_path,
45 Some(data_path),
46 );
47
48 let metadata = load_metadata_from_file(storage_provider, parameters.data_path)?;
49 info!(
50 "Compressing dim-{} data into {} chunks(bytes) for PQ",
51 metadata.ndims(),
52 num_pq_chunks
53 );
54
55 let p_val = MAX_PQ_TRAINING_SET_SIZE / (metadata.npoints() as f64);
56
57 let timer = Timer::new();
58 let storage_provider = FileStorageProvider;
59 let random_provider = diskann_providers::utils::create_rnd_provider_from_seed(42);
60
61 let (mut train_data_vector, num_train, train_dim) = pq_storage
62 .get_random_train_data_slice::<Data::VectorDataType, _>(
63 p_val,
64 &storage_provider,
65 &mut random_provider.create_rnd(),
66 )?;
67
68 diskann_providers::model::pq::generate_pq_pivots(
69 GeneratePivotArguments::new(
70 num_train,
71 train_dim,
72 NUM_PQ_CENTROIDS,
73 num_pq_chunks,
74 NUM_KMEANS_REPS_PQ,
75 false,
76 )?,
77 &mut train_data_vector,
78 &pq_storage,
79 &storage_provider,
80 random_provider,
81 parameters.num_threads,
82 )?;
83
84 diskann_providers::model::pq::generate_pq_data_from_pivots::<f32, _, _>(
85 NUM_PQ_CENTROIDS,
86 num_pq_chunks,
87 &mut pq_storage,
88 &storage_provider,
89 false,
90 0,
91 parameters.num_threads,
92 )?;
93
94 info!(
95 "PQ build completed in {:.3} seconds, {:.3}B cycles, {:.3}% CPU time, peak memory {:.3} GBs for {} chunks, using {} threads",
96 timer.elapsed_seconds(),
97 timer.elapsed_gcycles(),
98 timer.get_average_cpu_time_in_percents(),
99 timer.get_peak_memory_usage(),
100 num_pq_chunks,
101 parameters.num_threads
102 );
103
104 Ok(())
105}