next_plaid/
update.rs

//! Index update functionality for adding new documents.
//!
//! This module provides functions to incrementally update an existing PLAID index
//! with new documents, matching fast-plaid's behavior:
//! - Buffer mechanism for small updates
//! - Centroid expansion for outliers
//! - Cluster threshold updates
8
9use std::collections::HashMap;
10use std::fs;
11use std::fs::File;
12use std::io::{BufReader, BufWriter};
13use std::path::Path;
14
15use serde::{Deserialize, Serialize};
16
17use ndarray::{s, Array1, Array2, Axis};
18use rayon::prelude::*;
19
20use crate::codec::ResidualCodec;
21use crate::error::Error;
22use crate::error::Result;
23use crate::index::Metadata;
24use crate::kmeans::compute_kmeans;
25use crate::kmeans::ComputeKmeansConfig;
26use crate::utils::quantile;
27
/// Number of documents handled per batch when the caller does not supply a
/// batch size (50,000, the same value fast-plaid uses).
const DEFAULT_BATCH_SIZE: usize = 50_000;
30
31/// Configuration for index updates.
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct UpdateConfig {
34    /// Batch size for processing documents (default: 50,000)
35    pub batch_size: usize,
36    /// Number of K-means iterations for centroid expansion (default: 4)
37    pub kmeans_niters: usize,
38    /// Max points per centroid for K-means (default: 256)
39    pub max_points_per_centroid: usize,
40    /// Number of samples for K-means (default: auto-calculated)
41    pub n_samples_kmeans: Option<usize>,
42    /// Random seed (default: 42)
43    pub seed: u64,
44    /// If index has fewer docs than this, rebuild from scratch (default: 999)
45    pub start_from_scratch: usize,
46    /// Buffer size before triggering centroid expansion (default: 100)
47    pub buffer_size: usize,
48}
49
50impl Default for UpdateConfig {
51    fn default() -> Self {
52        Self {
53            batch_size: DEFAULT_BATCH_SIZE,
54            kmeans_niters: 4,
55            max_points_per_centroid: 256,
56            n_samples_kmeans: None,
57            seed: 42,
58            start_from_scratch: 999,
59            buffer_size: 100,
60        }
61    }
62}
63
64impl UpdateConfig {
65    /// Convert to ComputeKmeansConfig for centroid expansion.
66    pub fn to_kmeans_config(&self) -> ComputeKmeansConfig {
67        ComputeKmeansConfig {
68            kmeans_niters: self.kmeans_niters,
69            max_points_per_centroid: self.max_points_per_centroid,
70            seed: self.seed,
71            n_samples_kmeans: self.n_samples_kmeans,
72            num_partitions: None,
73        }
74    }
75}
76
77// ============================================================================
78// Buffer Management
79// ============================================================================
80
81/// Load buffered embeddings from buffer.npy.
82///
83/// Returns an empty vector if buffer.npy doesn't exist.
84/// Uses buffer_lengths.json to split the flattened array back into per-document arrays.
85pub fn load_buffer(index_path: &Path) -> Result<Vec<Array2<f32>>> {
86    use ndarray_npy::ReadNpyExt;
87
88    let buffer_path = index_path.join("buffer.npy");
89    let lengths_path = index_path.join("buffer_lengths.json");
90
91    if !buffer_path.exists() {
92        return Ok(Vec::new());
93    }
94
95    // Load the flattened embeddings array
96    let flat: Array2<f32> = match Array2::read_npy(File::open(&buffer_path)?) {
97        Ok(arr) => arr,
98        Err(_) => return Ok(Vec::new()),
99    };
100
101    // Load lengths to split back into per-document arrays
102    if lengths_path.exists() {
103        let lengths: Vec<i64> =
104            serde_json::from_reader(BufReader::new(File::open(&lengths_path)?))?;
105
106        let mut result = Vec::with_capacity(lengths.len());
107        let mut offset = 0;
108
109        for &len in &lengths {
110            let len_usize = len as usize;
111            if offset + len_usize > flat.nrows() {
112                break;
113            }
114            let doc_emb = flat.slice(s![offset..offset + len_usize, ..]).to_owned();
115            result.push(doc_emb);
116            offset += len_usize;
117        }
118
119        return Ok(result);
120    }
121
122    // Fallback: if no lengths file, return as single document (legacy behavior)
123    Ok(vec![flat])
124}
125
126/// Save embeddings to buffer.npy.
127///
128/// Also saves buffer_info.json with the number of documents for deletion tracking.
129pub fn save_buffer(index_path: &Path, embeddings: &[Array2<f32>]) -> Result<()> {
130    use ndarray_npy::WriteNpyExt;
131
132    let buffer_path = index_path.join("buffer.npy");
133
134    // For simplicity, concatenate all embeddings into one array
135    // and store the lengths separately
136    if embeddings.is_empty() {
137        return Ok(());
138    }
139
140    let dim = embeddings[0].ncols();
141    let total_rows: usize = embeddings.iter().map(|e| e.nrows()).sum();
142
143    let mut flat = Array2::<f32>::zeros((total_rows, dim));
144    let mut offset = 0;
145    let mut lengths = Vec::new();
146
147    for emb in embeddings {
148        let n = emb.nrows();
149        flat.slice_mut(s![offset..offset + n, ..]).assign(emb);
150        lengths.push(n as i64);
151        offset += n;
152    }
153
154    flat.write_npy(File::create(&buffer_path)?)?;
155
156    // Save lengths
157    let lengths_path = index_path.join("buffer_lengths.json");
158    serde_json::to_writer(BufWriter::new(File::create(&lengths_path)?), &lengths)?;
159
160    // Save buffer info for deletion tracking (number of documents)
161    let info_path = index_path.join("buffer_info.json");
162    let buffer_info = serde_json::json!({ "num_docs": embeddings.len() });
163    serde_json::to_writer(BufWriter::new(File::create(&info_path)?), &buffer_info)?;
164
165    Ok(())
166}
167
168/// Load buffer info (number of buffered documents).
169///
170/// Returns 0 if buffer_info.json doesn't exist.
171pub fn load_buffer_info(index_path: &Path) -> Result<usize> {
172    let info_path = index_path.join("buffer_info.json");
173    if !info_path.exists() {
174        return Ok(0);
175    }
176
177    let info: serde_json::Value = serde_json::from_reader(BufReader::new(File::open(&info_path)?))?;
178
179    Ok(info.get("num_docs").and_then(|v| v.as_u64()).unwrap_or(0) as usize)
180}
181
182/// Clear buffer files.
183pub fn clear_buffer(index_path: &Path) -> Result<()> {
184    let buffer_path = index_path.join("buffer.npy");
185    let lengths_path = index_path.join("buffer_lengths.json");
186    let info_path = index_path.join("buffer_info.json");
187
188    if buffer_path.exists() {
189        fs::remove_file(&buffer_path)?;
190    }
191    if lengths_path.exists() {
192        fs::remove_file(&lengths_path)?;
193    }
194    if info_path.exists() {
195        fs::remove_file(&info_path)?;
196    }
197
198    Ok(())
199}
200
201/// Load embeddings stored for rebuild (embeddings.npy + embeddings_lengths.json).
202///
203/// This function loads raw embeddings that were saved for start-from-scratch rebuilds.
204/// The embeddings are stored in a flat 2D array with a separate lengths file.
205pub fn load_embeddings_npy(index_path: &Path) -> Result<Vec<Array2<f32>>> {
206    use ndarray_npy::ReadNpyExt;
207
208    let emb_path = index_path.join("embeddings.npy");
209    let lengths_path = index_path.join("embeddings_lengths.json");
210
211    if !emb_path.exists() {
212        return Ok(Vec::new());
213    }
214
215    // Load flat embeddings array
216    let flat: Array2<f32> = Array2::read_npy(File::open(&emb_path)?)?;
217
218    // Load lengths to split back into per-document arrays
219    if lengths_path.exists() {
220        let lengths: Vec<i64> =
221            serde_json::from_reader(BufReader::new(File::open(&lengths_path)?))?;
222
223        let mut result = Vec::with_capacity(lengths.len());
224        let mut offset = 0;
225
226        for &len in &lengths {
227            let len_usize = len as usize;
228            if offset + len_usize > flat.nrows() {
229                break;
230            }
231            let doc_emb = flat.slice(s![offset..offset + len_usize, ..]).to_owned();
232            result.push(doc_emb);
233            offset += len_usize;
234        }
235
236        return Ok(result);
237    }
238
239    // Fallback: if no lengths file, return as single document
240    Ok(vec![flat])
241}
242
243/// Save embeddings for potential rebuild (start-from-scratch mode).
244///
245/// Stores embeddings in embeddings.npy (flat array) + embeddings_lengths.json.
246/// This matches fast-plaid's behavior of storing raw embeddings when the index
247/// is below the start_from_scratch threshold.
248pub fn save_embeddings_npy(index_path: &Path, embeddings: &[Array2<f32>]) -> Result<()> {
249    use ndarray_npy::WriteNpyExt;
250
251    if embeddings.is_empty() {
252        return Ok(());
253    }
254
255    let dim = embeddings[0].ncols();
256    let total_rows: usize = embeddings.iter().map(|e| e.nrows()).sum();
257
258    let mut flat = Array2::<f32>::zeros((total_rows, dim));
259    let mut offset = 0;
260    let mut lengths = Vec::with_capacity(embeddings.len());
261
262    for emb in embeddings {
263        let n = emb.nrows();
264        flat.slice_mut(s![offset..offset + n, ..]).assign(emb);
265        lengths.push(n as i64);
266        offset += n;
267    }
268
269    // Save flat embeddings
270    let emb_path = index_path.join("embeddings.npy");
271    flat.write_npy(File::create(&emb_path)?)?;
272
273    // Save lengths for reconstruction
274    let lengths_path = index_path.join("embeddings_lengths.json");
275    serde_json::to_writer(BufWriter::new(File::create(&lengths_path)?), &lengths)?;
276
277    Ok(())
278}
279
280/// Clear embeddings.npy and embeddings_lengths.json.
281pub fn clear_embeddings_npy(index_path: &Path) -> Result<()> {
282    let emb_path = index_path.join("embeddings.npy");
283    let lengths_path = index_path.join("embeddings_lengths.json");
284
285    if emb_path.exists() {
286        fs::remove_file(&emb_path)?;
287    }
288    if lengths_path.exists() {
289        fs::remove_file(&lengths_path)?;
290    }
291    Ok(())
292}
293
/// Report whether `embeddings.npy` is present in the index directory
/// (the file used by start-from-scratch mode).
pub fn embeddings_npy_exists(index_path: &Path) -> bool {
    let candidate = index_path.join("embeddings.npy");
    candidate.exists()
}
298
299// ============================================================================
300// Cluster Threshold Management
301// ============================================================================
302
303/// Load cluster threshold from cluster_threshold.npy.
304pub fn load_cluster_threshold(index_path: &Path) -> Result<f32> {
305    use ndarray_npy::ReadNpyExt;
306
307    let thresh_path = index_path.join("cluster_threshold.npy");
308    if !thresh_path.exists() {
309        return Err(Error::Update("cluster_threshold.npy not found".into()));
310    }
311
312    let arr: Array1<f32> = Array1::read_npy(File::open(&thresh_path)?)?;
313    Ok(arr[0])
314}
315
316/// Update cluster_threshold.npy with weighted average.
317pub fn update_cluster_threshold(
318    index_path: &Path,
319    new_residual_norms: &Array1<f32>,
320    old_total_embeddings: usize,
321) -> Result<()> {
322    use ndarray_npy::{ReadNpyExt, WriteNpyExt};
323
324    let new_count = new_residual_norms.len();
325    if new_count == 0 {
326        return Ok(());
327    }
328
329    let new_threshold = quantile(new_residual_norms, 0.75);
330
331    let thresh_path = index_path.join("cluster_threshold.npy");
332    let final_threshold = if thresh_path.exists() {
333        let old_arr: Array1<f32> = Array1::read_npy(File::open(&thresh_path)?)?;
334        let old_threshold = old_arr[0];
335        let total = old_total_embeddings + new_count;
336        (old_threshold * old_total_embeddings as f32 + new_threshold * new_count as f32)
337            / total as f32
338    } else {
339        new_threshold
340    };
341
342    Array1::from_vec(vec![final_threshold]).write_npy(File::create(&thresh_path)?)?;
343
344    Ok(())
345}
346
347// ============================================================================
348// Centroid Expansion
349// ============================================================================
350
351/// Find outlier embeddings that are far from all existing centroids.
352///
353/// Returns indices of embeddings where min L2² distance > threshold².
354fn find_outliers(
355    flat_embeddings: &Array2<f32>,
356    centroids: &Array2<f32>,
357    threshold_sq: f32,
358) -> Vec<usize> {
359    flat_embeddings
360        .axis_iter(Axis(0))
361        .into_par_iter()
362        .enumerate()
363        .filter_map(|(i, emb)| {
364            // Find minimum squared distance to any centroid
365            let min_dist_sq = centroids
366                .axis_iter(Axis(0))
367                .map(|c| {
368                    // L2 squared distance
369                    emb.iter()
370                        .zip(c.iter())
371                        .map(|(a, b)| (a - b).powi(2))
372                        .sum::<f32>()
373                })
374                .fold(f32::INFINITY, f32::min);
375
376            if min_dist_sq > threshold_sq {
377                Some(i)
378            } else {
379                None
380            }
381        })
382        .collect()
383}
384
385/// Expand centroids by clustering embeddings far from existing centroids.
386///
387/// This implements fast-plaid's update_centroids() function:
388/// 1. Flatten all new embeddings
389/// 2. Find outliers (distance > cluster_threshold²)
390/// 3. Cluster outliers to get new centroids
391/// 4. Append new centroids to centroids.npy
392/// 5. Extend ivf_lengths.npy with zeros
393/// 6. Update metadata.json num_partitions
394///
395/// Returns the number of new centroids added.
396pub fn update_centroids(
397    index_path: &Path,
398    new_embeddings: &[Array2<f32>],
399    cluster_threshold: f32,
400    config: &UpdateConfig,
401) -> Result<usize> {
402    use ndarray_npy::{ReadNpyExt, WriteNpyExt};
403
404    let centroids_path = index_path.join("centroids.npy");
405    if !centroids_path.exists() {
406        return Ok(0);
407    }
408
409    // Load existing centroids
410    let existing_centroids: Array2<f32> = Array2::read_npy(File::open(&centroids_path)?)?;
411
412    // Flatten all new embeddings
413    let dim = existing_centroids.ncols();
414    let total_tokens: usize = new_embeddings.iter().map(|e| e.nrows()).sum();
415
416    if total_tokens == 0 {
417        return Ok(0);
418    }
419
420    let mut flat_embeddings = Array2::<f32>::zeros((total_tokens, dim));
421    let mut offset = 0;
422
423    for emb in new_embeddings {
424        let n = emb.nrows();
425        flat_embeddings
426            .slice_mut(s![offset..offset + n, ..])
427            .assign(emb);
428        offset += n;
429    }
430
431    // Find outliers
432    let threshold_sq = cluster_threshold * cluster_threshold;
433    let outlier_indices = find_outliers(&flat_embeddings, &existing_centroids, threshold_sq);
434
435    let num_outliers = outlier_indices.len();
436    if num_outliers == 0 {
437        return Ok(0);
438    }
439
440    // Extract outlier embeddings
441    let mut outliers = Array2::<f32>::zeros((num_outliers, dim));
442    for (i, &idx) in outlier_indices.iter().enumerate() {
443        outliers.row_mut(i).assign(&flat_embeddings.row(idx));
444    }
445
446    // Compute number of new centroids
447    // k_update = max(1, ceil(num_outliers / max_points_per_centroid) * 4)
448    let target_k =
449        ((num_outliers as f64 / config.max_points_per_centroid as f64).ceil() as usize).max(1) * 4;
450    let k_update = target_k.min(num_outliers); // Can't have more centroids than points
451
452    // Cluster outliers to get new centroids
453    let kmeans_config = ComputeKmeansConfig {
454        kmeans_niters: config.kmeans_niters,
455        max_points_per_centroid: config.max_points_per_centroid,
456        seed: config.seed,
457        n_samples_kmeans: config.n_samples_kmeans,
458        num_partitions: Some(k_update),
459    };
460
461    // Convert outliers to vector of single-token "documents" for compute_kmeans
462    let outlier_docs: Vec<Array2<f32>> = outlier_indices
463        .iter()
464        .map(|&idx| flat_embeddings.slice(s![idx..idx + 1, ..]).to_owned())
465        .collect();
466
467    let new_centroids = compute_kmeans(&outlier_docs, &kmeans_config)?;
468    let k_new = new_centroids.nrows();
469
470    // Concatenate centroids
471    let new_num_centroids = existing_centroids.nrows() + k_new;
472    let mut final_centroids = Array2::<f32>::zeros((new_num_centroids, dim));
473    final_centroids
474        .slice_mut(s![..existing_centroids.nrows(), ..])
475        .assign(&existing_centroids);
476    final_centroids
477        .slice_mut(s![existing_centroids.nrows().., ..])
478        .assign(&new_centroids);
479
480    // Save updated centroids
481    final_centroids.write_npy(File::create(&centroids_path)?)?;
482
483    // Extend ivf_lengths.npy with zeros for new centroids
484    let ivf_lengths_path = index_path.join("ivf_lengths.npy");
485    if ivf_lengths_path.exists() {
486        let old_lengths: Array1<i32> = Array1::read_npy(File::open(&ivf_lengths_path)?)?;
487        let mut new_lengths = Array1::<i32>::zeros(new_num_centroids);
488        new_lengths
489            .slice_mut(s![..old_lengths.len()])
490            .assign(&old_lengths);
491        new_lengths.write_npy(File::create(&ivf_lengths_path)?)?;
492    }
493
494    // Update metadata.json num_partitions
495    let meta_path = index_path.join("metadata.json");
496    if meta_path.exists() {
497        let mut meta: serde_json::Value =
498            serde_json::from_reader(BufReader::new(File::open(&meta_path)?))?;
499
500        if let Some(obj) = meta.as_object_mut() {
501            obj.insert("num_partitions".to_string(), new_num_centroids.into());
502        }
503
504        serde_json::to_writer_pretty(BufWriter::new(File::create(&meta_path)?), &meta)?;
505    }
506
507    Ok(k_new)
508}
509
510// ============================================================================
511// Low-Level Index Update
512// ============================================================================
513
514/// Update an existing index with new documents.
515///
516/// # Arguments
517///
518/// * `embeddings` - List of new document embeddings, each of shape `[num_tokens, dim]`
519/// * `index_path` - Path to the existing index directory
520/// * `codec` - The loaded ResidualCodec for compression
521/// * `batch_size` - Optional batch size for processing (default: 50,000)
522/// * `update_threshold` - Whether to update the cluster threshold
523///
524/// # Returns
525///
526/// The number of new documents added
527pub fn update_index(
528    embeddings: &[Array2<f32>],
529    index_path: &str,
530    codec: &ResidualCodec,
531    batch_size: Option<usize>,
532    update_threshold: bool,
533) -> Result<usize> {
534    let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
535    let index_dir = Path::new(index_path);
536
537    // Load existing metadata
538    let metadata_path = index_dir.join("metadata.json");
539    let metadata: Metadata = serde_json::from_reader(BufReader::new(
540        File::open(&metadata_path)
541            .map_err(|e| Error::IndexLoad(format!("Failed to open metadata: {}", e)))?,
542    ))?;
543
544    let num_existing_chunks = metadata.num_chunks;
545    let old_num_documents = metadata.num_documents;
546    let old_total_embeddings = metadata.num_embeddings;
547    let num_centroids = codec.num_centroids();
548    let embedding_dim = codec.embedding_dim();
549    let nbits = metadata.nbits;
550
551    // Determine starting chunk index
552    let mut start_chunk_idx = num_existing_chunks;
553    let mut append_to_last = false;
554    let mut current_emb_offset = old_total_embeddings;
555
556    // Check if we should append to the last chunk (if it has < 2000 documents)
557    if start_chunk_idx > 0 {
558        let last_idx = start_chunk_idx - 1;
559        let last_meta_path = index_dir.join(format!("{}.metadata.json", last_idx));
560
561        if last_meta_path.exists() {
562            let last_meta: serde_json::Value =
563                serde_json::from_reader(BufReader::new(File::open(&last_meta_path).map_err(
564                    |e| Error::IndexLoad(format!("Failed to open chunk metadata: {}", e)),
565                )?))?;
566
567            if let Some(nd) = last_meta.get("num_documents").and_then(|x| x.as_u64()) {
568                if nd < 2000 {
569                    start_chunk_idx = last_idx;
570                    append_to_last = true;
571
572                    if let Some(off) = last_meta.get("embedding_offset").and_then(|x| x.as_u64()) {
573                        current_emb_offset = off as usize;
574                    } else {
575                        let embs_in_last = last_meta
576                            .get("num_embeddings")
577                            .and_then(|x| x.as_u64())
578                            .unwrap_or(0) as usize;
579                        current_emb_offset = old_total_embeddings - embs_in_last;
580                    }
581                }
582            }
583        }
584    }
585
586    // Process new documents
587    let num_new_documents = embeddings.len();
588    let n_new_chunks = (num_new_documents as f64 / batch_size as f64).ceil() as usize;
589
590    let mut new_codes_accumulated: Vec<Vec<usize>> = Vec::new();
591    let mut new_doclens_accumulated: Vec<i64> = Vec::new();
592    let mut all_residual_norms: Vec<f32> = Vec::new();
593
594    let progress = indicatif::ProgressBar::new(n_new_chunks as u64);
595    progress.set_message("Updating index...");
596
597    let packed_dim = embedding_dim * nbits / 8;
598
599    for i in 0..n_new_chunks {
600        let global_chunk_idx = start_chunk_idx + i;
601        let chk_offset = i * batch_size;
602        let chk_end = (chk_offset + batch_size).min(num_new_documents);
603        let chunk_docs = &embeddings[chk_offset..chk_end];
604
605        // Collect document lengths
606        let mut chk_doclens: Vec<i64> = chunk_docs.iter().map(|d| d.nrows() as i64).collect();
607        let total_tokens: usize = chk_doclens.iter().sum::<i64>() as usize;
608
609        // Concatenate all embeddings in the chunk for batch processing
610        let mut batch_embeddings = ndarray::Array2::<f32>::zeros((total_tokens, embedding_dim));
611        let mut offset = 0;
612        for doc in chunk_docs {
613            let n = doc.nrows();
614            batch_embeddings
615                .slice_mut(s![offset..offset + n, ..])
616                .assign(doc);
617            offset += n;
618        }
619
620        // BATCH: Compress all embeddings at once
621        let batch_codes = codec.compress_into_codes(&batch_embeddings);
622
623        // BATCH: Compute residuals using parallel subtraction
624        let mut batch_residuals = batch_embeddings;
625        {
626            let centroids = &codec.centroids;
627            batch_residuals
628                .axis_iter_mut(Axis(0))
629                .into_par_iter()
630                .zip(batch_codes.as_slice().unwrap().par_iter())
631                .for_each(|(mut row, &code)| {
632                    let centroid = centroids.row(code);
633                    row.iter_mut()
634                        .zip(centroid.iter())
635                        .for_each(|(r, c)| *r -= c);
636                });
637        }
638
639        // Collect residual norms if updating threshold
640        if update_threshold {
641            for row in batch_residuals.axis_iter(Axis(0)) {
642                let norm = row.dot(&row).sqrt();
643                all_residual_norms.push(norm);
644            }
645        }
646
647        // BATCH: Quantize all residuals at once
648        let batch_packed = codec.quantize_residuals(&batch_residuals)?;
649
650        // Convert to lists for chunk saving
651        let mut chk_codes_list: Vec<usize> = batch_codes.iter().copied().collect();
652        let mut chk_residuals_list: Vec<u8> = batch_packed.iter().copied().collect();
653
654        // Split codes back into per-document arrays for IVF building
655        let mut code_offset = 0;
656        for &len in &chk_doclens {
657            let len_usize = len as usize;
658            let codes: Vec<usize> = batch_codes
659                .slice(s![code_offset..code_offset + len_usize])
660                .iter()
661                .copied()
662                .collect();
663            new_codes_accumulated.push(codes);
664            new_doclens_accumulated.push(len);
665            code_offset += len_usize;
666        }
667
668        // Handle appending to last chunk
669        if i == 0 && append_to_last {
670            use ndarray_npy::ReadNpyExt;
671
672            let old_doclens_path = index_dir.join(format!("doclens.{}.json", global_chunk_idx));
673
674            if old_doclens_path.exists() {
675                let old_doclens: Vec<i64> =
676                    serde_json::from_reader(BufReader::new(File::open(&old_doclens_path)?))?;
677
678                let old_codes_path = index_dir.join(format!("{}.codes.npy", global_chunk_idx));
679                let old_residuals_path =
680                    index_dir.join(format!("{}.residuals.npy", global_chunk_idx));
681
682                let old_codes: Array1<i64> = Array1::read_npy(File::open(&old_codes_path)?)?;
683                let old_residuals: Array2<u8> = Array2::read_npy(File::open(&old_residuals_path)?)?;
684
685                // Prepend old data
686                let mut combined_codes: Vec<usize> =
687                    old_codes.iter().map(|&x| x as usize).collect();
688                combined_codes.extend(chk_codes_list);
689                chk_codes_list = combined_codes;
690
691                let mut combined_residuals: Vec<u8> = old_residuals.iter().copied().collect();
692                combined_residuals.extend(chk_residuals_list);
693                chk_residuals_list = combined_residuals;
694
695                let mut combined_doclens = old_doclens;
696                combined_doclens.extend(chk_doclens);
697                chk_doclens = combined_doclens;
698            }
699        }
700
701        // Save chunk data
702        {
703            use ndarray_npy::WriteNpyExt;
704
705            let codes_arr: Array1<i64> = chk_codes_list.iter().map(|&x| x as i64).collect();
706            let codes_path = index_dir.join(format!("{}.codes.npy", global_chunk_idx));
707            codes_arr.write_npy(File::create(&codes_path)?)?;
708
709            let num_tokens = chk_codes_list.len();
710            let residuals_arr =
711                Array2::from_shape_vec((num_tokens, packed_dim), chk_residuals_list)
712                    .map_err(|e| Error::Shape(format!("Failed to reshape residuals: {}", e)))?;
713            let residuals_path = index_dir.join(format!("{}.residuals.npy", global_chunk_idx));
714            residuals_arr.write_npy(File::create(&residuals_path)?)?;
715        }
716
717        // Save doclens
718        let doclens_path = index_dir.join(format!("doclens.{}.json", global_chunk_idx));
719        serde_json::to_writer(BufWriter::new(File::create(&doclens_path)?), &chk_doclens)?;
720
721        // Save chunk metadata
722        let chk_meta = serde_json::json!({
723            "num_documents": chk_doclens.len(),
724            "num_embeddings": chk_codes_list.len(),
725            "embedding_offset": current_emb_offset,
726        });
727        current_emb_offset += chk_codes_list.len();
728
729        let meta_path = index_dir.join(format!("{}.metadata.json", global_chunk_idx));
730        serde_json::to_writer_pretty(BufWriter::new(File::create(&meta_path)?), &chk_meta)?;
731
732        progress.inc(1);
733    }
734    progress.finish();
735
736    // Update cluster threshold if requested
737    if update_threshold && !all_residual_norms.is_empty() {
738        let norms = Array1::from_vec(all_residual_norms);
739        update_cluster_threshold(index_dir, &norms, old_total_embeddings)?;
740    }
741
742    // Build new partial IVF
743    let mut partition_pids_map: HashMap<usize, Vec<i64>> = HashMap::new();
744    let mut pid_counter = old_num_documents as i64;
745
746    for doc_codes in &new_codes_accumulated {
747        for &code in doc_codes {
748            partition_pids_map
749                .entry(code)
750                .or_default()
751                .push(pid_counter);
752        }
753        pid_counter += 1;
754    }
755
756    // Load old IVF and merge
757    {
758        use ndarray_npy::{ReadNpyExt, WriteNpyExt};
759
760        let ivf_path = index_dir.join("ivf.npy");
761        let ivf_lengths_path = index_dir.join("ivf_lengths.npy");
762
763        let old_ivf: Array1<i64> = if ivf_path.exists() {
764            Array1::read_npy(File::open(&ivf_path)?)?
765        } else {
766            Array1::zeros(0)
767        };
768
769        let old_ivf_lengths: Array1<i32> = if ivf_lengths_path.exists() {
770            Array1::read_npy(File::open(&ivf_lengths_path)?)?
771        } else {
772            Array1::zeros(num_centroids)
773        };
774
775        // Compute old offsets
776        let mut old_offsets = vec![0i64];
777        for &len in old_ivf_lengths.iter() {
778            old_offsets.push(old_offsets.last().unwrap() + len as i64);
779        }
780
781        // Merge IVF
782        let mut new_ivf_data: Vec<i64> = Vec::new();
783        let mut new_ivf_lengths: Vec<i32> = Vec::with_capacity(num_centroids);
784
785        for centroid_id in 0..num_centroids {
786            // Get old PIDs for this centroid
787            let old_start = old_offsets[centroid_id] as usize;
788            let old_len = if centroid_id < old_ivf_lengths.len() {
789                old_ivf_lengths[centroid_id] as usize
790            } else {
791                0
792            };
793
794            let mut pids: Vec<i64> = if old_len > 0 && old_start + old_len <= old_ivf.len() {
795                old_ivf.slice(s![old_start..old_start + old_len]).to_vec()
796            } else {
797                Vec::new()
798            };
799
800            // Add new PIDs
801            if let Some(new_pids) = partition_pids_map.get(&centroid_id) {
802                pids.extend(new_pids);
803            }
804
805            // Deduplicate and sort
806            pids.sort_unstable();
807            pids.dedup();
808
809            new_ivf_lengths.push(pids.len() as i32);
810            new_ivf_data.extend(pids);
811        }
812
813        // Save updated IVF
814        let new_ivf = Array1::from_vec(new_ivf_data);
815        new_ivf.write_npy(File::create(&ivf_path)?)?;
816
817        let new_lengths = Array1::from_vec(new_ivf_lengths);
818        new_lengths.write_npy(File::create(&ivf_lengths_path)?)?;
819    }
820
821    // Update global metadata
822    let new_total_chunks = start_chunk_idx + n_new_chunks;
823    let new_tokens_count: i64 = new_doclens_accumulated.iter().sum();
824    let num_embeddings = old_total_embeddings + new_tokens_count as usize;
825    let total_num_documents = old_num_documents + num_new_documents;
826
827    let new_avg_doclen = if total_num_documents > 0 {
828        let old_sum = metadata.avg_doclen * old_num_documents as f64;
829        (old_sum + new_tokens_count as f64) / total_num_documents as f64
830    } else {
831        0.0
832    };
833
834    let new_metadata = Metadata {
835        num_chunks: new_total_chunks,
836        nbits,
837        num_partitions: num_centroids,
838        num_embeddings,
839        avg_doclen: new_avg_doclen,
840        num_documents: total_num_documents,
841    };
842
843    serde_json::to_writer_pretty(BufWriter::new(File::create(&metadata_path)?), &new_metadata)?;
844
845    Ok(num_new_documents)
846}
847
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration carries the documented default values.
    #[test]
    fn test_update_config_default() {
        let UpdateConfig {
            batch_size,
            buffer_size,
            start_from_scratch,
            ..
        } = UpdateConfig::default();

        assert_eq!(batch_size, 50_000);
        assert_eq!(buffer_size, 100);
        assert_eq!(start_from_scratch, 999);
    }

    /// Only the point far from both centroids is reported as an outlier.
    #[test]
    fn test_find_outliers() {
        // Two centroids: (0,0) and (1,1).
        let centroid_values = vec![0.0, 0.0, 1.0, 1.0];
        let centroids = Array2::from_shape_vec((2, 2), centroid_values).unwrap();

        // Three points: near (0,0), near (1,1), and far away at (5,5).
        let point_values = vec![0.1, 0.1, 0.9, 0.9, 5.0, 5.0];
        let embeddings = Array2::from_shape_vec((3, 2), point_values).unwrap();

        // A threshold of 1.0 is also 1.0 once squared.
        let outliers = find_outliers(&embeddings, &centroids, 1.0);

        // The (5,5) point, at index 2, is the sole outlier.
        assert_eq!(outliers.len(), 1);
        assert_eq!(outliers[0], 2);
    }

    /// Saving then loading the buffer preserves document boundaries and content.
    #[test]
    fn test_buffer_roundtrip() {
        use tempfile::TempDir;

        let dir = TempDir::new().unwrap();

        // Three documents with 3, 2 and 5 rows of 4 columns, filled with
        // consecutive values starting at 0, 12 and 20 respectively.
        let make_doc = |rows: usize, start: usize| {
            let values: Vec<f32> = (start..start + rows * 4).map(|x| x as f32).collect();
            Array2::from_shape_vec((rows, 4), values).unwrap()
        };
        let embeddings = vec![make_doc(3, 0), make_doc(2, 12), make_doc(5, 20)];

        save_buffer(dir.path(), &embeddings).unwrap();
        let loaded = load_buffer(dir.path()).unwrap();

        // Three separate documents must come back, not one merged array.
        assert_eq!(loaded.len(), 3, "Should have 3 documents, not 1");
        assert_eq!(loaded[0].nrows(), 3, "First doc should have 3 rows");
        assert_eq!(loaded[1].nrows(), 2, "Second doc should have 2 rows");
        assert_eq!(loaded[2].nrows(), 5, "Third doc should have 5 rows");

        // Content must match document by document.
        for (got, want) in loaded.iter().zip(embeddings.iter()) {
            assert_eq!(got, want);
        }
    }

    /// `buffer_info.json` and the loaded buffer agree on the document count.
    #[test]
    fn test_buffer_info_matches_buffer_len() {
        use tempfile::TempDir;

        let dir = TempDir::new().unwrap();

        // Five documents with 2, 3, 4, 5 and 6 rows.
        let embeddings: Vec<Array2<f32>> = (2..7usize)
            .map(|rows| Array2::from_shape_fn((rows, 4), |(r, c)| (r * 4 + c) as f32))
            .collect();

        save_buffer(dir.path(), &embeddings).unwrap();

        let info_count = load_buffer_info(dir.path()).unwrap();
        let loaded = load_buffer(dir.path()).unwrap();

        assert_eq!(info_count, 5, "buffer_info should report 5 docs");
        assert_eq!(
            loaded.len(),
            5,
            "load_buffer should return 5 docs to match buffer_info"
        );
    }
}
next_plaid/update.rs

next_plaid/
update.rs