Skip to main content

rust_memex/
progress.rs

1//! Smart progress tracking for batch indexing operations.
2//!
3//! Provides three-phase progress display:
4//! 1. Pre-scan: Count files and estimate chunks
5//! 2. Calibration: Measure embedding speed on first file
6//! 3. Indexing: Progress bar with ETA based on calibration
7//!
8//! This module is only available when the `cli` feature is enabled.
9
10use indicatif::{ProgressBar, ProgressStyle};
11use std::path::PathBuf;
12use std::time::Instant;
13
/// Render a byte count as a short human-readable string.
///
/// Uses binary units (1 KB = 1024 B). Values of at least one KB are shown
/// with one decimal place in the largest unit that fits (KB, MB or GB);
/// smaller values are shown as a whole number of bytes.
fn format_bytes(bytes: u64) -> String {
    // Largest unit first so the first match is the best fit.
    const UNITS: [(u64, &str); 3] = [
        (1024 * 1024 * 1024, "GB"),
        (1024 * 1024, "MB"),
        (1024, "KB"),
    ];

    match UNITS.iter().find(|(scale, _)| bytes >= *scale) {
        Some((scale, suffix)) => format!("{:.1} {}", bytes as f64 / *scale as f64, suffix),
        None => format!("{} B", bytes),
    }
}
30
31/// Progress tracker for batch indexing operations.
32///
33/// Implements a three-phase progress display:
34/// 1. Pre-scan phase: counts files and estimates total chunks
35/// 2. Calibration phase: measures embedding speed on first file
36/// 3. Indexing phase: displays progress bar with ETA
37pub struct IndexProgressTracker {
38    // Pre-scan phase
39    /// Total number of files to process
40    pub total_files: usize,
41    /// Total size of all files in bytes
42    pub total_bytes: u64,
43    /// Average chunk size in characters (used for estimation)
44    pub chunk_size: usize,
45    /// Estimated total chunks based on file sizes
46    pub estimated_chunks: usize,
47
48    // Calibration phase
49    /// When calibration started
50    calibration_start: Option<Instant>,
51    /// Measured chunks per second (EMA - updated dynamically)
52    pub chunks_per_sec: Option<f64>,
53    /// Name of the embedder model (for display)
54    pub embedder_model: Option<String>,
55    /// Whether calibration is complete
56    calibration_done: bool,
57
58    // Dynamic speed tracking (rolling average)
59    /// Last time we updated the speed measurement
60    last_speed_update: Option<Instant>,
61    /// Chunks processed since last speed update
62    chunks_since_update: usize,
63    /// EMA smoothing factor (0.3 = 30% new, 70% old)
64    speed_ema_alpha: f64,
65
66    // Progress tracking
67    /// Number of chunks processed so far
68    pub processed_chunks: usize,
69    /// Number of files processed (indexed + skipped + failed)
70    pub processed_files: usize,
71    /// Number of files skipped (duplicates)
72    pub skipped_files: usize,
73    /// Number of files that failed to index
74    pub failed_files: usize,
75    /// Number of files successfully indexed
76    pub indexed_files: usize,
77
78    // Progress bar
79    progress_bar: Option<ProgressBar>,
80}
81
82impl IndexProgressTracker {
83    /// Create a new progress tracker by pre-scanning the given paths.
84    ///
85    /// This performs Phase 1: Pre-scan to count files and estimate chunks.
86    pub fn pre_scan(paths: &[PathBuf]) -> Self {
87        let total_bytes: u64 = paths
88            .iter()
89            .filter_map(|p| std::fs::metadata(p).ok())
90            .map(|m| m.len())
91            .sum();
92
93        // Rough estimate: 1 chunk per 500 chars, avg 1 byte = 1 char for text files
94        // This is conservative - actual chunk count depends on slice mode
95        let chunk_size = 500;
96        let estimated_chunks = (total_bytes as usize) / chunk_size;
97
98        Self {
99            total_files: paths.len(),
100            total_bytes,
101            chunk_size,
102            estimated_chunks,
103            calibration_start: None,
104            chunks_per_sec: None,
105            embedder_model: None,
106            calibration_done: false,
107            last_speed_update: None,
108            chunks_since_update: 0,
109            speed_ema_alpha: 0.3, // 30% weight for new measurements
110            processed_chunks: 0,
111            processed_files: 0,
112            skipped_files: 0,
113            failed_files: 0,
114            indexed_files: 0,
115            progress_bar: None,
116        }
117    }
118
119    /// Display Phase 1 pre-scan summary to stderr.
120    pub fn display_pre_scan(&self) {
121        eprintln!();
122        eprintln!("Phase 1: Pre-scan");
123        eprintln!("  |-- Files: {}", self.total_files);
124        eprintln!("  |-- Total size: {}", format_bytes(self.total_bytes));
125        eprintln!(
126            "  `-- Est. chunks: ~{} (@ {} chars/chunk)",
127            self.estimated_chunks, self.chunk_size
128        );
129    }
130
131    /// Start the calibration phase (Phase 2).
132    ///
133    /// Call this before processing the first file.
134    pub fn start_calibration(&mut self) {
135        self.calibration_start = Some(Instant::now());
136        eprintln!();
137        eprintln!("Phase 2: Calibration (first file)...");
138    }
139
140    /// Finish calibration with results from first file.
141    ///
142    /// # Arguments
143    /// * `chunks_processed` - Number of chunks indexed in first file
144    /// * `model` - Name of the embedder model used
145    pub fn finish_calibration(&mut self, chunks_processed: usize, model: &str) {
146        if let Some(start) = self.calibration_start {
147            let elapsed = start.elapsed();
148            if elapsed.as_secs_f64() > 0.0 && chunks_processed > 0 {
149                self.chunks_per_sec = Some(chunks_processed as f64 / elapsed.as_secs_f64());
150            }
151            self.embedder_model = Some(model.to_string());
152            self.calibration_done = true;
153            // Initialize dynamic speed tracking
154            self.last_speed_update = Some(Instant::now());
155            self.chunks_since_update = 0;
156
157            eprintln!(
158                "  `-- Speed: {:.1} chunks/sec ({}) [dynamic]",
159                self.chunks_per_sec.unwrap_or(0.0),
160                model
161            );
162        }
163    }
164
165    /// Check if calibration has been completed.
166    pub fn is_calibrated(&self) -> bool {
167        self.calibration_done
168    }
169
170    /// Start the progress bar for Phase 3: Indexing.
171    ///
172    /// Creates a progress bar based on estimated chunks.
173    pub fn start_progress_bar(&mut self) {
174        let pb = ProgressBar::new(self.estimated_chunks as u64);
175        pb.set_style(
176            ProgressStyle::default_bar()
177                .template(
178                    "{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} chunks | ETA: {eta} | {msg}",
179                )
180                .expect("Invalid progress bar template")
181                .progress_chars("#>-"),
182        );
183
184        eprintln!();
185        eprintln!("Phase 3: Indexing...");
186        self.progress_bar = Some(pb);
187    }
188
189    /// Increment the chunk counter and update progress bar.
190    ///
191    /// Also updates the rolling average speed (EMA) every 2 seconds
192    /// to reflect actual embedding performance after GPU warm-up.
193    pub fn inc_chunks(&mut self, count: usize) {
194        self.processed_chunks += count;
195        self.chunks_since_update += count;
196
197        // Update speed every 2 seconds using EMA
198        if let Some(last_update) = self.last_speed_update {
199            let elapsed = last_update.elapsed().as_secs_f64();
200            if elapsed >= 2.0 && self.chunks_since_update > 0 {
201                let current_speed = self.chunks_since_update as f64 / elapsed;
202
203                // Exponential Moving Average: new = alpha * current + (1-alpha) * old
204                self.chunks_per_sec = Some(match self.chunks_per_sec {
205                    Some(old_speed) => {
206                        self.speed_ema_alpha * current_speed
207                            + (1.0 - self.speed_ema_alpha) * old_speed
208                    }
209                    None => current_speed,
210                });
211
212                // Reset for next measurement window
213                self.last_speed_update = Some(Instant::now());
214                self.chunks_since_update = 0;
215            }
216        }
217
218        if let Some(ref pb) = self.progress_bar {
219            pb.set_position(self.processed_chunks as u64);
220        }
221    }
222
223    /// Record a successfully indexed file.
224    ///
225    /// # Arguments
226    /// * `chunks` - Number of chunks created from this file
227    pub fn file_indexed(&mut self, chunks: usize) {
228        self.indexed_files += 1;
229        self.processed_files += 1;
230        self.inc_chunks(chunks);
231    }
232
233    /// Record a skipped file (duplicate).
234    pub fn file_skipped(&mut self) {
235        self.skipped_files += 1;
236        self.processed_files += 1;
237    }
238
239    /// Record a failed file.
240    pub fn file_failed(&mut self) {
241        self.failed_files += 1;
242        self.processed_files += 1;
243    }
244
245    /// Set the current message on the progress bar.
246    pub fn set_message(&mut self, msg: &str) {
247        if let Some(ref pb) = self.progress_bar {
248            pb.set_message(msg.to_string());
249        }
250    }
251
252    /// Finish the progress bar with completion message.
253    pub fn finish(&mut self) {
254        if let Some(ref pb) = self.progress_bar {
255            pb.finish_with_message("Complete");
256        }
257    }
258
259    /// Display final summary after indexing is complete.
260    pub fn display_summary(&self) {
261        eprintln!();
262        eprintln!("Indexing complete:");
263        eprintln!("  |-- Chunks indexed: {}", self.processed_chunks);
264        eprintln!("  |-- Files processed: {}", self.processed_files);
265        eprintln!("  |   |-- Indexed: {}", self.indexed_files);
266        if self.skipped_files > 0 {
267            eprintln!("  |   |-- Skipped (duplicate): {}", self.skipped_files);
268        }
269        if self.failed_files > 0 {
270            eprintln!("  |   `-- Failed: {}", self.failed_files);
271        }
272        if let Some(speed) = self.chunks_per_sec {
273            eprintln!("  `-- Avg speed: {:.1} chunks/sec", speed);
274        }
275    }
276
277    /// Adjust estimated chunks based on actual chunk count from calibration file.
278    ///
279    /// This improves ETA accuracy after calibration by using the actual
280    /// bytes-to-chunks ratio observed in the first file.
281    pub fn adjust_estimate(&mut self, file_bytes: u64, actual_chunks: usize) {
282        if file_bytes > 0 && actual_chunks > 0 {
283            // Calculate actual bytes per chunk ratio
284            let bytes_per_chunk = file_bytes as f64 / actual_chunks as f64;
285            // Remaining bytes to process
286            let remaining_bytes = self.total_bytes.saturating_sub(file_bytes);
287            // Estimate remaining chunks based on actual ratio
288            let remaining_chunks = (remaining_bytes as f64 / bytes_per_chunk) as usize;
289            // Update total estimate
290            self.estimated_chunks = actual_chunks + remaining_chunks;
291
292            // Update progress bar length if active
293            if let Some(ref pb) = self.progress_bar {
294                pb.set_length(self.estimated_chunks as u64);
295            }
296        }
297    }
298}
299
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Tracker built from an empty path list, for tests that only exercise counters.
    fn empty_tracker() -> IndexProgressTracker {
        let no_paths: Vec<PathBuf> = Vec::new();
        IndexProgressTracker::pre_scan(&no_paths)
    }

    /// Each unit boundary renders with the expected suffix and precision.
    #[test]
    fn test_format_bytes() {
        let cases: [(u64, &str); 6] = [
            (0, "0 B"),
            (500, "500 B"),
            (1024, "1.0 KB"),
            (1536, "1.5 KB"),
            (1024 * 1024, "1.0 MB"),
            (1024 * 1024 * 1024, "1.0 GB"),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(format_bytes(*input), *expected);
        }
    }

    /// Scanning nothing yields all-zero totals.
    #[test]
    fn test_pre_scan_empty() {
        let scanned = IndexProgressTracker::pre_scan(&[]);
        assert_eq!(scanned.total_files, 0);
        assert_eq!(scanned.total_bytes, 0);
        assert_eq!(scanned.estimated_chunks, 0);
    }

    /// File sizes are summed and converted into a chunk estimate.
    #[test]
    fn test_pre_scan_with_files() {
        let dir = TempDir::new().unwrap();

        // Two files with known sizes: 1000 bytes and 500 bytes.
        let first = dir.path().join("file1.txt");
        std::fs::write(&first, [b'a'; 1000]).unwrap();
        let second = dir.path().join("file2.txt");
        std::fs::write(&second, [b'b'; 500]).unwrap();

        let scanned = IndexProgressTracker::pre_scan(&[first, second]);

        assert_eq!(scanned.total_files, 2);
        assert_eq!(scanned.total_bytes, 1500);
        // 1500 bytes / 500 chars per chunk = 3 estimated chunks
        assert_eq!(scanned.estimated_chunks, 3);
    }

    /// Per-outcome counters and the chunk total add up as files are recorded.
    #[test]
    fn test_file_tracking() {
        let mut progress = empty_tracker();

        // Two indexed files (10 + 5 chunks), one skipped, one failed.
        progress.file_indexed(10);
        progress.file_indexed(5);
        progress.file_skipped();
        progress.file_failed();

        assert_eq!(progress.processed_files, 4);
        assert_eq!(progress.indexed_files, 2);
        assert_eq!(progress.skipped_files, 1);
        assert_eq!(progress.failed_files, 1);
        assert_eq!(progress.processed_chunks, 15);
    }

    /// Calibration flips the flag, records a speed and stores the model name.
    #[test]
    fn test_calibration_flow() {
        let mut progress = empty_tracker();
        assert!(!progress.is_calibrated());

        progress.start_calibration();
        // Let a little wall-clock time pass so the measured duration is non-zero.
        std::thread::sleep(std::time::Duration::from_millis(10));
        progress.finish_calibration(100, "test-model");

        assert!(progress.is_calibrated());
        assert!(progress.chunks_per_sec.is_some());
        assert_eq!(progress.embedder_model, Some("test-model".to_string()));
    }

    /// The estimate is recomputed from the observed bytes-per-chunk ratio.
    #[test]
    fn test_adjust_estimate() {
        let mut progress = empty_tracker();
        progress.total_bytes = 10000;
        progress.estimated_chunks = 20;

        // First file: 1000 bytes -> 5 chunks, i.e. 200 bytes per chunk.
        // Remaining 9000 bytes -> 45 chunks, so the new total is 5 + 45 = 50.
        progress.adjust_estimate(1000, 5);

        assert_eq!(progress.estimated_chunks, 50);
    }
}