Skip to main content

codesearch/utils/
mod.rs

1//! Utility functions and helpers for codesearch
2//!
3//! This module contains reusable utility functions used across the codebase.
4
5use crate::chunker::Chunk;
6use std::collections::HashMap;
7
8/// Group chunks by their file path
9///
10/// This is a common pattern used in indexing and search operations.
11/// It takes an iterator of (chunk, value) pairs and groups them by the chunk's path.
12///
13/// # Arguments
14/// * `items` - Iterator of (chunk, value) pairs to group
15///
16/// # Returns
17/// * HashMap mapping file paths (as strings) to vectors of values
18pub fn group_chunks_by_path<T>(items: impl Iterator<Item = (Chunk, T)>) -> HashMap<String, Vec<T>>
19where
20    T: Clone,
21{
22    items.fold(HashMap::new(), |mut acc, (chunk, value)| {
23        acc.entry(chunk.path).or_default().push(value);
24        acc
25    })
26}
27
28/// Group chunks by their file path with pre-allocated capacity
29///
30/// Same as `group_chunks_by_path` but allows pre-allocating HashMap capacity
31/// for better performance when the number of files is known.
32///
33/// # Arguments
34/// * `items` - Iterator of (chunk, value) pairs to group
35/// * `capacity` - Expected number of unique file paths
36///
37/// # Returns
38/// * HashMap mapping file paths (as strings) to vectors of values
39pub fn group_chunks_by_path_with_capacity<T>(
40    items: impl Iterator<Item = (Chunk, T)>,
41    capacity: usize,
42) -> HashMap<String, Vec<T>>
43where
44    T: Clone,
45{
46    let mut map: HashMap<String, Vec<T>> = HashMap::with_capacity(capacity);
47    for (chunk, value) in items {
48        map.entry(chunk.path).or_default().push(value);
49    }
50    map
51}
52
53/// Group embedded chunks by their file path
54///
55/// Specialized version for embedded chunks (which already contain the chunk data).
56///
57/// # Arguments
58/// * `embedded_chunks` - Slice of embedded chunks to group
59/// * `chunk_ids` - Slice of chunk IDs corresponding to the embedded chunks
60///
61/// # Returns
62/// * HashMap mapping file paths (as strings) to vectors of chunk IDs
63pub fn group_embedded_chunks_by_path(
64    embedded_chunks: &[crate::embed::EmbeddedChunk],
65    chunk_ids: &[u32],
66) -> HashMap<String, Vec<u32>> {
67    let capacity = embedded_chunks.len() / 10; // Estimate: ~10 chunks per file
68    let mut map: HashMap<String, Vec<u32>> = HashMap::with_capacity(capacity.max(1));
69
70    for (chunk, chunk_id) in embedded_chunks.iter().zip(chunk_ids.iter()) {
71        map.entry(chunk.chunk.path.clone())
72            .or_default()
73            .push(*chunk_id);
74    }
75    map
76}
77
#[cfg(test)]
mod tests {
    use super::*;
    use crate::chunker::ChunkKind;

    /// Values whose chunks share a path land in one bucket, in input order.
    #[test]
    fn test_group_chunks_by_path() {
        // Two chunks point at `path1.rs`; the third is alone in `path2.rs`.
        let items = vec![
            (
                Chunk::new(
                    "content1".to_owned(),
                    1,
                    1,
                    ChunkKind::Other,
                    "path1.rs".to_owned(),
                ),
                1,
            ),
            (
                Chunk::new(
                    "content2".to_owned(),
                    2,
                    2,
                    ChunkKind::Other,
                    "path1.rs".to_owned(),
                ),
                2,
            ),
            (
                Chunk::new(
                    "content3".to_owned(),
                    3,
                    3,
                    ChunkKind::Other,
                    "path2.rs".to_owned(),
                ),
                3,
            ),
        ];

        let by_path = group_chunks_by_path(items.into_iter());

        assert_eq!(by_path.len(), 2);
        assert_eq!(by_path["path1.rs"], [1, 2]);
        assert_eq!(by_path["path2.rs"], [3]);
    }

    /// The pre-allocating variant groups the same way as the plain one.
    #[test]
    fn test_group_chunks_by_path_with_capacity() {
        // Each chunk has its own path, so every bucket holds a single value.
        let items = vec![
            (
                Chunk::new(
                    "content1".to_owned(),
                    1,
                    1,
                    ChunkKind::Other,
                    "path1.rs".to_owned(),
                ),
                1,
            ),
            (
                Chunk::new(
                    "content2".to_owned(),
                    2,
                    2,
                    ChunkKind::Other,
                    "path2.rs".to_owned(),
                ),
                2,
            ),
        ];

        let by_path = group_chunks_by_path_with_capacity(items.into_iter(), 2);

        assert_eq!(by_path.len(), 2);
        assert_eq!(by_path["path1.rs"], [1]);
        assert_eq!(by_path["path2.rs"], [2]);
    }
}