Skip to main content

repograph_core/search/
chunk.rs

1//! Turning a repo's git-tracked files into indexable chunks.
2//!
3//! Chunking is deliberately language-agnostic: a file is split into bounded
4//! windows of lines with a small overlap, each carrying a contextual prefix
5//! (`repo › relpath › Lstart-end`) so both the lexical index and the embedding
6//! model see where a chunk came from. Tree-sitter symbol-aware chunking is a
7//! later change; this keeps v1 shippable across every language.
8
9use std::path::Path;
10
11use git2::{ObjectType, Repository};
12
/// Upper bound, in bytes, on the size of a file we are willing to index
/// (1 MiB).
///
/// Mirrors codegraph's `maxFileSize` guard: anything bigger is almost always
/// a vendored asset, a minified bundle, or a generated blob, all of which
/// pollute retrieval without adding signal.
pub const MAX_FILE_BYTES: u64 = 1024 * 1024;

/// How many lines a single chunk window spans.
pub const CHUNK_LINES: usize = 40;

/// How many lines two consecutive chunks share. The overlap lets a construct
/// that straddles a window boundary still show up whole in at least one chunk.
pub const CHUNK_OVERLAP: usize = 10;
26
/// A single indexable window of one file's lines, bundled with the metadata
/// the store and the renderer rely on.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Chunk {
    /// Path relative to the repo root, using forward slashes.
    pub path: String,
    /// First line covered by the window (1-based, inclusive).
    pub start_line: u32,
    /// Last line covered by the window (1-based, inclusive).
    pub end_line: u32,
    /// The raw source lines; this is the snippet presented to the user.
    pub content: String,
    /// Contextual prefix placed ahead of the content before lexical indexing
    /// and embedding.
    pub prefix: String,
}

impl Chunk {
    /// Build the text handed to the lexical index and the embedding model:
    /// the contextual prefix, a single newline, then the raw content.
    #[must_use]
    pub fn index_text(&self) -> String {
        let Self {
            prefix, content, ..
        } = self;
        // One allocation sized for both parts plus the separating newline.
        let mut text = String::with_capacity(prefix.len() + 1 + content.len());
        text.push_str(prefix);
        text.push('\n');
        text.push_str(content);
        text
    }
}
51
/// One git-tracked file, resolved to the bytes currently in the working tree
/// and ready to be chunked.
///
/// `content_hash` is the git blob SHA of `text`; incremental reindexing
/// compares it to decide whether the file changed.
#[derive(Clone, Debug)]
pub struct TrackedFile {
    /// Path of the file relative to the repository root.
    pub path: String,
    /// Git blob SHA of `text`, used as the change-detection key.
    pub content_hash: String,
    /// Full file content, decoded as UTF-8.
    pub text: String,
}
61
62/// Enumerate the git-tracked files of `repo_path` eligible for indexing.
63///
64/// Eligible means tracked (present in the git index), under [`MAX_FILE_BYTES`],
65/// and valid UTF-8. Ignored and untracked files are excluded by construction —
66/// only index entries are walked. Files staged-deleted (gone from the working
67/// tree) are skipped.
68///
69/// `repo` is the already-opened repository; the caller owns it so HEAD can be
70/// inspected separately for the indexed-commit record.
71///
72/// # Errors
73///
74/// Returns the underlying [`git2::Error`] when the index cannot be read.
75pub fn tracked_files(repo: &Repository, repo_path: &Path) -> Result<Vec<TrackedFile>, git2::Error> {
76    let index = repo.index()?;
77    let mut out = Vec::new();
78    for i in 0..index.len() {
79        let Some(entry) = index.get(i) else {
80            continue;
81        };
82        let Ok(rel) = std::str::from_utf8(&entry.path) else {
83            continue; // non-UTF-8 path: skip rather than guess an encoding.
84        };
85        let rel = rel.replace('\\', "/");
86        let abs = repo_path.join(&rel);
87        let Ok(meta) = std::fs::metadata(&abs) else {
88            continue; // staged-deleted or unreadable: nothing to index.
89        };
90        if !meta.is_file() || meta.len() > MAX_FILE_BYTES {
91            continue;
92        }
93        let Ok(bytes) = std::fs::read(&abs) else {
94            continue;
95        };
96        let Ok(text) = String::from_utf8(bytes) else {
97            continue; // binary / non-UTF-8 content: not searchable text.
98        };
99        let content_hash = blob_hash(text.as_bytes());
100        out.push(TrackedFile {
101            path: rel,
102            content_hash,
103            text,
104        });
105    }
106    Ok(out)
107}
108
109/// Git blob SHA of `bytes` — the same identity git uses for file content. Reused
110/// as the incremental-reindex change key so no extra hashing dependency is
111/// needed. Falls back to a length tag only if libgit2 cannot hash (it does not
112/// touch the object database, so this effectively never fails).
113fn blob_hash(bytes: &[u8]) -> String {
114    git2::Oid::hash_object(ObjectType::Blob, bytes)
115        .map_or_else(|_| format!("len:{}", bytes.len()), |oid| oid.to_string())
116}
117
118/// Split a file's `text` into overlapping line-window [`Chunk`]s. An empty or
119/// whitespace-only file yields no chunks.
120#[must_use]
121pub fn chunk_file(repo: &str, path: &str, text: &str) -> Vec<Chunk> {
122    let lines: Vec<&str> = text.lines().collect();
123    if lines.iter().all(|l| l.trim().is_empty()) {
124        return Vec::new();
125    }
126    let stride = CHUNK_LINES.saturating_sub(CHUNK_OVERLAP).max(1);
127    let mut chunks = Vec::new();
128    let mut start = 0usize;
129    while start < lines.len() {
130        let end = (start + CHUNK_LINES).min(lines.len());
131        let content = lines[start..end].join("\n");
132        if !content.trim().is_empty() {
133            let start_line = u32::try_from(start + 1).unwrap_or(u32::MAX);
134            let end_line = u32::try_from(end).unwrap_or(u32::MAX);
135            let prefix = format!("{repo} › {path} › L{start_line}-{end_line}");
136            chunks.push(Chunk {
137                path: path.to_string(),
138                start_line,
139                end_line,
140                content,
141                prefix,
142            });
143        }
144        if end == lines.len() {
145            break;
146        }
147        start += stride;
148    }
149    chunks
150}
151
#[cfg(test)]
mod tests {
    // Fixtures are built from literal sizes and line counts, so the unwrap,
    // truncating-cast and format-collect lints are just noise in here.
    #![allow(
        clippy::unwrap_used,
        clippy::cast_possible_truncation,
        clippy::format_collect
    )]
    use super::*;
    use std::path::PathBuf;
    use tempfile::TempDir;

    /// Create a fresh repository inside a temp dir, write every `(relative
    /// path, body)` pair into it and stage the lot. The `TempDir` guard is
    /// handed back so the directory lives as long as the test needs it.
    fn init_repo_with(files: &[(&str, &str)]) -> (TempDir, PathBuf) {
        let guard = TempDir::new().unwrap();
        let root = guard.path().join("r");
        std::fs::create_dir_all(&root).unwrap();
        let repo = git2::Repository::init(&root).unwrap();
        for &(rel, body) in files {
            let target = root.join(rel);
            if let Some(parent) = target.parent() {
                std::fs::create_dir_all(parent).unwrap();
            }
            std::fs::write(&target, body).unwrap();
        }
        let mut index = repo.index().unwrap();
        index
            .add_all(["*"], git2::IndexAddOption::DEFAULT, None)
            .unwrap();
        index.write().unwrap();
        (guard, root)
    }

    /// Open the repository at `root` and run `tracked_files` over it.
    fn list_tracked(root: &std::path::Path) -> Vec<TrackedFile> {
        let repo = git2::Repository::open(root).unwrap();
        tracked_files(&repo, root).unwrap()
    }

    #[test]
    fn tracked_files_returns_added_text_files() {
        let (_guard, root) =
            init_repo_with(&[("src/a.rs", "fn a() {}\n"), ("README.md", "# hi\n")]);
        let mut found = list_tracked(&root);
        found.sort_by(|lhs, rhs| lhs.path.cmp(&rhs.path));
        let names: Vec<&str> = found.iter().map(|f| f.path.as_str()).collect();
        assert_eq!(names, vec!["README.md", "src/a.rs"]);
        assert!(found.iter().all(|f| !f.content_hash.is_empty()));
    }

    #[test]
    fn tracked_files_excludes_untracked() {
        let (_guard, root) = init_repo_with(&[("tracked.rs", "fn t() {}\n")]);
        // Written after staging, so it never enters the index.
        std::fs::write(root.join("untracked.rs"), "fn u() {}\n").unwrap();
        let found = list_tracked(&root);
        let names: Vec<&str> = found.iter().map(|f| f.path.as_str()).collect();
        assert_eq!(names, vec!["tracked.rs"]);
    }

    #[test]
    fn tracked_files_skips_oversize_and_binary() {
        let oversize = "x".repeat((MAX_FILE_BYTES + 1) as usize);
        let fixtures = [
            ("ok.txt", "small\n"),
            ("big.txt", oversize.as_str()),
            ("bin.dat", "\u{0}"),
        ];
        let (_guard, root) = init_repo_with(&fixtures);
        // Swap bin.dat's content for real non-UTF-8 bytes once it is staged.
        std::fs::write(root.join("bin.dat"), [0xff, 0xfe, 0x00]).unwrap();
        let found = list_tracked(&root);
        let names: Vec<&str> = found.iter().map(|f| f.path.as_str()).collect();
        assert_eq!(names, vec!["ok.txt"], "oversize + binary skipped");
    }

    #[test]
    fn blob_hash_matches_git_blob_identity() {
        // The SHA of the empty blob is a well-known git constant.
        let empty_blob = blob_hash(b"");
        assert_eq!(empty_blob, "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391");
    }

    #[test]
    fn chunk_file_windows_with_overlap_and_prefix() {
        let body: String = (1..=100).map(|n| format!("line{n}\n")).collect();
        let chunks = chunk_file("repo", "src/big.rs", &body);
        assert!(chunks.len() > 1, "long file splits into multiple chunks");

        let first = &chunks[0];
        assert_eq!(first.start_line, 1);
        assert_eq!(first.end_line, CHUNK_LINES as u32);

        // The second window begins one stride in, keeping the overlap.
        let stride = (CHUNK_LINES - CHUNK_OVERLAP) as u32;
        assert_eq!(chunks[1].start_line, stride + 1);

        assert!(first.prefix.contains("repo › src/big.rs › L1-"));
        assert!(first.index_text().starts_with("repo › src/big.rs"));
    }

    #[test]
    fn chunk_file_empty_yields_nothing() {
        let chunks = chunk_file("r", "empty.txt", "   \n\n");
        assert!(chunks.is_empty());
    }

    #[test]
    fn chunk_file_short_file_is_single_chunk() {
        let chunks = chunk_file("r", "a.rs", "fn a() {}\nfn b() {}\n");
        assert_eq!(chunks.len(), 1);
        let only = &chunks[0];
        assert_eq!(only.start_line, 1);
        assert_eq!(only.end_line, 2);
    }
}