Skip to main content

gobby_code/index/
chunker.rs

1//! Content chunking: 100-line chunks with 10-line overlap.
2//! Ports logic from src/gobby/code_index/chunker.py.
3//!
4//! This remains gcode-owned because BM25 content indexing stores
5//! line-based `ContentChunk` records with project, path, line range, language,
6//! and timestamp fields. The generic `gobby_core::indexing::Chunk` and
7//! `ChunkIdentity` primitives model byte ranges with opaque metadata, so
8//! composing them here would hide a domain-specific projection rather than
9//! remove shared foundation logic. gcode also derives incremental state from
10//! PostgreSQL `indexed_files.content_hash` rows instead of consuming core
11//! `IndexEvent` snapshots.
12
13use crate::models::ContentChunk;
14
15const CHUNK_SIZE: usize = 100;
16const CHUNK_OVERLAP: usize = 10;
17
18/// Split file content into overlapping chunks for FTS indexing.
19pub fn chunk_file_content(
20    source: &[u8],
21    rel_path: &str,
22    project_id: &str,
23    language: Option<&str>,
24) -> Vec<ContentChunk> {
25    let text = String::from_utf8_lossy(source);
26    let lines: Vec<&str> = text.split('\n').collect();
27    if lines.is_empty() {
28        return Vec::new();
29    }
30
31    let step = CHUNK_SIZE.saturating_sub(CHUNK_OVERLAP).max(1);
32    let mut chunks = Vec::new();
33    let mut chunk_index: usize = 0;
34    let mut start = 0;
35
36    while start < lines.len() {
37        let end = (start + CHUNK_SIZE).min(lines.len());
38        let chunk_content: String = lines[start..end].join("\n");
39
40        if !chunk_content.trim().is_empty() {
41            chunks.push(ContentChunk {
42                id: ContentChunk::make_id(project_id, rel_path, chunk_index),
43                project_id: project_id.to_string(),
44                file_path: rel_path.to_string(),
45                chunk_index,
46                line_start: start + 1,
47                line_end: end,
48                content: chunk_content,
49                language: language.unwrap_or("unknown").to_string(),
50                created_at: epoch_secs_str(),
51            });
52            chunk_index += 1;
53        }
54
55        if end >= lines.len() {
56            break;
57        }
58        start += step;
59    }
60
61    chunks
62}
63
/// Current wall-clock time as whole seconds since the Unix epoch, in decimal.
///
/// Built on `std::time` only (no chrono dependency). If the system clock
/// reports a moment before the epoch, the result is `"0"`.
fn epoch_secs_str() -> String {
    use std::time::{Duration, SystemTime, UNIX_EPOCH};

    let since_epoch = match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed,
        // Clock is set before 1970: fall back to a zero duration.
        Err(_) => Duration::default(),
    };
    since_epoch.as_secs().to_string()
}
73
#[cfg(test)]
mod tests {
    /// Guards the ownership decision described in the module docs: the file
    /// must keep its explanatory note and must not import the generic core
    /// indexing primitives.
    ///
    /// Every phrase searched for is assembled at runtime from fragments so
    /// that this test's own source text never matches it.
    #[test]
    fn chunker_stays_gcode_owned_with_documented_narrowing() {
        let this_file: &str = include_str!("chunker.rs");

        // The module docs must still carry the rationale phrase.
        let mut expected_doc = String::from("line-based `ContentChunk`");
        expected_doc.push_str(" records");
        assert!(this_file.contains(expected_doc.as_str()));

        // None of the core indexing items may be imported here.
        let import_prefix = "use gobby_core";
        let banned_tails = [
            "::indexing::Chunk",
            "::indexing::ChunkIdentity",
            "::indexing::IndexEvent",
            "::indexing::index_events_from_hashes",
        ];
        for tail in banned_tails {
            let banned = format!("{import_prefix}{tail}");
            assert!(!this_file.contains(banned.as_str()));
        }
    }
}