greppy/parse/
chunker.rs

1use crate::core::config::{CHUNK_MAX_LINES, CHUNK_OVERLAP};
2use crate::parse::walker::detect_language;
3use std::path::Path;
4
/// One contiguous, line-based slice of a source file, ready to be indexed.
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Path of the file this chunk was cut from.
    pub path: String,
    /// Text of the chunk.
    pub content: String,
    /// Name of a symbol detected inside the chunk, if any.
    pub symbol_name: Option<String>,
    /// Kind of the detected symbol, if any.
    pub symbol_type: Option<String>,
    /// Line number of the first line of the chunk.
    pub start_line: usize,
    /// Line number of the last line of the chunk.
    pub end_line: usize,
    /// Language label associated with the file.
    pub language: String,
    /// Hash associated with the file the chunk came from.
    pub file_hash: String,
}

impl Chunk {
    /// Builds the chunk identifier in the form `path:start_line:end_line`.
    pub fn id(&self) -> String {
        let first = self.start_line.to_string();
        let last = self.end_line.to_string();

        [self.path.as_str(), first.as_str(), last.as_str()].join(":")
    }
}
24
25/// Chunk a file into indexable pieces
26pub fn chunk_file(path: &Path, content: &str) -> Vec<Chunk> {
27    let language = detect_language(path);
28    let file_hash = compute_hash(content);
29    let path_str = path.to_string_lossy().to_string();
30
31    let lines: Vec<&str> = content.lines().collect();
32
33    if lines.is_empty() {
34        return Vec::new();
35    }
36
37    let mut chunks = Vec::new();
38    let mut start = 0;
39
40    while start < lines.len() {
41        let end = (start + CHUNK_MAX_LINES).min(lines.len());
42        let chunk_content = lines[start..end].join("\n");
43
44        // Try to extract symbol name from first non-empty line
45        let (symbol_name, symbol_type) = extract_symbol(&lines[start..end]);
46
47        chunks.push(Chunk {
48            path: path_str.clone(),
49            content: chunk_content,
50            symbol_name,
51            symbol_type,
52            start_line: start + 1, // 1-indexed
53            end_line: end,
54            language: language.clone(),
55            file_hash: file_hash.clone(),
56        });
57
58        if end >= lines.len() {
59            break;
60        }
61
62        start = end.saturating_sub(CHUNK_OVERLAP);
63    }
64
65    chunks
66}
67
68/// Compute hash of content
69fn compute_hash(content: &str) -> String {
70    let hash = xxhash_rust::xxh3::xxh3_64(content.as_bytes());
71    format!("{:016x}", hash)
72}
73
74/// Extract symbol name and type from code lines (simple heuristic)
75fn extract_symbol(lines: &[&str]) -> (Option<String>, Option<String>) {
76    for line in lines {
77        let trimmed = line.trim();
78
79        // Function patterns
80        if let Some(name) = extract_function_name(trimmed) {
81            return (Some(name), Some("function".to_string()));
82        }
83
84        // Class patterns
85        if let Some(name) = extract_class_name(trimmed) {
86            return (Some(name), Some("class".to_string()));
87        }
88
89        // Method patterns
90        if let Some(name) = extract_method_name(trimmed) {
91            return (Some(name), Some("method".to_string()));
92        }
93    }
94
95    (None, None)
96}
97
/// Extracts a function name from a single (already trimmed) source line.
///
/// Recognised shapes:
/// - keyword declarations: `fn name(`, `function name(`, `function* name(`,
///   `def name(`, `func name(`, and the `export [default] [async] function`
///   variants;
/// - Go methods with a receiver: `func (r *Recv) name(`;
/// - callable bindings: `const|let|var|export const name = ...` when the line
///   also contains `=>` or `function`.
///
/// Generic parameter lists (`name<T>`, `name[T any]`) and type annotations
/// (`name: Type =`) are not part of the returned name. Returns `None` when
/// the line matches no pattern or no name can be found.
fn extract_function_name(line: &str) -> Option<String> {
    // Reduces the text following a declaration keyword to the bare name.
    fn declared_name(tail: &str) -> Option<String> {
        let head = tail
            .split('(')
            .next()?
            .trim()
            // Generator marker in `function *gen()`.
            .trim_start_matches('*')
            .trim_start();

        // Drop generic parameters. Requiring `idx > 0` keeps operator-style
        // names such as `<=>` or `[]` intact.
        let head = match head.find(|c: char| c == '<' || c == '[') {
            Some(idx) if idx > 0 => &head[..idx],
            _ => head,
        };

        head.split_whitespace().next().map(String::from)
    }

    const DECLARATIONS: [&str; 9] = [
        "fn ",
        "function ",
        "function* ",
        "def ",
        "func ",
        "export function ",
        "export async function ",
        "export default function ",
        "export default async function ",
    ];

    for keyword in DECLARATIONS.iter() {
        if let Some(tail) = line.strip_prefix(*keyword) {
            let tail = tail.trim_start();

            // Go method: skip the `(receiver)` group that precedes the name.
            if *keyword == "func " && tail.starts_with('(') {
                let close = tail.find(')')?;
                let after = &tail[close + 1..];
                // Without a following parameter list this is not a method
                // declaration (e.g. an anonymous function literal).
                return if after.contains('(') {
                    declared_name(after)
                } else {
                    None
                };
            }

            return declared_name(tail);
        }
    }

    const BINDINGS: [&str; 4] = ["const ", "let ", "var ", "export const "];

    for keyword in BINDINGS.iter() {
        if let Some(tail) = line.strip_prefix(*keyword) {
            // Only bindings that look like they hold a function count.
            if !(line.contains("=>") || line.contains("function")) {
                return None;
            }

            // The name ends at the assignment or at a type annotation.
            return tail
                .split(|c: char| c == '=' || c == ':')
                .next()?
                .split_whitespace()
                .next()
                .map(String::from);
        }
    }

    None
}
163
/// Extracts a type name from a single (already trimmed) source line.
///
/// Recognised shapes:
/// - `class Name` (followed by a space, `{`, `(`, `:`, `<` or `;`);
/// - `struct Name`, optionally preceded by `pub` or `pub(...)`;
/// - `impl Name` / `impl<...> Name`, where the leading generic parameter
///   list may contain bounds, lifetimes and nested angle brackets.
///
/// For `impl Trait for Type` the first path after `impl` (the trait) is
/// returned. Returns `None` when the line matches no pattern, when an `impl`
/// generic list is not closed on this line, or when the name would be empty.
fn extract_class_name(line: &str) -> Option<String> {
    // Returns the text before the first delimiter, or `None` if it is empty.
    fn leading_name(rest: &str, delimiters: &[char]) -> Option<String> {
        let name = rest.split(delimiters).next()?.trim();
        if name.is_empty() {
            None
        } else {
            Some(name.to_string())
        }
    }

    // Removes a leading `pub ` or `pub(...)` visibility qualifier.
    fn strip_visibility(item: &str) -> &str {
        if let Some(rest) = item.strip_prefix("pub ") {
            rest.trim_start()
        } else if item.starts_with("pub(") {
            match item.find(')') {
                Some(close) => item[close + 1..].trim_start(),
                None => item,
            }
        } else {
            item
        }
    }

    // Skips a balanced leading `<...>` list; `None` if it never closes.
    fn skip_generic_params(rest: &str) -> Option<&str> {
        if !rest.starts_with('<') {
            return Some(rest);
        }

        let mut depth = 0usize;
        let mut prev = '\0';
        for (idx, ch) in rest.char_indices() {
            match ch {
                '<' => depth += 1,
                // The `>` of a `->` return arrow does not close a bracket.
                '>' if prev != '-' => {
                    depth -= 1;
                    if depth == 0 {
                        return Some(&rest[idx + 1..]);
                    }
                }
                _ => {}
            }
            prev = ch;
        }

        None
    }

    // class Name
    if let Some(rest) = line.strip_prefix("class ") {
        return leading_name(rest, &[' ', '{', '(', ':', '<', ';']);
    }

    // struct Name / pub struct Name / pub(crate) struct Name
    if let Some(rest) = strip_visibility(line).strip_prefix("struct ") {
        return leading_name(rest, &[' ', '{', '(', '<', ';']);
    }

    // impl Name / impl<T: Bound> Name<T>
    if line.starts_with("impl ") || line.starts_with("impl<") {
        let rest = line.strip_prefix("impl")?.trim_start();
        let rest = skip_generic_params(rest)?.trim_start();
        return leading_name(rest, &[' ', '{', '<']);
    }

    None
}
201
/// Extracts a method name from a single source line.
///
/// Recognised shapes:
/// - any line containing a `pub ` / `pub(...)` qualifier and `fn `, e.g.
///   `pub fn name(`, `pub async fn name(`, `pub(crate) fn name(`;
/// - lines starting with `async `, e.g. `async name(` inside a class body,
///   `async fn name(`, `async function name(`, `async def name(`.
///
/// The returned name excludes generic parameters and any declaration keyword
/// that follows `async`. Returns `None` when the line matches no pattern or
/// when what follows is not a single identifier-like token (for example
/// `async (req) => {` or `async move {`).
fn extract_method_name(line: &str) -> Option<String> {
    // Reduces the text following the declaration keyword to the bare name.
    fn declared_name(tail: &str) -> Option<String> {
        let name = tail
            .split(|c: char| c == '(' || c == '<')
            .next()?
            .trim()
            // Generator marker in `async *items()`.
            .trim_start_matches('*');

        // A real name is a single token; anything else is not a declaration.
        if name.is_empty() || name.contains(char::is_whitespace) {
            None
        } else {
            Some(name.to_string())
        }
    }

    // pub fn name( / pub async fn name( / pub(crate) fn name(
    if (line.contains("pub ") || line.contains("pub(")) && line.contains("fn ") {
        let idx = line.find("fn ")?;
        return declared_name(&line[idx + 3..]);
    }

    // async name( in class context, or async fn/function/def name(
    let rest = line.trim().strip_prefix("async ")?;
    let rest = ["fn ", "function* ", "function ", "def "]
        .iter()
        .find_map(|keyword| rest.strip_prefix(*keyword))
        .unwrap_or(rest);

    declared_name(rest.trim_start())
}