qex_core/chunk/
multi_language.rs1use crate::chunk::languages::{all_chunkers, LanguageChunker};
2use crate::chunk::tree_sitter::TreeSitterEngine;
3use crate::chunk::CodeChunk;
4use anyhow::{Context, Result};
5use rayon::prelude::*;
6use std::collections::HashMap;
7use std::path::Path;
8
9pub struct MultiLanguageChunker {
11 extension_map: HashMap<String, usize>,
13 chunkers: Vec<Box<dyn LanguageChunker>>,
15}
16
17impl MultiLanguageChunker {
18 pub fn new() -> Self {
19 let chunkers = all_chunkers();
20 let mut extension_map = HashMap::new();
21
22 for (idx, chunker) in chunkers.iter().enumerate() {
23 for ext in chunker.file_extensions() {
24 extension_map.insert(ext.to_string(), idx);
25 }
26 }
27
28 Self {
29 extension_map,
30 chunkers,
31 }
32 }
33
34 pub fn is_supported(&self, path: &str) -> bool {
36 Path::new(path)
37 .extension()
38 .and_then(|e| e.to_str())
39 .map(|ext| self.extension_map.contains_key(ext))
40 .unwrap_or(false)
41 }
42
43 pub fn language_for_file(&self, path: &str) -> Option<&str> {
45 let ext = Path::new(path).extension()?.to_str()?;
46 let idx = self.extension_map.get(ext)?;
47 Some(self.chunkers[*idx].language_name())
48 }
49
50 pub fn supported_extensions(&self) -> Vec<&str> {
52 self.chunkers
53 .iter()
54 .flat_map(|c| c.file_extensions().iter().copied())
55 .collect()
56 }
57
58 pub fn chunk_file(
60 &self,
61 file_path: &str,
62 relative_path: &str,
63 source: &str,
64 ) -> Result<Vec<CodeChunk>> {
65 let ext = Path::new(file_path)
66 .extension()
67 .and_then(|e| e.to_str())
68 .context("File has no extension")?;
69
70 let idx = self
71 .extension_map
72 .get(ext)
73 .context(format!("Unsupported extension: {}", ext))?;
74
75 let chunker = &self.chunkers[*idx];
76
77 TreeSitterEngine::parse_file(source, file_path, relative_path, chunker.language_name(), chunker.as_ref())
78 }
79
80 pub fn chunk_files(
82 &self,
83 files: &[(String, String)], ) -> Vec<(String, Result<Vec<CodeChunk>>)> {
85 files
86 .par_iter()
87 .filter_map(|(abs_path, rel_path)| {
88 if !self.is_supported(abs_path) {
89 return None;
90 }
91 let source = match std::fs::read_to_string(abs_path) {
92 Ok(s) => s,
93 Err(e) => {
94 return Some((
95 rel_path.clone(),
96 Err(anyhow::anyhow!("Failed to read {}: {}", abs_path, e)),
97 ));
98 }
99 };
100 let result = self.chunk_file(abs_path, rel_path, &source);
101 Some((rel_path.clone(), result))
102 })
103 .collect()
104 }
105}
106
107impl Default for MultiLanguageChunker {
108 fn default() -> Self {
109 Self::new()
110 }
111}
112
#[cfg(test)]
mod tests {
    use super::*;

    /// Chunks `source` as if it were the file at `abs_path` and returns the
    /// names of all named chunks. Panics if chunking fails or produces no
    /// chunks at all.
    fn named_chunks(abs_path: &str, rel_path: &str, source: &str) -> Vec<String> {
        let chunks = MultiLanguageChunker::new()
            .chunk_file(abs_path, rel_path, source)
            .unwrap();
        assert!(!chunks.is_empty());
        chunks
            .iter()
            .filter_map(|chunk| chunk.name.as_deref())
            .map(str::to_owned)
            .collect()
    }

    /// Asserts that every entry of `expected` occurs in `found`.
    fn assert_has_names(found: &[String], expected: &[&str]) {
        for want in expected {
            assert!(found.iter().any(|name| name.as_str() == *want));
        }
    }

    /// Every built-in language extension is accepted; unknown ones are not.
    #[test]
    fn test_supported_extensions() {
        let chunker = MultiLanguageChunker::new();
        let accepted = [
            "test.py",
            "test.rs",
            "test.js",
            "test.ts",
            "test.tsx",
            "test.go",
            "test.java",
            "test.c",
            "test.cpp",
            "test.cs",
            "test.md",
        ];
        for path in accepted.iter() {
            assert!(chunker.is_supported(path), "{} should be supported", path);
        }
        assert!(!chunker.is_supported("test.xyz"));
    }

    /// The language name is derived from the file extension.
    #[test]
    fn test_language_detection() {
        let chunker = MultiLanguageChunker::new();
        let cases = [
            ("test.py", "python"),
            ("test.rs", "rust"),
            ("test.ts", "typescript"),
            ("test.tsx", "tsx"),
        ];
        for (path, language) in cases.iter() {
            assert_eq!(chunker.language_for_file(path), Some(*language));
        }
    }

    /// Python functions, classes and methods are all reported as chunks.
    #[test]
    fn test_chunk_python() {
        let source = r#"
def hello(name):
    """Say hello to someone."""
    print(f"Hello, {name}!")

class Greeter:
    """A greeter class."""

    def greet(self, name):
        return f"Hello, {name}!"
"#;
        let names = named_chunks("/test/hello.py", "hello.py", source);
        assert_has_names(&names, &["hello", "Greeter", "greet"]);
    }

    /// Rust structs and free functions are reported as chunks.
    #[test]
    fn test_chunk_rust() {
        let source = r#"
pub struct Config {
    pub name: String,
    pub value: i32,
}

impl Config {
    pub fn new(name: String) -> Self {
        Self { name, value: 0 }
    }

    pub fn set_value(&mut self, value: i32) {
        self.value = value;
    }
}

pub fn process(config: &Config) -> String {
    format!("{}: {}", config.name, config.value)
}
"#;
        let names = named_chunks("/test/config.rs", "config.rs", source);
        assert_has_names(&names, &["Config", "process"]);
    }

    /// JavaScript functions and classes are reported as chunks.
    #[test]
    fn test_chunk_javascript() {
        let source = r#"
function fetchUser(id) {
    return fetch(`/api/users/${id}`);
}

class UserService {
    constructor(baseUrl) {
        this.baseUrl = baseUrl;
    }

    getUser(id) {
        return fetch(`${this.baseUrl}/users/${id}`);
    }
}
"#;
        let names = named_chunks("/test/user.js", "user.js", source);
        assert_has_names(&names, &["fetchUser", "UserService"]);
    }
}