Skip to main content

qex_core/chunk/
multi_language.rs

1use crate::chunk::languages::{all_chunkers, LanguageChunker};
2use crate::chunk::tree_sitter::TreeSitterEngine;
3use crate::chunk::CodeChunk;
4use anyhow::{Context, Result};
5use rayon::prelude::*;
6use std::collections::HashMap;
7use std::path::Path;
8
9/// Multi-language chunker that dispatches to the appropriate language chunker
10pub struct MultiLanguageChunker {
11    /// Map from file extension to language chunker
12    extension_map: HashMap<String, usize>,
13    /// All chunkers
14    chunkers: Vec<Box<dyn LanguageChunker>>,
15}
16
17impl MultiLanguageChunker {
18    pub fn new() -> Self {
19        let chunkers = all_chunkers();
20        let mut extension_map = HashMap::new();
21
22        for (idx, chunker) in chunkers.iter().enumerate() {
23            for ext in chunker.file_extensions() {
24                extension_map.insert(ext.to_string(), idx);
25            }
26        }
27
28        Self {
29            extension_map,
30            chunkers,
31        }
32    }
33
34    /// Check if a file extension is supported
35    pub fn is_supported(&self, path: &str) -> bool {
36        Path::new(path)
37            .extension()
38            .and_then(|e| e.to_str())
39            .map(|ext| self.extension_map.contains_key(ext))
40            .unwrap_or(false)
41    }
42
43    /// Get the language name for a file
44    pub fn language_for_file(&self, path: &str) -> Option<&str> {
45        let ext = Path::new(path).extension()?.to_str()?;
46        let idx = self.extension_map.get(ext)?;
47        Some(self.chunkers[*idx].language_name())
48    }
49
50    /// Get all supported extensions
51    pub fn supported_extensions(&self) -> Vec<&str> {
52        self.chunkers
53            .iter()
54            .flat_map(|c| c.file_extensions().iter().copied())
55            .collect()
56    }
57
58    /// Chunk a single file
59    pub fn chunk_file(
60        &self,
61        file_path: &str,
62        relative_path: &str,
63        source: &str,
64    ) -> Result<Vec<CodeChunk>> {
65        let ext = Path::new(file_path)
66            .extension()
67            .and_then(|e| e.to_str())
68            .context("File has no extension")?;
69
70        let idx = self
71            .extension_map
72            .get(ext)
73            .context(format!("Unsupported extension: {}", ext))?;
74
75        let chunker = &self.chunkers[*idx];
76
77        TreeSitterEngine::parse_file(source, file_path, relative_path, chunker.language_name(), chunker.as_ref())
78    }
79
80    /// Chunk multiple files in parallel
81    pub fn chunk_files(
82        &self,
83        files: &[(String, String)], // (absolute_path, relative_path)
84    ) -> Vec<(String, Result<Vec<CodeChunk>>)> {
85        files
86            .par_iter()
87            .filter_map(|(abs_path, rel_path)| {
88                if !self.is_supported(abs_path) {
89                    return None;
90                }
91                let source = match std::fs::read_to_string(abs_path) {
92                    Ok(s) => s,
93                    Err(e) => {
94                        return Some((
95                            rel_path.clone(),
96                            Err(anyhow::anyhow!("Failed to read {}: {}", abs_path, e)),
97                        ));
98                    }
99                };
100                let result = self.chunk_file(abs_path, rel_path, &source);
101                Some((rel_path.clone(), result))
102            })
103            .collect()
104    }
105}
106
107impl Default for MultiLanguageChunker {
108    fn default() -> Self {
109        Self::new()
110    }
111}
112
#[cfg(test)]
mod tests {
    use super::*;

    /// Python fixture: one free function and one class with a method.
    const PYTHON_SOURCE: &str = r#"
def hello(name):
    """Say hello to someone."""
    print(f"Hello, {name}!")

class Greeter:
    """A greeter class."""

    def greet(self, name):
        return f"Hello, {name}!"
"#;

    /// Rust fixture: a struct, its impl block, and a free function.
    const RUST_SOURCE: &str = r#"
pub struct Config {
    pub name: String,
    pub value: i32,
}

impl Config {
    pub fn new(name: String) -> Self {
        Self { name, value: 0 }
    }

    pub fn set_value(&mut self, value: i32) {
        self.value = value;
    }
}

pub fn process(config: &Config) -> String {
    format!("{}: {}", config.name, config.value)
}
"#;

    /// JavaScript fixture: a free function and a class with two methods.
    const JAVASCRIPT_SOURCE: &str = r#"
function fetchUser(id) {
    return fetch(`/api/users/${id}`);
}

class UserService {
    constructor(baseUrl) {
        this.baseUrl = baseUrl;
    }

    getUser(id) {
        return fetch(`${this.baseUrl}/users/${id}`);
    }
}
"#;

    /// Asserts that `chunks` is non-empty and that every name in `expected`
    /// appears among the named chunks.
    fn assert_named(chunks: &[CodeChunk], expected: &[&str]) {
        assert!(!chunks.is_empty());
        let found: Vec<&str> = chunks
            .iter()
            .filter_map(|chunk| chunk.name.as_deref())
            .collect();
        for name in expected {
            assert!(found.contains(name));
        }
    }

    #[test]
    fn test_supported_extensions() {
        let chunker = MultiLanguageChunker::new();
        let supported = [
            "test.py",
            "test.rs",
            "test.js",
            "test.ts",
            "test.tsx",
            "test.go",
            "test.java",
            "test.c",
            "test.cpp",
            "test.cs",
            "test.md",
        ];
        for path in supported {
            assert!(chunker.is_supported(path));
        }
        assert!(!chunker.is_supported("test.xyz"));
    }

    #[test]
    fn test_language_detection() {
        let chunker = MultiLanguageChunker::new();
        let expectations = [
            ("test.py", "python"),
            ("test.rs", "rust"),
            ("test.ts", "typescript"),
            ("test.tsx", "tsx"),
        ];
        for (path, language) in expectations {
            assert_eq!(chunker.language_for_file(path), Some(language));
        }
    }

    #[test]
    fn test_chunk_python() {
        let chunks = MultiLanguageChunker::new()
            .chunk_file("/test/hello.py", "hello.py", PYTHON_SOURCE)
            .unwrap();
        // Expected chunks: the `hello` function, the `Greeter` class and its
        // `greet` method.
        assert_named(&chunks, &["hello", "Greeter", "greet"]);
    }

    #[test]
    fn test_chunk_rust() {
        let chunks = MultiLanguageChunker::new()
            .chunk_file("/test/config.rs", "config.rs", RUST_SOURCE)
            .unwrap();
        assert_named(&chunks, &["Config", "process"]);
    }

    #[test]
    fn test_chunk_javascript() {
        let chunks = MultiLanguageChunker::new()
            .chunk_file("/test/user.js", "user.js", JAVASCRIPT_SOURCE)
            .unwrap();
        assert_named(&chunks, &["fetchUser", "UserService"]);
    }
}