1pub mod regex;
10pub mod treesitter;
11
12use std::path::Path;
13
/// Category assigned to an extracted symbol definition.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SymbolKind {
    Function,
    Method,
    Class,
    Struct,
    Enum,
    Interface,
    TypeAlias,
    Constant,
    Variable,
    Module,
    Trait,
    Impl,
}

impl SymbolKind {
    /// Returns the fixed lowercase label for this kind.
    ///
    /// Multi-word variants use snake_case (`TypeAlias` -> `"type_alias"`).
    pub fn as_str(&self) -> &'static str {
        // Exhaustive on purpose: adding a variant must force a label to be chosen.
        match *self {
            SymbolKind::Function => "function",
            SymbolKind::Method => "method",
            SymbolKind::Class => "class",
            SymbolKind::Struct => "struct",
            SymbolKind::Enum => "enum",
            SymbolKind::Interface => "interface",
            SymbolKind::TypeAlias => "type_alias",
            SymbolKind::Constant => "constant",
            SymbolKind::Variable => "variable",
            SymbolKind::Module => "module",
            SymbolKind::Trait => "trait",
            SymbolKind::Impl => "impl",
        }
    }
}
53
/// How a name is being used at a reference site.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum RefKind {
    Read,
    Write,
    Call,
    TypeAnnotation,
    Import,
    Export,
    Construction,
}

impl RefKind {
    /// Returns the fixed lowercase label for this reference kind.
    ///
    /// Multi-word variants use snake_case (`TypeAnnotation` -> `"type_annotation"`).
    pub fn as_str(&self) -> &'static str {
        match *self {
            RefKind::Read => "read",
            RefKind::Write => "write",
            RefKind::Call => "call",
            RefKind::TypeAnnotation => "type_annotation",
            RefKind::Import => "import",
            RefKind::Export => "export",
            RefKind::Construction => "construction",
        }
    }
}
79
/// Kind of lexical region recorded as a scope.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ScopeKind {
    File,
    Module,
    Class,
    Function,
    Block,
    Loop,
    Conditional,
}

impl ScopeKind {
    /// Returns the fixed lowercase label for this scope kind.
    pub fn as_str(&self) -> &'static str {
        match *self {
            ScopeKind::File => "file",
            ScopeKind::Module => "module",
            ScopeKind::Class => "class",
            ScopeKind::Function => "function",
            ScopeKind::Block => "block",
            ScopeKind::Loop => "loop",
            ScopeKind::Conditional => "conditional",
        }
    }
}
105
/// Coarse lexical class of an extracted token.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TokenKind {
    Identifier,
    Keyword,
    Operator,
    Literal,
    Comment,
    Unknown,
}

impl TokenKind {
    /// Returns the fixed lowercase label for this token kind.
    pub fn as_str(&self) -> &'static str {
        match *self {
            TokenKind::Identifier => "identifier",
            TokenKind::Keyword => "keyword",
            TokenKind::Operator => "operator",
            TokenKind::Literal => "literal",
            TokenKind::Comment => "comment",
            TokenKind::Unknown => "unknown",
        }
    }
}
129
130#[derive(Debug, Clone)]
136pub struct ExtractedSymbol {
137 pub name: String,
138 pub kind: SymbolKind,
139 pub start_line: u32,
140 pub end_line: u32,
141 pub start_column: u16,
142 pub end_column: u16,
143 pub is_exported: bool,
144 pub is_async: bool,
145 pub parent_symbol: Option<String>,
146}
147
/// One call-site record produced by an extractor.
///
/// NOTE(review): whether `line`/`column` are 0- or 1-based is decided by the
/// extractor modules and is not visible in this file — confirm there.
#[derive(Clone, Debug)]
pub struct ExtractedCall {
    /// Name of the function or method being called.
    pub callee_name: String,
    /// Line of the call site.
    pub line: u32,
    /// Column of the call site.
    pub column: u16,
    /// Name of the symbol that contains the call, when known.
    pub containing_symbol: Option<String>,
    /// Method-call flag as reported by the extractor.
    pub is_method_call: bool,
    /// Receiver expression text, when the extractor recorded one.
    pub receiver: Option<String>,
}
158
159#[derive(Debug, Clone)]
161pub struct ExtractedRef {
162 pub name: String,
163 pub kind: RefKind,
164 pub line: u32,
165 pub column: u16,
166 pub containing_symbol: Option<String>,
167}
168
169#[derive(Debug, Clone)]
171pub struct ExtractedScope {
172 pub kind: ScopeKind,
173 pub name: Option<String>,
174 pub start_line: u32,
175 pub end_line: u32,
176 pub parent_index: Option<usize>,
177}
178
179#[derive(Debug, Clone)]
181pub struct ExtractedToken {
182 pub name: String,
183 pub kind: TokenKind,
184 pub line: u32,
185 pub column: u16,
186}
187
188#[derive(Debug, Clone, Default)]
194pub struct ExtractedData {
195 pub symbols: Vec<ExtractedSymbol>,
196 pub calls: Vec<ExtractedCall>,
197 pub references: Vec<ExtractedRef>,
198 pub scopes: Vec<ExtractedScope>,
199 pub tokens: Vec<ExtractedToken>,
200 pub language: String,
201 pub extraction_method: ExtractionMethod,
202}
203
/// Identifies which extractor produced a result.
///
/// `Regex` is the `Default` variant.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum ExtractionMethod {
    TreeSitter,
    #[default]
    Regex,
}

impl ExtractionMethod {
    /// Returns the fixed label for this method: `"tree-sitter"` or `"regex"`.
    pub fn as_str(&self) -> &'static str {
        match *self {
            ExtractionMethod::Regex => "regex",
            ExtractionMethod::TreeSitter => "tree-sitter",
        }
    }
}
220
221impl ExtractedData {
222 pub fn empty(language: &str) -> Self {
224 Self {
225 language: language.to_string(),
226 ..Default::default()
227 }
228 }
229
230 pub fn is_empty(&self) -> bool {
232 self.symbols.is_empty()
233 && self.calls.is_empty()
234 && self.references.is_empty()
235 && self.tokens.is_empty()
236 }
237
238 pub fn total_items(&self) -> usize {
240 self.symbols.len()
241 + self.calls.len()
242 + self.references.len()
243 + self.scopes.len()
244 + self.tokens.len()
245 }
246}
247
/// Failure modes reported by the extraction layer.
#[derive(Clone, Debug)]
pub enum ExtractError {
    /// Source text for `language` could not be parsed; `message` holds the detail.
    ParseFailed { language: String, message: String },
    /// `language` has no tree-sitter support.
    UnsupportedLanguage { language: String },
    /// An I/O problem, described by `message`.
    IoError { message: String },
}

impl std::fmt::Display for ExtractError {
    /// Writes a one-line, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ExtractError::IoError { message } => write!(f, "IO error: {}", message),
            ExtractError::UnsupportedLanguage { language } => {
                write!(f, "Language '{}' not supported by tree-sitter", language)
            }
            ExtractError::ParseFailed { language, message } => {
                write!(f, "Failed to parse {} code: {}", language, message)
            }
        }
    }
}

// No underlying `source()` is carried; the default trait methods suffice.
impl std::error::Error for ExtractError {}
280
/// Maps a path's file extension to a language identifier.
///
/// The extension is lowercased before lookup, so matching is
/// case-insensitive. Returns `"unknown"` when the path has no extension, the
/// extension is not valid UTF-8, or the extension is not in the table below.
pub fn detect_language(path: &Path) -> &'static str {
    let lowered = match path.extension().and_then(|raw| raw.to_str()) {
        Some(text) => text.to_lowercase(),
        None => return "unknown",
    };

    match lowered.as_str() {
        "ts" | "tsx" | "mts" | "cts" => "typescript",
        "js" | "jsx" | "mjs" | "cjs" => "javascript",
        "py" | "pyi" => "python",
        "rs" => "rust",
        "go" => "go",
        "c" | "h" => "c",
        "cpp" | "hpp" | "cc" | "cxx" => "cpp",
        "java" => "java",
        "rb" => "ruby",
        "php" => "php",
        "swift" => "swift",
        "kt" | "kts" => "kotlin",
        "cs" => "csharp",
        "lua" => "lua",
        "sh" | "bash" | "zsh" => "bash",
        "json" => "json",
        "yaml" | "yml" => "yaml",
        "toml" => "toml",
        "md" | "markdown" => "markdown",
        "html" | "htm" => "html",
        "css" | "scss" | "sass" | "less" => "css",
        "sql" => "sql",
        "zig" => "zig",
        "ex" | "exs" => "elixir",
        "erl" | "hrl" => "erlang",
        "hs" | "lhs" => "haskell",
        "ml" | "mli" => "ocaml",
        "scala" | "sc" => "scala",
        "clj" | "cljs" | "cljc" => "clojure",
        "v" | "vh" => "verilog",
        "svelte" => "svelte",
        "vue" => "vue",
        _ => "unknown",
    }
}
326
/// Reports whether `language` is one of the identifiers handled by the
/// tree-sitter extractor.
///
/// The comparison is exact and case-sensitive.
pub fn is_treesitter_supported(language: &str) -> bool {
    const SUPPORTED: [&str; 5] = ["typescript", "javascript", "python", "rust", "go"];
    SUPPORTED.iter().any(|&candidate| candidate == language)
}
334
335pub fn extract_file(path: &Path, content: &str, language: Option<&str>) -> ExtractedData {
353 let detected_lang = language.unwrap_or_else(|| detect_language(path));
354
355 if content.trim().is_empty() {
357 return ExtractedData::empty(detected_lang);
358 }
359
360 if is_treesitter_supported(detected_lang) {
362 match treesitter::extract(content, detected_lang) {
363 Ok(mut data) => {
364 data.language = detected_lang.to_string();
365 data.extraction_method = ExtractionMethod::TreeSitter;
366 return data;
367 }
368 Err(e) => {
369 tracing::warn!(
371 "Tree-sitter extraction failed for {}: {}, falling back to regex",
372 path.display(),
373 e
374 );
375 }
376 }
377 }
378
379 let mut data = regex::extract(content, detected_lang);
381 data.language = detected_lang.to_string();
382 data.extraction_method = ExtractionMethod::Regex;
383 data
384}
385
#[cfg(test)]
mod tests {
    use super::*;

    /// Extension lookup covers the tree-sitter languages plus the fallback.
    #[test]
    fn test_detect_language() {
        let expectations = [
            ("foo.ts", "typescript"),
            ("foo.tsx", "typescript"),
            ("foo.js", "javascript"),
            ("foo.py", "python"),
            ("foo.rs", "rust"),
            ("foo.go", "go"),
            ("foo.xyz", "unknown"),
        ];
        for &(file_name, expected) in expectations.iter() {
            assert_eq!(detect_language(Path::new(file_name)), expected);
        }
    }

    /// Only the five listed languages are routed to tree-sitter.
    #[test]
    fn test_is_treesitter_supported() {
        for &lang in ["typescript", "javascript", "python", "rust", "go"].iter() {
            assert!(is_treesitter_supported(lang));
        }
        for &lang in ["ruby", "unknown"].iter() {
            assert!(!is_treesitter_supported(lang));
        }
    }

    /// `ExtractedData::empty` carries the language and nothing else.
    #[test]
    fn test_extracted_data_empty() {
        let blank = ExtractedData::empty("rust");
        assert!(blank.is_empty());
        assert_eq!(blank.total_items(), 0);
        assert_eq!(blank.language, "rust");
    }
}