// greppy/trace/extract/mod.rs

//! Trace Extract Module
//!
//! Unified extraction interface for parsing source files and extracting
//! symbols, calls, references, scopes, and tokens using tree-sitter (primary)
//! with regex fallback for unsupported languages.
//!
//! @module trace/extract

9pub mod regex;
10pub mod treesitter;
11
12use std::path::Path;
13
// =============================================================================
// EXTRACTED TYPES
// =============================================================================

/// Kind of symbol extracted from source code
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SymbolKind {
    Function,
    Method,
    Class,
    Struct,
    Enum,
    Interface,
    TypeAlias,
    Constant,
    Variable,
    Module,
    Trait,
    Impl,
}

impl SymbolKind {
    /// Returns the lowercase `snake_case` label for this kind
    /// (for example `SymbolKind::TypeAlias` maps to `"type_alias"`).
    ///
    /// The match is exhaustive on purpose: adding a variant without a label
    /// is a compile error. Declared `const` so the label can also be used in
    /// constant contexts.
    #[must_use]
    pub const fn as_str(&self) -> &'static str {
        match self {
            Self::Function => "function",
            Self::Method => "method",
            Self::Class => "class",
            Self::Struct => "struct",
            Self::Enum => "enum",
            Self::Interface => "interface",
            Self::TypeAlias => "type_alias",
            Self::Constant => "constant",
            Self::Variable => "variable",
            Self::Module => "module",
            Self::Trait => "trait",
            Self::Impl => "impl",
        }
    }
}

/// Kind of reference extracted from source code
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RefKind {
    Read,
    Write,
    Call,
    TypeAnnotation,
    Import,
    Export,
    Construction,
}

impl RefKind {
    /// Returns the lowercase `snake_case` label for this kind
    /// (for example `RefKind::TypeAnnotation` maps to `"type_annotation"`).
    ///
    /// Declared `const` so the label can also be used in constant contexts.
    #[must_use]
    pub const fn as_str(&self) -> &'static str {
        match self {
            Self::Read => "read",
            Self::Write => "write",
            Self::Call => "call",
            Self::TypeAnnotation => "type_annotation",
            Self::Import => "import",
            Self::Export => "export",
            Self::Construction => "construction",
        }
    }
}

/// Kind of scope in the AST
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScopeKind {
    File,
    Module,
    Class,
    Function,
    Block,
    Loop,
    Conditional,
}

impl ScopeKind {
    /// Returns the lowercase label for this scope kind
    /// (for example `ScopeKind::Conditional` maps to `"conditional"`).
    ///
    /// Declared `const` so the label can also be used in constant contexts.
    #[must_use]
    pub const fn as_str(&self) -> &'static str {
        match self {
            Self::File => "file",
            Self::Module => "module",
            Self::Class => "class",
            Self::Function => "function",
            Self::Block => "block",
            Self::Loop => "loop",
            Self::Conditional => "conditional",
        }
    }
}

/// Kind of token extracted
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    Identifier,
    Keyword,
    Operator,
    Literal,
    Comment,
    Unknown,
}

impl TokenKind {
    /// Returns the lowercase label for this token kind
    /// (for example `TokenKind::Identifier` maps to `"identifier"`).
    ///
    /// Declared `const` so the label can also be used in constant contexts.
    #[must_use]
    pub const fn as_str(&self) -> &'static str {
        match self {
            Self::Identifier => "identifier",
            Self::Keyword => "keyword",
            Self::Operator => "operator",
            Self::Literal => "literal",
            Self::Comment => "comment",
            Self::Unknown => "unknown",
        }
    }
}

// =============================================================================
// EXTRACTED STRUCTURES
// =============================================================================

134/// A symbol definition extracted from source code
135#[derive(Debug, Clone)]
136pub struct ExtractedSymbol {
137    pub name: String,
138    pub kind: SymbolKind,
139    pub start_line: u32,
140    pub end_line: u32,
141    pub start_column: u16,
142    pub end_column: u16,
143    pub is_exported: bool,
144    pub is_async: bool,
145    pub parent_symbol: Option<String>,
146}
147
/// A function/method call extracted from source code
///
/// Compared field-by-field (`PartialEq`/`Eq`) so extraction results can be
/// asserted on directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractedCall {
    /// Name of the function or method being called
    pub callee_name: String,
    /// Line of the call site
    /// (NOTE(review): 0- or 1-based is decided by the extractor backends — confirm)
    pub line: u32,
    /// Column of the call site
    pub column: u16,
    /// Name of the symbol the call appears in, if any
    pub containing_symbol: Option<String>,
    /// Whether this is a method call
    pub is_method_call: bool,
    /// Receiver of the call, if any
    pub receiver: Option<String>,
}

159/// A reference to a symbol (variable read/write, type annotation, import)
160#[derive(Debug, Clone)]
161pub struct ExtractedRef {
162    pub name: String,
163    pub kind: RefKind,
164    pub line: u32,
165    pub column: u16,
166    pub containing_symbol: Option<String>,
167}
168
169/// A scope in the AST hierarchy
170#[derive(Debug, Clone)]
171pub struct ExtractedScope {
172    pub kind: ScopeKind,
173    pub name: Option<String>,
174    pub start_line: u32,
175    pub end_line: u32,
176    pub parent_index: Option<usize>,
177}
178
179/// A token (identifier) extracted from source code
180#[derive(Debug, Clone)]
181pub struct ExtractedToken {
182    pub name: String,
183    pub kind: TokenKind,
184    pub line: u32,
185    pub column: u16,
186}
187
// =============================================================================
// EXTRACTED DATA CONTAINER
// =============================================================================

192/// Complete extraction results from a source file
193#[derive(Debug, Clone, Default)]
194pub struct ExtractedData {
195    pub symbols: Vec<ExtractedSymbol>,
196    pub calls: Vec<ExtractedCall>,
197    pub references: Vec<ExtractedRef>,
198    pub scopes: Vec<ExtractedScope>,
199    pub tokens: Vec<ExtractedToken>,
200    pub language: String,
201    pub extraction_method: ExtractionMethod,
202}
203
/// Method used for extraction
///
/// `Regex` is the `Default`, so a default-constructed `ExtractedData` is
/// labelled as regex-extracted until a backend overwrites the field.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ExtractionMethod {
    TreeSitter,
    #[default]
    Regex,
}

impl ExtractionMethod {
    /// Returns the lowercase label for this method
    /// (`"tree-sitter"` or `"regex"`).
    ///
    /// Declared `const` so the label can also be used in constant contexts.
    #[must_use]
    pub const fn as_str(&self) -> &'static str {
        match self {
            Self::TreeSitter => "tree-sitter",
            Self::Regex => "regex",
        }
    }
}

221impl ExtractedData {
222    /// Create empty extraction result
223    pub fn empty(language: &str) -> Self {
224        Self {
225            language: language.to_string(),
226            ..Default::default()
227        }
228    }
229
230    /// Check if any data was extracted
231    pub fn is_empty(&self) -> bool {
232        self.symbols.is_empty()
233            && self.calls.is_empty()
234            && self.references.is_empty()
235            && self.tokens.is_empty()
236    }
237
238    /// Total number of items extracted
239    pub fn total_items(&self) -> usize {
240        self.symbols.len()
241            + self.calls.len()
242            + self.references.len()
243            + self.scopes.len()
244            + self.tokens.len()
245    }
246}
247
// =============================================================================
// EXTRACTION ERROR
// =============================================================================

/// Errors that can occur during extraction
///
/// Every variant carries only owned strings, so the error is cheap to clone
/// and can be compared by value.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExtractError {
    /// Tree-sitter parsing failed
    ParseFailed { language: String, message: String },
    /// Language not supported by tree-sitter
    UnsupportedLanguage { language: String },
    /// IO error reading file
    IoError { message: String },
}

impl std::fmt::Display for ExtractError {
    /// Writes a one-line, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::ParseFailed { language, message } => {
                write!(f, "Failed to parse {language} code: {message}")
            }
            Self::UnsupportedLanguage { language } => {
                write!(f, "Language '{language}' not supported by tree-sitter")
            }
            Self::IoError { message } => {
                write!(f, "IO error: {message}")
            }
        }
    }
}

// No `source()` override: the variants keep only message strings, not the
// underlying error values.
impl std::error::Error for ExtractError {}

// =============================================================================
// LANGUAGE DETECTION
// =============================================================================

/// Detect language from file path extension
///
/// The extension is compared case-insensitively using ASCII folding only, so
/// `FOO.RS` is `"rust"` but non-ASCII look-alikes (for example the KELVIN SIGN
/// `U+212A`, which Unicode lowercasing maps to `k`) never match an ASCII
/// extension.
///
/// Returns `"unknown"` when the path has no extension, the extension is not
/// valid UTF-8, or the extension is not in the table below.
pub fn detect_language(path: &Path) -> &'static str {
    let ext = match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext.to_ascii_lowercase(),
        None => return "unknown",
    };

    match ext.as_str() {
        "ts" | "tsx" | "mts" | "cts" => "typescript",
        "js" | "jsx" | "mjs" | "cjs" => "javascript",
        "py" | "pyi" => "python",
        "rs" => "rust",
        "go" => "go",
        "c" | "h" => "c",
        "cpp" | "hpp" | "cc" | "cxx" => "cpp",
        "java" => "java",
        "rb" => "ruby",
        "php" => "php",
        "swift" => "swift",
        "kt" | "kts" => "kotlin",
        "cs" => "csharp",
        "lua" => "lua",
        "sh" | "bash" | "zsh" => "bash",
        "json" => "json",
        "yaml" | "yml" => "yaml",
        "toml" => "toml",
        "md" | "markdown" => "markdown",
        "html" | "htm" => "html",
        "css" | "scss" | "sass" | "less" => "css",
        "sql" => "sql",
        "zig" => "zig",
        "ex" | "exs" => "elixir",
        "erl" | "hrl" => "erlang",
        "hs" | "lhs" => "haskell",
        "ml" | "mli" => "ocaml",
        "scala" | "sc" => "scala",
        "clj" | "cljs" | "cljc" => "clojure",
        "v" | "vh" => "verilog",
        "svelte" => "svelte",
        "vue" => "vue",
        _ => "unknown",
    }
}

/// Check if language is supported by tree-sitter
///
/// `language` must be one of the canonical lowercase names returned by
/// `detect_language`; the comparison is exact and case-sensitive, so aliases
/// such as `"ts"` or `"Rust"` are reported as unsupported.
#[must_use]
pub fn is_treesitter_supported(language: &str) -> bool {
    matches!(
        language,
        "typescript" | "javascript" | "python" | "rust" | "go"
    )
}

// =============================================================================
// MAIN EXTRACTION FUNCTION
// =============================================================================

339/// Extract symbols, calls, references, scopes, and tokens from a source file.
340///
341/// Uses tree-sitter for supported languages, falls back to regex for others.
342/// Tree-sitter provides more accurate extraction with full AST access.
343/// Regex extraction is a best-effort fallback with lower confidence.
344///
345/// # Arguments
346/// * `path` - Path to the source file (used for language detection)
347/// * `content` - Source code content
348/// * `language` - Optional language override (if None, detected from path)
349///
350/// # Returns
351/// ExtractedData containing all extracted information
352pub fn extract_file(path: &Path, content: &str, language: Option<&str>) -> ExtractedData {
353    let detected_lang = language.unwrap_or_else(|| detect_language(path));
354
355    // Skip empty files
356    if content.trim().is_empty() {
357        return ExtractedData::empty(detected_lang);
358    }
359
360    // Try tree-sitter first for supported languages
361    if is_treesitter_supported(detected_lang) {
362        match treesitter::extract(content, detected_lang) {
363            Ok(mut data) => {
364                data.language = detected_lang.to_string();
365                data.extraction_method = ExtractionMethod::TreeSitter;
366                return data;
367            }
368            Err(e) => {
369                // Log warning and fall back to regex
370                tracing::warn!(
371                    "Tree-sitter extraction failed for {}: {}, falling back to regex",
372                    path.display(),
373                    e
374                );
375            }
376        }
377    }
378
379    // Fall back to regex extraction
380    let mut data = regex::extract(content, detected_lang);
381    data.language = detected_lang.to_string();
382    data.extraction_method = ExtractionMethod::Regex;
383    data
384}
385
// =============================================================================
// TESTS
// =============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_detect_language() {
        assert_eq!(detect_language(Path::new("foo.ts")), "typescript");
        assert_eq!(detect_language(Path::new("foo.tsx")), "typescript");
        assert_eq!(detect_language(Path::new("foo.js")), "javascript");
        assert_eq!(detect_language(Path::new("foo.py")), "python");
        assert_eq!(detect_language(Path::new("foo.rs")), "rust");
        assert_eq!(detect_language(Path::new("foo.go")), "go");
        assert_eq!(detect_language(Path::new("foo.xyz")), "unknown");
    }

    #[test]
    fn test_detect_language_edge_cases() {
        // Extension matching ignores ASCII case.
        assert_eq!(detect_language(Path::new("FOO.RS")), "rust");
        // A path without an extension cannot be classified.
        assert_eq!(detect_language(Path::new("Makefile")), "unknown");
    }

    #[test]
    fn test_is_treesitter_supported() {
        assert!(is_treesitter_supported("typescript"));
        assert!(is_treesitter_supported("javascript"));
        assert!(is_treesitter_supported("python"));
        assert!(is_treesitter_supported("rust"));
        assert!(is_treesitter_supported("go"));
        assert!(!is_treesitter_supported("ruby"));
        assert!(!is_treesitter_supported("unknown"));
    }

    #[test]
    fn test_extracted_data_empty() {
        let data = ExtractedData::empty("rust");
        assert!(data.is_empty());
        assert_eq!(data.total_items(), 0);
        assert_eq!(data.language, "rust");
        // An empty result keeps the default extraction method.
        assert_eq!(data.extraction_method, ExtractionMethod::Regex);
    }
}
423}