// codanna 0.9.19 — Code Intelligence for Large Language Models
// (Source listing; the rendered page's line-number gutter has been removed.)

//! Language parser trait
//!
//! This module defines the common interface that all language parsers
//! must implement to work with the indexing system.

use crate::parsing::method_call::MethodCall;
use crate::types::SymbolCounter;
use crate::{FileId, Range, Symbol};
use std::any::Any;
use std::collections::HashSet;
use tree_sitter::Node;

/// Common interface for all language parsers
///
/// Implementors must be `Send + Sync`. The relationship finders
/// (`find_calls`, `find_implementations`, `find_uses`, ...) take the source
/// text again rather than a pre-built tree, and most of them return `&str`
/// slices that borrow from that source text instead of owned strings.
pub trait LanguageParser: Send + Sync {
    /// Parse source code and extract symbols
    ///
    /// * `code` - source text to parse
    /// * `file_id` - identifier of the file the text belongs to
    /// * `symbol_counter` - mutable counter shared with the caller
    ///   (presumably used to allocate symbol IDs; confirm against implementors)
    ///
    /// Returns the symbols found in `code`.
    fn parse(
        &mut self,
        code: &str,
        file_id: FileId,
        symbol_counter: &mut SymbolCounter,
    ) -> Vec<Symbol>;

    /// Enable downcasting to concrete parser types
    ///
    /// Returns `self` as `&dyn Any` so callers holding a trait object can
    /// recover the concrete parser with `downcast_ref`.
    fn as_any(&self) -> &dyn Any;

    /// Extract documentation comment for a node
    ///
    /// Each language has its own documentation conventions:
    /// - Rust: `///` and `/** */`
    /// - Python: Docstrings (first string literal)
    /// - JavaScript/TypeScript: JSDoc `/** */`
    ///
    /// Returns `None` when the node has no documentation comment.
    fn extract_doc_comment(&self, node: &Node, code: &str) -> Option<String>;

    /// Find function/method calls in the code
    ///
    /// Returns tuples of (caller_name, callee_name, range)
    /// No string copies: the returned `&'a str` values borrow from `code`
    fn find_calls<'a>(&mut self, code: &'a str) -> Vec<(&'a str, &'a str, Range)>;

    /// Find method calls with rich receiver information
    ///
    /// Default implementation converts from find_calls() for backward compatibility.
    /// Parsers can override this method to provide enhanced receiver tracking.
    ///
    /// # Returns
    ///
    /// A vector of MethodCall structs with structured receiver information.
    /// With the default implementation each entry is built by
    /// `MethodCall::new(caller, target, range)` from one `find_calls` tuple.
    fn find_method_calls(&mut self, code: &str) -> Vec<MethodCall> {
        // Default: convert from find_calls tuples (parsers should override for richer info)
        self.find_calls(code)
            .into_iter()
            .map(|(caller, target, range)| MethodCall::new(caller, target, range))
            .collect()
    }

    /// Find trait/interface implementations
    ///
    /// Returns tuples of (type_name, trait_name, range)
    /// No string copies: the returned `&'a str` values borrow from `code`
    fn find_implementations<'a>(&mut self, code: &'a str) -> Vec<(&'a str, &'a str, Range)>;

    /// Find inheritance relationships (extends for classes/interfaces)
    ///
    /// Returns tuples of (derived_type, base_type, range)
    /// No string copies: the returned `&'a str` values borrow from `code`
    ///
    /// The default implementation ignores its input and returns an empty vector.
    fn find_extends<'a>(&mut self, _code: &'a str) -> Vec<(&'a str, &'a str, Range)> {
        // Default implementation returns empty for languages without inheritance
        Vec::new()
    }

    /// Find type usage (in fields, parameters, returns)
    ///
    /// Returns tuples of (context_name, used_type, range)
    /// No string copies: the returned `&'a str` values borrow from `code`
    fn find_uses<'a>(&mut self, code: &'a str) -> Vec<(&'a str, &'a str, Range)>;

    /// Find method definitions (in traits/interfaces or types)
    ///
    /// Returns tuples of (definer_name, method_name, range)
    /// No string copies: the returned `&'a str` values borrow from `code`
    fn find_defines<'a>(&mut self, code: &'a str) -> Vec<(&'a str, &'a str, Range)>;

    /// Find import statements in the code
    ///
    /// Returns Import structs with path, alias, and glob information
    fn find_imports(&mut self, code: &str, file_id: FileId) -> Vec<crate::parsing::Import>;

    /// Get the language this parser handles
    fn language(&self) -> crate::parsing::Language;

    /// Extract variable bindings with their types
    /// Returns tuples of (variable_name, type_name, range)
    /// No string copies: the returned `&'a str` values borrow from `code`
    ///
    /// The default implementation ignores its input and returns an empty vector.
    fn find_variable_types<'a>(&mut self, _code: &'a str) -> Vec<(&'a str, &'a str, Range)> {
        // Default implementation returns empty - languages can override
        Vec::new()
    }

    /// Optional: Extract variable types with complex generic substitution
    ///
    /// Returns owned strings to support type substitution like `List<T>` → `List<Int>`.
    /// Languages with generics can override this for richer type inference.
    /// Default returns None; callers are expected to fall back to the borrowing
    /// `find_variable_types` in that case.
    fn find_variable_types_with_substitution(
        &mut self,
        _code: &str,
    ) -> Option<Vec<(String, String, Range)>> {
        None
    }

    /// Find inherent methods (methods defined directly on types)
    /// Returns tuples of (type_name, method_name, range)
    ///
    /// This is for methods defined directly on types (not through traits/interfaces).
    /// Default implementation returns empty - languages can override.
    ///
    /// Note: Returns owned strings to support complex type names that need construction
    /// (e.g., Rust's `Option<String>`, `Vec<T>`, etc.)
    fn find_inherent_methods(&mut self, _code: &str) -> Vec<(String, String, Range)> {
        Vec::new()
    }
}

/// Trait for creating language parsers
///
/// Factories must be `Send + Sync` and hand out parsers as boxed
/// `LanguageParser` trait objects.
pub trait ParserFactory: Send + Sync {
    /// Create a new parser instance
    ///
    /// # Errors
    ///
    /// Returns `Err` with a `String` when the parser cannot be constructed
    /// (presumably a human-readable message; confirm against implementors).
    fn create(&self) -> Result<Box<dyn LanguageParser>, String>;
}

/// Information about a handled AST node
///
/// This is the element type of the set exposed by
/// `NodeTracker::get_handled_nodes`; equality and hashing cover both fields.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct HandledNode {
    /// Node kind string, as passed to `NodeTracker::register_handled_node`.
    pub name: String,
    /// Tree-sitter ID registered together with the node kind.
    pub id: u16,
}

/// Extension trait for tracking which AST node types a parser handles
///
/// This enables dynamic audit reporting by automatically tracking which
/// node types receive explicit handling during parsing. Eliminates the
/// need for manually maintaining static lists of implemented nodes.
pub trait NodeTracker {
    /// Get the set of node types this parser has encountered and handled
    ///
    /// The set is returned by reference; entries are `HandledNode` values
    /// (kind name plus ID).
    fn get_handled_nodes(&self) -> &HashSet<HandledNode>;

    /// Register that we've handled a specific node type with its tree-sitter ID
    ///
    /// * `node_kind` - kind name of the handled node
    /// * `node_id` - tree-sitter ID for that kind
    fn register_handled_node(&mut self, node_kind: &str, node_id: u16);
}

/// Default implementation of NodeTracker using a HashSet
///
/// Parsers can include this struct as a field and delegate their
/// `NodeTracker` methods to it. Derives `Default`, which yields an empty set.
#[derive(Debug, Default)]
pub struct NodeTrackingState {
    /// Distinct (kind name, ID) pairs registered so far.
    handled_nodes: HashSet<HandledNode>,
}

impl NodeTrackingState {
    /// Create a new empty tracking state
    pub fn new() -> Self {
        Self {
            handled_nodes: HashSet::new(),
        }
    }
}

impl NodeTracker for NodeTrackingState {
    /// Borrow the set of (kind name, ID) pairs registered so far.
    fn get_handled_nodes(&self) -> &HashSet<HandledNode> {
        &self.handled_nodes
    }

    /// Record that `node_kind` (with tree-sitter ID `node_id`) was handled.
    ///
    /// An owned `HandledNode` key is built on every call, which copies
    /// `node_kind` into a new `String`; the set then keeps it only when an
    /// equal entry is not already present.
    #[inline]
    fn register_handled_node(&mut self, node_kind: &str, node_id: u16) {
        let name = node_kind.to_owned();
        // The returned "newly inserted" flag is intentionally ignored:
        // registering the same node twice is a no-op.
        self.handled_nodes.insert(HandledNode { name, id: node_id });
    }
}

/// Truncate a UTF-8 string to at most `max_bytes` bytes without splitting a character.
///
/// If `s` already fits, it is returned unchanged. Otherwise the result is the
/// longest prefix of `s` that ends on a character boundary at or before
/// `max_bytes` (possibly the empty string).
///
/// Returns a sub-slice of `s`; nothing is allocated.
#[inline]
pub fn safe_truncate_str(s: &str, max_bytes: usize) -> &str {
    if max_bytes >= s.len() {
        return s;
    }

    // Walk back from `max_bytes` to the nearest character boundary.
    // Offset 0 is always a boundary, so the search cannot come up empty.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);

    &s[..cut]
}

/// Maximum recursion depth for AST traversal to prevent stack overflow
///
/// This limit protects against deeply nested structures (e.g., large array initializers,
/// deeply nested function calls). When the limit is exceeded, `check_recursion_depth`
/// emits a warning through `tracing::warn!` and reports that the subtree should be skipped.
///
/// Value chosen based on:
/// - Default Rust stack size: 2MB
/// - Average stack frame size: ~4KB per recursive call
/// - 500 levels at ~4KB each is ~2MB
///
/// NOTE(review): by the figures above, 500 levels use roughly the entire 2MB
/// budget rather than leaving a safety margin. Confirm the real per-frame cost
/// of the recursive parsers (or the stack size of the threads they run on)
/// before relying on this value.
pub const MAX_AST_DEPTH: usize = 500;

/// Decide whether AST traversal may descend to the given depth.
///
/// Centralizes the depth guard used by recursive parsers: each recursive
/// `extract_symbols_from_node` implementation should call this first and
/// return immediately when it yields `false`.
///
/// # Arguments
///
/// * `depth` - Current recursion depth
/// * `node` - The tree-sitter node being processed (only used to report its
///   start position in the warning)
///
/// # Returns
///
/// `true` while `depth` is at most `MAX_AST_DEPTH`; `false` once it is
/// greater, in which case a warning with the node's 1-based line and column
/// is emitted via `tracing::warn!`.
///
/// # Example
///
/// ```rust,ignore
/// fn extract_symbols_from_node(
///     &mut self,
///     node: Node,
///     code: &str,
///     file_id: FileId,
///     symbols: &mut Vec<Symbol>,
///     counter: &mut SymbolCounter,
///     depth: usize,
/// ) {
///     // Bail out before doing any work on an over-deep subtree
///     if !check_recursion_depth(depth, node) {
///         return;
///     }
///
///     // ... process node ...
///
///     // Children are visited one level deeper
///     for child in node.children(&mut node.walk()) {
///         self.extract_symbols_from_node(child, code, file_id, symbols, counter, depth + 1);
///     }
/// }
/// ```
#[inline]
pub fn check_recursion_depth(depth: usize, node: Node) -> bool {
    let within_limit = depth <= MAX_AST_DEPTH;

    if !within_limit {
        // tree-sitter positions are 0-based; report them 1-based.
        let start = node.start_position();
        tracing::warn!(
            "[parser] maximum AST depth ({MAX_AST_DEPTH}) exceeded at line {}:{}. Skipping subtree to prevent stack overflow.",
            start.row + 1,
            start.column + 1
        );
    }

    within_limit
}

/// Safely extract a substring window from source code, respecting UTF-8 boundaries.
///
/// This function creates a window of up to `window_size` bytes ending at (or just
/// before) the `end_byte` position, ensuring we never slice in the middle of a
/// UTF-8 character at either edge.
///
/// # Arguments
/// * `code` - The source code string
/// * `end_byte` - The ending byte position (exclusive). Values past the end of
///   `code` are clamped; a value that falls inside a multi-byte character is
///   moved back to the start of that character.
/// * `window_size` - Maximum size of the window in bytes
///
/// # Returns
/// A slice of the code that:
/// - Ends at `end_byte` (after clamping / boundary adjustment)
/// - Starts at most `window_size` bytes before that end
/// - Respects UTF-8 character boundaries
///
/// The function never panics; in the worst case it returns an empty slice.
///
/// # Example
/// ```ignore
/// let code = "export class 🔍 Scanner";
/// let window = safe_substring_window(code, 20, 10);
/// // Returns a safe slice without cutting the emoji
/// ```
pub fn safe_substring_window(code: &str, end_byte: usize, window_size: usize) -> &str {
    // Clamp to the string length, then back off to a character boundary so an
    // offset that lands inside a multi-byte character cannot make the slice
    // below panic. Offset 0 is always a boundary, so this terminates.
    let mut end = end_byte.min(code.len());
    while !code.is_char_boundary(end) {
        end -= 1;
    }

    // Desired start of the window (may fall inside a character).
    let start_raw = end.saturating_sub(window_size);

    // Move forward to the next boundary so the window never exceeds
    // `window_size`. `end` itself is a boundary, so the search always succeeds;
    // the fallback just yields an empty window.
    let start = (start_raw..=end)
        .find(|&i| code.is_char_boundary(i))
        .unwrap_or(end);

    &code[start..end]
}

/// Build a display-friendly preview of `s` limited to `max_bytes` bytes of content.
///
/// The text is cut with `safe_truncate_str`, so a multi-byte character is never
/// split. When anything was cut off, `...` is appended (the result may therefore
/// be up to three bytes longer than `max_bytes`); otherwise `s` is returned as
/// an owned copy. Used for signatures and previews in parsers.
#[inline]
pub fn truncate_for_display(s: &str, max_bytes: usize) -> String {
    let head = safe_truncate_str(s, max_bytes);

    // Nothing was removed: hand back the text as-is.
    if head.len() >= s.len() {
        return head.to_string();
    }

    format!("{head}...")
}

// Unit tests for the UTF-8-safe truncation helpers.
//
// The fixtures deliberately contain multi-byte characters (a 4-byte emoji,
// a 2-byte accented letter, 3-byte box-drawing characters); the byte offsets
// used below depend on those exact characters.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_safe_truncate_with_emoji_panic() {
        // This test reproduces issue #29 - emoji at bytes 8-12
        let text = "Status: 🔍 Active";
        eprintln!("Input text: '{}' (len: {} bytes)", text, text.len());
        eprintln!("Attempting to truncate at byte 10...");

        // This would panic with &text[..10] as it cuts the emoji in half
        let result = safe_truncate_str(text, 10);
        eprintln!("Result: '{}' (len: {} bytes)", result, result.len());

        assert_eq!(result, "Status: "); // Should stop before the 4-byte emoji
        assert!(result.len() <= 10);
        eprintln!("✅ Safe truncation avoided panic at emoji boundary!");
    }

    #[test]
    fn test_safe_truncate_exact_boundary() {
        let text = "Hello, World!";
        let result = safe_truncate_str(text, 7);
        assert_eq!(result, "Hello, ");
    }

    #[test]
    fn test_safe_truncate_multi_byte_chars() {
        // Test with 2-byte char (é is 2 bytes in UTF-8)
        let text = "Café is nice";
        eprintln!("\n2-byte char test:");
        eprintln!("  Input: '{}' (len: {} bytes)", text, text.len());
        eprintln!("  'é' starts at byte 3, is 2 bytes long");
        let result = safe_truncate_str(text, 4);
        eprintln!("  Truncate at 4: '{}' (len: {})", result, result.len());
        assert_eq!(result, "Caf"); // Should not include partial é

        // Test with 3-byte char (├ is 3 bytes in UTF-8)
        let text = "Tree├──branch";
        eprintln!("\n3-byte char test:");
        eprintln!("  Input: '{}' (len: {} bytes)", text, text.len());
        eprintln!("  '├' starts at byte 4, is 3 bytes long");
        let result = safe_truncate_str(text, 5);
        eprintln!("  Truncate at 5: '{}' (len: {})", result, result.len());
        assert_eq!(result, "Tree"); // Should not include partial ├
        eprintln!("✅ Multi-byte character boundaries handled correctly!");
    }

    #[test]
    fn test_truncate_for_display() {
        let text = "This is a very long string that needs truncation";
        let result = truncate_for_display(text, 10);
        assert_eq!(result, "This is a ...");

        let short_text = "Short";
        let result = truncate_for_display(short_text, 10);
        assert_eq!(result, "Short");
    }

    #[test]
    fn test_issue_29_exact_case() {
        // Exact case from issue #29
        let text = r#"[
            f"🔍 System Status: {health.status.title()} {health.status_emoji}",
            f"├── Active Processes: {health.process_count}/{self.config.critical_threshold} ""#;

        eprintln!("\n🐛 Issue #29 - Exact reproduction case:");
        eprintln!("Input text length: {} bytes", text.len());
        eprintln!("Text contains emojis: 🔍 at byte ~15, ├ at byte ~95");

        // Should not panic when truncating at byte 100
        eprintln!("\nAttempting truncation at byte 100...");
        let result = safe_truncate_str(text, 100);
        eprintln!("Truncated to {} bytes without panic!", result.len());
        eprintln!(
            "Result ends with: '{}'",
            &result[result.len().saturating_sub(20)..]
        );
        assert!(result.len() <= 100);
        assert!(text.starts_with(result));

        // Test display truncation
        let display = truncate_for_display(text, 100);
        eprintln!(
            "\nDisplay truncation result: {} bytes (includes '...' if truncated)",
            display.len()
        );
        assert!(display.len() <= 103); // 100 + "..."

        eprintln!("✅ Issue #29 fixed - no panic on emoji boundaries!");
    }
}