Skip to main content

atomcode_core/semantic/
mod.rs

1pub mod cache;
2pub mod language;
3
4use std::path::Path;
5
6use tree_sitter::{Query, QueryCursor, StreamingIterator};
7
8use cache::ASTCache;
9use language::{Lang, LanguageRegistry};
10
/// A named definition located in a source file.
///
/// Line numbers are 1-indexed. Byte offsets index into the text the symbol was
/// extracted from and are used as a `start_byte..end_byte` slice range.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Symbol {
    /// Identifier of the symbol (function name, class name, etc.).
    pub name: String,
    /// Line on which the definition starts (1-indexed).
    pub start_line: usize,
    /// Line on which the definition ends (1-indexed).
    pub end_line: usize,
    /// Byte offset in the source where the definition starts.
    pub start_byte: usize,
    /// Byte offset in the source where the definition ends (slice upper bound).
    pub end_byte: usize,
    /// Node kind from tree-sitter (e.g. "function_item", "class_definition"),
    /// or a synthetic label such as "element" or "sfc_section".
    pub kind: String,
}
27
28impl Symbol {
29    /// Check if this symbol has a Chinese name.
30    pub fn is_chinese(&self) -> bool {
31        contains_chinese(&self.name)
32    }
33
34    /// Check if this symbol looks like a Pinyin variable name.
35    pub fn is_pinyin(&self) -> bool {
36        is_pinyin_identifier(&self.name)
37    }
38
39    /// Check if this symbol is likely Chinese-related (Chinese name or Pinyin).
40    pub fn is_chinese_related(&self) -> bool {
41        self.is_chinese() || self.is_pinyin()
42    }
43}
44
/// Returns `true` when `c` lies in one of the CJK ideograph blocks.
///
/// Covers the CJK Unified Ideographs block, its Extensions A through H, and
/// both Compatibility Ideographs blocks. Block ranges are used as-is, so
/// unassigned code points inside a block also count.
fn is_chinese(c: char) -> bool {
    matches!(c,
        '\u{4E00}'..='\u{9FFF}'        // CJK Unified Ideographs
        | '\u{3400}'..='\u{4DBF}'      // Extension A
        | '\u{20000}'..='\u{2A6DF}'    // Extension B
        | '\u{2A700}'..='\u{2EBEF}'    // Extensions C, D, E, F (contiguous blocks)
        | '\u{30000}'..='\u{323AF}'    // Extensions G, H (contiguous blocks)
        | '\u{F900}'..='\u{FAFF}'      // CJK Compatibility Ideographs
        | '\u{2F800}'..='\u{2FA1F}'    // CJK Compatibility Ideographs Supplement
    )
}
55
56/// Check if a string contains Chinese characters.
57fn contains_chinese(s: &str) -> bool {
58    s.chars().any(is_chinese)
59}
60
/// Checks whether `s` looks like a Pinyin variable name (e.g., `yonghuMing`,
/// `dingdanList`).
///
/// The lowercased identifier is consumed from the start by greedy
/// longest-match against a table of common Pinyin syllables. It counts as
/// Pinyin when at least two syllables match and they cover more than 80% of
/// the identifier's bytes.
///
/// Returns `false` for the empty string, for identifiers that do not start
/// with an ASCII letter, and for identifiers whose non-ASCII characters keep
/// the syllables from matching. It never panics on multi-byte input.
fn is_pinyin_identifier(s: &str) -> bool {
    // Common Pinyin syllables found in Chinese variable names. The table is
    // deduplicated and SORTED, which `binary_search` below relies on.
    // Syllables that overlap with common English fragments (e.g., "ge", "tu",
    // "se", "he", "de", "le") are excluded to avoid false positives on
    // English identifiers like "getUser".
    const PINYIN_SYLLABLES: &[&str] = &[
        "ba", "bai", "bei", "biao", "chang", "chu", "da", "dan", "di", "ding",
        "dong", "duan", "duo", "er", "fen", "gao", "guo", "hao", "hou", "hu",
        "huai", "ji", "jian", "jiu", "kuai", "kuan", "leng", "li", "lie", "lu",
        "man", "miao", "ming", "mu", "nan", "nei", "nian", "qi", "qian", "re",
        "ren", "ri", "san", "shang", "shao", "shen", "shi", "shu", "si", "tian",
        "wai", "wan", "wen", "wu", "xi", "xia", "xiao", "xin", "xing", "yi",
        "yong", "you", "yue", "zhai", "zhong", "zuo",
    ];
    // Longest entry in the table ("chang", "shang", "zhong").
    const MAX_SYLLABLE_LEN: usize = 5;

    // Must start with an ASCII letter; this also rejects the empty string.
    let starts_with_letter = s
        .chars()
        .next()
        .map_or(false, |first| first.is_ascii_alphabetic());
    if !starts_with_letter {
        return false;
    }

    let lower = s.to_lowercase();
    let total = lower.len();

    // `pos` is both the scan position and the number of bytes consumed so far,
    // because matching is contiguous from the start of the identifier.
    let mut pos = 0usize;
    let mut syllable_count = 0usize;

    while pos < total {
        // Try the longest candidate first so a short syllable does not eat
        // into a longer one (e.g., "xi" inside "xiao").
        let longest = MAX_SYLLABLE_LEN.min(total - pos);
        let matched = (1..=longest).rev().find(|&len| {
            // `get` yields `None` when the range would split a multi-byte
            // character; such a candidate can never be an ASCII syllable.
            lower
                .get(pos..pos + len)
                .map_or(false, |candidate| PINYIN_SYLLABLES.binary_search(&candidate).is_ok())
        });

        match matched {
            Some(len) => {
                pos += len;
                syllable_count += 1;
            }
            None => break,
        }
    }

    // Require: (1) at least 2 syllable matches, AND (2) more than 80% coverage.
    syllable_count >= 2 && pos as f64 / total as f64 > 0.8
}
119
120/// Semantic code searcher: fuses Ripgrep speed with Tree-sitter precision.
121pub struct SemanticSearcher {
122    cache: ASTCache,
123}
124
125impl SemanticSearcher {
126    pub fn new() -> Self {
127        Self {
128            cache: ASTCache::new(),
129        }
130    }
131
132    /// List all top-level symbols in a file.
133    /// Returns function/class/struct signatures with line ranges.
134    pub fn list_symbols(&mut self, path: &Path) -> Option<Vec<Symbol>> {
135        let source = std::fs::read_to_string(path).ok()?;
136
137        let lang = LanguageRegistry::detect(path);
138
139        if let Some(lang) = lang {
140            let mut symbols = self.list_symbols_treesitter(path, &source, lang)?;
141
142            // Vue SFC: also parse <template> section with HTML parser
143            if lang.is_vue() {
144                if let Some(html_symbols) = self.list_vue_template_symbols(&source) {
145                    symbols.extend(html_symbols);
146                }
147            }
148
149            Some(symbols)
150        } else {
151            Some(self.list_symbols_indent(&source, path))
152        }
153    }
154
155    /// Extract a specific symbol (function/class) by name from a file.
156    /// Returns the complete source text of that symbol.
157    pub fn extract_symbol(&mut self, path: &Path, symbol_name: &str) -> Option<SymbolSlice> {
158        let source = std::fs::read_to_string(path).ok()?;
159        let lang = LanguageRegistry::detect(path)?;
160        let symbols = self.list_symbols_treesitter(path, &source, lang)?;
161
162        // Find the symbol with matching name
163        let sym = symbols.iter().find(|s| s.name == symbol_name)?;
164        let text = source[sym.start_byte..sym.end_byte].to_string();
165
166        Some(SymbolSlice {
167            name: sym.name.clone(),
168            kind: sym.kind.clone(),
169            start_line: sym.start_line,
170            end_line: sym.end_line,
171            start_byte: sym.start_byte,
172            end_byte: sym.end_byte,
173            text,
174        })
175    }
176
177    /// Generate a skeleton of a file: signatures only, bodies replaced with { ... }.
178    pub fn skeleton(&mut self, path: &Path) -> Option<String> {
179        let source = std::fs::read_to_string(path).ok()?;
180        let lang = LanguageRegistry::detect(path);
181
182        if let Some(lang) = lang {
183            self.skeleton_treesitter(path, &source, lang)
184        } else {
185            Some(self.skeleton_indent(&source, path))
186        }
187    }
188
189    /// Invalidate cache for a file (call after edit_file).
190    pub fn invalidate(&mut self, path: &Path) {
191        self.cache.invalidate(path);
192    }
193
194    /// Count ERROR nodes in source code. Language-agnostic.
195    /// Returns (error_count, first few error line numbers).
196    pub fn count_syntax_errors(&mut self, source: &str, path: &Path) -> (usize, Vec<usize>) {
197        let lang = match language::LanguageRegistry::detect(path) {
198            Some(l) => l,
199            None => return (0, vec![]),
200        };
201        let tree = match self.cache.parse_source(source, lang) {
202            Some(t) => t,
203            None => return (0, vec![]),
204        };
205
206        let mut errors = Vec::new();
207        Self::collect_errors(tree.root_node(), &mut errors);
208        let count = errors.len();
209        errors.truncate(5); // Only report first 5
210        (count, errors)
211    }
212
213    fn collect_errors(node: tree_sitter::Node, errors: &mut Vec<usize>) {
214        if node.is_error() || node.is_missing() {
215            errors.push(node.start_position().row + 1);
216        }
217        let mut cursor = node.walk();
218        if cursor.goto_first_child() {
219            loop {
220                Self::collect_errors(cursor.node(), errors);
221                if !cursor.goto_next_sibling() {
222                    break;
223                }
224            }
225        }
226    }
227
228    /// Find all call sites in a file that match a pattern (e.g., "tagRepository")
229    /// and report their line numbers and enclosing function.
230    ///
231    /// Language-agnostic: works on any tree-sitter supported language by searching
232    /// for method_invocation / call_expression nodes whose text contains the pattern.
233    ///
234    /// Used by auto_diagnose to give the model a complete list of similar call sites
235    /// when a stack trace points to one — preventing the "fix one, miss nine" pattern.
236    pub fn find_similar_calls(&mut self, path: &Path, pattern: &str) -> Option<String> {
237        let source = std::fs::read_to_string(path).ok()?;
238        let lang = LanguageRegistry::detect(path)?;
239        let tree = self.cache.parse_source(&source, lang)?;
240
241        let pattern_lower = pattern.to_lowercase();
242        let mut results: Vec<(usize, String, String)> = Vec::new(); // (line, call_text, enclosing_fn)
243
244        Self::walk_matching_calls(tree.root_node(), &source, &pattern_lower, &mut results, "");
245
246        if results.is_empty() {
247            return None;
248        }
249
250        let short_name = path
251            .file_name()
252            .map(|n| n.to_string_lossy().to_string())
253            .unwrap_or_else(|| path.to_string_lossy().to_string());
254
255        let mut out = format!(
256            "{} calls matching '{}' in {}:\n",
257            results.len(),
258            pattern,
259            short_name
260        );
261        for (line, call_text, func) in &results {
262            if func.is_empty() {
263                out.push_str(&format!("  L{}: {}\n", line, call_text));
264            } else {
265                out.push_str(&format!("  L{}: {} (in {})\n", line, call_text, func));
266            }
267        }
268        Some(out)
269    }
270
271    /// Walk AST to find call expressions matching a pattern.
272    fn walk_matching_calls(
273        node: tree_sitter::Node,
274        source: &str,
275        pattern: &str,
276        results: &mut Vec<(usize, String, String)>,
277        enclosing_fn: &str,
278    ) {
279        // Track enclosing function name
280        let mut current_fn = enclosing_fn.to_string();
281        let kind = node.kind();
282        if kind.contains("function") || kind.contains("method") || kind == "constructor_declaration"
283        {
284            if let Some(name_node) = node.child_by_field_name("name") {
285                current_fn = source[name_node.start_byte()..name_node.end_byte()].to_string();
286            }
287        }
288
289        // Match method_invocation (Java), call_expression (JS/TS/Python/Go/Rust)
290        if kind == "method_invocation" || kind == "call_expression" {
291            let call_text = &source[node.start_byte()..node.end_byte()];
292            // Truncate long call texts (keep first 80 chars)
293            let short = if call_text.len() > 80 {
294                let mut end = 77;
295                while !call_text.is_char_boundary(end) {
296                    end -= 1;
297                }
298                format!("{}...", &call_text[..end])
299            } else {
300                call_text.to_string()
301            };
302            // Remove newlines for display
303            let oneline = short.replace('\n', " ").replace("  ", " ");
304
305            if call_text.to_lowercase().contains(pattern) {
306                let line = node.start_position().row + 1;
307                results.push((line, oneline, current_fn.clone()));
308            }
309        }
310
311        // Recurse
312        let mut cursor = node.walk();
313        if cursor.goto_first_child() {
314            loop {
315                Self::walk_matching_calls(cursor.node(), source, pattern, results, &current_fn);
316                if !cursor.goto_next_sibling() {
317                    break;
318                }
319            }
320        }
321    }
322
323    /// Extract symbols from Vue <template> section using tree-sitter-html.
324    /// Returns key HTML elements as symbols so they appear in skeleton/file tree.
325    fn list_vue_template_symbols(&mut self, source: &str) -> Option<Vec<Symbol>> {
326        // Find <template> section
327        let template_start = source.find("<template")?;
328        let template_end = source.rfind("</template>")?;
329        if template_start >= template_end {
330            return None;
331        }
332
333        // Byte offset of <template> in the original file
334        let template_content_start = source[template_start..].find('>')? + template_start + 1;
335        let template_content = &source[template_content_start..template_end];
336
337        // Line offset: count newlines before template start
338        let line_offset = source[..template_content_start].lines().count();
339
340        // Parse with HTML grammar
341        let html_grammar = Lang::html_grammar();
342        let mut parser = tree_sitter::Parser::new();
343        parser.set_language(&html_grammar).ok()?;
344        let tree = parser.parse(template_content, None)?;
345
346        let query_str = Lang::Html.symbols_query();
347        let query = tree_sitter::Query::new(&html_grammar, query_str).ok()?;
348        let mut cursor = tree_sitter::QueryCursor::new();
349        let mut matches = cursor.matches(&query, tree.root_node(), template_content.as_bytes());
350
351        let name_idx = query.capture_index_for_name("name")?;
352        let def_idx = query.capture_index_for_name("definition")?;
353
354        let mut symbols = Vec::new();
355        let mut seen_lines = std::collections::HashSet::new();
356
357        while let Some(m) = matches.next() {
358            let name_cap = match m.captures.iter().find(|c| c.index == name_idx) {
359                Some(c) => c,
360                None => continue,
361            };
362            let def_cap = match m.captures.iter().find(|c| c.index == def_idx) {
363                Some(c) => c,
364                None => continue,
365            };
366            let name_node = name_cap.node;
367            let def_node = def_cap.node;
368
369            let tag_name = &template_content[name_node.start_byte()..name_node.end_byte()];
370            let start_line = def_node.start_position().row + line_offset;
371
372            // Skip common noise tags, keep structural/component elements
373            if matches!(
374                tag_name,
375                "div"
376                    | "span"
377                    | "p"
378                    | "a"
379                    | "li"
380                    | "ul"
381                    | "ol"
382                    | "br"
383                    | "hr"
384                    | "img"
385                    | "i"
386                    | "b"
387                    | "strong"
388                    | "em"
389                    | "small"
390                    | "label"
391                    | "input"
392                    | "option"
393                    | "thead"
394                    | "tbody"
395                    | "tr"
396                    | "td"
397                    | "th"
398            ) {
399                // Only keep div/span if they have interesting attributes
400                let line = template_content
401                    .lines()
402                    .nth(def_node.start_position().row)
403                    .unwrap_or("");
404                let has_vue_attr = line.contains("v-if")
405                    || line.contains("v-for")
406                    || line.contains("v-show")
407                    || line.contains("@click")
408                    || line.contains("v-model");
409                if !has_vue_attr {
410                    continue;
411                }
412            }
413
414            // Dedup by line
415            if !seen_lines.insert(start_line) {
416                continue;
417            }
418
419            let end_line = def_node.end_position().row + line_offset;
420            symbols.push(Symbol {
421                name: format!("<{}>", tag_name),
422                start_line,
423                end_line,
424                start_byte: def_node.start_byte() + template_content_start,
425                end_byte: def_node.end_byte() + template_content_start,
426                kind: "element".to_string(),
427            });
428
429            if symbols.len() >= 20 {
430                break;
431            } // Cap to avoid noise
432        }
433
434        if symbols.is_empty() {
435            None
436        } else {
437            Some(symbols)
438        }
439    }
440
441    // ── Tree-sitter implementation ──
442
443    fn list_symbols_treesitter(
444        &mut self,
445        path: &Path,
446        source: &str,
447        lang: Lang,
448    ) -> Option<Vec<Symbol>> {
449        // Vue/Svelte SFC: extract <script> section, parse as TypeScript, adjust offsets.
450        if lang == Lang::Vue {
451            return self.list_symbols_vue(path, source);
452        }
453
454        let tree = self.cache.parse_source(source, lang)?;
455        let query_src = lang.symbols_query();
456        let grammar = lang.grammar();
457        let query = Query::new(&grammar, query_src).ok()?;
458
459        let def_idx = query.capture_index_for_name("definition")?;
460        let name_idx = query.capture_index_for_name("name")?;
461
462        let mut cursor = QueryCursor::new();
463
464        let mut symbols = Vec::new();
465        let mut seen_ranges: std::collections::HashSet<(usize, usize)> =
466            std::collections::HashSet::new();
467
468        let mut matches = cursor.matches(&query, tree.root_node(), source.as_bytes());
469        loop {
470            matches.advance();
471            let m = match matches.get() {
472                Some(m) => m,
473                None => break,
474            };
475
476            let mut sym_name = None;
477            let mut def_start = 0usize;
478            let mut def_end = 0usize;
479            let mut def_start_row = 0usize;
480            let mut def_end_row = 0usize;
481            let mut def_kind = "";
482            let mut has_def = false;
483
484            for capture in m.captures {
485                if capture.index == name_idx {
486                    sym_name = Some(
487                        source[capture.node.start_byte()..capture.node.end_byte()].to_string(),
488                    );
489                }
490                if capture.index == def_idx {
491                    def_start = capture.node.start_byte();
492                    def_end = capture.node.end_byte();
493                    def_start_row = capture.node.start_position().row;
494                    def_end_row = capture.node.end_position().row;
495                    def_kind = capture.node.kind();
496                    has_def = true;
497                }
498            }
499
500            if let (Some(name), true) = (sym_name, has_def) {
501                let range = (def_start, def_end);
502                if seen_ranges.contains(&range) {
503                    continue;
504                }
505                seen_ranges.insert(range);
506
507                symbols.push(Symbol {
508                    name,
509                    start_line: def_start_row + 1,
510                    end_line: def_end_row + 1,
511                    start_byte: def_start,
512                    end_byte: def_end,
513                    kind: def_kind.to_string(),
514                });
515            }
516        }
517
518        Some(symbols)
519    }
520
521    fn skeleton_treesitter(&mut self, path: &Path, source: &str, lang: Lang) -> Option<String> {
522        let symbols = self.list_symbols_treesitter(path, source, lang)?;
523        let lines: Vec<&str> = source.lines().collect();
524        let mut out = String::new();
525
526        // Collect import/use lines at the top
527        for (i, line) in lines.iter().enumerate() {
528            let trimmed = line.trim();
529            if trimmed.starts_with("use ")
530                || trimmed.starts_with("import ")
531                || trimmed.starts_with("from ")
532                || trimmed.starts_with("#include")
533                || trimmed.starts_with("package ")
534                || trimmed.starts_with("require")
535            {
536                out.push_str(&format!("{:4}| {}\n", i + 1, line));
537            }
538        }
539
540        if !out.is_empty() {
541            out.push('\n');
542        }
543
544        for sym in &symbols {
545            // Get the first line (signature) of the symbol
546            let sig_line = if sym.start_line <= lines.len() {
547                lines[sym.start_line - 1]
548            } else {
549                &sym.name
550            };
551
552            let line_range = format!("L{}-{}", sym.start_line, sym.end_line);
553            let body_lines = sym.end_line - sym.start_line + 1;
554
555            out.push_str(&format!(
556                "{:4}| {}  {{ ... }}  // {} ({} lines)\n",
557                sym.start_line,
558                sig_line.trim_end(),
559                line_range,
560                body_lines
561            ));
562        }
563
564        Some(out)
565    }
566
567    // ── Vue/Svelte SFC support ──
568
569    /// Extract <script> section from a Vue/Svelte SFC, parse as TypeScript.
570    fn extract_script_section(source: &str) -> Option<(String, usize, usize)> {
571        // Find <script...> opening tag
572        let script_start = source.find("<script")?;
573        let tag_end = source[script_start..].find('>')? + script_start + 1;
574        // Find </script> closing tag
575        let script_end = source[tag_end..].find("</script>")? + tag_end;
576        let script_content = &source[tag_end..script_end];
577
578        // Calculate line offset: how many lines before the script content
579        let line_offset = source[..tag_end].lines().count();
580        let byte_offset = tag_end;
581
582        Some((script_content.to_string(), line_offset, byte_offset))
583    }
584
585    fn list_symbols_vue(&mut self, _path: &Path, source: &str) -> Option<Vec<Symbol>> {
586        let (script, line_offset, byte_offset) = Self::extract_script_section(source)?;
587        let tree = self.cache.parse_source(&script, Lang::Vue)?;
588        let query_src = Lang::Vue.symbols_query();
589        let grammar = Lang::Vue.grammar();
590        let query = Query::new(&grammar, query_src).ok()?;
591
592        let def_idx = query.capture_index_for_name("definition")?;
593        let name_idx = query.capture_index_for_name("name")?;
594
595        let mut cursor = QueryCursor::new();
596        let mut symbols = Vec::new();
597        let mut seen_ranges: std::collections::HashSet<(usize, usize)> =
598            std::collections::HashSet::new();
599
600        let mut matches = cursor.matches(&query, tree.root_node(), script.as_bytes());
601        loop {
602            matches.advance();
603            let m = match matches.get() {
604                Some(m) => m,
605                None => break,
606            };
607
608            let mut sym_name = None;
609            let mut def_start = 0usize;
610            let mut def_end = 0usize;
611            let mut def_start_row = 0usize;
612            let mut def_end_row = 0usize;
613            let mut def_kind = "";
614            let mut has_def = false;
615
616            for capture in m.captures {
617                if capture.index == name_idx {
618                    sym_name = Some(
619                        script[capture.node.start_byte()..capture.node.end_byte()].to_string(),
620                    );
621                }
622                if capture.index == def_idx {
623                    def_start = capture.node.start_byte();
624                    def_end = capture.node.end_byte();
625                    def_start_row = capture.node.start_position().row;
626                    def_end_row = capture.node.end_position().row;
627                    def_kind = capture.node.kind();
628                    has_def = true;
629                }
630            }
631
632            if let (Some(name), true) = (sym_name, has_def) {
633                let range = (def_start, def_end);
634                if seen_ranges.contains(&range) {
635                    continue;
636                }
637                seen_ranges.insert(range);
638
639                symbols.push(Symbol {
640                    name,
641                    // Adjust line/byte offsets to be relative to the full .vue file
642                    start_line: def_start_row + line_offset,
643                    end_line: def_end_row + line_offset,
644                    start_byte: def_start + byte_offset,
645                    end_byte: def_end + byte_offset,
646                    kind: def_kind.to_string(),
647                });
648            }
649        }
650
651        // Add SFC section boundaries (<template>/<script>/<style>) as pseudo-symbols.
652        // This lets the skeleton show where each section lives, so the model can
653        // target-read the right section (e.g., template for HTML, script for logic).
654        let lines: Vec<&str> = source.lines().collect();
655        for (i, line) in lines.iter().enumerate() {
656            let trimmed = line.trim();
657            if trimmed.starts_with("<template")
658                || trimmed.starts_with("<script")
659                || trimmed.starts_with("<style")
660            {
661                let tag = if trimmed.starts_with("<template") {
662                    "template"
663                } else if trimmed.starts_with("<script") {
664                    "script"
665                } else {
666                    "style"
667                };
668                let close_tag = format!("</{}>", tag);
669                let end_line = lines[i..]
670                    .iter()
671                    .position(|l| l.trim().starts_with(&close_tag))
672                    .map(|p| i + p + 1)
673                    .unwrap_or(lines.len());
674                let start_byte = lines[..i].iter().map(|l| l.len() + 1).sum::<usize>();
675                let end_byte = lines[..end_line].iter().map(|l| l.len() + 1).sum::<usize>();
676                symbols.push(Symbol {
677                    name: format!("<{}>", tag),
678                    start_line: i + 1,
679                    end_line,
680                    start_byte,
681                    end_byte,
682                    kind: "sfc_section".to_string(),
683                });
684            }
685        }
686
687        symbols.sort_by_key(|s| s.start_line);
688        Some(symbols)
689    }
690
691    // ── File-type-aware fallback for languages without tree-sitter ──
692    //
693    // Single source of truth for skeleton generation of CSS/HTML/JSON/YAML/Markdown
694    // and code files without tree-sitter support. read.rs has ZERO file-type logic.
695
696    fn list_symbols_indent(&self, source: &str, path: &Path) -> Vec<Symbol> {
697        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
698        let lines: Vec<&str> = source.lines().collect();
699
700        match ext {
701            "css" | "scss" | "less" | "sass" => self.list_symbols_css(&lines),
702            "html" | "htm" => self.list_symbols_html(&lines),
703            "json" => self.list_symbols_json(&lines),
704            "yaml" | "yml" | "toml" => self.list_symbols_yaml(&lines),
705            "md" | "mdx" => self.list_symbols_markdown(&lines),
706            _ => self.list_symbols_code_indent(&lines),
707        }
708    }
709
710    /// CSS/SCSS: :root, @rules, comment headers, top-level selectors
711    fn list_symbols_css(&self, lines: &[&str]) -> Vec<Symbol> {
712        let mut symbols = Vec::new();
713        for (i, line) in lines.iter().enumerate() {
714            let trimmed = line.trim();
715            if trimmed.is_empty() {
716                continue;
717            }
718            let indent = line.len() - line.trim_start().len();
719            let is_match = trimmed.starts_with(":root")
720                || trimmed.starts_with("@keyframes")
721                || trimmed.starts_with("@media")
722                || trimmed.starts_with("@layer")
723                || trimmed.starts_with("@import")
724                || trimmed.starts_with("@font-face")
725                || trimmed.starts_with("/* ===")
726                || trimmed.starts_with("/* ---")
727                || trimmed.starts_with("/* ***")
728                || (indent == 0 && trimmed.starts_with('.') && trimmed.contains('{'))
729                || (indent == 0 && trimmed.starts_with('#') && trimmed.contains('{'));
730
731            if is_match {
732                // Find the block end (matching closing brace)
733                let end = find_block_end(lines, i);
734                let name = trimmed
735                    .split('{')
736                    .next()
737                    .unwrap_or(trimmed)
738                    .trim()
739                    .to_string();
740                symbols.push(make_symbol(name, "css_rule", i, end, lines));
741            }
742        }
743        symbols
744    }
745
746    /// HTML: structural tags
747    fn list_symbols_html(&self, lines: &[&str]) -> Vec<Symbol> {
748        let mut symbols = Vec::new();
749        let tags = [
750            "<head",
751            "<body",
752            "<header",
753            "<main",
754            "<footer",
755            "<nav",
756            "<section",
757            "<article",
758            "<!DOCTYPE",
759        ];
760        for (i, line) in lines.iter().enumerate() {
761            let trimmed = line.trim();
762            if tags.iter().any(|t| trimmed.starts_with(t)) {
763                let name = trimmed
764                    .split(|c: char| c == '>' || c == ' ')
765                    .next()
766                    .unwrap_or(trimmed)
767                    .to_string();
768                symbols.push(make_symbol(name, "html_tag", i, i + 1, lines));
769            }
770        }
771        symbols
772    }
773
774    /// JSON: top-level keys
775    fn list_symbols_json(&self, lines: &[&str]) -> Vec<Symbol> {
776        let mut symbols = Vec::new();
777        for (i, line) in lines.iter().enumerate() {
778            let trimmed = line.trim();
779            let indent = line.len() - line.trim_start().len();
780            // Top-level keys: indent ≤ 2, starts with "
781            if indent <= 2 && trimmed.starts_with('"') && trimmed.contains(':') {
782                let name = trimmed
783                    .split(':')
784                    .next()
785                    .unwrap_or(trimmed)
786                    .trim_matches('"')
787                    .trim()
788                    .to_string();
789                symbols.push(make_symbol(name, "json_key", i, i + 1, lines));
790            }
791        }
792        symbols
793    }
794
795    /// YAML/TOML: top-level keys
796    fn list_symbols_yaml(&self, lines: &[&str]) -> Vec<Symbol> {
797        let mut symbols = Vec::new();
798        for (i, line) in lines.iter().enumerate() {
799            let trimmed = line.trim();
800            let indent = line.len() - line.trim_start().len();
801            if indent == 0
802                && !trimmed.is_empty()
803                && !trimmed.starts_with('#')
804                && !trimmed.starts_with("---")
805            {
806                let name = trimmed
807                    .split(':')
808                    .next()
809                    .unwrap_or(trimmed)
810                    .trim()
811                    .to_string();
812                if !name.is_empty() {
813                    symbols.push(make_symbol(name, "yaml_key", i, i + 1, lines));
814                }
815            }
816        }
817        symbols
818    }
819
820    /// Markdown: headings
821    fn list_symbols_markdown(&self, lines: &[&str]) -> Vec<Symbol> {
822        let mut symbols = Vec::new();
823        for (i, line) in lines.iter().enumerate() {
824            let trimmed = line.trim();
825            if trimmed.starts_with('#') {
826                let name = trimmed.trim_start_matches('#').trim().to_string();
827                // Find next heading or end
828                let end = lines[i + 1..]
829                    .iter()
830                    .position(|l| l.trim().starts_with('#'))
831                    .map(|p| i + 1 + p)
832                    .unwrap_or(lines.len());
833                symbols.push(make_symbol(name, "heading", i, end, lines));
834            }
835        }
836        symbols
837    }
838
839    /// Code files: indent-level-0 definitions (fn/class/def/etc.)
840    fn list_symbols_code_indent(&self, lines: &[&str]) -> Vec<Symbol> {
841        let mut symbols = Vec::new();
842
843        // Pass 1: Extract Chinese variable assignments at any indent level.
844        // This runs independently of the definition block detection below,
845        // ensuring variables inside function bodies are also captured.
846        for (i, line) in lines.iter().enumerate() {
847            let trimmed = line.trim();
848            if trimmed.is_empty() {
849                continue;
850            }
851            let indent = line.len() - line.trim_start().len();
852            if indent <= 8 && contains_chinese(trimmed) {
853                if let Some(eq_pos) = trimmed.find('=') {
854                    let var_name = trimmed[..eq_pos].trim();
855                    if contains_chinese(var_name) && !var_name.contains(' ') {
856                        symbols.push(make_symbol(
857                            var_name.to_string(),
858                            "chinese_variable",
859                            i,
860                            i + 1,
861                            lines,
862                        ));
863                    }
864                }
865            }
866        }
867
868        // Pass 2: Extract indent-level-0 definition blocks (fn/class/def/etc.)
869        let mut i = 0;
870        while i < lines.len() {
871            let line = lines[i];
872            let trimmed = line.trim();
873
874            if trimmed.is_empty() || trimmed.starts_with("//") || trimmed.starts_with('#') {
875                i += 1;
876                continue;
877            }
878
879            let indent = line.len() - line.trim_start().len();
880            if indent == 0 && !trimmed.starts_with('}') && !trimmed.starts_with(')') {
881                let is_def = trimmed.starts_with("fn ")
882                    || trimmed.starts_with("pub ")
883                    || trimmed.starts_with("def ")
884                    || trimmed.starts_with("class ")
885                    || trimmed.starts_with("function ")
886                    || trimmed.starts_with("func ")
887                    || trimmed.starts_with("type ")
888                    || trimmed.starts_with("struct ")
889                    || trimmed.starts_with("enum ")
890                    || trimmed.starts_with("interface ")
891                    || trimmed.starts_with("impl ")
892                    || trimmed.starts_with("trait ")
893                    || trimmed.starts_with("const ")
894                    || trimmed.starts_with("export ")
895                    || trimmed.starts_with("async ")
896                    || trimmed.starts_with("public ")
897                    || trimmed.starts_with("private ")
898                    || trimmed.starts_with("protected ");
899
900                if is_def {
901                    let start = i;
902                    let mut end = i + 1;
903                    while end < lines.len() {
904                        let next = lines[end];
905                        let next_trimmed = next.trim();
906                        if next_trimmed.is_empty() {
907                            end += 1;
908                            continue;
909                        }
910                        let next_indent = next.len() - next.trim_start().len();
911                        if next_indent == 0 && !next_trimmed.starts_with('}') {
912                            break;
913                        }
914                        end += 1;
915                    }
916                    if end < lines.len() && lines[end].trim() == "}" {
917                        end += 1;
918                    }
919
920                    let name = extract_indent_name(trimmed);
921                    symbols.push(make_symbol(name, "indent_block", start, end, lines));
922
923                    i = end;
924                    continue;
925                }
926            }
927
928            i += 1;
929        }
930
931        symbols
932    }
933
934    fn skeleton_indent(&self, source: &str, path: &Path) -> String {
935        let symbols = self.list_symbols_indent(source, path);
936        let lines: Vec<&str> = source.lines().collect();
937        let mut out = String::new();
938
939        for sym in &symbols {
940            if sym.start_line <= lines.len() {
941                let sig = lines[sym.start_line - 1];
942                let body_lines = sym.end_line - sym.start_line + 1;
943                out.push_str(&format!(
944                    "{:4}| {}  // L{}-{} ({} lines)\n",
945                    sym.start_line,
946                    sig.trim_end(),
947                    sym.start_line,
948                    sym.end_line,
949                    body_lines
950                ));
951            }
952        }
953
954        out
955    }
956}
957
/// A precise slice of source code for a single symbol.
///
/// The positional fields carry the same names as those on `Symbol`.
/// NOTE(review): the code that builds this struct is not visible here;
/// presumably lines are 1-indexed and offsets are byte offsets as on
/// `Symbol` — confirm at the construction site.
#[derive(Debug, Clone)]
pub struct SymbolSlice {
    /// Name of the symbol this slice belongs to.
    pub name: String,
    /// Kind label of the symbol (presumably the same value as `Symbol::kind`).
    pub kind: String,
    /// First line of the slice.
    pub start_line: usize,
    /// Last line of the slice.
    pub end_line: usize,
    /// Offset in the source where the slice starts.
    pub start_byte: usize,
    /// Offset in the source where the slice ends.
    pub end_byte: usize,
    /// The extracted source text of the symbol.
    pub text: String,
}
969
970/// Create a Symbol from line indices.
971fn make_symbol(name: String, kind: &str, start: usize, end: usize, lines: &[&str]) -> Symbol {
972    let start_byte = lines[..start].iter().map(|l| l.len() + 1).sum::<usize>();
973    let end_byte = lines[..end].iter().map(|l| l.len() + 1).sum::<usize>();
974    Symbol {
975        name,
976        start_line: start + 1,
977        end_line: end,
978        start_byte,
979        end_byte,
980        kind: kind.to_string(),
981    }
982}
983
/// Find the end of a CSS block starting at `start` (matching closing brace).
///
/// Scans forward from line `start`, counting `{` and `}` per line, and
/// returns the exclusive end index (last line index + 1) of the block:
///
/// * once a `{` has been seen, the block ends on the first line after which
///   the brace depth is back to zero or below — including the start line
///   itself for one-line rules such as `a { color: red; }`;
/// * if no `{` has appeared by the line after `start` (the opening brace may
///   sit on its own line), the scan gives up there and returns `start + 2`;
/// * if the block is never closed, or `start` is out of range, the result is
///   `start + 1` capped at `lines.len()`.
///
/// Braces inside strings or comments are counted like any other brace.
fn find_block_end(lines: &[&str], start: usize) -> usize {
    let mut depth = 0i32;
    let mut opened = false;

    for (i, line) in lines.iter().enumerate().skip(start) {
        for ch in line.chars() {
            match ch {
                '{' => {
                    depth += 1;
                    opened = true;
                }
                '}' => depth -= 1,
                _ => {}
            }
        }

        if opened {
            if depth <= 0 {
                return i + 1;
            }
        } else if i > start {
            // No brace on the start line or the one after it.
            return i + 1;
        }
    }

    (start + 1).min(lines.len())
}
1002
/// Extract a plausible name from an indent-level-0 definition line.
///
/// The first whitespace-separated token is always treated as the introducing
/// keyword and skipped. Of the remaining tokens, further declaration
/// keywords and modifiers (`fn`, `struct`, `static`, `default`, ...) are
/// skipped as well, so `pub fn hello() {` is named after `hello` rather than
/// `fn`. The first token that then starts with a letter or `_` is returned
/// after stripping leading `*` and trailing `(`, `{`, `:` or `<` characters.
///
/// Only *trailing* punctuation is stripped, so an argument list that closes
/// within the same token is kept (`def hello():` yields `hello()`).
///
/// Falls back to the first token, or `"unknown"` for a blank line.
fn extract_indent_name(line: &str) -> String {
    /// Tokens that introduce or modify a declaration but are never its name.
    const DECL_KEYWORDS: &[&str] = &[
        "pub",
        "fn",
        "def",
        "class",
        "function",
        "func",
        "type",
        "struct",
        "enum",
        "interface",
        "impl",
        "trait",
        "const",
        "export",
        "default",
        "async",
        "public",
        "private",
        "protected",
        "static",
        "abstract",
        "final",
        "unsafe",
        "extern",
        "mod",
    ];

    let mut tokens = line.split_whitespace();
    let first = match tokens.next() {
        Some(tok) => tok,
        None => return "unknown".to_string(),
    };

    for tok in tokens {
        // Strip common suffixes: (, {, :, <
        let clean = tok
            .trim_start_matches('*')
            .trim_end_matches(|c: char| "({:<".contains(c));
        if DECL_KEYWORDS.contains(&clean) {
            continue;
        }
        if clean
            .chars()
            .next()
            .map_or(false, |c| c.is_alphabetic() || c == '_')
        {
            return clean.to_string();
        }
    }

    first.to_string()
}
1026
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Writes `source` to a fresh temporary file whose name ends in `suffix`.
    ///
    /// The caller must keep the returned handle alive for as long as the
    /// file's path is in use.
    fn temp_source(suffix: &str, source: &str) -> tempfile::NamedTempFile {
        let mut tmp = tempfile::NamedTempFile::with_suffix(suffix).unwrap();
        tmp.write_all(source.as_bytes()).unwrap();
        tmp
    }

    /// Runs `list_symbols` on `source` stored under `suffix` and returns the
    /// symbol names in the order they were reported.
    fn symbol_names(suffix: &str, source: &str) -> Vec<String> {
        let tmp = temp_source(suffix, source);
        let mut searcher = SemanticSearcher::new();
        let symbols = searcher.list_symbols(tmp.path()).unwrap();
        symbols.iter().map(|s| s.name.clone()).collect()
    }

    /// Asserts that every name in `expected` occurs in `names`.
    #[track_caller]
    fn assert_has_symbols(names: &[String], expected: &[&str]) {
        for want in expected {
            assert!(
                names.iter().any(|n| n.as_str() == *want),
                "missing {:?} in symbols: {:?}",
                want,
                names
            );
        }
    }

    /// Builds a single-line `Symbol` with the given name; only the name
    /// matters to the Chinese/Pinyin classification helpers.
    fn named_symbol(name: &str) -> Symbol {
        Symbol {
            name: name.to_string(),
            start_line: 1,
            end_line: 1,
            start_byte: 0,
            end_byte: name.len(),
            kind: "variable".to_string(),
        }
    }

    #[test]
    fn test_language_detection() {
        let cases = [
            ("foo.rs", Some(Lang::Rust)),
            ("bar.py", Some(Lang::Python)),
            ("baz.js", Some(Lang::JavaScript)),
            ("qux.ts", Some(Lang::TypeScript)),
            ("main.go", Some(Lang::Go)),
            ("App.java", Some(Lang::Java)),
            ("main.c", Some(Lang::C)),
            ("main.cpp", Some(Lang::Cpp)),
            ("Program.cs", Some(Lang::CSharp)),
            ("index.php", Some(Lang::Php)),
            ("readme.md", None),
        ];
        for (file, expected) in cases.iter() {
            assert_eq!(
                &LanguageRegistry::detect(Path::new(file)),
                expected,
                "file: {}",
                file
            );
        }
    }

    #[test]
    fn test_list_symbols_rust() {
        let source = r#"
pub fn hello() {
    println!("hello");
}

pub struct Point {
    x: f64,
    y: f64,
}

impl Point {
    pub fn new(x: f64, y: f64) -> Self {
        Self { x, y }
    }
}
"#;
        let names = symbol_names(".rs", source);
        assert_has_symbols(&names, &["hello", "Point"]);
    }

    #[test]
    fn test_extract_symbol_rust() {
        let source = r#"pub fn add(a: i32, b: i32) -> i32 {
    a + b
}

pub fn sub(a: i32, b: i32) -> i32 {
    a - b
}
"#;
        let tmp = temp_source(".rs", source);
        let mut searcher = SemanticSearcher::new();

        let slice = searcher.extract_symbol(tmp.path(), "add").unwrap();
        assert!(slice.text.contains("a + b"), "text: {}", slice.text);
        assert!(!slice.text.contains("a - b"), "should not contain sub");
    }

    #[test]
    fn test_skeleton_rust() {
        let source = r#"use std::io;

pub fn hello() {
    println!("hello");
}

pub fn world() {
    println!("world");
}
"#;
        let tmp = temp_source(".rs", source);
        let mut searcher = SemanticSearcher::new();

        let skel = searcher.skeleton(tmp.path()).unwrap();
        for expected in ["hello", "world", "use std::io"].iter() {
            assert!(skel.contains(expected), "skeleton: {}", skel);
        }
    }

    #[test]
    fn test_list_symbols_python() {
        let source = r#"
def greet(name):
    print(f"hello {name}")

class Calculator:
    def add(self, a, b):
        return a + b
"#;
        let names = symbol_names(".py", source);
        assert_has_symbols(&names, &["greet", "Calculator"]);
    }

    #[test]
    fn test_list_symbols_csharp() {
        let source = r#"
class Program {
    Program() {}

    public static void Main(string[] args) {
    }
}

interface IGreeter {
    void Greet();
}
"#;
        let names = symbol_names(".cs", source);
        assert_has_symbols(&names, &["Program", "Main", "IGreeter"]);
    }

    #[test]
    fn test_list_symbols_php() {
        let source = r#"
<?php

class Calculator {
    public function add($a, $b) {
        return $a + $b;
    }
}

function greet($name) {
    return "Hello, $name";
}

interface Printable {
    public function print();
}
"#;
        let names = symbol_names(".php", source);
        assert_has_symbols(&names, &["Calculator", "add", "greet", "Printable"]);
    }

    #[test]
    fn test_indent_fallback() {
        let source = r#"
def hello():
    print("hello")

def world():
    print("world")
"#;
        // Use .txt extension so no grammar is detected
        let names = symbol_names(".txt", source);
        assert_has_symbols(&names, &["hello()"]);
    }

    #[test]
    fn test_chinese_character_detection() {
        assert!(is_chinese('中'));
        assert!(is_chinese('文'));
        assert!(!is_chinese('a'));
        assert!(!is_chinese('1'));
        assert!(!is_chinese('_'));
    }

    #[test]
    fn test_contains_chinese() {
        assert!(contains_chinese("用户名"));
        assert!(contains_chinese("hello世界"));
        assert!(!contains_chinese("hello"));
        assert!(!contains_chinese("123"));
    }

    #[test]
    fn test_pinyin_identifier_detection() {
        // Valid Pinyin identifiers
        for name in ["yonghuMing", "dingdanList", "zhongguoRen", "wenjianMuLu"].iter() {
            assert!(is_pinyin_identifier(name), "expected pinyin: {}", name);
        }

        // Invalid Pinyin identifiers
        for name in ["hello", "getUser", "", "123"].iter() {
            assert!(!is_pinyin_identifier(name), "expected non-pinyin: {:?}", name);
        }
    }

    #[test]
    fn test_symbol_chinese_detection() {
        let sym = named_symbol("用户名");
        assert!(sym.is_chinese());
        assert!(!sym.is_pinyin());
        assert!(sym.is_chinese_related());

        let sym_pinyin = named_symbol("yonghuMing");
        assert!(!sym_pinyin.is_chinese());
        assert!(sym_pinyin.is_pinyin());
        assert!(sym_pinyin.is_chinese_related());

        let sym_english = named_symbol("getUser");
        assert!(!sym_english.is_chinese());
        assert!(!sym_english.is_pinyin());
        assert!(!sym_english.is_chinese_related());
    }

    #[test]
    fn test_chinese_variable_extraction() {
        let source = r#"用户名 = "张三"
年龄 = 25
def get_user():
    return 用户名
"#;
        let names = symbol_names(".txt", source);
        assert_has_symbols(&names, &["用户名"]);
    }

    #[test]
    fn test_mixed_chinese_english_detection() {
        // Mixed identifiers: English prefix + Chinese suffix
        assert!(contains_chinese("getUser用户名"));
        assert!(contains_chinese("query_订单列表"));
        assert!(contains_chinese("test数据"));
        assert!(contains_chinese("order详情"));

        // Mixed identifiers should be detected as Chinese-related
        assert!(named_symbol("getUser用户名").is_chinese_related());
        assert!(named_symbol("query_订单列表").is_chinese_related());

        // Pure English should NOT be detected
        assert!(!contains_chinese("getUser"));
        assert!(!contains_chinese("queryOrderList"));
    }

    #[test]
    fn test_mixed_content_extraction() {
        let source = r#"getUser用户名 = "张三"
query_订单列表 = []
test数据 = 42
"#;
        let names = symbol_names(".txt", source);
        assert_has_symbols(&names, &["getUser用户名", "query_订单列表", "test数据"]);
    }

    #[test]
    fn test_chinese_variable_nested_indent() {
        // Chinese variables inside nested blocks (indent > 0) should be extracted
        let source = r#"def process():
    用户名 = "张三"
    订单列表 = []
    if True:
        配置项 = "value"
"#;
        let names = symbol_names(".txt", source);
        assert_has_symbols(&names, &["用户名", "订单列表", "配置项"]);
    }
}