Skip to main content

seshat_scanner/parser/
mod.rs

1//! Parser trait, language dispatch, and content hashing.
2//!
3//! The [`Parser`] trait defines the interface all language parsers implement.
4//! [`parse_file`] dispatches to the correct parser based on [`Language`],
5//! and computes the SHA-256 content hash in shared code so individual parsers
6//! do not duplicate that logic.
7
8mod javascript_parser;
9mod python_parser;
10mod rust_parser;
11mod typescript_parser;
12
13use std::collections::{HashSet, VecDeque};
14use std::path::Path;
15
16use seshat_core::{FunctionCall, Language, ProjectFile};
17use sha2::{Digest, Sha256};
18use tree_sitter::Node;
19
20use crate::ScanError;
21use javascript_parser::JavaScriptParser;
22use python_parser::PythonParser;
23use rust_parser::RustParser;
24use seshat_core::ir::DependencyUsage;
25use typescript_parser::TypeScriptParser;
26
27/// Common trait for all language parsers.
28///
29/// Implementations extract imports, exports, functions, types, and
30/// language-specific IR from source code. Content hashing is handled
31/// by the shared [`parse_file`] function — parsers should **not**
32/// compute the hash themselves.
33pub trait Parser {
34    /// Parse source code at `path` into a [`ProjectFile`].
35    ///
36    /// The `content_hash` field on the returned `ProjectFile` may be left
37    /// empty; [`parse_file`] will overwrite it with the SHA-256 hash.
38    fn parse(&self, path: &Path, source: &str) -> Result<ProjectFile, ScanError>;
39}
40
41// ---------------------------------------------------------------------------
42// Shared tree-sitter helpers used by all language parsers.
43// ---------------------------------------------------------------------------
44
45/// Extract UTF-8 text for a tree-sitter node from source bytes.
46///
47/// Returns `""` if the node's byte range is not valid UTF-8.
48pub(super) fn node_text<'a>(node: &Node, source: &'a [u8]) -> &'a str {
49    node.utf8_text(source).unwrap_or("")
50}
51
52/// Find the first direct child of `node` whose `kind()` equals `kind`.
53pub(super) fn find_child_node<'a>(node: &'a Node, kind: &str) -> Option<Node<'a>> {
54    (0..node.child_count())
55        .filter_map(|i| node.child(i as u32))
56        .find(|c| c.kind() == kind)
57}
58
59/// Find the first child of `kind` and return its text as an owned `String`.
60pub(super) fn find_child_text(node: &Node, kind: &str, source: &[u8]) -> Option<String> {
61    find_child_node(node, kind).map(|n| node_text(&n, source).to_string())
62}
63
64/// Check whether `node` has any direct child whose `kind()` equals `kind`.
65pub(super) fn has_child_kind(node: &Node, kind: &str) -> bool {
66    find_child_node(node, kind).is_some()
67}
68
69// ---------------------------------------------------------------------------
70// Call-site shared helpers (used by all language parsers)
71// ---------------------------------------------------------------------------
72
/// Upper bound on the number of function-call entries recorded for one file.
pub(crate) const MAX_FUNCTION_CALLS_PER_FILE: usize = 500;

/// How many lines of context a snippet shows **above** the first line of a call.
pub(crate) const CALL_SNIPPET_LINES_BEFORE: usize = 2;

/// How many lines of context a snippet shows **below** the last line of a call.
pub(crate) const CALL_SNIPPET_LINES_AFTER: usize = 4;

/// Hard limit on the total number of lines in a single call-site snippet.
pub(crate) const CALL_SNIPPET_MAX_LINES: usize = 30;
84
85/// Build a context snippet around a call-site from a pre-split line slice.
86///
87/// Layout:
88/// ```text
89/// [CALL_SNIPPET_LINES_BEFORE lines before `line`]
90/// [all lines of the call expression: `line` ..= `end_line`]
91/// [CALL_SNIPPET_LINES_AFTER lines after `end_line`]
92/// ```
93///
94/// The total is capped at `CALL_SNIPPET_MAX_LINES`.
95/// Lines are taken verbatim from `source_lines` (original indentation preserved).
96pub fn build_call_snippet_from_lines(
97    source_lines: &[&str],
98    line: usize,
99    end_line: usize,
100) -> String {
101    let total = source_lines.len();
102    if total == 0 || line == 0 || end_line == 0 {
103        return String::new();
104    }
105
106    // Convert to 0-indexed, clamp to file bounds.
107    let call_start_0 = (line - 1).min(total - 1);
108    let call_end_0 = (end_line - 1).min(total - 1);
109    // Guard against inverted spans (tree-sitter error-recovery nodes).
110    let call_end_0 = call_end_0.max(call_start_0);
111
112    let snippet_start = call_start_0.saturating_sub(CALL_SNIPPET_LINES_BEFORE);
113    let snippet_end_uncapped = (call_end_0 + CALL_SNIPPET_LINES_AFTER + 1).min(total);
114
115    // Hard cap: never exceed CALL_SNIPPET_MAX_LINES total.
116    let snippet_end = snippet_end_uncapped.min(snippet_start + CALL_SNIPPET_MAX_LINES);
117
118    source_lines[snippet_start..snippet_end].join("\n")
119}
120
121/// Convenience wrapper: splits `source` into lines and delegates to
122/// [`build_call_snippet_from_lines`].  Use the `_from_lines` variant directly
123/// when building many snippets from the same file to avoid repeated allocation.
124pub fn build_call_snippet(source: &str, line: usize, end_line: usize) -> String {
125    let lines: Vec<&str> = source.lines().collect();
126    build_call_snippet_from_lines(&lines, line, end_line)
127}
128
129/// Walk the entire syntax tree (BFS) collecting function call nodes.
130///
131/// `call_kind`: tree-sitter node kind to match.
132///   - `"call_expression"` for Rust, TypeScript, JavaScript
133///   - `"call"` for Python
134///
135/// `skip_kinds`: node kinds to prune entirely (no descent into their children).
136///   Pass `&["token_tree"]` for Rust; pass `&[]` for other languages.
137///
138/// `extract_fn`: language-specific closure that extracts a [`FunctionCall`] from a
139/// matched node.  Receives `(node, source, source_lines)` — `source_lines` is the
140/// pre-split line slice so snippet builders don't re-allocate it per call.
141/// Returns `None` for nodes that should be skipped.
142///
143/// Deduplicates by callee name via a `HashSet` (first occurrence wins, O(1) lookup).
144/// Stops enqueuing new children as soon as `MAX_FUNCTION_CALLS_PER_FILE` is reached.
145pub fn collect_calls_bfs<F>(
146    root: &tree_sitter::Node,
147    source: &str,
148    call_kind: &str,
149    skip_kinds: &[&str],
150    extract_fn: F,
151    out: &mut Vec<FunctionCall>,
152) where
153    F: Fn(&tree_sitter::Node, &str, &[&str]) -> Option<FunctionCall>,
154{
155    // Split lines once for the entire file; passed to every extract_fn call.
156    let source_lines: Vec<&str> = source.lines().collect();
157
158    let mut seen: HashSet<String> = HashSet::new();
159    let mut queue: VecDeque<(tree_sitter::Node, usize)> = VecDeque::new();
160    for i in 0..root.child_count() {
161        if let Some(child) = root.child(i as u32) {
162            queue.push_back((child, 0));
163        }
164    }
165
166    const MAX_DEPTH: usize = 60;
167
168    while let Some((node, depth)) = queue.pop_front() {
169        // Hard stop: don't enqueue more children once the cap is reached.
170        if out.len() >= MAX_FUNCTION_CALLS_PER_FILE {
171            break;
172        }
173        if depth > MAX_DEPTH {
174            continue;
175        }
176
177        // Language-specific subtrees to skip entirely (no descent).
178        if skip_kinds.contains(&node.kind()) {
179            continue;
180        }
181
182        if node.kind() == call_kind {
183            if let Some(call) = extract_fn(&node, source, &source_lines) {
184                // O(1) dedup via HashSet.
185                if seen.insert(call.callee.clone()) {
186                    out.push(call);
187                }
188            }
189            // Still recurse into call children (nested calls).
190        }
191
192        for i in 0..node.child_count() {
193            if let Some(child) = node.child(i as u32) {
194                queue.push_back((child, depth + 1));
195            }
196        }
197    }
198}
199
200// ---------------------------------------------------------------------------
201// Doc-comment extraction helpers (shared across parsers)
202// ---------------------------------------------------------------------------
203
204/// Collect consecutive leading `///` doc-comment lines immediately preceding
205/// a Rust AST node.
206///
207/// Walks **backwards** from the node's previous named sibling, collecting
208/// adjacent `line_comment` nodes whose text starts with `///`. Returns them
209/// joined as a single string (with `///` prefix stripped), or `None` if no
210/// doc comments were found.
211pub(super) fn collect_rust_doc_comment(node: &Node, source: &[u8]) -> Option<String> {
212    let mut comments: Vec<String> = Vec::new();
213    let mut current = node.prev_sibling();
214    while let Some(prev) = current {
215        match prev.kind() {
216            "line_comment" => {
217                let text = node_text(&prev, source);
218                if let Some(doc) = text.strip_prefix("///") {
219                    comments.push(doc.trim().to_owned());
220                    current = prev.prev_sibling();
221                    continue;
222                }
223                break;
224            }
225            // #[derive(...)], #[cfg(test)], etc. sit between `///` lines and the
226            // item definition in Rust syntax — skip them and keep walking back.
227            "attribute_item" => {
228                current = prev.prev_sibling();
229            }
230            _ => break,
231        }
232    }
233    if comments.is_empty() {
234        return None;
235    }
236    comments.reverse();
237    Some(comments.join("\n"))
238}
239
240/// Extract a file-level doc comment from the first node in a JS/TS parse tree.
241///
242/// Scans root children in order; returns the cleaned text of the first
243/// `comment` node encountered, skipping only `hash_bang_line` nodes.
244/// Stops immediately on any non-comment, non-shebang node.
245pub(super) fn extract_js_ts_file_doc(root: &Node, source: &[u8]) -> Option<String> {
246    for i in 0..(root.child_count()) {
247        let Some(child) = root.child(i as u32) else {
248            break;
249        };
250        if child.kind() == "comment" {
251            let raw = node_text(&child, source);
252            let cleaned = clean_js_comment(raw);
253            return if cleaned.is_empty() {
254                None
255            } else {
256                Some(cleaned)
257            };
258        }
259        // Skip shebangs; stop on anything else.
260        if child.kind() != "hash_bang_line" {
261            break;
262        }
263    }
264    None
265}
266
267/// Collect a leading JSDoc or block comment immediately preceding a TS/JS node.
268///
269/// Uses `prev_named_sibling()` to find the nearest preceding `comment` node.
270/// Returns the cleaned text (strips `/** */` and `//` markers) or `None`.
271pub(super) fn collect_js_doc_comment(node: &Node, source: &[u8]) -> Option<String> {
272    let prev = node.prev_named_sibling()?;
273    if prev.kind() != "comment" {
274        return None;
275    }
276    let raw = node_text(&prev, source);
277    let cleaned = clean_js_comment(raw);
278    if cleaned.is_empty() {
279        None
280    } else {
281        Some(cleaned)
282    }
283}
284
285/// Strip JSDoc (`/** ... */`), block-comment (`/* ... */`), or line-comment
286/// (`//`) markers from a raw comment string, returning trimmed human-readable
287/// text.
288///
289/// Both `/** */` and `/* */` are handled with shared logic — only the prefix
290/// length differs.
291pub(super) fn clean_js_comment(raw: &str) -> String {
292    let s = raw.trim();
293
294    // Block comment: /** ... */ or /* ... */
295    if s.starts_with("/*") && s.ends_with("*/") {
296        // Skip either 3 bytes (/**) or 2 bytes (/*).
297        let prefix_len = if s.starts_with("/**") { 3 } else { 2 };
298        let inner = &s[prefix_len..s.len() - 2];
299        return inner
300            .lines()
301            .map(|l| l.trim().trim_start_matches('*').trim())
302            .filter(|l| !l.is_empty())
303            .collect::<Vec<_>>()
304            .join(" ");
305    }
306
307    // Line comment: // ...
308    if let Some(rest) = s.strip_prefix("//") {
309        return rest.trim().to_owned();
310    }
311
312    s.to_owned()
313}
314
315/// Extract a Python docstring from the first statement of a `block` node.
316///
317/// Returns the stripped content of the first triple-quoted or single-quoted
318/// string literal in the block, or `None` if no docstring is present.
319pub(super) fn extract_python_docstring(block: &Node, source: &[u8]) -> Option<String> {
320    // The first named child of a `block` that is an `expression_statement`
321    // containing a bare `string` literal is the docstring.
322    let first = block.named_child(0)?;
323    if first.kind() != "expression_statement" {
324        return None;
325    }
326    // The expression_statement should have exactly one named child: a string.
327    let expr = first.named_child(0)?;
328    if expr.kind() == "string" {
329        let raw = node_text(&expr, source);
330        return Some(clean_python_docstring(raw));
331    }
332    None
333}
334
/// Strip the quoting from a Python string literal and return its trimmed content.
///
/// Handles, in this order, triple-quoted (`"""` / `'''`) and single-quoted
/// (`"` / `'`) literals. A string prefix made of the letters `r`, `u`, `b`,
/// `f` (either case) directly in front of the opening quote is dropped first,
/// so raw docstrings such as `r"""..."""` are cleaned as well. Escape
/// sequences are left untouched.
///
/// Input that is not a quoted literal is returned trimmed but otherwise
/// unchanged.
fn clean_python_docstring(raw: &str) -> String {
    let s = raw.trim();

    // Drop a literal prefix (r, u, b, f, ...), but only when a quote follows —
    // otherwise the letters belong to unquoted text and must be kept.
    let unprefixed =
        s.trim_start_matches(|c: char| matches!(c, 'r' | 'R' | 'u' | 'U' | 'b' | 'B' | 'f' | 'F'));
    let s = if unprefixed.starts_with('"') || unprefixed.starts_with('\'') {
        unprefixed
    } else {
        s
    };

    // Triple quotes must be tried before single quotes. Stripping the suffix
    // from the remainder ensures opener and closer never overlap.
    for delim in &["\"\"\"", "'''", "\"", "'"] {
        let inner = s
            .strip_prefix(*delim)
            .and_then(|rest| rest.strip_suffix(*delim));
        if let Some(inner) = inner {
            return inner.trim().to_owned();
        }
    }

    s.to_owned()
}
362
363/// Extract the string content from a `string` node (strips surrounding quotes).
364///
365/// Shared between the TypeScript and JavaScript parsers for ESM import paths.
366pub(super) fn extract_string_value(node: &Node, source: &[u8]) -> Option<String> {
367    let string_node = find_child_node(node, "string")?;
368    let fragment = find_child_node(&string_node, "string_fragment")?;
369    Some(node_text(&fragment, source).to_string())
370}
371
372/// Extract names from an ESM `import_clause` node.
373///
374/// Shared between the TypeScript and JavaScript parsers.
375pub(super) fn extract_import_names(clause: &Node, source: &[u8]) -> Vec<String> {
376    let mut names = Vec::new();
377
378    for i in 0..(clause.child_count()) {
379        let Some(child) = clause.child(i as u32) else {
380            continue;
381        };
382        match child.kind() {
383            "identifier" => {
384                // Default import: `import Foo from ...`
385                names.push(node_text(&child, source).to_string());
386            }
387            "named_imports" => {
388                // Named imports: `import { Foo, Bar } from ...`
389                for j in 0..(child.child_count()) {
390                    if let Some(spec) = child.child(j as u32) {
391                        if spec.kind() == "import_specifier" {
392                            if let Some(name_node) = spec.child(0) {
393                                names.push(node_text(&name_node, source).to_string());
394                            }
395                        }
396                    }
397                }
398            }
399            "namespace_import" => {
400                // Namespace import: `import * as ns from ...`
401                if let Some(alias) = find_child_text(&child, "identifier", source) {
402                    names.push(format!("* as {alias}"));
403                } else {
404                    names.push("*".to_string());
405                }
406            }
407            _ => {}
408        }
409    }
410
411    names
412}
413
414/// Extract exports and functions from `export const/let/var` (lexical) declarations.
415///
416/// Shared between the TypeScript and JavaScript parsers.
417///
418/// `line` and `end_line` are the start/end of the surrounding `export_statement`
419/// node (passed by the caller) so every emitted [`Export`] carries the full
420/// source range of the declaration — which the hunk-intersection logic in
421/// `map_diff_impact` uses to decide whether a changed hunk touches the symbol.
422#[allow(clippy::too_many_arguments)]
423pub(super) fn extract_exported_lexical(
424    node: &Node,
425    source: &[u8],
426    exports: &mut Vec<seshat_core::Export>,
427    functions: &mut Vec<seshat_core::Function>,
428    is_default: bool,
429    line: usize,
430    end_line: usize,
431) {
432    for i in 0..(node.child_count()) {
433        let Some(child) = node.child(i as u32) else {
434            continue;
435        };
436        if child.kind() == "variable_declarator" {
437            let name = find_child_text(&child, "identifier", source).unwrap_or_default();
438
439            // Check if the value is an arrow function or function expression
440            let func_node = find_arrow_or_function_expr(&child);
441            let is_func = func_node.is_some();
442
443            if is_func {
444                let is_async = child_has_async_value(&child, source);
445                let parameters = func_node
446                    .map(|n| extract_js_ts_parameters(&n, source))
447                    .unwrap_or_default();
448                functions.push(seshat_core::Function {
449                    name: name.clone(),
450                    is_public: true,
451                    is_async,
452                    line: child.start_position().row + 1,
453                    end_line: child.end_position().row + 1,
454                    parameters,
455                    // doc_comment for lexical arrow-functions is not yet extracted
456                    // (no prev_named_sibling hook available here without refactoring).
457                    doc_comment: None,
458                });
459            }
460
461            if !name.is_empty() {
462                exports.push(seshat_core::Export {
463                    name,
464                    is_default,
465                    is_type_only: false,
466                    line,
467                    end_line,
468                });
469            }
470        }
471    }
472}
473
474/// Extract a `function_declaration` node into a [`seshat_core::Function`].
475///
476/// Shared between the TypeScript and JavaScript parsers.
477pub(super) fn extract_function_declaration(node: &Node, source: &[u8]) -> seshat_core::Function {
478    let name = find_child_text(node, "identifier", source).unwrap_or_default();
479    let is_async = has_child_kind(node, "async");
480    let parameters = extract_js_ts_parameters(node, source);
481
482    seshat_core::Function {
483        name,
484        is_public: false, // will be set to true by export handling
485        is_async,
486        line: node.start_position().row + 1,
487        end_line: node.end_position().row + 1,
488        parameters,
489        // doc_comment is set by the caller (parser main loop) via collect_js_doc_comment.
490        doc_comment: None,
491    }
492}
493
494/// Check if a `variable_declarator` value child (arrow_function or
495/// function_expression) is async.
496///
497/// Shared between the TypeScript and JavaScript parsers.
498pub(super) fn child_has_async_value(declarator: &Node, source: &[u8]) -> bool {
499    for i in 0..(declarator.child_count()) {
500        if let Some(child) = declarator.child(i as u32) {
501            if child.kind() == "arrow_function" || child.kind() == "function_expression" {
502                return has_child_kind(&child, "async");
503            }
504        }
505    }
506    // Fallback: check the whole declarator text
507    node_text(declarator, source).contains("async")
508}
509
510/// Find the first `arrow_function` or `function_expression` child of a
511/// `variable_declarator` node.
512///
513/// Shared between the TypeScript and JavaScript parsers.
514pub(super) fn find_arrow_or_function_expr<'a>(declarator: &'a Node) -> Option<Node<'a>> {
515    for i in 0..(declarator.child_count()) {
516        if let Some(child) = declarator.child(i as u32) {
517            match child.kind() {
518                "arrow_function" | "function_expression" => return Some(child),
519                _ => {}
520            }
521        }
522    }
523    None
524}
525
526/// Extract parameter names from a JS/TS function node.
527///
528/// Works for `function_declaration`, `arrow_function`, `function_expression`,
529/// and `method_definition` nodes. Looks for a `formal_parameters` child and
530/// extracts identifier names from each parameter.
531///
532/// Shared between the TypeScript and JavaScript parsers.
533pub(super) fn extract_js_ts_parameters(func_node: &Node, source: &[u8]) -> Vec<String> {
534    let Some(params) = find_child_node(func_node, "formal_parameters") else {
535        return Vec::new();
536    };
537    let mut names = Vec::new();
538    for i in 0..(params.child_count()) {
539        let Some(child) = params.child(i as u32) else {
540            continue;
541        };
542        match child.kind() {
543            // Simple identifier parameter: `function f(x) {}`
544            "identifier" => {
545                let name = node_text(&child, source).to_string();
546                if !name.is_empty() {
547                    names.push(name);
548                }
549            }
550            // TS required parameter: `function f(x: number) {}`
551            // TS optional parameter: `function f(x?: number) {}`
552            "required_parameter" | "optional_parameter" => {
553                // The first identifier child is the parameter name
554                if let Some(name) = find_child_text(&child, "identifier", source) {
555                    if !name.is_empty() {
556                        names.push(name);
557                    }
558                }
559            }
560            // Default parameter: `function f(x = 5) {}`
561            "assignment_pattern" => {
562                // Left side of the assignment is the parameter name
563                if let Some(first) = child.child(0) {
564                    if first.kind() == "identifier" {
565                        let name = node_text(&first, source).to_string();
566                        if !name.is_empty() {
567                            names.push(name);
568                        }
569                    }
570                }
571            }
572            // Rest parameter: `function f(...args) {}`
573            "rest_pattern" => {
574                if let Some(name) = find_child_text(&child, "identifier", source) {
575                    if !name.is_empty() {
576                        names.push(name);
577                    }
578                }
579            }
580            _ => {}
581        }
582    }
583    names
584}
585
586/// Extract a JS/TS class body's methods as bare-named [`seshat_core::Function`]s
587/// so symbol search can find them (a class is otherwise recorded only as a
588/// `TypeDef`, leaving its methods unindexed).
589///
590/// Methods keep their bare name (`render`, not `Component.render`) so an
591/// exact-name lookup scores 1.0; `file_path` + `line` disambiguate distinct
592/// same-named methods. Covers regular / `async` / `get` / `set` / `static`
593/// methods, `#private` members, and class fields bound to an arrow/function
594/// expression (`handleClick = () => {}`). Methods with computed or
595/// string/number-literal keys are skipped (no plain identifier name).
596///
597/// Shared between the TypeScript and JavaScript parsers.
598pub(super) fn extract_class_methods(
599    class_node: &Node,
600    source: &[u8],
601    functions: &mut Vec<seshat_core::Function>,
602) {
603    let Some(body) = find_child_node(class_node, "class_body") else {
604        return;
605    };
606    // A member's name is a `property_identifier`, or a `private_property_identifier`
607    // for `#private` members; computed/literal keys have neither and are skipped.
608    let member_name = |member: &Node| {
609        find_child_text(member, "property_identifier", source)
610            .or_else(|| find_child_text(member, "private_property_identifier", source))
611            .filter(|n| !n.is_empty())
612    };
613    for i in 0..(body.child_count()) {
614        let Some(member) = body.child(i as u32) else {
615            continue;
616        };
617        match member.kind() {
618            // `foo() {}`, `async foo() {}`, `get x() {}`, `static bar() {}`, `#priv() {}`
619            "method_definition" => {
620                let Some(name) = member_name(&member) else {
621                    continue;
622                };
623                functions.push(seshat_core::Function {
624                    name,
625                    is_public: false,
626                    is_async: has_child_kind(&member, "async"),
627                    line: member.start_position().row + 1,
628                    end_line: member.end_position().row + 1,
629                    parameters: extract_js_ts_parameters(&member, source),
630                    doc_comment: None,
631                });
632            }
633            // Class field bound to an arrow/function expression, e.g.
634            // `handleClick = () => {}` (`public_field_definition` in TS,
635            // `field_definition` in JS).
636            "public_field_definition" | "field_definition" => {
637                let Some(fn_node) = find_arrow_or_function_expr(&member) else {
638                    continue;
639                };
640                let Some(name) = member_name(&member) else {
641                    continue;
642                };
643                functions.push(seshat_core::Function {
644                    name,
645                    is_public: false,
646                    is_async: has_child_kind(&fn_node, "async"),
647                    line: member.start_position().row + 1,
648                    end_line: member.end_position().row + 1,
649                    parameters: extract_js_ts_parameters(&fn_node, source),
650                    doc_comment: None,
651                });
652            }
653            _ => {}
654        }
655    }
656}
657
658/// Compute the SHA-256 hex digest of the given source content.
659pub fn content_hash(source: &str) -> String {
660    let mut hasher = Sha256::new();
661    hasher.update(source.as_bytes());
662    let hash = hasher.finalize();
663    let mut hex = String::with_capacity(hash.len() * 2);
664    for byte in hash {
665        use std::fmt::Write;
666        let _ = write!(hex, "{byte:02x}");
667    }
668    hex
669}
670
671/// Read `abs_path` from disk, [`parse_file`] it under `stored_path`, then
672/// strip `local_packages` from `dependencies_used`. Returns the parsed
673/// `ProjectFile` alongside the original source so callers can populate a
674/// `source_map` for the detection pipeline.
675///
676/// `abs_path` is the on-disk path used for I/O; `stored_path` is what
677/// the resulting `ProjectFile.path` carries (and ultimately becomes the
678/// `files_ir.file_path` key on upsert). Splitting the two lets callers
679/// store paths relative to the project root — so cross-worktree scans of
680/// the same git tree share a single `(branch_id, file_path)` IR row
681/// instead of one row per worktree-prefix variant (Bug #3).
682///
683/// Single source of truth for the read+parse+strip-local-packages pattern
684/// shared by the full scan orchestrator, the hot-tier watcher, and the
685/// incremental freshness sync. Keeping every path through one helper means
686/// detector evidence (snippets) is built consistently regardless of which
687/// trigger drove the IR upsert.
688pub fn read_and_parse_file(
689    abs_path: &Path,
690    stored_path: &Path,
691    language: Language,
692    local_packages: &[String],
693) -> std::io::Result<(ProjectFile, String)> {
694    let source = std::fs::read_to_string(abs_path)?;
695    let mut project_file = parse_file(stored_path, &source, language);
696    if !local_packages.is_empty() {
697        project_file
698            .dependencies_used
699            .retain(|dep| !local_packages.contains(&dep.package));
700    }
701    Ok((project_file, source))
702}
703
704/// Parse a source file by dispatching to the appropriate language parser.
705///
706/// This is the primary entry point for parsing. It:
707/// 1. Selects the parser for the given [`Language`].
708/// 2. Delegates to the parser's [`Parser::parse`] method.
709/// 3. Overwrites `content_hash` with a SHA-256 digest of `source`.
710/// 4. On parser error, returns an empty [`ProjectFile`] with a
711///    `tracing::warn` log (graceful degradation).
712pub fn parse_file(path: &Path, source: &str, language: Language) -> ProjectFile {
713    let parser: &dyn Parser = match language {
714        Language::Rust => &RustParser,
715        Language::TypeScript => &TypeScriptParser,
716        Language::JavaScript => &JavaScriptParser,
717        Language::Python => &PythonParser,
718    };
719
720    let hash = content_hash(source);
721
722    match parser.parse(path, source) {
723        Ok(mut pf) => {
724            pf.content_hash = hash;
725            pf
726        }
727        Err(e) => {
728            tracing::warn!(path = %path.display(), error = %e, "Parser failed; returning empty IR");
729            empty_project_file(path, language, hash)
730        }
731    }
732}
733
734/// Create an empty `ProjectFile` for graceful degradation.
735fn empty_project_file(path: &Path, language: Language, hash: String) -> ProjectFile {
736    use seshat_core::*;
737
738    let language_ir = match language {
739        Language::Rust => LanguageIR::Rust(RustIR::default()),
740        Language::TypeScript => LanguageIR::TypeScript(TypeScriptIR::default()),
741        Language::JavaScript => LanguageIR::JavaScript(JavaScriptIR::default()),
742        Language::Python => LanguageIR::Python(PythonIR::default()),
743    };
744
745    ProjectFile {
746        path: path.to_path_buf(),
747        language,
748        content_hash: hash,
749        imports: Vec::new(),
750        exports: Vec::new(),
751        functions: Vec::new(),
752        types: Vec::new(),
753        dependencies_used: Vec::new(),
754        language_ir,
755        file_doc: None,
756    }
757}
758
759// ---------------------------------------------------------------------------
760// Dependency classification helpers (shared by all language parsers)
761// ---------------------------------------------------------------------------
762
763/// Returns `true` if a Rust `use` path refers to a built-in / first-party
764/// module that should not be counted as an external dependency.
765pub(super) fn is_rust_builtin(module: &str) -> bool {
766    let first = module.split("::").next().unwrap_or(module);
767    matches!(
768        first,
769        "std" | "core" | "alloc" | "proc_macro" | "test" | "self" | "super" | "crate"
770    )
771}
772
773/// Returns `true` if a Python import path is part of the standard library
774/// or is a relative import (`.foo`, `..bar`).
775pub(super) fn is_python_stdlib_or_relative(module: &str) -> bool {
776    if module.starts_with('.') {
777        return true;
778    }
779    let root = module.split('.').next().unwrap_or(module);
780    matches!(
781        root,
782        "os" | "sys"
783            | "re"
784            | "json"
785            | "math"
786            | "io"
787            | "abc"
788            | "ast"
789            | "copy"
790            | "datetime"
791            | "enum"
792            | "functools"
793            | "itertools"
794            | "logging"
795            | "pathlib"
796            | "typing"
797            | "collections"
798            | "dataclasses"
799            | "contextlib"
800            | "subprocess"
801            | "threading"
802            | "asyncio"
803            | "time"
804            | "hashlib"
805            | "hmac"
806            | "base64"
807            | "urllib"
808            | "http"
809            | "email"
810            | "csv"
811            | "sqlite3"
812            | "unittest"
813            | "tempfile"
814            | "shutil"
815            | "glob"
816            | "inspect"
817            | "traceback"
818            | "warnings"
819            | "weakref"
820            | "gc"
821            | "struct"
822            | "socket"
823            | "ssl"
824            | "uuid"
825            | "string"
826            | "textwrap"
827            | "random"
828            | "secrets"
829            | "decimal"
830            | "fractions"
831            | "statistics"
832            | "pprint"
833            | "builtins"
834            | "__future__"
835            | "typing_extensions"
836            | "types"
837            | "operator"
838            // Additional stdlib modules that were missing:
839            | "argparse"
840            | "configparser"
841            | "xml"
842            | "zipfile"
843            | "tarfile"
844            | "pickle"
845            | "shelve"
846            | "queue"
847            | "shlex"
848            | "platform"
849            | "multiprocessing"
850            | "concurrent"
851            | "signal"
852            | "fnmatch"
853            | "difflib"
854            | "dis"
855            | "compileall"
856            | "runpy"
857            | "importlib"
858            | "pkgutil"
859            | "ctypes"
860            | "array"
861            | "bisect"
862            | "heapq"
863            | "pdb"
864            | "profile"
865            | "cProfile"
866            | "timeit"
867            | "doctest"
868            | "getopt"
869            | "getpass"
870            | "curses"
871            | "readline"
872            | "rlcompleter"
873            | "zipimport"
874            | "zlib"
875            | "gzip"
876            | "bz2"
877            | "lzma"
878    )
879}
880
881/// Returns `true` if a TypeScript / JavaScript import path refers to a
882/// local module (relative path, path alias) or Node built-in.
883pub(super) fn is_ts_js_builtin(module: &str) -> bool {
884    module.starts_with("./")
885        || module.starts_with("../")
886        || module.starts_with("@/")   // common path alias
887        || module.starts_with("~/")   // common path alias
888        || module.starts_with("node:") // explicit Node built-in protocol
889        || module.starts_with('#') // Node subpath imports
890}
891
892/// Extract the NPM package name from a TypeScript / JavaScript import specifier.
893///
894/// For scoped packages (`@angular/core/testing`) the scope + first segment is
895/// returned (`@angular/core`).  For unscoped packages (`react/hooks`) only the
896/// top-level package name is returned (`react`).
897pub(super) fn ts_package_name(module: &str) -> String {
898    if let Some(rest) = module.strip_prefix('@') {
899        // Scoped package: @scope/name[/deep]
900        let segments: Vec<&str> = rest.splitn(3, '/').collect();
901        if segments.len() >= 2 {
902            return format!("@{}/{}", segments[0], segments[1]);
903        }
904        return format!("@{}", rest);
905    }
906    module.split('/').next().unwrap_or(module).to_owned()
907}
908
909/// Build a [`DependencyUsage`] from a Rust import path if it is an external
910/// dependency (i.e. not a stdlib / crate-internal path).
911///
912/// Returns `None` for an empty `module` so downstream callers never see a
913/// `("", "")` ghost dependency — historically this could happen when the
914/// parser failed to extract a path prefix from a brace-grouped use.
915pub(super) fn rust_dep_from_import(module: &str, line: usize) -> Option<DependencyUsage> {
916    if module.is_empty() || is_rust_builtin(module) {
917        return None;
918    }
919    let package = module.split("::").next().unwrap_or(module).to_owned();
920    if package.is_empty() {
921        return None;
922    }
923    Some(DependencyUsage {
924        package,
925        import_path: module.to_owned(),
926        line,
927    })
928}
929
930/// Build a [`DependencyUsage`] from a Python import path if it is an external
931/// dependency (i.e. not stdlib or relative).
932pub(super) fn python_dep_from_import(module: &str, line: usize) -> Option<DependencyUsage> {
933    if is_python_stdlib_or_relative(module) {
934        return None;
935    }
936    let package = module.split('.').next().unwrap_or(module).to_owned();
937    Some(DependencyUsage {
938        package,
939        import_path: module.to_owned(),
940        line,
941    })
942}
943
944/// Build a [`DependencyUsage`] from a TypeScript / JavaScript import specifier
945/// if it is an external package.
946pub(super) fn ts_dep_from_import(module: &str, line: usize) -> Option<DependencyUsage> {
947    if is_ts_js_builtin(module) {
948        return None;
949    }
950    let package = ts_package_name(module);
951    Some(DependencyUsage {
952        package,
953        import_path: module.to_owned(),
954        line,
955    })
956}
957
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    // -----------------------------------------------------------------------
    // Content hashing
    // -----------------------------------------------------------------------

    /// Hashing the same input twice yields the same, non-empty digest.
    #[test]
    fn content_hash_deterministic() {
        let first = content_hash("hello world");
        let second = content_hash("hello world");
        assert_eq!(first, second);
        assert!(!first.is_empty());
    }

    /// Distinct inputs must not collide.
    #[test]
    fn content_hash_differs_for_different_input() {
        assert_ne!(content_hash("hello"), content_hash("world"));
    }

    /// The digest is rendered as hexadecimal text.
    #[test]
    fn content_hash_is_sha256_hex() {
        let digest = content_hash("hello world");
        // SHA-256 produces 64 hex characters
        assert_eq!(digest.len(), 64);
        assert!(digest.bytes().all(|b| b.is_ascii_hexdigit()));
    }

    // -----------------------------------------------------------------------
    // Language dispatch
    // -----------------------------------------------------------------------

    #[test]
    fn dispatch_selects_rust_parser() {
        let source_path = PathBuf::from("src/main.rs");
        let parsed = parse_file(&source_path, "fn main() {}", Language::Rust);
        assert_eq!(parsed.language, Language::Rust);
        assert_eq!(parsed.path, source_path);
        assert!(!parsed.content_hash.is_empty());
        assert!(matches!(
            parsed.language_ir,
            seshat_core::LanguageIR::Rust(_)
        ));
    }

    #[test]
    fn dispatch_selects_typescript_parser() {
        let source_path = PathBuf::from("src/index.ts");
        let parsed = parse_file(&source_path, "export const x = 1;", Language::TypeScript);
        assert_eq!(parsed.language, Language::TypeScript);
        assert!(matches!(
            parsed.language_ir,
            seshat_core::LanguageIR::TypeScript(_)
        ));
    }

    #[test]
    fn dispatch_selects_javascript_parser() {
        let source_path = PathBuf::from("src/index.js");
        let parsed = parse_file(&source_path, "const x = 1;", Language::JavaScript);
        assert_eq!(parsed.language, Language::JavaScript);
        assert!(matches!(
            parsed.language_ir,
            seshat_core::LanguageIR::JavaScript(_)
        ));
    }

    #[test]
    fn dispatch_selects_python_parser() {
        let source_path = PathBuf::from("src/main.py");
        let parsed = parse_file(&source_path, "def main(): pass", Language::Python);
        assert_eq!(parsed.language, Language::Python);
        assert!(matches!(
            parsed.language_ir,
            seshat_core::LanguageIR::Python(_)
        ));
    }

    // -----------------------------------------------------------------------
    // Dependency extraction helpers
    // -----------------------------------------------------------------------

    #[test]
    fn rust_builtin_filter() {
        let builtin = [
            "std",
            "std::io",
            "core::fmt",
            "alloc::vec",
            "crate::foo",
            "super::bar",
            "self::baz",
        ];
        for path in builtin {
            assert!(is_rust_builtin(path), "{} should be built-in", path);
        }

        let external = ["reqwest", "serde::Serialize", "tokio::runtime"];
        for path in external {
            assert!(!is_rust_builtin(path), "{} should be external", path);
        }
    }

    #[test]
    fn rust_dep_from_import_rejects_empty_module() {
        // Defensive guard: an empty module string must never turn into a
        // ghost dependency entry. Before the guard existed, a parser miss on
        // a grouped `use crate::{A, B};` produced `module = ""`, which this
        // helper accepted and emitted as
        // `{"package":"","import_path":"","line":N}`.
        assert!(rust_dep_from_import("", 1).is_none());
    }

    #[test]
    fn python_builtin_filter() {
        let stdlib_or_relative = ["os", "sys", "typing", ".relative", "..parent"];
        for module in stdlib_or_relative {
            assert!(
                is_python_stdlib_or_relative(module),
                "{} should be stdlib or relative",
                module
            );
        }

        let third_party = ["requests", "fastapi", "pydantic"];
        for module in third_party {
            assert!(
                !is_python_stdlib_or_relative(module),
                "{} should be third-party",
                module
            );
        }
    }

    #[test]
    fn ts_package_name_extraction() {
        let cases = [
            ("react", "react"),
            ("react/hooks", "react"),
            ("@angular/core", "@angular/core"),
            ("@angular/core/testing", "@angular/core"),
        ];
        for (specifier, expected) in cases {
            assert_eq!(ts_package_name(specifier), expected);
        }
    }

    #[test]
    fn ts_builtin_filter() {
        let local_or_builtin = [
            "./local",
            "../parent",
            "@/alias",
            "~/home",
            "node:fs",
            "#internal",
        ];
        for specifier in local_or_builtin {
            assert!(
                is_ts_js_builtin(specifier),
                "{} should be local or built-in",
                specifier
            );
        }

        let packages = ["react", "@angular/core", "axios"];
        for specifier in packages {
            assert!(
                !is_ts_js_builtin(specifier),
                "{} should be a package",
                specifier
            );
        }
    }

    /// `parse_file` itself fills in the hash; it must equal `content_hash`
    /// applied to the same source text.
    #[test]
    fn content_hash_computed_in_shared_code() {
        let source = "fn main() {}";
        let parsed = parse_file(Path::new("test.rs"), source, Language::Rust);
        assert_eq!(parsed.content_hash, content_hash(source));
    }

    #[test]
    fn all_language_variants_dispatched() {
        // Ensure every Language variant has a parser (no panics, no unreachable)
        for lang in [
            Language::Rust,
            Language::TypeScript,
            Language::JavaScript,
            Language::Python,
        ] {
            let parsed = parse_file(Path::new("test"), "source", lang);
            assert_eq!(parsed.language, lang);
            assert!(!parsed.content_hash.is_empty());
        }
    }
}