Skip to main content

mcp_memory/code/
mod.rs

//! Tree-sitter code-symbol indexing.
//!
//! Pure parsing layer: turns source files into [`ParsedFile`]s (definitions and
//! references) and provides a gitignore-aware directory walk. It has no
//! knowledge of the graph store — [`crate::actions::code`] maps the parsed
//! output onto entities/relations and handles incremental hashing.
7
8pub mod lang;
9
10use std::path::{Path, PathBuf};
11use std::sync::atomic::AtomicUsize;
12
13pub use lang::{Lang, detect};
14
/// Size ceiling, in bytes, for a file to be indexed (2 MiB).
///
/// Larger files are skipped: huge generated or minified sources are slow to
/// parse and rarely add anything useful to a symbol map.
pub const MAX_FILE_BYTES: u64 = 2 << 20;
18
/// Character cap applied to a definition's signature line; longer signatures
/// are truncated so observations stay compact.
const MAX_SIGNATURE_CHARS: usize = 240;
/// Character cap applied to a doc-comment line; longer lines are truncated so
/// observations stay compact.
const MAX_DOC_CHARS: usize = 240;
22
/// One symbol definition found in a source file.
#[derive(Clone, Debug)]
pub struct Def {
    /// Normalized entity kind, e.g. `function`, `method`, `class`, `module`
    /// or `constant`.
    pub kind: String,
    /// The symbol's bare name.
    pub name: String,
    /// First line of the definition (1-based; the range is inclusive).
    pub line_start: usize,
    /// Last line of the definition (1-based; the range is inclusive).
    pub line_end: usize,
    /// The definition's first (declaration) line, trimmed.
    pub signature: String,
    /// First line of the doc comment attached to the definition, when present.
    pub doc: Option<String>,
}
38
/// One reference to a symbol (a call, a type use, …) found in a source file.
#[derive(Clone, Debug)]
pub struct Ref {
    /// Kind of reference as named by the grammar's tags query, e.g. `call` or
    /// `type`.
    pub kind: String,
    /// Bare name of the symbol being referenced.
    pub name: String,
    /// Line on which the reference occurs (1-based).
    pub line: usize,
}
49
50/// Parsed symbols for a single file.
51#[derive(Debug, Clone, Default)]
52pub struct ParsedFile {
53    pub defs: Vec<Def>,
54    pub refs: Vec<Ref>,
55}
56
57/// BLAKE3 content hash (hex) used for incremental change detection.
58pub fn hash_bytes(bytes: &[u8]) -> String {
59    blake3::hash(bytes).to_hex().to_string()
60}
61
/// Collapse a grammar-specific definition kind into the small entity
/// vocabulary used by this module.
///
/// * `function`, `macro` → `function`
/// * `class`, `interface`, `struct`, `type`, `enum`, `trait` → `class`
/// * `module`, `namespace` → `module`
///
/// Every other kind (including `method` and `constant`) is returned unchanged.
fn normalize_def_kind(raw: &str) -> &str {
    if matches!(raw, "function" | "macro") {
        return "function";
    }
    if matches!(raw, "class" | "interface" | "struct" | "type" | "enum" | "trait") {
        return "class";
    }
    if matches!(raw, "module" | "namespace") {
        return "module";
    }
    // `method`, `constant` and any kind we do not recognise map to themselves.
    raw
}
73
74/// Extract the trimmed first line of `source` starting at byte offset `start`.
75fn first_line(source: &[u8], start: usize) -> String {
76    let end = source[start..]
77        .iter()
78        .position(|&b| b == b'\n')
79        .map(|p| start + p)
80        .unwrap_or(source.len());
81    let mut s = String::from_utf8_lossy(&source[start..end]).trim().to_string();
82    if s.chars().count() > MAX_SIGNATURE_CHARS {
83        s = s.chars().take(MAX_SIGNATURE_CHARS).collect::<String>() + "…";
84    }
85    s
86}
87
88fn clamp_doc(doc: &str) -> Option<String> {
89    let line = doc.lines().find(|l| !l.trim().is_empty())?.trim();
90    if line.is_empty() {
91        return None;
92    }
93    let s = if line.chars().count() > MAX_DOC_CHARS {
94        line.chars().take(MAX_DOC_CHARS).collect::<String>() + "…"
95    } else {
96        line.to_string()
97    };
98    Some(s)
99}
100
101/// Parse one in-memory source buffer into defs/refs. Returns an empty result
102/// for unsupported languages or unbuildable tag configs.
103pub fn parse_source(lang: Lang, source: &[u8]) -> ParsedFile {
104    let Some(config) = lang::config(lang) else {
105        return ParsedFile::default();
106    };
107
108    let mut ctx = tree_sitter_tags::TagsContext::new();
109    let cancel = AtomicUsize::new(0);
110    let (tags, _failed) = match ctx.generate_tags(config, source, Some(&cancel)) {
111        Ok(v) => v,
112        Err(_) => return ParsedFile::default(),
113    };
114
115    // Byte offset of the start of each line, for O(log n) byte→line lookups.
116    // `tag.range` spans the whole definition node (body included), while
117    // `tag.span` is only the name; we derive the def's line range from `range`.
118    let line_starts: Vec<usize> = std::iter::once(0)
119        .chain(source.iter().enumerate().filter(|&(_, &b)| b == b'\n').map(|(i, _)| i + 1))
120        .collect();
121    let line_of = |byte: usize| line_starts.partition_point(|&s| s <= byte).max(1);
122
123    let mut out = ParsedFile::default();
124    for tag in tags.flatten() {
125        let name = String::from_utf8_lossy(&source[tag.name_range.clone()]).to_string();
126        if name.is_empty() {
127            continue;
128        }
129        let kind = config.syntax_type_name(tag.syntax_type_id).to_string();
130        if tag.is_definition {
131            let end_byte = tag.range.end.saturating_sub(1).max(tag.range.start);
132            out.defs.push(Def {
133                kind: normalize_def_kind(&kind).to_string(),
134                name,
135                line_start: line_of(tag.range.start),
136                line_end: line_of(end_byte),
137                signature: first_line(source, tag.range.start),
138                doc: tag.docs.as_deref().and_then(clamp_doc),
139            });
140        } else {
141            out.refs.push(Ref {
142                kind,
143                name,
144                line: tag.span.start.row + 1,
145            });
146        }
147    }
148    out
149}
150
151/// Walk `root` (a file or directory) and collect indexable source files,
152/// honoring `.gitignore`/hidden-file rules and skipping oversized files.
153pub fn walk(root: &Path, max_bytes: u64) -> Vec<PathBuf> {
154    let mut files = Vec::new();
155    if root.is_file() {
156        if detect(root).is_some()
157            && std::fs::metadata(root).map(|m| m.len() <= max_bytes).unwrap_or(false)
158        {
159            files.push(root.to_path_buf());
160        }
161        return files;
162    }
163
164    let walker = ignore::WalkBuilder::new(root)
165        .standard_filters(true)
166        .hidden(true)
167        .git_ignore(true)
168        .git_global(true)
169        .require_git(false)
170        .filter_entry(|e| {
171            // Belt-and-suspenders: skip common build/vendor dirs even when no
172            // .gitignore is present.
173            let name = e.file_name().to_string_lossy();
174            !matches!(name.as_ref(), "target" | "node_modules" | ".git" | "dist" | "build")
175        })
176        .build();
177
178    for entry in walker.flatten() {
179        let path = entry.path();
180        if !path.is_file() || detect(path).is_none() {
181            continue;
182        }
183        if std::fs::metadata(path).map(|m| m.len() > max_bytes).unwrap_or(true) {
184            continue;
185        }
186        files.push(path.to_path_buf());
187    }
188    files
189}