Skip to main content

codemem_engine/index/
blocklist.rs

1//! Per-language call noise blocklist.
2//!
3//! Filters builtins and stdlib methods from reference extraction to prevent
4//! structural noise from calls like `print`, `len`, `console.log`, `useState`, `println!`.
5
6use std::collections::{HashMap, HashSet};
7use std::sync::LazyLock;
8
/// Map a language identifier onto the key used by the blocklist table.
///
/// `tsx`, `jsx` and `javascript` all resolve to `typescript`; every other
/// identifier is handed back unchanged.
fn normalize_language(language: &str) -> &str {
    if matches!(language, "tsx" | "jsx" | "javascript") {
        "typescript"
    } else {
        language
    }
}
16
17/// Returns `true` if `name` is a blocked builtin/stdlib call for the given language.
18///
19/// Used by the ast-grep extraction path. Only the simple (unqualified) name
20/// should be passed — callers should split on the language's scope separator first.
21pub fn is_blocked_call(language: &str, name: &str) -> bool {
22    let lang = normalize_language(language);
23    BLOCKLISTS.get(lang).is_some_and(|set| set.contains(name))
24}
25
26/// Returns `true` if the given SCIP symbol represents a blocked builtin/stdlib call.
27///
28/// Extracts the language from the SCIP package manager prefix and the simple
29/// name from the descriptor chain.
30pub fn is_blocked_call_scip(scip_symbol: &str) -> bool {
31    let Some(lang) = scip_language(scip_symbol) else {
32        return false;
33    };
34    let Some(simple) = scip_simple_name(scip_symbol) else {
35        return false;
36    };
37    is_blocked_call(lang, simple)
38}
39
/// Extract the language from the leading tokens of a SCIP symbol.
///
/// A SCIP symbol has the shape
/// `<scheme> <manager> <package-name> <version> <descriptors>`, e.g.
/// `rust-analyzer cargo serde 1.0.0 de/Deserialize#`. The first token (with an
/// optional `scip-` prefix removed) is tried first, which also covers the
/// shortened `scip-<manager> …` / `<manager> …` forms. If it is not a known
/// manager, the second token — the package manager in the full form — is tried.
///
/// Returns `None` when neither token maps to a supported language.
fn scip_language(symbol: &str) -> Option<&str> {
    /// Map a package-manager (or scheme-suffix) token to a blocklist language key.
    fn language_for(token: &str) -> Option<&'static str> {
        match token {
            "cargo" | "crates" => Some("rust"),
            "npm" | "node" => Some("typescript"),
            "pip" | "pypi" | "python" => Some("python"),
            "go" | "gomod" => Some("go"),
            "maven" | "gradle" => Some("java"),
            _ => None,
        }
    }

    let rest = symbol.strip_prefix("scip-").unwrap_or(symbol);
    // Only the scheme and manager positions are inspected; package names and
    // versions further right must never influence the detected language.
    rest.split_whitespace().take(2).find_map(language_for)
}
54
/// Extract the simple (unqualified) name from a SCIP descriptor chain.
///
/// The descriptor chain is the text after the last space in `symbol` (or the
/// whole string when it contains no space). It is split on the SCIP descriptor
/// suffixes `/` (namespace), `#` (type) and `.` (term/method); the last
/// non-empty segment is kept and anything from the first `(` onwards — the
/// method disambiguator — is dropped.
///
/// Returns `None` when no non-empty name remains.
fn scip_simple_name(symbol: &str) -> Option<&str> {
    let descriptors = symbol.rsplit_once(' ').map_or(symbol, |(_, tail)| tail);

    // `/` must be a separator too: a top-level callable such as
    // `pkg/print().` would otherwise yield `pkg/print` and never match
    // any blocklist entry.
    let segment = descriptors
        .split(['/', '#', '.'])
        .rfind(|part| !part.is_empty())?;

    // Drop a trailing "()" or "(disambiguator)".
    let name = segment.split_once('(').map_or(segment, |(head, _)| head);

    (!name.is_empty()).then_some(name)
}
77
/// Lazily built table mapping a canonical language key to the set of simple
/// call names that are treated as builtin/stdlib noise for that language.
static BLOCKLISTS: LazyLock<HashMap<&'static str, HashSet<&'static str>>> = LazyLock::new(|| {
    const PYTHON: &[&str] = &[
        "print",
        "len",
        "range",
        "map",
        "filter",
        "sorted",
        "reversed",
        "list",
        "dict",
        "set",
        "str",
        "int",
        "float",
        "bool",
        "type",
        "super",
        "isinstance",
        "issubclass",
        "hasattr",
        "getattr",
        "setattr",
        "delattr",
        "open",
        "enumerate",
        "zip",
        "any",
        "all",
        "min",
        "max",
        "sum",
        "abs",
        "round",
        "repr",
        "id",
        "hash",
        "dir",
        "vars",
        "input",
        "format",
        "tuple",
        "frozenset",
        "bytes",
        "bytearray",
        "callable",
        "iter",
        "next",
        "property",
        "staticmethod",
        "classmethod",
        "append",
        "extend",
        "update",
        "pop",
        "get",
        "items",
        "keys",
        "values",
        "split",
        "join",
        "strip",
        "lstrip",
        "rstrip",
        "replace",
        "startswith",
        "endswith",
        "lower",
        "upper",
        "encode",
        "decode",
        "read",
        "write",
        "close",
        "flush",
        "seek",
    ];

    const TYPESCRIPT: &[&str] = &[
        "console",
        "setTimeout",
        "setInterval",
        "clearTimeout",
        "clearInterval",
        "JSON",
        "Array",
        "Object",
        "Promise",
        "Math",
        "Date",
        "Error",
        "Symbol",
        "parseInt",
        "parseFloat",
        "fetch",
        "require",
        "document",
        "window",
        "process",
        "Buffer",
        "URL",
        "URLSearchParams",
        "RegExp",
        "Map",
        "Set",
        "WeakMap",
        "WeakSet",
        "Proxy",
        "Reflect",
        "Number",
        "String",
        "Boolean",
        "log",
        "error",
        "warn",
        "info",
        "debug",
        "trace",
        "parse",
        "stringify",
        "assign",
        "freeze",
        "keys",
        "values",
        "entries",
        "isArray",
        // NOTE: "from", "of", "resolve", "reject", "all", "now" are deliberately
        // absent — as simple names they are too ambiguous (e.g. MyClass.from()
        // or Promise.resolve() cannot be told apart from user-defined methods).
        "allSettled",
        "floor",
        "ceil",
        "round",
        "random",
        "abs",
        "min",
        "max",
        "pow",
        "sqrt",
        "toISOString",
        "toJSON",
        "push",
        "pop",
        "shift",
        "unshift",
        "splice",
        "slice",
        "concat",
        "map",
        "filter",
        "reduce",
        "forEach",
        "find",
        "findIndex",
        "some",
        "every",
        "includes",
        "indexOf",
        "join",
        "sort",
        "reverse",
        "flat",
        "flatMap",
        "then",
        "catch",
        "finally",
        "toString",
        "valueOf",
        "hasOwnProperty",
        "addEventListener",
        "removeEventListener",
        "preventDefault",
        "stopPropagation",
        "querySelector",
        "querySelectorAll",
        "getElementById",
        "createElement",
        "appendChild",
        "removeChild",
        "useState",
        "useEffect",
        "useRef",
        "useCallback",
        "useMemo",
        "useContext",
        "useReducer",
        "useLayoutEffect",
        "useImperativeHandle",
        "useDebugValue",
        "useId",
        "useTransition",
        "useDeferredValue",
        "useSyncExternalStore",
        "useInsertionEffect",
    ];

    const RUST: &[&str] = &[
        "println!",
        "eprintln!",
        "print!",
        "eprint!",
        "format!",
        "write!",
        "writeln!",
        "vec!",
        "todo!",
        "unimplemented!",
        "unreachable!",
        "panic!",
        "assert!",
        "assert_eq!",
        "assert_ne!",
        "debug_assert!",
        "debug_assert_eq!",
        "debug_assert_ne!",
        "cfg!",
        "env!",
        "include!",
        "include_str!",
        "include_bytes!",
        "concat!",
        "stringify!",
        "file!",
        "line!",
        "column!",
        "module_path!",
        "dbg!",
        "matches!",
        "compile_error!",
        "trace!",
        "debug!",
        "info!",
        "warn!",
        "error!",
        "clone",
        "to_string",
        "to_owned",
        "into",
        "from",
        "default",
        "fmt",
        "eq",
        "ne",
        "cmp",
        "partial_cmp",
        "hash",
        "map",
        "filter",
        "and_then",
        "or_else",
        "unwrap",
        "unwrap_or",
        "unwrap_or_else",
        "unwrap_or_default",
        "expect",
        "ok",
        "err",
        "is_some",
        "is_none",
        "is_ok",
        "is_err",
        "collect",
        "iter",
        "into_iter",
        "next",
        "push",
        "pop",
        "insert",
        "remove",
        "contains",
        "get",
        "len",
        "is_empty",
    ];

    const GO: &[&str] = &[
        "make", "len", "cap", "append", "copy", "delete", "close", "new", "panic", "recover",
        "print", "println", "complex", "real", "imag", "Error", "String", "Len", "Less",
        "Swap", "Read", "Write", "Close", "Seek", "Lock", "Unlock", "RLock", "RUnlock",
        "Println", "Printf", "Sprintf", "Fprintf", "Errorf",
    ];

    const JAVA: &[&str] = &[
        // NOTE: qualified forms such as "System.out.println" are not listed —
        // after splitting on "." (Java's scope separator) only the simple name
        // "println" remains, and that is already present below.
        "toString",
        "equals",
        "hashCode",
        "compareTo",
        "clone",
        "println",
        "print",
        "printf",
        "get",
        "set",
        "add",
        "remove",
        "contains",
        "size",
        "isEmpty",
        "put",
        "containsKey",
        "containsValue",
        "keySet",
        "values",
        "entrySet",
        "length",
        "charAt",
        "substring",
        "indexOf",
        "trim",
        "split",
        "valueOf",
        "parseInt",
        "parseDouble",
        "parseLong",
        "close",
        "read",
        "write",
        "flush",
    ];

    // Pair each canonical language key with its list and turn every list into a set.
    [
        ("python", PYTHON),
        ("typescript", TYPESCRIPT),
        ("rust", RUST),
        ("go", GO),
        ("java", JAVA),
    ]
    .into_iter()
    .map(|(lang, names)| (lang, names.iter().copied().collect::<HashSet<_>>()))
    .collect()
});
420
421#[cfg(test)]
422#[path = "tests/blocklist_tests.rs"]
423mod tests;