tokensave 3.3.2

Code intelligence tool that builds a semantic knowledge graph from Rust, Go, Java, Scala, TypeScript, Python, C, C++, Kotlin, C#, Swift, and many more codebases
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
// Rust guideline compliant 2025-10-17
use std::collections::HashSet;
use std::fs;
use std::path::Path;

use crate::db::Database;
use crate::errors::Result;
use crate::graph::GraphTraverser;
use crate::types::*;

/// Builds AI-ready context by combining search, graph traversal, and source code extraction.
pub struct ContextBuilder<'a> {
    db: &'a Database,
    project_root: &'a Path,
}

impl<'a> ContextBuilder<'a> {
    /// Creates a new `ContextBuilder` backed by the given database and project root.
    pub fn new(db: &'a Database, project_root: &'a Path) -> Self {
        Self { db, project_root }
    }

    /// Builds a complete task context for the given query.
    ///
    /// Pipeline:
    /// 1. Extract symbol names from the query
    /// 2. Search for matching nodes via FTS and exact name lookup
    /// 3. Expand graph around entry points using BFS traversal
    /// 4. Extract code blocks by reading source files
    /// 5. Build and return `TaskContext`
    pub async fn build_context(&self, query: &str, options: &BuildContextOptions) -> Result<TaskContext> {
        debug_assert!(!query.is_empty(), "build_context called with empty query");
        debug_assert!(options.max_nodes > 0, "max_nodes must be positive");
        // Step 1-3: find relevant subgraph and entry points
        let symbols = extract_symbols_from_query(query);
        let entry_points = self.find_entry_points(query, &symbols, options).await?;
        let subgraph = self.expand_subgraph(&entry_points, options).await?;

        // Step 4: extract code blocks from source files
        let code_blocks = if options.include_code {
            self.extract_code_blocks(&entry_points, options).await?
        } else {
            Vec::new()
        };

        // Collect unique related files
        let related_files = self.collect_related_files(&subgraph);

        // Build summary
        let summary = self.build_summary(query, &entry_points, &subgraph);

        Ok(TaskContext {
            query: query.to_string(),
            summary,
            subgraph,
            entry_points,
            code_blocks,
            related_files,
        })
    }

    /// Finds the relevant subgraph for a query without extracting code blocks.
    ///
    /// Extracts symbols from the query, searches for matching nodes, and expands
    /// via BFS traversal to the configured depth.
    pub async fn find_relevant_context(
        &self,
        query: &str,
        options: &BuildContextOptions,
    ) -> Result<Subgraph> {
        let symbols = extract_symbols_from_query(query);
        let entry_points = self.find_entry_points(query, &symbols, options).await?;
        self.expand_subgraph(&entry_points, options).await
    }

    /// Reads the source file and extracts the code for a node.
    ///
    /// Returns `None` if the file cannot be read or the line range is invalid.
    pub async fn get_code(&self, node: &Node) -> Result<Option<String>> {
        debug_assert!(!node.file_path.is_empty(), "get_code called with empty file_path");
        debug_assert!(!node.id.is_empty(), "get_code called with empty node id");
        let file_path = self.project_root.join(&node.file_path);
        // Prevent path traversal: ensure the resolved path stays within the project root.
        if let (Ok(canonical), Ok(root)) = (file_path.canonicalize(), self.project_root.canonicalize()) {
            if !canonical.starts_with(&root) {
                return Ok(None);
            }
        }
        let content = match fs::read_to_string(&file_path) {
            Ok(c) => c,
            Err(_) => return Ok(None),
        };

        let lines: Vec<&str> = content.lines().collect();
        if node.start_line == 0 || node.end_line == 0 {
            return Ok(None);
        }

        let start = (node.start_line as usize).saturating_sub(1);
        let end = node.end_line as usize;

        if start >= lines.len() {
            return Ok(None);
        }

        let end = end.min(lines.len());
        let snippet: String = lines[start..end].join("\n");
        if snippet.is_empty() {
            Ok(None)
        } else {
            Ok(Some(snippet))
        }
    }

    // -----------------------------------------------------------------------
    // Private helpers
    // -----------------------------------------------------------------------

    /// Searches for entry-point nodes matching the query and extracted symbols.
    ///
    /// The search results from the database are already ranked by relevance and
    /// limited. We apply `min_score` only when it is positive, allowing the
    /// caller to disable filtering with `min_score = 0.0`.
    async fn find_entry_points(
        &self,
        query: &str,
        symbols: &[String],
        options: &BuildContextOptions,
    ) -> Result<Vec<Node>> {
        debug_assert!(!query.is_empty(), "find_entry_points called with empty query");
        debug_assert!(options.search_limit > 0, "search_limit must be positive");
        let mut seen_ids: HashSet<String> = HashSet::new();
        let mut entry_points: Vec<Node> = Vec::new();

        // Search using the full query
        let search_results = self.db.search_nodes(query, options.search_limit).await?;
        for sr in &search_results {
            if self.score_passes(sr.score, options.min_score) && seen_ids.insert(sr.node.id.clone())
            {
                entry_points.push(sr.node.clone());
            }
        }

        // Search for each extracted symbol individually
        for symbol in symbols {
            if entry_points.len() >= options.max_nodes {
                break;
            }
            let results = self.db.search_nodes(symbol, options.search_limit).await?;
            for sr in results {
                if self.score_passes(sr.score, options.min_score)
                    && seen_ids.insert(sr.node.id.clone())
                {
                    entry_points.push(sr.node.clone());
                }
            }
        }

        // Cap at max_nodes
        entry_points.truncate(options.max_nodes);
        debug_assert!(entry_points.len() <= options.max_nodes, "entry_points exceeds max_nodes after truncation");
        Ok(entry_points)
    }

    /// Expands the subgraph around entry points using BFS traversal.
    async fn expand_subgraph(
        &self,
        entry_points: &[Node],
        options: &BuildContextOptions,
    ) -> Result<Subgraph> {
        debug_assert!(options.traversal_depth > 0, "traversal_depth must be positive");
        debug_assert!(options.max_nodes > 0, "max_nodes must be positive for expand_subgraph");
        let traverser = GraphTraverser::new(self.db);
        let mut all_nodes: Vec<Node> = Vec::new();
        let mut all_edges: Vec<Edge> = Vec::new();
        let mut all_roots: Vec<String> = Vec::new();
        let mut seen_node_ids: HashSet<String> = HashSet::new();
        let mut seen_edge_keys: HashSet<(String, String, String)> = HashSet::new();

        let traversal_opts = TraversalOptions {
            max_depth: options.traversal_depth as u32,
            edge_kinds: None,
            node_kinds: None,
            direction: TraversalDirection::Both,
            limit: options.max_nodes as u32,
            include_start: true,
        };

        for node in entry_points {
            let sub = traverser.traverse_bfs(&node.id, &traversal_opts).await?;

            for root in sub.roots {
                if !all_roots.contains(&root) {
                    all_roots.push(root);
                }
            }

            for n in sub.nodes {
                if seen_node_ids.insert(n.id.clone()) {
                    all_nodes.push(n);
                }
            }

            for e in sub.edges {
                let key = (
                    e.source.clone(),
                    e.target.clone(),
                    e.kind.as_str().to_string(),
                );
                if seen_edge_keys.insert(key) {
                    all_edges.push(e);
                }
            }

            if all_nodes.len() >= options.max_nodes {
                break;
            }
        }

        all_nodes.truncate(options.max_nodes);

        Ok(Subgraph {
            nodes: all_nodes,
            edges: all_edges,
            roots: all_roots,
        })
    }

    /// Extracts code blocks for the entry-point nodes.
    async fn extract_code_blocks(
        &self,
        entry_points: &[Node],
        options: &BuildContextOptions,
    ) -> Result<Vec<CodeBlock>> {
        debug_assert!(options.max_code_blocks > 0, "max_code_blocks must be positive");
        debug_assert!(options.max_code_block_size > 0, "max_code_block_size must be positive");
        let mut blocks: Vec<CodeBlock> = Vec::new();

        for node in entry_points {
            if blocks.len() >= options.max_code_blocks {
                break;
            }

            if let Some(code) = self.get_code(node).await? {
                let truncated = if code.len() > options.max_code_block_size {
                    let mut end = options.max_code_block_size;
                    // Ensure we land on a valid UTF-8 boundary
                    while !code.is_char_boundary(end) && end > 0 {
                        end -= 1;
                    }
                    // Try to truncate at a line boundary
                    if let Some(pos) = code[..end].rfind('\n') {
                        end = pos;
                    }
                    format!("{}...", &code[..end])
                } else {
                    code
                };

                blocks.push(CodeBlock {
                    content: truncated,
                    file_path: node.file_path.clone(),
                    start_line: node.start_line,
                    end_line: node.end_line,
                    node_id: Some(node.id.clone()),
                });
            }
        }

        Ok(blocks)
    }

    /// Checks whether a search score passes the minimum threshold.
    ///
    /// FTS5 ranks are small negative numbers (closer to zero = better). After
    /// negation the scores are small positive values that may not clear a high
    /// threshold. We accept any result whose score is positive (i.e. the FTS
    /// engine considered it a match) unless the caller explicitly set a
    /// non-default threshold above 0.
    fn score_passes(&self, score: f64, min_score: f64) -> bool {
        score > 0.0 && score >= min_score
    }

    /// Collects unique file paths from all nodes in the subgraph.
    fn collect_related_files(&self, subgraph: &Subgraph) -> Vec<String> {
        let mut seen: HashSet<String> = HashSet::new();
        let mut files: Vec<String> = Vec::new();

        for node in &subgraph.nodes {
            if seen.insert(node.file_path.clone()) {
                files.push(node.file_path.clone());
            }
        }

        files
    }

    /// Builds a human-readable summary string.
    fn build_summary(&self, query: &str, entry_points: &[Node], subgraph: &Subgraph) -> String {
        let ep_count = entry_points.len();
        let node_count = subgraph.nodes.len();
        let edge_count = subgraph.edges.len();

        if ep_count == 0 {
            format!("No matching symbols found for \"{query}\"")
        } else {
            format!(
                "Found {ep_count} entry point(s) for \"{query}\" with {node_count} related node(s) and {edge_count} edge(s)"
            )
        }
    }
}

/// Extracts potential symbol names from natural language text.
///
/// Recognizes the following patterns:
/// - CamelCase words (e.g. `UserService`, `processRequest`)
/// - snake_case words (e.g. `process_request`, `user_service`)
/// - SCREAMING_SNAKE_CASE words (e.g. `MAX_RETRIES`)
/// - Qualified paths with `::` separators (e.g. `crate::types::Node` yields `Node`)
///
/// Common English stop words are filtered out.
pub fn extract_symbols_from_query(query: &str) -> Vec<String> {
    debug_assert!(!query.is_empty(), "extract_symbols_from_query called with empty query");
    let stop_words: HashSet<&str> = SYMBOL_STOP_WORDS.iter().copied().collect();

    let mut symbols: Vec<String> = Vec::new();
    let mut seen: HashSet<String> = HashSet::new();

    for token in query.split_whitespace() {
        let clean = token.trim_matches(|c: char| !c.is_alphanumeric() && c != '_' && c != ':');
        classify_token(clean, &stop_words, &mut symbols, &mut seen);
    }

    symbols
}

/// Stop words filtered out during symbol extraction from natural language.
const SYMBOL_STOP_WORDS: &[&str] = &[
    "the", "is", "in", "for", "to", "a", "an", "of", "and", "or", "not",
    "this", "that", "it", "with", "on", "at", "by", "from", "as", "be",
    "was", "are", "been", "being", "have", "has", "had", "do", "does", "did",
    "will", "would", "could", "should", "may", "might", "can", "shall",
    "how", "what", "where", "when", "who", "which", "why",
    "if", "then", "else", "but", "so", "up", "out", "no", "yes",
    "all", "any", "each", "every",
    "fix", "look", "update", "add", "remove", "delete", "change", "check",
    "find", "get", "set", "use", "make", "call",
    "function", "method", "class", "struct", "type", "module", "file",
    "handler", "implement", "create", "about",
];

/// Classify a single cleaned token and push any symbols it yields.
fn classify_token(
    clean: &str,
    stop_words: &HashSet<&str>,
    symbols: &mut Vec<String>,
    seen: &mut HashSet<String>,
) {
    if clean.is_empty() { return; }

    if clean.contains("::") {
        // Qualified path: extract last segment and full path
        if let Some(last) = clean.rsplit("::").next() {
            if !last.is_empty()
                && !stop_words.contains(last.to_lowercase().as_str())
                && seen.insert(last.to_string())
            {
                symbols.push(last.to_string());
            }
        }
        let full = clean.to_string();
        if seen.insert(full.clone()) {
            symbols.push(full);
        }
        return;
    }

    // snake_case or SCREAMING_SNAKE
    if clean.contains('_') {
        if !stop_words.contains(clean.to_lowercase().as_str()) && seen.insert(clean.to_string()) {
            symbols.push(clean.to_string());
        }
        return;
    }

    // CamelCase
    if is_camel_case(clean) {
        if !stop_words.contains(clean.to_lowercase().as_str()) && seen.insert(clean.to_string()) {
            symbols.push(clean.to_string());
        }
    }
}

/// Returns `true` if `word` looks like CamelCase.
///
/// The word must contain at least one uppercase letter after the first character
/// and consist only of ASCII alphanumeric characters.
fn is_camel_case(word: &str) -> bool {
    if word.len() < 2 {
        return false;
    }
    // Must be all alphanumeric
    if !word.chars().all(|c| c.is_ascii_alphanumeric()) {
        return false;
    }
    // Must have at least one uppercase letter after the first char
    word[1..].chars().any(|c| c.is_ascii_uppercase())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_snake_case() {
        let symbols = extract_symbols_from_query("fix the process_request function");
        assert!(symbols.contains(&"process_request".to_string()));
    }

    #[test]
    fn test_extract_camel_case() {
        let symbols = extract_symbols_from_query("update UserService handler");
        assert!(symbols.contains(&"UserService".to_string()));
    }

    #[test]
    fn test_extract_screaming_snake() {
        let symbols = extract_symbols_from_query("increase MAX_RETRIES limit");
        assert!(symbols.contains(&"MAX_RETRIES".to_string()));
    }

    #[test]
    fn test_extract_qualified_path() {
        let symbols = extract_symbols_from_query("look at crate::types::Node");
        assert!(symbols.iter().any(|s| s.contains("Node")));
    }

    #[test]
    fn test_filters_stop_words() {
        let symbols = extract_symbols_from_query("the is in for to a an");
        assert!(symbols.is_empty());
    }

    #[test]
    fn test_is_camel_case() {
        assert!(is_camel_case("UserService"));
        assert!(is_camel_case("processRequest"));
        assert!(!is_camel_case("user"));
        assert!(!is_camel_case("U"));
        assert!(!is_camel_case("process_request"));
    }
}