//! scute_core/code_similarity/tokenize.rs
//!
//! Normalized tokenization of source code for code-similarity comparison.

1use super::language::{LanguageConfig, NodeRole};
2use crate::parser::AstParser;
3
/// A normalized token from source code.
///
/// `start_line` and `end_line` are 1-based line numbers covering the
/// token's span in the original source.
///
/// Derives `Eq` and `Hash` (all fields are `String`/`usize`) so tokens
/// can be used directly in hash-based collections.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Token {
    // Normalized text: placeholder markers such as "$ID"/"$LIT" for
    // identifiers and literals, raw text for other leaf nodes.
    pub text: String,
    pub start_line: usize,
    pub end_line: usize,
}
11
12impl Token {
13    fn new(text: &str, node: &tree_sitter::Node) -> Self {
14        Self {
15            text: text.to_string(),
16            start_line: node.start_position().row + 1,
17            end_line: node.end_position().row + 1,
18        }
19    }
20}
21
/// Error returned when the parser cannot produce a parse tree.
#[derive(Debug)]
pub struct TokenizeError;

impl std::fmt::Display for TokenizeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("failed to produce a parse tree")
    }
}

impl std::error::Error for TokenizeError {}
32
33/// Tokenize source code into a normalized token stream.
34///
35/// The parser is borrowed mutably and its language is reconfigured on
36/// each call. Callers should reuse the same parser across calls to
37/// benefit from buffer recycling.
38///
39/// # Errors
40///
41/// Returns `TokenizeError` if the parser fails to produce a parse tree.
42pub fn tokenize(
43    parser: &mut dyn AstParser,
44    source: &str,
45    config: &LanguageConfig,
46) -> Result<Vec<Token>, TokenizeError> {
47    let tree = parser
48        .parse(source, config.language())
49        .map_err(|_| TokenizeError)?;
50
51    let mut tokens = Vec::new();
52    collect_tokens(tree.root_node(), source.as_bytes(), config, &mut tokens);
53    Ok(tokens)
54}
55
/// Decision made for a single AST node while collecting tokens.
enum TokenAction {
    /// Emit this token and stop recursing.
    Emit(Token),
    /// Skip this node entirely (comments, decorations).
    Skip,
    /// Recurse into children.
    Recurse,
}
64
65fn classify_node(node: &tree_sitter::Node, source: &[u8], config: &LanguageConfig) -> TokenAction {
66    if node.is_error() || node.is_missing() {
67        return TokenAction::Skip;
68    }
69
70    if !node.is_named() {
71        return classify_unnamed(node);
72    }
73
74    classify_by_role(node, source, config)
75}
76
77fn classify_unnamed(node: &tree_sitter::Node) -> TokenAction {
78    if node.child_count() == 0 {
79        TokenAction::Emit(Token::new(node.kind(), node))
80    } else {
81        TokenAction::Recurse
82    }
83}
84
85fn classify_by_role(
86    node: &tree_sitter::Node,
87    source: &[u8],
88    config: &LanguageConfig,
89) -> TokenAction {
90    match config.classify(node.kind()) {
91        NodeRole::Identifier => TokenAction::Emit(Token::new("$ID", node)),
92        NodeRole::Literal => TokenAction::Emit(Token::new("$LIT", node)),
93        NodeRole::Comment | NodeRole::Decoration => TokenAction::Skip,
94        NodeRole::Other if node.child_count() == 0 => {
95            let text = node.utf8_text(source).unwrap_or("");
96            TokenAction::Emit(Token::new(text, node))
97        }
98        NodeRole::Other => TokenAction::Recurse,
99    }
100}
101
102fn collect_tokens(
103    node: tree_sitter::Node,
104    source: &[u8],
105    config: &LanguageConfig,
106    tokens: &mut Vec<Token>,
107) {
108    match classify_node(&node, source, config) {
109        TokenAction::Emit(token) => tokens.push(token),
110        TokenAction::Skip => {}
111        TokenAction::Recurse => {
112            let mut cursor = node.walk();
113            for child in node.children(&mut cursor) {
114                collect_tokens(child, source, config, tokens);
115            }
116        }
117    }
118}