scute_core/code_similarity/
tokenize.rs1use super::language::{LanguageConfig, NodeRole};
2use crate::parser::AstParser;
3
/// A token produced by [`tokenize`], together with the source lines it spans.
///
/// `Eq` and `Hash` are derived alongside `PartialEq` (every field supports them) so tokens can
/// be used as keys in hashed collections and compared without extra boilerplate.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Token {
    /// Text stored for the token: a placeholder, a node kind, or a slice of the source.
    pub text: String,
    /// Line on which the token starts (parser row + 1).
    pub start_line: usize,
    /// Line on which the token ends (parser row + 1).
    pub end_line: usize,
}
11
12impl Token {
13 fn new(text: &str, node: &tree_sitter::Node) -> Self {
14 Self {
15 text: text.to_string(),
16 start_line: node.start_position().row + 1,
17 end_line: node.end_position().row + 1,
18 }
19 }
20}
21
/// Error returned by [`tokenize`] when the parser fails to produce a parse tree.
///
/// The type carries no payload. `Clone`, `PartialEq` and `Eq` are derived so callers can copy
/// the error and compare results containing it (for example with `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenizeError;

impl std::fmt::Display for TokenizeError {
    /// Writes the fixed, human-readable failure message.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The message is constant, so no format machinery is needed.
        f.write_str("failed to produce a parse tree")
    }
}

// No `source()` override: the error holds no underlying cause to report.
impl std::error::Error for TokenizeError {}
32
33pub fn tokenize(
43 parser: &mut dyn AstParser,
44 source: &str,
45 config: &LanguageConfig,
46) -> Result<Vec<Token>, TokenizeError> {
47 let tree = parser
48 .parse(source, config.language())
49 .map_err(|_| TokenizeError)?;
50
51 let mut tokens = Vec::new();
52 collect_tokens(tree.root_node(), source.as_bytes(), config, &mut tokens);
53 Ok(tokens)
54}
55
56enum TokenAction {
57 Emit(Token),
59 Skip,
61 Recurse,
63}
64
65fn classify_node(node: &tree_sitter::Node, source: &[u8], config: &LanguageConfig) -> TokenAction {
66 if node.is_error() || node.is_missing() {
67 return TokenAction::Skip;
68 }
69
70 if !node.is_named() {
71 return classify_unnamed(node);
72 }
73
74 classify_by_role(node, source, config)
75}
76
77fn classify_unnamed(node: &tree_sitter::Node) -> TokenAction {
78 if node.child_count() == 0 {
79 TokenAction::Emit(Token::new(node.kind(), node))
80 } else {
81 TokenAction::Recurse
82 }
83}
84
85fn classify_by_role(
86 node: &tree_sitter::Node,
87 source: &[u8],
88 config: &LanguageConfig,
89) -> TokenAction {
90 match config.classify(node.kind()) {
91 NodeRole::Identifier => TokenAction::Emit(Token::new("$ID", node)),
92 NodeRole::Literal => TokenAction::Emit(Token::new("$LIT", node)),
93 NodeRole::Comment | NodeRole::Decoration => TokenAction::Skip,
94 NodeRole::Other if node.child_count() == 0 => {
95 let text = node.utf8_text(source).unwrap_or("");
96 TokenAction::Emit(Token::new(text, node))
97 }
98 NodeRole::Other => TokenAction::Recurse,
99 }
100}
101
102fn collect_tokens(
103 node: tree_sitter::Node,
104 source: &[u8],
105 config: &LanguageConfig,
106 tokens: &mut Vec<Token>,
107) {
108 match classify_node(&node, source, config) {
109 TokenAction::Emit(token) => tokens.push(token),
110 TokenAction::Skip => {}
111 TokenAction::Recurse => {
112 let mut cursor = node.walk();
113 for child in node.children(&mut cursor) {
114 collect_tokens(child, source, config, tokens);
115 }
116 }
117 }
118}