Skip to main content

ane/commands/syntax_engine/
tree_sitter_parse.rs

1use crate::data::lsp::types::{Language, SemanticToken};
2
#[cfg(test)]
thread_local! {
    /// Test-only, per-thread counter that `parse` increments each time it is
    /// entered (before the size guard is evaluated).
    pub(crate) static PARSE_COUNT: std::cell::Cell<usize> = const { std::cell::Cell::new(0) };
}

/// Largest input, in bytes, that `parse` will hand to a grammar (512 KiB).
/// Longer content yields an empty token list.
const MAX_PARSE_SIZE: usize = 512 * 1024;
9
10pub fn parse(lang: Language, content: &str) -> Vec<SemanticToken> {
11    #[cfg(test)]
12    PARSE_COUNT.with(|c| c.set(c.get() + 1));
13    if content.len() > MAX_PARSE_SIZE {
14        return vec![];
15    }
16    let result = match lang {
17        Language::Rust => parse_with(&tree_sitter_rust::LANGUAGE.into(), content, rust_node_type),
18        Language::Go => parse_with(&tree_sitter_go::LANGUAGE.into(), content, go_node_type),
19        Language::TypeScript => parse_with(
20            &tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
21            content,
22            ts_node_type,
23        ),
24        Language::Python => parse_with(
25            &tree_sitter_python::LANGUAGE.into(),
26            content,
27            python_node_type,
28        ),
29        Language::Markdown => Some(parse_markdown(content)),
30        Language::Json => Some(parse_json(content)),
31        Language::Yaml => Some(parse_yaml(content)),
32        Language::Toml => parse_with(
33            &tree_sitter_toml_ng::LANGUAGE.into(),
34            content,
35            toml_node_type,
36        ),
37        Language::Dockerfile => parse_with(
38            &tree_sitter_containerfile::LANGUAGE.into(),
39            content,
40            dockerfile_node_type,
41        ),
42        Language::Xml => Some(parse_xml(content)),
43    };
44    result.unwrap_or_default()
45}
46
47fn parse_with(
48    language: &tree_sitter::Language,
49    content: &str,
50    map_fn: fn(&str) -> Option<&'static str>,
51) -> Option<Vec<SemanticToken>> {
52    let mut parser = tree_sitter::Parser::new();
53    parser.set_language(language).ok()?;
54    let tree = parser.parse(content, None)?;
55    let root = tree.root_node();
56
57    let mut tokens = Vec::new();
58    let mut cursor = root.walk();
59    walk_tree(&mut cursor, content, map_fn, &mut tokens);
60    tokens.sort_by_key(|t| (t.line, t.start_col));
61    Some(tokens)
62}
63
64fn walk_tree(
65    cursor: &mut tree_sitter::TreeCursor,
66    content: &str,
67    map_fn: fn(&str) -> Option<&'static str>,
68    tokens: &mut Vec<SemanticToken>,
69) {
70    loop {
71        let node = cursor.node();
72        let kind = node.kind();
73
74        if let Some(token_type) = map_fn(kind)
75            && (node.child_count() == 0 || is_leaf_like(kind))
76        {
77            emit_tokens_for_node(&node, content, token_type, tokens);
78        }
79
80        if !is_leaf_like(kind) && cursor.goto_first_child() {
81            walk_tree(cursor, content, map_fn, tokens);
82            cursor.goto_parent();
83        }
84
85        if !cursor.goto_next_sibling() {
86            break;
87        }
88    }
89}
90
/// Reports whether a node of this `kind` is tokenized as a single unit: such
/// nodes get a token even when they have children, and the tree walk does not
/// descend into them.
fn is_leaf_like(kind: &str) -> bool {
    const LEAF_LIKE_KINDS: &[&str] = &[
        // string, character and comment literals
        "string_literal",
        "raw_string_literal",
        "char_literal",
        "line_comment",
        "block_comment",
        "comment",
        "interpreted_string_literal",
        "rune_literal",
        "string",
        "template_string",
        "concatenated_string",
        // Markdown headings and inline constructs
        "atx_heading",
        "setext_heading",
        "code_span",
        "emphasis",
        "strong_emphasis",
        "inline_link",
        "full_reference_link",
        "collapsed_reference_link",
        "shortcut_link",
        "uri_autolink",
        "email_autolink",
        "image",
        "strikethrough",
        // quoted / block scalars
        "double_quote_scalar",
        "single_quote_scalar",
        "block_scalar",
        // remaining quoted strings, heredocs and image references
        "double_quoted_string",
        "single_quoted_string",
        "heredoc_block",
        "image_tag",
        "image_digest",
    ];

    LEAF_LIKE_KINDS.contains(&kind)
}
128
129fn emit_tokens_for_node(
130    node: &tree_sitter::Node,
131    content: &str,
132    token_type: &'static str,
133    tokens: &mut Vec<SemanticToken>,
134) {
135    let start_line = node.start_position().row;
136    let end_line = node.end_position().row;
137
138    if start_line == end_line {
139        let start_col = byte_to_char_col(content, start_line, node.start_position().column);
140        let end_col = byte_to_char_col(content, end_line, node.end_position().column);
141        if end_col > start_col {
142            tokens.push(SemanticToken {
143                line: start_line,
144                start_col,
145                length: end_col - start_col,
146                token_type: token_type.to_string(),
147            });
148        }
149    } else {
150        let lines: Vec<&str> = content.lines().collect();
151        for line_num in start_line..=end_line {
152            if let Some(line_text) = lines.get(line_num) {
153                let char_count = line_text.chars().count();
154                let (start_col, end_col) = if line_num == start_line {
155                    let sc = byte_to_char_col(content, line_num, node.start_position().column);
156                    (sc, char_count)
157                } else if line_num == end_line {
158                    let ec = byte_to_char_col(content, line_num, node.end_position().column);
159                    (0, ec)
160                } else {
161                    (0, char_count)
162                };
163                if end_col > start_col {
164                    tokens.push(SemanticToken {
165                        line: line_num,
166                        start_col,
167                        length: end_col - start_col,
168                        token_type: token_type.to_string(),
169                    });
170                }
171            }
172        }
173    }
174}
175
/// Converts a byte offset within line `line_num` of `content` into a
/// character (Unicode scalar value) column.
///
/// * `line_num` - zero-based index into `content.lines()`.
/// * `byte_col` - byte offset from the start of that line; values past the end
///   of the line are clamped to the line length.
///
/// Returns the number of whole characters that end at or before `byte_col`,
/// or 0 when the line does not exist. An offset that falls inside a
/// multi-byte character is rounded down to the start of that character rather
/// than panicking on a non-boundary slice.
fn byte_to_char_col(content: &str, line_num: usize, byte_col: usize) -> usize {
    content.lines().nth(line_num).map_or(0, |line| {
        let limit = byte_col.min(line.len());
        line.char_indices()
            // Count only characters that lie completely before `limit`.
            .take_while(|&(start, ch)| start + ch.len_utf8() <= limit)
            .count()
    })
}
186
187fn parse_markdown(content: &str) -> Vec<SemanticToken> {
188    let mut tokens = Vec::new();
189
190    // Phase 1: Block-level parsing (headings, code blocks, quotes, list markers)
191    if let Some(block_tokens) = parse_with(&tree_sitter_md::LANGUAGE.into(), content, md_node_type)
192    {
193        tokens.extend(block_tokens);
194    }
195
196    // Phase 2: Inline-level parsing (emphasis, strong, code spans, links)
197    if let Some(inline_tokens) = parse_with(
198        &tree_sitter_md::INLINE_LANGUAGE.into(),
199        content,
200        md_inline_node_type,
201    ) {
202        tokens.extend(inline_tokens);
203    }
204
205    tokens.sort_by_key(|t| (t.line, t.start_col));
206    tokens
207}
208
209// --- Language-specific node type mappings ---
210
/// Maps a Rust grammar node kind to a token type, or `None` when the kind is
/// not highlighted (identifiers, function items, macro invocations, `!` and
/// anything else not listed).
fn rust_node_type(kind: &str) -> Option<&'static str> {
    let is_keyword = matches!(
        kind,
        "use"
            | "let"
            | "mut"
            | "const"
            | "static"
            | "fn"
            | "pub"
            | "mod"
            | "struct"
            | "enum"
            | "impl"
            | "trait"
            | "type"
            | "where"
            | "for"
            | "in"
            | "loop"
            | "while"
            | "if"
            | "else"
            | "match"
            | "return"
            | "break"
            | "continue"
            | "as"
            | "ref"
            | "self"
            | "super"
            | "crate"
            | "async"
            | "await"
            | "move"
            | "unsafe"
            | "extern"
            | "dyn"
            | "true"
            | "false"
    );
    if is_keyword {
        return Some("keyword");
    }

    let token_type = match kind {
        "line_comment" | "block_comment" => "comment",
        "string_literal" | "raw_string_literal" | "char_literal" => "string",
        "integer_literal" | "float_literal" => "number",
        "type_identifier" | "primitive_type" => "type",
        "attribute_item" | "inner_attribute_item" => "macro",
        _ => return None,
    };
    Some(token_type)
}
231
/// Maps a Go grammar node kind to a token type, or `None` when the kind is not
/// highlighted (plain identifiers and anything else not listed).
fn go_node_type(kind: &str) -> Option<&'static str> {
    let is_keyword = matches!(
        kind,
        "package"
            | "import"
            | "func"
            | "return"
            | "var"
            | "const"
            | "type"
            | "struct"
            | "interface"
            | "map"
            | "chan"
            | "go"
            | "defer"
            | "if"
            | "else"
            | "for"
            | "range"
            | "switch"
            | "case"
            | "default"
            | "select"
            | "break"
            | "continue"
            | "fallthrough"
            | "goto"
            | "nil"
            | "true"
            | "false"
    );
    if is_keyword {
        return Some("keyword");
    }

    let token_type = match kind {
        "comment" => "comment",
        "interpreted_string_literal" | "raw_string_literal" | "rune_literal" => "string",
        "int_literal" | "float_literal" | "imaginary_literal" => "number",
        "type_identifier" => "type",
        "field_identifier" => "property",
        _ => return None,
    };
    Some(token_type)
}
247
/// Maps a TypeScript grammar node kind to a token type, or `None` when the
/// kind is not highlighted (plain identifiers and anything else not listed).
fn ts_node_type(kind: &str) -> Option<&'static str> {
    let is_keyword = matches!(
        kind,
        "import"
            | "export"
            | "from"
            | "const"
            | "let"
            | "var"
            | "function"
            | "return"
            | "if"
            | "else"
            | "for"
            | "while"
            | "do"
            | "switch"
            | "case"
            | "break"
            | "continue"
            | "class"
            | "extends"
            | "implements"
            | "new"
            | "this"
            | "super"
            | "typeof"
            | "instanceof"
            | "in"
            | "of"
            | "async"
            | "await"
            | "yield"
            | "throw"
            | "try"
            | "catch"
            | "finally"
            | "default"
            | "void"
            | "delete"
            | "true"
            | "false"
            | "null"
            | "undefined"
            | "type"
            | "interface"
            | "enum"
            | "namespace"
            | "declare"
            | "as"
            | "readonly"
            | "abstract"
            | "static"
            | "private"
            | "protected"
            | "public"
            | "keyof"
            | "infer"
            | "satisfies"
    );
    if is_keyword {
        return Some("keyword");
    }

    let token_type = match kind {
        "comment" => "comment",
        "string" | "template_string" => "string",
        // NOTE(review): regex literals share the "number" type here — confirm
        // this is intentional.
        "number" | "regex" => "number",
        "type_identifier" | "predefined_type" => "type",
        "property_identifier" => "property",
        _ => return None,
    };
    Some(token_type)
}
268
/// Maps a Python grammar node kind to a token type, or `None` when the kind is
/// not highlighted (plain identifiers and anything else not listed).
fn python_node_type(kind: &str) -> Option<&'static str> {
    let is_keyword = matches!(
        kind,
        "import"
            | "from"
            | "def"
            | "class"
            | "return"
            | "if"
            | "elif"
            | "else"
            | "for"
            | "while"
            | "break"
            | "continue"
            | "pass"
            | "raise"
            | "try"
            | "except"
            | "finally"
            | "with"
            | "as"
            | "lambda"
            | "yield"
            | "global"
            | "nonlocal"
            | "assert"
            | "del"
            | "and"
            | "or"
            | "not"
            | "is"
            | "in"
            | "True"
            | "False"
            | "None"
            | "async"
            | "await"
    );
    if is_keyword {
        return Some("keyword");
    }

    let token_type = match kind {
        "comment" => "comment",
        "string" | "concatenated_string" => "string",
        "integer" | "float" => "number",
        "type" => "type",
        "decorator" => "macro",
        _ => return None,
    };
    Some(token_type)
}
286
/// Maps a Markdown block-grammar node kind to a token type, or `None` for
/// kinds that are not highlighted.
fn md_node_type(kind: &str) -> Option<&'static str> {
    let token_type = match kind {
        "thematic_break" => "punctuation",
        "block_quote" | "block_quote_marker" => "quote",
        "list_marker_minus"
        | "list_marker_plus"
        | "list_marker_star"
        | "list_marker_dot"
        | "list_marker_parenthesis" => "list_marker",
        "fenced_code_block" | "indented_code_block" | "code_fence_content" | "info_string" => {
            "code"
        }
        "atx_heading" | "setext_heading" => "heading",
        "atx_h1_marker" | "atx_h2_marker" | "atx_h3_marker" | "atx_h4_marker"
        | "atx_h5_marker" | "atx_h6_marker" => "heading",
        _ => return None,
    };
    Some(token_type)
}
304
/// Maps a Markdown inline-grammar node kind to a token type, or `None` for
/// kinds that are not highlighted. All link-like kinds, including images,
/// share the `"link"` type.
fn md_inline_node_type(kind: &str) -> Option<&'static str> {
    let is_link = matches!(
        kind,
        "inline_link"
            | "full_reference_link"
            | "collapsed_reference_link"
            | "shortcut_link"
            | "uri_autolink"
            | "email_autolink"
            | "image"
    );
    if is_link {
        return Some("link");
    }

    let token_type = match kind {
        "emphasis" => "emphasis",
        "strong_emphasis" => "strong",
        "code_span" => "code",
        "strikethrough" => "punctuation",
        _ => return None,
    };
    Some(token_type)
}
321
322// --- JSON ---
323
324fn parse_json(content: &str) -> Vec<SemanticToken> {
325    let mut parser = tree_sitter::Parser::new();
326    let lang: tree_sitter::Language = tree_sitter_json::LANGUAGE.into();
327    if parser.set_language(&lang).is_err() {
328        return vec![];
329    }
330    let tree = match parser.parse(content, None) {
331        Some(t) => t,
332        None => return vec![],
333    };
334    let mut tokens = Vec::new();
335    let mut cursor = tree.root_node().walk();
336    walk_json(&mut cursor, content, &mut tokens);
337    tokens.sort_by_key(|t| (t.line, t.start_col));
338    tokens
339}
340
341fn walk_json(cursor: &mut tree_sitter::TreeCursor, content: &str, tokens: &mut Vec<SemanticToken>) {
342    loop {
343        let node = cursor.node();
344        let kind = node.kind();
345
346        match kind {
347            "pair" => {
348                if cursor.goto_first_child() {
349                    // First child is the key — emit it directly as "key"
350                    let key_node = cursor.node();
351                    if key_node.kind() == "string" {
352                        emit_tokens_for_node(&key_node, content, "key", tokens);
353                    }
354                    // Advance past the ":" separator and recurse into the value subtree
355                    while cursor.goto_next_sibling() {
356                        if cursor.node().kind() == ":" {
357                            continue;
358                        }
359                        walk_json(cursor, content, tokens);
360                        break;
361                    }
362                    cursor.goto_parent();
363                }
364            }
365            "string" => {
366                emit_tokens_for_node(&node, content, "string", tokens);
367            }
368            "number" => {
369                emit_tokens_for_node(&node, content, "number", tokens);
370            }
371            "true" | "false" | "null" => {
372                emit_tokens_for_node(&node, content, "keyword", tokens);
373            }
374            "comment" => {
375                emit_tokens_for_node(&node, content, "comment", tokens);
376            }
377            _ => {
378                if cursor.goto_first_child() {
379                    walk_json(cursor, content, tokens);
380                    cursor.goto_parent();
381                }
382            }
383        }
384
385        if !cursor.goto_next_sibling() {
386            break;
387        }
388    }
389}
390
391// --- YAML ---
392
393fn parse_yaml(content: &str) -> Vec<SemanticToken> {
394    let mut parser = tree_sitter::Parser::new();
395    let lang: tree_sitter::Language = tree_sitter_yaml::LANGUAGE.into();
396    if parser.set_language(&lang).is_err() {
397        return vec![];
398    }
399    let tree = match parser.parse(content, None) {
400        Some(t) => t,
401        None => return vec![],
402    };
403    let mut tokens = Vec::new();
404    let mut cursor = tree.root_node().walk();
405    walk_yaml(&mut cursor, content, &mut tokens, false);
406    tokens.sort_by_key(|t| (t.line, t.start_col));
407    tokens
408}
409
410fn walk_yaml(
411    cursor: &mut tree_sitter::TreeCursor,
412    content: &str,
413    tokens: &mut Vec<SemanticToken>,
414    is_key: bool,
415) {
416    loop {
417        let node = cursor.node();
418        let kind = node.kind();
419
420        match kind {
421            "block_mapping_pair" | "flow_pair" => {
422                if cursor.goto_first_child() {
423                    walk_yaml(cursor, content, tokens, true);
424                    cursor.goto_parent();
425                }
426            }
427            "plain_scalar" | "string_scalar" => {
428                if is_key {
429                    emit_tokens_for_node(&node, content, "key", tokens);
430                } else {
431                    // The typed child (integer_scalar, boolean_scalar, etc.) is nested
432                    // inside plain_scalar — inspect the first child's kind to pick the
433                    // correct token type rather than defaulting everything to "string".
434                    let token_type = if cursor.goto_first_child() {
435                        let child_kind = cursor.node().kind();
436                        cursor.goto_parent();
437                        match child_kind {
438                            "integer_scalar" | "float_scalar" | "timestamp_scalar" => "number",
439                            "boolean_scalar" | "null_scalar" => "keyword",
440                            _ => "string",
441                        }
442                    } else {
443                        "string"
444                    };
445                    emit_tokens_for_node(&node, content, token_type, tokens);
446                }
447            }
448            "double_quote_scalar" | "single_quote_scalar" | "block_scalar" => {
449                let token_type = if is_key { "key" } else { "string" };
450                emit_tokens_for_node(&node, content, token_type, tokens);
451            }
452            "integer_scalar" | "float_scalar" | "timestamp_scalar" => {
453                emit_tokens_for_node(&node, content, "number", tokens);
454            }
455            "boolean_scalar" | "null_scalar" => {
456                emit_tokens_for_node(&node, content, "keyword", tokens);
457            }
458            "comment" => {
459                emit_tokens_for_node(&node, content, "comment", tokens);
460            }
461            "anchor" | "alias" | "tag" => {
462                emit_tokens_for_node(&node, content, "type", tokens);
463            }
464            ":" => {
465                // After the colon, subsequent siblings are values
466                if cursor.goto_next_sibling() {
467                    walk_yaml(cursor, content, tokens, false);
468                }
469                break;
470            }
471            _ => {
472                if cursor.goto_first_child() {
473                    walk_yaml(cursor, content, tokens, is_key);
474                    cursor.goto_parent();
475                }
476            }
477        }
478
479        if !cursor.goto_next_sibling() {
480            break;
481        }
482    }
483}
484
485// --- TOML ---
486
/// Maps a TOML grammar node kind to a token type, or `None` for kinds that are
/// not highlighted. Date and time kinds share the `"number"` type with
/// integers and floats.
fn toml_node_type(kind: &str) -> Option<&'static str> {
    let is_numeric_like = matches!(
        kind,
        "integer"
            | "float"
            | "offset_date_time"
            | "local_date_time"
            | "local_date"
            | "local_time"
    );
    if is_numeric_like {
        return Some("number");
    }

    let token_type = match kind {
        "comment" => "comment",
        "string" => "string",
        "boolean" => "keyword",
        "bare_key" | "quoted_key" => "key",
        "table" | "table_array_element" => "type",
        _ => return None,
    };
    Some(token_type)
}
499
500// --- Dockerfile ---
501
/// Maps a Dockerfile/Containerfile grammar node kind to a token type, or
/// `None` for kinds that are not highlighted. Instruction names are matched in
/// upper case exactly as listed.
fn dockerfile_node_type(kind: &str) -> Option<&'static str> {
    let is_instruction = matches!(
        kind,
        "FROM"
            | "RUN"
            | "CMD"
            | "LABEL"
            | "MAINTAINER"
            | "EXPOSE"
            | "ENV"
            | "ADD"
            | "COPY"
            | "ENTRYPOINT"
            | "VOLUME"
            | "USER"
            | "WORKDIR"
            | "ARG"
            | "ONBUILD"
            | "STOPSIGNAL"
            | "HEALTHCHECK"
            | "SHELL"
            | "CROSS_BUILD"
            | "AS"
    );
    if is_instruction {
        return Some("keyword");
    }

    let token_type = match kind {
        "comment" => "comment",
        "variable" => "variable",
        "image_name" | "image_alias" => "type",
        "image_tag" | "image_digest" | "double_quoted_string" | "single_quoted_string"
        | "json_string" => "string",
        _ => return None,
    };
    Some(token_type)
}
515
516// --- XML ---
517
518fn parse_xml(content: &str) -> Vec<SemanticToken> {
519    let mut parser = tree_sitter::Parser::new();
520    let lang: tree_sitter::Language = tree_sitter_xml::LANGUAGE_XML.into();
521    if parser.set_language(&lang).is_err() {
522        return vec![];
523    }
524    let tree = match parser.parse(content, None) {
525        Some(t) => t,
526        None => return vec![],
527    };
528    let mut tokens = Vec::new();
529    let mut cursor = tree.root_node().walk();
530    walk_xml(&mut cursor, content, &mut tokens);
531    tokens.sort_by_key(|t| (t.line, t.start_col));
532    tokens
533}
534
535fn walk_xml(cursor: &mut tree_sitter::TreeCursor, content: &str, tokens: &mut Vec<SemanticToken>) {
536    loop {
537        let node = cursor.node();
538        let kind = node.kind();
539
540        match kind {
541            "Comment" => {
542                emit_tokens_for_node(&node, content, "comment", tokens);
543            }
544            "CDSect" | "CData" => {
545                emit_tokens_for_node(&node, content, "string", tokens);
546            }
547            "PI" => {
548                emit_tokens_for_node(&node, content, "keyword", tokens);
549            }
550            "CharData" => {
551                emit_tokens_for_node(&node, content, "variable", tokens);
552            }
553            "Attribute" => {
554                // First child is the Name (attribute name), then =, then AttValue
555                if cursor.goto_first_child() {
556                    walk_xml_attribute(cursor, content, tokens);
557                    cursor.goto_parent();
558                }
559            }
560            "Name" => {
561                emit_tokens_for_node(&node, content, "type", tokens);
562            }
563            "AttValue" => {
564                emit_tokens_for_node(&node, content, "string", tokens);
565            }
566            _ => {
567                if cursor.goto_first_child() {
568                    walk_xml(cursor, content, tokens);
569                    cursor.goto_parent();
570                }
571            }
572        }
573
574        if !cursor.goto_next_sibling() {
575            break;
576        }
577    }
578}
579
580fn walk_xml_attribute(
581    cursor: &mut tree_sitter::TreeCursor,
582    content: &str,
583    tokens: &mut Vec<SemanticToken>,
584) {
585    loop {
586        let node = cursor.node();
587        let kind = node.kind();
588
589        match kind {
590            "Name" => {
591                emit_tokens_for_node(&node, content, "key", tokens);
592            }
593            "AttValue" => {
594                emit_tokens_for_node(&node, content, "string", tokens);
595            }
596            _ => {}
597        }
598
599        if !cursor.goto_next_sibling() {
600            break;
601        }
602    }
603}
604
#[cfg(test)]
mod tests {
    use super::*;
    use crate::data::lsp::types::Language;

    /// Number of tokens whose type equals `ty`.
    fn count_type(tokens: &[SemanticToken], ty: &str) -> usize {
        tokens.iter().filter(|tok| tok.token_type == ty).count()
    }

    /// Whether at least one token has type `ty`.
    fn has_type(tokens: &[SemanticToken], ty: &str) -> bool {
        tokens.iter().any(|tok| tok.token_type == ty)
    }

    /// Asserts, in order, that each listed token type occurs at least once;
    /// the paired message is reported on failure.
    fn assert_types_present(tokens: &[SemanticToken], expected: &[(&str, &str)]) {
        for &(ty, message) in expected {
            assert!(has_type(tokens, ty), "{}", message);
        }
    }

    #[test]
    fn json_node_type_mappings() {
        let content = r#"{"key": "value", "count": 42, "active": true, "nothing": null}"#;
        let tokens = parse(Language::Json, content);

        assert_types_present(
            &tokens,
            &[
                ("key", "expected 'key' tokens for object keys"),
                ("string", "expected 'string' tokens for string values"),
                ("number", "expected 'number' token for 42"),
                ("keyword", "expected 'keyword' tokens for true and null"),
            ],
        );

        assert_eq!(count_type(&tokens, "key"), 4, "four object keys");
        assert_eq!(count_type(&tokens, "string"), 1, "one string value");
    }

    #[test]
    fn yaml_node_type_mappings() {
        let content = "name: hello\ncount: 42\nactive: true\n# a comment\n";
        let tokens = parse(Language::Yaml, content);

        assert_types_present(
            &tokens,
            &[
                ("key", "YAML mapping keys should be 'key'"),
                ("string", "YAML plain scalars as values should be 'string'"),
                ("number", "YAML integer should be 'number'"),
                ("keyword", "YAML boolean should be 'keyword'"),
                ("comment", "YAML comment should be 'comment'"),
            ],
        );
    }

    #[test]
    fn toml_node_type_mappings() {
        let content = "[section]\nkey = \"value\"\ncount = 42\nactive = true\n# comment\n";
        let tokens = parse(Language::Toml, content);

        assert_types_present(
            &tokens,
            &[
                ("key", "TOML bare keys should be 'key'"),
                ("string", "TOML strings should be 'string'"),
                ("number", "TOML integers should be 'number'"),
                ("keyword", "TOML booleans should be 'keyword'"),
                ("comment", "TOML comments should be 'comment'"),
            ],
        );
    }

    #[test]
    fn dockerfile_node_type_mappings() {
        let content = "FROM ubuntu:22.04\nRUN apt-get update\n# comment\n";
        let tokens = parse(Language::Dockerfile, content);

        assert_types_present(
            &tokens,
            &[
                ("keyword", "FROM and RUN instructions should be 'keyword'"),
                ("type", "image name 'ubuntu' should be 'type'"),
                ("string", "image tag '22.04' should be 'string'"),
                ("comment", "# comment should be 'comment'"),
            ],
        );
    }

    #[test]
    fn xml_node_type_mappings() {
        let content = r#"<root id="1">text</root>"#;
        let tokens = parse(Language::Xml, content);

        assert_types_present(
            &tokens,
            &[
                ("type", "tag names should be 'type'"),
                ("key", "attribute name 'id' should be 'key'"),
                ("string", "attribute value should be 'string'"),
                ("variable", "text content should be 'variable'"),
            ],
        );
    }

    #[test]
    fn yaml_multiline_block_scalar() {
        let content = "text: |\n  line one\n  line two\n  line three\n";
        let tokens = parse(Language::Yaml, content);

        let string_tokens: Vec<_> = tokens
            .iter()
            .filter(|tok| tok.token_type == "string")
            .collect();
        // The block scalar covers the three content lines, so each of lines
        // 1-3 must carry a string token.
        assert!(
            string_tokens.len() >= 3,
            "expected at least 3 string tokens for 3-line block scalar, got {}",
            string_tokens.len()
        );

        let string_lines: std::collections::HashSet<usize> =
            string_tokens.iter().map(|tok| tok.line).collect();
        for (line, message) in [
            (1usize, "line 1 should have a string token"),
            (2usize, "line 2 should have a string token"),
            (3usize, "line 3 should have a string token"),
        ] {
            assert!(string_lines.contains(&line), "{}", message);
        }

        for tok in &string_tokens {
            assert_eq!(tok.token_type, "string");
        }
    }

    #[test]
    fn large_file_guard_returns_empty() {
        let oversized = "x".repeat(512 * 1024 + 1);
        let tokens = parse(Language::Json, &oversized);
        assert!(
            tokens.is_empty(),
            "content over 512 KB should return empty tokens"
        );
    }
}