1use crate::data::lsp::types::{Language, SemanticToken};
2
// Test-only, per-thread tally of `parse` invocations. `parse` increments it at
// the very top of every call, so calls rejected by the size guard are counted too.
#[cfg(test)]
thread_local! {
    pub(crate) static PARSE_COUNT: std::cell::Cell<usize> = const { std::cell::Cell::new(0) };
}
7
/// Largest input, measured in bytes (`str::len`), that `parse` will tokenize:
/// 512 KiB. Anything longer makes `parse` return an empty token list.
const MAX_PARSE_SIZE: usize = 512 * 1024;
9
10pub fn parse(lang: Language, content: &str) -> Vec<SemanticToken> {
11 #[cfg(test)]
12 PARSE_COUNT.with(|c| c.set(c.get() + 1));
13 if content.len() > MAX_PARSE_SIZE {
14 return vec![];
15 }
16 let result = match lang {
17 Language::Rust => parse_with(&tree_sitter_rust::LANGUAGE.into(), content, rust_node_type),
18 Language::Go => parse_with(&tree_sitter_go::LANGUAGE.into(), content, go_node_type),
19 Language::TypeScript => parse_with(
20 &tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
21 content,
22 ts_node_type,
23 ),
24 Language::Python => parse_with(
25 &tree_sitter_python::LANGUAGE.into(),
26 content,
27 python_node_type,
28 ),
29 Language::Markdown => Some(parse_markdown(content)),
30 Language::Json => Some(parse_json(content)),
31 Language::Yaml => Some(parse_yaml(content)),
32 Language::Toml => parse_with(
33 &tree_sitter_toml_ng::LANGUAGE.into(),
34 content,
35 toml_node_type,
36 ),
37 Language::Dockerfile => parse_with(
38 &tree_sitter_containerfile::LANGUAGE.into(),
39 content,
40 dockerfile_node_type,
41 ),
42 Language::Xml => Some(parse_xml(content)),
43 };
44 result.unwrap_or_default()
45}
46
47fn parse_with(
48 language: &tree_sitter::Language,
49 content: &str,
50 map_fn: fn(&str) -> Option<&'static str>,
51) -> Option<Vec<SemanticToken>> {
52 let mut parser = tree_sitter::Parser::new();
53 parser.set_language(language).ok()?;
54 let tree = parser.parse(content, None)?;
55 let root = tree.root_node();
56
57 let mut tokens = Vec::new();
58 let mut cursor = root.walk();
59 walk_tree(&mut cursor, content, map_fn, &mut tokens);
60 tokens.sort_by_key(|t| (t.line, t.start_col));
61 Some(tokens)
62}
63
64fn walk_tree(
65 cursor: &mut tree_sitter::TreeCursor,
66 content: &str,
67 map_fn: fn(&str) -> Option<&'static str>,
68 tokens: &mut Vec<SemanticToken>,
69) {
70 loop {
71 let node = cursor.node();
72 let kind = node.kind();
73
74 if let Some(token_type) = map_fn(kind)
75 && (node.child_count() == 0 || is_leaf_like(kind))
76 {
77 emit_tokens_for_node(&node, content, token_type, tokens);
78 }
79
80 if !is_leaf_like(kind) && cursor.goto_first_child() {
81 walk_tree(cursor, content, map_fn, tokens);
82 cursor.goto_parent();
83 }
84
85 if !cursor.goto_next_sibling() {
86 break;
87 }
88 }
89}
90
/// Reports whether nodes of `kind` are treated as a single unit by
/// `walk_tree`: emitted whole when mapped, and never descended into.
///
/// The list covers string, comment and Markdown inline/heading kinds across
/// the supported grammars. `json_string` (Dockerfile) and `quoted_key` (TOML)
/// are included because the node-type tables map them to a class; they are
/// presumably composite nodes built from quote and content tokens (per the
/// upstream grammars, not verifiable from this file), in which case
/// `walk_tree` would otherwise never emit them.
fn is_leaf_like(kind: &str) -> bool {
    matches!(
        kind,
        "string_literal"
            | "raw_string_literal"
            | "char_literal"
            | "line_comment"
            | "block_comment"
            | "comment"
            | "interpreted_string_literal"
            | "rune_literal"
            | "string"
            | "template_string"
            | "concatenated_string"
            | "atx_heading"
            | "setext_heading"
            | "code_span"
            | "emphasis"
            | "strong_emphasis"
            | "inline_link"
            | "full_reference_link"
            | "collapsed_reference_link"
            | "shortcut_link"
            | "uri_autolink"
            | "email_autolink"
            | "image"
            | "strikethrough"
            | "double_quote_scalar"
            | "single_quote_scalar"
            | "block_scalar"
            | "double_quoted_string"
            | "single_quoted_string"
            | "json_string"
            | "quoted_key"
            | "heredoc_block"
            | "image_tag"
            | "image_digest"
    )
}
128
129fn emit_tokens_for_node(
130 node: &tree_sitter::Node,
131 content: &str,
132 token_type: &'static str,
133 tokens: &mut Vec<SemanticToken>,
134) {
135 let start_line = node.start_position().row;
136 let end_line = node.end_position().row;
137
138 if start_line == end_line {
139 let start_col = byte_to_char_col(content, start_line, node.start_position().column);
140 let end_col = byte_to_char_col(content, end_line, node.end_position().column);
141 if end_col > start_col {
142 tokens.push(SemanticToken {
143 line: start_line,
144 start_col,
145 length: end_col - start_col,
146 token_type: token_type.to_string(),
147 });
148 }
149 } else {
150 let lines: Vec<&str> = content.lines().collect();
151 for line_num in start_line..=end_line {
152 if let Some(line_text) = lines.get(line_num) {
153 let char_count = line_text.chars().count();
154 let (start_col, end_col) = if line_num == start_line {
155 let sc = byte_to_char_col(content, line_num, node.start_position().column);
156 (sc, char_count)
157 } else if line_num == end_line {
158 let ec = byte_to_char_col(content, line_num, node.end_position().column);
159 (0, ec)
160 } else {
161 (0, char_count)
162 };
163 if end_col > start_col {
164 tokens.push(SemanticToken {
165 line: line_num,
166 start_col,
167 length: end_col - start_col,
168 token_type: token_type.to_string(),
169 });
170 }
171 }
172 }
173 }
174}
175
/// Converts a byte offset within line `line_num` of `content` into a character
/// column (the number of `char`s that precede the offset).
///
/// * `byte_col` past the end of the line is clamped to the line length.
/// * `byte_col` inside a multi-byte character is rounded down to the start of
///   that character rather than panicking on a non-boundary slice.
/// * A `line_num` beyond the last line yields `0`.
///
/// Lines are those produced by `str::lines`, so line terminators are not
/// counted as part of a line.
fn byte_to_char_col(content: &str, line_num: usize, byte_col: usize) -> usize {
    let Some(line) = content.lines().nth(line_num) else {
        return 0;
    };

    let mut cut = byte_col.min(line.len());
    // Offset 0 is always a boundary, so this terminates.
    while !line.is_char_boundary(cut) {
        cut -= 1;
    }

    line[..cut].chars().count()
}
186
187fn parse_markdown(content: &str) -> Vec<SemanticToken> {
188 let mut tokens = Vec::new();
189
190 if let Some(block_tokens) = parse_with(&tree_sitter_md::LANGUAGE.into(), content, md_node_type)
192 {
193 tokens.extend(block_tokens);
194 }
195
196 if let Some(inline_tokens) = parse_with(
198 &tree_sitter_md::INLINE_LANGUAGE.into(),
199 content,
200 md_inline_node_type,
201 ) {
202 tokens.extend(inline_tokens);
203 }
204
205 tokens.sort_by_key(|t| (t.line, t.start_col));
206 tokens
207}
208
/// Highlight class for a Rust grammar node kind.
///
/// Keyword tokens map to `"keyword"`; type names, literals, comments and
/// attribute items map to their respective classes. Everything else,
/// including `identifier`, `function_item`, `macro_invocation` and `!`,
/// is left unstyled (`None`).
fn rust_node_type(kind: &str) -> Option<&'static str> {
    const KEYWORDS: &[&str] = &[
        "use", "let", "mut", "const", "static", "fn", "pub", "mod", "struct", "enum", "impl",
        "trait", "type", "where", "for", "in", "loop", "while", "if", "else", "match", "return",
        "break", "continue", "as", "ref", "self", "super", "crate", "async", "await", "move",
        "unsafe", "extern", "dyn", "true", "false",
    ];

    if KEYWORDS.iter().any(|kw| *kw == kind) {
        return Some("keyword");
    }

    match kind {
        "type_identifier" | "primitive_type" => Some("type"),
        "string_literal" | "raw_string_literal" | "char_literal" => Some("string"),
        "integer_literal" | "float_literal" => Some("number"),
        "line_comment" | "block_comment" => Some("comment"),
        "attribute_item" | "inner_attribute_item" => Some("macro"),
        _ => None,
    }
}
231
/// Highlight class for a Go grammar node kind, or `None` when the kind
/// (for example a plain `identifier`) is left unstyled.
fn go_node_type(kind: &str) -> Option<&'static str> {
    let class = match kind {
        "package" | "import" | "func" | "return" | "var" | "const" | "type" | "struct"
        | "interface" | "map" | "chan" | "go" | "defer" | "if" | "else" | "for" | "range"
        | "switch" | "case" | "default" | "select" | "break" | "continue" | "fallthrough"
        | "goto" | "nil" | "true" | "false" => "keyword",
        "interpreted_string_literal" | "raw_string_literal" | "rune_literal" => "string",
        "int_literal" | "float_literal" | "imaginary_literal" => "number",
        "type_identifier" => "type",
        "field_identifier" => "property",
        "comment" => "comment",
        _ => return None,
    };
    Some(class)
}
247
/// Highlight class for a TypeScript grammar node kind.
///
/// `regex` shares the `"number"` class with numeric literals. Kinds not
/// listed here, such as `identifier`, return `None`.
fn ts_node_type(kind: &str) -> Option<&'static str> {
    let is_keyword = matches!(
        kind,
        "import"
            | "export"
            | "from"
            | "const"
            | "let"
            | "var"
            | "function"
            | "return"
            | "if"
            | "else"
            | "for"
            | "while"
            | "do"
            | "switch"
            | "case"
            | "break"
            | "continue"
            | "class"
            | "extends"
            | "implements"
            | "new"
            | "this"
            | "super"
            | "typeof"
            | "instanceof"
            | "in"
            | "of"
            | "async"
            | "await"
            | "yield"
            | "throw"
            | "try"
            | "catch"
            | "finally"
            | "default"
            | "void"
            | "delete"
            | "true"
            | "false"
            | "null"
            | "undefined"
            | "type"
            | "interface"
            | "enum"
            | "namespace"
            | "declare"
            | "as"
            | "readonly"
            | "abstract"
            | "static"
            | "private"
            | "protected"
            | "public"
            | "keyof"
            | "infer"
            | "satisfies"
    );

    if is_keyword {
        Some("keyword")
    } else if matches!(kind, "type_identifier" | "predefined_type") {
        Some("type")
    } else if kind == "property_identifier" {
        Some("property")
    } else if matches!(kind, "string" | "template_string") {
        Some("string")
    } else if matches!(kind, "number" | "regex") {
        Some("number")
    } else if kind == "comment" {
        Some("comment")
    } else {
        None
    }
}
268
/// Highlight class for a Python grammar node kind, looked up in a small
/// class-to-kinds table. Kinds absent from the table (for example
/// `identifier`) return `None`.
fn python_node_type(kind: &str) -> Option<&'static str> {
    const CLASSES: &[(&str, &[&str])] = &[
        (
            "keyword",
            &[
                "import", "from", "def", "class", "return", "if", "elif", "else", "for", "while",
                "break", "continue", "pass", "raise", "try", "except", "finally", "with", "as",
                "lambda", "yield", "global", "nonlocal", "assert", "del", "and", "or", "not",
                "is", "in", "True", "False", "None", "async", "await",
            ],
        ),
        ("type", &["type"]),
        ("string", &["string", "concatenated_string"]),
        ("number", &["integer", "float"]),
        ("comment", &["comment"]),
        ("macro", &["decorator"]),
    ];

    CLASSES
        .iter()
        .find(|entry| entry.1.iter().any(|candidate| *candidate == kind))
        .map(|entry| entry.0)
}
286
/// Highlight class for a Markdown block-grammar node kind: headings and
/// their markers, code blocks, block quotes, list markers and thematic
/// breaks. Any other kind returns `None`.
fn md_node_type(kind: &str) -> Option<&'static str> {
    let class = if matches!(
        kind,
        "atx_heading"
            | "setext_heading"
            | "atx_h1_marker"
            | "atx_h2_marker"
            | "atx_h3_marker"
            | "atx_h4_marker"
            | "atx_h5_marker"
            | "atx_h6_marker"
    ) {
        "heading"
    } else if matches!(
        kind,
        "fenced_code_block" | "indented_code_block" | "code_fence_content" | "info_string"
    ) {
        "code"
    } else if matches!(kind, "block_quote" | "block_quote_marker") {
        "quote"
    } else if matches!(
        kind,
        "list_marker_dot"
            | "list_marker_minus"
            | "list_marker_star"
            | "list_marker_plus"
            | "list_marker_parenthesis"
    ) {
        "list_marker"
    } else if kind == "thematic_break" {
        "punctuation"
    } else {
        return None;
    };

    Some(class)
}
304
/// Highlight class for a Markdown inline-grammar node kind: code spans,
/// emphasis, strong emphasis, the link/image family and strikethrough.
/// Any other kind returns `None`.
fn md_inline_node_type(kind: &str) -> Option<&'static str> {
    const LINK_KINDS: &[&str] = &[
        "inline_link",
        "full_reference_link",
        "collapsed_reference_link",
        "shortcut_link",
        "uri_autolink",
        "email_autolink",
        "image",
    ];

    if LINK_KINDS.iter().any(|link| *link == kind) {
        return Some("link");
    }

    let class = match kind {
        "code_span" => "code",
        "emphasis" => "emphasis",
        "strong_emphasis" => "strong",
        "strikethrough" => "punctuation",
        _ => return None,
    };
    Some(class)
}
321
322fn parse_json(content: &str) -> Vec<SemanticToken> {
325 let mut parser = tree_sitter::Parser::new();
326 let lang: tree_sitter::Language = tree_sitter_json::LANGUAGE.into();
327 if parser.set_language(&lang).is_err() {
328 return vec![];
329 }
330 let tree = match parser.parse(content, None) {
331 Some(t) => t,
332 None => return vec![],
333 };
334 let mut tokens = Vec::new();
335 let mut cursor = tree.root_node().walk();
336 walk_json(&mut cursor, content, &mut tokens);
337 tokens.sort_by_key(|t| (t.line, t.start_col));
338 tokens
339}
340
341fn walk_json(cursor: &mut tree_sitter::TreeCursor, content: &str, tokens: &mut Vec<SemanticToken>) {
342 loop {
343 let node = cursor.node();
344 let kind = node.kind();
345
346 match kind {
347 "pair" => {
348 if cursor.goto_first_child() {
349 let key_node = cursor.node();
351 if key_node.kind() == "string" {
352 emit_tokens_for_node(&key_node, content, "key", tokens);
353 }
354 while cursor.goto_next_sibling() {
356 if cursor.node().kind() == ":" {
357 continue;
358 }
359 walk_json(cursor, content, tokens);
360 break;
361 }
362 cursor.goto_parent();
363 }
364 }
365 "string" => {
366 emit_tokens_for_node(&node, content, "string", tokens);
367 }
368 "number" => {
369 emit_tokens_for_node(&node, content, "number", tokens);
370 }
371 "true" | "false" | "null" => {
372 emit_tokens_for_node(&node, content, "keyword", tokens);
373 }
374 "comment" => {
375 emit_tokens_for_node(&node, content, "comment", tokens);
376 }
377 _ => {
378 if cursor.goto_first_child() {
379 walk_json(cursor, content, tokens);
380 cursor.goto_parent();
381 }
382 }
383 }
384
385 if !cursor.goto_next_sibling() {
386 break;
387 }
388 }
389}
390
391fn parse_yaml(content: &str) -> Vec<SemanticToken> {
394 let mut parser = tree_sitter::Parser::new();
395 let lang: tree_sitter::Language = tree_sitter_yaml::LANGUAGE.into();
396 if parser.set_language(&lang).is_err() {
397 return vec![];
398 }
399 let tree = match parser.parse(content, None) {
400 Some(t) => t,
401 None => return vec![],
402 };
403 let mut tokens = Vec::new();
404 let mut cursor = tree.root_node().walk();
405 walk_yaml(&mut cursor, content, &mut tokens, false);
406 tokens.sort_by_key(|t| (t.line, t.start_col));
407 tokens
408}
409
410fn walk_yaml(
411 cursor: &mut tree_sitter::TreeCursor,
412 content: &str,
413 tokens: &mut Vec<SemanticToken>,
414 is_key: bool,
415) {
416 loop {
417 let node = cursor.node();
418 let kind = node.kind();
419
420 match kind {
421 "block_mapping_pair" | "flow_pair" => {
422 if cursor.goto_first_child() {
423 walk_yaml(cursor, content, tokens, true);
424 cursor.goto_parent();
425 }
426 }
427 "plain_scalar" | "string_scalar" => {
428 if is_key {
429 emit_tokens_for_node(&node, content, "key", tokens);
430 } else {
431 let token_type = if cursor.goto_first_child() {
435 let child_kind = cursor.node().kind();
436 cursor.goto_parent();
437 match child_kind {
438 "integer_scalar" | "float_scalar" | "timestamp_scalar" => "number",
439 "boolean_scalar" | "null_scalar" => "keyword",
440 _ => "string",
441 }
442 } else {
443 "string"
444 };
445 emit_tokens_for_node(&node, content, token_type, tokens);
446 }
447 }
448 "double_quote_scalar" | "single_quote_scalar" | "block_scalar" => {
449 let token_type = if is_key { "key" } else { "string" };
450 emit_tokens_for_node(&node, content, token_type, tokens);
451 }
452 "integer_scalar" | "float_scalar" | "timestamp_scalar" => {
453 emit_tokens_for_node(&node, content, "number", tokens);
454 }
455 "boolean_scalar" | "null_scalar" => {
456 emit_tokens_for_node(&node, content, "keyword", tokens);
457 }
458 "comment" => {
459 emit_tokens_for_node(&node, content, "comment", tokens);
460 }
461 "anchor" | "alias" | "tag" => {
462 emit_tokens_for_node(&node, content, "type", tokens);
463 }
464 ":" => {
465 if cursor.goto_next_sibling() {
467 walk_yaml(cursor, content, tokens, false);
468 }
469 break;
470 }
471 _ => {
472 if cursor.goto_first_child() {
473 walk_yaml(cursor, content, tokens, is_key);
474 cursor.goto_parent();
475 }
476 }
477 }
478
479 if !cursor.goto_next_sibling() {
480 break;
481 }
482 }
483}
484
/// Highlight class for a TOML grammar node kind: keys, table headers,
/// strings, numeric and date/time values, booleans and comments. Any other
/// kind returns `None`.
fn toml_node_type(kind: &str) -> Option<&'static str> {
    const NUMERIC_KINDS: &[&str] = &[
        "integer",
        "float",
        "offset_date_time",
        "local_date_time",
        "local_date",
        "local_time",
    ];

    if NUMERIC_KINDS.iter().any(|numeric| *numeric == kind) {
        return Some("number");
    }

    let class = match kind {
        "bare_key" | "quoted_key" => "key",
        "table" | "table_array_element" => "type",
        "string" => "string",
        "boolean" => "keyword",
        "comment" => "comment",
        _ => return None,
    };
    Some(class)
}
499
/// Highlight class for a Dockerfile grammar node kind: instruction keywords,
/// image name parts, quoted/JSON strings, comments and variables. Any other
/// kind returns `None`.
fn dockerfile_node_type(kind: &str) -> Option<&'static str> {
    const INSTRUCTIONS: &[&str] = &[
        "FROM",
        "RUN",
        "CMD",
        "LABEL",
        "MAINTAINER",
        "EXPOSE",
        "ENV",
        "ADD",
        "COPY",
        "ENTRYPOINT",
        "VOLUME",
        "USER",
        "WORKDIR",
        "ARG",
        "ONBUILD",
        "STOPSIGNAL",
        "HEALTHCHECK",
        "SHELL",
        "CROSS_BUILD",
        "AS",
    ];

    if INSTRUCTIONS.iter().any(|word| *word == kind) {
        return Some("keyword");
    }

    match kind {
        "image_name" | "image_alias" => Some("type"),
        "image_tag"
        | "image_digest"
        | "double_quoted_string"
        | "single_quoted_string"
        | "json_string" => Some("string"),
        "comment" => Some("comment"),
        "variable" => Some("variable"),
        _ => None,
    }
}
515
516fn parse_xml(content: &str) -> Vec<SemanticToken> {
519 let mut parser = tree_sitter::Parser::new();
520 let lang: tree_sitter::Language = tree_sitter_xml::LANGUAGE_XML.into();
521 if parser.set_language(&lang).is_err() {
522 return vec![];
523 }
524 let tree = match parser.parse(content, None) {
525 Some(t) => t,
526 None => return vec![],
527 };
528 let mut tokens = Vec::new();
529 let mut cursor = tree.root_node().walk();
530 walk_xml(&mut cursor, content, &mut tokens);
531 tokens.sort_by_key(|t| (t.line, t.start_col));
532 tokens
533}
534
535fn walk_xml(cursor: &mut tree_sitter::TreeCursor, content: &str, tokens: &mut Vec<SemanticToken>) {
536 loop {
537 let node = cursor.node();
538 let kind = node.kind();
539
540 match kind {
541 "Comment" => {
542 emit_tokens_for_node(&node, content, "comment", tokens);
543 }
544 "CDSect" | "CData" => {
545 emit_tokens_for_node(&node, content, "string", tokens);
546 }
547 "PI" => {
548 emit_tokens_for_node(&node, content, "keyword", tokens);
549 }
550 "CharData" => {
551 emit_tokens_for_node(&node, content, "variable", tokens);
552 }
553 "Attribute" => {
554 if cursor.goto_first_child() {
556 walk_xml_attribute(cursor, content, tokens);
557 cursor.goto_parent();
558 }
559 }
560 "Name" => {
561 emit_tokens_for_node(&node, content, "type", tokens);
562 }
563 "AttValue" => {
564 emit_tokens_for_node(&node, content, "string", tokens);
565 }
566 _ => {
567 if cursor.goto_first_child() {
568 walk_xml(cursor, content, tokens);
569 cursor.goto_parent();
570 }
571 }
572 }
573
574 if !cursor.goto_next_sibling() {
575 break;
576 }
577 }
578}
579
580fn walk_xml_attribute(
581 cursor: &mut tree_sitter::TreeCursor,
582 content: &str,
583 tokens: &mut Vec<SemanticToken>,
584) {
585 loop {
586 let node = cursor.node();
587 let kind = node.kind();
588
589 match kind {
590 "Name" => {
591 emit_tokens_for_node(&node, content, "key", tokens);
592 }
593 "AttValue" => {
594 emit_tokens_for_node(&node, content, "string", tokens);
595 }
596 _ => {}
597 }
598
599 if !cursor.goto_next_sibling() {
600 break;
601 }
602 }
603}
604
#[cfg(test)]
mod tests {
    use super::*;
    use crate::data::lsp::types::Language;

    /// Number of tokens whose class equals `ty`.
    fn count_type(tokens: &[SemanticToken], ty: &str) -> usize {
        let mut hits = 0;
        for token in tokens {
            if token.token_type == ty {
                hits += 1;
            }
        }
        hits
    }

    /// Whether at least one token has class `ty`.
    fn has_type(tokens: &[SemanticToken], ty: &str) -> bool {
        count_type(tokens, ty) > 0
    }

    /// Asserts that every listed class occurs, failing with the paired message.
    fn assert_has_types(tokens: &[SemanticToken], expectations: &[(&str, &str)]) {
        for (ty, why) in expectations {
            assert!(has_type(tokens, ty), "{}", why);
        }
    }

    #[test]
    fn json_node_type_mappings() {
        let tokens = parse(
            Language::Json,
            r#"{"key": "value", "count": 42, "active": true, "nothing": null}"#,
        );

        assert_has_types(
            &tokens,
            &[
                ("key", "expected 'key' tokens for object keys"),
                ("string", "expected 'string' tokens for string values"),
                ("number", "expected 'number' token for 42"),
                ("keyword", "expected 'keyword' tokens for true and null"),
            ],
        );

        assert_eq!(count_type(&tokens, "key"), 4, "four object keys");
        assert_eq!(count_type(&tokens, "string"), 1, "one string value");
    }

    #[test]
    fn yaml_node_type_mappings() {
        let tokens = parse(
            Language::Yaml,
            "name: hello\ncount: 42\nactive: true\n# a comment\n",
        );

        assert_has_types(
            &tokens,
            &[
                ("key", "YAML mapping keys should be 'key'"),
                ("string", "YAML plain scalars as values should be 'string'"),
                ("number", "YAML integer should be 'number'"),
                ("keyword", "YAML boolean should be 'keyword'"),
                ("comment", "YAML comment should be 'comment'"),
            ],
        );
    }

    #[test]
    fn toml_node_type_mappings() {
        let tokens = parse(
            Language::Toml,
            "[section]\nkey = \"value\"\ncount = 42\nactive = true\n# comment\n",
        );

        assert_has_types(
            &tokens,
            &[
                ("key", "TOML bare keys should be 'key'"),
                ("string", "TOML strings should be 'string'"),
                ("number", "TOML integers should be 'number'"),
                ("keyword", "TOML booleans should be 'keyword'"),
                ("comment", "TOML comments should be 'comment'"),
            ],
        );
    }

    #[test]
    fn dockerfile_node_type_mappings() {
        let tokens = parse(
            Language::Dockerfile,
            "FROM ubuntu:22.04\nRUN apt-get update\n# comment\n",
        );

        assert_has_types(
            &tokens,
            &[
                ("keyword", "FROM and RUN instructions should be 'keyword'"),
                ("type", "image name 'ubuntu' should be 'type'"),
                ("string", "image tag '22.04' should be 'string'"),
                ("comment", "# comment should be 'comment'"),
            ],
        );
    }

    #[test]
    fn xml_node_type_mappings() {
        let tokens = parse(Language::Xml, r#"<root id="1">text</root>"#);

        assert_has_types(
            &tokens,
            &[
                ("type", "tag names should be 'type'"),
                ("key", "attribute name 'id' should be 'key'"),
                ("string", "attribute value should be 'string'"),
                ("variable", "text content should be 'variable'"),
            ],
        );
    }

    #[test]
    fn yaml_multiline_block_scalar() {
        let tokens = parse(
            Language::Yaml,
            "text: |\n line one\n line two\n line three\n",
        );

        let mut string_tokens = Vec::new();
        for token in &tokens {
            if token.token_type == "string" {
                string_tokens.push(token);
            }
        }

        assert!(
            string_tokens.len() >= 3,
            "expected at least 3 string tokens for 3-line block scalar, got {}",
            string_tokens.len()
        );

        // Each body line of the block scalar must carry its own token.
        for expected_line in [1usize, 2, 3] {
            assert!(
                string_tokens.iter().any(|t| t.line == expected_line),
                "line {} should have a string token",
                expected_line
            );
        }

        for tok in &string_tokens {
            assert_eq!(tok.token_type, "string");
        }
    }

    #[test]
    fn large_file_guard_returns_empty() {
        let oversized = "x".repeat(512 * 1024 + 1);
        let tokens = parse(Language::Json, &oversized);
        assert!(
            tokens.is_empty(),
            "content over 512 KB should return empty tokens"
        );
    }
}