1use crate::utils::range_utils::LineIndex;
2use regex::Regex;
3use std::fmt;
4use std::sync::LazyLock;
5
/// Backtick fence line: optional leading whitespace, three backticks, then any text that
/// contains no backtick, carriage return or line feed. A bare three-backtick line matches
/// as well, so this pattern also accepts lines that look like closing fences.
static FENCED_CODE_BLOCK_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)```(?:[^`\r\n]*)$").unwrap());
/// Bare backtick fence line: optional leading whitespace, three backticks, then only
/// optional whitespace.
static FENCED_CODE_BLOCK_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)```\s*$").unwrap());
/// Tilde fence line: optional leading whitespace, three tildes, then any text that
/// contains no tilde, carriage return or line feed.
static ALTERNATE_FENCED_CODE_BLOCK_START: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)~~~(?:[^~\r\n]*)$").unwrap());
/// Bare tilde fence line: optional leading whitespace, three tildes, then only optional
/// whitespace.
static ALTERNATE_FENCED_CODE_BLOCK_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)~~~\s*$").unwrap());
/// Line that starts with at least four whitespace characters (captured as group 1).
static INDENTED_CODE_BLOCK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s{4,})").unwrap());
/// List item line: leading whitespace, a bullet (`*`, `+`, `-`) or digits followed by `.`
/// or `)`, optional whitespace, then the remaining text. Each part is a capture group.
static LIST_ITEM_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)([*+-]|\d+[.)])(\s*)(.*)$").unwrap());
14
15pub struct CodeBlockUtils;
17
18impl CodeBlockUtils {
19 pub fn is_in_code_block(content: &str, line_num: usize) -> bool {
21 let lines: Vec<&str> = content.lines().collect();
22 if line_num >= lines.len() {
23 return false;
24 }
25
26 let mut in_fenced_code = false;
27 let mut in_alternate_fenced = false;
28
29 for (i, line) in lines.iter().enumerate() {
30 if i > line_num {
31 break;
32 }
33
34 if FENCED_CODE_BLOCK_START.is_match(line) {
35 in_fenced_code = !in_fenced_code;
36 } else if FENCED_CODE_BLOCK_END.is_match(line) && in_fenced_code {
37 in_fenced_code = false;
38 } else if ALTERNATE_FENCED_CODE_BLOCK_START.is_match(line) {
39 in_alternate_fenced = !in_alternate_fenced;
40 } else if ALTERNATE_FENCED_CODE_BLOCK_END.is_match(line) && in_alternate_fenced {
41 in_alternate_fenced = false;
42 }
43 }
44
45 if line_num < lines.len() && Self::is_indented_code_block(lines[line_num]) {
47 return true;
48 }
49
50 in_fenced_code || in_alternate_fenced
52 }
53
54 pub fn is_code_block_delimiter(line: &str) -> bool {
56 FENCED_CODE_BLOCK_START.is_match(line)
57 || FENCED_CODE_BLOCK_END.is_match(line)
58 || ALTERNATE_FENCED_CODE_BLOCK_START.is_match(line)
59 || ALTERNATE_FENCED_CODE_BLOCK_END.is_match(line)
60 }
61
62 pub fn is_code_block_start(line: &str) -> bool {
64 FENCED_CODE_BLOCK_START.is_match(line) || ALTERNATE_FENCED_CODE_BLOCK_START.is_match(line)
65 }
66
67 pub fn is_code_block_end(line: &str) -> bool {
69 FENCED_CODE_BLOCK_END.is_match(line) || ALTERNATE_FENCED_CODE_BLOCK_END.is_match(line)
70 }
71
72 pub fn is_indented_code_block(line: &str) -> bool {
74 let expanded_line = line.replace('\t', " ");
76 INDENTED_CODE_BLOCK.is_match(&expanded_line)
77 }
78
79 pub fn get_language_specifier(line: &str) -> Option<String> {
105 if FENCED_CODE_BLOCK_START.is_match(line) || ALTERNATE_FENCED_CODE_BLOCK_START.is_match(line) {
106 let trimmed = line.trim_start();
107 let after_fence = &trimmed[3..].trim_start();
108 if !after_fence.is_empty() {
109 return Some(after_fence.to_string());
110 }
111 }
112 None
113 }
114
115 pub fn identify_code_block_lines(content: &str) -> Vec<bool> {
144 let lines: Vec<&str> = content.lines().collect();
145 let mut in_code_block = vec![false; lines.len()];
146
147 let mut in_fenced_code = false;
148 let mut in_alternate_fenced = false;
149
150 for (i, line) in lines.iter().enumerate() {
151 let trimmed = line.trim_start();
153
154 if trimmed.starts_with("```") {
155 if FENCED_CODE_BLOCK_START.is_match(line) {
156 in_fenced_code = !in_fenced_code;
157 in_code_block[i] = true; } else if in_fenced_code && FENCED_CODE_BLOCK_END.is_match(line) {
159 in_fenced_code = false;
160 in_code_block[i] = true; }
162 } else if trimmed.starts_with("~~~") {
163 if ALTERNATE_FENCED_CODE_BLOCK_START.is_match(line) {
164 in_alternate_fenced = !in_alternate_fenced;
165 in_code_block[i] = true; } else if in_alternate_fenced && ALTERNATE_FENCED_CODE_BLOCK_END.is_match(line) {
167 in_alternate_fenced = false;
168 in_code_block[i] = true; }
170 }
171
172 if in_fenced_code || in_alternate_fenced {
174 in_code_block[i] = true;
175 } else if !in_code_block[i] {
176 if (line.starts_with(" ") || INDENTED_CODE_BLOCK.is_match(line)) && !LIST_ITEM_RE.is_match(line) {
179 in_code_block[i] = true;
180 }
181 }
182 }
183
184 in_code_block
185 }
186}
187
/// Text that begins with three backticks or three tildes; no leading whitespace is allowed
/// by the pattern itself.
static FENCED_CODE_BLOCK_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(?:```|~~~)").unwrap());
/// Text that begins with at least four whitespace characters (captured as group 1).
static INDENTED_CODE_BLOCK_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s{4,})").unwrap());
191
/// Classification of a single line with respect to Markdown code blocks.
///
/// Equality is total, so the type derives `Eq` alongside `PartialEq`.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum CodeBlockState {
    /// The line is not part of any code block.
    None,
    /// The line is a fence delimiter or lies between two fence delimiters.
    Fenced,
    /// The line belongs to an indented code block.
    Indented,
}
199
200#[derive(Debug)]
202pub struct CodeBlockInfo<'a> {
203 pub block_states: Vec<CodeBlockState>,
205 pub code_spans: Vec<(usize, usize)>,
207 content: &'a str,
209 line_index: LineIndex<'a>,
211}
212
213impl<'a> CodeBlockInfo<'a> {
214 pub fn new(content: &'a str) -> Self {
216 let block_states = compute_code_blocks(content);
217 let code_spans = compute_code_spans(content);
218 let line_index = LineIndex::new(content);
219
220 CodeBlockInfo {
221 block_states,
222 code_spans,
223 content,
224 line_index,
225 }
226 }
227
228 pub fn is_in_code_block(&self, line_index: usize) -> bool {
230 if line_index < self.block_states.len() {
231 self.block_states[line_index] != CodeBlockState::None
232 } else {
233 false
234 }
235 }
236
237 pub fn is_in_code_span(&self, line_index: usize, column_index: usize) -> bool {
239 let line_start = self
241 .line_index
242 .get_line_start_byte(line_index + 1)
243 .unwrap_or(self.content.len());
244 let position = line_start + column_index;
245
246 for &(start, end) in &self.code_spans {
248 if position >= start && position <= end {
249 return true;
250 }
251 }
252
253 false
254 }
255
256 pub fn has_code_blocks(&self) -> bool {
258 self.block_states.iter().any(|state| *state != CodeBlockState::None)
259 }
260
261 pub fn has_code_spans(&self) -> bool {
263 !self.code_spans.is_empty()
264 }
265}
266
267pub fn compute_code_blocks(content: &str) -> Vec<CodeBlockState> {
269 let mut in_fenced_block = false;
270 let mut result = Vec::new();
271 let mut fence_marker = "";
272
273 for line in content.lines() {
274 if in_fenced_block {
275 if line.trim().starts_with(fence_marker) {
276 in_fenced_block = false;
277 result.push(CodeBlockState::Fenced); } else {
279 result.push(CodeBlockState::Fenced);
280 }
281 } else if FENCED_CODE_BLOCK_PATTERN.is_match(line) {
282 in_fenced_block = true;
283 fence_marker = if line.trim().starts_with("```") { "```" } else { "~~~" };
284 result.push(CodeBlockState::Fenced); } else if !line.trim().is_empty() {
286 let expanded_line = line.replace('\t', " ");
288 if INDENTED_CODE_BLOCK_PATTERN.is_match(&expanded_line) {
289 result.push(CodeBlockState::Indented);
290 } else {
291 result.push(CodeBlockState::None);
292 }
293 } else {
294 result.push(CodeBlockState::None);
295 }
296 }
297
298 result
299}
300
/// Finds inline code spans in `content`.
///
/// Returns `(start, end)` byte offsets for every span, delimiters included, with `end`
/// exclusive, so `&content[start..end]` is the full span text. A span is a run of backticks
/// followed later by a run of exactly the same length. Backticks directly preceded by a
/// backslash are ignored, and backtick runs that look like fence delimiters (see
/// `is_likely_code_block_delimiter`) never open a span.
pub fn compute_code_spans(content: &str) -> Vec<(usize, usize)> {
    let mut spans = Vec::new();

    // Keep each character's byte offset next to it so that span boundaries can be reported
    // in bytes without re-summing character widths for every span.
    let (offsets, chars): (Vec<usize>, Vec<char>) = content.char_indices().unzip();
    let byte_offset = |char_idx: usize| offsets.get(char_idx).copied().unwrap_or(content.len());
    let is_escaped = |idx: usize| idx > 0 && chars[idx - 1] == '\\';
    // Index just past the backtick run that starts at `from`.
    let run_end = |from: usize| from + chars[from..].iter().take_while(|&&c| c == '`').count();

    let mut i = 0;
    while i < chars.len() {
        if chars[i] != '`' || is_escaped(i) {
            i += 1;
            continue;
        }

        let open_start = i;
        let open_end = run_end(open_start);
        let open_len = open_end - open_start;
        // Whatever happens next, scanning resumes no earlier than the end of this run.
        i = open_end;

        if is_likely_code_block_delimiter(&chars, open_start) {
            continue;
        }

        let mut j = open_end;
        while j < chars.len() {
            if chars[j] != '`' || is_escaped(j) {
                j += 1;
                continue;
            }

            let close_end = run_end(j);
            if close_end - j == open_len {
                spans.push((byte_offset(open_start), byte_offset(close_end)));
                i = close_end;
                break;
            }

            // A run of a different length cannot close this span; keep looking after it.
            j = close_end;
        }
    }

    spans
}

/// Heuristic: does the backtick run starting at `start_idx` look like a code fence?
///
/// Returns `true` when the run has at least three backticks and only whitespace separates
/// it from the start of its line. Every preceding character on the line is inspected,
/// including the very first character of the input.
fn is_likely_code_block_delimiter(chars: &[char], start_idx: usize) -> bool {
    let run_len = chars
        .get(start_idx..)
        .map_or(0, |rest| rest.iter().take_while(|&&c| c == '`').count());

    if run_len < 3 {
        return false;
    }

    chars[..start_idx]
        .iter()
        .rev()
        .take_while(|&&c| c != '\n')
        .all(|c| c.is_whitespace())
}
419
/// Code block style a document may be held to.
#[derive(Debug, PartialEq, Eq, Clone, Copy, Default)]
pub enum CodeBlockStyle {
    /// The default variant; rendered as `consistent`.
    #[default]
    Consistent,
    /// Rendered as `indented`.
    Indented,
    /// Rendered as `fenced`.
    Fenced,
}

impl fmt::Display for CodeBlockStyle {
    /// Writes the lowercase name of the variant.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match *self {
            CodeBlockStyle::Consistent => "consistent",
            CodeBlockStyle::Indented => "indented",
            CodeBlockStyle::Fenced => "fenced",
        };
        f.write_str(label)
    }
}
441
#[cfg(test)]
mod tests {
    //! Unit tests for the code-block helpers.
    //!
    //! Indentation-sensitive fixtures spell out their leading whitespace explicitly (four
    //! spaces for indented code, three spaces for "not enough"), because the detection
    //! threshold is four columns.

    use super::*;

    #[test]
    fn test_is_in_code_block() {
        let content = "Normal text\n```rust\nlet x = 1;\n```\nMore text";

        assert!(!CodeBlockUtils::is_in_code_block(content, 0));
        assert!(CodeBlockUtils::is_in_code_block(content, 1));
        assert!(CodeBlockUtils::is_in_code_block(content, 2));
        // The closing fence has already ended the block when its own line is evaluated.
        assert!(!CodeBlockUtils::is_in_code_block(content, 3));
        assert!(!CodeBlockUtils::is_in_code_block(content, 4));

        // Tilde fences behave the same way.
        let content2 = "Text\n~~~\ncode\n~~~\nEnd";
        assert!(!CodeBlockUtils::is_in_code_block(content2, 0));
        assert!(CodeBlockUtils::is_in_code_block(content2, 1));
        assert!(CodeBlockUtils::is_in_code_block(content2, 2));
        assert!(!CodeBlockUtils::is_in_code_block(content2, 3));
        assert!(!CodeBlockUtils::is_in_code_block(content2, 4));

        // Four-space indentation marks an indented code block.
        let content3 = "Normal\n    indented code\nNormal";
        assert!(!CodeBlockUtils::is_in_code_block(content3, 0));
        assert!(CodeBlockUtils::is_in_code_block(content3, 1));
        assert!(!CodeBlockUtils::is_in_code_block(content3, 2));

        // Out-of-range line numbers are never inside a code block.
        assert!(!CodeBlockUtils::is_in_code_block("test", 10));
    }

    #[test]
    fn test_is_code_block_delimiter() {
        assert!(CodeBlockUtils::is_code_block_delimiter("```"));
        assert!(CodeBlockUtils::is_code_block_delimiter("```rust"));
        assert!(CodeBlockUtils::is_code_block_delimiter("  ```"));
        assert!(CodeBlockUtils::is_code_block_delimiter("~~~"));
        assert!(CodeBlockUtils::is_code_block_delimiter("~~~python"));

        assert!(!CodeBlockUtils::is_code_block_delimiter("Normal text"));
        assert!(!CodeBlockUtils::is_code_block_delimiter("``"));
        assert!(!CodeBlockUtils::is_code_block_delimiter("~"));
        assert!(!CodeBlockUtils::is_code_block_delimiter(""));
    }

    #[test]
    fn test_is_code_block_start() {
        assert!(CodeBlockUtils::is_code_block_start("```"));
        assert!(CodeBlockUtils::is_code_block_start("```rust"));
        assert!(CodeBlockUtils::is_code_block_start("~~~"));
        assert!(CodeBlockUtils::is_code_block_start("~~~python"));
        assert!(CodeBlockUtils::is_code_block_start("  ```"));

        assert!(!CodeBlockUtils::is_code_block_start("Normal text"));
        assert!(!CodeBlockUtils::is_code_block_start(""));
    }

    #[test]
    fn test_is_code_block_end() {
        assert!(CodeBlockUtils::is_code_block_end("```"));
        assert!(CodeBlockUtils::is_code_block_end("~~~"));
        assert!(CodeBlockUtils::is_code_block_end("  ```"));
        assert!(CodeBlockUtils::is_code_block_end("```  "));

        // A fence with an info string can only open a block.
        assert!(!CodeBlockUtils::is_code_block_end("```rust"));
        assert!(!CodeBlockUtils::is_code_block_end("~~~python"));
        assert!(!CodeBlockUtils::is_code_block_end("Normal text"));
    }

    #[test]
    fn test_is_indented_code_block() {
        assert!(CodeBlockUtils::is_indented_code_block("    code"));
        assert!(CodeBlockUtils::is_indented_code_block("      more indented"));
        // Tabs count as four columns each.
        assert!(CodeBlockUtils::is_indented_code_block("\tcode"));
        assert!(CodeBlockUtils::is_indented_code_block("\t\tcode"));
        assert!(CodeBlockUtils::is_indented_code_block("  \tcode"));
        // Three spaces are not enough.
        assert!(!CodeBlockUtils::is_indented_code_block("   code"));
        assert!(!CodeBlockUtils::is_indented_code_block("normal text"));
        assert!(!CodeBlockUtils::is_indented_code_block(""));
    }

    #[test]
    fn test_get_language_specifier() {
        assert_eq!(
            CodeBlockUtils::get_language_specifier("```rust"),
            Some("rust".to_string())
        );
        assert_eq!(
            CodeBlockUtils::get_language_specifier("~~~python"),
            Some("python".to_string())
        );
        assert_eq!(
            CodeBlockUtils::get_language_specifier("```javascript"),
            Some("javascript".to_string())
        );
        assert_eq!(
            CodeBlockUtils::get_language_specifier("  ```rust"),
            Some("rust".to_string())
        );
        assert_eq!(
            CodeBlockUtils::get_language_specifier("```rust ignore"),
            Some("rust ignore".to_string())
        );

        assert_eq!(CodeBlockUtils::get_language_specifier("```"), None);
        assert_eq!(CodeBlockUtils::get_language_specifier("~~~"), None);
        assert_eq!(CodeBlockUtils::get_language_specifier("Normal text"), None);
        assert_eq!(CodeBlockUtils::get_language_specifier(""), None);
    }

    #[test]
    fn test_identify_code_block_lines() {
        let content = "Normal text\n```rust\nlet x = 1;\n```\nMore text";
        let result = CodeBlockUtils::identify_code_block_lines(content);
        assert_eq!(result, vec![false, true, true, true, false]);

        let content2 = "Text\n~~~\ncode\n~~~\nEnd";
        let result2 = CodeBlockUtils::identify_code_block_lines(content2);
        assert_eq!(result2, vec![false, true, true, true, false]);

        // Four-space indented lines are code.
        let content3 = "Normal\n    code\n    more code\nNormal";
        let result3 = CodeBlockUtils::identify_code_block_lines(content3);
        assert_eq!(result3, vec![false, true, true, false]);

        // Indented list items are not code.
        let content4 = "List:\n    * Item 1\n    * Item 2";
        let result4 = CodeBlockUtils::identify_code_block_lines(content4);
        assert_eq!(result4, vec![false, false, false]);
    }

    #[test]
    fn test_code_block_state_enum() {
        assert_eq!(CodeBlockState::None, CodeBlockState::None);
        assert_eq!(CodeBlockState::Fenced, CodeBlockState::Fenced);
        assert_eq!(CodeBlockState::Indented, CodeBlockState::Indented);
        assert_ne!(CodeBlockState::None, CodeBlockState::Fenced);
    }

    #[test]
    fn test_code_block_info() {
        let content = "Normal\n```\ncode\n```\nText";
        let info = CodeBlockInfo::new(content);

        assert!(!info.is_in_code_block(0));
        assert!(info.is_in_code_block(1));
        assert!(info.is_in_code_block(2));
        assert!(info.is_in_code_block(3));
        assert!(!info.is_in_code_block(4));

        assert!(info.has_code_blocks());

        // Out-of-range lines are never inside a code block.
        assert!(!info.is_in_code_block(100));
    }

    #[test]
    fn test_code_block_info_code_spans() {
        let content = "Text with `inline code` here";
        let info = CodeBlockInfo::new(content);

        assert!(info.has_code_spans());
        assert!(!info.has_code_blocks());

        // Positions within the span.
        assert!(info.is_in_code_span(0, 11));
        assert!(info.is_in_code_span(0, 15));
        // Positions before and after the span.
        assert!(!info.is_in_code_span(0, 5));
        assert!(!info.is_in_code_span(0, 25));
    }

    #[test]
    fn test_compute_code_blocks() {
        let content = "Normal\n```\ncode\n```\n    indented";
        let states = compute_code_blocks(content);

        assert_eq!(states[0], CodeBlockState::None);
        assert_eq!(states[1], CodeBlockState::Fenced);
        assert_eq!(states[2], CodeBlockState::Fenced);
        assert_eq!(states[3], CodeBlockState::Fenced);
        assert_eq!(states[4], CodeBlockState::Indented);
    }

    #[test]
    fn test_compute_code_spans() {
        let content = "Text `code` and ``double`` backticks";
        let spans = compute_code_spans(content);

        assert_eq!(spans.len(), 2);
        assert_eq!(&content[spans[0].0..spans[0].1], "`code`");
        assert_eq!(&content[spans[1].0..spans[1].1], "``double``");

        // Escaped backticks do not delimit a span.
        let content2 = r"Text \`not code\` but `real code`";
        let spans2 = compute_code_spans(content2);
        assert_eq!(spans2.len(), 1);
        assert!(content2[spans2[0].0..spans2[0].1].contains("real code"));
    }

    #[test]
    fn test_code_block_style() {
        assert_eq!(CodeBlockStyle::Fenced.to_string(), "fenced");
        assert_eq!(CodeBlockStyle::Indented.to_string(), "indented");
        assert_eq!(CodeBlockStyle::Consistent.to_string(), "consistent");

        assert_eq!(CodeBlockStyle::default(), CodeBlockStyle::Consistent);
    }

    #[test]
    fn test_nested_code_blocks() {
        // Each bare fence alternately opens and closes a block; the middle line is outside.
        let content = "```\n```\ncode\n```\n```";
        let result = CodeBlockUtils::identify_code_block_lines(content);
        assert_eq!(result, vec![true, true, false, true, true]);
    }

    #[test]
    fn test_unicode_content() {
        let content = "```rust\nlet 你好 = \"世界\";\n```";
        let result = CodeBlockUtils::identify_code_block_lines(content);
        assert_eq!(result, vec![true, true, true]);

        assert_eq!(CodeBlockUtils::get_language_specifier("```🦀"), Some("🦀".to_string()));
    }

    #[test]
    fn test_edge_cases() {
        // Empty input.
        assert_eq!(CodeBlockUtils::identify_code_block_lines(""), Vec::<bool>::new());
        assert!(!CodeBlockUtils::is_in_code_block("", 0));

        // A lone fence is itself a code-block line.
        assert_eq!(CodeBlockUtils::identify_code_block_lines("```"), vec![true]);
        assert_eq!(CodeBlockUtils::identify_code_block_lines("~~~"), vec![true]);

        // A tilde fence inside a backtick fence is still code.
        let content = "```\ncode\n~~~\nmore\n```";
        let result = CodeBlockUtils::identify_code_block_lines(content);
        assert_eq!(result, vec![true, true, true, true, true]);
    }
}