1use lazy_static::lazy_static;
2use regex::Regex;
3use std::fmt;
4
lazy_static! {
    // Line-level patterns used by `CodeBlockUtils`. Each one is compiled once on
    // first use; the `unwrap()` calls can only panic if a literal below is not a
    // valid regex.

    // Optional leading whitespace, three backticks, then any run of characters
    // that contains no backtick, CR or LF up to the end of the text. A bare
    // "```" also matches, so this pattern accepts closing fences as well.
    static ref FENCED_CODE_BLOCK_START: Regex = Regex::new(r"^(\s*)```(?:[^`\r\n]*)$").unwrap();
    // Optional leading whitespace, three backticks, then only whitespace.
    static ref FENCED_CODE_BLOCK_END: Regex = Regex::new(r"^(\s*)```\s*$").unwrap();
    // Same shape as FENCED_CODE_BLOCK_START, for tilde fences (no `~` after the fence).
    static ref ALTERNATE_FENCED_CODE_BLOCK_START: Regex = Regex::new(r"^(\s*)~~~(?:[^~\r\n]*)$").unwrap();
    // Same shape as FENCED_CODE_BLOCK_END, for tilde fences.
    static ref ALTERNATE_FENCED_CODE_BLOCK_END: Regex = Regex::new(r"^(\s*)~~~\s*$").unwrap();
    // At least four whitespace characters at the start of the text.
    static ref INDENTED_CODE_BLOCK: Regex = Regex::new(r"^(\s{4,})").unwrap();
    // Optional indentation, a bullet (`*`, `+`, `-`) or a number followed by
    // `.` or `)`, optional whitespace, then the rest of the line.
    static ref LIST_ITEM_RE: Regex = Regex::new(r"^(\s*)([*+-]|\d+[.)])(\s*)(.*)$").unwrap();
}
14
15pub struct CodeBlockUtils;
17
18impl CodeBlockUtils {
19 pub fn is_in_code_block(content: &str, line_num: usize) -> bool {
21 let lines: Vec<&str> = content.lines().collect();
22 if line_num >= lines.len() {
23 return false;
24 }
25
26 let mut in_fenced_code = false;
27 let mut in_alternate_fenced = false;
28
29 for (i, line) in lines.iter().enumerate() {
30 if i > line_num {
31 break;
32 }
33
34 if FENCED_CODE_BLOCK_START.is_match(line) {
35 in_fenced_code = !in_fenced_code;
36 } else if FENCED_CODE_BLOCK_END.is_match(line) && in_fenced_code {
37 in_fenced_code = false;
38 } else if ALTERNATE_FENCED_CODE_BLOCK_START.is_match(line) {
39 in_alternate_fenced = !in_alternate_fenced;
40 } else if ALTERNATE_FENCED_CODE_BLOCK_END.is_match(line) && in_alternate_fenced {
41 in_alternate_fenced = false;
42 }
43 }
44
45 if line_num < lines.len() && Self::is_indented_code_block(lines[line_num]) {
47 return true;
48 }
49
50 in_fenced_code || in_alternate_fenced
52 }
53
54 pub fn is_code_block_delimiter(line: &str) -> bool {
56 FENCED_CODE_BLOCK_START.is_match(line)
57 || FENCED_CODE_BLOCK_END.is_match(line)
58 || ALTERNATE_FENCED_CODE_BLOCK_START.is_match(line)
59 || ALTERNATE_FENCED_CODE_BLOCK_END.is_match(line)
60 }
61
62 pub fn is_code_block_start(line: &str) -> bool {
64 FENCED_CODE_BLOCK_START.is_match(line) || ALTERNATE_FENCED_CODE_BLOCK_START.is_match(line)
65 }
66
67 pub fn is_code_block_end(line: &str) -> bool {
69 FENCED_CODE_BLOCK_END.is_match(line) || ALTERNATE_FENCED_CODE_BLOCK_END.is_match(line)
70 }
71
72 pub fn is_indented_code_block(line: &str) -> bool {
74 let expanded_line = line.replace('\t', " ");
76 INDENTED_CODE_BLOCK.is_match(&expanded_line)
77 }
78
79 pub fn get_language_specifier(line: &str) -> Option<String> {
105 if FENCED_CODE_BLOCK_START.is_match(line) || ALTERNATE_FENCED_CODE_BLOCK_START.is_match(line) {
106 let trimmed = line.trim_start();
107 let after_fence = &trimmed[3..].trim_start();
108 if !after_fence.is_empty() {
109 return Some(after_fence.to_string());
110 }
111 }
112 None
113 }
114
115 pub fn identify_code_block_lines(content: &str) -> Vec<bool> {
144 let lines: Vec<&str> = content.lines().collect();
145 let mut in_code_block = vec![false; lines.len()];
146
147 let mut in_fenced_code = false;
148 let mut in_alternate_fenced = false;
149
150 for (i, line) in lines.iter().enumerate() {
151 let trimmed = line.trim_start();
153
154 if trimmed.starts_with("```") {
155 if FENCED_CODE_BLOCK_START.is_match(line) {
156 in_fenced_code = !in_fenced_code;
157 in_code_block[i] = true; } else if in_fenced_code && FENCED_CODE_BLOCK_END.is_match(line) {
159 in_fenced_code = false;
160 in_code_block[i] = true; }
162 } else if trimmed.starts_with("~~~") {
163 if ALTERNATE_FENCED_CODE_BLOCK_START.is_match(line) {
164 in_alternate_fenced = !in_alternate_fenced;
165 in_code_block[i] = true; } else if in_alternate_fenced && ALTERNATE_FENCED_CODE_BLOCK_END.is_match(line) {
167 in_alternate_fenced = false;
168 in_code_block[i] = true; }
170 }
171
172 if in_fenced_code || in_alternate_fenced {
174 in_code_block[i] = true;
175 } else if !in_code_block[i] {
176 if (line.starts_with(" ") || INDENTED_CODE_BLOCK.is_match(line)) && !LIST_ITEM_RE.is_match(line) {
179 in_code_block[i] = true;
180 }
181 }
182 }
183
184 in_code_block
185 }
186}
187
lazy_static! {
    // Patterns for the free functions that follow. Each one is compiled once
    // on first use.

    // Three backticks or three tildes at the very start of the text; leading
    // whitespace is not allowed by this pattern.
    static ref FENCED_CODE_BLOCK_PATTERN: Regex = Regex::new(r"^(?:```|~~~)").unwrap();
    // At least four leading whitespace characters (same pattern text as
    // INDENTED_CODE_BLOCK defined earlier in this file).
    static ref INDENTED_CODE_BLOCK_PATTERN: Regex = Regex::new(r"^(\s{4,})").unwrap();
    // One or more consecutive backticks, captured as group 1.
    // NOTE(review): no use of this pattern is visible in this part of the file - confirm it is still needed.
    static ref BACKTICK_PATTERN: Regex = Regex::new(r"(`+)").unwrap();
}
194
/// Per-line classification produced by `compute_code_blocks`.
///
/// `Eq` is derived alongside `PartialEq` (equality on a field-less enum is
/// total), matching the derive set used by `CodeBlockStyle` in this file.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum CodeBlockState {
    /// The line is not part of any code block.
    None,
    /// The line is a fence line or lies inside a fenced block.
    Fenced,
    /// The line is non-blank and indented by at least four columns.
    Indented,
}
202
203#[derive(Debug)]
205pub struct CodeBlockInfo {
206 pub block_states: Vec<CodeBlockState>,
208 pub code_spans: Vec<(usize, usize)>,
210 content: String,
212}
213
214impl CodeBlockInfo {
215 pub fn new(content: &str) -> Self {
217 let block_states = compute_code_blocks(content);
218 let code_spans = compute_code_spans(content);
219
220 CodeBlockInfo {
221 block_states,
222 code_spans,
223 content: content.to_string(),
224 }
225 }
226
227 pub fn is_in_code_block(&self, line_index: usize) -> bool {
229 if line_index < self.block_states.len() {
230 self.block_states[line_index] != CodeBlockState::None
231 } else {
232 false
233 }
234 }
235
236 pub fn is_in_code_span(&self, line_index: usize, column_index: usize) -> bool {
238 let mut position = 0;
240 let content_lines: Vec<&str> = self.content.lines().collect();
241
242 for i in 0..line_index {
243 if i < content_lines.len() {
244 position += content_lines[i].len() + 1; }
246 }
247
248 if line_index < content_lines.len() {
249 let line = content_lines[line_index];
251 if column_index < line.len() {
252 position += column_index;
253
254 for &(start, end) in &self.code_spans {
256 if position >= start && position <= end {
257 return true;
258 }
259 }
260 }
261 }
262
263 false
264 }
265
266 pub fn has_code_blocks(&self) -> bool {
268 self.block_states.iter().any(|state| *state != CodeBlockState::None)
269 }
270
271 pub fn has_code_spans(&self) -> bool {
273 !self.code_spans.is_empty()
274 }
275}
276
277pub fn compute_code_blocks(content: &str) -> Vec<CodeBlockState> {
279 let mut in_fenced_block = false;
280 let mut result = Vec::new();
281 let mut fence_marker = "";
282
283 for line in content.lines() {
284 if in_fenced_block {
285 if line.trim().starts_with(fence_marker) {
286 in_fenced_block = false;
287 result.push(CodeBlockState::Fenced); } else {
289 result.push(CodeBlockState::Fenced);
290 }
291 } else if FENCED_CODE_BLOCK_PATTERN.is_match(line) {
292 in_fenced_block = true;
293 fence_marker = if line.trim().starts_with("```") { "```" } else { "~~~" };
294 result.push(CodeBlockState::Fenced); } else if !line.trim().is_empty() {
296 let expanded_line = line.replace('\t', " ");
298 if INDENTED_CODE_BLOCK_PATTERN.is_match(&expanded_line) {
299 result.push(CodeBlockState::Indented);
300 } else {
301 result.push(CodeBlockState::None);
302 }
303 } else {
304 result.push(CodeBlockState::None);
305 }
306 }
307
308 result
309}
310
311pub fn compute_code_spans(content: &str) -> Vec<(usize, usize)> {
313 let mut spans = Vec::new();
314
315 let chars: Vec<char> = content.chars().collect();
317 let mut i = 0;
318
319 while i < chars.len() {
320 if i > 0 && chars[i] == '`' && chars[i - 1] == '\\' {
322 i += 1;
323 continue;
324 }
325
326 if chars[i] == '`' {
328 let mut backtick_count = 1;
329 let start_idx = i;
330
331 i += 1;
333 while i < chars.len() && chars[i] == '`' {
334 backtick_count += 1;
335 i += 1;
336 }
337
338 if is_likely_code_block_delimiter(&chars, start_idx) {
341 continue;
342 }
343
344 let mut j = i;
346 let mut found_closing = false;
347
348 while j < chars.len() {
349 if j > 0 && chars[j] == '`' && chars[j - 1] == '\\' {
351 j += 1;
352 continue;
353 }
354
355 if chars[j] == '`' {
356 let mut closing_count = 1;
357 let potential_end = j;
358
359 j += 1;
361 while j < chars.len() && chars[j] == '`' {
362 closing_count += 1;
363 j += 1;
364 }
365
366 if closing_count == backtick_count {
368 let start_byte = chars[..start_idx].iter().map(|c| c.len_utf8()).sum();
370 let end_byte = chars[..potential_end + closing_count]
371 .iter()
372 .map(|c| c.len_utf8())
373 .sum();
374
375 spans.push((start_byte, end_byte));
376 i = j; found_closing = true;
378 break;
379 }
380 }
381
382 j += 1;
383 }
384
385 if !found_closing {
386 continue;
388 }
389 } else {
390 i += 1;
391 }
392 }
393
394 spans
395}
396
/// Decides whether the backtick run starting at `start_idx` looks like a
/// fenced-code-block delimiter rather than an inline code span.
///
/// That is the case when the run is at least three backticks long and
/// everything between the start of its line and the run is whitespace.
/// Returns `false` when `start_idx` is out of range or does not point at a
/// backtick.
fn is_likely_code_block_delimiter(chars: &[char], start_idx: usize) -> bool {
    let run_len = chars
        .get(start_idx..)
        .map_or(0, |tail| tail.iter().take_while(|&&c| c == '`').count());
    if run_len < 3 {
        return false;
    }

    // Walk back to the previous newline (or the start of the text). Every
    // character on the way, including the one at index 0, must be whitespace.
    chars[..start_idx]
        .iter()
        .rev()
        .take_while(|&&c| c != '\n')
        .all(|c| c.is_whitespace())
}
429
/// Code-block style selector; its `Display` form is the lowercase variant name.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum CodeBlockStyle {
    /// The default variant; displayed as `consistent`.
    // NOTE(review): presumably "follow whichever style the document already uses" - confirm against the consumer.
    #[default]
    Consistent,
    /// Displayed as `indented`.
    Indented,
    /// Displayed as `fenced`.
    Fenced,
}

impl fmt::Display for CodeBlockStyle {
    /// Writes the lowercase name of the variant.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match *self {
            CodeBlockStyle::Consistent => "consistent",
            CodeBlockStyle::Indented => "indented",
            CodeBlockStyle::Fenced => "fenced",
        };
        f.write_str(label)
    }
}
451
#[cfg(test)]
mod tests {
    //! Unit tests for the code-block helpers.
    //!
    //! Indented-code fixtures use exactly four spaces (or a tab) for "is code"
    //! and three spaces for "is not code", matching the four-column rule.

    use super::*;

    #[test]
    fn test_is_in_code_block() {
        let content = "Normal text\n```rust\nlet x = 1;\n```\nMore text";

        assert!(!CodeBlockUtils::is_in_code_block(content, 0));
        assert!(CodeBlockUtils::is_in_code_block(content, 1));
        assert!(CodeBlockUtils::is_in_code_block(content, 2));
        // The closing fence ends the block, so it is reported as outside.
        assert!(!CodeBlockUtils::is_in_code_block(content, 3));
        assert!(!CodeBlockUtils::is_in_code_block(content, 4));

        // Tilde fences behave the same way.
        let content2 = "Text\n~~~\ncode\n~~~\nEnd";
        assert!(!CodeBlockUtils::is_in_code_block(content2, 0));
        assert!(CodeBlockUtils::is_in_code_block(content2, 1));
        assert!(CodeBlockUtils::is_in_code_block(content2, 2));
        assert!(!CodeBlockUtils::is_in_code_block(content2, 3));
        assert!(!CodeBlockUtils::is_in_code_block(content2, 4));

        // Indented code block (four spaces).
        let content3 = "Normal\n    indented code\nNormal";
        assert!(!CodeBlockUtils::is_in_code_block(content3, 0));
        assert!(CodeBlockUtils::is_in_code_block(content3, 1));
        assert!(!CodeBlockUtils::is_in_code_block(content3, 2));

        // Out-of-range line number.
        assert!(!CodeBlockUtils::is_in_code_block("test", 10));
    }

    #[test]
    fn test_is_code_block_delimiter() {
        assert!(CodeBlockUtils::is_code_block_delimiter("```"));
        assert!(CodeBlockUtils::is_code_block_delimiter("```rust"));
        assert!(CodeBlockUtils::is_code_block_delimiter(" ```"));
        assert!(CodeBlockUtils::is_code_block_delimiter("~~~"));
        assert!(CodeBlockUtils::is_code_block_delimiter("~~~python"));

        assert!(!CodeBlockUtils::is_code_block_delimiter("Normal text"));
        assert!(!CodeBlockUtils::is_code_block_delimiter("``"));
        assert!(!CodeBlockUtils::is_code_block_delimiter("~"));
        assert!(!CodeBlockUtils::is_code_block_delimiter(""));
    }

    #[test]
    fn test_is_code_block_start() {
        assert!(CodeBlockUtils::is_code_block_start("```"));
        assert!(CodeBlockUtils::is_code_block_start("```rust"));
        assert!(CodeBlockUtils::is_code_block_start("~~~"));
        assert!(CodeBlockUtils::is_code_block_start("~~~python"));
        assert!(CodeBlockUtils::is_code_block_start(" ```"));

        assert!(!CodeBlockUtils::is_code_block_start("Normal text"));
        assert!(!CodeBlockUtils::is_code_block_start(""));
    }

    #[test]
    fn test_is_code_block_end() {
        assert!(CodeBlockUtils::is_code_block_end("```"));
        assert!(CodeBlockUtils::is_code_block_end("~~~"));
        assert!(CodeBlockUtils::is_code_block_end(" ```"));
        assert!(CodeBlockUtils::is_code_block_end("``` "));

        // A fence followed by an info string is not a closing fence.
        assert!(!CodeBlockUtils::is_code_block_end("```rust"));
        assert!(!CodeBlockUtils::is_code_block_end("~~~python"));
        assert!(!CodeBlockUtils::is_code_block_end("Normal text"));
    }

    #[test]
    fn test_is_indented_code_block() {
        assert!(CodeBlockUtils::is_indented_code_block("    code"));
        assert!(CodeBlockUtils::is_indented_code_block("        more indented"));
        // A tab counts as four columns.
        assert!(CodeBlockUtils::is_indented_code_block("\tcode"));
        assert!(CodeBlockUtils::is_indented_code_block("\t\tcode"));
        // Mixed spaces and tab.
        assert!(CodeBlockUtils::is_indented_code_block("  \tcode"));
        // Three spaces are not enough.
        assert!(!CodeBlockUtils::is_indented_code_block("   code"));
        assert!(!CodeBlockUtils::is_indented_code_block("normal text"));
        assert!(!CodeBlockUtils::is_indented_code_block(""));
    }

    #[test]
    fn test_get_language_specifier() {
        assert_eq!(
            CodeBlockUtils::get_language_specifier("```rust"),
            Some("rust".to_string())
        );
        assert_eq!(
            CodeBlockUtils::get_language_specifier("~~~python"),
            Some("python".to_string())
        );
        assert_eq!(
            CodeBlockUtils::get_language_specifier("```javascript"),
            Some("javascript".to_string())
        );
        assert_eq!(
            CodeBlockUtils::get_language_specifier(" ```rust"),
            Some("rust".to_string())
        );
        assert_eq!(
            CodeBlockUtils::get_language_specifier("```rust ignore"),
            Some("rust ignore".to_string())
        );

        assert_eq!(CodeBlockUtils::get_language_specifier("```"), None);
        assert_eq!(CodeBlockUtils::get_language_specifier("~~~"), None);
        assert_eq!(CodeBlockUtils::get_language_specifier("Normal text"), None);
        assert_eq!(CodeBlockUtils::get_language_specifier(""), None);
    }

    #[test]
    fn test_identify_code_block_lines() {
        let content = "Normal text\n```rust\nlet x = 1;\n```\nMore text";

        let result = CodeBlockUtils::identify_code_block_lines(content);
        assert_eq!(result, vec![false, true, true, true, false]);

        // Tilde fences.
        let content2 = "Text\n~~~\ncode\n~~~\nEnd";
        let result2 = CodeBlockUtils::identify_code_block_lines(content2);
        assert_eq!(result2, vec![false, true, true, true, false]);

        // Indented code (four spaces).
        let content3 = "Normal\n    code\n    more code\nNormal";
        let result3 = CodeBlockUtils::identify_code_block_lines(content3);
        assert_eq!(result3, vec![false, true, true, false]);

        // Indented list items are not code.
        let content4 = "List:\n    * Item 1\n    * Item 2";
        let result4 = CodeBlockUtils::identify_code_block_lines(content4);
        assert_eq!(result4, vec![false, false, false]);
    }

    #[test]
    fn test_code_block_state_enum() {
        assert_eq!(CodeBlockState::None, CodeBlockState::None);
        assert_eq!(CodeBlockState::Fenced, CodeBlockState::Fenced);
        assert_eq!(CodeBlockState::Indented, CodeBlockState::Indented);
        assert_ne!(CodeBlockState::None, CodeBlockState::Fenced);
    }

    #[test]
    fn test_code_block_info() {
        let content = "Normal\n```\ncode\n```\nText";
        let info = CodeBlockInfo::new(content);

        assert!(!info.is_in_code_block(0));
        assert!(info.is_in_code_block(1));
        assert!(info.is_in_code_block(2));
        assert!(info.is_in_code_block(3));
        assert!(!info.is_in_code_block(4));

        assert!(info.has_code_blocks());

        // Out-of-range line index.
        assert!(!info.is_in_code_block(100));
    }

    #[test]
    fn test_code_block_info_code_spans() {
        let content = "Text with `inline code` here";
        let info = CodeBlockInfo::new(content);

        assert!(info.has_code_spans());
        assert!(!info.has_code_blocks());

        // Inside the span.
        assert!(info.is_in_code_span(0, 11));
        assert!(info.is_in_code_span(0, 15));
        // Before and after the span.
        assert!(!info.is_in_code_span(0, 5));
        assert!(!info.is_in_code_span(0, 25));
    }

    #[test]
    fn test_compute_code_blocks() {
        let content = "Normal\n```\ncode\n```\n    indented";
        let states = compute_code_blocks(content);

        assert_eq!(states[0], CodeBlockState::None);
        assert_eq!(states[1], CodeBlockState::Fenced);
        assert_eq!(states[2], CodeBlockState::Fenced);
        assert_eq!(states[3], CodeBlockState::Fenced);
        assert_eq!(states[4], CodeBlockState::Indented);
    }

    #[test]
    fn test_compute_code_spans() {
        let content = "Text `code` and ``double`` backticks";
        let spans = compute_code_spans(content);

        assert_eq!(spans.len(), 2);
        assert_eq!(&content[spans[0].0..spans[0].1], "`code`");
        assert_eq!(&content[spans[1].0..spans[1].1], "``double``");

        // Escaped backticks do not open a span.
        let content2 = r"Text \`not code\` but `real code`";
        let spans2 = compute_code_spans(content2);
        assert_eq!(spans2.len(), 1);
        assert!(content2[spans2[0].0..spans2[0].1].contains("real code"));
    }

    #[test]
    fn test_code_block_style() {
        assert_eq!(CodeBlockStyle::Fenced.to_string(), "fenced");
        assert_eq!(CodeBlockStyle::Indented.to_string(), "indented");
        assert_eq!(CodeBlockStyle::Consistent.to_string(), "consistent");

        assert_eq!(CodeBlockStyle::default(), CodeBlockStyle::Consistent);
    }

    #[test]
    fn test_nested_code_blocks() {
        // Two back-to-back empty blocks with plain text between them.
        let content = "```\n```\ncode\n```\n```";
        let result = CodeBlockUtils::identify_code_block_lines(content);
        assert_eq!(result, vec![true, true, false, true, true]);
    }

    #[test]
    fn test_unicode_content() {
        let content = "```rust\nlet 你好 = \"世界\";\n```";
        let result = CodeBlockUtils::identify_code_block_lines(content);
        assert_eq!(result, vec![true, true, true]);

        assert_eq!(
            CodeBlockUtils::get_language_specifier("```🦀"),
            Some("🦀".to_string())
        );
    }

    #[test]
    fn test_edge_cases() {
        // Empty content.
        assert_eq!(CodeBlockUtils::identify_code_block_lines(""), Vec::<bool>::new());
        assert!(!CodeBlockUtils::is_in_code_block("", 0));

        // A lone fence line.
        assert_eq!(CodeBlockUtils::identify_code_block_lines("```"), vec![true]);
        assert_eq!(CodeBlockUtils::identify_code_block_lines("~~~"), vec![true]);

        // A tilde fence inside a backtick block.
        let content = "```\ncode\n~~~\nmore\n```";
        let result = CodeBlockUtils::identify_code_block_lines(content);
        assert_eq!(result, vec![true, true, true, true, true]);
    }
}