1use carta_ast::{
16 Alignment, Attr, Block, Caption, Cell, ColSpec, ColWidth, Document, Format, Inline,
17 ListAttributes, ListNumberDelim, ListNumberStyle, MathType, QuoteType, Row, Table, TableBody,
18 TableFoot, TableHead, Target, to_plain_text,
19};
20use carta_core::{Extension, Reader, ReaderOptions, Result};
21use unicode_normalization::UnicodeNormalization;
22
23use crate::entities;
24use crate::heading_ids::{IdRegistry, IdScheme};
25use crate::inline_text::trim_inline_ends;
26
/// Feature switches that the inline and block parsers consult while reading.
#[derive(Debug, Clone, Copy)]
struct Ctx {
    /// Enables the smart-punctuation paths (quotes, dash runs, dot runs).
    smart: bool,
    /// Enables the `$`-delimited math paths.
    math: bool,
}
35
/// The terminator a nested inline scan is waiting for.
#[derive(Debug, Clone, Copy)]
enum Closer {
    /// A single quote character (one char wide).
    Quote(char),
    /// A doubled delimiter character such as `**` (two chars wide).
    Delim(char),
    /// The doubled apostrophe `''` that ends monospace text (two chars wide).
    Mono,
}
46
/// Records which kinds of smart quote the scanner is currently nested inside.
#[derive(Debug, Clone, Copy, Default)]
struct QuoteCtx {
    /// Set while scanning the body of a single-quoted span.
    in_single: bool,
    /// Set while scanning the body of a double-quoted span.
    in_double: bool,
}
54
/// Stateless reader that turns DokuWiki markup into a [`Document`].
#[derive(Debug, Default, Clone, Copy)]
pub struct DokuwikiReader;
58
59impl Reader for DokuwikiReader {
60 fn read(&self, input: &str, options: &ReaderOptions) -> Result<Document> {
61 let ctx = Ctx {
62 smart: options.extensions.contains(Extension::Smart),
63 math: options.extensions.contains(Extension::TexMathDollars),
64 };
65 let text = normalize_newlines(input);
66 let lines: Vec<&str> = text.split('\n').collect();
67 let mut index = 0;
68 let mut blocks = parse_blocks(&lines, &mut index, ctx, 0);
69 if options.extensions.contains(Extension::EastAsianLineBreaks) {
70 strip_wide_line_breaks(&mut blocks);
71 }
72 if options.extensions.contains(Extension::AutoIdentifiers)
75 && let Some(scheme) = IdScheme::select(options.extensions, false)
76 {
77 let ascii = options.extensions.contains(Extension::AsciiIdentifiers);
78 let mut registry = IdRegistry::default();
79 assign_heading_ids(&mut blocks, scheme, ascii, &mut registry);
80 }
81 Ok(Document {
82 blocks,
83 ..Default::default()
84 })
85 }
86}
87
/// Nesting limit: parsers compare their `depth` argument against this value
/// before recursing any further.
const MAX_DEPTH: usize = 32;
91
/// Returns `input` with every `\r\n` pair and every lone `\r` replaced by a
/// single `\n`.
fn normalize_newlines(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut rest = input.chars().peekable();
    while let Some(c) = rest.next() {
        if c == '\r' {
            // Swallow the `\n` of a CRLF pair so it yields one newline.
            if rest.peek() == Some(&'\n') {
                rest.next();
            }
            out.push('\n');
        } else {
            out.push(c);
        }
    }
    out
}
97
/// Reports whether the characters of `needle` appear in `chars` starting at
/// index `i`. An empty `needle` matches at every index.
fn matches_at(chars: &[char], i: usize, needle: &str) -> bool {
    for (offset, expected) in needle.chars().enumerate() {
        match chars.get(i + offset) {
            Some(&found) if found == expected => {}
            // Either a mismatch or the haystack ran out.
            _ => return false,
        }
    }
    true
}
105
/// Counts the space characters at the start of `line`. Tabs do not count.
fn leading_spaces(line: &str) -> usize {
    // A space is one byte, so the byte difference equals the space count.
    line.len() - line.trim_start_matches(' ').len()
}
110
/// Column interval that a tab character advances to when columns are counted
/// or tabs are expanded.
const TAB_STOP: usize = 4;
113
114fn expand_tabs(line: &str) -> String {
117 let mut out = String::new();
118 let mut col = 0;
119 for c in line.chars() {
120 if c == '\t' {
121 let next = (col / TAB_STOP + 1) * TAB_STOP;
122 for _ in col..next {
123 out.push(' ');
124 }
125 col = next;
126 } else {
127 out.push(c);
128 col += 1;
129 }
130 }
131 out
132}
133
134fn leading_columns(line: &str) -> usize {
137 let mut col = 0;
138 for c in line.chars() {
139 match c {
140 '\t' => col = (col / TAB_STOP + 1) * TAB_STOP,
141 ' ' => col += 1,
142 _ => break,
143 }
144 }
145 col
146}
147
148fn parse_blocks(lines: &[&str], index: &mut usize, ctx: Ctx, depth: usize) -> Vec<Block> {
154 let mut blocks = Vec::new();
155 while *index < lines.len() {
156 let line = lines.get(*index).copied().unwrap_or("");
157 if line.trim().is_empty() {
158 *index += 1;
159 continue;
160 }
161 if let Some((level, title, trailing)) = header_split(line) {
162 blocks.push(Block::Header(
163 level,
164 Box::default(),
165 inline_content(&title, ctx, depth),
166 ));
167 *index += 1;
168 if !trailing.trim().is_empty() && depth < MAX_DEPTH {
170 let tail = [trailing.as_str()];
171 let mut tail_index = 0;
172 blocks.append(&mut parse_blocks(&tail, &mut tail_index, ctx, depth + 1));
173 }
174 continue;
175 }
176 if let Some(block) = parse_code_or_raw(lines, index) {
177 blocks.push(block);
178 continue;
179 }
180 if is_table_line(line) {
181 blocks.push(parse_table(lines, index, ctx, depth));
182 continue;
183 }
184 if opens_list(line) {
185 blocks.push(parse_list(lines, index, ctx, depth));
186 continue;
187 }
188 if is_indented_code(line) {
189 blocks.push(parse_indented_code(lines, index));
190 continue;
191 }
192 if is_thematic_break(line) {
193 blocks.push(Block::HorizontalRule);
194 *index += 1;
195 continue;
196 }
197 if quote_depth(line).is_some() {
198 blocks.push(parse_quote(lines, index, ctx, depth));
199 continue;
200 }
201 blocks.append(&mut parse_paragraph(lines, index, ctx, depth));
202 }
203 blocks
204}
205
206fn interrupts_paragraph(line: &str) -> bool {
209 line.trim().is_empty()
210 || header_split(line).is_some()
211 || is_block_tag(line)
212 || is_table_line(line)
213 || opens_list(line)
214 || is_indented_code(line)
215 || is_thematic_break(line)
216 || quote_depth(line).is_some()
217}
218
219fn parse_paragraph(lines: &[&str], index: &mut usize, ctx: Ctx, depth: usize) -> Vec<Block> {
223 let mut buffer = String::new();
224 let mut first = true;
225 while *index < lines.len() {
226 let line = lines.get(*index).copied().unwrap_or("");
227 if !first && interrupts_paragraph(line) {
228 break;
229 }
230 if !first {
231 buffer.push('\n');
232 }
233 buffer.push_str(line);
234 first = false;
235 *index += 1;
236 }
237 split_on_embedded_code(&buffer, ctx, depth)
238}
239
240fn split_on_embedded_code(text: &str, ctx: Ctx, depth: usize) -> Vec<Block> {
244 let chars: Vec<char> = text.chars().collect();
245 if depth < MAX_DEPTH
246 && let Some((start, block, end)) = find_embedded_code(&chars)
247 {
248 let mut out = Vec::new();
249 let before: String = chars.get(..start).unwrap_or(&[]).iter().collect();
250 if !before.trim().is_empty() {
251 out.push(Block::Para(inline_content(before.trim(), ctx, depth)));
252 }
253 out.push(block);
254 let after: String = chars.get(end..).unwrap_or(&[]).iter().collect();
255 if !after.trim().is_empty() {
256 out.append(&mut split_on_embedded_code(&after, ctx, depth + 1));
257 }
258 out
259 } else if text.trim().is_empty() {
260 Vec::new()
261 } else {
262 vec![Block::Para(inline_content(text.trim(), ctx, depth))]
263 }
264}
265
266fn find_embedded_code(chars: &[char]) -> Option<(usize, Block, usize)> {
269 let mut i = 0;
270 while i < chars.len() {
271 if chars.get(i) == Some(&'<')
272 && (named_tag_at(chars, i, "code") || named_tag_at(chars, i, "file"))
273 && let Some((block, end)) = parse_raw_region(chars, i)
274 {
275 return Some((i, block, end));
276 }
277 i += 1;
278 }
279 None
280}
281
282fn named_tag_at(chars: &[char], start: usize, name: &str) -> bool {
284 if chars.get(start) != Some(&'<') {
285 return false;
286 }
287 let after = start + 1 + name.chars().count();
288 if !matches_at(chars, start + 1, name) {
289 return false;
290 }
291 matches!(chars.get(after), Some('>')) || chars.get(after).is_some_and(|c| c.is_whitespace())
292}
293
294fn header_split(line: &str) -> Option<(i32, String, String)> {
300 if line.starts_with(' ') || line.starts_with('\t') {
301 return None;
302 }
303 let chars: Vec<char> = line.chars().collect();
304 let open = chars.iter().take_while(|&&c| c == '=').count();
305 if !(2..=6).contains(&open) {
306 return None;
307 }
308 let mut at = open;
309 while at < chars.len() {
310 if chars.get(at) == Some(&'=') {
311 let run = run_length(&chars, at, '=');
312 if run >= 2 {
313 let title: String = chars.get(open..at).unwrap_or(&[]).iter().collect();
314 let trailing: String = chars.get(at + run..).unwrap_or(&[]).iter().collect();
315 let level = i32::try_from(7 - open).unwrap_or(1);
316 return Some((level, title.trim().to_string(), trailing));
317 }
318 at += run;
319 } else {
320 at += 1;
321 }
322 }
323 None
324}
325
326fn is_block_tag(line: &str) -> bool {
328 starts_named_tag(line, "code")
329 || starts_named_tag(line, "file")
330 || line.starts_with("<HTML>")
331 || line.starts_with("<PHP>")
332}
333
/// Reports whether `line` starts with `<name` followed directly by `>` or a
/// whitespace character.
fn starts_named_tag(line: &str, name: &str) -> bool {
    line.strip_prefix('<')
        .and_then(|tail| tail.strip_prefix(name))
        .and_then(|tail| tail.chars().next())
        .is_some_and(|next| next == '>' || next.is_whitespace())
}
341
342fn is_table_line(line: &str) -> bool {
345 (line.starts_with('|') || line.starts_with('^')) && !split_row(line).is_empty()
346}
347
348fn is_indented_code(line: &str) -> bool {
350 leading_columns(line) >= 2 && !line.trim().is_empty()
351}
352
/// Reports whether `line` consists of four or more `-` characters and
/// nothing else.
fn is_thematic_break(line: &str) -> bool {
    // `-` is ASCII, so checking bytes is equivalent to checking characters.
    line.len() >= 4 && line.bytes().all(|b| b == b'-')
}
358
359fn list_marker(line: &str) -> Option<(usize, bool)> {
362 let indent = leading_spaces(line);
363 if indent < 2 {
364 return None;
365 }
366 let chars: Vec<char> = line.chars().collect();
367 let marker = chars.get(indent)?;
368 let ordered = match marker {
369 '*' => false,
370 '-' => true,
371 _ => return None,
372 };
373 if chars.get(indent + 1) == Some(&' ') {
374 Some((indent, ordered))
375 } else {
376 None
377 }
378}
379
/// Maps a list item's indent (in spaces) to its nesting level: one level per
/// two spaces, rounding down.
fn list_level(indent: usize) -> usize {
    // Halving via shift; identical to integer division by two.
    indent >> 1
}
385
386fn opens_list(line: &str) -> bool {
389 list_marker(line).is_some_and(|(indent, _)| list_level(indent) == 1)
390}
391
/// Returns the number of leading `>` characters when `line` is a quote line.
///
/// A line qualifies only if it starts with `>` and something remains after
/// the markers and one optional following space.
fn quote_depth(line: &str) -> Option<usize> {
    let body = line.trim_start_matches('>');
    // `>` is one byte wide, so the byte difference is the marker count.
    let depth = line.len() - body.len();
    if depth == 0 {
        return None;
    }
    let body = body.strip_prefix(' ').unwrap_or(body);
    (!body.is_empty()).then_some(depth)
}
403
/// Which kind of block a tag-delimited raw region produces.
enum RawKind {
    /// `<code>` / `<file>`: emitted as a code block.
    Code,
    /// `<HTML>`: emitted as a raw `html` block.
    Html,
    /// `<PHP>`: emitted as a raw `html` block wrapped in `<?php … ?>`.
    Php,
}
413
414fn parse_code_or_raw(lines: &[&str], index: &mut usize) -> Option<Block> {
418 let line = lines.get(*index).copied().unwrap_or("");
419 if !is_block_tag(line) {
420 return None;
421 }
422 let joined: String = lines.get(*index..).unwrap_or(&[]).join("\n");
423 let chars: Vec<char> = joined.chars().collect();
424 let (block, end) = parse_raw_region(&chars, 0)?;
425 let consumed = chars
426 .get(..end)
427 .unwrap_or(&[])
428 .iter()
429 .filter(|&&c| c == '\n')
430 .count();
431 *index += consumed + 1;
432 Some(block)
433}
434
435fn parse_raw_region(chars: &[char], start: usize) -> Option<(Block, usize)> {
439 let (kind, close) = if named_tag_at(chars, start, "code") {
440 (RawKind::Code, "</code>")
441 } else if named_tag_at(chars, start, "file") {
442 (RawKind::Code, "</file>")
443 } else if matches_at(chars, start, "<HTML>") {
444 (RawKind::Html, "</HTML>")
445 } else if matches_at(chars, start, "<PHP>") {
446 (RawKind::Php, "</PHP>")
447 } else {
448 return None;
449 };
450 let open_end = (start..chars.len()).find(|&i| chars.get(i) == Some(&'>'))?;
451 let attr_text: String = chars
452 .get(start + 1..open_end)
453 .unwrap_or(&[])
454 .iter()
455 .collect();
456 let content_start = open_end + 1;
457 let close_at = find_subsequence(chars, content_start, close)?;
458 let mut content: String = chars
459 .get(content_start..close_at)
460 .unwrap_or(&[])
461 .iter()
462 .collect();
463 if let Some(stripped) = content.strip_prefix('\n') {
464 content = stripped.to_string();
465 }
466 let end = close_at + close.chars().count();
467 let block = match kind {
468 RawKind::Code => {
469 let class = code_language(&attr_text);
470 let attr = Attr {
471 classes: class.into_iter().map(Into::into).collect(),
472 ..Default::default()
473 };
474 Block::CodeBlock(Box::new(attr), content.into())
475 }
476 RawKind::Html => Block::RawBlock(Format("html".into()), content.into()),
477 RawKind::Php => {
478 Block::RawBlock(Format("html".into()), format!("<?php {content} ?>").into())
479 }
480 };
481 Some((block, end))
482}
483
/// Extracts the language word from the text inside an opening code tag.
///
/// A leading `code` or `file` word is skipped; the next word is the
/// language. The placeholder `-` and a missing word both yield `None`.
fn code_language(attr_text: &str) -> Option<String> {
    let mut words = attr_text.split_whitespace();
    let candidate = match words.next()? {
        "code" | "file" => words.next()?,
        other => other,
    };
    (candidate != "-").then(|| candidate.to_string())
}
500
/// Returns the first index at or after `from` where `needle` occurs in
/// `chars`, or `None` if there is no such index.
fn find_subsequence(chars: &[char], from: usize, needle: &str) -> Option<usize> {
    let pattern: Vec<char> = needle.chars().collect();
    // Last index at which a full-length match could still begin.
    let last_start = chars.len().saturating_sub(pattern.len());
    let mut at = from;
    while at <= last_start {
        if chars.get(at..at + pattern.len()) == Some(pattern.as_slice()) {
            return Some(at);
        }
        at += 1;
    }
    None
}
506
507fn parse_indented_code(lines: &[&str], index: &mut usize) -> Block {
510 let mut out = String::new();
511 while *index < lines.len() {
512 let line = lines.get(*index).copied().unwrap_or("");
513 if !is_indented_code(line) {
514 break;
515 }
516 let expanded = expand_tabs(line);
517 let body = expanded.get(2..).unwrap_or("");
518 out.push_str(body);
519 out.push('\n');
520 *index += 1;
521 }
522 Block::CodeBlock(Box::default(), out.into())
523}
524
525fn parse_list(lines: &[&str], index: &mut usize, ctx: Ctx, depth: usize) -> Block {
527 let start = *index;
528 let mut items = Vec::new();
529 while *index < lines.len() {
530 let line = lines.get(*index).copied().unwrap_or("");
531 let Some((indent, ordered)) = list_marker(line) else {
532 break;
533 };
534 let text: String = line.chars().skip(indent + 2).collect();
535 items.push((list_level(indent), ordered, text));
536 *index += 1;
537 }
538 let cutoff = list_cutoff(&items);
542 let consumed = items.get(..cutoff).unwrap_or(&[]);
543 let mut pos = 0;
544 let list = build_list(consumed, &mut pos, ctx, depth);
545 *index = start + pos;
548 list
549}
550
/// Returns the index of the first item whose level exceeds the previous
/// item's level by more than one, or `items.len()` when there is none.
fn list_cutoff(items: &[(usize, bool, String)]) -> usize {
    items
        .windows(2)
        .position(|pair| match pair {
            [earlier, later] => later.0 > earlier.0 + 1,
            _ => false,
        })
        // `position` indexes the earlier item of the pair; the cut falls
        // on the later one.
        .map_or(items.len(), |at| at + 1)
}
565
566fn build_list(items: &[(usize, bool, String)], pos: &mut usize, ctx: Ctx, depth: usize) -> Block {
569 let (base_level, ordered) = items
570 .get(*pos)
571 .map_or((0, false), |(level, ordered, _)| (*level, *ordered));
572 let mut entries: Vec<Vec<Block>> = Vec::new();
573 while let Some((level, item_ordered, text)) = items.get(*pos) {
574 if *level < base_level {
575 break;
576 }
577 if *level == base_level {
578 if *item_ordered != ordered {
579 break;
580 }
581 let mut blocks = vec![Block::Plain(inline_content(text, ctx, depth))];
582 *pos += 1;
583 if depth < MAX_DEPTH && items.get(*pos).is_some_and(|(l, _, _)| *l > base_level) {
584 blocks.push(build_list(items, pos, ctx, depth + 1));
585 }
586 entries.push(blocks);
587 } else if depth < MAX_DEPTH {
588 let child = build_list(items, pos, ctx, depth + 1);
589 match entries.last_mut() {
590 Some(last) => last.push(child),
591 None => entries.push(vec![child]),
592 }
593 } else {
594 *pos += 1;
595 }
596 }
597 if ordered {
598 Block::OrderedList(
599 ListAttributes {
600 start: 1,
601 style: ListNumberStyle::DefaultStyle,
602 delim: ListNumberDelim::DefaultDelim,
603 },
604 entries,
605 )
606 } else {
607 Block::BulletList(entries)
608 }
609}
610
611fn parse_quote(lines: &[&str], index: &mut usize, ctx: Ctx, depth: usize) -> Block {
613 let mut items = Vec::new();
614 while *index < lines.len() {
615 let line = lines.get(*index).copied().unwrap_or("");
616 let Some(level) = quote_depth(line) else {
617 break;
618 };
619 let rest = line.get(level..).unwrap_or("");
620 let rest = rest.strip_prefix(' ').unwrap_or(rest);
621 items.push((level, rest.to_string()));
622 *index += 1;
623 }
624 let mut pos = 0;
625 Block::BlockQuote(build_quote(&items, &mut pos, 1, ctx, depth))
626}
627
628fn build_quote(
630 items: &[(usize, String)],
631 pos: &mut usize,
632 level: usize,
633 ctx: Ctx,
634 depth: usize,
635) -> Vec<Block> {
636 let mut blocks = Vec::new();
637 while let Some((line_level, _)) = items.get(*pos) {
638 if *line_level < level {
639 break;
640 }
641 if *line_level == level {
642 let mut inlines = Vec::new();
643 while let Some((line_level, text)) = items.get(*pos) {
644 if *line_level != level {
645 break;
646 }
647 if !inlines.is_empty() {
648 inlines.push(Inline::LineBreak);
649 }
650 inlines.extend(inline_content(text, ctx, depth));
651 *pos += 1;
652 }
653 blocks.push(Block::Plain(inlines));
654 } else if depth < MAX_DEPTH {
655 blocks.push(Block::BlockQuote(build_quote(
656 items,
657 pos,
658 level + 1,
659 ctx,
660 depth + 1,
661 )));
662 } else {
663 *pos += 1;
664 }
665 }
666 blocks
667}
668
669fn assign_heading_ids(
677 blocks: &mut [Block],
678 scheme: IdScheme,
679 ascii: bool,
680 registry: &mut IdRegistry,
681) {
682 for block in blocks {
683 match block {
684 Block::Header(_, attr, inlines) => {
685 let text = to_plain_text(inlines);
686 let text = if ascii { asciify(&text) } else { text };
687 attr.id = registry.assign(scheme, &text).into();
688 }
689 Block::BlockQuote(children)
690 | Block::Div(_, children)
691 | Block::Figure(_, _, children) => {
692 assign_heading_ids(children, scheme, ascii, registry);
693 }
694 Block::BulletList(items) | Block::OrderedList(_, items) => {
695 for item in items {
696 assign_heading_ids(item, scheme, ascii, registry);
697 }
698 }
699 _ => {}
700 }
701 }
702}
703
704fn asciify(text: &str) -> String {
707 text.nfd().filter(char::is_ascii).collect()
708}
709
710fn strip_wide_line_breaks(blocks: &mut [Block]) {
717 for block in blocks {
718 match block {
719 Block::Para(inlines) | Block::Plain(inlines) | Block::Header(_, _, inlines) => {
720 strip_wide_in_inlines(inlines);
721 }
722 Block::BlockQuote(children)
723 | Block::Div(_, children)
724 | Block::Figure(_, _, children) => {
725 strip_wide_line_breaks(children);
726 }
727 Block::BulletList(items) | Block::OrderedList(_, items) => {
728 for item in items {
729 strip_wide_line_breaks(item);
730 }
731 }
732 _ => {}
733 }
734 }
735}
736
737fn strip_wide_in_inlines(inlines: &mut Vec<Inline>) {
739 for inline in inlines.iter_mut() {
740 match inline {
741 Inline::Emph(children)
742 | Inline::Underline(children)
743 | Inline::Strong(children)
744 | Inline::Strikeout(children)
745 | Inline::Superscript(children)
746 | Inline::Subscript(children)
747 | Inline::SmallCaps(children)
748 | Inline::Quoted(_, children)
749 | Inline::Cite(_, children)
750 | Inline::Link(_, children, _)
751 | Inline::Image(_, children, _)
752 | Inline::Span(_, children) => strip_wide_in_inlines(children),
753 Inline::Note(blocks) => strip_wide_line_breaks(blocks),
754 _ => {}
755 }
756 }
757 let mut i = 0;
758 while i < inlines.len() {
759 if matches!(inlines.get(i), Some(Inline::SoftBreak)) {
760 let prev_wide = i
761 .checked_sub(1)
762 .and_then(|p| inlines.get(p))
763 .and_then(last_char)
764 .is_some_and(is_east_asian_wide);
765 let next_wide = inlines
766 .get(i + 1)
767 .and_then(first_char)
768 .is_some_and(is_east_asian_wide);
769 if prev_wide && next_wide {
770 inlines.remove(i);
771 continue;
772 }
773 }
774 i += 1;
775 }
776}
777
778fn last_char(inline: &Inline) -> Option<char> {
780 match inline {
781 Inline::Str(s) | Inline::Code(_, s) | Inline::Math(_, s) | Inline::RawInline(_, s) => {
782 s.chars().last()
783 }
784 Inline::Emph(children)
785 | Inline::Underline(children)
786 | Inline::Strong(children)
787 | Inline::Strikeout(children)
788 | Inline::Superscript(children)
789 | Inline::Subscript(children)
790 | Inline::SmallCaps(children)
791 | Inline::Quoted(_, children)
792 | Inline::Cite(_, children)
793 | Inline::Link(_, children, _)
794 | Inline::Image(_, children, _)
795 | Inline::Span(_, children) => children.iter().rev().find_map(last_char),
796 _ => None,
797 }
798}
799
800fn first_char(inline: &Inline) -> Option<char> {
802 match inline {
803 Inline::Str(s) | Inline::Code(_, s) | Inline::Math(_, s) | Inline::RawInline(_, s) => {
804 s.chars().next()
805 }
806 Inline::Emph(children)
807 | Inline::Underline(children)
808 | Inline::Strong(children)
809 | Inline::Strikeout(children)
810 | Inline::Superscript(children)
811 | Inline::Subscript(children)
812 | Inline::SmallCaps(children)
813 | Inline::Quoted(_, children)
814 | Inline::Cite(_, children)
815 | Inline::Link(_, children, _)
816 | Inline::Image(_, children, _)
817 | Inline::Span(_, children) => children.iter().find_map(first_char),
818 _ => None,
819 }
820}
821
/// Reports whether `c` falls in one of the code point ranges this reader
/// treats as East Asian wide.
fn is_east_asian_wide(c: char) -> bool {
    // Inclusive (first, last) code point pairs.
    const WIDE_RANGES: &[(u32, u32)] = &[
        (0x1100, 0x115F),
        (0x2E80, 0x2EFF),
        (0x2F00, 0x2FDF),
        (0x2FF0, 0x2FFF),
        (0x3000, 0x303E),
        (0x3041, 0x33FF),
        (0x3400, 0x4DBF),
        (0x4E00, 0x9FFF),
        (0xA000, 0xA4CF),
        (0xA960, 0xA97F),
        (0xAC00, 0xD7A3),
        (0xF900, 0xFAFF),
        (0xFE10, 0xFE19),
        (0xFE30, 0xFE6F),
        (0xFF00, 0xFF60),
        (0xFFE0, 0xFFE6),
        (0x1B000, 0x1B16F),
        (0x1F200, 0x1F2FF),
        (0x20000, 0x2FFFD),
        (0x30000, 0x3FFFD),
    ];
    let code_point = u32::from(c);
    WIDE_RANGES
        .iter()
        .any(|&(first, last)| first <= code_point && code_point <= last)
}
847
/// Computes the work budget for scanning `len` characters of inline text:
/// `8 * len + 64`, saturating, capped at 200 000.
fn inline_budget(len: usize) -> usize {
    const CEILING: usize = 200_000;
    let scaled = len.saturating_mul(8).saturating_add(64);
    if scaled > CEILING { CEILING } else { scaled }
}
859
860fn inline_content(text: &str, ctx: Ctx, depth: usize) -> Vec<Inline> {
862 let chars: Vec<char> = text.chars().collect();
863 let mut pos = 0;
864 let mut budget = inline_budget(chars.len());
865 let (mut inlines, _) = scan(
866 &chars,
867 &mut pos,
868 None,
869 ctx,
870 QuoteCtx::default(),
871 depth,
872 &mut budget,
873 );
874 trim_inline_ends(&mut inlines);
875 inlines
876}
877
878fn scan_slice(chars: &[char], ctx: Ctx, depth: usize) -> Vec<Inline> {
880 let mut pos = 0;
881 let mut budget = inline_budget(chars.len());
882 let (inlines, _) = scan(
883 chars,
884 &mut pos,
885 None,
886 ctx,
887 QuoteCtx::default(),
888 depth,
889 &mut budget,
890 );
891 inlines
892}
893
894fn flush(pending: &mut String, out: &mut Vec<Inline>) {
896 if !pending.is_empty() {
897 out.push(Inline::Str(std::mem::take(pending).into()));
898 }
899}
900
/// Core inline scanner.
///
/// Reads `chars` from `*pos` onward and produces inlines until the input
/// ends or, when `end` is set, until that closer is met.
///
/// Returns the collected inlines (passed through `coalesce`) and a flag:
/// `true` when the requested closer was found and consumed, or when no
/// closer was requested and the input ran out; `false` when a closer was
/// requested but never found. `*pos` is left just past whatever was
/// consumed.
///
/// `depth` gates the nested constructs (they are only tried below
/// `MAX_DEPTH`), and `budget` is handed to the nested handlers.
// One flat dispatch over the current character; the arm order is significant.
#[allow(clippy::too_many_lines)]
fn scan(
    chars: &[char],
    pos: &mut usize,
    end: Option<Closer>,
    ctx: Ctx,
    qctx: QuoteCtx,
    depth: usize,
    budget: &mut usize,
) -> (Vec<Inline>, bool) {
    let start = *pos;
    let mut out: Vec<Inline> = Vec::new();
    // Plain text accumulates here and is flushed before any non-text inline.
    let mut pending = String::new();
    while let Some(&c) = chars.get(*pos) {
        // The awaited closer takes priority over every other rule.
        if let Some(closer) = end
            && at_closer(chars, *pos, start, closer)
        {
            flush(&mut pending, &mut out);
            *pos += closer_width(closer);
            return (coalesce(out), true);
        }
        // Autolinks are only tried at an ASCII letter on a boundary.
        if c.is_ascii_alphabetic()
            && boundary_before(chars, *pos)
            && let Some((link, end)) = try_autolink(chars, *pos)
        {
            flush(&mut pending, &mut out);
            out.push(link);
            *pos = end;
            continue;
        }
        match c {
            ' ' | '\t' | '\n' => scan_whitespace_run(chars, pos, &mut pending, &mut out),
            '&' => {
                // Entity reference, or a literal ampersand when it does not parse.
                if let Some((decoded, next)) =
                    entities::read_reference(chars, *pos, chars.len(), true)
                {
                    pending.push_str(&decoded);
                    *pos = next;
                } else {
                    pending.push('&');
                    *pos += 1;
                }
            }
            '\\' if chars.get(*pos + 1) == Some(&'\\') => {
                scan_hard_break(chars, pos, &mut pending, &mut out);
            }
            '\\' if ctx.math && chars.get(*pos + 1) == Some(&'$') => {
                // With math enabled, `\$` is copied through as text (both
                // characters) so the `$` cannot open math.
                pending.push('\\');
                pending.push('$');
                *pos += 2;
            }
            '*' if chars.get(*pos + 1) == Some(&'*') && depth < MAX_DEPTH => {
                handle_delim(
                    chars,
                    pos,
                    '*',
                    ctx,
                    qctx,
                    depth,
                    budget,
                    &mut pending,
                    &mut out,
                    Inline::Strong,
                );
            }
            '/' if chars.get(*pos + 1) == Some(&'/') && depth < MAX_DEPTH => {
                handle_delim(
                    chars,
                    pos,
                    '/',
                    ctx,
                    qctx,
                    depth,
                    budget,
                    &mut pending,
                    &mut out,
                    Inline::Emph,
                );
            }
            '_' if chars.get(*pos + 1) == Some(&'_') && depth < MAX_DEPTH => {
                handle_delim(
                    chars,
                    pos,
                    '_',
                    ctx,
                    qctx,
                    depth,
                    budget,
                    &mut pending,
                    &mut out,
                    Inline::Underline,
                );
            }
            // `''` must be tried before the single-quote arm below.
            '\'' if chars.get(*pos + 1) == Some(&'\'') => {
                handle_mono_or_quote(chars, pos, ctx, qctx, depth, budget, &mut pending, &mut out);
            }
            '\'' | '"' if ctx.smart => {
                handle_quote(
                    chars,
                    pos,
                    c,
                    ctx,
                    qctx,
                    depth,
                    budget,
                    &mut pending,
                    &mut out,
                );
            }
            '$' if ctx.math => {
                handle_math(chars, pos, &mut pending, &mut out);
            }
            '-' if ctx.smart => {
                // A whole run of dashes is folded at once.
                let run = run_length(chars, *pos, '-');
                pending.push_str(&fold_dashes(run));
                *pos += run;
            }
            '.' if ctx.smart => {
                // A whole run of dots is folded at once.
                let run = run_length(chars, *pos, '.');
                pending.push_str(&fold_ellipsis(run));
                *pos += run;
            }
            '[' if chars.get(*pos + 1) == Some(&'[') && depth < MAX_DEPTH => {
                handle_construct(chars, pos, c, ctx, depth, budget, &mut pending, &mut out);
            }
            '{' if chars.get(*pos + 1) == Some(&'{') && depth < MAX_DEPTH => {
                handle_construct(chars, pos, c, ctx, depth, budget, &mut pending, &mut out);
            }
            '(' if chars.get(*pos + 1) == Some(&'(') && depth < MAX_DEPTH => {
                handle_construct(chars, pos, c, ctx, depth, budget, &mut pending, &mut out);
            }
            '%' if chars.get(*pos + 1) == Some(&'%') => {
                handle_construct(chars, pos, c, ctx, depth, budget, &mut pending, &mut out);
            }
            '<' if depth < MAX_DEPTH => {
                handle_construct(chars, pos, c, ctx, depth, budget, &mut pending, &mut out);
            }
            '~' if chars.get(*pos + 1) == Some(&'~') => {
                handle_construct(chars, pos, c, ctx, depth, budget, &mut pending, &mut out);
            }
            other => {
                pending.push(other);
                *pos += 1;
            }
        }
    }
    flush(&mut pending, &mut out);
    // Running out of input counts as success only when no closer was awaited.
    (coalesce(out), end.is_none())
}
1053
1054fn at_closer(chars: &[char], pos: usize, start: usize, closer: Closer) -> bool {
1057 match closer {
1058 Closer::Quote(quote) => {
1059 chars.get(pos) == Some("e) && can_close_quote(chars, pos, quote)
1060 }
1061 Closer::Delim(delim) => {
1062 chars.get(pos) == Some(&delim)
1063 && chars.get(pos + 1) == Some(&delim)
1064 && pos > start
1065 && chars.get(pos - 1).is_some_and(|c| !c.is_whitespace())
1066 }
1067 Closer::Mono => {
1068 chars.get(pos) == Some(&'\'')
1069 && chars.get(pos + 1) == Some(&'\'')
1070 && pos > start
1071 && chars.get(pos - 1).is_some_and(|c| !c.is_whitespace())
1072 }
1073 }
1074}
1075
1076fn closer_width(closer: Closer) -> usize {
1078 match closer {
1079 Closer::Quote(_) => 1,
1080 Closer::Delim(_) | Closer::Mono => 2,
1081 }
1082}
1083
1084#[allow(clippy::too_many_arguments)]
1088fn handle_mono_or_quote(
1089 chars: &[char],
1090 pos: &mut usize,
1091 ctx: Ctx,
1092 qctx: QuoteCtx,
1093 depth: usize,
1094 budget: &mut usize,
1095 pending: &mut String,
1096 out: &mut Vec<Inline>,
1097) {
1098 if depth < MAX_DEPTH
1099 && let Some((node, end)) = parse_mono(chars, *pos, ctx, depth, budget)
1100 {
1101 flush(pending, out);
1102 out.push(node);
1103 *pos = end;
1104 } else if ctx.smart {
1105 handle_quote(chars, pos, '\'', ctx, qctx, depth, budget, pending, out);
1106 } else {
1107 pending.push('\'');
1108 *pos += 1;
1109 }
1110}
1111
1112fn scan_whitespace_run(
1115 chars: &[char],
1116 pos: &mut usize,
1117 pending: &mut String,
1118 out: &mut Vec<Inline>,
1119) {
1120 flush(pending, out);
1121 let mut has_newline = false;
1122 while let Some(&w) = chars.get(*pos) {
1123 match w {
1124 '\n' => {
1125 has_newline = true;
1126 *pos += 1;
1127 }
1128 ' ' | '\t' => *pos += 1,
1129 _ => break,
1130 }
1131 }
1132 out.push(if has_newline {
1133 Inline::SoftBreak
1134 } else {
1135 Inline::Space
1136 });
1137}
1138
1139fn scan_hard_break(chars: &[char], pos: &mut usize, pending: &mut String, out: &mut Vec<Inline>) {
1142 let after = chars.get(*pos + 2);
1143 if after.is_none_or(|c| c.is_whitespace()) {
1144 flush(pending, out);
1145 out.push(Inline::LineBreak);
1146 *pos += 2;
1147 if after.is_some() {
1148 *pos += 1;
1149 }
1150 } else {
1151 pending.push('\\');
1152 pending.push('\\');
1153 *pos += 2;
1154 }
1155}
1156
1157fn scan_construct(
1161 chars: &[char],
1162 pos: usize,
1163 c: char,
1164 ctx: Ctx,
1165 depth: usize,
1166) -> Option<(Vec<Inline>, usize)> {
1167 match c {
1168 '[' => parse_link(chars, pos).map(|(node, end)| (vec![node], end)),
1169 '{' => parse_media(chars, pos).map(|(node, end)| (vec![node], end)),
1170 '(' => parse_footnote(chars, pos, ctx, depth).map(|(node, end)| (vec![node], end)),
1171 '%' => parse_nowiki_pct(chars, pos),
1172 '<' => parse_angle(chars, pos, ctx, depth),
1173 '~' => parse_macro(chars, pos).map(|end| (Vec::new(), end)),
1174 _ => None,
1175 }
1176}
1177
1178#[allow(clippy::too_many_arguments)]
1182fn handle_construct(
1183 chars: &[char],
1184 pos: &mut usize,
1185 c: char,
1186 ctx: Ctx,
1187 depth: usize,
1188 budget: &mut usize,
1189 pending: &mut String,
1190 out: &mut Vec<Inline>,
1191) {
1192 if *budget > 0
1198 && let Some((mut nodes, end)) = scan_construct(chars, *pos, c, ctx, depth)
1199 {
1200 *budget = budget.saturating_sub((end - *pos).max(1));
1201 flush(pending, out);
1202 out.append(&mut nodes);
1203 *pos = end;
1204 } else {
1205 pending.push(c);
1206 *pos += 1;
1207 }
1208}
1209
1210#[allow(clippy::too_many_arguments)]
1215fn handle_delim(
1216 chars: &[char],
1217 pos: &mut usize,
1218 delim: char,
1219 ctx: Ctx,
1220 qctx: QuoteCtx,
1221 depth: usize,
1222 budget: &mut usize,
1223 pending: &mut String,
1224 out: &mut Vec<Inline>,
1225 wrap: fn(Vec<Inline>) -> Inline,
1226) {
1227 let begin = *pos;
1228 if !is_ws_opt(chars.get(begin + 2).copied()) && *budget > 0 {
1231 *budget -= 1;
1232 let mut scan_pos = begin + 2;
1233 let (inner, closed) = scan(
1234 chars,
1235 &mut scan_pos,
1236 Some(Closer::Delim(delim)),
1237 ctx,
1238 qctx,
1239 depth + 1,
1240 budget,
1241 );
1242 if closed {
1243 flush(pending, out);
1244 out.push(wrap(inner));
1245 *pos = scan_pos;
1246 return;
1247 }
1248 *budget = budget.saturating_sub(scan_pos - begin);
1253 }
1254 pending.push(delim);
1255 pending.push(delim);
1256 *pos = begin + 2;
1257}
1258
1259#[allow(clippy::too_many_arguments)]
1263fn handle_quote(
1264 chars: &[char],
1265 pos: &mut usize,
1266 quote: char,
1267 ctx: Ctx,
1268 qctx: QuoteCtx,
1269 depth: usize,
1270 budget: &mut usize,
1271 pending: &mut String,
1272 out: &mut Vec<Inline>,
1273) {
1274 let begin = *pos;
1275 if can_open_quote(chars, begin, quote, qctx) && depth < MAX_DEPTH && *budget > 0 {
1276 *budget -= 1;
1277 *pos = begin + 1;
1278 let mut inner_qctx = qctx;
1279 if quote == '\'' {
1280 inner_qctx.in_single = true;
1281 } else {
1282 inner_qctx.in_double = true;
1283 }
1284 let (inner, closed) = scan(
1285 chars,
1286 pos,
1287 Some(Closer::Quote(quote)),
1288 ctx,
1289 inner_qctx,
1290 depth + 1,
1291 budget,
1292 );
1293 if closed && (quote == '"' || !inner.is_empty()) {
1294 flush(pending, out);
1295 out.push(Inline::Quoted(quote_type(quote), inner));
1296 return;
1297 }
1298 *budget = budget.saturating_sub(pos.saturating_sub(begin));
1301 *pos = begin + 1;
1302 } else {
1303 *pos = begin + 1;
1304 }
1305 pending.push(quote_glyph(chars, begin, quote));
1306}
1307
1308fn quote_type(quote: char) -> QuoteType {
1310 if quote == '\'' {
1311 QuoteType::SingleQuote
1312 } else {
1313 QuoteType::DoubleQuote
1314 }
1315}
1316
1317fn quote_glyph(chars: &[char], pos: usize, quote: char) -> char {
1320 if quote == '\'' {
1321 '\u{2019}'
1322 } else if left_flanking(chars, pos) {
1323 '\u{201c}'
1324 } else {
1325 '\u{201d}'
1326 }
1327}
1328
1329fn parse_mono(
1337 chars: &[char],
1338 begin: usize,
1339 ctx: Ctx,
1340 depth: usize,
1341 budget: &mut usize,
1342) -> Option<(Inline, usize)> {
1343 if is_ws_opt(chars.get(begin + 2).copied()) {
1344 return None;
1345 }
1346 if ctx.smart {
1347 if *budget == 0 {
1348 return None;
1349 }
1350 *budget -= 1;
1351 let mut pos = begin + 2;
1352 let (inner, closed) = scan(
1353 chars,
1354 &mut pos,
1355 Some(Closer::Mono),
1356 ctx,
1357 QuoteCtx::default(),
1358 depth + 1,
1359 budget,
1360 );
1361 if !closed {
1362 return None;
1363 }
1364 return Some((
1365 Inline::Code(Box::default(), flatten_mono(&inner).into()),
1366 pos,
1367 ));
1368 }
1369 let close = find_subsequence(chars, begin + 2, "''")?;
1370 if close <= begin + 2 || is_ws_opt(chars.get(close - 1).copied()) {
1371 return None;
1372 }
1373 let content = chars.get(begin + 2..close).unwrap_or(&[]);
1374 let inner = scan_slice(content, ctx, depth + 1);
1375 Some((
1376 Inline::Code(Box::default(), to_plain_text(&inner).into()),
1377 close + 2,
1378 ))
1379}
1380
1381fn flatten_mono(inlines: &[Inline]) -> String {
1384 let mut out = String::new();
1385 push_mono_text(inlines, &mut out);
1386 out
1387}
1388
1389fn push_mono_text(inlines: &[Inline], out: &mut String) {
1390 for inline in inlines {
1391 match inline {
1392 Inline::Str(text) | Inline::Code(_, text) | Inline::Math(_, text) => out.push_str(text),
1393 Inline::Space | Inline::SoftBreak | Inline::LineBreak => out.push(' '),
1394 Inline::Quoted(QuoteType::SingleQuote, xs) => {
1395 out.push('\u{2018}');
1396 push_mono_text(xs, out);
1397 out.push('\u{2019}');
1398 }
1399 Inline::Quoted(QuoteType::DoubleQuote, xs) => {
1400 out.push('\u{201c}');
1401 push_mono_text(xs, out);
1402 out.push('\u{201d}');
1403 }
1404 Inline::Emph(xs)
1405 | Inline::Underline(xs)
1406 | Inline::Strong(xs)
1407 | Inline::Strikeout(xs)
1408 | Inline::Superscript(xs)
1409 | Inline::Subscript(xs)
1410 | Inline::SmallCaps(xs)
1411 | Inline::Cite(_, xs)
1412 | Inline::Link(_, xs, _)
1413 | Inline::Image(_, xs, _)
1414 | Inline::Span(_, xs) => push_mono_text(xs, out),
1415 Inline::RawInline(..) | Inline::Note(_) => {}
1416 }
1417 }
1418}
1419
1420fn handle_math(chars: &[char], pos: &mut usize, pending: &mut String, out: &mut Vec<Inline>) {
1424 let begin = *pos;
1425 let parsed = if chars.get(begin + 1) == Some(&'$') {
1426 parse_display_math(chars, begin)
1427 } else {
1428 parse_inline_math(chars, begin)
1429 };
1430 if let Some((node, end)) = parsed {
1431 flush(pending, out);
1432 out.push(node);
1433 *pos = end;
1434 } else {
1435 pending.push('$');
1436 *pos = begin + 1;
1437 }
1438}
1439
1440fn parse_display_math(chars: &[char], begin: usize) -> Option<(Inline, usize)> {
1443 let close = find_subsequence(chars, begin + 2, "$$")?;
1444 if close <= begin + 2 {
1445 return None;
1446 }
1447 let content: String = chars.get(begin + 2..close).unwrap_or(&[]).iter().collect();
1448 Some((
1449 Inline::Math(MathType::DisplayMath, content.into()),
1450 close + 2,
1451 ))
1452}
1453
1454fn parse_inline_math(chars: &[char], begin: usize) -> Option<(Inline, usize)> {
1457 if is_ws_opt(chars.get(begin + 1).copied()) {
1458 return None;
1459 }
1460 let mut j = begin + 1;
1461 while j < chars.len() {
1462 if chars.get(j) == Some(&'$')
1463 && j > begin + 1
1464 && chars.get(j - 1).is_some_and(|c| !c.is_whitespace())
1465 && !chars.get(j + 1).is_some_and(char::is_ascii_digit)
1466 {
1467 let content: String = chars.get(begin + 1..j).unwrap_or(&[]).iter().collect();
1468 return Some((Inline::Math(MathType::InlineMath, content.into()), j + 1));
1469 }
1470 j += 1;
1471 }
1472 None
1473}
1474
/// Counts how many consecutive occurrences of `ch` start at `pos`.
///
/// Returns 0 when `pos` is at or past the end of `chars`.
fn run_length(chars: &[char], pos: usize, ch: char) -> usize {
    chars
        .get(pos..)
        .map_or(0, |tail| tail.iter().take_while(|&&c| c == ch).count())
}
1483
/// Renders a count of `n` as one em dash (U+2014) per full group of three,
/// followed by an en dash (U+2013) for a remainder of two or a plain `-` for a
/// remainder of one.
fn fold_dashes(n: usize) -> String {
    let mut folded = "\u{2014}".repeat(n / 3);
    let tail = match n % 3 {
        2 => Some('\u{2013}'),
        1 => Some('-'),
        _ => None,
    };
    folded.extend(tail);
    folded
}
1495
/// Renders a count of `n` as one ellipsis (U+2026) per full group of three,
/// followed by the leftover one or two literal `.` characters.
fn fold_ellipsis(n: usize) -> String {
    let (groups, loose) = (n / 3, n % 3);
    ["\u{2026}".repeat(groups), ".".repeat(loose)].concat()
}
1502
/// Returns the character immediately before `pos`, or `None` when `pos` is 0
/// or `pos - 1` is out of range.
fn before_char(chars: &[char], pos: usize) -> Option<char> {
    match pos {
        0 => None,
        _ => chars.get(pos - 1).copied(),
    }
}
1509
/// True when `opt` is absent or holds a whitespace character.
fn is_ws_opt(opt: Option<char>) -> bool {
    match opt {
        None => true,
        Some(c) => c.is_whitespace(),
    }
}
1515
/// True when every character in `chars` is whitespace (vacuously true for an
/// empty slice).
fn is_blank(chars: &[char]) -> bool {
    chars.iter().copied().all(char::is_whitespace)
}
1520
1521fn is_punct_opt(opt: Option<char>) -> bool {
1523 opt.is_some_and(is_punct)
1524}
1525
/// True for ASCII punctuation and for any other character that is neither
/// alphanumeric nor whitespace.
fn is_punct(c: char) -> bool {
    if c.is_ascii_punctuation() {
        return true;
    }
    !(c.is_alphanumeric() || c.is_whitespace())
}
1531
1532fn left_flanking(chars: &[char], pos: usize) -> bool {
1534 let before = before_char(chars, pos);
1535 let after = chars.get(pos + 1).copied();
1536 !is_ws_opt(after) && (!is_punct_opt(after) || is_ws_opt(before) || is_punct_opt(before))
1537}
1538
1539fn right_flanking(chars: &[char], pos: usize) -> bool {
1541 let before = before_char(chars, pos);
1542 let after = chars.get(pos + 1).copied();
1543 !is_ws_opt(before) && (!is_punct_opt(before) || is_ws_opt(after) || is_punct_opt(after))
1544}
1545
1546fn can_open_quote(chars: &[char], pos: usize, quote: char, qctx: QuoteCtx) -> bool {
1549 if (quote == '\'' && qctx.in_single) || (quote == '"' && qctx.in_double) {
1550 return false;
1551 }
1552 left_flanking(chars, pos)
1553}
1554
1555fn can_close_quote(chars: &[char], pos: usize, quote: char) -> bool {
1558 if !right_flanking(chars, pos) {
1559 return false;
1560 }
1561 if quote == '\'' {
1562 !chars.get(pos + 1).is_some_and(|c| c.is_alphanumeric())
1563 } else {
1564 true
1565 }
1566}
1567
1568fn boundary_before(chars: &[char], pos: usize) -> bool {
1570 before_char(chars, pos).is_none_or(|c| !c.is_alphanumeric())
1571}
1572
1573fn try_autolink(chars: &[char], pos: usize) -> Option<(Inline, usize)> {
1577 let mut k = pos;
1578 while chars
1579 .get(k)
1580 .is_some_and(|&c| c.is_ascii_alphanumeric() || matches!(c, '.' | '+' | '-'))
1581 {
1582 k += 1;
1583 }
1584 if !matches_at(chars, k, "://") {
1585 return None;
1586 }
1587 let scheme: String = chars.get(pos..k)?.iter().collect::<String>().to_lowercase();
1588 if !crate::url_schemes::is_scheme(&scheme) {
1589 return None;
1590 }
1591 let content_start = k + 3;
1592 let scan_end = forward_scan(chars, pos);
1593 let end = trim_trailing(chars, content_start, scan_end);
1594 if end <= content_start {
1595 return None;
1596 }
1597 let url: String = chars.get(pos..end)?.iter().collect();
1598 Some((
1599 Inline::Link(
1600 Box::default(),
1601 vec![Inline::Str(url.clone().into())],
1602 Box::new(Target {
1603 url: url.into(),
1604 title: carta_ast::Text::default(),
1605 }),
1606 ),
1607 end,
1608 ))
1609}
1610
/// Scans forward from `from` and returns the index where the scan stops.
///
/// The scan stops at whitespace, at `<`, or at a `)` / `]` seen while no `(`
/// is open. Parentheses opened during the scan are tracked so their matching
/// `)` is consumed. If nothing stops the scan, the result is the end of
/// `chars` (or `from` itself when it is already past the end).
fn forward_scan(chars: &[char], from: usize) -> usize {
    let mut open_parens: i32 = 0;
    for (idx, &c) in chars.iter().enumerate().skip(from) {
        let stop = match c {
            '<' => true,
            _ if c.is_whitespace() => true,
            '(' => {
                open_parens += 1;
                false
            }
            ')' | ']' if open_parens == 0 => true,
            ')' => {
                open_parens -= 1;
                false
            }
            _ => false,
        };
        if stop {
            return idx;
        }
    }
    from.max(chars.len())
}
1630
/// Shrinks `end` towards `min` while the text before it ends in trailing
/// punctuation, and returns the new end.
///
/// Characters from a fixed punctuation set are dropped one at a time. A
/// trailing `;` is dropped together with a preceding `&…` run of ASCII
/// alphanumerics/`#` when that run starts after `min`; otherwise only the `;`
/// itself goes.
fn trim_trailing(chars: &[char], min: usize, mut end: usize) -> usize {
    // Given the index of a `;`, finds the `&` that starts an entity-like run
    // ending at it, provided the run lies strictly after `min`.
    let entity_start = |semicolon: usize| -> Option<usize> {
        let mut j = semicolon;
        while j > min
            && chars
                .get(j - 1)
                .is_some_and(|&c| c.is_ascii_alphanumeric() || c == '#')
        {
            j -= 1;
        }
        (j > min && chars.get(j - 1) == Some(&'&')).then(|| j - 1)
    };

    loop {
        if end <= min {
            return end;
        }
        let last = end - 1;
        match chars.get(last) {
            Some('!' | '"' | '\'' | '*' | ',' | '.' | ':' | '?' | '_' | '~') => end = last,
            Some(';') => end = entity_start(last).unwrap_or(last),
            _ => return end,
        }
    }
}
1657
1658fn coalesce(inlines: Vec<Inline>) -> Vec<Inline> {
1663 let mut out: Vec<Inline> = Vec::with_capacity(inlines.len());
1664 for inline in inlines {
1665 match inline {
1666 Inline::Str(s) => {
1667 if let Some(Inline::Str(prev)) = out.last_mut() {
1668 prev.push_str(&s);
1669 } else if !s.is_empty() {
1670 out.push(Inline::Str(s));
1671 }
1672 }
1673 Inline::Space | Inline::SoftBreak => match out.last() {
1674 Some(Inline::Space) => {}
1675 Some(Inline::SoftBreak) => {
1676 if matches!(inline, Inline::Space)
1677 && let Some(slot) = out.last_mut()
1678 {
1679 *slot = Inline::Space;
1680 }
1681 }
1682 _ => out.push(inline),
1683 },
1684 other => out.push(other),
1685 }
1686 }
1687 out
1688}
1689
1690fn tokenize_text(text: &str) -> Vec<Inline> {
1692 let mut out = Vec::new();
1693 let mut word = String::new();
1694 for c in text.chars() {
1695 if c.is_whitespace() {
1696 if !word.is_empty() {
1697 out.push(Inline::Str(std::mem::take(&mut word).into()));
1698 }
1699 let token = if c == '\n' {
1700 Inline::SoftBreak
1701 } else {
1702 Inline::Space
1703 };
1704 if !matches!(out.last(), Some(Inline::Space | Inline::SoftBreak)) {
1705 out.push(token);
1706 }
1707 } else {
1708 word.push(c);
1709 }
1710 }
1711 if !word.is_empty() {
1712 out.push(Inline::Str(word.into()));
1713 }
1714 out
1715}
1716
1717fn parse_link(chars: &[char], start: usize) -> Option<(Inline, usize)> {
1723 let close = find_subsequence(chars, start + 2, "]]")?;
1724 let inner: String = chars.get(start + 2..close).unwrap_or(&[]).iter().collect();
1725 let (raw_target, label) = match inner.split_once('|') {
1726 Some((t, l)) => (t, Some(l.to_string())),
1727 None => (inner.as_str(), None),
1728 };
1729 if raw_target.is_empty() {
1730 return None;
1731 }
1732 let target = raw_target.trim().to_string();
1733 let (url, display) = classify_link_target(&target);
1734 let label_inlines = match label {
1736 Some(text) if !text.trim().is_empty() => tokenize_text(text.trim()),
1737 _ => vec![Inline::Str(display.into())],
1738 };
1739 Some((
1740 Inline::Link(
1741 Box::default(),
1742 label_inlines,
1743 Box::new(Target {
1744 url: url.into(),
1745 title: carta_ast::Text::default(),
1746 }),
1747 ),
1748 close + 2,
1749 ))
1750}
1751
1752fn classify_link_target(target: &str) -> (String, String) {
1754 if target.starts_with("\\\\") || is_external(target) {
1755 (target.to_string(), target.to_string())
1756 } else if let Some((prefix, rest)) = target.split_once('>') {
1757 (interwiki_url(prefix, rest), rest.to_string())
1758 } else {
1759 (resolve_id(target), display_id(target))
1760 }
1761}
1762
1763fn parse_media(chars: &[char], start: usize) -> Option<(Inline, usize)> {
1766 let close = find_subsequence(chars, start + 2, "}}")?;
1767 let inner: String = chars.get(start + 2..close).unwrap_or(&[]).iter().collect();
1768 let end = close + 2;
1769
1770 let leading_space = inner.starts_with(char::is_whitespace);
1771 let (spec, caption) = match inner.split_once('|') {
1772 Some((s, c)) => (s, Some(c)),
1773 None => (inner.as_str(), None),
1774 };
1775 if spec.is_empty() {
1777 return None;
1778 }
1779 let trailing_space = spec.ends_with(char::is_whitespace);
1780 let mut classes = Vec::new();
1781 if let Some(class) = media_align(leading_space, trailing_space) {
1782 classes.push(class.into());
1783 }
1784
1785 let spec = spec.trim();
1786 let (id, query) = match spec.split_once('?') {
1787 Some((i, q)) => (i, Some(q)),
1788 None => (spec, None),
1789 };
1790 let url = if is_external(id) {
1791 id.to_string()
1792 } else {
1793 resolve_id(id)
1794 };
1795 let alt = match caption {
1797 Some(text) if !text.trim().is_empty() => tokenize_text(text.trim()),
1798 _ if is_external(id) => vec![Inline::Str(id.into())],
1799 _ => vec![Inline::Str(display_id(id).into())],
1800 };
1801 let target = Target {
1802 url: url.into(),
1803 title: carta_ast::Text::default(),
1804 };
1805
1806 let node = match query {
1807 Some(q) if q.contains("linkonly") => Inline::Link(
1808 Box::new(Attr {
1809 classes,
1810 ..Default::default()
1811 }),
1812 alt,
1813 Box::new(target),
1814 ),
1815 Some(q) => {
1816 let (width, height) = parse_size(q);
1817 let mut attributes = Vec::new();
1818 if let Some(w) = width {
1819 attributes.push(("width".to_string(), w));
1820 }
1821 if let Some(h) = height {
1822 attributes.push(("height".to_string(), h));
1823 }
1824 attributes.push(("query".to_string(), format!("?{q}")));
1825 Inline::Image(
1826 Box::new(Attr {
1827 classes,
1828 attributes: attributes
1829 .into_iter()
1830 .map(|(k, v)| (k.into(), v.into()))
1831 .collect(),
1832 ..Default::default()
1833 }),
1834 alt,
1835 Box::new(target),
1836 )
1837 }
1838 None => Inline::Image(
1839 Box::new(Attr {
1840 classes,
1841 ..Default::default()
1842 }),
1843 alt,
1844 Box::new(target),
1845 ),
1846 };
1847 Some((node, end))
1848}
1849
/// Maps leading/trailing padding flags to an alignment class name:
/// both → `align-center`, trailing only → `align-left`, leading only →
/// `align-right`, neither → `None`.
fn media_align(leading: bool, trailing: bool) -> Option<&'static str> {
    if leading && trailing {
        Some("align-center")
    } else if trailing {
        Some("align-left")
    } else if leading {
        Some("align-right")
    } else {
        None
    }
}
1859
/// Reads a leading `WIDTH` or `WIDTHxHEIGHT` (ASCII digits, `x` or `X`
/// separator) from `query`.
///
/// Returns `(None, None)` when `query` does not start with a digit. The height
/// is `None` when no separator follows the width or no digits follow the
/// separator.
fn parse_size(query: &str) -> (Option<String>, Option<String>) {
    /// Splits `s` into its leading ASCII-digit run and the remainder.
    fn split_digits(s: &str) -> (&str, &str) {
        let cut = s.find(|c: char| !c.is_ascii_digit()).unwrap_or(s.len());
        s.split_at(cut)
    }

    let (width, rest) = split_digits(query);
    if width.is_empty() {
        return (None, None);
    }
    let height = rest
        .strip_prefix(|c: char| matches!(c, 'x' | 'X'))
        .map(|tail| split_digits(tail).0)
        .filter(|digits| !digits.is_empty())
        .map(str::to_owned);
    (Some(width.to_owned()), height)
}
1895
1896fn is_external(s: &str) -> bool {
1898 match s.find("://") {
1899 Some(idx) => {
1900 let scheme = s.get(..idx).unwrap_or("");
1901 !scheme.is_empty()
1902 && scheme
1903 .chars()
1904 .all(|c| c.is_ascii_alphanumeric() || matches!(c, '.' | '+' | '-'))
1905 && crate::url_schemes::is_scheme(&scheme.to_lowercase())
1906 }
1907 None => false,
1908 }
1909}
1910
/// Converts a colon-separated id into a slash-separated path.
///
/// Ids without `:` are returned unchanged. Ids starting with `.` have all
/// leading dots removed and colons replaced, with no leading slash forced.
/// All other ids get colons replaced and a leading `/` ensured.
fn resolve_id(id: &str) -> String {
    if !id.contains(':') {
        return id.to_owned();
    }
    match id.strip_prefix('.') {
        Some(rest) => rest.trim_start_matches('.').replace(':', "/"),
        None => {
            let path = id.replace(':', "/");
            if path.starts_with('/') {
                path
            } else {
                format!("/{path}")
            }
        }
    }
}
1927
/// Returns the part of `id` after its last `:`, or the whole id when it has
/// no `:`.
fn display_id(id: &str) -> String {
    id.rsplit_once(':')
        .map_or(id, |(_, last)| last)
        .to_owned()
}
1935
/// Expands an interwiki `prefix` and page `rest` into a URL.
///
/// Known prefixes map to a fixed base URL with `rest` appended; an unknown
/// prefix is returned as `prefix>rest`.
fn interwiki_url(prefix: &str, rest: &str) -> String {
    let base = match prefix {
        "wp" => "https://en.wikipedia.org/wiki/",
        "wpfr" => "https://fr.wikipedia.org/wiki/",
        "wpde" => "https://de.wikipedia.org/wiki/",
        "wpes" => "https://es.wikipedia.org/wiki/",
        "wpjp" => "https://jp.wikipedia.org/wiki/",
        "wppl" => "https://pl.wikipedia.org/wiki/",
        "doku" => "https://www.dokuwiki.org/",
        "phpfn" => "https://secure.php.net/",
        "callto" => "callto://",
        other => return format!("{other}>{rest}"),
    };
    format!("{base}{rest}")
}
1951
1952fn parse_footnote(chars: &[char], begin: usize, ctx: Ctx, depth: usize) -> Option<(Inline, usize)> {
1957 let close = find_subsequence(chars, begin + 2, "))")?;
1958 let inner: String = chars.get(begin + 2..close).unwrap_or(&[]).iter().collect();
1959 if inner.trim().is_empty() {
1960 return None;
1961 }
1962 Some((
1963 Inline::Note(parse_blocks_str(&inner, ctx, depth + 1)),
1964 close + 2,
1965 ))
1966}
1967
1968fn parse_nowiki_pct(chars: &[char], begin: usize) -> Option<(Vec<Inline>, usize)> {
1972 if chars.get(begin + 2).is_none_or(|c| c.is_whitespace()) {
1973 return None;
1974 }
1975 let mut j = begin + 2;
1976 while j < chars.len() {
1977 if chars.get(j) == Some(&'%')
1978 && chars.get(j + 1) == Some(&'%')
1979 && j > begin + 2
1980 && chars.get(j - 1).is_some_and(|c| !c.is_whitespace())
1981 {
1982 let inner: String = chars.get(begin + 2..j).unwrap_or(&[]).iter().collect();
1983 return Some((tokenize_text(&inner), j + 2));
1984 }
1985 j += 1;
1986 }
1987 None
1988}
1989
1990fn parse_angle(
1993 chars: &[char],
1994 begin: usize,
1995 ctx: Ctx,
1996 depth: usize,
1997) -> Option<(Vec<Inline>, usize)> {
1998 if let Some((inner, end)) = tag_region(chars, begin, "<sub>", "</sub>")
2000 && !is_blank(&inner)
2001 {
2002 return Some((
2003 vec![Inline::Subscript(scan_slice(&inner, ctx, depth + 1))],
2004 end,
2005 ));
2006 }
2007 if let Some((inner, end)) = tag_region(chars, begin, "<sup>", "</sup>")
2008 && !is_blank(&inner)
2009 {
2010 return Some((
2011 vec![Inline::Superscript(scan_slice(&inner, ctx, depth + 1))],
2012 end,
2013 ));
2014 }
2015 if let Some((inner, end)) = tag_region(chars, begin, "<del>", "</del>")
2016 && !is_blank(&inner)
2017 {
2018 return Some((
2019 vec![Inline::Strikeout(scan_slice(&inner, ctx, depth + 1))],
2020 end,
2021 ));
2022 }
2023 if let Some((inner, end)) = tag_region(chars, begin, "<nowiki>", "</nowiki>") {
2024 let text: String = inner.iter().collect();
2025 return Some((tokenize_text(&text), end));
2026 }
2027 if let Some((inner, end)) = tag_region(chars, begin, "<html>", "</html>") {
2028 let text: String = inner.iter().collect();
2029 return Some((
2030 vec![Inline::RawInline(Format("html".into()), text.into())],
2031 end,
2032 ));
2033 }
2034 if let Some((inner, end)) = tag_region(chars, begin, "<php>", "</php>") {
2035 let text: String = inner.iter().collect();
2036 return Some((
2037 vec![Inline::RawInline(
2038 Format("html".into()),
2039 format!("<?php {text} ?>").into(),
2040 )],
2041 end,
2042 ));
2043 }
2044 angle_email(chars, begin).map(|(node, end)| (vec![node], end))
2045}
2046
2047fn tag_region(chars: &[char], start: usize, open: &str, close: &str) -> Option<(Vec<char>, usize)> {
2049 if !matches_at(chars, start, open) {
2050 return None;
2051 }
2052 let content_start = start + open.chars().count();
2053 let close_at = find_subsequence(chars, content_start, close)?;
2054 let inner = chars.get(content_start..close_at).unwrap_or(&[]).to_vec();
2055 Some((inner, close_at + close.chars().count()))
2056}
2057
2058fn angle_email(chars: &[char], start: usize) -> Option<(Inline, usize)> {
2060 if chars.get(start) != Some(&'<') {
2061 return None;
2062 }
2063 let mut j = start + 1;
2064 while let Some(&c) = chars.get(j) {
2065 if c == '>' {
2066 break;
2067 }
2068 if c.is_whitespace() || c == '<' {
2069 return None;
2070 }
2071 j += 1;
2072 }
2073 if chars.get(j) != Some(&'>') {
2074 return None;
2075 }
2076 let inner: String = chars.get(start + 1..j).unwrap_or(&[]).iter().collect();
2077 let (local, domain) = inner.split_once('@')?;
2078 if local.is_empty() || !domain.contains('.') || domain.starts_with('.') || domain.ends_with('.')
2079 {
2080 return None;
2081 }
2082 let url = format!("mailto:{inner}");
2083 Some((
2084 Inline::Link(
2085 Box::default(),
2086 vec![Inline::Str(inner.into())],
2087 Box::new(Target {
2088 url: url.into(),
2089 title: carta_ast::Text::default(),
2090 }),
2091 ),
2092 j + 1,
2093 ))
2094}
2095
2096fn parse_macro(chars: &[char], start: usize) -> Option<usize> {
2098 for token in ["~~NOTOC~~", "~~NOCACHE~~"] {
2099 if matches_at(chars, start, token) {
2100 return Some(start + token.chars().count());
2101 }
2102 }
2103 None
2104}
2105
2106fn parse_blocks_str(text: &str, ctx: Ctx, depth: usize) -> Vec<Block> {
2108 let lines: Vec<&str> = text.split('\n').collect();
2109 let mut index = 0;
2110 parse_blocks(&lines, &mut index, ctx, depth)
2111}
2112
2113fn parse_table(lines: &[&str], index: &mut usize, ctx: Ctx, depth: usize) -> Block {
2120 let mut rows: Vec<(bool, Vec<String>)> = Vec::new();
2121 while *index < lines.len() {
2122 let line = lines.get(*index).copied().unwrap_or("");
2123 if !is_table_line(line) {
2124 break;
2125 }
2126 rows.push((line.starts_with('^'), split_row(line)));
2127 *index += 1;
2128 }
2129
2130 let first = rows.first();
2131 let col_count = first.map_or(0, |(_, cells)| cells.len());
2132 let col_specs: Vec<ColSpec> = first
2133 .map(|(_, cells)| {
2134 cells
2135 .iter()
2136 .map(|cell| ColSpec {
2137 align: cell_align(cell),
2138 width: ColWidth::ColWidthDefault,
2139 })
2140 .collect()
2141 })
2142 .unwrap_or_default();
2143
2144 let mut head_rows = Vec::new();
2145 let mut body_rows = Vec::new();
2146 for (i, (header, cells)) in rows.iter().enumerate() {
2147 let row = build_row(cells, col_count, ctx, depth);
2148 if i == 0 && *header {
2149 head_rows.push(row);
2150 } else {
2151 body_rows.push(row);
2152 }
2153 }
2154
2155 Block::Table(Box::new(Table {
2156 attr: Attr::default(),
2157 caption: Caption::default(),
2158 col_specs,
2159 head: TableHead {
2160 attr: Attr::default(),
2161 rows: head_rows,
2162 },
2163 bodies: vec![TableBody {
2164 attr: Attr::default(),
2165 row_head_columns: 0,
2166 head: Vec::new(),
2167 body: body_rows,
2168 }],
2169 foot: TableFoot::default(),
2170 }))
2171}
2172
2173fn build_row(cells: &[String], col_count: usize, ctx: Ctx, depth: usize) -> Row {
2175 let mut out = Vec::with_capacity(col_count);
2176 for i in 0..col_count {
2177 let trimmed = cells.get(i).map_or("", |c| c.trim());
2178 let content = if trimmed.is_empty() {
2179 Vec::new()
2180 } else {
2181 vec![Block::Plain(inline_content(trimmed, ctx, depth))]
2182 };
2183 out.push(Cell {
2184 attr: Attr::default(),
2185 align: Alignment::AlignDefault,
2186 row_span: 1,
2187 col_span: 1,
2188 content,
2189 });
2190 }
2191 Row {
2192 attr: Attr::default(),
2193 cells: out,
2194 }
2195}
2196
2197fn cell_align(raw: &str) -> Alignment {
2200 let leading = raw.chars().take_while(|&c| c == ' ').count();
2201 let trailing = raw.chars().rev().take_while(|&c| c == ' ').count();
2202 match (leading >= 2, trailing >= 2) {
2203 (true, true) => Alignment::AlignCenter,
2204 (_, true) => Alignment::AlignLeft,
2205 (true, _) => Alignment::AlignRight,
2206 _ => Alignment::AlignDefault,
2207 }
2208}
2209
2210fn split_row(line: &str) -> Vec<String> {
2213 let chars: Vec<char> = line.chars().collect();
2214 let mut segments: Vec<String> = Vec::new();
2215 let mut seg = String::new();
2216 let mut i = 0;
2217 while i < chars.len() {
2218 if let Some(skip) = protected_end(&chars, i) {
2219 seg.extend(chars.get(i..skip).unwrap_or(&[]));
2220 i = skip;
2221 continue;
2222 }
2223 match chars.get(i) {
2224 Some('|' | '^') => {
2225 segments.push(std::mem::take(&mut seg));
2226 i += 1;
2227 }
2228 Some(&c) => {
2229 seg.push(c);
2230 i += 1;
2231 }
2232 None => break,
2233 }
2234 }
2235 segments.push(seg);
2236 if !segments.is_empty() {
2237 segments.remove(0);
2238 }
2239 if segments.last().is_some_and(String::is_empty) {
2240 segments.pop();
2241 }
2242 segments
2243}
2244
2245fn protected_end(chars: &[char], i: usize) -> Option<usize> {
2248 for (open, close) in [("[[", "]]"), ("{{", "}}"), ("''", "''"), ("%%", "%%")] {
2249 if matches_at(chars, i, open) {
2250 let from = i + open.chars().count();
2251 let end = find_subsequence(chars, from, close)
2252 .map_or(chars.len(), |p| p + close.chars().count());
2253 return Some(end);
2254 }
2255 }
2256 if matches_at(chars, i, "<nowiki>") {
2257 let from = i + "<nowiki>".chars().count();
2258 let end = find_subsequence(chars, from, "</nowiki>")
2259 .map_or(chars.len(), |p| p + "</nowiki>".chars().count());
2260 return Some(end);
2261 }
2262 None
2263}
2264
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that the reader returns `Ok` for `input` under default options.
    fn assert_reads(input: &str) {
        let parsed = DokuwikiReader.read(input, &ReaderOptions::default());
        assert!(parsed.is_ok());
    }

    #[test]
    fn adversarial_footnotes_under_open_emphasis_do_not_stall() {
        // One outer footnote wrapping many unclosed `//` openers, each
        // followed by a small footnote.
        let body = "//((x)) ".repeat(400);
        assert_reads(&format!("(({body}))"));
    }

    #[test]
    fn adversarially_nested_footnotes_do_not_stall() {
        // Footnote openers nested far deeper than `MAX_DEPTH`.
        let depth = 2_000;
        let input = ["((".repeat(depth), "x".to_owned(), "))".repeat(depth)].concat();
        assert_reads(&input);
    }

    #[test]
    fn a_delimiter_dense_run_does_not_blow_up() {
        // Thousands of `//` openers, none of which is ever closed.
        let input = "//a ".repeat(4_000);
        assert_reads(&input);
    }
}