1use std::collections::BTreeMap;
21
22use carta_ast::{
23 Alignment, ApiVersion, Attr, Block, Caption, Cell, ColSpec, ColWidth, Document, Format, Inline,
24 ListAttributes, ListNumberDelim, ListNumberStyle, MathType, MetaValue, QuoteType, Row, Table,
25 TableBody, TableFoot, TableHead, Target, ToCompactString, slug_gfm, to_plain_text,
26};
27use carta_core::{Extension, Extensions, Reader, ReaderOptions, Result};
28
29use crate::emoji;
30use crate::entities;
31use crate::heading_ids;
32
/// Reader for MediaWiki markup; a field-less unit struct that implements `Reader`.
33#[derive(Debug, Default, Clone, Copy)]
35pub struct MediawikiReader;
36
37impl Reader for MediawikiReader {
38 fn read(&self, input: &str, options: &ReaderOptions) -> Result<Document> {
39 let stripped = strip_comments(&expand_tabs(input));
40 let (source, behavior_switches) = extract_behavior_switches(&stripped);
41 let chars: Vec<char> = source.chars().collect();
42 let mut parser = Parser::new(options);
43 let mut blocks = parser.parse_blocks(&chars);
44 if !parser.categories.is_empty() {
47 let mut inlines: Vec<Inline> = Vec::new();
48 for (index, category) in parser.categories.drain(..).enumerate() {
49 if index > 0 {
50 inlines.push(Inline::Space);
51 }
52 inlines.push(category);
53 }
54 blocks.push(Block::Para(inlines));
55 }
56 let mut meta: BTreeMap<String, MetaValue> = BTreeMap::new();
57 for switch in behavior_switches {
58 meta.insert(switch, MetaValue::MetaBool(true));
59 }
60 Ok(Document {
61 api_version: ApiVersion::default(),
62 meta: meta.into_iter().map(|(k, v)| (k.into(), v)).collect(),
63 blocks,
64 })
65 }
66}
67
/// Mutable state carried through the parse of one document.
68struct Parser {
/// Enabled reader extensions, copied from `ReaderOptions::extensions` in `new`.
71 extensions: Extensions,
/// Counter for unlabeled external links; incremented per such link and used as its text.
72 link_counter: usize,
/// Heading identifier registry (starts from `IdRegistry::default()`).
73 ids: heading_ids::IdRegistry,
/// Category links collected during parsing; drained by `read` into a trailing paragraph.
74 categories: Vec<Inline>,
/// Current `parse_blocks` nesting depth, checked against `MAX_BLOCK_DEPTH`.
76 depth: usize,
78}
79
/// Nesting limit: `parse_blocks` falls back to `degraded_blocks` beyond this depth, and
/// `build_lists` flattens items to plain blocks once `level` reaches it.
80const MAX_BLOCK_DEPTH: usize = 64;
84
/// One wiki list line as split by `parse_list`.
85struct ListItem {
/// The leading run of list-marker characters, in source order.
87 markers: Vec<char>,
/// The rest of the line after the markers, trimmed.
88 content: String,
89}
90
/// Kind of list a marker character opens, as returned by `list_kind`.
/// NOTE(review): the marker-to-kind mapping lives in `list_kind`, which is not visible here.
91#[derive(PartialEq, Eq, Clone, Copy)]
93enum ListKind {
/// Built as `Block::BulletList`.
94 Bullet,
/// Built as `Block::OrderedList` with `default_list_attrs()`.
95 Ordered,
/// Built as `Block::DefinitionList`.
96 Definition,
97}
98
/// A table cell as scanned from source, before its content is parsed into blocks.
99struct RawCell {
/// Whether this is a header cell; `build_table` checks it on the first cell of the first row.
102 is_header: bool,
/// Alignment copied onto the resulting `Cell`.
103 align: Alignment,
/// Requested column span; passed through `col_count` and clamped to the grid in `lay_grid`.
104 col_span: i32,
/// Requested row span; clamped to at least 1 and at most the rows remaining in `lay_grid`.
105 row_span: i32,
/// Attributes copied onto the resulting `Cell`.
106 attr: Attr,
/// Unparsed cell text; trimmed and parsed by `parse_cell_blocks`.
107 content: String,
108}
109
/// Alignment, spans and attributes of a cell.
/// NOTE(review): not used in the visible part of the file; presumably the parsed form of a
/// cell's attribute prefix — confirm against the table scanner.
110struct CellAttrs {
112 align: Alignment,
113 col_span: i32,
114 row_span: i32,
115 attr: Attr,
116}
117
/// NOTE(review): not used in the visible part of the file; presumably tracks which table
/// element (none, caption, or cell) is currently receiving text — confirm at the use site.
118#[derive(Clone, Copy)]
120enum OpenTarget {
121 None,
122 Caption,
123 Cell,
124}
125
/// Token produced by `Parser::lex`.
126enum Tok {
/// A finished inline element.
130 Inline(Inline),
/// A run of two or more consecutive apostrophes, carrying the run length.
131 Apostrophes(usize),
/// Raw text that `parse_block_content` emits as an HTML `RawBlock`.
132 BlockRaw(String),
/// Boundary that makes `parse_block_content` flush the current paragraph segment.
133 BlockBreak,
/// A fully parsed block that interrupts the surrounding paragraph.
134 Block(Block),
137}
138
/// Classification of an HTML tag name, as returned by `html_tag_role`.
139enum HtmlTagRole {
/// `handle_tag` passes these tags through as raw inline HTML.
141 Inline,
/// NOTE(review): handled outside the visible code; presumably block-level tags.
143 Block,
/// NOTE(review): handled outside the visible code; presumably tags that break a paragraph.
145 Break,
147}
148
149impl Parser {
150 fn new(options: &ReaderOptions) -> Self {
151 Self {
152 extensions: options.extensions,
153 link_counter: 0,
154 ids: heading_ids::IdRegistry::default(),
155 categories: Vec::new(),
156 depth: 0,
157 }
158 }
159
160 fn smart(&self) -> bool {
162 self.extensions.contains(Extension::Smart)
163 }
164
165 fn parse_blocks(&mut self, chars: &[char]) -> Vec<Block> {
166 self.depth += 1;
167 if self.depth > MAX_BLOCK_DEPTH {
168 self.depth -= 1;
169 return degraded_blocks(chars);
170 }
171 let blocks = self.parse_blocks_inner(chars);
172 self.depth -= 1;
173 blocks
174 }
175
/// Walks `chars` and produces the block sequence.
///
/// At the start of a line, block constructs are tried in a fixed order: blank line, `{{`
/// template, `{|` table, `=` header, `-` rule, list markers, leading-space preformatted text,
/// and `<` block tags. Anything unclaimed, and any text that follows a block on the same
/// line, is handed to `parse_paragraph`.
176 fn parse_blocks_inner(&mut self, chars: &[char]) -> Vec<Block> {
177 let mut blocks: Vec<Block> = Vec::new();
178 let mut pos = 0;
// True when `pos` sits at the beginning of a line; only then are block openers considered.
179 let mut line_start = true;
180 let n = chars.len();
// Scan state shared by `try_header` and `parse_paragraph` for the whole call.
181 let mut scan = HeaderScan::default();
186 while pos < n {
187 if line_start {
188 let le = line_end(chars, pos);
// Blank lines separate blocks and produce nothing themselves.
189 if is_blank(chars, pos, le) {
190 pos = if le < n { le + 1 } else { le };
191 continue;
192 }
193 let c = at(chars, pos).unwrap_or(' ');
// `{{ ... }}` with balanced braces at line start: kept verbatim as a raw MediaWiki block.
194 if c == '{'
195 && at(chars, pos + 1) == Some('{')
196 && template_opens(chars, pos)
197 && let Some(after) = balanced_braces(chars, pos)
198 {
199 let raw = collect_range(chars, pos, after);
200 blocks.push(Block::RawBlock(format_mediawiki(), raw.into()));
201 let (np, ls) = finish_inline_block(chars, after);
202 pos = np;
203 line_start = ls;
204 continue;
205 }
// `{|` opens a table.
206 if c == '{' && at(chars, pos + 1) == Some('|') {
207 let (block, after) = self.parse_table(chars, pos);
208 blocks.push(block);
209 let (np, ls) = finish_inline_block(chars, after);
210 pos = np;
211 line_start = ls;
212 continue;
213 }
// `=` may open a header; `try_header` returns `None` when the line is not a valid header.
214 if c == '='
215 && let Some((level, inlines, closer_end)) =
216 self.try_header(chars, pos, &mut scan)
217 {
218 let id = self.make_id(&inlines);
219 let attr = Attr {
220 id: id.into(),
221 classes: Vec::new(),
222 attributes: Vec::new(),
223 };
224 blocks.push(Block::Header(level, Box::new(attr), inlines));
225 let (np, ls) = finish_inline_block(chars, closer_end);
226 pos = np;
227 line_start = ls;
228 continue;
229 }
// Horizontal rule: the whole line is consumed.
230 if c == '-' && is_hr_line(chars, pos) {
231 blocks.push(Block::HorizontalRule);
232 let le2 = line_end(chars, pos);
233 pos = if le2 < n { le2 + 1 } else { le2 };
234 line_start = true;
235 continue;
236 }
// Wiki list lines (`*`, `#`, `:`, `;`); `parse_list` consumes the whole run of list lines.
237 if matches!(c, '*' | '#' | ':' | ';') && list_run_uniform(chars, pos) {
238 let (list_blocks, after) = self.parse_list(chars, pos);
239 blocks.extend(list_blocks);
240 pos = after;
241 line_start = true;
242 continue;
243 }
// A leading space marks preformatted text.
244 if c == ' ' {
245 let (block, after) = self.parse_preformatted(chars, pos);
246 blocks.push(block);
247 pos = after;
248 line_start = true;
249 continue;
250 }
// Block-level HTML-style tags; `None` means the tag is not one `parse_block_tag` handles.
251 if c == '<'
252 && let Some((block, after)) = self.parse_block_tag(chars, pos)
253 {
254 blocks.push(block);
255 let (np, ls) = finish_inline_block(chars, after);
256 pos = np;
257 line_start = ls;
258 continue;
259 }
260 }
// Fallback: ordinary paragraph text, including text that follows a block on the same line.
261 let (mut para_blocks, after) = self.parse_paragraph(chars, pos, &mut scan);
262 blocks.append(&mut para_blocks);
263 pos = after;
264 line_start = true;
265 }
266 blocks
267 }
268
269 fn try_header(
270 &mut self,
271 chars: &[char],
272 pos: usize,
273 scan: &mut HeaderScan,
274 ) -> Option<(i32, Vec<Inline>, usize)> {
275 let le = line_end(chars, pos);
276 let mut m = 0;
277 while pos + m < le && at(chars, pos + m) == Some('=') {
278 m += 1;
279 }
280 if m == 0 || m > 6 {
281 return None;
282 }
283 let content_start = pos + m;
284 let region_end = header_region_end_scan(chars, pos, scan);
288 let closer = header_closer(chars, content_start, region_end, m)?;
289 let content = collect_range(chars, content_start, closer);
290 let inlines = self.parse_inlines(content.trim());
291 Some((i32::try_from(m).unwrap_or(1), inlines, closer + m))
292 }
293
294 fn parse_list(&mut self, chars: &[char], pos: usize) -> (Vec<Block>, usize) {
295 let mut items: Vec<ListItem> = Vec::new();
296 let mut cursor = pos;
297 let n = chars.len();
298 while at(chars, cursor).is_some_and(is_list_marker) {
299 let le = line_end(chars, cursor);
300 let mut scan = cursor;
301 let mut markers: Vec<char> = Vec::new();
302 while scan < le && at(chars, scan).is_some_and(is_list_marker) {
303 if let Some(marker) = at(chars, scan) {
304 markers.push(marker);
305 }
306 scan += 1;
307 }
308 let content = collect_range(chars, scan, le).trim().to_string();
309 items.push(ListItem { markers, content });
310 if le >= n {
311 cursor = le;
312 break;
313 }
314 cursor = le + 1;
315 }
316 (self.build_lists(&items, 0), cursor)
317 }
318
319 fn build_lists(&mut self, items: &[ListItem], level: usize) -> Vec<Block> {
320 if level >= MAX_BLOCK_DEPTH {
321 let mut out: Vec<Block> = Vec::new();
324 for item in items {
325 let inlines = self.parse_inlines(&item.content);
326 if !inlines.is_empty() {
327 out.push(Block::Plain(inlines));
328 }
329 }
330 return out;
331 }
332 let mut out: Vec<Block> = Vec::new();
333 let mut i = 0;
334 while i < items.len() {
335 let kind = if let Some(&m) = items.get(i).and_then(|it| it.markers.get(level)) {
336 list_kind(m)
337 } else {
338 i += 1;
339 continue;
340 };
341 let mut j = i;
342 while j < items.len() {
343 match items.get(j).and_then(|it| it.markers.get(level)) {
344 Some(&m) if list_kind(m) == kind => j += 1,
345 _ => break,
346 }
347 }
348 let group = items.get(i..j).unwrap_or(&[]);
349 match kind {
350 ListKind::Bullet => out.push(Block::BulletList(self.build_simple(group, level))),
351 ListKind::Ordered => {
352 out.push(Block::OrderedList(
353 default_list_attrs(),
354 self.build_simple(group, level),
355 ));
356 }
357 ListKind::Definition => out.push(self.build_definition(group, level)),
358 }
359 i = j;
360 }
361 out
362 }
363
364 fn build_simple(&mut self, group: &[ListItem], level: usize) -> Vec<Vec<Block>> {
365 let mut entries: Vec<Vec<Block>> = Vec::new();
366 let mut i = 0;
367 while i < group.len() {
368 let depth = group.get(i).map_or(0, |it| it.markers.len());
369 if depth == level + 1 {
370 let content = group.get(i).map_or("", |it| it.content.as_str());
371 let mut blocks = vec![plain_or_figure(self.parse_inlines(content))];
372 i += 1;
373 let start = i;
374 while i < group.len() && group.get(i).map_or(0, |it| it.markers.len()) > level + 1 {
375 i += 1;
376 }
377 if let Some(sub) = group.get(start..i)
378 && !sub.is_empty()
379 {
380 blocks.extend(self.build_lists(sub, level + 1));
381 }
382 entries.push(blocks);
383 } else {
384 let start = i;
385 while i < group.len() && group.get(i).map_or(0, |it| it.markers.len()) > level + 1 {
386 i += 1;
387 }
388 if i == start {
389 i += 1;
390 }
391 let blocks = group
392 .get(start..i)
393 .map(|sub| self.build_lists(sub, level + 1))
394 .unwrap_or_default();
395 entries.push(blocks);
396 }
397 }
398 entries
399 }
400
/// Builds one `Block::DefinitionList` from `group`.
///
/// Items with exactly `level + 1` markers are terms (`;`) or definitions (any other marker);
/// the deeper items that follow are nested via `build_lists` and attached to the preceding
/// entry. Returns the list as (term, definitions) pairs.
401 fn build_definition(&mut self, group: &[ListItem], level: usize) -> Block {
402 let mut pairs: Vec<(Vec<Inline>, Vec<Vec<Block>>)> = Vec::new();
403 let mut i = 0;
404 while i < group.len() {
405 let Some(item) = group.get(i) else { break };
406 if item.markers.len() == level + 1 {
// An item at this list's own depth; its last marker decides term vs. definition.
407 let marker = item.markers.get(level).copied().unwrap_or(':');
408 let content = item.content.clone();
409 i += 1;
// Gather the run of deeper items that belong to this item.
410 let start = i;
411 while i < group.len() && group.get(i).map_or(0, |it| it.markers.len()) > level + 1 {
412 i += 1;
413 }
414 let nested = group
415 .get(start..i)
416 .map(|sub| self.build_lists(sub, level + 1))
417 .unwrap_or_default();
418 if marker == ';' {
// Term line; `split_term` may also yield a definition written on the same line.
419 let (term_str, def_str) = split_term(&content);
420 let term = self.parse_inlines(&term_str);
421 let mut defs: Vec<Vec<Block>> = Vec::new();
422 if let Some(d) = def_str {
423 defs.push(vec![plain_or_figure(self.parse_inlines(&d))]);
424 }
// Nested lists extend the same-line definition if present, else become a definition.
425 if !nested.is_empty() {
426 match defs.last_mut() {
427 Some(last) => last.extend(nested),
428 None => defs.push(nested),
429 }
430 }
431 match pairs.last_mut() {
// A previous term without definitions is merged with this term, split by a line break.
434 Some((last_term, last_defs)) if last_defs.is_empty() => {
435 last_term.push(Inline::LineBreak);
436 last_term.extend(term);
437 *last_defs = defs;
438 }
439 _ => pairs.push((term, defs)),
440 }
441 } else {
// Definition line: appended to the latest term, or to an empty term if there is none.
442 let mut blocks = vec![plain_or_figure(self.parse_inlines(&content))];
443 blocks.extend(nested);
444 match pairs.last_mut() {
445 Some(last) => last.1.push(blocks),
446 None => pairs.push((Vec::new(), vec![blocks])),
447 }
448 }
449 } else {
// Item not at this list's own depth: build it as nested content only.
450 let start = i;
451 while i < group.len() && group.get(i).map_or(0, |it| it.markers.len()) > level + 1 {
452 i += 1;
453 }
// Guarantee progress when the item is not deeper either.
454 if i == start {
455 i += 1;
456 }
457 let nested = group
458 .get(start..i)
459 .map(|sub| self.build_lists(sub, level + 1))
460 .unwrap_or_default();
// Attach to the last definition of the last pair, creating whatever is missing.
461 match pairs.last_mut() {
462 Some(last) => match last.1.last_mut() {
463 Some(d) => d.extend(nested),
464 None => last.1.push(nested),
465 },
466 None => pairs.push((Vec::new(), vec![nested])),
467 }
468 }
469 }
470 Block::DefinitionList(pairs)
471 }
472
473 fn parse_preformatted(&mut self, chars: &[char], pos: usize) -> (Block, usize) {
474 let n = chars.len();
475 let mut p = pos;
476 let mut lines: Vec<Vec<Inline>> = Vec::new();
477 while at(chars, p) == Some(' ') {
478 let le = line_end(chars, p);
479 let content = collect_range(chars, p + 1, le);
480 lines.push(self.preformatted_line(&content));
481 if le >= n {
482 p = le;
483 break;
484 }
485 p = le + 1;
486 }
487 let mut out: Vec<Inline> = Vec::new();
488 for (idx, mut inlines) in lines.into_iter().enumerate() {
489 if idx > 0 {
490 out.push(Inline::LineBreak);
491 }
492 out.append(&mut inlines);
493 }
494 (Block::Para(out), p)
495 }
496
497 fn parse_block_tag(&mut self, chars: &[char], pos: usize) -> Option<(Block, usize)> {
498 let (name, raw_open, self_closing, after_open) = open_tag(chars, pos)?;
499 match name.as_str() {
500 "blockquote" => {
501 if self_closing {
502 return Some((Block::BlockQuote(Vec::new()), after_open));
503 }
504 let (inner, after) = enclosed(chars, after_open, "blockquote");
505 let inner_chars: Vec<char> = inner.chars().collect();
506 Some((Block::BlockQuote(self.parse_blocks(&inner_chars)), after))
507 }
508 "pre" => {
509 let (inner, after) = enclosed(chars, after_open, "pre");
510 Some((
511 Block::CodeBlock(Box::default(), trim_code(&inner).into()),
512 after,
513 ))
514 }
515 "source" | "syntaxhighlight" => {
516 let (inner, after) = enclosed(chars, after_open, &name);
517 let mut classes = Vec::new();
518 if let Some(lang) = tag_attribute(&raw_open, "lang")
519 && !lang.is_empty()
520 {
521 classes.push(lang.into());
522 }
523 let attr = Attr {
524 id: carta_ast::Text::default(),
525 classes,
526 attributes: Vec::new(),
527 };
528 Some((
529 Block::CodeBlock(Box::new(attr), trim_code(&inner).into()),
530 after,
531 ))
532 }
533 "ul" => Some(self.parse_html_list(chars, after_open, false, &raw_open, self_closing)),
534 "ol" => Some(self.parse_html_list(chars, after_open, true, &raw_open, self_closing)),
535 _ => None,
536 }
537 }
538
539 fn parse_html_list(
545 &mut self,
546 chars: &[char],
547 start: usize,
548 ordered: bool,
549 raw_open: &str,
550 self_closing: bool,
551 ) -> (Block, usize) {
552 let mut items: Vec<Vec<Block>> = Vec::new();
553 let mut i = start;
554 let close_name = if ordered { "ol" } else { "ul" };
555 if !self_closing {
556 loop {
557 while at(chars, i).is_some_and(char::is_whitespace) {
558 i += 1;
559 }
560 if at(chars, i) == Some('<')
561 && at(chars, i + 1) == Some('/')
562 && tag_name_matches(chars, i + 2, close_name)
563 && let Some((_, _, after)) = close_tag_parse(chars, i)
564 {
565 i = after;
566 break;
567 }
568 if at(chars, i) == Some('<')
569 && at(chars, i + 1) != Some('/')
570 && tag_name_matches(chars, i + 1, "li")
571 && let Some((_, _, _self_closing, after_li)) = open_tag(chars, i)
572 {
573 let (content_end, next) = html_li_content_bounds(chars, after_li);
574 let content: Vec<char> = collect_range(chars, after_li, content_end)
575 .chars()
576 .collect();
577 let mut blocks = self.parse_blocks(&content);
578 if let Some(Block::Para(inlines)) = blocks.first() {
579 let inlines = inlines.clone();
580 if let Some(first) = blocks.first_mut() {
581 *first = Block::Plain(inlines);
582 }
583 }
584 items.push(blocks);
585 i = next;
586 continue;
587 }
588 break;
589 }
590 }
591 let block = if ordered {
592 let start_num = tag_attribute(raw_open, "start")
593 .and_then(|value| value.trim().parse::<i32>().ok())
594 .unwrap_or(1);
595 Block::OrderedList(
596 ListAttributes {
597 start: start_num,
598 style: ListNumberStyle::DefaultStyle,
599 delim: ListNumberDelim::DefaultDelim,
600 },
601 items,
602 )
603 } else {
604 Block::BulletList(items)
605 };
606 (block, i)
607 }
608
609 fn parse_paragraph(
610 &mut self,
611 chars: &[char],
612 pos: usize,
613 scan: &mut HeaderScan,
614 ) -> (Vec<Block>, usize) {
615 let n = chars.len();
616 let mut pieces: Vec<String> = Vec::new();
617 let mut cur = pos;
618 loop {
619 let le = line_end(chars, cur);
620 pieces.push(collect_range(chars, cur, le));
621 if le >= n {
622 cur = le;
623 break;
624 }
625 let next = le + 1;
626 if next >= n {
627 cur = next;
628 break;
629 }
630 let ref_open = open_ref_depth(chars, pos, next) > 0;
636 let next_end = line_end(chars, next);
637 if is_blank(chars, next, next_end) {
638 if ref_open {
639 cur = next;
640 continue;
641 }
642 cur = if next_end < n { next_end + 1 } else { next_end };
643 break;
644 }
645 if line_starts_block_scan(chars, next, scan) {
646 if ref_open && open_ref_block_bodied(chars, pos, next) {
647 cur = next;
648 continue;
649 }
650 cur = next;
651 break;
652 }
653 cur = next;
654 }
655 let raw = pieces.join("\n");
656 let trimmed = raw.trim();
657 if trimmed.is_empty() {
658 return (Vec::new(), cur);
659 }
660 (self.parse_block_content(trimmed), cur)
661 }
662
663 fn parse_block_content(&mut self, text: &str) -> Vec<Block> {
667 let chars: Vec<char> = text.chars().collect();
668 let toks = self.lex(&chars, false, true);
669 let smart = self.smart();
670 let east_asian = self.extensions.contains(Extension::EastAsianLineBreaks);
671 let mut blocks: Vec<Block> = Vec::new();
672 let mut segment: Vec<Tok> = Vec::new();
673 for tok in toks {
674 match tok {
675 Tok::BlockRaw(raw) => {
676 flush_para_segment(&mut segment, &mut blocks, smart, east_asian);
677 blocks.push(Block::RawBlock(format_html(), raw.into()));
678 }
679 Tok::Block(block) => {
680 flush_para_segment(&mut segment, &mut blocks, smart, east_asian);
681 blocks.push(block);
682 }
683 Tok::BlockBreak => flush_para_segment(&mut segment, &mut blocks, smart, east_asian),
684 other => segment.push(other),
685 }
686 }
687 flush_para_segment(&mut segment, &mut blocks, smart, east_asian);
688 blocks
689 }
690
691 fn parse_table(&mut self, chars: &[char], pos: usize) -> (Block, usize) {
696 let after = table_block_end(chars, pos);
697 let region = collect_range(chars, pos, after);
698 (self.build_table(®ion), after)
699 }
700
701 fn build_table(&mut self, region: &str) -> Block {
702 let (mut rows, caption_text) = scan_table_region(region);
703 if rows.first().is_some_and(Vec::is_empty) {
707 rows.remove(0);
708 }
709 if rows.is_empty() {
710 rows.push(Vec::new());
712 }
713
714 let n_rows = rows.len();
715 let ncols = rows.first().map_or(0, |r| {
717 r.iter().map(|c| col_count(c.col_span)).sum::<usize>()
718 });
719 let col_specs = column_specs(&rows, ncols);
720
721 let is_header_first = rows
722 .first()
723 .and_then(|r| r.first())
724 .is_some_and(|c| c.is_header);
725
726 let ast_rows = self.lay_grid(&rows, ncols, n_rows);
727
728 let (head_rows, body_rows) = if is_header_first {
729 let mut iter = ast_rows.into_iter();
730 let head: Vec<Row> = iter.next().into_iter().collect();
731 (head, iter.collect::<Vec<Row>>())
732 } else {
733 (Vec::new(), ast_rows)
734 };
735
736 let caption = match caption_text {
737 Some(text) => {
738 let inlines = self.parse_inlines(text.trim());
739 if inlines.is_empty() {
740 Caption::default()
741 } else {
742 Caption {
743 short: None,
744 long: vec![Block::Plain(inlines)],
745 }
746 }
747 }
748 None => Caption::default(),
749 };
750
751 Block::Table(Box::new(Table {
752 attr: Attr::default(),
753 caption,
754 col_specs,
755 head: TableHead {
756 attr: Attr::default(),
757 rows: head_rows,
758 },
759 bodies: vec![TableBody {
760 attr: Attr::default(),
761 row_head_columns: 0,
762 head: Vec::new(),
763 body: body_rows,
764 }],
765 foot: TableFoot::default(),
766 }))
767 }
768
769 fn lay_grid(&mut self, rows: &[Vec<RawCell>], ncols: usize, n_rows: usize) -> Vec<Row> {
774 let mut ast_rows: Vec<Row> = Vec::new();
775 let mut occupied: Vec<i32> = vec![0; ncols];
776 for (r, raw) in rows.iter().enumerate() {
777 let available = i32::try_from(n_rows.saturating_sub(r)).unwrap_or(i32::MAX);
778 let mut cells: Vec<Cell> = Vec::new();
779 let mut col = 0usize;
780 for c in raw {
781 while col < ncols && occupied.get(col).copied().unwrap_or(0) > 0 {
782 col += 1;
783 }
784 if col >= ncols {
785 break;
786 }
787 let col_span = col_count(c.col_span).min(ncols - col);
788 let row_span = c.row_span.max(1).min(available);
789 let content_chars: Vec<char> = c.content.trim().chars().collect();
790 let content = self.parse_cell_blocks(&content_chars);
791 cells.push(Cell {
792 attr: c.attr.clone(),
793 align: c.align.clone(),
794 row_span,
795 col_span: i32::try_from(col_span).unwrap_or(1),
796 content,
797 });
798 for k in col..col + col_span {
799 if let Some(slot) = occupied.get_mut(k) {
800 *slot = row_span;
801 }
802 }
803 col += col_span;
804 }
805 while col < ncols {
806 if occupied.get(col).copied().unwrap_or(0) == 0 {
807 cells.push(empty_cell());
808 }
809 col += 1;
810 }
811 for slot in &mut occupied {
812 *slot = (*slot - 1).max(0);
813 }
814 ast_rows.push(Row {
815 attr: Attr::default(),
816 cells,
817 });
818 }
819 ast_rows
820 }
821
822 fn parse_cell_blocks(&mut self, chars: &[char]) -> Vec<Block> {
827 let first = at(chars, 0);
828 let suppressed = matches!(first, Some('*' | '#' | ';'))
829 || (first == Some('=') && is_header_line_within(chars, 0));
830 if !suppressed {
831 return self.parse_blocks(chars);
832 }
833 let (mut blocks, after) = self.parse_paragraph(chars, 0, &mut HeaderScan::default());
834 if let Some(rest) = chars.get(after..) {
835 blocks.extend(self.parse_blocks(rest));
836 }
837 blocks
838 }
839
840 fn note_blocks(&mut self, chars: &[char]) -> Vec<Block> {
842 let blocks = self.parse_blocks(chars);
843 match blocks.as_slice() {
844 [Block::Para(inlines)] => vec![Block::Plain(inlines.clone())],
845 _ => blocks,
846 }
847 }
848
849 fn parse_inlines(&mut self, text: &str) -> Vec<Inline> {
850 let chars: Vec<char> = text.chars().collect();
851 let toks = self.lex(&chars, false, false);
852 let mut inlines = coalesce(resolve_emphasis(toks));
853 if self.extensions.contains(Extension::EastAsianLineBreaks) {
854 inlines = drop_east_asian_breaks(inlines);
855 }
856 if self.smart() {
857 inlines = apply_smart_quotes(inlines);
858 }
859 inlines
860 }
861
862 fn preformatted_line(&mut self, text: &str) -> Vec<Inline> {
865 let chars: Vec<char> = text.chars().collect();
866 let toks = self.lex(&chars, true, false);
867 preformat_transform(resolve_emphasis(toks))
868 }
869
/// Tokenizes `chars` into a flat `Tok` stream.
///
/// * `preformatted` - whitespace is kept verbatim inside words instead of becoming tokens.
/// * `block_context` - block-level tags may be parsed into `Tok::Block`.
///
/// Plain characters accumulate in `word`, which is flushed via `flush_word` before any
/// non-word token is pushed.
870 #[allow(clippy::too_many_lines)]
871 fn lex(&mut self, chars: &[char], preformatted: bool, block_context: bool) -> Vec<Tok> {
872 let mut toks: Vec<Tok> = Vec::new();
873 let mut word = String::new();
874 let mut i = 0;
875 let n = chars.len();
876 while i < n {
877 let Some(c) = at(chars, i) else { break };
// Apostrophe runs: two or more become an `Apostrophes` token, a single one is text.
878 if c == '\'' {
879 let mut end = i;
880 while at(chars, end) == Some('\'') {
881 end += 1;
882 }
883 let run = end - i;
884 if run >= 2 {
885 flush_word(&mut word, &mut toks);
886 toks.push(Tok::Apostrophes(run));
887 } else {
888 word.push('\'');
889 }
890 i = end;
891 continue;
892 }
// Whitespace: literal in preformatted mode, otherwise one token from `whitespace_token`.
893 if c.is_whitespace() {
894 if preformatted {
895 word.push(c);
896 i += 1;
897 continue;
898 }
899 flush_word(&mut word, &mut toks);
900 let (token, next) = whitespace_token(chars, i);
901 toks.push(Tok::Inline(token));
902 i = next;
903 continue;
904 }
// Character references are decoded into the current word; a bare `&` stays literal.
905 if c == '&' {
906 if let Some((decoded, next)) = entities::read_reference(chars, i, chars.len(), true)
907 {
908 word.push_str(&decoded);
909 i = next;
910 } else {
911 word.push('&');
912 i += 1;
913 }
914 continue;
915 }
// Tags: inline handling first, then block tags (block context only), then
// `block_tag_token`; an unrecognized `<` is literal text.
916 if c == '<' {
917 if let Some((inlines, next)) = self.handle_tag(chars, i) {
918 flush_word(&mut word, &mut toks);
919 for inline in inlines {
920 toks.push(Tok::Inline(inline));
921 }
922 i = next;
923 continue;
924 }
925 if block_context
926 && starts_block_tag(chars, i)
927 && let Some((block, next)) = self.parse_block_tag(chars, i)
928 {
929 flush_word(&mut word, &mut toks);
930 toks.push(Tok::Block(block));
931 i = next;
932 continue;
933 }
934 if let Some((tok, next)) = block_tag_token(chars, i) {
935 flush_word(&mut word, &mut toks);
936 toks.push(tok);
937 i = next;
938 continue;
939 }
940 word.push('<');
941 i += 1;
942 continue;
943 }
// `{{ ... }}` with balanced braces is kept as raw MediaWiki; otherwise one `{` is literal.
944 if c == '{' && at(chars, i + 1) == Some('{') {
945 if template_opens(chars, i)
946 && let Some(after) = balanced_braces(chars, i)
947 {
948 flush_word(&mut word, &mut toks);
949 let raw = collect_range(chars, i, after);
950 toks.push(Tok::Inline(Inline::RawInline(
951 format_mediawiki(),
952 raw.into(),
953 )));
954 i = after;
955 continue;
956 }
957 word.push('{');
958 i += 1;
959 continue;
960 }
// `[[` starts an internal link, a single `[` an external link.
961 if c == '[' {
962 let handled = if at(chars, i + 1) == Some('[') {
963 self.internal_link(chars, i)
964 } else {
965 self.external_link(chars, i)
966 };
967 if let Some((inlines, next)) = handled {
968 flush_word(&mut word, &mut toks);
969 for inline in inlines {
970 toks.push(Tok::Inline(inline));
971 }
972 i = next;
973 continue;
974 }
// Not a link: for a single `[`, a bare URL right after it is still recognized and the
// bracket is kept as text.
975 if at(chars, i + 1) != Some('[')
977 && let Some((inline, next)) = bare_url(chars, i + 1)
978 {
979 word.push('[');
980 flush_word(&mut word, &mut toks);
981 toks.push(Tok::Inline(inline));
982 i = next;
983 continue;
984 }
985 word.push('[');
986 i += 1;
987 continue;
988 }
// Bare URLs are only tried when no word is in progress.
989 if word.is_empty()
990 && let Some((inline, next)) = bare_url(chars, i)
991 {
992 toks.push(Tok::Inline(inline));
993 i = next;
994 continue;
995 }
996 word.push(c);
997 i += 1;
998 }
999 flush_word(&mut word, &mut toks);
1000 toks
1001 }
1002
/// Handles an inline-level tag starting at `i`.
///
/// Returns the inlines it produces and the index to resume at, or `None` when `chars[i]` is
/// not `<`, the tag cannot be parsed, or the tag is not one this function handles.
1003 #[allow(clippy::too_many_lines)]
1004 fn handle_tag(&mut self, chars: &[char], i: usize) -> Option<(Vec<Inline>, usize)> {
1005 if at(chars, i) != Some('<') {
1006 return None;
1007 }
1008 match at(chars, i + 1) {
// A stray closing tag is passed through as raw HTML, but only for inline-role tags.
1009 Some('/') => {
1010 let (name, raw, after) = close_tag_parse(chars, i)?;
1011 return match html_tag_role(&name) {
1012 Some(HtmlTagRole::Inline) => Some((vec![raw_html(raw)], after)),
1013 _ => None,
1014 };
1015 }
// An opening tag must start with an ASCII letter.
1016 Some(c) if c.is_ascii_alphabetic() => {}
1017 _ => return None,
1018 }
1019 let (name, raw_open, self_closing, after_open) = open_tag(chars, i)?;
1020 match name.as_str() {
1021 "br" => Some((vec![Inline::LineBreak], after_open)),
// `<ref>`: the content becomes a note; an unclosed tag degrades to raw HTML.
1022 "ref" => {
1023 if self_closing {
1024 return Some((vec![Inline::Note(Vec::new())], after_open));
1025 }
1026 match close_tag(chars, after_open, "ref") {
1027 Some((inner_end, after)) => {
1028 let inner = collect_range(chars, after_open, inner_end);
1029 let inner_chars: Vec<char> = inner.chars().collect();
1030 Some((vec![Inline::Note(self.note_blocks(&inner_chars))], after))
1031 }
1032 None => Some((vec![raw_html(raw_open)], after_open)),
1033 }
1034 }
// `<nowiki>`: the content goes through `plain_inlines` instead of the lexer.
1035 "nowiki" => {
1036 if self_closing {
1037 return Some((Vec::new(), after_open));
1038 }
1039 let (inner, after) = enclosed(chars, after_open, "nowiki");
1040 Some((plain_inlines(&inner), after))
1041 }
// `<math>`: the trimmed content becomes inline math; an unclosed tag degrades to raw HTML.
1042 "math" => {
1043 if self_closing {
1044 return Some((Vec::new(), after_open));
1045 }
1046 match close_tag(chars, after_open, "math") {
1047 Some((inner_end, after)) => {
1048 let inner = collect_range(chars, after_open, inner_end);
1049 Some((
1050 vec![Inline::Math(MathType::InlineMath, inner.trim().into())],
1051 after,
1052 ))
1053 }
1054 None => Some((vec![raw_html(raw_open)], after_open)),
1055 }
1056 }
// Code-like tags go through `verbatim_code`; the last argument is a class list.
1057 "code" | "tt" => Some(verbatim_code(
1058 chars,
1059 &name,
1060 after_open,
1061 &raw_open,
1062 self_closing,
1063 &[],
1064 )),
1065 "var" => Some(verbatim_code(
1066 chars,
1067 "var",
1068 after_open,
1069 &raw_open,
1070 self_closing,
1071 &["variable"],
1072 )),
1073 "samp" => Some(verbatim_code(
1074 chars,
1075 "samp",
1076 after_open,
1077 &raw_open,
1078 self_closing,
1079 &["sample"],
1080 )),
// Tags that map directly onto an inline constructor.
1081 "sub" => Some(self.wrap(
1082 chars,
1083 "sub",
1084 after_open,
1085 &raw_open,
1086 self_closing,
1087 Inline::Subscript,
1088 )),
1089 "sup" => Some(self.wrap(
1090 chars,
1091 "sup",
1092 after_open,
1093 &raw_open,
1094 self_closing,
1095 Inline::Superscript,
1096 )),
1097 "del" | "strike" => Some(self.wrap(
1098 chars,
1099 &name,
1100 after_open,
1101 &raw_open,
1102 self_closing,
1103 Inline::Strikeout,
1104 )),
// Tags rendered as a span carrying the tag name as its class.
1105 "kbd" => Some(self.span(chars, "kbd", after_open, &raw_open, self_closing, "kbd")),
1106 "mark" => Some(self.span(chars, "mark", after_open, &raw_open, self_closing, "mark")),
// Any other inline-role tag: raw open tag, parsed content, raw close tag.
1107 _ => match html_tag_role(&name) {
1108 Some(HtmlTagRole::Inline) => {
1109 if self_closing {
1110 return Some((vec![raw_html(raw_open)], after_open));
1111 }
1112 match close_tag(chars, after_open, &name) {
1113 Some((inner_end, after)) => {
1114 let inner = collect_range(chars, after_open, inner_end);
1115 let close_raw = collect_range(chars, inner_end, after);
1116 let mut out = vec![raw_html(raw_open)];
1117 out.extend(self.parse_inlines(&inner));
1118 out.push(raw_html(close_raw));
1119 Some((out, after))
1120 }
1121 None => Some((vec![raw_html(raw_open)], after_open)),
1122 }
1123 }
1124 _ => None,
1127 },
1128 }
1129 }
1130
1131 fn wrap(
1132 &mut self,
1133 chars: &[char],
1134 name: &str,
1135 after_open: usize,
1136 raw_open: &str,
1137 self_closing: bool,
1138 ctor: fn(Vec<Inline>) -> Inline,
1139 ) -> (Vec<Inline>, usize) {
1140 if self_closing {
1141 return (vec![raw_html(raw_open.to_string())], after_open);
1142 }
1143 match close_tag(chars, after_open, name) {
1144 Some((inner_end, after)) => {
1145 let inner = collect_range(chars, after_open, inner_end);
1146 (vec![ctor(self.parse_inlines(&inner))], after)
1147 }
1148 None => (vec![raw_html(raw_open.to_string())], after_open),
1149 }
1150 }
1151
1152 fn span(
1153 &mut self,
1154 chars: &[char],
1155 name: &str,
1156 after_open: usize,
1157 raw_open: &str,
1158 self_closing: bool,
1159 class: &str,
1160 ) -> (Vec<Inline>, usize) {
1161 if self_closing {
1162 return (vec![raw_html(raw_open.to_string())], after_open);
1163 }
1164 match close_tag(chars, after_open, name) {
1165 Some((inner_end, after)) => {
1166 let inner = collect_range(chars, after_open, inner_end);
1167 let attr = Attr {
1168 id: carta_ast::Text::default(),
1169 classes: vec![class.into()],
1170 attributes: Vec::new(),
1171 };
1172 (
1173 vec![Inline::Span(Box::new(attr), self.parse_inlines(&inner))],
1174 after,
1175 )
1176 }
1177 None => (vec![raw_html(raw_open.to_string())], after_open),
1178 }
1179 }
1180
1181 fn external_link(&mut self, chars: &[char], i: usize) -> Option<(Vec<Inline>, usize)> {
1182 let close = find_char(chars, i + 1, ']')?;
1183 let inner = collect_range(chars, i + 1, close);
1184 let (url, label) = match inner.split_once(|c: char| c.is_whitespace()) {
1185 Some((u, rest)) => (u.to_string(), rest.trim_start().to_string()),
1186 None => (inner.clone(), String::new()),
1187 };
1188 if !is_url(&url) {
1189 return None;
1190 }
1191 if label.is_empty() && at(chars, close + 1).is_some_and(char::is_alphanumeric) {
1194 return None;
1195 }
1196 let text = if label.is_empty() {
1197 self.link_counter += 1;
1198 vec![Inline::Str(self.link_counter.to_compact_string())]
1199 } else {
1200 self.parse_inlines(&label)
1201 };
1202 Some((
1203 vec![Inline::Link(
1204 Box::default(),
1205 text,
1206 Box::new(Target {
1207 url: encode_url_target(&url).into(),
1208 title: carta_ast::Text::default(),
1209 }),
1210 )],
1211 close + 1,
1212 ))
1213 }
1214
1215 fn internal_link(&mut self, chars: &[char], i: usize) -> Option<(Vec<Inline>, usize)> {
1216 let start = i + 2;
1219 let (target_end, has_pipe) = scan_link_target(chars, start)?;
1220 let target = collect_range(chars, start, target_end).trim().to_string();
1221
1222 let (label_content, close) = if has_pipe {
1225 let label_start = target_end + 1;
1226 let close = find_link_close(chars, label_start)?;
1227 (Some(collect_range(chars, label_start, close)), close)
1228 } else {
1229 (None, target_end)
1230 };
1231
1232 if let Some(ns) = namespace_of(&target) {
1233 if ns == "category" {
1234 let text = match &label_content {
1235 Some(label) if !label.trim().is_empty() => self.parse_inlines(label),
1236 _ => self.parse_inlines(&target),
1237 };
1238 let title = title_text(&text);
1239 let attr = Attr {
1240 id: carta_ast::Text::default(),
1241 classes: vec!["wikilink".into()],
1242 attributes: Vec::new(),
1243 };
1244 self.categories.push(Inline::Link(
1245 Box::new(attr),
1246 text,
1247 Box::new(Target {
1248 url: wikilink_url(&target).into(),
1249 title: title.into(),
1250 }),
1251 ));
1252 return Some((Vec::new(), close + 2));
1253 }
1254 if matches!(ns.as_str(), "file" | "image")
1257 && !strip_namespace(&target).is_empty()
1258 && let Some(image) = self.image_embed(&target, label_content.as_deref())
1259 {
1260 return Some((vec![image], close + 2));
1261 }
1262 }
1263 let mut after = close + 2;
1264 let mut trail = String::new();
1265 while let Some(c) = at(chars, after) {
1266 if c.is_ascii_alphabetic() {
1267 trail.push(c);
1268 after += 1;
1269 } else {
1270 break;
1271 }
1272 }
1273 let mut label = match &label_content {
1274 Some(l) if l.trim().is_empty() => self.pipe_trick_label(&target),
1276 Some(l) => self.parse_inlines(l),
1277 None => self.parse_inlines(&target),
1278 };
1279 let title = title_text(&label);
1280 if !trail.is_empty() {
1281 label.push(Inline::Str(trail.into()));
1282 label = coalesce(label);
1283 }
1284 let attr = Attr {
1285 id: carta_ast::Text::default(),
1286 classes: vec!["wikilink".into()],
1287 attributes: Vec::new(),
1288 };
1289 let url = wikilink_url(&target);
1290 Some((
1291 vec![Inline::Link(
1292 Box::new(attr),
1293 label,
1294 Box::new(Target {
1295 url: url.into(),
1296 title: title.into(),
1297 }),
1298 )],
1299 after,
1300 ))
1301 }
1302
1303 fn pipe_trick_label(&mut self, target: &str) -> Vec<Inline> {
1307 match target.split_once(':') {
1308 Some((_, rest)) => self.parse_inlines(rest),
1309 None => Vec::new(),
1310 }
1311 }
1312
1313 fn image_embed(&mut self, target: &str, params: Option<&str>) -> Option<Inline> {
1319 let url = wikilink_url(strip_namespace(target));
1320 let mut attributes: Vec<(String, String)> = Vec::new();
1321 let mut caption: Option<String> = None;
1322 if let Some(params) = params {
1323 for part in params.split('|') {
1324 let option = part.trim();
1325 if image_param_declines(option) {
1326 return None;
1327 }
1328 if let Some((width, height)) = image_size(option) {
1329 attributes.retain(|(key, _)| key != "width" && key != "height");
1330 attributes.push(("width".to_string(), width));
1331 if let Some(height) = height {
1332 attributes.push(("height".to_string(), height));
1333 }
1334 } else if is_image_keyword(option) || is_recognized_image_attr(option) {
1335 } else {
1338 caption = Some(part.to_string());
1339 }
1340 }
1341 }
1342 let caption = caption.unwrap_or_else(|| url.clone());
1343 let alt = self.parse_inlines(&caption);
1344 let title = title_text(&alt);
1345 let attr = Attr {
1346 id: carta_ast::Text::default(),
1347 classes: Vec::new(),
1348 attributes: attributes
1349 .into_iter()
1350 .map(|(k, v)| (k.into(), v.into()))
1351 .collect(),
1352 };
1353 Some(Inline::Image(
1354 Box::new(attr),
1355 alt,
1356 Box::new(Target {
1357 url: url.into(),
1358 title: title.into(),
1359 }),
1360 ))
1361 }
1362
1363 fn make_id(&mut self, inlines: &[Inline]) -> String {
1364 let plain = to_plain_text(inlines);
1365 if self.extensions.contains(Extension::GfmAutoIdentifiers) {
1366 let base = self.finish_id(slug_gfm, &emoji_to_aliases(&plain));
1367 self.ids.assign_with_separator(base, '-')
1368 } else if self.extensions.contains(Extension::AutoIdentifiers) {
1369 let base = self.finish_id(mediawiki_slug, &plain);
1370 self.ids.assign_with_separator(base, '_')
1371 } else {
1372 String::new()
1373 }
1374 }
1375
1376 fn finish_id(&self, slug: fn(&str) -> String, source: &str) -> String {
1381 let mut base = slug(source);
1382 if self.extensions.contains(Extension::AsciiIdentifiers) {
1383 base = slug(&transliterate_ascii(&base));
1384 }
1385 base
1386 }
1387}
1388
/// Replaces every tab with spaces up to the next multiple-of-four column.
///
/// The column counter advances by one per character and restarts after each
/// `\n`. Input without tabs is returned as an unchanged copy.
fn expand_tabs(input: &str) -> String {
    if !input.contains('\t') {
        return input.to_string();
    }
    let mut expanded = String::with_capacity(input.len());
    let mut column = 0usize;
    for ch in input.chars() {
        if ch == '\t' {
            let pad = 4 - column % 4;
            expanded.extend(std::iter::repeat(' ').take(pad));
            column += pad;
        } else {
            expanded.push(ch);
            column = if ch == '\n' { 0 } else { column + 1 };
        }
    }
    expanded
}
1421
1422fn strip_comments(input: &str) -> String {
1430 let chars: Vec<char> = input.chars().collect();
1431 let n = chars.len();
1432 let mut out = String::new();
1433 let mut i = 0;
1434 while i < n {
1435 let Some(c) = at(&chars, i) else { break };
1436 if c == '<' {
1437 if let Some(after) = verbatim_region_end(&chars, i) {
1438 out.push_str(&collect_range(&chars, i, after));
1439 i = after;
1440 continue;
1441 }
1442 if matches_prefix_ci(&chars, i, "<!--") {
1443 if let Some(dash) = find_seq(&chars, i + 4, &['-', '-', '>']) {
1444 let comment_end = dash + 3;
1445 let preceded = i == 0 || at(&chars, i - 1) == Some('\n');
1446 let followed = comment_end >= n || at(&chars, comment_end) == Some('\n');
1447 if preceded && followed {
1448 i = if comment_end < n {
1449 comment_end + 1
1450 } else {
1451 comment_end
1452 };
1453 } else if preceded || followed {
1454 i = comment_end;
1458 } else {
1459 out.push(' ');
1461 i = comment_end;
1462 }
1463 continue;
1464 }
1465 out.push('<');
1466 i += 1;
1467 continue;
1468 }
1469 }
1470 out.push(c);
1471 i += 1;
1472 }
1473 out
1474}
1475
1476fn verbatim_region_end(chars: &[char], i: usize) -> Option<usize> {
1478 let (name, _raw, self_closing, after_open) = open_tag(chars, i)?;
1479 if !matches!(
1480 name.as_str(),
1481 "pre" | "nowiki" | "math" | "source" | "syntaxhighlight"
1482 ) {
1483 return None;
1484 }
1485 if self_closing {
1486 return Some(after_open);
1487 }
1488 match close_tag(chars, after_open, &name) {
1489 Some((_, after)) => Some(after),
1490 None => Some(chars.len()),
1491 }
1492}
1493
/// Upper-case words accepted between `__` pairs as behavior switches.
///
/// `behavior_switch_at` only reports a match when the word it scanned is
/// listed here.
const BEHAVIOR_SWITCHES: &[&str] = &[
    "ARCHIVEDTALK",
    "DISAMBIG",
    "EXPECTUNUSEDCATEGORY",
    "EXPECTUNUSEDTEMPLATE",
    "FORCETOC",
    "HIDDENCAT",
    "INDEX",
    "NEWSECTIONLINK",
    "NOCC",
    "NOCONTENTCONVERT",
    "NOEDITSECTION",
    "NOGALLERY",
    "NOGLOBAL",
    "NOINDEX",
    "NONEWSECTIONLINK",
    "NOTC",
    "NOTITLECONVERT",
    "NOTOC",
    "STATICREDIRECT",
    "TOC",
];
1519
1520fn extract_behavior_switches(input: &str) -> (String, Vec<String>) {
1524 let chars: Vec<char> = input.chars().collect();
1525 let n = chars.len();
1526 let mut out = String::new();
1527 let mut found: Vec<String> = Vec::new();
1528 let mut i = 0;
1529 while i < n {
1530 if at(&chars, i) == Some('<')
1531 && let Some(after) = verbatim_region_end(&chars, i)
1532 {
1533 out.push_str(&collect_range(&chars, i, after));
1534 i = after;
1535 continue;
1536 }
1537 if at(&chars, i) == Some('_')
1538 && at(&chars, i + 1) == Some('_')
1539 && let Some((word, after)) = behavior_switch_at(&chars, i)
1540 {
1541 let key = word.to_ascii_lowercase();
1542 if !found.contains(&key) {
1543 found.push(key);
1544 }
1545 i = after;
1546 if out.is_empty() || out.ends_with('\n') {
1550 while matches!(at(&chars, i), Some(' ' | '\t')) {
1551 i += 1;
1552 }
1553 }
1554 continue;
1555 }
1556 if let Some(c) = at(&chars, i) {
1557 out.push(c);
1558 }
1559 i += 1;
1560 }
1561 (out, found)
1562}
1563
1564fn behavior_switch_at(chars: &[char], i: usize) -> Option<(String, usize)> {
1566 let start = i + 2;
1567 let mut j = start;
1568 while at(chars, j).is_some_and(|c| c.is_ascii_uppercase()) {
1569 j += 1;
1570 }
1571 let word = collect_range(chars, start, j);
1572 if word.is_empty()
1573 || at(chars, j) != Some('_')
1574 || at(chars, j + 1) != Some('_')
1575 || !BEHAVIOR_SWITCHES.contains(&word.as_str())
1576 {
1577 return None;
1578 }
1579 Some((word, j + 2))
1580}
1581
/// One element of the flattened stream walked by emphasis resolution.
enum Unit {
    /// A single `'`; `resolve_emphasis` emits one per apostrophe in a run.
    Apostrophe,
    /// An inline that is copied into the result as-is.
    Node(Inline),
}
1589
1590fn resolve_emphasis(toks: Vec<Tok>) -> Vec<Inline> {
1596 let mut units: Vec<Unit> = Vec::new();
1597 for tok in toks {
1598 match tok {
1599 Tok::Inline(inline) => units.push(Unit::Node(inline)),
1600 Tok::Apostrophes(n) => units.extend((0..n).map(|_| Unit::Apostrophe)),
1601 Tok::BlockRaw(raw) => units.push(Unit::Node(raw_html(raw))),
1602 Tok::BlockBreak | Tok::Block(_) => {}
1603 }
1604 }
1605 let runs = apostrophe_runs(&units);
1606 let mut budget = units
1608 .len()
1609 .saturating_mul(8)
1610 .saturating_add(64)
1611 .min(200_000);
1612 let (nodes, _, _) = parse_runs(&units, &runs, 0, None, &mut budget);
1613 nodes
1614}
1615
1616fn apostrophe_runs(units: &[Unit]) -> Vec<usize> {
1618 let mut runs = vec![0usize; units.len()];
1619 for i in (0..units.len()).rev() {
1620 if matches!(units.get(i), Some(Unit::Apostrophe)) {
1621 let next = runs.get(i + 1).copied().unwrap_or(0);
1622 if let Some(slot) = runs.get_mut(i) {
1623 *slot = 1 + next;
1624 }
1625 }
1626 }
1627 runs
1628}
1629
/// Number of apostrophes in a delimiter: three for strong, two for emphasis.
fn emphasis_width(strong: bool) -> usize {
    match strong {
        true => 3,
        false => 2,
    }
}
1634
1635fn try_open(
1639 units: &[Unit],
1640 runs: &[usize],
1641 i: usize,
1642 strong: bool,
1643 budget: &mut usize,
1644) -> Option<(Inline, usize)> {
1645 if *budget == 0 {
1646 return None;
1647 }
1648 *budget -= 1;
1649 let width = emphasis_width(strong);
1650 let (body, next, closed) = parse_runs(units, runs, i + width, Some(strong), budget);
1651 if !closed || body.is_empty() {
1652 return None;
1653 }
1654 let body = strip_outer_whitespace(body);
1655 Some((
1656 if strong {
1657 Inline::Strong(body)
1658 } else {
1659 Inline::Emph(body)
1660 },
1661 next,
1662 ))
1663}
1664
1665fn parse_runs(
1672 units: &[Unit],
1673 runs: &[usize],
1674 start: usize,
1675 closer: Option<bool>,
1676 budget: &mut usize,
1677) -> (Vec<Inline>, usize, bool) {
1678 let mut nodes: Vec<Inline> = Vec::new();
1679 let mut pos = start;
1680 while let Some(unit) = units.get(pos) {
1681 match unit {
1682 Unit::Node(inline) => {
1683 nodes.push(inline.clone());
1684 pos += 1;
1685 }
1686 Unit::Apostrophe => {
1687 let run = runs.get(pos).copied().unwrap_or(0);
1688 if run >= emphasis_width(true)
1689 && closer != Some(true)
1690 && let Some((span, next)) = try_open(units, runs, pos, true, budget)
1691 {
1692 nodes.push(span);
1693 pos = next;
1694 continue;
1695 }
1696 if let Some(strong) = closer
1697 && run >= emphasis_width(strong)
1698 {
1699 return (nodes, pos + emphasis_width(strong), true);
1700 }
1701 if run >= emphasis_width(false)
1702 && closer != Some(false)
1703 && let Some((span, next)) = try_open(units, runs, pos, false, budget)
1704 {
1705 nodes.push(span);
1706 pos = next;
1707 continue;
1708 }
1709 nodes.push(Inline::Str("'".into()));
1710 pos += 1;
1711 }
1712 }
1713 }
1714 (nodes, pos, closer.is_none())
1715}
1716
1717fn strip_outer_whitespace(mut inlines: Vec<Inline>) -> Vec<Inline> {
1719 let lead = inlines
1720 .iter()
1721 .take_while(|x| matches!(x, Inline::Space | Inline::SoftBreak))
1722 .count();
1723 inlines.drain(0..lead);
1724 while matches!(inlines.last(), Some(Inline::Space | Inline::SoftBreak)) {
1725 inlines.pop();
1726 }
1727 inlines
1728}
1729
/// One element of the flattened stream used for smart double-quote pairing.
enum SmartUnit {
    /// A `"` character found inside an `Inline::Str`.
    Quote,
    /// Any other character of an `Inline::Str`.
    Ch(char),
    /// A `Space`, `SoftBreak` or `LineBreak` inline.
    Space(Inline),
    /// Every other inline, kept whole.
    Node(Inline),
}
1739
1740fn apply_smart_quotes(inlines: Vec<Inline>) -> Vec<Inline> {
1745 let recursed: Vec<Inline> = inlines.into_iter().map(smart_descend).collect();
1746 let units = flatten_smart(recursed);
1747 resolve_double_quotes(&units, 0, units.len())
1748}
1749
1750fn smart_descend(inline: Inline) -> Inline {
1753 match inline {
1754 Inline::Emph(v) => Inline::Emph(apply_smart_quotes(v)),
1755 Inline::Underline(v) => Inline::Underline(apply_smart_quotes(v)),
1756 Inline::Strong(v) => Inline::Strong(apply_smart_quotes(v)),
1757 Inline::Strikeout(v) => Inline::Strikeout(apply_smart_quotes(v)),
1758 Inline::Superscript(v) => Inline::Superscript(apply_smart_quotes(v)),
1759 Inline::Subscript(v) => Inline::Subscript(apply_smart_quotes(v)),
1760 Inline::SmallCaps(v) => Inline::SmallCaps(apply_smart_quotes(v)),
1761 Inline::Quoted(quote_type, v) => Inline::Quoted(quote_type, apply_smart_quotes(v)),
1762 Inline::Span(attr, v) => Inline::Span(attr, apply_smart_quotes(v)),
1763 Inline::Link(attr, v, target) => Inline::Link(attr, apply_smart_quotes(v), target),
1764 Inline::Image(attr, v, target) => Inline::Image(attr, apply_smart_quotes(v), target),
1765 other => other,
1766 }
1767}
1768
1769fn flatten_smart(inlines: Vec<Inline>) -> Vec<SmartUnit> {
1770 let mut units: Vec<SmartUnit> = Vec::new();
1771 for inline in inlines {
1772 match inline {
1773 Inline::Str(text) => {
1774 for c in text.chars() {
1775 if c == '"' {
1776 units.push(SmartUnit::Quote);
1777 } else {
1778 units.push(SmartUnit::Ch(c));
1779 }
1780 }
1781 }
1782 space @ (Inline::Space | Inline::SoftBreak | Inline::LineBreak) => {
1783 units.push(SmartUnit::Space(space));
1784 }
1785 other => units.push(SmartUnit::Node(other)),
1786 }
1787 }
1788 units
1789}
1790
1791fn resolve_double_quotes(units: &[SmartUnit], lo: usize, hi: usize) -> Vec<Inline> {
1792 let mut out: Vec<Inline> = Vec::new();
1793 let mut buf = String::new();
1794 let mut i = lo;
1795 while i < hi {
1796 match units.get(i) {
1797 Some(SmartUnit::Quote) => {
1798 if smart_quote_opens(units, i, hi)
1799 && let Some(j) = next_smart_quote(units, i + 1, hi)
1800 {
1801 flush_smart_buf(&mut buf, &mut out);
1802 out.push(Inline::Quoted(
1803 QuoteType::DoubleQuote,
1804 strip_outer_whitespace(resolve_double_quotes(units, i + 1, j)),
1805 ));
1806 i = j + 1;
1807 } else {
1808 buf.push('"');
1809 i += 1;
1810 }
1811 }
1812 Some(SmartUnit::Ch(c)) => {
1813 buf.push(*c);
1814 i += 1;
1815 }
1816 Some(SmartUnit::Space(inline) | SmartUnit::Node(inline)) => {
1817 flush_smart_buf(&mut buf, &mut out);
1818 out.push(inline.clone());
1819 i += 1;
1820 }
1821 None => break,
1822 }
1823 }
1824 flush_smart_buf(&mut buf, &mut out);
1825 out
1826}
1827
1828fn flush_smart_buf(buf: &mut String, out: &mut Vec<Inline>) {
1829 if !buf.is_empty() {
1830 out.push(Inline::Str(std::mem::take(buf).into()));
1831 }
1832}
1833
1834fn smart_quote_opens(units: &[SmartUnit], i: usize, hi: usize) -> bool {
1837 if i + 1 >= hi {
1838 return false;
1839 }
1840 match units.get(i + 1) {
1841 Some(SmartUnit::Ch(c)) => !c.is_whitespace(),
1842 Some(SmartUnit::Quote | SmartUnit::Node(_)) => true,
1843 Some(SmartUnit::Space(_)) | None => false,
1844 }
1845}
1846
1847fn next_smart_quote(units: &[SmartUnit], from: usize, hi: usize) -> Option<usize> {
1848 (from..hi).find(|&j| matches!(units.get(j), Some(SmartUnit::Quote)))
1849}
1850
1851fn coalesce(inlines: Vec<Inline>) -> Vec<Inline> {
1854 let mut out: Vec<Inline> = Vec::new();
1855 for inline in inlines {
1856 let inline = match inline {
1857 Inline::Emph(xs) => Inline::Emph(coalesce(xs)),
1858 Inline::Strong(xs) => Inline::Strong(coalesce(xs)),
1859 Inline::Strikeout(xs) => Inline::Strikeout(coalesce(xs)),
1860 Inline::Superscript(xs) => Inline::Superscript(coalesce(xs)),
1861 Inline::Subscript(xs) => Inline::Subscript(coalesce(xs)),
1862 Inline::Underline(xs) => Inline::Underline(coalesce(xs)),
1863 Inline::SmallCaps(xs) => Inline::SmallCaps(coalesce(xs)),
1864 Inline::Span(attr, xs) => Inline::Span(attr, coalesce(xs)),
1865 other => other,
1866 };
1867 match (out.last_mut(), &inline) {
1868 (Some(Inline::Str(prev)), Inline::Str(next)) => prev.push_str(next),
1869 (
1873 Some(slot @ (Inline::Space | Inline::SoftBreak)),
1874 Inline::Space | Inline::SoftBreak,
1875 ) => {
1876 if matches!(inline, Inline::SoftBreak) {
1877 *slot = Inline::SoftBreak;
1878 }
1879 }
1880 _ => out.push(inline),
1881 }
1882 }
1883 out
1884}
1885
1886fn drop_east_asian_breaks(inlines: Vec<Inline>) -> Vec<Inline> {
1890 let mut out: Vec<Inline> = Vec::with_capacity(inlines.len());
1891 let mut iter = inlines.into_iter().peekable();
1892 while let Some(inline) = iter.next() {
1893 if matches!(inline, Inline::SoftBreak) {
1894 let prev_wide = out.last().and_then(trailing_char).is_some_and(is_wide_char);
1895 let next_wide = iter.peek().and_then(leading_char).is_some_and(is_wide_char);
1896 if prev_wide && next_wide {
1897 continue;
1898 }
1899 }
1900 out.push(inline);
1901 }
1902 out
1903}
1904
1905fn trailing_char(inline: &Inline) -> Option<char> {
1908 match inline {
1909 Inline::Str(s) | Inline::Code(_, s) | Inline::Math(_, s) | Inline::RawInline(_, s) => {
1910 s.chars().next_back()
1911 }
1912 Inline::Emph(xs)
1913 | Inline::Underline(xs)
1914 | Inline::Strong(xs)
1915 | Inline::Strikeout(xs)
1916 | Inline::Superscript(xs)
1917 | Inline::Subscript(xs)
1918 | Inline::SmallCaps(xs)
1919 | Inline::Quoted(_, xs)
1920 | Inline::Span(_, xs)
1921 | Inline::Link(_, xs, _)
1922 | Inline::Cite(_, xs) => xs.iter().rev().find_map(trailing_char),
1923 _ => None,
1924 }
1925}
1926
1927fn leading_char(inline: &Inline) -> Option<char> {
1930 match inline {
1931 Inline::Str(s) | Inline::Code(_, s) | Inline::Math(_, s) | Inline::RawInline(_, s) => {
1932 s.chars().next()
1933 }
1934 Inline::Emph(xs)
1935 | Inline::Underline(xs)
1936 | Inline::Strong(xs)
1937 | Inline::Strikeout(xs)
1938 | Inline::Superscript(xs)
1939 | Inline::Subscript(xs)
1940 | Inline::SmallCaps(xs)
1941 | Inline::Quoted(_, xs)
1942 | Inline::Span(_, xs)
1943 | Inline::Link(_, xs, _)
1944 | Inline::Cite(_, xs) => xs.iter().find_map(leading_char),
1945 _ => None,
1946 }
1947}
1948
/// Whether `c` falls in one of the code-point ranges this reader treats as
/// wide for the purpose of dropping soft breaks.
fn is_wide_char(c: char) -> bool {
    // Inclusive (first, last) code-point bounds.
    const WIDE_RANGES: [(u32, u32); 19] = [
        (0x1100, 0x115F),
        (0x2329, 0x232A),
        (0x2E80, 0x303E),
        (0x3041, 0x33FF),
        (0x3400, 0x4DBF),
        (0x4E00, 0x9FFF),
        (0xA000, 0xA4CF),
        (0xA960, 0xA97F),
        (0xAC00, 0xD7A3),
        (0xF900, 0xFAFF),
        (0xFE10, 0xFE19),
        (0xFE30, 0xFE6F),
        (0xFF00, 0xFF60),
        (0xFFE0, 0xFFE6),
        (0x1B000, 0x1B2FF),
        (0x1F200, 0x1F2FF),
        (0x1F300, 0x1F64F),
        (0x1F900, 0x1F9FF),
        (0x20000, 0x3FFFD),
    ];
    let cp = u32::from(c);
    WIDE_RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&cp))
}
1975
1976fn preformat_transform(inlines: Vec<Inline>) -> Vec<Inline> {
1982 let mut out: Vec<Inline> = Vec::new();
1983 let mut run = String::new();
1984 for inline in inlines {
1985 match inline {
1986 Inline::Str(s) => run.push_str(&s.replace(' ', "\u{a0}")),
1987 Inline::Space | Inline::SoftBreak => run.push('\u{a0}'),
1988 other => {
1989 if !run.is_empty() {
1990 out.push(Inline::Code(
1991 Box::default(),
1992 std::mem::take(&mut run).into(),
1993 ));
1994 }
1995 out.push(preformat_descend(other));
1996 }
1997 }
1998 }
1999 if !run.is_empty() {
2000 out.push(Inline::Code(Box::default(), run.into()));
2001 }
2002 out
2003}
2004
2005fn preformat_descend(inline: Inline) -> Inline {
2008 match inline {
2009 Inline::Emph(xs) => Inline::Emph(preformat_transform(xs)),
2010 Inline::Strong(xs) => Inline::Strong(preformat_transform(xs)),
2011 Inline::Strikeout(xs) => Inline::Strikeout(preformat_transform(xs)),
2012 Inline::Superscript(xs) => Inline::Superscript(preformat_transform(xs)),
2013 Inline::Subscript(xs) => Inline::Subscript(preformat_transform(xs)),
2014 Inline::Underline(xs) => Inline::Underline(preformat_transform(xs)),
2015 Inline::SmallCaps(xs) => Inline::SmallCaps(preformat_transform(xs)),
2016 Inline::Span(attr, xs) => Inline::Span(attr, preformat_transform(xs)),
2017 Inline::Link(attr, xs, target) => Inline::Link(attr, preformat_transform(xs), target),
2018 other => other,
2019 }
2020}
2021
2022fn plain_inlines(text: &str) -> Vec<Inline> {
2027 let chars: Vec<char> = text.chars().collect();
2028 let n = chars.len();
2029 let mut out: Vec<Inline> = Vec::new();
2030 let mut word = String::new();
2031 let mut i = 0;
2032 while i < n {
2033 let Some(c) = at(&chars, i) else { break };
2034 if c.is_whitespace() {
2035 if !word.is_empty() {
2036 out.push(Inline::Str(std::mem::take(&mut word).into()));
2037 }
2038 let (token, next) = whitespace_token(&chars, i);
2039 out.push(token);
2040 i = next;
2041 } else if c == '&' {
2042 if let Some((decoded, next)) = entities::read_reference(&chars, i, chars.len(), true) {
2043 word.push_str(&decoded);
2044 i = next;
2045 } else {
2046 word.push('&');
2047 i += 1;
2048 }
2049 } else {
2050 word.push(c);
2051 i += 1;
2052 }
2053 }
2054 if !word.is_empty() {
2055 out.push(Inline::Str(word.into()));
2056 }
2057 out
2058}
2059
2060fn decode_entities(text: &str) -> String {
2062 let chars: Vec<char> = text.chars().collect();
2063 let n = chars.len();
2064 let mut out = String::new();
2065 let mut i = 0;
2066 while i < n {
2067 if at(&chars, i) == Some('&')
2068 && let Some((decoded, next)) = entities::read_reference(&chars, i, chars.len(), true)
2069 {
2070 out.push_str(&decoded);
2071 i = next;
2072 continue;
2073 }
2074 if let Some(c) = at(&chars, i) {
2075 out.push(c);
2076 }
2077 i += 1;
2078 }
2079 out
2080}
2081
2082fn is_scheme(name: &str) -> bool {
2087 let lower = name.to_ascii_lowercase();
2088 crate::url_schemes::is_scheme(&lower) || lower == "doi" || lower == "javascript"
2089}
2090
2091fn is_url(text: &str) -> bool {
2094 match text.split_once(':') {
2095 Some((scheme, _)) => {
2096 !scheme.is_empty()
2097 && scheme
2098 .chars()
2099 .all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
2100 && is_scheme(scheme)
2101 }
2102 None => false,
2103 }
2104}
2105
2106fn url_scheme_len(chars: &[char], i: usize) -> Option<usize> {
2110 let mut j = i;
2111 let mut name = String::new();
2112 while let Some(c) = at(chars, j) {
2113 if c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.') {
2114 name.push(c);
2115 j += 1;
2116 } else {
2117 break;
2118 }
2119 }
2120 if name.is_empty() || at(chars, j) != Some(':') || !is_scheme(&name) {
2121 return None;
2122 }
2123 Some(j - i + 1)
2124}
2125
2126fn bare_url(chars: &[char], i: usize) -> Option<(Inline, usize)> {
2131 let scheme_len = url_scheme_len(chars, i)?;
2132 let mut j = i + scheme_len;
2133 while let Some(c) = at(chars, j) {
2134 if c.is_whitespace() || matches!(c, '<' | '>') {
2135 break;
2136 }
2137 if c == '\'' && at(chars, j + 1) == Some('\'') {
2139 break;
2140 }
2141 j += 1;
2142 }
2143 if j <= i + scheme_len {
2144 return None;
2145 }
2146 let mut display = collect_range(chars, i, j);
2147 trim_url_trailing(&mut display);
2148 if display.is_empty() {
2149 return None;
2150 }
2151 let consumed = display.chars().count();
2152 let target = encode_url_target(&display);
2153 Some((
2154 Inline::Link(
2155 Box::default(),
2156 vec![Inline::Str(display.into())],
2157 Box::new(Target {
2158 url: target.into(),
2159 title: carta_ast::Text::default(),
2160 }),
2161 ),
2162 i + consumed,
2163 ))
2164}
2165
/// Trims trailing characters that are unlikely to belong to a URL.
///
/// Sentence punctuation (`. , ; : ! ? " * ~ ' |`) is always removed from the
/// end. A closing `)`, `]` or `}` is removed only while the URL contains more
/// closers of that kind than openers, so balanced pairs such as
/// `.../Foo_(bar)` are kept.
///
/// Bracket counts are gathered in a single pass and updated as closers are
/// popped, which keeps the whole trim linear in the URL length even for a
/// long run of trailing brackets.
fn trim_url_trailing(url: &mut String) {
    // Surplus of closers over openers, per bracket kind. Openers are never
    // popped, so only a removed closer changes these values.
    let (mut paren, mut square, mut curly) = (0isize, 0isize, 0isize);
    for c in url.chars() {
        match c {
            '(' => paren -= 1,
            ')' => paren += 1,
            '[' => square -= 1,
            ']' => square += 1,
            '{' => curly -= 1,
            '}' => curly += 1,
            _ => {}
        }
    }

    while let Some(last) = url.chars().next_back() {
        let surplus = match last {
            '.' | ',' | ';' | ':' | '!' | '?' | '"' | '*' | '~' | '\'' | '|' => {
                url.pop();
                continue;
            }
            ')' => &mut paren,
            ']' => &mut square,
            '}' => &mut curly,
            _ => break,
        };
        // A balanced closer ends the trim.
        if *surplus <= 0 {
            break;
        }
        *surplus -= 1;
        url.pop();
    }
}
2188
/// Percent-encodes the characters space, `"`, `` ` ``, `^`, `[`, `]`, `{`,
/// `}` and `|` in a URL; every other character is copied as-is.
fn encode_url_target(url: &str) -> String {
    let mut encoded = String::with_capacity(url.len());
    for ch in url.chars() {
        let escape = match ch {
            ' ' => "%20",
            '"' => "%22",
            '`' => "%60",
            '^' => "%5E",
            '[' => "%5B",
            ']' => "%5D",
            '{' => "%7B",
            '}' => "%7D",
            '|' => "%7C",
            _ => {
                encoded.push(ch);
                continue;
            }
        };
        encoded.push_str(escape);
    }
    encoded
}
2208
/// Builds the URL form of a wikilink target: each run of whitespace that is
/// followed by more text becomes a single `_`; trailing whitespace is
/// dropped.
fn wikilink_url(target: &str) -> String {
    let mut url = String::new();
    let mut gap_pending = false;
    for ch in target.chars() {
        if ch.is_whitespace() {
            gap_pending = true;
            continue;
        }
        // Emit the separator lazily so trailing whitespace leaves no trace.
        if std::mem::take(&mut gap_pending) {
            url.push('_');
        }
        url.push(ch);
    }
    url
}
2227
2228fn title_text(inlines: &[Inline]) -> String {
2233 let mut out = String::new();
2234 push_title_text(inlines, &mut out);
2235 out
2236}
2237
2238fn push_title_text(inlines: &[Inline], out: &mut String) {
2239 for inline in inlines {
2240 match inline {
2241 Inline::Str(text) | Inline::Code(_, text) | Inline::Math(_, text) => out.push_str(text),
2242 Inline::Space | Inline::SoftBreak | Inline::LineBreak => out.push(' '),
2243 Inline::Quoted(QuoteType::SingleQuote, xs) => {
2244 out.push('\u{2018}');
2245 push_title_text(xs, out);
2246 out.push('\u{2019}');
2247 }
2248 Inline::Quoted(QuoteType::DoubleQuote, xs) => {
2249 out.push('\u{201c}');
2250 push_title_text(xs, out);
2251 out.push('\u{201d}');
2252 }
2253 Inline::Emph(xs)
2254 | Inline::Underline(xs)
2255 | Inline::Strong(xs)
2256 | Inline::Strikeout(xs)
2257 | Inline::Superscript(xs)
2258 | Inline::Subscript(xs)
2259 | Inline::SmallCaps(xs)
2260 | Inline::Cite(_, xs)
2261 | Inline::Link(_, xs, _)
2262 | Inline::Image(_, xs, _)
2263 | Inline::Span(_, xs) => push_title_text(xs, out),
2264 Inline::RawInline(..) | Inline::Note(_) => {}
2265 }
2266 }
2267}
2268
/// Lower-cased, trimmed text before the first `:` of `target`.
///
/// Returns `None` when there is no `:` or when `target` starts with one.
fn namespace_of(target: &str) -> Option<String> {
    target
        .split_once(':')
        // An empty prefix means the target itself begins with ':'.
        .filter(|(prefix, _)| !prefix.is_empty())
        .map(|(prefix, _)| prefix.trim().to_lowercase())
}
2276
/// The trimmed text after the first `:` of `target`, or `target` unchanged
/// (and untrimmed) when it contains no `:`.
fn strip_namespace(target: &str) -> &str {
    target
        .split_once(':')
        .map_or(target, |(_, rest)| rest.trim())
}
2286
/// Parses an image size option of the form `<w>px`, `<w>x<h>px` or
/// `x<h>px` into a width and optional height, both kept as digit strings.
///
/// With an `x` present the width may be empty but the height may not;
/// without an `x` the width must be non-empty. Anything else gives `None`.
fn image_size(param: &str) -> Option<(String, Option<String>)> {
    let dims = param.strip_suffix("px")?;
    let numeric = |s: &str| s.bytes().all(|b| b.is_ascii_digit());

    let (width, height) = match dims.split_once('x') {
        Some((w, h)) => (w, Some(h)),
        None => (dims, None),
    };
    let valid = match height {
        Some(h) => numeric(width) && !h.is_empty() && numeric(h),
        None => !width.is_empty() && numeric(width),
    };
    if valid {
        Some((String::from(width), height.map(String::from)))
    } else {
        None
    }
}
2303
/// Whether an image option makes `image_embed` give up on the embed: a
/// `thumbtime=` or `upright=` assignment, or the bare word `thumbtime`
/// (all matched ASCII-case-insensitively after trimming).
fn image_param_declines(param: &str) -> bool {
    if let Some((key, _)) = param.split_once('=') {
        let key = key.trim();
        return key.eq_ignore_ascii_case("thumbtime") || key.eq_ignore_ascii_case("upright");
    }
    param.trim().eq_ignore_ascii_case("thumbtime")
}
2317
/// Whether `param` is a `key=value` option whose key is `alt`, `link`,
/// `class` or `page` (trimmed, ASCII-case-insensitive).
fn is_recognized_image_attr(param: &str) -> bool {
    const KNOWN_KEYS: [&str; 4] = ["alt", "link", "class", "page"];
    match param.split_once('=') {
        None => false,
        Some((key, _)) => {
            let key = key.trim();
            KNOWN_KEYS.iter().any(|known| key.eq_ignore_ascii_case(known))
        }
    }
}
2330
/// Whether `param` is one of the bare image layout keywords (format,
/// alignment and vertical-alignment words), compared ASCII-case-insensitively
/// and without trimming.
fn is_image_keyword(param: &str) -> bool {
    const KEYWORDS: [&str; 20] = [
        "thumb",
        "thumbnail",
        "frame",
        "framed",
        "frameless",
        "border",
        "left",
        "right",
        "center",
        "centre",
        "none",
        "upright",
        "baseline",
        "sub",
        "super",
        "top",
        "text-top",
        "middle",
        "bottom",
        "text-bottom",
    ];
    KEYWORDS
        .iter()
        .any(|keyword| param.eq_ignore_ascii_case(keyword))
}
2358
2359fn para_or_figure(inlines: Vec<Inline>) -> Block {
2362 match lone_image_figure(&inlines) {
2363 Some(figure) => figure,
2364 None => Block::Para(inlines),
2365 }
2366}
2367
2368fn plain_or_figure(inlines: Vec<Inline>) -> Block {
2370 match lone_image_figure(&inlines) {
2371 Some(figure) => figure,
2372 None => Block::Plain(inlines),
2373 }
2374}
2375
2376fn lone_image_figure(inlines: &[Inline]) -> Option<Block> {
2379 let mut significant = inlines.iter().filter(|inline| {
2380 !matches!(
2381 inline,
2382 Inline::Space | Inline::SoftBreak | Inline::LineBreak
2383 )
2384 });
2385 let Inline::Image(attr, alt, target) = significant.next()? else {
2386 return None;
2387 };
2388 if significant.next().is_some() {
2389 return None;
2390 }
2391 let caption = Caption {
2392 short: None,
2393 long: vec![Block::Plain(alt.clone())],
2394 };
2395 let image = Inline::Image(attr.clone(), Vec::new(), target.clone());
2396 Some(Block::Figure(
2397 Box::default(),
2398 Box::new(caption),
2399 vec![Block::Plain(vec![image])],
2400 ))
2401}
2402
2403fn emoji_to_aliases(text: &str) -> String {
2409 let mut out = String::new();
2410 let mut rest = text;
2411 while !rest.is_empty() {
2412 if let Some((alias, len)) = emoji::alias_at(rest) {
2413 out.push_str(alias);
2414 rest = rest.get(len..).unwrap_or("");
2415 } else if let Some(ch) = rest.chars().next() {
2416 out.push(ch);
2417 rest = rest.get(ch.len_utf8()..).unwrap_or("");
2418 } else {
2419 break;
2420 }
2421 }
2422 out
2423}
2424
/// Derives a heading identifier from plain text.
///
/// Whitespace runs and hyphens become `_`, alphanumerics are lower-cased,
/// `_` and `.` are kept, and every other character is dropped (without
/// ending a whitespace run). Everything before the first alphabetic
/// character of the result is then removed.
fn mediawiki_slug(text: &str) -> String {
    let mut slug = String::new();
    let mut in_gap = false;
    for ch in text.chars() {
        match ch {
            c if c.is_whitespace() => {
                // Only the first whitespace of a run emits a separator.
                if !std::mem::replace(&mut in_gap, true) {
                    slug.push('_');
                }
            }
            '-' => {
                slug.push('_');
                in_gap = false;
            }
            c if c.is_alphanumeric() || c == '_' || c == '.' => {
                slug.extend(c.to_lowercase());
                in_gap = false;
            }
            _ => {}
        }
    }
    let first_letter = slug.find(char::is_alphabetic).unwrap_or(slug.len());
    slug.split_off(first_letter)
}
2450
2451fn transliterate_ascii(text: &str) -> String {
2456 let mut out = String::with_capacity(text.len());
2457 for ch in text.chars() {
2458 if ch.is_ascii() {
2459 out.push(ch);
2460 } else if let Ok(index) = ASCII_FOLD.binary_search_by(|&(cp, _)| cp.cmp(&(ch as u32)))
2461 && let Some(&(_, byte)) = ASCII_FOLD.get(index)
2462 {
2463 out.push(byte as char);
2464 }
2465 }
2466 out
2467}
2468
2469const ASCII_FOLD: &[(u32, u8)] = &[
2473 (0x00C0, b'a'),
2474 (0x00C1, b'a'),
2475 (0x00C2, b'a'),
2476 (0x00C3, b'a'),
2477 (0x00C4, b'a'),
2478 (0x00C5, b'a'),
2479 (0x00C7, b'c'),
2480 (0x00C8, b'e'),
2481 (0x00C9, b'e'),
2482 (0x00CA, b'e'),
2483 (0x00CB, b'e'),
2484 (0x00CC, b'i'),
2485 (0x00CD, b'i'),
2486 (0x00CE, b'i'),
2487 (0x00CF, b'i'),
2488 (0x00D1, b'n'),
2489 (0x00D2, b'o'),
2490 (0x00D3, b'o'),
2491 (0x00D4, b'o'),
2492 (0x00D5, b'o'),
2493 (0x00D6, b'o'),
2494 (0x00D9, b'u'),
2495 (0x00DA, b'u'),
2496 (0x00DB, b'u'),
2497 (0x00DC, b'u'),
2498 (0x00DD, b'y'),
2499 (0x00E0, b'a'),
2500 (0x00E1, b'a'),
2501 (0x00E2, b'a'),
2502 (0x00E3, b'a'),
2503 (0x00E4, b'a'),
2504 (0x00E5, b'a'),
2505 (0x00E7, b'c'),
2506 (0x00E8, b'e'),
2507 (0x00E9, b'e'),
2508 (0x00EA, b'e'),
2509 (0x00EB, b'e'),
2510 (0x00EC, b'i'),
2511 (0x00ED, b'i'),
2512 (0x00EE, b'i'),
2513 (0x00EF, b'i'),
2514 (0x00F1, b'n'),
2515 (0x00F2, b'o'),
2516 (0x00F3, b'o'),
2517 (0x00F4, b'o'),
2518 (0x00F5, b'o'),
2519 (0x00F6, b'o'),
2520 (0x00F9, b'u'),
2521 (0x00FA, b'u'),
2522 (0x00FB, b'u'),
2523 (0x00FC, b'u'),
2524 (0x00FD, b'y'),
2525 (0x00FF, b'y'),
2526 (0x0100, b'a'),
2527 (0x0101, b'a'),
2528 (0x0102, b'a'),
2529 (0x0103, b'a'),
2530 (0x0104, b'a'),
2531 (0x0105, b'a'),
2532 (0x0106, b'c'),
2533 (0x0107, b'c'),
2534 (0x0108, b'c'),
2535 (0x0109, b'c'),
2536 (0x010A, b'c'),
2537 (0x010B, b'c'),
2538 (0x010C, b'c'),
2539 (0x010D, b'c'),
2540 (0x010E, b'd'),
2541 (0x010F, b'd'),
2542 (0x0112, b'e'),
2543 (0x0113, b'e'),
2544 (0x0114, b'e'),
2545 (0x0115, b'e'),
2546 (0x0116, b'e'),
2547 (0x0117, b'e'),
2548 (0x0118, b'e'),
2549 (0x0119, b'e'),
2550 (0x011A, b'e'),
2551 (0x011B, b'e'),
2552 (0x011C, b'g'),
2553 (0x011D, b'g'),
2554 (0x011E, b'g'),
2555 (0x011F, b'g'),
2556 (0x0120, b'g'),
2557 (0x0121, b'g'),
2558 (0x0122, b'g'),
2559 (0x0123, b'g'),
2560 (0x0124, b'h'),
2561 (0x0125, b'h'),
2562 (0x0128, b'i'),
2563 (0x0129, b'i'),
2564 (0x012A, b'i'),
2565 (0x012B, b'i'),
2566 (0x012C, b'i'),
2567 (0x012D, b'i'),
2568 (0x012E, b'i'),
2569 (0x012F, b'i'),
2570 (0x0130, b'i'),
2571 (0x0134, b'j'),
2572 (0x0135, b'j'),
2573 (0x0136, b'k'),
2574 (0x0137, b'k'),
2575 (0x0139, b'l'),
2576 (0x013A, b'l'),
2577 (0x013B, b'l'),
2578 (0x013C, b'l'),
2579 (0x013D, b'l'),
2580 (0x013E, b'l'),
2581 (0x0143, b'n'),
2582 (0x0144, b'n'),
2583 (0x0145, b'n'),
2584 (0x0146, b'n'),
2585 (0x0147, b'n'),
2586 (0x0148, b'n'),
2587 (0x014C, b'o'),
2588 (0x014D, b'o'),
2589 (0x014E, b'o'),
2590 (0x014F, b'o'),
2591 (0x0150, b'o'),
2592 (0x0151, b'o'),
2593 (0x0154, b'r'),
2594 (0x0155, b'r'),
2595 (0x0156, b'r'),
2596 (0x0157, b'r'),
2597 (0x0158, b'r'),
2598 (0x0159, b'r'),
2599 (0x015A, b's'),
2600 (0x015B, b's'),
2601 (0x015C, b's'),
2602 (0x015D, b's'),
2603 (0x015E, b's'),
2604 (0x015F, b's'),
2605 (0x0160, b's'),
2606 (0x0161, b's'),
2607 (0x0162, b't'),
2608 (0x0163, b't'),
2609 (0x0164, b't'),
2610 (0x0165, b't'),
2611 (0x0168, b'u'),
2612 (0x0169, b'u'),
2613 (0x016A, b'u'),
2614 (0x016B, b'u'),
2615 (0x016C, b'u'),
2616 (0x016D, b'u'),
2617 (0x016E, b'u'),
2618 (0x016F, b'u'),
2619 (0x0170, b'u'),
2620 (0x0171, b'u'),
2621 (0x0172, b'u'),
2622 (0x0173, b'u'),
2623 (0x0174, b'w'),
2624 (0x0175, b'w'),
2625 (0x0176, b'y'),
2626 (0x0177, b'y'),
2627 (0x0178, b'y'),
2628 (0x0179, b'z'),
2629 (0x017A, b'z'),
2630 (0x017B, b'z'),
2631 (0x017C, b'z'),
2632 (0x017D, b'z'),
2633 (0x017E, b'z'),
2634 (0x01A0, b'o'),
2635 (0x01A1, b'o'),
2636 (0x01AF, b'u'),
2637 (0x01B0, b'u'),
2638 (0x01CD, b'a'),
2639 (0x01CE, b'a'),
2640 (0x01CF, b'i'),
2641 (0x01D0, b'i'),
2642 (0x01D1, b'o'),
2643 (0x01D2, b'o'),
2644 (0x01D3, b'u'),
2645 (0x01D4, b'u'),
2646 (0x01D5, b'u'),
2647 (0x01D6, b'u'),
2648 (0x01D7, b'u'),
2649 (0x01D8, b'u'),
2650 (0x01D9, b'u'),
2651 (0x01DA, b'u'),
2652 (0x01DB, b'u'),
2653 (0x01DC, b'u'),
2654 (0x01DE, b'a'),
2655 (0x01DF, b'a'),
2656 (0x01E0, b'a'),
2657 (0x01E1, b'a'),
2658 (0x01E6, b'g'),
2659 (0x01E7, b'g'),
2660 (0x01E8, b'k'),
2661 (0x01E9, b'k'),
2662 (0x01EA, b'o'),
2663 (0x01EB, b'o'),
2664 (0x01EC, b'o'),
2665 (0x01ED, b'o'),
2666 (0x01F0, b'j'),
2667 (0x01F4, b'g'),
2668 (0x01F5, b'g'),
2669 (0x01F8, b'n'),
2670 (0x01F9, b'n'),
2671 (0x01FA, b'a'),
2672 (0x01FB, b'a'),
2673 (0x0200, b'a'),
2674 (0x0201, b'a'),
2675 (0x0202, b'a'),
2676 (0x0203, b'a'),
2677 (0x0204, b'e'),
2678 (0x0205, b'e'),
2679 (0x0206, b'e'),
2680 (0x0207, b'e'),
2681 (0x0208, b'i'),
2682 (0x0209, b'i'),
2683 (0x020A, b'i'),
2684 (0x020B, b'i'),
2685 (0x020C, b'o'),
2686 (0x020D, b'o'),
2687 (0x020E, b'o'),
2688 (0x020F, b'o'),
2689 (0x0210, b'r'),
2690 (0x0211, b'r'),
2691 (0x0212, b'r'),
2692 (0x0213, b'r'),
2693 (0x0214, b'u'),
2694 (0x0215, b'u'),
2695 (0x0216, b'u'),
2696 (0x0217, b'u'),
2697 (0x0218, b's'),
2698 (0x0219, b's'),
2699 (0x021A, b't'),
2700 (0x021B, b't'),
2701 (0x021E, b'h'),
2702 (0x021F, b'h'),
2703 (0x0226, b'a'),
2704 (0x0227, b'a'),
2705 (0x0228, b'e'),
2706 (0x0229, b'e'),
2707 (0x022A, b'o'),
2708 (0x022B, b'o'),
2709 (0x022C, b'o'),
2710 (0x022D, b'o'),
2711 (0x022E, b'o'),
2712 (0x022F, b'o'),
2713 (0x0230, b'o'),
2714 (0x0231, b'o'),
2715 (0x0232, b'y'),
2716 (0x0233, b'y'),
2717 (0x1E00, b'a'),
2718 (0x1E01, b'a'),
2719 (0x1E02, b'b'),
2720 (0x1E03, b'b'),
2721 (0x1E04, b'b'),
2722 (0x1E05, b'b'),
2723 (0x1E06, b'b'),
2724 (0x1E07, b'b'),
2725 (0x1E08, b'c'),
2726 (0x1E09, b'c'),
2727 (0x1E0A, b'd'),
2728 (0x1E0B, b'd'),
2729 (0x1E0C, b'd'),
2730 (0x1E0D, b'd'),
2731 (0x1E0E, b'd'),
2732 (0x1E0F, b'd'),
2733 (0x1E10, b'd'),
2734 (0x1E11, b'd'),
2735 (0x1E12, b'd'),
2736 (0x1E13, b'd'),
2737 (0x1E14, b'e'),
2738 (0x1E15, b'e'),
2739 (0x1E16, b'e'),
2740 (0x1E17, b'e'),
2741 (0x1E18, b'e'),
2742 (0x1E19, b'e'),
2743 (0x1E1A, b'e'),
2744 (0x1E1B, b'e'),
2745 (0x1E1C, b'e'),
2746 (0x1E1D, b'e'),
2747 (0x1E1E, b'f'),
2748 (0x1E1F, b'f'),
2749 (0x1E20, b'g'),
2750 (0x1E21, b'g'),
2751 (0x1E22, b'h'),
2752 (0x1E23, b'h'),
2753 (0x1E24, b'h'),
2754 (0x1E25, b'h'),
2755 (0x1E26, b'h'),
2756 (0x1E27, b'h'),
2757 (0x1E28, b'h'),
2758 (0x1E29, b'h'),
2759 (0x1E2A, b'h'),
2760 (0x1E2B, b'h'),
2761 (0x1E2C, b'i'),
2762 (0x1E2D, b'i'),
2763 (0x1E2E, b'i'),
2764 (0x1E2F, b'i'),
2765 (0x1E30, b'k'),
2766 (0x1E31, b'k'),
2767 (0x1E32, b'k'),
2768 (0x1E33, b'k'),
2769 (0x1E34, b'k'),
2770 (0x1E35, b'k'),
2771 (0x1E36, b'l'),
2772 (0x1E37, b'l'),
2773 (0x1E38, b'l'),
2774 (0x1E39, b'l'),
2775 (0x1E3A, b'l'),
2776 (0x1E3B, b'l'),
2777 (0x1E3C, b'l'),
2778 (0x1E3D, b'l'),
2779 (0x1E3E, b'm'),
2780 (0x1E3F, b'm'),
2781 (0x1E40, b'm'),
2782 (0x1E41, b'm'),
2783 (0x1E42, b'm'),
2784 (0x1E43, b'm'),
2785 (0x1E44, b'n'),
2786 (0x1E45, b'n'),
2787 (0x1E46, b'n'),
2788 (0x1E47, b'n'),
2789 (0x1E48, b'n'),
2790 (0x1E49, b'n'),
2791 (0x1E4A, b'n'),
2792 (0x1E4B, b'n'),
2793 (0x1E4C, b'o'),
2794 (0x1E4D, b'o'),
2795 (0x1E4E, b'o'),
2796 (0x1E4F, b'o'),
2797 (0x1E50, b'o'),
2798 (0x1E51, b'o'),
2799 (0x1E52, b'o'),
2800 (0x1E53, b'o'),
2801 (0x1E54, b'p'),
2802 (0x1E55, b'p'),
2803 (0x1E56, b'p'),
2804 (0x1E57, b'p'),
2805 (0x1E58, b'r'),
2806 (0x1E59, b'r'),
2807 (0x1E5A, b'r'),
2808 (0x1E5B, b'r'),
2809 (0x1E5C, b'r'),
2810 (0x1E5D, b'r'),
2811 (0x1E5E, b'r'),
2812 (0x1E5F, b'r'),
2813 (0x1E60, b's'),
2814 (0x1E61, b's'),
2815 (0x1E62, b's'),
2816 (0x1E63, b's'),
2817 (0x1E64, b's'),
2818 (0x1E65, b's'),
2819 (0x1E66, b's'),
2820 (0x1E67, b's'),
2821 (0x1E68, b's'),
2822 (0x1E69, b's'),
2823 (0x1E6A, b't'),
2824 (0x1E6B, b't'),
2825 (0x1E6C, b't'),
2826 (0x1E6D, b't'),
2827 (0x1E6E, b't'),
2828 (0x1E6F, b't'),
2829 (0x1E70, b't'),
2830 (0x1E71, b't'),
2831 (0x1E72, b'u'),
2832 (0x1E73, b'u'),
2833 (0x1E74, b'u'),
2834 (0x1E75, b'u'),
2835 (0x1E76, b'u'),
2836 (0x1E77, b'u'),
2837 (0x1E78, b'u'),
2838 (0x1E79, b'u'),
2839 (0x1E7A, b'u'),
2840 (0x1E7B, b'u'),
2841 (0x1E7C, b'v'),
2842 (0x1E7D, b'v'),
2843 (0x1E7E, b'v'),
2844 (0x1E7F, b'v'),
2845 (0x1E80, b'w'),
2846 (0x1E81, b'w'),
2847 (0x1E82, b'w'),
2848 (0x1E83, b'w'),
2849 (0x1E84, b'w'),
2850 (0x1E85, b'w'),
2851 (0x1E86, b'w'),
2852 (0x1E87, b'w'),
2853 (0x1E88, b'w'),
2854 (0x1E89, b'w'),
2855 (0x1E8A, b'x'),
2856 (0x1E8B, b'x'),
2857 (0x1E8C, b'x'),
2858 (0x1E8D, b'x'),
2859 (0x1E8E, b'y'),
2860 (0x1E8F, b'y'),
2861 (0x1E90, b'z'),
2862 (0x1E91, b'z'),
2863 (0x1E92, b'z'),
2864 (0x1E93, b'z'),
2865 (0x1E94, b'z'),
2866 (0x1E95, b'z'),
2867 (0x1E96, b'h'),
2868 (0x1E97, b't'),
2869 (0x1E98, b'w'),
2870 (0x1E99, b'y'),
2871 (0x1EA0, b'a'),
2872 (0x1EA1, b'a'),
2873 (0x1EA2, b'a'),
2874 (0x1EA3, b'a'),
2875 (0x1EA4, b'a'),
2876 (0x1EA5, b'a'),
2877 (0x1EA6, b'a'),
2878 (0x1EA7, b'a'),
2879 (0x1EA8, b'a'),
2880 (0x1EA9, b'a'),
2881 (0x1EAA, b'a'),
2882 (0x1EAB, b'a'),
2883 (0x1EAC, b'a'),
2884 (0x1EAD, b'a'),
2885 (0x1EAE, b'a'),
2886 (0x1EAF, b'a'),
2887 (0x1EB0, b'a'),
2888 (0x1EB1, b'a'),
2889 (0x1EB2, b'a'),
2890 (0x1EB3, b'a'),
2891 (0x1EB4, b'a'),
2892 (0x1EB5, b'a'),
2893 (0x1EB6, b'a'),
2894 (0x1EB7, b'a'),
2895 (0x1EB8, b'e'),
2896 (0x1EB9, b'e'),
2897 (0x1EBA, b'e'),
2898 (0x1EBB, b'e'),
2899 (0x1EBC, b'e'),
2900 (0x1EBD, b'e'),
2901 (0x1EBE, b'e'),
2902 (0x1EBF, b'e'),
2903 (0x1EC0, b'e'),
2904 (0x1EC1, b'e'),
2905 (0x1EC2, b'e'),
2906 (0x1EC3, b'e'),
2907 (0x1EC4, b'e'),
2908 (0x1EC5, b'e'),
2909 (0x1EC6, b'e'),
2910 (0x1EC7, b'e'),
2911 (0x1EC8, b'i'),
2912 (0x1EC9, b'i'),
2913 (0x1ECA, b'i'),
2914 (0x1ECB, b'i'),
2915 (0x1ECC, b'o'),
2916 (0x1ECD, b'o'),
2917 (0x1ECE, b'o'),
2918 (0x1ECF, b'o'),
2919 (0x1ED0, b'o'),
2920 (0x1ED1, b'o'),
2921 (0x1ED2, b'o'),
2922 (0x1ED3, b'o'),
2923 (0x1ED4, b'o'),
2924 (0x1ED5, b'o'),
2925 (0x1ED6, b'o'),
2926 (0x1ED7, b'o'),
2927 (0x1ED8, b'o'),
2928 (0x1ED9, b'o'),
2929 (0x1EDA, b'o'),
2930 (0x1EDB, b'o'),
2931 (0x1EDC, b'o'),
2932 (0x1EDD, b'o'),
2933 (0x1EDE, b'o'),
2934 (0x1EDF, b'o'),
2935 (0x1EE0, b'o'),
2936 (0x1EE1, b'o'),
2937 (0x1EE2, b'o'),
2938 (0x1EE3, b'o'),
2939 (0x1EE4, b'u'),
2940 (0x1EE5, b'u'),
2941 (0x1EE6, b'u'),
2942 (0x1EE7, b'u'),
2943 (0x1EE8, b'u'),
2944 (0x1EE9, b'u'),
2945 (0x1EEA, b'u'),
2946 (0x1EEB, b'u'),
2947 (0x1EEC, b'u'),
2948 (0x1EED, b'u'),
2949 (0x1EEE, b'u'),
2950 (0x1EEF, b'u'),
2951 (0x1EF0, b'u'),
2952 (0x1EF1, b'u'),
2953 (0x1EF2, b'y'),
2954 (0x1EF3, b'y'),
2955 (0x1EF4, b'y'),
2956 (0x1EF5, b'y'),
2957 (0x1EF6, b'y'),
2958 (0x1EF7, b'y'),
2959 (0x1EF8, b'y'),
2960 (0x1EF9, b'y'),
2961 (0x212A, b'k'),
2962 (0x212B, b'a'),
2963];
2964
/// Parses an opening tag whose `<` sits at `start`.
///
/// The tag name is the run of ASCII alphanumerics right after the `<`; it is
/// returned lowercased. Everything up to the first `>` that is not inside a
/// single- or double-quoted span is treated as the tag's attribute text.
///
/// Returns `(name, raw, self_closing, after)` where `raw` is the tag text from
/// `<` through `>`, `self_closing` is true when the character directly before
/// the `>` is `/`, and `after` is the index just past the `>`. Returns `None`
/// when there is no name or no terminating `>`.
fn open_tag(chars: &[char], start: usize) -> Option<(String, String, bool, usize)> {
    let name_start = start + 1;
    let name: String = chars
        .iter()
        .skip(name_start)
        .take_while(|c| c.is_ascii_alphanumeric())
        .map(char::to_ascii_lowercase)
        .collect();
    if name.is_empty() {
        return None;
    }

    // The name is pure ASCII, so its byte length equals the characters consumed.
    let mut pos = name_start + name.len();
    let mut pending_quote: Option<char> = None;
    while let Some(&ch) = chars.get(pos) {
        match pending_quote {
            Some(opener) if ch == opener => pending_quote = None,
            Some(_) => {}
            None if ch == '>' => break,
            None if ch == '"' || ch == '\'' => pending_quote = Some(ch),
            None => {}
        }
        pos += 1;
    }
    if chars.get(pos) != Some(&'>') {
        return None;
    }

    let self_closing = pos > 0 && chars.get(pos - 1) == Some(&'/');
    let after = pos + 1;
    let raw: String = chars.iter().skip(start).take(after - start).collect();
    Some((name, raw, self_closing, after))
}
3013
3014fn close_tag(chars: &[char], start: usize, name: &str) -> Option<(usize, usize)> {
3017 let mut depth = 0i32;
3018 let mut j = start;
3019 let n = chars.len();
3020 while j < n {
3021 if at(chars, j) == Some('<') {
3022 if at(chars, j + 1) == Some('/') {
3023 if tag_name_matches(chars, j + 2, name) {
3024 if depth == 0 {
3025 let gt = find_char(chars, j, '>')?;
3026 return Some((j, gt + 1));
3027 }
3028 depth -= 1;
3029 }
3030 } else if tag_name_matches(chars, j + 1, name) {
3031 depth += 1;
3032 }
3033 }
3034 j += 1;
3035 }
3036 None
3037}
3038
3039fn enclosed(chars: &[char], start: usize, name: &str) -> (String, usize) {
3042 match close_tag(chars, start, name) {
3043 Some((inner_end, after)) => (collect_range(chars, start, inner_end), after),
3044 None => (collect_range(chars, start, chars.len()), chars.len()),
3045 }
3046}
3047
/// Reports whether the characters at `pos` spell `name` (ASCII
/// case-insensitively) and are immediately followed by whitespace, `>` or `/`.
///
/// The trailing delimiter is required, so a longer name that merely starts
/// with `name` does not match, and neither does a name that runs to the end of
/// the input.
fn tag_name_matches(chars: &[char], pos: usize, name: &str) -> bool {
    let mut cursor = pos;
    for expected in name.chars() {
        let same = chars
            .get(cursor)
            .is_some_and(|found| found.eq_ignore_ascii_case(&expected));
        if !same {
            return false;
        }
        cursor += 1;
    }
    matches!(chars.get(cursor), Some(&c) if c.is_whitespace() || c == '>' || c == '/')
}
3061
3062fn starts_block_tag(chars: &[char], pos: usize) -> bool {
3063 if at(chars, pos) != Some('<') {
3064 return false;
3065 }
3066 ["pre", "source", "syntaxhighlight", "blockquote", "ul", "ol"]
3067 .iter()
3068 .any(|name| tag_name_matches(chars, pos + 1, name))
3069}
3070
3071fn open_ref_depth(chars: &[char], start: usize, end: usize) -> i32 {
3075 let mut depth = 0i32;
3076 let mut i = start;
3077 while i < end {
3078 if at(chars, i) == Some('<') {
3079 if let Some(after) = verbatim_region_end(chars, i) {
3080 i = after;
3081 continue;
3082 }
3083 if at(chars, i + 1) == Some('/') {
3084 if tag_name_matches(chars, i + 2, "ref") {
3085 depth = (depth - 1).max(0);
3086 }
3087 } else if tag_name_matches(chars, i + 1, "ref")
3088 && let Some((_, _, self_closing, after)) = open_tag(chars, i)
3089 {
3090 if !self_closing {
3091 depth += 1;
3092 }
3093 i = after;
3094 continue;
3095 }
3096 }
3097 i += 1;
3098 }
3099 depth
3100}
3101
3102fn open_ref_block_bodied(chars: &[char], start: usize, end: usize) -> bool {
3107 let mut stack: Vec<bool> = Vec::new();
3108 let mut i = start;
3109 while i < end {
3110 if at(chars, i) == Some('<') {
3111 if let Some(after) = verbatim_region_end(chars, i) {
3112 i = after;
3113 continue;
3114 }
3115 if at(chars, i + 1) == Some('/') {
3116 if tag_name_matches(chars, i + 2, "ref") {
3117 stack.pop();
3118 }
3119 } else if tag_name_matches(chars, i + 1, "ref")
3120 && let Some((_, _, self_closing, after)) = open_tag(chars, i)
3121 {
3122 if !self_closing {
3123 let mut j = after;
3124 while matches!(at(chars, j), Some(' ' | '\t')) {
3125 j += 1;
3126 }
3127 stack.push(matches!(at(chars, j), None | Some('\n')));
3128 }
3129 i = after;
3130 continue;
3131 }
3132 }
3133 i += 1;
3134 }
3135 stack.last().copied().unwrap_or(false)
3136}
3137
3138fn html_tag_role(name: &str) -> Option<HtmlTagRole> {
3141 const INLINE: &[&str] = &[
3142 "abbr", "b", "bdi", "bdo", "big", "cite", "data", "dfn", "em", "font", "i", "ins", "q",
3143 "rb", "rt", "rtc", "ruby", "s", "small", "span", "strong", "u", "wbr",
3144 ];
3145 const BLOCK: &[&str] = &[
3146 "caption",
3147 "center",
3148 "col",
3149 "colgroup",
3150 "dd",
3151 "div",
3152 "dl",
3153 "dt",
3154 "h1",
3155 "h2",
3156 "h3",
3157 "h4",
3158 "h5",
3159 "h6",
3160 "hr",
3161 "li",
3162 "ol",
3163 "references",
3164 "rp",
3165 "table",
3166 "td",
3167 "th",
3168 "time",
3169 "tr",
3170 "ul",
3171 ];
3172 const PARAGRAPH: &[&str] = &["gallery", "p"];
3173 if INLINE.contains(&name) {
3174 Some(HtmlTagRole::Inline)
3175 } else if BLOCK.contains(&name) {
3176 Some(HtmlTagRole::Block)
3177 } else if PARAGRAPH.contains(&name) {
3178 Some(HtmlTagRole::Break)
3179 } else {
3180 None
3181 }
3182}
3183
3184fn close_tag_parse(chars: &[char], i: usize) -> Option<(String, String, usize)> {
3187 if at(chars, i) != Some('<') || at(chars, i + 1) != Some('/') {
3188 return None;
3189 }
3190 let mut cursor = i + 2;
3191 let mut name = String::new();
3192 while let Some(ch) = at(chars, cursor) {
3193 if ch.is_ascii_alphanumeric() {
3194 name.push(ch.to_ascii_lowercase());
3195 cursor += 1;
3196 } else {
3197 break;
3198 }
3199 }
3200 if name.is_empty() {
3201 return None;
3202 }
3203 let gt = find_char(chars, cursor, '>')?;
3204 Some((name, collect_range(chars, i, gt + 1), gt + 1))
3205}
3206
3207fn html_li_content_bounds(chars: &[char], start: usize) -> (usize, usize) {
3213 let n = chars.len();
3214 let mut list_depth = 0i32;
3215 let mut j = start;
3216 while j < n {
3217 if at(chars, j) == Some('<') {
3218 if at(chars, j + 1) == Some('/') {
3219 if tag_name_matches(chars, j + 2, "ul") || tag_name_matches(chars, j + 2, "ol") {
3220 if list_depth == 0 {
3221 return (j, j);
3222 }
3223 list_depth -= 1;
3224 if let Some((_, _, after)) = close_tag_parse(chars, j) {
3225 j = after;
3226 continue;
3227 }
3228 } else if list_depth == 0
3229 && tag_name_matches(chars, j + 2, "li")
3230 && let Some((_, _, after)) = close_tag_parse(chars, j)
3231 {
3232 return (j, after);
3233 }
3234 } else if tag_name_matches(chars, j + 1, "ul") || tag_name_matches(chars, j + 1, "ol") {
3235 if let Some((_, _, self_closing, after)) = open_tag(chars, j) {
3236 if !self_closing {
3237 list_depth += 1;
3238 }
3239 j = after;
3240 continue;
3241 }
3242 } else if list_depth == 0 && tag_name_matches(chars, j + 1, "li") {
3243 return (j, j);
3244 }
3245 }
3246 j += 1;
3247 }
3248 (n, n)
3249}
3250
3251fn block_tag_token(chars: &[char], i: usize) -> Option<(Tok, usize)> {
3255 let (name, raw, after) = if at(chars, i + 1) == Some('/') {
3256 close_tag_parse(chars, i)?
3257 } else {
3258 let (name, raw, _self_closing, after) = open_tag(chars, i)?;
3259 (name, raw, after)
3260 };
3261 match html_tag_role(&name)? {
3262 HtmlTagRole::Block => Some((Tok::BlockRaw(raw), after)),
3263 HtmlTagRole::Break => Some((Tok::BlockBreak, after)),
3264 HtmlTagRole::Inline => None,
3265 }
3266}
3267
3268fn list_run_uniform(chars: &[char], pos: usize) -> bool {
3272 let first = at(chars, pos);
3273 let le = line_end(chars, pos);
3274 let mut p = pos;
3275 while p < le && at(chars, p).is_some_and(is_list_marker) {
3276 if at(chars, p) != first {
3277 return false;
3278 }
3279 p += 1;
3280 }
3281 true
3282}
3283
3284fn flush_para_segment(
3287 segment: &mut Vec<Tok>,
3288 blocks: &mut Vec<Block>,
3289 smart: bool,
3290 east_asian: bool,
3291) {
3292 if segment.is_empty() {
3293 return;
3294 }
3295 let toks = std::mem::take(segment);
3296 let mut inlines = coalesce(strip_outer_whitespace(resolve_emphasis(toks)));
3297 if east_asian {
3298 inlines = drop_east_asian_breaks(inlines);
3299 }
3300 if smart {
3301 inlines = apply_smart_quotes(inlines);
3302 }
3303 if !inlines.is_empty() {
3304 blocks.push(para_or_figure(inlines));
3305 }
3306}
3307
/// Extracts the value of attribute `key` from the raw text of a tag.
///
/// Attribute names are runs starting with an ASCII letter and continuing with
/// ASCII alphanumerics, `-` or `_`; they are lowercased before being compared
/// with `key`. Whitespace around `=` is allowed. A value may be wrapped in
/// single or double quotes, otherwise it runs until whitespace, `>` or `/`.
/// The first matching attribute wins; `None` is returned when there is none.
fn tag_attribute(raw: &str, key: &str) -> Option<String> {
    /// Advances `idx` while the character there satisfies `keep`.
    fn scan_while(chars: &[char], mut idx: usize, keep: impl Fn(char) -> bool) -> usize {
        while chars.get(idx).is_some_and(|&c| keep(c)) {
            idx += 1;
        }
        idx
    }
    /// Collects `chars[from..to]`, tolerating out-of-range bounds.
    fn slice_text(chars: &[char], from: usize, to: usize) -> String {
        chars
            .iter()
            .skip(from)
            .take(to.saturating_sub(from))
            .collect()
    }

    let chars: Vec<char> = raw.chars().collect();
    let mut pos = 0;
    while pos < chars.len() {
        if !chars.get(pos).is_some_and(|c| c.is_ascii_alphabetic()) {
            pos += 1;
            continue;
        }
        let name_start = pos;
        pos = scan_while(&chars, pos, |c| {
            c.is_ascii_alphanumeric() || c == '-' || c == '_'
        });
        let name = slice_text(&chars, name_start, pos).to_lowercase();
        pos = scan_while(&chars, pos, char::is_whitespace);
        if chars.get(pos) != Some(&'=') {
            // A bare word (such as the tag name itself) carries no value.
            continue;
        }
        pos = scan_while(&chars, pos + 1, char::is_whitespace);
        let value = match chars.get(pos).copied() {
            Some(quote @ ('"' | '\'')) => {
                let body_start = pos + 1;
                let body_end = scan_while(&chars, body_start, |c| c != quote);
                // Step over the closing quote (or past the end if it is missing).
                pos = body_end + 1;
                slice_text(&chars, body_start, body_end)
            }
            _ => {
                let body_start = pos;
                pos = scan_while(&chars, pos, |c| !c.is_whitespace() && c != '>' && c != '/');
                slice_text(&chars, body_start, pos)
            }
        };
        if name == key {
            return Some(value);
        }
    }
    None
}
3361
/// Memoisation tables shared by the header-detection helpers.
///
/// Both maps are keyed by a character offset into the source being scanned.
#[derive(Default)]
struct HeaderScan {
    /// Cached `is_header_line` verdict for the line starting at the key.
    is_header: BTreeMap<usize, bool>,
    /// Cached `header_region_end_scan` result for the line starting at the key.
    region_end: BTreeMap<usize, usize>,
}
3376
3377fn line_starts_block_scan(chars: &[char], ls: usize, scan: &mut HeaderScan) -> bool {
3378 match at(chars, ls) {
3379 Some('*' | '#' | ':' | ';' | ' ') => true,
3380 Some('=') => is_header_line(chars, ls, scan),
3381 Some('-') => is_hr_line(chars, ls),
3382 Some('{') => matches!(at(chars, ls + 1), Some('{' | '|')),
3383 Some('<') => starts_block_tag(chars, ls),
3384 _ => false,
3385 }
3386}
3387
3388fn is_header_line_within(chars: &[char], pos: usize) -> bool {
3389 is_header_line(chars, pos, &mut HeaderScan::default())
3390}
3391
3392fn is_header_line(chars: &[char], pos: usize, scan: &mut HeaderScan) -> bool {
3393 if let Some(&cached) = scan.is_header.get(&pos) {
3394 return cached;
3395 }
3396 let le = line_end(chars, pos);
3397 let mut m = 0;
3398 while pos + m < le && at(chars, pos + m) == Some('=') {
3399 m += 1;
3400 }
3401 let result = if m == 0 || m > 6 {
3402 false
3403 } else {
3404 let region_end = header_region_end_scan(chars, pos, scan);
3405 header_closer(chars, pos + m, region_end, m).is_some()
3406 };
3407 scan.is_header.insert(pos, result);
3408 result
3409}
3410
3411fn header_region_end_scan(chars: &[char], pos: usize, scan: &mut HeaderScan) -> usize {
3415 if let Some(&cached) = scan.region_end.get(&pos) {
3416 return cached;
3417 }
3418 let n = chars.len();
3419 let mut starts = Vec::new();
3427 let mut cur = pos;
3428 loop {
3429 starts.push(cur);
3430 let le = line_end(chars, cur);
3431 if le >= n {
3432 break;
3433 }
3434 let next = le + 1;
3435 if next >= n {
3436 break;
3437 }
3438 let next_end = line_end(chars, next);
3439 if is_blank(chars, next, next_end) {
3440 break;
3441 }
3442 cur = next;
3443 }
3444 for &s in starts.iter().rev() {
3445 if scan.region_end.contains_key(&s) {
3446 continue;
3447 }
3448 let le = line_end(chars, s);
3449 let region = if le >= n {
3450 le
3451 } else {
3452 let next = le + 1;
3453 if next >= n {
3454 le
3455 } else {
3456 let next_end = line_end(chars, next);
3457 if is_blank(chars, next, next_end) || line_starts_block_scan(chars, next, scan) {
3460 le
3461 } else {
3462 header_region_end_scan(chars, next, scan)
3463 }
3464 }
3465 };
3466 scan.region_end.insert(s, region);
3467 }
3468 scan.region_end
3469 .get(&pos)
3470 .copied()
3471 .unwrap_or_else(|| line_end(chars, pos))
3472}
3473
3474fn header_closer(chars: &[char], content_start: usize, line_end: usize, m: usize) -> Option<usize> {
3478 let mut i = content_start;
3479 while i < line_end {
3480 if let Some(next) = skip_construct(chars, i)
3481 && next > i
3482 {
3483 i = next.min(line_end);
3484 continue;
3485 }
3486 if at(chars, i) == Some('=') {
3487 let mut j = i;
3488 while j < line_end && at(chars, j) == Some('=') {
3489 j += 1;
3490 }
3491 return if j - i >= m { Some(i) } else { None };
3492 }
3493 i += 1;
3494 }
3495 None
3496}
3497
3498fn is_hr_line(chars: &[char], pos: usize) -> bool {
3499 let le = line_end(chars, pos);
3500 let mut k = pos;
3501 while k < le && at(chars, k) == Some('-') {
3502 k += 1;
3503 }
3504 k - pos >= 4 && is_blank(chars, k, le)
3505}
3506
3507fn split_term(content: &str) -> (String, Option<String>) {
3510 let chars: Vec<char> = content.chars().collect();
3511 let n = chars.len();
3512 let mut i = 0;
3513 while i < n {
3514 if let Some(next) = skip_construct(&chars, i)
3515 && next > i
3516 {
3517 i = next;
3518 continue;
3519 }
3520 if let Some((_, next)) = bare_url(&chars, i)
3522 && next > i
3523 {
3524 i = next;
3525 continue;
3526 }
3527 if at(&chars, i) == Some(':') {
3528 let before = collect_range(&chars, 0, i).trim().to_string();
3529 let after = collect_range(&chars, i + 1, n).trim().to_string();
3530 return (before, Some(after));
3531 }
3532 i += 1;
3533 }
3534 (content.trim().to_string(), None)
3535}
3536
3537fn skip_construct(chars: &[char], i: usize) -> Option<usize> {
3539 match at(chars, i) {
3540 Some('{') if at(chars, i + 1) == Some('{') => balanced_braces(chars, i),
3541 Some('[') if at(chars, i + 1) == Some('[') => {
3542 find_seq(chars, i + 2, &[']', ']']).map(|c| c + 2)
3543 }
3544 Some('[') => find_char(chars, i + 1, ']').map(|c| c + 1),
3545 Some('<') => find_char(chars, i, '>').map(|c| c + 1),
3546 _ => None,
3547 }
3548}
3549
3550fn template_opens(chars: &[char], i: usize) -> bool {
3555 matches!(at(chars, i + 2), Some(c) if c.is_alphanumeric() || c == ':')
3556}
3557
/// Scans from `i` for the `}}` that balances the `{{` pairs seen so far and
/// returns the index just past it.
///
/// Each `{{` raises the depth and each `}}` lowers it; the scan stops as soon
/// as a `}}` brings the depth back to zero. Returns `None` when that never
/// happens before the input ends.
fn balanced_braces(chars: &[char], i: usize) -> Option<usize> {
    let doubled = |pos: usize, brace: char| {
        chars.get(pos) == Some(&brace) && chars.get(pos + 1) == Some(&brace)
    };
    let mut open = 0i32;
    let mut pos = i;
    while pos < chars.len() {
        if doubled(pos, '{') {
            open += 1;
            pos += 2;
        } else if doubled(pos, '}') {
            open -= 1;
            pos += 2;
            if open == 0 {
                return Some(pos);
            }
        } else {
            pos += 1;
        }
    }
    None
}
3578
/// Reports whether `c` is one of the list-marker characters `*`, `#`, `:`, `;`.
fn is_list_marker(c: char) -> bool {
    ['*', '#', ':', ';'].contains(&c)
}
3584
3585fn degraded_blocks(chars: &[char]) -> Vec<Block> {
3589 let text = collect_range(chars, 0, chars.len());
3590 let trimmed = text.trim();
3591 if trimmed.is_empty() {
3592 Vec::new()
3593 } else {
3594 vec![Block::Para(vec![Inline::Str(trimmed.into())])]
3595 }
3596}
3597
3598fn whitespace_token(chars: &[char], from: usize) -> (Inline, usize) {
3601 let mut i = from;
3602 let mut has_newline = false;
3603 while let Some(w) = at(chars, i) {
3604 if w.is_whitespace() {
3605 if w == '\n' {
3606 has_newline = true;
3607 }
3608 i += 1;
3609 } else {
3610 break;
3611 }
3612 }
3613 let token = if has_newline {
3614 Inline::SoftBreak
3615 } else {
3616 Inline::Space
3617 };
3618 (token, i)
3619}
3620
3621fn list_kind(marker: char) -> ListKind {
3622 match marker {
3623 '#' => ListKind::Ordered,
3624 ';' | ':' => ListKind::Definition,
3625 _ => ListKind::Bullet,
3626 }
3627}
3628
3629fn verbatim_code(
3632 chars: &[char],
3633 name: &str,
3634 after_open: usize,
3635 raw_open: &str,
3636 self_closing: bool,
3637 classes: &[&str],
3638) -> (Vec<Inline>, usize) {
3639 if self_closing {
3640 return (vec![raw_html(raw_open.to_string())], after_open);
3641 }
3642 match close_tag(chars, after_open, name) {
3643 Some((inner_end, after)) => {
3644 let inner = collect_range(chars, after_open, inner_end);
3645 let attr = Attr {
3646 id: carta_ast::Text::default(),
3647 classes: classes.iter().map(|s| (*s).into()).collect(),
3648 attributes: Vec::new(),
3649 };
3650 (
3651 vec![Inline::Code(Box::new(attr), decode_entities(&inner).into())],
3652 after,
3653 )
3654 }
3655 None => (vec![raw_html(raw_open.to_string())], after_open),
3656 }
3657}
3658
3659fn default_list_attrs() -> ListAttributes {
3660 ListAttributes {
3661 start: 1,
3662 style: ListNumberStyle::DefaultStyle,
3663 delim: ListNumberDelim::DefaultDelim,
3664 }
3665}
3666
3667fn finish_inline_block(chars: &[char], pos: usize) -> (usize, bool) {
3668 let le = line_end(chars, pos);
3669 if is_blank(chars, pos, le) {
3670 let next = if le < chars.len() { le + 1 } else { le };
3671 (next, true)
3672 } else {
3673 (pos, false)
3674 }
3675}
3676
/// Removes at most one leading and one trailing line break (`\r\n` or `\n`)
/// from a code body, leaving all other whitespace intact.
fn trim_code(inner: &str) -> String {
    const LINE_BREAKS: [&str; 2] = ["\r\n", "\n"];
    let mut body = inner;
    if let Some(rest) = LINE_BREAKS.iter().find_map(|br| body.strip_prefix(br)) {
        body = rest;
    }
    if let Some(rest) = LINE_BREAKS.iter().find_map(|br| body.strip_suffix(br)) {
        body = rest;
    }
    body.to_string()
}
3688
3689fn flush_word(word: &mut String, toks: &mut Vec<Tok>) {
3690 if !word.is_empty() {
3691 toks.push(Tok::Inline(Inline::Str(std::mem::take(word).into())));
3692 }
3693}
3694
3695fn raw_html(text: String) -> Inline {
3696 Inline::RawInline(Format("html".into()), text.into())
3697}
3698
3699fn format_mediawiki() -> Format {
3700 Format("mediawiki".into())
3701}
3702
3703fn format_html() -> Format {
3704 Format("html".into())
3705}
3706
/// Bounds-checked character lookup: the character at index `i`, or `None` when
/// `i` is past the end.
fn at(chars: &[char], i: usize) -> Option<char> {
    let &found = chars.get(i)?;
    Some(found)
}
3710
/// Builds a `String` from `chars[start..end]`.
///
/// Out-of-range bounds are tolerated: `end` is clamped to the slice length and
/// an empty or inverted range yields an empty string.
fn collect_range(chars: &[char], start: usize, end: usize) -> String {
    let upper = end.min(chars.len());
    match chars.get(start..upper) {
        Some(span) => span.iter().collect(),
        None => String::new(),
    }
}
3717
3718fn table_block_end(chars: &[char], pos: usize) -> usize {
3722 let n = chars.len();
3723 let mut depth = 0usize;
3724 let mut line = pos;
3725 loop {
3726 let mut content = line;
3727 while matches!(at(chars, content), Some(' ' | '\t')) {
3728 content += 1;
3729 }
3730 if at(chars, content) == Some('{') && at(chars, content + 1) == Some('|') {
3731 depth += 1;
3732 } else if at(chars, content) == Some('|') && at(chars, content + 1) == Some('}') {
3733 depth = depth.saturating_sub(1);
3734 if depth == 0 {
3735 return content + 2;
3736 }
3737 }
3738 let le = line_end(chars, line);
3739 if le >= n {
3740 return n;
3741 }
3742 line = le + 1;
3743 }
3744}
3745
3746fn scan_table_region(region: &str) -> (Vec<Vec<RawCell>>, Option<String>) {
3750 let mut caption_text: Option<String> = None;
3751 let mut rows: Vec<Vec<RawCell>> = Vec::new();
3752 let mut cur: Vec<RawCell> = Vec::new();
3753 let mut open = OpenTarget::None;
3754 let mut nest = 0i32;
3755
3756 let mut lines = region.lines();
3757 lines.next(); for line in lines {
3759 let trimmed = line.trim_start();
3760 if nest > 0 {
3761 if trimmed.starts_with("{|") {
3762 nest += 1;
3763 } else if trimmed.starts_with("|}") {
3764 nest -= 1;
3765 }
3766 append_continuation(open, &mut cur, &mut caption_text, line);
3767 continue;
3768 }
3769 if trimmed.starts_with("|}") {
3770 break;
3771 }
3772 if trimmed.starts_with("{|") {
3773 nest += 1;
3774 append_continuation(open, &mut cur, &mut caption_text, line);
3775 continue;
3776 }
3777 if let Some(rest) = trimmed.strip_prefix("|+") {
3778 caption_text = Some(rest.to_string());
3779 open = OpenTarget::Caption;
3780 continue;
3781 }
3782 if trimmed.starts_with("|-") {
3783 rows.push(std::mem::take(&mut cur));
3784 open = OpenTarget::None;
3785 continue;
3786 }
3787 if let Some(rest) = trimmed.strip_prefix('|') {
3788 cur.extend(parse_cell_line(false, rest));
3789 open = OpenTarget::Cell;
3790 continue;
3791 }
3792 if let Some(rest) = trimmed.strip_prefix('!') {
3793 cur.extend(parse_cell_line(true, rest));
3794 open = OpenTarget::Cell;
3795 continue;
3796 }
3797 append_continuation(open, &mut cur, &mut caption_text, line);
3798 }
3799 rows.push(cur);
3800 (rows, caption_text)
3801}
3802
3803fn column_specs(rows: &[Vec<RawCell>], ncols: usize) -> Vec<ColSpec> {
3806 let mut aligns: Vec<Alignment> = Vec::new();
3807 if let Some(first) = rows.first() {
3808 for cell in first {
3809 for _ in 0..col_count(cell.col_span) {
3810 aligns.push(cell.align.clone());
3811 }
3812 }
3813 }
3814 aligns.resize(ncols, Alignment::AlignDefault);
3815 aligns
3816 .into_iter()
3817 .map(|align| ColSpec {
3818 align,
3819 width: ColWidth::ColWidthDefault,
3820 })
3821 .collect()
3822}
3823
/// Converts a column span to a column count, treating any value below one (or
/// one that does not fit in `usize`) as a single column.
fn col_count(col_span: i32) -> usize {
    match usize::try_from(col_span) {
        Ok(span) if span >= 1 => span,
        _ => 1,
    }
}
3827
3828fn empty_cell() -> Cell {
3830 Cell {
3831 attr: Attr::default(),
3832 align: Alignment::AlignDefault,
3833 row_span: 1,
3834 col_span: 1,
3835 content: Vec::new(),
3836 }
3837}
3838
3839fn append_continuation(
3841 open: OpenTarget,
3842 cur: &mut [RawCell],
3843 caption: &mut Option<String>,
3844 line: &str,
3845) {
3846 match open {
3847 OpenTarget::Cell => {
3848 if let Some(cell) = cur.last_mut() {
3849 cell.content.push('\n');
3850 cell.content.push_str(line);
3851 }
3852 }
3853 OpenTarget::Caption => {
3854 if let Some(text) = caption {
3855 text.push('\n');
3856 text.push_str(line);
3857 }
3858 }
3859 OpenTarget::None => {}
3860 }
3861}
3862
3863fn parse_cell_line(is_header: bool, rest: &str) -> Vec<RawCell> {
3866 split_cells(rest, is_header)
3867 .iter()
3868 .map(|chunk| parse_cell_chunk(is_header, chunk))
3869 .collect()
3870}
3871
/// Splits the text of a cell line into per-cell chunks.
///
/// The separator is `||`, and additionally `!!` when `header` is true. A
/// separator is only honoured outside square and curly brackets; bracket depth
/// is tracked per single character and never drops below zero. The result
/// always holds at least one chunk (possibly empty).
fn split_cells(s: &str, header: bool) -> Vec<String> {
    let chars: Vec<char> = s.chars().collect();
    let text_of = |from: usize, to: usize| -> String {
        chars
            .get(from..to)
            .map(|span| span.iter().collect())
            .unwrap_or_default()
    };

    let mut pieces: Vec<String> = Vec::new();
    let mut piece_start = 0usize;
    let (mut square, mut curly) = (0i32, 0i32);
    let mut pos = 0usize;
    while let Some(&here) = chars.get(pos) {
        match here {
            '[' => square += 1,
            ']' => square = (square - 1).max(0),
            '{' => curly += 1,
            '}' => curly = (curly - 1).max(0),
            _ => {}
        }
        let doubled = chars.get(pos + 1) == Some(&here);
        let separator_char = here == '|' || (header && here == '!');
        if square == 0 && curly == 0 && doubled && separator_char {
            pieces.push(text_of(piece_start, pos));
            pos += 2;
            piece_start = pos;
        } else {
            pos += 1;
        }
    }
    pieces.push(text_of(piece_start, chars.len()));
    pieces
}
3905
3906fn parse_cell_chunk(is_header: bool, chunk: &str) -> RawCell {
3909 if let Some(idx) = find_attr_pipe(chunk)
3910 && let Some(attrs) = parse_cell_attrs(chunk.get(..idx).unwrap_or(""))
3911 {
3912 return RawCell {
3913 is_header,
3914 align: attrs.align,
3915 col_span: attrs.col_span,
3916 row_span: attrs.row_span,
3917 attr: attrs.attr,
3918 content: chunk.get(idx + 1..).unwrap_or("").to_string(),
3919 };
3920 }
3921 RawCell {
3922 is_header,
3923 align: Alignment::AlignDefault,
3924 col_span: 1,
3925 row_span: 1,
3926 attr: Attr::default(),
3927 content: chunk.to_string(),
3928 }
3929}
3930
/// Returns the byte offset of the first `|` in `s` that lies outside double
/// quotes and outside square or curly brackets, if any.
///
/// Bracket depth is tracked per single character and never drops below zero;
/// brackets inside a quoted span are not counted.
fn find_attr_pipe(s: &str) -> Option<usize> {
    let (mut square, mut curly) = (0i32, 0i32);
    let mut quoted = false;
    for (offset, ch) in s.char_indices() {
        if ch == '"' {
            quoted = !quoted;
            continue;
        }
        if quoted {
            continue;
        }
        match ch {
            '[' => square += 1,
            ']' => square = (square - 1).max(0),
            '{' => curly += 1,
            '}' => curly = (curly - 1).max(0),
            '|' if square == 0 && curly == 0 => return Some(offset),
            _ => {}
        }
    }
    None
}
3956
3957fn parse_cell_attrs(s: &str) -> Option<CellAttrs> {
3962 let chars: Vec<char> = s.chars().collect();
3963 let n = chars.len();
3964 let mut i = 0usize;
3965 let mut id = String::new();
3966 let mut classes: Vec<String> = Vec::new();
3967 let mut attributes: Vec<(String, String)> = Vec::new();
3968 let mut align = Alignment::AlignDefault;
3969 let mut col_span = 1i32;
3970 let mut row_span = 1i32;
3971 let mut any = false;
3972 while i < n {
3973 while at(&chars, i).is_some_and(char::is_whitespace) {
3974 i += 1;
3975 }
3976 if i >= n {
3977 break;
3978 }
3979 let name_start = i;
3980 while at(&chars, i).is_some_and(|c| !c.is_whitespace() && c != '=') {
3981 i += 1;
3982 }
3983 let name = collect_range(&chars, name_start, i);
3984 if name.is_empty() || at(&chars, i) != Some('=') {
3985 return None;
3986 }
3987 i += 1;
3988 let value = if at(&chars, i) == Some('"') {
3989 i += 1;
3990 let value_start = i;
3991 while at(&chars, i).is_some_and(|c| c != '"') {
3992 i += 1;
3993 }
3994 let value = collect_range(&chars, value_start, i);
3995 if at(&chars, i) == Some('"') {
3996 i += 1;
3997 }
3998 value
3999 } else {
4000 let value_start = i;
4001 while at(&chars, i).is_some_and(|c| !c.is_whitespace()) {
4002 i += 1;
4003 }
4004 collect_range(&chars, value_start, i)
4005 };
4006 any = true;
4007 match name.to_ascii_lowercase().as_str() {
4008 "id" => id = value,
4009 "class" => classes.extend(value.split_whitespace().map(str::to_string)),
4010 "align" => match value.to_ascii_lowercase().as_str() {
4011 "left" => align = Alignment::AlignLeft,
4012 "right" => align = Alignment::AlignRight,
4013 "center" => align = Alignment::AlignCenter,
4014 _ => attributes.push(("align".to_string(), value)),
4015 },
4016 "colspan" => match value.trim().parse::<i32>() {
4017 Ok(v) if v >= 1 => col_span = v,
4018 _ => attributes.push(("colspan".to_string(), value)),
4019 },
4020 "rowspan" => match value.trim().parse::<i32>() {
4021 Ok(v) if v >= 1 => row_span = v,
4022 _ => attributes.push(("rowspan".to_string(), value)),
4023 },
4024 _ => attributes.push((name, value)),
4025 }
4026 }
4027 if !any {
4028 return None;
4029 }
4030 Some(CellAttrs {
4031 align,
4032 col_span,
4033 row_span,
4034 attr: Attr {
4035 id: id.into(),
4036 classes: classes.into_iter().map(Into::into).collect(),
4037 attributes: attributes
4038 .into_iter()
4039 .map(|(k, v)| (k.into(), v.into()))
4040 .collect(),
4041 },
4042 })
4043}
4044
4045fn line_end(chars: &[char], pos: usize) -> usize {
4046 find_char(chars, pos, '\n').unwrap_or(chars.len())
4047}
4048
4049fn is_blank(chars: &[char], start: usize, end: usize) -> bool {
4050 (start..end).all(|j| at(chars, j).is_none_or(char::is_whitespace))
4051}
4052
4053fn find_char(chars: &[char], from: usize, target: char) -> Option<usize> {
4054 (from..chars.len()).find(|&j| at(chars, j) == Some(target))
4055}
4056
4057fn find_seq(chars: &[char], from: usize, seq: &[char]) -> Option<usize> {
4058 let n = chars.len();
4059 let m = seq.len();
4060 if m == 0 || n < m {
4061 return None;
4062 }
4063 (from..=n - m).find(|&j| (0..m).all(|k| at(chars, j + k) == seq.get(k).copied()))
4064}
4065
4066fn scan_link_target(chars: &[char], start: usize) -> Option<(usize, bool)> {
4070 let mut i = start;
4071 while let Some(c) = at(chars, i) {
4072 if c == '|' {
4073 return Some((i, true));
4074 }
4075 if c == ']' && at(chars, i + 1) == Some(']') {
4076 return Some((i, false));
4077 }
4078 i += 1;
4079 }
4080 None
4081}
4082
4083fn find_link_close(chars: &[char], start: usize) -> Option<usize> {
4086 let mut depth = 0usize;
4087 let mut i = start;
4088 while let Some(c) = at(chars, i) {
4089 if c == '[' && at(chars, i + 1) == Some('[') {
4090 depth += 1;
4091 i += 2;
4092 } else if c == ']' && at(chars, i + 1) == Some(']') {
4093 if depth == 0 {
4094 return Some(i);
4095 }
4096 depth -= 1;
4097 i += 2;
4098 } else {
4099 i += 1;
4100 }
4101 }
4102 None
4103}
4104
4105fn matches_prefix_ci(chars: &[char], i: usize, prefix: &str) -> bool {
4106 prefix
4107 .chars()
4108 .enumerate()
4109 .all(|(k, pc)| match at(chars, i + k) {
4110 Some(c) => c.eq_ignore_ascii_case(&pc),
4111 None => false,
4112 })
4113}
4114
#[cfg(test)]
mod tests {
    use super::*;

    /// Reads `input` with only `AutoIdentifiers` enabled and returns the blocks.
    fn parse(input: &str) -> Vec<Block> {
        let mut options = ReaderOptions::default();
        options.extensions = Extensions::from_list(&[Extension::AutoIdentifiers]);
        MediawikiReader
            .read(input, &options)
            .expect("read should not fail")
            .blocks
    }

    /// Reads `input` with only `GfmAutoIdentifiers` enabled and returns the blocks.
    fn parse_gfm(input: &str) -> Vec<Block> {
        let mut options = ReaderOptions::default();
        options.extensions = Extensions::from_list(&[Extension::GfmAutoIdentifiers]);
        MediawikiReader.read(input, &options).expect("read").blocks
    }

    /// Collects the identifiers of all top-level headers, in document order.
    fn header_ids(blocks: &[Block]) -> Vec<String> {
        blocks
            .iter()
            .filter_map(|b| match b {
                Block::Header(_, attr, _) => Some(attr.id.to_string()),
                _ => None,
            })
            .collect()
    }

    /// Scheme recognition is case-insensitive and covers `doi` and `javascript`.
    #[test]
    fn doi_and_javascript_are_recognized_schemes() {
        assert!(is_scheme("doi"));
        assert!(is_scheme("javascript"));
        assert!(is_scheme("DOI"));
        assert!(is_scheme("http"));
        assert!(!is_scheme("notascheme"));
    }

    /// Builds a 1x1 cell with default attributes and alignment around `content`.
    fn cell_with(content: Vec<Block>) -> Cell {
        Cell {
            attr: Attr::default(),
            align: Alignment::AlignDefault,
            row_span: 1,
            col_span: 1,
            content,
        }
    }

    /// Builds a default cell holding a single one-word paragraph.
    fn data_cell(text: &str) -> Cell {
        cell_with(vec![Block::Para(vec![Inline::Str(text.into())])])
    }

    /// Builds a row with default attributes from `cells`.
    fn table_row(cells: Vec<Cell>) -> Row {
        Row {
            attr: Attr::default(),
            cells,
        }
    }

    /// Column specification with default alignment and default width.
    fn default_col() -> ColSpec {
        ColSpec {
            align: Alignment::AlignDefault,
            width: ColWidth::ColWidthDefault,
        }
    }

    /// A `!` row goes to the table head, a `|` row to the body, and text after
    /// `|}` starts a new paragraph.
    #[test]
    fn table_markup_becomes_a_table() {
        assert_eq!(
            parse("{|\n! Header\n|-\n| Cell\n|}\nafter"),
            vec![
                Block::Table(Box::new(Table {
                    col_specs: vec![default_col()],
                    head: TableHead {
                        rows: vec![table_row(vec![data_cell("Header")])],
                        ..Default::default()
                    },
                    bodies: vec![TableBody {
                        body: vec![table_row(vec![data_cell("Cell")])],
                        ..Default::default()
                    }],
                    ..Default::default()
                })),
                Block::Para(vec![Inline::Str("after".into())]),
            ]
        );
    }

    /// A bare `{|` with nothing after it still yields a table with one empty row.
    #[test]
    fn unterminated_table_markup_does_not_panic() {
        assert_eq!(
            parse("{|"),
            vec![Block::Table(Box::new(Table {
                bodies: vec![TableBody {
                    body: vec![table_row(Vec::new())],
                    ..Default::default()
                }],
                ..Default::default()
            }))]
        );
    }

    /// A table inside a cell ends at its own `|}`; the outer table ends at the
    /// second `|}`.
    #[test]
    fn nested_table_markup_closes_at_the_outer_marker() {
        let inner = Block::Table(Box::new(Table {
            col_specs: vec![default_col()],
            bodies: vec![TableBody {
                body: vec![table_row(vec![data_cell("inner")])],
                ..Default::default()
            }],
            ..Default::default()
        }));
        assert_eq!(
            parse("{|\n|\n{|\n| inner\n|}\n|}"),
            vec![Block::Table(Box::new(Table {
                col_specs: vec![default_col()],
                bodies: vec![TableBody {
                    body: vec![table_row(vec![cell_with(vec![inner])])],
                    ..Default::default()
                }],
                ..Default::default()
            }))]
        );
    }

    /// A single newline inside a paragraph becomes a soft break.
    #[test]
    fn paragraph_joins_lines_with_soft_breaks() {
        assert_eq!(
            parse("one two\nthree"),
            vec![Block::Para(vec![
                Inline::Str("one".into()),
                Inline::Space,
                Inline::Str("two".into()),
                Inline::SoftBreak,
                Inline::Str("three".into()),
            ])]
        );
    }

    /// Two, three and five apostrophes map to emphasis, strong and strong-emphasis.
    #[test]
    fn emphasis_runs_decompose() {
        assert_eq!(
            parse("''i'' '''b''' '''''both'''''"),
            vec![Block::Para(vec![
                Inline::Emph(vec![Inline::Str("i".into())]),
                Inline::Space,
                Inline::Strong(vec![Inline::Str("b".into())]),
                Inline::Space,
                Inline::Strong(vec![Inline::Emph(vec![Inline::Str("both".into())])]),
            ])]
        );
    }

    /// With `AutoIdentifiers` the header id is lower-cased with underscores.
    #[test]
    fn header_carries_mediawiki_identifier() {
        assert_eq!(
            parse("== Hello World =="),
            vec![Block::Header(
                2,
                Box::new(Attr {
                    id: "hello_world".into(),
                    classes: vec![],
                    attributes: vec![],
                }),
                vec![
                    Inline::Str("Hello".into()),
                    Inline::Space,
                    Inline::Str("World".into()),
                ],
            )]
        );
    }

    /// A repeated header title gets a numeric suffix on its identifier.
    #[test]
    fn duplicate_identifiers_are_suffixed() {
        let blocks = parse("== Dup ==\n== Dup ==");
        assert_eq!(
            header_ids(&blocks),
            vec!["dup".to_string(), "dup_1".to_string()]
        );
    }

    /// With `GfmAutoIdentifiers` words in the id are joined by hyphens.
    #[test]
    fn gfm_identifier_scheme_uses_hyphens() {
        let blocks = parse_gfm("== Hello World ==");
        match blocks.first() {
            Some(Block::Header(_, attr, _)) => assert_eq!(attr.id, "hello-world"),
            other => panic!("expected header, got {other:?}"),
        }
    }

    /// Punctuation-only titles fall back to `section`, deduplicated as usual.
    #[test]
    fn empty_identifier_falls_back_to_section() {
        let blocks = parse("== !!! ==\n== ??? ==");
        assert_eq!(
            header_ids(&blocks),
            vec!["section".to_string(), "section_1".to_string()]
        );
    }

    /// A header line with `=` inside its title is read as ordinary text.
    #[test]
    fn malformed_header_is_a_paragraph() {
        assert_eq!(
            parse("== a=b =="),
            vec![Block::Para(vec![
                Inline::Str("==".into()),
                Inline::Space,
                Inline::Str("a=b".into()),
                Inline::Space,
                Inline::Str("==".into()),
            ])]
        );
    }

    /// A surplus `=` after the closing marker is emitted as its own paragraph.
    #[test]
    fn header_leftover_becomes_paragraph() {
        assert_eq!(
            parse("== H ==="),
            vec![
                Block::Header(
                    2,
                    Box::new(Attr {
                        id: "h".into(),
                        classes: vec![],
                        attributes: vec![],
                    }),
                    vec![Inline::Str("H".into())],
                ),
                Block::Para(vec![Inline::Str("=".into())]),
            ]
        );
    }

    /// `**` and `*#` under a `*` item nest a bullet list and an ordered list in it.
    #[test]
    fn nested_bullets_and_ordered() {
        assert_eq!(
            parse("* a\n** b\n*# c"),
            vec![Block::BulletList(vec![vec![
                Block::Plain(vec![Inline::Str("a".into())]),
                Block::BulletList(vec![vec![Block::Plain(vec![Inline::Str("b".into())])]]),
                Block::OrderedList(
                    default_list_attrs(),
                    vec![vec![Block::Plain(vec![Inline::Str("c".into())])]]
                ),
            ]])]
        );
    }

    /// `; term : def` on one line yields a term with a single definition.
    #[test]
    fn definition_list_splits_inline_definition() {
        assert_eq!(
            parse("; term : def"),
            vec![Block::DefinitionList(vec![(
                vec![Inline::Str("term".into())],
                vec![vec![Block::Plain(vec![Inline::Str("def".into())])]],
            )])]
        );
    }

    /// Letters directly after `]]` are appended to the link label, not the target.
    #[test]
    fn internal_link_with_trail() {
        assert_eq!(
            parse("[[Page]]s"),
            vec![Block::Para(vec![Inline::Link(
                Box::new(Attr {
                    id: carta_ast::Text::default(),
                    classes: vec!["wikilink".into()],
                    attributes: vec![],
                }),
                vec![Inline::Str("Pages".into())],
                Box::new(Target {
                    url: "Page".into(),
                    title: "Page".into(),
                }),
            )])]
        );
    }

    /// A file embed alone on its line becomes a figure captioned by its last
    /// parameter.
    #[test]
    fn lone_file_embed_becomes_a_figure() {
        assert_eq!(
            parse("[[File:Foo.jpg|thumb|A caption]]"),
            vec![Block::Figure(
                Box::default(),
                Box::new(Caption {
                    short: None,
                    long: vec![Block::Plain(vec![
                        Inline::Str("A".into()),
                        Inline::Space,
                        Inline::Str("caption".into()),
                    ])],
                }),
                vec![Block::Plain(vec![Inline::Image(
                    Box::default(),
                    vec![],
                    Box::new(Target {
                        url: "Foo.jpg".into(),
                        title: "A caption".into(),
                    }),
                )])],
            )]
        );
    }

    /// Without a caption the (underscore-normalised) file name is used instead.
    #[test]
    fn embed_without_caption_defaults_to_the_file_name() {
        assert_eq!(
            parse("[[Image:My Photo.jpg]]"),
            vec![Block::Figure(
                Box::default(),
                Box::new(Caption {
                    short: None,
                    long: vec![Block::Plain(vec![Inline::Str("My_Photo.jpg".into())])],
                }),
                vec![Block::Plain(vec![Inline::Image(
                    Box::default(),
                    vec![],
                    Box::new(Target {
                        url: "My_Photo.jpg".into(),
                        title: "My_Photo.jpg".into(),
                    }),
                )])],
            )]
        );
    }

    /// A `WxHpx` parameter becomes `width` and `height` attributes on the image.
    #[test]
    fn embed_size_parameters_set_width_and_height() {
        assert_eq!(
            parse("[[File:Foo.jpg|100x200px|cap]]"),
            vec![Block::Figure(
                Box::default(),
                Box::new(Caption {
                    short: None,
                    long: vec![Block::Plain(vec![Inline::Str("cap".into())])],
                }),
                vec![Block::Plain(vec![Inline::Image(
                    Box::new(Attr {
                        id: carta_ast::Text::default(),
                        classes: vec![],
                        attributes: vec![
                            ("width".into(), "100".into()),
                            ("height".into(), "200".into()),
                        ],
                    }),
                    vec![],
                    Box::new(Target {
                        url: "Foo.jpg".into(),
                        title: "cap".into(),
                    }),
                )])],
            )]
        );
    }

    /// A file embed that shares its line with text stays an inline image.
    #[test]
    fn inline_embed_stays_an_image_not_a_figure() {
        assert_eq!(
            parse("x [[File:Foo.jpg|cap]]"),
            vec![Block::Para(vec![
                Inline::Str("x".into()),
                Inline::Space,
                Inline::Image(
                    Box::default(),
                    vec![Inline::Str("cap".into())],
                    Box::new(Target {
                        url: "Foo.jpg".into(),
                        title: "cap".into(),
                    }),
                ),
            ])]
        );
    }

    /// `[[File:]]` names no file, so it is read as a plain wikilink.
    #[test]
    fn empty_file_embed_is_an_ordinary_wikilink() {
        assert_eq!(
            parse("[[File:]]"),
            vec![Block::Para(vec![Inline::Link(
                Box::new(Attr {
                    id: carta_ast::Text::default(),
                    classes: vec!["wikilink".into()],
                    attributes: vec![],
                }),
                vec![Inline::Str("File:".into())],
                Box::new(Target {
                    url: "File:".into(),
                    title: "File:".into(),
                }),
            )])]
        );
    }

    /// A labelled external link keeps its label; an unlabelled one is numbered.
    #[test]
    fn external_links_number_and_label() {
        assert_eq!(
            parse("[http://x.com lbl] [http://y.com]"),
            vec![Block::Para(vec![
                Inline::Link(
                    Box::default(),
                    vec![Inline::Str("lbl".into())],
                    Box::new(Target {
                        url: "http://x.com".into(),
                        title: carta_ast::Text::default(),
                    }),
                ),
                Inline::Space,
                Inline::Link(
                    Box::default(),
                    vec![Inline::Str("1".into())],
                    Box::new(Target {
                        url: "http://y.com".into(),
                        title: carta_ast::Text::default(),
                    }),
                ),
            ])]
        );
    }

    /// A full stop right after a bare URL is kept out of the link.
    #[test]
    fn bare_url_trims_trailing_punctuation() {
        assert_eq!(
            parse("see http://x.com."),
            vec![Block::Para(vec![
                Inline::Str("see".into()),
                Inline::Space,
                Inline::Link(
                    Box::default(),
                    vec![Inline::Str("http://x.com".into())],
                    Box::new(Target {
                        url: "http://x.com".into(),
                        title: carta_ast::Text::default(),
                    }),
                ),
                Inline::Str(".".into()),
            ])]
        );
    }

    /// Named entity references in running text are replaced by their characters.
    ///
    /// The input must spell the entities out (`&amp;`, `&copy;`); feeding the
    /// already-decoded characters would not exercise the decoder at all.
    #[test]
    fn entities_are_decoded_in_text() {
        assert_eq!(
            parse("AT&amp;T &copy;"),
            vec![Block::Para(vec![
                Inline::Str("AT&T".into()),
                Inline::Space,
                Inline::Str("\u{a9}".into()),
            ])]
        );
    }

    /// Markup inside `<nowiki>` is kept verbatim as text.
    #[test]
    fn nowiki_is_literal_text() {
        assert_eq!(
            parse("<nowiki>'''raw'''</nowiki>"),
            vec![Block::Para(vec![Inline::Str("'''raw'''".into())])]
        );
    }

    /// `<ref>` content is turned into a note attached at that position.
    #[test]
    fn reference_becomes_a_note() {
        assert_eq!(
            parse("x<ref>note</ref>"),
            vec![Block::Para(vec![
                Inline::Str("x".into()),
                Inline::Note(vec![Block::Plain(vec![Inline::Str("note".into())])]),
            ])]
        );
    }

    /// Entity references inside `<code>` are decoded in the resulting code span.
    ///
    /// The input uses `&amp;` so that the decoding step is actually exercised.
    #[test]
    fn code_tag_decodes_entities() {
        assert_eq!(
            parse("<code>a &amp; b</code>"),
            vec![Block::Para(vec![Inline::Code(
                Box::default(),
                "a & b".into()
            )])]
        );
    }

    /// Tags the reader does not interpret are emitted as raw HTML inlines.
    #[test]
    fn unknown_tag_passes_through_as_raw_html() {
        assert_eq!(
            parse("<b>x</b>"),
            vec![Block::Para(vec![
                raw_html("<b>".into()),
                Inline::Str("x".into()),
                raw_html("</b>".into()),
            ])]
        );
    }

    /// A comment occupying a whole line disappears together with its newline.
    #[test]
    fn whole_line_comment_is_removed_with_its_newline() {
        assert_eq!(
            parse("x\n<!--c-->\ny"),
            vec![Block::Para(vec![
                Inline::Str("x".into()),
                Inline::SoftBreak,
                Inline::Str("y".into()),
            ])]
        );
    }

    /// A comment between two words separates them with a space.
    #[test]
    fn inline_comment_becomes_a_space() {
        assert_eq!(
            parse("a<!--c-->b"),
            vec![Block::Para(vec![
                Inline::Str("a".into()),
                Inline::Space,
                Inline::Str("b".into()),
            ])]
        );
    }

    /// `<syntaxhighlight lang=…>` becomes a code block classed with the language.
    #[test]
    fn syntax_highlight_block_keeps_language_and_content() {
        assert_eq!(
            parse("<syntaxhighlight lang=\"rust\">\nfn main(){}\n</syntaxhighlight>"),
            vec![Block::CodeBlock(
                Box::new(Attr {
                    id: carta_ast::Text::default(),
                    classes: vec!["rust".into()],
                    attributes: vec![],
                }),
                "fn main(){}".into(),
            )]
        );
    }

    /// Only a line made of dashes alone is a rule; trailing text makes it a paragraph.
    #[test]
    fn horizontal_rule_requires_a_dashes_only_line() {
        assert_eq!(parse("----"), vec![Block::HorizontalRule]);
        assert_eq!(
            parse("----foo"),
            vec![Block::Para(vec![Inline::Str("----foo".into())])]
        );
    }

    /// A space-indented line becomes inline code with its inner spaces kept as
    /// non-breaking spaces.
    ///
    /// The input carries two spaces between the words so that it matches the
    /// two `\u{a0}` in the expected code text.
    #[test]
    fn preformatted_lines_become_code() {
        assert_eq!(
            parse(" indented  line"),
            vec![Block::Para(vec![Inline::Code(
                Box::default(),
                "indented\u{a0}\u{a0}line".into()
            )])]
        );
    }

    /// Inline markup inside a preformatted line still applies, around code spans.
    #[test]
    fn preformatted_preserves_markup_and_spacing() {
        assert_eq!(
            parse(" a '''b''' c"),
            vec![Block::Para(vec![
                Inline::Code(Box::default(), "a\u{a0}".into()),
                Inline::Strong(vec![Inline::Code(Box::default(), "b".into())]),
                Inline::Code(Box::default(), "\u{a0}c".into()),
            ])]
        );
    }

    /// A template at the start of a line is a raw block; the rest is a paragraph.
    #[test]
    fn block_template_is_raw_then_trailing_paragraph() {
        assert_eq!(
            parse("{{tpl}} trailing"),
            vec![
                Block::RawBlock(format_mediawiki(), "{{tpl}}".into()),
                Block::Para(vec![Inline::Str("trailing".into())]),
            ]
        );
    }

    /// Reads `input` with default options and reports whether the read succeeded.
    fn reads_ok(input: &str) -> bool {
        MediawikiReader
            .read(input, &ReaderOptions::default())
            .is_ok()
    }

    /// Steadily deepening list markers, and one extremely long marker run, must
    /// both be read successfully.
    #[test]
    fn adversarially_nested_wiki_list_does_not_panic() {
        let mut input = String::new();
        for n in 1..4000 {
            input.push_str(&"*".repeat(n));
            input.push_str(" item\n");
        }
        assert!(reads_ok(&input));
        let single = format!("{} item", "*".repeat(20_000));
        assert!(reads_ok(&single));
    }

    /// Four thousand table openers followed by as many closers must be read
    /// successfully.
    #[test]
    fn adversarially_nested_tables_do_not_panic() {
        let input = format!("{}| x\n{}", "{|\n".repeat(4000), "|}\n".repeat(4000));
        assert!(reads_ok(&input));
    }

    /// Four thousand nested `<ul><li>` pairs must be read successfully.
    #[test]
    fn adversarially_nested_html_list_does_not_panic() {
        let input = format!("{}x{}", "<ul><li>".repeat(4000), "</li></ul>".repeat(4000));
        assert!(reads_ok(&input));
    }

    /// Four thousand nested `<ref>` tags must be read successfully.
    #[test]
    fn adversarially_nested_refs_do_not_panic() {
        let input = format!("{}x{}", "a<ref>".repeat(4000), "</ref>".repeat(4000));
        assert!(reads_ok(&input));
    }

    /// Repeats a three-line fragment whose lines all begin with `=` four
    /// thousand times and checks that the read still succeeds.
    #[test]
    fn stacked_header_lines_do_not_blow_up() {
        let input = "== ~iT\n= w e\n= J".repeat(4000);
        assert!(reads_ok(&input));
    }
}