marki_parse/
inline.rs

1use std::mem::MaybeUninit;
2
3use crate::OffsetExt;
4use crate::SpecialChar;
5use crate::section::InlineSpan;
6use crate::simd::{ByteSet, ByteSliceExt};
7
/// An inline element within a Markdown block.
///
/// Text-like payloads borrow from the source string (`'src`). Nested content
/// is referenced through an [`InlineSpan`] into the shared element pool rather
/// than being owned by the variant, which keeps the enum `Copy`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Inline<'src> {
    /// A run of literal text.
    Text(&'src str),
    /// Strong emphasis; the span covers the parsed inner elements.
    Bold(InlineSpan),
    /// Emphasis; the span covers the parsed inner elements.
    Italic(InlineSpan),
    /// A link of the form `[text](url "title")`.
    Link {
        /// Parsed inline content of the link text.
        text: InlineSpan,
        /// Link destination as written in the source.
        url: &'src str,
        /// Optional title, without its surrounding delimiters.
        title: Option<&'src str>,
    },
    /// An image of the form `![alt](url "title")`.
    Image {
        /// Raw (unparsed) alternative text.
        alt: &'src str,
        /// Image destination as written in the source.
        url: &'src str,
        /// Optional title, without its surrounding delimiters.
        title: Option<&'src str>,
    },
    /// An inline code span; content is taken verbatim.
    Code(&'src str),
    /// A newline that is not a hard break.
    SoftBreak,
    /// A newline preceded by a trailing backslash or two or more spaces.
    HardBreak,
}
28
29impl<'src> From<&'src str> for Inline<'src> {
30    fn from(s: &'src str) -> Self {
31        Self::Text(s)
32    }
33}
34
/// SIMD-accelerated byte set for inline special characters.
///
/// These are the bytes at which the main inline scan stops. Input that
/// contains none of them is emitted as a single `Text` element by the fast
/// paths in `parse_at_depth` and `parse_flat`.
static SPECIAL_SET: ByteSet = ByteSet::new(&[
    SpecialChar::Newline.byte(),
    SpecialChar::Asterisk.byte(),
    SpecialChar::Underscore.byte(),
    SpecialChar::OpenBracket.byte(),
    SpecialChar::ExclamationMark.byte(),
    SpecialChar::Backslash.byte(),
    SpecialChar::Backtick.byte(),
]);

/// Pre-computed byte sets for `find_matching_close` — avoids rebuilding
/// the 256-byte lookup table on every call.
///
/// Each set holds the opener, the closer, and the backslash (so escaped
/// delimiters can be skipped). This one is used for `[...]`.
static BRACKET_CLOSE_SET: ByteSet = ByteSet::new(&[
    SpecialChar::OpenBracket.byte(),
    SpecialChar::CloseBracket.byte(),
    SpecialChar::Backslash.byte(),
]);
/// Counterpart of `BRACKET_CLOSE_SET` used for `(...)`.
static PAREN_CLOSE_SET: ByteSet = ByteSet::new(&[
    SpecialChar::OpenParen.byte(),
    SpecialChar::CloseParen.byte(),
    SpecialChar::Backslash.byte(),
]);

/// Pre-computed byte sets for `try_parse_delimited` — one per delimiter type.
///
/// Each set holds the emphasis marker plus the backslash, so the closer scan
/// can skip escaped markers. This one is used for `*`.
static STAR_DELIM_SET: ByteSet =
    ByteSet::new(&[SpecialChar::Asterisk.byte(), SpecialChar::Backslash.byte()]);
/// Counterpart of `STAR_DELIM_SET` used for `_`.
static UNDER_DELIM_SET: ByteSet = ByteSet::new(&[
    SpecialChar::Underscore.byte(),
    SpecialChar::Backslash.byte(),
]);
66
/// Coarse character categories used by the `CommonMark` emphasis flanking rules.
#[derive(Clone, Copy, PartialEq, Eq)]
enum CharClass {
    Whitespace,
    Punctuation,
    Other,
}

impl CharClass {
    /// Classifies an arbitrary Unicode scalar value.
    ///
    /// Whitespace wins over punctuation; anything that is neither is `Other`.
    const fn of(ch: char) -> Self {
        if ch.is_whitespace() {
            return Self::Whitespace;
        }
        if ch.is_ascii_punctuation() || Self::unicode_punctuation(ch) {
            return Self::Punctuation;
        }
        Self::Other
    }

    /// Classifies a single ASCII byte without decoding UTF-8.
    #[inline]
    const fn of_ascii(b: u8) -> Self {
        if b.is_ascii_whitespace() {
            return Self::Whitespace;
        }
        if b.is_ascii_punctuation() {
            return Self::Punctuation;
        }
        Self::Other
    }

    /// Returns `true` for non-ASCII characters inside a fixed list of
    /// punctuation/symbol code-point ranges.
    ///
    /// This is a hand-maintained approximation of the Unicode P and S general
    /// categories so that no external crate is required; ASCII input always
    /// yields `false`.
    const fn unicode_punctuation(ch: char) -> bool {
        !ch.is_ascii()
            && matches!(ch,
                '\u{00A1}'..='\u{00BF}' // Latin punctuation/symbols
                | '\u{2010}'..='\u{2027}' // General punctuation
                | '\u{2030}'..='\u{205E}' // More general punctuation
                | '\u{2190}'..='\u{23FF}' // Arrows, math operators, misc technical
                | '\u{2500}'..='\u{2BFF}' // Box drawing, block elements, symbols
                | '\u{3000}'..='\u{303F}' // CJK symbols and punctuation
                | '\u{FE30}'..='\u{FE6F}' // CJK compatibility forms, small forms
                | '\u{FF01}'..='\u{FF0F}' // Fullwidth punctuation
                | '\u{FF1A}'..='\u{FF20}' // More fullwidth punctuation
                | '\u{FF3B}'..='\u{FF40}' // Fullwidth brackets
                | '\u{FF5B}'..='\u{FF65}' // Fullwidth punctuation
            )
    }
}
119
/// What emphasis types remain possible for a given delimiter character.
/// Tracks delimiter availability per character type, avoiding O(n²)
/// re-scanning in both top-level and recursive parse calls.
#[derive(Clone, Copy)]
enum DelimiterAvail {
    /// Both bold and italic are still possible.
    Both,
    /// Bold failed; only italic can be attempted.
    ItalicOnly,
    /// Italic failed; only bold can be attempted.
    BoldOnly,
    /// Neither bold nor italic can succeed.
    None,
}

impl DelimiterAvail {
    /// Returns `true` while a bold (`**` / `__`) attempt is still worthwhile.
    const fn can_bold(self) -> bool {
        matches!(self, Self::Both | Self::BoldOnly)
    }

    /// Returns `true` while an italic (`*` / `_`) attempt is still worthwhile.
    const fn can_italic(self) -> bool {
        matches!(self, Self::Both | Self::ItalicOnly)
    }

    /// Records that bold can no longer succeed for this delimiter character.
    const fn bold_failed(&mut self) {
        *self = match *self {
            Self::Both => Self::ItalicOnly,
            Self::BoldOnly => Self::None,
            other => other,
        };
    }

    /// Records that italic can no longer succeed for this delimiter character.
    const fn italic_failed(&mut self) {
        *self = match *self {
            Self::Both => Self::BoldOnly,
            Self::ItalicOnly => Self::None,
            other => other,
        };
    }

    /// Derives the initial availability from the number of marker bytes
    /// present in the input.
    ///
    /// Italic needs an opener and a closer (two markers); bold needs a
    /// two-byte opener and a two-byte closer (four markers). Fewer than two
    /// markers can never form emphasis.
    const fn from_count(count: usize) -> Self {
        match count {
            0 | 1 => Self::None,
            // Two or three markers are enough for `*x*` but not for `**x**`.
            2 | 3 => Self::ItalicOnly,
            _ => Self::Both,
        }
    }
}
168
169struct EmphasisState {
170    star: DelimiterAvail,
171    under: DelimiterAvail,
172}
173
174impl EmphasisState {
175    const fn assume_both() -> Self {
176        Self {
177            star: DelimiterAvail::Both,
178            under: DelimiterAvail::Both,
179        }
180    }
181
182    fn from_bytes(bytes: &[u8]) -> Self {
183        static EMPH_SET: ByteSet =
184            ByteSet::new(&[SpecialChar::Asterisk.byte(), SpecialChar::Underscore.byte()]);
185        let mut stars: u8 = 0;
186        let mut unders: u8 = 0;
187        let mut i = 0;
188        while let Some(pos) = bytes.find_byte_set(i, &EMPH_SET) {
189            if bytes[pos] == SpecialChar::Asterisk {
190                stars = stars.saturating_add(1);
191            } else {
192                unders = unders.saturating_add(1);
193            }
194            if stars >= 4 && unders >= 4 {
195                break;
196            }
197            i = pos + 1;
198        }
199        Self {
200            star: DelimiterAvail::from_count(stars as usize),
201            under: DelimiterAvail::from_count(unders as usize),
202        }
203    }
204
205    const fn avail_mut(&mut self, is_star: bool) -> &mut DelimiterAvail {
206        if is_star {
207            &mut self.star
208        } else {
209            &mut self.under
210        }
211    }
212}
213
214/// Stack-allocated buffer for collecting inline elements without heap allocation.
215/// Uses `MaybeUninit` to avoid zeroing the stack array on every parse call.
216/// The capacity `CAP` is configurable via `MarkdownFile`'s `INLINE_STACK_CAP`
217/// const generic — falls back to heap if exceeded.
218struct InlineBuf<'src, const CAP: usize> {
219    stack: [MaybeUninit<Inline<'src>>; CAP],
220    len: usize,
221    overflow: Vec<Inline<'src>>,
222}
223
224impl<'src, const CAP: usize> InlineBuf<'src, CAP> {
225    #[inline]
226    const fn new() -> Self {
227        Self {
228            // SAFETY: An array of MaybeUninit does not require initialization.
229            stack: [const { MaybeUninit::uninit() }; CAP],
230            len: 0,
231            overflow: Vec::new(),
232        }
233    }
234
235    #[allow(clippy::inline_always)]
236    #[inline(always)]
237    fn push(&mut self, item: Inline<'src>) {
238        if self.len < CAP {
239            self.stack[self.len] = MaybeUninit::new(item);
240            self.len += 1;
241        } else {
242            self.push_slow(item);
243        }
244    }
245
246    #[cold]
247    fn push_slow(&mut self, item: Inline<'src>) {
248        if self.overflow.is_empty() {
249            // Spill stack to heap
250            self.overflow = Vec::with_capacity(CAP * 2);
251            // SAFETY: elements 0..self.len were initialized via push.
252            // Use a raw pointer to avoid borrow conflict with self.overflow.
253            let len = self.len;
254            let ptr = self.stack.as_ptr().cast::<Inline>();
255            let slice = unsafe { std::slice::from_raw_parts(ptr, len) };
256            self.overflow.extend_from_slice(slice);
257        }
258        self.overflow.push(item);
259    }
260
261    /// Get initialized stack elements as a slice.
262    #[inline]
263    const fn initialized_stack(&self) -> &[Inline<'src>] {
264        // SAFETY: all elements 0..self.len have been initialized via push.
265        unsafe { std::slice::from_raw_parts(self.stack.as_ptr().cast::<Inline>(), self.len) }
266    }
267
268    #[inline]
269    fn flush_to_pool(self, pool: &mut Vec<Inline<'src>>) -> InlineSpan {
270        let start = pool.len().pool_offset();
271        if self.overflow.is_empty() {
272            pool.extend_from_slice(self.initialized_stack());
273            InlineSpan::new(start, self.len.pool_offset())
274        } else {
275            let len = self.overflow.len().pool_offset();
276            pool.extend(self.overflow);
277            InlineSpan::new(start, len)
278        }
279    }
280}
281
/// Threshold below which the emphasis pre-scan costs more than it saves.
///
/// Inputs shorter than this many bytes skip `EmphasisState::from_bytes` and
/// start from `EmphasisState::assume_both` instead.
const EMPH_SCAN_THRESHOLD: usize = 256;
284
/// Stateful parser for a single inline parse pass.
///
/// Holds the input slice and a mutable reference to the output pool so that
/// recursive/nested parsing can share the same pool without threading the
/// pool through every helper.
///
/// `MAX_DEPTH` is the nesting depth at which emphasis is no longer parsed;
/// `CAP` is the stack capacity of the per-call `InlineBuf`.
pub struct InlineParser<'src, 'pool, const MAX_DEPTH: u8, const CAP: usize> {
    /// Source text being parsed; emitted elements borrow from it.
    input: &'src str,
    /// Shared output pool that receives the parsed elements.
    pool: &'pool mut Vec<Inline<'src>>,
}
294
295impl<'src, 'pool, const MAX_DEPTH: u8, const CAP: usize> InlineParser<'src, 'pool, MAX_DEPTH, CAP> {
296    const fn new(input: &'src str, pool: &'pool mut Vec<Inline<'src>>) -> Self {
297        Self { input, pool }
298    }
299
300    /// Parse inline elements with configurable depth and stack limits.
301    pub(crate) fn parse_configured(
302        input: &'src str,
303        pool: &'pool mut Vec<Inline<'src>>,
304    ) -> InlineSpan {
305        Self::new(input, pool).parse()
306    }
307
308    /// Push parsed inline elements directly into the pool without wrapping
309    /// in a span, with configurable depth and stack limits.
310    pub(crate) fn parse_flat_into_configured(input: &'src str, pool: &'pool mut Vec<Inline<'src>>) {
311        Self::new(input, pool).parse_flat();
312    }
313
314    /// Parse inline elements and store them in the pool. Returns a span.
315    ///
316    /// Uses default limits (`MAX_INLINE_DEPTH = 16`, `INLINE_STACK_CAP = 32`).
317    /// For custom limits, use [`crate::MarkdownFile::parse`] with const generics.
318    #[must_use]
319    fn parse(&mut self) -> InlineSpan {
320        self.parse_at_depth(0)
321    }
322
323    fn parse_at_depth(&mut self, depth: u8) -> InlineSpan {
324        let bytes = self.input.as_bytes();
325        // Fast path: if no special bytes exist, the entire input is plain text.
326        if bytes.find_byte_set(0, &SPECIAL_SET).is_none() {
327            if self.input.is_empty() {
328                return InlineSpan::EMPTY;
329            }
330            let start = self.pool.len().pool_offset();
331            self.pool.push(Inline::Text(self.input));
332            return InlineSpan::new(start, 1);
333        }
334        let emph = if bytes.len() < EMPH_SCAN_THRESHOLD {
335            EmphasisState::assume_both()
336        } else {
337            EmphasisState::from_bytes(bytes)
338        };
339        let mut buf = InlineBuf::<CAP>::new();
340        self.parse_into_buf(bytes, emph, &mut buf, depth);
341        buf.flush_to_pool(self.pool)
342    }
343
344    fn parse_inner(&mut self, input: &'src str, depth: u8) -> InlineSpan {
345        InlineParser::<MAX_DEPTH, CAP> {
346            input,
347            pool: self.pool,
348        }
349        .parse_at_depth(depth)
350    }
351
352    /// Push parsed inline elements directly into the pool without wrapping
353    /// in a span. Used for blockquote multi-line accumulation where the caller
354    /// manages span boundaries.
355    fn parse_flat(&mut self) {
356        let bytes = self.input.as_bytes();
357        // Fast path: no special bytes means plain text.
358        if bytes.find_byte_set(0, &SPECIAL_SET).is_none() {
359            if !self.input.is_empty() {
360                self.pool.push(Inline::Text(self.input));
361            }
362            return;
363        }
364        let emph = if bytes.len() < EMPH_SCAN_THRESHOLD {
365            EmphasisState::assume_both()
366        } else {
367            EmphasisState::from_bytes(bytes)
368        };
369        // Parse directly into a buf that flushes to pool (flat, no span wrapper).
370        let mut buf = InlineBuf::<CAP>::new();
371        self.parse_into_buf(bytes, emph, &mut buf, 0);
372        // Flush buf directly to pool (not wrapped in a span).
373        if buf.overflow.is_empty() {
374            self.pool.extend_from_slice(buf.initialized_stack());
375        } else {
376            self.pool.extend(buf.overflow);
377        }
378    }
379
380    fn parse_into_buf(
381        &mut self,
382        bytes: &[u8],
383        mut emph: EmphasisState,
384        buf: &mut InlineBuf<'src, CAP>,
385        depth: u8,
386    ) {
387        let mut plain_start = 0;
388        let mut i = 0;
389
390        // SIMD-accelerated scan: find next special byte.
391        while let Some(pos) = bytes.find_byte_set(i, &SPECIAL_SET) {
392            i = pos;
393            let b = bytes[i];
394
395            if b == SpecialChar::Newline {
396                self.emit_line_break(bytes, plain_start, i, buf);
397                plain_start = i + 1;
398                i = plain_start;
399                continue;
400            }
401
402            // Backslash escape: only ASCII punctuation can be escaped (CommonMark spec).
403            // For non-punctuation, the backslash is kept as literal text.
404            if b == SpecialChar::Backslash
405                && let Some(&next) = bytes.get(i + 1)
406                && next.is_ascii_punctuation()
407            {
408                if let Some(text) = self.input.get(plain_start..i)
409                    && !text.is_empty()
410                {
411                    buf.push(Inline::Text(text));
412                }
413                plain_start = i + 1;
414                i += 2;
415                continue;
416            }
417
418            // Inline code: `code` or ``code``
419            if b == SpecialChar::Backtick
420                && let Some((code, end)) = Self::try_parse_inline_code(self.input, bytes, i)
421            {
422                if let Some(text) = self.input.get(plain_start..i)
423                    && !text.is_empty()
424                {
425                    buf.push(Inline::Text(text));
426                }
427                buf.push(Inline::Code(code));
428                plain_start = end;
429                i = end;
430                continue;
431            }
432
433            // Image: ![alt](url "title")
434            if b == SpecialChar::ExclamationMark
435                && bytes.get(i + 1) == SpecialChar::OpenBracket
436                && let Some((alt, url, title, end)) =
437                    Self::try_parse_bracket_paren(self.input, bytes, i + 1)
438            {
439                if let Some(text) = self.input.get(plain_start..i)
440                    && !text.is_empty()
441                {
442                    buf.push(Inline::Text(text));
443                }
444                buf.push(Inline::Image { alt, url, title });
445                plain_start = end;
446                i = end;
447                continue;
448            }
449
450            // Link: [text](url "title")
451            if b == SpecialChar::OpenBracket
452                && let Some((text_str, url, title, end)) =
453                    Self::try_parse_bracket_paren(self.input, bytes, i)
454            {
455                if let Some(text) = self.input.get(plain_start..i)
456                    && !text.is_empty()
457                {
458                    buf.push(Inline::Text(text));
459                }
460                let text_span = self.parse_inner(text_str, depth.saturating_add(1));
461                buf.push(Inline::Link {
462                    text: text_span,
463                    url,
464                    title,
465                });
466                plain_start = end;
467                i = end;
468                continue;
469            }
470
471            // Bold/Italic: ** __ * _
472            if let Some((elem, end)) = self.try_parse_emphasis(bytes, i, b, &mut emph, depth) {
473                if let Some(text) = self.input.get(plain_start..i)
474                    && !text.is_empty()
475                {
476                    buf.push(Inline::Text(text));
477                }
478                buf.push(elem);
479                plain_start = end;
480                i = end;
481                continue;
482            }
483
484            i += 1;
485        }
486
487        if let Some(text) = self.input.get(plain_start..)
488            && !text.is_empty()
489        {
490            buf.push(Inline::Text(text));
491        }
492    }
493
494    /// Emit a hard or soft line break at a newline position.
495    /// Hard break if preceded by trailing `\` or 2+ spaces; soft break otherwise.
496    #[inline]
497    fn emit_line_break(
498        &self,
499        bytes: &[u8],
500        plain_start: usize,
501        newline_pos: usize,
502        buf: &mut InlineBuf<'src, CAP>,
503    ) {
504        let preceding = bytes.get(plain_start..newline_pos).unwrap_or_default();
505        let (trim_end, is_hard) = if preceding.last() == SpecialChar::Backslash {
506            (newline_pos - 1, true)
507        } else {
508            // Count trailing spaces with a simple backward loop.
509            let mut spaces = 0;
510            let mut j = preceding.len();
511            while j > 0 && preceding[j - 1] == SpecialChar::Space {
512                spaces += 1;
513                j -= 1;
514            }
515            if spaces >= 2 {
516                (newline_pos - spaces, true)
517            } else {
518                (newline_pos, false)
519            }
520        };
521        if let Some(text) = self.input.get(plain_start..trim_end)
522            && !text.is_empty()
523        {
524            buf.push(Inline::Text(text));
525        }
526        buf.push(if is_hard {
527            Inline::HardBreak
528        } else {
529            Inline::SoftBreak
530        });
531    }
532
533    #[inline]
534    fn try_parse_emphasis(
535        &mut self,
536        bytes: &[u8],
537        i: usize,
538        b: u8,
539        emph: &mut EmphasisState,
540        depth: u8,
541    ) -> Option<(Inline<'src>, usize)> {
542        let is_star = b == SpecialChar::Asterisk;
543        if !is_star && b != SpecialChar::Underscore {
544            return None;
545        }
546        // Depth limit: treat as plain text to prevent stack overflow.
547        if depth >= MAX_DEPTH {
548            return None;
549        }
550        let avail = emph.avail_mut(is_star);
551        let open_run = if is_star {
552            SpecialChar::Asterisk.count_leading_bytes(&bytes[i..])
553        } else {
554            SpecialChar::Underscore.count_leading_bytes(&bytes[i..])
555        };
556
557        // Triple runs (*** or ___) can open/close both emphasis and strong
558        // emphasis. Match them as strong nested inside emphasis so that
559        // ***text*** becomes Italic(Bold(text)) instead of being split.
560        if open_run >= 3 && avail.can_bold() && avail.can_italic() {
561            if let Some((inner, end)) = Self::try_parse_delimited(self.input, bytes, i, b, 3) {
562                // Only match when the closing run is exactly three characters,
563                // leaving longer runs (e.g. ****text****) to the strong/italic
564                // logic below.
565                let close_run_start = end - 3;
566                let exact_close =
567                    bytes.get(close_run_start - 1) != Some(&b) && bytes.get(end) != Some(&b);
568                if exact_close {
569                    let inner_span = self.parse_inner(inner, depth + 1);
570                    let bold_start = self.pool.len().pool_offset();
571                    self.pool.push(Inline::Bold(inner_span));
572                    let bold_span = InlineSpan::new(bold_start, 1);
573                    return Some((Inline::Italic(bold_span), end));
574                }
575            }
576            // A triple run exists but couldn't be matched; strong and italic
577            // may still succeed from the same starting position.
578        }
579
580        // Bold: ** or __
581        if avail.can_bold() && bytes.get(i + 1) == Some(&b) {
582            if let Some((inner, end)) = Self::try_parse_delimited(self.input, bytes, i, b, 2) {
583                let span = self.parse_inner(inner, depth + 1);
584                return Some((Inline::Bold(span), end));
585            }
586            avail.bold_failed();
587        }
588
589        // Italic: * or _
590        if avail.can_italic() {
591            if let Some((inner, end)) = Self::try_parse_delimited(self.input, bytes, i, b, 1) {
592                let span = self.parse_inner(inner, depth + 1);
593                return Some((Inline::Italic(span), end));
594            }
595            avail.italic_failed();
596        }
597
598        None
599    }
600
601    /// Find the position of a matching closing delimiter, handling backslash
602    /// escapes and nested pairs.
603    fn find_matching_close(
604        bytes: &[u8],
605        start: usize,
606        open: SpecialChar,
607        close: SpecialChar,
608    ) -> Option<usize> {
609        // Select pre-computed static ByteSet instead of building one each call.
610        let set = if open == SpecialChar::OpenBracket {
611            &BRACKET_CLOSE_SET
612        } else {
613            &PAREN_CLOSE_SET
614        };
615        let mut nested = 0u32;
616        let mut j = start;
617        loop {
618            let pos = bytes.find_byte_set(j, set)?;
619            let b = bytes[pos];
620            if b == SpecialChar::Backslash
621                && bytes.get(pos + 1).is_some_and(u8::is_ascii_punctuation)
622            {
623                j = pos + 2;
624                continue;
625            }
626            if b == open {
627                nested += 1;
628            } else if b == close {
629                if nested == 0 {
630                    return Some(pos);
631                }
632                nested -= 1;
633            }
634            j = pos + 1;
635        }
636    }
637
638    fn try_parse_bracket_paren(
639        input: &'src str,
640        bytes: &[u8],
641        start: usize,
642    ) -> Option<(&'src str, &'src str, Option<&'src str>, usize)> {
643        if bytes.get(start) != SpecialChar::OpenBracket {
644            return None;
645        }
646
647        let bracket_start = start + 1;
648        let bracket_end = Self::find_matching_close(
649            bytes,
650            bracket_start,
651            SpecialChar::OpenBracket,
652            SpecialChar::CloseBracket,
653        )?;
654
655        let paren_pos = bracket_end + 1;
656        if bytes.get(paren_pos) != SpecialChar::OpenParen {
657            return None;
658        }
659
660        let paren_start = paren_pos + 1;
661        let paren_end = Self::find_matching_close(
662            bytes,
663            paren_start,
664            SpecialChar::OpenParen,
665            SpecialChar::CloseParen,
666        )?;
667
668        let paren_content = input.get(paren_start..paren_end)?;
669        let (url, title) = Self::split_url_title(paren_content);
670
671        Some((
672            input.get(bracket_start..bracket_end)?,
673            url,
674            title,
675            paren_end + 1,
676        ))
677    }
678
679    /// Split the content inside `(...)` into a URL and optional title
680    /// (`CommonMark` §6.3).
681    ///
682    /// Titles are delimited by `"..."`, `'...'`, or `(...)`.
683    ///
684    /// We scan **backwards** because the title, if present, is always at the
685    /// end. The algorithm:
686    ///  1. Check the last byte for a closing title delimiter (`"`, `'`, `)`).
687    ///  2. Walk backwards to find the matching opener.
688    ///  3. The opener must be preceded by whitespace — this separates the URL
689    ///     from the title. If no whitespace is found, there is no title.
690    ///  4. For **paired** delimiters (`(…)`), if the first candidate opener
691    ///     lacks preceding whitespace we keep scanning for an earlier `(`
692    ///     that does. For **same-char** delimiters (`"…"`, `'…'`), the first
693    ///     match is the only candidate (no nesting possible).
694    fn split_url_title(content: &'src str) -> (&'src str, Option<&'src str>) {
695        let trimmed = content.trim();
696        // A valid title needs at minimum: url, space, open+close quotes (e.g. `u "t"`).
697        // With fewer than 3 bytes the backward scan would underflow.
698        if trimmed.len() < 3 {
699            return (trimmed, None);
700        }
701
702        let bytes = trimmed.as_bytes();
703        let last = bytes[bytes.len() - 1];
704        let (open, close) = match SpecialChar::from_byte(last) {
705            Some(SpecialChar::DoubleQuote) => (SpecialChar::DoubleQuote, SpecialChar::DoubleQuote),
706            Some(SpecialChar::SingleQuote) => (SpecialChar::SingleQuote, SpecialChar::SingleQuote),
707            Some(SpecialChar::CloseParen) => (SpecialChar::OpenParen, SpecialChar::CloseParen),
708            // No trailing title delimiter — the entire content is the URL.
709            _ => return (trimmed, None),
710        };
711
712        // Scan backwards for the matching opening delimiter.
713        let mut j = bytes.len() - 2;
714        loop {
715            if bytes[j] == open {
716                // Whitespace before the opener separates URL from title.
717                if j > 0 && bytes[j - 1].is_ascii_whitespace() {
718                    let url = trimmed.get(..j).unwrap_or(trimmed).trim_end();
719                    let title = trimmed.get(j + 1..bytes.len() - 1).unwrap_or("");
720                    return (url, Some(title));
721                }
722                // For paired delimiters (open != close), keep scanning for an
723                // earlier opener that *does* have preceding whitespace.
724                if open != close {
725                    if j == 0 {
726                        break;
727                    }
728                    j -= 1;
729                    continue;
730                }
731                // Same-char delimiter: first match is the only candidate.
732                break;
733            }
734            if j == 0 {
735                break;
736            }
737            j -= 1;
738        }
739
740        // No valid title found — treat entire content as URL.
741        (trimmed, None)
742    }
743
744    #[inline]
745    /// Classify the character before a position for flanking delimiter rules.
746    /// Returns `CharClass::Whitespace` at start-of-input (treated as if preceded by newline).
747    fn char_class_before(bytes: &[u8], pos: usize) -> CharClass {
748        if pos == 0 {
749            return CharClass::Whitespace;
750        }
751        let b = bytes[pos - 1];
752        // Fast path: ASCII bytes need no UTF-8 decoding.
753        if b < 0x80 {
754            return CharClass::of_ascii(b);
755        }
756        // Walk back to find UTF-8 codepoint start.
757        let mut start = pos - 1;
758        while start > 0 && bytes[start] & 0xC0 == 0x80 {
759            start -= 1;
760        }
761        let ch = std::str::from_utf8(&bytes[start..pos])
762            .ok()
763            .and_then(|s| s.chars().next())
764            .unwrap_or(' ');
765        CharClass::of(ch)
766    }
767
768    #[inline]
769    /// Classify the character after a position for flanking delimiter rules.
770    /// Returns `CharClass::Whitespace` at end-of-input (treated as if followed by newline).
771    fn char_class_after(bytes: &[u8], pos: usize) -> CharClass {
772        if pos >= bytes.len() {
773            return CharClass::Whitespace;
774        }
775        let b = bytes[pos];
776        // Fast path: ASCII bytes need no UTF-8 decoding.
777        if b < 0x80 {
778            return CharClass::of_ascii(b);
779        }
780        // Decode the UTF-8 codepoint starting at `pos`.
781        let ch = std::str::from_utf8(&bytes[pos..])
782            .ok()
783            .and_then(|s| s.chars().next())
784            .unwrap_or(' ');
785        CharClass::of(ch)
786    }
787
788    fn try_parse_delimited(
789        input: &'src str,
790        bytes: &[u8],
791        start: usize,
792        marker: u8,
793        count: usize,
794    ) -> Option<(&'src str, usize)> {
795        let inner_start = start + count;
796        bytes.get(inner_start)?;
797
798        let is_star = marker == SpecialChar::Asterisk;
799
800        // CommonMark §6.2 — emphasis flanking rules:
801        // A left-flanking delimiter run must not be followed by whitespace,
802        // and must not be followed by punctuation unless preceded by whitespace
803        // or punctuation. For `_`, it must also not be right-flanking (unless
804        // preceded by punctuation), preventing intra-word emphasis.
805        let before_open = Self::char_class_before(bytes, start);
806        let after_open = Self::char_class_after(bytes, inner_start);
807
808        let left_flanking = after_open != CharClass::Whitespace
809            && (after_open != CharClass::Punctuation || before_open != CharClass::Other);
810        if !left_flanking {
811            return None;
812        }
813        if !is_star {
814            // _ can open only if left-flanking AND (not right-flanking OR preceded by punctuation)
815            let right_flanking_open = before_open != CharClass::Whitespace
816                && (before_open != CharClass::Punctuation || after_open != CharClass::Other);
817            if right_flanking_open && before_open != CharClass::Punctuation {
818                return None;
819            }
820        }
821
822        // Select pre-computed static ByteSet instead of building one each call.
823        let delim_set = if is_star {
824            &STAR_DELIM_SET
825        } else {
826            &UNDER_DELIM_SET
827        };
828
829        let mut i = inner_start;
830        while let Some(pos) = bytes.find_byte_set(i, delim_set) {
831            i = pos;
832            let b = bytes[i];
833
834            if b == SpecialChar::Backslash && bytes.get(i + 1).is_some_and(u8::is_ascii_punctuation)
835            {
836                i += 2;
837                continue;
838            }
839
840            if b != marker {
841                i += 1;
842                continue;
843            }
844
845            // Found a marker byte — check for a valid closing run.
846            let all_match = (1..count).all(|j| bytes.get(i + j) == Some(&marker));
847            if !all_match {
848                i += 1;
849                continue;
850            }
851
852            let close_end = i + count;
853            let before_close = Self::char_class_before(bytes, i);
854            let after_close = Self::char_class_after(bytes, close_end);
855
856            // CommonMark §6.2 — closing delimiter must be right-flanking:
857            // not preceded by whitespace, and not preceded by punctuation
858            // unless followed by whitespace or punctuation. For `_`, must
859            // also not be left-flanking (unless followed by punctuation).
860            let right_flanking = before_close != CharClass::Whitespace
861                && (before_close != CharClass::Punctuation || after_close != CharClass::Other);
862            if !right_flanking {
863                i += 1;
864                continue;
865            }
866            if !is_star {
867                // _ can close only if right-flanking AND (not left-flanking OR followed by punctuation)
868                let left_flanking_close = after_close != CharClass::Whitespace
869                    && (after_close != CharClass::Punctuation || before_close != CharClass::Other);
870                if left_flanking_close && after_close != CharClass::Punctuation {
871                    i += 1;
872                    continue;
873                }
874            }
875
876            return Some((input.get(inner_start..i)?, close_end));
877        }
878
879        None
880    }
881
882    /// Parse inline code spans (`CommonMark` §6.1).
883    /// The opening and closing backtick sequences must have the same length.
884    /// Content is taken verbatim (no backslash escaping inside code spans).
885    fn try_parse_inline_code(
886        input: &'src str,
887        bytes: &[u8],
888        start: usize,
889    ) -> Option<(&'src str, usize)> {
890        let backtick_count = SpecialChar::Backtick.count_leading_bytes(&bytes[start..]);
891        if backtick_count == 0 {
892            return None;
893        }
894
895        let content_start = start + backtick_count;
896        let mut i = content_start;
897        while i < bytes.len() {
898            // SIMD-accelerated backtick scan.
899            i = bytes.find_byte(i, SpecialChar::Backtick.byte())?;
900
901            // Count consecutive backticks
902            let close_count = SpecialChar::Backtick.count_leading_bytes(&bytes[i..]);
903
904            if close_count == backtick_count {
905                // CommonMark §6.1: strip one leading and one trailing space
906                // when the content both starts and ends with a space.
907                let mut cs = content_start;
908                let mut ce = i;
909                if ce - cs >= 2
910                    && bytes.get(cs) == SpecialChar::Space
911                    && bytes.get(ce - 1) == SpecialChar::Space
912                {
913                    cs += 1;
914                    ce -= 1;
915                }
916                return Some((input.get(cs..ce)?, i + close_count));
917            }
918            i += close_count;
919        }
920
921        None
922    }
923}
marki_parse/inline.rs

marki_parse/
inline.rs