marki_parse/
inline.rs

1use std::mem::MaybeUninit;
2
3use crate::SpecialChar;
4use crate::section::InlineSpan;
5use crate::simd::{ByteSet, find_byte, find_byte_set};
6
/// Length of the run of `needle` bytes that `bytes` starts with.
///
/// Returns `0` when `bytes` is empty or does not begin with `needle`.
/// Kept as a simple scalar scan: the runs measured here (inline-code
/// backticks) are typically only 1-3 bytes long, where SIMD setup would
/// cost more than it saves.
#[inline]
fn count_leading_byte(bytes: &[u8], needle: u8) -> usize {
    bytes.iter().take_while(|&&byte| byte == needle).count()
}
17
18/// An inline element within a Markdown block.
19#[derive(Debug, Clone, Copy, PartialEq, Eq)]
20pub enum Inline<'src> {
21    Text(&'src str),
22    Bold(InlineSpan),
23    Italic(InlineSpan),
24    Link {
25        text: InlineSpan,
26        url: &'src str,
27        title: Option<&'src str>,
28    },
29    Image {
30        alt: &'src str,
31        url: &'src str,
32        title: Option<&'src str>,
33    },
34    Code(&'src str),
35    SoftBreak,
36    HardBreak,
37}
38
39impl<'src> From<&'src str> for Inline<'src> {
40    fn from(s: &'src str) -> Self {
41        Self::Text(s)
42    }
43}
44
45/// SIMD-accelerated byte set for inline special characters.
46static SPECIAL_SET: ByteSet = ByteSet::new(&[
47    SpecialChar::Newline.byte(),
48    SpecialChar::Asterisk.byte(),
49    SpecialChar::Underscore.byte(),
50    SpecialChar::OpenBracket.byte(),
51    SpecialChar::ExclamationMark.byte(),
52    SpecialChar::Backslash.byte(),
53    SpecialChar::Backtick.byte(),
54]);
55
56/// Pre-computed byte sets for `find_matching_close` — avoids rebuilding
57/// the 256-byte lookup table on every call.
58static BRACKET_CLOSE_SET: ByteSet = ByteSet::new(&[
59    SpecialChar::OpenBracket.byte(),
60    SpecialChar::CloseBracket.byte(),
61    SpecialChar::Backslash.byte(),
62]);
63static PAREN_CLOSE_SET: ByteSet = ByteSet::new(&[
64    SpecialChar::OpenParen.byte(),
65    SpecialChar::CloseParen.byte(),
66    SpecialChar::Backslash.byte(),
67]);
68
69/// Pre-computed byte sets for `try_parse_delimited` — one per delimiter type.
70static STAR_DELIM_SET: ByteSet =
71    ByteSet::new(&[SpecialChar::Asterisk.byte(), SpecialChar::Backslash.byte()]);
72static UNDER_DELIM_SET: ByteSet = ByteSet::new(&[
73    SpecialChar::Underscore.byte(),
74    SpecialChar::Backslash.byte(),
75]);
76
77/// Character classification for `CommonMark` emphasis flanking rules.
78#[derive(Clone, Copy, PartialEq, Eq)]
79enum CharClass {
80    Whitespace,
81    Punctuation,
82    Other,
83}
84
85impl CharClass {
86    const fn of(ch: char) -> Self {
87        if ch.is_whitespace() {
88            Self::Whitespace
89        } else if ch.is_ascii_punctuation() || unicode_punctuation(ch) {
90            Self::Punctuation
91        } else {
92            Self::Other
93        }
94    }
95
96    /// Fast classification for ASCII bytes, avoiding UTF-8 decode.
97    #[inline]
98    const fn of_ascii(b: u8) -> Self {
99        if b.is_ascii_whitespace() {
100            Self::Whitespace
101        } else if b.is_ascii_punctuation() {
102            Self::Punctuation
103        } else {
104            Self::Other
105        }
106    }
107}
108
/// Approximate test for non-ASCII Unicode punctuation or symbols (general
/// categories P and S), which `CommonMark` treats as punctuation when
/// deciding whether a delimiter run is flanking.
///
/// Always returns `false` for ASCII input; callers check ASCII punctuation
/// separately. The table covers the most common blocks without pulling in a
/// Unicode database, so it remains an approximation outside Latin-1; a full
/// implementation would consult real general-category data.
///
/// The Latin-1 Supplement entries are listed piecewise on purpose: that
/// block interleaves punctuation with letters (`ª`, `º`, `µ`), numbers
/// (`²`, `³`, `¹`, `¼`, `½`, `¾`) and the soft hyphen, none of which are
/// punctuation and all of which must classify as ordinary characters.
const fn unicode_punctuation(ch: char) -> bool {
    if ch.is_ascii() {
        return false;
    }
    matches!(ch,
        '\u{00A1}'..='\u{00A9}'   // ¡ ¢ £ ¤ ¥ ¦ § ¨ ©
        | '\u{00AB}'..='\u{00AC}' // « ¬
        | '\u{00AE}'..='\u{00B1}' // ® ¯ ° ±
        | '\u{00B4}'              // ´
        | '\u{00B6}'..='\u{00B8}' // ¶ · ¸
        | '\u{00BB}'              // »
        | '\u{00BF}'              // ¿
        | '\u{00D7}'              // × (multiplication sign)
        | '\u{00F7}'              // ÷ (division sign)
        | '\u{2010}'..='\u{2027}' // General punctuation (dashes, quotes, etc.)
        | '\u{2030}'..='\u{205E}' // More general punctuation
        | '\u{2190}'..='\u{23FF}' // Arrows, math operators, misc technical
        | '\u{2500}'..='\u{2BFF}' // Box drawing, block elements, symbols
        | '\u{3000}'..='\u{303F}' // CJK symbols and punctuation
        | '\u{FE30}'..='\u{FE6F}' // CJK compatibility forms, small forms
        | '\u{FF01}'..='\u{FF0F}' // Fullwidth punctuation
        | '\u{FF1A}'..='\u{FF20}' // More fullwidth punctuation
        | '\u{FF3B}'..='\u{FF40}' // Fullwidth brackets
        | '\u{FF5B}'..='\u{FF65}' // Fullwidth punctuation
    )
}
130
/// Which emphasis kinds are still worth attempting for one delimiter
/// character (`*` or `_`).
///
/// Availability is tracked per delimiter character so that, once an attempt
/// has failed, later occurrences do not trigger the same scan again. This
/// avoids O(n²) re-scanning in both top-level and recursive parse calls.
#[derive(Clone, Copy)]
enum DelimiterAvail {
    /// Bold and italic may both still succeed.
    Both,
    /// Bold has failed; italic may still succeed.
    ItalicOnly,
    /// Italic has failed; bold may still succeed.
    BoldOnly,
    /// Nothing left to try.
    None,
}

impl DelimiterAvail {
    /// `true` while a bold attempt is still allowed.
    const fn can_bold(self) -> bool {
        match self {
            Self::Both | Self::BoldOnly => true,
            Self::ItalicOnly | Self::None => false,
        }
    }

    /// `true` while an italic attempt is still allowed.
    const fn can_italic(self) -> bool {
        match self {
            Self::Both | Self::ItalicOnly => true,
            Self::BoldOnly | Self::None => false,
        }
    }

    /// Record that bold can no longer succeed. Idempotent.
    const fn bold_failed(&mut self) {
        match *self {
            Self::Both => *self = Self::ItalicOnly,
            Self::BoldOnly => *self = Self::None,
            Self::ItalicOnly | Self::None => {}
        }
    }

    /// Record that italic can no longer succeed. Idempotent.
    const fn italic_failed(&mut self) {
        match *self {
            Self::Both => *self = Self::BoldOnly,
            Self::ItalicOnly => *self = Self::None,
            Self::BoldOnly | Self::None => {}
        }
    }

    /// Derive availability from how many delimiter bytes were seen:
    /// fewer than two allows nothing, two or three allow italic only,
    /// and four or more allow both.
    const fn from_count(count: usize) -> Self {
        if count < 2 {
            Self::None
        } else if count < 4 {
            Self::ItalicOnly
        } else {
            Self::Both
        }
    }
}
179
180struct EmphasisState {
181    star: DelimiterAvail,
182    under: DelimiterAvail,
183}
184
185impl EmphasisState {
186    const fn assume_both() -> Self {
187        Self {
188            star: DelimiterAvail::Both,
189            under: DelimiterAvail::Both,
190        }
191    }
192
193    fn from_bytes(bytes: &[u8]) -> Self {
194        static EMPH_SET: ByteSet =
195            ByteSet::new(&[SpecialChar::Asterisk.byte(), SpecialChar::Underscore.byte()]);
196        let mut stars: u8 = 0;
197        let mut unders: u8 = 0;
198        let mut i = 0;
199        while let Some(pos) = find_byte_set(bytes, i, &EMPH_SET) {
200            if bytes[pos] == SpecialChar::Asterisk {
201                stars = stars.saturating_add(1);
202            } else {
203                unders = unders.saturating_add(1);
204            }
205            if stars >= 4 && unders >= 4 {
206                break;
207            }
208            i = pos + 1;
209        }
210        Self {
211            star: DelimiterAvail::from_count(stars as usize),
212            under: DelimiterAvail::from_count(unders as usize),
213        }
214    }
215
216    const fn avail_mut(&mut self, is_star: bool) -> &mut DelimiterAvail {
217        if is_star {
218            &mut self.star
219        } else {
220            &mut self.under
221        }
222    }
223}
224
225/// Stack-allocated buffer for collecting inline elements without heap allocation.
226/// Uses `MaybeUninit` to avoid zeroing the stack array on every parse call.
227/// The capacity `CAP` is configurable via `MarkdownFile`'s `INLINE_STACK_CAP`
228/// const generic — falls back to heap if exceeded.
229struct InlineBuf<'src, const CAP: usize> {
230    stack: [MaybeUninit<Inline<'src>>; CAP],
231    len: usize,
232    overflow: Vec<Inline<'src>>,
233}
234
235impl<'src, const CAP: usize> InlineBuf<'src, CAP> {
236    #[inline]
237    const fn new() -> Self {
238        Self {
239            // SAFETY: An array of MaybeUninit does not require initialization.
240            stack: [const { MaybeUninit::uninit() }; CAP],
241            len: 0,
242            overflow: Vec::new(),
243        }
244    }
245
246    #[allow(clippy::inline_always)]
247    #[inline(always)]
248    fn push(&mut self, item: Inline<'src>) {
249        if self.len < CAP {
250            self.stack[self.len] = MaybeUninit::new(item);
251            self.len += 1;
252        } else {
253            self.push_slow(item);
254        }
255    }
256
257    #[cold]
258    fn push_slow(&mut self, item: Inline<'src>) {
259        if self.overflow.is_empty() {
260            // Spill stack to heap
261            self.overflow = Vec::with_capacity(CAP * 2);
262            // SAFETY: elements 0..self.len were initialized via push.
263            // Use a raw pointer to avoid borrow conflict with self.overflow.
264            let len = self.len;
265            let ptr = self.stack.as_ptr().cast::<Inline>();
266            let slice = unsafe { std::slice::from_raw_parts(ptr, len) };
267            self.overflow.extend_from_slice(slice);
268        }
269        self.overflow.push(item);
270    }
271
272    /// Get initialized stack elements as a slice.
273    #[inline]
274    const fn initialized_stack(&self) -> &[Inline<'src>] {
275        // SAFETY: all elements 0..self.len have been initialized via push.
276        unsafe { std::slice::from_raw_parts(self.stack.as_ptr().cast::<Inline>(), self.len) }
277    }
278
279    #[inline]
280    fn flush_to_pool(self, pool: &mut Vec<Inline<'src>>) -> InlineSpan {
281        let start = pool_offset(pool.len());
282        if self.overflow.is_empty() {
283            pool.extend_from_slice(self.initialized_stack());
284            InlineSpan::new(start, pool_offset(self.len))
285        } else {
286            let len = pool_offset(self.overflow.len());
287            pool.extend(self.overflow);
288            InlineSpan::new(start, len)
289        }
290    }
291}
292
/// Convert a pool length or index to the `u32` used by spans.
///
/// # Panics
///
/// Panics if `pool_len` does not fit in a `u32`, i.e. the pool would hold
/// more than `u32::MAX` inline nodes (unreachable in practice).
#[allow(clippy::inline_always)]
#[inline(always)]
pub fn pool_offset(pool_len: usize) -> u32 {
    let narrowed = u32::try_from(pool_len);
    narrowed.expect("inline pool exceeds u32::MAX elements")
}
300
301impl<'src> Inline<'src> {
302    /// Threshold below which the emphasis pre-scan costs more than it saves.
303    const EMPH_SCAN_THRESHOLD: usize = 256;
304
305    /// Parse inline elements and store them in the pool. Returns a span.
306    ///
307    /// Uses default limits (`MAX_INLINE_DEPTH = 16`, `INLINE_STACK_CAP = 32`).
308    /// For custom limits, use [`crate::MarkdownFile::parse`] with const generics.
309    #[must_use]
310    pub fn parse(input: &'src str, pool: &mut Vec<Self>) -> InlineSpan {
311        Self::parse_configured::<16, 32>(input, pool)
312    }
313
314    /// Parse inline elements with configurable depth and stack limits.
315    pub(crate) fn parse_configured<const MAX_DEPTH: u8, const CAP: usize>(
316        input: &'src str,
317        pool: &mut Vec<Self>,
318    ) -> InlineSpan {
319        let bytes = input.as_bytes();
320        // Fast path: if no special bytes exist, the entire input is plain text.
321        if find_byte_set(bytes, 0, &SPECIAL_SET).is_none() {
322            if input.is_empty() {
323                return InlineSpan::EMPTY;
324            }
325            let start = pool_offset(pool.len());
326            pool.push(Self::Text(input));
327            return InlineSpan::new(start, 1);
328        }
329        let emph = if bytes.len() < Self::EMPH_SCAN_THRESHOLD {
330            EmphasisState::assume_both()
331        } else {
332            EmphasisState::from_bytes(bytes)
333        };
334        Self::parse_with_emph::<MAX_DEPTH, CAP>(input, bytes, emph, pool, 0)
335    }
336
337    fn parse_inner<const MAX_DEPTH: u8, const CAP: usize>(
338        input: &'src str,
339        pool: &mut Vec<Self>,
340        depth: u8,
341    ) -> InlineSpan {
342        let bytes = input.as_bytes();
343        Self::parse_with_emph::<MAX_DEPTH, CAP>(
344            input,
345            bytes,
346            EmphasisState::assume_both(),
347            pool,
348            depth,
349        )
350    }
351
352    fn parse_with_emph<const MAX_DEPTH: u8, const CAP: usize>(
353        input: &'src str,
354        bytes: &[u8],
355        emph: EmphasisState,
356        pool: &mut Vec<Self>,
357        depth: u8,
358    ) -> InlineSpan {
359        let mut buf = InlineBuf::<CAP>::new();
360        Self::parse_into_buf::<MAX_DEPTH, CAP>(input, bytes, emph, pool, &mut buf, depth);
361        buf.flush_to_pool(pool)
362    }
363
364    /// Push parsed inline elements directly into the pool without wrapping
365    /// in a span. Used for blockquote multi-line accumulation where the caller
366    /// manages span boundaries.
367    ///
368    /// Uses default limits (`MAX_INLINE_DEPTH = 16`, `INLINE_STACK_CAP = 32`).
369    /// For custom limits, use [`crate::MarkdownFile::parse`] with const generics.
370    pub fn parse_flat_into(input: &'src str, pool: &mut Vec<Self>) {
371        Self::parse_flat_into_configured::<16, 32>(input, pool);
372    }
373
374    /// Push parsed inline elements with configurable depth and stack limits.
375    pub(crate) fn parse_flat_into_configured<const MAX_DEPTH: u8, const CAP: usize>(
376        input: &'src str,
377        pool: &mut Vec<Self>,
378    ) {
379        let bytes = input.as_bytes();
380        // Fast path: no special bytes means plain text.
381        if find_byte_set(bytes, 0, &SPECIAL_SET).is_none() {
382            if !input.is_empty() {
383                pool.push(Self::Text(input));
384            }
385            return;
386        }
387        let emph = if bytes.len() < Self::EMPH_SCAN_THRESHOLD {
388            EmphasisState::assume_both()
389        } else {
390            EmphasisState::from_bytes(bytes)
391        };
392        // Parse directly into a buf that flushes to pool (flat, no span wrapper).
393        let mut buf = InlineBuf::<CAP>::new();
394        Self::parse_into_buf::<MAX_DEPTH, CAP>(input, bytes, emph, pool, &mut buf, 0);
395        // Flush buf directly to pool (not wrapped in a span).
396        if buf.overflow.is_empty() {
397            pool.extend_from_slice(buf.initialized_stack());
398        } else {
399            pool.extend(buf.overflow);
400        }
401    }
402
403    fn parse_into_buf<const MAX_DEPTH: u8, const CAP: usize>(
404        input: &'src str,
405        bytes: &[u8],
406        mut emph: EmphasisState,
407        pool: &mut Vec<Self>,
408        buf: &mut InlineBuf<'src, CAP>,
409        depth: u8,
410    ) {
411        let mut plain_start = 0;
412        let mut i = 0;
413
414        // SIMD-accelerated scan: find next special byte.
415        while let Some(pos) = find_byte_set(bytes, i, &SPECIAL_SET) {
416            i = pos;
417            let b = bytes[i];
418
419            if b == SpecialChar::Newline {
420                Self::emit_line_break::<CAP>(input, bytes, plain_start, i, buf);
421                plain_start = i + 1;
422                i = plain_start;
423                continue;
424            }
425
426            // Backslash escape: only ASCII punctuation can be escaped (CommonMark spec).
427            // For non-punctuation, the backslash is kept as literal text.
428            if b == SpecialChar::Backslash
429                && let Some(&next) = bytes.get(i + 1)
430                && next.is_ascii_punctuation()
431            {
432                if let Some(text) = input.get(plain_start..i)
433                    && !text.is_empty()
434                {
435                    buf.push(Self::Text(text));
436                }
437                plain_start = i + 1;
438                i += 2;
439                continue;
440            }
441
442            // Inline code: `code` or ``code``
443            if b == SpecialChar::Backtick
444                && let Some((code, end)) = Self::try_parse_inline_code(input, bytes, i)
445            {
446                if let Some(text) = input.get(plain_start..i)
447                    && !text.is_empty()
448                {
449                    buf.push(Self::Text(text));
450                }
451                buf.push(Self::Code(code));
452                plain_start = end;
453                i = end;
454                continue;
455            }
456
457            // Image: ![alt](url "title")
458            if b == SpecialChar::ExclamationMark
459                && bytes.get(i + 1) == SpecialChar::OpenBracket
460                && let Some((alt, url, title, end)) =
461                    Self::try_parse_bracket_paren(input, bytes, i + 1)
462            {
463                if let Some(text) = input.get(plain_start..i)
464                    && !text.is_empty()
465                {
466                    buf.push(Self::Text(text));
467                }
468                buf.push(Self::Image { alt, url, title });
469                plain_start = end;
470                i = end;
471                continue;
472            }
473
474            // Link: [text](url "title")
475            if b == SpecialChar::OpenBracket
476                && let Some((text_str, url, title, end)) =
477                    Self::try_parse_bracket_paren(input, bytes, i)
478            {
479                if let Some(text) = input.get(plain_start..i)
480                    && !text.is_empty()
481                {
482                    buf.push(Self::Text(text));
483                }
484                let text_span =
485                    Self::parse_inner::<MAX_DEPTH, CAP>(text_str, pool, depth.saturating_add(1));
486                buf.push(Self::Link {
487                    text: text_span,
488                    url,
489                    title,
490                });
491                plain_start = end;
492                i = end;
493                continue;
494            }
495
496            // Bold/Italic: ** __ * _
497            if let Some((elem, end)) = Self::try_parse_emphasis::<MAX_DEPTH, CAP>(
498                input, bytes, i, b, &mut emph, pool, depth,
499            ) {
500                if let Some(text) = input.get(plain_start..i)
501                    && !text.is_empty()
502                {
503                    buf.push(Self::Text(text));
504                }
505                buf.push(elem);
506                plain_start = end;
507                i = end;
508                continue;
509            }
510
511            i += 1;
512        }
513
514        if let Some(text) = input.get(plain_start..)
515            && !text.is_empty()
516        {
517            buf.push(Self::Text(text));
518        }
519    }
520
521    /// Emit a hard or soft line break at a newline position.
522    /// Hard break if preceded by trailing `\` or 2+ spaces; soft break otherwise.
523    #[inline]
524    fn emit_line_break<const CAP: usize>(
525        input: &'src str,
526        bytes: &[u8],
527        plain_start: usize,
528        newline_pos: usize,
529        buf: &mut InlineBuf<'src, CAP>,
530    ) {
531        let preceding = bytes.get(plain_start..newline_pos).unwrap_or_default();
532        let (trim_end, is_hard) = if preceding.last() == SpecialChar::Backslash {
533            (newline_pos - 1, true)
534        } else {
535            // Count trailing spaces with a simple backward loop.
536            let mut spaces = 0;
537            let mut j = preceding.len();
538            while j > 0 && preceding[j - 1] == SpecialChar::Space {
539                spaces += 1;
540                j -= 1;
541            }
542            if spaces >= 2 {
543                (newline_pos - spaces, true)
544            } else {
545                (newline_pos, false)
546            }
547        };
548        if let Some(text) = input.get(plain_start..trim_end)
549            && !text.is_empty()
550        {
551            buf.push(Self::Text(text));
552        }
553        buf.push(if is_hard {
554            Self::HardBreak
555        } else {
556            Self::SoftBreak
557        });
558    }
559
560    #[inline]
561    fn try_parse_emphasis<const MAX_DEPTH: u8, const CAP: usize>(
562        input: &'src str,
563        bytes: &[u8],
564        i: usize,
565        b: u8,
566        emph: &mut EmphasisState,
567        pool: &mut Vec<Self>,
568        depth: u8,
569    ) -> Option<(Self, usize)> {
570        let is_star = b == SpecialChar::Asterisk;
571        if !is_star && b != SpecialChar::Underscore {
572            return None;
573        }
574        // Depth limit: treat as plain text to prevent stack overflow.
575        if depth >= MAX_DEPTH {
576            return None;
577        }
578        let avail = emph.avail_mut(is_star);
579
580        // Bold: ** or __
581        if avail.can_bold() && bytes.get(i + 1) == Some(&b) {
582            if let Some((inner, end)) = Self::try_parse_delimited(input, bytes, i, b, 2) {
583                let span = Self::parse_inner::<MAX_DEPTH, CAP>(inner, pool, depth + 1);
584                return Some((Self::Bold(span), end));
585            }
586            avail.bold_failed();
587        }
588
589        // Italic: * or _
590        if avail.can_italic() {
591            if let Some((inner, end)) = Self::try_parse_delimited(input, bytes, i, b, 1) {
592                let span = Self::parse_inner::<MAX_DEPTH, CAP>(inner, pool, depth + 1);
593                return Some((Self::Italic(span), end));
594            }
595            avail.italic_failed();
596        }
597
598        None
599    }
600
601    /// Find the position of a matching closing delimiter, handling backslash
602    /// escapes and nested pairs.
603    fn find_matching_close(
604        bytes: &[u8],
605        start: usize,
606        open: SpecialChar,
607        close: SpecialChar,
608    ) -> Option<usize> {
609        // Select pre-computed static ByteSet instead of building one each call.
610        let set = if open == SpecialChar::OpenBracket {
611            &BRACKET_CLOSE_SET
612        } else {
613            &PAREN_CLOSE_SET
614        };
615        let mut depth = 0u32;
616        let mut j = start;
617        loop {
618            let pos = find_byte_set(bytes, j, set)?;
619            let b = bytes[pos];
620            if b == SpecialChar::Backslash
621                && bytes.get(pos + 1).is_some_and(u8::is_ascii_punctuation)
622            {
623                j = pos + 2;
624                continue;
625            }
626            if b == open {
627                depth += 1;
628            } else if b == close {
629                if depth == 0 {
630                    return Some(pos);
631                }
632                depth -= 1;
633            }
634            j = pos + 1;
635        }
636    }
637
638    fn try_parse_bracket_paren(
639        input: &'src str,
640        bytes: &[u8],
641        start: usize,
642    ) -> Option<(&'src str, &'src str, Option<&'src str>, usize)> {
643        if bytes.get(start) != SpecialChar::OpenBracket {
644            return None;
645        }
646
647        let bracket_start = start + 1;
648        let bracket_end = Self::find_matching_close(
649            bytes,
650            bracket_start,
651            SpecialChar::OpenBracket,
652            SpecialChar::CloseBracket,
653        )?;
654
655        let paren_pos = bracket_end + 1;
656        if bytes.get(paren_pos) != SpecialChar::OpenParen {
657            return None;
658        }
659
660        let paren_start = paren_pos + 1;
661        let paren_end = Self::find_matching_close(
662            bytes,
663            paren_start,
664            SpecialChar::OpenParen,
665            SpecialChar::CloseParen,
666        )?;
667
668        let paren_content = input.get(paren_start..paren_end)?;
669        let (url, title) = Self::split_url_title(paren_content);
670
671        Some((
672            input.get(bracket_start..bracket_end)?,
673            url,
674            title,
675            paren_end + 1,
676        ))
677    }
678
679    /// Split the content inside `(...)` into a URL and optional title
680    /// (`CommonMark` §6.3).
681    ///
682    /// Titles are delimited by `"..."`, `'...'`, or `(...)`.
683    ///
684    /// We scan **backwards** because the title, if present, is always at the
685    /// end. The algorithm:
686    ///  1. Check the last byte for a closing title delimiter (`"`, `'`, `)`).
687    ///  2. Walk backwards to find the matching opener.
688    ///  3. The opener must be preceded by whitespace — this separates the URL
689    ///     from the title. If no whitespace is found, there is no title.
690    ///  4. For **paired** delimiters (`(…)`), if the first candidate opener
691    ///     lacks preceding whitespace we keep scanning for an earlier `(`
692    ///     that does. For **same-char** delimiters (`"…"`, `'…'`), the first
693    ///     match is the only candidate (no nesting possible).
694    fn split_url_title(content: &'src str) -> (&'src str, Option<&'src str>) {
695        let trimmed = content.trim();
696        // A valid title needs at minimum: url, space, open+close quotes (e.g. `u "t"`).
697        // With fewer than 3 bytes the backward scan would underflow.
698        if trimmed.len() < 3 {
699            return (trimmed, None);
700        }
701
702        let bytes = trimmed.as_bytes();
703        let last = bytes[bytes.len() - 1];
704        let (open, close) = match SpecialChar::from_byte(last) {
705            Some(SpecialChar::DoubleQuote) => (SpecialChar::DoubleQuote, SpecialChar::DoubleQuote),
706            Some(SpecialChar::SingleQuote) => (SpecialChar::SingleQuote, SpecialChar::SingleQuote),
707            Some(SpecialChar::CloseParen) => (SpecialChar::OpenParen, SpecialChar::CloseParen),
708            // No trailing title delimiter — the entire content is the URL.
709            _ => return (trimmed, None),
710        };
711
712        // Scan backwards for the matching opening delimiter.
713        let mut j = bytes.len() - 2;
714        loop {
715            if bytes[j] == open {
716                // Whitespace before the opener separates URL from title.
717                if j > 0 && bytes[j - 1].is_ascii_whitespace() {
718                    let url = trimmed.get(..j).unwrap_or(trimmed).trim_end();
719                    let title = trimmed.get(j + 1..bytes.len() - 1).unwrap_or("");
720                    return (url, Some(title));
721                }
722                // For paired delimiters (open != close), keep scanning for an
723                // earlier opener that *does* have preceding whitespace.
724                if open != close {
725                    if j == 0 {
726                        break;
727                    }
728                    j -= 1;
729                    continue;
730                }
731                // Same-char delimiter: first match is the only candidate.
732                break;
733            }
734            if j == 0 {
735                break;
736            }
737            j -= 1;
738        }
739
740        // No valid title found — treat entire content as URL.
741        (trimmed, None)
742    }
743
744    #[inline]
745    /// Classify the character before a position for flanking delimiter rules.
746    /// Returns `CharClass::Whitespace` at start-of-input (treated as if preceded by newline).
747    fn char_class_before(bytes: &[u8], pos: usize) -> CharClass {
748        if pos == 0 {
749            return CharClass::Whitespace;
750        }
751        let b = bytes[pos - 1];
752        // Fast path: ASCII bytes need no UTF-8 decoding.
753        if b < 0x80 {
754            return CharClass::of_ascii(b);
755        }
756        // Walk back to find UTF-8 codepoint start.
757        let mut start = pos - 1;
758        while start > 0 && bytes[start] & 0xC0 == 0x80 {
759            start -= 1;
760        }
761        let ch = std::str::from_utf8(&bytes[start..pos])
762            .ok()
763            .and_then(|s| s.chars().next())
764            .unwrap_or(' ');
765        CharClass::of(ch)
766    }
767
768    #[inline]
769    /// Classify the character after a position for flanking delimiter rules.
770    /// Returns `CharClass::Whitespace` at end-of-input (treated as if followed by newline).
771    fn char_class_after(bytes: &[u8], pos: usize) -> CharClass {
772        if pos >= bytes.len() {
773            return CharClass::Whitespace;
774        }
775        let b = bytes[pos];
776        // Fast path: ASCII bytes need no UTF-8 decoding.
777        if b < 0x80 {
778            return CharClass::of_ascii(b);
779        }
780        // Decode the UTF-8 codepoint starting at `pos`.
781        let ch = std::str::from_utf8(&bytes[pos..])
782            .ok()
783            .and_then(|s| s.chars().next())
784            .unwrap_or(' ');
785        CharClass::of(ch)
786    }
787
788    fn try_parse_delimited(
789        input: &'src str,
790        bytes: &[u8],
791        start: usize,
792        marker: u8,
793        count: usize,
794    ) -> Option<(&'src str, usize)> {
795        let inner_start = start + count;
796        bytes.get(inner_start)?;
797
798        let is_star = marker == SpecialChar::Asterisk;
799
800        // CommonMark §6.2 — emphasis flanking rules:
801        // A left-flanking delimiter run must not be followed by whitespace,
802        // and must not be followed by punctuation unless preceded by whitespace
803        // or punctuation. For `_`, it must also not be right-flanking (unless
804        // preceded by punctuation), preventing intra-word emphasis.
805        let before_open = Self::char_class_before(bytes, start);
806        let after_open = Self::char_class_after(bytes, inner_start);
807
808        let left_flanking = after_open != CharClass::Whitespace
809            && (after_open != CharClass::Punctuation || before_open != CharClass::Other);
810        if !left_flanking {
811            return None;
812        }
813        if !is_star {
814            // _ can open only if left-flanking AND (not right-flanking OR preceded by punctuation)
815            let right_flanking_open = before_open != CharClass::Whitespace
816                && (before_open != CharClass::Punctuation || after_open != CharClass::Other);
817            if right_flanking_open && before_open != CharClass::Punctuation {
818                return None;
819            }
820        }
821
822        // Select pre-computed static ByteSet instead of building one each call.
823        let delim_set = if is_star {
824            &STAR_DELIM_SET
825        } else {
826            &UNDER_DELIM_SET
827        };
828
829        let mut i = inner_start;
830        while let Some(pos) = find_byte_set(bytes, i, delim_set) {
831            i = pos;
832            let b = bytes[i];
833
834            if b == SpecialChar::Backslash && bytes.get(i + 1).is_some_and(u8::is_ascii_punctuation)
835            {
836                i += 2;
837                continue;
838            }
839
840            if b != marker {
841                i += 1;
842                continue;
843            }
844
845            // Found a marker byte — check for a valid closing run.
846            let all_match = (1..count).all(|j| bytes.get(i + j) == Some(&marker));
847            if !all_match {
848                i += 1;
849                continue;
850            }
851
852            let close_end = i + count;
853            let before_close = Self::char_class_before(bytes, i);
854            let after_close = Self::char_class_after(bytes, close_end);
855
856            // CommonMark §6.2 — closing delimiter must be right-flanking:
857            // not preceded by whitespace, and not preceded by punctuation
858            // unless followed by whitespace or punctuation. For `_`, must
859            // also not be left-flanking (unless followed by punctuation).
860            let right_flanking = before_close != CharClass::Whitespace
861                && (before_close != CharClass::Punctuation || after_close != CharClass::Other);
862            if !right_flanking {
863                i += 1;
864                continue;
865            }
866            if !is_star {
867                // _ can close only if right-flanking AND (not left-flanking OR followed by punctuation)
868                let left_flanking_close = after_close != CharClass::Whitespace
869                    && (after_close != CharClass::Punctuation || before_close != CharClass::Other);
870                if left_flanking_close && after_close != CharClass::Punctuation {
871                    i += 1;
872                    continue;
873                }
874            }
875
876            return Some((input.get(inner_start..i)?, close_end));
877        }
878
879        None
880    }
881
882    /// Parse inline code spans (`CommonMark` §6.1).
883    /// The opening and closing backtick sequences must have the same length.
884    /// Content is taken verbatim (no backslash escaping inside code spans).
885    fn try_parse_inline_code(
886        input: &'src str,
887        bytes: &[u8],
888        start: usize,
889    ) -> Option<(&'src str, usize)> {
890        let backtick_count = count_leading_byte(&bytes[start..], SpecialChar::Backtick.byte());
891        if backtick_count == 0 {
892            return None;
893        }
894
895        let content_start = start + backtick_count;
896        let mut i = content_start;
897        while i < bytes.len() {
898            // SIMD-accelerated backtick scan.
899            i = find_byte(bytes, i, SpecialChar::Backtick.byte())?;
900
901            // Count consecutive backticks
902            let close_count = count_leading_byte(&bytes[i..], SpecialChar::Backtick.byte());
903
904            if close_count == backtick_count {
905                // CommonMark §6.1: strip one leading and one trailing space
906                // when the content both starts and ends with a space.
907                let mut cs = content_start;
908                let mut ce = i;
909                if ce - cs >= 2
910                    && bytes.get(cs) == SpecialChar::Space
911                    && bytes.get(ce - 1) == SpecialChar::Space
912                {
913                    cs += 1;
914                    ce -= 1;
915                }
916                return Some((input.get(cs..ce)?, i + close_count));
917            }
918            i += close_count;
919        }
920
921        None
922    }
923}
marki_parse/inline.rs

marki_parse/
inline.rs