marki_parse/
inline.rs

1use std::mem::MaybeUninit;
2
3use crate::SpecialChar;
4use crate::section::InlineSpan;
5use crate::simd::{ByteSet, find_byte, find_byte_set};
6
/// Length of the run of `needle` bytes that `bytes` begins with.
///
/// Returns 0 when `bytes` is empty or its first byte is not `needle`.
/// Deliberately a scalar walk: the runs measured here (inline-code backticks)
/// are typically 1-3 bytes long, too short for SIMD to pay off.
#[inline]
fn count_leading_byte(bytes: &[u8], needle: u8) -> usize {
    bytes.iter().take_while(|&&byte| byte == needle).count()
}
17
/// An inline element within a Markdown block.
///
/// String payloads borrow from the parsed source (`'src`). Nested content is
/// not owned by the element: it is referenced through an [`InlineSpan`]
/// (start offset + length) into the shared element pool filled by the parser.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Inline<'src> {
    /// Literal text, sliced directly from the source.
    Text(&'src str),
    /// Strong emphasis (`**…**` / `__…__`); the span holds the parsed children.
    Bold(InlineSpan),
    /// Emphasis (`*…*` / `_…_`); the span holds the parsed children.
    Italic(InlineSpan),
    /// `[text](url "title")`.
    Link {
        /// Parsed inline children of the bracketed link text.
        text: InlineSpan,
        /// Destination, as split off by `split_url_title`.
        url: &'src str,
        /// Optional title with its delimiters removed.
        title: Option<&'src str>,
    },
    /// `![alt](url "title")`.
    Image {
        /// Raw bracket content; unlike link text it is not parsed further.
        alt: &'src str,
        /// Destination, as split off by `split_url_title`.
        url: &'src str,
        /// Optional title with its delimiters removed.
        title: Option<&'src str>,
    },
    /// Inline code span content (verbatim, minus one optional padding space
    /// on each side).
    Code(&'src str),
    /// Newline that is not a hard break.
    SoftBreak,
    /// Newline preceded by a trailing `\` or by two or more spaces.
    HardBreak,
}
38
/// Wraps a string slice as [`Inline::Text`] verbatim; no Markdown parsing is
/// performed on it.
impl<'src> From<&'src str> for Inline<'src> {
    fn from(s: &'src str) -> Self {
        Self::Text(s)
    }
}
44
/// SIMD-accelerated byte set for inline special characters.
///
/// These are the only bytes at which `parse_into_buf` stops to try a
/// construct; everything between two hits is emitted as plain text.
static SPECIAL_SET: ByteSet = ByteSet::new(&[
    SpecialChar::Newline.byte(),
    SpecialChar::Asterisk.byte(),
    SpecialChar::Underscore.byte(),
    SpecialChar::OpenBracket.byte(),
    SpecialChar::ExclamationMark.byte(),
    SpecialChar::Backslash.byte(),
    SpecialChar::Backtick.byte(),
]);

/// Pre-computed byte sets for `find_matching_close` — avoids rebuilding
/// the 256-byte lookup table on every call.
///
/// Each set holds the opener, the closer, and the backslash (so escaped
/// delimiters can be skipped).
static BRACKET_CLOSE_SET: ByteSet = ByteSet::new(&[
    SpecialChar::OpenBracket.byte(),
    SpecialChar::CloseBracket.byte(),
    SpecialChar::Backslash.byte(),
]);
/// Parenthesis counterpart of [`BRACKET_CLOSE_SET`].
static PAREN_CLOSE_SET: ByteSet = ByteSet::new(&[
    SpecialChar::OpenParen.byte(),
    SpecialChar::CloseParen.byte(),
    SpecialChar::Backslash.byte(),
]);

/// Pre-computed byte sets for `try_parse_delimited` — one per delimiter type.
///
/// Each pairs the delimiter with the backslash so the closer search can step
/// over escaped delimiters.
static STAR_DELIM_SET: ByteSet =
    ByteSet::new(&[SpecialChar::Asterisk.byte(), SpecialChar::Backslash.byte()]);
/// Underscore counterpart of [`STAR_DELIM_SET`].
static UNDER_DELIM_SET: ByteSet = ByteSet::new(&[
    SpecialChar::Underscore.byte(),
    SpecialChar::Backslash.byte(),
]);
76
77/// Character classification for `CommonMark` emphasis flanking rules.
78#[derive(Clone, Copy, PartialEq, Eq)]
79enum CharClass {
80    Whitespace,
81    Punctuation,
82    Other,
83}
84
85impl CharClass {
86    const fn of(ch: char) -> Self {
87        if ch.is_whitespace() {
88            Self::Whitespace
89        } else if ch.is_ascii_punctuation() || unicode_punctuation(ch) {
90            Self::Punctuation
91        } else {
92            Self::Other
93        }
94    }
95
96    /// Fast classification for ASCII bytes, avoiding UTF-8 decode.
97    #[inline]
98    const fn of_ascii(b: u8) -> Self {
99        if b.is_ascii_whitespace() {
100            Self::Whitespace
101        } else if b.is_ascii_punctuation() {
102            Self::Punctuation
103        } else {
104            Self::Other
105        }
106    }
107}
108
/// Check if a character is Unicode punctuation (general categories P or S)
/// beyond ASCII punctuation. Covers the most common cases without a dependency.
/// A full implementation would use unicode-general-category crate.
///
/// Always returns `false` for ASCII input; callers test ASCII punctuation
/// separately.
///
/// The Latin-1 block is enumerated exactly rather than as one range: it
/// interleaves punctuation/symbols with letters (`ª`, `µ`, `º`), numbers
/// (`²`, `³`, `¹`, `¼`–`¾`) and the soft hyphen, and treating those as
/// punctuation would break emphasis flanking next to them (e.g. `5*µm*`).
/// The remaining blocks are still approximated by range.
const fn unicode_punctuation(ch: char) -> bool {
    if ch.is_ascii() {
        return false;
    }
    matches!(ch,
        '\u{00A1}'..='\u{00A9}' // ¡ ¢ £ ¤ ¥ ¦ § ¨ ©
        | '\u{00AB}' | '\u{00AC}' // « ¬
        | '\u{00AE}'..='\u{00B1}' // ® ¯ ° ±
        | '\u{00B4}' // ´
        | '\u{00B6}'..='\u{00B8}' // ¶ · ¸
        | '\u{00BB}' | '\u{00BF}' // » ¿
        | '\u{00D7}' | '\u{00F7}' // × ÷
        | '\u{2010}'..='\u{2027}' // General punctuation (dashes, quotes, etc.)
        | '\u{2030}'..='\u{205E}' // More general punctuation
        | '\u{2190}'..='\u{23FF}' // Arrows, math operators, misc technical
        | '\u{2500}'..='\u{2BFF}' // Box drawing, block elements, symbols
        | '\u{3000}'..='\u{303F}' // CJK symbols and punctuation
        | '\u{FE30}'..='\u{FE6F}' // CJK compatibility forms, small forms
        | '\u{FF01}'..='\u{FF0F}' // Fullwidth punctuation
        | '\u{FF1A}'..='\u{FF20}' // More fullwidth punctuation
        | '\u{FF3B}'..='\u{FF40}' // Fullwidth brackets
        | '\u{FF5B}'..='\u{FF65}' // Fullwidth punctuation
    )
}
130
/// Which emphasis kinds can still succeed for one delimiter character.
///
/// Remembering earlier failures lets the parser skip attempts that are known
/// to be hopeless, avoiding O(n²) re-scanning in both top-level and recursive
/// parse calls.
#[derive(Clone, Copy)]
enum DelimiterAvail {
    /// Bold and italic may both still be found.
    Both,
    /// Bold has been ruled out; italic may still be found.
    ItalicOnly,
    /// Italic has been ruled out; bold may still be found.
    BoldOnly,
    /// Both kinds have been ruled out.
    None,
}

impl DelimiterAvail {
    /// `true` while bold has not been ruled out.
    const fn can_bold(self) -> bool {
        match self {
            Self::Both | Self::BoldOnly => true,
            Self::ItalicOnly | Self::None => false,
        }
    }

    /// `true` while italic has not been ruled out.
    const fn can_italic(self) -> bool {
        match self {
            Self::Both | Self::ItalicOnly => true,
            Self::BoldOnly | Self::None => false,
        }
    }

    /// Record that a bold attempt failed.
    const fn bold_failed(&mut self) {
        match *self {
            Self::Both => *self = Self::ItalicOnly,
            Self::BoldOnly => *self = Self::None,
            Self::ItalicOnly | Self::None => {}
        }
    }

    /// Record that an italic attempt failed.
    const fn italic_failed(&mut self) {
        match *self {
            Self::Both => *self = Self::BoldOnly,
            Self::ItalicOnly => *self = Self::None,
            Self::BoldOnly | Self::None => {}
        }
    }

    /// Derive availability from how many delimiter bytes the input contains:
    /// italic needs at least 2 (open + close), bold at least 4.
    const fn from_count(count: usize) -> Self {
        if count < 2 {
            Self::None
        } else if count < 4 {
            Self::ItalicOnly
        } else {
            Self::Both
        }
    }
}
179
180struct EmphasisState {
181    star: DelimiterAvail,
182    under: DelimiterAvail,
183}
184
185impl EmphasisState {
186    const fn assume_both() -> Self {
187        Self {
188            star: DelimiterAvail::Both,
189            under: DelimiterAvail::Both,
190        }
191    }
192
193    fn from_bytes(bytes: &[u8]) -> Self {
194        static EMPH_SET: ByteSet =
195            ByteSet::new(&[SpecialChar::Asterisk.byte(), SpecialChar::Underscore.byte()]);
196        let mut stars: u8 = 0;
197        let mut unders: u8 = 0;
198        let mut i = 0;
199        loop {
200            let Some(pos) = find_byte_set(bytes, i, &EMPH_SET) else {
201                break;
202            };
203            if bytes[pos] == SpecialChar::Asterisk {
204                stars = stars.saturating_add(1);
205            } else {
206                unders = unders.saturating_add(1);
207            }
208            if stars >= 4 && unders >= 4 {
209                break;
210            }
211            i = pos + 1;
212        }
213        Self {
214            star: DelimiterAvail::from_count(stars as usize),
215            under: DelimiterAvail::from_count(unders as usize),
216        }
217    }
218
219    const fn avail_mut(&mut self, is_star: bool) -> &mut DelimiterAvail {
220        if is_star {
221            &mut self.star
222        } else {
223            &mut self.under
224        }
225    }
226}
227
228/// Stack-allocated buffer for collecting inline elements without heap allocation.
229/// Uses `MaybeUninit` to avoid zeroing the stack array on every parse call.
230/// The capacity `CAP` is configurable via `MarkdownFile`'s `INLINE_STACK_CAP`
231/// const generic — falls back to heap if exceeded.
232struct InlineBuf<'src, const CAP: usize> {
233    stack: [MaybeUninit<Inline<'src>>; CAP],
234    len: usize,
235    overflow: Vec<Inline<'src>>,
236}
237
238impl<'src, const CAP: usize> InlineBuf<'src, CAP> {
239    #[inline]
240    const fn new() -> Self {
241        Self {
242            // SAFETY: An array of MaybeUninit does not require initialization.
243            stack: [const { MaybeUninit::uninit() }; CAP],
244            len: 0,
245            overflow: Vec::new(),
246        }
247    }
248
249    #[allow(clippy::inline_always)]
250    #[inline(always)]
251    fn push(&mut self, item: Inline<'src>) {
252        if self.len < CAP {
253            self.stack[self.len] = MaybeUninit::new(item);
254            self.len += 1;
255        } else {
256            self.push_slow(item);
257        }
258    }
259
260    #[cold]
261    fn push_slow(&mut self, item: Inline<'src>) {
262        if self.overflow.is_empty() {
263            // Spill stack to heap
264            self.overflow = Vec::with_capacity(CAP * 2);
265            // SAFETY: elements 0..self.len were initialized via push.
266            // Use a raw pointer to avoid borrow conflict with self.overflow.
267            let len = self.len;
268            let ptr = self.stack.as_ptr().cast::<Inline>();
269            let slice = unsafe { std::slice::from_raw_parts(ptr, len) };
270            self.overflow.extend_from_slice(slice);
271        }
272        self.overflow.push(item);
273    }
274
275    /// Get initialized stack elements as a slice.
276    #[inline]
277    const fn initialized_stack(&self) -> &[Inline<'src>] {
278        // SAFETY: all elements 0..self.len have been initialized via push.
279        unsafe { std::slice::from_raw_parts(self.stack.as_ptr().cast::<Inline>(), self.len) }
280    }
281
282    #[inline]
283    fn flush_to_pool(self, pool: &mut Vec<Inline<'src>>) -> InlineSpan {
284        let start = pool_offset(pool.len());
285        if self.overflow.is_empty() {
286            pool.extend_from_slice(self.initialized_stack());
287            InlineSpan::new(start, pool_offset(self.len))
288        } else {
289            let len = pool_offset(self.overflow.len());
290            pool.extend(self.overflow);
291            InlineSpan::new(start, len)
292        }
293    }
294}
295
/// Convert a pool length/index to the `u32` used by `InlineSpan`.
///
/// # Panics
///
/// Panics if `pool_len` does not fit in a `u32` (unreachable in practice —
/// that would require billions of inline nodes).
#[allow(clippy::inline_always)]
#[inline(always)]
pub fn pool_offset(pool_len: usize) -> u32 {
    let narrowed = u32::try_from(pool_len);
    narrowed.expect("inline pool exceeds u32::MAX elements")
}
303
304impl<'src> Inline<'src> {
    /// Threshold below which the emphasis pre-scan costs more than it saves.
    ///
    /// Inputs shorter than this many bytes skip `EmphasisState::from_bytes`
    /// and start from `EmphasisState::assume_both` instead.
    const EMPH_SCAN_THRESHOLD: usize = 256;

    /// Parse inline elements and store them in the pool. Returns a span.
    ///
    /// The returned span covers this call's top-level elements; children of
    /// nested constructs are appended to `pool` as well and are referenced by
    /// the spans stored inside their parent elements.
    ///
    /// Uses default limits (`MAX_INLINE_DEPTH = 16`, `INLINE_STACK_CAP = 32`).
    /// For custom limits, use [`crate::MarkdownFile::parse`] with const generics.
    #[must_use]
    pub fn parse(input: &'src str, pool: &mut Vec<Self>) -> InlineSpan {
        Self::parse_configured::<16, 32>(input, pool)
    }
316
317    /// Parse inline elements with configurable depth and stack limits.
318    pub(crate) fn parse_configured<const MAX_DEPTH: u8, const CAP: usize>(
319        input: &'src str,
320        pool: &mut Vec<Self>,
321    ) -> InlineSpan {
322        let bytes = input.as_bytes();
323        // Fast path: if no special bytes exist, the entire input is plain text.
324        if find_byte_set(bytes, 0, &SPECIAL_SET).is_none() {
325            if input.is_empty() {
326                return InlineSpan::EMPTY;
327            }
328            let start = pool_offset(pool.len());
329            pool.push(Self::Text(input));
330            return InlineSpan::new(start, 1);
331        }
332        let emph = if bytes.len() < Self::EMPH_SCAN_THRESHOLD {
333            EmphasisState::assume_both()
334        } else {
335            EmphasisState::from_bytes(bytes)
336        };
337        Self::parse_with_emph::<MAX_DEPTH, CAP>(input, bytes, emph, pool, 0)
338    }
339
    /// Parse `input` as nested inline content at recursion level `depth`.
    ///
    /// Always starts from `EmphasisState::assume_both` (no emphasis pre-scan)
    /// and returns the span of the elements appended to `pool` for this level.
    fn parse_inner<const MAX_DEPTH: u8, const CAP: usize>(
        input: &'src str,
        pool: &mut Vec<Self>,
        depth: u8,
    ) -> InlineSpan {
        let bytes = input.as_bytes();
        Self::parse_with_emph::<MAX_DEPTH, CAP>(
            input,
            bytes,
            EmphasisState::assume_both(),
            pool,
            depth,
        )
    }
354
    /// Parse one level of inline content into a fresh `InlineBuf`, then append
    /// the collected elements to `pool` as one contiguous span.
    ///
    /// Elements produced by nested parses are appended to `pool` while this
    /// level is still being collected, so this level's own elements end up
    /// after its descendants' and stay contiguous.
    fn parse_with_emph<const MAX_DEPTH: u8, const CAP: usize>(
        input: &'src str,
        bytes: &[u8],
        emph: EmphasisState,
        pool: &mut Vec<Self>,
        depth: u8,
    ) -> InlineSpan {
        let mut buf = InlineBuf::<CAP>::new();
        Self::parse_into_buf::<MAX_DEPTH, CAP>(input, bytes, emph, pool, &mut buf, depth);
        buf.flush_to_pool(pool)
    }
366
    /// Push parsed inline elements directly into the pool without wrapping
    /// in a span. Used for blockquote multi-line accumulation where the caller
    /// manages span boundaries.
    ///
    /// Nothing is returned: the caller is expected to derive the span itself,
    /// e.g. from `pool.len()` before and after the call.
    ///
    /// Uses default limits (`MAX_INLINE_DEPTH = 16`, `INLINE_STACK_CAP = 32`).
    /// For custom limits, use [`crate::MarkdownFile::parse`] with const generics.
    pub fn parse_flat_into(input: &'src str, pool: &mut Vec<Self>) {
        Self::parse_flat_into_configured::<16, 32>(input, pool);
    }
376
377    /// Push parsed inline elements with configurable depth and stack limits.
378    pub(crate) fn parse_flat_into_configured<const MAX_DEPTH: u8, const CAP: usize>(
379        input: &'src str,
380        pool: &mut Vec<Self>,
381    ) {
382        let bytes = input.as_bytes();
383        // Fast path: no special bytes means plain text.
384        if find_byte_set(bytes, 0, &SPECIAL_SET).is_none() {
385            if !input.is_empty() {
386                pool.push(Self::Text(input));
387            }
388            return;
389        }
390        let emph = if bytes.len() < Self::EMPH_SCAN_THRESHOLD {
391            EmphasisState::assume_both()
392        } else {
393            EmphasisState::from_bytes(bytes)
394        };
395        // Parse directly into a buf that flushes to pool (flat, no span wrapper).
396        let mut buf = InlineBuf::<CAP>::new();
397        Self::parse_into_buf::<MAX_DEPTH, CAP>(input, bytes, emph, pool, &mut buf, 0);
398        // Flush buf directly to pool (not wrapped in a span).
399        if buf.overflow.is_empty() {
400            pool.extend_from_slice(buf.initialized_stack());
401        } else {
402            pool.extend(buf.overflow);
403        }
404    }
405
406    fn parse_into_buf<const MAX_DEPTH: u8, const CAP: usize>(
407        input: &'src str,
408        bytes: &[u8],
409        mut emph: EmphasisState,
410        pool: &mut Vec<Self>,
411        buf: &mut InlineBuf<'src, CAP>,
412        depth: u8,
413    ) {
414        let mut plain_start = 0;
415        let mut i = 0;
416
417        // SIMD-accelerated scan: find next special byte.
418        while let Some(pos) = find_byte_set(bytes, i, &SPECIAL_SET) {
419            i = pos;
420            let b = bytes[i];
421
422            if b == SpecialChar::Newline {
423                Self::emit_line_break::<CAP>(input, bytes, plain_start, i, buf);
424                plain_start = i + 1;
425                i = plain_start;
426                continue;
427            }
428
429            // Backslash escape: only ASCII punctuation can be escaped (CommonMark spec).
430            // For non-punctuation, the backslash is kept as literal text.
431            if b == SpecialChar::Backslash
432                && let Some(&next) = bytes.get(i + 1)
433                && next.is_ascii_punctuation()
434            {
435                if let Some(text) = input.get(plain_start..i)
436                    && !text.is_empty()
437                {
438                    buf.push(Self::Text(text));
439                }
440                plain_start = i + 1;
441                i += 2;
442                continue;
443            }
444
445            // Inline code: `code` or ``code``
446            if b == SpecialChar::Backtick
447                && let Some((code, end)) = Self::try_parse_inline_code(input, bytes, i)
448            {
449                if let Some(text) = input.get(plain_start..i)
450                    && !text.is_empty()
451                {
452                    buf.push(Self::Text(text));
453                }
454                buf.push(Self::Code(code));
455                plain_start = end;
456                i = end;
457                continue;
458            }
459
460            // Image: ![alt](url "title")
461            if b == SpecialChar::ExclamationMark
462                && bytes.get(i + 1) == SpecialChar::OpenBracket
463                && let Some((alt, url, title, end)) =
464                    Self::try_parse_bracket_paren(input, bytes, i + 1)
465            {
466                if let Some(text) = input.get(plain_start..i)
467                    && !text.is_empty()
468                {
469                    buf.push(Self::Text(text));
470                }
471                buf.push(Self::Image { alt, url, title });
472                plain_start = end;
473                i = end;
474                continue;
475            }
476
477            // Link: [text](url "title")
478            if b == SpecialChar::OpenBracket
479                && let Some((text_str, url, title, end)) =
480                    Self::try_parse_bracket_paren(input, bytes, i)
481            {
482                if let Some(text) = input.get(plain_start..i)
483                    && !text.is_empty()
484                {
485                    buf.push(Self::Text(text));
486                }
487                let text_span =
488                    Self::parse_inner::<MAX_DEPTH, CAP>(text_str, pool, depth.saturating_add(1));
489                buf.push(Self::Link {
490                    text: text_span,
491                    url,
492                    title,
493                });
494                plain_start = end;
495                i = end;
496                continue;
497            }
498
499            // Bold/Italic: ** __ * _
500            if let Some((elem, end)) = Self::try_parse_emphasis::<MAX_DEPTH, CAP>(
501                input, bytes, i, b, &mut emph, pool, depth,
502            ) {
503                if let Some(text) = input.get(plain_start..i)
504                    && !text.is_empty()
505                {
506                    buf.push(Self::Text(text));
507                }
508                buf.push(elem);
509                plain_start = end;
510                i = end;
511                continue;
512            }
513
514            i += 1;
515        }
516
517        if let Some(text) = input.get(plain_start..)
518            && !text.is_empty()
519        {
520            buf.push(Self::Text(text));
521        }
522    }
523
524    /// Emit a hard or soft line break at a newline position.
525    /// Hard break if preceded by trailing `\` or 2+ spaces; soft break otherwise.
526    #[inline]
527    fn emit_line_break<const CAP: usize>(
528        input: &'src str,
529        bytes: &[u8],
530        plain_start: usize,
531        newline_pos: usize,
532        buf: &mut InlineBuf<'src, CAP>,
533    ) {
534        let preceding = bytes.get(plain_start..newline_pos).unwrap_or_default();
535        let (trim_end, is_hard) = if preceding.last() == SpecialChar::Backslash {
536            (newline_pos - 1, true)
537        } else {
538            // Count trailing spaces with a simple backward loop.
539            let mut spaces = 0;
540            let mut j = preceding.len();
541            while j > 0 && preceding[j - 1] == SpecialChar::Space {
542                spaces += 1;
543                j -= 1;
544            }
545            if spaces >= 2 {
546                (newline_pos - spaces, true)
547            } else {
548                (newline_pos, false)
549            }
550        };
551        if let Some(text) = input.get(plain_start..trim_end)
552            && !text.is_empty()
553        {
554            buf.push(Self::Text(text));
555        }
556        buf.push(if is_hard {
557            Self::HardBreak
558        } else {
559            Self::SoftBreak
560        });
561    }
562
563    #[inline]
564    fn try_parse_emphasis<const MAX_DEPTH: u8, const CAP: usize>(
565        input: &'src str,
566        bytes: &[u8],
567        i: usize,
568        b: u8,
569        emph: &mut EmphasisState,
570        pool: &mut Vec<Self>,
571        depth: u8,
572    ) -> Option<(Self, usize)> {
573        let is_star = b == SpecialChar::Asterisk;
574        if !is_star && b != SpecialChar::Underscore {
575            return None;
576        }
577        // Depth limit: treat as plain text to prevent stack overflow.
578        if depth >= MAX_DEPTH {
579            return None;
580        }
581        let avail = emph.avail_mut(is_star);
582
583        // Bold: ** or __
584        if avail.can_bold() && bytes.get(i + 1) == Some(&b) {
585            if let Some((inner, end)) = Self::try_parse_delimited(input, bytes, i, b, 2) {
586                let span = Self::parse_inner::<MAX_DEPTH, CAP>(inner, pool, depth + 1);
587                return Some((Self::Bold(span), end));
588            }
589            avail.bold_failed();
590        }
591
592        // Italic: * or _
593        if avail.can_italic() {
594            if let Some((inner, end)) = Self::try_parse_delimited(input, bytes, i, b, 1) {
595                let span = Self::parse_inner::<MAX_DEPTH, CAP>(inner, pool, depth + 1);
596                return Some((Self::Italic(span), end));
597            }
598            avail.italic_failed();
599        }
600
601        None
602    }
603
604    /// Find the position of a matching closing delimiter, handling backslash
605    /// escapes and nested pairs.
606    fn find_matching_close(
607        bytes: &[u8],
608        start: usize,
609        open: SpecialChar,
610        close: SpecialChar,
611    ) -> Option<usize> {
612        // Select pre-computed static ByteSet instead of building one each call.
613        let set = if open == SpecialChar::OpenBracket {
614            &BRACKET_CLOSE_SET
615        } else {
616            &PAREN_CLOSE_SET
617        };
618        let mut depth = 0u32;
619        let mut j = start;
620        loop {
621            let pos = find_byte_set(bytes, j, set)?;
622            let b = bytes[pos];
623            if b == SpecialChar::Backslash
624                && bytes.get(pos + 1).is_some_and(u8::is_ascii_punctuation)
625            {
626                j = pos + 2;
627                continue;
628            }
629            if b == open {
630                depth += 1;
631            } else if b == close {
632                if depth == 0 {
633                    return Some(pos);
634                }
635                depth -= 1;
636            }
637            j = pos + 1;
638        }
639    }
640
641    fn try_parse_bracket_paren(
642        input: &'src str,
643        bytes: &[u8],
644        start: usize,
645    ) -> Option<(&'src str, &'src str, Option<&'src str>, usize)> {
646        if bytes.get(start) != SpecialChar::OpenBracket {
647            return None;
648        }
649
650        let bracket_start = start + 1;
651        let bracket_end = Self::find_matching_close(
652            bytes,
653            bracket_start,
654            SpecialChar::OpenBracket,
655            SpecialChar::CloseBracket,
656        )?;
657
658        let paren_pos = bracket_end + 1;
659        if bytes.get(paren_pos) != SpecialChar::OpenParen {
660            return None;
661        }
662
663        let paren_start = paren_pos + 1;
664        let paren_end = Self::find_matching_close(
665            bytes,
666            paren_start,
667            SpecialChar::OpenParen,
668            SpecialChar::CloseParen,
669        )?;
670
671        let paren_content = input.get(paren_start..paren_end)?;
672        let (url, title) = Self::split_url_title(paren_content);
673
674        Some((
675            input.get(bracket_start..bracket_end)?,
676            url,
677            title,
678            paren_end + 1,
679        ))
680    }
681
682    /// Split the content inside `(...)` into a URL and optional title
683    /// (`CommonMark` §6.3).
684    ///
685    /// Titles are delimited by `"..."`, `'...'`, or `(...)`.
686    ///
687    /// We scan **backwards** because the title, if present, is always at the
688    /// end. The algorithm:
689    ///  1. Check the last byte for a closing title delimiter (`"`, `'`, `)`).
690    ///  2. Walk backwards to find the matching opener.
691    ///  3. The opener must be preceded by whitespace — this separates the URL
692    ///     from the title. If no whitespace is found, there is no title.
693    ///  4. For **paired** delimiters (`(…)`), if the first candidate opener
694    ///     lacks preceding whitespace we keep scanning for an earlier `(`
695    ///     that does. For **same-char** delimiters (`"…"`, `'…'`), the first
696    ///     match is the only candidate (no nesting possible).
697    fn split_url_title(content: &'src str) -> (&'src str, Option<&'src str>) {
698        let trimmed = content.trim();
699        // A valid title needs at minimum: url, space, open+close quotes (e.g. `u "t"`).
700        // With fewer than 3 bytes the backward scan would underflow.
701        if trimmed.len() < 3 {
702            return (trimmed, None);
703        }
704
705        let bytes = trimmed.as_bytes();
706        let last = bytes[bytes.len() - 1];
707        let (open, close) = match SpecialChar::from_byte(last) {
708            Some(SpecialChar::DoubleQuote) => (SpecialChar::DoubleQuote, SpecialChar::DoubleQuote),
709            Some(SpecialChar::SingleQuote) => (SpecialChar::SingleQuote, SpecialChar::SingleQuote),
710            Some(SpecialChar::CloseParen) => (SpecialChar::OpenParen, SpecialChar::CloseParen),
711            // No trailing title delimiter — the entire content is the URL.
712            _ => return (trimmed, None),
713        };
714
715        // Scan backwards for the matching opening delimiter.
716        let mut j = bytes.len() - 2;
717        loop {
718            if bytes[j] == open {
719                // Whitespace before the opener separates URL from title.
720                if j > 0 && bytes[j - 1].is_ascii_whitespace() {
721                    let url = trimmed.get(..j).unwrap_or(trimmed).trim_end();
722                    let title = trimmed.get(j + 1..bytes.len() - 1).unwrap_or("");
723                    return (url, Some(title));
724                }
725                // For paired delimiters (open != close), keep scanning for an
726                // earlier opener that *does* have preceding whitespace.
727                if open != close {
728                    if j == 0 {
729                        break;
730                    }
731                    j -= 1;
732                    continue;
733                }
734                // Same-char delimiter: first match is the only candidate.
735                break;
736            }
737            if j == 0 {
738                break;
739            }
740            j -= 1;
741        }
742
743        // No valid title found — treat entire content as URL.
744        (trimmed, None)
745    }
746
747    #[inline]
748    /// Classify the character before a position for flanking delimiter rules.
749    /// Returns `CharClass::Whitespace` at start-of-input (treated as if preceded by newline).
750    fn char_class_before(bytes: &[u8], pos: usize) -> CharClass {
751        if pos == 0 {
752            return CharClass::Whitespace;
753        }
754        let b = bytes[pos - 1];
755        // Fast path: ASCII bytes need no UTF-8 decoding.
756        if b < 0x80 {
757            return CharClass::of_ascii(b);
758        }
759        // Walk back to find UTF-8 codepoint start.
760        let mut start = pos - 1;
761        while start > 0 && bytes[start] & 0xC0 == 0x80 {
762            start -= 1;
763        }
764        let ch = std::str::from_utf8(&bytes[start..pos])
765            .ok()
766            .and_then(|s| s.chars().next())
767            .unwrap_or(' ');
768        CharClass::of(ch)
769    }
770
771    #[inline]
772    /// Classify the character after a position for flanking delimiter rules.
773    /// Returns `CharClass::Whitespace` at end-of-input (treated as if followed by newline).
774    fn char_class_after(bytes: &[u8], pos: usize) -> CharClass {
775        if pos >= bytes.len() {
776            return CharClass::Whitespace;
777        }
778        let b = bytes[pos];
779        // Fast path: ASCII bytes need no UTF-8 decoding.
780        if b < 0x80 {
781            return CharClass::of_ascii(b);
782        }
783        // Decode the UTF-8 codepoint starting at `pos`.
784        let ch = std::str::from_utf8(&bytes[pos..])
785            .ok()
786            .and_then(|s| s.chars().next())
787            .unwrap_or(' ');
788        CharClass::of(ch)
789    }
790
791    fn try_parse_delimited(
792        input: &'src str,
793        bytes: &[u8],
794        start: usize,
795        marker: u8,
796        count: usize,
797    ) -> Option<(&'src str, usize)> {
798        let inner_start = start + count;
799        bytes.get(inner_start)?;
800
801        let is_star = marker == SpecialChar::Asterisk;
802
803        // CommonMark §6.2 — emphasis flanking rules:
804        // A left-flanking delimiter run must not be followed by whitespace,
805        // and must not be followed by punctuation unless preceded by whitespace
806        // or punctuation. For `_`, it must also not be right-flanking (unless
807        // preceded by punctuation), preventing intra-word emphasis.
808        let before_open = Self::char_class_before(bytes, start);
809        let after_open = Self::char_class_after(bytes, inner_start);
810
811        let left_flanking = after_open != CharClass::Whitespace
812            && (after_open != CharClass::Punctuation || before_open != CharClass::Other);
813        if !left_flanking {
814            return None;
815        }
816        if !is_star {
817            // _ can open only if left-flanking AND (not right-flanking OR preceded by punctuation)
818            let right_flanking_open = before_open != CharClass::Whitespace
819                && (before_open != CharClass::Punctuation || after_open != CharClass::Other);
820            if right_flanking_open && before_open != CharClass::Punctuation {
821                return None;
822            }
823        }
824
825        // Select pre-computed static ByteSet instead of building one each call.
826        let delim_set = if is_star {
827            &STAR_DELIM_SET
828        } else {
829            &UNDER_DELIM_SET
830        };
831
832        let mut i = inner_start;
833        loop {
834            let Some(pos) = find_byte_set(bytes, i, delim_set) else {
835                break;
836            };
837            i = pos;
838            let b = bytes[i];
839
840            if b == SpecialChar::Backslash && bytes.get(i + 1).is_some_and(u8::is_ascii_punctuation)
841            {
842                i += 2;
843                continue;
844            }
845
846            if b != marker {
847                i += 1;
848                continue;
849            }
850
851            // Found a marker byte — check for a valid closing run.
852            let all_match = (1..count).all(|j| bytes.get(i + j) == Some(&marker));
853            if !all_match {
854                i += 1;
855                continue;
856            }
857
858            let close_end = i + count;
859            let before_close = Self::char_class_before(bytes, i);
860            let after_close = Self::char_class_after(bytes, close_end);
861
862            // CommonMark §6.2 — closing delimiter must be right-flanking:
863            // not preceded by whitespace, and not preceded by punctuation
864            // unless followed by whitespace or punctuation. For `_`, must
865            // also not be left-flanking (unless followed by punctuation).
866            let right_flanking = before_close != CharClass::Whitespace
867                && (before_close != CharClass::Punctuation || after_close != CharClass::Other);
868            if !right_flanking {
869                i += 1;
870                continue;
871            }
872            if !is_star {
873                // _ can close only if right-flanking AND (not left-flanking OR followed by punctuation)
874                let left_flanking_close = after_close != CharClass::Whitespace
875                    && (after_close != CharClass::Punctuation || before_close != CharClass::Other);
876                if left_flanking_close && after_close != CharClass::Punctuation {
877                    i += 1;
878                    continue;
879                }
880            }
881
882            return Some((input.get(inner_start..i)?, close_end));
883        }
884
885        None
886    }
887
888    /// Parse inline code spans (`CommonMark` §6.1).
889    /// The opening and closing backtick sequences must have the same length.
890    /// Content is taken verbatim (no backslash escaping inside code spans).
891    fn try_parse_inline_code(
892        input: &'src str,
893        bytes: &[u8],
894        start: usize,
895    ) -> Option<(&'src str, usize)> {
896        let backtick_count = count_leading_byte(&bytes[start..], SpecialChar::Backtick.byte());
897        if backtick_count == 0 {
898            return None;
899        }
900
901        let content_start = start + backtick_count;
902        let mut i = content_start;
903        while i < bytes.len() {
904            // SIMD-accelerated backtick scan.
905            i = find_byte(bytes, i, SpecialChar::Backtick.byte())?;
906
907            // Count consecutive backticks
908            let close_count = count_leading_byte(&bytes[i..], SpecialChar::Backtick.byte());
909
910            if close_count == backtick_count {
911                // CommonMark §6.1: strip one leading and one trailing space
912                // when the content both starts and ends with a space.
913                let mut cs = content_start;
914                let mut ce = i;
915                if ce - cs >= 2
916                    && bytes.get(cs) == SpecialChar::Space
917                    && bytes.get(ce - 1) == SpecialChar::Space
918                {
919                    cs += 1;
920                    ce -= 1;
921                }
922                return Some((input.get(cs..ce)?, i + close_count));
923            }
924            i += close_count;
925        }
926
927        None
928    }
929}
marki_parse/inline.rs

marki_parse/
inline.rs