lucid_lint/parser/
document.rs

1//! Document model produced by the parser.
2//!
3//! A [`Document`] contains an ordered list of [`Section`]s (derived from
4//! headings in Markdown). Each section contains an ordered list of
5//! [`Paragraph`]s. Each paragraph carries its sentences, computed once
6//! at construction via [`super::tokenizer::split_sentences`] — eight
7//! rhythm/syntax/lexicon/structure rules consume them, so paying the
8//! split once and sharing across rules is strictly cheaper than the
9//! previous "lazy per-rule" pattern (F103, samply 2026-04-25).
10
11use crate::parser::tokenizer::split_sentences;
12use crate::types::SourceFile;
13
14/// The parsed representation of a single input text.
15#[derive(Debug, Clone)]
16pub struct Document {
17    /// Origin of the document.
18    pub source: SourceFile,
19
20    /// Sections of the document. The first section may have no heading
21    /// (content before the first heading, or plain text input).
22    pub sections: Vec<Section>,
23
24    /// Inline-disable directives extracted from the source. Each directive
25    /// silences one rule on one target line. See [`Directive`].
26    pub directives: Vec<Directive>,
27
28    /// List items captured during parsing, with their nesting depth and line.
29    /// Empty for plain-text inputs (which have no list structure).
30    pub list_items: Vec<ListItem>,
31}
32
33impl Document {
34    /// Create a new document with no directives and no list items.
35    #[must_use]
36    pub const fn new(source: SourceFile, sections: Vec<Section>) -> Self {
37        Self {
38            source,
39            sections,
40            directives: Vec::new(),
41            list_items: Vec::new(),
42        }
43    }
44
45    /// Create a new document carrying parser-captured metadata (directives
46    /// and list items).
47    #[must_use]
48    pub const fn with_metadata(
49        source: SourceFile,
50        sections: Vec<Section>,
51        directives: Vec<Directive>,
52        list_items: Vec<ListItem>,
53    ) -> Self {
54        Self {
55            source,
56            sections,
57            directives,
58            list_items,
59        }
60    }
61
62    /// Iterate over all paragraphs across all sections, yielding each paragraph
63    /// with the title of the section it belongs to.
64    pub fn paragraphs_with_section(&self) -> impl Iterator<Item = (&Paragraph, Option<&str>)> {
65        self.sections.iter().flat_map(|section| {
66            let title = section.title.as_deref();
67            section.paragraphs.iter().map(move |p| (p, title))
68        })
69    }
70}
71
72/// A section of a document, rooted at a heading (or synthetic for pre-heading content).
73#[derive(Debug, Clone)]
74pub struct Section {
75    /// The heading text (without the leading `#` markers).
76    ///
77    /// `None` for the implicit section containing content before the first
78    /// heading, or for plain text input.
79    pub title: Option<String>,
80
81    /// Heading depth (1 for H1, 2 for H2, etc.). 0 for the synthetic pre-heading section.
82    pub depth: u32,
83
84    /// 1-based line of the heading in the source. `None` for the synthetic
85    /// pre-heading section.
86    pub heading_line: Option<u32>,
87
88    /// Paragraphs under this section.
89    pub paragraphs: Vec<Paragraph>,
90}
91
92impl Section {
93    /// Create a new section without a heading line (synthetic or plain text).
94    #[must_use]
95    pub const fn new(title: Option<String>, depth: u32, paragraphs: Vec<Paragraph>) -> Self {
96        Self {
97            title,
98            depth,
99            heading_line: None,
100            paragraphs,
101        }
102    }
103
104    /// Create a new section rooted at a heading on a specific line.
105    #[must_use]
106    pub const fn with_heading_line(
107        title: Option<String>,
108        depth: u32,
109        heading_line: u32,
110        paragraphs: Vec<Paragraph>,
111    ) -> Self {
112        Self {
113            title,
114            depth,
115            heading_line: Some(heading_line),
116            paragraphs,
117        }
118    }
119}
120
121/// A paragraph of prose.
122#[derive(Debug, Clone)]
123pub struct Paragraph {
124    /// The paragraph text with Markdown inline markup stripped.
125    pub text: String,
126
127    /// 1-based line number in the source where the paragraph starts.
128    pub start_line: u32,
129
130    /// Sentences extracted from `text` at construction time, with absolute
131    /// source positions seeded from `start_line`. Rules consume this slice
132    /// instead of re-running [`split_sentences`] per rule.
133    pub sentences: Vec<Sentence>,
134
135    /// Typed inline tree for this paragraph (F143 substrate).
136    ///
137    /// **Contract (lazy-build).** This field is **empty** when the
138    /// paragraph contained no emphasis (the common case in real
139    /// documents); rules that want to know "no spans worth modeling
140    /// here" simply check `inline.is_empty()`. When *non-empty*, the
141    /// tree faithfully mirrors [`Self::text`]: recursively flattening
142    /// it (concatenating `Text` payloads + descending into `Emphasis`
143    /// children) reproduces the visible-text string byte-for-byte.
144    /// Empty for plain-text input regardless.
145    ///
146    /// Today only [`Inline::Text`] and [`Inline::Emphasis`] are
147    /// produced — the enum is intentionally narrow until a second
148    /// rule demands more (Strong / Link / Code etc).
149    pub inline: Vec<Inline>,
150
151    /// `true` when the paragraph was extracted from a list-item span (tight
152    /// or loose). Rules that target body-prose width (e.g.
153    /// `structure.line-length-wide`) skip these because a rendered list item
154    /// wraps in a narrower column than running prose.
155    pub from_list_item: bool,
156}
157
158impl Paragraph {
159    /// Create a new body paragraph and split it into sentences.
160    #[must_use]
161    pub fn new(text: String, start_line: u32) -> Self {
162        Self::with_origin(text, start_line, false, Vec::new())
163    }
164
165    /// Create a new paragraph derived from a list-item span.
166    #[must_use]
167    pub fn from_list_item(text: String, start_line: u32) -> Self {
168        Self::with_origin(text, start_line, true, Vec::new())
169    }
170
171    /// Create a new body paragraph with a captured inline tree (F143).
172    #[must_use]
173    pub fn with_inline(text: String, start_line: u32, inline: Vec<Inline>) -> Self {
174        Self::with_origin(text, start_line, false, inline)
175    }
176
177    /// Create a new list-item paragraph with a captured inline tree (F143).
178    #[must_use]
179    pub fn from_list_item_with_inline(text: String, start_line: u32, inline: Vec<Inline>) -> Self {
180        Self::with_origin(text, start_line, true, inline)
181    }
182
183    fn with_origin(
184        text: String,
185        start_line: u32,
186        from_list_item: bool,
187        inline: Vec<Inline>,
188    ) -> Self {
189        let sentences = split_sentences(&text, start_line, 1);
190        Self {
191            text,
192            start_line,
193            sentences,
194            inline,
195            from_list_item,
196        }
197    }
198}
199
200/// A typed node in the paragraph-level inline tree (F143 substrate).
201///
202/// Captured at parse time from the Markdown event stream so rules that
203/// need *structural* inline information — emphasis-span boundaries,
204/// future strong / link / code spans — can walk a typed model instead
205/// of regex-scanning the flattened paragraph text.
206///
207/// **Variant set is intentionally narrow.** Today only [`Inline::Text`]
208/// and [`Inline::Emphasis`] are produced. Strong, Link, Code, footnotes,
209/// and task-list markers all flatten into [`Inline::Text`] until a rule
210/// actually needs them. Widen the enum *when a second rule demands it*,
211/// not preemptively.
212#[derive(Debug, Clone, PartialEq, Eq)]
213pub enum Inline {
214    /// Plain prose text. May contain spaces and authorial newlines
215    /// (from soft / hard breaks) the same way [`Paragraph::text`] does.
216    Text(String),
217    /// An italic / `*…*` / `_…_` span. Carries its own children so
218    /// nested emphasis (e.g. `*foo *bar* baz*`) round-trips faithfully.
219    Emphasis(EmphasisSpan),
220}
221
222/// An emphasis (italic) span captured during parsing (F143).
223///
224/// Position fields point at the *opening* `*` / `_` in the source, so
225/// rules emitting diagnostics on the span can land their squiggle on
226/// the visible delimiter rather than an arbitrary column inside the
227/// paragraph.
228#[derive(Debug, Clone, PartialEq, Eq)]
229pub struct EmphasisSpan {
230    /// Inline children. Recursive so nested emphasis is preserved.
231    pub children: Vec<Inline>,
232
233    /// 1-based source line of the opening delimiter.
234    pub start_line: u32,
235
236    /// 1-based source column of the opening delimiter (within `start_line`).
237    pub start_column: u32,
238}
239
/// A sentence extracted from a paragraph.
///
/// Produced by [`super::tokenizer::split_sentences`]. Paragraphs compute
/// their sentences once, at construction, and expose them through
/// [`Paragraph::sentences`], so rules share one split instead of asking
/// for a fresh one each.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Sentence {
    /// The sentence text.
    pub text: String,

    /// 1-based line where the sentence starts (approximate).
    pub line: u32,

    /// 1-based column where the sentence starts within its line
    /// (approximate).
    pub column: u32,
}
254
/// A rule-silencing directive read from the source.
///
/// There are two supported forms:
///
/// - **Line form** (v0.1): `<!-- lucid-lint disable-next-line <rule-id> -->`
///   mutes `rule_id` on the next non-blank line, so
///   `start_line == end_line`.
/// - **Block form** (v0.2, F18):
///   `<!-- lucid-lint-disable <rule-id> -->` … `<!-- lucid-lint-enable -->`
///   mutes `rule_id` on every line from the first comment to the second
///   (inclusive).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Directive {
    /// Id of the rule this directive silences.
    pub rule_id: String,

    /// First line covered by the directive (1-based, inclusive).
    pub start_line: u32,

    /// Last line covered by the directive (1-based, inclusive).
    pub end_line: u32,
}
275
/// Position of one list item, recorded while parsing.
///
/// One value is emitted per `<li>` (or ordered-list item). It stores the
/// item's 1-based nesting depth — the outermost list is depth 1 — and
/// the 1-based source line on which the item begins.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ListItem {
    /// Nesting depth, 1-based: items of the outermost list have depth 1.
    pub depth: u32,

    /// Source line on which the item begins, 1-based.
    pub line: u32,
}

impl ListItem {
    /// Build a list item from its nesting `depth` and its source `line`.
    #[must_use]
    pub const fn new(depth: u32, line: u32) -> Self {
        Self { depth, line }
    }
}
297
298impl Directive {
299    /// Create a line-form directive covering a single line.
300    #[must_use]
301    pub fn new(rule_id: impl Into<String>, target_line: u32) -> Self {
302        Self {
303            rule_id: rule_id.into(),
304            start_line: target_line,
305            end_line: target_line,
306        }
307    }
308
309    /// Create a block-form directive covering an inclusive line range.
310    #[must_use]
311    pub fn block(rule_id: impl Into<String>, start_line: u32, end_line: u32) -> Self {
312        Self {
313            rule_id: rule_id.into(),
314            start_line,
315            end_line,
316        }
317    }
318
319    /// Whether `line` falls inside this directive's range (inclusive).
320    #[must_use]
321    pub const fn covers(&self, line: u32) -> bool {
322        line >= self.start_line && line <= self.end_line
323    }
324}
325
326impl Sentence {
327    /// Create a new sentence with explicit position.
328    #[must_use]
329    pub fn new(text: impl Into<String>, line: u32, column: u32) -> Self {
330        Self {
331            text: text.into(),
332            line,
333            column,
334        }
335    }
336}
337
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an anonymous document made of one section that holds one
    /// paragraph starting on line 1.
    fn single_paragraph_doc(title: Option<&str>, depth: u32, body: &str) -> Document {
        let paragraph = Paragraph::new(body.to_string(), 1);
        let section = Section::new(title.map(ToOwned::to_owned), depth, vec![paragraph]);
        Document::new(SourceFile::Anonymous, vec![section])
    }

    #[test]
    fn paragraphs_with_section_yields_titles() {
        let doc = single_paragraph_doc(Some("Intro"), 2, "Hello.");
        let observed: Vec<(&str, Option<&str>)> = doc
            .paragraphs_with_section()
            .map(|(paragraph, title)| (paragraph.text.as_str(), title))
            .collect();
        assert_eq!(observed, vec![("Hello.", Some("Intro"))]);
    }

    #[test]
    fn paragraphs_with_section_yields_none_for_untitled_sections() {
        let doc = single_paragraph_doc(None, 0, "Body.");
        let observed: Vec<Option<&str>> = doc
            .paragraphs_with_section()
            .map(|(_, title)| title)
            .collect();
        assert_eq!(observed, vec![None]);
    }
}
lucid_lint/parser/document.rs

lucid_lint/parser/
document.rs