Skip to main content

citum_engine/processor/document/
pipeline.rs

1/*
2SPDX-License-Identifier: MIT OR Apache-2.0
3SPDX-FileCopyrightText: © 2023-2026 Bruce D'Arcus and Citum contributors
4*/
5
6//! High-level document-processing orchestration.
7
8use super::output::{
9    HtmlPlaceholderRegistry, RenderedDocumentBody, append_document_bibliography,
10    bibliography_block_placeholder, render_document_bibliography_block_replacement,
11    rewrite_document_markup_for_typst, stage_document_bibliography_blocks,
12};
13use super::{BibliographyBlock, CitationParser, DocumentFormat, ParsedDocument};
14use crate::processor::Processor;
15
16/// Format a bibliography section heading for the trailing-bibliography path.
17///
18/// Produces format-specific H2-level heading markup. HTML content in the
19/// trailing-bibliography path is inserted after `finalize_html_output`, so
20/// headings must already be HTML — hence the explicit `DocumentFormat::Html`
21/// arm, which emits a pre-escaped `<h2>` element. The heading text is sourced
22/// from YAML/JSON, so `&`, `<`, and `>` are escaped to prevent markup injection.
23fn render_bibliography_section_heading(heading: &str, format: DocumentFormat) -> String {
24    match format {
25        DocumentFormat::Html => {
26            let escaped = heading
27                .replace('&', "&amp;")
28                .replace('<', "&lt;")
29                .replace('>', "&gt;");
30            format!("<h2>{escaped}</h2>\n\n")
31        }
32        DocumentFormat::Latex => format!("\\subsection*{{{heading}}}\n\n"),
33        DocumentFormat::Typst => format!("== {heading}\n\n"),
34        _ => format!("## {heading}\n\n"),
35    }
36}
37
38impl Processor {
39    /// Process citations in a document and append a bibliography.
40    ///
41    /// This is the primary document-level entry point. It:
42    /// 1. Parses the source document using the provided adapter.
43    /// 2. Resolves frontmatter overrides (integral-name policy, bibliography options).
44    /// 3. Chooses a bibliography orchestration path based on frontmatter and document blocks.
45    #[allow(
46        clippy::string_slice,
47        reason = "parser-guaranteed boundaries and indices"
48    )]
49    pub fn process_document<P, F>(
50        &self,
51        content: &str,
52        parser: &P,
53        format: DocumentFormat,
54    ) -> String
55    where
56        P: CitationParser,
57        F: crate::render::format::OutputFormat<Output = String>,
58    {
59        let mut parsed = parser.parse_document(content, &self.locale);
60
61        if let Some(err) = &parsed.frontmatter_error {
62            eprintln!("citum: error: frontmatter parse error: {err}");
63            std::process::exit(1);
64        }
65
66        // `options.*` fields take precedence over the legacy top-level fields.
67        let effective_integral_override = parsed
68            .frontmatter_options
69            .as_ref()
70            .and_then(|o| o.integral_name_memory.as_ref())
71            .or(parsed.frontmatter_integral_name_memory.as_ref());
72        let owned_integral =
73            self.processor_with_document_integral_name_override(effective_integral_override);
74
75        // `options.org-abbreviation-memory` takes precedence over the legacy top-level field.
76        let effective_org_override = parsed
77            .frontmatter_options
78            .as_ref()
79            .and_then(|o| o.org_abbreviation_memory.as_ref())
80            .or(parsed.frontmatter_org_abbreviation_memory.as_ref());
81        let owned_org = {
82            let base = owned_integral.as_ref().unwrap_or(self);
83            base.processor_with_document_org_abbreviation_override(effective_org_override)
84        };
85
86        // Apply bibliography overrides from the options block.
87        let owned_bib = parsed
88            .frontmatter_options
89            .as_ref()
90            .filter(|o| o.bibliography.is_some())
91            .map(|options| {
92                let base = owned_org
93                    .as_ref()
94                    .or(owned_integral.as_ref())
95                    .unwrap_or(self);
96                base.processor_with_bibliography_override(options)
97            });
98
99        let processor = owned_bib
100            .as_ref()
101            .or(owned_org.as_ref())
102            .or(owned_integral.as_ref())
103            .unwrap_or(self);
104        let body = &content[parsed.body_start..];
105        if let Some(groups) = parsed.frontmatter_groups.take() {
106            return processor.process_document_with_frontmatter_groups::<P, F>(
107                body, parsed, groups, parser, format,
108            );
109        }
110
111        if !parsed.bibliography_blocks.is_empty() {
112            return processor.process_document_with_bibliography_blocks::<P, F>(
113                body,
114                std::mem::take(&mut parsed.bibliography_blocks),
115                parser,
116                format,
117            );
118        }
119
120        processor.process_document_with_default_bibliography::<P, F>(body, parsed, parser, format)
121    }
122
123    /// Orchestrate document processing with custom frontmatter bibliography groups.
124    ///
125    /// Renders each group as an ordered section via the shared
126    /// [`Processor::render_document_bibliography_blocks`] primitive, then
127    /// appends all sections as a trailing bibliography under the standard
128    /// "Bibliography" heading. Groups with no matching entries are omitted
129    /// silently; references not matched by any group selector are not rendered
130    /// (use a catch-all group with `selector: {}` or a `not:` negation to
131    /// capture unmatched entries).
132    fn process_document_with_frontmatter_groups<P, F>(
133        &self,
134        body: &str,
135        parsed: ParsedDocument,
136        groups: Vec<citum_schema::grouping::BibliographyGroup>,
137        parser: &P,
138        format: DocumentFormat,
139    ) -> String
140    where
141        P: CitationParser,
142        F: crate::render::format::OutputFormat<Output = String>,
143    {
144        self.render_document_with_trailing_bibliography::<P, F, _>(
145            body,
146            parsed,
147            parser,
148            format,
149            |processor| {
150                let rendered_blocks =
151                    processor.render_document_bibliography_blocks::<F>(&groups, None, None);
152                let mut output = String::new();
153                for block in rendered_blocks {
154                    if block.entries.is_empty() {
155                        continue;
156                    }
157                    if !output.is_empty() {
158                        output.push_str("\n\n");
159                    }
160                    if let Some(heading) = block.heading {
161                        output.push_str(&render_bibliography_section_heading(&heading, format));
162                    }
163                    output.push_str(&block.body);
164                }
165                output
166            },
167        )
168    }
169
170    /// Orchestrate document processing with explicit bibliography blocks.
171    fn process_document_with_bibliography_blocks<P, F>(
172        &self,
173        body: &str,
174        blocks: Vec<BibliographyBlock>,
175        parser: &P,
176        format: DocumentFormat,
177    ) -> String
178    where
179        P: CitationParser,
180        F: crate::render::format::OutputFormat<Output = String>,
181    {
182        let staged = stage_document_bibliography_blocks(body, &blocks);
183        let parsed_staged = parser.parse_document(&staged, &self.locale);
184        let mut rendered = self.render_document_body::<F>(&staged, parsed_staged, format);
185        self.replace_document_bibliography_blocks::<F>(&mut rendered, &blocks, format);
186        self.finalize_document_output::<P, F>(parser, format, rendered)
187    }
188
189    /// Process a document with bibliography groups supplied by the caller.
190    ///
191    /// Unlike the fenced-div path, the caller provides an ordered slice of
192    /// [`citum_schema::grouping::BibliographyGroup`]s (e.g. from `--bibliography-blocks` on the CLI or
193    /// a session-level block list) rather than `:::bibliography{...}` markers
194    /// embedded in the document. Citations are processed exactly as in
195    /// `process_document`; the trailing bibliography is replaced by one
196    /// rendered section per supplied group using the shared
197    /// `render_document_bibliography_blocks` primitive.
198    pub fn process_document_with_caller_blocks<P, F>(
199        &self,
200        content: &str,
201        blocks: &[citum_schema::grouping::BibliographyGroup],
202        parser: &P,
203        format: DocumentFormat,
204    ) -> String
205    where
206        P: CitationParser,
207        F: crate::render::format::OutputFormat<Output = String>,
208    {
209        let parsed = parser.parse_document(content, &self.locale);
210        let body = content.get(parsed.body_start..).unwrap_or(content);
211        let mut rendered = self.render_document_body::<F>(body, parsed, format);
212        // Render ordered sectional blocks via the unified primitive.
213        let rendered_groups = self.render_document_bibliography_blocks::<F>(blocks, None, None);
214        for rendered_group in rendered_groups {
215            let section = render_document_bibliography_block_replacement(
216                rendered.placeholders.as_mut(),
217                format,
218                rendered_group.heading,
219                rendered_group.body,
220            );
221            rendered.content.push_str("\n\n");
222            rendered.content.push_str(&section);
223        }
224        self.finalize_document_output::<P, F>(parser, format, rendered)
225    }
226
227    /// Orchestrate document processing with the default trailing bibliography.
228    fn process_document_with_default_bibliography<P, F>(
229        &self,
230        body: &str,
231        parsed: ParsedDocument,
232        parser: &P,
233        format: DocumentFormat,
234    ) -> String
235    where
236        P: CitationParser,
237        F: crate::render::format::OutputFormat<Output = String>,
238    {
239        self.render_document_with_trailing_bibliography::<P, F, _>(
240            body,
241            parsed,
242            parser,
243            format,
244            |p: &super::super::Processor| {
245                p.render_document_bibliography::<F>(true, None, None)
246                    .content
247            },
248        )
249    }
250
251    /// Generic helper for rendering document body + trailing bibliography.
252    fn render_document_with_trailing_bibliography<P, F, B>(
253        &self,
254        body: &str,
255        parsed: ParsedDocument,
256        parser: &P,
257        format: DocumentFormat,
258        render_bibliography: B,
259    ) -> String
260    where
261        P: CitationParser,
262        F: crate::render::format::OutputFormat<Output = String>,
263        B: FnOnce(&Self) -> String,
264    {
265        let mut rendered = self.render_document_body::<F>(body, parsed, format);
266        let bibliography = render_bibliography(self);
267        append_document_bibliography(&mut rendered, format, bibliography);
268        self.finalize_document_output::<P, F>(parser, format, rendered)
269    }
270
271    /// Render the citation-annotated document body.
272    ///
273    /// Governs the choice between note-style and inline-style processing,
274    /// and handles placeholder registration for format finalization.
275    /// HTML and terminal formats (Typst, LaTeX) both use the placeholder path
276    /// so that body markup can be converted after citations are spliced in.
277    fn render_document_body<F>(
278        &self,
279        content: &str,
280        parsed: ParsedDocument,
281        format: DocumentFormat,
282    ) -> RenderedDocumentBody
283    where
284        F: crate::render::format::OutputFormat<Output = String>,
285    {
286        if matches!(format, DocumentFormat::Html) {
287            let mut placeholders = HtmlPlaceholderRegistry::default();
288            let content = if self.is_note_style() {
289                self.process_note_document_html(content, parsed, &mut placeholders)
290            } else {
291                self.process_inline_document_html(content, parsed, &mut placeholders)
292            };
293            return RenderedDocumentBody {
294                content,
295                placeholders: Some(placeholders),
296                trailing: None,
297            };
298        }
299
300        // Terminal formats (Typst, LaTeX) need the same placeholder flow so
301        // the body markup can be converted to the target format after citations
302        // are replaced with NUL-token placeholders. This is a converted-output
303        // path, not passthrough; passthrough is limited to Plain/Djot/Markdown.
304        if matches!(format, DocumentFormat::Typst | DocumentFormat::Latex) {
305            let mut placeholders = HtmlPlaceholderRegistry::default();
306            // Note styles still emit source footnote syntax that the terminal
307            // body renderer does not yet model, so keep that narrow legacy
308            // exception isolated from author-date terminal conversion.
309            let content = if self.is_note_style() {
310                self.process_note_document::<F>(content, parsed)
311            } else {
312                self.process_inline_document_with_placeholders::<F>(
313                    content,
314                    parsed,
315                    &mut placeholders,
316                )
317            };
318            return RenderedDocumentBody {
319                content,
320                placeholders: if self.is_note_style() {
321                    None
322                } else {
323                    Some(placeholders)
324                },
325                trailing: None,
326            };
327        }
328
329        let content = if self.is_note_style() {
330            self.process_note_document::<F>(content, parsed)
331        } else {
332            self.process_inline_document::<F>(content, parsed)
333        };
334
335        RenderedDocumentBody {
336            content,
337            placeholders: None,
338            trailing: None,
339        }
340    }
341
342    /// Splice `F`-rendered citations into document markup using NUL placeholders.
343    ///
344    /// Mirrors `process_inline_document_html` but renders citations using the
345    /// generic format `F` (e.g. Typst or LaTeX) instead of HTML. The
346    /// surrounding body markup still contains the source syntax at this point;
347    /// `finalize_document_output` converts it after placeholder substitution.
348    #[allow(
349        clippy::string_slice,
350        reason = "parser-guaranteed boundaries and indices"
351    )]
352    fn process_inline_document_with_placeholders<F>(
353        &self,
354        content: &str,
355        parsed: ParsedDocument,
356        placeholders: &mut HtmlPlaceholderRegistry,
357    ) -> String
358    where
359        F: crate::render::format::OutputFormat<Output = String>,
360    {
361        let mut result = String::new();
362        let mut last_idx = 0;
363        let normalized = self.normalize_integral_name_citations(&parsed);
364
365        for (parsed, citation) in parsed.citations.iter().zip(normalized) {
366            result.push_str(&content[last_idx..parsed.start]);
367            match self.process_citation_with_format::<F>(&citation) {
368                Ok(rendered) => result.push_str(&placeholders.push_inline(rendered)),
369                Err(_) => result.push_str(&content[parsed.start..parsed.end]),
370            }
371            last_idx = parsed.end;
372        }
373
374        result.push_str(&content[last_idx..]);
375        result
376    }
377
378    /// Splice rendered citations into document markup for non-note styles.
379    #[allow(
380        clippy::string_slice,
381        reason = "parser-guaranteed boundaries and indices"
382    )]
383    fn process_inline_document<F>(&self, content: &str, parsed: ParsedDocument) -> String
384    where
385        F: crate::render::format::OutputFormat<Output = String>,
386    {
387        let mut result = String::new();
388        let mut last_idx = 0;
389        let normalized = self.normalize_integral_name_citations(&parsed);
390
391        for (parsed, citation) in parsed.citations.iter().zip(normalized) {
392            result.push_str(&content[last_idx..parsed.start]);
393            match self.process_citation_with_format::<F>(&citation) {
394                Ok(rendered) => result.push_str(&rendered),
395                Err(_) => result.push_str(&content[parsed.start..parsed.end]),
396            }
397            last_idx = parsed.end;
398        }
399
400        result.push_str(&content[last_idx..]);
401        result
402    }
403
404    /// Splice HTML-rendered citations into document markup using placeholders.
405    #[allow(
406        clippy::string_slice,
407        reason = "parser-guaranteed boundaries and indices"
408    )]
409    fn process_inline_document_html(
410        &self,
411        content: &str,
412        parsed: ParsedDocument,
413        placeholders: &mut HtmlPlaceholderRegistry,
414    ) -> String {
415        let mut result = String::new();
416        let mut last_idx = 0;
417        let normalized = self.normalize_integral_name_citations(&parsed);
418
419        for (parsed, citation) in parsed.citations.iter().zip(normalized) {
420            result.push_str(&content[last_idx..parsed.start]);
421            match self.process_citation_with_format::<crate::render::html::Html>(&citation) {
422                Ok(rendered) => result.push_str(&placeholders.push_inline(rendered)),
423                Err(_) => result.push_str(&content[parsed.start..parsed.end]),
424            }
425            last_idx = parsed.end;
426        }
427
428        result.push_str(&content[last_idx..]);
429        result
430    }
431
432    /// Replace bibliography block placeholders with rendered content.
433    fn replace_document_bibliography_blocks<F>(
434        &self,
435        rendered: &mut RenderedDocumentBody,
436        blocks: &[BibliographyBlock],
437        format: DocumentFormat,
438    ) where
439        F: crate::render::format::OutputFormat<Output = String>,
440    {
441        let groups: Vec<_> = blocks.iter().map(|b| b.group.clone()).collect();
442        let rendered_groups = self.render_document_bibliography_blocks::<F>(&groups, None, None);
443        for (index, rendered_group) in rendered_groups.into_iter().enumerate() {
444            let placeholder = bibliography_block_placeholder(index);
445            let replacement = render_document_bibliography_block_replacement(
446                rendered.placeholders.as_mut(),
447                format,
448                rendered_group.heading,
449                rendered_group.body,
450            );
451            rendered.content = rendered.content.replace(&placeholder, &replacement);
452        }
453    }
454
455    /// Perform final document rewrites and resolve placeholders.
456    ///
457    /// For HTML: converts body markup via `finalize_html_output` then
458    /// substitutes citation placeholder tokens.
459    /// For Typst/LaTeX: converts body markup via `render_body_markup::<F>`
460    /// then substitutes citation placeholder tokens.
461    /// For other formats: returns the spliced content as-is.
462    fn finalize_document_output<P, F>(
463        &self,
464        parser: &P,
465        format: DocumentFormat,
466        rendered: RenderedDocumentBody,
467    ) -> String
468    where
469        P: CitationParser,
470        F: crate::render::format::OutputFormat<Output = String>,
471    {
472        let mut result = if let Some(placeholders) = rendered.placeholders {
473            let fmt = F::default();
474            let converted = match format {
475                DocumentFormat::Html => parser.finalize_html_output(&rendered.content),
476                DocumentFormat::Typst | DocumentFormat::Latex => {
477                    parser.render_body_markup(&rendered.content, &fmt)
478                }
479                _ => rendered.content,
480            };
481            placeholders.apply(converted)
482        } else {
483            // Passthrough path for Plain/Djot/Markdown, plus the isolated
484            // note-style Typst/LaTeX exception documented in render_document_body.
485            // Keep the heading-rewrite for Typst in case headings came from
486            // bibliography group labels rather than body markup.
487            let content = rewrite_document_markup_for_typst(rendered.content, format);
488            match format {
489                DocumentFormat::Html => parser.finalize_html_output(&content),
490                _ => content,
491            }
492        };
493        // Append any trailing content (e.g. Typst/LaTeX bibliography) that was
494        // deferred so it would not pass through the body markup converter.
495        // Trim the body's trailing whitespace first: the markup renderer may
496        // have added paragraph-separator newlines that would otherwise double
497        // the leading newlines of the bibliography heading.
498        if let Some(tail) = rendered.trailing {
499            let trimmed = result.trim_end_matches('\n');
500            result = format!("{trimmed}{tail}");
501        }
502        result
503    }
504}