Skip to main content

citum_engine/processor/document/
pipeline.rs

1/*
2SPDX-License-Identifier: MIT OR Apache-2.0
3SPDX-FileCopyrightText: © 2023-2026 Bruce D'Arcus and Citum contributors
4*/
5
6//! High-level document-processing orchestration.
7
8use super::output::{
9    HtmlPlaceholderRegistry, RenderedDocumentBody, append_document_bibliography,
10    bibliography_block_placeholder, render_document_bibliography_block_replacement,
11    rewrite_document_markup_for_typst, rewrite_group_headings_for_document,
12    stage_document_bibliography_blocks,
13};
14use super::{BibliographyBlock, CitationParser, DocumentFormat, ParsedDocument};
15use crate::processor::Processor;
16
17impl Processor {
18    /// Process citations in a document and append a bibliography.
19    ///
20    /// This is the primary document-level entry point. It:
21    /// 1. Parses the source document using the provided adapter.
22    /// 2. Resolves frontmatter overrides (integral-name policy, bibliography options).
23    /// 3. Chooses a bibliography orchestration path based on frontmatter and document blocks.
24    #[allow(
25        clippy::string_slice,
26        reason = "parser-guaranteed boundaries and indices"
27    )]
28    pub fn process_document<P, F>(
29        &self,
30        content: &str,
31        parser: &P,
32        format: DocumentFormat,
33    ) -> String
34    where
35        P: CitationParser,
36        F: crate::render::format::OutputFormat<Output = String>,
37    {
38        let mut parsed = parser.parse_document(content, &self.locale);
39
40        if let Some(err) = &parsed.frontmatter_error {
41            eprintln!("citum: error: frontmatter parse error: {err}");
42            std::process::exit(1);
43        }
44
45        // `options.*` fields take precedence over the legacy top-level fields.
46        let effective_integral_override = parsed
47            .frontmatter_options
48            .as_ref()
49            .and_then(|o| o.integral_name_memory.as_ref())
50            .or(parsed.frontmatter_integral_name_memory.as_ref());
51        let owned_integral =
52            self.processor_with_document_integral_name_override(effective_integral_override);
53
54        // `options.org-abbreviation-memory` takes precedence over the legacy top-level field.
55        let effective_org_override = parsed
56            .frontmatter_options
57            .as_ref()
58            .and_then(|o| o.org_abbreviation_memory.as_ref())
59            .or(parsed.frontmatter_org_abbreviation_memory.as_ref());
60        let owned_org = {
61            let base = owned_integral.as_ref().unwrap_or(self);
62            base.processor_with_document_org_abbreviation_override(effective_org_override)
63        };
64
65        // Apply bibliography overrides from the options block.
66        let owned_bib = parsed
67            .frontmatter_options
68            .as_ref()
69            .filter(|o| o.bibliography.is_some())
70            .map(|options| {
71                let base = owned_org
72                    .as_ref()
73                    .or(owned_integral.as_ref())
74                    .unwrap_or(self);
75                base.processor_with_bibliography_override(options)
76            });
77
78        let processor = owned_bib
79            .as_ref()
80            .or(owned_org.as_ref())
81            .or(owned_integral.as_ref())
82            .unwrap_or(self);
83        let body = &content[parsed.body_start..];
84        if let Some(groups) = parsed.frontmatter_groups.take() {
85            return processor.process_document_with_frontmatter_groups::<P, F>(
86                body, parsed, groups, parser, format,
87            );
88        }
89
90        if !parsed.bibliography_blocks.is_empty() {
91            return processor.process_document_with_bibliography_blocks::<P, F>(
92                body,
93                std::mem::take(&mut parsed.bibliography_blocks),
94                parser,
95                format,
96            );
97        }
98
99        processor.process_document_with_default_bibliography::<P, F>(body, parsed, parser, format)
100    }
101
102    /// Orchestrate document processing with custom frontmatter bibliography groups.
103    fn process_document_with_frontmatter_groups<P, F>(
104        &self,
105        body: &str,
106        parsed: ParsedDocument,
107        groups: Vec<citum_schema::grouping::BibliographyGroup>,
108        parser: &P,
109        format: DocumentFormat,
110    ) -> String
111    where
112        P: CitationParser,
113        F: crate::render::format::OutputFormat<Output = String>,
114    {
115        self.render_document_with_trailing_bibliography::<P, F, _>(
116            body,
117            parsed,
118            parser,
119            format,
120            |processor| {
121                rewrite_group_headings_for_document(
122                    processor.render_document_bibliography_groups::<F>(&groups),
123                    format,
124                )
125            },
126        )
127    }
128
129    /// Orchestrate document processing with explicit bibliography blocks.
130    fn process_document_with_bibliography_blocks<P, F>(
131        &self,
132        body: &str,
133        blocks: Vec<BibliographyBlock>,
134        parser: &P,
135        format: DocumentFormat,
136    ) -> String
137    where
138        P: CitationParser,
139        F: crate::render::format::OutputFormat<Output = String>,
140    {
141        let staged = stage_document_bibliography_blocks(body, &blocks);
142        let parsed_staged = parser.parse_document(&staged, &self.locale);
143        let mut rendered = self.render_document_body::<F>(&staged, parsed_staged, format);
144        self.replace_document_bibliography_blocks::<F>(&mut rendered, &blocks, format);
145        self.finalize_document_output::<P, F>(parser, format, rendered)
146    }
147
148    /// Orchestrate document processing with the default trailing bibliography.
149    fn process_document_with_default_bibliography<P, F>(
150        &self,
151        body: &str,
152        parsed: ParsedDocument,
153        parser: &P,
154        format: DocumentFormat,
155    ) -> String
156    where
157        P: CitationParser,
158        F: crate::render::format::OutputFormat<Output = String>,
159    {
160        self.render_document_with_trailing_bibliography::<P, F, _>(
161            body,
162            parsed,
163            parser,
164            format,
165            super::super::Processor::render_grouped_bibliography_with_format::<F>,
166        )
167    }
168
169    /// Generic helper for rendering document body + trailing bibliography.
170    fn render_document_with_trailing_bibliography<P, F, B>(
171        &self,
172        body: &str,
173        parsed: ParsedDocument,
174        parser: &P,
175        format: DocumentFormat,
176        render_bibliography: B,
177    ) -> String
178    where
179        P: CitationParser,
180        F: crate::render::format::OutputFormat<Output = String>,
181        B: FnOnce(&Self) -> String,
182    {
183        let mut rendered = self.render_document_body::<F>(body, parsed, format);
184        let bibliography = render_bibliography(self);
185        append_document_bibliography(&mut rendered, format, bibliography);
186        self.finalize_document_output::<P, F>(parser, format, rendered)
187    }
188
189    /// Render the citation-annotated document body.
190    ///
191    /// Governs the choice between note-style and inline-style processing,
192    /// and handles placeholder registration for format finalization.
193    /// HTML and terminal formats (Typst, LaTeX) both use the placeholder path
194    /// so that body markup can be converted after citations are spliced in.
195    fn render_document_body<F>(
196        &self,
197        content: &str,
198        parsed: ParsedDocument,
199        format: DocumentFormat,
200    ) -> RenderedDocumentBody
201    where
202        F: crate::render::format::OutputFormat<Output = String>,
203    {
204        if matches!(format, DocumentFormat::Html) {
205            let mut placeholders = HtmlPlaceholderRegistry::default();
206            let content = if self.is_note_style() {
207                self.process_note_document_html(content, parsed, &mut placeholders)
208            } else {
209                self.process_inline_document_html(content, parsed, &mut placeholders)
210            };
211            return RenderedDocumentBody {
212                content,
213                placeholders: Some(placeholders),
214                trailing: None,
215            };
216        }
217
218        // Terminal formats (Typst, LaTeX) need the same placeholder flow so
219        // the body markup can be converted to the target format after citations
220        // are replaced with NUL-token placeholders.
221        if matches!(format, DocumentFormat::Typst | DocumentFormat::Latex) {
222            let mut placeholders = HtmlPlaceholderRegistry::default();
223            // Note styles emit raw footnote syntax that the body markup
224            // converter doesn't understand; fall back to the passthrough path
225            // and let the caller handle conversion separately if needed.
226            let content = if self.is_note_style() {
227                self.process_note_document::<F>(content, parsed)
228            } else {
229                self.process_inline_document_with_placeholders::<F>(
230                    content,
231                    parsed,
232                    &mut placeholders,
233                )
234            };
235            return RenderedDocumentBody {
236                content,
237                placeholders: if self.is_note_style() {
238                    None
239                } else {
240                    Some(placeholders)
241                },
242                trailing: None,
243            };
244        }
245
246        let content = if self.is_note_style() {
247            self.process_note_document::<F>(content, parsed)
248        } else {
249            self.process_inline_document::<F>(content, parsed)
250        };
251
252        RenderedDocumentBody {
253            content,
254            placeholders: None,
255            trailing: None,
256        }
257    }
258
259    /// Splice `F`-rendered citations into document markup using NUL placeholders.
260    ///
261    /// Mirrors `process_inline_document_html` but renders citations using the
262    /// generic format `F` (e.g. Typst or LaTeX) instead of HTML. The
263    /// surrounding body markup still contains the source syntax at this point;
264    /// `finalize_document_output` converts it after placeholder substitution.
265    #[allow(
266        clippy::string_slice,
267        reason = "parser-guaranteed boundaries and indices"
268    )]
269    fn process_inline_document_with_placeholders<F>(
270        &self,
271        content: &str,
272        parsed: ParsedDocument,
273        placeholders: &mut HtmlPlaceholderRegistry,
274    ) -> String
275    where
276        F: crate::render::format::OutputFormat<Output = String>,
277    {
278        let mut result = String::new();
279        let mut last_idx = 0;
280        let normalized = self.normalize_integral_name_citations(&parsed);
281
282        for (parsed, citation) in parsed.citations.iter().zip(normalized) {
283            result.push_str(&content[last_idx..parsed.start]);
284            match self.process_citation_with_format::<F>(&citation) {
285                Ok(rendered) => result.push_str(&placeholders.push_inline(rendered)),
286                Err(_) => result.push_str(&content[parsed.start..parsed.end]),
287            }
288            last_idx = parsed.end;
289        }
290
291        result.push_str(&content[last_idx..]);
292        result
293    }
294
295    /// Splice rendered citations into document markup for non-note styles.
296    #[allow(
297        clippy::string_slice,
298        reason = "parser-guaranteed boundaries and indices"
299    )]
300    fn process_inline_document<F>(&self, content: &str, parsed: ParsedDocument) -> String
301    where
302        F: crate::render::format::OutputFormat<Output = String>,
303    {
304        let mut result = String::new();
305        let mut last_idx = 0;
306        let normalized = self.normalize_integral_name_citations(&parsed);
307
308        for (parsed, citation) in parsed.citations.iter().zip(normalized) {
309            result.push_str(&content[last_idx..parsed.start]);
310            match self.process_citation_with_format::<F>(&citation) {
311                Ok(rendered) => result.push_str(&rendered),
312                Err(_) => result.push_str(&content[parsed.start..parsed.end]),
313            }
314            last_idx = parsed.end;
315        }
316
317        result.push_str(&content[last_idx..]);
318        result
319    }
320
321    /// Splice HTML-rendered citations into document markup using placeholders.
322    #[allow(
323        clippy::string_slice,
324        reason = "parser-guaranteed boundaries and indices"
325    )]
326    fn process_inline_document_html(
327        &self,
328        content: &str,
329        parsed: ParsedDocument,
330        placeholders: &mut HtmlPlaceholderRegistry,
331    ) -> String {
332        let mut result = String::new();
333        let mut last_idx = 0;
334        let normalized = self.normalize_integral_name_citations(&parsed);
335
336        for (parsed, citation) in parsed.citations.iter().zip(normalized) {
337            result.push_str(&content[last_idx..parsed.start]);
338            match self.process_citation_with_format::<crate::render::html::Html>(&citation) {
339                Ok(rendered) => result.push_str(&placeholders.push_inline(rendered)),
340                Err(_) => result.push_str(&content[parsed.start..parsed.end]),
341            }
342            last_idx = parsed.end;
343        }
344
345        result.push_str(&content[last_idx..]);
346        result
347    }
348
349    /// Replace bibliography block placeholders with rendered content.
350    fn replace_document_bibliography_blocks<F>(
351        &self,
352        rendered: &mut RenderedDocumentBody,
353        blocks: &[BibliographyBlock],
354        format: DocumentFormat,
355    ) where
356        F: crate::render::format::OutputFormat<Output = String>,
357    {
358        for (index, block) in blocks.iter().enumerate() {
359            let placeholder = bibliography_block_placeholder(index);
360            let rendered_group = self.render_document_bibliography_block::<F>(&block.group);
361            let replacement = render_document_bibliography_block_replacement(
362                rendered.placeholders.as_mut(),
363                format,
364                rendered_group.heading,
365                rendered_group.body,
366            );
367            rendered.content = rendered.content.replace(&placeholder, &replacement);
368        }
369    }
370
371    /// Perform final document rewrites and resolve placeholders.
372    ///
373    /// For HTML: converts body markup via `finalize_html_output` then
374    /// substitutes citation placeholder tokens.
375    /// For Typst/LaTeX: converts body markup via `render_body_markup::<F>`
376    /// then substitutes citation placeholder tokens.
377    /// For other formats: returns the spliced content as-is.
378    fn finalize_document_output<P, F>(
379        &self,
380        parser: &P,
381        format: DocumentFormat,
382        rendered: RenderedDocumentBody,
383    ) -> String
384    where
385        P: CitationParser,
386        F: crate::render::format::OutputFormat<Output = String>,
387    {
388        let mut result = if let Some(placeholders) = rendered.placeholders {
389            let fmt = F::default();
390            let converted = match format {
391                DocumentFormat::Html => parser.finalize_html_output(&rendered.content),
392                DocumentFormat::Typst | DocumentFormat::Latex => {
393                    parser.render_body_markup(&rendered.content, &fmt)
394                }
395                _ => rendered.content,
396            };
397            placeholders.apply(converted)
398        } else {
399            // Passthrough path (Plain, Djot, Markdown, note-style Typst/LaTeX).
400            // Keep the heading-rewrite for Typst in case headings came from
401            // bibliography group labels rather than body markup.
402            let content = rewrite_document_markup_for_typst(rendered.content, format);
403            match format {
404                DocumentFormat::Html => parser.finalize_html_output(&content),
405                _ => content,
406            }
407        };
408        // Append any trailing content (e.g. Typst/LaTeX bibliography) that was
409        // deferred so it would not pass through the body markup converter.
410        // Trim the body's trailing whitespace first: the markup renderer may
411        // have added paragraph-separator newlines that would otherwise double
412        // the leading newlines of the bibliography heading.
413        if let Some(tail) = rendered.trailing {
414            let trimmed = result.trim_end_matches('\n');
415            result = format!("{trimmed}{tail}");
416        }
417        result
418    }
419}