Skip to main content

citum_engine/processor/document/
pipeline.rs

1/*
2SPDX-License-Identifier: MIT OR Apache-2.0
3SPDX-FileCopyrightText: © 2023-2026 Bruce D'Arcus and Citum contributors
4*/
5
6//! High-level document-processing orchestration.
7
8use super::output::{
9    HtmlPlaceholderRegistry, RenderedDocumentBody, append_document_bibliography,
10    bibliography_block_placeholder, render_document_bibliography_block_replacement,
11    rewrite_document_markup_for_typst, rewrite_group_headings_for_document,
12    stage_document_bibliography_blocks,
13};
14use super::{BibliographyBlock, CitationParser, DocumentFormat, ParsedDocument};
15use crate::processor::Processor;
16
17impl Processor {
18    /// Process citations in a document and append a bibliography.
19    ///
20    /// This is the primary document-level entry point. It:
21    /// 1. Parses the source document using the provided adapter.
22    /// 2. Resolves frontmatter overrides (integral-name policy, bibliography options).
23    /// 3. Chooses a bibliography orchestration path based on frontmatter and document blocks.
24    #[allow(
25        clippy::string_slice,
26        reason = "parser-guaranteed boundaries and indices"
27    )]
28    pub fn process_document<P, F>(
29        &self,
30        content: &str,
31        parser: &P,
32        format: DocumentFormat,
33    ) -> String
34    where
35        P: CitationParser,
36        F: crate::render::format::OutputFormat<Output = String>,
37    {
38        let mut parsed = parser.parse_document(content, &self.locale);
39
40        if let Some(err) = &parsed.frontmatter_error {
41            eprintln!("citum: error: frontmatter parse error: {err}");
42            std::process::exit(1);
43        }
44
45        // `options.*` fields take precedence over the legacy top-level fields.
46        let effective_integral_override = parsed
47            .frontmatter_options
48            .as_ref()
49            .and_then(|o| o.integral_name_memory.as_ref())
50            .or(parsed.frontmatter_integral_name_memory.as_ref());
51        let owned_integral =
52            self.processor_with_document_integral_name_override(effective_integral_override);
53
54        // `options.org-abbreviation-memory` takes precedence over the legacy top-level field.
55        let effective_org_override = parsed
56            .frontmatter_options
57            .as_ref()
58            .and_then(|o| o.org_abbreviation_memory.as_ref())
59            .or(parsed.frontmatter_org_abbreviation_memory.as_ref());
60        let owned_org = {
61            let base = owned_integral.as_ref().unwrap_or(self);
62            base.processor_with_document_org_abbreviation_override(effective_org_override)
63        };
64
65        // Apply bibliography overrides from the options block.
66        let owned_bib = parsed
67            .frontmatter_options
68            .as_ref()
69            .filter(|o| o.bibliography.is_some())
70            .map(|options| {
71                let base = owned_org
72                    .as_ref()
73                    .or(owned_integral.as_ref())
74                    .unwrap_or(self);
75                base.processor_with_bibliography_override(options)
76            });
77
78        let processor = owned_bib
79            .as_ref()
80            .or(owned_org.as_ref())
81            .or(owned_integral.as_ref())
82            .unwrap_or(self);
83        let body = &content[parsed.body_start..];
84        if let Some(groups) = parsed.frontmatter_groups.take() {
85            return processor.process_document_with_frontmatter_groups::<P, F>(
86                body, parsed, groups, parser, format,
87            );
88        }
89
90        if !parsed.bibliography_blocks.is_empty() {
91            return processor.process_document_with_bibliography_blocks::<P, F>(
92                body,
93                std::mem::take(&mut parsed.bibliography_blocks),
94                parser,
95                format,
96            );
97        }
98
99        processor.process_document_with_default_bibliography::<P, F>(body, parsed, parser, format)
100    }
101
102    /// Orchestrate document processing with custom frontmatter bibliography groups.
103    fn process_document_with_frontmatter_groups<P, F>(
104        &self,
105        body: &str,
106        parsed: ParsedDocument,
107        groups: Vec<citum_schema::grouping::BibliographyGroup>,
108        parser: &P,
109        format: DocumentFormat,
110    ) -> String
111    where
112        P: CitationParser,
113        F: crate::render::format::OutputFormat<Output = String>,
114    {
115        self.render_document_with_trailing_bibliography::<P, F, _>(
116            body,
117            parsed,
118            parser,
119            format,
120            |processor| {
121                rewrite_group_headings_for_document(
122                    processor.render_document_bibliography_groups::<F>(&groups),
123                    format,
124                )
125            },
126        )
127    }
128
129    /// Orchestrate document processing with explicit bibliography blocks.
130    fn process_document_with_bibliography_blocks<P, F>(
131        &self,
132        body: &str,
133        blocks: Vec<BibliographyBlock>,
134        parser: &P,
135        format: DocumentFormat,
136    ) -> String
137    where
138        P: CitationParser,
139        F: crate::render::format::OutputFormat<Output = String>,
140    {
141        let staged = stage_document_bibliography_blocks(body, &blocks);
142        let parsed_staged = parser.parse_document(&staged, &self.locale);
143        let mut rendered = self.render_document_body::<F>(&staged, parsed_staged, format);
144        self.replace_document_bibliography_blocks::<F>(&mut rendered, &blocks, format);
145        self.finalize_document_output::<P, F>(parser, format, rendered)
146    }
147
148    /// Orchestrate document processing with the default trailing bibliography.
149    fn process_document_with_default_bibliography<P, F>(
150        &self,
151        body: &str,
152        parsed: ParsedDocument,
153        parser: &P,
154        format: DocumentFormat,
155    ) -> String
156    where
157        P: CitationParser,
158        F: crate::render::format::OutputFormat<Output = String>,
159    {
160        self.render_document_with_trailing_bibliography::<P, F, _>(
161            body,
162            parsed,
163            parser,
164            format,
165            super::super::Processor::render_grouped_document_bibliography_with_format::<F>,
166        )
167    }
168
169    /// Generic helper for rendering document body + trailing bibliography.
170    fn render_document_with_trailing_bibliography<P, F, B>(
171        &self,
172        body: &str,
173        parsed: ParsedDocument,
174        parser: &P,
175        format: DocumentFormat,
176        render_bibliography: B,
177    ) -> String
178    where
179        P: CitationParser,
180        F: crate::render::format::OutputFormat<Output = String>,
181        B: FnOnce(&Self) -> String,
182    {
183        let mut rendered = self.render_document_body::<F>(body, parsed, format);
184        let bibliography = render_bibliography(self);
185        append_document_bibliography(&mut rendered, format, bibliography);
186        self.finalize_document_output::<P, F>(parser, format, rendered)
187    }
188
189    /// Render the citation-annotated document body.
190    ///
191    /// Governs the choice between note-style and inline-style processing,
192    /// and handles placeholder registration for format finalization.
193    /// HTML and terminal formats (Typst, LaTeX) both use the placeholder path
194    /// so that body markup can be converted after citations are spliced in.
195    fn render_document_body<F>(
196        &self,
197        content: &str,
198        parsed: ParsedDocument,
199        format: DocumentFormat,
200    ) -> RenderedDocumentBody
201    where
202        F: crate::render::format::OutputFormat<Output = String>,
203    {
204        if matches!(format, DocumentFormat::Html) {
205            let mut placeholders = HtmlPlaceholderRegistry::default();
206            let content = if self.is_note_style() {
207                self.process_note_document_html(content, parsed, &mut placeholders)
208            } else {
209                self.process_inline_document_html(content, parsed, &mut placeholders)
210            };
211            return RenderedDocumentBody {
212                content,
213                placeholders: Some(placeholders),
214                trailing: None,
215            };
216        }
217
218        // Terminal formats (Typst, LaTeX) need the same placeholder flow so
219        // the body markup can be converted to the target format after citations
220        // are replaced with NUL-token placeholders. This is a converted-output
221        // path, not passthrough; passthrough is limited to Plain/Djot/Markdown.
222        if matches!(format, DocumentFormat::Typst | DocumentFormat::Latex) {
223            let mut placeholders = HtmlPlaceholderRegistry::default();
224            // Note styles still emit source footnote syntax that the terminal
225            // body renderer does not yet model, so keep that narrow legacy
226            // exception isolated from author-date terminal conversion.
227            let content = if self.is_note_style() {
228                self.process_note_document::<F>(content, parsed)
229            } else {
230                self.process_inline_document_with_placeholders::<F>(
231                    content,
232                    parsed,
233                    &mut placeholders,
234                )
235            };
236            return RenderedDocumentBody {
237                content,
238                placeholders: if self.is_note_style() {
239                    None
240                } else {
241                    Some(placeholders)
242                },
243                trailing: None,
244            };
245        }
246
247        let content = if self.is_note_style() {
248            self.process_note_document::<F>(content, parsed)
249        } else {
250            self.process_inline_document::<F>(content, parsed)
251        };
252
253        RenderedDocumentBody {
254            content,
255            placeholders: None,
256            trailing: None,
257        }
258    }
259
260    /// Splice `F`-rendered citations into document markup using NUL placeholders.
261    ///
262    /// Mirrors `process_inline_document_html` but renders citations using the
263    /// generic format `F` (e.g. Typst or LaTeX) instead of HTML. The
264    /// surrounding body markup still contains the source syntax at this point;
265    /// `finalize_document_output` converts it after placeholder substitution.
266    #[allow(
267        clippy::string_slice,
268        reason = "parser-guaranteed boundaries and indices"
269    )]
270    fn process_inline_document_with_placeholders<F>(
271        &self,
272        content: &str,
273        parsed: ParsedDocument,
274        placeholders: &mut HtmlPlaceholderRegistry,
275    ) -> String
276    where
277        F: crate::render::format::OutputFormat<Output = String>,
278    {
279        let mut result = String::new();
280        let mut last_idx = 0;
281        let normalized = self.normalize_integral_name_citations(&parsed);
282
283        for (parsed, citation) in parsed.citations.iter().zip(normalized) {
284            result.push_str(&content[last_idx..parsed.start]);
285            match self.process_citation_with_format::<F>(&citation) {
286                Ok(rendered) => result.push_str(&placeholders.push_inline(rendered)),
287                Err(_) => result.push_str(&content[parsed.start..parsed.end]),
288            }
289            last_idx = parsed.end;
290        }
291
292        result.push_str(&content[last_idx..]);
293        result
294    }
295
296    /// Splice rendered citations into document markup for non-note styles.
297    #[allow(
298        clippy::string_slice,
299        reason = "parser-guaranteed boundaries and indices"
300    )]
301    fn process_inline_document<F>(&self, content: &str, parsed: ParsedDocument) -> String
302    where
303        F: crate::render::format::OutputFormat<Output = String>,
304    {
305        let mut result = String::new();
306        let mut last_idx = 0;
307        let normalized = self.normalize_integral_name_citations(&parsed);
308
309        for (parsed, citation) in parsed.citations.iter().zip(normalized) {
310            result.push_str(&content[last_idx..parsed.start]);
311            match self.process_citation_with_format::<F>(&citation) {
312                Ok(rendered) => result.push_str(&rendered),
313                Err(_) => result.push_str(&content[parsed.start..parsed.end]),
314            }
315            last_idx = parsed.end;
316        }
317
318        result.push_str(&content[last_idx..]);
319        result
320    }
321
322    /// Splice HTML-rendered citations into document markup using placeholders.
323    #[allow(
324        clippy::string_slice,
325        reason = "parser-guaranteed boundaries and indices"
326    )]
327    fn process_inline_document_html(
328        &self,
329        content: &str,
330        parsed: ParsedDocument,
331        placeholders: &mut HtmlPlaceholderRegistry,
332    ) -> String {
333        let mut result = String::new();
334        let mut last_idx = 0;
335        let normalized = self.normalize_integral_name_citations(&parsed);
336
337        for (parsed, citation) in parsed.citations.iter().zip(normalized) {
338            result.push_str(&content[last_idx..parsed.start]);
339            match self.process_citation_with_format::<crate::render::html::Html>(&citation) {
340                Ok(rendered) => result.push_str(&placeholders.push_inline(rendered)),
341                Err(_) => result.push_str(&content[parsed.start..parsed.end]),
342            }
343            last_idx = parsed.end;
344        }
345
346        result.push_str(&content[last_idx..]);
347        result
348    }
349
350    /// Replace bibliography block placeholders with rendered content.
351    fn replace_document_bibliography_blocks<F>(
352        &self,
353        rendered: &mut RenderedDocumentBody,
354        blocks: &[BibliographyBlock],
355        format: DocumentFormat,
356    ) where
357        F: crate::render::format::OutputFormat<Output = String>,
358    {
359        let mut assigned = std::collections::HashSet::<String>::new();
360        for (index, block) in blocks.iter().enumerate() {
361            let placeholder = bibliography_block_placeholder(index);
362            let rendered_group =
363                self.render_document_bibliography_block::<F>(&block.group, &mut assigned);
364            let replacement = render_document_bibliography_block_replacement(
365                rendered.placeholders.as_mut(),
366                format,
367                rendered_group.heading,
368                rendered_group.body,
369            );
370            rendered.content = rendered.content.replace(&placeholder, &replacement);
371        }
372    }
373
374    /// Perform final document rewrites and resolve placeholders.
375    ///
376    /// For HTML: converts body markup via `finalize_html_output` then
377    /// substitutes citation placeholder tokens.
378    /// For Typst/LaTeX: converts body markup via `render_body_markup::<F>`
379    /// then substitutes citation placeholder tokens.
380    /// For other formats: returns the spliced content as-is.
381    fn finalize_document_output<P, F>(
382        &self,
383        parser: &P,
384        format: DocumentFormat,
385        rendered: RenderedDocumentBody,
386    ) -> String
387    where
388        P: CitationParser,
389        F: crate::render::format::OutputFormat<Output = String>,
390    {
391        let mut result = if let Some(placeholders) = rendered.placeholders {
392            let fmt = F::default();
393            let converted = match format {
394                DocumentFormat::Html => parser.finalize_html_output(&rendered.content),
395                DocumentFormat::Typst | DocumentFormat::Latex => {
396                    parser.render_body_markup(&rendered.content, &fmt)
397                }
398                _ => rendered.content,
399            };
400            placeholders.apply(converted)
401        } else {
402            // Passthrough path for Plain/Djot/Markdown, plus the isolated
403            // note-style Typst/LaTeX exception documented in render_document_body.
404            // Keep the heading-rewrite for Typst in case headings came from
405            // bibliography group labels rather than body markup.
406            let content = rewrite_document_markup_for_typst(rendered.content, format);
407            match format {
408                DocumentFormat::Html => parser.finalize_html_output(&content),
409                _ => content,
410            }
411        };
412        // Append any trailing content (e.g. Typst/LaTeX bibliography) that was
413        // deferred so it would not pass through the body markup converter.
414        // Trim the body's trailing whitespace first: the markup renderer may
415        // have added paragraph-separator newlines that would otherwise double
416        // the leading newlines of the bibliography heading.
417        if let Some(tail) = rendered.trailing {
418            let trimmed = result.trim_end_matches('\n');
419            result = format!("{trimmed}{tail}");
420        }
421        result
422    }
423}