Skip to main content

citum_engine/processor/document/
pipeline.rs

1/*
2SPDX-License-Identifier: MIT OR Apache-2.0
3SPDX-FileCopyrightText: © 2023-2026 Bruce D'Arcus and Citum contributors
4*/
5
6//! High-level document-processing orchestration.
7
8use super::output::{
9    HtmlPlaceholderRegistry, RenderedDocumentBody, append_document_bibliography,
10    bibliography_block_placeholder, render_document_bibliography_block_replacement,
11    rewrite_document_markup_for_typst, rewrite_group_headings_for_document,
12    stage_document_bibliography_blocks,
13};
14use super::{BibliographyBlock, CitationParser, DocumentFormat, ParsedDocument};
15use crate::processor::Processor;
16
17impl Processor {
18    /// Process citations in a document and append a bibliography.
19    ///
20    /// This is the primary document-level entry point. It:
21    /// 1. Parses the source document using the provided adapter.
22    /// 2. Resolves frontmatter overrides (integral-name policy, bibliography options).
23    /// 3. Chooses a bibliography orchestration path based on frontmatter and document blocks.
24    #[allow(
25        clippy::string_slice,
26        reason = "parser-guaranteed boundaries and indices"
27    )]
28    pub fn process_document<P, F>(
29        &self,
30        content: &str,
31        parser: &P,
32        format: DocumentFormat,
33    ) -> String
34    where
35        P: CitationParser,
36        F: crate::render::format::OutputFormat<Output = String>,
37    {
38        let mut parsed = parser.parse_document(content, &self.locale);
39
40        if let Some(err) = &parsed.frontmatter_error {
41            eprintln!("citum: error: frontmatter parse error: {err}");
42            std::process::exit(1);
43        }
44
45        // `options.*` fields take precedence over the legacy top-level fields.
46        let effective_integral_override = parsed
47            .frontmatter_options
48            .as_ref()
49            .and_then(|o| o.integral_name_memory.as_ref())
50            .or(parsed.frontmatter_integral_name_memory.as_ref());
51        let owned_integral =
52            self.processor_with_document_integral_name_override(effective_integral_override);
53
54        // `options.org-abbreviation-memory` takes precedence over the legacy top-level field.
55        let effective_org_override = parsed
56            .frontmatter_options
57            .as_ref()
58            .and_then(|o| o.org_abbreviation_memory.as_ref())
59            .or(parsed.frontmatter_org_abbreviation_memory.as_ref());
60        let owned_org = {
61            let base = owned_integral.as_ref().unwrap_or(self);
62            base.processor_with_document_org_abbreviation_override(effective_org_override)
63        };
64
65        // Apply bibliography overrides from the options block.
66        let owned_bib = parsed
67            .frontmatter_options
68            .as_ref()
69            .filter(|o| o.bibliography.is_some())
70            .map(|options| {
71                let base = owned_org
72                    .as_ref()
73                    .or(owned_integral.as_ref())
74                    .unwrap_or(self);
75                base.processor_with_bibliography_override(options)
76            });
77
78        let processor = owned_bib
79            .as_ref()
80            .or(owned_org.as_ref())
81            .or(owned_integral.as_ref())
82            .unwrap_or(self);
83        let body = &content[parsed.body_start..];
84        if let Some(groups) = parsed.frontmatter_groups.take() {
85            return processor.process_document_with_frontmatter_groups::<P, F>(
86                body, parsed, groups, parser, format,
87            );
88        }
89
90        if !parsed.bibliography_blocks.is_empty() {
91            return processor.process_document_with_bibliography_blocks::<P, F>(
92                body,
93                std::mem::take(&mut parsed.bibliography_blocks),
94                parser,
95                format,
96            );
97        }
98
99        processor.process_document_with_default_bibliography::<P, F>(body, parsed, parser, format)
100    }
101
102    /// Orchestrate document processing with custom frontmatter bibliography groups.
103    fn process_document_with_frontmatter_groups<P, F>(
104        &self,
105        body: &str,
106        parsed: ParsedDocument,
107        groups: Vec<citum_schema::grouping::BibliographyGroup>,
108        parser: &P,
109        format: DocumentFormat,
110    ) -> String
111    where
112        P: CitationParser,
113        F: crate::render::format::OutputFormat<Output = String>,
114    {
115        self.render_document_with_trailing_bibliography::<P, F, _>(
116            body,
117            parsed,
118            parser,
119            format,
120            |processor| {
121                rewrite_group_headings_for_document(
122                    processor.render_document_bibliography_groups::<F>(&groups),
123                    format,
124                )
125            },
126        )
127    }
128
129    /// Orchestrate document processing with explicit bibliography blocks.
130    fn process_document_with_bibliography_blocks<P, F>(
131        &self,
132        body: &str,
133        blocks: Vec<BibliographyBlock>,
134        parser: &P,
135        format: DocumentFormat,
136    ) -> String
137    where
138        P: CitationParser,
139        F: crate::render::format::OutputFormat<Output = String>,
140    {
141        let staged = stage_document_bibliography_blocks(body, &blocks);
142        let parsed_staged = parser.parse_document(&staged, &self.locale);
143        let mut rendered = self.render_document_body::<F>(&staged, parsed_staged, format);
144        self.replace_document_bibliography_blocks::<F>(&mut rendered, &blocks, format);
145        self.finalize_document_output::<P, F>(parser, format, rendered)
146    }
147
148    /// Process a document with bibliography groups supplied by the caller.
149    ///
150    /// Unlike the fenced-div path, the caller provides an ordered slice of
151    /// [`citum_schema::grouping::BibliographyGroup`]s (e.g. from `--bibliography-blocks` on the CLI or
152    /// a session-level block list) rather than `:::bibliography{...}` markers
153    /// embedded in the document. Citations are processed exactly as in
154    /// `process_document`; the trailing bibliography is replaced by one
155    /// rendered section per supplied group using the shared
156    /// `render_document_bibliography_blocks` primitive.
157    pub fn process_document_with_caller_blocks<P, F>(
158        &self,
159        content: &str,
160        blocks: &[citum_schema::grouping::BibliographyGroup],
161        parser: &P,
162        format: DocumentFormat,
163    ) -> String
164    where
165        P: CitationParser,
166        F: crate::render::format::OutputFormat<Output = String>,
167    {
168        let parsed = parser.parse_document(content, &self.locale);
169        let body = content.get(parsed.body_start..).unwrap_or(content);
170        let mut rendered = self.render_document_body::<F>(body, parsed, format);
171        // Render ordered sectional blocks via the unified primitive.
172        let rendered_groups = self.render_document_bibliography_blocks::<F>(blocks, None, None);
173        for rendered_group in rendered_groups {
174            let section = render_document_bibliography_block_replacement(
175                rendered.placeholders.as_mut(),
176                format,
177                rendered_group.heading,
178                rendered_group.body,
179            );
180            rendered.content.push_str("\n\n");
181            rendered.content.push_str(&section);
182        }
183        self.finalize_document_output::<P, F>(parser, format, rendered)
184    }
185
186    /// Orchestrate document processing with the default trailing bibliography.
187    fn process_document_with_default_bibliography<P, F>(
188        &self,
189        body: &str,
190        parsed: ParsedDocument,
191        parser: &P,
192        format: DocumentFormat,
193    ) -> String
194    where
195        P: CitationParser,
196        F: crate::render::format::OutputFormat<Output = String>,
197    {
198        self.render_document_with_trailing_bibliography::<P, F, _>(
199            body,
200            parsed,
201            parser,
202            format,
203            super::super::Processor::render_grouped_document_bibliography_with_format::<F>,
204        )
205    }
206
207    /// Generic helper for rendering document body + trailing bibliography.
208    fn render_document_with_trailing_bibliography<P, F, B>(
209        &self,
210        body: &str,
211        parsed: ParsedDocument,
212        parser: &P,
213        format: DocumentFormat,
214        render_bibliography: B,
215    ) -> String
216    where
217        P: CitationParser,
218        F: crate::render::format::OutputFormat<Output = String>,
219        B: FnOnce(&Self) -> String,
220    {
221        let mut rendered = self.render_document_body::<F>(body, parsed, format);
222        let bibliography = render_bibliography(self);
223        append_document_bibliography(&mut rendered, format, bibliography);
224        self.finalize_document_output::<P, F>(parser, format, rendered)
225    }
226
227    /// Render the citation-annotated document body.
228    ///
229    /// Governs the choice between note-style and inline-style processing,
230    /// and handles placeholder registration for format finalization.
231    /// HTML and terminal formats (Typst, LaTeX) both use the placeholder path
232    /// so that body markup can be converted after citations are spliced in.
233    fn render_document_body<F>(
234        &self,
235        content: &str,
236        parsed: ParsedDocument,
237        format: DocumentFormat,
238    ) -> RenderedDocumentBody
239    where
240        F: crate::render::format::OutputFormat<Output = String>,
241    {
242        if matches!(format, DocumentFormat::Html) {
243            let mut placeholders = HtmlPlaceholderRegistry::default();
244            let content = if self.is_note_style() {
245                self.process_note_document_html(content, parsed, &mut placeholders)
246            } else {
247                self.process_inline_document_html(content, parsed, &mut placeholders)
248            };
249            return RenderedDocumentBody {
250                content,
251                placeholders: Some(placeholders),
252                trailing: None,
253            };
254        }
255
256        // Terminal formats (Typst, LaTeX) need the same placeholder flow so
257        // the body markup can be converted to the target format after citations
258        // are replaced with NUL-token placeholders. This is a converted-output
259        // path, not passthrough; passthrough is limited to Plain/Djot/Markdown.
260        if matches!(format, DocumentFormat::Typst | DocumentFormat::Latex) {
261            let mut placeholders = HtmlPlaceholderRegistry::default();
262            // Note styles still emit source footnote syntax that the terminal
263            // body renderer does not yet model, so keep that narrow legacy
264            // exception isolated from author-date terminal conversion.
265            let content = if self.is_note_style() {
266                self.process_note_document::<F>(content, parsed)
267            } else {
268                self.process_inline_document_with_placeholders::<F>(
269                    content,
270                    parsed,
271                    &mut placeholders,
272                )
273            };
274            return RenderedDocumentBody {
275                content,
276                placeholders: if self.is_note_style() {
277                    None
278                } else {
279                    Some(placeholders)
280                },
281                trailing: None,
282            };
283        }
284
285        let content = if self.is_note_style() {
286            self.process_note_document::<F>(content, parsed)
287        } else {
288            self.process_inline_document::<F>(content, parsed)
289        };
290
291        RenderedDocumentBody {
292            content,
293            placeholders: None,
294            trailing: None,
295        }
296    }
297
298    /// Splice `F`-rendered citations into document markup using NUL placeholders.
299    ///
300    /// Mirrors `process_inline_document_html` but renders citations using the
301    /// generic format `F` (e.g. Typst or LaTeX) instead of HTML. The
302    /// surrounding body markup still contains the source syntax at this point;
303    /// `finalize_document_output` converts it after placeholder substitution.
304    #[allow(
305        clippy::string_slice,
306        reason = "parser-guaranteed boundaries and indices"
307    )]
308    fn process_inline_document_with_placeholders<F>(
309        &self,
310        content: &str,
311        parsed: ParsedDocument,
312        placeholders: &mut HtmlPlaceholderRegistry,
313    ) -> String
314    where
315        F: crate::render::format::OutputFormat<Output = String>,
316    {
317        let mut result = String::new();
318        let mut last_idx = 0;
319        let normalized = self.normalize_integral_name_citations(&parsed);
320
321        for (parsed, citation) in parsed.citations.iter().zip(normalized) {
322            result.push_str(&content[last_idx..parsed.start]);
323            match self.process_citation_with_format::<F>(&citation) {
324                Ok(rendered) => result.push_str(&placeholders.push_inline(rendered)),
325                Err(_) => result.push_str(&content[parsed.start..parsed.end]),
326            }
327            last_idx = parsed.end;
328        }
329
330        result.push_str(&content[last_idx..]);
331        result
332    }
333
334    /// Splice rendered citations into document markup for non-note styles.
335    #[allow(
336        clippy::string_slice,
337        reason = "parser-guaranteed boundaries and indices"
338    )]
339    fn process_inline_document<F>(&self, content: &str, parsed: ParsedDocument) -> String
340    where
341        F: crate::render::format::OutputFormat<Output = String>,
342    {
343        let mut result = String::new();
344        let mut last_idx = 0;
345        let normalized = self.normalize_integral_name_citations(&parsed);
346
347        for (parsed, citation) in parsed.citations.iter().zip(normalized) {
348            result.push_str(&content[last_idx..parsed.start]);
349            match self.process_citation_with_format::<F>(&citation) {
350                Ok(rendered) => result.push_str(&rendered),
351                Err(_) => result.push_str(&content[parsed.start..parsed.end]),
352            }
353            last_idx = parsed.end;
354        }
355
356        result.push_str(&content[last_idx..]);
357        result
358    }
359
360    /// Splice HTML-rendered citations into document markup using placeholders.
361    #[allow(
362        clippy::string_slice,
363        reason = "parser-guaranteed boundaries and indices"
364    )]
365    fn process_inline_document_html(
366        &self,
367        content: &str,
368        parsed: ParsedDocument,
369        placeholders: &mut HtmlPlaceholderRegistry,
370    ) -> String {
371        let mut result = String::new();
372        let mut last_idx = 0;
373        let normalized = self.normalize_integral_name_citations(&parsed);
374
375        for (parsed, citation) in parsed.citations.iter().zip(normalized) {
376            result.push_str(&content[last_idx..parsed.start]);
377            match self.process_citation_with_format::<crate::render::html::Html>(&citation) {
378                Ok(rendered) => result.push_str(&placeholders.push_inline(rendered)),
379                Err(_) => result.push_str(&content[parsed.start..parsed.end]),
380            }
381            last_idx = parsed.end;
382        }
383
384        result.push_str(&content[last_idx..]);
385        result
386    }
387
388    /// Replace bibliography block placeholders with rendered content.
389    fn replace_document_bibliography_blocks<F>(
390        &self,
391        rendered: &mut RenderedDocumentBody,
392        blocks: &[BibliographyBlock],
393        format: DocumentFormat,
394    ) where
395        F: crate::render::format::OutputFormat<Output = String>,
396    {
397        let groups: Vec<_> = blocks.iter().map(|b| b.group.clone()).collect();
398        let rendered_groups = self.render_document_bibliography_blocks::<F>(&groups, None, None);
399        for (index, rendered_group) in rendered_groups.into_iter().enumerate() {
400            let placeholder = bibliography_block_placeholder(index);
401            let replacement = render_document_bibliography_block_replacement(
402                rendered.placeholders.as_mut(),
403                format,
404                rendered_group.heading,
405                rendered_group.body,
406            );
407            rendered.content = rendered.content.replace(&placeholder, &replacement);
408        }
409    }
410
411    /// Perform final document rewrites and resolve placeholders.
412    ///
413    /// For HTML: converts body markup via `finalize_html_output` then
414    /// substitutes citation placeholder tokens.
415    /// For Typst/LaTeX: converts body markup via `render_body_markup::<F>`
416    /// then substitutes citation placeholder tokens.
417    /// For other formats: returns the spliced content as-is.
418    fn finalize_document_output<P, F>(
419        &self,
420        parser: &P,
421        format: DocumentFormat,
422        rendered: RenderedDocumentBody,
423    ) -> String
424    where
425        P: CitationParser,
426        F: crate::render::format::OutputFormat<Output = String>,
427    {
428        let mut result = if let Some(placeholders) = rendered.placeholders {
429            let fmt = F::default();
430            let converted = match format {
431                DocumentFormat::Html => parser.finalize_html_output(&rendered.content),
432                DocumentFormat::Typst | DocumentFormat::Latex => {
433                    parser.render_body_markup(&rendered.content, &fmt)
434                }
435                _ => rendered.content,
436            };
437            placeholders.apply(converted)
438        } else {
439            // Passthrough path for Plain/Djot/Markdown, plus the isolated
440            // note-style Typst/LaTeX exception documented in render_document_body.
441            // Keep the heading-rewrite for Typst in case headings came from
442            // bibliography group labels rather than body markup.
443            let content = rewrite_document_markup_for_typst(rendered.content, format);
444            match format {
445                DocumentFormat::Html => parser.finalize_html_output(&content),
446                _ => content,
447            }
448        };
449        // Append any trailing content (e.g. Typst/LaTeX bibliography) that was
450        // deferred so it would not pass through the body markup converter.
451        // Trim the body's trailing whitespace first: the markup renderer may
452        // have added paragraph-separator newlines that would otherwise double
453        // the leading newlines of the bibliography heading.
454        if let Some(tail) = rendered.trailing {
455            let trimmed = result.trim_end_matches('\n');
456            result = format!("{trimmed}{tail}");
457        }
458        result
459    }
460}