Skip to main content

math_core/
lib.rs

1//! Convert LaTeX math to MathML Core.
2//!
3//! For more background on what that means and on what to do with the resulting MathML code,
4//! see the repo's README: https://github.com/tmke8/math-core
5//!
6//! # Usage
7//!
8//! The main struct of this library is [`LatexToMathML`]. In order to use the library, create an
9//! instance of this struct and then call one of the convert functions. The constructor of the
10//! struct expects a config object in the form of an instance of [`MathCoreConfig`].
11//!
12//! Basic use looks like this:
13//!
14//! ```rust
15//! use math_core::{LatexToMathML, MathCoreConfig, MathDisplay};
16//!
17//! let latex = r#"\erf ( x ) = \frac{ 2 }{ \sqrt{ \pi } } \int_0^x e^{- t^2} \, dt"#;
18//! let config = MathCoreConfig::default();
19//! let converter = LatexToMathML::new(config).unwrap();
20//! let mathml = converter.convert_with_local_counter(latex, MathDisplay::Block).unwrap();
21//! println!("{}", mathml);
22//! ```
23//!
24//! # Features
25//!
//! - `serde`: With this feature, `MathCoreConfig` implements serde's `Serialize` and `Deserialize`.
27//!
28mod atof;
29mod character_class;
30mod color_defs;
31mod commands;
32mod environments;
33mod error;
34mod html_utils;
35mod lexer;
36mod parser;
37mod predefined;
38mod specifications;
39mod text_parser;
40mod token;
41mod token_queue;
42
43use std::num::NonZeroU16;
44
45use rustc_hash::{FxBuildHasher, FxHashMap};
46#[cfg(feature = "serde")]
47use serde::{Deserialize, Serialize};
48
49use mathml_renderer::{arena::Arena, ast::Node, fmt::new_line_and_indent};
50
51pub use self::error::LatexError;
52use self::{error::LatexErrKind, lexer::Lexer, parser::Parser, token::Token};
53
/// Display mode for the LaTeX math equations.
///
/// The converter adds `display="block"` to the `<math>` tag for [`MathDisplay::Block`] and
/// leaves the attribute out for [`MathDisplay::Inline`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MathDisplay {
    /// For inline equations, like those in `$...$` in LaTeX.
    Inline,
    /// For block equations (or "display style" equations), like those in `$$...$$` in LaTeX.
    Block,
}
62
/// Configuration for pretty-printing the MathML output.
///
/// Pretty-printing means that newlines and indentation are added to the MathML output, to make
/// it easier to read.
///
/// The default is [`PrettyPrint::Never`]. With the `serde` feature, the variants are
/// (de)serialized under their kebab-case names.
#[derive(Debug, Clone, Copy, Default)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "serde", serde(rename_all = "kebab-case"))]
#[non_exhaustive]
pub enum PrettyPrint {
    /// Never pretty print.
    #[default]
    Never,
    /// Always pretty print.
    Always,
    /// Pretty print for block equations only.
    Auto,
}
80
/// Configuration object for the LaTeX to MathML conversion.
///
/// All fields have defaults (via [`Default`]): pretty-printing is [`PrettyPrint::Never`], the
/// macro list is empty and every boolean option is `false`.
///
/// # Example usage
///
/// ```rust
/// use math_core::{MathCoreConfig, PrettyPrint};
///
/// // Default values
/// let config = MathCoreConfig::default();
///
/// // Specifying pretty-print behavior
/// let config = MathCoreConfig {
///     pretty_print: PrettyPrint::Always,
///     ..Default::default()
/// };
///
/// // Specifying pretty-print behavior and custom macros
/// let macros = vec![
///     ("d".to_string(), r"\mathrm{d}".to_string()),
///     ("bb".to_string(), r"\mathbb{#1}".to_string()), // with argument
/// ];
/// let config = MathCoreConfig {
///     pretty_print: PrettyPrint::Auto,
///     macros,
///     ..Default::default()
/// };
/// ```
#[derive(Debug, Default)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "serde", serde(default, rename_all = "kebab-case"))]
pub struct MathCoreConfig {
    /// A configuration for pretty-printing the MathML output. See [`PrettyPrint`] for details.
    pub pretty_print: PrettyPrint,
    /// A list of LaTeX macros; each tuple contains (macro_name, macro_definition).
    ///
    /// A macro name is either a single character or consists only of ASCII letters; the
    /// definition may refer to arguments (e.g. `#1`).
    // NOTE(review): `tuple_vec_map` presumably (de)serializes this list as a map — confirm
    // against that crate's documentation.
    #[cfg_attr(feature = "serde", serde(with = "tuple_vec_map"))]
    pub macros: Vec<(String, String)>,
    /// If `true`, include `xmlns="http://www.w3.org/1998/Math/MathML"` in the `<math>` tag.
    pub xml_namespace: bool,
    /// If `true`, unknown commands will be rendered as red text in the output, instead of
    /// returning an error.
    pub ignore_unknown_commands: bool,
    /// If `true`, wrap the MathML output in `<semantics>` tags with an
    /// `<annotation encoding="application/x-tex">` child containing the original LaTeX source.
    pub annotation: bool,
    /// If `true`, allow rendering commands that produce MathML Core output that is unreliably
    /// rendered by browsers.
    pub allow_unreliable_rendering: bool,
}
130
/// Pre-processed, command-related part of a [`MathCoreConfig`].
///
/// The custom macros are lexed once when the converter is created; this struct stores the
/// resulting tokens together with the command-related flags.
#[derive(Debug, Default)]
struct CommandConfig {
    /// Tokens of all custom macro definitions, stored back to back in a single buffer.
    custom_cmd_tokens: Vec<Token<'static>>,
    /// Maps a macro name to `(num_args, (start, end))`, where `start..end` is the range of the
    /// macro's tokens inside `custom_cmd_tokens`.
    custom_cmd_map: FxHashMap<String, (u8, (usize, usize))>,
    /// Copied from [`MathCoreConfig::ignore_unknown_commands`].
    ignore_unknown_commands: bool,
    /// Copied from [`MathCoreConfig::allow_unreliable_rendering`].
    allow_unreliable_rendering: bool,
}
138
139impl CommandConfig {
140    pub fn get_command<'config>(&'config self, command: &str) -> Option<Token<'config>> {
141        let (num_args, slice) = *self.custom_cmd_map.get(command)?;
142        let tokens = self.custom_cmd_tokens.get(slice.0..slice.1)?;
143        Some(Token::CustomCmd(num_args, tokens))
144    }
145}
146
/// This struct contains those fields from `MathCoreConfig` that are simple flags.
///
/// These are the options that only affect how the final MathML string is assembled.
#[derive(Debug, Default)]
struct Flags {
    /// Whether (and for which display mode) newlines and indentation are emitted.
    pretty_print: PrettyPrint,
    /// Whether the `<math>` tag carries the MathML `xmlns` attribute.
    xml_namespace: bool,
    /// Whether the output is wrapped in `<semantics>` with an `<annotation>` holding the LaTeX
    /// source.
    annotation: bool,
}
154
155impl From<&MathCoreConfig> for Flags {
156    fn from(config: &MathCoreConfig) -> Self {
157        // TODO: can we use a macro here to avoid repeating the field names?
158        Self {
159            pretty_print: config.pretty_print,
160            xml_namespace: config.xml_namespace,
161            annotation: config.annotation,
162        }
163    }
164}
165
/// A converter that transforms LaTeX math equations into MathML Core.
///
/// Create an instance with [`LatexToMathML::new`] and then call one of the `convert_*` methods.
#[derive(Debug, Default)]
pub struct LatexToMathML {
    /// Output-formatting flags copied from the configuration.
    flags: Flags,
    /// This is used for numbering equations in the document.
    equation_count: u16,
    /// Label table that persists across conversions using the global counter.
    // NOTE(review): presumably maps equation labels to their equation numbers — confirm
    // against the parser, which is the only writer.
    label_map: FxHashMap<Box<str>, NonZeroU16>,
    /// Pre-lexed custom macros and command flags. `new` always stores `Some`; `None` only
    /// occurs for an instance obtained through `Default`.
    cmd_cfg: Option<CommandConfig>,
}
175
176impl LatexToMathML {
177    /// Create a new `LatexToMathML` converter with the given configuration.
178    ///
179    /// This function returns an error if the custom macros in the given configuration could not
180    /// be parsed. The error contains the parsing error, the macro index and the macro definition
181    /// that caused the error.
182    pub fn new(config: MathCoreConfig) -> Result<Self, (Box<LatexError>, usize, String)> {
183        Ok(Self {
184            flags: Flags::from(&config),
185            equation_count: 0,
186            label_map: FxHashMap::default(),
187            cmd_cfg: Some(parse_custom_commands(config)?),
188        })
189    }
190
191    /// Convert LaTeX text to MathML with a global equation counter.
192    ///
193    /// For basic usage, see the documentation of [`convert_with_local_counter`].
194    ///
195    /// This conversion function maintains state, in order to count equations correctly across
196    /// different calls to this function.
197    ///
198    /// The counter can be reset with [`reset_global_counter`].
199    pub fn convert_with_global_counter(
200        &mut self,
201        latex: &str,
202        display: MathDisplay,
203    ) -> Result<String, Box<LatexError>> {
204        convert(
205            latex,
206            display,
207            self.cmd_cfg.as_ref(),
208            &mut self.equation_count,
209            &mut self.label_map,
210            &self.flags,
211        )
212    }
213
214    /// Convert LaTeX text to MathML.
215    ///
216    /// The second argument specifies whether it is inline-equation or block-equation.
217    ///
218    /// ```rust
219    /// use math_core::{LatexToMathML, MathCoreConfig, MathDisplay};
220    ///
221    /// let latex = r#"(n + 1)! = \Gamma ( n + 1 )"#;
222    /// let config = MathCoreConfig::default();
223    /// let converter = LatexToMathML::new(config).unwrap();
224    /// let mathml = converter.convert_with_local_counter(latex, MathDisplay::Inline).unwrap();
225    /// println!("{}", mathml);
226    ///
227    /// let latex = r#"x = \frac{ - b \pm \sqrt{ b^2 - 4 a c } }{ 2 a }"#;
228    /// let mathml = converter.convert_with_local_counter(latex, MathDisplay::Block).unwrap();
229    /// println!("{}", mathml);
230    /// ```
231    ///
232    #[inline]
233    pub fn convert_with_local_counter(
234        &self,
235        latex: &str,
236        display: MathDisplay,
237    ) -> Result<String, Box<LatexError>> {
238        let mut equation_count = 0;
239        let mut label_map = FxHashMap::default();
240        convert(
241            latex,
242            display,
243            self.cmd_cfg.as_ref(),
244            &mut equation_count,
245            &mut label_map,
246            &self.flags,
247        )
248    }
249
250    /// Reset the equation counter to zero.
251    ///
252    /// This should normally be done at the beginning of a new document or section.
253    pub fn reset_global_counter(&mut self) {
254        self.equation_count = 0;
255    }
256}
257
258fn convert(
259    latex: &str,
260    display: MathDisplay,
261    cmd_cfg: Option<&CommandConfig>,
262    equation_count: &mut u16,
263    label_map: &mut FxHashMap<Box<str>, NonZeroU16>,
264    flags: &Flags,
265) -> Result<String, Box<LatexError>> {
266    let arena = Arena::new();
267    let ast = parse(latex, &arena, cmd_cfg, equation_count, label_map)?;
268
269    let mut output = String::new();
270    output.push_str("<math");
271    if flags.xml_namespace {
272        output.push_str(" xmlns=\"http://www.w3.org/1998/Math/MathML\"");
273    }
274    if matches!(display, MathDisplay::Block) {
275        output.push_str(" display=\"block\"");
276    }
277    output.push('>');
278
279    let pretty_print = matches!(flags.pretty_print, PrettyPrint::Always)
280        || (matches!(flags.pretty_print, PrettyPrint::Auto) && display == MathDisplay::Block);
281
282    let base_indent = if pretty_print { 1 } else { 0 };
283    if flags.annotation {
284        let children_indent = if pretty_print { 2 } else { 0 };
285        new_line_and_indent(&mut output, base_indent);
286        output.push_str("<semantics>");
287        let node = parser::node_vec_to_node(&arena, &ast, false);
288        let _ = node.emit(&mut output, children_indent);
289        new_line_and_indent(&mut output, children_indent);
290        output.push_str("<annotation encoding=\"application/x-tex\">");
291        html_utils::escape_html_content(&mut output, latex);
292        output.push_str("</annotation>");
293        new_line_and_indent(&mut output, base_indent);
294        output.push_str("</semantics>");
295    } else {
296        for node in ast {
297            // We ignore the result of `emit` here, because the only possible error is a formatting
298            // error when writing to the string, and that can only happen if the string's `write_str`
299            // implementation returns an error. Since `String`'s `write_str` implementation never
300            // returns an error, we can safely ignore the result of `emit`.
301            let _ = node.emit(&mut output, base_indent);
302        }
303    }
304    if pretty_print {
305        output.push('\n');
306    }
307    output.push_str("</math>");
308    Ok(output)
309}
310
311fn parse<'config, 'source, 'arena>(
312    latex: &'source str,
313    arena: &'arena Arena,
314    cmd_cfg: Option<&'config CommandConfig>,
315    equation_count: &'arena mut u16,
316    label_map: &'arena mut FxHashMap<Box<str>, NonZeroU16>,
317) -> Result<Vec<&'arena Node<'arena>>, Box<LatexError>>
318where
319    'config: 'source,
320    'source: 'arena,
321{
322    let lexer = Lexer::new(latex, false, cmd_cfg);
323    let mut p = Parser::new(lexer, arena, equation_count, label_map)?;
324    let nodes = p.parse()?;
325    Ok(nodes)
326}
327
328fn parse_custom_commands(
329    cfg: MathCoreConfig,
330) -> Result<CommandConfig, (Box<LatexError>, usize, String)> {
331    let macros = cfg.macros;
332    let mut map = FxHashMap::with_capacity_and_hasher(macros.len(), FxBuildHasher);
333    let mut tokens = Vec::new();
334    for (idx, (name, definition)) in macros.into_iter().enumerate() {
335        if !is_valid_macro_name(name.as_str()) {
336            return Err((
337                Box::new(LatexError(0..0, LatexErrKind::InvalidMacroName(name))),
338                idx,
339                definition,
340            ));
341        }
342
343        // In order to be able to return `definition` in case of an error, we need to ensure
344        // that the lexer (which borrows `definition`) is dropped before we return the error.
345        // Therefore, we put the whole lexing process into its own block.
346        let value = 'value: {
347            let mut lexer: Lexer<'static, '_> = Lexer::new(definition.as_str(), true, None);
348            let start = tokens.len();
349            loop {
350                match lexer.next_token_no_unknown_command() {
351                    Ok(tokloc) => {
352                        if matches!(tokloc.token(), Token::Eoi) {
353                            break;
354                        }
355                        tokens.push(tokloc.into_token());
356                    }
357                    Err(err) => {
358                        break 'value Err(err);
359                    }
360                }
361            }
362            let end = tokens.len();
363            let num_args = lexer.parse_cmd_args().unwrap_or(0);
364            Ok((num_args, (start, end)))
365        };
366
367        match value {
368            Err(err) => {
369                return Err((err, idx, definition));
370            }
371            Ok(v) => {
372                map.insert(name, v);
373            }
374        }
375    }
376    Ok(CommandConfig {
377        custom_cmd_tokens: tokens,
378        custom_cmd_map: map,
379        ignore_unknown_commands: cfg.ignore_unknown_commands,
380        allow_unreliable_rendering: cfg.allow_unreliable_rendering,
381    })
382}
383
/// Check whether `s` is acceptable as the name of a custom macro.
///
/// * The empty string is rejected.
/// * A name consisting of exactly one character is always accepted, whatever the character.
/// * A longer name is accepted only if every byte is an ASCII letter.
fn is_valid_macro_name(s: &str) -> bool {
    let mut rest = s.chars();
    let Some(_first) = rest.next() else {
        // No characters at all.
        return false;
    };
    let is_single_char = rest.next().is_none();
    is_single_char || s.bytes().all(|byte| byte.is_ascii_alphabetic())
}