mitex_spec/
lib.rs

//! Specification structure of a set of LaTeX commands.
//!
//! The specification will be passed to MiTeX for converting LaTeX code
//! correctly. For example, MiTeX Parser uses it to produce an AST that respects
//! the shape of commands.
//!
//! Note: since we need to process environments statically, users cannot
//! override the `\begin` and `\end` commands.
//!
//! See <https://github.com/mitex-rs/mitex/blob/main/docs/spec.typ> for a detailed description.
11
12use std::sync::Arc;
13
14#[cfg(feature = "serde")]
15use serde::{Deserialize, Serialize};
16
17#[cfg(feature = "rkyv")]
18use rkyv::{Archive, Deserialize as rDeser, Serialize as rSer};
19
20pub mod preludes;
21pub mod query;
22mod stream;
23pub use query::CommandSpecRepr as JsonCommandSpec;
24
25/// An item of command specification. It is either a normal _command_ or an
26/// _environment_.
27/// See [Command Syntax] for concept of _command_.
28/// See [Environment Syntax] for concept of _environment_.
29///
30/// [Command Syntax]: https://latexref.xyz/LaTeX-command-syntax.html
31/// [Environment Syntax]: https://latexref.xyz/Environment-syntax.html
32#[derive(Debug, Clone)]
33#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
34#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
35#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
36pub enum CommandSpecItem {
37    /// Specifies a TeX command
38    /// e.g. `\hat`, `\sum`, `\sqrt`
39    Cmd(CmdShape),
40    /// Specifies a TeX environment
41    /// e.g. `equation`, `matrix`
42    Env(EnvShape),
43}
44
45/// Command specification that contains a set of commands and environments.
46#[derive(Debug, Clone)]
47#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
48#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
49#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
50pub struct CommandSpecRepr {
51    /// A map from command name to command specification
52    pub commands: fxhash::FxHashMap<String, CommandSpecItem>,
53}
54
55/// Command specification that is cheap to clone
56#[derive(Debug, Clone)]
57#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
58#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
59pub struct CommandSpec(Arc<CommandSpecRepr>);
60
#[cfg(feature = "rkyv")]
impl CommandSpec {
    /// Serializes the command specification into bytes in rkyv format.
    ///
    /// # Panics
    /// Panics if rkyv fails to serialize the specification.
    pub fn to_bytes(&self) -> Vec<u8> {
        // Or you can customize your serialization for better performance
        // and compatibility with #![no_std] environments
        use rkyv::ser::{serializers::AllocSerializer, Serializer};

        let mut serializer = AllocSerializer::<0>::default();
        // A failure here is treated as a bug rather than a recoverable
        // condition, so state the broken invariant in the panic message.
        serializer
            .serialize_value(self.0.as_ref())
            .expect("rkyv failed to serialize the command specification");
        let bytes = serializer.into_serializer().into_inner();

        bytes.into_vec()
    }

    /// Deserializes the command specification from bytes in rkyv format.
    ///
    /// Only available when the `rkyv-validation` feature is enabled.
    #[cfg(feature = "rkyv-validation")]
    pub fn from_bytes(bytes: &[u8]) -> Self {
        let s = stream::BytesModuleStream::from_slice(bytes);

        Self(Arc::new(s.checkout_owned()))
    }

    /// Deserializes the command specification from bytes in rkyv format
    /// through the unchecked code path.
    ///
    /// Unlike `from_bytes`, this function does not require the
    /// `rkyv-validation` feature.
    ///
    /// # Safety
    /// The data source must be trusted and valid.
    pub unsafe fn from_bytes_unchecked(bytes: &[u8]) -> Self {
        let s = stream::BytesModuleStream::from_slice(bytes);

        Self(Arc::new(s.checkout_owned_unchecked()))
    }
}
95
96impl CommandSpec {
97    /// Create a new command specification
98    pub fn new(commands: fxhash::FxHashMap<String, CommandSpecItem>) -> Self {
99        Self(Arc::new(CommandSpecRepr { commands }))
100    }
101
102    /// Get an item by name
103    pub fn get(&self, name: &str) -> Option<&CommandSpecItem> {
104        self.0.commands.get(name)
105    }
106
107    /// Iterate all items
108    pub fn items(&self) -> impl Iterator<Item = (&str, &CommandSpecItem)> {
109        self.0.commands.iter().map(|(k, v)| (k.as_str(), v))
110    }
111
112    /// Get an item by name in kind of _command_
113    pub fn get_cmd(&self, name: &str) -> Option<&CmdShape> {
114        self.get(name).and_then(|item| {
115            if let CommandSpecItem::Cmd(item) = item {
116                Some(item)
117            } else {
118                None
119            }
120        })
121    }
122
123    /// Get an item by name in kind of _environment_
124    pub fn get_env(&self, name: &str) -> Option<&EnvShape> {
125        self.get(name).and_then(|item| {
126            if let CommandSpecItem::Env(item) = item {
127                Some(item)
128            } else {
129                None
130            }
131        })
132    }
133}
134
135/// Shape of a TeX command.
136#[derive(Debug, Clone)]
137#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
138#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
139#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
140pub struct CmdShape {
141    /// Describes how we could match the arguments of a command item.
142    pub args: ArgShape,
143    /// Makes the command alias to some Typst handler.
144    /// For exmaple, alias `\prod` to Typst's `product`
145    pub alias: Option<String>,
146}
147
148/// Shape of a TeX envionment.
149#[derive(Debug, Clone)]
150#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
151#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
152#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
153pub struct EnvShape {
154    /// Describes how we could match the arguments of an environment item.
155    pub args: ArgPattern,
156    /// Specifies how we could process items before passing them
157    /// to the Typst handler
158    pub ctx_feature: ContextFeature,
159    /// Makes the command alias to some Typst handler.
160    /// For exmaple, alias `\pmatrix` to a Typst function `pmat` in scope.
161    pub alias: Option<String>,
162}
163
/// The characters that encode each kind of argument for matching.
pub mod argument_kind {
    /// Encodes an argument that is a term (a curly group or others).
    pub const ARGUMENT_KIND_TERM: char = 't';
    /// Encodes an argument that is a bracket group.
    pub const ARGUMENT_KIND_BRACKET: char = 'b';
    /// Encodes an argument that is a parenthesis group.
    pub const ARGUMENT_KIND_PAREN: char = 'p';
}
174
/// A glob pattern stored as a shared string.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
pub struct GlobStr(pub Arc<str>);

impl From<&str> for GlobStr {
    /// Copies `s` into a new shared string.
    fn from(s: &str) -> Self {
        Self(Arc::from(s))
    }
}
#[cfg(feature = "serde")]
mod glob_str_impl {
    use super::GlobStr;
    use serde::{Deserialize, Deserializer, Serialize, Serializer};

    // A `GlobStr` is serialized as a plain string.
    impl Serialize for GlobStr {
        fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
            serializer.serialize_str(&self.0)
        }
    }

    // A `GlobStr` is deserialized from an owned string, which is then
    // converted into the shared representation.
    impl<'de> Deserialize<'de> for GlobStr {
        fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
            String::deserialize(deserializer).map(|pattern| GlobStr(pattern.into()))
        }
    }
}
204
205/// An efficient pattern used for argument matching.
206///
207/// There are four kinds of pattern. The most powerful one is
208/// [`ArgPattern::Glob`], which matches an sequence of input as arguments. Among
209/// these four kinds, [`ArgPattern::Glob`] can already match all possible inputs
210/// in our use cases. But one should specify a fixed length pattern
211/// ([`ArgPattern::FixedLenTerm`]), a range length pattern
212/// ([`ArgPattern::RangeLenTerm`]), or a greedy pattern
213/// ([`ArgPattern::Greedy`]) to achieve better performance.
214///
215/// Let us look at usage of a glob pattern by \sqrt, which is `{,b}t`.
216///
217/// - Example 1. For `\sqrt{2}{3}`, parser requires the pattern to match with an
218///   encoded string `tt`. Here, `{,b}t` matches and yields the string `t`
219///   (which corresponds to `{2}`).
220///
221/// - Example 2. For `\sqrt[1]{2}{2}`, parser requires the pattern to match with
222///   an encoded string `btt`. Here, `{,b}t` matches and yields the string `bt`
223///   (which corresponds to `[1]{2}`).
224///
225/// Kinds of item to match:
226/// - Bracket/b: []
227/// - Parenthesis/p: ()
228/// - Term/t: any remaining terms, typically {} or a single char
229///
230/// Note: any prefix of the argument pattern are matched during the parse stage,
231/// so you need to check whether it is complete in later stages.
232#[derive(Debug, Clone)]
233#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
234#[cfg_attr(feature = "serde", serde(tag = "kind"))]
235#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
236#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
237pub enum ArgPattern {
238    /// No arguments are passed, i.e. this is processed as a variable in Typst.
239    ///
240    /// E.g. `\alpha` => `$alpha$`, where `\alpha` has an argument pattern of
241    /// `None`
242    #[cfg_attr(feature = "serde", serde(rename = "none"))]
243    None,
244    /// Fixed length pattern, equivalent to repeat `{,t}` for `x` times
245    ///
246    /// E.g. `\hat x y` => `$hat(x) y$`, where `\hat` has an argument pattern of
247    /// `FixedLenTerm(1)`
248    ///
249    /// E.g. `1 \sum\limits` => `$1 limits(sum)$`, where `\limits` has an
250    /// argument pattern of `FixedLenTerm(1)`
251    #[cfg_attr(feature = "serde", serde(rename = "fixed-len"))]
252    FixedLenTerm {
253        /// The length of the arguments should be matched
254        len: u8,
255    },
256    /// Range length pattern (matches as much as possible), equivalent to
257    /// repeat `t` for `x` times, then repeat `{,t}` for `y` times.
258    ///
259    /// No example
260    #[cfg_attr(feature = "serde", serde(rename = "range-len"))]
261    RangeLenTerm {
262        /// The minimum length of the arguments should be matched
263        min: u8,
264        /// The maximum length of the arguments should be matched
265        max: u8,
266    },
267    /// Receives any items as much as possible, equivalent to `*`.
268    ///
269    /// E.g. \over, \displaystyle
270    #[cfg_attr(feature = "serde", serde(rename = "greedy"))]
271    Greedy,
272    /// The most powerful pattern, but slightly slow.
273    /// Note that the glob must accept the whole prefix of the input.
274    ///
275    /// E.g. \sqrt has a glob argument pattern of `{,b}t`
276    ///
277    /// Description of the glob pattern:
278    /// - {,b}: first, it matches a bracket option, e.g. `\sqrt[3]`
279    /// - t: it then matches a single term, e.g. `\sqrt[3]{a}` or `\sqrt{a}`
280    #[cfg_attr(feature = "serde", serde(rename = "glob"))]
281    Glob {
282        /// The glob pattern to match the arguments
283        pattern: GlobStr,
284    },
285}
286
287// struct ArgShape(ArgPattern, Direction);
288
289/// Shape of arguments with direction to match since.
290///
291/// Note: We currently only support
292/// - `Direction::Right` with any `ArgPattern`
293/// - `Direction::Left` with `ArgPattern::FixedLenTerm(1)`
294/// - `Direction::Infix` with `ArgPattern::Greedy`
295#[derive(Debug, Clone)]
296#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
297#[cfg_attr(feature = "serde", serde(tag = "kind"))]
298#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
299#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
300pub enum ArgShape {
301    /// A command that associates with the right side of items.
302    ///
303    /// E.g. `\hat`
304    #[cfg_attr(feature = "serde", serde(rename = "right"))]
305    Right {
306        /// The pattern to match the arguments
307        pattern: ArgPattern,
308    },
309    /// A command that associates with the left side of items, and with
310    /// `ArgPattern::FixedLenTerm(1)`.
311    ///
312    /// E.g. `\limits`
313    #[cfg_attr(feature = "serde", serde(rename = "left1"))]
314    Left1,
315    /// A command that associates with both side of items, and with
316    /// `ArgPattern::Greedy`, also known as infix operators.
317    ///
318    /// E.g. `\over`
319    #[cfg_attr(feature = "serde", serde(rename = "infix-greedy"))]
320    InfixGreedy,
321}
322
/// Selects how the content of an environment is processed.
#[derive(Clone, Debug)]
#[cfg_attr(
    feature = "serde",
    derive(Serialize, Deserialize),
    serde(tag = "kind")
)]
#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
pub enum ContextFeature {
    /// No special processing.
    #[cfg_attr(feature = "serde", serde(rename = "none"))]
    None,
    /// The content is parsed like math environments.
    #[cfg_attr(feature = "serde", serde(rename = "is-math"))]
    IsMath,
    /// The content is parsed like mat arguments.
    #[cfg_attr(feature = "serde", serde(rename = "is-matrix"))]
    IsMatrix,
    /// The content is parsed like cases.
    #[cfg_attr(feature = "serde", serde(rename = "is-cases"))]
    IsCases,
    /// The content is parsed like figure.
    #[cfg_attr(feature = "serde", serde(rename = "is-figure"))]
    IsFigure,
    /// The content is parsed like table.
    #[cfg_attr(feature = "serde", serde(rename = "is-table"))]
    IsTable,
    /// The content is parsed like itemize.
    #[cfg_attr(feature = "serde", serde(rename = "is-itemize"))]
    IsItemize,
    /// The content is parsed like enumerate.
    #[cfg_attr(feature = "serde", serde(rename = "is-enumerate"))]
    IsEnumerate,
}