mitex_spec/lib.rs
1//! Specification structure of a set of LaTeX commands.
2//!
//! The specification will be passed to MiTeX for converting LaTeX code
//! correctly. For example, the MiTeX parser uses it to produce an AST that
//! respects the shape of commands.
6//!
7//! Note: since we need to process environments statically, users cannot
8//! override the `\begin` and `\end` commands.
9//!
//! See <https://github.com/mitex-rs/mitex/blob/main/docs/spec.typ> for a detailed description.
11
12use std::sync::Arc;
13
14#[cfg(feature = "serde")]
15use serde::{Deserialize, Serialize};
16
17#[cfg(feature = "rkyv")]
18use rkyv::{Archive, Deserialize as rDeser, Serialize as rSer};
19
20pub mod preludes;
21pub mod query;
22mod stream;
23pub use query::CommandSpecRepr as JsonCommandSpec;
24
25/// An item of command specification. It is either a normal _command_ or an
26/// _environment_.
27/// See [Command Syntax] for concept of _command_.
28/// See [Environment Syntax] for concept of _environment_.
29///
30/// [Command Syntax]: https://latexref.xyz/LaTeX-command-syntax.html
31/// [Environment Syntax]: https://latexref.xyz/Environment-syntax.html
32#[derive(Debug, Clone)]
33#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
34#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
35#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
36pub enum CommandSpecItem {
37 /// Specifies a TeX command
38 /// e.g. `\hat`, `\sum`, `\sqrt`
39 Cmd(CmdShape),
40 /// Specifies a TeX environment
41 /// e.g. `equation`, `matrix`
42 Env(EnvShape),
43}
44
45/// Command specification that contains a set of commands and environments.
46#[derive(Debug, Clone)]
47#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
48#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
49#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
50pub struct CommandSpecRepr {
51 /// A map from command name to command specification
52 pub commands: fxhash::FxHashMap<String, CommandSpecItem>,
53}
54
55/// Command specification that is cheap to clone
56#[derive(Debug, Clone)]
57#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
58#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
59pub struct CommandSpec(Arc<CommandSpecRepr>);
60
#[cfg(feature = "rkyv")]
impl CommandSpec {
    /// Encodes this command specification as bytes in rkyv format.
    ///
    /// # Panics
    /// Panics if rkyv fails to serialize the specification.
    pub fn to_bytes(&self) -> Vec<u8> {
        // A custom serializer may be substituted here for better performance
        // or for compatibility with `#![no_std]` environments.
        use rkyv::ser::{serializers::AllocSerializer, Serializer};

        let mut ser = AllocSerializer::<0>::default();
        ser.serialize_value(self.0.as_ref()).unwrap();

        ser.into_serializer().into_inner().into_vec()
    }

    /// Decodes a command specification from bytes in rkyv format.
    #[cfg(feature = "rkyv-validation")]
    pub fn from_bytes(bytes: &[u8]) -> Self {
        let repr = stream::BytesModuleStream::from_slice(bytes).checkout_owned();

        Self(Arc::new(repr))
    }

    /// Decodes a command specification from bytes in rkyv format, going
    /// through the stream's `checkout_owned_unchecked` path.
    ///
    /// # Safety
    /// The data source must be trusted and valid.
    pub unsafe fn from_bytes_unchecked(bytes: &[u8]) -> Self {
        let repr = stream::BytesModuleStream::from_slice(bytes).checkout_owned_unchecked();

        Self(Arc::new(repr))
    }
}
95
96impl CommandSpec {
97 /// Create a new command specification
98 pub fn new(commands: fxhash::FxHashMap<String, CommandSpecItem>) -> Self {
99 Self(Arc::new(CommandSpecRepr { commands }))
100 }
101
102 /// Get an item by name
103 pub fn get(&self, name: &str) -> Option<&CommandSpecItem> {
104 self.0.commands.get(name)
105 }
106
107 /// Iterate all items
108 pub fn items(&self) -> impl Iterator<Item = (&str, &CommandSpecItem)> {
109 self.0.commands.iter().map(|(k, v)| (k.as_str(), v))
110 }
111
112 /// Get an item by name in kind of _command_
113 pub fn get_cmd(&self, name: &str) -> Option<&CmdShape> {
114 self.get(name).and_then(|item| {
115 if let CommandSpecItem::Cmd(item) = item {
116 Some(item)
117 } else {
118 None
119 }
120 })
121 }
122
123 /// Get an item by name in kind of _environment_
124 pub fn get_env(&self, name: &str) -> Option<&EnvShape> {
125 self.get(name).and_then(|item| {
126 if let CommandSpecItem::Env(item) = item {
127 Some(item)
128 } else {
129 None
130 }
131 })
132 }
133}
134
135/// Shape of a TeX command.
136#[derive(Debug, Clone)]
137#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
138#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
139#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
140pub struct CmdShape {
141 /// Describes how we could match the arguments of a command item.
142 pub args: ArgShape,
143 /// Makes the command alias to some Typst handler.
144 /// For exmaple, alias `\prod` to Typst's `product`
145 pub alias: Option<String>,
146}
147
148/// Shape of a TeX envionment.
149#[derive(Debug, Clone)]
150#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
151#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
152#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
153pub struct EnvShape {
154 /// Describes how we could match the arguments of an environment item.
155 pub args: ArgPattern,
156 /// Specifies how we could process items before passing them
157 /// to the Typst handler
158 pub ctx_feature: ContextFeature,
159 /// Makes the command alias to some Typst handler.
160 /// For exmaple, alias `\pmatrix` to a Typst function `pmat` in scope.
161 pub alias: Option<String>,
162}
163
/// Single-character codes used to encode argument kinds for matching.
pub mod argument_kind {
    /// Code of an argument given as a term (a curly group or anything else
    /// that is not one of the groups below).
    pub const ARGUMENT_KIND_TERM: char = 't';
    /// Code of an argument given as a bracket group.
    pub const ARGUMENT_KIND_BRACKET: char = 'b';
    /// Code of an argument given as a parenthesis group.
    pub const ARGUMENT_KIND_PAREN: char = 'p';
}
174
/// A glob pattern stored as a shared string, so clones share one allocation.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
pub struct GlobStr(pub Arc<str>);

impl From<&str> for GlobStr {
    /// Copies the given pattern text into a new shared string.
    fn from(pattern: &str) -> Self {
        Self(Arc::from(pattern))
    }
}
#[cfg(feature = "serde")]
mod glob_str_impl {
    //! Hand-written serde support for [`GlobStr`]: the pattern is written
    //! and read as a plain string.

    use super::GlobStr;
    use serde::{Deserialize, Deserializer, Serialize, Serializer};

    impl Serialize for GlobStr {
        /// Serializes the pattern as a string.
        fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
            serializer.serialize_str(&self.0)
        }
    }

    impl<'de> Deserialize<'de> for GlobStr {
        /// Deserializes an owned string and wraps it into a shared pattern.
        fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
            let pattern = String::deserialize(deserializer)?;
            Ok(GlobStr(pattern.into()))
        }
    }
}
204
205/// An efficient pattern used for argument matching.
206///
207/// There are four kinds of pattern. The most powerful one is
208/// [`ArgPattern::Glob`], which matches an sequence of input as arguments. Among
209/// these four kinds, [`ArgPattern::Glob`] can already match all possible inputs
210/// in our use cases. But one should specify a fixed length pattern
211/// ([`ArgPattern::FixedLenTerm`]), a range length pattern
212/// ([`ArgPattern::RangeLenTerm`]), or a greedy pattern
213/// ([`ArgPattern::Greedy`]) to achieve better performance.
214///
215/// Let us look at usage of a glob pattern by \sqrt, which is `{,b}t`.
216///
217/// - Example 1. For `\sqrt{2}{3}`, parser requires the pattern to match with an
218/// encoded string `tt`. Here, `{,b}t` matches and yields the string `t`
219/// (which corresponds to `{2}`).
220///
221/// - Example 2. For `\sqrt[1]{2}{2}`, parser requires the pattern to match with
222/// an encoded string `btt`. Here, `{,b}t` matches and yields the string `bt`
223/// (which corresponds to `[1]{2}`).
224///
225/// Kinds of item to match:
226/// - Bracket/b: []
227/// - Parenthesis/p: ()
228/// - Term/t: any remaining terms, typically {} or a single char
229///
230/// Note: any prefix of the argument pattern are matched during the parse stage,
231/// so you need to check whether it is complete in later stages.
232#[derive(Debug, Clone)]
233#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
234#[cfg_attr(feature = "serde", serde(tag = "kind"))]
235#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
236#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
237pub enum ArgPattern {
238 /// No arguments are passed, i.e. this is processed as a variable in Typst.
239 ///
240 /// E.g. `\alpha` => `$alpha$`, where `\alpha` has an argument pattern of
241 /// `None`
242 #[cfg_attr(feature = "serde", serde(rename = "none"))]
243 None,
244 /// Fixed length pattern, equivalent to repeat `{,t}` for `x` times
245 ///
246 /// E.g. `\hat x y` => `$hat(x) y$`, where `\hat` has an argument pattern of
247 /// `FixedLenTerm(1)`
248 ///
249 /// E.g. `1 \sum\limits` => `$1 limits(sum)$`, where `\limits` has an
250 /// argument pattern of `FixedLenTerm(1)`
251 #[cfg_attr(feature = "serde", serde(rename = "fixed-len"))]
252 FixedLenTerm {
253 /// The length of the arguments should be matched
254 len: u8,
255 },
256 /// Range length pattern (matches as much as possible), equivalent to
257 /// repeat `t` for `x` times, then repeat `{,t}` for `y` times.
258 ///
259 /// No example
260 #[cfg_attr(feature = "serde", serde(rename = "range-len"))]
261 RangeLenTerm {
262 /// The minimum length of the arguments should be matched
263 min: u8,
264 /// The maximum length of the arguments should be matched
265 max: u8,
266 },
267 /// Receives any items as much as possible, equivalent to `*`.
268 ///
269 /// E.g. \over, \displaystyle
270 #[cfg_attr(feature = "serde", serde(rename = "greedy"))]
271 Greedy,
272 /// The most powerful pattern, but slightly slow.
273 /// Note that the glob must accept the whole prefix of the input.
274 ///
275 /// E.g. \sqrt has a glob argument pattern of `{,b}t`
276 ///
277 /// Description of the glob pattern:
278 /// - {,b}: first, it matches a bracket option, e.g. `\sqrt[3]`
279 /// - t: it then matches a single term, e.g. `\sqrt[3]{a}` or `\sqrt{a}`
280 #[cfg_attr(feature = "serde", serde(rename = "glob"))]
281 Glob {
282 /// The glob pattern to match the arguments
283 pattern: GlobStr,
284 },
285}
286
287// struct ArgShape(ArgPattern, Direction);
288
289/// Shape of arguments with direction to match since.
290///
291/// Note: We currently only support
292/// - `Direction::Right` with any `ArgPattern`
293/// - `Direction::Left` with `ArgPattern::FixedLenTerm(1)`
294/// - `Direction::Infix` with `ArgPattern::Greedy`
295#[derive(Debug, Clone)]
296#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
297#[cfg_attr(feature = "serde", serde(tag = "kind"))]
298#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
299#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
300pub enum ArgShape {
301 /// A command that associates with the right side of items.
302 ///
303 /// E.g. `\hat`
304 #[cfg_attr(feature = "serde", serde(rename = "right"))]
305 Right {
306 /// The pattern to match the arguments
307 pattern: ArgPattern,
308 },
309 /// A command that associates with the left side of items, and with
310 /// `ArgPattern::FixedLenTerm(1)`.
311 ///
312 /// E.g. `\limits`
313 #[cfg_attr(feature = "serde", serde(rename = "left1"))]
314 Left1,
315 /// A command that associates with both side of items, and with
316 /// `ArgPattern::Greedy`, also known as infix operators.
317 ///
318 /// E.g. `\over`
319 #[cfg_attr(feature = "serde", serde(rename = "infix-greedy"))]
320 InfixGreedy,
321}
322
/// A feature that specifies how to process the content of an environment.
///
/// The enum carries no data, so it also derives `PartialEq`, `Eq` and `Hash`:
/// features can be compared with `==` and used as keys of sets and maps.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "serde", serde(tag = "kind"))]
#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
pub enum ContextFeature {
    /// No special feature
    #[cfg_attr(feature = "serde", serde(rename = "none"))]
    None,
    /// Parse content like math environments
    #[cfg_attr(feature = "serde", serde(rename = "is-math"))]
    IsMath,
    /// Parse content like matrix (`mat`) arguments
    #[cfg_attr(feature = "serde", serde(rename = "is-matrix"))]
    IsMatrix,
    /// Parse content like cases
    #[cfg_attr(feature = "serde", serde(rename = "is-cases"))]
    IsCases,
    /// Parse content like figure
    #[cfg_attr(feature = "serde", serde(rename = "is-figure"))]
    IsFigure,
    /// Parse content like table
    #[cfg_attr(feature = "serde", serde(rename = "is-table"))]
    IsTable,
    /// Parse content like itemize
    #[cfg_attr(feature = "serde", serde(rename = "is-itemize"))]
    IsItemize,
    /// Parse content like enumerate
    #[cfg_attr(feature = "serde", serde(rename = "is-enumerate"))]
    IsEnumerate,
}