tex_engine/utils/
strings.rs

1/*!
2Plain TeX parses source files as a sequence of [`u8`]s, but other engines might use bigger types, e.g. XeTeX.
3
The [`CharType`] trait allows us to abstract over the character type, and the struct [`TeXStr`]`<Char:`[`CharType`]`>`
abstracts over the string type.
6*/
7
8use std::convert::Into;
9use std::fmt::{Display, Formatter, Debug};
10use std::hash::Hash;
11use std::marker::PhantomData;
12use std::vec::IntoIter;
13use crate::tex::catcodes::{CategoryCodeScheme, STARTING_SCHEME_U8};
14use crate::utils::Ptr;
15
16
17
18/**
19Plain TeX parses source files as a sequence of [`u8`]s, but other engines might use bigger types, e.g. XeTeX.
20This trait allows us to abstract over the character type, by providing the relevant data needed to treat them
21(essentially) like [`u8`]s.
22 */
23pub trait CharType:Copy+PartialEq+Eq+Hash+Display+Debug+'static+From<u8>+Default {
24    /// The type of the array/vec/whatever of all possible characters. For [`u8`], this is `[A;256]`.
25    type Allchars<A:Default> : AllCharsTrait<Self,A>;
26
27    /// The maximum value of the character type. For [`u8`], this is `255`.
28    const MAX:Self;
29
30    /// Parses a character from a byte iterator. For [`u8`], this is just `iter.next()`.
31    fn from_u8_iter(iter:&mut IntoIter<u8>) -> Option<Self>;
32    /// Convert a `&`[`str`] into a [`TeXStr`]`<Self>`.
33    fn from_str(s:&str) -> TeXStr<Self>;
34
35    /** Whether the character is an end-of-line character.
36     *
37     * Should return:
38     * - `Some(true)`: is end of line (e.g. `\n`).
39     * - `Some(false)`: is not end of line.
40     * - `None`: might be, depending on the next character - e.g. `\r`, in which case the next
41     *    character should be checked to be `\n`.
42     */
43    fn is_eol(self) -> Option<bool>;
44
45    /** Should return:
46     * - `true`: if the pair (`self`,`next`) represents an end of line (e.g. `\r\n`).
47     * - `false`: if not; in which case `self` itself is considered to be an end of line (e.g. `\r`).
48     */
49    fn is_eol_pair(self,next:Self) -> bool;
50
51    /// The string "par" as a [`TeXStr`]`<Self>`.
52    fn par_token() -> TeXStr<Self>;
53
54    /// The string "relax" as a [`TeXStr`]`<Self>`.
55    fn relax_token() -> TeXStr<Self>;
56
57    /// The empty string as a [`TeXStr`]`<Self>`.
58    fn empty_str() -> TeXStr<Self>;
59
60    /// The starting category code scheme for this character type, see [`struct@STARTING_SCHEME_U8`].
61    fn starting_catcode_scheme() -> CategoryCodeScheme<Self>;
62
63    fn newline() -> Self;
64    fn carriage_return() -> Self;
65    fn backslash() -> Self;
66    fn zeros() -> Self::Allchars<Self>;
67    fn ident() -> Self::Allchars<Self>;
68    fn rep_field<A:Clone+Default>(a:A) -> Self::Allchars<A>;
69
70    /// How to display a [`TeXStr`]`<Self>`.
71    fn display_str(str:&TeXStr<Self>, f: &mut Formatter<'_>) -> std::fmt::Result {
72        for u in &*str.0 { write!(f,"{}",u.char_str())?; }
73        Ok(())
74    }
75    fn char_str(&self) -> String;
76    fn as_bytes(&self) -> Vec<u8>;
77
78    fn from_i64(i:i64) -> Option<Self>;
79    fn to_usize(self) -> usize;
80}
81
82thread_local! {
83    /// "par" as a [`TeXStr`]`<u8>`.
84    pub static PAR_U8: TeXStr<u8> = "par".into();
85    /// "relax" as a [`TeXStr`]`<u8>`.
86    pub static RELAX_U8: TeXStr<u8> = "relax".into();
87    /// The empty string as a [`TeXStr`]`<u8>`.
88    pub static EMPTY_U8: TeXStr<u8> = "".into();
89}
90
91impl CharType for u8 {
92    type Allchars<A:Default> = [A;256];
93    const MAX:Self=255;
94    fn from_u8_iter(iter: &mut IntoIter<u8>) -> Option<Self> { iter.next() }
95    fn newline() -> Self { b'\n' }
96    fn carriage_return() -> Self {b'\r'}
97    fn backslash() -> Self { b'\\' }
98    // #[inline(always)]
99    fn is_eol(self) -> Option<bool> {
100        match self {
101            b'\n' => Some(true),
102            b'\r' => None,
103            _ => Some(false)
104        }
105    }
106    fn from_str(s: &str) -> TeXStr<Self> {
107        TeXStr(Ptr::new(s.as_bytes().to_vec()))
108    }
109    // #[inline(always)]
110    fn is_eol_pair(self, next: Self) -> bool {
111        // invariant: self == \r
112        next == b'\n'
113    }
114    fn par_token() -> TeXStr<Self> { PAR_U8.with(|p| p.clone()) }
115    fn relax_token() -> TeXStr<Self> { RELAX_U8.with(|p| p.clone()) }
116    fn empty_str() -> TeXStr<Self> {EMPTY_U8.with(|p| p.clone()) }
117    // #[inline(always)]
118    fn starting_catcode_scheme() -> CategoryCodeScheme<Self> {
119        STARTING_SCHEME_U8.clone()
120    }
121    fn as_bytes(&self) -> Vec<u8> { vec![*self] }
122
123    // #[inline(always)]
124    fn char_str(&self) -> String {
125        match *self {
126            0 => "\\u0000".to_string(),
127            b'\n' => "\\n".to_string(),
128            b'\r' => "\\r".to_string(),
129            o if is_ascii(o) => (o as char).to_string(), //f.write_char((o).into()),
130            o => format!("\\u00{:X}",o)
131        }
132    }
133    fn zeros() -> Self::Allchars<Self> {
134        [0;256]
135    }
136    fn ident() -> Self::Allchars<Self> {
137        let mut a = [0;256];
138        for i in 0..256 { a[i] = i as u8; }
139        a
140    }
141    fn rep_field<A: Clone+Default>(a: A) -> Self::Allchars<A> {
142        [ // UTTERLY RIDICULOUS
143            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
144            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
145            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
146            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
147            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
148            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
149            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
150            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
151            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
152            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
153            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
154            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
155            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
156            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
157            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
158            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
159            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
160            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
161            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
162            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
163            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
164            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
165            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
166            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
167            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
168            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
169            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
170            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
171            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
172            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
173            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),
174            a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone(),a.clone()
175        ]
176    }
177
178    fn from_i64(i: i64) -> Option<Self> {
179        if i == -1 {Some(255)} else if i < 0 || i > 255 { None } else { Some(i as u8) }
180    }
181    fn to_usize(self) -> usize { self as usize }
182}
183
184/** A trait for arrays of all possible characters. For [`u8`], this is `[A;256]`.
185*/
186pub trait AllCharsTrait<C:CharType,A> {
187    /// Returns the value for character `u`.
188    fn get(&self, u: C) -> &A;
189    /// Sets the value for character `u` to `v`.
190    fn set(&mut self, u: C, v:A);
191    /// Replaces the value for character `u` with `v`, returning the old value.
192    fn replace(&mut self, u: C, v:A) -> A;
193}
194impl<A> AllCharsTrait<u8,A> for [A;256] {
195    //#[inline(always)]
196    fn get(&self, u:u8) -> &A { &self[u as usize] }
197   // #[inline(always)]
198    fn set(&mut self, u:u8,v:A) { self[u as usize] = v }
199    // #[inline(always)]
200    fn replace(&mut self, u: u8, v: A) -> A {
201        std::mem::replace(&mut self[u as usize], v)
202    }
203}
204
205/** A "string" in TeX is a sequence of characters of some [`CharType`]. [`TeXStr`]
206* abstracts away the character type, e.g. for control sequence names.
207*/
208#[derive(Clone,PartialEq,Hash,Eq)]
209pub struct TeXStr<C:CharType>(Ptr<Vec<C>>);
210impl<C:CharType> TeXStr<C> {
211    pub fn len(&self) -> usize { self.0.len() }
212    pub fn as_vec(&self) -> &Vec<C> { &self.0 }
213}
214
/// Whether `u` is a printable ASCII character, i.e. lies between space (32) and `~` (126)
/// inclusive. Control characters and everything above 126 are rejected.
fn is_ascii(u: u8) -> bool {
    matches!(u, b' '..=b'~')
}
217
218impl<C:CharType> Display for TeXStr<C> {
219    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
220        C::display_str(self, f)
221    }
222}
223impl From<&str> for TeXStr<u8> {
224    fn from(s: &str) -> Self {
225        TeXStr(Ptr::new(s.as_bytes().to_vec()))
226    }
227}
228impl From<String> for TeXStr<u8> {
229    fn from(s: String) -> Self {
230        TeXStr(Ptr::new(s.into_bytes()))
231    }
232}
233impl<C:CharType> From<Vec<C>> for TeXStr<C> {
234    fn from(v: Vec<C>) -> Self {
235        TeXStr(Ptr::new(v))
236    }
237}
238
239
240
241impl CharType for char {
242    const MAX: Self = 255 as char;
243    type Allchars<A: Default> = AllUnicodeChars<A>; // TODO
244    fn from_i64(i: i64) -> Option<Self> { if i > 0 && i < 0x110000 { Some(char::from_u32(i as u32).unwrap()) } else { None } }
245    fn to_usize(self) -> usize { self as usize }
246    fn from_str(s: &str) -> TeXStr<Self> {
247        TeXStr(Ptr::new(s.chars().collect()))
248    }
249    fn display_str(str: &TeXStr<Self>, f: &mut Formatter<'_>) -> std::fmt::Result {
250        let str : String = str.0.iter().collect();
251        write!(f,"{}",str)
252    }
253
254    fn backslash() -> Self { '\\' }
255    fn carriage_return() -> Self { '\r' }
256    fn newline() -> Self { '\n' }
257    fn char_str(&self) -> String { self.to_string() }
258    fn is_eol(self) -> Option<bool> {
259        match self {
260            '\n' => Some(true),
261            '\r' => None,
262            _ => Some(false)
263        }
264    }
265    fn is_eol_pair(self, next: Self) -> bool {
266        next == '\n' // self == '\r'
267    }
268    fn as_bytes(&self) -> Vec<u8> {
269        self.to_string().as_bytes().into_iter().map(|u| *u).collect()
270    }
271
272    fn ident() -> Self::Allchars<Self> {
273        todo!()
274    }
275    fn empty_str() -> TeXStr<Self> {
276        todo!()
277    }
278    fn par_token() -> TeXStr<Self> {
279        todo!()
280    }
281    fn relax_token() -> TeXStr<Self> {
282        todo!()
283    }
284    fn starting_catcode_scheme() -> CategoryCodeScheme<Self> {
285        todo!()
286    }
287    fn from_u8_iter(iter: &mut IntoIter<u8>) -> Option<Self> {
288        todo!()
289    }
290    fn rep_field<A: Clone + Default>(a: A) -> Self::Allchars<A> {
291        todo!()
292    }
293    fn zeros() -> Self::Allchars<Self> {
294        todo!()
295    }
296}
297
298pub struct AllUnicodeChars<A:Default>(PhantomData<A>);
299impl<A:Default> AllCharsTrait<char,A> for AllUnicodeChars<A> {
300    fn get(&self, u: char) -> &A {
301        todo!()
302    }
303
304    fn set(&mut self, u: char, v: A) {
305        todo!()
306    }
307
308    fn replace(&mut self, u: char, v: A) -> A {
309        todo!()
310    }
311}
312
313/*
314impl CharType for char {
315
316}
317
318#[derive(Copy,Clone,Debug,PartialEq,Eq,Hash)]
319pub struct Unicode(char);
320
321impl Default for Unicode {
322    fn default() -> Self {
323        Self(0 as char)
324    }
325}
326impl Display for Unicode {
327    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
328        write!(f,"{}",self.0)
329    }
330}
331
332impl Into<usize> for Unicode {
333    fn into(self) -> usize {
334        self.0 as usize
335    }
336}
337
338impl From<u8> for Unicode {
339    fn from(value: u8) -> Self {
340        Self(char::from_u32(value as u32).unwrap())
341    }
342}
343
344impl TryFrom<i64> for Unicode {
345    type Error = ();
346
347    fn try_from(val: i64) -> Result<Self, Self::Error> {
348        if val > 0 && val < 0x10FFFF && (val <= 0xD800 || val > 0xDFFF) {
349            Ok(Self(char::from_u32(val as u32).unwrap()))
350        } else {
351            Err(())
352        }
353    }
354}
355
356impl Into<i64> for Unicode {
357    fn into(self) -> i64 {
358        self.0 as i64
359    }
360}
361
362impl CharType for Unicode {
363    type Allchars<A:Default> = Vec<A>;
364    const MAX:Unicode = Unicode(-1 as char);
365    fn display_str(str: &TeXStr<Self>, f: &mut Formatter<'_>) -> std::fmt::Result {
366        let str : String = str.0.iter().map(|c| c.0).collect();
367        write!(f,"{}",str)
368    }
369    fn from_str(s: &str) -> TeXStr<Self> {
370        TeXStr(Ptr::new(s.chars().map(|c| Unicode(c)).collect()))
371    }
372    fn backslash() -> Self { Unicode('\\') }
373    fn carriage_return() -> Self { Unicode('\r') }
374    fn char_str(&self) -> String { self.0.to_string() }
375    fn is_eol(self) -> Option<bool> {
376        match self.0 {
377            '\r' => None,
378            '\n' => Some(true),
379            _ => Some(false)
380        }
381    }
382    fn is_eol_pair(self, next: Self) -> bool {
383        self.0 == '\r' && next.0 == '\n'
384    }
385    fn par_token() -> TeXStr<Self> {
386        todo!()//TeXStr(Ptr::new(vec![Unicode('p'),Unicode('a'),Unicode('r')]))
387    }
388    fn relax_token() -> TeXStr<Self> {
389        todo!()//TeXStr(Ptr::new(vec![Unicode('r'),Unicode('e'),Unicode('l'),Unicode('a'),Unicode('x')]))
390    }
391    fn empty_str() -> TeXStr<Self> {
392        todo!()//TeXStr(Ptr::new(vec![]))
393    }
394    fn ident() -> Self::Allchars<Self> {
395        todo!()
396    }
397
398}
399impl<A:Default> AllCharsTrait<Unicode,A> for Vec<A> {
400    fn get(&self, u: Unicode) -> &A { &self[u.0 as usize] }
401    fn set(&mut self, u: Unicode, v:A) { self[u.0 as usize] = v }
402    fn replace(&mut self, u: Unicode, v:A) -> A { std::mem::replace(&mut self[u.0 as usize],v) }
403}
404 */