tectonic_xetex_format/
cshash.rs

1// Copyright 2021 the Tectonic Project
2// Licensed under the MIT License.
3
4//! The hash table for multi-letter control sequences.
5
6use nom::{
7    multi::count,
8    number::complete::{be_i32, be_u8},
9    IResult,
10};
11use tectonic_errors::prelude::*;
12
13use crate::{
14    base::{self, SIZEOF_MEMORY_WORD},
15    engine::Engine,
16    eqtb::EqtbPointer,
17    parseutils,
18    stringtable::{StrPointer, StringTable},
19    symbols::{SymbolCategory, SymbolTable},
20};
21
22pub fn initialize_cshash_symbols(symbols: &mut SymbolTable) -> Result<()> {
23    symbols.add(SymbolCategory::CsHash, "HASH_SIZE", 15_000)?;
24    symbols.add(SymbolCategory::CsHash, "HASH_EXTRA", 600_000)?;
25    symbols.add(SymbolCategory::CsHash, "HASH_OFFSET", 514)?;
26    symbols.add(SymbolCategory::CsHash, "HASH_PRIME", 8501)?;
27    Ok(())
28}
29
30#[derive(Debug)]
31pub struct ControlSeqHash {
32    need_offset_hash: Vec<u8>,
33
34    // To keep this type self-contained, it's easiest just to copy out the
35    // settings that we need to do our various computations.
36    hash_base: EqtbPointer,
37    hash_prime: u32,
38    hash_offset: i32,
39    single_base: EqtbPointer,
40    null_cs_loc: EqtbPointer,
41    undefined_cs_loc: EqtbPointer,
42    eqtb_size: EqtbPointer,
43    eqtb_top: EqtbPointer,
44    prim_eqtb_base: EqtbPointer,
45    frozen_null_font_loc: EqtbPointer,
46}
47
48impl ControlSeqHash {
49    pub(crate) fn parse<'a>(
50        input: &'a [u8],
51        engine: &Engine,
52        hash_high: i32,
53    ) -> IResult<&'a [u8], Self> {
54        let hash_base = engine.symbols.lookup("HASH_BASE") as EqtbPointer;
55        let hash_prime = engine.symbols.lookup("HASH_PRIME") as u32;
56        let hash_offset = engine.symbols.lookup("HASH_OFFSET") as i32;
57        let single_base = engine.symbols.lookup("SINGLE_BASE") as EqtbPointer;
58        let null_cs_loc = engine.symbols.lookup("NULL_CS") as EqtbPointer;
59        let undefined_cs_loc = engine.symbols.lookup("UNDEFINED_CONTROL_SEQUENCE") as EqtbPointer;
60        let eqtb_size = engine.symbols.lookup("EQTB_SIZE") as EqtbPointer;
61        let eqtb_top = engine.symbols.lookup("EQTB_TOP") as EqtbPointer;
62        let prim_eqtb_base = engine.symbols.lookup("PRIM_EQTB_BASE") as EqtbPointer;
63        let frozen_null_font_loc = engine.symbols.lookup("FROZEN_NULL_FONT") as EqtbPointer;
64
65        let index = |i: i32| (i - hash_offset) as usize * SIZEOF_MEMORY_WORD;
66
67        let high_hash_size = eqtb_top + 1 - hash_offset;
68        let mut need_offset_hash = vec![0u8; high_hash_size as usize * SIZEOF_MEMORY_WORD];
69
70        let (input, hash_used) = parseutils::ranged_be_i32(
71            hash_base,
72            engine.symbols.lookup("FROZEN_CONTROL_SEQUENCE") as i32,
73        )(input)?;
74
75        let mut p = hash_base - 1;
76        let mut input = input;
77
78        loop {
79            let (ii, new_p) = parseutils::ranged_be_i32(p + 1, hash_used)(input)?;
80            p = new_p;
81
82            // TODO: load directly into `hash`?
83            let (ii, block) = count(be_u8, 8)(ii)?;
84            let ofs = index(p);
85            need_offset_hash[ofs..ofs + 8].copy_from_slice(&block[..]);
86
87            input = ii;
88
89            if p == hash_used {
90                break;
91            }
92        }
93
94        // TODO: load directly into `hash`?
95        let nb = ((engine.symbols.lookup("UNDEFINED_CONTROL_SEQUENCE") as i32 - 1) - hash_used)
96            as usize
97            * SIZEOF_MEMORY_WORD;
98        let (input, block) = count(be_u8, nb)(input)?;
99        let ofs = index(hash_used + 1);
100        need_offset_hash[ofs..ofs + nb].copy_from_slice(&block[..]);
101
102        let mut input = input;
103
104        if hash_high > 0 {
105            let nb = hash_high as usize * SIZEOF_MEMORY_WORD;
106            let (new_input, block) = count(be_u8, nb)(input)?;
107            input = new_input;
108            let ofs = index(eqtb_size + 1);
109            need_offset_hash[ofs..ofs + nb].copy_from_slice(&block[..]);
110        }
111
112        let (input, _cs_count) = be_i32(input)?;
113
114        Ok((
115            input,
116            ControlSeqHash {
117                need_offset_hash,
118                hash_base,
119                hash_prime,
120                hash_offset,
121                single_base,
122                null_cs_loc,
123                undefined_cs_loc,
124                eqtb_size,
125                eqtb_top,
126                prim_eqtb_base,
127                frozen_null_font_loc,
128            },
129        ))
130    }
131
132    fn decode(&self, index: i32) -> (StrPointer, i32) {
133        let index = index - self.hash_offset;
134        let text_ptr = base::memword_read_b32_s1(&self.need_offset_hash[..], index);
135        let next_ptr = base::memword_read_b32_s0(&self.need_offset_hash[..], index);
136        (text_ptr, next_ptr)
137    }
138
139    pub fn lookup(&self, csname: &str, strings: &StringTable) -> Option<EqtbPointer> {
140        let csname_len_utf16 = crate::stringtable::len_utf16(csname);
141
142        let mut h = 0;
143
144        for c in csname.chars() {
145            h = h + h + c as u32;
146            while h >= self.hash_prime {
147                h -= self.hash_prime;
148            }
149        }
150
151        let mut p = h as i32 + self.hash_base;
152
153        loop {
154            let (str_ptr, next_ptr) = self.decode(p);
155
156            if str_ptr > 0 {
157                let len = strings.utf16_length(str_ptr);
158
159                if len == csname_len_utf16 {
160                    let s = strings.lookup(str_ptr);
161
162                    if s == csname {
163                        return Some(p);
164                    }
165                }
166            }
167
168            if next_ptr == 0 {
169                return None;
170            }
171
172            p = next_ptr;
173        }
174    }
175
176    /// Similar to TeX's `print_cs`
177    pub fn stringify(&self, p: EqtbPointer, strings: &StringTable) -> Option<String> {
178        if p < self.hash_base {
179            // Single-character control sequence, or active character, or the
180            // null CS.
181
182            if p >= self.single_base {
183                if p == self.null_cs_loc {
184                    return Some("".to_owned());
185                } else {
186                    let usv = (p - self.single_base) as u32;
187                    return char::from_u32(usv).map(|c| c.to_string());
188                }
189            } else {
190                // The 1 here is formally ACTIVE_BASE
191                return Some(format!(
192                    "[active character {}]",
193                    crate::format::fmt_usv(p - 1)
194                ));
195            }
196        }
197
198        if p >= self.undefined_cs_loc && p <= self.eqtb_size {
199            return None;
200        }
201
202        if p > self.eqtb_top {
203            return None;
204        }
205
206        if p >= self.prim_eqtb_base && p < self.frozen_null_font_loc {
207            //TODO: print_esc(prim[p - PRIM_EQTB_BASE].s1 - 1);
208            return None;
209        }
210
211        // `if (text(p) >= str_ptr) => "NONEXISTENT."`
212
213        let (text_ptr, _next_ptr) = self.decode(p);
214        Some(strings.lookup(text_ptr).to_owned())
215    }
216}