pdf_font/font/type1/
mod.rs

1//! Reading Type1 fonts.
2
3mod charstring;
4mod charstring_parser;
5mod decrypt;
6mod operator;
7mod standard;
8pub(crate) mod stream;
9
10use crate::font::type1::charstring::{parse_char_string, parse_char_string_width};
11use crate::font::type1::decrypt::{decrypt, decrypt_byte};
12use crate::font::type1::standard::STANDARD;
13use crate::font::type1::stream::Stream;
14use crate::font::{Arc, Map, Matrix, OutlineBuilder};
15use alloc::borrow::Cow;
16use alloc::string::{String, ToString};
17use alloc::vec;
18use alloc::vec::Vec;
19use core::iter::Copied;
20use core::slice::Iter;
21use core::str::FromStr;
22use log::error;
23// Many parts of the parser code are adapted from
24// https://github.com/janpe2/CFFDump/blob/master/cff/type1/Type1Dump.java
25
/// Everything extracted from a Type1 font program while parsing it.
#[derive(Debug)]
pub(crate) struct Parameters {
    /// Transformation read from the `/FontMatrix` entry.
    font_matrix: Matrix,
    /// Encoding read from the `/Encoding` entry.
    encoding_type: EncodingType,
    /// Decrypted subroutines read from `/Subrs`, keyed by subroutine index.
    subroutines: Map<u32, Vec<u8>>,
    /// Decrypted charstrings read from `/CharStrings`, keyed by glyph name.
    charstrings: Map<String, Vec<u8>>,
    /// Glyph names in the order in which they were read from `/CharStrings`.
    charstring_names: Vec<String>,
    /// Position of each glyph name within `charstring_names`.
    charstring_indices: Map<String, u16>,
    /// Contents of the `/WeightVector` array, if one was read.
    pub(crate) weight_vector: Option<Vec<f32>>,
}
36
37impl Default for Parameters {
38    fn default() -> Self {
39        Self {
40            font_matrix: Matrix::default(),
41            encoding_type: EncodingType::Standard,
42            subroutines: Map::new(),
43            charstrings: Map::new(),
44            charstring_names: Vec::new(),
45            charstring_indices: Map::new(),
46            weight_vector: None,
47        }
48    }
49}
50
/// A Type1 font table.
///
/// Holds the parsed font parameters behind the crate's `Arc` type.
#[derive(Debug, Clone)]
pub struct Table {
    /// The parameters extracted by [`Table::parse`].
    params: Arc<Parameters>,
}
56
57impl Table {
58    /// Parses a table from raw data.
59    pub fn parse(data: &[u8]) -> Option<Self> {
60        let data = if data.starts_with(&[0x80, 0x01]) {
61            extract_pfb_segments(data)?
62        } else if data.starts_with(b"%!") {
63            Cow::Borrowed(data)
64        } else {
65            error!("type1 font wasn't recognized!");
66
67            return None;
68        };
69
70        let mut s = Stream::new(data.as_ref());
71        let mut params = Parameters::default();
72
73        while let Some(token) = s.next_token() {
74            match token {
75                b"/FontInfo" => s.skip_dict(),
76                b"/FontName" => s.skip_token(),
77                b"/PaintType" => s.skip_token(),
78                b"/FontType" => s.skip_token(),
79                b"/FontBBox" => s.skip_token(),
80                b"/UniqueID" => s.skip_token(),
81                b"/Metrics" => s.skip_dict(),
82                b"/StrokeWidth" => s.skip_token(),
83                b"/FontMatrix" => {
84                    let matrix = s.read_font_matrix()?;
85                    params.font_matrix = Matrix {
86                        sx: matrix[0],
87                        kx: matrix[2],
88                        ky: matrix[1],
89                        sy: matrix[3],
90                        tx: matrix[4],
91                        ty: matrix[5],
92                    };
93                }
94                b"/WeightVector" => {
95                    if let Some(wv) = s.read_float_array() {
96                        params.weight_vector = Some(wv);
97                    }
98                }
99                b"/Encoding" => params.encoding_type = s.read_encoding()?,
100                b"eexec" => {
101                    let decrypted = decrypt(s.tail()?, true)?;
102                    Self::parse_eexec(&decrypted, &mut params)?;
103                }
104                _ => {}
105            }
106        }
107
108        // Parsing seems to have failed, so reject as invalid.
109        if params.charstrings.is_empty() {
110            return None;
111        }
112
113        Some(Self {
114            params: Arc::new(params),
115        })
116    }
117
118    fn parse_eexec(data: &[u8], params: &mut Parameters) -> Option<()> {
119        let mut s = Stream::new(data);
120
121        let mut len_iv = 4;
122        let mut use_decryption = true;
123
124        while let Some(token) = s.next_token() {
125            match token {
126                b"/Subrs" => {
127                    params.subroutines = s
128                        .parse_subroutines(len_iv, use_decryption)
129                        .unwrap_or_default();
130                }
131                b"/CharStrings" => {
132                    if let Some((chars, names)) = s.parse_charstrings(len_iv, use_decryption) {
133                        params.charstring_indices = names
134                            .iter()
135                            .enumerate()
136                            .map(|(i, n)| (n.clone(), i as u16))
137                            .collect();
138                        params.charstrings = chars;
139                        params.charstring_names = names;
140                    }
141                }
142                b"/lenIV" => {
143                    len_iv = s.next_int()?;
144
145                    if len_iv < 0 {
146                        use_decryption = false;
147                        len_iv = 0;
148                    }
149                }
150                b"/WeightVector" => {
151                    if let Some(wv) = s.read_float_array() {
152                        params.weight_vector = Some(wv);
153                    }
154                }
155                _ => {}
156            }
157        }
158
159        Some(())
160    }
161
    /// Returns whether this is a `MultipleMaster` font.
    ///
    /// A font is treated as such when a `/WeightVector` array was read while
    /// parsing it.
    pub fn is_multiple_master(&self) -> bool {
        self.params.weight_vector.is_some()
    }
166
    /// Returns the font transformation matrix.
    ///
    /// This is the matrix read from `/FontMatrix`, or the default matrix if
    /// the font did not contain that entry.
    pub fn matrix(&self) -> Matrix {
        self.params.font_matrix
    }
171
172    /// Outlines a glyph.
173    pub fn outline(&self, string: &str, builder: &mut dyn OutlineBuilder) -> Option<()> {
174        let data = self.params.charstrings.get(string)?;
175
176        parse_char_string(data, &self.params, builder).ok()?;
177
178        Some(())
179    }
180
181    /// Returns the raw charstring advance width for a glyph name.
182    pub fn glyph_width(&self, string: &str) -> Option<f32> {
183        let data = self.params.charstrings.get(string)?;
184        parse_char_string_width(data, &self.params).ok()
185    }
186
    /// Returns the glyph name that the font's encoding assigns to `code_point`,
    /// or `None` if the encoding has no entry for it.
    pub fn code_to_string(&self, code_point: u8) -> Option<&str> {
        self.params.encoding_type.encode(code_point)
    }
191
    /// Returns charstring names in their original insertion order, i.e. the
    /// order in which they were read from the `/CharStrings` dictionary.
    pub fn charstring_names(&self) -> &[String] {
        &self.params.charstring_names
    }
196
    /// Returns the insertion index of a charstring by name, or `None` if the
    /// font has no charstring with that name.
    pub fn charstring_index(&self, name: &str) -> Option<u16> {
        self.params.charstring_indices.get(name).copied()
    }
201}
202
203// <https://github.com/apache/pdfbox/blob/129aafe26548c1ff935af9c55cb40a996186c35f/fontbox/src/main/java/org/apache/fontbox/pfb/PfbParser.java#L119>
204fn extract_pfb_segments(pfb: &[u8]) -> Option<Cow<'static, [u8]>> {
205    const START_MARKER: u8 = 0x80;
206    const ASCII_MARKER: u8 = 0x01;
207    const BINARY_MARKER: u8 = 0x02;
208    const EOF_MARKER: u8 = 0x03;
209    const PFB_HEADER_LENGTH: usize = 18;
210
211    if pfb.len() < PFB_HEADER_LENGTH {
212        return None;
213    }
214
215    let mut stream = Stream::new(pfb);
216    let mut type_list = Vec::new();
217    let mut barr_list = Vec::new();
218    let mut total = 0;
219
220    loop {
221        let r = stream.read_byte();
222        if r.is_none() && total > 0 {
223            break; // EOF
224        }
225        let r = r?;
226
227        if r != START_MARKER {
228            return None;
229        }
230
231        let record_type = stream.read_byte()?;
232        if record_type == EOF_MARKER {
233            break;
234        }
235
236        if record_type != ASCII_MARKER && record_type != BINARY_MARKER {
237            return None;
238        }
239
240        let size_bytes = stream.read_bytes(4)?;
241        let mut size = size_bytes[0] as usize;
242        size += (size_bytes[1] as usize) << 8;
243        size += (size_bytes[2] as usize) << 16;
244        size += (size_bytes[3] as usize) << 24;
245
246        // PDFJS-14462: Simply abort parsing if we have an invalid size.
247        let Some(ar) = stream.read_bytes(size) else {
248            break;
249        };
250
251        total += size;
252        type_list.push(record_type);
253        barr_list.push(ar);
254    }
255
256    // We now have ASCII and binary segments. Lets arrange these so that the ASCII segments
257    // come first, then the binary segments, then the last ASCII segment if it is
258    // 0000... cleartomark
259
260    let mut pfbdata = Vec::with_capacity(total);
261    let mut cleartomark_segment = None;
262
263    // copy the ASCII segments
264    for i in 0..type_list.len() {
265        if type_list[i] != ASCII_MARKER {
266            continue;
267        }
268
269        let ar = barr_list[i];
270        if i == type_list.len() - 1
271            && ar.len() < 600
272            && let Ok(s) = core::str::from_utf8(ar)
273            && s.contains("cleartomark")
274        {
275            cleartomark_segment = Some(ar);
276            continue;
277        }
278        pfbdata.extend_from_slice(ar);
279    }
280
281    // copy the binary segments
282    for i in 0..type_list.len() {
283        if type_list[i] != BINARY_MARKER {
284            continue;
285        }
286        let ar = barr_list[i];
287        pfbdata.extend_from_slice(ar);
288    }
289
290    if let Some(segment) = cleartomark_segment {
291        pfbdata.extend_from_slice(segment);
292    }
293
294    Some(Cow::Owned(pfbdata))
295}
296
// Tokens accepted after the binary data of a charstring; they also end the
// subroutine array.
const ND: &[u8] = b"ND";
const ND_ALT: &[u8] = b"|-";

// Tokens that precede the binary data of a charstring or subroutine.
const RD: &[u8] = b"RD";
const RD_ALT: &[u8] = b"-|";

// Tokens accepted after the binary data of a subroutine.
const NP: &[u8] = b"NP";
const NP_ALT: &[u8] = b"|";
305
306impl<'a> Stream<'a> {
307    fn next_int(&mut self) -> Option<i64> {
308        parse_int(core::str::from_utf8(self.next_token()?).ok()?)
309    }
310
311    #[allow(clippy::type_complexity)]
312    fn parse_charstrings(
313        &mut self,
314        len_iv: i64,
315        use_decryption: bool,
316    ) -> Option<(Map<String, Vec<u8>>, Vec<String>)> {
317        let mut charstrings = Map::new();
318        let mut names = Vec::new();
319
320        let mut first_glyph_name = None;
321        let mut int_token = None;
322
323        while let Some(token) = self.next_token() {
324            if token == b"end" {
325                return Some((charstrings, names));
326            }
327
328            if token.starts_with(b"/") {
329                first_glyph_name = Some(token);
330            } else if token
331                .iter()
332                .all(|b| matches!(*b, b'#') || b.is_ascii_digit())
333            {
334                int_token = parse_int(core::str::from_utf8(token).ok()?);
335            } else if token == RD || token == RD_ALT {
336                break;
337            }
338        }
339
340        let (first_glyph_name, int_token) = (first_glyph_name?, int_token?);
341
342        let mut is_first = true;
343
344        loop {
345            let bin_len;
346            let mut glyph_name;
347
348            if is_first {
349                is_first = false;
350                bin_len = int_token;
351                glyph_name = first_glyph_name;
352
353                if glyph_name.starts_with(b"/") {
354                    glyph_name = &glyph_name[1..];
355                }
356
357                self.read_byte();
358            } else {
359                let tok = self.next_token()?;
360                if tok == b"end" {
361                    break;
362                }
363
364                if tok.starts_with(b"/") {
365                    glyph_name = &tok[1..];
366                } else {
367                    glyph_name = tok;
368                }
369
370                // See PDFBOX-3979.
371                let Some(len) = self.next_int() else {
372                    break;
373                };
374                bin_len = len;
375                let tok = self.next_token()?;
376
377                if tok == RD || tok == RD_ALT {
378                    self.read_byte();
379                } else {
380                    error!("invalid charstring in start, expected RD");
381
382                    return None;
383                }
384            }
385
386            let encrypted_bytes = self.read_bytes(bin_len as usize)?;
387            let decrypted_bytes = decrypt_charstring(encrypted_bytes, len_iv, use_decryption)?;
388            let name = core::str::from_utf8(glyph_name).ok()?.to_string();
389            names.push(name.clone());
390            charstrings.insert(name, decrypted_bytes);
391
392            let tok = self.next_token()?;
393            if tok == ND || tok == ND_ALT {
394            } else if tok.starts_with(b"/") {
395                // PDFJS-14462: In case there are no end tokens, go back so
396                // that we can parse the next charstring in the font.
397                self.move_back(tok.len());
398            } else {
399                error!("invalid charstring in end, expected ND, found {tok:?}");
400
401                if charstrings.is_empty() {
402                    return None;
403                } else {
404                    // Return what we have extracted so far.
405                    break;
406                }
407            }
408        }
409
410        Some((charstrings, names))
411    }
412
413    fn parse_subroutines(
414        &mut self,
415        len_iv: i64,
416        use_decryption: bool,
417    ) -> Option<Map<u32, Vec<u8>>> {
418        let mut subroutines = Map::new();
419
420        let num_subrs = parse_int(core::str::from_utf8(self.next_token()?).ok()?)?;
421
422        if num_subrs < 1 {
423            return Some(subroutines);
424        }
425
426        if !self.skip_until_before(b"dup", |b| matches!(b, ND | ND_ALT | b"noaccess")) {
427            return Some(subroutines);
428        }
429
430        while let Some(token) = self.next_token() {
431            if matches!(token, ND | ND_ALT) {
432                break;
433            }
434
435            if token == b"noaccess" {
436                if self.next_token() == Some(b"def") {
437                    break;
438                } else {
439                    error!("invalid sequence noaccess");
440
441                    return None;
442                }
443            }
444
445            if token != b"dup" {
446                error!("expected dup, got token {:?} instead", &token);
447
448                return None;
449            }
450
451            let subr_idx = self.next_int()?;
452            let bin_len = self.next_int()?;
453
454            let tok = self.next_token()?;
455
456            if tok != RD && tok != RD_ALT {
457                error!("invalid subroutine start token {tok:?}");
458
459                return None;
460            } else {
461                // Whitespace
462                self.read_byte();
463            }
464
465            let encrypted_bytes = self.read_bytes(bin_len as usize)?;
466            subroutines.insert(
467                subr_idx as u32,
468                decrypt_charstring(encrypted_bytes, len_iv, use_decryption)?,
469            );
470
471            let mut tok = self.next_token()?;
472            if tok == NP || tok == NP_ALT {
473            } else if tok == b"noaccess" {
474                tok = self.next_token()?;
475                if tok == b"def" {
476                    break;
477                }
478
479                if tok == b"put" {
480                } else {
481                    error!("invalid subroutine end {tok:?}");
482
483                    return None;
484                }
485            } else {
486                error!("invalid subroutine end token {tok:?}");
487
488                return None;
489            }
490        }
491
492        Some(subroutines)
493    }
494
    /// Returns the next token without consuming it, by reading it from a
    /// clone of the stream.
    fn peek_token(&mut self) -> Option<&'a [u8]> {
        self.clone().next_token()
    }
498
    /// Reads the next token from the stream.
    ///
    /// Leading whitespace and `%` line comments are skipped. The delimiters
    /// `(`, `[`, `]`, `{`, `}`, `<<` and `>>` are returned as tokens of their
    /// own. A name starting with `/` and any other run of bytes are returned
    /// up to, but not including, the next whitespace byte or delimiter.
    ///
    /// Returns `None` when the stream is exhausted.
    fn next_token(&mut self) -> Option<&'a [u8]> {
        // Consumes the remainder of a regular token and returns its length.
        // The count starts at 1 because the caller has already consumed the
        // first byte of the token.
        let skip_token = |st: &mut Stream<'_>| -> usize {
            let mut count = 1;
            while let Some(ch) = st.read_bytes(1) {
                if is_whitespace(ch[0]) || is_self_delim_after_token(ch[0]) {
                    // The terminating byte is not part of the token; put it back.
                    st.move_back(1);
                    break;
                }

                count += 1;
            }

            count
        };

        self.skip_whitespaces();

        while let Some(ch) = self.clone().read_bytes(1) {
            // Remember the data starting at the first byte of the token, so
            // that the token can be returned as a slice of it.
            let tail = self.tail()?;
            self.read_bytes(1);

            match ch[0] {
                b'%' => self.skip_line_comment(),
                b'(' => return Some(b"("),
                b'<' => {
                    if let Some(ch2) = self.read_bytes(1) {
                        if ch2[0] == b'>' {
                            // An empty `<>` is reported as the token `( )`.
                            return Some(b"( )");
                        } else if ch2[0] == b'<' {
                            return Some(b"<<");
                        } else {
                            // NOTE(review): unlike the `>` case below, the byte
                            // following `<` is not put back here, so it is lost.
                            return Some(b"<");
                        }
                    }
                    // At the end of the data, fall through; the loop then ends
                    // and `None` is returned.
                }
                b'>' => {
                    if let Some(ch2) = self.read_bytes(1) {
                        if ch2[0] == b'>' {
                            return Some(b">>");
                        } else {
                            self.move_back(1);
                            return Some(b">");
                        }
                    }
                }
                b'[' => {
                    return Some(b"[");
                }
                b']' => {
                    return Some(b"]");
                }
                b'{' => {
                    return Some(b"{");
                }
                b'}' => {
                    return Some(b"}");
                }
                b'/' => {
                    if let Some(ch2) = self.read_bytes(1) {
                        if is_whitespace(ch2[0]) || is_self_delim_after_token(ch2[0]) {
                            // A lone `/` that is not followed by a name.
                            let token = b"/";

                            // A following delimiter starts the next token, so
                            // it is put back; following whitespace stays consumed.
                            if is_self_delim_after_token(ch2[0]) {
                                self.move_back(1);
                            }

                            return Some(token);
                        } else {
                            let count = skip_token(self);

                            // `count` covers the name; add one for the `/`.
                            return Some(&tail[0..(count + 1)]);
                        }
                    }
                }
                _ => {
                    let count = skip_token(self);
                    return Some(&tail[0..count]);
                }
            }

            // Only reached after a comment or at the end of the data.
            self.skip_whitespaces();
        }

        None
    }
584
585    // Note: AI generated, haven't double-checked.
586    fn read_float_array(&mut self) -> Option<Vec<f32>> {
587        let mut entries = Vec::new();
588
589        if self.next_token()? != b"[" {
590            return None;
591        }
592
593        while let Some(token) = self.next_token() {
594            if token == b"]" {
595                break;
596            }
597            if let Ok(s) = core::str::from_utf8(token)
598                && let Ok(v) = f32::from_str(s)
599            {
600                entries.push(v);
601            }
602        }
603
604        Some(entries)
605    }
606
607    fn read_font_matrix(&mut self) -> Option<[f32; 6]> {
608        let mut entries = [0.0_f32; 6];
609        let mut idx = 0;
610
611        // Skip '[';
612        self.skip_token();
613
614        while let Some(token) = self.next_token() {
615            entries[idx] = f32::from_str(core::str::from_utf8(token).ok()?).ok()?;
616
617            idx += 1;
618            if idx == 5 {
619                break;
620            }
621        }
622
623        // Skip `]`.
624        self.skip_token();
625
626        Some(entries)
627    }
628
629    fn read_encoding(&mut self) -> Option<EncodingType> {
630        let mut map = Map::new();
631
632        let t1 = self.next_token()?;
633        let t2 = self.next_token()?;
634
635        if t1 == b"StandardEncoding" && t2 == b"def" {
636            return Some(EncodingType::Standard);
637        }
638
639        if !self.skip_until_before(b"dup", |b| matches!(b, b"def" | b"readonly")) {
640            return Some(EncodingType::Custom(Arc::new(map)));
641        }
642
643        while let Some(token) = self.next_token() {
644            if matches!(token, b"def" | b"readonly") {
645                break;
646            }
647
648            if token != b"dup" {
649                error!("Unexpected token {token:?}");
650
651                return None;
652            }
653
654            let next = self.next_token();
655            // TODO: Should other places in the parser also use `parse_int`?
656            let code = parse_int(core::str::from_utf8(next?).ok()?)?;
657            let glyph_name = core::str::from_utf8(&self.next_token()?[1..])
658                .ok()?
659                .to_string();
660
661            if self.next_token()? != b"put" {
662                error!("Unexpected token {token:?}");
663
664                return None;
665            }
666
667            map.insert(u8::try_from(code).ok()?, glyph_name);
668        }
669
670        Some(EncodingType::Custom(Arc::new(map)))
671    }
672
673    fn skip_dict(&mut self) {
674        self.skip_until(b"begin", |b| matches!(b, b"end"));
675    }
676
677    fn skip_token(&mut self) {
678        self.next_token();
679    }
680
681    fn skip_line_comment(&mut self) {
682        while let Some(ch) = self.read_byte() {
683            if matches!(ch, b'\n' | b'\r') {
684                break;
685            }
686        }
687    }
688
689    fn skip_until(&mut self, find: &[u8], stop: impl Fn(&[u8]) -> bool) -> bool {
690        while let Some(token) = self.next_token() {
691            if token == find {
692                return true;
693            }
694
695            if stop(token) {
696                break;
697            }
698        }
699
700        false
701    }
702
703    fn skip_whitespaces(&mut self) {
704        while let Some(ch) = self.peek_byte() {
705            if is_whitespace(ch) {
706                self.read_byte();
707            } else {
708                break;
709            }
710        }
711    }
712
713    fn skip_until_before(&mut self, find: &[u8], stop: impl Fn(&[u8]) -> bool) -> bool {
714        while let Some(token) = self.peek_token() {
715            if token == find {
716                return true;
717            }
718
719            if self.next_token().is_none() {
720                return false;
721            }
722
723            if stop(token) {
724                break;
725            }
726        }
727
728        false
729    }
730}
731
732fn decrypt_charstring(data: &[u8], len_iv: i64, use_decryption: bool) -> Option<Vec<u8>> {
733    let mut r = 4330;
734    let mut cb: Copied<Iter<'_, u8>> = data.iter().copied();
735    let mut decrypted = vec![];
736
737    for _ in 0..len_iv {
738        let _ = decrypt_byte(cb.next()?, &mut r, use_decryption);
739    }
740
741    for byte in cb {
742        decrypted.push(decrypt_byte(byte, &mut r, use_decryption));
743    }
744
745    Some(decrypted)
746}
747
/// Returns whether `c` is a whitespace byte: space, line feed, carriage
/// return, tab, NUL or form feed.
fn is_whitespace(c: u8) -> bool {
    const WHITESPACE: [u8; 6] = [b' ', b'\n', b'\r', b'\t', 0x00, 0x0C];

    WHITESPACE.contains(&c)
}
755
/// Returns whether `c` ends the token in front of it.
///
/// The characters `()<>[]{}/%` are special. They delimit syntactic entities
/// such as strings, procedure bodies, name literals, and comments. Any of
/// these characters terminates the entity preceding it and is not included in
/// the entity.
///
/// For example, `>` can appear in the keyword `>>`, like in `/Pages 2 0 R>>`,
/// so it must end the token `R`. Likewise `)` is included, so that it ends
/// the last token inside a parenthesized string.
fn is_self_delim_after_token(c: u8) -> bool {
    b"()<>[]{}/%".contains(&c)
}
770
/// The encoding of a Type1 font, mapping character codes to glyph names.
#[derive(Debug, Clone)]
pub(crate) enum EncodingType {
    /// The built-in standard encoding table.
    Standard,
    /// A mapping defined by the font itself.
    Custom(Arc<Map<u8, String>>),
}
776
777impl EncodingType {
778    pub(crate) fn encode(&self, code: u8) -> Option<&str> {
779        match self {
780            Self::Standard => STANDARD.get(&code).copied(),
781            Self::Custom(c) => c.get(&code).map(|s| s.as_str()),
782        }
783    }
784}
785
/// Parses an integer in plain decimal notation or in radix notation.
///
/// A radix number has the form `radix#digits`, like `8#40`, where the radix
/// is one or two characters long and lies between 2 and 36. Any other string
/// is parsed as a decimal `i64`.
///
/// Returns `None` if the string is not a valid integer in either notation.
fn parse_int(str: &str) -> Option<i64> {
    match str.split_once('#') {
        // Only a `#` in the second or third position marks a radix number.
        Some((radix_digits, number_digits)) if matches!(radix_digits.len(), 1 | 2) => {
            let radix: u32 = radix_digits.parse().ok()?;

            if !(2..=36).contains(&radix) {
                return None;
            }

            i64::from_str_radix(number_digits, radix).ok()
        }
        _ => str.parse().ok(),
    }
}
807
// Tests for the tokenizer.
#[cfg(test)]
mod tests {
    use crate::font::type1::stream::Stream;

    // Asserts that the next token read from `$content` equals `$token`.
    macro_rules! assert_token {
        ($content:expr, $token:expr) => {
            assert_eq!($content.next_token(), Some(&$token[..]))
        };
    }

    // A name token ends at the whitespace that follows it.
    #[test]
    fn lexing_1() {
        let mut content = Stream::new(b"/FontInfo ");

        assert_token!(content, b"/FontInfo");
    }

    // Parentheses are returned as separate tokens around the string contents.
    #[test]
    fn lexing_2() {
        let mut content = Stream::new(b"/version (01) readonly def");

        assert_token!(content, b"/version");
        assert_token!(content, b"(");
        assert_token!(content, b"01");
        assert_token!(content, b")");
        assert_token!(content, b"readonly");
        assert_token!(content, b"def");
    }
}
pdf_font/font/type1/mod.rs

pdf_font/font/type1/
mod.rs