adobe_cmap_parser/
lib.rs

1extern crate pom;
2
3use pom::char_class::{alpha, hex_digit, oct_digit, multispace};
4use pom::Parser;
5use pom::parser::*;
6use pom::DataInput;
7use std::collections::HashMap;
8
9use std::str::FromStr;
10
/// A single token produced by the CMap lexer.
///
/// `Clone` and `PartialEq` are derived so that callers can copy parsed values
/// and compare them directly (for example in tests).
#[derive(Debug, Clone, PartialEq)]
pub enum Value {
    /// Raw bytes of a `(...)` literal string or of a `<...>` hexadecimal string.
    LiteralString(Vec<u8>),
    /// Raw bytes of a `/Name`, without the leading slash.
    Name(Vec<u8>),
    /// Text matched by the `number` parser, kept unparsed.
    Number(String),
    /// A signed decimal integer.
    Integer(i64),
    /// The values found between `[` and `]`.
    Array(Vec<Value>),
    /// An operator keyword such as `beginbfchar`.
    Operator(String),
    /// The keywords `true` / `false`.
    Boolean(bool),
    /// The entries found between `<<` and `>>`, keyed by name.
    Dictionary(HashMap<String, Value>),
}
22
23fn hex_char() -> Parser<u8, u8> {
24    let number = is_a(hex_digit).repeat(2);
25    number.collect().convert(|v|u8::from_str_radix(&String::from_utf8(v).unwrap(), 16))
26}
27
28fn comment() -> Parser<u8, ()> {
29    sym(b'%') * none_of(b"\r\n").repeat(0..) * eol().discard()
30}
31
32fn content_space() -> Parser<u8, ()> {
33    is_a(multispace).repeat(0..).discard()
34}
35
36fn operator() -> Parser<u8, String> {
37    (is_a(alpha) | one_of(b"*'\"")).repeat(1..).convert(|v|String::from_utf8(v))
38}
39
40fn oct_char() -> Parser<u8, u8> {
41    let number = is_a(oct_digit).repeat(1..4);
42    number.collect().convert(|v|u8::from_str_radix(&String::from_utf8(v).unwrap(), 8))
43}
44
45fn escape_sequence() -> Parser<u8, Vec<u8>> {
46    sym(b'\\') *
47        ( sym(b'\\').map(|_| vec![b'\\'])
48            | sym(b'(').map(|_| vec![b'('])
49            | sym(b')').map(|_| vec![b')'])
50            | sym(b'n').map(|_| vec![b'\n'])
51            | sym(b'r').map(|_| vec![b'\r'])
52            | sym(b't').map(|_| vec![b'\t'])
53            | sym(b'b').map(|_| vec![b'\x08'])
54            | sym(b'f').map(|_| vec![b'\x0C'])
55            | oct_char().map(|c| vec![c])
56            | eol()     .map(|_| vec![])
57            | empty()   .map(|_| vec![])
58        )
59}
60
61fn nested_literal_string() -> Parser<u8, Vec<u8>> {
62    sym(b'(') *
63        ( none_of(b"\\()").repeat(1..)
64            | escape_sequence()
65            | call(nested_literal_string)
66        ).repeat(0..).map(|segments| {
67            let mut bytes = segments.into_iter().fold(
68                vec![b'('],
69                |mut bytes, mut segment| {
70                    bytes.append(&mut segment);
71                    bytes
72                });
73            bytes.push(b')');
74            bytes
75        })
76        - sym(b')')
77}
78
79fn literal_string() -> Parser<u8, Vec<u8>> {
80    sym(b'(') *
81        ( none_of(b"\\()").repeat(1..)
82            | escape_sequence()
83            | nested_literal_string()
84        ).repeat(0..).map(|segments|segments.concat())
85        - sym(b')')
86}
87
88fn name() -> Parser<u8, Vec<u8>> {
89    sym(b'/') * (none_of(b" \t\n\r\x0C()<>[]{}/%#") | sym(b'#') * hex_char()).repeat(0..)
90}
91
92fn integer() -> Parser<u8, i64> {
93    let number = one_of(b"+-").opt() + one_of(b"0123456789").repeat(1..);
94    number.collect().convert(|v|String::from_utf8(v)).convert(|s|i64::from_str(&s))
95}
96
97fn number() -> Parser<u8, String> {
98    let number = one_of(b"+-").opt() +
99        ( (one_of(b"0123456789") - one_of(b"0123456789").repeat(0..).discard())
100            | (one_of(b"0123456789").repeat(1..) * sym(b'.') - one_of(b"0123456789").repeat(0..))
101            | sym(b'.') - one_of(b"0123456789").repeat(1..)
102        );
103    number.collect().convert(|v|String::from_utf8(v))
104}
105
106fn space() -> Parser<u8, ()> {
107    ( one_of(b" \t\n\r\0\x0C").repeat(1..).discard()
108    ).repeat(0..).discard()
109}
110
111// Dictionaries are not mentioned in the CMap spec but are produced by software like Cairo and Skia and supported other by readers
112fn dictionary() -> Parser<u8, HashMap<String, Value>> {
113    let entry = name() - space() + call(value);
114    let entries = seq(b"<<") * space() * entry.repeat(0..) - seq(b">>");
115    entries.map(|entries| entries.into_iter().fold(
116        HashMap::new(),
117        |mut dict: HashMap<String, Value>, (key, value)| { dict.insert(String::from_utf8(key).unwrap(), value); dict }
118    ))
119}
120
121fn hexadecimal_string() -> Parser<u8, Vec<u8>> {
122    sym(b'<') * (space() * hex_char()).repeat(0..) - (space() * sym(b'>'))
123}
124
125fn eol() -> Parser<u8, u8> {
126    sym(b'\r') * sym(b'\n') | sym(b'\n') | sym(b'\r')
127}
128
129fn value() -> Parser<u8, Value> {
130    ( seq(b"true").map(|_| Value::Boolean(true))
131    | seq(b"false").map(|_| Value::Boolean(false))
132    | integer().map(|v| Value::Integer(v))
133    | number().map(|v| Value::Number(v))
134    | name().map(|v| Value::Name(v))
135    | operator().map(|v| Value::Operator(v))
136    | literal_string().map(|v| Value::LiteralString(v))
137    | dictionary().map(|v| Value::Dictionary(v))
138    | hexadecimal_string().map(|v| Value::LiteralString(v))
139    | array().map(|v| Value::Array(v))
140    ) - content_space()
141}
142
143
144
145fn array() -> Parser<u8, Vec<Value>> {
146    sym(b'[') * space() * call(value).repeat(0..) - sym(b']')
147}
148
149
150fn file() -> Parser<u8,Vec<Value>>
151{
152    ( comment().repeat(0..) * content_space() * value()).repeat(1..)
153}
154
155pub fn parse(input: &[u8]) -> Result<Vec<Value>, pom::Error> {
156    file().parse(&mut DataInput::new(input))
157}
158
/// Interprets `str` as a big-endian unsigned integer.
///
/// Only the low 32 bits are kept: with more than four bytes the leading ones
/// are shifted out.
fn as_code(str: &[u8]) -> u32 {
    str.iter()
        .fold(0u32, |code, &byte| (code << 8) | u32::from(byte))
}
166
167/// Return a mapping from character codes to Unicode character sequences expressed in UTF-16BE encoding.
168pub fn get_unicode_map(input: &[u8]) -> Result<HashMap<u32, Vec<u8>>, &'static str> {
169    let lexed = parse(&input).expect("failed to parse");
170
171    let mut i = 0;
172    let mut map = HashMap::new();
173    while i < lexed.len() {
174        match lexed[i] {
175            Value::Operator(ref o) => {
176                match o.as_ref() {
177                    "beginbfchar" => {
178                        let count = if let &Value::Integer(ref c) = &lexed[i-1] { Ok(*c) } else { Err("beginbfchar exected int") }?;
179                        i += 1;
180                        for _ in 0..count {
181                            let char_code = if let &Value::LiteralString(ref s) = &lexed[i] { Ok(s) } else { Err("beginbfchar exected hexstring") }?;
182                            let uni_code = if let &Value::LiteralString(ref s) = &lexed[i+1] { Ok(s) } else { Err("beginbfchar exected hexstring") }?;
183                            //let char_code =
184                            map.insert(as_code(char_code), uni_code.clone());
185                            i += 2;
186                        }
187                        i += 1;
188                    }
189                    "beginbfrange" => {
190                        let count = if let &Value::Integer(ref c) = &lexed[i-1] { Ok(*c) } else { Err("beginbfrange exected int") }?;
191                        i += 1;
192                        for _ in 0..count {
193                            let lower_code = if let &Value::LiteralString(ref s) = &lexed[i] { Ok(as_code(s)) } else { Err("beginbfrange exected hexstring") }?;
194                            let upper_code = if let &Value::LiteralString(ref s) = &lexed[i+1] { Ok(as_code(s)) } else { Err("beginbfrange exected hexstring") }?;
195                            match &lexed[i+2] {
196                                &Value::LiteralString(ref start) => {
197                                    match start.len() {
198                                        4 => {
199                                            let val = as_code(start);
200                                            for c in lower_code..=upper_code {
201                                                let code = val + (c - lower_code);
202                                                map.insert(c, code.to_be_bytes().to_vec());
203                                            }
204                                        }
205                                        2 => {
206                                            let val: u16 = as_code(start) as u16;
207                                            for c in lower_code..=upper_code {
208                                                let code = val + (c - lower_code) as u16;
209                                                map.insert(c, code.to_be_bytes().to_vec());
210                                            }
211                                        }
212                                        _ => {
213                                            panic!("bad length of hexstring");
214                                        }
215                                    }
216                                }
217                                &Value::Array(ref codes) => {
218                                    // inclusive ranges would be nice
219                                    let mut i = 0;
220                                    if (upper_code - lower_code + 1) as usize != codes.len() {
221                                        return Err("bad length of array");
222                                    }
223                                    for c in lower_code..=upper_code {
224                                        map.insert(c, if let &Value::LiteralString(ref s) = &codes[i] { Ok(s.clone()) } else { Err("beginbfrange exected hexstring") }?);
225                                        i += 1;
226                                    }
227                                }
228                                _ => { return Err("beginbfrange exected array or literal") }
229                            }
230                            i += 3;
231                        }
232                        i += 1;
233                    }
234                    _ => { i += 1; }
235                }
236
237            }
238            _ => { i += 1; }
239        }
240    }
241    Ok(map)
242
243}
244
/// Builds a [`CodeRange`] from the two byte strings that bound a codespace
/// range.
///
/// Both bounds are read as big-endian integers. `width` is the number of
/// bytes in each bound: the strings are already decoded from hex, so their
/// length is the byte count and must not be halved (halving reported a width
/// of 0 for one-byte ranges).
///
/// # Panics
///
/// Panics when the two bounds differ in length.
fn as_code_range(start_chars: &[u8], end_chars: &[u8]) -> CodeRange {
    assert!(start_chars.len() == end_chars.len());
    let to_code = |bytes: &[u8]| {
        bytes
            .iter()
            .fold(0u32, |code, &byte| (code << 8) | u32::from(byte))
    };
    CodeRange {
        width: start_chars.len() as u32,
        start: to_code(start_chars),
        end: to_code(end_chars),
    }
}

/// One entry of a `begincodespacerange` section.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CodeRange {
    /// Number of bytes in each code of the range.
    pub width: u32,
    /// Lowest code in the range, read big-endian.
    pub start: u32,
    /// Highest code in the range, read big-endian.
    pub end: u32,
}
263
/// One entry of a `begincidrange` section: a run of source codes mapped to
/// consecutive CIDs.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CIDRange {
    /// First source code of the range.
    pub src_code_lo: u32,
    /// Last source code of the range.
    pub src_code_hi: u32,
    /// CID assigned to `src_code_lo`.
    // The mixed-case name is part of the public API and is kept as is.
    #[allow(non_snake_case)]
    pub dst_CID_lo: u32,
}
271
/// Result of [`get_byte_mapping`]: the codespace and CID ranges found in a CMap.
#[derive(Debug)]
pub struct ByteMapping {
    /// Ranges collected from `begincodespacerange` sections, in file order.
    pub codespace: Vec<CodeRange>,
    /// Ranges collected from `begincidrange` sections, in file order.
    pub cid: Vec<CIDRange>,
}
277
278pub fn get_byte_mapping(input: &[u8]) -> Result<ByteMapping, &'static str> {
279    let lexed = parse(&input).expect("failed to parse");
280
281    let mut i = 0;
282    let mut result = ByteMapping { codespace: Vec::new(), cid: Vec::new() };
283    while i < lexed.len() {
284        match lexed[i] {
285            Value::Operator(ref o) => {
286                match o.as_ref() {
287                    "begincodespacerange" => {
288                        let count = if let &Value::Integer(ref c) = &lexed[i-1] { Ok(*c) } else { Err("begincodespacerange exected int") }?;
289                        i += 1;
290                        for _ in 0..count {
291                            let start = if let &Value::LiteralString(ref s) = &lexed[i] { Ok(s) } else { Err("begincodespacerange exected hexstring") }?;
292                            let end = if let &Value::LiteralString(ref s) = &lexed[i+1] { Ok(s) } else { Err("begincodespacerange exected hexstring") }?;
293                            result.codespace.push(as_code_range(start, end));
294                            i += 2;
295                        }
296                        i += 1;
297                    }
298                    "begincidrange" => {
299                        let count = if let &Value::Integer(ref c) = &lexed[i-1] { Ok(*c) } else { Err("begincidrange exected int") }?;
300                        i += 1;
301                        for _ in 0..count {
302                            let start = if let &Value::LiteralString(ref s) = &lexed[i] { Ok(s) } else { Err("begincidrange exected hexstring") }?;
303                            let end = if let &Value::LiteralString(ref s) = &lexed[i+1] { Ok(s) } else { Err("begincidrange exected hexstring") }?;
304                            let offset = if let &Value::Integer(ref s) = &lexed[i+2] { Ok(s) } else { Err("begincidrange exected int") }?;
305                            result.cid.push(CIDRange { src_code_lo: as_code(start), src_code_hi: as_code(end), dst_CID_lo: *offset as u32 });
306                            i += 2;
307                        }
308                        i += 1;
309                    }
310                    _ => { i += 1; }
311                }
312
313            }
314            _ => { i += 1; }
315        }
316    }
317    Ok(result)
318
319}
320
321
#[cfg(test)]
mod tests {
    use super::parse;
    use std::fs;

    /// Parses `input` and prints each value, or the error when parsing fails.
    fn do_parse(input: &[u8]) {
        let result = parse(input);
        match result {
            Ok(values) => {
                for value in values {
                    println!("{:?}", value)
                }
            }
            Err(_) => println!("{:?}", result),
        }
    }

    /// Smoke test: the sample CMap can be read and run through the parser.
    #[test]
    fn it_works() {
        // Fail loudly if the fixture cannot be read instead of silently
        // parsing an empty buffer.
        let contents = fs::read("examples/Identity-V").unwrap();
        do_parse(&contents);
    }
}
350}