mdict_parser/
parser.rs

1use std::{
2    collections::HashMap,
3    io::{self, Read},
4    str,
5};
6
7use adler32::adler32;
8
9use compress::zlib;
10use encoding::{all::UTF_16LE, label::encoding_from_whatwg_label, Encoding};
11use nom::{
12    bytes::complete::{take, take_till},
13    combinator::map,
14    multi::{count, length_data, many0},
15    number::complete::{be_u16, be_u32, be_u64, be_u8, le_u32},
16    sequence::tuple,
17    IResult, Slice,
18};
19use regex::Regex;
20use ripemd::{Digest, Ripemd128};
21use salsa20::{cipher::KeyIvInit, Salsa20};
22
23use crate::mdict::Mdx;
24
25#[derive(Debug)]
26pub(crate) struct KeyBlock {
27    pub(crate) entries: Vec<KeyEntry>,
28}
29
/// A single key read from a key block.
///
/// Derives `Clone`, `PartialEq` and `Eq` in addition to `Debug` so entries can
/// be copied and compared by callers and in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct KeyEntry {
    /// Big-endian integer stored in front of the key text
    /// (presumably the offset of the matching record; confirm against callers).
    pub offset: usize,
    /// Key text, decoded with the encoding declared in the dictionary header.
    pub text: String,
}
35
36#[derive(Debug)]
37pub struct Header {
38    version: Version,
39    encrypted: u8,
40    encoding: String,
41}
42
/// Fixed-size header that precedes the key block info section.
///
/// The v2 layout stores all five values; the v1 layout has no separate
/// decompressed size, so the v1 parser copies `block_info_size` into it.
#[derive(Debug)]
struct KeyBlockHeader {
    /// First integer of the header.
    block_num: usize,
    /// Second integer of the header.
    entry_num: usize,
    /// Third integer of the v2 header (mirrors `block_info_size` for v1).
    decompressed_size: usize,
    /// Number of bytes occupied by the key block info section.
    block_info_size: usize,
    /// Number of bytes occupied by the key blocks themselves.
    key_block_size: usize,
}
51
/// Size pair describing one stored block (used for key blocks and record blocks).
///
/// The type is a plain pair of integers, so it is `Copy` and comparable in
/// addition to `Debug`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct BlockEntryInfo {
    /// Number of bytes the block occupies in the file.
    pub(crate) compressed_size: usize,
    /// Number of bytes the block expands to once decompressed.
    pub(crate) decompressed_size: usize,
}
57
/// Major format generation of a dictionary file, derived from the leading
/// digit of the `GeneratedByEngineVersion` header attribute.
#[derive(Debug)]
enum Version {
    /// Leading digit `1`: 32-bit size fields.
    V1,
    /// Leading digit `2`: 64-bit size fields.
    V2,
    /// Leading digit `3`: recognised, but rejected by every parser in this file.
    V3,
}
64
65fn parse_header(input: &[u8]) -> IResult<&[u8], Header> {
66    let (input, (info, chksum)) = tuple((length_data(be_u32), le_u32))(input)?;
67
68    assert_eq!(adler32(info).unwrap(), chksum);
69
70    let info = UTF_16LE
71        .decode(info, encoding::DecoderTrap::Strict)
72        .unwrap();
73    let attrs = parse_key_value(info.as_str());
74
75    let version = attrs
76        .get("GeneratedByEngineVersion")
77        .unwrap()
78        .trim()
79        .slice(0..1)
80        .parse::<u8>()
81        .unwrap();
82
83    let version = match version {
84        1 => Version::V1,
85        2 => Version::V2,
86        3 => Version::V3,
87        _ => panic!("unsupported version"),
88    };
89
90    let encrypted = attrs
91        .get("Encrypted")
92        .and_then(|x| match x == "Yes" {
93            true => Some(1_u8),
94            false => x.as_str().parse().ok(),
95        })
96        .unwrap_or(0);
97
98    let encoding = attrs
99        .get("Encoding")
100        .unwrap_or(&"UTF-8".to_string())
101        .to_string();
102
103    Ok((
104        input,
105        Header {
106            version,
107            encrypted,
108            encoding,
109        },
110    ))
111}
112
113fn parse_key_value(s: &str) -> HashMap<String, String> {
114    let re = Regex::new(r#"(\w+)="((.|\r\n|[\r\n])*?)""#).unwrap();
115    let mut attrs = HashMap::new();
116    for cap in re.captures_iter(s) {
117        attrs.insert(cap[1].to_string(), cap[2].to_string());
118    }
119    attrs
120}
121
122fn parse_key_block_header_v2(input: &[u8]) -> IResult<&[u8], KeyBlockHeader> {
123    let (input, block_info_buf) = take(40_usize)(input)?;
124    let (input, chksum) = be_u32(input)?;
125    assert_eq!(adler32(block_info_buf).unwrap(), chksum);
126
127    let (_, res) = map(
128        tuple((be_u64, be_u64, be_u64, be_u64, be_u64)),
129        |(block_num, entry_num, decompressed_size, block_info_size, key_block_size)| {
130            KeyBlockHeader {
131                block_num: block_num as usize,
132                entry_num: entry_num as usize,
133                decompressed_size: decompressed_size as usize,
134                block_info_size: block_info_size as usize,
135                key_block_size: key_block_size as usize,
136            }
137        },
138    )(block_info_buf)?;
139    Ok((input, res))
140}
141
142fn parse_key_block_header_v1(input: &[u8]) -> IResult<&[u8], KeyBlockHeader> {
143    let (input, block_info_buf) = take(16_usize)(input)?;
144
145    let (_, res) = map(
146        tuple((be_u32, be_u32, be_u32, be_u32)),
147        |(block_num, entry_num, block_info_size, key_block_size)| KeyBlockHeader {
148            block_num: block_num as usize,
149            entry_num: entry_num as usize,
150            decompressed_size: block_info_size as usize,
151            block_info_size: block_info_size as usize,
152            key_block_size: key_block_size as usize,
153        },
154    )(block_info_buf)?;
155    Ok((input, res))
156}
157
158fn parse_key_block_header<'a>(
159    input: &'a [u8],
160    header: &'a Header,
161) -> IResult<&'a [u8], KeyBlockHeader> {
162    match header.version {
163        Version::V2 => parse_key_block_header_v2(input),
164        Version::V1 => parse_key_block_header_v1(input),
165        _ => panic!("unsupported version"),
166    }
167}
168
169fn parse_key_block_infos<'a>(
170    input: &'a [u8],
171    size: usize,
172    dict_header: &'a Header,
173) -> IResult<&'a [u8], Vec<BlockEntryInfo>> {
174    match &dict_header.version {
175        Version::V1 => parse_key_block_infos_v1(input, size),
176        Version::V2 => parse_key_block_infos_v2(input, size, dict_header),
177        _ => panic!("unsupported version"),
178    }
179}
180
181fn parse_key_block_infos_v1<'a>(
182    input: &'a [u8],
183    size: usize,
184) -> IResult<&'a [u8], Vec<BlockEntryInfo>> {
185    let (input, block_info) = take(size)(input)?;
186    let entry_infos = decode_key_block_info_v1(&block_info[..]);
187    Ok((input, entry_infos))
188}
189fn parse_key_block_infos_v2<'a>(
190    input: &'a [u8],
191    size: usize,
192    dict_header: &'a Header,
193) -> IResult<&'a [u8], Vec<BlockEntryInfo>> {
194    let (input, block_info) = take(size)(input)?;
195
196    assert_eq!(block_info.slice(0..4), b"\x02\x00\x00\x00");
197    let mut key_block_info = vec![];
198
199    //decrypt
200    if dict_header.encrypted == 2 {
201        let mut md = Ripemd128::new();
202        let mut v = Vec::from(block_info.slice(4..8));
203        let value: u32 = 0x3695;
204        v.extend_from_slice(&value.to_le_bytes());
205        md.update(v);
206        let key = md.finalize();
207        let mut d = Vec::from(&block_info[0..8]);
208        let decrypte = fast_decrypt(&block_info[8..], key.as_slice());
209        d.extend(decrypte);
210        zlib::Decoder::new(&d[8..])
211            .read_to_end(&mut key_block_info)
212            .unwrap();
213    }
214
215    let entry_infos = decode_key_block_info_v2(&key_block_info[..]);
216    Ok((input, entry_infos))
217}
218
219fn text_len_parser_v2(input: &[u8]) -> IResult<&[u8], u16> {
220    let (input, len) = be_u16(input)?;
221    Ok((input, len + 1))
222}
223
224fn text_len_parser_v1(input: &[u8]) -> IResult<&[u8], u8> {
225    be_u8(input)
226}
227
228fn decode_key_block_info_v1(input: &[u8]) -> Vec<BlockEntryInfo> {
229    let mut info_parser = many0(map(
230        tuple((
231            be_u32,
232            length_data(text_len_parser_v1),
233            length_data(text_len_parser_v1),
234            be_u32,
235            be_u32,
236        )),
237        |(_, _, _, compressed_size, decompressed_size)| BlockEntryInfo {
238            compressed_size: compressed_size as usize,
239            decompressed_size: decompressed_size as usize,
240        },
241    ));
242    let (remain, res) = info_parser(input).unwrap();
243    assert_eq!(remain.len(), 0);
244    res
245}
246
247fn decode_key_block_info_v2(input: &[u8]) -> Vec<BlockEntryInfo> {
248    let mut info_parser = many0(map(
249        tuple((
250            be_u64,
251            length_data(text_len_parser_v2),
252            length_data(text_len_parser_v2),
253            be_u64,
254            be_u64,
255        )),
256        |(_, _, _, compressed_size, decompressed_size)| BlockEntryInfo {
257            // num,
258            compressed_size: compressed_size as usize,
259            decompressed_size: decompressed_size as usize,
260        },
261    ));
262    let (remain, res) = info_parser(input).unwrap();
263    assert_eq!(remain.len(), 0);
264    res
265}
266
267fn parse_key_blocks<'a>(
268    input: &'a [u8],
269    size: usize,
270    header: &Header,
271    block_infos: &'a Vec<BlockEntryInfo>,
272) -> IResult<&'a [u8], Vec<KeyBlock>> {
273    let (input, buf) = take(size)(input)?;
274
275    let blocks = match &header.version {
276        Version::V1 => decode_blocks(buf, block_infos, &header),
277        Version::V2 => decode_blocks(buf, block_infos, &header),
278        Version::V3 => panic!("unsupported version"),
279    };
280
281    Ok((input, blocks))
282}
283
284fn decode_blocks(buf: &[u8], entry_infos: &Vec<BlockEntryInfo>, header: &Header) -> Vec<KeyBlock> {
285    let mut buf = buf;
286
287    let mut res = vec![];
288    for info in entry_infos.iter() {
289        let (remain, decompressed) =
290            block_parser(info.compressed_size, info.decompressed_size)(buf).unwrap();
291        let (_, entries) = match &header.version {
292            Version::V1 => parse_block_items_v1(&decompressed[..], &header.encoding).unwrap(),
293            Version::V2 => parse_block_items_v2(&decompressed[..], &header.encoding).unwrap(),
294            _ => panic!("unsupported version"),
295        };
296
297        buf = remain;
298        res.push(KeyBlock { entries });
299    }
300
301    res
302}
303
304fn parse_block_items_v1<'a>(
305    input: &'a [u8],
306    encoding: &'a str,
307) -> IResult<&'a [u8], Vec<KeyEntry>> {
308    let (remain, sep) = many0(map(
309        tuple((be_u32, take_till(|x| x == 0), take(1_usize))),
310        |(offset, buf, _)| {
311            let decoder = encoding_from_whatwg_label(encoding).unwrap();
312            let text = decoder.decode(buf, encoding::DecoderTrap::Ignore).unwrap();
313            KeyEntry {
314                offset: offset as usize,
315                text,
316            }
317        },
318    ))(input)?;
319
320    assert_eq!(remain.len(), 0);
321
322    Ok((remain, sep))
323}
324fn parse_block_items_v2<'a>(
325    input: &'a [u8],
326    encoding: &'a str,
327) -> IResult<&'a [u8], Vec<KeyEntry>> {
328    let (remain, sep) = many0(map(
329        tuple((be_u64, take_till(|x| x == 0), take(1_usize))),
330        |(offset, buf, _)| {
331            let decoder = encoding_from_whatwg_label(encoding).unwrap();
332            let text = decoder.decode(buf, encoding::DecoderTrap::Ignore).unwrap();
333            KeyEntry {
334                offset: offset as usize,
335                text,
336            }
337        },
338    ))(input)?;
339
340    assert_eq!(remain.len(), 0);
341
342    Ok((remain, sep))
343}
344
345fn block_parser_v1<'a>(size: usize) -> impl FnMut(&'a [u8]) -> IResult<&'a [u8], Vec<u8>> {
346    map(
347        tuple((le_u32, take(4_usize), take(size - 8))),
348        |(enc, chksum, encrypted)| {
349            let enc_method = (enc >> 4) & 0xf;
350            let enc_size = (enc >> 8) & 0xff;
351            let comp_method = enc & 0xf;
352
353            let mut md = Ripemd128::new();
354            md.update(chksum);
355            let key = md.finalize();
356
357            let data: Vec<u8> = match enc_method {
358                0 => Vec::from(encrypted),
359                1 => fast_decrypt(encrypted, key.as_slice()),
360                2 => {
361                    let mut decrypt = vec![];
362                    let mut cipher = Salsa20::new(key.as_slice().into(), &[0; 8].into());
363
364                    decrypt
365                }
366                _ => panic!("unknown enc method: {}", enc_method),
367            };
368
369            let decompressed = match comp_method {
370                0 => data,
371                2 => {
372                    let mut v = vec![];
373                    zlib::Decoder::new(&data[..]).read_to_end(&mut v).unwrap();
374                    v
375                }
376                _ => panic!("unknown compression method: {}", comp_method),
377            };
378
379            decompressed
380        },
381    )
382}
383fn block_parser<'a>(
384    comp_size: usize,
385    decomp_size: usize,
386) -> impl FnMut(&'a [u8]) -> IResult<&'a [u8], Vec<u8>> {
387    map(
388        tuple((le_u32, take(4_usize), take(comp_size - 8))),
389        move |(enc, chksum, encrypted)| {
390            let enc_method = (enc >> 4) & 0xf;
391            let enc_size = (enc >> 8) & 0xff;
392            let comp_method = enc & 0xf;
393
394            let mut md = Ripemd128::new();
395            md.update(chksum);
396            let key = md.finalize();
397
398            let data: Vec<u8> = match enc_method {
399                0 => Vec::from(encrypted),
400                1 => fast_decrypt(encrypted, key.as_slice()),
401                2 => {
402                    let mut decrypt = vec![];
403                    let mut cipher = Salsa20::new(key.as_slice().into(), &[0; 8].into());
404
405                    decrypt
406                }
407                _ => panic!("unknown enc method: {}", enc_method),
408            };
409
410            let decompressed = match comp_method {
411                0 => data,
412                1 => {
413                    let mut comp: Vec<u8> = vec![0xf0];
414                    comp.extend_from_slice(&data[..]);
415                    let lzo = minilzo_rs::LZO::init().unwrap();
416                    lzo.decompress(&data[..], decomp_size).unwrap()
417                }
418                2 => {
419                    let mut v = vec![];
420                    zlib::Decoder::new(&data[..]).read_to_end(&mut v).unwrap();
421                    v
422                }
423                _ => panic!("unknown compression method: {}", comp_method),
424            };
425
426            decompressed
427        },
428    )
429}
430
431fn parse_record_blocks<'a>(input: &'a [u8], header: &'a Header) -> IResult<&'a [u8], Vec<BlockEntryInfo>> {
432    match &header.version {
433        Version::V1 => parse_record_blocks_v1(input),
434        Version::V2 => parse_record_blocks_v2(input),
435        _ => panic!("unsupported version"),
436    }
437}
438
439fn parse_record_blocks_v1(input: &[u8]) -> IResult<&[u8], Vec<BlockEntryInfo>> {
440    let (input, records) = be_u32(input)?;
441    let (input, entries) = be_u32(input)?;
442    let (input, record_info_size) = be_u32(input)?;
443    let (input, record_buf_size) = be_u32(input)?;
444
445    assert_eq!(records * 8, record_info_size);
446
447    count(
448        map(
449            tuple((be_u32, be_u32)),
450            |(compressed_size, decompressed_size)| BlockEntryInfo {
451                compressed_size: compressed_size as usize,
452                decompressed_size: decompressed_size as usize,
453            },
454        ),
455        records as usize,
456    )(input)
457}
458fn parse_record_blocks_v2(input: &[u8]) -> IResult<&[u8], Vec<BlockEntryInfo>> {
459    let (input, records) = be_u64(input)?;
460    let (input, entries) = be_u64(input)?;
461    let (input, record_info_size) = be_u64(input)?;
462    let (input, record_buf_size) = be_u64(input)?;
463
464    assert_eq!(records * 16, record_info_size);
465
466    count(
467        map(
468            tuple((be_u64, be_u64)),
469            |(compressed_size, decompressed_size)| BlockEntryInfo {
470                compressed_size: compressed_size as usize,
471                decompressed_size: decompressed_size as usize,
472            },
473        ),
474        records as usize,
475    )(input)
476}
477
/// Decrypts `encrypted` with the byte-wise scheme used by this file.
///
/// For every byte the two nibbles are swapped and the result is XOR-ed with
/// the previous *ciphertext* byte (`0x36` for the first byte), the low 8 bits
/// of the byte index, and the key byte at `index % key.len()`.
///
/// # Panics
///
/// Panics when `key` is empty and `encrypted` is not.
fn fast_decrypt(encrypted: &[u8], key: &[u8]) -> Vec<u8> {
    let mut previous: u8 = 0x36;
    encrypted
        .iter()
        .enumerate()
        .map(|(index, &cipher_byte)| {
            let swapped = cipher_byte.rotate_left(4);
            let plain = swapped ^ previous ^ (index as u8) ^ key[index % key.len()];
            previous = cipher_byte;
            plain
        })
        .collect()
}
489
490pub(crate) fn record_block_parser<'a>(
491    size: usize,
492    decomp_size:usize
493) -> impl FnMut(&'a [u8]) -> IResult<&'a [u8], Vec<u8>> {
494    map(
495        tuple((le_u32, take(4_usize), take(size - 8))),
496        move |(enc, chksum, encrypted)| {
497            let enc_method = (enc >> 4) & 0xf;
498            let enc_size = (enc >> 8) & 0xff;
499            let comp_method = enc & 0xf;
500
501            let mut md = Ripemd128::new();
502            md.update(chksum);
503            let key = md.finalize();
504
505            let data: Vec<u8> = match enc_method {
506                0 => Vec::from(encrypted),
507                1 => fast_decrypt(encrypted, key.as_slice()),
508                2 => {
509                    let mut decrypt = vec![];
510                    let mut cipher = Salsa20::new(key.as_slice().into(), &[0; 8].into());
511
512                    decrypt
513                }
514                _ => panic!("unknown enc method: {}", enc_method),
515            };
516
517            let decompressed = match comp_method {
518                0 => data,
519                1 => {
520                    let lzo = minilzo_rs::LZO::init().unwrap();
521                    lzo.decompress(&data[..], decomp_size).unwrap()
522                },
523                2 => {
524                    let mut v = vec![];
525                    zlib::Decoder::new(&data[..]).read_to_end(&mut v).unwrap();
526                    v
527                }
528                _ => panic!("unknown compression method: {}", comp_method),
529            };
530
531            decompressed
532        },
533    )
534}
535
536pub fn parse(data: &[u8]) -> Mdx {
537    let (input, header) = parse_header(data).unwrap();
538    let (input, key_block_header) = parse_key_block_header(input, &header).unwrap();
539    let (input, key_block_infos) =
540        parse_key_block_infos(input, key_block_header.block_info_size, &header).unwrap();
541    let (input, key_blocks) = parse_key_blocks(
542        input,
543        key_block_header.key_block_size,
544        &header,
545        &key_block_infos,
546    )
547    .unwrap();
548    let (input, record_blocks) = parse_record_blocks(input, &header).unwrap();
549    Mdx {
550        key_blocks,
551        records_info: record_blocks,
552        records: Vec::from(input),
553        encoding: header.encoding,
554        encrypted: header.encrypted,
555    }
556}