//! Parses the `vbaProject.bin` file
//!
//! Ported from:
//! https://github.com/unixfreak0037/officeparser/blob/master/officeparser.py

use std::io::Read;
use std::path::PathBuf;
use std::collections::HashMap;

use byteorder::{LittleEndian, ReadBytesExt};
use log::Level;

use cfb::{Cfb, XlsEncoding};
use utils::read_u16;

/// Errors that can be returned while reading a VBA project
#[derive(Debug, Fail)]
pub enum VbaError {
    /// Error raised by the compound file binary (cfb) layer
    #[fail(display = "{}", _0)]
    Cfb(#[cause] ::cfb::CfbError),
    /// I/O error raised while reading from a stream
    #[fail(display = "{}", _0)]
    Io(#[cause] ::std::io::Error),

    /// No module with the requested name exists in the project
    #[fail(display = "Cannot find module '{}'", _0)]
    ModuleNotFound(String),
    /// A `u16` token was read whose value is not one of the expected ones
    #[fail(display = "Unknown {} '{:X}'", typ, val)]
    Unknown {
        /// short description of the kind of token being read
        typ: &'static str,
        /// value found (displayed in hexadecimal)
        val: u16,
    },
    /// A libid could not be split into its `#` separated parts
    #[fail(display = "Unexpected libid format")]
    LibId,
    /// The next record id is not the one expected at this position
    #[fail(display = "Invalid record id: expecting {:X} found {:X}", expected, found)]
    InvalidRecordId {
        /// expected record id
        expected: u16,
        /// record id found
        found: u16,
    },
}

// Conversions relied upon by the `?` operator throughout this module.
// NOTE(review): `from_err!` is defined elsewhere in the crate; presumably it generates
// `From<$from> for VbaError` wrapping the value in the named variant -- confirm.
from_err!(::cfb::CfbError, VbaError, Cfb);
from_err!(::std::io::Error, VbaError, Io);

/// A struct for managing VBA reading
///
/// Built by `VbaProject::new` or `VbaProject::from_cfb`; it owns everything it needs,
/// so the reader is not required once construction has finished.
#[allow(dead_code)]
#[derive(Clone)]
pub struct VbaProject {
    /// references read from the `dir` stream
    references: Vec<Reference>,
    /// decompressed, still encoded, content of each module keyed by module name
    modules: HashMap<String, Vec<u8>>,
    /// encoding obtained from the project code page, used to decode module content
    encoding: XlsEncoding,
}

impl VbaProject {
    /// Create a new `VbaProject` out of the vbaProject.bin `ZipFile` or xls file
    ///
    /// Starts reading project metadata (header, directories, sectors and minisectors).
    pub fn new<R: Read>(r: &mut R, len: usize) -> Result<VbaProject, VbaError> {
        let mut cfb = Cfb::new(r, len)?;
        VbaProject::from_cfb(r, &mut cfb)
    }

    /// Creates a new `VbaProject` out of a Compound File Binary and the corresponding reader
    pub fn from_cfb<R: Read>(r: &mut R, cfb: &mut Cfb) -> Result<VbaProject, VbaError> {
        // dir stream
        let stream = cfb.get_stream("dir", r)?;
        let stream = ::cfb::decompress_stream(&*stream)?;
        let stream = &mut &*stream;

        // read dir information record (not used)
        let encoding = read_dir_information(stream)?;

        // array of REFERENCE records
        let refs = Reference::from_stream(stream, &encoding)?;

        // modules
        let mods: Vec<Module> = read_modules(stream, &encoding)?;

        // read all modules
        let modules: HashMap<String, Vec<u8>> = mods.into_iter()
            .map(|m| {
                cfb.get_stream(&m.stream_name, r).and_then(|s| {
                    ::cfb::decompress_stream(&s[m.text_offset..]).map(move |s| (m.name, s))
                })
            })
            .collect::<Result<HashMap<_, _>, _>>()?;

        Ok(VbaProject {
            references: refs,
            modules: modules,
            encoding: encoding,
        })
    }

    /// Gets the list of `Reference`s
    pub fn get_references(&self) -> &[Reference] {
        &self.references
    }

    /// Gets the list of `Module` names
    pub fn get_module_names(&self) -> Vec<&str> {
        self.modules.keys().map(|k| &**k).collect()
    }

    /// Reads module content and tries to convert to utf8
    ///
    /// While it works most of the time, the modules are MBSC encoding and the conversion
    /// may fail. If this is the case you should revert to `read_module_raw` as there is
    /// no built in decoding provided in this crate
    ///
    /// # Examples
    /// ```
    /// use calamine::{Reader, open_workbook, Xlsx};
    ///
    /// # let path = format!("{}/tests/vba.xlsm", env!("CARGO_MANIFEST_DIR"));
    /// let mut xl: Xlsx<_> = open_workbook(path).expect("Cannot find excel file");
    /// if let Some(Ok(mut vba)) = xl.vba_project() {
    ///     let vba = vba.to_mut();
    ///     let modules = vba.get_module_names().into_iter()
    ///                      .map(|s| s.to_string()).collect::<Vec<_>>();
    ///     for m in modules {
    ///         println!("Module {}:", m);
    ///         println!("{}", vba.get_module(&m)
    ///                           .expect(&format!("cannot read {:?} module", m)));
    ///     }
    /// }
    /// ```
    pub fn get_module(&self, name: &str) -> Result<String, VbaError> {
        debug!("read module {}", name);
        let data = self.get_module_raw(name)?;
        Ok(self.encoding.decode_all(data))
    }

    /// Reads module content (MBSC encoded) and output it as-is (binary output)
    pub fn get_module_raw(&self, name: &str) -> Result<&[u8], VbaError> {
        match self.modules.get(name) {
            Some(m) => Ok(&**m),
            None => return Err(VbaError::ModuleNotFound(name.into())),
        }
    }
}

/// A vba reference
///
/// Filled in from the REFERENCE records of the project `dir` stream.
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
pub struct Reference {
    /// name of the reference, as given by its REFERENCENAME record
    pub name: String,
    /// description: last `#` separated part of the libid when one is read,
    /// otherwise the same value as `name`
    pub description: String,
    /// location of the reference (empty when none could be read)
    pub path: PathBuf,
}

impl Reference {
    /// Check if the reference location is accessible
    pub fn is_missing(&self) -> bool {
        !self.path.exists()
    }

    /// Gets the list of references from the dir_stream relevant part
    fn from_stream(stream: &mut &[u8], encoding: &XlsEncoding) -> Result<Vec<Reference>, VbaError> {
        debug!("read all references metadata");

        let mut references = Vec::new();
        let mut reference = Reference {
            name: "".to_string(),
            description: "".to_string(),
            path: "".into(),
        };

        loop {
            let check = stream.read_u16::<LittleEndian>();
            match check? {
                0x000F => {
                    // termination of references array
                    if !reference.name.is_empty() {
                        references.push(reference);
                    }
                    break;
                }
                0x0016 => {
                    // REFERENCENAME
                    if !reference.name.is_empty() {
                        references.push(reference);
                    }
                    let name = read_variable_record(stream, 1)?;
                    let name = encoding.decode_all(name);
                    reference = Reference {
                        name: name.clone(),
                        description: name,
                        path: "".into(),
                    };
                    check_variable_record(0x003E, stream)?; // unicode
                }
                0x0033 => {
                    // REFERENCEORIGINAL (followed by REFERENCECONTROL)
                    reference.set_libid(stream, encoding)?;
                }
                0x002F => {
                    // REFERENCECONTROL
                    *stream = &stream[4..]; // SizeTwiddled: len of total ref control
                    reference.set_libid(stream, encoding)?;

                    *stream = &stream[6..];
                    match stream.read_u16::<LittleEndian>()? {
                        0x0016 => {
                            // optional name record extended
                            read_variable_record(stream, 1)?; // name extended
                            check_variable_record(0x003E, stream)?; // name extended unicode
                            check_record(0x0030, stream)?;
                        }
                        0x0030 => (),
                        e => {
                            return Err(VbaError::Unknown {
                                typ: "token in reference control",
                                val: e,
                            })
                        }
                    }
                    *stream = &stream[4..];
                    reference.set_libid(stream, encoding)?;
                    *stream = &stream[26..];
                }
                0x000D => {
                    // REFERENCEREGISTERED
                    *stream = &stream[4..];
                    reference.set_libid(stream, encoding)?;
                    *stream = &stream[6..];
                }
                0x000E => {
                    // REFERENCEPROJECT
                    *stream = &stream[4..];
                    let absolute = read_variable_record(stream, 1)?; // project libid absolute
                    {
                        let absolute = encoding.decode_all(absolute);
                        reference.path = if absolute.starts_with("*\\C") {
                            absolute[3..].into()
                        } else {
                            absolute.into()
                        };
                    }
                    read_variable_record(stream, 1)?; // project libid relative
                    *stream = &stream[6..];
                }
                c => {
                    return Err(VbaError::Unknown {
                        typ: "check id",
                        val: c,
                    })
                }
            }
        }

        debug!("references: {:#?}", references);
        Ok(references)
    }

    fn set_libid(&mut self, stream: &mut &[u8], encoding: &XlsEncoding) -> Result<(), VbaError> {
        let libid = read_variable_record(stream, 1)?; //libid twiddled
        if libid.is_empty() || libid.ends_with(b"##") {
            return Ok(());
        }
        let libid = encoding.decode_all(libid);
        let mut parts = libid.rsplit('#');
        match (parts.next(), parts.next()) {
            (Some(desc), Some(path)) => {
                self.description = desc.into();
                // use original path if already set
                if !path.is_empty() && self.path.as_os_str().is_empty() {
                    self.path = path.into();
                }
                Ok(())
            }
            _ => return Err(VbaError::LibId),
        }
    }
}

/// A vba module
///
/// Metadata only: the module content itself is read later from the stream called
/// `stream_name`.
#[derive(Debug, Clone, Default)]
struct Module {
    /// module name as it appears in vba project
    name: String,
    /// name of the compound file stream holding the module
    stream_name: String,
    /// offset, within the module stream, at which the compressed content starts
    text_offset: usize,
}

fn read_dir_information(stream: &mut &[u8]) -> Result<XlsEncoding, VbaError> {
    debug!("read dir header");

    // PROJECTSYSKIND, PROJECTLCID and PROJECTLCIDINVOKE Records
    *stream = &stream[30..];

    // PROJECT Codepage
    let encoding = XlsEncoding::from_codepage(read_u16(&stream[6..8]))?;
    *stream = &stream[8..];

    // PROJECTNAME Record
    check_variable_record(0x0004, stream)?;

    // PROJECTDOCSTRING Record
    check_variable_record(0x0005, stream)?;
    check_variable_record(0x0040, stream)?; // unicode

    // PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7
    check_variable_record(0x0006, stream)?;
    check_variable_record(0x003D, stream)?;

    // PROJECTHELPCONTEXT PROJECTLIBFLAGS and PROJECTVERSION Records
    *stream = &stream[32..];

    // PROJECTCONSTANTS Record
    check_variable_record(0x000C, stream)?;
    check_variable_record(0x003C, stream)?; // unicode

    Ok(encoding)
}

fn read_modules(stream: &mut &[u8], encoding: &XlsEncoding) -> Result<Vec<Module>, VbaError> {
    debug!("read all modules metadata");
    *stream = &stream[4..];

    let module_len = stream.read_u16::<LittleEndian>()? as usize;

    *stream = &stream[8..]; // PROJECTCOOKIE record
    let mut modules = Vec::with_capacity(module_len);

    for _ in 0..module_len {
        // name
        let name = check_variable_record(0x0019, stream)?;
        let name = encoding.decode_all(name);

        check_variable_record(0x0047, stream)?; // unicode

        let stream_name = check_variable_record(0x001A, stream)?; // stream name
        let stream_name = encoding.decode_all(stream_name);

        check_variable_record(0x0032, stream)?; // stream name unicode
        check_variable_record(0x001C, stream)?; // doc string
        check_variable_record(0x0048, stream)?; // doc string unicode

        // offset
        check_record(0x0031, stream)?;
        *stream = &stream[4..];
        let offset = stream.read_u32::<LittleEndian>()? as usize;

        // help context
        check_record(0x001E, stream)?;
        *stream = &stream[8..];

        // cookie
        check_record(0x002C, stream)?;
        *stream = &stream[6..];

        match stream.read_u16::<LittleEndian>()? {
            0x0021 /* procedural module */ |
            0x0022 /* document, class or designer module */ => (),
            e => return Err(VbaError::Unknown { typ: "module typ", val: e }),
        }

        loop {
            *stream = &stream[4..]; // reserved
            match stream.read_u16::<LittleEndian>() {
                Ok(0x0025) /* readonly */ | Ok(0x0028) /* private */ => (),
                Ok(0x002B) => break,
                Ok(e) => return Err(VbaError::Unknown { typ: "record id", val: e }),
                Err(e) => return Err(VbaError::Io(e)),
            }
        }
        *stream = &stream[4..]; // reserved

        modules.push(Module {
            name: name,
            stream_name: stream_name,
            text_offset: offset,
        });
    }

    Ok(modules)
}

/// Reads a variable length record
///
/// `mult` is a multiplier of the length (e.g 2 when parsing XLWideString)
fn read_variable_record<'a>(r: &mut &'a [u8], mult: usize) -> Result<&'a [u8], VbaError> {
    let len = r.read_u32::<LittleEndian>()? as usize * mult;
    let (read, next) = r.split_at(len);
    *r = next;
    Ok(read)
}

/// Check that next record matches `id` and returns a variable length record
///
/// A warning is logged when the record is longer than 100 000 bytes, as such a length
/// is suspicious; the record is still returned.
///
/// # Errors
///
/// Returns the error of `check_record` when the id does not match, or the error of
/// `read_variable_record` when the record cannot be read.
fn check_variable_record<'a>(id: u16, r: &mut &'a [u8]) -> Result<&'a [u8], VbaError> {
    check_record(id, r)?;
    let record = read_variable_record(r, 1)?;
    if log_enabled!(Level::Warn) && record.len() > 100_000 {
        // the id is printed in hexadecimal, like the other record id diagnostics here
        warn!(
            "record id {:x} has a suspiciously huge length of {} (hex: {:x})",
            id,
            record.len(),
            record.len()
        );
    }
    Ok(record)
}

/// Check that next record matches `id`
fn check_record(id: u16, r: &mut &[u8]) -> Result<(), VbaError> {
    debug!("check record {:x}", id);
    let record_id = r.read_u16::<LittleEndian>()?;
    if record_id != id {
        Err(VbaError::InvalidRecordId {
            expected: id,
            found: record_id,
        })
    } else {
        Ok(())
    }
}