1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
// MTEF (MathType Equation Format) Extractor for OLE Documents
//
// This module provides functionality to extract and parse MTEF binary data
// from OLE streams in legacy Office documents (.doc, .ppt, etc.).
//
// MTEF data in OLE2 documents is stored in streams named "Equation Native".
// The stream format is:
// - 28 bytes: EQNOLEFILEHDR (OLE header)
// - 5 bytes: MTEF header (version, platform, product, version major, version minor)
// - N bytes: MTEF byte stream (formula data)
use crate::formula::{MtefParser, MathNode};
use crate::ole::file::OleFile;
use std::io::{Read, Seek};
/// MTEF extractor for OLE documents
///
/// The stream-extraction helpers in the `impl` are associated functions and do
/// not need an instance. An instance only carries the arena reference that
/// `parse_mtef_to_ast` hands to `MtefParser`, and whose lifetime bounds the
/// returned `MathNode` values.
pub struct MtefExtractor<'arena> {
// Arena forwarded to `MtefParser::new` when parsing MTEF bytes.
#[allow(dead_code)] // Kept for future use in instance methods
arena: &'arena bumpalo::Bump,
}
impl<'arena> MtefExtractor<'arena> {
/// Create a new MTEF extractor
#[allow(dead_code)] // Public API for future use
pub fn new(arena: &'arena bumpalo::Bump) -> Self {
Self { arena }
}
/// Read one OLE stream and return its "Equation Native" payload, if it looks like one.
///
/// Expected stream layout:
/// - 28 bytes: EQNOLEFILEHDR (OLE header)
/// - `cbObject` bytes: MTEF data (5-byte MTEF header + MTEF byte stream)
///
/// # Arguments
///
/// * `ole_file` - The OLE file to read from
/// * `stream_path` - Path components of the stream
///   (e.g., `&["ObjectPool", "_1234567890", "Equation Native"]`)
///
/// # Returns
///
/// * `Ok(Some(bytes))` - the 28-byte OLE header followed by the `cbObject`
///   bytes it declares; anything after that in the stream is dropped.
/// * `Ok(None)` - the stream could not be opened, is shorter than 33 bytes,
///   or its header fails `validate_ole_header`.
///
/// No code path in this function currently produces `Err`; the `Result`
/// wrapper is kept for the callers' signatures.
pub(crate) fn extract_mtef_from_stream<R: Read + Seek>(
    ole_file: &mut OleFile<R>,
    stream_path: &[&str],
) -> Result<Option<Vec<u8>>, MtefExtractionError> {
    const OLE_HEADER_LEN: usize = 28;
    const MTEF_HEADER_LEN: usize = 5;

    // A stream that cannot be opened is treated as "no equation here".
    let Ok(stream) = ole_file.open_stream(stream_path) else {
        return Ok(None);
    };

    // Too short to hold both headers, or the OLE header is not plausible.
    if stream.len() < OLE_HEADER_LEN + MTEF_HEADER_LEN || !Self::validate_ole_header(&stream) {
        return Ok(None);
    }

    // cbObject (u32 little-endian at offset 8) is the MTEF length after the header.
    let declared_len =
        u32::from_le_bytes([stream[8], stream[9], stream[10], stream[11]]) as usize;

    // Clamp to the bytes actually present, then copy out header + payload.
    let end = (OLE_HEADER_LEN + declared_len).min(stream.len());
    Ok(Some(stream[..end].to_vec()))
}
/// Check whether `data` starts with a plausible EQNOLEFILEHDR.
///
/// Every field is decoded byte-by-byte with `from_le_bytes`, so `data` has no
/// alignment requirement.
///
/// Header layout (28 bytes, little-endian):
/// - Offset 0x00-0x01 (2 bytes): cb_hdr, must equal 28
/// - Offset 0x02-0x05 (4 bytes): version, must be 0x00020000 or 0x00000200
/// - Offset 0x06-0x07 (2 bytes): format, must lie in 0xC100..=0xC2FF
/// - Offset 0x08-0x0B (4 bytes): cbObject, must be non-zero and no larger
///   than the number of bytes that follow the header in `data`
/// - Offset 0x0C-0x1B (16 bytes): reserved (not inspected)
///
/// Returns `false` when `data` is shorter than 28 bytes or any check fails.
fn validate_ole_header(data: &[u8]) -> bool {
    const HEADER_LEN: usize = 28;

    // Bytes available after the header; `None` means the header itself is truncated.
    let Some(payload_capacity) = data.len().checked_sub(HEADER_LEN) else {
        return false;
    };

    let header_size = u16::from_le_bytes([data[0], data[1]]);
    let version = u32::from_le_bytes([data[2], data[3], data[4], data[5]]);
    let clipboard_format = u16::from_le_bytes([data[6], data[7]]);
    let payload_len = u32::from_le_bytes([data[8], data[9], data[10], data[11]]);

    header_size == 28
        && matches!(version, 0x00020000 | 0x00000200)
        && (0xC100..=0xC2FF).contains(&clipboard_format)
        && payload_len != 0
        && payload_len as usize <= payload_capacity
}
/// Extract all MTEF formulas from embedded OLE objects in ObjectPool
///
/// In Word documents, embedded equations are stored as OLE objects in the ObjectPool directory.
/// Each embedded object is a storage (directory) with a name like "_1234567890".
/// Within each storage, there should be a stream named "Equation Native" containing the MTEF data.
///
/// # Arguments
///
/// * `ole_file` - The OLE file to extract from
///
/// # Returns
///
/// Returns a HashMap mapping object IDs to their MTEF binary data (including OLE header)
pub(crate) fn extract_all_mtef_from_objectpool<R: Read + Seek>(
ole_file: &mut OleFile<R>,
) -> Result<std::collections::HashMap<String, Vec<u8>>, MtefExtractionError> {
use std::collections::HashMap;
let mut mtef_map = HashMap::new();
// Check if ObjectPool directory exists
if !ole_file.directory_exists(&["ObjectPool"]) {
return Ok(mtef_map);
}
// List all entries in ObjectPool
let entries = ole_file
.list_directory_entries(&["ObjectPool"])
.map_err(|e| MtefExtractionError::IoError(std::io::Error::other(e.to_string())))?;
// Process each entry in ObjectPool
for entry in entries {
// Skip if not a storage (embedded OLE objects are storages)
// STGTY_STORAGE = 1
if entry.entry_type != 1 {
continue;
}
// Object names typically start with "_"
if !entry.name.starts_with('_') {
continue;
}
// Try to extract "Equation Native" stream from this embedded object
// The stream path is: ObjectPool/<object_name>/Equation Native
if let Ok(Some(mtef_data)) = Self::extract_mtef_from_stream(
ole_file,
&["ObjectPool", &entry.name, "Equation Native"],
) {
mtef_map.insert(entry.name.clone(), mtef_data);
}
}
Ok(mtef_map)
}
/// Extract all MTEF formulas from a PowerPoint presentation
///
/// In PPT files, MTEF formulas follow a similar pattern to Word documents.
/// Equations are stored as OLE objects in storage directories that are children of the root storage.
/// These are typically named with patterns like "MBD[hexadecimal]" or "Equation Native".
///
/// # Arguments
///
/// * `ole_file` - The OLE file to extract from
///
/// # Returns
///
/// Returns a HashMap mapping storage names to their MTEF binary data (including OLE header)
#[allow(dead_code)]
pub(crate) fn extract_all_mtef_from_ppt<R: Read + Seek>(
ole_file: &mut OleFile<R>,
) -> Result<std::collections::HashMap<String, Vec<u8>>, MtefExtractionError> {
use std::collections::HashMap;
let mut mtef_map = HashMap::new();
// Get all root-level entries
let entries = ole_file
.list_directory_entries(&[])
.map_err(|e| MtefExtractionError::IoError(std::io::Error::other(e.to_string())))?;
// Process each entry at root level
for entry in entries {
// Skip if not a storage
if entry.entry_type != 1 {
continue;
}
// Look for storages that might contain equations
// Common patterns: "MBD[hex]", "Equation Native", or names starting with "_"
let is_equation_storage = entry.name.starts_with("MBD")
|| entry.name == "Equation Native"
|| entry.name.starts_with('_');
if !is_equation_storage {
continue;
}
// Try to extract "Equation Native" stream from this storage
if let Ok(Some(mtef_data)) = Self::extract_mtef_from_stream(
ole_file,
&[&entry.name, "Equation Native"],
) {
mtef_map.insert(entry.name.clone(), mtef_data);
continue;
}
// Try alternative stream names
for stream_name in &["CONTENTS", "\x01Ole", "\x01Ole10Native"] {
if let Ok(Some(mtef_data)) = Self::extract_mtef_from_stream(
ole_file,
&[&entry.name, stream_name],
) {
mtef_map.insert(entry.name.clone(), mtef_data);
break;
}
}
}
Ok(mtef_map)
}
/// Parse MTEF binary data into formula AST nodes
///
/// The input bytes are first copied into the extractor's arena so that they
/// really live for `'arena`. The previous implementation forged that lifetime
/// with `transmute` over the local `Vec`, which is freed when this function
/// returns; any node borrowing from the input would then dangle.
///
/// The copy stays allocated until the arena itself is reset or dropped.
///
/// # Arguments
///
/// * `mtef_data` - The MTEF binary data to parse
///
/// # Returns
///
/// Returns a vector of `MathNode` AST nodes representing the parsed formula
///
/// # Errors
///
/// Returns `MtefExtractionError::ParseError` carrying the parser's error
/// message when `MtefParser::parse` fails.
#[allow(dead_code)] // Public API for future use
pub fn parse_mtef_to_ast(&self, mtef_data: Vec<u8>) -> Result<Vec<MathNode<'arena>>, MtefExtractionError> {
    // Arena-owned copy: valid for 'arena without any unsafe lifetime extension.
    let arena_bytes: &'arena [u8] = self.arena.alloc_slice_copy(&mtef_data);

    let mut parser = MtefParser::new(self.arena, arena_bytes);
    parser
        .parse()
        .map_err(|e| MtefExtractionError::ParseError(e.to_string()))
}
/// Find the first usable MTEF stream among `stream_names` and parse it.
///
/// Each name is treated as a single-component stream path and read with
/// `extract_mtef_from_stream`. The first stream that yields data is parsed
/// with `parse_mtef_to_ast`; later names are not examined, even if parsing
/// of that first stream fails.
///
/// # Arguments
///
/// * `ole_file` - The OLE file to extract from
/// * `stream_names` - Candidate stream names, in priority order
///   (e.g., `&["Equation Native", "MSWordEquation"]`)
///
/// # Returns
///
/// * `Ok(Some(nodes))` - parsed AST of the first stream that held valid data
/// * `Ok(None)` - none of the names held valid MTEF data
///
/// # Errors
///
/// Propagates any error from extraction or parsing.
#[allow(dead_code)] // Public API for future use
pub fn extract_and_parse_mtef<R: Read + Seek>(
    &self,
    ole_file: &mut OleFile<R>,
    stream_names: &[&str],
) -> Result<Option<Vec<MathNode<'arena>>>, MtefExtractionError> {
    for &candidate in stream_names {
        let Some(raw) = Self::extract_mtef_from_stream(ole_file, &[candidate])? else {
            // Missing or invalid stream: move on to the next name.
            continue;
        };
        let nodes = self.parse_mtef_to_ast(raw)?;
        return Ok(Some(nodes));
    }

    Ok(None)
}
}
/// Errors that can occur during MTEF extraction
///
/// `Display` renders a short human-readable message for each variant, and
/// `From` conversions exist for `std::io::Error` and `crate::formula::MtefError`.
#[derive(Debug)]
pub enum MtefExtractionError {
/// An I/O-level failure; also used to wrap directory-listing errors from the
/// OLE layer after they are converted to `std::io::Error`.
IoError(std::io::Error),
/// The OLE header was not valid.
/// NOTE(review): nothing visible in this file constructs this variant;
/// header failures are reported as `Ok(None)` instead.
#[allow(dead_code)] // Kept for completeness
InvalidOleHeader,
/// The MTEF parser rejected the data; holds the parser's error message.
ParseError(String),
}
impl std::fmt::Display for MtefExtractionError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
MtefExtractionError::IoError(e) => write!(f, "IO error: {}", e),
MtefExtractionError::InvalidOleHeader => write!(f, "Invalid OLE header"),
MtefExtractionError::ParseError(msg) => write!(f, "Parse error: {}", msg),
}
}
}
// Marker impl: only the trait's default methods are used, so `source()`
// returns `None` even for the `IoError` variant.
impl std::error::Error for MtefExtractionError {}
impl From<std::io::Error> for MtefExtractionError {
fn from(e: std::io::Error) -> Self {
MtefExtractionError::IoError(e)
}
}
impl From<crate::formula::MtefError> for MtefExtractionError {
    /// Converts a parser error into `ParseError`, keeping only its
    /// `to_string()` text.
    fn from(parse_err: crate::formula::MtefError) -> Self {
        let message = parse_err.to_string();
        Self::ParseError(message)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::formula::ast::Formula;

    /// Number of payload bytes declared by the sample header's cbObject field.
    const SAMPLE_PAYLOAD_LEN: usize = 32;

    /// Builds a well-formed stream: a 28-byte EQNOLEFILEHDR followed by the
    /// 32 payload bytes it declares.
    ///
    /// `validate_ole_header` rejects a header whose cbObject exceeds the bytes
    /// that follow it, so a bare 28-byte header can never validate.
    fn sample_stream() -> Vec<u8> {
        let mut data = vec![
            0x1C, 0x00, // cb_hdr = 28
            0x00, 0x00, 0x02, 0x00, // version = 0x00020000 (little endian)
            0xD3, 0xC2, // format = 0xC2D3
            0x20, 0x00, 0x00, 0x00, // cbObject = 32
            0x00, 0x00, 0x00, 0x00, // reserved
            0x00, 0x00, 0x00, 0x00,
            0x00, 0x00, 0x00, 0x00,
            0x00, 0x00, 0x00, 0x00,
        ];
        data.extend_from_slice(&[0u8; SAMPLE_PAYLOAD_LEN]);
        data
    }

    #[test]
    fn test_ole_header_validation() {
        // Valid OLE header with its full payload present
        let valid_data = sample_stream();
        assert!(MtefExtractor::validate_ole_header(&valid_data));

        // Invalid OLE header (wrong cb_hdr)
        let mut invalid_data = valid_data.clone();
        invalid_data[0] = 0x10; // Invalid cb_hdr
        assert!(!MtefExtractor::validate_ole_header(&invalid_data));

        // Invalid version
        let mut invalid_version = valid_data.clone();
        invalid_version[2] = 0xFF;
        assert!(!MtefExtractor::validate_ole_header(&invalid_version));
    }

    #[test]
    fn test_ole_header_rejects_truncated_payload() {
        let valid_data = sample_stream();

        // Header alone: cbObject = 32 but no payload bytes follow
        assert!(!MtefExtractor::validate_ole_header(&valid_data[..28]));

        // One payload byte short of what cbObject declares
        let short = &valid_data[..valid_data.len() - 1];
        assert!(!MtefExtractor::validate_ole_header(short));

        // Shorter than the header itself
        assert!(!MtefExtractor::validate_ole_header(&valid_data[..27]));
    }

    #[test]
    fn test_mtef_extractor_creation() {
        // Smoke test: construction must compile and not panic.
        let formula = Formula::new();
        let _extractor = MtefExtractor::new(formula.arena());
    }
}