// xlsbye-biff12 0.1.0
//
// BIFF12 binary record parser for XLSB files.
// Documentation
use xlsbye_core::error::{Result, XlsByeError};
use xlsbye_core::types::{RichTextRun, SharedStringEntry, SharedStringTable};

use crate::record::cursor::RecordCursor;
use crate::record::header::RecordIter;
use crate::record::ids::{BRT_BEGIN_SST, BRT_END_SST, BRT_SST_ITEM};

/// Parses the record stream of a shared-strings part into a string table.
///
/// Records are consumed in order until `BrtEndSst` is seen or the input is
/// exhausted. `BrtBeginSst` supplies two `u32` values: the total reference
/// count (returned to the caller as the second tuple element) and the
/// declared number of unique strings. Every `BrtSstItem` is decoded and
/// appended to the table; any other record type is skipped.
///
/// # Errors
///
/// Returns an error when:
/// - the record iterator or a cursor read fails,
/// - the `BrtBeginSst` payload carries bytes beyond its two `u32` fields,
/// - a `BrtSstItem` appears before any `BrtBeginSst`,
/// - no `BrtBeginSst` record was found at all,
/// - an item payload is malformed, or
/// - the number of parsed items differs from the declared unique count.
pub fn parse_shared_strings(data: &[u8]) -> Result<(SharedStringTable, u32)> {
    let mut table = SharedStringTable::new();
    // (total reference count, declared unique count) taken from the most
    // recently seen BrtBeginSst; `None` until such a record has been parsed.
    let mut header: Option<(u32, u32)> = None;

    for entry in RecordIter::new(data) {
        let (kind, body) = entry?;

        if kind == BRT_END_SST.as_u16() {
            // Anything after the end marker is deliberately not inspected.
            break;
        } else if kind == BRT_BEGIN_SST.as_u16() {
            let mut reader = RecordCursor::new(body);
            let ref_total = reader.read_u32()?;
            let declared = reader.read_u32()?;
            if !reader.is_empty() {
                return Err(XlsByeError::Biff12(
                    "BrtBeginSst payload has trailing bytes".to_string(),
                ));
            }
            header = Some((ref_total, declared));
        } else if kind == BRT_SST_ITEM.as_u16() {
            if header.is_none() {
                return Err(XlsByeError::Biff12(
                    "BrtSstItem encountered before BrtBeginSst".to_string(),
                ));
            }
            table.push(parse_sst_item(body)?);
        }
    }

    let (total_ref_count, unique_count) = match header {
        Some(values) => values,
        None => {
            return Err(XlsByeError::Biff12(
                "missing BrtBeginSst record in sharedStrings.bin".to_string(),
            ));
        }
    };

    // The declared count is a u32, so the parsed length has to fit as well
    // before the two can be compared.
    let parsed_unique_count = match u32::try_from(table.len()) {
        Ok(count) => count,
        Err(_) => {
            return Err(XlsByeError::Biff12(
                "shared string unique count exceeds u32 range".to_string(),
            ));
        }
    };

    if parsed_unique_count != unique_count {
        return Err(XlsByeError::Biff12(format!(
            "shared string unique count mismatch: BrtBeginSst={unique_count}, parsed={parsed_unique_count}"
        )));
    }

    Ok((table, total_ref_count))
}

/// Decodes the payload of one `BrtSstItem` record into a [`SharedStringEntry`].
///
/// The payload is read as: one flags byte, a wide string, and — only when
/// bit 0 of the flags is set — a `u32` run count followed by that many
/// `(u16 start, u16 font index)` pairs. Run start positions are treated as
/// UTF-16 code-unit offsets into the string.
///
/// For a rich string, every run covers the text from its own start up to the
/// next run's start (or the end of the string for the last run). Text that
/// precedes the first run — or the whole text when there are no runs at all —
/// is emitted as a leading run with `font_index: None`, so no characters are
/// dropped.
///
/// NOTE(review): only bit 0 of the flags byte is interpreted. A payload that
/// carries additional data signalled by other flag bits will be rejected as
/// "trailing bytes" — confirm against the record specification whether that
/// is intended.
///
/// # Errors
///
/// Returns an error when a cursor read fails, when bytes remain after the
/// expected fields, when a run start or end lies beyond the string length, or
/// when run starts are not in ascending order.
fn parse_sst_item(payload: &[u8]) -> Result<SharedStringEntry> {
    // Encoded size of one formatting run: two u16 fields.
    const RUN_ENCODED_LEN: usize = 4;

    let mut cursor = RecordCursor::new(payload);
    let flags = cursor.read_u8()?;
    let text = cursor.read_wide_string()?;

    let has_rich_text = (flags & 0x01) != 0;
    if !has_rich_text {
        if !cursor.is_empty() {
            return Err(XlsByeError::Biff12(
                "BrtSstItem plain string payload has trailing bytes".to_string(),
            ));
        }
        return Ok(SharedStringEntry::Plain(text));
    }

    let run_count = usize::try_from(cursor.read_u32()?)
        .map_err(|_| XlsByeError::Biff12("rich text run count out of range".to_string()))?;
    // The count comes from untrusted input. The payload cannot hold more runs
    // than its length allows, so cap the pre-allocation; a bogus count then
    // surfaces as an ordinary read error instead of a huge allocation.
    let mut runs = Vec::with_capacity(run_count.min(payload.len() / RUN_ENCODED_LEN));
    for _ in 0..run_count {
        let start_index = usize::from(cursor.read_u16()?);
        let font_index = cursor.read_u16()?;
        runs.push((start_index, font_index));
    }

    if !cursor.is_empty() {
        return Err(XlsByeError::Biff12(
            "BrtSstItem rich string payload has trailing bytes".to_string(),
        ));
    }

    let utf16 = text.encode_utf16().collect::<Vec<_>>();
    let total_units = utf16.len();
    let mut rich_runs = Vec::with_capacity(runs.len() + 1);

    // Everything before the first run has no explicit font. With no runs at
    // all, that is the entire string.
    let first_start = runs.first().map_or(total_units, |&(start, _)| start);
    // Validate before slicing: an out-of-range start must be reported as an
    // error, not trigger an index panic.
    if first_start > total_units {
        return Err(XlsByeError::Biff12(format!(
            "rich text run start index {} is out of bounds for string length {}",
            first_start, total_units
        )));
    }
    if first_start > 0 {
        rich_runs.push(RichTextRun {
            font_index: None,
            text: String::from_utf16_lossy(&utf16[..first_start]),
        });
    }

    for (index, (start, font_index)) in runs.iter().enumerate() {
        if *start > total_units {
            return Err(XlsByeError::Biff12(format!(
                "rich text run start index {} is out of bounds for string length {}",
                start, total_units
            )));
        }

        // A run extends to the start of its successor, or to the end of the text.
        let end = runs
            .get(index + 1)
            .map_or(total_units, |&(next_start, _)| next_start);

        if *start > end {
            return Err(XlsByeError::Biff12(
                "rich text run start indices are not in ascending order".to_string(),
            ));
        }

        if end > total_units {
            return Err(XlsByeError::Biff12(format!(
                "rich text run end index {} is out of bounds for string length {}",
                end, total_units
            )));
        }

        // Lossy conversion: a run boundary that splits a surrogate pair yields
        // replacement characters rather than an error.
        let segment = String::from_utf16_lossy(&utf16[*start..end]);
        rich_runs.push(RichTextRun {
            font_index: Some(u32::from(*font_index)),
            text: segment,
        });
    }

    Ok(SharedStringEntry::Rich(rich_runs))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Encodes `value` as a little-endian base-128 sequence: seven payload
    /// bits per byte, high bit set on every byte except the last.
    fn encode_varint(value: u32) -> Vec<u8> {
        let mut bytes = Vec::new();
        let mut remaining = value;
        while remaining >= 0x80 {
            bytes.push((remaining & 0x7F) as u8 | 0x80);
            remaining >>= 7;
        }
        bytes.push(remaining as u8);
        bytes
    }

    /// Frames `payload` as a record: varint type, varint length, then the bytes.
    fn encode_record(record_type: u16, payload: &[u8]) -> Vec<u8> {
        let mut framed = encode_varint(u32::from(record_type));
        framed.extend(encode_varint(payload.len() as u32));
        framed.extend_from_slice(payload);
        framed
    }

    /// Encodes `value` as a `u32` code-unit count followed by UTF-16LE units.
    fn encode_wide_string(value: &str) -> Vec<u8> {
        let units: Vec<u16> = value.encode_utf16().collect();
        let mut bytes = (units.len() as u32).to_le_bytes().to_vec();
        bytes.extend(units.iter().flat_map(|unit| unit.to_le_bytes()));
        bytes
    }

    /// Payload of a `BrtBeginSst` record: total reference count, unique count.
    fn begin_sst_payload(total_refs: u32, unique: u32) -> Vec<u8> {
        [total_refs.to_le_bytes(), unique.to_le_bytes()].concat()
    }

    /// Payload of a plain (flags = 0) `BrtSstItem` record.
    fn plain_item(text: &str) -> Vec<u8> {
        let mut payload = vec![0u8];
        payload.extend(encode_wide_string(text));
        payload
    }

    /// Payload of a rich (flags bit 0 set) `BrtSstItem` record with the given
    /// `(start, font index)` runs.
    fn rich_item(text: &str, runs: &[(u16, u16)]) -> Vec<u8> {
        let mut payload = vec![0x01u8];
        payload.extend(encode_wide_string(text));
        payload.extend_from_slice(&(runs.len() as u32).to_le_bytes());
        for &(start, font) in runs {
            payload.extend_from_slice(&start.to_le_bytes());
            payload.extend_from_slice(&font.to_le_bytes());
        }
        payload
    }

    /// Builds a complete stream: begin record, one item record per entry, end record.
    fn sst_stream(begin: &[u8], items: &[Vec<u8>]) -> Vec<u8> {
        let mut stream = encode_record(BRT_BEGIN_SST.as_u16(), begin);
        for item in items {
            stream.extend(encode_record(BRT_SST_ITEM.as_u16(), item));
        }
        stream.extend(encode_record(BRT_END_SST.as_u16(), &[]));
        stream
    }

    #[test]
    fn parses_plain_shared_strings() {
        let data = sst_stream(
            &begin_sst_payload(3, 2),
            &[plain_item("hello"), plain_item("world")],
        );

        let (sst, total_ref_count) = parse_shared_strings(&data).expect("parse should succeed");

        assert_eq!(total_ref_count, 3);
        assert_eq!(
            sst,
            vec![
                SharedStringEntry::Plain("hello".to_string()),
                SharedStringEntry::Plain("world".to_string())
            ]
        );
    }

    #[test]
    fn parses_rich_text_shared_string_runs() {
        let data = sst_stream(
            &begin_sst_payload(1, 1),
            &[rich_item("HelloWorld", &[(0, 2), (5, 7)])],
        );

        let (sst, total_ref_count) = parse_shared_strings(&data).expect("parse should succeed");

        assert_eq!(total_ref_count, 1);
        assert_eq!(
            sst,
            vec![SharedStringEntry::Rich(vec![
                RichTextRun {
                    font_index: Some(2),
                    text: "Hello".to_string(),
                },
                RichTextRun {
                    font_index: Some(7),
                    text: "World".to_string(),
                }
            ])]
        );
    }

    #[test]
    fn preserves_leading_plain_segment_in_rich_string() {
        let data = sst_stream(
            &begin_sst_payload(1, 1),
            &[rich_item("PrefixBold ", &[(6, 3)])],
        );

        let (sst, _) = parse_shared_strings(&data).expect("parse should succeed");

        assert_eq!(
            sst,
            vec![SharedStringEntry::Rich(vec![
                RichTextRun {
                    font_index: None,
                    text: "Prefix".to_string(),
                },
                RichTextRun {
                    font_index: Some(3),
                    text: "Bold ".to_string(),
                },
            ])]
        );
    }
}