//! xlsbye-biff12 0.1.0
//!
//! BIFF12 binary record parser for XLSB files
//! Documentation
use xlsbye_core::error::{Result, XlsByeError};
use xlsbye_core::types::{CellRef, Comment, ParsedComments, RichTextRun};

use crate::record::cursor::RecordCursor;
use crate::record::header::RecordIter;
use crate::record::ids::{BRT_BEGIN_COMMENT, BRT_COMMENT_AUTHOR};

// Record type ids that `parse_comments` recognizes in addition to the ids
// imported from `crate::record::ids`.
// NOTE(review): 0x0278 / 0x027B / 0x027D are decimal 632 / 635 / 637, which
// look like the MS-XLSB BrtCommentAuthor / BrtBeginComment / BrtCommentText
// record numbers — confirm against the spec and against `record::ids`.

/// Alternate record type id whose payload is read as a comment author name.
const BRT_REAL_COMMENT_AUTHOR: u16 = 0x0278;
/// Record type id whose payload holds a comment's author index and cell range.
const BRT_REAL_COMMENT_ANCHOR: u16 = 0x027B;
/// Record type id whose payload holds the text for the most recent anchor record.
const BRT_REAL_COMMENT_TEXT: u16 = 0x027D;

/// Parses the records in `data` into a list of author names and a list of
/// cell comments.
///
/// Records are dispatched by type, first match wins:
///
/// * author records (either id) with a non-empty payload add one author name;
/// * `BRT_BEGIN_COMMENT` records with a non-empty payload are parsed as a
///   complete comment (anchor plus optional text);
/// * `BRT_REAL_COMMENT_ANCHOR` records start a pending comment, replacing any
///   pending comment that has not yet received its text;
/// * `BRT_REAL_COMMENT_TEXT` records supply the text of the pending comment
///   and emit it; they are ignored when no comment is pending.
///
/// Every other record is skipped.
///
/// # Errors
///
/// Propagates any error produced while iterating the records or while
/// decoding a recognized payload.
pub fn parse_comments(data: &[u8]) -> Result<ParsedComments> {
    let mut authors = Vec::new();
    let mut comments = Vec::new();
    // Anchor seen but text not yet seen.
    let mut awaiting_text: Option<Comment> = None;

    for entry in RecordIter::new(data) {
        let (record_type, payload) = entry?;
        let has_payload = !payload.is_empty();

        // Arm order mirrors the required precedence between record kinds.
        match record_type {
            id if has_payload
                && (id == BRT_COMMENT_AUTHOR.as_u16() || id == BRT_REAL_COMMENT_AUTHOR) =>
            {
                let name = RecordCursor::new(payload).read_wide_string()?;
                authors.push(name);
            }
            id if has_payload && id == BRT_BEGIN_COMMENT.as_u16() => {
                let comment = parse_comment(payload)?;
                comments.push(comment);
            }
            BRT_REAL_COMMENT_ANCHOR => {
                let anchor = parse_real_comment_anchor(payload)?;
                awaiting_text = Some(anchor);
            }
            BRT_REAL_COMMENT_TEXT => {
                // The payload is only decoded when there is a comment to attach it to.
                if let Some(mut anchored) = awaiting_text.take() {
                    anchored.text = parse_real_comment_text(payload)?;
                    comments.push(anchored);
                }
            }
            _ => {}
        }
    }

    Ok(ParsedComments { authors, comments })
}

/// Decodes an anchor record payload into a [`Comment`] with empty text.
///
/// The payload is read as five consecutive `u32` values: author index, first
/// row, last row, first column, last column. Only the first row and first
/// column are kept, each shifted to a 1-based index; the "last" values are
/// read and discarded.
///
/// # Errors
///
/// Fails if the payload is too short or if a row/column value cannot be
/// incremented without overflowing `u32`.
fn parse_real_comment_anchor(payload: &[u8]) -> Result<Comment> {
    let mut reader = RecordCursor::new(payload);

    let author_index = reader.read_u32()?;

    let first_row = reader.read_u32()?;
    let row = one_based(first_row, "comment row")?;
    reader.read_u32()?; // last row, unused

    let first_col = reader.read_u32()?;
    let col = one_based(first_col, "comment column")?;
    reader.read_u32()?; // last column, unused

    let cell_ref = CellRef { row, col };
    Ok(Comment {
        cell_ref,
        author_index,
        text: Vec::new(),
    })
}

/// Decodes a comment text record payload into its text runs.
///
/// Layout as read here: one flags byte, a `u32` count of UTF-16 code units,
/// the code units themselves (little-endian), and — only when bit 0 of the
/// flags is set — a `u32` run count followed by that many
/// `(start: u16, font: u16)` pairs. Each `start` is an offset in UTF-16 code
/// units into the text.
///
/// Returns:
///
/// * a single run without a font when bit 0 of the flags is clear, or when
///   it is set but the run list is empty (so the text is never dropped);
/// * otherwise an optional leading run without a font covering the text
///   before the first run, followed by one run per entry, each extending to
///   the start of the next entry (or the end of the text).
///
/// Invalid UTF-16 is replaced lossily rather than rejected.
///
/// # Errors
///
/// Fails if the payload is truncated, if the declared text length does not
/// fit in memory addressing, or if the run offsets point past the end of the
/// text or are not in ascending order. The payload is untrusted file data, so
/// these cases are reported as errors instead of panicking on a slice index.
fn parse_real_comment_text(payload: &[u8]) -> Result<Vec<RichTextRun>> {
    let mut cursor = RecordCursor::new(payload);
    let flags = cursor.read_u8()?;
    let char_count = usize::try_from(cursor.read_u32()?)
        .map_err(|_| XlsByeError::Biff12("comment text length out of range".to_string()))?;
    // Two bytes per UTF-16 code unit; guard the multiplication for 32-bit targets.
    let byte_count = char_count
        .checked_mul(2)
        .ok_or_else(|| XlsByeError::Biff12("comment text length out of range".to_string()))?;
    let utf16_bytes = cursor.read_bytes(byte_count)?;
    let utf16 = utf16_bytes
        .chunks_exact(2)
        .map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]))
        .collect::<Vec<_>>();

    if flags & 0x01 == 0 {
        return Ok(vec![RichTextRun {
            font_index: None,
            text: String::from_utf16_lossy(&utf16),
        }]);
    }

    let run_count = usize::try_from(cursor.read_u32()?)
        .map_err(|_| XlsByeError::Biff12("comment rich text run count out of range".to_string()))?;
    // Every run takes 4 payload bytes, so the payload size bounds how many can
    // really be present. Never preallocate from the declared count alone.
    let mut runs = Vec::with_capacity(run_count.min(payload.len() / 4));
    let mut previous_start = 0usize;
    for _ in 0..run_count {
        let start_index = usize::from(cursor.read_u16()?);
        let font_index = cursor.read_u16()?;
        if start_index > utf16.len() {
            return Err(XlsByeError::Biff12(format!(
                "comment rich text run start {start_index} exceeds text length {}",
                utf16.len()
            )));
        }
        if start_index < previous_start {
            return Err(XlsByeError::Biff12(format!(
                "comment rich text run start {start_index} precedes previous run start {previous_start}"
            )));
        }
        previous_start = start_index;
        runs.push((start_index, font_index));
    }

    // Rich flag set but no runs: keep the text as one unformatted run.
    if runs.is_empty() {
        return Ok(vec![RichTextRun {
            font_index: None,
            text: String::from_utf16_lossy(&utf16),
        }]);
    }

    let mut rich_runs = Vec::with_capacity(runs.len() + 1);

    // Text before the first formatted run has no font of its own.
    if let Some(&(first_start, _)) = runs.first() {
        if first_start > 0 {
            rich_runs.push(RichTextRun {
                font_index: None,
                text: String::from_utf16_lossy(&utf16[..first_start]),
            });
        }
    }

    // The validation above guarantees start <= end <= utf16.len() for every run.
    for (index, &(start, font_index)) in runs.iter().enumerate() {
        let end = runs
            .get(index + 1)
            .map_or(utf16.len(), |&(next_start, _)| next_start);
        rich_runs.push(RichTextRun {
            font_index: Some(u32::from(font_index)),
            text: String::from_utf16_lossy(&utf16[start..end]),
        });
    }

    Ok(rich_runs)
}

/// Decodes a `BRT_BEGIN_COMMENT` payload into a [`Comment`].
///
/// The payload starts with five `u32` values: author index, first row, last
/// row, first column, last column. The first row and first column are
/// converted to 1-based indices; the "last" values are read and ignored. If
/// any bytes remain after that, they are read as a wide string and returned
/// as a single text run without a font; otherwise the comment has no text
/// runs.
///
/// # Errors
///
/// Fails if the payload is truncated, if the trailing string cannot be read,
/// or if a row/column value overflows when shifted to 1-based.
fn parse_comment(payload: &[u8]) -> Result<Comment> {
    let mut reader = RecordCursor::new(payload);

    let author_index = reader.read_u32()?;
    let row = one_based(reader.read_u32()?, "comment row")?;
    reader.read_u32()?; // last row, unused
    let col = one_based(reader.read_u32()?, "comment column")?;
    reader.read_u32()?; // last column, unused

    let mut text = Vec::new();
    if !reader.is_empty() {
        let body = reader.read_wide_string()?;
        text.push(RichTextRun {
            font_index: None,
            text: body,
        });
    }

    Ok(Comment {
        cell_ref: CellRef { row, col },
        author_index,
        text,
    })
}

/// Returns `value + 1`, turning a 0-based index into a 1-based one.
///
/// `field` names the value being converted and is used only in the error
/// message.
///
/// # Errors
///
/// Returns [`XlsByeError::Biff12`] when `value` is `u32::MAX` and the
/// increment would overflow.
fn one_based(value: u32, field: &str) -> Result<u32> {
    match value.checked_add(1) {
        Some(shifted) => Ok(shifted),
        None => Err(XlsByeError::Biff12(format!(
            "{field} value {value} overflows when converting to 1-based index"
        ))),
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::record::ids::{
        BRT_BEGIN_COMMENT_AUTHORS, BRT_BEGIN_COMMENT_LIST, BRT_END_COMMENT, BRT_END_COMMENT_LIST,
    };

    /// Encodes `value` as 7-bit groups, least-significant group first, with
    /// the high bit set on every byte except the last.
    fn encode_varint(mut value: u32) -> Vec<u8> {
        let mut bytes = Vec::new();
        while value >= 0x80 {
            bytes.push((value & 0x7F) as u8 | 0x80);
            value >>= 7;
        }
        bytes.push(value as u8);
        bytes
    }

    /// Builds one record: varint type, varint payload length, then the payload.
    fn encode_record(record_type: u16, payload: &[u8]) -> Vec<u8> {
        let type_bytes = encode_varint(u32::from(record_type));
        let length_bytes = encode_varint(payload.len() as u32);
        [type_bytes.as_slice(), length_bytes.as_slice(), payload].concat()
    }

    /// Encodes `value` as a little-endian `u32` code-unit count followed by
    /// its UTF-16 code units in little-endian order.
    fn encode_wide_string(value: &str) -> Vec<u8> {
        let units: Vec<u16> = value.encode_utf16().collect();
        let mut bytes = (units.len() as u32).to_le_bytes().to_vec();
        bytes.extend(units.iter().flat_map(|unit| unit.to_le_bytes()));
        bytes
    }

    #[test]
    fn parses_authors_and_comments() {
        // Author index 0, rows 0..=0, columns 1..=1, followed by the text.
        let mut comment_payload: Vec<u8> = [0u32, 0, 0, 1, 1]
            .iter()
            .flat_map(|field| field.to_le_bytes())
            .collect();
        comment_payload.extend(encode_wide_string("Looks good"));

        let records: Vec<(u16, Vec<u8>)> = vec![
            (BRT_BEGIN_COMMENT_AUTHORS.as_u16(), Vec::new()),
            (BRT_COMMENT_AUTHOR.as_u16(), encode_wide_string("Alice")),
            (BRT_BEGIN_COMMENT_LIST.as_u16(), Vec::new()),
            (BRT_BEGIN_COMMENT.as_u16(), comment_payload),
            (BRT_END_COMMENT.as_u16(), Vec::new()),
            (BRT_END_COMMENT_LIST.as_u16(), Vec::new()),
        ];
        let data: Vec<u8> = records
            .iter()
            .flat_map(|(record_type, payload)| encode_record(*record_type, payload))
            .collect();

        let parsed = parse_comments(&data).expect("comments should parse");

        assert_eq!(parsed.authors, vec!["Alice".to_string()]);
        assert_eq!(parsed.comments.len(), 1);

        let only_comment = &parsed.comments[0];
        // Row 0 / column 1 in the payload become 1 / 2 after the 1-based shift.
        assert_eq!(only_comment.cell_ref.row, 1);
        assert_eq!(only_comment.cell_ref.col, 2);
        assert_eq!(only_comment.author_index, 0);
        assert_eq!(only_comment.text[0].text, "Looks good");
    }
}