use super::DocumentError;
use quick_xml::events::Event;
use quick_xml::Reader;
use std::io::Cursor;
/// Upper bound on the number of XML events `to_markdown` reads from a document
/// before it stops parsing and returns the Markdown accumulated so far.
const MAX_EVENTS: usize = 1_000_000;
pub(crate) fn to_markdown(bytes: &[u8]) -> Result<String, DocumentError> {
let cursor = Cursor::new(bytes);
let mut archive = zip::ZipArchive::new(cursor)?;
let doc_entry = archive.by_name("word/document.xml")?;
let mut reader = Reader::from_reader(std::io::BufReader::new(doc_entry));
reader.config_mut().trim_text(true);
let mut out = String::with_capacity(bytes.len() / 8);
let mut buf = Vec::with_capacity(4096);
let mut in_paragraph = false;
let mut in_run = false;
let mut in_text = false;
let mut heading_level: Option<u8> = None;
let mut is_bold = false;
let mut is_italic = false;
let mut paragraph_text = String::new();
let mut in_table = false;
let mut table_rows: Vec<Vec<String>> = Vec::new();
let mut current_row: Vec<String> = Vec::new();
let mut current_cell = String::new();
let mut in_table_cell = false;
let mut event_count: usize = 0;
loop {
if event_count >= MAX_EVENTS {
break;
}
event_count += 1;
match reader.read_event_into(&mut buf) {
Ok(Event::Eof) => break,
Ok(Event::Start(ref e)) => {
let name = e.name();
let local = local_name(name.as_ref());
match local {
b"p" => {
in_paragraph = true;
paragraph_text.clear();
heading_level = None;
is_bold = false;
is_italic = false;
}
b"r" => {
in_run = true;
}
b"t" => {
in_text = true;
}
b"pStyle" if in_paragraph => {
heading_level = parse_heading_level(e);
}
b"b" if in_run => {
is_bold = true;
}
b"i" if in_run => {
is_italic = true;
}
b"tbl" => {
in_table = true;
table_rows.clear();
}
b"tr" if in_table => {
current_row.clear();
}
b"tc" if in_table => {
in_table_cell = true;
current_cell.clear();
}
_ => {}
}
}
Ok(Event::Empty(ref e)) => {
let name = e.name();
let local = local_name(name.as_ref());
match local {
b"pStyle" if in_paragraph => {
heading_level = parse_heading_level(e);
}
b"b" if in_run => {
is_bold = true;
}
b"i" if in_run => {
is_italic = true;
}
b"br" if in_paragraph => {
paragraph_text.push('\n');
}
_ => {}
}
}
Ok(Event::Text(ref e)) if in_text => {
if let Ok(text) = e.unescape() {
let text_str = text.as_ref();
if !text_str.is_empty() {
if in_table_cell {
if is_bold {
current_cell.push_str("**");
}
if is_italic {
current_cell.push('*');
}
current_cell.push_str(text_str);
if is_italic {
current_cell.push('*');
}
if is_bold {
current_cell.push_str("**");
}
} else {
if is_bold {
paragraph_text.push_str("**");
}
if is_italic {
paragraph_text.push('*');
}
paragraph_text.push_str(text_str);
if is_italic {
paragraph_text.push('*');
}
if is_bold {
paragraph_text.push_str("**");
}
}
}
}
}
Ok(Event::End(ref e)) => {
let name = e.name();
let local = local_name(name.as_ref());
match local {
b"t" => {
in_text = false;
}
b"r" => {
in_run = false;
is_bold = false;
is_italic = false;
}
b"p" => {
in_paragraph = false;
if in_table_cell {
if !current_cell.is_empty() && !paragraph_text.is_empty() {
current_cell.push(' ');
}
current_cell.push_str(¶graph_text);
} else if !paragraph_text.is_empty() {
emit_paragraph(
&mut out,
¶graph_text,
heading_level,
);
}
paragraph_text.clear();
}
b"tc" if in_table => {
in_table_cell = false;
let cell = if current_cell.contains('|') {
current_cell.replace('|', "\\|")
} else {
current_cell.clone()
};
current_row.push(cell);
current_cell.clear();
}
b"tr" if in_table => {
table_rows.push(current_row.clone());
current_row.clear();
}
b"tbl" => {
in_table = false;
emit_table(&mut out, &table_rows);
table_rows.clear();
}
_ => {}
}
}
Ok(_) => {}
Err(_) => break,
}
buf.clear();
}
Ok(out)
}
/// Returns the part of a qualified XML name that follows its first `:`.
///
/// Names without a `:` are returned unchanged.
fn local_name(full: &[u8]) -> &[u8] {
    // Split once at the first colon: the first piece is the prefix (or the
    // whole name), the optional second piece is everything after the colon.
    let mut pieces = full.splitn(2, |&byte| byte == b':');
    let _prefix = pieces.next();
    pieces.next().unwrap_or(full)
}
/// Extracts a heading level from the `val` attribute of a style element.
///
/// The attribute value is compared case-insensitively (ASCII) against the
/// form `heading<N>`. Returns `Some(N)` for the first `val` attribute whose
/// `N` parses as a number from 1 to 6, and `None` when no attribute matches.
fn parse_heading_level(e: &quick_xml::events::BytesStart<'_>) -> Option<u8> {
    e.attributes()
        .flatten()
        .filter(|attr| local_name(attr.key.as_ref()) == b"val")
        .find_map(|attr| {
            let style = std::str::from_utf8(&attr.value)
                .ok()?
                .to_ascii_lowercase();
            let depth = style.strip_prefix("heading")?.parse::<u8>().ok()?;
            if (1..=6).contains(&depth) {
                Some(depth)
            } else {
                None
            }
        })
}
/// Appends one paragraph to `out`, followed by a blank line.
///
/// When `heading_level` is `Some(n)`, the text is prefixed with `n` `#`
/// characters and a space.
fn emit_paragraph(out: &mut String, text: &str, heading_level: Option<u8>) {
    if let Some(depth) = heading_level {
        out.extend(std::iter::repeat('#').take(usize::from(depth)));
        out.push(' ');
    }
    out.push_str(text);
    out.push_str("\n\n");
}
/// Appends `rows` to `out` as a Markdown pipe table followed by a blank line.
///
/// The first row is treated as the header and is followed by a `---`
/// separator row. Rows shorter than the widest row are padded with empty
/// cells. Nothing is written when there are no rows or no columns.
///
/// Line breaks inside a cell are replaced by spaces, because a raw newline
/// would terminate the table row. Pipe characters are written as given;
/// callers are expected to have escaped them.
fn emit_table(out: &mut String, rows: &[Vec<String>]) {
    let max_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
    if max_cols == 0 {
        return;
    }
    for (i, row) in rows.iter().enumerate() {
        out.push('|');
        for col_idx in 0..max_cols {
            out.push(' ');
            if let Some(cell) = row.get(col_idx) {
                // Keep each row on a single physical line.
                out.extend(
                    cell.chars()
                        .map(|ch| if matches!(ch, '\n' | '\r') { ' ' } else { ch }),
                );
            }
            out.push_str(" |");
        }
        out.push('\n');
        if i == 0 {
            // Header separator, one `---` column per table column.
            out.push('|');
            for _ in 0..max_cols {
                out.push_str(" --- |");
            }
            out.push('\n');
        }
    }
    out.push('\n');
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Input that is not a ZIP archive must surface as an error, not a panic.
    #[test]
    fn test_garbage_bytes_no_panic() {
        assert!(to_markdown(&[]).is_err());
        assert!(to_markdown(&[0xFF; 256]).is_err());
    }

    #[test]
    fn test_local_name() {
        assert_eq!(local_name(b"w:p"), b"p");
        assert_eq!(local_name(b"a:t"), b"t");
        assert_eq!(local_name(b"body"), b"body");
    }

    #[test]
    fn test_emit_paragraph_heading() {
        let mut out = String::new();
        emit_paragraph(&mut out, "Title", Some(1));
        assert_eq!(out, "# Title\n\n");
    }

    #[test]
    fn test_emit_paragraph_plain() {
        let mut out = String::new();
        emit_paragraph(&mut out, "Hello world", None);
        assert_eq!(out, "Hello world\n\n");
    }

    #[test]
    fn test_emit_table_empty() {
        let mut out = String::new();
        emit_table(&mut out, &[]);
        assert!(out.is_empty());
    }

    #[test]
    fn test_emit_table_basic() {
        let mut out = String::new();
        let rows = vec![
            vec!["A".to_string(), "B".to_string()],
            vec!["1".to_string(), "2".to_string()],
        ];
        emit_table(&mut out, &rows);
        assert_eq!(out, "| A | B |\n| --- | --- |\n| 1 | 2 |\n\n");
    }

    #[test]
    fn test_emit_table_ragged_rows() {
        let mut out = String::new();
        let rows = vec![
            vec!["A".to_string(), "B".to_string(), "C".to_string()],
            vec!["1".to_string()],
        ];
        emit_table(&mut out, &rows);
        // A missing cell is written as a leading space plus the closing " |",
        // so padded cells show two spaces between the pipes.
        assert_eq!(
            out,
            "| A | B | C |\n| --- | --- | --- |\n| 1 |  |  |\n\n"
        );
    }
}