// Sentinel characters standing in for literal whitespace inside `<w:t>` text.
//
// Each sentinel lives in the Unicode Private Use Area (U+E000..=U+F8FF) and its
// low byte equals the ASCII code of the whitespace character it replaces.
// `substitute_whitespace_only_runs` writes them and `restore_whitespace_sentinels`
// maps them back.
// NOTE(review): presumably this exists because the XML deserializer drops
// whitespace-only text nodes — confirm against the parser configuration.

/// Stand-in for an ASCII space (`0x20`).
pub(crate) const WS_SENTINEL_SPACE: char = '\u{E020}';
/// Stand-in for a horizontal tab (`0x09`).
pub(crate) const WS_SENTINEL_TAB: char = '\u{E009}';
/// Stand-in for a line feed (`0x0A`).
pub(crate) const WS_SENTINEL_LF: char = '\u{E00A}';
/// Stand-in for a carriage return (`0x0D`).
pub(crate) const WS_SENTINEL_CR: char = '\u{E00D}';
/// Replaces the text of every whitespace-only `<w:t>` element with sentinel characters.
///
/// The buffer is walked from left to right. Wherever `match_w_t_with_text` recognises a
/// `<w:t ...>text</w:t>` element whose text is non-empty and consists solely of space,
/// tab, LF or CR bytes, each of those bytes is replaced by the UTF-8 encoding of the
/// corresponding `WS_SENTINEL_*` character. Every other byte is copied through unchanged.
///
/// Returns a freshly allocated buffer; `xml` is never modified. Buffers that do not
/// contain the byte sequence `<w:t` at all are returned as a plain copy.
pub(crate) fn substitute_whitespace_only_runs(xml: &[u8]) -> Vec<u8> {
    // Fast path: without any `<w:t` there is nothing that could be rewritten.
    if !contains_subslice(xml, b"<w:t") {
        return xml.to_vec();
    }

    let mut rewritten: Vec<u8> = Vec::with_capacity(xml.len());
    let mut cursor = 0;
    while let Some(&byte) = xml.get(cursor) {
        let (text_start, text_end, element_end) = match match_w_t_with_text(xml, cursor) {
            Some(spans) => spans,
            None => {
                // Not the start of a `<w:t>text</w:t>` element: copy one byte and move on.
                rewritten.push(byte);
                cursor += 1;
                continue;
            }
        };

        let text = &xml[text_start..text_end];
        let whitespace_only = !text.is_empty() && text.iter().all(is_xml_whitespace_byte);
        if !whitespace_only {
            // Ordinary (or empty) text: emit the whole element verbatim.
            rewritten.extend_from_slice(&xml[cursor..element_end]);
            cursor = element_end;
            continue;
        }

        // Keep the start tag, then swap each whitespace byte for its sentinel.
        rewritten.extend_from_slice(&xml[cursor..text_start]);
        let mut utf8 = [0u8; 4];
        for &ws in text {
            let sentinel = match ws {
                b' ' => WS_SENTINEL_SPACE,
                b'\t' => WS_SENTINEL_TAB,
                b'\n' => WS_SENTINEL_LF,
                b'\r' => WS_SENTINEL_CR,
                _ => unreachable!("guarded by is_xml_whitespace_byte"),
            };
            rewritten.extend_from_slice(sentinel.encode_utf8(&mut utf8).as_bytes());
        }
        // Resume at the closing tag; it is copied by the byte-wise branch above.
        cursor = text_end;
    }
    rewritten
}
pub(crate) fn restore_whitespace_sentinels(s: &str) -> String {
if !s.chars().any(is_ws_sentinel) {
return s.to_string();
}
let mut out = String::with_capacity(s.len());
for ch in s.chars() {
out.push(match ch {
WS_SENTINEL_SPACE => ' ',
WS_SENTINEL_TAB => '\t',
WS_SENTINEL_LF => '\n',
WS_SENTINEL_CR => '\r',
other => other,
});
}
out
}
/// Returns `true` when `b` is a space, tab, line feed or carriage return byte.
///
/// Takes `&u8` so it can be passed directly to `slice::iter().all(..)`.
#[inline]
fn is_xml_whitespace_byte(b: &u8) -> bool {
    [b' ', b'\t', b'\n', b'\r'].contains(b)
}
#[inline]
fn is_ws_sentinel(c: char) -> bool {
matches!(
c,
WS_SENTINEL_SPACE | WS_SENTINEL_TAB | WS_SENTINEL_LF | WS_SENTINEL_CR
)
}
/// Returns `true` when `needle` occurs as a contiguous run of bytes inside `haystack`.
///
/// An empty `needle` is considered to be contained in every haystack, including an
/// empty one.
fn contains_subslice(haystack: &[u8], needle: &[u8]) -> bool {
    needle.is_empty()
        || (0..haystack.len()).any(|offset| haystack[offset..].starts_with(needle))
}
/// Tries to recognise a `<w:t ...>text</w:t>` element starting exactly at byte `pos`.
///
/// On success returns `(content_start, content_end, close_tag_end)`:
/// * `content_start` — index just past the `>` that ends the start tag,
/// * `content_end` — index of the `<` that begins the closing `</w:t>`,
/// * `close_tag_end` — index just past the closing `</w:t>`.
///
/// Returns `None` when
/// * `pos` is out of range or the bytes at `pos` do not begin with `<w:t`,
/// * the element name merely starts with `w:t` (e.g. `<w:tbl>`, `<w:tab/>`),
/// * the element is self-closing (`<w:t/>`),
/// * the start tag is never terminated, or
/// * the first `<` after the start tag is anything other than `</w:t>` (for example a
///   nested element), or there is no further `<` at all.
fn match_w_t_with_text(xml: &[u8], pos: usize) -> Option<(usize, usize, usize)> {
    const OPEN_PREFIX: &[u8] = b"<w:t";
    const CLOSE_TAG: &[u8] = b"</w:t>";

    // `get` rather than indexing: a `pos` past the end means "no match", not a panic.
    if !xml.get(pos..)?.starts_with(OPEN_PREFIX) {
        return None;
    }

    // The byte after `<w:t` must terminate the element name; otherwise this is a longer
    // name such as `w:tbl` or `w:tab`.
    let after_name = pos + OPEN_PREFIX.len();
    if !matches!(
        *xml.get(after_name)?,
        b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r'
    ) {
        return None;
    }

    // Find the end of the start tag, ignoring any `>` or `/>` that sits inside a quoted
    // attribute value.
    let mut quote: Option<u8> = None;
    let mut start_tag_end = None;
    for (idx, &byte) in xml.iter().enumerate().skip(after_name) {
        match quote {
            Some(open) => {
                if byte == open {
                    quote = None;
                }
            }
            None => match byte {
                b'>' => {
                    start_tag_end = Some(idx + 1);
                    break;
                }
                // Self-closing element: there is no text content to report.
                b'/' if xml.get(idx + 1) == Some(&b'>') => return None,
                b'"' | b'\'' => quote = Some(byte),
                _ => {}
            },
        }
    }
    // Ran off the end of the buffer without closing the start tag.
    let start_tag_end = start_tag_end?;

    // The text runs up to the next `<`, which must be the element's own closing tag.
    let text_len = xml[start_tag_end..].iter().position(|&b| b == b'<')?;
    let content_end = start_tag_end + text_len;
    if !xml[content_end..].starts_with(CLOSE_TAG) {
        return None;
    }
    Some((start_tag_end, content_end, content_end + CLOSE_TAG.len()))
}
/// Unit tests for the whitespace-sentinel substitution and its inverse.
#[cfg(test)]
mod tests {
    use super::*;

    /// Decodes preprocessor output as UTF-8 so it can be compared against `String`s.
    fn s(b: Vec<u8>) -> String {
        String::from_utf8(b).unwrap()
    }

    #[test]
    fn single_space_preserve_is_substituted() {
        let xml = br#"<w:r><w:t xml:space="preserve"> </w:t></w:r>"#;
        let out = s(substitute_whitespace_only_runs(xml));
        assert_eq!(
            out,
            format!(
                r#"<w:r><w:t xml:space="preserve">{}</w:t></w:r>"#,
                WS_SENTINEL_SPACE
            )
        );
    }

    #[test]
    fn three_spaces_preserve_each_substituted() {
        // Build the three-space payload explicitly so the count cannot be lost to
        // whitespace normalisation of this source file.
        let xml = format!(r#"<w:t xml:space="preserve">{}</w:t>"#, " ".repeat(3));
        let out = s(substitute_whitespace_only_runs(xml.as_bytes()));
        let expected = format!(
            r#"<w:t xml:space="preserve">{0}{0}{0}</w:t>"#,
            WS_SENTINEL_SPACE
        );
        assert_eq!(out, expected);
    }

    #[test]
    fn whitespace_only_without_preserve_is_also_substituted() {
        let xml = br#"<w:t> </w:t>"#;
        let out = s(substitute_whitespace_only_runs(xml));
        assert_eq!(out, format!("<w:t>{}</w:t>", WS_SENTINEL_SPACE));
    }

    #[test]
    fn tab_only_is_substituted() {
        let xml = b"<w:t xml:space=\"preserve\">\t</w:t>";
        let out = s(substitute_whitespace_only_runs(xml));
        assert_eq!(
            out,
            format!("<w:t xml:space=\"preserve\">{}</w:t>", WS_SENTINEL_TAB)
        );
    }

    #[test]
    fn lf_and_cr_substituted() {
        let xml = b"<w:t xml:space=\"preserve\">\r\n</w:t>";
        let out = s(substitute_whitespace_only_runs(xml));
        assert_eq!(
            out,
            format!(
                "<w:t xml:space=\"preserve\">{}{}</w:t>",
                WS_SENTINEL_CR, WS_SENTINEL_LF
            )
        );
    }

    #[test]
    fn mixed_content_is_left_untouched() {
        let xml = br#"<w:t xml:space="preserve">hello world </w:t>"#;
        let out = s(substitute_whitespace_only_runs(xml));
        assert_eq!(out, std::str::from_utf8(xml).unwrap());
    }

    #[test]
    fn empty_w_t_is_left_untouched() {
        let xml = br#"<w:t></w:t>"#;
        let out = s(substitute_whitespace_only_runs(xml));
        assert_eq!(out, std::str::from_utf8(xml).unwrap());
    }

    #[test]
    fn self_closing_w_t_is_left_untouched() {
        let xml = br#"<w:t xml:space="preserve"/>"#;
        let out = s(substitute_whitespace_only_runs(xml));
        assert_eq!(out, std::str::from_utf8(xml).unwrap());
    }

    #[test]
    fn longer_element_names_are_not_matched() {
        // `w:tbl`, `w:tr`, `w:tc` and `w:tab` all start with `w:t` but are different elements.
        let xml = br#"<w:tbl><w:tr><w:tc><w:tab/></w:tc></w:tr></w:tbl>"#;
        let out = s(substitute_whitespace_only_runs(xml));
        assert_eq!(out, std::str::from_utf8(xml).unwrap());
    }

    #[test]
    fn multiple_runs_in_one_buffer() {
        let xml = br#"<w:p><w:r><w:t>A</w:t></w:r><w:r><w:t xml:space="preserve"> </w:t></w:r><w:r><w:t>B</w:t></w:r></w:p>"#;
        let out = s(substitute_whitespace_only_runs(xml));
        let expected = format!(
            r#"<w:p><w:r><w:t>A</w:t></w:r><w:r><w:t xml:space="preserve">{}</w:t></w:r><w:r><w:t>B</w:t></w:r></w:p>"#,
            WS_SENTINEL_SPACE
        );
        assert_eq!(out, expected);
    }

    #[test]
    fn buffer_without_w_t_is_unchanged_via_fast_path() {
        let xml = br#"<a:theme><a:clrScheme/></a:theme>"#;
        let out = substitute_whitespace_only_runs(xml);
        assert_eq!(&out[..], &xml[..]);
    }

    #[test]
    fn empty_buffer_is_handled() {
        let xml: &[u8] = b"";
        let out = substitute_whitespace_only_runs(xml);
        assert!(out.is_empty());
    }

    #[test]
    fn attribute_with_quoted_gt_is_handled() {
        // A `>` inside a quoted attribute value must not be taken as the end of the start tag.
        let xml = br#"<w:t weird="a>b" xml:space="preserve"> </w:t>"#;
        let out = s(substitute_whitespace_only_runs(xml));
        assert_eq!(
            out,
            format!(
                r#"<w:t weird="a>b" xml:space="preserve">{}</w:t>"#,
                WS_SENTINEL_SPACE
            )
        );
    }

    #[test]
    fn restore_replaces_each_sentinel_with_original_byte() {
        let input = format!(
            "{}{}{}{}",
            WS_SENTINEL_SPACE, WS_SENTINEL_TAB, WS_SENTINEL_LF, WS_SENTINEL_CR
        );
        assert_eq!(restore_whitespace_sentinels(&input), " \t\n\r");
    }

    #[test]
    fn restore_passes_normal_text_through() {
        let input = "hello world";
        assert_eq!(restore_whitespace_sentinels(input), "hello world");
    }

    #[test]
    fn restore_handles_mixed_text_and_sentinels() {
        let input = format!("a{}b", WS_SENTINEL_SPACE);
        assert_eq!(restore_whitespace_sentinels(&input), "a b");
    }

    #[test]
    fn round_trip_through_quick_xml_recovers_original_whitespace() {
        use serde::Deserialize;

        #[derive(Deserialize)]
        struct TextXml {
            #[serde(rename = "$text", default)]
            content: String,
        }

        #[derive(Deserialize)]
        struct R {
            #[serde(rename = "t")]
            t: TextXml,
        }

        // Substitute, parse with quick-xml, then restore: the lone space must survive.
        let original = br#"<r><w:t xml:space="preserve"> </w:t></r>"#;
        let preprocessed = substitute_whitespace_only_runs(original);
        let parsed: R = quick_xml::de::from_str(std::str::from_utf8(&preprocessed).unwrap())
            .expect("quick-xml parse");
        let restored = restore_whitespace_sentinels(&parsed.t.content);
        assert_eq!(restored, " ", "expected single literal space to survive");
    }
}