use std::sync::Arc;
use wp_model_core::raw::RawData;
use wp_parse_api::{PipeProcessor, WparseResult};
/// Stateless pipe processor that strips byte-order-mark (BOM) signatures
/// from raw data.
///
/// The UTF-8, UTF-16 LE/BE and UTF-32 LE/BE signatures are recognised, and
/// every occurrence is removed — not only one at the start of the payload.
/// The processor reports itself under the name `"strip/bom"`.
#[derive(Debug)]
pub struct BomClearProc;
/// Checks whether a byte-order mark begins at offset `pos` of `data`.
///
/// Returns the length in bytes of the BOM found at `pos` (3 for UTF-8,
/// 4 for UTF-32 LE/BE, 2 for UTF-16 LE/BE), or `None` when no BOM starts
/// there. An offset past the end of `data` yields `None` instead of
/// panicking.
fn detect_bom_at(data: &[u8], pos: usize) -> Option<usize> {
    // Order matters: the UTF-32 LE signature starts with the UTF-16 LE one,
    // so the longer pattern has to be tried first.
    const BOMS: [&[u8]; 5] = [
        &[0xEF, 0xBB, 0xBF],       // UTF-8
        &[0xFF, 0xFE, 0x00, 0x00], // UTF-32 LE
        &[0x00, 0x00, 0xFE, 0xFF], // UTF-32 BE
        &[0xFF, 0xFE],             // UTF-16 LE
        &[0xFE, 0xFF],             // UTF-16 BE
    ];

    // `get` keeps an out-of-range `pos` from panicking.
    let remaining = data.get(pos..)?;
    BOMS.iter()
        .find(|&&bom| remaining.starts_with(bom))
        .map(|bom| bom.len())
}
/// Removes every byte-order mark found anywhere in `data`.
///
/// Returns `Some(bytes)` with all BOM signatures stripped when at least one
/// was found (the vector may be empty if the input consisted only of BOMs),
/// or `None` when `data` contains no BOM, so the caller can keep the
/// original buffer untouched.
fn remove_all_boms(data: &[u8]) -> Option<Vec<u8>> {
    // Allocated lazily on the first BOM hit: BOM-free input (the common
    // case) is only scanned, never copied.
    let mut cleaned: Option<Vec<u8>> = None;
    // Start of the current run of non-BOM bytes not yet copied to `cleaned`.
    let mut run_start = 0;
    let mut pos = 0;

    while pos < data.len() {
        match detect_bom_at(data, pos) {
            Some(bom_len) => {
                // The output can never exceed the input minus this BOM.
                let out = cleaned.get_or_insert_with(|| Vec::with_capacity(data.len() - bom_len));
                out.extend_from_slice(&data[run_start..pos]);
                pos += bom_len;
                run_start = pos;
            }
            None => pos += 1,
        }
    }

    // Flush the tail that follows the last BOM.
    if let Some(out) = cleaned.as_mut() {
        out.extend_from_slice(&data[run_start..]);
    }
    cleaned
}
impl PipeProcessor for BomClearProc {
    /// Strips all byte-order marks from `data`, keeping the container variant.
    ///
    /// * `RawData::String` — the cleaned bytes are turned back into a string
    ///   through a lossy UTF-8 conversion.
    /// * `RawData::Bytes` / `RawData::ArcBytes` — the cleaned bytes are wrapped
    ///   in the same variant they arrived in.
    ///
    /// When no BOM is present the input value is handed back as is. This
    /// method always returns `Ok`.
    fn process(&self, data: RawData) -> WparseResult<RawData> {
        let output = match data {
            RawData::String(text) => match remove_all_boms(text.as_bytes()) {
                Some(stripped) => {
                    RawData::from_string(String::from_utf8_lossy(&stripped).into_owned())
                }
                None => RawData::from_string(text),
            },
            RawData::Bytes(buf) => match remove_all_boms(&buf) {
                Some(stripped) => RawData::Bytes(stripped.into()),
                None => RawData::Bytes(buf),
            },
            RawData::ArcBytes(shared) => match remove_all_boms(&shared) {
                Some(stripped) => RawData::ArcBytes(Arc::new(stripped)),
                None => RawData::ArcBytes(shared),
            },
        };
        Ok(output)
    }

    /// Identifier of this processor.
    fn name(&self) -> &'static str {
        "strip/bom"
    }
}
#[cfg(test)]
mod tests {
    use bytes::Bytes;

    use super::*;
    use crate::types::AnyResult;

    const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
    const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE];
    const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];
    const UTF32_LE_BOM: &[u8] = &[0xFF, 0xFE, 0x00, 0x00];
    const UTF32_BE_BOM: &[u8] = &[0x00, 0x00, 0xFE, 0xFF];

    /// Looks for a BOM at the very start of `data`.
    fn detect_bom(data: &[u8]) -> Option<usize> {
        detect_bom_at(data, 0)
    }

    /// Glues byte fragments together into one input buffer.
    fn join(parts: &[&[u8]]) -> Vec<u8> {
        parts.concat()
    }

    /// Asserts that the UTF-8 rendering of `result` equals `expected`.
    #[track_caller]
    fn assert_text(result: &RawData, expected: &str) {
        assert_eq!(crate::eval::builtins::raw_to_utf8_string(result), expected);
    }

    // ---- detection -------------------------------------------------------

    #[test]
    fn test_detect_utf8_bom() {
        let data = join(&[UTF8_BOM, b"hello"]);
        assert_eq!(detect_bom(&data), Some(3));
    }

    #[test]
    fn test_detect_utf16_le_bom() {
        let data = join(&[UTF16_LE_BOM, b"hello"]);
        assert_eq!(detect_bom(&data), Some(2));
    }

    #[test]
    fn test_detect_utf16_be_bom() {
        let data = join(&[UTF16_BE_BOM, b"hello"]);
        assert_eq!(detect_bom(&data), Some(2));
    }

    #[test]
    fn test_detect_utf32_le_bom() {
        let data = join(&[UTF32_LE_BOM, b"hello"]);
        assert_eq!(detect_bom(&data), Some(4));
    }

    #[test]
    fn test_detect_utf32_be_bom() {
        let data = join(&[UTF32_BE_BOM, b"hello"]);
        assert_eq!(detect_bom(&data), Some(4));
    }

    #[test]
    fn test_detect_no_bom() {
        assert_eq!(detect_bom(b"hello world"), None);
    }

    #[test]
    fn test_detect_bom_too_short() {
        // Only the first two bytes of the UTF-8 signature.
        assert_eq!(detect_bom(&[0xEF, 0xBB]), None);
    }

    // ---- leading BOM, one test per container / encoding -------------------

    #[test]
    fn test_bom_clear_utf8_string() -> AnyResult<()> {
        let text = String::from_utf8(join(&[UTF8_BOM, b"hello"]))?;
        let result = BomClearProc.process(RawData::from_string(text))?;
        assert_text(&result, "hello");
        Ok(())
    }

    #[test]
    fn test_bom_clear_utf16_le_bytes() -> AnyResult<()> {
        let payload = Bytes::from(join(&[UTF16_LE_BOM, b"hello"]));
        let result = BomClearProc.process(RawData::Bytes(payload))?;
        assert!(matches!(result, RawData::Bytes(_)));
        assert_text(&result, "hello");
        Ok(())
    }

    #[test]
    fn test_bom_clear_utf16_be_bytes() -> AnyResult<()> {
        let payload = Bytes::from(join(&[UTF16_BE_BOM, b"world"]));
        let result = BomClearProc.process(RawData::Bytes(payload))?;
        assert!(matches!(result, RawData::Bytes(_)));
        assert_text(&result, "world");
        Ok(())
    }

    #[test]
    fn test_bom_clear_utf32_le_arc_bytes() -> AnyResult<()> {
        let payload = Arc::new(join(&[UTF32_LE_BOM, b"test"]));
        let result = BomClearProc.process(RawData::ArcBytes(payload))?;
        assert!(matches!(result, RawData::ArcBytes(_)));
        assert_text(&result, "test");
        Ok(())
    }

    #[test]
    fn test_bom_clear_utf32_be_arc_bytes() -> AnyResult<()> {
        let payload = Arc::new(join(&[UTF32_BE_BOM, b"data"]));
        let result = BomClearProc.process(RawData::ArcBytes(payload))?;
        assert!(matches!(result, RawData::ArcBytes(_)));
        assert_text(&result, "data");
        Ok(())
    }

    // ---- inputs without a BOM / degenerate inputs -------------------------

    #[test]
    fn test_bom_clear_no_bom_string() -> AnyResult<()> {
        let result = BomClearProc.process(RawData::from_string("hello world".to_string()))?;
        assert_text(&result, "hello world");
        Ok(())
    }

    #[test]
    fn test_bom_clear_no_bom_bytes() -> AnyResult<()> {
        let payload = Bytes::from_static(b"no bom here");
        let result = BomClearProc.process(RawData::Bytes(payload))?;
        assert!(matches!(result, RawData::Bytes(_)));
        assert_text(&result, "no bom here");
        Ok(())
    }

    #[test]
    fn test_bom_clear_empty_string() -> AnyResult<()> {
        let result = BomClearProc.process(RawData::from_string("".to_string()))?;
        assert_text(&result, "");
        Ok(())
    }

    #[test]
    fn test_bom_clear_only_bom() -> AnyResult<()> {
        let text = String::from_utf8(UTF8_BOM.to_vec())?;
        let result = BomClearProc.process(RawData::from_string(text))?;
        assert_text(&result, "");
        Ok(())
    }

    #[test]
    fn test_bom_clear_chinese_with_utf8_bom() -> AnyResult<()> {
        let text = String::from_utf8(join(&[UTF8_BOM, "你好世界".as_bytes()]))?;
        let result = BomClearProc.process(RawData::from_string(text))?;
        assert_text(&result, "你好世界");
        Ok(())
    }

    #[test]
    fn test_bom_clear_preserves_container_type() -> AnyResult<()> {
        let from_string = BomClearProc.process(RawData::from_string("\u{FEFF}test".to_string()))?;
        assert!(matches!(from_string, RawData::String(_)));

        let from_bytes = BomClearProc.process(RawData::Bytes(Bytes::from_static(&[
            0xEF, 0xBB, 0xBF, b't',
        ])))?;
        assert!(matches!(from_bytes, RawData::Bytes(_)));

        let from_arc =
            BomClearProc.process(RawData::ArcBytes(Arc::new(vec![0xEF, 0xBB, 0xBF, b't'])))?;
        assert!(matches!(from_arc, RawData::ArcBytes(_)));
        Ok(())
    }

    // ---- BOMs away from the start, repeated, or mixed ---------------------

    #[test]
    fn test_bom_in_middle_of_data() -> AnyResult<()> {
        let payload = Bytes::from(join(&[b"hello", UTF8_BOM, b"world"]));
        let result = BomClearProc.process(RawData::Bytes(payload))?;
        assert_text(&result, "helloworld");
        Ok(())
    }

    #[test]
    fn test_multiple_boms_in_data() -> AnyResult<()> {
        let raw = join(&[UTF8_BOM, b"start", UTF8_BOM, b"middle", UTF8_BOM, b"end"]);
        let result = BomClearProc.process(RawData::from_string(String::from_utf8(raw)?))?;
        assert_text(&result, "startmiddleend");
        Ok(())
    }

    #[test]
    fn test_mixed_bom_types() -> AnyResult<()> {
        let raw = join(&[
            UTF8_BOM,
            b"utf8",
            UTF16_LE_BOM,
            b"utf16",
            UTF16_BE_BOM,
            b"data",
        ]);
        let result = BomClearProc.process(RawData::Bytes(Bytes::from(raw)))?;
        assert_text(&result, "utf8utf16data");
        Ok(())
    }

    #[test]
    fn test_bom_at_end() -> AnyResult<()> {
        let payload = Bytes::from(join(&[b"data", UTF8_BOM]));
        let result = BomClearProc.process(RawData::Bytes(payload))?;
        assert_text(&result, "data");
        Ok(())
    }

    #[test]
    fn test_consecutive_boms() -> AnyResult<()> {
        let payload = Arc::new(join(&[UTF8_BOM, UTF8_BOM, UTF8_BOM, b"text"]));
        let result = BomClearProc.process(RawData::ArcBytes(payload))?;
        assert_text(&result, "text");
        Ok(())
    }

    #[test]
    fn test_bom_removal_with_chinese() -> AnyResult<()> {
        let raw = join(&[
            b"start",
            UTF8_BOM,
            "中文".as_bytes(),
            UTF8_BOM,
            "内容".as_bytes(),
        ]);
        let result = BomClearProc.process(RawData::from_string(String::from_utf8(raw)?))?;
        assert_text(&result, "start中文内容");
        Ok(())
    }
}