#![allow(dead_code)]
/// The UTF-8 byte order mark: the three bytes `EF BB BF`.
pub(crate) const BOM: [u8; 3] = *b"\xEF\xBB\xBF";
#[inline]
#[must_use]
pub(crate) fn strip_bom(bytes: &[u8]) -> usize {
if bytes.starts_with(&BOM) { 3 } else { 0 }
}
/// Reports whether a `---` document marker starts at byte offset `i` of `bytes`.
///
/// A marker is recognised when all of the following hold:
/// - the three bytes at `i..i + 3` are `---`;
/// - `i` is `0`, or the byte just before `i` is `\n` or `\r`;
/// - the marker is followed by end of input, `\n`, `\r`, a space, or a tab.
///
/// Returns `false` for any `i` that does not leave room for three bytes. That
/// includes offsets past the end of `bytes` and offsets so large that `i + 3`
/// would not fit in a `usize`; such offsets never panic.
#[inline]
#[must_use]
pub(crate) fn is_doc_marker_at(bytes: &[u8], i: usize) -> bool {
    // `get` rejects out-of-range offsets without ever computing `i + 3`, which
    // would overflow for `i` within 3 of `usize::MAX`.
    let rest = match bytes.get(i..) {
        Some(rest) if rest.starts_with(b"---") => rest,
        _ => return false,
    };

    // `i <= bytes.len()` is established above, so this prefix slice cannot panic.
    // An empty prefix means the marker opens the input.
    let at_line_start = match bytes[..i].last() {
        None => true,
        Some(&prev) => matches!(prev, b'\n' | b'\r'),
    };
    if !at_line_start {
        return false;
    }

    // The marker must end the input or be followed by a line break or blank,
    // so longer dash runs and `---x` are not mistaken for markers.
    match rest.get(3) {
        None => true,
        Some(&next) => matches!(next, b'\n' | b'\r' | b' ' | b'\t'),
    }
}
/// Collects the byte offsets of `---` document markers in `bytes`, in ascending order.
///
/// At most `max_markers` offsets are returned. Scanning stops as soon as the cap
/// is reached, and a cap of `0` yields an empty vector.
#[must_use]
pub(crate) fn scan_markers(bytes: &[u8], max_markers: usize) -> Vec<usize> {
    // Offsets below `len - 2` are exactly those with three bytes of room left;
    // the saturating subtraction makes the range empty for shorter inputs.
    let candidates = 0..bytes.len().saturating_sub(2);

    // Every candidate offset is tested. A marker must follow a line break (or
    // open the input), so the two offsets right after a hit sit behind a `-`
    // and can never match; no explicit skip is needed.
    candidates
        .filter(|&offset| is_doc_marker_at(bytes, offset))
        .take(max_markers)
        .collect()
}
/// Finds the first `---` document marker at an offset of at least `start`,
/// never reporting offset `0`.
///
/// `start` is clamped to `1`, so a marker that opens the input is skipped even
/// when `start` is `0`. A marker located exactly at a non-zero `start` is
/// returned. Returns `None` when no such marker exists, including when `start`
/// lies beyond the end of `bytes`.
#[must_use]
pub(crate) fn next_marker_after(bytes: &[u8], start: usize) -> Option<usize> {
    let first = start.max(1);
    // Offsets below `len - 2` are the ones with three bytes of room left.
    // Bounding the range this way avoids evaluating `start + 3`, which would
    // overflow for `start` within 3 of `usize::MAX`; an out-of-range `start`
    // simply produces an empty range.
    let end = bytes.len().saturating_sub(2);
    (first..end).find(|&offset| is_doc_marker_at(bytes, offset))
}
/// Splits `input` into document slices at the `---` markers found by [`scan_markers`].
///
/// - With no markers, the whole input is returned as a single document, or
///   nothing at all when the input is only whitespace.
/// - Text before the first marker becomes its own document unless it is only
///   whitespace.
/// - Each marker starts a document that runs up to the next marker, or to the
///   end of the input for the last one. The marker stays at the start of its slice.
///
/// `max_markers` caps how many markers are honoured; anything after the last
/// honoured marker remains part of the final document.
#[must_use]
pub(crate) fn split_documents(input: &str, max_markers: usize) -> Vec<&str> {
    let markers = scan_markers(input.as_bytes(), max_markers);
    let first = match markers.first() {
        Some(&first) => first,
        None if input.trim().is_empty() => return Vec::new(),
        None => return vec![input],
    };

    let mut docs = Vec::with_capacity(markers.len() + 1);

    // Keep a preamble only when it carries something other than whitespace.
    let preamble = &input[..first];
    if !preamble.trim().is_empty() {
        docs.push(preamble);
    }

    // Each marker's document ends where the next marker begins; the last one
    // extends to the end of the input. Every such slice starts with `---`, so
    // none of them is ever empty.
    for (idx, &begin) in markers.iter().enumerate() {
        let end = markers.get(idx + 1).copied().unwrap_or(input.len());
        docs.push(&input[begin..end]);
    }
    docs
}
#[cfg(test)]
mod tests {
    use super::*;

    // `strip_bom` reports the BOM length only for a complete leading mark.
    #[test]
    fn bom_strip_round_trip() {
        assert_eq!(strip_bom(b""), 0);
        assert_eq!(strip_bom(b"a: 1\n"), 0);
        assert_eq!(strip_bom(b"\xEF\xBB\xBFa: 1\n"), 3);
        // A truncated mark is not a BOM; a bare mark with no payload is.
        assert_eq!(strip_bom(b"\xEF\xBB"), 0);
        assert_eq!(strip_bom(b"\xEF\xBB\xBF"), 3);
    }

    #[test]
    fn lf_terminated_markers() {
        let m = scan_markers(b"---\na: 1\n---\nb: 2\n", 16);
        assert_eq!(m, vec![0, 9]);
    }

    #[test]
    fn crlf_terminated_markers_are_recognised() {
        let m = scan_markers(b"---\r\na: 1\r\n---\r\nb: 2\r\n", 16);
        assert_eq!(m, vec![0, 11]);
    }

    // A space or a tab directly after the dashes still counts as a marker.
    #[test]
    fn markers_followed_by_blank_are_recognised() {
        let m = scan_markers(b"--- a: 1\n---\tb: 2\n", 16);
        assert_eq!(m, vec![0, 9]);
    }

    #[test]
    fn mid_line_dashes_are_not_markers() {
        let m = scan_markers(b"a: ---\nb: 2\n", 16);
        assert!(m.is_empty());
    }

    // Four dashes, or dashes glued to other text, must not be split on.
    #[test]
    fn longer_dash_runs_are_not_markers() {
        assert!(scan_markers(b"----\na: 1\n", 16).is_empty());
        assert!(scan_markers(b"---a\n", 16).is_empty());
    }

    #[test]
    fn marker_at_eof_is_recognised() {
        let m = scan_markers(b"a: 1\n---", 16);
        assert_eq!(m, vec![5]);
    }

    // The cap keeps the *first* markers, so pin the offsets and not only the count.
    #[test]
    fn marker_cap_truncates() {
        let input = b"---\n---\n---\n---\n---\n";
        let m = scan_markers(input, 2);
        assert_eq!(m, vec![0, 4]);
    }

    // Offset 0 is never reported, even when the search starts there.
    #[test]
    fn next_marker_skips_leading() {
        assert_eq!(next_marker_after(b"---\na: 1\n", 0), None);
        assert_eq!(next_marker_after(b"---\na: 1\n---\nb: 2\n", 0), Some(9));
    }

    #[test]
    fn marker_cap_zero_yields_empty() {
        assert!(scan_markers(b"---\n---\n", 0).is_empty());
    }
}