use std::borrow::Cow;
use std::collections::HashMap;
/// Parser for IRI references written as `<...>`.
///
/// Content that needs no unescaping is handed back as a borrowed slice of
/// the input; content that does is decoded into an owned `String`, and the
/// result is remembered so the same raw bytes are not decoded twice.
#[derive(Debug, Clone)]
pub struct ZeroCopyIriParser {
/// Maps the raw bytes found between `<` and `>` to their decoded string.
/// Entries are only removed by `clear_cache`.
decode_cache: HashMap<Vec<u8>, String>,
}
impl ZeroCopyIriParser {
    /// Creates a parser whose decode cache is pre-sized for 256 entries.
    pub fn new() -> Self {
        Self::with_capacity(256)
    }

    /// Creates a parser whose decode cache is pre-sized for `capacity` entries.
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            decode_cache: HashMap::with_capacity(capacity),
        }
    }

    /// Parses an IRI reference of the form `<...>` and returns its content.
    ///
    /// When the content contains neither `%` nor `\`, the result borrows
    /// directly from `input` (no allocation). Otherwise the content is
    /// decoded into an owned `String`; decoded results are cached by their
    /// raw bytes, so repeated IRIs are decoded only once. The cache grows
    /// until [`clear_cache`](Self::clear_cache) is called.
    ///
    /// # Errors
    ///
    /// * [`ParseError::InvalidIriFormat`] if `input` is shorter than two
    ///   bytes or is not wrapped in `<` and `>`.
    /// * [`ParseError::InvalidUtf8`] if the (decoded) content is not valid
    ///   UTF-8.
    /// * [`ParseError::InvalidEscape`] if a `%` or `\` escape is truncated.
    /// * [`ParseError::InvalidHexDigit`] if a `%` escape contains a
    ///   non-hexadecimal character.
    pub fn parse_iri_ref<'a>(&mut self, input: &'a [u8]) -> Result<Cow<'a, str>, ParseError> {
        let content = match input {
            [b'<', inner @ .., b'>'] => inner,
            _ => return Err(ParseError::InvalidIriFormat),
        };

        // Fast path: nothing to unescape, so hand back a view of the input.
        if !Self::needs_decoding(content) {
            return std::str::from_utf8(content)
                .map(Cow::Borrowed)
                .map_err(|_| ParseError::InvalidUtf8);
        }

        if let Some(cached) = self.decode_cache.get(content) {
            return Ok(Cow::Owned(cached.clone()));
        }

        // Only successful decodes are cached; failures are recomputed.
        let decoded = Self::decode_iri(content)?;
        self.decode_cache.insert(content.to_vec(), decoded.clone());
        Ok(Cow::Owned(decoded))
    }

    /// Returns `true` when `content` contains a `%` or `\` escape introducer.
    #[inline]
    fn needs_decoding(content: &[u8]) -> bool {
        content.iter().any(|&b| b == b'%' || b == b'\\')
    }

    /// Resolves `%XX` and backslash escapes in `content`.
    ///
    /// `\t`, `\n` and `\r` become the matching control characters; any other
    /// byte following a backslash is taken literally. The output is assembled
    /// as raw bytes and validated as UTF-8 once at the end, so multi-byte
    /// sequences (written directly or percent-encoded, e.g. `%C3%A9`) are
    /// preserved instead of being widened byte-by-byte into Latin-1
    /// characters.
    fn decode_iri(content: &[u8]) -> Result<String, ParseError> {
        // Every escape shrinks the data, so the input length is an upper bound.
        let mut bytes = Vec::with_capacity(content.len());
        let mut i = 0;
        while i < content.len() {
            match content[i] {
                b'%' => {
                    let hex = content
                        .get(i + 1..i + 3)
                        .ok_or(ParseError::InvalidEscape)?;
                    bytes.push(Self::decode_hex_byte(hex)?);
                    i += 3;
                }
                b'\\' => {
                    let escaped = *content.get(i + 1).ok_or(ParseError::InvalidEscape)?;
                    bytes.push(match escaped {
                        b't' => b'\t',
                        b'n' => b'\n',
                        b'r' => b'\r',
                        verbatim => verbatim,
                    });
                    i += 2;
                }
                plain => {
                    bytes.push(plain);
                    i += 1;
                }
            }
        }
        String::from_utf8(bytes).map_err(|_| ParseError::InvalidUtf8)
    }

    /// Converts exactly two ASCII hex digits into the byte they encode.
    #[inline]
    fn decode_hex_byte(hex: &[u8]) -> Result<u8, ParseError> {
        match hex {
            [high, low] => Ok((Self::hex_digit(*high)? << 4) | Self::hex_digit(*low)?),
            _ => Err(ParseError::InvalidEscape),
        }
    }

    /// Returns the numeric value (0-15) of one ASCII hex digit, either case.
    #[inline]
    fn hex_digit(c: u8) -> Result<u8, ParseError> {
        match c {
            b'0'..=b'9' => Ok(c - b'0'),
            b'a'..=b'f' => Ok(c - b'a' + 10),
            b'A'..=b'F' => Ok(c - b'A' + 10),
            _ => Err(ParseError::InvalidHexDigit),
        }
    }

    /// Removes every cached decode result; allocated capacity is kept.
    pub fn clear_cache(&mut self) {
        self.decode_cache.clear();
    }

    /// Returns the number of decoded IRIs currently cached.
    pub fn cache_size(&self) -> usize {
        self.decode_cache.len()
    }

    /// Releases unused cache capacity back to the allocator.
    pub fn shrink_cache(&mut self) {
        self.decode_cache.shrink_to_fit();
    }
}
impl Default for ZeroCopyIriParser {
fn default() -> Self {
Self::new()
}
}
/// Errors reported by the IRI and string-literal parsers in this file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ParseError {
/// The input is too short or is not wrapped in the expected delimiters
/// (`<`/`>` for IRIs; the literal parser also reports quote problems with
/// this variant).
InvalidIriFormat,
/// The content is not valid UTF-8.
InvalidUtf8,
/// An escape sequence is truncated, or a `\u`/`\U` escape does not name a
/// valid Unicode scalar value.
InvalidEscape,
/// A hexadecimal escape contains a character outside `0-9`, `a-f`, `A-F`.
InvalidHexDigit,
}
/// Human-readable, fixed message for each error variant.
impl std::fmt::Display for ParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Pick the static text first, then emit it in a single write.
        let message = match self {
            ParseError::InvalidIriFormat => "Invalid IRI format",
            ParseError::InvalidUtf8 => "Invalid UTF-8 encoding",
            ParseError::InvalidEscape => "Invalid escape sequence",
            ParseError::InvalidHexDigit => "Invalid hexadecimal digit",
        };
        f.write_str(message)
    }
}
// Empty impl: relies on the trait's default methods, with the `Debug` derive
// and the `Display` impl above satisfying the supertrait bounds.
impl std::error::Error for ParseError {}
/// Stateless parser for quoted string literals (`"..."` or `'...'`).
///
/// Literals without backslash escapes are returned as borrowed slices of the
/// input; literals with escapes are decoded into an owned `String`.
#[derive(Debug, Clone)]
pub struct ZeroCopyLiteralParser;
impl ZeroCopyLiteralParser {
    /// Creates a literal parser. The parser holds no state.
    pub fn new() -> Self {
        Self
    }

    /// Parses a string literal delimited by matching `"` or `'` quotes.
    ///
    /// When the content contains no backslash, the result borrows directly
    /// from `input`. Otherwise escapes are resolved into an owned `String`:
    /// `\t`, `\n`, `\r`, `\uXXXX` and `\UXXXXXXXX` are decoded, and any other
    /// escaped byte (including `\\`, `\"`, `\'`) is taken literally.
    ///
    /// # Errors
    ///
    /// * [`ParseError::InvalidIriFormat`] if `input` is shorter than two
    ///   bytes, does not start with `"` or `'`, or does not end with the
    ///   same quote it starts with.
    /// * [`ParseError::InvalidUtf8`] if the (decoded) content is not valid
    ///   UTF-8.
    /// * [`ParseError::InvalidEscape`] if an escape is truncated or a
    ///   `\u`/`\U` escape does not name a valid Unicode scalar value.
    /// * [`ParseError::InvalidHexDigit`] if a `\u`/`\U` escape contains a
    ///   non-hexadecimal character.
    pub fn parse_string_literal<'a>(&self, input: &'a [u8]) -> Result<Cow<'a, str>, ParseError> {
        let content = match input {
            [first, inner @ .., last] if first == last && matches!(*first, b'"' | b'\'') => inner,
            _ => return Err(ParseError::InvalidIriFormat),
        };

        if content.contains(&b'\\') {
            Self::decode_string(content).map(Cow::Owned)
        } else {
            std::str::from_utf8(content)
                .map(Cow::Borrowed)
                .map_err(|_| ParseError::InvalidUtf8)
        }
    }

    /// Resolves backslash escapes in `content`.
    ///
    /// The output is assembled as raw bytes and validated as UTF-8 once at
    /// the end, so multi-byte characters in the input are preserved instead
    /// of being widened byte-by-byte into Latin-1 characters.
    fn decode_string(content: &[u8]) -> Result<String, ParseError> {
        // Every escape is at least as long as its UTF-8 encoding, so the
        // input length is an upper bound for the output.
        let mut bytes = Vec::with_capacity(content.len());
        let mut i = 0;
        while i < content.len() {
            let current = content[i];
            if current != b'\\' {
                bytes.push(current);
                i += 1;
                continue;
            }

            let marker = *content.get(i + 1).ok_or(ParseError::InvalidEscape)?;
            match marker {
                b'u' => {
                    let hex = content
                        .get(i + 2..i + 6)
                        .ok_or(ParseError::InvalidEscape)?;
                    Self::push_char(&mut bytes, Self::decode_unicode_4(hex)?);
                    i += 6;
                }
                b'U' => {
                    let hex = content
                        .get(i + 2..i + 10)
                        .ok_or(ParseError::InvalidEscape)?;
                    Self::push_char(&mut bytes, Self::decode_unicode_8(hex)?);
                    i += 10;
                }
                other => {
                    bytes.push(match other {
                        b't' => b'\t',
                        b'n' => b'\n',
                        b'r' => b'\r',
                        // `\\`, `\"`, `\'` and unknown escapes: keep the byte.
                        verbatim => verbatim,
                    });
                    i += 2;
                }
            }
        }
        String::from_utf8(bytes).map_err(|_| ParseError::InvalidUtf8)
    }

    /// Appends the UTF-8 encoding of `ch` to `out`.
    fn push_char(out: &mut Vec<u8>, ch: char) {
        let mut buf = [0u8; 4];
        out.extend_from_slice(ch.encode_utf8(&mut buf).as_bytes());
    }

    /// Decodes the four hex digits of a `\uXXXX` escape into a `char`.
    fn decode_unicode_4(hex: &[u8]) -> Result<char, ParseError> {
        Self::decode_code_point(hex, 4)
    }

    /// Decodes the eight hex digits of a `\UXXXXXXXX` escape into a `char`.
    fn decode_unicode_8(hex: &[u8]) -> Result<char, ParseError> {
        Self::decode_code_point(hex, 8)
    }

    /// Shared implementation of the `\u` / `\U` decoders: `hex` must hold
    /// exactly `digits` hex characters naming a Unicode scalar value
    /// (surrogates and values above U+10FFFF are rejected).
    fn decode_code_point(hex: &[u8], digits: usize) -> Result<char, ParseError> {
        if hex.len() != digits {
            return Err(ParseError::InvalidEscape);
        }
        let mut value = 0u32;
        for &byte in hex {
            value = (value << 4) | u32::from(ZeroCopyIriParser::hex_digit(byte)?);
        }
        char::from_u32(value).ok_or(ParseError::InvalidEscape)
    }
}
impl Default for ZeroCopyLiteralParser {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An IRI without `%` or `\` is returned as a borrowed slice and leaves
    /// the decode cache untouched.
    #[test]
    fn test_simple_iri_no_allocation() {
        let mut parser = ZeroCopyIriParser::new();
        let iri = parser
            .parse_iri_ref(b"<http://example.org/>")
            .expect("valid IRI");
        assert!(matches!(iri, Cow::Borrowed(_)));
        assert_eq!(iri, "http://example.org/");
        assert_eq!(parser.cache_size(), 0);

        // The empty IRI `<>` is the smallest well-formed input.
        let empty = parser.parse_iri_ref(b"<>").expect("valid IRI");
        assert!(matches!(empty, Cow::Borrowed(_)));
        assert_eq!(empty, "");
    }

    /// A percent escape forces the owned (decoded) path.
    #[test]
    fn test_iri_with_escape_allocates() {
        let mut parser = ZeroCopyIriParser::new();
        let iri = parser
            .parse_iri_ref(b"<http://example.org/sp%20ace>")
            .expect("valid IRI");
        assert!(matches!(iri, Cow::Owned(_)));
        assert_eq!(iri, "http://example.org/sp ace");
    }

    /// Decoding the same IRI twice yields one cache entry and equal results.
    #[test]
    fn test_iri_cache() {
        let mut parser = ZeroCopyIriParser::new();
        let first = parser
            .parse_iri_ref(b"<http://example.org/sp%20ace>")
            .expect("valid IRI");
        let second = parser
            .parse_iri_ref(b"<http://example.org/sp%20ace>")
            .expect("valid IRI");
        assert_eq!(first, second);
        assert_eq!(parser.cache_size(), 1);
    }

    /// Missing or unbalanced angle brackets are rejected, including inputs
    /// too short to hold both delimiters.
    #[test]
    fn test_invalid_iri_format() {
        let mut parser = ZeroCopyIriParser::new();
        assert!(parser.parse_iri_ref(b"<http://example.org/").is_err());
        assert!(parser.parse_iri_ref(b"http://example.org/>").is_err());
        assert!(parser.parse_iri_ref(b"http://example.org/").is_err());
        assert!(parser.parse_iri_ref(b"").is_err());
        assert!(parser.parse_iri_ref(b"<").is_err());
    }

    /// A literal without escapes is returned as a borrowed slice.
    #[test]
    fn test_string_literal_no_allocation() {
        let parser = ZeroCopyLiteralParser::new();
        let literal = parser
            .parse_string_literal(b"\"hello world\"")
            .expect("parsing should succeed");
        assert!(matches!(literal, Cow::Borrowed(_)));
        assert_eq!(literal, "hello world");
    }

    /// A backslash escape forces the owned (decoded) path.
    #[test]
    fn test_string_literal_with_escapes() {
        let parser = ZeroCopyLiteralParser::new();
        let literal = parser
            .parse_string_literal(b"\"hello\\nworld\"")
            .expect("parsing should succeed");
        assert!(matches!(literal, Cow::Owned(_)));
        assert_eq!(literal, "hello\nworld");
    }

    /// Both the 4-digit `\u` and 8-digit `\U` forms decode to the same char.
    #[test]
    fn test_string_literal_unicode_escape() {
        let parser = ZeroCopyLiteralParser::new();
        let literal = parser
            .parse_string_literal(b"\"\\u0041BC\"")
            .expect("parsing should succeed");
        assert_eq!(literal, "ABC");
        let literal = parser
            .parse_string_literal(b"\"\\U00000041BC\"")
            .expect("parsing should succeed");
        assert_eq!(literal, "ABC");
    }

    /// Either quote style is accepted, but the two ends must match.
    #[test]
    fn test_string_literal_mixed_quotes() {
        let parser = ZeroCopyLiteralParser::new();
        let literal = parser
            .parse_string_literal(b"\"test\"")
            .expect("parsing should succeed");
        assert_eq!(literal, "test");
        let literal = parser
            .parse_string_literal(b"'test'")
            .expect("parsing should succeed");
        assert_eq!(literal, "test");
        assert!(parser.parse_string_literal(b"\"test'").is_err());
        assert!(parser.parse_string_literal(b"'test\"").is_err());
    }

    /// Hex digits decode case-insensitively; anything else is an error.
    #[test]
    fn test_hex_digit_decoding() {
        assert_eq!(ZeroCopyIriParser::hex_digit(b'0').expect("valid hex digit"), 0);
        assert_eq!(ZeroCopyIriParser::hex_digit(b'9').expect("valid hex digit"), 9);
        assert_eq!(ZeroCopyIriParser::hex_digit(b'a').expect("valid hex digit"), 10);
        assert_eq!(ZeroCopyIriParser::hex_digit(b'f').expect("valid hex digit"), 15);
        assert_eq!(ZeroCopyIriParser::hex_digit(b'A').expect("valid hex digit"), 10);
        assert_eq!(ZeroCopyIriParser::hex_digit(b'F').expect("valid hex digit"), 15);
        assert!(ZeroCopyIriParser::hex_digit(b'g').is_err());
        assert!(ZeroCopyIriParser::hex_digit(b'Z').is_err());
    }

    /// Single and consecutive percent escapes decode to their ASCII bytes.
    #[test]
    fn test_percent_encoding() {
        let mut parser = ZeroCopyIriParser::new();
        let iri = parser
            .parse_iri_ref(b"<http://example.org/%20>")
            .expect("valid IRI");
        assert_eq!(iri, "http://example.org/ ");
        let iri = parser
            .parse_iri_ref(b"<http://example.org/%20%21%22>")
            .expect("valid IRI");
        assert_eq!(iri, "http://example.org/ !\"");
    }

    /// `clear_cache` drops every cached entry.
    #[test]
    fn test_clear_cache() {
        let mut parser = ZeroCopyIriParser::new();
        parser
            .parse_iri_ref(b"<http://example.org/sp%20ace>")
            .expect("valid IRI");
        assert_eq!(parser.cache_size(), 1);
        parser.clear_cache();
        assert_eq!(parser.cache_size(), 0);
    }
}