use super::AnnotationSyntaxError;
/// A lexical token produced by [`tokenize_annotation_block`].
///
/// All payloads are owned strings, so the type is `Eq` as well as
/// `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum AnnotationToken {
    /// The two-character opener `{|`.
    LBrace2,
    /// The two-character closer `|}`.
    RBrace2,
    /// An IRI reference; the payload is the text between `<` and `>`.
    NamedNode(String),
    /// A blank node; the payload is the label following the `_:` prefix.
    BlankNode(String),
    /// A string literal as `(value, language, datatype)`.
    ///
    /// `value` is the unescaped lexical form, `language` is the tag that
    /// followed `@` (if any) and `datatype` is the IRI that followed `^^`
    /// (if any). The tokenizer never sets both optional parts at once.
    Literal(String, Option<String>, Option<String>),
    /// The `.` separator.
    Dot,
    /// The `;` separator.
    Semicolon,
    /// The `,` separator.
    Comma,
}
/// Splits annotation-block text into a flat list of [`AnnotationToken`]s.
///
/// Whitespace is skipped and `#` starts a comment that runs through the next
/// newline. Recognised tokens are `{|`, `|}`, `<iri>`, `_:label`, string
/// literals (with an optional language tag or datatype) and the separators
/// `.`, `;` and `,`.
///
/// # Errors
///
/// Returns [`AnnotationSyntaxError::UnexpectedToken`] when a `{`, `|` or `_`
/// is not followed by its required partner character, when any other
/// unrecognised character is encountered, or when reading an IRI or a literal
/// fails. Positions in the messages are byte offsets into `input`.
pub fn tokenize_annotation_block(
    input: &str,
) -> Result<Vec<AnnotationToken>, AnnotationSyntaxError> {
    let mut out = Vec::new();
    let mut stream = input.char_indices().peekable();

    while let Some((pos, current)) = stream.next() {
        if current.is_whitespace() {
            continue;
        }

        // Every arm either yields exactly one token, skips (comment) or fails.
        let token = match current {
            '#' => {
                // Discard the comment, including its terminating newline.
                let _ = stream.by_ref().find(|&(_, skipped)| skipped == '\n');
                continue;
            }
            '{' => {
                if !matches!(stream.peek(), Some(&(_, '|'))) {
                    return Err(AnnotationSyntaxError::UnexpectedToken {
                        expected: "{| (annotation block open)".to_string(),
                        got: format!("'{{' at position {}", pos),
                    });
                }
                stream.next();
                AnnotationToken::LBrace2
            }
            '|' => {
                if !matches!(stream.peek(), Some(&(_, '}'))) {
                    return Err(AnnotationSyntaxError::UnexpectedToken {
                        expected: "|} (annotation block close)".to_string(),
                        got: format!("'|' at position {}", pos),
                    });
                }
                stream.next();
                AnnotationToken::RBrace2
            }
            '<' => AnnotationToken::NamedNode(read_iri(&mut stream, pos)?),
            '_' => {
                if !matches!(stream.peek(), Some(&(_, ':'))) {
                    return Err(AnnotationSyntaxError::UnexpectedToken {
                        expected: "_: (blank node prefix)".to_string(),
                        got: format!("'_' without ':' at position {}", pos),
                    });
                }
                stream.next();
                AnnotationToken::BlankNode(read_label(&mut stream))
            }
            '"' => {
                let (value, lang, datatype) = read_literal(&mut stream, pos)?;
                AnnotationToken::Literal(value, lang, datatype)
            }
            '.' => AnnotationToken::Dot,
            ';' => AnnotationToken::Semicolon,
            ',' => AnnotationToken::Comma,
            other => {
                return Err(AnnotationSyntaxError::UnexpectedToken {
                    expected: "IRI, blank node, literal, or separator".to_string(),
                    got: format!("'{}' at position {}", other, pos),
                });
            }
        };
        out.push(token);
    }

    Ok(out)
}
/// Locates every top-level `{| ... |}` annotation block in `input`.
///
/// Returns one `(start, end)` pair of byte offsets per block, where `start`
/// is the offset of the opening `{` and `end` is one past the closing `}`,
/// so `&input[start..end]` is the whole block including its delimiters.
/// Nested blocks are folded into the outermost one. A block that is never
/// closed is not reported.
///
/// Inside a block the scanner skips over:
/// * double-quoted strings (honouring backslash escapes),
/// * IRI references `<...>`, so that a fragment such as
///   `<http://example.org/ns#p>` is not mistaken for a comment,
/// * `#` comments, which run to the end of the line.
///
/// Text outside of blocks is scanned only for the `{|` opener; strings and
/// comments are not interpreted there.
pub fn find_annotation_blocks(input: &str) -> Vec<(usize, usize)> {
    let mut result = Vec::new();
    let bytes = input.as_bytes();
    let len = bytes.len();
    let mut i = 0;

    while i + 1 < len {
        if !(bytes[i] == b'{' && bytes[i + 1] == b'|') {
            i += 1;
            continue;
        }

        let start = i;
        i += 2;
        // Invariant: depth >= 1 while inside the inner loop, so the
        // decrement below cannot underflow.
        let mut depth = 1usize;

        while i + 1 < len {
            match (bytes[i], bytes[i + 1]) {
                (b'{', b'|') => {
                    depth += 1;
                    i += 2;
                }
                (b'|', b'}') => {
                    depth -= 1;
                    i += 2;
                    if depth == 0 {
                        result.push((start, i));
                        break;
                    }
                }
                (b'"', _) => {
                    // Skip the string body; `\x` escapes are stepped over as
                    // a pair so an escaped quote does not end the string.
                    i += 1;
                    while i < len {
                        match bytes[i] {
                            b'\\' => i += 2,
                            b'"' => {
                                i += 1;
                                break;
                            }
                            _ => i += 1,
                        }
                    }
                }
                (b'<', _) => {
                    // Skip an IRI reference so that `#` or `|}` inside it is
                    // not interpreted. IRIs cannot contain whitespace, so
                    // hitting whitespace first means this `<` did not start
                    // one; scanning then simply resumes from there.
                    i += 1;
                    while i < len && bytes[i] != b'>' && !bytes[i].is_ascii_whitespace() {
                        i += 1;
                    }
                    if i < len && bytes[i] == b'>' {
                        i += 1;
                    }
                }
                (b'#', _) => {
                    // Comment: ignore everything up to (not including) the
                    // newline.
                    while i < len && bytes[i] != b'\n' {
                        i += 1;
                    }
                }
                _ => i += 1,
            }
        }
    }

    result
}
/// Collects the characters of an IRI reference up to its closing `>`.
///
/// The callers in this file consume the opening `<` before calling, so
/// reading starts at the first character of the IRI itself. The closing `>`
/// is consumed but not included in the returned string. No character
/// validation or escape processing is performed.
///
/// `start` is the position of the opening `<` and is used only in the error
/// message.
///
/// # Errors
///
/// Returns [`AnnotationSyntaxError::UnexpectedToken`] if the input ends
/// before a `>` is seen.
fn read_iri(
    chars: &mut std::iter::Peekable<std::str::CharIndices<'_>>,
    start: usize,
) -> Result<String, AnnotationSyntaxError> {
    let mut collected = String::new();
    loop {
        match chars.next() {
            Some((_, '>')) => return Ok(collected),
            Some((_, ch)) => collected.push(ch),
            None => {
                return Err(AnnotationSyntaxError::UnexpectedToken {
                    expected: "closing '>' for IRI".to_string(),
                    got: format!("end of input after position {}", start),
                });
            }
        }
    }
}
/// Reads a label (blank node label or language tag) from `chars`.
///
/// A label is a run of alphanumeric characters, `_`, `-` and `.`. Dots are
/// only accepted when another non-dot label character follows them, so a
/// trailing `.` is left in the stream for the caller to tokenize as a
/// statement terminator: `b1.` yields `"b1"` with `.` still unread, while
/// `a.b` yields `"a.b"`.
///
/// Stops (without consuming) at the first character that cannot continue the
/// label; returns an empty string if there is none to read.
fn read_label(chars: &mut std::iter::Peekable<std::str::CharIndices<'_>>) -> String {
    let is_label_char = |c: char| c.is_alphanumeric() || c == '_' || c == '-';

    let mut label = String::new();
    while let Some(&(_, c)) = chars.peek() {
        if is_label_char(c) {
            label.push(c);
            chars.next();
        } else if c == '.' {
            // Look past the run of dots on a cheap clone of the iterator to
            // decide whether the label continues after it.
            let mut lookahead = chars.clone();
            let mut dots = 0usize;
            while let Some(&(_, '.')) = lookahead.peek() {
                lookahead.next();
                dots += 1;
            }
            let continues = matches!(lookahead.peek(), Some(&(_, next)) if is_label_char(next));
            if !continues {
                break;
            }
            for _ in 0..dots {
                label.push('.');
                chars.next();
            }
        } else {
            break;
        }
    }
    label
}
/// Tries to decode the `digits` hexadecimal digits that follow a `\u` or
/// `\U` escape.
///
/// On success the digits are consumed from `chars` and the decoded character
/// is returned. If the digits are missing, not hexadecimal, or do not form a
/// valid Unicode scalar value, `chars` is left untouched and `None` is
/// returned.
fn decode_unicode_escape(
    chars: &mut std::iter::Peekable<std::str::CharIndices<'_>>,
    digits: usize,
) -> Option<char> {
    let mut lookahead = chars.clone();
    let mut code: u32 = 0;
    for _ in 0..digits {
        let (_, c) = lookahead.next()?;
        // At most 8 hex digits are read, so this cannot overflow a u32.
        code = code * 16 + c.to_digit(16)?;
    }
    let decoded = char::from_u32(code)?;
    *chars = lookahead;
    Some(decoded)
}

/// Reads the rest of a string literal plus its optional language tag or
/// datatype.
///
/// The callers in this file consume the opening `"` before calling. `start`
/// is the position of that quote and is used only in error messages.
///
/// Escape handling inside the string:
/// * `\n`, `\r`, `\t`, `\b`, `\f` map to the corresponding control character;
/// * `\uXXXX` and `\UXXXXXXXX` are decoded to the Unicode character they
///   name; a malformed sequence is kept leniently as the letter `u` / `U`
///   followed by whatever comes next;
/// * any other escaped character (including `\"` and `\\`) stands for itself.
///
/// After the closing quote, `@tag` yields a language and `^^<iri>` yields a
/// datatype. A single `^` before `<` is tolerated as well.
///
/// Returns `(value, language, datatype)`.
///
/// # Errors
///
/// Returns [`AnnotationSyntaxError::UnexpectedToken`] if the input ends
/// before the closing quote, if `^`/`^^` is not followed by `<`, or if the
/// datatype IRI is unterminated.
fn read_literal(
    chars: &mut std::iter::Peekable<std::str::CharIndices<'_>>,
    start: usize,
) -> Result<(String, Option<String>, Option<String>), AnnotationSyntaxError> {
    let mut value = String::new();
    let mut escaped = false;
    loop {
        match chars.next() {
            None => {
                return Err(AnnotationSyntaxError::UnexpectedToken {
                    expected: "closing '\"' for string literal".to_string(),
                    got: format!("end of input after position {}", start),
                });
            }
            Some((_, c)) if escaped => {
                let decoded = match c {
                    'n' => '\n',
                    'r' => '\r',
                    't' => '\t',
                    'b' => '\u{0008}',
                    'f' => '\u{000C}',
                    'u' => decode_unicode_escape(chars, 4).unwrap_or('u'),
                    'U' => decode_unicode_escape(chars, 8).unwrap_or('U'),
                    // `\"`, `\\`, `\'` and unknown escapes stand for themselves.
                    other => other,
                };
                value.push(decoded);
                escaped = false;
            }
            Some((_, '\\')) => escaped = true,
            Some((_, '"')) => break,
            Some((_, c)) => value.push(c),
        }
    }

    match chars.peek().map(|&(_, c)| c) {
        Some('@') => {
            chars.next();
            let lang = read_label(chars);
            Ok((value, Some(lang), None))
        }
        Some('^') => {
            chars.next();
            // The second `^` of `^^` is optional here; a lone `^` is accepted.
            if chars.peek().map(|&(_, c)| c) == Some('^') {
                chars.next();
            }
            match chars.next() {
                Some((pos, '<')) => {
                    let dt = read_iri(chars, pos)?;
                    Ok((value, None, Some(dt)))
                }
                Some((_, other)) => Err(AnnotationSyntaxError::UnexpectedToken {
                    expected: "<datatype IRI> after ^^".to_string(),
                    got: format!("'{}'", other),
                }),
                None => Err(AnnotationSyntaxError::UnexpectedToken {
                    expected: "<datatype IRI> after ^^".to_string(),
                    got: "end of input".to_string(),
                }),
            }
        }
        _ => Ok((value, None, None)),
    }
}