use crate::nquads_streaming::NQuadsParseError;
/// A lexical token produced by `NQuadsLexer::tokenize_line`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
/// An IRI reference: the text found between `<` and `>`, with escape
/// sequences already decoded. The angle brackets are not included.
IriRef(String),
/// A blank node label: the text that follows the `_:` prefix. The prefix
/// itself is not included.
BlankNodeLabel(String),
/// A quoted string literal together with the annotation attached to it.
///
/// The lexer fills in at most one of `lang` and `datatype`.
StringLiteral {
/// The literal's content with the surrounding quotes removed and escape
/// sequences decoded.
value: String,
/// The language tag that followed `@`, without the `@` itself.
lang: Option<String>,
/// The datatype IRI that followed `^^`, without the angle brackets.
datatype: Option<String>,
},
/// A `.` character.
Dot,
/// A `^^` pair (two consecutive carets) that was not consumed as part of a
/// string literal.
Caret,
/// An `@` character that was not consumed as part of a string literal.
At,
}
/// Line-oriented lexer for N-Quads text.
///
/// This is a unit struct with no fields: it only serves as a namespace for the
/// associated functions defined in its `impl` block.
pub struct NQuadsLexer;
impl NQuadsLexer {
pub fn tokenize_line(line: &str, line_num: usize) -> Result<Vec<Token>, NQuadsParseError> {
let mut tokens = Vec::new();
let bytes = line.as_bytes();
let mut i = 0;
while i < bytes.len() {
if bytes[i].is_ascii_whitespace() {
i += 1;
continue;
}
if bytes[i] == b'#' {
break;
}
if bytes[i] == b'<' {
let (iri, consumed) = Self::read_iri(line, i, line_num)?;
tokens.push(Token::IriRef(iri));
i += consumed;
continue;
}
if i + 1 < bytes.len() && bytes[i] == b'_' && bytes[i + 1] == b':' {
let (label, consumed) = Self::read_blank_node(line, i, line_num)?;
tokens.push(Token::BlankNodeLabel(label));
i += consumed;
continue;
}
if bytes[i] == b'"' {
let (tok, consumed) = Self::read_string_literal(line, i, line_num)?;
tokens.push(tok);
i += consumed;
continue;
}
if bytes[i] == b'.' {
tokens.push(Token::Dot);
i += 1;
continue;
}
if i + 1 < bytes.len() && bytes[i] == b'^' && bytes[i + 1] == b'^' {
tokens.push(Token::Caret);
i += 2;
continue;
}
if bytes[i] == b'@' {
tokens.push(Token::At);
i += 1;
continue;
}
return Err(NQuadsParseError::InvalidLine {
line: line_num,
message: format!(
"Unexpected character '{}' at position {}",
bytes[i] as char, i
),
});
}
Ok(tokens)
}
fn read_iri(
line: &str,
start: usize,
line_num: usize,
) -> Result<(String, usize), NQuadsParseError> {
let bytes = line.as_bytes();
debug_assert_eq!(bytes[start], b'<');
let mut i = start + 1;
let mut iri = String::new();
loop {
if i >= bytes.len() {
return Err(NQuadsParseError::InvalidIri {
line: line_num,
iri: iri.clone(),
});
}
match bytes[i] {
b'>' => {
return Ok((iri, i - start + 1));
}
b'\\' => {
let (ch, skip) = Self::read_escape(line, i, line_num)?;
iri.push(ch);
i += skip;
}
b => {
if b < 0x21 || matches!(b, b'"' | b'{' | b'}' | b'|' | b'^' | b'`') {
return Err(NQuadsParseError::InvalidIri {
line: line_num,
iri: iri.clone(),
});
}
let (ch, char_bytes) = Self::decode_utf8_char(line, i, line_num)?;
iri.push(ch);
i += char_bytes;
}
}
}
}
fn read_blank_node(
line: &str,
start: usize,
line_num: usize,
) -> Result<(String, usize), NQuadsParseError> {
let bytes = line.as_bytes();
debug_assert_eq!(bytes[start], b'_');
debug_assert_eq!(bytes[start + 1], b':');
let mut i = start + 2;
let mut label = String::new();
while i < bytes.len() {
let b = bytes[i];
if b.is_ascii_whitespace() || b == b'.' || b == b',' || b == b';' {
break;
}
let (ch, char_bytes) = Self::decode_utf8_char(line, i, line_num)?;
label.push(ch);
i += char_bytes;
}
if label.is_empty() {
return Err(NQuadsParseError::InvalidBlankNode {
line: line_num,
name: String::new(),
});
}
Ok((label, i - start))
}
fn read_string_literal(
line: &str,
start: usize,
line_num: usize,
) -> Result<(Token, usize), NQuadsParseError> {
let bytes = line.as_bytes();
debug_assert_eq!(bytes[start], b'"');
let mut i = start + 1;
let mut value = String::new();
loop {
if i >= bytes.len() {
return Err(NQuadsParseError::InvalidLiteral {
line: line_num,
message: "Unterminated string literal".to_string(),
});
}
match bytes[i] {
b'"' => {
i += 1;
break;
}
b'\\' => {
let (ch, skip) = Self::read_escape(line, i, line_num)?;
value.push(ch);
i += skip;
}
_ => {
let (ch, char_bytes) = Self::decode_utf8_char(line, i, line_num)?;
value.push(ch);
i += char_bytes;
}
}
}
let after_close = i;
let mut j = after_close;
while j < bytes.len() && bytes[j].is_ascii_whitespace() {
j += 1;
}
if j < bytes.len() && bytes[j] == b'@' {
j += 1; let lang_start = j;
while j < bytes.len() && (bytes[j].is_ascii_alphanumeric() || bytes[j] == b'-') {
j += 1;
}
let lang = &line[lang_start..j];
if lang.is_empty() {
return Err(NQuadsParseError::InvalidLiteral {
line: line_num,
message: "Empty language tag".to_string(),
});
}
return Ok((
Token::StringLiteral {
value,
lang: Some(lang.to_string()),
datatype: None,
},
j - start,
));
}
if j + 1 < bytes.len() && bytes[j] == b'^' && bytes[j + 1] == b'^' {
j += 2; while j < bytes.len() && bytes[j].is_ascii_whitespace() {
j += 1;
}
if j >= bytes.len() || bytes[j] != b'<' {
return Err(NQuadsParseError::InvalidLiteral {
line: line_num,
message: "Expected '<' after '^^'".to_string(),
});
}
let (datatype_iri, iri_len) = Self::read_iri(line, j, line_num)?;
return Ok((
Token::StringLiteral {
value,
lang: None,
datatype: Some(datatype_iri),
},
j + iri_len - start,
));
}
Ok((
Token::StringLiteral {
value,
lang: None,
datatype: None,
},
after_close - start,
))
}
fn read_escape(
line: &str,
start: usize,
line_num: usize,
) -> Result<(char, usize), NQuadsParseError> {
let bytes = line.as_bytes();
debug_assert_eq!(bytes[start], b'\\');
if start + 1 >= bytes.len() {
return Err(NQuadsParseError::InvalidLiteral {
line: line_num,
message: "Incomplete escape sequence".to_string(),
});
}
match bytes[start + 1] {
b'n' => Ok(('\n', 2)),
b't' => Ok(('\t', 2)),
b'r' => Ok(('\r', 2)),
b'\\' => Ok(('\\', 2)),
b'"' => Ok(('"', 2)),
b'\'' => Ok(('\'', 2)),
b'u' => {
if start + 5 >= bytes.len() {
return Err(NQuadsParseError::InvalidLiteral {
line: line_num,
message: "\\uXXXX requires 4 hex digits".to_string(),
});
}
let hex = &line[start + 2..start + 6];
let code_point =
u32::from_str_radix(hex, 16).map_err(|_| NQuadsParseError::InvalidLiteral {
line: line_num,
message: format!("Invalid \\u escape: \\u{}", hex),
})?;
let ch =
char::from_u32(code_point).ok_or_else(|| NQuadsParseError::InvalidLiteral {
line: line_num,
message: format!("Invalid Unicode code point U+{:04X}", code_point),
})?;
Ok((ch, 6))
}
b'U' => {
if start + 9 >= bytes.len() {
return Err(NQuadsParseError::InvalidLiteral {
line: line_num,
message: "\\UXXXXXXXX requires 8 hex digits".to_string(),
});
}
let hex = &line[start + 2..start + 10];
let code_point =
u32::from_str_radix(hex, 16).map_err(|_| NQuadsParseError::InvalidLiteral {
line: line_num,
message: format!("Invalid \\U escape: \\U{}", hex),
})?;
let ch =
char::from_u32(code_point).ok_or_else(|| NQuadsParseError::InvalidLiteral {
line: line_num,
message: format!("Invalid Unicode code point U+{:08X}", code_point),
})?;
Ok((ch, 10))
}
other => Err(NQuadsParseError::InvalidLiteral {
line: line_num,
message: format!("Unknown escape sequence: \\{}", other as char),
}),
}
}
fn decode_utf8_char(
line: &str,
pos: usize,
line_num: usize,
) -> Result<(char, usize), NQuadsParseError> {
let slice = &line[pos..];
let ch = slice
.chars()
.next()
.ok_or_else(|| NQuadsParseError::InvalidLine {
line: line_num,
message: format!("Invalid UTF-8 at byte position {}", pos),
})?;
Ok((ch, ch.len_utf8()))
}
}