use super::error::{ParseResult, RdfParseError, TextPosition};
use std::io::BufRead;
/// Pairs a character buffer with a token recognizer.
///
/// `B` is the character source and `TR` the recognizer. `Lexer::next_token`
/// hands both `buffer` and `position` to the recognizer on every call.
// `dead_code` is allowed because `current_char` and `peek_char` are only ever
// initialised (to `None`) in this file and are never read here.
#[allow(dead_code)]
pub struct Lexer<B, TR> {
/// Character source passed to the recognizer.
buffer: B,
/// Recognizer that turns buffered characters into tokens.
tokenizer: TR,
/// Position passed mutably to the recognizer. Note that `Lexer::position`
/// reports the buffer's position, not this field.
position: TextPosition,
/// Initialised to `None` by `Lexer::new`; not read anywhere in this file.
current_char: Option<char>,
/// Initialised to `None` by `Lexer::new`; not read anywhere in this file.
peek_char: Option<char>,
}
/// Strategy for turning characters from a [`BufferProvider`] into tokens.
pub trait TokenRecognizer {
/// Token type produced by this recognizer.
type Token;
/// Attempts to recognize the next token from `buffer`.
///
/// `position` is a mutable position the recognizer may update;
/// `Lexer::next_token` passes the lexer's own `position` field here.
///
/// NOTE(review): presumably `Ok(None)` signals that no further token is
/// available — confirm against the implementors, none of which are visible
/// in this file.
fn recognize_next_token(
&mut self,
buffer: &mut dyn BufferProvider,
position: &mut TextPosition,
) -> ParseResult<Option<Self::Token>>;
}
/// Strategy for building `Node` values from the tokens held by a [`Parser`].
pub trait RuleRecognizer<Node> {
/// Attempts to recognize the next node using the tokens in `parser`.
///
/// The method is generic over `Token`, so an implementation has to work
/// for whatever token type the caller's parser holds.
///
/// NOTE(review): presumably `Ok(None)` means no further node could be
/// produced — confirm against the implementors, which are not in this file.
fn recognize_next_node<Token>(
&mut self,
parser: &mut Parser<Token>,
) -> ParseResult<Option<Node>>;
}
/// Character source with one character of lookahead and position tracking.
///
/// Implemented in this file by [`StringBuffer`] and [`ReaderBuffer`].
pub trait BufferProvider {
/// Returns the character the buffer is currently positioned on, if any.
fn current(&self) -> Option<char>;
/// Returns the character after the current one without consuming anything.
fn peek(&self) -> Option<char>;
/// Moves forward by one character and returns the new current character.
///
/// Both implementations in this file first record the character being
/// left behind via `update_position`, then shift `peek` into `current`.
fn advance(&mut self) -> Option<char>;
/// Returns the position tracked by the buffer.
fn position(&self) -> &TextPosition;
/// Updates the tracked position to account for consuming `ch`.
fn update_position(&mut self, ch: char);
}
/// In-memory character source backed by a complete string.
///
/// The text is decoded into a `Vec<char>` once, at construction, so that
/// looking up a character by index is O(1). Indexing the original `String`
/// with `chars().nth(i)` re-scans from the start on every call, which made a
/// full pass over the buffer quadratic in its length.
pub struct StringBuffer {
    /// Decoded characters of the input, indexed by character (not byte) position.
    chars: Vec<char>,
    /// Position tracked for this buffer; starts at `TextPosition::start()`.
    position: TextPosition,
    /// Character at `char_position`, or `None` past the end of the input.
    current: Option<char>,
    /// Character following `current`, or `None` if there is none.
    peek: Option<char>,
    /// Index into `chars` of `current`.
    char_position: usize,
}

impl StringBuffer {
    /// Creates a buffer positioned on the first character of `content`.
    ///
    /// `current` and `peek` are primed with the first two characters (each is
    /// `None` when the input is too short to provide one).
    pub fn new(content: String) -> Self {
        let chars: Vec<char> = content.chars().collect();
        let current = chars.first().copied();
        let peek = chars.get(1).copied();
        Self {
            chars,
            position: TextPosition::start(),
            current,
            peek,
            char_position: 0,
        }
    }

    /// Returns the character at character index `index`, or `None` when the
    /// index is past the end of the input.
    fn get_char_at(&self, index: usize) -> Option<char> {
        self.chars.get(index).copied()
    }
}
/// Character-stream access to a [`StringBuffer`] with line/column/offset tracking.
impl BufferProvider for StringBuffer {
    /// Character the buffer is positioned on, or `None` at end of input.
    fn current(&self) -> Option<char> {
        self.current
    }

    /// Character immediately after the current one, if any.
    fn peek(&self) -> Option<char> {
        self.peek
    }

    /// Consumes the current character and returns the new current character.
    ///
    /// The position is updated for the consumed character *before* the
    /// lookahead is shifted, so `update_position` still sees the character
    /// that follows it in `peek`.
    fn advance(&mut self) -> Option<char> {
        if let Some(consumed) = self.current {
            self.update_position(consumed);
        }
        let next_index = self.char_position + 1;
        self.current = self.peek;
        self.char_position = next_index;
        self.peek = self.get_char_at(next_index + 1);
        self.current
    }

    /// Position accumulated from the characters consumed so far.
    fn position(&self) -> &TextPosition {
        &self.position
    }

    /// Accounts for consuming `ch`.
    ///
    /// `'\n'` and a lone `'\r'` start a new line (column reset to 1). A `'\r'`
    /// whose lookahead is `'\n'` changes neither line nor column, leaving the
    /// line break to the `'\n'`. Any other character advances the column. The
    /// offset grows by one in every case.
    fn update_position(&mut self, ch: char) {
        let ends_line = ch == '\n' || (ch == '\r' && self.peek != Some('\n'));
        if ends_line {
            self.position.line += 1;
            self.position.column = 1;
        } else if ch != '\r' {
            self.position.column += 1;
        }
        self.position.offset += 1;
    }
}
/// Character source that pulls text line by line from a [`BufRead`] reader.
///
/// Lines are decoded into `char_buffer` as they are needed; the buffer is
/// only ever appended to.
pub struct ReaderBuffer<R: BufRead> {
    /// Underlying line-oriented reader.
    reader: R,
    /// Position tracked for this buffer; starts at `TextPosition::start()`.
    position: TextPosition,
    /// Character the buffer is positioned on, or `None` at end of input.
    current: Option<char>,
    /// Character following `current`, or `None` if there is none.
    peek: Option<char>,
    /// All characters read from `reader` so far.
    char_buffer: Vec<char>,
    /// Index into `char_buffer` of the `peek` character (`advance` increments
    /// it and then loads `peek` from this index).
    buffer_pos: usize,
}

impl<R: BufRead> ReaderBuffer<R> {
    /// Creates a buffer positioned on the first character of `reader`.
    ///
    /// `current` and `peek` are primed directly from the first characters
    /// read. Priming them by calling `advance` on the empty state skipped
    /// `char_buffer[0]`, so the first character of the input was never
    /// delivered and `current` was `None` right after construction.
    ///
    /// # Errors
    ///
    /// Returns `RdfParseError::Io` if reading from `reader` fails.
    pub fn new(reader: R) -> ParseResult<Self> {
        let mut buffer = Self {
            reader,
            position: TextPosition::start(),
            current: None,
            peek: None,
            char_buffer: Vec::new(),
            // `peek` will be `char_buffer[1]`, matching what `advance` expects.
            buffer_pos: 1,
        };
        buffer.fill_buffer()?;
        // A successfully read line holds at least one character, so a second
        // read is enough to obtain the lookahead when the first line is a
        // single character (e.g. a bare "\n"); at end of input it is a no-op.
        if buffer.char_buffer.len() < 2 {
            buffer.fill_buffer()?;
        }
        buffer.current = buffer.char_buffer.first().copied();
        buffer.peek = buffer.char_buffer.get(1).copied();
        Ok(buffer)
    }

    /// Reads one more line from the reader and appends its characters to
    /// `char_buffer`. Reaching end of input is not an error and appends
    /// nothing.
    ///
    /// # Errors
    ///
    /// Returns `RdfParseError::Io` if the read fails.
    fn fill_buffer(&mut self) -> ParseResult<()> {
        let mut line = String::new();
        let bytes_read = self
            .reader
            .read_line(&mut line)
            .map_err(RdfParseError::Io)?;
        if bytes_read > 0 {
            self.char_buffer.extend(line.chars());
        }
        Ok(())
    }

    /// Reads another line when `buffer_pos + 1` reaches or passes the end of
    /// `char_buffer`.
    // Not called anywhere in this file, hence the `dead_code` allowance.
    #[allow(dead_code)]
    fn ensure_chars_available(&mut self) -> ParseResult<()> {
        if self.buffer_pos + 1 >= self.char_buffer.len() {
            self.fill_buffer()?;
        }
        Ok(())
    }
}
/// Character-stream access to a [`ReaderBuffer`], reading further lines on demand.
impl<R: BufRead> BufferProvider for ReaderBuffer<R> {
    /// Character the buffer is positioned on, or `None` at end of input.
    fn current(&self) -> Option<char> {
        self.current
    }

    /// Character immediately after the current one, if any.
    fn peek(&self) -> Option<char> {
        self.peek
    }

    /// Consumes the current character and returns the new current character.
    ///
    /// When the lookahead index runs past the buffered characters, one more
    /// line is read. A read error is not reported here: it simply leaves the
    /// lookahead empty, exactly as end of input does.
    fn advance(&mut self) -> Option<char> {
        if let Some(consumed) = self.current {
            self.update_position(consumed);
        }
        self.current = self.peek;
        self.buffer_pos += 1;

        // Only touch the reader when the buffered characters are exhausted.
        let lookahead_usable =
            self.buffer_pos < self.char_buffer.len() || self.fill_buffer().is_ok();
        self.peek = if lookahead_usable {
            self.char_buffer.get(self.buffer_pos).copied()
        } else {
            None
        };
        self.current
    }

    /// Position accumulated from the characters consumed so far.
    fn position(&self) -> &TextPosition {
        &self.position
    }

    /// Accounts for consuming `ch`.
    ///
    /// `'\n'` and a lone `'\r'` start a new line (column reset to 1). A `'\r'`
    /// whose lookahead is `'\n'` changes neither line nor column, leaving the
    /// line break to the `'\n'`. Any other character advances the column. The
    /// offset grows by one in every case.
    fn update_position(&mut self, ch: char) {
        let ends_line = ch == '\n' || (ch == '\r' && self.peek != Some('\n'));
        if ends_line {
            self.position.line += 1;
            self.position.column = 1;
        } else if ch != '\r' {
            self.position.column += 1;
        }
        self.position.offset += 1;
    }
}
impl<B: BufferProvider, TR> Lexer<B, TR> {
pub fn new(buffer: B, tokenizer: TR) -> Self {
Self {
buffer,
tokenizer,
position: TextPosition::start(),
current_char: None,
peek_char: None,
}
}
}
impl<B: BufferProvider, TR: TokenRecognizer> Lexer<B, TR> {
    /// Asks the recognizer for the next token.
    ///
    /// The recognizer receives this lexer's buffer together with the lexer's
    /// own `position` field; its result is returned unchanged.
    pub fn next_token(&mut self) -> ParseResult<Option<TR::Token>> {
        let Self {
            buffer,
            tokenizer,
            position,
            ..
        } = self;
        tokenizer.recognize_next_token(buffer, position)
    }

    /// Returns the position reported by the underlying buffer (not the
    /// lexer's own `position` field).
    pub fn position(&self) -> &TextPosition {
        let source = &self.buffer;
        source.position()
    }
}
/// Cursor over an in-memory token sequence with checkpoint/rewind support.
pub struct Parser<Token> {
    /// Every token available to the parser.
    tokens: Vec<Token>,
    /// Index of the next token to hand out; never exceeds `tokens.len()`.
    position: usize,
}

impl<Token> Parser<Token> {
    /// Creates a parser positioned before the first of `tokens`.
    pub fn new(tokens: Vec<Token>) -> Self {
        Parser { position: 0, tokens }
    }

    /// Returns the next token without consuming it, or `None` at the end.
    pub fn peek(&self) -> Option<&Token> {
        self.tokens.as_slice().get(self.position)
    }

    /// Consumes and returns the next token, or `None` (leaving the cursor
    /// untouched) once every token has been handed out.
    pub fn next_token(&mut self) -> Option<&Token> {
        let upcoming = self.tokens.get(self.position)?;
        self.position += 1;
        Some(upcoming)
    }

    /// Reports whether every token has been consumed.
    pub fn is_at_end(&self) -> bool {
        self.peek().is_none()
    }

    /// Returns the cursor index, suitable for a later [`Parser::reset_to`].
    pub fn token_position(&self) -> usize {
        self.position
    }

    /// Moves the cursor to `pos`, clamped to the number of tokens.
    pub fn reset_to(&mut self, pos: usize) {
        let limit = self.tokens.len();
        self.position = if pos > limit { limit } else { pos };
    }
}
/// Character classification helpers following the terminal productions of
/// the W3C Turtle / N-Triples grammars.
pub mod char_utils {
    /// Whitespace per the `WS` production: space, tab, line feed, carriage return.
    pub fn is_whitespace(ch: char) -> bool {
        matches!(ch, ' ' | '\t' | '\n' | '\r')
    }

    /// `<` opens an `IRIREF`.
    pub fn is_iri_start(ch: char) -> bool {
        ch == '<'
    }

    /// Characters allowed unescaped inside an `IRIREF`: everything except
    /// U+0000..=U+0020 and `< > " { } | ^ \` and the backtick.
    pub fn is_iri_char(ch: char) -> bool {
        !matches!(
            ch,
            '<' | '>' | '"' | '{' | '}' | '|' | '^' | '`' | '\\' | '\x00'..='\x20'
        )
    }

    /// `_` opens a blank node label (`_:`).
    pub fn is_blank_node_start(ch: char) -> bool {
        ch == '_'
    }

    /// The `PN_CHARS_BASE` production.
    ///
    /// Includes the supplementary-plane range U+10000..=U+EFFFF required by
    /// the grammar, which was previously missing and caused such characters
    /// to be rejected in names.
    pub fn is_pn_chars_base(ch: char) -> bool {
        matches!(
            ch,
            'A'..='Z'
                | 'a'..='z'
                | '\u{00C0}'..='\u{00D6}'
                | '\u{00D8}'..='\u{00F6}'
                | '\u{00F8}'..='\u{02FF}'
                | '\u{0370}'..='\u{037D}'
                | '\u{037F}'..='\u{1FFF}'
                | '\u{200C}'..='\u{200D}'
                | '\u{2070}'..='\u{218F}'
                | '\u{2C00}'..='\u{2FEF}'
                | '\u{3001}'..='\u{D7FF}'
                | '\u{F900}'..='\u{FDCF}'
                | '\u{FDF0}'..='\u{FFFD}'
                | '\u{10000}'..='\u{EFFFF}'
        )
    }

    /// The `PN_CHARS` production:
    /// `PN_CHARS_U | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040]`,
    /// where `PN_CHARS_U` is `PN_CHARS_BASE | '_'`.
    ///
    /// The underscore contributed by `PN_CHARS_U` was previously missing.
    pub fn is_pn_chars(ch: char) -> bool {
        is_pn_chars_base(ch)
            || matches!(
                ch,
                '_' | '-'
                    | '0'..='9'
                    | '\u{00B7}'
                    | '\u{0300}'..='\u{036F}'
                    | '\u{203F}'..='\u{2040}'
            )
    }

    /// Characters that may begin a numeric literal: a digit, a sign, or `.`.
    pub fn is_numeric_start(ch: char) -> bool {
        matches!(ch, '0'..='9' | '+' | '-' | '.')
    }

    /// ASCII decimal digit.
    pub fn is_digit(ch: char) -> bool {
        ch.is_ascii_digit()
    }

    /// ASCII hexadecimal digit (either case).
    pub fn is_hex_digit(ch: char) -> bool {
        ch.is_ascii_hexdigit()
    }
}
/// Escaping and unescaping of string literal contents.
pub mod string_utils {
    use super::ParseResult;
    use crate::format::error::{RdfParseError, RdfSyntaxError, TextPosition};
    use std::fmt::Write;
    use std::str::Chars;

    /// Builds the syntax error reported for a malformed escape at `position`.
    fn syntax_error(message: impl Into<String>, position: &TextPosition) -> RdfParseError {
        RdfParseError::Syntax(RdfSyntaxError::with_position(message.into(), *position))
    }

    /// Reads exactly `digits` hexadecimal digits from `chars` and converts
    /// them to the character they denote. Shared by the `\uXXXX` and
    /// `\UXXXXXXXX` escapes, which differ only in their digit count.
    fn read_unicode_escape(
        chars: &mut Chars<'_>,
        digits: usize,
        position: &TextPosition,
    ) -> ParseResult<char> {
        let mut code_point: u32 = 0;
        for _ in 0..digits {
            let digit = chars
                .next()
                .and_then(|c| c.to_digit(16))
                .ok_or_else(|| syntax_error("Invalid Unicode escape sequence", position))?;
            // At most eight hex digits are read, so this cannot overflow a u32.
            code_point = code_point * 16 + digit;
        }
        // Surrogates and values above U+10FFFF are not valid `char`s.
        char::from_u32(code_point)
            .ok_or_else(|| syntax_error("Invalid Unicode code point", position))
    }

    /// Replaces backslash escapes in `input` with the characters they denote.
    ///
    /// Recognized escapes are `\t \n \r \b \f \" \' \\`, `\uXXXX` (four hex
    /// digits) and `\UXXXXXXXX` (eight hex digits).
    ///
    /// # Errors
    ///
    /// Returns `RdfParseError::Syntax` carrying `position` when an escape is
    /// unknown, is cut short by the end of the input, has too few or
    /// non-hexadecimal digits, or names a code point that is not a valid
    /// `char`.
    pub fn unescape_string(input: &str, position: &TextPosition) -> ParseResult<String> {
        // Unescaping never lengthens the text, so this capacity suffices.
        let mut result = String::with_capacity(input.len());
        let mut chars = input.chars();
        while let Some(ch) = chars.next() {
            if ch != '\\' {
                result.push(ch);
                continue;
            }
            let unescaped = match chars.next() {
                Some('t') => '\t',
                Some('n') => '\n',
                Some('r') => '\r',
                Some('b') => '\u{0008}',
                Some('f') => '\u{000C}',
                Some('"') => '"',
                Some('\'') => '\'',
                Some('\\') => '\\',
                Some('u') => read_unicode_escape(&mut chars, 4, position)?,
                Some('U') => read_unicode_escape(&mut chars, 8, position)?,
                Some(other) => {
                    return Err(syntax_error(
                        format!("Invalid escape sequence: \\{other}"),
                        position,
                    ));
                }
                None => return Err(syntax_error("Incomplete escape sequence", position)),
            };
            result.push(unescaped);
        }
        Ok(result)
    }

    /// Escapes `input` for use inside a double-quoted string literal.
    ///
    /// Tab, line feed, carriage return, backspace, form feed, `"` and `\`
    /// get their short escapes; every other control character is written as
    /// `\uXXXX`; all remaining characters are copied through.
    pub fn escape_string(input: &str) -> String {
        let mut result = String::with_capacity(input.len());
        for ch in input.chars() {
            match ch {
                '\t' => result.push_str("\\t"),
                '\n' => result.push_str("\\n"),
                '\r' => result.push_str("\\r"),
                '\u{0008}' => result.push_str("\\b"),
                '\u{000C}' => result.push_str("\\f"),
                '"' => result.push_str("\\\""),
                '\\' => result.push_str("\\\\"),
                c if c.is_control() => {
                    // Control characters (general category Cc) all lie at or
                    // below U+009F, so the four-digit form is always enough.
                    // Writing into a `String` cannot fail.
                    let _ = write!(result, "\\u{:04X}", c as u32);
                }
                c => result.push(c),
            }
        }
        result
    }
}
#[cfg(test)]
mod tests {
    use super::char_utils::*;
    use super::string_utils::*;
    use super::*;

    /// Walks a two-line string and checks the characters and position
    /// reported along the way.
    #[test]
    fn test_string_buffer() {
        let mut buffer = StringBuffer::new(String::from("hello\nworld"));
        assert_eq!(buffer.current(), Some('h'));
        assert_eq!(buffer.peek(), Some('e'));

        buffer.advance();
        assert_eq!(buffer.current(), Some('e'));
        assert_eq!(buffer.position().column, 2);

        // Step over the rest of "hello" so the newline becomes current.
        (0..4).for_each(|_| {
            buffer.advance();
        });
        assert_eq!(buffer.current(), Some('\n'));
        let before_break = buffer.position();
        assert_eq!(before_break.line, 1);
        assert_eq!(before_break.column, 6);

        // Consuming the newline moves to the start of the second line.
        buffer.advance();
        assert_eq!(buffer.current(), Some('w'));
        let after_break = buffer.position();
        assert_eq!(after_break.line, 2);
        assert_eq!(after_break.column, 1);
    }

    /// Spot-checks each character classifier with members and non-members.
    #[test]
    fn test_char_classification() {
        for blank in [' ', '\t', '\n'] {
            assert!(is_whitespace(blank));
        }
        assert!(!is_whitespace('a'));

        assert!(is_iri_start('<'));
        assert!(!is_iri_start('a'));

        for letter in ['A', 'z'] {
            assert!(is_pn_chars_base(letter));
        }
        assert!(!is_pn_chars_base('1'));

        for name_char in ['A', '1', '-'] {
            assert!(is_pn_chars(name_char));
        }

        for first in ['1', '+', '.'] {
            assert!(is_numeric_start(first));
        }
        assert!(!is_numeric_start('a'));
    }

    /// Checks unescaping and escaping against known input/output pairs.
    #[test]
    fn test_string_escaping() {
        let position = TextPosition::start();

        let unescape_cases = [
            ("hello\\nworld", "hello\nworld"),
            ("say \\\"hello\\\"", "say \"hello\""),
            ("\\u0041", "A"),
            ("\\U00000041", "A"),
        ];
        for (escaped, expected) in unescape_cases {
            let actual = unescape_string(escaped, &position).expect("unescape should succeed");
            assert_eq!(actual, expected);
        }

        let escape_cases = [
            ("hello\nworld", "hello\\nworld"),
            ("say \"hello\"", "say \\\"hello\\\""),
        ];
        for (raw, expected) in escape_cases {
            assert_eq!(escape_string(raw), expected);
        }
    }
}