1use super::error::{ParseResult, RdfParseError, TextPosition};
7use std::io::BufRead;
8
#[allow(dead_code)]
/// Couples a character source with a token recognizer.
///
/// `B` is the character buffer and `TR` the tokenizer. The trait bounds are
/// placed on the `impl` blocks rather than on the struct itself.
pub struct Lexer<B, TR> {
    /// Character source handed to the tokenizer on every `next_token` call.
    buffer: B,
    /// Recognizer that turns buffered characters into tokens.
    tokenizer: TR,
    /// Position passed to the tokenizer by mutable reference. Note that
    /// `Lexer::position()` reports the buffer's position, not this field.
    position: TextPosition,
    /// Set to `None` by `new` and not read anywhere in this file.
    current_char: Option<char>,
    /// Set to `None` by `new` and not read anywhere in this file.
    peek_char: Option<char>,
}
18
/// Strategy for turning buffered characters into tokens.
pub trait TokenRecognizer {
    /// Token type produced by this recognizer.
    type Token;

    /// Tries to recognize the next token from `buffer`.
    ///
    /// `position` is passed by mutable reference, so an implementation may
    /// update it while scanning.
    ///
    /// NOTE(review): presumably `Ok(None)` signals that no further token is
    /// available (end of input) — confirm against the implementors.
    fn recognize_next_token(
        &mut self,
        buffer: &mut dyn BufferProvider,
        position: &mut TextPosition,
    ) -> ParseResult<Option<Self::Token>>;
}
30
/// Strategy for building `Node` values from a token stream.
pub trait RuleRecognizer<Node> {
    /// Tries to recognize the next node from the tokens held by `parser`.
    ///
    /// The token type is a generic parameter of the method, not of the trait,
    /// so an implementation has to work for whatever `Token` the caller picks.
    ///
    /// NOTE(review): presumably `Ok(None)` means no further node could be
    /// produced — confirm against the implementors.
    fn recognize_next_node<Token>(
        &mut self,
        parser: &mut Parser<Token>,
    ) -> ParseResult<Option<Node>>;
}
39
/// Character cursor with one character of lookahead and position tracking.
///
/// The descriptions below reflect how the two implementations in this file
/// (`StringBuffer` and `ReaderBuffer`) behave.
pub trait BufferProvider {
    /// Returns the character under the cursor, or `None` when there is none.
    fn current(&self) -> Option<char>;

    /// Returns the character after the current one without consuming anything.
    fn peek(&self) -> Option<char>;

    /// Moves the cursor forward by one character and returns the new current
    /// character. The character being left is first fed to `update_position`.
    fn advance(&mut self) -> Option<char>;

    /// Returns the position bookkeeping maintained by `update_position`.
    fn position(&self) -> &TextPosition;

    /// Applies the consumed character `ch` to the tracked position: `'\n'`
    /// and a `'\r'` not followed by `'\n'` start a new line, other characters
    /// advance the column, and every character advances the offset.
    fn update_position(&mut self, ch: char);
}
57
58pub struct StringBuffer {
60 content: String,
61 position: TextPosition,
62 current: Option<char>,
63 peek: Option<char>,
64 char_position: usize, }
66
67impl StringBuffer {
68 pub fn new(content: String) -> Self {
69 let mut buffer = Self {
70 content,
71 position: TextPosition::start(),
72 current: None,
73 peek: None,
74 char_position: 0,
75 };
76 buffer.current = buffer.get_char_at(0);
78 buffer.peek = buffer.get_char_at(1);
79 buffer
80 }
81
82 fn get_char_at(&self, index: usize) -> Option<char> {
83 self.content.chars().nth(index)
84 }
85}
86
87impl BufferProvider for StringBuffer {
88 fn current(&self) -> Option<char> {
89 self.current
90 }
91
92 fn peek(&self) -> Option<char> {
93 self.peek
94 }
95
96 fn advance(&mut self) -> Option<char> {
97 if let Some(ch) = self.current {
98 self.update_position(ch);
99 }
100
101 self.current = self.peek;
102 self.char_position += 1;
103 self.peek = self.get_char_at(self.char_position + 1);
104 self.current
105 }
106
107 fn position(&self) -> &TextPosition {
108 &self.position
109 }
110
111 fn update_position(&mut self, ch: char) {
112 match ch {
113 '\n' => {
114 self.position.line += 1;
115 self.position.column = 1;
116 self.position.offset += 1;
117 }
118 '\r' => {
119 if self.peek == Some('\n') {
121 } else {
123 self.position.line += 1;
124 self.position.column = 1;
125 }
126 self.position.offset += 1;
127 }
128 _ => {
129 self.position.column += 1;
130 self.position.offset += 1;
131 }
132 }
133 }
134}
135
/// [`BufferProvider`] that pulls its characters from a `BufRead` source.
pub struct ReaderBuffer<R: BufRead> {
    /// Underlying reader; `fill_buffer` reads it one line at a time.
    reader: R,
    /// Position bookkeeping maintained by `update_position`.
    position: TextPosition,
    /// Character under the cursor, if any.
    current: Option<char>,
    /// Character following `current`, if any.
    peek: Option<char>,
    /// Characters decoded so far. `fill_buffer` appends to it and nothing in
    /// this file removes entries.
    char_buffer: Vec<char>,
    /// Index into `char_buffer` that `advance` increments and then uses to
    /// load the next lookahead character.
    buffer_pos: usize,
}
145
146impl<R: BufRead> ReaderBuffer<R> {
147 pub fn new(reader: R) -> ParseResult<Self> {
148 let mut buffer = Self {
149 reader,
150 position: TextPosition::start(),
151 current: None,
152 peek: None,
153 char_buffer: Vec::new(),
154 buffer_pos: 0,
155 };
156
157 buffer.fill_buffer()?;
158 buffer.advance(); Ok(buffer)
160 }
161
162 fn fill_buffer(&mut self) -> ParseResult<()> {
163 let mut line = String::new();
164 match self.reader.read_line(&mut line) {
165 Ok(0) => Ok(()), Ok(_) => {
167 self.char_buffer.extend(line.chars());
168 Ok(())
169 }
170 Err(e) => Err(RdfParseError::Io(e)),
171 }
172 }
173
174 #[allow(dead_code)]
175 fn ensure_chars_available(&mut self) -> ParseResult<()> {
176 if self.buffer_pos + 1 >= self.char_buffer.len() {
177 self.fill_buffer()?;
178 }
179 Ok(())
180 }
181}
182
183impl<R: BufRead> BufferProvider for ReaderBuffer<R> {
184 fn current(&self) -> Option<char> {
185 self.current
186 }
187
188 fn peek(&self) -> Option<char> {
189 self.peek
190 }
191
192 fn advance(&mut self) -> Option<char> {
193 if let Some(ch) = self.current {
194 self.update_position(ch);
195 }
196
197 self.current = self.peek;
198
199 self.buffer_pos += 1;
201 if self.buffer_pos < self.char_buffer.len() {
202 self.peek = Some(self.char_buffer[self.buffer_pos]);
203 } else {
204 if self.fill_buffer().is_ok() && self.buffer_pos < self.char_buffer.len() {
206 self.peek = Some(self.char_buffer[self.buffer_pos]);
207 } else {
208 self.peek = None;
209 }
210 }
211
212 self.current
213 }
214
215 fn position(&self) -> &TextPosition {
216 &self.position
217 }
218
219 fn update_position(&mut self, ch: char) {
220 match ch {
221 '\n' => {
222 self.position.line += 1;
223 self.position.column = 1;
224 self.position.offset += 1;
225 }
226 '\r' => {
227 if self.peek == Some('\n') {
228 } else {
230 self.position.line += 1;
231 self.position.column = 1;
232 }
233 self.position.offset += 1;
234 }
235 _ => {
236 self.position.column += 1;
237 self.position.offset += 1;
238 }
239 }
240 }
241}
242
243impl<B: BufferProvider, TR> Lexer<B, TR> {
244 pub fn new(buffer: B, tokenizer: TR) -> Self {
245 Self {
246 buffer,
247 tokenizer,
248 position: TextPosition::start(),
249 current_char: None,
250 peek_char: None,
251 }
252 }
253}
254
255impl<B: BufferProvider, TR: TokenRecognizer> Lexer<B, TR> {
256 pub fn next_token(&mut self) -> ParseResult<Option<TR::Token>> {
258 self.tokenizer
259 .recognize_next_token(&mut self.buffer, &mut self.position)
260 }
261
262 pub fn position(&self) -> &TextPosition {
264 self.buffer.position()
265 }
266}
267
/// Cursor over an already tokenized input.
pub struct Parser<Token> {
    /// All tokens, in input order.
    tokens: Vec<Token>,
    /// Index of the next token to hand out; never exceeds `tokens.len()`.
    position: usize,
}

impl<Token> Parser<Token> {
    /// Creates a parser positioned on the first token.
    pub fn new(tokens: Vec<Token>) -> Self {
        Self { tokens, position: 0 }
    }

    /// Returns the next token without consuming it.
    pub fn peek(&self) -> Option<&Token> {
        let index = self.position;
        self.tokens.get(index)
    }

    /// Consumes and returns the next token, or `None` at the end.
    pub fn next_token(&mut self) -> Option<&Token> {
        let token = self.tokens.get(self.position)?;
        // Only step forward when a token was actually handed out.
        self.position += 1;
        Some(token)
    }

    /// Returns `true` once every token has been consumed.
    pub fn is_at_end(&self) -> bool {
        self.peek().is_none()
    }

    /// Returns the index of the next token, usable with `reset_to`.
    pub fn token_position(&self) -> usize {
        self.position
    }

    /// Moves the cursor to `pos`, clamped to the end of the token list.
    pub fn reset_to(&mut self, pos: usize) {
        let end = self.tokens.len();
        self.position = if pos > end { end } else { pos };
    }
}
313
/// Character classification helpers following the Turtle/SPARQL grammar.
pub mod char_utils {
    /// Returns `true` for the grammar's whitespace characters: space, tab,
    /// line feed and carriage return.
    pub fn is_whitespace(ch: char) -> bool {
        matches!(ch, ' ' | '\t' | '\n' | '\r')
    }

    /// Returns `true` if `ch` opens an IRI reference (`<`).
    pub fn is_iri_start(ch: char) -> bool {
        ch == '<'
    }

    /// Returns `true` if `ch` may appear unescaped inside `<...>`.
    ///
    /// Everything is allowed except U+0000–U+0020 and the characters
    /// `<`, `>`, `"`, `{`, `}`, `|`, `^`, `` ` `` and `\`.
    pub fn is_iri_char(ch: char) -> bool {
        !matches!(
            ch,
            '<' | '>' | '"' | '{' | '}' | '|' | '^' | '`' | '\\' | '\x00'..='\x20'
        )
    }

    /// Returns `true` if `ch` opens a blank node label (`_`).
    pub fn is_blank_node_start(ch: char) -> bool {
        ch == '_'
    }

    /// Returns `true` if `ch` matches the `PN_CHARS_BASE` production.
    ///
    /// This includes the supplementary-plane range U+10000–U+EFFFF, which
    /// the grammar lists as the final alternative.
    pub fn is_pn_chars_base(ch: char) -> bool {
        matches!(
            ch,
            'A'..='Z'
                | 'a'..='z'
                | '\u{00C0}'..='\u{00D6}'
                | '\u{00D8}'..='\u{00F6}'
                | '\u{00F8}'..='\u{02FF}'
                | '\u{0370}'..='\u{037D}'
                | '\u{037F}'..='\u{1FFF}'
                | '\u{200C}'..='\u{200D}'
                | '\u{2070}'..='\u{218F}'
                | '\u{2C00}'..='\u{2FEF}'
                | '\u{3001}'..='\u{D7FF}'
                | '\u{F900}'..='\u{FDCF}'
                | '\u{FDF0}'..='\u{FFFD}'
                | '\u{10000}'..='\u{EFFFF}'
        )
    }

    /// Returns `true` if `ch` matches the `PN_CHARS` production.
    ///
    /// `PN_CHARS` is `PN_CHARS_U` (that is `PN_CHARS_BASE` or `_`) plus `-`,
    /// the ASCII digits, U+00B7, U+0300–U+036F and U+203F–U+2040.
    pub fn is_pn_chars(ch: char) -> bool {
        is_pn_chars_base(ch)
            || matches!(
                ch,
                '_' | '-'
                    | '0'..='9'
                    | '\u{00B7}'
                    | '\u{0300}'..='\u{036F}'
                    | '\u{203F}'..='\u{2040}'
            )
    }

    /// Returns `true` if `ch` can begin a numeric literal: a digit, a sign
    /// or a decimal point.
    pub fn is_numeric_start(ch: char) -> bool {
        matches!(ch, '0'..='9' | '+' | '-' | '.')
    }

    /// Returns `true` for the ASCII digits `0`–`9`.
    pub fn is_digit(ch: char) -> bool {
        ch.is_ascii_digit()
    }

    /// Returns `true` for the ASCII hexadecimal digits.
    pub fn is_hex_digit(ch: char) -> bool {
        ch.is_ascii_hexdigit()
    }
}
365
366pub mod string_utils {
368 use super::ParseResult;
369 use crate::format::error::{RdfParseError, RdfSyntaxError, TextPosition};
370
371 pub fn unescape_string(input: &str, position: &TextPosition) -> ParseResult<String> {
373 let mut result = String::new();
374 let mut chars = input.chars();
375
376 while let Some(ch) = chars.next() {
377 if ch == '\\' {
378 match chars.next() {
379 Some('t') => result.push('\t'),
380 Some('n') => result.push('\n'),
381 Some('r') => result.push('\r'),
382 Some('b') => result.push('\u{0008}'),
383 Some('f') => result.push('\u{000C}'),
384 Some('"') => result.push('"'),
385 Some('\'') => result.push('\''),
386 Some('\\') => result.push('\\'),
387 Some('u') => {
388 let mut unicode_chars = String::new();
390 for _ in 0..4 {
391 match chars.next() {
392 Some(c) if c.is_ascii_hexdigit() => unicode_chars.push(c),
393 _ => {
394 return Err(RdfParseError::Syntax(
395 RdfSyntaxError::with_position(
396 "Invalid Unicode escape sequence".to_string(),
397 *position,
398 ),
399 ))
400 }
401 }
402 }
403 let code_point = u32::from_str_radix(&unicode_chars, 16).map_err(|_| {
404 RdfParseError::Syntax(RdfSyntaxError::with_position(
405 "Invalid Unicode code point".to_string(),
406 *position,
407 ))
408 })?;
409 match char::from_u32(code_point) {
410 Some(unicode_char) => result.push(unicode_char),
411 None => {
412 return Err(RdfParseError::Syntax(RdfSyntaxError::with_position(
413 "Invalid Unicode code point".to_string(),
414 *position,
415 )))
416 }
417 }
418 }
419 Some('U') => {
420 let mut unicode_chars = String::new();
422 for _ in 0..8 {
423 match chars.next() {
424 Some(c) if c.is_ascii_hexdigit() => unicode_chars.push(c),
425 _ => {
426 return Err(RdfParseError::Syntax(
427 RdfSyntaxError::with_position(
428 "Invalid Unicode escape sequence".to_string(),
429 *position,
430 ),
431 ))
432 }
433 }
434 }
435 let code_point = u32::from_str_radix(&unicode_chars, 16).map_err(|_| {
436 RdfParseError::Syntax(RdfSyntaxError::with_position(
437 "Invalid Unicode code point".to_string(),
438 *position,
439 ))
440 })?;
441 match char::from_u32(code_point) {
442 Some(unicode_char) => result.push(unicode_char),
443 None => {
444 return Err(RdfParseError::Syntax(RdfSyntaxError::with_position(
445 "Invalid Unicode code point".to_string(),
446 *position,
447 )))
448 }
449 }
450 }
451 Some(other) => {
452 return Err(RdfParseError::Syntax(RdfSyntaxError::with_position(
453 format!("Invalid escape sequence: \\{other}"),
454 *position,
455 )));
456 }
457 None => {
458 return Err(RdfParseError::Syntax(RdfSyntaxError::with_position(
459 "Incomplete escape sequence".to_string(),
460 *position,
461 )));
462 }
463 }
464 } else {
465 result.push(ch);
466 }
467 }
468
469 Ok(result)
470 }
471
472 pub fn escape_string(input: &str) -> String {
474 let mut result = String::new();
475 for ch in input.chars() {
476 match ch {
477 '\t' => result.push_str("\\t"),
478 '\n' => result.push_str("\\n"),
479 '\r' => result.push_str("\\r"),
480 '\u{0008}' => result.push_str("\\b"),
481 '\u{000C}' => result.push_str("\\f"),
482 '"' => result.push_str("\\\""),
483 '\\' => result.push_str("\\\\"),
484 c if c.is_control() => {
485 if (c as u32) <= 0xFFFF {
486 result.push_str(&format!("\\u{:04X}", c as u32));
487 } else {
488 result.push_str(&format!("\\U{:08X}", c as u32));
489 }
490 }
491 c => result.push(c),
492 }
493 }
494 result
495 }
496}
497
#[cfg(test)]
mod tests {
    use super::char_utils::*;
    use super::string_utils::*;
    use super::*;

    /// Walks a two-line input and checks cursor, lookahead and position.
    #[test]
    fn test_string_buffer() {
        let mut buffer = StringBuffer::new(String::from("hello\nworld"));

        assert_eq!(buffer.current(), Some('h'));
        assert_eq!(buffer.peek(), Some('e'));

        buffer.advance();
        assert_eq!(buffer.current(), Some('e'));
        assert_eq!(buffer.position().column, 2);

        // Skip the rest of "hello" so the cursor rests on the line break.
        (0..4).for_each(|_| {
            buffer.advance();
        });
        assert_eq!(buffer.current(), Some('\n'));
        assert_eq!(buffer.position().line, 1);
        assert_eq!(buffer.position().column, 6);

        // Consuming the line break moves to the start of the second line.
        buffer.advance();
        assert_eq!(buffer.current(), Some('w'));
        assert_eq!(buffer.position().line, 2);
        assert_eq!(buffer.position().column, 1);
    }

    /// Spot-checks the character classification predicates.
    #[test]
    fn test_char_classification() {
        for ch in [' ', '\t', '\n'] {
            assert!(is_whitespace(ch));
        }
        assert!(!is_whitespace('a'));

        assert!(is_iri_start('<'));
        assert!(!is_iri_start('a'));

        for ch in ['A', 'z'] {
            assert!(is_pn_chars_base(ch));
        }
        assert!(!is_pn_chars_base('1'));

        for ch in ['A', '1', '-'] {
            assert!(is_pn_chars(ch));
        }

        for ch in ['1', '+', '.'] {
            assert!(is_numeric_start(ch));
        }
        assert!(!is_numeric_start('a'));
    }

    /// Checks unescaping (short and Unicode escapes) and escaping.
    #[test]
    fn test_string_escaping() {
        let position = TextPosition::start();

        // (escaped input, expected unescaped text)
        let unescape_cases = [
            ("hello\\nworld", "hello\nworld"),
            ("say \\\"hello\\\"", "say \"hello\""),
            ("\\u0041", "A"),
            ("\\U00000041", "A"),
        ];
        for (escaped, expected) in unescape_cases {
            assert_eq!(
                unescape_string(escaped, &position).expect("unescape should succeed"),
                expected
            );
        }

        // (raw input, expected escaped text)
        let escape_cases = [
            ("hello\nworld", "hello\\nworld"),
            ("say \"hello\"", "say \\\"hello\\\""),
        ];
        for (raw, expected) in escape_cases {
            assert_eq!(escape_string(raw), expected);
        }
    }
}