Skip to main content

oxirs_core/format/
toolkit.rs

1//! Generic parsing toolkit for RDF formats
2//!
3//! This module provides reusable parsing infrastructure including lexers, parsers,
4//! and error handling utilities adapted from OxiGraph's parsing toolkit.
5
6use super::error::{ParseResult, RdfParseError, TextPosition};
7use std::io::BufRead;
8
/// A generic lexer that tokenizes input streams.
///
/// `B` is the character source and `TR` holds the tokenizing rules. The struct
/// itself places no bounds on them; the `impl` blocks in this file require
/// `B: BufferProvider` and, for `next_token`, `TR: TokenRecognizer`.
// `dead_code` is allowed because `current_char` and `peek_char` are only ever
// initialised (to `None` in `Lexer::new`) and never read within this file.
#[allow(dead_code)]
pub struct Lexer<B, TR> {
    /// Character source handed to the tokenizer on every `next_token` call.
    buffer: B,
    /// Rules used to recognize tokens.
    tokenizer: TR,
    /// Position handed mutably to the tokenizer on every `next_token` call.
    /// NOTE(review): `Lexer::position()` reports the buffer's position, not
    /// this field — confirm which one is meant to be authoritative.
    position: TextPosition,
    /// Set to `None` on construction; not otherwise used in this file.
    current_char: Option<char>,
    /// Set to `None` on construction; not otherwise used in this file.
    peek_char: Option<char>,
}
18
/// Trait for tokenizing rules.
///
/// Implementors read characters from a [`BufferProvider`] and produce values
/// of the associated `Token` type. Within this file the only caller is
/// `Lexer::next_token`.
pub trait TokenRecognizer {
    /// The token type produced by this recognizer.
    type Token;

    /// Recognize the next token from the current position.
    ///
    /// * `buffer` - character source to read from; taken as a trait object so
    ///   one recognizer works with any buffer implementation.
    /// * `position` - position tracker the implementation may update. `Lexer`
    ///   passes its own `position` field here.
    ///
    /// Returns `Ok(Some(token))` when a token was recognized and `Err` when
    /// recognition failed.
    /// NOTE(review): `Ok(None)` presumably signals end of input — confirm
    /// against the implementors.
    fn recognize_next_token(
        &mut self,
        buffer: &mut dyn BufferProvider,
        position: &mut TextPosition,
    ) -> ParseResult<Option<Self::Token>>;
}
30
/// Trait for parsing rules that build AST nodes.
///
/// `Node` is the node type produced by the rule set.
pub trait RuleRecognizer<Node> {
    /// Recognize the next rule/node from the token stream.
    ///
    /// The method is generic over `Token`, so an implementation has to work
    /// for any token type the supplied [`Parser`] carries.
    ///
    /// * `parser` - token cursor to consume from; implementations may use
    ///   `Parser::token_position` / `Parser::reset_to` to backtrack.
    ///
    /// Returns `Ok(Some(node))` when a node was built and `Err` on failure.
    /// NOTE(review): `Ok(None)` presumably means "no further node" — confirm
    /// against the implementors.
    fn recognize_next_node<Token>(
        &mut self,
        parser: &mut Parser<Token>,
    ) -> ParseResult<Option<Node>>;
}
39
/// Buffer provider trait for reading characters.
///
/// Models a character cursor with one character of lookahead plus position
/// tracking for error reporting. The contract described on each method is the
/// one followed by the two implementations in this file, `StringBuffer` and
/// `ReaderBuffer`.
pub trait BufferProvider {
    /// Get the current character without advancing.
    ///
    /// `None` means there is no current character (end of input in the
    /// implementations in this file).
    fn current(&self) -> Option<char>;

    /// Get the next character (the one after `current`) without advancing.
    fn peek(&self) -> Option<char>;

    /// Advance to the next character and return it.
    ///
    /// The implementations in this file first account for the character being
    /// left via `update_position`, then move the lookahead into `current` and
    /// return the new `current`.
    fn advance(&mut self) -> Option<char>;

    /// Get current position for error reporting.
    fn position(&self) -> &TextPosition;

    /// Update position tracking for a consumed character `ch`.
    ///
    /// The implementations in this file call this from `advance` and treat
    /// `\n`, a lone `\r`, and `\r\n` as line terminators.
    fn update_position(&mut self, ch: char);
}
57
58/// String buffer implementation
59pub struct StringBuffer {
60    content: String,
61    position: TextPosition,
62    current: Option<char>,
63    peek: Option<char>,
64    char_position: usize, // Current position in chars
65}
66
67impl StringBuffer {
68    pub fn new(content: String) -> Self {
69        let mut buffer = Self {
70            content,
71            position: TextPosition::start(),
72            current: None,
73            peek: None,
74            char_position: 0,
75        };
76        // Initialize current and peek
77        buffer.current = buffer.get_char_at(0);
78        buffer.peek = buffer.get_char_at(1);
79        buffer
80    }
81
82    fn get_char_at(&self, index: usize) -> Option<char> {
83        self.content.chars().nth(index)
84    }
85}
86
87impl BufferProvider for StringBuffer {
88    fn current(&self) -> Option<char> {
89        self.current
90    }
91
92    fn peek(&self) -> Option<char> {
93        self.peek
94    }
95
96    fn advance(&mut self) -> Option<char> {
97        if let Some(ch) = self.current {
98            self.update_position(ch);
99        }
100
101        self.current = self.peek;
102        self.char_position += 1;
103        self.peek = self.get_char_at(self.char_position + 1);
104        self.current
105    }
106
107    fn position(&self) -> &TextPosition {
108        &self.position
109    }
110
111    fn update_position(&mut self, ch: char) {
112        match ch {
113            '\n' => {
114                self.position.line += 1;
115                self.position.column = 1;
116                self.position.offset += 1;
117            }
118            '\r' => {
119                // Handle Windows line endings
120                if self.peek == Some('\n') {
121                    // Don't increment position for \r if followed by \n
122                } else {
123                    self.position.line += 1;
124                    self.position.column = 1;
125                }
126                self.position.offset += 1;
127            }
128            _ => {
129                self.position.column += 1;
130                self.position.offset += 1;
131            }
132        }
133    }
134}
135
136/// Reader buffer implementation for streaming
137pub struct ReaderBuffer<R: BufRead> {
138    reader: R,
139    position: TextPosition,
140    current: Option<char>,
141    peek: Option<char>,
142    char_buffer: Vec<char>,
143    buffer_pos: usize,
144}
145
146impl<R: BufRead> ReaderBuffer<R> {
147    pub fn new(reader: R) -> ParseResult<Self> {
148        let mut buffer = Self {
149            reader,
150            position: TextPosition::start(),
151            current: None,
152            peek: None,
153            char_buffer: Vec::new(),
154            buffer_pos: 0,
155        };
156
157        buffer.fill_buffer()?;
158        buffer.advance(); // Load first character
159        Ok(buffer)
160    }
161
162    fn fill_buffer(&mut self) -> ParseResult<()> {
163        let mut line = String::new();
164        match self.reader.read_line(&mut line) {
165            Ok(0) => Ok(()), // EOF
166            Ok(_) => {
167                self.char_buffer.extend(line.chars());
168                Ok(())
169            }
170            Err(e) => Err(RdfParseError::Io(e)),
171        }
172    }
173
174    #[allow(dead_code)]
175    fn ensure_chars_available(&mut self) -> ParseResult<()> {
176        if self.buffer_pos + 1 >= self.char_buffer.len() {
177            self.fill_buffer()?;
178        }
179        Ok(())
180    }
181}
182
183impl<R: BufRead> BufferProvider for ReaderBuffer<R> {
184    fn current(&self) -> Option<char> {
185        self.current
186    }
187
188    fn peek(&self) -> Option<char> {
189        self.peek
190    }
191
192    fn advance(&mut self) -> Option<char> {
193        if let Some(ch) = self.current {
194            self.update_position(ch);
195        }
196
197        self.current = self.peek;
198
199        // Try to get next peek character
200        self.buffer_pos += 1;
201        if self.buffer_pos < self.char_buffer.len() {
202            self.peek = Some(self.char_buffer[self.buffer_pos]);
203        } else {
204            // Try to read more
205            if self.fill_buffer().is_ok() && self.buffer_pos < self.char_buffer.len() {
206                self.peek = Some(self.char_buffer[self.buffer_pos]);
207            } else {
208                self.peek = None;
209            }
210        }
211
212        self.current
213    }
214
215    fn position(&self) -> &TextPosition {
216        &self.position
217    }
218
219    fn update_position(&mut self, ch: char) {
220        match ch {
221            '\n' => {
222                self.position.line += 1;
223                self.position.column = 1;
224                self.position.offset += 1;
225            }
226            '\r' => {
227                if self.peek == Some('\n') {
228                    // Don't increment position for \r if followed by \n
229                } else {
230                    self.position.line += 1;
231                    self.position.column = 1;
232                }
233                self.position.offset += 1;
234            }
235            _ => {
236                self.position.column += 1;
237                self.position.offset += 1;
238            }
239        }
240    }
241}
242
243impl<B: BufferProvider, TR> Lexer<B, TR> {
244    pub fn new(buffer: B, tokenizer: TR) -> Self {
245        Self {
246            buffer,
247            tokenizer,
248            position: TextPosition::start(),
249            current_char: None,
250            peek_char: None,
251        }
252    }
253}
254
255impl<B: BufferProvider, TR: TokenRecognizer> Lexer<B, TR> {
256    /// Get the next token from the input
257    pub fn next_token(&mut self) -> ParseResult<Option<TR::Token>> {
258        self.tokenizer
259            .recognize_next_token(&mut self.buffer, &mut self.position)
260    }
261
262    /// Get current position for error reporting
263    pub fn position(&self) -> &TextPosition {
264        self.buffer.position()
265    }
266}
267
/// Cursor over a pre-lexed token list, used by grammar rules.
pub struct Parser<Token> {
    /// All tokens of the input, in order.
    tokens: Vec<Token>,
    /// Index of the next token to hand out; never exceeds `tokens.len()`
    /// when moved through the methods below.
    position: usize,
}

impl<Token> Parser<Token> {
    /// Wraps `tokens` with the cursor placed on the first token.
    pub fn new(tokens: Vec<Token>) -> Self {
        Parser {
            position: 0,
            tokens,
        }
    }

    /// Borrows the token under the cursor without consuming it.
    pub fn peek(&self) -> Option<&Token> {
        self.tokens.as_slice().get(self.position)
    }

    /// Consumes the token under the cursor and returns it, or `None` once
    /// the input is exhausted (the cursor then stays where it is).
    pub fn next_token(&mut self) -> Option<&Token> {
        let token = self.tokens.get(self.position)?;
        self.position += 1;
        Some(token)
    }

    /// `true` when every token has been consumed.
    pub fn is_at_end(&self) -> bool {
        self.peek().is_none()
    }

    /// Index of the token under the cursor; pair with `reset_to` to backtrack.
    pub fn token_position(&self) -> usize {
        self.position
    }

    /// Moves the cursor back (or forward) to `pos`, clamped to the end of
    /// the token list.
    pub fn reset_to(&mut self, pos: usize) {
        let end = self.tokens.len();
        self.position = if pos > end { end } else { pos };
    }
}
313
/// Utility functions for character classification.
///
/// The predicates follow the terminal productions of the W3C Turtle grammar
/// (`WS`, `IRIREF`, `PN_CHARS_BASE`, `PN_CHARS`).
pub mod char_utils {
    /// Check if character is whitespace in Turtle/N3
    /// (`WS ::= #x20 | #x9 | #xD | #xA`).
    pub fn is_whitespace(ch: char) -> bool {
        matches!(ch, ' ' | '\t' | '\n' | '\r')
    }

    /// Check if character can start an IRI reference (`<`).
    pub fn is_iri_start(ch: char) -> bool {
        ch == '<'
    }

    /// Check if character may appear unescaped inside an IRI reference.
    ///
    /// Rejects the characters excluded by `IRIREF`: control characters and
    /// space (U+0000–U+0020) and `< > " { } | ^ ` \`.
    pub fn is_iri_char(ch: char) -> bool {
        !matches!(
            ch,
            '<' | '>' | '"' | '{' | '}' | '|' | '^' | '`' | '\\' | '\x00'..='\x20'
        )
    }

    /// Check if character can start a blank node label (`_`).
    pub fn is_blank_node_start(ch: char) -> bool {
        ch == '_'
    }

    /// Check if character can start a prefix name (`PN_CHARS_BASE`).
    ///
    /// Includes the supplementary-plane range U+10000–U+EFFFF required by the
    /// grammar.
    pub fn is_pn_chars_base(ch: char) -> bool {
        matches!(
            ch,
            'A'..='Z'
                | 'a'..='z'
                | '\u{00C0}'..='\u{00D6}'
                | '\u{00D8}'..='\u{00F6}'
                | '\u{00F8}'..='\u{02FF}'
                | '\u{0370}'..='\u{037D}'
                | '\u{037F}'..='\u{1FFF}'
                | '\u{200C}'..='\u{200D}'
                | '\u{2070}'..='\u{218F}'
                | '\u{2C00}'..='\u{2FEF}'
                | '\u{3001}'..='\u{D7FF}'
                | '\u{F900}'..='\u{FDCF}'
                | '\u{FDF0}'..='\u{FFFD}'
                | '\u{10000}'..='\u{EFFFF}'
        )
    }

    /// Check if character can be in a prefix name after the first character
    /// (`PN_CHARS`).
    ///
    /// `PN_CHARS` is built on `PN_CHARS_U` (`PN_CHARS_BASE | '_'`), so the
    /// underscore is accepted here even though it cannot start a prefix name.
    pub fn is_pn_chars(ch: char) -> bool {
        is_pn_chars_base(ch)
            || matches!(
                ch,
                '_' | '-'
                    | '0'..='9'
                    | '\u{00B7}'
                    | '\u{0300}'..='\u{036F}'
                    | '\u{203F}'..='\u{2040}'
            )
    }

    /// Check if character can start a numeric literal (digit, sign, or `.`).
    pub fn is_numeric_start(ch: char) -> bool {
        matches!(ch, '0'..='9' | '+' | '-' | '.')
    }

    /// Check if character is an ASCII decimal digit.
    pub fn is_digit(ch: char) -> bool {
        ch.is_ascii_digit()
    }

    /// Check if character is an ASCII hexadecimal digit.
    pub fn is_hex_digit(ch: char) -> bool {
        ch.is_ascii_hexdigit()
    }
}
365
366/// Utility functions for string processing
367pub mod string_utils {
368    use super::ParseResult;
369    use crate::format::error::{RdfParseError, RdfSyntaxError, TextPosition};
370
371    /// Unescape string literal with Turtle escape sequences
372    pub fn unescape_string(input: &str, position: &TextPosition) -> ParseResult<String> {
373        let mut result = String::new();
374        let mut chars = input.chars();
375
376        while let Some(ch) = chars.next() {
377            if ch == '\\' {
378                match chars.next() {
379                    Some('t') => result.push('\t'),
380                    Some('n') => result.push('\n'),
381                    Some('r') => result.push('\r'),
382                    Some('b') => result.push('\u{0008}'),
383                    Some('f') => result.push('\u{000C}'),
384                    Some('"') => result.push('"'),
385                    Some('\'') => result.push('\''),
386                    Some('\\') => result.push('\\'),
387                    Some('u') => {
388                        // Unicode escape \uXXXX
389                        let mut unicode_chars = String::new();
390                        for _ in 0..4 {
391                            match chars.next() {
392                                Some(c) if c.is_ascii_hexdigit() => unicode_chars.push(c),
393                                _ => {
394                                    return Err(RdfParseError::Syntax(
395                                        RdfSyntaxError::with_position(
396                                            "Invalid Unicode escape sequence".to_string(),
397                                            *position,
398                                        ),
399                                    ))
400                                }
401                            }
402                        }
403                        let code_point = u32::from_str_radix(&unicode_chars, 16).map_err(|_| {
404                            RdfParseError::Syntax(RdfSyntaxError::with_position(
405                                "Invalid Unicode code point".to_string(),
406                                *position,
407                            ))
408                        })?;
409                        match char::from_u32(code_point) {
410                            Some(unicode_char) => result.push(unicode_char),
411                            None => {
412                                return Err(RdfParseError::Syntax(RdfSyntaxError::with_position(
413                                    "Invalid Unicode code point".to_string(),
414                                    *position,
415                                )))
416                            }
417                        }
418                    }
419                    Some('U') => {
420                        // Unicode escape \UXXXXXXXX
421                        let mut unicode_chars = String::new();
422                        for _ in 0..8 {
423                            match chars.next() {
424                                Some(c) if c.is_ascii_hexdigit() => unicode_chars.push(c),
425                                _ => {
426                                    return Err(RdfParseError::Syntax(
427                                        RdfSyntaxError::with_position(
428                                            "Invalid Unicode escape sequence".to_string(),
429                                            *position,
430                                        ),
431                                    ))
432                                }
433                            }
434                        }
435                        let code_point = u32::from_str_radix(&unicode_chars, 16).map_err(|_| {
436                            RdfParseError::Syntax(RdfSyntaxError::with_position(
437                                "Invalid Unicode code point".to_string(),
438                                *position,
439                            ))
440                        })?;
441                        match char::from_u32(code_point) {
442                            Some(unicode_char) => result.push(unicode_char),
443                            None => {
444                                return Err(RdfParseError::Syntax(RdfSyntaxError::with_position(
445                                    "Invalid Unicode code point".to_string(),
446                                    *position,
447                                )))
448                            }
449                        }
450                    }
451                    Some(other) => {
452                        return Err(RdfParseError::Syntax(RdfSyntaxError::with_position(
453                            format!("Invalid escape sequence: \\{other}"),
454                            *position,
455                        )));
456                    }
457                    None => {
458                        return Err(RdfParseError::Syntax(RdfSyntaxError::with_position(
459                            "Incomplete escape sequence".to_string(),
460                            *position,
461                        )));
462                    }
463                }
464            } else {
465                result.push(ch);
466            }
467        }
468
469        Ok(result)
470    }
471
472    /// Escape string for Turtle output
473    pub fn escape_string(input: &str) -> String {
474        let mut result = String::new();
475        for ch in input.chars() {
476            match ch {
477                '\t' => result.push_str("\\t"),
478                '\n' => result.push_str("\\n"),
479                '\r' => result.push_str("\\r"),
480                '\u{0008}' => result.push_str("\\b"),
481                '\u{000C}' => result.push_str("\\f"),
482                '"' => result.push_str("\\\""),
483                '\\' => result.push_str("\\\\"),
484                c if c.is_control() => {
485                    if (c as u32) <= 0xFFFF {
486                        result.push_str(&format!("\\u{:04X}", c as u32));
487                    } else {
488                        result.push_str(&format!("\\U{:08X}", c as u32));
489                    }
490                }
491                c => result.push(c),
492            }
493        }
494        result
495    }
496}
497
#[cfg(test)]
mod tests {
    use super::char_utils::*;
    use super::string_utils::*;
    use super::*;

    /// Walks "hello\nworld" and checks the cursor and line/column tracking
    /// before, on, and after the newline.
    #[test]
    fn test_string_buffer() {
        let mut buf = StringBuffer::new(String::from("hello\nworld"));

        assert_eq!(buf.current(), Some('h'));
        assert_eq!(buf.peek(), Some('e'));

        buf.advance();
        assert_eq!(buf.current(), Some('e'));
        assert_eq!(buf.position().column, 2);

        // Four more steps land on the newline that ends the first line.
        (0..4).for_each(|_| {
            buf.advance();
        });
        assert_eq!(buf.current(), Some('\n'));
        {
            let pos = buf.position();
            assert_eq!(pos.line, 1);
            assert_eq!(pos.column, 6);
        }

        // Stepping over the newline starts line 2 at column 1.
        buf.advance();
        assert_eq!(buf.current(), Some('w'));
        {
            let pos = buf.position();
            assert_eq!(pos.line, 2);
            assert_eq!(pos.column, 1);
        }
    }

    /// Spot-checks each character classifier with accepted and rejected input.
    #[test]
    fn test_char_classification() {
        for ws in [' ', '\t', '\n'] {
            assert!(is_whitespace(ws));
        }
        assert!(!is_whitespace('a'));

        assert!(is_iri_start('<'));
        assert!(!is_iri_start('a'));

        for base in ['A', 'z'] {
            assert!(is_pn_chars_base(base));
        }
        assert!(!is_pn_chars_base('1'));

        for inner in ['A', '1', '-'] {
            assert!(is_pn_chars(inner));
        }

        for numeric in ['1', '+', '.'] {
            assert!(is_numeric_start(numeric));
        }
        assert!(!is_numeric_start('a'));
    }

    /// Covers short escapes, both Unicode escape widths, and escaping for
    /// output.
    #[test]
    fn test_string_escaping() {
        let position = TextPosition::start();
        let unescape =
            |raw: &str| unescape_string(raw, &position).expect("unescape should succeed");

        // Basic escapes
        assert_eq!(unescape("hello\\nworld"), "hello\nworld");
        assert_eq!(unescape("say \\\"hello\\\""), "say \"hello\"");

        // Unicode escapes
        assert_eq!(unescape("\\u0041"), "A");
        assert_eq!(unescape("\\U00000041"), "A");

        // Escaping for output
        assert_eq!(escape_string("hello\nworld"), "hello\\nworld");
        assert_eq!(escape_string("say \"hello\""), "say \\\"hello\\\"");
    }
}