// oxirs_core/sparql/parser_combinators.rs
//! Parser combinators for SPARQL token stream processing
//!
//! Provides a simple, composable parser combinator library for processing
//! SPARQL token streams. Supports keywords, IRIs, variables, literals,
//! punctuation, optional matches, zero-or-more repetitions, and ordered choice.

/// Classification of a lexed SPARQL token.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    /// SPARQL keyword such as SELECT, WHERE, or PREFIX
    Keyword,
    /// Full IRI enclosed in angle brackets: <http://…>
    Iri,
    /// Prefixed name of the form prefix:local
    PrefixedName,
    /// Variable introduced by ? or $ (e.g. ?name)
    Variable,
    /// Literal value: "text", 42, 3.14, true
    Literal,
    /// Punctuation or operator: { } ( ) , ; . and similar
    Punctuation,
    /// Run of whitespace (spaces, tabs, newlines)
    Whitespace,
    /// Line comment starting with '#'
    Comment,
    /// Sentinel marking the end of the token stream
    Eof,
}
29
30/// A single SPARQL token
31#[derive(Debug, Clone, PartialEq, Eq)]
32pub struct Token {
33    /// The kind of this token
34    pub kind: TokenKind,
35    /// The raw text value of the token
36    pub value: String,
37    /// Byte offset of the token's first character in the source
38    pub position: usize,
39}
40
41impl Token {
42    /// Create a new token
43    pub fn new(kind: TokenKind, value: impl Into<String>, position: usize) -> Self {
44        Token {
45            kind,
46            value: value.into(),
47            position,
48        }
49    }
50}
51
/// An immutable, cloneable view of a token stream with a cursor position.
///
/// Combinators thread ownership of the stream: consuming a token returns an
/// advanced copy rather than mutating in place, so backtracking is simply a
/// matter of keeping a clone taken before the attempt.
#[derive(Debug, Clone)]
pub struct TokenStream {
    // Backing token buffer; the tokenizer appends an Eof sentinel at the end.
    tokens: Vec<Token>,
    // Cursor: index of the next token to be consumed.
    pos: usize,
}
58
/// Result type for parser combinators: on success, the parsed value plus the
/// advanced stream; on failure, a `ParseError` describing what went wrong.
pub type ParseResult<T> = Result<(T, TokenStream), ParseError>;
61
/// Error produced when a combinator fails.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseError {
    /// Human-readable description of the failure
    pub message: String,
    /// Byte position in the source where the error occurred
    pub position: usize,
}

impl ParseError {
    /// Construct a new parse error from a message and a source position.
    pub fn new(message: impl Into<String>, position: usize) -> Self {
        Self {
            message: message.into(),
            position,
        }
    }
}

impl std::fmt::Display for ParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "parse error at {}: {}", self.position, self.message)
    }
}

impl std::error::Error for ParseError {}
88
// ─── TokenStream ─────────────────────────────────────────────────────────────
90
91impl TokenStream {
92    /// Create a new token stream from a vector of tokens
93    pub fn new(tokens: Vec<Token>) -> Self {
94        TokenStream { tokens, pos: 0 }
95    }
96
97    /// Peek at the current token without advancing
98    pub fn peek(&self) -> Option<&Token> {
99        self.tokens.get(self.pos)
100    }
101
102    /// Consume the current token, returning it and the advanced stream
103    pub fn next(mut self) -> (Option<Token>, TokenStream) {
104        if self.pos < self.tokens.len() {
105            let tok = self.tokens[self.pos].clone();
106            self.pos += 1;
107            (Some(tok), self)
108        } else {
109            (None, self)
110        }
111    }
112
113    /// Return true if no tokens remain (excluding Eof sentinel)
114    pub fn is_empty(&self) -> bool {
115        self.remaining() == 0
116    }
117
118    /// Number of tokens remaining (excluding any trailing Eof token)
119    pub fn remaining(&self) -> usize {
120        let total = self.tokens.len();
121        if total == 0 {
122            return 0;
123        }
124        // Count non-EOF tokens from current pos
125        let remaining_tokens = &self.tokens[self.pos..];
126        remaining_tokens
127            .iter()
128            .filter(|t| t.kind != TokenKind::Eof)
129            .count()
130    }
131
132    /// Current cursor position (index into token slice)
133    pub fn position(&self) -> usize {
134        self.pos
135    }
136
137    /// Return the byte offset of the current token (or end-of-source)
138    pub fn byte_offset(&self) -> usize {
139        self.tokens.get(self.pos).map(|t| t.position).unwrap_or(0)
140    }
141}
142
// ─── Combinator functions ─────────────────────────────────────────────────────
144
145/// Expect the next token to be a keyword matching `keyword` (case-insensitive)
146pub fn expect_keyword(stream: TokenStream, keyword: &str) -> ParseResult<()> {
147    match stream.peek() {
148        Some(tok) if tok.kind == TokenKind::Keyword && tok.value.eq_ignore_ascii_case(keyword) => {
149            let (_, rest) = stream.next();
150            Ok(((), rest))
151        }
152        Some(tok) => Err(ParseError::new(
153            format!(
154                "expected keyword '{}', found {:?} '{}'",
155                keyword, tok.kind, tok.value
156            ),
157            tok.position,
158        )),
159        None => Err(ParseError::new(
160            format!("expected keyword '{}', reached end of stream", keyword),
161            0,
162        )),
163    }
164}
165
166/// Expect the next token to be an IRI; returns the IRI string (without angle brackets)
167pub fn expect_iri(stream: TokenStream) -> ParseResult<String> {
168    match stream.peek() {
169        Some(tok) if tok.kind == TokenKind::Iri => {
170            let value = tok.value.clone();
171            let (_, rest) = stream.next();
172            Ok((value, rest))
173        }
174        Some(tok) if tok.kind == TokenKind::PrefixedName => {
175            let value = tok.value.clone();
176            let (_, rest) = stream.next();
177            Ok((value, rest))
178        }
179        Some(tok) => Err(ParseError::new(
180            format!("expected IRI, found {:?} '{}'", tok.kind, tok.value),
181            tok.position,
182        )),
183        None => Err(ParseError::new("expected IRI, reached end of stream", 0)),
184    }
185}
186
187/// Expect the next token to be a variable; returns the variable name (without ? or $)
188pub fn expect_variable(stream: TokenStream) -> ParseResult<String> {
189    match stream.peek() {
190        Some(tok) if tok.kind == TokenKind::Variable => {
191            let value = tok.value.clone();
192            let (_, rest) = stream.next();
193            Ok((value, rest))
194        }
195        Some(tok) => Err(ParseError::new(
196            format!("expected variable, found {:?} '{}'", tok.kind, tok.value),
197            tok.position,
198        )),
199        None => Err(ParseError::new(
200            "expected variable, reached end of stream",
201            0,
202        )),
203    }
204}
205
206/// Try to apply `f`; on failure, return `None` without consuming any tokens
207pub fn optional<T, F>(stream: TokenStream, f: F) -> ParseResult<Option<T>>
208where
209    F: Fn(TokenStream) -> ParseResult<T>,
210{
211    let snapshot = stream.clone();
212    match f(stream) {
213        Ok((value, rest)) => Ok((Some(value), rest)),
214        Err(_) => Ok((None, snapshot)),
215    }
216}
217
218/// Apply `f` repeatedly until it fails; return all successful results
219/// The stream is advanced only for successful applications
220pub fn many0<T, F>(stream: TokenStream, f: F) -> ParseResult<Vec<T>>
221where
222    F: Fn(TokenStream) -> ParseResult<T>,
223{
224    let mut results = Vec::new();
225    let mut current = stream;
226    loop {
227        let snapshot = current.clone();
228        match f(current) {
229            Ok((value, rest)) => {
230                results.push(value);
231                current = rest;
232            }
233            Err(_) => {
234                current = snapshot;
235                break;
236            }
237        }
238    }
239    Ok((results, current))
240}
241
242/// Try each parser in order; return the first success (backtracking on failure)
243pub fn choice<T>(
244    stream: TokenStream,
245    parsers: Vec<Box<dyn Fn(TokenStream) -> ParseResult<T>>>,
246) -> ParseResult<T> {
247    let mut last_err = ParseError::new("no alternatives in choice", stream.byte_offset());
248    for parser in &parsers {
249        let snapshot = stream.clone();
250        match parser(snapshot) {
251            Ok(result) => return Ok(result),
252            Err(e) => last_err = e,
253        }
254    }
255    Err(last_err)
256}
257
// ─── Tokenizer ───────────────────────────────────────────────────────────────
259
/// Known SPARQL keywords (subset used in SPARQL 1.1/1.2)
const SPARQL_KEYWORDS: &[&str] = &[
    // Query forms and prologue
    "BASE", "PREFIX", "SELECT", "DISTINCT", "REDUCED", "CONSTRUCT", "DESCRIBE", "ASK",
    // Dataset clauses and solution modifiers
    "FROM", "NAMED", "WHERE", "ORDER", "BY", "ASC", "DESC", "LIMIT", "OFFSET", "HAVING",
    "GROUP",
    // Graph pattern operators
    "UNION", "OPTIONAL", "MINUS", "GRAPH", "SERVICE", "BIND", "VALUES", "FILTER", "EXISTS",
    "NOT", "IN", "AS", "SEPARATOR",
    // Aggregates and built-in functions
    "COUNT", "SUM", "MIN", "MAX", "AVG", "SAMPLE", "REGEX", "LANG", "DATATYPE", "IRI", "URI",
    "BNODE", "STR", "STRDT", "STRLANG", "TRUE", "FALSE", "UNDEF",
    // Update operations
    "LOAD", "CLEAR", "DROP", "CREATE", "ADD", "MOVE", "COPY", "INSERT", "DELETE", "WITH",
    "USING", "DATA", "INTO", "ALL", "DEFAULT", "SILENT", "UPDATE", "SPARQL",
];
331
/// Tokenizer for SPARQL source text.
///
/// Stateless: both entry points are associated functions, so this unit
/// struct exists purely as a namespace.
pub struct Tokenizer;
334
impl Tokenizer {
    /// Tokenize the full input, including whitespace and comment tokens.
    ///
    /// Always appends a trailing `Eof` sentinel token. Fails with a
    /// `ParseError` on an unterminated IRI, an unterminated string literal,
    /// or any character that matches no token class.
    ///
    /// NOTE(review): token positions are indices into the `chars()` sequence,
    /// but the Eof sentinel uses `input.len()` (a byte count); these disagree
    /// for non-ASCII input — confirm which unit callers expect.
    pub fn tokenize(input: &str) -> Result<Vec<Token>, ParseError> {
        let mut tokens = Vec::new();
        let chars: Vec<char> = input.chars().collect();
        let mut i = 0;

        while i < chars.len() {
            let start = i;
            let ch = chars[i];

            // Whitespace: collect a maximal run into one token.
            if ch.is_whitespace() {
                let mut end = i;
                while end < chars.len() && chars[end].is_whitespace() {
                    end += 1;
                }
                let value: String = chars[start..end].iter().collect();
                tokens.push(Token::new(TokenKind::Whitespace, value, start));
                i = end;
                continue;
            }

            // Comment: '#' through end of line (the newline itself is not
            // part of the comment token).
            if ch == '#' {
                let mut end = i;
                while end < chars.len() && chars[end] != '\n' {
                    end += 1;
                }
                let value: String = chars[start..end].iter().collect();
                tokens.push(Token::new(TokenKind::Comment, value, start));
                i = end;
                continue;
            }

            // IRI: <…>  (but not <= which is a comparison operator)
            if ch == '<' && !(i + 1 < chars.len() && chars[i + 1] == '=') {
                let mut end = i + 1;
                while end < chars.len() && chars[end] != '>' {
                    // A newline or space inside the brackets means the '>'
                    // was forgotten; report here rather than scanning on.
                    if chars[end] == '\n' || chars[end] == ' ' {
                        return Err(ParseError::new(
                            "unterminated IRI: unexpected whitespace inside angle brackets",
                            start,
                        ));
                    }
                    end += 1;
                }
                if end >= chars.len() {
                    return Err(ParseError::new("unterminated IRI: missing '>'", start));
                }
                end += 1; // consume '>'
                let value: String = chars[start..end].iter().collect();
                tokens.push(Token::new(TokenKind::Iri, value, start));
                i = end;
                continue;
            }

            // String literal: "…" or '…', including triple-quoted forms.
            if ch == '"' || ch == '\'' {
                let quote = ch;
                // Check for triple-quoted
                let triple = i + 2 < chars.len() && chars[i + 1] == quote && chars[i + 2] == quote;
                let (delim_len, close_seq): (usize, Vec<char>) = if triple {
                    (3, vec![quote, quote, quote])
                } else {
                    (1, vec![quote])
                };
                let mut end = i + delim_len;
                loop {
                    if end + close_seq.len() > chars.len() {
                        return Err(ParseError::new("unterminated string literal", start));
                    }
                    let window: Vec<char> = chars[end..end + close_seq.len()].to_vec();
                    if window == close_seq {
                        end += close_seq.len();
                        break;
                    }
                    if chars[end] == '\\' {
                        end += 2; // skip escape
                    } else {
                        end += 1;
                    }
                }
                // Optional language tag (@en) or datatype (^^…) is folded
                // into the same Literal token.
                if end < chars.len() && chars[end] == '@' {
                    end += 1;
                    while end < chars.len() && (chars[end].is_alphanumeric() || chars[end] == '-') {
                        end += 1;
                    }
                } else if end + 1 < chars.len() && chars[end] == '^' && chars[end + 1] == '^' {
                    end += 2;
                    if end < chars.len() && chars[end] == '<' {
                        // Datatype given as a full IRI.
                        while end < chars.len() && chars[end] != '>' {
                            end += 1;
                        }
                        if end < chars.len() {
                            end += 1;
                        }
                    } else {
                        // prefixed datatype
                        while end < chars.len()
                            && (chars[end].is_alphanumeric()
                                || chars[end] == ':'
                                || chars[end] == '_')
                        {
                            end += 1;
                        }
                    }
                }
                let value: String = chars[start..end].iter().collect();
                tokens.push(Token::new(TokenKind::Literal, value, start));
                i = end;
                continue;
            }

            // Variable: ?name or $name (sigil kept in the token value)
            if ch == '?' || ch == '$' {
                let mut end = i + 1;
                while end < chars.len() && (chars[end].is_alphanumeric() || chars[end] == '_') {
                    end += 1;
                }
                let value: String = chars[start..end].iter().collect();
                tokens.push(Token::new(TokenKind::Variable, value, start));
                i = end;
                continue;
            }

            // Numeric literal: optional leading '-', digits, optional
            // fraction and exponent.
            if ch.is_ascii_digit()
                || (ch == '-' && i + 1 < chars.len() && chars[i + 1].is_ascii_digit())
            {
                let mut end = i;
                if chars[end] == '-' {
                    end += 1;
                }
                while end < chars.len() && chars[end].is_ascii_digit() {
                    end += 1;
                }
                if end < chars.len() && chars[end] == '.' {
                    end += 1;
                    while end < chars.len() && chars[end].is_ascii_digit() {
                        end += 1;
                    }
                }
                // Optional exponent
                if end < chars.len() && (chars[end] == 'e' || chars[end] == 'E') {
                    end += 1;
                    if end < chars.len() && (chars[end] == '+' || chars[end] == '-') {
                        end += 1;
                    }
                    while end < chars.len() && chars[end].is_ascii_digit() {
                        end += 1;
                    }
                }
                let value: String = chars[start..end].iter().collect();
                tokens.push(Token::new(TokenKind::Literal, value, start));
                i = end;
                continue;
            }

            // Keyword or prefixed name or bare identifier
            if ch.is_alphabetic() || ch == '_' {
                let mut end = i;
                while end < chars.len()
                    && (chars[end].is_alphanumeric() || chars[end] == '_' || chars[end] == '-')
                {
                    end += 1;
                }
                let word: String = chars[start..end].iter().collect();

                // Check for prefix:local
                if end < chars.len() && chars[end] == ':' {
                    end += 1; // consume ':'
                              // local part (may be empty)
                    while end < chars.len()
                        && (chars[end].is_alphanumeric()
                            || chars[end] == '_'
                            || chars[end] == '-'
                            || chars[end] == '.')
                    {
                        end += 1;
                    }
                    let full: String = chars[start..end].iter().collect();
                    tokens.push(Token::new(TokenKind::PrefixedName, full, start));
                    i = end;
                    continue;
                }

                // Check keyword (case-insensitive); the original casing of
                // the source is preserved in the token value.
                if SPARQL_KEYWORDS
                    .iter()
                    .any(|kw| kw.eq_ignore_ascii_case(&word))
                {
                    tokens.push(Token::new(TokenKind::Keyword, word, start));
                } else {
                    // bare identifier — treat as literal for simplicity
                    tokens.push(Token::new(TokenKind::Literal, word, start));
                }
                i = end;
                continue;
            }

            // Punctuation and operators
            let punct_chars: &[char] = &[
                '{', '}', '(', ')', '[', ']', '.', ',', ';', '|', '/', '^', '+', '*', '!', '=',
                '<', '>', '&', '@',
            ];
            if punct_chars.contains(&ch) {
                // Handle two-character operators before falling back to a
                // single punctuation character.
                let two: String = if i + 1 < chars.len() {
                    chars[i..i + 2].iter().collect()
                } else {
                    String::new()
                };
                if matches!(two.as_str(), "!=" | "<=" | ">=" | "&&" | "||" | "^^") {
                    tokens.push(Token::new(TokenKind::Punctuation, two, start));
                    i += 2;
                } else {
                    tokens.push(Token::new(TokenKind::Punctuation, ch.to_string(), start));
                    i += 1;
                }
                continue;
            }

            return Err(ParseError::new(
                format!("unexpected character '{}'", ch),
                start,
            ));
        }

        tokens.push(Token::new(TokenKind::Eof, "", input.len()));
        Ok(tokens)
    }

    /// Tokenize and strip whitespace and comment tokens.
    ///
    /// The Eof sentinel is retained, so combinators always see a final token
    /// whose position marks the end of the source.
    pub fn tokenize_filtered(input: &str) -> Result<Vec<Token>, ParseError> {
        let tokens = Self::tokenize(input)?;
        Ok(tokens
            .into_iter()
            .filter(|t| t.kind != TokenKind::Whitespace && t.kind != TokenKind::Comment)
            .collect())
    }
}
578
// ─── Tests ────────────────────────────────────────────────────────────────────
580
581#[cfg(test)]
582mod tests {
583    use super::*;
584
    // ── Tokenizer tests ──────────────────────────────────────────────────────

    #[test]
    fn test_tokenize_keyword_select() {
        let tokens = Tokenizer::tokenize("SELECT").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::Keyword);
        assert_eq!(tokens[0].value, "SELECT");
    }

    #[test]
    fn test_tokenize_keyword_case_insensitive() {
        // Lowercase keywords are still classified as Keyword.
        let tokens = Tokenizer::tokenize("select").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::Keyword);
    }

    #[test]
    fn test_tokenize_keyword_where() {
        let tokens = Tokenizer::tokenize("WHERE").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::Keyword);
        assert_eq!(tokens[0].value, "WHERE");
    }

    #[test]
    fn test_tokenize_keyword_prefix() {
        let tokens = Tokenizer::tokenize("PREFIX").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::Keyword);
    }

    #[test]
    fn test_tokenize_keyword_optional() {
        let tokens = Tokenizer::tokenize("OPTIONAL").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::Keyword);
    }

    #[test]
    fn test_tokenize_iri() {
        // Angle brackets are kept in the token value.
        let tokens = Tokenizer::tokenize("<http://example.org/foo>").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::Iri);
        assert_eq!(tokens[0].value, "<http://example.org/foo>");
        assert_eq!(tokens[0].position, 0);
    }

    #[test]
    fn test_tokenize_iri_position() {
        // Position accounts for the two leading spaces.
        let tokens = Tokenizer::tokenize("  <http://example.org/>").expect("valid SPARQL input");
        let iri = tokens
            .iter()
            .find(|t| t.kind == TokenKind::Iri)
            .expect("should find element");
        assert_eq!(iri.position, 2);
    }

    #[test]
    fn test_tokenize_variable_question_mark() {
        // The '?' sigil stays in the token value.
        let tokens = Tokenizer::tokenize("?name").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::Variable);
        assert_eq!(tokens[0].value, "?name");
    }

    #[test]
    fn test_tokenize_variable_dollar() {
        let tokens = Tokenizer::tokenize("$subject").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::Variable);
        assert_eq!(tokens[0].value, "$subject");
    }

    #[test]
    fn test_tokenize_string_literal_double_quote() {
        // Quotes are kept in the token value.
        let tokens = Tokenizer::tokenize("\"hello\"").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::Literal);
        assert_eq!(tokens[0].value, "\"hello\"");
    }

    #[test]
    fn test_tokenize_string_literal_single_quote() {
        let tokens = Tokenizer::tokenize("'world'").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::Literal);
    }

    #[test]
    fn test_tokenize_numeric_literal_integer() {
        let tokens = Tokenizer::tokenize("42").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::Literal);
        assert_eq!(tokens[0].value, "42");
    }

    #[test]
    fn test_tokenize_numeric_literal_float() {
        let tokens = Tokenizer::tokenize("3.14").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::Literal);
        assert_eq!(tokens[0].value, "3.14");
    }

    #[test]
    fn test_tokenize_prefixed_name() {
        let tokens = Tokenizer::tokenize("rdf:type").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::PrefixedName);
        assert_eq!(tokens[0].value, "rdf:type");
    }

    #[test]
    fn test_tokenize_prefixed_name_empty_local() {
        // A bare "prefix:" (empty local part) is still a prefixed name.
        let tokens = Tokenizer::tokenize("ex:").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::PrefixedName);
    }

    #[test]
    fn test_tokenize_punctuation_brace() {
        let tokens = Tokenizer::tokenize("{").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::Punctuation);
        assert_eq!(tokens[0].value, "{");
    }

    #[test]
    fn test_tokenize_punctuation_dot() {
        let tokens = Tokenizer::tokenize(".").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::Punctuation);
        assert_eq!(tokens[0].value, ".");
    }

    #[test]
    fn test_tokenize_whitespace() {
        let tokens = Tokenizer::tokenize("   ").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::Whitespace);
    }

    #[test]
    fn test_tokenize_comment() {
        let tokens = Tokenizer::tokenize("# this is a comment\n").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::Comment);
        assert!(tokens[0].value.starts_with('#'));
    }

    #[test]
    fn test_tokenize_eof_appended() {
        // tokenize always appends an Eof sentinel as the final token.
        let tokens = Tokenizer::tokenize("SELECT").expect("valid SPARQL input");
        assert_eq!(
            tokens.last().expect("collection should not be empty").kind,
            TokenKind::Eof
        );
    }

    #[test]
    fn test_tokenize_multiple_tokens() {
        let tokens =
            Tokenizer::tokenize_filtered("SELECT ?x WHERE { ?x rdf:type <http://a.org/A> }")
                .expect("operation should succeed");
        let kinds: Vec<&TokenKind> = tokens.iter().map(|t| &t.kind).collect();
        assert!(kinds.contains(&&TokenKind::Keyword));
        assert!(kinds.contains(&&TokenKind::Variable));
        assert!(kinds.contains(&&TokenKind::PrefixedName));
        assert!(kinds.contains(&&TokenKind::Iri));
        assert!(kinds.contains(&&TokenKind::Punctuation));
    }

    #[test]
    fn test_tokenize_filtered_removes_whitespace() {
        let all = Tokenizer::tokenize("SELECT ?x").expect("valid SPARQL input");
        let filtered = Tokenizer::tokenize_filtered("SELECT ?x").expect("valid SPARQL input");
        assert!(all.len() > filtered.len());
        assert!(!filtered.iter().any(|t| t.kind == TokenKind::Whitespace));
    }

    #[test]
    fn test_tokenize_filtered_removes_comments() {
        let filtered =
            Tokenizer::tokenize_filtered("SELECT # comment\n?x").expect("valid SPARQL input");
        assert!(!filtered.iter().any(|t| t.kind == TokenKind::Comment));
    }

    #[test]
    fn test_tokenize_string_with_language_tag() {
        // The language tag is folded into the same Literal token.
        let tokens = Tokenizer::tokenize("\"hello\"@en").expect("valid SPARQL input");
        assert_eq!(tokens[0].kind, TokenKind::Literal);
        assert!(tokens[0].value.contains("@en"));
    }

    #[test]
    fn test_tokenize_unterminated_iri_error() {
        let result = Tokenizer::tokenize("<http://unclosed");
        assert!(result.is_err());
    }
767
    // ── TokenStream tests ────────────────────────────────────────────────────

    #[test]
    fn test_stream_peek_first_token() {
        let tokens = Tokenizer::tokenize_filtered("SELECT").expect("valid SPARQL input");
        let stream = TokenStream::new(tokens);
        let tok = stream.peek().expect("stream should have tokens");
        assert_eq!(tok.kind, TokenKind::Keyword);
    }

    #[test]
    fn test_stream_peek_empty() {
        let stream = TokenStream::new(vec![]);
        assert!(stream.peek().is_none());
    }

    #[test]
    fn test_stream_next_advances() {
        let tokens = Tokenizer::tokenize_filtered("SELECT ?x").expect("valid SPARQL input");
        let stream = TokenStream::new(tokens);
        let (tok, rest) = stream.next();
        assert!(tok.is_some());
        assert_eq!(tok.expect("should have value").kind, TokenKind::Keyword);
        let (tok2, _) = rest.next();
        assert_eq!(
            tok2.expect("should have second token").kind,
            TokenKind::Variable
        );
    }

    #[test]
    fn test_stream_remaining_count() {
        let tokens = Tokenizer::tokenize_filtered("SELECT ?x WHERE").expect("valid SPARQL input");
        let stream = TokenStream::new(tokens);
        // SELECT ?x WHERE + EOF = 4, but remaining excludes EOF
        assert_eq!(stream.remaining(), 3);
    }

    #[test]
    fn test_stream_is_empty_after_consuming_all() {
        let tokens = Tokenizer::tokenize_filtered("SELECT").expect("valid SPARQL input");
        let stream = TokenStream::new(tokens);
        let (_, rest) = stream.next(); // consume SELECT
        let (_, rest2) = rest.next(); // consume EOF
        assert!(rest2.is_empty());
    }

    #[test]
    fn test_stream_position_zero_initially() {
        let tokens = Tokenizer::tokenize_filtered("WHERE").expect("valid SPARQL input");
        let stream = TokenStream::new(tokens);
        assert_eq!(stream.position(), 0);
    }

    #[test]
    fn test_stream_position_advances() {
        let tokens = Tokenizer::tokenize_filtered("SELECT ?x").expect("valid SPARQL input");
        let stream = TokenStream::new(tokens);
        let (_, rest) = stream.next();
        assert_eq!(rest.position(), 1);
    }
829
    // ── expect_keyword tests ─────────────────────────────────────────────────

    #[test]
    fn test_expect_keyword_success() {
        let tokens = Tokenizer::tokenize_filtered("SELECT").expect("valid SPARQL input");
        let stream = TokenStream::new(tokens);
        let result = expect_keyword(stream, "SELECT");
        assert!(result.is_ok());
    }

    #[test]
    fn test_expect_keyword_case_insensitive() {
        // Lowercase source keyword still matches the uppercase expectation.
        let tokens = Tokenizer::tokenize_filtered("select").expect("valid SPARQL input");
        let stream = TokenStream::new(tokens);
        assert!(expect_keyword(stream, "SELECT").is_ok());
    }

    #[test]
    fn test_expect_keyword_wrong_keyword() {
        let tokens = Tokenizer::tokenize_filtered("WHERE").expect("valid SPARQL input");
        let stream = TokenStream::new(tokens);
        let result = expect_keyword(stream, "SELECT");
        assert!(result.is_err());
        // Error message names the keyword that was expected.
        assert!(result.unwrap_err().message.contains("SELECT"));
    }

    #[test]
    fn test_expect_keyword_not_a_keyword() {
        let tokens = Tokenizer::tokenize_filtered("?x").expect("valid SPARQL input");
        let stream = TokenStream::new(tokens);
        let result = expect_keyword(stream, "SELECT");
        assert!(result.is_err());
    }

    #[test]
    fn test_expect_keyword_consumes_token() {
        // The returned stream is advanced past the matched keyword.
        let tokens = Tokenizer::tokenize_filtered("SELECT WHERE").expect("valid SPARQL input");
        let stream = TokenStream::new(tokens);
        let (_, rest) = expect_keyword(stream, "SELECT").expect("keyword parse should succeed");
        assert!(expect_keyword(rest, "WHERE").is_ok());
    }
871
    // ── expect_iri tests ─────────────────────────────────────────────────────

    #[test]
    fn test_expect_iri_success() {
        let tokens =
            Tokenizer::tokenize_filtered("<http://example.org/>").expect("valid SPARQL input");
        let stream = TokenStream::new(tokens);
        let result = expect_iri(stream);
        assert!(result.is_ok());
        // The returned value keeps the angle brackets.
        assert_eq!(
            result.expect("should have value").0,
            "<http://example.org/>"
        );
    }

    #[test]
    fn test_expect_iri_prefixed_name() {
        // Prefixed names are accepted wherever an IRI is expected.
        let tokens = Tokenizer::tokenize_filtered("rdf:type").expect("valid SPARQL input");
        let stream = TokenStream::new(tokens);
        let result = expect_iri(stream);
        assert!(result.is_ok());
    }

    #[test]
    fn test_expect_iri_failure_on_variable() {
        let tokens = Tokenizer::tokenize_filtered("?x").expect("valid SPARQL input");
        let stream = TokenStream::new(tokens);
        let result = expect_iri(stream);
        assert!(result.is_err());
    }
902
903    // ── expect_variable tests ────────────────────────────────────────────────
904
905    #[test]
906    fn test_expect_variable_success() {
907        let tokens = Tokenizer::tokenize_filtered("?subject").expect("valid SPARQL input");
908        let stream = TokenStream::new(tokens);
909        let result = expect_variable(stream);
910        assert!(result.is_ok());
911        assert_eq!(result.expect("should have value").0, "?subject");
912    }
913
914    #[test]
915    fn test_expect_variable_dollar_prefix() {
916        let tokens = Tokenizer::tokenize_filtered("$pred").expect("valid SPARQL input");
917        let stream = TokenStream::new(tokens);
918        let result = expect_variable(stream);
919        assert!(result.is_ok());
920        assert_eq!(result.expect("should have value").0, "$pred");
921    }
922
923    #[test]
924    fn test_expect_variable_failure_on_keyword() {
925        let tokens = Tokenizer::tokenize_filtered("SELECT").expect("valid SPARQL input");
926        let stream = TokenStream::new(tokens);
927        let result = expect_variable(stream);
928        assert!(result.is_err());
929    }
930
931    // ── optional tests ───────────────────────────────────────────────────────
932
933    #[test]
934    fn test_optional_hit() {
935        let tokens = Tokenizer::tokenize_filtered("SELECT").expect("valid SPARQL input");
936        let stream = TokenStream::new(tokens);
937        let (result, _) = optional(stream, |s| expect_keyword(s, "SELECT"))
938            .expect("optional parse should succeed");
939        assert!(result.is_some());
940    }
941
942    #[test]
943    fn test_optional_miss_returns_none() {
944        let tokens = Tokenizer::tokenize_filtered("WHERE").expect("valid SPARQL input");
945        let stream = TokenStream::new(tokens);
946        let (result, rest) = optional(stream, |s| expect_keyword(s, "SELECT"))
947            .expect("optional parse should succeed");
948        assert!(result.is_none());
949        // Stream should not have advanced
950        assert_eq!(rest.position(), 0);
951    }
952
953    #[test]
954    fn test_optional_miss_does_not_advance_stream() {
955        let tokens = Tokenizer::tokenize_filtered("?x").expect("valid SPARQL input");
956        let stream = TokenStream::new(tokens);
957        let pos_before = stream.position();
958        let (_, rest) = optional(stream, |s| expect_keyword(s, "SELECT"))
959            .expect("optional parse should succeed");
960        assert_eq!(rest.position(), pos_before);
961    }
962
963    // ── many0 tests ──────────────────────────────────────────────────────────
964
965    #[test]
966    fn test_many0_zero_matches() {
967        let tokens = Tokenizer::tokenize_filtered("WHERE").expect("valid SPARQL input");
968        let stream = TokenStream::new(tokens);
969        let (results, rest) = many0(stream, |s| expect_keyword(s, "SELECT"))
970            .expect("repetition parse should succeed");
971        assert_eq!(results.len(), 0);
972        assert_eq!(rest.position(), 0);
973    }
974
975    #[test]
976    fn test_many0_one_match() {
977        let tokens = Tokenizer::tokenize_filtered("SELECT WHERE").expect("valid SPARQL input");
978        let stream = TokenStream::new(tokens);
979        let (results, _) = many0(stream, |s| expect_keyword(s, "SELECT"))
980            .expect("repetition parse should succeed");
981        assert_eq!(results.len(), 1);
982    }
983
984    #[test]
985    fn test_many0_multiple_matches() {
986        let tokens =
987            Tokenizer::tokenize_filtered("SELECT SELECT SELECT WHERE").expect("valid SPARQL input");
988        let stream = TokenStream::new(tokens);
989        let (results, rest) = many0(stream, |s| expect_keyword(s, "SELECT"))
990            .expect("repetition parse should succeed");
991        assert_eq!(results.len(), 3);
992        // remaining should show WHERE + EOF
993        assert!(rest.remaining() >= 1);
994    }
995
996    #[test]
997    fn test_many0_variables() {
998        let tokens = Tokenizer::tokenize_filtered("?a ?b ?c WHERE").expect("valid SPARQL input");
999        let stream = TokenStream::new(tokens);
1000        let (vars, _) = many0(stream, expect_variable).expect("repetition parse should succeed");
1001        assert_eq!(vars.len(), 3);
1002        assert_eq!(vars[0], "?a");
1003        assert_eq!(vars[1], "?b");
1004        assert_eq!(vars[2], "?c");
1005    }
1006
1007    // ── choice tests ─────────────────────────────────────────────────────────
1008
1009    #[test]
1010    fn test_choice_first_alternative() {
1011        let tokens = Tokenizer::tokenize_filtered("SELECT").expect("valid SPARQL input");
1012        let stream = TokenStream::new(tokens);
1013        let parsers: Vec<Box<dyn Fn(TokenStream) -> ParseResult<&'static str>>> = vec![
1014            Box::new(|s| expect_keyword(s, "SELECT").map(|(_, r)| ("SELECT", r))),
1015            Box::new(|s| expect_keyword(s, "ASK").map(|(_, r)| ("ASK", r))),
1016        ];
1017        let (result, _) = choice(stream, parsers).expect("choice parse should succeed");
1018        assert_eq!(result, "SELECT");
1019    }
1020
1021    #[test]
1022    fn test_choice_second_alternative() {
1023        let tokens = Tokenizer::tokenize_filtered("ASK").expect("valid SPARQL input");
1024        let stream = TokenStream::new(tokens);
1025        let parsers: Vec<Box<dyn Fn(TokenStream) -> ParseResult<&'static str>>> = vec![
1026            Box::new(|s| expect_keyword(s, "SELECT").map(|(_, r)| ("SELECT", r))),
1027            Box::new(|s| expect_keyword(s, "ASK").map(|(_, r)| ("ASK", r))),
1028        ];
1029        let (result, _) = choice(stream, parsers).expect("choice parse should succeed");
1030        assert_eq!(result, "ASK");
1031    }
1032
1033    #[test]
1034    fn test_choice_no_match_returns_error() {
1035        let tokens = Tokenizer::tokenize_filtered("WHERE").expect("valid SPARQL input");
1036        let stream = TokenStream::new(tokens);
1037        let parsers: Vec<Box<dyn Fn(TokenStream) -> ParseResult<&'static str>>> = vec![
1038            Box::new(|s| expect_keyword(s, "SELECT").map(|(_, r)| ("SELECT", r))),
1039            Box::new(|s| expect_keyword(s, "ASK").map(|(_, r)| ("ASK", r))),
1040        ];
1041        assert!(choice(stream, parsers).is_err());
1042    }
1043
1044    #[test]
1045    fn test_choice_empty_parsers_returns_error() {
1046        let tokens = Tokenizer::tokenize_filtered("SELECT").expect("valid SPARQL input");
1047        let stream = TokenStream::new(tokens);
1048        let parsers: Vec<Box<dyn Fn(TokenStream) -> ParseResult<String>>> = vec![];
1049        assert!(choice(stream, parsers).is_err());
1050    }
1051
1052    // ── ParseError tests ─────────────────────────────────────────────────────
1053
1054    #[test]
1055    fn test_parse_error_position() {
1056        let tokens = Tokenizer::tokenize_filtered("?x").expect("valid SPARQL input");
1057        let stream = TokenStream::new(tokens);
1058        let err = expect_keyword(stream, "SELECT").unwrap_err();
1059        assert_eq!(err.position, 0);
1060    }
1061
1062    #[test]
1063    fn test_parse_error_message_contains_expected() {
1064        let tokens = Tokenizer::tokenize_filtered("?x").expect("valid SPARQL input");
1065        let stream = TokenStream::new(tokens);
1066        let err = expect_keyword(stream, "SELECT").unwrap_err();
1067        assert!(err.message.contains("SELECT"));
1068    }
1069
1070    #[test]
1071    fn test_parse_error_display() {
1072        let err = ParseError::new("test error", 42);
1073        let display = format!("{}", err);
1074        assert!(display.contains("42"));
1075        assert!(display.contains("test error"));
1076    }
1077
1078    // ── Composite parsing tests ──────────────────────────────────────────────
1079
1080    #[test]
1081    fn test_parse_simple_triple_pattern() {
1082        // Parse: ?s rdf:type ?o
1083        let tokens = Tokenizer::tokenize_filtered("?s rdf:type ?o").expect("valid SPARQL input");
1084        let stream = TokenStream::new(tokens);
1085
1086        let (subj, rest) = expect_variable(stream).expect("variable parse should succeed");
1087        let (pred, rest) = expect_iri(rest).expect("IRI parse should succeed");
1088        let (obj, _) = expect_variable(rest).expect("variable parse should succeed");
1089
1090        assert_eq!(subj, "?s");
1091        assert_eq!(pred, "rdf:type");
1092        assert_eq!(obj, "?o");
1093    }
1094
1095    #[test]
1096    fn test_parse_select_query_skeleton() {
1097        let tokens = Tokenizer::tokenize_filtered("SELECT ?x WHERE").expect("valid SPARQL input");
1098        let stream = TokenStream::new(tokens);
1099        let (_, rest) = expect_keyword(stream, "SELECT").expect("keyword parse should succeed");
1100        let (vars, rest) = many0(rest, expect_variable).expect("repetition parse should succeed");
1101        let (_, _) = expect_keyword(rest, "WHERE").expect("keyword parse should succeed");
1102        assert_eq!(vars, vec!["?x"]);
1103    }
1104
1105    #[test]
1106    fn test_token_new() {
1107        let tok = Token::new(TokenKind::Keyword, "SELECT", 0);
1108        assert_eq!(tok.kind, TokenKind::Keyword);
1109        assert_eq!(tok.value, "SELECT");
1110        assert_eq!(tok.position, 0);
1111    }
1112
1113    #[test]
1114    fn test_parse_error_new() {
1115        let err = ParseError::new("oops", 5);
1116        assert_eq!(err.position, 5);
1117        assert_eq!(err.message, "oops");
1118    }
1119
1120    #[test]
1121    fn test_tokenize_two_char_operator_neq() {
1122        let tokens = Tokenizer::tokenize("!=").expect("valid SPARQL input");
1123        assert_eq!(tokens[0].kind, TokenKind::Punctuation);
1124        assert_eq!(tokens[0].value, "!=");
1125    }
1126
1127    #[test]
1128    fn test_tokenize_two_char_operator_leq() {
1129        let tokens = Tokenizer::tokenize("<=").expect("valid SPARQL input");
1130        assert_eq!(tokens[0].kind, TokenKind::Punctuation);
1131        assert_eq!(tokens[0].value, "<=");
1132    }
1133
1134    #[test]
1135    fn test_tokenize_keyword_filter() {
1136        let tokens = Tokenizer::tokenize("FILTER").expect("valid SPARQL input");
1137        assert_eq!(tokens[0].kind, TokenKind::Keyword);
1138    }
1139
1140    #[test]
1141    fn test_tokenize_keyword_bind() {
1142        let tokens = Tokenizer::tokenize("BIND").expect("valid SPARQL input");
1143        assert_eq!(tokens[0].kind, TokenKind::Keyword);
1144    }
1145
1146    #[test]
1147    fn test_stream_clone_independence() {
1148        let tokens = Tokenizer::tokenize_filtered("SELECT WHERE").expect("valid SPARQL input");
1149        let stream = TokenStream::new(tokens);
1150        let clone = stream.clone();
1151        let (_, advanced) = stream.next();
1152        // Original clone should still be at position 0
1153        assert_eq!(clone.position(), 0);
1154        assert_eq!(advanced.position(), 1);
1155    }
1156
1157    #[test]
1158    fn test_many0_with_iri() {
1159        let tokens = Tokenizer::tokenize_filtered("<http://a.org/> <http://b.org/> ?x")
1160            .expect("valid SPARQL input");
1161        let stream = TokenStream::new(tokens);
1162        let (iris, rest) = many0(stream, expect_iri).expect("repetition parse should succeed");
1163        assert_eq!(iris.len(), 2);
1164        assert_eq!(iris[0], "<http://a.org/>");
1165        assert_eq!(iris[1], "<http://b.org/>");
1166        // ?x should still be next
1167        assert_eq!(
1168            rest.peek().expect("stream should have tokens").kind,
1169            TokenKind::Variable
1170        );
1171    }
1172
1173    #[test]
1174    fn test_optional_iri_hit() {
1175        let tokens = Tokenizer::tokenize_filtered("<http://example.org/> WHERE")
1176            .expect("valid SPARQL input");
1177        let stream = TokenStream::new(tokens);
1178        let (result, _) = optional(stream, expect_iri).expect("optional parse should succeed");
1179        assert!(result.is_some());
1180    }
1181
1182    #[test]
1183    fn test_optional_variable_miss_on_keyword() {
1184        let tokens = Tokenizer::tokenize_filtered("SELECT").expect("valid SPARQL input");
1185        let stream = TokenStream::new(tokens);
1186        let (result, rest) =
1187            optional(stream, expect_variable).expect("optional parse should succeed");
1188        assert!(result.is_none());
1189        assert_eq!(rest.position(), 0);
1190    }
1191}