Skip to main content

oxirs_arq/
update_protocol_parser.rs

1//! Tokeniser and recursive-descent parser for SPARQL 1.1 Update text.
2//!
3//! The [`SparqlUpdateParser`] consumes a SPARQL Update string and produces a
4//! sequence of [`SparqlUpdate`] operations.  The implementation is a
5//! hand-written, single-pass tokeniser plus recursive descent — it covers
6//! the most common SPARQL 1.1 Update grammar productions but does not
7//! perform prefix expansion (callers should expand prefixes themselves).
8
9use crate::update_protocol_types::{
10    ClearType, DropType, ParseError, PatternTerm, SparqlUpdate, Triple, TriplePattern,
11};
12
13// ---------------------------------------------------------------------------
14// Parser
15// ---------------------------------------------------------------------------
16
17/// A lightweight tokenising parser for SPARQL 1.1 Update text.
18///
19/// The parser is intentionally simple (hand-written recursive descent on a
20/// token stream) and covers the most common production patterns.  It does not
21/// handle prefix declarations (`PREFIX`) or the `BASE` directive — callers
22/// that require prefix resolution should expand prefixes before passing the
23/// string to the parser.
24pub struct SparqlUpdateParser;
25
26impl SparqlUpdateParser {
27    /// Parse zero or more semicolon-separated update operations from `input`.
28    pub fn parse(input: &str) -> Result<Vec<SparqlUpdate>, ParseError> {
29        let tokens = tokenise(input);
30        let mut cursor = 0usize;
31        let mut results = Vec::new();
32
33        // Skip optional leading PREFIX declarations.
34        skip_prefixes(&tokens, &mut cursor);
35
36        while cursor < tokens.len() {
37            skip_prefixes(&tokens, &mut cursor);
38            if cursor >= tokens.len() {
39                break;
40            }
41            let update = parse_one_operation(&tokens, &mut cursor)?;
42            results.push(update);
43            // Consume optional ';' separator.
44            if cursor < tokens.len() && tokens[cursor] == ";" {
45                cursor += 1;
46            }
47        }
48
49        Ok(results)
50    }
51
52    /// Parse exactly one update operation from `input`.
53    pub fn parse_one(input: &str) -> Result<SparqlUpdate, ParseError> {
54        let mut updates = Self::parse(input)?;
55        match updates.len() {
56            0 => Err(ParseError::at(0, "no update operation found")),
57            1 => Ok(updates.remove(0)),
58            n => Err(ParseError::at(
59                0,
60                format!("expected exactly one operation, found {n}"),
61            )),
62        }
63    }
64}
65
66// ---------------------------------------------------------------------------
67// Tokeniser
68// ---------------------------------------------------------------------------
69
/// Split a SPARQL Update string into a flat list of tokens.
///
/// Token shapes, in the order they are recognised:
///
/// - IRI references `<…>`, kept with their angle brackets;
/// - quoted literals `"…"` / `'…'`, kept with their quotes (a backslash
///   escapes the following character);
/// - variables `?name` / `$name`, both emitted with a leading `?`;
/// - the single-character punctuation `{ } ( ) . , ;`;
/// - `^^` (datatype marker) or a lone `^`;
/// - language tags `@…`;
/// - words starting with a letter or `_`: keywords, prefixed names and
///   blank-node labels (`_:…`);
/// - anything else (numbers, `:local` names, …) as a run of characters up to
///   the next whitespace or one of `{ } ( ) , ;`.
///
/// Whitespace and `#` line comments are discarded.  An IRI or literal that is
/// not terminated before the end of input is emitted without its closing
/// delimiter.  A `.` that directly follows a token of the last kind is split
/// off as its own token, so `42.` yields `42` and `.` while `3.14` stays
/// intact.
fn tokenise(input: &str) -> Vec<String> {
    let chars: Vec<char> = input.chars().collect();
    let mut tokens = Vec::new();
    let mut i = 0;

    while i < chars.len() {
        match chars[i] {
            c if c.is_whitespace() => i += 1,
            // Line comment: skip up to (not including) the newline.
            '#' => {
                while i < chars.len() && chars[i] != '\n' {
                    i += 1;
                }
            }
            // IRI reference <…>
            '<' => {
                let mut tok = String::from('<');
                i += 1;
                while i < chars.len() && chars[i] != '>' {
                    tok.push(chars[i]);
                    i += 1;
                }
                if i < chars.len() {
                    tok.push('>');
                    i += 1;
                }
                tokens.push(tok);
            }
            // Quoted literal; the closing quote must match the opening one.
            quote @ ('"' | '\'') => {
                let mut tok = String::from(quote);
                i += 1;
                while i < chars.len() && chars[i] != quote {
                    // Keep the backslash and take the escaped character
                    // verbatim so an escaped quote does not end the literal.
                    if chars[i] == '\\' && i + 1 < chars.len() {
                        tok.push(chars[i]);
                        i += 1;
                    }
                    tok.push(chars[i]);
                    i += 1;
                }
                if i < chars.len() {
                    tok.push(quote);
                    i += 1;
                }
                tokens.push(tok);
            }
            // Variable: both sigils are normalised to `?`.
            '?' | '$' => {
                let mut tok = String::from('?');
                i += 1;
                while i < chars.len() && (chars[i].is_alphanumeric() || chars[i] == '_') {
                    tok.push(chars[i]);
                    i += 1;
                }
                tokens.push(tok);
            }
            // Single-character punctuation.
            '{' | '}' | '(' | ')' | '.' | ',' | ';' => {
                tokens.push(chars[i].to_string());
                i += 1;
            }
            // `^^` datatype marker, or a lone `^`.
            '^' => {
                if i + 1 < chars.len() && chars[i + 1] == '^' {
                    tokens.push("^^".to_string());
                    i += 2;
                } else {
                    tokens.push("^".to_string());
                    i += 1;
                }
            }
            // Language tag @…
            '@' => {
                let mut tok = String::from('@');
                i += 1;
                while i < chars.len() && (chars[i].is_alphanumeric() || chars[i] == '-') {
                    tok.push(chars[i]);
                    i += 1;
                }
                tokens.push(tok);
            }
            // Bare identifier, keyword, prefixed name, or blank node label.
            c if c.is_alphabetic() || c == '_' => {
                let mut tok = String::new();
                while i < chars.len()
                    && (chars[i].is_alphanumeric() || matches!(chars[i], '_' | ':' | '-'))
                {
                    tok.push(chars[i]);
                    i += 1;
                }
                tokens.push(tok);
            }
            // Numbers and everything else: run until whitespace or a bracket,
            // comma or semicolon.
            _ => {
                let mut tok = String::new();
                while i < chars.len()
                    && !chars[i].is_whitespace()
                    && !matches!(chars[i], '{' | '}' | '(' | ')' | ',' | ';')
                {
                    tok.push(chars[i]);
                    i += 1;
                }

                // `.` stays inside the run so decimals like `3.14` survive,
                // but a trailing `.` is the statement terminator, not part of
                // the term: emit it separately.
                let kept = tok.trim_end_matches('.').len();
                let trailing_dots = tok.len() - kept;
                tok.truncate(kept);
                if !tok.is_empty() {
                    tokens.push(tok);
                }
                for _ in 0..trailing_dots {
                    tokens.push(".".to_string());
                }
            }
        }
    }

    tokens
}
208
209// ---------------------------------------------------------------------------
210// Parser helpers
211// ---------------------------------------------------------------------------
212
/// Look at the token at index `cursor` without consuming it.
///
/// Returns `None` when `cursor` is at or past the end of `tokens`.
fn peek(tokens: &[String], cursor: usize) -> Option<&str> {
    let tok = tokens.get(cursor)?;
    Some(tok.as_str())
}
216
217fn expect<'a>(
218    tokens: &'a [String],
219    cursor: &mut usize,
220    expected: &str,
221) -> Result<&'a str, ParseError> {
222    match tokens.get(*cursor) {
223        Some(tok) if tok.to_uppercase() == expected.to_uppercase() => {
224            *cursor += 1;
225            Ok(tok.as_str())
226        }
227        Some(tok) => Err(ParseError::at(
228            *cursor,
229            format!("expected '{expected}', found '{tok}'"),
230        )),
231        None => Err(ParseError::at(
232            *cursor,
233            format!("expected '{expected}', found end of input"),
234        )),
235    }
236}
237
/// Consume the token at `cursor` if it equals `keyword`, ignoring ASCII case.
///
/// Returns `true` and advances the cursor by one on a match; otherwise
/// returns `false` and leaves the cursor untouched (including at end of
/// input).  Only ASCII letters are case-folded, so a token containing
/// non-ASCII look-alikes (e.g. dotless `ı`) does not match a keyword.
fn consume_keyword(tokens: &[String], cursor: &mut usize, keyword: &str) -> bool {
    let matched = tokens
        .get(*cursor)
        .map_or(false, |tok| tok.eq_ignore_ascii_case(keyword));
    if matched {
        *cursor += 1;
    }
    matched
}
247
248/// Consume the next token unconditionally and return it.
249fn consume(tokens: &[String], cursor: &mut usize) -> Result<String, ParseError> {
250    tokens
251        .get(*cursor)
252        .map(|t| {
253            *cursor += 1;
254            t.clone()
255        })
256        .ok_or_else(|| ParseError::at(*cursor, "unexpected end of input"))
257}
258
259/// Parse an IRI token of the form `<…>` and return the inner string.
260fn parse_iri(tokens: &[String], cursor: &mut usize) -> Result<String, ParseError> {
261    match tokens.get(*cursor) {
262        Some(tok) if tok.starts_with('<') && tok.ends_with('>') => {
263            let iri = tok[1..tok.len() - 1].to_string();
264            *cursor += 1;
265            Ok(iri)
266        }
267        Some(tok) => Err(ParseError::at(
268            *cursor,
269            format!("expected IRI, found '{tok}'"),
270        )),
271        None => Err(ParseError::at(*cursor, "expected IRI, found end of input")),
272    }
273}
274
275/// Parse an optional `SILENT` keyword and return whether it was present.
276fn parse_silent(tokens: &[String], cursor: &mut usize) -> bool {
277    consume_keyword(tokens, cursor, "SILENT")
278}
279
/// Skip zero or more complete `PREFIX name: <iri>` declarations.
///
/// A declaration is skipped only when all three tokens are present and
/// well-formed: the `PREFIX` keyword (ASCII case-insensitive), a prefix name
/// ending in `:` (e.g. `ex:` or `:`), and an IRI reference `<…>`.  A truncated
/// or malformed declaration is left in place, so the cursor never moves past
/// the end of `tokens` and the caller's next parsing step reports the stray
/// `PREFIX` token instead of silently swallowing unrelated tokens.
fn skip_prefixes(tokens: &[String], cursor: &mut usize) {
    while let [keyword, name, iri, ..] = tokens.get(*cursor..).unwrap_or(&[]) {
        let well_formed = keyword.eq_ignore_ascii_case("PREFIX")
            && name.ends_with(':')
            && iri.starts_with('<')
            && iri.ends_with('>');
        if !well_formed {
            break;
        }
        *cursor += 3;
    }
}
293
294// ---------------------------------------------------------------------------
295// Triple / pattern parsing
296// ---------------------------------------------------------------------------
297
298/// Parse a set of concrete triples inside `{ … }`.
299/// Triples may be separated by `.` or `;` (simplified Turtle-like syntax).
300fn parse_triple_block(tokens: &[String], cursor: &mut usize) -> Result<Vec<Triple>, ParseError> {
301    expect(tokens, cursor, "{")?;
302    let mut triples = Vec::new();
303
304    while let Some(tok) = tokens.get(*cursor) {
305        if tok == "}" {
306            break;
307        }
308        if tok == "." {
309            *cursor += 1;
310            continue;
311        }
312
313        let s = parse_term_str(tokens, cursor)?;
314        let p = parse_term_str(tokens, cursor)?;
315        let o = parse_term_str(tokens, cursor)?;
316        triples.push(Triple::new(s, p, o));
317
318        // Optional trailing dot.
319        if matches!(peek(tokens, *cursor), Some(".")) {
320            *cursor += 1;
321        }
322    }
323
324    expect(tokens, cursor, "}")?;
325    Ok(triples)
326}
327
328/// Parse a set of triple patterns (may contain variables) inside `{ … }`.
329fn parse_pattern_block(
330    tokens: &[String],
331    cursor: &mut usize,
332) -> Result<Vec<TriplePattern>, ParseError> {
333    expect(tokens, cursor, "{")?;
334    let mut patterns = Vec::new();
335
336    while let Some(tok) = tokens.get(*cursor) {
337        if tok == "}" {
338            break;
339        }
340        if tok == "." {
341            *cursor += 1;
342            continue;
343        }
344
345        let s = parse_pattern_term(tokens, cursor)?;
346        let p = parse_pattern_term(tokens, cursor)?;
347        let o = parse_pattern_term(tokens, cursor)?;
348        patterns.push(TriplePattern::new(s, p, o));
349
350        if matches!(peek(tokens, *cursor), Some(".")) {
351            *cursor += 1;
352        }
353    }
354
355    expect(tokens, cursor, "}")?;
356    Ok(patterns)
357}
358
359/// Parse a single term as a plain string for use in concrete triples.
360fn parse_term_str(tokens: &[String], cursor: &mut usize) -> Result<String, ParseError> {
361    let tok = consume(tokens, cursor)?;
362    // Unwrap IRI angles.
363    if tok.starts_with('<') && tok.ends_with('>') {
364        return Ok(tok[1..tok.len() - 1].to_string());
365    }
366    Ok(tok)
367}
368
369/// Parse a single term into a `PatternTerm`.
370fn parse_pattern_term(tokens: &[String], cursor: &mut usize) -> Result<PatternTerm, ParseError> {
371    let tok = consume(tokens, cursor)?;
372
373    // Variable: ?name
374    if let Some(stripped) = tok.strip_prefix('?') {
375        return Ok(PatternTerm::Variable(stripped.to_string()));
376    }
377
378    // IRI: <…>
379    if tok.starts_with('<') && tok.ends_with('>') {
380        return Ok(PatternTerm::Iri(tok[1..tok.len() - 1].to_string()));
381    }
382
383    // Blank node: _:label
384    if let Some(stripped) = tok.strip_prefix("_:") {
385        return Ok(PatternTerm::BlankNode(stripped.to_string()));
386    }
387
388    // Literal: "…"
389    if tok.starts_with('"') || tok.starts_with('\'') {
390        // Consume optional @lang or ^^datatype.
391        if matches!(peek(tokens, *cursor), Some(t) if t.starts_with('@')) {
392            let _lang = consume(tokens, cursor)?;
393        } else if matches!(peek(tokens, *cursor), Some("^^")) {
394            *cursor += 1; // skip ^^
395            let _dt = consume(tokens, cursor)?;
396        }
397        return Ok(PatternTerm::Literal(tok));
398    }
399
400    // Prefixed name or keyword treated as IRI-like.
401    Ok(PatternTerm::Iri(tok))
402}
403
404// ---------------------------------------------------------------------------
405// Top-level operation parser
406// ---------------------------------------------------------------------------
407
408fn parse_one_operation(tokens: &[String], cursor: &mut usize) -> Result<SparqlUpdate, ParseError> {
409    let keyword = match tokens.get(*cursor) {
410        Some(k) => k.to_uppercase(),
411        None => return Err(ParseError::at(*cursor, "unexpected end of input")),
412    };
413
414    match keyword.as_str() {
415        "INSERT" => {
416            *cursor += 1;
417            if matches!(peek(tokens, *cursor), Some(t) if t.to_uppercase() == "DATA") {
418                *cursor += 1;
419                let triples = parse_triple_block(tokens, cursor)?;
420                Ok(SparqlUpdate::InsertData(triples))
421            } else {
422                // INSERT { template } WHERE { pattern }
423                let template = parse_pattern_block(tokens, cursor)?;
424                expect(tokens, cursor, "WHERE")?;
425                let where_clause = parse_pattern_block(tokens, cursor)?;
426                Ok(SparqlUpdate::InsertWhere {
427                    template,
428                    where_clause,
429                })
430            }
431        }
432        "DELETE" => {
433            *cursor += 1;
434            if matches!(peek(tokens, *cursor), Some(t) if t.to_uppercase() == "DATA") {
435                *cursor += 1;
436                let triples = parse_triple_block(tokens, cursor)?;
437                Ok(SparqlUpdate::DeleteData(triples))
438            } else if matches!(peek(tokens, *cursor), Some(t) if t.to_uppercase() == "WHERE") {
439                // DELETE WHERE { pattern }
440                *cursor += 1;
441                let where_clause = parse_pattern_block(tokens, cursor)?;
442                Ok(SparqlUpdate::DeleteWhere {
443                    template: vec![],
444                    where_clause,
445                })
446            } else {
447                // DELETE { del_template } [INSERT { ins_template }] WHERE { pattern }
448                let delete_template = parse_pattern_block(tokens, cursor)?;
449                let insert_template = if matches!(peek(tokens, *cursor), Some(t) if t.to_uppercase() == "INSERT")
450                {
451                    *cursor += 1;
452                    parse_pattern_block(tokens, cursor)?
453                } else {
454                    vec![]
455                };
456                expect(tokens, cursor, "WHERE")?;
457                let where_clause = parse_pattern_block(tokens, cursor)?;
458                Ok(SparqlUpdate::Modify {
459                    delete: delete_template,
460                    insert: insert_template,
461                    where_clause,
462                })
463            }
464        }
465        "CREATE" => {
466            *cursor += 1;
467            let silent = parse_silent(tokens, cursor);
468            consume_keyword(tokens, cursor, "GRAPH");
469            let iri = parse_iri(tokens, cursor)?;
470            Ok(SparqlUpdate::CreateGraph { iri, silent })
471        }
472        "DROP" => {
473            *cursor += 1;
474            let silent = parse_silent(tokens, cursor);
475            parse_graph_target_update(tokens, cursor, |iri, drop_type| SparqlUpdate::DropGraph {
476                iri,
477                silent,
478                drop_type: drop_type.into_drop(),
479            })
480        }
481        "CLEAR" => {
482            *cursor += 1;
483            let silent = parse_silent(tokens, cursor);
484            parse_graph_target_update(tokens, cursor, |iri, clear_type| SparqlUpdate::ClearGraph {
485                iri,
486                silent,
487                clear_type: clear_type.into_clear(),
488            })
489        }
490        "COPY" => {
491            *cursor += 1;
492            let silent = parse_silent(tokens, cursor);
493            let source = parse_iri(tokens, cursor)?;
494            expect(tokens, cursor, "TO")?;
495            let target = parse_iri(tokens, cursor)?;
496            Ok(SparqlUpdate::CopyGraph {
497                source,
498                target,
499                silent,
500            })
501        }
502        "MOVE" => {
503            *cursor += 1;
504            let silent = parse_silent(tokens, cursor);
505            let source = parse_iri(tokens, cursor)?;
506            expect(tokens, cursor, "TO")?;
507            let target = parse_iri(tokens, cursor)?;
508            Ok(SparqlUpdate::MoveGraph {
509                source,
510                target,
511                silent,
512            })
513        }
514        "ADD" => {
515            *cursor += 1;
516            let silent = parse_silent(tokens, cursor);
517            let source = parse_iri(tokens, cursor)?;
518            expect(tokens, cursor, "TO")?;
519            let target = parse_iri(tokens, cursor)?;
520            Ok(SparqlUpdate::AddGraph {
521                source,
522                target,
523                silent,
524            })
525        }
526        "LOAD" => {
527            *cursor += 1;
528            let silent = parse_silent(tokens, cursor);
529            let iri = parse_iri(tokens, cursor)?;
530            let into = if consume_keyword(tokens, cursor, "INTO") {
531                consume_keyword(tokens, cursor, "GRAPH");
532                Some(parse_iri(tokens, cursor)?)
533            } else {
534                None
535            };
536            Ok(SparqlUpdate::Load { iri, into, silent })
537        }
538        other => Err(ParseError::at(
539            *cursor,
540            format!("unknown update operation keyword: '{other}'"),
541        )),
542    }
543}
544
545// ---------------------------------------------------------------------------
546// Graph target parsing helper (DROP / CLEAR share the same grammar)
547// ---------------------------------------------------------------------------
548
549/// Temporary enum for the parsed scope before converting to `DropType`/`ClearType`.
550enum GraphScope {
551    GraphIri,
552    Default,
553    Named,
554    All,
555}
556
557impl GraphScope {
558    fn into_drop(self) -> DropType {
559        match self {
560            GraphScope::GraphIri => DropType::Graph,
561            GraphScope::Default => DropType::Default,
562            GraphScope::Named => DropType::Named,
563            GraphScope::All => DropType::All,
564        }
565    }
566
567    fn into_clear(self) -> ClearType {
568        match self {
569            GraphScope::GraphIri => ClearType::Graph,
570            GraphScope::Default => ClearType::Default,
571            GraphScope::Named => ClearType::Named,
572            GraphScope::All => ClearType::All,
573        }
574    }
575}
576
577fn parse_graph_target_update<F>(
578    tokens: &[String],
579    cursor: &mut usize,
580    builder: F,
581) -> Result<SparqlUpdate, ParseError>
582where
583    F: FnOnce(Option<String>, GraphScope) -> SparqlUpdate,
584{
585    let keyword = tokens.get(*cursor).map(|t| t.to_uppercase());
586    match keyword.as_deref() {
587        Some("DEFAULT") => {
588            *cursor += 1;
589            let scope = GraphScope::Default;
590            Ok(builder(None, scope))
591        }
592        Some("NAMED") => {
593            *cursor += 1;
594            let scope = GraphScope::Named;
595            Ok(builder(None, scope))
596        }
597        Some("ALL") => {
598            *cursor += 1;
599            let scope = GraphScope::All;
600            Ok(builder(None, scope))
601        }
602        Some("GRAPH") => {
603            *cursor += 1;
604            let iri = parse_iri(tokens, cursor)?;
605            let scope = GraphScope::GraphIri;
606            Ok(builder(Some(iri), scope))
607        }
608        // Bare IRI without GRAPH keyword.
609        Some(tok) if tok.starts_with('<') => {
610            let iri = parse_iri(tokens, cursor)?;
611            let scope = GraphScope::GraphIri;
612            Ok(builder(Some(iri), scope))
613        }
614        _ => Err(ParseError::at(
615            *cursor,
616            "expected graph scope (DEFAULT | NAMED | ALL | GRAPH <iri>)",
617        )),
618    }
619}