use std::collections::HashMap;
use std::io::BufRead;
use crate::trig_streaming::{
lexer::{TriGLexer, TriGToken},
StreamedQuad, TriGLiteral, TriGParseError, TriGTerm,
};
/// IRI of `rdf:type`; the `a` keyword token is expanded to this predicate.
const RDF_TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
// `XSD_STRING` is not referenced by the parser code visible in this file
// (string literals without an explicit datatype are emitted with `datatype: None`),
// so the attribute below silences the unused-constant warning.
#[allow(dead_code)]
/// IRI of the `xsd:string` datatype.
const XSD_STRING: &str = "http://www.w3.org/2001/XMLSchema#string";
/// Datatype IRI attached to literals built from the `true` / `false` tokens.
const XSD_BOOLEAN: &str = "http://www.w3.org/2001/XMLSchema#boolean";
/// Datatype IRI attached to literals built from `Integer` tokens.
const XSD_INTEGER: &str = "http://www.w3.org/2001/XMLSchema#integer";
/// Datatype IRI attached to literals built from `Decimal` tokens.
const XSD_DECIMAL: &str = "http://www.w3.org/2001/XMLSchema#decimal";
/// Datatype IRI attached to literals built from `Double` tokens.
const XSD_DOUBLE: &str = "http://www.w3.org/2001/XMLSchema#double";
/// Statement-at-a-time TriG parser that pulls tokens from a [`TriGLexer`] and
/// turns them into [`StreamedQuad`] values.
///
/// All parsing state (prefixes, base IRI, blank-node numbering and the graph
/// block currently open) lives in this struct, so one instance must be used for
/// a whole document.
pub struct TriGParser<R: BufRead> {
/// Token source; also queried for the current line number in error reports.
lexer: TriGLexer<R>,
/// Prefix label -> namespace IRI, filled in by `@prefix` directives.
prefix_map: HashMap<String, String>,
/// Base IRI set by the most recent base directive; `None` until one is seen.
base: Option<String>,
/// Next numeric id to hand out for a blank node (rendered as `b{id}`).
bnode_counter: usize,
/// Blank-node label from the document -> numeric id allocated for it.
blank_node_map: HashMap<String, usize>,
/// Graph name copied into every emitted quad; `None` means no named graph.
current_graph: Option<TriGTerm>,
/// Lexer line recorded right after the `{` of the open graph block was
/// consumed; `None` when no graph block is open.
graph_opened_at: Option<usize>,
}
impl<R: BufRead> TriGParser<R> {
/// Creates a parser reading from `reader`.
///
/// The parser starts with no prefixes, no base IRI, blank-node numbering at
/// zero and no graph block open.
pub fn new(reader: R) -> Self {
    let lexer = TriGLexer::new(reader);
    let prefix_map: HashMap<String, String> = HashMap::new();
    let blank_node_map: HashMap<String, usize> = HashMap::new();
    Self {
        lexer,
        prefix_map,
        base: None,
        bnode_counter: 0,
        blank_node_map,
        current_graph: None,
        graph_opened_at: None,
    }
}
/// Parses the next top-level statement.
///
/// Returns `Ok(None)` once the lexer is exhausted, otherwise `Ok(Some(quads))`
/// where `quads` is empty for directives, graph delimiters and stray dots.
///
/// # Errors
///
/// * `InvalidGraph` for a `}` with no open block, for a `{` while a block is
///   already open (TriG graph blocks do not nest), and for end of input while
///   a block is still open.
/// * `InvalidToken` for a token that cannot start a statement.
/// * Any error raised by the lexer or by the sub-parsers.
pub fn parse_statement(&mut self) -> Result<Option<Vec<StreamedQuad>>, TriGParseError> {
    let tok = match self.lexer.peek()? {
        Some(tok) => tok.clone(),
        None => {
            // End of input: a graph block that was never closed means the
            // document is truncated. The state is cleared first so that a
            // caller which keeps polling sees a plain `Ok(None)` afterwards.
            return match self.graph_opened_at.take() {
                Some(opened) => {
                    self.current_graph = None;
                    Err(TriGParseError::InvalidGraph {
                        line: opened,
                        name: "Unclosed graph block: missing '}' before end of input"
                            .to_string(),
                    })
                }
                None => Ok(None),
            };
        }
    };
    match tok {
        TriGToken::Prefix(_) => self.handle_prefix_directive(),
        TriGToken::Base(iri) => {
            // A relative base is resolved against the previous base, if any.
            let resolved = self.resolve_iri(&iri);
            self.base = Some(resolved);
            self.lexer.next_token()?;
            self.expect_dot()?;
            Ok(Some(vec![]))
        }
        TriGToken::RBrace => {
            self.lexer.next_token()?;
            if self.current_graph.is_none() && self.graph_opened_at.is_none() {
                return Err(TriGParseError::InvalidGraph {
                    line: self.lexer.line(),
                    name: "Unexpected '}' outside any graph block".to_string(),
                });
            }
            self.current_graph = None;
            self.graph_opened_at = None;
            Ok(Some(vec![]))
        }
        TriGToken::Dot => {
            // A lone '.' is tolerated as an empty statement.
            self.lexer.next_token()?;
            Ok(Some(vec![]))
        }
        TriGToken::IriRef(_) | TriGToken::PrefixedName(_, _) => {
            self.parse_iri_or_graph_or_triple()
        }
        TriGToken::BlankNodeLabel(_) | TriGToken::AnonBlankNode | TriGToken::LBracket => {
            let quads = self.parse_triples()?;
            Ok(Some(quads))
        }
        TriGToken::LBrace => {
            // Unlabelled block: its triples carry no graph name.
            self.lexer.next_token()?;
            let opened = self.lexer.line();
            if self.graph_opened_at.is_some() {
                // Previously this silently dropped the enclosing graph name.
                return Err(TriGParseError::InvalidGraph {
                    line: opened,
                    name: "Nested graph blocks are not allowed".to_string(),
                });
            }
            self.current_graph = None;
            self.graph_opened_at = Some(opened);
            Ok(Some(vec![]))
        }
        _ => {
            let line = self.lexer.line();
            Err(TriGParseError::InvalidToken {
                line,
                message: format!("Unexpected token at start of statement: {:?}", tok),
            })
        }
    }
}
/// Consumes a prefix directive (`Prefix` token, namespace IRI, terminating dot)
/// and records the mapping in `prefix_map`.
///
/// The namespace IRI is resolved against the current base before it is stored.
/// The mapping is registered before the terminator is checked, so it remains in
/// place even when the terminator check fails. Always yields an empty quad list
/// on success.
///
/// # Errors
///
/// `InvalidToken` when the first token is not a `Prefix` or the second is not an
/// `IriRef`; whatever `expect_dot` reports for a bad terminator.
fn handle_prefix_directive(&mut self) -> Result<Option<Vec<StreamedQuad>>, TriGParseError> {
    let directive = self.lexer.next_token()?;
    let label = match directive {
        Some(TriGToken::Prefix(name)) => name,
        unexpected => {
            let line = self.lexer.line();
            let message = format!("Expected @prefix token, got {:?}", unexpected);
            return Err(TriGParseError::InvalidToken { line, message });
        }
    };

    let target = self.lexer.next_token()?;
    let namespace = match target {
        Some(TriGToken::IriRef(raw)) => self.resolve_iri(&raw),
        unexpected => {
            let line = self.lexer.line();
            let message = format!("Expected IRI for @prefix, got {:?}", unexpected);
            return Err(TriGParseError::InvalidToken { line, message });
        }
    };

    self.prefix_map.insert(label, namespace);
    self.expect_dot().map(|()| Some(Vec::new()))
}
fn parse_iri_or_graph_or_triple(
&mut self,
) -> Result<Option<Vec<StreamedQuad>>, TriGParseError> {
let tok = self.lexer.next_token()?.expect("peeked above");
let term = self.token_to_term(tok)?;
match self.lexer.peek()? {
Some(TriGToken::LBrace) => {
self.lexer.next_token()?; let opened = self.lexer.line();
self.current_graph = Some(term);
self.graph_opened_at = Some(opened);
Ok(Some(vec![]))
}
_ => {
let quads = self.parse_predicate_object_list(term)?;
self.expect_dot()?;
Ok(Some(quads))
}
}
}
/// Parses one triples statement: a subject, its predicate-object list and the
/// statement terminator.
///
/// The subject may be an ordinary term or a blank-node property list
/// (`[ p o ; ... ]`). In the latter case the quads describing the blank node
/// are returned ahead of the quads of the outer statement, and the outer
/// predicate-object list may be empty.
///
/// # Errors
///
/// Propagates lexer errors and `InvalidTriple` / `InvalidToken` from the
/// sub-parsers.
pub fn parse_triples(&mut self) -> Result<Vec<StreamedQuad>, TriGParseError> {
    let mut quads: Vec<StreamedQuad> = Vec::new();
    let subject = match self.lexer.peek()? {
        // `[` cannot be turned into a term by `token_to_term`, so it needs its
        // own path; previously such statements were always rejected.
        Some(TriGToken::LBracket) => self.parse_blank_node_property_list(&mut quads)?,
        _ => self.parse_term()?,
    };
    let outer = self.parse_predicate_object_list(subject)?;
    quads.extend(outer);
    self.expect_dot()?;
    Ok(quads)
}

/// Parses `predicate object (',' object)* (';' predicate object ...)*` for
/// `subject` and returns one quad per object, tagged with the current graph.
///
/// The list ends, without consuming the token, at `.`, `}`, `]` or end of
/// input. Treating `]` as a terminator is what allows blank-node property
/// lists to be parsed at all: before, the closing bracket was handed to
/// `parse_predicate` and rejected. Repeated and trailing `;` are accepted.
///
/// Quads produced by nested `[ ... ]` objects are included in the result,
/// placed before the quad that refers to the nested blank node.
fn parse_predicate_object_list(
    &mut self,
    subject: TriGTerm,
) -> Result<Vec<StreamedQuad>, TriGParseError> {
    let mut quads: Vec<StreamedQuad> = Vec::new();
    'outer: loop {
        match self.lexer.peek()? {
            Some(TriGToken::Dot)
            | Some(TriGToken::RBrace)
            | Some(TriGToken::RBracket)
            | None => break,
            Some(TriGToken::Semicolon) => {
                self.lexer.next_token()?;
                match self.lexer.peek()? {
                    Some(TriGToken::Dot)
                    | Some(TriGToken::RBrace)
                    | Some(TriGToken::RBracket)
                    | None => break 'outer,
                    Some(TriGToken::Semicolon) => continue 'outer,
                    _ => {}
                }
            }
            _ => {}
        }
        let predicate = self.parse_predicate()?;
        loop {
            let object = self.parse_object_collecting(&mut quads)?;
            quads.push(StreamedQuad {
                subject: subject.clone(),
                predicate: predicate.clone(),
                object,
                graph_name: self.current_graph.clone(),
            });
            match self.lexer.peek()? {
                Some(TriGToken::Comma) => {
                    self.lexer.next_token()?;
                }
                _ => break,
            }
        }
    }
    Ok(quads)
}

/// Parses one object, appending to `quads` any quads generated by a nested
/// blank-node property list.
///
/// Every object that does not start with `[` is delegated to `parse_object`.
fn parse_object_collecting(
    &mut self,
    quads: &mut Vec<StreamedQuad>,
) -> Result<TriGTerm, TriGParseError> {
    match self.lexer.peek()? {
        Some(TriGToken::LBracket) => self.parse_blank_node_property_list(quads),
        _ => self.parse_object(),
    }
}

/// Parses `[ predicateObjectList ]`, starting at the `[` token.
///
/// Allocates a fresh blank node, appends the quads describing it to `quads`
/// and returns the node so the caller can use it as a subject or object.
fn parse_blank_node_property_list(
    &mut self,
    quads: &mut Vec<StreamedQuad>,
) -> Result<TriGTerm, TriGParseError> {
    match self.lexer.next_token()? {
        Some(TriGToken::LBracket) => {}
        other => {
            return Err(TriGParseError::InvalidTriple {
                line: self.lexer.line(),
                message: format!("Expected '[', got {:?}", other),
            });
        }
    }
    let bnode = self.new_blank_node();
    let inner = self.parse_predicate_object_list(bnode.clone())?;
    quads.extend(inner);
    match self.lexer.next_token()? {
        Some(TriGToken::RBracket) => Ok(bnode),
        other => Err(TriGParseError::InvalidTriple {
            line: self.lexer.line(),
            message: format!("Expected ']', got {:?}", other),
        }),
    }
}
/// Consumes the next token and converts it into a term.
///
/// # Errors
///
/// `InvalidTriple` when the input is exhausted; otherwise whatever
/// `token_to_term` reports for the consumed token.
pub fn parse_term(&mut self) -> Result<TriGTerm, TriGParseError> {
    match self.lexer.next_token()? {
        Some(token) => self.token_to_term(token),
        None => Err(TriGParseError::InvalidTriple {
            line: self.lexer.line(),
            message: "Expected term, got EOF".to_string(),
        }),
    }
}
/// Consumes the next token and interprets it as a predicate.
///
/// Accepts the `a` keyword (expanded to `rdf:type`), an IRI reference
/// (resolved against the base) or a prefixed name (expanded through the
/// prefix map). The result is always a `NamedNode`.
///
/// # Errors
///
/// `InvalidTriple` on end of input or on any other token kind; the error from
/// `expand_prefixed_name` for an undeclared prefix.
fn parse_predicate(&mut self) -> Result<TriGTerm, TriGParseError> {
    let iri = match self.lexer.next_token()? {
        None => {
            return Err(TriGParseError::InvalidTriple {
                line: self.lexer.line(),
                message: "Expected predicate, got EOF".to_string(),
            });
        }
        Some(TriGToken::A) => RDF_TYPE.to_string(),
        Some(TriGToken::IriRef(raw)) => self.resolve_iri(&raw),
        Some(TriGToken::PrefixedName(prefix, local)) => {
            self.expand_prefixed_name(&prefix, &local)?
        }
        Some(other) => {
            return Err(TriGParseError::InvalidTriple {
                line: self.lexer.line(),
                message: format!("Expected predicate (IRI or 'a'), got {:?}", other),
            });
        }
    };
    Ok(TriGTerm::NamedNode(iri))
}
/// Parses a single object term.
///
/// When the next token is `[`, a fresh blank node is allocated, the bracketed
/// predicate-object list is parsed with that node as subject, and the closing
/// `]` is required. Any other token is handled by `parse_term`.
///
/// NOTE(review): this function can only return the term, so the quads parsed
/// inside the brackets are dropped here and never reach the caller.
///
/// # Errors
///
/// `InvalidTriple` when the bracketed list is not followed by `]`, plus any
/// error from the nested parsers.
fn parse_object(&mut self) -> Result<TriGTerm, TriGParseError> {
    let opens_property_list = match self.lexer.peek()? {
        Some(TriGToken::LBracket) => true,
        _ => false,
    };
    if !opens_property_list {
        return self.parse_term();
    }

    self.lexer.next_token()?;
    let node = self.new_blank_node();
    // The nested quads are intentionally left unused, as in the original code.
    let _ = self.parse_predicate_object_list(node.clone())?;

    let closing = self.lexer.next_token()?;
    if let Some(TriGToken::RBracket) = closing {
        Ok(node)
    } else {
        Err(TriGParseError::InvalidTriple {
            line: self.lexer.line(),
            message: format!("Expected ']', got {:?}", closing),
        })
    }
}
fn token_to_term(&mut self, tok: TriGToken) -> Result<TriGTerm, TriGParseError> {
match tok {
TriGToken::IriRef(iri) => Ok(TriGTerm::NamedNode(self.resolve_iri(&iri))),
TriGToken::PrefixedName(prefix, local) => {
let iri = self.expand_prefixed_name(&prefix, &local)?;
Ok(TriGTerm::NamedNode(iri))
}
TriGToken::BlankNodeLabel(label) => {
let id = self.get_or_create_bnode(&label);
Ok(TriGTerm::BlankNode(format!("b{}", id)))
}
TriGToken::AnonBlankNode => Ok(self.new_blank_node()),
TriGToken::A => Ok(TriGTerm::NamedNode(RDF_TYPE.to_string())),
TriGToken::StringLiteral { value, lang, datatype } => {
let resolved_dt = datatype.map(|dt| self.resolve_datatype(&dt));
Ok(TriGTerm::Literal(TriGLiteral {
value,
datatype: resolved_dt,
language: lang,
}))
}
TriGToken::Integer(i) => Ok(TriGTerm::Literal(TriGLiteral {
value: i.to_string(),
datatype: Some(XSD_INTEGER.to_string()),
language: None,
})),
TriGToken::Decimal(f) => Ok(TriGTerm::Literal(TriGLiteral {
value: format!("{}", f),
datatype: Some(XSD_DECIMAL.to_string()),
language: None,
})),
TriGToken::Double(f) => Ok(TriGTerm::Literal(TriGLiteral {
value: format!("{:E}", f),
datatype: Some(XSD_DOUBLE.to_string()),
language: None,
})),
TriGToken::True => Ok(TriGTerm::Literal(TriGLiteral {
value: "true".to_string(),
datatype: Some(XSD_BOOLEAN.to_string()),
language: None,
})),
TriGToken::False => Ok(TriGTerm::Literal(TriGLiteral {
value: "false".to_string(),
datatype: Some(XSD_BOOLEAN.to_string()),
language: None,
})),
other => Err(TriGParseError::InvalidTriple {
line: self.lexer.line(),
message: format!("Cannot use {:?} as a term", other),
}),
}
}
pub fn resolve_iri(&self, iri: &str) -> String {
if iri.contains("://") || iri.starts_with("urn:") {
return iri.to_string();
}
if iri.is_empty() {
return self.base.clone().unwrap_or_default();
}
if let Some(base) = &self.base {
if iri.starts_with('#') || iri.starts_with('/') {
return format!("{}{}", base, iri);
}
let base_no_frag = base.split('#').next().unwrap_or(base);
let base_path = if base_no_frag.contains('/') {
let last_slash = base_no_frag.rfind('/').unwrap_or(base_no_frag.len());
&base_no_frag[..=last_slash]
} else {
base_no_frag
};
return format!("{}{}", base_path, iri);
}
iri.to_string()
}
/// Expands `prefix:local` by concatenating the namespace registered for
/// `prefix` with `local`.
///
/// # Errors
///
/// `InvalidToken` (carrying the current lexer line) when `prefix` has not been
/// declared.
pub fn expand_prefixed_name(
    &self,
    prefix: &str,
    local: &str,
) -> Result<String, TriGParseError> {
    self.prefix_map
        .get(prefix)
        .map(|namespace| format!("{}{}", namespace, local))
        .ok_or_else(|| TriGParseError::InvalidToken {
            line: self.lexer.line(),
            message: format!("Unknown prefix: {:?}", prefix),
        })
}
/// Turns the datatype text attached to a string literal into a full IRI.
///
/// The text is first tried as a prefixed name, split at its first `:`. If the
/// part before the colon is not a declared prefix, text containing `://` or
/// starting with `urn:` is returned unchanged, and anything else is resolved
/// against the base via `resolve_iri`.
fn resolve_datatype(&self, dt: &str) -> String {
    let expanded = dt.find(':').and_then(|colon| {
        self.expand_prefixed_name(&dt[..colon], &dt[colon + 1..])
            .ok()
    });
    match expanded {
        Some(iri) => iri,
        None if dt.contains("://") || dt.starts_with("urn:") => dt.to_string(),
        None => self.resolve_iri(dt),
    }
}
/// Allocates the next blank-node id and returns it as a `BlankNode` term named
/// `b{id}`.
fn new_blank_node(&mut self) -> TriGTerm {
    let name = format!("b{}", self.bnode_counter);
    self.bnode_counter += 1;
    TriGTerm::BlankNode(name)
}
/// Returns the numeric id associated with a blank-node `label`, allocating and
/// remembering a new id the first time the label is seen.
///
/// Ids come from the same counter as anonymous blank nodes, so the two kinds
/// never share an id.
fn get_or_create_bnode(&mut self, label: &str) -> usize {
    match self.blank_node_map.get(label).copied() {
        Some(existing) => existing,
        None => {
            let fresh = self.bnode_counter;
            self.bnode_counter += 1;
            self.blank_node_map.insert(label.to_owned(), fresh);
            fresh
        }
    }
}
/// Checks for the terminator that ends a statement.
///
/// Normally this consumes a `.` token. Inside an open graph block the final
/// `.` before the closing `}` is optional in TriG, so a pending `}` is also
/// accepted there; it is left in the token stream for `parse_statement` to
/// close the block. The previous implementation consumed the `}` and reported
/// an error, which made `<g> { :s :p :o }` unparseable.
///
/// # Errors
///
/// `InvalidTriple` when the next token is neither `.` nor (inside a graph
/// block) `}`; the offending token has been consumed in that case.
fn expect_dot(&mut self) -> Result<(), TriGParseError> {
    if self.graph_opened_at.is_some() {
        if let Some(TriGToken::RBrace) = self.lexer.peek()? {
            return Ok(());
        }
    }
    match self.lexer.next_token()? {
        Some(TriGToken::Dot) => Ok(()),
        other => Err(TriGParseError::InvalidTriple {
            line: self.lexer.line(),
            message: format!("Expected '.', got {:?}", other),
        }),
    }
}
}