use crate::error::{TextPosition, TurtleParseError, TurtleResult, TurtleSyntaxError};
use crate::turtle::TurtleParser;
use oxirs_core::model::Triple;
use std::collections::HashMap;
/// Lifecycle state of an incremental parse.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ParseState {
    /// No buffered input; ready to accept data.
    Ready,
    /// Buffered input ends mid-statement; more data is needed.
    Incomplete,
    /// Unparsed data is buffered and available for parsing.
    HasData,
    /// EOF was signalled and all buffered data has been consumed.
    Complete,
    /// A parse error was recorded (strict failure, or lenient-mode collection).
    Error,
}
/// Snapshot of parser progress that can later be passed to
/// `IncrementalParser::restore` to rewind the parser.
#[derive(Debug, Clone)]
pub struct ParseCheckpoint {
    /// Total bytes pushed into the parser at snapshot time.
    pub byte_offset: usize,
    /// Number of triples emitted so far.
    pub triple_count: usize,
    /// Prefix -> IRI mappings accumulated from `@prefix` directives.
    pub prefixes: HashMap<String, String>,
    /// Base IRI from the most recent `@base` directive, if any.
    pub base_iri: Option<String>,
    /// Raw bytes buffered but not yet parsed.
    pub pending_data: Vec<u8>,
    /// Parser state at snapshot time.
    pub state: ParseState,
}
impl ParseCheckpoint {
pub fn new() -> Self {
Self {
byte_offset: 0,
triple_count: 0,
prefixes: HashMap::new(),
base_iri: None,
pending_data: Vec::new(),
state: ParseState::Ready,
}
}
}
impl Default for ParseCheckpoint {
fn default() -> Self {
Self::new()
}
}
/// Streaming Turtle parser: accepts byte chunks via `push_data`, emits
/// complete triples via `parse_available`, and keeps partial statements
/// buffered until more input arrives.
pub struct IncrementalParser {
    // Raw bytes received but not yet consumed by a successful parse.
    buffer: Vec<u8>,
    // Current lifecycle state; see `ParseState`.
    state: ParseState,
    // Prefix -> IRI mappings accumulated across chunks.
    prefixes: HashMap<String, String>,
    // Base IRI accumulated across chunks, if any `@base` was seen.
    base_iri: Option<String>,
    // Total bytes pushed so far (counted at push time, not parse time).
    bytes_processed: usize,
    // Total triples emitted so far.
    triples_parsed: usize,
    // When true, parse errors are collected in `errors` instead of returned.
    lenient: bool,
    // Set once `push_eof` is called; further pushes are rejected.
    eof: bool,
    // Errors collected while parsing in lenient mode.
    errors: Vec<TurtleParseError>,
}
impl IncrementalParser {
pub fn new() -> Self {
Self {
buffer: Vec::new(),
state: ParseState::Ready,
prefixes: HashMap::new(),
base_iri: None,
bytes_processed: 0,
triples_parsed: 0,
lenient: false,
eof: false,
errors: Vec::new(),
}
}
pub fn new_lenient() -> Self {
let mut parser = Self::new();
parser.lenient = true;
parser
}
/// Enables or disables lenient (error-collecting) mode.
pub fn set_lenient(&mut self, lenient: bool) {
    self.lenient = lenient;
}

/// Current lifecycle state.
pub fn state(&self) -> ParseState {
    self.state
}

/// Total bytes pushed so far (counted when pushed, not when parsed).
pub fn bytes_processed(&self) -> usize {
    self.bytes_processed
}

/// Total triples emitted so far.
pub fn triples_parsed(&self) -> usize {
    self.triples_parsed
}

/// Errors collected while parsing in lenient mode.
pub fn errors(&self) -> &[TurtleParseError] {
    &self.errors
}

/// Discards all collected lenient-mode errors.
pub fn clear_errors(&mut self) {
    self.errors.clear();
}
/// Appends a chunk of input bytes to the internal buffer.
///
/// # Errors
/// Returns a syntax error if called after `push_eof`.
pub fn push_data(&mut self, data: &[u8]) -> TurtleResult<()> {
    if self.eof {
        return Err(TurtleParseError::syntax(TurtleSyntaxError::Generic {
            message: "Cannot push data after EOF".to_string(),
            position: TextPosition::new(1, 1, self.bytes_processed),
        }));
    }
    self.buffer.extend_from_slice(data);
    self.bytes_processed += data.len();
    // A non-empty buffer means data is waiting; an empty one means ready.
    self.state = if self.buffer.is_empty() {
        ParseState::Ready
    } else {
        ParseState::HasData
    };
    Ok(())
}
/// Signals end of input; subsequent `push_data` calls will fail.
pub fn push_eof(&mut self) {
    self.eof = true;
    // Only flips to Complete immediately when nothing is buffered;
    // otherwise a final parse_available() call finishes the stream.
    if self.buffer.is_empty() {
        self.state = ParseState::Complete;
    }
}

/// True when unparsed bytes are buffered.
pub fn has_data(&self) -> bool {
    !self.buffer.is_empty()
}

/// True once EOF was signalled and the buffer has been fully consumed.
pub fn is_complete(&self) -> bool {
    self.state == ParseState::Complete
}
/// Parses every complete statement currently buffered and returns the
/// resulting triples; an incomplete trailing statement stays buffered
/// until more data (or EOF) arrives.
///
/// # Errors
/// Returns a syntax error when the buffer starts with invalid UTF-8, or
/// (in strict mode) when a complete statement fails to parse.
pub fn parse_available(&mut self) -> TurtleResult<Vec<Triple>> {
    if self.buffer.is_empty() {
        return Ok(Vec::new());
    }
    // Decode the longest valid UTF-8 prefix. Bytes after `valid_up_to` may
    // be an incomplete multi-byte sequence that the next chunk completes,
    // so they must be retained for the next round, not dropped.
    let (content, utf8_tail) = match std::str::from_utf8(&self.buffer) {
        Ok(s) => (s.to_string(), Vec::new()),
        Err(e) => {
            let valid_up_to = e.valid_up_to();
            if valid_up_to == 0 {
                return Err(TurtleParseError::syntax(TurtleSyntaxError::Generic {
                    message: "Invalid UTF-8 data".to_string(),
                    position: TextPosition::new(1, 1, 0),
                }));
            }
            let valid = std::str::from_utf8(&self.buffer[..valid_up_to])
                .expect("valid UTF-8")
                .to_string();
            (valid, self.buffer[valid_up_to..].to_vec())
        }
    };
    let (parseable, remaining) = self.find_statement_boundary(&content);
    if parseable.is_empty() {
        if self.eof {
            self.state = ParseState::Complete;
            return self.parse_final();
        } else {
            self.state = ParseState::Incomplete;
            return Ok(Vec::new());
        }
    }
    let triples = self.parse_content(&parseable)?;
    self.triples_parsed += triples.len();
    // Keep the unparsed text plus any trailing partial UTF-8 bytes.
    // (Previously the partial bytes were silently discarded here, losing
    // data when a chunk boundary split a multi-byte character.)
    let mut rest = remaining.into_bytes();
    rest.extend_from_slice(&utf8_tail);
    self.buffer = rest;
    if self.buffer.is_empty() {
        self.state = if self.eof {
            ParseState::Complete
        } else {
            ParseState::Ready
        };
    } else {
        self.state = ParseState::Incomplete;
    }
    Ok(triples)
}
/// Parses whatever remains in the buffer as the final fragment of the
/// stream, then clears the buffer and marks the parser complete.
fn parse_final(&mut self) -> TurtleResult<Vec<Triple>> {
    if self.buffer.is_empty() {
        return Ok(Vec::new());
    }
    // The buffer is only cleared after a successful parse, so callers can
    // still inspect pending data when an error is returned.
    let content;
    if let Ok(text) = std::str::from_utf8(&self.buffer) {
        content = text.to_string();
    } else {
        return Err(TurtleParseError::syntax(TurtleSyntaxError::Generic {
            message: "Invalid UTF-8 at end of stream".to_string(),
            position: TextPosition::new(1, 1, self.bytes_processed),
        }));
    }
    let triples = self.parse_content(&content)?;
    self.triples_parsed += triples.len();
    self.buffer.clear();
    self.state = ParseState::Complete;
    Ok(triples)
}
/// Splits `content` into (complete statements, trailing remainder).
///
/// Scans for statement terminators (`.` and `}`) while tracking string
/// literals, IRI references and line comments so terminator characters
/// inside them are ignored. The split is placed after the last terminator
/// found (including trailing whitespace); if none is found the whole input
/// is returned as the remainder.
fn find_statement_boundary(&self, content: &str) -> (String, String) {
    let mut last_complete = 0;
    let mut in_string = false; // inside a short literal ("..." or '...')
    let mut in_long_string = false; // inside a triple-quoted literal
    let mut in_iri = false; // inside an IRI reference (<...>)
    let mut string_quote = '\0';
    let mut chars = content.char_indices().peekable();
    while let Some((i, ch)) = chars.next() {
        if in_iri {
            // IRIs may contain '.', '#', quotes, etc. — only '>' ends them.
            // (Previously a '.' inside an IRI was treated as a statement
            // terminator, splitting statements mid-IRI.)
            if ch == '>' {
                in_iri = false;
            }
        } else if !in_string && !in_long_string && ch == '<' {
            in_iri = true;
        } else if !in_string && !in_long_string && ch == '#' {
            // Line comment: skip to end of line so '.' or quotes inside it
            // cannot be mistaken for terminators or string delimiters.
            while let Some(&(_, next_ch)) = chars.peek() {
                if next_ch == '\n' {
                    break;
                }
                chars.next();
            }
        } else if !in_string && !in_long_string && (ch == '"' || ch == '\'') {
            // Count consecutive quotes: 3 opens a long string, 2 is an
            // empty short literal (already closed), 1 opens a short one.
            let mut count = 1;
            while let Some(&(_, next_ch)) = chars.peek() {
                if next_ch == ch && count < 3 {
                    chars.next();
                    count += 1;
                } else {
                    break;
                }
            }
            if count == 3 {
                in_long_string = true;
            } else {
                in_string = count == 1;
            }
            string_quote = ch;
        } else if in_long_string && ch == string_quote {
            // A run of 3+ matching quotes closes the long string.
            let mut count = 1;
            while let Some(&(_, next_ch)) = chars.peek() {
                if next_ch == string_quote && count < 3 {
                    chars.next();
                    count += 1;
                } else {
                    break;
                }
            }
            if count >= 3 {
                in_long_string = false;
            }
        } else if in_string && ch == string_quote {
            in_string = false;
        } else if in_string && ch == '\\' {
            // Skip the escaped character so \" does not close the literal.
            chars.next();
        } else if !in_string && !in_long_string && (ch == '.' || ch == '}') {
            // Statement terminator: extend the boundary past any trailing
            // whitespace so the remainder starts at real content.
            let mut end_pos = i + ch.len_utf8();
            while let Some(&(next_i, next_ch)) = chars.peek() {
                if next_ch == ' ' || next_ch == '\t' || next_ch == '\n' || next_ch == '\r' {
                    chars.next();
                    end_pos = next_i + next_ch.len_utf8();
                } else {
                    break;
                }
            }
            last_complete = end_pos;
        }
    }
    if last_complete == 0 {
        (String::new(), content.to_string())
    } else {
        (
            content[..last_complete].to_string(),
            content[last_complete..].to_string(),
        )
    }
}
/// Parses `content` with a fresh `TurtleParser` seeded with all prefixes
/// and the base IRI accumulated so far, after first harvesting any
/// `@prefix` / `@base` directives in `content` into the accumulated state
/// (so later chunks can use prefixes declared in earlier ones).
///
/// In lenient mode parse failures are recorded in `self.errors` and an
/// empty triple list is returned; in strict mode they are propagated.
fn parse_content(&mut self, content: &str) -> TurtleResult<Vec<Triple>> {
    for line in content.lines() {
        let trimmed = line.trim();
        if let Some(rest) = trimmed.strip_prefix("@prefix") {
            let rest = rest.trim();
            if let Some(colon_pos) = rest.find(':') {
                let prefix = rest[..colon_pos].trim().to_string();
                let after_colon = rest[colon_pos + 1..].trim();
                if let Some(iri_start) = after_colon.find('<') {
                    // Search for '>' only after the '<': the previous
                    // whole-string find('>') could locate a '>' before the
                    // '<' on malformed input, producing an inverted slice
                    // range and a panic.
                    if let Some(len) = after_colon[iri_start + 1..].find('>') {
                        let iri = after_colon[iri_start + 1..iri_start + 1 + len].to_string();
                        self.prefixes.insert(prefix, iri);
                    }
                }
            }
        } else if let Some(rest) = trimmed.strip_prefix("@base") {
            let rest = rest.trim();
            if let Some(iri_start) = rest.find('<') {
                // Same inverted-slice guard as for @prefix above.
                if let Some(len) = rest[iri_start + 1..].find('>') {
                    let iri = rest[iri_start + 1..iri_start + 1 + len].to_string();
                    self.base_iri = Some(iri);
                }
            }
        }
    }
    let mut parser = TurtleParser::new();
    if self.lenient {
        parser.lenient = true;
    }
    // Seed the one-shot parser with state accumulated from earlier chunks.
    for (prefix, iri) in &self.prefixes {
        parser.prefixes.insert(prefix.clone(), iri.clone());
    }
    if let Some(base) = &self.base_iri {
        parser.base_iri = Some(base.clone());
    }
    match parser.parse_document(content) {
        Ok(triples) => Ok(triples),
        Err(e) => {
            self.state = ParseState::Error;
            if self.lenient {
                self.errors.push(e);
                Ok(Vec::new())
            } else {
                Err(e)
            }
        }
    }
}
/// Captures a snapshot of the current parser progress that can later be
/// passed to [`restore`](Self::restore).
pub fn checkpoint(&self) -> ParseCheckpoint {
    ParseCheckpoint {
        state: self.state,
        byte_offset: self.bytes_processed,
        triple_count: self.triples_parsed,
        base_iri: self.base_iri.clone(),
        prefixes: self.prefixes.clone(),
        pending_data: self.buffer.clone(),
    }
}
/// Rewinds the parser to a previously captured checkpoint.
///
/// NOTE(review): the checkpoint does not record the `eof` flag, so it is
/// approximated here — `eof` is only reconstructed when the checkpoint
/// state was `Complete`. A checkpoint taken after `push_eof` but before
/// the stream finished will restore with `eof == false`; confirm this is
/// acceptable for callers.
pub fn restore(&mut self, checkpoint: ParseCheckpoint) {
    self.bytes_processed = checkpoint.byte_offset;
    self.triples_parsed = checkpoint.triple_count;
    self.prefixes = checkpoint.prefixes;
    self.base_iri = checkpoint.base_iri;
    self.buffer = checkpoint.pending_data;
    self.state = checkpoint.state;
    self.eof = checkpoint.state == ParseState::Complete;
    // Collected lenient-mode errors are not part of the checkpoint; drop them.
    self.errors.clear();
}
/// Returns the parser to its initial state.
///
/// The `lenient` flag is configuration rather than parse state, and is
/// deliberately left untouched.
pub fn reset(&mut self) {
    self.buffer.clear();
    self.errors.clear();
    self.prefixes.clear();
    self.base_iri = None;
    self.state = ParseState::Ready;
    self.bytes_processed = 0;
    self.triples_parsed = 0;
    self.eof = false;
}
/// Number of buffered bytes not yet consumed by a successful parse.
pub fn pending_size(&self) -> usize {
    self.buffer.len()
}

/// Prefix -> IRI mappings accumulated so far.
pub fn prefixes(&self) -> &HashMap<String, String> {
    &self.prefixes
}

/// Base IRI accumulated so far, if any `@base` directive was seen.
pub fn base_iri(&self) -> Option<&str> {
    self.base_iri.as_deref()
}
}
impl Default for IncrementalParser {
    /// Equivalent to [`IncrementalParser::new`] (strict mode).
    fn default() -> Self {
        Self::new()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Two whole statements pushed, then EOF: one triple is emitted
    // (the @prefix directive does not produce a triple).
    #[test]
    fn test_incremental_basic() {
        let mut parser = IncrementalParser::new();
        parser
            .push_data(b"@prefix ex: <http://example.org/> .\n")
            .expect("operation should succeed");
        parser
            .push_data(b"ex:s ex:p \"object\" .\n")
            .expect("push data should succeed");
        parser.push_eof();
        let triples = parser.parse_available().expect("parsing should succeed");
        assert_eq!(triples.len(), 1);
        assert!(parser.is_complete());
    }

    // Statements split at arbitrary byte boundaries (even mid-IRI) must
    // still reassemble into the same single triple.
    #[test]
    fn test_incremental_chunks() {
        let mut parser = IncrementalParser::new();
        parser
            .push_data(b"@prefix ex: <")
            .expect("push data should succeed");
        parser
            .push_data(b"http://example.org/> .\n")
            .expect("push data should succeed");
        parser
            .push_data(b"ex:s ex:p ")
            .expect("push data should succeed");
        parser
            .push_data(b"\"object\" .\n")
            .expect("push data should succeed");
        parser.push_eof();
        let triples = parser.parse_available().expect("parsing should succeed");
        assert_eq!(triples.len(), 1);
    }

    // A buffer ending mid-statement parses to nothing and reports
    // Incomplete; the statement completes once the rest arrives.
    #[test]
    fn test_incremental_incomplete() {
        let mut parser = IncrementalParser::new();
        parser
            .push_data(b"@prefix ex: <http://example.org/> .\n")
            .expect("operation should succeed");
        parser
            .push_data(b"ex:s ex:p")
            .expect("push data should succeed");
        let triples = parser.parse_available().expect("parsing should succeed");
        assert!(triples.is_empty());
        assert_eq!(parser.state(), ParseState::Incomplete);
        parser
            .push_data(b" \"object\" .\n")
            .expect("push data should succeed");
        parser.push_eof();
        let triples = parser.parse_available().expect("parsing should succeed");
        assert_eq!(triples.len(), 1);
    }

    // Several complete statements in one chunk are all emitted together.
    #[test]
    fn test_incremental_multiple_triples() {
        let mut parser = IncrementalParser::new();
        parser
            .push_data(b"@prefix ex: <http://example.org/> .\n")
            .expect("operation should succeed");
        parser
            .push_data(b"ex:a ex:p \"1\" .\nex:b ex:p \"2\" .\nex:c ex:p \"3\" .\n")
            .expect("operation should succeed");
        parser.push_eof();
        let triples = parser.parse_available().expect("parsing should succeed");
        assert_eq!(triples.len(), 3);
    }

    // Restoring a checkpoint rewinds the triple count to snapshot time.
    #[test]
    fn test_checkpoint_restore() {
        let mut parser = IncrementalParser::new();
        parser
            .push_data(b"@prefix ex: <http://example.org/> .\n")
            .expect("operation should succeed");
        parser
            .push_data(b"ex:a ex:p \"1\" .\n")
            .expect("push data should succeed");
        parser.parse_available().expect("parsing should succeed");
        let checkpoint = parser.checkpoint();
        parser
            .push_data(b"ex:b ex:p \"2\" .\n")
            .expect("push data should succeed");
        parser.push_eof();
        parser.parse_available().expect("parsing should succeed");
        assert_eq!(parser.triples_parsed(), 2);
        parser.restore(checkpoint);
        assert_eq!(parser.triples_parsed(), 1);
    }

    // Lenient mode: valid input parses normally with no recorded errors,
    // and invalid input does not surface an Err from parse_available.
    #[test]
    fn test_lenient_mode() {
        let mut parser = IncrementalParser::new_lenient();
        parser
            .push_data(b"@prefix ex: <http://example.org/> .\n")
            .expect("operation should succeed");
        parser
            .push_data(b"ex:s ex:p \"object\" .\n")
            .expect("push data should succeed");
        parser.push_eof();
        let triples = parser.parse_available().expect("parsing should succeed");
        assert_eq!(triples.len(), 1);
        assert!(parser.errors().is_empty());
        let mut parser2 = IncrementalParser::new_lenient();
        parser2
            .push_data(b"invalid syntax here\n")
            .expect("push data should succeed");
        parser2.push_eof();
        let _ = parser2.parse_available().expect("parsing should succeed");
    }

    // Prefixes declared in earlier chunks remain available after later
    // chunks add more.
    #[test]
    fn test_prefix_accumulation() {
        let mut parser = IncrementalParser::new();
        parser
            .push_data(b"@prefix ex: <http://example.org/> .\n")
            .expect("operation should succeed");
        parser.parse_available().expect("parsing should succeed");
        assert!(parser.prefixes().contains_key("ex"));
        parser
            .push_data(b"@prefix foaf: <http://xmlns.com/foaf/0.1/> .\n")
            .expect("operation should succeed");
        parser.parse_available().expect("parsing should succeed");
        assert!(parser.prefixes().contains_key("ex"));
        assert!(parser.prefixes().contains_key("foaf"));
    }

    // A triple-quoted literal spanning a newline must not be split by the
    // statement-boundary scanner.
    #[test]
    fn test_multiline_string() {
        let mut parser = IncrementalParser::new();
        parser
            .push_data(b"@prefix ex: <http://example.org/> .\n")
            .expect("operation should succeed");
        parser
            .push_data(b"ex:s ex:p \"\"\"hello\nworld\"\"\" .\n")
            .expect("operation should succeed");
        parser.push_eof();
        let triples = parser.parse_available().expect("parsing should succeed");
        assert_eq!(triples.len(), 1);
    }

    // reset() zeroes counters and clears the completion flag.
    #[test]
    fn test_reset() {
        let mut parser = IncrementalParser::new();
        parser
            .push_data(b"@prefix ex: <http://example.org/> .\n")
            .expect("operation should succeed");
        parser
            .push_data(b"ex:s ex:p \"object\" .\n")
            .expect("push data should succeed");
        parser.push_eof();
        parser.parse_available().expect("parsing should succeed");
        assert_eq!(parser.triples_parsed(), 1);
        parser.reset();
        assert_eq!(parser.triples_parsed(), 0);
        assert_eq!(parser.bytes_processed(), 0);
        assert!(!parser.is_complete());
    }

    // Pushing data after EOF must be rejected.
    #[test]
    fn test_eof_error() {
        let mut parser = IncrementalParser::new();
        parser.push_eof();
        let result = parser.push_data(b"data after eof");
        assert!(result.is_err());
    }

    // bytes_processed counts pushed bytes exactly: the two chunks are
    // 36 and 21 bytes long (36, then 36 + 21 = 57).
    #[test]
    fn test_progress_tracking() {
        let mut parser = IncrementalParser::new();
        parser
            .push_data(b"@prefix ex: <http://example.org/> .\n")
            .expect("operation should succeed");
        assert_eq!(parser.bytes_processed(), 36);
        parser
            .push_data(b"ex:s ex:p \"object\" .\n")
            .expect("push data should succeed");
        assert_eq!(parser.bytes_processed(), 57);
        parser.push_eof();
        parser.parse_available().expect("parsing should succeed");
        assert_eq!(parser.triples_parsed(), 1);
    }
}