oxirs_core/format/
turtle.rs1use super::error::SerializeResult;
7use super::error::{ParseResult, RdfParseError};
8use super::serializer::QuadSerializer;
9use crate::model::{QuadRef, Triple, TripleRef};
10use std::collections::HashMap;
11use std::io::{Read, Write};
12
/// Configuration for parsing Turtle documents into triples.
///
/// Values are set through the builder-style methods on the type and read back
/// through its accessor methods.
#[derive(Debug, Clone)]
pub struct TurtleParser {
    /// Whether parsing runs in lenient mode.
    lenient: bool,
    /// Optional base IRI supplied by the caller.
    base_iri: Option<String>,
    /// Prefix name to namespace IRI pairs registered by the caller.
    prefixes: HashMap<String, String>,
}
20
21impl TurtleParser {
22 pub fn new() -> Self {
24 Self {
25 lenient: false,
26 base_iri: None,
27 prefixes: HashMap::new(),
28 }
29 }
30
31 pub fn lenient(mut self) -> Self {
33 self.lenient = true;
34 self
35 }
36
37 pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Self {
39 self.base_iri = Some(base_iri.into());
40 self
41 }
42
43 pub fn with_prefix(mut self, prefix: impl Into<String>, iri: impl Into<String>) -> Self {
45 self.prefixes.insert(prefix.into(), iri.into());
46 self
47 }
48
49 pub fn parse_reader<R: Read>(&self, mut reader: R) -> ParseResult<Vec<Triple>> {
51 let mut buffer = String::new();
53 reader.read_to_string(&mut buffer)?;
54
55 self.parse_str(&buffer)
59 }
60
61 pub fn parse_slice(&self, slice: &[u8]) -> ParseResult<Vec<Triple>> {
63 let content = std::str::from_utf8(slice)
66 .map_err(|e| RdfParseError::syntax(format!("Invalid UTF-8: {e}")))?;
67
68 self.parse_str(content)
69 }
70
71 pub fn parse_str(&self, input: &str) -> ParseResult<Vec<Triple>> {
73 use super::parser::helpers::convert_quad;
74 use std::io::Cursor;
75
76 let mut oxttl_parser = oxttl::TurtleParser::new();
78
79 if let Some(ref base) = self.base_iri {
81 oxttl_parser = oxttl_parser
82 .with_base_iri(base.as_str())
83 .unwrap_or_else(|_| oxttl::TurtleParser::new());
84 }
85
86 if self.lenient {
88 oxttl_parser = oxttl_parser.lenient();
89 }
90
91 let reader = Cursor::new(input.as_bytes());
93 let mut triples = Vec::new();
94
95 for result in oxttl_parser.for_reader(reader) {
96 match result {
97 Ok(triple) => {
98 let quad = oxrdf::Quad::new(
100 triple.subject,
101 triple.predicate,
102 triple.object,
103 oxrdf::GraphName::DefaultGraph,
104 );
105 let oxirs_quad = convert_quad(quad)?;
106 triples.push(oxirs_quad.to_triple());
107 }
108 Err(e) => {
109 if !self.lenient {
110 return Err(RdfParseError::syntax(e.to_string()));
111 }
112 }
114 }
115 }
116
117 Ok(triples)
118 }
119
120 pub fn prefixes(&self) -> &HashMap<String, String> {
122 &self.prefixes
123 }
124
125 pub fn base_iri(&self) -> Option<&str> {
127 self.base_iri.as_deref()
128 }
129
130 pub fn is_lenient(&self) -> bool {
132 self.lenient
133 }
134}
135
136impl Default for TurtleParser {
137 fn default() -> Self {
138 Self::new()
139 }
140}
141
/// Configuration for writing triples as Turtle.
///
/// Turn it into a streaming writer with `for_writer`, or render a slice of
/// triples in one go with `serialize_to_string`.
#[derive(Debug, Clone)]
pub struct TurtleSerializer {
    /// Optional base IRI emitted as an `@base` directive.
    base_iri: Option<String>,
    /// Prefix name to namespace IRI pairs emitted as `@prefix` directives and
    /// used to abbreviate IRIs.
    prefixes: HashMap<String, String>,
    /// Selects the pretty statement layout instead of the compact one.
    pretty: bool,
}
149
150impl TurtleSerializer {
151 pub fn new() -> Self {
153 Self {
154 base_iri: None,
155 prefixes: HashMap::new(),
156 pretty: false,
157 }
158 }
159
160 pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Self {
162 self.base_iri = Some(base_iri.into());
163 self
164 }
165
166 pub fn with_prefix(mut self, prefix: impl Into<String>, iri: impl Into<String>) -> Self {
168 self.prefixes.insert(prefix.into(), iri.into());
169 self
170 }
171
172 pub fn pretty(mut self) -> Self {
174 self.pretty = true;
175 self
176 }
177
178 pub fn for_writer<W: Write>(self, writer: W) -> WriterTurtleSerializer<W> {
180 WriterTurtleSerializer::new(writer, self)
181 }
182
183 pub fn serialize_to_string(&self, triples: &[Triple]) -> SerializeResult<String> {
185 let mut buffer = Vec::new();
186 {
187 let mut serializer = self.clone().for_writer(&mut buffer);
188 for triple in triples {
189 serializer.serialize_triple(triple.as_ref())?;
190 }
191 serializer.finish()?;
192 }
193 String::from_utf8(buffer)
194 .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
195 }
196
197 pub fn prefixes(&self) -> &HashMap<String, String> {
199 &self.prefixes
200 }
201
202 pub fn base_iri(&self) -> Option<&str> {
204 self.base_iri.as_deref()
205 }
206
207 pub fn is_pretty(&self) -> bool {
209 self.pretty
210 }
211}
212
213impl Default for TurtleSerializer {
214 fn default() -> Self {
215 Self::new()
216 }
217}
218
219pub struct WriterTurtleSerializer<W: Write> {
221 writer: W,
222 config: TurtleSerializer,
223 headers_written: bool,
224}
225
226impl<W: Write> WriterTurtleSerializer<W> {
227 pub fn new(writer: W, config: TurtleSerializer) -> Self {
229 Self {
230 writer,
231 config,
232 headers_written: false,
233 }
234 }
235
236 pub fn serialize_triple(&mut self, triple: TripleRef<'_>) -> SerializeResult<()> {
238 self.ensure_headers_written()?;
239
240 let subject_str = self.serialize_subject(triple.subject())?;
242
243 let predicate_str = self.serialize_predicate(triple.predicate())?;
245
246 let object_str = self.serialize_object(triple.object())?;
248
249 if self.config.pretty {
251 writeln!(self.writer, "{subject_str} {predicate_str} {object_str} .")?;
252 } else {
253 writeln!(self.writer, "{subject_str} {predicate_str} {object_str}.")?;
254 }
255
256 Ok(())
257 }
258
259 fn serialize_subject(&self, subject: crate::model::SubjectRef<'_>) -> SerializeResult<String> {
261 use crate::model::SubjectRef;
262
263 match subject {
264 SubjectRef::NamedNode(node) => self.serialize_named_node(node.into()),
265 SubjectRef::BlankNode(node) => {
266 let node_str = node.as_str();
267 Ok(format!("_:{node_str}"))
268 }
269 SubjectRef::Variable(var) => {
270 let var_str = var.as_str();
271 Ok(format!("?{var_str}"))
272 }
273 }
274 }
275
276 fn serialize_predicate(
278 &self,
279 predicate: crate::model::PredicateRef<'_>,
280 ) -> SerializeResult<String> {
281 use crate::model::PredicateRef;
282
283 match predicate {
284 PredicateRef::NamedNode(node) => {
285 if node.as_str() == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" {
287 Ok("a".to_string())
288 } else {
289 self.serialize_named_node(node.into())
290 }
291 }
292 PredicateRef::Variable(var) => {
293 let var_str = var.as_str();
294 Ok(format!("?{var_str}"))
295 }
296 }
297 }
298
299 fn serialize_object(&self, object: crate::model::ObjectRef<'_>) -> SerializeResult<String> {
301 use crate::model::ObjectRef;
302
303 match object {
304 ObjectRef::NamedNode(node) => self.serialize_named_node(node.into()),
305 ObjectRef::BlankNode(node) => {
306 let node_str = node.as_str();
307 Ok(format!("_:{node_str}"))
308 }
309 ObjectRef::Literal(literal) => self.serialize_literal(literal),
310 ObjectRef::Variable(var) => {
311 let var_str = var.as_str();
312 Ok(format!("?{var_str}"))
313 }
314 }
315 }
316
317 fn serialize_named_node(
319 &self,
320 node: crate::model::NamedNodeRef<'_>,
321 ) -> SerializeResult<String> {
322 let iri = node.as_str();
323
324 for (prefix, namespace) in &self.config.prefixes {
326 if iri.starts_with(namespace) {
327 let local = &iri[namespace.len()..];
328 if is_valid_local_name(local) {
330 return Ok(format!("{prefix}:{local}"));
331 }
332 }
333 }
334
335 Ok(format!("<{iri}>"))
337 }
338
339 fn serialize_literal(&self, literal: &crate::model::Literal) -> SerializeResult<String> {
341 let value = literal.value();
342
343 let escaped_value = escape_turtle_string(value);
345
346 if let Some(lang) = literal.language() {
348 return Ok(format!("\"{escaped_value}\"@{lang}"));
349 }
350
351 let datatype = literal.datatype();
353 if datatype.as_str() == "http://www.w3.org/2001/XMLSchema#string" {
354 Ok(format!("\"{escaped_value}\""))
356 } else {
357 let datatype_str = self.serialize_named_node(datatype)?;
359 Ok(format!("\"{escaped_value}\"^^{datatype_str}"))
360 }
361 }
362
363 pub fn finish(self) -> SerializeResult<W> {
365 Ok(self.writer)
366 }
367
368 fn ensure_headers_written(&mut self) -> SerializeResult<()> {
370 if self.headers_written {
371 return Ok(());
372 }
373
374 if let Some(base) = &self.config.base_iri {
376 writeln!(self.writer, "@base <{base}> .")?;
377 }
378
379 for (prefix, iri) in &self.config.prefixes {
381 writeln!(self.writer, "@prefix {prefix}: <{iri}> .")?;
382 }
383
384 if self.config.base_iri.is_some() || !self.config.prefixes.is_empty() {
386 writeln!(self.writer)?;
387 }
388
389 self.headers_written = true;
390 Ok(())
391 }
392}
393
394impl<W: Write> QuadSerializer<W> for WriterTurtleSerializer<W> {
395 fn serialize_quad(&mut self, quad: QuadRef<'_>) -> SerializeResult<()> {
396 if quad.graph_name().is_default_graph() {
398 self.serialize_triple(quad.triple())
399 } else {
400 Ok(())
402 }
403 }
404
405 fn finish(self: Box<Self>) -> SerializeResult<W> {
406 Ok(self.writer)
407 }
408}
409
410fn is_valid_local_name(local: &str) -> bool {
412 if local.is_empty() {
413 return true; }
415
416 let first_char = local
418 .chars()
419 .next()
420 .expect("local name validated to be non-empty");
421 if !is_pn_chars_base(first_char) && first_char != '_' {
422 return false;
423 }
424
425 for ch in local.chars().skip(1) {
427 if !is_pn_chars(ch) && ch != '.' && ch != '-' {
428 return false;
429 }
430 }
431
432 !local.ends_with('.')
434}
435
/// Returns `true` when `ch` belongs to the Turtle grammar production
/// `PN_CHARS_BASE`.
///
/// The production covers ASCII letters, the listed BMP ranges and the
/// supplementary planes `U+10000..=U+EFFFF`.
fn is_pn_chars_base(ch: char) -> bool {
    matches!(
        ch,
        'A'..='Z'
            | 'a'..='z'
            | '\u{00C0}'..='\u{00D6}'
            | '\u{00D8}'..='\u{00F6}'
            | '\u{00F8}'..='\u{02FF}'
            | '\u{0370}'..='\u{037D}'
            | '\u{037F}'..='\u{1FFF}'
            | '\u{200C}'..='\u{200D}'
            | '\u{2070}'..='\u{218F}'
            | '\u{2C00}'..='\u{2FEF}'
            | '\u{3001}'..='\u{D7FF}'
            | '\u{F900}'..='\u{FDCF}'
            | '\u{FDF0}'..='\u{FFFD}'
            | '\u{10000}'..='\u{EFFFF}'
    )
}
451
452fn is_pn_chars(ch: char) -> bool {
454 is_pn_chars_base(ch)
455 || ch == '_'
456 || ch.is_ascii_digit()
457 || ch == '\u{00B7}'
458 || ('\u{0300}'..='\u{036F}').contains(&ch)
459 || ('\u{203F}'..='\u{2040}').contains(&ch)
460}
461
/// Escapes `input` for use inside a double-quoted Turtle string literal.
///
/// Quote, backslash, newline, carriage return, tab, backspace and form feed
/// get their two-character escapes; any other control character is written as
/// `\uXXXX` with four uppercase hex digits; everything else is copied as-is.
fn escape_turtle_string(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());

    for ch in input.chars() {
        // Characters that have a dedicated short escape sequence.
        let short_form = match ch {
            '"' => Some("\\\""),
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            '\x08' => Some("\\b"),
            '\x0C' => Some("\\f"),
            _ => None,
        };

        if let Some(sequence) = short_form {
            escaped.push_str(sequence);
        } else if ch.is_control() {
            escaped.push_str(&format!("\\u{:04X}", ch as u32));
        } else {
            escaped.push(ch);
        }
    }

    escaped
}
486
#[cfg(test)]
mod tests {
    use super::*;

    /// A new parser is strict and carries no base IRI or prefixes.
    #[test]
    fn test_turtle_parser_creation() {
        let fresh = TurtleParser::new();
        assert!(!fresh.is_lenient());
        assert_eq!(fresh.base_iri(), None);
        assert!(fresh.prefixes().is_empty());
    }

    /// Builder calls on the parser are visible through its accessors.
    #[test]
    fn test_turtle_parser_configuration() {
        let configured = TurtleParser::new()
            .lenient()
            .with_base_iri("http://example.org/")
            .with_prefix("ex", "http://example.org/ns#");

        assert!(configured.is_lenient());
        assert_eq!(configured.base_iri(), Some("http://example.org/"));
        assert_eq!(
            configured.prefixes().get("ex").map(String::as_str),
            Some("http://example.org/ns#")
        );
    }

    /// A new serializer is compact and carries no base IRI or prefixes.
    #[test]
    fn test_turtle_serializer_creation() {
        let fresh = TurtleSerializer::new();
        assert!(!fresh.is_pretty());
        assert_eq!(fresh.base_iri(), None);
        assert!(fresh.prefixes().is_empty());
    }

    /// Builder calls on the serializer are visible through its accessors.
    #[test]
    fn test_turtle_serializer_configuration() {
        let configured = TurtleSerializer::new()
            .pretty()
            .with_base_iri("http://example.org/")
            .with_prefix("ex", "http://example.org/ns#");

        assert!(configured.is_pretty());
        assert_eq!(configured.base_iri(), Some("http://example.org/"));
        assert_eq!(
            configured.prefixes().get("ex").map(String::as_str),
            Some("http://example.org/ns#")
        );
    }

    /// An empty document parses successfully to zero triples.
    #[test]
    fn test_empty_turtle_parsing() {
        let outcome = TurtleParser::new().parse_str("");
        assert!(outcome.is_ok());
        assert!(outcome.expect("should have value").is_empty());
    }

    /// A document made only of comments parses successfully to zero triples.
    #[test]
    fn test_turtle_comments() {
        let document = "# This is a comment\n# Another comment";
        let outcome = TurtleParser::new().parse_str(document);
        assert!(outcome.is_ok());
        assert!(outcome.expect("should have value").is_empty());
    }
}