use std::io::BufRead;
use std::path::Path;
use num_rational::Ratio;
use crate::attribute::Attribute;
use crate::blob::schemas::longstring::LongString;
use crate::id::{ExclusiveId, Id};
use crate::macros::entity;
use crate::prelude::valueschemas;
use crate::repo::{BlobStore, Workspace};
use crate::trible::{Trible, TribleSet};
use crate::value::schemas::hash::{Blake3, Handle};
use crate::value::{ToValue, Value};
/// Namespace prefix of the XML Schema datatype IRIs; `typed_literal` only
/// interprets datatypes whose IRI starts with this prefix.
const XSD: &str = "http://www.w3.org/2001/XMLSchema#";
/// Value of an RDF literal after its datatype has been interpreted by
/// `typed_literal`.
enum RdfLiteral {
/// Plain text. Also the fallback for unknown datatypes and for typed
/// literals whose lexical form fails to parse.
Text(String),
/// Value of a signed XSD integer type (e.g. `integer`, `int`, `long`).
SignedInt(i128),
/// Value of an unsigned XSD integer type (e.g. `nonNegativeInteger`).
UnsignedInt(u128),
/// Value of an `xsd:decimal`, kept as an exact ratio.
Decimal(Ratio<i128>),
/// Value of an `xsd:float` or `xsd:double`.
Float(f64),
/// Value of an `xsd:boolean`.
Bool(bool),
}
/// Object position of a parsed N-Triples statement.
enum NtObject {
/// An IRI reference; holds the text between `<` and `>`.
Uri(String),
/// A quoted literal, already converted to its typed representation.
Literal(RdfLiteral),
}
/// Parses one N-Triples line into `(subject IRI, predicate IRI, object)`.
///
/// Returns `None` for blank lines, `#` comments, and any line whose subject or
/// predicate is not an `<iri>` or whose object is neither an `<iri>` nor a
/// quoted literal. Whatever follows the object (including the terminating
/// `.`) is not inspected.
fn parse_line(line: &str) -> Option<(String, String, NtObject)> {
    let statement = line.trim();
    if statement.is_empty() || statement.starts_with('#') {
        return None;
    }

    let (subject, after_subject) = parse_uri(statement)?;
    let (predicate, after_predicate) = parse_uri(after_subject.trim_start())?;
    let object_src = after_predicate.trim_start();

    // The first character decides which kind of object term follows.
    let object = match object_src.chars().next()? {
        '<' => {
            let (iri, _) = parse_uri(object_src)?;
            NtObject::Uri(iri)
        }
        '"' => {
            let (lexical, datatype) = parse_literal_with_datatype(object_src)?;
            NtObject::Literal(typed_literal(lexical, datatype.as_deref()))
        }
        _ => return None,
    };

    Some((subject, predicate, object))
}
/// Parses an `<iri>` term at the very start of `input`.
///
/// Returns the text between the angle brackets and the remainder of `input`
/// after the closing `>`, or `None` when `input` does not start with `<` or
/// has no closing `>`.
fn parse_uri(input: &str) -> Option<(String, &str)> {
    let after_open = input.strip_prefix('<')?;
    let (iri, remainder) = after_open.split_once('>')?;
    Some((iri.to_owned(), remainder))
}
/// Parses a double-quoted N-Triples literal at the start of `input`.
///
/// Returns the unescaped lexical form and, when the closing quote is
/// immediately followed by `^^<iri>`, the datatype IRI. Anything else after
/// the closing quote (for example a language tag) is ignored.
///
/// Returns `None` when `input` does not start with `"`, when the literal is
/// not terminated, or when `^^` is not followed by a well-formed `<iri>`.
///
/// The escapes `\t \b \n \r \f \" \' \\`, `\uXXXX` and `\UXXXXXXXX` are
/// decoded. Unknown or malformed escapes are kept verbatim (backslash
/// included).
fn parse_literal_with_datatype(input: &str) -> Option<(String, Option<String>)> {
    let body = input.strip_prefix('"')?;
    let mut text = String::new();
    // Walk by `char`, not by byte: casting individual UTF-8 bytes to `char`
    // would turn every multi-byte character into Latin-1 garbage.
    let mut chars = body.char_indices();
    while let Some((pos, ch)) = chars.next() {
        match ch {
            '\\' => {
                // A backslash with nothing after it means the closing quote is missing.
                let (esc_pos, esc) = chars.next()?;
                match esc {
                    'n' => text.push('\n'),
                    't' => text.push('\t'),
                    'r' => text.push('\r'),
                    'b' => text.push('\u{0008}'),
                    'f' => text.push('\u{000C}'),
                    '"' => text.push('"'),
                    '\'' => text.push('\''),
                    '\\' => text.push('\\'),
                    'u' | 'U' => {
                        let width = if esc == 'u' { 4 } else { 8 };
                        let hex_start = esc_pos + 1;
                        let decoded = body
                            .get(hex_start..hex_start + width)
                            .filter(|hex| hex.bytes().all(|b| b.is_ascii_hexdigit()))
                            .and_then(|hex| u32::from_str_radix(hex, 16).ok())
                            .and_then(char::from_u32);
                        match decoded {
                            Some(c) => {
                                text.push(c);
                                // The hex digits are ASCII, so one `char` each.
                                for _ in 0..width {
                                    chars.next();
                                }
                            }
                            None => {
                                text.push('\\');
                                text.push(esc);
                            }
                        }
                    }
                    other => {
                        text.push('\\');
                        text.push(other);
                    }
                }
            }
            '"' => {
                // `"` is a single byte, so `pos + 1` is a valid char boundary.
                let rest = &body[pos + 1..];
                return match rest.strip_prefix("^^") {
                    Some(datatype_src) => {
                        let (datatype, _) = parse_uri(datatype_src)?;
                        Some((text, Some(datatype)))
                    }
                    None => Some((text, None)),
                };
            }
            other => text.push(other),
        }
    }
    None
}
/// Parses a decimal lexical form such as `"3.14"`, `"-0.5"`, `".5"` or `"42"`
/// into an exact ratio.
///
/// Returns `None` when the text is not a decimal number: non-digit characters,
/// more than one `.`, a sign inside the fractional part, or a value whose
/// numerator or power-of-ten denominator does not fit in `i128`.
fn parse_decimal(s: &str) -> Option<Ratio<i128>> {
    let Some((int_part, frac_part)) = s.split_once('.') else {
        let n: i128 = s.parse().ok()?;
        return Some(Ratio::from_integer(n));
    };
    // Only digits may follow the point. This rejects inputs like "1.2.3" or
    // ".-5" that would otherwise parse once the dot is removed.
    if !frac_part.bytes().all(|b| b.is_ascii_digit()) {
        return None;
    }
    let scale = u32::try_from(frac_part.len()).ok()?;
    let denominator = 10i128.checked_pow(scale)?;

    // Integer part (with its optional sign) followed by the fractional digits
    // is the numerator over 10^scale.
    let mut digits = String::with_capacity(int_part.len() + frac_part.len());
    digits.push_str(int_part);
    digits.push_str(frac_part);
    let numerator: i128 = digits.parse().ok()?;

    Some(Ratio::new(numerator, denominator))
}
/// Converts a literal's lexical form into a typed [`RdfLiteral`] according to
/// its datatype IRI.
///
/// Only datatypes in the XML Schema namespace are interpreted. A missing or
/// unrecognised datatype, or a lexical form that fails to parse for the
/// requested type, yields `RdfLiteral::Text` carrying the original text.
fn typed_literal(text: String, datatype: Option<&str>) -> RdfLiteral {
    let Some(local) = datatype.and_then(|dt| dt.strip_prefix(XSD)) else {
        return RdfLiteral::Text(text);
    };

    // `None` means "could not interpret"; the single fallback below turns that
    // into a plain text literal.
    let typed = match local {
        "integer" | "int" | "long" | "short" | "byte" | "negativeInteger"
        | "nonPositiveInteger" => text.parse::<i128>().ok().map(RdfLiteral::SignedInt),
        "nonNegativeInteger" | "positiveInteger" | "unsignedInt" | "unsignedLong"
        | "unsignedShort" | "unsignedByte" => {
            text.parse::<u128>().ok().map(RdfLiteral::UnsignedInt)
        }
        "decimal" => parse_decimal(&text).map(RdfLiteral::Decimal),
        "float" | "double" => text.parse::<f64>().ok().map(RdfLiteral::Float),
        "boolean" => match text.as_str() {
            "true" | "1" => Some(RdfLiteral::Bool(true)),
            "false" | "0" => Some(RdfLiteral::Bool(false)),
            _ => None,
        },
        _ => None,
    };

    typed.unwrap_or_else(|| RdfLiteral::Text(text))
}
/// Returns the entity id associated with `uri`.
///
/// The URI text is stored through `ws.put`, an entity carrying the resulting
/// handle under `crate::import::rdf_uri` is built, and that entity's root id
/// is returned.
///
/// # Panics
///
/// Panics if the constructed entity reports no root id.
fn uri_to_id<Blobs>(ws: &mut Workspace<Blobs>, uri: &str) -> Id
where
    Blobs: BlobStore<Blake3>,
{
    let uri_handle: Value<Handle<Blake3, LongString>> = ws.put(String::from(uri));
    // Bind the entity first: a brace-delimited macro call at statement
    // position cannot be followed directly by a method call.
    let uri_entity = entity! { crate::import::rdf_uri: uri_handle };
    uri_entity.root().expect("intrinsic URI entity")
}
pub fn ingest_ntriples<Blobs>(
ws: &mut Workspace<Blobs>,
reader: impl BufRead,
) -> (TribleSet, usize)
where
Blobs: BlobStore<Blake3>,
{
let mut facts = TribleSet::new();
let mut count = 0;
for line in reader.lines() {
let Ok(line) = line else { continue };
let Some((subject, predicate, object)) = parse_line(&line) else {
continue;
};
let subject_id = uri_to_id(ws, &subject);
let sub_h: Value<Handle<Blake3, LongString>> = ws.put(subject.to_owned());
facts += entity! { crate::import::rdf_uri: sub_h };
let e = ExclusiveId::force_ref(&subject_id);
match object {
NtObject::Uri(ref obj_uri) => {
let attr = Attribute::<valueschemas::GenId>::from_name(&predicate);
let obj_id = uri_to_id(ws, obj_uri);
let obj_h: Value<Handle<Blake3, LongString>> = ws.put(obj_uri.to_owned());
facts += entity! { crate::import::rdf_uri: obj_h };
facts.insert(&Trible::new(e, &attr.id(), &obj_id.to_value()));
}
NtObject::Literal(RdfLiteral::Text(ref text)) => {
let attr = Attribute::<Handle<Blake3, LongString>>::from_name(&predicate);
let handle: Value<Handle<Blake3, LongString>> = ws.put(text.to_owned());
facts.insert(&Trible::new(e, &attr.id(), &handle));
}
NtObject::Literal(RdfLiteral::SignedInt(val)) => {
let attr = Attribute::<valueschemas::I256BE>::from_name(&predicate);
let v: Value<valueschemas::I256BE> = val.to_value();
facts.insert(&Trible::new(e, &attr.id(), &v));
}
NtObject::Literal(RdfLiteral::UnsignedInt(val)) => {
let attr = Attribute::<valueschemas::U256BE>::from_name(&predicate);
let v: Value<valueschemas::U256BE> = val.to_value();
facts.insert(&Trible::new(e, &attr.id(), &v));
}
NtObject::Literal(RdfLiteral::Decimal(val)) => {
let attr = Attribute::<valueschemas::R256BE>::from_name(&predicate);
let v: Value<valueschemas::R256BE> = val.to_value();
facts.insert(&Trible::new(e, &attr.id(), &v));
}
NtObject::Literal(RdfLiteral::Float(val)) => {
let attr = Attribute::<valueschemas::F64>::from_name(&predicate);
facts.insert(&Trible::new(e, &attr.id(), &val.to_value()));
}
NtObject::Literal(RdfLiteral::Bool(val)) => {
let attr = Attribute::<valueschemas::Boolean>::from_name(&predicate);
facts.insert(&Trible::new(e, &attr.id(), &val.to_value()));
}
}
count += 1;
}
(facts, count)
}
/// Opens the file at `path` and ingests it as N-Triples via
/// [`ingest_ntriples`], reading through a buffered reader.
///
/// # Errors
///
/// Returns the I/O error if the file cannot be opened.
pub fn ingest_ntriples_file<Blobs>(
    ws: &mut Workspace<Blobs>,
    path: &Path,
) -> Result<(TribleSet, usize), std::io::Error>
where
    Blobs: BlobStore<Blake3>,
{
    std::fs::File::open(path)
        .map(std::io::BufReader::new)
        .map(|buffered| ingest_ntriples(ws, buffered))
}
// Unit tests for the line-level parser and its helpers. They exercise
// `parse_line` and `parse_decimal` only; ingestion into a workspace is not
// covered here.
#[cfg(test)]
mod tests {
use super::*;
// A statement whose object is an IRI yields the three IRIs without brackets.
#[test]
fn parse_uri_triple() {
let line = r#"<http://example.org/s> <http://example.org/p> <http://example.org/o> ."#;
let (s, p, o) = parse_line(line).unwrap();
assert_eq!(s, "http://example.org/s");
assert_eq!(p, "http://example.org/p");
assert!(matches!(o, NtObject::Uri(ref u) if u == "http://example.org/o"));
}
// A quoted literal without a datatype stays plain text.
#[test]
fn parse_string_literal() {
let line = r#"<http://example.org/s> <http://example.org/p> "hello" ."#;
let (_, _, o) = parse_line(line).unwrap();
assert!(matches!(o, NtObject::Literal(RdfLiteral::Text(ref t)) if t == "hello"));
}
// xsd:integer maps to the signed integer variant.
#[test]
fn parse_integer_to_i128() {
let line = r#"<http://example.org/s> <http://example.org/p> "42"^^<http://www.w3.org/2001/XMLSchema#integer> ."#;
let (_, _, o) = parse_line(line).unwrap();
assert!(matches!(o, NtObject::Literal(RdfLiteral::SignedInt(42))));
}
// xsd:nonNegativeInteger maps to the unsigned integer variant.
#[test]
fn parse_unsigned_integer() {
let line = r#"<http://example.org/s> <http://example.org/p> "100"^^<http://www.w3.org/2001/XMLSchema#nonNegativeInteger> ."#;
let (_, _, o) = parse_line(line).unwrap();
assert!(matches!(o, NtObject::Literal(RdfLiteral::UnsignedInt(100))));
}
// xsd:decimal becomes an exact ratio; 3.14 is expected in lowest terms as 157/50.
#[test]
fn parse_decimal_to_ratio() {
let line = r#"<http://example.org/s> <http://example.org/p> "3.14"^^<http://www.w3.org/2001/XMLSchema#decimal> ."#;
let (_, _, o) = parse_line(line).unwrap();
match o {
NtObject::Literal(RdfLiteral::Decimal(r)) => {
assert_eq!(*r.numer(), 157);
assert_eq!(*r.denom(), 50);
}
_ => panic!("expected Decimal"),
}
}
// xsd:double maps to the float variant; compared with a tolerance.
#[test]
fn parse_double_to_f64() {
let line = r#"<http://example.org/s> <http://example.org/p> "2.718"^^<http://www.w3.org/2001/XMLSchema#double> ."#;
let (_, _, o) = parse_line(line).unwrap();
assert!(matches!(o, NtObject::Literal(RdfLiteral::Float(v)) if (v - 2.718).abs() < 0.001));
}
// xsd:boolean "true" maps to Bool(true).
#[test]
fn parse_boolean() {
let line = r#"<http://example.org/s> <http://example.org/p> "true"^^<http://www.w3.org/2001/XMLSchema#boolean> ."#;
let (_, _, o) = parse_line(line).unwrap();
assert!(matches!(o, NtObject::Literal(RdfLiteral::Bool(true))));
}
// Direct checks of `parse_decimal`: fractional, integral and negative inputs.
#[test]
fn decimal_parse_helper() {
let r = parse_decimal("3.14").unwrap();
assert_eq!(*r.numer(), 157);
assert_eq!(*r.denom(), 50);
let r = parse_decimal("42").unwrap();
assert_eq!(*r.numer(), 42);
assert_eq!(*r.denom(), 1);
let r = parse_decimal("-0.5").unwrap();
assert_eq!(*r.numer(), -1);
assert_eq!(*r.denom(), 2);
}
// Comment lines and empty lines produce no statement.
#[test]
fn skip_comments_and_blank() {
assert!(parse_line("# comment").is_none());
assert!(parse_line("").is_none());
}
}