use crate::datatypes::values::Value;
use crate::graph::schema::InternedKey;
use crate::graph::storage::mapped::mmap_vec::MmapOrVec;
use std::collections::{HashMap, HashSet};
/// Subject position of a parsed triple.
pub(super) enum Subject<'a> {
    /// An IRI in the Wikidata entity namespace whose local part starts with
    /// `Q`. Carries that local part (for example `Q42`), borrowed from the
    /// input line.
    Entity(&'a str),
    /// Any other subject IRI.
    Other,
}
/// Predicate position of a parsed triple.
pub(super) enum Predicate<'a> {
    /// An IRI in the Wikidata "direct property" namespace whose local part
    /// starts with `P`. Carries that local part (for example `P31`),
    /// borrowed from the input line.
    WikidataDirect(&'a str),
    /// `rdfs:label`.
    Label,
    /// `schema:description`.
    Description,
    /// `skos:altLabel`.
    AltLabel,
    /// `rdf:type`.
    Type,
    /// Any other predicate IRI.
    Other,
}
/// Object position of a parsed triple.
pub(super) enum Object<'a> {
    /// An IRI in the Wikidata entity namespace whose local part starts with
    /// `Q`. Carries that local part, borrowed from the input line.
    Entity(&'a str),
    /// A quoted literal with no language tag or datatype; holds the
    /// unescaped text.
    Literal(String),
    /// A language-tagged literal: unescaped text plus the tag that followed
    /// the `@` sign.
    LangLiteral(String, &'a str),
    /// A datatyped literal: unescaped text plus the datatype IRI that
    /// followed `^^<`.
    TypedLiteral(String, &'a str),
    /// Anything else (non-entity IRIs, unparsable terms, ...).
    Other,
}
/// A batch of lines stored back to back in a single byte buffer.
///
/// `offsets[i]` is the byte position in `data` where line `i` starts; line
/// `i` ends where line `i + 1` starts, or at the end of `data` for the last
/// line.
pub(super) struct LineBuffer {
    /// Concatenated bytes of every pushed line.
    pub(super) data: Vec<u8>,
    /// Start offset of each line inside `data`.
    pub(super) offsets: Vec<u32>,
}

impl LineBuffer {
    /// Creates an empty buffer with room reserved for `line_cap` lines and
    /// `byte_cap` bytes of line data.
    pub(super) fn with_capacity(line_cap: usize, byte_cap: usize) -> Self {
        Self {
            data: Vec::with_capacity(byte_cap),
            offsets: Vec::with_capacity(line_cap),
        }
    }

    /// Returns `true` when no line has been pushed.
    pub(super) fn is_empty(&self) -> bool {
        self.offsets.is_empty()
    }

    /// Appends `line` (copied verbatim) as a new entry.
    ///
    /// # Panics
    ///
    /// Panics if the buffer already holds more than `u32::MAX` bytes, since
    /// the start offset could then no longer be represented. A plain `as`
    /// cast would wrap around silently and corrupt every later lookup.
    pub(super) fn push_line(&mut self, line: &[u8]) {
        let start = u32::try_from(self.data.len())
            .expect("LineBuffer holds more than u32::MAX bytes; offsets are stored as u32");
        self.data.extend_from_slice(line);
        self.offsets.push(start);
    }

    /// Returns the bytes of line `i`.
    ///
    /// # Panics
    ///
    /// Panics if `i` is not the index of a pushed line.
    #[inline]
    pub(super) fn line(&self, i: usize) -> &[u8] {
        let start = self.offsets[i] as usize;
        // The last line has no successor, so it runs to the end of `data`.
        let end = self
            .offsets
            .get(i + 1)
            .map_or(self.data.len(), |&next| next as usize);
        &self.data[start..end]
    }
}
/// Pending edges, kept in one of two representations.
pub(super) enum EdgeBuffer {
    /// Edges as triples of owned strings.
    // NOTE(review): the meaning of each tuple field is set by the code that
    // fills the buffer, which is not visible here.
    Strings(Vec<(String, String, String)>),
    /// Edges as two `u32` values plus an interned key, held in storage that
    /// is either memory-mapped or an in-memory vector.
    Compact(MmapOrVec<(u32, u32, InternedKey)>),
}

impl EdgeBuffer {
    /// Number of edges currently buffered, whichever representation is in use.
    pub(super) fn len(&self) -> usize {
        match self {
            EdgeBuffer::Strings(edges) => edges.len(),
            EdgeBuffer::Compact(edges) => edges.len(),
        }
    }
}
/// Extracts the numeric part of a Q-code such as `Q42`.
///
/// Returns `None` when `qcode` does not start with `Q` or when the remainder
/// does not parse as a `u32`.
pub(super) fn parse_qcode_number(qcode: &str) -> Option<u32> {
    let digits = qcode.strip_prefix('Q')?;
    digits.parse().ok()
}
/// Collects everything gathered for a single entity.
pub(super) struct EntityAccumulator {
    /// Identifier the accumulator was created with.
    pub(super) id: String,
    /// Label, once one has been recorded.
    pub(super) label: Option<String>,
    /// Description, once one has been recorded.
    pub(super) description: Option<String>,
    /// Q-code of the entity's type, once one has been recorded.
    pub(super) type_qcode: Option<String>,
    /// Property values keyed by property name.
    pub(super) properties: HashMap<String, Value>,
    /// Outgoing edges as string pairs.
    // NOTE(review): presumably (property, target) — confirm against the code
    // that pushes into this vector.
    pub(super) outgoing_edges: Vec<(String, String)>,
}

impl EntityAccumulator {
    /// Creates an accumulator for `id` with no label, description or type,
    /// and with space pre-reserved for 32 properties and 8 outgoing edges.
    pub(super) fn new(id: String) -> Self {
        let properties = HashMap::with_capacity(32);
        let outgoing_edges = Vec::with_capacity(8);
        Self {
            id,
            label: None,
            description: None,
            type_qcode: None,
            properties,
            outgoing_edges,
        }
    }
}
// IRI prefixes and full IRIs recognised by the triple parser.

/// Namespace prefix of Wikidata entity IRIs; the local part follows it.
const WD_ENTITY: &str = "http://www.wikidata.org/entity/";
/// Namespace prefix of Wikidata direct-property IRIs.
const WD_PROP_DIRECT: &str = "http://www.wikidata.org/prop/direct/";
/// Full IRI of `rdfs:label`.
const RDFS_LABEL: &str = "http://www.w3.org/2000/01/rdf-schema#label";
/// Full IRI of `schema:description`.
const SCHEMA_DESC: &str = "http://schema.org/description";
/// Full IRI of `skos:altLabel`.
const SKOS_ALT: &str = "http://www.w3.org/2004/02/skos/core#altLabel";
/// Full IRI of `rdf:type`.
const RDF_TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
/// Parses one N-Triples style line of the form `<subject> <predicate> object .`.
///
/// Leading and trailing ASCII whitespace is ignored. The subject and the
/// predicate must each be an `<...>` IRI followed by exactly one space, and
/// the line must end with `.` (optionally preceded by ASCII whitespace).
///
/// Returns `None` for blank lines, `#` comments, lines that do not start
/// with `<`, and lines that do not match the shape above. Borrowed data in
/// the result points into `line`.
pub(super) fn parse_line(line: &str) -> Option<(Subject<'_>, Predicate<'_>, Object<'_>)> {
    // Only ASCII whitespace is stripped (not Unicode whitespace), matching
    // `u8::is_ascii_whitespace`.
    let line = line.trim_matches(|c: char| c.is_ascii_whitespace());

    // Rejects empty lines, `#` comments and anything else that is not an IRI.
    let rest = line.strip_prefix('<')?;

    // Subject IRI runs up to the first `>`. Every cut below lands right
    // next to an ASCII delimiter, so plain `str` slicing is always on a
    // character boundary and no `unsafe` is needed.
    let subj_len = memchr::memchr(b'>', rest.as_bytes())?;
    let subj_uri = &rest[..subj_len];
    let subject = match subj_uri.strip_prefix(WD_ENTITY) {
        Some(qcode) if qcode.starts_with('Q') => Subject::Entity(qcode),
        _ => Subject::Other,
    };

    // Exactly one space, then the predicate's opening `<`.
    let rest = rest[subj_len + 1..].strip_prefix(" <")?;
    let pred_len = memchr::memchr(b'>', rest.as_bytes())?;
    let pred_uri = &rest[..pred_len];
    let predicate = match pred_uri.strip_prefix(WD_PROP_DIRECT) {
        Some(pcode) if pcode.starts_with('P') => Predicate::WikidataDirect(pcode),
        Some(_) => Predicate::Other,
        None => match pred_uri {
            RDFS_LABEL => Predicate::Label,
            SCHEMA_DESC => Predicate::Description,
            SKOS_ALT => Predicate::AltLabel,
            RDF_TYPE => Predicate::Type,
            _ => Predicate::Other,
        },
    };

    // Exactly one space, then the object, then the terminating `.`.
    let rest = rest[pred_len + 1..].strip_prefix(' ')?;
    let obj_str = rest
        .strip_suffix('.')?
        .trim_end_matches(|c: char| c.is_ascii_whitespace());
    if obj_str.is_empty() {
        return None;
    }

    Some((subject, predicate, parse_object(obj_str)))
}
/// Classifies the object term of a triple.
///
/// * A term starting with `<` is an IRI: it becomes [`Object::Entity`] when,
///   after removing the surrounding angle brackets, it lies in the Wikidata
///   entity namespace and its local part starts with `Q`; otherwise
///   [`Object::Other`].
/// * A term starting with `"` is split by `extract_quoted_string` into its
///   unescaped text and the suffix after the closing quote. An `@` suffix
///   yields [`Object::LangLiteral`], a `^^<` suffix yields
///   [`Object::TypedLiteral`] (trailing `>` removed), and any other suffix,
///   including none, yields [`Object::Literal`]. If the split fails the
///   result is [`Object::Other`].
/// * Everything else is [`Object::Other`].
pub(super) fn parse_object<'a>(s: &'a str) -> Object<'a> {
    if s.starts_with('<') {
        let iri = s.trim_start_matches('<').trim_end_matches('>');
        return match iri.strip_prefix(WD_ENTITY) {
            Some(qcode) if qcode.starts_with('Q') => Object::Entity(qcode),
            _ => Object::Other,
        };
    }

    if !s.starts_with('"') {
        return Object::Other;
    }

    let (value, suffix) = match extract_quoted_string(s) {
        Some(parts) => parts,
        None => return Object::Other,
    };

    if let Some(lang) = suffix.strip_prefix('@') {
        Object::LangLiteral(value, lang)
    } else if let Some(datatype) = suffix.strip_prefix("^^<") {
        Object::TypedLiteral(value, datatype.trim_end_matches('>'))
    } else {
        // Covers both the empty suffix and any unrecognised one.
        Object::Literal(value)
    }
}
/// Splits a double-quoted literal into its unescaped text and whatever
/// follows the closing quote.
///
/// `s` must start with `"`. Recognised escapes are the N-Triples `ECHAR`
/// set (`\t \b \n \r \f \" \' \\`) plus `\uXXXX` and `\UXXXXXXXX`. An
/// unknown escape is kept verbatim (backslash included); a `\u`/`\U` escape
/// whose digits do not form a valid Unicode scalar value is dropped.
///
/// Returns `None` when `s` does not start with `"` or when the literal is
/// never closed (no unescaped `"` before the end of input, including the
/// case of a dangling trailing backslash).
pub(super) fn extract_quoted_string(s: &str) -> Option<(String, &str)> {
    let s = s.strip_prefix('"')?;
    let mut value = String::new();
    // `while let` rather than `for`: escapes pull extra characters from the
    // same iterator inside the loop body.
    let mut chars = s.char_indices();
    while let Some((idx, ch)) = chars.next() {
        match ch {
            // `"` is one byte long, so the suffix starts at `idx + 1`.
            '"' => return Some((value, &s[idx + 1..])),
            '\\' => {
                // A backslash with nothing after it means the literal is unterminated.
                let (_, escaped) = chars.next()?;
                match escaped {
                    'n' => value.push('\n'),
                    't' => value.push('\t'),
                    'r' => value.push('\r'),
                    'b' => value.push('\u{0008}'),
                    'f' => value.push('\u{000C}'),
                    '"' => value.push('"'),
                    '\'' => value.push('\''),
                    '\\' => value.push('\\'),
                    'u' => push_hex_escape(&mut chars, 4, &mut value),
                    'U' => push_hex_escape(&mut chars, 8, &mut value),
                    other => {
                        value.push('\\');
                        value.push(other);
                    }
                }
            }
            _ => value.push(ch),
        }
    }
    // Ran out of input without seeing the closing quote.
    None
}

/// Consumes up to `digits` characters from `chars`, reads them as a
/// hexadecimal code point and appends the resulting character to `out`.
/// Nothing is appended when the digits are not valid hex or do not name a
/// Unicode scalar value.
fn push_hex_escape(chars: &mut std::str::CharIndices<'_>, digits: usize, out: &mut String) {
    let hex: String = chars.by_ref().take(digits).map(|(_, c)| c).collect();
    if let Some(c) = u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32) {
        out.push(c);
    }
}
// XML Schema datatype IRIs recognised when converting typed literals.

/// IRI of `xsd:integer`.
pub(super) const XSD_INTEGER: &str = "http://www.w3.org/2001/XMLSchema#integer";
/// IRI of `xsd:decimal`.
pub(super) const XSD_DECIMAL: &str = "http://www.w3.org/2001/XMLSchema#decimal";
/// IRI of `xsd:double`.
pub(super) const XSD_DOUBLE: &str = "http://www.w3.org/2001/XMLSchema#double";
/// IRI of `xsd:float`.
pub(super) const XSD_FLOAT: &str = "http://www.w3.org/2001/XMLSchema#float";
/// IRI of `xsd:dateTime` (note: despite the constant's name this is the
/// `dateTime` datatype, not `xsd:date`).
pub(super) const XSD_DATE: &str = "http://www.w3.org/2001/XMLSchema#dateTime";
/// IRI of `xsd:boolean`.
pub(super) const XSD_BOOLEAN: &str = "http://www.w3.org/2001/XMLSchema#boolean";
/// Converts the lexical form of a typed literal into a [`Value`].
///
/// * `xsd:integer` becomes `Value::Int64`.
/// * `xsd:decimal` becomes `Value::Int64` when the text (leading `+` signs
///   removed) is a whole number that fits, otherwise `Value::Float64`.
/// * `xsd:double` / `xsd:float` become `Value::Float64`.
/// * `xsd:boolean` becomes `Value::Boolean` for `true`/`1`/`false`/`0`.
/// * `xsd:dateTime` and every other datatype are kept as `Value::String`.
///
/// Whenever the text cannot be parsed as the requested type, the original
/// text is returned unchanged as `Value::String`.
pub(super) fn typed_literal_to_value(text: &str, type_uri: &str) -> Value {
    // Built lazily so the fallback string is only allocated when it is
    // actually needed, not on every successful numeric parse.
    let as_string = || Value::String(text.to_string());

    match type_uri {
        XSD_INTEGER => match text.parse::<i64>() {
            Ok(n) => Value::Int64(n),
            Err(_) => as_string(),
        },
        XSD_DECIMAL => {
            let cleaned = text.trim_start_matches('+');
            if let Ok(i) = cleaned.parse::<i64>() {
                Value::Int64(i)
            } else if let Ok(f) = cleaned.parse::<f64>() {
                Value::Float64(f)
            } else {
                as_string()
            }
        }
        XSD_DOUBLE | XSD_FLOAT => match text.parse::<f64>() {
            Ok(f) => Value::Float64(f),
            Err(_) => as_string(),
        },
        XSD_BOOLEAN => match text {
            "true" | "1" => Value::Boolean(true),
            "false" | "0" => Value::Boolean(false),
            _ => as_string(),
        },
        // Timestamps are deliberately kept in their textual form.
        XSD_DATE => as_string(),
        _ => as_string(),
    }
}
/// Tells whether `lang` passes the optional language filter.
///
/// With no filter (`None`) every language is accepted; otherwise `lang` must
/// be an exact member of the set.
pub(super) fn language_matches(lang: &str, filter: &Option<HashSet<String>>) -> bool {
    filter
        .as_ref()
        .map_or(true, |allowed| allowed.contains(lang))
}
/// Returns the text of a literal object, honouring the language filter.
///
/// * A language-tagged literal yields its text only when its tag passes
///   `language_matches` for `languages`.
/// * A plain literal always yields its text.
/// * Any other object yields `None`.
pub(super) fn extract_lang_text(
    object: &Object<'_>,
    languages: &Option<HashSet<String>>,
) -> Option<String> {
    match object {
        Object::LangLiteral(text, lang) if language_matches(lang, languages) => {
            Some(text.clone())
        }
        // Tagged, but the tag is filtered out.
        Object::LangLiteral(..) => None,
        Object::Literal(text) => Some(text.clone()),
        _ => None,
    }
}