pub mod dictionary;
pub mod format;
pub mod triples;
#[cfg(test)]
mod tests;
pub use dictionary::{parse_plain_dictionary, DictionarySection, HdtDictionary};
pub use format::{compute_crc16, compute_crc32, read_vbyte, read_vbyte_slice, write_vbyte};
pub use triples::HdtTriplesSection;
use std::fs::File;
use std::io::{BufReader, Read};
use std::path::Path;
/// Errors reported while opening or decoding HDT data.
#[derive(Debug, thiserror::Error)]
pub enum HdtError {
    /// An underlying I/O failure; converted automatically from
    /// [`std::io::Error`] by the `?` operator.
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// The input did not start with the expected magic bytes.
    /// `got` carries the leading bytes that were actually found.
    #[error("invalid HDT magic bytes: found {:?}", got)]
    InvalidMagic { got: Vec<u8> },

    /// A section (header or dictionary part) was truncated or otherwise
    /// malformed. `name` carries a human-readable description of the problem.
    #[error("invalid HDT section: {name}")]
    InvalidSection { name: String },

    /// A numeric id could not be resolved to a term by the dictionary.
    #[error("dictionary decode error for id {id}")]
    DictionaryDecodeError { id: u64 },

    /// A failure while decoding triples; `msg` describes it.
    // NOTE(review): not constructed in this file — presumably raised by the
    // `triples` submodule; confirm there.
    #[error("triple decode error: {msg}")]
    TripleDecodeError { msg: String },

    /// The data declares a version this reader does not handle.
    // NOTE(review): not constructed in this file — confirm where it is raised.
    #[error("unsupported HDT version: {version}")]
    UnsupportedVersion { version: u8 },
}
/// Metadata declared in the textual header section of an HDT file.
///
/// The numeric fields hold whatever the header states; they are not
/// recomputed from the dictionary or triples sections.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HdtHeader {
    /// Value of the `triples` header key.
    pub triples_count: u64,

    /// Value of the `subjects` header key.
    pub subjects_count: u64,

    /// Value of the `predicates` header key.
    pub predicates_count: u64,

    /// Value of the `objects` header key.
    pub objects_count: u64,

    /// Value of the `shared` header key.
    pub shared_count: u64,

    /// Value of the `format` header key.
    pub format: String,

    /// Value of the `baseURI` / `base_uri` header key, when one was given.
    pub base_uri: Option<String>,
}
impl Default for HdtHeader {
fn default() -> Self {
HdtHeader {
triples_count: 0,
subjects_count: 0,
predicates_count: 0,
objects_count: 0,
shared_count: 0,
format: "hdt/plain".to_owned(),
base_uri: None,
}
}
}
/// Summary counters for an HDT dataset, as returned by `HdtReader::stats`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HdtStats {
    /// Number of triples.
    pub triple_count: u64,

    /// Number of distinct subjects.
    pub distinct_subjects: u64,

    /// Number of distinct predicates.
    pub distinct_predicates: u64,

    /// Number of distinct objects.
    pub distinct_objects: u64,

    /// Number of shared subject/object terms.
    pub shared_so_count: u64,
}
/// One triple whose three terms are held as owned strings.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HdtTriple {
    /// The subject term.
    pub subject: String,

    /// The predicate term.
    pub predicate: String,

    /// The object term.
    pub object: String,
}
/// A fully parsed HDT dataset: the header plus the dictionary and triples
/// sections.
#[derive(Debug)]
pub struct HdtReader {
    /// Metadata read from the header section.
    header: HdtHeader,

    /// Dictionary used to turn numeric ids into term strings and back.
    dictionary: HdtDictionary,

    /// The triples section, which yields triples as numeric ids.
    triples_section: HdtTriplesSection,
}
/// Byte sequence every input must start with: the ASCII text `$HDT`
/// followed by a single `0x01` byte. Parsing rejects anything else with
/// `HdtError::InvalidMagic`.
const HDT_MAGIC_SIMPLE: &[u8] = b"$HDT\x01";
impl HdtReader {
    /// Reads the file at `path` completely into memory and parses it.
    ///
    /// # Errors
    ///
    /// Returns [`HdtError::Io`] when the file cannot be opened or read, and
    /// any parsing error otherwise.
    pub fn open(path: &Path) -> Result<Self, HdtError> {
        let mut data = Vec::new();
        BufReader::new(File::open(path)?).read_to_end(&mut data)?;
        Self::parse_bytes(&data)
    }

    /// Parses an HDT image that is already in memory.
    ///
    /// # Errors
    ///
    /// Returns [`HdtError::InvalidMagic`] when the magic bytes are missing and
    /// [`HdtError::InvalidSection`] when the header or a dictionary section is
    /// truncated or malformed; errors from the dictionary and triples parsers
    /// are passed through.
    pub fn from_bytes(data: Vec<u8>) -> Result<Self, HdtError> {
        Self::parse_bytes(&data)
    }

    /// Returns the parsed header.
    pub fn header(&self) -> &HdtHeader {
        &self.header
    }

    /// Returns the triple count declared in the header (not a count of the
    /// triples actually stored in the triples section).
    pub fn triple_count(&self) -> u64 {
        self.header.triples_count
    }

    /// Returns the counters declared in the header as an [`HdtStats`].
    pub fn stats(&self) -> HdtStats {
        let declared = &self.header;
        HdtStats {
            triple_count: declared.triples_count,
            distinct_subjects: declared.subjects_count,
            distinct_predicates: declared.predicates_count,
            distinct_objects: declared.objects_count,
            shared_so_count: declared.shared_count,
        }
    }

    /// Iterates over all triples, resolving each id through the dictionary.
    ///
    /// Every item is a `Result`: an id the dictionary cannot resolve yields
    /// [`HdtError::DictionaryDecodeError`] for that item only.
    pub fn triples(&self) -> impl Iterator<Item = Result<HdtTriple, HdtError>> + '_ {
        self.triples_section
            .iter_ids()
            .map(move |(s_id, p_id, o_id)| self.resolve_ids(s_id, p_id, o_id))
    }

    /// Same as [`HdtReader::triples`], but yields
    /// `(subject, predicate, object)` tuples.
    pub fn iter_triples(&self) -> impl Iterator<Item = Result<(String, String, String), HdtError>> + '_ {
        self.triples()
            .map(|item| item.map(|t| (t.subject, t.predicate, t.object)))
    }

    /// Resolves a subject id to its term.
    ///
    /// # Errors
    ///
    /// Returns [`HdtError::DictionaryDecodeError`] when the dictionary has no
    /// subject for `id`.
    pub fn lookup_subject(&self, id: u32) -> Result<&str, HdtError> {
        self.dictionary
            .lookup_subject(id)
            .ok_or(HdtError::DictionaryDecodeError { id: u64::from(id) })
    }

    /// Resolves a predicate id to its term.
    ///
    /// # Errors
    ///
    /// Returns [`HdtError::DictionaryDecodeError`] when the dictionary has no
    /// predicate for `id`.
    pub fn lookup_predicate(&self, id: u32) -> Result<&str, HdtError> {
        self.dictionary
            .lookup_predicate(id)
            .ok_or(HdtError::DictionaryDecodeError { id: u64::from(id) })
    }

    /// Resolves an object id to its term.
    ///
    /// # Errors
    ///
    /// Returns [`HdtError::DictionaryDecodeError`] when the dictionary has no
    /// object for `id`.
    pub fn lookup_object(&self, id: u32) -> Result<&str, HdtError> {
        self.dictionary
            .lookup_object(id)
            .ok_or(HdtError::DictionaryDecodeError { id: u64::from(id) })
    }

    /// Collects every triple whose subject is `subject`.
    ///
    /// Returns an empty vector when the dictionary does not know `subject`.
    /// Otherwise the ids yielded by the triples section are filtered on the
    /// subject id and the matches are decoded.
    ///
    /// # Errors
    ///
    /// Returns [`HdtError::DictionaryDecodeError`] when a matching triple
    /// references an id the dictionary cannot resolve.
    pub fn lookup_subject_str(&self, subject: &str) -> Result<Vec<(String, String, String)>, HdtError> {
        let wanted = match self.dictionary.subject_to_id(subject) {
            Some(id) => id,
            None => return Ok(Vec::new()),
        };
        self.triples_section
            .iter_ids()
            .filter(|(s, _, _)| *s == wanted)
            .map(|(s, p, o)| {
                self.resolve_ids(s, p, o)
                    .map(|t| (t.subject, t.predicate, t.object))
            })
            .collect()
    }

    /// Turns one `(subject, predicate, object)` id triple into owned strings.
    /// The lookups run in that order, so the first unresolved id is reported.
    fn resolve_ids(&self, s_id: u32, p_id: u32, o_id: u32) -> Result<HdtTriple, HdtError> {
        Ok(HdtTriple {
            subject: self.lookup_subject(s_id)?.to_owned(),
            predicate: self.lookup_predicate(p_id)?.to_owned(),
            object: self.lookup_object(o_id)?.to_owned(),
        })
    }

    /// Parses a complete image: magic bytes, header, the four dictionary
    /// sections, then the triples section (all remaining bytes).
    fn parse_bytes(data: &[u8]) -> Result<Self, HdtError> {
        if !data.starts_with(HDT_MAGIC_SIMPLE) {
            // Report at most as many bytes as the magic is long.
            let shown = HDT_MAGIC_SIMPLE.len().min(data.len());
            return Err(HdtError::InvalidMagic {
                got: data[..shown].to_vec(),
            });
        }

        let mut offset = HDT_MAGIC_SIMPLE.len();
        let header = Self::parse_header(data, &mut offset)?;
        let dictionary = Self::parse_dictionary(data, &mut offset)?;
        let triples_section = Self::parse_triples(data, &mut offset)?;

        Ok(Self {
            header,
            dictionary,
            triples_section,
        })
    }

    /// Parses the header section starting at `*offset` and advances `*offset`
    /// past it.
    ///
    /// The section is an 8-byte little-endian length followed by that many
    /// bytes of UTF-8 text holding `key=value` lines. Blank lines, lines
    /// starting with `#`, lines without `=`, unknown keys and numeric values
    /// that do not parse as `u64` are ignored.
    ///
    /// # Errors
    ///
    /// Returns [`HdtError::InvalidSection`] when the length field or the
    /// declared header body does not fit in `data`, or when the body is not
    /// valid UTF-8.
    fn parse_header(data: &[u8], offset: &mut usize) -> Result<HdtHeader, HdtError> {
        const SIZE_FIELD_LEN: usize = 8;

        if data.len().saturating_sub(*offset) < SIZE_FIELD_LEN {
            return Err(HdtError::InvalidSection {
                name: "truncated before header size field".to_owned(),
            });
        }
        let mut size_field = [0u8; SIZE_FIELD_LEN];
        size_field.copy_from_slice(&data[*offset..*offset + SIZE_FIELD_LEN]);
        let declared = u64::from_le_bytes(size_field);
        *offset += SIZE_FIELD_LEN;

        // The declared size is untrusted: validate it against the remaining
        // bytes before doing any arithmetic with it, so that huge values can
        // neither overflow `usize` nor produce an out-of-range slice.
        let available = data.len() - *offset;
        let hdr_size = match usize::try_from(declared) {
            Ok(size) if size <= available => size,
            _ => {
                return Err(HdtError::InvalidSection {
                    name: format!(
                        "header: truncated (need {} bytes, have {})",
                        declared, available
                    ),
                })
            }
        };

        let hdr_end = *offset + hdr_size;
        let hdr_bytes = &data[*offset..hdr_end];
        *offset = hdr_end;

        let text = std::str::from_utf8(hdr_bytes).map_err(|e| HdtError::InvalidSection {
            name: format!("header UTF-8: {}", e),
        })?;

        let mut header = HdtHeader::default();
        for line in text.lines().map(str::trim) {
            if line.is_empty() || line.starts_with('#') {
                continue;
            }
            let (key, value) = match line.split_once('=') {
                Some((k, v)) => (k.trim(), v.trim()),
                None => continue,
            };
            // Text keys are stored directly; numeric keys pick the counter to
            // update so the parse happens in a single place below.
            let counter = match key {
                "triples" => &mut header.triples_count,
                "subjects" => &mut header.subjects_count,
                "predicates" => &mut header.predicates_count,
                "objects" => &mut header.objects_count,
                "shared" => &mut header.shared_count,
                "format" => {
                    header.format = value.to_owned();
                    continue;
                }
                "baseURI" | "base_uri" => {
                    header.base_uri = Some(value.to_owned());
                    continue;
                }
                _ => continue,
            };
            // Best effort: an unparsable number leaves the default in place.
            if let Ok(parsed) = value.parse::<u64>() {
                *counter = parsed;
            }
        }
        Ok(header)
    }

    /// Reads the four dictionary sections in file order — shared, subjects,
    /// predicates, objects — advancing `*offset` past each one.
    fn parse_dictionary(data: &[u8], offset: &mut usize) -> Result<HdtDictionary, HdtError> {
        let mut dict = HdtDictionary::new();
        dict.shared = Self::read_dict_section(data, offset, "shared")?;
        dict.subjects = Self::read_dict_section(data, offset, "subjects")?;
        dict.predicates = Self::read_dict_section(data, offset, "predicates")?;
        dict.objects = Self::read_dict_section(data, offset, "objects")?;
        Ok(dict)
    }

    /// Reads one dictionary section: a 4-byte little-endian length followed by
    /// that many bytes, which are handed to [`parse_plain_dictionary`].
    ///
    /// `name` is only used to label error messages. `*offset` is advanced past
    /// the section on success.
    ///
    /// # Errors
    ///
    /// Returns [`HdtError::InvalidSection`] when the length field or the
    /// section body does not fit in `data`; errors from
    /// [`parse_plain_dictionary`] are passed through.
    fn read_dict_section(
        data: &[u8],
        offset: &mut usize,
        name: &str,
    ) -> Result<Vec<String>, HdtError> {
        const LEN_FIELD_LEN: usize = 4;

        if data.len().saturating_sub(*offset) < LEN_FIELD_LEN {
            return Err(HdtError::InvalidSection {
                name: format!("{} section: truncated before size", name),
            });
        }
        let mut len_field = [0u8; LEN_FIELD_LEN];
        len_field.copy_from_slice(&data[*offset..*offset + LEN_FIELD_LEN]);
        let declared = u32::from_le_bytes(len_field);
        *offset += LEN_FIELD_LEN;

        // Compare against the remaining byte count instead of computing
        // `*offset + len`, which could overflow on 32-bit targets.
        let available = data.len() - *offset;
        let sec_len = match usize::try_from(declared) {
            Ok(len) if len <= available => len,
            _ => {
                return Err(HdtError::InvalidSection {
                    name: format!(
                        "{} section: truncated (need {} bytes, have {})",
                        name, declared, available
                    ),
                })
            }
        };

        let sec_data = &data[*offset..*offset + sec_len];
        *offset += sec_len;
        parse_plain_dictionary(sec_data)
    }

    /// Hands every byte from `*offset` to the end of `data` to
    /// [`HdtTriplesSection::parse`] and moves `*offset` to the end.
    fn parse_triples(data: &[u8], offset: &mut usize) -> Result<HdtTriplesSection, HdtError> {
        let triples_data = &data[*offset..];
        *offset = data.len();
        HdtTriplesSection::parse(triples_data)
    }
}