use super::HdtError;
use super::format::read_vbyte_slice;
/// One section of a dictionary: an ordered list of terms addressed by 1-based IDs.
#[derive(Debug, Clone, Default)]
pub struct DictionarySection {
    /// Decoded terms; the term with ID `n` is stored at index `n - 1`.
    pub terms: Vec<String>,
}

impl DictionarySection {
    /// Creates an empty section.
    pub fn new() -> Self {
        Self::default()
    }

    /// Decodes a section stored as NUL-separated UTF-8 strings.
    ///
    /// Empty segments (for example the one following a trailing NUL) are skipped,
    /// so empty input yields an empty section.
    ///
    /// # Errors
    ///
    /// Returns `HdtError::DictionaryDecodeError` when a segment is not valid UTF-8;
    /// `id` is the zero-based index the failing term would have had.
    pub fn from_plain(data: &[u8]) -> Result<Self, HdtError> {
        let mut terms = Vec::new();
        for segment in data.split(|&b| b == 0).filter(|s| !s.is_empty()) {
            // Report which term failed, consistent with `from_front_coded`.
            let id = terms.len() as u64;
            let term = std::str::from_utf8(segment)
                .map_err(|_| HdtError::DictionaryDecodeError { id })?;
            terms.push(term.to_owned());
        }
        Ok(Self { terms })
    }

    /// Decodes a front-coded section.
    ///
    /// Every entry whose index is a multiple of `k` is stored in full as a
    /// NUL-terminated string. Every other entry is stored as a variable-length
    /// integer (read with `read_vbyte_slice`) giving the number of leading bytes
    /// it shares with the previous term, followed by its NUL-terminated suffix.
    /// A `k` of `0` is treated as `1`, i.e. every entry is stored in full.
    ///
    /// The shared length is a byte count, so it may fall inside a multi-byte
    /// UTF-8 character. Prefix and suffix are therefore joined as raw bytes and
    /// validated as UTF-8 only once the whole term has been reassembled.
    ///
    /// # Errors
    ///
    /// Returns `HdtError::DictionaryDecodeError` (with `id` set to the zero-based
    /// index of the failing entry) when a NUL terminator is missing, the shared
    /// length exceeds the previous term, or the reassembled term is not valid
    /// UTF-8. Errors from `read_vbyte_slice` are propagated unchanged.
    pub fn from_front_coded(data: &[u8], k: usize) -> Result<Self, HdtError> {
        let k = k.max(1);
        let mut terms: Vec<String> = Vec::new();
        let mut pos = 0usize;

        while pos < data.len() {
            let entry_idx = terms.len();
            let id = entry_idx as u64;
            let decode_err = move || HdtError::DictionaryDecodeError { id };

            // Entries that start a block carry no prefix length.
            let shared_len = if entry_idx % k == 0 {
                0
            } else {
                let (shared_len, consumed) = read_vbyte_slice(&data[pos..])?;
                pos += consumed;
                // Checked conversion: a truncated length could silently select
                // the wrong prefix instead of being rejected.
                usize::try_from(shared_len).map_err(|_| decode_err())?
            };

            // `get` rather than indexing: never panic on malformed input, even
            // if the reported `consumed` runs past the end of `data`.
            let rest = data.get(pos..).ok_or_else(decode_err)?;
            let suffix_len = rest
                .iter()
                .position(|&b| b == 0)
                .ok_or_else(decode_err)?;
            let suffix = &rest[..suffix_len];
            pos += suffix_len + 1;

            let prev_bytes: &[u8] = match terms.last() {
                Some(prev) => prev.as_bytes(),
                None => &[],
            };
            let prefix = prev_bytes.get(..shared_len).ok_or_else(decode_err)?;

            let mut bytes = Vec::with_capacity(prefix.len() + suffix.len());
            bytes.extend_from_slice(prefix);
            bytes.extend_from_slice(suffix);
            let term = String::from_utf8(bytes).map_err(|_| decode_err())?;
            terms.push(term);
        }

        Ok(Self { terms })
    }

    /// Returns the term with the given 1-based `id`, or `None` when `id` is `0`
    /// or past the end of the section.
    pub fn id_to_term(&self, id: usize) -> Option<&str> {
        let idx = id.checked_sub(1)?;
        self.terms.get(idx).map(String::as_str)
    }

    /// Returns the 1-based ID of `term`, or `None` when it is not found.
    ///
    /// Uses binary search, so `terms` must be sorted in ascending byte order for
    /// the result to be meaningful.
    pub fn term_to_id(&self, term: &str) -> Option<usize> {
        self.terms
            .binary_search_by(|candidate| candidate.as_str().cmp(term))
            .ok()
            .map(|idx| idx + 1)
    }
}
/// A dictionary split into four term lists, with 1-based IDs per role.
///
/// Subject and object IDs share a common range: IDs `1..=shared.len()` refer to
/// `shared`, and larger IDs continue into `subjects` or `objects` respectively.
/// Predicate IDs index `predicates` on their own.
#[derive(Debug, Clone, Default)]
pub struct HdtDictionary {
    /// Terms addressable both as subjects and as objects.
    pub shared: Vec<String>,
    /// Subject terms; their IDs follow those of `shared`.
    pub subjects: Vec<String>,
    /// Predicate terms.
    pub predicates: Vec<String>,
    /// Object terms; their IDs follow those of `shared`.
    pub objects: Vec<String>,
}

impl HdtDictionary {
    /// Creates a dictionary with all four lists empty.
    pub fn new() -> Self {
        Self::default()
    }

    /// Resolves a 1-based ID against `shared` first, then against `own`.
    fn resolve<'a>(shared: &'a [String], own: &'a [String], id: u32) -> Option<&'a str> {
        // ID 0 is never valid.
        let zero_based = (id as usize).checked_sub(1)?;
        let hit = match zero_based.checked_sub(shared.len()) {
            // Subtraction underflowed: the ID falls inside the shared range.
            None => shared.get(zero_based),
            Some(offset) => own.get(offset),
        };
        hit.map(String::as_str)
    }

    /// Binary-searches `shared` and then `own` for `term`, returning its 1-based ID.
    fn find(shared: &[String], own: &[String], term: &str) -> Option<u32> {
        let position =
            |list: &[String]| list.binary_search_by(|t| t.as_str().cmp(term)).ok();
        position(shared)
            .map(|idx| idx + 1)
            .or_else(|| position(own).map(|idx| shared.len() + idx + 1))
            .map(|id| id as u32)
    }

    /// Returns the subject term for `id`, or `None` if `id` is `0` or out of range.
    pub fn lookup_subject(&self, id: u32) -> Option<&str> {
        Self::resolve(&self.shared, &self.subjects, id)
    }

    /// Returns the predicate term for `id`, or `None` if `id` is `0` or out of range.
    pub fn lookup_predicate(&self, id: u32) -> Option<&str> {
        let idx = (id as usize).checked_sub(1)?;
        self.predicates.get(idx).map(String::as_str)
    }

    /// Returns the object term for `id`, or `None` if `id` is `0` or out of range.
    pub fn lookup_object(&self, id: u32) -> Option<&str> {
        Self::resolve(&self.shared, &self.objects, id)
    }

    /// Returns the subject ID of `s`, searching `shared` before `subjects`.
    ///
    /// Relies on binary search, so both lists are expected to be sorted.
    pub fn subject_to_id(&self, s: &str) -> Option<u32> {
        Self::find(&self.shared, &self.subjects, s)
    }

    /// Returns the predicate ID of `p`.
    ///
    /// Relies on binary search, so `predicates` is expected to be sorted.
    pub fn predicate_to_id(&self, p: &str) -> Option<u32> {
        let idx = self
            .predicates
            .binary_search_by(|t| t.as_str().cmp(p))
            .ok()?;
        Some((idx + 1) as u32)
    }

    /// Returns the object ID of `o`, searching `shared` before `objects`.
    ///
    /// Relies on binary search, so both lists are expected to be sorted.
    pub fn object_to_id(&self, o: &str) -> Option<u32> {
        Self::find(&self.shared, &self.objects, o)
    }

    /// Number of valid subject IDs (`shared` plus `subjects`).
    pub fn subject_count(&self) -> u32 {
        let total = self.shared.len() + self.subjects.len();
        total as u32
    }

    /// Number of valid predicate IDs.
    pub fn predicate_count(&self) -> u32 {
        self.predicates.len() as u32
    }

    /// Number of valid object IDs (`shared` plus `objects`).
    pub fn object_count(&self) -> u32 {
        let total = self.shared.len() + self.objects.len();
        total as u32
    }
}
/// Decodes `data` with `DictionarySection::from_plain` and returns just the
/// resulting list of terms.
///
/// # Errors
///
/// Propagates any error returned by `DictionarySection::from_plain`.
pub fn parse_plain_dictionary(data: &[u8]) -> Result<Vec<String>, HdtError> {
    let section = DictionarySection::from_plain(data)?;
    Ok(section.terms)
}