#![allow(dead_code)]
use std::io::{self, Write};
use fst::automaton::{Automaton, Str};
use fst::{IntoStreamer, Map, MapBuilder, Streamer};
use serde::{Deserialize, Serialize};
pub const FORMAT_VERSION: u32 = 1;
const MAGIC: &[u8; 8] = b"RETROSAU";
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct WordRec {
pub lemma: String,
pub senses: Vec<u32>,
}
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct SenseRec {
pub word: u32,
pub synset: u32,
pub rels: Vec<Rel>,
}
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct SynsetRec {
pub pos: u8,
pub definition: String,
pub examples: Vec<String>,
pub members: Vec<u32>,
pub rels: Vec<Rel>,
}
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct Rel {
pub kind: u8,
pub target: u32,
}
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct Tables {
pub words: Vec<WordRec>,
pub senses: Vec<SenseRec>,
pub synsets: Vec<SynsetRec>,
pub groups: Vec<Vec<u32>>,
}
pub fn pos_tag(code: &str) -> Option<u8> {
match code {
"n" => Some(0),
"v" => Some(1),
"a" | "s" => Some(2),
"r" => Some(3),
_ => None,
}
}
pub const REL_ANTONYM: u8 = 0;
pub const REL_HYPERNYM: u8 = 1;
pub const REL_HYPONYM: u8 = 2;
pub const REL_HOLONYM: u8 = 3;
pub const REL_MERONYM: u8 = 4;
pub const REL_SIMILAR: u8 = 5;
pub const REL_ALSO: u8 = 6;
pub const REL_DERIVATION: u8 = 7;
pub const REL_PERTAINYM: u8 = 8;
pub fn rel_tag(rel_type: &str) -> Option<u8> {
match rel_type {
"antonym" => Some(REL_ANTONYM),
"hypernym" | "instance_hypernym" => Some(REL_HYPERNYM),
"hyponym" | "instance_hyponym" => Some(REL_HYPONYM),
"holonym" | "holo_part" | "holo_member" | "holo_substance" => Some(REL_HOLONYM),
"meronym" | "mero_part" | "mero_member" | "mero_substance" => Some(REL_MERONYM),
"similar" => Some(REL_SIMILAR),
"also" => Some(REL_ALSO),
"derivation" => Some(REL_DERIVATION),
"pertainym" => Some(REL_PERTAINYM),
_ => None,
}
}
pub fn rel_is_antonym(tag: u8) -> bool {
tag == REL_ANTONYM
}
pub fn rel_label(tag: u8) -> &'static str {
match tag {
REL_ANTONYM => "antonyms",
REL_HYPERNYM => "more general",
REL_HYPONYM => "more specific",
REL_HOLONYM => "part of",
REL_MERONYM => "has parts",
REL_SIMILAR => "similar to",
REL_ALSO => "see also",
REL_DERIVATION => "related forms",
REL_PERTAINYM => "pertains to",
_ => "related",
}
}
pub fn rel_order(tag: u8) -> u8 {
match tag {
REL_SIMILAR => 0,
REL_HYPERNYM => 1,
REL_HYPONYM => 2,
REL_HOLONYM => 3,
REL_MERONYM => 4,
REL_DERIVATION => 5,
REL_PERTAINYM => 6,
REL_ALSO => 7,
_ => 8,
}
}
pub fn write_index(
fst_entries: &[(String, u64)],
tables: &Tables,
source_sha: [u8; 32],
) -> io::Result<Vec<u8>> {
let mut builder = MapBuilder::memory();
for (key, val) in fst_entries {
builder
.insert(key.as_bytes(), *val)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
}
let fst_bytes = builder.into_inner().map_err(io::Error::other)?;
let table_bytes =
postcard::to_allocvec(tables).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
let mut out = Vec::with_capacity(fst_bytes.len() + table_bytes.len() + 64);
out.write_all(MAGIC)?;
out.write_all(&FORMAT_VERSION.to_le_bytes())?;
out.write_all(&source_sha)?;
out.write_all(&(fst_bytes.len() as u64).to_le_bytes())?;
out.write_all(&(table_bytes.len() as u64).to_le_bytes())?;
out.write_all(&fst_bytes)?;
out.write_all(&table_bytes)?;
Ok(out)
}
pub fn peek_header(bytes: &[u8]) -> Option<(u32, [u8; 32])> {
if bytes.len() < 8 + 4 + 32 || &bytes[..8] != MAGIC {
return None;
}
let version = u32::from_le_bytes(bytes[8..12].try_into().ok()?);
let mut sha = [0u8; 32];
sha.copy_from_slice(&bytes[12..44]);
Some((version, sha))
}
#[derive(Debug)]
pub enum ReadError {
BadMagic,
Version { found: u32, expected: u32 },
Truncated,
Fst(fst::Error),
Tables(postcard::Error),
}
impl std::fmt::Display for ReadError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ReadError::BadMagic => write!(f, "not a RetroSaurus index (bad magic)"),
ReadError::Version { found, expected } => {
write!(f, "index format version {found}, expected {expected}")
}
ReadError::Truncated => write!(f, "index is truncated"),
ReadError::Fst(e) => write!(f, "fst error: {e}"),
ReadError::Tables(e) => write!(f, "tables decode error: {e}"),
}
}
}
impl std::error::Error for ReadError {}
pub struct Index {
fst: Map<Vec<u8>>,
tables: Tables,
}
impl Index {
pub fn read(bytes: &[u8]) -> Result<Index, ReadError> {
if bytes.len() < 8 + 4 + 32 + 8 + 8 {
return Err(ReadError::Truncated);
}
if &bytes[..8] != MAGIC {
return Err(ReadError::BadMagic);
}
let version = u32::from_le_bytes(bytes[8..12].try_into().unwrap());
if version != FORMAT_VERSION {
return Err(ReadError::Version {
found: version,
expected: FORMAT_VERSION,
});
}
let fst_len = u64::from_le_bytes(bytes[44..52].try_into().unwrap()) as usize;
let tables_len = u64::from_le_bytes(bytes[52..60].try_into().unwrap()) as usize;
let fst_start = 60;
let tables_start = fst_start + fst_len;
let end = tables_start + tables_len;
if bytes.len() < end {
return Err(ReadError::Truncated);
}
let fst = Map::new(bytes[fst_start..tables_start].to_vec()).map_err(ReadError::Fst)?;
let tables = postcard::from_bytes(&bytes[tables_start..end]).map_err(ReadError::Tables)?;
Ok(Index { fst, tables })
}
pub fn tables(&self) -> &Tables {
&self.tables
}
pub fn group_of(&self, lower_key: &str) -> Option<u32> {
self.fst.get(lower_key.as_bytes()).map(|v| v as u32)
}
pub fn prefix(&self, lower_prefix: &str, limit: usize) -> Vec<u32> {
let mut out = Vec::new();
if limit == 0 {
return out;
}
let automaton = Str::new(lower_prefix).starts_with();
let mut stream = self.fst.search(automaton).into_stream();
'outer: while let Some((_key, group_id)) = stream.next() {
if let Some(members) = self.tables.groups.get(group_id as usize) {
for &word in members {
out.push(word);
if out.len() >= limit {
break 'outer;
}
}
}
}
out
}
}
#[cfg(test)]
mod tests {
use super::*;
fn tiny() -> (Vec<(String, u64)>, Tables) {
let tables = Tables {
words: vec![
WordRec {
lemma: "abandon".into(),
senses: vec![0],
},
WordRec {
lemma: "able".into(),
senses: vec![1],
},
],
senses: vec![
SenseRec {
word: 0,
synset: 0,
rels: vec![Rel {
kind: REL_ANTONYM,
target: 1,
}],
},
SenseRec {
word: 1,
synset: 1,
rels: vec![],
},
],
synsets: vec![
SynsetRec {
pos: 1,
definition: "give up".into(),
examples: vec!["abandon ship".into()],
members: vec![0],
rels: vec![Rel {
kind: REL_HYPERNYM,
target: 1,
}],
},
SynsetRec {
pos: 2,
definition: "having the power".into(),
examples: vec![],
members: vec![1],
rels: vec![],
},
],
groups: vec![vec![0], vec![1]],
};
let entries = vec![("abandon".to_string(), 0u64), ("able".to_string(), 1u64)];
(entries, tables)
}
#[test]
fn round_trips() {
let (entries, tables) = tiny();
let sha = [7u8; 32];
let bytes = write_index(&entries, &tables, sha).unwrap();
assert_eq!(peek_header(&bytes), Some((FORMAT_VERSION, sha)));
let index = Index::read(&bytes).unwrap();
assert_eq!(index.tables(), &tables);
assert_eq!(index.group_of("abandon"), Some(0));
assert_eq!(index.group_of("able"), Some(1));
assert_eq!(index.group_of("zebra"), None);
assert_eq!(index.prefix("aba", 10), vec![0]);
assert_eq!(index.prefix("ab", 10), vec![0, 1]); assert_eq!(index.prefix("a", 10), vec![0, 1]);
assert_eq!(index.prefix("a", 1), vec![0]);
assert_eq!(index.prefix("z", 10), Vec::<u32>::new());
}
#[test]
fn rejects_bad_blobs() {
assert!(matches!(
Index::read(b"too short"),
Err(ReadError::Truncated)
));
let mut bytes = write_index(&[], &Tables::default(), [0u8; 32]).unwrap();
bytes[0] = b'X';
assert!(matches!(Index::read(&bytes), Err(ReadError::BadMagic)));
}
}