wordnet_types/
lib.rs

1//! Shared, zero-copy types that mirror WordNet's dictionary format.
2//!
3//! The goal is to expose the exact fields found in `data.*`/`index.*` while
4//! making it cheap to build higher-level tooling. Text fields borrow from a
5//! backing buffer (`&str`); numeric fields keep their raw representation
6//! (`offset`, `lex_id`, `ss_type`, pointer source/target indices).
7//!
8//! Use [`Pos`] and [`SynsetId`] to key into a database, [`Synset`] and
9//! [`IndexEntry`] to inspect parsed records, and helpers like [`decode_st`] to
10//! interpret pointer source/target pairs.
11//!
12//! ```rust
13//! use wordnet_types::{Pos, SynsetId, decode_st};
14//!
15//! let pos = Pos::from_char('n').unwrap();
16//! let id = SynsetId { pos, offset: 1740 };
17//! assert_eq!(decode_st("0a0b"), (Some(10), Some(11)));
18//! ```
19
20use std::fmt;
21
/// Part of speech as encoded in WordNet files.
///
/// The file format uses single characters: `n`, `v`, `a`/`s` and `r`. Both
/// adjective characters (`a` and `s`) collapse into [`Pos::Adj`], so a
/// `from_char` → `to_char` round trip turns `s` into `a`.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
pub enum Pos {
    /// Written as `n`.
    Noun,
    /// Written as `v`.
    Verb,
    /// Written as `a`; also produced when parsing `s`.
    Adj,
    /// Written as `r`.
    Adv,
}

impl Pos {
    /// Convert a WordNet POS character into a [`Pos`].
    ///
    /// Returns `None` for any character other than `n`, `v`, `a`, `s` or `r`.
    /// Matching is case-sensitive.
    pub fn from_char(c: char) -> Option<Self> {
        let pos = match c {
            'n' => Self::Noun,
            'v' => Self::Verb,
            // Both adjective markers map onto the same variant.
            'a' | 's' => Self::Adj,
            'r' => Self::Adv,
            _ => return None,
        };
        Some(pos)
    }

    /// Return the single-character form used in `index.*`/`data.*`.
    ///
    /// [`Pos::Adj`] always yields `a`, never `s`.
    pub fn to_char(self) -> char {
        match self {
            Self::Noun => 'n',
            Self::Verb => 'v',
            Self::Adj => 'a',
            Self::Adv => 'r',
        }
    }
}

impl fmt::Display for Pos {
    /// Write the lowercase name: `noun`, `verb`, `adj` or `adv`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match *self {
            Self::Noun => "noun",
            Self::Verb => "verb",
            Self::Adj => "adj",
            Self::Adv => "adv",
        };
        f.write_str(label)
    }
}
64
/// `(offset, pos)` pair uniquely identifying a synset within the WordNet files.
///
/// The type is `Copy` and derives `Eq` and `Hash`, so it can be used directly
/// as a map or set key.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
pub struct SynsetId {
    /// Part-of-speech component of the identifier.
    pub pos: Pos,
    /// Numeric offset component of the identifier.
    // NOTE(review): presumably the byte offset of the record inside the
    // matching `data.*` file — confirm against the parser.
    pub offset: u32,
}
71
/// The `ss_type` marker of a `data.*` record.
///
/// Unlike [`Pos`], this keeps adjective satellites (`s`) distinct from plain
/// adjectives (`a`).
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
pub enum SynsetType {
    /// Parsed from `n`.
    Noun,
    /// Parsed from `v`.
    Verb,
    /// Parsed from `a`.
    Adj,
    /// Parsed from `r`.
    Adv,
    /// Parsed from `s`.
    AdjSatellite,
}

impl SynsetType {
    /// Convert the `ss_type` character of a data line into a [`SynsetType`].
    ///
    /// Returns `None` for any character other than `n`, `v`, `a`, `s` or `r`.
    /// Matching is case-sensitive.
    pub fn from_char(c: char) -> Option<Self> {
        // Character → variant table; each character appears exactly once.
        const MARKERS: [(char, SynsetType); 5] = [
            ('n', SynsetType::Noun),
            ('v', SynsetType::Verb),
            ('a', SynsetType::Adj),
            ('s', SynsetType::AdjSatellite),
            ('r', SynsetType::Adv),
        ];

        MARKERS
            .iter()
            .find(|&&(marker, _)| marker == c)
            .map(|&(_, kind)| kind)
    }
}
95
/// A lemma string and its per-synset `lex_id`.
#[derive(Clone, Debug)]
pub struct Lemma<'a> {
    /// Lemma text, borrowed for `'a` rather than owned.
    pub text: &'a str,
    /// The `lex_id` value recorded alongside this lemma.
    pub lex_id: u8,
}
102
/// Verb frame (`f_cnt`) entry describing example template applicability.
#[derive(Clone, Debug)]
pub struct Frame {
    /// Number identifying the frame.
    pub frame_number: u16,
    /// Word the frame applies to, when one is given.
    // NOTE(review): `None` presumably means the frame applies to every word
    // in the synset (a zero word number in the file) — confirm against the
    // parser.
    pub word_number: Option<u16>,
}
109
/// Pointer metadata from `p_cnt` section.
#[derive(Clone, Debug)]
pub struct Pointer<'a> {
    /// Pointer symbol text, borrowed for `'a`.
    pub symbol: &'a str,
    /// Identifier of the synset this pointer refers to.
    pub target: SynsetId,
    /// Source word number, or `None` when not specified.
    // NOTE(review): presumably populated from the first element returned by
    // `decode_st` — confirm against the parser.
    pub src_word: Option<u16>,
    /// Target word number, or `None` when not specified.
    // NOTE(review): presumably populated from the second element returned by
    // `decode_st` — confirm against the parser.
    pub dst_word: Option<u16>,
}
118
/// Parsed gloss with convenience helpers while keeping the raw text intact.
///
/// All fields borrow for `'a`; only the `examples` vector itself is
/// allocated.
#[derive(Clone, Debug)]
pub struct Gloss<'a> {
    /// The unmodified gloss text.
    pub raw: &'a str,
    /// The definition portion of the gloss.
    // NOTE(review): presumably a sub-slice of `raw` — confirm against the
    // parser.
    pub definition: &'a str,
    /// Example strings attached to the gloss; may be empty.
    // NOTE(review): whether surrounding quotes are stripped is decided by the
    // parser, which is not visible here.
    pub examples: Vec<&'a str>,
}
126
/// Complete synset record with all semantic fields preserved.
#[derive(Clone, Debug)]
pub struct Synset<'a> {
    /// Identifier (part of speech + offset) of this synset.
    pub id: SynsetId,
    /// Raw `lex_filenum` value of the record.
    pub lex_filenum: u8,
    /// `ss_type` of the record; distinguishes adjective satellites, which
    /// `id.pos` cannot.
    pub synset_type: SynsetType,
    /// Lemmas belonging to this synset.
    pub words: Vec<Lemma<'a>>,
    /// Pointers originating from this synset.
    pub pointers: Vec<Pointer<'a>>,
    /// Verb frames attached to this synset.
    // NOTE(review): unlike `words` and `pointers`, this is a borrowed slice,
    // so the `Frame` values must be owned by storage that outlives `'a`.
    pub frames: &'a [Frame],
    /// Gloss (definition and examples) of this synset.
    pub gloss: Gloss<'a>,
}
138
/// Index record from `index.*`, including sense and tagsense counts.
#[derive(Clone, Debug)]
pub struct IndexEntry<'a> {
    /// Lemma text of the entry, borrowed for `'a`.
    pub lemma: &'a str,
    /// Part of speech of the entry.
    pub pos: Pos,
    /// Raw `synset_cnt` field.
    // NOTE(review): presumably equals `synset_offsets.len()` — nothing in
    // this type enforces it.
    pub synset_cnt: u32,
    /// Raw `p_cnt` field.
    // NOTE(review): presumably equals `ptr_symbols.len()` — nothing in this
    // type enforces it.
    pub p_cnt: u32,
    /// Pointer symbols listed for the entry, borrowed for `'a`.
    pub ptr_symbols: Vec<&'a str>,
    /// Raw `sense_cnt` field.
    pub sense_cnt: u32,
    /// Raw `tagsense_cnt` field.
    pub tagsense_cnt: u32,
    /// Synset offsets listed for the entry.
    // NOTE(review): this is a borrowed slice of already-parsed `u32`s, so the
    // values must be owned by storage that outlives `'a`.
    pub synset_offsets: &'a [u32],
}
151
/// Decode the four-hex-digit source/target field used in pointer blocks.
///
/// The high byte is the source word number and the low byte is the target
/// word number. A zero byte means "not specified" per WordNet conventions
/// and is reported as `None`.
///
/// Returns `(None, None)` when `hex4` is not exactly four ASCII hex digits
/// (upper- or lowercase).
pub fn decode_st(hex4: &str) -> (Option<u16>, Option<u16>) {
    // Validate the digits explicitly: `u16::from_str_radix` also accepts a
    // leading `+`, so a 4-byte input such as "+0ab" would otherwise be decoded
    // as if it were a valid field.
    let well_formed = hex4.len() == 4 && hex4.bytes().all(|b| b.is_ascii_hexdigit());
    if !well_formed {
        return (None, None);
    }

    match u16::from_str_radix(hex4, 16) {
        Ok(val) => {
            // Zero is the "not specified" sentinel for either half.
            let specified = |n: u16| if n == 0 { None } else { Some(n) };
            (specified(val >> 8), specified(val & 0x00FF))
        }
        Err(_) => (None, None),
    }
}
172
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of `decode_st`: zero bytes decode to `None`, non-zero
    /// bytes to `Some`, and wrong-length input to `(None, None)`.
    #[test]
    fn decode_source_target() {
        let cases: [(&str, (Option<u16>, Option<u16>)); 5] = [
            ("0000", (None, None)),
            ("0100", (Some(1), None)),
            ("00ff", (None, Some(255))),
            ("0a0b", (Some(10), Some(11))),
            ("bad", (None, None)),
        ];

        for &(field, expected) in cases.iter() {
            assert_eq!(decode_st(field), expected);
        }
    }
}