Skip to main content

wn_parser/
index.rs

1use crate::common::{IndexEntry, PointerSymbol, SynsetType};
2use std::str::FromStr;
3
4/// Parse a line from an index.* file into an IndexEntry
5pub fn parse_index_line(line: &str) -> Result<IndexEntry, &'static str> {
6    // Skip lines that begin with space (copyright notices)
7    if line.starts_with(' ') {
8        return Err("Skipping copyright notice line");
9    }
10
11    let fields: Vec<&str> = line.split_whitespace().collect();
12    if fields.len() < 6 {
13        return Err("Invalid index line format: insufficient fields");
14    }
15
16    // Parse the fixed fields
17    let lemma = fields[0].to_string();
18    let pos = SynsetType::from(fields[1]);
19    let synset_cnt = u32::from_str(fields[2]).map_err(|_| "Invalid synset count")?;
20    let p_cnt = u32::from_str(fields[3]).map_err(|_| "Invalid pointer count")?;
21
22    // Parse pointer symbols
23    let mut ptr_symbols = Vec::new();
24    let mut current_index = 4;
25    for _ in 0..p_cnt {
26        if current_index >= fields.len() {
27            return Err("Invalid index line format: missing pointer symbol fields");
28        }
29        ptr_symbols.push(PointerSymbol::from(fields[current_index]));
30        current_index += 1;
31    }
32
33    // Next fields are sense_cnt and tagsense_cnt
34    if current_index + 1 >= fields.len() {
35        return Err("Invalid index line format: missing sense count fields");
36    }
37    let sense_cnt = u32::from_str(fields[current_index]).map_err(|_| "Invalid sense count")?;
38    current_index += 1;
39    
40    let tagsense_cnt = u32::from_str(fields[current_index]).map_err(|_| "Invalid tagged sense count")?;
41    current_index += 1;
42
43    // Parse synset offsets
44    let mut synset_offsets = Vec::new();
45    while current_index < fields.len() {
46        let offset = u64::from_str(fields[current_index]).map_err(|_| "Invalid synset offset")?;
47        synset_offsets.push(offset);
48        current_index += 1;
49    }
50
51    // Validation - number of offsets should match synset_cnt
52    if synset_offsets.len() != synset_cnt as usize {
53        return Err("Invalid index line: synset count doesn't match number of offsets");
54    }
55
56    Ok(IndexEntry {
57        lemma,
58        pos,
59        synset_cnt,
60        ptr_symbols,
61        sense_cnt,
62        tagsense_cnt,
63        synset_offsets,
64    })
65}
66
#[cfg(test)]
mod tests {
    use super::*;

    /// A noun entry with two pointer symbols and a single synset offset.
    #[test]
    fn test_parse_noun_index_line() {
        let line = "abductor_muscle n 1 2 @ ~ 1 0 05291010";
        let index_entry = parse_index_line(line).unwrap();

        assert_eq!(index_entry.lemma, "abductor_muscle");
        assert!(matches!(index_entry.pos, SynsetType::Noun));
        assert_eq!(index_entry.synset_cnt, 1);
        assert_eq!(index_entry.ptr_symbols.len(), 2);
        assert!(matches!(index_entry.ptr_symbols[0], PointerSymbol::Hypernym));
        assert!(matches!(index_entry.ptr_symbols[1], PointerSymbol::Hyponym));
        assert_eq!(index_entry.sense_cnt, 1);
        assert_eq!(index_entry.tagsense_cnt, 0);
        assert_eq!(index_entry.synset_offsets.len(), 1);
        assert_eq!(index_entry.synset_offsets[0], 5291010);
    }

    /// A verb entry with five pointer symbols and two synset offsets.
    #[test]
    fn test_parse_verb_index_line() {
        let line = "abduct v 2 5 ! @ ~ + ; 2 0 01471043 01449427";
        let index_entry = parse_index_line(line).unwrap();

        assert_eq!(index_entry.lemma, "abduct");
        assert!(matches!(index_entry.pos, SynsetType::Verb));
        assert_eq!(index_entry.synset_cnt, 2);
        assert_eq!(index_entry.ptr_symbols.len(), 5);
        assert!(matches!(index_entry.ptr_symbols[0], PointerSymbol::Antonym));
        assert_eq!(index_entry.sense_cnt, 2);
        assert_eq!(index_entry.tagsense_cnt, 0);
        assert_eq!(index_entry.synset_offsets.len(), 2);
        assert_eq!(index_entry.synset_offsets[0], 1471043);
        assert_eq!(index_entry.synset_offsets[1], 1449427);
    }

    /// An adjective entry whose only pointer symbol is a literal backslash.
    #[test]
    fn test_parse_adj_index_line() {
        let line = ".22-caliber a 1 1 \\ 1 0 03146310";
        let index_entry = parse_index_line(line).unwrap();

        assert_eq!(index_entry.lemma, ".22-caliber");
        assert!(matches!(index_entry.pos, SynsetType::Adjective));
        assert_eq!(index_entry.synset_cnt, 1);
        assert_eq!(index_entry.ptr_symbols.len(), 1);
        assert!(matches!(index_entry.ptr_symbols[0], PointerSymbol::Pertainym));
        assert_eq!(index_entry.sense_cnt, 1);
        assert_eq!(index_entry.tagsense_cnt, 0);
        assert_eq!(index_entry.synset_offsets.len(), 1);
        assert_eq!(index_entry.synset_offsets[0], 3146310);
    }

    /// Lines starting with a space are rejected as copyright notices.
    #[test]
    fn test_rejects_copyright_line() {
        let result = parse_index_line("  1 This software and database is being provided to you");
        assert_eq!(result.err(), Some("Skipping copyright notice line"));
    }

    /// Fewer than six whitespace-separated fields cannot form an entry.
    #[test]
    fn test_rejects_too_few_fields() {
        let expected = Some("Invalid index line format: insufficient fields");
        assert_eq!(parse_index_line("").err(), expected);
        assert_eq!(parse_index_line("abduct v 2").err(), expected);
    }

    /// The synset count and pointer count must both be numeric.
    #[test]
    fn test_rejects_non_numeric_counts() {
        let bad_synset_cnt = parse_index_line("abduct v two 1 ! 2 0 01471043 01449427");
        assert_eq!(bad_synset_cnt.err(), Some("Invalid synset count"));

        let bad_p_cnt = parse_index_line("abduct v 2 one ! 2 0 01471043 01449427");
        assert_eq!(bad_p_cnt.err(), Some("Invalid pointer count"));
    }

    /// A line that ends before all announced fields are present is reported as truncated.
    #[test]
    fn test_rejects_truncated_line() {
        // Five pointer symbols announced, only two present.
        let missing_symbols = parse_index_line("abduct v 2 5 ! @");
        assert_eq!(
            missing_symbols.err(),
            Some("Invalid index line format: missing pointer symbol fields")
        );

        // Pointer symbols complete, but only one of the two sense-count fields follows.
        let missing_counts = parse_index_line("abduct v 2 2 ! @ 2");
        assert_eq!(
            missing_counts.err(),
            Some("Invalid index line format: missing sense count fields")
        );
    }

    /// Synset offsets must be numeric.
    #[test]
    fn test_rejects_non_numeric_offset() {
        let result = parse_index_line("abductor_muscle n 1 2 @ ~ 1 0 not_an_offset");
        assert_eq!(result.err(), Some("Invalid synset offset"));
    }

    /// The declared synset count must equal the number of offsets on the line.
    #[test]
    fn test_rejects_offset_count_mismatch() {
        let result = parse_index_line("abductor_muscle n 2 2 @ ~ 1 0 05291010");
        assert_eq!(
            result.err(),
            Some("Invalid index line: synset count doesn't match number of offsets")
        );
    }
}