Skip to main content

wn_parser/
data.rs

1use crate::common::{Frame, Pointer, PointerSymbol, Synset, SynsetType, Word};
2use std::str::FromStr;
3
4/// Parse a line from a data.* file into a Synset
5pub fn parse_data_line(line: &str) -> Result<Synset, &'static str> {
6    let parts: Vec<&str> = line.split('|').collect();
7    if parts.len() != 2 {
8        return Err("Invalid data line format: missing '|' separator");
9    }
10
11    let data_part = parts[0].trim();
12    let gloss = parts[1].trim().to_string();
13
14    let fields: Vec<&str> = data_part.split_whitespace().collect();
15    if fields.len() < 6 {
16        return Err("Invalid data line format: insufficient fields");
17    }
18
19    // Parse fixed fields
20    let offset = u64::from_str(fields[0]).map_err(|_| "Invalid offset")?;
21    let lex_filenum = u8::from_str(fields[1]).map_err(|_| "Invalid lex_filenum")?;
22    let ss_type = SynsetType::from(fields[2]);
23    
24    // Parse word count (in hex)
25    let w_cnt = u16::from_str_radix(fields[3], 16).map_err(|_| "Invalid word count")?;
26    
27    // Extract words
28    let mut words = Vec::new();
29    let mut current_index = 4;
30    for _ in 0..w_cnt {
31        let word_str = fields[current_index];
32        current_index += 1;
33        
34        // Next field is lex_id (in hex)
35        if current_index >= fields.len() {
36            return Err("Invalid data line format: missing lex_id field");
37        }
38        let lex_id = u8::from_str_radix(fields[current_index], 16).map_err(|_| "Invalid lex_id")?;
39        current_index += 1;
40        
41        words.push(Word {
42            word: word_str.to_string(),
43            lex_id,
44        });
45    }
46    
47    // Parse pointer count
48    if current_index >= fields.len() {
49        return Err("Invalid data line format: missing pointer count field");
50    }
51    let p_cnt = u16::from_str(fields[current_index]).map_err(|_| "Invalid pointer count")?;
52    current_index += 1;
53    
54    // Extract pointers
55    let mut pointers = Vec::new();
56    for _ in 0..p_cnt {
57        if current_index + 3 >= fields.len() {
58            return Err("Invalid data line format: insufficient pointer fields");
59        }
60        
61        let pointer_symbol = PointerSymbol::from(fields[current_index]);
62        current_index += 1;
63        
64        let pointer_offset = u64::from_str(fields[current_index]).map_err(|_| "Invalid pointer offset")?;
65        current_index += 1;
66        
67        let pos = SynsetType::from(fields[current_index]);
68        current_index += 1;
69        
70        let source_target_str = fields[current_index];
71        current_index += 1;
72        
73        if source_target_str.len() != 4 {
74            return Err("Invalid source/target field length");
75        }
76        
77        let source = u16::from_str_radix(&source_target_str[0..2], 16).map_err(|_| "Invalid source field")?;
78        let target = u16::from_str_radix(&source_target_str[2..4], 16).map_err(|_| "Invalid target field")?;
79        
80        pointers.push(Pointer {
81            symbol: pointer_symbol,
82            offset: pointer_offset,
83            pos,
84            source_target: (source, target),
85        });
86    }
87    
88    // For verbs, parse frames
89    let mut frames = Vec::new();
90    if matches!(ss_type, SynsetType::Verb) && current_index < fields.len() {
91        // Check if there's a frame count
92        if let Ok(f_cnt) = u16::from_str(fields[current_index]) {
93            current_index += 1;
94            
95            // Parse frames
96            for _ in 0..f_cnt {
97                if current_index + 2 >= fields.len() || fields[current_index] != "+" {
98                    return Err("Invalid frame format");
99                }
100                
101                current_index += 1; // Skip the '+'
102                
103                let f_num = u16::from_str(fields[current_index]).map_err(|_| "Invalid frame number")?;
104                current_index += 1;
105                
106                let w_num = u16::from_str_radix(fields[current_index], 16).map_err(|_| "Invalid word number")?;
107                current_index += 1;
108                
109                frames.push(Frame {
110                    frame_number: f_num,
111                    word_number: w_num,
112                });
113            }
114        }
115    }
116
117    Ok(Synset {
118        offset,
119        lex_filenum,
120        ss_type,
121        words,
122        pointers,
123        frames,
124        gloss,
125    })
126}
127
#[cfg(test)]
mod tests {
    use super::*;

    /// Adjective fixture: one word, five pointers, no frame section.
    const ADJECTIVE_LINE: &str = "00001740 00 a 01 able 0 005 = 05200169 n 0000 = 05616246 n 0000 + 05616246 n 0101 + 05200169 n 0101 ! 00002098 a 0101 | (usually followed by `to') having the necessary means or skill or know-how or authority to do something; \"able to swim\"; \"she was able to program her computer\"; \"we were at last able to buy a car\"; \"able to get a grant for the project\"";

    /// Verb fixture: two words, eight pointers, two frames.
    const VERB_LINE: &str = "02409148 41 v 02 overwork 0 exploit 0 008 @ 02407987 v 0000 + 01867768 a 0203 + 01867768 a 0201 + 01867768 a 0202 + 00948206 n 0201 + 00623370 n 0101 + 00623370 n 0102 ~ 02408530 v 0000 02 + 08 00 + 09 00 | work excessively hard; \"he is exploiting the students\"";

    /// Checks the header fields, the single word, and the first pointer of an
    /// adjective line.
    #[test]
    fn test_parse_adjective_line() {
        let parsed = parse_data_line(ADJECTIVE_LINE).unwrap();

        // Header fields.
        assert_eq!(parsed.offset, 1740);
        assert_eq!(parsed.lex_filenum, 0);
        assert!(matches!(parsed.ss_type, SynsetType::Adjective));

        // Exactly one word, "able", with lex_id 0.
        let lemmas: Vec<_> = parsed
            .words
            .iter()
            .map(|w| (w.word.as_str(), w.lex_id))
            .collect();
        assert_eq!(lemmas, [("able", 0)]);

        // Five pointers; the first is an attribute pointer to a noun synset.
        assert_eq!(parsed.pointers.len(), 5);
        let first = &parsed.pointers[0];
        assert!(matches!(first.symbol, PointerSymbol::Attribute));
        assert_eq!(first.offset, 5200169);
        assert!(matches!(first.pos, SynsetType::Noun));
        assert_eq!(first.source_target, (0, 0));
    }

    /// Checks the header fields, both words, the pointer count, and the two
    /// frames of a verb line.
    #[test]
    fn test_parse_verb_line() {
        let parsed = parse_data_line(VERB_LINE).unwrap();

        // Header fields.
        assert_eq!(parsed.offset, 2409148);
        assert_eq!(parsed.lex_filenum, 41);
        assert!(matches!(parsed.ss_type, SynsetType::Verb));

        // Both words, in order, each with lex_id 0.
        let lemmas: Vec<_> = parsed
            .words
            .iter()
            .map(|w| (w.word.as_str(), w.lex_id))
            .collect();
        assert_eq!(lemmas, [("overwork", 0), ("exploit", 0)]);

        assert_eq!(parsed.pointers.len(), 8);

        // Two frames as (frame_number, word_number) pairs.
        let frame_pairs: Vec<_> = parsed
            .frames
            .iter()
            .map(|f| (f.frame_number, f.word_number))
            .collect();
        assert_eq!(frame_pairs, [(8, 0), (9, 0)]);
    }
}