1use crate::common::{Frame, Pointer, PointerSymbol, Synset, SynsetType, Word};
2use std::str::FromStr;
3
4pub fn parse_data_line(line: &str) -> Result<Synset, &'static str> {
6 let parts: Vec<&str> = line.split('|').collect();
7 if parts.len() != 2 {
8 return Err("Invalid data line format: missing '|' separator");
9 }
10
11 let data_part = parts[0].trim();
12 let gloss = parts[1].trim().to_string();
13
14 let fields: Vec<&str> = data_part.split_whitespace().collect();
15 if fields.len() < 6 {
16 return Err("Invalid data line format: insufficient fields");
17 }
18
19 let offset = u64::from_str(fields[0]).map_err(|_| "Invalid offset")?;
21 let lex_filenum = u8::from_str(fields[1]).map_err(|_| "Invalid lex_filenum")?;
22 let ss_type = SynsetType::from(fields[2]);
23
24 let w_cnt = u16::from_str_radix(fields[3], 16).map_err(|_| "Invalid word count")?;
26
27 let mut words = Vec::new();
29 let mut current_index = 4;
30 for _ in 0..w_cnt {
31 let word_str = fields[current_index];
32 current_index += 1;
33
34 if current_index >= fields.len() {
36 return Err("Invalid data line format: missing lex_id field");
37 }
38 let lex_id = u8::from_str_radix(fields[current_index], 16).map_err(|_| "Invalid lex_id")?;
39 current_index += 1;
40
41 words.push(Word {
42 word: word_str.to_string(),
43 lex_id,
44 });
45 }
46
47 if current_index >= fields.len() {
49 return Err("Invalid data line format: missing pointer count field");
50 }
51 let p_cnt = u16::from_str(fields[current_index]).map_err(|_| "Invalid pointer count")?;
52 current_index += 1;
53
54 let mut pointers = Vec::new();
56 for _ in 0..p_cnt {
57 if current_index + 3 >= fields.len() {
58 return Err("Invalid data line format: insufficient pointer fields");
59 }
60
61 let pointer_symbol = PointerSymbol::from(fields[current_index]);
62 current_index += 1;
63
64 let pointer_offset = u64::from_str(fields[current_index]).map_err(|_| "Invalid pointer offset")?;
65 current_index += 1;
66
67 let pos = SynsetType::from(fields[current_index]);
68 current_index += 1;
69
70 let source_target_str = fields[current_index];
71 current_index += 1;
72
73 if source_target_str.len() != 4 {
74 return Err("Invalid source/target field length");
75 }
76
77 let source = u16::from_str_radix(&source_target_str[0..2], 16).map_err(|_| "Invalid source field")?;
78 let target = u16::from_str_radix(&source_target_str[2..4], 16).map_err(|_| "Invalid target field")?;
79
80 pointers.push(Pointer {
81 symbol: pointer_symbol,
82 offset: pointer_offset,
83 pos,
84 source_target: (source, target),
85 });
86 }
87
88 let mut frames = Vec::new();
90 if matches!(ss_type, SynsetType::Verb) && current_index < fields.len() {
91 if let Ok(f_cnt) = u16::from_str(fields[current_index]) {
93 current_index += 1;
94
95 for _ in 0..f_cnt {
97 if current_index + 2 >= fields.len() || fields[current_index] != "+" {
98 return Err("Invalid frame format");
99 }
100
101 current_index += 1; let f_num = u16::from_str(fields[current_index]).map_err(|_| "Invalid frame number")?;
104 current_index += 1;
105
106 let w_num = u16::from_str_radix(fields[current_index], 16).map_err(|_| "Invalid word number")?;
107 current_index += 1;
108
109 frames.push(Frame {
110 frame_number: f_num,
111 word_number: w_num,
112 });
113 }
114 }
115 }
116
117 Ok(Synset {
118 offset,
119 lex_filenum,
120 ss_type,
121 words,
122 pointers,
123 frames,
124 gloss,
125 })
126}
127
#[cfg(test)]
mod tests {
    use super::*;

    // Adjective synset fixture: one word, five pointers, no frame section.
    const ADJECTIVE_LINE: &str = "00001740 00 a 01 able 0 005 = 05200169 n 0000 = 05616246 n 0000 + 05616246 n 0101 + 05200169 n 0101 ! 00002098 a 0101 | (usually followed by `to') having the necessary means or skill or know-how or authority to do something; \"able to swim\"; \"she was able to program her computer\"; \"we were at last able to buy a car\"; \"able to get a grant for the project\"";

    // Verb synset fixture: two words, eight pointers, two frames.
    const VERB_LINE: &str = "02409148 41 v 02 overwork 0 exploit 0 008 @ 02407987 v 0000 + 01867768 a 0203 + 01867768 a 0201 + 01867768 a 0202 + 00948206 n 0201 + 00623370 n 0101 + 00623370 n 0102 ~ 02408530 v 0000 02 + 08 00 + 09 00 | work excessively hard; \"he is exploiting the students\"";

    /// Header, single word and first pointer of an adjective line are decoded as expected.
    #[test]
    fn test_parse_adjective_line() {
        let parsed = parse_data_line(ADJECTIVE_LINE).unwrap();

        assert_eq!((parsed.offset, parsed.lex_filenum), (1740, 0));
        assert!(matches!(parsed.ss_type, SynsetType::Adjective));

        let words: Vec<(&str, u8)> = parsed
            .words
            .iter()
            .map(|w| (w.word.as_str(), w.lex_id))
            .collect();
        assert_eq!(words, [("able", 0)]);

        assert_eq!(parsed.pointers.len(), 5);
        let first = &parsed.pointers[0];
        assert!(matches!(first.symbol, PointerSymbol::Attribute));
        assert_eq!(first.offset, 5200169);
        assert!(matches!(first.pos, SynsetType::Noun));
        assert_eq!(first.source_target, (0, 0));
    }

    /// A verb line yields both words, all eight pointers and both trailing frames.
    #[test]
    fn test_parse_verb_line() {
        let parsed = parse_data_line(VERB_LINE).unwrap();

        assert_eq!((parsed.offset, parsed.lex_filenum), (2409148, 41));
        assert!(matches!(parsed.ss_type, SynsetType::Verb));

        let words: Vec<(&str, u8)> = parsed
            .words
            .iter()
            .map(|w| (w.word.as_str(), w.lex_id))
            .collect();
        assert_eq!(words, [("overwork", 0), ("exploit", 0)]);

        assert_eq!(parsed.pointers.len(), 8);

        let frames: Vec<_> = parsed
            .frames
            .iter()
            .map(|f| (f.frame_number, f.word_number))
            .collect();
        assert_eq!(frames, [(8, 0), (9, 0)]);
    }
}