1use crate::common::{IndexEntry, PointerSymbol, SynsetType};
2use std::str::FromStr;
3
4pub fn parse_index_line(line: &str) -> Result<IndexEntry, &'static str> {
6 if line.starts_with(' ') {
8 return Err("Skipping copyright notice line");
9 }
10
11 let fields: Vec<&str> = line.split_whitespace().collect();
12 if fields.len() < 6 {
13 return Err("Invalid index line format: insufficient fields");
14 }
15
16 let lemma = fields[0].to_string();
18 let pos = SynsetType::from(fields[1]);
19 let synset_cnt = u32::from_str(fields[2]).map_err(|_| "Invalid synset count")?;
20 let p_cnt = u32::from_str(fields[3]).map_err(|_| "Invalid pointer count")?;
21
22 let mut ptr_symbols = Vec::new();
24 let mut current_index = 4;
25 for _ in 0..p_cnt {
26 if current_index >= fields.len() {
27 return Err("Invalid index line format: missing pointer symbol fields");
28 }
29 ptr_symbols.push(PointerSymbol::from(fields[current_index]));
30 current_index += 1;
31 }
32
33 if current_index + 1 >= fields.len() {
35 return Err("Invalid index line format: missing sense count fields");
36 }
37 let sense_cnt = u32::from_str(fields[current_index]).map_err(|_| "Invalid sense count")?;
38 current_index += 1;
39
40 let tagsense_cnt = u32::from_str(fields[current_index]).map_err(|_| "Invalid tagged sense count")?;
41 current_index += 1;
42
43 let mut synset_offsets = Vec::new();
45 while current_index < fields.len() {
46 let offset = u64::from_str(fields[current_index]).map_err(|_| "Invalid synset offset")?;
47 synset_offsets.push(offset);
48 current_index += 1;
49 }
50
51 if synset_offsets.len() != synset_cnt as usize {
53 return Err("Invalid index line: synset count doesn't match number of offsets");
54 }
55
56 Ok(IndexEntry {
57 lemma,
58 pos,
59 synset_cnt,
60 ptr_symbols,
61 sense_cnt,
62 tagsense_cnt,
63 synset_offsets,
64 })
65}
66
#[cfg(test)]
mod tests {
    use super::*;

    /// A noun line with two pointer symbols and a single synset offset.
    #[test]
    fn test_parse_noun_index_line() {
        let entry = parse_index_line("abductor_muscle n 1 2 @ ~ 1 0 05291010").unwrap();

        assert_eq!(entry.lemma, "abductor_muscle");
        assert!(matches!(entry.pos, SynsetType::Noun));
        assert_eq!(entry.synset_cnt, 1);
        // Exactly two symbols, in the order they appear on the line.
        assert!(matches!(
            entry.ptr_symbols.as_slice(),
            [PointerSymbol::Hypernym, PointerSymbol::Hyponym]
        ));
        assert_eq!(entry.sense_cnt, 1);
        assert_eq!(entry.tagsense_cnt, 0);
        // The zero-padded offset field is parsed as a plain integer.
        assert_eq!(entry.synset_offsets, vec![5291010]);
    }

    /// A verb line with five pointer symbols and two synset offsets.
    #[test]
    fn test_parse_verb_index_line() {
        let entry = parse_index_line("abduct v 2 5 ! @ ~ + ; 2 0 01471043 01449427").unwrap();

        assert_eq!(entry.lemma, "abduct");
        assert!(matches!(entry.pos, SynsetType::Verb));
        assert_eq!(entry.synset_cnt, 2);
        assert_eq!(entry.ptr_symbols.len(), 5);
        // Only the first of the five symbols is checked here.
        assert!(matches!(
            entry.ptr_symbols.first(),
            Some(PointerSymbol::Antonym)
        ));
        assert_eq!(entry.sense_cnt, 2);
        assert_eq!(entry.tagsense_cnt, 0);
        assert_eq!(entry.synset_offsets, vec![1471043, 1449427]);
    }

    /// An adjective line whose lemma starts with punctuation and whose only
    /// pointer symbol is a backslash.
    #[test]
    fn test_parse_adj_index_line() {
        let entry = parse_index_line(".22-caliber a 1 1 \\ 1 0 03146310").unwrap();

        assert_eq!(entry.lemma, ".22-caliber");
        assert!(matches!(entry.pos, SynsetType::Adjective));
        assert_eq!(entry.synset_cnt, 1);
        assert!(matches!(
            entry.ptr_symbols.as_slice(),
            [PointerSymbol::Pertainym]
        ));
        assert_eq!(entry.sense_cnt, 1);
        assert_eq!(entry.tagsense_cnt, 0);
        assert_eq!(entry.synset_offsets, vec![3146310]);
    }
}