Skip to main content

wn_parser/
sense.rs

1use crate::common::SenseEntry;
2use std::str::FromStr;
3
4/// Parse a line from the index.sense file into a SenseEntry
5pub fn parse_sense_line(line: &str) -> Result<SenseEntry, &'static str> {
6    let fields: Vec<&str> = line.split_whitespace().collect();
7    if fields.len() < 4 {
8        return Err("Invalid sense index line format: insufficient fields");
9    }
10
11    let sense_key = fields[0].to_string();
12    let synset_offset = u64::from_str(fields[1]).map_err(|_| "Invalid synset offset")?;
13    let sense_number = u32::from_str(fields[2]).map_err(|_| "Invalid sense number")?;
14    let tag_cnt = u32::from_str(fields[3]).map_err(|_| "Invalid tag count")?;
15
16    Ok(SenseEntry {
17        sense_key,
18        synset_offset,
19        sense_number,
20        tag_cnt,
21    })
22}
23
/// Splits a sense key into its lemma and `lex_sense` halves.
///
/// The key must contain exactly one `'%'`; the text before it is returned as
/// the lemma and the text after it as the `lex_sense` string. Either half may
/// be empty.
///
/// # Errors
///
/// Returns a static message when the key contains no `'%'` or more than one.
pub fn parse_sense_key(sense_key: &str) -> Result<(String, String), &'static str> {
    match sense_key.split_once('%') {
        // A second '%' in the remainder means the key had more than two parts.
        Some((lemma, lex_sense)) if !lex_sense.contains('%') => {
            Ok((lemma.to_owned(), lex_sense.to_owned()))
        }
        _ => Err("Invalid sense key format: missing '%' separator"),
    }
}
36
/// Breaks the `lex_sense` portion of a sense key into its five components.
///
/// The input must consist of exactly five colon-separated parts. On success
/// the tuple `(ss_type, lex_filenum, lex_id, head_word, head_id)` is returned.
/// `head_word` is copied verbatim and may be empty; an empty `head_id` part is
/// reported as `0`.
///
/// # Errors
///
/// Returns a static message when the part count is not five, or when
/// `ss_type`, `lex_filenum`, `lex_id` or a non-empty `head_id` is not a valid
/// `u8`. Checks are made in that order.
pub fn parse_lex_sense(lex_sense: &str) -> Result<(u8, u8, u8, String, u8), &'static str> {
    let pieces: Vec<&str> = lex_sense.split(':').collect();
    let (raw_type, raw_filenum, raw_id, head_word, raw_head_id) = match pieces.as_slice() {
        &[a, b, c, d, e] => (a, b, c, d, e),
        _ => return Err("Invalid lex_sense format: should have 5 colon-separated parts"),
    };

    let ss_type = raw_type.parse::<u8>().map_err(|_| "Invalid ss_type")?;
    let lex_filenum = raw_filenum.parse::<u8>().map_err(|_| "Invalid lex_filenum")?;
    let lex_id = raw_id.parse::<u8>().map_err(|_| "Invalid lex_id")?;

    // A missing head_id is represented as 0 rather than treated as an error.
    let head_id = match raw_head_id {
        "" => 0,
        digits => digits.parse::<u8>().map_err(|_| "Invalid head_id")?,
    };

    Ok((ss_type, lex_filenum, lex_id, head_word.to_owned(), head_id))
}
56
#[cfg(test)]
mod tests {
    // Unit tests for the sense-index parsers: the happy path of each function,
    // plus the error paths and a non-zero head_id so that the numeric parse of
    // that field is distinguishable from the "empty means 0" default.
    use super::*;

    #[test]
    fn test_parse_sense_line() {
        let line = "abandonment%1:04:03:: 00204439 1 3";
        let sense_entry = parse_sense_line(line).unwrap();

        assert_eq!(sense_entry.sense_key, "abandonment%1:04:03::");
        assert_eq!(sense_entry.synset_offset, 204439);
        assert_eq!(sense_entry.sense_number, 1);
        assert_eq!(sense_entry.tag_cnt, 3);
    }

    #[test]
    fn test_parse_sense_line_errors() {
        // `.err()` is used so the assertions do not require `SenseEntry: Debug`.
        assert_eq!(
            parse_sense_line("abandonment%1:04:03:: 00204439 1").err(),
            Some("Invalid sense index line format: insufficient fields")
        );
        assert_eq!(
            parse_sense_line("abandonment%1:04:03:: offset 1 3").err(),
            Some("Invalid synset offset")
        );
        assert_eq!(
            parse_sense_line("abandonment%1:04:03:: 00204439 one 3").err(),
            Some("Invalid sense number")
        );
        assert_eq!(
            parse_sense_line("abandonment%1:04:03:: 00204439 1 three").err(),
            Some("Invalid tag count")
        );
    }

    #[test]
    fn test_parse_sense_key() {
        let (lemma, lex_sense) = parse_sense_key("abandonment%1:04:03::").unwrap();
        assert_eq!(lemma, "abandonment");
        assert_eq!(lex_sense, "1:04:03::");
    }

    #[test]
    fn test_parse_sense_key_errors() {
        // Both a missing separator and a repeated one are rejected.
        assert!(parse_sense_key("abandonment").is_err());
        assert!(parse_sense_key("a%b%c").is_err());
    }

    #[test]
    fn test_parse_lex_sense() {
        let (ss_type, lex_filenum, lex_id, head_word, head_id) =
            parse_lex_sense("1:04:03::").unwrap();

        assert_eq!(ss_type, 1);
        assert_eq!(lex_filenum, 4);
        assert_eq!(lex_id, 3);
        assert_eq!(head_word, "");
        assert_eq!(head_id, 0);
    }

    #[test]
    fn test_parse_lex_sense_with_head() {
        let (ss_type, lex_filenum, lex_id, head_word, head_id) =
            parse_lex_sense("5:00:00:discomposed:00").unwrap();

        assert_eq!(ss_type, 5);
        assert_eq!(lex_filenum, 0);
        assert_eq!(lex_id, 0);
        assert_eq!(head_word, "discomposed");
        assert_eq!(head_id, 0);
    }

    #[test]
    fn test_parse_lex_sense_with_nonzero_head_id() {
        // "00" above is indistinguishable from the empty-field default, so
        // also check a head_id that can only come from actually parsing it.
        let (ss_type, lex_filenum, lex_id, head_word, head_id) =
            parse_lex_sense("5:00:00:hot:01").unwrap();

        assert_eq!(ss_type, 5);
        assert_eq!(lex_filenum, 0);
        assert_eq!(lex_id, 0);
        assert_eq!(head_word, "hot");
        assert_eq!(head_id, 1);
    }

    #[test]
    fn test_parse_lex_sense_errors() {
        assert_eq!(
            parse_lex_sense("1:04:03"),
            Err("Invalid lex_sense format: should have 5 colon-separated parts")
        );
        assert_eq!(parse_lex_sense("x:04:03::"), Err("Invalid ss_type"));
        assert_eq!(parse_lex_sense("1:x:03::"), Err("Invalid lex_filenum"));
        assert_eq!(parse_lex_sense("1:04:x::"), Err("Invalid lex_id"));
        assert_eq!(parse_lex_sense("1:04:03::x"), Err("Invalid head_id"));
    }
}