morph_rs/analyzer/
morpholyzer.rs1use fst::{IntoStreamer, Streamer};
2use itertools::Itertools;
3use tracing::{debug, error};
4
5use crate::{
6 analyzer::{declension::alphabet_stream, Parse, WordForm},
7 errors::ParseErr,
8 morph::grammemes::Grammem,
9 InflectWord, Method, MorphAnalyzer, NormalizedWord, ParsedWord,
10};
11use std::collections::HashMap;
12
13use super::InflectWords;
14
15impl MorphAnalyzer {
16 pub(crate) fn try_into_parse(&self, word: &str, parse: &Parse) -> Result<ParsedWord, ParseErr> {
18 Ok(ParsedWord {
19 word: word.to_string(),
20 tags: self.get_tag(parse.tag)?.to_owned(),
21 normal_form: self.get_lemmas(parse.normal_form)?.to_string(),
22 method: Method::Dictionary,
23 })
24 }
25
26 pub(crate) fn try_into_normalized(&self, parse: &Parse) -> Result<NormalizedWord, ParseErr> {
28 Ok(NormalizedWord {
29 normal_word: self.get_lemmas(parse.normal_form)?.to_string(),
30 tags: self.get_tag(parse.tag)?.to_owned(),
31 method: Method::Dictionary,
32 })
33 }
34
35 pub(crate) fn try_into_inflect(
37 &self,
38 word: String,
39 parse: &Parse,
40 ) -> Result<InflectWord, ParseErr> {
41 Ok(InflectWord {
42 inflect_form: word,
43 tags: self.get_tag(parse.tag)?.to_owned(),
44 normal_form: self.get_lemmas(parse.normal_form)?.to_string(),
45 method: Method::Dictionary,
46 })
47 }
48
49 pub(crate) fn try_into_inflect_hint(
52 &self,
53 word: String,
54 word_form: &WordForm,
55 ) -> Result<InflectWord, ParseErr> {
56 Ok(InflectWord {
57 inflect_form: word,
58 tags: word_form.tag.to_owned(),
59 normal_form: word_form.lemma.to_string(),
60 method: Method::Dictionary,
61 })
62 }
63}
64
65impl MorphAnalyzer {
66 pub fn id_forms<'a>(
69 &'a self,
70 word: &'a str,
71 ids: &'a [u32],
72 word_id: Option<u64>,
73 grammemes: &'a Option<Vec<Grammem>>,
74 ) -> impl Iterator<Item = (u64, &'a Parse)> {
75 self.word_parses.iter().enumerate().flat_map(move |(i, p)| {
76 p.iter()
77 .filter_map(|p| {
78 let id = p.form.id();
79
80 if id.is_none() {
81 error!("{}", ParseErr::LostLemmaId(word.into()));
82 return None;
83 }
84
85 match (grammemes.is_none(), word_id) {
87 (true, Some(word_id)) => {
88 if word_id == id.unwrap() && (p.form.is_inizio() | p.form.is_normal()) {
90 Some((i as u64, p))
91 } else {
92 None
93 }
94 }
95 _ => {
99 if ids.contains(&(id.unwrap() as u32)) {
100 Some((i as u64, p))
101 } else {
102 None
103 }
104 }
105 }
106 })
107 .collect_vec()
108 })
109 }
110
111 pub(crate) fn collect_stream_hashset<'a>(
116 &'a self,
117 word: &str,
118 grammemes: &Option<Vec<Grammem>>,
119 id_forms: impl Iterator<Item = (u64, &'a Parse)>,
120 hash_set: &mut HashMap<(String, Option<String>), Vec<WordForm<'a>>>,
121 ) -> Result<(), ParseErr> {
122 for (i, parse) in id_forms {
123 let tag = self.get_tag(parse.tag)?;
124
125 if let Some(grammemes) = grammemes.as_ref() {
126 if !grammemes.iter().all(|item| tag.contains(item)) {
127 continue;
128 };
129 }
130
131 let normal_form = self.get_lemmas(parse.normal_form)?;
132
133 for (first, last) in
134 alphabet_stream(word, normal_form, tag.to_owned()).map_err(ParseErr::Declension)?
135 {
136 let word_form = WordForm {
137 i,
138 tag,
139 lemma: normal_form,
140 };
141
142 let vec = hash_set.entry((first, last)).or_default();
143 if !vec.contains(&word_form) {
144 vec.push(word_form)
145 }
146 }
147 }
148
149 Ok(())
150 }
151
152 pub(crate) fn iter_fst(
156 &self,
157 hash_set: &mut HashMap<(String, Option<String>), Vec<WordForm<'_>>>,
158 inflect: &mut InflectWords,
159 ) -> Result<(), ParseErr> {
160 let map = &self.fst;
161
162 for ((first, last), vec) in hash_set.iter() {
163 debug!("{first}-{last:?}");
164
165 let range = match last {
166 Some(last) => map.range().ge(first).lt(last),
167 None => map.range().ge(first).le(first),
168 };
169 let mut stream = range.into_stream();
170
171 while let Some((key, value)) = stream.next() {
172 for word_form in vec.iter().filter(|WordForm { i, .. }| *i == value) {
173 debug!("Value == i was found");
174 let inflect_word = self.try_into_inflect_hint(
175 String::from_utf8_lossy(key).to_string(),
176 word_form,
177 )?;
178 if !inflect.0.contains(&inflect_word) {
179 inflect.0.push(inflect_word);
180 }
181 }
182 }
183 }
184
185 Ok(())
186 }
187}