1use crate::{
2 analyzer::vangovanie::VangovanieRes,
3 errors::{MopsErr, MopsResult, ParseErr},
4 morph::grammemes::{Form, Grammem},
5 InflectWord, Method, MorphAnalyzer, NormalizedWord, ParsedWord, Vangovanie,
6};
7use allocative::Allocative;
8use fst::Map;
9use serde::{Deserialize, Serialize};
10use smallstr::SmallString;
11use smallvec::SmallVec;
12use std::{
13 collections::{HashMap, HashSet},
14 path::PathBuf,
15};
16use tracing::debug;
17
/// Inline capacity, in bytes, of the `SmallString` that stores a [`VangaItem`] postfix.
pub const SMALLVANGA: usize = 8;
/// Inline capacity, in grammemes, of the `SmallVec` behind a [`Tag`].
pub const SMALLTAG: usize = 8;
/// Inline capacity, in bytes, of the `SmallString` that stores one entry of [`Lemmas`].
pub const SMALLLEMMA: usize = 16;
30
// Submodules; `Dictionary` is the only item re-exported from them here.
pub(crate) mod dictionary;
pub use dictionary::Dictionary;

pub(crate) mod declension;
pub(crate) mod morpholyzer;
// NOTE(review): presumably the module behind the `analyzer::vangovanie::VangovanieRes`
// import at the top of this file — confirm.
pub(crate) mod vangovanie;

pub mod pretty_display;
43
/// Grammemes of a single word form; up to [`SMALLTAG`] of them are stored inline.
pub type Tag = SmallVec<[Grammem; SMALLTAG]>;
/// A list of [`Tag`]s.
pub type Tags = Vec<Tag>;
/// A list of lemma strings; each keeps up to [`SMALLLEMMA`] bytes inline.
pub type Lemmas = Vec<SmallString<[u8; SMALLLEMMA]>>;

/// Rows of [`Parse`] records.
pub type ParseTable = Vec<Vec<Parse>>;

/// Identifier stored in a lemma row (see [`LemmasRows`]).
pub type OpCLid = u32;
/// A list of lemma rows, each a list of [`OpCLid`]s.
pub type LemmasRows = Vec<Vec<OpCLid>>;
59
/// One analysis of a dictionary word: its form plus identifiers that the
/// analyzer resolves through its lookup tables.
///
/// The derived `Ord`/`PartialOrd` compare fields in declaration order, so the
/// field order below is part of the sorting behaviour.
#[derive(
    Debug, PartialEq, Eq, Ord, PartialOrd, Clone, Serialize, Deserialize, Allocative, Hash,
)]
pub struct Parse {
    /// Form descriptor; queried through `is_normal()`, `is_inizio()` and `id()`.
    pub(crate) form: Form,
    /// Tag identifier; compared with the position returned by `tags.binary_search`.
    pub(crate) tag: TagID,
    /// Lemma identifier, resolved with `get_lemmas`.
    pub(crate) normal_form: LemmaID,
    /// Lemma-row identifier, resolved with `get_row_id` / `lemmas_rows.get`.
    pub(crate) lemma_row_id: LemmaRowId,
}
70
/// Identifier of a tag; compared with positions returned by `tags.binary_search`.
pub type TagID = usize;
/// Identifier of a lemma; the argument of `get_lemmas`.
pub type LemmaID = usize;
/// Identifier of a lemma row; the argument of `get_row_id` and of `lemmas_rows.get`.
pub type LemmaRowId = usize;
77
/// A group of [`VangaItem`]s together with a popularity counter.
///
/// NOTE(review): presumably backs the `vangovanie` (guessing) fallback used for
/// words missing from the dictionary; nothing in this part of the file reads
/// it — confirm against the `vangovanie` module.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Serialize, Deserialize, Allocative)]
pub struct Vanga {
    /// Popularity counter of the group.
    pub(crate) popularity: u64,
    /// Items that belong to the group.
    pub postfix: Vec<VangaItem>,
}
84
/// One postfix entry of a [`Vanga`]: the postfix text, a form and a list of tag identifiers.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Serialize, Deserialize, Allocative)]
pub struct VangaItem {
    /// Postfix text; up to [`SMALLVANGA`] bytes are stored inline.
    /// Marked `skip`, so it is left out of the derived `Allocative` accounting.
    #[allocative(skip)]
    pub postfix: SmallString<[u8; SMALLVANGA]>,
    /// Form associated with the postfix.
    pub(crate) form: Form,
    /// Tag identifiers associated with the postfix.
    pub(crate) tag: Vec<TagID>,
}
92
93#[derive(Debug, Default, Eq, PartialEq, Clone)]
94pub struct ParsedWords(pub Vec<ParsedWord>);
96
97impl ParsedWords {
98 pub fn find(self, memes: Vec<Grammem>) -> Option<ParsedWord> {
99 self.0
100 .into_iter()
101 .find(|w| memes.iter().all(|meme| w.clone().tag().contains(meme)))
102 }
103}
104
105#[derive(Default)]
106pub struct NormalizedWords(pub Vec<NormalizedWord>);
108
109impl NormalizedWords {
110 pub fn find(self, memes: Vec<Grammem>) -> Option<NormalizedWord> {
111 self.0
112 .into_iter()
113 .find(|w| memes.iter().all(|meme| w.clone().tag().contains(meme)))
114 }
115}
116
117#[derive(Debug, Default, Clone)]
118pub struct InflectWords(pub Vec<InflectWord>);
120
121impl InflectWords {
122 pub fn find(self, memes: Vec<Grammem>) -> Option<InflectWord> {
123 self.0
124 .into_iter()
125 .find(|w| memes.iter().all(|meme| w.clone().tag().contains(meme)))
126 }
127}
128
129impl MorphAnalyzer {
130 pub fn from_dictionary(dictionary: Dictionary, fst: PathBuf) -> MopsResult<Self> {
132 let Dictionary {
133 meta: _,
134 word_parses,
135 tags,
136 lemmas,
137 paradigms,
138 lemmas_rows,
139 } = dictionary;
140
141 Ok(Self {
142 fst: Self::to_bytes_map(&fst)?,
143 word_parses,
144 tags,
145 lemmas,
146 paradigms,
147 lemmas_rows,
148 })
149 }
150
151 pub(crate) fn to_bytes_map(fst: &PathBuf) -> MopsResult<Map<Vec<u8>>> {
153 let buf = std::fs::read(fst).map_err(|error| MopsErr::File {
154 file: fst.to_path_buf(),
155 error,
156 })?;
157 Map::new(buf).map_err(MopsErr::FSTMap)
158 }
159
160 pub fn parse_word(&self, word: &str) -> Result<ParsedWords, ParseErr> {
162 let map = &self.fst;
163 let mut parsed = ParsedWords::default();
164
165 match map.get(word.as_bytes()) {
166 Some(common_id) => {
167 debug!("{word} найдено в словаре");
168 let vec_tags = self.get_parse(common_id)?;
169
170 for parse in vec_tags {
171 parsed.0.push(self.try_into_parse(word, parse)?)
172 }
173
174 parsed.0.sort();
176 }
177 None => {
178 if let Some(vanga) = self.vangovanie(word)? {
179 for VangovanieRes {
180 tags,
181 form: _,
182 method,
183 normal_form,
184 ..
185 } in vanga
186 {
187 let normal_form = match &method {
188 Vangovanie::KnownPrefix(affix) | Vangovanie::UnknownPrefix(affix) => {
189 format!("{affix}{normal_form}")
190 }
191 Vangovanie::Postfix => return Err(ParseErr::FutureRelease),
192 };
193
194 parsed.0.push(ParsedWord {
195 word: word.to_string(),
196 tags,
197 normal_form,
198 method: Method::Vangovanie(method),
199 })
200 }
201 }
202 }
203 }
204
205 Ok(parsed)
206 }
207
208 pub fn normalized_word(&self, word: &str) -> Result<NormalizedWords, ParseErr> {
210 let map = &self.fst;
211 let mut normalized = NormalizedWords::default();
212
213 match map.get(word) {
214 Some(common_id) => {
215 debug!("{word} найдено в словаре");
216 let vec_parses = self.get_parse(common_id)?;
217
218 for parse in vec_parses.iter() {
219 if parse.form.is_normal() {
220 normalized.0.push(self.try_into_normalized(parse)?)
221 } else {
222 let lemmas_link = self.get_row_id(parse.lemma_row_id)?;
224
225 let word = self.get_lemmas(parse.normal_form)?.to_string();
226 let id = map
227 .get(&word)
228 .ok_or_else(|| ParseErr::LostNormalForm(word.clone()))?;
229 let vec_parses = self.get_parse(id)?;
230
231 for parse in vec_parses.iter() {
232 let normalized_word = self.try_into_normalized(parse)?;
233 if parse.form.is_normal()
234 && !normalized.0.contains(&normalized_word)
235 && lemmas_link.contains(&(parse.form.id().unwrap() as u32))
237 {
238 normalized.0.push(normalized_word)
239 }
240 }
241 }
242 }
243
244 normalized.0.sort();
246 }
247 None => {
248 if let Some(vanga) = self.vangovanie(word)? {
249 for VangovanieRes {
250 tags, form, method, ..
251 } in vanga
252 {
253 if form.is_normal() {
254 normalized.0.push(NormalizedWord {
255 normal_word: word.to_owned(),
256 tags,
257 method: Method::Vangovanie(method),
258 })
259 } else {
260 return Err(ParseErr::FutureRelease);
261 }
262 }
263 }
264 }
265 }
266
267 Ok(normalized)
268 }
269}
270
/// A borrowed word-form record; the inflection and declension routines in this
/// file group these into `HashMap` buckets keyed by `(String, Option<String>)`.
#[derive(Debug, PartialEq)]
pub(crate) struct WordForm<'a> {
    // NOTE(review): numeric value attached to the form; its meaning is defined
    // where the records are built (not visible in this part of the file) — confirm.
    i: u64,
    // Borrowed tag of the form.
    tag: &'a Tag,
    // Borrowed lemma string of the form.
    lemma: &'a SmallString<[u8; SMALLLEMMA]>,
}
277
278impl MorphAnalyzer {
279 pub(crate) fn inflect_word(
284 &self,
285 word: &str,
286 grammemes: Option<Vec<Grammem>>,
287 ) -> Result<Option<InflectWords>, ParseErr> {
288 let map = &self.fst;
289 let mut inflect = InflectWords::default();
290
291 match map.get(word) {
292 Some(common_id) => {
293 debug!("{word} найдено в словаре");
294 let vec_parses = self.get_parse(common_id)?;
295
296 for parse in vec_parses.iter() {
298 self.inflect_parse(word, parse, grammemes.clone(), &mut inflect)?;
299 }
300 }
301 None => return Err(ParseErr::FutureRelease),
302 };
303
304 if inflect.0.is_empty() {
305 Ok(None)
306 } else {
307 Ok(Some(inflect))
308 }
309 }
310
311 pub(crate) fn inflect_parsed_words(
316 &self,
317 word: ParsedWord,
318 grammemes: Option<Vec<Grammem>>,
319 ) -> Result<Option<InflectWords>, ParseErr> {
320 let map = &self.fst;
321 let mut inflect = InflectWords::default();
322
323 match map.get(word.word()) {
324 Some(common_id) => {
325 let tag = self
326 .tags
327 .binary_search(&word.tag())
328 .map_err(|_| ParseErr::BinaryTag(word.tag()))?;
329 let parse = self
330 .get_parse(common_id)?
331 .iter()
332 .find(|parse| parse.tag == tag)
333 .ok_or_else(|| ParseErr::LostParse(word.tag()))?;
334
335 self.inflect_parse(&word.word(), parse, grammemes, &mut inflect)?;
336 }
337 None => return Err(ParseErr::FutureRelease),
338 }
339
340 if inflect.0.is_empty() {
341 Ok(None)
342 } else {
343 Ok(Some(inflect))
344 }
345 }
346
347 fn inflect_parse(
352 &self,
353 word: &str,
354 parse: &Parse,
355 grammemes: Option<Vec<Grammem>>,
356 inflect: &mut InflectWords,
357 ) -> Result<(), ParseErr> {
358 if grammemes.is_none() && (parse.form.is_inizio() || parse.form.is_normal()) {
360 inflect
361 .0
362 .push(self.try_into_inflect(word.to_string(), parse)?);
363 } else {
364 let ids = self.get_row_id(parse.lemma_row_id)?.to_owned();
366 let word_id = parse
368 .form
369 .id()
370 .ok_or_else(|| ParseErr::LostLemmaId(word.to_string()))?;
371
372 let mut hash_set: HashMap<(String, Option<String>), Vec<WordForm>> = HashMap::new();
373
374 let id_forms = self.id_forms(word, &ids, Some(word_id), &grammemes);
375 self.collect_stream_hashset(word, &grammemes, id_forms, &mut hash_set)?;
376 self.iter_fst(&mut hash_set, inflect)?;
377 }
378
379 Ok(())
380 }
381
382 pub(crate) fn declension_word(&self, word: &str) -> Result<Vec<InflectWords>, ParseErr> {
393 let map = &self.fst;
394 let mut inflects = Vec::new();
395
396 match map.get(word.as_bytes()) {
397 Some(common_id) => {
398 let set_ids = self
399 .get_parse(common_id)?
400 .iter()
401 .filter_map(|parse| self.lemmas_rows.get(parse.lemma_row_id))
402 .map(|v| v.to_owned())
403 .collect::<HashSet<Vec<OpCLid>>>();
404
405 for ids in set_ids {
407 let mut inflect = InflectWords::default();
408 self.declension_ids(word, &ids, &mut inflect)?;
409 if !inflect.0.is_empty() {
410 inflects.push(inflect);
411 }
412 }
413 }
414
415 None => return Err(ParseErr::FutureRelease),
416 }
417
418 Ok(inflects)
419 }
420
421 pub(crate) fn declension_parsed_word(
430 &self,
431 word: &ParsedWord,
432 ) -> Result<Option<InflectWords>, ParseErr> {
433 let map = &self.fst;
434 let mut inflect = InflectWords::default();
435
436 match map.get(word.word()) {
437 Some(common_id) => {
438 let tag = self
439 .tags
440 .binary_search(&word.tag())
441 .map_err(|_| ParseErr::BinaryTag(word.tag()))?;
442 let parse = self
443 .get_parse(common_id)?
444 .iter()
445 .find(|parse| parse.tag == tag)
446 .ok_or_else(|| ParseErr::LostParse(word.tag()))?;
447
448 let ids = self.get_row_id(parse.lemma_row_id)?;
450 self.declension_ids(&word.word(), ids, &mut inflect)?;
451 }
452 None => return Err(ParseErr::FutureRelease),
453 }
454
455 if inflect.0.is_empty() {
456 Ok(None)
457 } else {
458 Ok(Some(inflect))
459 }
460 }
461
462 fn declension_ids(
467 &self,
468 word: &str,
469 ids: &[u32],
470 inflect: &mut InflectWords,
471 ) -> Result<(), ParseErr> {
472 let mut hash_set: HashMap<(String, Option<String>), Vec<WordForm>> = HashMap::new();
473
474 let id_forms = self.id_forms(word, ids, None, &None);
475 self.collect_stream_hashset(word, &None, id_forms, &mut hash_set)?;
476 self.iter_fst(&mut hash_set, inflect)
477 }
478}
479
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        grams,
        morph::grammemes::{Case, Gender, ParteSpeech},
        Method,
    };

    /// Builds a postfix-guessed parse of `word` with the normal form "bebe".
    fn guessed(word: &str, tags: Vec<Grammem>) -> ParsedWord {
        ParsedWord {
            word: word.to_string(),
            tags: SmallVec::from(tags),
            normal_form: "bebe".to_string(),
            method: Method::Vangovanie(crate::Vangovanie::Postfix),
        }
    }

    /// `find` must return the first parse carrying all requested grammemes.
    #[test]
    fn test_find_parsed() {
        let feminine = guessed("bebeka", grams![ParteSpeech::Noun, Gender::Feminine]);
        let masculine = guessed("bebek", grams![ParteSpeech::Noun, Gender::Masculine]);
        let bare = guessed("bebeki", grams![ParteSpeech::Noun]);

        let words = ParsedWords(vec![feminine.clone(), masculine, bare]);
        let found = words
            .find(grams![ParteSpeech::Noun, Gender::Feminine])
            .unwrap();

        assert_eq!(feminine, found)
    }

    /// Inflecting with grammemes; needs the dictionary under `data/result/`.
    #[test]
    fn test_inflect_form() {
        let anal = MorphAnalyzer::open("data/result/").unwrap();

        let requested = grams![Gender::Feminine, Case::Nominativus];
        let femn_invest = anal
            .inflect_forms("инвестировавшие", requested)
            .unwrap()
            .unwrap();

        let first = femn_invest.0.first().unwrap().to_owned();
        assert_eq!("инвестировавшая", first.word().as_str());
    }

    /// Inflecting to the initial form; needs the dictionary under `data/result/`.
    #[test]
    fn test_inflect_form_full() {
        let anal = MorphAnalyzer::open("data/result/").unwrap();

        let femn_invest = anal.inflect_inizio("инвестировавшие").unwrap().unwrap();

        let first = femn_invest.0.first().unwrap().to_owned();
        assert_eq!("инвестировавший", first.word().as_str());
    }
}