1use std::io;
2
3use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
4use rkyv::{Archive, Deserialize as RkyvDeserialize, Serialize as RkyvSerialize};
5use serde::{Deserialize, Serialize};
6
7use crate::dictionary::character_definition::{CategoryId, CharacterDefinition};
8use crate::dictionary::connection_cost_matrix::ConnectionCostMatrix;
9use crate::dictionary::prefix_dictionary::PrefixDictionary;
10use crate::dictionary::unknown_dictionary::UnknownDictionary;
11use crate::mode::Mode;
12
13#[derive(
15 Clone,
16 Copy,
17 Debug,
18 Eq,
19 PartialEq,
20 Serialize,
21 Deserialize,
22 Default,
23 Archive,
24 RkyvSerialize,
25 RkyvDeserialize,
26)]
27
28pub enum LexType {
29 #[default]
31 System,
32 User,
34 Unknown,
36}
37
38#[derive(
39 Clone,
40 Copy,
41 Debug,
42 Eq,
43 PartialEq,
44 Serialize,
45 Deserialize,
46 Archive,
47 RkyvDeserialize,
48 RkyvSerialize,
49)]
50
51pub struct WordId {
52 pub id: u32,
53 pub is_system: bool,
54 pub lex_type: LexType,
55}
56
57impl WordId {
58 pub fn new(lex_type: LexType, id: u32) -> Self {
60 WordId {
61 id,
62 is_system: matches!(lex_type, LexType::System),
63 lex_type,
64 }
65 }
66
67 pub fn is_unknown(&self) -> bool {
68 matches!(self.lex_type, LexType::Unknown)
69 }
70
71 pub fn is_system(&self) -> bool {
72 self.is_system
73 }
74
75 pub fn lex_type(&self) -> LexType {
76 self.lex_type
77 }
78}
79
80impl Default for WordId {
81 fn default() -> Self {
82 WordId {
83 id: u32::MAX,
84 is_system: true,
85 lex_type: LexType::System,
86 }
87 }
88}
89
90#[derive(
91 Default,
92 Clone,
93 Copy,
94 Debug,
95 Eq,
96 PartialEq,
97 Serialize,
98 Deserialize,
99 Archive,
100 RkyvSerialize,
101 RkyvDeserialize,
102)]
103
104pub struct WordEntry {
105 pub word_id: WordId,
106 pub word_cost: i16,
107 pub left_id: u16,
108 pub right_id: u16,
109}
110
111impl WordEntry {
112 pub const SERIALIZED_LEN: usize = 10;
113
114 pub fn left_id(&self) -> u32 {
115 self.left_id as u32
116 }
117
118 pub fn right_id(&self) -> u32 {
119 self.right_id as u32
120 }
121
122 pub fn serialize<W: io::Write>(&self, wtr: &mut W) -> io::Result<()> {
123 wtr.write_u32::<LittleEndian>(self.word_id.id)?;
124 wtr.write_i16::<LittleEndian>(self.word_cost)?;
125 wtr.write_u16::<LittleEndian>(self.left_id)?;
126 wtr.write_u16::<LittleEndian>(self.right_id)?;
127 Ok(())
128 }
129
130 pub fn deserialize(data: &[u8], is_system_entry: bool) -> WordEntry {
131 let word_id = WordId::new(
132 if is_system_entry {
133 LexType::System
134 } else {
135 LexType::User
136 },
137 LittleEndian::read_u32(&data[0..4]),
138 );
139 let word_cost = LittleEndian::read_i16(&data[4..6]);
140 let left_id = LittleEndian::read_u16(&data[6..8]);
141 let right_id = LittleEndian::read_u16(&data[8..10]);
142 WordEntry {
143 word_id,
144 word_cost,
145 left_id,
146 right_id,
147 }
148 }
149}
150
151#[derive(Clone, Copy, Debug, Default)]
152pub enum EdgeType {
153 #[default]
154 KNOWN,
155 UNKNOWN,
156 USER,
157 INSERTED,
158}
159
160#[derive(Default, Clone, Debug)]
161pub struct Edge {
162 pub edge_type: EdgeType,
163 pub word_entry: WordEntry,
164
165 pub path_cost: i32,
166 pub left_index: u16, pub start_index: u32,
169 pub stop_index: u32,
170
171 pub kanji_only: bool,
172}
173
174impl Edge {
175 pub fn num_chars(&self) -> usize {
176 (self.stop_index - self.start_index) as usize / 3
177 }
178}
179
180#[derive(Clone, Debug)]
184pub struct PathEntry {
185 pub edge_index: u16,
187 pub left_pos: u32,
189 pub left_index: u16,
191 pub cost: i32,
193}
194
195#[derive(Clone, Default)]
196pub struct Lattice {
197 capacity: usize,
198 ends_at: Vec<Vec<Edge>>, char_info_buffer: Vec<CharData>,
200 categories_buffer: Vec<CategoryId>,
201 char_category_cache: Vec<Vec<CategoryId>>,
202
203 all_paths: Vec<Vec<PathEntry>>,
205 nbest_capacity: usize,
206 last_text_len: usize,
208}
209
210#[derive(Clone, Copy, Debug, Default)]
211struct CharData {
212 byte_offset: u32,
213 is_kanji: bool,
214 categories_start: u32,
215 categories_len: u16,
216 kanji_run_byte_len: u32,
217}
218
219#[inline]
220pub fn is_kanji(c: char) -> bool {
221 let c = c as u32;
222 (0x4E00..=0x9FAF).contains(&c) || (0x3400..=0x4DBF).contains(&c)
224}
225
226impl Lattice {
227 #[inline]
229 fn create_edge(
230 edge_type: EdgeType,
231 word_entry: WordEntry,
232 start: usize,
233 stop: usize,
234 kanji_only: bool,
235 ) -> Edge {
236 Edge {
237 edge_type,
238 word_entry,
239 left_index: u16::MAX,
240 start_index: start as u32,
241 stop_index: stop as u32,
242 path_cost: i32::MAX,
243 kanji_only,
244 }
245 }
246
247 pub fn clear(&mut self) {
248 for edge_vec in &mut self.ends_at {
249 edge_vec.clear();
250 }
251 for path_vec in &mut self.all_paths {
252 path_vec.clear();
253 }
254 self.char_info_buffer.clear();
255 self.categories_buffer.clear();
256 }
257
258 #[inline]
259 fn is_kanji_all(&self, char_idx: usize, byte_len: usize) -> bool {
260 self.char_info_buffer[char_idx].kanji_run_byte_len >= byte_len as u32
261 }
262
263 #[inline]
264 fn get_cached_category(&self, char_idx: usize, category_ord: usize) -> CategoryId {
265 let char_data = &self.char_info_buffer[char_idx];
266 self.categories_buffer[char_data.categories_start as usize + category_ord]
267 }
268
269 fn set_capacity(&mut self, text_len: usize) {
270 self.clear();
271 self.last_text_len = text_len;
272 if self.capacity <= text_len {
273 self.capacity = text_len;
274 self.ends_at.resize(text_len + 1, Vec::new());
275 }
276 for vec in &mut self.ends_at {
277 vec.clear();
278 }
279 }
280
281 fn set_capacity_nbest(&mut self, text_len: usize) {
282 self.set_capacity(text_len);
283 if self.nbest_capacity <= text_len {
284 self.nbest_capacity = text_len;
285 self.all_paths.resize(text_len + 1, Vec::new());
286 }
287 for vec in &mut self.all_paths {
288 vec.clear();
289 }
290 }
291
292 #[inline(never)]
293 #[allow(clippy::too_many_arguments)]
297 pub fn set_text(
298 &mut self,
299 dict: &PrefixDictionary,
300 user_dict: &Option<&PrefixDictionary>,
301 char_definitions: &CharacterDefinition,
302 unknown_dictionary: &UnknownDictionary,
303 cost_matrix: &ConnectionCostMatrix,
304 text: &str,
305 search_mode: &Mode,
306 ) {
307 let len = text.len();
308 self.set_capacity(len);
309
310 self.char_info_buffer.clear();
312 self.categories_buffer.clear();
313
314 if self.char_category_cache.is_empty() {
315 self.char_category_cache.resize(256, Vec::new());
316 }
317
318 for (byte_offset, c) in text.char_indices() {
319 let categories_start = self.categories_buffer.len() as u32;
320
321 if (c as u32) < 256 {
322 let cached = &mut self.char_category_cache[c as usize];
323 if cached.is_empty() {
324 let cats = char_definitions.lookup_categories(c);
325 for &category in cats {
326 cached.push(category);
327 }
328 }
329 for &category in cached.iter() {
330 self.categories_buffer.push(category);
331 }
332 } else {
333 let categories = char_definitions.lookup_categories(c);
334 for &category in categories {
335 self.categories_buffer.push(category);
336 }
337 }
338
339 let categories_len = (self.categories_buffer.len() as u32 - categories_start) as u16;
340
341 self.char_info_buffer.push(CharData {
342 byte_offset: byte_offset as u32,
343 is_kanji: is_kanji(c),
344 categories_start,
345 categories_len,
346 kanji_run_byte_len: 0,
347 });
348 }
349 self.char_info_buffer.push(CharData {
351 byte_offset: len as u32,
352 is_kanji: false,
353 categories_start: 0,
354 categories_len: 0,
355 kanji_run_byte_len: 0,
356 });
357
358 for i in (0..self.char_info_buffer.len() - 1).rev() {
360 if self.char_info_buffer[i].is_kanji {
361 let next_byte_offset = self.char_info_buffer[i + 1].byte_offset;
362 let char_byte_len = next_byte_offset - self.char_info_buffer[i].byte_offset;
363 self.char_info_buffer[i].kanji_run_byte_len =
364 char_byte_len + self.char_info_buffer[i + 1].kanji_run_byte_len;
365 } else {
366 self.char_info_buffer[i].kanji_run_byte_len = 0;
367 }
368 }
369
370 let start_edge = Edge {
371 path_cost: 0,
372 left_index: u16::MAX,
373 ..Default::default()
374 };
375 self.ends_at[0].push(start_edge);
376
377 let mut unknown_word_end: Option<usize> = None;
379
380 let mut matches_head = vec![usize::MAX; len + 1];
384 let mut matches_store: Vec<(usize, WordEntry, usize)> = Vec::with_capacity(len * 10);
385
386 for m in dict.da.find_overlapping_iter(text) {
388 let start = m.start();
389 let id = m.value();
390 let count = id & ((1u32 << 5) - 1u32);
391 let offset = id >> 5u32;
392 let offset_bytes = (offset as usize) * WordEntry::SERIALIZED_LEN;
393
394 if offset_bytes < dict.vals_data.len() {
396 let data_slice = &dict.vals_data[offset_bytes..];
397 for i in 0..count {
398 let entry_offset = WordEntry::SERIALIZED_LEN * (i as usize);
399 if entry_offset + WordEntry::SERIALIZED_LEN <= data_slice.len() {
400 let entry = WordEntry::deserialize(&data_slice[entry_offset..], true);
401 if start < matches_head.len() {
402 let next = matches_head[start];
403 matches_head[start] = matches_store.len();
404 matches_store.push((m.end(), entry, next));
405 }
406 }
407 }
408 }
409 }
410
411 if let Some(ud) = user_dict {
413 for m in ud.da.find_overlapping_iter(text) {
414 let start = m.start();
415 let id = m.value();
416 let count = id & ((1u32 << 5) - 1u32);
417 let offset = id >> 5u32;
418 let offset_bytes = (offset as usize) * WordEntry::SERIALIZED_LEN;
419
420 if offset_bytes < ud.vals_data.len() {
421 let data_slice = &ud.vals_data[offset_bytes..];
422 for i in 0..count {
423 let entry_offset = WordEntry::SERIALIZED_LEN * (i as usize);
424 if entry_offset + WordEntry::SERIALIZED_LEN <= data_slice.len() {
425 let entry = WordEntry::deserialize(&data_slice[entry_offset..], false);
426 if start < matches_head.len() {
427 let next = matches_head[start];
428 matches_head[start] = matches_store.len();
429 matches_store.push((m.end(), entry, next));
430 }
431 }
432 }
433 }
434 }
435 }
436
437 for char_idx in 0..self.char_info_buffer.len() - 1 {
438 let start = self.char_info_buffer[char_idx].byte_offset as usize;
439
440 if self.ends_at[start].is_empty() {
443 continue;
444 }
445
446 let mut found: bool = false;
447
448 if start < matches_head.len() {
450 let mut match_idx = matches_head[start];
451 while match_idx != usize::MAX {
452 let (end, word_entry, next) = matches_store[match_idx];
453
454 let prefix_len = end - start;
455 let kanji_only = self.is_kanji_all(char_idx, prefix_len);
456 let edge = Self::create_edge(
457 EdgeType::KNOWN,
458 word_entry, start,
460 end,
461 kanji_only,
462 );
463 self.add_edge_in_lattice(edge, cost_matrix, search_mode);
464 found = true;
465
466 match_idx = next;
467 }
468 }
469
470 if (search_mode.is_search()
472 || unknown_word_end.map(|index| index <= start).unwrap_or(true))
473 && char_idx < self.char_info_buffer.len() - 1
474 {
475 let num_categories = self.char_info_buffer[char_idx].categories_len as usize;
476 for category_ord in 0..num_categories {
477 let category = self.get_cached_category(char_idx, category_ord);
478 unknown_word_end = self.process_unknown_word(
479 char_definitions,
480 unknown_dictionary,
481 cost_matrix,
482 search_mode,
483 category,
484 category_ord,
485 unknown_word_end,
486 start,
487 char_idx,
488 found,
489 );
490 }
491 }
492 }
493
494 if !self.ends_at[len].is_empty() {
496 let mut eos_edge = Edge {
497 start_index: len as u32,
498 stop_index: len as u32,
499 ..Default::default()
500 };
501 let left_edges = &self.ends_at[len];
503 let mut best_cost = i32::MAX;
504 let mut best_left = None;
505 let right_left_id = 0; for (i, left_edge) in left_edges.iter().enumerate() {
508 let left_right_id = left_edge.word_entry.right_id();
509 let conn_cost = cost_matrix.cost(left_right_id, right_left_id);
510 let path_cost = left_edge.path_cost.saturating_add(conn_cost);
511 if path_cost < best_cost {
512 best_cost = path_cost;
513 best_left = Some(i as u16);
514 }
515 }
516 if let Some(left_idx) = best_left {
517 eos_edge.left_index = left_idx;
518 eos_edge.path_cost = best_cost;
519 self.ends_at[len].push(eos_edge);
520 }
521 }
522 }
523
524 #[allow(clippy::too_many_arguments)]
525 fn process_unknown_word(
526 &mut self,
527 char_definitions: &CharacterDefinition,
528 unknown_dictionary: &UnknownDictionary,
529 cost_matrix: &ConnectionCostMatrix,
530 search_mode: &Mode,
531 category: CategoryId,
532 category_ord: usize,
533 unknown_word_index: Option<usize>,
534 start: usize,
535 char_idx: usize,
536 found: bool,
537 ) -> Option<usize> {
538 let mut unknown_word_num_chars: usize = 0;
539 let category_data = char_definitions.lookup_definition(category);
540 if category_data.invoke || !found {
541 unknown_word_num_chars = 1;
542 if category_data.group {
543 for i in 1.. {
544 let next_idx = char_idx + i;
545 if next_idx >= self.char_info_buffer.len() - 1 {
546 break;
547 }
548 let num_categories = self.char_info_buffer[next_idx].categories_len as usize;
549 let mut found_cat = false;
550 if category_ord < num_categories {
551 let cat = self.get_cached_category(next_idx, category_ord);
552 if cat == category {
553 unknown_word_num_chars += 1;
554 found_cat = true;
555 }
556 }
557 if !found_cat {
558 break;
559 }
560 }
561 }
562 }
563 if unknown_word_num_chars > 0 {
564 let byte_end_offset =
565 self.char_info_buffer[char_idx + unknown_word_num_chars].byte_offset;
566 let byte_len = byte_end_offset as usize - start;
567
568 let kanji_only = self.is_kanji_all(char_idx, byte_len);
570
571 for &word_id in unknown_dictionary.lookup_word_ids(category) {
572 let word_entry = unknown_dictionary.word_entry(word_id);
573 let edge = Self::create_edge(
574 EdgeType::UNKNOWN,
575 word_entry,
576 start,
577 start + byte_len,
578 kanji_only,
579 );
580 self.add_edge_in_lattice(edge, cost_matrix, search_mode);
581 }
582 return Some(start + byte_len);
583 }
584 unknown_word_index
585 }
586
587 fn add_edge_in_lattice(
589 &mut self,
590 mut edge: Edge,
591 cost_matrix: &ConnectionCostMatrix,
592 mode: &Mode,
593 ) {
594 let start_index = edge.start_index as usize;
595 let stop_index = edge.stop_index as usize;
596 let right_left_id = edge.word_entry.left_id();
597
598 let left_edges = &self.ends_at[start_index];
599 if left_edges.is_empty() {
600 return;
601 }
602
603 let mut best_cost = i32::MAX;
604 let mut best_left = None;
605
606 match mode {
607 Mode::Normal => {
608 for (i, left_edge) in left_edges.iter().enumerate() {
609 let left_right_id = left_edge.word_entry.right_id();
610 let conn_cost = cost_matrix.cost(left_right_id, right_left_id);
611 let total_cost = left_edge.path_cost.saturating_add(conn_cost);
612
613 if total_cost < best_cost {
614 best_cost = total_cost;
615 best_left = Some(i as u16);
616 }
617 }
618 }
619 Mode::Decompose(penalty) => {
620 for (i, left_edge) in left_edges.iter().enumerate() {
621 let left_right_id = left_edge.word_entry.right_id();
622 let conn_cost = cost_matrix.cost(left_right_id, right_left_id);
623 let penalty_cost = penalty.penalty(left_edge);
624 let total_cost = left_edge
625 .path_cost
626 .saturating_add(conn_cost)
627 .saturating_add(penalty_cost);
628
629 if total_cost < best_cost {
630 best_cost = total_cost;
631 best_left = Some(i as u16);
632 }
633 }
634 }
635 }
636
637 if let Some(best_left_idx) = best_left {
638 edge.path_cost = best_cost.saturating_add(edge.word_entry.word_cost as i32);
639 edge.left_index = best_left_idx;
640 self.ends_at[stop_index].push(edge);
641 }
642 }
643
644 pub fn tokens_offset(&self) -> Vec<(usize, WordId)> {
645 let mut offsets = Vec::new();
646
647 if self.ends_at.is_empty() {
648 return offsets;
649 }
650
651 let mut last_idx = self.ends_at.len() - 1;
652 while last_idx > 0 && self.ends_at[last_idx].is_empty() {
653 last_idx -= 1;
654 }
655
656 if self.ends_at[last_idx].is_empty() {
657 return offsets;
658 }
659
660 let idx = self.ends_at[last_idx].len() - 1;
661 let mut edge = &self.ends_at[last_idx][idx];
662
663 if edge.left_index == u16::MAX {
664 return offsets;
665 }
666
667 loop {
668 if edge.left_index == u16::MAX {
669 break;
670 }
671
672 offsets.push((edge.start_index as usize, edge.word_entry.word_id));
673
674 let left_idx = edge.left_index as usize;
675 let start_idx = edge.start_index as usize;
676
677 edge = &self.ends_at[start_idx][left_idx];
678 }
679
680 offsets.reverse();
681 offsets.pop(); offsets
684 }
685
686 pub fn text_len(&self) -> usize {
690 self.last_text_len
691 }
692
693 pub fn edges_at(&self, byte_pos: usize) -> &[Edge] {
695 &self.ends_at[byte_pos]
696 }
697
698 pub fn paths_at(&self, byte_pos: usize) -> &[PathEntry] {
700 if byte_pos < self.all_paths.len() {
701 &self.all_paths[byte_pos]
702 } else {
703 &[]
704 }
705 }
706
707 fn add_edge_in_lattice_nbest(
709 &mut self,
710 mut edge: Edge,
711 cost_matrix: &ConnectionCostMatrix,
712 mode: &Mode,
713 ) {
714 let start_index = edge.start_index as usize;
715 let stop_index = edge.stop_index as usize;
716 let right_left_id = edge.word_entry.left_id();
717
718 let left_edges = &self.ends_at[start_index];
719 if left_edges.is_empty() {
720 return;
721 }
722
723 let mut best_cost = i32::MAX;
724 let mut best_left = None;
725
726 let new_edge_index = self.ends_at[stop_index].len() as u16;
728
729 match mode {
730 Mode::Normal => {
731 for (i, left_edge) in left_edges.iter().enumerate() {
732 let left_right_id = left_edge.word_entry.right_id();
733 let conn_cost = cost_matrix.cost(left_right_id, right_left_id);
734 let total_cost = left_edge.path_cost.saturating_add(conn_cost);
735
736 self.all_paths[stop_index].push(PathEntry {
738 edge_index: new_edge_index,
739 left_pos: start_index as u32,
740 left_index: i as u16,
741 cost: total_cost,
742 });
743
744 if total_cost < best_cost {
745 best_cost = total_cost;
746 best_left = Some(i as u16);
747 }
748 }
749 }
750 Mode::Decompose(penalty) => {
751 for (i, left_edge) in left_edges.iter().enumerate() {
752 let left_right_id = left_edge.word_entry.right_id();
753 let conn_cost = cost_matrix.cost(left_right_id, right_left_id);
754 let penalty_cost = penalty.penalty(left_edge);
755 let total_cost = left_edge
756 .path_cost
757 .saturating_add(conn_cost)
758 .saturating_add(penalty_cost);
759
760 self.all_paths[stop_index].push(PathEntry {
762 edge_index: new_edge_index,
763 left_pos: start_index as u32,
764 left_index: i as u16,
765 cost: total_cost,
766 });
767
768 if total_cost < best_cost {
769 best_cost = total_cost;
770 best_left = Some(i as u16);
771 }
772 }
773 }
774 }
775
776 if let Some(best_left_idx) = best_left {
777 edge.path_cost = best_cost.saturating_add(edge.word_entry.word_cost as i32);
778 edge.left_index = best_left_idx;
779 self.ends_at[stop_index].push(edge);
780 }
781 }
782
783 #[allow(clippy::too_many_arguments)]
784 fn process_unknown_word_nbest(
785 &mut self,
786 char_definitions: &CharacterDefinition,
787 unknown_dictionary: &UnknownDictionary,
788 cost_matrix: &ConnectionCostMatrix,
789 search_mode: &Mode,
790 category: CategoryId,
791 category_ord: usize,
792 unknown_word_index: Option<usize>,
793 start: usize,
794 char_idx: usize,
795 found: bool,
796 ) -> Option<usize> {
797 let mut unknown_word_num_chars: usize = 0;
798 let category_data = char_definitions.lookup_definition(category);
799 if category_data.invoke || !found {
800 unknown_word_num_chars = 1;
801 if category_data.group {
802 for i in 1.. {
803 let next_idx = char_idx + i;
804 if next_idx >= self.char_info_buffer.len() - 1 {
805 break;
806 }
807 let num_categories = self.char_info_buffer[next_idx].categories_len as usize;
808 let mut found_cat = false;
809 if category_ord < num_categories {
810 let cat = self.get_cached_category(next_idx, category_ord);
811 if cat == category {
812 unknown_word_num_chars += 1;
813 found_cat = true;
814 }
815 }
816 if !found_cat {
817 break;
818 }
819 }
820 }
821 }
822 if unknown_word_num_chars > 0 {
823 let byte_end_offset =
824 self.char_info_buffer[char_idx + unknown_word_num_chars].byte_offset;
825 let byte_len = byte_end_offset as usize - start;
826
827 let kanji_only = self.is_kanji_all(char_idx, byte_len);
828
829 for &word_id in unknown_dictionary.lookup_word_ids(category) {
830 let word_entry = unknown_dictionary.word_entry(word_id);
831 let edge = Self::create_edge(
832 EdgeType::UNKNOWN,
833 word_entry,
834 start,
835 start + byte_len,
836 kanji_only,
837 );
838 self.add_edge_in_lattice_nbest(edge, cost_matrix, search_mode);
839 }
840 return Some(start + byte_len);
841 }
842 unknown_word_index
843 }
844
845 #[inline(never)]
848 #[allow(clippy::too_many_arguments)]
849 pub fn set_text_nbest(
850 &mut self,
851 dict: &PrefixDictionary,
852 user_dict: &Option<&PrefixDictionary>,
853 char_definitions: &CharacterDefinition,
854 unknown_dictionary: &UnknownDictionary,
855 cost_matrix: &ConnectionCostMatrix,
856 text: &str,
857 search_mode: &Mode,
858 ) {
859 let len = text.len();
860 self.set_capacity_nbest(len);
861
862 self.char_info_buffer.clear();
864 self.categories_buffer.clear();
865
866 if self.char_category_cache.is_empty() {
867 self.char_category_cache.resize(256, Vec::new());
868 }
869
870 for (byte_offset, c) in text.char_indices() {
871 let categories_start = self.categories_buffer.len() as u32;
872
873 if (c as u32) < 256 {
874 let cached = &mut self.char_category_cache[c as usize];
875 if cached.is_empty() {
876 let cats = char_definitions.lookup_categories(c);
877 for &category in cats {
878 cached.push(category);
879 }
880 }
881 for &category in cached.iter() {
882 self.categories_buffer.push(category);
883 }
884 } else {
885 let categories = char_definitions.lookup_categories(c);
886 for &category in categories {
887 self.categories_buffer.push(category);
888 }
889 }
890
891 let categories_len = (self.categories_buffer.len() as u32 - categories_start) as u16;
892
893 self.char_info_buffer.push(CharData {
894 byte_offset: byte_offset as u32,
895 is_kanji: is_kanji(c),
896 categories_start,
897 categories_len,
898 kanji_run_byte_len: 0,
899 });
900 }
901 self.char_info_buffer.push(CharData {
903 byte_offset: len as u32,
904 is_kanji: false,
905 categories_start: 0,
906 categories_len: 0,
907 kanji_run_byte_len: 0,
908 });
909
910 for i in (0..self.char_info_buffer.len() - 1).rev() {
912 if self.char_info_buffer[i].is_kanji {
913 let next_byte_offset = self.char_info_buffer[i + 1].byte_offset;
914 let char_byte_len = next_byte_offset - self.char_info_buffer[i].byte_offset;
915 self.char_info_buffer[i].kanji_run_byte_len =
916 char_byte_len + self.char_info_buffer[i + 1].kanji_run_byte_len;
917 } else {
918 self.char_info_buffer[i].kanji_run_byte_len = 0;
919 }
920 }
921
922 let start_edge = Edge {
923 path_cost: 0,
924 left_index: u16::MAX,
925 ..Default::default()
926 };
927 self.ends_at[0].push(start_edge);
928
929 let mut unknown_word_end: Option<usize> = None;
930
931 let mut matches_head = vec![usize::MAX; len + 1];
933 let mut matches_store: Vec<(usize, WordEntry, usize)> = Vec::with_capacity(len * 10);
934
935 for m in dict.da.find_overlapping_iter(text) {
937 let start = m.start();
938 let id = m.value();
939 let count = id & ((1u32 << 5) - 1u32);
940 let offset = id >> 5u32;
941 let offset_bytes = (offset as usize) * WordEntry::SERIALIZED_LEN;
942
943 if offset_bytes < dict.vals_data.len() {
944 let data_slice = &dict.vals_data[offset_bytes..];
945 for i in 0..count {
946 let entry_offset = WordEntry::SERIALIZED_LEN * (i as usize);
947 if entry_offset + WordEntry::SERIALIZED_LEN <= data_slice.len() {
948 let entry = WordEntry::deserialize(&data_slice[entry_offset..], true);
949 if start < matches_head.len() {
950 let next = matches_head[start];
951 matches_head[start] = matches_store.len();
952 matches_store.push((m.end(), entry, next));
953 }
954 }
955 }
956 }
957 }
958
959 if let Some(ud) = user_dict {
961 for m in ud.da.find_overlapping_iter(text) {
962 let start = m.start();
963 let id = m.value();
964 let count = id & ((1u32 << 5) - 1u32);
965 let offset = id >> 5u32;
966 let offset_bytes = (offset as usize) * WordEntry::SERIALIZED_LEN;
967
968 if offset_bytes < ud.vals_data.len() {
969 let data_slice = &ud.vals_data[offset_bytes..];
970 for i in 0..count {
971 let entry_offset = WordEntry::SERIALIZED_LEN * (i as usize);
972 if entry_offset + WordEntry::SERIALIZED_LEN <= data_slice.len() {
973 let entry = WordEntry::deserialize(&data_slice[entry_offset..], false);
974 if start < matches_head.len() {
975 let next = matches_head[start];
976 matches_head[start] = matches_store.len();
977 matches_store.push((m.end(), entry, next));
978 }
979 }
980 }
981 }
982 }
983 }
984
985 for char_idx in 0..self.char_info_buffer.len() - 1 {
986 let start = self.char_info_buffer[char_idx].byte_offset as usize;
987
988 if self.ends_at[start].is_empty() {
989 continue;
990 }
991
992 let mut found: bool = false;
993
994 if start < matches_head.len() {
995 let mut match_idx = matches_head[start];
996 while match_idx != usize::MAX {
997 let (end, word_entry, next) = matches_store[match_idx];
998
999 let prefix_len = end - start;
1000 let kanji_only = self.is_kanji_all(char_idx, prefix_len);
1001 let edge =
1002 Self::create_edge(EdgeType::KNOWN, word_entry, start, end, kanji_only);
1003 self.add_edge_in_lattice_nbest(edge, cost_matrix, search_mode);
1004 found = true;
1005
1006 match_idx = next;
1007 }
1008 }
1009
1010 if (search_mode.is_search()
1011 || unknown_word_end.map(|index| index <= start).unwrap_or(true))
1012 && char_idx < self.char_info_buffer.len() - 1
1013 {
1014 let num_categories = self.char_info_buffer[char_idx].categories_len as usize;
1015 for category_ord in 0..num_categories {
1016 let category = self.get_cached_category(char_idx, category_ord);
1017 unknown_word_end = self.process_unknown_word_nbest(
1018 char_definitions,
1019 unknown_dictionary,
1020 cost_matrix,
1021 search_mode,
1022 category,
1023 category_ord,
1024 unknown_word_end,
1025 start,
1026 char_idx,
1027 found,
1028 );
1029 }
1030 }
1031 }
1032
1033 if !self.ends_at[len].is_empty() {
1035 let eos_edge_index = self.ends_at[len].len() as u16;
1036 let mut eos_edge = Edge {
1037 start_index: len as u32,
1038 stop_index: len as u32,
1039 ..Default::default()
1040 };
1041 let left_edges = &self.ends_at[len];
1042 let mut best_cost = i32::MAX;
1043 let mut best_left = None;
1044 let right_left_id = 0; for (i, left_edge) in left_edges.iter().enumerate() {
1047 let left_right_id = left_edge.word_entry.right_id();
1048 let conn_cost = cost_matrix.cost(left_right_id, right_left_id);
1049 let path_cost = left_edge.path_cost.saturating_add(conn_cost);
1050
1051 self.all_paths[len].push(PathEntry {
1053 edge_index: eos_edge_index,
1054 left_pos: len as u32,
1055 left_index: i as u16,
1056 cost: path_cost,
1057 });
1058
1059 if path_cost < best_cost {
1060 best_cost = path_cost;
1061 best_left = Some(i as u16);
1062 }
1063 }
1064 if let Some(left_idx) = best_left {
1065 eos_edge.left_index = left_idx;
1066 eos_edge.path_cost = best_cost;
1067 self.ends_at[len].push(eos_edge);
1068 }
1069 }
1070 }
1071
1072 pub fn nbest_tokens_offset(
1080 &self,
1081 n: usize,
1082 unique: bool,
1083 cost_threshold: Option<i64>,
1084 ) -> Vec<(Vec<(usize, WordId)>, i64)> {
1085 use std::collections::HashSet;
1086
1087 use crate::nbest::NBestGenerator;
1088 let mut generator = NBestGenerator::new(self);
1089 let mut results = Vec::with_capacity(n);
1090 let mut best_cost: Option<i64> = None;
1091
1092 if unique {
1093 let mut seen: HashSet<Vec<usize>> = HashSet::new();
1094 while results.len() < n {
1095 match generator.next() {
1096 Some((path, cost)) => {
1097 let bc = *best_cost.get_or_insert(cost);
1099 if let Some(threshold) = cost_threshold {
1101 if cost > bc + threshold {
1102 break;
1103 }
1104 }
1105 let key: Vec<usize> = path.iter().map(|(start, _)| *start).collect();
1106 if seen.insert(key) {
1107 results.push((path, cost));
1108 }
1109 }
1110 None => break,
1111 }
1112 }
1113 } else {
1114 while results.len() < n {
1115 match generator.next() {
1116 Some((path, cost)) => {
1117 let bc = *best_cost.get_or_insert(cost);
1118 if let Some(threshold) = cost_threshold {
1119 if cost > bc + threshold {
1120 break;
1121 }
1122 }
1123 results.push((path, cost));
1124 }
1125 None => break,
1126 }
1127 }
1128 }
1129 results
1130 }
1131}
1132
1133#[cfg(test)]
1134mod tests {
1135 use crate::viterbi::{LexType, WordEntry, WordId};
1136
1137 #[test]
1138 fn test_word_entry() {
1139 let mut buffer = Vec::new();
1140 let word_entry = WordEntry {
1141 word_id: WordId {
1142 id: 1u32,
1143 is_system: true,
1144 lex_type: LexType::System,
1145 },
1146 word_cost: -17i16,
1147 left_id: 1411u16,
1148 right_id: 1412u16,
1149 };
1150 word_entry.serialize(&mut buffer).unwrap();
1151 assert_eq!(WordEntry::SERIALIZED_LEN, buffer.len());
1152 let word_entry2 = WordEntry::deserialize(&buffer[..], true);
1153 assert_eq!(word_entry, word_entry2);
1154 }
1155}