1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
use std::borrow::Cow;
use std::collections::HashSet;
use std::ops::Range;

use super::dictionary_lib::category_type::CategoryType;

/// Input text bundled with the lookup tables used to query it.
///
/// Judging by the accessors in this file, `byte_indexes` and
/// `char_category_continuities` are indexed by byte position, while
/// `char_categories` and `can_bow_list` are indexed by the values stored in
/// `byte_indexes`. Nothing here validates that the tables agree with each
/// other; they are stored exactly as handed to `new`.
pub struct UTF8InputText {
  /// Text returned by `get_original_text`.
  /// NOTE(review): presumably the input before any rewriting — confirm with the code that builds this struct.
  original_text: String,
  /// Text returned by `get_text` and sliced by `get_substring`.
  modified_text: String,
  /// Byte buffer returned by `get_byte_text`; its length bounds the scans and
  /// its bytes are tested for the `10xxxxxx` continuation pattern.
  /// NOTE(review): presumably the UTF-8 bytes of `modified_text` — confirm.
  bytes: Vec<u8>,
  /// Table read by `get_original_index`.
  /// NOTE(review): presumably maps a byte position in `bytes` to a position in `original_text` — confirm.
  offsets: Vec<usize>,
  /// For a byte position, the index to use with `char_categories` and
  /// `can_bow_list`; differences between entries are reported by
  /// `code_point_count`.
  byte_indexes: Vec<usize>,
  /// Category set per entry, looked up through `byte_indexes`.
  char_categories: Vec<HashSet<CategoryType>>,
  /// Per byte position: a length that is added to that position and compared
  /// against an end byte position in `get_char_category_types`.
  char_category_continuities: Vec<usize>,
  /// Per entry (looked up through `byte_indexes`): flag consulted by `can_bow`.
  /// NOTE(review): "bow" presumably means "beginning of word" — confirm.
  can_bow_list: Vec<bool>,
}

/// Read-only queries over an input text.
///
/// The notes on each method describe what the `UTF8InputText` implementation
/// in this file does; the trait itself imposes nothing beyond the signatures.
pub trait InputText {
  /// Returns the continuous-category length recorded for `index`.
  /// In `UTF8InputText` this is a direct table lookup and panics when `index`
  /// is out of range.
  fn get_char_category_continuous_length(&self, index: usize) -> usize;
  /// Returns the category set at `start` when `end` is `None`, or a set
  /// describing the span from `start` up to `end` when it is `Some`.
  /// In `UTF8InputText` the span result is the intersection of the per-entry
  /// sets, or just `CategoryType::DEFAULT` when the span is longer than the
  /// continuous-category length recorded at `start`.
  fn get_char_category_types(&self, start: usize, end: Option<usize>) -> HashSet<CategoryType>;
  /// Returns the text between `start` and `end`, or `Err(())` when the range
  /// is rejected. In `UTF8InputText` the range is rejected when `end` exceeds
  /// the byte length or `start > end`.
  fn get_substring(&self, start: usize, end: usize) -> Result<Cow<str>, ()>;
  /// Returns a length measured from `index` that covers `code_point_offset`
  /// further code points.
  /// NOTE(review): in `UTF8InputText` both `index` and the result are byte
  /// counts — confirm other implementors use the same unit.
  fn get_code_points_offset_length(&self, index: usize, code_point_offset: usize) -> usize;
  /// Returns the distance from `index` to the next position at which a word
  /// may begin; in `UTF8InputText`, the distance to the end of the byte buffer
  /// when there is no such position.
  fn get_word_candidate_length(&self, index: usize) -> usize;
}

impl UTF8InputText {
  pub fn new(
    original_text: String,
    modified_text: String,
    bytes: Vec<u8>,
    offsets: Vec<usize>,
    byte_indexes: Vec<usize>,
    char_categories: Vec<HashSet<CategoryType>>,
    char_category_continuities: Vec<usize>,
    can_bow_list: Vec<bool>,
  ) -> UTF8InputText {
    UTF8InputText {
      original_text,
      modified_text,
      bytes,
      offsets,
      byte_indexes,
      char_categories,
      char_category_continuities,
      can_bow_list,
    }
  }
  pub fn get_original_text(&self) -> &String {
    &self.original_text
  }
  pub fn get_text(&self) -> &String {
    &self.modified_text
  }
  pub fn get_byte_text(&self) -> &Vec<u8> {
    &self.bytes
  }
  fn get_offset_text_length(&self, index: usize) -> usize {
    self.byte_indexes[index]
  }
  fn is_char_alignment(&self, index: usize) -> bool {
    (self.bytes[index] & 0xC0) != 0x80
  }
  pub fn get_original_index(&self, index: usize) -> usize {
    self.offsets[index]
  }
  pub fn can_bow(&self, idx: usize) -> bool {
    self.is_char_alignment(idx) && self.can_bow_list[self.get_offset_text_length(idx)]
  }
  pub fn code_point_count(&self, range: Range<usize>) -> usize {
    self.get_offset_text_length(range.end) - self.get_offset_text_length(range.start)
  }
}

impl InputText for UTF8InputText {
  /// Borrows the part of the modified text between byte offsets `start` and
  /// `end`.
  ///
  /// Returns `Err(())` when `end` exceeds the byte buffer's length, when
  /// `start > end`, or when the range cannot be sliced out of the text
  /// (outside the text, or not on UTF-8 character boundaries).
  fn get_substring(&self, start: usize, end: usize) -> Result<Cow<str>, ()> {
    if end > self.bytes.len() || start > end {
      return Err(());
    }
    // `str::get` answers `None` for a range it cannot slice; report that
    // through the `Result` like the other rejections instead of panicking.
    self.modified_text.get(start..end).map(Cow::Borrowed).ok_or(())
  }

  /// Returns the `char_category_continuities` entry for byte position `index`.
  /// Panics when `index` is out of range for that table.
  fn get_char_category_continuous_length(&self, index: usize) -> usize {
    self.char_category_continuities[index]
  }

  /// Counts the byte positions, starting at `index`, that come before the
  /// first position whose `byte_indexes` entry has advanced by at least
  /// `code_point_offset` from the entry at `index`.
  ///
  /// When no such position exists before the end of the byte buffer, the
  /// count of all positions from `index` to the end is returned.
  fn get_code_points_offset_length(&self, index: usize, code_point_offset: usize) -> usize {
    let target = self.get_offset_text_length(index) + code_point_offset;
    (index..self.bytes.len())
      .take_while(|&position| self.get_offset_text_length(position) < target)
      .count()
  }

  /// Returns the category set for `start`, or for the span `start..end` when
  /// `end` is given. Both arguments are byte positions.
  ///
  /// With an `end`: when the span reaches past the continuous-category length
  /// recorded at `start`, the result holds only `CategoryType::DEFAULT`;
  /// otherwise it is the intersection of the sets of every entry in the span.
  fn get_char_category_types(&self, start: usize, end: Option<usize>) -> HashSet<CategoryType> {
    let end = match end {
      Some(end) => end,
      None => return self.char_categories[self.get_offset_text_length(start)].clone(),
    };
    if start + self.get_char_category_continuous_length(start) < end {
      return std::iter::once(CategoryType::DEFAULT).collect();
    }
    let first = self.get_offset_text_length(start);
    let last = self.get_offset_text_length(end);
    let mut common = self.char_categories[first].clone();
    for entry in first + 1..last {
      // Look the entry up before narrowing so an out-of-range entry is still
      // reported even when `common` has already become empty.
      let other = &self.char_categories[entry];
      // Narrow in place rather than building a fresh set for every entry.
      common.retain(|category| other.contains(category));
    }
    common
  }

  /// Returns the distance from `index` to the next byte position after it for
  /// which `can_bow` holds, or to the end of the byte buffer when there is
  /// none.
  fn get_word_candidate_length(&self, index: usize) -> usize {
    let boundary = (index + 1..self.bytes.len())
      .find(|&position| self.can_bow(position))
      .unwrap_or(self.bytes.len());
    boundary - index
  }
}