1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
#[macro_use]
extern crate lazy_static;
#[macro_use]
extern crate maplit;

use std::char;
use std::collections::HashMap;
use std::hash::Hash;

include!("constants.rs");

/// Looks up the weight stored for feature `s` in the table `d`.
///
/// Returns `0` when the table has no entry for `s`, so features that are
/// absent from a table contribute nothing to a score.
fn get_score<T: Eq + Hash>(d: &HashMap<T, i32>, s: &T) -> i32 {
    d.get(s).copied().unwrap_or(0)
}

/// Maps a character to the one-letter class used as a scoring feature.
///
/// Classes, tested in this order (the first matching arm wins):
///
/// * `'M'` – the kanji numerals 一二三四五六七八九十百千万億兆
/// * `'H'` – U+4E00..=U+9FA0 plus 々 〆 ヵ ヶ
/// * `'I'` – hiragana U+3041..=U+3093
/// * `'K'` – katakana U+30A1..=U+30F4, ー, and the half-width forms
///   U+FF70..=U+FF9E
/// * `'A'` – ASCII and full-width Latin letters
/// * `'N'` – ASCII and full-width decimal digits
/// * `'O'` – everything else
fn get_ctype(c: char) -> char {
    match c as u32 {
        // Must precede the 'H' arm: these code points also lie inside the
        // 0x4E00..=0x9FA0 range.
        0x4E00 | 0x4E8C | 0x4E09 | 0x56DB | 0x4E94 | 0x516D | 0x4E03 | 0x516B | 0x4E5D
        | 0x5341 | 0x767E | 0x5343 | 0x4E07 | 0x5104 | 0x5146 => 'M',
        0x4E00..=0x9FA0 | 0x3005 | 0x3006 | 0x30F5 | 0x30F6 => 'H',
        0x3041..=0x3093 => 'I',
        0x30A1..=0x30F4 | 0x30FC | 0xFF71..=0xFF9D | 0xFF9E | 0xFF70 => 'K',
        0x61..=0x7A | 0x41..=0x5A | 0xFF41..=0xFF5A | 0xFF21..=0xFF3A => 'A',
        // ASCII digits end at 0x39; 0x3A is ':' and must fall through to 'O'.
        0x30..=0x39 | 0xFF10..=0xFF19 => 'N',
        _ => 'O',
    }
}

/// Splits `s` into tokens.
///
/// Returns an empty vector for an empty string. Otherwise every character of
/// `s` ends up in exactly one token, in order, so concatenating the returned
/// tokens reproduces `s`.
///
/// For each character after the first, a score is computed as `BIAS` plus the
/// weights (see `get_score`) of features drawn from a six-position window
/// around the candidate boundary: the characters themselves (`w`), their
/// classes from `get_ctype` (`c`), and the tags of the three preceding
/// decisions (`p`). A token boundary is placed before the character exactly
/// when the score is strictly positive.
// NOTE(review): the feature layout looks like a port of TinySegmenter; the
// weight tables come from `constants.rs`, which is not visible here.
pub fn tokenize(s: &str) -> Vec<String> {
    if s.is_empty() {
        return Vec::new();
    }

    // Pad both ends with three sentinel entries so the window
    // `index - 3 .. index + 3` is always in bounds.
    let mut segments: Vec<char> = vec![*B3, *B2, *B1];
    segments.extend(s.chars());
    segments.extend_from_slice(&[*E1, *E2, *E3]);

    let mut ctypes: Vec<char> = vec!['O'; 3];
    ctypes.extend(s.chars().map(get_ctype));
    ctypes.extend_from_slice(&['O'; 3]);

    // `segments` holds the input characters plus six sentinels.
    let mut result = Vec::with_capacity(segments.len() - 6);

    // The first real character always opens the first token.
    let mut word = segments[3].to_string();

    // Tags of the three most recent boundary decisions, oldest first:
    // 'U' = no decision yet, 'B' = boundary emitted, 'O' = no boundary.
    let mut p = ['U'; 3];

    for index in 4..segments.len() - 3 {
        // w[3] / c[3] describe the character at `index`; the boundary being
        // scored lies between positions 2 and 3 of the window.
        let w = &segments[index - 3..index + 3];
        let c = &ctypes[index - 3..index + 3];

        let mut score = BIAS;

        // Features over previous decisions.
        score += get_score(&*UP1, &p[0]);
        score += get_score(&*UP2, &p[1]);
        score += get_score(&*UP3, &p[2]);
        score += get_score(&*BP1, &(p[0], p[1]));
        score += get_score(&*BP2, &(p[1], p[2]));

        // Features over the characters themselves (single, pair, triple).
        score += get_score(&*UW1, &w[0]);
        score += get_score(&*UW2, &w[1]);
        score += get_score(&*UW3, &w[2]);
        score += get_score(&*UW4, &w[3]);
        score += get_score(&*UW5, &w[4]);
        score += get_score(&*UW6, &w[5]);
        score += get_score(&*BW1, &(w[1], w[2]));
        score += get_score(&*BW2, &(w[2], w[3]));
        score += get_score(&*BW3, &(w[3], w[4]));
        score += get_score(&*TW1, &(w[0], w[1], w[2]));
        score += get_score(&*TW2, &(w[1], w[2], w[3]));
        score += get_score(&*TW3, &(w[2], w[3], w[4]));
        score += get_score(&*TW4, &(w[3], w[4], w[5]));

        // Features over character classes (single, pair, triple).
        score += get_score(&*UC1, &c[0]);
        score += get_score(&*UC2, &c[1]);
        score += get_score(&*UC3, &c[2]);
        score += get_score(&*UC4, &c[3]);
        score += get_score(&*UC5, &c[4]);
        score += get_score(&*UC6, &c[5]);
        score += get_score(&*BC1, &(c[1], c[2]));
        score += get_score(&*BC2, &(c[2], c[3]));
        score += get_score(&*BC3, &(c[3], c[4]));
        score += get_score(&*TC1, &(c[0], c[1], c[2]));
        score += get_score(&*TC2, &(c[1], c[2], c[3]));
        score += get_score(&*TC3, &(c[2], c[3], c[4]));
        score += get_score(&*TC4, &(c[3], c[4], c[5]));

        // Features combining a previous decision with character classes.
        score += get_score(&*UQ1, &(p[0], c[0]));
        score += get_score(&*UQ2, &(p[1], c[1]));
        score += get_score(&*UQ3, &(p[2], c[2]));
        score += get_score(&*BQ1, &(p[1], c[1], c[2]));
        score += get_score(&*BQ2, &(p[1], c[2], c[3]));
        score += get_score(&*BQ3, &(p[2], c[1], c[2]));
        score += get_score(&*BQ4, &(p[2], c[2], c[3]));
        score += get_score(&*TQ1, &(p[1], c[0], c[1], c[2]));
        score += get_score(&*TQ2, &(p[1], c[1], c[2], c[3]));
        score += get_score(&*TQ3, &(p[2], c[0], c[1], c[2]));
        score += get_score(&*TQ4, &(p[2], c[1], c[2], c[3]));

        // The recorded tag must agree with the action taken: a score of
        // exactly 0 emits no boundary, so it is tagged 'O', not 'B'.
        let is_boundary = score > 0;
        p = [p[1], p[2], if is_boundary { 'B' } else { 'O' }];

        if is_boundary {
            // Move the finished token out and leave `word` empty.
            result.push(std::mem::take(&mut word));
        }
        word.push(segments[index]);
    }

    // Flush the trailing token; it always holds at least one character.
    result.push(word);
    result
}