use crate::buf_feature;
use crate::perceptron::definition::GenericItem;
use crate::perceptron::{Definition, Sample};
use anyhow::Result;
use itertools::Itertools;
#[cfg(feature = "parallel")]
use rayon::prelude::*;
#[cfg(feature = "serialization")]
use serde::{Deserialize, Serialize};
use std::io::{BufRead, BufReader, Read, Write};
#[cfg(feature = "char-type")]
/// Coarse script class of a single character, as assigned by
/// `CharacterType::get_type`.
///
/// The explicit discriminants (`1..=6`) are what the feature extractor in this
/// file writes into the character-type features (prefixes `b`, `c`, `d`) via
/// `as u8`.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
#[repr(u8)]
pub enum CharacterType {
/// ASCII digits `0-9` and full-width digits (U+FF10..=U+FF19).
Digit = 1,
/// ASCII letters and full-width Latin letters.
Roman = 2,
/// Code points U+3040..=U+3096.
Hiragana = 3,
/// Code points U+30A0..=U+30FA, U+30FC..=U+30FF and U+FF66..=U+FF9F.
Katakana = 4,
/// CJK ideograph ranges (see `get_type` for the exact list).
Kanji = 5,
/// Any character not covered by the classes above.
Other = 6,
}
#[cfg(feature = "char-type")]
impl CharacterType {
    /// Classifies `c` by the Unicode range its code point falls into.
    ///
    /// Characters outside every listed range are reported as
    /// [`CharacterType::Other`].
    pub fn get_type(c: char) -> Self {
        match c {
            // ASCII digits, then full-width digits.
            '0'..='9' | '\u{FF10}'..='\u{FF19}' => Self::Digit,
            // ASCII letters, then full-width upper- and lower-case letters.
            'A'..='Z' | 'a'..='z' | '\u{FF21}'..='\u{FF3A}' | '\u{FF41}'..='\u{FF5A}' => {
                Self::Roman
            }
            '\u{3040}'..='\u{3096}' => Self::Hiragana,
            // U+30FB sits in the gap between the first two ranges and therefore
            // falls through to `Other`; U+FF66..=U+FF9F are the half-width forms.
            '\u{30A0}'..='\u{30FA}' | '\u{30FC}'..='\u{30FF}' | '\u{FF66}'..='\u{FF9F}' => {
                Self::Katakana
            }
            '\u{3400}'..='\u{4DBF}'       // CJK Unified Ideographs Extension A
            | '\u{4E00}'..='\u{9FFF}'     // CJK Unified Ideographs
            | '\u{F900}'..='\u{FAFF}'     // CJK Compatibility Ideographs
            | '\u{20000}'..='\u{2A6DF}'   // Extension B
            | '\u{2A700}'..='\u{2B73F}'   // Extension C
            | '\u{2B740}'..='\u{2B81F}'   // Extension D
            | '\u{2B820}'..='\u{2CEAF}'   // Extension E
            | '\u{2F800}'..='\u{2FA1F}' => Self::Kanji, // Compatibility Ideographs Supplement
            _ => Self::Other,
        }
    }
}
/// Stateless task definition that tags every non-whitespace character of a
/// sentence with one of the four labels `S`, `B`, `M`, `E`.
///
/// As derived from the gold-data parser in this file: `S` marks a
/// single-character token, `B`/`M`/`E` mark the first, inner and last
/// character of a longer token.
#[cfg_attr(feature = "serialization", derive(Serialize, Deserialize))]
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct CWSDefinition {}
impl CWSDefinition {
pub fn new() -> Self {
CWSDefinition {}
}
/// Extracts window features for every non-whitespace character of `sentence`.
///
/// Feature text is emitted through `buf_feature!` (defined elsewhere in the
/// crate). Judging by how the sibling methods slice `buffer`, the macro
/// presumably appends the formatted text to `buffer` and pushes the resulting
/// end offset onto the per-character vector — confirm against the macro.
///
/// Feature prefixes, with `c0` the current character:
/// `0`=c-2, `1`=c-1, `2`=c0, `3`=c+1, `4`=c+2, `5`=c-2c-1, `6`=c-1c0,
/// `7`=c0c+1, `8`=c+1c+2; with the `cross-char` feature `9`=c-2c0 and
/// `a`=c0c+2; with the `char-type` feature `b`/`c`/`d` carry the character
/// type of c0/c-1/c+1. A bare `c` is emitted when c-2 equals c0.
///
/// # Returns
/// `(index, features)`: `index` holds the byte offset of every kept character
/// followed by `sentence.len()`; `features` holds one offset vector per kept
/// character.
///
/// # Errors
/// Propagates whatever error `buf_feature!` raises.
pub fn parse_char_features_with_buffer<'a>(
    &self,
    sentence: &str,
    buffer: &'a mut Vec<u8>,
) -> Result<(Vec<usize>, Vec<Vec<usize>>)> {
    // Sentinel for "no character seen at this position yet".
    // NOTE: a literal U+0000 in the input is indistinguishable from it.
    let absent = '\u{0000}';
    // The byte length is an upper bound on the number of characters.
    let byte_len = sentence.len();
    let mut offsets = Vec::with_capacity(byte_len);
    let mut per_char = Vec::with_capacity(byte_len);
    let mut prev = absent;
    let mut prev2 = absent;
    let mut stream = sentence
        .char_indices()
        .filter(|(_, ch)| !ch.is_whitespace())
        .multipeek();

    while let Some((offset, cur)) = stream.next() {
        let mut feature = Vec::with_capacity(13);

        // Current character.
        buf_feature!(buffer, feature, "2{}", cur);
        #[cfg(feature = "char-type")]
        buf_feature!(
            buffer,
            feature,
            "b{}",
            CharacterType::get_type(cur) as u8
        );

        // Left context.
        if prev != absent {
            buf_feature!(buffer, feature, "1{}", prev);
            buf_feature!(buffer, feature, "6{}{}", prev, cur);
            #[cfg(feature = "char-type")]
            buf_feature!(
                buffer,
                feature,
                "c{}",
                CharacterType::get_type(prev) as u8
            );
            if prev2 != absent {
                buf_feature!(buffer, feature, "0{}", prev2);
                buf_feature!(buffer, feature, "5{}{}", prev2, prev);
                #[cfg(feature = "cross-char")]
                buf_feature!(buffer, feature, "9{}{}", prev2, cur);
            }
            if prev2 == cur {
                buf_feature!(buffer, feature, "c");
            }
        }

        // Right context: the first peek looks one character ahead.
        let following = match stream.peek() {
            Some(&(_, following)) => {
                buf_feature!(buffer, feature, "3{}", following);
                buf_feature!(buffer, feature, "7{}{}", cur, following);
                #[cfg(feature = "char-type")]
                buf_feature!(
                    buffer,
                    feature,
                    "d{}",
                    CharacterType::get_type(following) as u8
                );
                following
            }
            // Placeholder only; it is never written because the second peek
            // below cannot succeed once the first one has failed.
            None => ' ',
        };
        // The second consecutive peek looks two characters ahead.
        if let Some(&(_, after_following)) = stream.peek() {
            buf_feature!(buffer, feature, "4{}", after_following);
            buf_feature!(buffer, feature, "8{}{}", following, after_following);
            #[cfg(feature = "cross-char")]
            buf_feature!(buffer, feature, "a{}{}", cur, after_following);
        }

        prev2 = prev;
        prev = cur;
        offsets.push(offset);
        per_char.push(feature);
    }

    // Closing offset so consecutive entries delimit each character's bytes.
    offsets.push(byte_len);
    Ok((offsets, per_char))
}
/// Extracts the per-character features of `sentence` as owned strings.
///
/// Runs `parse_char_features_with_buffer` on a private scratch buffer and
/// cuts that buffer at the returned end offsets, one `String` per feature.
///
/// # Returns
/// `(index, features)` where `index` is passed through unchanged and
/// `features[i]` lists the feature strings of the i-th kept character.
///
/// # Errors
/// Propagates any error from `parse_char_features_with_buffer`.
pub fn parse_char_features(&self, sentence: &str) -> Result<(Vec<usize>, Vec<Vec<String>>)> {
    let mut scratch = Vec::with_capacity(sentence.len() * 20);
    let (index, boundaries) = self.parse_char_features_with_buffer(sentence, &mut scratch)?;

    // Each end offset is also the start of the next feature, across characters.
    let mut cursor = 0usize;
    let owned = boundaries
        .into_iter()
        .map(|ends| {
            ends.into_iter()
                .map(|end| {
                    let text = String::from_utf8_lossy(&scratch[cursor..end]).into_owned();
                    cursor = end;
                    text
                })
                .collect::<Vec<String>>()
        })
        .collect::<Vec<_>>();

    Ok((index, owned))
}
/// Extracts the per-character features of `sentence` as string slices that
/// borrow from `buffer`, avoiding one allocation per feature.
///
/// Only the bytes appended by this call are sliced: decoding starts at the
/// length `buffer` had on entry, so a buffer that already holds data (for
/// example one reused without `clear()`) no longer leaks its old contents into
/// the first feature.
///
/// # Returns
/// `(index, features)` where `index` is passed through unchanged from
/// `parse_char_features_with_buffer` and `features[i]` lists the feature
/// strings of the i-th kept character.
///
/// # Errors
/// Propagates any error from `parse_char_features_with_buffer`.
pub fn parse_char_features_with_buffer_str<'a>(
    &self,
    sentence: &str,
    buffer: &'a mut Vec<u8>,
) -> Result<(Vec<usize>, Vec<Vec<&'a str>>)> {
    // First byte written by this call; everything before it is not ours.
    let mut start = buffer.len();
    let (index, features) = self.parse_char_features_with_buffer(sentence, buffer)?;
    let mut result = Vec::with_capacity(features.len());
    for feature_end in features {
        let mut feature = Vec::with_capacity(feature_end.len());
        for end in feature_end {
            // SAFETY: `start..end` covers exactly one feature appended by the
            // call above. This relies on `buf_feature!` appending formatted
            // text (hence valid UTF-8) and recording the buffer length after
            // each write — TODO confirm against the macro definition. Bytes
            // that were in `buffer` before the call are never included.
            feature.push(unsafe { std::str::from_utf8_unchecked(&buffer[start..end]) });
            start = end;
        }
        result.push(feature);
    }
    Ok((index, result))
}
}
impl Definition for CWSDefinition {
// Item shapes exposed through the `Definition` trait:
// a fragment is a `Vec<usize>`, a prediction is a `Vec<&str>` and a raw
// feature is a `&str`. The `dyn for<'any> GenericItem<...>` form lets the
// borrowed items carry an arbitrary lifetime.
type Fragment = dyn for<'any> GenericItem<'any, Item = Vec<usize>>;
type Prediction = dyn for<'any> GenericItem<'any, Item = Vec<&'any str>>;
type RawFeature = dyn for<'any> GenericItem<'any, Item = &'any str>;
/// Always returns `true`.
// NOTE(review): presumably tells the perceptron framework to decode with
// Viterbi for this task — confirm against the `Definition` trait docs.
fn use_viterbi(&self) -> bool {
true
}
/// Returns the label names in index order: `S`, `B`, `M`, `E`.
fn labels(&self) -> Vec<String> {
    ["S", "B", "M", "E"]
        .iter()
        .copied()
        .map(String::from)
        .collect()
}
/// Number of labels; matches the four entries returned by `labels`.
fn label_num(&self) -> usize {
4
}
/// Maps a label name to its index (`S`=0, `B`=1, `M`=2, `E`=3).
///
/// # Panics
/// Panics with `"unknown label"` for any other string.
fn label_to(&self, label: &str) -> usize {
    // A name's position in this table is its index.
    const ORDER: [&str; 4] = ["S", "B", "M", "E"];
    ORDER
        .iter()
        .position(|known| *known == label)
        .unwrap_or_else(|| panic!("unknown label"))
}
/// Maps a label index back to its name (0=`S`, 1=`B`, 2=`M`, 3=`E`).
///
/// # Panics
/// Panics with `"unknown label index"` when `index` is greater than 3.
fn to_label(&self, index: usize) -> &str {
    // The index is the position in this table.
    const ORDER: [&str; 4] = ["S", "B", "M", "E"];
    ORDER
        .get(index)
        .copied()
        .unwrap_or_else(|| panic!("unknown label index"))
}
/// Trait entry point for owned-string feature extraction; forwards to
/// `parse_char_features` and returns its result unchanged.
fn parse_features(&self, sentence: &&str) -> Result<(Vec<usize>, Vec<Vec<String>>)> {
    self.parse_char_features(sentence)
}
/// Trait entry point for borrowed-string feature extraction; forwards to
/// `parse_char_features_with_buffer_str`, so the returned slices borrow
/// from `buf`.
fn parse_features_with_buffer<'a>(
    &self,
    sentence: &&str,
    buf: &'a mut Vec<u8>,
) -> Result<(Vec<usize>, Vec<Vec<&'a str>>)> {
    self.parse_char_features_with_buffer_str(sentence, buf)
}
/// Reads whitespace-segmented gold sentences (one per line) and turns each
/// into a `(features, labels)` sample.
///
/// Each non-whitespace character gets one label, chosen from whether it is
/// preceded and/or followed by a token boundary (whitespace or a line edge):
/// both → `S`, only before → `B`, only after → `E`, neither → `M`.
///
/// Boundaries are detected with `char::is_whitespace`, the same predicate the
/// feature extractor uses to skip characters. Testing for `' '` alone would
/// produce a label for a tab or U+3000 that has no matching feature row,
/// leaving the two vectors with different lengths.
///
/// With the `parallel` feature the lines are processed with rayon; otherwise
/// sequentially. The per-line logic is shared.
///
/// # Errors
/// Returns the first error raised by `parse_char_features`.
fn parse_gold_features<R: Read>(&self, reader: R) -> Result<Vec<Sample>> {
    // NOTE: `flatten()` drops lines that fail to read or decode instead of
    // reporting them; kept as-is to preserve the existing lenient behaviour.
    let lines = BufReader::new(reader)
        .lines()
        .flatten()
        .filter(|line| !line.is_empty())
        .collect_vec();

    let to_sample = |sentence: &String| {
        self.parse_char_features(sentence).map(|(_, features)| {
            let mut labels = Vec::with_capacity(features.len());
            // A line start counts as a boundary.
            let mut after_boundary = true;
            let mut chars = sentence.chars().peekable();
            while let Some(cur_char) = chars.next() {
                if cur_char.is_whitespace() {
                    after_boundary = true;
                    continue;
                }
                // A line end counts as a boundary too.
                let before_boundary = chars.peek().map_or(true, |next| next.is_whitespace());
                let tag = match (after_boundary, before_boundary) {
                    (true, true) => "S",
                    (true, false) => "B",
                    (false, true) => "E",
                    (false, false) => "M",
                };
                labels.push(self.label_to(tag));
                after_boundary = false;
            }
            (features, labels)
        })
    };

    #[cfg(feature = "parallel")]
    let sentences = lines.par_iter();
    #[cfg(not(feature = "parallel"))]
    let sentences = lines.iter();

    sentences.map(to_sample).collect()
}
/// Converts predicted label indices into label names via `to_labels`.
///
/// The sentence and its fragment (the first two arguments) are not needed for
/// this task and are ignored.
fn predict(&self, _sentence: &&str, _fragment: &Vec<usize>, predicts: &[usize]) -> Vec<&str> {
    self.to_labels(predicts)
}
/// Scores `predicts` against the gold `labels` by delegating to
/// `evaluate_entities`, which is not defined in this file.
// NOTE(review): the meaning of the three returned counts is defined by
// `evaluate_entities` — confirm there before relying on their order.
fn evaluate(&self, predicts: &[usize], labels: &[usize]) -> (usize, usize, usize) {
self.evaluate_entities(predicts, labels)
}
}
#[cfg(test)]
mod tests {
    use super::CWSDefinition as Define;
    use anyhow::Result;
    use std::iter::zip;

    /// The owned-string and buffer-backed extractors must produce the same
    /// features, in the same order, for every character.
    #[test]
    fn test_vec_buffer() -> Result<()> {
        let mut buffer = Vec::new();
        let sentence = "桂林警备区从一九九○年以来,先后修建水电站十五座,整修水渠六千七百四十公里,兴修水利一千五百六十五处,修建机耕路一百二十六公里,修建人畜饮水工程二百六十五处,解决饮水人口六点五万人,使八万多壮、瑶、苗、侗、回等民族的群众脱了贫,占桂林地、市脱贫人口总数的百分之三十七点六。";
        let define = Define::default();
        let (_, no_buffer) = define.parse_char_features(sentence)?;
        let (_, with_buffer) = define.parse_char_features_with_buffer_str(sentence, &mut buffer)?;

        // `zip` stops at the shorter side, so check the lengths explicitly;
        // otherwise a missing row or feature would go unnoticed.
        assert_eq!(no_buffer.len(), with_buffer.len());
        for (owned, borrowed) in zip(no_buffer, with_buffer) {
            assert_eq!(owned.len(), borrowed.len());
            for (c, d) in zip(owned, borrowed) {
                assert_eq!(c, d);
            }
        }

        // Informational: sentence bytes / buffer bytes / expansion ratio.
        println!(
            "{}/{}/{}",
            sentence.len(),
            buffer.len(),
            buffer.len() / sentence.len()
        );
        Ok(())
    }
}