ltp 0.1.9

Language Technology Platform For Rust.
Documentation
use crate::perceptron::definition::GenericItem;
use crate::perceptron::{Definition, Sample};
use crate::buf_feature;
use anyhow::Result;
use itertools::Itertools;
#[cfg(feature = "parallel")]
use rayon::prelude::*;
#[cfg(feature = "serialization")]
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use std::collections::HashMap;
use std::io::{BufRead, BufReader, Read, Write};

#[cfg_attr(feature = "serialization", derive(Serialize, Deserialize))]
#[derive(Default, Debug, Clone, PartialEq, Eq)]
pub struct POSDefinition {
    to_labels: Vec<String>,
    labels_to: HashMap<String, usize>,
}

impl POSDefinition {
    pub fn new(to_labels: Vec<String>) -> Self {
        let labels_to = to_labels
            .iter()
            .enumerate()
            .map(|(i, label)| (label.clone(), i))
            .collect();
        POSDefinition {
            labels_to,
            to_labels,
        }
    }

    /// +----------------------+----------------------------------------------------------+
    // | 类别                 | 特征                                                       |
    // +======================+===========================================================+
    // | word-unigram         | w[-2],w[-1],w[0],w[1],w[2]                                |
    // +----------------------+-----------------------------------------------------------+
    // | word-bigram          | w[-2]w[-1],w[-1]w[0],w[0]w[1],w[1]w[2],w[-2]w[0],w[0]w[2] |
    // +----------------------+-----------------------------------------------------------+
    // | word-trigram         | w[-1]w[0]w[1]                                             |
    // +----------------------+-----------------------------------------------------------+
    // | last-first-character | ch[0,0]ch[0,n],ch[-1,n]ch[0,0],ch[0,-1]ch[1,0]            |
    // +----------------------+-----------------------------------------------------------+
    // | length               | length                                                    |
    // +----------------------+-----------------------------------------------------------+
    // | prefix               | ch[0,0],ch[0,0:1],ch[0,0:2]                               |
    // +----------------------+-----------------------------------------------------------+
    // | suffix               | ch[0,n-2:n],ch[0,n-1:n],ch[0,n]                           |
    // +----------------------+-----------------------------------------------------------+
    pub fn parse_words_features_with_buffer<'a>(&self, words: &[&str], buffer: &'a mut Vec<u8>) -> Result<Vec<Vec<usize>>> {
        let word_null = "";
        let words_len = words.len();
        let mut features = Vec::with_capacity(words_len);

        let chars = words
            .iter()
            .map(|w| SmallVec::<[char; 4]>::from_iter(w.chars()))
            .collect_vec();

        for (idx, &cur_word) in words.iter().enumerate() {
            // 剩余字符数
            let last = words_len - idx - 1;
            let pre2_word = if idx > 1 { words[idx - 2] } else { word_null };
            let pre_word = if idx > 0 { words[idx - 1] } else { word_null };
            let next_word = if last > 0 { words[idx + 1] } else { word_null };
            let next2_word = if last > 1 { words[idx + 2] } else { word_null };

            // todo: 优化容量设置
            let mut feature = Vec::with_capacity(22);

            // w[0]
            buf_feature!(buffer, feature, "2{}", words[idx]);
            // ch[0,0]ch[0,n]
            buf_feature!(buffer, feature, "c{}{}", chars[idx][0], chars[idx][chars[idx].len() - 1]);
            // length
            buf_feature!(buffer, feature, "f{}", chars[idx].len());
            // prefix => ch[0,0]ch[0,0:1]ch[0,0:2]

            let prefix_id = &['c', 'd', 'e'];
            for (bias, prefix) in chars[idx]
                .iter()
                .take(3)
                .enumerate()
            {
                buf_feature!(buffer, feature, "{}{}", prefix_id[bias], prefix);
            };
            // suffix => ch[0,n-2:n],ch[0,n-1:n],ch[0,n]
            let suffix_id = &['f', 'g', 'h'];
            for (bias, suffix) in chars[idx]
                .iter()
                .rev()
                .take(3)
                .enumerate()
            {
                buf_feature!(buffer, feature, "{}{}", suffix_id[bias], suffix);
            };

            if idx > 0 {
                // w[-1]
                buf_feature!(buffer, feature, "1{}", pre_word);
                // w[-1]w[0]
                buf_feature!(buffer, feature, "6{}{}", pre_word, cur_word);
                // ch[-1,n]ch[0,0]
                buf_feature!(buffer, feature, "d{}{}", chars[idx - 1][chars[idx - 1].len() - 1], chars[idx][0]);

                if idx > 1 {
                    // w[-2]
                    buf_feature!(buffer, feature, "0{}", pre2_word);
                    // w[-2]w[-1]
                    buf_feature!(buffer, feature, "5{}{}", pre2_word, pre_word);
                    // w[-2]w[0]
                    buf_feature!(buffer, feature, "9{}{}", pre2_word, cur_word);
                }
            }

            if last > 0 {
                // w[+1]
                buf_feature!(buffer, feature, "3{}", next_word);
                // w[0]w[+1]
                buf_feature!(buffer, feature, "7{}{}", cur_word, next_word);
                // ch[0,-1]ch[1,0]
                buf_feature!(buffer, feature, "e{}{}", chars[idx][chars[idx].len() - 1], chars[idx + 1][0]);

                if last > 1 {
                    // w[+2]
                    buf_feature!(buffer, feature, "4{}", next2_word);
                    // w[+1]w[+2]
                    buf_feature!(buffer, feature, "8{}{}", next_word, next2_word);
                    // w[0]w[+2]
                    buf_feature!(buffer, feature, "a{}{}", cur_word, next2_word);
                }
            }

            if idx > 0 && last > 0 {
                // w[-1]w[0]w[+1]
                buf_feature!(buffer, feature, "b{}{}{}", pre_word, cur_word, next_word);
            }
            features.push(feature);
        }
        Ok(features)
    }

    pub fn parse_words_features(&self, words: &[&str]) -> Result<Vec<Vec<String>>> {
        let mut buffer = Vec::with_capacity(words.len() * 180);
        let features = self.parse_words_features_with_buffer(words, &mut buffer)?;

        let mut start = 0usize;
        let mut result = Vec::with_capacity(features.len());
        for feature_end in features {
            let mut feature = Vec::with_capacity(feature_end.len());
            for end in feature_end {
                // Safety : all write are valid utf8
                feature.push(String::from_utf8_lossy(&buffer[start..end]).to_string());
                start = end;
            }
            result.push(feature);
        }
        Ok(result)
    }

    pub fn parse_words_features_with_buffer_str<'a>(&self, words: &[&str], buffer: &'a mut Vec<u8>) -> Result<Vec<Vec<&'a str>>> {
        let features = self.parse_words_features_with_buffer(words, buffer)?;

        let mut start = 0usize;
        let mut result = Vec::with_capacity(features.len());
        for feature_end in features {
            let mut feature = Vec::with_capacity(feature_end.len());
            for end in feature_end {
                // Safety : all write are valid utf8
                feature.push(unsafe { std::str::from_utf8_unchecked(&buffer[start..end]) });
                start = end;
            }
            result.push(feature);
        }
        Ok(result)
    }
}

impl Definition for POSDefinition {
    type Fragment = dyn for<'any> GenericItem<'any, Item=()>;
    type Prediction = dyn for<'any> GenericItem<'any, Item=Vec<&'any str>>;
    type RawFeature = dyn for<'any> GenericItem<'any, Item=&'any [&'any str]>;

    fn labels(&self) -> Vec<String> {
        self.to_labels.clone()
    }

    fn label_num(&self) -> usize {
        self.to_labels.len()
    }

    fn label_to(&self, label: &str) -> usize {
        self.labels_to[label]
    }

    fn to_label(&self, index: usize) -> &str {
        &self.to_labels[index]
    }

    fn parse_features(&self, words: &&[&str]) -> Result<((), Vec<Vec<String>>)> {
        let features = self.parse_words_features(words)?;
        Ok(((), features))
    }

    fn parse_features_with_buffer<'a>(
        &self,
        words: &&[&str],
        buf: &'a mut Vec<u8>,
    ) -> Result<((), Vec<Vec<&'a str>>)> {
        let features = self.parse_words_features_with_buffer_str(words, buf)?;
        Ok(((), features))
    }

    #[cfg(feature = "parallel")]
    fn parse_gold_features<R: Read>(&self, reader: R) -> Result<Vec<Sample>> {
        let lines = BufReader::new(reader).lines();
        let lines = lines.flatten().filter(|s| !s.is_empty()).collect_vec();

        lines
            .par_iter()
            .map(|sentence| {
                let words_tags = sentence.split_whitespace().collect_vec();

                let mut words = Vec::with_capacity(words_tags.len());
                let mut labels = Vec::with_capacity(words_tags.len());
                for word_tag in words_tags {
                    let result = word_tag.rsplitn(2, '/');
                    let (label, word) = result.collect_tuple().expect("tag not found");
                    words.push(word);
                    labels.push(self.label_to(label));
                }
                self.parse_words_features(&words).map(|features| (features, labels))
            })
            .collect()
    }

    #[cfg(not(feature = "parallel"))]
    fn parse_gold_features<R: Read>(&self, reader: R) -> Result<Vec<Sample>> {
        let lines = BufReader::new(reader).lines();
        let lines = lines.flatten().filter(|s| !s.is_empty()).collect_vec();

        lines
            .iter()
            .map(|sentence| {
                let words_tags = sentence.split_whitespace().collect_vec();

                let mut words = Vec::with_capacity(words_tags.len());
                let mut labels = Vec::with_capacity(words_tags.len());
                for word_tag in words_tags {
                    let result = word_tag.rsplitn(2, '/');
                    let (label, word) = result.collect_tuple().expect("tag not found");
                    words.push(word);
                    labels.push(self.label_to(label));
                }
                self.parse_words_features(&words).map(|features| (features, labels))
            })
            .collect()
    }

    fn predict(
        &self,
        _: &<Self::RawFeature as GenericItem>::Item,
        _: &<Self::Fragment as GenericItem>::Item,
        predicts: &[usize],
    ) -> Vec<&str> {
        self.to_labels(predicts)
    }

    fn evaluate(&self, predicts: &[usize], labels: &[usize]) -> (usize, usize, usize) {
        self.evaluate_tags(predicts, labels)
    }
}


#[cfg(test)]
mod tests {
    use std::iter::zip;
    use super::POSDefinition as Define;
    use anyhow::Result;

    #[test]
    fn test_vec_buffer() -> Result<()> {
        let mut buffer = Vec::new();

        let sentence = vec!["桂林", "警备区", "从", "一九九○年", "以来", ",", "先后", "修建", "水电站", "十五", "座", ",", "整修", "水渠", "六千七百四十", "公里", ",", "兴修", "水利", "一千五百六十五", "处", ",", "修建", "机耕路", "一百二十六", "公里", ",", "修建", "人", "畜", "饮水", "工程", "二百六十五", "处", ",", "解决", "饮水", "人口", "六点五万", "人", ",", "使", "八万", "多", "壮", "、", "瑶", "、", "苗", "、", "侗", "、", "回", "等", "民族", "的", "群众", "脱", "了", "贫", ",", "占", "桂林", "地", "、", "市", "脱贫", "人口", "总数", "的", "百分之三十七点六", "。"];
        let define = Define::default();
        let no_buffer = define.parse_words_features(&sentence)?;
        let with_buffer = define.parse_words_features_with_buffer_str(&sentence, &mut buffer)?;

        for (a, b) in zip(no_buffer, with_buffer) {
            for (c, d) in zip(a, b) {
                assert_eq!(c, d);
            }
        }

        println!("{}/{}/{}", sentence.len(), buffer.len(), buffer.len() / sentence.len());

        Ok(())
    }
}