#![feature(hashmap_hasher)]
extern crate farmhash;
use std::collections::{HashMap, HashSet, hash_map};
use std::collections::hash_state::DefaultState;
use std::fs::File;
use std::io::{BufRead, BufReader, Result};
use std::vec::Vec;
use farmhash::FarmHasher;
/// Part of speech of a dictionary entry.
///
/// The explicit discriminants are the numeric part-of-speech indices handed
/// out by `as_usize`; they are used as keys of the per-lemma offset map.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Part {
    Noun = 0,
    Verb = 1,
    Adj = 2,
    Adv = 3,
}

impl Part {
    /// Returns the numeric index of this part of speech.
    ///
    /// The value is the variant's declared discriminant, so the enum
    /// definition is the single source of truth for the numbering.
    fn as_usize(&self) -> usize {
        *self as usize
    }

    /// Returns the lowercase name of this part of speech
    /// (`"noun"`, `"verb"`, `"adj"` or `"adv"`).
    ///
    /// The returned string is a literal, so it is `'static` and does not
    /// borrow from `self`.
    fn as_str(&self) -> &'static str {
        match *self {
            Part::Noun => "noun",
            Part::Verb => "verb",
            Part::Adj => "adj",
            Part::Adv => "adv",
        }
    }
}
// Numeric part-of-speech indices. They carry the same values as
// `Part::as_usize` and are used to index the per-part tables of
// `WordnetStemmer` (`wordlist`, `exceptions`, `substitutions`) and `WN_FILES`.
const NOUN:usize = 0;
const VERB:usize = 1;
const ADJ:usize = 2;
const ADV:usize = 3;
// All part-of-speech indices, in ascending order; `WordnetStemmer::new`
// iterates over this to set up and load every part.
const PARTS:[usize; 4] = [NOUN, VERB, ADJ, ADV];
// Dictionary file names per part of speech, indexed by the constants above.
// Element 0 of each pair is the index file read into the word list, element 1
// is the exception file read into the exception table (see `load`).
const WN_FILES:[[&'static str; 2]; 4] = [
["index.noun", "noun.exc"],
["index.verb", "verb.exc"],
["index.adj", "adj.exc"],
["index.adv", "adv.exc"]
];
// String-to-string map hashed with `FarmHasher` instead of the default hasher.
type FastHashMap = HashMap<String, String, DefaultState<FarmHasher>>;
// One word map per part of speech, indexed by the part's numeric index.
type Wordlist = Vec<FastHashMap>;
// One exception table per part of speech: first field of an exception-file
// line -> list of the second fields seen for it (see `load`).
type Exceptions = Vec<HashMap<String, Vec<String>, DefaultState<FarmHasher>>>;
// Per part of speech, a list of two-element rules `[suffix, replacement]`
// (see `apply_rules`).
type Substitutions = Vec<Vec<Vec<&'static str>>> ;
// lemma -> (numeric part-of-speech index -> offsets parsed from the index file).
type LemmaPosOffsetMap = HashMap<String, HashMap<usize, Vec<i32>>>;
// NOTE(review): not referenced anywhere in the visible part of this file.
type FileMap = HashMap<char, String>;
// Lemmatizer state built from the dictionary files found under `basedir`.
struct WordnetStemmer {
// Per part of speech: every first field of the index file, mapped to itself.
wordlist: Wordlist,
// Per part of speech: exception-file entries, keyed by the first field of
// each line, collecting the second field of each line.
exceptions: Exceptions,
// Per part of speech: `[suffix, replacement]` rewrite rules tried by `apply_rules`.
substitutions: Substitutions,
// lemma -> numeric part-of-speech index -> offsets read from `index.*`;
// `filter_forms` uses it to decide whether a candidate is a known lemma.
lemma_pos_offset_map: LemmaPosOffsetMap,
// Directory that the dictionary file names are resolved against.
basedir: String,
}
impl WordnetStemmer {
/// Builds a stemmer from the dictionary files found in `basedir`.
///
/// For every part of speech in `PARTS` the index file and the exception file
/// named in `WN_FILES` are loaded, then the lemma/part/offset map is built
/// once from the `index.*` files.
///
/// # Errors
/// Returns the I/O error of the first file that cannot be opened or read, or
/// an `InvalidData` error for a malformed exception line.
fn new(basedir: &str) -> Result<WordnetStemmer> {
    let mut wn = WordnetStemmer {
        basedir: basedir.to_owned(),
        wordlist: Vec::new(),
        exceptions: Vec::new(),
        // `[suffix, replacement]` rules, indexed by part of speech. The order
        // inside each list determines the order of generated candidates.
        substitutions: vec![
            // nouns
            vec![
                vec!["s", ""],
                vec!["ses", "s"],
                vec!["ves", "f"],
                vec!["xes", "x"],
                vec!["zes", "z"],
                vec!["ches", "ch"],
                vec!["shes", "sh"],
                vec!["men", "man"],
                vec!["ies", "y"],
            ],
            // verbs
            vec![
                vec!["s", ""],
                vec!["ies", "y"],
                vec!["es", "e"],
                vec!["es", ""],
                vec!["ed", "e"],
                vec!["ed", ""],
                vec!["ing", "e"],
                vec!["ing", ""],
            ],
            // adjectives
            vec![
                vec!["er", ""],
                vec!["est", ""],
                vec!["er", "e"],
                vec!["est", "e"],
            ],
            // adverbs: no rules
            vec![],
        ],
        lemma_pos_offset_map: HashMap::new(),
    };
    // One empty table per part of speech, so `load` can index by part.
    for _ in PARTS.iter() {
        wn.wordlist.push(Default::default());
        wn.exceptions.push(Default::default());
    }
    for part in PARTS.iter() {
        try!(wn.load(*part, WN_FILES[*part]));
    }
    // The lemma map always covers all four parts of speech, so it is built a
    // single time here rather than being rebuilt after every `load` call.
    try!(wn.load_lemma_pos_offset_map());
    Ok(wn)
}

/// Loads the word list and the exception table for one part of speech.
///
/// `pair[0]` names the index file and `pair[1]` the exception file; both are
/// resolved against `basedir` with a path join, so `basedir` works with or
/// without a trailing separator. Empty lines and lines starting with a space
/// are skipped in both files.
///
/// # Errors
/// Returns the I/O error if a file cannot be opened or read, and an
/// `InvalidData` error if a non-empty exception line has no second field.
///
/// # Panics
/// Panics if `part` is not a valid index into the per-part tables.
fn load(&mut self, part: usize, pair: [&str; 2]) -> Result<()> {
    let (index_path, exc_path) = {
        let base = ::std::path::Path::new(&self.basedir);
        (base.join(pair[0]), base.join(pair[1]))
    };

    let index_file = try!(File::open(&index_path));
    for line_result in BufReader::new(index_file).lines() {
        let line = try!(line_result);
        if line.is_empty() || line.starts_with(" ") {
            continue;
        }
        // `splitn` always yields at least one item, so this never skips a line.
        if let Some(word) = line.splitn(2, ' ').next() {
            self.wordlist[part].insert(word.to_owned(), word.to_owned());
        }
    }

    let exc_file = try!(File::open(&exc_path));
    for line_result in BufReader::new(exc_file).lines() {
        let line = try!(line_result);
        if line.is_empty() || line.starts_with(" ") {
            continue;
        }
        let mut fields = line.splitn(3, ' ');
        match (fields.next(), fields.next()) {
            (Some(inflected), Some(base_form)) => {
                self.exceptions[part]
                    .entry(inflected.to_owned())
                    .or_insert_with(Vec::new)
                    .push(base_form.to_owned());
            }
            _ => {
                return Err(::std::io::Error::new(
                    ::std::io::ErrorKind::InvalidData,
                    format!("malformed exception line in {}: {:?}", exc_path.display(), line),
                ));
            }
        }
    }
    Ok(())
}
fn load_lemma_pos_offset_map(&mut self) -> Result<()>{
for variant in [Part::Noun, Part::Verb, Part::Adj, Part::Adv].iter() {
let fname:String = format!("{}/index.{}", self.basedir, variant.as_str());
let f = try!(File::open(fname));
let br = BufReader::new(f);
for line_result in br.lines() {
let line = try!(line_result);
if line.starts_with(" ") {
continue
}
let mut iter = line.split(' ');
let lemma = iter.next().unwrap();
let _ = iter.next().unwrap(); let n_synsets = iter.next().unwrap().parse::<i32>().unwrap();
let n_pointers = iter.next().unwrap().parse::<i32>().unwrap();
let _ = iter.nth((n_pointers as usize)).unwrap().parse::<i32>().unwrap();
let _ =iter.next();
let synset_offsets:Vec<i32> = iter.take(n_synsets as usize).map(|x|x.parse::<i32>().unwrap()).collect();
match self.lemma_pos_offset_map.entry(lemma.to_owned()) {
hash_map::Entry::Vacant(entry) => {
let mut hm = HashMap::new();
hm.insert(variant.as_usize(), synset_offsets);
entry.insert(hm);
},
hash_map::Entry::Occupied(mut entry) => { entry.get_mut().insert(variant.as_usize(), synset_offsets); }
}
if *variant == Part::Adj {
}
}
}
Ok(())
}
/// Applies every suffix rule of `part` to every word in `words`.
///
/// Rules are tried in table order and, within one rule, words in input
/// order. Each word ending in a rule's suffix produces one candidate: the
/// word with that suffix removed and the rule's replacement appended. Words
/// matching no rule contribute nothing, so the result may be empty.
///
/// # Panics
/// Panics if `part` is not a valid index into the substitution table.
fn apply_rules(&self, part: usize, words: &Vec<String>) -> Vec<String> {
    let mut result = Vec::new();
    for rule in self.substitutions[part].iter() {
        let (old, new) = (rule[0], rule[1]);
        for word in words.iter() {
            if word.ends_with(old) {
                // Cut by byte offset: both lengths are in bytes, and a
                // successful `ends_with` guarantees the offset is a character
                // boundary. Counting chars here would leave part of the
                // suffix behind for words with multi-byte characters.
                let stem = &word[..word.len() - old.len()];
                result.push(format!("{}{}", stem, new));
            }
        }
    }
    result
}
/// Keeps the candidates that are known lemmas for `part`.
///
/// A candidate is kept when `lemma_pos_offset_map` has an entry for it that
/// contains `part`. Input order is preserved and each distinct word is
/// returned at most once (its first occurrence).
fn filter_forms(&self, words: &Vec<String>, part: usize) -> Vec<String> {
    let mut emitted = HashSet::new();
    words
        .iter()
        .filter(|candidate| {
            self.lemma_pos_offset_map
                .get(*candidate)
                .map_or(false, |by_part| by_part.contains_key(&part))
        })
        // `insert` returns false for a word that was already emitted.
        .filter(|candidate| emitted.insert(*candidate))
        .cloned()
        .collect()
}
/// Returns the known lemmas of `word` for the given part of speech.
///
/// 1. If `word` is in the exception table, the word itself plus its listed
///    forms are filtered and returned (possibly empty).
/// 2. Otherwise the suffix rules are applied once; the word itself plus the
///    generated forms are filtered and returned if anything survives.
/// 3. Otherwise the rules are re-applied to the generated forms until some
///    form survives filtering or no forms are left, in which case the result
///    is empty.
fn morphy(&self, part: usize, word: &str) -> Vec<String> {
    if let Some(listed) = self.exceptions[part].get(word) {
        let mut candidates = Vec::with_capacity(listed.len() + 1);
        candidates.push(word.to_owned());
        candidates.extend(listed.iter().cloned());
        return self.filter_forms(&candidates, part);
    }

    let mut forms = self.apply_rules(part, &vec![word.to_owned()]);
    let first_pass = {
        let mut candidates = vec![word.to_owned()];
        candidates.extend(forms.iter().cloned());
        self.filter_forms(&candidates, part)
    };
    if !first_pass.is_empty() {
        return first_pass;
    }

    loop {
        if forms.is_empty() {
            return Vec::new();
        }
        forms = self.apply_rules(part, &forms);
        let hits = self.filter_forms(&forms, part);
        if !hits.is_empty() {
            return hits;
        }
    }
}
/// Returns the lemma of `word` for the given part of speech.
///
/// Among the candidates produced by `morphy`, the one with the smallest byte
/// length wins; on a tie the earliest candidate is kept. If `morphy` finds
/// nothing, `word` is returned unchanged.
fn lemma(&self, part: usize, word: String) -> String {
    let mut candidates = self.morphy(part, &word).into_iter();
    let mut shortest = match candidates.next() {
        Some(first) => first,
        None => return word,
    };
    for candidate in candidates {
        // Strict comparison keeps the earliest of equally short candidates.
        if candidate.len() < shortest.len() {
            shortest = candidate;
        }
    }
    shortest
}
}
#[cfg(test)]
mod test {
    /// Checks noun lemmatization on a handful of plural and singular words.
    ///
    /// Needs the dictionary files at the hard-coded path below; the test
    /// panics in `unwrap` when they cannot be loaded.
    #[test]
    fn test_stemming() {
        let stemmer = ::WordnetStemmer::new("/home/maciej/nltk_data/corpora/wordnet/").unwrap();
        let cases = [
            ("dogs", "dog"),
            ("money", "money"),
            ("bananas", "banana"),
            ("berries", "berry"),
            ("press", "press"),
            ("ferries", "ferry"),
        ];
        for &(inflected, lemma) in cases.iter() {
            assert_eq!(lemma.to_owned(), stemmer.lemma(::NOUN, inflected.to_owned()));
        }
    }
}