#[macro_use(c)]
extern crate cute;
use std::{thread, time};
use std::collections::HashMap;
use std::path::Path;
use std::process::{Command, Output, Stdio};
// VERSION: fastText release number spliced into the archive and directory names used by `install`.
// DEBUG: when true, `nn` prints tracing output (command, raw output, parsed lines) to stdout.
const VERSION: &'static str = "0.1.0"; const DEBUG: bool = false;
/// Shorthand for turning a string slice into an owned `String`.
fn s(v: &str) -> String {
    String::from(v)
}
pub fn install() -> Vec<Output> {
if cfg!(target_os = "windows") {
unimplemented!("Windows support not yet enabled")
} else {
let cmds = [
s("wget https://github.com/facebookresearch/fastText/archive/v0.1.0.zip"),
s("unzip v") + VERSION + ".zip",
s("cd fastText-") + VERSION + "; make",
s("mv fastText-") + VERSION + "/fasttext .",
s("rm -r fastText-") + VERSION,
s("rm v") + VERSION + ".zip"
];
c![
Command::new("sh")
.arg("-c")
.arg(c)
.stdout(Stdio::piped())
.output()
.expect("failed to execute process"),
for c in cmds.iter()
]
}
}
fn wrap_install(cmds: &str) -> Output {
let r = Command::new("sh")
.arg("-c")
.arg(s("./fasttext ") + cmds)
.stdout(Stdio::piped())
.output()
.expect("failed to execute process");
if !r.status.success() && !Path::new("fasttext").exists() {
let inst_resps = install();
for ir in inst_resps.iter() {
assert!(ir.status.success()); }
println!("Recursing in wrap_install with command: {}", cmds);
return wrap_install(cmds); }
r
}
pub fn supervised(args: &HashMap<&str, &str>) {
gen_mod(s("supervised"), args);
}
pub fn quantize(args: &HashMap<&str, &str>) {
gen_mod(s("quantize"), args);
}
pub fn predict(model: &str, inp: &str, k: u32) -> Vec<f64> {
let s = s("predict ") + model + " " + inp + " " + &k.to_string();
let r = wrap_install(&s);
c![p.parse::<f64>().unwrap(),
for p in String::from_utf8_lossy(&r.stdout).split("\n")]
}
/// Builds a fastText command line from a subcommand and a flag map, then runs it.
///
/// `s` is the subcommand name; for every entry of `args` the text
/// ` -<key> <value>` is appended, in the map's iteration order. The finished
/// string is executed through `wrap_install`.
///
/// # Panics
///
/// Panics if the command does not exit successfully.
fn gen_mod<'a>(mut s: String, args: &HashMap<&str, &'a str>) {
    for (flag, value) in args.iter() {
        s.push_str(" -");
        s.push_str(flag);
        s.push(' ');
        s.push_str(value);
    }
    let outcome = wrap_install(&s);
    if outcome.status.success() {
        return;
    }
    panic!("Gen_mod failed with given input: {}", s)
}
pub fn skipgram(args: &HashMap<&str, &str>) {
gen_mod(s("skipgram"), args);
}
pub fn cbow(args: &HashMap<&str, &str>) {
gen_mod(s("cbow"), args);
}
pub fn min_skipgram(input: &str, output: &str) -> String {
let st = s("skipgram -input ") + input + " -output " + output;
let o = wrap_install(&st);
if o.status.success() {
s(output) + ".bin"
} else {
panic!("Min_skipgram failed with given input: {} \noutput: {:?}", st, o)
}
}
pub fn min_cbow(input: &str, output: &str) -> String {
let st = s("cbow -input ") + input + " -output " + output;
if wrap_install(&st).status.success() {
s(output) + ".bin"
} else {
panic!("Cbow failed with given input: {}", st)
}
}
/// Queries nearest neighbours for `words` by running
/// `echo <words> | ./fasttext nn <model> <k>` through `sh -c`.
///
/// Returns one inner vector per `Query word? ` prompt in the output that was
/// followed by at least one result line; each element is a `(word, score)`
/// pair in output order.
///
/// If the command fails and `./fasttext` does not exist, fastText is installed
/// via `install` and the command is run once more, so that the result comes
/// from the freshly installed binary rather than from the failed first attempt.
///
/// Note that `words` and `model` are interpolated into a shell command line
/// without any escaping.
///
/// # Panics
///
/// Panics if `sh` cannot be spawned, if an installation step fails, or if the
/// output contains a line that does not match the expected layout.
pub fn nn(words: &str, model: &str, k: u32) -> Vec<Vec<(String, f64)>> {
    if DEBUG {
        println!("NN begun");
    }
    let cmd = format!("echo {} | ./fasttext nn {} {}", words, model, k);
    if DEBUG {
        println!("cmd: {}", cmd);
    }
    let run = || {
        Command::new("sh")
            .arg("-c")
            .arg(&cmd)
            .stdout(Stdio::piped())
            .output()
            .expect("failed to execute process")
    };
    let mut r = run();
    if !r.status.success() && !Path::new("./fasttext").exists() {
        for o in install().iter() {
            if !o.status.success() {
                panic!("Missing files / executable in call to nn");
            }
        }
        // The first attempt ran without a binary; its output is useless.
        r = run();
    }
    if DEBUG {
        println!("{:?}", r);
    }
    let stdout = String::from_utf8_lossy(&r.stdout);
    parse_nn_output(&stdout)
}

/// Splits the text printed by the `nn` subcommand into per-query `(word, score)` lists.
fn parse_nn_output(stdout: &str) -> Vec<Vec<(String, f64)>> {
    let prompt = "Query word? ";
    let score = |raw: &str, line: &str| -> f64 {
        raw.parse::<f64>()
            .unwrap_or_else(|_| panic!("misformatted line in input: {}", line))
    };
    let mut results = Vec::new();
    if DEBUG {
        println!("Beginning match iteration");
        println!("stdout: {}", stdout);
    }
    for (start, _) in stdout.match_indices(prompt) {
        if DEBUG {
            println!("Match found: {}", start);
        }
        let mut neighbours = Vec::new();
        // The first result shares its line with the prompt ("Query word? w s").
        let mut first = true;
        for l in stdout[start..].split("\n") {
            let lar: Vec<&str> = l.split(" ").collect();
            if DEBUG {
                println!("{:?}", lar);
            }
            if lar.len() == 2 {
                neighbours.push((lar[0].to_string(), score(lar[1], l)));
            } else if lar.len() == 4 && first {
                neighbours.push((lar[2].to_string(), score(lar[3], l)));
                first = false;
            } else if l == prompt || lar.len() == 4 {
                // A bare prompt or the next query's first line ends this section.
                break;
            } else {
                panic!("misformatted line in input: {}", l);
            }
        }
        if !neighbours.is_empty() {
            results.push(neighbours);
        }
    }
    results
}
/// Placeholder for the fastText `analogies` query; not implemented yet.
///
/// # Panics
///
/// Always panics.
pub fn analogies(analogy: &str, model: &str, k: u32) {
    // Intended command once implemented:
    //   echo <analogy> | ./fasttext analogies <model> <k>
    let _ = (analogy, model, k);
    unimplemented!("analogies is not yet supported")
}
#[cfg(test)]
mod tests {
    extern crate kolmogorov_smirnov as ks;
    use std::collections::HashSet;
    use std::panic;
    use super::*;

    /// Calls `or` if `file` is still missing after a 30 second wait.
    ///
    /// Returns immediately when `file` already exists.
    // NOTE(review): the wait presumably gives a concurrently running test time
    // to finish producing `file` — confirm.
    fn check_exists(file: &str, or: fn()) {
        if Path::new(file).exists() {
            return;
        }
        thread::sleep(time::Duration::from_secs(30));
        if !Path::new(file).exists() {
            or()
        }
    }

    /// Runs `rm -r <path>` through the shell for every entry of `files`.
    ///
    /// The exit status of `rm` is deliberately not checked.
    fn rm(files: Vec<&str>) {
        for f in files.iter() {
            Command::new("sh")
                .arg("-c")
                .arg(format!("rm -r {}", f))
                .stdout(Stdio::piped())
                .output()
                .expect("failed to execute process");
        }
    }

    /// Collects every word returned by `nn` into a set, dropping the scores.
    fn set(v: Vec<Vec<(String, f64)>>) -> HashSet<String> {
        v.into_iter()
            .flat_map(|neighbours| neighbours.into_iter())
            .map(|(word, _)| word)
            .collect()
    }

    /// Number of words the two sets have in common.
    fn sim(a: &HashSet<String>, b: &HashSet<String>) -> usize {
        a.intersection(b).count()
    }

    /// `sim` of every set in `xs` against every set in `ys`, row by row.
    fn pairwise_sim(xs: &[HashSet<String>], ys: &[HashSet<String>]) -> Vec<usize> {
        let mut sims = Vec::with_capacity(xs.len() * ys.len());
        for x in xs.iter() {
            for y in ys.iter() {
                sims.push(sim(x, y));
            }
        }
        sims
    }

    /// Makes sure the `fasttext` binary is present, installing it if needed.
    fn inst() {
        check_exists("fasttext", || { install(); });
    }

    /// Makes sure the `sample.bin` model is present, training it if needed.
    fn samp() {
        check_exists("sample.bin", sample_skipgram);
    }

    #[test]
    fn test_install() {
        let rv = install();
        for r in rv.iter() {
            println!("{}", String::from_utf8_lossy(&r.stdout));
            println!("{}", String::from_utf8_lossy(&r.stderr));
            assert!(r.status.success());
        }
        // Running the binary without arguments is expected to exit with code 1.
        let r = Command::new("sh")
            .arg("-c")
            .arg("./fasttext")
            .stdout(Stdio::piped())
            .output()
            .expect("failed to execute process");
        println!("{}", String::from_utf8_lossy(&r.stdout));
        println!("{}", String::from_utf8_lossy(&r.stderr));
        assert_eq!(r.status.code(), Some(1));
    }

    /// Trains the `sample` skipgram model from `sample_text.txt`.
    fn sample_skipgram() {
        inst();
        let model = min_skipgram("sample_text.txt", "sample");
        println!("Generated skipgram model: {}", model);
    }

    #[test]
    fn test_nn() {
        samp();
        // (query words, k, expected number of answered queries)
        let cases: [(&str, u32, usize); 4] = [
            ("lesbian", 10, 1),
            ("lesbian gay", 5, 2),
            ("lesbian gay bisexual", 8, 3),
            ("lesbian gay bisexual transgender", 1, 4),
        ];
        for &(words, k, queries) in cases.iter() {
            let out = nn(words, "sample.bin", k);
            println!("{:?}", out);
            assert_eq!(out.len(), queries);
            assert_eq!(out[0].len(), k as usize);
        }
    }

    /// Compares models trained through a `min_*` helper with models trained
    /// through the matching map-based function.
    ///
    /// Both are trained 18 times on `sample_text.txt`; the neighbour sets of
    /// each probe word are compared within and between the two groups with
    /// Kolmogorov-Smirnov tests, and the test panics when too many are rejected.
    fn test_embedding(min_fn: fn(&str, &str) -> String, reg_fn: fn(&HashMap<&str, &str>), min_name: &str, reg_name: &str) {
        inst();
        let mut failed = 0;
        let mut total = 0;
        let conf = 0.9;
        let input = "sample_text.txt";
        let mut args = HashMap::new();
        args.insert("input", input);
        args.insert("output", reg_name);
        for w in ["friend"].iter() {
            let mut v1 = Vec::new();
            let mut v2 = Vec::new();
            for i in 0..18 {
                let m1 = min_fn(input, min_name);
                reg_fn(&args);
                // `reg_fn` writes its model under `reg_name`, so that is the
                // file to query for the second group.
                let m2 = format!("{}.bin", reg_name);
                v1.push(set(nn(w, &m1, 10)));
                v2.push(set(nn(w, &m2, 10)));
                println!("model iteration #: {}", i);
            }
            let self1 = pairwise_sim(&v1, &v1);
            let self2 = pairwise_sim(&v2, &v2);
            let between = pairwise_sim(&v1, &v2);
            let s1s2 = ks::test(&self1, &self2, conf);
            let bs1 = ks::test(&between, &self1, conf);
            let bs2 = ks::test(&between, &self2, conf);
            total += 3;
            // Only the first rejected comparison is reported and counted.
            let verdict = if s1s2.is_rejected {
                Some(("Self 1 and self 2", s1s2.reject_probability))
            } else if bs1.is_rejected {
                Some(("Between and self 1", bs1.reject_probability))
            } else if bs2.is_rejected {
                Some(("Between and self 2", bs2.reject_probability))
            } else {
                None
            };
            if let Some((pair, p)) = verdict {
                println!("self1: {:?}\n\nself2: {:?}\n\nbetween: {:?}", self1, self2, between);
                println!("{} are dissimilar. P of difference: {}", pair, p);
                failed += 1;
            }
        }
        let r1 = format!("{}*", min_name);
        let r2 = format!("{}*", reg_name);
        rm(vec![r1.as_str(), r2.as_str()]);
        if (failed as f64 / total as f64) > (1. - conf) {
            panic!("Test failed")
        }
    }

    #[test]
    fn test_skipgram() {
        inst();
        test_embedding(min_skipgram, skipgram, "test_min_skipgram", "test_skipgram");
    }

    #[test]
    fn test_cbow() {
        inst();
        test_embedding(min_cbow, cbow, "test_min_cbow", "test_cbow");
    }
}