use crate::workdir::Workdir;

/// Baseline run with no extra flags: expects one output row per token under a
/// `token` column, the `text` column absent, and no rows for the empty text.
#[test]
fn tokenize() {
    let workdir = Workdir::new("tokenize");
    let input = vec![
        svec!["n", "text"],
        svec!["1", "le chat mange"],
        svec!["2", "la souris"],
        svec!["3", ""],
    ];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![
        svec!["n", "token"],
        svec!["1", "le"],
        svec!["1", "chat"],
        svec!["1", "mange"],
        svec!["2", "la"],
        svec!["2", "souris"],
    ];
    assert_eq!(actual, wanted);
}

/// With the `-S` flag, expects `aujourd'hui` to come out as the two tokens
/// `aujourd` and `hui`.
#[test]
fn tokenize_simple() {
    let workdir = Workdir::new("tokenize_simple");
    let input = vec![svec!["n", "text"], svec!["1", "aujourd'hui"]];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "-S", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![
        svec!["n", "token"],
        svec!["1", "aujourd"],
        svec!["1", "hui"],
    ];
    assert_eq!(actual, wanted);
}

/// With `--sep |`, expects every input row to be kept (including the one with
/// empty text) and a `tokens` column holding the tokens joined by `|`.
#[test]
fn tokenize_sep() {
    let workdir = Workdir::new("tokenize_sep");
    let input = vec![
        svec!["n", "text"],
        svec!["1", "le chat mange"],
        svec!["2", "la souris"],
        svec!["3", ""],
    ];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "--sep", "|", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![
        svec!["n", "text", "tokens"],
        svec!["1", "le chat mange", "le|chat|mange"],
        svec!["2", "la souris", "la|souris"],
        svec!["3", "", ""],
    ];
    assert_eq!(actual, wanted);
}

/// Combines `--sep |` with `-T types`: expects an extra `types` column whose
/// entries are joined with the same separator as the `tokens` column.
#[test]
fn tokenize_sep_types() {
    let workdir = Workdir::new("tokenize_sep_types");
    let input = vec![
        svec!["n", "text"],
        svec!["1", "le chat mange"],
        svec!["2", "la souris"],
        svec!["3", ""],
    ];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "--sep", "|", "-T", "types", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![
        svec!["n", "text", "tokens", "types"],
        svec!["1", "le chat mange", "le|chat|mange", "word|word|word"],
        svec!["2", "la souris", "la|souris", "word|word"],
        svec!["3", "", "", ""],
    ];
    assert_eq!(actual, wanted);
}

/// With `--keep-text`, expects the original `text` column to be repeated on
/// every per-token row, followed by the `token` column.
#[test]
fn tokenize_keep_text() {
    let workdir = Workdir::new("tokenize_keep_text");
    let input = vec![
        svec!["n", "text"],
        svec!["1", "le chat mange"],
        svec!["2", "la souris"],
        svec!["3", ""],
    ];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "--keep-text", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![
        svec!["n", "text", "token"],
        svec!["1", "le chat mange", "le"],
        svec!["1", "le chat mange", "chat"],
        svec!["1", "le chat mange", "mange"],
        svec!["2", "la souris", "la"],
        svec!["2", "la souris", "souris"],
    ];
    assert_eq!(actual, wanted);
}

/// With `-c word`, expects the token column header to be `word` rather than
/// `token`; the rows themselves match the baseline run.
#[test]
fn tokenize_column() {
    let workdir = Workdir::new("tokenize_column");
    let input = vec![
        svec!["n", "text"],
        svec!["1", "le chat mange"],
        svec!["2", "la souris"],
        svec!["3", ""],
    ];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "-c", "word", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![
        svec!["n", "word"],
        svec!["1", "le"],
        svec!["1", "chat"],
        svec!["1", "mange"],
        svec!["2", "la"],
        svec!["2", "souris"],
    ];
    assert_eq!(actual, wanted);
}

/// With `-T type`, expects an extra `type` column labelling each token; the
/// fixture covers the `number`, `word` and `emoji` labels.
#[test]
fn tokenize_token_type() {
    let workdir = Workdir::new("tokenize_token_type");
    let input = vec![svec!["n", "text"], svec!["1", "1 chat mange 😎"]];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "-T", "type", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![
        svec!["n", "token", "type"],
        svec!["1", "1", "number"],
        svec!["1", "chat", "word"],
        svec!["1", "mange", "word"],
        svec!["1", "😎", "emoji"],
    ];
    assert_eq!(actual, wanted);
}

/// With the `-p` flag (parallel mode, going by the test name), expects the
/// same rows in the same order as the baseline run.
#[test]
fn tokenize_parallel() {
    let workdir = Workdir::new("tokenize_parallel");
    let input = vec![
        svec!["n", "text"],
        svec!["1", "le chat mange"],
        svec!["2", "la souris"],
        svec!["3", ""],
    ];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "-p", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![
        svec!["n", "token"],
        svec!["1", "le"],
        svec!["1", "chat"],
        svec!["1", "mange"],
        svec!["2", "la"],
        svec!["2", "souris"],
    ];
    assert_eq!(actual, wanted);
}

/// With `--drop number,emoji`, expects only the `chat` token to survive from
/// `1 chat 😎`.
#[test]
fn tokenize_drop() {
    let workdir = Workdir::new("tokenize_drop");
    let input = vec![svec!["n", "text"], svec!["1", "1 chat 😎"]];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "--drop", "number,emoji", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![svec!["n", "token"], svec!["1", "chat"]];
    assert_eq!(actual, wanted);
}

/// With `--keep number,emoji`, expects only the `1` and `😎` tokens to
/// survive from `1 chat 😎`.
#[test]
fn tokenize_keep() {
    let workdir = Workdir::new("tokenize_keep");
    let input = vec![svec!["n", "text"], svec!["1", "1 chat 😎"]];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "--keep", "number,emoji", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![svec!["n", "token"], svec!["1", "1"], svec!["1", "😎"]];
    assert_eq!(actual, wanted);
}

/// With `--min-token 3`, expects the two-letter `le` to be filtered out and
/// `chaton` to remain.
#[test]
fn tokenize_min_token_len() {
    let workdir = Workdir::new("tokenize_min_token_len");
    let input = vec![svec!["n", "text"], svec!["1", "le chaton"]];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "--min-token", "3", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![svec!["n", "token"], svec!["1", "chaton"]];
    assert_eq!(actual, wanted);
}

/// With `--max-token 3`, expects the six-letter `chaton` to be filtered out
/// and `le` to remain.
#[test]
fn tokenize_max_token_len() {
    let workdir = Workdir::new("tokenize_max_token_len");
    let input = vec![svec!["n", "text"], svec!["1", "le chaton"]];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "--max-token", "3", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![svec!["n", "token"], svec!["1", "le"]];
    assert_eq!(actual, wanted);
}

/// With `--stoplist stoplist.txt`, expects the tokens listed in that file
/// (`le`, `la`) to be removed from the output.
#[test]
fn tokenize_stoplist() {
    let workdir = Workdir::new("tokenize_stoplist");

    // The stoplist fixture is written first, then the data file.
    let stopwords = vec![svec!["le"], svec!["la"]];
    workdir.create("stoplist.txt", stopwords);

    let input = vec![svec!["n", "text"], svec!["1", "le chaton mange la souris"]];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "--stoplist", "stoplist.txt", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![
        svec!["n", "token"],
        svec!["1", "chaton"],
        svec!["1", "mange"],
        svec!["1", "souris"],
    ];
    assert_eq!(actual, wanted);
}

/// With `--ngrams 2`, expects one row per pair of adjacent tokens, the pair
/// being joined with `|`.
#[test]
fn tokenize_ngrams() {
    let workdir = Workdir::new("tokenize_ngrams");
    let input = vec![svec!["n", "text"], svec!["1", "le chat mange"]];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "--ngrams", "2", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![
        svec!["n", "token"],
        svec!["1", "le|chat"],
        svec!["1", "chat|mange"],
    ];
    assert_eq!(actual, wanted);
}

/// With `--ngrams 2 --ngrams-sep ", "`, expects the members of each bigram to
/// be joined with `, ` instead of `|`.
#[test]
fn tokenize_ngrams_sep() {
    let workdir = Workdir::new("tokenize_ngrams_sep");
    let input = vec![svec!["n", "text"], svec!["1", "le chat mange"]];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "--ngrams", "2", "--ngrams-sep", ", ", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![
        svec!["n", "token"],
        svec!["1", "le, chat"],
        svec!["1", "chat, mange"],
    ];
    assert_eq!(actual, wanted);
}

/// With `--ngrams 1,2`, expects both unigrams and bigrams in the output. The
/// expected table pins the interleaved order: each bigram appears right after
/// the unigram for its second member.
#[test]
fn tokenize_ngrams_range() {
    let workdir = Workdir::new("tokenize_ngrams_range");
    let input = vec![svec!["n", "text"], svec!["1", "le chat mange"]];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "--ngrams", "1,2", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![
        svec!["n", "token"],
        svec!["1", "le"],
        svec!["1", "chat"],
        svec!["1", "le|chat"],
        svec!["1", "mange"],
        svec!["1", "chat|mange"],
    ];
    assert_eq!(actual, wanted);
}

/// Combines `--ngrams 2` with `-p`: expects the same bigram rows as the
/// `--ngrams 2` run without `-p`.
#[test]
fn tokenize_ngrams_parallel() {
    let workdir = Workdir::new("tokenize_ngrams_parallel");
    let input = vec![svec!["n", "text"], svec!["1", "le chat mange"]];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "--ngrams", "2", "-p", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![
        svec!["n", "token"],
        svec!["1", "le|chat"],
        svec!["1", "chat|mange"],
    ];
    assert_eq!(actual, wanted);
}

/// Combines `--ngrams 2` with `--sep §`: expects a single row per input row,
/// with bigrams (members joined by `|`) themselves joined by `§` in a
/// `tokens` column.
#[test]
fn tokenize_sep_ngrams() {
    let workdir = Workdir::new("tokenize_sep_ngrams");
    let input = vec![svec!["n", "text"], svec!["1", "le chat mange"]];
    workdir.create("data.csv", input);

    let mut command = workdir.command("tokenize");
    command.args(["text", "--ngrams", "2", "--sep", "§", "data.csv"]);

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut command);
    let wanted = vec![
        svec!["n", "text", "tokens"],
        svec!["1", "le chat mange", "le|chat§chat|mange"],
    ];
    assert_eq!(actual, wanted);
}