use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use assert_fs::prelude::*;
use csv::WriterBuilder;
use predicates::prelude::*;
use regex::Regex;
use serde_json::Value as Json;
use serial_test::serial;
use tempfile::tempdir;
use text_analysis::{
AnalysisOptions, ExportFormat, StemLang, StemMode, analyze_path, analyze_text_with,
csv_safe_cell,
};
/// Create `name` inside `dir` with the given `content` and return its path.
fn write_file(dir: &assert_fs::TempDir, name: &str, content: &str) -> PathBuf {
    let child = dir.child(name);
    child.write_str(content).unwrap();
    child.path().to_owned()
}
/// Read the whole file at `p` into a `String`; panics if it cannot be read.
fn read_to_string<P: AsRef<Path>>(p: P) -> String {
    let path: &Path = p.as_ref();
    fs::read_to_string(path).unwrap()
}
/// Baseline analysis options: bigrams, context window 5, stemming disabled,
/// per-file (non-combined) output in the requested export format.
fn opts(fmt: ExportFormat) -> AnalysisOptions {
    AnalysisOptions {
        export_format: fmt,
        ngram: 2,
        context: 5,
        combine: false,
        entities_only: false,
        stem_mode: StemMode::Off,
        stem_require_detected: false,
    }
}
/// Run the `text_analysis` binary in `dir` with `args`, asserting a zero exit.
fn run_cli_ok_in(dir: &std::path::Path, args: &[&str]) -> assert_cmd::assert::Assert {
    assert_cmd::Command::cargo_bin("text_analysis")
        .unwrap()
        .current_dir(dir)
        .args(args)
        .assert()
        .success()
}
/// Run the `text_analysis` binary in `dir` with `args`, asserting a non-zero exit.
fn run_cli_fail_in(dir: &std::path::Path, args: &[&str]) -> assert_cmd::assert::Assert {
    assert_cmd::Command::cargo_bin("text_analysis")
        .unwrap()
        .current_dir(dir)
        .args(args)
        .assert()
        .failure()
}
/// Return the first `*.json` file in `dir` whose file name ends with `suffix`.
/// Panics when no matching file exists.
fn find_json_with_suffix(dir: &Path, suffix: &str) -> PathBuf {
    for entry in fs::read_dir(dir).unwrap().filter_map(Result::ok) {
        let path = entry.path();
        let is_json = path.extension().is_some_and(|e| e == "json");
        let name_matches = path
            .file_name()
            .and_then(|n| n.to_str())
            .is_some_and(|n| n.ends_with(suffix));
        if is_json && name_matches {
            return path;
        }
    }
    panic!("No JSON file found ending with {}", suffix);
}
fn load_wordfreq_map(dir: &Path) -> HashMap<String, usize> {
let p = find_json_with_suffix(dir, "_wordfreq.json");
let s = read_to_string(p);
let v: Json = serde_json::from_str(&s).expect("valid json");
let mut map = HashMap::new();
let arr = v.as_array().expect("json array");
for item in arr {
let obj = item.as_object().expect("json object");
let k = obj
.get("item")
.and_then(|x| x.as_str())
.expect("item str")
.to_string();
let c = obj
.get("count")
.and_then(|x| x.as_u64())
.expect("count u64") as usize;
map.insert(k, c);
}
map
}
#[test]
fn lib_tokenize_and_basic_counts() {
    // No stopwords, stemming off: raw token statistics over the sample text.
    let mut options = opts(ExportFormat::Json);
    options.stem_mode = StemMode::Off;
    let sample = "The quick brown fox jumps over the lazy dog. The fox was very quick!";
    let stopwords = std::collections::HashSet::new();
    let result = analyze_text_with(sample, &stopwords, &options);
    // Bigrams are built from the token stream.
    assert!(result.ngrams.contains_key("the quick"));
    assert!(result.ngrams.contains_key("quick brown"));
    // Repeated words accumulate counts.
    assert!(result.wordfreq.get("the").unwrap() >= &2);
    assert!(result.wordfreq.get("quick").unwrap() >= &2);
    // Context map, neighbor map, and PMI list are all populated.
    assert!(result.context_map.contains_key("fox"));
    assert!(result.direct_neighbors.contains_key("fox"));
    assert!(!result.pmi.is_empty());
}
#[test]
fn lib_stopwords_filtering() {
    let mut options = opts(ExportFormat::Json);
    options.stem_mode = StemMode::Off;
    // Declare "and" as a stopword.
    let stopwords: std::collections::HashSet<String> =
        ["and"].iter().map(|s| s.to_string()).collect();
    let result = analyze_text_with("Cats and dogs and cats and dogs.", &stopwords, &options);
    // The stopword vanishes from frequencies and from every n-gram.
    assert!(!result.wordfreq.contains_key("and"));
    assert!(result.ngrams.keys().all(|ng| !ng.contains("and")));
}
#[test]
fn lib_stemming_auto_and_force() {
    let sample = "This is an English sentence where the running runner runs and cars are common words. running runner runs cars car cars running";
    let stopwords = std::collections::HashSet::new();
    // Auto mode: language detection should kick in and stem English forms.
    let mut auto_options = opts(ExportFormat::Json);
    auto_options.stem_mode = StemMode::Auto;
    let auto = analyze_text_with(sample, &stopwords, &auto_options);
    assert!(auto.wordfreq.contains_key("run"));
    assert!(auto.wordfreq.contains_key("car"));
    assert!(!auto.wordfreq.contains_key("running"));
    assert!(!auto.wordfreq.contains_key("cars"));
    // Forcing English explicitly must also yield the stems.
    let mut forced_options = opts(ExportFormat::Json);
    forced_options.stem_mode = StemMode::Force(StemLang::En);
    let forced = analyze_text_with(sample, &stopwords, &forced_options);
    assert!(forced.wordfreq.contains_key("run"));
    assert!(forced.wordfreq.contains_key("car"));
}
#[test]
fn lib_ngrams_window_and_neighbors() {
    let mut options = opts(ExportFormat::Json);
    options.ngram = 3;
    options.context = 2;
    let stopwords = std::collections::HashSet::new();
    let result = analyze_text_with("alpha beta gamma delta epsilon", &stopwords, &options);
    // Trigrams slide across the five tokens.
    assert!(result.ngrams.contains_key("alpha beta gamma"));
    assert!(result.ngrams.contains_key("beta gamma delta"));
    // "gamma" records both of its immediate neighbors.
    let gamma_neighbors = result.direct_neighbors.get("gamma").unwrap();
    assert!(gamma_neighbors.get("beta").is_some());
    assert!(gamma_neighbors.get("delta").is_some());
}
#[test]
fn lib_ner_heuristic() {
    let options = opts(ExportFormat::Json);
    let stopwords = std::collections::HashSet::new();
    let result = analyze_text_with(
        "Berlin is in Germany. NASA launched a rocket. The dog sleeps.",
        &stopwords,
        &options,
    );
    // Proper-noun-like tokens are recognized as entities...
    assert!(result.named_entities.contains_key("Berlin"));
    assert!(result.named_entities.contains_key("Germany"));
    // ...while all-caps acronyms and sentence-initial articles are rejected.
    assert!(!result.named_entities.contains_key("NASA"));
    assert!(!result.named_entities.contains_key("The"));
}
#[test]
fn lib_pmi_sanity() {
    let mut options = opts(ExportFormat::Json);
    options.context = 1;
    let stopwords = std::collections::HashSet::new();
    let result = analyze_text_with("alice bob alice bob alice bob", &stopwords, &options);
    // The strongly co-occurring pair must appear, in either orientation.
    let found = result.pmi.iter().any(|pair| {
        (pair.word1 == "alice" && pair.word2 == "bob")
            || (pair.word1 == "bob" && pair.word2 == "alice")
    });
    assert!(found);
}
// Serial: mutates the process-wide current working directory.
#[test]
#[serial]
fn lib_analyze_path_per_file_and_combined_csv() {
    let td = assert_fs::TempDir::new().unwrap();
    write_file(&td, "a.txt", "Hello world. Berlin Berlin.");
    write_file(&td, "b.txt", "Hello Alice. Alice meets Bob.");
    // Per-file mode: one timestamped wordfreq CSV per input file.
    let mut per_file = opts(ExportFormat::Csv);
    per_file.combine = false;
    std::env::set_current_dir(td.path()).unwrap();
    analyze_path(td.path(), None, &per_file).expect("analyze_path");
    let stamp_re = Regex::new(r".+_\d{8}_\d{6}_wordfreq\.csv$").unwrap();
    let per_file_found = fs::read_dir(td.path())
        .unwrap()
        .filter_map(|e| e.ok())
        .any(|e| stamp_re.is_match(e.file_name().to_string_lossy().as_ref()));
    assert!(per_file_found, "Expected <stem>_*_wordfreq.csv in temp dir");
    // Combined mode: outputs share the "combined_" prefix.
    let mut combined = opts(ExportFormat::Csv);
    combined.combine = true;
    std::env::set_current_dir(td.path()).unwrap();
    analyze_path(td.path(), None, &combined).expect("analyze_path combined");
    let combined_found = fs::read_dir(td.path())
        .unwrap()
        .filter_map(|e| e.ok())
        .any(|e| e.file_name().to_string_lossy().starts_with("combined_"));
    assert!(combined_found, "Expected combined_* outputs");
}
#[test]
fn cli_nonexistent_path_fails() {
    let td = tempdir().unwrap();
    // Point the CLI at a path that was never created.
    let missing = td.path().join("does_not_exist_here");
    run_cli_fail_in(
        td.path(),
        &[missing.to_string_lossy().as_ref(), "--export-format", "csv"],
    );
}
#[test]
fn cli_basic_run_csv() {
    let td = assert_fs::TempDir::new().unwrap();
    write_file(
        &td,
        "cli.txt",
        "Berlin meets Alice. Alice meets Bob. NASA FAILS.",
    );
    let stopwords_path = write_file(&td, "stop.txt", "meets\n");
    // Full CLI invocation with stopwords, n-gram size, and context window.
    run_cli_ok_in(
        td.path(),
        &[
            td.path().to_string_lossy().as_ref(),
            "--export-format",
            "csv",
            "--stopwords",
            stopwords_path.to_str().unwrap(),
            "--ngram",
            "2",
            "--context",
            "3",
        ],
    );
    // A timestamped wordfreq CSV must land in the working directory.
    let stamp_re = Regex::new(r".+_\d{8}_\d{6}_wordfreq\.csv$").unwrap();
    let found = fs::read_dir(td.path())
        .unwrap()
        .filter_map(|e| e.ok())
        .any(|e| stamp_re.is_match(e.file_name().to_string_lossy().as_ref()));
    assert!(found, "Expected *_wordfreq.csv in temp dir");
}
#[test]
fn cli_export_json() {
    let td = assert_fs::TempDir::new().unwrap();
    write_file(&td, "fmt.txt", "Alpha Beta. Beta Gamma. Berlin.");
    run_cli_ok_in(
        td.path(),
        &[
            td.path().to_string_lossy().as_ref(),
            "--export-format",
            "json",
        ],
    );
    // At least one .json export should appear next to the input.
    let json_seen = std::fs::read_dir(td.path())
        .unwrap()
        .filter_map(|e| e.ok())
        .any(|e| e.path().extension().is_some_and(|x| x == "json"));
    assert!(json_seen, "Expected at least one .json export in temp dir");
}
#[test]
fn cli_export_tsv() {
    let td = assert_fs::TempDir::new().unwrap();
    write_file(&td, "fmt2.txt", "Alice Bob. Bob Alice.");
    run_cli_ok_in(
        td.path(),
        &[
            td.path().to_string_lossy().as_ref(),
            "--export-format",
            "tsv",
        ],
    );
    // At least one .tsv export should appear next to the input.
    let tsv_seen = std::fs::read_dir(td.path())
        .unwrap()
        .filter_map(|e| e.ok())
        .any(|e| e.path().extension().is_some_and(|x| x == "tsv"));
    assert!(tsv_seen, "Expected at least one .tsv export in temp dir");
}
#[test]
fn cli_stem_auto_detects_language() {
    let td = assert_fs::TempDir::new().unwrap();
    write_file(
        &td,
        "stem.txt",
        "This is an English sentence where the running runner runs and cars are common words. running runner runs cars car cars running",
    );
    // --stem alone relies on automatic language detection.
    run_cli_ok_in(
        td.path(),
        &[
            td.path().to_string_lossy().as_ref(),
            "--export-format",
            "json",
            "--stem",
        ],
    );
    let freq = load_wordfreq_map(td.path());
    // Stems replace the inflected forms in the export.
    assert!(freq.contains_key("run"), "Auto stemming should produce 'run'");
    assert!(freq.contains_key("car"), "Auto stemming should produce 'car'");
    assert!(
        !freq.contains_key("running"),
        "Auto stemming should remove 'running'"
    );
    assert!(
        !freq.contains_key("cars"),
        "Auto stemming should remove 'cars'"
    );
}
#[test]
fn cli_stem_lang_without_stem_flag_forces() {
    let td = assert_fs::TempDir::new().unwrap();
    write_file(&td, "stem2.txt", "running runner runs cars car cars running");
    // Passing --stem-lang without --stem should still force stemming.
    run_cli_ok_in(
        td.path(),
        &[
            td.path().to_string_lossy().as_ref(),
            "--export-format",
            "json",
            "--stem-lang",
            "en",
        ],
    );
    let freq = load_wordfreq_map(td.path());
    assert!(
        !freq.contains_key("running"),
        "With forced --stem-lang, 'running' should not remain"
    );
    assert!(
        !freq.contains_key("cars"),
        "With forced --stem-lang, 'cars' should not remain"
    );
    assert!(
        freq.contains_key("run"),
        "With forced --stem-lang, 'run' should be produced"
    );
    assert!(
        freq.contains_key("car"),
        "With forced --stem-lang, 'car' should be produced"
    );
}
#[test]
fn cli_stem_force_language_with_stem() {
    let td = assert_fs::TempDir::new().unwrap();
    write_file(&td, "stem3.txt", "running runner runs cars car cars running");
    // Both flags together: stemming enabled with an explicit language.
    run_cli_ok_in(
        td.path(),
        &[
            td.path().to_string_lossy().as_ref(),
            "--export-format",
            "json",
            "--stem",
            "--stem-lang",
            "en",
        ],
    );
    let freq = load_wordfreq_map(td.path());
    assert!(
        freq.contains_key("run"),
        "Forced English should produce 'run'"
    );
    assert!(
        freq.contains_key("car"),
        "Forced English should produce 'car'"
    );
    assert!(
        !freq.contains_key("running"),
        "Forced English should remove 'running'"
    );
    assert!(
        !freq.contains_key("cars"),
        "Forced English should remove 'cars'"
    );
}
/// Smoke test: `analyze_path` runs end-to-end on a plain text file.
///
/// Marked `#[serial]` because it changes the process-wide working directory
/// via `set_current_dir`; without it this test runs in parallel with the
/// other `#[serial]` tests that also change the cwd, racing on a global.
#[test]
#[serial]
fn lib_pdf_best_effort_read() {
    let td = assert_fs::TempDir::new().unwrap();
    let _f = write_file(&td, "doc.txt", "Simple text file to ensure analyzer runs.");
    std::env::set_current_dir(td.path()).unwrap();
    let o = opts(ExportFormat::Json);
    let _ = analyze_path(td.path(), None, &o).expect("analysis runs");
}
#[test]
#[serial]
fn lib_stem_strict_per_file_skips_undetected() {
    let td = assert_fs::TempDir::new().unwrap();
    write_file(&td, "gib.txt", "12345 67890 !!! ??? 00000 ---");
    write_file(
        &td,
        "eng.txt",
        "This is clearly English so detection should work and stemming should run.",
    );
    // Strict per-file mode: undetectable files are skipped, not fatal.
    let mut options = opts(ExportFormat::Json);
    options.combine = false;
    options.stem_mode = StemMode::Auto;
    options.stem_require_detected = true;
    std::env::set_current_dir(td.path()).unwrap();
    let report = analyze_path(td.path(), None, &options)
        .expect("per-file strict should succeed (skips undetected)");
    // Only the detectable English file may have produced a wordfreq export.
    let wordfreq_count = std::fs::read_dir(td.path())
        .unwrap()
        .filter_map(|e| e.ok())
        .map(|e| e.path())
        .filter(|p| p.extension().is_some_and(|x| x == "json"))
        .filter(|p| {
            p.file_name()
                .and_then(|n| n.to_str())
                .is_some_and(|n| n.ends_with("_wordfreq.json"))
        })
        .count();
    assert_eq!(wordfreq_count, 1, "Expected exactly one wordfreq.json");
    assert_eq!(
        report.failed_files.len(),
        1,
        "Expected one failed file in strict mode"
    );
}
#[test]
fn lib_stem_strict_combined_aborts_on_undetected() {
    let td = assert_fs::TempDir::new().unwrap();
    write_file(&td, "gib.txt", "12345 67890 !!! ??? 00000 ---");
    write_file(
        &td,
        "eng.txt",
        "This is clearly English so detection should work and stemming should run.",
    );
    // Strict + combined: a single undetectable file must fail the whole run.
    let mut options = opts(ExportFormat::Json);
    options.combine = true;
    options.stem_mode = StemMode::Auto;
    options.stem_require_detected = true;
    let outcome = analyze_path(td.path(), None, &options);
    assert!(
        outcome.is_err(),
        "Combined strict should abort when a file's language is undetected"
    );
}
#[test]
fn cli_stem_strict_per_file_skips_undetected() {
    let td = assert_fs::TempDir::new().unwrap();
    write_file(&td, "gib.txt", "12345 67890 !!! ??? 00000 ---");
    write_file(
        &td,
        "eng.txt",
        "This is clearly English so detection should work and stemming should run.",
    );
    // CLI strict per-file mode succeeds even with an undetectable input.
    run_cli_ok_in(
        td.path(),
        &[
            td.path().to_string_lossy().as_ref(),
            "--export-format",
            "json",
            "--stem",
            "--stem-strict",
        ],
    );
    // Exactly one file stems successfully, so exactly one wordfreq export.
    let wordfreq_count = std::fs::read_dir(td.path())
        .unwrap()
        .filter_map(|e| e.ok())
        .map(|e| e.path())
        .filter(|p| p.extension().is_some_and(|x| x == "json"))
        .filter(|p| {
            p.file_name()
                .and_then(|n| n.to_str())
                .is_some_and(|n| n.ends_with("_wordfreq.json"))
        })
        .count();
    assert_eq!(wordfreq_count, 1, "Expected exactly one wordfreq.json");
}
#[test]
fn cli_stem_strict_combined_aborts_on_undetected() {
    let td = assert_fs::TempDir::new().unwrap();
    write_file(&td, "gib.txt", "12345 67890 !!! ??? 00000 ---");
    write_file(
        &td,
        "eng.txt",
        "This is clearly English so detection should work and stemming should run.",
    );
    // Strict + combined via the CLI must exit non-zero...
    let assertion = run_cli_fail_in(
        td.path(),
        &[
            td.path().to_string_lossy().as_ref(),
            "--combine",
            "--export-format",
            "json",
            "--stem",
            "--stem-strict",
        ],
    );
    // ...and mention either the combined abort or strict stemming on stderr.
    assertion.stderr(
        predicate::str::contains("Combined run aborted")
            .or(predicate::str::contains("strict stemming")),
    );
}
#[test]
#[serial]
fn lib_combine_wordfreq_sums_across_files() {
    let td = assert_fs::TempDir::new().unwrap();
    write_file(&td, "a1.txt", "apple apple banana orange");
    write_file(&td, "a2.txt", "banana banana apple");
    let mut options = opts(ExportFormat::Json);
    options.combine = true;
    std::env::set_current_dir(td.path()).unwrap();
    analyze_path(td.path(), None, &options).expect("combined analysis runs");
    // Counts sum over both inputs: apple 2+1, banana 1+2, orange 1+0.
    let freq = load_wordfreq_map(td.path());
    assert_eq!(
        freq.get("apple").copied().unwrap_or(0),
        3,
        "apple count should be 3 in combined"
    );
    assert_eq!(
        freq.get("banana").copied().unwrap_or(0),
        3,
        "banana count should be 3 in combined"
    );
    assert_eq!(
        freq.get("orange").copied().unwrap_or(0),
        1,
        "orange count should be 1 in combined"
    );
    // Combined mode must not also emit per-file wordfreq exports.
    let stray_per_file = std::fs::read_dir(td.path())
        .unwrap()
        .filter_map(|e| e.ok())
        .map(|e| e.path())
        .filter(|p| p.extension().is_some_and(|x| x == "json"))
        .filter(|p| {
            p.file_name()
                .and_then(|n| n.to_str())
                .is_some_and(|n| n.ends_with("_wordfreq.json"))
        })
        .any(|p| {
            !p.file_name()
                .unwrap()
                .to_string_lossy()
                .starts_with("combined_")
        });
    assert!(
        !stray_per_file,
        "Expected only combined_*_wordfreq.json outputs in combined mode"
    );
}
/// Combined-mode analysis over two TXT files plus a minimal hand-built PDF:
/// verifies that word counts are summed across all three inputs and that no
/// per-file exports leak out in combined mode.
#[test]
#[serial]
fn lib_combine_wordfreq_with_pdf() {
    use std::io::Write as _;
    let td = assert_fs::TempDir::new().unwrap();
    let _f1 = write_file(
        &td,
        "a1.txt",
        "Apple and banana. Apple, orange; banana! Apple? Grape grape apple.",
    );
    let _f2 = write_file(
        &td,
        "a2.txt",
        "Banana and apple; banana and pear. Apple banana banana, apple!",
    );
    // Build a syntactically valid single-page PDF (catalog, page tree, page,
    // content stream, font = 5 objects) that draws `text` with one Tj operator.
    fn build_pdf_bytes(text: &str) -> Vec<u8> {
        // Parentheses delimit PDF string literals, so escape them in `text`.
        fn esc_parens(s: &str) -> String {
            s.replace('(', r"\(").replace(')', r"\)")
        }
        // Content stream: begin text, select /F1 at 12pt, position, show text.
        let content = format!("BT\n/F1 12 Tf\n10 100 Td\n({}) Tj\nET\n", esc_parens(text));
        let mut pdf: Vec<u8> = Vec::new();
        // offsets[i] = byte offset of object i; index 0 is unused (object 0
        // is the conventional xref free-list head).
        let mut offsets: [usize; 6] = [0; 6];
        pdf.extend_from_slice(b"%PDF-1.4\n");
        offsets[1] = pdf.len();
        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
        offsets[2] = pdf.len();
        pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n");
        offsets[3] = pdf.len();
        pdf.extend_from_slice(b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 200 200] /Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>\nendobj\n");
        let stream_len = content.len();
        offsets[4] = pdf.len();
        pdf.extend_from_slice(
            format!("4 0 obj\n<< /Length {} >>\nstream\n", stream_len).as_bytes(),
        );
        pdf.extend_from_slice(content.as_bytes());
        pdf.extend_from_slice(b"endstream\nendobj\n");
        offsets[5] = pdf.len();
        pdf.extend_from_slice(
            b"5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n",
        );
        // Cross-reference table: free entry for object 0, then one 20-byte
        // in-use entry per object 1..=5 with its recorded byte offset.
        let xref_pos = pdf.len();
        let mut xref = String::new();
        xref.push_str("xref\n0 6\n");
        xref.push_str("0000000000 65535 f \n");
        for offset in offsets.iter().skip(1).take(5) {
            xref.push_str(&format!("{:010} 00000 n \n", offset));
        }
        pdf.extend_from_slice(xref.as_bytes());
        // Trailer points at the catalog and the xref table start.
        let trailer = format!(
            "trailer << /Size 6 /Root 1 0 R >>\nstartxref\n{}\n%%EOF\n",
            xref_pos
        );
        pdf.extend_from_slice(trailer.as_bytes());
        pdf
    }
    let pdf_path = td.child("doc.pdf");
    {
        let bytes = build_pdf_bytes("Banana apple banana grape apple banana orange");
        let mut f = std::fs::File::create(pdf_path.path()).unwrap();
        f.write_all(&bytes).unwrap();
    }
    let mut o = opts(ExportFormat::Json);
    o.combine = true;
    std::env::set_current_dir(td.path()).unwrap();
    let rep = analyze_path(td.path(), None, &o).expect("combined analysis runs");
    // The handwritten PDF must not appear among the failed files.
    assert!(
        !rep.failed_files
            .iter()
            .any(|(file, _)| file.ends_with("doc.pdf")),
        "PDF should be parsed successfully"
    );
    // Expected totals combine both TXT files and the PDF page text.
    let wf = load_wordfreq_map(td.path());
    assert_eq!(
        wf.get("apple").copied().unwrap_or(0),
        9,
        "apple count should be 9 (7 TXT + 2 PDF)"
    );
    assert_eq!(
        wf.get("banana").copied().unwrap_or(0),
        9,
        "banana count should be 9 (6 TXT + 3 PDF)"
    );
    assert_eq!(
        wf.get("grape").copied().unwrap_or(0),
        3,
        "grape count should be 3 (2 TXT + 1 PDF)"
    );
    assert_eq!(
        wf.get("orange").copied().unwrap_or(0),
        2,
        "orange count should be 2 (1 TXT + 1 PDF)"
    );
    assert_eq!(
        wf.get("and").copied().unwrap_or(0),
        3,
        "and count should be 3 (TXT only)"
    );
    assert_eq!(
        wf.get("pear").copied().unwrap_or(0),
        1,
        "pear count should be 1 (TXT only)"
    );
    // No per-file (non "combined_") wordfreq exports may exist.
    let non_combined_exists = std::fs::read_dir(td.path())
        .unwrap()
        .filter_map(|e| e.ok())
        .map(|e| e.path())
        .filter(|p| p.extension().map(|x| x == "json").unwrap_or(false))
        .filter(|p| {
            p.file_name()
                .and_then(|n| n.to_str())
                .map(|n| n.ends_with("_wordfreq.json"))
                .unwrap_or(false)
        })
        .any(|p| {
            !p.file_name()
                .unwrap()
                .to_str()
                .unwrap()
                .starts_with("combined_")
        });
    assert!(
        !non_combined_exists,
        "Expected only combined_*_wordfreq.json outputs in combined mode"
    );
}
/// Combined-mode analysis over two TXT files plus a hand-built multi-page PDF
/// that also contains noise tokens: verifies summed counts and the absence of
/// per-file exports.
#[test]
#[serial]
fn lib_combine_wordfreq_with_multipage_pdf_and_noise() {
    use std::io::Write as _;
    let td = assert_fs::TempDir::new().unwrap();
    let _f1 = write_file(
        &td,
        "a1.txt",
        "Apple and banana. Apple, orange; banana! Apple? Grape grape apple.",
    );
    let _f2 = write_file(
        &td,
        "a2.txt",
        "Banana and apple; banana and pear. Apple banana banana, apple!",
    );
    // Build a valid PDF with one page (plus content stream) per entry of
    // `pages`. Object layout: 1 = catalog, 2 = page tree, then for page i:
    // page object 3+2i and content stream 4+2i, finally one shared font.
    fn build_multipage_pdf_bytes(pages: &[&str]) -> Vec<u8> {
        // Parentheses delimit PDF string literals, so escape them.
        fn esc_parens(s: &str) -> String {
            s.replace('(', r"\(").replace(')', r"\)")
        }
        let n = pages.len();
        // The shared font gets the last object number after all page pairs.
        let font_id = 3 + 2 * n;
        let mut pdf: Vec<u8> = Vec::new();
        // offsets[i] = byte offset of object i (index 0 unused).
        let mut offsets: Vec<usize> = vec![0; font_id + 1];
        pdf.extend_from_slice(b"%PDF-1.4\n");
        offsets[1] = pdf.len();
        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
        offsets[2] = pdf.len();
        {
            // Page tree: reference every page object in /Kids.
            let kids: Vec<String> = (0..n).map(|i| format!("{} 0 R", 3 + 2 * i)).collect();
            let kids_arr = kids.join(" ");
            let pages_obj = format!(
                "2 0 obj\n<< /Type /Pages /Kids [ {} ] /Count {} >>\nendobj\n",
                kids_arr, n
            );
            pdf.extend_from_slice(pages_obj.as_bytes());
        }
        for (i, text) in pages.iter().enumerate() {
            let page_id = 3 + 2 * i;
            let cont_id = 4 + 2 * i;
            offsets[page_id] = pdf.len();
            // Page object pointing at its own content stream and the shared font.
            let page_obj = format!(
                "{id} 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 300 300] /Contents {cid} 0 R /Resources << /Font << /F1 {fid} 0 R >> >> >>\nendobj\n",
                id = page_id,
                cid = cont_id,
                fid = font_id
            );
            pdf.extend_from_slice(page_obj.as_bytes());
            // Content stream: one Tj text-showing operator per page.
            let content = format!("BT\n/F1 12 Tf\n10 200 Td\n({}) Tj\nET\n", esc_parens(text));
            offsets[cont_id] = pdf.len();
            pdf.extend_from_slice(
                format!(
                    "{cid} 0 obj\n<< /Length {len} >>\nstream\n",
                    cid = cont_id,
                    len = content.len()
                )
                .as_bytes(),
            );
            pdf.extend_from_slice(content.as_bytes());
            pdf.extend_from_slice(b"endstream\nendobj\n");
        }
        offsets[font_id] = pdf.len();
        pdf.extend_from_slice(
            format!(
                "{fid} 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n",
                fid = font_id
            )
            .as_bytes(),
        );
        // Cross-reference table: free entry for object 0, then one in-use
        // entry per object 1..=font_id with its recorded byte offset.
        let xref_pos = pdf.len();
        let mut xref = String::new();
        xref.push_str(&format!("xref\n0 {}\n", font_id + 1));
        xref.push_str("0000000000 65535 f \n");
        for offset in offsets.iter().skip(1).take(font_id) {
            xref.push_str(&format!("{:010} 00000 n \n", offset));
        }
        pdf.extend_from_slice(xref.as_bytes());
        // Trailer names the catalog and points at the xref table start.
        let trailer = format!(
            "trailer << /Size {size} /Root 1 0 R >>\nstartxref\n{pos}\n%%EOF\n",
            size = font_id + 1,
            pos = xref_pos
        );
        pdf.extend_from_slice(trailer.as_bytes());
        pdf
    }
    // Three pages, the second containing deliberate noise tokens.
    let pdf_bytes = build_multipage_pdf_bytes(&[
        "Banana apple banana grape apple banana orange kiwi",
        "Noise NOISE n123 tokens; apple banana banana pear.",
        "banana grape grape banana apple.",
    ]);
    let pdf_path = td.child("doc_multi.pdf");
    {
        let mut f = std::fs::File::create(pdf_path.path()).unwrap();
        f.write_all(&pdf_bytes).unwrap();
    }
    let mut o = opts(ExportFormat::Json);
    o.combine = true;
    std::env::set_current_dir(td.path()).unwrap();
    let rep = analyze_path(td.path(), None, &o).expect("combined analysis runs");
    // The multi-page PDF must not appear among the failed files.
    assert!(
        !rep.failed_files
            .iter()
            .any(|(file, _)| file.ends_with("doc_multi.pdf")),
        "Multi-page PDF should be parsed successfully"
    );
    // Totals combine both TXT files and all three PDF pages.
    let wf = load_wordfreq_map(td.path());
    assert_eq!(
        wf.get("apple").copied().unwrap_or(0),
        11,
        "apple total mismatch"
    );
    assert_eq!(
        wf.get("banana").copied().unwrap_or(0),
        13,
        "banana total mismatch"
    );
    assert_eq!(
        wf.get("grape").copied().unwrap_or(0),
        5,
        "grape total mismatch"
    );
    assert_eq!(
        wf.get("orange").copied().unwrap_or(0),
        2,
        "orange total mismatch"
    );
    assert_eq!(wf.get("and").copied().unwrap_or(0), 3, "and total mismatch");
    assert_eq!(
        wf.get("pear").copied().unwrap_or(0),
        2,
        "pear total mismatch"
    );
    // No per-file (non "combined_") wordfreq exports may exist.
    let non_combined_exists = std::fs::read_dir(td.path())
        .unwrap()
        .filter_map(|e| e.ok())
        .map(|e| e.path())
        .filter(|p| p.extension().map(|x| x == "json").unwrap_or(false))
        .filter(|p| {
            p.file_name()
                .and_then(|n| n.to_str())
                .map(|n| n.ends_with("_wordfreq.json"))
                .unwrap_or(false)
        })
        .any(|p| {
            !p.file_name()
                .unwrap()
                .to_str()
                .unwrap()
                .starts_with("combined_")
        });
    assert!(
        !non_combined_exists,
        "Expected only combined_*_wordfreq.json outputs in combined mode"
    );
}
/// Checks that the CSV exports (wordfreq, ngrams, PMI) are emitted already
/// sorted: count descending with lexicographic tie-breaks, and for PMI
/// count desc, then PMI desc, then word pair.
#[test]
#[serial]
fn lib_exports_are_sorted_by_frequency() {
    use std::fs;
    use std::io::Read;
    use std::path::Path;
    let td = assert_fs::TempDir::new().unwrap();
    // 'z' appears 10x, 'a' 6x, 'b' 4x, 'c' 2x across the two halves.
    let _f = write_file(
        &td,
        "sorted.txt",
        "z z z z z a a a b b c | z z z z z a a a b b c",
    );
    let mut o = opts(ExportFormat::Csv);
    o.combine = false;
    o.ngram = 2;
    o.context = 2;
    std::env::set_current_dir(td.path()).unwrap();
    analyze_path(td.path(), None, &o).expect("analysis runs");
    // Locate the newest CSV whose name ends with `suffix` (sort + pop keeps
    // the lexicographically last, i.e. latest timestamped, file).
    fn find_csv<P: AsRef<Path>>(dir: P, suffix: &str) -> std::path::PathBuf {
        let mut matches: Vec<_> = fs::read_dir(dir)
            .unwrap()
            .filter_map(|e| e.ok())
            .map(|e| e.path())
            .filter(|p| p.extension().map(|x| x == "csv").unwrap_or(false))
            .filter(|p| {
                p.file_name()
                    .and_then(|n| n.to_str())
                    .map(|n| n.ends_with(suffix))
                    .unwrap_or(false)
            })
            .collect();
        matches.sort();
        matches
            .pop()
            .unwrap_or_else(|| panic!("no CSV with suffix {}", suffix))
    }
    // Read a CSV file as raw lines (no CSV parsing; test data has no quoting).
    fn read_csv_lines(p: &Path) -> Vec<String> {
        let mut s = String::new();
        std::fs::File::open(p)
            .unwrap()
            .read_to_string(&mut s)
            .unwrap();
        s.lines().map(|ln| ln.to_string()).collect()
    }
    let wf_csv = find_csv(td.path(), "_wordfreq.csv");
    let wf_lines = read_csv_lines(&wf_csv);
    assert!(wf_lines.len() >= 5, "needs header + at least 4 rows");
    // Split "item,count" on the first comma only.
    let parse = |row: &str| {
        let mut it = row.splitn(2, ',');
        let item = it.next().unwrap().to_string();
        let cnt: usize = it.next().unwrap().parse().unwrap();
        (item, cnt)
    };
    // The rows as exported must equal the rows re-sorted by (count desc, item asc).
    let wf_rows: Vec<(String, usize)> = wf_lines.iter().skip(1).map(|r| parse(r)).collect();
    let mut wf_sorted = wf_rows.clone();
    wf_sorted.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    assert_eq!(wf_rows, wf_sorted, "wordfreq CSV is not sorted as expected");
    // Sanity: the most frequent word is 'z' with 10 occurrences.
    let max = wf_rows.iter().max_by_key(|(_, c)| *c).unwrap();
    assert_eq!(max, &("z".to_string(), 10));
    // Same ordering contract for the n-gram export.
    let ng_csv = find_csv(td.path(), "_ngrams.csv");
    let ng_lines = read_csv_lines(&ng_csv);
    let ng_rows: Vec<(String, usize)> = ng_lines.iter().skip(1).map(|r| parse(r)).collect();
    let mut ng_sorted = ng_rows.clone();
    ng_sorted.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    assert_eq!(ng_rows, ng_sorted, "ngrams CSV is not sorted as expected");
    // PMI export: columns are word1, word2, distance?, count, pmi.
    let pmi_csv = find_csv(td.path(), "_pmi.csv");
    let pmi_lines = read_csv_lines(&pmi_csv);
    #[derive(Clone, Debug, PartialEq)]
    struct Row {
        w1: String,
        w2: String,
        d: usize,
        c: usize,
        p: f64,
    }
    let parse_pmi = |row: &str| {
        let cols: Vec<&str> = row.split(',').collect();
        Row {
            w1: cols[0].to_string(),
            w2: cols[1].to_string(),
            d: cols[2].parse().unwrap(),
            c: cols[3].parse().unwrap(),
            p: cols[4].parse().unwrap(),
        }
    };
    // Expected order: count desc, then PMI desc, then (w1, w2) asc.
    let pmi_rows: Vec<Row> = pmi_lines.iter().skip(1).map(|r| parse_pmi(r)).collect();
    let mut pmi_sorted = pmi_rows.clone();
    pmi_sorted.sort_by(|a, b| {
        b.c.cmp(&a.c)
            .then_with(|| b.p.partial_cmp(&a.p).unwrap_or(std::cmp::Ordering::Equal))
            .then_with(|| a.w1.cmp(&b.w1))
            .then_with(|| a.w2.cmp(&b.w2))
    });
    assert_eq!(pmi_rows, pmi_sorted, "PMI CSV is not sorted as expected");
}
#[test]
#[serial]
fn stdout_summary_order_top20_sections_and_content() {
    use std::env;
    use std::path::Path;
    let td = assert_fs::TempDir::new().unwrap();
    // 200 repetitions give each token an identical, high count.
    let body = "alpha beta gamma ".repeat(200);
    write_file(&td, "s.txt", &body);
    let mut options = opts(ExportFormat::Csv);
    options.combine = false;
    options.ngram = 2;
    options.context = 2;
    options.stem_mode = text_analysis::StemMode::Off;
    options.stem_require_detected = false;
    env::set_current_dir(td.path()).unwrap();
    let report = analyze_path(Path::new(td.path()), None, &options).expect("analysis runs");
    let summary = report.summary;
    // The three section headers appear in the order n-grams -> PMI -> words.
    let pos_ngrams = summary
        .find("Top 20 n-grams:")
        .expect("n-grams section missing");
    let pos_pmi = summary
        .find("Top 20 PMI (by count, then PMI):")
        .expect("PMI section missing");
    let pos_words = summary.find("Top 20 words:").expect("words section missing");
    assert!(
        pos_ngrams < pos_pmi && pos_pmi < pos_words,
        "section order must be n-grams -> PMI -> words"
    );
    // N-gram rows follow the expected ordering.
    let pos_ab = summary
        .find("\n alpha beta\t")
        .expect("missing 'alpha beta' in n-grams");
    let pos_bg = summary
        .find("\n beta gamma\t")
        .expect("missing 'beta gamma' in n-grams");
    let pos_ga = summary
        .find("\n gamma alpha\t")
        .expect("missing 'gamma alpha' in n-grams");
    assert!(
        pos_ab < pos_bg && pos_bg < pos_ga,
        "n-grams not sorted as expected"
    );
    // PMI rows follow the expected ordering.
    let pos_pmi_ab = summary
        .find(" (alpha, beta) ")
        .expect("missing (alpha, beta) in PMI");
    let pos_pmi_bg = summary
        .find(" (beta, gamma) ")
        .expect("missing (beta, gamma) in PMI");
    assert!(
        pos_pmi_ab < pos_pmi_bg,
        "PMI section not sorted by (count desc, then PMI desc)"
    );
    // Word rows follow the expected ordering.
    let pos_alpha = summary.find("\n alpha\t").expect("missing 'alpha' in words");
    let pos_beta = summary.find("\n beta\t").expect("missing 'beta' in words");
    let pos_gamma = summary.find("\n gamma\t").expect("missing 'gamma' in words");
    assert!(
        pos_alpha < pos_beta && pos_beta < pos_gamma,
        "word list not sorted as expected"
    );
}
#[test]
#[serial]
fn lib_stem_strict_per_file_skips_and_reports_v2() {
    use assert_fs::{TempDir, prelude::*};
    use std::fs;
    use std::path::Path;
    use text_analysis::{AnalysisOptions, ExportFormat, StemMode, analyze_path, stem_for};
    let td = TempDir::new().unwrap();
    td.child("good.txt")
        .write_str("This is a clear English text. Stemming should be possible.")
        .unwrap();
    td.child("bad.txt")
        .write_str("???? #### !!!! 12345 @@@@")
        .unwrap();
    // Strict per-file stemming with auto language detection.
    let options = AnalysisOptions {
        ngram: 2,
        context: 3,
        export_format: ExportFormat::Json,
        entities_only: false,
        combine: false,
        stem_mode: StemMode::Auto,
        stem_require_detected: true,
    };
    std::env::set_current_dir(td.path()).unwrap();
    let report = analyze_path(Path::new(td.path()), None, &options)
        .expect("per-file strict: analysis should succeed");
    // Exactly the gibberish file is reported as skipped.
    assert_eq!(
        report.failed_files.len(),
        1,
        "exactly one file should be skipped"
    );
    let warned = report.failed_files[0].0.to_string();
    assert!(
        warned.ends_with("bad.txt"),
        "skipped file should be bad.txt, got: {warned}"
    );
    // The good file still produces JSON exports...
    let json_outputs: Vec<_> = fs::read_dir(td.path())
        .unwrap()
        .filter_map(|e| e.ok())
        .map(|e| e.path())
        .filter(|p| p.extension().is_some_and(|x| x == "json"))
        .collect();
    assert!(
        !json_outputs.is_empty(),
        "no JSON outputs produced for the good file"
    );
    // ...while nothing named after bad.txt's stem was written.
    let bad_stem = stem_for(Path::new(td.path()).join("bad.txt").as_path());
    for path in &json_outputs {
        let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
        assert!(
            !name.starts_with(&bad_stem),
            "found an output for bad.txt (should be skipped): {}",
            name
        );
    }
}
#[test]
#[serial]
fn cli_stem_strict_combined_aborts_cleanly_v2() {
use assert_cmd::prelude::*;
use assert_fs::{TempDir, prelude::*};
use predicates::prelude::*;
use std::process::Command;
let td = TempDir::new().unwrap();
td.child("ok.txt")
.write_str("English content here. This should be detected and stemmed.")
.unwrap();
td.child("noise.txt")
.write_str("@@@@ #### !!!! ???? 12345 ~~~~~")
.unwrap();
let mut cmd = Command::cargo_bin("text_analysis").unwrap();
let _assert = cmd
.current_dir(td.path())
.arg(td.path()) .arg("--combine")
.arg("--stem")
.arg("--stem-strict")
.arg("--export-format")
.arg("json")
.assert()
.failure() .stderr(
predicate::str::contains("Combined run aborted")
.or(predicate::str::contains("Error: Combined run aborted")),
);
let any_outputs = std::fs::read_dir(td.path())
.unwrap()
.filter_map(|e| e.ok())
.any(|e| {
let p = e.path();
let is_result = p
.extension()
.map(|x| {
let x = x.to_string_lossy();
x == "json" || x == "csv" || x == "tsv" || x == "txt"
})
.unwrap_or(false);
is_result
&& !p
.file_name()
.unwrap_or_default()
.to_string_lossy()
.ends_with(".txt") });
assert!(
!any_outputs,
"no outputs should be written in strict-combined abort"
);
}
#[test]
#[serial]
fn cli_stem_strict_per_file_skips_and_reports_v2() {
    use assert_cmd::prelude::*;
    use assert_fs::{TempDir, prelude::*};
    use std::process::Command;
    let td = TempDir::new().unwrap();
    td.child("clear_en.txt")
        .write_str("This is very clearly English. Stemming should work.")
        .unwrap();
    td.child("undetected.txt")
        .write_str("%%%% ????? 00000 +++++ ^^^^^")
        .unwrap();
    // Per-file strict mode via the CLI: capture the full process output.
    let output = Command::cargo_bin("text_analysis")
        .unwrap()
        .current_dir(td.path())
        .arg(td.path())
        .arg("--stem")
        .arg("--stem-strict")
        .arg("--export-format")
        .arg("csv")
        .output()
        .expect("cli should run");
    assert!(
        output.status.success(),
        "per-file strict should succeed, stderr: {}",
        String::from_utf8_lossy(&output.stderr)
    );
    // The undetected file is surfaced as a warning on stderr...
    let stderr_text = String::from_utf8_lossy(&output.stderr);
    assert!(
        stderr_text.contains("Warnings"),
        "stderr should contain a warnings header"
    );
    assert!(
        stderr_text.contains("undetected.txt"),
        "stderr should reference undetected.txt"
    );
    // ...while the detectable file still yields CSV exports.
    let any_csv = std::fs::read_dir(td.path())
        .unwrap()
        .filter_map(|e| e.ok())
        .any(|e| e.path().extension().is_some_and(|x| x == "csv"));
    assert!(any_csv, "expected at least one CSV output");
}
#[test]
fn csv_writer_sanitizes_and_quotes_correctly() {
    let mut sink = Vec::new();
    {
        let mut writer = WriterBuilder::new().from_writer(&mut sink);
        writer.write_record(["token", "note"]).unwrap();
        // A formula-looking cell must be neutralized with a leading apostrophe.
        let formula = r#"=HYPERLINK("https://x")"#.to_string();
        writer
            .write_record([csv_safe_cell(formula), "ok".to_string()])
            .unwrap();
        // A cell with an embedded newline forces quoting but must stay intact.
        let with_newline = "=BAD\nNEXT".to_string();
        writer
            .write_record([csv_safe_cell(with_newline), "1".to_string()])
            .unwrap();
        writer.flush().unwrap();
    }
    let rendered = String::from_utf8(sink).unwrap();
    assert!(
        rendered.contains("'=HYPERLINK"),
        "CSV must prefix '=' at start of cell"
    );
    assert!(
        rendered.contains(r#"'=HYPERLINK(""https://x"")"#),
        "inner quotes should be escaped (doubled)"
    );
    assert!(
        rendered.contains("'=BAD\nNEXT"),
        "newline preserved in quoted field"
    );
}
#[test]
fn tsv_writer_sanitizes_first_cell_and_uses_tab_delimiter() {
    let mut sink = Vec::new();
    {
        let mut writer = WriterBuilder::new().delimiter(b'\t').from_writer(&mut sink);
        writer.write_record(["token", "n"]).unwrap();
        writer
            .write_record([csv_safe_cell("=X".into()), "1".into()])
            .unwrap();
        writer.flush().unwrap();
    }
    let rendered = String::from_utf8(sink).unwrap();
    // Skip the header; the data row must start with the sanitized cell + tab.
    let data_row = rendered.lines().nth(1).unwrap_or("");
    assert!(
        data_row.starts_with("'=X\t1"),
        "TSV row must start with \"'=X\\t1\", got: {:?}",
        data_row
    );
}
#[test]
fn no_double_prefix_when_cell_already_safe() {
    // A cell already starting with the guard apostrophe is left untouched.
    let guarded = "'@SAFE".to_string();
    assert_eq!(
        csv_safe_cell(guarded.clone()),
        guarded,
        "must not add a second quote"
    );
    // Ordinary content passes through unchanged.
    let plain = "normal".to_string();
    assert_eq!(
        csv_safe_cell(plain.clone()),
        plain,
        "normal cells should remain unchanged"
    );
}