#![forbid(unsafe_code)]
#![doc(
    html_logo_url = "https://raw.githubusercontent.com/ArdentEmpiricist/text_analysis/main/assets/text_analysis_logo.png"
)]
#![doc = r#"
Text Analysis Library

This crate provides a fast, pragmatic toolkit for linguistic text analysis over `.txt`, `.pdf`, `.docx`, and `.odt`
files. It supports:

- Tokenization (Unicode-aware, simple alphanumeric rules)
- Optional stopword filtering (user-supplied list)
- Optional stemming (auto-detected or forced language)
- N-gram counting
- Word frequency counting
- Context statistics (±N window) and direct neighbors (±1)
- PMI (Pointwise Mutual Information) collocations
- Simple named-entity extraction (capitalization heuristic)
- Parallel per-file analysis (compute) with serialized writes
- Combined (map-reduce) mode that aggregates counts across files
- **Deterministic, sorted outputs** in CSV/TSV/JSON/TXT

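## Quick example

A minimal, illustrative in-memory call. The import path assumes the crate is
named `text_analysis`; the option values are arbitrary choices for the sketch:

```rust
use std::collections::HashSet;
use text_analysis::{analyze_text_with, AnalysisOptions, ExportFormat, StemMode};

let opts = AnalysisOptions {
    ngram: 2,
    context: 5,
    export_format: ExportFormat::Csv,
    entities_only: false,
    combine: false,
    stem_mode: StemMode::Off,
    stem_require_detected: false,
};
// Analyze a string directly; no files are read or written here.
let result = analyze_text_with("The quick brown fox jumps over the lazy dog.", &HashSet::new(), &opts);
assert!(result.wordfreq.contains_key("the"));
```
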
## Security & CSV/TSV export safety

If you open CSV/TSV output in spreadsheet software (Excel/LibreOffice), cells that **start with** one of
`=`, `+`, `-`, or `@` may be interpreted as formulas (e.g., `=HYPERLINK(...)`). To prevent this, the exporters **always**:

1. Write CSV/TSV with a proper CSV library (`csv::Writer`), so commas, tabs, quotes, and newlines are escaped correctly.
2. Sanitize **text cells** by prefixing a single quote when they begin with one of the dangerous characters.

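For example, a cell beginning with `=` is neutralized by [`csv_safe_cell`]
(the assertions below mirror exactly what the exporters apply to text cells;
the `text_analysis` crate name is an assumption of the sketch):

```rust
use text_analysis::csv_safe_cell;

assert_eq!(csv_safe_cell("=SUM(A1)".to_string()), "'=SUM(A1)");
assert_eq!(csv_safe_cell("plain".to_string()), "plain");
```
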
"#]

use chrono::Local;
use rayon::prelude::*;
use std::collections::{HashMap, HashSet};
use std::fs;
use std::hash::{Hash, Hasher};
use std::path::{Path, PathBuf};
use whatlang::{Lang, detect};

use pdf_extract::extract_text;

mod office;

/// Output format for all exported tables and reports.
#[derive(Copy, Clone, Debug)]
pub enum ExportFormat {
    Txt,
    Csv,
    Tsv,
    Json,
}

/// How stemming is applied during normalization.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum StemMode {
    /// No stemming.
    Off,
    /// Detect the language per text and stem when it is supported.
    Auto,
    /// Always stem with the given language.
    Force(StemLang),
}

/// Languages with a supported Snowball stemmer (plus `Unknown` when no stemmer applies).
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum StemLang {
    Unknown,
    En,
    De,
    Fr,
    Es,
    It,
    Pt,
    Nl,
    Ru,
    Sv,
    Fi,
    No,
    Ro,
    Hu,
    Da,
    Tr,
}
impl StemLang {
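    /// Parses a two-letter ISO 639-1 code (case-insensitive) into a supported
    /// stemming language; returns `None` for anything unsupported.
    ///
    /// A small sketch (assumes the crate is published under the name `text_analysis`):
    ///
    /// ```
    /// use text_analysis::StemLang;
    ///
    /// assert_eq!(StemLang::from_code("EN"), Some(StemLang::En));
    /// assert_eq!(StemLang::from_code("xx"), None);
    /// ```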
    pub fn from_code(code: &str) -> Option<Self> {
        use StemLang::*;
        let c = code.to_ascii_lowercase();
        Some(match c.as_str() {
            "en" => En,
            "de" => De,
            "fr" => Fr,
            "es" => Es,
            "it" => It,
            "pt" => Pt,
            "nl" => Nl,
            "ru" => Ru,
            "sv" => Sv,
            "fi" => Fi,
            "no" => No,
            "ro" => Ro,
            "hu" => Hu,
            "da" => Da,
            "tr" => Tr,
            _ => return None,
        })
    }

    /// Maps a `whatlang` detection result onto a supported stemming language,
    /// falling back to `Unknown`.
    pub fn from_whatlang(lang: Lang) -> Self {
        match lang.code() {
            "eng" => StemLang::En,
            "deu" => StemLang::De,
            "fra" | "fre" => StemLang::Fr,
            "spa" => StemLang::Es,
            "ita" => StemLang::It,
            "por" => StemLang::Pt,
            "nld" | "dut" => StemLang::Nl,
            "rus" => StemLang::Ru,
            "swe" => StemLang::Sv,
            "fin" => StemLang::Fi,
            "nor" | "nob" | "nno" => StemLang::No,
            "ron" | "rum" => StemLang::Ro,
            "hun" => StemLang::Hu,
            "dan" => StemLang::Da,
            "tur" => StemLang::Tr,
            _ => StemLang::Unknown,
        }
    }
}

/// Configuration for an analysis run.
#[derive(Clone, Debug)]
pub struct AnalysisOptions {
    /// N-gram size used for counting (e.g., 2 for bigrams).
    pub ngram: usize,
    /// Context window size (±N tokens) for context stats and PMI.
    pub context: usize,
    /// Output format for all exports.
    pub export_format: ExportFormat,
    /// Export only the named-entity table.
    pub entities_only: bool,
    /// Aggregate counts across all files (map-reduce) instead of per file.
    pub combine: bool,
    /// Stemming behavior (off, auto-detected, or forced language).
    pub stem_mode: StemMode,
    /// In `Auto` mode, treat files whose language cannot be detected (or is
    /// unsupported for stemming) as failures instead of skipping stemming.
    pub stem_require_detected: bool,
}

/// Outcome of an [`analyze_path`] run.
#[derive(Debug)]
pub struct AnalysisReport {
    /// Human-readable summary of the top results.
    pub summary: String,
    /// Files that could not be processed, as `(path, error)` pairs.
    pub failed_files: Vec<(String, String)>,
}

/// All statistics computed for one text (or one combined corpus).
#[derive(Debug, Default)]
pub struct AnalysisResult {
    pub ngrams: HashMap<String, usize>,
    pub wordfreq: HashMap<String, usize>,
    pub context_map: HashMap<String, HashMap<String, usize>>,
    pub direct_neighbors: HashMap<String, HashMap<String, usize>>,
    pub named_entities: HashMap<String, usize>,
    pub pmi: Vec<PmiEntry>,
}

/// One collocation row: an unordered word pair at a given token distance.
#[derive(Debug)]
pub struct PmiEntry {
    pub word1: String,
    pub word2: String,
    pub distance: usize,
    pub count: usize,
    pub pmi: f64,
}

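/// Intermediate per-file counts for the combined (map-reduce) mode: each file
/// is mapped to a `PartialCounts` in parallel, the partials are merged with
/// `merge_counts`, and the total is finalized by `analysis_from_counts`.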
#[derive(Default)]
struct PartialCounts {
    n_tokens: usize,
    ngrams: HashMap<String, usize>,
    wordfreq: HashMap<String, usize>,
    context_pairs: HashMap<(String, String), usize>,
    neighbor_pairs: HashMap<(String, String), usize>,
    cooc_by_dist: HashMap<(String, String, usize), usize>,
    named_entities: HashMap<String, usize>,
}

pub use office::{extract_text_from_docx, extract_text_from_odt};

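/// Analyzes a single file or a directory tree of supported documents.
///
/// Collects every `.txt`/`.pdf`/`.docx`/`.odt` file under `path`, analyzes the
/// files in parallel, writes the selected exports into the current working
/// directory, and returns a summary plus a list of per-file failures. With
/// `options.combine`, counts from all files are aggregated into a single
/// "combined" output set instead of one set per file.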
pub fn analyze_path(
    path: &Path,
    stopwords_file: Option<&PathBuf>,
    options: &AnalysisOptions,
) -> Result<AnalysisReport, String> {
    let files = collect_files(path);
    if files.is_empty() {
        return Err("No .txt, .pdf, .docx or .odt files found for analysis.".to_string());
    }

    let stopwords = load_stopwords(stopwords_file);
    let mut failed: Vec<(String, String)> = Vec::new();
    let ts = timestamp();

    if options.combine {
        // Map phase: one PartialCounts per file, computed in parallel.
        let mapped: Vec<_> = files
            .par_iter()
            .map(|f| match read_text(f) {
                Ok(t) => {
                    if matches!(options.stem_mode, StemMode::Auto)
                        && options.stem_require_detected
                        && detect_supported_stem_lang(&t).is_none()
                    {
                        return Err((
                            f.display().to_string(),
                            "Language detection failed or unsupported for stemming (strict)"
                                .to_string(),
                        ));
                    }
                    Ok(partial_counts_from_text(&t, &stopwords, options))
                }
                Err(e) => Err((f.display().to_string(), e)),
            })
            .collect();

        // Reduce phase: merge all partials into a single total.
        let mut total = PartialCounts::default();
        let mut failed_local: Vec<(String, String)> = Vec::new();
        for item in mapped {
            match item {
                Ok(pc) => merge_counts(&mut total, pc),
                Err(fe) => failed_local.push(fe),
            }
        }
        if options.stem_require_detected && !failed_local.is_empty() {
            let msg = format!(
                "Combined run aborted (strict stemming): {} file(s) without detectable/supported language",
                failed_local.len()
            );
            return Err(msg);
        }
        failed.extend(failed_local);

        let result = analysis_from_counts(total);
        write_all_outputs("combined", &result, &ts, options)?;
        let summary = summary_for(&[("combined".to_string(), &result)], options);
        return Ok(AnalysisReport {
            summary,
            failed_files: failed,
        });
    }

    // Per-file mode: compute in parallel, then write outputs serially.
    let results: Vec<_> = files
        .par_iter()
        .map(|f| match read_text(f) {
            Ok(t) => {
                if matches!(options.stem_mode, StemMode::Auto)
                    && options.stem_require_detected
                    && detect_supported_stem_lang(&t).is_none()
                {
                    return Err((
                        f.display().to_string(),
                        "Language detection failed or unsupported for stemming (strict)"
                            .to_string(),
                    ));
                }
                let r = analyze_text_with(&t, &stopwords, options);
                let stem = stem_for(f);
                Ok((stem, r))
            }
            Err(e) => Err((f.display().to_string(), e)),
        })
        .collect();

    let mut per_file_results: Vec<(String, AnalysisResult)> = Vec::new();
    for item in results {
        match item {
            Ok(v) => per_file_results.push(v),
            Err(fe) => failed.push(fe),
        }
    }

    for (stem, r) in &per_file_results {
        write_all_outputs(stem, r, &ts, options)?;
    }

    let pairs: Vec<(String, &AnalysisResult)> = per_file_results
        .iter()
        .map(|(n, r)| (n.clone(), r))
        .collect();
    let summary = summary_for(&pairs, options);
    Ok(AnalysisReport {
        summary,
        failed_files: failed,
    })
}

/// Collects all supported files under `path` (a single file, or a directory
/// walked recursively, following symlinks).
pub fn collect_files(path: &Path) -> Vec<PathBuf> {
    let mut out = Vec::new();
    if path.is_file() {
        if is_supported(path) {
            out.push(path.to_path_buf());
        }
    } else if path.is_dir() {
        let walker = walkdir::WalkDir::new(path).follow_links(true);
        for entry in walker.into_iter().filter_map(Result::ok) {
            let p = entry.path();
            if p.is_file() && is_supported(p) {
                out.push(p.to_path_buf());
            }
        }
    }
    out
}

fn is_supported(p: &Path) -> bool {
    matches!(
        p.extension()
            .and_then(|e| e.to_str())
            .map(|s| s.to_ascii_lowercase()),
        Some(ref e) if e == "txt" || e == "pdf" || e == "docx" || e == "odt"
    )
}

fn read_text(p: &Path) -> Result<String, String> {
    let ext = p
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or("")
        .to_ascii_lowercase();
    match ext.as_str() {
        "txt" => fs::read_to_string(p).map_err(|e| format!("Read .txt failed: {e}")),
        "pdf" => extract_text(p).map_err(|e| format!("PDF extract failed: {e}")),
        "docx" => office::extract_text_from_docx(p),
        "odt" => office::extract_text_from_odt(p),
        _ => Err("Unsupported extension".to_string()),
    }
}

fn load_stopwords(p: Option<&PathBuf>) -> HashSet<String> {
    let mut set = HashSet::new();
    if let Some(file) = p
        && let Ok(txt) = fs::read_to_string(file)
    {
        for line in txt.lines() {
            let w = line.trim();
            if !w.is_empty() {
                set.insert(w.to_string());
            }
        }
    }
    set
}

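/// Runs the full analysis pipeline on a single text: resolves the stemming
/// language, tokenizes, normalizes (lowercase, stopword filter, optional
/// stemming), and fills an [`AnalysisResult`] with n-grams, word frequencies,
/// context/neighbor statistics, named entities, and PMI collocations.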
pub fn analyze_text_with(
    text: &str,
    stopwords: &HashSet<String>,
    opts: &AnalysisOptions,
) -> AnalysisResult {
    let stem_lang = match opts.stem_mode {
        StemMode::Off => StemLang::Unknown,
        StemMode::Force(lang) => lang,
        StemMode::Auto => detect(text)
            .map(|i| StemLang::from_whatlang(i.lang()))
            .unwrap_or(StemLang::Unknown),
    };

    let original_tokens = tokenize(text);
    let sentences = split_sentences(text);
    let tokens_for_stats = normalize_for_stats(&original_tokens, stopwords, stem_lang);

    let mut result = AnalysisResult::default();
    ngrams_count(&tokens_for_stats, opts.ngram, &mut result.ngrams);
    wordfreq_count(&tokens_for_stats, &mut result.wordfreq);
    context_and_neighbors(
        &tokens_for_stats,
        opts.context,
        &mut result.context_map,
        &mut result.direct_neighbors,
    );
    named_entities_heuristic(&original_tokens, &sentences, &mut result.named_entities);
    compute_pmi(
        &tokens_for_stats,
        opts.context,
        &result.wordfreq,
        &mut result.pmi,
    );

    result
}

/// Splits text into tokens of alphanumeric characters (plus apostrophes);
/// everything else is a separator.
fn tokenize(text: &str) -> Vec<String> {
    let mut out = Vec::with_capacity(text.len() / 5);
    let mut cur = String::new();
    for ch in text.chars() {
        if ch.is_alphanumeric() || ch == '\'' {
            cur.push(ch);
        } else if !cur.is_empty() {
            out.push(std::mem::take(&mut cur));
        }
    }
    if !cur.is_empty() {
        out.push(cur);
    }
    out
}

/// Returns the byte offsets at which sentences start (position 0, plus the
/// offset after each `.`, `!`, or `?`).
fn split_sentences(text: &str) -> Vec<usize> {
    let mut starts = vec![0usize];
    let mut idx = 0usize;
    for ch in text.chars() {
        idx += ch.len_utf8();
        if ch == '.' || ch == '!' || ch == '?' {
            starts.push(idx);
        }
    }
    starts.sort_unstable();
    starts
}

/// Lowercases tokens, drops stopwords, and (when a stemmer is available)
/// stems the remainder.
fn normalize_for_stats(
    tokens: &[String],
    stopwords: &HashSet<String>,
    stem_lang: StemLang,
) -> Vec<String> {
    let mut out = Vec::with_capacity(tokens.len());
    let stemmer = make_stemmer(stem_lang);
    for t in tokens {
        let lower = t.to_lowercase();
        if !stopwords.is_empty() && stopwords.contains(&lower) {
            continue;
        }
        let normalized = if let Some(stem) = &stemmer {
            stem.stem(&lower).to_string()
        } else {
            lower
        };
        out.push(normalized);
    }
    out
}

fn make_stemmer(lang: StemLang) -> Option<rust_stemmers::Stemmer> {
    use StemLang::*;
    use rust_stemmers::{Algorithm, Stemmer};
    let algo = match lang {
        En => Algorithm::English,
        De => Algorithm::German,
        Fr => Algorithm::French,
        Es => Algorithm::Spanish,
        It => Algorithm::Italian,
        Pt => Algorithm::Portuguese,
        Nl => Algorithm::Dutch,
        Ru => Algorithm::Russian,
        Sv => Algorithm::Swedish,
        Fi => Algorithm::Finnish,
        No => Algorithm::Norwegian,
        Ro => Algorithm::Romanian,
        Hu => Algorithm::Hungarian,
        Da => Algorithm::Danish,
        Tr => Algorithm::Turkish,
        Unknown => return None,
    };
    Some(Stemmer::create(algo))
}

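/// Counts space-joined n-grams of size `n`; for tokens `[a, b, c]` and `n = 2`
/// this yields `"a b"` and `"b c"`, each with count 1.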
fn ngrams_count(tokens: &[String], n: usize, out: &mut HashMap<String, usize>) {
    if n == 0 || tokens.len() < n {
        return;
    }
    for i in 0..=tokens.len() - n {
        let mut buf = String::with_capacity(n * 6);
        for (k, t) in tokens[i..i + n].iter().enumerate() {
            if k > 0 {
                buf.push(' ');
            }
            buf.push_str(t);
        }
        *out.entry(buf).or_insert(0) += 1;
    }
}

fn wordfreq_count(tokens: &[String], out: &mut HashMap<String, usize>) {
    for t in tokens {
        *out.entry(t.clone()).or_insert(0) += 1;
    }
}

/// For every token, counts all neighbors within ±`window` (`context_map`) and
/// the immediate ±1 neighbors (`direct_neighbors`).
fn context_and_neighbors(
    tokens: &[String],
    window: usize,
    context_map: &mut HashMap<String, HashMap<String, usize>>,
    direct_neighbors: &mut HashMap<String, HashMap<String, usize>>,
) {
    if window == 0 {
        return;
    }
    let len = tokens.len();

    for (i, w) in tokens.iter().enumerate() {
        let left = i.saturating_sub(window);
        let right = (i + window + 1).min(len);

        let entry = context_map.entry(w.clone()).or_default();
        for (j_off, neighbor) in tokens[left..right].iter().enumerate() {
            let j = left + j_off;
            if j == i {
                continue;
            }
            *entry.entry(neighbor.clone()).or_insert(0) += 1;
        }

        let neigh = direct_neighbors.entry(w.clone()).or_default();
        if i > 0 {
            *neigh.entry(tokens[i - 1].clone()).or_insert(0) += 1;
        }
        if i + 1 < len {
            *neigh.entry(tokens[i + 1].clone()).or_insert(0) += 1;
        }
    }
}

/// Crude named-entity heuristic: counts tokens that start with an uppercase
/// letter, skipping all-caps tokens and a small article list (EN/DE/FR/ES/IT).
fn named_entities_heuristic(
    original_tokens: &[String],
    _sentence_starts: &[usize],
    out: &mut HashMap<String, usize>,
) {
    for tok in original_tokens {
        if tok
            .chars()
            .next()
            .map(|c| c.is_uppercase())
            .unwrap_or(false)
        {
            // Skip ALL-CAPS tokens (e.g., acronyms).
            if tok.chars().all(|c| !c.is_lowercase()) {
                continue;
            }
            let lower = tok.to_lowercase();
            if [
                "the", "a", "an", "der", "die", "das", "ein", "eine", "le", "la", "les", "un",
                "una", "el", "los", "las", "il", "lo", "gli", "i",
            ]
            .contains(&lower.as_str())
            {
                continue;
            }
            *out.entry(tok.clone()).or_insert(0) += 1;
        }
    }
}

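/// Computes pointwise mutual information for unordered token pairs that
/// co-occur within ±`window`, tracked per distance `d`:
///
/// PMI(x, y) = ln( p(x, y) / (p(x) · p(y)) )
///
/// with p(x, y) = count(x, y, d) / N, p(x) = count(x) / N, and N the number of
/// normalized tokens. Results are sorted by PMI (descending), then by count.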
fn compute_pmi(
    tokens: &[String],
    window: usize,
    wordfreq: &HashMap<String, usize>,
    out: &mut Vec<PmiEntry>,
) {
    if window == 0 || tokens.len() < 2 {
        return;
    }
    let total_tokens = tokens.len() as f64;

    let mut pair_counts: HashMap<(String, String, usize), usize> = HashMap::new();
    for i in 0..tokens.len() {
        let w1 = &tokens[i];
        let left = i.saturating_sub(window);
        let right = (i + window + 1).min(tokens.len());
        for (j_off, w2) in tokens[left..right].iter().enumerate() {
            let j = left + j_off;
            if j == i {
                continue;
            }
            let d = (i as isize - j as isize).unsigned_abs();
            // Order the pair lexicographically so (a, b) and (b, a) share a key.
            let key = if w1 <= w2 {
                (w1.clone(), w2.clone(), d)
            } else {
                (w2.clone(), w1.clone(), d)
            };
            *pair_counts.entry(key).or_insert(0) += 1;
        }
    }

    out.clear();
    out.reserve(pair_counts.len());
    for ((w1, w2, d), c) in pair_counts {
        let c1 = *wordfreq.get(&w1).unwrap_or(&1) as f64;
        let c2 = *wordfreq.get(&w2).unwrap_or(&1) as f64;
        let p_xy = (c as f64) / total_tokens;
        let p_x = c1 / total_tokens;
        let p_y = c2 / total_tokens;
        let pmi = (p_xy / (p_x * p_y)).ln();
        out.push(PmiEntry {
            word1: w1,
            word2: w2,
            distance: d,
            count: c,
            pmi,
        });
    }

    out.sort_by(|a, b| {
        b.pmi
            .partial_cmp(&a.pmi)
            .unwrap_or(std::cmp::Ordering::Equal)
            .then(b.count.cmp(&a.count))
    });
}

/// Map phase for combined mode: computes all raw counts for one text.
fn partial_counts_from_text(
    text: &str,
    stopwords: &HashSet<String>,
    opts: &AnalysisOptions,
) -> PartialCounts {
    let stem_lang = match opts.stem_mode {
        StemMode::Off => StemLang::Unknown,
        StemMode::Force(lang) => lang,
        StemMode::Auto => detect(text)
            .map(|i| StemLang::from_whatlang(i.lang()))
            .unwrap_or(StemLang::Unknown),
    };

    let original_tokens = tokenize(text);
    let tokens_for_stats = normalize_for_stats(&original_tokens, stopwords, stem_lang);
    let n = tokens_for_stats.len();

    let mut pc = PartialCounts {
        n_tokens: n,
        ..Default::default()
    };

    if opts.ngram > 0 && n >= opts.ngram {
        for i in 0..=n - opts.ngram {
            let mut buf = String::with_capacity(opts.ngram * 6);
            for (k, t) in tokens_for_stats[i..i + opts.ngram].iter().enumerate() {
                if k > 0 {
                    buf.push(' ');
                }
                buf.push_str(t);
            }
            *pc.ngrams.entry(buf).or_insert(0) += 1;
        }
    }

    for t in &tokens_for_stats {
        *pc.wordfreq.entry(t.clone()).or_insert(0) += 1;
    }

    let window = opts.context;
    if window > 0 && n > 0 {
        for (i, w) in tokens_for_stats.iter().enumerate() {
            let left = i.saturating_sub(window);
            let right = (i + window + 1).min(n);
            for (j_off, neighbor) in tokens_for_stats[left..right].iter().enumerate() {
                let j = left + j_off;
                if j == i {
                    continue;
                }
                let key_ctx = (w.clone(), neighbor.clone());
                *pc.context_pairs.entry(key_ctx).or_insert(0) += 1;

                let (a, b) = if w <= neighbor {
                    (w.clone(), neighbor.clone())
                } else {
                    (neighbor.clone(), w.clone())
                };
                let d = (i as isize - j as isize).unsigned_abs();
                *pc.cooc_by_dist.entry((a, b, d)).or_insert(0) += 1;
            }

            if i > 0 {
                let key_left = (w.clone(), tokens_for_stats[i - 1].clone());
                *pc.neighbor_pairs.entry(key_left).or_insert(0) += 1;
            }
            if i + 1 < n {
                let key_right = (w.clone(), tokens_for_stats[i + 1].clone());
                *pc.neighbor_pairs.entry(key_right).or_insert(0) += 1;
            }
        }
    }

    let mut ner = HashMap::new();
    let sentences = split_sentences(text);
    named_entities_heuristic(&original_tokens, &sentences, &mut ner);
    pc.named_entities = ner;

    pc
}

/// Reduce phase: adds every count from `other` into `into`.
fn merge_counts(into: &mut PartialCounts, other: PartialCounts) {
    into.n_tokens += other.n_tokens;
    for (k, v) in other.ngrams {
        *into.ngrams.entry(k).or_insert(0) += v;
    }
    for (k, v) in other.wordfreq {
        *into.wordfreq.entry(k).or_insert(0) += v;
    }
    for (k, v) in other.context_pairs {
        *into.context_pairs.entry(k).or_insert(0) += v;
    }
    for (k, v) in other.neighbor_pairs {
        *into.neighbor_pairs.entry(k).or_insert(0) += v;
    }
    for (k, v) in other.cooc_by_dist {
        *into.cooc_by_dist.entry(k).or_insert(0) += v;
    }
    for (k, v) in other.named_entities {
        *into.named_entities.entry(k).or_insert(0) += v;
    }
}

/// Finalizes merged counts into an [`AnalysisResult`], including global PMI.
fn analysis_from_counts(total: PartialCounts) -> AnalysisResult {
    let mut result = AnalysisResult {
        ngrams: total.ngrams,
        wordfreq: total.wordfreq,
        named_entities: total.named_entities,
        ..Default::default()
    };

    for ((center, neighbor), c) in total.context_pairs {
        let entry = result.context_map.entry(center).or_default();
        *entry.entry(neighbor).or_insert(0) += c;
    }
    for ((center, neighbor), c) in total.neighbor_pairs {
        let entry = result.direct_neighbors.entry(center).or_default();
        *entry.entry(neighbor).or_insert(0) += c;
    }

    result.pmi = pmi_from_global_counts(&total.cooc_by_dist, total.n_tokens, &result.wordfreq);
    result
}

/// Same PMI formula as `compute_pmi`, but over counts aggregated across the
/// whole corpus.
fn pmi_from_global_counts(
    cooc_by_dist: &HashMap<(String, String, usize), usize>,
    n_tokens: usize,
    wordfreq: &HashMap<String, usize>,
) -> Vec<PmiEntry> {
    if n_tokens == 0 {
        return Vec::new();
    }
    let total = n_tokens as f64;
    let mut out = Vec::with_capacity(cooc_by_dist.len());
    for ((w1, w2, d), c) in cooc_by_dist {
        let c1 = *wordfreq.get(w1).unwrap_or(&1) as f64;
        let c2 = *wordfreq.get(w2).unwrap_or(&1) as f64;
        let p_xy = (*c as f64) / total;
        let p_x = c1 / total;
        let p_y = c2 / total;
        let pmi = (p_xy / (p_x * p_y)).ln();
        out.push(PmiEntry {
            word1: w1.clone(),
            word2: w2.clone(),
            distance: *d,
            count: *c,
            pmi,
        });
    }
    out.sort_by(|a, b| {
        b.pmi
            .partial_cmp(&a.pmi)
            .unwrap_or(std::cmp::Ordering::Equal)
            .then(b.count.cmp(&a.count))
    });
    out
}

/// Writes all requested exports for one result set; `stem` becomes the
/// filename prefix and `ts` the shared timestamp.
fn write_all_outputs(
    stem: &str,
    r: &AnalysisResult,
    ts: &str,
    opts: &AnalysisOptions,
) -> Result<(), String> {
    if opts.entities_only {
        match opts.export_format {
            ExportFormat::Txt => {
                let mut out = String::new();
                out.push_str("=== Named Entities ===\n");
                let mut items: Vec<(&String, &usize)> = r.named_entities.iter().collect();
                items.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
                for (e, c) in items.into_iter().take(2000) {
                    out.push_str(&format!("{e}\t{c}\n"));
                }
                let fname = format!("{stem}_{ts}_entities.txt");
                fs::write(&fname, out).map_err(|e| format!("Write txt failed: {e}"))?;
            }
            ExportFormat::Csv | ExportFormat::Tsv | ExportFormat::Json => {
                write_table("entities", stem, ts, &r.named_entities, opts)?;
            }
        }
        return Ok(());
    }

    match opts.export_format {
        ExportFormat::Txt => {
            let mut out = String::new();

            out.push_str(&format!("=== N-grams (N={}) ===\n", opts.ngram));
            let mut ngram_items: Vec<(&String, &usize)> = r.ngrams.iter().collect();
            ngram_items.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
            for (ng, c) in ngram_items.into_iter().take(50) {
                out.push_str(&format!("{ng}\t{c}\n"));
            }

            out.push_str("\n=== Word Frequencies ===\n");
            let mut wf_items: Vec<(&String, &usize)> = r.wordfreq.iter().collect();
            wf_items.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
            for (w, c) in wf_items.into_iter().take(50) {
                out.push_str(&format!("{w}\t{c}\n"));
            }

            out.push_str("\n=== Named Entities ===\n");
            let mut ne_items: Vec<(&String, &usize)> = r.named_entities.iter().collect();
            ne_items.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
            for (e, c) in ne_items.into_iter().take(50) {
                out.push_str(&format!("{e}\t{c}\n"));
            }

            out.push_str("\n=== PMI (top 50, by count) ===\n");
            let mut pmi_rows: Vec<&PmiEntry> = r.pmi.iter().collect();
            pmi_rows.sort_by(|a, b| {
                b.count
                    .cmp(&a.count)
                    .then_with(|| {
                        b.pmi
                            .partial_cmp(&a.pmi)
                            .unwrap_or(std::cmp::Ordering::Equal)
                    })
                    .then_with(|| a.word1.cmp(&b.word1))
                    .then_with(|| a.word2.cmp(&b.word2))
            });
            for p in pmi_rows.into_iter().take(50) {
                out.push_str(&format!(
                    "({}, {}) @d={} PMI={:.3} count={}\n",
                    p.word1, p.word2, p.distance, p.pmi, p.count
                ));
            }

            let fname = format!("{stem}_{ts}_summary.txt");
            fs::write(&fname, out).map_err(|e| format!("Write txt failed: {e}"))?;
        }
        ExportFormat::Csv | ExportFormat::Tsv | ExportFormat::Json => {
            write_table("ngrams", stem, ts, &r.ngrams, opts)?;
            write_table("wordfreq", stem, ts, &r.wordfreq, opts)?;
            write_nested("context", stem, ts, &r.context_map, opts)?;
            write_nested("neighbors", stem, ts, &r.direct_neighbors, opts)?;
            write_pmi("pmi", stem, ts, &r.pmi, opts)?;
            write_table("namedentities", stem, ts, &r.named_entities, opts)?;
        }
    }
    Ok(())
}

/// Writes a sorted `item,count` table in the selected format, sanitizing text
/// cells against spreadsheet formula injection.
fn write_table(
    name: &str,
    stem: &str,
    ts: &str,
    map: &HashMap<String, usize>,
    opts: &AnalysisOptions,
) -> Result<(), String> {
    let fname = format!("{stem}_{ts}_{name}.{}", ext(opts.export_format));

    let mut items: Vec<(&String, &usize)> = map.iter().collect();
    items.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));

    match opts.export_format {
        ExportFormat::Csv | ExportFormat::Tsv => {
            let delim: u8 = if matches!(opts.export_format, ExportFormat::Csv) {
                b','
            } else {
                b'\t'
            };
            let file = std::fs::File::create(&fname).map_err(|e| format!("create {fname}: {e}"))?;
            let mut wtr = csv::WriterBuilder::new().delimiter(delim).from_writer(file);

            wtr.write_record(["item", "count"])
                .map_err(|e| e.to_string())?;

            for (k, v) in items {
                wtr.write_record([csv_safe_cell(k.to_string()), v.to_string()])
                    .map_err(|e| e.to_string())?;
            }
            wtr.flush().map_err(|e| e.to_string())?;
        }
        ExportFormat::Json => {
            let v: Vec<_> = items
                .iter()
                .map(|(k, v)| serde_json::json!({ "item": k, "count": v }))
                .collect();
            std::fs::write(&fname, serde_json::to_string_pretty(&v).unwrap())
                .map_err(|e| format!("write {fname}: {e}"))?;
        }
        ExportFormat::Txt => unreachable!(),
    }
    Ok(())
}

/// Writes a sorted `item1,item2,count` table for nested maps (context and
/// direct-neighbor statistics).
fn write_nested(
    name: &str,
    stem: &str,
    ts: &str,
    map: &HashMap<String, HashMap<String, usize>>,
    opts: &AnalysisOptions,
) -> Result<(), String> {
    let fname = format!("{stem}_{ts}_{name}.{}", ext(opts.export_format));

    let mut rows: Vec<(&String, &String, &usize)> = Vec::new();
    for (k, inner) in map {
        for (k2, v) in inner {
            rows.push((k, k2, v));
        }
    }
    rows.sort_by(|a, b| {
        b.2.cmp(a.2)
            .then_with(|| a.0.cmp(b.0))
            .then_with(|| a.1.cmp(b.1))
    });

    match opts.export_format {
        ExportFormat::Csv | ExportFormat::Tsv => {
            let delim: u8 = if matches!(opts.export_format, ExportFormat::Csv) {
                b','
            } else {
                b'\t'
            };
            let file = std::fs::File::create(&fname).map_err(|e| format!("create {fname}: {e}"))?;
            let mut wtr = csv::WriterBuilder::new().delimiter(delim).from_writer(file);

            wtr.write_record(["item1", "item2", "count"])
                .map_err(|e| e.to_string())?;

            for (k, k2, v) in rows {
                wtr.write_record([
                    csv_safe_cell(k.to_string()),
                    csv_safe_cell(k2.to_string()),
                    v.to_string(),
                ])
                .map_err(|e| e.to_string())?;
            }
            wtr.flush().map_err(|e| e.to_string())?;
        }
        ExportFormat::Json => {
            let v: Vec<_> = rows
                .iter()
                .map(|(k, k2, v)| serde_json::json!({ "item1": k, "item2": k2, "count": v }))
                .collect();
            std::fs::write(&fname, serde_json::to_string_pretty(&v).unwrap())
                .map_err(|e| format!("write {fname}: {e}"))?;
        }
        ExportFormat::Txt => unreachable!(),
    }
    Ok(())
}

/// Writes the PMI table sorted by count, then PMI, then words.
fn write_pmi(
    name: &str,
    stem: &str,
    ts: &str,
    pmi: &[PmiEntry],
    opts: &AnalysisOptions,
) -> Result<(), String> {
    let fname = format!("{stem}_{ts}_{name}.{}", ext(opts.export_format));

    let mut rows: Vec<&PmiEntry> = pmi.iter().collect();
    rows.sort_by(|a, b| {
        b.count
            .cmp(&a.count)
            .then_with(|| {
                b.pmi
                    .partial_cmp(&a.pmi)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .then_with(|| a.word1.cmp(&b.word1))
            .then_with(|| a.word2.cmp(&b.word2))
    });

    match opts.export_format {
        ExportFormat::Csv | ExportFormat::Tsv => {
            let delim: u8 = if matches!(opts.export_format, ExportFormat::Csv) {
                b','
            } else {
                b'\t'
            };
            let file = std::fs::File::create(&fname).map_err(|e| format!("create {fname}: {e}"))?;
            let mut wtr = csv::WriterBuilder::new().delimiter(delim).from_writer(file);

            wtr.write_record(["word1", "word2", "distance", "count", "pmi"])
                .map_err(|e| e.to_string())?;

            for r in rows {
                wtr.write_record([
                    csv_safe_cell(r.word1.clone()),
                    csv_safe_cell(r.word2.clone()),
                    r.distance.to_string(),
                    r.count.to_string(),
                    format!("{:.6}", r.pmi),
                ])
                .map_err(|e| e.to_string())?;
            }
            wtr.flush().map_err(|e| e.to_string())?;
        }
        ExportFormat::Json => {
            let v: Vec<_> = rows
                .iter()
                .map(|r| {
                    serde_json::json!({
                        "word1": r.word1,
                        "word2": r.word2,
                        "distance": r.distance,
                        "count": r.count,
                        "pmi": r.pmi
                    })
                })
                .collect();
            std::fs::write(&fname, serde_json::to_string_pretty(&v).unwrap())
                .map_err(|e| format!("write {fname}: {e}"))?;
        }
        ExportFormat::Txt => unreachable!(),
    }
    Ok(())
}

/// Builds the human-readable summary returned in [`AnalysisReport`].
fn summary_for(pairs: &[(String, &AnalysisResult)], _opts: &AnalysisOptions) -> String {
    let mut s = String::new();
    s.push_str("=== Analysis Summary ===\n");

    for (name, r) in pairs {
        s.push_str(&format!("\n# {}\n", name));

        s.push_str("Top 20 n-grams:\n");
        let mut ngram_items: Vec<(&String, &usize)> = r.ngrams.iter().collect();
        ngram_items.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
        for (ng, c) in ngram_items.into_iter().take(20) {
            s.push_str(&format!(" {}\t{}\n", ng, c));
        }

        s.push_str("Top 20 PMI (by count, then PMI):\n");
        let mut pmi_rows: Vec<&PmiEntry> = r.pmi.iter().collect();
        pmi_rows.sort_by(|a, b| {
            b.count
                .cmp(&a.count)
                .then_with(|| {
                    b.pmi
                        .partial_cmp(&a.pmi)
                        .unwrap_or(std::cmp::Ordering::Equal)
                })
                .then_with(|| a.word1.cmp(&b.word1))
                .then_with(|| a.word2.cmp(&b.word2))
        });
        for p in pmi_rows.into_iter().take(20) {
            s.push_str(&format!(
                " ({}, {}) @d={} count={} PMI={:.3}\n",
                p.word1, p.word2, p.distance, p.count, p.pmi
            ));
        }

        s.push_str("Top 20 words:\n");
        let mut wf_items: Vec<(&String, &usize)> = r.wordfreq.iter().collect();
        wf_items.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
        for (w, c) in wf_items.into_iter().take(20) {
            s.push_str(&format!(" {}\t{}\n", w, c));
        }
    }

    s
}

/// Timestamp used in output filenames (local time, `YYYYMMDD_HHMMSS`).
fn timestamp() -> String {
    Local::now().format("%Y%m%d_%H%M%S").to_string()
}

fn ext(fmt: ExportFormat) -> &'static str {
    match fmt {
        ExportFormat::Txt => "txt",
        ExportFormat::Csv => "csv",
        ExportFormat::Tsv => "tsv",
        ExportFormat::Json => "json",
    }
}

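/// Builds a stable filename prefix for a source file: the file name plus a
/// short hash of its full path, so same-named files in different directories
/// do not overwrite each other's outputs. For example, `dir/report.pdf` maps
/// to something like `report.pdf_1a2b3c4d` (the hash value here is purely
/// illustrative; it depends on the path).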
pub fn stem_for(p: &Path) -> String {
    let stem = p.file_stem().and_then(|s| s.to_str()).unwrap_or("file");
    let ext = p.extension().and_then(|s| s.to_str()).unwrap_or("");
    let h = short_hash(p);
    if ext.is_empty() {
        format!("{stem}_{h}")
    } else {
        format!("{stem}.{ext}_{h}")
    }
}

fn short_hash<P: AsRef<Path>>(p: P) -> String {
    let mut hasher = std::collections::hash_map::DefaultHasher::new();
    p.as_ref().to_string_lossy().hash(&mut hasher);
    let v = hasher.finish();
    format!("{:08x}", v)
}

/// Detects the text's language and returns it only if a stemmer exists for it.
fn detect_supported_stem_lang(text: &str) -> Option<StemLang> {
    let info = whatlang::detect(text)?;
    let sl = StemLang::from_whatlang(info.lang());
    if make_stemmer(sl).is_some() {
        Some(sl)
    } else {
        None
    }
}

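/// Neutralizes spreadsheet formula injection for a single text cell: if the
/// cell starts with `=`, `+`, `-`, or `@`, a leading `'` is prepended (see the
/// crate-level docs on CSV/TSV export safety).
///
/// ```
/// // Assumes the crate is named `text_analysis`.
/// use text_analysis::csv_safe_cell;
///
/// assert_eq!(csv_safe_cell("@cmd".to_string()), "'@cmd");
/// ```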
pub fn csv_safe_cell(mut s: String) -> String {
    if matches!(s.chars().next(), Some('=' | '+' | '-' | '@')) {
        s.insert(0, '\'');
    }
    s
}