text_analysis/
lib.rs

1#![forbid(unsafe_code)]
2#![doc(
3    html_logo_url = "https://raw.githubusercontent.com/ArdentEmpiricist/text_analysis/main/assets/text_analysis_logo.png"
4)]
5#![doc = r#"
6Text Analysis Library
7
8This crate provides a fast, pragmatic toolkit for linguistic text analysis over `.txt`, `.pdf`, `.docx`, and `.odt`
9files. It supports:
10
11- Tokenization (Unicode-aware, simple alphanumeric rules)
12- Optional stopword filtering (user-supplied list)
13- Optional stemming (auto-detected or forced language)
14- N-gram counting
15- Word frequency counting
16- Context statistics (±N window) and direct neighbors (±1)
17- PMI (Pointwise Mutual Information) collocations
18- Simple Named-Entity extraction (capitalization heuristic)
19- Parallel per-file analysis (compute) with serialized writes
20- Combined (Map-Reduce) mode that aggregates counts across files
21- **Deterministic, sorted outputs** in CSV/TSV/JSON/TXT
22
23
24## Security & CSV/TSV export safety
25
26If you open CSV/TSV in spreadsheet software (Excel/LibreOffice), cells that **start with** one of
27`=`, `+`, `-`, or `@` may be interpreted as formulas (e.g., `=HYPERLINK(...)`). To prevent this, **always:**
281. Write CSV/TSV using a proper CSV library (this project uses `csv::Writer`) so commas, tabs, quotes, and newlines are escaped correctly.
292. Sanitize **text cells** by prefixing a single quote when they begin with one of the dangerous characters.
30
31"#]
32
33use chrono::Local;
34use rayon::prelude::*;
35use std::collections::{HashMap, HashSet};
36use std::fs;
37use std::hash::{Hash, Hasher};
38use std::path::{Path, PathBuf};
39use whatlang::{Lang, detect};
40
41// PDF parsing is always enabled (no feature flag)
42use pdf_extract::extract_text;
43
44// Office document parsing
45mod office;
46
47// JSON writer for exports
48
49// ---------- Public API types ----------
50
/// Export format for analysis outputs.
///
/// `PartialEq`/`Eq` are derived for consistency with the other public enums
/// in this crate (`StemMode`, `StemLang`), so callers can compare a configured
/// format directly instead of pattern-matching.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ExportFormat {
    /// Human-readable plain text (top-N sections).
    Txt,
    /// Comma-separated values.
    Csv,
    /// Tab-separated values.
    Tsv,
    /// JSON documents.
    Json,
}
59
/// Stemming behavior selector.
///
/// Consumed by `AnalysisOptions::stem_mode` to decide whether and how tokens
/// are stemmed during normalization.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum StemMode {
    /// No stemming.
    Off,
    /// Detect language automatically via `whatlang` and stem when supported.
    /// When detection fails, tokens are left unstemmed (unless
    /// `AnalysisOptions::stem_require_detected` makes that an error).
    Auto,
    /// Force a specific stemming language, skipping detection.
    Force(StemLang),
}
70
/// Supported stemming languages (subset of `rust-stemmers`).
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum StemLang {
    /// No supported language selected/detected; `make_stemmer` yields `None`.
    Unknown,
    /// English
    En,
    /// German
    De,
    /// French
    Fr,
    /// Spanish
    Es,
    /// Italian
    It,
    /// Portuguese
    Pt,
    /// Dutch
    Nl,
    /// Russian
    Ru,
    /// Swedish
    Sv,
    /// Finnish
    Fi,
    /// Norwegian
    No,
    /// Romanian
    Ro,
    /// Hungarian
    Hu,
    /// Danish
    Da,
    /// Turkish
    Tr,
}
91
92impl StemLang {
93    /// Map a short CLI code (e.g., "en", "de") to `StemLang`.
94    pub fn from_code(code: &str) -> Option<Self> {
95        use StemLang::*;
96        let c = code.to_ascii_lowercase();
97        Some(match c.as_str() {
98            "en" => En,
99            "de" => De,
100            "fr" => Fr,
101            "es" => Es,
102            "it" => It,
103            "pt" => Pt,
104            "nl" => Nl,
105            "ru" => Ru,
106            "sv" => Sv,
107            "fi" => Fi,
108            "no" => No,
109            "ro" => Ro,
110            "hu" => Hu,
111            "da" => Da,
112            "tr" => Tr,
113            _ => return None,
114        })
115    }
116    /// Map a `whatlang::Lang` to `StemLang`. Unknown mappings become `Unknown`.
117    pub fn from_whatlang(lang: Lang) -> Self {
118        // Use ISO-639-3 codes to be robust across whatlang versions
119        match lang.code() {
120            "eng" => StemLang::En,
121            "deu" => StemLang::De,
122            "fra" | "fre" => StemLang::Fr,
123            "spa" => StemLang::Es,
124            "ita" => StemLang::It,
125            "por" => StemLang::Pt,
126            "nld" | "dut" => StemLang::Nl,
127            "rus" => StemLang::Ru,
128            "swe" => StemLang::Sv,
129            "fin" => StemLang::Fi,
130            "nor" | "nob" | "nno" => StemLang::No,
131            "ron" | "rum" => StemLang::Ro,
132            "hun" => StemLang::Hu,
133            "dan" => StemLang::Da,
134            "tur" => StemLang::Tr,
135            _ => StemLang::Unknown,
136        }
137    }
138}
139
/// Parameters controlling analysis and export behavior.
#[derive(Clone, Debug)]
pub struct AnalysisOptions {
    /// N-gram size (>=1 recommended; 2 = bigrams). A value of 0 disables
    /// n-gram counting entirely (see `ngrams_count`).
    pub ngram: usize,
    /// Context window (±N) for context statistics and PMI. A value of 0
    /// disables context, direct-neighbor, and PMI computation.
    pub context: usize,
    /// Export format for files (TXT/CSV/TSV/JSON).
    pub export_format: ExportFormat,
    /// If true, export only Named Entities (skips other tables).
    pub entities_only: bool,
    /// If true, aggregate all files into one corpus (Map-Reduce). Otherwise per-file outputs.
    pub combine: bool,
    /// Stemming mode (off/auto/force).
    pub stem_mode: StemMode,
    /// If true and `stem_mode == Auto`, require detectable & supported language; otherwise fail.
    /// - Per-file: file is skipped and reported in `failed_files`, run continues (success).
    /// - Combined: the whole run aborts with an error to avoid mixed stemming.
    pub stem_require_detected: bool,
}
160
/// Summary of a completed run.
#[derive(Debug)]
pub struct AnalysisReport {
    /// Human-readable summary (top words per output).
    pub summary: String,
    /// (file_path, error) pairs for unreadable or skipped inputs.
    /// In per-file mode a non-empty list does not make the run itself an
    /// error; the remaining files are still analyzed and exported.
    pub failed_files: Vec<(String, String)>,
}
169
/// Full analysis result for a single text/corpus.
#[derive(Debug, Default)]
pub struct AnalysisResult {
    /// N-gram (tokens joined with single spaces) -> occurrence count.
    pub ngrams: HashMap<String, usize>,
    /// Normalized word -> occurrence count.
    pub wordfreq: HashMap<String, usize>,
    /// Word -> (co-occurring word within the ±context window -> count).
    pub context_map: HashMap<String, HashMap<String, usize>>,
    /// Word -> (immediate ±1 neighbor -> count).
    pub direct_neighbors: HashMap<String, HashMap<String, usize>>,
    /// Capitalization-heuristic entity -> count (case-sensitive, non-stemmed).
    pub named_entities: HashMap<String, usize>,
    /// PMI collocations, ordered PMI desc then count desc.
    pub pmi: Vec<PmiEntry>,
}
180
/// PMI entry for a pair of words at a given distance.
///
/// Pairs are stored canonically (`word1 <= word2`), and `distance` is the
/// absolute token offset between the two occurrences. `Clone`/`PartialEq`
/// are derived so consumers can duplicate and compare entries; both are
/// backward-compatible additions.
#[derive(Debug, Clone, PartialEq)]
pub struct PmiEntry {
    /// First word of the canonical pair.
    pub word1: String,
    /// Second word of the canonical pair.
    pub word2: String,
    /// Absolute distance (in tokens) between the two words.
    pub distance: usize,
    /// Number of co-occurrences observed at this distance.
    pub count: usize,
    /// Pointwise Mutual Information: ln(p(x,y) / (p(x) * p(y))).
    pub pmi: f64,
}
190
191// ---------- Map-Reduce internal structures ----------
192
/// Partial counts emitted by the *map* stage for a single file.
///
/// Everything is kept as flat maps keyed by tuples (rather than nested maps)
/// so the reduce stage (`merge_counts`) is a simple per-key addition.
#[derive(Default)]
struct PartialCounts {
    // Number of normalized tokens in this file (PMI denominator).
    n_tokens: usize,
    // N-gram -> count.
    ngrams: HashMap<String, usize>,
    // Normalized word -> count.
    wordfreq: HashMap<String, usize>,
    // (center word, word within ±context window) -> count.
    context_pairs: HashMap<(String, String), usize>,
    // (word, immediate ±1 neighbor) -> count.
    neighbor_pairs: HashMap<(String, String), usize>,
    // (word1 <= word2, absolute distance) -> co-occurrence count, for PMI.
    cooc_by_dist: HashMap<(String, String, usize), usize>,
    // Capitalization-heuristic entity -> count.
    named_entities: HashMap<String, usize>,
}
204
205// ---------- Public re-exports for Office document extraction ----------
206
207pub use office::{extract_text_from_docx, extract_text_from_odt};
208
209// ---------- High-level entry point ----------
210
211/// Analyze a path (file or directory).  
212/// - Per-file mode: compute in parallel per file; write outputs per file (serialized).  
213/// - Combined mode: Map-Reduce over files; write a single combined set of outputs.
214pub fn analyze_path(
215    path: &Path,
216    stopwords_file: Option<&PathBuf>,
217    options: &AnalysisOptions,
218) -> Result<AnalysisReport, String> {
219    let files = collect_files(path);
220    if files.is_empty() {
221        return Err("No .txt, .pdf, .docx or .odt files found for analysis.".to_string());
222    }
223
224    let stopwords = load_stopwords(stopwords_file);
225    let mut failed: Vec<(String, String)> = Vec::new();
226    let ts = timestamp();
227
228    // --- Combined Map-Reduce mode ---
229    if options.combine {
230        // Map: read + build partial counts in parallel.
231        let mapped: Vec<_> = files
232            .par_iter()
233            .map(|f| match read_text(f) {
234                Ok(t) => {
235                    if matches!(options.stem_mode, StemMode::Auto)
236                        && options.stem_require_detected
237                        && detect_supported_stem_lang(&t).is_none()
238                    {
239                        return Err((
240                            f.display().to_string(),
241                            "Language detection failed or unsupported for stemming (strict)"
242                                .to_string(),
243                        ));
244                    }
245                    Ok(partial_counts_from_text(&t, &stopwords, options))
246                }
247                Err(e) => Err((f.display().to_string(), e)),
248            })
249            .collect();
250
251        // Reduce: merge partials, collect failures.
252        let mut total = PartialCounts::default();
253        let mut failed_local: Vec<(String, String)> = Vec::new();
254        for item in mapped {
255            match item {
256                Ok(pc) => merge_counts(&mut total, pc),
257                Err(fe) => failed_local.push(fe),
258            }
259        }
260        if options.stem_require_detected && !failed_local.is_empty() {
261            // Fail the combined run to avoid mixed stemming.
262            let msg = format!(
263                "Combined run aborted (strict stemming): {} file(s) without detectable/supported language",
264                failed_local.len()
265            );
266            return Err(msg);
267        }
268        failed.extend(failed_local);
269
270        // Finalize: build one `AnalysisResult`, export once.
271        let result = analysis_from_counts(total);
272        write_all_outputs("combined", &result, &ts, options)?;
273        let summary = summary_for(&[("combined".to_string(), &result)], options);
274        return Ok(AnalysisReport {
275            summary,
276            failed_files: failed,
277        });
278    }
279
280    // --- Per-file mode: parallel compute, serialized writes ---
281    let results: Vec<_> = files
282        .par_iter()
283        .map(|f| match read_text(f) {
284            Ok(t) => {
285                if matches!(options.stem_mode, StemMode::Auto)
286                    && options.stem_require_detected
287                    && detect_supported_stem_lang(&t).is_none()
288                {
289                    return Err((
290                        f.display().to_string(),
291                        "Language detection failed or unsupported for stemming (strict)"
292                            .to_string(),
293                    ));
294                }
295                let r = analyze_text_with(&t, &stopwords, options);
296                let stem = stem_for(f);
297                Ok((stem, r))
298            }
299            Err(e) => Err((f.display().to_string(), e)),
300        })
301        .collect();
302
303    let mut per_file_results: Vec<(String, AnalysisResult)> = Vec::new();
304    for item in results {
305        match item {
306            Ok(v) => per_file_results.push(v),
307            Err(fe) => failed.push(fe),
308        }
309    }
310
311    // Writes are serialized to reduce I/O contention.
312    for (stem, r) in &per_file_results {
313        write_all_outputs(stem, r, &ts, options)?;
314    }
315
316    // Human-readable summary
317    let pairs: Vec<(String, &AnalysisResult)> = per_file_results
318        .iter()
319        .map(|(n, r)| (n.clone(), r))
320        .collect();
321    let summary = summary_for(&pairs, options);
322    Ok(AnalysisReport {
323        summary,
324        failed_files: failed,
325    })
326}
327
328// ---------- File discovery ----------
329
330/// Collect all supported files (.txt, .pdf, .docx, .odt) recursively from `path`.
331pub fn collect_files(path: &Path) -> Vec<PathBuf> {
332    let mut out = Vec::new();
333    if path.is_file() {
334        if is_supported(path) {
335            out.push(path.to_path_buf());
336        }
337    } else if path.is_dir() {
338        let walker = walkdir::WalkDir::new(path).follow_links(true);
339        for entry in walker.into_iter().filter_map(Result::ok) {
340            let p = entry.path();
341            if p.is_file() && is_supported(p) {
342                out.push(p.to_path_buf());
343            }
344        }
345    }
346    out
347}
348
/// True when the path's extension (case-insensitive) is one of the
/// supported input formats: txt, pdf, docx, odt.
fn is_supported(p: &Path) -> bool {
    p.extension()
        .and_then(|e| e.to_str())
        .map(|e| e.to_ascii_lowercase())
        .is_some_and(|e| matches!(e.as_str(), "txt" | "pdf" | "docx" | "odt"))
}
357
358// ---------- Reading & preprocessing ----------
359
360/// Read the text from `.txt`, `.pdf`, `.docx`, or `.odt`. Returns a displayable error string on failure.
361fn read_text(p: &Path) -> Result<String, String> {
362    let ext = p
363        .extension()
364        .and_then(|e| e.to_str())
365        .unwrap_or("")
366        .to_ascii_lowercase();
367    match ext.as_str() {
368        "txt" => fs::read_to_string(p).map_err(|e| format!("Read .txt failed: {e}")),
369        "pdf" => extract_text(p).map_err(|e| format!("PDF extract failed: {e}")),
370        "docx" => office::extract_text_from_docx(p),
371        "odt" => office::extract_text_from_odt(p),
372        _ => Err("Unsupported extension".to_string()),
373    }
374}
375
/// Load stopwords from a text file (one word per line). Empty or unreadable files yield an empty set.
///
/// Lines are trimmed; blank lines are ignored.
fn load_stopwords(p: Option<&PathBuf>) -> HashSet<String> {
    p.and_then(|file| fs::read_to_string(file).ok())
        .map(|txt| {
            txt.lines()
                .map(str::trim)
                .filter(|w| !w.is_empty())
                .map(str::to_string)
                .collect()
        })
        .unwrap_or_default()
}
391
392// ---------- Core analysis (per text) ----------
393
394/// Analyze a single text buffer with the given `stopwords` and `options`.
395/// This is the core pipeline used by both per-file and combined modes.
396pub fn analyze_text_with(
397    text: &str,
398    stopwords: &HashSet<String>,
399    opts: &AnalysisOptions,
400) -> AnalysisResult {
401    // Determine stemming language once per text (not per token).
402    let stem_lang = match opts.stem_mode {
403        StemMode::Off => StemLang::Unknown,
404        StemMode::Force(lang) => lang,
405        StemMode::Auto => detect(text)
406            .map(|i| StemLang::from_whatlang(i.lang()))
407            .unwrap_or(StemLang::Unknown),
408    };
409
410    // Tokenize original and normalize for stats.
411    let original_tokens = tokenize(text);
412    let sentences = split_sentences(text);
413    let tokens_for_stats = normalize_for_stats(&original_tokens, stopwords, stem_lang);
414
415    let mut result = AnalysisResult::default();
416    ngrams_count(&tokens_for_stats, opts.ngram, &mut result.ngrams);
417    wordfreq_count(&tokens_for_stats, &mut result.wordfreq);
418    context_and_neighbors(
419        &tokens_for_stats,
420        opts.context,
421        &mut result.context_map,
422        &mut result.direct_neighbors,
423    );
424    // NER is based on original, *non-stemmed*, case-sensitive tokens.
425    named_entities_heuristic(&original_tokens, &sentences, &mut result.named_entities);
426    // PMI uses normalized tokens, consistent with other statistics.
427    compute_pmi(
428        &tokens_for_stats,
429        opts.context,
430        &result.wordfreq,
431        &mut result.pmi,
432    );
433
434    result
435}
436
/// Simple tokenizer: keeps alphanumerics and `'` inside tokens, splits on everything else.
///
/// Consecutive separators produce no empty tokens.
fn tokenize(text: &str) -> Vec<String> {
    text.split(|c: char| !(c.is_alphanumeric() || c == '\''))
        .filter(|piece| !piece.is_empty())
        .map(str::to_string)
        .collect()
}
453
/// Sentence boundary detection: record byte offsets after '.', '!' or '?'.
///
/// The returned vector always begins with 0 (start of text) and is in
/// ascending order by construction — `char_indices` visits positions
/// monotonically, so the `sort_unstable` the previous version performed
/// was dead work and has been removed.
fn split_sentences(text: &str) -> Vec<usize> {
    let mut starts = vec![0usize];
    for (i, ch) in text.char_indices() {
        if matches!(ch, '.' | '!' | '?') {
            // Offset of the byte *after* the terminator.
            starts.push(i + ch.len_utf8());
        }
    }
    starts
}
467
468/// Normalize tokens for statistics: lowercase, optional stopword removal, optional stemming.
469fn normalize_for_stats(
470    tokens: &[String],
471    stopwords: &HashSet<String>,
472    stem_lang: StemLang,
473) -> Vec<String> {
474    let mut out = Vec::with_capacity(tokens.len());
475    let stemmer = make_stemmer(stem_lang); // create once, reuse
476    for t in tokens {
477        let lower = t.to_lowercase();
478        if !stopwords.is_empty() && stopwords.contains(&lower) {
479            continue;
480        }
481        let normalized = if let Some(stem) = &stemmer {
482            stem.stem(&lower).to_string()
483        } else {
484            lower
485        };
486        out.push(normalized);
487    }
488    out
489}
490
491/// Construct a `rust-stemmers` instance for the given language. Returns `None` if unsupported.
492fn make_stemmer(lang: StemLang) -> Option<rust_stemmers::Stemmer> {
493    use StemLang::*;
494    use rust_stemmers::{Algorithm, Stemmer};
495    let algo = match lang {
496        En => Algorithm::English,
497        De => Algorithm::German,
498        Fr => Algorithm::French,
499        Es => Algorithm::Spanish,
500        It => Algorithm::Italian,
501        Pt => Algorithm::Portuguese,
502        Nl => Algorithm::Dutch,
503        Ru => Algorithm::Russian,
504        Sv => Algorithm::Swedish,
505        Fi => Algorithm::Finnish,
506        No => Algorithm::Norwegian,
507        Ro => Algorithm::Romanian,
508        Hu => Algorithm::Hungarian,
509        Da => Algorithm::Danish,
510        Tr => Algorithm::Turkish,
511        Unknown => return None,
512    };
513    Some(Stemmer::create(algo))
514}
515
/// Count N-grams of size `n` into `out`.
///
/// Each n-gram key is the `n` tokens joined with single spaces. `n == 0`
/// is a no-op (and is guarded explicitly, since `slice::windows` panics on
/// a zero window size); when `tokens.len() < n`, `windows` simply yields
/// nothing, matching the previous manual loop's early return.
fn ngrams_count(tokens: &[String], n: usize, out: &mut HashMap<String, usize>) {
    if n == 0 {
        return;
    }
    // `windows(n)` replaces the previous index arithmetic and hand-rolled
    // string assembly with the idiomatic sliding-window iterator.
    for window in tokens.windows(n) {
        *out.entry(window.join(" ")).or_insert(0) += 1;
    }
}
532
/// Count individual word frequencies.
fn wordfreq_count(tokens: &[String], out: &mut HashMap<String, usize>) {
    tokens
        .iter()
        .for_each(|t| *out.entry(t.clone()).or_default() += 1);
}
539
/// Build context (±window) counts and direct (±1) neighbor counts.
///
/// A window of 0 leaves both maps untouched.
fn context_and_neighbors(
    tokens: &[String],
    window: usize,
    context_map: &mut HashMap<String, HashMap<String, usize>>,
    direct_neighbors: &mut HashMap<String, HashMap<String, usize>>,
) {
    if window == 0 {
        return;
    }
    let n = tokens.len();

    for i in 0..n {
        let word = &tokens[i];

        // Count every token within ±window, excluding the center position.
        let lo = i.saturating_sub(window);
        let hi = (i + window + 1).min(n);
        let ctx = context_map.entry(word.clone()).or_default();
        for j in lo..hi {
            if j != i {
                *ctx.entry(tokens[j].clone()).or_insert(0) += 1;
            }
        }

        // Immediate (±1) neighbors.
        let near = direct_neighbors.entry(word.clone()).or_default();
        if let Some(prev) = i.checked_sub(1) {
            *near.entry(tokens[prev].clone()).or_insert(0) += 1;
        }
        if i + 1 < n {
            *near.entry(tokens[i + 1].clone()).or_insert(0) += 1;
        }
    }
}
574
/// Naive Named-Entity heuristic:
/// - Token must start with an uppercase letter
/// - Token must not be all uppercase (filters acronyms)
/// - Filter a small set of very common determiners/articles in multiple languages
///   Counts are case-sensitive.
fn named_entities_heuristic(
    original_tokens: &[String],
    _sentence_starts: &[usize],
    out: &mut HashMap<String, usize>,
) {
    // Common determiners/articles (English, German, French, Spanish, Italian),
    // compared against the lowercased token.
    const DETERMINERS: &[&str] = &[
        "the", "a", "an", "der", "die", "das", "ein", "eine", "le", "la", "les", "un",
        "una", "el", "los", "las", "il", "lo", "gli", "i",
    ];

    for tok in original_tokens {
        // Guard 1: first character must be uppercase.
        if !tok.chars().next().is_some_and(char::is_uppercase) {
            continue;
        }
        // Guard 2: acronym filter — skip tokens with no lowercase character.
        if !tok.chars().any(char::is_lowercase) {
            continue;
        }
        // Guard 3: skip sentence-initial determiners/articles.
        if DETERMINERS.contains(&tok.to_lowercase().as_str()) {
            continue;
        }
        *out.entry(tok.clone()).or_insert(0) += 1;
    }
}
608
609/// Compute PMI (Pointwise Mutual Information) for all pairs within ±`window`.
610/// Pairs are stored canonically (`w1 <= w2`) and include the absolute distance `d`.
611fn compute_pmi(
612    tokens: &[String],
613    window: usize,
614    wordfreq: &HashMap<String, usize>,
615    out: &mut Vec<PmiEntry>,
616) {
617    if window == 0 || tokens.len() < 2 {
618        return;
619    }
620    let total_tokens = tokens.len() as f64;
621
622    let mut pair_counts: HashMap<(String, String, usize), usize> = HashMap::new();
623    for i in 0..tokens.len() {
624        let w1 = &tokens[i];
625        let left = i.saturating_sub(window);
626        let right = (i + window + 1).min(tokens.len());
627        for (j_off, w2) in tokens[left..right].iter().enumerate() {
628            let j = left + j_off;
629            if j == i {
630                continue;
631            }
632            let d = (i as isize - j as isize).unsigned_abs();
633            let key = if w1 <= w2 {
634                (w1.clone(), w2.clone(), d)
635            } else {
636                (w2.clone(), w1.clone(), d)
637            };
638            *pair_counts.entry(key).or_insert(0) += 1;
639        }
640    }
641
642    out.clear();
643    out.reserve(pair_counts.len());
644    for ((w1, w2, d), c) in pair_counts {
645        let c1 = *wordfreq.get(&w1).unwrap_or(&1) as f64;
646        let c2 = *wordfreq.get(&w2).unwrap_or(&1) as f64;
647        let p_xy = (c as f64) / total_tokens;
648        let p_x = c1 / total_tokens;
649        let p_y = c2 / total_tokens;
650        let pmi = (p_xy / (p_x * p_y)).ln();
651        out.push(PmiEntry {
652            word1: w1,
653            word2: w2,
654            distance: d,
655            count: c,
656            pmi,
657        });
658    }
659
660    // In-memory order: PMI desc, then count desc for stability.
661    out.sort_by(|a, b| {
662        b.pmi
663            .partial_cmp(&a.pmi)
664            .unwrap_or(std::cmp::Ordering::Equal)
665            .then(b.count.cmp(&a.count))
666    });
667}
668
669// ---------- Map-Reduce helpers ----------
670
671/// Build partial counts for a single text buffer (map stage).
672fn partial_counts_from_text(
673    text: &str,
674    stopwords: &HashSet<String>,
675    opts: &AnalysisOptions,
676) -> PartialCounts {
677    let stem_lang = match opts.stem_mode {
678        StemMode::Off => StemLang::Unknown,
679        StemMode::Force(lang) => lang,
680        StemMode::Auto => detect(text)
681            .map(|i| StemLang::from_whatlang(i.lang()))
682            .unwrap_or(StemLang::Unknown),
683    };
684
685    let original_tokens = tokenize(text);
686    let tokens_for_stats = normalize_for_stats(&original_tokens, stopwords, stem_lang);
687    let n = tokens_for_stats.len();
688
689    let mut pc = PartialCounts {
690        n_tokens: n,
691        ..Default::default()
692    };
693
694    // N-grams
695    if opts.ngram > 0 && n >= opts.ngram {
696        for i in 0..=n - opts.ngram {
697            let mut buf = String::with_capacity(opts.ngram * 6);
698            for (k, t) in tokens_for_stats[i..i + opts.ngram].iter().enumerate() {
699                if k > 0 {
700                    buf.push(' ');
701                }
702                buf.push_str(t);
703            }
704            *pc.ngrams.entry(buf).or_insert(0) += 1;
705        }
706    }
707
708    // Word frequencies
709    for t in &tokens_for_stats {
710        *pc.wordfreq.entry(t.clone()).or_insert(0) += 1;
711    }
712
713    // Context, neighbors, co-occurrence-by-distance for PMI
714    let window = opts.context;
715    if window > 0 && n > 0 {
716        for (i, w) in tokens_for_stats.iter().enumerate() {
717            let left = i.saturating_sub(window);
718            let right = (i + window + 1).min(n);
719            for (j_off, neighbor) in tokens_for_stats[left..right].iter().enumerate() {
720                let j = left + j_off;
721                if j == i {
722                    continue;
723                }
724                // context
725                let key_ctx = (w.clone(), neighbor.clone());
726                *pc.context_pairs.entry(key_ctx).or_insert(0) += 1;
727
728                // PMI pair with distance
729                let (a, b) = if w <= neighbor {
730                    (w.clone(), neighbor.clone())
731                } else {
732                    (neighbor.clone(), w.clone())
733                };
734                let d = (i as isize - j as isize).unsigned_abs();
735                *pc.cooc_by_dist.entry((a, b, d)).or_insert(0) += 1;
736            }
737
738            // direct neighbors (±1)
739            if i > 0 {
740                let key_left = (w.clone(), tokens_for_stats[i - 1].clone());
741                *pc.neighbor_pairs.entry(key_left).or_insert(0) += 1;
742            }
743            if i + 1 < n {
744                let key_right = (w.clone(), tokens_for_stats[i + 1].clone());
745                *pc.neighbor_pairs.entry(key_right).or_insert(0) += 1;
746            }
747        }
748    }
749
750    // NER on original tokens
751    let mut ner = HashMap::new();
752    let sentences = split_sentences(text);
753    named_entities_heuristic(&original_tokens, &sentences, &mut ner);
754    pc.named_entities = ner;
755
756    pc
757}
758
759/// Merge `other` into `into` (reduce stage).
760fn merge_counts(into: &mut PartialCounts, other: PartialCounts) {
761    into.n_tokens += other.n_tokens;
762    for (k, v) in other.ngrams {
763        *into.ngrams.entry(k).or_insert(0) += v;
764    }
765    for (k, v) in other.wordfreq {
766        *into.wordfreq.entry(k).or_insert(0) += v;
767    }
768    for (k, v) in other.context_pairs {
769        *into.context_pairs.entry(k).or_insert(0) += v;
770    }
771    for (k, v) in other.neighbor_pairs {
772        *into.neighbor_pairs.entry(k).or_insert(0) += v;
773    }
774    for (k, v) in other.cooc_by_dist {
775        *into.cooc_by_dist.entry(k).or_insert(0) += v;
776    }
777    for (k, v) in other.named_entities {
778        *into.named_entities.entry(k).or_insert(0) += v;
779    }
780}
781
782/// Build a full `AnalysisResult` from reduced counts.
783fn analysis_from_counts(total: PartialCounts) -> AnalysisResult {
784    let mut result = AnalysisResult {
785        ngrams: total.ngrams,
786        wordfreq: total.wordfreq,
787        named_entities: total.named_entities,
788        ..Default::default()
789    };
790
791    for ((center, neighbor), c) in total.context_pairs {
792        let entry = result.context_map.entry(center).or_default();
793        *entry.entry(neighbor).or_insert(0) += c;
794    }
795    for ((center, neighbor), c) in total.neighbor_pairs {
796        let entry = result.direct_neighbors.entry(center).or_default();
797        *entry.entry(neighbor).or_insert(0) += c;
798    }
799
800    result.pmi = pmi_from_global_counts(&total.cooc_by_dist, total.n_tokens, &result.wordfreq);
801    result
802}
803
804/// Compute PMI from global co-occurrence counts (by distance), total token count and unigram counts.
805fn pmi_from_global_counts(
806    cooc_by_dist: &HashMap<(String, String, usize), usize>,
807    n_tokens: usize,
808    wordfreq: &HashMap<String, usize>,
809) -> Vec<PmiEntry> {
810    if n_tokens == 0 {
811        return Vec::new();
812    }
813    let total = n_tokens as f64;
814    let mut out = Vec::with_capacity(cooc_by_dist.len());
815    for ((w1, w2, d), c) in cooc_by_dist {
816        let c1 = *wordfreq.get(w1).unwrap_or(&1) as f64;
817        let c2 = *wordfreq.get(w2).unwrap_or(&1) as f64;
818        let p_xy = (*c as f64) / total;
819        let p_x = c1 / total;
820        let p_y = c2 / total;
821        let pmi = (p_xy / (p_x * p_y)).ln();
822        out.push(PmiEntry {
823            word1: w1.clone(),
824            word2: w2.clone(),
825            distance: *d,
826            count: *c,
827            pmi,
828        });
829    }
830    // In-memory order for PMI results: PMI desc, then count desc.
831    out.sort_by(|a, b| {
832        b.pmi
833            .partial_cmp(&a.pmi)
834            .unwrap_or(std::cmp::Ordering::Equal)
835            .then(b.count.cmp(&a.count))
836    });
837    out
838}
839
840// ---------- Output helpers (ALL SORTED) ----------
841
/// Write all outputs for a single result using the configured format.
///
/// Output files are named `<stem>_<ts>_<table>.<ext>`. Every exported table
/// is deterministically sorted (count desc, then key asc). TXT exports are
/// truncated: top 2000 entities in entities-only mode, top 50 rows per
/// section otherwise.
fn write_all_outputs(
    stem: &str,
    r: &AnalysisResult,
    ts: &str,
    opts: &AnalysisOptions,
) -> Result<(), String> {
    if opts.entities_only {
        // Entities-only export path (sorted)
        match opts.export_format {
            ExportFormat::Txt => {
                let mut out = String::new();
                out.push_str("=== Named Entities ===\n");
                // Sort by count desc, then entity name asc, for determinism.
                let mut items: Vec<(&String, &usize)> = r.named_entities.iter().collect();
                items.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
                for (e, c) in items.into_iter().take(2000) {
                    out.push_str(&format!("{e}\t{c}\n"));
                }
                let fname = format!("{stem}_{ts}_entities.txt");
                fs::write(&fname, out).map_err(|e| format!("Write txt failed: {e}"))?;
            }
            ExportFormat::Csv | ExportFormat::Tsv | ExportFormat::Json => {
                // Structured formats share the generic table writer.
                write_table("entities", stem, ts, &r.named_entities, opts)?;
            }
        }
        return Ok(());
    }

    match opts.export_format {
        ExportFormat::Txt => {
            // Human-readable TXT (sorted sections; top-50 only)
            let mut out = String::new();

            // N-grams
            out.push_str(&format!("=== N-grams (N={}) ===\n", opts.ngram));
            let mut ngram_items: Vec<(&String, &usize)> = r.ngrams.iter().collect();
            ngram_items.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
            for (ng, c) in ngram_items.into_iter().take(50) {
                out.push_str(&format!("{ng}\t{c}\n"));
            }

            // Word frequencies
            out.push_str("\n=== Word Frequencies ===\n");
            let mut wf_items: Vec<(&String, &usize)> = r.wordfreq.iter().collect();
            wf_items.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
            for (w, c) in wf_items.into_iter().take(50) {
                out.push_str(&format!("{w}\t{c}\n"));
            }

            // Named Entities
            out.push_str("\n=== Named Entities ===\n");
            let mut ne_items: Vec<(&String, &usize)> = r.named_entities.iter().collect();
            ne_items.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
            for (e, c) in ne_items.into_iter().take(50) {
                out.push_str(&format!("{e}\t{c}\n"));
            }

            // PMI
            // Note: the TXT view re-sorts by count desc (then PMI desc, then
            // words asc) — unlike the in-memory `r.pmi`, which is PMI-first.
            out.push_str("\n=== PMI (top 50, by count) ===\n");
            let mut pmi_rows: Vec<&PmiEntry> = r.pmi.iter().collect();
            pmi_rows.sort_by(|a, b| {
                b.count
                    .cmp(&a.count)
                    .then_with(|| {
                        b.pmi
                            .partial_cmp(&a.pmi)
                            .unwrap_or(std::cmp::Ordering::Equal)
                    })
                    .then_with(|| a.word1.cmp(&b.word1))
                    .then_with(|| a.word2.cmp(&b.word2))
            });
            for p in pmi_rows.into_iter().take(50) {
                out.push_str(&format!(
                    "({}, {}) @d={}  PMI={:.3}  count={}\n",
                    p.word1, p.word2, p.distance, p.pmi, p.count
                ));
            }

            let fname = format!("{stem}_{ts}_summary.txt");
            fs::write(&fname, out).map_err(|e| format!("Write txt failed: {e}"))?;
        }
        ExportFormat::Csv | ExportFormat::Tsv | ExportFormat::Json => {
            // Structured formats: one output file per table.
            write_table("ngrams", stem, ts, &r.ngrams, opts)?;
            write_table("wordfreq", stem, ts, &r.wordfreq, opts)?;
            write_nested("context", stem, ts, &r.context_map, opts)?;
            write_nested("neighbors", stem, ts, &r.direct_neighbors, opts)?;
            write_pmi("pmi", stem, ts, &r.pmi, opts)?;
            write_table("namedentities", stem, ts, &r.named_entities, opts)?;
        }
    }
    Ok(())
}
934
935/// Write a simple map table as CSV/TSV/JSON. Content is **sorted by count desc, key asc**.
936/// Write a flat table `<item -> count>` as CSV/TSV/JSON.
937/// CSV/TSV are emitted via `csv::Writer` (proper quoting & newlines),
938/// and **text cells** are sanitized with `csv_safe_cell()` to neutralize
939/// leading `= + - @` (spreadsheet formula injection).
940fn write_table(
941    name: &str,
942    stem: &str,
943    ts: &str,
944    map: &std::collections::HashMap<String, usize>,
945    opts: &AnalysisOptions,
946) -> Result<(), String> {
947    let fname = format!("{stem}_{ts}_{name}.{}", ext(opts.export_format));
948
949    // Deterministic order: count desc, then key asc
950    let mut items: Vec<(&String, &usize)> = map.iter().collect();
951    items.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
952
953    match opts.export_format {
954        ExportFormat::Csv | ExportFormat::Tsv => {
955            let delim: u8 = if matches!(opts.export_format, ExportFormat::Csv) {
956                b','
957            } else {
958                b'\t'
959            };
960            let file = std::fs::File::create(&fname).map_err(|e| format!("create {fname}: {e}"))?;
961            let mut wtr = csv::WriterBuilder::new().delimiter(delim).from_writer(file);
962
963            // header
964            wtr.write_record(["item", "count"])
965                .map_err(|e| e.to_string())?;
966
967            for (k, v) in items {
968                wtr.write_record([csv_safe_cell(k.to_string()), v.to_string()])
969                    .map_err(|e| e.to_string())?;
970            }
971            wtr.flush().map_err(|e| e.to_string())?;
972        }
973        ExportFormat::Json => {
974            let v: Vec<_> = items
975                .iter()
976                .map(|(k, v)| serde_json::json!({ "item": k, "count": v }))
977                .collect();
978            std::fs::write(&fname, serde_json::to_string_pretty(&v).unwrap())
979                .map_err(|e| format!("write {fname}: {e}"))?;
980        }
981        ExportFormat::Txt => unreachable!(),
982    }
983    Ok(())
984}
985
986/// Write a nested map `<center -> neighbor -> count>` as a flat table (sorted by count desc).
987/// Write a nested map `<center -> neighbor -> count>` as a flat table
988/// with columns: `item1, item2, count`.
989/// Uses `csv::Writer` for CSV/TSV and sanitizes text cells with `csv_safe_cell()`.
990fn write_nested(
991    name: &str,
992    stem: &str,
993    ts: &str,
994    map: &std::collections::HashMap<String, std::collections::HashMap<String, usize>>,
995    opts: &AnalysisOptions,
996) -> Result<(), String> {
997    let fname = format!("{stem}_{ts}_{name}.{}", ext(opts.export_format));
998
999    // Flatten + deterministic order: count desc, then keys
1000    let mut rows: Vec<(&String, &String, &usize)> = Vec::new();
1001    for (k, inner) in map {
1002        for (k2, v) in inner {
1003            rows.push((k, k2, v));
1004        }
1005    }
1006    rows.sort_by(|a, b| {
1007        b.2.cmp(a.2)
1008            .then_with(|| a.0.cmp(b.0))
1009            .then_with(|| a.1.cmp(b.1))
1010    });
1011
1012    match opts.export_format {
1013        ExportFormat::Csv | ExportFormat::Tsv => {
1014            let delim: u8 = if matches!(opts.export_format, ExportFormat::Csv) {
1015                b','
1016            } else {
1017                b'\t'
1018            };
1019            let file = std::fs::File::create(&fname).map_err(|e| format!("create {fname}: {e}"))?;
1020            let mut wtr = csv::WriterBuilder::new().delimiter(delim).from_writer(file);
1021
1022            // header
1023            wtr.write_record(["item1", "item2", "count"])
1024                .map_err(|e| e.to_string())?;
1025
1026            for (k, k2, v) in rows {
1027                wtr.write_record([
1028                    csv_safe_cell(k.to_string()),
1029                    csv_safe_cell(k2.to_string()),
1030                    v.to_string(),
1031                ])
1032                .map_err(|e| e.to_string())?;
1033            }
1034            wtr.flush().map_err(|e| e.to_string())?;
1035        }
1036        ExportFormat::Json => {
1037            let v: Vec<_> = rows
1038                .iter()
1039                .map(|(k, k2, v)| serde_json::json!({ "item1": k, "item2": k2, "count": v }))
1040                .collect();
1041            std::fs::write(&fname, serde_json::to_string_pretty(&v).unwrap())
1042                .map_err(|e| format!("write {fname}: {e}"))?;
1043        }
1044        ExportFormat::Txt => unreachable!(),
1045    }
1046    Ok(())
1047}
1048
1049/// Write PMI entries **sorted by count desc, then PMI desc, then words lex**.
1050/// Write PMI rows with columns: `word1, word2, distance, count, pmi`.
1051/// Sorted by `count desc, PMI desc, then words`. CSV/TSV via `csv::Writer`,
1052/// **text cells** sanitized via `csv_safe_cell()`.
1053fn write_pmi(
1054    name: &str,
1055    stem: &str,
1056    ts: &str,
1057    pmi: &[PmiEntry], // assumes fields: word1, word2, distance, count, pmi
1058    opts: &AnalysisOptions,
1059) -> Result<(), String> {
1060    let fname = format!("{stem}_{ts}_{name}.{}", ext(opts.export_format));
1061
1062    // Deterministic order
1063    let mut rows: Vec<&PmiEntry> = pmi.iter().collect();
1064    rows.sort_by(|a, b| {
1065        b.count
1066            .cmp(&a.count)
1067            .then_with(|| {
1068                b.pmi
1069                    .partial_cmp(&a.pmi)
1070                    .unwrap_or(std::cmp::Ordering::Equal)
1071            })
1072            .then_with(|| a.word1.cmp(&b.word1))
1073            .then_with(|| a.word2.cmp(&b.word2))
1074    });
1075
1076    match opts.export_format {
1077        ExportFormat::Csv | ExportFormat::Tsv => {
1078            let delim: u8 = if matches!(opts.export_format, ExportFormat::Csv) {
1079                b','
1080            } else {
1081                b'\t'
1082            };
1083            let file = std::fs::File::create(&fname).map_err(|e| format!("create {fname}: {e}"))?;
1084            let mut wtr = csv::WriterBuilder::new().delimiter(delim).from_writer(file);
1085
1086            // header
1087            wtr.write_record(["word1", "word2", "distance", "count", "pmi"])
1088                .map_err(|e| e.to_string())?;
1089
1090            for r in rows {
1091                wtr.write_record([
1092                    csv_safe_cell(r.word1.clone()),
1093                    csv_safe_cell(r.word2.clone()),
1094                    r.distance.to_string(),
1095                    r.count.to_string(),
1096                    format!("{:.6}", r.pmi),
1097                ])
1098                .map_err(|e| e.to_string())?;
1099            }
1100            wtr.flush().map_err(|e| e.to_string())?;
1101        }
1102        ExportFormat::Json => {
1103            let v: Vec<_> = rows
1104                .iter()
1105                .map(|r| {
1106                    serde_json::json!({
1107                        "word1": r.word1,
1108                        "word2": r.word2,
1109                        "distance": r.distance,
1110                        "count": r.count,
1111                        "pmi": r.pmi
1112                    })
1113                })
1114                .collect();
1115            std::fs::write(&fname, serde_json::to_string_pretty(&v).unwrap())
1116                .map_err(|e| format!("write {fname}: {e}"))?;
1117        }
1118        ExportFormat::Txt => unreachable!(),
1119    }
1120    Ok(())
1121}
1122
1123// ---------- Utilities ----------
1124
1125/// Build a human-readable summary for debug/logging.
1126fn summary_for(pairs: &[(String, &AnalysisResult)], _opts: &AnalysisOptions) -> String {
1127    // STDOUT summary is tuned for usefulness:
1128    // 1) Top 20 N-grams (sorted by count desc, then key lex asc)
1129    // 2) Top 20 PMI pairs (sorted by count desc, then PMI desc, then words lex)
1130    // 3) Top 20 words (sorted by count desc, then key lex asc)
1131    //
1132    // This order surfaces more informative signals before common stopwords.
1133    let mut s = String::new();
1134    s.push_str("=== Analysis Summary ===\n");
1135
1136    for (name, r) in pairs {
1137        s.push_str(&format!("\n# {}\n", name));
1138
1139        // ---- Top 20 N-grams ----
1140        s.push_str("Top 20 n-grams:\n");
1141        let mut ngram_items: Vec<(&String, &usize)> = r.ngrams.iter().collect();
1142        ngram_items.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
1143        for (ng, c) in ngram_items.into_iter().take(20) {
1144            s.push_str(&format!("  {}\t{}\n", ng, c));
1145        }
1146
1147        // ---- Top 20 PMI ----
1148        s.push_str("Top 20 PMI (by count, then PMI):\n");
1149        let mut pmi_rows: Vec<&PmiEntry> = r.pmi.iter().collect();
1150        pmi_rows.sort_by(|a, b| {
1151            b.count
1152                .cmp(&a.count)
1153                .then_with(|| {
1154                    b.pmi
1155                        .partial_cmp(&a.pmi)
1156                        .unwrap_or(std::cmp::Ordering::Equal)
1157                })
1158                .then_with(|| a.word1.cmp(&b.word1))
1159                .then_with(|| a.word2.cmp(&b.word2))
1160        });
1161        for p in pmi_rows.into_iter().take(20) {
1162            s.push_str(&format!(
1163                "  ({}, {}) @d={}  count={}  PMI={:.3}\n",
1164                p.word1, p.word2, p.distance, p.count, p.pmi
1165            ));
1166        }
1167
1168        // ---- Top 20 words ----
1169        s.push_str("Top 20 words:\n");
1170        let mut wf_items: Vec<(&String, &usize)> = r.wordfreq.iter().collect();
1171        wf_items.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
1172        for (w, c) in wf_items.into_iter().take(20) {
1173            s.push_str(&format!("  {}\t{}\n", w, c));
1174        }
1175    }
1176
1177    s
1178}
1179
1180/// A short timestamp used in output filenames.
1181fn timestamp() -> String {
1182    Local::now().format("%Y%m%d_%H%M%S").to_string()
1183}
1184
1185/// File extension for an export format.
1186fn ext(fmt: ExportFormat) -> &'static str {
1187    match fmt {
1188        ExportFormat::Txt => "txt",
1189        ExportFormat::Csv => "csv",
1190        ExportFormat::Tsv => "tsv",
1191        ExportFormat::Json => "json",
1192    }
1193}
1194
1195/// Collision-safe stem used in output filenames: "<stem[.ext]>_<hash8>".
1196/// The hash is a stable hash of the full path to avoid collisions across parallel runs.
1197pub fn stem_for(p: &Path) -> String {
1198    let stem = p.file_stem().and_then(|s| s.to_str()).unwrap_or("file");
1199    let ext = p.extension().and_then(|s| s.to_str()).unwrap_or("");
1200    let h = short_hash(p);
1201    if ext.is_empty() {
1202        format!("{stem}_{h}")
1203    } else {
1204        format!("{stem}.{ext}_{h}")
1205    }
1206}
1207
/// Hash a path's lossy string form with the stdlib `DefaultHasher` and
/// render it as lowercase hex, zero-padded to at least 8 digits.
fn short_hash<P: AsRef<Path>>(p: P) -> String {
    use std::collections::hash_map::DefaultHasher;
    let mut state = DefaultHasher::new();
    p.as_ref().to_string_lossy().hash(&mut state);
    format!("{:08x}", state.finish())
}
1214
1215/// Detect a supported stemming language. Returns `None` if undetected or unsupported.
1216fn detect_supported_stem_lang(text: &str) -> Option<StemLang> {
1217    let info = whatlang::detect(text)?;
1218    let sl = StemLang::from_whatlang(info.lang());
1219    if make_stemmer(sl).is_some() {
1220        Some(sl)
1221    } else {
1222        None
1223    }
1224}
1225
/// Neutralize spreadsheet formula injection in a **text** cell.
///
/// Spreadsheet software (Excel/LibreOffice) may interpret cells beginning
/// with `=`, `+`, `-`, or `@` as formulas; OWASP's CSV-injection guidance
/// additionally flags leading tab (0x09) and carriage return (0x0D).
/// Prefixing a single quote forces the cell to be read as plain text.
///
/// Apply only to text cells — numeric cells (counts, distances) are
/// written verbatim by the table writers.
pub fn csv_safe_cell(mut s: String) -> String {
    if matches!(
        s.chars().next(),
        Some('=' | '+' | '-' | '@' | '\t' | '\r')
    ) {
        s.insert(0, '\'');
    }
    s
}