Skip to main content

anno/preprocess/
mod.rs

1//! Preprocessing utilities for text normalization and morphological analysis.
2//!
3//! This module provides preprocessing support for:
4//! - Morphologically complex languages (polysynthetic, agglutinative)
5//! - Text normalization and cleaning
6//! - Script-specific handling (Cherokee syllabary, etc.)
7//! - **Parenthetical analysis** - aliases, abbreviations, temporal bounds
8//! - **Reference resolution** - URLs, citations, cross-references
9//!
10//! # Morphological Preprocessing
11//!
12//! For polysynthetic languages like Cherokee, Navajo, and Mohawk, standard
13//! word-level tokenization fails because a single word can encode an entire
14//! sentence. The [`morphology`] module provides segmentation strategies and
15//! the [`MorphologicalAnalyzer`] trait for integrating external analyzers.
16//!
17//! # Parenthetical Analysis
18//!
19//! The [`parenthetical`] module extracts entity information (aliases,
20//! abbreviations, temporal bounds) from parenthetical text:
21//!
22//! ```rust
23//! use anno::preprocess::parenthetical::{ParentheticalExtractor, ParentheticalType};
24//!
25//! let extractor = ParentheticalExtractor::new();
26//! let results = extractor.extract("The World Health Organization (WHO) announced guidelines.");
27//!
28//! assert_eq!(results[0].content, "WHO");
29//! assert_eq!(results[0].parenthetical_type, ParentheticalType::Abbreviation);
30//! ```
31//!
32//! # Reference Resolution
33//!
34//! The [`reference`] module detects URLs, citations, and cross-references
35//! that can be resolved to additional entity information:
36//!
37//! ```rust,ignore
38//! use anno::preprocess::reference::{ReferenceExtractor, ReferenceType};
39//!
40//! let extractor = ReferenceExtractor::new();
41//! let refs = extractor.extract("See https://en.wikipedia.org/wiki/Einstein");
42//!
43//! assert_eq!(refs[0].reference_type, ReferenceType::WikipediaUrl);
44//! ```
45//!
46//! # Integration with Coalesce and Tier
47//!
48//! These modules integrate with the entity resolution pipeline:
49//!
50//! - **Coalesce**: Parenthetical aliases help link "WHO" ↔ "World Health Organization"
51//! - **Tier**: Reference graphs create hierarchical entity relationships
52//!
53//! # Example
54//!
55//! ```rust
56//! use anno::preprocess::morphology::{MorphologicalPreprocessor, SegmentationStrategy};
57//!
58//! // For Quechua text whose morpheme boundaries are marked with '-' or '='
59//! let preprocessor = MorphologicalPreprocessor::new()
60//!     .with_strategy(SegmentationStrategy::RuleBased {
61//!         boundary_chars: vec!['-', '='],
62//!     });
63//!
64//! let result = preprocessor.segment("wasi-kuna-y-ki").unwrap();
65//! assert_eq!(result.morphemes.len(), 4); // wasi, kuna, y, ki
66//! ```
67
68pub mod apposition;
69pub mod morphology;
70pub mod parenthetical;
71pub mod reference;
72
73// Re-export commonly used types
74pub use morphology::{
75    cherokee_syllable_inventory, navajo_prefix_inventory, quechua_boundary_chars, Morpheme,
76    MorphemeType, MorphologicalAnalyzer, MorphologicalPreprocessor, ProdropConfig,
77    SegmentationResult, SegmentationStrategy,
78};
79
80pub use parenthetical::{
81    extract_aliases, AliasPair, Parenthetical, ParentheticalExtractor, ParentheticalType,
82};
83
84pub use reference::{
85    ExtractedEntity, Reference, ReferenceExtractor, ReferenceGraph, ReferenceType,
86    ResolvedReference,
87};
88
89pub use apposition::{extract_all_aliases, Apposition, AppositionExtractor, AppositionType};