// Crate-wide lint tuning.
// NOTE(review): each `allow` below silences a clippy/style lint for the entire
// crate without an explanation; prefer fixing the offending code or scoping
// the allow to the specific modules that need it, with a justification.
#![allow(clippy::manual_strip)]
#![allow(clippy::needless_range_loop)]
#![allow(clippy::if_same_then_else)]
#![allow(clippy::cloned_ref_to_slice_refs)]
// NOTE(review): crate-wide `dead_code` allow can mask genuinely unused items.
#![allow(dead_code)]
// Warn when any public item lacks a doc comment.
#![warn(missing_docs)]
110
// Public module tree (kept in alphabetical order). The `pub use` re-exports
// later in this file flatten the most commonly used items into the crate root.
pub mod classification;
pub mod cleansing;
pub mod distance;
pub mod domain_processors;
pub mod embeddings;
pub mod enhanced_vectorize;
pub mod error;
pub mod huggingface_compat;
pub mod information_extraction;
pub mod language_model;
pub mod ml_integration;
pub mod ml_sentiment;
pub mod model_registry;
pub mod multilingual;
pub mod neural_architectures;
pub mod parallel;
pub mod paraphrasing;
pub mod performance;
pub mod pos_tagging;
pub mod preprocess;
pub mod semantic_similarity;
pub mod sentiment;
pub mod simd_ops;
pub mod sparse;
pub mod sparse_vectorize;
pub mod spelling;
pub mod stemming;
pub mod streaming;
pub mod string_metrics;
pub mod summarization;
pub mod text_coordinator;
pub mod text_statistics;
pub mod token_filter;
pub mod tokenize;
pub mod topic_coherence;
pub mod topic_modeling;
pub mod transformer;
pub mod utils;
pub mod vectorize;
pub mod visualization;
pub mod vocabulary;
pub mod weighted_distance;
153
154pub use classification::{
156 TextClassificationMetrics, TextClassificationPipeline, TextDataset, TextFeatureSelector,
157};
// Flattened re-export of the text-cleaning helpers so callers can use e.g.
// `normalize_whitespace` directly from the crate root.
pub use cleansing::{
    expand_contractions, normalize_currencies, normalize_numbers, normalize_ordinals,
    normalize_percentages, normalize_unicode, normalize_whitespace, remove_accents, replace_emails,
    replace_urls, strip_html_tags, AdvancedTextCleaner,
};
163pub use distance::{cosine_similarity, jaccard_similarity, levenshtein_distance};
164pub use domain_processors::{
165 Domain, DomainProcessorConfig, FinancialTextProcessor, LegalTextProcessor,
166 MedicalTextProcessor, NewsTextProcessor, PatentTextProcessor, ProcessedDomainText,
167 ScientificTextProcessor, SocialMediaTextProcessor, UnifiedDomainProcessor,
168};
169pub use embeddings::{
170 fasttext::{FastText, FastTextConfig},
171 glove::GloVe,
172 Word2Vec, Word2VecAlgorithm, Word2VecConfig,
173};
174pub use enhanced_vectorize::{EnhancedCountVectorizer, EnhancedTfidfVectorizer};
175pub use error::{Result, TextError};
// HuggingFace compatibility layer. `TextClassificationPipeline` is re-exported
// under an `Hf` prefix to avoid clashing with the identically named type
// re-exported from the `classification` module.
pub use huggingface_compat::{
    ClassificationResult, FeatureExtractionPipeline, FillMaskPipeline, FillMaskResult,
    FormatConverter, HfConfig, HfEncodedInput, HfHub, HfModelAdapter, HfPipeline, HfTokenizer,
    HfTokenizerConfig, QuestionAnsweringPipeline, QuestionAnsweringResult,
    TextClassificationPipeline as HfTextClassificationPipeline, ZeroShotClassificationPipeline,
};
182pub use information_extraction::{
183 AdvancedExtractedInformation, AdvancedExtractionPipeline, ConfidenceScorer, CoreferenceChain,
184 CoreferenceMention, CoreferenceResolver, DocumentInformationExtractor, DocumentSummary, Entity,
185 EntityCluster, EntityLinker, EntityType, Event, ExtractedInformation,
186 InformationExtractionPipeline, KeyPhraseExtractor, KnowledgeBaseEntry, LinkedEntity,
187 MentionType, PatternExtractor, Relation, RelationExtractor, RuleBasedNER,
188 StructuredDocumentInformation, TemporalExtractor, Topic,
189};
190pub use language_model::{NgramModel, SmoothingMethod};
191pub use ml_integration::{
192 BatchTextProcessor, FeatureExtractionMode, MLTextPreprocessor, TextFeatures, TextMLPipeline,
193};
194pub use ml_sentiment::{
195 ClassMetrics, EvaluationMetrics, MLSentimentAnalyzer, MLSentimentConfig, TrainingMetrics,
196};
197pub use model_registry::{
198 ModelMetadata, ModelRegistry, ModelType, PrebuiltModels, RegistrableModel,
199 SerializableModelData,
200};
201pub use multilingual::{
202 Language, LanguageDetectionResult, LanguageDetector, MultilingualProcessor, ProcessedText,
203 StopWords,
204};
// Neural-network building blocks. `LayerNorm` and `MultiHeadAttention` are
// aliased with a `Neural` prefix to avoid clashing with the same-named types
// re-exported from the `transformer` module.
pub use neural_architectures::{
    ActivationFunction, AdditiveAttention, BiLSTM, CNNLSTMHybrid, Conv1D, CrossAttention, Dropout,
    GRUCell, LSTMCell, LayerNorm as NeuralLayerNorm, MaxPool1D,
    MultiHeadAttention as NeuralMultiHeadAttention, MultiScaleCNN, PositionwiseFeedForward,
    ResidualBlock1D, SelfAttention, TextCNN,
};
211pub use parallel::{
212 ParallelCorpusProcessor, ParallelTextProcessor, ParallelTokenizer, ParallelVectorizer,
213};
214pub use paraphrasing::{ParaphraseConfig, ParaphraseResult, ParaphraseStrategy, Paraphraser};
215pub use performance::{
216 AdvancedPerformanceMonitor, DetailedPerformanceReport, OptimizationRecommendation,
217 PerformanceSummary, PerformanceThresholds,
218};
219pub use pos_tagging::{
220 PosAwareLemmatizer, PosTagResult, PosTagger, PosTaggerConfig, PosTaggingResult,
221};
222pub use preprocess::{BasicNormalizer, BasicTextCleaner, TextCleaner, TextNormalizer};
223pub use semantic_similarity::{
224 LcsSimilarity, SemanticSimilarityEnsemble, SoftCosineSimilarity, WeightedJaccard,
225 WordMoversDistance,
226};
227pub use sentiment::{
228 LexiconSentimentAnalyzer, RuleBasedSentimentAnalyzer, Sentiment, SentimentLexicon,
229 SentimentResult, SentimentRules, SentimentWordCounts,
230};
231pub use simd_ops::{
232 AdvancedSIMDTextProcessor, SimdEditDistance, SimdStringOps, SimdTextAnalyzer,
233 TextProcessingResult,
234};
235pub use sparse::{CsrMatrix, DokMatrix, SparseMatrixBuilder, SparseVector};
236pub use sparse_vectorize::{
237 sparse_cosine_similarity, MemoryStats, SparseCountVectorizer, SparseTfidfVectorizer,
238};
// Spelling-correction API.
// NOTE(review): `spelling::NGramModel` and `language_model::NgramModel` (also
// re-exported at the crate root) differ only in the casing of "Gram", which is
// easy to confuse; consider unifying on `NgramModel` (Rust acronym convention).
pub use spelling::{
    DictionaryCorrector, DictionaryCorrectorConfig, EditOp, ErrorModel, NGramModel,
    SpellingCorrector, StatisticalCorrector, StatisticalCorrectorConfig,
};
243pub use stemming::{
244 LancasterStemmer, LemmatizerConfig, PorterStemmer, PosTag, RuleLemmatizer,
245 RuleLemmatizerBuilder, SimpleLemmatizer, SnowballStemmer, Stemmer,
246};
247pub use streaming::{
248 AdvancedStreamingMetrics, AdvancedStreamingProcessor, ChunkedCorpusReader, MemoryMappedCorpus,
249 ProgressTracker, StreamingTextProcessor, StreamingVectorizer,
250};
251pub use string_metrics::{
252 AlignmentResult, DamerauLevenshteinMetric, Metaphone, NeedlemanWunsch, Nysiis,
253 PhoneticAlgorithm, SmithWaterman, Soundex, StringMetric,
254};
255pub use summarization::{CentroidSummarizer, KeywordExtractor, TextRank};
256pub use text_coordinator::{
257 AdvancedBatchClassificationResult, AdvancedSemanticSimilarityResult, AdvancedTextConfig,
258 AdvancedTextCoordinator, AdvancedTextResult, AdvancedTopicModelingResult,
259};
260pub use text_statistics::{ReadabilityMetrics, TextMetrics, TextStatistics};
261pub use token_filter::{
262 CompositeFilter, CustomFilter, FrequencyFilter, LengthFilter, RegexFilter, StopwordsFilter,
263 TokenFilter,
264};
265pub use tokenize::{
266 bpe::{BpeConfig, BpeTokenizer, BpeVocabulary},
267 CharacterTokenizer, NgramTokenizer, RegexTokenizer, SentenceTokenizer, Tokenizer,
268 WhitespaceTokenizer, WordTokenizer,
269};
270pub use topic_coherence::{TopicCoherence, TopicDiversity};
// LDA topic modeling. `Topic` is aliased to `LdaTopic` to avoid clashing with
// the `Topic` type re-exported from `information_extraction`.
pub use topic_modeling::{
    LatentDirichletAllocation, LdaBuilder, LdaConfig, LdaLearningMethod, Topic as LdaTopic,
};
274pub use transformer::{
275 FeedForward, LayerNorm, MultiHeadAttention, PositionalEncoding, TokenEmbedding,
276 TransformerConfig, TransformerDecoder, TransformerDecoderLayer, TransformerEncoder,
277 TransformerEncoderLayer, TransformerModel,
278};
279pub use vectorize::{CountVectorizer, TfidfVectorizer, Vectorizer};
280pub use visualization::{
281 AttentionVisualizer, Color, ColorScheme, EmbeddingVisualizer, SentimentVisualizer,
282 TextAnalyticsDashboard, TopicVisualizer, VisualizationConfig, WordCloud,
283};
284pub use vocabulary::Vocabulary;
285pub use weighted_distance::{
286 DamerauLevenshteinWeights, LevenshteinWeights, WeightedDamerauLevenshtein, WeightedLevenshtein,
287 WeightedStringMetric,
288};