//! Crate root: declares the crate's public module tree and flattens the most
//! commonly used types and functions into the crate namespace via `pub use`
//! re-exports. No logic lives in this file; see the individual modules for
//! implementations.

// Crate-wide lint policy.
// NOTE(review): these blanket Clippy `allow`s suppress the lints everywhere;
// consider scoping them to the specific modules that trigger them.
#![allow(clippy::manual_strip)]
#![allow(clippy::needless_range_loop)]
#![allow(clippy::if_same_then_else)]
#![allow(clippy::cloned_ref_to_slice_refs)]
// NOTE(review): `dead_code` allowed crate-wide — presumably some items are
// exercised only by downstream crates or tests; confirm before removing.
#![allow(dead_code)]
// Every public item is expected to carry rustdoc (the `//!` docs above keep
// the crate root itself compliant with this lint).
#![warn(missing_docs)]

// ---------------------------------------------------------------------------
// Public module tree (alphabetical within each blank-line-separated group).
// NOTE(review): the grouping below mirrors the blank-line separation already
// present in this file; the rationale for the three groups is not stated
// anywhere visible here — confirm before documenting it further.
// ---------------------------------------------------------------------------

pub mod batch_tokenizer;
pub mod classification;
pub mod cleansing;
pub mod distance;
pub mod domain_processors;
pub mod embeddings;
pub mod enhanced_vectorize;
pub mod error;
pub mod evaluation;
pub mod gpt_bpe;
pub mod huggingface_compat;
pub mod information_extraction;
pub mod language_model;
pub mod lemmatization;
pub mod ml_integration;
pub mod ml_sentiment;
pub mod model_registry;
pub mod multilingual;
pub mod neural_architectures;
pub mod parallel;
pub mod paraphrasing;
pub mod performance;
pub mod pipeline;
pub mod pos_tagging;
pub mod preprocess;
pub mod semantic_similarity;
pub mod sentencepiece;
pub mod sentiment;
pub mod simd_ops;
pub mod sparse;
pub mod sparse_vectorize;
pub mod spelling;
pub mod stemming;
pub mod streaming;
pub mod string_metrics;
pub mod summarization;
pub mod text_coordinator;
pub mod text_statistics;
pub mod token_filter;
pub mod tokenize;
pub mod tokenizer;
pub mod topic_coherence;
pub mod topic_modeling;
pub mod transformer;
pub mod utils;
pub mod vectorize;
pub mod visualization;
pub mod vocabulary;
pub mod weighted_distance;

pub mod keyword_extraction;
pub mod language_detection;
pub mod named_entity_recognition;
pub mod text_similarity;
pub mod text_summarization;

pub mod bert_finetune;
pub mod crosslingual;
pub mod ctm;
pub mod dtm;
pub mod sentence_embeddings;
pub mod similarity;
pub mod tokenizers;
pub mod transliteration;

// ---------------------------------------------------------------------------
// Flat re-exports: promote frequently used items to the crate root so callers
// can write `crate_name::Foo` instead of `crate_name::module::Foo`.
//
// Name collisions between modules are resolved with `as` aliases (e.g.
// `HfTextClassificationPipeline`, `NeuralLayerNorm`,
// `NeuralMultiHeadAttention`, `LdaTopic`). Not every module is flattened;
// items from modules absent below (e.g. `evaluation`, `gpt_bpe`, `utils`)
// are reachable only via their full module path.
// ---------------------------------------------------------------------------
pub use classification::{
    cross_validate_nb, BernoulliNaiveBayes, CrossValidationResult, FeatureHasher, FoldResult,
    MultiLabelClassifier, MultiLabelPrediction, MultinomialNaiveBayes, TextClassificationMetrics,
    TextClassificationPipeline, TextDataset, TextFeatureSelector, TfidfCosineClassifier,
};
pub use cleansing::{
    expand_contractions, normalize_currencies, normalize_numbers, normalize_ordinals,
    normalize_percentages, normalize_unicode, normalize_whitespace, remove_accents, replace_emails,
    replace_urls, strip_html_tags, AdvancedTextCleaner,
};
pub use distance::{cosine_similarity, jaccard_similarity, levenshtein_distance};
pub use domain_processors::{
    Domain, DomainProcessorConfig, FinancialTextProcessor, LegalTextProcessor,
    MedicalTextProcessor, NewsTextProcessor, PatentTextProcessor, ProcessedDomainText,
    ScientificTextProcessor, SocialMediaTextProcessor, UnifiedDomainProcessor,
};
pub use embeddings::{
    embedding_cosine_similarity,
    fasttext::{FastText, FastTextConfig},
    glove::{CooccurrenceMatrix, GloVe, GloVeTrainer, GloVeTrainerConfig},
    pairwise_similarity, Word2Vec, Word2VecAlgorithm, Word2VecConfig, WordEmbedding,
};
pub use enhanced_vectorize::{EnhancedCountVectorizer, EnhancedTfidfVectorizer};
pub use error::{Result, TextError};
// `TextClassificationPipeline` is aliased here to avoid clashing with the
// `classification` export of the same name above.
pub use huggingface_compat::{
    ClassificationResult, FeatureExtractionPipeline, FillMaskPipeline, FillMaskResult,
    FormatConverter, HfConfig, HfEncodedInput, HfHub, HfModelAdapter, HfPipeline, HfTokenizer,
    HfTokenizerConfig, QuestionAnsweringPipeline, QuestionAnsweringResult,
    TextClassificationPipeline as HfTextClassificationPipeline, ZeroShotClassificationPipeline,
};
pub use information_extraction::{
    AdvancedExtractedInformation, AdvancedExtractionPipeline, ConfidenceScorer, CoreferenceChain,
    CoreferenceMention, CoreferenceResolver, DocumentInformationExtractor, DocumentSummary, Entity,
    EntityCluster, EntityLinker, EntityType, Event, ExtractedInformation,
    InformationExtractionPipeline, KeyPhraseExtractor, KnowledgeBaseEntry, LinkedEntity,
    MentionType, PatternExtractor, Relation, RelationExtractor, RuleBasedNER,
    StructuredDocumentInformation, TemporalExtractor, Topic,
};
pub use language_model::{NgramModel, SmoothingMethod};
pub use lemmatization::{Lemmatizer, RuleBasedLemmatizer, WordNetLemmatizer};
pub use ml_integration::{
    BatchTextProcessor, FeatureExtractionMode, MLTextPreprocessor, TextFeatures, TextMLPipeline,
};
pub use ml_sentiment::{
    ClassMetrics, EvaluationMetrics, MLSentimentAnalyzer, MLSentimentConfig, TrainingMetrics,
};
pub use model_registry::{
    ModelMetadata, ModelRegistry, ModelType, PrebuiltModels, RegistrableModel,
    SerializableModelData,
};
pub use multilingual::{
    Language, LanguageDetectionResult, LanguageDetector, MultilingualProcessor, ProcessedText,
    StopWords,
};
// `LayerNorm` and `MultiHeadAttention` are aliased to avoid clashing with the
// `transformer` exports of the same names below.
pub use neural_architectures::{
    ActivationFunction, AdditiveAttention, BiLSTM, CNNLSTMHybrid, Conv1D, CrossAttention, Dropout,
    GRUCell, LSTMCell, LayerNorm as NeuralLayerNorm, MaxPool1D,
    MultiHeadAttention as NeuralMultiHeadAttention, MultiScaleCNN, PositionwiseFeedForward,
    ResidualBlock1D, SelfAttention, TextCNN,
};
pub use parallel::{
    ParallelCorpusProcessor, ParallelTextProcessor, ParallelTokenizer, ParallelVectorizer,
};
pub use paraphrasing::{ParaphraseConfig, ParaphraseResult, ParaphraseStrategy, Paraphraser};
pub use performance::{
    AdvancedPerformanceMonitor, DetailedPerformanceReport, OptimizationRecommendation,
    PerformanceSummary, PerformanceThresholds,
};
pub use pipeline::{
    basic_pipeline, lemmatization_pipeline, ngram_pipeline, stemming_pipeline, BatchProcessor,
    NlpPipeline, PipelineBuilder, PipelineStep,
};
pub use pos_tagging::{
    PosAwareLemmatizer, PosTagResult, PosTagger, PosTaggerConfig, PosTaggingResult,
};
pub use preprocess::{BasicNormalizer, BasicTextCleaner, TextCleaner, TextNormalizer};
pub use semantic_similarity::{
    LcsSimilarity, SemanticSimilarityEnsemble, SoftCosineSimilarity, WeightedJaccard,
    WordMoversDistance,
};
pub use sentiment::{
    aggregate_sentiment, analyze_and_aggregate, AggregatedSentiment, AspectSentiment,
    AspectSentimentAnalyzer, LexiconSentimentAnalyzer, NaiveBayesSentiment,
    RuleBasedSentimentAnalyzer, Sentiment, SentimentLexicon, SentimentResult, SentimentRules,
    SentimentWordCounts, VaderResult, VaderSentimentAnalyzer,
};
pub use simd_ops::{
    AdvancedSIMDTextProcessor, SimdEditDistance, SimdStringOps, SimdTextAnalyzer,
    TextProcessingResult,
};
pub use sparse::{CsrMatrix, DokMatrix, SparseMatrixBuilder, SparseVector};
pub use sparse_vectorize::{
    sparse_cosine_similarity, MemoryStats, SparseCountVectorizer, SparseTfidfVectorizer,
};
// NOTE(review): `spelling::NGramModel` and `language_model::NgramModel` differ
// only in capitalization — easy to confuse at the crate root.
pub use spelling::{
    DictionaryCorrector, DictionaryCorrectorConfig, EditOp, ErrorModel, NGramModel,
    SpellingCorrector, StatisticalCorrector, StatisticalCorrectorConfig,
};
pub use stemming::{
    LancasterStemmer, LemmatizerConfig, PorterStemmer, PosTag, RuleLemmatizer,
    RuleLemmatizerBuilder, SimpleLemmatizer, SnowballStemmer, Stemmer,
};
pub use streaming::{
    AdvancedStreamingMetrics, AdvancedStreamingProcessor, ChunkedCorpusReader, MemoryMappedCorpus,
    ProgressTracker, StreamingTextProcessor, StreamingVectorizer,
};
pub use string_metrics::{
    AlignmentResult, DamerauLevenshteinMetric, Metaphone, NeedlemanWunsch, Nysiis,
    PhoneticAlgorithm, SmithWaterman, Soundex, StringMetric,
};
pub use summarization::{CentroidSummarizer, KeywordExtractor, TextRank};
pub use text_coordinator::{
    AdvancedBatchClassificationResult, AdvancedSemanticSimilarityResult, AdvancedTextConfig,
    AdvancedTextCoordinator, AdvancedTextResult, AdvancedTopicModelingResult,
};
pub use text_statistics::{ReadabilityMetrics, TextMetrics, TextStatistics};
pub use token_filter::{
    CompositeFilter, CustomFilter, FrequencyFilter, LengthFilter, RegexFilter, StopwordsFilter,
    TokenFilter,
};
pub use tokenize::{
    bpe::{BpeConfig, BpeTokenizer, BpeVocabulary},
    CharacterTokenizer, NgramTokenizer, RegexTokenizer, SentenceTokenizer, Tokenizer,
    WhitespaceTokenizer, WordTokenizer,
};
pub use tokenizer::{
    BPETokenizer, SimpleCharTokenizer, SimpleWhitespaceTokenizer, TransformerTokenizer,
    WordPieceTokenizer,
};
pub use topic_coherence::{TopicCoherence, TopicDiversity};
// `Topic` is aliased because `information_extraction::Topic` is exported above.
pub use topic_modeling::{
    LatentDirichletAllocation, LdaBuilder, LdaConfig, LdaLearningMethod, Topic as LdaTopic,
};
pub use transformer::{
    FeedForward, LayerNorm, MultiHeadAttention, PositionalEncoding, TokenEmbedding,
    TransformerConfig, TransformerDecoder, TransformerDecoderLayer, TransformerEncoder,
    TransformerEncoderLayer, TransformerModel,
};
pub use vectorize::{CountVectorizer, TfidfVectorizer, Vectorizer};
pub use visualization::{
    AttentionVisualizer, Color, ColorScheme, EmbeddingVisualizer, SentimentVisualizer,
    TextAnalyticsDashboard, TopicVisualizer, VisualizationConfig, WordCloud,
};
pub use vocabulary::Vocabulary;
pub use weighted_distance::{
    DamerauLevenshteinWeights, LevenshteinWeights, WeightedDamerauLevenshtein, WeightedLevenshtein,
    WeightedStringMetric,
};

pub use keyword_extraction::{
    extract_keywords, Keyword, KeywordMethod, RakeKeywordExtractor, TextRankKeywordExtractor,
    TfIdfKeywordExtractor,
};
pub use language_detection::{
    detect_language, detect_language_with_strategy, DetectedLanguage, DetectionStrategy,
    LanguageDetectionOutput,
};
pub use named_entity_recognition::{extract_entities, NerEntity, NerEntityType, NerPatternConfig};
pub use text_similarity::{
    bm25_score, char_ngram_jaccard_similarity, edit_distance_similarity, jaccard_token_similarity,
    text_similarity, tfidf_cosine_similarity, Bm25Config, Bm25Scorer, SimilarityMethod,
    SimilarityResult, TfIdfCosineSimilarity,
};
pub use text_summarization::{
    score_position, score_textrank, score_tfidf, summarize, ScoredSentence, SummarizationMethod,
};