1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
// Copyright 2019 vtext developers // // Licensed under the Apache License, Version 2.0, // <http://apache.org/licenses/LICENSE-2.0>. This file may not be copied, // modified, or distributed except according to those terms. /*! # vtext NLP in Rust with Python bindings This package aims to provide a high performance toolkit for ingesting textual data for machine learning applications. ## Features - Tokenization: Regexp tokenizer, Unicode segmentation + language specific rules - Token counting: converting token counts to sparse matrices for use in machine learning libraries. Similar to `CountVectorizer` and `HashingVectorizer` in scikit-learn but will less broad functionality. - Levenshtein edit distance; Sørensen-Dice, Jaro, Jaro Winkler string similarities # Example A simple tokenization example can be found below, ```rust extern crate vtext; use vtext::tokenize::{VTextTokenizerParams,Tokenizer}; let tok = VTextTokenizerParams::default().lang("en").build().unwrap(); let tokens: Vec<&str> = tok.tokenize("Flights can't depart after 2:00 pm.").collect(); assert_eq!(tokens, vec!["Flights", "ca", "n't", "depart", "after", "2:00", "pm", "."]) ``` */ #![allow(non_snake_case)] pub mod errors; mod math; pub mod metrics; pub mod tokenize; pub mod tokenize_sentence; pub mod vectorize;