// punkt/lib.rs
1// Copyright 2016 rust-punkt developers
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9//! # Overview
10//!
11//! Implementation of Tibor Kiss' and Jan Strunk's Punkt algorithm for sentence
12//! tokenization. Results have been compared with small and large texts that have
13//! been tokenized using NLTK.
14//!
15//! # Training
16//!
17//! Training data can be provided to a `SentenceTokenizer` for better
18//! results. Data can be acquired manually by training with a `Trainer`,
19//! or using already compiled data from NLTK (example: `TrainingData::english()`).
20//!
21//! # Typical Usage
22//!
23//! The punkt algorithm allows you to derive all the necessary data to perform
24//! sentence tokenization from the document itself.
25//!
26//! ```
27//! # use punkt::params::Standard;
28//! # use punkt::{Trainer, TrainingData, SentenceTokenizer};
29//! #
30//! # let doc = "I bought $5.50 worth of apples from the store. I gave them to my dog when I came home.";
31//! let trainer: Trainer<Standard> = Trainer::new();
32//! let mut data = TrainingData::new();
33//!
34//! trainer.train(doc, &mut data);
35//!
36//! for s in SentenceTokenizer::<Standard>::new(doc, &data) {
37//! println!("{:?}", s);
38//! }
39//! ```
40//!
41//! `rust-punkt` also provides pretrained data that can be loaded for certain languages.
42//!
43//! ```
44//! # #![allow(unused_variables)]
45//! #
46//! # use punkt::TrainingData;
47//! #
48//! let data = TrainingData::english();
49//! ```
50//!
51//! `rust-punkt` also allows training data to be incrementally gathered.
52//!
53//! ```
54//! # use punkt::params::Standard;
55//! # use punkt::{Trainer, TrainingData, SentenceTokenizer};
56//! #
57//! # let docs = ["This is a sentence with a abbrev. in it."];
58//! let trainer: Trainer<Standard> = Trainer::new();
59//! let mut data = TrainingData::new();
60//!
61//! for d in docs.iter() {
62//! trainer.train(d, &mut data);
63//!
64//! for s in SentenceTokenizer::<Standard>::new(d, &data) {
65//! println!("{:?}", s);
66//! }
67//! }
68//! ```
69//!
70//! # Customization
71//!
//! `rust-punkt` exposes a number of traits to customize how the trainer, sentence tokenizer,
//! and internal tokenizers work. The default settings, which are nearly identical to the
//! ones available in the Python library, are available in `punkt::params::Standard`.
75//!
76//! To modify only how the trainer works:
77//!
78//! ```
79//! # use punkt::params::*;
80//! #
81//! struct MyParams;
82//!
83//! impl DefinesInternalPunctuation for MyParams {}
84//! impl DefinesNonPrefixCharacters for MyParams {}
85//! impl DefinesNonWordCharacters for MyParams {}
86//! impl DefinesPunctuation for MyParams {}
87//! impl DefinesSentenceEndings for MyParams {}
88//!
89//! impl TrainerParameters for MyParams {
90//! const ABBREV_LOWER_BOUND: f64 = 0.3;
91//! const ABBREV_UPPER_BOUND: f64 = 8f64;
92//! const IGNORE_ABBREV_PENALTY: bool = false;
93//! const COLLOCATION_LOWER_BOUND: f64 = 7.88;
94//! const SENTENCE_STARTER_LOWER_BOUND: f64 = 35f64;
95//! const INCLUDE_ALL_COLLOCATIONS: bool = false;
96//! const INCLUDE_ABBREV_COLLOCATIONS: bool = true;
97//! const COLLOCATION_FREQUENCY_LOWER_BOUND: f64 = 0.8f64;
98//! }
99//! ```
100//!
101//! To fully modify how everything works:
102//!
103//! ```
104//! # use punkt::params::*;
105//! #
106//! struct MyParams;
107//!
108//! impl DefinesSentenceEndings for MyParams {
109//! // const SENTENCE_ENDINGS: &'static Set<char> = &phf_set![...];
110//! }
111//!
112//! impl DefinesInternalPunctuation for MyParams {
113//! // const INTERNAL_PUNCTUATION: &'static Set<char> = &phf_set![...];
114//! }
115//!
116//! impl DefinesNonWordCharacters for MyParams {
117//! // const NONWORD_CHARS: &'static Set<char> = &phf_set![...];
118//! }
119//!
120//! impl DefinesPunctuation for MyParams {
121//! // const PUNCTUATION: &'static Set<char> = &phf_set![...];
122//! }
123//!
124//! impl DefinesNonPrefixCharacters for MyParams {
125//! // const NONPREFIX_CHARS: &'static Set<char> = &phf_set![...];
126//! }
127//!
128//! impl TrainerParameters for MyParams {
129//! // const ABBREV_LOWER_BOUND: f64 = ...;
130//! // const ABBREV_UPPER_BOUND: f64 = ...;
131//! // const IGNORE_ABBREV_PENALTY: bool = ...;
132//! // const COLLOCATION_LOWER_BOUND: f64 = ...;
133//! // const SENTENCE_STARTER_LOWER_BOUND: f64 = ...;
134//! // const INCLUDE_ALL_COLLOCATIONS: bool = ...;
135//! // const INCLUDE_ABBREV_COLLOCATIONS: bool = true;
136//! // const COLLOCATION_FREQUENCY_LOWER_BOUND: f64 = ...;
137//! }
138//! ```
139
140#![cfg_attr(test, feature(test))]
141#![feature(proc_macro_hygiene)]
142#![warn(missing_docs)]
143
144extern crate freqdist;
145extern crate num;
146extern crate phf;
147extern crate rustc_serialize;
148#[cfg(test)]
149extern crate test;
150#[cfg(test)]
151extern crate walkdir;
152
153mod trainer;
154mod util;
155mod token;
156mod tokenizer;
157mod prelude;
158
159pub use trainer::{Trainer, TrainingData};
160pub use tokenizer::{SentenceByteOffsetTokenizer, SentenceTokenizer};
161
162/// Contains traits for configuring all tokenizers, and the trainer. Also
163/// contains default parameters for tokenizers, and the trainer.
164pub mod params {
165 pub use prelude::{DefinesInternalPunctuation, DefinesNonPrefixCharacters,
166 DefinesNonWordCharacters, DefinesPunctuation, DefinesSentenceEndings, Set,
167 Standard, TrainerParameters};
168}
169
#[cfg(test)]
fn get_test_scenarios(dir_path: &str, raw_path: &str) -> Vec<(Vec<String>, String, String)> {
  // Walks `dir_path` (expected tokenization outcomes) and pairs each file with
  // the identically named raw article under `raw_path`. Returns, per file:
  // (expected sentences split on '\n', raw file contents, debug-formatted file name).
  use std::fs;
  use std::path::Path;

  use walkdir::WalkDir;

  let mut tests = Vec::new();

  for path in WalkDir::new(dir_path) {
    let entry = path.unwrap();
    let fpath = entry.path();

    if fpath.is_file() {
      // Files in the directory with raw articles must match the file names of
      // articles in the directory with test outcomes.
      let rawp = Path::new(raw_path).join(fpath.file_name().unwrap());

      // `fs::read_to_string` both opens and reads, and its error is surfaced
      // by `unwrap` — the previous code ignored the `Result` of
      // `read_to_string` (masked by `#![allow(unused_must_use)]`), so a read
      // failure silently produced empty/partial fixtures.
      let exp_strb = fs::read_to_string(fpath).unwrap();
      let raw_strb = fs::read_to_string(&rawp).unwrap();

      // Expected results, split by newlines.
      let exps: Vec<String> = exp_strb.split('\n').map(|s| s.to_string()).collect();

      tests.push((exps, raw_strb, format!("{:?}", fpath.file_name().unwrap())));
    }
  }

  tests // Returns (Expected cases, File contents, File name)
}
207}