Crate vaporetto_tantivy
source ·Expand description
vaporetto_tantivy
Vaporetto Tokenizer for Tantivy
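Examples
The example below loads a zstd-compressed model, builds a tokenizer, and checks each token of a Japanese string. Note that offset_from and offset_to are UTF-8 byte offsets, not character indices (each character in this string occupies 3 bytes).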
use std::fs::File;
use std::io::{Read, BufReader};
use tantivy::tokenizer::{TokenStream, Tokenizer};
use vaporetto::Model;
use vaporetto_tantivy::VaporettoTokenizer;
let mut f = BufReader::new(File::open("model.zst").unwrap());
let mut decoder = ruzstd::StreamingDecoder::new(&mut f).unwrap();
let mut buff = vec![];
decoder.read_to_end(&mut buff).unwrap();
let model = Model::read(&mut buff.as_slice()).unwrap();
let mut tokenizer = VaporettoTokenizer::new(model, "DGR").unwrap();
let mut stream = tokenizer.token_stream("東京特許許可局");
let token = stream.next().unwrap();
assert_eq!(token.text, "東京");
assert_eq!(token.offset_from, 0);
assert_eq!(token.offset_to, 6);
assert_eq!(token.position, 0);
let token = stream.next().unwrap();
assert_eq!(token.text, "特許");
assert_eq!(token.offset_from, 6);
assert_eq!(token.offset_to, 12);
assert_eq!(token.position, 1);
let token = stream.next().unwrap();
assert_eq!(token.text, "許可");
assert_eq!(token.offset_from, 12);
assert_eq!(token.offset_to, 18);
assert_eq!(token.position, 2);
let token = stream.next().unwrap();
assert_eq!(token.text, "局");
assert_eq!(token.offset_from, 18);
assert_eq!(token.offset_to, 21);
assert_eq!(token.position, 3);
assert!(stream.next().is_none());
Structs
- VaporettoTokenizer: Tokenizes text using Vaporetto.