Expand description

vaporetto_tantivy

Vaporetto Tokenizer for Tantivy

Examples

use std::fs::File;
use std::io::{Read, BufReader};

use tantivy::tokenizer::{TokenStream, Tokenizer};
use vaporetto::Model;
use vaporetto_tantivy::VaporettoTokenizer;

// Decompress the zstd-compressed model file fully into memory.
let mut model_file = BufReader::new(File::open("model.zst").unwrap());
let mut zstd_reader = ruzstd::StreamingDecoder::new(&mut model_file).unwrap();
let mut model_bytes = Vec::new();
zstd_reader.read_to_end(&mut model_bytes).unwrap();

// Parse the model from the decompressed bytes.
let mut model_slice = model_bytes.as_slice();
let model = Model::read(&mut model_slice).unwrap();

let mut tokenizer = VaporettoTokenizer::new(model, "DGR").unwrap();
let mut stream = tokenizer.token_stream("東京特許許可局");

// Expected tokens, in order: (text, offset_from, offset_to, position).
// Offsets are byte offsets: each character here occupies three bytes in UTF-8.
let expected = [
    ("東京", 0, 6, 0),
    ("特許", 6, 12, 1),
    ("許可", 12, 18, 2),
    ("局", 18, 21, 3),
];

for (text, offset_from, offset_to, position) in expected.iter() {
    let token = stream.next().unwrap();
    assert_eq!(token.text, *text);
    assert_eq!(token.offset_from, *offset_from);
    assert_eq!(token.offset_to, *offset_to);
    assert_eq!(token.position, *position);
}

// The stream must be exhausted after the last expected token.
assert!(stream.next().is_none());

Structs