use std::fmt;
use serde::{Deserialize, Serialize};
/// A single unit of analyzed text produced by a tokenizer.
///
/// Besides the text itself, a token records where it sits in the token
/// stream (`position`) and in the original input (`start_offset` /
/// `end_offset`), plus scoring and analysis state.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Token {
/// The token text.
pub text: String,
/// Ordinal position of this token in the token stream.
pub position: usize,
/// Byte offset in the source text where the token starts (0 if unset).
pub start_offset: usize,
/// Byte offset in the source text where the token ends (0 if unset).
pub end_offset: usize,
/// Scoring boost for this token; defaults to 1.0 (neutral).
pub boost: f32,
/// True when the token has been flagged as a stop word.
pub stopped: bool,
/// Optional analysis metadata (original text, token type, language, attributes).
pub metadata: Option<TokenMetadata>,
/// Defaults to 1. NOTE(review): presumably the position advance over the
/// previous token (Lucene-style); confirm against the analyzers that set it.
pub position_increment: usize,
/// Defaults to 1. NOTE(review): presumably the number of positions this
/// token spans (e.g. multi-word synonyms); confirm against callers.
pub position_length: usize,
}
/// Broad classification assigned to a token during analysis.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum TokenType {
/// Alphanumeric word.
Alphanum,
/// Numeric token.
Num,
/// CJK (Chinese/Japanese/Korean ideograph) token.
Cjk,
/// Japanese katakana token.
Katakana,
/// Japanese hiragana token.
Hiragana,
/// Korean hangul token.
Hangul,
/// Punctuation character(s).
Punctuation,
/// Whitespace run.
Whitespace,
/// Token injected by synonym expansion.
Synonym,
/// Email address.
Email,
/// URL.
Url,
/// Anything that does not fit the categories above.
Other,
}
/// Optional per-token metadata attached by analyzers.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct TokenMetadata {
/// The token's pre-transformation text, as recorded by
/// [`Token::with_original_text`].
pub original_text: Option<String>,
/// Classified type of the token, if known.
pub token_type: Option<TokenType>,
/// Language tag associated with the token, if any.
pub language: Option<String>,
/// Free-form string attributes keyed by name.
pub attributes: std::collections::HashMap<String, String>,
}
impl Token {
    /// Builds a token carrying `text` at the given stream `position`.
    ///
    /// Offsets start at zero, boost at 1.0, and no metadata is attached.
    pub fn new<S: Into<String>>(text: S, position: usize) -> Self {
        Self {
            text: text.into(),
            position,
            boost: 1.0,
            stopped: false,
            metadata: None,
            start_offset: 0,
            end_offset: 0,
            position_increment: 1,
            position_length: 1,
        }
    }

    /// Builds a token that also records its byte offsets in the source text.
    pub fn with_offsets<S: Into<String>>(
        text: S,
        position: usize,
        start_offset: usize,
        end_offset: usize,
    ) -> Self {
        Self {
            start_offset,
            end_offset,
            ..Self::new(text, position)
        }
    }

    /// Length of the token text in bytes (not characters).
    pub fn len(&self) -> usize {
        self.text.len()
    }

    /// True when the token text is empty.
    pub fn is_empty(&self) -> bool {
        self.text.is_empty()
    }

    /// Consumes the token and returns it with the given boost applied.
    pub fn with_boost(mut self, boost: f32) -> Self {
        self.boost = boost;
        self
    }

    /// Marks the token as stopped (e.g. it matched a stop-word list).
    pub fn stop(mut self) -> Self {
        self.stopped = true;
        self
    }

    /// Whether the token has been marked as stopped.
    pub fn is_stopped(&self) -> bool {
        self.stopped
    }

    /// Attaches `metadata`, replacing any metadata already present.
    pub fn with_metadata(mut self, metadata: TokenMetadata) -> Self {
        self.metadata = Some(metadata);
        self
    }

    /// Borrows the attached metadata, if any.
    pub fn metadata(&self) -> Option<&TokenMetadata> {
        self.metadata.as_ref()
    }

    /// Mutably borrows the attached metadata, if any.
    pub fn metadata_mut(&mut self) -> Option<&mut TokenMetadata> {
        self.metadata.as_mut()
    }

    /// Records the token's pre-analysis text, creating metadata on demand.
    pub fn with_original_text<S: Into<String>>(mut self, original: S) -> Self {
        self.metadata
            .get_or_insert_with(TokenMetadata::new)
            .original_text = Some(original.into());
        self
    }

    /// Records the token's type, creating metadata on demand.
    pub fn with_token_type(mut self, token_type: TokenType) -> Self {
        self.metadata
            .get_or_insert_with(TokenMetadata::new)
            .token_type = Some(token_type);
        self
    }

    /// Returns a copy of this token with only the text replaced.
    pub fn with_text<S: Into<String>>(&self, text: S) -> Self {
        Self {
            text: text.into(),
            ..self.clone()
        }
    }

    /// Returns a copy of this token with only the position replaced.
    pub fn with_position(&self, position: usize) -> Self {
        Self {
            position,
            ..self.clone()
        }
    }

    /// Consumes the token and sets its position increment.
    pub fn with_position_increment(mut self, increment: usize) -> Self {
        self.position_increment = increment;
        self
    }

    /// Consumes the token and sets its position length.
    pub fn with_position_length(mut self, length: usize) -> Self {
        self.position_length = length;
        self
    }
}
impl TokenMetadata {
pub fn new() -> Self {
TokenMetadata {
original_text: None,
token_type: None,
language: None,
attributes: std::collections::HashMap::new(),
}
}
pub fn set_attribute<K, V>(&mut self, key: K, value: V)
where
K: Into<String>,
V: Into<String>,
{
self.attributes.insert(key.into(), value.into());
}
pub fn get_attribute(&self, key: &str) -> Option<&str> {
self.attributes.get(key).map(|s| s.as_str())
}
}
/// The default metadata is empty, identical to [`TokenMetadata::new`].
impl Default for TokenMetadata {
fn default() -> Self {
Self::new()
}
}
impl fmt::Display for Token {
    /// Displays the token as its bare text, with no positional decoration.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(&self.text)
    }
}
/// A boxed, sendable iterator of [`Token`]s — the output of an analysis chain.
pub type TokenStream = Box<dyn Iterator<Item = Token> + Send>;
/// Conversion of a token collection into a [`TokenStream`].
pub trait IntoTokenStream {
/// Consumes `self` and yields its tokens as a boxed stream.
fn into_token_stream(self) -> TokenStream;
}
/// A `Vec<Token>` streams its tokens in order by boxing its owning iterator.
impl IntoTokenStream for Vec<Token> {
fn into_token_stream(self) -> TokenStream {
Box::new(self.into_iter())
}
}
#[cfg(test)]
mod tests {
use super::*;
// `Token::new` must apply every documented default (zero offsets,
// boost 1.0, not stopped, no metadata).
#[test]
fn test_token_creation() {
let token = Token::new("hello", 0);
assert_eq!(token.text, "hello");
assert_eq!(token.position, 0);
assert_eq!(token.start_offset, 0);
assert_eq!(token.end_offset, 0);
assert_eq!(token.boost, 1.0);
assert!(!token.stopped);
assert!(token.metadata.is_none());
}
// `with_offsets` must store the offsets verbatim alongside text/position.
#[test]
fn test_token_with_offsets() {
let token = Token::with_offsets("world", 1, 6, 11);
assert_eq!(token.text, "world");
assert_eq!(token.position, 1);
assert_eq!(token.start_offset, 6);
assert_eq!(token.end_offset, 11);
}
// Builder methods chain, and the metadata-creating builders lazily
// allocate a single `TokenMetadata` shared by both calls.
#[test]
fn test_token_methods() {
let token = Token::new("test", 0)
.with_boost(2.0)
.stop()
.with_original_text("TEST")
.with_token_type(TokenType::Alphanum);
assert_eq!(token.boost, 2.0);
assert!(token.is_stopped());
assert!(token.metadata.is_some());
let metadata = token.metadata.as_ref().unwrap();
assert_eq!(metadata.original_text.as_deref(), Some("TEST"));
assert_eq!(metadata.token_type, Some(TokenType::Alphanum));
}
// Attribute round-trip: set then get; missing keys yield `None`.
#[test]
fn test_token_metadata() {
let mut metadata = TokenMetadata::new();
metadata.set_attribute("custom", "value");
assert_eq!(metadata.get_attribute("custom"), Some("value"));
assert_eq!(metadata.get_attribute("missing"), None);
}
// `Display` renders only the token text.
#[test]
fn test_token_display() {
let token = Token::new("hello", 0);
assert_eq!(format!("{token}"), "hello");
}
// `Vec<Token>` streams its tokens in original order.
#[test]
fn test_token_stream() {
let tokens = vec![Token::new("hello", 0), Token::new("world", 1)];
let stream = tokens.into_token_stream();
let collected: Vec<_> = stream.collect();
assert_eq!(collected.len(), 2);
assert_eq!(collected[0].text, "hello");
assert_eq!(collected[1].text, "world");
}
}