pub struct TokenDictionary { /* private fields */ }Expand description
Token dictionary mapping token strings to unique integer IDs.
Token IDs are assigned as follows:
- IDs 0 to len_legalese-1: Reserved for legalese tokens (high-value words)
- IDs len_legalese and above: Assigned to other tokens as encountered
The len_legalese delimiter allows the matching engine to distinguish
between high-value (legalese) tokens and regular tokens.
Based on the Python ScanCode Toolkit implementation at: reference/scancode-toolkit/src/licensedcode/index.py
Implementations§
Source§impl TokenDictionary
impl TokenDictionary
Sourcepub fn new_with_legalese(legalese: &Archived<BTreeMap<String, u16>>) -> Self
pub fn new_with_legalese(legalese: &Archived<BTreeMap<String, u16>>) -> Self
Create a new token dictionary initialized with legalese tokens.
This follows the Python ScanCode Toolkit pattern where the dictionary starts with pre-defined legalese words that get low IDs (high value).
§Arguments
legalese - Archived BTreeMap of word → u16 pairs for legalese words. Values are bare u16 (not TokenId) because the rkyv artifact is built by build.rs, which cannot depend on this crate’s types.
§Returns
A new TokenDictionary instance with legalese tokens pre-populated
Sourcepub fn new_with_legalese_pairs(legalese_entries: &[(&str, u16)]) -> Self
pub fn new_with_legalese_pairs(legalese_entries: &[(&str, u16)]) -> Self
Create a new token dictionary initialized with legalese token pairs.
Convenience constructor for tests that don’t use the rkyv-archived legalese data.
pub fn intern(&mut self, token: &str) -> KnownToken
pub fn lookup(&self, token: &str) -> Option<KnownToken>
pub fn classify_query_token(&self, token: &str) -> QueryToken
pub fn token_kind(&self, token_id: TokenId) -> TokenKind
pub fn is_digit_only_token(&self, token_id: TokenId) -> bool
Sourcepub fn get_token_id(&self, token: &str) -> Option<TokenId>
pub fn get_token_id(&self, token: &str) -> Option<TokenId>
Sourcepub fn get(&self, token: &str) -> Option<TokenId>
pub fn get(&self, token: &str) -> Option<TokenId>
Get the token ID (alias for backward compatibility).
Sourcepub const fn legalese_count(&self) -> usize
pub const fn legalese_count(&self) -> usize
Get the number of legalese tokens.
Sourcepub fn tokens_to_ids_len(&self) -> usize
pub fn tokens_to_ids_len(&self) -> usize
Get the number of tokens in the dictionary.
Trait Implementations§
Source§impl Archive for TokenDictionary
impl Archive for TokenDictionary
Source§const COPY_OPTIMIZATION: CopyOptimization<Self>
const COPY_OPTIMIZATION: CopyOptimization<Self>
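An optimization flag that allows the bytes of this type to be copied directly to a writer instead of calling serialize. Read more
Source§type Archived = ArchivedTokenDictionary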
type Archived = ArchivedTokenDictionary
Source§type Resolver = TokenDictionaryResolver
type Resolver = TokenDictionaryResolver
Source§impl Clone for TokenDictionary
impl Clone for TokenDictionary
Source§fn clone(&self) -> TokenDictionary
fn clone(&self) -> TokenDictionary
1.0.0 · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more
Source§impl Debug for TokenDictionary
impl Debug for TokenDictionary
Source§impl Default for TokenDictionary
impl Default for TokenDictionary
Source§impl<__D: Fallible + ?Sized> Deserialize<TokenDictionary, __D> for Archived<TokenDictionary>where
HashMap<String, TokenId>: Archive,
<HashMap<String, TokenId> as Archive>::Archived: Deserialize<HashMap<String, TokenId>, __D>,
Vec<Option<TokenMetadata>>: Archive,
<Vec<Option<TokenMetadata>> as Archive>::Archived: Deserialize<Vec<Option<TokenMetadata>>, __D>,
usize: Archive,
<usize as Archive>::Archived: Deserialize<usize, __D>,
TokenId: Archive,
<TokenId as Archive>::Archived: Deserialize<TokenId, __D>,
impl<__D: Fallible + ?Sized> Deserialize<TokenDictionary, __D> for Archived<TokenDictionary>where
HashMap<String, TokenId>: Archive,
<HashMap<String, TokenId> as Archive>::Archived: Deserialize<HashMap<String, TokenId>, __D>,
Vec<Option<TokenMetadata>>: Archive,
<Vec<Option<TokenMetadata>> as Archive>::Archived: Deserialize<Vec<Option<TokenMetadata>>, __D>,
usize: Archive,
<usize as Archive>::Archived: Deserialize<usize, __D>,
TokenId: Archive,
<TokenId as Archive>::Archived: Deserialize<TokenId, __D>,
Source§fn deserialize(
&self,
deserializer: &mut __D,
) -> Result<TokenDictionary, <__D as Fallible>::Error>
fn deserialize( &self, deserializer: &mut __D, ) -> Result<TokenDictionary, <__D as Fallible>::Error>
Auto Trait Implementations§
impl Freeze for TokenDictionary
impl RefUnwindSafe for TokenDictionary
impl Send for TokenDictionary
impl Sync for TokenDictionary
impl Unpin for TokenDictionary
impl UnsafeUnpin for TokenDictionary
impl UnwindSafe for TokenDictionary
Blanket Implementations§
Source§impl<T> ArchivePointee for T
impl<T> ArchivePointee for T
Source§type ArchivedMetadata = ()
type ArchivedMetadata = ()
Source§fn pointer_metadata(
_: &<T as ArchivePointee>::ArchivedMetadata,
) -> <T as Pointee>::Metadata
fn pointer_metadata( _: &<T as ArchivePointee>::ArchivedMetadata, ) -> <T as Pointee>::Metadata
Source§impl<T> ArchiveUnsized for Twhere
T: Archive,
impl<T> ArchiveUnsized for Twhere
T: Archive,
Source§type Archived = <T as Archive>::Archived
type Archived = <T as Archive>::Archived
The archived counterpart of this type. Unlike Archive, it may be
unsized. Read moreSource§fn archived_metadata(
&self,
) -> <<T as ArchiveUnsized>::Archived as ArchivePointee>::ArchivedMetadata
fn archived_metadata( &self, ) -> <<T as ArchiveUnsized>::Archived as ArchivePointee>::ArchivedMetadata
Source§impl<T> BorrowMut<T> for Twhere
T: ?Sized,
impl<T> BorrowMut<T> for Twhere
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> CloneToUninit for Twhere
T: Clone,
impl<T> CloneToUninit for Twhere
T: Clone,
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read moreSource§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read moreSource§impl<T> LayoutRaw for T
impl<T> LayoutRaw for T
Source§fn layout_raw(_: <T as Pointee>::Metadata) -> Result<Layout, LayoutError>
fn layout_raw(_: <T as Pointee>::Metadata) -> Result<Layout, LayoutError>
Source§impl<T, N1, N2> Niching<NichedOption<T, N1>> for N2
impl<T, N1, N2> Niching<NichedOption<T, N1>> for N2
Source§unsafe fn is_niched(niched: *const NichedOption<T, N1>) -> bool
unsafe fn is_niched(niched: *const NichedOption<T, N1>) -> bool
Source§fn resolve_niched(out: Place<NichedOption<T, N1>>)
fn resolve_niched(out: Place<NichedOption<T, N1>>)
Writes data to out indicating that a T is niched.
Source§impl<T> Pointable for T
impl<T> Pointable for T
Source§impl<T, S> SerializeUnsized<S> for T
impl<T, S> SerializeUnsized<S> for T
Source§impl<U, T> ToOwnedObj<U> for Twhere
U: FromObjRef<T>,
impl<U, T> ToOwnedObj<U> for Twhere
U: FromObjRef<T>,
Source§fn to_owned_obj(&self, data: FontData<'_>) -> U
fn to_owned_obj(&self, data: FontData<'_>) -> U
Converts this type into T, using the provided data to resolve any offsets.