divvunspell 1.0.0-beta.3

Spell checking library for ZHFST/BHFST spellers, with case handling and tokenization support.
Documentation
use std::ptr;

use memmap2::Mmap;

use crate::transducer::TransducerError;
use crate::types::{SymbolNumber, TransitionTableIndex, Weight};
use crate::vfs::{self, Filesystem};

#[derive(Debug)]
pub struct MemmapIndexTable<F> {
    buf: Mmap,
    pub(crate) size: u32,
    _file: std::marker::PhantomData<F>,
}

const INDEX_TABLE_SIZE: usize = 8;

impl<F: vfs::File> MemmapIndexTable<F> {
    pub fn from_path_partial<P, FS>(
        fs: &FS,
        path: P,
        chunk: u64,
        total: u64,
    ) -> Result<Self, TransducerError>
    where
        P: AsRef<std::path::Path>,
        FS: Filesystem<File = F>,
    {
        let file = fs.open_file(path).map_err(TransducerError::Io)?;
        let len = file.len().map_err(TransducerError::Io)? / total;
        let buf = unsafe {
            file.partial_memory_map(chunk * len, len as usize)
                .map_err(TransducerError::Memmap)?
        };
        let size = (buf.len() / INDEX_TABLE_SIZE) as u32;
        Ok(MemmapIndexTable {
            buf,
            size,
            _file: std::marker::PhantomData::<F>,
        })
    }
}

impl<F: vfs::File> crate::transducer::IndexTable<F> for MemmapIndexTable<F> {
    fn from_path<P, FS>(fs: &FS, path: P) -> Result<Self, TransducerError>
    where
        P: AsRef<std::path::Path>,
        FS: Filesystem<File = F>,
    {
        let file = fs.open_file(path).map_err(TransducerError::Io)?;
        let buf = unsafe { file.memory_map().map_err(TransducerError::Memmap)? };
        let size = (buf.len() / INDEX_TABLE_SIZE) as u32;
        Ok(MemmapIndexTable {
            buf,
            size,
            _file: std::marker::PhantomData::<F>,
        })
    }

    fn input_symbol(&self, i: TransitionTableIndex) -> Option<SymbolNumber> {
        if i >= self.size {
            return None;
        }

        let index = INDEX_TABLE_SIZE * i as usize;

        let input_symbol: SymbolNumber =
            unsafe { ptr::read(self.buf.as_ptr().add(index) as *const _) };

        if input_symbol == std::u16::MAX {
            None
        } else {
            Some(input_symbol)
        }
    }

    fn target(&self, i: TransitionTableIndex) -> Option<TransitionTableIndex> {
        if i >= self.size {
            return None;
        }

        let index = (INDEX_TABLE_SIZE * i as usize) + 4;
        let target: TransitionTableIndex =
            unsafe { ptr::read(self.buf.as_ptr().add(index) as *const _) };

        if target == std::u32::MAX {
            None
        } else {
            Some(target)
        }
    }

    fn final_weight(&self, i: TransitionTableIndex) -> Option<Weight> {
        if i >= self.size {
            return None;
        }

        let index = (INDEX_TABLE_SIZE * i as usize) + 4;
        let weight: Weight = unsafe { ptr::read(self.buf.as_ptr().add(index) as *const _) };

        Some(weight)
    }
}

#[cfg(unix)]
mod unix {
    use super::*;

    use crate::transducer::IndexTable;
    use crate::transducer::TransducerError;
    use crate::types::{SymbolNumber, TransitionTableIndex, Weight};
    use crate::vfs::{self, Filesystem};

    pub struct FileIndexTable<F: vfs::File> {
        file: F,
        size: u32,
    }

    impl<F: vfs::File> FileIndexTable<F> {
        #[inline(always)]
        fn read_u16_at(&self, index: u64) -> u16 {
            let mut buf = [0u8; 2];
            self.file
                .read_exact_at(&mut buf, index)
                .expect("failed to read u16");
            u16::from_le_bytes(buf)
        }

        #[inline(always)]
        fn read_u32_at(&self, index: u64) -> u32 {
            let mut buf = [0u8; 4];
            self.file
                .read_exact_at(&mut buf, index)
                .expect("failed to read u32");
            u32::from_le_bytes(buf)
        }
    }

    impl<F: vfs::File> IndexTable<F> for FileIndexTable<F> {
        fn from_path<P, FS>(fs: &FS, path: P) -> Result<Self, TransducerError>
        where
            P: AsRef<std::path::Path>,
            FS: Filesystem<File = F>,
        {
            let file = fs.open_file(path).map_err(TransducerError::Io)?;
            Ok(FileIndexTable {
                size: file.len().map_err(TransducerError::Io)? as u32,
                file,
            })
        }

        fn input_symbol(&self, i: TransitionTableIndex) -> Option<SymbolNumber> {
            if i >= self.size {
                return None;
            }

            let index = INDEX_TABLE_SIZE * i as usize;

            let input_symbol: SymbolNumber = self.read_u16_at(index as u64);

            if input_symbol == std::u16::MAX {
                None
            } else {
                Some(input_symbol)
            }
        }

        fn target(&self, i: TransitionTableIndex) -> Option<TransitionTableIndex> {
            if i >= self.size {
                return None;
            }

            let index = (INDEX_TABLE_SIZE * i as usize) + 4;
            let target: TransitionTableIndex = self.read_u32_at(index as u64);

            if target == std::u32::MAX {
                None
            } else {
                Some(target)
            }
        }

        fn final_weight(&self, i: TransitionTableIndex) -> Option<Weight> {
            if i >= self.size {
                return None;
            }

            let index = (INDEX_TABLE_SIZE * i as usize) + 4;
            let x = self.read_u32_at(index as u64);
            let weight: Weight = f32::from_bits(x);

            Some(weight)
        }
    }
}

#[cfg(unix)]
pub use self::unix::FileIndexTable;