use std::cmp;
use std::fmt;
use crate::automaton::{AlwaysMatch, Automaton};
use crate::bytes;
use crate::error::Result;
use crate::stream::{IntoStreamer, Streamer};
pub use crate::raw::build::Builder;
pub use crate::raw::error::Error;
pub use crate::raw::node::{Node, Transitions};
pub use crate::raw::ops::{
Difference, IndexedValue, Intersection, OpBuilder, SymmetricDifference,
Union,
};
mod build;
mod common_inputs;
mod counting_writer;
mod crc32;
mod crc32_table;
mod error;
mod node;
mod ops;
mod registry;
mod registry_minimal;
#[cfg(test)]
mod tests;
pub const VERSION: u64 = 3;
const EMPTY_ADDRESS: CompiledAddr = 0;
const NONE_ADDRESS: CompiledAddr = 1;
pub type FstType = u64;
pub type CompiledAddr = usize;
#[derive(Clone)]
pub struct Fst<D> {
meta: Meta,
data: D,
}
#[derive(Debug, Clone)]
struct Meta {
version: u64,
root_addr: CompiledAddr,
ty: FstType,
len: usize,
checksum: Option<u32>,
}
impl Fst<Vec<u8>> {
pub fn from_iter_set<K, I>(iter: I) -> Result<Fst<Vec<u8>>>
where
K: AsRef<[u8]>,
I: IntoIterator<Item = K>,
{
let mut builder = Builder::memory();
for k in iter {
builder.add(k)?;
}
Ok(builder.into_fst())
}
pub fn from_iter_map<K, I>(iter: I) -> Result<Fst<Vec<u8>>>
where
K: AsRef<[u8]>,
I: IntoIterator<Item = (K, u64)>,
{
let mut builder = Builder::memory();
for (k, v) in iter {
builder.insert(k, v)?;
}
Ok(builder.into_fst())
}
}
impl<D: AsRef<[u8]>> Fst<D> {
#[inline]
pub fn new(data: D) -> Result<Fst<D>> {
let bytes = data.as_ref();
if bytes.len() < 36 {
return Err(Error::Format { size: bytes.len() }.into());
}
let version = bytes::read_u64_le(&bytes);
if version == 0 || version > VERSION {
return Err(
Error::Version { expected: VERSION, got: version }.into()
);
}
let ty = bytes::read_u64_le(&bytes[8..]);
let (end, checksum) = if version <= 2 {
(bytes.len(), None)
} else {
let checksum = bytes::read_u32_le(&bytes[bytes.len() - 4..]);
(bytes.len() - 4, Some(checksum))
};
let root_addr = {
let last = &bytes[end - 8..];
u64_to_usize(bytes::read_u64_le(last))
};
let len = {
let last2 = &bytes[end - 16..];
u64_to_usize(bytes::read_u64_le(last2))
};
let (empty_total, addr_offset) =
if version <= 2 { (32, 17) } else { (36, 21) };
if (root_addr == EMPTY_ADDRESS && bytes.len() != empty_total)
&& root_addr + addr_offset != bytes.len()
{
return Err(Error::Format { size: bytes.len() }.into());
}
let meta = Meta { version, root_addr, ty, len, checksum };
Ok(Fst { meta, data })
}
#[inline]
pub fn get<B: AsRef<[u8]>>(&self, key: B) -> Option<Output> {
self.as_ref().get(key.as_ref())
}
#[inline]
pub fn contains_key<B: AsRef<[u8]>>(&self, key: B) -> bool {
self.as_ref().contains_key(key.as_ref())
}
#[inline]
pub fn get_key(&self, value: u64) -> Option<Vec<u8>> {
let mut key = vec![];
if self.get_key_into(value, &mut key) {
Some(key)
} else {
None
}
}
#[inline]
pub fn get_key_into(&self, value: u64, key: &mut Vec<u8>) -> bool {
self.as_ref().get_key_into(value, key)
}
#[inline]
pub fn stream(&self) -> Stream<'_> {
StreamBuilder::new(self.as_ref(), AlwaysMatch).into_stream()
}
#[inline]
pub fn range(&self) -> StreamBuilder<'_> {
StreamBuilder::new(self.as_ref(), AlwaysMatch)
}
#[inline]
pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<'_, A> {
StreamBuilder::new(self.as_ref(), aut)
}
#[inline]
pub fn search_with_state<A: Automaton>(
&self,
aut: A,
) -> StreamWithStateBuilder<'_, A> {
StreamWithStateBuilder::new(self.as_ref(), aut)
}
#[inline]
pub fn len(&self) -> usize {
self.as_ref().len()
}
#[inline]
pub fn is_empty(&self) -> bool {
self.as_ref().is_empty()
}
#[inline]
pub fn size(&self) -> usize {
self.as_ref().size()
}
#[inline]
pub fn verify(&self) -> Result<()> {
use crate::raw::crc32::CheckSummer;
let expected = match self.as_ref().meta.checksum {
None => return Err(Error::ChecksumMissing.into()),
Some(expected) => expected,
};
let mut summer = CheckSummer::new();
summer.update(&self.as_bytes()[..self.as_bytes().len() - 4]);
let got = summer.masked();
if expected == got {
return Ok(());
}
Err(Error::ChecksumMismatch { expected, got }.into())
}
#[inline]
pub fn op(&self) -> OpBuilder<'_> {
OpBuilder::new().add(self)
}
#[inline]
pub fn is_disjoint<'f, I, S>(&self, stream: I) -> bool
where
I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
{
self.op().add(stream).intersection().next().is_none()
}
#[inline]
pub fn is_subset<'f, I, S>(&self, stream: I) -> bool
where
I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
{
let mut op = self.op().add(stream).intersection();
let mut count = 0;
while let Some(_) = op.next() {
count += 1;
}
count == self.len()
}
#[inline]
pub fn is_superset<'f, I, S>(&self, stream: I) -> bool
where
I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
{
let mut op = self.op().add(stream).union();
let mut count = 0;
while let Some(_) = op.next() {
count += 1;
}
count == self.len()
}
#[inline]
pub fn fst_type(&self) -> FstType {
self.as_ref().fst_type()
}
#[inline]
pub fn root(&self) -> Node<'_> {
self.as_ref().root()
}
#[inline]
pub fn node(&self, addr: CompiledAddr) -> Node<'_> {
self.as_ref().node(addr)
}
#[inline]
pub fn to_vec(&self) -> Vec<u8> {
self.as_ref().to_vec()
}
#[inline]
pub fn as_bytes(&self) -> &[u8] {
self.as_ref().as_bytes()
}
#[inline]
fn as_ref(&self) -> FstRef {
FstRef { meta: &self.meta, data: self.data.as_ref() }
}
}
impl<D> Fst<D> {
#[inline]
pub fn into_inner(self) -> D {
self.data
}
#[inline]
pub fn map_data<F, T>(self, mut f: F) -> Result<Fst<T>>
where
F: FnMut(D) -> T,
T: AsRef<[u8]>,
{
Fst::new(f(self.into_inner()))
}
}
impl<'a, 'f, D: AsRef<[u8]>> IntoStreamer<'a> for &'f Fst<D> {
type Item = (&'a [u8], Output);
type Into = Stream<'f>;
#[inline]
fn into_stream(self) -> Stream<'f> {
StreamBuilder::new(self.as_ref(), AlwaysMatch).into_stream()
}
}
struct FstRef<'f> {
meta: &'f Meta,
data: &'f [u8],
}
impl<'f> FstRef<'f> {
#[inline]
fn get(&self, key: &[u8]) -> Option<Output> {
let mut node = self.root();
let mut out = Output::zero();
for &b in key {
node = match node.find_input(b) {
None => return None,
Some(i) => {
let t = node.transition(i);
out = out.cat(t.out);
self.node(t.addr)
}
}
}
if !node.is_final() {
None
} else {
Some(out.cat(node.final_output()))
}
}
#[inline]
fn contains_key(&self, key: &[u8]) -> bool {
let mut node = self.root();
for &b in key {
node = match node.find_input(b) {
None => return false,
Some(i) => self.node(node.transition_addr(i)),
}
}
node.is_final()
}
#[inline]
fn get_key_into(&self, mut value: u64, key: &mut Vec<u8>) -> bool {
let mut node = self.root();
while value != 0 || !node.is_final() {
let trans = node
.transitions()
.take_while(|t| t.out.value() <= value)
.last();
node = match trans {
None => return false,
Some(t) => {
value -= t.out.value();
key.push(t.inp);
self.node(t.addr)
}
};
}
true
}
#[inline]
fn len(&self) -> usize {
self.meta.len
}
#[inline]
fn is_empty(&self) -> bool {
self.meta.len == 0
}
#[inline]
fn size(&self) -> usize {
self.as_bytes().len()
}
#[inline]
fn fst_type(&self) -> FstType {
self.meta.ty
}
#[inline]
fn root_addr(&self) -> CompiledAddr {
self.meta.root_addr
}
#[inline]
fn root(&self) -> Node<'f> {
self.node(self.root_addr())
}
#[inline]
fn node(&self, addr: CompiledAddr) -> Node<'f> {
Node::new(self.meta.version, addr, self.as_bytes())
}
#[inline]
fn to_vec(&self) -> Vec<u8> {
self.as_bytes().to_vec()
}
#[inline]
fn as_bytes(&self) -> &'f [u8] {
self.data
}
#[inline]
fn empty_final_output(&self) -> Option<Output> {
let root = self.root();
if root.is_final() {
Some(root.final_output())
} else {
None
}
}
}
pub struct StreamBuilder<'f, A = AlwaysMatch> {
fst: FstRef<'f>,
aut: A,
min: Bound,
max: Bound,
}
impl<'f, A: Automaton> StreamBuilder<'f, A> {
fn new(fst: FstRef<'f>, aut: A) -> StreamBuilder<'f, A> {
StreamBuilder {
fst,
aut,
min: Bound::Unbounded,
max: Bound::Unbounded,
}
}
pub fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> StreamBuilder<'f, A> {
self.min = Bound::Included(bound.as_ref().to_owned());
self
}
pub fn gt<T: AsRef<[u8]>>(mut self, bound: T) -> StreamBuilder<'f, A> {
self.min = Bound::Excluded(bound.as_ref().to_owned());
self
}
pub fn le<T: AsRef<[u8]>>(mut self, bound: T) -> StreamBuilder<'f, A> {
self.max = Bound::Included(bound.as_ref().to_owned());
self
}
pub fn lt<T: AsRef<[u8]>>(mut self, bound: T) -> StreamBuilder<'f, A> {
self.max = Bound::Excluded(bound.as_ref().to_owned());
self
}
}
impl<'a, 'f, A: Automaton> IntoStreamer<'a> for StreamBuilder<'f, A> {
type Item = (&'a [u8], Output);
type Into = Stream<'f, A>;
fn into_stream(self) -> Stream<'f, A> {
Stream::new(self.fst, self.aut, self.min, self.max)
}
}
pub struct StreamWithStateBuilder<'f, A = AlwaysMatch> {
fst: FstRef<'f>,
aut: A,
min: Bound,
max: Bound,
}
impl<'f, A: Automaton> StreamWithStateBuilder<'f, A> {
fn new(fst: FstRef<'f>, aut: A) -> StreamWithStateBuilder<'f, A> {
StreamWithStateBuilder {
fst,
aut,
min: Bound::Unbounded,
max: Bound::Unbounded,
}
}
pub fn ge<T: AsRef<[u8]>>(
mut self,
bound: T,
) -> StreamWithStateBuilder<'f, A> {
self.min = Bound::Included(bound.as_ref().to_owned());
self
}
pub fn gt<T: AsRef<[u8]>>(
mut self,
bound: T,
) -> StreamWithStateBuilder<'f, A> {
self.min = Bound::Excluded(bound.as_ref().to_owned());
self
}
pub fn le<T: AsRef<[u8]>>(
mut self,
bound: T,
) -> StreamWithStateBuilder<'f, A> {
self.max = Bound::Included(bound.as_ref().to_owned());
self
}
pub fn lt<T: AsRef<[u8]>>(
mut self,
bound: T,
) -> StreamWithStateBuilder<'f, A> {
self.max = Bound::Excluded(bound.as_ref().to_owned());
self
}
}
impl<'a, 'f, A: 'a + Automaton> IntoStreamer<'a>
for StreamWithStateBuilder<'f, A>
where
A::State: Clone,
{
type Item = (&'a [u8], Output, A::State);
type Into = StreamWithState<'f, A>;
fn into_stream(self) -> StreamWithState<'f, A> {
StreamWithState::new(self.fst, self.aut, self.min, self.max)
}
}
#[derive(Debug)]
enum Bound {
Included(Vec<u8>),
Excluded(Vec<u8>),
Unbounded,
}
impl Bound {
#[inline]
fn exceeded_by(&self, inp: &[u8]) -> bool {
match *self {
Bound::Included(ref v) => inp > v,
Bound::Excluded(ref v) => inp >= v,
Bound::Unbounded => false,
}
}
#[inline]
fn is_empty(&self) -> bool {
match *self {
Bound::Included(ref v) => v.is_empty(),
Bound::Excluded(ref v) => v.is_empty(),
Bound::Unbounded => true,
}
}
#[inline]
fn is_inclusive(&self) -> bool {
match *self {
Bound::Excluded(_) => false,
_ => true,
}
}
}
pub struct Stream<'f, A: Automaton = AlwaysMatch>(StreamWithState<'f, A>);
impl<'f, A: Automaton> Stream<'f, A> {
fn new(fst: FstRef<'f>, aut: A, min: Bound, max: Bound) -> Stream<'f, A> {
Stream(StreamWithState::new(fst, aut, min, max))
}
pub fn into_byte_vec(mut self) -> Vec<(Vec<u8>, u64)> {
let mut vs = vec![];
while let Some((k, v)) = self.next() {
vs.push((k.to_vec(), v.value()));
}
vs
}
pub fn into_str_vec(mut self) -> Result<Vec<(String, u64)>> {
let mut vs = vec![];
while let Some((k, v)) = self.next() {
let k = String::from_utf8(k.to_vec()).map_err(Error::from)?;
vs.push((k, v.value()));
}
Ok(vs)
}
pub fn into_byte_keys(mut self) -> Vec<Vec<u8>> {
let mut vs = vec![];
while let Some((k, _)) = self.next() {
vs.push(k.to_vec());
}
vs
}
pub fn into_str_keys(mut self) -> Result<Vec<String>> {
let mut vs = vec![];
while let Some((k, _)) = self.next() {
let k = String::from_utf8(k.to_vec()).map_err(Error::from)?;
vs.push(k);
}
Ok(vs)
}
pub fn into_values(mut self) -> Vec<u64> {
let mut vs = vec![];
while let Some((_, v)) = self.next() {
vs.push(v.value());
}
vs
}
}
impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> {
type Item = (&'a [u8], Output);
fn next(&'a mut self) -> Option<(&'a [u8], Output)> {
self.0.next_with(|_| ()).map(|(key, out, _)| (key, out))
}
}
pub struct StreamWithState<'f, A = AlwaysMatch>
where
A: Automaton,
{
fst: FstRef<'f>,
aut: A,
inp: Vec<u8>,
empty_output: Option<Output>,
stack: Vec<StreamState<'f, A::State>>,
end_at: Bound,
}
#[derive(Clone, Debug)]
struct StreamState<'f, S> {
node: Node<'f>,
trans: usize,
out: Output,
aut_state: S,
}
impl<'f, A: Automaton> StreamWithState<'f, A> {
fn new(
fst: FstRef<'f>,
aut: A,
min: Bound,
max: Bound,
) -> StreamWithState<'f, A> {
let mut rdr = StreamWithState {
fst,
aut,
inp: Vec::with_capacity(16),
empty_output: None,
stack: vec![],
end_at: max,
};
rdr.seek_min(min);
rdr
}
fn seek_min(&mut self, min: Bound) {
if min.is_empty() {
if min.is_inclusive() {
self.empty_output = self.fst.empty_final_output();
}
self.stack = vec![StreamState {
node: self.fst.root(),
trans: 0,
out: Output::zero(),
aut_state: self.aut.start(),
}];
return;
}
let (key, inclusive) = match min {
Bound::Excluded(ref min) => (min, false),
Bound::Included(ref min) => (min, true),
Bound::Unbounded => unreachable!(),
};
let mut node = self.fst.root();
let mut out = Output::zero();
let mut aut_state = self.aut.start();
for &b in key {
match node.find_input(b) {
Some(i) => {
let t = node.transition(i);
let prev_state = aut_state;
aut_state = self.aut.accept(&prev_state, b);
self.inp.push(b);
self.stack.push(StreamState {
node,
trans: i + 1,
out,
aut_state: prev_state,
});
out = out.cat(t.out);
node = self.fst.node(t.addr);
}
None => {
self.stack.push(StreamState {
node,
trans: node
.transitions()
.position(|t| t.inp > b)
.unwrap_or(node.len()),
out,
aut_state,
});
return;
}
}
}
if !self.stack.is_empty() {
let last = self.stack.len() - 1;
if inclusive {
self.stack[last].trans -= 1;
self.inp.pop();
} else {
let node = self.stack[last].node;
let trans = self.stack[last].trans;
self.stack.push(StreamState {
node: self.fst.node(node.transition(trans - 1).addr),
trans: 0,
out,
aut_state,
});
}
}
}
fn next_with<T>(
&mut self,
mut map: impl FnMut(&A::State) -> T,
) -> Option<(&[u8], Output, T)> {
if let Some(out) = self.empty_output.take() {
if self.end_at.exceeded_by(&[]) {
self.stack.clear();
return None;
}
let start = self.aut.start();
if self.aut.is_match(&start) {
return Some((&[], out, map(&start)));
}
}
while let Some(state) = self.stack.pop() {
if state.trans >= state.node.len()
|| !self.aut.can_match(&state.aut_state)
{
if state.node.addr() != self.fst.root_addr() {
self.inp.pop().unwrap();
}
continue;
}
let trans = state.node.transition(state.trans);
let out = state.out.cat(trans.out);
let next_state = self.aut.accept(&state.aut_state, trans.inp);
let t = map(&next_state);
let mut is_match = self.aut.is_match(&next_state);
let next_node = self.fst.node(trans.addr);
self.inp.push(trans.inp);
if next_node.is_final() {
if let Some(eof_state) = self.aut.accept_eof(&next_state) {
is_match = self.aut.is_match(&eof_state);
}
}
self.stack.push(StreamState { trans: state.trans + 1, ..state });
self.stack.push(StreamState {
node: next_node,
trans: 0,
out,
aut_state: next_state,
});
if self.end_at.exceeded_by(&self.inp) {
self.stack.clear();
return None;
}
if next_node.is_final() && is_match {
return Some((
&self.inp,
out.cat(next_node.final_output()),
t,
));
}
}
None
}
}
impl<'a, 'f, A: 'a + Automaton> Streamer<'a> for StreamWithState<'f, A>
where
A::State: Clone,
{
type Item = (&'a [u8], Output, A::State);
fn next(&'a mut self) -> Option<(&'a [u8], Output, A::State)> {
self.next_with(|state| state.clone())
}
}
#[derive(Copy, Clone, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)]
pub struct Output(u64);
impl Output {
#[inline]
pub fn new(v: u64) -> Output {
Output(v)
}
#[inline]
pub fn zero() -> Output {
Output(0)
}
#[inline]
pub fn value(self) -> u64 {
self.0
}
#[inline]
pub fn is_zero(self) -> bool {
self.0 == 0
}
#[inline]
pub fn prefix(self, o: Output) -> Output {
Output(cmp::min(self.0, o.0))
}
#[inline]
pub fn cat(self, o: Output) -> Output {
Output(self.0 + o.0)
}
#[inline]
pub fn sub(self, o: Output) -> Output {
Output(
self.0
.checked_sub(o.0)
.expect("BUG: underflow subtraction not allowed"),
)
}
}
#[derive(Copy, Clone, Hash, Eq, PartialEq)]
pub struct Transition {
pub inp: u8,
pub out: Output,
pub addr: CompiledAddr,
}
impl Default for Transition {
#[inline]
fn default() -> Transition {
Transition { inp: 0, out: Output::zero(), addr: NONE_ADDRESS }
}
}
impl fmt::Debug for Transition {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.out.is_zero() {
write!(f, "{} -> {}", self.inp as char, self.addr)
} else {
write!(
f,
"({}, {}) -> {}",
self.inp as char,
self.out.value(),
self.addr
)
}
}
}
#[inline]
#[cfg(target_pointer_width = "64")]
fn u64_to_usize(n: u64) -> usize {
n as usize
}
#[inline]
#[cfg(not(target_pointer_width = "64"))]
fn u64_to_usize(n: u64) -> usize {
if n > std::usize::MAX as u64 {
panic!(
"\
Cannot convert node address {} to a pointer sized variable. If this FST
is very large and was generated on a system with a larger pointer size
than this system, then it is not possible to read this FST on this
system.",
n
);
}
n as usize
}