use std::cmp;
use std::fmt;
use std::path::Path;
use byteorder::{ReadBytesExt, LittleEndian};
use memmap::{Mmap, Protection};
use automaton::{Automaton, AlwaysMatch};
use error::Result;
use stream::{IntoStreamer, Streamer};
pub use self::build::Builder;
pub use self::error::Error;
pub use self::node::{Node, Transitions};
use self::node::node_new;
pub use self::ops::{
IndexedValue, OpBuilder,
Intersection, Union, Difference, SymmetricDifference,
};
mod build;
mod common_inputs;
mod counting_writer;
mod error;
mod node;
mod ops;
mod pack;
mod registry;
#[cfg(test)] mod tests;
pub const VERSION: u64 = 1;
pub const NONE_ADDRESS: CompiledAddr = 1;
pub type FstType = u64;
pub type CompiledAddr = usize;
pub struct Fst {
data: FstData,
root_addr: CompiledAddr,
ty: FstType,
len: usize,
}
impl Fst {
pub fn from_file_path<P: AsRef<Path>>(path: P) -> Result<Self> {
Fst::new(FstData::Mmap(try!(Mmap::open_path(path, Protection::Read))))
}
pub fn from_bytes(bytes: Vec<u8>) -> Result<Self> {
Fst::new(FstData::Owned(bytes))
}
}
impl Fst {
fn new(data: FstData) -> Result<Self> {
if data.as_slice().len() < 32 {
return Err(Error::Format.into());
}
let version = data.as_slice().read_u64::<LittleEndian>().unwrap();
if version != VERSION {
return Err(Error::Version {
expected: VERSION,
got: version,
}.into());
}
let ty = (&data.as_slice()[8..]).read_u64::<LittleEndian>().unwrap();
let root_addr = {
let mut last = &data.as_slice()[data.as_slice().len() - 8..];
last.read_u64::<LittleEndian>().unwrap()
};
let len = {
let mut last2 = &data.as_slice()[data.as_slice().len() - 16..];
last2.read_u64::<LittleEndian>().unwrap()
};
Ok(Fst {
data: data,
root_addr: u64_to_usize(root_addr),
ty: ty,
len: u64_to_usize(len),
})
}
pub fn stream(&self) -> Stream {
StreamBuilder::new(self, AlwaysMatch).into_stream()
}
pub fn range(&self) -> StreamBuilder {
StreamBuilder::new(self, AlwaysMatch)
}
pub fn find<B: AsRef<[u8]>>(&self, key: B) -> Option<Output> {
let mut node = self.root();
let mut out = Output::zero();
for &b in key.as_ref() {
node = match node.find_input(b) {
None => return None,
Some(i) => {
let t = node.transition(i);
out = out.cat(t.out);
self.node(t.addr)
}
}
}
if !node.is_final() {
None
} else {
Some(out.cat(node.final_output()))
}
}
pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<A> {
StreamBuilder::new(self, aut)
}
pub fn fst_type(&self) -> FstType {
self.ty
}
pub fn root(&self) -> Node {
self.node(self.root_addr)
}
pub fn node(&self, addr: CompiledAddr) -> Node {
node_new(addr, &self.data.as_slice())
}
pub fn as_slice(&self) -> &[u8] {
self.data.as_slice()
}
pub fn len(&self) -> usize {
self.len
}
pub fn is_empty(&self) -> bool {
self.len == 0
}
pub fn op(&self) -> OpBuilder {
OpBuilder::new().add(self)
}
pub fn is_disjoint<'f, I, S>(&self, stream: I) -> bool
where I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], Output)>,
S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], Output)> {
self.op().add(stream).intersection().next().is_none()
}
pub fn is_subset<'f, I, S>(&self, stream: I) -> bool
where I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], Output)>,
S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], Output)> {
let mut op = self.op().add(stream).intersection();
let mut count = 0;
while let Some(_) = op.next() {
count += 1;
}
count == self.len()
}
pub fn is_superset<'f, I, S>(&self, stream: I) -> bool
where I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], Output)>,
S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], Output)> {
let mut op = self.op().add(stream).union();
let mut count = 0;
while let Some(_) = op.next() {
count += 1;
}
count == self.len()
}
fn empty_final_output(&self) -> Option<Output> {
let root = self.root();
if root.is_final() {
Some(root.final_output())
} else {
None
}
}
}
impl<'a, 'f> IntoStreamer<'a> for &'f Fst {
type Item = (&'a [u8], Output);
type Into = Stream<'f>;
fn into_stream(self) -> Self::Into {
StreamBuilder::new(self, AlwaysMatch).into_stream()
}
}
pub struct StreamBuilder<'a, A=AlwaysMatch> {
fst: &'a Fst,
aut: A,
min: Bound,
max: Bound,
}
impl<'f, A: Automaton> StreamBuilder<'f, A> {
fn new(fst: &'f Fst, aut: A) -> Self {
StreamBuilder {
fst: fst,
aut: aut,
min: Bound::Unbounded,
max: Bound::Unbounded,
}
}
pub fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
self.min = Bound::Included(bound.as_ref().to_owned());
self
}
pub fn gt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
self.min = Bound::Excluded(bound.as_ref().to_owned());
self
}
pub fn le<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
self.max = Bound::Included(bound.as_ref().to_owned());
self
}
pub fn lt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
self.max = Bound::Excluded(bound.as_ref().to_owned());
self
}
}
impl<'a, 'f, A: Automaton> IntoStreamer<'a> for StreamBuilder<'f, A> {
type Item = (&'a [u8], Output);
type Into = Stream<'f, A>;
fn into_stream(self) -> Stream<'f, A> {
Stream::new(self.fst, self.aut, self.min, self.max)
}
}
#[derive(Debug)]
enum Bound {
Included(Vec<u8>),
Excluded(Vec<u8>),
Unbounded,
}
impl Bound {
fn exceeded_by(&self, inp: &[u8]) -> bool {
match *self {
Bound::Included(ref v) => inp > v,
Bound::Excluded(ref v) => inp >= v,
Bound::Unbounded => false,
}
}
}
pub struct Stream<'f, A=AlwaysMatch> where A: Automaton {
fst: &'f Fst,
aut: A,
inp: Vec<u8>,
empty_output: Option<Output>,
stack: Vec<StreamState<A::State>>,
end_at: Bound,
}
#[derive(Clone, Debug)]
struct StreamState<S> {
addr: CompiledAddr,
trans: usize,
out: Output,
aut_state: S,
}
impl<'f, A: Automaton> Stream<'f, A> {
fn new(fst: &'f Fst, aut: A, min: Bound, max: Bound) -> Self {
let mut rdr = Stream {
fst: fst,
aut: aut,
inp: Vec::with_capacity(16),
empty_output: None,
stack: vec![],
end_at: max,
};
rdr.seek_min(min);
rdr
}
fn seek_min(&mut self, min: Bound) {
let (key, inclusive) = match min {
Bound::Excluded(ref min) if min.is_empty() => {
self.stack = vec![StreamState {
addr: self.fst.root_addr,
trans: 0,
out: Output::zero(),
aut_state: self.aut.start(),
}];
return;
}
Bound::Excluded(ref min) => {
(min, false)
}
Bound::Included(ref min) if !min.is_empty() => {
(min, true)
}
_ => {
self.empty_output = self.fst.empty_final_output();
self.stack = vec![StreamState {
addr: self.fst.root_addr,
trans: 0,
out: Output::zero(),
aut_state: self.aut.start(),
}];
return;
}
};
let mut node = self.fst.root();
let mut out = Output::zero();
let mut aut_state = self.aut.start();
for &b in key {
match node.find_input(b) {
Some(i) => {
let t = node.transition(i);
let next_state = self.aut.accept(&aut_state, b);
aut_state = self.aut.accept(&aut_state, b);
self.stack.push(StreamState {
addr: node.addr(),
trans: i+1,
out: out,
aut_state: next_state,
});
out = out.cat(t.out);
self.inp.push(b);
node = self.fst.node(t.addr);
}
None => {
self.stack.push(StreamState {
addr: node.addr(),
trans: node.transitions()
.position(|t| t.inp > b)
.unwrap_or(node.len()),
out: out,
aut_state: aut_state,
});
return;
}
}
}
if !self.stack.is_empty() {
let last = self.stack.len() - 1;
if inclusive {
self.stack[last].trans -= 1;
self.inp.pop();
} else {
let addr = self.stack[last].addr;
let trans = self.stack[last].trans;
let node = self.fst.node(addr);
self.stack.push(StreamState {
addr: node.transition_addr(trans - 1),
trans: 0,
out: out,
aut_state: aut_state,
});
}
}
}
pub fn into_vec(mut self) -> Vec<(Vec<u8>, Output)> {
let mut vs = vec![];
while let Some((k, v)) = self.next() {
vs.push((k.to_vec(), v));
}
vs
}
pub fn into_keys(mut self) -> Vec<Vec<u8>> {
let mut vs = vec![];
while let Some((k, _)) = self.next() {
vs.push(k.to_vec());
}
vs
}
pub fn into_values(mut self) -> Vec<Output> {
let mut vs = vec![];
while let Some((_, v)) = self.next() {
vs.push(v);
}
vs
}
}
impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> {
type Item = (&'a [u8], Output);
fn next(&'a mut self) -> Option<Self::Item> {
if let Some(out) = self.empty_output.take() {
if self.end_at.exceeded_by(&[]) {
self.stack.clear();
return None;
}
if self.aut.is_match(&self.aut.start()) {
return Some((&[], out));
}
}
while let Some(state) = self.stack.pop() {
let node = self.fst.node(state.addr);
if state.trans >= node.len()
|| !self.aut.can_match(&state.aut_state) {
if node.addr() != self.fst.root_addr {
self.inp.pop().unwrap();
}
continue;
}
let trans = node.transition(state.trans);
let out = state.out.cat(trans.out);
let next_state = self.aut.accept(&state.aut_state, trans.inp);
let is_match = self.aut.is_match(&next_state);
self.inp.push(trans.inp);
self.stack.push(StreamState {
addr: state.addr,
trans: state.trans + 1,
out: state.out,
aut_state: state.aut_state,
});
self.stack.push(StreamState {
addr: trans.addr,
trans: 0,
out: out,
aut_state: next_state,
});
if self.end_at.exceeded_by(&self.inp) {
self.stack.clear();
return None;
}
let next_node = self.fst.node(trans.addr);
if next_node.is_final() && is_match {
return Some((&self.inp, out.cat(next_node.final_output())));
}
}
None
}
}
#[derive(Copy, Clone, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)]
pub struct Output(u64);
impl Output {
pub fn new(v: u64) -> Output {
Output(v)
}
pub fn zero() -> Output {
Output(0)
}
pub fn value(self) -> u64 {
self.0
}
fn encode(self) -> u64 {
self.0
}
fn decode(v: u64) -> Output {
Output(v)
}
pub fn is_zero(self) -> bool {
self.0 == 0
}
pub fn prefix(self, o: Output) -> Output {
Output(cmp::min(self.0, o.0))
}
pub fn cat(self, o: Output) -> Output {
Output(self.0 + o.0)
}
pub fn sub(self, o: Output) -> Output {
Output(self.0.checked_sub(o.0)
.expect("BUG: underflow subtraction not allowed"))
}
}
enum FstData {
Owned(Vec<u8>),
Mmap(Mmap),
}
impl FstData {
fn as_slice(&self) -> &[u8] {
match *self {
FstData::Owned(ref v) => &**v,
FstData::Mmap(ref v) => unsafe { v.as_slice() }
}
}
}
#[derive(Copy, Clone, Hash, Eq, PartialEq)]
pub struct Transition {
pub inp: u8,
pub out: Output,
pub addr: CompiledAddr,
}
impl Default for Transition {
fn default() -> Self {
Transition {
inp: 0,
out: Output::zero(),
addr: NONE_ADDRESS,
}
}
}
impl fmt::Debug for Transition {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
if self.out.is_zero() {
write!(f, "{} -> {}", self.inp as char, self.addr)
} else {
write!(f, "({}, {}) -> {}",
self.inp as char, self.out.value(), self.addr)
}
}
}
#[inline]
#[cfg(target_pointer_width = "64")]
fn u64_to_usize(n: u64) -> usize {
n as usize
}
#[inline]
#[cfg(not(target_pointer_width = "64"))]
fn u64_to_usize(n: u64) -> usize {
if n > ::std::usize::MAX as u64 {
panic!("\
Cannot convert node address {} to a pointer sized variable. If this FST
is very large and was generated on a system with a larger pointer size
than this system, then it is not possible to read this FST on this
system.", n);
}
n as usize
}