//! Crate root: declares the public modules and re-exports their main items.
//!
//! The crate builds as `no_std` (linking `alloc`) unless the `std` feature is
//! enabled. Optional parts are compiled in per Cargo feature:
//!
//! - `minhash`, `simhash`, `lsh`, `tlsh`: any one of them enables the
//!   `classical` module; each also re-exports its own items at the crate root.
//! - `semantic`: enables the `semantic` module and its re-exports.
//! - `markup`: enables the `markup` module and its re-exports.
//! - `pdf`: enables the `pdf` module and its re-exports.
//! - `cjk`: re-exports `CjkSegmenter` and `CjkTokenizer` from `tokenize`.
//!
//! The `canonical` and `tokenize` modules are always available.

// Without the `std` feature the crate is compiled as `no_std`.
#![cfg_attr(not(feature = "std"), no_std)]
// Under `--cfg docsrs`, turn on the unstable `doc_cfg` feature that the
// `doc(cfg(...))` annotations in this file rely on.
#![cfg_attr(docsrs, feature(doc_cfg))]
// Undocumented public items and broken intra-doc links are hard errors.
#![deny(missing_docs)]
#![deny(rustdoc::broken_intra_doc_links)]
#![warn(rust_2018_idioms)]
// `alloc` is linked unconditionally; `std` only when the feature is enabled.
extern crate alloc;
#[cfg(feature = "std")]
extern crate std;
// Always-compiled public modules (no feature gate).
pub mod canonical;
pub mod tokenize;
// Compiled when at least one of the `minhash`, `simhash`, `lsh` or `tlsh`
// features is enabled. The `doc(cfg(...))` annotation mirrors the gate so the
// requirement is shown in generated docs, like the other gated items here.
#[cfg(any(
    feature = "minhash",
    feature = "simhash",
    feature = "lsh",
    feature = "tlsh"
))]
#[cfg_attr(
    docsrs,
    doc(cfg(any(
        feature = "minhash",
        feature = "simhash",
        feature = "lsh",
        feature = "tlsh"
    )))
)]
pub mod classical;
// Only compiled with the `semantic` feature; flagged as such in docs.rs output.
#[cfg(feature = "semantic")]
#[cfg_attr(docsrs, doc(cfg(feature = "semantic")))]
pub mod semantic;
// Only compiled with the `markup` feature.
#[cfg(feature = "markup")]
#[cfg_attr(docsrs, doc(cfg(feature = "markup")))]
pub mod markup;
// Crate-root shortcuts for the `markup` entry points, under the same gate.
#[cfg(feature = "markup")]
#[cfg_attr(docsrs, doc(cfg(feature = "markup")))]
pub use markup::{MarkdownOptions, html_to_text, markdown_to_text, markdown_to_text_with};
// Only compiled with the `pdf` feature.
#[cfg(feature = "pdf")]
#[cfg_attr(docsrs, doc(cfg(feature = "pdf")))]
pub mod pdf;
// Crate-root shortcuts for the `pdf` entry points, under the same gate.
#[cfg(feature = "pdf")]
#[cfg_attr(docsrs, doc(cfg(feature = "pdf")))]
pub use pdf::{PdfOptions, pdf_to_text, pdf_to_text_with};
// Private modules: their items are reachable from outside the crate only
// through the `pub use` re-exports in this file.
mod error;
mod fingerprint;
// Ungated re-exports: available in every feature configuration.
pub use error::{Error, Result};
pub use fingerprint::{FingerprintMetadata, UNCOMPUTED_CONFIG_HASH, algo, config_hash};
// Re-exported when at least one of `minhash`, `simhash`, `tlsh` or `semantic`
// is enabled. The `doc(cfg(...))` annotation mirrors the gate so the
// requirement is shown in generated docs, like the other gated items here.
//
// NOTE(review): this gate differs from the one on `classical` (no `lsh`, adds
// `semantic`). Presumably it matches the feature set under which
// `fingerprint::Fingerprint` itself is defined; confirm against that module.
#[cfg(any(
    feature = "minhash",
    feature = "simhash",
    feature = "tlsh",
    feature = "semantic"
))]
#[cfg_attr(
    docsrs,
    doc(cfg(any(
        feature = "minhash",
        feature = "simhash",
        feature = "tlsh",
        feature = "semantic"
    )))
)]
pub use fingerprint::Fingerprint;
// Re-exported only with the `tlsh` feature.
#[cfg(feature = "tlsh")]
#[cfg_attr(docsrs, doc(cfg(feature = "tlsh")))]
pub use fingerprint::TlshFingerprint;
// Ungated crate-root re-exports from the always-compiled modules.
pub use canonical::{Canonicalizer, CanonicalizerBuilder, CaseFold, Normalization, canonicalize};
pub use tokenize::{GraphemeTokenizer, ShingleTokenizer, Tokenizer, WordTokenizer};
// Additional `tokenize` items that exist only with the `cjk` feature.
#[cfg(feature = "cjk")]
#[cfg_attr(docsrs, doc(cfg(feature = "cjk")))]
pub use tokenize::{CjkSegmenter, CjkTokenizer};
// Re-exported under the same gate as the `classical` module itself: at least
// one of `minhash`, `simhash`, `lsh` or `tlsh`. The `doc(cfg(...))` annotation
// mirrors the gate so the requirement is shown in generated docs, like the
// other gated items here.
#[cfg(any(
    feature = "minhash",
    feature = "simhash",
    feature = "lsh",
    feature = "tlsh"
))]
#[cfg_attr(
    docsrs,
    doc(cfg(any(
        feature = "minhash",
        feature = "simhash",
        feature = "lsh",
        feature = "tlsh"
    )))
)]
pub use classical::{Fingerprinter, StreamingFingerprinter};
// Crate-root re-exports of `classical::minhash`; require the `minhash` feature.
#[cfg(feature = "minhash")]
#[cfg_attr(docsrs, doc(cfg(feature = "minhash")))]
pub use classical::minhash::{
HashFamily, MinHashFingerprinter, MinHashFingerprinterBuilder, MinHashSig, MinHashStreaming,
jaccard,
};
// Crate-root re-exports of `classical::simhash`; require the `simhash` feature.
#[cfg(feature = "simhash")]
#[cfg_attr(docsrs, doc(cfg(feature = "simhash")))]
pub use classical::simhash::{
IdfTable, SimHash64, SimHashFingerprinter, SimHashFingerprinterBuilder, SimHashStreaming,
Weighting, cosine_estimate, hamming,
};
// Crate-root re-exports of `classical::lsh`; require the `lsh` feature.
#[cfg(feature = "lsh")]
#[cfg_attr(docsrs, doc(cfg(feature = "lsh")))]
pub use classical::lsh::{LshIndex, LshIndexBuilder};
// Crate-root re-exports of `classical::tlsh`; require the `tlsh` feature.
// `MIN_INPUT_BYTES` is renamed to `TLSH_MIN_INPUT_BYTES` at the crate root.
#[cfg(feature = "tlsh")]
#[cfg_attr(docsrs, doc(cfg(feature = "tlsh")))]
pub use classical::tlsh::{
MIN_INPUT_BYTES as TLSH_MIN_INPUT_BYTES, TlshFingerprinter, tlsh_distance,
};
// Crate-root re-exports of the `semantic` module; require the `semantic` feature.
#[cfg(feature = "semantic")]
#[cfg_attr(docsrs, doc(cfg(feature = "semantic")))]
pub use semantic::{
ChunkMode, ChunkingStrategy, Embedding, EmbeddingProvider, LocalProvider, LocalProviderBuilder,
Pooling, chunk_for_model, semantic_similarity,
};
/// Version string of this crate, captured from the `CARGO_PKG_VERSION`
/// environment variable at compile time.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
/// Format version number exposed by this crate; currently `1`.
// NOTE(review): presumably versions the layout of the fingerprints this crate
// produces — confirm what it covers and document when it must be bumped.
pub const FORMAT_VERSION: u32 = 1;