1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
//! Classical (non-neural) fingerprinters.
//!
//! Two traits cover the two modes in which `txtfp` produces fingerprints:
//!
//! - [`Fingerprinter`] — feed a whole `&str` and get its full output.
//!   Suited to enrolment / batch jobs.
//! - [`StreamingFingerprinter`] — push byte chunks as they arrive and
//!   consolidate at the end. Suited to large-file pipelines and
//!   memory-bounded streaming.
//!
//! Concrete implementations live in feature-gated submodules:
//! [`minhash`] (`minhash` feature), [`simhash`] (`simhash` feature),
//! [`lsh`] (`lsh` feature).
pub use HashFamily;
use crateResult;
/// Offline fingerprinter — consumes a whole document and emits one
/// fingerprint.
///
/// Implementations are immutable in their public surface:
/// [`Fingerprinter::fingerprint`] takes `&self` so a single
/// fingerprinter can be shared across worker threads. Internal scratch
/// buffers, if any, must be allocated per call.
///
/// # Example: implementing `Fingerprinter` for a custom kernel
///
/// ```
/// use txtfp::{Canonicalizer, Fingerprinter, Result};
///
/// struct LengthHash {
///     canonicalizer: Canonicalizer,
/// }
///
/// impl Fingerprinter for LengthHash {
///     type Output = u64;
///     fn fingerprint(&self, input: &str) -> Result<u64> {
///         Ok(self.canonicalizer.canonicalize(input).len() as u64)
///     }
/// }
/// ```
/// Streaming fingerprinter — accumulates bytes across calls, emits one
/// fingerprint at end-of-stream.
///
/// Streaming variants for `txtfp`'s classical algorithms buffer the
/// input internally and run the offline algorithm at [`finalize`] time.
/// True online sketches (positional MinHash, online SimHash) are
/// scheduled for v0.2 — they require positional shingles and a richer
/// state machine than the v0.1.0 contract guarantees.
///
/// # Example
///
/// ```
/// use txtfp::{
///     Canonicalizer, MinHashFingerprinter, MinHashStreaming,
///     ShingleTokenizer, StreamingFingerprinter, WordTokenizer,
/// };
///
/// let inner = MinHashFingerprinter::<_, 64>::new(
///     Canonicalizer::default(),
///     ShingleTokenizer { k: 3, inner: WordTokenizer },
/// );
/// let mut s = MinHashStreaming::new(inner);
///
/// s.update(b"the quick brown fox").unwrap();
/// s.update(b" jumps over the lazy dog").unwrap();
/// let sig = s.finalize().unwrap();
/// assert_eq!(sig.schema, 1);
/// ```
///
/// [`finalize`]: StreamingFingerprinter::finalize