1use crate::dataset::Dataset;
4use crate::ftype::FileType;
5use crate::{Bytes, MAX_RECURSION_DEPTH};
6
7use std::collections::HashMap;
8use std::fs::File;
9use std::hash::{DefaultHasher, Hash, Hasher};
10use std::io::{self, BufRead, BufReader, Lines, Read, Seek, SeekFrom, Write};
11use std::path::{Path, PathBuf};
12use std::sync::atomic::{AtomicUsize, Ordering};
13
14use anyhow::{Result, bail, ensure};
15use dashmap::{DashMap, DashSet};
16use rayon::prelude::*;
17use walkdir::WalkDir;
18
19pub(crate) const NGRAM_BUFFER_SIZE: usize = 4096;
20
21#[inline]
22fn calculate_hash<T: Hash>(t: &T) -> u64 {
23 let mut s = DefaultHasher::new();
24 t.hash(&mut s);
25 s.finish()
26}
27
28fn read_lines<P: AsRef<Path>>(filename: P) -> io::Result<Lines<BufReader<File>>> {
32 let file = File::open(filename)?;
33 Ok(BufReader::new(file).lines())
34}
35
36#[derive(Clone)]
38pub struct NgramsFile {
39 pub ngrams: HashMap<Bytes, usize>,
41
42 pub ftype: FileType,
44
45 pub n: usize,
47}
48
49impl NgramsFile {
50 pub fn load<P: AsRef<Path>>(ngrams_file: P) -> Result<Self> {
60 let mut file_type = FileType::NotSet;
61 let mut n = 0;
62 let mut ngrams = HashMap::new();
63
64 let lines = read_lines(&ngrams_file)?;
65 let mut ngram_counter = 0;
66 let mut line_counter = 0u32;
67 for line in lines.map_while(Result::ok) {
68 if line.starts_with("# File type:") {
69 if let Ok(ftype) = Dataset::file_type_from_line(&line) {
70 file_type = ftype;
71 }
72 line_counter += 1;
73 continue;
74 }
75
76 let line = if let Some(l) = line.split(',').next() {
77 l
78 } else {
79 &line
80 };
81
82 if !line.len().is_multiple_of(2) {
83 bail!("Line {line_counter} {line} has odd number of characters.");
84 }
85
86 if n == 0 {
87 n = line.len() / 2;
88 } else if line.len() / 2 != n {
89 bail!(
90 "Line {line_counter} {line} has unexpected length of {} bytes, expected {n}",
91 line.len() / 2
92 );
93 }
94
95 match hex::decode(line) {
96 Ok(ngram) => {
97 ngrams.insert(ngram, ngram_counter);
98 ngram_counter += 1;
99 }
100 Err(e) => bail!("Line {line_counter} has non-hexidecimal ngram {line}: {e}"),
101 }
102 line_counter += 1;
103 }
104
105 ensure!(
106 !ngrams.is_empty(),
107 "No n-grams read from {}.",
108 ngrams_file.as_ref().display()
109 );
110 ensure!(
111 file_type != FileType::NotSet,
112 "No file type specified in n-grams file."
113 );
114
115 Ok(Self {
116 ngrams,
117 ftype: file_type,
118 n,
119 })
120 }
121
122 #[must_use]
124 pub fn into_vec(self) -> Vec<Bytes> {
125 let mut ngrams_vec = vec![Vec::new(); self.ngrams.len()];
126 for (ngram, index) in self.ngrams {
127 ngrams_vec[index] = ngram;
128 }
129 ngrams_vec
130 }
131}
132
133pub struct Ngrammer {
135 n: u16,
137
138 k: usize,
140
141 paths: Vec<PathBuf>,
143
144 ftype: FileType,
146
147 ngrams: DashMap<Bytes, usize>,
149}
150
151impl Ngrammer {
152 #[cfg(not(feature = "low-memory"))]
153 const COUNTS: usize = 2_000_000;
154 #[cfg(feature = "low-memory")]
155 const COUNTS: usize = 100_000;
156
157 pub fn new(
163 ftype: Option<FileType>,
164 dir: &Path,
165 n: u16,
166 k: usize,
167 verbose: bool,
168 ) -> Result<Self> {
169 let mut paths = Vec::new();
170 let allow_downgrade = ftype.is_none();
171 let mut ftype = if let Some(ftype) = ftype {
172 ftype
173 } else {
174 FileType::NotSet
175 };
176
177 for entry in WalkDir::new(dir)
178 .max_depth(MAX_RECURSION_DEPTH)
179 .follow_links(true)
180 .into_iter()
181 .flatten()
182 {
183 if entry.file_type().is_file() {
184 if ftype != FileType::NotSet && !ftype.matches_path(entry.path())? {
185 if let Ok(Some(detected)) = FileType::from_path(entry.path()) {
186 if detected == FileType::DsStore {
187 continue;
188 }
189
190 if allow_downgrade {
192 let new_type = ftype.downgrade(detected)?;
193 if verbose {
194 println!(
195 "{}: Downgrading from {ftype} to {new_type}",
196 entry.path().display()
197 );
198 }
199 ftype = new_type;
200 } else {
201 bail!(
202 "File {} detected as {detected} which does not match expected type {ftype:?}",
203 entry.path().display()
204 );
205 }
206 } else {
207 bail!(
210 "File {} does not match expected type {ftype:?}",
211 entry.path().display()
212 );
213 }
214 } else if ftype == FileType::NotSet {
215 if let Some(detected_type) = FileType::from_path(entry.path())? {
216 if detected_type == FileType::DsStore {
217 continue;
218 }
219
220 ftype = detected_type;
221 if verbose {
222 println!(
223 "{}: Detected file type {detected_type}",
224 entry.path().display()
225 );
226 }
227 } else {
228 bail!("Unknown file type for {}", entry.path().display());
229 }
230 }
231
232 paths.push(entry.into_path());
234 }
235 }
236
237 ensure!(!paths.is_empty(), "No files found!");
238 ensure!(
239 ftype != FileType::NotSet,
240 "No file type specified in n-grams file."
241 );
242
243 Ok(Self {
244 n,
245 k,
246 paths,
247 ftype,
248 ngrams: DashMap::new(),
249 })
250 }
251
252 #[allow(clippy::cast_possible_truncation)]
254 pub fn find(&mut self) {
255 let data: Vec<AtomicUsize> = vec![0usize; Self::COUNTS]
256 .into_iter()
257 .map(AtomicUsize::new)
258 .collect();
259
260 self.paths.par_iter().for_each(|p| {
261 for ngrams in self.find_ngram(p).unwrap_or_default() {
262 let index = calculate_hash(&ngrams) as usize % Self::COUNTS;
263 data[index].fetch_add(1, Ordering::Relaxed);
264 }
265 });
266
267 let min_count = if self.k < Self::COUNTS {
268 let mut sorted: Vec<usize> = data
269 .iter()
270 .map(|v| v.load(Ordering::Relaxed))
271 .collect::<Vec<usize>>();
272 sorted.par_sort();
273 sorted[sorted.len() - self.k]
274 } else {
275 1
276 };
277
278 let n = u64::from(self.n);
279 let kept_ngrams = DashMap::with_capacity(self.k);
280 self.paths.par_iter().for_each(|p| {
281 let file_size = match p.metadata() {
282 Ok(metadata) => metadata.len(),
283 Err(_) => return,
284 };
285
286 let Ok(mut file) = File::open(p) else { return };
287 let mut buffer = vec![0; NGRAM_BUFFER_SIZE];
288
289 loop {
290 let read_count = file.read(&mut buffer).unwrap_or(0);
291 let position = file.stream_position().unwrap_or_default();
292 if position < n {
293 break;
295 }
296 file.seek(SeekFrom::Start(position - n)).unwrap_or_default();
297
298 for index in 0..read_count - self.n as usize {
299 let bytes = &buffer[index..index + self.n as usize];
300 let index = calculate_hash(&bytes) as usize % Self::COUNTS;
301 let count = data[index].load(Ordering::Relaxed);
302 if count >= min_count {
303 kept_ngrams.insert(bytes.to_vec(), count);
304 if kept_ngrams.len() >= self.k {
305 break;
306 }
307 }
308 }
309
310 if position >= file_size {
311 break;
312 }
313 }
314 });
315
316 self.ngrams = kept_ngrams;
317 }
318
319 fn find_ngram(&self, path: &Path) -> Result<DashSet<Bytes>> {
320 let ngrams = DashSet::new();
321 let file_size = path.metadata()?.len();
322
323 let mut buffer = vec![0; NGRAM_BUFFER_SIZE];
324 let mut file = File::open(path)?;
325 let n = u64::from(self.n);
326
327 loop {
328 let read_count = file.read(&mut buffer)?;
329 let position = file.stream_position()?;
330 if position < n {
331 break;
333 }
334 file.seek(SeekFrom::Start(position - n))?;
335
336 for index in 0..read_count - self.n as usize {
337 let bytes = &buffer[index..index + self.n as usize];
338 ngrams.insert(Vec::from(bytes));
339 }
340
341 if position >= file_size {
342 break;
343 }
344 }
345
346 Ok(ngrams)
347 }
348
349 pub fn save<P: AsRef<Path>>(&self, path: P, counts: bool) -> Result<()> {
355 let mut output = File::create(path)?;
356 writeln!(output, "# File type: {}", self.ftype)?;
357 for entry in &self.ngrams {
358 if counts {
359 writeln!(output, "{},{}", hex::encode(entry.key()), entry.value())?;
360 } else {
361 writeln!(output, "{}", hex::encode(entry.key()))?;
362 }
363 }
364 output.flush()?;
365 Ok(())
366 }
367
368 #[inline]
370 #[must_use]
371 pub fn ngrams(&self) -> &DashMap<Bytes, usize> {
372 &self.ngrams
373 }
374
375 #[inline]
377 #[must_use]
378 pub fn ftype(&self) -> FileType {
379 self.ftype
380 }
381}