1use crate::dataset::Dataset;
4use crate::ftype::FileType;
5use crate::{Bytes, MAX_RECURSION_DEPTH};
6
7use std::collections::HashMap;
8use std::fs::File;
9use std::hash::{DefaultHasher, Hash, Hasher};
10use std::io::{self, BufRead, BufReader, Lines, Read, Seek, SeekFrom, Write};
11use std::path::{Path, PathBuf};
12use std::sync::atomic::{AtomicUsize, Ordering};
13
14use anyhow::{bail, ensure, Result};
15use dashmap::{DashMap, DashSet};
16use rayon::prelude::*;
17use walkdir::WalkDir;
18
19pub(crate) const NGRAM_BUFFER_SIZE: usize = 4096;
20
21#[inline]
22fn calculate_hash<T: Hash>(t: &T) -> u64 {
23 let mut s = DefaultHasher::new();
24 t.hash(&mut s);
25 s.finish()
26}
27
28fn read_lines<P: AsRef<Path>>(filename: P) -> io::Result<Lines<BufReader<File>>> {
32 let file = File::open(filename)?;
33 Ok(BufReader::new(file).lines())
34}
35
36#[derive(Clone)]
38pub struct NgramsFile {
39 pub ngrams: HashMap<Bytes, usize>,
41
42 pub ftype: FileType,
44
45 pub n: usize,
47}
48
49impl NgramsFile {
50 pub fn load<P: AsRef<Path>>(ngrams_file: P) -> Result<Self> {
60 let mut file_type = FileType::NotSet;
61 let mut n = 0;
62 let mut ngrams = HashMap::new();
63
64 let lines = read_lines(&ngrams_file)?;
65 let mut ngram_counter = 0;
66 let mut line_counter = 0u32;
67 for line in lines.map_while(Result::ok) {
68 if line.starts_with("# File type:") {
69 if let Ok(ftype) = Dataset::file_type_from_line(&line) {
70 file_type = ftype;
71 }
72 line_counter += 1;
73 continue;
74 }
75
76 let line = if let Some(l) = line.split(',').next() {
77 l
78 } else {
79 &line
80 };
81
82 if !line.len().is_multiple_of(2) {
83 bail!("Line {line_counter} {line} has odd number of characters.");
84 }
85
86 if n == 0 {
87 n = line.len() / 2;
88 } else if line.len() / 2 != n {
89 bail!(
90 "Line {line_counter} {line} has unexpected length of {} bytes, expected {n}",
91 line.len() / 2
92 );
93 }
94
95 match hex::decode(line) {
96 Ok(ngram) => {
97 ngrams.insert(ngram, ngram_counter);
98 ngram_counter += 1;
99 }
100 Err(e) => bail!("Line {line_counter} has non-hexidecimal ngram {line}: {e}"),
101 }
102 line_counter += 1;
103 }
104
105 ensure!(
106 !ngrams.is_empty(),
107 "No n-grams read from {}.",
108 ngrams_file.as_ref().display()
109 );
110 ensure!(
111 file_type != FileType::NotSet,
112 "No file type specified in n-grams file."
113 );
114
115 Ok(Self {
116 ngrams,
117 ftype: file_type,
118 n,
119 })
120 }
121
122 #[must_use]
124 pub fn into_vec(self) -> Vec<Bytes> {
125 let mut ngrams_vec = vec![Vec::new(); self.ngrams.len()];
126 for (ngram, index) in self.ngrams {
127 ngrams_vec[index] = ngram;
128 }
129 ngrams_vec
130 }
131}
132
133pub struct Ngrammer {
135 n: u16,
137
138 k: usize,
140
141 paths: Vec<PathBuf>,
143
144 ftype: FileType,
146
147 ngrams: DashMap<Bytes, usize>,
149}
150
151impl Ngrammer {
152 #[cfg(not(feature = "low-memory"))]
153 const COUNTS: usize = 2_000_000;
154 #[cfg(feature = "low-memory")]
155 const COUNTS: usize = 100_000;
156
157 pub fn new(
163 ftype: Option<FileType>,
164 dir: &Path,
165 n: u16,
166 k: usize,
167 verbose: bool,
168 ) -> Result<Self> {
169 let mut paths = Vec::new();
170 let allow_downgrade = ftype.is_none();
171 let mut ftype = if let Some(ftype) = ftype {
172 ftype
173 } else {
174 FileType::NotSet
175 };
176
177 for entry in WalkDir::new(dir)
178 .max_depth(MAX_RECURSION_DEPTH)
179 .follow_links(true)
180 .into_iter()
181 .flatten()
182 {
183 if entry.file_type().is_file() {
184 if ftype != FileType::NotSet && !ftype.matches_path(entry.path())? {
185 if let Ok(Some(detected)) = FileType::from_path(entry.path()) {
186 if allow_downgrade {
188 let new_type = ftype.downgrade(detected)?;
189 if verbose {
190 println!(
191 "{}: Downgrading from {ftype} to {new_type}",
192 entry.path().display()
193 );
194 }
195 ftype = new_type;
196 } else {
197 bail!(
198 "File {} detected as {detected} which does not match expected type {ftype:?}",
199 entry.path().display()
200 );
201 }
202 } else {
203 bail!(
206 "File {} does not match expected type {ftype:?}",
207 entry.path().display()
208 );
209 }
210 } else if ftype == FileType::NotSet {
211 if let Some(detected_type) = FileType::from_path(entry.path())? {
212 ftype = detected_type;
213 if verbose {
214 println!(
215 "{}: Detected file type {detected_type}",
216 entry.path().display()
217 );
218 }
219 } else {
220 bail!("Unknown file type for {}", entry.path().display());
221 }
222 }
223
224 paths.push(entry.into_path());
226 }
227 }
228
229 ensure!(!paths.is_empty(), "No files found!");
230 ensure!(
231 ftype != FileType::NotSet,
232 "No file type specified in n-grams file."
233 );
234
235 Ok(Self {
236 n,
237 k,
238 paths,
239 ftype,
240 ngrams: DashMap::new(),
241 })
242 }
243
244 #[allow(clippy::cast_possible_truncation)]
246 pub fn find(&mut self) {
247 let data: Vec<AtomicUsize> = vec![0usize; Self::COUNTS]
248 .into_iter()
249 .map(AtomicUsize::new)
250 .collect();
251
252 self.paths.par_iter().for_each(|p| {
253 for ngrams in self.find_ngram(p).unwrap_or_default() {
254 let index = calculate_hash(&ngrams) as usize % Self::COUNTS;
255 data[index].fetch_add(1, Ordering::Relaxed);
256 }
257 });
258
259 let min_count = if self.k < Self::COUNTS {
260 let mut sorted: Vec<usize> = data
261 .iter()
262 .map(|v| v.load(Ordering::Relaxed))
263 .collect::<Vec<usize>>();
264 sorted.par_sort();
265 sorted[sorted.len() - self.k]
266 } else {
267 1
268 };
269
270 let n = u64::from(self.n);
271 let kept_ngrams = DashMap::with_capacity(self.k);
272 self.paths.par_iter().for_each(|p| {
273 let file_size = match p.metadata() {
274 Ok(metadata) => metadata.len(),
275 Err(_) => return,
276 };
277
278 let Ok(mut file) = File::open(p) else { return };
279 let mut buffer = vec![0; NGRAM_BUFFER_SIZE];
280
281 loop {
282 let read_count = file.read(&mut buffer).unwrap_or(0);
283 let position = file.stream_position().unwrap_or_default();
284 if position < n {
285 break;
287 }
288 file.seek(SeekFrom::Start(position - n)).unwrap_or_default();
289
290 for index in 0..read_count - self.n as usize {
291 let bytes = &buffer[index..index + self.n as usize];
292 let index = calculate_hash(&bytes) as usize % Self::COUNTS;
293 let count = data[index].load(Ordering::Relaxed);
294 if count >= min_count {
295 kept_ngrams.insert(bytes.to_vec(), count);
296 if kept_ngrams.len() >= self.k {
297 break;
298 }
299 }
300 }
301
302 if position >= file_size {
303 break;
304 }
305 }
306 });
307
308 self.ngrams = kept_ngrams;
309 }
310
311 fn find_ngram(&self, path: &Path) -> Result<DashSet<Bytes>> {
312 let ngrams = DashSet::new();
313 let file_size = path.metadata()?.len();
314
315 let mut buffer = vec![0; NGRAM_BUFFER_SIZE];
316 let mut file = File::open(path)?;
317 let n = u64::from(self.n);
318
319 loop {
320 let read_count = file.read(&mut buffer)?;
321 let position = file.stream_position()?;
322 if position < n {
323 break;
325 }
326 file.seek(SeekFrom::Start(position - n))?;
327
328 for index in 0..read_count - self.n as usize {
329 let bytes = &buffer[index..index + self.n as usize];
330 ngrams.insert(Vec::from(bytes));
331 }
332
333 if position >= file_size {
334 break;
335 }
336 }
337
338 Ok(ngrams)
339 }
340
341 pub fn save<P: AsRef<Path>>(&self, path: P, counts: bool) -> Result<()> {
347 let mut output = File::create(path)?;
348 writeln!(output, "# File type: {}", self.ftype)?;
349 for entry in &self.ngrams {
350 if counts {
351 writeln!(output, "{},{}", hex::encode(entry.key()), entry.value())?;
352 } else {
353 writeln!(output, "{}", hex::encode(entry.key()))?;
354 }
355 }
356 output.flush()?;
357 Ok(())
358 }
359
360 #[inline]
362 #[must_use]
363 pub fn ngrams(&self) -> &DashMap<Bytes, usize> {
364 &self.ngrams
365 }
366
367 #[inline]
369 #[must_use]
370 pub fn ftype(&self) -> FileType {
371 self.ftype
372 }
373}