1use crate::dataset::Dataset;
4use crate::ftype::FileType;
5use crate::{Bytes, MAX_RECURSION_DEPTH};
6
7use std::collections::HashMap;
8use std::fs::File;
9use std::hash::{DefaultHasher, Hash, Hasher};
10use std::io::{self, BufRead, BufReader, Lines, Read, Seek, SeekFrom, Write};
11use std::path::{Path, PathBuf};
12use std::sync::atomic::{AtomicUsize, Ordering};
13
14use anyhow::{bail, ensure, Result};
15use dashmap::{DashMap, DashSet};
16use rayon::prelude::*;
17use walkdir::WalkDir;
18
19pub(crate) const NGRAM_BUFFER_SIZE: usize = 4096;
20
21#[inline]
22fn calculate_hash<T: Hash>(t: &T) -> u64 {
23 let mut s = DefaultHasher::new();
24 t.hash(&mut s);
25 s.finish()
26}
27
28fn read_lines<P: AsRef<Path>>(filename: P) -> io::Result<Lines<BufReader<File>>> {
32 let file = File::open(filename)?;
33 Ok(BufReader::new(file).lines())
34}
35
36#[derive(Clone)]
38pub struct NgramsFile {
39 pub ngrams: HashMap<Bytes, usize>,
41
42 pub ftype: FileType,
44
45 pub n: usize,
47}
48
49impl NgramsFile {
50 pub fn load<P: AsRef<Path>>(ngrams_file: P) -> Result<Self> {
60 let mut file_type = FileType::NotSet;
61 let mut n = 0;
62 let mut ngrams = HashMap::new();
63
64 let lines = read_lines(&ngrams_file)?;
65 let mut ngram_counter = 0;
66 let mut line_counter = 0u32;
67 for line in lines.map_while(Result::ok) {
68 if line.starts_with("# File type:") {
69 if let Ok(ftype) = Dataset::file_type_from_line(&line) {
70 file_type = ftype;
71 }
72 line_counter += 1;
73 continue;
74 }
75
76 let line = if let Some(l) = line.split(',').next() {
77 l
78 } else {
79 &line
80 };
81
82 if !line.len().is_multiple_of(2) {
83 bail!("Line {line_counter} {line} has odd number of characters.");
84 }
85
86 if n == 0 {
87 n = line.len() / 2;
88 } else if line.len() / 2 != n {
89 bail!(
90 "Line {line_counter} {line} has unexpected length of {} bytes, expected {n}",
91 line.len() / 2
92 );
93 }
94
95 match hex::decode(line) {
96 Ok(ngram) => {
97 ngrams.insert(ngram, ngram_counter);
98 ngram_counter += 1;
99 }
100 Err(e) => bail!("Line {line_counter} has non-hexidecimal ngram {line}: {e}"),
101 }
102 line_counter += 1;
103 }
104
105 ensure!(
106 !ngrams.is_empty(),
107 "No n-grams read from {}.",
108 ngrams_file.as_ref().display()
109 );
110 ensure!(
111 file_type != FileType::NotSet,
112 "No file type specified in n-grams file."
113 );
114
115 Ok(Self {
116 ngrams,
117 ftype: file_type,
118 n,
119 })
120 }
121
122 #[must_use]
124 pub fn into_vec(self) -> Vec<Bytes> {
125 let mut ngrams_vec = vec![Vec::new(); self.ngrams.len()];
126 for (ngram, index) in self.ngrams {
127 ngrams_vec[index] = ngram;
128 }
129 ngrams_vec
130 }
131}
132
133pub struct Ngrammer {
135 n: u16,
137
138 k: usize,
140
141 paths: Vec<PathBuf>,
143
144 ftype: FileType,
146
147 ngrams: DashMap<Bytes, usize>,
149}
150
151impl Ngrammer {
152 #[cfg(not(feature = "low-memory"))]
153 const COUNTS: usize = 2_000_000;
154 #[cfg(feature = "low-memory")]
155 const COUNTS: usize = 100_000;
156
157 pub fn new(ftype: Option<FileType>, dir: &Path, n: u16, k: usize) -> Result<Self> {
163 let mut paths = Vec::new();
164 let mut ftype = ftype;
165
166 for entry in WalkDir::new(dir)
167 .max_depth(MAX_RECURSION_DEPTH)
168 .follow_links(true)
169 .into_iter()
170 .flatten()
171 {
172 if entry.file_type().is_file() {
173 match ftype {
174 Some(file_type) => {
175 if !file_type.matches_path(entry.path())? {
176 bail!(
177 "File {} does not match expected type {file_type:?}",
178 entry.path().display()
179 );
180 }
181 }
182 None => {
183 if let Some(detected_type) = FileType::from_path(entry.path())? {
184 ftype = Some(detected_type);
185 } else {
186 bail!("Unknown file type for {}", entry.path().display());
187 }
188 }
189 }
190
191 paths.push(entry.into_path());
192 }
193 }
194
195 ensure!(!paths.is_empty(), "No files found!");
196 if let Some(ftype) = ftype {
197 Ok(Self {
198 n,
199 k,
200 paths,
201 ftype,
202 ngrams: DashMap::new(),
203 })
204 } else {
205 bail!("File type not provided and not detected!");
206 }
207 }
208
209 #[allow(clippy::cast_possible_truncation)]
211 pub fn find(&mut self) {
212 let data: Vec<AtomicUsize> = vec![0usize; Self::COUNTS]
213 .into_iter()
214 .map(AtomicUsize::new)
215 .collect();
216
217 self.paths.par_iter().for_each(|p| {
218 for ngrams in self.find_ngram(p).unwrap_or_default() {
219 let index = calculate_hash(&ngrams) as usize % Self::COUNTS;
220 data[index].fetch_add(1, Ordering::Relaxed);
221 }
222 });
223
224 let min_count = if self.k < Self::COUNTS {
225 let mut sorted: Vec<usize> = data
226 .iter()
227 .map(|v| v.load(Ordering::Relaxed))
228 .collect::<Vec<usize>>();
229 sorted.par_sort();
230 sorted[sorted.len() - self.k]
231 } else {
232 1
233 };
234
235 let kept_ngrams = DashMap::with_capacity(self.k);
236 self.paths.par_iter().for_each(|p| {
237 let file_size = match p.metadata() {
238 Ok(metadata) => metadata.len(),
239 Err(_) => return,
240 };
241
242 let Ok(mut file) = File::open(p) else { return };
243 let mut buffer = vec![0; NGRAM_BUFFER_SIZE];
244 let n = u64::from(self.n);
245
246 loop {
247 let read_count = file.read(&mut buffer).unwrap_or(0);
248 let position = file.stream_position().unwrap_or_default();
249 if position < n {
250 break;
252 }
253 file.seek(SeekFrom::Start(position - n)).unwrap_or_default();
254
255 for index in 0..read_count - self.n as usize {
256 let bytes = &buffer[index..index + self.n as usize];
257 let index = calculate_hash(&bytes) as usize % Self::COUNTS;
258 let count = data[index].load(Ordering::Relaxed);
259 if count >= min_count {
260 kept_ngrams.insert(bytes.to_vec(), count);
261 if kept_ngrams.len() >= self.k {
262 break;
263 }
264 }
265 }
266
267 if position >= file_size {
268 break;
269 }
270 }
271 });
272
273 self.ngrams = kept_ngrams;
274 }
275
276 fn find_ngram(&self, path: &Path) -> Result<DashSet<Bytes>> {
277 let ngrams = DashSet::new();
278 let file_size = path.metadata()?.len();
279
280 let mut buffer = vec![0; NGRAM_BUFFER_SIZE];
281 let mut file = File::open(path)?;
282 let n = u64::from(self.n);
283
284 loop {
285 let read_count = file.read(&mut buffer)?;
286 let position = file.stream_position()?;
287 if position < n {
288 break;
290 }
291 file.seek(SeekFrom::Start(position - n))?;
292
293 for index in 0..read_count - self.n as usize {
294 let bytes = &buffer[index..index + self.n as usize];
295 ngrams.insert(Vec::from(bytes));
296 }
297
298 if position >= file_size {
299 break;
300 }
301 }
302
303 Ok(ngrams)
304 }
305
306 pub fn save<P: AsRef<Path>>(&self, path: P, counts: bool) -> Result<()> {
312 let mut output = File::create(path)?;
313 writeln!(output, "# File type: {}", self.ftype)?;
314 for entry in &self.ngrams {
315 if counts {
316 writeln!(output, "{},{}", hex::encode(entry.key()), entry.value())?;
317 } else {
318 writeln!(output, "{}", hex::encode(entry.key()))?;
319 }
320 }
321 output.flush()?;
322 Ok(())
323 }
324
325 #[inline]
327 #[must_use]
328 pub fn ngrams(&self) -> &DashMap<Bytes, usize> {
329 &self.ngrams
330 }
331
332 #[inline]
334 #[must_use]
335 pub fn ftype(&self) -> FileType {
336 self.ftype
337 }
338}