1#![doc = include_str!("../readme.md")]
4#![deny(clippy::all)]
5#![deny(clippy::pedantic)]
7#![allow(clippy::doc_markdown)] #![deny(missing_docs)]
9#![forbid(unsafe_code)]
10
11pub mod dataset;
13
14pub mod model;
16
17use std::fs::File;
18use std::hash::{DefaultHasher, Hash, Hasher};
19use std::io::{Read, Seek, SeekFrom};
20use std::path::{Path, PathBuf};
21use std::sync::atomic::{AtomicUsize, Ordering};
22
23use anyhow::{ensure, Result};
24use dashmap::{DashMap, DashSet};
25use rayon::prelude::*;
26use walkdir::WalkDir;
27
/// Version string of the form
/// `v<CARGO_PKG_VERSION>-<VERGEN_GIT_DESCRIBE> <VERGEN_BUILD_DATE>`.
///
/// All three components are read from environment variables at compile time, so the
/// crate fails to build if any of them is unset.
// NOTE(review): the `VERGEN_*` variables are presumably emitted by a `vergen` build
// script — confirm against `build.rs`.
pub const VERSION: &str = concat!(
    "v",
    env!("CARGO_PKG_VERSION"),
    "-",
    env!("VERGEN_GIT_DESCRIBE"),
    " ",
    env!("VERGEN_BUILD_DATE")
);
37
/// Owned sequence of raw bytes; the key type of the n-gram map returned by
/// `Ngrammer::ngrams`.
pub type Bytes = Vec<u8>;
40
/// Maximum directory depth descended into when `Ngrammer::new` walks a directory for files.
pub const MAX_RECURSION_DEPTH: usize = 10;
/// Default size, in bytes, of the read buffer used while scanning files for n-grams.
const NGRAM_BUFFER_SIZE: usize = 4096;
44
/// Feeds `t` into a freshly constructed [`DefaultHasher`] and returns the resulting
/// 64-bit digest.
///
/// Every call starts from a new hasher, so equal values always produce equal digests
/// within one build of the program.
fn calculate_hash<T: Hash>(t: &T) -> u64 {
    let mut hasher = DefaultHasher::new();
    Hash::hash(t, &mut hasher);
    hasher.finish()
}
50
/// Extracts byte n-grams from the files found under a directory.
pub struct Ngrammer {
    /// Length, in bytes, of each n-gram.
    n: u16,

    /// Target number of n-grams to keep when selecting the most common ones.
    k: usize,

    /// Files collected by walking the directory passed to `new`.
    paths: Vec<PathBuf>,
}
62
63impl Ngrammer {
64 const COUNTS: usize = 1_000_000;
65
66 pub fn new(dir: &Path, n: u16, k: usize) -> Result<Self> {
72 let mut paths = Vec::new();
73
74 for entry in WalkDir::new(dir)
75 .max_depth(MAX_RECURSION_DEPTH)
76 .follow_links(true)
77 .into_iter()
78 .flatten()
79 {
80 if entry.file_type().is_file() {
81 paths.push(entry.into_path());
82 }
83 }
84
85 ensure!(!paths.is_empty(), "No files found!");
86 Ok(Self { n, k, paths })
87 }
88
89 #[allow(clippy::cast_possible_truncation)]
91 pub fn ngrams(&self) -> DashMap<Bytes, usize> {
92 let data: Vec<AtomicUsize> = vec![0usize; Self::COUNTS]
93 .into_iter()
94 .map(AtomicUsize::new)
95 .collect();
96
97 self.paths.par_iter().for_each(|p| {
98 for ngrams in self.find_ngram(p).unwrap_or_default() {
99 let index = calculate_hash(&ngrams) as usize % Self::COUNTS;
100 data[index].fetch_add(1, Ordering::Relaxed);
101 }
102 });
103
104 let min_count = if self.k < Self::COUNTS {
105 let mut sorted: Vec<usize> = data
106 .iter()
107 .map(|v| v.load(Ordering::Relaxed))
108 .collect::<Vec<usize>>();
109 sorted.par_sort();
110 sorted[sorted.len() - self.k]
111 } else {
112 1
113 };
114
115 let kept_ngrams = DashMap::with_capacity(self.k);
116 self.paths.par_iter().for_each(|p| {
117 let file_size = match p.metadata() {
118 Ok(metadata) => metadata.len(),
119 Err(_) => return,
120 };
121
122 let Ok(mut file) = File::open(p) else { return };
123 let mut buffer = vec![0; NGRAM_BUFFER_SIZE];
124 let n = u64::from(self.n);
125
126 loop {
127 let read_count = file.read(&mut buffer).unwrap_or(0);
128 let position = file.stream_position().unwrap_or_default();
129 if position < n {
130 break;
132 }
133 file.seek(SeekFrom::Start(position - n)).unwrap_or_default();
134
135 for index in 0..read_count - self.n as usize {
136 let bytes = &buffer[index..index + self.n as usize];
137 let index = calculate_hash(&bytes) as usize % Self::COUNTS;
138 let count = data[index].load(Ordering::Relaxed);
139 if count >= min_count {
140 kept_ngrams.insert(bytes.to_vec(), count);
141 if kept_ngrams.len() >= self.k {
142 break;
143 }
144 }
145 }
146
147 if position >= file_size {
148 break;
149 }
150 }
151 });
152
153 kept_ngrams
154 }
155
156 fn find_ngram(&self, path: &Path) -> Result<DashSet<Bytes>> {
157 let ngrams = DashSet::new();
158 let file_size = path.metadata()?.len();
159
160 let mut buffer = vec![0; NGRAM_BUFFER_SIZE];
161 let mut file = File::open(path)?;
162 let n = u64::from(self.n);
163
164 loop {
165 let read_count = file.read(&mut buffer)?;
166 let position = file.stream_position()?;
167 if position < n {
168 break;
170 }
171 file.seek(SeekFrom::Start(position - n))?;
172
173 for index in 0..read_count - self.n as usize {
174 let bytes = &buffer[index..index + self.n as usize];
175 ngrams.insert(Vec::from(bytes));
176 }
177
178 if position >= file_size {
179 break;
180 }
181 }
182
183 Ok(ngrams)
184 }
185}