1use std::{collections::HashMap, fmt};
5
6use serde::{Deserialize, Serialize};
7
8use super::{
9 ngram::NgramSet,
10 preproc::{apply_aggressive, apply_normalizers},
11};
12
13#[derive(Clone, Copy, PartialEq, Debug, Serialize, Deserialize)]
15#[serde(rename_all = "lowercase")]
16pub enum LicenseType {
17 Original,
19 Header,
21 Alternate,
25}
26
27impl fmt::Display for LicenseType {
28 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
29 write!(
30 f,
31 "{}",
32 match *self {
33 LicenseType::Original => "original text",
34 LicenseType::Header => "license header",
35 LicenseType::Alternate => "alternate text",
36 }
37 )
38 }
39}
40
41#[derive(Serialize, Deserialize, Clone, Debug)]
78pub struct TextData {
79 match_data: NgramSet,
80 lines_view: (usize, usize),
81 lines_normalized: Option<Vec<String>>,
82 text_processed: Option<String>,
83}
84
/// Panic message used whenever an operation needs the normalized lines but
/// the `TextData` no longer carries them.
const TEXTDATA_TEXT_ERROR: &str =
    "TextData does not have original text";
86
87impl TextData {
88 pub fn new(text: &str) -> TextData {
98 let normalized = apply_normalizers(text);
99 let normalized_joined = normalized.join("\n");
100 let processed = apply_aggressive(&normalized_joined);
101 let match_data = NgramSet::from_str(&processed, 2);
102
103 TextData {
104 match_data,
105 lines_view: (0, normalized.len()),
106 lines_normalized: Some(normalized),
107 text_processed: Some(processed),
108 }
109 }
110
111 pub fn without_text(self) -> Self {
117 TextData {
118 match_data: self.match_data,
119 lines_view: (0, 0),
120 lines_normalized: None,
121 text_processed: None,
122 }
123 }
124
125 pub fn lines_view(&self) -> (usize, usize) {
136 self.lines_view
137 }
138
139 pub fn with_view(&self, start: usize, end: usize) -> Self {
148 let view = &self.lines_normalized.as_ref().expect(TEXTDATA_TEXT_ERROR)[start..end];
149 let view_joined = view.join("\n");
150 let processed = apply_aggressive(&view_joined);
151 TextData {
152 match_data: NgramSet::from_str(&processed, 2),
153 lines_view: (start, end),
154 lines_normalized: self.lines_normalized.clone(),
155 text_processed: Some(processed),
156 }
157 }
158
159 pub fn white_out(&self) -> Self {
167 let lines = self.lines_normalized.as_ref().expect(TEXTDATA_TEXT_ERROR);
169
170 let new_normalized: Vec<String> = lines
172 .iter()
173 .enumerate()
174 .map(|(i, line)| {
175 if i >= self.lines_view.0 && i < self.lines_view.1 {
176 "".to_string()
177 } else {
178 line.clone()
179 }
180 })
181 .collect();
182
183 let processed = apply_aggressive(&new_normalized.join("\n"));
184 TextData {
185 match_data: NgramSet::from_str(&processed, 2),
186 lines_view: (0, new_normalized.len()),
187 lines_normalized: Some(new_normalized),
188 text_processed: Some(processed),
189 }
190 }
191
192 pub fn lines(&self) -> &[String] {
194 &self.lines_normalized.as_ref().expect(TEXTDATA_TEXT_ERROR)
195 [self.lines_view.0..self.lines_view.1]
196 }
197
198 #[doc(hidden)]
199 pub fn text_processed(&self) -> Option<&str> {
200 self.text_processed.as_ref().map(String::as_ref)
201 }
202
203 pub fn match_score(&self, other: &TextData) -> f32 {
207 self.match_data.dice(&other.match_data)
208 }
209
210 #[cfg(feature = "spdx")]
211 pub(crate) fn eq_data(&self, other: &Self) -> bool {
212 self.match_data.eq(&other.match_data)
213 }
214
215 pub fn optimize_bounds(&self, other: &TextData) -> (Self, f32) {
230 assert!(self.lines_normalized.is_some(), "{}", TEXTDATA_TEXT_ERROR);
231
232 let view = self.lines_view;
233
234 let (end_optimized, _) = self.search_optimize(
236 &|end| self.with_view(view.0, end).match_score(other),
237 &|end| self.with_view(view.0, end),
238 );
239 let new_end = end_optimized.lines_view.1;
240
241 let (optimized, score) = end_optimized.search_optimize(
243 &|start| end_optimized.with_view(start, new_end).match_score(other),
244 &|start| end_optimized.with_view(start, new_end),
245 );
246 (optimized, score)
247 }
248
249 fn search_optimize(
250 &self,
251 score: &dyn Fn(usize) -> f32,
252 value: &dyn Fn(usize) -> Self,
253 ) -> (Self, f32) {
254 let mut memo: HashMap<usize, f32> = HashMap::new();
256 let mut check_score =
257 |index: usize| -> f32 { *memo.entry(index).or_insert_with(|| score(index)) };
258
259 fn search(score: &mut dyn FnMut(usize) -> f32, left: usize, right: usize) -> (usize, f32) {
260 if right - left <= 3 {
261 return (left..=right)
263 .map(|x| (x, score(x)))
264 .fold((0usize, 0f32), |acc, x| if x.1 >= acc.1 { x } else { acc });
265 }
266
267 let low = (left * 2 + right) / 3;
268 let high = (left + right * 2) / 3;
269 let score_low = score(low);
270 let score_high = score(high);
271
272 if score_low > score_high {
273 search(score, left, high - 1)
274 } else {
275 search(score, low + 1, right)
276 }
277 }
278
279 let optimal = search(&mut check_score, self.lines_view.0, self.lines_view.1);
280 (value(optimal.0), optimal.1)
281 }
282}
283
284impl<'a> From<&'a str> for TextData {
285 fn from(text: &'a str) -> Self {
286 Self::new(text)
287 }
288}
289
290impl From<String> for TextData {
291 fn from(text: String) -> Self {
292 Self::new(&text)
293 }
294}
295
#[cfg(test)]
mod tests {
    use super::*;

    /// Debug aid: prints the view and the normalized lines of `data`.
    fn dump(data: &TextData) {
        println!("{:?}", data.lines_view);
        println!("{:?}", data.lines_normalized);
    }

    /// Asserts that scoring `left` against `right` gives the same value in
    /// both directions.
    fn assert_score_is_symmetric(left: &str, right: &str) {
        let a = TextData::from(left);
        let b = TextData::from(right);

        let forward = a.match_score(&b);
        let backward = b.match_score(&a);

        assert_eq!(forward, backward);
    }

    // The optimizer should find the license inside a larger sample, wherever
    // the surrounding content sits.
    #[test]
    fn optimize_bounds() {
        let license_text = "this is a license text\nor it pretends to be one\nit's just a test";
        let sample_text = "this is a license text\nor it pretends to be one\nit's just a test\nwords\n\nhere is some\ncode\nhello();\n\n//a comment too";
        let license = TextData::from(license_text).without_text();

        // License at the very top, unrelated content below it.
        let (optimized, _) = TextData::from(sample_text).optimize_bounds(&license);
        dump(&optimized);
        assert_eq!((0, 3), optimized.lines_view());

        // One more trailing line must not move the bounds.
        let sample_text = format!("{}\none more line", sample_text);
        let (optimized, _) = TextData::from(sample_text.as_str()).optimize_bounds(&license);
        dump(&optimized);
        assert_eq!((0, 3), optimized.lines_view());

        // Four lines of leading content shift the license down.
        let sample_text = format!("some content\nat\n\nthe beginning\n{}", sample_text);
        let (optimized, _) = TextData::from(sample_text.as_str()).optimize_bounds(&license);
        dump(&optimized);
        let found = optimized.lines_view();
        // Either end bound is accepted here.
        assert!(
            (4, 7) == found || (4, 8) == found,
            "bounds are (4, 7) or (4, 8)"
        );
    }

    // Optimizing may shrink a view but must never extend past it.
    #[test]
    fn optimize_doesnt_grow_view() {
        let sample_text = "0\n1\n2\naaa aaa\naaa\naaa\naaa\n7\n8";
        let license_text = "aaa aaa aaa aaa aaa";
        let sample = TextData::from(sample_text);
        let license = TextData::from(license_text).without_text();

        // Full view: shrinks to the matching lines.
        let (optimized, _) = sample.optimize_bounds(&license);
        assert_eq!((3, 7), optimized.lines_view());

        // (starting view, expected optimized view). `with_view` takes
        // absolute line indices, so each case can start from `sample`.
        let cases = [((3, 7), (3, 7)), ((4, 6), (4, 6)), ((0, 9), (3, 7))];
        for &((start, end), expected) in cases.iter() {
            let viewed = sample.with_view(start, end);
            let (optimized, _) = viewed.optimize_bounds(&license);
            assert_eq!(expected, optimized.lines_view());
        }
    }

    #[test]
    fn match_small() {
        assert_score_is_symmetric("a b", "a\nlong\nlicense\nfile\n\n\n\n\nabcdefg");
    }

    #[test]
    fn match_empty() {
        assert_score_is_symmetric("", "a\nlong\nlicense\nfile\n\n\n\n\nabcdefg");
    }

    #[test]
    fn view_and_white_out() {
        let whole = TextData::from("aaa\nbbb\nccc\nddd");
        assert_eq!(Some("aaa bbb ccc ddd"), whole.text_processed());

        let middle = whole.with_view(1, 3);
        assert_eq!(2, middle.lines().len());
        assert_eq!(Some("bbb ccc"), middle.text_processed());

        let remainder = middle.white_out();
        assert_eq!(Some("aaa ddd"), remainder.text_processed());
    }
}