1use std::borrow::Cow;
5use std::fmt;
6
7use anyhow::Error;
8use log::{info, trace};
9use serde::Serialize;
10
11use super::{
12 license::{LicenseType, TextData},
13 store::{Match, Store},
14};
15
16#[derive(Serialize, Clone)]
18pub struct IdentifiedLicense<'a> {
19 pub name: &'a str,
21 pub kind: LicenseType,
23 pub data: &'a TextData,
25}
26
27impl fmt::Debug for IdentifiedLicense<'_> {
28 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
29 f.debug_struct("IdentifiedLicense")
30 .field("name", &self.name)
31 .field("kind", &self.kind)
32 .finish()
33 }
34}
35
36#[derive(Serialize, Debug)]
40pub struct ScanResult<'a> {
41 pub score: f32,
43 pub license: Option<IdentifiedLicense<'a>>,
46 pub containing: Vec<ContainedResult<'a>>,
48}
49
50#[derive(Serialize, Debug, Clone)]
52pub struct ContainedResult<'a> {
53 pub score: f32,
55 pub license: IdentifiedLicense<'a>,
57 pub line_range: (usize, usize),
62}
63
64pub struct ScanStrategy<'a> {
88 store: &'a Store,
89 mode: ScanMode,
90 confidence_threshold: f32,
91 shallow_limit: f32,
92 optimize: bool,
93 max_passes: u16,
94 step_size: usize,
95}
96
97pub enum ScanMode {
99 Elimination,
103
104 TopDown,
108}
109
110impl<'a> ScanStrategy<'a> {
111 pub fn new(store: &'a Store) -> ScanStrategy<'a> {
116 Self {
117 store,
118 mode: ScanMode::Elimination,
119 confidence_threshold: 0.9,
120 shallow_limit: 0.99,
121 optimize: false,
122 max_passes: 10,
123 step_size: 5,
124 }
125 }
126
127 pub fn mode(mut self, mode: ScanMode) -> Self {
132 self.mode = mode;
133 self
134 }
135
136 pub fn confidence_threshold(mut self, confidence_threshold: f32) -> Self {
145 self.confidence_threshold = confidence_threshold;
146 self
147 }
148
149 pub fn shallow_limit(mut self, shallow_limit: f32) -> Self {
160 self.shallow_limit = shallow_limit;
161 self
162 }
163
164 pub fn optimize(mut self, optimize: bool) -> Self {
170 self.optimize = optimize;
171 self
172 }
173
174 pub fn max_passes(mut self, max_passes: u16) -> Self {
182 self.max_passes = max_passes;
183 self
184 }
185
186 pub fn step_size(mut self, step_size: usize) -> Self {
191 self.step_size = step_size;
192 self
193 }
194
195 pub fn scan(&self, text: &TextData) -> Result<ScanResult<'_>, Error> {
200 match self.mode {
201 ScanMode::Elimination => Ok(self.scan_elimination(text)),
202 ScanMode::TopDown => Ok(self.scan_topdown(text)),
203 }
204 }
205
206 fn scan_elimination(&self, text: &TextData) -> ScanResult<'_> {
207 let mut analysis = self.store.analyze(text);
208 let score = analysis.score;
209 let mut license = None;
210 let mut containing = Vec::new();
211 info!("Elimination top-level analysis: {:?}", analysis);
212
213 if analysis.score > self.confidence_threshold {
215 license = Some(IdentifiedLicense {
216 name: analysis.name,
217 kind: analysis.license_type,
218 data: analysis.data,
219 });
220
221 if analysis.score > self.shallow_limit {
223 return ScanResult {
224 score,
225 license,
226 containing,
227 };
228 }
229 }
230
231 if self.optimize {
232 let mut current_text: Cow<'_, TextData> = Cow::Borrowed(text);
235 for _n in 0..self.max_passes {
236 let (optimized, optimized_score) = current_text.optimize_bounds(analysis.data);
237
238 if optimized_score < self.confidence_threshold {
240 break;
241 }
242
243 info!(
245 "Optimized to {} lines ({}, {})",
246 optimized_score,
247 optimized.lines_view().0,
248 optimized.lines_view().1
249 );
250 containing.push(ContainedResult {
251 score: optimized_score,
252 license: IdentifiedLicense {
253 name: analysis.name,
254 kind: analysis.license_type,
255 data: analysis.data,
256 },
257 line_range: optimized.lines_view(),
258 });
259
260 current_text = Cow::Owned(optimized.white_out());
262 analysis = self.store.analyze(¤t_text);
263 }
264 }
265
266 ScanResult {
267 score,
268 license,
269 containing,
270 }
271 }
272
273 fn scan_topdown(&self, text: &TextData) -> ScanResult<'_> {
274 let (_, text_end) = text.lines_view();
275 let mut containing = Vec::new();
276
277 let mut current_start = 0usize;
279 while current_start < text_end {
280 let result = self.topdown_find_contained_license(text, current_start);
281
282 let contained = match result {
283 Some(c) => c,
284 None => break,
285 };
286
287 current_start = contained.line_range.1 + 1;
288 containing.push(contained);
289 }
290
291 ScanResult {
292 score: 0.0,
293 license: None,
294 containing,
295 }
296 }
297
298 fn topdown_find_contained_license(
299 &self,
300 text: &TextData,
301 starting_at: usize,
302 ) -> Option<ContainedResult<'_>> {
303 let (_, text_end) = text.lines_view();
304 let mut found: (usize, usize, Option<Match<'_>>) = (0, 0, None);
305
306 trace!(
307 "topdown_find_contained_license starting at line {}",
308 starting_at
309 );
310
311 let mut hit_threshold = false;
313
314 'start: for start in (starting_at..text_end).step_by(self.step_size) {
316 for end in (start..=text_end).step_by(self.step_size) {
318 let view = text.with_view(start, end);
319 let analysis = self.store.analyze(&view);
320
321 if !hit_threshold && analysis.score >= self.confidence_threshold {
326 hit_threshold = true;
327 trace!(
328 "hit_threshold at ({}, {}) with score {}",
329 start, end, analysis.score
330 );
331 }
332
333 if hit_threshold {
334 if analysis.score < self.confidence_threshold {
335 trace!(
337 "exiting threshold at ({}, {}) with score {}",
338 start, end, analysis.score
339 );
340 break 'start;
341 } else {
342 found = (start, end, Some(analysis));
344 }
345 }
346 }
347 }
348
349 let matched = found.2?;
352 let check = matched.data;
353 let view = text.with_view(found.0, found.1);
354 let (optimized, optimized_score) = view.optimize_bounds(check);
355
356 trace!(
357 "optimized {} {} at ({:?})",
358 optimized_score,
359 matched.name,
360 optimized.lines_view()
361 );
362
363 if optimized_score < self.confidence_threshold {
364 return None;
365 }
366
367 Some(ContainedResult {
368 score: optimized_score,
369 license: IdentifiedLicense {
370 name: matched.name,
371 kind: matched.license_type,
372 data: matched.data,
373 },
374 line_range: optimized.lines_view(),
375 })
376 }
377}
378
379#[cfg(test)]
380mod tests {
381 use super::*;
382
383 #[test]
384 fn can_construct() {
385 let store = Store::new();
386 ScanStrategy::new(&store);
387 ScanStrategy::new(&store).confidence_threshold(0.5);
388 ScanStrategy::new(&store)
389 .shallow_limit(0.99)
390 .optimize(true)
391 .max_passes(100);
392 }
393
394 #[test]
395 fn shallow_scan() {
396 let store = create_dummy_store();
397 let test_data = TextData::new("lorem ipsum\naaaaa bbbbb\nccccc\nhello");
398
399 let strategy = ScanStrategy::new(&store)
401 .confidence_threshold(0.5)
402 .shallow_limit(0.0);
403 let result = strategy.scan(&test_data).unwrap();
404 assert!(
405 result.score > 0.5,
406 "score must meet threshold; was {}",
407 result.score
408 );
409 assert_eq!(
410 result.license.expect("result has a license").name,
411 "license-1"
412 );
413
414 let strategy = ScanStrategy::new(&store)
416 .confidence_threshold(0.8)
417 .shallow_limit(0.0);
418 let result = strategy.scan(&test_data).unwrap();
419 assert!(result.license.is_none(), "result license is None");
420 }
421
422 #[test]
423 fn single_optimize() {
424 let store = create_dummy_store();
425 let test_data = TextData::new(
428 "lorem\nipsum abc def ghi jkl\n1234 5678 1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout",
429 );
430
431 let strategy = ScanStrategy::new(&store)
433 .confidence_threshold(0.5)
434 .optimize(true)
435 .shallow_limit(1.0);
436 let result = strategy.scan(&test_data).unwrap();
437 assert!(result.license.is_none(), "result license is None");
438 assert_eq!(result.containing.len(), 1);
439 let contained = &result.containing[0];
440 assert_eq!(contained.license.name, "license-2");
441 assert!(
442 contained.score > 0.5,
443 "contained score is greater than threshold"
444 );
445 }
446
447 #[test]
448 fn find_multiple_licenses_elimination() {
449 let store = create_dummy_store();
450 let test_data = TextData::new(
453 "lorem\nipsum abc def ghi jkl\n1234 5678 1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout\naaaaa\nbbbbb\nccccc",
454 );
455
456 let strategy = ScanStrategy::new(&store)
458 .mode(ScanMode::Elimination)
459 .confidence_threshold(0.5)
460 .optimize(true)
461 .shallow_limit(1.0);
462 let result = strategy.scan(&test_data).unwrap();
463 assert!(result.license.is_none(), "result license is None");
464 assert_eq!(2, result.containing.len());
465
466 let mut found1 = 0;
468 let mut found2 = 0;
469 for contained in result.containing.iter() {
470 match contained.license.name {
471 "license-1" => {
472 assert!(contained.score > 0.5, "license-1 score meets threshold");
473 found1 += 1;
474 }
475 "license-2" => {
476 assert!(contained.score > 0.5, "license-2 score meets threshold");
477 found2 += 1;
478 }
479 _ => {
480 panic!("somehow got an unknown license name");
481 }
482 }
483 }
484
485 assert!(
486 found1 == 1 && found2 == 1,
487 "found both licenses exactly once"
488 );
489 }
490
491 #[test]
492 fn find_multiple_licenses_topdown() {
493 env_logger::init();
494
495 let store = create_dummy_store();
496 let test_data = TextData::new(
499 "lorem\nipsum abc def ghi jkl\n1234 5678 1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout\naaaaa\nbbbbb\nccccc",
500 );
501
502 let strategy = ScanStrategy::new(&store)
504 .mode(ScanMode::TopDown)
505 .confidence_threshold(0.5)
506 .step_size(1);
507 let result = strategy.scan(&test_data).unwrap();
508 assert!(result.license.is_none(), "result license is None");
509 println!("{:?}", result);
510 assert_eq!(2, result.containing.len());
511
512 let mut found1 = 0;
514 let mut found2 = 0;
515 for contained in result.containing.iter() {
516 match contained.license.name {
517 "license-1" => {
518 assert!(contained.score > 0.5, "license-1 score meets threshold");
519 found1 += 1;
520 }
521 "license-2" => {
522 assert!(contained.score > 0.5, "license-2 score meets threshold");
523 found2 += 1;
524 }
525 _ => {
526 panic!("somehow got an unknown license name");
527 }
528 }
529 }
530
531 assert!(
532 found1 == 1 && found2 == 1,
533 "found both licenses exactly once"
534 );
535 }
536
537 fn create_dummy_store() -> Store {
538 let mut store = Store::new();
539 store.add_license("license-1".into(), "aaaaa\nbbbbb\nccccc".into());
540 store.add_license(
541 "license-2".into(),
542 "1234 5678 1234\n0000\n1010101010\n\n8888 9999".into(),
543 );
544 store
545 }
546}