1use std::borrow::Cow;
5use std::fmt;
6
7use anyhow::Error;
8use log::{info, trace};
9use serde::Serialize;
10
11use super::{
12 license::{LicenseType, TextData},
13 store::{Match, Store},
14};
15
16#[derive(Serialize, Clone)]
18pub struct IdentifiedLicense<'a> {
19 pub name: &'a str,
21 pub kind: LicenseType,
23 pub data: &'a TextData,
25}
26
27impl<'a> fmt::Debug for IdentifiedLicense<'a> {
28 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
29 f.debug_struct("IdentifiedLicense")
30 .field("name", &self.name)
31 .field("kind", &self.kind)
32 .finish()
33 }
34}
35
36#[derive(Serialize, Debug)]
40pub struct ScanResult<'a> {
41 pub score: f32,
43 pub license: Option<IdentifiedLicense<'a>>,
46 pub containing: Vec<ContainedResult<'a>>,
48}
49
50#[derive(Serialize, Debug, Clone)]
52pub struct ContainedResult<'a> {
53 pub score: f32,
55 pub license: IdentifiedLicense<'a>,
57 pub line_range: (usize, usize),
62}
63
64pub struct ScanStrategy<'a> {
88 store: &'a Store,
89 mode: ScanMode,
90 confidence_threshold: f32,
91 shallow_limit: f32,
92 optimize: bool,
93 max_passes: u16,
94 step_size: usize,
95}
96
97pub enum ScanMode {
99 Elimination,
103
104 TopDown,
108}
109
110impl<'a> ScanStrategy<'a> {
111 pub fn new(store: &'a Store) -> ScanStrategy<'a> {
116 Self {
117 store,
118 mode: ScanMode::Elimination,
119 confidence_threshold: 0.9,
120 shallow_limit: 0.99,
121 optimize: false,
122 max_passes: 10,
123 step_size: 5,
124 }
125 }
126
127 pub fn mode(mut self, mode: ScanMode) -> Self {
132 self.mode = mode;
133 self
134 }
135
136 pub fn confidence_threshold(mut self, confidence_threshold: f32) -> Self {
145 self.confidence_threshold = confidence_threshold;
146 self
147 }
148
149 pub fn shallow_limit(mut self, shallow_limit: f32) -> Self {
160 self.shallow_limit = shallow_limit;
161 self
162 }
163
164 pub fn optimize(mut self, optimize: bool) -> Self {
170 self.optimize = optimize;
171 self
172 }
173
174 pub fn max_passes(mut self, max_passes: u16) -> Self {
182 self.max_passes = max_passes;
183 self
184 }
185
186 pub fn step_size(mut self, step_size: usize) -> Self {
191 self.step_size = step_size;
192 self
193 }
194
195 pub fn scan(&self, text: &TextData) -> Result<ScanResult, Error> {
200 match self.mode {
201 ScanMode::Elimination => Ok(self.scan_elimination(text)),
202 ScanMode::TopDown => Ok(self.scan_topdown(text)),
203 }
204 }
205
206 fn scan_elimination(&self, text: &TextData) -> ScanResult {
207 let mut analysis = self.store.analyze(text);
208 let score = analysis.score;
209 let mut license = None;
210 let mut containing = Vec::new();
211 info!("Elimination top-level analysis: {:?}", analysis);
212
213 if analysis.score > self.confidence_threshold {
215 license = Some(IdentifiedLicense {
216 name: analysis.name,
217 kind: analysis.license_type,
218 data: analysis.data,
219 });
220
221 if analysis.score > self.shallow_limit {
223 return ScanResult {
224 score,
225 license,
226 containing,
227 };
228 }
229 }
230
231 if self.optimize {
232 let mut current_text: Cow<'_, TextData> = Cow::Borrowed(text);
235 for _n in 0..self.max_passes {
236 let (optimized, optimized_score) = current_text.optimize_bounds(analysis.data);
237
238 if optimized_score < self.confidence_threshold {
240 break;
241 }
242
243 info!(
245 "Optimized to {} lines ({}, {})",
246 optimized_score,
247 optimized.lines_view().0,
248 optimized.lines_view().1
249 );
250 containing.push(ContainedResult {
251 score: optimized_score,
252 license: IdentifiedLicense {
253 name: analysis.name,
254 kind: analysis.license_type,
255 data: analysis.data,
256 },
257 line_range: optimized.lines_view(),
258 });
259
260 current_text = Cow::Owned(optimized.white_out());
262 analysis = self.store.analyze(¤t_text);
263 }
264 }
265
266 ScanResult {
267 score,
268 license,
269 containing,
270 }
271 }
272
273 fn scan_topdown(&self, text: &TextData) -> ScanResult {
274 let (_, text_end) = text.lines_view();
275 let mut containing = Vec::new();
276
277 let mut current_start = 0usize;
279 while current_start < text_end {
280 let result = self.topdown_find_contained_license(text, current_start);
281
282 let contained = match result {
283 Some(c) => c,
284 None => break,
285 };
286
287 current_start = contained.line_range.1 + 1;
288 containing.push(contained);
289 }
290
291 ScanResult {
292 score: 0.0,
293 license: None,
294 containing,
295 }
296 }
297
298 fn topdown_find_contained_license(
299 &self,
300 text: &TextData,
301 starting_at: usize,
302 ) -> Option<ContainedResult> {
303 let (_, text_end) = text.lines_view();
304 let mut found: (usize, usize, Option<Match<'_>>) = (0, 0, None);
305
306 trace!(
307 "topdown_find_contained_license starting at line {}",
308 starting_at
309 );
310
311 let mut hit_threshold = false;
313
314 'start: for start in (starting_at..text_end).step_by(self.step_size) {
316 for end in (start..=text_end).step_by(self.step_size) {
318 let view = text.with_view(start, end);
319 let analysis = self.store.analyze(&view);
320
321 if !hit_threshold && analysis.score >= self.confidence_threshold {
326 hit_threshold = true;
327 trace!(
328 "hit_threshold at ({}, {}) with score {}",
329 start,
330 end,
331 analysis.score
332 );
333 }
334
335 if hit_threshold {
336 if analysis.score < self.confidence_threshold {
337 trace!(
339 "exiting threshold at ({}, {}) with score {}",
340 start,
341 end,
342 analysis.score
343 );
344 break 'start;
345 } else {
346 found = (start, end, Some(analysis));
348 }
349 }
350 }
351 }
352
353 let matched = match found.2 {
356 Some(m) => m,
357 None => return None,
358 };
359 let check = matched.data;
360 let view = text.with_view(found.0, found.1);
361 let (optimized, optimized_score) = view.optimize_bounds(check);
362
363 trace!(
364 "optimized {} {} at ({:?})",
365 optimized_score,
366 matched.name,
367 optimized.lines_view()
368 );
369
370 if optimized_score < self.confidence_threshold {
371 return None;
372 }
373
374 Some(ContainedResult {
375 score: optimized_score,
376 license: IdentifiedLicense {
377 name: matched.name,
378 kind: matched.license_type,
379 data: matched.data,
380 },
381 line_range: optimized.lines_view(),
382 })
383 }
384}
385
386#[cfg(test)]
387mod tests {
388 use super::*;
389
390 #[test]
391 fn can_construct() {
392 let store = Store::new();
393 ScanStrategy::new(&store);
394 ScanStrategy::new(&store).confidence_threshold(0.5);
395 ScanStrategy::new(&store)
396 .shallow_limit(0.99)
397 .optimize(true)
398 .max_passes(100);
399 }
400
401 #[test]
402 fn shallow_scan() {
403 let store = create_dummy_store();
404 let test_data = TextData::new("lorem ipsum\naaaaa bbbbb\nccccc\nhello");
405
406 let strategy = ScanStrategy::new(&store)
408 .confidence_threshold(0.5)
409 .shallow_limit(0.0);
410 let result = strategy.scan(&test_data).unwrap();
411 assert!(
412 result.score > 0.5,
413 "score must meet threshold; was {}",
414 result.score
415 );
416 assert_eq!(
417 result.license.expect("result has a license").name,
418 "license-1"
419 );
420
421 let strategy = ScanStrategy::new(&store)
423 .confidence_threshold(0.8)
424 .shallow_limit(0.0);
425 let result = strategy.scan(&test_data).unwrap();
426 assert!(result.license.is_none(), "result license is None");
427 }
428
429 #[test]
430 fn single_optimize() {
431 let store = create_dummy_store();
432 let test_data =
435 TextData::new("lorem\nipsum abc def ghi jkl\n1234 5678 1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout");
436
437 let strategy = ScanStrategy::new(&store)
439 .confidence_threshold(0.5)
440 .optimize(true)
441 .shallow_limit(1.0);
442 let result = strategy.scan(&test_data).unwrap();
443 assert!(result.license.is_none(), "result license is None");
444 assert_eq!(result.containing.len(), 1);
445 let contained = &result.containing[0];
446 assert_eq!(contained.license.name, "license-2");
447 assert!(
448 contained.score > 0.5,
449 "contained score is greater than threshold"
450 );
451 }
452
453 #[test]
454 fn find_multiple_licenses_elimination() {
455 let store = create_dummy_store();
456 let test_data =
459 TextData::new("lorem\nipsum abc def ghi jkl\n1234 5678 1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout\naaaaa\nbbbbb\nccccc");
460
461 let strategy = ScanStrategy::new(&store)
463 .mode(ScanMode::Elimination)
464 .confidence_threshold(0.5)
465 .optimize(true)
466 .shallow_limit(1.0);
467 let result = strategy.scan(&test_data).unwrap();
468 assert!(result.license.is_none(), "result license is None");
469 assert_eq!(2, result.containing.len());
470
471 let mut found1 = 0;
473 let mut found2 = 0;
474 for (_, contained) in result.containing.iter().enumerate() {
475 match contained.license.name {
476 "license-1" => {
477 assert!(contained.score > 0.5, "license-1 score meets threshold");
478 found1 += 1;
479 }
480 "license-2" => {
481 assert!(contained.score > 0.5, "license-2 score meets threshold");
482 found2 += 1;
483 }
484 _ => {
485 panic!("somehow got an unknown license name");
486 }
487 }
488 }
489
490 assert!(
491 found1 == 1 && found2 == 1,
492 "found both licenses exactly once"
493 );
494 }
495
496 #[test]
497 fn find_multiple_licenses_topdown() {
498 env_logger::init();
499
500 let store = create_dummy_store();
501 let test_data =
504 TextData::new("lorem\nipsum abc def ghi jkl\n1234 5678 1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout\naaaaa\nbbbbb\nccccc");
505
506 let strategy = ScanStrategy::new(&store)
508 .mode(ScanMode::TopDown)
509 .confidence_threshold(0.5)
510 .step_size(1);
511 let result = strategy.scan(&test_data).unwrap();
512 assert!(result.license.is_none(), "result license is None");
513 println!("{:?}", result);
514 assert_eq!(2, result.containing.len());
515
516 let mut found1 = 0;
518 let mut found2 = 0;
519 for (_, contained) in result.containing.iter().enumerate() {
520 match contained.license.name {
521 "license-1" => {
522 assert!(contained.score > 0.5, "license-1 score meets threshold");
523 found1 += 1;
524 }
525 "license-2" => {
526 assert!(contained.score > 0.5, "license-2 score meets threshold");
527 found2 += 1;
528 }
529 _ => {
530 panic!("somehow got an unknown license name");
531 }
532 }
533 }
534
535 assert!(
536 found1 == 1 && found2 == 1,
537 "found both licenses exactly once"
538 );
539 }
540
541 fn create_dummy_store() -> Store {
542 let mut store = Store::new();
543 store.add_license("license-1".into(), "aaaaa\nbbbbb\nccccc".into());
544 store.add_license(
545 "license-2".into(),
546 "1234 5678 1234\n0000\n1010101010\n\n8888 9999".into(),
547 );
548 store
549 }
550}