1use csv::WriterBuilder;
10use itertools::Itertools;
11use serde::{Deserialize, Serialize};
12use std::collections::{HashMap, HashSet};
13use std::error;
14use std::error::Error;
15use std::fmt;
16use std::fmt::Display;
17use std::str::FromStr;
18
19use strsim::{damerau_levenshtein, jaro_winkler, levenshtein, osa_distance, sorensen_dice};
20
/// Error produced when this module receives a value it cannot work with,
/// for example a string that names no known [`OutputFormat`].
///
/// It carries no payload, so it is cheap to copy and can be compared directly
/// (`assert_eq!(result, Err(ValueError))`).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct ValueError;
24
25impl Display for ValueError {
26 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
28 write!(f, "Received an unexpected value")
29 }
30}
31
32impl error::Error for ValueError {}
34
35type Result<T> = std::result::Result<T, ValueError>;
37
38fn my_damerau(a: &str, b: &str) -> f64 {
41 damerau_levenshtein(a, b) as f64
42}
43
44fn my_leven(a: &str, b: &str) -> f64 {
47 levenshtein(a, b) as f64
48}
49
50fn my_osa(a: &str, b: &str) -> f64 {
53 osa_distance(a, b) as f64
54}
55
56fn my_jw(a: &str, b: &str) -> f64 {
59 jaro_winkler(a, b) as f64
60}
61
62fn my_sd(a: &str, b: &str) -> f64 {
65 sorensen_dice(a, b) as f64
66}
67
68pub fn initialize_distance(a: Algorithm) -> fn(&str, &str) -> f64 {
70 match a {
71 Algorithm::DAMERAU => my_damerau,
72 Algorithm::LEVENSHTEIN => my_leven,
73 Algorithm::JAROWINKLER => my_jw,
74 Algorithm::OSA => my_osa,
75 Algorithm::SORENSENDICE => my_sd,
76 }
77}
78
79#[derive(Clone, Copy, Debug, Deserialize, Serialize)]
81pub enum Algorithm {
82 DAMERAU,
84 LEVENSHTEIN,
86 JAROWINKLER,
88 OSA,
90 SORENSENDICE,
92}
93
94impl Algorithm {
95 pub fn is_edits(&self) -> bool {
97 match self {
98 Algorithm::OSA | Algorithm::DAMERAU | Algorithm::LEVENSHTEIN => true,
99 Algorithm::JAROWINKLER | Algorithm::SORENSENDICE => false,
100 }
101 }
102
103 pub fn options() -> &'static [&'static str] {
106 &[
107 "Levenshtein",
108 "Damerau",
109 "OSA",
110 "JaroWinkler",
111 "SorensenDice",
112 ]
113 }
114}
115
116impl FromStr for Algorithm {
117 type Err = ValueError;
118 fn from_str(s: &str) -> Result<Algorithm> {
121 match s.to_uppercase().chars().next().unwrap_or('L') {
122 'L' => Ok(Algorithm::LEVENSHTEIN),
123 'D' => Ok(Algorithm::DAMERAU),
124 'O' => Ok(Algorithm::OSA),
125 'J' => Ok(Algorithm::JAROWINKLER),
126 'S' => Ok(Algorithm::SORENSENDICE),
127 _ => Err(ValueError),
128 }
129 }
130}
131
132impl ToString for Algorithm {
133 fn to_string(&self) -> String {
135 match self {
136 Algorithm::DAMERAU => String::from("Damerau"),
137 Algorithm::LEVENSHTEIN => String::from("Levenshtein"),
138 Algorithm::OSA => String::from("OSA"),
139 Algorithm::JAROWINKLER => String::from("JaroWinkler"),
140 Algorithm::SORENSENDICE => String::from("SorensenDice"),
141 }
142 }
143}
144
145#[derive(Debug, Deserialize, Serialize, Clone)]
151pub struct SimpleResult {
152 pub record_id: Option<String>,
155 pub algorithm: Algorithm,
157 pub edits: Option<i32>,
161 pub similarity: f64,
165 pub search_term: String,
167 pub matched_term: String,
169}
170
/// Output encodings understood by [`format`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum OutputFormat {
    /// One JSON document per result.
    JSONL,
    /// One header-less CSV row per result.
    CSV,
}
177
178impl FromStr for OutputFormat {
179 type Err = ValueError;
180 fn from_str(s: &str) -> Result<OutputFormat> {
182 match s.to_uppercase().as_str() {
183 "JSONL" => Ok(OutputFormat::JSONL),
184 "CSV" => Ok(OutputFormat::CSV),
185 _ => Err(ValueError),
186 }
187 }
188}
189
190pub fn format(
200 data: Vec<SearchOutput>,
201 format: OutputFormat,
202) -> std::result::Result<Vec<String>, Box<dyn Error>> {
203 match format {
204 OutputFormat::JSONL => {
205 Ok(data
206 .iter()
207 .map(|x| match x {
208 SearchOutput::DrugResult(y) => serde_json::to_string(y)
209 .expect("could not deserialize drug result to string"),
210 SearchOutput::SimpleResult(y) => serde_json::to_string(y)
211 .expect("could not deserialize simple result to string"),
212 })
213 .collect::<Vec<String>>())
214 }
215 OutputFormat::CSV => {
216 let mut wtr = WriterBuilder::new().has_headers(false).from_writer(vec![]);
217 for row in data {
218 wtr.serialize(row)?;
219 }
220 let csv_data = String::from_utf8(wtr.into_inner()?)?;
221 Ok(csv_data
222 .split('\n')
223 .map(|x| x.to_string())
224 .filter(|x| !x.is_empty())
225 .collect::<Vec<String>>())
226 }
227 }
228}
229
230pub struct SimpleSearch {
236 pub algorithm: Algorithm,
238 pub distance: fn(&str, &str) -> f64,
240 pub max_edits: Option<i32>,
246 pub similarity_threshold: Option<f64>,
250 pub targets: Vec<String>,
252}
253
254impl SimpleSearch {
255 pub fn new(
257 algorithm: Algorithm,
258 distance: fn(&str, &str) -> f64,
259 max_edits: Option<i32>,
260 similarity_threshold: Option<f64>,
261 targets: &[String],
262 ) -> SimpleSearch {
263 SimpleSearch {
264 algorithm,
265 distance,
266 max_edits,
267 similarity_threshold,
268 targets: targets.to_vec(),
269 }
270 }
271
272 fn manage_state(
273 &self,
274 state: &mut Option<HashMap<(String, String), f64>>,
275 word1: &str,
276 word2: &str,
277 ) -> f64 {
278 if let Some(state) = state {
279 *state
280 .entry((word1.to_string(), word2.to_string()))
281 .or_insert_with(|| (self.distance)(word1, word2))
282 } else {
283 (self.distance)(word1, word2)
284 }
285 }
286}
287
288#[derive(Debug, Clone, Serialize, Deserialize)]
298pub enum SearchOutput {
299 SimpleResult(SimpleResult),
300 DrugResult(DrugResult),
301}
302
303pub trait Search {
305 fn scan(
307 &self,
308 text: &str,
309 record: Option<String>,
310 state: &mut Option<HashMap<(String, String), f64>>,
311 ) -> Vec<SearchOutput>;
312}
313
314impl Search for SimpleSearch {
315 fn scan(
332 &self,
333 text: &str,
334 record: Option<String>,
335 state: &mut Option<HashMap<(String, String), f64>>,
336 ) -> Vec<SearchOutput> {
337 let clean = text
338 .replace(&['(', ')', ',', '\"', '.', ';', ':', ']', '['][..], " ")
339 .to_uppercase();
340 let words = clean.split_whitespace();
341 let mut results: Vec<SimpleResult> = Vec::new();
342 for word in words {
343 for target in &self.targets {
344 let mut word_pair = vec![word.trim().to_uppercase(), target.trim().to_uppercase()];
345 word_pair.sort();
346 let d = self.manage_state(state, &word_pair[0], &word_pair[1]);
347 let res = SimpleResult {
348 record_id: record.clone(),
349 search_term: target.to_string(),
350 matched_term: word.to_string(),
351 algorithm: self.algorithm,
352 edits: if self.algorithm.is_edits() {
353 Some(d as i32)
354 } else {
355 None
356 },
357 similarity: if self.algorithm.is_edits() {
358 1.0 - (d / (target.chars().count().max(word.chars().count()) as f64))
359 } else {
360 d
361 },
362 };
363 results.push(res);
364 }
365 }
366 if let Some(me) = self.max_edits {
367 results
369 .into_iter()
370 .filter(|x| x.edits.expect("result did not have edits") <= me)
371 .map(SearchOutput::SimpleResult)
372 .collect::<Vec<SearchOutput>>()
373 } else if let Some(thresh) = self.similarity_threshold {
374 results
376 .into_iter()
377 .filter(|x| x.similarity >= thresh)
378 .map(SearchOutput::SimpleResult)
379 .collect::<Vec<SearchOutput>>()
380 } else {
381 results
383 .into_iter()
384 .map(SearchOutput::SimpleResult)
385 .collect()
386 }
387 }
388}
389
390#[derive(Debug, Clone, Serialize, Deserialize)]
394pub struct Drug {
395 pub name: String,
397 pub rx_id: String,
399 pub class_id: String,
401}
402
403pub struct DrugSearch {
409 pub algorithm: Algorithm,
411 pub distance: fn(&str, &str) -> f64,
413 pub max_edits: Option<i32>,
419 pub similarity_threshold: Option<f64>,
421 pub targets: Vec<Drug>,
423}
424
425impl DrugSearch {
426 pub fn new(
428 algorithm: Algorithm,
429 distance: fn(&str, &str) -> f64,
430 max_edits: Option<i32>,
431 similarity_threshold: Option<f64>,
432 targets: &[Drug],
433 ) -> DrugSearch {
434 DrugSearch {
435 algorithm,
436 distance,
437 max_edits,
438 similarity_threshold,
439 targets: targets.to_vec(),
440 }
441 }
442 fn manage_state(
443 &self,
444 state: &mut Option<HashMap<(String, String), f64>>,
445 word1: &str,
446 word2: &str,
447 ) -> f64 {
448 if let Some(state) = state {
449 *state
450 .entry((word1.to_string(), word2.to_string()))
451 .or_insert_with(|| (self.distance)(word1, word2))
452 } else {
453 (self.distance)(word1, word2)
454 }
455 }
456}
457
458#[derive(Debug, Deserialize, Serialize, Clone)]
464pub struct DrugResult {
465 pub record_id: Option<String>,
466 pub algorithm: Algorithm,
467 pub edits: Option<i32>,
468 pub similarity: f64,
469 pub matched_term: String,
470 pub drug: Drug,
471}
472
473impl Search for DrugSearch {
474 fn scan(
491 &self,
492 text: &str,
493 record: Option<String>,
494 state: &mut Option<HashMap<(String, String), f64>>,
495 ) -> Vec<SearchOutput> {
496 let clean = text
497 .replace(&['(', ')', ',', '\"', '.', ';', ':'][..], " ")
498 .to_uppercase();
499 let words = clean.split_whitespace();
500 let mut results: Vec<DrugResult> = Vec::new();
501 for word in words {
502 for target in &self.targets {
503 for t in target.name.split('/') {
504 let mut word_pair = vec![word.trim().to_uppercase(), t.trim().to_uppercase()];
505 word_pair.sort();
506 let d = self.manage_state(state, &word_pair[0], &word_pair[1]);
507 let res = DrugResult {
508 record_id: record.clone(),
509 matched_term: word.to_string(),
510 algorithm: self.algorithm,
511 edits: if self.algorithm.is_edits() {
512 Some(d as i32)
513 } else {
514 None
515 },
516 similarity: if self.algorithm.is_edits() {
517 1.0 - (d / (t.chars().count().max(word.chars().count()) as f64))
518 } else {
519 d
520 },
521 drug: target.to_owned(),
522 };
523 results.push(res);
524 }
525 }
526 }
527 if let Some(me) = self.max_edits {
528 results
530 .into_iter()
531 .filter(|x| x.edits.expect("result did not have edits") <= me)
532 .map(SearchOutput::DrugResult)
533 .collect::<Vec<SearchOutput>>()
534 } else if let Some(thresh) = self.similarity_threshold {
535 results
537 .into_iter()
538 .filter(|x| x.similarity >= thresh)
539 .map(SearchOutput::DrugResult)
540 .collect::<Vec<SearchOutput>>()
541 } else {
542 results.into_iter().map(SearchOutput::DrugResult).collect()
544 }
545 }
546}
547
548pub fn initialize_searcher(
552 algorithm: Algorithm,
553 distance: fn(&str, &str) -> f64,
554 max_edits: Option<i32>,
555 similarity_threshold: Option<f64>,
556 search_words: Option<&[String]>,
557 drug_list: Option<Vec<Drug>>,
558) -> Box<dyn Search> {
559 if let Some(drugs) = drug_list {
560 Box::new(DrugSearch::new(
561 algorithm,
562 distance,
563 max_edits,
564 similarity_threshold,
565 drugs.as_ref(),
566 ))
567 } else {
568 Box::new(SimpleSearch::new(
569 algorithm,
570 distance,
571 max_edits,
572 similarity_threshold,
573 search_words.unwrap_or_default(),
574 ))
575 }
576}
577
578pub fn analyze(
580 data: Vec<SearchOutput>,
581 total_targets: i32,
582 total_records: i32,
583 is_drug: bool,
584 has_id: bool,
585) -> Result<Vec<String>> {
586 let mut results: Vec<String> = Vec::new();
587 if data.is_empty() {
588 results.push("Unable to analyze, no matches found.".to_string());
589 return Ok(results);
590 }
591 if is_drug {
592 if has_id {
593 let mut found_targets: Vec<String> = Vec::new();
594 let mut found_ids: Vec<String> = Vec::new();
595 for r in data {
596 if let SearchOutput::DrugResult(drug) = r {
597 found_targets.push(drug.drug.name.clone());
598 found_ids.push(
599 drug.record_id
600 .as_ref()
601 .expect("could not reference record id")
602 .clone(),
603 );
604 }
605 }
606 let unique_records = found_ids.clone().into_iter().collect::<HashSet<_>>();
607 results.push(format!(
608 "Found drugs in {} of {} records (~{:.2}%).",
609 unique_records.len(),
610 total_records,
611 100.0 * unique_records.len() as f64 / total_targets as f64
612 ));
613 let counts = found_ids.into_iter().counts();
614 let key_with_max_value = counts
615 .iter()
616 .max_by_key(|entry| entry.1)
617 .expect("could not find max");
618 results.push(format!(
619 "Most common record: {} (detected {} drugs)",
620 key_with_max_value.0, key_with_max_value.1
621 ));
622 let unique_targets = found_targets
623 .clone()
624 .into_iter()
625 .unique()
626 .collect::<HashSet<_>>();
627 results.push(format!(
628 "Found {} of {} drugs (~{:.2}%).",
629 unique_targets.len(),
630 total_targets,
631 100.0 * unique_targets.len() as f64 / total_targets as f64
632 ));
633 let counts = found_targets.into_iter().counts();
634 let key_with_max_value = counts
635 .iter()
636 .max_by_key(|entry| entry.1)
637 .expect("could not find max");
638 results.push(format!(
639 "The most common drug is {} with {} detections.",
640 key_with_max_value.0, key_with_max_value.1
641 ));
642 } else {
643 let mut found_targets: Vec<String> = Vec::new();
644 results.push("No record ID flag provided.".to_string());
645 for r in data {
646 if let SearchOutput::DrugResult(drug) = r {
647 found_targets.push(drug.drug.name.clone());
648 }
649 }
650 let unique_targets = found_targets.into_iter().unique().collect::<HashSet<_>>();
651 results.push(format!(
652 "Found {} of {} drugs (~{:.2}%).",
653 unique_targets.len(),
654 total_targets,
655 100.0 * unique_targets.len() as f64 / total_targets as f64
656 ));
657 let counts = unique_targets.into_iter().counts();
658 let key_with_max_value = counts
659 .iter()
660 .max_by_key(|entry| entry.1)
661 .expect("could not find max");
662 results.push(format!(
663 "The most common drug is {} with {} detections.",
664 key_with_max_value.0, key_with_max_value.1
665 ));
666 }
667 } else if has_id {
668 let mut found_targets: Vec<String> = Vec::new();
669 let mut found_ids: Vec<String> = Vec::new();
670 for r in data {
671 if let SearchOutput::SimpleResult(simple) = r {
672 found_targets.push(simple.search_term.clone());
673 found_ids.push(
674 simple
675 .record_id
676 .as_ref()
677 .expect("could not reference record id")
678 .clone(),
679 );
680 }
681 }
682 let unique_records = found_ids.clone().into_iter().collect::<HashSet<_>>();
683 results.push(format!(
684 "Found targets in {} of {} records (~{:.2}%).",
685 unique_records.len(),
686 total_records,
687 100.0 * unique_records.len() as f64 / total_records as f64,
688 ));
689 let counts = found_ids.clone().into_iter().counts();
690 let key_with_max_value = counts
691 .iter()
692 .max_by_key(|(_, v)| *v)
693 .expect("could not find max");
694 results.push(format!(
695 "Most common record: {} (detected {} targets)",
696 key_with_max_value.0, key_with_max_value.1
697 ));
698 let unique_targets = found_targets
699 .clone()
700 .into_iter()
701 .unique()
702 .collect::<HashSet<_>>();
703 results.push(format!(
704 "Found {} of {} targets (~{:.2}%).",
705 unique_targets.len(),
706 total_targets,
707 100.0 * unique_targets.len() as f64 / total_targets as f64
708 ));
709 let counts = found_targets.into_iter().counts();
710 let key_with_max_value = counts
711 .iter()
712 .max_by_key(|(_, v)| *v)
713 .expect("could not find max");
714 results.push(format!(
715 "The most common target is {} with {} detections.",
716 key_with_max_value.0, key_with_max_value.1
717 ));
718 } else {
719 let mut found_targets: Vec<String> = Vec::new();
720 results.push("No record ID flag provided.".to_string());
721 for r in data {
722 if let SearchOutput::SimpleResult(simple) = r {
723 found_targets.push(simple.search_term.clone());
724 }
725 }
726 let unique_targets = found_targets
727 .clone()
728 .into_iter()
729 .unique()
730 .collect::<HashSet<_>>();
731 results.push(format!(
732 "Found {} of {} targets (~{:.2}%).",
733 unique_targets.len(),
734 total_targets,
735 100.0 * unique_targets.len() as f64 / total_targets as f64
736 ));
737 let counts = found_targets.into_iter().counts();
738 let key_with_max_value = counts
739 .iter()
740 .max_by_key(|(_, v)| *v)
741 .expect("could not find max");
742 results.push(format!(
743 "The most common target is {} with {} detections.",
744 key_with_max_value.0, key_with_max_value.1
745 ));
746 }
747 Ok(results)
748}
749
#[cfg(test)]
mod tests {
    use super::*;

    /// Targets wrapped in parentheses and followed by commas must still be
    /// found: with at most one edit allowed, only the two exact terms match.
    #[test]
    fn test_parens() {
        let targets = ["Fentanyl".to_uppercase(), "cocaine".to_uppercase()];
        let searcher = SimpleSearch::new(
            Algorithm::LEVENSHTEIN,
            my_leven,
            Some(1),
            None,
            &targets,
        );
        let hits = searcher.scan(
            "Mixed Drug Toxicity(Fentanyl, Cocaine, Xylazine and Gabapentin)",
            None,
            &mut None,
        );
        println!("{:?}", hits);
        assert_eq!(hits.len(), 2);
    }
}
769}