use crate::utils::{conditions, tokenize_and_clean, PREPOSITIONS};
use crate::Candidate;
use lazy_static::lazy_static;
use crate::abbreviation_definitions::{AbbreviationDefinition, AbbreviationOptions};
use rayon::prelude::*;
use regex::Regex;
use rustc_hash::{FxHashMap, FxHashSet};
use std::borrow::Cow;
use std::cmp::min;
use std::sync::Arc;
// Regular expressions shared by the extraction functions below; `lazy_static!` builds each one
// the first time it is used. The `unwrap()` calls can only fail if a pattern literal is invalid.
lazy_static! {
// Word separator: one or more whitespace characters and/or hyphens.
static ref WORD_SPLIT_RE: Regex = Regex::new(r"[\s\-]+").unwrap();
// Matches a quote character (`'`, `"`, or Unicode initial/final quote punctuation) that sits
// directly after `(` or directly before `)`, `;` or `:`. The bracket/punctuation is captured in
// group 1 or 2, so replacing a match with "$1$2" drops only the quote.
static ref CLEAN_SENTENCE_RE: Regex = Regex::new(r#"(\()['"\p{Pi}]|['"\p{Pf}]([);:])"#).unwrap();
}
/// Extracts abbreviation/definition pairs from a single text.
///
/// The text is first cut into sentences: with `options.tokenize` set it goes through
/// `tokenize_and_clean`, otherwise it is simply split on `'\n'`. Every sentence is then handed
/// to `process_sentence`; inputs with more than 50 sentences are processed with rayon.
///
/// Post-processing depends on `options`:
/// * `most_common_definition` keeps one definition per abbreviation (the most frequent one);
/// * otherwise `first_definition` keeps the first definition seen per abbreviation;
/// * otherwise every pair found is returned.
///
/// An empty `text` yields an empty vector.
pub fn extract_abbreviation_definition_pairs<'a>(
    text: &'a str,
    options: AbbreviationOptions,
) -> Vec<AbbreviationDefinition> {
    if text.is_empty() {
        return Vec::new();
    }

    let sentences: Vec<Cow<'a, str>> = if options.tokenize {
        tokenize_and_clean(text).collect()
    } else {
        text.split('\n').map(Cow::Borrowed).collect()
    };

    // Only pay rayon's scheduling overhead when there is enough work to spread out.
    let found: Vec<AbbreviationDefinition> = if sentences.len() > 50 {
        sentences
            .par_iter()
            .flat_map(|sentence| process_sentence(sentence))
            .collect()
    } else {
        let mut collected = Vec::new();
        for sentence in &sentences {
            collected.extend(process_sentence(sentence));
        }
        collected
    };

    if options.most_common_definition {
        return select_most_common_definitions(found);
    }
    if options.first_definition {
        return select_first_definitions(found);
    }
    found
}
/// Finds every abbreviation/definition pair in one sentence.
///
/// The sentence is first cleaned with `CLEAN_SENTENCE_RE` (quotes hugging a bracket are
/// removed) and trimmed. For each candidate returned by `best_candidates`, its potential
/// synonyms are tried in order and the first one for which both `get_definition` and
/// `select_definition` succeed produces a result; the remaining synonyms are skipped.
///
/// The reported abbreviation is always the full candidate text, while `start`/`end` are the
/// offsets of the selected definition.
fn process_sentence(sentence: &str) -> Vec<AbbreviationDefinition> {
    let cleaned = CLEAN_SENTENCE_RE.replace_all(sentence, "$1$2");
    let cleaned = cleaned.trim();

    let mut pairs = Vec::new();
    for candidate in best_candidates(cleaned) {
        let synonyms = get_potential_synonyms(&candidate);
        // The result is built inside the closure because the selected definition borrows
        // from the intermediate one and cannot outlive it.
        let first_hit = synonyms.iter().find_map(|synonym| {
            let definition = get_definition(synonym, cleaned)?;
            let chosen = select_definition(&definition, synonym.text())?;
            Some(AbbreviationDefinition {
                abbreviation: candidate.text().to_string(),
                definition: chosen.text().to_string(),
                start: chosen.start(),
                end: chosen.stop(),
            })
        });
        pairs.extend(first_hit);
    }
    pairs
}
pub fn extract_abbreviation_definition_pairs_parallel<T>(
texts: Vec<T>,
options: AbbreviationOptions,
) -> Vec<AbbreviationDefinition>
where
T: AsRef<str> + Sync,
{
let texts: Vec<Arc<str>> = texts
.into_par_iter()
.map(|t| Arc::from(t.as_ref()))
.collect();
let all_results: Vec<Vec<AbbreviationDefinition>> = texts
.par_iter()
.map(|text| {
let sentences: Vec<Cow<str>> = if options.tokenize {
tokenize_and_clean(text).collect()
} else {
text.split('\n').map(Cow::Borrowed).collect()
};
sentences
.into_par_iter()
.flat_map(|sentence| process_sentence(&sentence))
.collect()
})
.collect();
let mut merged_results: Vec<AbbreviationDefinition> =
all_results.into_iter().flatten().collect();
if options.most_common_definition {
merged_results = select_most_common_definitions(merged_results);
} else if options.first_definition {
merged_results = select_first_definitions(merged_results);
}
merged_results
}
/// Keeps, for every abbreviation, only its most frequently seen definition.
///
/// Frequencies are counted per (abbreviation, definition) pair. When several definitions of
/// the same abbreviation share the highest count, the one that appears first in `abbrevs`
/// wins. The returned entry is the first occurrence of the winning pair, and the output
/// follows the order in which those winning entries appear in the input.
fn select_most_common_definitions(
    abbrevs: Vec<AbbreviationDefinition>,
) -> Vec<AbbreviationDefinition> {
    // abbreviation -> definition -> number of occurrences
    let mut definition_counts: FxHashMap<String, FxHashMap<String, usize>> = FxHashMap::default();
    for abbrev in &abbrevs {
        *definition_counts
            .entry(abbrev.abbreviation.clone())
            .or_default()
            .entry(abbrev.definition.clone())
            .or_default() += 1;
    }

    // Highest count per abbreviation, computed once. An entry is removed as soon as its
    // winner has been emitted, which also marks the abbreviation as done.
    let mut top_counts: FxHashMap<&str, usize> = definition_counts
        .iter()
        .filter_map(|(abbreviation, counts)| {
            counts
                .values()
                .copied()
                .max()
                .map(|top| (abbreviation.as_str(), top))
        })
        .collect();

    abbrevs
        .into_iter()
        .filter(|abbrev| {
            let key = abbrev.abbreviation.as_str();
            let count = definition_counts
                .get(key)
                .and_then(|counts| counts.get(abbrev.definition.as_str()))
                .copied();
            let is_winner = count.is_some() && count == top_counts.get(key).copied();
            if is_winner {
                top_counts.remove(key);
            }
            is_winner
        })
        .collect()
}
/// Keeps only the first entry seen for each abbreviation, preserving input order.
fn select_first_definitions(abbrevs: Vec<AbbreviationDefinition>) -> Vec<AbbreviationDefinition> {
    let mut seen = FxHashSet::default();
    let mut firsts = Vec::new();
    for abbrev in abbrevs {
        // `insert` returns false when the abbreviation was already recorded.
        if seen.insert(abbrev.abbreviation.clone()) {
            firsts.push(abbrev);
        }
    }
    firsts
}
/// Lists the abbreviation texts worth trying for a parenthesized candidate.
///
/// * The candidate itself is included when it is at most 10 characters long.
/// * When the candidate consists of exactly two words that are both at least 3 characters
///   long, and one word is mostly uppercase (uppercase ratio >= 0.5) while the other is
///   essentially lowercase (ratio < 0.1), the uppercase-heavy word is added as a second
///   option. It keeps the `start`/`stop` offsets of the whole candidate.
///
/// All lengths and ratios are measured in characters, not bytes, so non-ASCII words are
/// judged by the same thresholds as ASCII ones.
fn get_potential_synonyms<'a>(candidate: &'a Candidate<'a>) -> Vec<Candidate<'a>> {
    /// Returns `(character count, uppercase character count)` of `word` in one pass.
    fn char_stats(word: &str) -> (usize, usize) {
        word.chars().fold((0, 0), |(total, upper), c| {
            (total + 1, upper + usize::from(c.is_uppercase()))
        })
    }

    let mut candidates = Vec::with_capacity(2);
    let text = candidate.text();
    if text.chars().count() <= 10 {
        candidates.push(candidate.clone());
    }

    // Only two-word candidates can contribute a single-word synonym.
    let mut words = text.split_whitespace();
    let (first_word, second_word) = match (words.next(), words.next(), words.next()) {
        (Some(first), Some(second), None) => (first, second),
        _ => return candidates,
    };

    let (first_len, first_upper) = char_stats(first_word);
    let (second_len, second_upper) = char_stats(second_word);
    if first_len < 3 || second_len < 3 {
        return candidates;
    }

    let first_word_ratio = first_upper as f32 / first_len as f32;
    let second_word_ratio = second_upper as f32 / second_len as f32;
    let acronym_word = if first_word_ratio >= 0.5 && second_word_ratio < 0.1 {
        Some(first_word)
    } else if second_word_ratio >= 0.5 && first_word_ratio < 0.1 {
        Some(second_word)
    } else {
        None
    };

    if let Some(word) = acronym_word {
        candidates.push(Candidate::new(
            Cow::Owned(word.to_string()),
            candidate.start(),
            candidate.stop(),
        ));
    }
    candidates
}
/// Collects the parenthesized spans of `sentence` that qualify as abbreviation candidates.
///
/// A span opens at a `(` that is directly preceded by a space. Scanning forward, every further
/// `(` increases the nesting depth and every `)`, `;` or `:` decreases it; the span ends where
/// the depth returns to zero. If the sentence ends first, that opening is abandoned and the
/// search resumes just after it.
///
/// The text between the delimiters is trimmed and kept only when `conditions` accepts it.
/// `start`/`stop` of each candidate are byte offsets of the trimmed text within `sentence`.
pub fn best_candidates(sentence: &str) -> Vec<Candidate> {
    let bytes = sentence.as_bytes();
    if !bytes.contains(&b'(') {
        return Vec::new();
    }

    let mut found: Vec<Candidate> = Vec::new();
    let mut cursor = 0;
    while let Some(offset) = bytes[cursor..].windows(2).position(|pair| pair == b" (") {
        // `offset` points at the space; the parenthesis is the byte after it.
        let paren = cursor + offset + 1;

        let mut depth = 1;
        let mut scan = paren + 1;
        while depth != 0 {
            match bytes.get(scan) {
                Some(b'(') => depth += 1,
                Some(b')') | Some(b';') | Some(b':') => depth -= 1,
                Some(_) => {}
                // Ran off the end of the sentence with the span still open.
                None => break,
            }
            scan += 1;
        }
        if depth != 0 {
            cursor = paren + 1;
            continue;
        }
        cursor = scan;

        // `scan` is one past the closing delimiter.
        let inner_start = paren + 1;
        let inner_stop = scan - 1;
        let raw = safe_slice(sentence, inner_start, inner_stop);
        let start = inner_start + raw.len() - raw.trim_start().len();
        let stop = inner_stop - raw.len() + raw.trim_end().len();
        let trimmed = safe_slice(sentence, start, stop);
        if conditions(&trimmed) {
            found.push(Candidate::new(trimmed.to_string(), start, stop));
        }
    }
    found
}
/// Locates the stretch of text before `candidate` that may hold its definition.
///
/// Only the part of `sentence` that ends two bytes before the candidate is considered. It is
/// lower-cased and split with `WORD_SPLIT_RE`, and the first character of each non-empty word
/// is compared with the lower-cased first character of the candidate (the "key"). The search
/// walks backwards word by word until the selected words contain at least as many key-initial
/// words as the candidate contains key characters. A word that `PREPOSITIONS` matches at the
/// chosen start pushes the start one word further back.
///
/// Returns `None` when
/// * the prefix cannot be sliced (offset out of range or not on a character boundary),
/// * the prefix has fewer key-initial words than the candidate has key characters,
/// * the backwards walk runs out of words, or
/// * the selected text itself contains a parenthesized candidate.
///
/// On success the result's `start`/`stop` are byte offsets into `sentence`. If the selected
/// text begins with `-` or `)`, the start is moved back to the beginning of that
/// space-delimited word.
pub fn get_definition<'a>(candidate: &Candidate<'a>, sentence: &'a str) -> Option<Candidate<'a>> {
    // `get` instead of indexing: a bad offset yields `None` rather than a panic.
    let prefix = sentence.get(..candidate.start().saturating_sub(2))?;
    let lowercase_sentence = prefix.to_lowercase();
    let tokens: Vec<&str> = WORD_SPLIT_RE.split(&lowercase_sentence).collect();
    let key = candidate
        .text()
        .chars()
        .next()
        .and_then(|c| c.to_lowercase().next())
        .unwrap_or('\0');
    // NOTE(review): empty tokens are dropped here, so indices into `first_chars` and `tokens`
    // can diverge when the prefix starts with a separator; confirm this is acceptable.
    let first_chars: Vec<char> = tokens.iter().filter_map(|t| t.chars().next()).collect();
    let definition_freq = first_chars.iter().filter(|&&c| c == key).count();
    let candidate_freq = candidate
        .text()
        .to_lowercase()
        .chars()
        .filter(|&c| c == key)
        .count();
    if candidate_freq > definition_freq {
        return None;
    }

    let mut count: isize = 0;
    // Negative offset from the end of `first_chars`; grows by one word per iteration.
    let mut lookback: i32 = 0;
    // Signed arithmetic so that an empty word list gives -1 instead of underflowing.
    let mut start_index: isize = first_chars.len() as isize - 1;
    while count < candidate_freq as isize {
        if lookback.abs() > first_chars.len() as i32 || start_index < 0 {
            return None;
        }
        lookback -= 1;
        let slice_start = first_chars.len().saturating_add_signed(lookback as isize);
        if let Some(position) = first_chars[slice_start..].iter().position(|&c| c == key) {
            start_index = (slice_start + position) as isize;
            // Byte offset estimate: assumes words are separated by exactly one character.
            let sniffer_start = tokens[..start_index as usize].join(" ").len();
            let preposition = match sentence.get(sniffer_start..) {
                Some(rest) => rest.chars().take(4).collect::<String>().trim().to_string(),
                None => String::new(),
            };
            if PREPOSITIONS.is_match(&preposition) {
                lookback -= 1;
                if start_index == 0 {
                    return None;
                }
                start_index -= 1;
            }
        }
        count = first_chars[start_index as usize..]
            .iter()
            .filter(|&&c| c == key)
            .count() as isize;
    }
    if start_index < 0 {
        return None;
    }

    let start = tokens[..start_index as usize].join(" ").len();
    let stop = candidate.start() - 1;
    let untrimmed = safe_slice(sentence, start, stop);
    let mut start = start + untrimmed.len() - untrimmed.trim_start().len();
    let stop = stop - untrimmed.len() + untrimmed.trim_end().len();

    // A definition must not itself contain a parenthesized candidate.
    if !best_candidates(safe_slice(sentence, start, stop)).is_empty() {
        return None;
    }

    // `start` is a byte offset, so look at the character that begins there rather than at the
    // `start`-th character of the sentence.
    let first_char = sentence.get(start..).and_then(|rest| rest.chars().next());
    if start > 0 && matches!(first_char, Some('-') | Some(')')) {
        // Walk back to the beginning of the space-delimited word. A space is a single ASCII
        // byte, so scanning bytes always stops on a character boundary.
        let bytes = sentence.as_bytes();
        let mut word_start = start - 1;
        while word_start > 0 && bytes[word_start - 1] != b' ' {
            word_start -= 1;
        }
        start = word_start;
    }

    let candidate_text = safe_slice(sentence, start, stop);
    Some(Candidate::new(candidate_text, start, stop))
}
/// Narrows a definition candidate down to the part that actually spells out `abbrev`.
///
/// The abbreviation is matched against the definition from right to left, ignoring ASCII
/// case: every alphanumeric abbreviation character must be found in order, and the first
/// abbreviation character must match at the start of a word (start of text, or preceded by a
/// non-alphanumeric character other than `(`). The match position is then adjusted by
/// `walk_backwards`, and the definition is cut to begin there.
///
/// Returns `None` when
/// * the definition is shorter (in bytes) than the abbreviation,
/// * the abbreviation occurs as a whole word inside the definition,
/// * the abbreviation's characters cannot all be matched,
/// * the result has more words than `min(len + 5, len * 2)` for `len = abbrev.len()`,
/// * the result has unbalanced parentheses, or
/// * `PREPOSITIONS` matches the result.
///
/// If `PREPOSITIONS` matches the narrowed text, the full definition is tried instead. The
/// returned candidate keeps the `start`/`stop` offsets of `definition`.
pub fn select_definition<'a>(definition: &'a Candidate<'a>, abbrev: &str) -> Option<Candidate<'a>> {
    let text = definition.text();
    if text.len() < abbrev.len() || text.split_whitespace().any(|word| word == abbrev) {
        return None;
    }

    let abbrev_chars: Vec<char> = abbrev.to_ascii_lowercase().chars().collect();
    let def_chars: Vec<char> = text.chars().collect();

    // Signed cursors: -1 means "ran off the front". Computing `len as isize - 1` (rather than
    // `(len - 1) as isize`) keeps empty inputs from underflowing.
    let mut s_index: isize = abbrev_chars.len() as isize - 1;
    let mut l_index: isize = def_chars.len() as isize - 1;
    while l_index >= 0 && s_index >= 0 {
        let long_char = def_chars[l_index as usize].to_ascii_lowercase();
        let short_char = abbrev_chars[s_index as usize];
        if !short_char.is_alphanumeric() {
            // Punctuation inside the abbreviation (e.g. a hyphen) is not matched.
            s_index -= 1;
        } else if s_index == 0 {
            // The first abbreviation character must start a word in the definition.
            let at_word_start = l_index == 0 || {
                let previous = def_chars[l_index as usize - 1];
                previous != '(' && !previous.is_alphanumeric()
            };
            if short_char == long_char && at_word_start {
                break;
            }
            l_index -= 1;
        } else {
            if short_char == long_char {
                s_index -= 1;
            }
            l_index -= 1;
        }
    }
    if l_index < 0 || s_index < 0 {
        return None;
    }

    let l_index = walk_backwards(&def_chars, l_index);
    // `l_index` counts characters; convert it to a byte offset before slicing so that
    // multi-byte characters earlier in the text cannot cause a wrong cut or a panic.
    let byte_start = text
        .char_indices()
        .nth(l_index as usize)
        .map(|(offset, _)| offset)?;
    let new_candidate = Candidate::new(
        &text[byte_start..],
        definition.start(),
        definition.stop(),
    );
    let candidate = if !PREPOSITIONS.is_match(new_candidate.text()) {
        new_candidate
    } else {
        definition.clone()
    };

    // NOTE(review): `abbrev.len()` is a byte length, so non-ASCII abbreviations get a looser
    // word limit than ASCII ones of the same visible length; confirm whether that is intended.
    let tokens = candidate.text().split_whitespace().count();
    let length = abbrev.len();
    if tokens > min(length + 5, length * 2) {
        return None;
    }

    let opening = candidate.text().chars().filter(|&c| c == '(').count();
    let closing = candidate.text().chars().filter(|&c| c == ')').count();
    if opening != closing {
        return None;
    }

    if PREPOSITIONS.is_match(candidate.text()) {
        return None;
    }
    Some(candidate)
}
/// Moves a match position back to the start of its word in two special cases.
///
/// `start` is a character index into `def_chars`.
///
/// * If the character just before `start` is `-` or `/`, the position is moved back to the
///   beginning of the space-delimited word (so the left part of a compound is kept).
/// * If the character at the resulting position is numeric and a `,` sits two characters
///   before it, the position is moved back to the beginning of the preceding
///   space-delimited word.
///
/// Positions outside `1..def_chars.len()` (including 0 and negative values) have nothing to
/// walk back over and are returned unchanged.
fn walk_backwards(def_chars: &[char], start: isize) -> isize {
    if start <= 0 || start as usize >= def_chars.len() {
        return start;
    }

    // Steps left until the previous character is a space or the start of the text is reached.
    let rewind_to_word_start = |mut position: usize| {
        while position > 0 && def_chars[position - 1] != ' ' {
            position -= 1;
        }
        position
    };

    let mut index = start as usize;
    if matches!(def_chars[index - 1], '-' | '/') {
        index = rewind_to_word_start(index);
    }
    if index >= 2 && def_chars[index].is_numeric() && def_chars[index - 2] == ',' {
        // Skip the space before the number, then rewind over the word ending in the comma.
        index = rewind_to_word_start(index - 1);
    }
    index as isize
}
/// Returns `s[start..end]`, or the empty string when that range is invalid.
///
/// A range is invalid when it is out of bounds, when `start > end`, or when either end does
/// not fall on a UTF-8 character boundary.
fn safe_slice(s: &str, start: usize, end: usize) -> &str {
    match s.get(start..end) {
        Some(part) => part,
        None => "",
    }
}
// Unit and end-to-end tests for candidate detection, definition lookup/selection and the public
// extraction entry points.
#[cfg(test)]
mod tests {
use super::*;
// A single parenthesized acronym is found together with its byte offsets.
#[test]
fn test_best_candidates() {
let sentence = "The World Health Organization (WHO) is a specialized agency.";
let candidates = best_candidates(sentence);
assert_eq!(candidates.len(), 1);
assert_eq!(candidates[0], Candidate::new("WHO".to_string(), 31, 34));
let sentence = "The National Aeronautics and Space Administration (NASA) explores space.";
let candidates = best_candidates(sentence);
assert_eq!(candidates.len(), 1);
assert_eq!(candidates[0], Candidate::new("NASA".to_string(), 51, 55));
}
// Several parenthesized acronyms in one sentence are returned in order of appearance.
#[test]
fn test_multiple_candidates() {
let sentence = "The United Nations (UN) and World Health Organization (WHO) work together.";
let candidates = best_candidates(sentence);
assert_eq!(candidates.len(), 2);
assert_eq!(candidates[0].text(), "UN");
assert_eq!(candidates[1].text(), "WHO");
}
// A sentence without parentheses yields no candidates.
#[test]
fn test_no_candidates() {
let sentence = "This sentence has no abbreviations.";
let candidates = best_candidates(sentence);
assert_eq!(candidates.len(), 0);
}
// Parenthesized text that is a single letter, a long word or digits only is rejected.
#[test]
fn test_invalid_candidates() {
let sentence = "Invalid candidates: (A) (toolong) (123)";
let candidates = best_candidates(sentence);
assert_eq!(candidates.len(), 0);
}
// The definition span is the run of words before the candidate, with byte offsets.
#[test]
fn test_get_definition_simple() {
let sentence = "The World Health Organization (WHO) is a specialized agency.";
let candidate = Candidate::new("WHO".to_string(), 31, 34);
let definition = get_definition(&candidate, sentence);
assert!(definition.is_some());
let def = definition.unwrap();
assert_eq!(
def,
Candidate::new("World Health Organization".to_string(), 4, 29)
);
let sentence = "The National Aeronautics and Space Administration (NASA) explores space.";
let candidate = Candidate::new("NASA".to_string(), 51, 55);
let definition = get_definition(&candidate, sentence);
assert!(definition.is_some());
let def = definition.unwrap();
assert_eq!(
def,
Candidate::new(
"National Aeronautics and Space Administration".to_string(),
4,
49
)
);
}
// Leading words that are not part of the long form ("We use the") are left out.
#[test]
fn test_get_definition_with_preposition() {
let sentence = "We use the Rust Programming Language (RPL) for systems programming.";
let candidate = Candidate::new("RPL".to_string(), 38, 41);
let definition = get_definition(&candidate, sentence);
assert!(definition.is_some());
let def = definition.unwrap();
assert_eq!(
def,
Candidate::new("Rust Programming Language".to_string(), 11, 36)
);
}
// No preceding word starts with the candidate's first letter, so nothing is returned.
#[test]
fn test_get_definition_no_match() {
let sentence = "This sentence contains (XYZ) but no matching definition.";
let candidate = Candidate::new("XYZ".to_string(), 24, 27);
let definition = get_definition(&candidate, sentence);
assert!(definition.is_none());
}
// An upper-case acronym matches a lower-case long form.
#[test]
fn test_get_definition_case_insensitive() {
let sentence = "The central processing unit (CPU) is the brain of a computer.";
let candidate = Candidate::new("CPU".to_string(), 29, 32);
let definition = get_definition(&candidate, sentence);
assert!(definition.is_some());
let def = definition.unwrap();
assert_eq!(def.text(), "central processing unit");
}
// A definition that exactly spells out the abbreviation is returned unchanged.
#[test]
fn test_select_definition_simple() {
let definition = Candidate::new("World Health Organization".to_string(), 4, 29);
let abbrev = "WHO";
let result = select_definition(&definition, abbrev);
assert!(result.is_some());
let selected = result.unwrap();
assert_eq!(
selected,
Candidate::new("World Health Organization".to_string(), 4, 29)
);
let definition = Candidate::new(
"National Aeronautics and Space Administration".to_string(),
4,
49,
);
let abbrev = "NASA";
let result = select_definition(&definition, abbrev);
assert!(result.is_some());
let selected = result.unwrap();
assert_eq!(
selected,
Candidate::new(
"National Aeronautics and Space Administration".to_string(),
4,
49
)
);
}
// NOTE(review): despite its name, this repeats the first case of
// `test_select_definition_simple` (same input, same expectation).
#[test]
fn test_select_definition_partial() {
let definition = Candidate::new("World Health Organization".to_string(), 4, 29);
let abbrev = "WHO";
let result = select_definition(&definition, abbrev);
assert!(result.is_some());
let selected = result.unwrap();
assert_eq!(
selected,
Candidate::new("World Health Organization".to_string(), 4, 29)
);
}
// The abbreviation's letters cannot be matched against an unrelated definition.
#[test]
fn test_select_definition_no_match() {
let definition = Candidate::new("United Nations".to_string(), 0, 14);
let abbrev = "WHO";
let result = select_definition(&definition, abbrev);
assert!(result.is_none());
}
// A definition that contains the abbreviation as a whole word is rejected.
#[test]
fn test_select_definition_abbreviation_in_definition() {
let definition = Candidate::new("World WHO Organization".to_string(), 0, 22);
let abbrev = "WHO";
let result = select_definition(&definition, abbrev);
assert!(result.is_none());
}
// Same rejection rule, with the abbreviation appearing inside a longer sentence.
#[test]
fn test_select_definition_full_word() {
let definition = Candidate::new("The WHO is a specialized agency".to_string(), 0, 34);
let abbrev = "WHO";
let result = select_definition(&definition, abbrev);
assert!(
result.is_none(),
"Should return None when abbreviation is a full word in the definition"
);
}
// Helper: asserts that `result` contains the given abbreviation/definition pair.
fn assert_abbreviation(
result: &[AbbreviationDefinition],
abbreviation: &str,
definition: &str,
) {
assert!(
result
.iter()
.any(|ad| ad.abbreviation == abbreviation && ad.definition == definition),
"Failed to find abbreviation '{}' with definition '{}'",
abbreviation,
definition
);
}
// End-to-end extraction with default options; a later bare mention of the acronym adds nothing.
#[test]
fn test_extract_abbreviation_definition_pairs() {
let text = "The World Health Organization (WHO) is a specialized agency. \
WHO is responsible for international public health.";
let result = extract_abbreviation_definition_pairs(text, AbbreviationOptions::default());
assert_eq!(result.len(), 1);
assert_abbreviation(&result, "WHO", "World Health Organization");
let text = "The National Aeronautics and Space Administration (NASA) explores space.";
let result = extract_abbreviation_definition_pairs(text, AbbreviationOptions::default());
assert_eq!(result.len(), 1);
assert_abbreviation(
&result,
"NASA",
"National Aeronautics and Space Administration",
);
let text = "Wiskott-Aldrich syndrome protein (WASP)";
let result = extract_abbreviation_definition_pairs(text, AbbreviationOptions::default());
assert_eq!(result.len(), 1);
assert_abbreviation(&result, "WASP", "Wiskott-Aldrich syndrome protein");
}
// Two different abbreviations defined in the same sentence are both extracted.
#[test]
fn test_extract_multiple_abbreviations() {
let text =
"The United Nations (UN) works closely with the World Health Organization (WHO). \
Both UN and WHO are international organizations.";
let result = extract_abbreviation_definition_pairs(text, AbbreviationOptions::default());
assert_eq!(result.len(), 2);
assert_abbreviation(&result, "UN", "United Nations");
assert_abbreviation(&result, "WHO", "World Health Organization");
}
// With the first option flag set, the definition seen twice wins over the one seen once.
// (The flag order of `AbbreviationOptions::new` is presumably
// most_common_definition, first_definition, tokenize; confirm against its definition.)
#[test]
fn test_most_common_definition() {
let text = "The World Health Organization (WHO) is important. \n\
The World Heritage Organization (WHO) is different. \n\
The World Health Organization (WHO) is a UN agency.";
let options = AbbreviationOptions::new(true, false, false);
let result = extract_abbreviation_definition_pairs(text, options);
assert_eq!(result.len(), 1);
assert_abbreviation(&result, "WHO", "World Health Organization");
}
// With the second option flag set, only the first definition of an abbreviation is kept.
#[test]
fn test_first_definition() {
let text = "The World Heritage Organization (WHO) is important. \
The World Health Organization (WHO) is different.";
let options = AbbreviationOptions::new(false, true, false);
let result = extract_abbreviation_definition_pairs(text, options);
assert_eq!(result.len(), 1);
assert_abbreviation(&result, "WHO", "World Heritage Organization");
}
// Helper: runs the extraction and checks that every expected pair is present
// (additional pairs in the result are allowed).
fn run_extraction_test(
text: &str,
expected_pairs: Vec<(&str, &str)>,
options: AbbreviationOptions,
) {
let result = extract_abbreviation_definition_pairs(text, options);
for (acronym, expected_term) in expected_pairs {
assert_abbreviation(&result, acronym, expected_term);
}
}
// Biomedical sample text: hyphenated long forms, two-word abbreviations and slashes.
#[test]
fn test_extract_abbreviations() {
let text = r#"The endoplasmic reticulum (ER) in Saccharomyces cerevisiae consists of a
reticulum underlying the plasma membrane (cortical ER) and ER associated with
the nuclear envelope (nuclear ER).
The SH3 domain of Myo5p regulates the
polymerization of actin through interactions with both Las17p, a homolog of
mammalian Wiskott-Aldrich syndrome protein (WASP), and Vrp1p, a homolog of
WASP-interacting protein (WIP).
Ribonuclease P (RNase P) is a ubiquitous endoribonuclease that cleaves precursor
tRNAs to generate mature 5prime prime or minute termini.
The purified proteins
were separated by sodium dodecyl sulfate-polyacrylamide gel electrophoresis (SDS-PAGE) and
identified by peptide mass fingerprint analysis using
matrix-assisted laser desorption/ionization (MALDI) mass spectrometry."#;
let options = AbbreviationOptions::default();
run_extraction_test(
text,
vec![
("ER", "endoplasmic reticulum"),
("WASP", "Wiskott-Aldrich syndrome protein"),
("WIP", "WASP-interacting protein"),
("RNase P", "Ribonuclease P"),
(
"SDS-PAGE",
"sodium dodecyl sulfate-polyacrylamide gel electrophoresis",
),
("MALDI", "matrix-assisted laser desorption/ionization"),
],
options,
);
}
// Parentheses closed by `;` or `:`-style content, quoted abbreviations and mixed-case forms.
#[test]
fn test_extract_abbreviations_with_special_cases() {
let text = r#"Theory of mind (ToM; Smith 2009) broadly refers to humans' ability to represent the mental states of others,
including their desires, beliefs, and intentions.
Applications of text-to-speech (TTS) include:
We review astronomy and physics engagement with the
Open Researcher and Contributor iD (ORCID) service as a solution.
The Proceeds of Crime Act 2002 ("PoCA 2002")."#;
let options = AbbreviationOptions::default();
run_extraction_test(
text,
vec![
("ToM", "Theory of mind"),
("TTS", "text-to-speech"),
("ORCID", "Open Researcher and Contributor iD"),
("PoCA 2002", "Proceeds of Crime Act 2002"),
],
options,
);
}
// Ordinary parenthetical remarks ("astronaut", "and brains") must not be reported as
// abbreviations, while a real acronym in the same text still is.
#[test]
fn test_extract_abbreviations_with_edge_cases() {
let text = r#"The "satellite" goal of the program was accomplished when China established a space presence with the launch of Dongfanghong I in 1970; although, it wasn't until the 21st century that the PRC space program kicked into high gear, with the rapid development, buildup and deployment of rockets, satellites, and the first Taikonaut (astronaut) in October 2003. In fact, prior to 2010, the PRC had only conducted ten space launches, one of which put the satellite into orbit.
Once more, also for the Space Race, a strong transatlantic link could strengthen the path towards a peaceful and prosperous future for humankind and by consequence, a more secure period for our democracies: it is in our hands (and brains) to transform these ideas into a great reality.
Berlin is acknowledging the vulnerabilities that could potentially arise through hostile acts in space and set up its own space monitoring center, called the Air and Space Operations Center (ASOC) in September 2020 ."#;
let options = AbbreviationOptions::default();
run_extraction_test(
text,
vec![("ASOC", "Air and Space Operations Center")],
options,
);
let result = extract_abbreviation_definition_pairs(text, options);
assert!(!result.iter().any(|ad| ad.abbreviation == "astronaut"));
assert!(!result.iter().any(|ad| ad.abbreviation == "and brains"));
}
// Harder biomedical fragments run with only the third option flag set: definitions spanning
// line breaks, digits, nested parentheses and two-word abbreviations.
#[test]
fn test_extract_abbreviations_with_edge_cases_2() {
let text = r#"this approach, which we term high-throughput mass spectrometric protein complex
identification (HMS-PCI). Beginning with 10% of predicted yeast proteins as.
The Rep78 and Rep68 proteins of adeno-associated virus (AAV) type 2 are involved
in DNA replication, regulation of gene expression, and targeting site-specific
integration.
Ligand-receptor interaction for other C19-steroids was also examined. While
5-androstene-3beta, 17beta-diol (ADIOL) displayed estrogenic activity in this
system,
The Ogg1 protein of Saccharomyces cerevisiae belongs to a family of DNA
glycosylases and apurinic/apyrimidinic site (AP) lyases, the signature of which
is the alpha-helix. We have used the yeast three-hybrid system (D. J. SenGupta, B. Zhang, B.
Kraemer, P. Pochart, S. Fields, and M. Wickens, Proc. Natl. Acad. Sci. USA
93:8496-8501, 1996) to study binding of the human immunodeficiency virus type 1
(HIV-1) Gag protein to the HIV-1 RNA encapsidation signal (HIVPsi). Interaction
of these elements results in the activation of a reporter gene in the yeast
Saccharomyces cerevisiae. Using this system, we have shown that the HIV-1 Gag
Department of Chemistry and Biochemistry, Texas Tech University, Lubbock, TX,
79409-1061, USA. u0nes@ttacs.ttu.edu
Sterol C-methylations catalyzed by the (S)-adenosyl-L-methionine:
Delta(24)-sterol methyl transferase (SMT) have provided the focus for study of
electrophilic alkylations, a reaction type of functional importance in C-C bond
formation of natural products."#;
let options = AbbreviationOptions::new(false, false, true);
run_extraction_test(
text,
vec![
(
"HMS-PCI",
"high-throughput mass spectrometric protein complex identification",
),
("AAV", "adeno-associated virus"),
("ADIOL", "5-androstene-3beta, 17beta-diol"),
("AP", "apurinic/apyrimidinic site"),
("HIV-1", "human immunodeficiency virus type 1"),
("HIVPsi", "HIV-1 RNA encapsidation signal"),
("SMT", "Delta(24)-sterol methyl transferase"),
],
options,
);
let text = r#"cells, NMD appears to involve splicing-dependent alterations to mRNA as well as
ribosome-associated components of the translational apparatus. To date, human
(h) Upf1 protein (p) (hUpf1p), a group 1 RNA helicase named after its
Saccharomyces cerevisiae orthologue that functions in both translation
termination and NMD, has been the only factor shown to be required for NMD in
mammalian cells. Here, we describe human orthologues to
binding sites for Ro60 and La proteins, and Ro RNPs are thus physiologically
proteins and recombinant hY (rhY) co-expressed in yeast, we found that RNPs
made of rRo60/rhY/rLa were readily reassembled. Reconstitution of tripartite
RNPs was critically dependent on the presence of an appropriate Ro60 binding
encodes a membrane protein. The bait is expressed in its natural environment,
the membrane, whereas the protein partner (the prey) is fused to a cytoplasmic
he transactivational properties of tamoxifen in a basic yeast model system
which reconstitutes ligand-dependent human estrogen receptor-alpha (hER alpha)
gene activation. Tamoxifen exerted low agonist activity in this system compared
calculated by fitting experimental data with a logistic dose-response function.
domain and phosphatidylserines. For this purpose, mixed bilayers of 1-palmitoyl,
2-oleoyl-sn-glycero-3-phosphocholine (POPC) and
"#;
let options = AbbreviationOptions::new(false, false, true);
run_extraction_test(
text,
vec![
("hUpf1p", "human (h) Upf1 protein (p)"),
("rhY", "recombinant hY"),
("hER alpha", "human estrogen receptor-alpha"),
("POPC", "1-palmitoyl, 2-oleoyl-sn-glycero-3-phosphocholine"),
],
options,
);
}
// The multi-text entry point accepts `&str` items and merges the results of all texts.
#[test]
fn test_parallel_extraction_multiple_texts_str() {
let texts = vec![
"The National Aeronautics and Space Administration (NASA) explores space.",
"The European Space Agency (ESA) collaborates with NASA.",
"Both NASA and ESA conduct important research.",
];
let options = AbbreviationOptions::default();
let result = extract_abbreviation_definition_pairs_parallel(texts, options);
assert_eq!(result.len(), 2);
assert_abbreviation(
&result,
"NASA",
"National Aeronautics and Space Administration",
);
assert_abbreviation(&result, "ESA", "European Space Agency");
}
// Same as above, with owned `String` items.
#[test]
fn test_parallel_extraction_multiple_texts_string() {
let texts = vec![
"The National Aeronautics and Space Administration (NASA) explores space.".to_string(),
"The European Space Agency (ESA) collaborates with NASA.".to_string(),
"Both NASA and ESA conduct important research.".to_string(),
];
let options = AbbreviationOptions::default();
let result = extract_abbreviation_definition_pairs_parallel(texts, options);
assert_eq!(result.len(), 2);
assert_abbreviation(
&result,
"NASA",
"National Aeronautics and Space Administration",
);
assert_abbreviation(&result, "ESA", "European Space Agency");
}
// `tokenize_and_clean` yields one item per sentence; a sentence broken across a newline is
// joined back with a single space.
#[test]
fn test_tokenize_and_clean() {
let input = r#"First sentence.
Second sentence with a
newline in the middle.
Third sentence after an empty line.
Fourth sentence.
Fifth sentence with trailing newline.
"#;
let expected = vec![
"First sentence.",
"Second sentence with a newline in the middle.",
"Third sentence after an empty line.",
"Fourth sentence.",
"Fifth sentence with trailing newline.",
];
let result: Vec<String> = tokenize_and_clean(input)
.map(|cow| cow.into_owned())
.collect();
assert_eq!(result, expected);
}
// Non-ASCII text (accented Latin, Cyrillic, Greek) is handled by the default extraction.
#[test]
fn test_unicode_chars() {
let text = r#""Two kinds of mechanical valve, St. Jude Medical (SJM) and Björk-Shiley (B-S), in patients with single valve replacement have been evaluated on a view point of intravascular hemolysis.
The World Health Organization (WHO) works globally. La Société Nationale des Chemins de fer Français (SNCF) est l'entreprise ferroviaire publique française.,
Em português, a Organização Mundial da Saúde (OMS) é muito importante.,
Всемирная организация здравоохранения (Воз) работает во всем мире.",
The Société Générale des Surveillances (SGS) is a multinational company.,
Το Ινστιτούτο Τεχνολογίας Υπολογιστών και Εκδόσεων (ΙΤΥΕ) είναι ερευνητικός οργανισμός.",
"#;
let options = AbbreviationOptions::default();
run_extraction_test(
text,
vec![
("SJM", "St. Jude Medical"),
("B-S", "Björk-Shiley"),
("WHO", "World Health Organization"),
("SNCF", "Société Nationale des Chemins de fer Français"),
("OMS", "Organização Mundial da Saúde"),
("Воз", "Всемирная организация здравоохранения"),
("SGS", "Société Générale des Surveillances"),
("ΙΤΥΕ", "Ινστιτούτο Τεχνολογίας Υπολογιστών και Εκδόσεων"),
],
options,
);
}
}