use wasm_bindgen::prelude::*;
mod katana;
/// Decodes HTML character references in `html_string` into plain text.
///
/// The input is scanned once, left to right. A reference has the form
/// `&body;`:
///
/// * if `body` is one of the names in the table below, the reference is
///   replaced by the table's text (`nbsp` becomes an ordinary space and `shy`
///   is dropped entirely);
/// * if `body` is `#` followed by decimal digits, or `#x`/`#X` followed by
///   hexadecimal digits, the reference is replaced by that code point;
/// * anything else (unknown names, malformed numbers, a bare `&`) is copied
///   through unchanged.
///
/// Because the scan never revisits text it has already produced, a
/// double-escaped reference is decoded exactly one level, which a chain of
/// whole-string replacements cannot guarantee.
fn unescape_html(html_string: &str) -> String {
    // Entity names are stored without the leading `&` and trailing `;`.
    const NAMED_ENTITIES: &[(&str, &str)] = &[
        ("nbsp", " "),
        ("amp", "&"),
        ("lt", "<"),
        ("gt", ">"),
        ("quot", "\""),
        ("apos", "'"),
        ("cent", "¢"),
        ("pound", "£"),
        ("yen", "¥"),
        ("euro", "€"),
        ("copy", "©"),
        ("reg", "®"),
        ("sect", "§"),
        ("uml", "¨"),
        ("ordf", "ª"),
        ("laquo", "«"),
        ("not", "¬"),
        ("shy", ""),
        ("macr", "¯"),
        ("deg", "°"),
        ("plusmn", "±"),
        ("sup2", "²"),
        ("sup3", "³"),
        ("acute", "´"),
        ("micro", "µ"),
        ("para", "¶"),
        ("middot", "·"),
        ("cedil", "¸"),
        ("sup1", "¹"),
        ("ordm", "º"),
        ("raquo", "»"),
        ("frac14", "¼"),
        ("frac12", "½"),
        ("frac34", "¾"),
        ("iquest", "¿"),
        ("Agrave", "À"),
        ("Aacute", "Á"),
        ("Acirc", "Â"),
        ("Atilde", "Ã"),
        ("Auml", "Ä"),
        ("Aring", "Å"),
        ("AElig", "Æ"),
        ("Ccedil", "Ç"),
        ("Egrave", "È"),
        ("Eacute", "É"),
        ("Ecirc", "Ê"),
        ("Euml", "Ë"),
        ("Igrave", "Ì"),
        ("Iacute", "Í"),
        ("Icirc", "Î"),
        ("Iuml", "Ï"),
        ("ETH", "Ð"),
        ("Ntilde", "Ñ"),
        ("Ograve", "Ò"),
        ("Oacute", "Ó"),
        ("Ocirc", "Ô"),
        ("Otilde", "Õ"),
        ("Ouml", "Ö"),
        ("times", "×"),
        ("Oslash", "Ø"),
        ("Ugrave", "Ù"),
        ("Uacute", "Ú"),
        ("Ucirc", "Û"),
        ("Uuml", "Ü"),
        ("Yacute", "Ý"),
        ("THORN", "Þ"),
        ("szlig", "ß"),
        ("agrave", "à"),
        ("aacute", "á"),
        ("acirc", "â"),
        ("atilde", "ã"),
        ("auml", "ä"),
        ("aring", "å"),
        ("aelig", "æ"),
        ("ccedil", "ç"),
        ("egrave", "è"),
        ("eacute", "é"),
        ("ecirc", "ê"),
        ("euml", "ë"),
        ("igrave", "ì"),
        ("iacute", "í"),
        ("icirc", "î"),
        ("iuml", "ï"),
        ("eth", "ð"),
        ("ntilde", "ñ"),
        ("ograve", "ò"),
        ("oacute", "ó"),
        ("ocirc", "ô"),
        ("otilde", "õ"),
        ("ouml", "ö"),
        ("divide", "÷"),
        ("oslash", "ø"),
        ("ugrave", "ù"),
        ("uacute", "ú"),
        ("ucirc", "û"),
        ("uuml", "ü"),
        ("yacute", "ý"),
        ("thorn", "þ"),
        ("yuml", "ÿ"),
    ];

    // Upper bound on the bytes between `&` and `;`. The longest table name is
    // six bytes and `#x10FFFF` is eight; bounding the search keeps the scan
    // linear on text that is full of stray ampersands.
    const MAX_BODY_LEN: usize = 10;

    /// Parses the digits of a numeric reference (the part after `#`).
    fn parse_numeric_reference(digits: &str) -> Option<char> {
        let (radix, digits) = match digits
            .strip_prefix('x')
            .or_else(|| digits.strip_prefix('X'))
        {
            Some(hex) => (16, hex),
            None => (10, digits),
        };
        // `from_str_radix` would accept a leading sign; a reference must not.
        if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
            return None;
        }
        u32::from_str_radix(digits, radix)
            .ok()
            .and_then(std::char::from_u32)
            .filter(|&c| c != '\0')
    }

    /// Appends the expansion of `body` to `out`; returns `false` (and appends
    /// nothing) when `body` is not a reference this function knows.
    fn push_reference(out: &mut String, body: &str) -> bool {
        if let Some(digits) = body.strip_prefix('#') {
            match parse_numeric_reference(digits) {
                Some(c) => {
                    out.push(c);
                    true
                }
                None => false,
            }
        } else {
            match NAMED_ENTITIES.iter().find(|&&(name, _)| name == body) {
                Some(&(_, text)) => {
                    out.push_str(text);
                    true
                }
                None => false,
            }
        }
    }

    let mut decoded = String::with_capacity(html_string.len());
    let mut rest = html_string;
    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        let after_amp = &rest[amp + 1..];

        // `;` is ASCII, so its byte offset is always a valid slice boundary.
        let body = after_amp
            .bytes()
            .take(MAX_BODY_LEN + 1)
            .position(|b| b == b';')
            .map(|end| &after_amp[..end]);

        // Bytes of `after_amp` consumed by a recognised reference (body + `;`).
        let mut consumed = 0;
        if let Some(body) = body {
            if push_reference(&mut decoded, body) {
                consumed = body.len() + 1;
            }
        }
        if consumed == 0 {
            // Not a reference we know: keep the ampersand as literal text.
            decoded.push('&');
        }
        rest = &after_amp[consumed..];
    }
    decoded.push_str(rest);
    decoded
}
/// Rewrites a fixed list of lowercase, dotted abbreviations without their dots
/// (for example `i.e.` becomes `ie` and `a.m.` becomes `am`).
///
/// Presumably this keeps the dots from being mistaken for sentence ends by a
/// later splitting step; confirm against the caller.
///
/// Matching is case-sensitive and literal. An occurrence is only rewritten
/// when it does not continue a word, i.e. when the character in front of it is
/// not a letter. Without that check the sentence-final period of words such as
/// "best." or "grammar." would be swallowed by the `st.` and `mar.` entries.
/// Abbreviations are applied one after another in table order.
fn replace_abbreviations(text: &str) -> String {
    const ABBREVIATIONS: &[(&str, &str)] = &[
        ("i.e.", "ie"),
        ("e.g.", "eg"),
        ("etc.", "etc"),
        ("mr.", "mr"),
        ("mrs.", "mrs"),
        ("vs.", "vs"),
        ("dr.", "dr"),
        ("prof.", "prof"),
        ("sr.", "sr"),
        ("jr.", "jr"),
        ("st.", "st"),
        ("jan.", "jan"),
        ("feb.", "feb"),
        ("mar.", "mar"),
        ("apr.", "apr"),
        ("jun.", "jun"),
        ("jul.", "jul"),
        ("aug.", "aug"),
        ("sept.", "sept"),
        ("oct.", "oct"),
        ("nov.", "nov"),
        ("dec.", "dec"),
        ("a.m.", "am"),
        ("p.m.", "pm"),
        ("u.s.", "us"),
        ("u.k.", "uk"),
    ];

    /// Replaces every occurrence of `from` that does not directly follow a
    /// letter with `to`, leaving all other text untouched.
    fn replace_at_word_start(text: &str, from: &str, to: &str) -> String {
        let mut out = String::with_capacity(text.len());
        let mut copied = 0;
        for (start, matched) in text.match_indices(from) {
            let continues_word = text[..start]
                .chars()
                .next_back()
                .map_or(false, char::is_alphabetic);
            if continues_word {
                // Part of a longer word; it is copied verbatim with the next chunk.
                continue;
            }
            out.push_str(&text[copied..start]);
            out.push_str(to);
            copied = start + matched.len();
        }
        out.push_str(&text[copied..]);
        out
    }

    ABBREVIATIONS
        .iter()
        .fold(text.to_string(), |acc, &(from, to)| {
            replace_at_word_start(&acc, from, to)
        })
}
/// Strips leftover markup from `html_string` and normalises its whitespace.
///
/// The steps run in this order:
///
/// 1. remove HTML comments;
/// 2. replace `<h1>`..`<h6>` elements (without attributes) by their content
///    followed by a blank line;
/// 3. decode character references via `unescape_html`;
/// 4. replace every remaining `<...>` span on a single line by a space;
/// 5. turn no-break spaces into ordinary spaces;
/// 6. undot known abbreviations via `replace_abbreviations`;
/// 7. collapse whitespace-only gaps between two newlines into a blank line;
/// 8. delete numeric bracket markers such as `[12]`;
/// 9. trim every line and drop lines starting with `^ `;
/// 10. collapse runs of three or more newlines into a blank line.
///
/// NOTE(review): references are decoded (step 3) before tags are stripped
/// (step 4), so escaped angle brackets in running text are treated as tags
/// and removed. Confirm this is intended.
///
/// # Panics
///
/// Only if one of the hard-coded patterns failed to compile, which would be a
/// programming error.
fn remove_html_tags(html_string: &str) -> String {
    // Every pattern below is a fixed literal, so a failure here is a bug.
    let compile = |pattern: &str| regex::Regex::new(pattern).expect("hard-coded regex is valid");

    let text = compile(r"(?s)<!--(.*?)-->")
        .replace_all(html_string, "")
        .into_owned();
    let text = compile(r"(?s)<h[1-6]>(.*?)</h[1-6]>")
        .replace_all(&text, "$1\n\n")
        .into_owned();
    let text = unescape_html(&text);
    let text = compile(r"<(.*?)>").replace_all(&text, " ").into_owned();
    // Written as an escape: the character is invisible in source and cannot be
    // told apart from U+0020, which would turn this step into a no-op.
    let text = text.replace('\u{a0}', " ");
    let text = replace_abbreviations(&text);
    let text = compile(r"\n\s*?\n")
        .replace_all(&text, "\n\n")
        .into_owned();
    // Remove the marker together with blanks in front of it, but keep the
    // whitespace after it so that "foo [1] bar" stays "foo bar" instead of
    // fusing into "foobar".
    let text = compile(r"[ \t]*\[[0-9]+\]")
        .replace_all(&text, "")
        .into_owned();
    let text = text
        .split('\n')
        .map(str::trim)
        .filter(|line| !line.starts_with("^ "))
        .collect::<Vec<&str>>()
        .join("\n");
    compile(r"\n{3,}").replace_all(&text, "\n\n").into_owned()
}
/// Turns raw HTML-ish `text` into cleaned paragraphs separated by blank lines.
///
/// The input is first flattened to a single line: each line is trimmed, empty
/// lines are discarded and the rest are joined with one space. That line is
/// passed through `html2md::parse_html` and then `remove_html_tags`. The
/// cleaned text is handed to `katana::cut`; the items of every group it
/// returns (presumably the sentences of one paragraph) are joined with a
/// space, and the groups are joined with a blank line.
#[wasm_bindgen]
pub fn prepare_text(text: &str) -> String {
    // Flatten the input so line breaks in the source cannot leak into the output.
    let mut flattened = String::with_capacity(text.len());
    for line in text.split('\n').map(str::trim) {
        if line.is_empty() {
            continue;
        }
        if !flattened.is_empty() {
            flattened.push(' ');
        }
        flattened.push_str(line);
    }

    let markdown = html2md::parse_html(&flattened);
    let cleaned = remove_html_tags(&markdown);

    let groups = katana::cut(&cleaned);
    let mut paragraphs: Vec<String> = Vec::new();
    for group in groups.iter() {
        paragraphs.push(group.as_slice().join(" "));
    }
    paragraphs.join("\n\n")
}