use std::fmt::Write as _;
use serde::Serialize;
use crate::error::Error;
/// Borrowed view of a single page, serialized as one JSON object.
///
/// Fields are declared in the order they should appear in the serialized
/// output: `id`, `url`, `title`, `text`.
#[derive(Serialize)]
struct PageJson<'a> {
    /// Page identifier, carried as a string rather than a number.
    id: &'a str,
    /// Full URL of the page.
    url: &'a str,
    /// Page title.
    title: &'a str,
    /// Page body text.
    text: &'a str,
}
/// Selects how [`format_page`] renders a page.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OutputFormat {
    /// A `<doc id=… url=… title=…>` element wrapping the page text.
    Doc,
    /// A single-line JSON object followed by a newline.
    Json,
}
/// Escapes the characters that are unsafe inside a double-quoted HTML/XML
/// attribute value.
///
/// `&`, `<`, `>` and `"` are replaced by `&amp;`, `&lt;`, `&gt;` and `&quot;`
/// respectively; every other byte is copied through unchanged. Returns a newly
/// allocated `String`.
#[inline]
fn escape_html(s: &str) -> String {
    // A little headroom so a handful of replacements does not force a regrowth.
    let mut result = String::with_capacity(s.len() + 16);
    // Start of the pending run of bytes that need no escaping.
    let mut last = 0;
    for (i, b) in s.bytes().enumerate() {
        let esc = match b {
            b'&' => "&amp;",
            b'<' => "&lt;",
            b'>' => "&gt;",
            b'"' => "&quot;",
            _ => continue,
        };
        // All four specials are single-byte ASCII, so `last..i` always falls
        // on UTF-8 character boundaries and the slice cannot panic.
        result.push_str(&s[last..i]);
        result.push_str(esc);
        last = i + 1;
    }
    // Flush whatever follows the final escaped character.
    result.push_str(&s[last..]);
    result
}
/// Builds the URL of a page from the site base URL and the page title.
///
/// The result is `url_base`, a `/`, and then `title` with every ASCII space
/// replaced by an underscore. No other character is altered; in particular no
/// percent-encoding is applied.
#[inline]
pub fn make_url(url_base: &str, title: &str) -> String {
    // A space and an underscore are both one byte, so this capacity is exact.
    let mut url = String::with_capacity(url_base.len() + 1 + title.len());
    url.push_str(url_base);
    url.push('/');

    // Copy the space-free runs of the title verbatim, joined by underscores.
    // `split` always yields at least one (possibly empty) segment.
    let mut segments = title.split(' ');
    if let Some(first) = segments.next() {
        url.push_str(first);
    }
    for segment in segments {
        url.push('_');
        url.push_str(segment);
    }

    url
}
/// Renders one page in the requested output format.
///
/// The page URL is derived from `url_base` and `title` via [`make_url`].
///
/// * [`OutputFormat::Doc`] produces
///   `<doc id="…" url="…" title="…">`, a newline, `text` verbatim, and then
///   `</doc>` followed by a blank line. The URL and title are HTML-escaped;
///   `text` is not.
/// * [`OutputFormat::Json`] produces a JSON object with the string fields
///   `id`, `url`, `title` and `text`, terminated by a single newline. If
///   serialization fails, the returned string is just that newline.
pub fn format_page(
    id: u64,
    title: &str,
    url_base: &str,
    text: &str,
    format: OutputFormat,
) -> String {
    let url = make_url(url_base, title);

    match format {
        OutputFormat::Json => {
            let id_field = id.to_string();
            let record = PageJson {
                id: &id_field,
                url: &url,
                title,
                text,
            };
            // A serialization error degrades to an empty record rather than a panic.
            let mut line = serde_json::to_string(&record).unwrap_or_default();
            line.push('\n');
            line
        }
        OutputFormat::Doc => {
            let attr_url = escape_html(&url);
            let attr_title = escape_html(title);

            // Generous allowance for the fixed markup plus the id; a u64 has
            // at most 20 decimal digits.
            let markup_budget = 9 + 20 + 7 + 10 + 3 + 9;
            let mut page = String::with_capacity(
                markup_budget + attr_url.len() + attr_title.len() + text.len(),
            );

            // Opening tag, including its trailing newline.
            writeln!(
                page,
                "<doc id=\"{id}\" url=\"{attr_url}\" title=\"{attr_title}\">"
            )
            .expect("writing to String is infallible");
            // The body is copied as-is, with no escaping.
            page.push_str(text);
            page.push_str("\n</doc>\n\n");
            page
        }
    }
}
pub fn parse_file_size(spec: &str) -> Result<u64, Error> {
let spec = spec.trim();
if spec.is_empty() {
return Err(Error::InvalidFileSize(spec.to_string()));
}
let last_char = spec.chars().last().unwrap_or('0');
let multiplier = match last_char {
'K' => Some(1024u64),
'M' => Some(1024u64 * 1024),
'G' => Some(1024u64 * 1024 * 1024),
_ => None,
};
match multiplier {
Some(mult) => {
let num_part = &spec[..spec.len() - 1];
let num: u64 = num_part
.parse()
.map_err(|_| Error::InvalidFileSize(spec.to_string()))?;
Ok(num * mult)
}
None => {
let num: u64 = spec
.parse()
.map_err(|_| Error::InvalidFileSize(spec.to_string()))?;
Ok(num)
}
}
}
/// Unit tests for page formatting, URL construction and file-size parsing.
#[cfg(test)]
mod tests {
    use super::*;

    /// The doc format wraps the text in a `<doc>` element followed by a blank line.
    #[test]
    fn test_format_page_doc() {
        let result = format_page(
            1,
            "Test Page",
            "https://en.wikipedia.org/wiki",
            "Hello world.",
            OutputFormat::Doc,
        );
        let expected = "<doc id=\"1\" url=\"https://en.wikipedia.org/wiki/Test_Page\" title=\"Test Page\">\nHello world.\n</doc>\n\n";
        assert_eq!(result, expected);
    }

    /// The JSON format emits all four fields as strings.
    #[test]
    fn test_format_page_json() {
        let result = format_page(
            42,
            "Test Page",
            "https://en.wikipedia.org/wiki",
            "Some text here.",
            OutputFormat::Json,
        );
        let parsed: serde_json::Value = serde_json::from_str(result.trim()).unwrap();
        assert_eq!(parsed["id"], "42");
        assert_eq!(parsed["url"], "https://en.wikipedia.org/wiki/Test_Page");
        assert_eq!(parsed["title"], "Test Page");
        assert_eq!(parsed["text"], "Some text here.");
    }

    #[test]
    fn test_make_url_with_spaces() {
        let url = make_url("https://en.wikipedia.org/wiki", "New York City");
        assert_eq!(url, "https://en.wikipedia.org/wiki/New_York_City");
    }

    /// Non-ASCII titles pass through without any encoding.
    #[test]
    fn test_make_url_japanese_title() {
        let url = make_url("https://ja.wikipedia.org/wiki", "東京都");
        assert_eq!(url, "https://ja.wikipedia.org/wiki/東京都");
    }

    #[test]
    fn test_make_url_no_spaces() {
        let url = make_url("https://en.wikipedia.org/wiki", "Rust");
        assert_eq!(url, "https://en.wikipedia.org/wiki/Rust");
    }

    #[test]
    fn test_parse_file_size_kilobytes() {
        assert_eq!(parse_file_size("500K").unwrap(), 512000);
        assert_eq!(parse_file_size("1K").unwrap(), 1024);
    }

    #[test]
    fn test_parse_file_size_megabytes() {
        assert_eq!(parse_file_size("1M").unwrap(), 1048576);
        assert_eq!(parse_file_size("10M").unwrap(), 10485760);
    }

    #[test]
    fn test_parse_file_size_gigabytes() {
        assert_eq!(parse_file_size("1G").unwrap(), 1073741824);
    }

    #[test]
    fn test_parse_file_size_plain_number() {
        assert_eq!(parse_file_size("4096").unwrap(), 4096);
        assert_eq!(parse_file_size("0").unwrap(), 0);
    }

    /// Empty input, non-numeric input, a bare suffix and an unknown suffix are all rejected.
    #[test]
    fn test_parse_file_size_invalid() {
        assert!(parse_file_size("").is_err());
        assert!(parse_file_size("abc").is_err());
        assert!(parse_file_size("M").is_err());
        assert!(parse_file_size("12X").is_err());
    }

    /// Markup characters in the title must be entity-escaped in both the
    /// `title` and `url` attributes of the doc header.
    #[test]
    fn test_escape_html_in_doc_format() {
        let result = format_page(
            1,
            "A <b>bold</b> & \"quoted\" title",
            "https://en.wikipedia.org/wiki",
            "Some text.",
            OutputFormat::Doc,
        );
        assert!(result.contains(
            "title=\"A &lt;b&gt;bold&lt;/b&gt; &amp; &quot;quoted&quot; title\""
        ));
        assert!(result.contains(
            "url=\"https://en.wikipedia.org/wiki/A_&lt;b&gt;bold&lt;/b&gt;_&amp;_&quot;quoted&quot;_title\""
        ));
    }

    /// JSON output ends with exactly one newline.
    #[test]
    fn test_json_format_trailing_newline() {
        let result = format_page(
            1,
            "Title",
            "https://example.com",
            "Text",
            OutputFormat::Json,
        );
        assert!(result.ends_with('\n'));
        assert!(!result.ends_with("\n\n"));
    }

    /// Doc output ends with the closing tag followed by a blank line.
    #[test]
    fn test_doc_format_trailing_newline() {
        let result = format_page(1, "Title", "https://example.com", "Text", OutputFormat::Doc);
        assert!(result.ends_with("</doc>\n\n"));
    }
}