use scraper::Html;
use crate::dom;
use super::ExtractorResult;
/// Reports whether the page belongs to the C2 wiki.
///
/// Detection is purely URL based: the result is `true` only when `url` is
/// present and contains the substring `wiki.c2.com`. The parsed document is
/// not inspected.
#[must_use]
pub fn is_c2wiki(_html: &Html, url: Option<&str>) -> bool {
    matches!(url, Some(candidate) if candidate.contains("wiki.c2.com"))
}
/// Extracts the content of a C2 wiki page.
///
/// Returns `None` when the URL is not a C2 wiki URL. Otherwise the rendered
/// DOM is tried first; if that yields nothing, the page name is taken from
/// the URL and the page is fetched through the JSON endpoint instead.
#[must_use]
pub fn extract_c2wiki(html: &Html, url: Option<&str>) -> Option<ExtractorResult> {
    if !is_c2wiki(html, url) {
        return None;
    }

    try_rendered(html, url).or_else(|| {
        // Only reached when the document carries no rendered page content.
        let page_name = extract_page_name(url?)?;
        try_api_fetch(&page_name)
    })
}
fn try_rendered(html: &Html, url: Option<&str>) -> Option<ExtractorResult> {
let page_ids = dom::select_ids(html, ".page[data-title]");
let page_id = page_ids.first().copied()?;
let title = dom::get_attr(html, page_id, "data-title")
.map(|t| expand_wiki_word(&t))
.unwrap_or_default();
let content = dom::inner_html(html, page_id);
if content.trim().is_empty() {
return None;
}
Some(ExtractorResult {
content,
title: if title.is_empty() {
extract_page_name(url.unwrap_or("")).map(|n| expand_wiki_word(&n))
} else {
Some(title)
},
author: None,
site: Some("C2 Wiki".to_string()),
published: None,
image: None,
description: None,
})
}
/// Fetches a page through the C2 JSON endpoint and converts it to HTML.
///
/// Returns `None` when the request yields nothing, the body is not valid
/// JSON, the `text` field is missing or not a string, or the text is blank.
/// A string `date` field, when present, is passed through as `published`.
fn try_api_fetch(page_name: &str) -> Option<ExtractorResult> {
    let endpoint = format!("https://c2.com/wiki/remodel/pages/{page_name}");
    let payload = crate::http::get(&endpoint)?;
    let parsed: serde_json::Value = serde_json::from_str(&payload).ok()?;

    let wiki_text = parsed
        .get("text")
        .and_then(|value| value.as_str())
        .filter(|body| !body.trim().is_empty())?;
    let published = parsed
        .get("date")
        .and_then(|value| value.as_str())
        .map(str::to_owned);

    Some(ExtractorResult {
        content: wiki_text_to_html(wiki_text),
        title: Some(expand_wiki_word(page_name)),
        author: None,
        site: Some("C2 Wiki".to_string()),
        published,
        image: None,
        description: None,
    })
}
/// Converts raw wiki text into simple HTML paragraphs.
///
/// Blank lines separate paragraphs; consecutive non-blank lines are joined
/// with a single space inside one `<p>` element. Every line is trimmed,
/// HTML-escaped, and then run through `convert_wiki_italics` and
/// `convert_wiki_links`.
///
/// Escaping happens before the inline conversions so that characters from the
/// wiki source (`&`, `<`, `>`) can never be interpreted as markup, while the
/// tags produced by the conversions themselves are left intact.
fn wiki_text_to_html(text: &str) -> String {
    /// Escapes the characters that would otherwise be parsed as HTML markup.
    /// Apostrophes are left alone on purpose: they carry the wiki's inline
    /// emphasis syntax and are harmless in element content.
    fn escape_markup(raw: &str) -> String {
        let mut escaped = String::with_capacity(raw.len());
        for ch in raw.chars() {
            match ch {
                '&' => escaped.push_str("&amp;"),
                '<' => escaped.push_str("&lt;"),
                '>' => escaped.push_str("&gt;"),
                other => escaped.push(other),
            }
        }
        escaped
    }

    let mut html = String::with_capacity(text.len() * 2);
    let mut in_paragraph = false;

    for line in text.lines() {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            // A blank line terminates the current paragraph, if any.
            if in_paragraph {
                html.push_str("</p>\n");
                in_paragraph = false;
            }
            continue;
        }

        if in_paragraph {
            // Soft line break inside a paragraph collapses to one space.
            html.push(' ');
        } else {
            html.push_str("<p>");
            in_paragraph = true;
        }

        let escaped = escape_markup(trimmed);
        let processed = convert_wiki_italics(&escaped);
        let processed = convert_wiki_links(&processed);
        html.push_str(&processed);
    }

    if in_paragraph {
        html.push_str("</p>\n");
    }
    html
}
/// Converts wiki apostrophe markup into inline HTML emphasis.
///
/// Runs of consecutive apostrophes are interpreted as follows:
///
/// * one apostrophe is literal text;
/// * two (`''`) toggle `<em>`;
/// * three (`'''`) toggle `<strong>`;
/// * four are a literal apostrophe followed by a `<strong>` toggle;
/// * five or more toggle both, with any apostrophes beyond five kept as text.
///
/// A `'''` that appears while only `<em>` is open and has no later `'''` to
/// pair with is treated as "end of italics followed by an apostrophe"
/// (`''dogs''' bowl`), which keeps the output of italic-only text unchanged.
///
/// Tags are always emitted properly nested, and anything still open at the
/// end of `text` is closed.
fn convert_wiki_italics(text: &str) -> String {
    const EM: &str = "em";
    const STRONG: &str = "strong";

    /// Appends `<tag>` or `</tag>` to `out`.
    fn push_tag(out: &mut String, tag: &str, closing: bool) {
        out.push_str(if closing { "</" } else { "<" });
        out.push_str(tag);
        out.push('>');
    }

    /// Opens `tag` when it is closed; otherwise closes it, temporarily closing
    /// and then re-opening any tag nested inside it so nesting stays valid.
    fn toggle(tag: &'static str, open: &mut Vec<&'static str>, out: &mut String) {
        match open.iter().position(|&t| t == tag) {
            None => {
                push_tag(out, tag, false);
                open.push(tag);
            }
            Some(depth) => {
                let inner = open.split_off(depth + 1);
                for nested in inner.iter().rev() {
                    push_tag(out, nested, true);
                }
                push_tag(out, tag, true);
                open.pop();
                for nested in inner {
                    push_tag(out, nested, false);
                    open.push(nested);
                }
            }
        }
    }

    let mut result = String::with_capacity(text.len());
    // Stack of currently open tags, outermost first.
    let mut open: Vec<&'static str> = Vec::with_capacity(2);
    let mut rest = text;

    // `'` is a single ASCII byte, so byte offsets from `find` and the run
    // length are always valid slice boundaries.
    while let Some(start) = rest.find('\'') {
        result.push_str(&rest[..start]);
        let run = rest[start..].bytes().take_while(|&b| b == b'\'').count();
        rest = &rest[start + run..];

        match run {
            1 => result.push('\''),
            2 => toggle(EM, &mut open, &mut result),
            3 if open == [EM] && !rest.contains("'''") => {
                // Unpaired triple quote right after italics: close the
                // italics and keep the third quote as a real apostrophe.
                toggle(EM, &mut open, &mut result);
                result.push('\'');
            }
            3 | 4 => {
                if run == 4 {
                    result.push('\'');
                }
                toggle(STRONG, &mut open, &mut result);
            }
            _ => {
                for _ in 5..run {
                    result.push('\'');
                }
                // Toggle the innermost tag first so closing tags come out in
                // the right order when both are already open.
                if open.last() == Some(&EM) {
                    toggle(EM, &mut open, &mut result);
                    toggle(STRONG, &mut open, &mut result);
                } else {
                    toggle(STRONG, &mut open, &mut result);
                    toggle(EM, &mut open, &mut result);
                }
            }
        }
    }
    result.push_str(rest);

    // Close whatever is still open, innermost first.
    while let Some(tag) = open.pop() {
        push_tag(&mut result, tag, true);
    }
    result
}
/// Replaces every wiki word in `text` with a link to its C2 page.
///
/// The text is split into maximal runs of alphanumeric characters; each run
/// is handed to `flush_word`, and every other character is copied through
/// unchanged.
fn convert_wiki_links(text: &str) -> String {
    let mut linked = String::with_capacity(text.len());
    // Byte offset where the alphanumeric run currently being scanned begins.
    let mut run_start = 0;

    for (offset, ch) in text.char_indices() {
        if ch.is_alphanumeric() {
            continue;
        }
        flush_word(&text[run_start..offset], &mut linked);
        linked.push(ch);
        run_start = offset + ch.len_utf8();
    }

    // Emit the trailing run (possibly empty).
    flush_word(&text[run_start..], &mut linked);
    linked
}
/// Appends `word` to `result`, as a C2 link if it is a wiki word.
///
/// Wiki words become an anchor pointing at `https://wiki.c2.com/?<word>` whose
/// visible text is the space-expanded form; anything else is appended as is.
fn flush_word(word: &str, result: &mut String) {
    if !is_wiki_word(word) {
        result.push_str(word);
        return;
    }

    let display = expand_wiki_word(word);
    let anchor = format!("<a href=\"https://wiki.c2.com/?{word}\">{display}</a>");
    result.push_str(&anchor);
}
/// Decides whether `word` looks like a CamelCase wiki word.
///
/// A wiki word is at least three bytes long, starts with an uppercase
/// character, contains at least two uppercase characters in total, and has at
/// least one lowercase character.
fn is_wiki_word(word: &str) -> bool {
    if word.len() < 3 {
        return false;
    }

    let starts_upper = word.chars().next().is_some_and(char::is_uppercase);
    let upper_total = word.chars().filter(|c| c.is_uppercase()).count();
    let any_lower = word.chars().any(char::is_lowercase);

    starts_upper && upper_total >= 2 && any_lower
}
/// Turns a CamelCase wiki word into space-separated words.
///
/// A space is inserted before every uppercase character that directly follows
/// a lowercase one, so `WelcomeVisitors` becomes `Welcome Visitors` while runs
/// of capitals such as `XP` stay untouched.
fn expand_wiki_word(word: &str) -> String {
    let mut expanded = String::with_capacity(word.len() + 8);
    let mut previous_was_lower = false;

    for ch in word.chars() {
        if previous_was_lower && ch.is_uppercase() {
            expanded.push(' ');
        }
        expanded.push(ch);
        previous_was_lower = ch.is_lowercase();
    }

    expanded
}
/// Extracts the wiki page name from a URL of the form `…/?PageName`.
///
/// The page name is the whole query string. `None` is returned when the URL
/// has no query, the query is empty, or the query contains `=` (i.e. it is a
/// regular `key=value` query rather than a bare page name).
///
/// Any `#fragment` is discarded first: it is not part of the query, so it
/// must neither end up in the page name nor let a `?` inside the fragment be
/// mistaken for the start of the query.
fn extract_page_name(url: &str) -> Option<String> {
    let without_fragment = url.split('#').next().unwrap_or(url);
    let (_, query) = without_fragment.split_once('?')?;

    if query.is_empty() || query.contains('=') {
        return None;
    }
    Some(query.to_string())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Detection depends only on the URL, so an empty document suffices.
    #[test]
    fn detects_c2wiki_url() {
        let document = Html::parse_document("<html><body></body></html>");
        let c2_url = Some("https://wiki.c2.com/?TestPage");
        let other_url = Some("https://example.com");

        assert!(is_c2wiki(&document, c2_url));
        assert!(!is_c2wiki(&document, other_url));
    }

    /// CamelCase names gain spaces; all-caps names are left alone.
    #[test]
    fn expands_wiki_words() {
        let cases = [
            ("WelcomeVisitors", "Welcome Visitors"),
            ("ExtremeProgramming", "Extreme Programming"),
            ("XP", "XP"),
        ];
        for (input, expected) in cases {
            assert_eq!(expand_wiki_word(input), expected);
        }
    }

    /// A `''…''` pair is rendered as an `<em>` element.
    #[test]
    fn converts_wiki_italics() {
        let rendered = convert_wiki_italics("This is ''italic'' text");
        assert_eq!(rendered, "This is <em>italic</em> text");
    }

    /// Only mixed-case words with two or more capitals count as wiki words.
    #[test]
    fn identifies_wiki_words() {
        for accepted in ["ExtremeProgramming", "WelcomeVisitors"] {
            assert!(is_wiki_word(accepted));
        }
        for rejected in ["the", "XP", "abc"] {
            assert!(!is_wiki_word(rejected));
        }
    }

    /// The page name is the bare query string; no query means no name.
    #[test]
    fn extracts_page_name_from_url() {
        let with_page = extract_page_name("https://wiki.c2.com/?ExtremeProgramming");
        assert_eq!(with_page, Some("ExtremeProgramming".to_string()));

        let without_page = extract_page_name("https://wiki.c2.com/");
        assert_eq!(without_page, None);
    }

    /// Blank lines split the text into separate `<p>` elements.
    #[test]
    fn wiki_text_converts_to_html() {
        let rendered = wiki_text_to_html("First paragraph.\n\nSecond paragraph.");
        for paragraph in ["<p>First paragraph.</p>", "<p>Second paragraph.</p>"] {
            assert!(rendered.contains(paragraph));
        }
    }

    /// Exercises the live endpoint; a failed fetch is tolerated so the test
    /// only checks the result when one was actually returned.
    #[test]
    fn api_fetch_on_live_page() {
        let Some(fetched) = try_api_fetch("ExtremeProgramming") else {
            return;
        };
        assert!(
            fetched.content.len() > 100,
            "should have substantial content"
        );
        assert_eq!(fetched.title, Some("Extreme Programming".to_string()));
        assert_eq!(fetched.site, Some("C2 Wiki".to_string()));
    }
}