use crate::datasets::icgem::body::ICGEMBody;
use crate::datasets::icgem::index::IndexEntry;
use crate::utils::BraheError;
use scraper::{Html, Selector};
fn text_with_br_spaces(el: &scraper::ElementRef<'_>) -> String {
let inner = el.inner_html();
split_on_br(&inner)
.into_iter()
.map(strip_html_tags)
.collect::<Vec<_>>()
.join(" ")
}
/// Splits an HTML fragment at every `<br>` tag and returns the pieces between
/// the tags, borrowed from `html`.
///
/// Recognised forms are `<br>`, `<br/>`, `<br />`, `<br attr=...>` and the
/// malformed-but-common `</br>`, matched case-insensitively. The tag itself is
/// dropped from the output. A `<br` that is never closed by `>` is left in
/// place as ordinary text.
///
/// The result always contains at least one element: input without any `<br>`
/// (including the empty string) yields a single piece equal to the input.
///
/// Matching is done on bytes, so arbitrary UTF-8 may follow a `<` without
/// risking a slice on a non-character boundary.
fn split_on_br(html: &str) -> Vec<&str> {
    /// Reports whether the bytes directly after a `<` spell a `br` tag name
    /// (optionally preceded by `/`) followed by a tag-name terminator.
    fn is_br_tag_start(after_lt: &[u8]) -> bool {
        let name = match after_lt.first() {
            Some(b'/') => &after_lt[1..],
            _ => after_lt,
        };
        if name.len() < 2 || !name[..2].eq_ignore_ascii_case(b"br") {
            return false;
        }
        match name.get(2) {
            // A tag name ends at `>`, `/` or whitespace; anything else means
            // this is a longer name that merely starts with "br".
            Some(&next) => next == b'>' || next == b'/' || next.is_ascii_whitespace(),
            // Input ends right after "br": there is no closing `>` to find,
            // so the caller will not split here either way.
            None => true,
        }
    }

    let bytes = html.as_bytes();
    let mut parts = Vec::new();
    let mut start = 0;
    let mut i = 0;
    while i < bytes.len() {
        // `<` is ASCII, so `i` is always a char boundary when it is used to
        // slice `html` below.
        if bytes[i] == b'<' && is_br_tag_start(&bytes[i + 1..]) {
            if let Some(close_offset) = html[i..].find('>') {
                parts.push(&html[start..i]);
                i += close_offset + 1;
                start = i;
                continue;
            }
        }
        i += 1;
    }
    parts.push(&html[start..]);
    parts
}
/// Removes markup from `s`, keeping only the characters found outside
/// `<...>` spans.
///
/// This is a two-state scan rather than an HTML parser: `<` enters a tag,
/// `>` leaves it, and neither bracket is ever emitted. Entities are not
/// decoded, and an unclosed `<` swallows the rest of the input.
fn strip_html_tags(s: &str) -> String {
    let mut inside_tag = false;
    s.chars()
        .filter(|&c| {
            if c == '<' {
                inside_tag = true;
                false
            } else if c == '>' {
                inside_tag = false;
                false
            } else {
                !inside_tag
            }
        })
        .collect()
}
/// Parses an ICGEM catalog page, labelling every model found as
/// [`ICGEMBody::Earth`].
///
/// The text of the row's object cell is ignored; row extraction is delegated
/// to `parse_catalog`.
///
/// # Errors
///
/// Propagates the error from `parse_catalog`, which fails when no entries can
/// be extracted from `html`.
pub fn parse_earth_catalog(html: &str) -> Result<Vec<IndexEntry>, BraheError> {
    let always_earth = |_: &str| ICGEMBody::Earth;
    parse_catalog(html, always_earth)
}
/// Parses an ICGEM catalog page whose rows name the body each model
/// belongs to.
///
/// The trimmed object text of each row is mapped through
/// `ICGEMBody::from_name`; rows whose object text is empty are labelled
/// `ICGEMBody::Other("unknown")`.
///
/// # Errors
///
/// Propagates the error from `parse_catalog`, which fails when no entries can
/// be extracted from `html`.
pub fn parse_celestial_catalog(html: &str) -> Result<Vec<IndexEntry>, BraheError> {
    parse_catalog(html, |object_text| match object_text {
        "" => ICGEMBody::Other("unknown".to_string()),
        named => ICGEMBody::from_name(named),
    })
}
fn parse_catalog<F>(html: &str, classify_body: F) -> Result<Vec<IndexEntry>, BraheError>
where
F: Fn(&str) -> ICGEMBody,
{
let doc = Html::parse_document(html);
let row_sel = Selector::parse("tr").unwrap();
let cell_sel = Selector::parse("td").unwrap();
let link_sel = Selector::parse("a[href*='/getmodel/gfc/']").unwrap();
let mut entries = Vec::new();
for tr in doc.select(&row_sel) {
let cells: Vec<_> = tr.select(&cell_sel).collect();
if cells.len() < 8 {
continue;
}
let download_cell = &cells[7];
let gfc_links: Vec<_> = download_cell.select(&link_sel).collect();
if gfc_links.is_empty() {
continue;
}
let object_text = cells[1].text().collect::<String>();
let object_text = object_text.trim();
let body = classify_body(object_text);
let name = cells[2].text().collect::<String>().trim().to_string();
if name.is_empty() {
continue;
}
let year_text = cells[3].text().collect::<String>();
let year = year_text
.split_whitespace()
.find_map(|tok| {
tok.parse::<u16>()
.ok()
.filter(|&y| (1900..=2100).contains(&y))
});
let degree_text = text_with_br_spaces(&cells[4]);
let cell_degrees: Vec<u32> = degree_text
.split_whitespace()
.filter_map(|tok| tok.parse::<u32>().ok())
.filter(|&n| n >= 2)
.collect();
for (i, link) in gfc_links.iter().enumerate() {
let href = match link.value().attr("href") {
Some(h) => h.to_string(),
None => continue,
};
let degree = cell_degrees.get(i).copied().or_else(|| {
let filename = href
.rsplit('/')
.next()
.unwrap_or("")
.trim_end_matches(".gfc");
filename.rfind('_').and_then(|pos| {
filename[pos + 1..].parse::<u32>().ok().filter(|&n| n >= 2)
})
});
let degree = match degree {
Some(d) => d,
None => continue,
};
entries.push(IndexEntry {
body: body.clone(),
name: name.clone(),
year,
degree,
download_path: href,
});
}
}
if entries.is_empty() {
return Err(BraheError::Error(
"ICGEM catalog parse returned no entries — page format may have changed".to_string(),
));
}
Ok(entries)
}
#[cfg(test)]
#[cfg_attr(coverage_nightly, coverage(off))]
mod tests {
    use super::*;

    // Saved catalog pages used as parser fixtures.
    const EARTH_FIXTURE: &str =
        include_str!("../../../test_assets/icgem/tom_longtime_sample.html");
    const CELESTIAL_FIXTURE: &str =
        include_str!("../../../test_assets/icgem/tom_celestial_sample.html");

    /// Every entry parsed from the Earth fixture is well-formed.
    #[test]
    fn test_parse_earth_catalog_has_entries() {
        let parsed = parse_earth_catalog(EARTH_FIXTURE).unwrap();
        assert!(
            !parsed.is_empty(),
            "expected at least 1 Earth model entry, got {}",
            parsed.len()
        );
        for entry in parsed.iter() {
            assert_eq!(entry.body, ICGEMBody::Earth);
            assert!(!entry.name.is_empty());
            assert!(entry.download_path.contains("/getmodel/gfc/"));
            assert!(entry.degree >= 2);
        }
    }

    /// A page without any table rows is reported as an error.
    #[test]
    fn test_parse_earth_catalog_empty_html_errors() {
        assert!(parse_earth_catalog("<html><body></body></html>").is_err());
    }

    /// A row with several download links takes one degree per link from the
    /// degree cell.
    #[test]
    fn test_parse_earth_catalog_multi_variant_degrees_from_cell() {
        let parsed = parse_earth_catalog(EARTH_FIXTURE).unwrap();
        let mut degrees: Vec<u32> = parsed
            .iter()
            .filter(|entry| entry.name == "WHU-CASM-UGM2025_2159")
            .map(|entry| entry.degree)
            .collect();
        assert_eq!(degrees.len(), 4, "expected 4 download variants for WHU-CASM-UGM2025_2159");
        degrees.sort();
        assert_eq!(
            degrees,
            vec![760, 2190, 5500, 11000],
            "degrees must come from the Degree column, not the filename suffix"
        );
    }

    /// A single-link row still resolves its degree.
    #[test]
    fn test_parse_earth_catalog_single_variant_uses_cell_name() {
        let parsed = parse_earth_catalog(EARTH_FIXTURE).unwrap();
        let egm = parsed
            .iter()
            .find(|entry| entry.name == "EGM2008")
            .expect("EGM2008 entry missing");
        assert_eq!(egm.degree, 2190);
    }

    /// Degrees that happen to look like years are not filtered out.
    #[test]
    fn test_parse_earth_catalog_keeps_year_range_degrees() {
        let parsed = parse_earth_catalog(EARTH_FIXTURE).unwrap();
        let has_degree_1949 = parsed.iter().any(|entry| entry.degree == 1949);
        assert!(
            has_degree_1949,
            "expected at least one entry with degree 1949 (e.g. EIGEN-6C2 or EIGEN-6C3stat)"
        );
    }

    /// The celestial fixture yields non-Earth bodies.
    #[test]
    fn test_parse_celestial_catalog_assigns_bodies() {
        use std::collections::HashSet;

        let parsed = parse_celestial_catalog(CELESTIAL_FIXTURE).unwrap();
        assert!(!parsed.is_empty());

        let bodies: HashSet<_> = parsed.iter().map(|entry| entry.body.clone()).collect();
        let has_moon_or_mars = [ICGEMBody::Moon, ICGEMBody::Mars]
            .iter()
            .any(|wanted| bodies.contains(wanted));
        assert!(
            has_moon_or_mars,
            "expected at least one Moon or Mars entry; got bodies: {:?}",
            bodies
        );
        assert!(!bodies.contains(&ICGEMBody::Earth));
    }
}