use reqwest::blocking::Client;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use thiserror::Error;
use url::Url;
/// Minimal HTTP abstraction used by `NistParser`, which is generic over this trait so the
/// transport can be replaced by any other implementation (see `NistParser::with_client`).
pub trait HttpClient {
/// Fetches `url` and returns the response body as text, or the `reqwest` error that occurred.
fn get_text(&self, url: &str) -> Result<String, reqwest::Error>;
}
/// `HttpClient` implementation backed by the blocking `reqwest` client.
impl HttpClient for Client {
    /// Issues a GET request for `url` and reads the complete response body as a string.
    ///
    /// Both the request itself and the body decoding can fail; either error is returned as is.
    fn get_text(&self, url: &str) -> Result<String, reqwest::Error> {
        let response = self.get(url).send()?;
        response.text()
    }
}
/// Errors produced while looking up a substance or evaluating the retrieved data.
#[derive(Debug, Error)]
#[allow(dead_code)]
pub enum NistError {
/// An HTTP request failed; wraps the underlying `reqwest` error.
#[error("Network error: {0}")]
NetworkError(#[from] reqwest::Error),
/// A URL could not be parsed or joined; wraps the underlying `url` error.
#[error("URL parsing error: {0}")]
UrlError(#[from] url::ParseError),
/// Returned by `NistParser::get_data` when an `h1` heading of the search page contains
/// "Not Found".
#[error("Substance not found")]
SubstanceNotFound,
/// Returned by `NistInput::extract_coefficients` when no stored temperature range with a
/// complete coefficient set covers the requested temperature.
#[error("Invalid data format")]
InvalidDataFormat,
}
/// Thermochemical data collected for one substance. Every field is optional because a lookup
/// may request only part of the data or the page may not contain it.
#[derive(Debug, Serialize, Deserialize, Clone)]
#[allow(non_snake_case)]
pub struct NistInput {
/// Heat-capacity (Shomate) coefficients, one inner vector per temperature range;
/// `cp[i]` is used together with `T[i]`.
pub cp: Option<Vec<Vec<f64>>>,
/// Temperature ranges; each inner vector is read as `[lower, upper]` (printed in K).
pub T: Option<Vec<Vec<f64>>>,
/// Formation enthalpy (printed in kJ/mol).
pub dh: Option<f64>,
/// Standard entropy (printed in J/mol·K).
pub ds: Option<f64>,
/// Molar mass (printed in g/mol).
pub molar_mass: Option<f64>,
}
/// Phase of the substance for which data is requested. It selects which thermochemistry page
/// and which heat-capacity table are read.
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(dead_code)]
pub enum Phase {
/// Gas phase.
Gas,
/// Solid phase (read from the condensed-phase page).
Solid,
/// Liquid phase (read from the condensed-phase page).
Liquid,
}
/// Selects which pieces of data `NistParser::get_data` extracts from the page.
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(dead_code)]
pub enum SearchType {
/// Heat-capacity coefficients and their temperature ranges.
Cp,
/// Formation enthalpy; the entropy is extracted in the same pass.
DeltaH,
/// Standard entropy; the formation enthalpy is extracted in the same pass.
DeltaS,
/// Molar mass only.
MolarMass,
/// Everything listed above.
All,
}
impl Phase {
pub fn as_str(&self) -> &'static str {
match self {
Phase::Gas => "gas",
Phase::Solid => "solid",
Phase::Liquid => "liquid",
}
}
}
/// Scraper for the NIST Chemistry WebBook (`webbook.nist.gov`), generic over the HTTP client
/// used to download pages.
pub struct NistParser<C: HttpClient> {
// Client through which every page download goes.
client: C,
}
impl NistParser<Client> {
pub fn new() -> Self {
Self {
client: Client::new(),
}
}
}
impl<C: HttpClient> NistParser<C> {
#[allow(dead_code)]
pub fn with_client(client: C) -> Self {
Self { client }
}
/// Looks up `substance` and returns the data selected by `search_type` for the given `phase`.
///
/// The lookup proceeds in steps, printing progress to stdout along the way:
/// 1. build the search URL and download the search page;
/// 2. follow the first entry of a result list, if the search page contains one;
/// 3. locate the link to the phase-specific thermochemistry page;
/// 4. download and parse that page.
///
/// When no formation enthalpy was extracted and `substance` is one of the known simple
/// substances, the enthalpy is set to `0.0`.
///
/// # Errors
/// * `NistError::NetworkError` if any download fails.
/// * `NistError::UrlError` if a URL cannot be built or joined.
/// * `NistError::SubstanceNotFound` if the search page reports that nothing was found.
pub fn get_data(
    &self,
    substance: &str,
    search_type: SearchType,
    phase: Phase,
) -> Result<NistInput, NistError> {
    let url = self.construct_url(substance)?;
    println!("\n \n URL found: {}", url);
    let html = self.fetch_page(&url)?;
    if !self.check_substance_exists(&html) {
        return Err(NistError::SubstanceNotFound);
    }
    let url_of_substance = self.get_url_of_substance(&html, &url)?;
    println!(
        "\n \n URL of substance {} found: {}",
        substance, url_of_substance
    );
    let html_of_substance = self.fetch_page(&url_of_substance)?;
    let final_url = self.get_final_url(&html_of_substance, &url_of_substance, phase)?;
    // `parse_data` downloads `final_url` itself, so the page is not fetched here as well;
    // doing so would only duplicate the request and throw the result away.
    println!("\n \n Final URL found: {}", final_url);
    let mut data = self.parse_data(&final_url, search_type, phase)?;
    if data.dh.is_none() && self.is_simple_substance(substance) {
        data.dh = Some(0.0);
    }
    Ok(data)
}
/// Builds the WebBook search URL for `substance` (spaces are removed first).
///
/// The kind of query is chosen from the shape of the input:
/// * contains `-`: looked up by `ID`;
/// * otherwise contains an ASCII digit: looked up by `Formula` (with `NoIon=on`);
/// * otherwise: looked up by `Name`.
///
/// All queries request SI units. The substance is percent-encoded as a query value, so
/// characters such as `&`, `#`, `+` or `=` cannot truncate the query or inject parameters.
///
/// # Errors
/// Returns `NistError::UrlError` if the URL cannot be parsed.
pub fn construct_url(&self, substance: &str) -> Result<Url, NistError> {
    const SEARCH_ENDPOINT: &str = "https://webbook.nist.gov/cgi/cbook.cgi";

    let substance = substance.replace(' ', "");
    let value = substance.as_str();
    // Parameter order is kept exactly as in the hand-written query strings.
    let params: Vec<(&str, &str)> = if value.contains('-') {
        vec![("ID", value), ("Units", "SI")]
    } else if value.chars().any(|c| c.is_ascii_digit()) {
        vec![("Formula", value), ("NoIon", "on"), ("Units", "SI")]
    } else {
        vec![("Name", value), ("Units", "SI")]
    };
    Ok(Url::parse_with_params(SEARCH_ENDPOINT, &params)?)
}
fn fetch_page(&self, url: &Url) -> Result<String, NistError> {
Ok(self.client.get_text(url.as_str())?)
}
/// Returns `false` when any `h1` heading of the page contains the text "Not Found", and
/// `true` otherwise (including when the page has no `h1` at all).
fn check_substance_exists(&self, html: &str) -> bool {
    let page = Html::parse_document(html);
    let headings = Selector::parse("h1").unwrap();
    let reports_not_found = page
        .select(&headings)
        .any(|heading| heading.text().collect::<String>().contains("Not Found"));
    !reports_not_found
}
/// Resolves the page of the substance itself.
///
/// If `html` contains a result list (`ol li a`), the `href` of its first link is resolved
/// against `original_url` and returned. If there is no such link, or it has no `href`, the
/// search page is taken to be the substance page and `original_url` is returned unchanged.
///
/// The link is resolved with `Url::join`, so root-relative, relative and absolute hrefs are
/// all handled, in the same way `get_final_url` resolves its link.
///
/// # Errors
/// Returns `NistError::UrlError` if the `href` cannot be resolved into a valid URL.
fn get_url_of_substance(&self, html: &str, original_url: &Url) -> Result<Url, NistError> {
    let document = Html::parse_document(html);
    if let Ok(selector) = Selector::parse("ol li a") {
        // Only the first search hit is considered.
        if let Some(first_result) = document.select(&selector).next() {
            if let Some(href) = first_result.value().attr("href") {
                return Ok(original_url.join(href)?);
            }
        }
    }
    Ok(original_url.clone())
}
/// Finds the link to the thermochemistry page for `phase` on the substance page.
///
/// The first `<a>` whose text contains the expected caption and which carries an `href` is
/// resolved against `url_of_substance`. Solid and liquid share the condensed-phase caption.
/// When no such link exists, `url_of_substance` itself is returned.
///
/// # Errors
/// Returns `NistError::UrlError` if the `href` cannot be joined onto `url_of_substance`.
fn get_final_url(
    &self,
    html: &str,
    url_of_substance: &Url,
    phase: Phase,
) -> Result<Url, NistError> {
    let wanted_caption = match phase {
        Phase::Gas => "Gas phase thermochemistry data",
        Phase::Solid | Phase::Liquid => "Condensed phase thermochemistry data",
    };
    let page = Html::parse_document(html);
    let anchors = Selector::parse("a").unwrap();
    let target = page
        .select(&anchors)
        .filter(|anchor| anchor.text().collect::<String>().contains(wanted_caption))
        .find_map(|anchor| anchor.value().attr("href"));
    match target {
        Some(href) => url_of_substance.join(href).map_err(NistError::UrlError),
        None => Ok(url_of_substance.clone()),
    }
}
/// Returns `true` when `substance` is exactly one of the listed formulas of simple
/// (elemental) substances. The comparison is case-sensitive and does no trimming.
fn is_simple_substance(&self, substance: &str) -> bool {
    const SIMPLE_SUBSTANCES: &[&str] = &[
        "H2", "He", "Li", "Be", "B", "C", "N2", "O2", "F2", "Ne", "Na", "Mg", "Al", "Si", "P",
        "S", "Cl2", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
        "Ga", "Ge", "As", "Se", "Br2", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru",
        "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I2", "Xe", "Cs", "Ba", "La", "Ce",
        "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf",
        "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn",
        "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm",
        "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl",
        "Mc", "Lv", "Ts", "Og",
    ];
    SIMPLE_SUBSTANCES.iter().any(|&known| known == substance)
}
/// Downloads `url` and extracts the parts of the page selected by `search_type`.
///
/// `DeltaH` and `DeltaS` both fill the enthalpy and the entropy; `All` fills every field.
/// Fields that were not requested, or not found on the page, stay `None`. Progress and the
/// final result are printed to stdout.
///
/// # Errors
/// Returns `NistError::NetworkError` if the download fails, or whatever error an extraction
/// step reports.
fn parse_data(
    &self,
    url: &Url,
    search_type: SearchType,
    phase: Phase,
) -> Result<NistInput, NistError> {
    print!("\n \n Fetching data...");
    let page_source = self.fetch_page(url)?;
    let page = Html::parse_document(&page_source);

    // Translate the request into the three independent extraction steps.
    let (wants_cp, wants_thermo, wants_molar_mass) = match search_type {
        SearchType::Cp => (true, false, false),
        SearchType::DeltaH | SearchType::DeltaS => (false, true, false),
        SearchType::MolarMass => (false, false, true),
        SearchType::All => (true, true, true),
    };

    let mut result = NistInput {
        cp: None,
        T: None,
        dh: None,
        ds: None,
        molar_mass: None,
    };
    if wants_cp {
        let (coefficients, ranges) = self.extract_cp(&page, phase)?;
        result.cp = coefficients;
        result.T = ranges;
    }
    if wants_thermo {
        let (enthalpy, entropy) = self.extract_thermodynamic_data(&page, phase)?;
        result.dh = enthalpy;
        result.ds = entropy;
    }
    if wants_molar_mass {
        result.molar_mass = self.extract_molar_mass(&page)?;
    }
    println!("\n \n Data found: {:?} \n \n", result);
    Ok(result)
}
/// Extracts the Shomate heat-capacity table for `phase` from `document`.
///
/// Returns `(coefficients, temperature_ranges)`:
/// * `temperature_ranges[k]` holds the numbers found in the k-th `td` of the first table row;
/// * `coefficients[k]` holds, in row order, the numeric values found in the k-th `td` of
///   every following row (only the first whitespace-separated token of a cell is parsed).
///
/// Cells that do not start with a number are skipped, and so are cells in columns the header
/// row does not have. Values are stored by their real column position, so a data row that is
/// wider than the header, or that has a non-numeric cell in front of a numeric one, can
/// neither cause a panic nor shift values into the wrong column.
///
/// Returns `(None, None)` when the table is missing or its header row has no `td` cells.
fn extract_cp(
    &self,
    document: &Html,
    phase: Phase,
) -> Result<(Option<Vec<Vec<f64>>>, Option<Vec<Vec<f64>>>), NistError> {
    let table_query = match phase {
        Phase::Gas => "table[aria-label='Gas Phase Heat Capacity (Shomate Equation)']",
        Phase::Solid => "table[aria-label='Solid Phase Heat Capacity (Shomate Equation)']",
        Phase::Liquid => "table[aria-label='Liquid Phase Heat Capacity (Shomate Equation)']",
    };
    // The selectors are constant, so they are parsed once rather than once per row.
    let table_selector = Selector::parse(table_query).unwrap();
    let row_selector = Selector::parse("tr").unwrap();
    let cell_selector = Selector::parse("td").unwrap();

    if let Some(table) = document.select(&table_selector).next() {
        println!("\n \n found table: {:?} \n \n", table);
        let mut rows = table.select(&row_selector);

        // First row: one temperature range per `td`.
        #[allow(non_snake_case)]
        let headers_T: Vec<Vec<f64>> = rows
            .next()
            .map(|header_row| {
                header_row
                    .select(&cell_selector)
                    .map(|cell| {
                        cell.text()
                            .collect::<String>()
                            .split_whitespace()
                            .filter_map(|token| token.parse::<f64>().ok())
                            .collect::<Vec<f64>>()
                    })
                    .collect()
            })
            .unwrap_or_default();

        // One coefficient list per temperature range.
        let mut coefficients: Vec<Vec<f64>> = (0..headers_T.len())
            .map(|_| Vec::with_capacity(8))
            .collect();

        // Remaining rows: one coefficient per range, stored under its own column.
        for row in rows {
            for (column, cell) in row.select(&cell_selector).enumerate() {
                let text = cell.text().collect::<String>();
                let value = text
                    .split_whitespace()
                    .next()
                    .and_then(|token| token.parse::<f64>().ok());
                if let (Some(value), Some(slot)) = (value, coefficients.get_mut(column)) {
                    slot.push(value);
                }
            }
        }

        // `coefficients` has exactly one entry per header cell, so one check covers both.
        if !headers_T.is_empty() {
            print!("Cp(T) parsed");
            return Ok((Some(coefficients), Some(headers_T)));
        }
    }
    Ok((None, None))
}
/// Reads the formation enthalpy and the entropy for `phase` from the first `<table>` of
/// `document`, returned as `(enthalpy, entropy)`.
///
/// A row is considered when it has at least two `td` cells. The first cell is the label,
/// the second the value; anything from `±` onwards is ignored before parsing.
/// * enthalpy: the label contains `H°`, `f` and the phase name;
/// * entropy: otherwise, the label starts with `S°` and contains the phase name.
///
/// When several rows match, the last one that parses wins. Returns `(None, None)` when the
/// document has no table. The table is dumped to stdout for debugging.
fn extract_thermodynamic_data(
    &self,
    document: &Html,
    phase: Phase,
) -> Result<(Option<f64>, Option<f64>), NistError> {
    let table_selector = Selector::parse("table").unwrap();
    let table = match document.select(&table_selector).next() {
        Some(table) => table,
        None => return Ok((None, None)),
    };
    println!("\n \n table: {:?} \n \n", table);

    let row_selector = Selector::parse("tr").unwrap();
    let cell_selector = Selector::parse("td").unwrap();
    let phase_name = phase.as_str();
    let mut enthalpy = None;
    let mut entropy = None;

    for row in table.select(&row_selector) {
        let texts: Vec<String> = row
            .select(&cell_selector)
            .map(|cell| cell.text().collect::<String>())
            .collect();
        if texts.len() < 2 {
            continue;
        }

        let label = texts[0].as_str();
        let is_enthalpy =
            label.contains("H°") && label.contains("f") && label.contains(phase_name);
        // An enthalpy row is never also treated as an entropy row.
        let is_entropy = !is_enthalpy && label.starts_with("S°") && label.contains(phase_name);
        if !is_enthalpy && !is_entropy {
            continue;
        }

        // Keep only the number in front of an optional "± uncertainty" suffix.
        let parsed = texts[1]
            .trim()
            .split('±')
            .next()
            .and_then(|number| number.trim().parse::<f64>().ok());
        if let Some(value) = parsed {
            if is_enthalpy {
                enthalpy = Some(value);
            } else {
                entropy = Some(value);
            }
        }
    }
    Ok((enthalpy, entropy))
}
/// Finds the molar mass in `document`.
///
/// Scans the `li` elements in document order for one whose text contains "Molecular weight"
/// and returns the number between the first and the second `:` of that text. Items whose
/// value does not parse are skipped; `Ok(None)` is returned when nothing matches.
fn extract_molar_mass(&self, document: &Html) -> Result<Option<f64>, NistError> {
    let list_items = Selector::parse("li").unwrap();
    let molar_mass = document
        .select(&list_items)
        .map(|item| item.text().collect::<String>())
        .filter(|text| text.contains("Molecular weight"))
        .find_map(|text| {
            text.split(':')
                .nth(1)
                .and_then(|raw| raw.trim().parse::<f64>().ok())
        });
    Ok(molar_mass)
}
}
// `T` (temperature) is part of the public method signatures, hence the lint exemption.
#[allow(non_snake_case)]
impl NistInput {
    /// Creates an empty record with every field set to `None`.
    pub fn new() -> NistInput {
        NistInput {
            cp: None,
            T: None,
            dh: None,
            ds: None,
            molar_mass: None,
        }
    }

    /// Returns the first eight coefficients of the first temperature range that contains `T`
    /// (bounds inclusive).
    ///
    /// Range `i` is paired with coefficient set `i`. Entries that are malformed (a range with
    /// fewer than two bounds, a coefficient set with fewer than eight values, or a range
    /// without a matching coefficient set) are skipped instead of causing a panic.
    ///
    /// # Errors
    /// Returns `NistError::InvalidDataFormat` when the ranges or coefficients are missing or
    /// no usable range contains `T`.
    pub fn extract_coefficients(
        &mut self,
        T: f64,
    ) -> Result<(f64, f64, f64, f64, f64, f64, f64, f64), NistError> {
        if let (Some(ranges), Some(coefficient_sets)) = (&self.T, &self.cp) {
            for (range, coeffs) in ranges.iter().zip(coefficient_sets) {
                // Slice patterns double as the length checks.
                if let ([lower, upper, ..], [a, b, c, d, e, f, g, h, ..]) =
                    (range.as_slice(), coeffs.as_slice())
                {
                    if T >= *lower && T <= *upper {
                        return Ok((*a, *b, *c, *d, *e, *f, *g, *h));
                    }
                }
            }
        }
        Err(NistError::InvalidDataFormat)
    }

    /// Evaluates the Shomate equations at temperature `T` and returns `(cp, dh, ds)`.
    ///
    /// With `t = T / 1000` and coefficients `A..H` for the range containing `T`:
    /// * `cp = A + B·t + C·t² + D·t³ + E/t²`
    /// * `dh = A·t + B·t²/2 + C·t³/3 + D·t⁴/4 − E/t + F − H + dh0`, where `dh0` is the
    ///   stored formation enthalpy, or `0.0` when none is stored
    /// * `ds = A·ln(t) + B·t + C·t²/2 + D·t³/3 − E/(2·t²) + G`
    ///
    /// NOTE(review): units are presumably those of the NIST tables (J/mol·K for `cp` and
    /// `ds`, kJ/mol for `dh`, `T` in K); confirm against the data source.
    ///
    /// # Errors
    /// Returns `NistError::InvalidDataFormat` when no coefficients cover `T`.
    pub fn caclc_cp_dh_ds(&mut self, T: f64) -> Result<(f64, f64, f64), NistError> {
        let (a, b, c, d, e, f, g, h) = self.extract_coefficients(T)?;
        let t = T / 1000.0;
        let cp = a + b * t + c * t.powi(2) + d * t.powi(3) + e / t.powi(2);
        let dh0 = self.dh.unwrap_or(0.0);
        let dh = a * t + (b * t.powi(2)) / 2.0 + (c * t.powi(3)) / 3.0 + (d * t.powi(4)) / 4.0
            - e / t
            + f
            - h
            + dh0;
        let ds = a * t.ln() + b * t + (c * t.powi(2)) / 2.0 + (d * t.powi(3)) / 3.0
            - e / (2.0 * t.powi(2))
            + g;
        Ok((cp, dh, ds))
    }

    /// Prints a human-readable summary of the stored data to stdout.
    ///
    /// Incomplete entries are printed as far as they go: a range without two bounds gets a
    /// placeholder, and the `A..D` / `E..H` lines appear only when those coefficients exist.
    pub fn pretty_print(&self) {
        println!("NIST Data:");
        if let Some(molar_mass) = self.molar_mass {
            println!(" Molar mass: {} g/mol", molar_mass);
        }
        // NOTE(review): `None` is reported as a simple substance, although it also occurs
        // when the enthalpy was simply never requested; confirm this wording is intended.
        match self.dh {
            Some(dh) => println!(" Formation enthalpy: {} kJ/mol", dh),
            None => println!(" Formation enthalpy: 0.0 kJ/mol (simple substance)"),
        }
        if let Some(ds) = self.ds {
            println!(" Standard entropy: {} J/mol·K", ds);
        }
        if let (Some(ranges), Some(coefficient_sets)) = (&self.T, &self.cp) {
            println!(" Heat capacity coefficients:");
            for (i, (range, coeffs)) in ranges.iter().zip(coefficient_sets.iter()).enumerate() {
                match range.as_slice() {
                    [lower, upper, ..] => {
                        println!(" Range {}: {:.0}-{:.0} K", i + 1, lower, upper)
                    }
                    _ => println!(" Range {}: (temperature bounds missing)", i + 1),
                }
                if let [a, b, c, d, ..] = coeffs.as_slice() {
                    println!(" A={:.3e}, B={:.3e}, C={:.3e}, D={:.3e}", a, b, c, d);
                }
                if let [_, _, _, _, e, f, g, h, ..] = coeffs.as_slice() {
                    println!(" E={:.3e}, F={:.3e}, G={:.3e}, H={:.3e}", e, f, g, h);
                }
            }
        }
    }
}