use crate::{BgpkitCommonsError, Result};
use chrono::NaiveDate;
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Organization record as it appears on one line of the as2org JSONL dataset.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct As2orgJsonOrg {
/// Organization identifier; also accepted under the JSON key `organizationId`.
#[serde(alias = "organizationId")]
org_id: String,
/// Value of the optional `changed` key; `None` when the key is absent.
changed: Option<String>,
/// Organization name; an absent key deserializes to the empty string.
#[serde(default)]
name: String,
/// Value of the `country` key.
country: String,
/// Value of the `source` key.
source: String,
/// Record kind; also accepted under the JSON key `type`.
// NOTE(review): `alias` only affects deserialization, so this field is
// serialized as `data_type` rather than `type` — confirm that is intended.
#[serde(alias = "type")]
data_type: String,
}
/// Autonomous-system record as it appears on one line of the as2org JSONL dataset.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct As2orgJsonAs {
/// AS number kept as text; it is parsed to `u32` when the lookup maps are built.
asn: String,
/// Value of the optional `changed` key; `None` when the key is absent.
changed: Option<String>,
/// AS name; an absent key deserializes to the empty string.
#[serde(default)]
name: String,
/// Optional opaque identifier; also accepted under the JSON key `opaqueId`.
#[serde(alias = "opaqueId")]
opaque_id: Option<String>,
/// Identifier of the owning organization; also accepted under the JSON key `organizationId`.
#[serde(alias = "organizationId")]
org_id: String,
/// Value of the `source` key.
source: String,
/// Record kind, read from and written to the JSON key `type`.
#[serde(rename = "type")]
data_type: String,
}
/// One parsed dataset line: either an organization record or an AS record.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum As2orgJsonEntry {
/// An organization record.
Org(As2orgJsonOrg),
/// An autonomous-system record.
As(As2orgJsonAs),
}
/// Flattened view of an AS joined with its owning organization, as returned by
/// `As2org::get_as_info`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct As2orgAsInfo {
/// AS number that was looked up.
pub asn: u32,
/// Name taken from the AS record.
pub name: String,
/// `country` value of the owning organization record.
pub country_code: String,
/// Identifier of the owning organization.
pub org_id: String,
/// Name of the owning organization.
pub org_name: String,
/// `source` value of the owning organization record.
pub source: String,
}
/// In-memory index over an as2org dataset, built by `As2org::new`.
#[allow(dead_code)]
pub struct As2org {
/// AS records keyed by their parsed AS number.
as_map: HashMap<u32, As2orgJsonAs>,
/// Organization records keyed by organization identifier.
org_map: HashMap<String, As2orgJsonOrg>,
/// Organization identifier of each known AS number.
as_to_org: HashMap<u32, String>,
/// AS numbers that share each organization identifier.
org_to_as: HashMap<String, Vec<u32>>,
}
/// Base URL of the CAIDA AS-organizations dataset directory. It is fetched as
/// the file listing, and dataset file names are appended to it after a `/`.
const BASE_URL: &str = "https://publicdata.caida.org/datasets/as-organizations";
impl As2org {
    /// Builds the lookup tables from an as2org JSONL dataset.
    ///
    /// When `data_file_path` is `Some`, that location is handed to the parser
    /// unchanged. When it is `None`, the newest dated file listed under
    /// `BASE_URL` is used instead.
    ///
    /// AS records whose `asn` field does not parse as `u32` are skipped. If the
    /// same AS number or organization id appears more than once, the record
    /// read last wins.
    ///
    /// # Errors
    ///
    /// Returns an error when the file listing cannot be obtained or is empty,
    /// or when the dataset cannot be read or a line fails to parse.
    pub fn new(data_file_path: Option<String>) -> Result<Self> {
        let entries = match data_file_path {
            Some(path) => parse_as2org_file(path.as_str())?,
            None => {
                let url = get_most_recent_data()?;
                parse_as2org_file(url.as_str())?
            }
        };

        let mut as_map: HashMap<u32, As2orgJsonAs> = HashMap::new();
        let mut org_map: HashMap<String, As2orgJsonOrg> = HashMap::new();
        for entry in entries {
            match entry {
                As2orgJsonEntry::As(as_entry) => {
                    if let Ok(asn) = as_entry.asn.parse::<u32>() {
                        as_map.insert(asn, as_entry);
                    }
                }
                As2orgJsonEntry::Org(org_entry) => {
                    org_map.insert(org_entry.org_id.clone(), org_entry);
                }
            }
        }

        // The reverse indexes are derived from the de-duplicated `as_map`, so
        // an AS that was overwritten above is attributed to one organization only.
        let mut as_to_org: HashMap<u32, String> = HashMap::with_capacity(as_map.len());
        let mut org_to_as: HashMap<String, Vec<u32>> = HashMap::new();
        for (asn, as_entry) in &as_map {
            as_to_org.insert(*asn, as_entry.org_id.clone());
            org_to_as
                .entry(as_entry.org_id.clone())
                .or_default()
                .push(*asn);
        }
        // `HashMap` iteration order is arbitrary; sorting once here makes the
        // sibling lists (and therefore `get_siblings`) deterministic.
        for asns in org_to_as.values_mut() {
            asns.sort_unstable();
        }

        Ok(Self {
            as_map,
            org_map,
            as_to_org,
            org_to_as,
        })
    }

    /// Lists the dated dataset files together with the date parsed from each
    /// file name, sorted by ascending date.
    ///
    /// # Errors
    ///
    /// Returns an error when the listing cannot be fetched.
    #[allow(dead_code)]
    pub fn get_all_files_with_dates() -> Result<Vec<(String, NaiveDate)>> {
        get_all_files_with_dates()
    }

    /// Returns the URL of the `latest.as-org2info.jsonl.gz` file under `BASE_URL`.
    #[allow(dead_code)]
    pub fn get_latest_file_url() -> String {
        format!("{BASE_URL}/latest.as-org2info.jsonl.gz")
    }

    /// Looks up `asn` and joins it with its organization record.
    ///
    /// Returns `None` when the AS number is unknown or when the organization
    /// id it refers to has no organization record.
    pub fn get_as_info(&self, asn: u32) -> Option<As2orgAsInfo> {
        let as_entry = self.as_map.get(&asn)?;
        let org_id = as_entry.org_id.as_str();
        let org_entry = self.org_map.get(org_id)?;
        Some(As2orgAsInfo {
            asn,
            name: as_entry.name.clone(),
            country_code: org_entry.country.clone(),
            org_id: org_id.to_string(),
            org_name: org_entry.name.clone(),
            source: org_entry.source.clone(),
        })
    }

    /// Returns every AS that shares an organization id with `asn`, in
    /// ascending AS-number order.
    ///
    /// The list is built from all ASes of that organization, so `asn` itself
    /// is among the candidates. Candidates for which `get_as_info` yields
    /// `None` are omitted. Returns `None` when `asn` is unknown.
    #[allow(dead_code)]
    pub fn get_siblings(&self, asn: u32) -> Option<Vec<As2orgAsInfo>> {
        let org_id = self.as_to_org.get(&asn)?;
        let org_asns = self.org_to_as.get(org_id)?;
        Some(
            org_asns
                .iter()
                .filter_map(|sibling| self.get_as_info(*sibling))
                .collect(),
        )
    }

    /// Reports whether both AS numbers are known and map to the same
    /// organization id. Unknown AS numbers are never siblings of anything.
    #[allow(dead_code)]
    pub fn are_siblings(&self, asn1: u32, asn2: u32) -> bool {
        match (self.as_to_org.get(&asn1), self.as_to_org.get(&asn2)) {
            (Some(org1), Some(org2)) => org1 == org2,
            _ => false,
        }
    }
}
/// Repairs text in which two-byte UTF-8 sequences starting with byte `0xC3`
/// were decoded as Latin-1, e.g. `é` showing up as `Ã©`.
///
/// Every `Ã` (U+00C3) that is immediately followed by a character in
/// U+0080..=U+00BF is replaced, together with that character, by the single
/// character U+00C0..=U+00FF the byte pair originally encoded. All other
/// characters are copied through unchanged.
fn fix_latin1_misinterpretation(input: &str) -> String {
    // U+00C3 is how the UTF-8 lead byte 0xC3 looks once misread as Latin-1.
    const MISREAD_LEAD: char = '\u{00C3}';

    // A repair only ever shrinks the text, so the input length is an upper bound.
    let mut result = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();
    while let Some(c) = chars.next() {
        // Only peek at the follower: if it is not a continuation byte it must
        // stay in the stream, because it may itself start a sequence to repair.
        let repaired = if c == MISREAD_LEAD {
            chars
                .peek()
                .copied()
                .filter(|next| ('\u{0080}'..='\u{00BF}').contains(next))
                .and_then(|next| char::from_u32(0xC0 + (next as u32 - 0x0080)))
        } else {
            None
        };
        match repaired {
            Some(fixed) => {
                // Consume the follower now that it has been merged into `fixed`.
                chars.next();
                result.push(fixed);
            }
            None => result.push(c),
        }
    }
    result
}
/// Reads an as2org JSONL dataset from `path` and parses every line.
///
/// Each line is first passed through `fix_latin1_misinterpretation`. A line
/// containing the exact text `"type":"ASN"` is parsed as an AS record; every
/// other line is parsed as an organization record.
///
/// # Errors
///
/// Returns an error when the input cannot be opened or read, and a CAIDA
/// data-source error as soon as any line fails to deserialize.
fn parse_as2org_file(path: &str) -> Result<Vec<As2orgJsonEntry>> {
    let mut entries: Vec<As2orgJsonEntry> = Vec::new();
    for raw_line in oneio::read_lines(path)? {
        let record = fix_latin1_misinterpretation(&raw_line?);
        let entry = if record.contains(r#""type":"ASN""#) {
            serde_json::from_str::<As2orgJsonAs>(record.as_str())
                .map(As2orgJsonEntry::As)
                .map_err(|e| {
                    BgpkitCommonsError::data_source_error(
                        crate::errors::data_sources::CAIDA,
                        format!("error parsing AS line: {}", e),
                    )
                })?
        } else {
            serde_json::from_str::<As2orgJsonOrg>(record.as_str())
                .map(As2orgJsonEntry::Org)
                .map_err(|e| {
                    BgpkitCommonsError::data_source_error(
                        crate::errors::data_sources::CAIDA,
                        format!("error parsing Org line: {}", e),
                    )
                })?
        };
        entries.push(entry);
    }
    Ok(entries)
}
/// Lists the dated as2org dataset files advertised at `BASE_URL`.
///
/// The page at `BASE_URL` is fetched and scanned for names of the form
/// `<8 digits>.as-org2info.jsonl.gz`. The pattern is wrapped in greedy `.*`,
/// and `.` does not cross line breaks, so at most one name is taken per line
/// of the page. Names whose leading eight characters are not a valid
/// `%Y%m%d` date are skipped.
///
/// Returns `(url, date)` pairs sorted by ascending date.
///
/// # Errors
///
/// Returns an error when the pattern fails to compile or the page cannot be
/// fetched.
fn get_all_files_with_dates() -> Result<Vec<(String, NaiveDate)>> {
    let data_link: Regex = Regex::new(r".*(\d{8}\.as-org2info\.jsonl\.gz).*")
        .map_err(|e| BgpkitCommonsError::Internal(format!("failed to compile regex: {}", e)))?;
    let content = oneio::read_to_string(BASE_URL)?;
    let mut res: Vec<(String, NaiveDate)> = data_link
        .captures_iter(content.as_str())
        .filter_map(|cap| {
            let file = &cap[1];
            // `\d` is Unicode-aware, so the eight matched digits are not
            // guaranteed to be eight bytes; a checked slice skips such names
            // instead of panicking on a non-character boundary.
            let date = NaiveDate::parse_from_str(file.get(..8)?, "%Y%m%d").ok()?;
            Some((format!("{BASE_URL}/{file}"), date))
        })
        .collect();
    res.sort_by_key(|(_, date)| *date);
    Ok(res)
}
/// Returns the URL of the newest dated dataset file.
///
/// `get_all_files_with_dates` yields its entries sorted by ascending date, so
/// the final entry is the most recent one.
///
/// # Errors
///
/// Propagates listing errors and returns a CAIDA data-source error when the
/// listing contains no dataset files.
fn get_most_recent_data() -> Result<String> {
    let mut dated_files = get_all_files_with_dates()?;
    match dated_files.pop() {
        Some((url, _date)) => Ok(url),
        None => Err(BgpkitCommonsError::data_source_error(
            crate::errors::data_sources::CAIDA,
            "No dataset files found",
        )),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Covers both the pass-through inputs and the actual repair path.
    #[test]
    fn test_fix_latin1_misinterpretation() {
        let normal = "Hello World";
        assert_eq!(fix_latin1_misinterpretation(normal), normal);
        assert_eq!(fix_latin1_misinterpretation(""), "");
        let ascii_only = "ACME Corporation Inc.";
        assert_eq!(fix_latin1_misinterpretation(ascii_only), ascii_only);
        let special = "Test @#$%^&*() 123";
        assert_eq!(fix_latin1_misinterpretation(special), special);

        // U+00C3 followed by U+0080..=U+00BF collapses into one character.
        assert_eq!(
            fix_latin1_misinterpretation("Caf\u{00C3}\u{00A9}"),
            "Caf\u{00E9}"
        );
        assert_eq!(
            fix_latin1_misinterpretation("Telef\u{00C3}\u{00B3}nica"),
            "Telef\u{00F3}nica"
        );
        // A lead character with no follower, or with an out-of-range
        // follower, is left untouched.
        assert_eq!(fix_latin1_misinterpretation("\u{00C3}"), "\u{00C3}");
        assert_eq!(fix_latin1_misinterpretation("\u{00C3}x"), "\u{00C3}x");
    }

    #[test]
    fn test_as2org_json_org_deserialization() {
        let json = r#"{"organizationId":"ORG-TEST","changed":"20240101","name":"Test Org","country":"US","source":"ARIN","type":"Organization"}"#;
        let org: As2orgJsonOrg = serde_json::from_str(json).unwrap();
        assert_eq!(org.org_id, "ORG-TEST");
        assert_eq!(org.name, "Test Org");
        assert_eq!(org.country, "US");
        assert_eq!(org.source, "ARIN");
        assert_eq!(org.data_type, "Organization");
    }

    #[test]
    fn test_as2org_json_org_with_missing_optional_fields() {
        let json = r#"{"organizationId":"ORG-TEST2","name":"Another Org","country":"DE","source":"RIPE","type":"Organization"}"#;
        let org: As2orgJsonOrg = serde_json::from_str(json).unwrap();
        assert_eq!(org.org_id, "ORG-TEST2");
        assert!(org.changed.is_none());
    }

    #[test]
    fn test_as2org_json_as_deserialization() {
        let json = r#"{"asn":"12345","changed":"20240101","name":"Test AS","opaqueId":"opaque123","organizationId":"ORG-TEST","source":"ARIN","type":"ASN"}"#;
        let as_entry: As2orgJsonAs = serde_json::from_str(json).unwrap();
        assert_eq!(as_entry.asn, "12345");
        assert_eq!(as_entry.name, "Test AS");
        assert_eq!(as_entry.org_id, "ORG-TEST");
        assert_eq!(as_entry.opaque_id, Some("opaque123".to_string()));
        assert_eq!(as_entry.source, "ARIN");
        assert_eq!(as_entry.data_type, "ASN");
    }

    #[test]
    fn test_as2org_json_as_with_missing_optional_fields() {
        let json = r#"{"asn":"67890","name":"Minimal AS","organizationId":"ORG-MIN","source":"APNIC","type":"ASN"}"#;
        let as_entry: As2orgJsonAs = serde_json::from_str(json).unwrap();
        assert_eq!(as_entry.asn, "67890");
        assert!(as_entry.changed.is_none());
        assert!(as_entry.opaque_id.is_none());
    }

    /// The `name` key is absent here, so the field falls back to its default.
    #[test]
    fn test_as2org_json_as_with_empty_name() {
        let json = r#"{"asn":"11111","organizationId":"ORG-EMPTY","source":"RIPE","type":"ASN"}"#;
        let as_entry: As2orgJsonAs = serde_json::from_str(json).unwrap();
        assert_eq!(as_entry.name, "");
    }

    #[test]
    fn test_as2org_as_info_struct() {
        let info = As2orgAsInfo {
            asn: 12345,
            name: "Test AS".to_string(),
            country_code: "US".to_string(),
            org_id: "ORG-TEST".to_string(),
            org_name: "Test Organization".to_string(),
            source: "ARIN".to_string(),
        };
        assert_eq!(info.asn, 12345);
        assert_eq!(info.name, "Test AS");
        assert_eq!(info.country_code, "US");
        assert_eq!(info.org_id, "ORG-TEST");
        assert_eq!(info.org_name, "Test Organization");
        assert_eq!(info.source, "ARIN");
    }

    #[test]
    fn test_as2org_as_info_serialization() {
        let info = As2orgAsInfo {
            asn: 12345,
            name: "Test AS".to_string(),
            country_code: "US".to_string(),
            org_id: "ORG-TEST".to_string(),
            org_name: "Test Organization".to_string(),
            source: "ARIN".to_string(),
        };
        let json = serde_json::to_string(&info).unwrap();
        assert!(json.contains("\"asn\":12345"));
        assert!(json.contains("\"name\":\"Test AS\""));
        let deserialized: As2orgAsInfo = serde_json::from_str(&json).unwrap();
        assert_eq!(deserialized.asn, info.asn);
        assert_eq!(deserialized.name, info.name);
    }

    #[test]
    fn test_get_latest_file_url() {
        let url = As2org::get_latest_file_url();
        assert!(url.starts_with("https://publicdata.caida.org/datasets/as-organizations/"));
        assert!(url.ends_with(".as-org2info.jsonl.gz"));
    }

    // The tests below pass `None` to `As2org::new` or fetch the file listing,
    // both of which download from `BASE_URL`; they are ignored by default
    // because they need network access.

    #[test]
    #[ignore]
    fn test_as2org_new_from_latest() {
        let as2org = As2org::new(None).expect("Failed to load AS2org database");
        assert!(!as2org.as_map.is_empty());
        assert!(!as2org.org_map.is_empty());
    }

    #[test]
    #[ignore]
    fn test_as2org_get_as_info_existing() {
        let as2org = As2org::new(None).expect("Failed to load AS2org database");
        let info = as2org.get_as_info(15169);
        assert!(info.is_some());
        let info = info.unwrap();
        assert_eq!(info.asn, 15169);
        assert!(!info.org_id.is_empty());
        assert!(!info.org_name.is_empty());
        assert!(!info.country_code.is_empty());
        assert!(!info.source.is_empty());
    }

    #[test]
    #[ignore]
    fn test_as2org_get_as_info_nonexistent() {
        let as2org = As2org::new(None).expect("Failed to load AS2org database");
        let info = as2org.get_as_info(999999999);
        assert!(info.is_none());
    }

    #[test]
    #[ignore]
    fn test_as2org_get_siblings() {
        let as2org = As2org::new(None).expect("Failed to load AS2org database");
        let siblings = as2org.get_siblings(15169);
        assert!(siblings.is_some());
        let siblings = siblings.unwrap();
        assert!(!siblings.is_empty());
        assert!(siblings.iter().any(|s| s.asn == 15169));
    }

    #[test]
    #[ignore]
    fn test_as2org_get_siblings_nonexistent() {
        let as2org = As2org::new(None).expect("Failed to load AS2org database");
        let siblings = as2org.get_siblings(999999999);
        assert!(siblings.is_none());
    }

    #[test]
    #[ignore]
    fn test_as2org_are_siblings_true() {
        let as2org = As2org::new(None).expect("Failed to load AS2org database");
        assert!(as2org.are_siblings(15169, 36040));
    }

    #[test]
    #[ignore]
    fn test_as2org_are_siblings_false() {
        let as2org = As2org::new(None).expect("Failed to load AS2org database");
        assert!(!as2org.are_siblings(15169, 13335));
    }

    #[test]
    #[ignore]
    fn test_as2org_are_siblings_nonexistent() {
        let as2org = As2org::new(None).expect("Failed to load AS2org database");
        assert!(!as2org.are_siblings(15169, 999999999));
        assert!(!as2org.are_siblings(999999999, 15169));
        assert!(!as2org.are_siblings(999999999, 999999998));
    }

    #[test]
    #[ignore]
    fn test_as2org_are_siblings_same_asn() {
        let as2org = As2org::new(None).expect("Failed to load AS2org database");
        assert!(as2org.are_siblings(15169, 15169));
    }

    #[test]
    #[ignore]
    fn test_as2org_get_all_files_with_dates() {
        let files = As2org::get_all_files_with_dates().expect("Failed to get file list");
        assert!(!files.is_empty());
        // Dates must be non-decreasing from one entry to the next.
        for i in 1..files.len() {
            assert!(files[i].1 >= files[i - 1].1);
        }
        for (url, _) in &files {
            assert!(url.contains("publicdata.caida.org"));
            assert!(url.ends_with(".as-org2info.jsonl.gz"));
        }
    }
}