as2org_rs/lib.rs
1//! as2org-rs: Access CAIDA AS-to-Organization mappings in Rust
2//!
3//! This crate provides a small, dependency-light helper for reading and querying
4//! CAIDA's AS Organizations dataset. It downloads (or opens a local/remote path)
5//! the newline-delimited JSON (JSONL) files published by CAIDA and exposes a
6//! simple API to:
7//!
8//! - Fetch the latest dataset URL from CAIDA
9//! - Load the dataset into memory
10//! - Look up information for a given ASN
11//! - Find all "sibling" ASNs that belong to the same organization
12//! - Test whether two ASNs are siblings (belong to the same org)
13//!
14//! The crate supports local files, HTTP(S) URLs, and gz-compressed inputs via
15//! the `oneio` crate.
16//!
17//! ## Installation
18//!
19//! Add the dependency to your `Cargo.toml`:
20//!
21//! ```toml
22//! [dependencies]
23//! as2org-rs = "1"
24//! ```
25//!
26//! ## Data source
27//! - CAIDA AS Organizations Dataset: <http://www.caida.org/data/as-organizations>
28//!
29//! ## Data model
30//!
31//! Public return type:
32//!
33//! `As2orgAsInfo` contains:
34//! - `asn`: the AS number
35//! - `name`: the name provided for the individual AS number
36//! - `country_code`: the registration country code of the organization
37//! - `org_id`: the CAIDA/WHOIS organization identifier
38//! - `org_name`: the organization's name
39//! - `source`: the RIR or NIR database that contained this entry
40//!
41//! ## Quickstart
42//!
43//! Load the most recent dataset and run typical queries:
44//!
45//! ```rust,no_run
46//! use as2org_rs::As2org;
47//!
48//! // Construct from the latest public dataset (requires network access)
49//! let as2org = As2org::new(None).unwrap();
50//!
51//! // Look up one ASN
52//! let info = as2org.get_as_info(15169).unwrap();
53//! assert_eq!(info.org_id.is_empty(), false);
54//!
55//! // List all siblings for an ASN (ASNs under the same org)
56//! let siblings = as2org.get_siblings(15169).unwrap();
57//! assert!(siblings.iter().any(|s| s.asn == 36040));
58//!
59//! // Check whether two ASNs are siblings
60//! assert!(as2org.are_siblings(15169, 36040));
61//! ```
62//!
63//! ## Offline and custom input
64//!
65//! You can also point to a local file path or a remote URL (HTTP/HTTPS), gzipped
66//! or plain:
67//!
68//! ```rust,no_run
69//! use as2org_rs::As2org;
70//!
71//! // From a local jsonl.gz file
72//! let as2org = As2org::new(Some("/path/to/20250101.as-org2info.jsonl.gz".into())).unwrap();
73//!
74//! // From an explicit HTTPS URL
75//! let as2org = As2org::new(Some("https://publicdata.caida.org/datasets/as-organizations/20250101.as-org2info.jsonl.gz".into())).unwrap();
76//! ```
77//!
78//! ## Errors
79//!
80//! Constructors and helper functions return `anyhow::Result<T>`. For lookups,
81//! the API returns `Option<_>` when a requested ASN or organization is missing.
82//!
83//! ## Notes
84//!
//! - Network access is required when you pass `None` to `As2org::new` (so the
//!   crate can discover and fetch the latest dataset) or when you pass a remote
//!   URL; loading from a local file path does not need it.
87//! - Dataset files can be large; loading them will allocate in-memory maps for
88//! fast queries.
89//! - This crate is not affiliated with CAIDA. Please review CAIDA's data usage
90//! policies before redistribution or heavy automated access.
91
92use anyhow::{anyhow, Result};
93use chrono::NaiveDate;
94use regex::Regex;
95use serde::{Deserialize, Serialize};
96use std::collections::HashMap;
97
/// Organization record as deserialized from one JSON line of the dataset.
///
/// --------------------
/// Organization fields
/// --------------------
/// org_id  : unique ID for the given organization;
///           some will be created by the WHOIS entry and others will be
///           created by our scripts
/// changed : the changed date provided by its WHOIS entry
/// name    : name could be selected from the AUT entry tied to the
///           organization, the AUT entry with the largest customer cone,
///           listed for the organization (if there existed a stand-alone
///           organization), or a human-maintained file
/// country : some WHOIS provide it as an individual field; in other cases
///           it is inferred from the addresses
/// source  : the RIR or NIR database that contained this entry
#[derive(Debug, Clone, Serialize, Deserialize)]
struct As2orgJsonOrg {
    /// Organization identifier; also accepted under the JSON key `organizationId`.
    #[serde(alias = "organizationId")]
    org_id: String,

    /// Changed date from the WHOIS entry; `None` when the key is absent or null.
    changed: Option<String>,

    /// Organization name; falls back to an empty string when the key is absent.
    #[serde(default)]
    name: String,

    /// Country of the organization (required key).
    country: String,

    /// The RIR or NIR database that contained this entry
    source: String,

    /// Record type tag; also accepted under the JSON key `type`.
    #[serde(alias = "type")]
    data_type: String,
}
132
/// AS record as deserialized from one JSON line of the dataset.
///
/// ----------
/// AS fields
/// ----------
/// asn       : the AS number
/// changed   : the changed date provided by its WHOIS entry
/// name      : the name provided for the individual AS number
/// org_id    : maps to an organization entry
/// opaque_id : opaque identifier used by RIR extended delegation format
/// source    : the RIR or NIR database that contained this entry
#[derive(Debug, Clone, Serialize, Deserialize)]
struct As2orgJsonAs {
    /// AS number, kept as the string found in the JSON; it is parsed to `u32`
    /// when the lookup maps are built in `As2org::new`.
    asn: String,

    /// Changed date from the WHOIS entry; `None` when the key is absent or null.
    changed: Option<String>,

    /// AS name; falls back to an empty string when the key is absent.
    #[serde(default)]
    name: String,

    /// Opaque identifier; also accepted under the JSON key `opaqueId`.
    #[serde(alias = "opaqueId")]
    opaque_id: Option<String>,

    /// Identifier of the owning organization; also accepted under the JSON key
    /// `organizationId`.
    #[serde(alias = "organizationId")]
    org_id: String,

    /// The RIR or NIR database that contained this entry
    source: String,

    /// Record type tag, (de)serialized under the JSON key `type`.
    #[serde(rename = "type")]
    data_type: String,
}
165
/// One parsed dataset line: either an organization record or an AS record.
#[derive(Debug, Clone, Serialize, Deserialize)]
enum As2orgJsonEntry {
    /// An organization record.
    Org(As2orgJsonOrg),
    /// An AS record.
    As(As2orgJsonAs),
}
171
#[derive(Debug, Clone, Serialize, Deserialize)]
/// Public information for an Autonomous System (AS) enriched with its organization.
///
/// This struct is returned by high-level query methods like `get_as_info` and
/// `get_siblings` and contains the most commonly used fields for downstream
/// analysis or presentation. All values are owned copies, so the struct stays
/// valid independently of the `As2org` instance that produced it.
pub struct As2orgAsInfo {
    /// The AS number
    pub asn: u32,
    /// The name provided for the individual AS number
    pub name: String,
    /// The registration country code of the organization
    /// (copied from the organization record)
    pub country_code: String,
    /// Organization identifier (as used in the dataset)
    pub org_id: String,
    /// Organization name
    pub org_name: String,
    /// The RIR or NIR database that contained the entry
    /// (`get_as_info` copies this from the organization record)
    pub source: String,
}
192
/// In-memory accessor for CAIDA's AS-to-Organization dataset.
///
/// Construct with `As2org::new`, then perform lookups via `get_as_info`,
/// `get_siblings`, or `are_siblings`. All maps are populated once by the
/// constructor; the query methods only read them.
pub struct As2org {
    /// AS records keyed by AS number.
    as_map: HashMap<u32, As2orgJsonAs>,
    /// Organization records keyed by organization ID.
    org_map: HashMap<String, As2orgJsonOrg>,
    /// AS number -> ID of the organization it belongs to.
    as_to_org: HashMap<u32, String>,
    /// Organization ID -> all AS numbers that reference it.
    org_to_as: HashMap<String, Vec<u32>>,
}
203
/// Base URL of CAIDA's public AS Organizations dataset directory. It is read
/// as the index listing when discovering files and used as the prefix of every
/// dataset file URL this crate builds.
const BASE_URL: &str = "https://publicdata.caida.org/datasets/as-organizations";
205
206impl As2org {
207 /// Create a new `As2org` accessor.
208 ///
209 /// - When `data_file_path` is `None`, the constructor fetches the CAIDA
210 /// index page to discover the most recent `*.as-org2info.jsonl.gz` file
211 /// and reads it via HTTP(S).
212 /// - When `Some(path_or_url)` is provided, the path can be a local file or
213 /// a remote URL. Gzipped files are supported transparently.
214 ///
215 /// Returns `anyhow::Result<Self>` with an initialized in-memory index.
216 pub fn new(data_file_path: Option<String>) -> Result<Self> {
217 let entries = match data_file_path {
218 Some(path) => parse_as2org_file(path.as_str())?,
219 None => {
220 let url = get_most_recent_data()?;
221 parse_as2org_file(url.as_str())?
222 }
223 };
224
225 let mut as_map: HashMap<u32, As2orgJsonAs> = HashMap::new();
226 let mut org_map: HashMap<String, As2orgJsonOrg> = HashMap::new();
227
228 for entry in entries {
229 match entry {
230 As2orgJsonEntry::As(as_entry) => {
231 as_map.insert(as_entry.asn.parse::<u32>().unwrap(), as_entry);
232 }
233 As2orgJsonEntry::Org(org_entry) => {
234 org_map.insert(org_entry.org_id.clone(), org_entry);
235 }
236 }
237 }
238
239 let mut as_to_org: HashMap<u32, String> = HashMap::new();
240 let mut org_to_as: HashMap<String, Vec<u32>> = HashMap::new();
241
242 for (asn, as_entry) in as_map.iter() {
243 as_to_org.insert(*asn, as_entry.org_id.clone());
244 let org_asn = org_to_as.entry(as_entry.org_id.clone()).or_default();
245 org_asn.push(*asn);
246 }
247
248 Ok(Self {
249 as_map,
250 org_map,
251 as_to_org,
252 org_to_as,
253 })
254 }
255
256 /// List all available dataset files published by CAIDA with their dates.
257 ///
258 /// Returns a vector of `(url, date)` pairs sorted by date ascending; the last
259 /// element is the most recent dataset.
260 ///
261 /// This is useful for offline workflows that want to pin to a specific
262 /// snapshot instead of always using the latest.
263 pub fn get_all_files_with_dates() -> Result<Vec<(String, NaiveDate)>> {
264 get_all_files_with_dates()
265 }
266
267 /// Returns the URL for the latest AS-to-Organization dataset file.
268 ///
269 /// This function returns a direct URL to CAIDA's most recent dataset using
270 /// the "latest" symlink. This is a convenience wrapper that formats the
271 /// complete URL string.
272 ///
273 /// # Returns
274 /// A string containing the HTTPS URL to the latest .jsonl.gz dataset file.
275 pub fn get_latest_file_url() -> String {
276 format!("{BASE_URL}/latest.as-org2info.jsonl.gz")
277 }
278
279 /// Get enriched information for a specific ASN, if present.
280 ///
281 /// Returns `None` when the ASN is not found in the loaded dataset.
282 ///
283 /// Example:
284 /// ```rust,no_run
285 /// # use as2org_rs::As2org;
286 /// let db = As2org::new(None).unwrap();
287 /// let info = db.get_as_info(15169).unwrap();
288 /// assert!(!info.org_id.is_empty());
289 /// ```
290 pub fn get_as_info(&self, asn: u32) -> Option<As2orgAsInfo> {
291 let as_entry = self.as_map.get(&asn)?;
292 let org_id = as_entry.org_id.as_str();
293 let org_entry = self.org_map.get(org_id)?;
294 Some(As2orgAsInfo {
295 asn,
296 name: as_entry.name.clone(),
297 country_code: org_entry.country.clone(),
298 org_id: org_id.to_string(),
299 org_name: org_entry.name.clone(),
300 source: org_entry.source.clone(),
301 })
302 }
303
304 /// Return all ASNs that belong to the same organization as the given ASN.
305 ///
306 /// The returned vector includes the queried ASN itself. Returns `None`
307 /// when the ASN is not present in the dataset.
308 ///
309 /// Example:
310 /// ```rust,no_run
311 /// # use as2org_rs::As2org;
312 /// let db = As2org::new(None).unwrap();
313 /// let sibs = db.get_siblings(15169).unwrap();
314 /// assert!(sibs.iter().any(|s| s.asn == 15169));
315 /// ```
316 pub fn get_siblings(&self, asn: u32) -> Option<Vec<As2orgAsInfo>> {
317 let org_id = self.as_to_org.get(&asn)?;
318 let org_asns = self.org_to_as.get(org_id)?.to_vec();
319 Some(
320 org_asns
321 .iter()
322 .map(|asn| self.get_as_info(*asn).unwrap())
323 .collect(),
324 )
325 }
326
327 /// Return `true` if both ASNs belong to the same organization.
328 ///
329 /// Returns `false` if either ASN is missing from the dataset or their
330 /// organization differs.
331 ///
332 /// Example:
333 /// ```rust,no_run
334 /// # use as2org_rs::As2org;
335 /// let db = As2org::new(None).unwrap();
336 /// assert!(db.are_siblings(15169, 36040));
337 /// ```
338 pub fn are_siblings(&self, asn1: u32, asn2: u32) -> bool {
339 let org1 = match self.as_to_org.get(&asn1) {
340 None => return false,
341 Some(o) => o,
342 };
343 let org2 = match self.as_to_org.get(&asn2) {
344 None => return false,
345 Some(o) => o,
346 };
347 org1 == org2
348 }
349}
350
/// Repairs text in which UTF-8 bytes were mistakenly decoded as Latin-1.
///
/// When a character in the range U+00C0..=U+00FF is encoded as UTF-8 (lead
/// byte `0xC3` plus one continuation byte) and those two bytes are then read as
/// Latin-1, it shows up as `Ã` (U+00C3) followed by a character in
/// U+0080..=U+00BF. This function collapses every such pair back into the
/// single intended character; for example `Ã` followed by `©` becomes `é`.
///
/// Only pairs that start with `Ã` are handled. An `Ã` that is not followed by a
/// character in U+0080..=U+00BF is copied through unchanged, and the following
/// character is then examined on its own, so it may itself start a pair.
///
/// # Arguments
///
/// * `input` - A string slice that may contain incorrectly decoded characters.
///
/// # Returns
///
/// A new string with every recognized pair replaced; all other characters are
/// copied unchanged.
fn fix_latin1_misinterpretation(input: &str) -> String {
    // U+00C3 is what the UTF-8 lead byte 0xC3 looks like when read as Latin-1.
    const LEAD: char = '\u{00C3}';

    let mut result = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();

    while let Some(c) = chars.next() {
        if c == LEAD {
            // Peek rather than consume: the next character is only part of the
            // pair if it lies in the continuation range.
            let repaired = chars
                .peek()
                .copied()
                .filter(|next| ('\u{0080}'..='\u{00BF}').contains(next))
                // 0xC0 + offset is the code point the two bytes encoded; it is
                // always within U+00C0..=U+00FF and therefore a valid char.
                .and_then(|next| char::from_u32(0xC0 + (next as u32 - 0x0080)));

            if let Some(fixed) = repaired {
                chars.next();
                result.push(fixed);
                continue;
            }
        }
        result.push(c);
    }

    result
}
402
403/// parse remote AS2Org file into Vec of DataEntry
404fn parse_as2org_file(path: &str) -> Result<Vec<As2orgJsonEntry>> {
405 let mut res: Vec<As2orgJsonEntry> = vec![];
406
407 for line in oneio::read_lines(path)? {
408 let line = fix_latin1_misinterpretation(&line?);
409 if line.contains(r#""type":"ASN""#) {
410 let data = serde_json::from_str::<As2orgJsonAs>(line.as_str());
411 match data {
412 Ok(data) => {
413 res.push(As2orgJsonEntry::As(data));
414 }
415 Err(e) => {
416 eprintln!("error parsing line:\n{}", line.as_str());
417 return Err(anyhow!(e));
418 }
419 }
420 } else {
421 let data = serde_json::from_str::<As2orgJsonOrg>(line.as_str());
422 match data {
423 Ok(data) => {
424 res.push(As2orgJsonEntry::Org(data));
425 }
426 Err(e) => {
427 eprintln!("error parsing line:\n{}", line.as_str());
428 return Err(anyhow!(e));
429 }
430 }
431 }
432 }
433 Ok(res)
434}
435
436/// Returns a vector of tuples containing the full URLs of AS2Org data files and their corresponding dates.
437/// The vector is sorted by dates with the latest date last.
438///
439/// # Returns
440/// - `Result<Vec<(String, NaiveDate)>>` where each tuple contains:
441/// - String: complete URL to the AS2Org data file
442/// - NaiveDate: date extracted from the file name
443fn get_all_files_with_dates() -> Result<Vec<(String, NaiveDate)>> {
444 let data_link: Regex = Regex::new(r".*(\d{8}\.as-org2info\.jsonl\.gz).*")?;
445 let content = oneio::read_to_string(BASE_URL)?;
446 let mut res: Vec<(String, NaiveDate)> = data_link
447 .captures_iter(content.as_str())
448 .map(|cap| {
449 let file = cap[1].to_owned();
450 let date = NaiveDate::parse_from_str(&file[..8], "%Y%m%d")?;
451 Ok((format!("{BASE_URL}/{file}"), date))
452 })
453 .collect::<Result<Vec<_>, chrono::ParseError>>()?;
454 res.sort_by_key(|(_, date)| *date);
455 Ok(res)
456}
457fn get_most_recent_data() -> Result<String> {
458 let files = get_all_files_with_dates()?;
459 let last_file = files
460 .last()
461 .ok_or_else(|| anyhow!("No dataset files found"))?;
462 Ok(last_file.0.clone())
463}
464
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Datelike;
    use std::sync::OnceLock;

    /// Returns a shared `As2org` built from the latest CAIDA dataset.
    ///
    /// The dataset is downloaded on first use and cached for the rest of the
    /// test run, so the tests below trigger one fetch in total instead of one
    /// each. Requires network access.
    fn get_test_db() -> &'static As2org {
        static DB: OnceLock<As2org> = OnceLock::new();
        DB.get_or_init(|| As2org::new(None).expect("Failed to load AS2org database"))
    }

    #[test]
    fn test_new_from_latest() {
        let as2org = get_test_db();
        // Verify the database was loaded by checking if we have some data
        assert!(!as2org.as_map.is_empty());
        assert!(!as2org.org_map.is_empty());
    }

    #[test]
    fn test_get_as_info_existing() {
        let as2org = get_test_db();
        // Test with a well-known ASN (Google)
        let info = as2org.get_as_info(15169);
        assert!(info.is_some());
        let info = info.unwrap();
        assert_eq!(info.asn, 15169);
        assert!(!info.org_id.is_empty());
        assert!(!info.org_name.is_empty());
        assert!(!info.country_code.is_empty());
        assert!(!info.source.is_empty());
    }

    #[test]
    fn test_get_as_info_nonexistent() {
        let as2org = get_test_db();
        // Test with a likely non-existent ASN
        let info = as2org.get_as_info(999999999);
        assert!(info.is_none());
    }

    #[test]
    fn test_get_siblings_existing() {
        let as2org = get_test_db();
        // Test with Google's AS15169
        let siblings = as2org.get_siblings(15169);
        assert!(siblings.is_some());
        let siblings = siblings.unwrap();
        // Should include at least the ASN itself
        assert!(!siblings.is_empty());
        // The queried ASN should be in the siblings list
        assert!(siblings.iter().any(|s| s.asn == 15169));
        // All siblings should have the same org_id
        let org_id = &siblings[0].org_id;
        assert!(siblings.iter().all(|s| s.org_id == *org_id));
    }

    #[test]
    fn test_get_siblings_nonexistent() {
        let as2org = get_test_db();
        let siblings = as2org.get_siblings(999999999);
        assert!(siblings.is_none());
    }

    #[test]
    fn test_are_siblings_true() {
        let as2org = get_test_db();
        let siblings = as2org.get_siblings(15169).unwrap();

        match siblings.iter().find(|s| s.asn != 15169) {
            // Test with an actual sibling if one exists
            Some(sibling) => assert!(as2org.are_siblings(15169, sibling.asn)),
            // An ASN is always a sibling to itself
            None => assert!(as2org.are_siblings(15169, 15169)),
        }
    }

    #[test]
    fn test_are_siblings_false() {
        let as2org = get_test_db();
        // Google (15169) and Cloudflare (13335) should not be siblings
        assert!(!as2org.are_siblings(15169, 13335));
    }

    #[test]
    fn test_are_siblings_nonexistent() {
        let as2org = get_test_db();
        // Test with non-existent ASN
        assert!(!as2org.are_siblings(15169, 999999999));
        assert!(!as2org.are_siblings(999999999, 15169));
        assert!(!as2org.are_siblings(999999999, 999999998));
    }

    #[test]
    fn test_get_latest_file_url() {
        let url = As2org::get_latest_file_url();
        assert!(url.starts_with("https://"));
        assert!(url.contains("as-org2info.jsonl.gz"));
    }

    #[test]
    fn test_get_all_files_with_dates() {
        let files = As2org::get_all_files_with_dates();
        assert!(files.is_ok());
        let files = files.unwrap();
        assert!(!files.is_empty());

        // Verify format of returned data
        for (url, date) in &files {
            assert!(url.starts_with("https://"));
            assert!(url.contains("as-org2info.jsonl.gz"));
            // Date should be plausible (just checking it's not a default)
            assert!(date.year() >= 2000);
        }

        // Verify sorting (dates should be in ascending order)
        for pair in files.windows(2) {
            assert!(pair[1].1 >= pair[0].1);
        }
    }

    #[test]
    fn test_as_to_org_mapping() {
        let as2org = get_test_db();
        // Verify internal consistency: as_to_org should map to valid orgs
        for (asn, org_id) in as2org.as_to_org.iter().take(10) {
            assert!(as2org.org_map.contains_key(org_id));
            assert!(as2org.as_map.contains_key(asn));
        }
    }

    #[test]
    fn test_org_to_as_mapping() {
        let as2org = get_test_db();
        // Verify internal consistency: org_to_as should map to valid ASNs
        for (org_id, asns) in as2org.org_to_as.iter().take(10) {
            assert!(as2org.org_map.contains_key(org_id));
            for asn in asns {
                assert!(as2org.as_map.contains_key(asn));
                assert_eq!(as2org.as_to_org.get(asn).unwrap(), org_id);
            }
        }
    }

    #[test]
    fn test_fix_latin1_misinterpretation() {
        // U+00C3 U+00A9 is how the UTF-8 bytes of 'é' look when read as
        // Latin-1; escapes keep the test independent of this file's encoding.
        let fixed = fix_latin1_misinterpretation("Test \u{00C3}\u{00A9} string");
        assert_eq!(fixed, "Test \u{00E9} string");

        // A lead character without a valid continuation is left untouched
        assert_eq!(fix_latin1_misinterpretation("\u{00C3}"), "\u{00C3}");
        assert_eq!(fix_latin1_misinterpretation("\u{00C3}x"), "\u{00C3}x");

        // Test with no special characters
        let input = "Normal ASCII string";
        let fixed = fix_latin1_misinterpretation(input);
        assert_eq!(input, fixed);
    }

    #[test]
    fn test_as2org_as_info_fields() {
        let as2org = get_test_db();
        let info = as2org.get_as_info(15169).unwrap();

        // Verify all fields are populated
        assert_eq!(info.asn, 15169);
        assert!(!info.name.is_empty());
        assert!(!info.country_code.is_empty());
        assert!(!info.org_id.is_empty());
        assert!(!info.org_name.is_empty());
        assert!(!info.source.is_empty());
    }

    #[test]
    fn test_siblings_consistency() {
        let as2org = get_test_db();
        let asn = 15169;
        let siblings = as2org.get_siblings(asn).unwrap();

        // All siblings should return the same sibling list
        for sibling in &siblings {
            let sibling_siblings = as2org.get_siblings(sibling.asn).unwrap();
            assert_eq!(siblings.len(), sibling_siblings.len());

            // All ASNs should be present in both lists
            for s in &siblings {
                assert!(sibling_siblings.iter().any(|ss| ss.asn == s.asn));
            }
        }
    }
}