as2org_rs/
lib.rs

1//! as2org-rs: Access CAIDA AS-to-Organization mappings in Rust
2//!
3//! This crate provides a small, dependency-light helper for reading and querying
4//! CAIDA's AS Organizations dataset. It downloads (or opens a local/remote path)
5//! the newline-delimited JSON (JSONL) files published by CAIDA and exposes a
6//! simple API to:
7//!
8//! - Fetch the latest dataset URL from CAIDA
9//! - Load the dataset into memory
10//! - Look up information for a given ASN
11//! - Find all "sibling" ASNs that belong to the same organization
12//! - Test whether two ASNs are siblings (belong to the same org)
13//!
14//! The crate supports local files, HTTP(S) URLs, and gz-compressed inputs via
15//! the `oneio` crate.
16//!
17//! ## Installation
18//!
19//! Add the dependency to your `Cargo.toml`:
20//!
21//! ```toml
22//! [dependencies]
23//! as2org-rs = "1"
24//! ```
25//!
26//! ## Data source
27//! - CAIDA AS Organizations Dataset: <http://www.caida.org/data/as-organizations>
28//!
29//! ## Data model
30//!
31//! Public return type:
32//!
33//! `As2orgAsInfo` contains:
34//! - `asn`: the AS number
35//! - `name`: the name provided for the individual AS number
36//! - `country_code`: the registration country code of the organization
37//! - `org_id`: the CAIDA/WHOIS organization identifier
38//! - `org_name`: the organization's name
39//! - `source`: the RIR or NIR database that contained this entry
40//!
41//! ## Quickstart
42//!
43//! Load the most recent dataset and run typical queries:
44//!
45//! ```rust,no_run
46//! use as2org_rs::As2org;
47//!
48//! // Construct from the latest public dataset (requires network access)
49//! let as2org = As2org::new(None).unwrap();
50//!
51//! // Look up one ASN
52//! let info = as2org.get_as_info(15169).unwrap();
53//! assert_eq!(info.org_id.is_empty(), false);
54//!
55//! // List all siblings for an ASN (ASNs under the same org)
56//! let siblings = as2org.get_siblings(15169).unwrap();
57//! assert!(siblings.iter().any(|s| s.asn == 36040));
58//!
59//! // Check whether two ASNs are siblings
60//! assert!(as2org.are_siblings(15169, 36040));
61//! ```
62//!
63//! ## Offline and custom input
64//!
65//! You can also point to a local file path or a remote URL (HTTP/HTTPS), gzipped
66//! or plain:
67//!
68//! ```rust,no_run
69//! use as2org_rs::As2org;
70//!
71//! // From a local jsonl.gz file
72//! let as2org = As2org::new(Some("/path/to/20250101.as-org2info.jsonl.gz".into())).unwrap();
73//!
74//! // From an explicit HTTPS URL
75//! let as2org = As2org::new(Some("https://publicdata.caida.org/datasets/as-organizations/20250101.as-org2info.jsonl.gz".into())).unwrap();
76//! ```
77//!
78//! ## Errors
79//!
80//! Constructors and helper functions return `anyhow::Result<T>`. For lookups,
81//! the API returns `Option<_>` when a requested ASN or organization is missing.
82//!
83//! ## Notes
84//!
//! - Network access is required when you pass `None` to `As2org::new` (so the
//!   crate can discover and fetch the latest dataset) or when you pass a remote
//!   URL; loading a local file works offline.
87//! - Dataset files can be large; loading them will allocate in-memory maps for
88//!   fast queries.
89//! - This crate is not affiliated with CAIDA. Please review CAIDA's data usage
90//!   policies before redistribution or heavy automated access.
91
92use anyhow::{anyhow, Result};
93use chrono::NaiveDate;
94use regex::Regex;
95use serde::{Deserialize, Serialize};
96use std::collections::HashMap;
97
98/// Organization JSON format
99///
100/// --------------------
101/// Organization fields
102/// --------------------
103/// org_id  : unique ID for the given organization
104///            some will be created by the WHOIS entry and others will be
105///            created by our scripts
106/// changed : the changed date provided by its WHOIS entry
107/// name    : name could be selected from the AUT entry tied to the
108///            organization, the AUT entry with the largest customer cone,
109///           listed for the organization (if there existed an stand alone
110///            organization), or a human maintained file.
111/// country : some WHOIS provide as a individual field. In other cases
112///            we inferred it from the addresses
113/// source  : the RIR or NIR database which was contained this entry
114#[derive(Debug, Clone, Serialize, Deserialize)]
115struct As2orgJsonOrg {
116    #[serde(alias = "organizationId")]
117    org_id: String,
118
119    changed: Option<String>,
120
121    #[serde(default)]
122    name: String,
123
124    country: String,
125
126    /// The RIR or NIR database that contained this entry
127    source: String,
128
129    #[serde(alias = "type")]
130    data_type: String,
131}
132
133/// AS Json format
134///
135/// ----------
136/// AS fields
137/// ----------
138/// asn     : the AS number
139/// changed : the changed date provided by its WHOIS entry
140/// name    : the name provide for the individual AS number
141/// org_id  : maps to an organization entry
142/// opaque_id   : opaque identifier used by RIR extended delegation format
143/// source  : the RIR or NIR database which was contained this entry
144#[derive(Debug, Clone, Serialize, Deserialize)]
145struct As2orgJsonAs {
146    asn: String,
147
148    changed: Option<String>,
149
150    #[serde(default)]
151    name: String,
152
153    #[serde(alias = "opaqueId")]
154    opaque_id: Option<String>,
155
156    #[serde(alias = "organizationId")]
157    org_id: String,
158
159    /// The RIR or NIR database that contained this entry
160    source: String,
161
162    #[serde(rename = "type")]
163    data_type: String,
164}
165
166#[derive(Debug, Clone, Serialize, Deserialize)]
167enum As2orgJsonEntry {
168    Org(As2orgJsonOrg),
169    As(As2orgJsonAs),
170}
171
172#[derive(Debug, Clone, Serialize, Deserialize)]
173/// Public information for an Autonomous System (AS) enriched with its organization.
174///
175/// This struct is returned by high-level query methods like `get_as_info` and
176/// `get_siblings` and contains the most commonly used fields for downstream
177/// analysis or presentation.
178pub struct As2orgAsInfo {
179    /// The AS number
180    pub asn: u32,
181    /// The name provided for the individual AS number
182    pub name: String,
183    /// The registration country code of the organization
184    pub country_code: String,
185    /// Organization identifier (as used in the dataset)
186    pub org_id: String,
187    /// Organization name
188    pub org_name: String,
189    /// The RIR database that contained this entry
190    pub source: String,
191}
192
193/// In-memory accessor for CAIDA's AS-to-Organization dataset.
194///
195/// Construct with `As2org::new`, then perform lookups via `get_as_info`,
196/// `get_siblings`, or `are_siblings`.
197pub struct As2org {
198    as_map: HashMap<u32, As2orgJsonAs>,
199    org_map: HashMap<String, As2orgJsonOrg>,
200    as_to_org: HashMap<u32, String>,
201    org_to_as: HashMap<String, Vec<u32>>,
202}
203
/// Base URL of CAIDA's AS Organizations dataset directory. It is fetched as-is to
/// list the available snapshots, and dataset file names are appended to it.
const BASE_URL: &str = "https://publicdata.caida.org/datasets/as-organizations";
205
206impl As2org {
207    /// Create a new `As2org` accessor.
208    ///
209    /// - When `data_file_path` is `None`, the constructor fetches the CAIDA
210    ///   index page to discover the most recent `*.as-org2info.jsonl.gz` file
211    ///   and reads it via HTTP(S).
212    /// - When `Some(path_or_url)` is provided, the path can be a local file or
213    ///   a remote URL. Gzipped files are supported transparently.
214    ///
215    /// Returns `anyhow::Result<Self>` with an initialized in-memory index.
216    pub fn new(data_file_path: Option<String>) -> Result<Self> {
217        let entries = match data_file_path {
218            Some(path) => parse_as2org_file(path.as_str())?,
219            None => {
220                let url = get_most_recent_data()?;
221                parse_as2org_file(url.as_str())?
222            }
223        };
224
225        let mut as_map: HashMap<u32, As2orgJsonAs> = HashMap::new();
226        let mut org_map: HashMap<String, As2orgJsonOrg> = HashMap::new();
227
228        for entry in entries {
229            match entry {
230                As2orgJsonEntry::As(as_entry) => {
231                    as_map.insert(as_entry.asn.parse::<u32>().unwrap(), as_entry);
232                }
233                As2orgJsonEntry::Org(org_entry) => {
234                    org_map.insert(org_entry.org_id.clone(), org_entry);
235                }
236            }
237        }
238
239        let mut as_to_org: HashMap<u32, String> = HashMap::new();
240        let mut org_to_as: HashMap<String, Vec<u32>> = HashMap::new();
241
242        for (asn, as_entry) in as_map.iter() {
243            as_to_org.insert(*asn, as_entry.org_id.clone());
244            let org_asn = org_to_as.entry(as_entry.org_id.clone()).or_default();
245            org_asn.push(*asn);
246        }
247
248        Ok(Self {
249            as_map,
250            org_map,
251            as_to_org,
252            org_to_as,
253        })
254    }
255
256    /// List all available dataset files published by CAIDA with their dates.
257    ///
258    /// Returns a vector of `(url, date)` pairs sorted by date ascending; the last
259    /// element is the most recent dataset.
260    ///
261    /// This is useful for offline workflows that want to pin to a specific
262    /// snapshot instead of always using the latest.
263    pub fn get_all_files_with_dates() -> Result<Vec<(String, NaiveDate)>> {
264        get_all_files_with_dates()
265    }
266
267    /// Returns the URL for the latest AS-to-Organization dataset file.
268    ///
269    /// This function returns a direct URL to CAIDA's most recent dataset using
270    /// the "latest" symlink. This is a convenience wrapper that formats the
271    /// complete URL string.
272    ///
273    /// # Returns
274    /// A string containing the HTTPS URL to the latest .jsonl.gz dataset file.
275    pub fn get_latest_file_url() -> String {
276        format!("{BASE_URL}/latest.as-org2info.jsonl.gz")
277    }
278
279    /// Get enriched information for a specific ASN, if present.
280    ///
281    /// Returns `None` when the ASN is not found in the loaded dataset.
282    ///
283    /// Example:
284    /// ```rust,no_run
285    /// # use as2org_rs::As2org;
286    /// let db = As2org::new(None).unwrap();
287    /// let info = db.get_as_info(15169).unwrap();
288    /// assert!(!info.org_id.is_empty());
289    /// ```
290    pub fn get_as_info(&self, asn: u32) -> Option<As2orgAsInfo> {
291        let as_entry = self.as_map.get(&asn)?;
292        let org_id = as_entry.org_id.as_str();
293        let org_entry = self.org_map.get(org_id)?;
294        Some(As2orgAsInfo {
295            asn,
296            name: as_entry.name.clone(),
297            country_code: org_entry.country.clone(),
298            org_id: org_id.to_string(),
299            org_name: org_entry.name.clone(),
300            source: org_entry.source.clone(),
301        })
302    }
303
304    /// Return all ASNs that belong to the same organization as the given ASN.
305    ///
306    /// The returned vector includes the queried ASN itself. Returns `None`
307    /// when the ASN is not present in the dataset.
308    ///
309    /// Example:
310    /// ```rust,no_run
311    /// # use as2org_rs::As2org;
312    /// let db = As2org::new(None).unwrap();
313    /// let sibs = db.get_siblings(15169).unwrap();
314    /// assert!(sibs.iter().any(|s| s.asn == 15169));
315    /// ```
316    pub fn get_siblings(&self, asn: u32) -> Option<Vec<As2orgAsInfo>> {
317        let org_id = self.as_to_org.get(&asn)?;
318        let org_asns = self.org_to_as.get(org_id)?.to_vec();
319        Some(
320            org_asns
321                .iter()
322                .map(|asn| self.get_as_info(*asn).unwrap())
323                .collect(),
324        )
325    }
326
327    /// Return `true` if both ASNs belong to the same organization.
328    ///
329    /// Returns `false` if either ASN is missing from the dataset or their
330    /// organization differs.
331    ///
332    /// Example:
333    /// ```rust,no_run
334    /// # use as2org_rs::As2org;
335    /// let db = As2org::new(None).unwrap();
336    /// assert!(db.are_siblings(15169, 36040));
337    /// ```
338    pub fn are_siblings(&self, asn1: u32, asn2: u32) -> bool {
339        let org1 = match self.as_to_org.get(&asn1) {
340            None => return false,
341            Some(o) => o,
342        };
343        let org2 = match self.as_to_org.get(&asn2) {
344            None => return false,
345            Some(o) => o,
346        };
347        org1 == org2
348    }
349}
350
/// Repairs text in which UTF-8 bytes were mistakenly decoded as Latin-1.
///
/// A character in `U+00C0..=U+00FF` is encoded in UTF-8 as the byte `0xC3`
/// followed by a continuation byte in `0x80..=0xBF`. When those two bytes are
/// decoded as Latin-1 they show up as `'Ã'` (`U+00C3`) followed by a character
/// in `U+0080..=U+00BF`. This function collapses each such pair back into the
/// single intended character and leaves everything else untouched.
///
/// Only the `'Ã'` lead character is handled; other lead bytes are not repaired.
///
/// # Arguments
///
/// * `input` - A string slice that may contain incorrectly decoded characters.
///
/// # Returns
///
/// A new string with every matching pair replaced; input without such pairs is
/// returned unchanged.
fn fix_latin1_misinterpretation(input: &str) -> String {
    // 'Ã': how the UTF-8 lead byte 0xC3 looks after being decoded as Latin-1.
    const LEAD: char = '\u{00C3}';

    // The repaired text is never longer than the input.
    let mut result = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();

    while let Some(c) = chars.next() {
        if c == LEAD {
            // Consume the following character only when it really is a
            // continuation character. Otherwise it must stay in the stream,
            // because it may itself be the lead of the next pair (e.g. "ÃÃ©").
            if let Some(cont) = chars.next_if(|n| ('\u{0080}'..='\u{00BF}').contains(n)) {
                let code_point = 0xC0 + (cont as u32 - 0x0080);
                match char::from_u32(code_point) {
                    Some(fixed) => result.push(fixed),
                    // Not expected for 0xC0..=0xFF; keep the original pair.
                    None => {
                        result.push(c);
                        result.push(cont);
                    }
                }
                continue;
            }
        }
        result.push(c);
    }

    result
}
402
403/// parse remote AS2Org file into Vec of DataEntry
404fn parse_as2org_file(path: &str) -> Result<Vec<As2orgJsonEntry>> {
405    let mut res: Vec<As2orgJsonEntry> = vec![];
406
407    for line in oneio::read_lines(path)? {
408        let line = fix_latin1_misinterpretation(&line?);
409        if line.contains(r#""type":"ASN""#) {
410            let data = serde_json::from_str::<As2orgJsonAs>(line.as_str());
411            match data {
412                Ok(data) => {
413                    res.push(As2orgJsonEntry::As(data));
414                }
415                Err(e) => {
416                    eprintln!("error parsing line:\n{}", line.as_str());
417                    return Err(anyhow!(e));
418                }
419            }
420        } else {
421            let data = serde_json::from_str::<As2orgJsonOrg>(line.as_str());
422            match data {
423                Ok(data) => {
424                    res.push(As2orgJsonEntry::Org(data));
425                }
426                Err(e) => {
427                    eprintln!("error parsing line:\n{}", line.as_str());
428                    return Err(anyhow!(e));
429                }
430            }
431        }
432    }
433    Ok(res)
434}
435
436/// Returns a vector of tuples containing the full URLs of AS2Org data files and their corresponding dates.
437/// The vector is sorted by dates with the latest date last.
438///
439/// # Returns
440/// - `Result<Vec<(String, NaiveDate)>>` where each tuple contains:
441///   - String: complete URL to the AS2Org data file
442///   - NaiveDate: date extracted from the file name
443fn get_all_files_with_dates() -> Result<Vec<(String, NaiveDate)>> {
444    let data_link: Regex = Regex::new(r".*(\d{8}\.as-org2info\.jsonl\.gz).*")?;
445    let content = oneio::read_to_string(BASE_URL)?;
446    let mut res: Vec<(String, NaiveDate)> = data_link
447        .captures_iter(content.as_str())
448        .map(|cap| {
449            let file = cap[1].to_owned();
450            let date = NaiveDate::parse_from_str(&file[..8], "%Y%m%d")?;
451            Ok((format!("{BASE_URL}/{file}"), date))
452        })
453        .collect::<Result<Vec<_>, chrono::ParseError>>()?;
454    res.sort_by_key(|(_, date)| *date);
455    Ok(res)
456}
457fn get_most_recent_data() -> Result<String> {
458    let files = get_all_files_with_dates()?;
459    let last_file = files
460        .last()
461        .ok_or_else(|| anyhow!("No dataset files found"))?;
462    Ok(last_file.0.clone())
463}
464
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Datelike;
    use std::sync::OnceLock;

    /// Returns a shared `As2org` instance built from the latest public dataset.
    ///
    /// The dataset is downloaded and parsed on first use only; every later call,
    /// from any test thread, reuses the same instance. These tests therefore
    /// require network access.
    fn get_test_db() -> &'static As2org {
        static DB: OnceLock<As2org> = OnceLock::new();
        DB.get_or_init(|| As2org::new(None).expect("Failed to load AS2org database"))
    }

    #[test]
    fn test_new_from_latest() {
        let as2org = get_test_db();
        // Verify the database was loaded by checking that both maps hold data
        assert!(!as2org.as_map.is_empty());
        assert!(!as2org.org_map.is_empty());
    }

    #[test]
    fn test_get_as_info_existing() {
        let as2org = get_test_db();
        // Test with a well-known ASN (Google)
        let info = as2org
            .get_as_info(15169)
            .expect("AS15169 should be present in the dataset");
        assert_eq!(info.asn, 15169);
        assert!(!info.org_id.is_empty());
        assert!(!info.org_name.is_empty());
        assert!(!info.country_code.is_empty());
        assert!(!info.source.is_empty());
    }

    #[test]
    fn test_get_as_info_nonexistent() {
        let as2org = get_test_db();
        // Test with a likely non-existent ASN
        assert!(as2org.get_as_info(999999999).is_none());
    }

    #[test]
    fn test_get_siblings_existing() {
        let as2org = get_test_db();
        // Test with Google's AS15169
        let siblings = as2org
            .get_siblings(15169)
            .expect("AS15169 should have a sibling list");
        // Should include at least the ASN itself
        assert!(!siblings.is_empty());
        // The queried ASN should be in the siblings list
        assert!(siblings.iter().any(|s| s.asn == 15169));
        // All siblings should have the same org_id
        let org_id = &siblings[0].org_id;
        assert!(siblings.iter().all(|s| s.org_id == *org_id));
    }

    #[test]
    fn test_get_siblings_nonexistent() {
        let as2org = get_test_db();
        assert!(as2org.get_siblings(999999999).is_none());
    }

    #[test]
    fn test_are_siblings_true() {
        let as2org = get_test_db();
        assert!(as2org.get_as_info(15169).is_some());
        let siblings = as2org.get_siblings(15169).unwrap();

        match siblings.iter().find(|s| s.asn != 15169) {
            // Test with an actual sibling if one exists
            Some(sibling) => assert!(as2org.are_siblings(15169, sibling.asn)),
            // An ASN is always a sibling to itself
            None => assert!(as2org.are_siblings(15169, 15169)),
        }
    }

    #[test]
    fn test_are_siblings_false() {
        let as2org = get_test_db();
        // Google (15169) and Cloudflare (13335) should not be siblings
        assert!(!as2org.are_siblings(15169, 13335));
    }

    #[test]
    fn test_are_siblings_nonexistent() {
        let as2org = get_test_db();
        // Test with non-existent ASN
        assert!(!as2org.are_siblings(15169, 999999999));
        assert!(!as2org.are_siblings(999999999, 15169));
        assert!(!as2org.are_siblings(999999999, 999999998));
    }

    #[test]
    fn test_get_latest_file_url() {
        let url = As2org::get_latest_file_url();
        assert!(url.starts_with("https://"));
        assert!(url.contains("as-org2info.jsonl.gz"));
    }

    #[test]
    fn test_get_all_files_with_dates() {
        let files = As2org::get_all_files_with_dates().expect("listing should be available");
        assert!(!files.is_empty());

        // Verify format of returned data
        for (url, date) in &files {
            assert!(url.starts_with("https://"));
            assert!(url.contains("as-org2info.jsonl.gz"));
            // Date should be valid (just checking it's not a default)
            assert!(date.year() >= 2000);
        }

        // Verify sorting (dates should be in ascending order)
        assert!(files.windows(2).all(|pair| pair[0].1 <= pair[1].1));
    }

    #[test]
    fn test_as_to_org_mapping() {
        let as2org = get_test_db();
        // Verify internal consistency: as_to_org should map to valid orgs
        for (asn, org_id) in as2org.as_to_org.iter().take(10) {
            assert!(as2org.org_map.contains_key(org_id));
            assert!(as2org.as_map.contains_key(asn));
        }
    }

    #[test]
    fn test_org_to_as_mapping() {
        let as2org = get_test_db();
        // Verify internal consistency: org_to_as should map to valid ASNs
        for (org_id, asns) in as2org.org_to_as.iter().take(10) {
            assert!(as2org.org_map.contains_key(org_id));
            for asn in asns {
                assert!(as2org.as_map.contains_key(asn));
                assert_eq!(as2org.as_to_org.get(asn).unwrap(), org_id);
            }
        }
    }

    #[test]
    fn test_fix_latin1_misinterpretation() {
        // "Ã©" (U+00C3 U+00A9) is how a UTF-8 "é" looks after being decoded as
        // Latin-1. Escapes are used so the test does not depend on how this
        // source file itself is encoded.
        let input = "Test \u{00C3}\u{00A9} string";
        let fixed = fix_latin1_misinterpretation(input);
        assert_eq!(fixed, "Test \u{00E9} string");

        // Test with no special characters
        let input = "Normal ASCII string";
        let fixed = fix_latin1_misinterpretation(input);
        assert_eq!(input, fixed);
    }

    #[test]
    fn test_as2org_as_info_fields() {
        let as2org = get_test_db();
        let info = as2org.get_as_info(15169).unwrap();

        // Verify all fields are populated
        assert_eq!(info.asn, 15169);
        assert!(!info.name.is_empty());
        assert!(!info.country_code.is_empty());
        assert!(!info.org_id.is_empty());
        assert!(!info.org_name.is_empty());
        assert!(!info.source.is_empty());
    }

    #[test]
    fn test_siblings_consistency() {
        let as2org = get_test_db();
        let asn = 15169;
        let siblings = as2org.get_siblings(asn).unwrap();

        // All siblings should return the same sibling list
        for sibling in &siblings {
            let sibling_siblings = as2org.get_siblings(sibling.asn).unwrap();
            assert_eq!(siblings.len(), sibling_siblings.len());

            // All ASNs should be present in both lists
            for s in &siblings {
                assert!(sibling_siblings.iter().any(|ss| ss.asn == s.asn));
            }
        }
    }
}