amadeus 0.4.3

Harmonious distributed data processing & analysis in Rust. Keywords: parquet, postgres, aws, s3, cloudfront, elb, json, csv, logs, hadoop, hdfs, arrow, common crawl.
//! # Distributed parsing and analysis of 3.25 billion webpages
//!
//! This example finds the 100 most prevalent IP addresses in the 3.25
//! billion-page, 255 TiB Common Crawl dataset.
//!
//! The download, parsing and analysis are farmed out to a thread pool
//! leveraging Amadeus, the distributed data processing library for Rust.
//!
//! ## Usage
//!
//! ```bash
//! cargo run --example common_crawl --release
//! ```

use amadeus::{data::Webpage, dist::prelude::*};

/// Entry point of the example.
///
/// Streams the `CC-MAIN-2020-24` Common Crawl through a thread pool and, in a
/// single pass, computes three results: the number of webpages seen, the IP
/// addresses seen most often, and the IP addresses paired with the most
/// distinct host names. Both summaries are then printed to stdout.
///
/// The function currently returns immediately (see the TODO below), so
/// everything after the first statement is compiled but never executed; the
/// `unreachable_code` lint is silenced for that reason.
#[allow(unreachable_code)]
#[tokio::main]
async fn main() {
	return; // TODO: runs for a long time

	// Arguments shared by the two "top" aggregations below.
	// NOTE(review): names follow my reading of the amadeus API
	// (`n`, `probability`, `tolerance`) -- confirm against the crate docs.
	let top_n = 100;
	let probability = 0.99;
	let tolerance = 2.0 / 1000.0;

	let workers = ThreadPool::new(None, None).unwrap();

	let crawl = CommonCrawl::new("CC-MAIN-2020-24").await.unwrap();

	// `fork` feeds every webpage to two sinks: the first receives the page by
	// value, the second (a pair of pipelines) receives it by reference.
	let (total, (top_ips, widest_ips)) = crawl
		.dist_stream()
		// Any record that failed to load or parse aborts the run here.
		.map(FnMut!(|record: Result<_, _>| record.unwrap()))
		.fork(
			&workers,
			// By-value sink: count every page.
			Identity
				.map(FnMut!(|page: Webpage<'static>| page))
				.count(),
			(
				// By-reference sink 1: rank IP addresses by occurrences.
				Identity
					.map(FnMut!(|page: &Webpage<'static>| page.ip))
					.most_frequent(top_n, probability, tolerance),
				// By-reference sink 2: rank IP addresses by the number of
				// distinct host names seen alongside them. Panics if a URL
				// has no host component.
				Identity
					.map(FnMut!(|page: &Webpage<'static>| {
						(page.ip, page.url.host_str().unwrap().to_owned())
					}))
					.most_distinct(top_n, probability, tolerance, 0.0808),
			),
		)
		.await;

	println!(
		"Of the {} webpages processed, these are the most prevalent host IP addresses: {:?}",
		total, top_ips
	);
	println!(
		"and these are the IP addresses hosting the most distinct domains: {:?}",
		widest_ips
	);
}