1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
mod data; mod fs_request; mod fs_scrape; mod util; #[macro_use] extern crate serde_derive; #[macro_use] extern crate lazy_static; pub use data::{Entry, GetEntityError}; use crate::fs_request::html_for_query; use crate::fs_scrape::entities_from_html; use crate::util::next_search_words; use futures_intrusive::sync::Semaphore; const MAX_ENTRIES_PER_PAGE: usize = 30; const MAX_SIMULTANEOUS_REQUESTS: usize = 32; lazy_static! { static ref LIMIT_SIMULTANEOUS_REQUESTS : futures_intrusive::sync::Semaphore = Semaphore::new(false, MAX_SIMULTANEOUS_REQUESTS); } pub async fn scrape_all() -> Result<Vec<Entry>, GetEntityError> { let search_words: Vec<String> = next_search_words("") .iter() .map(String::as_str) .map(next_search_words) .flatten() .collect(); scrape_for_queries(search_words, true).await } pub async fn scrape_for_query( starting_with: String, get_all_results: bool, ) -> Result<Vec<Entry>, GetEntityError> { scrape_for_queries(vec![starting_with], get_all_results).await } pub async fn scrape_for_queries( starting_with: Vec<String>, get_all_results: bool, ) -> Result<Vec<Entry>, GetEntityError> { let mut goals_basket = starting_with; let mut result_collection: Vec<Entry> = vec![]; while goals_basket.len() != 0 { let handles = goals_basket .into_iter() .map(|val| get_max_entries_helper(val)) .map(async_std::task::spawn) .collect::<Vec<_>>(); goals_basket = vec![]; let all_results = futures::future::join_all(handles).await; for (val, prefix) in all_results { match val { Ok(mut v) => { if get_all_results && v.len() == MAX_ENTRIES_PER_PAGE { goals_basket.append(&mut next_search_words(&prefix)); } else { result_collection.append(&mut v); } } Err(_) => { eprintln!("Unfortunately failed max times for prefix {}", prefix); } } } } Ok(result_collection) } async fn get_max_entries_helper( starting_with: String, ) -> (Result<Vec<Entry>, GetEntityError>, String) { let _to_drop = LIMIT_SIMULTANEOUS_REQUESTS.acquire(1).await; (get_max_entities(&starting_with).await, starting_with) } 
/// Fetches the page for `starting_with` and parses the entries out of it.
///
/// # Errors
///
/// Returns `GetEntityError::ConnectionError` when `html_for_query` yields no
/// page body.
async fn get_max_entities(starting_with: &str) -> Result<Vec<Entry>, GetEntityError> {
    match html_for_query(starting_with).await {
        Some(page_html) => Ok(entities_from_html(page_html)),
        None => Err(GetEntityError::ConnectionError),
    }
}