1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
mod data;
mod fs_request;
mod fs_scrape;
mod util;

#[macro_use]
extern crate serde_derive;
#[macro_use]
extern crate lazy_static;

pub use data::{Entry, GetEntityError};

use crate::fs_request::html_for_query;
use crate::fs_scrape::entities_from_html;
use crate::util::next_search_words;
use futures_intrusive::sync::Semaphore;

// Result count at which a query is treated as having filled a whole page:
// `scrape_for_queries` refines the prefix when a query returns exactly this
// many entries and all results were requested.
const MAX_ENTRIES_PER_PAGE: usize = 30;
// Number of permits handed to `LIMIT_SIMULTANEOUS_REQUESTS`; each in-flight
// `get_max_entries_helper` call holds one permit.
const MAX_SIMULTANEOUS_REQUESTS: usize = 32;

// Process-wide semaphore used to cap how many requests run at the same time.
// It is created lazily on first use with `MAX_SIMULTANEOUS_REQUESTS` permits.
// The first constructor argument (`false`) is the `is_fair` flag of
// futures_intrusive's `Semaphore::new`, i.e. the semaphore is not fair.
lazy_static! {
    static ref LIMIT_SIMULTANEOUS_REQUESTS : futures_intrusive::sync::Semaphore = Semaphore::new(false, MAX_SIMULTANEOUS_REQUESTS);
}

/// Scrapes entries for every starting prefix the crate can generate.
///
/// The query list is built by expanding the empty prefix with
/// `next_search_words` and then expanding each of those results once more.
/// The resulting prefixes are handed to [`scrape_for_queries`] with
/// `get_all_results` set to `true`.
///
/// # Errors
///
/// Returns whatever [`scrape_for_queries`] returns.
pub async fn scrape_all() -> Result<Vec<Entry>, GetEntityError> {
    let mut expanded_prefixes: Vec<String> = Vec::new();

    // Second expansion level: one batch of follow-up words per first-level word.
    for first_level in next_search_words("").iter() {
        expanded_prefixes.extend(next_search_words(first_level.as_str()));
    }

    scrape_for_queries(expanded_prefixes, true).await
}

/// Scrapes entries for a single search prefix.
///
/// Convenience wrapper that forwards `starting_with` as a one-element query
/// list to [`scrape_for_queries`], passing `get_all_results` through
/// unchanged.
///
/// # Errors
///
/// Returns whatever [`scrape_for_queries`] returns.
pub async fn scrape_for_query(
    starting_with: String,
    get_all_results: bool,
) -> Result<Vec<Entry>, GetEntityError> {
    let single_query = vec![starting_with];
    scrape_for_queries(single_query, get_all_results).await
}

pub async fn scrape_for_queries(
    starting_with: Vec<String>,
    get_all_results: bool,
) -> Result<Vec<Entry>, GetEntityError> {
    let mut goals_basket = starting_with;
    let mut result_collection: Vec<Entry> = vec![];

    while goals_basket.len() != 0 {
        let handles = goals_basket
            .into_iter()
            .map(|val| get_max_entries_helper(val))
            .map(async_std::task::spawn)
            .collect::<Vec<_>>();

        goals_basket = vec![];

        let all_results = futures::future::join_all(handles).await;

        for (val, prefix) in all_results {
            match val {
                Ok(mut v) => {
                    if get_all_results && v.len() == MAX_ENTRIES_PER_PAGE {
                        goals_basket.append(&mut next_search_words(&prefix));
                    } else {
                        result_collection.append(&mut v);
                    }
                }
                Err(_) => {
                    eprintln!("Unfortunately failed max times for prefix {}", prefix);
                }
            }
        }
    }

    Ok(result_collection)
}

/// Runs [`get_max_entities`] for `starting_with` while holding one permit of
/// `LIMIT_SIMULTANEOUS_REQUESTS`.
///
/// Returns the query result together with the prefix it was issued for, so
/// the caller can tell which query each result belongs to.
async fn get_max_entries_helper(
    starting_with: String,
) -> (Result<Vec<Entry>, GetEntityError>, String) {
    // Bound to a named local so the acquired permit stays alive until the
    // function returns.
    let _permit = LIMIT_SIMULTANEOUS_REQUESTS.acquire(1).await;
    let outcome = get_max_entities(&starting_with).await;
    (outcome, starting_with)
}

/// Fetches the HTML for the query `starting_with` and extracts its entries.
///
/// # Errors
///
/// Returns [`GetEntityError::ConnectionError`] when `html_for_query` yields
/// `None`.
async fn get_max_entities(starting_with: &str) -> Result<Vec<Entry>, GetEntityError> {
    match html_for_query(starting_with).await {
        Some(site_body) => Ok(entities_from_html(site_body)),
        None => Err(GetEntityError::ConnectionError),
    }
}