-rw-r--r--  Cargo.lock          | 1 +
-rw-r--r--  crawler/Cargo.toml  | 3 ++-
-rw-r--r--  crawler/src/main.rs | 5 ++---
3 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 9282a8a..c10b6ae 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -414,6 +414,7 @@ version = "0.1.0"
 dependencies = [
  "async-channel",
  "itertools",
+ "rand 0.7.3",
  "reqwest",
  "scraper",
  "serde",
diff --git a/crawler/Cargo.toml b/crawler/Cargo.toml
index 2b93f53..16d7b41 100644
--- a/crawler/Cargo.toml
+++ b/crawler/Cargo.toml
@@ -12,7 +12,8 @@ itertools = "0.10.5"
 serde = { version = "1.0", features = ["derive"] }
 tokio = { version = "0.2.22", features = ["full"] }
 async-channel = "1.7.1"
-url = "*"
+url = "2.3.1"
+rand = "0.7.3"
 
 [[bin]]
 name = "crawler"
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 908a2c1..72c3e4d 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -1,4 +1,5 @@
 use itertools::Itertools;
+use rand::seq::IteratorRandom;
 use reqwest::blocking::{Client, Response};
 use serde::Serialize;
 use url::Url;
@@ -104,10 +105,8 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin
         .filter(Result::is_ok)
         .map(Result::unwrap)
         .filter(valid_url)
-        //can we shuffle? for not the same 2 everytime
-        .take(2)
         .map(String::from)
-        .collect();
+        .choose_multiple(&mut rand::thread_rng(), 2); //we shuffle as to minimise repeating links
 
     //normalise words somewhere
     //fuzzy? - iterate over keys
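
The behavioural change in crawl_url can be sketched in isolation. The following is a minimal standalone example assuming rand = "0.7.3" as added above; pick_links and candidate_links are illustrative names, not identifiers from the crawler.

use rand::seq::IteratorRandom;

// Stand-in for the crawler's filtered link list. The old code took the
// first two links with `.take(2)`, so the same pair was crawled on every
// visit to a page; `choose_multiple` instead samples two links uniformly
// at random, without replacement.
fn pick_links(candidate_links: Vec<String>) -> Vec<String> {
    candidate_links
        .into_iter()
        .choose_multiple(&mut rand::thread_rng(), 2)
}

fn main() {
    let links = vec![
        "https://example.com/a".to_string(),
        "https://example.com/b".to_string(),
        "https://example.com/c".to_string(),
    ];
    // Different runs may print different pairs.
    println!("{:?}", pick_links(links));
}

Per the rand 0.7 docs, choose_multiple returns all available elements when the iterator yields fewer than the requested amount, and the chosen items are not shuffled, so the relative order of the two sampled links is not guaranteed.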