about summary refs log tree commit diff
diff options
context:
space:
mode:
author: Baitinq <manuelpalenzuelamerino@gmail.com> 2022-10-25 01:41:32 +0200
committer: Baitinq <manuelpalenzuelamerino@gmail.com> 2022-10-25 01:41:32 +0200
commit: a87ccbb43b99432d7468ff7e294cdfb23a48861f (patch)
tree: e7d513e72689b41e87c5cc3207f1859fdf374914
parent: Crawler: Add "correct" error handling (diff)
download: OSSE-a87ccbb43b99432d7468ff7e294cdfb23a48861f.tar.gz
OSSE-a87ccbb43b99432d7468ff7e294cdfb23a48861f.tar.bz2
OSSE-a87ccbb43b99432d7468ff7e294cdfb23a48861f.zip
Crawler: Shuffle crawled urls
-rw-r--r-- Cargo.lock 1
-rw-r--r-- crawler/Cargo.toml 3
-rw-r--r-- crawler/src/main.rs 5
3 files changed, 5 insertions, 4 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 9282a8a..c10b6ae 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -414,6 +414,7 @@ version = "0.1.0"
 dependencies = [
  "async-channel",
  "itertools",
+ "rand 0.7.3",
  "reqwest",
  "scraper",
  "serde",
diff --git a/crawler/Cargo.toml b/crawler/Cargo.toml
index 2b93f53..16d7b41 100644
--- a/crawler/Cargo.toml
+++ b/crawler/Cargo.toml
@@ -12,7 +12,8 @@ itertools = "0.10.5"
 serde = { version = "1.0", features = ["derive"] }
 tokio = { version = "0.2.22", features = ["full"] }
 async-channel = "1.7.1"
-url = "*"
+url = "2.3.1"
+rand = "0.7.3"
 
 [[bin]]
 name = "crawler"
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 908a2c1..72c3e4d 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -1,4 +1,5 @@
 use itertools::Itertools;
+use rand::seq::IteratorRandom;
 use reqwest::blocking::{Client, Response};
 use serde::Serialize;
 use url::Url;
@@ -104,10 +105,8 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin
         .filter(Result::is_ok)
         .map(Result::unwrap)
         .filter(valid_url)
-        //can we shuffle? for not the same 2 everytime
-        .take(2)
         .map(String::from)
-        .collect();
+        .choose_multiple(&mut rand::thread_rng(), 2); //we shuffle as to minimise repeating links
 
     //normalise words somewhere
     //fuzzy? - iterate over keys