diff options
| author | Baitinq <[email protected]> | 2022-10-23 12:53:20 +0200 |
|---|---|---|
| committer | Baitinq <[email protected]> | 2022-10-23 12:53:23 +0200 |
| commit | ba7060f63a14c7dec0d40094f4533fb1ef070f90 (patch) | |
| tree | f8f6d6f6e5f137b4fe3d04dd5f86ec29e41b0aad /crawler/src/main.rs | |
| parent | Crawler: Only crawl 2 urls per url (diff) | |
| download | OSSE-ba7060f63a14c7dec0d40094f4533fb1ef070f90.tar.gz OSSE-ba7060f63a14c7dec0d40094f4533fb1ef070f90.tar.bz2 OSSE-ba7060f63a14c7dec0d40094f4533fb1ef070f90.zip | |
Crawler: Remove prepending of https:// to each url
We now prepend it to the top-1000-urls list instead. This fixes crawled urls having two `https://` prefixes.
Diffstat (limited to 'crawler/src/main.rs')
| -rw-r--r-- | crawler/src/main.rs | 10 |
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/crawler/src/main.rs b/crawler/src/main.rs index d1333fe..8e190bc 100644 --- a/crawler/src/main.rs +++ b/crawler/src/main.rs @@ -65,17 +65,15 @@ async fn crawler(http_client: Client, root_urls: Vec<&str>) { } async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<String>), String> { - let url = "https://".to_owned() + url; - println!("Crawling {:?}", url); - let response_res = http_client.get(&url).send(); + let response_res = http_client.get(url).send(); if response_res.is_err() { - return Err("Error fetching ".to_owned() + &url); + return Err("Error fetching ".to_owned() + url); } let response_text_res = response_res.unwrap().text(); if response_text_res.is_err() { - return Err("Error unwrapping the fetched HTML's text (".to_owned() + &url + ")"); + return Err("Error unwrapping the fetched HTML's text (".to_owned() + url + ")"); } let response_text = response_text_res.unwrap(); @@ -111,6 +109,8 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin //normalise words somewhere //fuzzy? //probs lots of places where we can borrow or not do stupid stuff + //search for phrases? + //why multiple '/' at the end of sites" Ok((response_text, next_urls)) } |