From ba7060f63a14c7dec0d40094f4533fb1ef070f90 Mon Sep 17 00:00:00 2001
From: Baitinq
Date: Sun, 23 Oct 2022 12:53:20 +0200
Subject: Crawler: Remove prepending of https:// to each url

We now prepend it to the top-1000-urls list instead. This fixes crawled
URLs having two https:// prefixes.
---
 crawler/src/main.rs           |   10 +-
 crawler/top-1000-websites.txt | 2002 ++++++++++++++++++++---------------------
 2 files changed, 1006 insertions(+), 1006 deletions(-)

diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index d1333fe..8e190bc 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -65,17 +65,15 @@ async fn crawler(http_client: Client, root_urls: Vec<&str>) {
 }
 
 async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<String>), String> {
-    let url = "https://".to_owned() + url;
-
     println!("Crawling {:?}", url);
 
-    let response_res = http_client.get(&url).send();
+    let response_res = http_client.get(url).send();
     if response_res.is_err() {
-        return Err("Error fetching ".to_owned() + &url);
+        return Err("Error fetching ".to_owned() + url);
     }
     let response_text_res = response_res.unwrap().text();
     if response_text_res.is_err() {
-        return Err("Error unwrapping the fetched HTML's text (".to_owned() + &url + ")");
+        return Err("Error unwrapping the fetched HTML's text (".to_owned() + url + ")");
    }
 
     let response_text = response_text_res.unwrap();
@@ -111,6 +109,8 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<String>), String> {
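
Note (not part of this patch): moving the scheme into the
top-1000-websites.txt data fixes the doubled https:// prefix, but it
assumes every root url in the list now carries a scheme. A more
defensive alternative is to prepend https:// only when a scheme is
missing. A minimal sketch in plain Rust, using a hypothetical helper
name normalize_root_url that does not exist in this codebase:

    /// Prepend "https://" only when `url` carries no scheme yet.
    /// Hypothetical helper, sketched for illustration; this commit
    /// instead fixes the url list itself.
    fn normalize_root_url(url: &str) -> String {
        if url.starts_with("http://") || url.starts_with("https://") {
            url.to_owned()
        } else {
            format!("https://{}", url)
        }
    }

Applied once per root url (e.g. normalize_root_url("example.com")
yields "https://example.com", while an already-schemed entry passes
through unchanged), this would keep crawl_url free of scheme handling,
which matches the intent of this change.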