diff options
| author | Baitinq <[email protected]> | 2022-10-23 12:53:20 +0200 |
|---|---|---|
| committer | Baitinq <[email protected]> | 2022-10-23 12:53:23 +0200 |
| commit | ba7060f63a14c7dec0d40094f4533fb1ef070f90 (patch) | |
| tree | f8f6d6f6e5f137b4fe3d04dd5f86ec29e41b0aad /crawler/src/main.rs | |
| parent | Crawler: Only crawl 2 urls per url (diff) | |
| download | OSSE-ba7060f63a14c7dec0d40094f4533fb1ef070f90.tar.gz OSSE-ba7060f63a14c7dec0d40094f4533fb1ef070f90.tar.bz2 OSSE-ba7060f63a14c7dec0d40094f4533fb1ef070f90.zip | |
Crawler: Remove prepending of https:// to each url
We now prepend it to the top-1000-urls list instead. This fixes crawled urls having two `https://` prefixes.
Diffstat (limited to 'crawler/src/main.rs')
| -rw-r--r-- | crawler/src/main.rs | 10 |
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/crawler/src/main.rs b/crawler/src/main.rs index d1333fe..8e190bc 100644 --- a/crawler/src/main.rs +++ b/crawler/src/main.rs @@ -65,17 +65,15 @@ async fn crawler(http_client: Client, root_urls: Vec<&str>) { } async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<String>), String> { - let url = "https://".to_owned() + url; - println!("Crawling {:?}", url); - let response_res = http_client.get(&url).send(); + let response_res = http_client.get(url).send(); if response_res.is_err() { - return Err("Error fetching ".to_owned() + &url); + return Err("Error fetching ".to_owned() + url); } let response_text_res = response_res.unwrap().text(); if response_text_res.is_err() { - return Err("Error unwrapping the fetched HTML's text (".to_owned() + &url + ")"); + return Err("Error unwrapping the fetched HTML's text (".to_owned() + url + ")"); } let response_text = response_text_res.unwrap(); @@ -111,6 +109,8 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin //normalise words somewhere //fuzzy? //probs lots of places where we can borrow or not do stupid stuff + //search for phrases? + //why multiple '/' at the end of sites" Ok((response_text, next_urls)) } |