From 5e8774a82e90f42708399490d4276a9481c40763 Mon Sep 17 00:00:00 2001 From: Baitinq Date: Thu, 20 Oct 2022 16:27:23 +0200 Subject: Crawler: Normalise relative URLs We now normalise URLs starting with / (relative to root) and // (relative to protocol) --- crawler/src/main.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'crawler') diff --git a/crawler/src/main.rs b/crawler/src/main.rs index 15abcaf..ca5126a 100644 --- a/crawler/src/main.rs +++ b/crawler/src/main.rs @@ -45,7 +45,7 @@ fn crawl_url(url: &str) -> (String, Vec) { println!("Crawling {:?}", url); - let response_res = reqwest::blocking::get(url); + let response_res = reqwest::blocking::get(&url); if response_res.is_err() { return (String::from(""), Vec::::new()); } @@ -65,7 +65,22 @@ fn crawl_url(url: &str) -> (String, Vec) { .map(String::from) .collect(); - //todo: filter urls that point to bad stuff? or we do that at the beggining of craw_url. we probs need to return result + let fixup_urls = |us: Vec| { + return us.into_iter().map(|u| { + //https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute + if u.starts_with("//") { + format!("https:{}", &u) + } + else if u.starts_with("/") { + format!("{}{}", &url, &u) + } + else { + u + } + }).collect(); + }; + + let next_urls = fixup_urls(next_urls); (response_text, next_urls) } -- cgit 1.4.1