| author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-20 16:27:23 +0200 |
|---|---|---|
| committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-20 16:29:05 +0200 |
| commit | 5e8774a82e90f42708399490d4276a9481c40763 (patch) | |
| tree | cbc20ad47750454381db2db8db760ac452e158a6 | |
| parent | Crawler: Remove duplicate parsed urls (diff) | |
| download | OSSE-5e8774a82e90f42708399490d4276a9481c40763.tar.gz OSSE-5e8774a82e90f42708399490d4276a9481c40763.tar.bz2 OSSE-5e8774a82e90f42708399490d4276a9481c40763.zip | |
Crawler: Normalise relative urls
We now normalise URLs starting with "/" (relative to the site root) and "//" (protocol-relative). For example, a link "/about" found while crawling https://example.com becomes https://example.com/about, and "//cdn.example.com/app.js" becomes https://cdn.example.com/app.js (the crawler assumes https for protocol-relative links).
| -rw-r--r-- | crawler/src/main.rs | 19 |
|---|---|---|

1 file changed, 17 insertions, 2 deletions
```diff
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 15abcaf..ca5126a 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -45,7 +45,7 @@ fn crawl_url(url: &str) -> (String, Vec<String>) {
     println!("Crawling {:?}", url);
 
-    let response_res = reqwest::blocking::get(url);
+    let response_res = reqwest::blocking::get(&url);
 
     if response_res.is_err() {
         return (String::from(""), Vec::<String>::new());
     }
@@ -65,7 +65,22 @@ fn crawl_url(url: &str) -> (String, Vec<String>) {
         .map(String::from)
         .collect();
 
-    //todo: filter urls that point to bad stuff? or we do that at the beggining of craw_url. we probs need to return result
+    let fixup_urls = |us: Vec<String>| {
+        return us.into_iter().map(|u| {
+            //https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute
+            if u.starts_with("//") {
+                format!("https:{}", &u)
+            }
+            else if u.starts_with("/") {
+                format!("{}{}", &url, &u)
+            }
+            else {
+                u
+            }
+        }).collect();
+    };
+
+    let next_urls = fixup_urls(next_urls);
 
     (response_text, next_urls)
 }
```
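One caveat with the string-concatenation approach above: root-relative links are joined against the full page URL, so "/about" found on https://example.com/blog/post would become https://example.com/blog/post/about rather than https://example.com/about, and plain relative paths ("page.html", "../x") pass through unchanged. A more robust alternative is the `url` crate's RFC 3986 reference resolution; the sketch below is illustrative only and not what this commit uses:

```rust
// Hypothetical alternative using the `url` crate (not part of this commit).
// Url::join resolves "/a" against the origin, "//host/a" against the base
// scheme, and relative paths against the current page, per RFC 3986.
use url::Url;

fn normalise(page_url: &str, href: &str) -> Option<String> {
    let base = Url::parse(page_url).ok()?; // the page the link was found on
    let resolved = base.join(href).ok()?;  // handles "/", "//", "../", absolute
    Some(resolved.to_string())
}

fn main() {
    let page = "https://example.com/blog/post";
    assert_eq!(
        normalise(page, "/about").unwrap(),
        "https://example.com/about" // root-relative joins against the origin
    );
    assert_eq!(
        normalise(page, "//cdn.example.com/x.js").unwrap(),
        "https://cdn.example.com/x.js" // protocol-relative keeps the base scheme
    );
}
```

This also avoids hardcoding "https:" for protocol-relative links, since the scheme is taken from the page the link was found on.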