From 5e8774a82e90f42708399490d4276a9481c40763 Mon Sep 17 00:00:00 2001 From: Baitinq Date: Thu, 20 Oct 2022 16:27:23 +0200 Subject: Crawler: Normalise relative URLs We now normalise URLs starting with / (relative to root) and // (relative to protocol) --- crawler/src/main.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'crawler') diff --git a/crawler/src/main.rs b/crawler/src/main.rs index 15abcaf..ca5126a 100644 --- a/crawler/src/main.rs +++ b/crawler/src/main.rs @@ -45,7 +45,7 @@ fn crawl_url(url: &str) -> (String, Vec) { println!("Crawling {:?}", url); - let response_res = reqwest::blocking::get(url); + let response_res = reqwest::blocking::get(&url); if response_res.is_err() { return (String::from(""), Vec::::new()); } @@ -65,7 +65,22 @@ fn crawl_url(url: &str) -> (String, Vec) { .map(String::from) .collect(); - //todo: filter urls that point to bad stuff? or we do that at the beggining of craw_url. we probs need to return result + let fixup_urls = |us: Vec| { + return us.into_iter().map(|u| { + //https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute + if u.starts_with("//") { + format!("https:{}", &u) + } + else if u.starts_with("/") { + format!("{}{}", &url, &u) + } + else { + u + } + }).collect(); + }; + + let next_urls = fixup_urls(next_urls); (response_text, next_urls) } -- cgit 1.4.1