about summary refs log tree commit diff
diff options
context:
space:
mode:
authorBaitinq <manuelpalenzuelamerino@gmail.com>2022-10-20 16:27:23 +0200
committerBaitinq <manuelpalenzuelamerino@gmail.com>2022-10-20 16:29:05 +0200
commit5e8774a82e90f42708399490d4276a9481c40763 (patch)
treecbc20ad47750454381db2db8db760ac452e158a6
parentCrawler: Remove duplicate parsed urls (diff)
downloadOSSE-5e8774a82e90f42708399490d4276a9481c40763.tar.gz
OSSE-5e8774a82e90f42708399490d4276a9481c40763.tar.bz2
OSSE-5e8774a82e90f42708399490d4276a9481c40763.zip
Crawler: Normalise relative urls
We now normalise URLs that start with "/" (root-relative) and "//"
(protocol-relative) into absolute URLs.
-rw-r--r--crawler/src/main.rs19
1 files changed, 17 insertions, 2 deletions
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 15abcaf..ca5126a 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -45,7 +45,7 @@ fn crawl_url(url: &str) -> (String, Vec<String>) {
 
     println!("Crawling {:?}", url);
 
-    let response_res = reqwest::blocking::get(url);
+    let response_res = reqwest::blocking::get(&url);
     if response_res.is_err() {
         return (String::from(""), Vec::<String>::new());
     }
@@ -65,7 +65,22 @@ fn crawl_url(url: &str) -> (String, Vec<String>) {
         .map(String::from)
         .collect();
 
-    //todo: filter urls that point to bad stuff? or we do that at the beggining of craw_url. we probs need to return result
+    let fixup_urls = |us: Vec<String>| {
+        return us.into_iter().map(|u| {
+            //https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute
+            if u.starts_with("//") {
+                format!("https:{}", &u)
+            }
+            else if u.starts_with("/") {
+                format!("{}{}", &url, &u)
+            }
+            else {
+                u
+            }
+        }).collect();
+    };
+
+    let next_urls = fixup_urls(next_urls);
 
     (response_text, next_urls)
 }