about summary refs log tree commit diff
path: root/crawler
diff options
context:
space:
mode:
author Baitinq <manuelpalenzuelamerino@gmail.com> 2022-10-20 16:42:07 +0200
committer Baitinq <manuelpalenzuelamerino@gmail.com> 2022-10-20 23:44:46 +0200
commit 34e8a56b560bf090985e993a6db411275c2e4686 (patch)
tree 75c7c23a51e23bde24c120fc1826bd22220f9c28 /crawler
parent Crawler: Normalise relative urls (diff)
download OSSE-34e8a56b560bf090985e993a6db411275c2e4686.tar.gz
OSSE-34e8a56b560bf090985e993a6db411275c2e4686.tar.bz2
OSSE-34e8a56b560bf090985e993a6db411275c2e4686.zip
Crawler: Wrap crawl response in Result type
Diffstat (limited to 'crawler')
-rw-r--r-- crawler/src/main.rs 41
1 files changed, 23 insertions, 18 deletions
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index ca5126a..9ebfc23 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -25,7 +25,13 @@ fn crawler(root_urls: Vec<&str>) {
         //blocks
         let url = crawling_queue.pop();
 
-        let (_content, crawled_urls) = crawl_url(url.as_str());
+        let crawl_res = crawl_url(url.as_str());
+        if crawl_res.is_err() {
+            println!("Error crawling {}", url);
+            continue;
+        }
+
+        let (_content, crawled_urls) = crawl_res.unwrap();
 
         //println!("Content: {:?}", _content);
         println!("Next urls: {:?}", crawled_urls);
@@ -39,19 +45,18 @@ fn crawler(root_urls: Vec<&str>) {
 }
 
 //takes url, returns content and list of urls
-fn crawl_url(url: &str) -> (String, Vec<String>) {
-    //return result
+fn crawl_url(url: &str) -> Result<(String, Vec<String>), ()> {
     let url = "https://".to_owned() + url;
 
     println!("Crawling {:?}", url);
 
     let response_res = reqwest::blocking::get(&url);
     if response_res.is_err() {
-        return (String::from(""), Vec::<String>::new());
+        return Err(());
     }
     let response_text_res = response_res.unwrap().text();
     if response_text_res.is_err() {
-        return (String::from(""), Vec::<String>::new());
+        return Err(());
     }
 
     let response_text = response_text_res.unwrap();
@@ -66,21 +71,21 @@ fn crawl_url(url: &str) -> (String, Vec<String>) {
         .collect();
 
     let fixup_urls = |us: Vec<String>| {
-        return us.into_iter().map(|u| {
-            //https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute
-            if u.starts_with("//") {
-                format!("https:{}", &u)
-            }
-            else if u.starts_with("/") {
-                format!("{}{}", &url, &u)
-            }
-            else {
-                u
-            }
-        }).collect();
+        us.into_iter()
+            .map(|u| {
+                //https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute
+                if u.starts_with("//") {
+                    format!("https:{}", &u)
+                } else if u.starts_with('/') {
+                    format!("{}{}", &url, &u)
+                } else {
+                    u
+                }
+            })
+            .collect()
     };
 
     let next_urls = fixup_urls(next_urls);
 
-    (response_text, next_urls)
+    Ok((response_text, next_urls))
 }