diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-20 16:42:07 +0200 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-20 23:44:46 +0200 |
commit | 34e8a56b560bf090985e993a6db411275c2e4686 (patch) | |
tree | 75c7c23a51e23bde24c120fc1826bd22220f9c28 | |
parent | Crawler: Normalise relative urls (diff) | |
download | OSSE-34e8a56b560bf090985e993a6db411275c2e4686.tar.gz OSSE-34e8a56b560bf090985e993a6db411275c2e4686.tar.bz2 OSSE-34e8a56b560bf090985e993a6db411275c2e4686.zip |
Crawler: Wrap crawl response in Result type
-rw-r--r-- | crawler/src/main.rs | 41 |
1 file changed, 23 insertions, 18 deletions
```diff
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index ca5126a..9ebfc23 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -25,7 +25,13 @@ fn crawler(root_urls: Vec<&str>) {
         //blocks
         let url = crawling_queue.pop();
 
-        let (_content, crawled_urls) = crawl_url(url.as_str());
+        let crawl_res = crawl_url(url.as_str());
+        if crawl_res.is_err() {
+            println!("Error crawling {}", url);
+            continue;
+        }
+
+        let (_content, crawled_urls) = crawl_res.unwrap();
 
         //println!("Content: {:?}", _content);
         println!("Next urls: {:?}", crawled_urls);
@@ -39,19 +45,18 @@ fn crawler(root_urls: Vec<&str>) {
 }
 
 //takes url, returns content and list of urls
-fn crawl_url(url: &str) -> (String, Vec<String>) {
-    //return result
+fn crawl_url(url: &str) -> Result<(String, Vec<String>), ()> {
     let url = "https://".to_owned() + url;
 
     println!("Crawling {:?}", url);
 
     let response_res = reqwest::blocking::get(&url);
     if response_res.is_err() {
-        return (String::from(""), Vec::<String>::new());
+        return Err(());
     }
     let response_text_res = response_res.unwrap().text();
     if response_text_res.is_err() {
-        return (String::from(""), Vec::<String>::new());
+        return Err(());
     }
 
     let response_text = response_text_res.unwrap();
@@ -66,21 +71,21 @@ fn crawl_url(url: &str) -> (String, Vec<String>) {
         .collect();
 
     let fixup_urls = |us: Vec<String>| {
-        return us.into_iter().map(|u| {
-            //https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute
-            if u.starts_with("//") {
-                format!("https:{}", &u)
-            }
-            else if u.starts_with("/") {
-                format!("{}{}", &url, &u)
-            }
-            else {
-                u
-            }
-        }).collect();
+        us.into_iter()
+            .map(|u| {
+                //https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute
+                if u.starts_with("//") {
+                    format!("https:{}", &u)
+                } else if u.starts_with('/') {
+                    format!("{}{}", &url, &u)
+                } else {
+                    u
+                }
+            })
+            .collect()
     };
 
     let next_urls = fixup_urls(next_urls);
 
-    (response_text, next_urls)
+    Ok((response_text, next_urls))
 }
```