diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-28 19:36:19 +0200 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-28 19:36:29 +0200 |
commit | f8e807aa2300f99c4d9cbd79d06604862d2132a6 (patch) | |
tree | f6f303550fe0ca876dd6ce8a557ee3b423eef6ac | |
parent | Indexer: Add website title and description to the CrawledResource (diff) | |
download | OSSE-f8e807aa2300f99c4d9cbd79d06604862d2132a6.tar.gz OSSE-f8e807aa2300f99c4d9cbd79d06604862d2132a6.tar.bz2 OSSE-f8e807aa2300f99c4d9cbd79d06604862d2132a6.zip |
Crawler: Only accept HTTP_STATUS_CODE: 200 as success in crawl_url()
-rw-r--r-- | crawler/src/main.rs | 7 |
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/crawler/src/main.rs b/crawler/src/main.rs index efdb033..5c15d14 100644 --- a/crawler/src/main.rs +++ b/crawler/src/main.rs @@ -1,6 +1,6 @@ use itertools::Itertools; use rand::seq::IteratorRandom; -use reqwest::{Client, Response}; +use reqwest::{Client, Response, StatusCode}; use serde::Serialize; use url::Url; @@ -81,13 +81,14 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin let url = Url::parse(url).unwrap(); let response_text = match http_client.get(url.as_str()).send().await { - Err(_) => Err("Error fetching ".to_owned() + url.as_str()), - Ok(text_res) => match text_res.text().await { + Ok(text_res) if text_res.status() == StatusCode::OK => match text_res.text().await { Err(_) => { Err("Error unwrapping the fetched HTML's text (".to_owned() + url.as_str() + ")") } Ok(text) => Ok(text), }, + + _ => Err("Error fetching ".to_owned() + url.as_str()), }?; let document = scraper::Html::parse_document(response_text.as_str()); |