diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-28 19:36:19 +0200 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-28 19:36:29 +0200 |
commit | f8e807aa2300f99c4d9cbd79d06604862d2132a6 (patch) | |
tree | f6f303550fe0ca876dd6ce8a557ee3b423eef6ac | |
parent | Indexer: Add website title and description to the CrawledResource (diff) | |
download | OSSE-f8e807aa2300f99c4d9cbd79d06604862d2132a6.tar.gz OSSE-f8e807aa2300f99c4d9cbd79d06604862d2132a6.tar.bz2 OSSE-f8e807aa2300f99c4d9cbd79d06604862d2132a6.zip |
Crawler: Only accept HTTP_STATUS_CODE: 200 as success in crawl_url()
-rw-r--r-- | crawler/src/main.rs | 7 |
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/crawler/src/main.rs b/crawler/src/main.rs index efdb033..5c15d14 100644 --- a/crawler/src/main.rs +++ b/crawler/src/main.rs @@ -1,6 +1,6 @@ use itertools::Itertools; use rand::seq::IteratorRandom; -use reqwest::{Client, Response}; +use reqwest::{Client, Response, StatusCode}; use serde::Serialize; use url::Url; @@ -81,13 +81,14 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin let url = Url::parse(url).unwrap(); let response_text = match http_client.get(url.as_str()).send().await { - Err(_) => Err("Error fetching ".to_owned() + url.as_str()), - Ok(text_res) => match text_res.text().await { + Ok(text_res) if text_res.status() == StatusCode::OK => match text_res.text().await { Err(_) => { Err("Error unwrapping the fetched HTML's text (".to_owned() + url.as_str() + ")") } Ok(text) => Ok(text), }, + + _ => Err("Error fetching ".to_owned() + url.as_str()), }?; let document = scraper::Html::parse_document(response_text.as_str()); |