From f8e807aa2300f99c4d9cbd79d06604862d2132a6 Mon Sep 17 00:00:00 2001
From: Baitinq
Date: Fri, 28 Oct 2022 19:36:19 +0200
Subject: Crawler: Only accept HTTP_STATUS_CODE: 200 as success in crawl_url()

---
 crawler/src/main.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index efdb033..5c15d14 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -1,6 +1,6 @@
 use itertools::Itertools;
 use rand::seq::IteratorRandom;
-use reqwest::{Client, Response};
+use reqwest::{Client, Response, StatusCode};
 use serde::Serialize;
 use url::Url;

@@ -81,13 +81,14 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec
-        Err(_) => Err("Error fetching ".to_owned() + url.as_str()),
-        Ok(text_res) => match text_res.text().await {
+        Ok(text_res) if text_res.status() == StatusCode::OK => match text_res.text().await {
             Err(_) => {
                 Err("Error unwrapping the fetched HTML's text (".to_owned() + url.as_str() + ")")
             }
             Ok(text) => Ok(text),
         },
+
+        _ => Err("Error fetching ".to_owned() + url.as_str()),
     }?;
     let document = scraper::Html::parse_document(response_text.as_str());
-- 
cgit 1.4.1