From 3c10f6d7554d848aaaebf9f296642cce972dbd7c Mon Sep 17 00:00:00 2001
From: Baitinq
Date: Mon, 24 Oct 2022 21:12:37 +0200
Subject: Crawler: Add crawled url filter

This filters hrefs such as "/", "#" or "javascript:"
---
 crawler/src/main.rs | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index d74f1f8..f5ace75 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -79,16 +79,23 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<String>), String> {
+    let valid_url = |url: &&str| match url {
+        u if *u == "/" => false,
+        u if u.starts_with("javascript:") => false, //generalise to be everything b4 : not equals http or https
+        u if u.starts_with('#') => false,
+        _ => true,
+    };
+
     let link_selector = scraper::Selector::parse("a").unwrap();
     let next_urls = document
         .select(&link_selector)
         .filter_map(|link| link.value().attr("href"))
         .unique()
+        .filter(valid_url)
         .take(2)
         .map(String::from)
         .collect();
 
-    //we need to not append http if already has it
     let fixup_urls = |us: Vec<String>| {
         us.iter()
             .map(|u| {
--
cgit 1.4.1
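
For reference, here is a minimal, self-contained sketch of the predicate this patch adds, runnable on its own. In the patch, the iterator yields &str items (from attr("href")), so Iterator::filter hands the closure a &&str, which is why the argument type is doubly referenced. The sample hrefs in main are hypothetical, chosen only to show which links get dropped.

    // Standalone sketch of the valid_url predicate from this patch.
    // The sample hrefs below are made up for illustration.
    fn main() {
        let valid_url = |url: &&str| match url {
            u if *u == "/" => false,                    // bare root link
            u if u.starts_with("javascript:") => false, // script pseudo-URL
            u if u.starts_with('#') => false,           // same-page fragment
            _ => true,
        };

        let hrefs = ["/", "#top", "javascript:void(0)", "https://example.com", "about.html"];
        // Array into_iter() yields &str, so filter sees &&str, as in the patch.
        let kept: Vec<&str> = hrefs.into_iter().filter(valid_url).collect();
        println!("{:?}", kept); // ["https://example.com", "about.html"]
    }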
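
The in-diff comment hints at generalising the javascript: case: reject any href whose scheme (the text before ':') is not http or https, instead of special-casing one scheme. A possible sketch of that direction, not part of this commit, using str::split_once; note a scheme-less host:port href (e.g. "example.com:8080") would be misclassified by this naive split, which the sketch ignores.

    // Possible generalisation suggested by the in-diff comment: reject any
    // href with a non-http(s) scheme, keep scheme-less relative links.
    // Not part of this commit.
    fn valid_url(url: &&str) -> bool {
        match url.split_once(':') {
            Some(("http" | "https", _)) => true, // explicit web scheme
            Some(_) => false,                    // javascript:, mailto:, ftp:, ...
            None => *url != "/" && !url.starts_with('#'), // relative link, minus "/" and fragments
        }
    }

    fn main() {
        assert!(valid_url(&"https://example.com"));
        assert!(valid_url(&"about.html"));
        assert!(!valid_url(&"javascript:void(0)"));
        assert!(!valid_url(&"#top"));
        assert!(!valid_url(&"/"));
    }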