Crawler: Add crawled url filter

This filters out hrefs such as "/", "#" or "javascript:".
author: Baitinq <manuelpalenzuelamerino@gmail.com> 2022-10-24 21:12:37 +0200
committer: Baitinq <manuelpalenzuelamerino@gmail.com> 2022-10-24 21:12:39 +0200
commit: 3c10f6d7554d848aaaebf9f296642cce972dbd7c (patch)
tree: acfbb2f83948b32bbfc777cffaed6f826a939e76
parent: Flake: Add rust-analyzer package (diff)
download: OSSE-3c10f6d7554d848aaaebf9f296642cce972dbd7c.tar.gz
OSSE-3c10f6d7554d848aaaebf9f296642cce972dbd7c.tar.bz2
OSSE-3c10f6d7554d848aaaebf9f296642cce972dbd7c.zip
1 files changed, 8 insertions, 1 deletions
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index d74f1f8..f5ace75 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -79,16 +79,23 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin
     let response_text = response_text_res.unwrap();
     let document = scraper::Html::parse_document(response_text.as_str());
 
+    let valid_url = |url: &&str| match url {
+        u if *u == "/" => false,
+        u if u.starts_with("javascript:") => false, //generalise to be everything b4 : not equals http or https
+        u if u.starts_with('#') => false,
+        _ => true,
+    };
+
     let link_selector = scraper::Selector::parse("a").unwrap();
     let next_urls = document
         .select(&link_selector)
         .filter_map(|link| link.value().attr("href"))
         .unique()
+        .filter(valid_url)
         .take(2)
         .map(String::from)
         .collect();
 
-    //we need to not append http if already has it
     let fixup_urls = |us: Vec<String>| {
         us.iter()
             .map(|u| {
author	Baitinq <manuelpalenzuelamerino@gmail.com>	2022-10-24 21:12:37 +0200
committer	Baitinq <manuelpalenzuelamerino@gmail.com>	2022-10-24 21:12:39 +0200
commit	3c10f6d7554d848aaaebf9f296642cce972dbd7c (patch)
tree	acfbb2f83948b32bbfc777cffaed6f826a939e76
parent	Flake: Add rust-analyzer package (diff)
download	OSSE-3c10f6d7554d848aaaebf9f296642cce972dbd7c.tar.gz OSSE-3c10f6d7554d848aaaebf9f296642cce972dbd7c.tar.bz2 OSSE-3c10f6d7554d848aaaebf9f296642cce972dbd7c.zip