about summary refs log tree commit diff
path: root/crawler
diff options
context:
space:
mode:
author Baitinq <manuelpalenzuelamerino@gmail.com> 2022-10-24 21:12:37 +0200
committer Baitinq <manuelpalenzuelamerino@gmail.com> 2022-10-24 21:12:39 +0200
commit 3c10f6d7554d848aaaebf9f296642cce972dbd7c (patch)
tree acfbb2f83948b32bbfc777cffaed6f826a939e76 /crawler
parent Flake: Add rust-analyzer package (diff)
download OSSE-3c10f6d7554d848aaaebf9f296642cce972dbd7c.tar.gz
OSSE-3c10f6d7554d848aaaebf9f296642cce972dbd7c.tar.bz2
OSSE-3c10f6d7554d848aaaebf9f296642cce972dbd7c.zip
Crawler: Add crawled url filter
This filters hrefs such as "/", "#" or "javascript:"
Diffstat (limited to 'crawler')
-rw-r--r-- crawler/src/main.rs 9
1 files changed, 8 insertions, 1 deletions
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index d74f1f8..f5ace75 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -79,16 +79,23 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin
     let response_text = response_text_res.unwrap();
     let document = scraper::Html::parse_document(response_text.as_str());
 
+    let valid_url = |url: &&str| match url {
+        u if *u == "/" => false,
+        u if u.starts_with("javascript:") => false, //generalise to be everything b4 : not equals http or https
+        u if u.starts_with('#') => false,
+        _ => true,
+    };
+
     let link_selector = scraper::Selector::parse("a").unwrap();
     let next_urls = document
         .select(&link_selector)
         .filter_map(|link| link.value().attr("href"))
         .unique()
+        .filter(valid_url)
         .take(2)
         .map(String::from)
         .collect();
 
-    //we need to not append http if already has it
     let fixup_urls = |us: Vec<String>| {
         us.iter()
             .map(|u| {