diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-24 21:12:37 +0200 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-24 21:12:39 +0200 |
commit | 3c10f6d7554d848aaaebf9f296642cce972dbd7c (patch) | |
tree | acfbb2f83948b32bbfc777cffaed6f826a939e76 | |
parent | Flake: Add rust-analyzer package (diff) | |
download | OSSE-3c10f6d7554d848aaaebf9f296642cce972dbd7c.tar.gz OSSE-3c10f6d7554d848aaaebf9f296642cce972dbd7c.tar.bz2 OSSE-3c10f6d7554d848aaaebf9f296642cce972dbd7c.zip |
Crawler: Add crawled url filter
This filters out hrefs such as "/", "#" or "javascript:".
-rw-r--r-- | crawler/src/main.rs | 9 |
1 file changed, 8 insertions, 1 deletion
```diff
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index d74f1f8..f5ace75 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -79,16 +79,23 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin
     let response_text = response_text_res.unwrap();
     let document = scraper::Html::parse_document(response_text.as_str());
 
+    let valid_url = |url: &&str| match url {
+        u if *u == "/" => false,
+        u if u.starts_with("javascript:") => false, //generalise to be everything b4 : not equals http or https
+        u if u.starts_with('#') => false,
+        _ => true,
+    };
+
     let link_selector = scraper::Selector::parse("a").unwrap();
     let next_urls = document
         .select(&link_selector)
         .filter_map(|link| link.value().attr("href"))
         .unique()
+        .filter(valid_url)
         .take(2)
         .map(String::from)
         .collect();
 
-    //we need to not append http if already has it
     let fixup_urls = |us: Vec<String>| {
         us.iter()
             .map(|u| {
```