about summary refs log tree commit diff
path: root/crawler
diff options
context:
space:
mode:
author     Baitinq <manuelpalenzuelamerino@gmail.com>  2022-10-20 16:27:02 +0200
committer  Baitinq <manuelpalenzuelamerino@gmail.com>  2022-10-20 16:27:02 +0200
commit     645ec232114a3149bf4d501550de54406e63b907 (patch)
tree       8482c4d6931c1d0f308ad4ed2ea23a4bcf4e4f01 /crawler
parent     Crawler: Add basic html parsing and link-following (diff)
download   OSSE-645ec232114a3149bf4d501550de54406e63b907.tar.gz
           OSSE-645ec232114a3149bf4d501550de54406e63b907.tar.bz2
           OSSE-645ec232114a3149bf4d501550de54406e63b907.zip
Crawler: Remove duplicate parsed urls
Diffstat (limited to 'crawler')
-rw-r--r--  crawler/Cargo.toml   | 1
-rw-r--r--  crawler/src/main.rs  | 3
2 files changed, 4 insertions, 0 deletions
diff --git a/crawler/Cargo.toml b/crawler/Cargo.toml
index 3f03217..2779421 100644
--- a/crawler/Cargo.toml
+++ b/crawler/Cargo.toml
@@ -9,6 +9,7 @@ edition = "2021"
 blockingqueue = "0.1.1"
 reqwest = {version = "0.11", features = ["blocking"]}
 scraper = "0.12.0"
+itertools = "0.10.5"
 
 [[bin]]
 name = "crawler"
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 6067ac9..15abcaf 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -1,3 +1,5 @@
+use itertools::Itertools;
+
 fn main() {
     println!("Hello, world! Im the crawler!");
 
@@ -59,6 +61,7 @@ fn crawl_url(url: &str) -> (String, Vec<String>) {
     let next_urls = document
         .select(&link_selector)
         .filter_map(|link| link.value().attr("href"))
+        .unique()
         .map(String::from)
         .collect();