diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-20 16:27:02 +0200 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-20 16:27:02 +0200 |
commit | 645ec232114a3149bf4d501550de54406e63b907 (patch) | |
tree | 8482c4d6931c1d0f308ad4ed2ea23a4bcf4e4f01 | |
parent | Crawler: Add basic html parsing and link-following (diff) | |
download | OSSE-645ec232114a3149bf4d501550de54406e63b907.tar.gz OSSE-645ec232114a3149bf4d501550de54406e63b907.tar.bz2 OSSE-645ec232114a3149bf4d501550de54406e63b907.zip |
Crawler: Remove duplicate parsed urls
-rw-r--r-- | Cargo.lock | 16 | ||||
-rw-r--r-- | crawler/Cargo.toml | 1 | ||||
-rw-r--r-- | crawler/src/main.rs | 3 |
3 files changed, 20 insertions, 0 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 7dfc67a..91830a3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -87,6 +87,7 @@ name = "crawler"
 version = "0.1.0"
 dependencies = [
  "blockingqueue",
+ "itertools",
  "reqwest",
  "scraper",
 ]
@@ -153,6 +154,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591"
 
 [[package]]
+name = "either"
+version = "1.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797"
+
+[[package]]
 name = "encoding_rs"
 version = "0.8.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -457,6 +464,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "879d54834c8c76457ef4293a689b2a8c59b076067ad77b15efafbb05f92a592b"
 
 [[package]]
+name = "itertools"
+version = "0.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
+dependencies = [
+ "either",
+]
+
+[[package]]
 name = "itoa"
 version = "0.4.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/crawler/Cargo.toml b/crawler/Cargo.toml
index 3f03217..2779421 100644
--- a/crawler/Cargo.toml
+++ b/crawler/Cargo.toml
@@ -9,6 +9,7 @@ edition = "2021"
 blockingqueue = "0.1.1"
 reqwest = {version = "0.11", features = ["blocking"]}
 scraper = "0.12.0"
+itertools = "0.10.5"
 
 [[bin]]
 name = "crawler"
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 6067ac9..15abcaf 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -1,3 +1,5 @@
+use itertools::Itertools;
+
 fn main() {
     println!("Hello, world! Im the crawler!");
 
@@ -59,6 +61,7 @@ fn crawl_url(url: &str) -> (String, Vec<String>) {
     let next_urls = document
         .select(&link_selector)
         .filter_map(|link| link.value().attr("href"))
+        .unique()
         .map(String::from)
         .collect();
 