| author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-30 19:22:31 +0100 |
| --- | --- | --- |
| committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-30 19:22:38 +0100 |
| commit | 6be7f36a0d44f3007ec9ede828c44168eac1054e (patch) | |
| tree | 3e1c4ee578f46f04d62270c50ca0a8143b122c03 | |
| parent | Crawler: Accept max_queue_size as an argument for crawler() (diff) | |
| download | OSSE-6be7f36a0d44f3007ec9ede828c44168eac1054e.tar.gz, OSSE-6be7f36a0d44f3007ec9ede828c44168eac1054e.tar.bz2, OSSE-6be7f36a0d44f3007ec9ede828c44168eac1054e.zip | |
Crawler: Set 4 as the maximum "crawl depth"
It's not really crawl depth, as we just count the path segments: a URL like https://example.com/a/b/c/d/e has five segments and is filtered out even if it sits only one link away from a seed page.
-rw-r--r-- | crawler/src/main.rs | 1 |
1 file changed, 1 insertion, 0 deletions
```diff
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 263f67d..d7a19a4 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -99,6 +99,7 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin
             u if !(u.scheme() == "http" || u.scheme() == "https") => false,
             u if u.fragment().is_some() => false, //no # urls
             u if u.query().is_some() => false, //no ? urls
+            u if u.path_segments().is_some() && u.path_segments().unwrap().count() > 4 => false, // max "crawling depth" is 4
             u if *u == url => false, //no same url
             _ => true,
         };
```
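To make the new guard concrete, here is a minimal, self-contained sketch of the same check, assuming the `url` crate (where the matched `scheme()`, `fragment()`, and `path_segments()` methods come from); the `MAX_CRAWL_DEPTH` constant and `within_depth` helper are illustrative names, not identifiers from the OSSE crawler:

```rust
// Minimal sketch of the segment-count guard added in this commit.
// Assumes the `url` crate; MAX_CRAWL_DEPTH and within_depth are
// illustrative names, not identifiers from the OSSE source.
use url::Url;

const MAX_CRAWL_DEPTH: usize = 4;

fn within_depth(u: &Url) -> bool {
    // path_segments() is None for cannot-be-a-base URLs (e.g. "mailto:");
    // for ordinary http(s) URLs it splits the path on '/', so
    // "https://example.com/a/b/c" yields ["a", "b", "c"].
    match u.path_segments() {
        Some(segments) => segments.count() <= MAX_CRAWL_DEPTH,
        // The diff only rejects when segments exist AND exceed the
        // limit, so URLs without path segments pass through.
        None => true,
    }
}

fn main() {
    let shallow = Url::parse("https://example.com/a/b").unwrap();
    let deep = Url::parse("https://example.com/a/b/c/d/e").unwrap();
    assert!(within_depth(&shallow)); // 2 segments -> kept
    assert!(!within_depth(&deep));   // 5 segments -> filtered out
}
```

One caveat worth noting: counting segments is a cheap proxy for depth that keeps the crawl queue entries as plain strings, but `path_segments()` also yields the empty segment produced by a trailing slash, so https://example.com/a/b/c/d/ counts five segments and is rejected even though it names only four directories.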