about summary refs log tree commit diff
diff options
context:
space:
mode:
author: Baitinq <manuelpalenzuelamerino@gmail.com> 2022-10-30 19:22:31 +0100
committer: Baitinq <manuelpalenzuelamerino@gmail.com> 2022-10-30 19:22:38 +0100
commit: 6be7f36a0d44f3007ec9ede828c44168eac1054e (patch)
tree: 3e1c4ee578f46f04d62270c50ca0a8143b122c03
parent: Crawler: Accept max_queue_size as an argument for crawler() (diff)
download: OSSE-6be7f36a0d44f3007ec9ede828c44168eac1054e.tar.gz
OSSE-6be7f36a0d44f3007ec9ede828c44168eac1054e.tar.bz2
OSSE-6be7f36a0d44f3007ec9ede828c44168eac1054e.zip
Crawler: Set 4 as the maximum "crawl depth"
It's not really crawl depth, as we just count the path segments.
-rw-r--r-- crawler/src/main.rs 1
1 file changed, 1 insertion, 0 deletions
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 263f67d..d7a19a4 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -99,6 +99,7 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin
         u if !(u.scheme() == "http" || u.scheme() == "https") => false,
         u if u.fragment().is_some() => false, //no # urls
         u if u.query().is_some() => false,    //no ? urls
+        u if u.path_segments().is_some() && u.path_segments().unwrap().count() > 4 => false, // max "crawling depth" is 4
         u if *u == url => false,              //no same url
         _ => true,
     };