diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-23 12:06:33 +0200 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-23 12:06:35 +0200 |
commit | 4445c4d168df1ab71431da9db1a053629ed4d0d9 (patch) | |
tree | 2c129a481d16ce252d0b634491220b90bd0d9886 | |
parent | Crawler: Change blockingqueue to channels (diff) | |
download | OSSE-4445c4d168df1ab71431da9db1a053629ed4d0d9.tar.gz OSSE-4445c4d168df1ab71431da9db1a053629ed4d0d9.tar.bz2 OSSE-4445c4d168df1ab71431da9db1a053629ed4d0d9.zip |
Crawler: Only crawl 2 urls per url
This makes it so that we don't get rate-limited by websites.
-rw-r--r-- | crawler/src/main.rs | 6 |
1 file changed, 6 insertions, 0 deletions
diff --git a/crawler/src/main.rs b/crawler/src/main.rs index f8dc226..d1333fe 100644 --- a/crawler/src/main.rs +++ b/crawler/src/main.rs @@ -86,9 +86,11 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin .select(&link_selector) .filter_map(|link| link.value().attr("href")) .unique() + .take(2) .map(String::from) .collect(); + //we need to not append http if already has it let fixup_urls = |us: Vec<String>| { us.into_iter() .map(|u| { @@ -105,6 +107,10 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin }; let next_urls = fixup_urls(next_urls); + //limit to 2 or smth for ram? or depth + //normalise words somewhere + //fuzzy? + //probs lots of places where we can borrow or not do stupid stuff Ok((response_text, next_urls)) } |