about summary refs log tree commit diff
diff options
context:
space:
mode:
authorBaitinq <manuelpalenzuelamerino@gmail.com>2022-10-23 12:06:33 +0200
committerBaitinq <manuelpalenzuelamerino@gmail.com>2022-10-23 12:06:35 +0200
commit4445c4d168df1ab71431da9db1a053629ed4d0d9 (patch)
tree2c129a481d16ce252d0b634491220b90bd0d9886
parentCrawler: Change blockingqueue to channels (diff)
downloadOSSE-4445c4d168df1ab71431da9db1a053629ed4d0d9.tar.gz
OSSE-4445c4d168df1ab71431da9db1a053629ed4d0d9.tar.bz2
OSSE-4445c4d168df1ab71431da9db1a053629ed4d0d9.zip
Crawler: Only crawl 2 urls per url
This makes it so that we don't get rate-limited by websites.
-rw-r--r--crawler/src/main.rs6
1 file changed, 6 insertions, 0 deletions
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index f8dc226..d1333fe 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -86,9 +86,11 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin
         .select(&link_selector)
         .filter_map(|link| link.value().attr("href"))
         .unique()
+        .take(2)
         .map(String::from)
         .collect();
 
+    //we need to not append http if already has it
     let fixup_urls = |us: Vec<String>| {
         us.into_iter()
             .map(|u| {
@@ -105,6 +107,10 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin
     };
 
     let next_urls = fixup_urls(next_urls);
+    //limit to 2 or smth for ram? or depth
+    //normalise words somewhere
+    //fuzzy?
+    //probs lots of places where we can borrow or not do stupid stuff
 
     Ok((response_text, next_urls))
 }