author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-30 18:33:57 +0100 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-30 18:34:01 +0100 |
commit | c4d8cfda83197c1574da52e0aa39c9b1557e8e7c (patch) | |
tree | 1cfb2fecfc17950d1639577037d05519565e1d5d | |
parent | Frontend: Change navbar links (diff) | |
Crawler: Accept max_queue_size as an argument for crawler()
We now also size the queue to the maximum of the root URL list length and max_queue_size. This matters because, with the previous fixed capacity, the crawler would hang while seeding the queue if the root URL list had more entries than max_queue_size.
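A minimal, hypothetical sketch of that failure mode (not part of the commit; the URLs and capacities are made up). It uses try_send, the non-blocking counterpart of the crawler's send().await, so the old hang shows up as a Full error instead of an endless wait:

// Illustration only: why a bounded queue smaller than the seed list is a
// problem, and why max(max_queue_size, root_urls.len()) avoids it.
fn main() {
    let root_urls = vec!["https://a.example", "https://b.example", "https://c.example"];
    let max_queue_size = 2; // deliberately smaller than root_urls.len()

    // Old behaviour: capacity fixed at max_queue_size.
    let (tx, _rx) = async_channel::bounded::<String>(max_queue_size);
    for url in &root_urls {
        if tx.try_send(url.to_string()).is_err() {
            // With send().await and no consumer running yet, this is where
            // the crawler used to wait forever.
            println!("queue full at {url}");
        }
    }

    // New behaviour: the queue is never smaller than the seed list.
    let capacity = std::cmp::max(max_queue_size, root_urls.len());
    let (tx, _rx) = async_channel::bounded::<String>(capacity);
    for url in &root_urls {
        assert!(tx.try_send(url.to_string()).is_ok()); // every root URL fits up front
    }
}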
-rw-r--r-- | crawler/src/main.rs | 8 |
1 file changed, 5 insertions, 3 deletions
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index ce9943f..263f67d 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -11,17 +11,19 @@ async fn main() {
     let root_urls = include_str!("../top-1000-websites.txt");
     let root_urls = root_urls.split('\n').collect();
 
+    let max_queue_size = 2222;
+
     let http_client = reqwest::Client::new();
 
-    crawler(http_client, root_urls).await;
+    crawler(http_client, root_urls, max_queue_size).await;
 }
 
 //TODO: crawling depth? - async http client
-async fn crawler(http_client: Client, root_urls: Vec<&str>) {
+async fn crawler(http_client: Client, root_urls: Vec<&str>, max_queue_size: usize) {
     dbg!("Starting to crawl!");
 
     //add root urls to queue - TODO: max q size
-    let (tx_crawling_queue, rx_crawling_queue) = async_channel::bounded::<String>(2222);
+    let (tx_crawling_queue, rx_crawling_queue) = async_channel::bounded::<String>(std::cmp::max(max_queue_size, root_urls.len()));
     for url in root_urls {
         tx_crawling_queue.send(url.to_string()).await.unwrap();
     }
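For completeness, a hypothetical driver showing the pattern the diff arrives at: size the channel with std::cmp::max, seed every root URL, then let the crawl loop drain it. Nothing below exists in the repository; tokio is assumed only because the crate already uses async/await and reqwest.

#[tokio::main]
async fn main() {
    let root_urls: Vec<&str> = vec!["https://a.example", "https://b.example"];
    let max_queue_size = 2222; // same hard-coded value the commit uses in main()

    // The capacity can never be smaller than the seed list, so the sends
    // below complete even though no worker is receiving yet.
    let capacity = std::cmp::max(max_queue_size, root_urls.len());
    let (tx, rx) = async_channel::bounded::<String>(capacity);

    for url in root_urls {
        tx.send(url.to_string()).await.unwrap();
    }

    // Stand-in for the crawl loop: drain whatever was seeded.
    while let Ok(url) = rx.try_recv() {
        println!("would crawl {url}");
    }
}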