From c4d8cfda83197c1574da52e0aa39c9b1557e8e7c Mon Sep 17 00:00:00 2001
From: Baitinq
Date: Sun, 30 Oct 2022 18:33:57 +0100
Subject: Crawler: Accept max_queue_size as an argument for crawler()

We also now set the queue capacity to the maximum of the root url list
length and max_queue_size. This is useful because the crawler would
previously hang if the root url list was changed to have more entries
than max_queue_size.
---
 crawler/src/main.rs | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index ce9943f..263f67d 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -11,17 +11,19 @@ async fn main() {
     let root_urls = include_str!("../top-1000-websites.txt");
     let root_urls = root_urls.split('\n').collect();
 
+    let max_queue_size = 2222;
+
     let http_client = reqwest::Client::new();
 
-    crawler(http_client, root_urls).await;
+    crawler(http_client, root_urls, max_queue_size).await;
 }
 
 //TODO: crawling depth? - async http client
-async fn crawler(http_client: Client, root_urls: Vec<&str>) {
+async fn crawler(http_client: Client, root_urls: Vec<&str>, max_queue_size: usize) {
     dbg!("Starting to crawl!");
 
     //add root urls to queue - TODO: max q size
-    let (tx_crawling_queue, rx_crawling_queue) = async_channel::bounded::<String>(2222);
+    let (tx_crawling_queue, rx_crawling_queue) = async_channel::bounded::<String>(std::cmp::max(max_queue_size, root_urls.len()));
     for url in root_urls {
         tx_crawling_queue.send(url.to_string()).await.unwrap();
     }
-- 
cgit 1.4.1
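
Editor's note on the motivation, for readers skimming the diff: the seeding
loop sends every root url into the bounded channel before anything consumes
from it, so a capacity smaller than the root url list blocks the loop forever.
The sketch below is illustrative only, not code from this repository; it
assumes the async-channel and tokio crates and uses made-up url values.

    // Illustrates the hang described in the commit message: a bounded
    // channel seeded with more items than its capacity, with no consumer
    // running until after the seeding loop finishes.
    #[tokio::main]
    async fn main() {
        let root_urls = vec!["https://a.example", "https://b.example", "https://c.example"];
        let max_queue_size = 2; // smaller than root_urls.len()

        // Pre-patch behaviour: capacity fixed at max_queue_size.
        // let (tx, rx) = async_channel::bounded::<String>(max_queue_size);

        // Post-patch behaviour: capacity can always hold every root url.
        let (tx, rx) =
            async_channel::bounded::<String>(std::cmp::max(max_queue_size, root_urls.len()));

        for url in root_urls {
            // With the pre-patch capacity the third send would await forever,
            // because the drain loop below is only reached once seeding ends.
            tx.send(url.to_string()).await.unwrap();
        }

        // Drain whatever was queued, standing in for the real crawl loop.
        while let Ok(url) = rx.try_recv() {
            println!("queued: {url}");
        }
    }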