| | | |
|---|---|---|
| author | Baitinq <[email protected]> | 2022-10-20 01:53:38 +0200 |
| committer | Baitinq <[email protected]> | 2022-10-20 01:57:07 +0200 |
| commit | 610d08aec18e08a2bfa64d4043c103d04490a6c5 (patch) | |
| tree | 45b28e6d6a862405ecfa3b24c12ebfc77b8902b3 /crawler/src | |
| parent | Misc: Change to use "oxalica/rust-overlay" for the nix development shell (diff) | |
| download | OSSE-610d08aec18e08a2bfa64d4043c103d04490a6c5.tar.gz OSSE-610d08aec18e08a2bfa64d4043c103d04490a6c5.tar.bz2 OSSE-610d08aec18e08a2bfa64d4043c103d04490a6c5.zip | |
Crawler: Add skeleton crawler implementation
Starts by filling a queue with the top 1000 most visited sites, "crawls" each one (currently an empty fn), and blocks waiting for new elements on the queue.
Diffstat (limited to 'crawler/src')
| | | |
|---|---|---|
| -rw-r--r-- | crawler/src/main.rs | 40 |

1 file changed, 40 insertions, 0 deletions
```diff
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index b95ce69..946d929 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -1,3 +1,43 @@
+use blockingqueue::*;
+
 fn main() {
     println!("Hello, world! Im the crawler!");
+
+    let root_urls = include_str!("../top-1000-websites.txt");
+    let root_urls = root_urls.split('\n').collect();
+
+    crawler(root_urls);
+}
+
+//takes list of strings - multithread here?
+fn crawler(root_urls: Vec<&str>) {
+    println!("Starting to crawl!");
+
+    //add root urls to queue
+    let crawling_queue: BlockingQueue<&str> = BlockingQueue::new();
+    for url in root_urls {
+        crawling_queue.push(url);
+    }
+
+    //and start crawling
+    //FIXME: Async!
+    loop {
+        //blocks
+        let url = crawling_queue.pop();
+
+        let (content, crawled_urls) = crawl_url(url);
+
+        //push content to index
+
+        for url in crawled_urls {
+            crawling_queue.push(url);
+        }
+    }
+}
+
+//takes url, returns content and list of urls
+fn crawl_url(url: &str) -> (&str, Vec<&str>) {
+    println!("Crawling {:?}", "https://".to_owned() + url);
+
+    ("", vec![])
 }
```
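The two in-code notes ("multithread here?" and "FIXME: Async!") point at the obvious next step: running several crawl workers against the shared queue instead of one blocking loop. Below is a minimal sketch of that with plain threads, not the commit's actual plan. It assumes the blockingqueue crate's `BlockingQueue` is safe to share across threads through an `Arc` (its push/pop take `&self` in the diff above), switches the queue to owned `String`s so discovered URLs can outlive the root list, uses an arbitrary worker count of 4, and keeps `crawl_url` as a hypothetical stub mirroring the one in the commit.

```rust
use std::sync::Arc;
use std::thread;

use blockingqueue::BlockingQueue;

// Hypothetical stub standing in for the commit's empty crawl_url:
// returns the page body and any URLs discovered on it.
fn crawl_url(url: &str) -> (String, Vec<String>) {
    println!("Crawling {:?}", "https://".to_owned() + url);
    (String::new(), Vec::new())
}

fn main() {
    let root_urls = include_str!("../top-1000-websites.txt");

    // Shared work queue; an Arc gives every worker a handle to it.
    // Assumption: BlockingQueue<String> is Send + Sync (it is a thread-safe queue).
    let crawling_queue: Arc<BlockingQueue<String>> = Arc::new(BlockingQueue::new());

    // Seed the queue with the root URLs, as in the original crawler().
    for url in root_urls.split('\n') {
        crawling_queue.push(url.to_string());
    }

    // Spawn a small, fixed pool of workers instead of one blocking loop.
    let mut handles = Vec::new();
    for _ in 0..4 {
        let queue = Arc::clone(&crawling_queue);
        handles.push(thread::spawn(move || loop {
            let url = queue.pop(); // blocks until work is available

            let (_content, crawled_urls) = crawl_url(&url);
            // TODO: push _content to the index, as in the original loop.

            for url in crawled_urls {
                queue.push(url);
            }
        }));
    }

    // Workers never exit in this sketch, so these joins block forever,
    // mirroring the original infinite loop.
    for handle in handles {
        let _ = handle.join();
    }
}
```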
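The commit message also notes that `crawl_url` is an empty fn for now. Purely as an illustration of what could go in there (not what this repository necessarily ends up doing), here is one way to fill the stub using the reqwest crate's blocking client and a very naive `href` scan. The crate choice and the link-extraction logic are assumptions, and the return types are changed to owned `String`s so the result does not borrow from the temporary response body.

```rust
// Illustrative only: assumes reqwest with the "blocking" feature in Cargo.toml,
// e.g. reqwest = { version = "0.11", features = ["blocking"] }.

fn crawl_url(url: &str) -> (String, Vec<String>) {
    let full_url = "https://".to_owned() + url;
    println!("Crawling {:?}", full_url);

    // Fetch the page body; on any error just return an empty result,
    // matching the forgiving behaviour of the current stub.
    let content = match reqwest::blocking::get(full_url.as_str()).and_then(|r| r.text()) {
        Ok(body) => body,
        Err(_) => return (String::new(), Vec::new()),
    };

    // Extremely naive link extraction: take whatever follows href=" up to the
    // next quote. A real crawler would use an HTML parser instead.
    // Note: the commit's queue stores bare hostnames and prepends https:// itself,
    // so feeding these full URLs back into it would need reconciling.
    let mut crawled_urls = Vec::new();
    for chunk in content.split("href=\"").skip(1) {
        if let Some(end) = chunk.find('"') {
            let link = &chunk[..end];
            if link.starts_with("http") {
                crawled_urls.push(link.to_string());
            }
        }
    }

    (content, crawled_urls)
}
```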