author    Baitinq <[email protected]>  2022-10-20 01:53:38 +0200
committer Baitinq <[email protected]>  2022-10-20 01:57:07 +0200
commit    610d08aec18e08a2bfa64d4043c103d04490a6c5 (patch)
tree      45b28e6d6a862405ecfa3b24c12ebfc77b8902b3 /crawler/src
parent    Misc: Change to use "oxalica/rust-overlay" for the nix development shell (diff)
Crawler: Add skeleton crawler implementation
Starts by filling a queue with the top 1000 most-visited sites, "crawls"
each one (currently an empty fn), and blocks waiting for new elements on the queue.
Diffstat (limited to 'crawler/src')
-rw-r--r--  crawler/src/main.rs  |  40
1 file changed, 40 insertions, 0 deletions
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index b95ce69..946d929 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -1,3 +1,43 @@
+use blockingqueue::*;
+
 fn main() {
     println!("Hello, world! Im the crawler!");
+
+    let root_urls = include_str!("../top-1000-websites.txt");
+    let root_urls = root_urls.split('\n').collect();
+
+    crawler(root_urls);
+}
+
+//takes list of strings - multithread here?
+fn crawler(root_urls: Vec<&str>) {
+    println!("Starting to crawl!");
+
+    //add root urls to queue
+    let crawling_queue: BlockingQueue<&str> = BlockingQueue::new();
+    for url in root_urls {
+        crawling_queue.push(url);
+    }
+
+    //and start crawling
+    //FIXME: Async!
+    loop {
+        //blocks
+        let url = crawling_queue.pop();
+
+        let (content, crawled_urls) = crawl_url(url);
+
+        //push content to index
+
+        for url in crawled_urls {
+            crawling_queue.push(url);
+        }
+    }
+}
+
+//takes url, returns content and list of urls
+fn crawl_url(url: &str) -> (&str, Vec<&str>) {
+    println!("Crawling {:?}", "https://".to_owned() + url);
+
+    ("", vec![])
 }
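
The crawl_url stub above returns empty content and no links, and the comments leave indexing and async as TODOs. Purely as an illustration of where the stub could go (not part of this commit), below is a minimal sketch of a fetching-and-link-extraction version. It assumes the reqwest crate (with its "blocking" feature) and the scraper crate as dependencies, and it returns owned Strings instead of borrowed &strs so the fetched data can outlive the request.

// Hypothetical follow-up to the skeleton above, NOT part of this commit.
// Assumes reqwest = { features = ["blocking"] } and scraper as dependencies.
use scraper::{Html, Selector};

fn crawl_url(url: &str) -> (String, Vec<String>) {
    let url = "https://".to_owned() + url;
    println!("Crawling {:?}", url);

    // Fetch the page body; on any error, return nothing, like the current stub.
    let content = match reqwest::blocking::get(url.as_str()).and_then(|r| r.text()) {
        Ok(body) => body,
        Err(_) => return (String::new(), vec![]),
    };

    // Collect href attributes from <a> tags as candidate URLs to crawl next.
    let document = Html::parse_document(&content);
    let selector = Selector::parse("a").unwrap();
    let crawled_urls = document
        .select(&selector)
        .filter_map(|a| a.value().attr("href"))
        .map(str::to_owned)
        .collect();

    (content, crawled_urls)
}

With owned return values, the crawler loop would also need its queue to hold Strings rather than &strs (BlockingQueue<String>), since the extracted links no longer borrow from 'static data.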