Diffstat
 crawler/src/main.rs | 43 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 9 deletions(-)
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 946d929..6067ac9 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -1,5 +1,3 @@
-use blockingqueue::*;
-
 fn main() {
     println!("Hello, world! Im the crawler!");
 
@@ -13,10 +11,10 @@ fn main() {
 fn crawler(root_urls: Vec<&str>) {
     println!("Starting to crawl!");
 
-    //add root urls to queue
-    let crawling_queue: BlockingQueue<&str> = BlockingQueue::new();
+    //add root urls to queue - TODO: max q size
+    let crawling_queue: blockingqueue::BlockingQueue<String> = blockingqueue::BlockingQueue::new();
     for url in root_urls {
-        crawling_queue.push(url);
+        crawling_queue.push(String::from(url));
     }
 
     //and start crawling
@@ -25,7 +23,10 @@ fn crawler(root_urls: Vec<&str>) {
         //blocks
         let url = crawling_queue.pop();
 
-        let (content, crawled_urls) = crawl_url(url);
+        let (_content, crawled_urls) = crawl_url(url.as_str());
+
+        //println!("Content: {:?}", _content);
+        println!("Next urls: {:?}", crawled_urls);
 
         //push content to index
 
@@ -36,8 +37,32 @@ fn crawler(root_urls: Vec<&str>) {
 }
 
 //takes url, returns content and list of urls
-fn crawl_url(url: &str) -> (&str, Vec<&str>) {
-    println!("Crawling {:?}", "https://".to_owned() + url);
+fn crawl_url(url: &str) -> (String, Vec<String>) {
+    //return result
+    let url = "https://".to_owned() + url;
+
+    println!("Crawling {:?}", url);
+
+    let response_res = reqwest::blocking::get(url);
+    if response_res.is_err() {
+        return (String::from(""), Vec::<String>::new());
+    }
+    let response_text_res = response_res.unwrap().text();
+    if response_text_res.is_err() {
+        return (String::from(""), Vec::<String>::new());
+    }
+
+    let response_text = response_text_res.unwrap();
+    let document = scraper::Html::parse_document(response_text.as_str());
+
+    let link_selector = scraper::Selector::parse("a").unwrap();
+    let next_urls = document
+        .select(&link_selector)
+        .filter_map(|link| link.value().attr("href"))
+        .map(String::from)
+        .collect();
+
+    //todo: filter urls that point to bad stuff? or we do that at the beginning of crawl_url. we probs need to return a result
 
-    ("", vec![])
+    (response_text, next_urls)
 }
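
The commit's own trailing todo says crawl_url should probably return a Result instead of mapping every failure to an empty tuple. A minimal sketch of that change, assuming the same reqwest and scraper calls as the diff; the Result signature and the Ok(...) wrapper are not part of the commit:

    // Sketch only: propagate errors with ? instead of the
    // is_err()/unwrap() pairs in the committed version.
    fn crawl_url(url: &str) -> Result<(String, Vec<String>), reqwest::Error> {
        let url = "https://".to_owned() + url;
        println!("Crawling {:?}", url);

        // ? covers both the request error and the body-read error
        let response_text = reqwest::blocking::get(url)?.text()?;
        let document = scraper::Html::parse_document(response_text.as_str());

        // "a" is a valid CSS selector, so this unwrap cannot fail
        let link_selector = scraper::Selector::parse("a").unwrap();
        let next_urls = document
            .select(&link_selector)
            .filter_map(|link| link.value().attr("href"))
            .map(String::from)
            .collect();

        Ok((response_text, next_urls))
    }

The crawl loop would then match on the outcome rather than trusting empty-string sentinels, e.g. if let Ok((_content, crawled_urls)) = crawl_url(url.as_str()) { ... }.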
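The other new comment carries a "TODO: max q size", which the blockingqueue crate used here does not offer. One hedged sketch, swapping in std's mpsc::sync_channel as a stand-in: its capacity bounds the queue (send blocks when the buffer is full), and recv blocks when it is empty, mirroring BlockingQueue::pop. QUEUE_CAP and bounded_crawler are hypothetical names, and 1024 is a placeholder value:

    use std::sync::mpsc;

    // Hypothetical bound; send blocks once QUEUE_CAP URLs are queued.
    const QUEUE_CAP: usize = 1024;

    fn bounded_crawler(root_urls: Vec<&str>) {
        let (queue_tx, queue_rx) = mpsc::sync_channel::<String>(QUEUE_CAP);
        for url in root_urls {
            queue_tx.send(String::from(url)).expect("queue closed");
        }
        loop {
            // blocks like crawling_queue.pop() in the diff
            let url = queue_rx.recv().expect("queue closed");
            println!("Crawling {:?}", url);
        }
    }

Since SyncSender is clonable, a clone of queue_tx could be handed to worker threads to push newly discovered URLs back onto the queue, which is the role crawling_queue plays in the committed code.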