author     Baitinq <manuelpalenzuelamerino@gmail.com>  2022-10-20 13:59:03 +0200
committer  Baitinq <manuelpalenzuelamerino@gmail.com>  2022-10-20 13:59:11 +0200
commit     dd3f705e04f8a74c03ff7ea0f4fabdaf25c310ef (patch)
tree       c11543992b607360fe5fde6fcf2cabe2bc3c9a4f /crawler
parent     Crawler: Add skeleton crawler implementation (diff)
download   OSSE-dd3f705e04f8a74c03ff7ea0f4fabdaf25c310ef.tar.gz
           OSSE-dd3f705e04f8a74c03ff7ea0f4fabdaf25c310ef.tar.bz2
           OSSE-dd3f705e04f8a74c03ff7ea0f4fabdaf25c310ef.zip
Crawler: Add basic html parsing and link-following
Extremely basic implementation. It still needs a max queue size, error
handling, and formatting of parsed links.
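
One of the listed TODOs, formatting of parsed links, stems from the fact that hrefs pulled out of <a> tags are often relative paths. A minimal sketch of how they could be resolved against the page URL, assuming the url crate were added as a dependency (not part of this commit; normalize_link is a hypothetical helper):

    // Sketch only: resolve a possibly-relative href against the page it was
    // found on, using the url crate (hypothetical addition, not in this commit).
    use url::Url;

    fn normalize_link(page_url: &str, href: &str) -> Option<String> {
        let base = Url::parse(page_url).ok()?;
        base.join(href).ok().map(|u| u.to_string())
    }

    // e.g. normalize_link("https://example.com/docs/", "../index.html")
    //      -> Some("https://example.com/index.html")
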
Diffstat (limited to 'crawler')
 -rw-r--r--  crawler/Cargo.toml  |  2 ++
 -rw-r--r--  crawler/src/main.rs | 43 ++++++++++++++++++++++++++++---------
 2 files changed, 36 insertions(+), 9 deletions(-)
diff --git a/crawler/Cargo.toml b/crawler/Cargo.toml
index ce2876a..3f03217 100644
--- a/crawler/Cargo.toml
+++ b/crawler/Cargo.toml
@@ -7,6 +7,8 @@ edition = "2021"
 
 [dependencies]
 blockingqueue = "0.1.1"
+reqwest = {version = "0.11", features = ["blocking"]}
+scraper = "0.12.0"
 
 [[bin]]
 name = "crawler"
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 946d929..6067ac9 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -1,5 +1,3 @@
-use blockingqueue::*;
-
 fn main() {
     println!("Hello, world! Im the crawler!");
 
@@ -13,10 +11,10 @@ fn main() {
 fn crawler(root_urls: Vec<&str>) {
     println!("Starting to crawl!");
 
-    //add root urls to queue
-    let crawling_queue: BlockingQueue<&str> = BlockingQueue::new();
+    //add root urls to queue - TODO: max q size
+    let crawling_queue: blockingqueue::BlockingQueue<String> = blockingqueue::BlockingQueue::new();
     for url in root_urls {
-        crawling_queue.push(url);
+        crawling_queue.push(String::from(url));
     }
 
     //and start crawling
@@ -25,7 +23,10 @@ fn crawler(root_urls: Vec<&str>) {
         //blocks
         let url = crawling_queue.pop();
 
-        let (content, crawled_urls) = crawl_url(url);
+        let (_content, crawled_urls) = crawl_url(url.as_str());
+
+        //println!("Content: {:?}", _content);
+        println!("Next urls: {:?}", crawled_urls);
 
         //push content to index
 
@@ -36,8 +37,32 @@ fn crawler(root_urls: Vec<&str>) {
 }
 
 //takes url, returns content and list of urls
-fn crawl_url(url: &str) -> (&str, Vec<&str>) {
-    println!("Crawling {:?}", "https://".to_owned() + url);
+fn crawl_url(url: &str) -> (String, Vec<String>) {
+    //return result
+    let url = "https://".to_owned() + url;
+
+    println!("Crawling {:?}", url);
+
+    let response_res = reqwest::blocking::get(url);
+    if response_res.is_err() {
+        return (String::from(""), Vec::<String>::new());
+    }
+    let response_text_res = response_res.unwrap().text();
+    if response_text_res.is_err() {
+        return (String::from(""), Vec::<String>::new());
+    }
+
+    let response_text = response_text_res.unwrap();
+    let document = scraper::Html::parse_document(response_text.as_str());
+
+    let link_selector = scraper::Selector::parse("a").unwrap();
+    let next_urls = document
+        .select(&link_selector)
+        .filter_map(|link| link.value().attr("href"))
+        .map(String::from)
+        .collect();
+
+    //todo: filter urls that point to bad stuff? or we do that at the beginning of crawl_url. we probably need to return a Result
 
-    ("", vec![])
+    (response_text, next_urls)
 }
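
The error handling mentioned in the commit message could later be addressed by letting crawl_url return a Result and propagating reqwest failures with the ? operator instead of returning empty values. A rough sketch of that direction (an assumption about follow-up work, not what this commit does):

    // Hypothetical follow-up: propagate request/body errors to the caller
    // rather than swallowing them and returning empty values.
    fn crawl_url(url: &str) -> Result<(String, Vec<String>), reqwest::Error> {
        let url = "https://".to_owned() + url;
        println!("Crawling {:?}", url);

        // Either the GET request or reading the body can fail; `?` forwards the error.
        let response_text = reqwest::blocking::get(url)?.text()?;

        let document = scraper::Html::parse_document(&response_text);
        let link_selector = scraper::Selector::parse("a").unwrap();
        let next_urls = document
            .select(&link_selector)
            .filter_map(|link| link.value().attr("href"))
            .map(String::from)
            .collect();

        Ok((response_text, next_urls))
    }
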