Diffstat
 crawler/src/main.rs | 43 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 9 deletions(-)
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 946d929..6067ac9 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -1,5 +1,3 @@
-use blockingqueue::*;
-
 fn main() {
     println!("Hello, world! Im the crawler!");
 
@@ -13,10 +11,10 @@ fn main() {
 fn crawler(root_urls: Vec<&str>) {
     println!("Starting to crawl!");
 
-    //add root urls to queue
-    let crawling_queue: BlockingQueue<&str> = BlockingQueue::new();
+    //add root urls to queue - TODO: max q size
+    let crawling_queue: blockingqueue::BlockingQueue<String> = blockingqueue::BlockingQueue::new();
     for url in root_urls {
-        crawling_queue.push(url);
+        crawling_queue.push(String::from(url));
     }
 
     //and start crawling
@@ -25,7 +23,10 @@ fn crawler(root_urls: Vec<&str>) {
         //blocks
         let url = crawling_queue.pop();
 
-        let (content, crawled_urls) = crawl_url(url);
+        let (_content, crawled_urls) = crawl_url(url.as_str());
+
+        //println!("Content: {:?}", _content);
+        println!("Next urls: {:?}", crawled_urls);
 
         //push content to index
 
@@ -36,8 +37,32 @@ fn crawler(root_urls: Vec<&str>) {
 }
 
 //takes url, returns content and list of urls
-fn crawl_url(url: &str) -> (&str, Vec<&str>) {
-    println!("Crawling {:?}", "https://".to_owned() + url);
+fn crawl_url(url: &str) -> (String, Vec<String>) {
+    //return result
+    let url = "https://".to_owned() + url;
+
+    println!("Crawling {:?}", url);
+
+    let response_res = reqwest::blocking::get(url);
+    if response_res.is_err() {
+        return (String::from(""), Vec::<String>::new());
+    }
+    let response_text_res = response_res.unwrap().text();
+    if response_text_res.is_err() {
+        return (String::from(""), Vec::<String>::new());
+    }
+
+    let response_text = response_text_res.unwrap();
+    let document = scraper::Html::parse_document(response_text.as_str());
+
+    let link_selector = scraper::Selector::parse("a").unwrap();
+    let next_urls = document
+        .select(&link_selector)
+        .filter_map(|link| link.value().attr("href"))
+        .map(String::from)
+        .collect();
+
+    //todo: filter urls that point to bad stuff? or we do that at the beginning of crawl_url. we probs need to return a result
 
-    ("", vec![])
+    (response_text, next_urls)
 }
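
The commit's own trailing todo says crawl_url should probably return a Result instead of mapping every failure to an empty tuple. A minimal sketch of that change, assuming the same reqwest and scraper calls as the diff; the Result signature and the Ok(...) wrapper are not part of the commit:

    // Sketch only: propagate errors with ? instead of the
    // is_err()/unwrap() pairs in the committed version.
    fn crawl_url(url: &str) -> Result<(String, Vec<String>), reqwest::Error> {
        let url = "https://".to_owned() + url;
        println!("Crawling {:?}", url);

        // ? covers both the request error and the body-read error
        let response_text = reqwest::blocking::get(url)?.text()?;
        let document = scraper::Html::parse_document(response_text.as_str());

        // "a" is a valid CSS selector, so this unwrap cannot fail
        let link_selector = scraper::Selector::parse("a").unwrap();
        let next_urls = document
            .select(&link_selector)
            .filter_map(|link| link.value().attr("href"))
            .map(String::from)
            .collect();

        Ok((response_text, next_urls))
    }

The crawl loop would then match on the outcome rather than trusting empty-string sentinels, e.g. if let Ok((_content, crawled_urls)) = crawl_url(url.as_str()) { ... }.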
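The other new comment carries a "TODO: max q size", which the blockingqueue crate used here does not offer. One hedged sketch, swapping in std's mpsc::sync_channel as a stand-in: its capacity bounds the queue (send blocks when the buffer is full), and recv blocks when it is empty, mirroring BlockingQueue::pop. QUEUE_CAP and bounded_crawler are hypothetical names, and 1024 is a placeholder value:

    use std::sync::mpsc;

    // Hypothetical bound; send blocks once QUEUE_CAP URLs are queued.
    const QUEUE_CAP: usize = 1024;

    fn bounded_crawler(root_urls: Vec<&str>) {
        let (queue_tx, queue_rx) = mpsc::sync_channel::<String>(QUEUE_CAP);
        for url in root_urls {
            queue_tx.send(String::from(url)).expect("queue closed");
        }
        loop {
            // blocks like crawling_queue.pop() in the diff
            let url = queue_rx.recv().expect("queue closed");
            println!("Crawling {:?}", url);
        }
    }

Since SyncSender is clonable, a clone of queue_tx could be handed to worker threads to push newly discovered URLs back onto the queue, which is the role crawling_queue plays in the committed code.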