author    | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-20 13:59:03 +0200
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-20 13:59:11 +0200
commit    | dd3f705e04f8a74c03ff7ea0f4fabdaf25c310ef (patch)
tree      | c11543992b607360fe5fde6fcf2cabe2bc3c9a4f /crawler
parent    | Crawler: Add skeleton crawler implementation (diff)
download  | OSSE-dd3f705e04f8a74c03ff7ea0f4fabdaf25c310ef.tar.gz
          | OSSE-dd3f705e04f8a74c03ff7ea0f4fabdaf25c310ef.tar.bz2
          | OSSE-dd3f705e04f8a74c03ff7ea0f4fabdaf25c310ef.zip
Crawler: Add basic html parsing and link-following
Extremely basic implementation. Still needs a max queue size, error handling, and formatting of parsed links.
Diffstat (limited to 'crawler')
-rw-r--r-- | crawler/Cargo.toml  |  2
-rw-r--r-- | crawler/src/main.rs | 43
2 files changed, 36 insertions, 9 deletions
diff --git a/crawler/Cargo.toml b/crawler/Cargo.toml
index ce2876a..3f03217 100644
--- a/crawler/Cargo.toml
+++ b/crawler/Cargo.toml
@@ -7,6 +7,8 @@ edition = "2021"
 
 [dependencies]
 blockingqueue = "0.1.1"
+reqwest = {version = "0.11", features = ["blocking"]}
+scraper = "0.12.0"
 
 [[bin]]
 name = "crawler"
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 946d929..6067ac9 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -1,5 +1,3 @@
-use blockingqueue::*;
-
 fn main() {
     println!("Hello, world! Im the crawler!");
 
@@ -13,10 +11,10 @@ fn main() {
 fn crawler(root_urls: Vec<&str>) {
     println!("Starting to crawl!");
 
-    //add root urls to queue
-    let crawling_queue: BlockingQueue<&str> = BlockingQueue::new();
+    //add root urls to queue - TODO: max q size
+    let crawling_queue: blockingqueue::BlockingQueue<String> = blockingqueue::BlockingQueue::new();
     for url in root_urls {
-        crawling_queue.push(url);
+        crawling_queue.push(String::from(url));
     }
 
     //and start crawling
@@ -25,7 +23,10 @@ fn crawler(root_urls: Vec<&str>) {
         //blocks
         let url = crawling_queue.pop();
 
-        let (content, crawled_urls) = crawl_url(url);
+        let (_content, crawled_urls) = crawl_url(url.as_str());
+
+        //println!("Content: {:?}", _content);
+        println!("Next urls: {:?}", crawled_urls);
 
         //push content to index
 
@@ -36,8 +37,32 @@ fn crawler(root_urls: Vec<&str>) {
 }
 
 //takes url, returns content and list of urls
-fn crawl_url(url: &str) -> (&str, Vec<&str>) {
-    println!("Crawling {:?}", "https://".to_owned() + url);
+fn crawl_url(url: &str) -> (String, Vec<String>) {
+    //return result
+    let url = "https://".to_owned() + url;
+
+    println!("Crawling {:?}", url);
+
+    let response_res = reqwest::blocking::get(url);
+    if response_res.is_err() {
+        return (String::from(""), Vec::<String>::new());
+    }
+    let response_text_res = response_res.unwrap().text();
+    if response_text_res.is_err() {
+        return (String::from(""), Vec::<String>::new());
+    }
+
+    let response_text = response_text_res.unwrap();
+    let document = scraper::Html::parse_document(response_text.as_str());
+
+    let link_selector = scraper::Selector::parse("a").unwrap();
+    let next_urls = document
+        .select(&link_selector)
+        .filter_map(|link| link.value().attr("href"))
+        .map(String::from)
+        .collect();
+
+    //todo: filter urls that point to bad stuff? or we do that at the beggining of craw_url. we probs need to return result
 
-    ("", vec![])
+    (response_text, next_urls)
 }
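The TODOs left in this commit (error handling, formatting of the parsed links, filtering unwanted URLs) could be tackled along the following lines. This is only a rough sketch, not part of the commit: it assumes the url crate would be added as a dependency next to reqwest and scraper, and it swaps the empty-tuple fallbacks for a Result return.

//hypothetical follow-up sketch, not part of this commit: crawl_url returning a Result
//and resolving relative hrefs against the page URL (assumes the `url` crate is added)
fn crawl_url(url: &str) -> Result<(String, Vec<String>), Box<dyn std::error::Error>> {
    let url = "https://".to_owned() + url;
    let base = url::Url::parse(&url)?;

    //propagate http/decoding errors instead of returning ("", vec![])
    let response_text = reqwest::blocking::get(url)?.text()?;
    let document = scraper::Html::parse_document(&response_text);

    let link_selector = scraper::Selector::parse("a").unwrap();
    let next_urls = document
        .select(&link_selector)
        .filter_map(|link| link.value().attr("href"))
        //resolve relative links ("/about", "page.html") against the page URL
        .filter_map(|href| base.join(href).ok())
        //keep only http(s) links, dropping mailto:, javascript:, etc.
        .filter(|u| u.scheme() == "http" || u.scheme() == "https")
        .map(|u| u.to_string())
        .collect();

    Ok((response_text, next_urls))
}

The loop in crawler() would then match on the Result (logging and skipping the URL on failure) instead of pushing empty pages to the index.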