author     Baitinq <manuelpalenzuelamerino@gmail.com>   2022-10-24 21:22:49 +0200
committer  Baitinq <manuelpalenzuelamerino@gmail.com>   2022-10-25 01:10:34 +0200
commit     096bc77066533f4b2b11f264e37ff0f9f90fc346 (patch)
tree       92f1fcea914736a09c46e2b2077912f86a9ee653
parent     Crawler: Add crawled url filter (diff)
download   OSSE-096bc77066533f4b2b11f264e37ff0f9f90fc346.tar.gz
           OSSE-096bc77066533f4b2b11f264e37ff0f9f90fc346.tar.bz2
           OSSE-096bc77066533f4b2b11f264e37ff0f9f90fc346.zip
Crawler: Parse urls with the "url" crate
This fixes relative urls, makes url filtering and validation more robust, and brings several other smaller improvements.
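For reference, the joining behaviour this commit relies on can be sketched in isolation. The relative-link cases that the removed fixup_urls closure approximated with string prefixes ("//", "/") are exactly what url::Url::join resolves; the URLs below are invented examples, not taken from the crawler:

use url::Url;

fn main() {
    // Hypothetical page being crawled.
    let base = Url::parse("https://example.com/blog/post").unwrap();

    // Cases the removed fixup_urls closure handled with string prefixes:
    // join() resolves them against the base URL.
    assert_eq!(base.join("/about").unwrap().as_str(), "https://example.com/about");
    assert_eq!(base.join("//cdn.example.com/x.js").unwrap().as_str(), "https://cdn.example.com/x.js");
    assert_eq!(base.join("other").unwrap().as_str(), "https://example.com/blog/other");

    // The parsed Url also exposes the pieces the new valid_url closure checks.
    let u = base.join("page#section").unwrap();
    assert_eq!(u.scheme(), "https");
    assert!(u.fragment().is_some());
}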
-rw-r--r--   Cargo.lock            1
-rw-r--r--   crawler/Cargo.toml    1
-rw-r--r--   crawler/src/main.rs   49
3 files changed, 26 insertions, 25 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 723880c..9282a8a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -418,6 +418,7 @@ dependencies = [
  "scraper",
  "serde",
  "tokio 0.2.25",
+ "url",
 ]
 
 [[package]]
diff --git a/crawler/Cargo.toml b/crawler/Cargo.toml
index 1bf6bc9..2b93f53 100644
--- a/crawler/Cargo.toml
+++ b/crawler/Cargo.toml
@@ -12,6 +12,7 @@ itertools = "0.10.5"
 serde = { version = "1.0", features = ["derive"] }
 tokio = { version = "0.2.22", features = ["full"] }
 async-channel = "1.7.1"
+url = "*"
 
 [[bin]]
 name = "crawler"
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index f5ace75..dadf9ff 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -1,6 +1,7 @@
 use itertools::Itertools;
 use reqwest::blocking::{Client, Response};
 use serde::Serialize;
+use url::Url;
 
 #[tokio::main]
 async fn main() {
@@ -67,22 +68,25 @@ async fn crawler(http_client: Client, root_urls: Vec<&str>) {
 async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<String>), String> {
     dbg!("Crawling {:?}", url);
-    let response_res = http_client.get(url).send();
+    let url = Url::parse(url).unwrap();
+
+    let response_res = http_client.get(url.as_str()).send();
     if response_res.is_err() {
-        return Err("Error fetching ".to_owned() + url);
+        return Err("Error fetching ".to_owned() + url.as_str());
     }
 
     let response_text_res = response_res.unwrap().text();
     if response_text_res.is_err() {
-        return Err("Error unwrapping the fetched HTML's text (".to_owned() + url + ")");
+        return Err("Error unwrapping the fetched HTML's text (".to_owned() + url.as_str() + ")");
     }
 
     let response_text = response_text_res.unwrap();
 
     let document = scraper::Html::parse_document(response_text.as_str());
 
-    let valid_url = |url: &&str| match url {
-        u if *u == "/" => false,
-        u if u.starts_with("javascript:") => false, //generalise to be everything b4 : not equals http or https
-        u if u.starts_with('#') => false,
+    let valid_url = |check_url: &Url| match check_url {
+        u if !(u.scheme() == "http" || u.scheme() == "https") => false,
+        u if u.fragment().is_some() => false, //no # urls
+        u if u.query().is_some() => false, //no ? urls
+        u if *u == url => false, //no same url
         _ => true,
     };
@@ -91,33 +95,28 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin
         .select(&link_selector)
         .filter_map(|link| link.value().attr("href"))
         .unique()
+        .map(|u| url.join(u))
+        .filter(Result::is_ok)
+        .map(Result::unwrap)
         .filter(valid_url)
         .take(2)
         .map(String::from)
         .collect();
 
-    let fixup_urls = |us: Vec<String>| {
-        us.iter()
-            .map(|u| {
-                //https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute
-                if u.starts_with("//") {
-                    format!("https:{}", &u)
-                } else if u.starts_with('/') {
-                    format!("{}{}", &url, &u)
-                } else {
-                    u.to_string()
-                }
-            })
-            .collect()
-    };
-
-    let next_urls = fixup_urls(next_urls);
     //normalise words somewhere
-    //fuzzy?
+    //fuzzy? - iterate over keys
     //probs lots of places where we can borrow or not do stupid stuff
     //search for phrases?
     //why multiple '/' at the end of sites"
-
+    //http workings lagging behind crawler, what to do?
+    //group responses and transmit them in an array of 10 or smth -> or maybe just lower q size
+    //use structs in database indexer
+    //we need words priority or word list or smth (or in value of database show number of occurance or just val of importance of occurances)
+    //i dont understand dbg! (how to print {})
+    //is there empty urls?
+    //do proper matches instead of unwraps
+
+    println!("Returning next urls, {:?}", next_urls);
 
     Ok((response_text, next_urls))
 }
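Taken together, the new extraction pipeline joins every scraped href against the page URL and then keeps only plain http/https links. A self-contained sketch of that filtering step, with invented hrefs (not the crawler's actual input):

use url::Url;

fn main() {
    // Hypothetical page URL and hrefs scraped from it.
    let base = Url::parse("https://example.com/index.html").unwrap();
    let hrefs = ["/docs", "#top", "mailto:hi@example.com", "next?page=2", "https://example.com/index.html"];

    // Same checks as the commit's valid_url closure: http(s) only,
    // no fragment, no query string, and not the page itself.
    let valid_url = |u: &Url| {
        (u.scheme() == "http" || u.scheme() == "https")
            && u.fragment().is_none()
            && u.query().is_none()
            && *u != base
    };

    let next_urls: Vec<String> = hrefs
        .iter()
        .filter_map(|h| base.join(h).ok()) // drop hrefs that fail to resolve
        .filter(valid_url)
        .map(|u| u.to_string()) // the commit uses .map(String::from) for this step
        .collect();

    println!("{:?}", next_urls); // ["https://example.com/docs"]
}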