author     Baitinq <manuelpalenzuelamerino@gmail.com>   2022-10-24 21:22:49 +0200
committer  Baitinq <manuelpalenzuelamerino@gmail.com>   2022-10-25 01:10:34 +0200
commit     096bc77066533f4b2b11f264e37ff0f9f90fc346 (patch)
tree       92f1fcea914736a09c46e2b2077912f86a9ee653
parent     Crawler: Add crawled url filter (diff)
download   OSSE-096bc77066533f4b2b11f264e37ff0f9f90fc346.tar.gz
           OSSE-096bc77066533f4b2b11f264e37ff0f9f90fc346.tar.bz2
           OSSE-096bc77066533f4b2b11f264e37ff0f9f90fc346.zip
Crawler: Parse urls with the "url" crate
This fixes relative urls, makes url filtering and validation more robust, and brings several other smaller improvements.
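For reference, the joining behaviour this commit relies on can be sketched in isolation. The relative-link cases that the removed fixup_urls closure approximated with string prefixes ("//", "/") are exactly what url::Url::join resolves; the URLs below are invented examples, not taken from the crawler:

use url::Url;

fn main() {
    // Hypothetical page being crawled.
    let base = Url::parse("https://example.com/blog/post").unwrap();

    // Cases the removed fixup_urls closure handled with string prefixes:
    // join() resolves them against the base URL.
    assert_eq!(base.join("/about").unwrap().as_str(), "https://example.com/about");
    assert_eq!(base.join("//cdn.example.com/x.js").unwrap().as_str(), "https://cdn.example.com/x.js");
    assert_eq!(base.join("other").unwrap().as_str(), "https://example.com/blog/other");

    // The parsed Url also exposes the pieces the new valid_url closure checks.
    let u = base.join("page#section").unwrap();
    assert_eq!(u.scheme(), "https");
    assert!(u.fragment().is_some());
}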
-rw-r--r--   Cargo.lock            1
-rw-r--r--   crawler/Cargo.toml    1
-rw-r--r--   crawler/src/main.rs   49
3 files changed, 26 insertions, 25 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 723880c..9282a8a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -418,6 +418,7 @@ dependencies = [
  "scraper",
  "serde",
  "tokio 0.2.25",
+ "url",
 ]
 
 [[package]]
diff --git a/crawler/Cargo.toml b/crawler/Cargo.toml
index 1bf6bc9..2b93f53 100644
--- a/crawler/Cargo.toml
+++ b/crawler/Cargo.toml
@@ -12,6 +12,7 @@ itertools = "0.10.5"
 serde = { version = "1.0", features = ["derive"] }
 tokio = { version = "0.2.22", features = ["full"] }
 async-channel = "1.7.1"
+url = "*"
 
 [[bin]]
 name = "crawler"
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index f5ace75..dadf9ff 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -1,6 +1,7 @@
 use itertools::Itertools;
 use reqwest::blocking::{Client, Response};
 use serde::Serialize;
+use url::Url;
 
 #[tokio::main]
 async fn main() {
@@ -67,22 +68,25 @@ async fn crawler(http_client: Client, root_urls: Vec<&str>) {
 async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<String>), String> {
     dbg!("Crawling {:?}", url);
-    let response_res = http_client.get(url).send();
+    let url = Url::parse(url).unwrap();
+
+    let response_res = http_client.get(url.as_str()).send();
     if response_res.is_err() {
-        return Err("Error fetching ".to_owned() + url);
+        return Err("Error fetching ".to_owned() + url.as_str());
     }
 
     let response_text_res = response_res.unwrap().text();
     if response_text_res.is_err() {
-        return Err("Error unwrapping the fetched HTML's text (".to_owned() + url + ")");
+        return Err("Error unwrapping the fetched HTML's text (".to_owned() + url.as_str() + ")");
     }
 
     let response_text = response_text_res.unwrap();
 
     let document = scraper::Html::parse_document(response_text.as_str());
 
-    let valid_url = |url: &&str| match url {
-        u if *u == "/" => false,
-        u if u.starts_with("javascript:") => false, //generalise to be everything b4 : not equals http or https
-        u if u.starts_with('#') => false,
+    let valid_url = |check_url: &Url| match check_url {
+        u if !(u.scheme() == "http" || u.scheme() == "https") => false,
+        u if u.fragment().is_some() => false, //no # urls
+        u if u.query().is_some() => false, //no ? urls
+        u if *u == url => false, //no same url
         _ => true,
     };
@@ -91,33 +95,28 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin
         .select(&link_selector)
         .filter_map(|link| link.value().attr("href"))
         .unique()
+        .map(|u| url.join(u))
+        .filter(Result::is_ok)
+        .map(Result::unwrap)
         .filter(valid_url)
         .take(2)
         .map(String::from)
         .collect();
 
-    let fixup_urls = |us: Vec<String>| {
-        us.iter()
-            .map(|u| {
-                //https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute
-                if u.starts_with("//") {
-                    format!("https:{}", &u)
-                } else if u.starts_with('/') {
-                    format!("{}{}", &url, &u)
-                } else {
-                    u.to_string()
-                }
-            })
-            .collect()
-    };
-
-    let next_urls = fixup_urls(next_urls);
     //normalise words somewhere
-    //fuzzy?
+    //fuzzy? - iterate over keys
     //probs lots of places where we can borrow or not do stupid stuff
     //search for phrases?
     //why multiple '/' at the end of sites"
-
+    //http workings lagging behind crawler, what to do?
+    //group responses and transmit them in an array of 10 or smth -> or maybe just lower q size
+    //use structs in database indexer
+    //we need words priority or word list or smth (or in value of database show number of occurance or just val of importance of occurances)
+    //i dont understand dbg! (how to print {})
+    //is there empty urls?
+    //do proper matches instead of unwraps
+
+    println!("Returning next urls, {:?}", next_urls);
 
     Ok((response_text, next_urls))
 }
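Taken together, the new extraction pipeline joins every scraped href against the page URL and then keeps only plain http/https links. A self-contained sketch of that filtering step, with invented hrefs (not the crawler's actual input):

use url::Url;

fn main() {
    // Hypothetical page URL and hrefs scraped from it.
    let base = Url::parse("https://example.com/index.html").unwrap();
    let hrefs = ["/docs", "#top", "mailto:hi@example.com", "next?page=2", "https://example.com/index.html"];

    // Same checks as the commit's valid_url closure: http(s) only,
    // no fragment, no query string, and not the page itself.
    let valid_url = |u: &Url| {
        (u.scheme() == "http" || u.scheme() == "https")
            && u.fragment().is_none()
            && u.query().is_none()
            && *u != base
    };

    let next_urls: Vec<String> = hrefs
        .iter()
        .filter_map(|h| base.join(h).ok()) // drop hrefs that fail to resolve
        .filter(valid_url)
        .map(|u| u.to_string()) // the commit uses .map(String::from) for this step
        .collect();

    println!("{:?}", next_urls); // ["https://example.com/docs"]
}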