about summary refs log tree commit diff
path: root/crawler
diff options
context:
space:
mode:
author Baitinq <manuelpalenzuelamerino@gmail.com> 2022-10-23 18:53:49 +0200
committer Baitinq <manuelpalenzuelamerino@gmail.com> 2022-10-23 18:53:51 +0200
commit 096ef9e2a2f21281f1b516b2de420f04df1db56e (patch)
tree fea725991fe073b935a307a1a01d121b7ed30841 /crawler
parent Crawler: Replace println! with dbg! (diff)
download OSSE-096ef9e2a2f21281f1b516b2de420f04df1db56e.tar.gz
OSSE-096ef9e2a2f21281f1b516b2de420f04df1db56e.tar.bz2
OSSE-096ef9e2a2f21281f1b516b2de420f04df1db56e.zip
Crawler+Indexer: Rust cleanup
Getting more familiar with the language, so fixed some non-optimal
into_iter() usage, unnecessary .clone()s, and an unnecessary hack where we
could just get a &mut for inserting into the indexer URL database.
Diffstat (limited to 'crawler')
-rw-r--r-- crawler/src/main.rs 5
1 file changed, 2 insertions, 3 deletions
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 6161578..e8efe77 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -90,7 +90,7 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin
 
     //we need to not append http if already has it
     let fixup_urls = |us: Vec<String>| {
-        us.into_iter()
+        us.iter()
             .map(|u| {
                 //https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute
                 if u.starts_with("//") {
@@ -98,14 +98,13 @@ async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<Strin
                 } else if u.starts_with('/') {
                     format!("{}{}", &url, &u)
                 } else {
-                    u
+                    u.to_string()
                 }
             })
             .collect()
     };
 
     let next_urls = fixup_urls(next_urls);
-    //limit to 2 or smth for ram? or depth
     //normalise words somewhere
     //fuzzy?
     //probs lots of places where we can borrow or not do stupid stuff