author    Baitinq <manuelpalenzuelamerino@gmail.com>  2022-10-21 12:11:35 +0200
committer Baitinq <manuelpalenzuelamerino@gmail.com>  2022-10-21 12:11:35 +0200
commit    9d2d5b9c9eb0a23917509c36cfddd740b6723837 (patch)
tree      0b85230e69c4e4b834eb14786ac913d26937df4b
parent    Indexer: Add skeleton http rest endpoint functionality (diff)
Crawler: Add basic indexer communication
-rw-r--r--  crawler/Cargo.toml   |  3
-rw-r--r--  crawler/src/main.rs  | 56
2 files changed, 48 insertions, 11 deletions
diff --git a/crawler/Cargo.toml b/crawler/Cargo.toml
index 2779421..486729a 100644
--- a/crawler/Cargo.toml
+++ b/crawler/Cargo.toml
@@ -7,9 +7,10 @@ edition = "2021"
 
 [dependencies]
 blockingqueue = "0.1.1"
-reqwest = {version = "0.11", features = ["blocking"]}
+reqwest = {version = "0.11", features = ["blocking", "json"]}
 scraper = "0.12.0"
 itertools = "0.10.5"
+serde = { version = "1.0", features = ["derive"] }
 
 [[bin]]
 name = "crawler"
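The two dependency changes work together: serde's "derive" feature generates Serialize impls at compile time, and reqwest's "json" feature adds RequestBuilder::json(), which serializes a value into a JSON request body and sets the Content-Type: application/json header. A minimal sketch of the pattern the main.rs changes below rely on (the struct and endpoint here are illustrative, not part of the commit):

use serde::Serialize;

// Illustrative payload type; mirrors the shape of the Resource struct
// introduced in main.rs below.
#[derive(Serialize)]
struct Example {
    url: String,
    content: String,
}

fn send(client: &reqwest::blocking::Client) -> reqwest::Result<reqwest::blocking::Response> {
    // .json() is only available with the "json" feature enabled above.
    client
        .post("http://127.0.0.1:4444/resource")
        .json(&Example {
            url: "example.com".to_string(),
            content: "<html></html>".to_string(),
        })
        .send()
}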
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 2ffb1c7..c086a76 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -1,4 +1,6 @@
 use itertools::Itertools;
+use reqwest::blocking::{Client, Response};
+use serde::Serialize;
 
 fn main() {
     println!("Hello, world! Im the crawler!");
@@ -6,11 +8,13 @@ fn main() {
     let root_urls = include_str!("../top-1000-websites.txt");
     let root_urls = root_urls.split('\n').collect();
 
-    crawler(root_urls);
+    let http_client = reqwest::blocking::Client::new();
+
+    crawler(&http_client, root_urls);
 }
 
 //takes list of strings - multithread here?
-fn crawler(root_urls: Vec<&str>) {
+fn crawler(http_client: &Client, root_urls: Vec<&str>) {
     println!("Starting to crawl!");
 
     //add root urls to queue - TODO: max q size
@@ -25,19 +29,28 @@ fn crawler(root_urls: Vec<&str>) {
         //blocks
         let url = crawling_queue.pop();
 
-        let crawl_res = crawl_url(url.as_str());
+        let crawl_res = crawl_url(http_client, url.as_str());
         if crawl_res.is_err() {
             println!("Error crawling {}", url);
             continue;
         }
 
-        let (_content, crawled_urls) = crawl_res.unwrap();
+        let (content, crawled_urls) = crawl_res.unwrap();
 
-        //println!("Content: {:?}", _content);
+        //println!("Content: {:?}", content);
         println!("Next urls: {:?}", crawled_urls);
 
         //push content to index
-        _ = push_crawl_entry_to_indexer(url, _content);
+        let indexer_res = push_crawl_entry_to_indexer(
+            http_client,
+            String::from("http://127.0.0.1:4444/resource"),
+            url,
+            content,
+        )
+        .unwrap()
+        .text();
+
+        println!("Pushed to indexer {:?}", &indexer_res);
 
         for url in crawled_urls {
             crawling_queue.push(url);
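Note: the .unwrap() in the hunk above aborts the whole crawler if the indexer at 127.0.0.1:4444 is unreachable or returns an error. A non-panicking alternative for the same loop body, as a sketch only (not part of this commit), using the same function signature introduced below:

// Drop-in replacement for the push above: log failures and keep crawling.
match push_crawl_entry_to_indexer(
    http_client,
    String::from("http://127.0.0.1:4444/resource"),
    url,
    content,
) {
    Ok(response) => println!("Pushed to indexer {:?}", response.text()),
    Err(e) => println!("{}", e),
}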
@@ -45,12 +58,12 @@ fn crawler(root_urls: Vec<&str>) {
     }
 }
 
-fn crawl_url(url: &str) -> Result<(String, Vec<String>), String> {
+fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<String>), String> {
     let url = "https://".to_owned() + url;
 
     println!("Crawling {:?}", url);
 
-    let response_res = reqwest::blocking::get(&url);
+    let response_res = http_client.get(&url).send();
     if response_res.is_err() {
         return Err("Error fetching ".to_owned() + &url);
     }
@@ -90,6 +103,29 @@ fn crawl_url(url: &str) -> Result<(String, Vec<String>), String> {
     Ok((response_text, next_urls))
 }
 
-fn push_crawl_entry_to_indexer(_url: String, _content: String) -> Result<(), ()> {
-    Ok(())
+fn push_crawl_entry_to_indexer(
+    http_client: &Client,
+    indexer_url: String,
+    url: String,
+    content: String,
+) -> Result<Response, String> {
+    println!("Pushin to indexer");
+
+    #[derive(Serialize, Debug)]
+    struct Resource {
+        url: String,
+        content: String,
+    }
+
+    let request_body = Resource { url, content };
+
+    let response_res = http_client.post(&indexer_url).json(&request_body).send();
+    if response_res.is_err() {
+        return Err(format!(
+            "Error pushing the crawl entry to the indexer! {:?}",
+            &indexer_url
+        ));
+    }
+
+    Ok(response_res.unwrap())
 }
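For context, the receiving side is not part of this commit; the parent commit only added a skeleton HTTP endpoint to the indexer, and its implementation is not shown here. A minimal sketch of an endpoint that could accept this payload, assuming actix-web purely for illustration (the framework, handler name, and binding are assumptions, not the repository's actual code):

use actix_web::{post, web, App, HttpResponse, HttpServer, Responder};
use serde::Deserialize;

// Hypothetical mirror of the crawler's Resource payload.
#[derive(Deserialize)]
struct Resource {
    url: String,
    content: String,
}

// Accepts the crawler's POST to /resource and acknowledges it.
#[post("/resource")]
async fn add_resource(resource: web::Json<Resource>) -> impl Responder {
    println!("Received {} ({} bytes)", resource.url, resource.content.len());
    HttpResponse::Ok().body("ok")
}

#[actix_web::main]
async fn main() -> std::io::Result<()> {
    HttpServer::new(|| App::new().service(add_resource))
        .bind(("127.0.0.1", 4444))?
        .run()
        .await
}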