author     Baitinq <manuelpalenzuelamerino@gmail.com>   2022-10-21 12:11:35 +0200
committer  Baitinq <manuelpalenzuelamerino@gmail.com>   2022-10-21 12:11:35 +0200
commit     9d2d5b9c9eb0a23917509c36cfddd740b6723837
tree       0b85230e69c4e4b834eb14786ac913d26937df4b
parent     Indexer: Add skeleton http rest endpoint functionality
Crawler: Add basic indexer communication
-rw-r--r--  crawler/Cargo.toml  |  3
-rw-r--r--  crawler/src/main.rs | 56
2 files changed, 48 insertions, 11 deletions
diff --git a/crawler/Cargo.toml b/crawler/Cargo.toml
index 2779421..486729a 100644
--- a/crawler/Cargo.toml
+++ b/crawler/Cargo.toml
@@ -7,9 +7,10 @@ edition = "2021"
 
 [dependencies]
 blockingqueue = "0.1.1"
-reqwest = {version = "0.11", features = ["blocking"]}
+reqwest = {version = "0.11", features = ["blocking", "json"]}
 scraper = "0.12.0"
 itertools = "0.10.5"
+serde = { version = "1.0", features = ["derive"] }
 
 [[bin]]
 name = "crawler"
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 2ffb1c7..c086a76 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -1,4 +1,6 @@
 use itertools::Itertools;
+use reqwest::blocking::{Client, Response};
+use serde::Serialize;
 
 fn main() {
     println!("Hello, world! Im the crawler!");
@@ -6,11 +8,13 @@ fn main() {
     let root_urls = include_str!("../top-1000-websites.txt");
     let root_urls = root_urls.split('\n').collect();
 
-    crawler(root_urls);
+    let http_client = reqwest::blocking::Client::new();
+
+    crawler(&http_client, root_urls);
 }
 
 //takes list of strings - multithread here?
-fn crawler(root_urls: Vec<&str>) {
+fn crawler(http_client: &Client, root_urls: Vec<&str>) {
     println!("Starting to crawl!");
 
     //add root urls to queue - TODO: max q size
@@ -25,19 +29,28 @@
         //blocks
         let url = crawling_queue.pop();
 
-        let crawl_res = crawl_url(url.as_str());
+        let crawl_res = crawl_url(http_client, url.as_str());
         if crawl_res.is_err() {
             println!("Error crawling {}", url);
             continue;
         }
 
-        let (_content, crawled_urls) = crawl_res.unwrap();
+        let (content, crawled_urls) = crawl_res.unwrap();
 
-        //println!("Content: {:?}", _content);
+        //println!("Content: {:?}", content);
         println!("Next urls: {:?}", crawled_urls);
 
         //push content to index
-        _ = push_crawl_entry_to_indexer(url, _content);
+        let indexer_res = push_crawl_entry_to_indexer(
+            http_client,
+            String::from("http://127.0.0.1:4444/resource"),
+            url,
+            content,
+        )
+        .unwrap()
+        .text();
+
+        println!("Pushed to indexer {:?}", &indexer_res);
 
         for url in crawled_urls {
             crawling_queue.push(url);
@@ -45,12 +58,12 @@
     }
 }
 
-fn crawl_url(url: &str) -> Result<(String, Vec<String>), String> {
+fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<String>), String> {
     let url = "https://".to_owned() + url;
 
     println!("Crawling {:?}", url);
 
-    let response_res = reqwest::blocking::get(&url);
+    let response_res = http_client.get(&url).send();
     if response_res.is_err() {
         return Err("Error fetching ".to_owned() + &url);
     }
@@ -90,6 +103,29 @@ fn crawl_url(url: &str) -> Result<(String, Vec<String>), String> {
 
     Ok((response_text, next_urls))
 }
 
-fn push_crawl_entry_to_indexer(_url: String, _content: String) -> Result<(), ()> {
-    Ok(())
+fn push_crawl_entry_to_indexer(
+    http_client: &Client,
+    indexer_url: String,
+    url: String,
+    content: String,
+) -> Result<Response, String> {
+    println!("Pushin to indexer");
+
+    #[derive(Serialize, Debug)]
+    struct Resource {
+        url: String,
+        content: String,
+    }
+
+    let request_body = Resource { url, content };
+
+    let response_res = http_client.post(&indexer_url).json(&request_body).send();
+    if response_res.is_err() {
+        return Err(format!(
+            "Error pushing the crawler to indexer! {:?}",
+            &indexer_url
+        ));
+    }
+
+    Ok(response_res.unwrap())
 }
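For context, this commit makes the crawler POST a JSON body of the form `{ "url": ..., "content": ... }` to the indexer's `/resource` endpoint on `127.0.0.1:4444`. The sketch below shows what a matching receiving handler could look like; it is an illustration only, assuming an actix-web server and a hypothetical `add_resource` handler, and is not the indexer's actual implementation (the parent commit holds the real skeleton).

```rust
// Hypothetical counterpart to this commit: an actix-web handler accepting the
// JSON body the crawler now POSTs. The framework, handler name, and port are
// assumptions drawn from the diff, not the indexer's real code.
use actix_web::{post, web, App, HttpResponse, HttpServer, Responder};
use serde::Deserialize;

// Mirrors the crawler's `Resource` struct: { "url": ..., "content": ... }.
#[derive(Deserialize, Debug)]
struct Resource {
    url: String,
    content: String,
}

// Accept the crawler's POST and acknowledge it; a real indexer would index the content.
#[post("/resource")]
async fn add_resource(resource: web::Json<Resource>) -> impl Responder {
    println!(
        "Received {} bytes of content for {}",
        resource.content.len(),
        resource.url
    );
    HttpResponse::Ok().body(format!("indexed {}", resource.url))
}

#[actix_web::main]
async fn main() -> std::io::Result<()> {
    // Listen where the crawler pushes to: http://127.0.0.1:4444/resource
    HttpServer::new(|| App::new().service(add_resource))
        .bind(("127.0.0.1", 4444))?
        .run()
        .await
}
```

With a server like this running, the same endpoint can also be exercised by hand, e.g. `curl -X POST 127.0.0.1:4444/resource -H 'Content-Type: application/json' -d '{"url":"example.com","content":"hello"}'`.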