use actix_cors::Cors; use actix_web::{get, post, web, App, HttpServer, Responder}; use rand::Rng; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::hash::{Hash, Hasher}; use std::sync::{Arc, Mutex}; #[derive(Debug, Clone, Serialize)] struct CrawledResource { url: String, title: String, description: String, priority: u32, word: Arc, } //We implement PartialEq, Eq and Hash to ignore the priority field. impl PartialEq for CrawledResource { fn eq(&self, other: &Self) -> bool { self.url == other.url && self.word == other.word } } impl Eq for CrawledResource {} impl Hash for CrawledResource { fn hash(&self, state: &mut H) { self.url.hash(state); self.word.hash(state); } } struct AppState { database: Mutex>>, } #[actix_web::main] async fn main() -> std::io::Result<()> { println!("Hello, world! Im the indexer!"); serve_http_endpoint("0.0.0.0", 4444).await } async fn serve_http_endpoint(address: &str, port: u16) -> std::io::Result<()> { let shared_state = web::Data::new(AppState { database: Mutex::new(HashMap::new()), }); HttpServer::new(move || { let cors = Cors::permissive(); App::new() .wrap(cors) .app_data(shared_state.clone()) .service(no_search) .service(search) .service(add_resource) }) .bind((address, port))? .run() .await } //we need to rename stuff #[derive(Deserialize, Debug)] struct Resource { url: String, content: String, } #[post("/resource")] async fn add_resource(data: web::Data, resource: web::Json) -> impl Responder { //parse content let document = scraper::Html::parse_document(resource.content.as_str()); let text = html2text::from_read(resource.content.as_str().as_bytes(), resource.content.len()); let split_words = text.split(' '); //fixup words (remove words with non alphabetic chars, empty words, transform to lowercase...) let fixed_words: Vec = split_words .filter(|w| !w.chars().any(|c| !c.is_ascii_alphabetic())) .filter(|w| !w.is_empty() && *w != " ") .map(|w| w.to_ascii_lowercase()) .collect(); println!("xd: {:?}", fixed_words); let title_selector = scraper::Selector::parse("title").unwrap(); let description_selector = scraper::Selector::parse("meta").unwrap(); let page_title: String = document .select(&title_selector) .map(|e| e.inner_html()) .take(1) .collect(); let page_description: String = document .select(&description_selector) .filter(|e| e.value().attr("name") == Some("description")) .filter_map(|e| e.value().attr("content")) .take(1) .collect(); //and for each changed content word we add it to the db (word -> list.append(url)) let mut database = data.database.lock().unwrap(); for word in fixed_words { let resource_to_add = CrawledResource { url: resource.url.clone(), priority: calculate_word_priority(&word, resource.content.as_str()), word: Arc::new(word.clone()), title: page_title.clone(), description: page_description.clone(), }; match database.get_mut(&word) { Some(resources) => _ = resources.insert(resource_to_add), None => _ = database.insert(word.clone(), HashSet::from([resource_to_add])), } } println!("Added resource! {:?}", database.len()); format!("{:?}", resource) } #[get("/search")] async fn no_search(_data: web::Data) -> impl Responder { "[]".to_string() } #[get("/search/{term}")] async fn search(data: web::Data, term: web::Path) -> impl Responder { let query: Vec<&str> = term.split(' ').collect(); let database = data.database.lock().unwrap(); //percentage of valid words let mut valid_results: Option> = None; for w in query { let curr_word_results = match search_word_in_db(&database, w.to_string()) { None => return "[]".to_string(), Some(curr_results) => curr_results, }; match valid_results { //Initialise valid_results None => { valid_results = Some(curr_word_results.to_owned()); } Some(results) => { let intersection: HashSet = curr_word_results .intersection(&results) .map(|s| s.to_owned()) .collect(); valid_results = Some(intersection); } } } serde_json::to_string(&valid_results.unwrap()).unwrap() } fn search_word_in_db( db: &HashMap>, word: String, ) -> Option<&HashSet> { db.get(&word) } //TODO! fn calculate_word_priority(_word: &str, _html_site: &str) -> u32 { rand::thread_rng().gen::() }