From 4a74cc4691fe11589e15040012988fd84a368cb5 Mon Sep 17 00:00:00 2001
From: Baitinq
Date: Tue, 25 Oct 2022 12:41:10 +0200
Subject: Indexer: Use CrawledResource structure as values in the reverse index db

This will allow us to integrate priorities and other improvements.
---
 Cargo.lock          |  1 +
 indexer/Cargo.toml  |  1 +
 indexer/src/main.rs | 54 ++++++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 45 insertions(+), 11 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index c10b6ae..8d50a21 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -984,6 +984,7 @@ version = "0.1.0"
 dependencies = [
  "actix-web",
  "html2text",
+ "rand 0.7.3",
  "scraper",
  "serde",
 ]
diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml
index c77dc8f..f86c656 100644
--- a/indexer/Cargo.toml
+++ b/indexer/Cargo.toml
@@ -10,6 +10,7 @@ actix-web = "*"
 serde = { version = "1.0", features = ["derive"] }
 scraper = "0.12.0"
 html2text = "0.4.3"
+rand = "0.7.3"
 
 [[bin]]
 name = "indexer"
diff --git a/indexer/src/main.rs b/indexer/src/main.rs
index 4d88f0a..ba4e70c 100644
--- a/indexer/src/main.rs
+++ b/indexer/src/main.rs
@@ -1,10 +1,33 @@
 use actix_web::{get, post, web, App, HttpServer, Responder};
+use rand::Rng;
 use serde::Deserialize;
 use std::collections::{HashMap, HashSet};
-use std::sync::Mutex;
+use std::hash::{Hash, Hasher};
+use std::sync::{Arc, Mutex};
+
+#[derive(Debug, Clone)]
+struct CrawledResource {
+    url: String,
+    priority: u32, //how do we even calculate this
+    word: Arc<String>,
+}
+
+//We implement PartialEq, Eq and Hash to ignore the priority field.
+impl PartialEq for CrawledResource {
+    fn eq(&self, other: &Self) -> bool {
+        self.url == other.url && self.word == other.word
+    }
+}
+impl Eq for CrawledResource {}
+impl Hash for CrawledResource {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.url.hash(state);
+        self.word.hash(state);
+    }
+}
 
 struct AppState {
-    database: Mutex<HashMap<String, HashSet<String>>>,
+    database: Mutex<HashMap<String, HashSet<CrawledResource>>>,
 }
 
 #[actix_web::main]
@@ -54,11 +77,15 @@ async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>)
     //and for each changed content word we add it to the db (word -> list.append(url))
     let mut database = data.database.lock().unwrap();
     for word in fixed_words {
-        //should probs do some priority
-        let maybe_urls = database.get_mut(&word);
-        match maybe_urls {
-            Some(urls) => _ = urls.insert(resource.url.clone()),
-            None => _ = database.insert(word, HashSet::from([resource.url.clone()])),
+        let resource_to_add = CrawledResource {
+            url: resource.url.clone(),
+            priority: calculate_word_priority(&word, resource.content.as_str()),
+            word: Arc::new(word.clone()),
+        };
+
+        match database.get_mut(&word) {
+            Some(resources) => _ = resources.insert(resource_to_add),
+            None => _ = database.insert(word.clone(), HashSet::from([resource_to_add])),
         }
     }
 
@@ -71,7 +98,7 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp
     let query: Vec<&str> = term.split(' ').collect();
     let database = data.database.lock().unwrap();
 
-    let mut valid_results: Option<HashSet<String>> = None;
+    let mut valid_results: Option<HashSet<CrawledResource>> = None;
     for w in query {
         let curr_word_results = match database.get(w) {
             None => return format!("No results found for {:?}!", w),
@@ -80,14 +107,14 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp
 
         match valid_results {
             None => {
-                valid_results = Some(curr_word_results.clone());
+                valid_results = Some(curr_word_results.to_owned());
             }
             Some(results) => {
-                let intersection: Vec<String> = curr_word_results
+                let intersection: HashSet<CrawledResource> = curr_word_results
                     .intersection(&results)
                     .map(|s| s.to_owned())
                     .collect();
-                valid_results = Some(HashSet::from_iter(intersection));
+                valid_results = Some(intersection);
             }
         }
     }
@@ -97,3 +124,8 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp
         valid_results.unwrap()
     )
 }
+
+//TODO!
+fn calculate_word_priority(_word: &str, _html_site: &str) -> u32 {
+    rand::thread_rng().gen::<u32>()
+}
-- 
cgit 1.4.1