author     Baitinq <manuelpalenzuelamerino@gmail.com>  2022-11-01 21:16:21 +0100
committer  Baitinq <manuelpalenzuelamerino@gmail.com>  2022-11-02 17:25:26 +0100
commit     15e23646913ad816c758de86f858af4785700a00 (patch)
tree       47e28e8ccd4caf041c960a147a35c4dbf82f2656
parent     Frontend: Result component: Add more font related css (diff)
Indexer: Abstract indexer
We abstract an indexer's functionality into a trait (Indexer) and move
the indexer-specific code into the indexer_implementation.rs file.

I'm not sure whether hiding the indexer behind a trait object causes a
performance decrease; this should be investigated further (a statically
dispatched alternative is sketched after the diff below).
-rw-r--r--  indexer/src/indexer_implementation.rs  93
-rw-r--r--  indexer/src/main.rs                    100
2 files changed, 132 insertions(+), 61 deletions(-)
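
A minimal sketch (not part of this patch) of what the trait buys us: any type implementing Indexer can be dropped into AppState without touching the HTTP handlers. NoopIndexer here is a hypothetical stand-in, e.g. for exercising the HTTP layer in tests or benchmarks; it assumes the Indexer trait and IndexedResource are in scope as in the diff below.

use lib::lib::*;
use std::collections::HashSet;

//Hypothetical no-op indexer: accepts every insert and answers every
//search with an empty result set.
struct NoopIndexer;

impl Indexer for NoopIndexer {
    fn insert(
        &mut self,
        _word: &str,
        _url: &str,
        _title: &str,
        _description: &str,
        _content: &str,
        _fixed_words: &[String],
    ) -> Result<(), String> {
        Ok(()) //discard everything
    }

    fn search(&self, _term: &str) -> Result<HashSet<IndexedResource>, String> {
        Ok(HashSet::new()) //nothing ever matches
    }

    fn num_of_words(&self) -> usize {
        0
    }
}

Swapping it in would then be a one-line change in serve_http_endpoint:

    indexer: Mutex::new(Box::new(NoopIndexer)),
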
diff --git a/indexer/src/indexer_implementation.rs b/indexer/src/indexer_implementation.rs
new file mode 100644
index 0000000..4bb3857
--- /dev/null
+++ b/indexer/src/indexer_implementation.rs
@@ -0,0 +1,93 @@
+use lib::lib::*;
+use std::collections::{HashMap, HashSet};
+use std::sync::Arc;
+
+pub struct IndexerImplementation {
+    pub database: HashMap<String, HashSet<IndexedResource>>,
+}
+
+impl IndexerImplementation {
+    pub fn new() -> Self {
+        Self {
+            database: HashMap::new(),
+        }
+    }
+
+    fn search_word_in_db(&self, word: String) -> Option<&HashSet<IndexedResource>> {
+        self.database.get(&word)
+    }
+
+    fn calculate_word_priority(word: &str, _html_site: &str, words: &[String]) -> u32 {
+        //TODO: prioritize lower levels of url, prioritize word in url/title/description or main?
+
+        //atm priority is just the number of occurrences in the site.
+        words.iter().filter(|w| *w == word).count() as u32
+    }
+}
+
+impl crate::Indexer for IndexerImplementation {
+    fn insert(
+        &mut self,
+        word: &str,
+        url: &str,
+        title: &str,
+        description: &str,
+        content: &str,
+        fixed_words: &[String],
+    ) -> Result<(), String> {
+        let resource_to_add = IndexedResource {
+            url: url.to_string(),
+            priority: Self::calculate_word_priority(word, content, fixed_words),
+            word: Arc::new(word.to_string()),
+            title: title.to_string(),
+            description: description.to_string(),
+        };
+
+        match self.database.get_mut(word) {
+            Some(resources) => _ = resources.insert(resource_to_add),
+            None => {
+                _ = self
+                    .database
+                    .insert(word.to_string(), HashSet::from([resource_to_add]))
+            }
+        }
+
+        Ok(())
+    }
+
+    fn search(&self, term: &str) -> Result<HashSet<IndexedResource>, String> {
+        let query: Vec<&str> = term.split(' ').collect();
+
+        //keep only the results that contain every word of the query (set intersection)
+        let mut valid_results: Option<HashSet<IndexedResource>> = None;
+        for w in query {
+            //Normalise queries to lowercase
+            let w = w.to_ascii_lowercase();
+
+            let curr_word_results = match self.search_word_in_db(w.to_string()) {
+                None => return Ok(HashSet::new()), //I don't really like this
+                Some(curr_results) => curr_results,
+            };
+
+            match valid_results {
+                //Initialise valid_results
+                None => {
+                    valid_results = Some(curr_word_results.to_owned());
+                }
+                Some(results) => {
+                    let intersection: HashSet<IndexedResource> = curr_word_results
+                        .intersection(&results)
+                        .map(|s| s.to_owned())
+                        .collect();
+                    valid_results = Some(intersection);
+                }
+            }
+        }
+
+        Ok(valid_results.unwrap())
+    }
+
+    fn num_of_words(&self) -> usize {
+        self.database.len()
+    }
+}
diff --git a/indexer/src/main.rs b/indexer/src/main.rs
index 8b2e54d..289789c 100644
--- a/indexer/src/main.rs
+++ b/indexer/src/main.rs
@@ -1,12 +1,30 @@
+mod indexer_implementation;
+
 use actix_cors::Cors;
 use actix_web::{get, post, web, App, HttpServer, Responder};
+use indexer_implementation::IndexerImplementation;
 use kuchiki::traits::TendrilSink;
 use lib::lib::*;
-use std::collections::{HashMap, HashSet};
-use std::sync::{Arc, Mutex};
+use std::collections::HashSet;
+use std::sync::Mutex;
+
+pub trait Indexer {
+    //too many args?
+    fn insert(
+        &mut self,
+        word: &str,
+        url: &str,
+        title: &str,
+        description: &str,
+        content: &str,
+        fixed_words: &[String],
+    ) -> Result<(), String>;
+    fn search(&self, term: &str) -> Result<HashSet<IndexedResource>, String>;
+    fn num_of_words(&self) -> usize;
+}
 
 struct AppState {
-    database: Mutex<HashMap<String, HashSet<IndexedResource>>>,
+    indexer: Mutex<Box<dyn Indexer + Send + Sync>>,
 }
 
 #[actix_web::main]
@@ -18,7 +36,7 @@ async fn main() -> std::io::Result<()> {
 
 async fn serve_http_endpoint(address: &str, port: u16) -> std::io::Result<()> {
     let shared_state = web::Data::new(AppState {
-        database: Mutex::new(HashMap::new()),
+        indexer: Mutex::new(Box::new(IndexerImplementation::new())),
     });
     HttpServer::new(move || {
         let cors = Cors::permissive();
@@ -85,24 +103,23 @@ async fn add_resource(
         .collect();
 
     //and for each changed content word we add it to the db (word -> list.append(url))
-    let mut database = data.database.lock().unwrap();
+    let mut indexer = data.indexer.lock().unwrap();
     for word in &fixed_words {
-        let resource_to_add = IndexedResource {
-            url: resource.url.clone(),
-            priority: calculate_word_priority(word, resource.content.as_str(), &fixed_words),
-            word: Arc::new(word.clone()),
-            title: page_title.clone(),
-            description: page_description.clone(),
-        };
-
-        match database.get_mut(word) {
-            Some(resources) => _ = resources.insert(resource_to_add),
-            None => _ = database.insert(word.clone(), HashSet::from([resource_to_add])),
-        }
+        let _ = indexer.insert(
+            word,
+            &resource.url,
+            &page_title,
+            &page_description,
+            &resource.content,
+            &fixed_words,
+        );
     }
 
-    println!("Added resource! {:?}", database.len());
-    format!("{:?}", resource)
+    //TODO: ADD LANG? EN in meta tag (frontend)
+
+    println!("Added resource: {:?}", indexer.num_of_words());
+
+    format!("{resource:?}")
 }
 
 #[get("/search")]
@@ -112,48 +129,9 @@ async fn no_search(_data: web::Data<AppState>) -> impl Responder {
 
 #[get("/search/{term}")]
 async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Responder {
-    let query: Vec<&str> = term.split(' ').collect();
-    let database = data.database.lock().unwrap();
-
-    //percentage of valid words
-    let mut valid_results: Option<HashSet<IndexedResource>> = None;
-    for w in query {
-        //Normalise queries to lowercase
-        let w = w.to_ascii_lowercase();
-
-        let curr_word_results = match search_word_in_db(&database, w.to_string()) {
-            None => return "[]".to_string(),
-            Some(curr_results) => curr_results,
-        };
-
-        match valid_results {
-            //Initialise valid_results
-            None => {
-                valid_results = Some(curr_word_results.to_owned());
-            }
-            Some(results) => {
-                let intersection: HashSet<IndexedResource> = curr_word_results
-                    .intersection(&results)
-                    .map(|s| s.to_owned())
-                    .collect();
-                valid_results = Some(intersection);
-            }
-        }
-    }
-
-    serde_json::to_string(&valid_results.unwrap()).unwrap()
-}
-
-fn search_word_in_db(
-    db: &HashMap<String, HashSet<IndexedResource>>,
-    word: String,
-) -> Option<&HashSet<IndexedResource>> {
-    db.get(&word)
-}
+    let indexer = data.indexer.lock().unwrap();
 
-fn calculate_word_priority(word: &str, _html_site: &str, words: &[String]) -> u32 {
-    //TODO: priorize lower levels of url, priorize word in url/title/description or main?
+    let results = indexer.search(&term);
 
-    //atm priority is just the number of occurences in the site.
-    words.iter().filter(|w| *w == word).count() as u32
+    serde_json::to_string(&results.unwrap()).unwrap()
 }
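
On the performance question from the commit message: the cost of this abstraction is one vtable indirection per insert/search call through Box<dyn Indexer>. If profiling ever shows that to matter, a statically dispatched sketch (an assumption, not something this patch does) would make AppState generic over the indexer instead:

struct AppState<I: Indexer + Send + Sync> {
    indexer: Mutex<I>,
}

//construction then monomorphizes over the concrete type:
//let shared_state = web::Data::new(AppState {
//    indexer: Mutex::new(IndexerImplementation::new()),
//});

The trade-off is that every handler taking web::Data<AppState<I>> becomes generic too, so the boxed trait object stays the simpler choice until measurements say otherwise.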