about summary refs log tree commit diff
diff options
context:
space:
mode:
author: Baitinq <manuelpalenzuelamerino@gmail.com> 2022-11-11 01:30:14 +0100
committer: Baitinq <manuelpalenzuelamerino@gmail.com> 2022-11-11 01:30:14 +0100
commit: e8c26e0701efb683248199475aca5d9baec36c0b (patch)
tree: a545b0fdafc294842d54482e5352968aba985375
parent: Misc: Update Readme to be more user friendly (diff)
download: OSSE-e8c26e0701efb683248199475aca5d9baec36c0b.tar.gz
OSSE-e8c26e0701efb683248199475aca5d9baec36c0b.tar.bz2
OSSE-e8c26e0701efb683248199475aca5d9baec36c0b.zip
Indexer: Stem words prior to adding/searching them
-rw-r--r-- Cargo.lock 11
-rw-r--r-- indexer/Cargo.toml 1
-rw-r--r-- indexer/src/indexer_implementation.rs 16
3 files changed, 24 insertions, 4 deletions
diff --git a/Cargo.lock b/Cargo.lock
index c6c6e6f..5b9513e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1327,6 +1327,7 @@ dependencies = [
  "kuchiki",
  "lib",
  "log",
+ "rust-stemmers",
  "scraper",
  "serde",
  "serde_json",
@@ -2175,6 +2176,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "afab94fb28594581f62d981211a9a4d53cc8130bbcbbb89a0440d9b8e81a7746"
 
 [[package]]
+name = "rust-stemmers"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54"
+dependencies = [
+ "serde",
+ "serde_derive",
+]
+
+[[package]]
 name = "rustc_version"
 version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml
index b437edf..10b1f8b 100644
--- a/indexer/Cargo.toml
+++ b/indexer/Cargo.toml
@@ -16,6 +16,7 @@ serde_json = "1.0.87"
 kuchiki = "0.8.1"
 log = "0.4.17"
 env_logger = "0.9.1"
+rust-stemmers = "1.2.0"
 lib = { path = "../lib" }
 
 [[bin]]
diff --git a/indexer/src/indexer_implementation.rs b/indexer/src/indexer_implementation.rs
index f24c2bd..0789a28 100644
--- a/indexer/src/indexer_implementation.rs
+++ b/indexer/src/indexer_implementation.rs
@@ -1,15 +1,18 @@
 use lib::lib::*;
+use rust_stemmers::{Algorithm, Stemmer};
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
 
 pub struct IndexerImplementation {
     pub database: HashMap<String, HashSet<IndexedResource>>,
+    stemmer: Stemmer,
 }
 
 impl IndexerImplementation {
     pub fn new() -> Self {
         Self {
             database: HashMap::new(),
+            stemmer: Stemmer::create(Algorithm::English), //todo: depend on lang
         }
     }
 
@@ -20,6 +23,8 @@ impl IndexerImplementation {
     fn calculate_word_priority(word: &str, _html_site: &str, words: &[String]) -> u32 {
         //TODO: priorize lower levels of url, priorize word in url/title/description or main?
 
+        //TODO: levshtein
+
         //atm priority is just the number of occurences in the site.
         words.iter().filter(|w| *w == word).count() as u32
     }
@@ -38,19 +43,21 @@ impl crate::Indexer for IndexerImplementation {
         for word in words {
             let resource_to_add = IndexedResource {
                 url: url.to_string(),
-                priority: Self::calculate_word_priority(word, content, words),
+                priority: Self::calculate_word_priority(word, content, words), //we should take into account title, description lang etc
                 word: Arc::new(word.to_string()),
                 title: title.clone(),
                 description: description.clone(),
                 language: language.clone(),
             };
 
-            match self.database.get_mut(word) {
+            let stemmed_word = self.stemmer.stem(word).to_string();
+            log::debug!("Word: {}, Stemmed word: {}", word, stemmed_word);
+            match self.database.get_mut(&stemmed_word) {
                 Some(resources) => _ = resources.insert(resource_to_add),
                 None => {
                     _ = self
                         .database
-                        .insert(word.to_string(), HashSet::from([resource_to_add]))
+                        .insert(stemmed_word, HashSet::from([resource_to_add]))
                 }
             }
         }
@@ -67,7 +74,8 @@ impl crate::Indexer for IndexerImplementation {
             //Normalise queries to lowercase
             let w = w.to_ascii_lowercase();
 
-            let curr_word_results = match self.search_word_in_db(w.to_string()) {
+            let stemmed_word = self.stemmer.stem(&w).to_string();
+            let curr_word_results = match self.search_word_in_db(stemmed_word) {
                 None => return Ok(HashSet::new()), //I dont really like this
                 Some(curr_results) => curr_results,
             };