diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-11-11 01:30:14 +0100 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-11-11 01:30:14 +0100 |
commit | e8c26e0701efb683248199475aca5d9baec36c0b (patch) | |
tree | a545b0fdafc294842d54482e5352968aba985375 | |
parent | Misc: Update Readme to be more user friendly (diff) | |
download | OSSE-e8c26e0701efb683248199475aca5d9baec36c0b.tar.gz OSSE-e8c26e0701efb683248199475aca5d9baec36c0b.tar.bz2 OSSE-e8c26e0701efb683248199475aca5d9baec36c0b.zip |
Indexer: Stem words prior to adding/searching them
-rw-r--r-- | Cargo.lock | 11 | ||||
-rw-r--r-- | indexer/Cargo.toml | 1 | ||||
-rw-r--r-- | indexer/src/indexer_implementation.rs | 16 |
3 files changed, 24 insertions, 4 deletions
```diff
diff --git a/Cargo.lock b/Cargo.lock
index c6c6e6f..5b9513e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1327,6 +1327,7 @@ dependencies = [
  "kuchiki",
  "lib",
  "log",
+ "rust-stemmers",
  "scraper",
  "serde",
  "serde_json",
@@ -2175,6 +2176,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "afab94fb28594581f62d981211a9a4d53cc8130bbcbbb89a0440d9b8e81a7746"
 
 [[package]]
+name = "rust-stemmers"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54"
+dependencies = [
+ "serde",
+ "serde_derive",
+]
+
+[[package]]
 name = "rustc_version"
 version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml
index b437edf..10b1f8b 100644
--- a/indexer/Cargo.toml
+++ b/indexer/Cargo.toml
@@ -16,6 +16,7 @@ serde_json = "1.0.87"
 kuchiki = "0.8.1"
 log = "0.4.17"
 env_logger = "0.9.1"
+rust-stemmers = "1.2.0"
 lib = { path = "../lib" }
 
 [[bin]]
diff --git a/indexer/src/indexer_implementation.rs b/indexer/src/indexer_implementation.rs
index f24c2bd..0789a28 100644
--- a/indexer/src/indexer_implementation.rs
+++ b/indexer/src/indexer_implementation.rs
@@ -1,15 +1,18 @@
 use lib::lib::*;
+use rust_stemmers::{Algorithm, Stemmer};
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
 
 pub struct IndexerImplementation {
     pub database: HashMap<String, HashSet<IndexedResource>>,
+    stemmer: Stemmer,
 }
 
 impl IndexerImplementation {
     pub fn new() -> Self {
         Self {
             database: HashMap::new(),
+            stemmer: Stemmer::create(Algorithm::English), //todo: depend on lang
         }
     }
 
@@ -20,6 +23,8 @@ impl IndexerImplementation {
     fn calculate_word_priority(word: &str, _html_site: &str, words: &[String]) -> u32 {
         //TODO: priorize lower levels of url, priorize word in url/title/description or main?
 
+        //TODO: levshtein
+
         //atm priority is just the number of occurences in the site.
         words.iter().filter(|w| *w == word).count() as u32
     }
@@ -38,19 +43,21 @@ impl crate::Indexer for IndexerImplementation {
         for word in words {
             let resource_to_add = IndexedResource {
                 url: url.to_string(),
-                priority: Self::calculate_word_priority(word, content, words),
+                priority: Self::calculate_word_priority(word, content, words), //we should take into account title, description lang etc
                 word: Arc::new(word.to_string()),
                 title: title.clone(),
                 description: description.clone(),
                 language: language.clone(),
             };
 
-            match self.database.get_mut(word) {
+            let stemmed_word = self.stemmer.stem(word).to_string();
+            log::debug!("Word: {}, Stemmed word: {}", word, stemmed_word);
+            match self.database.get_mut(&stemmed_word) {
                 Some(resources) => _ = resources.insert(resource_to_add),
                 None => {
                     _ = self
                         .database
-                        .insert(word.to_string(), HashSet::from([resource_to_add]))
+                        .insert(stemmed_word, HashSet::from([resource_to_add]))
                 }
             }
         }
@@ -67,7 +74,8 @@ impl crate::Indexer for IndexerImplementation {
             //Normalise queries to lowercase
             let w = w.to_ascii_lowercase();
 
-            let curr_word_results = match self.search_word_in_db(w.to_string()) {
+            let stemmed_word = self.stemmer.stem(&w).to_string();
+            let curr_word_results = match self.search_word_in_db(stemmed_word) {
                 None => return Ok(HashSet::new()), //I dont really like this
                 Some(curr_results) => curr_results,
             };
```