diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-11-14 16:01:19 +0100 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-11-14 16:01:19 +0100 |
commit | 1746c27506ec166bdebb53205a74b17b0017fc3d (patch) | |
tree | 31a7516eec8e8b592309f11b7206928e57520ce6 | |
parent | Indexer: Stem words prior to adding/searching them (diff) | |
download | OSSE-levshtein.tar.gz OSSE-levshtein.tar.bz2 OSSE-levshtein.zip |
LEV levshtein
-rw-r--r-- | Cargo.lock | 7 | ||||
-rw-r--r-- | indexer/Cargo.toml | 1 | ||||
-rw-r--r-- | indexer/src/indexer_implementation.rs | 54 | ||||
-rw-r--r-- | lib/src/lib.rs | 4 |
4 files changed, 53 insertions, 13 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 5b9513e..3eb9311 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1325,6 +1325,7 @@ dependencies = [
  "env_logger",
  "html-escape",
  "kuchiki",
+ "levenshtein",
  "lib",
  "log",
  "rust-stemmers",
@@ -1444,6 +1445,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
 
 [[package]]
+name = "levenshtein"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"
+
+[[package]]
 name = "lib"
 version = "0.1.0"
 dependencies = [
diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml
index 10b1f8b..d3cac35 100644
--- a/indexer/Cargo.toml
+++ b/indexer/Cargo.toml
@@ -17,6 +17,7 @@ kuchiki = "0.8.1"
 log = "0.4.17"
 env_logger = "0.9.1"
 rust-stemmers = "1.2.0"
+levenshtein = "1.0.5"
 lib = { path = "../lib" }
 
 [[bin]]
diff --git a/indexer/src/indexer_implementation.rs b/indexer/src/indexer_implementation.rs
index 0789a28..1aeedfc 100644
--- a/indexer/src/indexer_implementation.rs
+++ b/indexer/src/indexer_implementation.rs
@@ -23,10 +23,41 @@ impl IndexerImplementation {
 
     fn calculate_word_priority(word: &str, _html_site: &str, words: &[String]) -> u32 {
         //TODO: priorize lower levels of url, priorize word in url/title/description or main?
-        //TODO: levshtein
+        let mut priority = 0u32;
+        for w in words {
+            let lev_distance = levenshtein::levenshtein(word, w) as u32;
+            log::debug!(
+                "Lev distance between target: {} and curr: {} -> {} --- w len: {}",
+                word,
+                w,
+                lev_distance,
+                w.len()
+            );
+            priority += lev_distance;
+        }
+
+        priority
+    }
+
+    fn create_indexed_resource(
+        url: &str,
+        word: &str,
+        title: &Option<String>,
+        description: &Option<String>,
+        language: &Option<String>,
+        content: &str,
+        words: &[String],
+    ) -> IndexedResource {
+        let priority = Self::calculate_word_priority(word, content, words); //we should take into account title, description lang etc
 
-        //atm priority is just the number of occurences in the site.
-        words.iter().filter(|w| *w == word).count() as u32
+        IndexedResource {
+            url: url.to_string(),
+            priority,
+            word: Arc::new(word.to_string()),
+            title: title.clone(),
+            description: description.clone(),
+            language: language.clone(),
+        }
     }
 }
 
@@ -41,14 +72,15 @@ impl crate::Indexer for IndexerImplementation {
         content: &str,
     ) -> Result<(), String> {
         for word in words {
-            let resource_to_add = IndexedResource {
-                url: url.to_string(),
-                priority: Self::calculate_word_priority(word, content, words), //we should take into account title, description lang etc
-                word: Arc::new(word.to_string()),
-                title: title.clone(),
-                description: description.clone(),
-                language: language.clone(),
-            };
+            let resource_to_add = Self::create_indexed_resource(
+                url,
+                word,
+                title,
+                description,
+                language,
+                content,
+                words,
+            );
 
             let stemmed_word = self.stemmer.stem(word).to_string();
             log::debug!("Word: {}, Stemmed word: {}", word, stemmed_word);
diff --git a/lib/src/lib.rs b/lib/src/lib.rs
index 54e6ea9..5c095c8 100644
--- a/lib/src/lib.rs
+++ b/lib/src/lib.rs
@@ -36,10 +36,10 @@ pub mod lib {
         }
     }
 
-    //Reverse ordering as priority: 1 is less than priority: 2
+    //Priority 1 is higher than priority 2
     impl Ord for IndexedResource {
         fn cmp(&self, other: &Self) -> Ordering {
-            self.priority.cmp(&other.priority).reverse()
+            self.priority.cmp(&other.priority)
         }
     }
 
|