LEV levenshtein

author: Baitinq <manuelpalenzuelamerino@gmail.com> 2022-11-14 16:01:19 +0100
committer: Baitinq <manuelpalenzuelamerino@gmail.com> 2022-11-14 16:01:19 +0100
commit: 1746c27506ec166bdebb53205a74b17b0017fc3d (patch)
tree: 31a7516eec8e8b592309f11b7206928e57520ce6
parent: Indexer: Stem words prior to adding/searching them (diff)
download: OSSE-levshtein.tar.gz
OSSE-levshtein.tar.bz2
OSSE-levshtein.zip
4 files changed, 53 insertions, 13 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 5b9513e..3eb9311 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1325,6 +1325,7 @@ dependencies = [
  "env_logger",
  "html-escape",
  "kuchiki",
+ "levenshtein",
  "lib",
  "log",
  "rust-stemmers",
@@ -1444,6 +1445,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
 
 [[package]]
+name = "levenshtein"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"
+
+[[package]]
 name = "lib"
 version = "0.1.0"
 dependencies = [
diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml
index 10b1f8b..d3cac35 100644
--- a/indexer/Cargo.toml
+++ b/indexer/Cargo.toml
@@ -17,6 +17,7 @@ kuchiki = "0.8.1"
 log = "0.4.17"
 env_logger = "0.9.1"
 rust-stemmers = "1.2.0"
+levenshtein = "1.0.5"
 lib = { path = "../lib" }
 
 [[bin]]
diff --git a/indexer/src/indexer_implementation.rs b/indexer/src/indexer_implementation.rs
index 0789a28..1aeedfc 100644
--- a/indexer/src/indexer_implementation.rs
+++ b/indexer/src/indexer_implementation.rs
@@ -23,10 +23,41 @@ impl IndexerImplementation {
     fn calculate_word_priority(word: &str, _html_site: &str, words: &[String]) -> u32 {
         //TODO: priorize lower levels of url, priorize word in url/title/description or main?
 
-        //TODO: levshtein
+        let mut priority = 0u32;
+        for w in words {
+            let lev_distance = levenshtein::levenshtein(word, w) as u32;
+            log::debug!(
+                "Lev distance between target: {} and curr: {} -> {} --- w len: {}",
+                word,
+                w,
+                lev_distance,
+                w.len()
+            );
+            priority += lev_distance;
+        }
+
+        priority
+    }
+
+    fn create_indexed_resource(
+        url: &str,
+        word: &str,
+        title: &Option<String>,
+        description: &Option<String>,
+        language: &Option<String>,
+        content: &str,
+        words: &[String],
+    ) -> IndexedResource {
+        let priority = Self::calculate_word_priority(word, content, words); //we should take into account title, description lang etc
 
-        //atm priority is just the number of occurences in the site.
-        words.iter().filter(|w| *w == word).count() as u32
+        IndexedResource {
+            url: url.to_string(),
+            priority,
+            word: Arc::new(word.to_string()),
+            title: title.clone(),
+            description: description.clone(),
+            language: language.clone(),
+        }
     }
 }
 
@@ -41,14 +72,15 @@ impl crate::Indexer for IndexerImplementation {
         content: &str,
     ) -> Result<(), String> {
         for word in words {
-            let resource_to_add = IndexedResource {
-                url: url.to_string(),
-                priority: Self::calculate_word_priority(word, content, words), //we should take into account title, description lang etc
-                word: Arc::new(word.to_string()),
-                title: title.clone(),
-                description: description.clone(),
-                language: language.clone(),
-            };
+            let resource_to_add = Self::create_indexed_resource(
+                url,
+                word,
+                title,
+                description,
+                language,
+                content,
+                words,
+            );
 
             let stemmed_word = self.stemmer.stem(word).to_string();
             log::debug!("Word: {}, Stemmed word: {}", word, stemmed_word);
diff --git a/lib/src/lib.rs b/lib/src/lib.rs
index 54e6ea9..5c095c8 100644
--- a/lib/src/lib.rs
+++ b/lib/src/lib.rs
@@ -36,10 +36,10 @@ pub mod lib {
         }
     }
 
-    //Reverse ordering as priority: 1 is less than priority: 2
+    //Lower value means higher rank: priority 1 ranks above priority 2
     impl Ord for IndexedResource {
         fn cmp(&self, other: &Self) -> Ordering {
-            self.priority.cmp(&other.priority).reverse()
+            self.priority.cmp(&other.priority)
         }
     }
author	Baitinq <manuelpalenzuelamerino@gmail.com>	2022-11-14 16:01:19 +0100
committer	Baitinq <manuelpalenzuelamerino@gmail.com>	2022-11-14 16:01:19 +0100
commit	1746c27506ec166bdebb53205a74b17b0017fc3d (patch)
tree	31a7516eec8e8b592309f11b7206928e57520ce6
parent	Indexer: Stem words prior to adding/searching them (diff)
download	OSSE-levshtein.tar.gz OSSE-levshtein.tar.bz2 OSSE-levshtein.zip