about summary refs log tree commit diff
diff options
context:
space:
mode:
author Baitinq <manuelpalenzuelamerino@gmail.com> 2022-11-05 18:41:48 +0100
committer Baitinq <manuelpalenzuelamerino@gmail.com> 2022-11-10 17:56:59 +0100
commit 93009fd53e58286c6a5e2da600d70a8ec85d9a0b (patch)
tree 74cc0c9cd1ff89dab3ec82cab7c6b56d015f609f
parent Indexer: Switch back to not serving frontend with actix (diff)
download OSSE-ngrams_indexer.tar.gz
OSSE-ngrams_indexer.tar.bz2
OSSE-ngrams_indexer.zip
Indexer: Ngrams ngrams_indexer
-rw-r--r-- Cargo.lock 7
-rw-r--r-- indexer/Cargo.toml 1
-rw-r--r-- indexer/src/indexer_implementation.rs 5
-rw-r--r-- stem_and_then_levesthtein_on_results 0
4 files changed, 13 insertions, 0 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 6af29e2..f661e01 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1311,6 +1311,7 @@ dependencies = [
  "html2text",
  "kuchiki",
  "lib",
+ "ngrams",
  "scraper",
  "serde",
  "serde_json",
@@ -1673,6 +1674,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"
 
 [[package]]
+name = "ngrams"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ad9273e27d7914b3d11aeac20a91c0da7ef1e89edc4e3825021ebb8ac3b83bf2"
+
+[[package]]
 name = "nodrop"
 version = "0.1.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml
index 7b64bb3..1dfb33f 100644
--- a/indexer/Cargo.toml
+++ b/indexer/Cargo.toml
@@ -14,6 +14,7 @@ html2text = "0.4.3"
 serde = { version = "1.0", features = ["derive", "rc"] }
 serde_json = "1.0.87"
 kuchiki = "0.8.1"
+ngrams = "1.0.1"
 lib = { path = "../lib" }
 
 [[bin]]
diff --git a/indexer/src/indexer_implementation.rs b/indexer/src/indexer_implementation.rs
index f24c2bd..6f12644 100644
--- a/indexer/src/indexer_implementation.rs
+++ b/indexer/src/indexer_implementation.rs
@@ -1,4 +1,5 @@
 use lib::lib::*;
+use ngrams::Ngram;
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
 
@@ -36,6 +37,10 @@ impl crate::Indexer for IndexerImplementation {
         content: &str,
     ) -> Result<(), String> {
         for word in words {
+            let ngrams: Vec<_> = word.chars().ngrams(2).pad().collect();
+
+            println!("Ngrams for {}: {:?}", word, ngrams);
+
             let resource_to_add = IndexedResource {
                 url: url.to_string(),
                 priority: Self::calculate_word_priority(word, content, words),
diff --git a/stem_and_then_levesthtein_on_results b/stem_and_then_levesthtein_on_results
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/stem_and_then_levesthtein_on_results