From 93009fd53e58286c6a5e2da600d70a8ec85d9a0b Mon Sep 17 00:00:00 2001 From: Baitinq Date: Sat, 5 Nov 2022 18:41:48 +0100 Subject: Indexer: Ngrams --- Cargo.lock | 7 +++++++ indexer/Cargo.toml | 1 + indexer/src/indexer_implementation.rs | 5 +++++ stem_and_then_levesthtein_on_results | 0 4 files changed, 13 insertions(+) create mode 100644 stem_and_then_levesthtein_on_results diff --git a/Cargo.lock b/Cargo.lock index 6af29e2..f661e01 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1311,6 +1311,7 @@ dependencies = [ "html2text", "kuchiki", "lib", + "ngrams", "scraper", "serde", "serde_json", @@ -1672,6 +1673,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" +[[package]] +name = "ngrams" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad9273e27d7914b3d11aeac20a91c0da7ef1e89edc4e3825021ebb8ac3b83bf2" + [[package]] name = "nodrop" version = "0.1.14" diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml index 7b64bb3..1dfb33f 100644 --- a/indexer/Cargo.toml +++ b/indexer/Cargo.toml @@ -14,6 +14,7 @@ html2text = "0.4.3" serde = { version = "1.0", features = ["derive", "rc"] } serde_json = "1.0.87" kuchiki = "0.8.1" +ngrams = "1.0.1" lib = { path = "../lib" } [[bin]] diff --git a/indexer/src/indexer_implementation.rs b/indexer/src/indexer_implementation.rs index f24c2bd..6f12644 100644 --- a/indexer/src/indexer_implementation.rs +++ b/indexer/src/indexer_implementation.rs @@ -1,4 +1,5 @@ use lib::lib::*; +use ngrams::Ngram; use std::collections::{HashMap, HashSet}; use std::sync::Arc; @@ -36,6 +37,10 @@ impl crate::Indexer for IndexerImplementation { content: &str, ) -> Result<(), String> { for word in words { + let ngrams: Vec<_> = word.chars().ngrams(2).pad().collect(); + + println!("Ngrams for {}: {:?}", word, ngrams); + let resource_to_add = IndexedResource { url: url.to_string(), priority: Self::calculate_word_priority(word, content, words), diff --git a/stem_and_then_levesthtein_on_results b/stem_and_then_levesthtein_on_results new file mode 100644 index 0000000..e69de29 -- cgit 1.4.1