1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
|
use lib::lib::*;
use rust_stemmers::{Algorithm, Stemmer};
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
/// In-memory inverted index mapping a stemmed word to the set of
/// resources (pages) that contain it.
pub struct IndexerImplementation {
/// Stemmed word -> resources that contain it.
pub database: HashMap<String, HashSet<IndexedResource>>,
/// Stemmer used to normalise words both at insert and search time.
// NOTE(review): `new` hard-codes English — TODO: depend on lang.
stemmer: Stemmer,
}
impl IndexerImplementation {
    /// Creates an empty index backed by an English stemmer.
    //todo: depend on lang
    pub fn new() -> Self {
        let stemmer = Stemmer::create(Algorithm::English);
        Self {
            stemmer,
            database: HashMap::new(),
        }
    }

    /// Looks up an (already stemmed) word in the index.
    fn search_word_in_db(&self, word: String) -> Option<&HashSet<IndexedResource>> {
        self.database.get(word.as_str())
    }

    /// Scores `word` for a page; currently just its occurrence count in `words`.
    //TODO: priorize lower levels of url, priorize word in url/title/description or main?
    //TODO: levshtein
    fn calculate_word_priority(word: &str, _html_site: &str, words: &[String]) -> u32 {
        let mut occurrences: u32 = 0;
        for candidate in words {
            if candidate.as_str() == word {
                occurrences += 1;
            }
        }
        occurrences
    }
}
impl crate::Indexer for IndexerImplementation {
fn insert(
&mut self,
words: &[String],
url: &str,
title: &Option<String>,
description: &Option<String>,
language: &Option<String>,
content: &str,
) -> Result<(), String> {
for word in words {
let resource_to_add = IndexedResource {
url: url.to_string(),
priority: Self::calculate_word_priority(word, content, words), //we should take into account title, description lang etc
word: Arc::new(word.to_string()),
title: title.clone(),
description: description.clone(),
language: language.clone(),
};
let stemmed_word = self.stemmer.stem(word).to_string();
log::debug!("Word: {}, Stemmed word: {}", word, stemmed_word);
match self.database.get_mut(&stemmed_word) {
Some(resources) => _ = resources.insert(resource_to_add),
None => {
_ = self
.database
.insert(stemmed_word, HashSet::from([resource_to_add]))
}
}
}
Ok(())
}
fn search(&self, term: &str) -> Result<HashSet<IndexedResource>, String> {
let query: Vec<&str> = term.split(' ').collect();
//percentage of valid words
let mut valid_results: Option<HashSet<IndexedResource>> = None;
for w in query {
//Normalise queries to lowercase
let w = w.to_ascii_lowercase();
let stemmed_word = self.stemmer.stem(&w).to_string();
let curr_word_results = match self.search_word_in_db(stemmed_word) {
None => return Ok(HashSet::new()), //I dont really like this
Some(curr_results) => curr_results,
};
match valid_results {
//Initialise valid_results
None => {
valid_results = Some(curr_word_results.to_owned());
}
Some(results) => {
let intersection: HashSet<IndexedResource> = curr_word_results
.intersection(&results)
.map(|s| s.to_owned())
.collect();
valid_results = Some(intersection);
}
}
}
Ok(valid_results.unwrap())
}
fn num_of_words(&self) -> usize {
self.database.len()
}
}
|