indexer/src/indexer_implementation.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136

use lib::lib::*;
use rust_stemmers::{Algorithm, Stemmer};
use std::collections::{HashMap, HashSet};
use std::sync::Arc;

/// In-memory inverted index from stemmed words to the resources indexed under them.
pub struct IndexerImplementation {
    /// Maps a stemmed word to the set of resources that were inserted for it.
    pub database: HashMap<String, HashSet<IndexedResource>>,
    /// Stemmer applied to words before they are used as `database` keys.
    stemmer: Stemmer,
}

impl IndexerImplementation {
    /// Creates an empty index whose words are stemmed with the English algorithm.
    pub fn new() -> Self {
        Self {
            database: HashMap::new(),
            stemmer: Stemmer::create(Algorithm::English), // TODO: choose the algorithm from the document language
        }
    }

    /// Returns the resources stored under the key `word`, or `None` when the key is absent.
    ///
    /// The lookup is an exact match on the `database` key; no stemming or
    /// case folding happens here.
    fn search_word_in_db(&self, word: String) -> Option<&HashSet<IndexedResource>> {
        self.database.get(&word)
    }

    /// Scores `word` as the sum of its Levenshtein distances to every entry of `words`.
    ///
    /// `_html_site` is accepted but not used yet. The sum saturates at
    /// `u32::MAX` instead of overflowing.
    fn calculate_word_priority(word: &str, _html_site: &str, words: &[String]) -> u32 {
        // TODO: prioritise lower levels of the url; prioritise words found in the
        // url/title/description or main?

        let mut priority = 0u32;
        for other in words {
            let distance = levenshtein::levenshtein(word, other);
            // Clamp before narrowing so an oversized distance cannot be silently truncated.
            let lev_distance = distance.min(u32::MAX as usize) as u32;
            log::debug!(
                "Lev distance between target: {} and curr: {} -> {} --- w len: {}",
                word,
                other,
                lev_distance,
                other.len()
            );
            // A plain `+=` would panic in debug builds and wrap in release builds on overflow.
            priority = priority.saturating_add(lev_distance);
        }

        priority
    }

    /// Builds the `IndexedResource` describing `word` as found at `url`.
    ///
    /// The priority comes from [`Self::calculate_word_priority`]; `title`,
    /// `description` and `language` are cloned into the resource unchanged.
    fn create_indexed_resource(
        url: &str,
        word: &str,
        title: &Option<String>,
        description: &Option<String>,
        language: &Option<String>,
        content: &str,
        words: &[String],
    ) -> IndexedResource {
        // TODO: the priority should also take title, description, language etc. into account.
        let priority = Self::calculate_word_priority(word, content, words);

        IndexedResource {
            url: url.to_string(),
            priority,
            word: Arc::new(word.to_string()),
            title: title.clone(),
            description: description.clone(),
            language: language.clone(),
        }
    }
}

impl crate::Indexer for IndexerImplementation {
    /// Indexes `url` under the stem of every entry in `words`.
    ///
    /// One `IndexedResource` is built per word and added to the set stored
    /// under that word's stem. If the set already holds an equal resource,
    /// the existing one is kept.
    ///
    /// # Errors
    ///
    /// Currently never returns `Err`.
    fn insert(
        &mut self,
        words: &[String],
        url: &str,
        title: &Option<String>,
        description: &Option<String>,
        language: &Option<String>,
        content: &str,
    ) -> Result<(), String> {
        for word in words {
            let resource_to_add = Self::create_indexed_resource(
                url,
                word,
                title,
                description,
                language,
                content,
                words,
            );

            let stemmed_word = self.stemmer.stem(word).to_string();
            log::debug!("Word: {}, Stemmed word: {}", word, stemmed_word);

            // Single lookup: create the set on first use, then add the resource.
            self.database
                .entry(stemmed_word)
                .or_default()
                .insert(resource_to_add);
        }

        Ok(())
    }

    /// Returns the resources that are indexed under every word of `term`.
    ///
    /// `term` is split on whitespace; each token is ASCII-lowercased and
    /// stemmed before the lookup. The result is the intersection of the sets
    /// found for all tokens. An empty set is returned when any token has no
    /// entry, or when `term` contains no tokens at all.
    ///
    /// # Errors
    ///
    /// Currently never returns `Err`.
    fn search(&self, term: &str) -> Result<HashSet<IndexedResource>, String> {
        let mut valid_results: Option<HashSet<IndexedResource>> = None;

        // `split_whitespace` skips the empty tokens that leading, trailing or
        // repeated separators would otherwise produce.
        for token in term.split_whitespace() {
            // Normalise queries to lowercase.
            let token = token.to_ascii_lowercase();

            let stemmed_word = self.stemmer.stem(&token).to_string();
            let curr_word_results = match self.search_word_in_db(stemmed_word) {
                Some(found) => found,
                // Every token must match, so one unknown token empties the result.
                None => return Ok(HashSet::new()),
            };

            let narrowed = match valid_results.take() {
                // First token: start from its full result set.
                None => curr_word_results.clone(),
                // Later tokens: keep only what is shared with the results so far.
                Some(results) => curr_word_results
                    .intersection(&results)
                    .cloned()
                    .collect(),
            };
            valid_results = Some(narrowed);
        }

        // `None` here means the query had no tokens: answer with an empty set.
        Ok(valid_results.unwrap_or_default())
    }

    /// Returns the number of distinct keys (stemmed words) in the index.
    fn num_of_words(&self) -> usize {
        self.database.len()
    }
}