about summary refs log tree commit diff
path: root/indexer/src/indexer_implementation.rs
blob: d5cfead70ff3ec19cdd164e6c705f0f72266874f (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
use lib::lib::*;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;

/// In-memory indexer backed by a hash map.
pub struct IndexerImplementation {
    /// Maps each indexed word to the set of resources stored under that word.
    pub database: HashMap<String, HashSet<IndexedResource>>,
}

impl IndexerImplementation {
    /// Creates an indexer whose database is empty.
    pub fn new() -> Self {
        Self {
            database: HashMap::new(),
        }
    }

    /// Returns the resources stored under `word`, or `None` when no entry
    /// exists for it.
    ///
    /// The lookup is an exact key match: no case folding or trimming is done
    /// here, so callers must normalise `word` themselves. The parameter is
    /// taken by value because callers in this file pass an owned `String`.
    fn search_word_in_db(&self, word: String) -> Option<&HashSet<IndexedResource>> {
        self.database.get(&word)
    }

    /// Computes the priority of `word` for a page.
    ///
    /// For now the priority is simply the number of entries in `words` that
    /// are exactly equal to `word`. `_html_site` is currently unused.
    ///
    /// The result saturates at `u32::MAX` instead of wrapping around.
    fn calculate_word_priority(word: &str, _html_site: &str, words: &[String]) -> u32 {
        // TODO: prioritise lower levels of the URL; prioritise words found in
        // the URL/title/description or in the main content?
        let occurrences = words.iter().filter(|w| w.as_str() == word).count();

        // Clamp before narrowing so an oversized count cannot silently wrap.
        occurrences.min(u32::MAX as usize) as u32
    }
}

impl Default for IndexerImplementation {
    /// Equivalent to [`IndexerImplementation::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl crate::Indexer for IndexerImplementation {
    /// Indexes `url` under `word`.
    ///
    /// Builds an [`IndexedResource`] from the arguments, with its priority
    /// computed by `calculate_word_priority` from `fixed_words`, and adds it
    /// to the set stored under `word`, creating that set when the word is new.
    ///
    /// `word` is used as the key exactly as given; it is not normalised here.
    ///
    /// # Errors
    ///
    /// This implementation never fails and always returns `Ok(())`.
    fn insert(
        &mut self,
        word: &str,
        url: &str,
        title: Option<String>,
        description: Option<String>,
        content: &str,
        fixed_words: &[String],
    ) -> Result<(), String> {
        let resource_to_add = IndexedResource {
            url: url.to_string(),
            priority: Self::calculate_word_priority(word, content, fixed_words),
            word: Arc::new(word.to_string()),
            title,
            description,
        };

        // `HashSet::insert` keeps an element that already compares equal, so
        // re-inserting an equal resource leaves the stored one untouched.
        self.database
            .entry(word.to_string())
            .or_default()
            .insert(resource_to_add);

        Ok(())
    }

    /// Returns the resources that match every word of `term`.
    ///
    /// `term` is split on whitespace, so repeated, leading or trailing
    /// separators are ignored. Each word is ASCII-lowercased before lookup.
    /// The result is the intersection of the per-word sets; if any word is
    /// not in the database, or `term` contains no words at all, the result is
    /// an empty set.
    ///
    /// # Errors
    ///
    /// This implementation never fails and always returns `Ok`.
    fn search(&self, term: &str) -> Result<HashSet<IndexedResource>, String> {
        // `None` until the first word has been looked up.
        let mut valid_results: Option<HashSet<IndexedResource>> = None;

        for raw_word in term.split_whitespace() {
            // Normalise queries to lowercase.
            let word = raw_word.to_ascii_lowercase();

            let curr_word_results = match self.search_word_in_db(word) {
                Some(curr_results) => curr_results,
                // One unknown word means nothing can match the whole query.
                None => return Ok(HashSet::new()),
            };

            valid_results = Some(match valid_results {
                // First word: start from its full result set.
                None => curr_word_results.clone(),
                // Later words: keep only what both sets share. The surviving
                // elements are taken from the current word's set.
                Some(results) => curr_word_results
                    .intersection(&results)
                    .cloned()
                    .collect(),
            });
        }

        Ok(valid_results.unwrap_or_default())
    }

    /// Returns the number of distinct words stored in the database.
    fn num_of_words(&self) -> usize {
        self.database.len()
    }
}