diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-29 00:32:01 +0200 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-29 00:37:38 +0200 |
commit | e797bffab9948016619a765839c3be67f2f13d8d (patch) | |
tree | 3c2b850ab88b6086625222fdf061ac8b1f285ddd /indexer/src/main.rs | |
parent | Frontend: Use ResultComponent to display search results (diff) | |
download | OSSE-e797bffab9948016619a765839c3be67f2f13d8d.tar.gz OSSE-e797bffab9948016619a765839c3be67f2f13d8d.tar.bz2 OSSE-e797bffab9948016619a765839c3be67f2f13d8d.zip |
Crawler+Indexer+Frontend: Rename structs to follow logical relations
Now `Resource` is `CrawledResource`, as it is created by the crawler, and the previous `CrawledResource` is now `IndexedResource`, as it is created by the indexer.
Diffstat (limited to 'indexer/src/main.rs')
-rw-r--r-- | indexer/src/main.rs | 31 |
1 files changed, 19 insertions, 12 deletions
diff --git a/indexer/src/main.rs b/indexer/src/main.rs index 37a7256..825fe4d 100644 --- a/indexer/src/main.rs +++ b/indexer/src/main.rs @@ -6,7 +6,7 @@ use std::hash::{Hash, Hasher}; use std::sync::{Arc, Mutex}; #[derive(Debug, Clone, Serialize)] -struct CrawledResource { +struct IndexedResource { url: String, title: String, description: String, @@ -15,13 +15,13 @@ struct CrawledResource { } //We implement PartialEq, Eq and Hash to ignore the priority field. -impl PartialEq for CrawledResource { +impl PartialEq for IndexedResource { fn eq(&self, other: &Self) -> bool { self.url == other.url && self.word == other.word } } -impl Eq for CrawledResource {} -impl Hash for CrawledResource { +impl Eq for IndexedResource {} +impl Hash for IndexedResource { fn hash<H: Hasher>(&self, state: &mut H) { self.url.hash(state); self.word.hash(state); @@ -29,7 +29,7 @@ impl Hash for CrawledResource { } struct AppState { - database: Mutex<HashMap<String, HashSet<CrawledResource>>>, + database: Mutex<HashMap<String, HashSet<IndexedResource>>>, } #[actix_web::main] @@ -57,18 +57,23 @@ async fn serve_http_endpoint(address: &str, port: u16) -> std::io::Result<()> { .await } +//TODO: sufficiently simmilar word in search (algorithm) //we need to rename stuff #[derive(Deserialize, Debug)] -struct Resource { +struct CrawledResource { url: String, content: String, } #[post("/resource")] -async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>) -> impl Responder { +async fn add_resource( + data: web::Data<AppState>, + resource: web::Json<CrawledResource>, +) -> impl Responder { //parse content let document = scraper::Html::parse_document(resource.content.as_str()); + //TODO: Not very good, can we just body.get_text()? 
let text = html2text::from_read(resource.content.as_str().as_bytes(), resource.content.len()); let split_words = text.split(' '); @@ -101,7 +106,7 @@ async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>) //and for each changed content word we add it to the db (word -> list.append(url)) let mut database = data.database.lock().unwrap(); for word in &fixed_words { - let resource_to_add = CrawledResource { + let resource_to_add = IndexedResource { url: resource.url.clone(), priority: calculate_word_priority(word, resource.content.as_str(), &fixed_words), word: Arc::new(word.clone()), @@ -130,7 +135,7 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp let database = data.database.lock().unwrap(); //percentage of valid words - let mut valid_results: Option<HashSet<CrawledResource>> = None; + let mut valid_results: Option<HashSet<IndexedResource>> = None; for w in query { let curr_word_results = match search_word_in_db(&database, w.to_string()) { None => return "[]".to_string(), @@ -143,7 +148,7 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp valid_results = Some(curr_word_results.to_owned()); } Some(results) => { - let intersection: HashSet<CrawledResource> = curr_word_results + let intersection: HashSet<IndexedResource> = curr_word_results .intersection(&results) .map(|s| s.to_owned()) .collect(); @@ -156,13 +161,15 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp } fn search_word_in_db( - db: &HashMap<String, HashSet<CrawledResource>>, + db: &HashMap<String, HashSet<IndexedResource>>, word: String, -) -> Option<&HashSet<CrawledResource>> { +) -> Option<&HashSet<IndexedResource>> { db.get(&word) } fn calculate_word_priority(word: &str, _html_site: &str, words: &[String]) -> u32 { + //TODO: priorize lower levels of url, priorize word in url/title/description or main? + //atm priority is just the number of occurences in the site. 
words.iter().filter(|w| *w == word).count() as u32 } |