author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-29 00:32:01 +0200 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-29 00:37:38 +0200 |
commit | e797bffab9948016619a765839c3be67f2f13d8d (patch) | |
tree | 3c2b850ab88b6086625222fdf061ac8b1f285ddd | |
parent | Frontend: Use ResultComponent to display search results (diff) | |
download | OSSE-e797bffab9948016619a765839c3be67f2f13d8d.tar.gz OSSE-e797bffab9948016619a765839c3be67f2f13d8d.tar.bz2 OSSE-e797bffab9948016619a765839c3be67f2f13d8d.zip |
Crawler+Indexer+Frontend: Rename structs to follow logical relations
Now Resource is CrawledResource, as it is created by the crawler, and the previous CrawledResource is now IndexedResource, as it is created by the indexer.
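In other words, each struct is now named after the pipeline stage that produces it. A minimal sketch of the two types after this commit (the field lists are copied from the hunks below; the declarations of `word` and `priority` are not shown in the diff and are inferred from `Arc::new(word.clone())` and `calculate_word_priority(..) -> u32`, so treat those as assumptions):

```rust
use std::sync::Arc;
use serde::{Deserialize, Serialize};

// Created by the crawler and POSTed to the indexer's /resource endpoint.
#[derive(Serialize, Deserialize, Debug)]
struct CrawledResource {
    url: String,
    content: String,
}

// Created by the indexer from a CrawledResource (one entry per word per
// page) and served to the frontend's search results.
#[derive(Debug, Clone, Serialize)]
struct IndexedResource {
    url: String,
    title: String,
    description: String,
    priority: u32,        // inferred: calculate_word_priority returns u32
    word: Arc<String>,    // inferred: built with Arc::new(word.clone())
}
```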
-rw-r--r-- | crawler/src/main.rs | 4 |
---|---|---|
-rw-r--r-- | frontend/src/main.rs | 20 |
-rw-r--r-- | indexer/src/main.rs | 31 |
3 files changed, 31 insertions, 24 deletions
```diff
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 5c15d14..a831655 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -138,12 +138,12 @@ async fn push_crawl_entry_to_indexer(
     dbg!("Pushin to indexer");
 
     #[derive(Serialize, Debug)]
-    struct Resource {
+    struct CrawledResource {
         url: String,
         content: String,
     }
 
-    let request_body = Resource { url, content };
+    let request_body = CrawledResource { url, content };
 
     match http_client
         .post(&indexer_url)
diff --git a/frontend/src/main.rs b/frontend/src/main.rs
index d2441d5..3d8bf51 100644
--- a/frontend/src/main.rs
+++ b/frontend/src/main.rs
@@ -11,7 +11,7 @@ use yew::prelude::*;
 
 //TODO: we should import this from the indexer
 #[derive(Debug, Clone, Deserialize)]
-pub struct CrawledResource {
+pub struct IndexedResource {
     url: String,
     title: String,
     description: String,
@@ -20,26 +20,26 @@ pub struct CrawledResource {
 }
 //We implement PartialEq, Eq and Hash to ignore the priority field.
-impl PartialEq for CrawledResource {
+impl PartialEq for IndexedResource {
     fn eq(&self, other: &Self) -> bool {
         self.url == other.url && self.word == other.word
     }
 }
 
-impl Eq for CrawledResource {}
+impl Eq for IndexedResource {}
 
-impl PartialOrd for CrawledResource {
+impl PartialOrd for IndexedResource {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl Ord for CrawledResource {
+impl Ord for IndexedResource {
     fn cmp(&self, other: &Self) -> Ordering {
         self.priority.cmp(&other.priority).reverse()
     }
 }
 
-impl Hash for CrawledResource {
+impl Hash for IndexedResource {
     fn hash<H: Hasher>(&self, state: &mut H) {
         self.url.hash(state);
         self.word.hash(state);
@@ -48,7 +48,7 @@ impl Hash for CrawledResource {
 
 #[derive(Properties, Clone, PartialEq, Eq)]
 pub struct ResultComponentProps {
-    result: CrawledResource,
+    result: IndexedResource,
 }
 
 #[function_component(ResultComponent)]
@@ -61,7 +61,7 @@ fn result_component(props: &ResultComponentProps) -> Html {
 #[derive(Debug, Clone)]
 struct State {
     pub search_query: String,
-    pub results: Option<Vec<CrawledResource>>, //TODO: some loading?
+    pub results: Option<Vec<IndexedResource>>, //TODO: some loading?
 }
 
 #[function_component(OSSE)]
@@ -71,7 +71,7 @@ fn osse() -> Html {
         results: None,
     });
 
-    let display_results = |maybe_results: &Option<Vec<CrawledResource>>| -> Html {
+    let display_results = |maybe_results: &Option<Vec<IndexedResource>>| -> Html {
         let maybe_results = maybe_results.as_ref();
         if maybe_results.is_none() {
             return html! {};
@@ -128,7 +128,7 @@ fn osse() -> Html {
 
             let fetched_results = Request::get(endpoint.as_str()).send().await.unwrap();
 
-            let fetched_json: Vec<CrawledResource> = match fetched_results.json().await {
+            let fetched_json: Vec<IndexedResource> = match fetched_results.json().await {
                 Err(e) => panic!("Im panic: {}", e),
                 Ok(json) => json,
             };
diff --git a/indexer/src/main.rs b/indexer/src/main.rs
index 37a7256..825fe4d 100644
--- a/indexer/src/main.rs
+++ b/indexer/src/main.rs
@@ -6,7 +6,7 @@ use std::hash::{Hash, Hasher};
 use std::sync::{Arc, Mutex};
 
 #[derive(Debug, Clone, Serialize)]
-struct CrawledResource {
+struct IndexedResource {
     url: String,
     title: String,
     description: String,
@@ -15,13 +15,13 @@ struct CrawledResource {
 }
 //We implement PartialEq, Eq and Hash to ignore the priority field.
-impl PartialEq for CrawledResource {
+impl PartialEq for IndexedResource {
     fn eq(&self, other: &Self) -> bool {
         self.url == other.url && self.word == other.word
     }
 }
 
-impl Eq for CrawledResource {}
-impl Hash for CrawledResource {
+impl Eq for IndexedResource {}
+impl Hash for IndexedResource {
     fn hash<H: Hasher>(&self, state: &mut H) {
         self.url.hash(state);
         self.word.hash(state);
@@ -29,7 +29,7 @@ impl Hash for CrawledResource {
 }
 
 struct AppState {
-    database: Mutex<HashMap<String, HashSet<CrawledResource>>>,
+    database: Mutex<HashMap<String, HashSet<IndexedResource>>>,
 }
 
 #[actix_web::main]
@@ -57,18 +57,23 @@ async fn serve_http_endpoint(address: &str, port: u16) -> std::io::Result<()> {
         .await
 }
 
+//TODO: sufficiently simmilar word in search (algorithm)
 //we need to rename stuff
 #[derive(Deserialize, Debug)]
-struct Resource {
+struct CrawledResource {
     url: String,
     content: String,
 }
 
 #[post("/resource")]
-async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>) -> impl Responder {
+async fn add_resource(
+    data: web::Data<AppState>,
+    resource: web::Json<CrawledResource>,
+) -> impl Responder {
     //parse content
     let document = scraper::Html::parse_document(resource.content.as_str());
 
+    //TODO: Not very good, can we just body.get_text()?
     let text = html2text::from_read(resource.content.as_str().as_bytes(), resource.content.len());
 
     let split_words = text.split(' ');
@@ -101,7 +106,7 @@ async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>)
     //and for each changed content word we add it to the db (word -> list.append(url))
     let mut database = data.database.lock().unwrap();
     for word in &fixed_words {
-        let resource_to_add = CrawledResource {
+        let resource_to_add = IndexedResource {
             url: resource.url.clone(),
             priority: calculate_word_priority(word, resource.content.as_str(), &fixed_words),
             word: Arc::new(word.clone()),
@@ -130,7 +135,7 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp
     let database = data.database.lock().unwrap();
 
     //percentage of valid words
-    let mut valid_results: Option<HashSet<CrawledResource>> = None;
+    let mut valid_results: Option<HashSet<IndexedResource>> = None;
     for w in query {
         let curr_word_results = match search_word_in_db(&database, w.to_string()) {
             None => return "[]".to_string(),
@@ -143,7 +148,7 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp
                 valid_results = Some(curr_word_results.to_owned());
             }
             Some(results) => {
-                let intersection: HashSet<CrawledResource> = curr_word_results
+                let intersection: HashSet<IndexedResource> = curr_word_results
                     .intersection(&results)
                     .map(|s| s.to_owned())
                     .collect();
@@ -156,13 +161,15 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp
 }
 
 fn search_word_in_db(
-    db: &HashMap<String, HashSet<CrawledResource>>,
+    db: &HashMap<String, HashSet<IndexedResource>>,
     word: String,
-) -> Option<&HashSet<CrawledResource>> {
+) -> Option<&HashSet<IndexedResource>> {
     db.get(&word)
 }
 
 fn calculate_word_priority(word: &str, _html_site: &str, words: &[String]) -> u32 {
+    //TODO: priorize lower levels of url, priorize word in url/title/description or main?
+    //atm priority is just the number of occurences in the site.
     words.iter().filter(|w| *w == word).count() as u32
 }
```
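For context on the two functions the indexer hunks touch: `/search` keeps a running intersection of the per-word result sets, so a page only matches if it contains every query word, and `calculate_word_priority` is a plain occurrence count. A standalone sketch of that behavior (using `String` URLs in place of `IndexedResource`, and a hypothetical two-word index, so it stays self-contained and runnable):

```rust
use std::collections::{HashMap, HashSet};

// Copied from the indexer hunk above: priority is the raw occurrence count.
fn calculate_word_priority(word: &str, _html_site: &str, words: &[String]) -> u32 {
    words.iter().filter(|w| *w == word).count() as u32
}

fn main() {
    let words: Vec<String> = "rust search engine rust crawler"
        .split(' ')
        .map(str::to_string)
        .collect();
    assert_eq!(calculate_word_priority("rust", "", &words), 2);

    // Hypothetical index of two words -> sets of page URLs.
    let mut db: HashMap<String, HashSet<String>> = HashMap::new();
    db.insert("rust".into(), HashSet::from(["a.com".into(), "b.com".into()]));
    db.insert("engine".into(), HashSet::from(["b.com".into(), "c.com".into()]));

    // Running intersection, mirroring the loop in search().
    let mut valid_results: Option<HashSet<String>> = None;
    for w in ["rust", "engine"] {
        let curr = match db.get(w) {
            None => return, // search() returns "[]" here
            Some(set) => set.clone(),
        };
        valid_results = Some(match valid_results {
            None => curr,
            Some(prev) => prev.intersection(&curr).cloned().collect(),
        });
    }
    // Only b.com contains both "rust" and "engine".
    assert_eq!(valid_results.unwrap(), HashSet::from(["b.com".to_string()]));
}
```

The `.reverse()` in the frontend's `Ord` impl then sorts higher-priority (more occurrences) results first when they are displayed.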