| author | Baitinq <[email protected]> | 2022-10-29 00:32:01 +0200 |
|---|---|---|
| committer | Baitinq <[email protected]> | 2022-10-29 00:37:38 +0200 |
| commit | e797bffab9948016619a765839c3be67f2f13d8d (patch) | |
| tree | 3c2b850ab88b6086625222fdf061ac8b1f285ddd | |
| parent | Frontend: Use ResultComponent to display search results (diff) | |
| download | OSSE-e797bffab9948016619a765839c3be67f2f13d8d.tar.gz, OSSE-e797bffab9948016619a765839c3be67f2f13d8d.tar.bz2, OSSE-e797bffab9948016619a765839c3be67f2f13d8d.zip | |
Crawler+Indexer+Frontend: Rename structs to follow logical relations
Now Resource is CrawledResource, as it is created by the crawler, and the previous CrawledResource is now IndexedResource, as it is created by the indexer.
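In other words (a minimal sketch for orientation, with the field sets taken from the diff below): the crawler POSTs a CrawledResource to the indexer's /resource endpoint, and the indexer fans it out into per-word IndexedResource entries that the frontend later deserializes.

```rust
use std::sync::Arc;

// Produced by the crawler and POSTed to the indexer's /resource endpoint.
#[derive(Debug)]
struct CrawledResource {
    url: String,
    content: String, // raw page content fetched by the crawler
}

// Produced by the indexer from a CrawledResource: one entry per word,
// stored in the in-memory database and later served to the frontend.
#[derive(Debug)]
struct IndexedResource {
    url: String,
    title: String,
    description: String,
    word: Arc<String>,
    priority: u32, // currently just the occurrence count of `word` in the page
}
```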
Diffstat

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | crawler/src/main.rs | 4 |
| -rw-r--r-- | frontend/src/main.rs | 20 |
| -rw-r--r-- | indexer/src/main.rs | 31 |
3 files changed, 31 insertions, 24 deletions
```diff
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 5c15d14..a831655 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -138,12 +138,12 @@ async fn push_crawl_entry_to_indexer(
     dbg!("Pushin to indexer");
 
     #[derive(Serialize, Debug)]
-    struct Resource {
+    struct CrawledResource {
         url: String,
         content: String,
     }
 
-    let request_body = Resource { url, content };
+    let request_body = CrawledResource { url, content };
 
     match http_client
         .post(&indexer_url)
diff --git a/frontend/src/main.rs b/frontend/src/main.rs
index d2441d5..3d8bf51 100644
--- a/frontend/src/main.rs
+++ b/frontend/src/main.rs
@@ -11,7 +11,7 @@ use yew::prelude::*;
 
 //TODO: we should import this from the indexer
 #[derive(Debug, Clone, Deserialize)]
-pub struct CrawledResource {
+pub struct IndexedResource {
     url: String,
     title: String,
     description: String,
@@ -20,26 +20,26 @@ pub struct CrawledResource {
 }
 
 //We implement PartialEq, Eq and Hash to ignore the priority field.
-impl PartialEq for CrawledResource {
+impl PartialEq for IndexedResource {
     fn eq(&self, other: &Self) -> bool {
         self.url == other.url && self.word == other.word
     }
 }
-impl Eq for CrawledResource {}
+impl Eq for IndexedResource {}
 
-impl PartialOrd for CrawledResource {
+impl PartialOrd for IndexedResource {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl Ord for CrawledResource {
+impl Ord for IndexedResource {
     fn cmp(&self, other: &Self) -> Ordering {
         self.priority.cmp(&other.priority).reverse()
     }
 }
 
-impl Hash for CrawledResource {
+impl Hash for IndexedResource {
     fn hash<H: Hasher>(&self, state: &mut H) {
         self.url.hash(state);
         self.word.hash(state);
@@ -48,7 +48,7 @@ impl Hash for CrawledResource {
 
 #[derive(Properties, Clone, PartialEq, Eq)]
 pub struct ResultComponentProps {
-    result: CrawledResource,
+    result: IndexedResource,
 }
 
 #[function_component(ResultComponent)]
@@ -61,7 +61,7 @@ fn result_component(props: &ResultComponentProps) -> Html {
 #[derive(Debug, Clone)]
 struct State {
     pub search_query: String,
-    pub results: Option<Vec<CrawledResource>>, //TODO: some loading?
+    pub results: Option<Vec<IndexedResource>>, //TODO: some loading?
 }
 
 #[function_component(OSSE)]
@@ -71,7 +71,7 @@ fn osse() -> Html {
         results: None,
     });
 
-    let display_results = |maybe_results: &Option<Vec<CrawledResource>>| -> Html {
+    let display_results = |maybe_results: &Option<Vec<IndexedResource>>| -> Html {
         let maybe_results = maybe_results.as_ref();
         if maybe_results.is_none() {
             return html! {};
@@ -128,7 +128,7 @@ fn osse() -> Html {
 
             let fetched_results = Request::get(endpoint.as_str()).send().await.unwrap();
 
-            let fetched_json: Vec<CrawledResource> = match fetched_results.json().await {
+            let fetched_json: Vec<IndexedResource> = match fetched_results.json().await {
                 Err(e) => panic!("Im panic: {}", e),
                 Ok(json) => json,
             };
diff --git a/indexer/src/main.rs b/indexer/src/main.rs
index 37a7256..825fe4d 100644
--- a/indexer/src/main.rs
+++ b/indexer/src/main.rs
@@ -6,7 +6,7 @@ use std::hash::{Hash, Hasher};
 use std::sync::{Arc, Mutex};
 
 #[derive(Debug, Clone, Serialize)]
-struct CrawledResource {
+struct IndexedResource {
     url: String,
     title: String,
     description: String,
@@ -15,13 +15,13 @@ struct CrawledResource {
 }
 
 //We implement PartialEq, Eq and Hash to ignore the priority field.
-impl PartialEq for CrawledResource {
+impl PartialEq for IndexedResource {
     fn eq(&self, other: &Self) -> bool {
         self.url == other.url && self.word == other.word
     }
 }
-impl Eq for CrawledResource {}
-impl Hash for CrawledResource {
+impl Eq for IndexedResource {}
+impl Hash for IndexedResource {
     fn hash<H: Hasher>(&self, state: &mut H) {
         self.url.hash(state);
         self.word.hash(state);
@@ -29,7 +29,7 @@ impl Hash for CrawledResource {
 }
 
 struct AppState {
-    database: Mutex<HashMap<String, HashSet<CrawledResource>>>,
+    database: Mutex<HashMap<String, HashSet<IndexedResource>>>,
 }
 
 #[actix_web::main]
@@ -57,18 +57,23 @@ async fn serve_http_endpoint(address: &str, port: u16) -> std::io::Result<()> {
     .await
 }
 
+//TODO: sufficiently simmilar word in search (algorithm)
 //we need to rename stuff
 #[derive(Deserialize, Debug)]
-struct Resource {
+struct CrawledResource {
     url: String,
     content: String,
 }
 
 #[post("/resource")]
-async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>) -> impl Responder {
+async fn add_resource(
+    data: web::Data<AppState>,
+    resource: web::Json<CrawledResource>,
+) -> impl Responder {
     //parse content
     let document = scraper::Html::parse_document(resource.content.as_str());
 
+    //TODO: Not very good, can we just body.get_text()?
     let text = html2text::from_read(resource.content.as_str().as_bytes(), resource.content.len());
 
     let split_words = text.split(' ');
@@ -101,7 +106,7 @@ async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>)
     //and for each changed content word we add it to the db (word -> list.append(url))
     let mut database = data.database.lock().unwrap();
     for word in &fixed_words {
-        let resource_to_add = CrawledResource {
+        let resource_to_add = IndexedResource {
             url: resource.url.clone(),
             priority: calculate_word_priority(word, resource.content.as_str(), &fixed_words),
             word: Arc::new(word.clone()),
@@ -130,7 +135,7 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp
     let database = data.database.lock().unwrap();
 
     //percentage of valid words
-    let mut valid_results: Option<HashSet<CrawledResource>> = None;
+    let mut valid_results: Option<HashSet<IndexedResource>> = None;
     for w in query {
         let curr_word_results = match search_word_in_db(&database, w.to_string()) {
             None => return "[]".to_string(),
@@ -143,7 +148,7 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp
                 valid_results = Some(curr_word_results.to_owned());
             }
             Some(results) => {
-                let intersection: HashSet<CrawledResource> = curr_word_results
+                let intersection: HashSet<IndexedResource> = curr_word_results
                     .intersection(&results)
                     .map(|s| s.to_owned())
                     .collect();
@@ -156,13 +161,15 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp
 }
 
 fn search_word_in_db(
-    db: &HashMap<String, HashSet<CrawledResource>>,
+    db: &HashMap<String, HashSet<IndexedResource>>,
     word: String,
-) -> Option<&HashSet<CrawledResource>> {
+) -> Option<&HashSet<IndexedResource>> {
     db.get(&word)
 }
 
 fn calculate_word_priority(word: &str, _html_site: &str, words: &[String]) -> u32 {
+    //TODO: priorize lower levels of url, priorize word in url/title/description or main?
+    //atm priority is just the number of occurences in the site.
     words.iter().filter(|w| *w == word).count() as u32
 }
```
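For context, here is a condensed, hypothetical sketch of the two indexer code paths this diff touches, with the new names in place. It is not the actual OSSE code: the Actix and JSON plumbing is dropped, `Arc<String>` becomes a plain `String`, and equality/hashing is reduced to the URL alone so the multi-word intersection is meaningful in a standalone demo (the diff itself compares `url && word` and ignores `priority`).

```rust
use std::collections::{HashMap, HashSet};
use std::hash::{Hash, Hasher};

#[derive(Debug, Clone)]
struct IndexedResource {
    url: String,
    word: String,
    priority: u32,
}

// Equality and hashing on the URL only, so that result sets for different
// query words can intersect in this simplified, self-contained demo.
impl PartialEq for IndexedResource {
    fn eq(&self, other: &Self) -> bool {
        self.url == other.url
    }
}
impl Eq for IndexedResource {}
impl Hash for IndexedResource {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.url.hash(state);
    }
}

// add_resource path: every word of the crawled content becomes one
// IndexedResource entry stored under that word's key.
fn add_resource(url: &str, content: &str, db: &mut HashMap<String, HashSet<IndexedResource>>) {
    let words: Vec<String> = content.split(' ').map(str::to_owned).collect();
    for word in &words {
        // As the TODO notes: priority is just the occurrence count in the page.
        let priority = words.iter().filter(|w| *w == word).count() as u32;
        db.entry(word.clone()).or_default().insert(IndexedResource {
            url: url.to_owned(),
            word: word.clone(),
            priority,
        });
    }
}

// search path: a multi-word query returns the intersection of each word's
// result set, mirroring the Some(results) arm of the diff.
fn search(db: &HashMap<String, HashSet<IndexedResource>>, query: &str) -> HashSet<IndexedResource> {
    let mut valid_results: Option<HashSet<IndexedResource>> = None;
    for w in query.split(' ') {
        let curr = match db.get(w) {
            None => return HashSet::new(), // any unknown word yields no results
            Some(hits) => hits.clone(),
        };
        valid_results = Some(match valid_results {
            None => curr,
            Some(prev) => prev.intersection(&curr).cloned().collect(),
        });
    }
    valid_results.unwrap_or_default()
}

fn main() {
    let mut db = HashMap::new();
    add_resource("https://example.com", "rust search engine in rust", &mut db);
    for hit in search(&db, "rust engine") {
        println!("{} (priority {})", hit.url, hit.priority);
    }
}
```

Running the sketch indexes one page and prints it once for the query "rust engine", since both words occur on the page.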