author     Baitinq <[email protected]>  2022-10-29 00:32:01 +0200
committer  Baitinq <[email protected]>  2022-10-29 00:37:38 +0200
commit     e797bffab9948016619a765839c3be67f2f13d8d (patch)
tree       3c2b850ab88b6086625222fdf061ac8b1f285ddd
parent     Frontend: Use ResultComponent to display search results (diff)
Crawler+Indexer+Frontend: Rename structs to follow logical relations
Now Resource is CrawledResource, as it is created by the crawler, and the
previous CrawledResource is now IndexedResource, as it is created by the
indexer.
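
In short, the names now mirror the pipeline: the crawler emits a
CrawledResource (URL plus raw page), and the indexer derives per-word
IndexedResource entries from it. A minimal sketch of the two types as they
stand after this commit (field lists taken from the hunks below; the
priority and word fields are inferred from the indexer code):

    use std::sync::Arc;

    // Sent by the crawler to the indexer's /resource endpoint.
    struct CrawledResource {
        url: String,
        content: String, // raw HTML of the crawled page
    }

    // Stored by the indexer: one entry per (word, url) pair.
    struct IndexedResource {
        url: String,
        title: String,
        description: String,
        priority: u32,     // occurrence count, see calculate_word_priority
        word: Arc<String>, // the word under which this entry is indexed
    }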
Diffstat
 -rw-r--r--  crawler/src/main.rs   |  4
 -rw-r--r--  frontend/src/main.rs  | 20
 -rw-r--r--  indexer/src/main.rs   | 31
 3 files changed, 31 insertions(+), 24 deletions(-)
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 5c15d14..a831655 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -138,12 +138,12 @@ async fn push_crawl_entry_to_indexer(
     dbg!("Pushin to indexer");
 
     #[derive(Serialize, Debug)]
-    struct Resource {
+    struct CrawledResource {
         url: String,
         content: String,
     }
 
-    let request_body = Resource { url, content };
+    let request_body = CrawledResource { url, content };
 
     match http_client
         .post(&indexer_url)
diff --git a/frontend/src/main.rs b/frontend/src/main.rs
index d2441d5..3d8bf51 100644
--- a/frontend/src/main.rs
+++ b/frontend/src/main.rs
@@ -11,7 +11,7 @@ use yew::prelude::*;
 
 //TODO: we should import this from the indexer
 #[derive(Debug, Clone, Deserialize)]
-pub struct CrawledResource {
+pub struct IndexedResource {
     url: String,
     title: String,
     description: String,
@@ -20,26 +20,26 @@ pub struct CrawledResource {
 }
 
 //We implement PartialEq, Eq and Hash to ignore the priority field.
-impl PartialEq for CrawledResource {
+impl PartialEq for IndexedResource {
     fn eq(&self, other: &Self) -> bool {
         self.url == other.url && self.word == other.word
     }
 }
-impl Eq for CrawledResource {}
+impl Eq for IndexedResource {}
 
-impl PartialOrd for CrawledResource {
+impl PartialOrd for IndexedResource {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl Ord for CrawledResource {
+impl Ord for IndexedResource {
     fn cmp(&self, other: &Self) -> Ordering {
         self.priority.cmp(&other.priority).reverse()
     }
 }
 
-impl Hash for CrawledResource {
+impl Hash for IndexedResource {
     fn hash<H: Hasher>(&self, state: &mut H) {
         self.url.hash(state);
         self.word.hash(state);
@@ -48,7 +48,7 @@ impl Hash for CrawledResource {
 
 #[derive(Properties, Clone, PartialEq, Eq)]
 pub struct ResultComponentProps {
-    result: CrawledResource,
+    result: IndexedResource,
 }
 
 #[function_component(ResultComponent)]
@@ -61,7 +61,7 @@ fn result_component(props: &ResultComponentProps) -> Html {
 #[derive(Debug, Clone)]
 struct State {
     pub search_query: String,
-    pub results: Option<Vec<CrawledResource>>, //TODO: some loading?
+    pub results: Option<Vec<IndexedResource>>, //TODO: some loading?
 }
 
 #[function_component(OSSE)]
@@ -71,7 +71,7 @@ fn osse() -> Html {
         results: None,
     });
 
-    let display_results = |maybe_results: &Option<Vec<CrawledResource>>| -> Html {
+    let display_results = |maybe_results: &Option<Vec<IndexedResource>>| -> Html {
         let maybe_results = maybe_results.as_ref();
         if maybe_results.is_none() {
             return html! {};
@@ -128,7 +128,7 @@ fn osse() -> Html {
 
                     let fetched_results = Request::get(endpoint.as_str()).send().await.unwrap();
 
-                    let fetched_json: Vec<CrawledResource> = match fetched_results.json().await {
+                    let fetched_json: Vec<IndexedResource> = match fetched_results.json().await {
                         Err(e) => panic!("I'm panicking: {}", e),
                         Ok(json) => json,
                     };
diff --git a/indexer/src/main.rs b/indexer/src/main.rs
index 37a7256..825fe4d 100644
--- a/indexer/src/main.rs
+++ b/indexer/src/main.rs
@@ -6,7 +6,7 @@ use std::hash::{Hash, Hasher};
 use std::sync::{Arc, Mutex};
 
 #[derive(Debug, Clone, Serialize)]
-struct CrawledResource {
+struct IndexedResource {
     url: String,
     title: String,
     description: String,
@@ -15,13 +15,13 @@ struct CrawledResource {
 }
 
 //We implement PartialEq, Eq and Hash to ignore the priority field.
-impl PartialEq for CrawledResource {
+impl PartialEq for IndexedResource {
     fn eq(&self, other: &Self) -> bool {
         self.url == other.url && self.word == other.word
     }
 }
-impl Eq for CrawledResource {}
-impl Hash for CrawledResource {
+impl Eq for IndexedResource {}
+impl Hash for IndexedResource {
     fn hash<H: Hasher>(&self, state: &mut H) {
         self.url.hash(state);
         self.word.hash(state);
@@ -29,7 +29,7 @@ impl Hash for CrawledResource {
 }
 
 struct AppState {
-    database: Mutex<HashMap<String, HashSet<CrawledResource>>>,
+    database: Mutex<HashMap<String, HashSet<IndexedResource>>>,
 }
 
 #[actix_web::main]
@@ -57,18 +57,23 @@ async fn serve_http_endpoint(address: &str, port: u16) -> std::io::Result<()> {
     .await
 }
 
+//TODO: sufficiently similar word in search (algorithm)
 //we need to rename stuff
 #[derive(Deserialize, Debug)]
-struct Resource {
+struct CrawledResource {
     url: String,
     content: String,
 }
 
 #[post("/resource")]
-async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>) -> impl Responder {
+async fn add_resource(
+    data: web::Data<AppState>,
+    resource: web::Json<CrawledResource>,
+) -> impl Responder {
     //parse content
     let document = scraper::Html::parse_document(resource.content.as_str());
 
+    //TODO: Not very good, can we just body.get_text()?
     let text = html2text::from_read(resource.content.as_str().as_bytes(), resource.content.len());
 
     let split_words = text.split(' ');
@@ -101,7 +106,7 @@ async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>)
     //and for each changed content word we add it to the db (word -> list.append(url))
     let mut database = data.database.lock().unwrap();
     for word in &fixed_words {
-        let resource_to_add = CrawledResource {
+        let resource_to_add = IndexedResource {
             url: resource.url.clone(),
             priority: calculate_word_priority(word, resource.content.as_str(), &fixed_words),
             word: Arc::new(word.clone()),
@@ -130,7 +135,7 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp
     let database = data.database.lock().unwrap();
 
     //percentage of valid words
-    let mut valid_results: Option<HashSet<CrawledResource>> = None;
+    let mut valid_results: Option<HashSet<IndexedResource>> = None;
     for w in query {
         let curr_word_results = match search_word_in_db(&database, w.to_string()) {
             None => return "[]".to_string(),
@@ -143,7 +148,7 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp
                 valid_results = Some(curr_word_results.to_owned());
             }
             Some(results) => {
-                let intersection: HashSet<CrawledResource> = curr_word_results
+                let intersection: HashSet<IndexedResource> = curr_word_results
                     .intersection(&results)
                     .map(|s| s.to_owned())
                     .collect();
@@ -156,13 +161,15 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp
 }
 
 fn search_word_in_db(
-    db: &HashMap<String, HashSet<CrawledResource>>,
+    db: &HashMap<String, HashSet<IndexedResource>>,
     word: String,
-) -> Option<&HashSet<CrawledResource>> {
+) -> Option<&HashSet<IndexedResource>> {
     db.get(&word)
 }
 
 fn calculate_word_priority(word: &str, _html_site: &str, words: &[String]) -> u32 {
+    //TODO: prioritize lower levels of url, prioritize word in url/title/description or main?
+
     //atm priority is just the number of occurrences in the site.
     words.iter().filter(|w| *w == word).count() as u32
 }
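
For reference, a minimal sketch (hypothetical, not part of this commit) of
the occurrence-count behaviour described in the comment above:

    fn main() {
        let words: Vec<String> = ["rust", "lang", "rust"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        // "rust" appears twice in the word list, so its priority is 2.
        assert_eq!(calculate_word_priority("rust", "", &words), 2);
    }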