author    Baitinq <manuelpalenzuelamerino@gmail.com>  2022-10-29 00:32:01 +0200
committer Baitinq <manuelpalenzuelamerino@gmail.com>  2022-10-29 00:37:38 +0200
commit    e797bffab9948016619a765839c3be67f2f13d8d
tree      3c2b850ab88b6086625222fdf061ac8b1f285ddd
parent    Frontend: Use ResultComponent to display search results
Crawler+Indexer+Frontend: Rename structs to follow logical relations

Resource is now CrawledResource, as it is created by the crawler, and the
previous CrawledResource is now IndexedResource, as it is created by the
indexer.
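
In short, each service now names the struct it produces. A minimal sketch of
the resulting data flow (fields abbreviated from the diff below):

    // Produced by the crawler, POSTed to the indexer's /resource endpoint.
    struct CrawledResource {
        url: String,
        content: String, // raw HTML
    }

    // Produced by the indexer from a CrawledResource, served to the frontend.
    struct IndexedResource {
        url: String,
        title: String,
        description: String,
        // ...plus the `word` and `priority` fields shown in the diff
    }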
 crawler/src/main.rs  |  4
 frontend/src/main.rs | 20
 indexer/src/main.rs  | 31
 3 files changed, 31 insertions(+), 24 deletions(-)
diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 5c15d14..a831655 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -138,12 +138,12 @@ async fn push_crawl_entry_to_indexer(
     dbg!("Pushin to indexer");
 
     #[derive(Serialize, Debug)]
-    struct Resource {
+    struct CrawledResource {
         url: String,
         content: String,
     }
 
-    let request_body = Resource { url, content };
+    let request_body = CrawledResource { url, content };
 
     match http_client
         .post(&indexer_url)
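
For context, the function this hunk lives in serializes the renamed struct and
POSTs it to the indexer. A hedged sketch of the whole call, matching the
`http_client.post` above (the error propagation is an assumption, not part of
this commit, and `.json()` needs reqwest's "json" feature):

    async fn push_crawl_entry_to_indexer(
        http_client: &reqwest::Client,
        indexer_url: String,
        url: String,
        content: String,
    ) -> Result<(), reqwest::Error> {
        #[derive(serde::Serialize, Debug)]
        struct CrawledResource {
            url: String,
            content: String,
        }

        let request_body = CrawledResource { url, content };

        // Serialize as JSON and push to the indexer's /resource endpoint.
        http_client
            .post(&indexer_url)
            .json(&request_body)
            .send()
            .await?;

        Ok(())
    }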
diff --git a/frontend/src/main.rs b/frontend/src/main.rs
index d2441d5..3d8bf51 100644
--- a/frontend/src/main.rs
+++ b/frontend/src/main.rs
@@ -11,7 +11,7 @@ use yew::prelude::*;
 
 //TODO: we should import this from the indexer
 #[derive(Debug, Clone, Deserialize)]
-pub struct CrawledResource {
+pub struct IndexedResource {
     url: String,
     title: String,
     description: String,
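
On the `//TODO: we should import this from the indexer` above: the usual fix
is a small shared workspace crate, so the frontend stops redeclaring the type.
A hypothetical sketch (the crate name `common` and the exact field types are
assumptions, not part of this commit):

    // common/src/lib.rs -- hypothetical crate depended on by both sides
    #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
    pub struct IndexedResource {
        pub url: String,
        pub title: String,
        pub description: String,
        pub priority: u32,
        // The indexer currently stores `word: Arc<String>`; serializing an
        // Arc needs serde's "rc" feature, so a plain String is simpler here.
        pub word: String,
    }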
@@ -20,26 +20,26 @@ pub struct CrawledResource {
 }
 
 //We implement PartialEq, Eq and Hash to ignore the priority field.
-impl PartialEq for CrawledResource {
+impl PartialEq for IndexedResource {
     fn eq(&self, other: &Self) -> bool {
         self.url == other.url && self.word == other.word
     }
 }
-impl Eq for CrawledResource {}
+impl Eq for IndexedResource {}
 
-impl PartialOrd for CrawledResource {
+impl PartialOrd for IndexedResource {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl Ord for CrawledResource {
+impl Ord for IndexedResource {
     fn cmp(&self, other: &Self) -> Ordering {
         self.priority.cmp(&other.priority).reverse()
     }
 }
 
-impl Hash for CrawledResource {
+impl Hash for IndexedResource {
     fn hash<H: Hasher>(&self, state: &mut H) {
         self.url.hash(state);
         self.word.hash(state);
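
These manual impls are load-bearing: Hash must agree with Eq, so two results
for the same url and word collapse into one HashSet entry even when their
priorities differ, while the reversed Ord puts higher-priority results first
when sorting. A standalone illustration of the dedup behaviour (not part of
the commit):

    use std::collections::HashSet;
    use std::hash::{Hash, Hasher};

    // Same rules as IndexedResource: priority is ignored by Eq and Hash.
    #[derive(Debug, Clone)]
    struct R { url: String, word: String, priority: u32 }

    impl PartialEq for R {
        fn eq(&self, other: &Self) -> bool {
            self.url == other.url && self.word == other.word
        }
    }
    impl Eq for R {}
    impl Hash for R {
        fn hash<H: Hasher>(&self, state: &mut H) {
            self.url.hash(state);
            self.word.hash(state);
        }
    }

    fn main() {
        let mut set = HashSet::new();
        set.insert(R { url: "a".into(), word: "osse".into(), priority: 1 });
        set.insert(R { url: "a".into(), word: "osse".into(), priority: 9 });
        assert_eq!(set.len(), 1); // deduplicated despite differing priority
    }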
@@ -48,7 +48,7 @@ impl Hash for CrawledResource {
 
 #[derive(Properties, Clone, PartialEq, Eq)]
 pub struct ResultComponentProps {
-    result: CrawledResource,
+    result: IndexedResource,
 }
 
 #[function_component(ResultComponent)]
@@ -61,7 +61,7 @@ fn result_component(props: &ResultComponentProps) -> Html {
 #[derive(Debug, Clone)]
 struct State {
     pub search_query: String,
-    pub results: Option<Vec<CrawledResource>>, //TODO: some loading?
+    pub results: Option<Vec<IndexedResource>>, //TODO: some loading?
 }
 
 #[function_component(OSSE)]
@@ -71,7 +71,7 @@ fn osse() -> Html {
         results: None,
     });
 
-    let display_results = |maybe_results: &Option<Vec<CrawledResource>>| -> Html {
+    let display_results = |maybe_results: &Option<Vec<IndexedResource>>| -> Html {
         let maybe_results = maybe_results.as_ref();
         if maybe_results.is_none() {
             return html! {};
@@ -128,7 +128,7 @@ fn osse() -> Html {
 
                     let fetched_results = Request::get(endpoint.as_str()).send().await.unwrap();
 
-                    let fetched_json: Vec<CrawledResource> = match fetched_results.json().await {
+                    let fetched_json: Vec<IndexedResource> = match fetched_results.json().await {
                         Err(e) => panic!("Im panic: {}", e),
                         Ok(json) => json,
                     };
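
One thing the rename does not improve: `panic!("Im panic: ...")` aborts the
whole wasm app on a bad response. A gentler sketch, assuming the same
gloo-net-style `Request` used above (`gloo_console` is an assumed extra
dependency), would fold a decode failure into an empty result list:

    // Sketch only: degrade to "no results" instead of panicking.
    let fetched_json: Vec<IndexedResource> = match fetched_results.json().await {
        Ok(json) => json,
        Err(e) => {
            gloo_console::error!(format!("indexer returned bad JSON: {}", e));
            vec![]
        }
    };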
diff --git a/indexer/src/main.rs b/indexer/src/main.rs
index 37a7256..825fe4d 100644
--- a/indexer/src/main.rs
+++ b/indexer/src/main.rs
@@ -6,7 +6,7 @@ use std::hash::{Hash, Hasher};
 use std::sync::{Arc, Mutex};
 
 #[derive(Debug, Clone, Serialize)]
-struct CrawledResource {
+struct IndexedResource {
     url: String,
     title: String,
     description: String,
@@ -15,13 +15,13 @@ struct CrawledResource {
 }
 
 //We implement PartialEq, Eq and Hash to ignore the priority field.
-impl PartialEq for CrawledResource {
+impl PartialEq for IndexedResource {
     fn eq(&self, other: &Self) -> bool {
         self.url == other.url && self.word == other.word
     }
 }
-impl Eq for CrawledResource {}
-impl Hash for CrawledResource {
+impl Eq for IndexedResource {}
+impl Hash for IndexedResource {
     fn hash<H: Hasher>(&self, state: &mut H) {
         self.url.hash(state);
         self.word.hash(state);
@@ -29,7 +29,7 @@ impl Hash for CrawledResource {
 }
 
 struct AppState {
-    database: Mutex<HashMap<String, HashSet<CrawledResource>>>,
+    database: Mutex<HashMap<String, HashSet<IndexedResource>>>,
 }
 
 #[actix_web::main]
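
The renamed field is the entire search index: a word-to-postings map behind a
Mutex, shared across actix-web workers. A hypothetical helper shows the access
pattern (lock first, then read):

    // Hypothetical helper, not from the commit: count one word's postings.
    fn postings_count(state: &AppState, word: &str) -> usize {
        let db = state.database.lock().unwrap();
        db.get(word).map_or(0, |set| set.len())
    }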
@@ -57,18 +57,23 @@ async fn serve_http_endpoint(address: &str, port: u16) -> std::io::Result<()> {
     .await
 }
 
+//TODO: sufficiently simmilar word in search (algorithm)
 //we need to rename stuff
 #[derive(Deserialize, Debug)]
-struct Resource {
+struct CrawledResource {
     url: String,
     content: String,
 }
 
 #[post("/resource")]
-async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>) -> impl Responder {
+async fn add_resource(
+    data: web::Data<AppState>,
+    resource: web::Json<CrawledResource>,
+) -> impl Responder {
     //parse content
     let document = scraper::Html::parse_document(resource.content.as_str());
 
+    //TODO: Not very good, can we just body.get_text()?
     let text = html2text::from_read(resource.content.as_str().as_bytes(), resource.content.len());
 
     let split_words = text.split(' ');
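
On the new TODO just above: the `document` parsed two lines earlier can
already yield plain text, which would drop the second pass over the raw HTML
through html2text. A sketch against the scraper crate's API (worth checking
against the pinned version):

    // Collect text nodes from the already-parsed DOM instead of re-reading
    // the raw HTML with html2text.
    let text: String = document
        .root_element()
        .text()
        .collect::<Vec<_>>()
        .join(" ");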
@@ -101,7 +106,7 @@ async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>)
     //and for each changed content word we add it to the db (word -> list.append(url))
     let mut database = data.database.lock().unwrap();
     for word in &fixed_words {
-        let resource_to_add = CrawledResource {
+        let resource_to_add = IndexedResource {
             url: resource.url.clone(),
             priority: calculate_word_priority(word, resource.content.as_str(), &fixed_words),
             word: Arc::new(word.clone()),
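
The actual insert into the database falls outside this hunk, so the diff does
not show it; the idiomatic shape for that step would be something like
(assumed, not from the commit):

    // Append this word's posting; or_default() creates the HashSet on
    // first sight of the word.
    database
        .entry(word.clone())
        .or_default()
        .insert(resource_to_add);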
@@ -130,7 +135,7 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp
     let database = data.database.lock().unwrap();
 
     //percentage of valid words
-    let mut valid_results: Option<HashSet<CrawledResource>> = None;
+    let mut valid_results: Option<HashSet<IndexedResource>> = None;
     for w in query {
         let curr_word_results = match search_word_in_db(&database, w.to_string()) {
             None => return "[]".to_string(),
@@ -143,7 +148,7 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp
                 valid_results = Some(curr_word_results.to_owned());
             }
             Some(results) => {
-                let intersection: HashSet<CrawledResource> = curr_word_results
+                let intersection: HashSet<IndexedResource> = curr_word_results
                     .intersection(&results)
                     .map(|s| s.to_owned())
                     .collect();
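
The AND semantics of multi-word queries come from HashSet::intersection, which
again leans on the url+word-only Eq and Hash. A standalone illustration (not
part of the commit):

    use std::collections::HashSet;

    fn main() {
        // Postings for two query words.
        let first: HashSet<&str> = ["page1", "page2"].into_iter().collect();
        let second: HashSet<&str> = ["page2", "page3"].into_iter().collect();

        // Only resources matching every query word survive.
        let both: HashSet<&str> = first.intersection(&second).copied().collect();
        assert_eq!(both.len(), 1); // just "page2"
    }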
@@ -156,13 +161,15 @@ async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Resp
 }
 
 fn search_word_in_db(
-    db: &HashMap<String, HashSet<CrawledResource>>,
+    db: &HashMap<String, HashSet<IndexedResource>>,
     word: String,
-) -> Option<&HashSet<CrawledResource>> {
+) -> Option<&HashSet<IndexedResource>> {
     db.get(&word)
 }
 
 fn calculate_word_priority(word: &str, _html_site: &str, words: &[String]) -> u32 {
+    //TODO: priorize lower levels of url, priorize word in url/title/description or main?
+
     //atm priority is just the number of occurences in the site.
     words.iter().filter(|w| *w == word).count() as u32
 }
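
On the closing TODO: the occurrence count is easy to extend with positional
weighting later. A hypothetical variant that boosts words appearing in the URL
(the weights are invented for illustration only):

    // Hypothetical refinement, not part of this commit: occurrences still
    // dominate, but a word found in the URL earns a flat bonus.
    fn calculate_word_priority(word: &str, url: &str, words: &[String]) -> u32 {
        let occurrences = words.iter().filter(|w| *w == word).count() as u32;
        let url_bonus = if url.contains(word) { 10 } else { 0 };
        occurrences + url_bonus
    }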