author    Baitinq <manuelpalenzuelamerino@gmail.com>  2022-10-28 19:24:58 +0200
committer Baitinq <manuelpalenzuelamerino@gmail.com>  2022-10-28 19:25:01 +0200
commit    e5291cab1b2d1b6a01164b9bbb31d812062e6e66 (patch)
tree      02a42a70a97718f78781a2e0395c91521c96ac09
parent    Frontend: Refactor search_word_in_db() to not need explicit lifetimes (diff)
Indexer: Add website title and description to the CrawledResource
We now parse the HTML and extract the title and description of the site.
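
For context, here is a minimal standalone sketch of the extraction approach this commit introduces (the same scraper calls appear in the indexer/src/main.rs hunks below; the sample HTML is invented for illustration):

use scraper::{Html, Selector};

fn main() {
    let html = r#"<html><head>
        <title>Example Page</title>
        <meta name="description" content="An example page.">
    </head><body>hello</body></html>"#;

    let document = Html::parse_document(html);

    // Take the inner HTML of the first <title> element.
    let title_selector = Selector::parse("title").unwrap();
    let page_title: String = document
        .select(&title_selector)
        .map(|e| e.inner_html())
        .take(1)
        .collect();

    // Take the content attribute of the first <meta name="description">.
    let description_selector = Selector::parse("meta").unwrap();
    let page_description: String = document
        .select(&description_selector)
        .filter(|e| e.value().attr("name") == Some("description"))
        .filter_map(|e| e.value().attr("content"))
        .take(1)
        .collect();

    println!("title: {:?}, description: {:?}", page_title, page_description);
}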
-rw-r--r--  Cargo.lock           |  1 +
-rw-r--r--  frontend/Cargo.toml  |  1 +
-rw-r--r--  indexer/src/main.rs  | 25 +++++++++++++++++++++++-
3 files changed, 26 insertions(+), 1 deletion(-)
diff --git a/Cargo.lock b/Cargo.lock
index 0bd1e55..fd3a99f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -619,6 +619,7 @@ dependencies = [
  "gloo 0.8.0",
  "gloo-net",
  "itertools",
+ "scraper",
  "serde",
  "wasm-bindgen",
  "wasm-bindgen-futures",
diff --git a/frontend/Cargo.toml b/frontend/Cargo.toml
index 106b818..23acba2 100644
--- a/frontend/Cargo.toml
+++ b/frontend/Cargo.toml
@@ -14,3 +14,4 @@ gloo-net = "0.2"
 serde = { version = "1.0", features = ["derive", "rc"] }
 wasm-bindgen-futures = "0.4"
 itertools = "0.10.5"
+scraper = "0.12.0"
diff --git a/indexer/src/main.rs b/indexer/src/main.rs
index e63533b..36eabff 100644
--- a/indexer/src/main.rs
+++ b/indexer/src/main.rs
@@ -9,7 +9,9 @@ use std::sync::{Arc, Mutex};
 #[derive(Debug, Clone, Serialize)]
 struct CrawledResource {
     url: String,
-    priority: u32, //how do we even calculate this
+    title: String,
+    description: String,
+    priority: u32,
     word: Arc<String>,
 }
 
@@ -56,6 +58,7 @@ async fn serve_http_endpoint(address: &str, port: u16) -> std::io::Result<()> {
     .await
 }
 
+// TODO: rename this struct and its fields to something more descriptive
 #[derive(Deserialize, Debug)]
 struct Resource {
     url: String,
@@ -65,6 +68,8 @@ struct Resource {
 #[post("/resource")]
 async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>) -> impl Responder {
     //parse content
+    let document = scraper::Html::parse_document(resource.content.as_str());
+
     let text = html2text::from_read(resource.content.as_str().as_bytes(), resource.content.len());
 
     let split_words = text.split(' ');
@@ -78,6 +83,22 @@ async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>)
 
     println!("xd: {:?}", fixed_words);
 
+    let title_selector = scraper::Selector::parse("title").unwrap();
+    let description_selector = scraper::Selector::parse("meta").unwrap();
+
+    let page_title: String = document
+        .select(&title_selector)
+        .map(|e| e.inner_html())
+        .take(1)
+        .collect();
+
+    let page_description: String = document
+        .select(&description_selector)
+        .filter(|e| e.value().attr("name") == Some("description"))
+        .filter_map(|e| e.value().attr("content"))
+        .take(1)
+        .collect();
+
     //and for each changed content word we add it to the db (word -> list.append(url))
     let mut database = data.database.lock().unwrap();
     for word in fixed_words {
@@ -85,6 +106,8 @@ async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>)
             url: resource.url.clone(),
             priority: calculate_word_priority(&word, resource.content.as_str()),
             word: Arc::new(word.clone()),
+            title: page_title.clone(),
+            description: page_description.clone(),
         };
 
         match database.get_mut(&word) {
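
For reference, a hypothetical client for the /resource endpoint shown above. Only the Resource fields (url, content) come from this diff; the address, port, and use of reqwest/tokio are assumptions for illustration, not part of this commit:

use serde::Serialize;

#[derive(Serialize)]
struct Resource {
    url: String,
    content: String,
}

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let resource = Resource {
        url: "https://example.com".into(),
        content: "<html><head><title>Example</title>\
                  <meta name=\"description\" content=\"An example page.\">\
                  </head><body>hello world</body></html>"
            .into(),
    };

    // POST the crawled page; the indexer extracts the title and
    // description and indexes each word of the content.
    let client = reqwest::Client::new();
    client
        .post("http://127.0.0.1:4444/resource")
        .json(&resource)
        .send()
        .await?
        .error_for_status()?;
    Ok(())
}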