diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-28 19:24:58 +0200 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-28 19:25:01 +0200 |
commit | e5291cab1b2d1b6a01164b9bbb31d812062e6e66 (patch) | |
tree | 02a42a70a97718f78781a2e0395c91521c96ac09 | |
parent | Frontend: Refactor search_word_in_db() to not need explicit lifetimes (diff) | |
download | OSSE-e5291cab1b2d1b6a01164b9bbb31d812062e6e66.tar.gz OSSE-e5291cab1b2d1b6a01164b9bbb31d812062e6e66.tar.bz2 OSSE-e5291cab1b2d1b6a01164b9bbb31d812062e6e66.zip |
Indexer: Add website title and description to the CrawledResource
We now parse the HTML and extract the title and description of the site.
-rw-r--r-- | Cargo.lock | 1 | ||||
-rw-r--r-- | frontend/Cargo.toml | 1 | ||||
-rw-r--r-- | indexer/src/main.rs | 25 |
3 files changed, 26 insertions, 1 deletion
diff --git a/Cargo.lock b/Cargo.lock
index 0bd1e55..fd3a99f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -619,6 +619,7 @@ dependencies = [
   "gloo 0.8.0",
   "gloo-net",
   "itertools",
+ "scraper",
   "serde",
   "wasm-bindgen",
   "wasm-bindgen-futures",
diff --git a/frontend/Cargo.toml b/frontend/Cargo.toml
index 106b818..23acba2 100644
--- a/frontend/Cargo.toml
+++ b/frontend/Cargo.toml
@@ -14,3 +14,4 @@ gloo-net = "0.2"
 serde = { version = "1.0", features = ["derive", "rc"] }
 wasm-bindgen-futures = "0.4"
 itertools = "0.10.5"
+scraper = "0.12.0"
diff --git a/indexer/src/main.rs b/indexer/src/main.rs
index e63533b..36eabff 100644
--- a/indexer/src/main.rs
+++ b/indexer/src/main.rs
@@ -9,7 +9,9 @@ use std::sync::{Arc, Mutex};
 #[derive(Debug, Clone, Serialize)]
 struct CrawledResource {
     url: String,
-    priority: u32, //how do we even calculate this
+    title: String,
+    description: String,
+    priority: u32,
     word: Arc<String>,
 }
 
@@ -56,6 +58,7 @@ async fn serve_http_endpoint(address: &str, port: u16) -> std::io::Result<()> {
     .await
 }
 
+//we need to rename stuff
 #[derive(Deserialize, Debug)]
 struct Resource {
     url: String,
@@ -65,6 +68,8 @@ struct Resource {
 #[post("/resource")]
 async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>) -> impl Responder {
     //parse content
+    let document = scraper::Html::parse_document(resource.content.as_str());
+
     let text = html2text::from_read(resource.content.as_str().as_bytes(), resource.content.len());
 
     let split_words = text.split(' ');
@@ -78,6 +83,22 @@ async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>)
 
     println!("xd: {:?}", fixed_words);
 
+    let title_selector = scraper::Selector::parse("title").unwrap();
+    let description_selector = scraper::Selector::parse("meta").unwrap();
+
+    let page_title: String = document
+        .select(&title_selector)
+        .map(|e| e.inner_html())
+        .take(1)
+        .collect();
+
+    let page_description: String = document
+        .select(&description_selector)
+        .filter(|e| e.value().attr("name") == Some("description"))
+        .filter_map(|e| e.value().attr("content"))
+        .take(1)
+        .collect();
+
     //and for each changed content word we add it to the db (word -> list.append(url))
     let mut database = data.database.lock().unwrap();
     for word in fixed_words {
@@ -85,6 +106,8 @@ async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>)
             url: resource.url.clone(),
             priority: calculate_word_priority(&word, resource.content.as_str()),
             word: Arc::new(word.clone()),
+            title: page_title.clone(),
+            description: page_description.clone(),
         };
 
         match database.get_mut(&word) { |