diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-30 23:08:16 +0100 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-30 23:14:37 +0100 |
commit | 1a26c40191bda843a0500f12bbb7d67b3e8c238e (patch) | |
tree | 55cc8cb2878c82e29bef0e7e65f7b03be81ab39d | |
parent | Crawler: Set 4 as the maximum "crawl depth" (diff) | |
download | OSSE-1a26c40191bda843a0500f12bbb7d67b3e8c238e.tar.gz OSSE-1a26c40191bda843a0500f12bbb7d67b3e8c238e.tar.bz2 OSSE-1a26c40191bda843a0500f12bbb7d67b3e8c238e.zip |
Indexer: Use kuchiki to split html content into words
This is better than html2text when using non-ASCII characters.
-rw-r--r-- | Cargo.lock | 13 | ||||
-rw-r--r-- | indexer/Cargo.toml | 1 | ||||
-rw-r--r-- | indexer/src/main.rs | 24 |
3 files changed, 32 insertions, 6 deletions
diff --git a/Cargo.lock b/Cargo.lock index 54f74db..72b7a20 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1146,6 +1146,7 @@ dependencies = [ "actix-cors", "actix-web", "html2text", + "kuchiki", "lib", "scraper", "serde_json", @@ -1235,6 +1236,18 @@ dependencies = [ ] [[package]] +name = "kuchiki" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ea8e9c6e031377cff82ee3001dc8026cdf431ed4e2e6b51f98ab8c73484a358" +dependencies = [ + "cssparser", + "html5ever 0.25.2", + "matches", + "selectors", +] + +[[package]] name = "language-tags" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml index 28e6f17..2c8f905 100644 --- a/indexer/Cargo.toml +++ b/indexer/Cargo.toml @@ -11,6 +11,7 @@ actix-cors = "0.6.3" scraper = "0.12.0" html2text = "0.4.3" serde_json = "1.0.87" +kuchiki = "0.8.1" lib = { path = "../lib" } [[bin]] diff --git a/indexer/src/main.rs b/indexer/src/main.rs index 36bd8de..1df3cf5 100644 --- a/indexer/src/main.rs +++ b/indexer/src/main.rs @@ -3,6 +3,7 @@ use actix_web::{get, post, web, App, HttpServer, Responder}; use std::collections::{HashMap, HashSet}; use std::sync::{Arc, Mutex}; use lib::lib::*; +use kuchiki::traits::TendrilSink; struct AppState { database: Mutex<HashMap<String, HashSet<IndexedResource>>>, @@ -41,17 +42,28 @@ async fn add_resource( ) -> impl Responder { //parse content let document = scraper::Html::parse_document(resource.content.as_str()); - - //TODO: Not very good, can we just body.get_text()? 
- let text = html2text::from_read(resource.content.as_str().as_bytes(), resource.content.len()); + let kuchiki_parser = kuchiki::parse_html().one(resource.content.as_str()); + + //remove script, style and noscript tags + kuchiki_parser + .inclusive_descendants() + .filter(|node| { + node.as_element().map_or(false, |e| { + matches!(e.name.local.as_ref(), "script" | "style" | "noscript") + }) + }) + .collect::<Vec<_>>() + .iter() + .for_each(|node| node.detach()); + + let text = kuchiki_parser.text_contents(); let split_words = text.split(' '); //fixup words (remove words with non alphabetic chars, empty words, transform to lowercase...) let fixed_words: Vec<String> = split_words - .filter(|w| !w.chars().any(|c| !c.is_ascii_alphabetic())) - .filter(|w| !w.is_empty() && *w != " ") - .map(|w| w.to_ascii_lowercase()) + .map(|w| w.to_ascii_lowercase().split_whitespace().collect()) + .filter(|w: &String| !w.is_empty()) .collect(); println!("xd: {:?}", fixed_words); |