diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-30 23:08:16 +0100 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-30 23:14:37 +0100 |
commit | 1a26c40191bda843a0500f12bbb7d67b3e8c238e (patch) | |
tree | 55cc8cb2878c82e29bef0e7e65f7b03be81ab39d | |
parent | Crawler: Set 4 as the maximum "crawl depth" (diff) | |
download | OSSE-1a26c40191bda843a0500f12bbb7d67b3e8c238e.tar.gz OSSE-1a26c40191bda843a0500f12bbb7d67b3e8c238e.tar.bz2 OSSE-1a26c40191bda843a0500f12bbb7d67b3e8c238e.zip |
Indexer: Use kuchiki to split html content into words
This is better than html2text when using non-ASCII characters.
-rw-r--r-- | Cargo.lock | 13 | ||||
-rw-r--r-- | indexer/Cargo.toml | 1 | ||||
-rw-r--r-- | indexer/src/main.rs | 24 |
3 files changed, 32 insertions, 6 deletions
diff --git a/Cargo.lock b/Cargo.lock index 54f74db..72b7a20 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1146,6 +1146,7 @@ dependencies = [ "actix-cors", "actix-web", "html2text", + "kuchiki", "lib", "scraper", "serde_json", @@ -1235,6 +1236,18 @@ dependencies = [ ] [[package]] +name = "kuchiki" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ea8e9c6e031377cff82ee3001dc8026cdf431ed4e2e6b51f98ab8c73484a358" +dependencies = [ + "cssparser", + "html5ever 0.25.2", + "matches", + "selectors", +] + +[[package]] name = "language-tags" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml index 28e6f17..2c8f905 100644 --- a/indexer/Cargo.toml +++ b/indexer/Cargo.toml @@ -11,6 +11,7 @@ actix-cors = "0.6.3" scraper = "0.12.0" html2text = "0.4.3" serde_json = "1.0.87" +kuchiki = "0.8.1" lib = { path = "../lib" } [[bin]] diff --git a/indexer/src/main.rs b/indexer/src/main.rs index 36bd8de..1df3cf5 100644 --- a/indexer/src/main.rs +++ b/indexer/src/main.rs @@ -3,6 +3,7 @@ use actix_web::{get, post, web, App, HttpServer, Responder}; use std::collections::{HashMap, HashSet}; use std::sync::{Arc, Mutex}; use lib::lib::*; +use kuchiki::traits::TendrilSink; struct AppState { database: Mutex<HashMap<String, HashSet<IndexedResource>>>, @@ -41,17 +42,28 @@ async fn add_resource( ) -> impl Responder { //parse content let document = scraper::Html::parse_document(resource.content.as_str()); - - //TODO: Not very good, can we just body.get_text()? 
- let text = html2text::from_read(resource.content.as_str().as_bytes(), resource.content.len()); + let kuchiki_parser = kuchiki::parse_html().one(resource.content.as_str()); + + //remove script, style and noscript tags + kuchiki_parser + .inclusive_descendants() + .filter(|node| { + node.as_element().map_or(false, |e| { + matches!(e.name.local.as_ref(), "script" | "style" | "noscript") + }) + }) + .collect::<Vec<_>>() + .iter() + .for_each(|node| node.detach()); + + let text = kuchiki_parser.text_contents(); let split_words = text.split(' '); //fixup words (remove words with non alphabetic chars, empty words, transform to lowercase...) let fixed_words: Vec<String> = split_words - .filter(|w| !w.chars().any(|c| !c.is_ascii_alphabetic())) - .filter(|w| !w.is_empty() && *w != " ") - .map(|w| w.to_ascii_lowercase()) + .map(|w| w.to_ascii_lowercase().split_whitespace().collect()) + .filter(|w: &String| !w.is_empty()) .collect(); println!("xd: {:?}", fixed_words); |