about summary refs log tree commit diff
path: root/Cargo.lock
diff options
context:
space:
mode:
author: Baitinq <manuelpalenzuelamerino@gmail.com> 2022-10-30 23:08:16 +0100
committer: Baitinq <manuelpalenzuelamerino@gmail.com> 2022-10-30 23:14:37 +0100
commit: 1a26c40191bda843a0500f12bbb7d67b3e8c238e (patch)
tree: 55cc8cb2878c82e29bef0e7e65f7b03be81ab39d /Cargo.lock
parent: Crawler: Set 4 as the maximum "crawl depth" (diff)
download: OSSE-1a26c40191bda843a0500f12bbb7d67b3e8c238e.tar.gz
OSSE-1a26c40191bda843a0500f12bbb7d67b3e8c238e.tar.bz2
OSSE-1a26c40191bda843a0500f12bbb7d67b3e8c238e.zip
Indexer: Use kuchiki to split html content into words
This is better than html2text when using non-ascii characters.
Diffstat (limited to 'Cargo.lock')
-rw-r--r-- Cargo.lock 13
1 file changed, 13 insertions, 0 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 54f74db..72b7a20 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1146,6 +1146,7 @@ dependencies = [
  "actix-cors",
  "actix-web",
  "html2text",
+ "kuchiki",
  "lib",
  "scraper",
  "serde_json",
@@ -1235,6 +1236,18 @@ dependencies = [
 ]
 
 [[package]]
+name = "kuchiki"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ea8e9c6e031377cff82ee3001dc8026cdf431ed4e2e6b51f98ab8c73484a358"
+dependencies = [
+ "cssparser",
+ "html5ever 0.25.2",
+ "matches",
+ "selectors",
+]
+
+[[package]]
 name = "language-tags"
 version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"