diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-30 23:08:16 +0100 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-30 23:14:37 +0100 |
commit | 1a26c40191bda843a0500f12bbb7d67b3e8c238e (patch) | |
tree | 55cc8cb2878c82e29bef0e7e65f7b03be81ab39d /Cargo.lock | |
parent | Crawler: Set 4 as the maximum "crawl depth" (diff) | |
download | OSSE-1a26c40191bda843a0500f12bbb7d67b3e8c238e.tar.gz OSSE-1a26c40191bda843a0500f12bbb7d67b3e8c238e.tar.bz2 OSSE-1a26c40191bda843a0500f12bbb7d67b3e8c238e.zip |
Indexer: Use kuchiki to split HTML content into words
This is better than html2text when using non-ASCII characters.
Diffstat (limited to 'Cargo.lock')
-rw-r--r-- | Cargo.lock | 13 |
1 files changed, 13 insertions, 0 deletions
```diff
diff --git a/Cargo.lock b/Cargo.lock
index 54f74db..72b7a20 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1146,6 +1146,7 @@ dependencies = [
  "actix-cors",
  "actix-web",
  "html2text",
+ "kuchiki",
  "lib",
  "scraper",
  "serde_json",
@@ -1235,6 +1236,18 @@ dependencies = [
 ]
 
 [[package]]
+name = "kuchiki"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ea8e9c6e031377cff82ee3001dc8026cdf431ed4e2e6b51f98ab8c73484a358"
+dependencies = [
+ "cssparser",
+ "html5ever 0.25.2",
+ "matches",
+ "selectors",
+]
+
+[[package]]
 name = "language-tags"
 version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
```