diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-11-06 23:23:19 +0100 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-11-06 23:26:07 +0100 |
commit | 4e4b9e48fb779ea2a94bed7207a0e179de4e4484 (patch) | |
tree | b119efe1972f81d9c004ddf95d827f4de7f537c7 | |
parent | Frontend: Display a maximum number of chars for title and desc (diff) | |
download | OSSE-4e4b9e48fb779ea2a94bed7207a0e179de4e4484.tar.gz OSSE-4e4b9e48fb779ea2a94bed7207a0e179de4e4484.tar.bz2 OSSE-4e4b9e48fb779ea2a94bed7207a0e179de4e4484.zip |
Indexer: Decode HTML entities for website title and description
Maybe we should do it for all the website's content too? :))
-rw-r--r-- | Cargo.lock | 94 | ||||
-rw-r--r-- | indexer/Cargo.toml | 2 | ||||
-rw-r--r-- | indexer/src/main.rs | 7 |
3 files changed, 22 insertions, 81 deletions
diff --git a/Cargo.lock b/Cargo.lock index 19d14dc..c6c6e6f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -615,7 +615,7 @@ dependencies = [ "dtoa-short", "itoa 0.4.8", "matches", - "phf 0.8.0", + "phf", "proc-macro2", "quote", "smallvec", @@ -1186,16 +1186,12 @@ dependencies = [ ] [[package]] -name = "html2text" -version = "0.4.3" +name = "html-escape" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db2a75f4fdb748c0980b4d04f8edafc749bf4b5bfa738bf6c1565c7e6118d6ca" +checksum = "15315cfa9503e9aa85a477138eff76a1b203a430703548052c330b69d8d8c205" dependencies = [ - "html5ever 0.26.0", - "markup5ever 0.11.0", - "tendril", - "unicode-width", - "xml5ever", + "utf8-width", ] [[package]] @@ -1206,21 +1202,7 @@ checksum = "e5c13fb08e5d4dfc151ee5e88bae63f7773d61852f3bdc73c9f4b9e1bde03148" dependencies = [ "log", "mac", - "markup5ever 0.10.1", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "html5ever" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" -dependencies = [ - "log", - "mac", - "markup5ever 0.11.0", + "markup5ever", "proc-macro2", "quote", "syn", @@ -1341,7 +1323,7 @@ dependencies = [ "actix-web", "actix-web-lab", "env_logger", - "html2text", + "html-escape", "kuchiki", "lib", "log", @@ -1443,7 +1425,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ea8e9c6e031377cff82ee3001dc8026cdf431ed4e2e6b51f98ab8c73484a358" dependencies = [ "cssparser", - "html5ever 0.25.2", + "html5ever", "matches", "selectors", ] @@ -1532,22 +1514,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd" dependencies = [ "log", - "phf 0.8.0", - "phf_codegen 0.8.0", - "string_cache", - "string_cache_codegen", - "tendril", -] - -[[package]] -name = "markup5ever" -version = "0.11.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" -dependencies = [ - "log", - "phf 0.10.1", - "phf_codegen 0.10.0", + "phf", + "phf_codegen", "string_cache", "string_cache_codegen", "tendril", @@ -1839,15 +1807,6 @@ dependencies = [ ] [[package]] -name = "phf" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" -dependencies = [ - "phf_shared 0.10.0", -] - -[[package]] name = "phf_codegen" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1858,16 +1817,6 @@ dependencies = [ ] [[package]] -name = "phf_codegen" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" -dependencies = [ - "phf_generator 0.10.0", - "phf_shared 0.10.0", -] - -[[package]] name = "phf_generator" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -2271,7 +2220,7 @@ dependencies = [ "cssparser", "ego-tree", "getopts", - "html5ever 0.25.2", + "html5ever", "matches", "selectors", "smallvec", @@ -2313,8 +2262,8 @@ dependencies = [ "fxhash", "log", "matches", - "phf 0.8.0", - "phf_codegen 0.8.0", + "phf", + "phf_codegen", "precomputed-hash", "servo_arc", "smallvec", @@ -2888,6 +2837,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" [[package]] +name = "utf8-width" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5190c9442dcdaf0ddd50f37420417d219ae5261bbf5db120d0f9bab996c9cba1" + +[[package]] name = "vcpkg" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -3162,17 +3117,6 @@ dependencies = [ ] [[package]] -name = "xml5ever" -version = 
"0.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650" -dependencies = [ - "log", - "mac", - "markup5ever 0.11.0", -] - -[[package]] name = "yew" version = "0.19.3" source = "git+https://github.com/yewstack/yew/#da09755c27bfeb113e0c4b1096214a826f1e8388" diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml index b94f004..b437edf 100644 --- a/indexer/Cargo.toml +++ b/indexer/Cargo.toml @@ -10,7 +10,7 @@ actix-web = "4.2.1" actix-web-lab = "0.18.5" actix-cors = "0.6.3" scraper = "0.12.0" -html2text = "0.4.3" +html-escape = "0.2.12" serde = { version = "1.0", features = ["derive", "rc"] } serde_json = "1.0.87" kuchiki = "0.8.1" diff --git a/indexer/src/main.rs b/indexer/src/main.rs index 515062d..70a7649 100644 --- a/indexer/src/main.rs +++ b/indexer/src/main.rs @@ -55,9 +55,6 @@ async fn serve_http_endpoint(address: &str, port: u16) -> std::io::Result<()> { .await } -//TODO: Max description size -//TODO: Current result below search bar updates with it -//TODO: Remove html symbols italic and stuff in frontend (or apply them?) //TODO: Better readme //TODO: sufficiently simmilar word in search (algorithm) @@ -105,7 +102,7 @@ async fn add_resource( .collect::<String>() { s if s.is_empty() => None, - string => Some(string), + string => Some(html_escape::decode_html_entities(&string).to_string()), }; let page_description: Option<String> = match document @@ -116,7 +113,7 @@ async fn add_resource( .collect::<String>() { s if s.is_empty() => None, - string => Some(string), + string => Some(html_escape::decode_html_entities(&string).to_string()), }; //TODO: rewrite with if let else |