about summary refs log tree commit diff
diff options
context:
space:
mode:
author Baitinq <manuelpalenzuelamerino@gmail.com> 2022-11-06 23:23:19 +0100
committer Baitinq <manuelpalenzuelamerino@gmail.com> 2022-11-06 23:26:07 +0100
commit 4e4b9e48fb779ea2a94bed7207a0e179de4e4484 (patch)
tree b119efe1972f81d9c004ddf95d827f4de7f537c7
parent Frontend: Display a maximum number of chars for title and desc (diff)
download OSSE-4e4b9e48fb779ea2a94bed7207a0e179de4e4484.tar.gz
OSSE-4e4b9e48fb779ea2a94bed7207a0e179de4e4484.tar.bz2
OSSE-4e4b9e48fb779ea2a94bed7207a0e179de4e4484.zip
Indexer: Decode html entities for website title and description
Maybe we should do it for all the website's content too? :))
-rw-r--r-- Cargo.lock 94
-rw-r--r-- indexer/Cargo.toml 2
-rw-r--r-- indexer/src/main.rs 7
3 files changed, 22 insertions, 81 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 19d14dc..c6c6e6f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -615,7 +615,7 @@ dependencies = [
  "dtoa-short",
  "itoa 0.4.8",
  "matches",
- "phf 0.8.0",
+ "phf",
  "proc-macro2",
  "quote",
  "smallvec",
@@ -1186,16 +1186,12 @@ dependencies = [
 ]
 
 [[package]]
-name = "html2text"
-version = "0.4.3"
+name = "html-escape"
+version = "0.2.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "db2a75f4fdb748c0980b4d04f8edafc749bf4b5bfa738bf6c1565c7e6118d6ca"
+checksum = "15315cfa9503e9aa85a477138eff76a1b203a430703548052c330b69d8d8c205"
 dependencies = [
- "html5ever 0.26.0",
- "markup5ever 0.11.0",
- "tendril",
- "unicode-width",
- "xml5ever",
+ "utf8-width",
 ]
 
 [[package]]
@@ -1206,21 +1202,7 @@ checksum = "e5c13fb08e5d4dfc151ee5e88bae63f7773d61852f3bdc73c9f4b9e1bde03148"
 dependencies = [
  "log",
  "mac",
- "markup5ever 0.10.1",
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
-name = "html5ever"
-version = "0.26.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
-dependencies = [
- "log",
- "mac",
- "markup5ever 0.11.0",
+ "markup5ever",
  "proc-macro2",
  "quote",
  "syn",
@@ -1341,7 +1323,7 @@ dependencies = [
  "actix-web",
  "actix-web-lab",
  "env_logger",
- "html2text",
+ "html-escape",
  "kuchiki",
  "lib",
  "log",
@@ -1443,7 +1425,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1ea8e9c6e031377cff82ee3001dc8026cdf431ed4e2e6b51f98ab8c73484a358"
 dependencies = [
  "cssparser",
- "html5ever 0.25.2",
+ "html5ever",
  "matches",
  "selectors",
 ]
@@ -1532,22 +1514,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd"
 dependencies = [
  "log",
- "phf 0.8.0",
- "phf_codegen 0.8.0",
- "string_cache",
- "string_cache_codegen",
- "tendril",
-]
-
-[[package]]
-name = "markup5ever"
-version = "0.11.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
-dependencies = [
- "log",
- "phf 0.10.1",
- "phf_codegen 0.10.0",
+ "phf",
+ "phf_codegen",
  "string_cache",
  "string_cache_codegen",
  "tendril",
@@ -1839,15 +1807,6 @@ dependencies = [
 ]
 
 [[package]]
-name = "phf"
-version = "0.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
-dependencies = [
- "phf_shared 0.10.0",
-]
-
-[[package]]
 name = "phf_codegen"
 version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1858,16 +1817,6 @@ dependencies = [
 ]
 
 [[package]]
-name = "phf_codegen"
-version = "0.10.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
-dependencies = [
- "phf_generator 0.10.0",
- "phf_shared 0.10.0",
-]
-
-[[package]]
 name = "phf_generator"
 version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -2271,7 +2220,7 @@ dependencies = [
  "cssparser",
  "ego-tree",
  "getopts",
- "html5ever 0.25.2",
+ "html5ever",
  "matches",
  "selectors",
  "smallvec",
@@ -2313,8 +2262,8 @@ dependencies = [
  "fxhash",
  "log",
  "matches",
- "phf 0.8.0",
- "phf_codegen 0.8.0",
+ "phf",
+ "phf_codegen",
  "precomputed-hash",
  "servo_arc",
  "smallvec",
@@ -2888,6 +2837,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
 
 [[package]]
+name = "utf8-width"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5190c9442dcdaf0ddd50f37420417d219ae5261bbf5db120d0f9bab996c9cba1"
+
+[[package]]
 name = "vcpkg"
 version = "0.2.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -3162,17 +3117,6 @@ dependencies = [
 ]
 
 [[package]]
-name = "xml5ever"
-version = "0.17.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650"
-dependencies = [
- "log",
- "mac",
- "markup5ever 0.11.0",
-]
-
-[[package]]
 name = "yew"
 version = "0.19.3"
 source = "git+https://github.com/yewstack/yew/#da09755c27bfeb113e0c4b1096214a826f1e8388"
diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml
index b94f004..b437edf 100644
--- a/indexer/Cargo.toml
+++ b/indexer/Cargo.toml
@@ -10,7 +10,7 @@ actix-web = "4.2.1"
 actix-web-lab = "0.18.5"
 actix-cors = "0.6.3"
 scraper = "0.12.0"
-html2text = "0.4.3"
+html-escape = "0.2.12"
 serde = { version = "1.0", features = ["derive", "rc"] }
 serde_json = "1.0.87"
 kuchiki = "0.8.1"
diff --git a/indexer/src/main.rs b/indexer/src/main.rs
index 515062d..70a7649 100644
--- a/indexer/src/main.rs
+++ b/indexer/src/main.rs
@@ -55,9 +55,6 @@ async fn serve_http_endpoint(address: &str, port: u16) -> std::io::Result<()> {
     .await
 }
 
-//TODO: Max description size
-//TODO: Current result below search bar updates with it
-//TODO: Remove html symbols italic and stuff in frontend (or apply them?)
 //TODO: Better readme
 
 //TODO: sufficiently simmilar word in search (algorithm)
@@ -105,7 +102,7 @@ async fn add_resource(
         .collect::<String>()
     {
         s if s.is_empty() => None,
-        string => Some(string),
+        string => Some(html_escape::decode_html_entities(&string).to_string()),
     };
 
     let page_description: Option<String> = match document
@@ -116,7 +113,7 @@ async fn add_resource(
         .collect::<String>()
     {
         s if s.is_empty() => None,
-        string => Some(string),
+        string => Some(html_escape::decode_html_entities(&string).to_string()),
     };
 
     //TODO: rewrite with if let else