about summary refs log tree commit diff
diff options
context:
space:
mode:
author Baitinq <manuelpalenzuelamerino@gmail.com> 2022-10-22 18:13:58 +0200
committer Baitinq <manuelpalenzuelamerino@gmail.com> 2022-10-22 18:14:04 +0200
commit 6c37404f07b4c929ee0c8e74a03040131d78ffd3 (patch)
tree a2851269d6786ac21657e567bb7d441bae2d86d0
parent Crawler: Implement basic async functionality (diff)
download OSSE-6c37404f07b4c929ee0c8e74a03040131d78ffd3.tar.gz
OSSE-6c37404f07b4c929ee0c8e74a03040131d78ffd3.tar.bz2
OSSE-6c37404f07b4c929ee0c8e74a03040131d78ffd3.zip
Indexer: Implement basic reverse index searching and adding
Very inefficient but kind of functional:::)))))))
-rw-r--r-- Cargo.lock 87
-rw-r--r-- indexer/Cargo.toml 2
-rw-r--r-- indexer/src/main.rs 89
3 files changed, 163 insertions, 15 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 5520569..6ed1be4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -417,7 +417,7 @@ dependencies = [
  "dtoa-short",
  "itoa 0.4.8",
  "matches",
- "phf",
+ "phf 0.8.0",
  "proc-macro2",
  "quote",
  "smallvec",
@@ -701,6 +701,19 @@ dependencies = [
 ]
 
 [[package]]
+name = "html2text"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db2a75f4fdb748c0980b4d04f8edafc749bf4b5bfa738bf6c1565c7e6118d6ca"
+dependencies = [
+ "html5ever 0.26.0",
+ "markup5ever 0.11.0",
+ "tendril",
+ "unicode-width",
+ "xml5ever",
+]
+
+[[package]]
 name = "html5ever"
 version = "0.25.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -708,7 +721,21 @@ checksum = "e5c13fb08e5d4dfc151ee5e88bae63f7773d61852f3bdc73c9f4b9e1bde03148"
 dependencies = [
  "log",
  "mac",
- "markup5ever",
+ "markup5ever 0.10.1",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "html5ever"
+version = "0.26.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
+dependencies = [
+ "log",
+ "mac",
+ "markup5ever 0.11.0",
  "proc-macro2",
  "quote",
  "syn",
@@ -800,6 +827,8 @@ name = "indexer"
 version = "0.1.0"
 dependencies = [
  "actix-web",
+ "html2text",
+ "scraper",
  "serde",
 ]
 
@@ -954,8 +983,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd"
 dependencies = [
  "log",
- "phf",
- "phf_codegen",
+ "phf 0.8.0",
+ "phf_codegen 0.8.0",
+ "string_cache",
+ "string_cache_codegen",
+ "tendril",
+]
+
+[[package]]
+name = "markup5ever"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
+dependencies = [
+ "log",
+ "phf 0.10.1",
+ "phf_codegen 0.10.0",
  "string_cache",
  "string_cache_codegen",
  "tendril",
@@ -1221,6 +1264,15 @@ dependencies = [
 ]
 
 [[package]]
+name = "phf"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
+dependencies = [
+ "phf_shared 0.10.0",
+]
+
+[[package]]
 name = "phf_codegen"
 version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1231,6 +1283,16 @@ dependencies = [
 ]
 
 [[package]]
+name = "phf_codegen"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
+dependencies = [
+ "phf_generator 0.10.0",
+ "phf_shared 0.10.0",
+]
+
+[[package]]
 name = "phf_generator"
 version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1535,7 +1597,7 @@ dependencies = [
  "cssparser",
  "ego-tree",
  "getopts",
- "html5ever",
+ "html5ever 0.25.2",
  "matches",
  "selectors",
  "smallvec",
@@ -1577,8 +1639,8 @@ dependencies = [
  "fxhash",
  "log",
  "matches",
- "phf",
- "phf_codegen",
+ "phf 0.8.0",
+ "phf_codegen 0.8.0",
  "precomputed-hash",
  "servo_arc",
  "smallvec",
@@ -2227,6 +2289,17 @@ dependencies = [
 ]
 
 [[package]]
+name = "xml5ever"
+version = "0.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650"
+dependencies = [
+ "log",
+ "mac",
+ "markup5ever 0.11.0",
+]
+
+[[package]]
 name = "zstd"
 version = "0.11.2+zstd.1.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml
index 3df1757..c77dc8f 100644
--- a/indexer/Cargo.toml
+++ b/indexer/Cargo.toml
@@ -8,6 +8,8 @@ edition = "2021"
 [dependencies]
 actix-web = "*"
 serde = { version = "1.0", features = ["derive"] }
+scraper = "0.12.0"
+html2text = "0.4.3"
 
 [[bin]]
 name = "indexer"
diff --git a/indexer/src/main.rs b/indexer/src/main.rs
index 3418ef3..5af53cd 100644
--- a/indexer/src/main.rs
+++ b/indexer/src/main.rs
@@ -1,5 +1,11 @@
 use actix_web::{get, post, web, App, HttpServer, Responder};
 use serde::Deserialize;
+use std::collections::{HashMap, HashSet};
+use std::sync::Mutex;
+
+struct AppState {
+    database: Mutex<HashMap<String, HashSet<String>>>,
+}
 
 #[actix_web::main]
 async fn main() -> std::io::Result<()> {
@@ -9,10 +15,18 @@ async fn main() -> std::io::Result<()> {
 }
 
 async fn serve_http_endpoint(address: &str, port: u16) -> std::io::Result<()> {
-    HttpServer::new(|| App::new().service(greet).service(add_resource))
-        .bind((address, port))?
-        .run()
-        .await
+    let shared_state = web::Data::new(AppState {
+        database: Mutex::new(HashMap::new()),
+    });
+    HttpServer::new(move || {
+        App::new()
+            .app_data(shared_state.clone())
+            .service(greet)
+            .service(add_resource)
+    })
+    .bind((address, port))?
+    .run()
+    .await
 }
 
 #[derive(Deserialize, Debug)]
@@ -22,12 +36,71 @@ struct Resource {
 }
 
 #[post("/resource")]
-async fn add_resource(resource: web::Json<Resource>) -> impl Responder {
-    println!("Added resource! {:?}", resource);
+async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>) -> impl Responder {
+    //parse content
+    let text = html2text::from_read(resource.content.as_str().as_bytes(), resource.content.len());
+
+    let split_words = text.split(' ');
+
+    //fixup words (remove words with non alphabetic chars, empty words, transform to lowercase...)
+    let fixed_words: Vec<String> = split_words
+        .filter(|w| !w.chars().any(|c| !c.is_ascii_alphabetic()))
+        .filter(|w| !w.is_empty() && *w != " ")
+        .map(|w| w.to_ascii_lowercase())
+        .collect();
+
+    println!("xd: {:?}", fixed_words);
+
+    //and for each changed content word we add it to the db (word -> list.append(url))
+    let mut database = data.database.lock().unwrap();
+    for word in fixed_words {
+        //should probs do some priority
+        let maybe_urls = database.get(&word);
+        match maybe_urls {
+            Some(urls) => {
+                let mut updated_urls = urls.clone();
+                updated_urls.insert(resource.url.clone());
+                database.insert(word, updated_urls);
+            }
+            None => {
+                database.insert(word.clone(), HashSet::from([resource.url.clone()]));
+            }
+        }
+    }
+
+    println!("Added resource! {:?}", database.len());
     format!("{:?}", resource)
 }
 
 #[get("/search/{term}")]
-async fn greet(term: web::Path<String>) -> impl Responder {
-    format!("Searching for: {term}")
+async fn greet(data: web::Data<AppState>, term: web::Path<String>) -> impl Responder {
+    let query: Vec<&str> = term.split(' ').collect();
+    let database = data.database.lock().unwrap();
+
+    let mut valid_results: Option<HashSet<String>> = None;
+    for w in query {
+        let curr_word_results = database.get(w);
+        if curr_word_results.is_none() {
+            return format!("No results found for {:?}!", w);
+        }
+        let curr_word_results = curr_word_results.unwrap();
+        match valid_results {
+            None => {
+                valid_results = Some(curr_word_results.clone());
+            }
+            Some(results) => {
+                let intersection: Vec<String> = curr_word_results
+                    .intersection(&results)
+                    .map(|s| s.to_owned())
+                    .collect();
+                let set: HashSet<String> = HashSet::from_iter(intersection);
+                valid_results = Some(set);
+            }
+        }
+    }
+
+    format!(
+        "Searching for: {term}\nResults: {:?}",
+        valid_results.unwrap()
+    )
 }