about summary refs log tree commit diff
path: root/indexer
diff options
context:
space:
mode:
author     Baitinq <manuelpalenzuelamerino@gmail.com>  2022-10-22 18:13:58 +0200
committer  Baitinq <manuelpalenzuelamerino@gmail.com>  2022-10-22 18:14:04 +0200
commit     6c37404f07b4c929ee0c8e74a03040131d78ffd3 (patch)
tree       a2851269d6786ac21657e567bb7d441bae2d86d0 /indexer
parent     Crawler: Implement basic async functionality (diff)
download   OSSE-6c37404f07b4c929ee0c8e74a03040131d78ffd3.tar.gz
OSSE-6c37404f07b4c929ee0c8e74a03040131d78ffd3.tar.bz2
OSSE-6c37404f07b4c929ee0c8e74a03040131d78ffd3.zip
Indexer: Implement basic reverse index searching and adding
Very inefficient but kind of functional:::)))))))
Diffstat (limited to 'indexer')
-rw-r--r--  indexer/Cargo.toml   |  2
-rw-r--r--  indexer/src/main.rs  | 89
2 files changed, 83 insertions(+), 8 deletions(-)
diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml
index 3df1757..c77dc8f 100644
--- a/indexer/Cargo.toml
+++ b/indexer/Cargo.toml
@@ -8,6 +8,8 @@ edition = "2021"
 [dependencies]
 actix-web = "*"
 serde = { version = "1.0", features = ["derive"] }
+scraper = "0.12.0"
+html2text = "0.4.3"
 
 [[bin]]
 name = "indexer"
diff --git a/indexer/src/main.rs b/indexer/src/main.rs
index 3418ef3..5af53cd 100644
--- a/indexer/src/main.rs
+++ b/indexer/src/main.rs
@@ -1,5 +1,11 @@
 use actix_web::{get, post, web, App, HttpServer, Responder};
 use serde::Deserialize;
+use std::collections::{HashMap, HashSet};
+use std::sync::Mutex;
+
+struct AppState {
+    database: Mutex<HashMap<String, HashSet<String>>>,
+}
 
 #[actix_web::main]
 async fn main() -> std::io::Result<()> {
@@ -9,10 +15,18 @@ async fn main() -> std::io::Result<()> {
 }
 
 async fn serve_http_endpoint(address: &str, port: u16) -> std::io::Result<()> {
-    HttpServer::new(|| App::new().service(greet).service(add_resource))
-        .bind((address, port))?
-        .run()
-        .await
+    let shared_state = web::Data::new(AppState {
+        database: Mutex::new(HashMap::new()),
+    });
+    HttpServer::new(move || {
+        App::new()
+            .app_data(shared_state.clone())
+            .service(greet)
+            .service(add_resource)
+    })
+    .bind((address, port))?
+    .run()
+    .await
 }
 
 #[derive(Deserialize, Debug)]
@@ -22,12 +36,71 @@ struct Resource {
 }
 
 #[post("/resource")]
-async fn add_resource(resource: web::Json<Resource>) -> impl Responder {
-    println!("Added resource! {:?}", resource);
+async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>) -> impl Responder {
+    //parse content
+    let text = html2text::from_read(resource.content.as_str().as_bytes(), resource.content.len());
+
+    let split_words = text.split(' ');
+
+    //fixup words (remove words with non alphabetic chars, empty words, transform to lowercase...)
+    let fixed_words: Vec<String> = split_words
+        .filter(|w| !w.chars().any(|c| !c.is_ascii_alphabetic()))
+        .filter(|w| !w.is_empty() && *w != " ")
+        .map(|w| w.to_ascii_lowercase())
+        .collect();
+
+    println!("xd: {:?}", fixed_words);
+
+    //and for each changed content word we add it to the db (word -> list.append(url))
+    let mut database = data.database.lock().unwrap();
+    for word in fixed_words {
+        //should probs do some priority
+        let maybe_urls = database.get(&word);
+        match maybe_urls {
+            Some(urls) => {
+                let mut updated_urls = urls.clone();
+                updated_urls.insert(resource.url.clone());
+                database.insert(word, updated_urls);
+            }
+            None => {
+                database.insert(word.clone(), HashSet::from([resource.url.clone()]));
+            }
+        }
+    }
+
+    println!("Added resource! {:?}", database.len());
     format!("{:?}", resource)
 }
 
 #[get("/search/{term}")]
-async fn greet(term: web::Path<String>) -> impl Responder {
-    format!("Searching for: {term}")
+async fn greet(data: web::Data<AppState>, term: web::Path<String>) -> impl Responder {
+    let query: Vec<&str> = term.split(' ').collect();
+    let database = data.database.lock().unwrap();
+
+    let mut valid_results: Option<HashSet<String>> = None;
+    for w in query {
+        let curr_word_results = database.get(w);
+        if curr_word_results.is_none() {
+            return format!("No results found for {:?}!", w);
+        }
+        let curr_word_results = curr_word_results.unwrap();
+        match valid_results {
+            None => {
+                valid_results = Some(curr_word_results.clone());
+            }
+            Some(results) => {
+                let intersection: Vec<String> = curr_word_results
+                    .intersection(&results)
+                    .map(|s| s.to_owned())
+                    .collect();
+                let set: HashSet<String> = HashSet::from_iter(intersection);
+                valid_results = Some(set);
+            }
+        }
+    }
+
+    format!(
+        "Searching for: {term}\nResults: {:?}",
+        valid_results.unwrap()
+    )
 }