diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-22 18:13:58 +0200 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-22 18:14:04 +0200 |
commit | 6c37404f07b4c929ee0c8e74a03040131d78ffd3 (patch) | |
tree | a2851269d6786ac21657e567bb7d441bae2d86d0 /indexer | |
parent | Crawler: Implement basic async functionality (diff) | |
download | OSSE-6c37404f07b4c929ee0c8e74a03040131d78ffd3.tar.gz OSSE-6c37404f07b4c929ee0c8e74a03040131d78ffd3.tar.bz2 OSSE-6c37404f07b4c929ee0c8e74a03040131d78ffd3.zip |
Indexer: Implement basic reverse index searching and adding
Very inefficient but kind of functional:::)))))))
Diffstat (limited to 'indexer')
-rw-r--r-- | indexer/Cargo.toml | 2 | ||||
-rw-r--r-- | indexer/src/main.rs | 89 |
2 files changed, 83 insertions, 8 deletions
```diff
diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml
index 3df1757..c77dc8f 100644
--- a/indexer/Cargo.toml
+++ b/indexer/Cargo.toml
@@ -8,6 +8,8 @@ edition = "2021"
 [dependencies]
 actix-web = "*"
 serde = { version = "1.0", features = ["derive"] }
+scraper = "0.12.0"
+html2text = "0.4.3"
 
 [[bin]]
 name = "indexer"
diff --git a/indexer/src/main.rs b/indexer/src/main.rs
index 3418ef3..5af53cd 100644
--- a/indexer/src/main.rs
+++ b/indexer/src/main.rs
@@ -1,5 +1,11 @@
 use actix_web::{get, post, web, App, HttpServer, Responder};
 use serde::Deserialize;
+use std::collections::{HashMap, HashSet};
+use std::sync::Mutex;
+
+struct AppState {
+    database: Mutex<HashMap<String, HashSet<String>>>,
+}
 
 #[actix_web::main]
 async fn main() -> std::io::Result<()> {
@@ -9,10 +15,18 @@ async fn main() -> std::io::Result<()> {
 }
 
 async fn serve_http_endpoint(address: &str, port: u16) -> std::io::Result<()> {
-    HttpServer::new(|| App::new().service(greet).service(add_resource))
-        .bind((address, port))?
-        .run()
-        .await
+    let shared_state = web::Data::new(AppState {
+        database: Mutex::new(HashMap::new()),
+    });
+    HttpServer::new(move || {
+        App::new()
+            .app_data(shared_state.clone())
+            .service(greet)
+            .service(add_resource)
+    })
+    .bind((address, port))?
+    .run()
+    .await
 }
 
 #[derive(Deserialize, Debug)]
@@ -22,12 +36,71 @@ struct Resource {
 }
 
 #[post("/resource")]
-async fn add_resource(resource: web::Json<Resource>) -> impl Responder {
-    println!("Added resource! {:?}", resource);
+async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>) -> impl Responder {
+    //parse content
+    let text = html2text::from_read(resource.content.as_str().as_bytes(), resource.content.len());
+
+    let split_words = text.split(' ');
+
+    //fixup words (remove words with non alphabetic chars, empty words, transform to lowercase...)
+    let fixed_words: Vec<String> = split_words
+        .filter(|w| !w.chars().any(|c| !c.is_ascii_alphabetic()))
+        .filter(|w| !w.is_empty() && *w != " ")
+        .map(|w| w.to_ascii_lowercase())
+        .collect();
+
+    println!("xd: {:?}", fixed_words);
+
+    //and for each changed content word we add it to the db (word -> list.append(url))
+    let mut database = data.database.lock().unwrap();
+    for word in fixed_words {
+        //should probs do some priority
+        let maybe_urls = database.get(&word);
+        match maybe_urls {
+            Some(urls) => {
+                let mut updated_urls = urls.clone();
+                updated_urls.insert(resource.url.clone());
+                database.insert(word, updated_urls);
+            }
+            None => {
+                database.insert(word.clone(), HashSet::from([resource.url.clone()]));
+            }
+        }
+    }
+
+    println!("Added resource! {:?}", database.len());
     format!("{:?}", resource)
 }
 
 #[get("/search/{term}")]
-async fn greet(term: web::Path<String>) -> impl Responder {
-    format!("Searching for: {term}")
+async fn greet(data: web::Data<AppState>, term: web::Path<String>) -> impl Responder {
+    let query: Vec<&str> = term.split(' ').collect();
+    let database = data.database.lock().unwrap();
+
+    let mut valid_results: Option<HashSet<String>> = None;
+    for w in query {
+        let curr_word_results = database.get(w);
+        if curr_word_results.is_none() {
+            return format!("No results found for {:?}!", w);
+        }
+        let curr_word_results = curr_word_results.unwrap();
+        match valid_results {
+            None => {
+                valid_results = Some(curr_word_results.clone());
+            }
+            Some(results) => {
+                let intersection: Vec<String> = curr_word_results
+                    .intersection(&results)
+                    .map(|s| s.to_owned())
+                    .collect();
+                let set: HashSet<String> = HashSet::from_iter(intersection);
+                valid_results = Some(set);
+            }
+        }
+    }
+
+    format!(
+        "Searching for: {term}\nResults: {:?}",
+        valid_results.unwrap()
+    )
 }
```