diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-22 18:13:58 +0200 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-10-22 18:14:04 +0200 |
commit | 6c37404f07b4c929ee0c8e74a03040131d78ffd3 (patch) | |
tree | a2851269d6786ac21657e567bb7d441bae2d86d0 | |
parent | Crawler: Implement basic async functionality (diff) | |
download | OSSE-6c37404f07b4c929ee0c8e74a03040131d78ffd3.tar.gz OSSE-6c37404f07b4c929ee0c8e74a03040131d78ffd3.tar.bz2 OSSE-6c37404f07b4c929ee0c8e74a03040131d78ffd3.zip |
Indexer: Implement basic reverse index searching and adding
Very inefficient but kind of functional:::)))))))
-rw-r--r-- | Cargo.lock | 87 | ||||
-rw-r--r-- | indexer/Cargo.toml | 2 | ||||
-rw-r--r-- | indexer/src/main.rs | 89 |
3 files changed, 163 insertions, 15 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 5520569..6ed1be4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -417,7 +417,7 @@ dependencies = [
  "dtoa-short",
  "itoa 0.4.8",
  "matches",
- "phf",
+ "phf 0.8.0",
  "proc-macro2",
  "quote",
  "smallvec",
@@ -701,6 +701,19 @@ dependencies = [
 ]
 
 [[package]]
+name = "html2text"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db2a75f4fdb748c0980b4d04f8edafc749bf4b5bfa738bf6c1565c7e6118d6ca"
+dependencies = [
+ "html5ever 0.26.0",
+ "markup5ever 0.11.0",
+ "tendril",
+ "unicode-width",
+ "xml5ever",
+]
+
+[[package]]
 name = "html5ever"
 version = "0.25.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -708,7 +721,21 @@ checksum = "e5c13fb08e5d4dfc151ee5e88bae63f7773d61852f3bdc73c9f4b9e1bde03148"
 dependencies = [
  "log",
  "mac",
- "markup5ever",
+ "markup5ever 0.10.1",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "html5ever"
+version = "0.26.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
+dependencies = [
+ "log",
+ "mac",
+ "markup5ever 0.11.0",
  "proc-macro2",
  "quote",
  "syn",
@@ -800,6 +827,8 @@ name = "indexer"
 version = "0.1.0"
 dependencies = [
  "actix-web",
+ "html2text",
+ "scraper",
  "serde",
 ]
 
@@ -954,8 +983,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd"
 dependencies = [
  "log",
- "phf",
- "phf_codegen",
+ "phf 0.8.0",
+ "phf_codegen 0.8.0",
+ "string_cache",
+ "string_cache_codegen",
+ "tendril",
+]
+
+[[package]]
+name = "markup5ever"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
+dependencies = [
+ "log",
+ "phf 0.10.1",
+ "phf_codegen 0.10.0",
  "string_cache",
  "string_cache_codegen",
  "tendril",
@@ -1221,6 +1264,15 @@ dependencies = [
 ]
 
 [[package]]
+name = "phf"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
+dependencies = [
+ "phf_shared 0.10.0",
+]
+
+[[package]]
 name = "phf_codegen"
 version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1231,6 +1283,16 @@ dependencies = [
 ]
 
 [[package]]
+name = "phf_codegen"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
+dependencies = [
+ "phf_generator 0.10.0",
+ "phf_shared 0.10.0",
+]
+
+[[package]]
 name = "phf_generator"
 version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1535,7 +1597,7 @@ dependencies = [
  "cssparser",
  "ego-tree",
  "getopts",
- "html5ever",
+ "html5ever 0.25.2",
  "matches",
  "selectors",
  "smallvec",
@@ -1577,8 +1639,8 @@ dependencies = [
  "fxhash",
  "log",
  "matches",
- "phf",
- "phf_codegen",
+ "phf 0.8.0",
+ "phf_codegen 0.8.0",
  "precomputed-hash",
  "servo_arc",
  "smallvec",
@@ -2227,6 +2289,17 @@ dependencies = [
 ]
 
 [[package]]
+name = "xml5ever"
+version = "0.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650"
+dependencies = [
+ "log",
+ "mac",
+ "markup5ever 0.11.0",
+]
+
+[[package]]
 name = "zstd"
 version = "0.11.2+zstd.1.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml
index 3df1757..c77dc8f 100644
--- a/indexer/Cargo.toml
+++ b/indexer/Cargo.toml
@@ -8,6 +8,8 @@ edition = "2021"
 [dependencies]
 actix-web = "*"
 serde = { version = "1.0", features = ["derive"] }
+scraper = "0.12.0"
+html2text = "0.4.3"
 
 [[bin]]
 name = "indexer"
diff --git a/indexer/src/main.rs b/indexer/src/main.rs
index 3418ef3..5af53cd 100644
--- a/indexer/src/main.rs
+++ b/indexer/src/main.rs
@@ -1,5 +1,11 @@
 use actix_web::{get, post, web, App, HttpServer, Responder};
 use serde::Deserialize;
+use std::collections::{HashMap, HashSet};
+use std::sync::Mutex;
+
+struct AppState {
+    database: Mutex<HashMap<String, HashSet<String>>>,
+}
 
 #[actix_web::main]
 async fn main() -> std::io::Result<()> {
@@ -9,10 +15,18 @@ async fn main() -> std::io::Result<()> {
 }
 
 async fn serve_http_endpoint(address: &str, port: u16) -> std::io::Result<()> {
-    HttpServer::new(|| App::new().service(greet).service(add_resource))
-        .bind((address, port))?
-        .run()
-        .await
+    let shared_state = web::Data::new(AppState {
+        database: Mutex::new(HashMap::new()),
+    });
+    HttpServer::new(move || {
+        App::new()
+            .app_data(shared_state.clone())
+            .service(greet)
+            .service(add_resource)
+    })
+    .bind((address, port))?
+    .run()
+    .await
 }
 
 #[derive(Deserialize, Debug)]
@@ -22,12 +36,71 @@ struct Resource {
 }
 
 #[post("/resource")]
-async fn add_resource(resource: web::Json<Resource>) -> impl Responder {
-    println!("Added resource! {:?}", resource);
+async fn add_resource(data: web::Data<AppState>, resource: web::Json<Resource>) -> impl Responder {
+    //parse content
+    let text = html2text::from_read(resource.content.as_str().as_bytes(), resource.content.len());
+
+    let split_words = text.split(' ');
+
+    //fixup words (remove words with non alphabetic chars, empty words, transform to lowercase...)
+    let fixed_words: Vec<String> = split_words
+        .filter(|w| !w.chars().any(|c| !c.is_ascii_alphabetic()))
+        .filter(|w| !w.is_empty() && *w != " ")
+        .map(|w| w.to_ascii_lowercase())
+        .collect();
+
+    println!("xd: {:?}", fixed_words);
+
+    //and for each changed content word we add it to the db (word -> list.append(url))
+    let mut database = data.database.lock().unwrap();
+    for word in fixed_words {
+        //should probs do some priority
+        let maybe_urls = database.get(&word);
+        match maybe_urls {
+            Some(urls) => {
+                let mut updated_urls = urls.clone();
+                updated_urls.insert(resource.url.clone());
+                database.insert(word, updated_urls);
+            }
+            None => {
+                database.insert(word.clone(), HashSet::from([resource.url.clone()]));
+            }
+        }
+    }
+
+    println!("Added resource! {:?}", database.len());
     format!("{:?}", resource)
 }
 
 #[get("/search/{term}")]
-async fn greet(term: web::Path<String>) -> impl Responder {
-    format!("Searching for: {term}")
+async fn greet(data: web::Data<AppState>, term: web::Path<String>) -> impl Responder {
+    let query: Vec<&str> = term.split(' ').collect();
+    let database = data.database.lock().unwrap();
+
+    let mut valid_results: Option<HashSet<String>> = None;
+    for w in query {
+        let curr_word_results = database.get(w);
+        if curr_word_results.is_none() {
+            return format!("No results found for {:?}!", w);
+        }
+        let curr_word_results = curr_word_results.unwrap();
+        match valid_results {
+            None => {
+                valid_results = Some(curr_word_results.clone());
+            }
+            Some(results) => {
+                let intersection: Vec<String> = curr_word_results
+                    .intersection(&results)
+                    .map(|s| s.to_owned())
+                    .collect();
+                let set: HashSet<String> = HashSet::from_iter(intersection);
+                valid_results = Some(set);
+            }
+        }
+    }
+
+    format!(
+        "Searching for: {term}\nResults: {:?}",
+        valid_results.unwrap()
+    )
 }