mod indexer_implementation;

use actix_cors::Cors;
use actix_web::{get, post, routes, web, App, HttpServer, Responder};
use indexer_implementation::IndexerImplementation;
use kuchiki::traits::TendrilSink;
use lib::lib::*; //assumed to provide CrawledResource and IndexedResource
use std::collections::HashSet;
use std::sync::Mutex;

pub trait Indexer {
    //too many args?
    fn insert(
        &mut self,
        words: &[String],
        url: &str,
        title: &Option<String>,
        description: &Option<String>,
        language: &Option<String>,
        content: &str,
    ) -> Result<(), String>;
    fn search(&self, term: &str) -> Result<HashSet<IndexedResource>, String>;
    fn num_of_words(&self) -> usize;
}

struct AppState {
    indexer: Mutex<Box<dyn Indexer + Send>>,
}

#[actix_web::main]
async fn main() -> std::io::Result<()> {
    println!("Hello, world! I'm the indexer!");

    serve_http_endpoint("0.0.0.0", 4444).await
}

async fn serve_http_endpoint(address: &str, port: u16) -> std::io::Result<()> {
    let shared_state = web::Data::new(AppState {
        indexer: Mutex::new(Box::new(IndexerImplementation::new())),
    });

    HttpServer::new(move || {
        let cors = Cors::permissive();
        App::new()
            .wrap(cors)
            .app_data(shared_state.clone())
            .service(no_search)
            .service(search)
            .service(add_resource)
    })
    .bind((address, port))?
    .run()
    .await
}

//TODO: sufficiently similar word in search (algorithm)
#[post("/resource")]
async fn add_resource(
    data: web::Data<AppState>,
    resource: web::Json<CrawledResource>,
) -> impl Responder {
    //parse content
    let document = scraper::Html::parse_document(resource.content.as_str());
    let kuchiki_parser = kuchiki::parse_html().one(resource.content.as_str());

    //remove script, style and noscript tags
    kuchiki_parser
        .inclusive_descendants()
        .filter(|node| {
            node.as_element().map_or(false, |e| {
                matches!(e.name.local.as_ref(), "script" | "style" | "noscript")
            })
        })
        .collect::<Vec<_>>()
        .iter()
        .for_each(|node| node.detach());

    let text = kuchiki_parser.text_contents();

    let split_words = text.split(' ');

    //fix up words (lowercase, strip internal whitespace, drop empty words)
    let fixed_words: Vec<String> = split_words
        .map(|w| w.to_ascii_lowercase().split_whitespace().collect())
        .filter(|w: &String| !w.is_empty())
        .collect();

    println!("Fixed words: {:?}", fixed_words);

    let title_selector = scraper::Selector::parse("title").unwrap();
    let meta_selector = scraper::Selector::parse("meta").unwrap();
    let html_selector = scraper::Selector::parse("html").unwrap();

    let page_title: Option<String> = match document
        .select(&title_selector)
        .map(|e| e.inner_html())
        .take(1)
        .collect::<String>()
    {
        s if s.is_empty() => None,
        string => Some(string),
    };

    let page_description: Option<String> = match document
        .select(&meta_selector)
        .filter(|e| e.value().attr("name") == Some("description"))
        .filter_map(|e| e.value().attr("content"))
        .take(1)
        .collect::<String>()
    {
        s if s.is_empty() => None,
        string => Some(string),
    };

    //TODO: rewrite with if let else
    let page_language: Option<String> = match document
        .select(&html_selector)
        .filter_map(|e| e.value().attr("lang"))
        .take(1)
        .collect::<String>()
    {
        s if s.is_empty() => None,
        string => Some(string),
    };

    //for each fixed-up content word, add it to the db (word -> list.append(url))
    let mut indexer = data.indexer.lock().unwrap();
    let _ = indexer.insert(
        &fixed_words,
        &resource.url,
        &page_title,
        &page_description,
        &page_language,
        &resource.content,
    );
    //TODO: ADD LANG? EN in meta tag (frontend)
    //Now what to do: global lang? per-index lang? website lang?
    //TODO: max number of results in query

    println!("Added resource: {:?}", indexer.num_of_words());
    format!("{resource:?}")
}

#[routes]
#[get("/search")]
#[get("/search/")]
async fn no_search(_data: web::Data<AppState>) -> impl Responder {
    "[]".to_string()
}

#[get("/search/{term}")]
async fn search(data: web::Data<AppState>, term: web::Path<String>) -> impl Responder {
    let indexer = data.indexer.lock().unwrap();

    let results = indexer.search(&term);
    //is lowercase search good? (we ASCII-lowercase words on insert; what do we do with the search term?)
    serde_json::to_string(&results.unwrap()).unwrap()
}
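
// A minimal test sketch, not a definitive test suite: it exercises the
// `/search` route with no term, which is served by `no_search`. It assumes
// `IndexerImplementation::new()` takes no arguments (as used in `main` above)
// and that actix-web's default `macros` feature is enabled for `#[actix_web::test]`.
#[cfg(test)]
mod tests {
    use super::*;
    use actix_web::test;

    #[actix_web::test]
    async fn no_search_returns_empty_json_array() {
        //assumption: a freshly constructed IndexerImplementation is a valid empty index
        let state = web::Data::new(AppState {
            indexer: Mutex::new(Box::new(IndexerImplementation::new())),
        });
        let app = test::init_service(App::new().app_data(state).service(no_search)).await;

        //GET /search with no term falls through to `no_search`, which returns "[]"
        let req = test::TestRequest::get().uri("/search").to_request();
        let body = test::call_and_read_body(&app, req).await;
        assert_eq!(body, web::Bytes::from_static(b"[]"));
    }
}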