about summary refs log blame commit diff
path: root/indexer/src/main.rs
blob: 8d738cd06adec61c77f8f25879ffb868bd3d937d (plain) (tree)
1
2
3
4
5
6
7
8

                           
                     
                                                                    
                                                  

                                 
                       






                              
                         
                  


                                     
                      



                                                                             

                 
                                                   
 


                                        
                                              
 
                                              


                                                                               
                                                
                                                                    

                             
                                      
                  
                       
                                           
                                  









                                                                                                 



                           

 
                                                        
                        



                                         
                   
                                                                            
                                                                              
 












                                                                                




                                                                                                  

                                                                     



                                      
                                                                    

                                                                  
 
                                                   


                                




                                  
 
                                                         
                               


                                                                  




                                  
 










                                                      
                                                                                      
                                                   


                           


                          

                          
 
                                               

                                                                  



                                                             

 


                             

 




                                   






                                        

                                                             
 
                                                     
 
mod indexer_implementation;

use actix_cors::Cors;
use actix_web::{post, web, App, HttpRequest, HttpServer, Responder};
use indexer_implementation::IndexerImplementation;
use kuchiki::traits::TendrilSink;
use lib::lib::*;
use serde::Deserialize;
use std::collections::HashSet;
use std::sync::Mutex;

pub trait Indexer {
    //too many args?
    fn insert(
        &mut self,
        words: &[String],
        url: &str,
        title: &Option<String>,
        description: &Option<String>,
        language: &Option<String>,
        content: &str,
    ) -> Result<(), String>;
    fn search(&self, term: &str) -> Result<HashSet<IndexedResource>, String>;
    fn num_of_words(&self) -> usize;
}

struct AppState {
    indexer: Mutex<Box<dyn Indexer + Send + Sync>>,
}

#[actix_web::main]
async fn main() -> std::io::Result<()> {
    println!("Hello, world! Im the indexer!");

    serve_http_endpoint("0.0.0.0", 8080).await
}

async fn serve_http_endpoint(address: &str, port: u16) -> std::io::Result<()> {
    let shared_state = web::Data::new(AppState {
        indexer: Mutex::new(Box::new(IndexerImplementation::new())),
    });
    HttpServer::new(move || {
        let cors = Cors::permissive();
        App::new()
            .wrap(cors)
            .app_data(shared_state.clone())
            .service(add_resource)
            .service(
                web::resource(["/api/search", "/api/search/", "/api/search/{query}"]).to(search),
            )
            .service(
                actix_web_lab::web::spa()
                    .index_file("./frontend/dist/index.html")
                    .static_resources_mount("/")
                    .static_resources_location("./frontend/dist")
                    .finish(),
            ) //TODO: maybe separate gui backend from api?
    })
    .bind((address, port))?
    .run()
    .await
}

//TODO: sufficiently simmilar word in search (algorithm)
#[post("/api/resource")]
async fn add_resource(
    data: web::Data<AppState>,
    resource: web::Json<CrawledResource>,
) -> impl Responder {
    //parse content
    let document = scraper::Html::parse_document(resource.content.as_str());
    let kuchiki_parser = kuchiki::parse_html().one(resource.content.as_str());

    //remove script, style and noscript tags
    kuchiki_parser
        .inclusive_descendants()
        .filter(|node| {
            node.as_element().map_or(false, |e| {
                matches!(e.name.local.as_ref(), "script" | "style" | "noscript")
            })
        })
        .collect::<Vec<_>>()
        .iter()
        .for_each(|node| node.detach());

    let text = kuchiki_parser.text_contents();

    let split_words = text.split(' ');

    //fixup words (remove words with non alphabetic chars, empty words, transform to lowercase...)
    let fixed_words: Vec<String> = split_words
        .map(|w| w.to_ascii_lowercase().split_whitespace().collect())
        .filter(|w: &String| !w.is_empty())
        .collect();

    println!("xd: {:?}", fixed_words);

    let title_selector = scraper::Selector::parse("title").unwrap();
    let meta_selector = scraper::Selector::parse("meta").unwrap();
    let html_selector = scraper::Selector::parse("html").unwrap();

    let page_title: Option<String> = match document
        .select(&title_selector)
        .map(|e| e.inner_html())
        .take(1)
        .collect::<String>()
    {
        s if s.is_empty() => None,
        string => Some(string),
    };

    let page_description: Option<String> = match document
        .select(&meta_selector)
        .filter(|e| e.value().attr("name") == Some("description"))
        .filter_map(|e| e.value().attr("content"))
        .take(1)
        .collect::<String>()
    {
        s if s.is_empty() => None,
        string => Some(string),
    };

    //TODO: rewrite with if let else
    let page_language: Option<String> = match document
        .select(&html_selector)
        .filter_map(|e| e.value().attr("lang"))
        .take(1)
        .collect::<String>()
    {
        s if s.is_empty() => None,
        string => Some(string),
    };

    //and for each changed content word we add it to the db (word -> list.append(url))
    let mut indexer = data.indexer.lock().unwrap();
    let _ = indexer.insert(
        &fixed_words,
        &resource.url,
        &page_title,
        &page_description,
        &page_language,
        &resource.content,
    );

    //TODO: ADD LANG? EN in meta tag (frontend)
    //Now what to do, global lang?, per index lang?, website lang?
    //TODO: max number of results in query

    println!("Added resource: {:?}", indexer.num_of_words());

    format!("{resource:?}")
}

#[derive(Debug, Deserialize)]
struct OptSearchPath {
    query: Option<String>,
}

async fn search(
    _req: HttpRequest,
    data: web::Data<AppState>,
    path: web::Path<OptSearchPath>,
) -> impl Responder {
    let query = match &path.query {
        Some(query) => query,
        None => return "[]".to_string(),
    };

    println!("Query: {:?}", query);

    let results = data.indexer.lock().unwrap().search(query);
    //indexer is slow (gets stuck when inserting stuff)

    serde_json::to_string(&results.unwrap()).unwrap()
}