From da6b3752f5350d0fc789c047f66f221f77cd8a95 Mon Sep 17 00:00:00 2001 From: Baitinq Date: Fri, 4 Nov 2022 16:13:42 +0100 Subject: Indexer: Add and use language field in IndexedResource --- indexer/src/indexer_implementation.rs | 10 ++++++---- indexer/src/main.rs | 28 ++++++++++++++++++++++------ lib/src/lib.rs | 1 + 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/indexer/src/indexer_implementation.rs b/indexer/src/indexer_implementation.rs index e3f0495..f24c2bd 100644 --- a/indexer/src/indexer_implementation.rs +++ b/indexer/src/indexer_implementation.rs @@ -30,8 +30,9 @@ impl crate::Indexer for IndexerImplementation { &mut self, words: &[String], url: &str, - title: Option, - description: Option, + title: &Option, + description: &Option, + language: &Option, content: &str, ) -> Result<(), String> { for word in words { @@ -39,8 +40,9 @@ impl crate::Indexer for IndexerImplementation { url: url.to_string(), priority: Self::calculate_word_priority(word, content, words), word: Arc::new(word.to_string()), - title: title.as_ref().map(String::from), - description: description.as_ref().map(String::from), + title: title.clone(), + description: description.clone(), + language: language.clone(), }; match self.database.get_mut(word) { diff --git a/indexer/src/main.rs b/indexer/src/main.rs index dcb4b9a..6e41cfb 100644 --- a/indexer/src/main.rs +++ b/indexer/src/main.rs @@ -14,8 +14,9 @@ pub trait Indexer { &mut self, words: &[String], url: &str, - title: Option, - description: Option, + title: &Option, + description: &Option, + language: &Option, content: &str, ) -> Result<(), String>; fn search(&self, term: &str) -> Result, String>; @@ -86,7 +87,8 @@ async fn add_resource( println!("xd: {:?}", fixed_words); let title_selector = scraper::Selector::parse("title").unwrap(); - let description_selector = scraper::Selector::parse("meta").unwrap(); + let meta_selector = scraper::Selector::parse("meta").unwrap(); + let html_selector = scraper::Selector::parse("html").unwrap(); let page_title: Option = match document .select(&title_selector) @@ -99,7 +101,7 @@ async fn add_resource( }; let page_description: Option = match document - .select(&description_selector) + .select(&meta_selector) .filter(|e| e.value().attr("name") == Some("description")) .filter_map(|e| e.value().attr("content")) .take(1) @@ -109,17 +111,31 @@ async fn add_resource( string => Some(string), }; + //TODO: rewrite with if let else + let page_language: Option = match document + .select(&html_selector) + .filter_map(|e| e.value().attr("lang")) + .take(1) + .collect::() + { + s if s.is_empty() => None, + string => Some(string), + }; + //and for each changed content word we add it to the db (word -> list.append(url)) let mut indexer = data.indexer.lock().unwrap(); let _ = indexer.insert( &fixed_words, &resource.url, - page_title.clone(), - page_description.clone(), + &page_title, + &page_description, + &page_language, &resource.content, ); //TODO: ADD LANG? EN in meta tag (frontend) + //Now what to do, global lang?, per index lang?, website lang? + //TODO: max number of results in query println!("Added resource: {:?}", indexer.num_of_words()); diff --git a/lib/src/lib.rs b/lib/src/lib.rs index 6966d3e..54e6ea9 100644 --- a/lib/src/lib.rs +++ b/lib/src/lib.rs @@ -18,6 +18,7 @@ pub mod lib { pub description: Option, pub priority: u32, pub word: Arc, + pub language: Option, //maybe in the future we need filetypes? } -- cgit 1.4.1