diff options
author | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-11-04 16:13:42 +0100 |
---|---|---|
committer | Baitinq <manuelpalenzuelamerino@gmail.com> | 2022-11-04 17:23:33 +0100 |
commit | da6b3752f5350d0fc789c047f66f221f77cd8a95 (patch) | |
tree | f7ded947ecb4bd67b7ee5da994f4ccc32389e2db | |
parent | Indexer: Make & implement the trait insert() taking a [word] for insert (diff) | |
download | OSSE-da6b3752f5350d0fc789c047f66f221f77cd8a95.tar.gz OSSE-da6b3752f5350d0fc789c047f66f221f77cd8a95.tar.bz2 OSSE-da6b3752f5350d0fc789c047f66f221f77cd8a95.zip |
Indexer: Add and use language field in IndexedResource
-rw-r--r-- | indexer/src/indexer_implementation.rs | 10 | ||||
-rw-r--r-- | indexer/src/main.rs | 28 | ||||
-rw-r--r-- | lib/src/lib.rs | 1 |
3 files changed, 29 insertions, 10 deletions
```diff
diff --git a/indexer/src/indexer_implementation.rs b/indexer/src/indexer_implementation.rs
index e3f0495..f24c2bd 100644
--- a/indexer/src/indexer_implementation.rs
+++ b/indexer/src/indexer_implementation.rs
@@ -30,8 +30,9 @@ impl crate::Indexer for IndexerImplementation {
         &mut self,
         words: &[String],
         url: &str,
-        title: Option<String>,
-        description: Option<String>,
+        title: &Option<String>,
+        description: &Option<String>,
+        language: &Option<String>,
         content: &str,
     ) -> Result<(), String> {
         for word in words {
@@ -39,8 +40,9 @@ impl crate::Indexer for IndexerImplementation {
                 url: url.to_string(),
                 priority: Self::calculate_word_priority(word, content, words),
                 word: Arc::new(word.to_string()),
-                title: title.as_ref().map(String::from),
-                description: description.as_ref().map(String::from),
+                title: title.clone(),
+                description: description.clone(),
+                language: language.clone(),
             };
 
             match self.database.get_mut(word) {
diff --git a/indexer/src/main.rs b/indexer/src/main.rs
index dcb4b9a..6e41cfb 100644
--- a/indexer/src/main.rs
+++ b/indexer/src/main.rs
@@ -14,8 +14,9 @@ pub trait Indexer {
         &mut self,
         words: &[String],
         url: &str,
-        title: Option<String>,
-        description: Option<String>,
+        title: &Option<String>,
+        description: &Option<String>,
+        language: &Option<String>,
         content: &str,
     ) -> Result<(), String>;
     fn search(&self, term: &str) -> Result<HashSet<IndexedResource>, String>;
@@ -86,7 +87,8 @@ async fn add_resource(
     println!("xd: {:?}", fixed_words);
 
     let title_selector = scraper::Selector::parse("title").unwrap();
-    let description_selector = scraper::Selector::parse("meta").unwrap();
+    let meta_selector = scraper::Selector::parse("meta").unwrap();
+    let html_selector = scraper::Selector::parse("html").unwrap();
 
     let page_title: Option<String> = match document
         .select(&title_selector)
@@ -99,7 +101,7 @@ async fn add_resource(
     };
 
     let page_description: Option<String> = match document
-        .select(&description_selector)
+        .select(&meta_selector)
         .filter(|e| e.value().attr("name") == Some("description"))
         .filter_map(|e| e.value().attr("content"))
         .take(1)
@@ -109,17 +111,31 @@ async fn add_resource(
         string => Some(string),
     };
 
+    //TODO: rewrite with if let else
+    let page_language: Option<String> = match document
+        .select(&html_selector)
+        .filter_map(|e| e.value().attr("lang"))
+        .take(1)
+        .collect::<String>()
+    {
+        s if s.is_empty() => None,
+        string => Some(string),
+    };
+
     //and for each changed content word we add it to the db (word -> list.append(url))
     let mut indexer = data.indexer.lock().unwrap();
     let _ = indexer.insert(
         &fixed_words,
         &resource.url,
-        page_title.clone(),
-        page_description.clone(),
+        &page_title,
+        &page_description,
+        &page_language,
         &resource.content,
     );
 
     //TODO: ADD LANG? EN in meta tag (frontend)
+    //Now what to do, global lang?, per index lang?, website lang?
+
     //TODO: max number of results in query
 
     println!("Added resource: {:?}", indexer.num_of_words());
diff --git a/lib/src/lib.rs b/lib/src/lib.rs
index 6966d3e..54e6ea9 100644
--- a/lib/src/lib.rs
+++ b/lib/src/lib.rs
@@ -18,6 +18,7 @@ pub mod lib {
         pub description: Option<String>,
         pub priority: u32,
         pub word: Arc<String>,
+        pub language: Option<String>,
         //maybe in the future we need filetypes?
     }
 
```