diff options
| author | Baitinq <[email protected]> | 2022-11-04 16:13:42 +0100 |
|---|---|---|
| committer | Baitinq <[email protected]> | 2022-11-04 17:23:33 +0100 |
| commit | da6b3752f5350d0fc789c047f66f221f77cd8a95 (patch) | |
| tree | f7ded947ecb4bd67b7ee5da994f4ccc32389e2db /indexer/src/main.rs | |
| parent | Indexer: Make & implement the trait insert() taking a [word] for insert (diff) | |
| download | OSSE-da6b3752f5350d0fc789c047f66f221f77cd8a95.tar.gz OSSE-da6b3752f5350d0fc789c047f66f221f77cd8a95.tar.bz2 OSSE-da6b3752f5350d0fc789c047f66f221f77cd8a95.zip | |
Indexer: Add and use language field in IndexedResource
Diffstat (limited to '')
| -rw-r--r-- | indexer/src/main.rs | 28 |
1 files changed, 22 insertions, 6 deletions
```diff
diff --git a/indexer/src/main.rs b/indexer/src/main.rs
index dcb4b9a..6e41cfb 100644
--- a/indexer/src/main.rs
+++ b/indexer/src/main.rs
@@ -14,8 +14,9 @@ pub trait Indexer {
         &mut self,
         words: &[String],
         url: &str,
-        title: Option<String>,
-        description: Option<String>,
+        title: &Option<String>,
+        description: &Option<String>,
+        language: &Option<String>,
         content: &str,
     ) -> Result<(), String>;
     fn search(&self, term: &str) -> Result<HashSet<IndexedResource>, String>;
@@ -86,7 +87,8 @@ async fn add_resource(
     println!("xd: {:?}", fixed_words);
 
     let title_selector = scraper::Selector::parse("title").unwrap();
-    let description_selector = scraper::Selector::parse("meta").unwrap();
+    let meta_selector = scraper::Selector::parse("meta").unwrap();
+    let html_selector = scraper::Selector::parse("html").unwrap();
 
     let page_title: Option<String> = match document
         .select(&title_selector)
@@ -99,7 +101,7 @@ async fn add_resource(
     };
 
     let page_description: Option<String> = match document
-        .select(&description_selector)
+        .select(&meta_selector)
         .filter(|e| e.value().attr("name") == Some("description"))
         .filter_map(|e| e.value().attr("content"))
         .take(1)
@@ -109,17 +111,31 @@ async fn add_resource(
         string => Some(string),
     };
 
+    //TODO: rewrite with if let else
+    let page_language: Option<String> = match document
+        .select(&html_selector)
+        .filter_map(|e| e.value().attr("lang"))
+        .take(1)
+        .collect::<String>()
+    {
+        s if s.is_empty() => None,
+        string => Some(string),
+    };
+
     //and for each changed content word we add it to the db (word -> list.append(url))
     let mut indexer = data.indexer.lock().unwrap();
     let _ = indexer.insert(
         &fixed_words,
         &resource.url,
-        page_title.clone(),
-        page_description.clone(),
+        &page_title,
+        &page_description,
+        &page_language,
         &resource.content,
     );
 
     //TODO: ADD LANG? EN in meta tag (frontend)
+    //Now what to do, global lang?, per index lang?, website lang?
+    //TODO: max number of results in query
 
     println!("Added resource: {:?}", indexer.num_of_words());
 
```