about summary refs log tree commit diff
diff options
context:
space:
mode:
author Baitinq <manuelpalenzuelamerino@gmail.com> 2022-11-04 16:13:42 +0100
committer Baitinq <manuelpalenzuelamerino@gmail.com> 2022-11-04 17:23:33 +0100
commit da6b3752f5350d0fc789c047f66f221f77cd8a95 (patch)
tree f7ded947ecb4bd67b7ee5da994f4ccc32389e2db
parent Indexer: Make & implement the trait insert() taking a [word] for insert (diff)
download OSSE-da6b3752f5350d0fc789c047f66f221f77cd8a95.tar.gz
OSSE-da6b3752f5350d0fc789c047f66f221f77cd8a95.tar.bz2
OSSE-da6b3752f5350d0fc789c047f66f221f77cd8a95.zip
Indexer: Add and use language field in IndexedResource
-rw-r--r-- indexer/src/indexer_implementation.rs 10
-rw-r--r-- indexer/src/main.rs 28
-rw-r--r-- lib/src/lib.rs 1
3 files changed, 29 insertions, 10 deletions
diff --git a/indexer/src/indexer_implementation.rs b/indexer/src/indexer_implementation.rs
index e3f0495..f24c2bd 100644
--- a/indexer/src/indexer_implementation.rs
+++ b/indexer/src/indexer_implementation.rs
@@ -30,8 +30,9 @@ impl crate::Indexer for IndexerImplementation {
         &mut self,
         words: &[String],
         url: &str,
-        title: Option<String>,
-        description: Option<String>,
+        title: &Option<String>,
+        description: &Option<String>,
+        language: &Option<String>,
         content: &str,
     ) -> Result<(), String> {
         for word in words {
@@ -39,8 +40,9 @@ impl crate::Indexer for IndexerImplementation {
                 url: url.to_string(),
                 priority: Self::calculate_word_priority(word, content, words),
                 word: Arc::new(word.to_string()),
-                title: title.as_ref().map(String::from),
-                description: description.as_ref().map(String::from),
+                title: title.clone(),
+                description: description.clone(),
+                language: language.clone(),
             };
 
             match self.database.get_mut(word) {
diff --git a/indexer/src/main.rs b/indexer/src/main.rs
index dcb4b9a..6e41cfb 100644
--- a/indexer/src/main.rs
+++ b/indexer/src/main.rs
@@ -14,8 +14,9 @@ pub trait Indexer {
         &mut self,
         words: &[String],
         url: &str,
-        title: Option<String>,
-        description: Option<String>,
+        title: &Option<String>,
+        description: &Option<String>,
+        language: &Option<String>,
         content: &str,
     ) -> Result<(), String>;
     fn search(&self, term: &str) -> Result<HashSet<IndexedResource>, String>;
@@ -86,7 +87,8 @@ async fn add_resource(
     println!("xd: {:?}", fixed_words);
 
     let title_selector = scraper::Selector::parse("title").unwrap();
-    let description_selector = scraper::Selector::parse("meta").unwrap();
+    let meta_selector = scraper::Selector::parse("meta").unwrap();
+    let html_selector = scraper::Selector::parse("html").unwrap();
 
     let page_title: Option<String> = match document
         .select(&title_selector)
@@ -99,7 +101,7 @@ async fn add_resource(
     };
 
     let page_description: Option<String> = match document
-        .select(&description_selector)
+        .select(&meta_selector)
         .filter(|e| e.value().attr("name") == Some("description"))
         .filter_map(|e| e.value().attr("content"))
         .take(1)
@@ -109,17 +111,31 @@ async fn add_resource(
         string => Some(string),
     };
 
+    //TODO: rewrite with if let else
+    let page_language: Option<String> = match document
+        .select(&html_selector)
+        .filter_map(|e| e.value().attr("lang"))
+        .take(1)
+        .collect::<String>()
+    {
+        s if s.is_empty() => None,
+        string => Some(string),
+    };
+
     //and for each changed content word we add it to the db (word -> list.append(url))
     let mut indexer = data.indexer.lock().unwrap();
     let _ = indexer.insert(
         &fixed_words,
         &resource.url,
-        page_title.clone(),
-        page_description.clone(),
+        &page_title,
+        &page_description,
+        &page_language,
         &resource.content,
     );
 
     //TODO: ADD LANG? EN in meta tag (frontend)
+    //Now what to do, global lang?, per index lang?, website lang?
+    //TODO: max number of results in query
 
     println!("Added resource: {:?}", indexer.num_of_words());
 
diff --git a/lib/src/lib.rs b/lib/src/lib.rs
index 6966d3e..54e6ea9 100644
--- a/lib/src/lib.rs
+++ b/lib/src/lib.rs
@@ -18,6 +18,7 @@ pub mod lib {
         pub description: Option<String>,
         pub priority: u32,
         pub word: Arc<String>,
+        pub language: Option<String>,
         //maybe in the future we need filetypes?
     }