use itertools::Itertools;
use rand::seq::IteratorRandom;
use reqwest::{Client, Response, StatusCode};
use serde::Serialize;
use url::Url;
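//A toy web crawler: seed URLs come from top-1000-websites.txt, every fetched
//page is POSTed to a local indexer, and a couple of links from each page are
//fed back into the crawl queue.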
#[tokio::main]
async fn main() {
println!("Hello, world! Im the crawler!");
let root_urls = include_str!("../top-1000-websites.txt");
let root_urls = root_urls.split('\n').collect();
let http_client = reqwest::Client::new();
crawler(http_client, root_urls).await;
}
//TODO: crawling depth? - async http client
async fn crawler(http_client: Client, root_urls: Vec<&str>) {
dbg!("Starting to crawl!");
//add root urls to queue - TODO: max q size
let (tx_crawling_queue, rx_crawling_queue) = async_channel::bounded::<String>(2222);
for url in root_urls {
tx_crawling_queue.send(url.to_string()).await.unwrap();
}
//and start crawling
loop {
//even if we clone, the underlying queue implementation is still shared
let tx_crawling_queue = tx_crawling_queue.clone();
let rx_crawling_queue = rx_crawling_queue.clone();
        //recv() blocks; awaiting it here means we only spawn a task once there is
        //a next URL to crawl, instead of endlessly spawning tasks
        let url = rx_crawling_queue.recv().await.unwrap();
        let http_client = http_client.clone();
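        //one task per URL: fetch the page, push its text to the indexer,
        //then enqueue the links discovered on it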
        tokio::spawn(async move {
            let (content, crawled_urls) = match crawl_url(&http_client, url.as_str()).await {
                Err(e) => {
                    println!("Error crawling ({}): {}", url, e);
                    return;
                }
                Ok(result) => result,
            };
            //TODO: don't forget enums (use them for the error types)
            //TODO: can we unwrap-or-return with a helper or a closure?
            //TODO: how does the crawler keep up? why doesn't the queue fill with all the pending sends?
            //println!("Content: {:?}", &content);
            println!("Next urls: {:?}", &crawled_urls);
            //push content to index
            let indexer_response = match push_crawl_entry_to_indexer(
                &http_client,
                "http://127.0.0.1:4444/resource".to_string(),
                url,
                content,
            )
            .await
            {
                Err(e) => {
                    println!("{e}");
                    return;
                }
                Ok(res) => res.text().await,
            };
dbg!("Pushed to indexer {:?}", &indexer_response);
            for url in crawled_urls {
                tx_crawling_queue.send(url).await.unwrap();
            }
        });
    }
}
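/// Fetches `url` and returns the page body together with up to two randomly
/// chosen links found on it (absolute http(s) URLs, no fragments or queries).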
async fn crawl_url(http_client: &Client, url: &str) -> Result<(String, Vec<String>), String> {
dbg!("Crawling {:?}", url);
let url = Url::parse(url).unwrap();
    let response_text = match http_client.get(url.as_str()).send().await {
        Ok(text_res) if text_res.status() == StatusCode::OK => match text_res.text().await {
            Err(_) => Err(format!(
                "Error reading the fetched HTML's text ({})",
                url.as_str()
            )),
            Ok(text) => Ok(text),
        },
        _ => Err(format!("Error fetching {}", url.as_str())),
    }?;
    let document = scraper::Html::parse_document(response_text.as_str());
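    //keep only plain http(s) links, skipping fragment URLs, query URLs and the page itself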
    let valid_url = |check_url: &Url| match check_url {
        u if !(u.scheme() == "http" || u.scheme() == "https") => false,
        u if u.fragment().is_some() => false, //no # urls
        u if u.query().is_some() => false,    //no ? urls
        u if *u == url => false,              //no same url
        _ => true,
    };
    let link_selector = scraper::Selector::parse("a").unwrap();
    let next_urls = document
        .select(&link_selector)
        .filter_map(|link| link.value().attr("href"))
        .unique()
        .map(|u| url.join(u))
        .filter_map(Result::ok)
        .filter(valid_url)
        .map(String::from)
        .choose_multiple(&mut rand::thread_rng(), 2); //take a random sample so we don't keep following the same links
    //normalise words somewhere
    //fuzzy search? - iterate over keys
    //probably lots of places where we could borrow instead of cloning or doing other wasteful things
    //search for phrases?
    //the http side is lagging behind the crawler, what to do?
    //group responses and transmit them in an array of 10 or so -> or maybe just lower the queue size!
    //use structs in the database indexer
    //we need word priorities or a word list (or store the number/importance of occurrences in the database value)
    //note: dbg! takes expressions, not a format string - use println! for formatted output
    //are there empty urls?
    //user agent?
println!("Returning next urls, {:?}", next_urls);
Ok((response_text, next_urls))
}
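/// POSTs a crawled page (its URL and text content) to the indexer as JSON.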
async fn push_crawl_entry_to_indexer(
    http_client: &Client,
    indexer_url: String,
    url: String,
    content: String,
) -> Result<Response, String> {
dbg!("Pushin to indexer");
    #[derive(Serialize, Debug)]
    struct CrawledResource {
        url: String,
        content: String,
    }
    let request_body = CrawledResource { url, content };
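    //serialised as e.g. {"url":"https://example.com","content":"<html>..."}
    //(assuming this is the shape the indexer's /resource endpoint expects)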
    match http_client
        .post(&indexer_url)
        .json(&request_body)
        .send()
        .await
    {
        Err(_) => Err(format!(
            "Error pushing the crawl entry to the indexer! {:?}",
            &indexer_url
        )),
        Ok(response) => Ok(response),
    }
}