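// Crate dependencies are assumed to be declared in Cargo.toml: reqwest (with the
// "blocking" feature), scraper, and blockingqueue. All calls below use fully
// qualified paths, so no `use` statements are required.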
fn main() {
    println!("Hello, world! I'm the crawler!");
    let root_urls = include_str!("../top-1000-websites.txt");
    // Skip blank lines (e.g. a trailing newline in the file)
    let root_urls = root_urls.split('\n').filter(|url| !url.is_empty()).collect();
    crawler(root_urls);
}
// Takes the list of root URLs and crawls outward from them
// - multithread here? (see crawler_threaded below for a sketch)
fn crawler(root_urls: Vec<&str>) {
    println!("Starting to crawl!");
    // Add the root URLs to the queue - TODO: max queue size
    let crawling_queue: blockingqueue::BlockingQueue<String> = blockingqueue::BlockingQueue::new();
    for url in root_urls {
        crawling_queue.push(String::from(url));
    }
    // ...and start crawling
    // FIXME: Async!
    loop {
        // pop() blocks until an item is available
        let url = crawling_queue.pop();
        let (_content, crawled_urls) = crawl_url(url.as_str());
        //println!("Content: {:?}", _content);
        println!("Next urls: {:?}", crawled_urls);
        // TODO: push content to the index
        for url in crawled_urls {
            crawling_queue.push(url);
        }
    }
}
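// A minimal sketch for the "multithread here?" note above, not a settled design:
// spawn a few worker threads that share one queue. This assumes
// blockingqueue::BlockingQueue implements Clone and that clones share the same
// underlying buffer (the crate is built for cross-thread use); the worker count
// of 4 is an arbitrary choice.
#[allow(dead_code)]
fn crawler_threaded(root_urls: Vec<&str>) {
    let crawling_queue: blockingqueue::BlockingQueue<String> = blockingqueue::BlockingQueue::new();
    for url in root_urls {
        crawling_queue.push(String::from(url));
    }
    let mut handles = Vec::new();
    for _ in 0..4 {
        let queue = crawling_queue.clone();
        handles.push(std::thread::spawn(move || loop {
            // pop() blocks, so idle workers simply wait for new URLs
            let url = queue.pop();
            let (_content, crawled_urls) = crawl_url(url.as_str());
            for next in crawled_urls {
                queue.push(next);
            }
        }));
    }
    // The workers loop forever, so in practice these joins never return
    for handle in handles {
        handle.join().unwrap();
    }
}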
// Takes a URL, returns the page content and the list of URLs found on it
fn crawl_url(url: &str) -> (String, Vec<String>) {
    // TODO: return a Result instead of empty values on failure
    let url = "https://".to_owned() + url;
    println!("Crawling {:?}", url);
    let response_res = reqwest::blocking::get(url);
    if response_res.is_err() {
        return (String::from(""), Vec::<String>::new());
    }
    let response_text_res = response_res.unwrap().text();
    if response_text_res.is_err() {
        return (String::from(""), Vec::<String>::new());
    }
    let response_text = response_text_res.unwrap();
    let document = scraper::Html::parse_document(response_text.as_str());
    let link_selector = scraper::Selector::parse("a").unwrap();
    let next_urls = document
        .select(&link_selector)
        .filter_map(|link| link.value().attr("href"))
        .map(String::from)
        .collect();
    // TODO: filter URLs that point to bad stuff? Or do that at the beginning of
    // crawl_url. We probably need to return a Result (see filter_crawled_urls below).
    (response_text, next_urls)
}
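// A minimal sketch for the filtering TODO in crawl_url, not a settled design:
// keep only absolute http(s) links and strip the scheme so the results match
// crawl_url's "https://" + url convention. Relative links are simply dropped here.
#[allow(dead_code)]
fn filter_crawled_urls(urls: Vec<String>) -> Vec<String> {
    urls.into_iter()
        .filter_map(|url| {
            url.strip_prefix("https://")
                .or_else(|| url.strip_prefix("http://"))
                .map(String::from)
        })
        .collect()
}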