crawler/src/main.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91

use itertools::Itertools;

fn main() {
    println!("Hello, world! Im the crawler!");

    let root_urls = include_str!("../top-1000-websites.txt");
    let root_urls = root_urls.split('\n').collect();

    crawler(root_urls);
}

//takes list of strings - multithread here?
fn crawler(root_urls: Vec<&str>) {
    println!("Starting to crawl!");

    //add root urls to queue - TODO: max q size
    let crawling_queue: blockingqueue::BlockingQueue<String> = blockingqueue::BlockingQueue::new();
    for url in root_urls {
        crawling_queue.push(String::from(url));
    }

    //and start crawling
    //FIXME: Async!
    loop {
        //blocks
        let url = crawling_queue.pop();

        let crawl_res = crawl_url(url.as_str());
        if crawl_res.is_err() {
            println!("Error crawling {}", url);
            continue;
        }

        let (_content, crawled_urls) = crawl_res.unwrap();

        //println!("Content: {:?}", _content);
        println!("Next urls: {:?}", crawled_urls);

        //push content to index

        for url in crawled_urls {
            crawling_queue.push(url);
        }
    }
}

//takes url, returns content and list of urls
fn crawl_url(url: &str) -> Result<(String, Vec<String>), ()> {
    let url = "https://".to_owned() + url;

    println!("Crawling {:?}", url);

    let response_res = reqwest::blocking::get(&url);
    if response_res.is_err() {
        return Err(());
    }
    let response_text_res = response_res.unwrap().text();
    if response_text_res.is_err() {
        return Err(());
    }

    let response_text = response_text_res.unwrap();
    let document = scraper::Html::parse_document(response_text.as_str());

    let link_selector = scraper::Selector::parse("a").unwrap();
    let next_urls = document
        .select(&link_selector)
        .filter_map(|link| link.value().attr("href"))
        .unique()
        .map(String::from)
        .collect();

    let fixup_urls = |us: Vec<String>| {
        us.into_iter()
            .map(|u| {
                //https://stackoverflow.com/questions/9646407/two-forward-slashes-in-a-url-src-href-attribute
                if u.starts_with("//") {
                    format!("https:{}", &u)
                } else if u.starts_with('/') {
                    format!("{}{}", &url, &u)
                } else {
                    u
                }
            })
            .collect()
    };

    let next_urls = fixup_urls(next_urls);

    Ok((response_text, next_urls))
}